Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
421170e
Introduce the growable vector API
aitap Dec 28, 2024
3b31850
Use growable_* instead of SETLENGTH et al in fread
aitap Dec 28, 2024
f927451
Switch shallow() to use growable_allocate()
aitap Dec 28, 2024
e243ece
Use growable_allocate() in subsetDT()
aitap Dec 28, 2024
06e7db7
Use growable_* instead of SETLENGTH in dogroups()
aitap Dec 28, 2024
7b9b141
Mark internal_error*() as NORET
aitap Dec 28, 2024
61fdc06
ALTREP implementation of growable_*
aitap Dec 29, 2024
c16fd97
Drop redundant variable definitions
aitap Mar 22, 2025
deaa0f7
Merge branch 'truehash' into growable_refactor
ben-schwen Sep 25, 2025
cebe420
update growable for patch
ben-schwen Sep 25, 2025
2b5444b
tweak patch update
ben-schwen Sep 25, 2025
248bab7
reset truelength definition
ben-schwen Sep 25, 2025
50998be
remove last TRUELENGTH
ben-schwen Sep 25, 2025
c2c3bfd
update setgrowable
ben-schwen Sep 25, 2025
db6b6b9
remove change
ben-schwen Sep 25, 2025
4a166ff
delete extra spaces
ben-schwen Sep 25, 2025
78f45f8
Update src/data.table.h
ben-schwen Sep 26, 2025
a5874bd
remove R 3.4.0 backports
ben-schwen Sep 26, 2025
2db2a90
remove whole finalizer
ben-schwen Sep 26, 2025
c539c66
patch froll
ben-schwen Sep 26, 2025
ce5fcb5
assume no altrep at that point
jangorecki Sep 26, 2025
1117f6f
add patch to devcontainer
ben-schwen Sep 26, 2025
13a8b7d
update devcontainer to build from dockerfile
ben-schwen Sep 26, 2025
9e635d6
apply Jans loop suggestion
ben-schwen Sep 26, 2025
e06aaa7
remove spaces
ben-schwen Sep 26, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions .devcontainer/r-devel-growable/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
FROM registry.gitlab.com/rdatatable/dockerfiles/r-devel-growable

COPY DESCRIPTION .
RUN /usr/local/bin/Rscript -e ' \
read.dcf("DESCRIPTION", c("Imports", "Suggests")) |> \
tools:::.split_dependencies() |> \
names() |> \
setdiff(tools:::.get_standard_package_names()$base) |> \
install.packages() \
'

WORKDIR /root
COPY .devcontainer/.Rprofile .
15 changes: 15 additions & 0 deletions .devcontainer/r-devel-growable/devcontainer.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{
"name": "R-devel with Growable Vectors",
"build": {
"dockerfile": "Dockerfile",
"context": "../.."
},
"customizations": {
"vscode": {
"extensions": [
"REditorSupport.r",
"ms-vscode.cpptools-extension-pack"
]
}
}
}
2 changes: 2 additions & 0 deletions R/data.table.R
Original file line number Diff line number Diff line change
Expand Up @@ -1356,6 +1356,7 @@ replace_dot_alias = function(e) {
}
if (!with || missing(j)) return(ans)
if (!is.data.table(ans)) setattr(ans, "class", c("data.table","data.frame")) # DF |> DT(,.SD[...]) .SD should be data.table, test 2212.013
setgrowable(ans)
SDenv$.SDall = ans
SDenv$.SD = if (length(non_sdvars)) shallow(SDenv$.SDall, sdvars) else SDenv$.SDall
SDenv$.N = nrow(ans)
Expand Down Expand Up @@ -1594,6 +1595,7 @@ replace_dot_alias = function(e) {
SDenv$.SDall = .Call(CsubsetDT, x, if (length(len__)) seq_len(max(len__)) else 0L, xcols) # must be deep copy when largest group is a subset
if (!is.data.table(SDenv$.SDall)) setattr(SDenv$.SDall, "class", c("data.table","data.frame")) # DF |> DT(,.SD[...],by=grp) needs .SD to be data.table, test 2022.012
if (xdotcols) setattr(SDenv$.SDall, 'names', ansvars[xcolsAns]) # now that we allow 'x.' prefix in 'j', #2313 bug fix - [xcolsAns]
SDenv$.SDall = setgrowable(SDenv$.SDall)
SDenv$.SD = if (length(non_sdvars)) shallow(SDenv$.SDall, sdvars) else SDenv$.SDall
}
if (nrow(SDenv$.SDall)==0L) {
Expand Down
1 change: 1 addition & 0 deletions R/wrappers.R
Original file line number Diff line number Diff line change
Expand Up @@ -22,5 +22,6 @@ fitsInInt64 = function(x) .Call(CfitsInInt64R, x)

coerceAs = function(x, as, copy=TRUE) .Call(CcoerceAs, x, as, copy)

setgrowable = function(x) .Call(Csetgrowable, x)
frev = function(x) .Call(Cfrev, x, TRUE)
setfrev = function(x) invisible(.Call(Cfrev, x, FALSE))
6 changes: 0 additions & 6 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -19444,12 +19444,6 @@ test(2290.3, DT[, `:=`(a, c := 3)], error="It looks like you re-used `:=` in arg
# partially-named `:=`(...) --> different branch, same error
test(2290.4, DT[, `:=`(a = 2, c := 3)], error="It looks like you re-used `:=` in argument 2")

# segfault when selfref is not ok before set #6410
df = data.frame(a=1:3)
setDT(df)
attr(df, "att") = 1
test(2291.1, set(df, NULL, "new", "new"), error="attributes .* have been reassigned")

# ns-qualified bysub error, #6493
DT = data.table(a = 1)
test(2292.1, DT[, .N, by=base::mget("a")], data.table(a = 1, N = 1L))
Expand Down
69 changes: 18 additions & 51 deletions src/assign.c
Original file line number Diff line number Diff line change
@@ -1,28 +1,5 @@
#include "data.table.h"

static void finalizer(SEXP p)
{
SEXP x;
R_len_t n, l, tl;
if(!R_ExternalPtrAddr(p)) internal_error(__func__, "didn't receive an ExternalPtr"); // # nocov
p = R_ExternalPtrTag(p);
if (!isString(p)) internal_error(__func__, "ExternalPtr doesn't see names in tag"); // # nocov
l = LENGTH(p);
tl = TRUELENGTH(p);
if (l<0 || tl<l) internal_error(__func__, "l=%d, tl=%d", l, tl); // # nocov
n = tl-l;
if (n==0) {
// gc's ReleaseLargeFreeVectors() will have reduced R_LargeVallocSize by the correct amount
// already, so nothing to do (but almost never the case).
return;
}
x = PROTECT(allocVector(INTSXP, 50)); // 50 so it's big enough to be on LargeVector heap. See NodeClassSize in memory.c:allocVector
// INTSXP rather than VECSXP so that GC doesn't inspect contents after LENGTH (thanks to Karl Miller, Jul 2015)
SETLENGTH(x,50+n*2*sizeof(void *)/4); // 1*n for the names, 1*n for the VECSXP itself (both are over allocated).
UNPROTECT(1);
return;
}

void setselfref(SEXP x) {
if(!INHERITS(x, char_datatable)) return; // #5286
SEXP p;
Expand All @@ -38,7 +15,6 @@ void setselfref(SEXP x) {
R_NilValue
))
));
R_RegisterCFinalizerEx(p, finalizer, FALSE);
UNPROTECT(2);

/*
Expand Down Expand Up @@ -126,15 +102,13 @@ static int _selfrefok(SEXP x, Rboolean checkNames, Rboolean verbose) {
tag = R_ExternalPtrTag(v);
if (!(isNull(tag) || isString(tag))) internal_error(__func__, ".internal.selfref tag is neither NULL nor a character vector"); // # nocov
names = getAttrib(x, R_NamesSymbol);
if (names!=tag && isString(names) && !ALTREP(names)) // !ALTREP for #4734
SET_TRUELENGTH(names, LENGTH(names));
// R copied this vector not data.table; it's not actually over-allocated. It looks over-allocated
// because R copies the original vector's tl over despite allocating length.
// On R >= 3.4, either
// (1) we allocate the data.table and/or its names, so it has the GROWABLE_BIT set, so copies will have zero TRUELENGTH, or
// (2) someone else creates them from scratch, so (only using the API) will have zero TRUELENGTH.
// We then return false and either re-create the data.table from scratch or signal an error, so the current object having a zero TRUELENGTH is fine.
prot = R_ExternalPtrProtected(v);
if (TYPEOF(prot) != EXTPTRSXP) // Very rare. Was error(_(".internal.selfref prot is not itself an extptr")).
return 0; // # nocov ; see http://stackoverflow.com/questions/15342227/getting-a-random-internal-selfref-error-in-data-table-for-r
if (x!=R_ExternalPtrAddr(prot) && !ALTREP(x))
SET_TRUELENGTH(x, LENGTH(x)); // R copied this vector not data.table, it's not actually over-allocated
return checkNames ? names==tag : x==R_ExternalPtrAddr(prot);
}

Expand All @@ -151,7 +125,8 @@ static SEXP shallow(SEXP dt, SEXP cols, R_len_t n)
// called from alloccol where n is checked carefully, or from shallow() at R level
// where n is set to truelength (i.e. a shallow copy only with no size change)
int protecti=0;
SEXP newdt = PROTECT(allocVector(VECSXP, n)); protecti++; // to do, use growVector here?
const int l = isNull(cols) ? length(dt) : length(cols);
SEXP newdt = PROTECT(growable_allocate(VECSXP, l, n)); protecti++; // to do, use growVector here?
SHALLOW_DUPLICATE_ATTRIB(newdt, dt);

// TO DO: keepattr() would be faster, but can't because shallow isn't merely a shallow copy. It
Expand All @@ -169,8 +144,7 @@ static SEXP shallow(SEXP dt, SEXP cols, R_len_t n)
setAttrib(newdt, sym_sorted, duplicate(sorted));

SEXP names = PROTECT(getAttrib(dt, R_NamesSymbol)); protecti++;
SEXP newnames = PROTECT(allocVector(STRSXP, n)); protecti++;
const int l = isNull(cols) ? LENGTH(dt) : length(cols);
SEXP newnames = PROTECT(growable_allocate(STRSXP, l, n)); protecti++;
if (isNull(cols)) {
for (int i=0; i<l; ++i) SET_VECTOR_ELT(newdt, i, VECTOR_ELT(dt,i));
if (length(names)) {
Expand All @@ -186,13 +160,8 @@ static SEXP shallow(SEXP dt, SEXP cols, R_len_t n)
for (int i=0; i<l; ++i) SET_STRING_ELT( newnames, i, STRING_ELT(names,INTEGER(cols)[i]-1) );
}
}
// setAttrib used to change length and truelength, but as of R-3.3 no longer does that
setAttrib(newdt, R_NamesSymbol, newnames);
// setAttrib appears to change length and truelength, so need to do that first _then_ SET next,
// otherwise (if the SET were were first) the 100 tl is assigned to length.
SETLENGTH(newnames,l);
SET_TRUELENGTH(newnames,n);
SETLENGTH(newdt,l);
SET_TRUELENGTH(newdt,n);
setselfref(newdt);
UNPROTECT(protecti);
return(newdt);
Expand Down Expand Up @@ -257,10 +226,8 @@ SEXP alloccol(SEXP dt, R_len_t n, Rboolean verbose)
return shallow(dt,R_NilValue,(n>l) ? n : l); // e.g. test 848 and 851 in R > 3.0.2
// added (n>l) ? ... for #970, see test 1481.
// TO DO: test realloc names if selfrefnamesok (users can setattr(x,"name") themselves for example.
// if (TRUELENGTH(getAttrib(dt,R_NamesSymbol))!=tl)
// internal_error(__func__, "tl of dt passes checks, but tl of names (%d) != tl of dt (%d)", tl, TRUELENGTH(getAttrib(dt,R_NamesSymbol))); // # nocov

tl = TRUELENGTH(dt);
tl = growable_capacity(dt);
// R <= 2.13.2 and we didn't catch uninitialized tl somehow
if (tl<0) internal_error(__func__, "tl of class is marked but tl<0"); // # nocov
if (tl>0 && tl<l) internal_error(__func__, "tl (%d) < l (%d) but tl of class is marked", tl, l); // # nocov
Expand Down Expand Up @@ -310,11 +277,11 @@ SEXP shallowwrapper(SEXP dt, SEXP cols) {
if (!selfrefok(dt, FALSE)) {
int n = isNull(cols) ? length(dt) : length(cols);
return(shallow(dt, cols, n));
} else return(shallow(dt, cols, TRUELENGTH(dt)));
} else return(shallow(dt, cols, growable_capacity(dt)));
}

SEXP truelength(SEXP x) {
return ScalarInteger(isNull(x) ? 0 : TRUELENGTH(x));
return ScalarInteger(isNull(x) ? 0 : growable_capacity(x));
}

SEXP selfrefokwrapper(SEXP x, SEXP verbose) {
Expand Down Expand Up @@ -509,7 +476,7 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values)
// modify DT by reference. Other than if new columns are being added and the allocVec() fails with
// out-of-memory. In that case the user will receive hard halt and know to rerun.
if (length(newcolnames)) {
oldtncol = TRUELENGTH(dt); // TO DO: oldtncol can be just called tl now, as we won't realloc here any more.
oldtncol = is_growable(dt) ? growable_capacity(dt) : 0; // TO DO: oldtncol can be just called tl now, as we won't realloc here any more.

if (oldtncol<oldncol) {
if (oldtncol==0) error(_("This data.table has either been loaded from disk (e.g. using readRDS()/load()) or constructed manually (e.g. using structure()). Please run setDT() or setalloccol() on it first (to pre-allocate space for new columns) before assigning by reference to it.")); // #2996
Expand All @@ -522,13 +489,13 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values)
error(_("It appears that at some earlier point, names of this data.table have been reassigned. Please ensure to use setnames() rather than names<- or colnames<-. Otherwise, please report to data.table issue tracker.")); // # nocov
// Can growVector at this point easily enough, but it shouldn't happen in first place so leave it as
// strong error message for now.
else if (TRUELENGTH(names) != oldtncol)
else if (growable_capacity(names) != oldtncol)
// Use (long long) to cast R_xlen_t to a fixed type to robustly avoid -Wformat compiler warnings, see #5768, PRId64 didn't work
internal_error(__func__, "selfrefnames is ok but tl names [%lld] != tl [%d]", (long long)TRUELENGTH(names), oldtncol); // # nocov
internal_error(__func__, "selfrefnames is ok but tl names [%lld] != tl [%d]", (long long)growable_capacity(names), oldtncol); // # nocov
if (!selfrefok(dt, verbose)) // #6410 setDT(dt) and subsequent attr<- can lead to invalid selfref
error(_("It appears that at some earlier point, attributes of this data.table have been reassigned. Please use setattr(DT, name, value) rather than attr(DT, name) <- value. If that doesn't apply to you, please report your case to the data.table issue tracker."));
SETLENGTH(dt, oldncol+LENGTH(newcolnames));
SETLENGTH(names, oldncol+LENGTH(newcolnames));
growable_resize(dt, oldncol+LENGTH(newcolnames));
growable_resize(names, oldncol+LENGTH(newcolnames));
for (int i=0; i<LENGTH(newcolnames); ++i)
SET_STRING_ELT(names,oldncol+i,STRING_ELT(newcolnames,i));
// truelengths of both already set by alloccol
Expand Down Expand Up @@ -726,8 +693,8 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values)
SET_VECTOR_ELT(dt, i, R_NilValue);
SET_STRING_ELT(names, i, NA_STRING); // release reference to the CHARSXP
}
SETLENGTH(dt, ndt-ndelete);
SETLENGTH(names, ndt-ndelete);
growable_resize(dt, ndt-ndelete);
growable_resize(names, ndt-ndelete);
if (LENGTH(names)==0) {
// That was last column deleted, leaving NULL data.table, so we need to reset .row_names, so that it really is the NULL data.table.
PROTECT(nullint=allocVector(INTSXP, 0)); protecti++;
Expand Down
3 changes: 3 additions & 0 deletions src/data.table.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
# define isDataFrame(x) isFrame(x) // #6180
#endif
#include <Rinternals.h>
#include <Rgrowable.h>
#define SEXPPTR_RO(x) ((const SEXP *)DATAPTR_RO(x)) // to avoid overhead of looped STRING_ELT and VECTOR_ELT
#include <stdint.h> // for uint64_t rather than unsigned long long
#include <stdarg.h> // for va_list, va_start
Expand Down Expand Up @@ -425,3 +426,5 @@ SEXP dt_has_zlib(void);
SEXP startsWithAny(SEXP, SEXP, SEXP);
SEXP convertDate(SEXP, SEXP);
SEXP fastmean(SEXP);
SEXP setgrowable(SEXP x);

27 changes: 12 additions & 15 deletions src/dogroups.c
Original file line number Diff line number Diff line change
Expand Up @@ -45,11 +45,10 @@ static bool anySpecialStatic(SEXP x, hashtab * specials) {
// (see data.table.h), and isNewList() is true for NULL
if (n==0)
return false;
if (hash_lookup(specials, x, 0)<0) return true; // test 2158
if (isVectorAtomic(x))
return ALTREP(x) || hash_lookup(specials, x, 0)<0;
return ALTREP(x); // see test 2156: ALTREP is a source of sharing we can't trace reliably
if (isNewList(x)) {
if (hash_lookup(specials, x, 0)<0)
return true; // test 2158
for (int i=0; i<n; ++i) {
list_el = VECTOR_ELT(x,i);
if (anySpecialStatic(list_el, specials))
Expand Down Expand Up @@ -124,7 +123,7 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX
for (R_len_t i=0; i<n; ++i) {
if (ilens[i] > maxGrpSize) maxGrpSize = ilens[i];
}
defineVar(install(".I"), I = PROTECT(allocVector(INTSXP, maxGrpSize)), env); nprotect++;
defineVar(install(".I"), I = PROTECT(growable_allocate(INTSXP, maxGrpSize, maxGrpSize)), env); nprotect++;
hash_set(specials, I, -maxGrpSize); // marker for anySpecialStatic(); see its comments
R_LockBinding(install(".I"), env);

Expand Down Expand Up @@ -197,7 +196,7 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX
INTEGER(GRP)[0] = i+1; // group counter exposed as .GRP
INTEGER(rownames)[1] = -grpn; // the .set_row_names() of .SD. Not .N when nomatch=NA and this is a nomatch
for (int j=0; j<length(SDall); ++j) {
SETLENGTH(VECTOR_ELT(SDall,j), grpn); // before copying data in otherwise assigning after the end could error R API checks
growable_resize(VECTOR_ELT(SDall,j), grpn); // before copying data in otherwise assigning after the end could error R API checks
defineVar(nameSyms[j], VECTOR_ELT(SDall, j), env);
// Redo this defineVar for each group in case user's j assigned to the column names (env is static) (tests 387 and 388)
// nameSyms pre-stored to save repeated install() for efficiency, though.
Expand Down Expand Up @@ -230,14 +229,14 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX
// and we hope that reference counting on by default from R 4.0 will avoid costly gc()s.
}
grpn = 1; // it may not be 1 e.g. test 722. TODO: revisit.
SETLENGTH(I, grpn);
growable_resize(I, grpn);
INTEGER(I)[0] = 0;
for (int j=0; j<length(xSD); ++j) {
writeNA(VECTOR_ELT(xSD, j), 0, 1, false);
}
} else {
if (verbose) tstart = wallclock();
SETLENGTH(I, grpn);
growable_resize(I, grpn);
int *iI = INTEGER(I);
if (LENGTH(order)==0) {
const int rownum = grpn ? istarts[i]-1 : -1;
Expand Down Expand Up @@ -318,13 +317,13 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX
RHS = VECTOR_ELT(jval,j%LENGTH(jval));
if (isNull(target)) {
// first time adding to new column
if (TRUELENGTH(dt) < colj+1) internal_error(__func__, "Trying to add new column by reference but tl is full; setalloccol should have run first at R level before getting to this point"); // # nocov
if (growable_capacity(dt) < colj+1) internal_error(__func__, "Trying to add new column by reference but table is full; setalloccol should have run first at R level before getting to this point"); // # nocov
target = PROTECT(allocNAVectorLike(RHS, n));
// Even if we could know reliably to switch from allocNAVectorLike to allocVector for slight speedup, user code could still
// contain a switched halt, and in that case we'd want the groups not yet done to have NA rather than 0 or uninitialized.
// Increment length only if the allocation passes, #1676. But before SET_VECTOR_ELT otherwise attempt-to-set-index-n/n R error
SETLENGTH(dtnames, LENGTH(dtnames)+1);
SETLENGTH(dt, LENGTH(dt)+1);
growable_resize(dtnames, LENGTH(dtnames)+1);
growable_resize(dt, LENGTH(dt)+1);
SET_VECTOR_ELT(dt, colj, target);
UNPROTECT(1);
SET_STRING_ELT(dtnames, colj, STRING_ELT(newnames, colj-origncol));
Expand Down Expand Up @@ -488,11 +487,9 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX
// Also reset truelength on specials; see comments in anySpecialStatic().
for (int j=0; j<length(SDall); ++j) {
SEXP this = VECTOR_ELT(SDall,j);
SETLENGTH(this, maxGrpSize);
SET_TRUELENGTH(this, maxGrpSize);
growable_resize(this, maxGrpSize);
}
SETLENGTH(I, maxGrpSize);
SET_TRUELENGTH(I, maxGrpSize);
growable_resize(I, maxGrpSize);
if (verbose) {
if (nblock[0] && nblock[1]) internal_error(__func__, "block 0 [%d] and block 1 [%d] have both run", nblock[0], nblock[1]); // # nocov
int w = nblock[1]>0;
Expand Down Expand Up @@ -529,7 +526,7 @@ SEXP growVector(SEXP x, const R_len_t newlen)
SEXP newx;
R_len_t len = length(x);
if (isNull(x)) error(_("growVector passed NULL"));
PROTECT(newx = allocVector(TYPEOF(x), newlen)); // TO DO: R_realloc(?) here?
PROTECT(newx = growable_allocate(TYPEOF(x), newlen, newlen)); // may be shrunk later by fread
if (newlen < len) len=newlen; // i.e. shrink
if (!len) { // cannot memcpy invalid pointer, #6819
keepattr(newx, x);
Expand Down
Loading
Loading