From dbbee12b43074cf35ac54e4911f0f3f3d862401b Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sun, 28 Dec 2025 01:16:06 +0100 Subject: [PATCH 1/7] set automatically allocates new column slots if needed --- NEWS.md | 2 ++ R/data.table.R | 3 ++- inst/tests/tests.Rraw | 13 +++++++++++-- src/assign.c | 22 ++++++++++++---------- 4 files changed, 27 insertions(+), 13 deletions(-) diff --git a/NEWS.md b/NEWS.md index 17d97ee17..2ac50f19a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -24,6 +24,8 @@ 1. `fread()` with `skip=0` and `(header=TRUE|FALSE)` no longer skips the first row when it has fewer fields than subsequent rows, [#7463](https://github.com/Rdatatable/data.table/issues/7463). Thanks @emayerhofer for the report and @ben-schwen for the fix. +2. `set()` now automatically pre-allocates new column slots if needed, similar to what `:=` already does, [#1831](https://github.com/Rdatatable/data.table/issues/1831) [#4100](https://github.com/Rdatatable/data.table/issues/4100). Thanks to @zachokeeffe and @tyner for the report and @ben-schwen for the fix. + ## data.table [v1.18.0](https://github.com/Rdatatable/data.table/milestone/37?closed=1) 23 December 2025 ### BREAKING CHANGE diff --git a/R/data.table.R b/R/data.table.R index f05220a62..54e86aac8 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2857,7 +2857,8 @@ setcolorder = function(x, neworder=key(x), before=NULL, after=NULL, skip_absent= set = function(x,i=NULL,j,value) # low overhead, loopable { # If removing columns from a table that's not selfrefok, need to call setalloccol first, #7488 - if ((is.null(value) || (is.list(value) && any(vapply_1b(value, is.null)))) && selfrefok(x, verbose=FALSE) < 1L) { + if (((is.null(value) || (is.list(value) && any(vapply_1b(value, is.null)))) && selfrefok(x, verbose=FALSE) < 1L) || + truelength(x) <= length(x)) { # automatically allocate more space when tl <= ncol (either full or loaded from disk) name = substitute(x) setalloccol(x, verbose=FALSE) if (is.name(name)) { diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index c7205e52a..06e307915 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -14797,7 +14797,7 @@ test(2016.1, name, "DT") test(2016.2, DT, data.table(a=1:3)) test(2016.3, DT[2,a:=4L], data.table(a=INT(1,4,3))) # no error for := when existing column test(2016.4, set(DT,3L,1L,5L), data.table(a=INT(1,4,5))) # no error for set() when existing column -test(2016.5, set(DT,2L,"newCol",5L), error="either been loaded from disk.*or constructed manually.*Please run setDT.*setalloccol.*on it first") # just set() +test(2016.5, set(DT,2L,"newCol",5L), data.table(a=INT(1,4,5), newCol=INT(NA,5L,NA))) # works since set overallocates #4100 test(2016.6, DT[2,newCol:=6L], data.table(a=INT(1,4,5), newCol=INT(NA,6L,NA))) # := ok (it changes DT in caller) unlink(tt) @@ -19478,7 +19478,7 @@ test(2290.4, DT[, `:=`(a = 2, c := 3)], error="It looks like you re-used `:=` in df = data.frame(a=1:3) setDT(df) attr(df, "att") = 1 -test(2291.1, set(df, NULL, "new", "new"), error="either been loaded from disk.*or constructed manually.*Please run setDT.*setalloccol.*on it first") +test(2291.1, set(df, NULL, "new", "new"), df(a=1:3, new="new")) # fixed when calling setalloccol before set #4100 # ns-qualified bysub error, #6493 DT = data.table(a = 1) @@ -21959,3 +21959,12 @@ test(2355.1, fread(txt, skip=0), data.table(V1 = c("b1", "c1"), a1 test(2355.2, fread(txt, skip=0, header=TRUE), data.table(V1 = c("b1", "c1"), a1 = c("b2", "c2"), a2 = c("b3", "c3")), warning="Added an extra default column name") test(2355.3, fread(txt, skip=0, header=FALSE), data.table(V1=character(), V2=character(), V3=character()), warning="Consider fill=TRUE") test(2355.4, fread(txt, skip=0, fill=TRUE), data.table(V1 = c("a1", "b1", "c1"), V2 = c("a2", "b2", "c2"), V3 = c("", "b3", "c3"))) + +# re-overallocate in set if quota is reached #496 #1831 #4100 +DT = data.table() +test(2356.1, options=c(datatable.alloccol=1L), {for (i in seq(10L)) set(DT, j = paste0("V",i), value = i); ncol(DT)}, 10L) +DT = structure(list(a = 1, b = 2), class = c("data.table", "data.frame")) +test(2356.2, options=c(datatable.alloccol=1L), set(DT, j="c", value=3), data.table(a=1, b=2, c=3)) +# ensure := and set are consistent if they need to overallocate +DT = data.table(); DT2 = data.table() +test(2356.3, options=c(datatable.alloccol=1L), {for (i in seq(10L)) set(DT, j = paste0("V",i), value = i)}, {for (i in seq(10)) DT2[, sprintf("V%d",i) := i]}) diff --git a/src/assign.c b/src/assign.c index 849cb08f2..7df1f916d 100644 --- a/src/assign.c +++ b/src/assign.c @@ -264,7 +264,7 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) // newcolnames : add these columns (if any) // cols : column names or numbers corresponding to the values to set // rows : row numbers to assign - R_len_t numToDo, targetlen, vlen, oldncol, oldtncol, coln, protecti=0, newcolnum, indexLength; + R_len_t numToDo, targetlen, vlen, oldncol, tl, coln, protecti=0, newcolnum, indexLength; SEXP targetcol, nullint, s, colnam, tmp, key, index, a, assignedNames, indexNames; bool verbose=GetVerbose(); int ndelete=0; // how many columns are being deleted @@ -445,21 +445,23 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) // out-of-memory. In that case the user will receive hard halt and know to rerun. if (length(newcolnames)) { if (!R_isResizable(dt)) error(_("This data.table has either been loaded from disk (e.g. using readRDS()/load()) or constructed manually (e.g. using structure()). Please run setDT() or setalloccol() on it first (to pre-allocate space for new columns) before assigning by reference to it.")); // #2996 - oldtncol = R_maxLength(dt); // TO DO: oldtncol can be just called tl now, as we won't realloc here any more. - - if (oldtncololdncol+10000L) warning(_("truelength (%d) is greater than 10,000 items over-allocated (length = %d). See ?truelength. If you didn't set the datatable.alloccol option very large, please report to data.table issue tracker including the result of sessionInfo()."),oldtncol, oldncol); - if (oldtncol < oldncol+LENGTH(newcolnames)) - internal_error(__func__, "input dt has not been allocated enough column slots. l=%d, tl=%d, adding %d", oldncol, oldtncol, LENGTH(newcolnames)); // # nocov + if (tl>oldncol+10000L) warning(_("truelength (%d) is greater than 10,000 items over-allocated (length = %d). See ?truelength. If you didn't set the datatable.alloccol option very large, please report to data.table issue tracker including the result of sessionInfo()."), tl, oldncol); if (!selfrefnamesok(dt,verbose)) error(_("It appears that at some earlier point, names of this data.table have been reassigned. Please ensure to use setnames() rather than names<- or colnames<-. Otherwise, please report to data.table issue tracker.")); // # nocov // Can growVector at this point easily enough, but it shouldn't happen in first place so leave it as // strong error message for now. - else if (R_maxLength(names) != oldtncol) + else if (R_maxLength(names) != tl) // Use (long long) to cast R_xlen_t to a fixed type to robustly avoid -Wformat compiler warnings, see #5768, PRId64 didn't work - internal_error(__func__, "selfrefnames is ok but maxLength(names) [%lld] != maxLength(dt) [%d]", (long long)R_maxLength(names), oldtncol); // # nocov + internal_error(__func__, "selfrefnames is ok but maxLength(names) [%lld] != maxLength(dt) [%d]", (long long)R_maxLength(names), tl); // # nocov if (!selfrefok(dt, verbose)) // #6410 setDT(dt) and subsequent attr<- can lead to invalid selfref error(_("It appears that at some earlier point, attributes of this data.table have been reassigned. Please use setattr(DT, name, value) rather than attr(DT, name) <- value. If that doesn't apply to you, please report your case to the data.table issue tracker.")); R_resizeVector(dt, oldncol+LENGTH(newcolnames)); From a5912aa73fb87ca67fd68d7495a1876bdac2b05b Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sun, 28 Dec 2025 01:23:54 +0100 Subject: [PATCH 2/7] use GetOption1 instead of GetOption --- src/assign.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/assign.c b/src/assign.c index 7df1f916d..a83cb9c8f 100644 --- a/src/assign.c +++ b/src/assign.c @@ -446,7 +446,7 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) if (length(newcolnames)) { if (!R_isResizable(dt)) error(_("This data.table has either been loaded from disk (e.g. using readRDS()/load()) or constructed manually (e.g. using structure()). Please run setDT() or setalloccol() on it first (to pre-allocate space for new columns) before assigning by reference to it.")); // #2996 tl = R_maxLength(dt); - int overAlloc = checkOverAlloc(GetOption(install("datatable.alloccol"), R_NilValue)); + int overAlloc = checkOverAlloc(GetOption1(install("datatable.alloccol"))); // perform re-overallocation since DT has too less columns #1831 #4100 if (tl < oldncol+LENGTH(newcolnames)) { // use MAX to mitigate case where data is loaded from disk (readRDS()/load()) or constructed manually (e.g. using structure()) From 32f60dd61745bad79762733e6b73bcf948cb922c Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sun, 28 Dec 2025 01:32:58 +0100 Subject: [PATCH 3/7] fix test --- inst/tests/tests.Rraw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 06e307915..994d8b967 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -19478,7 +19478,7 @@ test(2290.4, DT[, `:=`(a = 2, c := 3)], error="It looks like you re-used `:=` in df = data.frame(a=1:3) setDT(df) attr(df, "att") = 1 -test(2291.1, set(df, NULL, "new", "new"), df(a=1:3, new="new")) # fixed when calling setalloccol before set #4100 +test(2291.1, set(df, NULL, "new", "new"), setattr(data.table(a=1:3, new="new"), "att", 1)) # fixed when calling setalloccol before set #4100 # ns-qualified bysub error, #6493 DT = data.table(a = 1) From 2dbbd406dd1e1709b24c7c6019cc8422a2558512 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sun, 28 Dec 2025 01:41:23 +0100 Subject: [PATCH 4/7] change froll test --- inst/tests/froll.Rraw | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/inst/tests/froll.Rraw b/inst/tests/froll.Rraw index 3171c16a2..22e50682e 100644 --- a/inst/tests/froll.Rraw +++ b/inst/tests/froll.Rraw @@ -2084,7 +2084,8 @@ if (use.fork) { test(6010.772, .selfref.ok(ans[[2L]])) ans = frollapply(1:2, 2, function(x) list(data.table(x)), fill=list(data.table(NA)), simplify=FALSE) test(6010.773, !.selfref.ok(ans[[2L]][[1L]])) - test(6010.7731, set(ans[[2L]][[1L]],, "newcol", 1L), error="data.table has either been loaded from disk") + # deactivated by #5443 + # test(6010.7731, set(ans[[2L]][[1L]],, "newcol", 1L), error="data.table has either been loaded from disk") ans = lapply(ans, lapply, setDT) test(6010.774, .selfref.ok(ans[[2L]][[1L]])) ## fix after ans = frollapply(1:2, 2, function(x) list(data.table(x)), fill=list(data.table(NA)), simplify=function(x) lapply(x, lapply, setDT)) From 76675c38b80cc50fb44809e30952b3e52eeab474 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sun, 28 Dec 2025 02:28:13 +0100 Subject: [PATCH 5/7] remove assign change --- src/assign.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/src/assign.c b/src/assign.c index a83cb9c8f..849cb08f2 100644 --- a/src/assign.c +++ b/src/assign.c @@ -264,7 +264,7 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) // newcolnames : add these columns (if any) // cols : column names or numbers corresponding to the values to set // rows : row numbers to assign - R_len_t numToDo, targetlen, vlen, oldncol, tl, coln, protecti=0, newcolnum, indexLength; + R_len_t numToDo, targetlen, vlen, oldncol, oldtncol, coln, protecti=0, newcolnum, indexLength; SEXP targetcol, nullint, s, colnam, tmp, key, index, a, assignedNames, indexNames; bool verbose=GetVerbose(); int ndelete=0; // how many columns are being deleted @@ -445,23 +445,21 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) // out-of-memory. In that case the user will receive hard halt and know to rerun. if (length(newcolnames)) { if (!R_isResizable(dt)) error(_("This data.table has either been loaded from disk (e.g. using readRDS()/load()) or constructed manually (e.g. using structure()). Please run setDT() or setalloccol() on it first (to pre-allocate space for new columns) before assigning by reference to it.")); // #2996 - tl = R_maxLength(dt); - int overAlloc = checkOverAlloc(GetOption1(install("datatable.alloccol"))); - // perform re-overallocation since DT has too less columns #1831 #4100 - if (tl < oldncol+LENGTH(newcolnames)) { - // use MAX to mitigate case where data is loaded from disk (readRDS()/load()) or constructed manually (e.g. using structure()) - dt = shallow(dt,R_NilValue, MAX(oldncol-tl, 0) + oldncol + length(newcolnames) + overAlloc); - tl = TRUELENGTH(dt); - names = PROTECT(getAttrib(dt, R_NamesSymbol)); protecti++; + oldtncol = R_maxLength(dt); // TO DO: oldtncol can be just called tl now, as we won't realloc here any more. + + if (oldtncololdncol+10000L) warning(_("truelength (%d) is greater than 10,000 items over-allocated (length = %d). See ?truelength. If you didn't set the datatable.alloccol option very large, please report to data.table issue tracker including the result of sessionInfo()."), tl, oldncol); + if (oldtncol>oldncol+10000L) warning(_("truelength (%d) is greater than 10,000 items over-allocated (length = %d). See ?truelength. If you didn't set the datatable.alloccol option very large, please report to data.table issue tracker including the result of sessionInfo()."),oldtncol, oldncol); + if (oldtncol < oldncol+LENGTH(newcolnames)) + internal_error(__func__, "input dt has not been allocated enough column slots. l=%d, tl=%d, adding %d", oldncol, oldtncol, LENGTH(newcolnames)); // # nocov if (!selfrefnamesok(dt,verbose)) error(_("It appears that at some earlier point, names of this data.table have been reassigned. Please ensure to use setnames() rather than names<- or colnames<-. Otherwise, please report to data.table issue tracker.")); // # nocov // Can growVector at this point easily enough, but it shouldn't happen in first place so leave it as // strong error message for now. - else if (R_maxLength(names) != tl) + else if (R_maxLength(names) != oldtncol) // Use (long long) to cast R_xlen_t to a fixed type to robustly avoid -Wformat compiler warnings, see #5768, PRId64 didn't work - internal_error(__func__, "selfrefnames is ok but maxLength(names) [%lld] != maxLength(dt) [%d]", (long long)R_maxLength(names), tl); // # nocov + internal_error(__func__, "selfrefnames is ok but maxLength(names) [%lld] != maxLength(dt) [%d]", (long long)R_maxLength(names), oldtncol); // # nocov if (!selfrefok(dt, verbose)) // #6410 setDT(dt) and subsequent attr<- can lead to invalid selfref error(_("It appears that at some earlier point, attributes of this data.table have been reassigned. Please use setattr(DT, name, value) rather than attr(DT, name) <- value. If that doesn't apply to you, please report your case to the data.table issue tracker.")); R_resizeVector(dt, oldncol+LENGTH(newcolnames)); From abfde8f65a3a17ee229e4ace1a6f8ef75663c99b Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sun, 28 Dec 2025 16:15:15 +0100 Subject: [PATCH 6/7] add output statements to test loop --- inst/tests/tests.Rraw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 994d8b967..8cad916e3 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21967,4 +21967,4 @@ DT = structure(list(a = 1, b = 2), class = c("data.table", "data.frame")) test(2356.2, options=c(datatable.alloccol=1L), set(DT, j="c", value=3), data.table(a=1, b=2, c=3)) # ensure := and set are consistent if they need to overallocate DT = data.table(); DT2 = data.table() -test(2356.3, options=c(datatable.alloccol=1L), {for (i in seq(10L)) set(DT, j = paste0("V",i), value = i)}, {for (i in seq(10)) DT2[, sprintf("V%d",i) := i]}) +test(2356.3, options=c(datatable.alloccol=1L), {for (i in seq(10L)) set(DT, j = sprintf("V%d",i), value = i); DT}, {for (i in seq(10)) DT2[, sprintf("V%d",i) := i]; DT2}) From 21a356796e724e2740905f7a1bcb78e7d38eb1ec Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sun, 28 Dec 2025 16:21:28 +0100 Subject: [PATCH 7/7] add helper function --- R/data.table.R | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 54e86aac8..27c985e44 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2854,11 +2854,20 @@ setcolorder = function(x, neworder=key(x), before=NULL, after=NULL, skip_absent= invisible(x) } +.set_needs_alloccol = function(x, value) { + # automatically allocate more space when tl <= ncol (either full or loaded from disk) + if (truelength(x) <= length(x)) return(TRUE) + if (selfrefok(x, verbose=FALSE) >= 1L) return(FALSE) + # value can be NULL or list with NULLs inside + if (is.null(value)) return(TRUE) + if (!is.list(value)) return(FALSE) + any(vapply_1b(value, is.null)) +} + set = function(x,i=NULL,j,value) # low overhead, loopable { # If removing columns from a table that's not selfrefok, need to call setalloccol first, #7488 - if (((is.null(value) || (is.list(value) && any(vapply_1b(value, is.null)))) && selfrefok(x, verbose=FALSE) < 1L) || - truelength(x) <= length(x)) { # automatically allocate more space when tl <= ncol (either full or loaded from disk) + if (.set_needs_alloccol(x, value)) { name = substitute(x) setalloccol(x, verbose=FALSE) if (is.name(name)) {