diff --git a/NEWS.md b/NEWS.md index 17d97ee17..2ac50f19a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -24,6 +24,8 @@ 1. `fread()` with `skip=0` and `(header=TRUE|FALSE)` no longer skips the first row when it has fewer fields than subsequent rows, [#7463](https://github.com/Rdatatable/data.table/issues/7463). Thanks @emayerhofer for the report and @ben-schwen for the fix. +2. `set()` now automatically pre-allocates new column slots if needed, similar to what `:=` already does, [#1831](https://github.com/Rdatatable/data.table/issues/1831) [#4100](https://github.com/Rdatatable/data.table/issues/4100). Thanks to @zachokeeffe and @tyner for the report and @ben-schwen for the fix. + ## data.table [v1.18.0](https://github.com/Rdatatable/data.table/milestone/37?closed=1) 23 December 2025 ### BREAKING CHANGE diff --git a/R/data.table.R b/R/data.table.R index f05220a62..27c985e44 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2854,10 +2854,20 @@ setcolorder = function(x, neworder=key(x), before=NULL, after=NULL, skip_absent= invisible(x) } +.set_needs_alloccol = function(x, value) { + # automatically allocate more space when tl <= ncol (either full or loaded from disk) + if (truelength(x) <= length(x)) return(TRUE) + if (selfrefok(x, verbose=FALSE) >= 1L) return(FALSE) + # value can be NULL or list with NULLs inside + if (is.null(value)) return(TRUE) + if (!is.list(value)) return(FALSE) + any(vapply_1b(value, is.null)) +} + set = function(x,i=NULL,j,value) # low overhead, loopable { # If removing columns from a table that's not selfrefok, need to call setalloccol first, #7488 - if ((is.null(value) || (is.list(value) && any(vapply_1b(value, is.null)))) && selfrefok(x, verbose=FALSE) < 1L) { + if (.set_needs_alloccol(x, value)) { name = substitute(x) setalloccol(x, verbose=FALSE) if (is.name(name)) { diff --git a/inst/tests/froll.Rraw b/inst/tests/froll.Rraw index 3171c16a2..22e50682e 100644 --- a/inst/tests/froll.Rraw +++ b/inst/tests/froll.Rraw @@ -2084,7 +2084,8 @@ if (use.fork) { test(6010.772, .selfref.ok(ans[[2L]])) ans = frollapply(1:2, 2, function(x) list(data.table(x)), fill=list(data.table(NA)), simplify=FALSE) test(6010.773, !.selfref.ok(ans[[2L]][[1L]])) - test(6010.7731, set(ans[[2L]][[1L]],, "newcol", 1L), error="data.table has either been loaded from disk") + # deactivated by #5443 + # test(6010.7731, set(ans[[2L]][[1L]],, "newcol", 1L), error="data.table has either been loaded from disk") ans = lapply(ans, lapply, setDT) test(6010.774, .selfref.ok(ans[[2L]][[1L]])) ## fix after ans = frollapply(1:2, 2, function(x) list(data.table(x)), fill=list(data.table(NA)), simplify=function(x) lapply(x, lapply, setDT)) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index c7205e52a..8cad916e3 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -14797,7 +14797,7 @@ test(2016.1, name, "DT") test(2016.2, DT, data.table(a=1:3)) test(2016.3, DT[2,a:=4L], data.table(a=INT(1,4,3))) # no error for := when existing column test(2016.4, set(DT,3L,1L,5L), data.table(a=INT(1,4,5))) # no error for set() when existing column -test(2016.5, set(DT,2L,"newCol",5L), error="either been loaded from disk.*or constructed manually.*Please run setDT.*setalloccol.*on it first") # just set() +test(2016.5, set(DT,2L,"newCol",5L), data.table(a=INT(1,4,5), newCol=INT(NA,5L,NA))) # works since set overallocates #4100 test(2016.6, DT[2,newCol:=6L], data.table(a=INT(1,4,5), newCol=INT(NA,6L,NA))) # := ok (it changes DT in caller) unlink(tt) @@ -19478,7 +19478,7 @@ test(2290.4, DT[, `:=`(a = 2, c := 3)], error="It looks like you re-used `:=` in df = data.frame(a=1:3) setDT(df) attr(df, "att") = 1 -test(2291.1, set(df, NULL, "new", "new"), error="either been loaded from disk.*or constructed manually.*Please run setDT.*setalloccol.*on it first") +test(2291.1, set(df, NULL, "new", "new"), setattr(data.table(a=1:3, new="new"), "att", 1)) # fixed when calling setalloccol before set #4100 # ns-qualified bysub error, #6493 DT = data.table(a = 1) @@ -21959,3 +21959,12 @@ test(2355.1, fread(txt, skip=0), data.table(V1 = c("b1", "c1"), a1 test(2355.2, fread(txt, skip=0, header=TRUE), data.table(V1 = c("b1", "c1"), a1 = c("b2", "c2"), a2 = c("b3", "c3")), warning="Added an extra default column name") test(2355.3, fread(txt, skip=0, header=FALSE), data.table(V1=character(), V2=character(), V3=character()), warning="Consider fill=TRUE") test(2355.4, fread(txt, skip=0, fill=TRUE), data.table(V1 = c("a1", "b1", "c1"), V2 = c("a2", "b2", "c2"), V3 = c("", "b3", "c3"))) + +# re-overallocate in set if quota is reached #496 #1831 #4100 +DT = data.table() +test(2356.1, options=c(datatable.alloccol=1L), {for (i in seq(10L)) set(DT, j = paste0("V",i), value = i); ncol(DT)}, 10L) +DT = structure(list(a = 1, b = 2), class = c("data.table", "data.frame")) +test(2356.2, options=c(datatable.alloccol=1L), set(DT, j="c", value=3), data.table(a=1, b=2, c=3)) +# ensure := and set are consistent if they need to overallocate +DT = data.table(); DT2 = data.table() +test(2356.3, options=c(datatable.alloccol=1L), {for (i in seq(10L)) set(DT, j = sprintf("V%d",i), value = i); DT}, {for (i in seq(10)) DT2[, sprintf("V%d",i) := i]; DT2})