Rdatatable · ben-schwen · Dec 27, 2025 · Dec 27, 2025 · Dec 27, 2025 · Dec 27, 2025
@@ -11,7 +11,7 @@ export(setindex, setindexv, indices)
 export(as.data.table,is.data.table,test.data.table)
 export(last,first,like,"%like%","%ilike%","%flike%","%plike%",between,"%between%",inrange,"%inrange%", "%notin%")
 export(timetaken)
-export(truelength, setalloccol, alloc.col, ":=", let)
+export(truelength, setalloccol, setallocrow, alloc.col, ":=", let)
 export(setattr, setnames, setcolorder, set, setDT, setDF)
 export(setorder, setorderv)
 export(setNumericRounding, getNumericRounding)
@@ -28,7 +28,7 @@ export(tstrsplit)
 export(frank)
 export(frankv)
 export(address)
-export(.SD,.N,.I,.GRP,.NGRP,.BY,.EACHI, measure, measurev, patterns)
+export(.SD,.N,.I,.GRP,.NGRP,.BY,.EACHI,.ROW, measure, measurev, patterns)
 # TODO(#6197): Export these.
 # export(., J)
 export(rleid)

@@ -11,7 +11,7 @@ methods::setPackageName("data.table",.global)
 #   (1) add to man/special-symbols.Rd
 #   (2) export() in NAMESPACE
 #   (3) add to vignettes/datatable-importing.Rmd#globals section
-.SD = .N = .I = .GRP = .NGRP = .BY = .EACHI = NULL
+.SD = .N = .I = .GRP = .NGRP = .BY = .EACHI = .ROW = NULL
 # These are exported to prevent NOTEs from R CMD check, and checkUsage via compiler.
 # But also exporting them makes it clear (to users and other packages) that data.table uses these as symbols.
 # And NULL makes it clear (to the R's mask check on loading) that they're variables not functions.
@@ -1182,6 +1182,21 @@ replace_dot_alias = function(e) {
           names(jsub)=""
           jsub[[1L]]=as.name("list")
         }
+
+        # .ROW := NULL deletes the rows selected by i from x by reference (#635); any other RHS for .ROW is an error.
+        # !is.na(lhs) matters: for an NA LHS, lhs==".ROW" is NA and if() would stop with a cryptic
+        #   'missing value where TRUE/FALSE needed' instead of reaching the regular LHS handling further down.
+        if ((is.character(lhs) && length(lhs)==1L && !is.na(lhs) && lhs==".ROW") ||
+            (is.name(lhs) && identical(lhs, quote(.ROW)))) {
+          if (!is.null(jsub))  # a literal NULL on the RHS is NULL itself in the parsed call, so is.null() suffices
+            stopf(".ROW can only be used with := NULL to delete rows")
+          if (is.null(irows))
+            stopf(".ROW := NULL requires i= condition to specify rows to delete")
+          if (!missingby)
+            stopf(".ROW := NULL with 'by' or 'keyby' is not supported yet")
+          .Call(CdeleteRows, x, irows)
+          return(suppPrint(x))
+        }
         av = all.vars(jsub,TRUE)
         if (!is.atomic(lhs)) stopf("LHS of := must be a symbol, or an atomic vector (column names or positions).")
         if (is.character(lhs)) {
@@ -2719,6 +2734,10 @@ selfrefok = function(DT,verbose=getOption("datatable.verbose")) {
   .Call(Cselfrefokwrapper,DT,verbose)
 }
 
+setallocrow = function(DT) {  # make the columns of DT resizable by reference, ahead of row deletion; see ?truelength
+  .Call(Callocrow, DT)  # all work is done in C; presumably allocrow() in deleterows.c -- confirm registration
+}
+
 truelength = function(x) .Call(Ctruelength,x)
 # deliberately no "truelength<-" method.  setalloccol is the mechanism for that.
 # settruelength() no longer need (and so removed) now that data.table depends on R 2.14.0

@@ -21959,3 +21959,96 @@ test(2355.1, fread(txt, skip=0),              data.table(V1 = c("b1", "c1"), a1
 test(2355.2, fread(txt, skip=0, header=TRUE), data.table(V1 = c("b1", "c1"), a1 = c("b2", "c2"), a2 = c("b3", "c3")), warning="Added an extra default column name")
 test(2355.3, fread(txt, skip=0, header=FALSE), data.table(V1=character(), V2=character(), V3=character()), warning="Consider fill=TRUE")
 test(2355.4, fread(txt, skip=0, fill=TRUE), data.table(V1 = c("a1", "b1", "c1"), V2 = c("a2", "b2", "c2"), V3 = c("", "b3", "c3")))
+
+# delete rows by reference #635
+# atomic types and list columns
+dt = data.table(
+  int = 1:5,
+  real = c(1.1, 2.2, 3.3, 4.4, 5.5),
+  char = letters[1:5],
+  lgl = c(TRUE, FALSE, TRUE, FALSE, TRUE),
+  cplx = as.complex(1:5),
+  raw_col = as.raw(1:5),
+  list_col = list(1L, 1:2, 1:3, 1:4, 1:5)
+)
+test(2356.01, copy(dt)[1L, .ROW := NULL], dt[-1])
+test(2356.02, copy(dt)[1, .ROW := NULL], dt[-1])
+test(2356.03, copy(dt)[c(TRUE, FALSE, FALSE, TRUE, FALSE), .ROW := NULL], dt[-c(1,4)])
+test(2356.04, copy(dt)[int==1L, .ROW := NULL], dt[-1])
+test(2356.05, copy(dt)[int<2L, .ROW := NULL], dt[-1])
+test(2356.06, copy(dt)[-1, .ROW := NULL], dt[1])
+# zero row or empty data.tables
+dt = data.table()
+test(2356.07, dt[logical(0), .ROW := NULL], dt)
+dt = data.table(a=integer(0), b=character(0))
+test(2356.08, dt[logical(0), .ROW := NULL], dt)
+# multirow
+dt = data.table(a=1:5, b=letters[1:5])
+test(2356.09, copy(dt)[c(1L, 3L), .ROW := NULL], dt[c(2,4,5)])
+test(2356.10, copy(dt)[c(TRUE, FALSE, TRUE, FALSE, TRUE), .ROW := NULL], dt[c(2,4)])
+test(2356.11, copy(dt)[1:2, .ROW := NULL], dt[3:5])
+test(2356.12, copy(dt)[1:5, .ROW := NULL], dt[0])
+# NA handling and edge cases
+dt = data.table(a=1:5, b=letters[1:5])
+test(2356.13, copy(dt)[c(1L, NA_integer_, 3L), .ROW := NULL], dt[c(2,4,5)])
+test(2356.14, copy(dt)[c(NA_integer_, NA_integer_), .ROW := NULL], dt)
+test(2356.15, copy(dt)[c(TRUE, NA, FALSE, NA, TRUE), .ROW := NULL], dt[c(2,3,4)])
+test(2356.16, copy(dt)[integer(0), .ROW := NULL], dt)
+test(2356.17, copy(dt)[logical(0), .ROW := NULL], dt)
+test(2356.18, copy(dt)[c(FALSE, FALSE, FALSE, FALSE, FALSE), .ROW := NULL], dt)
+test(2356.19, copy(dt)[a > 100, .ROW := NULL], dt)  # no matches
+# Duplicate indices
+dt = data.table(a=1:5, b=letters[1:5])
+test(2356.20, copy(dt)[c(1L, 1L), .ROW := NULL], dt[-1])
+test(2356.21, copy(dt)[c(1L, 1L, 2L, 2L), .ROW := NULL], dt[3:5])
+test(2356.22, copy(dt)[c(3L, 1L, 3L, 1L), .ROW := NULL], dt[c(2,4,5)])
+# integer64
+if (test_bit64) {
+  dt = data.table(a=1:5, b=as.integer64(11:15))
+  test(2356.23, copy(dt)[c(1L, 3L), .ROW := NULL], dt[-c(1L,3L)])
+  test(2356.24, copy(dt)[1:5, .ROW := NULL], data.table(a=integer(0), b=integer64(0)))
+}
+# Date/IDate/ITime columns
+dt = data.table(a=1:5, d=as.Date("2024-01-01") + 0:4, t=as.ITime(paste0(10:14, ":00:00")), dt=as.POSIXct("2024-01-01 12:00:00") + 3600*0:4)
+test(2356.25, copy(dt)[c(1L, 3L), .ROW := NULL], dt[c(2,4,5)])
+test(2356.26, copy(dt)[c(2L, 4L), .ROW := NULL]$d, as.Date("2024-01-01") + c(0,2,4))
+# Factor columns
+dt = data.table(a=1:5, f=factor(letters[1:5], levels=letters[1:10]))
+test(2356.27, copy(dt)[c(1L, 3L), .ROW := NULL], dt[-c(1L,3L)])
+test(2356.28, levels(copy(dt)[c(1L, 3L), .ROW := NULL]$f), letters[1:10])
+dt = data.table(a=1:5, of=ordered(letters[1:5], levels=letters[5:1]))
+test(2356.29, copy(dt)[c(2L, 4L), .ROW := NULL], dt[-c(2L,4L)])
+test(2356.30, is.ordered(copy(dt)[c(2, 4L), .ROW := NULL]$of))
+# Keys - should be cleared after deletion
+dt = data.table(a=5:1, b=letters[1:5], key="a")
+test(2356.31, key(copy(dt)[1L, .ROW := NULL]), NULL)
+test(2356.32, haskey(copy(dt)[1L, .ROW := NULL]), FALSE)
+# Indices - should be cleared after deletion
+dt = data.table(a=1:5, b=letters[1:5], c=5:1)
+setindex(dt, b)
+test(2356.33, indices(copy(dt)[1L, .ROW := NULL]), NULL)
+# row names
+dt = data.table(a=1:5, b=letters[1:5])
+test(2356.34, attr(copy(dt)[c(1L, 3L), .ROW := NULL], "row.names"), 1:3)
+# selfref check
+test(2356.35, selfrefok(copy(dt)[1L, .ROW := NULL]), 1L)
+# errors
+dt = data.table(a=1:4, g=1:2)
+test(2356.36, dt[1L, .ROW := 1L], error=".ROW can only be used with := NULL")
+test(2356.37, dt[1L, .ROW := "delete"], error=".ROW can only be used with := NULL")
+test(2356.38, dt[1L, .ROW := FALSE], error=".ROW can only be used with := NULL")
+test(2356.39, dt[, .ROW := NULL], error=".ROW := NULL requires i= condition")
+test(2356.40, dt[1L, .ROW := NULL, by=g], error=".ROW := NULL with 'by' or 'keyby' is not supported")
+# large table
+dt = data.table(a=1:20000, b=rep(letters, length.out=20000))
+idx = seq(1L, 20000L, by=2L)
+test(2356.41, copy(dt)[idx, .ROW := NULL], dt[-idx])
+# Chaining and more complex i expressions
+dt = data.table(a=1:10, b=letters[1:10])
+test(2356.42, copy(dt)[a>2, .ROW := NULL][b=="a"], data.table(a=1L, b="a"))
+test(2356.43, copy(dt)[a %% 2 == 0, .ROW := NULL], dt[a %% 2 != 0])
+test(2356.44, copy(dt)[!(a < 5 & b != "d"), .ROW := NULL], dt[1:3])
+# make columns resizable
+dt = data.table(a=1:3)
+test(2356.91, truelength(dt$a), 0L)
+test(2356.92, {setallocrow(dt); truelength(dt$a)}, 3L)
@@ -2,6 +2,7 @@
 \alias{truelength}
 \alias{setalloccol}
 \alias{alloc.col}
+\alias{setallocrow}
 \title{ Over-allocation access }
 \description{
     These functions are experimental and somewhat advanced. By \emph{experimental} we mean their names might change and perhaps the syntax, argument names and types. So if you write a lot of code using them, you have been warned! They should work and be stable, though, so please report problems with them. \code{alloc.col} is just an alias to \code{setalloccol}. We recommend to use \code{setalloccol} (though \code{alloc.col} will continue to be supported) because the \code{set*} prefix in \code{setalloccol} makes it clear that its input argument is modified in-place.
@@ -14,6 +15,7 @@ setalloccol(DT,
 alloc.col(DT,
     n = getOption("datatable.alloccol"),        # default: 1024L
     verbose = getOption("datatable.verbose"))   # default: FALSE
+setallocrow(DT)
 }
 \arguments{
 \item{x}{ Any type of vector, including \code{data.table} which is a \code{list} vector of column pointers. }
@@ -34,6 +36,11 @@ alloc.col(DT,
     (perhaps in your .Rprofile); e.g., \code{options(datatable.alloccol=10000L)}.
 
     Please note: over-allocation of the column pointer vector is not for efficiency \emph{per se}; it is so that \code{:=} can add columns by reference without a shallow copy.
+
+    \code{setallocrow} is a utility function that prepares columns for fast row operations by reference (currently deletion; insertion is not implemented yet).
+    Before deleting or inserting rows by reference, columns must be resizable.
+    \code{setallocrow} ensures all columns are in a resizable state by converting ALTREP columns to materialized form and duplicating
+    non-resizable columns as resizable vectors. This operation modifies \code{DT} by reference.
 }
 \value{
     \code{truelength(x)} returns the length of the vector allocated in memory. \code{length(x)} of those items are in use. Currently, it is just the list vector of column
@@ -43,6 +50,8 @@ alloc.col(DT,
 
     \code{setalloccol} \emph{reallocates} \code{DT} by reference. This may be useful for efficiency if you know you are about to going to add a lot of columns in a loop.
     It also returns the new \code{DT}, for convenience in compound queries.
+
+    \code{setallocrow} modifies \code{DT} by reference to ensure all columns are resizable.
 }
 \seealso{ \code{\link{copy}} }
 \examples{

@@ -496,7 +496,7 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values)
           Rprintf(_("RHS for item %d has been duplicated because MAYBE_REFERENCED==%d MAYBE_SHARED==%d ALTREP==%d, but then is being plonked. length(values)==%d; length(cols)==%d\n"),
                   i+1, MAYBE_REFERENCED(thisvalue), MAYBE_SHARED(thisvalue), ALTREP(thisvalue), length(values), length(cols));
         }
-        thisvalue = copyAsPlain(thisvalue);   // PROTECT not needed as assigned as element to protected list below.
+        thisvalue = copyAsPlain(thisvalue, false);   // PROTECT not needed as assigned as element to protected list below.
       } else {
         if (verbose) Rprintf(_("Direct plonk of unnamed RHS, no copy. MAYBE_REFERENCED==%d, MAYBE_SHARED==%d\n"), MAYBE_REFERENCED(thisvalue), MAYBE_SHARED(thisvalue));  // e.g. DT[,a:=as.character(a)] as tested by 754.5
       }

@@ -52,7 +52,7 @@ SEXP coalesce(SEXP x, SEXP inplaceArg, SEXP nan_is_na_arg) {
       error(_("Item %d is length %d but the first item is length %d. Only singletons are recycled."), i+2, length(item), nrow);
   }
   if (!inplace) {
-    first = PROTECT(copyAsPlain(first)); nprotect++;
+    first = PROTECT(copyAsPlain(first, false)); nprotect++;
     if (verbose) Rprintf(_("coalesce copied first item (inplace=FALSE)\n"));
   }
   const void **valP = (const void **)R_alloc(nval, sizeof(*valP));

@@ -183,6 +183,10 @@ void subsetVectorRaw(SEXP ans, SEXP source, SEXP idx, const bool anyNA);
 SEXP subsetVector(SEXP x, SEXP idx);
 const char *check_idx(SEXP idx, int max, bool *anyNA_out, bool *orderedSubset_out);
 
+// deleterows.c
+SEXP deleteRows(SEXP dt, SEXP rows_to_delete);
+SEXP allocrow(SEXP dt);
+
 // fcast.c
 SEXP int_vec_init(R_len_t n, int val);
 
@@ -321,7 +325,7 @@ SEXP fitsInInt64R(SEXP x);
 bool allNA(SEXP x, bool errorForBadType);
 SEXP colnamesInt(SEXP x, SEXP cols, SEXP check_dups, SEXP skip_absent);
 bool INHERITS(SEXP x, SEXP char_);
-SEXP copyAsPlain(SEXP x);
+SEXP copyAsPlain(SEXP x, bool resizable);
 void copySharedColumns(SEXP x);
 SEXP lock(SEXP x);
 SEXP unlock(SEXP x);