Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ export(setindex, setindexv, indices)
export(as.data.table,is.data.table,test.data.table)
export(last,first,like,"%like%","%ilike%","%flike%","%plike%",between,"%between%",inrange,"%inrange%", "%notin%")
export(timetaken)
export(truelength, setalloccol, alloc.col, ":=", let)
export(truelength, setalloccol, setallocrow, alloc.col, ":=", let)
export(setattr, setnames, setcolorder, set, setDT, setDF)
export(setorder, setorderv)
export(setNumericRounding, getNumericRounding)
Expand All @@ -28,7 +28,7 @@ export(tstrsplit)
export(frank)
export(frankv)
export(address)
export(.SD,.N,.I,.GRP,.NGRP,.BY,.EACHI, measure, measurev, patterns)
export(.SD,.N,.I,.GRP,.NGRP,.BY,.EACHI,.ROW, measure, measurev, patterns)
# TODO(#6197): Export these.
# export(., J)
export(rleid)
Expand Down
21 changes: 20 additions & 1 deletion R/data.table.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ methods::setPackageName("data.table",.global)
# (1) add to man/special-symbols.Rd
# (2) export() in NAMESPACE
# (3) add to vignettes/datatable-importing.Rmd#globals section
.SD = .N = .I = .GRP = .NGRP = .BY = .EACHI = NULL
.SD = .N = .I = .GRP = .NGRP = .BY = .EACHI = .ROW = NULL
# These are exported to prevent NOTEs from R CMD check, and checkUsage via compiler.
# But also exporting them makes it clear (to users and other packages) that data.table uses these as symbols.
# And NULL makes it clear (to the R's mask check on loading) that they're variables not functions.
Expand Down Expand Up @@ -1182,6 +1182,21 @@ replace_dot_alias = function(e) {
names(jsub)=""
jsub[[1L]]=as.name("list")
}

# Check for .ROW := NULL pattern (delete rows by reference)
if ((is.character(lhs) && length(lhs)==1L && lhs==".ROW") ||
(is.name(lhs) && identical(lhs, quote(.ROW)))) {
if (is.null(jsub) || identical(jsub, quote(NULL))) {
if (is.null(irows))
stopf(".ROW := NULL requires i= condition to specify rows to delete")
if (!missingby)
stopf(".ROW := NULL with 'by' or 'keyby' is not supported yet")
.Call(CdeleteRows, x, irows)
return(suppPrint(x))
} else {
stopf(".ROW can only be used with := NULL to delete rows")
}
}
av = all.vars(jsub,TRUE)
if (!is.atomic(lhs)) stopf("LHS of := must be a symbol, or an atomic vector (column names or positions).")
if (is.character(lhs)) {
Expand Down Expand Up @@ -2719,6 +2734,10 @@ selfrefok = function(DT,verbose=getOption("datatable.verbose")) {
.Call(Cselfrefokwrapper,DT,verbose)
}

# Make the columns of DT resizable, by reference, by delegating to the C routine
# Callocrow (see ?setallocrow). The value of the .Call is returned as-is.
setallocrow = function(DT) .Call(Callocrow, DT)

# Report the allocated (as opposed to in-use) length of x, as computed by the
# C routine Ctruelength.
truelength = function(x) {
  .Call(Ctruelength, x)
}
# deliberately no "truelength<-" method. setalloccol is the mechanism for that.
# settruelength() is no longer needed (and so was removed) now that data.table depends on R 2.14.0
Expand Down
93 changes: 93 additions & 0 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -21959,3 +21959,96 @@ test(2355.1, fread(txt, skip=0), data.table(V1 = c("b1", "c1"), a1
test(2355.2, fread(txt, skip=0, header=TRUE), data.table(V1 = c("b1", "c1"), a1 = c("b2", "c2"), a2 = c("b3", "c3")), warning="Added an extra default column name")
test(2355.3, fread(txt, skip=0, header=FALSE), data.table(V1=character(), V2=character(), V3=character()), warning="Consider fill=TRUE")
test(2355.4, fread(txt, skip=0, fill=TRUE), data.table(V1 = c("a1", "b1", "c1"), V2 = c("a2", "b2", "c2"), V3 = c("", "b3", "c3")))

# delete rows by reference #635
# atomic types and list columns
dt = data.table(
int = 1:5,
real = c(1.1, 2.2, 3.3, 4.4, 5.5),
char = letters[1:5],
lgl = c(TRUE, FALSE, TRUE, FALSE, TRUE),
cplx = as.complex(1:5),
raw_col = as.raw(1:5),
list_col = list(1L, 1:2, 1:3, 1:4, 1:5)
)
test(2356.01, copy(dt)[1L, .ROW := NULL], dt[-1])
test(2356.02, copy(dt)[1, .ROW := NULL], dt[-1])
test(2356.03, copy(dt)[c(TRUE, FALSE, FALSE, TRUE, FALSE), .ROW := NULL], dt[-c(1,4)])
test(2356.04, copy(dt)[int==1L, .ROW := NULL], dt[-1])
test(2356.05, copy(dt)[int<2L, .ROW := NULL], dt[-1])
test(2356.06, copy(dt)[-1, .ROW := NULL], dt[1])
# zero row or empty data.tables
dt = data.table()
test(2356.07, dt[logical(0), .ROW := NULL], dt)
dt = data.table(a=integer(0), b=character(0))
test(2356.08, dt[logical(0), .ROW := NULL], dt)
# multirow
dt = data.table(a=1:5, b=letters[1:5])
test(2356.09, copy(dt)[c(1L, 3L), .ROW := NULL], dt[c(2,4,5)])
test(2356.10, copy(dt)[c(TRUE, FALSE, TRUE, FALSE, TRUE), .ROW := NULL], dt[c(2,4)])
test(2356.11, copy(dt)[1:2, .ROW := NULL], dt[3:5])
test(2356.12, copy(dt)[1:5, .ROW := NULL], dt[0])
# NA handling and edge cases
dt = data.table(a=1:5, b=letters[1:5])
test(2356.13, copy(dt)[c(1L, NA_integer_, 3L), .ROW := NULL], dt[c(2,4,5)])
test(2356.14, copy(dt)[c(NA_integer_, NA_integer_), .ROW := NULL], dt)
test(2356.15, copy(dt)[c(TRUE, NA, FALSE, NA, TRUE), .ROW := NULL], dt[c(2,3,4)])
test(2356.16, copy(dt)[integer(0), .ROW := NULL], dt)
test(2356.17, copy(dt)[logical(0), .ROW := NULL], dt)
test(2356.18, copy(dt)[c(FALSE, FALSE, FALSE, FALSE, FALSE), .ROW := NULL], dt)
test(2356.19, copy(dt)[a > 100, .ROW := NULL], dt) # no matches
# Duplicate indices
dt = data.table(a=1:5, b=letters[1:5])
test(2356.20, copy(dt)[c(1L, 1L), .ROW := NULL], dt[-1])
test(2356.21, copy(dt)[c(1L, 1L, 2L, 2L), .ROW := NULL], dt[3:5])
test(2356.22, copy(dt)[c(3L, 1L, 3L, 1L), .ROW := NULL], dt[c(2,4,5)])
# integer64
if (test_bit64) {
dt = data.table(a=1:5, b=as.integer64(11:15))
test(2356.23, copy(dt)[c(1L, 3L), .ROW := NULL], dt[-c(1L,3L)])
test(2356.24, copy(dt)[1:5, .ROW := NULL], data.table(a=integer(0), b=integer64(0)))
}
# Date/IDate/ITime columns
dt = data.table(a=1:5, d=as.Date("2024-01-01") + 0:4, t=as.ITime(paste0(10:14, ":00:00")), dt=as.POSIXct("2024-01-01 12:00:00") + 3600*0:4)
test(2356.25, copy(dt)[c(1L, 3L), .ROW := NULL], dt[c(2,4,5)])
test(2356.26, copy(dt)[c(2L, 4L), .ROW := NULL]$d, as.Date("2024-01-01") + c(0,2,4))
# Factor columns
dt = data.table(a=1:5, f=factor(letters[1:5], levels=letters[1:10]))
test(2356.27, copy(dt)[c(1L, 3L), .ROW := NULL], dt[-c(1L,3L)])
test(2356.28, levels(copy(dt)[c(1L, 3L), .ROW := NULL]$f), letters[1:10])
dt = data.table(a=1:5, of=ordered(letters[1:5], levels=letters[5:1]))
test(2356.29, copy(dt)[c(2L, 4L), .ROW := NULL], dt[-c(2L,4L)])
test(2356.30, is.ordered(copy(dt)[c(2, 4L), .ROW := NULL]$of))
# Keys - should be cleared after deletion
dt = data.table(a=5:1, b=letters[1:5], key="a")
test(2356.31, key(copy(dt)[1L, .ROW := NULL]), NULL)
test(2356.32, haskey(copy(dt)[1L, .ROW := NULL]), FALSE)
# Indices - should be cleared after deletion
dt = data.table(a=1:5, b=letters[1:5], c=5:1)
setindex(dt, b)
test(2356.33, indices(copy(dt)[1L, .ROW := NULL]), NULL)
# row names
dt = data.table(a=1:5, b=letters[1:5])
test(2356.34, attr(copy(dt)[c(1L, 3L), .ROW := NULL], "row.names"), 1:3)
# selfref check
test(2356.35, selfrefok(copy(dt)[1L, .ROW := NULL]), 1L)
# errors
dt = data.table(a=1:4, g=1:2)
test(2356.36, dt[1L, .ROW := 1L], error=".ROW can only be used with := NULL")
test(2356.37, dt[1L, .ROW := "delete"], error=".ROW can only be used with := NULL")
test(2356.38, dt[1L, .ROW := FALSE], error=".ROW can only be used with := NULL")
test(2356.39, dt[, .ROW := NULL], error=".ROW := NULL requires i= condition")
test(2356.40, dt[1L, .ROW := NULL, by=g], error=".ROW := NULL with 'by' or 'keyby' is not supported")
# large table
dt = data.table(a=1:20000, b=rep(letters, length.out=20000))
idx = seq(1L, 20000L, by=2L)
test(2356.41, copy(dt)[idx, .ROW := NULL], dt[-idx])
# Chaining and more complex i expressions
dt = data.table(a=1:10, b=letters[1:10])
test(2356.42, copy(dt)[a>2, .ROW := NULL][b=="a"], data.table(a=1L, b="a"))
test(2356.43, copy(dt)[a %% 2 == 0, .ROW := NULL], dt[a %% 2 != 0])
test(2356.44, copy(dt)[!(a < 5 & b != "d"), .ROW := NULL], dt[1:3])
# make columns resizable
dt = data.table(a=1:3)
test(2356.91, truelength(dt$a), 0L)
test(2356.92, {setallocrow(dt); truelength(dt$a)}, 3L)
9 changes: 9 additions & 0 deletions man/truelength.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
\alias{truelength}
\alias{setalloccol}
\alias{alloc.col}
\alias{setallocrow}
\title{ Over-allocation access }
\description{
These functions are experimental and somewhat advanced. By \emph{experimental} we mean their names might change and perhaps the syntax, argument names and types. So if you write a lot of code using them, you have been warned! They should work and be stable, though, so please report problems with them. \code{alloc.col} is just an alias for \code{setalloccol}. We recommend using \code{setalloccol} (though \code{alloc.col} will continue to be supported) because the \code{set*} prefix in \code{setalloccol} makes it clear that its input argument is modified in-place.
Expand All @@ -14,6 +15,7 @@ setalloccol(DT,
alloc.col(DT,
n = getOption("datatable.alloccol"), # default: 1024L
verbose = getOption("datatable.verbose")) # default: FALSE
setallocrow(DT)
}
\arguments{
\item{x}{ Any type of vector, including \code{data.table} which is a \code{list} vector of column pointers. }
Expand All @@ -34,6 +36,11 @@ alloc.col(DT,
(perhaps in your .Rprofile); e.g., \code{options(datatable.alloccol=10000L)}.

Please note: over-allocation of the column pointer vector is not for efficiency \emph{per se}; it is so that \code{:=} can add columns by reference without a shallow copy.

\code{setallocrow} is a utility function that prepares columns for fast row operations by reference (deletion; insertion is not implemented yet).
Before rows can be deleted or inserted by reference, all columns must be resizable.
\code{setallocrow} ensures this by converting ALTREP columns to materialized form and duplicating
non-resizable columns as resizable vectors. This operation modifies \code{DT} by reference.
}
\value{
\code{truelength(x)} returns the length of the vector allocated in memory. \code{length(x)} of those items are in use. Currently, it is just the list vector of column
Expand All @@ -43,6 +50,8 @@ alloc.col(DT,

\code{setalloccol} \emph{reallocates} \code{DT} by reference. This may be useful for efficiency if you know you are about to add a lot of columns in a loop.
It also returns the new \code{DT}, for convenience in compound queries.

\code{setallocrow} modifies \code{DT} by reference to ensure all columns are resizable.
}
\seealso{ \code{\link{copy}} }
\examples{
Expand Down
2 changes: 1 addition & 1 deletion src/assign.c
Original file line number Diff line number Diff line change
Expand Up @@ -496,7 +496,7 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values)
Rprintf(_("RHS for item %d has been duplicated because MAYBE_REFERENCED==%d MAYBE_SHARED==%d ALTREP==%d, but then is being plonked. length(values)==%d; length(cols)==%d\n"),
i+1, MAYBE_REFERENCED(thisvalue), MAYBE_SHARED(thisvalue), ALTREP(thisvalue), length(values), length(cols));
}
thisvalue = copyAsPlain(thisvalue); // PROTECT not needed as assigned as element to protected list below.
thisvalue = copyAsPlain(thisvalue, false); // PROTECT not needed as assigned as element to protected list below.
} else {
if (verbose) Rprintf(_("Direct plonk of unnamed RHS, no copy. MAYBE_REFERENCED==%d, MAYBE_SHARED==%d\n"), MAYBE_REFERENCED(thisvalue), MAYBE_SHARED(thisvalue)); // e.g. DT[,a:=as.character(a)] as tested by 754.5
}
Expand Down
2 changes: 1 addition & 1 deletion src/coalesce.c
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ SEXP coalesce(SEXP x, SEXP inplaceArg, SEXP nan_is_na_arg) {
error(_("Item %d is length %d but the first item is length %d. Only singletons are recycled."), i+2, length(item), nrow);
}
if (!inplace) {
first = PROTECT(copyAsPlain(first)); nprotect++;
first = PROTECT(copyAsPlain(first, false)); nprotect++;
if (verbose) Rprintf(_("coalesce copied first item (inplace=FALSE)\n"));
}
const void **valP = (const void **)R_alloc(nval, sizeof(*valP));
Expand Down
6 changes: 5 additions & 1 deletion src/data.table.h
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,10 @@ void subsetVectorRaw(SEXP ans, SEXP source, SEXP idx, const bool anyNA);
SEXP subsetVector(SEXP x, SEXP idx);
const char *check_idx(SEXP idx, int max, bool *anyNA_out, bool *orderedSubset_out);

// deleterows.c
SEXP deleteRows(SEXP dt, SEXP rows_to_delete);
SEXP allocrow(SEXP dt);

// fcast.c
SEXP int_vec_init(R_len_t n, int val);

Expand Down Expand Up @@ -321,7 +325,7 @@ SEXP fitsInInt64R(SEXP x);
bool allNA(SEXP x, bool errorForBadType);
SEXP colnamesInt(SEXP x, SEXP cols, SEXP check_dups, SEXP skip_absent);
bool INHERITS(SEXP x, SEXP char_);
SEXP copyAsPlain(SEXP x);
SEXP copyAsPlain(SEXP x, bool resizable);
void copySharedColumns(SEXP x);
SEXP lock(SEXP x);
SEXP unlock(SEXP x);
Expand Down
Loading
Loading