Given an arbitrary list of column names in a data.table, I want to concatenate the contents of those columns into a single string stored in a new column.
I don't know how representative the sample data is of your actual data, but with your sampled data you can achieve a substantial performance improvement by concatenating each unique combination of ConcatCols only once instead of once per row.
For the sample data, that means ~500k concatenations versus 10 million if you do all the duplicates too.
See the following code and timing example:
system.time({
setkeyv(DT, ConcatCols)
DTunique <- unique(DT[, ConcatCols, with=FALSE], by = key(DT))
DTunique[, State := do.call(paste, c(DTunique, sep = ""))]
DT[DTunique, State := i.State, on = ConcatCols]
})
# user system elapsed
# 7.448 0.462 4.618
About half the time is spent on the setkeyv call. If your data is already keyed, the time is cut down further, to just over 2 seconds.
setkeyv(DT, ConcatCols)
system.time({
DTunique <- unique(DT[, ConcatCols, with=FALSE], by = key(DT))
DTunique[, State := do.call(paste, c(DTunique, sep = ""))]
DT[DTunique, State := i.State, on = ConcatCols]
})
# user system elapsed
# 2.526 0.280 2.181
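To double-check that the join assigned the right string to every row, a quick sanity check (just a sketch, assuming ConcatCols is a character vector of column names as used in setkeyv above) could be:
# Optional sanity check: the join-based result should match a direct paste
DT[, StateCheck := do.call(paste, c(.SD, sep = "")), .SDcols = ConcatCols]
stopifnot(identical(DT$State, DT$StateCheck))
DT[, StateCheck := NULL]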
This uses unite
from package tidyr
. It may not be the fastest, but it is probably faster than hand-coded R code. Note that remove = FALSE keeps the original columns; by default unite drops them.
library(tidyr)
system.time(
DNew <- DT %>% unite(State, ConcatCols, sep = "", remove = FALSE)
)
# user system elapsed
# 14.974 0.183 15.343
DNew[1:10]
# State x y a b c d e f
# 1: foo211621bar foo bar 2 1 1 6 2 1
# 2: foo532735bar foo bar 5 3 2 7 3 5
# 3: foo965776bar foo bar 9 6 5 7 7 6
# 4: foo221284bar foo bar 2 2 1 2 8 4
# 5: foo485976bar foo bar 4 8 5 9 7 6
# 6: foo566778bar foo bar 5 6 6 7 7 8
# 7: foo892636bar foo bar 8 9 2 6 3 6
# 8: foo836672bar foo bar 8 3 6 6 7 2
# 9: foo963926bar foo bar 9 6 3 9 2 6
# 10: foo385216bar foo bar 3 8 5 2 1 6
C to the rescue!
Stealing some code from data.table, we can write a C function that works way faster (and could be parallelized to be even faster).
First make sure you have a working C++ toolchain with:
library(inline)
fx <- inline::cfunction( signature(x = "integer", y = "numeric" ) , '
return ScalarReal( INTEGER(x)[0] * REAL(y)[0] ) ;
' )
fx( 2L, 5 ) #Should return 10
Then this should work (assuming integer-only data, but the code can be extended to other types):
library(inline)
library(data.table)
library(stringi)
header <- "
//Taken from https://github.com/Rdatatable/data.table/blob/master/src/fwrite.c
static inline void reverse(char *upp, char *low)
{
upp--;
while (upp>low) {
char tmp = *upp;
*upp = *low;
*low = tmp;
upp--;
low++;
}
}
void writeInt32(int *col, size_t row, char **pch)
{
char *ch = *pch;
int x = col[row];
if (x == INT_MIN) {
*ch++ = 'N';
*ch++ = 'A';
} else {
if (x<0) { *ch++ = '-'; x=-x; }
// Avoid log() for speed. Write backwards then reverse when we know how long.
char *low = ch;
do { *ch++ = '0'+x%10; x/=10; } while (x>0);
reverse(ch, low);
}
*pch = ch;
}
//end of copied code
"
worker_fun <- inline::cfunction( signature(x = "list", preallocated_target = "character", columns = "integer", start_row = "integer", end_row = "integer"), includes = header , "
const size_t _start_row = INTEGER(start_row)[0] - 1;
const size_t _end_row = INTEGER(end_row)[0];
const int max_out_len = 256 * 256; //max length of the final string
char buffer[max_out_len];
const size_t num_elements = _end_row - _start_row;
const size_t num_columns = LENGTH(columns);
const int * _columns = INTEGER(columns);
for(size_t i = _start_row; i < _end_row; ++i) {
char *buf_pos = buffer;
for(size_t c = 0; c < num_columns; ++c) {
if(c > 0) {
buf_pos[0] = ',';
++buf_pos;
}
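// note: _columns holds 1-based R column indices, hence the -1 below for VECTOR_ELT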
writeInt32(INTEGER(VECTOR_ELT(x, _columns[c] - 1)), i, &buf_pos);
}
SET_STRING_ELT(preallocated_target,i, mkCharLen(buffer, buf_pos - buffer));
}
return preallocated_target;
" )
#Test with the same data
RowCount <- 5e6
DT <- data.table(x = "foo",
y = "bar",
a = sample.int(9, RowCount, TRUE),
b = sample.int(9, RowCount, TRUE),
c = sample.int(9, RowCount, TRUE),
d = sample.int(9, RowCount, TRUE),
e = sample.int(9, RowCount, TRUE),
f = sample.int(9, RowCount, TRUE))
## An arbitrary list of columns to paste together
ConcatCols <- list("a","b","c","d","e","f")
## Do it 3x as many times
ConcatCols <- c(ConcatCols,ConcatCols,ConcatCols)
ptm <- proc.time()
preallocated_target <- character(RowCount)
column_indices <- sapply(ConcatCols, FUN = function(x) { which(colnames(DT) == x )})
x <- worker_fun(DT, preallocated_target, column_indices, as.integer(1), as.integer(RowCount))
DT[, State := preallocated_target]
proc.time() - ptm
While your (integer only) example runs in about 20s on my PC, this runs in ~5s and can be easily parallelized.
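As an optional sanity check (sketch only; remember the worker inserts a comma between columns), you can compare a sample of rows against plain paste:
# Compare a sample of rows against base R paste (the C code uses ',' as separator)
idx <- sample(RowCount, 1000)
expected <- do.call(paste, c(DT[idx, unlist(ConcatCols), with = FALSE], sep = ","))
stopifnot(identical(DT$State[idx], expected))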
Some things to note:
Parallelization
EDIT: The approach below would actually fail due to the way clusterExport
and R string storage work: clusterExport copies the exported objects into each worker process, so the workers' writes into preallocated_target never make it back to the master. Parallelization thus probably needs to be done in C as well, similarly to the way it is achieved in data.table.
Since you cannot pass inline-compiled functions across R processes, parallelization requires some more work. To be able to use the above function in parallel, you either need to compile it separately (e.g. with R CMD SHLIB) and load it with dyn.load
OR wrap it in a package OR use a forking backend for parallel (I don't have one, forking works only on UNIX).
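For the first option, a minimal sketch (untested) could look like the following, assuming the C code above is saved as concat_worker.c with the worker declared as a .Call entry point named worker_fun taking the five SEXP arguments in the same order:
## Compile once from the shell with:  R CMD SHLIB concat_worker.c
dyn.load(paste0("concat_worker", .Platform$dynlib.ext))
out <- .Call("worker_fun", DT, preallocated_target, column_indices,
             as.integer(1), as.integer(RowCount))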
Running in parallel would then look something like (not tested):
library(parallel)
no_cores <- detectCores()
# Initiate cluster
cl <- makeCluster(no_cores)
#Preallocated target and prepare params
num_elements <- length(DT[[1]])
preallocated_target <- character(num_elements)
block_size <- 4096 #No of rows processed at once. Adjust for best performance
column_indices <- sapply(ConcatCols, FUN = function(x) { which(colnames(DT) == x )})
num_blocks <- ceiling(num_elements / block_size)
clusterExport(cl,
c("DT","preallocated_target","column_indices","num_elements", "block_size"))
clusterEvalQ(cl, <CODE TO LOAD THE NATIVE FUNCTION HERE>)
parLapply(cl, 1:num_blocks ,
function(block_id)
{
throw_away <-
worker_fun(DT, preallocated_target, column_indices,
as.integer((block_id - 1) * block_size + 1),
as.integer(min(num_elements, block_id * block_size)))
return(NULL)
})
stopCluster(cl)
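Given the clusterExport problem described in the EDIT above, a workaround (again only an untested sketch, assuming worker_fun has been made loadable on the workers and the same objects exported as above) is to have each worker return its block of strings as a value instead of writing into a shared target, and to stitch the blocks together on the master:
## Sketch: workers return values, so nothing needs to propagate back via shared state
block_results <- parLapply(cl, 1:num_blocks, function(block_id) {
  start <- as.integer((block_id - 1) * block_size + 1)
  end <- as.integer(min(num_elements, block_id * block_size))
  ## worker_fun indexes the target by global row number, so hand it a
  ## full-length vector and keep only this block's slice
  target <- character(num_elements)
  worker_fun(DT, target, column_indices, start, end)
  target[start:end]
})
DT[, State := unlist(block_results)]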