Yet another reshape problem in data.table
set.seed(1234)
DT <- data.table(x=rep(c(1,2,3),each=4), y=c(\"A\",\"B\"), v=sample(1:100,12))
#
Here's another way:
ys <- unique(DT$y)
sdcols <- c("x", "v")
cols <- paste0("SUM.", sdcols)
DT[, c(cols) := lapply(.SD, cumsum), by = y, .SDcols = sdcols]
for( i in seq_along(ys)) {
cols <- paste0("SUM.", sdcols, ".", ys[i])
DT[, c("v1", "v2") := list(SUM.x, SUM.v[i]), by = SUM.x]
DT[, c("v1", "v2") := list(c(rep(NA_integer_, (i-1)), v1)[seq_len(.N)],
c(rep(NA_integer_, (i-1)), v2)[seq_len(.N)])]
setnames(DT, c("v1", "v2"), cols)
}
My version of benchmarking with mnel's (from his post) and this function:
arun <- function(DT) {
ys <- unique(DT$y)
sdcols <- c("x", "v")
cols <- paste0("SUM.", sdcols)
DT[, c(cols) := lapply(.SD, cumsum), by = y, .SDcols = sdcols]
for( i in seq_along(ys)) {
cols <- paste0("SUM.", sdcols, ".", ys[i])
DT[, c("v1", "v2") := list(SUM.x, SUM.v[i]), by = SUM.x]
DT[, c("v1", "v2") := list(c(rep(NA_integer_, (i-1)), v1)[seq_len(.N)],
c(rep(NA_integer_, (i-1)), v2)[seq_len(.N)])]
setnames(DT, c("v1", "v2"), cols)
}
DT
}
mnel <- function(DT) {
set.seed(1234)
DT <- data.table(x=rep(c(1,2,3),each=4), y=c("A","B"), v=sample(1:100,12))
DT[, id := seq_len(nrow(DT))]
setkey(DT, y)
uniqY <- unique(DT$y)
for(jj in uniqY){
nc <- do.call(paste, c(expand.grid('Sum', c('x','v'),jj), sep ='.'))
DT[.(jj), (nc) := list(cumsum(x), cumsum(v))]
}
setkey(DT, id)
DT[, 5:8 := lapply(.SD, function(x) {
xn <- is.na(x)
x[xn] <- -Inf
xx <- cummax(x)
# deal with leading NA values
if(xn[1]){
xn1 <- which(xn)[1]
xx[seq_len(xn1)] <- NA}
xx }), .SDcols = 5:8]
}
statquant <- function(DT){
#first step is to create cumsum columns
colNames <- c("x","v")
DT[, paste0("SUM.",colNames):=lapply(.SD,cumsum) ,by=y, .SDcols=colNames];
#now we need to reshape each SUM.* to SUM.*.{yvalue}
DT[,N:=.I]; setattr(DT,"sorted","N")
g <- function(DT,SD){
cols <- c('N',grep('SUM',colnames(SD), value=T));
Yval <- unique(SD[,y]);
merge(DT, SD[,cols, with=F], suffix=c('',paste0('.',Yval)), all.x=T);
}
DT <- Reduce(f=g,init=DT,x=split(DT,DT$y));
locf = function(x) {
ind = which(!is.na(x))
if(is.na(x[1])) ind = c(1,ind)
rep(x[ind], times = diff( c(ind, length(x) + 1) ))
}
newColNames <- grep('SUM',colnames(DT),value=T);
DT <- DT[, (newColNames):=lapply(.SD, locf), .SDcols=newColNames]
DT
}
grothendieck <- function(DT) {
cumsum0 <- function(x) { x <- cumsum(x); ifelse(x == 0, NA, x) }
DT2 <- DT[, {SUM.<-y; lapply(data.table(model.matrix(~ SUM.:x + SUM.:v + 0)), cumsum0)}]
setnames(DT2, sub("(.):(.)", "\\2.\\1", names(DT2)))
DT2
}
library(data.table)
library(zoo)
set.seed(1234)
DT <- data.table(x=rep(c(1,2,3),each=4), y=c("A","B"), v=sample(1:100,12))
library(microbenchmark)
microbenchmark( s <- statquant(copy(DT)), g <- grothendieck(copy(DT)),
m <- mnel(copy(DT)), a <- arun(copy(DT)), times = 1e3)
# Unit: milliseconds
# expr min lq median uq max neval
# s <- statquant(copy(DT)) 13.041125 13.674083 14.493870 17.273151 144.74186 1000
# g <- grothendieck(copy(DT)) 3.634120 3.859143 4.006085 4.443388 80.01984 1000
# m <- mnel(copy(DT)) 7.819286 8.234178 8.596090 10.423668 87.07668 1000
# a <- arun(copy(DT)) 6.925419 7.369286 7.703003 9.262719 53.39823 1000
# x y v SUM.x SUM.v SUM.x.A SUM.v.A SUM.x.B SUM.v.B
# 1: 1 A 12 1 12 1 12 NA NA
# 2: 1 B 62 1 62 1 12 1 62
# 3: 1 A 60 2 72 2 72 1 62
# 4: 1 B 61 2 123 2 72 2 123
# 5: 2 A 83 4 155 4 155 2 123
# 6: 2 B 97 4 220 4 155 4 220
# 7: 2 A 1 6 156 6 156 4 220
# 8: 2 B 22 6 242 6 156 6 242
# 9: 3 A 99 9 255 9 255 6 242
# 10: 3 B 47 9 289 9 255 9 289
# 11: 3 A 63 12 318 12 318 9 289
# 12: 3 B 49 12 338 12 318 12 338
# x y v id Sum.x.A Sum.v.A Sum.x.B Sum.v.B
# 1: 1 A 12 1 1 12 NA NA
# 2: 1 B 62 2 1 12 1 62
# 3: 1 A 60 3 2 72 1 62
# 4: 1 B 61 4 2 72 2 123
# 5: 2 A 83 5 4 155 2 123
# 6: 2 B 97 6 4 155 4 220
# 7: 2 A 1 7 6 156 4 220
# 8: 2 B 22 8 6 156 6 242
# 9: 3 A 99 9 9 255 6 242
# 10: 3 B 47 10 9 255 9 289
# 11: 3 A 63 11 12 318 9 289
# 12: 3 B 49 12 12 318 12 338
# N x y v SUM.x SUM.v SUM.x.A SUM.v.A SUM.x.B SUM.v.B
# 1: 1 1 A 12 1 12 1 12 NA NA
# 2: 2 1 B 62 1 62 1 12 1 62
# 3: 3 1 A 60 2 72 2 72 1 62
# 4: 4 1 B 61 2 123 2 72 2 123
# 5: 5 2 A 83 4 155 4 155 2 123
# 6: 6 2 B 97 4 220 4 155 4 220
# 7: 7 2 A 1 6 156 6 156 4 220
# 8: 8 2 B 22 6 242 6 156 6 242
# 9: 9 3 A 99 9 255 9 255 6 242
# 10: 10 3 B 47 9 289 9 255 9 289
# 11: 11 3 A 63 12 318 12 318 9 289
# 12: 12 3 B 49 12 338 12 318 12 338
# SUM.x.A SUM.x.B SUM.v.A SUM.v.B
# 1: 1 NA 12 NA
# 2: 1 1 12 62
# 3: 2 1 72 62
# 4: 2 2 72 123
# 5: 4 2 155 123
# 6: 4 4 155 220
# 7: 6 4 156 220
# 8: 6 6 156 242
# 9: 9 6 255 242
# 10: 9 9 255 289
# 11: 12 9 318 289
# 12: 12 12 318 338