Yet another reshape problem in data.table
library(data.table)
set.seed(1234)
DT <- data.table(x=rep(c(1,2,3),each=4), y=c("A","B"), v=sample(1:100,12))
Try this:
cumsum0 <- function(x) { x <- cumsum(x); ifelse(x == 0, NA, x) }
DT2 <- DT[, { SUM. <- y; lapply(data.table(model.matrix(~ SUM.:x + SUM.:v + 0)), cumsum0) }]
setnames(DT2, sub("(.):(.)", "\\2.\\1", names(DT2)))
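To see what the model.matrix() call is doing, here is an illustrative look at the design matrix it builds (SUM. is assigned in the global environment here purely for demonstration; the values shown are for this seed and match the result tables below). Each interaction column carries x or v for one value of y and 0 for the other, so a plain column-wise cumsum gives the per-group cumulative sums:
SUM. <- DT$y
head(model.matrix(~ SUM.:x + SUM.:v + 0, data = DT), 4)
#   SUM.A:x SUM.B:x SUM.A:v SUM.B:v
# 1       1       0      12       0
# 2       0       1       0      62
# 3       1       0      60       0
# 4       0       1       0      61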
Simplifications:
1) If using 0 in place of NA is acceptable, the solution can be simplified by omitting the first line (which defines cumsum0) and replacing cumsum0 with cumsum in the second line.
2) The result of the second line has these names:
> names(DT2)
[1] "SUM.A:x" "SUM.B:x" "SUM.A:v" "SUM.B:v"
If those names are sufficient, the last line can be dropped, since its only purpose is to make the names exactly match those in the question. (Both simplifications combined are shown below.)
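Putting both simplifications together (0s instead of NAs and the default model.matrix names), the whole computation reduces to a single line, i.e. the code above with cumsum in place of cumsum0 and without the setnames call:
DT2 <- DT[, { SUM. <- y; lapply(data.table(model.matrix(~ SUM.:x + SUM.:v + 0)), cumsum) }]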
The result (without the simplifications) is:
> DT2
SUM.x.A SUM.x.B SUM.v.A SUM.v.B
1: 1 NA 12 NA
2: 1 1 12 62
3: 2 1 72 62
4: 2 2 72 123
5: 4 2 155 123
6: 4 4 155 220
7: 6 4 156 220
8: 6 6 156 242
9: 9 6 255 242
10: 9 9 255 289
11: 12 9 318 289
12: 12 12 318 338
Not sure this is the best solution, but you could do something like the following.
set.seed(1234)
DT <- data.table(x=rep(c(1,2,3),each=4), y=c("A","B"), v=sample(1:100,12))
DT[, id := seq_len(nrow(DT))]
setkey(DT, y)
uniqY <- unique(DT$y)
for (jj in uniqY) {
  nc <- do.call(paste, c(expand.grid('Sum', c('x','v'), jj), sep = '.'))
  DT[.(jj), (nc) := list(cumsum(x), cumsum(v))]
}
setkey(DT, id)
DT[, 5:8 := lapply(.SD, function(x) {
  xn <- is.na(x)
  x[xn] <- -Inf
  xx <- cummax(x)
  # deal with leading NA values
  if (xn[1]) {
    xn1 <- which(xn)[1]
    xx[seq_len(xn1)] <- NA
  }
  xx
}), .SDcols = 5:8]
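The second step works because the filled columns are cumulative sums, which are non-decreasing, so after swapping NA for -Inf the running maximum is the same as carrying the last observed value forward. A toy illustration of that fill (not part of the solution itself):
x <- c(NA, 1, NA, 3, NA)
x[is.na(x)] <- -Inf
cummax(x)
# [1] -Inf    1    1    3    3    (the leading -Inf is then reset to NA by the if() branch)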
Here's another way:
ys <- unique(DT$y)
sdcols <- c("x", "v")
cols <- paste0("SUM.", sdcols)
DT[, c(cols) := lapply(.SD, cumsum), by = y, .SDcols = sdcols]
for (i in seq_along(ys)) {
  cols <- paste0("SUM.", sdcols, ".", ys[i])
  DT[, c("v1", "v2") := list(SUM.x, SUM.v[i]), by = SUM.x]
  DT[, c("v1", "v2") := list(c(rep(NA_integer_, i - 1), v1)[seq_len(.N)],
                             c(rep(NA_integer_, i - 1), v2)[seq_len(.N)])]
  setnames(DT, c("v1", "v2"), cols)
}
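The second assignment inside the loop is just a lag by i - 1 rows: prepend i - 1 NAs and truncate back to the original length. In isolation (illustration only):
v <- 1:5
i <- 2
c(rep(NA_integer_, i - 1), v)[seq_along(v)]
# [1] NA  1  2  3  4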
Here's my benchmark comparing all four approaches (statquant's, Grothendieck's, mnel's and the one above), each wrapped in a function:
arun <- function(DT) {
  ys <- unique(DT$y)
  sdcols <- c("x", "v")
  cols <- paste0("SUM.", sdcols)
  DT[, c(cols) := lapply(.SD, cumsum), by = y, .SDcols = sdcols]
  for (i in seq_along(ys)) {
    cols <- paste0("SUM.", sdcols, ".", ys[i])
    DT[, c("v1", "v2") := list(SUM.x, SUM.v[i]), by = SUM.x]
    DT[, c("v1", "v2") := list(c(rep(NA_integer_, i - 1), v1)[seq_len(.N)],
                               c(rep(NA_integer_, i - 1), v2)[seq_len(.N)])]
    setnames(DT, c("v1", "v2"), cols)
  }
  DT
}
mnel <- function(DT) {
  DT[, id := seq_len(nrow(DT))]
  setkey(DT, y)
  uniqY <- unique(DT$y)
  for (jj in uniqY) {
    nc <- do.call(paste, c(expand.grid('Sum', c('x','v'), jj), sep = '.'))
    DT[.(jj), (nc) := list(cumsum(x), cumsum(v))]
  }
  setkey(DT, id)
  DT[, 5:8 := lapply(.SD, function(x) {
    xn <- is.na(x)
    x[xn] <- -Inf
    xx <- cummax(x)
    # deal with leading NA values
    if (xn[1]) {
      xn1 <- which(xn)[1]
      xx[seq_len(xn1)] <- NA
    }
    xx
  }), .SDcols = 5:8]
  DT
}
statquant <- function(DT) {
  # first step is to create the cumsum columns
  colNames <- c("x", "v")
  DT[, paste0("SUM.", colNames) := lapply(.SD, cumsum), by = y, .SDcols = colNames]
  # now we need to reshape each SUM.* to SUM.*.{yvalue}
  DT[, N := .I]
  setattr(DT, "sorted", "N")
  g <- function(DT, SD) {
    cols <- c('N', grep('SUM', colnames(SD), value = TRUE))
    Yval <- unique(SD[, y])
    merge(DT, SD[, cols, with = FALSE], suffixes = c('', paste0('.', Yval)), all.x = TRUE)
  }
  DT <- Reduce(f = g, init = DT, x = split(DT, DT$y))
  locf <- function(x) {
    ind <- which(!is.na(x))
    if (is.na(x[1])) ind <- c(1, ind)
    rep(x[ind], times = diff(c(ind, length(x) + 1)))
  }
  newColNames <- grep('SUM', colnames(DT), value = TRUE)
  DT <- DT[, (newColNames) := lapply(.SD, locf), .SDcols = newColNames]
  DT
}
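As an aside, statquant's hand-rolled locf does the same job as zoo::na.locf with na.rm = FALSE (fill forward while keeping leading NAs), so since library(zoo) is attached below anyway, the last lapply inside statquant could equivalently be written as:
DT[, (newColNames) := lapply(.SD, zoo::na.locf, na.rm = FALSE), .SDcols = newColNames]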
grothendieck <- function(DT) {
  cumsum0 <- function(x) { x <- cumsum(x); ifelse(x == 0, NA, x) }
  DT2 <- DT[, { SUM. <- y; lapply(data.table(model.matrix(~ SUM.:x + SUM.:v + 0)), cumsum0) }]
  setnames(DT2, sub("(.):(.)", "\\2.\\1", names(DT2)))
  DT2
}
library(data.table)
library(zoo)
set.seed(1234)
DT <- data.table(x=rep(c(1,2,3),each=4), y=c("A","B"), v=sample(1:100,12))
library(microbenchmark)
microbenchmark(s <- statquant(copy(DT)), g <- grothendieck(copy(DT)),
               m <- mnel(copy(DT)), a <- arun(copy(DT)), times = 1e3)
# Unit: milliseconds
# expr min lq median uq max neval
# s <- statquant(copy(DT)) 13.041125 13.674083 14.493870 17.273151 144.74186 1000
# g <- grothendieck(copy(DT)) 3.634120 3.859143 4.006085 4.443388 80.01984 1000
# m <- mnel(copy(DT)) 7.819286 8.234178 8.596090 10.423668 87.07668 1000
# a <- arun(copy(DT)) 6.925419 7.369286 7.703003 9.262719 53.39823 1000
# Output of arun(copy(DT)):
# x y v SUM.x SUM.v SUM.x.A SUM.v.A SUM.x.B SUM.v.B
# 1: 1 A 12 1 12 1 12 NA NA
# 2: 1 B 62 1 62 1 12 1 62
# 3: 1 A 60 2 72 2 72 1 62
# 4: 1 B 61 2 123 2 72 2 123
# 5: 2 A 83 4 155 4 155 2 123
# 6: 2 B 97 4 220 4 155 4 220
# 7: 2 A 1 6 156 6 156 4 220
# 8: 2 B 22 6 242 6 156 6 242
# 9: 3 A 99 9 255 9 255 6 242
# 10: 3 B 47 9 289 9 255 9 289
# 11: 3 A 63 12 318 12 318 9 289
# 12: 3 B 49 12 338 12 318 12 338
# Output of mnel(copy(DT)):
# x y v id Sum.x.A Sum.v.A Sum.x.B Sum.v.B
# 1: 1 A 12 1 1 12 NA NA
# 2: 1 B 62 2 1 12 1 62
# 3: 1 A 60 3 2 72 1 62
# 4: 1 B 61 4 2 72 2 123
# 5: 2 A 83 5 4 155 2 123
# 6: 2 B 97 6 4 155 4 220
# 7: 2 A 1 7 6 156 4 220
# 8: 2 B 22 8 6 156 6 242
# 9: 3 A 99 9 9 255 6 242
# 10: 3 B 47 10 9 255 9 289
# 11: 3 A 63 11 12 318 9 289
# 12: 3 B 49 12 12 318 12 338
# Output of statquant(copy(DT)):
# N x y v SUM.x SUM.v SUM.x.A SUM.v.A SUM.x.B SUM.v.B
# 1: 1 1 A 12 1 12 1 12 NA NA
# 2: 2 1 B 62 1 62 1 12 1 62
# 3: 3 1 A 60 2 72 2 72 1 62
# 4: 4 1 B 61 2 123 2 72 2 123
# 5: 5 2 A 83 4 155 4 155 2 123
# 6: 6 2 B 97 4 220 4 155 4 220
# 7: 7 2 A 1 6 156 6 156 4 220
# 8: 8 2 B 22 6 242 6 156 6 242
# 9: 9 3 A 99 9 255 9 255 6 242
# 10: 10 3 B 47 9 289 9 255 9 289
# 11: 11 3 A 63 12 318 12 318 9 289
# 12: 12 3 B 49 12 338 12 318 12 338
# Output of grothendieck(copy(DT)):
# SUM.x.A SUM.x.B SUM.v.A SUM.v.B
# 1: 1 NA 12 NA
# 2: 1 1 12 62
# 3: 2 1 72 62
# 4: 2 2 72 123
# 5: 4 2 155 123
# 6: 4 4 155 220
# 7: 6 4 156 220
# 8: 6 6 156 242
# 9: 9 6 255 242
# 10: 9 9 255 289
# 11: 12 9 318 289
# 12: 12 12 318 338
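All four approaches agree on the cumulated values; only the column layout and mnel's 'Sum.' (rather than 'SUM.') prefix differ. A quick, optional sanity check on the shared columns, comparing values only because the answers differ in column order and storage type (shown here for a and g; s can be checked the same way, and m after renaming its 'Sum.' columns):
chk <- c("SUM.x.A", "SUM.v.A", "SUM.x.B", "SUM.v.B")
all.equal(lapply(a[, chk, with = FALSE], as.numeric),
          lapply(g[, chk, with = FALSE], as.numeric))
# expected to report TRUE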