A reshape puzzle in data.table

后端未结

关注

 3  526

Yet another reshape problem in data.table

set.seed(1234)
DT <- data.table(x=rep(c(1,2,3),each=4), y=c(\"A\",\"B\"), v=sample(1:100,12))
#


                      
              相关标签:


      
      
        
          3条回答        

        
                         				            
            
           
            
                              
                
              
              
                
                  旧时难觅i        
                
              
                            
                2021-01-19 13:58
              
            
            
                                                                       
Try this:

cumsum0 <- function(x) { x <- cumsum(x); ifelse(x == 0, NA, x) }
DT2 <- DT[, {SUM.<-y; lapply(data.table(model.matrix(~ SUM.:x + SUM.:v + 0)), cumsum0)}]
setnames(DT2, sub("(.):(.)", "\\2.\\1", names(DT2)))


Simplifications:

1) If using 0 in place of NA is ok then it can be simplified by omitting the first line which defines cumsum0 and replacing cumsum0 in the next line with cumsum.  

2) The result of the second line has these names:

> names(DT2)
[1] "SUM.A:x" "SUM.B:x" "SUM.A:v" "SUM.B:v"


so if that is sufficient the last line can be dropped since its only purpose is to make the names exactly the same as in the question.

The result (without the simplifications) is:

> DT2
    SUM.x.A SUM.x.B SUM.v.A SUM.v.B
 1:       1      NA      12      NA
 2:       1       1      12      62
 3:       2       1      72      62
 4:       2       2      72     123
 5:       4       2     155     123
 6:       4       4     155     220
 7:       6       4     156     220
 8:       6       6     156     242
 9:       9       6     255     242
10:       9       9     255     289
11:      12       9     318     289
12:      12      12     318     338

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  情深已故        
                
              
                            
                2021-01-19 13:58
              
            
            
                                                                       
Not sure this is the best solution, but you could do something like the following.

set.seed(1234)
DT <- data.table(x=rep(c(1,2,3),each=4), y=c("A","B"), v=sample(1:100,12))
DT[, id := seq_len(nrow(DT))]

setkey(DT, y)

uniqY <- unique(DT$y)

for(jj in uniqY){
  nc <- do.call(paste, c(expand.grid('Sum', c('x','v'),jj), sep ='.'))
  DT[.(jj), (nc) := list(cumsum(x), cumsum(v))]

}

setkey(DT, id)

DT[, 5:8 := lapply(.SD, function(x) { 
  xn <- is.na(x)
  x[xn] <- -Inf
  xx <- cummax(x)
  # deal with leading NA values
    if(xn[1]){
    xn1 <- which(xn)[1]
  xx[seq_len(xn1)] <- NA}   

  xx }), .SDcols = 5:8]

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  旧时难觅i        
                
              
                            
                2021-01-19 14:08
              
            
            
                                                                       
Here's another way:

ys <- unique(DT$y)
sdcols <- c("x", "v")
cols <- paste0("SUM.", sdcols)
DT[, c(cols) := lapply(.SD, cumsum), by = y, .SDcols = sdcols]
for( i in seq_along(ys)) {
    cols <- paste0("SUM.", sdcols, ".", ys[i])
    DT[, c("v1", "v2") := list(SUM.x, SUM.v[i]), by = SUM.x]
    DT[, c("v1", "v2") := list(c(rep(NA_integer_, (i-1)), v1)[seq_len(.N)], 
    c(rep(NA_integer_, (i-1)), v2)[seq_len(.N)])]
    setnames(DT, c("v1", "v2"), cols)
}




My version of benchmarking with mnel's (from his post) and this function:

The function from this post:

arun <- function(DT) {

    ys <- unique(DT$y)
    sdcols <- c("x", "v")
    cols <- paste0("SUM.", sdcols)
    DT[, c(cols) := lapply(.SD, cumsum), by = y, .SDcols = sdcols]
    for( i in seq_along(ys)) {
        cols <- paste0("SUM.", sdcols, ".", ys[i])
        DT[, c("v1", "v2") := list(SUM.x, SUM.v[i]), by = SUM.x]
        DT[, c("v1", "v2") := list(c(rep(NA_integer_, (i-1)), v1)[seq_len(.N)], 
        c(rep(NA_integer_, (i-1)), v2)[seq_len(.N)])]
        setnames(DT, c("v1", "v2"), cols)
    }
    DT
}


Function from mnel's post:

mnel <- function(DT) {
    set.seed(1234)
    DT <- data.table(x=rep(c(1,2,3),each=4), y=c("A","B"), v=sample(1:100,12))
    DT[, id := seq_len(nrow(DT))]
    setkey(DT, y)
    uniqY <- unique(DT$y)
    for(jj in uniqY){
      nc <- do.call(paste, c(expand.grid('Sum', c('x','v'),jj), sep ='.'))
      DT[.(jj), (nc) := list(cumsum(x), cumsum(v))]

    }
    setkey(DT, id)
    DT[, 5:8 := lapply(.SD, function(x) { 
      xn <- is.na(x)
      x[xn] <- -Inf
      xx <- cummax(x)
      # deal with leading NA values
        if(xn[1]){
        xn1 <- which(xn)[1]
      xx[seq_len(xn1)] <- NA}   
      xx }), .SDcols = 5:8]
}


Function from statquant:

statquant <- function(DT){
    #first step is to create cumsum columns
    colNames <- c("x","v")
    DT[, paste0("SUM.",colNames):=lapply(.SD,cumsum) ,by=y, .SDcols=colNames];
    #now we need to reshape each SUM.* to SUM.*.{yvalue}
    DT[,N:=.I]; setattr(DT,"sorted","N")

    g <- function(DT,SD){
      cols <- c('N',grep('SUM',colnames(SD), value=T));
      Yval <- unique(SD[,y]);
      merge(DT, SD[,cols, with=F], suffix=c('',paste0('.',Yval)), all.x=T);    
    }

    DT <- Reduce(f=g,init=DT,x=split(DT,DT$y));

    locf = function(x) {
      ind = which(!is.na(x))    
      if(is.na(x[1])) ind = c(1,ind)
      rep(x[ind], times = diff( c(ind, length(x) + 1) )) 
    }

    newColNames <- grep('SUM',colnames(DT),value=T);
    DT <- DT[, (newColNames):=lapply(.SD, locf), .SDcols=newColNames]
    DT
}


Function from grothendieck

grothendieck <- function(DT) {
    cumsum0 <- function(x) { x <- cumsum(x); ifelse(x == 0, NA, x) }
    DT2 <- DT[, {SUM.<-y; lapply(data.table(model.matrix(~ SUM.:x + SUM.:v + 0)), cumsum0)}]
    setnames(DT2, sub("(.):(.)", "\\2.\\1", names(DT2)))
    DT2
}


Benchmarking:

library(data.table)
library(zoo)
set.seed(1234)
DT <- data.table(x=rep(c(1,2,3),each=4), y=c("A","B"), v=sample(1:100,12))

library(microbenchmark)
microbenchmark( s <- statquant(copy(DT)), g <- grothendieck(copy(DT)), 
                m <- mnel(copy(DT)), a <- arun(copy(DT)), times = 1e3)

# Unit: milliseconds
#                         expr       min        lq    median        uq       max neval
#     s <- statquant(copy(DT)) 13.041125 13.674083 14.493870 17.273151 144.74186  1000
#  g <- grothendieck(copy(DT))  3.634120  3.859143  4.006085  4.443388  80.01984  1000
#          m <- mnel(copy(DT))  7.819286  8.234178  8.596090 10.423668  87.07668  1000
#          a <- arun(copy(DT))  6.925419  7.369286  7.703003  9.262719  53.39823  1000


resulting data.table "a" (arun's)

#     x y  v SUM.x SUM.v SUM.x.A SUM.v.A SUM.x.B SUM.v.B
#  1: 1 A 12     1    12       1      12      NA      NA
#  2: 1 B 62     1    62       1      12       1      62
#  3: 1 A 60     2    72       2      72       1      62
#  4: 1 B 61     2   123       2      72       2     123
#  5: 2 A 83     4   155       4     155       2     123
#  6: 2 B 97     4   220       4     155       4     220
#  7: 2 A  1     6   156       6     156       4     220
#  8: 2 B 22     6   242       6     156       6     242
#  9: 3 A 99     9   255       9     255       6     242
# 10: 3 B 47     9   289       9     255       9     289
# 11: 3 A 63    12   318      12     318       9     289
# 12: 3 B 49    12   338      12     318      12     338


Resulting data.table "m" (mnel's)

#    x y  v id Sum.x.A Sum.v.A Sum.x.B Sum.v.B
#  1: 1 A 12  1       1      12      NA      NA
#  2: 1 B 62  2       1      12       1      62
#  3: 1 A 60  3       2      72       1      62
#  4: 1 B 61  4       2      72       2     123
#  5: 2 A 83  5       4     155       2     123
#  6: 2 B 97  6       4     155       4     220
#  7: 2 A  1  7       6     156       4     220
#  8: 2 B 22  8       6     156       6     242
#  9: 3 A 99  9       9     255       6     242
# 10: 3 B 47 10       9     255       9     289
# 11: 3 A 63 11      12     318       9     289
# 12: 3 B 49 12      12     318      12     338


Resulting data.table "s" (statquant's)

#      N x y  v SUM.x SUM.v SUM.x.A SUM.v.A SUM.x.B SUM.v.B
#  1:  1 1 A 12     1    12       1      12      NA      NA
#  2:  2 1 B 62     1    62       1      12       1      62
#  3:  3 1 A 60     2    72       2      72       1      62
#  4:  4 1 B 61     2   123       2      72       2     123
#  5:  5 2 A 83     4   155       4     155       2     123
#  6:  6 2 B 97     4   220       4     155       4     220
#  7:  7 2 A  1     6   156       6     156       4     220
#  8:  8 2 B 22     6   242       6     156       6     242
#  9:  9 3 A 99     9   255       9     255       6     242
# 10: 10 3 B 47     9   289       9     255       9     289
# 11: 11 3 A 63    12   318      12     318       9     289
# 12: 12 3 B 49    12   338      12     318      12     338


Resulting data.table "g" (grothendieck's)

#    SUM.x.A SUM.x.B SUM.v.A SUM.v.B
#  1:       1      NA      12      NA
#  2:       1       1      12      62
#  3:       2       1      72      62
#  4:       2       2      72     123
#  5:       4       2     155     123
#  6:       4       4     155     220
#  7:       6       4     156     220
#  8:       6       6     156     242
#  9:       9       6     255     242
# 10:       9       9     255     289
# 11:      12       9     318     289
# 12:      12      12     318     338

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
                             
        
        
          
            
            
              
              
            
    


                                 
              
            
                          
    

        
         
                验证码
                
                  
                
                
                   看不清?
                
              
                                  
                    
   
                 
             
              提交回复