A reshape puzzle in data.table

后端 未结 3 520
遥遥无期
遥遥无期 2021-01-19 13:07

Yet another reshape problem in data.table

set.seed(1234)
DT <- data.table(x=rep(c(1,2,3),each=4), y=c(\"A\",\"B\"), v=sample(1:100,12))
#             


        
3条回答
  •  旧时难觅i
    2021-01-19 14:08

    Here's another way:

    ys <- unique(DT$y)
    sdcols <- c("x", "v")
    cols <- paste0("SUM.", sdcols)
    DT[, c(cols) := lapply(.SD, cumsum), by = y, .SDcols = sdcols]
    for( i in seq_along(ys)) {
        cols <- paste0("SUM.", sdcols, ".", ys[i])
        DT[, c("v1", "v2") := list(SUM.x, SUM.v[i]), by = SUM.x]
        DT[, c("v1", "v2") := list(c(rep(NA_integer_, (i-1)), v1)[seq_len(.N)], 
        c(rep(NA_integer_, (i-1)), v2)[seq_len(.N)])]
        setnames(DT, c("v1", "v2"), cols)
    }
    

    My version of benchmarking with mnel's (from his post) and this function:

    The function from this post:

    arun <- function(DT) {
    
        ys <- unique(DT$y)
        sdcols <- c("x", "v")
        cols <- paste0("SUM.", sdcols)
        DT[, c(cols) := lapply(.SD, cumsum), by = y, .SDcols = sdcols]
        for( i in seq_along(ys)) {
            cols <- paste0("SUM.", sdcols, ".", ys[i])
            DT[, c("v1", "v2") := list(SUM.x, SUM.v[i]), by = SUM.x]
            DT[, c("v1", "v2") := list(c(rep(NA_integer_, (i-1)), v1)[seq_len(.N)], 
            c(rep(NA_integer_, (i-1)), v2)[seq_len(.N)])]
            setnames(DT, c("v1", "v2"), cols)
        }
        DT
    }
    

    Function from mnel's post:

    mnel <- function(DT) {
        set.seed(1234)
        DT <- data.table(x=rep(c(1,2,3),each=4), y=c("A","B"), v=sample(1:100,12))
        DT[, id := seq_len(nrow(DT))]
        setkey(DT, y)
        uniqY <- unique(DT$y)
        for(jj in uniqY){
          nc <- do.call(paste, c(expand.grid('Sum', c('x','v'),jj), sep ='.'))
          DT[.(jj), (nc) := list(cumsum(x), cumsum(v))]
    
        }
        setkey(DT, id)
        DT[, 5:8 := lapply(.SD, function(x) { 
          xn <- is.na(x)
          x[xn] <- -Inf
          xx <- cummax(x)
          # deal with leading NA values
            if(xn[1]){
            xn1 <- which(xn)[1]
          xx[seq_len(xn1)] <- NA}   
          xx }), .SDcols = 5:8]
    }
    

    Function from statquant:

    statquant <- function(DT){
        #first step is to create cumsum columns
        colNames <- c("x","v")
        DT[, paste0("SUM.",colNames):=lapply(.SD,cumsum) ,by=y, .SDcols=colNames];
        #now we need to reshape each SUM.* to SUM.*.{yvalue}
        DT[,N:=.I]; setattr(DT,"sorted","N")
    
        g <- function(DT,SD){
          cols <- c('N',grep('SUM',colnames(SD), value=T));
          Yval <- unique(SD[,y]);
          merge(DT, SD[,cols, with=F], suffix=c('',paste0('.',Yval)), all.x=T);    
        }
    
        DT <- Reduce(f=g,init=DT,x=split(DT,DT$y));
    
        locf = function(x) {
          ind = which(!is.na(x))    
          if(is.na(x[1])) ind = c(1,ind)
          rep(x[ind], times = diff( c(ind, length(x) + 1) )) 
        }
    
        newColNames <- grep('SUM',colnames(DT),value=T);
        DT <- DT[, (newColNames):=lapply(.SD, locf), .SDcols=newColNames]
        DT
    }
    

    Function from grothendieck

    grothendieck <- function(DT) {
        cumsum0 <- function(x) { x <- cumsum(x); ifelse(x == 0, NA, x) }
        DT2 <- DT[, {SUM.<-y; lapply(data.table(model.matrix(~ SUM.:x + SUM.:v + 0)), cumsum0)}]
        setnames(DT2, sub("(.):(.)", "\\2.\\1", names(DT2)))
        DT2
    }
    

    Benchmarking:

    library(data.table)
    library(zoo)
    set.seed(1234)
    DT <- data.table(x=rep(c(1,2,3),each=4), y=c("A","B"), v=sample(1:100,12))
    
    library(microbenchmark)
    microbenchmark( s <- statquant(copy(DT)), g <- grothendieck(copy(DT)), 
                    m <- mnel(copy(DT)), a <- arun(copy(DT)), times = 1e3)
    
    # Unit: milliseconds
    #                         expr       min        lq    median        uq       max neval
    #     s <- statquant(copy(DT)) 13.041125 13.674083 14.493870 17.273151 144.74186  1000
    #  g <- grothendieck(copy(DT))  3.634120  3.859143  4.006085  4.443388  80.01984  1000
    #          m <- mnel(copy(DT))  7.819286  8.234178  8.596090 10.423668  87.07668  1000
    #          a <- arun(copy(DT))  6.925419  7.369286  7.703003  9.262719  53.39823  1000
    

    resulting data.table "a" (arun's)

    #     x y  v SUM.x SUM.v SUM.x.A SUM.v.A SUM.x.B SUM.v.B
    #  1: 1 A 12     1    12       1      12      NA      NA
    #  2: 1 B 62     1    62       1      12       1      62
    #  3: 1 A 60     2    72       2      72       1      62
    #  4: 1 B 61     2   123       2      72       2     123
    #  5: 2 A 83     4   155       4     155       2     123
    #  6: 2 B 97     4   220       4     155       4     220
    #  7: 2 A  1     6   156       6     156       4     220
    #  8: 2 B 22     6   242       6     156       6     242
    #  9: 3 A 99     9   255       9     255       6     242
    # 10: 3 B 47     9   289       9     255       9     289
    # 11: 3 A 63    12   318      12     318       9     289
    # 12: 3 B 49    12   338      12     318      12     338
    

    Resulting data.table "m" (mnel's)

    #    x y  v id Sum.x.A Sum.v.A Sum.x.B Sum.v.B
    #  1: 1 A 12  1       1      12      NA      NA
    #  2: 1 B 62  2       1      12       1      62
    #  3: 1 A 60  3       2      72       1      62
    #  4: 1 B 61  4       2      72       2     123
    #  5: 2 A 83  5       4     155       2     123
    #  6: 2 B 97  6       4     155       4     220
    #  7: 2 A  1  7       6     156       4     220
    #  8: 2 B 22  8       6     156       6     242
    #  9: 3 A 99  9       9     255       6     242
    # 10: 3 B 47 10       9     255       9     289
    # 11: 3 A 63 11      12     318       9     289
    # 12: 3 B 49 12      12     318      12     338
    

    Resulting data.table "s" (statquant's)

    #      N x y  v SUM.x SUM.v SUM.x.A SUM.v.A SUM.x.B SUM.v.B
    #  1:  1 1 A 12     1    12       1      12      NA      NA
    #  2:  2 1 B 62     1    62       1      12       1      62
    #  3:  3 1 A 60     2    72       2      72       1      62
    #  4:  4 1 B 61     2   123       2      72       2     123
    #  5:  5 2 A 83     4   155       4     155       2     123
    #  6:  6 2 B 97     4   220       4     155       4     220
    #  7:  7 2 A  1     6   156       6     156       4     220
    #  8:  8 2 B 22     6   242       6     156       6     242
    #  9:  9 3 A 99     9   255       9     255       6     242
    # 10: 10 3 B 47     9   289       9     255       9     289
    # 11: 11 3 A 63    12   318      12     318       9     289
    # 12: 12 3 B 49    12   338      12     318      12     338
    

    Resulting data.table "g" (grothendieck's)

    #    SUM.x.A SUM.x.B SUM.v.A SUM.v.B
    #  1:       1      NA      12      NA
    #  2:       1       1      12      62
    #  3:       2       1      72      62
    #  4:       2       2      72     123
    #  5:       4       2     155     123
    #  6:       4       4     155     220
    #  7:       6       4     156     220
    #  8:       6       6     156     242
    #  9:       9       6     255     242
    # 10:       9       9     255     289
    # 11:      12       9     318     289
    # 12:      12      12     318     338
    

提交回复
热议问题