Complicated reshaping

后端 未结 8 1420
南方客
南方客 2020-12-25 14:01

I want to reshape my dataframe from long to wide format and I loose some data that I\'d like to keep. For the following example:

df <- data.frame(Par1 =          


        
相关标签:
8条回答
  • 2020-12-25 14:38

    What a great opprotunity to benchmark! Below are some runs of the plyr method (as suggested by @agstudy) compared with the data.table method (as suggested by @Arun) using different sample sizes (N = 900, 2700, 10800)

    Summary:
    The data.table method outperforms the plyr method by a factor of 7.5

    #-------------------#
    #   M E T H O D S   #
    #-------------------#
    
      # additional methods below, in the updates
    
      # Method 1  -- suggested by @agstudy
      plyrMethod <- quote({
                      dfw<-dcast(df,
                             formula = Par1+Par2~Type,
                             value.var="Val",
                             fun.aggregate=mean)
                      dat <- ddply(df,.(Par1,Par2),function(x){
                        data.frame(ParD=paste(paste(x$ParD),collapse='_'),
                                   Num.pre =length(x$Type[x$Type =='pre']),
                                   Num.post = length(x$Type[x$Type =='post']))
                      })
                      merge(dfw,dat)
                    })
    
      # Method 2 -- suggested by @Arun
      dtMethod <- quote(
                    dt[, list(pre=mean(Val[Type == "pre"]), 
                              post=mean(Val[Type == "post"]), 
                              Num.pre=length(Val[Type == "pre"]), 
                              Num.post=length(Val[Type == "post"]), 
                              ParD = paste(ParD, collapse="_")), 
                    by=list(Par1, Par2)]
                  ) 
    
     # Method 3 -- suggested by @regetz
     reduceMethod <- quote(
                      Reduce(merge, list(
                          dcast(df, formula = Par1+Par2~Type, value.var="Val",
                              fun.aggregate=mean),
                          setNames(dcast(df, formula = Par1+Par2~Type, value.var="Val",
                              fun.aggregate=length), c("Par1", "Par2", "Num.post",
                              "Num.pre")),
                          aggregate(df["ParD"], df[c("Par1", "Par2")], paste, collapse="_")
                          ))
                      )
    
     # Method 4 -- suggested by @Ramnath
     castddplyMethod <- quote(
                          reshape::cast(Par1 + Par2 + ParD ~ Type, 
                               data = ddply(df, .(Par1, Par2), transform, 
                               ParD = paste(ParD, collapse = "_")), 
                               fun  = c(mean, length)
                              )
                          )
    
    
    
    # SAMPLE DATA #
    #-------------#
    
    library(data.table)
    library(plyr)
    library(reshape2)
    library(rbenchmark)
    
    
      # for Par1, ParD
      LLL <- apply(expand.grid(LETTERS, LETTERS, LETTERS, stringsAsFactors=FALSE), 1, paste0, collapse="")
      lll <- apply(expand.grid(letters, letters, letters, stringsAsFactors=FALSE), 1, paste0, collapse="")
    
      # max size is 17568 with current sample data setup, ie: floor(length(LLL) / 18) * 18
      size <- 17568
      size <- 10800
      size <- 900  
    
      set.seed(1)
      df<-data.frame(Par1=rep(LLL[1:(size/2)], times=rep(c(2,2,3), size)[1:(size/2)])[1:(size)]
                     , Par2=rep(lll[1:(size/2)], times=rep(c(2,2,3), size)[1:(size/2)])[1:(size)]
                     , ParD=sample(unlist(lapply(c("f", "b"), paste0, lll)), size, FALSE)
                     , Type=rep(c("pre","post"), size/2)
                     , Val =sample(seq(10,100,10), size, TRUE)
                     )
    
      dt <- data.table(df, key=c("Par1", "Par2"))
    
    
    # Confirming Same Results # 
    #-------------------------#
      # Evaluate
      DF1 <- eval(plyrMethod)
      DF2 <- eval(dtMethod)
    
      # Convert to DF and sort columns and sort ParD levels, for use in identical
      colOrder <- sort(names(DF1))
      DF1 <- DF1[, colOrder]
      DF2 <- as.data.frame(DF2)[, colOrder]
      DF2$ParD <- factor(DF2$ParD, levels=levels(DF1$ParD))
      identical((DF1), (DF2))
      # [1] TRUE
    #-------------------------#
    

    RESULTS

    #--------------------#
    #     BENCHMARK      #
    #--------------------#
    benchmark(plyr=eval(plyrMethod), dt=eval(dtMethod), reduce=eval(reduceMethod), castddply=eval(castddplyMethod),
              replications=5, columns=c("relative", "test", "elapsed", "user.self", "sys.self", "replications"), 
              order="relative")
    
    
    # SAMPLE SIZE = 900
      relative      test elapsed user.self sys.self replications
         1.000    reduce   0.392     0.375    0.018            5
         1.003        dt   0.393     0.377    0.016            5
         7.064      plyr   2.769     2.721    0.047            5
         8.003 castddply   3.137     3.030    0.106            5
    
    # SAMPLE SIZE = 2,700
      relative   test elapsed user.self sys.self replications
         1.000     dt   1.371     1.327    0.090            5
         2.205 reduce   3.023     2.927    0.102            5
         7.291   plyr   9.996     9.644    0.377            5
    
    # SAMPLE SIZE = 10,800
      relative      test elapsed user.self sys.self replications
         1.000        dt   8.678     7.168    1.507            5
         2.769    reduce  24.029    23.231    0.786            5
         6.946      plyr  60.277    52.298    7.947            5
        13.796 castddply 119.719   113.333   10.816            5
    
    # SAMPLE SIZE = 17,568
      relative   test elapsed user.self sys.self replications
         1.000     dt  27.421    13.042   14.470            5
         4.030 reduce 110.498    75.853   34.922            5
         5.414   plyr 148.452   105.776   43.156            5
    

    Update : Added results for baseMethod1

    # Used only sample size of 90, as it was taking long
    relative  test elapsed user.self sys.self replications
       1.000    dt   0.044     0.043    0.001            5
       7.773  plyr   0.342     0.339    0.003            5
      65.614 base1   2.887     2.866    0.028            5
    
    Where
       baseMethod1 <- quote({
                      step1 <- with(df, split(df, list(Par1, Par2)))
                      step2 <- step1[sapply(step1, nrow) > 0]
                      step3 <- lapply(step2, function(x) {
                          piece1 <- tapply(x$Val, x$Type, mean)
                          piece2 <- tapply(x$Type, x$Type, length)
                          names(piece2) <- paste0("Num.", names(piece2))
                          out <- x[1, 1:2]
                          out[, 3:6] <- c(piece1, piece2)
                          names(out)[3:6] <-  names(c(piece1, piece2))
                          out$ParD <- paste(unique(x$ParD), collapse="_")
                          out
                      })
                      data.frame(do.call(rbind, step3), row.names=NULL)
                    })
    

    Update 2: Added keying the DT as part of the metric

    Adding the indexing step to the benchmark for fairness as per @MatthewDowle s comment.
    However, presumably, if data.table is used, it will be in place of the data.frame and hence the indexing will occur once and not simply for this procedure

       dtMethod.withkey <- quote({
                           dt <- data.table(df, key=c("Par1", "Par2"))       
                           dt[, list(pre=mean(Val[Type == "pre"]), 
                                     post=mean(Val[Type == "post"]), 
                                     Num.pre=length(Val[Type == "pre"]), 
                                     Num.post=length(Val[Type == "post"]), 
                                     ParD = paste(ParD, collapse="_")), 
                           by=list(Par1, Par2)]
                         }) 
    
    # SAMPLE SIZE = 10,800
      relative       test elapsed user.self sys.self replications
         1.000         dt   9.155     7.055    2.137            5
         1.043 dt.withkey   9.553     7.245    2.353            5
         3.567     reduce  32.659    31.196    1.586            5
         6.703       plyr  61.364    54.080    7.600            5
    

    Update 3: Benchmarking @MD's edits to @Arun's original answer

    dtMethod.MD1 <- quote(
                      dt[, list(pre=mean(Val[.pre <- Type=="pre"]),     # save .pre
                                post=mean(Val[.post <- Type=="post"]),  # save .post
                                pre.num=sum(.pre),                      # reuse .pre
                                post.num=sum(.post),                    # reuse .post
                                ParD = paste(ParD, collapse="_")), 
                         by=list(Par1, Par2)]
                      )
    
    dtMethod.MD2 <- quote(
                      dt[, { .pre <- Type=="pre"                  # or save .pre and .post up front 
                             .post <- Type=="post"
                             list(pre=mean(Val[.pre]), 
                                  post=mean(Val[.post]),
                                  pre.num=sum(.pre),
                                  post.num=sum(.post), 
                                  ParD = paste(ParD, collapse="_")) }
                      , by=list(Par1, Par2)]
                      )
    
    dtMethod.MD3 <- quote(
                    dt[, { .pre <- Type=="pre"
                           .post <- Type=="post"
                           list(pre=mean(Val[.pre]), 
                                post=mean(Val[.post]),
                                pre.num=sum(.pre),
                                post.num=sum(.post), 
                                ParD = list(ParD)) }     # list() faster than paste()
                    , by=list(Par1, Par2)]
                    )
    
    benchmark(dt.M1=eval(dtMethod.MD1), dt.M2=eval(dtMethod.MD2), dt.M3=eval(dtMethod.MD3), dt=eval(dtMethod),
          replications=5, columns=c("relative", "test", "elapsed", "user.self", "sys.self", "replications"), 
          order="relative")
    
    #--------------------#
    
    Comparing the different data.table methods amongst themselves
    
    
    # SAMPLE SIZE = 900
      relative  test elapsed user.self sys.self replications
         1.000 dt.M3   0.198     0.197    0.001            5  <~~~ "list()" Method
         1.242 dt.M1   0.246     0.243    0.004            5
         1.253 dt.M2   0.248     0.242    0.007            5
         1.884    dt   0.373     0.367    0.007            5
    
    # SAMPLE SIZE = 17,568
      relative  test elapsed user.self sys.self replications
         1.000 dt.M3  33.492    24.487    9.122            5   <~~~ "list()" Method
         1.086 dt.M1  36.388    11.442   25.086            5
         1.086 dt.M2  36.388    10.845   25.660            5
         1.126    dt  37.701    13.256   24.535            5
    
    Comparing MD3 ("list" method) with MD1 (best of DT non-list methods)
    Using a clean session  (ie, removing string cache)
    _Note: Ran the following twice, fresh session each time, with practically identical results
           Then re-ran in the *same* session, with reps=5. Results very different._
    
    
    benchmark(dt.M1=eval(dtMethod.MD1), dt.M3=eval(dtMethod.MD3), replications=1, columns=c("relative", "test", "elapsed", "user.self", "sys.self", "replications"), order="relative")
    # SAMPLE SIZE=17,568;  CLEAN SESSION
      relative  test elapsed user.self sys.self replications
         1.000 dt.M1   8.885     4.260    4.617            1
         1.633 dt.M3  14.506    12.821    1.677            1
    
    # SAMPLE SIZE=17,568;  *SAME* SESSION
      relative  test elapsed user.self sys.self replications
         1.000 dt.M1  33.443    10.200   23.226            5
         1.048 dt.M3  35.060    26.127    8.915            5
    
    #--------------------#
    
    New benchmarks against previous methods
    _Note: Not using the "list method" as results are not the same as other methods_
    
    # SAMPLE SIZE = 900
      relative   test elapsed user.self sys.self replications
         1.000  dt.M1   0.254     0.247    0.008            5
         1.705 reduce   0.433     0.425    0.010            5
        11.280   plyr   2.865     2.842    0.031            5
    
    # SAMPLE SIZE = 17,568
      relative   test elapsed user.self sys.self replications
         1.000  dt.M1  24.826    10.427   14.458            5
         4.348 reduce 107.935    70.107   38.314            5
         5.942   plyr 147.508   106.958   41.083            5
    
    0 讨论(0)
  • 2020-12-25 14:41

    I believe this base R solution is comparable with @Arun's data table solution. (Which isn't to say I would prefer it; that code is much simpler!)

    baseMethod2 <- quote({
        is <- unname(split(1:nrow(df), with(df, paste(Par1, Par2, sep="\b"))))
        i1 <- sapply(is, `[`, 1)
        out <- with(df, data.frame(Par1=Par1[i1], Par2=Par2[i1]))
        js <- lapply(is, function(i) split(i, df$Type[i]))
        out$post <- sapply(js, function(j) mean(df$Val[j$post]))
        out$pre <- sapply(js, function(j) mean(df$Val[j$pre]))
        out$Num.pre <- sapply(js, function(j) length(j$pre))
        out$Num.post <- sapply(js, function(j) length(j$post))
        out$ParD <- sapply(is, function(x) paste(df$ParD[x], collapse="_"))
        out
    })
    

    Using @RicardoSaporta's timing code with 900, 2700, and 10,800, respectively:

    > relative        test elapsed user.self sys.self replications
    3    1.000 baseMethod2   0.230     0.229        0            5
    1    1.130          dt   0.260     0.257        0            5
    2    8.752        plyr   2.013     2.006        0            5
    
    > relative        test elapsed user.self sys.self replications
    3    1.000 baseMethod2   0.877     0.872        0            5
    1    1.068          dt   0.937     0.934        0            5
    2    8.060        plyr   7.069     7.043        0            5
    
    > relative        test elapsed user.self sys.self replications
    1    1.000          dt   6.232     6.178    0.031            5
    3    1.085 baseMethod2   6.763     6.683    0.054            5
    2    7.263        plyr  45.261    44.983    0.104            5
    
    0 讨论(0)
  • 2020-12-25 14:42

    Trying to wrap different aggregation expressions into a self-contained function (expressions should yield atomic values)...

    multi.by <- function(X, INDEX,...) {
        expressions <- substitute(...())
        duplicates <- duplicated(INDEX)
        res <- do.call(rbind,sapply(split(X,cumsum(!duplicates),drop=T), function(part) 
            sapply(expressions,eval,part,simplify=F),simplify=F))
        if (is.data.frame(INDEX)) res <- cbind(INDEX[!duplicates,],res)
        else rownames(res) <- INDEX[!duplicates]
        res
    }
    
    multi.by(df,df[,1:2],
        pre=mean(Val[Type=="pre"]), 
        post=mean(Val[Type=="post"]),
        Num.pre=sum(Type=="pre"),
        Num.post=sum(Type=="post"),
        ParD=paste(ParD, collapse="_"))
    
    0 讨论(0)
  • 2020-12-25 14:46

    Late to the party, but here's another alternative using data.table:

    require(data.table)
    dt <- data.table(df, key=c("Par1", "Par2"))
    dt[, list(pre=mean(Val[Type == "pre"]), 
              post=mean(Val[Type == "post"]), 
              pre.num=length(Val[Type == "pre"]), 
              post.num=length(Val[Type == "post"]), 
              ParD = paste(ParD, collapse="_")), 
    by=list(Par1, Par2)]
    
    #    Par1 Par2 pre post pre.num post.num        ParD
    # 1:    A    D  10   20       1        1     foo_bar
    # 2:    B    E  30   40       1        1     baz_qux
    # 3:    C    F  50   65       1        2 bla_xyz_meh
    

    [from Matthew] +1 Some minor improvements to save repeating the same ==, and to demonstrate local variables inside j.

    dt[, list(pre=mean(Val[.pre <- Type=="pre"]),     # save .pre
              post=mean(Val[.post <- Type=="post"]),  # save .post
              pre.num=sum(.pre),                      # reuse .pre
              post.num=sum(.post),                    # reuse .post
              ParD = paste(ParD, collapse="_")), 
    by=list(Par1, Par2)]
    
    #    Par1 Par2 pre post pre.num post.num        ParD
    # 1:    A    D  10   20       1        1     foo_bar
    # 2:    B    E  30   40       1        1     baz_qux
    # 3:    C    F  50   65       1        2 bla_xyz_meh
    
    dt[, { .pre <- Type=="pre"                  # or save .pre and .post up front 
           .post <- Type=="post"
           list(pre=mean(Val[.pre]), 
                post=mean(Val[.post]),
                pre.num=sum(.pre),
                post.num=sum(.post), 
                ParD = paste(ParD, collapse="_")) }
    , by=list(Par1, Par2)]
    
    #    Par1 Par2 pre post pre.num post.num        ParD
    # 1:    A    D  10   20       1        1     foo_bar
    # 2:    B    E  30   40       1        1     baz_qux
    # 3:    C    F  50   65       1        2 bla_xyz_meh
    

    And if a list column is ok rather than a paste, then this should be faster :

    dt[, { .pre <- Type=="pre"
           .post <- Type=="post"
           list(pre=mean(Val[.pre]), 
                post=mean(Val[.post]),
                pre.num=sum(.pre),
                post.num=sum(.post), 
                ParD = list(ParD)) }     # list() faster than paste()
    , by=list(Par1, Par2)]
    
    #    Par1 Par2 pre post pre.num post.num        ParD
    # 1:    A    D  10   20       1        1     foo,bar
    # 2:    B    E  30   40       1        1     baz,qux
    # 3:    C    F  50   65       1        2 bla,xyz,meh
    
    0 讨论(0)
  • 2020-12-25 14:52

    You could do a merge of two dcasts and an aggregate, here all wrapped into one large expression mostly to avoid having intermediate objects hanging around afterwards:

    Reduce(merge, list(
        dcast(df, formula = Par1+Par2~Type, value.var="Val",
            fun.aggregate=mean),
        setNames(dcast(df, formula = Par1+Par2~Type, value.var="Val",
            fun.aggregate=length), c("Par1", "Par2", "Num.post",
            "Num.pre")),
        aggregate(df["ParD"], df[c("Par1", "Par2")], paste, collapse="_")
        ))
    
    0 讨论(0)
  • 2020-12-25 14:53

    I'll post but agstudy's puts me to shame:

    step1 <- with(df, split(df, list(Par1, Par2)))
    step2 <- step1[sapply(step1, nrow) > 0]
    step3 <- lapply(step2, function(x) {
        piece1 <- tapply(x$Val, x$Type, mean)
        piece2 <- tapply(x$Type, x$Type, length)
        names(piece2) <- paste0("Num.", names(piece2))
        out <- x[1, 1:2]
        out[, 3:6] <- c(piece1, piece2)
        names(out)[3:6] <-  names(c(piece1, piece2))
        out$ParD <- paste(unique(x$ParD), collapse="_")
        out
    })
    data.frame(do.call(rbind, step3), row.names=NULL)
    

    Yielding:

      Par1 Par2 post pre Num.post Num.pre        ParD
    1    A    D  2.0   1        1       1     foo_bar
    2    B    E  4.0   3        1       1     baz_qux
    3    C    F  6.5   5        2       1 bla_xyz_meh
    
    0 讨论(0)
提交回复
热议问题