for each group summarise means for all variables in dataframe (ddply? split?)

前端 未结 6 1056
难免孤独
难免孤独 2020-12-13 16:00

A week ago I would have done this manually: subset the dataframe by group into new dataframes, compute the means of each variable for each of them, and then rbind the results. Very clunky ...

相关标签:
6条回答
  • 2020-12-13 16:30

    Given the format you want for the result, the reshape package will be more efficient than plyr.

    # Example data: 100 rows of five normal variables (means 0 to 4),
    # a group label "a".."j" and a year that is either 2007 or 2009.
    test_data <- data.frame(
      var0 = rnorm(100),
      var1 = rnorm(100, 1),
      var2 = rnorm(100, 2),
      var3 = rnorm(100, 3),
      var4 = rnorm(100, 4),
      # Spell out TRUE: `T` is an ordinary variable and can be reassigned.
      group = sample(letters[1:10], 100, replace = TRUE),
      year = sample(c(2007, 2009), 100, replace = TRUE)
    )
    
    library(reshape)
    # Long format: one row per (group, year, variable) with its value.
    Molten <- melt(test_data, id.vars = c("group", "year"))
    # Wide format: one row per group/variable, one column per year, cells are
    # means. Arguments are named in full rather than relying on partial
    # matching of `fun` to `fun.aggregate`.
    cast(Molten, group + variable ~ year, fun.aggregate = mean)
    

    The result looks like this

       group variable         2007         2009
    1      a     var0  0.003767891  0.340989068
    2      a     var1  2.009026385  1.162786943
    3      a     var2  1.861061882  2.676524736
    4      a     var3  2.998011426  3.311250399
    5      a     var4  3.979255971  4.165715967
    6      b     var0 -0.112883844 -0.179762343
    7      b     var1  1.342447279  1.199554144
    8      b     var2  2.486088196  1.767431740
    9      b     var3  3.261451449  2.934903824
    10     b     var4  3.489147597  3.076779626
    11     c     var0  0.493591055 -0.113469315
    12     c     var1  0.157424796 -0.186590644
    13     c     var2  2.366594176  2.458204041
    14     c     var3  3.485808031  2.817153628
    15     c     var4  3.681576886  3.057915666
    16     d     var0  0.360188789  1.205875725
    17     d     var1  1.271541181  0.898973536
    18     d     var2  1.824468264  1.944708165
    19     d     var3  2.323315162  3.550719308
    20     d     var4  3.852223640  4.647498956
    21     e     var0 -0.556751465  0.273865769
    22     e     var1  1.173899189  0.719520372
    23     e     var2  1.935402724  2.046313047
    24     e     var3  3.318669590  2.871462470
    25     e     var4  4.374478734  4.522511874
    26     f     var0 -0.258956555 -0.007729091
    27     f     var1  1.424479454  1.175242755
    28     f     var2  1.797948551  2.411030282
    29     f     var3  3.083169793  3.324584667
    30     f     var4  4.160641429  3.546527820
    31     g     var0  0.189038036 -0.683028110
    32     g     var1  0.429915866  0.827761101
    33     g     var2  1.839982321  1.513104866
    34     g     var3  3.106414330  2.755975622
    35     g     var4  4.599340239  3.691478466
    36     h     var0  0.015557352 -0.707257185
    37     h     var1  0.933199148  1.037655156
    38     h     var2  1.927442457  2.521369108
    39     h     var3  3.246734239  3.703213646
    40     h     var4  4.242387776  4.407960355
    41     i     var0  0.885226638 -0.288221276
    42     i     var1  1.216012653  1.502514588
    43     i     var2  2.302815441  1.905731471
    44     i     var3  2.026631277  2.836508446
    45     i     var4  4.800676814  4.772964668
    46     j     var0 -0.435661855  0.192703997
    47     j     var1  0.836814185  0.394505861
    48     j     var2  1.663523873  2.377640369
    49     j     var3  3.489536343  3.457597835
    50     j     var4  4.146020948  4.281599816
    
    0 讨论(0)
  • 2020-12-13 16:36

    You can do this with by(). First set up some data:

    R> set.seed(42)   # fixed seed so the example is reproducible
    R> testdf <- data.frame(var1=rnorm(100), var2=rnorm(100,2), var3=rnorm(100,3),
                            group=as.factor(sample(letters[1:10],100,replace=TRUE)),
                            year=as.factor(sample(c(2007,2009),100,replace=TRUE)))
    R> summary(testdf)
          var1              var2              var3          group      year   
     Min.   :-2.9931   Min.   :-0.0247   Min.   :0.30   e      :15   2007:50  
     1st Qu.:-0.6167   1st Qu.: 1.4085   1st Qu.:2.29   c      :14   2009:50  
     Median : 0.0898   Median : 1.9307   Median :2.98   f      :12            
     Mean   : 0.0325   Mean   : 1.9125   Mean   :2.99   h      :12            
     3rd Qu.: 0.6616   3rd Qu.: 2.4618   3rd Qu.:3.65   d      :11            
     Max.   : 2.2866   Max.   : 4.7019   Max.   :5.46   b      :10            
                                                        (Other):26  
    

    Use by():

    R> by(testdf[,1:3], testdf$year, colMeans)   # colMeans: mean() on a data frame is defunct since R 3.0.0
    testdf$year: 2007
       var1    var2    var3 
    0.04681 1.77638 3.00122 
    --------------------------------------------------------------------- 
    testdf$year: 2009
       var1    var2    var3 
    0.01822 2.04865 2.97805 
    R> by(testdf[,1:3], list(testdf$group, testdf$year), colMeans)   # colMeans: mean() on a data frame is defunct since R 3.0.0
    ## longer answer by group and year suppressed
    

    You still need to reformat this for your table but it does give you the gist of your answer in one line.

    Edit: Further processing can be had via

    R> foo <- by(testdf[,1:3], list(testdf$group, testdf$year), colMeans)   # colMeans: mean() on a data frame is defunct since R 3.0.0
    R> do.call(rbind, foo)
              var1   var2  var3
     [1,]  0.62352 0.2549 3.157
     [2,]  0.08867 1.8313 3.607
     [3,] -0.69093 2.5431 3.094
     [4,]  0.02792 2.8068 3.181
     [5,] -0.26423 1.3269 2.781
     [6,]  0.07119 1.9453 3.284
     [7,] -0.10438 2.1181 3.783
     [8,]  0.21147 1.6345 2.470
     [9,]  1.17986 1.6518 2.362
    [10,] -0.42708 1.5683 3.144
    [11,] -0.82681 1.9528 2.740
    [12,] -0.27191 1.8333 3.090
    [13,]  0.15854 2.2830 2.949
    [14,]  0.16438 2.2455 3.100
    [15,]  0.07489 2.1798 2.451
    [16,] -0.03479 1.6800 3.099
    [17,]  0.48082 1.8883 2.569
    [18,]  0.32381 2.4015 3.332
    [19,] -0.47319 1.5016 2.903
    [20,]  0.11743 2.2645 3.452
    R> do.call(rbind, dimnames(foo))
         [,1]   [,2]   [,3]   [,4]   [,5]   [,6]   [,7]   [,8]   [,9]   [,10] 
    [1,] "a"    "b"    "c"    "d"    "e"    "f"    "g"    "h"    "i"    "j"   
    [2,] "2007" "2009" "2007" "2009" "2007" "2009" "2007" "2009" "2007" "2009"
    

    You can play with the dimnames some more:

    R> expand.grid(dimnames(foo))
       Var1 Var2
    1     a 2007
    2     b 2007
    3     c 2007
    4     d 2007
    5     e 2007
    6     f 2007
    7     g 2007
    8     h 2007
    9     i 2007
    10    j 2007
    11    a 2009
    12    b 2009
    13    c 2009
    14    d 2009
    15    e 2009
    16    f 2009
    17    g 2009
    18    h 2009
    19    i 2009
    20    j 2009
    R> 
    

    Edit: And with that, we can create a data.frame for the result without resorting to external packages using only base R:

    R> data.frame(cbind(expand.grid(dimnames(foo)), do.call(rbind, foo)))
       Var1 Var2     var1   var2  var3
    1     a 2007  0.62352 0.2549 3.157
    2     b 2007  0.08867 1.8313 3.607
    3     c 2007 -0.69093 2.5431 3.094
    4     d 2007  0.02792 2.8068 3.181
    5     e 2007 -0.26423 1.3269 2.781
    6     f 2007  0.07119 1.9453 3.284
    7     g 2007 -0.10438 2.1181 3.783
    8     h 2007  0.21147 1.6345 2.470
    9     i 2007  1.17986 1.6518 2.362
    10    j 2007 -0.42708 1.5683 3.144
    11    a 2009 -0.82681 1.9528 2.740
    12    b 2009 -0.27191 1.8333 3.090
    13    c 2009  0.15854 2.2830 2.949
    14    d 2009  0.16438 2.2455 3.100
    15    e 2009  0.07489 2.1798 2.451
    16    f 2009 -0.03479 1.6800 3.099
    17    g 2009  0.48082 1.8883 2.569
    18    h 2009  0.32381 2.4015 3.332
    19    i 2009 -0.47319 1.5016 2.903
    20    j 2009  0.11743 2.2645 3.452
    R> 
    
    0 讨论(0)
  • 2020-12-13 16:40

    EDIT: I wrote the following and then realized that Thierry had already written up almost EXACTLY the same answer. I somehow overlooked his answer. So if you like this answer, vote his up instead. I'm going ahead and posting since I spent the time typing it up.


    This sort of stuff consumes way more of my time than I wish it did! Here's a solution using the reshape package by Hadley Wickham. This example does not do exactly what you asked because the results are all in one big table, not a table for each group.

    The trouble you were having with the numeric values showing up as factors was because you were using cbind and everything was getting slammed into a matrix of type character. The cool thing is you don't need cbind with data.frame.

    # Example data built directly with data.frame() (no cbind), so the numeric
    # columns stay numeric: five normal variables with means 0 to 4, a group
    # label "a".."j" and a year that is either 2007 or 2009.
    test_data <- data.frame(
      var0 = rnorm(100),
      var1 = rnorm(100, 1),
      var2 = rnorm(100, 2),
      var3 = rnorm(100, 3),
      var4 = rnorm(100, 4),
      # Spell out TRUE: `T` is an ordinary variable and can be reassigned.
      group = sample(letters[1:10], 100, replace = TRUE),
      year = sample(c(2007, 2009), 100, replace = TRUE)
    )
    
    library(reshape)
    # Long format: one row per (group, year, variable) with its value.
    # (The original line had an extra closing parenthesis, a syntax error.)
    molten_data <- melt(test_data, id.vars = c("group", "year"))
    # One row per group/variable, one column per year, cells are means.
    cast(molten_data, group + variable ~ year, fun.aggregate = mean)
    

    and this results in the following:

        group variable        2007         2009
    1      a     var0 -0.92040686 -0.154746420
    2      a     var1  1.06603832  0.559765035
    3      a     var2  2.34476321  2.206521587
    4      a     var3  3.01652065  3.256580166
    5      a     var4  3.75256699  3.907777127
    6      b     var0 -0.53207427 -0.149144766
    7      b     var1  0.75677714  0.879387608
    8      b     var2  2.41739521  1.224854891
    9      b     var3  2.63877431  2.436837719
    10     b     var4  3.69640598  4.439047363
    ...
    

    I wrote a blog post recently about doing something similar with plyr. I should do a part 2 about how to do the same thing using the reshape package. Both plyr and reshape were written by Hadley Wickham and are crazy useful tools.

    0 讨论(0)
  • 2020-12-13 16:45

    First do a simple aggregate to get it summarized.

    # Mean of every variable for each year/group combination; the result has
    # the grouping columns (year, group) first, followed by var0..var4.
    df <- aggregate(
      cbind(var0, var1, var2, var3, var4) ~ year + group,
      data = test_data,
      FUN = mean
    )
    

    That makes a data.frame like this...

       year group     var0      var1     var2     var3     var4
    1  2007     a 42.25000 0.2031277 2.145394 2.801812 3.571999
    2  2009     a 30.50000 1.2033653 1.475158 3.618023 4.127601
    3  2007     b 52.60000 1.4564604 2.224850 3.053322 4.339109
    ...
    

    That, by itself, is pretty close to what you wanted. You could just break it up by group now.

    # One data frame per group, collected in a list named by group.
    l <- split(x = df, f = df[["group"]])
    

    OK, so that's not quite it but we can refine the output if you really want to.

    # For each group, build a matrix with variables in rows and years in columns.
    lapply(l, function(x) {
      # Select the value columns by name instead of by position (3:7).
      d <- t(x[, setdiff(names(x), c("year", "group"))])
      # Label the columns with the year. The original used x[,2], which is the
      # group column of the aggregate() result, not the year.
      colnames(d) <- x$year
      d
    })
    
    $a
               2007      2009
    var0 42.2500000 30.500000
    var1  0.2031277  1.203365
    var2  2.1453939  1.475158
    ...
    

    That doesn't have all your table formatting but it's organized exactly as you describe and is darn close. This last step you could pretty up how you like.

    This is the only answer here that matches the requested organization, and it's the fastest way to do it in R. BTW, I wouldn't bother doing that last step and just stick with the very first output from the aggregate... or maybe the split.

    0 讨论(0)
  • 2020-12-13 16:46

    First of all, you don't need to use cbind, and that's why everything is a factor. This works:

    # Build the data frame directly (no cbind) so numeric columns stay numeric:
    # five normal variables with means 0 to 4, a group label "a".."j" and a
    # year that is either 2007 or 2009.
    test_data <- data.frame(
      var0 = rnorm(100),
      var1 = rnorm(100, 1),
      var2 = rnorm(100, 2),
      var3 = rnorm(100, 3),
      var4 = rnorm(100, 4),
      # Spell out TRUE: `T` is an ordinary variable and can be reassigned.
      group = sample(letters[1:10], 100, replace = TRUE),
      year = sample(c(2007, 2009), 100, replace = TRUE)
    )
    

    Secondly, the best practice is to use "." instead of "_" in variable names. See the google style guide (for instance).

    Finally, you can use the Rigroup package; it's very fast. Combine the igroupMeans() function with apply, and set the index i=as.factor(paste(test_data$group,test_data$year,sep="")). I'll try to include an example of this later.

    EDIT 6/9/2017

    Rigroup package was removed from CRAN. See this

    0 讨论(0)
  • 2020-12-13 16:49

    It can be done with base R functions:

    # Number of simulated observations; change it here to resize the example.
    n <- 100
    # Five normal variables with means 0 to 4, a group label "a".."j" and a
    # year that is either 2007 or 2009.
    test_data <- data.frame(
      var0 = rnorm(n, mean = 0),
      var1 = rnorm(n, mean = 1),
      var2 = rnorm(n, mean = 2),
      var3 = rnorm(n, mean = 3),
      var4 = rnorm(n, mean = 4),
      group = sample(letters[1:10], size = n, replace = TRUE),
      year = sample(c(2007, 2009), size = n, replace = TRUE)
    )

    # Outer tapply: hand the row numbers of each group to the function.
    # sapply: loop over the five variable names for that group.
    # Inner tapply: mean of one variable per year within the group's rows.
    tapply(
      seq_len(nrow(test_data)),
      test_data$group,
      function(group_rows) {
        sapply(
          paste0("var", 0:4),
          function(var_name) {
            values <- test_data[[var_name]][group_rows]
            years <- test_data$year[group_rows]
            tapply(values, years, mean)
          }
        )
      }
    )
    

    Explanations:

    • tip: when generating random data it is useful to define the number of observations; changing the sample size is easier that way,
    • the first tapply splits the row index 1:nrow(test_data) by group,
    • then, for each group, sapply iterates over the variables,
    • for a fixed group and variable, a simple tapply returns the mean of the variable per year.

    In R 2.9.2 the result is:

    $a
     var0.2007  var1.2007  var2.2007  var3.2007  var4.2007 
    -0.3123034  0.8759787  1.9832617  2.7063034  4.1322758 
    
    $b
                var0      var1     var2     var3     var4
    2007  0.81366885 0.4189896 2.331256 3.073276 4.164639
    2009 -0.08916257 1.5442126 3.008014 3.215019 4.398279
    
    $c
              var0      var1     var2     var3     var4
    2007 0.4232098 1.3657369 1.386627 2.808511 3.878809
    2009 0.3245751 0.6672073 1.797886 1.752568 3.632318
    
    $d
               var0      var1     var2     var3     var4
    2007 -0.1335138 0.5925237 2.303543 3.293281 3.234386
    2009  0.9547751 2.2111581 2.678878 2.845234 3.300512
    
    $e
               var0      var1     var2     var3     var4
    2007 -0.5958653 1.3535658 1.886918 3.036121 4.120889
    2009  0.1372080 0.7215648 2.298064 3.186617 3.551147
    
    $f
               var0      var1     var2     var3     var4
    2007 -0.3401813 0.7883120 1.949329 2.811438 4.194481
    2009  0.3012627 0.2702647 3.332480 3.480494 2.963951
    
    $g
             var0       var1      var2     var3     var4
    2007 1.225245 -0.3289711 0.7599302 2.903581 4.200023
    2009 0.273858  0.2445733 1.7690299 2.620026 4.182050
    
    $h
               var0     var1     var2     var3     var4
    2007 -1.0126650 1.554403 2.220979 3.713874 3.924151
    2009 -0.6187407 1.504297 1.321930 2.796882 4.179695
    
    $i
                var0     var1     var2     var3     var4
    2007  0.01697314 1.318965 1.794635 2.709925 2.899440
    2009 -0.75790995 1.033483 2.363052 2.422679 3.863526
    
    $j
               var0      var1     var2     var3     var4
    2007 -0.7440600 1.6466291 2.020379 3.242770 3.727347
    2009 -0.2842126 0.5450029 1.669964 2.747455 4.179531
    

    With my random data there is a problem with the "a" group - only 2007 cases were present. If year is a factor (with levels 2007 and 2009) then the results may look better (you will get two rows, one per year, for every group, but there will probably be NAs).

    The result is a list, so you can use lapply to e.g. convert it to a LaTeX table or an HTML table, print a transposed version on screen, etc.

    0 讨论(0)
提交回复
热议问题