for each group summarise means for all variables in dataframe (ddply? split?)

前端未结

关注

 6  1062

A week ago I would have done this manually: subset the dataframe by group into new dataframes, compute the means of each variable for each dataframe, then rbind the results. Very clunky ...

相关标签:

6条回答

感情败类

2020-12-13 16:30

Given the format you want for the result, the reshape package will be more efficient than plyr.

# Simulated data: five normal variables with means 0..4 plus two grouping keys.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
test_data <- data.frame(
  var0 = rnorm(100),
  var1 = rnorm(100, 1),
  var2 = rnorm(100, 2),
  var3 = rnorm(100, 3),
  var4 = rnorm(100, 4),
  group = sample(letters[1:10], 100, replace = TRUE),
  year = sample(c(2007, 2009), 100, replace = TRUE)
)

library(reshape)
# Long format: one row per observed value, keyed by group, year and variable.
Molten <- melt(test_data, id.vars = c("group", "year"))
# Back to wide: one row per group/variable pair, one column of means per year.
# fun.aggregate is named in full instead of relying on partial matching.
cast(Molten, group + variable ~ year, fun.aggregate = mean)

The result looks like this

   group variable         2007         2009
1      a     var0  0.003767891  0.340989068
2      a     var1  2.009026385  1.162786943
3      a     var2  1.861061882  2.676524736
4      a     var3  2.998011426  3.311250399
5      a     var4  3.979255971  4.165715967
6      b     var0 -0.112883844 -0.179762343
7      b     var1  1.342447279  1.199554144
8      b     var2  2.486088196  1.767431740
9      b     var3  3.261451449  2.934903824
10     b     var4  3.489147597  3.076779626
11     c     var0  0.493591055 -0.113469315
12     c     var1  0.157424796 -0.186590644
13     c     var2  2.366594176  2.458204041
14     c     var3  3.485808031  2.817153628
15     c     var4  3.681576886  3.057915666
16     d     var0  0.360188789  1.205875725
17     d     var1  1.271541181  0.898973536
18     d     var2  1.824468264  1.944708165
19     d     var3  2.323315162  3.550719308
20     d     var4  3.852223640  4.647498956
21     e     var0 -0.556751465  0.273865769
22     e     var1  1.173899189  0.719520372
23     e     var2  1.935402724  2.046313047
24     e     var3  3.318669590  2.871462470
25     e     var4  4.374478734  4.522511874
26     f     var0 -0.258956555 -0.007729091
27     f     var1  1.424479454  1.175242755
28     f     var2  1.797948551  2.411030282
29     f     var3  3.083169793  3.324584667
30     f     var4  4.160641429  3.546527820
31     g     var0  0.189038036 -0.683028110
32     g     var1  0.429915866  0.827761101
33     g     var2  1.839982321  1.513104866
34     g     var3  3.106414330  2.755975622
35     g     var4  4.599340239  3.691478466
36     h     var0  0.015557352 -0.707257185
37     h     var1  0.933199148  1.037655156
38     h     var2  1.927442457  2.521369108
39     h     var3  3.246734239  3.703213646
40     h     var4  4.242387776  4.407960355
41     i     var0  0.885226638 -0.288221276
42     i     var1  1.216012653  1.502514588
43     i     var2  2.302815441  1.905731471
44     i     var3  2.026631277  2.836508446
45     i     var4  4.800676814  4.772964668
46     j     var0 -0.435661855  0.192703997
47     j     var1  0.836814185  0.394505861
48     j     var2  1.663523873  2.377640369
49     j     var3  3.489536343  3.457597835
50     j     var4  4.146020948  4.281599816

0 讨论(0)

执念已碎

2020-12-13 16:36

You can do this with by(). First set up some data:

R> set.seed(42)
R> testdf <- data.frame(var1=rnorm(100), var2=rnorm(100,2), var3=rnorm(100,3),  
                        group=as.factor(sample(letters[1:10],100,replace=T)),  
                        year=as.factor(sample(c(2007,2009),100,replace=T)))
R> summary(testdf)
      var1              var2              var3          group      year   
 Min.   :-2.9931   Min.   :-0.0247   Min.   :0.30   e      :15   2007:50  
 1st Qu.:-0.6167   1st Qu.: 1.4085   1st Qu.:2.29   c      :14   2009:50  
 Median : 0.0898   Median : 1.9307   Median :2.98   f      :12            
 Mean   : 0.0325   Mean   : 1.9125   Mean   :2.99   h      :12            
 3rd Qu.: 0.6616   3rd Qu.: 2.4618   3rd Qu.:3.65   d      :11            
 Max.   : 2.2866   Max.   : 4.7019   Max.   :5.46   b      :10            
                                                    (Other):26

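Use by() (note: since R 3.0.0 mean() no longer accepts a data frame, so on a current R pass colMeans instead of mean in the calls below):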

R> by(testdf[,1:3], testdf$year, mean)
testdf$year: 2007
   var1    var2    var3 
0.04681 1.77638 3.00122 
--------------------------------------------------------------------- 
testdf$year: 2009
   var1    var2    var3 
0.01822 2.04865 2.97805 
R> by(testdf[,1:3], list(testdf$group, testdf$year), mean)  
## longer answer by group and year suppressed

You still need to reformat this for your table but it does give you the gist of your answer in one line.

Edit: Further processing can be had via

R> foo <- by(testdf[,1:3], list(testdf$group, testdf$year), mean)  
R> do.call(rbind, foo)
          var1   var2  var3
 [1,]  0.62352 0.2549 3.157
 [2,]  0.08867 1.8313 3.607
 [3,] -0.69093 2.5431 3.094
 [4,]  0.02792 2.8068 3.181
 [5,] -0.26423 1.3269 2.781
 [6,]  0.07119 1.9453 3.284
 [7,] -0.10438 2.1181 3.783
 [8,]  0.21147 1.6345 2.470
 [9,]  1.17986 1.6518 2.362
[10,] -0.42708 1.5683 3.144
[11,] -0.82681 1.9528 2.740
[12,] -0.27191 1.8333 3.090
[13,]  0.15854 2.2830 2.949
[14,]  0.16438 2.2455 3.100
[15,]  0.07489 2.1798 2.451
[16,] -0.03479 1.6800 3.099
[17,]  0.48082 1.8883 2.569
[18,]  0.32381 2.4015 3.332
[19,] -0.47319 1.5016 2.903
[20,]  0.11743 2.2645 3.452
R> do.call(rbind, dimnames(foo))
     [,1]   [,2]   [,3]   [,4]   [,5]   [,6]   [,7]   [,8]   [,9]   [,10] 
[1,] "a"    "b"    "c"    "d"    "e"    "f"    "g"    "h"    "i"    "j"   
[2,] "2007" "2009" "2007" "2009" "2007" "2009" "2007" "2009" "2007" "2009"

You can play with the dimnames some more:

R> expand.grid(dimnames(foo))
   Var1 Var2
1     a 2007
2     b 2007
3     c 2007
4     d 2007
5     e 2007
6     f 2007
7     g 2007
8     h 2007
9     i 2007
10    j 2007
11    a 2009
12    b 2009
13    c 2009
14    d 2009
15    e 2009
16    f 2009
17    g 2009
18    h 2009
19    i 2009
20    j 2009
R>

Edit: And with that, we can create a data.frame for the result without resorting to external packages using only base R:

R> data.frame(cbind(expand.grid(dimnames(foo)), do.call(rbind, foo)))
   Var1 Var2     var1   var2  var3
1     a 2007  0.62352 0.2549 3.157
2     b 2007  0.08867 1.8313 3.607
3     c 2007 -0.69093 2.5431 3.094
4     d 2007  0.02792 2.8068 3.181
5     e 2007 -0.26423 1.3269 2.781
6     f 2007  0.07119 1.9453 3.284
7     g 2007 -0.10438 2.1181 3.783
8     h 2007  0.21147 1.6345 2.470
9     i 2007  1.17986 1.6518 2.362
10    j 2007 -0.42708 1.5683 3.144
11    a 2009 -0.82681 1.9528 2.740
12    b 2009 -0.27191 1.8333 3.090
13    c 2009  0.15854 2.2830 2.949
14    d 2009  0.16438 2.2455 3.100
15    e 2009  0.07489 2.1798 2.451
16    f 2009 -0.03479 1.6800 3.099
17    g 2009  0.48082 1.8883 2.569
18    h 2009  0.32381 2.4015 3.332
19    i 2009 -0.47319 1.5016 2.903
20    j 2009  0.11743 2.2645 3.452
R>

0 讨论(0)

囚心锁ツ

2020-12-13 16:40
EDIT: I wrote the following and then realized that Thierry had already written up almost EXACTLY the same answer. I somehow overlooked his answer. So if you like this answer, vote his up instead. I'm going ahead and posting since I spent the time typing it up.

This sort of stuff consumes way more of my time than I wish it did! Here's a solution using the reshape package by Hadley Wickham. This example does not do exactly what you asked because the results are all in one big table, not a table for each group.

The trouble you were having with the numeric values showing up as factors was because you were using cbind and everything was getting slammed into a matrix of type character. The cool thing is you don't need cbind with data.frame.
```
# Simulated data: five normal variables with means 0..4 plus two grouping keys.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
test_data <- data.frame(
  var0 = rnorm(100),
  var1 = rnorm(100, 1),
  var2 = rnorm(100, 2),
  var3 = rnorm(100, 3),
  var4 = rnorm(100, 4),
  group = sample(letters[1:10], 100, replace = TRUE),
  year = sample(c(2007, 2009), 100, replace = TRUE)
)

library(reshape)
# Long format: one row per observed value, keyed by group, year and variable.
# (The original line had an unbalanced closing parenthesis and did not parse.)
molten_data <- melt(test_data, id.vars = c("group", "year"))
# Back to wide: one row per group/variable pair, one column of means per year.
cast(molten_data, group + variable ~ year, fun.aggregate = mean)
```
and this results in the following:
```
    group variable        2007         2009
1      a     var0 -0.92040686 -0.154746420
2      a     var1  1.06603832  0.559765035
3      a     var2  2.34476321  2.206521587
4      a     var3  3.01652065  3.256580166
5      a     var4  3.75256699  3.907777127
6      b     var0 -0.53207427 -0.149144766
7      b     var1  0.75677714  0.879387608
8      b     var2  2.41739521  1.224854891
9      b     var3  2.63877431  2.436837719
10     b     var4  3.69640598  4.439047363
...
```
I wrote a blog post recently about doing something similar with plyr. I should do a part 2 about how to do the same thing using the reshape package. Both plyr and reshape were written by Hadley Wickham and are crazy useful tools.
0 讨论(0)
发布评论:

提交评论
- 加载中...
天命终不由人

2020-12-13 16:45
First do a simple aggregate to get it summarized.
```
# Mean of every value column for each year/group combination; the result has
# the grouping columns (year, group) first, followed by var0..var4.
df <- aggregate(
  cbind(var0, var1, var2, var3, var4) ~ year + group,
  data = test_data,
  FUN = mean
)
```
That makes a data.frame like this...
```
   year group     var0      var1     var2     var3     var4
1  2007     a 42.25000 0.2031277 2.145394 2.801812 3.571999
2  2009     a 30.50000 1.2033653 1.475158 3.618023 4.127601
3  2007     b 52.60000 1.4564604 2.224850 3.053322 4.339109
...
```
That, by itself, is pretty close to what you wanted. You could just break it up by group now.
```
# Break the aggregated table into a list holding one data frame per group.
l <- split(df, df$group)
```
OK, so that's not quite it but we can refine the output if you really want to.
```
# Build one variables-by-years table per group: transpose the five value
# columns (positions 3 to 7) and label the resulting columns with the years.
# The years are looked up by name: in the aggregate result `year` is the first
# column and `group` the second, so positional index 2 would label the columns
# with the group letter instead of 2007/2009.
lapply(l, function(x) {
  d <- t(x[, 3:7])
  colnames(d) <- x$year
  d
})

$a
           2007      2009
var0 42.2500000 30.500000
var1  0.2031277  1.203365
var2  2.1453939  1.475158
...
```
That doesn't have all your table formatting but it's organized exactly as you describe and is darn close. This last step you could pretty up how you like.

This is the only answer here that matches the requested organization, and it's the fastest way to do it in R. BTW, I wouldn't bother doing that last step and just stick with the very first output from the aggregate... or maybe the split.
0 讨论(0)
发布评论:

提交评论
- 加载中...
北恋

2020-12-13 16:46
First of all, you don't need to use cbind, and that's why everything is a factor. This works:
```
# Build the example data directly with data.frame(): each column keeps its own
# type (numeric values, character group, numeric year) with no cbind() coercion.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
test_data <- data.frame(
  var0 = rnorm(100),
  var1 = rnorm(100, 1),
  var2 = rnorm(100, 2),
  var3 = rnorm(100, 3),
  var4 = rnorm(100, 4),
  group = sample(letters[1:10], 100, replace = TRUE),
  year = sample(c(2007, 2009), 100, replace = TRUE)
)
```
Secondly, the best practice is to use "." instead of "_" in variable names. See the google style guide (for instance).

Finally, you can use the Rigroup package; it's very fast. Combine the igroupMeans() function with apply, and set the index i=as.factor(paste(test_data$group,test_data$year,sep="")). I'll try to include an example of this later.

EDIT 6/9/2017

Rigroup package was removed from CRAN. See this
0 讨论(0)
发布评论:

提交评论
- 加载中...

日久生厌

2020-12-13 16:49

It can be done with base R functions alone:

# Number of simulated observations; every column below is sized from it.
n <- 100

# Five normal variables with means 0..4, plus a group letter and a year.
test_data <- data.frame(
    var0 = rnorm(n, mean = 0),
    var1 = rnorm(n, mean = 1),
    var2 = rnorm(n, mean = 2),
    var3 = rnorm(n, mean = 3),
    var4 = rnorm(n, mean = 4),
    group = sample(letters[1:10], size = n, replace = TRUE),
    year = sample(c(2007, 2009), size = n, replace = TRUE)
)

# Outer tapply: hand each group its row numbers.
# sapply: walk over the value columns for that group.
# Inner tapply: average one column's values within the group, per year.
tapply(
    X = seq_len(nrow(test_data)),
    INDEX = test_data$group,
    FUN = function(rows) {
        sapply(
            X = paste0("var", 0:4),
            FUN = function(column) {
                tapply(
                    X = test_data[[column]][rows],
                    INDEX = test_data$year[rows],
                    FUN = mean
                )
            }
        )
    }
)

Explanations:

Tip: when generating random data it is useful to define the number of observations once; changing the sample size is easier that way.
The first tapply splits the row indices 1:nrow(test_data) by group,
then, for each group, sapply iterates over the variables,
and for a fixed group and variable a simple tapply returns the mean of the variable per year.

In R 2.9.2 result is:

$a
 var0.2007  var1.2007  var2.2007  var3.2007  var4.2007 
-0.3123034  0.8759787  1.9832617  2.7063034  4.1322758 

$b
            var0      var1     var2     var3     var4
2007  0.81366885 0.4189896 2.331256 3.073276 4.164639
2009 -0.08916257 1.5442126 3.008014 3.215019 4.398279

$c
          var0      var1     var2     var3     var4
2007 0.4232098 1.3657369 1.386627 2.808511 3.878809
2009 0.3245751 0.6672073 1.797886 1.752568 3.632318

$d
           var0      var1     var2     var3     var4
2007 -0.1335138 0.5925237 2.303543 3.293281 3.234386
2009  0.9547751 2.2111581 2.678878 2.845234 3.300512

$e
           var0      var1     var2     var3     var4
2007 -0.5958653 1.3535658 1.886918 3.036121 4.120889
2009  0.1372080 0.7215648 2.298064 3.186617 3.551147

$f
           var0      var1     var2     var3     var4
2007 -0.3401813 0.7883120 1.949329 2.811438 4.194481
2009  0.3012627 0.2702647 3.332480 3.480494 2.963951

$g
         var0       var1      var2     var3     var4
2007 1.225245 -0.3289711 0.7599302 2.903581 4.200023
2009 0.273858  0.2445733 1.7690299 2.620026 4.182050

$h
           var0     var1     var2     var3     var4
2007 -1.0126650 1.554403 2.220979 3.713874 3.924151
2009 -0.6187407 1.504297 1.321930 2.796882 4.179695

$i
            var0     var1     var2     var3     var4
2007  0.01697314 1.318965 1.794635 2.709925 2.899440
2009 -0.75790995 1.033483 2.363052 2.422679 3.863526

$j
           var0      var1     var2     var3     var4
2007 -0.7440600 1.6466291 2.020379 3.242770 3.727347
2009 -0.2842126 0.5450029 1.669964 2.747455 4.179531

With my random data there is a problem with the "a" group: only 2007 cases were present, so its result collapses to a vector. If year is a factor (with levels 2007 and 2009) then the results may look better (you will have two rows, one for each year, but some of them will probably be NA).

The result is a list, so you can use lapply to e.g. convert it to a LaTeX table or an HTML table, print it transposed on screen, etc.

0 讨论(0)