Question
I know it is a very silly question, but I could not sort it out, which is why I am asking: how can I extract the rows from a large data set by common IDs, take the means of these rows, and make a column having these IDs as row names? e.g.
IDs Var2
Ae4 2
Ae4 4
Ae4 6
Bc3 3
Bc3 5
Ad2 8
Ad2 7
Output
Var(x)
Ae4 4
Bc3 4
Ad2 7.5
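For reference, here is a minimal base R sketch (not part of the original question) that reproduces the expected output, with the IDs as row names; df and the column heading Var(x) mirror the question:
df <- data.frame(IDs  = c("Ae4", "Ae4", "Ae4", "Bc3", "Bc3", "Ad2", "Ad2"),
                 Var2 = c(2, 4, 6, 3, 5, 8, 7))
means <- tapply(df$Var2, df$IDs, mean)                    # named vector of group means
out <- data.frame(`Var(x)` = means, check.names = FALSE)  # names(means) become row names
out
#     Var(x)
# Ad2    7.5
# Ae4    4.0
# Bc3    4.0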
Answer 1:
This kind of thing can easily be done using the plyr function ddply:
library(plyr)
dat = data.frame(ID = rep(LETTERS[1:5], each = 20), value = runif(100))
head(dat)
ID value
1 A 0.45800889
2 A 0.11221072
3 A 0.58833532
4 A 0.70056704
5 A 0.08337996
6 A 0.05195357
ddply(dat, .(ID), summarize, mn = mean(value))
ID mn
1 A 0.4960083
2 B 0.5809681
3 C 0.4512388
4 D 0.5079790
5 E 0.5397708
If your dataset is big, and/or the number of unique IDs is big, you could use data.table. See this paper for more detail about plyr.
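Applied to the asker's own example data, the same call would look like this (a sketch; df mirrors the question's data):
library(plyr)
df <- data.frame(IDs  = c("Ae4", "Ae4", "Ae4", "Bc3", "Bc3", "Ad2", "Ad2"),
                 Var2 = c(2, 4, 6, 3, 5, 8, 7))
ddply(df, .(IDs), summarize, mn = mean(Var2))
#   IDs  mn
# 1 Ad2 7.5
# 2 Ae4 4.0
# 3 Bc3 4.0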
Answer 2:
If you have a large data.frame, two alternatives to ddply are aggregate (a base R function) and data.table:
set.seed(001)
# the 100-element ID vector is recycled to match the 1e6 values
dat <- data.frame(ID = rep(LETTERS[1:5], each = 20), value = runif(1e6))
library(data.table)
DT <- data.table(dat)
DT[, mean(value), by=list(ID)] # data.table approach
aggregate(.~ID, data=dat, mean) # aggregate (R Base function) approach
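As an aside (not in the original answer), the data.table result column is named V1 by default; returning a list gives it a name (mn here is just an illustrative label):
DT[, list(mn = mean(value)), by=ID]  # named result column instead of V1
DT[, .(mn = mean(value)), by=ID]     # .() is data.table shorthand for list()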
library(rbenchmark) # comparing performance
library(plyr)       # needed for the ddply comparison below
benchmark(DT[, mean(value), by=list(ID)], # data.table approach
aggregate(.~ID, data=dat, mean), # aggregate approach
ddply(dat, .(ID), summarize, mn = mean(value)), # ddply approach (Paul Hiemstra's answer)
columns=c("test", "replications", "elapsed", "relative"),
order='relative',
replications=1)
test replications elapsed relative
1 DT[, mean(value), by = list(ID)] 1 0.14 1.000
3 ddply(dat, .(ID), summarize, mn = mean(value)) 1 0.58 4.143
2 aggregate(. ~ ID, data = dat, mean) 1 3.59 25.643
As you can see, the fastest is the data.table approach.
Edit
There's a base R approach that is even faster than data.table. Let's see:
unlist(lapply(split(dat$value, dat$ID), mean)) # another R Base approach
benchmark(DT[, mean(value), by=list(ID)], # data.table approach
aggregate(.~ID, data=dat, mean), # aggregate approach
ddply(dat, .(ID), summarize, mn = mean(value)), # ddply approach (Paul Hiemstra's answer)
unlist(lapply(split(dat$value, dat$ID), mean)), # lapply, split approach
columns=c("test", "replications", "elapsed", "relative"),
order='relative',
replications=1)
test replications elapsed relative
4 unlist(lapply(split(dat$value, dat$ID), mean)) 1 0.06 1.000
1 DT[, mean(value), by = list(ID)] 1 0.10 1.667
3 ddply(dat, .(ID), summarize, mn = mean(value)) 1 0.56 9.333
2 aggregate(. ~ ID, data = dat, mean) 1 3.28 54.667
Venables and Ripley (2000, p. 37) suggest that combining unlist, lapply and split is faster than just using sapply, and in this particular example it turned out to be even faster than data.table.
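For illustration, the sapply form they compare against would be the following one-liner (a sketch; it was not benchmarked in the original answer):
sapply(split(dat$value, dat$ID), mean)  # same named result, simplified by sapply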
Reference:
Venables, W. N. and Ripley, B. D. (2000). S Programming. Springer, Statistics and Computing. ISBN 0-387-98966-8.
Scaling up (edit from Matthew Dowle)
More groups
dat <- data.frame(ID = as.character(as.hexmode(1:2000)), value = runif(1e6))
DT <- as.data.table(dat)
benchmark(
DT[, mean(value), by=ID],
aggregate(.~ID, data=dat, mean),
ddply(dat, .(ID), summarize, mn = mean(value)),
unlist(lapply(split(dat$value, dat$ID), mean)),
columns=c("test", "replications", "elapsed", "relative"),
order='relative',
replications=3)
test replications elapsed relative
1 DT[, mean(value), by = ID] 3 0.33 1.000
4 unlist(lapply(split(dat$value, dat$ID), mean)) 3 0.41 1.242
2 aggregate(. ~ ID, data = dat, mean) 3 7.69 23.303
3 ddply(dat, .(ID), summarize, mn = mean(value)) 3 17.08 51.758
More rows
dat <- data.frame(ID = as.character(as.hexmode(1:2000)), value = runif(1e7))
DT <- as.data.table(dat)
benchmark(
DT[, mean(value), by=ID],
aggregate(.~ID, data=dat, mean),
ddply(dat, .(ID), summarize, mn = mean(value)),
unlist(lapply(split(dat$value, dat$ID), mean)),
columns=c("test", "replications", "elapsed", "relative"),
order='relative',
replications=3)
test replications elapsed relative
1 DT[, mean(value), by = ID] 3 3.18 1.000
4 unlist(lapply(split(dat$value, dat$ID), mean)) 3 4.26 1.340
2 aggregate(. ~ ID, data = dat, mean) 3 90.28 28.390
3 ddply(dat, .(ID), summarize, mn = mean(value)) 3 268.86 84.547
Setting a key first
system.time(setkey(DT,ID))
user system elapsed
0.71 0.03 0.75
object.size(dat)
152.7 Mb # Quite small; easy for a 32-bit PC with 2 GB RAM.
object.size(DT)
152.7 Mb
benchmark(
DT[, mean(value), by=ID],
aggregate(.~ID, data=dat, mean),
ddply(dat, .(ID), summarize, mn = mean(value)),
unlist(lapply(split(dat$value, dat$ID), mean)),
columns=c("test", "replications", "elapsed", "relative"),
order='relative',
replications=3)
test replications elapsed relative
1 DT[, mean(value), by = ID] 3 0.95 1.000
4 unlist(lapply(split(dat$value, dat$ID), mean)) 3 4.08 4.295
2 aggregate(. ~ ID, data = dat, mean) 3 91.76 96.589
3 ddply(dat, .(ID), summarize, mn = mean(value)) 3 265.15 279.105
Even more rows
dat <- data.frame(ID = rep(1:2000,each=50000), value = runif(1e8))
DT <- as.data.table(dat)
system.time(setkey(DT,ID))
user system elapsed
2.10 0.25 2.34
object.size(dat)
1.1 Gb # Comfortable for a 64-bit PC with 8 GB RAM.
object.size(DT)
1.1 Gb
benchmark(
DT[, mean(value), by=ID],
unlist(lapply(split(dat$value, dat$ID), mean)),
columns=c("test", "replications", "elapsed", "relative"),
order='relative',
replications=3)
test replications elapsed relative
1 DT[, mean(value), by = ID] 3 7.30 1.000
2 unlist(lapply(split(dat$value, dat$ID), mean)) 3 184.83 25.319
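As a closing aside (not in the original answers), it is worth confirming that the approaches agree; a minimal sketch on the current dat/DT:
res_dt  <- DT[, mean(value), by=ID]                        # data.table result
res_spl <- unlist(lapply(split(dat$value, dat$ID), mean))  # split/lapply result
all.equal(res_dt$V1, unname(res_spl[as.character(res_dt$ID)]))  # expected: TRUE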
Source: https://stackoverflow.com/questions/12956185/how-can-i-extract-the-rows-from-a-large-data-set-by-common-ids-and-take-the-mean