Question
I know it is a very silly question, but I could not sort it out, which is why I am asking: how can I extract the rows from a large data set by common IDs, take the means of these rows, and make a column having these IDs as row names? e.g.
IDs Var2
Ae4 2
Ae4 4
Ae4 6
Bc3 3
Bc3 5
Ad2 8
Ad2 7
Output
Var(x)
Ae4 4
Bc3 4
Ad2 7.5
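For reference, here is a minimal base R sketch (not part of the original question) that reproduces the expected output, with the IDs as row names; df and the column heading Var(x) mirror the question:
df <- data.frame(IDs  = c("Ae4", "Ae4", "Ae4", "Bc3", "Bc3", "Ad2", "Ad2"),
                 Var2 = c(2, 4, 6, 3, 5, 8, 7))
means <- tapply(df$Var2, df$IDs, mean)                    # named vector of group means
out <- data.frame(`Var(x)` = means, check.names = FALSE)  # names(means) become row names
out
#     Var(x)
# Ad2    7.5
# Ae4    4.0
# Bc3    4.0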
Answer 1:
This kind of thing can easily be done using the plyr function ddply:
library(plyr)
dat = data.frame(ID = rep(LETTERS[1:5], each = 20), value = runif(100))
head(dat)
ID value
1 A 0.45800889
2 A 0.11221072
3 A 0.58833532
4 A 0.70056704
5 A 0.08337996
6 A 0.05195357
ddply(dat, .(ID), summarize, mn = mean(value))
ID mn
1 A 0.4960083
2 B 0.5809681
3 C 0.4512388
4 D 0.5079790
5 E 0.5397708
If your dataset is big, and/or the number of unique IDs is big, you could use data.table. See this paper for more detail about plyr.
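Applied to the asker's own example data, the same call would look like this (a sketch; df mirrors the question's data):
library(plyr)
df <- data.frame(IDs  = c("Ae4", "Ae4", "Ae4", "Bc3", "Bc3", "Ad2", "Ad2"),
                 Var2 = c(2, 4, 6, 3, 5, 8, 7))
ddply(df, .(IDs), summarize, mn = mean(Var2))
#   IDs  mn
# 1 Ad2 7.5
# 2 Ae4 4.0
# 3 Bc3 4.0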
Answer 2:
If you have a large data.frame, two alternatives to ddply are aggregate (a base R function) and data.table:
set.seed(001)
# the 100-element ID vector is recycled to match the 1e6 values
dat <- data.frame(ID = rep(LETTERS[1:5], each = 20), value = runif(1e6))
library(data.table)
DT <- data.table(dat)
DT[, mean(value), by=list(ID)] # data.table approach
aggregate(.~ID, data=dat, mean) # aggregate (R Base function) approach
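As an aside (not in the original answer), the data.table result column is named V1 by default; returning a list gives it a name (mn here is just an illustrative label):
DT[, list(mn = mean(value)), by=ID]  # named result column instead of V1
DT[, .(mn = mean(value)), by=ID]     # .() is data.table shorthand for list()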
library(rbenchmark) # comparing performance
library(plyr)       # needed for the ddply comparison below
benchmark(DT[, mean(value), by=list(ID)], # data.table approach
aggregate(.~ID, data=dat, mean), # aggregate approach
ddply(dat, .(ID), summarize, mn = mean(value)), # ddply approach (Paul Hiemstra's answer)
columns=c("test", "replications", "elapsed", "relative"),
order='relative',
replications=1)
test replications elapsed relative
1 DT[, mean(value), by = list(ID)] 1 0.14 1.000
3 ddply(dat, .(ID), summarize, mn = mean(value)) 1 0.58 4.143
2 aggregate(. ~ ID, data = dat, mean) 1 3.59 25.643
As you can see, the fastest is the data.table approach.
Edit
There's a base R approach that is even faster than data.table. Let's see:
unlist(lapply(split(dat$value, dat$ID), mean)) # another R Base approach
benchmark(DT[, mean(value), by=list(ID)], # data.table approach
aggregate(.~ID, data=dat, mean), # aggregate approach
ddply(dat, .(ID), summarize, mn = mean(value)), # ddply approach (Paul Hiemstra's answer)
unlist(lapply(split(dat$value, dat$ID), mean)), # lapply, split approach
columns=c("test", "replications", "elapsed", "relative"),
order='relative',
replications=1)
test replications elapsed relative
4 unlist(lapply(split(dat$value, dat$ID), mean)) 1 0.06 1.000
1 DT[, mean(value), by = list(ID)] 1 0.10 1.667
3 ddply(dat, .(ID), summarize, mn = mean(value)) 1 0.56 9.333
2 aggregate(. ~ ID, data = dat, mean) 1 3.28 54.667
Venables and Ripley (2000, p. 37) suggest that combining unlist, lapply and split is faster than just using sapply, and in this particular example it turned out to be even faster than data.table.
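For illustration, the sapply form they compare against would be the following one-liner (a sketch; it was not benchmarked in the original answer):
sapply(split(dat$value, dat$ID), mean)  # same named result, simplified by sapply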
Reference:
Venables, W. N. and Ripley, B. D. (2000). S Programming. Springer, Statistics and Computing. ISBN 0-387-98966-8.
Scaling up (edit from Matthew Dowle)
More groups
dat <- data.frame(ID = as.character(as.hexmode(1:2000)), value = runif(1e6))
DT <- as.data.table(dat)
benchmark(
DT[, mean(value), by=ID],
aggregate(.~ID, data=dat, mean),
ddply(dat, .(ID), summarize, mn = mean(value)),
unlist(lapply(split(dat$value, dat$ID), mean)),
columns=c("test", "replications", "elapsed", "relative"),
order='relative',
replications=3)
test replications elapsed relative
1 DT[, mean(value), by = ID] 3 0.33 1.000
4 unlist(lapply(split(dat$value, dat$ID), mean)) 3 0.41 1.242
2 aggregate(. ~ ID, data = dat, mean) 3 7.69 23.303
3 ddply(dat, .(ID), summarize, mn = mean(value)) 3 17.08 51.758
More rows
dat <- data.frame(ID = as.character(as.hexmode(1:2000)), value = runif(1e7))
DT <- as.data.table(dat)
benchmark(
DT[, mean(value), by=ID],
aggregate(.~ID, data=dat, mean),
ddply(dat, .(ID), summarize, mn = mean(value)),
unlist(lapply(split(dat$value, dat$ID), mean)),
columns=c("test", "replications", "elapsed", "relative"),
order='relative',
replications=3)
test replications elapsed relative
1 DT[, mean(value), by = ID] 3 3.18 1.000
4 unlist(lapply(split(dat$value, dat$ID), mean)) 3 4.26 1.340
2 aggregate(. ~ ID, data = dat, mean) 3 90.28 28.390
3 ddply(dat, .(ID), summarize, mn = mean(value)) 3 268.86 84.547
Setting a key first
system.time(setkey(DT,ID))
user system elapsed
0.71 0.03 0.75
object.size(dat)
152.7 Mb # Quite small; easy for a 32-bit PC with 2 GB RAM.
object.size(DT)
152.7 Mb
benchmark(
DT[, mean(value), by=ID],
aggregate(.~ID, data=dat, mean),
ddply(dat, .(ID), summarize, mn = mean(value)),
unlist(lapply(split(dat$value, dat$ID), mean)),
columns=c("test", "replications", "elapsed", "relative"),
order='relative',
replications=3)
test replications elapsed relative
1 DT[, mean(value), by = ID] 3 0.95 1.000
4 unlist(lapply(split(dat$value, dat$ID), mean)) 3 4.08 4.295
2 aggregate(. ~ ID, data = dat, mean) 3 91.76 96.589
3 ddply(dat, .(ID), summarize, mn = mean(value)) 3 265.15 279.105
Even more rows
dat <- data.frame(ID = rep(1:2000,each=50000), value = runif(1e8))
DT <- as.data.table(dat)
system.time(setkey(DT,ID))
user system elapsed
2.10 0.25 2.34
object.size(dat)
1.1 Gb # Comfortable for a 64-bit PC with 8 GB RAM.
object.size(DT)
1.1 Gb
benchmark(
DT[, mean(value), by=ID],
unlist(lapply(split(dat$value, dat$ID), mean)),
columns=c("test", "replications", "elapsed", "relative"),
order='relative',
replications=3)
test replications elapsed relative
1 DT[, mean(value), by = ID] 3 7.30 1.000
2 unlist(lapply(split(dat$value, dat$ID), mean)) 3 184.83 25.319
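As a closing aside (not in the original answers), it is worth confirming that the approaches agree; a minimal sketch on the current dat/DT:
res_dt  <- DT[, mean(value), by=ID]                        # data.table result
res_spl <- unlist(lapply(split(dat$value, dat$ID), mean))  # split/lapply result
all.equal(res_dt$V1, unname(res_spl[as.character(res_dt$ID)]))  # expected: TRUE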
Source: https://stackoverflow.com/questions/12956185/how-can-i-extract-the-rows-from-a-large-data-set-by-common-ids-and-take-the-mean