问题
I know it is a very silly question, but I could not sort it out, which is why I am asking... How can I extract the rows from a large data set by common IDs, take the means of these rows, and make a column having these IDs as row names? e.g.
IDs Var2
Ae4 2
Ae4 4
Ae4 6
Bc3 3
Bc3 5
Ad2 8
Ad2 7
OutPut
Var(x)
Ae4 4
Bc3 4
Ad2 7.5
回答1:
This kind of thing can easily be done using the plyr function ddply:
dat = data.frame(ID = rep(LETTERS[1:5], each = 20), value = runif(100))
> head(dat)
ID value
1 A 0.45800889
2 A 0.11221072
3 A 0.58833532
4 A 0.70056704
5 A 0.08337996
6 A 0.05195357
ddply(dat, .(ID), summarize, mn = mean(value))
ID mn
1 A 0.4960083
2 B 0.5809681
3 C 0.4512388
4 D 0.5079790
5 E 0.5397708
If your dataset is big, and/or the number of unique IDs is large, you could use data.table. See this paper for more detail about plyr.
回答2:
If you have a large data.frame you could use data.table. Some alternatives to ddply are aggregate and data.table:
set.seed(001)
dat <- data.frame(ID = rep(LETTERS[1:5], each = 20), value = runif(1e6))
library(data.table)
DT <- data.table(dat)
DT[, mean(value), by=list(ID)] # data.table approach
aggregate(.~ID, data=dat, mean) # aggregate (R Base function) approach
library(rbenchmark) # comparing performance
benchmark(DT[, mean(value), by=list(ID)], # data.table approach
aggregate(.~ID, data=dat, mean), # aggregate approach
ddply(dat, .(ID), summarize, mn = mean(value)), # ddply approach (Paul Hiemstra's answer)
columns=c("test", "replications", "elapsed", "relative"),
order='relative',
replications=1)
test replications elapsed relative
1 DT[, mean(value), by = list(ID)] 1 0.14 1.000
3 ddply(dat, .(ID), summarize, mn = mean(value)) 1 0.58 4.143
2 aggregate(. ~ ID, data = dat, mean) 1 3.59 25.643
As you can see, the fastest is the data.table approach.
Edit
There's an R base approach that is even faster than data.table; let's see:
unlist(lapply(split(dat$value, dat$ID), mean)) # another R Base approach
benchmark(DT[, mean(value), by=list(ID)], # data.table approach
aggregate(.~ID, data=dat, mean), # aggregate approach
ddply(dat, .(ID), summarize, mn = mean(value)), # ddply approach (Paul Hiemstra's answer)
unlist(lapply(split(dat$value, dat$ID), mean)), # lapply, split approach
columns=c("test", "replications", "elapsed", "relative"),
order='relative',
replications=1)
test replications elapsed relative
4 unlist(lapply(split(dat$value, dat$ID), mean)) 1 0.06 1.000
1 DT[, mean(value), by = list(ID)] 1 0.10 1.667
3 ddply(dat, .(ID), summarize, mn = mean(value)) 1 0.56 9.333
2 aggregate(. ~ ID, data = dat, mean) 1 3.28 54.667
Venables and Ripley (2000, p. 37) suggest that combining unlist, lapply and split is faster than just using sapply, and in this particular example it turned out to be even faster than data.table.
Reference:
Venables, W. N. and Ripley, B. D. (2000). S Programming. Springer. Statistics and Computing ISBN 0-387-98966-8 (alk. paper)
Scaling up (edit from Matthew Dowle)
More groups
dat <- data.frame(ID = as.character(as.hexmode(1:2000)), value = runif(1e6))
DT <- as.data.table(dat)
benchmark(
DT[, mean(value), by=ID],
aggregate(.~ID, data=dat, mean),
ddply(dat, .(ID), summarize, mn = mean(value)),
unlist(lapply(split(dat$value, dat$ID), mean)),
columns=c("test", "replications", "elapsed", "relative"),
order='relative',
replications=3)
test replications elapsed relative
1 DT[, mean(value), by = ID] 3 0.33 1.000
4 unlist(lapply(split(dat$value, dat$ID), mean)) 3 0.41 1.242
2 aggregate(. ~ ID, data = dat, mean) 3 7.69 23.303
3 ddply(dat, .(ID), summarize, mn = mean(value)) 3 17.08 51.758
More rows
dat <- data.frame(ID = as.character(as.hexmode(1:2000)), value = runif(1e7))
DT <- as.data.table(dat)
benchmark(
DT[, mean(value), by=ID],
aggregate(.~ID, data=dat, mean),
ddply(dat, .(ID), summarize, mn = mean(value)),
unlist(lapply(split(dat$value, dat$ID), mean)),
columns=c("test", "replications", "elapsed", "relative"),
order='relative',
replications=3)
test replications elapsed relative
1 DT[, mean(value), by = ID] 3 3.18 1.000
4 unlist(lapply(split(dat$value, dat$ID), mean)) 3 4.26 1.340
2 aggregate(. ~ ID, data = dat, mean) 3 90.28 28.390
3 ddply(dat, .(ID), summarize, mn = mean(value)) 3 268.86 84.547
Setting a key first
system.time(setkey(DT,ID))
user system elapsed
0.71 0.03 0.75
object.size(dat)
152.7 Mb # Quite small. Easy for a 32bit PC with 2GB RAM.
object.size(DT)
152.7 Mb
benchmark(
DT[, mean(value), by=ID],
aggregate(.~ID, data=dat, mean),
ddply(dat, .(ID), summarize, mn = mean(value)),
unlist(lapply(split(dat$value, dat$ID), mean)),
columns=c("test", "replications", "elapsed", "relative"),
order='relative',
replications=3)
test replications elapsed relative
1 DT[, mean(value), by = ID] 3 0.95 1.000
4 unlist(lapply(split(dat$value, dat$ID), mean)) 3 4.08 4.295
2 aggregate(. ~ ID, data = dat, mean) 3 91.76 96.589
3 ddply(dat, .(ID), summarize, mn = mean(value)) 3 265.15 279.105
Even more rows
dat <- data.frame(ID = rep(1:2000,each=50000), value = runif(1e8))
DT <- as.data.table(dat)
system.time(setkey(DT,ID))
user system elapsed
2.10 0.25 2.34
object.size(dat)
1.1 Gb # Comfortable for a 64bit PC with 8GB RAM
object.size(DT)
1.1 Gb
benchmark(
DT[, mean(value), by=ID],
unlist(lapply(split(dat$value, dat$ID), mean)),
columns=c("test", "replications", "elapsed", "relative"),
order='relative',
replications=3)
test replications elapsed relative
1 DT[, mean(value), by = ID] 3 7.30 1.000
2 unlist(lapply(split(dat$value, dat$ID), mean)) 3 184.83 25.319
来源:https://stackoverflow.com/questions/12956185/how-can-i-extract-the-rows-from-a-large-data-set-by-common-ids-and-take-the-mean