In one of my applications there is a piece of code that retrieves information from one
data.table object depending on values in another.
# say this tabl
This is a great spot to use the `roll` argument of data.table:
# Key both tables for the join below: dt1 on (id, date), dt on (id, start).
setkeyv(dt1, c("id", "date"))
setkeyv(dt, c("id", "start"))

# Join dt1 onto dt with roll = TRUE, keep only the rows where `end` is not
# before `start`, then reduce to one summary row per id.
# NOTE(review): presumably roll = TRUE pairs each dt1 date with the latest
# dt start on or before it -- confirm against the data.table version in use.
dt[dt1, roll = TRUE][
  end >= start,
  list(start = start[1L], end = end[1L], result = mean(var)),
  by = id
]
# Benchmark: time the original adply() approach against the nested-join
# approach and the rolling-join approach.
microbenchmark(
  OP = adply(dt, 1, myfunc),
  Frank = dt[
    dt1[
      as.list(dt[, seq.Date(start, end, by = "day"), by = "id"])
    ][, mean(var), by = id]
  ],
  eddi = dt[dt1, roll = TRUE][
    end >= start,
    list(start = start[1L], end = end[1L], result = mean(var)),
    by = id
  ]
)
#Unit: milliseconds
# expr min lq median uq max neval
# OP 24.436126 29.184786 30.853094 32.493521 50.898664 100
# Frank 9.115676 11.303691 12.081000 13.122753 28.370415 100
# eddi 5.336315 6.323643 6.771898 7.497285 9.531376 100
The time difference will become much more dramatic as the size of the datasets grows.
I can give you a bunch of nested `[.data.table` calls:
set.seed(1)
# library() stops immediately if data.table is not installed; require() would
# only warn and return FALSE, so the failure would surface later as an
# unrelated "could not find function" error.
library(data.table)
# generate dt, dt1 as above

# Read from the inside out:
#   1. For every id in dt, expand start..end into one row per day.
#   2. as.list() is applied to that result before it is used as the join
#      argument to dt1 (done here to unset the key).
#   3. Average `var` per id over the joined rows.
#   4. Join the per-id means back onto dt.
# NOTE(review): assumes dt1 is keyed on (id, date) and dt on id -- confirm
# against the setup code that generates them.
dt[
  dt1[
    as.list(dt[, seq.Date(start, end, by = "day"), by = "id"])
  ][, mean(var), by = id]
]
# id start end V1
# 1: A 2010-01-01 2010-01-07 0.04475859
# 2: B 2010-02-01 2010-02-09 -0.01681972
# 3: C 2010-03-01 2010-03-11 0.39791318
# 4: D 2010-04-01 2010-04-06 0.77854732
I'm using `as.list` to unset the key. I wonder if there's a better way than this...
# library() fails fast when a package is missing; require() would only warn
# and return FALSE, letting the benchmark die later with a confusing error.
library(microbenchmark)
library(plyr)

# Compare the row-by-row adply() approach with the nested data.table joins.
microbenchmark(
  adply = adply(dt, 1, myfunc),
  dtdtdt = dt[
    dt1[
      as.list(dt[, seq.Date(start, end, by = "day"), by = "id"])
    ][, mean(var), by = id]
  ]
)
# Unit: milliseconds
# expr min lq median uq max neval
# adply 12.987334 13.247374 13.477386 14.371258 18.362505 100
# dtdtdt 4.854708 4.944596 4.993678 5.233507 7.082461 100
EDIT: (eddi) Alternatives to the above that would require one fewer merge (as discussed in the comments) are:
# Remove dt's key before using dt's expansion as the join argument to dt1.
setkey(dt, NULL)

# Variant 1: carry `end` along with the expanded days, join once against dt1,
# then summarise per id.
dt1[
  dt[, list(seq.Date(start, end, by = "day"), end), by = id]
][
  ,
  list(start = date[1L], end = end[1L], result = mean(var)),
  by = id
]

# or
# Variant 2: expand the days only and take start/end from the first and last
# joined date of each id.
dt1[
  dt[, seq.Date(start, end, by = "day"), by = id]
][
  ,
  list(start = date[1L], end = date[.N], result = mean(var)),
  by = id
]