In a dataset with multiple observations for each subject I want to take a subset with only the maximum data value for each record. For example, with a following dataset:
Another data.table
option:
library(data.table)
setDT(group)
group[group[order(-pt), .I[1L], Subject]$V1]
Or another (less readable but slightly faster):
group[group[, rn := .I][order(Subject, -pt), {
rn[c(1L, 1L + which(diff(Subject)>0L))]
}]]
timing code:
library(data.table)
nr <- 1e7L
ng <- nr/4L
set.seed(0L)
DT <- data.table(Subject=sample(ng, nr, TRUE), pt=1:nr)#rnorm(nr))
DT2 <- copy(DT)
microbenchmark::microbenchmark(times=3L,
mtd0 = {a0 <- DT[DT[, .I[which.max(pt)], by=Subject]$V1]},
mtd1 = {a1 <- DT[DT[order(-pt), .I[1L], Subject]$V1]},
mtd2 = {a2 <- DT2[DT2[, rn := .I][
order(Subject, -pt), rn[c(TRUE, diff(Subject)>0L)]
]]},
mtd3 = {a3 <- unique(DT[order(Subject, -pt)], by="Subject")}
)
fsetequal(a0[order(Subject)], a1[order(Subject)])
#[1] TRUE
fsetequal(a0[order(Subject)], a2[, rn := NULL][order(Subject)])
#[1] TRUE
fsetequal(a0[order(Subject)], a3[order(Subject)])
#[1] TRUE
timings:
Unit: seconds
expr min lq mean median uq max neval
mtd0 3.256322 3.335412 3.371439 3.414502 3.428998 3.443493 3
mtd1 1.733162 1.748538 1.786033 1.763915 1.812468 1.861022 3
mtd2 1.136307 1.159606 1.207009 1.182905 1.242359 1.301814 3
mtd3 1.123064 1.166161 1.228058 1.209257 1.280554 1.351851 3