In R (which I am relatively new to), I have a data frame consisting of many columns, including a numeric column that I need to aggregate according to groups determined by another column.
Using the data.table package:
# Attach data.table, build a small example table, and add per-session
# Min/Max price columns to it by reference, then print the result.
library(data.table)
dt <- data.table(
  SessionID = c(1, 1, 1, 7, 7, 7),
  Price = c(624, 697, 649, 779, 710, 2679)
)
dt[, `:=`(Min = min(Price), Max = max(Price)), by = SessionID]
dt
# SessionID Price Min Max
#1: 1 624 624 697
#2: 1 697 624 697
#3: 1 649 624 697
#4: 7 779 710 2679
#5: 7 710 710 2679
#6: 7 2679 710 2679
In your case, if you have a data.frame `df`, just do `dt = as.data.table(df)` and use the code above.
I was curious how the proposed solutions compare, so here is a benchmark on a moderately sized data.frame:
# Benchmark fixture: 1000 session ids repeated 100 times each, paired with
# uniformly drawn prices, held both as a data.frame and as a data.table.
df <- data.frame(
  SessionID = rep(1:1000, each = 100),
  Price = runif(100000, min = 1, max = 2000)
)
dt <- as.data.table(df)
# dplyr variant: group the global `df` by SessionID and attach each group's
# minimum and maximum Price as new columns. The result is still grouped.
algo1 <- function() {
  by_session <- group_by(df, SessionID)
  mutate(by_session, Min = min(Price), Max = max(Price))
}
# data.table variant: add per-SessionID Min/Max columns to the global `dt`.
# `:=` assigns by reference, so `dt` itself is modified.
algo2 <- function() {
  dt[, `:=`(Min = min(Price), Max = max(Price)), by = SessionID]
}
# Base R variant using aggregate(): compute one (Min, Max) pair per
# SessionID, then look up each row's pair and bind it onto the global `df`.
algo3 <- function() {
  price_range <- function(x) c(Min = min(x), Max = max(x))
  per_session <- aggregate(Price ~ SessionID, data = df, FUN = price_range)
  # Position of each row's SessionID within the aggregated table.
  session_row <- match(df$SessionID, per_session$SessionID)
  # Column 2 of the aggregate holds the Min/Max pairs as a matrix.
  cbind(df, per_session[session_row, 2])
}
# Base R variant using ave(): compute the per-SessionID minimum and maximum
# Price for every row of the global `df`, then append them as Min and Max.
algo4 <- function() {
  session_min <- with(df, ave(Price, SessionID, FUN = min))
  session_max <- with(df, ave(Price, SessionID, FUN = max))
  transform(df, Min = session_min, Max = session_max)
}
#> system.time(algo1())
# user system elapsed
# 0.03 0.00 0.19
#> system.time(algo2())
# user system elapsed
# 0.01 0.00 0.01
#> system.time(algo3())
# user system elapsed
# 0.77 0.01 0.78
#> system.time(algo4())
# user system elapsed
# 0.02 0.01 0.03