Rolling sums for groups with uneven time gaps

前端 未结 6 2500
误落风尘
误落风尘 2021-02-15 23:42

Here\'s the tweak to my previously posted question. Here\'s my data:

set.seed(3737)
DF2 = data.frame(user_id = c(rep(27, 7), rep(11, 7)),
            date = as.D         


        
6条回答
  •  礼貌的吻别
    2021-02-16 00:23

    Try runner package if you want to calculate on time/date windows. Go to github documentation and check Windows depending on date section.

    library(runner)
    DF2 %>%
        group_by(user_id) %>%
        mutate(
          v_minus7 = sum_run(value, 7, idx = date),
          v_minus14 = sum_run(value, 14, idx = date)
        )
    

    Benchmark here

    library(data.table)
    library(dplyr)
    library(zoo)
    library(tbrf)
    set.seed(3737)
    DF2 = data.frame(user_id = c(rep(27, 7), rep(11, 7)),
                     date = as.Date(rep(c('2016-01-01', '2016-01-03', '2016-01-05', '2016-01-07', '2016-01-10', '2016-01-14', '2016-01-16'), 2)),
                     value = round(rnorm(14, 15, 5), 1))
    
    
    
    # example 1
    data_table <- function(DF2) {
      setDT(DF2)[, `:=`(v_minus7 = sum(DF2$value[DF2$user_id == user_id][data.table::between(DF2$date[DF2$user_id == user_id], date-7, date, incbounds = TRUE)]),
                        v_minus14 = sum(DF2$value[DF2$user_id == user_id][data.table::between(DF2$date[DF2$user_id == user_id], date-14, date, incbounds = TRUE)])),
                 by = c("user_id", "date")][]
    }
    
    
    # example 2
    dplyr_grid <- function(DF2) {
      all_combinations <- expand.grid(user_id=unique(DF2$user_id),
                                      date=seq(min(DF2$date), max(DF2$date), by="day"))
    
      DF2 %>%
        merge(all_combinations, by=c('user_id','date'), all=TRUE) %>%
        group_by(user_id) %>%
        arrange(date) %>%
        mutate(v_minus7=rollapply(value, width=8, FUN=function(x) sum(x, na.rm=TRUE), partial=TRUE, align='right'),
               v_minus14=rollapply(value, width=15, FUN=function(x) sum(x, na.rm=TRUE), partial=TRUE, align='right')) %>%
        filter(!is.na(value))
    }
    
    # example 3
    dplyr_tbrf <- function(DF2) {
      DF2 %>%
        group_by(user_id) %>%
        tbrf::tbr_sum(value, date, unit = "days", n = 7) %>%
        arrange(user_id, date) %>%
        rename(v_minus7 = sum) %>%
        tbrf::tbr_sum(value, date, unit = "days", n = 14) %>%
        rename(v_minus14 = sum)
    }
    
    # example 4
    runner <- function(DF2) {
      DF2 %>%
        group_by(user_id) %>%
        mutate(
          v_minus7 = sum_run(value, 7, idx = date),
          v_minus14 = sum_run(value, 14, idx = date)
        )
    }
    
    
    microbenchmark::microbenchmark(
      runner = runner(DF2),
      data.table = data_table(DF2),
      dplyr = dplyr_tbrf(DF2),
      dplyr_tbrf = dplyr_tbrf(DF2),
      times = 100L
    )
    
    # Unit: milliseconds
    #       expr       min        lq      mean    median        uq        max neval
    #     runner  1.478331  1.797512  2.350416  2.083680  2.559875   9.181675   100
    # data.table  5.432618  5.970619  7.107540  6.424862  7.563405  13.674661   100
    #      dplyr 63.841710 73.652023 86.228112 79.861760 92.304231 256.841078   100
    # dplyr_tbrf 60.582381 72.511075 90.175891 80.435700 92.865997 307.454643   100
    

提交回复
热议问题