Rolling sums for groups with uneven time gaps

前端未结

关注

 6  1388

Here\'s the tweak to my previously posted question. Here\'s my data:

set.seed(3737)
DF2 = data.frame(user_id = c(rep(27, 7), rep(11, 7)),
            date = as.D


                      
              相关标签:


      
      
        
          6条回答        

        
                         				            
            
           
            
                              
                
              
              
                
                  青春惊慌失措        
                
              
                            
                2021-02-16 00:27
              
            
            
                                                                       
Here are some approaches using zoo.

1) Define a function sum_last that given a zoo object takes the sum of the values whose times are within k days of the last day in the series and define a roll function which applies it to an entire series.  Then use ave to apply roll to each user_id once for k=7 and once for k=14.  

Note that this makes use of the coredata argument to rollapply that was introduced in the most recent version of zoo so be sure you don't have an earlier version.

library(zoo)

# compute sum of values within k time units of last time point
sum_last <- function(z, k) {
  tt <- time(z)
  sum(z[tt > tail(tt, 1) - k])
}

# given indexes ix run rollapplyr on read.zoo(DF2[ix, -1])
roll <- function(ix, k) {
 rollapplyr(read.zoo(DF2[ix, -1]), k, sum_last, coredata = FALSE, partial = TRUE, k = k)
}

nr <- nrow(DF2)
transform(DF2, 
  v_minus7 = ave(1:nr, user_id, FUN = function(x) roll(x, 7)),
  v_minus14 = ave(1:nr, user_id, FUN = function(x) roll(x, 14)))


2) An alternative would be to replace roll with the version shown below.  This converts DF2[ix, -1] to "zoo" and merges it with a zero width grid with filled-in gaps.  Then rollapply is applied to that and we use window to subset it back to the original times.

roll <- function(ix, k) {
   z <- read.zoo(DF2[ix, -1])
   g <- zoo(, seq(start(z), end(z), "day"))
   m <- merge(z, g, fill = 0)
   r <- rollapplyr(m, k, sum, partial = TRUE)
   window(r, time(z))
}

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  遥遥无期        
                
              
                            
                2021-02-16 00:34
              
            
            
                                                                       
You can use rollapply from zoo once you fill out the missing dates first:

library(dplyr)
library(zoo)

set.seed(3737)
DF2 = data.frame(user_id = c(rep(27, 7), rep(11, 7)),
             date = as.Date(rep(c('2016-01-01', '2016-01-03', '2016-01-05', '2016-01-07', '2016-01-10', '2016-01-14', '2016-01-16'), 2)),
             value = round(rnorm(14, 15, 5), 1))

all_combinations <- expand.grid(user_id=unique(DF2$user_id), 
                            date=seq(min(DF2$date), max(DF2$date), by="day"))

res <- DF2 %>% 
    merge(all_combinations, by=c('user_id','date'), all=TRUE) %>%
    group_by(user_id) %>% 
    arrange(date) %>% 
    mutate(v_minus7=rollapply(value, width=8, FUN=function(x) sum(x, na.rm=TRUE), partial=TRUE, align='right'),
           v_minus14=rollapply(value, width=15, FUN=function(x) sum(x, na.rm=TRUE), partial=TRUE, align='right')) %>%
    filter(!is.na(value))

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  面向向阳花        
                
              
                            
                2021-02-16 00:37
              
            
            
                                                                       
Here is a new option using dplyr and tbrf

library(tbrf)
library(dplyr)
set.seed(3737)
DF2 = data.frame(user_id = c(rep(27, 7), rep(11, 7)),
                 date = as.Date(rep(c('2016-01-01', '2016-01-03', '2016-01-05', '2016-01-07', '2016-01-10', '2016-01-14', '2016-01-16'), 2)),
                 value = round(rnorm(14, 15, 5), 1))

DF2 %>%
  group_by(user_id) %>%
  tbrf::tbr_sum(value, date, unit = "days", n = 7) %>%
  arrange(user_id, date) %>%
  rename(v_minus7 = sum) %>%
  tbrf::tbr_sum(value, date, unit = "days", n = 14) %>%
  rename(v_minus14 = sum)


Creates a tibble:

# A tibble: 14 x 5
   user_id date       value v_minus7 v_minus14
     <dbl> <date>     <dbl>    <dbl>     <dbl>
 1      11 2016-01-01   6.8      6.8      21.8
 2      27 2016-01-01  15       15        21.8
 3      11 2016-01-03  21.3     28.1      65.5
 4      27 2016-01-03  22.4     37.4      65.5
 5      11 2016-01-05  19.8     47.9      98.6
 6      27 2016-01-05  13.3     50.7      98.6
 7      11 2016-01-07  22       69.9     142. 
 8      27 2016-01-07  21.9     72.6     142. 
 9      11 2016-01-10  19.4     82.5     182. 
10      27 2016-01-10  20.6     78.2     182. 
11      11 2016-01-14  17.5     58.9     219. 
12      27 2016-01-14  18.6     61.1     219. 
13      11 2016-01-16  19.3     56.2     232. 
14      27 2016-01-16  16.4     55.6     232. 


I suspect this isn't the fastest solution with larger datasets, but it works well in dplyr chains.
                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  长情又很酷        
                
              
                            
                2021-02-16 00:38
              
            
            
                                                                       
Here is another idea with findInterval to minimize comparisons and operations. First define a function to accomodate the basic part ignoring the grouping. The following function computes the cumulative sum, and subtracts the cumulative sum at each position from the one at its respective past date:

ff = function(date, value, minus)
{
    cs = cumsum(value)  
    i = findInterval(date - minus, date, left.open = TRUE) 
    w = which(as.logical(i))
    i[w] = cs[i[w]]
    cs - i
}


And apply it by group:

do.call(rbind, 
        lapply(split(DF2, DF2$user_id), 
               function(x) data.frame(x, 
                         minus7 = ff(x$date, x$value, 7), 
                         minus14 = ff(x$date, x$value, 14))))
#      user_id       date value minus7 minus14
#11.8       11 2016-01-01   6.8    6.8     6.8
#11.9       11 2016-01-03  21.3   28.1    28.1
#11.10      11 2016-01-05  19.8   47.9    47.9
#11.11      11 2016-01-07  22.0   69.9    69.9
#11.12      11 2016-01-10  19.4   82.5    89.3
#11.13      11 2016-01-14  17.5   58.9   106.8
#11.14      11 2016-01-16  19.3   56.2   119.3
#27.1       27 2016-01-01  15.0   15.0    15.0
#27.2       27 2016-01-03  22.4   37.4    37.4
#27.3       27 2016-01-05  13.3   50.7    50.7
#27.4       27 2016-01-07  21.9   72.6    72.6
#27.5       27 2016-01-10  20.6   78.2    93.2
#27.6       27 2016-01-14  18.6   61.1   111.8
#27.7       27 2016-01-16  16.4   55.6   113.2


The above apply-by-group operation can, of course, be replaced by any method prefereable.
                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  -上瘾入骨i        
                
              
                            
                2021-02-16 00:43
              
            
            
                                                                       
Try runner package if you want to calculate on time/date windows. Go to github documentation and check Windows depending on date section.

library(runner)
DF2 %>%
    group_by(user_id) %>%
    mutate(
      v_minus7 = sum_run(value, 7, idx = date),
      v_minus14 = sum_run(value, 14, idx = date)
    )




Benchmark here

library(data.table)
library(dplyr)
library(zoo)
library(tbrf)
set.seed(3737)
DF2 = data.frame(user_id = c(rep(27, 7), rep(11, 7)),
                 date = as.Date(rep(c('2016-01-01', '2016-01-03', '2016-01-05', '2016-01-07', '2016-01-10', '2016-01-14', '2016-01-16'), 2)),
                 value = round(rnorm(14, 15, 5), 1))



# example 1
data_table <- function(DF2) {
  setDT(DF2)[, `:=`(v_minus7 = sum(DF2$value[DF2$user_id == user_id][data.table::between(DF2$date[DF2$user_id == user_id], date-7, date, incbounds = TRUE)]),
                    v_minus14 = sum(DF2$value[DF2$user_id == user_id][data.table::between(DF2$date[DF2$user_id == user_id], date-14, date, incbounds = TRUE)])),
             by = c("user_id", "date")][]
}


# example 2
dplyr_grid <- function(DF2) {
  all_combinations <- expand.grid(user_id=unique(DF2$user_id),
                                  date=seq(min(DF2$date), max(DF2$date), by="day"))

  DF2 %>%
    merge(all_combinations, by=c('user_id','date'), all=TRUE) %>%
    group_by(user_id) %>%
    arrange(date) %>%
    mutate(v_minus7=rollapply(value, width=8, FUN=function(x) sum(x, na.rm=TRUE), partial=TRUE, align='right'),
           v_minus14=rollapply(value, width=15, FUN=function(x) sum(x, na.rm=TRUE), partial=TRUE, align='right')) %>%
    filter(!is.na(value))
}

# example 3
dplyr_tbrf <- function(DF2) {
  DF2 %>%
    group_by(user_id) %>%
    tbrf::tbr_sum(value, date, unit = "days", n = 7) %>%
    arrange(user_id, date) %>%
    rename(v_minus7 = sum) %>%
    tbrf::tbr_sum(value, date, unit = "days", n = 14) %>%
    rename(v_minus14 = sum)
}

# example 4
runner <- function(DF2) {
  DF2 %>%
    group_by(user_id) %>%
    mutate(
      v_minus7 = sum_run(value, 7, idx = date),
      v_minus14 = sum_run(value, 14, idx = date)
    )
}


microbenchmark::microbenchmark(
  runner = runner(DF2),
  data.table = data_table(DF2),
  dplyr = dplyr_tbrf(DF2),
  dplyr_tbrf = dplyr_tbrf(DF2),
  times = 100L
)

# Unit: milliseconds
#       expr       min        lq      mean    median        uq        max neval
#     runner  1.478331  1.797512  2.350416  2.083680  2.559875   9.181675   100
# data.table  5.432618  5.970619  7.107540  6.424862  7.563405  13.674661   100
#      dplyr 63.841710 73.652023 86.228112 79.861760 92.304231 256.841078   100
# dplyr_tbrf 60.582381 72.511075 90.175891 80.435700 92.865997 307.454643   100

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  隐瞒了意图╮        
                
              
                            
                2021-02-16 00:45
              
            
            
                                                                       
logic : first group by user_id, followed by date. Now for each subset of data, we are checking which all dates lie between the current date and 7/14 days back using between() which returns a logical vector.

Based on this logical vector I add the value column

library(data.table)
setDT(DF2)[, `:=`(v_minus7 = sum(DF2$value[DF2$user_id == user_id][between(DF2$date[DF2$user_id == user_id], date-7, date, incbounds = TRUE)]), 
                 v_minus14 = sum(DF2$value[DF2$user_id == user_id][between(DF2$date[DF2$user_id == user_id], date-14, date, incbounds = TRUE)])),
           by = c("user_id", "date")][]
 #   user_id       date value v_minus7 v_minus14
 #1:      27 2016-01-01  15.0     15.0      15.0
 #2:      27 2016-01-03  22.4     37.4      37.4
 #3:      27 2016-01-05  13.3     50.7      50.7
 #4:      27 2016-01-07  21.9     72.6      72.6
 #5:      27 2016-01-10  20.6     78.2      93.2
 #6:      27 2016-01-14  18.6     61.1     111.8
 #7:      27 2016-01-16  16.4     55.6     113.2
 #8:      11 2016-01-01   6.8      6.8       6.8
 #9:      11 2016-01-03  21.3     28.1      28.1
#10:      11 2016-01-05  19.8     47.9      47.9
#11:      11 2016-01-07  22.0     69.9      69.9
#12:      11 2016-01-10  19.4     82.5      89.3
#13:      11 2016-01-14  17.5     58.9     106.8
#14:      11 2016-01-16  19.3     56.2     119.3




# from alexis_laz answer.
ff = function(date, value, minus){
  cs = cumsum(value)  
  i = findInterval(date - minus, date, rightmost.closed = TRUE) 
  w = which(as.logical(i))
  i[w] = cs[i[w]]
  cs - i
} 
setDT(DF2)
DF2[, `:=`( v_minus7 = ff(date, value, 7), 
            v_minus14 = ff(date, value, 14)), by = c("user_id")]

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
                             
        
        
          
            
            
              
              
            
    


                                 
              
            
                          
    

        
         
                验证码
                
                  
                
                
                   看不清?
                
              
                                  
                    
   
                 
             
              提交回复