Efficient way to perform running total in the last 365 day window

前端 未结 3 1156
梦如初夏
梦如初夏 2021-01-06 10:40

This is what my data frame looks like:

library(data.table)

df <- fread(\'
                Name  EventType  Date  SalesAmount RunningTotal Runningt         


        
相关标签:
3条回答
  • 2021-01-06 11:29

    Using the newer non-equi join feature in data.table:

        # Self-join df on a per-row window: same Name, and Date within
        # [Date - 365, Date]; then total SalesAmount per (Name, window end).
        # NOTE(review): `Date.1` is presumably the column data.table creates for
        # the second `Date` join condition (the window end) -- confirm.
        df1 <- df[
          .(iName = Name, start = Date - 365L, end = Date),
          on = .(Name = iName, Date >= start, Date <= end),
          nomatch = 0, allow.cart = TRUE
        ][
          , .(MyTotal = sum(SalesAmount)), by = .(Name, Date = Date.1)
        ]

        # Attach the windowed totals back onto the original rows.
        df[df1, on = .(Name, Date)]
    
    0 讨论(0)
  • 2021-01-06 11:38

    Here's an approach using the foverlaps function from the data.table package:

    # library() fails loudly if data.table is missing; require() would only
    # return FALSE and let the script break later with a confusing error.
    library(data.table)

    # Build the window bounds per row: `end` is the parsed event date and
    # `start` is 365 days before it.
    setDT(df)[, end := as.Date(EventDate, format = "%d/%m/%Y")
            ][, start := end - 365L]
    setkey(df, Name, start, end)

    # Overlap-join df with itself within each Name; with which = TRUE the
    # result is used below as pairs of row indices (xid, yid) into df.
    olaps <- foverlaps(df, df, nomatch = 0L, which = TRUE)

    # Keep only pairs where the matched row is not after the current row, then
    # total the matched sales per current row.
    # Fix: the table is `df`; the original indexed the undefined `dt`.
    olaps <- olaps[xid >= yid, .(ans = sum(df$SalesAmount[yid])), by = xid]

    df[olaps$xid, Runningtotal := olaps$ans]
    

    You can remove the start and end columns, if necessary, by doing:

    # Remove the helper window columns `start` and `end` from df.
    df[, c("start", "end") := NULL]
    

    It would be nice to know how fast or slow this is.

    0 讨论(0)
  • 2021-01-06 11:47

    Give this a try:

    # Example data: one row per (Name, EventDate) event, with the expected
    # running totals included for comparison. The last header is not a valid R
    # name; it shows up as `Runningtotal.prior365Days.` in the printed result.
    DF <- read.table(text = "Name  EventType  EventDate  SalesAmount RunningTotal Runningtotal(prior365Days)
    John    Email      1/1/2014      0          0            0
    John    Sale       2/1/2014     10          10           10
    John    Sale       7/1/2014     20          30           30
    John    Sale       4/1/2015     30          60           50 
    John    Webinar    5/1/2015      0          60           50
    Tom     Email      1/1/2014      0          0            0
    Tom     Sale       2/1/2014     15          15           15
    Tom     Sale       7/1/2014     10          25           25
    Tom     Sale       4/1/2015     25          50           35 
    Tom     Webinar    5/1/2015      0          50           35", header = TRUE)
    
    
    # Trailing-window sum.
    #
    # For each position j, adds up x[i] over all i <= j whose `date` lies within
    # `thresh` of date[j]. Only positions at or before j are considered, so
    # `date` is expected to be sorted ascending for this to mean "the prior
    # window".
    #
    # x:      numeric values to total.
    # date:   dates (or numbers) aligned with x.
    # thresh: maximum allowed distance between two dates.
    # Returns one total per element of x.
    fun <- function(x, date, thresh) {
      gaps <- as.matrix(dist(date)) # pairwise absolute gaps between dates
      # Cell [i, j] is kept when i <= j (never look ahead) and the gap fits.
      in_window <- gaps <= thresh & upper.tri(gaps, diag = TRUE)
      # Row i of the mask is scaled by x[i]; excluded cells contribute 0.
      colSums(in_window * x)
    }
    
    
    library(data.table)
    # Turn DF into a data.table, parse the m/d/Y date strings, and key the
    # table by customer and event date.
    setDT(DF)
    DF[, EventDate := as.Date(EventDate, format = "%m/%d/%Y")]
    setkeyv(DF, c("Name", "EventDate"))

    # Per customer, compute the 365-day trailing total of SalesAmount.
    DF[, RT365 := fun(SalesAmount, EventDate, 365), by = "Name"]
    
    #    Name EventType  EventDate SalesAmount RunningTotal Runningtotal.prior365Days. RT365
    # 1: John     Email 2014-01-01           0            0                          0     0
    # 2: John      Sale 2014-02-01          10           10                         10    10
    # 3: John      Sale 2014-07-01          20           30                         30    30
    # 4: John      Sale 2015-04-01          30           60                         50    50
    # 5: John   Webinar 2015-05-01           0           60                         50    50
    # 6:  Tom     Email 2014-01-01           0            0                          0     0
    # 7:  Tom      Sale 2014-02-01          15           15                         15    15
    # 8:  Tom      Sale 2014-07-01          10           25                         25    25
    # 9:  Tom      Sale 2015-04-01          25           50                         35    35
    #10:  Tom   Webinar 2015-05-01           0           50                         35    35
    
    0 讨论(0)
提交回复
热议问题