Grouping every n minutes with dplyr

前端 未结 4 1137
南方客
南方客 2020-12-03 05:11

I have a dataset containing 10 events occurring at a certain time on a given day, with a corresponding value for each event:

d1 <- data.frame(date = as.POSIX         


        
相关标签:
4条回答
  • 2020-12-03 05:50

    If you need to group data into n-minute bins, the floor_date function accepts a multiple of a unit (for example "3minutes") in its unit argument. For example:

    # Minimal demonstration: floor a timestamp to a 3-minute boundary.
    library(lubridate)
    # Parse a sample date-time string.
    x <- ymd_hms("2009-08-03 12:25:59.23")
    # The unit string combines a multiple and a unit; the answer shows the
    # result "2009-08-03 12:24:00 UTC" for this call.
    floor_date(x, unit = "3minutes")
    

    "2009-08-03 12:24:00 UTC"

    Using your example:

    library(lubridate)
    library(tidyverse)
    
    # Build the complete sequence of 3-minute bins for the day (1440 / 3 = 480
    # bins) so that bins without any event still appear in the result.
    d2 <- data.frame(
      timePeriod = seq(as.POSIXct("2010-05-21 00:00:00"),
                       by = "3 min", length.out = 1440 / 3)
    )
    # Floor each event to the start of its 3-minute bin, sum the values per bin,
    # then join onto the full grid. The join key is named explicitly so the join
    # does not depend on whichever columns happen to share a name, and the result
    # is sorted because right_join() is not guaranteed to return the bins in
    # chronological order.
    d1 %>%
      mutate(timePeriod = floor_date(date, "3minutes")) %>%
      group_by(timePeriod) %>%
      summarise(sum = sum(value)) %>%
      right_join(d2, by = "timePeriod") %>%
      arrange(timePeriod)
    
    0 讨论(0)
  • 2020-12-03 06:01

    Recently, the padr package has been developed which can also solve this in a clean way.


    library(lubridate)
    library(dplyr)
    library(padr)
    
    # Sample data: ten events on 21 May 2010, each with a timestamp (parsed from
    # day/month/year strings) and a numeric value.
    d1 <- data.frame(
      date = as.POSIXct(
        c(
          "21/05/2010 19:59:37",
          "21/05/2010 08:40:30",
          "21/05/2010 09:21:00",
          "21/05/2010 22:29:50",
          "21/05/2010 11:27:34",
          "21/05/2010 18:25:14",
          "21/05/2010 15:16:01",
          "21/05/2010 09:41:53",
          "21/05/2010 15:01:29",
          "21/05/2010 09:02:06"
        ),
        format = "%d/%m/%Y %H:%M:%S"
      ),
      value = c(
        11313, 42423, 64645, 643426, 1313313,
        1313, 3535, 6476, 11313, 9875
      )
    )
    
    # Bucket the events into 3-minute intervals, sum the values per bucket, and
    # fill in the empty buckets for the rest of the day.
    res <- d1 %>%
      as_tibble() %>%
      arrange(date) %>%
      # Thicken: add a date_pad column holding the 3-minute bucket of each event
      thicken(
        interval  = "3 min",
        start_val = as.POSIXct("2010-05-21 00:00:00"),
        colname   = "date_pad") %>%
      # Aggregate: thicken() only labels rows, so sum the values explicitly;
      # otherwise two events in the same bucket would stay as separate rows
      group_by(date_pad) %>%
      summarise(value = sum(value)) %>%
      # Pad: insert rows (value = NA) for every 3-minute bucket without events
      pad(
        interval  = "3 min",
        by        = "date_pad",
        start_val = as.POSIXct("2010-05-21 00:00:00"),
        end_val   = as.POSIXct("2010-05-21 23:57:00")) %>%
      select(date_pad, value)
    
    res
    #> # A tibble: 480 x 2
    #>    date_pad            value
    #>    <dttm>              <dbl>
    #>  1 2010-05-21 00:00:00    NA
    #>  2 2010-05-21 00:03:00    NA
    #>  3 2010-05-21 00:06:00    NA
    #>  4 2010-05-21 00:09:00    NA
    #>  5 2010-05-21 00:12:00    NA
    #>  6 2010-05-21 00:15:00    NA
    #>  7 2010-05-21 00:18:00    NA
    #>  8 2010-05-21 00:21:00    NA
    #>  9 2010-05-21 00:24:00    NA
    #> 10 2010-05-21 00:27:00    NA
    #> # ... with 470 more rows
    
    res[450,]
    #> # A tibble: 1 x 2
    #>   date_pad             value
    #>   <dttm>               <dbl>
    #> 1 2010-05-21 22:27:00 643426
    
    0 讨论(0)
  • 2020-12-03 06:02

    lubridate-dplyr-esque solution.

    library(lubridate)
    library(dplyr)
    # Full grid of 3-minute intervals for the day (1440 / 3 = 480 rows). It is
    # built with as.POSIXct() so it uses the same default (local) time zone as a
    # d1$date created via as.POSIXct(); ymd_hms() defaults to UTC, which shifts
    # the grid relative to the events on machines that are not running in UTC.
    d2 <- data.frame(
      interval = seq(as.POSIXct("2010-05-21 00:00:00"), by = "3 min",
                     length.out = 1440 / 3)
    )
    # One row per interval: floor each event to its 3-minute bin and sum the
    # values. summarise() (rather than mutate()) collapses events that share a
    # bin, so the merge below cannot produce duplicated interval rows.
    d3 <- d1 %>%
      mutate(interval = floor_date(date, unit = "3 minutes")) %>%
      group_by(interval) %>%
      summarise(sumvalue = sum(value))
    # Outer merge on the explicit key keeps every grid interval (NA when empty).
    d4 <- merge(d2, d3, by = "interval", all = TRUE) # better if left_join is used
    tail(d4)
    #               interval sumvalue
    #475 2010-05-21 23:42:00       NA
    #476 2010-05-21 23:45:00       NA
    #477 2010-05-21 23:48:00       NA
    #478 2010-05-21 23:51:00       NA
    #479 2010-05-21 23:54:00       NA
    #480 2010-05-21 23:57:00       NA
    d4[450,]
    #               interval sumvalue
    #450 2010-05-21 22:27:00   643426
    

    If you are comfortable working with Date (I am not), you can dispense with lubridate, and replace the final merge with left_join.

    0 讨论(0)
  • 2020-12-03 06:08

    I'm not sure about a dplyr solution, but here's an xts solution:

    # Counterpart to endpoints(): shifts every end index forward by one and
    # drops the final entry of the result.
    #   x, on, k: passed straight through to endpoints().
    startpoints <- function (x, on = "months", k = 1) {
      ends <- endpoints(x, on, k)
      ends[-length(ends)] + 1
    }
    # Regular 3-minute grid for the day (1440 / 3 = 480 timestamps).
    m3 <- seq(
      from = as.POSIXct("2010-05-21 00:00:00"),
      by = "3 min",
      length.out = 1440 / 3
    )
    # Merge the events with a data-less xts indexed on the grid, so every grid
    # timestamp is present in x.
    x <- merge(value = xts(d1$value, d1$date), xts(order.by = m3))
    # Sum the values over the periods delimited by 0 and the 3-minute start
    # points, ignoring the NAs introduced by the merge.
    y <- period.apply(
      x,
      INDEX = c(0, startpoints(x, "minutes", 3)),
      FUN = sum,
      na.rm = TRUE
    )
    

    Update: Here's another xts solution that is a bit more careful about correctly aligning the aggregated values. Not to suggest the prior solution was wrong, but this solution is easier to follow and repeat in other analyses.

    # 3-minute grid whose first point sits one millisecond before midnight, so
    # each grid timestamp falls just before a 3-minute boundary.
    m3 <- seq(
      from = as.POSIXct("2010-05-20 23:59:59.999"),
      by = "3 min",
      length.out = 1440 / 3
    )
    # Merge the events with a data-less xts indexed on the grid.
    x <- merge(value = xts(d1$value, d1$date), xts(order.by = m3))
    # Sum within each 3-minute period, ignoring the NAs introduced by the merge.
    y <- period.apply(x, INDEX = endpoints(x, "minutes", 3), FUN = sum, na.rm = TRUE)
    # Re-stamp the aggregated rows on a 180-second (3-minute) alignment.
    y <- align.time(y, n = 3 * 60)
    
    0 讨论(0)
提交回复
热议问题