Grouping every n minutes with dplyr

前端未结

关注

 4  1139

I have a dataset containing 10 events occuring at a certain time on a given day, with corresponding value for each event:

d1 <- data.frame(date = as.POSIX


                      
              相关标签:


      
      
        
          4条回答        

        
                         				            
            
           
            
                              
                
              
              
                
                  别跟我提以往        
                
              
                            
                2020-12-03 05:50
              
            
            
                                                                       
If you need to group data into n minute bins, the floor_date function can allow multiple units to be specified within the unit argument of the function. For example:

library(lubridate)
x <- ymd_hms("2009-08-03 12:25:59.23")
floor_date(x, unit = "3minutes")



  "2009-08-03 12:24:00 UTC"


Using your example:

library(lubridate)
library(tidyverse)

# make complete time sequence
d2 <- data.frame(timePeriod = seq(as.POSIXct("2010-05-21 00:00:00"), 
                        by="3 min", length.out=(1440/3)))

d1 %>%
  mutate(timePeriod = floor_date(date, "3minutes")) %>%
  group_by(timePeriod) %>%
  summarise(sum = sum(value)) %>%
  right_join(d2)

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  南旧        
                
              
                            
                2020-12-03 06:01
              
            
            
                                                                       
Recently, the padr package has been developed which can also solve this in a clean way.





library(lubridate)
library(dplyr)
library(padr)

d1 <- data.frame(date = as.POSIXct(c("21/05/2010 19:59:37", "21/05/2010 08:40:30", 
                                     "21/05/2010 09:21:00", "21/05/2010 22:29:50", "21/05/2010 11:27:34", 
                                     "21/05/2010 18:25:14", "21/05/2010 15:16:01", "21/05/2010 09:41:53", 
                                     "21/05/2010 15:01:29", "21/05/2010 09:02:06"), format ="%d/%m/%Y %H:%M:%S"),
                 value = c(11313,42423,64645,643426,1313313,1313,3535,6476,11313,9875))

res <- d1 %>% 
  as_tibble() %>%
  arrange(date) %>%

  # Thicken the results to fall in 3 minute buckets
  thicken(
    interval  = '3 min', 
    start_val = as.POSIXct('2010-05-21 00:00:00'),
    colname   = "date_pad") %>% 

  # Pad the results to fill in the rest of the 3 minute buckets
  pad(
    interval  = '3 min', 
    by        = 'date_pad', 
    start_val = as.POSIXct('2010-05-21 00:00:00'),
    end_val   = as.POSIXct('2010-05-21 23:57:00')) %>%

  select(date_pad, value)

res
#> # A tibble: 480 x 2
#>    date_pad            value
#>    <dttm>              <dbl>
#>  1 2010-05-21 00:00:00    NA
#>  2 2010-05-21 00:03:00    NA
#>  3 2010-05-21 00:06:00    NA
#>  4 2010-05-21 00:09:00    NA
#>  5 2010-05-21 00:12:00    NA
#>  6 2010-05-21 00:15:00    NA
#>  7 2010-05-21 00:18:00    NA
#>  8 2010-05-21 00:21:00    NA
#>  9 2010-05-21 00:24:00    NA
#> 10 2010-05-21 00:27:00    NA
#> # ... with 470 more rows

res[450,]
#> # A tibble: 1 x 2
#>   date_pad             value
#>   <dttm>               <dbl>
#> 1 2010-05-21 22:27:00 643426

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  眼角桃花        
                
              
                            
                2020-12-03 06:02
              
            
            
                                                                       
lubridate-dplyr-esque solution. 

library(lubridate)
library(dplyr)
d2 <- data.frame(interval = seq(ymd_hms('2010-05-21 00:00:00'), by = '3 min',length.out=(1440/3)))
d3 <- d1 %>% 
  mutate(interval = floor_date(date, unit="hour")+minutes(floor(minute(date)/3)*3)) %>% 
  group_by(interval) %>% 
  mutate(sumvalue=sum(value))  %>% 
  select(interval,sumvalue) 
d4 <- merge(d2,d3, all=TRUE) # better if left_join is used
tail(d4)
#               interval sumvalue
#475 2010-05-21 23:42:00       NA
#476 2010-05-21 23:45:00       NA
#477 2010-05-21 23:48:00       NA
#478 2010-05-21 23:51:00       NA
#479 2010-05-21 23:54:00       NA
#480 2010-05-21 23:57:00       NA
d4[450,]
#               interval sumvalue
#450 2010-05-21 22:27:00   643426


If you are comfortable working with Date (I am not), you can dispense with lubridate, and replace the final merge with left_join. 
                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  清酒与你        
                
              
                            
                2020-12-03 06:08
              
            
            
                                                                       
I'm not sure about a dplyr solution, but here's an xts solution:

startpoints <- function (x, on = "months", k = 1) {
  head(endpoints(x, on, k) + 1, -1)
}
m3 <- seq(as.POSIXct("2010-05-21 00:00:00"),
  by="3 min", length.out=1440/3)
x <- merge(value=xts(d1$value, d1$date), xts(,m3))
y <- period.apply(x, c(0,startpoints(x, "minutes", 3)), sum, na.rm=TRUE)


Update: Here's another xts solution that is a bit more careful about correctly aligning the aggregated values. Not to suggest the prior solution was wrong, but this solution is easier to follow and repeat in other analysis.

m3 <- seq(as.POSIXct("2010-05-20 23:59:59.999"),
  by="3 min", length.out=1440/3)
x <- merge(value=xts(d1$value, d1$date), xts(,m3))
y <- period.apply(x, endpoints(x, "minutes", 3), sum, na.rm=TRUE)
y <- align.time(y, 60*3)

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
                             
        
        
          
            
            
              
              
            
    


                                 
              
            
                          
    

        
         
                验证码
                
                  
                
                
                   看不清?
                
              
                                  
                    
   
                 
             
              提交回复