Complete time-series with sparklyr

前端未结

关注

 1  824

旧巷少年郎 2021-01-21 20:07

I'm trying to find missing minutes in my time-series dataset. I wrote some R code that runs locally on a small sample:

test <- dfv %>% mutate(timestam


      
      
        
          1条回答        

        
                    
            
            
                         
                
              
              
                
                   抹茶落季
                                             
                
                
                (楼主)
            
              
              
                2021-01-21 21:00
              

            
            
                        
Find the min and max values as epoch time:

# Upload a four-row sample to Spark as table "df". The timestamps deliberately
# skip some minutes so that there is something to detect.
df <- tibble(
  id = 1:4,
  timestamp = c(
    "2017-07-01 23:49:00.000",
    "2017-07-01 23:50:00.000",
    # 23:51 through 23:55 are absent
    "2017-07-01 23:56:00.000",
    # 23:57 is absent
    "2017-07-01 23:58:00.000"
  )
) %>%
  copy_to(sc, ., "df", overwrite = TRUE)

# Reduce the Spark table to its smallest and largest timestamp (as epoch
# seconds), pull the one-row result into R and flatten it to a vector:
# element 1 is the minimum, element 2 the maximum.
min_max <- unlist(
  collect(
    summarise(
      df,
      min(unix_timestamp(timestamp)),
      max(unix_timestamp(timestamp))
    )
  )
)


Generate a reference range from min(epoch_time) to max(epoch_time) + interval:

library(glue) 

# Build the reference grid: one row per minute from the first to the last
# observed timestamp.
#
# The epoch bounds are doubles once collected into R. Interpolating them
# directly lets as.character() pick scientific notation for round values
# (e.g. 1500000000 becomes "1.5e+09"), which is not an integer literal for
# RANGE(). Render them as plain digits first.
range_start <- sprintf("%.0f", min_max[[1]])
# RANGE's end bound is exclusive, so add one step (60 s) to keep the last
# observed minute in the grid.
range_end <- sprintf("%.0f", min_max[[2]] + 60)

query <- glue(
  "SELECT id AS timestamp FROM RANGE({range_start}, {range_end}, 60)"
) %>%
  as.character()

# Run the query on the Spark session, expose the result as a dplyr table and
# format the epoch seconds the same way as the strings stored in `df`, so the
# two tables can be joined on `timestamp`.
ref <- spark_session(sc) %>%
  invoke("sql", query) %>%
  sdf_register() %>%
  mutate(timestamp = from_unixtime(timestamp, "yyyy-MM-dd HH:mm:ss.SSS"))


Left outer join the reference range with the data:

# Keep every minute of the reference grid; minutes with no matching row in
# `df` come back with NA in the `id` column.
left_join(ref, df, by = "timestamp")


# Source:   lazy query [?? x 2]
# Database: spark_connection
   timestamp                  id
                      
 1 2017-07-01 23:49:00.000     1
 2 2017-07-01 23:50:00.000     2
 3 2017-07-01 23:51:00.000    NA
 4 2017-07-01 23:52:00.000    NA
 5 2017-07-01 23:53:00.000    NA
 6 2017-07-01 23:54:00.000    NA
 7 2017-07-01 23:55:00.000    NA
 8 2017-07-01 23:56:00.000     3
 9 2017-07-01 23:57:00.000    NA
10 2017-07-01 23:58:00.000     4
# ... with more rows


Note:

If you experience issues related to SPARK-20145, you can replace the SQL query with:

# Alternative to the SQL RANGE() query: call the session's `range` method
# directly.
#
# The end bound is exclusive, so one step (60 s) is added to it; without that
# the last observed minute would be missing from the grid, unlike in the SQL
# variant, which also uses max + 60.
#
# NOTE: the SQL variant selects the generated `id` column as `timestamp`; this
# call does no renaming, so the column presumably still needs renaming to
# `timestamp` before the later from_unixtime()/join steps - TODO confirm.
spark_session(sc) %>%
  invoke(
    "range",
    as.integer(min_max[1]),
    as.integer(min_max[2] + 60),
    60L
  ) %>%
  sdf_register()

    
             
                                                        
            
            
              
                
                0
              
                   
                
               讨论(0)
              
                                                  
              
              
                          
             
       
          
              
                                    
                         
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
                              			
        
        
        
          
            
            
              
              
            
    


                                 
              
            
                          
    

        
         
                验证码
                
                  
                
                
                   看不清?
                
              
                                  
                    
   
                 
             
              提交回复