R - Identify a sequence of row elements by groups in a dataframe

后端未结

关注

 3  1245

执念已碎 2021-01-06 00:20

Consider the following sample dataframe:

> df
   id name time
1   1    b   10
2   1    b   12
3   1    a    0
4   2    a    5
5   2    b   11
6   2    a


      
      
        
          3条回答        

        
                    
            
            
                         
                
              
              
                
                   小蘑菇
                                             
                
                
                (楼主)
            
              
              
                2021-01-06 00:52
              

            
            
                        
library(dplyr); library(tidyr)

# sort data frame by id and time
df %>% arrange(id, time) %>% group_by(id) %>% 

       # get logical vector indicating rows of a followed by b and mark each pair as unique
       # by cumsum
       mutate(ab = name == "a" & lead(name) == "b", g = cumsum(ab)) %>% 

       # subset rows where conditions are met
       filter(ab | lag(ab)) %>% 

       # reshape your data frame to wide format
       select(-ab) %>% spread(name, time)


#Source: local data frame [3 x 4]
#Groups: id [2]

#     id     g     a     b
#*    
#1     1     1     3    10
#2     2     1     5     7
#3     2     2     9    11




If length of the sequence is larger than two, then you will need to check multiple lags, and one option of this is to use shift function(which accepts a vector as lag/lead steps) from data.table combined with Reduce, say if we need to check pattern abb:

library(dplyr); library(tidyr); library(data.table)
pattern = c("a", "b", "b")
len_pattern = length(pattern)

df %>% arrange(id, time) %>% group_by(id) %>% 

       # same logic as before but use Reduce function to check multiple lags condition
       mutate(ab = Reduce("&", Map("==", shift(name, n = 0:(len_pattern - 1), type = "lead"), pattern)), 
              g = cumsum(ab)) %>% 

       # use reduce or to subset sequence rows having the same length as the pattern
       filter(Reduce("|", shift(ab, n = 0:(len_pattern - 1), type = "lag"))) %>% 

       # make unique names
       group_by(g, add = TRUE) %>% mutate(name = paste(name, 1:n(), sep = "_")) %>% 

       # pivoting the table to wide format
       select(-ab) %>% spread(name, time) 

#Source: local data frame [1 x 5]
#Groups: id, g [1]

#     id     g   a_1   b_2   b_3
#*     
#1     1     1     3    10    12

    
             
                                                        
            
            
              
                
                0
              
                   
                
               讨论(0)
              
                                                  
              
              
                          
             
       
          
              
                                       
     查看其它3个回答


            
                         
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
                              			
        
        
        
          
            
            
              
              
            
    


                                 
              
            
                          
    

        
         
                验证码
                
                  
                
                
                   看不清?
                
              
                                  
                    
   
                 
             
              提交回复