Find all sequences with the same column value

后端未结

关注

 9  845

I have the following data frame:

╔══════╦═════════╗
║ Code ║ Airline ║
╠══════╬═════════╣
║    1 ║ AF      ║
║    1 ║ KL      ║
║    8 ║ AR      ║
║    8 ║ A


                      
              相关标签:


      
      
        
          9条回答        

        
                         				            
            
           
            
                              
                
              
              
                
                  挽巷        
                
              
                            
                2020-12-17 19:06
              
            
            
                                                                       
There is likely a more efficient route, but this should fly: 

# Sample input: one row per (code, airline) pairing.
d <- data.frame(
  code = c(1, 1, 8, 8, 8),
  airline = c("AF", "KL", "AR", "AZ", "DL"),
  stringsAsFactors = FALSE
)

# Self-join on code so that every airline is paired with every airline that
# has the same code (including itself). Not necessarily efficient.
d2 <- merge(d, d, by = "code")

# Discard the pairs in which an airline was matched with itself.
d2 <- d2[d2$airline.x != d2$airline.y, ]

# Group the partner airlines (airline.y) by airline.x, collapse each group
# into a single comma-separated string, and let stack() turn the named list
# into a two-column data frame ("values", "ind") that is easy to merge.
d2 <- stack(lapply(
  split(d2$airline.y, d2$airline.x),
  paste0,
  collapse = ","
))

# Attach the collapsed partner list to the original rows; "ind" is the
# column produced by stack() and holds the airline name.
merge(d, d2, by.x = "airline", by.y = "ind")
#  airline code values
#1      AF    1     KL
#2      AR    8  AZ,DL
#3      AZ    8  AR,DL
#4      DL    8  AR,AZ
#5      KL    1     AF

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  醉梦人生        
                
              
                            
                2020-12-17 19:06
              
            
            
                                                                       
You can do this quickly with tidyr's nest and merge (although it is slower unless you first convert Airline from factor to character):

 # tidyr supplies nest(), used below to build the SharedWith list-column.
 library(tidyr)
 # Store Airline as character rather than factor before nesting.
 dat$Airline <- as.character(dat$Airline)
 # Nest every column except Code into a list-column named SharedWith (one
 # entry per Code), then merge that back onto dat by Code so each row carries
 # the full set of airlines sharing its Code (its own airline included).
 # NOTE(review): the `nest(-Code, .key = ...)` calling form and the `%>%` pipe
 # presumably depend on the installed tidyr version -- confirm before reuse.
 new_dat <- merge(dat, dat %>% nest(-Code, .key= SharedWith), by="Code")


and

> new_dat
  Code Airline SharedWith
1    1      AF     AF, KL
2    1      KL     AF, KL
3    8      AR AR, AZ, DL
4    8      AZ AR, AZ, DL
5    8      DL AR, AZ, DL


An advantage of this solution over some of the others: SharedWith becomes a list-column of data frames rather than, say, a single character string.

> str(new_dat$SharedWith)
List of 5
 $ :'data.frame':   2 obs. of  1 variable:
  ..$ Airline: chr [1:2] "AF" "KL"
 $ :'data.frame':   2 obs. of  1 variable:
  ..$ Airline: chr [1:2] "AF" "KL"
 $ :'data.frame':   3 obs. of  1 variable:
  ..$ Airline: chr [1:3] "AR" "AZ" "DL"
 $ :'data.frame':   3 obs. of  1 variable:
  ..$ Airline: chr [1:3] "AR" "AZ" "DL"
 $ :'data.frame':   3 obs. of  1 variable:
  ..$ Airline: chr [1:3] "AR" "AZ" "DL"


so you can then easily (albeit not prettily) index out vectors of the shared values, like:

> new_dat$SharedWith[[1]]$Airline
[1] "AF" "KL"


rather than having to use strsplit or similar
                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  粉色の甜心        
                
              
                            
                2020-12-17 19:15
              
            
            
                                                                       
I think all you would need is a table

# Example input: Code is an integer key, Airline is a factor whose levels are
# the alphabetically sorted airline names.
dat <- data.frame(
  Code = c(1L, 1L, 8L, 8L, 8L),
  Airline = factor(c("AF", "KL", "AR", "AZ", "DL"))
)

# table(dat) is a Code x Airline incidence table; its cross-product is an
# Airline x Airline matrix counting the codes each pair has in common.
tbl <- crossprod(table(dat))
# An airline trivially shares codes with itself, so blank out the diagonal.
diag(tbl) <- 0

#        Airline
# Airline AF AR AZ DL KL
#      AF  0  0  0  0  1
#      AR  0  0  1  1  0
#      AZ  0  1  0  1  0
#      DL  0  1  1  0  0
#      KL  1  0  0  0  0

# For each airline (row of tbl), list the airlines with a positive count as
# one comma-separated string.
dd <- data.frame(
  Airline = colnames(tbl),
  shared = vapply(
    rownames(tbl),
    function(carrier) {
      counts <- tbl[carrier, ]
      paste(names(counts)[counts > 0], collapse = ", ")
    },
    character(1)
  )
)

# Join on the common Airline column to attach the partner list to each row.
merge(dat, dd)
#   Airline Code shared
# 1      AF    1     KL
# 2      AR    8 AZ, DL
# 3      AZ    8 AR, DL
# 4      DL    8 AR, AZ
# 5      KL    1     AF

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
   
          
     上一页
1
2
           
           
        
                                  
        
        
          
            
            
              
              
            
    


                                 
              
            
                          
    

        
         
                验证码
                
                  
                
                
                   看不清?
                
              
                                  
                    
   
                 
             
              提交回复