Find all sequences with the same column value

后端未结

关注

 9  843

I have the following data frame:

╔══════╦═════════╗
║ Code ║ Airline ║
╠══════╬═════════╣
║    1 ║ AF      ║
║    1 ║ KL      ║
║    8 ║ AR      ║
║    8 ║ A


                      
              相关标签:


      
      
        
          9条回答        

        
                         				            
            
           
            
                              
                
              
              
                
                  鱼传尺愫        
                
              
                            
                2020-12-17 18:49
              
            
            
                                                                       
Several options using the data.table package:

1) Using strsplit, paste & operate by row:

library(data.table)

# Convert to a data.table by reference and strip the leading and trailing
# whitespace from the airline codes so that the comparisons below are exact.
setDT(dat)[, Airline := trimws(Airline)]

# Step 1: per Code, store the comma-separated list of every airline in the group.
dat[, sharedwith := paste(Airline, collapse = ","), by = Code]

# Step 2: row by row, split that list again and drop the row's own airline.
# seq_len() is used instead of 1:nrow(dat) so a zero-row table does not produce
# the grouping vector c(1, 0); the list is split once instead of twice.
dat[, sharedwith := {
  members <- unlist(strsplit(sharedwith, ",", fixed = TRUE))
  paste(members[!members %in% Airline], collapse = ",")
}, by = seq_len(nrow(dat))]


which gives:

> dat
   Code Airline sharedwith
1:    1      AF         KL
2:    1      KL         AF
3:    8      AR      AZ,DL
4:    8      AZ      AR,DL
5:    8      DL      AR,AZ


2) Using strsplit & paste with mapply instead of by = 1:nrow(dat):

# Convert in place and trim the whitespace around the airline codes.
setDT(dat)[, Airline := trimws(Airline)]

# Per Code: comma-separated list of all airlines in the group.
dat[, sharedwith := paste(Airline, collapse = ","), by = Code]

# Pair each group list with the row's own airline and remove that airline
# from the list; the trailing [] prints the updated table.
dat[, sharedwith := mapply(
  function(s, a) {
    members <- unlist(strsplit(s, ","))
    paste(members[!members %in% a], collapse = ",")
  },
  sharedwith, Airline
)][]


which will give you the same result.

3) Or by using the CJ function with paste (inspired by the expand.grid solution of @zx8754):

library(data.table)

# Convert in place and trim the whitespace around the airline codes.
setDT(dat)[, Airline := trimws(Airline)]

# Within each Code, cross-join the airlines with themselves, drop the pairs of
# an airline with itself, and collapse the remaining partners per airline.
# Both CJ() columns are named explicitly: recent data.table releases auto-name
# unnamed CJ() arguments after the input symbol, so relying on the default
# name V2 for the second column fails there.
dat[, {
  pairs <- CJ(air = Airline, partner = Airline, unique = TRUE)
  pairs[air != partner, .(shared = paste(partner, collapse = ",")), by = air]
}, by = Code]


which gives:

   Code air shared
1:    1  AF     KL
2:    1  KL     AF
3:    8  AR  AZ,DL
4:    8  AZ  AR,DL
5:    8  DL  AR,AZ




A solution with dplyr & tidyr to get the desired solution (inspired by @jaimedash):

library(dplyr)
library(tidyr)

# Coerce Airline to character and strip leading/trailing whitespace so the
# Airline != SharedWith comparison further down works on clean values.
dat <- dat %>% mutate(Airline = trimws(as.character(Airline)))

# Build, for every (Code, Airline) pair, a comma-separated string of the other
# airlines that have the same Code.
# NOTE(review): nest(-Code, -Airline, .key = ...) and the argument-less unnest()
# look like the pre-1.0 tidyr interface -- confirm they still behave as
# intended with the installed tidyr version.
dat %>%
  # copy Airline into a new column, SharedWith
  mutate(SharedWith = Airline) %>% 
  group_by(Code) %>%
  # nest with Code and Airline excluded; the list-column is keyed SharedWith
  nest(-Code, -Airline, .key = SharedWith) %>%
  # join the original rows (left side) to the nested result on Code
  left_join(dat, ., by = 'Code') %>%
  # expand the nested list-column back into ordinary rows
  unnest() %>%
  # drop the rows that pair an airline with itself
  filter(Airline != SharedWith) %>%
  group_by(Code, Airline) %>%
  # toString() joins the remaining partners with ", "
  summarise(SharedWith = toString(SharedWith))


which gives:

   Code Airline SharedWith
  (int)   (chr)      (chr)
1     1      AF         KL
2     1      KL         AF
3     8      AR     AZ, DL
4     8      AZ     AR, DL
5     8      DL     AR, AZ

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  醉梦人生        
                
              
                            
                2020-12-17 18:52
              
            
            
                                                                       
`split` helps. Here is a fully reproducible solution that works without any additional package. It works with the OP's data.frame (this answer was edited after the OP added a reproducible dataset).

# Strip white space in Airline names:
dat$Airline <- gsub(" ", "", dat$Airline)

# One data frame per Code.
li <- split(dat, factor(dat$Code))

# For every airline in a group (not only the first row of the group), list the
# other airlines that have the same Code, then stack the per-group results.
# Columns are addressed by name rather than by position.
do.call("rbind", lapply(li, function(x) {
  data.frame(
    Airline = x$Airline,
    SharedWith = vapply(
      x$Airline,
      function(a) paste(x$Airline[x$Airline != a], collapse = ","),
      character(1),
      USE.NAMES = FALSE
    )
  )
}))

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  攒了一身酷        
                
              
                            
                2020-12-17 18:55
              
            
            
                                                                       
An igraph approach

library(igraph)

# Build a graph from the data frame; graph_from_data_frame() reads the first
# two columns (here Code and Airline) as the edge list.
g <- graph_from_data_frame(dat)

# Find neighbours for select nodes: with order 2 and mindist = 2, ego() keeps
# the vertices two steps away from each airline, i.e. the other airlines
# attached to the same code.
ne <- setNames(ego(g, 2, nodes = as.character(dat$Airline), mindist = 2), dat$Airline)
ne
#$`AF  `
#+ 1/7 vertex, named:
#[1] KL  

#$`KL  `
#+ 1/7 vertex, named:
#[1] AF  
# ---
# --- (remaining output omitted)

# Get final format: one row per airline with its partners comma-separated.
# vapply() guarantees a character vector whatever the length of `ne`.
data.frame(Airline = names(ne),
           Shared = vapply(ne, function(x)
                                      paste(V(g)$name[x], collapse = ","),
                           character(1)))
#   Airline Shared
# 1      AF     KL
# 2      KL     AF
# 3      AR  AZ,DL
# 4      AZ  AR,DL
# 5      DL  AR,AZ
#   Airline Shared
# 1      AF     KL
# 2      KL     AF
# 3      AR  AZ,DL
# 4      AZ  AR,DL
# 5      DL  AR,AZ

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  梦毁少年i        
                
              
                            
                2020-12-17 18:55
              
            
            
                                                                       
Using expand.grid and aggregate:

# Split the table by Code, derive the "shared with" list inside each group,
# and stack the per-group results into one data frame.
do.call(
  rbind,
  lapply(split(dat, dat$Code), function(grp) {
    # every ordered pair of airlines within this code
    pairs <- expand.grid(grp$Airline, grp$Airline)
    # discard the pairs of an airline with itself
    pairs <- pairs[pairs[["Var1"]] != pairs[["Var2"]], ]
    # one row per airline, its partners pasted into a comma-separated string
    shared <- setNames(
      aggregate(pairs[["Var2"]], list(pairs[["Var1"]]), paste, collapse = ","),
      c("Airline", "SharedWith")
    )
    cbind(Code = grp$Code, shared)
  })
)

# output
#     Code Airline SharedWith
# 1.1    1      AF         KL
# 1.2    1      KL         AF
# 8.1    8      AR      AZ,DL
# 8.2    8      AZ      AR,DL
# 8.3    8      DL      AR,AZ

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  萌比男神i        
                
              
                            
                2020-12-17 19:01
              
            
            
                                                                       
You can try something like this in dplyr

library(dplyr)

# For each airline, list the other airlines that have the same Code.
# The grouping column is `Code` (capitalised, as in the data), the data frame
# is `dat`, and the airline itself is excluded from its own SharedWith list.
dat %>%
  group_by(Code) %>%
  mutate(SharedWith = vapply(
    Airline,
    function(a) paste(sort(Airline[Airline != a]), collapse = ", "),
    character(1),
    USE.NAMES = FALSE
  )) %>%
  ungroup() %>%
  select(Airline, SharedWith)

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  庸人自扰        
                
              
                            
                2020-12-17 19:02
              
            
            
                                                                       
Take the following as a comment that is posted as an answer just because this allows more convenient formatting.

for each code
  lookup all rows in the table where the value = code


ummm... sorry, I don't get how this pseudocode is related to your desired output

+--------------------+
| Airline SharedWith |
+--------------------+
| AF      "KL"       |
| KL      "AF"       |
| AR      "AZ","DL"  |
+--------------------+


The result of this pseudocode should rather be:

+---------------------+
+ Code  +  Airlines   +
+---------------------+
+  1    +  AF, KL     +
+  8    +  AR, AZ, DL +
+---------------------+


That is, 

# Distinct codes, in order of first appearance.
codes <- unique(dat$Code)

# One row per code with all of its airlines pasted into a single string.
# vapply() keeps the Airlines column a character vector even when `codes` is
# empty, where sapply() would return an empty list instead.
data.frame(
  Code = codes,
  Airlines = vapply(
    codes,
    function(x) paste(dat$Airline[dat$Code %in% x], collapse = ","),
    character(1)
  )
)

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
   
          
     1
2
下一页
           
           
        
                                  
        
        
          
            
            
              
              
            
    


                                 
              
            
                          
    

        
         
                验证码
                
                  
                
                
                   看不清?
                
              
                                  
                    
   
                 
             
              提交回复