Find all sequences with the same column value

后端 未结 9 806
误落风尘
误落风尘 2020-12-17 18:35

I have the following data frame:

╔══════╦═════════╗
║ Code ║ Airline ║
╠══════╬═════════╣
║    1 ║ AF      ║
║    1 ║ KL      ║
║    8 ║ AR      ║
║    8 ║ A         


        
相关标签:
9条回答
  • 2020-12-17 18:49

    Several options using the data.table package:

    1) Using strsplit, paste & operate by row:

    library(data.table)

    # Adds a `sharedwith` column to dat: for every row, the other airlines that
    # use the same Code, comma-separated. dat is converted and modified by
    # reference.
    setDT(dat)

    # Padded names such as "AF  " would never compare equal to "AF", so strip
    # the leading and trailing whitespace first.
    dat[, Airline := trimws(Airline)]

    # Step 1: every row receives the full airline list of its Code.
    dat[, sharedwith := paste(Airline, collapse = ","), by = Code]

    # Step 2: row by row, split that list once and drop the row's own airline.
    # seq_len() rather than 1:nrow(dat): for an empty table 1:0 is c(1, 0),
    # which is not a valid per-row grouping vector.
    dat[, sharedwith := {
      members <- unlist(strsplit(sharedwith, ",", fixed = TRUE))
      paste(members[!members %in% Airline], collapse = ",")
    }, by = seq_len(nrow(dat))]
    

    which gives:

    > dat
       Code Airline sharedwith
    1:    1      AF         KL
    2:    1      KL         AF
    3:    8      AR      AZ,DL
    4:    8      AZ      AR,DL
    5:    8      DL      AR,AZ
    

    2) Using strsplit & paste with mapply instead of by = 1:nrow(dat):

    # Same result as the row-wise variant, but the per-row work is done by
    # mapply() over the (code-wide list, own airline) pairs.
    setDT(dat)
    dat[, Airline := trimws(Airline)]
    dat[, sharedwith := paste(Airline, collapse = ','), by = Code]
    dat[, sharedwith := mapply(
      function(s, a) {
        # split the code-wide list and keep everything except the own airline
        pieces <- unlist(strsplit(s, ','))
        paste(pieces[!pieces %in% a], collapse = ',')
      },
      sharedwith, Airline
    )][]
    

    which will give you the same result.

    3) Or by using the CJ function with paste (inspired by the expand.grid solution of @zx8754):

    library(data.table)

    # Per Code: cross every airline with every airline, drop the self-pairs and
    # collapse what is left into one comma-separated string per airline.
    setDT(dat)[, Airline := trimws(Airline)]

    # Both CJ() columns are named explicitly. The name given to an unnamed
    # argument (`V2` vs. the variable's own name) depends on the data.table
    # version, so referring to `V2` is not portable.
    dat[, CJ(air = Airline, other = Airline, unique = TRUE)[
            air != other,
            .(shared = paste(other, collapse = ",")),
            by = air
          ],
        by = Code]
    

    which gives:

       Code air shared
    1:    1  AF     KL
    2:    1  KL     AF
    3:    8  AR  AZ,DL
    4:    8  AZ  AR,DL
    5:    8  DL  AR,AZ
    

    A solution with dplyr & tidyr to get the desired result (inspired by @jaimedash):

    library(dplyr)
    library(tidyr)

    # Normalise the airline names: plain character, no surrounding whitespace.
    dat <- dat %>% mutate(Airline = trimws(as.character(Airline)))

    dat %>%
      # one row per Code, holding the list of all airlines that use it
      group_by(Code) %>%
      summarise(SharedWith = list(Airline)) %>%
      # attach that list to every original row (many rows to one list) ...
      left_join(dat, ., by = "Code") %>%
      # ... and expand it to one row per (airline, candidate partner) pair;
      # `cols` is spelled out because a bare unnest() is deprecated in tidyr >= 1.0
      unnest(cols = SharedWith) %>%
      # an airline does not share a code with itself
      filter(Airline != SharedWith) %>%
      group_by(Code, Airline) %>%
      summarise(SharedWith = toString(SharedWith))
    

    which gives:

       Code Airline SharedWith
      (int)   (chr)      (chr)
    1     1      AF         KL
    2     1      KL         AF
    3     8      AR     AZ, DL
    4     8      AZ     AR, DL
    5     8      DL     AR, AZ
    
    0 讨论(0)
  • 2020-12-17 18:52

    split helps. Here is a fully reproducible solution that works without any additional packages. It works with the OP's data.frame; I changed it after the OP added a reproducible dataset.

    # strip white space in Airline names:
    dat$Airline <- gsub(" ", "", dat$Airline)

    # one data.frame per Code
    li <- split(dat, factor(dat$Code))

    # For each Code emit one row per airline (not just the first one), pairing
    # it with every other airline of that Code.
    do.call("rbind", lapply(li, function(x) {
      data.frame(
        Airline = x$Airline,
        SharedWith = vapply(
          seq_along(x$Airline),
          # drop the airline at position i, keep the rest of the group
          function(i) paste(x$Airline[-i], collapse = ","),
          character(1)
        )
      )
    }))
    
    0 讨论(0)
  • 2020-12-17 18:55

    An igraph approach

    library(igraph)

    # Build a graph from dat; igraph reads its first two columns as an edge list.
    g <- graph_from_data_frame(dat)

    # For every airline vertex, collect the vertices exactly two steps away
    # (order = 2 together with mindist = 2), labelled by airline.
    ne <- setNames(
      ego(g, order = 2, nodes = as.character(dat$Airline), mindist = 2),
      dat$Airline
    )
    ne
    #$`AF  `
    #+ 1/7 vertex, named:
    #[1] KL  
    
    #$`KL  `
    #+ 1/7 vertex, named:
    #[1] AF  
    # ...
    # (output for the remaining airlines omitted)
    
    # Get final format: one row per airline, its neighbours comma-separated.
    # vapply() guarantees a character vector whatever the length of `ne`,
    # whereas sapply() changes its return type on empty input.
    data.frame(Airline = names(ne),
               Shared = vapply(ne,
                               function(x) paste(V(g)$name[x], collapse = ","),
                               character(1)))
    #   Airline Shared
    # 1      AF     KL
    # 2      KL     AF
    # 3      AR  AZ,DL
    # 4      AZ  AR,DL
    # 5      DL  AR,AZ
    
    0 讨论(0)
  • 2020-12-17 18:55

    Using expand.grid and aggregate:

    # For each Code: build all ordered airline pairs, drop the self-pairs and
    # collapse the partners of every airline into one comma-separated string.
    do.call(rbind,
            lapply(split(dat, dat$Code), function(i) {
              x <- expand.grid(i$Airline, i$Airline)
              x <- x[x$Var1 != x$Var2, ]
              # A Code used by a single airline has no pairs left; aggregate()
              # would stop with "no rows to aggregate", so skip the group
              # (rbind ignores NULL).
              if (nrow(x) == 0L) {
                return(NULL)
              }
              x <- aggregate(x$Var2, list(x$Var1), paste, collapse = ",")
              colnames(x) <- c("Airline", "SharedWith")
              # Code is constant within the group; a scalar recycles safely even
              # if aggregate() returned fewer rows than the group had.
              cbind(Code = i$Code[1], x)
            }))
    
    # output
    #     Code Airline SharedWith
    # 1.1    1      AF         KL
    # 1.2    1      KL         AF
    # 8.1    8      AR      AZ,DL
    # 8.2    8      AZ      AR,DL
    # 8.3    8      DL      AR,AZ
    
    0 讨论(0)
  • 2020-12-17 19:01

    You can try something like this in dplyr

    library(dplyr)

    # For every row, list the other airlines that use the same Code.
    df %>%
      # the column is called `Code`; column names are case-sensitive
      group_by(Code) %>%
      mutate(SharedWith = vapply(
        as.character(Airline),
        # all airlines of the group except the row's own one, sorted
        function(own) {
          paste(sort(as.character(Airline)[Airline != own]), collapse = ", ")
        },
        character(1),
        USE.NAMES = FALSE
      )) %>%
      ungroup() %>%
      select(Airline, SharedWith)
    
    0 讨论(0)
  • 2020-12-17 19:02

    Take the following as a comment that is posted as an answer just because this allows more convenient formatting.

    for each code
      lookup all rows in the table where the value = code
    

    Ummm... sorry, I don't see how this pseudocode is related to your desired output

    +--------------------+
    | Airline SharedWith |
    +--------------------+
    | AF      "KL"       |
    | KL      "AF"       |
    | AR      "AZ","DL"  |
    +--------------------+
    

    The result of this pseudocode should rather be:

    +---------------------+
    + Code  +  Airlines   +
    +---------------------+
    +  1    +  AF, KL     +
    +  8    +  AR, AZ, DL +
    +---------------------+
    

    That is,

    # One row per distinct Code with all of its airlines, comma-separated.
    codes <- unique(dat$Code)
    data.frame(
      Code = codes,
      # vapply() always yields a character vector, so the column survives even
      # when there are no codes; sapply() would return an empty list instead.
      Airlines = vapply(
        codes,
        function(x) paste(dat$Airline[dat$Code %in% x], collapse = ","),
        character(1)
      )
    )
    
    0 讨论(0)
提交回复
热议问题