Group by similar results in R

前端 未结 2 582
青春惊慌失措
青春惊慌失措 2021-01-29 05:14

I want to group_by similar results (not unique) and I don\'t know how to do it.

I mean, I have a df with a column called \'name\' that has similar results like: ARPO, AR

相关标签:
2条回答
  • 2021-01-29 05:32

    Here I have an example to input:

    df <- tibble::tribble(
      ~name,       ~number,       ~ind,
      "ARPO",      "405162",      5,
      "ARPO S.L.", "504653",      22,
      "ARPOS",     "900232",      1,
      "ARPO",      "504694",      12,
      "ARPO",      "400304",      42,
      "JJJJ",      "401605",      2,
      "JJJJ",      "900029",      31,
      "BBBBB",     "400090",      25,
      "BBBBB",     "403004",      33,
      "JJJJ",      "900222",      2,
      "BBBBB",     "403967",      11,
      "BBBB",      "400304",      52,
      "JJJJ",      "404308",      200,
      "ARPO",      "403898",      2,
      "ARPO",      "158159",      24,
      "BBBBBBB",   "700805",      2,
      "ARPO S.L.", "900245",      24,
      "JJJJ",      "501486",      2,
      "JJJJ",      "400215",      210,
      "JJJJ",      "504379",      26,
      "HARPO",     "900222",      400,
      "BBBBB",     "109700",      46,
      "ARPO",      "142173",      14,
      "BBBBB",     "400586",      22,
      "ARPO",      "401605",      322
    )
    

    I found a similar solution here: Group together levels with similar names R

    x <- df$name
    
    groups <- list()
    i <- 1
    while(length(x) > 0) {
    
      id <- agrep(x[1], x, ignore.case = TRUE, max.distance = 0.1)
      groups[[i]] <- x[id]
      x <- x[-id]
      i <- i + 1
    
    }
    
    

    So, from that point, you can create a group variable:

    df$group <- ""
    
    for (j in 1:length(groups)){
      df$group <- ifelse(df$name %in% groups[[j]], paste0("group_",j), df$group)
    }
    

    Maybe you can find a simpler solution, but this works!

    0 讨论(0)
  • 2021-01-29 05:46

    The function below uses agrepl to get similar strings, given a threshold thresh. And returns an integer vector of positions where the first in a group was found.

    The test data is the data in NoeliaNC's answer.

    library(dplyr)
    
    similarGroups <- function(x, thresh = 0.8){
      grp <- integer(length(x))
      name <- x
      for(i in seq_along(name)){
        if(!is.na(name[i])){
          sim <- agrepl(x[i], x, ignore.case = TRUE, max.distance = 1 - thresh)
          k <- which(sim & !is.na(name))
          grp[k] <- i
          is.na(name) <- k
        }
      }
      grp
    }
    
    similarGroups(df[['name']])
    # [1] 1 1 1 1 1 6 6 8 8 6 8 8 6 1 1 8 1 6 6 6 1 8 1 8 1
    

    Now apply the function to grouping the dataframe.

    df %>%
      mutate(group = name[similarGroups(name)]) %>%
      count(group)
    ## A tibble: 3 x 2
    #  group     n
    #  <chr> <int>
    #1 ARPO     11
    #2 BBBBB     7
    #3 JJJJ      7
    

    Edit

    Another way is to use the stringsim function in package stringdist. It features several distance/similarity measures, that can be tested to see which one gives better results.

    similarGroups2 <- function(x, thresh = 0.8, method = "soundex"){
      grp <- integer(length(x))
      name <- x
      x <- tolower(x)
      for(i in seq_along(name)){
        if(!is.na(name[i])){
          sim <- stringdist::stringsim(x[i], x, method = method)
          k <- which(sim > thresh & !is.na(name))
          grp[k] <- i
          is.na(name) <- k
        }
      }
      grp
    }
    
    df %>%
       mutate(group = name[similarGroups2(name, thresh = 0.7, method = "jw")]) %>%
       count(group)
    ## A tibble: 4 x 2
    #  group             n
    #  <chr>         <int>
    #1 Antonio Gomez     3
    #2 ARPO             11
    #3 BBBBB             7
    #4 JJJJ              7
    

    New data

    df <- tibble::tribble(
      ~name,       ~number,       ~ind,
      'Antonio Gomez', 1234,       1,
      'Antonio Sanches', 5678,     2,
      'Antonio Ruiz',  9089,       3,
      "ARPO",      "405162",      5,
      "ARPO S.L.", "504653",      22,
      "ARPOS",     "900232",      1,
      "ARPO",      "504694",      12,
      "ARPO",      "400304",      42,
      "JJJJ",      "401605",      2,
      "JJJJ",      "900029",      31,
      "BBBBB",     "400090",      25,
      "BBBBB",     "403004",      33,
      "JJJJ",      "900222",      2,
      "BBBBB",     "403967",      11,
      "BBBB",      "400304",      52,
      "JJJJ",      "404308",      200,
      "ARPO",      "403898",      2,
      "ARPO",      "158159",      24,
      "BBBBBBB",   "700805",      2,
      "ARPO S.L.", "900245",      24,
      "JJJJ",      "501486",      2,
      "JJJJ",      "400215",      210,
      "JJJJ",      "504379",      26,
      "HARPO",     "900222",      400,
      "BBBBB",     "109700",      46,
      "ARPO",      "142173",      14,
      "BBBBB",     "400586",      22,
      "ARPO",      "401605",      322
    )
    
    0 讨论(0)
提交回复
热议问题