R function for a function to be repeated based on column values

后端 未结 1 478
我寻月下人不归
我寻月下人不归 2021-01-16 00:35

I have a large data set for which I need to do string matching. I have got some very useful posts from this site and referring them I have created a function to do the strin

相关标签:
1条回答
  • 2021-01-16 01:08

    While you can probably use an apply function to repeat over separate data files of different regions, here is a fuzzyjoin solution based on my answer to your previous question.

    It looks for the best stringdist match between Address1 and Address2, while the area codes must match exactly (==). I also required Year2 to be >= Year1, just for demonstration.

    Finally, I used dplyr::group_by and dplyr::top_n to keep the minimum-distance matches, and I had to assume how to break ties (I picked the matches with the largest Year2). You can also use slice_min, which replaces the older top_n; if the original row order matters rather than alphabetical order, use mutate(rank = row_number(Address1.dist)) %>% filter(rank == 1).

    Data:

    # The example tables are data.tables (data.table() and := are used below),
    # so the package has to be attached before anything else runs.
    library(data.table)

    # Left-hand table: the addresses that need a match.
    Address1 <- c(
      "786, GALI NO 5, XYZ",
      "rambo, 45, strret 4, atlast, pqr",
      "23/4, 23RD FLOOR, STREET 2, ABC-E, PQR",
      "45-B, GALI NO5, XYZ",
      "HECTIC, 99 STREET, PQR"
    )
    AREACODE <- c("10", "10", "14", "20", "30")
    Year1 <- 2001:2005

    # Right-hand table: the candidate addresses (the same five strings twice,
    # each occurrence with its own year and area code).
    Address2 <- c(
      "abc, pqr, xyz",
      "786, GALI NO 4 XYZ",
      "45B, GALI NO 5, XYZ",
      "del, 546, strret2, towards east, pqr",
      "23/4, STREET 2, PQR",
      "abc, pqr, xyz",
      "786, GALI NO 4 XYZ",
      "45B, GALI NO 5, XYZ",
      "del, 546, strret2, towards east, pqr",
      "23/4, STREET 2, PQR"
    )
    Year2 <- 2001:2010
    AREA_CODE <- c("10", "10", "10", "20", "30", "40", "50", "61", "64", "99")

    data1 <- data.table(Address1, Year1, AREACODE)
    data2 <- data.table(Address2, Year2, AREA_CODE)

    # Zero-padded six-digit row identifier; seq_len() stays correct even if
    # data2 has no rows, where 1:nrow(data2) would yield c(1, 0).
    data2[, unique_id := sprintf("%06d", seq_len(nrow(data2)))]
    

    Solution:

    library(fuzzyjoin, quietly = TRUE); library(dplyr, quietly = TRUE)
    
    # First, need to define match_fun_stringdist 
    # Code from stringdist_join from https://github.com/dgrtwo/fuzzyjoin
    # Pairwise string-distance matcher, usable as a match_fun in fuzzy_join().
    #
    # Arguments:
    #   v1, v2        Vectors of strings, compared element by element.
    #   ignore_case   If TRUE, both inputs are lower-cased before comparing.
    #   method        Distance method handed to stringdist::stringdist().
    #   max_dist      Largest distance that still counts as a match.
    #   distance_col  Name of the output column that stores the distance,
    #                 or NULL to leave that column out.
    #
    # Returns a tibble with a logical column `include` (distance <= max_dist)
    # and, unless distance_col is NULL, a column holding the distances.
    #
    # The settings cannot be handed in from fuzzy_join() when several
    # incompatible match_funs are used together, so their defaults are the
    # values that apply to a plain two-argument call. To use other values,
    # wrap the function, e.g.
    #   function(v1, v2) match_fun_stringdist(v1, v2, max_dist = 5)
    match_fun_stringdist <- function(v1, v2,
                                     ignore_case = FALSE,
                                     method = "dl",
                                     max_dist = 99,
                                     distance_col = "dist") {
      if (ignore_case) {
        v1 <- stringr::str_to_lower(v1)
        v2 <- stringr::str_to_lower(v2)
      }

      if (method %in% c("osa", "lv", "dl")) {
        # Shortcut for Levenshtein-like methods: if the difference in string
        # length is greater than the maximum string distance, the edit
        # distance must be at least that large. Length is much faster to
        # compute than string distance.
        length_diff <- abs(stringr::str_length(v1) - stringr::str_length(v2))

        # Exclude pairs with a missing length explicitly: an NA in a logical
        # subscript is not allowed in the assignment below.
        include <- !is.na(length_diff) & length_diff <= max_dist

        # NA_real_ keeps the distance column numeric even when no pair
        # passes the length filter (or the input is empty).
        dists <- rep(NA_real_, length(v1))
        dists[include] <- stringdist::stringdist(
          v1[include], v2[include],
          method = method
        )
      } else {
        # No cheap lower bound for the other methods: compute them all.
        dists <- stringdist::stringdist(v1, v2, method = method)
      }

      # dplyr::tibble() replaces the deprecated dplyr::data_frame().
      ret <- dplyr::tibble(include = (dists <= max_dist))
      if (!is.null(distance_col)) {
        ret[[distance_col]] <- dists
      }
      ret
    }
    
    # Finally, call fuzzy_join
    # Join data1 to data2. The columns in `by` are paired positionally and
    # each pair is compared with the match_fun at the same position:
    #   Address1 / Address2   -> match_fun_stringdist (string distance)
    #   AREACODE / AREA_CODE  -> `==` (exact match)
    #   Year1    / Year2      -> `<=` (Year1 <= Year2)
    fuzzy_join(
      data1, data2,
      by = list(
        x = c("Address1", "AREACODE", "Year1"),
        y = c("Address2", "AREA_CODE", "Year2")
      ),
      match_fun = list(match_fun_stringdist, `==`, `<=`),
      mode = "left"
    ) %>%
      group_by(Address1, Year1, AREACODE) %>%
      # Keep the closest candidate(s) per data1 row, then break ties on the
      # largest Year2. slice_min()/slice_max() replace the superseded top_n(),
      # which filtered out rows whose ranking value is NA -- i.e. exactly the
      # unmatched rows that mode = "left" is meant to keep.
      # NOTE(review): keeping all-NA groups relies on the documented
      # slice_min()/slice_max() NA handling of dplyr >= 1.1.0 -- confirm the
      # installed version.
      slice_min(Address1.dist, n = 1) %>%
      slice_max(Year2, n = 1) %>%
      # Drop the grouping so later steps do not silently run per group.
      ungroup() %>%
      select(unique_id, Address1.dist, everything())
    
    0 讨论(0)
提交回复
热议问题