merge data with partial match in r

前端 未结 2 1416
天涯浪人
天涯浪人 2020-12-30 14:33

I have two datasets

datf1 <- data.frame (name = c(\"regular\", \"kklmin\", \"notSo\", \"Jijoh\",
 \"Kish\", \"Lissp\", \"Kcn\", \"CCCa\"),
 number1 = c(1         


        
相关标签:
2条回答
  • 2020-12-30 14:59

    Maybe there is a simple solution but I can't find any.
    IMHO you have to implement this kind of merging for your own.
    Please find an ugly example below (there is a lot of space for improvements):

    uglyMerge <- function(df1, df2) {
    
        ## lower all strings to allow case-insensitive comparison
        lowerNames1 <- tolower(df1[, 1]);
        lowerNames2 <- tolower(df2[, 1]);
    
        ## split strings into single characters
        names1 <- strsplit(lowerNames1, "");
        names2 <- strsplit(lowerNames2, "");
    
        ## create the final dataframe
        mergedDf <- data.frame(name1=as.character(df1[,1]), name2=NA, 
                            number1=df1[,2], number2=NA, matches=0,
                            stringsAsFactors=FALSE);
    
        ## store names of dataframe2 (to remember which strings have no match)
        toMerge <- df2[, 1];
    
        for (i in seq(along=names1)) {
            for (j in seq(along=names2)) {
                ## set minimal match to 4 or to string length
                minMatch <- min(4, length(names2[[j]]));
    
                ## find single matches
                matches <- names1[[i]] %in% names2[[j]];
    
                ## look for consecutive matches
                r <- rle(matches);
    
                ## any matches found?
                if (any(r$values)) {
                    ## find max consecutive match
                    possibleMatch <- r$value == TRUE;
                    maxPos <- which(which.max(r$length[possibleMatch]) & possibleMatch)[1];
    
                    ## store max conscutive match length
                    maxMatch <- r$length[maxPos];
    
                    ## to remove FALSE-POSITIVES (e.g. CCC and kcn) find 
                    ## largest substring
                    start <- sum(r$length[0:(maxPos-1)]) + 1;
                    stop <- start + r$length[maxPos] - 1;
                    maxSubStr <- substr(lowerNames1[i], start, stop);
    
                    ## all matching criteria fulfilled
                    isConsecutiveMatch <- maxMatch >= minMatch &&
                                        grepl(pattern=maxSubStr, x=lowerNames2[j], fixed=TRUE) &&
                                        nchar(maxSubStr) > 0;
    
                    if (isConsecutiveMatch) {
                        ## merging
                        mergedDf[i, "matches"] <- maxMatch
                        mergedDf[i, "name2"] <- as.character(df2[j, 1]);
                        mergedDf[i, "number2"] <- df2[j, 2];
    
                        ## don't append this row to mergedDf because already merged
                        toMerge[j] <- NA;
    
                        ## stop inner for loop here to avoid possible second match
                        break;
                    }
                }
            } 
        }
    
        ## append not matched rows to mergedDf
        toMerge <- which(df2[, 1] == toMerge);
        df2 <- data.frame(name1=NA, name2=as.character(df2[toMerge, 1]), 
                        number1=NA, number2=df2[toMerge, 2], matches=0, 
                        stringsAsFactors=FALSE);
        mergedDf <- rbind(mergedDf, df2);
    
        return (mergedDf);
    }
    

    Output:

    > uglyMerge(datf1, datf2)
        name1  name2 number1 number2 matches
    1  xxregular reGulr       1       2       5
    2     kklmin   <NA>       8      NA       0
    3      notSo   <NA>       9      NA       0
    4      Jijoh  Jijoh       2      12       5
    5       Kish   <NA>      18      NA       0
    6      Lissp  LiSsp      25      20       5
    7        Kcn    KcN      33      18       3
    8       CCCa   <NA>       8      NA       0
    9       <NA>   ntSo      NA       8       0
    10      <NA>   sean      NA      13       0
    11      <NA>   CaPN      NA      13       0
    
    0 讨论(0)
  • 2020-12-30 15:00

    agrep will get you started.

    something like:

    lapply(tolower(datf1$name), function(x) agrep(x, tolower(datf2$name)))
    

    then you can adjust the max.distance parameter until you get the appropriate amount of matching. then merge however you like.

    0 讨论(0)
提交回复
热议问题