merge data with partial match in r

前端未结

关注

 2  1416

I have two datasets

datf1 <- data.frame (name = c(\"regular\", \"kklmin\", \"notSo\", \"Jijoh\",
 \"Kish\", \"Lissp\", \"Kcn\", \"CCCa\"),
 number1 = c(1


                      
              相关标签:


      
      
        
          2条回答        

        
                         				            
            
           
            
                              
                
              
              
                
                  时光取名叫无心        
                
              
                            
                2020-12-30 14:59
              
            
            
                                                                       
Maybe there is a simple solution but I can't find any.

IMHO you have to implement this kind of merging for your own.

Please find an ugly example below (there is a lot of space for improvements):

uglyMerge <- function(df1, df2) {

    ## lower all strings to allow case-insensitive comparison
    lowerNames1 <- tolower(df1[, 1]);
    lowerNames2 <- tolower(df2[, 1]);

    ## split strings into single characters
    names1 <- strsplit(lowerNames1, "");
    names2 <- strsplit(lowerNames2, "");

    ## create the final dataframe
    mergedDf <- data.frame(name1=as.character(df1[,1]), name2=NA, 
                        number1=df1[,2], number2=NA, matches=0,
                        stringsAsFactors=FALSE);

    ## store names of dataframe2 (to remember which strings have no match)
    toMerge <- df2[, 1];

    for (i in seq(along=names1)) {
        for (j in seq(along=names2)) {
            ## set minimal match to 4 or to string length
            minMatch <- min(4, length(names2[[j]]));

            ## find single matches
            matches <- names1[[i]] %in% names2[[j]];

            ## look for consecutive matches
            r <- rle(matches);

            ## any matches found?
            if (any(r$values)) {
                ## find max consecutive match
                possibleMatch <- r$value == TRUE;
                maxPos <- which(which.max(r$length[possibleMatch]) & possibleMatch)[1];

                ## store max conscutive match length
                maxMatch <- r$length[maxPos];

                ## to remove FALSE-POSITIVES (e.g. CCC and kcn) find 
                ## largest substring
                start <- sum(r$length[0:(maxPos-1)]) + 1;
                stop <- start + r$length[maxPos] - 1;
                maxSubStr <- substr(lowerNames1[i], start, stop);

                ## all matching criteria fulfilled
                isConsecutiveMatch <- maxMatch >= minMatch &&
                                    grepl(pattern=maxSubStr, x=lowerNames2[j], fixed=TRUE) &&
                                    nchar(maxSubStr) > 0;

                if (isConsecutiveMatch) {
                    ## merging
                    mergedDf[i, "matches"] <- maxMatch
                    mergedDf[i, "name2"] <- as.character(df2[j, 1]);
                    mergedDf[i, "number2"] <- df2[j, 2];

                    ## don't append this row to mergedDf because already merged
                    toMerge[j] <- NA;

                    ## stop inner for loop here to avoid possible second match
                    break;
                }
            }
        } 
    }

    ## append not matched rows to mergedDf
    toMerge <- which(df2[, 1] == toMerge);
    df2 <- data.frame(name1=NA, name2=as.character(df2[toMerge, 1]), 
                    number1=NA, number2=df2[toMerge, 2], matches=0, 
                    stringsAsFactors=FALSE);
    mergedDf <- rbind(mergedDf, df2);

    return (mergedDf);
}


Output:

> uglyMerge(datf1, datf2)
    name1  name2 number1 number2 matches
1  xxregular reGulr       1       2       5
2     kklmin   <NA>       8      NA       0
3      notSo   <NA>       9      NA       0
4      Jijoh  Jijoh       2      12       5
5       Kish   <NA>      18      NA       0
6      Lissp  LiSsp      25      20       5
7        Kcn    KcN      33      18       3
8       CCCa   <NA>       8      NA       0
9       <NA>   ntSo      NA       8       0
10      <NA>   sean      NA      13       0
11      <NA>   CaPN      NA      13       0

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  孤城傲影        
                
              
                            
                2020-12-30 15:00
              
            
            
                                                                       
agrep will get you started.  

something like:

lapply(tolower(datf1$name), function(x) agrep(x, tolower(datf2$name)))


then you can adjust the max.distance parameter until you get the appropriate amount of matching.  then merge however you like.
                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
                             
        
        
          
            
            
              
              
            
    


                                 
              
            
                          
    

        
         
                验证码
                
                  
                
                
                   看不清?
                
              
                                  
                    
   
                 
             
              提交回复