Merge data frames based on numeric rownames within a chosen threshold and keeping unmatched rows as well

只谈情不闲聊 提交于 2020-01-15 10:43:36

问题


How can I merge two data frames based on their numeric rownames while setting a threshold for the match?

df1 <- structure(list(c(4974622.505928, 170582.149747, 130545.004516, 
143528.819582, 49416.594892, 51879.515558, 52027.462651, 42491.317116, 
49173.145029, 44040.01261), c(4664319.00309, 266278.599338, 204772.412837, 
204819.210688, 77718.961761, 82742.852809, 79706.774944, 67123.603629, 
67264.401059, 66750.260768), c(5906075.502923, 385318.121061, 
296824.944672, 308432.753482, 113407.50333, 120352.400266, 122622.356104, 
98656.179336, 107669.002489, 100262.855064), c(5401712.020682, 
204595.653994, 163485.509823, 179567.339348, 62690.116298, 63790.0244, 
64660.971879, 52545.84055, 59080.66972, 54579.538267), c(5273676.522307, 
159130.126808, 129607.971309, 142279.787439, 45812.561022, 47230.447746, 
48367.405274, 39578.235275, 45489.065198, 43102.923417)), row.names = c("34.9816256", 
"35.0576674", "35.0898006", "35.1270264", "35.1738664", "35.1936282", 
"35.2043582", "35.2359934", "35.2716016", "35.2993064"), class = "data.frame")

df2 <- structure(list(c(5898584.48405, 302326.226264, 185567.968257, 
205617.778019, 84476.66928, 65505.560486, 68121.465276, 63221.947902, 
55028.866127, 36821.607091), c(3719350.766633, 108177.577417, 
68855.378083, 78201.248427, 17558.118703, 23387.078772, 25374.978916, 
18833.579115, 12761.529092, 11507.348928), c(3587498.99736, 96793.741428, 
59750.485295, 70217.309923, 26233.188472, 20200.080468, 22241.999451, 
20268.485836, 17330.391134, 12503.133961), c(3128479.008712, 
70298.795438, 45668.592667, 56013.453832, 20323.368372, 16795.27218, 
16358.208042, 15722.790712, 12276.726458, 9155.522864), c(3847005.494149, 
138762.296854, 94196.099405, 106888.964213, 36614.870588, 30856.787329, 
33880.704043, 31399.328936, 27819.255931, 18560.05768)), row.names = c("34.9815906", 
"35.0356588", "35.0897702", "35.1269978", "35.1535182", "35.1744048", 
"35.1952968", "35.3032464", "35.3207828", "35.3739834"), class = "data.frame")

Output of the dataframes (first row is rownames)

> df1
34.9816256 4974622.51 4664319.00 5906075.50 5401712.02 5273676.52
35.0576674  170582.15  266278.60  385318.12  204595.65  159130.13
35.0898006  130545.00  204772.41  296824.94  163485.51  129607.97
35.1270264  143528.82  204819.21  308432.75  179567.34  142279.79
35.1738664   49416.59   77718.96  113407.50   62690.12   45812.56
35.1936282   51879.52   82742.85  120352.40   63790.02   47230.45
35.2043582   52027.46   79706.77  122622.36   64660.97   48367.41
35.2359934   42491.32   67123.60   98656.18   52545.84   39578.24
35.2716016   49173.15   67264.40  107669.00   59080.67   45489.07
35.2993064   44040.01   66750.26  100262.86   54579.54   43102.92

> df2
34.9815906 5898584.48 3719350.77 3587499.00 3128479.009 3847005.49
35.0356588  302326.23  108177.58   96793.74   70298.795  138762.30
35.0897702  185567.97   68855.38   59750.49   45668.593   94196.10
35.1269978  205617.78   78201.25   70217.31   56013.454  106888.96
35.1535182   84476.67   17558.12   26233.19   20323.368   36614.87
35.1744048   65505.56   23387.08   20200.08   16795.272   30856.79
35.1952968   68121.47   25374.98   22242.00   16358.208   33880.70
35.3032464   63221.95   18833.58   20268.49   15722.791   31399.33
35.3207828   55028.87   12761.53   17330.39   12276.726   27819.26
35.3739834   36821.61   11507.35   12503.13    9155.523   18560.06

I want to merge these two datasets based on their rownames IF the difference between the two numbers in the rownames is between [-0.02, 0.02]

In other words, each rowname from df1 should be compared to each rowname in df2, and if two rownames are found with their difference falling in the range [-0.02, 0.02], then the data can be merged on the same row. If a match is not found, NA will be added to where there was no matching data from the other df (as in full_join).


回答1:


  1. You need column names and add rownames as a columns

    library(tibble)
    colnames(df1) <- c('a1', 'b1', 'c1', 'd1', 'e1')
    df1 <- rownames_to_column(df1, "rn1")
    
    colnames(df2) <- c('a2', 'b2', 'c2', 'd2', 'e2')
    df2 <- rownames_to_column(df2, "rn2")
    
  2. concatenate the two data frames

    df3 <- cbind(df1, df2)
    
  3. Calculate the difference between rownames

    df3['diff'] <- as.numeric(df3$rn1) - as.numeric(df3$rn2)
    
  4. filter and drop the columns you do not need

    library(tidyverse)
    df4 <- df3 %>%
       filter(diff >= -0.02 & diff <= 0.02) %>%
       select(-c(rn1, rn2, diff))
    
    #          a1         b1        c1         d1         e1         a2         b2         c2         d2         e2
    #1 4974622.51 4664319.00 5906075.5 5401712.02 5273676.52 5898584.48 3719350.77 3587499.00 3128479.01 3847005.49
    #2  130545.00  204772.41  296824.9  163485.51  129607.97  185567.97   68855.38   59750.49   45668.59   94196.10
    #3  143528.82  204819.21  308432.8  179567.34  142279.79  205617.78   78201.25   70217.31   56013.45  106888.96
    #4   51879.52   82742.85  120352.4   63790.02   47230.45   65505.56   23387.08   20200.08   16795.27   30856.79
    #5   52027.46   79706.77  122622.4   64660.97   48367.41   68121.47   25374.98   22242.00   16358.21   33880.70
    



回答2:


You can use foverlaps from data.table package

library(data.table)

#add column names to sample data as it's NULL currently
names(df1) <- paste0("df1_", 1:ncol(df1))
names(df2) <- paste0("df2_", 1:ncol(df2))

#convert rownames as first column
setDT(df1, keep.rownames = TRUE)[]
setnames(df1, 1, "df1_rn")
setDT(df2, keep.rownames = TRUE)[]
setnames(df2, 1, "df2_rn")

#add temporary columns to both data tables
df1[, `:=`(df1_rn = as.numeric(df1_rn), temp = as.numeric(df1_rn))]
df2[, `:=`(df2_rn_minus_2 = as.numeric(df2_rn) - 0.02, df2_rn_plus_2 = as.numeric(df2_rn) + 0.02)]
setkey(df2, df2_rn_minus_2, df2_rn_plus_2)

DT = foverlaps(df1, df2, by.x = c("df1_rn", "temp"))[, !c("df2_rn_minus_2", "df2_rn_plus_2", "temp"), with = F]

which gives

> DT
        df2_rn      df2_1      df2_2      df2_3      df2_4      df2_5   df1_rn      df1_1      df1_2      df1_3
 1: 34.9815906 5898584.48 3719350.77 3587499.00 3128479.01 3847005.49 34.98163 4974622.51 4664319.00 5906075.50
 2:       <NA>         NA         NA         NA         NA         NA 35.05767  170582.15  266278.60  385318.12
 3: 35.0897702  185567.97   68855.38   59750.49   45668.59   94196.10 35.08980  130545.00  204772.41  296824.94
 4: 35.1269978  205617.78   78201.25   70217.31   56013.45  106888.96 35.12703  143528.82  204819.21  308432.75
 5: 35.1744048   65505.56   23387.08   20200.08   16795.27   30856.79 35.17387   49416.59   77718.96  113407.50
 6: 35.1744048   65505.56   23387.08   20200.08   16795.27   30856.79 35.19363   51879.52   82742.85  120352.40
 7: 35.1952968   68121.47   25374.98   22242.00   16358.21   33880.70 35.19363   51879.52   82742.85  120352.40
 8: 35.1952968   68121.47   25374.98   22242.00   16358.21   33880.70 35.20436   52027.46   79706.77  122622.36
 9:       <NA>         NA         NA         NA         NA         NA 35.23599   42491.32   67123.60   98656.18
10:       <NA>         NA         NA         NA         NA         NA 35.27160   49173.15   67264.40  107669.00
11: 35.3032464   63221.95   18833.58   20268.49   15722.79   31399.33 35.29931   44040.01   66750.26  100262.86
         df1_4      df1_5
 1: 5401712.02 5273676.52
 2:  204595.65  159130.13
 3:  163485.51  129607.97
 4:  179567.34  142279.79
 5:   62690.12   45812.56
 6:   63790.02   47230.45
 7:   63790.02   47230.45
 8:   64660.97   48367.41
 9:   52545.84   39578.24
10:   59080.67   45489.07
11:   54579.54   43102.92


来源:https://stackoverflow.com/questions/50988718/merge-data-frames-based-on-numeric-rownames-within-a-chosen-threshold-and-keepin

标签
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!