问题
How can I merge two data frames based on their numeric rownames while setting a threshold for the match?
df1 <- structure(list(c(4974622.505928, 170582.149747, 130545.004516,
143528.819582, 49416.594892, 51879.515558, 52027.462651, 42491.317116,
49173.145029, 44040.01261), c(4664319.00309, 266278.599338, 204772.412837,
204819.210688, 77718.961761, 82742.852809, 79706.774944, 67123.603629,
67264.401059, 66750.260768), c(5906075.502923, 385318.121061,
296824.944672, 308432.753482, 113407.50333, 120352.400266, 122622.356104,
98656.179336, 107669.002489, 100262.855064), c(5401712.020682,
204595.653994, 163485.509823, 179567.339348, 62690.116298, 63790.0244,
64660.971879, 52545.84055, 59080.66972, 54579.538267), c(5273676.522307,
159130.126808, 129607.971309, 142279.787439, 45812.561022, 47230.447746,
48367.405274, 39578.235275, 45489.065198, 43102.923417)), row.names = c("34.9816256",
"35.0576674", "35.0898006", "35.1270264", "35.1738664", "35.1936282",
"35.2043582", "35.2359934", "35.2716016", "35.2993064"), class = "data.frame")
df2 <- structure(list(c(5898584.48405, 302326.226264, 185567.968257,
205617.778019, 84476.66928, 65505.560486, 68121.465276, 63221.947902,
55028.866127, 36821.607091), c(3719350.766633, 108177.577417,
68855.378083, 78201.248427, 17558.118703, 23387.078772, 25374.978916,
18833.579115, 12761.529092, 11507.348928), c(3587498.99736, 96793.741428,
59750.485295, 70217.309923, 26233.188472, 20200.080468, 22241.999451,
20268.485836, 17330.391134, 12503.133961), c(3128479.008712,
70298.795438, 45668.592667, 56013.453832, 20323.368372, 16795.27218,
16358.208042, 15722.790712, 12276.726458, 9155.522864), c(3847005.494149,
138762.296854, 94196.099405, 106888.964213, 36614.870588, 30856.787329,
33880.704043, 31399.328936, 27819.255931, 18560.05768)), row.names = c("34.9815906",
"35.0356588", "35.0897702", "35.1269978", "35.1535182", "35.1744048",
"35.1952968", "35.3032464", "35.3207828", "35.3739834"), class = "data.frame")
Output of the dataframes (first row is rownames)
> df1
34.9816256 4974622.51 4664319.00 5906075.50 5401712.02 5273676.52
35.0576674 170582.15 266278.60 385318.12 204595.65 159130.13
35.0898006 130545.00 204772.41 296824.94 163485.51 129607.97
35.1270264 143528.82 204819.21 308432.75 179567.34 142279.79
35.1738664 49416.59 77718.96 113407.50 62690.12 45812.56
35.1936282 51879.52 82742.85 120352.40 63790.02 47230.45
35.2043582 52027.46 79706.77 122622.36 64660.97 48367.41
35.2359934 42491.32 67123.60 98656.18 52545.84 39578.24
35.2716016 49173.15 67264.40 107669.00 59080.67 45489.07
35.2993064 44040.01 66750.26 100262.86 54579.54 43102.92
> df2
34.9815906 5898584.48 3719350.77 3587499.00 3128479.009 3847005.49
35.0356588 302326.23 108177.58 96793.74 70298.795 138762.30
35.0897702 185567.97 68855.38 59750.49 45668.593 94196.10
35.1269978 205617.78 78201.25 70217.31 56013.454 106888.96
35.1535182 84476.67 17558.12 26233.19 20323.368 36614.87
35.1744048 65505.56 23387.08 20200.08 16795.272 30856.79
35.1952968 68121.47 25374.98 22242.00 16358.208 33880.70
35.3032464 63221.95 18833.58 20268.49 15722.791 31399.33
35.3207828 55028.87 12761.53 17330.39 12276.726 27819.26
35.3739834 36821.61 11507.35 12503.13 9155.523 18560.06
I want to merge these two datasets based on their rownames IF the difference between the two numbers in the rownames is between [-0.02, 0.02]
In other words, each rowname from df1 should be compared to each rowname in df2, and if two rownames are found with their difference falling in the range [-0.02, 0.02], then the data can be merged on the same row. If a match is not found, NA will be added to where there was no matching data from the other df (as in full_join).
回答1:
You need column names and add
rownames
as a columnslibrary(tibble) colnames(df1) <- c('a1', 'b1', 'c1', 'd1', 'e1') df1 <- rownames_to_column(df1, "rn1") colnames(df2) <- c('a2', 'b2', 'c2', 'd2', 'e2') df2 <- rownames_to_column(df2, "rn2")
concatenate the two data frames
df3 <- cbind(df1, df2)
Calculate the difference between
rownames
df3['diff'] <- as.numeric(df3$rn1) - as.numeric(df3$rn2)
filter and drop the columns you do not need
library(tidyverse) df4 <- df3 %>% filter(diff >= -0.02 & diff <= 0.02) %>% select(-c(rn1, rn2, diff)) # a1 b1 c1 d1 e1 a2 b2 c2 d2 e2 #1 4974622.51 4664319.00 5906075.5 5401712.02 5273676.52 5898584.48 3719350.77 3587499.00 3128479.01 3847005.49 #2 130545.00 204772.41 296824.9 163485.51 129607.97 185567.97 68855.38 59750.49 45668.59 94196.10 #3 143528.82 204819.21 308432.8 179567.34 142279.79 205617.78 78201.25 70217.31 56013.45 106888.96 #4 51879.52 82742.85 120352.4 63790.02 47230.45 65505.56 23387.08 20200.08 16795.27 30856.79 #5 52027.46 79706.77 122622.4 64660.97 48367.41 68121.47 25374.98 22242.00 16358.21 33880.70
回答2:
You can use foverlaps
from data.table
package
library(data.table)
#add column names to sample data as it's NULL currently
names(df1) <- paste0("df1_", 1:ncol(df1))
names(df2) <- paste0("df2_", 1:ncol(df2))
#convert rownames as first column
setDT(df1, keep.rownames = TRUE)[]
setnames(df1, 1, "df1_rn")
setDT(df2, keep.rownames = TRUE)[]
setnames(df2, 1, "df2_rn")
#add temporary columns to both data tables
df1[, `:=`(df1_rn = as.numeric(df1_rn), temp = as.numeric(df1_rn))]
df2[, `:=`(df2_rn_minus_2 = as.numeric(df2_rn) - 0.02, df2_rn_plus_2 = as.numeric(df2_rn) + 0.02)]
setkey(df2, df2_rn_minus_2, df2_rn_plus_2)
DT = foverlaps(df1, df2, by.x = c("df1_rn", "temp"))[, !c("df2_rn_minus_2", "df2_rn_plus_2", "temp"), with = F]
which gives
> DT
df2_rn df2_1 df2_2 df2_3 df2_4 df2_5 df1_rn df1_1 df1_2 df1_3
1: 34.9815906 5898584.48 3719350.77 3587499.00 3128479.01 3847005.49 34.98163 4974622.51 4664319.00 5906075.50
2: <NA> NA NA NA NA NA 35.05767 170582.15 266278.60 385318.12
3: 35.0897702 185567.97 68855.38 59750.49 45668.59 94196.10 35.08980 130545.00 204772.41 296824.94
4: 35.1269978 205617.78 78201.25 70217.31 56013.45 106888.96 35.12703 143528.82 204819.21 308432.75
5: 35.1744048 65505.56 23387.08 20200.08 16795.27 30856.79 35.17387 49416.59 77718.96 113407.50
6: 35.1744048 65505.56 23387.08 20200.08 16795.27 30856.79 35.19363 51879.52 82742.85 120352.40
7: 35.1952968 68121.47 25374.98 22242.00 16358.21 33880.70 35.19363 51879.52 82742.85 120352.40
8: 35.1952968 68121.47 25374.98 22242.00 16358.21 33880.70 35.20436 52027.46 79706.77 122622.36
9: <NA> NA NA NA NA NA 35.23599 42491.32 67123.60 98656.18
10: <NA> NA NA NA NA NA 35.27160 49173.15 67264.40 107669.00
11: 35.3032464 63221.95 18833.58 20268.49 15722.79 31399.33 35.29931 44040.01 66750.26 100262.86
df1_4 df1_5
1: 5401712.02 5273676.52
2: 204595.65 159130.13
3: 163485.51 129607.97
4: 179567.34 142279.79
5: 62690.12 45812.56
6: 63790.02 47230.45
7: 63790.02 47230.45
8: 64660.97 48367.41
9: 52545.84 39578.24
10: 59080.67 45489.07
11: 54579.54 43102.92
来源:https://stackoverflow.com/questions/50988718/merge-data-frames-based-on-numeric-rownames-within-a-chosen-threshold-and-keepin