I would like to merge 2 data frames based on multiple conditions.
DF1 <- data.frame(\"col1\" = rep(c(\"A\",\"B\"), 18),
\"col2\" = rep(c
With recent versions of data.table, non-equi joins and update on join are possible:
library(data.table)
# Convert both data frames to data.tables by reference, then do a non-equi
# update join: for each DF2 row, find the DF1 rows with the same col1/col2
# whose value lies in [min, max] and write DF2's `data` into DF1 in place.
# `i.data` names DF2's column explicitly, so the update still works when DF1
# already has a `data` column (e.g. when this snippet is run a second time).
setDT(DF1)
setDT(DF2)
DF1[DF2, on = c("col1", "col2", "value>=min", "value<=max"),
    data := i.data]
# Show the first rows of the updated table.
head(DF1)
   rn col1 col2 value col4 data
1:  1    A    C    22   NA    1
2:  2    B    D    58   NA   NA
3:  3    A    E    35   NA   NA
4:  4    B    C    86   NA   NA
5:  5    A    D    37   NA    3
6:  6    B    E    16   NA   NA
# Fixed sample data: 36 rows with a row number, two alternating key columns,
# an integer value to be matched against DF2's intervals, and an empty col4.
DF1 <- data.frame(
  rn = 1:36,
  col1 = rep(c("A", "B"), times = 18L),
  col2 = rep(c("C", "D", "E"), times = 12L),
  value = c(
    22L, 58L, 35L, 86L, 37L, 16L, 46L, 23L, 88L, 3L, 33L, 25L,
    19L, 24L, 9L, 76L, 62L, 68L, 97L, 43L, 8L, 84L, 36L, 20L,
    57L, 99L, 42L, 64L, 87L, 1L, 78L, 34L, 41L, 32L, 10L, 72L
  ),
  col4 = rep(NA, 36L),
  # Keep the key columns as character regardless of the R version's default.
  stringsAsFactors = FALSE
)
# Fixed lookup data: six inclusive [min, max] intervals of width 10, all for
# col1 == "A", alternating between col2 "C" (data 1) and col2 "D" (data 3).
DF2 <- data.frame(
  rn = 1:6,
  col1 = rep("A", times = 6L),
  col2 = rep(c("C", "D"), times = 3L),
  data = rep(c(1L, 3L), times = 3L),
  min = 0:5 * 10L,
  max = 1:6 * 10L,
  # Keep the key columns as character regardless of the R version's default.
  stringsAsFactors = FALSE
)
Your data, created with stringsAsFactors = FALSE:
# Example data: 36 rows with alternating key columns, 36 distinct random
# values drawn from 1..100 (different on every run, no seed is set), and an
# all-NA col4. FALSE is spelled out because `F` is an ordinary variable that
# can be reassigned.
DF1 <- data.frame(
  col1 = rep(c("A", "B"), 18),
  col2 = rep(c("C", "D", "E"), 12),
  value = sample(1:100, 36),
  col4 = rep(NA, 36),
  stringsAsFactors = FALSE
)
# Lookup data: six inclusive [min, max] intervals (0-10, 10-20, ..., 50-60),
# all for col1 == "A", alternating between col2 "C" (data 1) and "D" (data 3).
# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned.
DF2 <- data.frame(
  col1 = rep("A", 6),
  col2 = rep(c("C", "D"), 3),
  data = rep(c(1, 3), 3),
  min = seq(0, 59, by = 10),
  max = seq(10, 69, by = 10),
  stringsAsFactors = FALSE
)
Using dplyr: 1) merge the two data frames using left_join, 2) use ifelse to check, for each row, whether value is between min and max, then 3) unselect the min and max columns...
library(dplyr)
# Left-join every DF2 interval onto DF1 by the two key columns, then keep
# `data` only where `value` lies inside the inclusive range [min, max]; every
# other row gets NA. The comparison is vectorised over the columns, so no
# rowwise() pass is needed and the result is not left as a row-wise data frame.
# Note: each DF1 row appears once per DF2 interval that shares its key.
left_join(DF1, DF2, by = c("col1", "col2")) %>%
  mutate(data = ifelse(value >= min & value <= max, data, NA)) %>%
  select(-min, -max)
Not sure if you were expecting to perform some kind of aggregation, but here's the output of the above code
col1 col2 value col4 data
1 A C 23 NA NA
2 A C 23 NA 1
3 A C 23 NA NA
4 B D 59 NA NA
5 A E 57 NA NA
6 B C 8 NA NA
You can do it in two steps:
# Step 1: left-merge so every DF1 row is kept, paired with each DF2 interval
# that shares its (col1, col2) key; rows without a key match get NA in the
# DF2 columns.
final <- merge(DF1, DF2, by = c("col1", "col2"), all.x = TRUE)
# Step 2: keep `data` only where `value` (not `data`) falls inside the
# inclusive range [min, max]. NA is used for non-matches so the column keeps
# its numeric type instead of being coerced to character by a "NULL" string.
final$data <- ifelse(
  final$value >= final$min & final$value <= final$max,
  final$data,
  NA
)
Using my package safejoin which wraps fuzzyjoin functions, you can do :
# devtools::install_github("moodymudskipper/safejoin")
library(safejoin)
# Left join DF1 (X) to DF2 (Y): both key columns must be equal and X's value
# must lie within Y's inclusive [min, max] range. `conflict = ~.x` resolves
# columns present on both sides in favour of the left-hand side.
# No debugonce() call here, so the example runs without entering the debugger.
safe_left_join(DF1, DF2, ~
  X("col1") == Y("col1") &
  X("col2") == Y("col2") &
  X("value") >= Y("min") &
  X("value") <= Y("max"),
  conflict = ~.x) %>%
  head(15)
# col1 col2 value col4 data min max
# 1 A C 90 NA NA NA NA
# 2 B D 20 NA NA NA NA
# 3 A E 8 NA NA NA NA
# 4 B C 99 NA NA NA NA
# 5 A D 42 NA NA NA NA
# 6 B E 37 NA NA NA NA
# 7 A C 47 NA 1 40 50
# 8 B D 61 NA NA NA NA
# 9 A E 55 NA NA NA NA
# 10 B C 11 NA NA NA NA
# 11 A D 81 NA NA NA NA
# 12 B E 48 NA NA NA NA
# 13 A C 77 NA NA NA NA
# 14 B D 58 NA NA NA NA
# 15 A E 3 NA NA NA NA
The conflict argument here tells the function to return only the conflicted columns from the lhs (col1 and col2).
By using all.x = TRUE, all rows of DF1 are kept by the merge; then filter on the range condition as follows:
# Left-merge on the two key columns so DF1 rows without a matching key in DF2
# survive with NA in min/max.
iMed <- merge(DF1, DF2, by = c("col1", "col2"), all.x = TRUE)
# Keep a merged row when it has no interval (NA min or max) or when its value
# lies inside the inclusive [min, max] range.
# Note: a DF1 row whose key matches DF2 but whose value is outside every
# interval fails this test on all of its merged rows and is absent from Res.
Res <- iMed[with(iMed, is.na(min) | is.na(max) | (value <= max & value >= min)), ]