I want to join two tibbles by a range or a virtual column. but it seems the by
- parameter just allow to handle chr
oder vector(chr)
o
We can use mutate
and case_when
from dplyr
.
library(dplyr)
d2 <- d %>%
mutate(class = case_when(
value >= 1 & value < 2 ~ "A",
value >= 2 & value < 3 ~ "B",
value >= 3 & value < 4 ~ "C",
value >= 4 & value < 5 ~ "D",
value >= 5 & value < 6 ~ "E",
value >= 6 ~ "F"
))
d2
# A tibble: 26 x 2
value class
<dbl> <chr>
1 1.0 A
2 1.2 A
3 1.4 A
4 1.6 A
5 1.8 A
6 2.0 B
7 2.2 B
8 2.4 B
9 2.6 B
10 2.8 B
# ... with 16 more rows
Update
Here is a workaround by defining a function for this task.
d <- tibble(value = seq(1,6, by = 0.2))
r <- tibble(from = seq(1,6), to = c(seq(2,6),Inf), class = LETTERS[seq(1,6)])
library(dplyr)
# Define a function for dynamic join
dynamic_join <- function(d, r){
if (!("class" %in% colnames(d))){
d[["class"]] <- NA_character_
}
d <- d %>%
mutate(class = ifelse(value >= r$from & value < r$to, r$class, class))
return(d)
}
re_dynamic_join <- function(d, r){
r_list <- split(r, r$class)
for (i in 1:length(r_list)){
d <- dynamic_join(d, r_list[[i]])
}
return(d)
}
# Apply the function
d2 <- d %>% re_dynamic_join(r)
d2
# A tibble: 26 x 2
value class
<dbl> <chr>
1 1.0 A
2 1.2 A
3 1.4 A
4 1.6 A
5 1.8 A
6 2.0 B
7 2.2 B
8 2.4 B
9 2.6 B
10 2.8 B
# ... with 16 more rows
I really liked @WiWeber's range_join function, but it gives an error if a record is not within range. Here's a modification
library(dplyr)
d <- tibble(value = c(seq(1,4, by = 0.2),9))
r <- tibble(from = seq(1,5), to = c(seq(2,5),8), class = LETTERS[seq(1,5)])
range_join <- function(x, y, value, left, right){
all_matches <- tibble()
x = as.data.frame(x)
y = as.data.frame(y)
x$index=x[,value]
for (i in 1:nrow(y)){
matches = x %>% filter(index>=y[i,left] & index<= y[i,right])
if (nrow(matches)>0){
all_matches = all_matches %>% bind_rows(matches %>% cbind(y[i,]))
}
}
all_matches = all_matches %>% select(-index)
return(all_matches)
}
data <- d %>%
range_join(r, "value", "from", "to")
data
I don't think inequality joins is implemented in dplyr
yet, or it ever will (see this discussion on Join on inequality constraints), but this is a good situation to use an SQL join:
library(tibble)
library(sqldf)
as.tibble(sqldf("select d.value, r.class from d
join r on d.value >= r.'from' and
d.value < r.'to'"))
Alternatively, if you want to integrate the join into your dplyr
chain, you can use fuzzyjoin::fuzzy_join
:
library(dplyr)
library(fuzzyjoin)
d %>%
fuzzy_join(r, by = c("value" = "from", "value" = "to"),
match_fun = list(`>=`, `<`)) %>%
select(value, class)
Result:
# A tibble: 31 x 2
value class
<dbl> <chr>
1 1.0 A
2 1.2 A
3 1.4 A
4 1.6 A
5 1.8 A
6 2.0 A
7 2.0 B
8 2.2 B
9 2.4 B
10 2.6 B
# ... with 21 more rows
Notice I added single quotes around from
and to
since those are reserved words for the SQL language.
We can use sapply
for this
library(tibble)
d <- tibble(value = seq(1,6, by = 0.2))
r <- tibble(from = seq(1,6), to = c(seq(2,6),Inf), class = LETTERS[seq(1,6)])
d <- cbind(d, data.frame(class = (unlist(sapply(d$value, function (x) r[which(x >= r$from & x < r$to), "class"]))) ) )
d
value class
1 1.0 A
2 1.2 A
3 1.4 A
4 1.6 A
5 1.8 A
6 2.0 B
7 2.2 B
8 2.4 B
9 2.6 B
10 2.8 B
11 3.0 C
12 3.2 C
13 3.4 C
14 3.6 C
15 3.8 C
16 4.0 D
17 4.2 D
18 4.4 D
19 4.6 D
20 4.8 D
21 5.0 E
22 5.2 E
23 5.4 E
24 5.6 E
25 5.8 E
26 6.0 F
You use the cut function to create a "class" in object d and then use a left join.
d <- tibble(value = seq(1,6, by = 0.2))
r <- tibble(from = seq(1,6), to = c(seq(2,6),Inf), class = LETTERS[seq(1,6)])
d[["class"]] <- cut(d[["value"]], c(0,2,3,4,5,6,Inf), c('A',"B", "C", "D", "E", "F"), right = FALSE)
d <- left_join(d, r)
To get the right buckets, you just need to work with the cut function to get what you want.
Ok thanks for advices, this was pretty interesting. I finally wrote a function range_join
(inspired by @ycw's code) and compared all described solution in view of runtime.
I like fuzzy_join but with only 50k rows in d
it needs more than 40sec. Thats too slow.
Here the result with 5k rows in d
library(dplyr)
library(fuzzyjoin)
library(sqldf)
#join by range by @WiWeber
range_join <- function(x, y, value, left, right){
x_result <- tibble()
for (y_ in split(y, 1:nrow(y)))
x_result <- x_result %>% bind_rows(x[x[[value]] >= y_[[left]] & x[[value]] < y_[[right]],] %>% cbind(y_))
return(x_result)
}
#dynamic join by @ycw
dynamic_join <- function(d, r){
d$type <- NA_character_
for (r_ in split(r, r$type))
d <- d %>% mutate(type = ifelse(value >= r_$from & value < r_$to, r_$type, type))
return(d)
}
d <- tibble(value = seq(1,6, by = 0.001), join = TRUE)
r <- tibble(from = seq(1,6), to = c(seq(2,6),Inf), type = LETTERS[seq(1,6)], join = TRUE)
# @useR sqldf - fast and intuitive but extra library with horrible code
start <- Sys.time()
d2 <- tbl_df(sqldf("select d.value, r.type from d
join r on d.value >= r.'from' and
d.value < r.'to'"))
Sys.time() - start
# @useR fuzzy_join .... very cool but veeeeeeeeeeeeeeeery slow
start <- Sys.time()
d2 <- d %>%
fuzzy_join(r, by = c("value" = "from", "value" = "to"), match_fun = list(`>=`, `<`)) %>%
select(value, type)
Sys.time() - start
# @jonathande4 cut pretty fast
start <- Sys.time()
d2 <- d
d2$type <- cut(d$value, unique(c(r$from, r$to)), r$type, right = FALSE)
Sys.time() - start
# @WiWeber floor
start <- Sys.time()
d2 <- d %>%
mutate(join_value = floor(value)) %>%
inner_join(r, by = c("join_value" = "from")) %>%
select(value, type)
Sys.time() - start
# @WiWeber cross join - filter
start <- Sys.time()
d2 <- d %>%
inner_join(r, by = "join") %>%
filter(value >= from, value < to) %>%
select(value, type)
Sys.time() - start
# @hardik-gupta sapply
start <- Sys.time()
d2 <- d %>%
mutate(
type = unlist(sapply(value, function (x) r[which(x >= r$from & x < r$to), "type"]))
) %>%
select(value, type)
Sys.time() - start
# @ycw re-dynamic join
start <- Sys.time()
d2 <- d %>% dynamic_join(r)
Sys.time() - start
# @WiWeber range_join
start <- Sys.time()
d2 <- d %>%
range_join(r, "value", "from", "to") %>%
select(value, type)
Sys.time() - start
Results:
# @useR sqldf - fast and intuitive but extra library with horrible code
Time difference of 0.06221986 secs
# @useR fuzzy_join .... very cool but veeeeeeeeeeeeeeeery slow
Time difference of 4.765595 secs
# @jonathande4 cut pretty fast
Time difference of 0.004637003 secs
# @WiWeber floor
Time difference of 0.02223396 secs
# @WiWeber cross join - filter
Time difference of 0.0201931 secs
# @hardik-gupta sapply
Time difference of 5.166633 secs
# @ycw dynamic join
Time difference of 0.03124094 secs
# @WiWeber range_join
Time difference of 0.02691698 secs
greez WiWeber