R dplyr join by range or virtual column

后端 未结 6 1313
独厮守ぢ
独厮守ぢ 2020-12-10 16:33

I want to join two tibbles by a range or a virtual column. but it seems the by - parameter just allow to handle chr oder vector(chr) o

相关标签:
6条回答
  • 2020-12-10 17:05

    We can use mutate and case_when from dplyr.

    library(dplyr)
    
    d2 <- d %>%
      mutate(class = case_when(
        value >= 1 & value < 2 ~ "A",
        value >= 2 & value < 3 ~ "B",
        value >= 3 & value < 4 ~ "C",
        value >= 4 & value < 5 ~ "D",
        value >= 5 & value < 6 ~ "E",
        value >= 6             ~ "F"
      ))
    d2
    # A tibble: 26 x 2
       value class
       <dbl> <chr>
     1   1.0     A
     2   1.2     A
     3   1.4     A
     4   1.6     A
     5   1.8     A
     6   2.0     B
     7   2.2     B
     8   2.4     B
     9   2.6     B
    10   2.8     B
    # ... with 16 more rows
    

    Update

    Here is a workaround by defining a function for this task.

    d <- tibble(value = seq(1,6, by = 0.2))
    r <- tibble(from = seq(1,6), to = c(seq(2,6),Inf), class = LETTERS[seq(1,6)])
    
    library(dplyr)
    
    # Define a function for dynamic join
    dynamic_join <- function(d, r){
    
      if (!("class" %in% colnames(d))){
        d[["class"]] <- NA_character_
      }
    
      d <- d %>%
        mutate(class = ifelse(value >= r$from & value < r$to, r$class, class))
      return(d)
    }
    
    re_dynamic_join <- function(d, r){
      r_list <- split(r, r$class)
      for (i in 1:length(r_list)){
        d <- dynamic_join(d, r_list[[i]])
      }
      return(d)
    }
    
    # Apply the function
    d2 <- d %>% re_dynamic_join(r)
    d2
    # A tibble: 26 x 2
       value class
       <dbl> <chr>
     1   1.0     A
     2   1.2     A
     3   1.4     A
     4   1.6     A
     5   1.8     A
     6   2.0     B
     7   2.2     B
     8   2.4     B
     9   2.6     B
    10   2.8     B
    # ... with 16 more rows
    
    0 讨论(0)
  • 2020-12-10 17:06

    I really liked @WiWeber's range_join function, but it gives an error if a record is not within range. Here's a modification

    library(dplyr)
    
    d <- tibble(value = c(seq(1,4, by = 0.2),9))
    r <- tibble(from = seq(1,5), to = c(seq(2,5),8), class = LETTERS[seq(1,5)])
    
    
    range_join <- function(x, y, value, left, right){
    all_matches <- tibble()
    x = as.data.frame(x)
    y = as.data.frame(y)
    x$index=x[,value]
    for (i in 1:nrow(y)){
        matches = x %>% filter(index>=y[i,left] & index<= y[i,right])
        if (nrow(matches)>0){
            all_matches = all_matches %>% bind_rows(matches %>% cbind(y[i,]))
        }
    }
    all_matches = all_matches %>% select(-index)
    return(all_matches)
    }
    
    
    data <- d %>% 
    range_join(r, "value", "from", "to")
    
    data
    
    0 讨论(0)
  • 2020-12-10 17:07

    I don't think inequality joins is implemented in dplyr yet, or it ever will (see this discussion on Join on inequality constraints), but this is a good situation to use an SQL join:

    library(tibble)
    library(sqldf)
    
    as.tibble(sqldf("select d.value, r.class from d
                    join r on d.value >= r.'from' and 
                              d.value < r.'to'"))
    

    Alternatively, if you want to integrate the join into your dplyr chain, you can use fuzzyjoin::fuzzy_join:

    library(dplyr)
    library(fuzzyjoin)
    
    d %>%
      fuzzy_join(r, by = c("value" = "from", "value" = "to"), 
                 match_fun = list(`>=`, `<`)) %>%
      select(value, class)
    

    Result:

    # A tibble: 31 x 2
       value class
       <dbl> <chr>
     1   1.0     A
     2   1.2     A
     3   1.4     A
     4   1.6     A
     5   1.8     A
     6   2.0     A
     7   2.0     B
     8   2.2     B
     9   2.4     B
    10   2.6     B
    # ... with 21 more rows
    

    Notice I added single quotes around from and to since those are reserved words for the SQL language.

    0 讨论(0)
  • 2020-12-10 17:09

    We can use sapply for this

    library(tibble)
    
    d <- tibble(value = seq(1,6, by = 0.2))
    r <- tibble(from = seq(1,6), to = c(seq(2,6),Inf), class = LETTERS[seq(1,6)])
    d <- cbind(d, data.frame(class = (unlist(sapply(d$value, function (x) r[which(x >= r$from & x < r$to), "class"]))) ) )
    
    d
       value class
    1    1.0     A
    2    1.2     A
    3    1.4     A
    4    1.6     A
    5    1.8     A
    6    2.0     B
    7    2.2     B
    8    2.4     B
    9    2.6     B
    10   2.8     B
    11   3.0     C
    12   3.2     C
    13   3.4     C
    14   3.6     C
    15   3.8     C
    16   4.0     D
    17   4.2     D
    18   4.4     D
    19   4.6     D
    20   4.8     D
    21   5.0     E
    22   5.2     E
    23   5.4     E
    24   5.6     E
    25   5.8     E
    26   6.0     F 
    
    0 讨论(0)
  • 2020-12-10 17:16

    You use the cut function to create a "class" in object d and then use a left join.

    d <- tibble(value = seq(1,6, by = 0.2))
    r <- tibble(from = seq(1,6), to = c(seq(2,6),Inf), class = LETTERS[seq(1,6)])
    
    d[["class"]] <- cut(d[["value"]], c(0,2,3,4,5,6,Inf), c('A',"B", "C", "D", "E", "F"), right = FALSE)
    d <- left_join(d, r)
    

    To get the right buckets, you just need to work with the cut function to get what you want.

    0 讨论(0)
  • 2020-12-10 17:18

    Ok thanks for advices, this was pretty interesting. I finally wrote a function range_join (inspired by @ycw's code) and compared all described solution in view of runtime.

    I like fuzzy_join but with only 50k rows in d it needs more than 40sec. Thats too slow.

    Here the result with 5k rows in d

    library(dplyr)
    library(fuzzyjoin)
    library(sqldf)
    
    #join by range by @WiWeber
    range_join <- function(x, y, value, left, right){
      x_result <- tibble()
      for (y_ in split(y, 1:nrow(y)))
        x_result <-  x_result %>% bind_rows(x[x[[value]] >= y_[[left]] & x[[value]] < y_[[right]],] %>% cbind(y_))
      return(x_result)
    }
    
    #dynamic join by @ycw
    dynamic_join <- function(d, r){
      d$type <- NA_character_
      for (r_ in split(r, r$type))
        d <- d %>% mutate(type = ifelse(value >= r_$from & value < r_$to, r_$type, type))
      return(d)
    }
    
    d <- tibble(value = seq(1,6, by = 0.001), join = TRUE)
    r <- tibble(from = seq(1,6), to = c(seq(2,6),Inf), type = LETTERS[seq(1,6)], join = TRUE)
    
    # @useR sqldf - fast and intuitive but extra library with horrible code
    start <- Sys.time()
    d2 <- tbl_df(sqldf("select d.value, r.type from d
                    join r on d.value >= r.'from' and 
                    d.value < r.'to'"))
    Sys.time() - start
    
    # @useR fuzzy_join .... very cool but veeeeeeeeeeeeeeeery slow
    start <- Sys.time()
    d2 <- d %>%
      fuzzy_join(r, by = c("value" = "from", "value" = "to"), match_fun = list(`>=`, `<`)) %>%
      select(value, type)
    Sys.time() - start
    
    
    # @jonathande4 cut pretty fast
    start <- Sys.time()
    d2 <- d
    d2$type <- cut(d$value, unique(c(r$from, r$to)), r$type, right = FALSE)
    Sys.time() - start
    
    # @WiWeber floor
    start <- Sys.time()
    d2 <- d %>% 
      mutate(join_value = floor(value)) %>% 
      inner_join(r, by = c("join_value" = "from")) %>% 
      select(value, type)
    Sys.time() - start
    
    #  @WiWeber cross join - filter
    start <- Sys.time()
    d2 <- d %>%
      inner_join(r, by = "join") %>% 
      filter(value >= from, value < to) %>%
      select(value, type)
    Sys.time() - start
    
    # @hardik-gupta sapply
    start <- Sys.time()
    d2 <- d %>%
      mutate(
        type = unlist(sapply(value, function (x) r[which(x >= r$from & x < r$to), "type"]))
      ) %>% 
      select(value, type)
    Sys.time() - start
    
    # @ycw re-dynamic join
    start <- Sys.time()
    d2 <- d %>% dynamic_join(r)
    Sys.time() - start
    
    # @WiWeber range_join
    start <- Sys.time()
    d2 <- d %>% 
      range_join(r, "value", "from", "to") %>%
      select(value, type)
    Sys.time() - start
    

    Results:

    # @useR sqldf - fast and intuitive but extra library with horrible code
    Time difference of 0.06221986 secs
    
    # @useR fuzzy_join .... very cool but veeeeeeeeeeeeeeeery slow
    Time difference of 4.765595 secs
    
    # @jonathande4 cut pretty fast
    Time difference of 0.004637003 secs
    
    # @WiWeber floor
    Time difference of 0.02223396 secs
    
    # @WiWeber cross join - filter
    Time difference of 0.0201931 secs
    
    # @hardik-gupta sapply
    Time difference of 5.166633 secs
    
    # @ycw dynamic join
    Time difference of 0.03124094 secs
    
    # @WiWeber range_join
    Time difference of 0.02691698 secs
    

    greez WiWeber

    0 讨论(0)
提交回复
热议问题