Inner join using an inequality expression

前端 未结 2 449
猫巷女王i
猫巷女王i 2020-12-31 18:22

Background

(Not required for the question, but may be useful to read)

Rolling join on data.table with duplicate keys

Odd behaviour w

相关标签:
2条回答
  • 2020-12-31 19:03

    A potential solution is to use `foverlaps()`, adding artificial interval columns so that the inequality can be expressed as an interval-overlap test.

    # Express the inequality join "departure strictly after arrival" through
    # foverlaps(), which only understands closed intervals.
    # dt_arrive and dt_depart must provide the columns txn_id, place, journey_id
    # and arrival_minutes; both are converted to data.tables by reference.
    setDT(dt_arrive)
    setDT(dt_depart)

    # Every arrival becomes the zero-width interval [t, t].
    dt_arrive[, arrival_minutes_copy := arrival_minutes]
    ## reorder columns
    dt_arrive <- dt_arrive[, .(txn_id, place, journey_id, arrival_minutes, arrival_minutes_copy)]

    # Every departure becomes the interval [lower bound, t]. The lower bound must
    # not exceed any arrival time: taking min() over the departures alone would
    # silently drop arrivals that precede the earliest departure.
    dt_depart[, arrival_minutes_copy := min(dt_arrive$arrival_minutes, arrival_minutes)]
    ## reorder columns
    dt_depart <- dt_depart[, .(txn_id, place, journey_id, arrival_minutes_copy, arrival_minutes)]

    # foverlaps() takes the interval from the last two key columns (start, end).
    setkey(dt_arrive, arrival_minutes, arrival_minutes_copy)
    setkey(dt_depart, arrival_minutes_copy, arrival_minutes)

    # type = "within" compares closed intervals, so an arrival at exactly the
    # departure minute would also match; the trailing filter enforces the strict
    # inequality. In the result the departure columns keep their names and the
    # clashing arrival columns carry the "i." prefix.
    # NOTE(review): only the interval columns are keyed, so txn_id and place do
    # not act as join conditions here -- confirm that this is intended.
    foverlaps(dt_arrive,
              dt_depart,
              type = "within",
              nomatch = 0L)[arrival_minutes > i.arrival_minutes]
    
    
    #      place txn_id journey_id arrival_minutes_copy arrival_minutes i.txn_id i.journey_id i.arrival_minutes i.arrival_minutes_copy
    # 1: place_a      1         12                  489             519        1            1               515                    515
    # 2: place_a      1         13                  489             543        1            1               515                    515
    # 3: place_a      1         13                  489             543        1            2               534                    534
    

    Benchmarking

    library(microbenchmark)
    
    # Benchmark wrapper around the foverlaps() approach.
    #   dt_a: table passed as x to foverlaps()
    #   dt_d: table passed as y to foverlaps()
    # Returns the rows whose dt_a interval lies within a dt_d interval
    # (type = "within"); rows without a match are dropped (nomatch = 0L).
    # NOTE(review): presumably both tables already carry the interval columns
    # and keys that foverlaps() requires -- verify against the caller.
    fun_foverlap <- function(dt_a, dt_d) {
        foverlaps(x = dt_a, y = dt_d, type = "within", nomatch = 0L)
    }
    
    # Benchmark wrapper around the merge-then-filter approach.
    #   dt_a, dt_d: the two tables handed to merge() as x and y
    # Merges the tables with Cartesian (many-to-many) matches allowed, then
    # keeps the rows where arrival_minutes.y is strictly greater than
    # arrival_minutes.x.
    fun_merge <- function(dt_a, dt_d) {
        merged <- merge(dt_a, dt_d, allow.cartesian = TRUE)
        merged[arrival_minutes.x < arrival_minutes.y]
    }
    
    # Benchmark wrapper around the dt_a[dt_d] join-then-filter approach.
    #   dt_a: table being subset (x in x[i])
    #   dt_d: table used as i; its clashing columns appear with the "i." prefix
    # Joins with unmatched rows dropped (nomatch = 0) and Cartesian matches
    # allowed, then keeps the rows where i.arrival_minutes is strictly greater
    # than arrival_minutes.
    fun_nomatch <- function(dt_a, dt_d) {
        joined <- dt_a[dt_d, nomatch = 0, allow.cartesian = TRUE]
        joined[arrival_minutes < i.arrival_minutes]
    }
    
    # Time the three join strategies against each other; each function is given
    # its own pair of tables.
    microbenchmark(
        fun_foverlap(dt_arrive_foverlap, dt_depart_foverlap),
        fun_merge(dt_arrive_merge, dt_depart_merge),
        fun_nomatch(dt_arrive_nomatch, dt_depart_nomatch)
    )
    
    # Unit: microseconds
     #                                                 expr      min       lq      mean   median        uq      max neval cld
     # fun_foverlap(dt_arrive_foverlap, dt_depart_foverlap) 3538.189 3717.077 3967.6648 3872.586 4006.7205 5812.355   100   c
     #          fun_merge(dt_arrive_merge, dt_depart_merge)  883.697  925.655  980.4159  942.877  967.9745 2223.147   100  b 
     #    fun_nomatch(dt_arrive_nomatch, dt_depart_nomatch)  593.082  625.471  682.8975  643.034  665.4125 2077.748   100 a 
    
    0 讨论(0)
  • 2020-12-31 19:16

    Here's the unclever approach: take the cross/Cartesian join, and then filter.

    # Merge first (Cartesian matches allowed), then keep only the rows where
    # the second table's arrival_minutes is strictly later than the first's.
    merge(dt_arrive, dt_depart, allow.cartesian = TRUE)[
        arrival_minutes.x < arrival_minutes.y
    ]
    
    #    txn_id   place arrival_minutes.x journey_id.x arrival_minutes.y journey_id.y
    # 1:      1 place_a               515            1               519           12
    # 2:      1 place_a               515            1               543           13
    # 3:      1 place_a               534            2               543           13
    

    By taking the Cartesian join, we're liable to eat up a lot of memory.

    0 讨论(0)
提交回复
热议问题