R – How to join two data frames by nearest time-date?

前端 未结 2 1494
栀梦
栀梦 2020-12-03 06:30

I have 2 data sets, each containing a date-time value in POSIXlt format, and some other numeric and character variables.

I want to combine both data sets based on th

相关标签:
2条回答
  • 2020-12-03 06:43

    I had a similar problem, but instead of using data.table or tidyverse I created my own function amerge for "approximate merge". It takes 4 arguments:

    • two data frames,
    • a vector of column names for "firm" (not approximate) merge - these must exist in both data frames,
    • and the name of a single column (in both data frames) for approximate merge. It will work for any numeric values, including dates.

    The idea was to merge rows 1-to-1 of best matches, and not loose any rows from any data frame. Here is my commented code with a working example.

    amerge <- function(d1, d2, firm=NULL, approx=NULL) {
      rt = Sys.time()
    
      # Take care of conflicting column names
      n2 = data.frame(oldname = names(d2), newname = names(d2))
      n2$newname = as.character(n2$newname)
      n2$newname[(n2$oldname %in% names(d1)) & !(n2$oldname %in% firm)] =
        paste(n2$newname[(n2$oldname %in% names(d1)) & !(n2$oldname %in% firm)], "2", sep=".")
    
      # Add unique row IDs
      if (length(c(firm, approx))>1) {
        d1$ID1 = factor(apply(d1[,c(approx,firm)], 1, paste, collapse=" "))
        d2$ID2 = factor(apply(d2[,c(approx,firm)], 1, paste, collapse=" "))
      } else {
        d1$ID1 = factor(d1[,c(approx,firm)])
        d2$ID2 = factor(d2[,c(approx,firm)])
      }
    
      # Perform initial merge on the 'firm' parameters, if any
      # Otherwise match all to all
      if (length(firm)>0) {
        t1 = merge(d1, d2, by=firm, all=T, suff=c("",".2"))
      } else {
        names(d2)= c(n2$newname,"ID2")
        t1 = data.frame()
        for (i1 in 1:nrow(d1)) {
          trow = d1[i1,]
          t1 = rbind(t1, cbind(trow, d2))
        }
      }
    
      # Match by the most approximate record
      if (length(approx)==1) {
        # Calculate the differential for approximate merging
        t1$DIFF = abs(t1[,approx] - t1[,n2$newname[n2$oldname==approx]])
        # Sort data by ascending DIFF, so that best matching records are used first
        t1 = t1[order(t1$DIFF, t1$ID1, t1$ID2),]
        t2 = data.frame()
        d2$used = 0
        # For each record of d1, find match from d2
        for (i1 in na.omit(unique(t1$ID1))) {
          tx = t1[!is.na(t1$DIFF) & t1$ID1==i1,]
          # If there are non-missing records, get the one with minimum DIFF (top one)
          if (nrow(tx)>0) {
            tx = tx[1,]
            # If matching record found, remove it from the pool, so it's not used again
            t1[!is.na(t1$ID2) & t1$ID2==tx$ID2, c(n2$newname[!(n2$newname %in% firm)], "DIFF")] = NA
            # And mark it as used
            d2$used[d2$ID2==tx$ID2] = 1
          } else {
            # If there are no non-missing records, just get the first one from the top
            tx = t1[!is.na(t1$ID1) & t1$ID1==i1,][1,]
          }
          t2 = rbind(t2,tx)
        }
      } else {
        t2 = t1
      }
      # Make the records the same order as d1
      t2 = t2[match(d1$ID1, t2$ID1),]
      # Add unmatched records from d2 to the end of output
      if (any(d2$used==0)) {
        tx = t1[t1$ID2 %in% d2$ID2[d2$used==0], ]
        tx = tx[!duplicated(tx$ID2),]
        tx[, names(d1)[!(names(d1) %in% c(firm))]] = NA
        t2 = rbind(t2,tx)
        t2[is.na(t2[,approx]), approx] = t2[is.na(t2[,approx]), n2$newname[n2$oldname==approx]]
      }
      t2$DIFF = t2$ID1 = t2$ID2 = NULL
      cat("* Run time: ", round(difftime(Sys.time(),rt, "secs"),1), " seconds.\n", sep="")
      return(t2)
    }
    

    And the example:

    new <- data.frame(ID=c(1,1,1,2), date = as.POSIXct( c("2016-03-02 12:20:00", "2016-03-07 12:20:00", "2016-04-02 12:20:00", "2016-04-12 11:03:00")), new = c("t","u","v","x"))
    old <- data.frame(ID=c(1,1,1,1,1), date = as.POSIXct( c("2016-03-07 12:20:00", "2016-04-02 12:20:00", "2016-03-01 10:09:00", "2015-04-12 10:09:00","2016-03-03 12:20:00")), old = c("a","b","c","d","e"))
    
    amerge(old, new, firm="ID", approx="date")
    

    It outputs:

       ID                date  old              date.2  new
    2   1 2016-03-07 12:20:00    a 2016-03-07 12:20:00    u
    6   1 2016-04-02 12:20:00    b 2016-04-02 12:20:00    v
    7   1 2016-03-01 10:09:00    c                <NA> <NA>
    10  1 2015-04-12 10:09:00    d                <NA> <NA>
    13  1 2016-03-03 12:20:00    e 2016-03-02 12:20:00    t
    16  2 2016-04-12 11:03:00 <NA> 2016-04-12 11:03:00    x
    
    

    So works for my purpose as intended - there is exactly one copy of each row from both data frames - matched by shortest time difference. One note: the function copies date.2 into date column where the date would be missing.

    0 讨论(0)
  • 2020-12-03 07:01

    data.table should work for this (can you explain the error you're coming up against?), although it does tend to convert POSIXlt to POSIXct on its own (perhaps do that conversion on your datetime column manually to keep data.table happy). Also make sure you're setting the key column before using roll.

    (I've created my own example tables here to make my life that little bit easier. If you want to use dput on yours, I'm happy to update this example with your data):

    new <- data.table( date = as.POSIXct( c( "2016-03-02 12:20:00", "2016-03-07 12:20:00", "2016-04-02 12:20:00" ) ), data.new = c( "t","u","v" ) )
    head( new, 2 )
    
                      date data.new
    1: 2016-03-02 12:20:00        t
    2: 2016-03-07 12:20:00        u
    
    old <- data.table( date = as.POSIXct( c( "2016-03-02 12:20:00", "2016-03-07 12:20:00", "2016-04-02 12:20:00", "2015-03-02 12:20:00" ) ), data.old = c( "a","b","c","d" ) )
    head( old, 2 )
    
    
                      date data.old
    1: 2016-03-02 12:20:00        a
    2: 2016-03-07 12:20:00        b
    
    setkey( new, date )
    setkey( old, date )
    
    combined <- new[ old, roll = "nearest" ]
    combined
    
                      date data.new data.old
    1: 2015-03-02 12:20:00        t        d
    2: 2016-03-02 12:20:00        t        a
    3: 2016-03-07 12:20:00        u        b
    4: 2016-04-02 12:20:00        v        c
    

    I've intentionally made the two tables different row lengths, in order to show how the rolling join deals with multiple matches. You can switch the way it joins with:

    combined <- old[ new, roll = "nearest" ]
    combined
    
                      date data.old data.new
    1: 2016-03-02 12:20:00        a        t
    2: 2016-03-07 12:20:00        b        u
    3: 2016-04-02 12:20:00        c        v
    
    0 讨论(0)
提交回复
热议问题