Replace NA with last non-NA in data.table by using only data.table

后端 未结 2 769
遇见更好的自我
遇见更好的自我 2021-01-13 08:59

I want to replace NA values with the last non-NA value in a data.table, using only data.table. I have one solution, but it's considerably slower than na.locf

相关标签:
2条回答
  • 2021-01-13 09:19

    As I mentioned in my comment, Rcpp is pretty fast for this. Below I compare the zoo::na.locf approach, @eddi's f3 and f4, and the Rcpp approach posted here by @RomainFrancois.

    First, the benchmark results:

    # Time each forward-fill implementation 10 times on its own input table.
    # NOTE(review): every function here updates its table in place, so iterations
    # after the first appear to run on data whose interior NAs are already filled.
    microbenchmark(f.zoo(m1), eddi.f3(m2), eddi.f4(m3), f.Rcpp(m4), times = 10)
    
    ## Unit: milliseconds
    ##         expr      min         lq    median        uq       max neval
    ##    f.zoo(m1) 1297.969 1403.67418 1443.5441 1527.7644 1597.9724    10
    ##  eddi.f3(m2) 2982.103 2998.48809 3039.6543 3068.9303 3078.3963    10
    ##  eddi.f4(m3) 1970.650 2017.55740 2061.6599 2074.1497 2099.8892    10
    ##   f.Rcpp(m4)   95.411   98.44505  107.6925  119.2838  171.7855    10
    

    And the function definitions:

    library(data.table)
    library(zoo)
    library(microbenchmark)
    library(Rcpp)
    
    # Build the test table once, then give every implementation its own deep copy.
    # A chained `m1 <- m2 <- m3 <- m4 <- data.table(...)` binds all four names to
    # the SAME object; because the functions below update by reference (`:=`,
    # setattr, in-place C++), they would all operate on one shared table and the
    # final identical() comparison would be trivially TRUE.
    m1 <- data.table(X = rep(c(NA, NA, 1, 2, NA, NA, NA, 6, 7, 8), 1e6))
    m2 <- copy(m1)
    m3 <- copy(m1)
    m4 <- copy(m1)
    
    # Forward-fill NAs in column X with zoo::na.locf.
    #
    # x: a data.table with a column X. It is updated by reference via `:=`.
    # Returns x itself, visibly, so the result can be assigned or printed.
    f.zoo <- function(x) {
      # na.rm = FALSE keeps the output the same length as X (leading NAs stay NA).
      # Spelled FALSE rather than F, because F is an ordinary reassignable variable.
      x[, X := na.locf(X, na.rm = FALSE)]
      x
    }
    
    # Forward-fill X with a grouped assignment: cumsum(!is.na(X)) starts a new
    # group at every non-NA value, so each group is one observed value followed by
    # its trailing NAs, and X[1] is that observed value. Updates x by reference.
    eddi.f3 <- function(x) {
      x[, X := X[1], by = cumsum(!is.na(X))]
    }
    
    # Forward-fill X via a keyed self-join instead of a grouped assignment.
    # Modifies x by reference (uses `:=` and setattr).
    eddi.f4 = function(x) {
      # tmp increases by one at every non-NA X, so each non-NA value and the NAs
      # that follow it share one tmp value; leading NAs get tmp == 0.
      x[, tmp := cumsum(!is.na(X))]
      # Mark tmp as the key by setting the attribute directly (no sort, no checks).
      # cumsum of a logical is non-decreasing, so rows are already in tmp order.
      setattr(x, "sorted", "tmp")
      # Join x against its own non-NA rows and copy their X (i.X) into the matching
      # rows, then drop the helper column.
      # NOTE(review): presumably the join is on the key column tmp, and rows with
      # tmp == 0 find no match and stay NA -- confirm with the data.table version
      # in use.
      x[x[!is.na(X)], X := i.X][, tmp := NULL]
    }
    
    # Make the Cpp function available.
    # naLocfCpp(x) carries the last observed value forward over runs of NA.
    # The fill is written through a raw pointer into x itself, and x is also
    # returned; f.Rcpp discards the return value and relies on the in-place write.
    # Leading NAs (nothing observed before them) are left as NA.
    cppFunction('
    NumericVector naLocfCpp(NumericVector x) {
        // Nothing to fill in an empty vector; this also avoids dereferencing
        // begin() when there is no first element.
        if (x.size() == 0) return x;

        double *p = x.begin(), *end = x.end();
        double v = *p;  // value to carry forward (may itself be NA at the start)
        ++p;

        while (p < end) {
            // Skip over the current run of observed (non-NA) values.
            while (p < end && !NumericVector::is_na(*p)) ++p;

            // The element just before the NA run is the value to carry forward.
            v = *(p - 1);

            // Overwrite the run of NAs with that value.
            while (p < end && NumericVector::is_na(*p)) {
                *p = v;
                ++p;
            }
        }

        return x;
    }')
    
    # Forward-fill column X of x using the compiled naLocfCpp helper.
    # The helper's return value is intentionally discarded: this wrapper depends
    # on naLocfCpp writing the filled values into the column vector it is handed.
    # NOTE(review): that presumably only holds when X is stored as double --
    # confirm before using it on integer columns.
    f.Rcpp <- function(x) {
      naLocfCpp(x[["X"]])
      x
    }
    

    And all produce identical results:

    # Run every implementation once and compare each result to the zoo baseline.
    out1 <- f.zoo(m1)
    out2 <- eddi.f3(m2)
    out3 <- eddi.f4(m3)
    out4 <- f.Rcpp(m4)
    
    identical(out1, out2) && identical(out1, out3) && identical(out1, out4)
    
    ## TRUE
    
    0 讨论(0)
  • 2021-01-13 09:28

    Here's a data.table-only solution, but it's slightly slower than na.locf:

    # cumsum(!is.na(X)) starts a new group at each non-NA value, so every group is
    # one observed value followed by its NAs; X[1] is recycled across the group.
    # `:=` updates m1 by reference. Leading NAs form their own group and stay NA.
    m1[, X := X[1], by = cumsum(!is.na(X))]
    m1
    #       X
    #   1: NA
    #   2: NA
    #   3:  1
    #   4:  2
    #   5:  2
    #  ---   
    # 996:  2
    # 997:  2
    # 998:  6
    # 999:  7
    #1000:  8
    

    Speed test:

    m1 <- data.table(X = rep(c(NA, NA, 1, 2, NA, NA, NA, 6, 7, 8), 1e6))

    # Baseline the timings are compared against: zoo's na.locf applied to X.
    # NOTE(review): f1 was called below but not defined in this snippet; it is
    # reconstructed here as the na.locf baseline that the text refers to.
    f1 <- function(x) x[, X := zoo::na.locf(X, na.rm = FALSE)]

    # data.table-only version: one group per observed value plus its trailing NAs.
    f3 <- function(x) x[, X := X[1], by = cumsum(!is.na(X))]

    # copy() hands each run a fresh table, since both functions update by reference.
    system.time(f1(copy(m1)))
    # user  system elapsed 
    # 3.84    0.58    4.62 
    system.time(f3(copy(m1)))
    # user  system elapsed 
    # 5.56    0.19    6.04 
    

    And here's a perverse way of making it faster, but I think one that makes it considerably less readable:

    # Forward-fill X via a keyed self-join; modifies x by reference.
    f4 = function(x) {
      # tmp increases by one at every non-NA X, so each observed value and the NAs
      # after it share one tmp value; leading NAs get tmp == 0.
      x[, tmp := cumsum(!is.na(X))]
      # cumsum of a logical is non-decreasing, so rows are already in tmp order and
      # the key attribute can be set directly without sorting.
      setattr(x, "sorted", "tmp") # set the key without any checks
      # Join x against its own non-NA rows, copy their X (i.X) into the matching
      # rows, then drop the helper column.
      # NOTE(review): presumably the join is on the key column tmp -- confirm with
      # the data.table version in use.
      x[x[!is.na(X)], X := i.X][, tmp := NULL]
    }
    
    # Time f4 on a copy of m1 so that m1 itself is not modified by the run.
    system.time(f4(copy(m1)))
    # user  system elapsed 
    # 3.32    0.51    4.00 
    
    0 讨论(0)
提交回复
热议问题