I want to replace NA values with the last non-NA value in a data.table, using data.table itself. I have one solution, but it's considerably slower than na.locf.
As I mentioned in my comment, Rcpp is pretty fast for this. Below I compare the zoo::na.locf approach, @eddi's f3 and f4, and the Rcpp approach posted here by @RomainFrancois.
First, the benchmark results:
# Time each fill strategy over 10 repetitions.
# NOTE(review): f.zoo, eddi.f3 and eddi.f4 update X by reference via `:=`,
# so repetitions after the first operate on data that has already been
# filled -- confirm this is acceptable for the comparison being made.
microbenchmark(
  f.zoo(m1),
  eddi.f3(m2),
  eddi.f4(m3),
  f.Rcpp(m4),
  times = 10
)
## Unit: milliseconds
## expr min lq median uq max neval
## f.zoo(m1) 1297.969 1403.67418 1443.5441 1527.7644 1597.9724 10
## eddi.f3(m2) 2982.103 2998.48809 3039.6543 3068.9303 3078.3963 10
## eddi.f4(m3) 1970.650 2017.55740 2061.6599 2074.1497 2099.8892 10
## f.Rcpp(m4) 95.411 98.44505 107.6925 119.2838 171.7855 10
And the function definitions:
library(data.table)
library(zoo)
library(microbenchmark)
library(Rcpp)
# Build one test table per function under test.
# `<-` does not copy a data.table: a chained assignment would bind all four
# names to the same object, and because the functions below modify their
# input by reference with `:=`, running one of them would silently change
# the "input" of all the others. Give each function its own deep copy.
m1 <- data.table(X = rep(c(NA, NA, 1, 2, NA, NA, NA, 6, 7, 8), 1e6))
m2 <- copy(m1)
m3 <- copy(m1)
m4 <- copy(m1)
# Fill NAs in column X with the last observed value using zoo::na.locf.
#
# x: a data.table with a column named X.
# Returns x itself; the column is overwritten in place by `:=`, so the
# caller's table is modified as a side effect.
f.zoo <- function(x) {
  # na.rm = FALSE keeps leading NAs, so the result has the same length as X.
  # Spelled out as FALSE because `F` is an ordinary variable that can be
  # reassigned.
  x[, X := na.locf(X, na.rm = FALSE)]
  x
}
# Grouped fill: cumsum(!is.na(X)) increases by one at every non-NA value, so
# each group starts at a non-NA value followed by the NAs after it. Writing
# the group's first element over the whole group carries that value forward.
# Updates x by reference and returns it invisibly.
eddi.f3 <- function(x) {
  x[, X := X[1], by = cumsum(!is.na(X))]
}
# Join-based fill. Updates x by reference and returns it invisibly.
eddi.f4 <- function(x) {
  # Run id: increases by one at every non-NA value of X.
  x[, tmp := cumsum(!is.na(X))]
  # tmp is non-decreasing by construction, so flag it as the key directly
  # instead of paying for a sort.
  setattr(x, "sorted", "tmp")
  # Rows holding the value that starts each run.
  run_starts <- x[!is.na(X)]
  # Join back on tmp and copy each run's starting value over the whole run.
  x[run_starts, X := i.X]
  # Drop the helper column.
  x[, tmp := NULL]
}
# Make the Cpp function available
cppFunction('
NumericVector naLocfCpp(NumericVector x) {
double *p=x.begin(), *end = x.end() ;
double v = *p ; p++ ;
while( p < end ){
while( p
And all produce identical results:
# Run each implementation once and compare every result against the zoo one.
out1 <- f.zoo(m1)
out2 <- eddi.f3(m2)
out3 <- eddi.f4(m3)
out4 <- f.Rcpp(m4)
all(vapply(list(out2, out3, out4), identical, logical(1), y = out1))
## TRUE