How to fill in the preceding numbers whenever there is a 0 in R?

前端 未结 5 1344
小蘑菇
小蘑菇 2021-02-13 18:19

I have a string of numbers:

n1 = c(1, 1, 0, 6, 0, 0, 10, 10, 11, 12, 0, 0, 19, 23, 0, 0)

I need to replace 0 with the corresponding number righ

相关标签:
5条回答
  • 2021-02-13 18:19

    Here's a solution using data.table:

    require(data.table) ## >= 1.9.2
    idx = which(!n1 %in% 0L)
    DT <- data.table(val=n1[idx], idx=idx)
    setattr(DT, 'sorted', "idx")
    n1 = DT[J(seq_along(n1)), roll=Inf]$val
    #  [1]  1  1  1  6  6  6 10 10 11 12 12 12 19 23 23 23
    

    Benchmarks on bigger data:

    require(zoo)
    require(data.table)
    
    set.seed(1L)
    n1 = sample(c(0:10), 1e6, TRUE)
    
    ## data.table
    dt_fun <- function(n1) {
        idx = which(!n1 %in% 0L)
        DT <- data.table(val=n1[idx], idx=idx)
        setattr(DT, 'sorted', "idx")
        DT[J(seq_along(n1)), roll=Inf]$val
    }
    
    # na.locf from zoo - gagolews
    zoo_fun <- function(n1) {
        wh_na <- which(is.na(n1))
        n1[n1 == 0] <- NA
        n2 <- na.locf(n1)
        n2[wh_na] <- NA
        n2
    }
    
    ## rle - thelatemail
    rle_fun <- function(n1) {
        r <- rle(n1)
        r$values[which(r$values==0)] <- r$values[which(r$values==0)-1]
        inverse.rle(r)
    }
    
    flodel_fun <- function(n1) n1[cummax(seq_along(n1) * (n1 != 0))]
    
    require(microbenchmark)
    microbenchmark(a1 <- dt_fun(n1), 
                   a2 <- zoo_fun(n1), 
                   a3 <- rle_fun(n1), 
                   a4 <- flodel_fun(n1), times=10L)
    

    Here's the benchmarking result:

    # Unit: milliseconds
    #                  expr       min        lq    median        uq       max neval
    #      a1 <- dt_fun(n1) 155.49495 164.04133 199.39133 243.22995 289.80908    10
    #     a2 <- zoo_fun(n1) 596.33039 632.07841 671.51439 682.85950 697.33500    10
    #     a3 <- rle_fun(n1) 356.95103 377.61284 383.63109 406.79794 495.09942    10
    #  a4 <- flodel_fun(n1)  51.52259  55.54499  56.20325  56.39517  60.15248    10
    
    0 讨论(0)
  • 2021-02-13 18:24

    Don't forget the simplicity and performance gain of Rcpp...

    Using Arun's sample size I get...

    Unit: milliseconds
                                      expr       min        lq    median        uq      max neval
                             rollValue(n1)  3.998953  4.105954  5.803294  8.774286 36.52492   100
     n1[cummax(seq_along(n1) * (n1 != 0))] 17.634569 18.295344 20.698524 23.104847 74.72795   100
    

    The .cpp file to source is simply...

    #include <Rcpp.h>
    using namespace Rcpp;
    
    // [[Rcpp::plugins("cpp11")]]
    
    // [[Rcpp::export]]
    NumericVector rollValue(const NumericVector v) {
      auto out = clone(v);
      auto tmp = v[0];
      for( auto & e : out) {
        if( e == 0 ) {
          e = tmp;
          continue;
        }
        tmp = e;
      }
      return out;
    }
    
    0 讨论(0)
  • 2021-02-13 18:31

    Try na.locf() from the package zoo:

    library(zoo)
    n1 <- c(1, 1, 0, 6, 0, 0, 10, 10, 11, 12, 0, 0, 19, 23, 0, 0)
    n1[n1 == 0] <- NA
    na.locf(n1)
    ## [1]  1  1  1  6  6  6 10 10 11 12 12 12 19 23 23 23
    

    This function replaces each NA with the most recent non-NA prior to it. This is why I substituted all 0s with NA before applying it.

    Here's a discussion on a similar (yet not identical) issue.

    EDIT: If n1 eventually consists of NAs, try e.g.

    n1 <- c(1, 1, 0, 6, 0, 0, 10, NA, 11, 12, 0, 0, 19, NA, 0, 0)
    wh_na <- which(is.na(n1))
    n1[n1 == 0] <- NA
    n2 <- na.locf(n1)
    n2[wh_na] <- NA
    n2
    ##  [1]  1  1  1  6  6  6 10 NA 11 12 12 12 19 NA 19 19
    

    EDIT2: This approach for c(1,NA,0) returns c(1,NA,1). The other two funs give c(1,NA,NA). In other words, here we're replacing 0 with last non-missing, non-zero value. Choose your favourite option.

    EDIT3: Inspired by @Thell's Rcpp solution, I'd like to add another one - this time using "pure" R/C API.

    library('inline')
    sexp0 <- cfunction(signature(x="numeric"), "
       x = Rf_coerceVector(x, INTSXP); // will not work for factors
       R_len_t n = LENGTH(x);
       SEXP ret;
       PROTECT(ret = Rf_allocVector(INTSXP, n));
       int lval = NA_INTEGER;
       int* xin = INTEGER(x);
       int* rin = INTEGER(ret);
       for (R_len_t i=0; i<n; ++i, ++xin, ++rin) {
          if (*xin == 0)
             *rin = lval;
          else {
             lval = *xin;
             *rin = lval;
          }
       }
       UNPROTECT(1);
       return ret;
    ", language="C++")
    

    In this case we will get c(1,NA,NA) for c(1,NA,0). Some benchmarks:

    library(microbenchmark)
    set.seed(1L)
    n1 <- sample(c(0:10), 1e6, TRUE)
    microbenchmark(sexp0(n1), rollValue(n1), n1[cummax(seq_along(n1) * (n1 != 0))])
    ## Unit: milliseconds
    ##                                   expr       min        lq    median        uq       max neval
    ##                              sexp0(n1)  2.468588  2.494233  3.198711  4.216908  63.21236   100
    ##                          rollValue(n1)  8.151000  9.359731 10.603078 12.760594  75.88901   100
    ##  n1[cummax(seq_along(n1) * (n1 != 0))] 32.899420 36.956711 39.673726 45.419449 106.48180   100
    
    0 讨论(0)
  • Because rle is the answer to everything:

    #make an example including an NA value
    n1 <- c(1, 1, 0, 6, NA, 0, 10, 10, 11, 12, 0, 0, 19, 23, 0, 0)
    r <- rle(n1)
    r$values[which(r$values==0)] <- r$values[which(r$values==0)-1]
    inverse.rle(r)
    # [1]  1  1  1  6 NA NA 10 10 11 12 12 12 19 23 23 23
    

    A version that skips NAs would be:

    n1 <- c(1, 1, 0, 6, NA, 0, 10, 10, 11, 12, 0, 0, 19, 23, 0, 0)
    r <- rle(n1[!is.na(n1)])
    r$values[which(r$values==0)] <- r$values[which(r$values==0)-1]
    n1[!is.na(n1)] <- inverse.rle(r)
    n1
    # [1]  1  1  1  6 NA  6 10 10 11 12 12 12 19 23 23 23
    
    0 讨论(0)
  • 2021-02-13 18:42
    n2 <- n1[cummax(seq_along(n1) * (n1 != 0))]
    
    0 讨论(0)
提交回复
热议问题