R fill vector efficiently

后端 未结 4 646
不思量自难忘°
不思量自难忘° 2021-01-22 20:22

I have a fairly big vector (>500,000 in length). It contains a bunch of NA interspersed with 1 and it is always guaranteed that it begins with 1<

相关标签:
4条回答
  • 2021-01-22 20:26

    A vectorised solution would look like:

    v1[-1] <- ifelse(diff(v2), 0, v1[-length(v1)])
    

    But the above won't work, and I don't think you can avoid an explicit loop since, if I understand correctly, you want to propagate new values. So, how about:

    cmp <- diff(v2)
    for (i in 2:length(v1)){
        v1[i] <- if(cmp[i-1]) 0 else v1[i-1]
    }
    
    0 讨论(0)
  • 2021-01-22 20:31

    There's probably a faster solution, but this is the best I could come up with in a couple minutes. My solution is slower than the OPs for small vectors, but increasingly faster for larger vectors.

    library(zoo)  # for na.locf
    library(rbenchmark)
    
    v1<-c(1,NA,NA,NA,1,NA,NA,NA,NA,NA,1,NA,NA,1,NA,1,NA,NA,NA,NA,NA,NA,NA,NA,NA,1)
    v2<-c(10,10,10,9,10,9,9,9,9,9,10,10,10,11,8,12,12,12,12,12,12,12,12,12,12,13)
    V1 <- rep(v1, each=20000)  # 520,000 observations
    V2 <- rep(v2, each=20000)  # 520,000 observations
    
    fun1 <- function(v1,v2) {
      for (i in 2:length(v1)){ 
        if (!is.na(v1[i-1]) && is.na(v1[i]) && v2[i]==v2[i-1]){
          v1[i]<-1
        }
      }
      v1
    }
    fun2 <- function(v1,v2) {
      # create groups in which we need to assess missing values
      d <- cumsum(as.logical(c(0,diff(v2))))
      # for each group, carry the first obs forward
      ave(v1, d, FUN=function(x) na.locf(x, na.rm=FALSE))
    }
    all.equal(fun1(V1,V2), fun2(V1,V2))
    # [1] TRUE
    benchmark(fun1(V1,V2), fun2(V1,V2))
    #           test replications elapsed relative user.self sys.self
    # 1 fun1(V1, V2)          100  194.29 6.113593    192.72     0.17
    # 2 fun2(V1, V2)          100   31.78 1.000000     30.74     0.95
    
    0 讨论(0)
  • 2021-01-22 20:35

    The function fun1 can be speeded up considerably by using the compiler package. Using the code provided by Joshua and extending it with the compiler package:

    library(zoo)  # for na.locf
    library(rbenchmark)
    library(compiler)
    
    v1 <- c(1,NA,NA,NA,1,NA,NA,NA,NA,NA,1,NA,NA,1,NA,1,NA,NA,NA,NA,NA,NA,NA,NA,NA,1)
    v2 <- c(10,10,10,9,10,9,9,9,9,9,10,10,10,11,8,12,12,12,12,12,12,12,12,12,12,13)
    
    fun1 <- function(v1,v2) {
        for (i in 2:length(v1)){
            if (!is.na(v1[i-1]) && is.na(v1[i]) && v2[i]==v2[i-1]){
                v1[i]<-1
            }
        }
        v1
    }
    
    fun2 <- function(v1,v2) {
        # create groups in which we need to assess missing values
        d <- cumsum(as.logical(c(0,diff(v2))))
        # for each group, carry the first obs forward
        ave(v1, d, FUN=function(x) na.locf(x, na.rm=FALSE))
    }
    
    fun3 <- cmpfun(fun1)
    
    fun1(v1,v2)
    fun2(v1,v2)
    all.equal(fun1(v1,v2), fun2(v1,v2))
    all.equal(fun1(v1,v2), fun3(v1,v2))
    
    Nrep <- 1000
    
    V1 <- rep(v1, each=Nrep)
    V2 <- rep(v2, each=Nrep)
    all.equal(fun1(V1,V2), fun2(V1,V2))
    all.equal(fun1(V1,V2), fun3(V1,V2))
    
    benchmark(fun1(V1,V2), fun2(V1,V2), fun3(V1,V2))
    

    we get the following result

    benchmark(fun1(V1,V2), fun2(V1,V2), fun3(V1,V2))
              test replications elapsed relative user.self sys.self user.child
    1 fun1(V1, V2)          100  12.252 5.706567    12.190    0.045          0
    2 fun2(V1, V2)          100   2.147 1.000000     2.133    0.013          0
    3 fun3(V1, V2)          100   3.702 1.724266     3.644    0.023          0
    

    So the compiled fun1 is a lot faster than the original fun1 but still slower than fun2.

    0 讨论(0)
  • 2021-01-22 20:42

    It may not be faster, but v1[i] <- v1[i-1] * (cmp[i-1] == 0) avoids all explicit "if" calls. I can't test it right now, but you might try @James solution vs. looping over this form for, say a vector of 1e4 length to see which executes faster.

    0 讨论(0)
提交回复
热议问题