R fill vector efficiently

后端未结

关注

 4  651

I have a fairly big vector (>500,000 in length). It contains a bunch of NA interspersed with 1 and it is always guaranteed that it begins with 1<


                      
              相关标签:


      
      
        
          4条回答        

        
                         				            
            
           
            
                              
                
              
              
                
                  轻奢々        
                
              
                            
                2021-01-22 20:26
              
            
            
                                                                       
A vectorised solution would look like:

v1[-1] <- ifelse(diff(v2), 0, v1[-length(v1)])


But the above won't work, and I don't think you can avoid an explicit loop since, if I understand correctly, you want to propagate new values. So, how about:

cmp <- diff(v2)
for (i in 2:length(v1)){
    v1[i] <- if(cmp[i-1]) 0 else v1[i-1]
}

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  我寻月下人不归        
                
              
                            
                2021-01-22 20:31
              
            
            
                                                                       
There's probably a faster solution, but this is the best I could come up with in a couple minutes.  My solution is slower than the OPs for small vectors, but increasingly faster for larger vectors.

library(zoo)  # for na.locf
library(rbenchmark)

v1<-c(1,NA,NA,NA,1,NA,NA,NA,NA,NA,1,NA,NA,1,NA,1,NA,NA,NA,NA,NA,NA,NA,NA,NA,1)
v2<-c(10,10,10,9,10,9,9,9,9,9,10,10,10,11,8,12,12,12,12,12,12,12,12,12,12,13)
V1 <- rep(v1, each=20000)  # 520,000 observations
V2 <- rep(v2, each=20000)  # 520,000 observations

fun1 <- function(v1,v2) {
  for (i in 2:length(v1)){ 
    if (!is.na(v1[i-1]) && is.na(v1[i]) && v2[i]==v2[i-1]){
      v1[i]<-1
    }
  }
  v1
}
fun2 <- function(v1,v2) {
  # create groups in which we need to assess missing values
  d <- cumsum(as.logical(c(0,diff(v2))))
  # for each group, carry the first obs forward
  ave(v1, d, FUN=function(x) na.locf(x, na.rm=FALSE))
}
all.equal(fun1(V1,V2), fun2(V1,V2))
# [1] TRUE
benchmark(fun1(V1,V2), fun2(V1,V2))
#           test replications elapsed relative user.self sys.self
# 1 fun1(V1, V2)          100  194.29 6.113593    192.72     0.17
# 2 fun2(V1, V2)          100   31.78 1.000000     30.74     0.95

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  伪装坚强ぢ        
                
              
                            
                2021-01-22 20:35
              
            
            
                                                                       
The function fun1 can be speeded up considerably by using the compiler package.
Using the code provided by Joshua and extending it with the compiler package:

library(zoo)  # for na.locf
library(rbenchmark)
library(compiler)

v1 <- c(1,NA,NA,NA,1,NA,NA,NA,NA,NA,1,NA,NA,1,NA,1,NA,NA,NA,NA,NA,NA,NA,NA,NA,1)
v2 <- c(10,10,10,9,10,9,9,9,9,9,10,10,10,11,8,12,12,12,12,12,12,12,12,12,12,13)

fun1 <- function(v1,v2) {
    for (i in 2:length(v1)){
        if (!is.na(v1[i-1]) && is.na(v1[i]) && v2[i]==v2[i-1]){
            v1[i]<-1
        }
    }
    v1
}

fun2 <- function(v1,v2) {
    # create groups in which we need to assess missing values
    d <- cumsum(as.logical(c(0,diff(v2))))
    # for each group, carry the first obs forward
    ave(v1, d, FUN=function(x) na.locf(x, na.rm=FALSE))
}

fun3 <- cmpfun(fun1)

fun1(v1,v2)
fun2(v1,v2)
all.equal(fun1(v1,v2), fun2(v1,v2))
all.equal(fun1(v1,v2), fun3(v1,v2))

Nrep <- 1000

V1 <- rep(v1, each=Nrep)
V2 <- rep(v2, each=Nrep)
all.equal(fun1(V1,V2), fun2(V1,V2))
all.equal(fun1(V1,V2), fun3(V1,V2))

benchmark(fun1(V1,V2), fun2(V1,V2), fun3(V1,V2))


we get the following result

benchmark(fun1(V1,V2), fun2(V1,V2), fun3(V1,V2))
          test replications elapsed relative user.self sys.self user.child
1 fun1(V1, V2)          100  12.252 5.706567    12.190    0.045          0
2 fun2(V1, V2)          100   2.147 1.000000     2.133    0.013          0
3 fun3(V1, V2)          100   3.702 1.724266     3.644    0.023          0


So the compiled fun1 is a lot faster than the original fun1 but still slower than fun2.
                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  抹茶落季        
                
              
                            
                2021-01-22 20:42
              
            
            
                                                                       
It may not be faster, but v1[i] <-  v1[i-1] * (cmp[i-1] == 0)  avoids all explicit "if" calls.  I can't test it right now, but you might try @James solution vs. looping over this form for, say a vector of 1e4 length to see which executes faster.
                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
                             
        
        
          
            
            
              
              
            
    


                                 
              
            
                          
    

        
         
                验证码
                
                  
                
                
                   看不清?
                
              
                                  
                    
   
                 
             
              提交回复