Double for-loop operation in R (with an example)

后端 未结 5 810
無奈伤痛
無奈伤痛 2021-01-06 00:02

Please look at the following small working example:

#### Pseudo data
nobs1 <- 4000
nobs2 <- 5000
mylon1 <- runif(nobs1, min=0, max=1)-76
mylat1 <         


        
相关标签:
5条回答
  • 2021-01-06 00:05

    I ran this 3 times, once as you have here, once loading up library compiler (in base R) and running enableJIT(3), and running (with enableJIT) after compiling your thedistance function with cmpfun() (in compiler). The results are (respectively):

    > proc.time() - ptm
       user  system elapsed 
     335.26    0.17  339.83
    
    > proc.time() - ptm
       user  system elapsed 
     120.73    0.12  122.52 
    
    > proc.time() - ptm
       user  system elapsed 
     117.86    0.12  118.95 
    

    The difference with 2 and 3 is neglible, as I think enableJIT just compiles the function anyway, but it looks like there is a decent improvement with enableJIT alone. so just throw

    library(compiler)
    

    enableJIT(3)

    at the top of your code.

    More information can be found here

    0 讨论(0)
  • 2021-01-06 00:12

    Here are two solutions, one is partly vectorised that uses sapply, and I think the other is fully vectorised, but I've written a couple of functions for part of the vector calculations so there may be a better way.

    I also only ran these for nobs1 = 4, 40 & 400 and nobs2 = 5, 50 & 500 as my laptop struggles for memory with large matrices.

    Solution - sapply

    R <- 6371 # Earth mean radius [km]
    
    delta.lon <- sapply(mylon2, "-", mylon1)
    delta.lon <- sin(delta.lon/2)^2
    
    coslat2 <- cos(mylat2)
    coslat1 <- cos(mylat1)
    
    delta.lan <- sapply(mylat2, "-", mylat1)
    delta.lan <- sin(delta.lan/2)^2
    
    a <- delta.lan + t(sapply(coslat1, "*", coslat2) * t(delta.lon))
    b <- ifelse(sqrt(a)<1,sqrt(a),1)
    c <- asin(b)
    d <- 2 * c
    e <- R * d
    
    f <- c(t(e))
    

    And to check the output

    sum(f)
    sum(mydistance)
    
    all.equal(f, mydistance)
    
    
    > sum(f)
    [1] 647328856
    > sum(mydistance)
    [1] 647328856
    > all.equal(f, mydistance)
    [1] TRUE
    

    Solution - with functions

    R <- 6371 # Earth mean radius [km]
    
    #function to calculate the difference between each
    #element of different sized vectors and return a matrix
    vecEleDif <- function(vec1, vec2)
    {
        #the order of arguments is vec2 - vec1
        len1 <- length(vec1);len2 <- length(vec2)
        dummy1 <- matrix(rep.int(vec1, len2), nrow=len1)
        dummy2 <- matrix(rep.int(vec2, len1), nrow=len2)
        res <- t(dummy2 - t(dummy1))
    }
    
    #Function to calculate the product of each 
    #element of two different size vectors
    vecEleProd <- function(vec1, vec2)
    {
        #the order of the arguments is vec2 * vec1
        len1 <- length(vec1); len2 <- length(vec2)
        dummy1 <- matrix(rep.int(vec1, len2), nrow=len1)
        dummy2 <- matrix(rep.int(vec2, len1), nrow=len2)
        res <- t(dummy2 * t(dummy1))
    }
    
    ptm <- proc.time()
    
    delta.lon <- sin(vecEleDif(mylon2, mylon1)/2)^2
    delta.lan <- sin(vecEleDif(mylat2, mylat1)/2)^2
    cosprod <- vecEleProd(cos(mylat1), cos(mylat2))
    
    a <- delta.lan + (t(cosprod) * delta.lon)
    b <- ifelse(sqrt(a)<1,sqrt(a),1)
    c <- asin(b)
    d <- 2 * c
    e <- R * d
    f <- c((e))
    
    proc.time() - ptm
    

    and check the output:

    sum(f)
    sum(mydistance)
    
    all.equal(f, mydistance)
    
    > sum(f)
    [1] 647745044
    > sum(mydistance)
    [1] 647745044
    > 
    > all.equal(f, mydistance)
    [1] TRUE
    
    0 讨论(0)
  • library(compiler)
    enableJIT(3)
    
    nobs1 <- 4000
    nobs2 <- 4000
    mylon1 <- runif(nobs1, min=0, max=1)-76
    mylat1 <- runif(nobs1, min=0, max=1)+37
    mylon2 <- runif(nobs2, min=0, max=1)-76
    mylat2 <- runif(nobs2, min=0, max=1)+37
    
    trixie<-matrix(nrow=length(mylon1),ncol=4)
    trixie[,1]<-mylon1
    trixie[,2]<-mylat1
    trixie[,3]<-mylon2
    trixie[,4]<-mylat2
    
    #### define a distance function
    thedistance <- function(lon1, lat1, lon2, lat2) {
      R <- 6371 # Earth mean radius [km]
      delta.lon <- (lon2 - lon1)
      delta.lat <- (lat2 - lat1)
      a <- sin(delta.lat/2)^2 + cos(lat1) * cos(lat2) * sin(delta.lon/2)^2
      c <- 2 * asin(min(1,sqrt(a)))
      d = R * c
      return(d)
    }
    calc_distance <- cmpfun(thedistance)
    ptm <- proc.time()
    
    #### Calculate distances between locations
    # Initiate the resulting distance vector
    distances<-apply(trixie,1, function(x) {
      return(apply(trixie,1, function(y) {
        return(calc_distance(x[1],x[2],y[3],y[4]))
      }))
    })
    
    proc.time() - ptm
    
    0 讨论(0)
  • 2021-01-06 00:27

    Here's an option that decreases the runtime to ~2 seconds on my machine because part of it is vectorized.

    A direct comparison with the original solution follows.

    Test data:

    nobs1 <- 4000
    nobs2 <- 5000
    mylon1 <- runif(nobs1, min=0, max=1)-76
    mylat1 <- runif(nobs1, min=0, max=1)+37
    mylon2 <- runif(nobs2, min=0, max=1)-76
    mylat2 <- runif(nobs2, min=0, max=1)+37
    

    Original solution:

    #### define a distance function
    thedistance <- function(lon1, lat1, lon2, lat2) {
      R <- 6371 # Earth mean radius [km]
      delta.lon <- (lon2 - lon1)
      delta.lat <- (lat2 - lat1)
      a <- sin(delta.lat/2)^2 + cos(lat1) * cos(lat2) * sin(delta.lon/2)^2
      c <- 2 * asin(min(1,sqrt(a)))
      d = R * c
      return(d)
    }
    
    ptm <- proc.time()
    
    #### Calculate distances between locations
    # Initiate the resulting distance vector
    ndistance <- nobs1*nobs2 # The number of distances
    mydistance <- vector(mode = "numeric", length = ndistance)
    
    k=1
    for (i in 1:nobs1) {
      for (j in 1:nobs2) {
        mydistance[k] = thedistance(mylon1[i],mylat1[i],mylon2[j],mylat2[j])
        k=k+1
      }
    }
    
    proc.time() - ptm
       User      System     elapsed 
    148.243       0.681     148.901 
    

    My approach:

    # modified (vectorized) distance function:
    thedistance2 <- function(lon1, lat1, lon2, lat2) {
      R <- 6371 # Earth mean radius [km]
      delta.lon <- (lon2 - lon1)
      delta.lat <- (lat2 - lat1)
      a <- sin(delta.lat/2)^2 + cos(lat1) * cos(lat2) * sin(delta.lon/2)^2
      c <- 2 * asin(pmin(1,sqrt(a)))   # pmin instead of min
      d = R * c
      return(d)
    }
    
    ptm2 <- proc.time()
    
    lst <- vector("list", length = nobs1)
    
    for (i in seq_len(nobs1)) {
        lst[[i]] = thedistance2(mylon1[i],mylat1[i],mylon2,mylat2)
    }
    
    res <- unlist(lst)
    
    proc.time() - ptm2
       User      System     elapsed
      1.988       0.331       2.319 
    

    Are the results all equal?

    all.equal(mydistance, res)
    #[1] TRUE
    
    0 讨论(0)
  • 2021-01-06 00:27

    Here is an other approach using Rcpp. On my machine it was slightly faster than beginneR's very nice vectorized version.

    library(Rcpp)
    
    nobs1 <- 4000
    nobs2 <- 5000
    mylon1 <- runif(nobs1, min=0, max=1)-76
    mylat1 <- runif(nobs1, min=0, max=1)+37
    mylon2 <- runif(nobs2, min=0, max=1)-76
    mylat2 <- runif(nobs2, min=0, max=1)+37
    
    sourceCpp("dist.cpp")
    
    system.time({
      lst <- vector("list", length = nobs1)
      for (i in seq_len(nobs1)) {
        lst[[i]] = thedistance2(mylon1[i],mylat1[i],mylon2,mylat2)
      }
      res <- unlist(lst)
    })
    
    ##  user  system elapsed 
    ## 4.636   0.084   4.737 
    
    
    system.time(res2 <- dist_vec(mylon1, mylat1, mylon2, mylat2))
    ##  user  system elapsed 
    ## 2.584   0.044   2.633 
    
    all.equal(res, res2)
    ## TRUE 
    

    And the file dist.cpp countains

    #include <Rcpp.h>
    #include <iostream>
    #include <math.h>
    
    using namespace Rcpp;
    
    double dist_cpp(double lon1, double lat1,
                    double lon2, double lat2){
      int R = 6371; 
      double delta_lon = (lon2 - lon1);
      double delta_lat = (lat2 - lat1);
      double a = pow(sin(delta_lat/2.0), 2) + cos(lat1) * cos(lat2) * pow(sin(delta_lon/2.0), 2);
      double c;
      a = sqrt(a);
      if (a < 1.0) {
        c = 2 * asin(a);
      } else {
        c = 2 * asin(1);
      }
      double d = R * c;
      return(d);
    
    }
    
    // [[Rcpp::export]]
    NumericVector dist_vec(NumericVector lon1, NumericVector lat1, 
                           NumericVector lon2, NumericVector lat2) {
      int n = lon1.size();
      int m = lon2.size();
      int k = n * m;
      NumericVector res(k);
      int c = 0;
      for (int i = 0; i < n; i++) {
        for (int j = 0; j < m; j++) {
          res[c] = dist_cpp(lon1[i], lat1[i], lon2[j], lat2[j]);
          c++;
        }
      }
      return(res);
    }
    
    0 讨论(0)
提交回复
热议问题