R: Distm for big data? Calculating minimum distances between two matrices

前端 未结 2 615
南方客
南方客 2021-01-06 10:50

I have two matrices, one is 200K rows long, the other is 20K. For each row (which is a point) in the first matrix, I am trying to find which row (also a point) in the second

相关标签:
2条回答
  • 2021-01-06 11:17

    This would use much less memory, as it does it one row at a time, rather than creating the full distance matrix (though it will be slower)

    library(geosphere)
    rnum <- apply(pixels.latlon, 1, function(x) {
                         dm <- distm(x, grwl.latlon, fun=distHaversine)
                         return(which.min(dm))
                         })
    

    Much of the time is taken up with the complicated Haversine formula. As you are really only interested in finding the closest point, not in the exact distances, we could use a simpler distance measure. Here is an alternative using a formula based on this article http://jonisalonen.com/2014/computing-distance-between-coordinates-can-be-simple-and-fast/, and also using a quadratic approximation to the cosine (which is itself expensive to calculate)...

    #quadratic cosine approximation using lm (run once)
    qcos <- lm(y~x+I(x^2), data.frame(x=0:90, y=cos((0:90)*2*pi/360)))$coefficients
    cosadj <- function(lat) qcos[1]+lat*(qcos[2]+qcos[3]*lat)
    
    #define rough dist function
    roughDist <- function(x,y){#x should be a single (lon,lat), y a (n*2) matrix of (lon,lat)
                latDev <- x[2]-y[,2]
                lonDev <- (x[1]-y[,1])*cosadj(abs(x[2]))
                return(latDev*latDev+lonDev*lonDev) #don't need the usual square root or any scaling parameters
                }
    

    And then you can just replace Haversine with this new function...

    rnum <- apply(pixels.latlon, 1, function(x) {
                         dm <- distm(x, grwl.latlon, fun=roughDist)
                         return(which.min(dm))
                         })
    

    On my machine this runs about three times faster than the Haversine version.

    0 讨论(0)
  • 2021-01-06 11:24

    You can use this R(cpp) function:

    #include <Rcpp.h>
    using namespace Rcpp;
    
    double compute_a(double lat1, double long1, double lat2, double long2) {
    
      double sin_dLat = ::sin((lat2 - lat1) / 2);
      double sin_dLon = ::sin((long2 - long1) / 2);
    
      return sin_dLat * sin_dLat + ::cos(lat1) * ::cos(lat2) * sin_dLon * sin_dLon;
    }
    
    int find_min(double lat1, double long1,
                 const NumericVector& lat2,
                 const NumericVector& long2,
                 int current0) {
    
      int m = lat2.size();
      double lat_k, lat_min, lat_max, a, a0;
      int k, current = current0;
    
      a0 = compute_a(lat1, long1, lat2[current], long2[current]);
      // Search before current0
      lat_min = lat1 - 2 * ::asin(::sqrt(a0));
      for (k = current0 - 1; k >= 0; k--) {
        lat_k = lat2[k];
        if (lat_k > lat_min) {
          a = compute_a(lat1, long1, lat_k, long2[k]);
          if (a < a0) {
            a0 = a;
            current = k;
            lat_min = lat1 - 2 * ::asin(::sqrt(a0));
          }
        } else {
          // No need to search further
          break;
        }
      }
      // Search after current0
      lat_max = lat1 + 2 * ::asin(::sqrt(a0));
      for (k = current0 + 1; k < m; k++) {
        lat_k = lat2[k];
        if (lat_k < lat_max) {
          a = compute_a(lat1, long1, lat_k, long2[k]);
          if (a < a0) {
            a0 = a;
            current = k;
            lat_max = lat1 + 2 * ::asin(::sqrt(a0));
          }
        } else {
          // No need to search further
          break;
        }
      }
    
      return current;
    } 
    
    // [[Rcpp::export]]
    IntegerVector find_closest_point(const NumericVector& lat1,
                                     const NumericVector& long1,
                                     const NumericVector& lat2,
                                     const NumericVector& long2) {
    
      int n = lat1.size();
      IntegerVector res(n);
    
      int current = 0;
      for (int i = 0; i < n; i++) {
        res[i] = current = find_min(lat1[i], long1[i], lat2, long2, current);
      }
    
      return res; // need +1
    }
    
    
    /*** R
    N <- 2000  # 2e6
    M <- 500   # 2e4
    
    pixels.latlon=cbind(runif(N,min=-180, max=-120), runif(N, min=50, max=85))
    grwl.latlon=cbind(runif(M,min=-180, max=-120), runif(M, min=50, max=85))
    # grwl.latlon <- grwl.latlon[order(grwl.latlon[, 2]), ]
    
    library(geosphere)
    system.time({
      #calculate the distance matrix
      dist.matrix = distm(pixels.latlon, grwl.latlon, fun=distHaversine)
      #Pick out the indices of the minimum distance
      rnum=apply(dist.matrix, 1, which.min)
    })
    
    
    find_closest <- function(lat1, long1, lat2, long2) {
    
      toRad <- pi / 180
      lat1  <- lat1  * toRad
      long1 <- long1 * toRad
      lat2  <- lat2  * toRad
      long2 <- long2 * toRad
    
      ord1  <- order(lat1)
      rank1 <- match(seq_along(lat1), ord1)
      ord2  <- order(lat2)
    
      ind <- find_closest_point(lat1[ord1], long1[ord1], lat2[ord2], long2[ord2])
    
      ord2[ind + 1][rank1]
    }
    
    system.time(
      test <- find_closest(pixels.latlon[, 2], pixels.latlon[, 1], 
                           grwl.latlon[, 2], grwl.latlon[, 1])
    )
    all.equal(test, rnum)
    
    N <- 2e4
    M <- 2e4
    pixels.latlon=cbind(runif(N,min=-180, max=-120), runif(N, min=50, max=85))
    grwl.latlon=cbind(long = runif(M,min=-180, max=-120), lat = runif(M, min=50, max=85))
    system.time(
      test <- find_closest(pixels.latlon[, 2], pixels.latlon[, 1], 
                           grwl.latlon[, 2], grwl.latlon[, 1])
    )
    */
    

    It takes 0.5 sec for N = 2e4 and 4.2 sec for N = 2e5. I can't make your code work to compare.

    0 讨论(0)
提交回复
热议问题