Please look at the following small working example:
#### Pseudo data
nobs1 <- 4000
nobs2 <- 5000
mylon1 <- runif(nobs1, min=0, max=1)-76
mylat1 <
I ran this 3 times, once as you have here, once loading up library compiler
(in base R) and running enableJIT(3)
, and running (with enableJIT
) after compiling your thedistance
function with cmpfun()
(in compiler
). The results are (respectively):
> proc.time() - ptm
user system elapsed
335.26 0.17 339.83
> proc.time() - ptm
user system elapsed
120.73 0.12 122.52
> proc.time() - ptm
user system elapsed
117.86 0.12 118.95
The difference with 2 and 3 is neglible, as I think enableJIT
just compiles the function anyway, but it looks like there is a decent improvement with enableJIT
alone. so just throw
library(compiler)
enableJIT(3)
at the top of your code.
More information can be found here
Here are two solutions, one is partly vectorised that uses sapply
, and I think the other is fully vectorised, but I've written a couple of functions for part of the vector calculations so there may be a better way.
I also only ran these for nobs1 = 4, 40 & 400
and nobs2 = 5, 50 & 500
as my laptop struggles for memory with large matrices
.
Solution - sapply
R <- 6371 # Earth mean radius [km]
delta.lon <- sapply(mylon2, "-", mylon1)
delta.lon <- sin(delta.lon/2)^2
coslat2 <- cos(mylat2)
coslat1 <- cos(mylat1)
delta.lan <- sapply(mylat2, "-", mylat1)
delta.lan <- sin(delta.lan/2)^2
a <- delta.lan + t(sapply(coslat1, "*", coslat2) * t(delta.lon))
b <- ifelse(sqrt(a)<1,sqrt(a),1)
c <- asin(b)
d <- 2 * c
e <- R * d
f <- c(t(e))
And to check the output
sum(f)
sum(mydistance)
all.equal(f, mydistance)
> sum(f)
[1] 647328856
> sum(mydistance)
[1] 647328856
> all.equal(f, mydistance)
[1] TRUE
Solution - with functions
R <- 6371 # Earth mean radius [km]
#function to calculate the difference between each
#element of different sized vectors and return a matrix
vecEleDif <- function(vec1, vec2)
{
#the order of arguments is vec2 - vec1
len1 <- length(vec1);len2 <- length(vec2)
dummy1 <- matrix(rep.int(vec1, len2), nrow=len1)
dummy2 <- matrix(rep.int(vec2, len1), nrow=len2)
res <- t(dummy2 - t(dummy1))
}
#Function to calculate the product of each
#element of two different size vectors
vecEleProd <- function(vec1, vec2)
{
#the order of the arguments is vec2 * vec1
len1 <- length(vec1); len2 <- length(vec2)
dummy1 <- matrix(rep.int(vec1, len2), nrow=len1)
dummy2 <- matrix(rep.int(vec2, len1), nrow=len2)
res <- t(dummy2 * t(dummy1))
}
ptm <- proc.time()
delta.lon <- sin(vecEleDif(mylon2, mylon1)/2)^2
delta.lan <- sin(vecEleDif(mylat2, mylat1)/2)^2
cosprod <- vecEleProd(cos(mylat1), cos(mylat2))
a <- delta.lan + (t(cosprod) * delta.lon)
b <- ifelse(sqrt(a)<1,sqrt(a),1)
c <- asin(b)
d <- 2 * c
e <- R * d
f <- c((e))
proc.time() - ptm
and check the output:
sum(f)
sum(mydistance)
all.equal(f, mydistance)
> sum(f)
[1] 647745044
> sum(mydistance)
[1] 647745044
>
> all.equal(f, mydistance)
[1] TRUE
library(compiler)
enableJIT(3)
nobs1 <- 4000
nobs2 <- 4000
mylon1 <- runif(nobs1, min=0, max=1)-76
mylat1 <- runif(nobs1, min=0, max=1)+37
mylon2 <- runif(nobs2, min=0, max=1)-76
mylat2 <- runif(nobs2, min=0, max=1)+37
trixie<-matrix(nrow=length(mylon1),ncol=4)
trixie[,1]<-mylon1
trixie[,2]<-mylat1
trixie[,3]<-mylon2
trixie[,4]<-mylat2
#### define a distance function
thedistance <- function(lon1, lat1, lon2, lat2) {
R <- 6371 # Earth mean radius [km]
delta.lon <- (lon2 - lon1)
delta.lat <- (lat2 - lat1)
a <- sin(delta.lat/2)^2 + cos(lat1) * cos(lat2) * sin(delta.lon/2)^2
c <- 2 * asin(min(1,sqrt(a)))
d = R * c
return(d)
}
calc_distance <- cmpfun(thedistance)
ptm <- proc.time()
#### Calculate distances between locations
# Initiate the resulting distance vector
distances<-apply(trixie,1, function(x) {
return(apply(trixie,1, function(y) {
return(calc_distance(x[1],x[2],y[3],y[4]))
}))
})
proc.time() - ptm
Here's an option that decreases the runtime to ~2 seconds on my machine because part of it is vectorized.
A direct comparison with the original solution follows.
Test data:
nobs1 <- 4000
nobs2 <- 5000
mylon1 <- runif(nobs1, min=0, max=1)-76
mylat1 <- runif(nobs1, min=0, max=1)+37
mylon2 <- runif(nobs2, min=0, max=1)-76
mylat2 <- runif(nobs2, min=0, max=1)+37
Original solution:
#### define a distance function
thedistance <- function(lon1, lat1, lon2, lat2) {
R <- 6371 # Earth mean radius [km]
delta.lon <- (lon2 - lon1)
delta.lat <- (lat2 - lat1)
a <- sin(delta.lat/2)^2 + cos(lat1) * cos(lat2) * sin(delta.lon/2)^2
c <- 2 * asin(min(1,sqrt(a)))
d = R * c
return(d)
}
ptm <- proc.time()
#### Calculate distances between locations
# Initiate the resulting distance vector
ndistance <- nobs1*nobs2 # The number of distances
mydistance <- vector(mode = "numeric", length = ndistance)
k=1
for (i in 1:nobs1) {
for (j in 1:nobs2) {
mydistance[k] = thedistance(mylon1[i],mylat1[i],mylon2[j],mylat2[j])
k=k+1
}
}
proc.time() - ptm
User System elapsed
148.243 0.681 148.901
My approach:
# modified (vectorized) distance function:
thedistance2 <- function(lon1, lat1, lon2, lat2) {
R <- 6371 # Earth mean radius [km]
delta.lon <- (lon2 - lon1)
delta.lat <- (lat2 - lat1)
a <- sin(delta.lat/2)^2 + cos(lat1) * cos(lat2) * sin(delta.lon/2)^2
c <- 2 * asin(pmin(1,sqrt(a))) # pmin instead of min
d = R * c
return(d)
}
ptm2 <- proc.time()
lst <- vector("list", length = nobs1)
for (i in seq_len(nobs1)) {
lst[[i]] = thedistance2(mylon1[i],mylat1[i],mylon2,mylat2)
}
res <- unlist(lst)
proc.time() - ptm2
User System elapsed
1.988 0.331 2.319
Are the results all equal?
all.equal(mydistance, res)
#[1] TRUE
Here is an other approach using Rcpp
. On my machine it was slightly faster than beginneR's very nice vectorized version.
library(Rcpp)
nobs1 <- 4000
nobs2 <- 5000
mylon1 <- runif(nobs1, min=0, max=1)-76
mylat1 <- runif(nobs1, min=0, max=1)+37
mylon2 <- runif(nobs2, min=0, max=1)-76
mylat2 <- runif(nobs2, min=0, max=1)+37
sourceCpp("dist.cpp")
system.time({
lst <- vector("list", length = nobs1)
for (i in seq_len(nobs1)) {
lst[[i]] = thedistance2(mylon1[i],mylat1[i],mylon2,mylat2)
}
res <- unlist(lst)
})
## user system elapsed
## 4.636 0.084 4.737
system.time(res2 <- dist_vec(mylon1, mylat1, mylon2, mylat2))
## user system elapsed
## 2.584 0.044 2.633
all.equal(res, res2)
## TRUE
And the file dist.cpp
countains
#include <Rcpp.h>
#include <iostream>
#include <math.h>
using namespace Rcpp;
double dist_cpp(double lon1, double lat1,
double lon2, double lat2){
int R = 6371;
double delta_lon = (lon2 - lon1);
double delta_lat = (lat2 - lat1);
double a = pow(sin(delta_lat/2.0), 2) + cos(lat1) * cos(lat2) * pow(sin(delta_lon/2.0), 2);
double c;
a = sqrt(a);
if (a < 1.0) {
c = 2 * asin(a);
} else {
c = 2 * asin(1);
}
double d = R * c;
return(d);
}
// [[Rcpp::export]]
NumericVector dist_vec(NumericVector lon1, NumericVector lat1,
NumericVector lon2, NumericVector lat2) {
int n = lon1.size();
int m = lon2.size();
int k = n * m;
NumericVector res(k);
int c = 0;
for (int i = 0; i < n; i++) {
for (int j = 0; j < m; j++) {
res[c] = dist_cpp(lon1[i], lat1[i], lon2[j], lat2[j]);
c++;
}
}
return(res);
}