问题
First of all, this is NOT the problem of calculating Euclidean distance between two matrices.
Assuming I have two matrices x
and y
, e.g.,
set.seed(1)
x <- matrix(rnorm(15), ncol=5)
y <- matrix(rnorm(20), ncol=5)
where
> x
[,1] [,2] [,3] [,4] [,5]
[1,] -0.6264538 1.5952808 0.4874291 -0.3053884 -0.6212406
[2,] 0.1836433 0.3295078 0.7383247 1.5117812 -2.2146999
[3,] -0.8356286 -0.8204684 0.5757814 0.3898432 1.1249309
> y
[,1] [,2] [,3] [,4] [,5]
[1,] -0.04493361 0.59390132 -1.98935170 -1.4707524 -0.10278773
[2,] -0.01619026 0.91897737 0.61982575 -0.4781501 0.38767161
[3,] 0.94383621 0.78213630 -0.05612874 0.4179416 -0.05380504
[4,] 0.82122120 0.07456498 -0.15579551 1.3586796 -1.37705956
Then I want to get distance matrix distmat
of dimension 3-by-4, where the element distmat[i,j]
is the value from norm(x[1,]-y[2,],"2")
or dist(rbind(x[1,],y[2,]))
.
- My code
distmat <- as.matrix(unname(unstack(within(idx<-expand.grid(seq(nrow(x)),seq(nrow(y))), d <-sqrt(rowSums((x[Var1,]-y[Var2,])**2))), d~Var2)))
which gives
> distmat
[,1] [,2] [,3] [,4]
[1,] 3.016991 1.376622 2.065831 2.857002
[2,] 4.573625 3.336707 2.698124 1.412811
[3,] 3.764925 2.235186 2.743056 3.358577
but I don't think my code is elegant or efficient enough when with x
and y
of large number of rows.
- Objective
I am looking forward to a much faster and more elegant code with base R for this goal. Appreciated in advance!
- Benchmark Template (in updating)
For your convenience, you can use the following for benchmark to see if your code is faster:
set.seed(1)
x <- matrix(rnorm(15000), ncol=5)
y <- matrix(rnorm(20000), ncol=5)
# my customized approach
method_ThomasIsCoding_v1 <- function() {
as.matrix(unname(unstack(within(idx<-expand.grid(seq(nrow(x)),seq(nrow(y))), d <-sqrt(rowSums((x[Var1,]-y[Var2,])**2))), d~Var2)))
}
method_ThomasIsCoding_v2 <- function() {
`dim<-`(with(idx<-expand.grid(seq(nrow(x)),seq(nrow(y))), sqrt(rowSums((x[Var1,]-y[Var2,])**2))),c(nrow(x),nrow(y)))
}
method_ThomasIsCoding_v3 <- function() {
`dim<-`(with(idx1<-list(Var1 = rep(1:nrow(x), nrow(y)), Var2 = rep(1:nrow(y), each = nrow(x))), sqrt(rowSums((x[Var1,]-y[Var2,])**2))),c(nrow(x),nrow(y)))
}
# approach by AllanCameron
method_AllanCameron <- function()
{
`dim<-`(sqrt(rowSums((x[rep(1:nrow(x), nrow(y)),] - y[rep(1:nrow(y), each = nrow(x)),])^2)), c(nrow(x), nrow(y)))
}
# approach by F.Prive
method_F.Prive <- function() {
sqrt(outer(rowSums(x^2), rowSums(y^2), '+') - tcrossprod(x, 2 * y))
}
# an existing approach by A. Webb from https://stackoverflow.com/a/35107198/12158757
method_A.Webb <- function() {
euclidean_distance <- function(p,q) sqrt(sum((p - q)**2))
outer(
data.frame(t(x)),
data.frame(t(y)),
Vectorize(euclidean_distance)
)
}
bm <- microbenchmark::microbenchmark(
method_ThomasIsCoding_v1(),
method_ThomasIsCoding_v2(),
method_ThomasIsCoding_v3(),
method_AllanCameron(),
method_F.Prive(),
# method_A.Webb(),
unit = "relative",
check = "equivalent",
times = 10
)
bm
such that
Unit: relative
expr min lq mean median uq max neval
method_ThomasIsCoding_v1() 9.471806 8.838704 7.308433 7.567879 6.989114 5.429136 10
method_ThomasIsCoding_v2() 4.623405 4.469646 3.817199 4.024436 3.703473 2.854471 10
method_ThomasIsCoding_v3() 4.881620 4.832024 4.070866 4.134011 3.924366 3.367746 10
method_AllanCameron() 5.654533 5.279920 4.436071 4.772527 4.184927 3.157814 10
method_F.Prive() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 10
回答1:
method_XXX <- function() {
sqrt(outer(rowSums(x^2), rowSums(y^2), '+') - tcrossprod(x, 2 * y))
}
Unit: relative
expr min lq mean median uq max
method_ThomasIsCoding_v1() 12.151624 10.486417 9.213107 10.162740 10.235274 5.278517
method_ThomasIsCoding_v2() 6.923647 6.055417 5.549395 6.161603 6.140484 3.438976
method_ThomasIsCoding_v3() 7.133525 6.218283 5.709549 6.438797 6.382204 3.383227
method_AllanCameron() 7.093680 6.071482 5.776172 6.447973 6.497385 3.608604
method_XXX() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000
回答2:
The proxy
package has a function for this.
library(proxy)
dist(x, y)
[,1] [,2] [,3] [,4]
[1,] 3.016991 1.376622 2.065831 2.857002
[2,] 4.573625 3.336707 2.698124 1.412811
[3,] 3.764925 2.235186 2.743056 3.358577
回答3:
I've kept it simple and base R whilst managing a one-liner that gives a 3* speedup.
`dim<-`(sqrt(rowSums((x[rep(1:nrow(x), nrow(y)),] - y[rep(1:nrow(y), each = nrow(x)),])^2)), c(nrow(x), nrow(y)))
# [,1] [,2] [,3] [,4]
# [1,] 3.016991 1.376622 2.065831 2.857002
# [2,] 4.573625 3.336707 2.698124 1.412811
# [3,] 3.764925 2.235186 2.743056 3.358577
Unfortunately my PC's memory choked on the tests with big matrices, so I had to reduce the dimensions by an order of magnitude to run the tests.
Full code shown:
set.seed(1)
x <- matrix(rnorm(1500), ncol=5)
y <- matrix(rnorm(2000), ncol=5)
# my customized approach
method_ThomasIsCoding <- function() {
as.matrix(unname(unstack(within(idx<-expand.grid(seq(nrow(x)),seq(nrow(y))), d <-sqrt(rowSums((x[Var1,]-y[Var2,])**2))), d~Var2)))
}
# an existing approach by A. Webb from https://stackoverflow.com/a/35107198/12158757
method_A.Webb <- function() {
euclidean_distance <- function(p,q) sqrt(sum((p - q)**2))
outer(
data.frame(t(x)),
data.frame(t(y)),
Vectorize(euclidean_distance)
)
}
# your approach
method_AllanCameron <- function()
{
`dim<-`(sqrt(rowSums((x[rep(1:nrow(x), nrow(y)),] - y[rep(1:nrow(y), each = nrow(x)),])^2)), c(nrow(x), nrow(y)))
}
microbenchmark::microbenchmark(
method_ThomasIsCoding(),
method_A.Webb(),
method_AllanCameron(),
times = 10
)
Result
# Unit: milliseconds
# expr min lq mean median uq max neval
# method_ThomasIsCoding() 63.08587 64.70988 69.59648 67.73379 75.90281 76.92903 10
# method_A.Webb() 330.44824 349.90977 376.36962 368.52164 392.11780 446.57269 10
# method_AllanCameron() 16.29938 18.20057 21.02634 20.45267 22.41767 31.28646 10
来源:https://stackoverflow.com/questions/59679046/speed-challenge-any-faster-method-to-calculate-distance-matrix-between-rows-of