
(Speed Challenge) Is there a faster way to calculate the matrix of Euclidean distances between the rows of two matrices?

First of all, this is NOT the problem of calculating the Euclidean distance between two matrices.

Assume I have two matrices x and y, e.g.,

set.seed(1)
x <- matrix(rnorm(15), ncol=5)
y <- matrix(rnorm(20), ncol=5)

where

> x
           [,1]       [,2]      [,3]       [,4]       [,5]
[1,] -0.6264538  1.5952808 0.4874291 -0.3053884 -0.6212406
[2,]  0.1836433  0.3295078 0.7383247  1.5117812 -2.2146999
[3,] -0.8356286 -0.8204684 0.5757814  0.3898432  1.1249309

> y
            [,1]       [,2]        [,3]       [,4]        [,5]
[1,] -0.04493361 0.59390132 -1.98935170 -1.4707524 -0.10278773
[2,] -0.01619026 0.91897737  0.61982575 -0.4781501  0.38767161
[3,]  0.94383621 0.78213630 -0.05612874  0.4179416 -0.05380504
[4,]  0.82122120 0.07456498 -0.15579551  1.3586796 -1.37705956

Then I want to get a distance matrix distmat of dimension 3-by-4, where the element distmat[i,j] is the value of norm(x[i,]-y[j,],"2"), or equivalently dist(rbind(x[i,],y[j,])).
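For clarity, the target matrix can be produced by a plain (but slow) double loop; this is only a reference sketch of the definition, not an entry in the speed challenge:

distmat_ref <- matrix(0, nrow = nrow(x), ncol = nrow(y))
for (i in seq_len(nrow(x))) {
  for (j in seq_len(nrow(y))) {
    # Euclidean distance between row i of x and row j of y
    distmat_ref[i, j] <- sqrt(sum((x[i, ] - y[j, ])^2))
  }
}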

  • My code
distmat <- as.matrix(unname(unstack(within(idx<-expand.grid(seq(nrow(x)),seq(nrow(y))), d <-sqrt(rowSums((x[Var1,]-y[Var2,])**2))), d~Var2)))

which gives

> distmat
         [,1]     [,2]     [,3]     [,4]
[1,] 3.016991 1.376622 2.065831 2.857002
[2,] 4.573625 3.336707 2.698124 1.412811
[3,] 3.764925 2.235186 2.743056 3.358577

but I don't think my code is elegant or efficient enough when x and y have a large number of rows.
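For readers unfamiliar with unstack(), the one-liner above can be unpacked roughly like this (intermediate names are illustrative):

idx <- expand.grid(Var1 = seq(nrow(x)), Var2 = seq(nrow(y)))   # all (i, j) row-index pairs, i varying fastest
idx$d <- sqrt(rowSums((x[idx$Var1, ] - y[idx$Var2, ])^2))      # distance for every pair
distmat <- as.matrix(unname(unstack(idx, d ~ Var2)))           # reshape the long vector into a 3-by-4 matrix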

  • Objective

I am looking for much faster and more elegant base R code to achieve this. Thanks in advance!

  • Benchmark Template (being updated)

For your convenience, you can use the following benchmark to check whether your code is faster:

set.seed(1)
x <- matrix(rnorm(15000), ncol=5)
y <- matrix(rnorm(20000), ncol=5)
# my customized approach
method_ThomasIsCoding_v1 <- function() {
  as.matrix(unname(unstack(within(idx<-expand.grid(seq(nrow(x)),seq(nrow(y))), d <-sqrt(rowSums((x[Var1,]-y[Var2,])**2))), d~Var2)))
}
method_ThomasIsCoding_v2 <- function() {
  `dim<-`(with(idx<-expand.grid(seq(nrow(x)),seq(nrow(y))), sqrt(rowSums((x[Var1,]-y[Var2,])**2))),c(nrow(x),nrow(y)))
}
method_ThomasIsCoding_v3 <- function() {
  `dim<-`(with(idx1<-list(Var1 = rep(1:nrow(x), nrow(y)), Var2 = rep(1:nrow(y), each = nrow(x))), sqrt(rowSums((x[Var1,]-y[Var2,])**2))),c(nrow(x),nrow(y)))
}
# approach by AllanCameron
method_AllanCameron <- function()
{
  `dim<-`(sqrt(rowSums((x[rep(1:nrow(x), nrow(y)),] - y[rep(1:nrow(y), each = nrow(x)),])^2)), c(nrow(x), nrow(y)))
}
# approach by F.Prive
method_F.Prive <- function() {
  sqrt(outer(rowSums(x^2), rowSums(y^2), '+') - tcrossprod(x, 2 * y))
}
# an existing approach by A. Webb from https://stackoverflow.com/a/35107198/12158757
method_A.Webb <- function() {
  euclidean_distance <- function(p,q) sqrt(sum((p - q)**2))
  outer(
    data.frame(t(x)),
    data.frame(t(y)),
    Vectorize(euclidean_distance)
  )
}



bm <- microbenchmark::microbenchmark(
  method_ThomasIsCoding_v1(),
  method_ThomasIsCoding_v2(),
  method_ThomasIsCoding_v3(),
  method_AllanCameron(),
  method_F.Prive(),
  # method_A.Webb(),
  unit = "relative",
  check = "equivalent",
  times = 10
)
bm

which gives something like

Unit: relative
                       expr      min       lq     mean   median       uq      max neval
 method_ThomasIsCoding_v1() 9.471806 8.838704 7.308433 7.567879 6.989114 5.429136    10
 method_ThomasIsCoding_v2() 4.623405 4.469646 3.817199 4.024436 3.703473 2.854471    10
 method_ThomasIsCoding_v3() 4.881620 4.832024 4.070866 4.134011 3.924366 3.367746    10
      method_AllanCameron() 5.654533 5.279920 4.436071 4.772527 4.184927 3.157814    10
           method_F.Prive() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000    10
Another answer proposes the same tcrossprod-based computation (identical to method_F.Prive() in the benchmark above) directly:

method_XXX <- function() {
  sqrt(outer(rowSums(x^2), rowSums(y^2), '+') - tcrossprod(x, 2 * y))
}

Unit: relative
                       expr       min        lq     mean    median        uq      max
 method_ThomasIsCoding_v1() 12.151624 10.486417 9.213107 10.162740 10.235274 5.278517
 method_ThomasIsCoding_v2()  6.923647  6.055417 5.549395  6.161603  6.140484 3.438976
 method_ThomasIsCoding_v3()  7.133525  6.218283 5.709549  6.438797  6.382204 3.383227
      method_AllanCameron()  7.093680  6.071482 5.776172  6.447973  6.497385 3.608604
               method_XXX()  1.000000  1.000000 1.000000  1.000000  1.000000 1.000000
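Both method_F.Prive() and method_XXX() are fast because they never materialise the full pairwise difference array: they use the identity ||x_i - y_j||^2 = ||x_i||^2 + ||y_j||^2 - 2 * (x_i . y_j), so the cross terms for all pairs come from a single BLAS-backed matrix product, tcrossprod(x, y), and the squared row norms are added with outer(). Below is a minimal sketch checking this against the definition; note that floating-point cancellation can make the term under the square root very slightly negative for nearly identical rows, so the pmax(., 0) guard is added here as an assumption of mine, not part of the original answers:

sx <- rowSums(x^2)                                # squared norm of each row of x
sy <- rowSums(y^2)                                # squared norm of each row of y
d2 <- outer(sx, sy, "+") - 2 * tcrossprod(x, y)   # ||x_i||^2 + ||y_j||^2 - 2 x_i.y_j
distmat_tc <- sqrt(pmax(d2, 0))                   # guard against tiny negative values
# spot-check one entry against the definition; should be TRUE
all.equal(distmat_tc[1, 2], sqrt(sum((x[1, ] - y[2, ])^2)))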

The proxy package has a function for this.

library(proxy)
dist(x, y)

     [,1]     [,2]     [,3]     [,4]    
[1,] 3.016991 1.376622 2.065831 2.857002
[2,] 4.573625 3.336707 2.698124 1.412811
[3,] 3.764925 2.235186 2.743056 3.358577
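Note that, as far as I can tell, proxy::dist() on two matrices returns a crossdist object (with Euclidean as the default method) rather than a plain matrix; a small usage sketch, coercing with as.matrix() when downstream code expects a base matrix:

library(proxy)
d <- dist(x, y)     # cross-distances, default method is Euclidean
class(d)            # "crossdist"
m <- as.matrix(d)   # plain numeric matrix, nrow(x) rows by nrow(y) columns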

I've kept it simple and in base R, while still managing a one-liner that gives roughly a 3x speedup.

`dim<-`(sqrt(rowSums((x[rep(1:nrow(x), nrow(y)),] - y[rep(1:nrow(y), each = nrow(x)),])^2)), c(nrow(x), nrow(y)))
#          [,1]     [,2]     [,3]     [,4]
# [1,] 3.016991 1.376622 2.065831 2.857002
# [2,] 4.573625 3.336707 2.698124 1.412811
# [3,] 3.764925 2.235186 2.743056 3.358577
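The one-liner reads more easily when split into steps (intermediate names are illustrative only):

xi <- x[rep(1:nrow(x), nrow(y)), ]         # rows of x recycled once per row of y
yj <- y[rep(1:nrow(y), each = nrow(x)), ]  # rows of y expanded to align with xi
d  <- sqrt(rowSums((xi - yj)^2))           # one distance per (i, j) pair, i varying fastest
`dim<-`(d, c(nrow(x), nrow(y)))            # fold into an nrow(x)-by-nrow(y) matrix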

Unfortunately my PC's memory choked on the tests with big matrices, so I had to reduce the dimensions by an order of magnitude to run the tests.

Full code shown:

set.seed(1)
x <- matrix(rnorm(1500), ncol=5)
y <- matrix(rnorm(2000), ncol=5)
# my customized approach
method_ThomasIsCoding <- function() {
  as.matrix(unname(unstack(within(idx<-expand.grid(seq(nrow(x)),seq(nrow(y))), d <-sqrt(rowSums((x[Var1,]-y[Var2,])**2))), d~Var2)))
}
# an existing approach by A. Webb from https://stackoverflow.com/a/35107198/12158757
method_A.Webb <- function() {
  euclidean_distance <- function(p,q) sqrt(sum((p - q)**2))
  outer(
    data.frame(t(x)),
    data.frame(t(y)),
    Vectorize(euclidean_distance)
  )
}
# your approach
method_AllanCameron <- function()
{
  `dim<-`(sqrt(rowSums((x[rep(1:nrow(x), nrow(y)),] - y[rep(1:nrow(y), each = nrow(x)),])^2)), c(nrow(x), nrow(y)))
}

microbenchmark::microbenchmark(
  method_ThomasIsCoding(),
  method_A.Webb(),
  method_AllanCameron(),
  times = 10
)

Result

# Unit: milliseconds
#                     expr       min        lq      mean    median        uq       max neval
#  method_ThomasIsCoding()  63.08587  64.70988  69.59648  67.73379  75.90281  76.92903    10
#          method_A.Webb() 330.44824 349.90977 376.36962 368.52164 392.11780 446.57269    10
#    method_AllanCameron()  16.29938  18.20057  21.02634  20.45267  22.41767  31.28646    10

This Rcpp function was a bit faster than the previously fastest solutions:

library(Rcpp)

cppFunction('NumericMatrix crossdist(NumericMatrix x, NumericMatrix y){
  int n1 = x.nrow(), n2 = y.nrow(), ncol = x.ncol();
  if(ncol != y.ncol()) throw std::runtime_error("Different column number");
  NumericMatrix out(n1, n2);
  // plain triple loop: out(i, j) = Euclidean distance between row i of x and row j of y
  for(int i = 0; i < n1; i++)
    for(int j = 0; j < n2; j++){
      double sum = 0;
      for(int k = 0; k < ncol; k++) sum += pow(x(i, k) - y(j, k), 2);
      out(i, j) = sqrt(sum);
    }
  return out;
}')
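Before benchmarking, a quick sanity check of the compiled function against the definition (a small sketch; the xs/ys names are just throwaway test data):

xs <- matrix(rnorm(15), ncol = 5)
ys <- matrix(rnorm(20), ncol = 5)
d  <- crossdist(xs, ys)
all.equal(d[2, 3], sqrt(sum((xs[2, ] - ys[3, ])^2)))           # should be TRUE
all.equal(d[2, 3], as.numeric(dist(rbind(xs[2, ], ys[3, ]))))  # should be TRUE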

library(microbenchmark)

x <- matrix(rnorm(15000), ncol = 5)
y <- matrix(rnorm(20000), ncol = 5)

b <- microbenchmark(times = 100,
  crossdist(x, y),
  Rfast::dista(x, y),
  proxy::dist(x, y),
  pracma::distmat(x, y),
  as.matrix(pdist::pdist(x, y)),
  sqrt(outer(rowSums(x^2), rowSums(y^2), "+") - 2 * tcrossprod(x, y)),
  sqrt(outer(rowSums(x^2), rowSums(y^2), "+") - 2 * x %*% t(y))
)

# median time per expression, scaled relative to the fastest
a <- aggregate(b$time, list(b$expr), median)
a <- a[order(a[, 2]), ]
writeLines(paste(sprintf("%.3f", a[, 2] / min(a[, 2])), gsub(" ", "", a[, 1])))

Result:

1.000 crossdist(x,y)
1.701 proxy::dist(x,y)
1.717 sqrt(outer(rowSums(x^2),rowSums(y^2),"+")-2*tcrossprod(x,y))
1.731 sqrt(outer(rowSums(x^2),rowSums(y^2),"+")-2*x%*%t(y))
2.257 Rfast::dista(x,y)
2.803 as.matrix(pdist::pdist(x,y))
3.435 pracma::distmat(x,y)
