I am doing a large amount of string comparisons using the Levenshtein distance measure, but because I need to be able to account for the spatial adjacency in the latent structure of the strings, I had to make my own script including a weight function.
My problem now is that my script is very inefficient. I have to do approximately 600,000 comparisons and it will take hours for the script to be done. I am therefor seeking a way to make my script more efficient, but being a self taught nub, I don't know how to solve this my self.
Here is the functions:
zeros <- function(lengthA,lengthB){
m <- matrix(c(rep(0,lengthA*lengthB)),nrow=lengthA,ncol=lengthB)
return(m)
}
weight <- function(A,B,weights){
if (weights == TRUE){
# cost_weight defines the matrix structure of the AOI-placement
cost_weight <- matrix(c("a","b","c","d","e","f","g","h","i","j","k","l",
"m","n","o","p","q","r","s","t","u","v","w","x"),
nrow=6)
max_walk <- 8.00 # defined as the maximum posible distance between letters in
# the cost_weight matrix
indexA <- which(cost_weight==A, arr.ind=TRUE)
indexB <- which(cost_weight==B, arr.ind=TRUE)
walk <- abs(indexA[1]-indexB[1])+abs(indexA[2]-indexB[2])
w <- walk/max_walk
}
else {w <- 1}
return(w)
}
dist <- function(A, B, insertion, deletion, substitution, weights=TRUE){
D <- zeros(nchar(A)+1,nchar(B)+1)
As <- strsplit(A,"")[[1]]
Bs <- strsplit(B,"")[[1]]
# filling out the matrix
for (i in seq(to=nchar(A))){
D[i + 1,1] <- D[i,1] + deletion * weight(As[i],Bs[1], weights)
}
for (j in seq(to=nchar(B))){
D[1,j + 1] <- D[1,j] + insertion * weight(As[1],Bs[j], weights)
}
for (i in seq(to=nchar(A))){
for (j in seq(to=nchar(B))){
if (As[i] == Bs[j]){
D[i + 1,j + 1] <- D[i,j]
}
else{
D[i + 1,j + 1] <- min(D[i + 1,j] + insertion * weight(As[i],Bs[j], weights),
D[i,j + 1] + deletion * weight(As[i],Bs[j], weights),
D[i,j] + substitution * weight(As[i],Bs[j], weights))
}
}
}
return(D)
}
levenshtein <- function(A, B, insertion=1, deletion=1, substitution=1){
# Compute levenshtein distance between iterables A and B
if (nchar(A) == nchar(B) & A == B){
return(0)
}
if (nchar(B) > nchar(A)){
C <- A
A <- B
B <- A
#(A, B) <- (B, A)
}
if (nchar(A) == 0){
return (nchar(B))
}
else{
return (dist(A, B, insertion, deletion, substitution)[nchar(A),nchar(B)])
}
}
Comparing the performance of my Levenshtein measure to the one from the stringdist package the performance is 83 times worse.
library (stringdist)
library(rbenchmark)
A <-"abcdefghijklmnopqrstuvwx"
B <-"xwvutsrqponmlkjihgfedcba"
benchmark(levenshtein(A,B), stringdist(A,B,method="lv"),
columns=c("test", "replications", "elapsed", "relative"),
order="relative", replications=10)
test replications elapsed relative
2 stringdist(A, B, method = "lv") 10 0.01 1
1 levenshtein(A, B) 10 0.83 83
Does anyone have an idea to improving my script?
The following code is already some improvement (of your code; calculates the same as you did before, not the same as stringdist
), but I'm sure it can be even more simplified and sped up.
zeros <- function(lengthA,lengthB){
m <- matrix(0, nrow=lengthA, ncol=lengthB)
return(m)
}
weight <- function(A,B,weights){
if (weights){
# cost_weight defines the matrix structure of the AOI-placement
cost_weight <- matrix(c("a","b","c","d","e","f","g","h","i","j","k","l",
"m","n","o","p","q","r","s","t","u","v","w","x"),
nrow=6)
max_walk <- 8.00 # defined as the maximum posible distance between letters in
# the cost_weight matrix
amats <- lapply(A, `==`, y=cost_weight)
bmats <- lapply(B, `==`, y=cost_weight)
walk <- mapply(function(a, b){
sum(abs(which(a, arr.ind=TRUE) - which(b, arr.ind=TRUE)))
}, amats, bmats)
return(walk/max_walk)
}
else return(1)
}
dist <- function(A, B, insertion, deletion, substitution, weights=TRUE){
#browser()
D <- zeros(nchar(A)+1,nchar(B)+1)
As <- strsplit(A,"")[[1]]
Bs <- strsplit(B,"")[[1]]
# filling out the matrix
weight.mat <- outer(As, Bs, weight, weights=weights)
D[,1] <- c(0, deletion * cumsum(weight.mat[, 1]))
D[1,] <- c(0, insertion * cumsum(weight.mat[1,]))
for (i in seq(to=nchar(A))){
for (j in seq(to=nchar(B))){
if (As[i] == Bs[j]){
D[i + 1,j + 1] <- D[i,j]
}
else{
D[i + 1,j + 1] <- min(D[i + 1,j] + insertion * weight.mat[i, j],
D[i,j + 1] + deletion * weight.mat[i, j],
D[i,j] + substitution * weight.mat[i, j])
}
}
}
return(D)
}
levenshtein <- function(A, B, insertion=1, deletion=1, substitution=1){
# Compute levenshtein distance between iterables A and B
if (nchar(A) == nchar(B) & A == B){
return(0)
}
if (nchar(B) > nchar(A)){
C <- A
A <- B
B <- A
#(A, B) <- (B, A)
}
if (nchar(A) == 0){
return (nchar(B))
}
else{
return (dist(A, B, insertion, deletion, substitution)[nchar(A),nchar(B)])
}
}
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.