简体   繁体   中英

Find the Euclidean distance to the nearest neighbors between two data sets

I have two data sets. The first one contains 100 (x_0, y_0) points:

# Query grid: every (x_0, y_0) combination of the values 1 to 10, i.e. 100
# rows. The vectors are passed unnamed, so expand.grid() labels the columns
# Var1 and Var2.
x_0 <- seq(from = 1, to = 10, by = 1)
y_0 <- seq(from = 1, to = 10, by = 1)
data <- expand.grid(x_0, y_0)

The second one contains 5 labelled (x, y) points and is called second_data:

# Labelled reference points: five (x, y) coordinates and the colour class of
# each one. The standalone vectors are kept because `color` is indexed
# directly further down the page.
x <- seq(from = 2, to = 10, by = 2)
y <- x + 1
color <- rep(c("green", "red"), times = c(2, 3))
second_data <- data.frame(x = x, y = y, color = color)

I need to apply the Euclidean distance formula for 3NN to determine whether each point in the first data set is green or red. Basically, for each of the 100 points I need to compute its distance to each of the 5 labelled points, then use the code below to choose the 3 with the minimum distance.

I think I need a loop for this, but I'm not getting it correctly:

# Asker's attempt at a K-nearest-neighbour vote; the surrounding text says it
# does not work. One slot per row of first_data receives the predicted colour.
# NOTE(review): `first_data` is never defined in the snippets shown here; the
# grid built earlier is named `data` -- presumably that is what is meant.
out <- rep(NA, nrow(first_data))
# Number of neighbours that take part in the vote.
K=3

for(k in 1:nrow(first_data)){
# Intended: count how many of the K closest rows of second_data are green.
# NOTE(review): the expression below is not the Euclidean distance -- sqrt()
# wraps only `x - first_data[k]^2`, the square is applied to first_data[k]
# rather than to the coordinate differences, and the same first_data[k] is
# used for both the x and the y term.
# mutate(), slice_min(), filter() and %>% are presumably from dplyr; no
# library() call is shown -- TODO confirm.
green <- mutate(second_data, distance = sqrt(x - first_data[k]^2)+(y-first_data[k]^2)) %>%
  slice_min(distance, n=K) %>% filter(color=='green') %>% nrow()
  # Majority rule: 'green' when at least (K+1)/2 of the K neighbours are green.
  # NOTE(review): `new_blue` is not defined anywhere in view; the count computed
  # above is stored in `green`.
  out[k] <- ifelse(new_blue >= (K+1)/2, 'green', 'red')
}

If I understand you correctly, the get.knnx function in package FNN will do this easily:

library(FNN)

# For every row of `data` (the query set), look up its 3 nearest rows of
# `second_data`. Only the x/y coordinate columns are searched; the colour
# label column is left out.
neighbors3 <- get.knnx(
  data = second_data[, c("x", "y")],
  query = data,
  k = 3
)
str(neighbors3)
# List of 2
#  $ nn.index: int [1:100, 1:3] 1 1 1 1 1 1 2 2 2 3 ...
#  $ nn.dist : num [1:100, 1:3] 2.24 2 2.24 2.83 3.61 ...
head(neighbors3$nn.index)
#      [,1] [,2] [,3]
# [1,]    1    2    3
# [2,]    1    2    3
# [3,]    1    2    3
# [4,]    1    2    3
# [5,]    1    2    3
# [6,]    1    2    3

The list element neighbors3$nn.index gives, for each row in data, the row numbers in second_data of its three nearest neighbors. Now to get the colors of the neighbors:

# Translate each neighbour index into the colour of that labelled point.
# Indexing `color` with the index matrix returns a flat vector, so it is
# reshaped to the index matrix's own dimensions rather than a hard-coded
# 100 x 3; the snippet then keeps working if the grid size or k changes.
result <- array(color[neighbors3$nn.index], dim = dim(neighbors3$nn.index))
head(result)
cat("\n")
tail(result)
#      [,1]    [,2]    [,3] 
# [1,] "green" "green" "red"
# [2,] "green" "green" "red"
# [3,] "green" "green" "red"
# [4,] "green" "green" "red"
# [5,] "green" "green" "red"
# [6,] "green" "green" "red"
# 
#        [,1]  [,2]  [,3]   
#  [95,] "red" "red" "green"
#  [96,] "red" "red" "red"  
#  [97,] "red" "red" "red"  
#  [98,] "red" "red" "red"  
#  [99,] "red" "red" "red"  
# [100,] "red" "red" "red"  

You can combine everything with the original data if you want:

# One row per grid point: its coordinates, followed by the indices, colours
# and distances of its three nearest neighbours.
results <- cbind(
  data,
  neighbors3$nn.index,
  result,
  neighbors3$nn.dist
)
colnames(results) <- c(
  "x0", "y_0",
  "nn1", "nn2", "nn3",
  "col1", "col2", "col3",
  "dist1", "dist2", "dist3"
)
head(results)
#   x0 y_0 nn1 nn2 nn3  col1  col2 col3    dist1    dist2    dist3
# 1  1   1   1   2   3 green green  red 2.236068 5.000000 7.810250
# 2  2   1   1   2   3 green green  red 2.000000 4.472136 7.211103
# 3  3   1   1   2   3 green green  red 2.236068 4.123106 6.708204
# 4  4   1   1   2   3 green green  red 2.828427 4.000000 6.324555
# 5  5   1   1   2   3 green green  red 3.605551 4.123106 6.082763
# 6  6   1   1   2   3 green green  red 4.472136 4.472136 6.000000

If I understand correctly, this will yield the correct classification:

# Classify every grid point in `data` by majority vote among its k nearest
# labelled points in `second_data`, storing the winning colour in data$color.
k <- 3
data$color <- vapply(
  seq_len(nrow(data)),  # safe when `data` has zero rows, unlike 1:nrow(data)
  function(i) {
    # Squared Euclidean distance from grid point i to every labelled point;
    # the square root is skipped because it does not change the ranking.
    sq_dist <- (second_data$x - data$Var1[i])^2 +
      (second_data$y - data$Var2[i])^2
    # order() leaves tied distances in row order, so a tie resolves to the
    # earlier row of second_data.
    nearest <- head(order(sq_dist), k)
    votes <- table(second_data$color[nearest])
    # Most frequent colour; which.max() takes the first label on a tied vote.
    names(votes)[which.max(votes)]
  },
  character(1)
)
data

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM