在 R 中使用 foreach 和 %dopar% 对索引进行并行计算的问题

Question

我正在尝试有效地编写 KDN 复杂性度量的计算代码，这涉及对距离矩阵的所有行进行循环并从中进行一些计算。

我试图将此代码与foreach和%dopar%函数并行，但我没有实现任何运行时间减少。 我意识到由于 memory 管理，一些并行计算效率不高，但我不知道这是我的情况还是我做错了什么。

这是一个可重现的示例，其中包含来自rsvd package 的数字数据：

首先，我调用所有必要的包，读取数字数据，然后获得一些有用的信息。

#######################
### NEEDED PACKAGES ###
#######################

library(dplyr)
library(parallelDist)
# for Parallel Processing
library(doParallel)  
library(foreach)
# for digits data
library(rsvd)


#############
###  DATA ###
#############

data(digits)
data = as.data.frame(digits)

# Dividing on X variables and Y target
dataX = data %>%
  dplyr::select(-label)
dataY = data %>%
  mutate(label=factor(label)) %>% 
  pull(label)

## number of data
n=dim(dataX)[1]

然后，我在要高效并行的 KDN 循环之前进行一些必要的计算。

##############################
###  PREVIOUS COMPUTATIONS ###
##############################

## number of available data in each class
n_data_classes=table(factor(dataY))

## number of data to be considered as neighbours in each class
k=0.05
k_neighbours_classes=ceiling(n_data_classes*k)

##  DISTANCE MATRIX COMPUTATION
 # this is time consuming but I'm not concerned about this
distance_matrix=as.matrix(parDist(scale(dataX)))

没有并行化的 KDN 计算是下一个，它需要 12 秒。

#########################################
### COMPUTING KDN: NO PARALLELIZATION ###
#########################################

## KDN instance level computation
# inicialization of a vector to store KDN instance level values
kdn_instance=numeric(n)

system.time(

  for (ix in 1:n){
    ## Gettig the class of ix data point
    class_ix=dataY[ix]
    ## number of data to be considered as neighbours in this class
    k_value=k_neighbours_classes[class_ix]
    
    # we get the k_value nearest neighbors set of ix
    distances_ix=distance_matrix[ix,]
    distances_ix_ordered=order(distances_ix,decreasing = F)
    knn_set_ix=distances_ix_ordered[2:(k_value+1)]
    
    # Y value of the k_neighbors_set_ix
    Y_value_knn_set_ix=dataY[knn_set_ix]
    # Y value of ix data
    Y_value_ix=dataY[ix]
    
    # number of data in knn_set_ix with different Y value that ix
    knn_set_ix_different_Y_value=length(Y_value_knn_set_ix[Y_value_knn_set_ix!=Y_value_ix])
    kdn_instance[ix]=knn_set_ix_different_Y_value/k_value
  }
)

# user  system elapsed 
# 12.29    0.37   12.67 secs

我尝试并行该循环的是以下循环，使用 foreach 和 %dopar%，这需要 35 秒。

######################################
### COMPUTING KDN: PARALLELIZATION ###
######################################

## Preparing for paralleling
# number of cores to use
n.cores <- parallel::detectCores() - 1

# we define the cluster and register it so it can be used by %dopar%
my.cluster <- parallel::makeCluster(n.cores,type = "PSOCK")

# register it to be used by %dopar%
doParallel::registerDoParallel(cl = my.cluster)

## KDN instance level computation
kdn_instance= NULL

#iterator
itx <- iter(distance_matrix, by = 'row')

system.time(

  kdn_instance <- foreach(
    ix = itx,
    .combine = 'c'
  ) %dopar% {
    ## Gettig the class of ix data point
    class_ix=dataY[ix]
    ## number of data to be considered as neighbours in this class
    k_value=k_neighbours_classes[class_ix]
    
    # we get the k_value nearest neighbors set of ix
    distances_ix=distance_matrix[ix,]
    distances_ix_ordered=order(distances_ix,decreasing = F)
    knn_set_ix=distances_ix_ordered[2:(k_value+1)]
    
    # Y value of the k_neighbors_set_ix
    Y_value_knn_set_ix=dataY[knn_set_ix]
    # Y value of ix data
    Y_value_ix=dataY[ix]
    
    # number of data in knn_set_ix with different Y value that ix
    knn_set_ix_different_Y_value=length(Y_value_knn_set_ix[Y_value_knn_set_ix!=Y_value_ix])
    knn_set_ix_different_Y_value/k_value}
)
parallel::stopCluster(cl = my.cluster)
  

# user  system elapsed 
# 12.38    4.64   35.14 secs

可以看出，并行计算比非并行计算花费更多时间。

我的问题是：并行处理代码有问题吗？ 有更好的方法吗？ 也许应该用另一个 package 来完成。

Answer 1

试图运行您的代码，但我的计算机死机并用完了 memory。

给你的三个想法，首先考虑按照社区机器人的建议提出一个新问题。 包括有关您的系统的信息（并行处理在不同系统上的工作方式不同）

其次，当您寻求性能时，一定要优化您的系统，例如，使用 openblas BLAS/LAPACK https://csantill.github.io/RPerformanceWBLAS/

最后，测试文档第 5 章中概述的基本%dopar%功能。更严重的示例https://cran.r-project.org/web/packages/doParallel/vi.nettes/gettingstartedParallel.pdf

library(doParallel)
registerDoParallel(cores=2) ## test with different number of cores
# cl <- makeCluster(2) ## Test also different number of clusters
# registerDoParallel(cl) 

 x <- iris[which(iris[,5] != "setosa"), c(1,5)]
 trials <- 10000

 ptime <- system.time({
   r <- foreach(icount(trials), .combine=cbind) %dopar% {
     ind <- sample(100, 100, replace=TRUE)
     result1 <- glm(x[ind,2]~x[ind,1], family=binomial(logit))
     coefficients(result1)
     }
   })[3]
 ptime
 # stopCluster(cl)


 stime <- system.time({
   r <- foreach(icount(trials), .combine=cbind) %do% {
     ind <- sample(100, 100, replace=TRUE)
     result1 <- glm(x[ind,2]~x[ind,1], family=binomial(logit))
     coefficients(result1)
     }
   })[3]
 stime

在我的旧笔记本电脑上的结果：

描述	经过的秒数
stime, baseline `%do%`方法	21.479
`registerDoParallel(cores=1)`	20.425
`registerDoParallel(cores=2)`	11.508
`registerDoParallel(cores=3)`	11.107
`registerDoParallel(cores=4)`	9.491
`cl<-makeCluster(1); registerDoParallel(cl)`	26.265
`cl<-makeCluster(2); registerDoParallel(cl)`	14.66
`cl<-makeCluster(3); registerDoParallel(cl)`	12.827
`cl<-makeCluster(4); registerDoParallel(cl)`	11.943

在 R 中使用 foreach 和 %dopar% 对索引进行并行计算的问题

问题描述

1 个解决方案

解决方案1
1 2022-11-27 01:50:34

在 R 中使用 foreach 和 %dopar% 对索引进行并行计算的问题

问题描述

1 个解决方案

解决方案1 1 2022-11-27 01:50:34

解决方案1
1 2022-11-27 01:50:34