How can I make this nested for loop faster?
# Count, for each surgery, the visits by the same patient that fall strictly
# after the surgery date and within one year of it, accumulating onto the
# existing (pre-initialized) post_op_visits column.
#
# The original scalar j-loop evaluated ifelse() once per (surgery, visit)
# pair — O(n*m) interpreted iterations. Comparing whole columns of visits_1
# at once does the same work in one vectorized pass per surgery, so the
# per-iteration print() progress marker is no longer needed.
for (i in seq_len(nrow(surgeries_7))) {
  in_window <- visits_1$PatientProfileId == surgeries_7$PatientProfileId[i] &
    visits_1$visit_date > surgeries_7$surgery_date[i] &
    visits_1$visit_date <= surgeries_7$one_year_from_surgery[i]
  # sum() of the logical mask is the number of qualifying visits;
  # accumulate onto the existing column exactly as the original loop did.
  surgeries_7$post_op_visits[i] <- surgeries_7$post_op_visits[i] + sum(in_window)
}
有两个表:surgeries_7 是第一个表:它有两列,PatientProfileId(唯一)以及每个对应的个人资料 ID 的手术日期。
第二个表是访问表,其中我们有不同访问的配置文件 ID(有多个相同配置文件 ID 的条目)。
我们试图计算访问表中每个配置文件 ID 在手术日期(出现在 surgeries_7 表中)之后、且在手术日期一年以内的访问次数。
问题是代码运行时间太长,无法运行大约 6k 行。 有没有办法让循环更快?
我同意 Jonathan V. Solórzano 的建议,尝试使用 dplyr 包中的函数。
以下是对您的脚本的一些改进。
# Use data structures that consume less memory
library(data.table)
surgeries_7 <- data.table(surgeries_7)
visits_1 <- data.table(visits_1)
# Pre-allocate the output vector instead of growing it inside the loop.
post_op_visits <- numeric(nrow(surgeries_7))
for (i in seq_len(nrow(surgeries_7))) {
  # BUG FIX: the original wrote
  #   post_op_visits[i] <- surgeries_7$post_op_visits[i] + count
  # which (a) read a column that may not exist yet on surgeries_7 and
  # (b) overwrote the slot on every inner iteration instead of accumulating.
  # A single vectorized comparison over all of visits_1 replaces the scalar
  # j-loop entirely and yields the per-surgery count directly.
  post_op_visits[i] <- sum(
    visits_1$PatientProfileId == surgeries_7$PatientProfileId[i] &
      visits_1$visit_date > surgeries_7$surgery_date[i] &
      visits_1$visit_date <= surgeries_7$one_year_from_surgery[i]
  )
}
# Assign the result outside the loop.
surgeries_7$post_op_visits <- post_op_visits
如果您有一台多核机器,还可以尝试使用 foreach + doParallel 对外层循环做并行处理。
# Use data structures that consume less memory
library(data.table)
surgeries_7 <- data.table(surgeries_7)
visits_1 <- data.table(visits_1)
library(foreach)
library(doParallel)
cl <- parallel::makeCluster(4) # for a 4-core machine
doParallel::registerDoParallel(cl)
# BUG FIX: the original inner foreach(.combine = 'c') returned one value per
# row of visits_1 (a length-nrow(visits_1) vector for every i), and the outer
# .combine = 'rbind' stacked those into a matrix — not a per-surgery count.
# It also read surgeries_7$post_op_visits, which may not exist yet.
# Each parallel task now returns a single count: the number of visits by the
# same patient strictly after the surgery and within one year of it.
post_op_visits <- foreach(i = seq_len(nrow(surgeries_7)), .combine = "c") %dopar% {
  sum(
    visits_1$PatientProfileId == surgeries_7$PatientProfileId[i] &
      visits_1$visit_date > surgeries_7$surgery_date[i] &
      visits_1$visit_date <= surgeries_7$one_year_from_surgery[i]
  )
}
# Assign the result outside the loop.
surgeries_7$post_op_visits <- post_op_visits
# Close the parallel backend.
parallel::stopCluster(cl)
最美好的祝愿——艾哈迈德·阿亨迪
考虑使用基础 R 的向量化数据处理函数来避免循环,特别是 merge、subset 和 aggregate。
。 以下假设患者在一年内不超过一次手术,这可能会高估就诊次数。
# MERGE
merged_df <- merge(surgeries_7, visits_1, by = "PatientProfileId")
# SUBSET
sub_df <- subset(merged_df, visit_date > surgery_date &
visit_date <= one_year_from_surgery)
# AGGREGATE ACROSS ALL PATIENT SURGERIES
agg_df <- aggregate(cbind(post_op_visits=visit_date) ~ PatientProfileId,
sub_df, FUN = length)
# AGGREGATE BY PATIENT AND SURGERY
agg_df <- aggregate(cbind(post_op_visits=visit_date) ~ PatientProfileId + surgery_date,
sub_df, FUN = length)
如果您需要将结果添加为新列,只需将聚合合并到原始数据框:
# BUG FIX: the original line used the undefined names `survery7`/`surgery7`
# (typos for surgeries_7) and would error. all.x = TRUE keeps surgeries with
# zero qualifying visits, whose count comes back as NA and is reset to 0.
surgeries_7 <- merge(surgeries_7, agg_df,
                     by = c("PatientProfileId", "surgery_date"), all.x = TRUE)
surgeries_7$post_op_visits[is.na(surgeries_7$post_op_visits)] <- 0
在data.table
包中使用 non-equi join 的选项:
# Compute the date exactly one year after each surgery. Using seq(by = "1 year")
# rather than surgery_date + 365 keeps calendar semantics (leap years).
surgery_7[, oneyr := as.IDate(sapply(surgery_date, function(x)
seq(x, by="1 year", length.out=2L)[2L]))]
# Add the visit count as a new column by reference (no copy of surgery_7).
surgery_7[, post_op_visits :=
# Non-equi join: for each row of surgery_7 (.SD), match rows of visits_1 with
# the same PatientProfileId whose visit_date lies in [surgery_date, oneyr].
# NOTE(review): this uses visit_date >= surgery_date, while the question asks
# for visits strictly AFTER the surgery date — confirm which bound is intended.
visits_1[.SD, on=.(PatientProfileId, visit_date>=surgery_date, visit_date<=oneyr),
# by=.EACHI counts matches (.N) separately for each row of surgery_7,
# giving the number of qualifying visits per surgery.
by=.EACHI, .N]$N]
surgery_7 的输出:
PatientProfileId surgery_date oneyr post_op_visits
1: 1 2018-01-01 2019-01-01 2
2: 2 2019-01-01 2020-01-01 1
数据:
library(data.table)
# Example surgery table: one row per patient.
surgery_7 <- data.table(
  PatientProfileId = c(1, 2),
  surgery_date = as.IDate(c("2018-01-01", "2019-01-01"))
)
#    PatientProfileId surgery_date
# 1:                1   2018-01-01
# 2:                2   2019-01-01
# Example visit table: multiple rows per patient.
visits_1 <- data.table(
  PatientProfileId = c(1, 1, 1, 2, 2),
  visit_date = as.IDate(c(
    "2018-03-15", "2018-09-15", "2019-02-03", "2019-06-30", "2020-01-15"
  ))
)
#    PatientProfileId visit_date
# 1:                1 2018-03-15
# 2:                1 2018-09-15
# 3:                1 2019-02-03
# 4:                2 2019-06-30
# 5:                2 2020-01-15
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.