[英]How can I make this nested for loop faster?
# Question code (the slow version being asked about).
# For every row i of surgeries_7 it scans every row j of visits_1, so the body
# runs nrow(surgeries_7) * nrow(visits_1) times.
# surgeries_7$post_op_visits[i] is read before it is written, so the column is
# presumably initialized (e.g. to 0) before this loop runs -- confirm.
for (i in 1:nrow(surgeries_7)){
count = 0
for (j in 1:nrow(visits_1)){
# count becomes 1 when visit j has the same PatientProfileId as surgery i and
# its visit_date is after surgery_date and not later than
# one_year_from_surgery; otherwise it becomes 0.
count <- ifelse(surgeries_7$PatientProfileId[i]==visits_1$PatientProfileId[j]
& visits_1$visit_date[j] > surgeries_7$surgery_date[i] &
visits_1$visit_date[j] <= surgeries_7$one_year_from_surgery[i],1,0)
# Add this visit's 0/1 result to the running total for surgery i.
surgeries_7$post_op_visits[i] <- surgeries_7$post_op_visits[i] + count
}
# Progress output: prints the index of the surgery row just processed.
print(i)
}
有兩個表:surgeries_7 是其中一個,它有兩列:PatientProfileId(唯一)以及每個個人資料 ID 對應的手術日期。
第二個表是訪問表,其中我們有不同訪問的配置文件 ID(有多個相同配置文件 ID 的條目)。
我們試圖計算訪問表中每個配置文件 ID 在手術日期(來自 surgeries_7 表)之後、且在手術日期一年之內的訪問次數。
問題是代碼運行時間太長,大約 6k 行的數據都跑不完。 有沒有辦法讓循環更快?
我同意 Jonathan V. Solórzano 的看法,可以嘗試使用 dplyr 包中的函數。
以下是對您的腳本的一些改進。
# Count, for every surgery, the visits of the same patient that fall after the
# surgery date and no later than one_year_from_surgery.
#
# Inputs : surgeries_7 (PatientProfileId, surgery_date, one_year_from_surgery)
#          visits_1    (PatientProfileId, visit_date)
# Output : surgeries_7$post_op_visits, one count per surgery row, computed
#          from scratch (any previous value of the column is replaced).

# Convert to data.table, as in the original answer. The counting below only
# uses plain column vectors, so it works the same on ordinary data frames.
library(data.table)
surgeries_7 <- data.table(surgeries_7)
visits_1 <- data.table(visits_1)

# Pull the columns out once so they are not re-extracted on every iteration.
visit_ids <- visits_1$PatientProfileId
visit_dates <- visits_1$visit_date
surgery_ids <- surgeries_7$PatientProfileId
window_start <- surgeries_7$surgery_date
window_end <- surgeries_7$one_year_from_surgery

# One vectorized comparison against all visits per surgery replaces the inner
# row-by-row loop. seq_len() keeps the loop empty when there are no surgeries
# (1:nrow() would iterate over c(1, 0)).
post_op_visits <- vapply(
  seq_len(nrow(surgeries_7)),
  function(i) {
    in_window <- visit_ids == surgery_ids[i] &
      visit_dates > window_start[i] &
      visit_dates <= window_end[i]
    # Rows with a missing id or date compare to NA; they are not counted.
    sum(in_window, na.rm = TRUE)
  },
  numeric(1)
)

# Assign the result once, outside the loop.
surgeries_7$post_op_visits <- post_op_visits
如果您有一台多核機器,您還可以嘗試使用foreach
+ doParallel
並行處理嵌套循環
# Parallel version: count, for every surgery, the visits of the same patient
# that fall after the surgery date and no later than one_year_from_surgery.
# Each worker handles whole surgeries and returns a single count per surgery.

# Convert to data.table, as in the original answer. The counting below only
# uses plain column vectors, so it works the same on ordinary data frames.
library(data.table)
surgeries_7 <- data.table(surgeries_7)
visits_1 <- data.table(visits_1)

library(foreach)
library(doParallel)

# Plain vectors are what the loop body references, so these are the objects
# that need to be available on the workers.
visit_ids <- visits_1$PatientProfileId
visit_dates <- visits_1$visit_date
surgery_ids <- surgeries_7$PatientProfileId
window_start <- surgeries_7$surgery_date
window_end <- surgeries_7$one_year_from_surgery

cl <- parallel::makeCluster(4) # for 4 cores machine
doParallel::registerDoParallel(cl)

# .combine = "c" concatenates the per-surgery counts into one vector.
# The cluster is shut down in `finally` so workers are released even if an
# iteration fails.
post_op_visits <- tryCatch(
  foreach(i = seq_len(nrow(surgeries_7)), .combine = "c") %dopar% {
    in_window <- visit_ids == surgery_ids[i] &
      visit_dates > window_start[i] &
      visit_dates <= window_end[i]
    # Rows with a missing id or date compare to NA; they are not counted.
    sum(in_window, na.rm = TRUE)
  },
  finally = parallel::stopCluster(cl)
)

# Assign the result once, outside the loop.
surgeries_7$post_op_visits <- post_op_visits
最美好的祝願——艾哈邁德·阿亨迪
考慮使用整塊(向量化)處理來取代逐行循環,特別是 merge、subset 和 aggregate。以下假設每位患者在一年內不超過一次手術;否則同一次就診會被計入多次手術,可能會高估就診次數。
# MERGE: pair every surgery with every visit of the same patient.
merged_df <- merge(x = surgeries_7, y = visits_1, by = "PatientProfileId")

# SUBSET: keep only visits after the surgery date and no later than one year
# after it. which() drops rows whose condition is NA, as subset() does.
sub_df <- merged_df[
  which(with(
    merged_df,
    visit_date > surgery_date & visit_date <= one_year_from_surgery
  )),
]

# AGGREGATE ACROSS ALL PATIENT SURGERIES: one count per patient.
agg_df <- aggregate(
  cbind(post_op_visits = visit_date) ~ PatientProfileId,
  data = sub_df,
  FUN = length
)

# AGGREGATE BY PATIENT AND SURGERY: one count per (patient, surgery date).
# This second result replaces the per-patient one stored in agg_df above.
agg_df <- aggregate(
  cbind(post_op_visits = visit_date) ~ PatientProfileId + surgery_date,
  data = sub_df,
  FUN = length
)
如果您需要將結果添加為新列,只需將聚合合並到原始數據框:
# Left-join the per-surgery counts back onto the surgeries table.
# all.x = TRUE keeps surgeries that had no qualifying visits; the join leaves
# their count as NA, which is then set to 0.
# NOTE: if surgeries_7 already has a post_op_visits column, drop it first,
# otherwise merge() keeps both columns with .x / .y suffixes.
surgeries_7 <- merge(
  surgeries_7, agg_df,
  by = c("PatientProfileId", "surgery_date"),
  all.x = TRUE
)
surgeries_7$post_op_visits[is.na(surgeries_7$post_op_visits)] <- 0L
在data.table
包中使用 non-equi join 的選項:
# Calculate the date one year after each surgery.
# vapply() always yields a plain numeric vector of day counts (also for an
# empty table), which as.IDate() converts back into dates.
surgery_7[, oneyr := as.IDate(vapply(
  surgery_date,
  function(x) as.numeric(seq(x, by = "1 year", length.out = 2L)[2L]),
  numeric(1)
))]

# Update by reference: for each row of surgery_7, count the rows of visits_1
# with the same PatientProfileId whose visit_date is strictly after the
# surgery date and no later than one year after it.
surgery_7[, post_op_visits :=
  # non-equi join
  visits_1[.SD,
           on = .(PatientProfileId,
                  visit_date > surgery_date,
                  visit_date <= oneyr),
           # one result row per row of surgery_7; .N is the number of matches
           by = .EACHI, .N]$N]
輸出surgery_7
:
PatientProfileId surgery_date oneyr post_op_visits
1: 1 2018-01-01 2019-01-01 2
2: 2 2019-01-01 2020-01-01 1
數據:
library(data.table)

# Sample surgeries: one surgery for each of two patients.
surgery_7 <- data.table(
  PatientProfileId = c(1, 2),
  surgery_date = as.IDate(c("2018-01-01", "2019-01-01"))
)
# PatientProfileId surgery_date
#1: 1 2018-01-01
#2: 2 2019-01-01

# Sample visits: three for patient 1 and two for patient 2.
visits_1 <- data.table(
  PatientProfileId = rep(c(1, 2), times = c(3, 2)),
  visit_date = as.IDate(c(
    "2018-03-15", "2018-09-15", "2019-02-03",
    "2019-06-30", "2020-01-15"
  ))
)
# PatientProfileId visit_date
# 1: 1 2018-03-15
# 2: 1 2018-09-15
# 3: 1 2019-02-03
# 4: 2 2019-06-30
# 5: 2 2020-01-15
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.