
How can I make this nested for loop faster?

for (i in 1:nrow(surgeries_7)){ 
  count = 0 
  for (j in 1:nrow(visits_1)){ 
    count <- ifelse(surgeries_7$PatientProfileId[i]==visits_1$PatientProfileId[j] 
                      & visits_1$visit_date[j] > surgeries_7$surgery_date[i] &  
           visits_1$visit_date[j] <= surgeries_7$one_year_from_surgery[i],1,0) 
    surgeries_7$post_op_visits[i] <- surgeries_7$post_op_visits[i] + count 
  } 
  print(i) 
} 

There are two tables. The first, surgeries_7, has a PatientProfileId column (one row per profile id) and the date of surgery for each corresponding profile id.

The second is the visits table, visits_1, which also has a profile id column (the same profile id appears in multiple rows, one per visit) along with the date of each visit.

We are trying to count, for each profile id, the number of visits (rows in the visits table) that occur after the date of surgery (in the surgeries_7 table) but within one year of the surgery date.

The problem is that the code takes too long to run for approximately 6k rows. Is there any way to make the loop faster?

I agree with Jonathan V. Solórzano; try to avoid functions from the dplyr package.

Here are some improvements on your script.

# Use data structures that consume less memory
library(data.table)

surgeries_7 <- data.table(surgeries_7)
visits_1 <- data.table(visits_1)

# vectorization and pre-allocation dramatically improve speed on large data
# initialize output vector
post_op_visits <- numeric(nrow(surgeries_7))

for (i in 1:nrow(surgeries_7)) {
  for (j in 1:nrow(visits_1)) {
    count <- ifelse(surgeries_7$PatientProfileId[i] == visits_1$PatientProfileId[j]
                    & visits_1$visit_date[j] > surgeries_7$surgery_date[i]
                    & visits_1$visit_date[j] <= surgeries_7$one_year_from_surgery[i], 1, 0)

    # accumulate into the pre-allocated vector, not into surgeries_7
    post_op_visits[i] <- post_op_visits[i] + count
  }
  print(i)
}

# assign output outside loops
surgeries_7$post_op_visits <- post_op_visits
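
Note that the body above still loops over every (surgery, visit) pair. A minimal sketch of what actually vectorizing the inner loop could look like, assuming visit_date, surgery_date and one_year_from_surgery are Date/IDate columns:

# one vectorized comparison over all of visits_1 per surgery row
post_op_visits <- vapply(seq_len(nrow(surgeries_7)), function(i) {
  sum(visits_1$PatientProfileId == surgeries_7$PatientProfileId[i] &
      visits_1$visit_date > surgeries_7$surgery_date[i] &
      visits_1$visit_date <= surgeries_7$one_year_from_surgery[i])
}, integer(1))

surgeries_7$post_op_visits <- post_op_visits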

You could also try parallel processing the nested loop with foreach + doParallel if you have a multicore machine.


# Use data structures that consume less memory
library(data.table)

surgeries_7 <- data.table(surgeries_7)
visits_1 <- data.table(visits_1)

# initialize output vector
post_op_visits <- numeric(nrow(surgeries_7))

library(foreach)
library(doParallel)

cl <- parallel::makeCluster(4) # for a 4-core machine
doParallel::registerDoParallel(cl)

post_op_visits <- foreach(i = 1:nrow(surgeries_7), .combine = 'c') %dopar% {
  # sum the per-visit indicator over all visits for surgery i
  foreach(j = 1:nrow(visits_1), .combine = '+') %do% {
    ifelse(surgeries_7$PatientProfileId[i] == visits_1$PatientProfileId[j]
           & visits_1$visit_date[j] > surgeries_7$surgery_date[i]
           & visits_1$visit_date[j] <= surgeries_7$one_year_from_surgery[i], 1, 0)
  }
}


# assign output outside loops
surgeries_7$post_op_visits <- post_op_visits

#close parallel backend
parallel::stopCluster(cl)
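
As a variant (a sketch under the same column assumptions as above), you can parallelize only the outer loop and let each worker do one vectorized count over visits_1, which removes the per-visit inner loop entirely:

library(foreach)
library(doParallel)

cl <- parallel::makeCluster(4)
doParallel::registerDoParallel(cl)

# one vectorized count per surgery row, computed in parallel across workers
post_op_visits <- foreach(i = 1:nrow(surgeries_7), .combine = 'c') %dopar% {
  sum(visits_1$PatientProfileId == surgeries_7$PatientProfileId[i] &
      visits_1$visit_date > surgeries_7$surgery_date[i] &
      visits_1$visit_date <= surgeries_7$one_year_from_surgery[i])
}

parallel::stopCluster(cl)
surgeries_7$post_op_visits <- post_op_visits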

best wishes - Ahmed Alhendi

Consider avoiding loops and processing the data blockwise with merge, subset, and aggregate. The first aggregation below assumes patients do not have more than one surgery within a year; otherwise it can over-count visits, in which case aggregate by patient and surgery date instead.

# MERGE
merged_df <- merge(surgeries_7, visits_1, by = "PatientProfileId")

# SUBSET
sub_df <- subset(merged_df, visit_date > surgery_date & 
                            visit_date <= one_year_from_surgery)

# AGGREGATE ACROSS ALL PATIENT SURGERIES
agg_df <- aggregate(cbind(post_op_visits=visit_date) ~ PatientProfileId,
                    sub_df, FUN = length)

# AGGREGATE BY PATIENT AND SURGERY
agg_df <- aggregate(cbind(post_op_visits=visit_date) ~ PatientProfileId + surgery_date,
                    sub_df, FUN = length)

Should you need to add the result as a new column, simply merge the aggregation back to the original data frame:

surgeries_7 <- merge(surgeries_7, agg_df, by = c("PatientProfileId", "surgery_date"))
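
Note that patients with no qualifying visits do not appear in agg_df, so the default inner merge drops their rows. A minimal sketch that keeps them and fills the missing counts with zero:

# keep all surgeries, including those with no post-op visits
surgeries_7 <- merge(surgeries_7, agg_df, by = c("PatientProfileId", "surgery_date"),
                     all.x = TRUE)
surgeries_7$post_op_visits[is.na(surgeries_7$post_op_visits)] <- 0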

An option using a non-equi join from the data.table package:

# calculate the date one year after surgery
surgery_7[, oneyr := as.IDate(sapply(surgery_date, function(x) 
    seq(x, by="1 year", length.out=2L)[2L]))]

# update by reference
surgery_7[, post_op_visits := 
    # non-equi join
    visits_1[.SD, on=.(PatientProfileId, visit_date>=surgery_date, visit_date<=oneyr),
        # for each row of surgery_7, count the number of matching rows in visits_1
        by=.EACHI, .N]$N]

output surgery_7:

   PatientProfileId surgery_date      oneyr post_op_visits
1:                1   2018-01-01 2019-01-01              2
2:                2   2019-01-01 2020-01-01              1

data:

library(data.table)
surgery_7 <- data.table(PatientProfileId=c(1,2), 
    surgery_date=as.IDate(c("2018-01-01", "2019-01-01")))
#   PatientProfileId surgery_date
#1:                1   2018-01-01
#2:                2   2019-01-01

visits_1 <- data.table(PatientProfileId=c(1,1,1,2,2),
    visit_date=as.IDate(c("2018-03-15","2018-09-15","2019-02-03","2019-06-30","2020-01-15")))
#    PatientProfileId visit_date
# 1:                1 2018-03-15
# 2:                1 2018-09-15
# 3:                1 2019-02-03
# 4:                2 2019-06-30
# 5:                2 2020-01-15
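
If the lubridate package is available, the oneyr cutoff above can also be computed without sapply (a sketch; %m+% rolls a Feb 29 anniversary back to Feb 28):

library(lubridate)
surgery_7[, oneyr := as.IDate(surgery_date %m+% years(1))]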
