简体   繁体   中英

Alternative to Nested For Loop in R

I have two data sets: competitor_data - contains competitors for a given product as well as the price and date when the competitor prices were collected.

product_price - the date of each price change.

competitor_data <- data.frame(productId=c('banana', 'banana','banana', 'banana','banana', 'banana','fig', 'fig','fig', 'fig','fig', 'fig'),
                            crawl_date=c("2014-04-05", "2014-04-22", "2014-05-05", "2014-05-22","2014-06-05", "2014-06-22",
                                   "2014-05-08", "2014-06-17", "2014-06-09", "2014-06-14","2014-07-01", "2014-08-04"),
                            competitor =c("amazon","apple","google","facebook","alibaba","tencent","ebay","bestbuy","gamespot","louis vuitton","gucci","tesla"),
                            competitor_price =c(2.5,2.35,1.99,2.01,2.22,2.52,5.32,5.56,5.01,6.01,5.86,5.96), stringsAsFactors=FALSE)

competitor_data$crawl_date = as.Date(competitor_data$crawl_date)
#
 product_price <- data.frame(productId=c('banana', 'banana','banana', 'banana','banana', 'banana','fig', 'fig','fig', 'fig','fig', 'fig'), date=c("2014-05-05", "2014-06-22", "2014-07-05", "2014-08-31","2014-05-03", "2014-02-22", "2014-05-21", "2014-06-19", "2014-03-09", "2014-06-22","2014-07-03", "2014-09-08"), price =c(2.12,2.31,2.29,2.01,2.04,2.09,5.22,5.36,5.21,5.91,5.36,5.56), stringsAsFactors=FALSE) product_price$date = as.Date(product_price$date) 

Objective

  • For a given product in product_price, for each record (date), find the relevant crawl_date price from competitor_data.
  • Compare product_price$price to lowest competitor_data$competitor_price.
  • If product_price$price <= competitor_data$competitor_price, then create a new column to flag 1 (price_leader) else flag 0 (price_leader)

My script below using nested for loops but it takes over 24 hours to process 5000 unique product_id:

 unique_skus <- unique(product_price$productId) all_competitive_data <- data.frame() mid_step_data <- data.frame() start_time <-Sys.time() for (i in 1:length(unique_skus)){ step1 <- subset(product_price, productId == unique_skus[i]) transact_dates = unique(step1$date) for (a in 1:length(transact_dates)){ step2 <- subset(step1, date ==transact_dates[a]) step3 <- inner_join(step2,competitor_data, by='productId') if (nrow(subset(step3, date > crawl_date)) == 0){ step3 <- step3[ order(step3$crawl_date , decreasing = FALSE ),] competitor_price <- head(step3,1)$competitor_price step2$competitor_price = competitor_price } else { step4 <- subset(step3, date > crawl_date) step4 <- step4[ order(step4$crawl_date , decreasing = TRUE ),] competitor_price <- head(step4,1)$competitor_price step2$competitor_price = competitor_price } step2$price_leader <- ifelse(step2$price <= step2$competitor_price, 1, 0) mid_step_data = rbind(mid_step_data,step2) } all_competitive_data <- rbind(all_competitive_data,mid_step_data) } Sys.time()-start_time all_competitive_data = unique(all_competitive_data) 

Is there a way to accomplish this quickly perhaps using dplyr?

competitor_data <- data.frame(productId=c('banana', 'banana','banana', 'banana','banana', 'banana','fig', 'fig','fig', 'fig','fig', 'fig'),
                              crawl_date=c("2014-04-05", "2014-04-22", "2014-05-05", "2014-05-22","2014-06-05", "2014-06-22",
                                           "2014-05-08", "2014-06-17", "2014-06-09", "2014-06-14","2014-07-01", "2014-08-04"),
                              competitor =c("amazon","apple","google","facebook","alibaba","tencent","ebay","bestbuy","gamespot","louis vuitton","gucci","tesla"),
                              competitor_price =c(2.5,2.35,1.99,2.01,2.22,2.52,5.32,5.56,5.01,6.01,5.86,5.96), stringsAsFactors=FALSE)

competitor_data$crawl_date = as.Date(competitor_data$crawl_date)
#
product_price <- data.frame(productId=c('banana', 'banana','banana', 'banana','banana', 'banana','fig', 'fig','fig', 'fig','fig', 'fig'),
                            date=c("2014-05-05", "2014-06-22", "2014-07-05", "2014-08-31","2014-05-03", "2014-02-22",
                                   "2014-05-21", "2014-06-19", "2014-03-09", "2014-06-22","2014-07-03", "2014-09-08"),
                            price =c(2.12,2.31,2.29,2.01,2.04,2.09,5.22,5.36,5.21,5.91,5.36,5.56), stringsAsFactors=FALSE)

product_price$date = as.Date(product_price$date)

Use this function to fill a vector with NAs forward then backward

## fill in NAs
f <- function(..., lead = NA) {
  # f(NA, 1, NA, 2, NA, NA, lead = NULL)
  x <- c(lead, c(...))
  head(zoo::na.locf(zoo::na.locf(x, na.rm = FALSE), fromLast = TRUE),
       if (is.null(lead)) length(x) else -length(lead))
}

Merge the two by product and date. We pad the first price by product with an extra NA so this will effectively use the previous price when we fill in the NAs

Then do the comparison of price and competitor price. The last step is just some cleaning up to prove it is the same result

dd <- merge(product_price, competitor_data,
            by.y = c('productId', 'crawl_date'),
            by.x = c('productId', 'date'), all = TRUE)
dd$competitor_price <-
  unlist(sapply(split(dd$competitor_price, dd$productId), f))
dd$price_leader <- +(dd$price <= dd$competitor_price)
(res1 <- `rownames<-`(dd[!is.na(dd$price_leader), -4], NULL))

#    productId       date price competitor_price price_leader
# 1     banana 2014-02-22  2.09             2.50            1
# 2     banana 2014-05-03  2.04             2.35            1
# 3     banana 2014-05-05  2.12             2.35            1
# 4     banana 2014-06-22  2.31             2.22            0
# 5     banana 2014-07-05  2.29             2.52            1
# 6     banana 2014-08-31  2.01             2.52            1
# 7        fig 2014-03-09  5.21             5.32            1
# 8        fig 2014-05-21  5.22             5.32            1
# 9        fig 2014-06-19  5.36             5.56            1
# 10       fig 2014-06-22  5.91             5.56            0
# 11       fig 2014-07-03  5.36             5.86            1
# 12       fig 2014-09-08  5.56             5.96            1

res0 <- `rownames<-`(all_competitive_data[
  order(all_competitive_data$productId, all_competitive_data$date), ], NULL)

all.equal(res0, res1)
# [1] TRUE

You can change any of these steps to dplyr or data.table syntax; I don't use either one, but it should be straight-forward:

library('dplyr')
dd <- full_join(product_price, competitor_data,
                by = c(
                  'productId' = 'productId',
                  'date' = 'crawl_date'
                )
) %>% arrange(productId, date)

dd %>% group_by(productId) %>%
  mutate(
    competitor_price = f(competitor_price),
    price_leader = as.integer(price <= competitor_price)
) %>% filter(!is.na(price_leader)) %>% select(-competitor)

# Source: local data frame [12 x 5]
# Groups: productId [2]
# 
#      productId       date price competitor_price price_leader
#          <chr>     <date> <dbl>            <dbl>        <int>
#   1     banana 2014-02-22  2.09             2.50            1
#   2     banana 2014-05-03  2.04             2.35            1
#   3     banana 2014-05-05  2.12             2.35            1
#   4     banana 2014-06-22  2.31             2.22            0
#   5     banana 2014-07-05  2.29             2.52            1
#   6     banana 2014-08-31  2.01             2.52            1
#   7        fig 2014-03-09  5.21             5.32            1
#   8        fig 2014-05-21  5.22             5.32            1
#   9        fig 2014-06-19  5.36             5.56            1
#   10       fig 2014-06-22  5.91             5.56            0
#   11       fig 2014-07-03  5.36             5.86            1
#   12       fig 2014-09-08  5.56             5.96            1

The below solution uses dplyr join to match. (NOTE:I changed "crawl_date" to "date" so that dplyr join would select the matching columns automatically. It can be matched explicitly by using something like

by=c('productId'='productId', date'='crawl_date')  

as a parameter to join.

competitor_data <- data.frame(productId=c('banana', 'banana','banana', 'banana','banana', 'banana','fig', 'fig','fig', 'fig','fig', 'fig'),
                              date=c("2014-04-05", "2014-04-22", "2014-05-05", "2014-05-22","2014-06-05", "2014-06-22",
                                           "2014-05-08", "2014-06-17", "2014-06-09", "2014-06-14","2014-07-01", "2014-08-04"),
                              competitor =c("amazon","apple","google","facebook","alibaba","tencent","ebay","bestbuy","ga**strong text**mespot","louis vuitton","gucci","tesla"),
                              competitor_price =c(2.5,2.35,1.99,2.01,2.22,2.52,5.32,5.56,5.01,6.01,5.86,5.96), stringsAsFactors=FALSE)

competitor_data$date = as.Date(competitor_data$date)

product_price <- data.frame(productId=c('banana', 'banana','banana', 'banana','banana', 'banana','fig', 'fig','fig', 'fig','fig', 'fig'),
                            date=c("2014-05-05", "2014-06-22", "2014-07-05", "2014-08-31","2014-05-03", "2014-02-22",
                                   "2014-05-21", "2014-06-19", "2014-03-09", "2014-06-22","2014-07-03", "2014-09-08"),
                            price =c(2.12,2.31,2.29,2.01,2.04,2.09,5.22,5.36,5.21,5.91,5.36,5.56), stringsAsFactors=FALSE)

product_price$date = as.Date(product_price$date)

require(dplyr)
joined <- product_price %>% left_join(competitor_data)
joined$leader <- as.integer(joined$price <= joined$competitor_price)

joined

The resulting data frame is

   productId       date price competitor competitor_price leader
1     banana 2014-05-05  2.12     google             1.99      0
2     banana 2014-06-22  2.31    tencent             2.52      1
3     banana 2014-07-05  2.29       <NA>               NA     NA
4     banana 2014-08-31  2.01       <NA>               NA     NA
5     banana 2014-05-03  2.04       <NA>               NA     NA
6     banana 2014-02-22  2.09       <NA>               NA     NA
7        fig 2014-05-21  5.22       <NA>               NA     NA
8        fig 2014-06-19  5.36       <NA>               NA     NA
9        fig 2014-03-09  5.21       <NA>               NA     NA
10       fig 2014-06-22  5.91       <NA>               NA     NA
11       fig 2014-07-03  5.36       <NA>               NA     NA
12       fig 2014-09-08  5.56       <NA>               NA     NA
> 

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM