I have two data sets: competitor_data - contains competitors for a given product as well as the price and date when the competitor prices were collected.
product_price - the date of each price change.
competitor_data <- data.frame(productId=c('banana', 'banana','banana', 'banana','banana', 'banana','fig', 'fig','fig', 'fig','fig', 'fig'),
crawl_date=c("2014-04-05", "2014-04-22", "2014-05-05", "2014-05-22","2014-06-05", "2014-06-22",
"2014-05-08", "2014-06-17", "2014-06-09", "2014-06-14","2014-07-01", "2014-08-04"),
competitor =c("amazon","apple","google","facebook","alibaba","tencent","ebay","bestbuy","gamespot","louis vuitton","gucci","tesla"),
competitor_price =c(2.5,2.35,1.99,2.01,2.22,2.52,5.32,5.56,5.01,6.01,5.86,5.96), stringsAsFactors=FALSE)
competitor_data$crawl_date = as.Date(competitor_data$crawl_date)
#
product_price <- data.frame(productId=c('banana', 'banana','banana', 'banana','banana', 'banana','fig', 'fig','fig', 'fig','fig', 'fig'), date=c("2014-05-05", "2014-06-22", "2014-07-05", "2014-08-31","2014-05-03", "2014-02-22", "2014-05-21", "2014-06-19", "2014-03-09", "2014-06-22","2014-07-03", "2014-09-08"), price =c(2.12,2.31,2.29,2.01,2.04,2.09,5.22,5.36,5.21,5.91,5.36,5.56), stringsAsFactors=FALSE) product_price$date = as.Date(product_price$date)
Objective
My script below using nested for loops but it takes over 24 hours to process 5000 unique product_id:
unique_skus <- unique(product_price$productId) all_competitive_data <- data.frame() mid_step_data <- data.frame() start_time <-Sys.time() for (i in 1:length(unique_skus)){ step1 <- subset(product_price, productId == unique_skus[i]) transact_dates = unique(step1$date) for (a in 1:length(transact_dates)){ step2 <- subset(step1, date ==transact_dates[a]) step3 <- inner_join(step2,competitor_data, by='productId') if (nrow(subset(step3, date > crawl_date)) == 0){ step3 <- step3[ order(step3$crawl_date , decreasing = FALSE ),] competitor_price <- head(step3,1)$competitor_price step2$competitor_price = competitor_price } else { step4 <- subset(step3, date > crawl_date) step4 <- step4[ order(step4$crawl_date , decreasing = TRUE ),] competitor_price <- head(step4,1)$competitor_price step2$competitor_price = competitor_price } step2$price_leader <- ifelse(step2$price <= step2$competitor_price, 1, 0) mid_step_data = rbind(mid_step_data,step2) } all_competitive_data <- rbind(all_competitive_data,mid_step_data) } Sys.time()-start_time all_competitive_data = unique(all_competitive_data)
Is there a way to accomplish this quickly perhaps using dplyr?
competitor_data <- data.frame(productId=c('banana', 'banana','banana', 'banana','banana', 'banana','fig', 'fig','fig', 'fig','fig', 'fig'),
crawl_date=c("2014-04-05", "2014-04-22", "2014-05-05", "2014-05-22","2014-06-05", "2014-06-22",
"2014-05-08", "2014-06-17", "2014-06-09", "2014-06-14","2014-07-01", "2014-08-04"),
competitor =c("amazon","apple","google","facebook","alibaba","tencent","ebay","bestbuy","gamespot","louis vuitton","gucci","tesla"),
competitor_price =c(2.5,2.35,1.99,2.01,2.22,2.52,5.32,5.56,5.01,6.01,5.86,5.96), stringsAsFactors=FALSE)
competitor_data$crawl_date = as.Date(competitor_data$crawl_date)
#
product_price <- data.frame(productId=c('banana', 'banana','banana', 'banana','banana', 'banana','fig', 'fig','fig', 'fig','fig', 'fig'),
date=c("2014-05-05", "2014-06-22", "2014-07-05", "2014-08-31","2014-05-03", "2014-02-22",
"2014-05-21", "2014-06-19", "2014-03-09", "2014-06-22","2014-07-03", "2014-09-08"),
price =c(2.12,2.31,2.29,2.01,2.04,2.09,5.22,5.36,5.21,5.91,5.36,5.56), stringsAsFactors=FALSE)
product_price$date = as.Date(product_price$date)
Use this function to fill a vector with NAs forward then backward
## fill in NAs
f <- function(..., lead = NA) {
# f(NA, 1, NA, 2, NA, NA, lead = NULL)
x <- c(lead, c(...))
head(zoo::na.locf(zoo::na.locf(x, na.rm = FALSE), fromLast = TRUE),
if (is.null(lead)) length(x) else -length(lead))
}
Merge the two by product and date. We pad the first price by product with an extra NA so this will effectively use the previous price when we fill in the NAs
Then do the comparison of price and competitor price. The last step is just some cleaning up to prove it is the same result
dd <- merge(product_price, competitor_data,
by.y = c('productId', 'crawl_date'),
by.x = c('productId', 'date'), all = TRUE)
dd$competitor_price <-
unlist(sapply(split(dd$competitor_price, dd$productId), f))
dd$price_leader <- +(dd$price <= dd$competitor_price)
(res1 <- `rownames<-`(dd[!is.na(dd$price_leader), -4], NULL))
# productId date price competitor_price price_leader
# 1 banana 2014-02-22 2.09 2.50 1
# 2 banana 2014-05-03 2.04 2.35 1
# 3 banana 2014-05-05 2.12 2.35 1
# 4 banana 2014-06-22 2.31 2.22 0
# 5 banana 2014-07-05 2.29 2.52 1
# 6 banana 2014-08-31 2.01 2.52 1
# 7 fig 2014-03-09 5.21 5.32 1
# 8 fig 2014-05-21 5.22 5.32 1
# 9 fig 2014-06-19 5.36 5.56 1
# 10 fig 2014-06-22 5.91 5.56 0
# 11 fig 2014-07-03 5.36 5.86 1
# 12 fig 2014-09-08 5.56 5.96 1
res0 <- `rownames<-`(all_competitive_data[
order(all_competitive_data$productId, all_competitive_data$date), ], NULL)
all.equal(res0, res1)
# [1] TRUE
You can change any of these steps to dplyr or data.table syntax; I don't use either one, but it should be straight-forward:
library('dplyr')
dd <- full_join(product_price, competitor_data,
by = c(
'productId' = 'productId',
'date' = 'crawl_date'
)
) %>% arrange(productId, date)
dd %>% group_by(productId) %>%
mutate(
competitor_price = f(competitor_price),
price_leader = as.integer(price <= competitor_price)
) %>% filter(!is.na(price_leader)) %>% select(-competitor)
# Source: local data frame [12 x 5]
# Groups: productId [2]
#
# productId date price competitor_price price_leader
# <chr> <date> <dbl> <dbl> <int>
# 1 banana 2014-02-22 2.09 2.50 1
# 2 banana 2014-05-03 2.04 2.35 1
# 3 banana 2014-05-05 2.12 2.35 1
# 4 banana 2014-06-22 2.31 2.22 0
# 5 banana 2014-07-05 2.29 2.52 1
# 6 banana 2014-08-31 2.01 2.52 1
# 7 fig 2014-03-09 5.21 5.32 1
# 8 fig 2014-05-21 5.22 5.32 1
# 9 fig 2014-06-19 5.36 5.56 1
# 10 fig 2014-06-22 5.91 5.56 0
# 11 fig 2014-07-03 5.36 5.86 1
# 12 fig 2014-09-08 5.56 5.96 1
The below solution uses dplyr join to match. (NOTE:I changed "crawl_date" to "date" so that dplyr join would select the matching columns automatically. It can be matched explicitly by using something like
by=c('productId'='productId', date'='crawl_date')
as a parameter to join.
competitor_data <- data.frame(productId=c('banana', 'banana','banana', 'banana','banana', 'banana','fig', 'fig','fig', 'fig','fig', 'fig'),
date=c("2014-04-05", "2014-04-22", "2014-05-05", "2014-05-22","2014-06-05", "2014-06-22",
"2014-05-08", "2014-06-17", "2014-06-09", "2014-06-14","2014-07-01", "2014-08-04"),
competitor =c("amazon","apple","google","facebook","alibaba","tencent","ebay","bestbuy","ga**strong text**mespot","louis vuitton","gucci","tesla"),
competitor_price =c(2.5,2.35,1.99,2.01,2.22,2.52,5.32,5.56,5.01,6.01,5.86,5.96), stringsAsFactors=FALSE)
competitor_data$date = as.Date(competitor_data$date)
product_price <- data.frame(productId=c('banana', 'banana','banana', 'banana','banana', 'banana','fig', 'fig','fig', 'fig','fig', 'fig'),
date=c("2014-05-05", "2014-06-22", "2014-07-05", "2014-08-31","2014-05-03", "2014-02-22",
"2014-05-21", "2014-06-19", "2014-03-09", "2014-06-22","2014-07-03", "2014-09-08"),
price =c(2.12,2.31,2.29,2.01,2.04,2.09,5.22,5.36,5.21,5.91,5.36,5.56), stringsAsFactors=FALSE)
product_price$date = as.Date(product_price$date)
require(dplyr)
joined <- product_price %>% left_join(competitor_data)
joined$leader <- as.integer(joined$price <= joined$competitor_price)
joined
The resulting data frame is
productId date price competitor competitor_price leader
1 banana 2014-05-05 2.12 google 1.99 0
2 banana 2014-06-22 2.31 tencent 2.52 1
3 banana 2014-07-05 2.29 <NA> NA NA
4 banana 2014-08-31 2.01 <NA> NA NA
5 banana 2014-05-03 2.04 <NA> NA NA
6 banana 2014-02-22 2.09 <NA> NA NA
7 fig 2014-05-21 5.22 <NA> NA NA
8 fig 2014-06-19 5.36 <NA> NA NA
9 fig 2014-03-09 5.21 <NA> NA NA
10 fig 2014-06-22 5.91 <NA> NA NA
11 fig 2014-07-03 5.36 <NA> NA NA
12 fig 2014-09-08 5.56 <NA> NA NA
>
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.