[英]How to make R loop faster?
我正在嘗試使用以下函數將嵌套的json文件轉換為R中的數據框:
#' Flatten nested tender/award records into one row per award.
#'
#' @param data A data frame whose `data` column is itself a data frame with
#'   the columns `id`, `awards` (a list with one element per record, each
#'   holding `status` and a `value` whose first element is the award amount)
#'   and `value$amount` (the tender amount).
#' @return A data frame with the columns `id`, `awardAmount`, `awardStatus`
#'   and `tenderAmount`, one row per award; `id` and `tenderAmount` are
#'   repeated for every award of the same record. A zero-row data frame with
#'   those columns is returned when there are no awards at all.
rf1 <- function(data) {
  records <- data$data
  # seq_len() yields an empty index for zero-row input (1:0 would not).
  record_idx <- seq_len(nrow(data))

  # Gather the per-record pieces first; the data frame is assembled exactly
  # once below instead of being grown with rbind() on every iteration, which
  # copies the whole accumulated result each time.
  status_by_record <- lapply(record_idx, function(i) {
    unlist(records$awards[[i]]$status)
  })
  amount_by_record <- lapply(record_idx, function(i) {
    unlist(records$awards[[i]]$value[[1]])
  })
  awards_per_record <- lengths(status_by_record)

  if (sum(awards_per_record) == 0L) {
    return(
      data.frame(
        id = character(0),
        awardAmount = character(0),
        awardStatus = character(0),
        tenderAmount = character(0)
      )
    )
  }

  # The columns are concatenated across records, so a record whose status and
  # amount counts differ would silently shift every later row.
  if (!identical(lengths(amount_by_record), awards_per_record)) {
    stop("every award needs exactly one status and one amount", call. = FALSE)
  }

  tender_by_record <- lapply(record_idx, function(i) {
    rep(records$value$amount[[i]], awards_per_record[i])
  })

  data.frame(
    id = rep(records$id[record_idx], times = awards_per_record),
    awardAmount = unlist(amount_by_record),
    awardStatus = unlist(status_by_record),
    tenderAmount = unlist(tender_by_record)
  )
}
這是我正在使用的json文件的示例:
{
"data" : {
"id" : "3f066cdd81cf4944b42230ed56a35bce",
"awards" : [
{
"status" : "unsuccessful",
"value" : {
"amount" : 76
}
},
{
"status" : "active",
"value" : {
"amount" : 41220
}
}
],
"value" : {
"amount" : 48000
}
}
},
{
"data" : {
"id" : "9507162e6ee24cef8e0ea75d46a81a30",
"awards" : [
{
"status" : "active",
"value" : {
"amount" : 2650
}
}
],
"value" : {
"amount" : 2650
}
}
},
{
"data" : {
"id" : "a516ac43240c4ec689f3392cf0c17575",
"awards" : [
{
"status" : "active",
"value" : {
"amount" : 2620
}
}
],
"value" : {
"amount" : 2650
}
}
}
正如您所看到的,這三個觀察值的獎項(awards)數量不同(第一個觀察值有兩個獎項,而另外兩個各只有一個獎項)。 由於我想得到表格形式的數據框,對於同一個觀察值的多個獎項,我會在本來為空的單元格中重復填入相同的信息,例如 data$id 和 data$value$amount。
json文件有大約100,000個觀察值,因此返回一個數據框需要極長的時間(我已經等了30多分鍾,但仍然沒有結果)。 我認為也許可以並行運行所有以 temp 開頭的那些行,從而節省大量時間,但我不知道如何在我的代碼中實現它。
為了讓您了解我想要的輸出,我把函數中的循環限制為 for (i in 1:3),這產生了以下數據框。 我的問題是如何對全部100,000個觀察值做同樣的事情。 注意,上面的json示例與示例輸出相對應。
期望的輸出:
這絕不是優雅的,但似乎有效:
library(jsonlite)
library(purrr)
library(dplyr)
json_data <- '[{"data":{"id":"3f066cdd81cf4944b42230ed56a35bce","awards":[{"status":"unsuccessful","value":{"amount":76}},{"status":"active","value":{"amount":41220}}],"value":{"amount":48000}}},{"data":{"id":"9507162e6ee24cef8e0ea75d46a81a30","awards":[{"status":"active","value":{"amount":2650}}],"value":{"amount":2650}}},{"data":{"id":"a516ac43240c4ec689f3392cf0c17575","awards":[{"status":"active","value":{"amount":2620}}],"value":{"amount":2650}}}] '

# Parse the JSON text and keep only the nested `data` records.
parsed_json_data <- fromJSON(json_data)$data

# Awards table: for every tender id, flatten its awards and prefix them with
# an `id` column; then stack the per-id tables and give the award columns
# unambiguous names.
awards <- parsed_json_data$id %>%
  map2(
    parsed_json_data$awards,
    function(tender_id, award_tbl) {
      id_column <- data.frame(
        "id" = rep(tender_id, nrow(award_tbl)),
        stringsAsFactors = FALSE
      )
      # as.list() + as.data.frame() un-nests the nested `value` column.
      award_columns <- as.data.frame(as.list(award_tbl))
      bind_cols(id_column, award_columns)
    }
  ) %>%
  bind_rows() %>%
  rename(awards_status = status, awards_amount = amount)

# Tender-level table: drop the awards column (it now lives in `awards`) and
# un-nest what is left.
parsed_json_data[["awards"]] <- NULL
parsed_json_data <- parsed_json_data %>%
  as.list() %>%
  as.data.frame(stringsAsFactors = FALSE)

# Denormalise: repeat the tender-level columns on every award row.
final_data_frame <- inner_join(parsed_json_data, awards, by = "id")
final_data_frame
# id amount awards_status awards_amount
# 1 3f066cdd81cf4944b42230ed56a35bce 48000 unsuccessful 76
# 2 3f066cdd81cf4944b42230ed56a35bce 48000 active 41220
# 3 9507162e6ee24cef8e0ea75d46a81a30 2650 active 2650
# 4 a516ac43240c4ec689f3392cf0c17575 2650 active 2620
另一種方法是把這項工作從R中移出,改為重新編寫您的mongodb查詢。
如果這是您在mongodb中的數據
在mongo shell中,您可以編寫類似下面這樣的查詢
// Produce one output document per award, carrying the tender-level id and
// amount alongside the award's own status and amount.
//
// No $group stage is used: grouping on (id, status) would merge separate
// awards of one tender that share a status, and summing the tender amount
// inside such a group would count it once per merged award.
db.json.aggregate([
    // One copy of the tender document per element of data.awards.
    { "$unwind" : "$data.awards" },
    // Keep only the four flat output fields.
    { "$project" : {
        "_id" : 0,
        "awardAmount" : "$data.awards.value.amount",
        "tenderAmount" : "$data.value.amount",
        "id" : "$data.id",
        "status" : "$data.awards.status"
    } }
])
(注意:我不是mongodb專家,所以可能會有更簡潔的方式來寫這個)
你也可以在R中使用它
library(mongolite)
# Connect to the `json` collection of the `test` database.
mongo <- mongo(collection = "json", db = "test")

# Aggregation pipeline (passed to the server as JSON text):
#   1. $unwind  - one copy of each tender document per element of data.awards
#   2. $project - keep only the four flat output fields
# No $group stage is used: grouping on (id, status) would merge separate
# awards of one tender that share a status, and summing the tender amount
# inside such a group would count it once per merged award.
qry <- '[
    { "$unwind" : "$data.awards" },
    { "$project" : {
        "_id" : 0,
        "awardAmount" : "$data.awards.value.amount",
        "tenderAmount" : "$data.value.amount",
        "id" : "$data.id",
        "status" : "$data.awards.status"
      }
    }
]'

# One row per award; the order of the rows is not guaranteed.
df <- mongo$aggregate(pipeline = qry)
df
# awardAmount tenderAmount id status
# 1 2620 2650 a516ac43240c4ec689f3392cf0c17575 active
# 2 41220 48000 3f066cdd81cf4944b42230ed56a35bce active
# 3 2650 2650 9507162e6ee24cef8e0ea75d46a81a30 active
# 4 76 48000 3f066cdd81cf4944b42230ed56a35bce unsuccessful
這可能是最簡單的方法。 它不使用JSON解析,而是使用一堆正則表達式
但是,我同意SymbolixAU,在mongo查詢中這樣做是可行的。
# Regex-based flattening of the pretty-printed JSON file: no JSON parser is
# used, so the patterns below rely on the exact `"key" : value` spacing shown
# in the sample file.

# Load the JSON file ("file.json") as a single string.
jsonAsString <- readChar("file.json", file.info("file.json")$size)

# Split into one chunk per tender; chunks without an "id" (the leading header
# before the first record) are dropped.
dataChunks <- unlist(strsplit(jsonAsString, '"data" : \\{'))
dataChunks <- dataChunks[grepl("id", dataChunks)]

# Isolate the "awards" array of every tender.
awardSubChunks <- gsub('.*("awards".*?}.*?}.*?]).*', "\\1", dataChunks)

# Scrape the status values out of the award sub-chunks. Everything up to the
# closing quote is taken, so statuses containing punctuation are not cut
# short.
statusIndex <- gregexpr('(?<="status" : ")([^"]*)', awardSubChunks,
                        perl = TRUE)
status <- unlist(regmatches(awardSubChunks, statusIndex))

# Scrape the bid amounts out of the award sub-chunks. Sign, decimal point and
# exponent characters are accepted so that values such as 76.5 or -3 are not
# truncated.
bidAmountIndex <- gregexpr('(?<="amount" : )([-+.[:alnum:]]*)', awardSubChunks,
                           perl = TRUE)
bidAmount <- unlist(regmatches(awardSubChunks, bidAmountIndex))

# What remains after removing the award sub-chunk holds the tender-level id
# and amount.
idTenderAmount <- gsub('"awards".*?}.*?}.*?]', "", dataChunks)
id <- gsub('.*"id" : "([[:alnum:]]*)".*', "\\1", idTenderAmount)
tenderAmount <- gsub('.*"amount" : ([-+.[:alnum:]]*).*', "\\1", idTenderAmount)

# Number of bids per tender, i.e. how often id and tenderAmount have to be
# repeated. gregexpr() returns -1 (a length-one vector) when nothing matches,
# so real matches are counted instead of taking the vector length.
numBidsPerId <- gregexpr("value", awardSubChunks)
numBidsTotal <- vapply(numBidsPerId, function(m) sum(m > 0L), integer(1))

# Put everything together: one row per bid. Amounts stay character strings,
# exactly as scraped.
df <- data.frame(id = rep(id, numBidsTotal),
                 tenderAmount = rep(tenderAmount, numBidsTotal),
                 status = status,
                 bidAmount = bidAmount)
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.