[英]Is there a more efficient way to assign categories in R than using a loop?
我目前正在處理一個大型數據集(兩個子樣本的大小分別為 2,008,384x15 和 3,072,435x15)。 這些數據是大約 200 個月內的月度食品和飲料報價。 非常感謝您對以下問題的幫助:
在原始數據中,每個觀察值(產品報價)都有一個從 1 到 151 的類別索引。我想為這些分配更廣泛的類別,即將 151 個類別分組到 19 個超類別下。 不幸的是,休息不是直接的,即一個類別,比如奶酪,包括索引 57、58、59、60和143。
我嘗試使用 for 循環進行分配——這很有效——但我的計算機需要 2-3 天才能運行它(在下面插入)因此,我非常想尋求一種更有效的分配方式這些類別。
非常感謝您的幫助! 一切順利!
sample_1993_2002$category_all <- NA
sample_1993_2002 = sample_1993_2002[order(sample_1993_2002$category_idx),]
for (i in 1:nrow(sample_1993_2002)){
if (sample_1993_2002$category_idx[i] %in% c(1, 2, 3, 4, 5, 6, 17, 19, 68,69, 70,71,72,74,75,76,77,78,80,81,82,83,84,85,86, 88, 89,90,91, 92, 93, 94, 95, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 127, 141, 151,148, 146,147)) {sample_1993_2002$category_all[i] <- "fruit_veg_cereal"
} else if (sample_1993_2002$category_idx[i] %in% c(9,10,11,12,13,14,123)) {sample_1993_2002$category_all[i] <-"bakery"
} else if (sample_1993_2002$category_idx[i] == 16) {sample_1993_2002$category_all[i] <- "pasta"
} else if (sample_1993_2002$category_idx[i] %in% c(20,21,22,23,24,25,26,27,28,29,30,31,129)) {sample_1993_2002$category_all[i] <- "meat_fresh"
} else if (sample_1993_2002$category_idx[i] %in% c(32,33,34,35,36,37,38,39, 121, 130)) {sample_1993_2002$category_all[i] <- "meat_product"
} else if (sample_1993_2002$category_idx[i] %in% c(40,41,42,43,44,45,46,47,48)) {sample_1993_2002$category_all[i] <- "fish"
} else if (sample_1993_2002$category_idx[i] %in% c(49,50,51,52,53,54,56,61,62,63,111,125)) {sample_1993_2002$category_all[i] <- "animal_prod"
} else if (sample_1993_2002$category_idx[i] %in% c(55,64,65,66,67)) {sample_1993_2002$category_all[i] <- "oils_butter"
} else if (sample_1993_2002$category_idx[i] %in% c(57,58,59,60, 143)){sample_1993_2002$category_all[i] <-"cheese"
} else if (sample_1993_2002$category_idx[i] %in% c(112,113)) {sample_1993_2002$category_all[i] <- "coffee"
} else if (sample_1993_2002$category_idx[i] %in% c(114,115)) {sample_1993_2002$category_all[i] <- "chocolate"
} else if (sample_1993_2002$category_idx[i] %in% c(116,117, 119,150)) {sample_1993_2002$category_all[i] <- "spice"
} else if (sample_1993_2002$category_idx[i] ==124) {sample_1993_2002$category_all[i] <- "jams"
} else if (sample_1993_2002$category_idx[i] == 134) {sample_1993_2002$category_all[i] <- "beer"
} else if (sample_1993_2002$category_idx[i] == 135) {sample_1993_2002$category_all[i] <-"rum"
} else if (sample_1993_2002$category_idx[i]== 136) {sample_1993_2002$category_all[i] <- "brandy"
} else if (sample_1993_2002$category_idx[i]== 137) {sample_1993_2002$category_all[i] <- "wine"
} else if (sample_1993_2002$category_idx[i] == 138) {sample_1993_2002$category_all[i] <- "spirit"
} else {sample_1993_2002$category_all[i] <- "other"}
}
是的。 不要使用循環。 R 默認情況下是矢量化的,因此,作為概念證明:使用隨機數據:
library(tidyverse)
df <- tibble(Category=sample(1:151, size=2e6, replace=TRUE))
df <- df %>%
mutate(
SuperCategory=ifelse(Category %in% c(9,10,11,12,13,14,123), "bakery", NA),
SuperCategory=ifelse(Category %in% c(16), "pasta", SuperCategory),
SuperCategory=ifelse(Category %in% c(20,21,22,23,24,25,26,27,28,29,30,31,129), "meat_fresh", SuperCategory),
SuperCategory=ifelse(Category %in% c(32,33,34,35,36,37,38,39, 121, 130), "meat_product", SuperCategory),
SuperCategory=ifelse(is.na(SuperCategory), "other", SuperCategory)
)
df %>%
group_by(SuperCategory) %>%
summarise(N=n(), .groups="drop")
# A tibble: 5 x 2
SuperCategory N
<chr> <int>
1 bakery 92606
2 meat_fresh 172751
3 meat_product 132752
4 other 1588537
5 pasta 13354
我還沒有實現你所有的超級類別定義,但你明白了。 這運行得非常快:
library(microbenchmark)
microbenchmark({
df <- df %>%
mutate(
SuperCategory=ifelse(Category %in% c(9,10,11,12,13,14,123), "bakery", NA),
SuperCategory=ifelse(Category %in% c(16), "pasta", SuperCategory),
SuperCategory=ifelse(Category %in% c(20,21,22,23,24,25,26,27,28,29,30,31,129), "meat_fresh", SuperCategory),
SuperCategory=ifelse(Category %in% c(32,33,34,35,36,37,38,39, 121, 130), "meat_product", SuperCategory),
SuperCategory=ifelse(is.na(SuperCategory), "other", SuperCategory)
)
},
times=10
)
<output deleted>
Unit: seconds
min lq mean median uq max neval
2.640724 2.655678 2.764513 2.680773 2.901454 3.002492 10
因此,200 萬行的平均值為 2.7 秒。 我不准備運行您的代碼來進行直接比較。 :=)
您也可以嘗試data.table
,對於大桌子來說非常快。 這里同樣是Limey的代碼和data.table
方式的數據:
library(data.table)
set.seed(1234)
dt <- data.frame(Category=sample(1:151, size=2e6, replace=TRUE)
result <-
data.table(dt)[,':='(SuperCategory = fifelse(Category %in% c(9:14, 125),'bakery',
fifelse(Category %in% c(16),'pasta',
fifelse(Category %in% c(20:31, 129),'meat_fresh',
fifelse(Category %in% c(32:39, 130),'meat_product',
fifelse(is.na(Category),'other','x'))))))]
這里是 100 次迭代的性能:
library(microbenchmark)
library(dplyr)
microbenchmark(
data.table = data.table(dt)[,':='(SuperCategory = fifelse(Category %in% c(9:14, 125),'bakery',
fifelse(Category %in% c(16),'pasta',
fifelse(Category %in% c(20:31, 129),'meat_fresh',
fifelse(Category %in% c(32:39, 130),'meat_product',
fifelse(is.na(Category),'other','x'))))))]
,
tidyverse = dt %>%
tibble() %>%
mutate(
SuperCategory=ifelse(Category %in% c(9,10,11,12,13,14,123), "bakery", NA),
SuperCategory=ifelse(Category %in% c(16), "pasta", SuperCategory),
SuperCategory=ifelse(Category %in% c(20,21,22,23,24,25,26,27,28,29,30,31,129), "meat_fresh", SuperCategory),
SuperCategory=ifelse(Category %in% c(32,33,34,35,36,37,38,39, 121, 130), "meat_product", SuperCategory),
SuperCategory=ifelse(is.na(SuperCategory), "other", SuperCategory)
))
Unit: milliseconds
expr min lq mean median uq max neval
data.table 373.2851 393.7549 437.7108 422.4139 440.835 743.3399 100
tidyverse 3080.3477 3170.4254 3333.6604 3254.2223 3437.303 4192.6353 100
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.