在 R 中分配类别是否有比使用循环更有效的方法？

Question

我目前正在处理一个大型数据集（两个子样本的大小分别为 2,008,384x15 和 3,072,435x15）。 这些数据是大约 200 个月内的月度食品和饮料报价。 非常感谢您对以下问题的帮助：

在原始数据中，每个观察值（产品报价）都有一个从 1 到 151 的类别索引。我想为这些分配更广泛的类别，即将 151 个类别分组到 19 个超类别下。 不幸的是，休息不是直接的，即一个类别，比如奶酪，包括索引 57、58、59、60和143。

我尝试使用 for 循环进行分配——这很有效——但我的计算机需要 2-3 天才能运行它（在下面插入）因此，我非常想寻求一种更有效的分配方式这些类别。

非常感谢您的帮助！ 一切顺利！

sample_1993_2002$category_all <- NA
sample_1993_2002 = sample_1993_2002[order(sample_1993_2002$category_idx),]

for (i in 1:nrow(sample_1993_2002)){
  
  if (sample_1993_2002$category_idx[i] %in% c(1, 2, 3, 4, 5, 6, 17, 19, 68,69, 70,71,72,74,75,76,77,78,80,81,82,83,84,85,86, 88, 89,90,91, 92, 93, 94, 95, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 127, 141, 151,148, 146,147)) {sample_1993_2002$category_all[i] <- "fruit_veg_cereal"
  } else if (sample_1993_2002$category_idx[i] %in% c(9,10,11,12,13,14,123)) {sample_1993_2002$category_all[i] <-"bakery"
  } else if (sample_1993_2002$category_idx[i] == 16) {sample_1993_2002$category_all[i] <- "pasta" 
  } else if (sample_1993_2002$category_idx[i] %in% c(20,21,22,23,24,25,26,27,28,29,30,31,129)) {sample_1993_2002$category_all[i] <- "meat_fresh"
  } else if (sample_1993_2002$category_idx[i] %in% c(32,33,34,35,36,37,38,39, 121, 130)) {sample_1993_2002$category_all[i] <- "meat_product"
  } else if (sample_1993_2002$category_idx[i] %in% c(40,41,42,43,44,45,46,47,48)) {sample_1993_2002$category_all[i] <- "fish"
  } else if (sample_1993_2002$category_idx[i] %in% c(49,50,51,52,53,54,56,61,62,63,111,125)) {sample_1993_2002$category_all[i] <- "animal_prod"
  } else if (sample_1993_2002$category_idx[i] %in% c(55,64,65,66,67)) {sample_1993_2002$category_all[i] <-  "oils_butter"
  } else if (sample_1993_2002$category_idx[i] %in% c(57,58,59,60, 143)){sample_1993_2002$category_all[i] <-"cheese" 
  } else if (sample_1993_2002$category_idx[i] %in% c(112,113)) {sample_1993_2002$category_all[i] <- "coffee" 
  } else if (sample_1993_2002$category_idx[i] %in% c(114,115)) {sample_1993_2002$category_all[i] <- "chocolate" 
  } else if (sample_1993_2002$category_idx[i] %in% c(116,117, 119,150)) {sample_1993_2002$category_all[i] <- "spice"
  } else if (sample_1993_2002$category_idx[i] ==124) {sample_1993_2002$category_all[i] <- "jams"
  } else if (sample_1993_2002$category_idx[i] == 134) {sample_1993_2002$category_all[i] <- "beer"
  } else if (sample_1993_2002$category_idx[i] == 135) {sample_1993_2002$category_all[i] <-"rum"
  } else if (sample_1993_2002$category_idx[i]== 136) {sample_1993_2002$category_all[i] <- "brandy"
  } else if (sample_1993_2002$category_idx[i]== 137) {sample_1993_2002$category_all[i] <- "wine"
  } else if (sample_1993_2002$category_idx[i] == 138) {sample_1993_2002$category_all[i] <- "spirit"
  } else {sample_1993_2002$category_all[i] <- "other"}
}

Answer 1

是的。 不要使用循环。 R 默认情况下是矢量化的，因此，作为概念证明：使用随机数据：

library(tidyverse)

df <- tibble(Category=sample(1:151, size=2e6, replace=TRUE))

df <- df %>% 
       mutate(
         SuperCategory=ifelse(Category %in% c(9,10,11,12,13,14,123), "bakery", NA),
         SuperCategory=ifelse(Category %in% c(16), "pasta", SuperCategory),
         SuperCategory=ifelse(Category %in% c(20,21,22,23,24,25,26,27,28,29,30,31,129), "meat_fresh", SuperCategory),
         SuperCategory=ifelse(Category %in% c(32,33,34,35,36,37,38,39, 121, 130), "meat_product", SuperCategory),
         SuperCategory=ifelse(is.na(SuperCategory), "other", SuperCategory)
      )
df %>% 
  group_by(SuperCategory) %>% 
  summarise(N=n(), .groups="drop")
# A tibble: 5 x 2
  SuperCategory       N
  <chr>           <int>
1 bakery          92606
2 meat_fresh     172751
3 meat_product   132752
4 other         1588537
5 pasta           13354

我还没有实现你所有的超级类别定义，但你明白了。 这运行得非常快：

library(microbenchmark)

microbenchmark({
df <- df %>% 
       mutate(
         SuperCategory=ifelse(Category %in% c(9,10,11,12,13,14,123), "bakery", NA),
         SuperCategory=ifelse(Category %in% c(16), "pasta", SuperCategory),
         SuperCategory=ifelse(Category %in% c(20,21,22,23,24,25,26,27,28,29,30,31,129), "meat_fresh", SuperCategory),
         SuperCategory=ifelse(Category %in% c(32,33,34,35,36,37,38,39, 121, 130), "meat_product", SuperCategory),
         SuperCategory=ifelse(is.na(SuperCategory), "other", SuperCategory)
      )
 },
 times=10
)

<output deleted>

Unit: seconds
                                                                                                                                                                                                                                                                                                                                                                                                                                                                               
      min       lq     mean   median       uq      max neval
 2.640724 2.655678 2.764513 2.680773 2.901454 3.002492    10

因此，200 万行的平均值为 2.7 秒。 我不准备运行您的代码来进行直接比较。 :=)

Answer 2

您也可以尝试data.table ，对于大桌子来说非常快。 这里同样是Limey的代码和data.table方式的数据：

library(data.table)
set.seed(1234)
dt <- data.frame(Category=sample(1:151, size=2e6, replace=TRUE)

result <-
 data.table(dt)[,':='(SuperCategory = fifelse(Category %in% c(9:14, 125),'bakery',
                                       fifelse(Category %in% c(16),'pasta',
                                         fifelse(Category %in% c(20:31, 129),'meat_fresh',
                                           fifelse(Category %in% c(32:39, 130),'meat_product',
                                             fifelse(is.na(Category),'other','x'))))))]

这里是 100 次迭代的性能：

library(microbenchmark)
library(dplyr)

microbenchmark(
data.table =  data.table(dt)[,':='(SuperCategory = fifelse(Category %in% c(9:14, 125),'bakery',
                                                  fifelse(Category %in% c(16),'pasta',
                                                  fifelse(Category %in% c(20:31, 129),'meat_fresh',
                                                  fifelse(Category %in% c(32:39, 130),'meat_product',
                                                  fifelse(is.na(Category),'other','x'))))))]

,

tidyverse = dt %>% 
 tibble() %>%
 mutate(
  SuperCategory=ifelse(Category %in% c(9,10,11,12,13,14,123), "bakery", NA),
  SuperCategory=ifelse(Category %in% c(16), "pasta", SuperCategory),
  SuperCategory=ifelse(Category %in% c(20,21,22,23,24,25,26,27,28,29,30,31,129), "meat_fresh", SuperCategory),
  SuperCategory=ifelse(Category %in% c(32,33,34,35,36,37,38,39, 121, 130), "meat_product", SuperCategory),
  SuperCategory=ifelse(is.na(SuperCategory), "other", SuperCategory)
 ))

Unit: milliseconds
       expr       min        lq      mean    median       uq       max neval
 data.table  373.2851  393.7549  437.7108  422.4139  440.835  743.3399   100
  tidyverse 3080.3477 3170.4254 3333.6604 3254.2223 3437.303 4192.6353   100

在 R 中分配类别是否有比使用循环更有效的方法？

问题描述

2 个解决方案

解决方案1
2 2021-08-17 08:38:04

解决方案2
2 已采纳 2021-08-17 09:03:48

在 R 中分配类别是否有比使用循环更有效的方法？

问题描述

2 个解决方案

解决方案1 2 2021-08-17 08:38:04

解决方案2 2 已采纳 2021-08-17 09:03:48

解决方案1
2 2021-08-17 08:38:04

解决方案2
2 已采纳 2021-08-17 09:03:48