[英]How to loop through a dataframe's columns in R and output quantiles() for each column as a row in new dataframe
[英]R dataframe: loop through multiple columns and row values
我是 R 的新手。我有一個包含數百萬行的大型數據框,如下所示:
Whole code1 P_1 Q_1 code2 P_2 Q_2 code3 P_3 Q_3
64 a 0.2 0.1 b 0.3 0.2 d 0.1 0.9
55 a 0.5 0.3 c 0.1 0.3 b 0.4 0.4
70 b 0.4 0.1 d 0.2 0.5 NULL 0.7 0.7
26 c 0.7 0.5 a 0.2 0.6 b 0.2 0.2
47 a 0.8 0.7 d 0.1 0.2 NULL 0.6 0.8
35 d 0.2 0.8 b 0.8 0.1 a 0.2 0.1
我正在尋找三個輸出字段,具體取決於 code1、code2 和 code3 中的值。
> Output1 : If code1 is 'a' or 'b', then Output1 = Whole*P_1, else Output1 = Whole* Q_1
> Output2 : If code1 is 'a' or 'b', then Output1 = Whole*P_2, else Output2 = Whole* Q_2
> Output3 : If code1 is 'a' or 'b', then Output1 = Whole*P_3, else Output3 = Whole* Q_3
如果可以更正下面的代碼,我們將不勝感激:
df1 %>%
for (i in 1:6) {
if (paste0("code", i) %in% c("a", "b")) {
mutate (paste0("Output", i) = Whole * paste0("P_", i) )
} else {
mutate (paste0("Output", i) = Whole * paste0("Q_", i) )
}
}
library(dplyr)
df1 %>%
mutate(
Output1 = Whole * if_else(code1 %in% c('a', 'b'), P_1, Q_1),
Output2 = Whole * if_else(code1 %in% c('a', 'b'), P_2, Q_2),
Output3 = Whole * if_else(code1 %in% c('a', 'b'), P_3, Q_3)
)
# Whole code1 P_1 Q_1 code2 P_2 Q_2 code3 P_3 Q_3 Output1 Output2 Output3
# 1 64 a 0.2 0.1 b 0.3 0.2 d 0.1 0.9 12.8 19.2 6.4
# 2 55 a 0.5 0.3 c 0.1 0.3 b 0.4 0.4 27.5 5.5 22.0
# 3 70 b 0.4 0.1 d 0.2 0.5 NULL 0.7 0.7 28.0 14.0 49.0
# 4 26 c 0.7 0.5 a 0.2 0.6 b 0.2 0.2 13.0 15.6 5.2
# 5 47 a 0.8 0.7 d 0.1 0.2 NULL 0.6 0.8 37.6 4.7 28.2
# 6 35 d 0.2 0.8 b 0.8 0.1 a 0.2 0.1 28.0 3.5 3.5
如果您的數據更通用(不是硬編碼的或多於“3”組列),那么我們可以重塑數據,進行分配,並將其整形。
library(tidyr)
df1 %>%
rename_at(vars(starts_with("code")), ~ gsub("(\\D+)", "\\1_", .)) %>%
pivot_longer(
-Whole,
names_to = c(".value", "set"),
names_sep = "_"
) %>%
mutate(Output = Whole * if_else(code %in% c("a", "b"), P, Q)) %>%
pivot_wider(
id_cols = Whole,
names_from = set,
values_from = c(code, P, Q, Output),
names_sep = "_"
)
# # A tibble: 6 x 13
# Whole code_1 code_2 code_3 P_1 P_2 P_3 Q_1 Q_2 Q_3 Output_1 Output_2 Output_3
# <int> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 64 a b d 0.2 0.3 0.1 0.1 0.2 0.9 12.8 19.2 57.6
# 2 55 a c b 0.5 0.1 0.4 0.3 0.3 0.4 27.5 16.5 22
# 3 70 b d NULL 0.4 0.2 0.7 0.1 0.5 0.7 28 35 49
# 4 26 c a b 0.7 0.2 0.2 0.5 0.6 0.2 13 5.2 5.2
# 5 47 a d NULL 0.8 0.1 0.6 0.7 0.2 0.8 37.6 9.4 37.6
# 6 35 d b a 0.2 0.8 0.2 0.8 0.1 0.1 28 28 7
作為旁注,一般而言,我建議將其保留為“長”格式,而不是重新加寬它。 這種“長”格式通常是其他類似 tidy 的函數(包括ggplot2
)的ggplot2
,並且很容易擴展到任意計數。 這將導致這樣的數據:
df1 %>%
rename_at(vars(starts_with("code")), ~ gsub("(\\D+)", "\\1_", .)) %>%
pivot_longer(
-Whole,
names_to = c(".value", "set"),
names_sep = "_"
) %>%
mutate(Output = Whole * if_else(code %in% c("a", "b"), P, Q))
# # A tibble: 18 x 6
# Whole set code P Q Output
# <int> <chr> <chr> <dbl> <dbl> <dbl>
# 1 64 1 a 0.2 0.1 12.8
# 2 64 2 b 0.3 0.2 19.2
# 3 64 3 d 0.1 0.9 57.6
# 4 55 1 a 0.5 0.3 27.5
# 5 55 2 c 0.1 0.3 16.5
# 6 55 3 b 0.4 0.4 22
# 7 70 1 b 0.4 0.1 28
# 8 70 2 d 0.2 0.5 35
# 9 70 3 NULL 0.7 0.7 49
# 10 26 1 c 0.7 0.5 13
# 11 26 2 a 0.2 0.6 5.2
# 12 26 3 b 0.2 0.2 5.2
# 13 47 1 a 0.8 0.7 37.6
# 14 47 2 d 0.1 0.2 9.4
# 15 47 3 NULL 0.6 0.8 37.6
# 16 35 1 d 0.2 0.8 28
# 17 35 2 b 0.8 0.1 28
# 18 35 3 a 0.2 0.1 7
(矮得多。)
我們可以使用map2
。 獲取具有“P”、“Q”的列的名稱。 _
后面是數字。 然后使用map2
遍歷相應的列,應用轉換邏輯並將列與原始數據集綁定
library(dplyr)
library(purrr)
library(stringr)
ps <- names(df1)[str_detect(names(df1), "^P_\\d+$")]
qs <- names(df1)[str_detect(names(df1), "^Q_\\d+$")]
map2_dfc(ps, qs, ~ df1 %>%
transmute(Output = Whole *
case_when(code1 %in% c('a', 'b') ~ !! rlang::sym(.x),
TRUE ~ !! rlang::sym(.y)))) %>%
rename_all(~ str_remove(., fixed("..."))) %>%
bind_cols(df1, .)
# Whole code1 P_1 Q_1 code2 P_2 Q_2 code3 P_3 Q_3 Output1 Output2 Output3
#1 64 a 0.2 0.1 b 0.3 0.2 d 0.1 0.9 12.8 19.2 6.4
#2 55 a 0.5 0.3 c 0.1 0.3 b 0.4 0.4 27.5 5.5 22.0
#3 70 b 0.4 0.1 d 0.2 0.5 NULL 0.7 0.7 28.0 14.0 49.0
#4 26 c 0.7 0.5 a 0.2 0.6 b 0.2 0.2 13.0 15.6 5.2
#5 47 a 0.8 0.7 d 0.1 0.2 NULL 0.6 0.8 37.6 4.7 28.2
#6 35 d 0.2 0.8 b 0.8 0.1 a 0.2 0.1 28.0 3.5 3.5
df1 <- structure(list(Whole = c(64L, 55L, 70L, 26L, 47L, 35L), code1 = c("a",
"a", "b", "c", "a", "d"), P_1 = c(0.2, 0.5, 0.4, 0.7, 0.8, 0.2
), Q_1 = c(0.1, 0.3, 0.1, 0.5, 0.7, 0.8), code2 = c("b", "c",
"d", "a", "d", "b"), P_2 = c(0.3, 0.1, 0.2, 0.2, 0.1, 0.8), Q_2 = c(0.2,
0.3, 0.5, 0.6, 0.2, 0.1), code3 = c("d", "b", "NULL", "b", "NULL",
"a"), P_3 = c(0.1, 0.4, 0.7, 0.2, 0.6, 0.2), Q_3 = c(0.9, 0.4,
0.7, 0.2, 0.8, 0.1)), class = "data.frame", row.names = c(NA,
-6L))
根據您有多少行,這種 data.table 方法可能會更快。
library(data.table)
setDT(df1)[,Logical := (code1 == "a" | code1 == "b")][
,`:=`(Output1 = numeric(),Output2 = numeric(), Output3 = numeric())
][Logical == TRUE,`:=`(Output1 = Whole * P_1,
Output2 = Whole * P_2,
Output3 = Whole * P_3)
][Logical == FALSE,`:=`(Output1 = Whole * Q_1,
Output2 = Whole * Q_2,
Output3 = Whole * Q_3)
][,.(Output1,Output2,Output3)]
Output1 Output2 Output3
1: 12.8 19.2 6.4
2: 27.5 5.5 22.0
3: 28.0 14.0 49.0
4: 13.0 15.6 5.2
5: 37.6 4.7 28.2
6: 28.0 3.5 3.5
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.