簡體   English   中英

R 數據框:遍歷多個列和行值

[英]R dataframe: loop through multiple columns and row values

我是 R 的新手。我有一個包含數百萬行的大型數據框,如下所示:

Whole   code1       P_1   Q_1   code2   P_2   Q_2   code3   P_3   Q_3
    64      a       0.2   0.1   b      0.3    0.2   d      0.1    0.9
    55      a       0.5   0.3   c      0.1    0.3   b      0.4    0.4
    70      b       0.4   0.1   d      0.2    0.5   NULL   0.7    0.7
    26      c       0.7   0.5   a      0.2    0.6   b      0.2    0.2
    47      a       0.8   0.7   d      0.1    0.2   NULL   0.6    0.8
    35      d       0.2   0.8   b      0.8    0.1   a      0.2    0.1

我正在尋找三個輸出字段,具體取決於 code1、code2 和 code3 中的值。

> Output1   :   If code1 is 'a' or 'b', then Output1 = Whole*P_1, else Output1 = Whole* Q_1
> Output2   :  If code1 is 'a' or 'b', then Output2 = Whole*P_2, else Output2 = Whole* Q_2
> Output3   :  If code1 is 'a' or 'b', then Output3 = Whole*P_3, else Output3 = Whole* Q_3

如果可以更正下面的代碼,我們將不勝感激:

 # NOTE(review): this is the asker's original attempt, kept verbatim; it does
 # not run. Problems visible in the code itself:
 #   * a `for` statement is not a function call, so it cannot be the
 #     right-hand side of `%>%`;
 #   * `paste0("code", i)` yields the string "code1", "code2", ... and that
 #     string (not the column of that name) is what gets tested with `%in%`;
 #   * `paste0(...) = ...` is not a valid argument name inside `mutate()`, and
 #     `Whole * paste0("P_", i)` multiplies by a string rather than a column;
 #   * the loop runs over 1:6, but the sample data shows only three
 #     code/P/Q column groups.
 df1 %>%    
  for (i in 1:6) {
    # Intended: branch on whether the i-th code column holds "a" or "b".
    if (paste0("code", i) %in% c("a", "b")) {
      # Intended: Output<i> = Whole * P_<i>
      mutate (paste0("Output", i) = Whole * paste0("P_", i) )
    } else {    
      # Intended: Output<i> = Whole * Q_<i>
      mutate (paste0("Output", i) = Whole * paste0("Q_", i) )
    }   
  } 
library(dplyr)
# All three outputs are gated on `code1`: rows whose `code1` is "a" or "b"
# are weighted by the P_<n> column, every other row by the Q_<n> column.
df1 %>%
  mutate(
    Output1 = if_else(code1 %in% c("a", "b"), Whole * P_1, Whole * Q_1),
    Output2 = if_else(code1 %in% c("a", "b"), Whole * P_2, Whole * Q_2),
    Output3 = if_else(code1 %in% c("a", "b"), Whole * P_3, Whole * Q_3)
  )
#   Whole code1 P_1 Q_1 code2 P_2 Q_2 code3 P_3 Q_3 Output1 Output2 Output3
# 1    64     a 0.2 0.1     b 0.3 0.2     d 0.1 0.9    12.8    19.2     6.4
# 2    55     a 0.5 0.3     c 0.1 0.3     b 0.4 0.4    27.5     5.5    22.0
# 3    70     b 0.4 0.1     d 0.2 0.5  NULL 0.7 0.7    28.0    14.0    49.0
# 4    26     c 0.7 0.5     a 0.2 0.6     b 0.2 0.2    13.0    15.6     5.2
# 5    47     a 0.8 0.7     d 0.1 0.2  NULL 0.6 0.8    37.6     4.7    28.2
# 6    35     d 0.2 0.8     b 0.8 0.1     a 0.2 0.1    28.0     3.5     3.5

如果您的數據更通用(列不是硬編碼的,或者列組多於 3 組),那麼我們可以先將數據重塑為長格式,進行計算,再將其重塑回寬格式。

library(tidyr)
# Reshape to long form (one row per original row and column group), compute
# Output once, then reshape back to wide form.
#
# `Whole` is a measurement, not a key: two input rows may share the same
# value. Using it alone as `id_cols` would make pivot_wider() merge such rows
# into list-columns, so an explicit row id is carried through the reshape and
# dropped again at the end.
df1 %>%
  mutate(row_id = row_number()) %>%
  # code1 -> code_1 etc., so every column splits as "<name>_<set>".
  rename_with(~ gsub("(\\D+)", "\\1_", .x), .cols = starts_with("code")) %>%
  pivot_longer(
    -c(row_id, Whole),
    names_to = c(".value", "set"),
    names_sep = "_"
  ) %>%
  # Here each set is gated on its own code column (code_1 for set 1, ...).
  mutate(Output = Whole * if_else(code %in% c("a", "b"), P, Q)) %>%
  pivot_wider(
    id_cols = c(row_id, Whole),
    names_from = set,
    values_from = c(code, P, Q, Output),
    names_sep = "_"
  ) %>%
  select(-row_id)
# # A tibble: 6 x 13
#   Whole code_1 code_2 code_3   P_1   P_2   P_3   Q_1   Q_2   Q_3 Output_1 Output_2 Output_3
#   <int> <chr>  <chr>  <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>    <dbl>    <dbl>    <dbl>
# 1    64 a      b      d        0.2   0.3   0.1   0.1   0.2   0.9     12.8     19.2     57.6
# 2    55 a      c      b        0.5   0.1   0.4   0.3   0.3   0.4     27.5     16.5     22  
# 3    70 b      d      NULL     0.4   0.2   0.7   0.1   0.5   0.7     28       35       49  
# 4    26 c      a      b        0.7   0.2   0.2   0.5   0.6   0.2     13        5.2      5.2
# 5    47 a      d      NULL     0.8   0.1   0.6   0.7   0.2   0.8     37.6      9.4     37.6
# 6    35 d      b      a        0.2   0.8   0.2   0.8   0.1   0.1     28       28        7  

作為旁注,一般而言,我建議將其保留為“長”格式,而不是重新加寬它。 這種“長”格式通常是其他 tidy 風格函數(包括 ggplot2)所偏好的格式,並且很容易擴展到任意數量的列組。 這將得到如下數據:

# Long form only: one row per original row and column group, with Output
# computed from that group's own code / P / Q values.
df1 %>%
  rename_with(~ gsub("(\\D+)", "\\1_", .x), .cols = starts_with("code")) %>%
  pivot_longer(
    cols = -Whole,
    names_sep = "_",
    names_to = c(".value", "set")
  ) %>%
  mutate(Output = if_else(code %in% c("a", "b"), Whole * P, Whole * Q))
# # A tibble: 18 x 6
#    Whole set   code      P     Q Output
#    <int> <chr> <chr> <dbl> <dbl>  <dbl>
#  1    64 1     a       0.2   0.1   12.8
#  2    64 2     b       0.3   0.2   19.2
#  3    64 3     d       0.1   0.9   57.6
#  4    55 1     a       0.5   0.3   27.5
#  5    55 2     c       0.1   0.3   16.5
#  6    55 3     b       0.4   0.4   22  
#  7    70 1     b       0.4   0.1   28  
#  8    70 2     d       0.2   0.5   35  
#  9    70 3     NULL    0.7   0.7   49  
# 10    26 1     c       0.7   0.5   13  
# 11    26 2     a       0.2   0.6    5.2
# 12    26 3     b       0.2   0.2    5.2
# 13    47 1     a       0.8   0.7   37.6
# 14    47 2     d       0.1   0.2    9.4
# 15    47 3     NULL    0.6   0.8   37.6
# 16    35 1     d       0.2   0.8   28  
# 17    35 2     b       0.8   0.1   28  
# 18    35 3     a       0.2   0.1    7  

(代碼短得多。)

我們可以先取得名稱為“P”或“Q”、後接 _ 和數字的列名,然後使用 map2 遍歷相應的列,應用轉換邏輯,並將結果列與原始數據集綁定。

library(dplyr)
library(purrr)
library(stringr)
# Find the P_<n> columns, then derive the partner Q_<n> column and the
# Output<n> name from the same suffix. Pairing by suffix (rather than by the
# position of independently matched P and Q columns) keeps P_<n> with Q_<n>
# whatever the column order, and naming the outputs directly avoids relying
# on the "...<k>" suffixes that name repair adds to duplicated column names.
ps <- str_subset(names(df1), "^P_\\d+$")
qs <- str_replace(ps, "^P_", "Q_")
out_names <- str_replace(ps, "^P_", "Output")
# Fail early if some P_<n> column has no matching Q_<n> column.
stopifnot(all(qs %in% names(df1)))
# As in the expected output below, every set is gated on `code1`.
use_p <- df1$code1 %in% c("a", "b")
map2(ps, qs, ~ df1$Whole * if_else(use_p, df1[[.x]], df1[[.y]])) %>%
  set_names(out_names) %>%
  as_tibble() %>%
  bind_cols(df1, .)
#   Whole code1 P_1 Q_1 code2 P_2 Q_2 code3 P_3 Q_3 Output1 Output2 Output3
#1    64     a 0.2 0.1     b 0.3 0.2     d 0.1 0.9    12.8    19.2     6.4
#2    55     a 0.5 0.3     c 0.1 0.3     b 0.4 0.4    27.5     5.5    22.0
#3    70     b 0.4 0.1     d 0.2 0.5  NULL 0.7 0.7    28.0    14.0    49.0
#4    26     c 0.7 0.5     a 0.2 0.6     b 0.2 0.2    13.0    15.6     5.2
#5    47     a 0.8 0.7     d 0.1 0.2  NULL 0.6 0.8    37.6     4.7    28.2
#6    35     d 0.2 0.8     b 0.8 0.1     a 0.2 0.1    28.0     3.5     3.5

數據

# Example data: six rows, a `Whole` total and three (code, P, Q) column groups.
# Note that the missing codes in `code3` are the literal string "NULL".
df1 <- data.frame(
  Whole = c(64L, 55L, 70L, 26L, 47L, 35L),
  code1 = c("a", "a", "b", "c", "a", "d"),
  P_1 = c(0.2, 0.5, 0.4, 0.7, 0.8, 0.2),
  Q_1 = c(0.1, 0.3, 0.1, 0.5, 0.7, 0.8),
  code2 = c("b", "c", "d", "a", "d", "b"),
  P_2 = c(0.3, 0.1, 0.2, 0.2, 0.1, 0.8),
  Q_2 = c(0.2, 0.3, 0.5, 0.6, 0.2, 0.1),
  code3 = c("d", "b", "NULL", "b", "NULL", "a"),
  P_3 = c(0.1, 0.4, 0.7, 0.2, 0.6, 0.2),
  Q_3 = c(0.9, 0.4, 0.7, 0.2, 0.8, 0.1),
  stringsAsFactors = FALSE
)

根據您有多少行,這種 data.table 方法可能會更快。

library(data.table)
# setDT() converts df1 to a data.table by reference; the `Logical` helper and
# the three Output columns are added to df1 in place.
#
# `%in%` is used for the flag because it never returns NA: a row with a
# missing `code1` takes the Q branch, matching the dplyr versions, instead of
# falling through both the TRUE and FALSE filters and keeping NA outputs.
setDT(df1)[, Logical := code1 %in% c("a", "b")][
  , `:=`(
    Output1 = Whole * fifelse(Logical, P_1, Q_1),
    Output2 = Whole * fifelse(Logical, P_2, Q_2),
    Output3 = Whole * fifelse(Logical, P_3, Q_3)
  )
][, .(Output1, Output2, Output3)]

   Output1 Output2 Output3
1:    12.8    19.2     6.4
2:    27.5     5.5    22.0
3:    28.0    14.0    49.0
4:    13.0    15.6     5.2
5:    37.6     4.7    28.2
6:    28.0     3.5     3.5

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM