簡體   English   中英

“R:dplyr:如何添加一個將值除以第一組值的列(類似於vlookup)”

[英]“R: dplyr: How to add a column that divides a value by the first group of values (kind of like a vlookup)”

我正在嘗試分析我的數據以按計划比較db_perk。 我想創建一個列,它將db_perk除以plan列中第一個計划的db_perk。 這樣我可以根據計划看到db_perk的差異。

我想把這個叫做SQL_Table數據:

   plan   gender marital_status accel_type extension_type inflation iss_age   dur    db_perk
   <chr>  <chr>  <chr>          <chr>        <chr>        <chr>     <chr>    <dbl>   <dbl>
 1 BasicF   F    Married        A.24         E.0          AC3.EC3    40       1      0.20
 2 BasicF   F    Married        A.24         E.0          AC3.EC3    40       2      0.25
 3 BasicF   F    Married        A.24         E.0          AC3.EC3    40       3      0.30
 4 BasicF   F    Married        A.24         E.0          AC3.EC3    40       4      0.40
 5 BasicF   M    Single         A.36         E.24         AC3.EC3    40       1      0.15
 6 GradedF  F    Married        A.24         E.0          AC3.EC3    40       1      0.25
 7 GradedF  F    Married        A.24         E.0          AC3.EC3    40       2      0.30
 8 GradedF  F    Married        A.24         E.0          AC3.EC3    40       3      0.50
 9 GradedF  F    Married        A.24         E.0          AC3.EC3    40       4      0.70
10 GradedF  M    Single         A.36         E.24         AC3.EC3    40       1      0.10

並將其轉換為:

   plan   gender marital_status accel_type extension_type inflation iss_age   dur    db_perk  db_perk_compare
   <chr>  <chr>  <chr>          <chr>        <chr>        <chr>     <chr>    <dbl>   <dbl>      <dbl>
 1 BasicF   F    Married        A.24         E.0          AC3.EC3    40       1      0.20       1.00
 2 BasicF   F    Married        A.24         E.0          AC3.EC3    40       2      0.25       1.00
 3 BasicF   F    Married        A.24         E.0          AC3.EC3    40       3      0.30       1.00
 4 BasicF   F    Married        A.24         E.0          AC3.EC3    40       4      0.40       1.00
 5 BasicF   M    Single         A.36         E.24         AC3.EC3    40       1      0.15       1.00
 6 GradedF  F    Married        A.24         E.0          AC3.EC3    40       1      0.25       1.25
 7 GradedF  F    Married        A.24         E.0          AC3.EC3    40       2      0.30       1.20
 8 GradedF  F    Married        A.24         E.0          AC3.EC3    40       3      0.50       1.67
 9 GradedF  F    Married        A.24         E.0          AC3.EC3    40       4      0.70       1.75
10 GradedF  M    Single         A.36         E.24         AC3.EC3    40       1      0.10       0.67

正如您所看到的,當計划為“BasicF”時,db_perk_compare列為“1”,因為公式將db_perk除以BasicF的db_perk。 其他列也可以有多個不同的值來影響db_perk。

我嘗試過這樣的事情

# Asker's attempt at a row-by-row lookup: divide each row's db_perk by the
# db_perk of the matching row of the first plan. It does not run as written
# (see the error message quoted below).
# NOTE: nrow(SQL_Table) is a single number, so the loop body is evaluated once
#       (with i equal to the row count) rather than once per row.
# NOTE: the comparisons are handed to which() as separate arguments instead of
#       being combined into one logical vector with `&`; which() only accepts
#       a few arguments, hence the "unused arguments" error.
# NOTE: the bare names on the left of each `==` (plan, gender, ...) are not
#       prefixed with SQL_Table$, so they are not looked up as columns here.
for (i in nrow(SQL_Table)){
      SQL_Table$db_perk_compare[i] <- SQL_Table$db_perk[i]/SQL_Table$db_perk[which(plan == SQL_Table$plan[1],
                                                                                   gender == SQL_Table$gender[i],
                                                                                   marital_status == SQL_Table$marital_status[i],
                                                                                   accel_type == SQL_Table$accel_type[i],
                                                                                   extension_type  == SQL_Table$extension_type [i],
                                                                                   inflation  == SQL_Table$inflation [i],
                                                                                   iss_age    == SQL_Table$iss_age[i],
                                                                                   dur  == SQL_Table$dur[i])]
  }

但得到這個錯誤:

Error in which(plan == SQL_Table$plan[1], gender == SQL_Table$gender[i],  : 
  unused arguments (accel_type == SQL_Table$accel_type[i], extension_type == SQL_Table$extension_type[i], inflation == SQL_Table$inflation[i], iss_age == SQL_Table$iss_age[i], dur == SQL_Table$dur[i])

使用tidyverse,我們先用arrange把plan為"BasicF"的行排在最前面,再將除plan和db_perk以外的所有列放入group_by,然后將db_perk除以每組中db_perk的first觀察值,用mutate生成'db_perk_compare'列

# Load dplyr and the %>% pipe through the tidyverse meta-package.
library(tidyverse)

# Sort the BasicF rows to the top (FALSE orders before TRUE), then group rows
# that agree on every attribute other than plan and db_perk. Within each group
# first(db_perk) is then the BasicF value whenever the group has a BasicF row.
SQL_Table %>%
  arrange(plan != "BasicF") %>%
  group_by(
    gender,
    marital_status,
    accel_type,
    extension_type,
    inflation,
    iss_age,
    dur
  ) %>%
  mutate(db_perk_compare = db_perk / first(db_perk))
# A tibble: 10 x 10
# Groups:   gender, marital_status, accel_type, extension_type, inflation, iss_age, dur [5]
#   plan    gender marital_status accel_type extension_type inflation iss_age   dur db_perk db_perk_compare
#   <chr>   <chr>  <chr>          <chr>      <chr>          <chr>       <int> <int>   <dbl>           <dbl>
# 1 BasicF  F      Married        A.24       E.0            AC3.EC3        40     1    0.2            1    
# 2 BasicF  F      Married        A.24       E.0            AC3.EC3        40     2    0.25           1    
# 3 BasicF  F      Married        A.24       E.0            AC3.EC3        40     3    0.3            1    
# 4 BasicF  F      Married        A.24       E.0            AC3.EC3        40     4    0.4            1    
# 5 BasicF  M      Single         A.36       E.24           AC3.EC3        40     1    0.15           1    
# 6 GradedF F      Married        A.24       E.0            AC3.EC3        40     1    0.25           1.25 
# 7 GradedF F      Married        A.24       E.0            AC3.EC3        40     2    0.3            1.2  
# 8 GradedF F      Married        A.24       E.0            AC3.EC3        40     3    0.5            1.67 
# 9 GradedF F      Married        A.24       E.0            AC3.EC3        40     4    0.7            1.75 
#10 GradedF M      Single         A.36       E.24           AC3.EC3        40     1    0.1            0.667

這個想法與akrun的相同,但我們可以使用group_by_at並排除plan和db_perk這兩列,而不必逐一列出每個列名。

library(dplyr)

# Group by every column whose name does not contain "plan" or "db_perk", then
# divide db_perk by the first db_perk of each group. There is no sorting step,
# so the first row of a group is whichever row comes first in SQL_Table.
SQL_Table %>%
  group_by_at(
    grep("plan|db_perk", names(SQL_Table), value = TRUE, invert = TRUE)
  ) %>%
  mutate(db_perk_compare = db_perk / first(db_perk))

# # A tibble: 10 x 10
# # Groups:   gender, marital_status, accel_type, extension_type, inflation, iss_age, dur [5]
# plan    gender marital_status accel_type extension_type inflation iss_age   dur db_perk db_perk_compare
# <chr>   <chr>  <chr>          <chr>      <chr>          <chr>     <chr>   <dbl>   <dbl>           <dbl>
# 1   BasicF  F      Married        A.24       E.0            AC3.EC3   40          1    0.2            1    
# 2   BasicF  F      Married        A.24       E.0            AC3.EC3   40          2    0.25           1    
# 3   BasicF  F      Married        A.24       E.0            AC3.EC3   40          3    0.3            1    
# 4   BasicF  F      Married        A.24       E.0            AC3.EC3   40          4    0.4            1    
# 5   BasicF  M      Single         A.36       E.24           AC3.EC3   40          1    0.15           1    
# 6   GradedF F      Married        A.24       E.0            AC3.EC3   40          1    0.25           1.25 
# 7   GradedF F      Married        A.24       E.0            AC3.EC3   40          2    0.3            1.2  
# 8   GradedF F      Married        A.24       E.0            AC3.EC3   40          3    0.5            1.67 
# 9   GradedF F      Married        A.24       E.0            AC3.EC3   40          4    0.7            1.75 
# 10  GradedF M      Single         A.36       E.24           AC3.EC3   40          1    0.1            0.667

數據:

# dput() prints an R expression that recreates SQL_Table; the structure(...)
# call below is that printed output. Assign it to SQL_Table to rebuild the
# 10-row example data (iss_age is character; dur and db_perk are numeric).
dput(SQL_Table)
 structure(list(plan = c("BasicF", "BasicF", "BasicF", "BasicF", 
 "BasicF", "GradedF", "GradedF", "GradedF", "GradedF", "GradedF"
 ), gender = c("F", "F", "F", "F", "M", "F", "F", "F", "F", "M"
 ), marital_status = c("Married", "Married", "Married", "Married", 
 "Single", "Married", "Married", "Married", "Married", "Single"
 ), accel_type = c("A.24", "A.24", "A.24", "A.24", "A.36", "A.24", 
 "A.24", "A.24", "A.24", "A.36"), extension_type = c("E.0", "E.0", 
 "E.0", "E.0", "E.24", "E.0", "E.0", "E.0", "E.0", "E.24"), inflation = c("AC3.EC3", 
 "AC3.EC3", "AC3.EC3", "AC3.EC3", "AC3.EC3", "AC3.EC3", "AC3.EC3", 
 "AC3.EC3", "AC3.EC3", "AC3.EC3"), iss_age = c("40", "40", "40", 
 "40", "40", "40", "40", "40", "40", "40"), dur = c(1, 2, 3, 4, 
 1, 1, 2, 3, 4, 1), db_perk = c(0.2, 0.25, 0.3, 0.4, 0.15, 0.25, 
 0.3, 0.5, 0.7, 0.1)), row.names = c(NA, -10L), class = c("tbl_df", 
 "tbl", "data.frame"))

reprex包創建於2019-06-24(v0.3.0)

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM