简体   繁体   English

有没有办法将 `across` 与 `group_by` 和 `mutate` 一起使用?

[英]Is there a way to use `across` together with `group_by` and `mutate`?

I have a dataset with quartile columns我有一个包含四分位数列的数据集

library(dplyr)
library(glue)

set.seed(234)
df = 
tibble(x1 = sample(1:100, size = 100), # simulate data
       y1 = sample(1:100, size = 100),
       z1 = sample(1:100, size = 100)) %>% 
  mutate(across(ends_with("1"), ~factor(ntile(.x, 4)), # assign to quartiles
                                        .names = "{.col}_q"))
  
head(df)         
#> # A tibble: 6 × 6
#>      x1    y1    z1 x1_q  y1_q  z1_q 
#>   <int> <int> <int> <fct> <fct> <fct>
#> 1    97    94     9 4     4     1    
#> 2    31    66    66 2     3     3    
#> 3    34    43    62 2     2     3    
#> 4    46    57    41 2     3     2    
#> 5    98     3    38 4     1     2    
#> 6    18    37    40 1     2     2

For each of these quartile columns, I would like to append the min and the max of the quartile like so:对于这些四分位数列中的每一个,我想要 append 四分位数的最小值和最大值,如下所示:

df %>% 
  group_by(x1_q) %>% 
  mutate(x1_q = glue("Q{x1_q} ({min(x1)} - {max(x1)})")) %>% 
  head()
#> # A tibble: 6 × 6
#> # Groups:   x1_q [3]
#>      x1    y1    z1 x1_q          y1_q  z1_q 
#>   <int> <int> <int> <glue>        <fct> <fct>
#> 1    97    94     9 Q4 (76 - 100) 4     1    
#> 2    31    66    66 Q2 (26 - 50)  3     3    
#> 3    34    43    62 Q2 (26 - 50)  2     3    
#> 4    46    57    41 Q2 (26 - 50)  3     2    
#> 5    98     3    38 Q4 (76 - 100) 1     2    
#> 6    18    37    40 Q1 (1 - 25)   2     2

Created on 2022-04-29 by the reprex package (v2.0.1)reprex package (v2.0.1) 创建于 2022-04-29

Is there a scalable was I can apply this group_by mutate pattern to all of my quartile columns?是否可以将此group_by mutate模式应用于我所有的四分位数列?

Here is one method - loop across the _q columns, extract the corresponding columns without the _q by removing the substring suffix from the column name, and get the value ('tmp'), then use one of the formatting functions to append the column values along with the min and max这是一种方法 - 遍历_q列,通过从列名称中删除across后缀来提取不带_q的相应列,并get值('tmp'),然后使用其中一个格式化函数来 append 列值连同minmax

library(dplyr)
library(stringr)
df %>%
    mutate(across(ends_with("_q"),  ~ 
       {
        tmp <- get(str_remove(cur_column(), "_q"))
        str_c("Q", .x, " (", ave(tmp, .x, FUN = min), " - ", 
                 ave(tmp, .x, FUN = max), ")")
        }
  ))

-output -输出

# A tibble: 100 × 6
      x1    y1    z1 x1_q          y1_q          z1_q         
   <int> <int> <int> <chr>         <chr>         <chr>        
 1    97    94     9 Q4 (76 - 100) Q4 (76 - 100) Q1 (1 - 25)  
 2    31    66    66 Q2 (26 - 50)  Q3 (51 - 75)  Q3 (51 - 75) 
 3    34    43    62 Q2 (26 - 50)  Q2 (26 - 50)  Q3 (51 - 75) 
 4    46    57    41 Q2 (26 - 50)  Q3 (51 - 75)  Q2 (26 - 50) 
 5    98     3    38 Q4 (76 - 100) Q1 (1 - 25)   Q2 (26 - 50) 
 6    18    37    40 Q1 (1 - 25)   Q2 (26 - 50)  Q2 (26 - 50) 
 7    56     4    98 Q3 (51 - 75)  Q1 (1 - 25)   Q4 (76 - 100)
 8     1    17    27 Q1 (1 - 25)   Q1 (1 - 25)   Q2 (26 - 50) 
 9    68    99    73 Q3 (51 - 75)  Q4 (76 - 100) Q3 (51 - 75) 
10    92    65    16 Q4 (76 - 100) Q3 (51 - 75)  Q1 (1 - 25)  
# … with 90 more rows

Or loop over the names in map2或者遍历map2中的names

library(purrr)
map2_dfc(names(df)[4:6], names(df)[1:3], 
    ~ df %>%
          group_by(across(all_of(.x))) %>%
          transmute(!! .x := glue::glue("Q{.data[[.x]]} ({min(.data[[.y]])} - {max(.data[[.y]])})")) %>% ungroup ) %>% bind_cols(df[1:3], .)
# A tibble: 100 × 6
      x1    y1    z1 x1_q          y1_q          z1_q         
   <int> <int> <int> <glue>        <glue>        <glue>       
 1    97    94     9 Q4 (76 - 100) Q4 (76 - 100) Q1 (1 - 25)  
 2    31    66    66 Q2 (26 - 50)  Q3 (51 - 75)  Q3 (51 - 75) 
 3    34    43    62 Q2 (26 - 50)  Q2 (26 - 50)  Q3 (51 - 75) 
 4    46    57    41 Q2 (26 - 50)  Q3 (51 - 75)  Q2 (26 - 50) 
 5    98     3    38 Q4 (76 - 100) Q1 (1 - 25)   Q2 (26 - 50) 
 6    18    37    40 Q1 (1 - 25)   Q2 (26 - 50)  Q2 (26 - 50) 
 7    56     4    98 Q3 (51 - 75)  Q1 (1 - 25)   Q4 (76 - 100)
 8     1    17    27 Q1 (1 - 25)   Q1 (1 - 25)   Q2 (26 - 50) 
 9    68    99    73 Q3 (51 - 75)  Q4 (76 - 100) Q3 (51 - 75) 
10    92    65    16 Q4 (76 - 100) Q3 (51 - 75)  Q1 (1 - 25)  
# … with 90 more rows

You could write your own myntile function and just use across as usual.您可以编写自己的myntile function 并像往常一样使用across Below myntile is based on ntile , cut and some string manipulation.下面的myntile基于ntilecut和一些字符串操作。

library(dplyr)
library(stringr)

set.seed(234)

myntile <- function(x, n) {
  
  q <- ntile(x, n)
  rng <- as.character(cut(x, n))
  rng2 <- str_replace_all(rng, "[0-9\\.]+", function(x) round(as.numeric(x), 0)) %>%
    str_replace(",", " - ") %>%
    str_replace("]", ")")

  paste0("Q", q, " ", rng2)
}

df = 
  tibble(x1 = sample(1:100, size = 100), # simulate data
         y1 = sample(1:100, size = 100),
         z1 = sample(1:100, size = 100)) %>% 
  mutate(across(ends_with("1"), ~ myntile(.x, 4), # assign to quartiles
                .names = "{.col}_q"))

head(df)

#> # A tibble: 6 x 6
#>      x1    y1    z1 x1_q          y1_q          z1_q        
#>   <int> <int> <int> <chr>         <chr>         <chr>       
#> 1    97    94     9 Q4 (75 - 100) Q4 (75 - 100) Q1 (1 - 26) 
#> 2    31    66    66 Q2 (26 - 50)  Q3 (50 - 75)  Q3 (50 - 75)
#> 3    34    43    62 Q2 (26 - 50)  Q2 (26 - 50)  Q3 (50 - 75)
#> 4    46    57    41 Q2 (26 - 50)  Q3 (50 - 75)  Q2 (26 - 50)
#> 5    98     3    38 Q4 (75 - 100) Q1 (1 - 26)   Q2 (26 - 50)
#> 6    18    37    40 Q1 (1 - 26)   Q2 (26 - 50)  Q2 (26 - 50)

Created on 2022-04-29 by the reprex package (v2.0.1)reprex package (v2.0.1) 创建于 2022-04-29

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM