简体   繁体   中英

Is there a way to use `across` together with `group_by` and `mutate`?

I have a dataset with quartile columns

library(dplyr)
library(glue)

set.seed(234)
df = 
tibble(x1 = sample(1:100, size = 100), # simulate data
       y1 = sample(1:100, size = 100),
       z1 = sample(1:100, size = 100)) %>% 
  mutate(across(ends_with("1"), ~factor(ntile(.x, 4)), # assign to quartiles
                                        .names = "{.col}_q"))
  
head(df)         
#> # A tibble: 6 × 6
#>      x1    y1    z1 x1_q  y1_q  z1_q 
#>   <int> <int> <int> <fct> <fct> <fct>
#> 1    97    94     9 4     4     1    
#> 2    31    66    66 2     3     3    
#> 3    34    43    62 2     2     3    
#> 4    46    57    41 2     3     2    
#> 5    98     3    38 4     1     2    
#> 6    18    37    40 1     2     2

For each of these quartile columns, I would like to append the min and the max of the quartile like so:

df %>% 
  group_by(x1_q) %>% 
  mutate(x1_q = glue("Q{x1_q} ({min(x1)} - {max(x1)})")) %>% 
  head()
#> # A tibble: 6 × 6
#> # Groups:   x1_q [3]
#>      x1    y1    z1 x1_q          y1_q  z1_q 
#>   <int> <int> <int> <glue>        <fct> <fct>
#> 1    97    94     9 Q4 (76 - 100) 4     1    
#> 2    31    66    66 Q2 (26 - 50)  3     3    
#> 3    34    43    62 Q2 (26 - 50)  2     3    
#> 4    46    57    41 Q2 (26 - 50)  3     2    
#> 5    98     3    38 Q4 (76 - 100) 1     2    
#> 6    18    37    40 Q1 (1 - 25)   2     2

Created on 2022-04-29 by the reprex package (v2.0.1)

Is there a scalable was I can apply this group_by mutate pattern to all of my quartile columns?

Here is one method - loop across the _q columns, extract the corresponding columns without the _q by removing the substring suffix from the column name, and get the value ('tmp'), then use one of the formatting functions to append the column values along with the min and max

library(dplyr)
library(stringr)
df %>%
    mutate(across(ends_with("_q"),  ~ 
       {
        tmp <- get(str_remove(cur_column(), "_q"))
        str_c("Q", .x, " (", ave(tmp, .x, FUN = min), " - ", 
                 ave(tmp, .x, FUN = max), ")")
        }
  ))

-output

# A tibble: 100 × 6
      x1    y1    z1 x1_q          y1_q          z1_q         
   <int> <int> <int> <chr>         <chr>         <chr>        
 1    97    94     9 Q4 (76 - 100) Q4 (76 - 100) Q1 (1 - 25)  
 2    31    66    66 Q2 (26 - 50)  Q3 (51 - 75)  Q3 (51 - 75) 
 3    34    43    62 Q2 (26 - 50)  Q2 (26 - 50)  Q3 (51 - 75) 
 4    46    57    41 Q2 (26 - 50)  Q3 (51 - 75)  Q2 (26 - 50) 
 5    98     3    38 Q4 (76 - 100) Q1 (1 - 25)   Q2 (26 - 50) 
 6    18    37    40 Q1 (1 - 25)   Q2 (26 - 50)  Q2 (26 - 50) 
 7    56     4    98 Q3 (51 - 75)  Q1 (1 - 25)   Q4 (76 - 100)
 8     1    17    27 Q1 (1 - 25)   Q1 (1 - 25)   Q2 (26 - 50) 
 9    68    99    73 Q3 (51 - 75)  Q4 (76 - 100) Q3 (51 - 75) 
10    92    65    16 Q4 (76 - 100) Q3 (51 - 75)  Q1 (1 - 25)  
# … with 90 more rows

Or loop over the names in map2

library(purrr)
map2_dfc(names(df)[4:6], names(df)[1:3], 
    ~ df %>%
          group_by(across(all_of(.x))) %>%
          transmute(!! .x := glue::glue("Q{.data[[.x]]} ({min(.data[[.y]])} - {max(.data[[.y]])})")) %>% ungroup ) %>% bind_cols(df[1:3], .)
# A tibble: 100 × 6
      x1    y1    z1 x1_q          y1_q          z1_q         
   <int> <int> <int> <glue>        <glue>        <glue>       
 1    97    94     9 Q4 (76 - 100) Q4 (76 - 100) Q1 (1 - 25)  
 2    31    66    66 Q2 (26 - 50)  Q3 (51 - 75)  Q3 (51 - 75) 
 3    34    43    62 Q2 (26 - 50)  Q2 (26 - 50)  Q3 (51 - 75) 
 4    46    57    41 Q2 (26 - 50)  Q3 (51 - 75)  Q2 (26 - 50) 
 5    98     3    38 Q4 (76 - 100) Q1 (1 - 25)   Q2 (26 - 50) 
 6    18    37    40 Q1 (1 - 25)   Q2 (26 - 50)  Q2 (26 - 50) 
 7    56     4    98 Q3 (51 - 75)  Q1 (1 - 25)   Q4 (76 - 100)
 8     1    17    27 Q1 (1 - 25)   Q1 (1 - 25)   Q2 (26 - 50) 
 9    68    99    73 Q3 (51 - 75)  Q4 (76 - 100) Q3 (51 - 75) 
10    92    65    16 Q4 (76 - 100) Q3 (51 - 75)  Q1 (1 - 25)  
# … with 90 more rows

You could write your own myntile function and just use across as usual. Below myntile is based on ntile , cut and some string manipulation.

library(dplyr)
library(stringr)

set.seed(234)

myntile <- function(x, n) {
  
  q <- ntile(x, n)
  rng <- as.character(cut(x, n))
  rng2 <- str_replace_all(rng, "[0-9\\.]+", function(x) round(as.numeric(x), 0)) %>%
    str_replace(",", " - ") %>%
    str_replace("]", ")")

  paste0("Q", q, " ", rng2)
}

df = 
  tibble(x1 = sample(1:100, size = 100), # simulate data
         y1 = sample(1:100, size = 100),
         z1 = sample(1:100, size = 100)) %>% 
  mutate(across(ends_with("1"), ~ myntile(.x, 4), # assign to quartiles
                .names = "{.col}_q"))

head(df)

#> # A tibble: 6 x 6
#>      x1    y1    z1 x1_q          y1_q          z1_q        
#>   <int> <int> <int> <chr>         <chr>         <chr>       
#> 1    97    94     9 Q4 (75 - 100) Q4 (75 - 100) Q1 (1 - 26) 
#> 2    31    66    66 Q2 (26 - 50)  Q3 (50 - 75)  Q3 (50 - 75)
#> 3    34    43    62 Q2 (26 - 50)  Q2 (26 - 50)  Q3 (50 - 75)
#> 4    46    57    41 Q2 (26 - 50)  Q3 (50 - 75)  Q2 (26 - 50)
#> 5    98     3    38 Q4 (75 - 100) Q1 (1 - 26)   Q2 (26 - 50)
#> 6    18    37    40 Q1 (1 - 26)   Q2 (26 - 50)  Q2 (26 - 50)

Created on 2022-04-29 by the reprex package (v2.0.1)

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM