简体   繁体   中英

For loops in dplyr and group_by

I have individual-level data including several categorical variables over different years. I want to produce summary tables for each category over different years. The data looks like this, but with many more categorical variables:

library(dplyr)
library(tidyr) # pivot_wider() used in the snippets below lives in tidyr
library(extraDistr) # just for data generation
set.seed(1000)

# Simulated individual-level survey data: 50 people per year for 2010-2019,
# three categorical variables and a sampling weight per person.
# The random draws happen in the order gender, employment, race, weight, so
# keep that order if the printed results further down are to be reproduced.
data <- data.frame(
  year = rep(2010:2019, each = 50),
  gender = rcat(500, c(.5, .5), c("female", "male")),
  employment = rcat(
    500,
    c(.1, .6, .3),
    c("unemployed", "employed", "nonparticipant")
  ),
  # NOTE(review): these weights sum to 1.05, not 1; left unchanged so the
  # simulated data stays the same -- confirm rcat() rescales them as intended.
  race = rcat(500, c(.7, .2, .05, .1), c("white", "black", "Asian", "Latino")),
  individual_weight = runif(500, 1, 50)
)

My summary table for each variable is something like this:

# Weighted share (in percent) of each employment status within each year,
# reshaped so that every year becomes its own column.
data %>%
  group_by(year, employment) %>%
  # "drop_last" keeps the result grouped by year and silences the
  # "summarise() has grouped output by ..." message.
  summarize(number = sum(individual_weight), .groups = "drop_last") %>%
  # Still grouped by year here, so sum(number) is the yearly weighted total.
  mutate(share = number / sum(number) * 100) %>%
  ungroup() %>%
  # id_cols must be passed by name: positional use is deprecated in
  # tidyr >= 1.3.0. Naming it also drops the leftover `number` column.
  tidyr::pivot_wider(
    id_cols = employment,
    names_from = year,
    values_from = share
  )

which gives a result like this:

# A tibble: 3 x 11
  employment     `2010` `2011` `2012` `2013` `2014` `2015` `2016` `2017` `2018` `2019`
  <fct>           <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
1 unemployed       16.8   9.25   8.51   6.12   18.4   12.6   5.99   11.9   11.5   8.10
2 employed         47.0  70.6   67.8   53.4    62.3   66.2  75.5    57.2   48.5  58.6 
3 nonparticipant   36.2  20.2   23.7   40.5    19.3   21.2  18.5    30.9   40.1  33.3

I want to produce this table for several categorical variables and wrote a for loop that doesn't work.

# The asker's original (non-working) attempt, kept as posted so that it matches
# the error message shown right after it.
# Intent: build the year-by-level share table once per categorical variable.
for (i in c("gender", "employment", "race")) {
  data %>% 
    # get(i) is evaluated against the data here, so grouping itself works, but
    # the resulting grouping column is labelled after the expression `get(i)`
    # rather than after the variable named in `i`.
    group_by(year, get(i)) %>%
    summarize(number = sum(individual_weight)) %>%
    group_by(year) %>%
    mutate(share = number/sum(number)*100) %>%
    # This is presumably where "object 'gender' not found" is raised: the
    # id-column argument is a column selection, not a data-masked expression,
    # so get("gender") cannot see the data frame's columns.
    # Also note that a pipeline inside a for loop is neither printed nor
    # stored unless print() or an assignment is used.
    pivot_wider(get(i), names_from="year", values_from="share")
}

`summarise()` has grouped output by 'year'. You can override using the `.groups` argument.
Error: object 'gender' not found
Run `rlang::last_error()` to see where the error occurred.

We can wrap the steps into a function and call that function on each variable name, using tidy evaluation (non-standard evaluation).

library(tidyverse)
# Build the year-by-level table of weighted shares for one categorical variable.
#
# var: name of the categorical column, given as a single string ("gender").
# df:  data frame holding `year`, `individual_weight` and the column named by
#      `var`; defaults to the global `data`, so existing calls f("gender")
#      keep working.
#
# Returns a tibble with one row per level of `var` and one column per year,
# holding that level's weighted share (in percent) within the year.
f <- function(var, df = data) {
  stopifnot(is.character(var), length(var) == 1L)
  if (!var %in% names(df)) {
    stop("Column `", var, "` not found in the data.", call. = FALSE)
  }
  group_col <- sym(var) # turn "gender" into the symbol gender

  df %>%
    group_by(year, !!group_col) %>% # !! injects the symbol as a column name
    # "drop_last" leaves the result grouped by year for the share computation.
    summarize(number = sum(individual_weight), .groups = "drop_last") %>%
    mutate(share = number / sum(number) * 100) %>%
    ungroup() %>%
    # id_cols is a tidyselect argument: pass the column name through all_of()
    # and by name, rather than handing it a bare symbol object positionally.
    pivot_wider(
      id_cols = all_of(var),
      names_from = "year",
      values_from = "share"
    )
}

vars <- c("gender", "employment", "race")

# Naming the input vector makes map() return a named list, so each summary
# table can be looked up by the variable it describes.
map(setNames(vars, vars), f)

# or as a loop

# Preallocate the result list, then fill it one variable at a time.
l <- vector("list", length(vars))
names(l) <- vars
for (i in seq_along(vars)) {
  l[[i]] <- f(vars[[i]])
}

If you're okay with not using a loop, here's a tidyverse solution:

library(tidyverse)
library(extraDistr) # just for data generation
#> 
#> Attaching package: 'extraDistr'
#> The following object is masked from 'package:purrr':
#> 
#>     rdunif

set.seed(1000)

# Same simulated survey as in the question (50 people per year, 2010-2019),
# stored as a tibble. The draws are made in the order gender, employment,
# race, weight.
data <- as_tibble(
  data.frame(
    year = rep(2010:2019, each = 50),
    gender = rcat(500, c(.5, .5), c("female", "male")),
    employment = rcat(
      500,
      c(.1, .6, .3),
      c("unemployed", "employed", "nonparticipant")
    ),
    race = rcat(
      500,
      c(.7, .2, .05, .1),
      c("white", "black", "Asian", "Latino")
    ),
    individual_weight = runif(500, 1, 50)
  )
)

# One table for all categorical variables at once: stack them into long form,
# then compute each level's weighted share within its own variable and year.
# NOTE: the printed reprex output below predates this fix. It was produced
# with group_by(year) only, so there each variable's shares sum to about 1/3
# per year instead of 1.
data %>%
  pivot_longer(cols = gender:race, names_to = "category") %>%
  group_by(year, category, value) %>%
  summarise(number = sum(individual_weight), .groups = "drop") %>%
  # Normalise within (year, category); grouping by year alone would divide by
  # the combined total of all three variables.
  group_by(year, category) %>%
  mutate(pct = number / sum(number)) %>%
  ungroup() %>%
  select(-number) %>%
  pivot_wider(
    names_from = year,
    names_prefix = "yr_",
    values_from = pct
  )
#> # A tibble: 9 x 12
#>   category value yr_2010 yr_2011 yr_2012 yr_2013 yr_2014 yr_2015 yr_2016 yr_2017
#>   <chr>    <fct>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
#> 1 employm~ unem~  0.0559 0.0308  0.0284   0.0204 0.0614   0.0420  0.0200  0.0397
#> 2 employm~ empl~  0.157  0.235   0.226    0.178  0.208    0.221   0.252   0.191 
#> 3 employm~ nonp~  0.121  0.0672  0.0791   0.135  0.0642   0.0708  0.0616  0.103 
#> 4 gender   fema~  0.198  0.145   0.199    0.208  0.180    0.146   0.180   0.210 
#> 5 gender   male   0.135  0.188   0.135    0.125  0.154    0.187   0.153   0.124 
#> 6 race     white  0.239  0.242   0.233    0.191  0.243    0.247   0.204   0.238 
#> 7 race     black  0.0493 0.0393  0.0775   0.0806 0.0464   0.0536  0.0435  0.0381
#> 8 race     Asian  0.0183 0.00763 0.00970  0.0300 0.00570  0.0208  0.0106  0.0181
#> 9 race     Lati~  0.0265 0.0445  0.0136   0.0319 0.0384   0.0118  0.0753  0.0390
#> # ... with 2 more variables: yr_2018 <dbl>, yr_2019 <dbl>

Created on 2022-01-19 by the reprex package (v2.0.1)

The technical posts on this site are licensed under CC BY-SA 4.0. If you reprint them, please cite the site URL or the original address. For any questions, please contact: yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM