简体   繁体   中英

Is there a way to create factors from data dictionary in R?

I am trying to create factors from a data dictionary. I tried using `Map()`, but all the variables are converted to missing values. What is the best way to approach this? A purrr-based solution would also be welcome.

library(dplyr)

# Example survey data: two numeric "a" columns plus the coded columns b and c.
mydata <- tibble(
  a_1 = c(20, 22, 13, 14, 44),
  a_2 = c(42, 13, 32, 31, 14),
  b   = c(1, 2, 1, 1, 2),
  c   = c(1, 2, 1, 3, 1)
)



# Data dictionary: one row per variable, giving its display label, its type,
# and comma-separated value codes / value labels (NA where none are listed).
dictionary <- tibble(
  variable   = c("a", "b", "c"),
  label      = c("Age", "Gender", "Education"),
  type       = c("mselect", "select", "select"),
  values     = c(NA, "1, 2", "1, 2,3"),
  valuelabel = c(NA, "Male, Female", "Primary, Secondary, Tertiary")
)

# Expected results 
# Hand-written target: b and c become labelled factors, a_1 and a_2 are
# left untouched.
expectedata <- mydata
expectedata$b <- factor(
  expectedata$b,
  levels = c(1, 2),
  labels = c("Male", "Female")
)
expectedata$c <- factor(
  expectedata$c,
  levels = c(1, 2, 3),
  labels = c("Primary", "Secondary", "Tertiary")
)
expectedata


# Select the factor variables

# Names of the dictionary variables whose type is "select".
# %in% never yields NA, so rows with a missing type are dropped.
factor_vars <- dictionary$variable[dictionary$type %in% "select"]


# Attempted conversion -- this is the call that turns every column into NA.
# Two things go wrong:
#   1. Map() walks over all four columns of mydata, while the level/label
#      vectors below have one entry per name in factor_vars (two entries), so
#      they are recycled and also applied to a_1 and a_2.
#   2. Each entry is a single comma-separated string (e.g. "1, 2"), so
#      factor() receives one level that matches none of the numeric values.
mydata[] <- Map(
  function(x, fctvalues, fctlabels)  factor(x, fctvalues,  fctlabels) ,
                mydata,
                dictionary$values[ match(factor_vars,
                                                 dictionary$variable) ],

                dictionary$valuelabel[ match(factor_vars,
                                             dictionary$variable) ]
)

Via `pivot_longer()` / `pivot_wider()`, `left_join()`, and a bit of data wrangling:

Data

library(tidyverse)

# Example survey data, written row-wise: a_1 and a_2 are plain numbers,
# b and c hold the numeric codes to be recoded.
mydata <- tribble(
    ~a_1, ~a_2, ~b, ~c,
      20,   42,  1,  1,
      22,   13,  2,  2,
      13,   32,  1,  1,
      14,   31,  1,  3,
      44,   14,  2,  1
)



# Data dictionary, one row per variable. `values` and `valuelabel` are
# comma-separated strings whose entries pair up by position.
dictionary <- tribble(
    ~variable, ~label,      ~type,     ~values,       ~valuelabel,
    "a",       "Age",       "mselect", NA_character_, NA_character_,
    "b",       "Gender",    "select",  "1, 2",        "Male, Female",
    "c",       "Education", "select",  "1, 2, 3",     "Primary, Secondary, Tertiary"
)

Code

# Build a long lookup table with one row per (variable, value code, label).
# `values` and `valuelabel` are split in parallel, so the i-th code of a
# variable stays paired with its i-th label. The separator is an explicit
# comma (plus optional whitespace) rather than separate_rows()'s default,
# which splits on any run of non-alphanumeric characters and would tear a
# multi-word label such as "High school" apart.
target_dictionary <- dictionary %>%
    # optional: filter(type == "select") %>%
    separate_rows(values, valuelabel, sep = ",\\s*") %>%
    select(variable, values, valuelabel)

# Recode b and c by looking each (variable, code) pair up in
# target_dictionary: reshape long, join the labels on, reshape wide again.
target_mydata <- mydata %>%
    # mydata has no key column, so the row names serve as a temporary id
    rownames_to_column(var = "id") %>%
    pivot_longer(
        cols = all_of(c("b", "c")),
        names_to = "var_name",
        values_to = "var_value",
        # the dictionary codes are character, so the join key must be too
        values_transform = list(var_value = as.character)
    ) %>%
    left_join(
        target_dictionary,
        by = c(var_name = "variable", var_value = "values")
    ) %>%
    pivot_wider(
        id_cols = all_of(c("id", "a_1", "a_2")),
        names_from = all_of("var_name"),
        values_from = all_of("valuelabel")
    ) %>%
    # the temporary id has done its job
    select(!id)

Result:

> target_mydata
# A tibble: 5 × 4
    a_1   a_2 b      c        
  <dbl> <dbl> <chr>  <chr>    
1    20    42 Male   Primary  
2    22    13 Female Secondary
3    13    32 Male   Primary  
4    14    31 Male   Tertiary 
5    44    14 Female Primary  


Edit: You could also go one step further and rename the recoded columns using the labels from the dictionary.

Renaming the columns

# Replace variable codes with their dictionary labels in the column names.
target_mydata %>%
    rename_with(
        # named lookup (variable code -> label), indexed by the selected names
        .fn = function(col_names) {
            setNames(dictionary$label, dictionary$variable)[col_names]
        },
        # only columns present in both the data and the dictionary
        .cols = all_of(intersect(names(mydata), dictionary$variable))
    )

Result:

# A tibble: 5 × 4
    a_1   a_2 Gender Education
  <dbl> <dbl> <chr>  <chr>    
1    20    42 Male   Primary  
2    22    13 Female Secondary
3    13    32 Male   Primary  
4    14    31 Male   Tertiary 
5    44    14 Female Primary  

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM