I am trying to create factors from a data dictionary. I tried using Map(),
but all the variables are converted to missing. How best to approach this? Doing it the purrr
way would also be welcome.
library(dplyr)
# Example data: two numeric "a" columns plus the coded columns b and c
mydata <- tibble(
  a_1 = c(20, 22, 13, 14, 44),
  a_2 = c(42, 13, 32, 31, 14),
  b   = c(1, 2, 1, 1, 2),
  c   = c(1, 2, 1, 3, 1)
)
# Data dictionary: one row per variable. `values` and `valuelabel` hold the
# codes and their labels as comma-separated text (NA when not applicable).
dictionary <- tibble(
  variable   = c("a", "b", "c"),
  label      = c("Age", "Gender", "Education"),
  type       = c("mselect", "select", "select"),
  values     = c(NA_character_, "1, 2", "1, 2,3"),
  valuelabel = c(
    NA_character_,
    "Male, Female",
    "Primary, Secondary, Tertiary"
  )
)
# Expected results: b and c become factors whose levels and labels are
# written out by hand here
expectedata <- mutate(
  mydata,
  b = factor(b, levels = c(1, 2), labels = c("Male", "Female")),
  c = factor(
    c,
    levels = c(1, 2, 3),
    labels = c("Primary", "Secondary", "Tertiary")
  )
)
expectedata
# Select the factor variables: the dictionary entries whose type is "select"
factor_vars <- dictionary %>%
filter(type == "select") %>% pull(variable)
# Attempt to convert the columns to factors with Map().
# NOTE(review): this is the attempt that turns every value into NA.
# - Map() iterates over every column of mydata (a_1, a_2, b, c), not only
#   the columns named in factor_vars.
# - The levels and labels are passed as single strings such as "1, 2" and
#   "Male, Female"; they are never split into separate values, so no data
#   value matches a level.
mydata[] <- Map(
function(x, fctvalues, fctlabels) factor(x, fctvalues, fctlabels) ,
mydata,
dictionary$values[ match(factor_vars,
dictionary$variable) ],
dictionary$valuelabel[ match(factor_vars,
dictionary$variable) ]
)
Via pivot_longer()/pivot_wider(), left_join(), and a bit of data wrangling:
library(tidyverse)
# Example data, written row by row: two numeric "a" columns plus the coded
# columns b and c
mydata <- tribble(
  ~a_1, ~a_2, ~b, ~c,
    20,   42,  1,  1,
    22,   13,  2,  2,
    13,   32,  1,  1,
    14,   31,  1,  3,
    44,   14,  2,  1
)
# Data dictionary, one row per variable. `values` and `valuelabel` hold the
# codes and their labels as comma-separated text (NA when not applicable).
dictionary <- tribble(
  ~variable, ~label, ~type, ~values, ~valuelabel,
  "a", "Age", "mselect", NA_character_, NA_character_,
  "b", "Gender", "select", "1, 2", "Male, Female",
  "c", "Education", "select", "1, 2, 3", "Primary, Secondary, Tertiary"
)
# Build a lookup table with one row per variable/code/label combination:
# keep only the columns needed for the lookup, then split the
# comma-separated codes and labels into separate rows.
# Optional: restrict to the coded variables first with
# filter(type == "select").
target_dictionary <- dictionary %>%
  select(variable, values, valuelabel) %>%
  separate_rows(values, valuelabel)
# Replace the codes in columns b and c with their dictionary labels by
# reshaping to long form, joining on variable name + code, and reshaping
# back to wide form.
target_mydata <- local({
  # The data has no unique identifier, so add a row id to reassemble rows
  keyed <- rownames_to_column(mydata, "id")

  long <- pivot_longer(
    keyed,
    cols = c("b", "c"),
    names_to = "var_name",
    values_to = "var_value"
  )

  # The dictionary stores the codes as text, so the join keys must be
  # character as well
  long <- mutate(long, var_value = as.character(var_value))

  labelled <- left_join(
    long,
    target_dictionary,
    by = c("var_name" = "variable", "var_value" = "values")
  )

  wide <- pivot_wider(
    labelled,
    names_from = var_name,
    values_from = valuelabel,
    id_cols = c("id", "a_1", "a_2")
  )

  # The helper id is no longer needed once the rows are reassembled
  select(wide, -id)
})
Result:
> target_mydata
# A tibble: 5 × 4
a_1 a_2 b c
<dbl> <dbl> <chr> <chr>
1 20 42 Male Primary
2 22 13 Female Secondary
3 13 32 Male Primary
4 14 31 Male Tertiary
5 44 14 Female Primary
Edit: You could also go one step further and rename the factor columns using the labels from the dictionary.
# Rename the coded columns to the human-readable labels from the dictionary
rename_with(
  target_mydata,
  .fn = function(col_names) {
    # Named lookup vector: variable name -> label
    labels_by_variable <- setNames(dictionary$label, dictionary$variable)
    labels_by_variable[col_names]
  },
  # Only columns that have a dictionary entry of the same name are renamed
  .cols = intersect(names(mydata), dictionary$variable)
)
Result:
# A tibble: 5 × 4
a_1 a_2 Gender Education
<dbl> <dbl> <chr> <chr>
1 20 42 Male Primary
2 22 13 Female Secondary
3 13 32 Male Primary
4 14 31 Male Tertiary
5 44 14 Female Primary
The technical posts on this site follow the CC BY-SA 4.0 license. If you need to reprint, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.