[英]Organising Factor Variables Prior to GLM fit in R
我在 R 中使用 GLM 做了很多工作,数据集相当大(模型拟合中通常包含几十个变量)。 为了能够在拟合模型后生成某些类型的图形输出,我发现在模型拟合之前先“准备”好所有打算作为因子拟合的变量(名称以f_
开头的变量)很有用。 我的意思是:
(i) 因为我在拟合 GLM 之前对每个因子重新排序,使得参考级别等于权重最大的级别,所以我想在relevel()
命令之前保留级别排序;
(ii) 为了以后在图表中突出参考水平,我喜欢将其记录在一个单独的变量中。
我已经使用内置的mtcars
数据集将这种方法放在一起示例。
目前我有这个代码:
library(dplyr)
data(mtcars)
# Build the working data set from the built-in mtcars data in one pipeline.
# Column prefixes: f_ = to be fitted as a factor, c_ = continuous covariate.
df <- mtcars %>%
  select(
    mpg,
    f_cylinders = cyl,
    c_displacement = disp,
    c_hp = hp,
    c_final_drive_ratio = drat,
    c_weight = wt,
    c_qtr_mile_time = qsec,
    f_v_or_straight = vs,
    f_transmission = am,
    f_gears = gear,
    f_num_carbs = carb
  ) %>%
  mutate(
    # decode the 0/1 indicator columns into readable labels
    f_v_or_straight = ifelse(f_v_or_straight == 0, "V", "Straight"),
    f_transmission = ifelse(f_transmission == 0, "Automatic", "Manual"),
    # every row carries unit weight
    glm_weight = 1
  )
# Prepare each factor column. For every f_ variable the same steps run:
#   1. convert the column to a factor,
#   2. record the level order as it is before releveling,
#   3. record the summed glm_weight per level,
#   4. record the chosen reference level,
#   5. relevel so that the reference level comes first.
# The steps are written out once per factor.
my_list <- list()

# --- f_cylinders ---
df$f_cylinders <- as.factor(df$f_cylinders)
my_list[["f_cylinders_levels"]] <- levels(df$f_cylinders)
my_list[["f_cylinders_weights"]] <- df %>%
  group_by(f_cylinders) %>%
  summarise(glm_weight = sum(glm_weight)) %>%
  ungroup() %>%
  pull(glm_weight)
my_list[["f_cylinders_ref"]] <- "8"
df$f_cylinders <- relevel(df$f_cylinders, ref = my_list[["f_cylinders_ref"]])

# --- f_v_or_straight ---
df$f_v_or_straight <- as.factor(df$f_v_or_straight)
my_list[["f_v_or_straight_levels"]] <- levels(df$f_v_or_straight)
my_list[["f_v_or_straight_weights"]] <- df %>%
  group_by(f_v_or_straight) %>%
  summarise(glm_weight = sum(glm_weight)) %>%
  ungroup() %>%
  pull(glm_weight)
my_list[["f_v_or_straight_ref"]] <- "V"
df$f_v_or_straight <- relevel(
  df$f_v_or_straight,
  ref = my_list[["f_v_or_straight_ref"]]
)

# --- f_transmission ---
df$f_transmission <- as.factor(df$f_transmission)
my_list[["f_transmission_levels"]] <- levels(df$f_transmission)
my_list[["f_transmission_weights"]] <- df %>%
  group_by(f_transmission) %>%
  summarise(glm_weight = sum(glm_weight)) %>%
  ungroup() %>%
  pull(glm_weight)
my_list[["f_transmission_ref"]] <- "Automatic"
df$f_transmission <- relevel(
  df$f_transmission,
  ref = my_list[["f_transmission_ref"]]
)

# --- f_gears ---
df$f_gears <- as.factor(df$f_gears)
my_list[["f_gears_levels"]] <- levels(df$f_gears)
my_list[["f_gears_weights"]] <- df %>%
  group_by(f_gears) %>%
  summarise(glm_weight = sum(glm_weight)) %>%
  ungroup() %>%
  pull(glm_weight)
my_list[["f_gears_ref"]] <- "3"
df$f_gears <- relevel(df$f_gears, ref = my_list[["f_gears_ref"]])

# --- f_num_carbs ---
df$f_num_carbs <- as.factor(df$f_num_carbs)
my_list[["f_num_carbs_levels"]] <- levels(df$f_num_carbs)
my_list[["f_num_carbs_weights"]] <- df %>%
  group_by(f_num_carbs) %>%
  summarise(glm_weight = sum(glm_weight)) %>%
  ungroup() %>%
  pull(glm_weight)
my_list[["f_num_carbs_ref"]] <- "4"
df$f_num_carbs <- relevel(df$f_num_carbs, ref = my_list[["f_num_carbs_ref"]])
这段代码工作正常,但是……在现实世界中,我正在处理数十个因子变量,而不仅仅是上面的 5 个。 所以如果我有 50 个因子变量,我会重复做同样的事情 50 次。 我想将此准备工作捆绑到一个函数调用中,基本上是说:
对于名称以f_
开头的每个字段(即看起来像f_xxx
):
把它从chr
/ int
/whatever 变成一个因子f_xxx
;
计算权重f_xxx_weights
确定参考水平f_xxx_ref
(如果有多个水平并列权重最大,不确定该怎么处理);
将当前因子水平存储在f_xxx_levels
;
重新排列因子水平,使f_xxx_ref
成为列表中的第一个。
我在这里问了很多……但任何能推动我前进的东西都会非常感激。
谢谢你。
考虑将您的relevel
过程封装成一个用户自定义函数,然后使用purrr::map_df
对每个名称以f_
开头的列调用该函数(与 tidyverse 风格保持一致):
数据
library(dplyr)
library(purrr)
# Step 1: keep the modelling columns of mtcars under descriptive names
# (f_ = factor candidate, c_ = continuous covariate).
df <- select(
  mtcars,
  mpg,
  f_cylinders = cyl,
  c_displacement = disp,
  c_hp = hp,
  c_final_drive_ratio = drat,
  c_weight = wt,
  c_qtr_mile_time = qsec,
  f_v_or_straight = vs,
  f_transmission = am,
  f_gears = gear,
  f_num_carbs = carb
)

# Step 2: decode the 0/1 indicators into labels and add a unit weight column.
df <- mutate(
  df,
  f_v_or_straight = ifelse(f_v_or_straight == 0, "V", "Straight"),
  f_transmission = ifelse(f_transmission == 0, "Automatic", "Manual"),
  glm_weight = 1
)
rlevel
进程(使用table
频率)
#' Relevel one column so that its heaviest level becomes the reference
#'
#' Sums `weight_col` within each distinct non-missing value of `col`, then
#' returns the column as a factor whose first (reference) level is the value
#' with the largest total weight. When several values tie for the largest
#' total, the one that comes first in the grouped summary is used.
#'
#' @param col Name of the column to convert, as a single string.
#' @param data Data frame containing `col` and `weight_col`. Defaults to the
#'   `df` object visible from where this function is defined, so existing
#'   one-argument calls such as `proc_rlevel("f_gears")` keep working.
#' @param weight_col Name of the numeric weight column, as a single string.
#' @return A factor with the same length as `data[[col]]`. If the column has
#'   no non-missing values the factor is returned without releveling.
proc_rlevel <- function(col, data = df, weight_col = "glm_weight") {
  stopifnot(
    is.character(col), length(col) == 1L,
    is.character(weight_col), length(weight_col) == 1L,
    col %in% names(data),
    weight_col %in% names(data)
  )

  # Go through character so numeric codes become plain string levels.
  values <- as.factor(as.character(data[[col]]))

  # Total weight per level, heaviest first. Missing values are dropped
  # because NA is not a factor level and cannot serve as the reference.
  totals <- data %>%
    filter(!is.na(.data[[col]])) %>%
    group_by(across(all_of(col))) %>%
    summarise(total_weight = sum(.data[[weight_col]]), .groups = "drop") %>%
    arrange(desc(total_weight))

  # Nothing to relevel against (empty data or all-NA column).
  if (nrow(totals) == 0L) {
    return(values)
  }

  relevel(values, ref = as.character(totals[[col]][1]))
}
# REPLACING ORIGINAL f_ COLUMNS WITH THEIR RELEVELED FACTOR VERSIONS
# Plain assignment is used rather than the compound pipe `%<>%`: that operator
# is exported only by magrittr, which this script never attaches, so using it
# fails with "could not find function".
# The pattern is anchored so that only names *starting* with "f_" are picked.
f_cols <- grep("^f_", names(df), value = TRUE)
if (length(f_cols) > 0L) {
  df[f_cols] <- f_cols %>%
    setNames(f_cols) %>% # name the inputs so the result columns are named
    map_df(proc_rlevel)
}
查看更改
# ORIGINAL LEVELS
# For every f_ column, list the levels it has when treated as a factor.
df %>%
  select(starts_with("f_")) %>%
  map(function(column) levels(as.factor(column)))
# $f_cylinders
# [1] "4" "6" "8"
#
# $f_v_or_straight
# [1] "Straight" "V"
#
# $f_transmission
# [1] "Automatic" "Manual"
#
# $f_gears
# [1] "3" "4" "5"
#
# $f_num_carbs
# [1] "1" "2" "3" "4" "6" "8"
# ADJUSTED LEVELS
# Same inspection as before; the first level listed is the reference level.
df %>%
  select(starts_with("f_")) %>%
  map(function(column) levels(as.factor(column)))
# $f_cylinders
# [1] "8" "4" "6"
#
# $f_v_or_straight
# [1] "V" "Straight"
#
# $f_transmission
# [1] "Automatic" "Manual"
#
# $f_gears
# [1] "3" "4" "5"
#
# $f_num_carbs
# [1] "2" "1" "3" "4" "6" "8"
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.