[英]Categorize a continuous predictor variable and calculate proportion of binary outcome
我正在檢驗多個連續變量與二元結果之間的非線性關系。 我想要一種快速有效的方法,把連續變量切分成區間後,按區間繪制結果發生的比例。 這是我目前的做法,但似乎很笨拙:
首先,數據如下:
(編輯:缺少引號)
# Example data frame: 312 rows (row.names = c(NA, -312L)) and two numeric
# columns, BMI and Mortality. Mortality is coded 0/1; BMI contains one NA.
df <- structure(list(BMI = c(23, 23, 19, 21, 24, 25, 22, 20,
20, 18, 18, 22, 23, 22, 20, 21, 20, 23, 26, 18, 20, 25, 28, 21,
24, 21, 21, 19, 22, 19, 21, 27, 21, 20, 20, 20, 22, 25, 20, 24,
25, 31, 27, 22, 21, 26, 23, 24, 31, 22, 22, 25, 24, 20, 23, 19,
20, 24, 20, 22, 23, 21, 20, 22, 21, 22, 21, 25, 20, 31, 23, 22,
24, 25, 23, 28, 20, 28, 20, 23, 27, 22, 21, 20, 25, 22, 28, 25,
27, 27, 29, 21, 21, 24, 25, 24, 22, 29, 23, 34, 22, 27, 18, 25,
23, 26, 23, 23, 21, 22, 29, 26, 23, 23, 21, 21, 24, 20, 21, 23,
27, 24, 31, 25, 19, 21, 21, 23, 19, 21, 22, 26, 21, 22, 22, 23,
25, 19, 20, 21, 20, 22, 20, 21, 26, 20, 22, 24, 21, 24, 22, 24,
28, 22, 24, 25, 30, 20, 24, 29, 23, 24, 24, 22, 20, 21, 22, 25,
19, 25, 20, 23, 25, 24, 17, 26, 25, 20, 21, 20, 22, 5, 26, 25,
26, 20, 23, 20, 19, 25, 21, 37, 20, 28, 32, 22, 23, 26, 23, 21,
24, 20, 22, 19, 24, 22, 22, 25, 24, 26, 25, 21, 21, 22, 27, 27,
24, 24, 25, 26, 18, 21, 28, 25, 21, 22, 21, 19, 24, 21, 25, 23,
21, 24, 22, 25, 23, 26, 23, 23, 21, 22, 25, 19, 24, 20, 26, 29,
19, 22, 24, 30, 28, 24, 31, 22, 27, 25, 23, 23, 26, 23, 25, 23,
24, 29, 23, 23, 26, 24, 32, 31, 22, 31, 22, 21, 18, 24, 21, 25,
25, 22, 24, 28, 22, 23, 22, 24, 32, 28, 26, 27, 22, 20, 23, 18,
20, 20, 19, 30, 28, 27, 29, 23, 20, 20, 25, 28, 22, 24, NA, 27
), Mortality = c(1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1,
0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0,
1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1,
1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0)), .Names = c("BMI", "Mortality"), row.names = c(NA,
-312L), class = "data.frame")
這是我得到的:
# Bin BMI into five intervals whose edges run from 0 up to 30.
# NOTE(review): the data contain BMI values above the top break (31, 32, 34, 37);
# these presumably come out of cut() as NA and drop out of every bin -- confirm
# whether an open-ended upper bin is wanted.
df$BMIcut <- cut(df$BMI,breaks = c(0,17.5,20,22.5,25,30))
# Remove any existing MortBMIcut column before it is refilled by the loop below.
df$MortBMIcut <- NULL
# For each bin: proportion of deaths = sum(Mortality) / number of rows in that
# bin with non-missing Mortality; the same value is written to every row of the bin.
for(i in levels(df$BMIcut)){
df[which(df$BMIcut==i & is.na(df$BMIcut)==F),"MortBMIcut"] <-
sum(df[which(df$BMIcut==i & is.na(df$BMIcut)==F & is.na(df$Mortality)==F),"Mortality"])/
NROW(df[which(df$BMIcut==i & is.na(df$BMIcut)==F & is.na(df$Mortality)==F),"Mortality"])
}
# Plot the per-row bin proportion against the bin factor.
plot(MortBMIcut ~ BMIcut, data=df)
這會產生下面的圖:
一定有更快的方法吧?
我不明白為什么您需要制作許多冗余副本才能生成此圖:
# Row-wise proportions of each Mortality value within every BMI bin
# (margin = 1, so each row of the table sums to 1). Named mort_prop rather
# than `t` so the object does not shadow base::t().
mort_prop <- prop.table(table(df$BMIcut, df$Mortality), 1)
# Pass the bin labels with their levels given explicitly: factor() on the bare
# labels would re-sort them alphabetically, which matches bin order only by
# coincidence (e.g. "(10,15]" would sort before "(5,10]").
bmi_bins <- factor(levels(df$BMIcut), levels = levels(df$BMIcut))
# Pick the Mortality == 1 column by its label instead of by position.
plot(x = bmi_bins, y = mort_prop[, "1"], ylim = c(0, 1))
但是,如評論中所述,我得到的值與您的圖不同。 而且 (0,17.5] 這個區間中只有兩個觀測值,因此無法確定圖中的形狀反映的是非線性關系還是數據稀疏。
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.