繁体   English   中英

predict() 函数使用 R 中线性模型的因子抛出错误

[英]predict() function throws error using factors on linear model in R

我正在使用“肺活量”数据集来尝试建立一个线性模型:

library(tidyverse)
library(rvest)
h <- "https://docs.google.com/spreadsheets/d/0BxQfpNgXuWoIWUdZV1ZTc2ZscnM/edit?resourcekey=0-gqXT7Re2eUS2JGt_w1y4vA#gid=1055321634"
t <- rvest::read_html(h)
Nodes <- t %>% html_nodes("table")
table <- html_table(Nodes[[1]])
colnames(table) <- table[1,]
table <- table[-1,]
table <- table %>% select(LungCap, Age, Height, Smoke, Gender, Caesarean)
Lung_Capacity <- table

Lung_Capacity$LungCap <- as.numeric(Lung_Capacity$LungCap)
Lung_Capacity$Age <- as.numeric(Lung_Capacity$Age)
Lung_Capacity$Height <- as.numeric(Lung_Capacity$Height)
Lung_Capacity$Smoke <- as.numeric(Lung_Capacity$Smoke == "yes")
Lung_Capacity$Gender <- as.numeric(Lung_Capacity$Gender == "male")
Lung_Capacity$Caesarean <- as.numeric(Lung_Capacity$Caesarean == "yes")

colnames(Lung_Capacity)[4] <- "Smoker_YN"
colnames(Lung_Capacity)[5] <- "Male_YN"
colnames(Lung_Capacity)[6] <- "Caesarean_YN"
head(Lung_Capacity)
Capacity <- Lung_Capacity

我将数据拆分为训练集和验证集:

library(caret)
set.seed(1)
y <- Capacity$LungCap
testIndex <- caret::createDataPartition(y, times = 1, p = 0.2, list = FALSE)

train <- Capacity[-testIndex,]
test <- Capacity[testIndex,]

交叉验证以获得我的最终模型:

set.seed(3)
control <- trainControl(method="cv", number = 5)
LinearModel <- train(LungCap ~ ., data = train, method = "lm", trControl = control)
LM <- LinearModel$finalModel
summary(LM)

并尝试对保留的测试集进行预测:

lmPredictions <- predict(LM, newdata = test)

但是,抛出了一个错误,内容如下:

eval(predvars, data, env) 中的错误:找不到对象“Smoker_YN1”

浏览这个网站,我认为 test 和 train 表的列名可能已经关闭,但事实并非如此,它们是相同的。 问题似乎是训练模型已将因子预测变量重命名为“Smoker_YN1”,而不是预期的列名称“Smokey_YN”。 我尝试重命名测试集中的列标题,并尝试重命名系数标题。 这两种方法都没有成功。

我已经用完了研究和实验方法,有人可以帮忙解决这个问题吗?

我不确定。 请仔细告诉我:我的猜测(我不是专家, LungCap字符和Lung数字是否会干扰此代码):

h <- "https://docs.google.com/spreadsheets/d/0BxQfpNgXuWoIWUdZV1ZTc2ZscnM/edit?resourcekey=0-gqXT7Re2eUS2JGt_w1y4vA#gid=1055321634"
#install.packages("textreadr")
library(textreadr)
library(rvest)
t <- read_html(h)
t
Nodes <- t %>% html_nodes("table")
table <- html_table(Nodes[[1]])
colnames(table) <- table[1,]
table <- table[-1,]
table <- table %>% select(LungCap, Age, Height, Smoke, Gender, Caesarean)
Lung_Capacity <- table


# I changed Lung_Capacity$LungCap <- as.numeric(Lung_Capacity$LungCap) to
Lung_Capacity$Lung <- as.numeric(Lung_Capacity$LungCap)

Lung_Capacity$Age <- as.numeric(Lung_Capacity$Age)
Lung_Capacity$Height <- as.numeric(Lung_Capacity$Height)
Lung_Capacity$Smoke <- as.numeric(Lung_Capacity$Smoke == "yes")
Lung_Capacity$Gender <- as.numeric(Lung_Capacity$Gender == "male")
Lung_Capacity$Caesarean <- as.numeric(Lung_Capacity$Caesarean == "yes")

colnames(Lung_Capacity)[4] <- "Smoker_YN"
colnames(Lung_Capacity)[5] <- "Male_YN"
colnames(Lung_Capacity)[6] <- "Caesarean_YN"
head(Lung_Capacity)
# I changed to
Capacity <- Lung_Capacity
Capacity

library(caret)
set.seed(1)
# I changed y <- Capacity$LungCap to 
y <- Capacity$Lung
testIndex <- caret::createDataPartition(y, times = 1, p = 0.2, list = FALSE)

train <- Capacity[-testIndex,]
test <- Capacity[testIndex,]

# I removed 
train$LungCap <- NULL
test$LungCap <- NULL

set.seed(3)
control <- trainControl(method="cv", number = 5)
# I changed LungCap to Lung 
LinearModel <- train(Lung ~ ., data = train, method = "lm", trControl = control)
LM <- LinearModel$finalModel
summary(LM)

lmPredictions <- predict(LM, newdata = test)
lmPredictions

输出:

        1         2         3         4         5         6         7 
 6.344355 10.231586  4.902900  7.500179  5.295711  9.434454  8.879997 
        8         9        10        11        12        13        14 
12.227635 11.097691  7.775063  8.085810  6.399364  7.852107  9.480219 
       15        16        17        18        19        20 
 8.982051 10.115840  7.917863 12.089960  7.838881  9.653292 

暂无
暂无

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM