简体   繁体   中英

Naive Bayes classifier in R

I'm trying to run a Naive Bayes classifier using this code:

# Original attempt from the question: fit a naive Bayes model on the first
# 1000 rows of the CSV file and predict on the next 500 rows.
library(e1071)
filename <- "file.csv"  
data <- read.csv(filename, sep=",")  
# Rows 1-1000 are used for training, rows 1001-1500 for testing.
trainData <- data[1:1000,,drop = FALSE]    
testData <- data[1001:1500,,drop = FALSE]  
# Columns 1-49 are passed as predictors and column 50 as the class labels.
# NOTE(review): the sample data shown further down has 51 columns with
# `target` last -- confirm that column 50 really is the label column.
model <- naiveBayes(trainData[,1:49], trainData[,50])   
# Predict on the test rows with column 50 removed.
predicted <- predict(model, testData[,-50])  

I get this error:

Error in apply(log(sapply(seq_along(attribs), function(v) { : 
  dim(X) must have a positive length

I checked and I don't see any missing attributes. What could be the problem?

Here is some sample data, as requested by Dr. Mike. Note that the data contains 50 columns (in my original post I used 30 because I was trying to simplify things).

structure(list(X = 1:10, X1 = c(12L, 6L, 6L, 11L, 17L, 7L, 5L, 
3L, 4L, 7L), X2 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L), .Label = "A", class = "factor"), X3 = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "us", class = "factor"), 
    X4 = c(TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, FALSE, 
    TRUE), X5 = c(TRUE, TRUE, TRUE, FALSE, TRUE, FALSE, TRUE, 
    FALSE, FALSE, TRUE), X6 = c(FALSE, FALSE, FALSE, FALSE, FALSE, 
    FALSE, FALSE, FALSE, FALSE, FALSE), X7 = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "DAY", class = "factor"), 
    X8 = c(159L, 47L, 163L, 51L, 171L, 145L, 31L, 49L, 49L, 154L
    ), X9 = structure(c(2L, 5L, 2L, 4L, 3L, 2L, 1L, 5L, 1L, 2L
    ), .Label = c("100-150", "150-200", "50-100", "650-700", 
    "unknown"), class = "factor"), X10 = c(0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L), X11 = c(0L, 1L, 0L, 2L, 0L, 0L, 0L, 
    1L, 0L, 0L), X12 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L), X13 = c(0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L), X14 = c(0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), X15 = c(1L, 1L, 2L, 
    2L, 0L, 1L, 0L, 2L, 2L, 2L), X16 = c(2L, 1L, 2L, 1L, 0L, 
    2L, 1L, 2L, 1L, 2L), X17 = c(6L, 4L, 6L, 7L, 0L, 4L, 3L, 
    4L, 4L, 8L), X18 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L), X19 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), X20 = c(1L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), X21 = c(7L, 6L, 7L, 
    3L, 0L, 6L, 6L, 2L, 7L, 7L), X22 = c(0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L), X23 = c(2L, 2L, 2L, 2L, 0L, 2L, 2L, 
    2L, 2L, 2L), X24 = c(0L, 1L, 2L, 1L, 0L, 0L, 0L, 1L, 1L, 
    1L), X25 = c(2L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L), X26 = c(0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), X27 = c(0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L), X28 = c(1L, 0L, 0L, 0L, 0L, 
    0L, 0L, 1L, 1L, 0L), X29 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L), X30 = c(1L, 1L, 1L, 1L, 0L, 0L, 1L, 0L, 2L, 
    0L), X31 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), X32 = c(2L, 
    0L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 1L), X33 = c(0L, 0L, 1L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L), X34 = c(1L, 1L, 2L, 3L, 0L, 
    2L, 0L, 2L, 2L, 1L), X35 = c(0L, 1L, 0L, 0L, 0L, 0L, 0L, 
    1L, 2L, 0L), X36 = c(3L, 3L, 5L, 3L, 0L, 4L, 4L, 7L, 4L, 
    2L), X37 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), X38 = c(0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), X39 = c(0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L), X40 = c(3L, 1L, 4L, 3L, 0L, 
    2L, 0L, 3L, 1L, 0L), X41 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L), X42 = c(3L, 6L, 3L, 2L, 0L, 2L, 0L, 3L, 2L, 
    3L), X43 = c(1L, 0L, 0L, 2L, 0L, 0L, 0L, 0L, 0L, 0L), X44 = c(1L, 
    2L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L), X45 = c(0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L), X46 = c(0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L), X47 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L), X48 = c(0L, 1L, 0L, 1L, 0L, 0L, 1L, 2L, 1L, 
    0L), X49 = c(2L, 0L, 1L, 1L, 0L, 0L, 2L, 3L, 0L, 4L), target = c(FALSE, 
    FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE
    )), .Names = c("X", "X1", "X2", "X3", "X4", "X5", "X6", "X7", 
"X8", "X9", "X10", "X11", "X12", "X13", "X14", "X15", "X16", 
"X17", "X18", "X19", "X20", "X21", "X22", "X23", "X24", "X25", 
"X26", "X27", "X28", "X29", "X30", "X31", "X32", "X33", "X34", 
"X35", "X36", "X37", "X38", "X39", "X40", "X41", "X42", "X43", 
"X44", "X45", "X46", "X47", "X48", "X49", "target"), row.names = c(NA, 
10L), class = "data.frame")

Following Dr. Mike's advice, I converted the non-numeric columns to factors. I still get the exact same error.
So the code is now:

# Second attempt from the question: same pipeline as before, but every
# non-numeric column is first converted to a factor.
data <- read.csv(filename, sep=",")
# which(sapply(data, is.numeric)) gives the positions of the numeric columns;
# the negative index selects all the other columns, which are then replaced by
# their as.factor() versions. Logical columns are not numeric, so they are
# converted as well.
# NOTE(review): if no column were numeric, -which(...) would be an empty index
# and select no columns at all; a logical mask (!sapply(...)) avoids that.
data[, -which(sapply(data, is.numeric))]<-
  as.data.frame(lapply((data[,-which(sapply(data, is.numeric))]), as.factor))
# Rows 1-1000 are used for training, rows 1001-1500 for testing.
trainData <- data[1:1000,,drop = FALSE]    
testData <- data[1001:1500,,drop = FALSE]  
# Columns 1-49 are passed as predictors and column 50 as the class labels.
model <- naiveBayes(trainData[,1:49], trainData[,50])   
# Predict on the test rows with column 50 removed.
predicted <- predict(model, testData[,-50])  

You should make sure to work with factors for the categorical variables. The following is a small example using your data.

# Small worked example on the 10 sample rows (needs e1071 loaded and `data`
# holding the sample data frame).
# The sample rows all have target == FALSE, so relabel half of them to give
# the classifier two classes to learn from.
data$target[1:5] <- TRUE
data$target[6:10] <- FALSE
# Shuffle the rows so the split below is not ordered by class; the seed makes
# the shuffle reproducible.
set.seed(1)
data <- data[sample(seq_len(nrow(data))), ]
# Make sure the target is a factor
data$target <- as.factor(data$target)
# Work with a subset since we only have 10 data points.
data2 <- data[, c(2, 49:50, 51)]
train_rows <- data2[1:5, ]
test_rows <- data2[6:10, ]
# A training set containing a single class is what triggers the
# "dim(X) must have a positive length" error, so fail early with a clear
# message instead.
if (nlevels(droplevels(train_rows$target)) < 2L) {
  stop("training rows contain a single class; naiveBayes needs at least two",
       call. = FALSE)
}
model <- naiveBayes(target ~ ., data = train_rows)
# Predict once and reuse the result for printing and for the table.
test_pred <- predict(model, test_rows)
test_pred
# Compare the predictions with the held-out class labels (not with an
# unrelated predictor column). Rows are predicted classes, columns are actual
# classes; the counts depend on the shuffle above.
table(predicted = test_pred, actual = test_rows$target)

So make sure to convert the categorical variables and the response to factors, and use a subset of the variables.

Found the problem and recording the solution in case someone googles this error like I did.
In this case, the problem was that the "target" attribute contained only examples labeled "false". Since there were no examples labeled "true", the classifier didn't work.
I still don't understand why I got this specific error and not something readable like "cannot classify: no examples labeled 'true'". But at least I solved it.

Many thanks to Dr. Mike for pointing me in the right direction (which in this case was: look at the data, not at the code).

I had a similar issue. After many hours of research I changed TRUE/FALSE to the character values y/n. It seems R gets confused about whether TRUE/FALSE is categorical or numeric.

The technical posts on this site are licensed under CC BY-SA 4.0. If you need to reprint, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM