
Train and test datasets change for each fold in k-fold cross-validation, so the accuracy of the naive Bayes classifier changes

I am trying to use the naive Bayes classifier code from here. I am using 5 folds on my dataset. The problem is that the test and train datasets change for each fold, so the accuracy also changes on each execution, but I need a fixed accuracy result. I am testing with a small sample dataset. My Jupyter code is here:

import numpy as np
from random import randrange
import math

# Returns the mean of numbers
def mean(numbers):
    
    return np.mean(numbers)

#Returns the std_deviation of numbers
def stdev(numbers):

    return np.std(numbers)  

#Split dataset into the k folds. Returns the list of k folds
def cross_validation_split(dataset, n_folds):
    
    dataset_split = list()
    dataset_copy = list(dataset)
    fold_size = int(len(dataset) / n_folds)
    for i in range(n_folds):
        fold = list()
        while len(fold) < fold_size:
            index = randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split

# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
   
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0

#Evaluate an algorithm using a cross validation split

def evaluate_algorithm(dataset, algorithm, n_folds):
   
    folds = cross_validation_split(dataset, n_folds)
    scores = list()
    for fold in folds:
        train_set = list(folds)
        train_set.remove(fold)
        train_set = sum(train_set, [])
        test_set = list()
        for row in fold:
            row_copy = list(row)
            test_set.append(row_copy)
            row_copy[-1] = None
        predicted = algorithm(train_set, test_set)
        actual = [row[-1] for row in fold]
        accuracy = accuracy_metric(actual, predicted)
        scores.append(accuracy)
    return scores  # Return only the per-fold scores; the caller averages them.

#Split training set by class value
def separate_by_class(dataset):

    separated = {}
    for i in range(len(dataset)):
        row = dataset[i]
        if row[-1] not in separated:
            separated[row[-1]] = []
        separated[row[-1]].append(row)
    return separated

#Find the mean and standard deviation of each feature in dataset
def model(dataset):
   
    models = [(mean(attribute), stdev(attribute)) for attribute in zip(*dataset)]
    models.pop() #Remove last entry because it is class value.
    return models

#find the mean and standard deviation of each feature in dataset by their class
def model_by_class(dataset):
    
    separated = separate_by_class(dataset)
    class_models = {}
    for (classValue, instances) in separated.items():
        class_models[classValue] = model(instances)
    return class_models

#Calculate probability using gaussian density function
def calculate_pdf(x, mean, stdev):
    
    if stdev == 0.0:
        if x == mean:
            return 1.0
        else:
            return 0.0
    exponent = math.exp(-(math.pow(x - mean, 2) / (2 * math.pow(stdev, 2))))
    return 1 / (math.sqrt(2 * math.pi) * stdev) * exponent

#Calculate the class probability for input sample. Combine probability of each feature
def calculate_class_probabilities(models, input_vector):
    
    probabilities = {}
    for (classValue, classModels) in models.items():
        probabilities[classValue] = 1
        for i in range(len(classModels)):
            (mean, stdev) = classModels[i]
            x = input_vector[i]
            probabilities[classValue] *= calculate_pdf(x, mean, stdev)
    return probabilities

#Compare probability for each class. Return the class label which has max probability.
def predict(models, inputVector):
    
    probabilities = calculate_class_probabilities(models, inputVector)
    (bestLabel, bestProb) = (None, -1)
    for (classValue, probability) in probabilities.items():
        if bestLabel is None or probability > bestProb:
            bestProb = probability
            bestLabel = classValue
    return bestLabel

#Get class label for each value in test set.
def getPredictions(models, testSet):

    predictions = []
    for i in range(len(testSet)):
        result = predict(models, testSet[i])
        predictions.append(result)
    return predictions

#Create a naive bayes model. Then test the model and returns the testing result.
def naive_bayes(train, test):
    
    summaries = model_by_class(train)
    predictions = getPredictions(summaries, test)
    return predictions

# load and prepare data for result
dataset =[[1, 20, 1],
          [2, 21, 0],
          [3, 22, 1],
          [4, 22, 0],
          [5, 20, 0],
          [6, 20, 1],
          [7, 21, 0],
          [8, 22, 1],
          [9, 22, 0],
          [10, 20, 1]]
      
n_folds = 5
print ("---------- Gaussian Naive Bayes ---------------")
accuracy_naive = evaluate_algorithm(dataset, naive_bayes, n_folds)
print ("Naive Bayes Classification")
print ('Accuracy in each fold: %s' % accuracy_naive)
print ('Average Accuracy: %f' % (sum(accuracy_naive) / len(accuracy_naive)))

I have tried to test it with sample data. I think the problem is here:

# Split dataset into the k folds. Returns the list of k folds
def cross_validation_split(dataset, n_folds):
    
    dataset_split = list()
    dataset_copy = list(dataset)
    fold_size = int(len(dataset) / n_folds)
    for i in range(n_folds):
        fold = list()
        while len(fold) < fold_size:
            index = randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split



#Test splitting data
dataset = [[1, 20, 1],
           [2, 21, 0],
           [3, 22, 1],
           [4, 22, 0],
           [5, 20, 0],
           [6, 20, 1],
           [7, 21, 0],
           [8, 22, 1],
           [9, 22, 0],
           [10, 20, 1]
           ]
nfold = 5
dataset_split = cross_validation_split(dataset, nfold)
dataset_split

Thank you in advance.

The answer provided by @Amesh Jayaweera is correct, but I would like to point out that there is a predefined function for this in sklearn:

from sklearn.model_selection import StratifiedKFold
splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=1234)

This is a better option because of its clean implementation and the added advantage that the folds are stratified. Also, random_state is the seed (note that shuffle=True is required for it to take effect). You can check its implementation online.
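
For illustration, here is a minimal sketch of using it on the sample dataset from the question (the X/y split and the printout are my additions, not part of the original post):

import numpy as np
from sklearn.model_selection import StratifiedKFold

# Sample dataset from the question; the last column is the class label.
data = np.array([[1, 20, 1], [2, 21, 0], [3, 22, 1], [4, 22, 0], [5, 20, 0],
                 [6, 20, 1], [7, 21, 0], [8, 22, 1], [9, 22, 0], [10, 20, 1]])
X, y = data[:, :-1], data[:, -1]

# shuffle=True is required for random_state to take effect.
splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=1234)
for train_idx, test_idx in splitter.split(X, y):
    print("train:", train_idx, "test:", test_idx)

Each fold keeps the class ratio of the full dataset, and the same seed reproduces the same folds on every run.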

Seed the random module before calling randrange, so that the split is the same in every execution.

So you can change the code as follows:

import random

# Split dataset into the k folds. Returns the list of k folds
def cross_validation_split(dataset, n_folds):
    random.seed(0)  # Fixed seed, so the folds are identical on every run.
    dataset_split = list()
    dataset_copy = list(dataset)
    fold_size = int(len(dataset) / n_folds)
    for i in range(n_folds):
        fold = list()
        while len(fold) < fold_size:
            index = random.randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split



#Test splitting data
dataset = [[1, 20, 1],
           [2, 21, 0],
           [3, 22, 1],
           [4, 22, 0],
           [5, 20, 0],
           [6, 20, 1],
           [7, 21, 0],
           [8, 22, 1],
           [9, 22, 0],
           [10, 20, 1]
           ]
nfold = 5
dataset_split = cross_validation_split(dataset, nfold)
dataset_split 
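
With the seed in place, repeated calls return identical folds, which you can verify with a quick check (this assertion is my addition, not part of the original answer):

# The fixed seed makes the split deterministic across calls.
split_a = cross_validation_split(dataset, nfold)
split_b = cross_validation_split(dataset, nfold)
assert split_a == split_b

One caveat: calling random.seed(0) inside the function resets the global random state on every call. Seeding once at the top of the script, or using a dedicated random.Random(0) instance, avoids that side effect.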
