简体   繁体   中英

sklearn SVM, different accuracy in Python2 vs Python3

I have the following code where I am doing 4 fold cross-validation on a dataset with feature vector size 11156 and 129 data points .

But the issue is that the same code gives different results when I run it using Python2 compiler vs when I run it using the Python3 compiler.

In the case of Python2 it gives accuracy values in the 90s while in case of Python3 it gives accuracy values in 70s and 80s

from __future__ import division
import scipy.io as sio
import numpy as np
from sklearn import svm
import random
from sklearn.metrics import confusion_matrix as cm
from sklearn.metrics import accuracy_score

# Loading Data
data = sio.loadmat('data.mat')

feat_highcurve_u = np.array(data['HiCurve'])[0]
feat_lowcurve_u = np.array(data['LoCurve'])[0]

feat_highcurve = np.array([np.array(x[0]
                [int(len(x[0])/2) - 2789:
                 int(len(x[0])/2) + 2789]) 
                for x in feat_highcurve_u])
feat_lowcurve = np.array([np.array(x[0]
                [int(len(x[0])/2) - 2789:
                 int(len(x[0])/2) + 2789])
                for x in feat_lowcurve_u])

X_data = [np.concatenate((a,b), axis = 0) 
          for a,b in zip(feat_highcurve, 
                         feat_lowcurve)]

X = np.array(X_data)
X = np.transpose(X,(1,0))
avg_X = np.array([sum(x)/len(x) 
                  for x in X])

X_data = [x-avg_X for x in X_data]

y_labels = data['ClassLabels']
y_labels = np.array([(x[0]-1) 
                     for x in y_labels])


def calculate_ber(c_mat):
    val = 0
    for index, row in enumerate(c_mat):
        val += (np.sum(row) - row[index])/ np.sum(row)

    return val / len(c_mat)


def apply_svm(nu=0.1, kernel='rbf', degree=3):
    clf = svm.NuSVC(random_state=0, nu=nu, kernel=kernel, degree=degree)

    avg_accuracy = 0
    avg_ber = 0

    for n in range(10):
        # Randomizing the data
        combined = list(zip(X_data, y_labels))
        random.shuffle(combined)
        X_data[:], y_labels[:] = zip(*combined)

        # Splitting into 4 folds
        X_folds = [X_data[i:i+int(len(X_data)/4)] for i in range(0, len(X_data), int(len(X_data)/4))]
        y_folds = [y_labels[i:i+int(len(y_labels)/4)] for i in range(0, len(y_labels), int(len(y_labels)/4))]

        if(len(X_folds) == 5):
            X_folds[3] = np.concatenate((X_folds[3], X_folds[4]), axis = 0)
            X_folds.pop()

            y_folds[3] = np.concatenate((y_folds[3], y_folds[4]), axis = 0)
            y_folds.pop()

        accuracy = 0
        ber = 0

        # Iterating over folds
        for i in range(4):
            # Selecting test fold
            X_test = X_folds[i]
            y_test = y_folds[i]

            # Concatenating the rest of the folds
            o = [i for i in range(4)]
            o.remove(i)

            X_train = np.concatenate((X_folds[o[0]], X_folds[o[1]], X_folds[o[2]]), axis = 0)
            y_train = np.concatenate((y_folds[o[0]], y_folds[o[1]], y_folds[o[2]]), axis = 0)

            # Training SVM to fit the data
            clf.fit(X_train, y_train)

            # Testing the SVM
            preds = clf.predict(X_test)
            accuracy += (len([i for i in range(len(preds)) if preds[i] == y_test[i]])/len(preds))
            c_mat = cm(y_test, preds)
            ber += calculate_ber(c_mat)

        #print("Four fold cross-validation accuracy: Step("+str(n+1)+"): ",accuracy/4.0)
        avg_accuracy += (accuracy/4)
        avg_ber += (ber/4)

    print("After ten steps Average Accuracy: ", avg_accuracy/10) 
    print("After ten steps Average BER: ", avg_ber/10) 
    return ((avg_accuracy/10), (avg_ber/10))

nu_accuracies = {}
nu_values = [0.05, 0.1, 0.15, 0.20, 0.25, 0.30]

for nu_val in nu_values:
    nu_accuracies[nu_val] = apply_svm(nu=nu_val)

print("Final Metrics: ", nu_accuracies)

A late comment but for anyone else that is looking for differences in the two - sklearn changed their default solver for logistic regression , this could be the difference in some cases. some of the SVM implementations have undergone other changes in default parameters.

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM