简体   繁体   中英

Scikit Learn multiclass classification (perfect results)

Hello, I'm very new to scikit-learn and I'm trying to do some multiclass text classification; I'm following this tutorial.
My dataset has 4 classes ('fipdl', 'lna', 'm5s', 'pd'), so I have 4 folders (one per class), and each folder contains 120 txt files with about 25 rows of text each (Facebook statuses). I use 90% of the files for training and 10% for testing.
The names of 10% of my txt files start with 'ts', and I'm using these files for testing.
So my code is:

import sys
import os
import time

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer

def usage():
    """Print a short usage message showing how to invoke this script."""
    print("Usage:")
    script_name = sys.argv[0]
    command_line = "python %s <data_dir>" % script_name
    print(command_line)

if __name__ == '__main__':
    # Script entry point: train three SVM variants on the text files found
    # under <data_dir>/<class>/ and report their scores on the held-out
    # files (those whose name starts with 'ts').

    # The data directory is a required command-line argument.
    if len(sys.argv) < 2:
        usage()
        sys.exit(1)

    data_dir = sys.argv[1]
    classes = ['fipdl', 'lna', 'm5s', 'pd']

    # Read the data: one sub-directory per class, one document per file.
    train_data = []
    train_labels = []
    test_data = []
    test_labels = []

    for curr_class in classes:
        dirname = os.path.join(data_dir, curr_class)
        for fname in os.listdir(dirname):
            with open(os.path.join(dirname, fname), 'r') as f:
                content = f.read()
                # Files prefixed with 'ts' form the test set; every other
                # file in the class directory is used for training.
                if fname.startswith('ts'):
                    test_data.append(content)
                    test_labels.append(curr_class)
                else:
                    train_data.append(content)
                    train_labels.append(curr_class)

    # Create feature vectors. The vectorizer is fitted on the training
    # documents only and then reused, unchanged, for the test documents.
    vectorizer = TfidfVectorizer(min_df=5,
                                 max_df=0.8,
                                 sublinear_tf=True,
                                 use_idf=True)
    train_vectors = vectorizer.fit_transform(train_data)
    test_vectors = vectorizer.transform(test_data)

    # Perform classification with SVM, kernel=rbf
    classifier_rbf = svm.SVC()
    t0 = time.time()
    classifier_rbf.fit(train_vectors, train_labels)
    t1 = time.time()
    prediction_rbf = classifier_rbf.predict(test_vectors)
    t2 = time.time()
    time_rbf_train = t1 - t0
    time_rbf_predict = t2 - t1

    # Perform classification with SVM, kernel=linear
    classifier_linear = svm.SVC(kernel='linear')
    t0 = time.time()
    classifier_linear.fit(train_vectors, train_labels)
    t1 = time.time()
    prediction_linear = classifier_linear.predict(test_vectors)
    t2 = time.time()
    time_linear_train = t1 - t0
    time_linear_predict = t2 - t1

    # Perform classification with LinearSVC
    classifier_liblinear = svm.LinearSVC()
    t0 = time.time()
    classifier_liblinear.fit(train_vectors, train_labels)
    t1 = time.time()
    prediction_liblinear = classifier_liblinear.predict(test_vectors)
    t2 = time.time()
    time_liblinear_train = t1 - t0
    time_liblinear_predict = t2 - t1

    # Print results: timings plus a per-class report for each classifier.
    print("Results for SVC(kernel=rbf)")
    print("Training time: %fs; Prediction time: %fs" % (time_rbf_train, time_rbf_predict))
    print(classification_report(test_labels, prediction_rbf))
    print("Results for SVC(kernel=linear)")
    print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
    print(classification_report(test_labels, prediction_linear))
    print("Results for LinearSVC()")
    print("Training time: %fs; Prediction time: %fs" % (time_liblinear_train, time_liblinear_predict))
    print(classification_report(test_labels, prediction_liblinear))

output :

Results for SVC(kernel=rbf)
Training time: 0.940005s; Prediction time: 0.055970s
             precision    recall  f1-score   support

      fipdl       1.00      1.00      1.00        11
        lna       1.00      1.00      1.00        11
        m5s       1.00      1.00      1.00        11
         pd       1.00      1.00      1.00        11

avg / total       1.00      1.00      1.00        44

Results for SVC(kernel=linear)
Training time: 0.941262s; Prediction time: 0.056382s
             precision    recall  f1-score   support

      fipdl       1.00      1.00      1.00        11
        lna       1.00      1.00      1.00        11
        m5s       1.00      1.00      1.00        11
         pd       1.00      1.00      1.00        11

avg / total       1.00      1.00      1.00        44

Results for LinearSVC()
Training time: 0.034038s; Prediction time: 0.000323s
             precision    recall  f1-score   support

      fipdl       1.00      1.00      1.00        11
        lna       1.00      1.00      1.00        11
        m5s       1.00      1.00      1.00        11
         pd       1.00      1.00      1.00        11

avg / total       1.00      1.00      1.00        44

Now the results seem too good to be true, since every method gave me a precision of 1.
I also think it would be nice to predict a string that I pass in myself instead of a test set, so that I can do more tests, so I changed the original code to this:

import sys
import os
import time

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer

def usage():
    """Print a short usage message showing how to invoke this script."""
    print("Usage:")
    script_name = sys.argv[0]
    command_line = "python %s <data_dir>" % script_name
    print(command_line)

if __name__ == '__main__':
    # Script entry point: same pipeline as the first script, modified to also
    # classify a single hand-written string with the rbf classifier.

    # The data directory is a required command-line argument.
    if len(sys.argv) < 2:
        usage()
        sys.exit(1)

    data_dir = sys.argv[1]
    classes = ['fipdl', 'lna','m5s','pd']

    # Read the data: one sub-directory per class, one document per file.
    train_data = []
    train_labels = []
    test_data = []
    test_labels = []

    for curr_class in classes:
        dirname = os.path.join(data_dir, curr_class)
        for fname in os.listdir(dirname):
            with open(os.path.join(dirname, fname), 'r') as f:
                content = f.read()
                # Files prefixed with 'ts' form the test set; every other
                # file is used for training.
                if fname.startswith('ts'):
                    test_data.append(content)
                    test_labels.append(curr_class)

                else:
                    train_data.append(content)
                    train_labels.append(curr_class)



    # Create feature vectors
    vectorizer = TfidfVectorizer(min_df=5,
                                 max_df = 0.8,
                                 sublinear_tf=True,
                                 use_idf=True)
    string = ['string to predict'] # the custom text to classify (a one-item list)
    # NOTE(review): the vectorizer has not been fitted yet at this point;
    # fit_transform() is only called on the next statement.
    vector = vectorizer.transform(string) # convert the custom text to a feature vector
    train_vectors = vectorizer.fit_transform(train_data)

    test_vectors = vectorizer.transform(test_data)
    # Perform classification with SVM, kernel=rbf
    classifier_rbf = svm.SVC()
    t0 = time.time()
    classifier_rbf.fit(train_vectors, train_labels)
    t1 = time.time()
    # Unlike the other two classifiers, this one predicts the custom string,
    # not the test set.
    prediction_rbf = classifier_rbf.predict(vector) # predict the custom text
    t2 = time.time()
    time_rbf_train = t1-t0
    time_rbf_predict = t2-t1

    # Perform classification with SVM, kernel=linear
    classifier_linear = svm.SVC(kernel='linear')
    t0 = time.time()
    classifier_linear.fit(train_vectors, train_labels)
    t1 = time.time()
    prediction_linear = classifier_linear.predict(test_vectors)
    t2 = time.time()
    time_linear_train = t1-t0
    time_linear_predict = t2-t1

    # Perform classification with LinearSVC
    classifier_liblinear = svm.LinearSVC()
    t0 = time.time()
    classifier_liblinear.fit(train_vectors, train_labels)
    t1 = time.time()
    prediction_liblinear = classifier_liblinear.predict(test_vectors)
    t2 = time.time()
    time_liblinear_train = t1-t0
    time_liblinear_predict = t2-t1

    # Print results in a nice table
    print("Results for SVC(kernel=rbf)")
    print("Training time: %fs; Prediction time: %fs" % (time_rbf_train, time_rbf_predict))
    # NOTE(review): prediction_rbf was computed from `vector` (the custom
    # string) while test_labels holds the test-set labels, so the two inputs
    # presumably differ in length -- likely related to the reported
    # "inconsistent numbers of samples" ValueError; confirm.
    print(classification_report(test_labels, prediction_rbf))
    print("Results for SVC(kernel=linear)")
    print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
    print(classification_report(test_labels, prediction_linear))
    print("Results for LinearSVC()")
    print("Training time: %fs; Prediction time: %fs" % (time_liblinear_train, time_liblinear_predict))
    print(classification_report(test_labels, prediction_liblinear))

but it fails with

ValueError: Found arrays with inconsistent numbers of samples: [18 44]

Am I missing something? Or maybe this is a totally wrong approach?
Any help would be really appreciated.
Thanks in advance, Nico.

# Create feature vectors
    vectorizer = TfidfVectorizer(min_df=5,
                                 max_df = 0.8,
                                 sublinear_tf=True,
                                 use_idf=True)
    string = ['string to predict'] # the custom text to classify
    # transform() is called here before the vectorizer has been fitted;
    # fit_transform() only runs on the following line.
    vector = vectorizer.transform(string) # convert the custom text
    train_vectors = vectorizer.fit_transform(train_data)

You are creating a new instance of the vectorizer and calling its transform method before fitting it. Just swap the order of the last two lines, like this:

    vectorizer = TfidfVectorizer(min_df=5,
                                 max_df = 0.8,
                                 sublinear_tf=True,
                                 use_idf=True)
    string = ['string to predict'] # the custom text to classify
    # Fit the vectorizer on the training data first ...
    train_vectors = vectorizer.fit_transform(train_data)
    # ... then reuse the fitted vectorizer to convert the custom text.
    vector = vectorizer.transform(string) # convert the custom text

Even though I have not yet figured out why it gives me perfect results, I decided to use a different approach to classify my text (with MultinomialNB) and to test it with a string of my choice. I'm not sure if this is the best way to do it, but it works, so I decided to post it as an answer (please note that not all the lines of code are necessary):

# -*- coding: utf-8 -*-
import sys
import os
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.datasets import load_files
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB


# Classify one string given on the command line with a MultinomialNB model
# trained on the whole 'scikit' dataset.

# The text to classify is a required argument; exit with a usage message
# instead of an IndexError when it is missing.
if len(sys.argv) < 2:
    sys.exit('Usage: python %s "string to predict"' % sys.argv[0])
string = sys.argv[1]

# Load the personal dataset from the 'scikit' directory (load_files reads
# one sub-directory per category; see the scikit-learn documentation).
sets = load_files('scikit')

# Turn the training documents into token-count vectors ...
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(sets.data)

# ... and re-weight the counts with tf-idf.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Train the classifier on every document in the dataset.
clf = MultinomialNB().fit(X_train_tfidf, sets.target)

# Push the new text through the same fitted transformers before predicting,
# so that it is described by the same features as the training data.
docs_new = [string]
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)

# Print each input text next to the name of its predicted category.
for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, sets.target_names[category]))

Then, from the console, just run: script.py "string to predict"

A lot of improvements could be made to this code, such as dumping the trained model to disk, but it is good enough for my usage.

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM