[英]Multi-class, multi-label, ordinal classification with sklearn
我想知道如何使用 sklearn 進行多類、多標籤的序數分類。我想預測目標群體的排名,範圍從某個地點最受歡迎的群體 (1) 到最不受歡迎的群體 (7)。我似乎無法正確處理。你能幫幫我嗎?
# Random Forest Classification
# Import
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.metrics import make_scorer, accuracy_score, confusion_matrix, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
# Load the dataset: the features are every column from the fifth one up to
# (but excluding) the last column, and the last column is the target.
dataset = pd.read_excel('alle_probs_edit.v2.xlsx')
X = dataset.iloc[:, 4:-1].values
Y = dataset.iloc[:, -1].values

# Hold out 20% of the rows as a test set (seeded so the split is repeatable).
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42)

# Scale the features (put all variables on the same level); whether this is
# necessary depends on the chosen method. The scaler is fitted on the
# training rows only and then applied unchanged to the test rows.
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

# Base classifier whose hyper-parameters are tuned below.
classifier = RandomForestClassifier(criterion='entropy')

# Parameter combinations to try.
parameters = {
    'bootstrap': [True, False],
    'max_depth': [50],
    # 'auto' used to be listed here, but for classifiers it was only an alias
    # of 'sqrt' (so half of the grid was a duplicate) and it was removed in
    # scikit-learn 1.3. 'log2' gives a genuinely different second candidate.
    'max_features': ['sqrt', 'log2'],
    'min_samples_leaf': [1, 2, 3, 4],
    'min_samples_split': [9, 10, 11, 12, 13],
    'n_estimators': [500, 1000, 1500],
}

# Scoring used to compare parameter combinations.
acc_scorer = make_scorer(accuracy_score)

# Run the grid search (3-fold CV, all cores).
grid_obj = GridSearchCV(classifier, parameters, scoring=acc_scorer, cv=3, n_jobs=-1)
grid_obj = grid_obj.fit(X_train, Y_train)

# GridSearchCV (refit=True by default) has already refitted the best
# parameter combination on the whole training set, so no extra fit is needed.
classifier = grid_obj.best_estimator_

# Predict the test data.
Y_pred = classifier.predict(X_test)

# Confusion matrix of true vs. predicted test labels.
cm = pd.DataFrame(confusion_matrix(Y_test, Y_pred))

# Accuracy on the held-out test set.
accuracy1 = accuracy_score(Y_test, Y_pred)
print("Accuracy1: %.2f%%" % (accuracy1 * 100.0))

# 10-fold cross-validation on the training set. The scores are kept in their
# own variable so the test accuracy above is not overwritten, and the spread
# is stored and reported instead of being computed and thrown away.
cv_scores = cross_val_score(estimator=classifier, X=X_train, y=Y_train, cv=10)
kfold = cv_scores.mean()
kfold_std = cv_scores.std()
print("k-fold accuracy: %.2f%% (+/- %.2f%%)" % (kfold * 100.0, kfold_std * 100.0))
這可能不是您要尋找的確切答案,但這篇文章概述了如下技術:
我們可以利用類別值的有序性,將 k 類序數回歸問題轉換為 k-1 個二元分類問題:把序數值為 V1、V2、V3、... Vk 的序數屬性 A* 轉換為 k-1 個二元屬性,原始屬性的前 k-1 個值各對應一個。第 i 個二元屬性表示測試 A* > Vi。
本質上,聚合多個二元分類器(預測目標 > 1、目標 > 2、目標 > 3、目標 > 4)的結果,就能預測目標是 1、2、3、4 還是 5。作者創建了一個 OrdinalClassifier 類,在 Python 字典中存儲多個二元分類器。
class OrdinalClassifier():
    """Ordinal classifier assembled from k-1 binary "is y > V_i?" models.

    For the sorted class values V_1 < ... < V_k seen in ``fit``, one clone of
    the wrapped classifier is trained per threshold V_1 .. V_(k-1) on the
    binary target ``y > V_i``. Class probabilities are then recovered as
    differences of consecutive ``Pr(y > V_i)`` estimates.

    Parameters
    ----------
    clf : estimator
        Unfitted binary classifier prototype. It is cloned for every
        threshold and must provide ``fit`` and ``predict_proba``.

    Attributes
    ----------
    clfs : dict
        Maps the threshold position ``i`` to the classifier fitted on
        ``y > unique_class[i]``.
    unique_class : ndarray
        Sorted distinct class values seen in ``fit``.
    """

    def __init__(self, clf):
        self.clf = clf
        self.clfs = {}

    def fit(self, X, y):
        """Fit one binary classifier per threshold and return ``self``."""
        # Local import: the surrounding file only imports ``clone`` further
        # down, after this class is defined.
        from sklearn.base import clone

        y = np.asarray(y)
        # np.unique already returns the distinct values in sorted order.
        self.unique_class = np.unique(y)
        # Start from a clean slate so a re-fit with fewer classes does not
        # keep classifiers from an earlier call.
        self.clfs = {}
        # One model per threshold. This also covers the two-class case
        # (a single threshold), which used to be skipped and then made
        # predict_proba fail with a KeyError.
        for i in range(self.unique_class.shape[0] - 1):
            binary_y = (y > self.unique_class[i]).astype(np.uint8)
            clf = clone(self.clf)
            clf.fit(X, binary_y)
            self.clfs[i] = clf
        return self

    def predict_proba(self, X):
        """Return an (n_samples, n_classes) array of class probabilities.

        Columns follow the order of ``unique_class``.
        """
        n_classes = self.unique_class.shape[0]
        if n_classes == 1:
            # Only one class was seen, so there is no threshold model.
            return np.ones((np.shape(X)[0], 1))

        # greater[i] = Pr(y > V_i) for every threshold position i.
        greater = [self.clfs[i].predict_proba(X)[:, 1]
                   for i in range(n_classes - 1)]

        # V1 = 1 - Pr(y > V1)
        columns = [1 - greater[0]]
        # Vi = Pr(y > Vi-1) - Pr(y > Vi)
        columns.extend(greater[i - 1] - greater[i]
                       for i in range(1, n_classes - 1))
        # Vk = Pr(y > Vk-1)
        columns.append(greater[-1])
        return np.vstack(columns).T

    def predict(self, X):
        """Return the most probable class label for every row of ``X``."""
        # Map the argmax column back to the actual class value; returning the
        # bare column index is only correct when the labels are 0..k-1.
        best = np.argmax(self.predict_proba(X), axis=1)
        return self.unique_class[best]

    def score(self, X, y, sample_weight=None):
        """Return the (optionally weighted) accuracy of ``predict(X)`` on ``y``."""
        # Compare labels with labels. Re-indexing ``y`` by its own unique
        # values gives wrong positions whenever the evaluation set does not
        # contain every class seen during fit.
        return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
這是一個使用 KNN 的示例,它應該可以在 sklearn 管道或網格搜索中進行調參。
from sklearn.neighbors import KNeighborsClassifier
from sklearn.base import clone, BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_is_fitted, check_array
from sklearn.utils.multiclass import check_classification_targets
class KNeighborsOrdinalClassifier(BaseEstimator, ClassifierMixin):
    """Ordinal k-nearest-neighbours classifier.

    For the sorted class values V_1 < ... < V_k seen in ``fit``, one
    ``KNeighborsClassifier`` is trained per threshold V_1 .. V_(k-1) on the
    binary target ``y > V_i``; class probabilities are recovered as
    differences of consecutive ``Pr(y > V_i)`` estimates.

    All constructor parameters are stored unchanged and forwarded to
    ``KNeighborsClassifier`` when ``fit`` is called.

    Attributes
    ----------
    classes_ : ndarray
        Sorted distinct class values seen in ``fit``.
    clf_ : KNeighborsClassifier
        Unfitted prototype built from this estimator's parameters.
    clfs_ : dict
        Maps the threshold position ``i`` to the classifier fitted on
        ``y > classes_[i]``.
    """

    def __init__(self, n_neighbors=5, *, weights='uniform',
                 algorithm='auto', leaf_size=30, p=2,
                 metric='minkowski', metric_params=None, n_jobs=None):
        self.n_neighbors = n_neighbors
        self.weights = weights
        self.algorithm = algorithm
        self.leaf_size = leaf_size
        self.p = p
        self.metric = metric
        self.metric_params = metric_params
        self.n_jobs = n_jobs

    def fit(self, X, y):
        """Fit one binary KNN model per threshold and return ``self``."""
        X, y = check_X_y(X, y)
        check_classification_targets(y)

        self.clf_ = KNeighborsClassifier(**self.get_params())
        self.clfs_ = {}
        # np.unique already returns the distinct values in sorted order.
        self.classes_ = np.unique(y)

        # One model per threshold. This also covers the two-class case
        # (a single threshold), which used to be skipped and then made
        # predict_proba fail with a KeyError.
        for i in range(self.classes_.shape[0] - 1):
            binary_y = (y > self.classes_[i]).astype(np.uint8)
            clf = clone(self.clf_)
            clf.fit(X, binary_y)
            self.clfs_[i] = clf
        return self

    def predict_proba(self, X):
        """Return an (n_samples, n_classes) array of class probabilities.

        Columns follow the order of ``classes_``.
        """
        check_is_fitted(self, ['classes_', 'clf_', 'clfs_'])
        X = check_array(X)

        n_classes = self.classes_.shape[0]
        if n_classes == 1:
            # Only one class was seen, so there is no threshold model.
            return np.ones((X.shape[0], 1))

        # greater[i] = Pr(y > V_i). The fitted models are keyed by threshold
        # POSITION, so they must be looked up by position and not by the
        # class label itself (labels such as 1..7 would otherwise select the
        # wrong model and finally raise a KeyError).
        greater = [self.clfs_[i].predict_proba(X)[:, 1]
                   for i in range(n_classes - 1)]

        # V1 = 1 - Pr(y > V1)
        columns = [1 - greater[0]]
        # Vi = Pr(y > Vi-1) - Pr(y > Vi)
        columns.extend(greater[i - 1] - greater[i]
                       for i in range(1, n_classes - 1))
        # Vk = Pr(y > Vk-1)
        columns.append(greater[-1])
        return np.vstack(columns).T

    def predict(self, X):
        """Return the most probable class label for every row of ``X``."""
        # predict_proba performs the fitted-check and input validation.
        best = np.argmax(self.predict_proba(X), axis=1)
        # Map column positions back to class labels so that the result is
        # comparable with ``y`` (ClassifierMixin.score relies on this).
        return self.classes_[best]
在 David Diaz、白皮書、上面的 Kartik 以及其他在 Medium 上有鏈接並已在自述文件中註明出處的工作的基礎上,我正在開發一個基於 sklearn 框架的 OrdinalClassifier,它可以與 sklearn 的管道、評分和交叉驗證配合使用。
OC 與標準的非序數多類(mc)分類相比表現非常好,並且可以更好地控制對正類的優化(例如在低 < 中 < 高的糖尿病疾病進展分類中,“高”這一類的精確度/召回率)。它支持任何支持 predict_proba 的 sklearn 分類器。交叉驗證分數顯示在 repo 上。
https://github.com/leeprevost/OrdinalClassifier
目前,我不會稱之為多標籤。
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.