Hyperparameter Optimisation for Supervised Classification Algorithms in scikit-learn?
I have the NSL-KDD dataset, and the task is to run various classification algorithms from scikit-learn (e.g. the KNN classifier). Whichever classifier I pick, I have to reach an accuracy score above 80%, and I am supposed to get there through hyperparameter optimisation. As of now, if I run the KNN classifier I get an accuracy score of 75.5%. Would hyperparameter optimisation push my accuracy score above 80%?
Data files needed to run the code:
http://www.filedropper.com/kddtest_1
http://www.filedropper.com/kddtrain
http://www.filedropper.com/trainingattacktypes
Main code file:
http://www.filedropper.com/main1_4
import os
from collections import defaultdict
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
dataset_root = 'NSL-KDD-Dataset/NSL-KDD-Dataset'
#train_file = os.path.join(dataset_root, 'KDDTrain+.txt')
#test_file = os.path.join(dataset_root, 'KDDTest+.txt')
# Original KDD dataset feature names obtained from
# http://kdd.ics.uci.edu/databases/kddcup99/kddcup.names
# http://kdd.ics.uci.edu/databases/kddcup99/kddcup99.html
header_names = ['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in',
'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login',
'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate',
'attack_type', 'success_pred']
# Differentiating between nominal, binary, and numeric features
# root_shell is marked as a continuous feature in the kddcup.names
# file, but it is supposed to be a binary feature according to the
# dataset documentation
col_names = np.array(header_names)
nominal_idx = [1, 2, 3]
binary_idx = [6, 11, 13, 14, 20, 21]
numeric_idx = list(set(range(41)).difference(nominal_idx).difference(binary_idx))
nominal_cols = col_names[nominal_idx].tolist()
binary_cols = col_names[binary_idx].tolist()
numeric_cols = col_names[numeric_idx].tolist()
# training_attack_types.txt maps each of the 22 different attacks to 1 of 4 categories
# file obtained from http://kdd.ics.uci.edu/databases/kddcup99/training_attack_types
category = defaultdict(list)
category['benign'].append('normal')
with open('training_attack_types.txt', 'r') as f:
    for line in f.readlines():
        attack, cat = line.strip().split(' ')
        category[cat].append(attack)
attack_mapping = dict((v,k) for k in category for v in category[k])
train_df = pd.read_csv('KDDTrain+.txt', names=header_names)
train_df['attack_category'] = train_df['attack_type'] \
.map(lambda x: attack_mapping[x])
train_df.drop(['success_pred'], axis=1, inplace=True)
test_df = pd.read_csv('KDDTest+.txt', names=header_names)
test_df['attack_category'] = test_df['attack_type'] \
.map(lambda x: attack_mapping[x])
test_df.drop(['success_pred'], axis=1, inplace=True)
train_attack_types = train_df['attack_type'].value_counts()
train_attack_cats = train_df['attack_category'].value_counts()
test_attack_types = test_df['attack_type'].value_counts()
test_attack_cats = test_df['attack_category'].value_counts()
# Render each bar chart as its own figure
train_attack_types.plot(kind='barh', figsize=(20,10), fontsize=20); plt.show()
train_attack_cats.plot(kind='barh', figsize=(20,10), fontsize=30); plt.show()
test_attack_types.plot(kind='barh', figsize=(20,10), fontsize=15); plt.show()
test_attack_cats.plot(kind='barh', figsize=(20,10), fontsize=30); plt.show()
# Let's take a look at the binary features
# By definition, all of these features should have a min of 0.0 and a max of 1.0
# Execute the following commands in the console
train_df[binary_cols].describe().transpose()
# Wait a minute... the su_attempted column has a max value of 2.0?
train_df.groupby(['su_attempted']).size()
# Let's fix this discrepancy and assume that su_attempted=2 -> su_attempted=0
train_df['su_attempted'].replace(2, 0, inplace=True)
test_df['su_attempted'].replace(2, 0, inplace=True)
train_df.groupby(['su_attempted']).size()
# Next, we notice that the num_outbound_cmds column only takes on one value!
train_df.groupby(['num_outbound_cmds']).size()
# Now, that's not a very useful feature - let's drop it from the dataset
train_df.drop('num_outbound_cmds', axis = 1, inplace=True)
test_df.drop('num_outbound_cmds', axis = 1, inplace=True)
numeric_cols.remove('num_outbound_cmds')
"""
Data Preparation
"""
train_Y = train_df['attack_category']
train_x_raw = train_df.drop(['attack_category','attack_type'], axis=1)
test_Y = test_df['attack_category']
test_x_raw = test_df.drop(['attack_category','attack_type'], axis=1)
combined_df_raw = pd.concat([train_x_raw, test_x_raw])
combined_df = pd.get_dummies(combined_df_raw, columns=nominal_cols, drop_first=True)
# .copy() avoids SettingWithCopyWarning when we scale these frames in place later
train_x = combined_df[:len(train_x_raw)].copy()
test_x = combined_df[len(train_x_raw):].copy()
# Store dummy variable feature names
dummy_variables = list(set(train_x)-set(combined_df_raw))
# Execute the following commands in the console
train_x.describe()
train_x['duration'].describe()
# Experimenting with StandardScaler on the single 'duration' feature
from sklearn.preprocessing import StandardScaler
durations = train_x['duration'].values.reshape(-1, 1)
standard_scaler = StandardScaler().fit(durations)
scaled_durations = standard_scaler.transform(durations)
pd.Series(scaled_durations.flatten()).describe()
# Experimenting with MinMaxScaler on the single 'duration' feature
from sklearn.preprocessing import MinMaxScaler
min_max_scaler = MinMaxScaler().fit(durations)
min_max_scaled_durations = min_max_scaler.transform(durations)
pd.Series(min_max_scaled_durations.flatten()).describe()
# Experimenting with RobustScaler on the single 'duration' feature
from sklearn.preprocessing import RobustScaler
robust_scaler = RobustScaler().fit(durations)
robust_scaled_durations = robust_scaler.transform(durations)
pd.Series(robust_scaled_durations.flatten()).describe()
# Let's proceed with StandardScaler - apply it to all the numeric columns
standard_scaler = StandardScaler().fit(train_x[numeric_cols])
train_x[numeric_cols] = \
standard_scaler.transform(train_x[numeric_cols])
test_x[numeric_cols] = \
standard_scaler.transform(test_x[numeric_cols])
train_x.describe()
# Use == for string comparison; 'is' checks object identity and can silently mislabel
train_Y_bin = train_Y.apply(lambda x: 0 if x == 'benign' else 1)
test_Y_bin = test_Y.apply(lambda x: 0 if x == 'benign' else 1)
The KNN classifier code must be optimised. The KNN implementation below is what needs hyperparameter optimisation (a tuning sketch follows the code):
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, zero_one_loss, accuracy_score

knn_clf = KNeighborsClassifier(n_neighbors=3)
knn_clf.fit(train_x, train_Y)
knn_pred = knn_clf.predict(test_x)
print(accuracy_score(test_Y, knn_pred))
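A minimal sketch of the kind of tuning I have in mind, assuming the train_x, train_Y, test_x, test_Y objects prepared above (the grid values are illustrative, not a definitive recipe):

from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Illustrative search space; wider grids take much longer on a dataset this size
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'p': [1, 2],  # 1 = Manhattan distance, 2 = Euclidean distance
}

# 5-fold cross-validated grid search, fitted on the training split only
search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5, n_jobs=-1)
search.fit(train_x, train_Y)

print('Best params:', search.best_params_)
print('Best CV accuracy:', search.best_score_)
print('Test accuracy:', accuracy_score(test_Y, search.best_estimator_.predict(test_x)))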
Well, your model may simply not be a good fit for the dataset you are working with. You can add more data or tune the parameters, as you suggested. You may also want to look into overfitting and underfitting. As for the parameters, see my sample code below.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
df = pd.read_csv('C:\\your_path\\heart.csv')
df.head()
df.info()
df.isnull().sum()
# Univariate analysis: target.
sns.countplot(df['target'])

# Univariate analysis: age.
f = plt.figure(figsize=(20,4))
f.add_subplot(1,2,1)
sns.distplot(df['age'])
f.add_subplot(1,2,2)
sns.boxplot(df['age'])

# Univariate analysis: resting blood pressure (mm Hg), or trestbps.
f = plt.figure(figsize=(20,4))
f.add_subplot(1,2,1)
sns.distplot(df['trestbps'])
f.add_subplot(1,2,2)
sns.boxplot(df['trestbps'])
# Create the KNN object.
# All parameters not specified are set to their defaults.
knn = KNeighborsClassifier()

# Create the x and y variables.
x = df.drop(columns=['target'])
y = df['target']

# Split the data into training and testing sets.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=4)

# Train the model.
knn.fit(x_train, y_train)

# Predict on the test data set.
y_pred = knn.predict(x_test)

# Checking the performance of our model with a classification report.
print(classification_report(y_test, y_pred))

# Checking the performance of our model with the ROC AUC score.
roc_auc_score(y_test, y_pred)
Result:
precision recall f1-score support
0 0.91 0.84 0.87 25
1 0.89 0.94 0.92 36
accuracy 0.90 61
macro avg 0.90 0.89 0.90 61
weighted avg 0.90 0.90 0.90 61
Performance is decent, over 90%. But let's try to improve the model's performance with hyperparameter tuning.
# List the hyperparameters that we want to tune.
leaf_size = list(range(1,50))
n_neighbors = list(range(1,30))
p = [1,2]

# Convert to a dictionary.
hyperparameters = dict(leaf_size=leaf_size, n_neighbors=n_neighbors, p=p)

# Create a new KNN object.
knn_2 = KNeighborsClassifier()

# Use grid search with 10-fold cross-validation.
clf = GridSearchCV(knn_2, hyperparameters, cv=10)

# Fit the model.
best_model = clf.fit(x, y)

# Print the values of the best hyperparameters.
print('Best leaf_size:', best_model.best_estimator_.get_params()['leaf_size'])
print('Best p:', best_model.best_estimator_.get_params()['p'])
print('Best n_neighbors:', best_model.best_estimator_.get_params()['n_neighbors'])
Result:
Best leaf_size: 1
Best p: 1
Best n_neighbors: 7
Now, let's use what we learned above, make a small tweak, and re-run the process...
# Train your model using all the data and the best known parameters.
# Instantiate the model with the best parameters found above.
knn = KNeighborsClassifier(n_neighbors=7, leaf_size=1, p=1, weights='uniform')

# Fit with x and y, not x_train and y_train:
# even if we use a train/test split, we should train on x and y before making
# predictions on new data, otherwise we throw away potentially valuable data
# we could learn from.
knn.fit(x, y)

# Predict on the held-out test split.
y_pred = knn.predict(x_test)

# Checking the performance of our model with a classification report.
print(classification_report(y_test, y_pred))

# Checking the performance of our model with the ROC AUC score.
roc_auc_score(y_test, y_pred)
Result:
precision recall f1-score support
0 0.91 0.84 0.87 25
1 0.89 0.94 0.92 36
accuracy 0.90 61
macro avg 0.90 0.89 0.90 61
weighted avg 0.90 0.90 0.90 61
In this case it's the same result; fiddling with the hyperparameters made no difference at all. In other cases there might be a slight gain in performance: 5%, 10%, or whatever. So the takeaway is that KNN performs well on my particular dataset, but apparently it isn't giving good results on yours, and that's perfectly fine. Just pick a different model to test.
# data source:
# https://raw.githubusercontent.com/adiptamartulandi/KNN-and-Tuning-Hyperparameters/master/heart.csv
I'll leave you with one final thought. You can automatically loop through multiple classifiers and look at the results of each, then pick the top one or two and run with those.
import numpy as np
import pandas as pd
# Load data from UCI dataset repo
bank_note_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00267/data_banknote_authentication.txt'
data = np.loadtxt(bank_note_url, delimiter=',')
data = pd.DataFrame(data)
# Add column names
clean_columns = ['variance_of_wavelet', 'skewness_of_wavelet',
'curtosis_of_wavelet', 'entropy_of_wavelet',
'class']
data.columns = clean_columns
data.head()
# Separate the features and the target for the sklearn APIs
# (a 1-D target avoids DataConversionWarning about column-vector y)
X = data.drop('class', axis=1)
y = data['class'].astype('int')
# Specify the design matrix and the target vector for yellowbrick as arrays
design_matrix = X.values
target_vector = y.values.flatten()
X.head()
y.head()
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
numeric_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='median')),
('scaler', StandardScaler())])
categorical_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
('onehot', OneHotEncoder(handle_unknown='ignore'))])
from sklearn.model_selection import train_test_split
# Stratified sampling based on the distribution of the target vector, y
X_train, X_test, y_train, y_test = train_test_split(X, y,
stratify=y,
test_size=0.20,
random_state=30)
numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X_train.select_dtypes(include=['object']).columns
from sklearn.compose import ColumnTransformer
preprocessor = ColumnTransformer(
transformers=[
('num', numeric_transformer, numeric_features),
('cat', categorical_transformer, categorical_features)])
from sklearn.ensemble import RandomForestClassifier
rf = Pipeline(steps=[('preprocessor', preprocessor),
('classifier', RandomForestClassifier())])
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
classifiers = [
KNeighborsClassifier(3),
SVC(kernel="rbf", C=0.025, probability=True),
NuSVC(probability=True),
DecisionTreeClassifier(),
RandomForestClassifier(),
AdaBoostClassifier(),
GradientBoostingClassifier()
]
for classifier in classifiers:
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', classifier)])
    pipe.fit(X_train, y_train)
    print(classifier)
    print("model score: %.3f" % pipe.score(X_test, y_test))
# Hyperparameter grid for the random-forest pipeline defined above;
# the 'classifier__' prefix routes each parameter to the pipeline's
# 'classifier' step (see the sketch below).
param_grid = {
    'classifier__n_estimators': [200, 500],
    'classifier__max_features': ['auto', 'sqrt', 'log2'],
    'classifier__max_depth': [4, 5, 6, 7, 8],
    'classifier__criterion': ['gini', 'entropy']}
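The grid above is never actually fitted in this snippet. A minimal sketch of wiring it into GridSearchCV with the rf pipeline from earlier (the cv and n_jobs values are illustrative choices):

from sklearn.model_selection import GridSearchCV

# Cross-validated grid search over the random-forest pipeline
grid_search = GridSearchCV(rf, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

print('Best params:', grid_search.best_params_)
print('Best CV score: %.3f' % grid_search.best_score_)
print('Test score: %.3f' % grid_search.score(X_test, y_test))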
Result:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
metric_params=None, n_jobs=None, n_neighbors=3, p=2,
weights='uniform')
model score: 1.000
SVC(C=0.025, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
verbose=False)
model score: 0.967
NuSVC(break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
max_iter=-1, nu=0.5, probability=True, random_state=None, shrinking=True,
tol=0.001, verbose=False)
model score: 0.971
DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
max_depth=None, max_features=None, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, presort='deprecated',
random_state=None, splitter='best')
model score: 0.978
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False)
model score: 0.993
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
n_estimators=50, random_state=None)
model score: 0.996
GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
learning_rate=0.1, loss='deviance', max_depth=3,
max_features=None, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_iter_no_change=None, presort='deprecated',
random_state=None, subsample=1.0, tol=0.0001,
validation_fraction=0.1, verbose=0,
warm_start=False)
model score: 0.993