[英]Why does KNeighborsClassifier always predict the same number?
Why does knn always predict the same number?为什么 knn 总是预测相同的数字? How can I solve this?
我该如何解决这个问题? The dataset is here .
数据集在这里。
Code:代码:
import numpy as np
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import scipy.io
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset, DataLoader
from sklearn import preprocessing
import torch
import numpy as np
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
def load_mat_data(path):
    """Load a MATLAB ``.mat`` dataset and standardize its features.

    Args:
        path: Location of the ``.mat`` file. The file must contain the
            variables ``data`` (features) and ``class`` (labels).

    Returns:
        A tuple ``(x, standardizer, y)``: the standardized features, the
        fitted ``StandardScaler``, and the labels as stored in the file.
    """
    # Read the file named by the argument rather than a module-level global,
    # so the function works for whatever path the caller passes in.
    mat = scipy.io.loadmat(path)
    x, y = mat['data'], mat['class']
    x = x.astype('float32')
    # Standardize values: the scaler is fitted on the full matrix and then
    # applied to it.
    standardizer = preprocessing.StandardScaler()
    x = standardizer.fit_transform(x)
    return x, standardizer, y
def numpyToTensor(x):
    """Convert a NumPy array into a torch tensor.

    Args:
        x: NumPy array to convert.

    Returns:
        The tensor produced by ``torch.from_numpy(x)``.
    """
    return torch.from_numpy(x)
class DataBuilder(Dataset):
    """Dataset exposing the features and labels of a ``.mat`` file as tensors."""

    def __init__(self, path):
        """Load the dataset stored at ``path``.

        Args:
            path: Location of the ``.mat`` file, forwarded to
                ``load_mat_data``.
        """
        # Forward the constructor argument instead of relying on a global,
        # so each instance loads the file it was asked to load.
        self.x, self.standardizer, self.y = load_mat_data(path)
        self.x = numpyToTensor(self.x)
        # Number of samples, taken from the first dimension of the features.
        self.len = self.x.shape[0]
        self.y = numpyToTensor(self.y)

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        return (self.x[index], self.y[index])

    def __len__(self):
        """Return the number of samples recorded at construction time."""
        return self.len
datasets = ['/home/katerina/Desktop/datasets/GSE75110.mat']

# Keep the loop variable named DATA_PATH: helper code in this file may read
# it as a module-level global.
for DATA_PATH in datasets:
    print(DATA_PATH)
    data_set = DataBuilder(DATA_PATH)

    # One slot per sample, filled in by the fold that holds the sample out.
    pred_rpknn = [0] * len(data_set.y)
    kf = KFold(n_splits=10, shuffle=True, random_state=7)

    for train_index, test_index in kf.split(data_set.x):
        x_train = data_set.x[train_index]
        x_test = data_set.x[test_index]
        y_train = data_set.y[train_index]
        y_test = data_set.y[test_index]

        # A fresh classifier per fold, fitted on flattened training labels.
        knn = KNeighborsClassifier(n_neighbors=5)
        knn.fit(x_train, y_train.ravel())

        # Predict the held-out samples and report this fold's accuracy.
        y_pred = knn.predict(x_test)
        print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

        # Store each prediction at the position of its original sample.
        for idx, predicted in zip(test_index, y_pred):
            pred_rpknn[idx] = predicted

    # Accuracy over all samples, using the out-of-fold predictions.
    print("Accuracy:", metrics.accuracy_score(data_set.y, pred_rpknn))
    print(pred_rpknn, data_set.y.reshape(1, -1))
Output:输出:
/home/katerina/Desktop/datasets/GSE75110.mat
Accuracy: 0.2857142857142857
Accuracy: 0.38095238095238093
Accuracy: 0.14285714285714285
Accuracy: 0.4
Accuracy: 0.3
Accuracy: 0.25
Accuracy: 0.3
Accuracy: 0.6
Accuracy: 0.25
Accuracy: 0.45
Accuracy: 0.33497536945812806
[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
I am trying to combine knn with k-fold cross-validation in order to test the whole dataset using 10 folds.我正在尝试将 knn 与 k 折交叉验证结合起来,以便使用 10 折来测试整个数据集。 The problem is that knn always predicts arrays of 3's for each fold.
问题是 knn 在每一折中总是预测出全为 3 的数组。 The classes I want to predict are these:
我想预测的类别是这些:
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2,2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]]
张量([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]]
TL;DR
It has to do with the StandardScaler; change it to a simple normalisation.
这与 StandardScaler 有关,将其更改为简单的归一化即可。
eg例如
from sklearn import preprocessing
...
x = preprocessing.normalize(x)
Explanation:解释:
StandardScaler, as you use it, does the following:按您的用法,StandardScaler 会执行以下操作:
The standard score of a sample `x` is calculated as: z = (x - u) / s where `u` is the mean of the training samples or zero if `with_mean=False`, and `s` is the standard deviation of the training samples or one if `with_std=False`.
That rescales each feature across all samples, whereas you actually want these features to help KNN decide which vector is closer.这会在所有样本上重新缩放每个特征,而您实际上希望这些特征帮助 KNN 判断哪个向量更接近。
With `normalize`, the normalization happens for each vector separately, so it does not interfere and can even help KNN differentiate the vectors.使用 normalize 时,归一化是对每个向量分别进行的,因此它不会造成干扰,甚至有助于 KNN 区分向量。
With KNN, StandardScaler can actually harm your prediction.对于 KNN,StandardScaler 实际上可能会损害您的预测。 It is better to use it with other forms of data.最好将它用于其他形式的数据。
import scipy.io
from torch.utils.data import Dataset
from sklearn import preprocessing
import torch
import numpy as np
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
def load_mat_data(path):
    """Load a MATLAB ``.mat`` dataset and normalize its features.

    Args:
        path: Location of the ``.mat`` file. The file must contain the
            variables ``data`` (features) and ``class`` (labels).

    Returns:
        A tuple ``(x, y)``: the normalized features and the labels as stored
        in the file.
    """
    # Read the file named by the argument rather than a module-level global,
    # so the function works for whatever path the caller passes in.
    mat = scipy.io.loadmat(path)
    x, y = mat['data'], mat['class']
    x = x.astype('float32')
    # Normalize values with scikit-learn's `preprocessing.normalize`
    # (called with its default settings).
    x = preprocessing.normalize(x)
    return x, y
def numpyToTensor(x):
    """Wrap a NumPy array as a torch tensor.

    Args:
        x: NumPy array to convert.

    Returns:
        The result of ``torch.from_numpy(x)``.
    """
    return torch.from_numpy(x)
class DataBuilder(Dataset):
    """Dataset exposing the features and labels of a ``.mat`` file as tensors."""

    def __init__(self, path):
        """Load the dataset stored at ``path``.

        Args:
            path: Location of the ``.mat`` file, forwarded to
                ``load_mat_data``.
        """
        # Forward the constructor argument instead of relying on a global,
        # so each instance loads the file it was asked to load.
        self.x, self.y = load_mat_data(path)
        self.x = numpyToTensor(self.x)
        # Number of samples, taken from the first dimension of the features.
        self.len = self.x.shape[0]
        self.y = numpyToTensor(self.y)

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        return (self.x[index], self.y[index])

    def __len__(self):
        """Return the number of samples recorded at construction time."""
        return self.len
datasets = ['/home/katerina/Desktop/datasets/GSE75110.mat']

# Keep the loop variable named DATA_PATH: helper code in this file may read
# it as a module-level global.
for DATA_PATH in datasets:
    print(DATA_PATH)
    data_set = DataBuilder(DATA_PATH)

    # One slot per sample, filled in by the fold that holds the sample out.
    pred_rpknn = [0] * len(data_set.y)
    kf = KFold(n_splits=10, shuffle=True, random_state=7)

    for train_index, test_index in kf.split(data_set.x):
        x_train = data_set.x[train_index]
        x_test = data_set.x[test_index]
        y_train = data_set.y[train_index]
        y_test = data_set.y[test_index]

        # A fresh classifier per fold, fitted on flattened training labels.
        knn = KNeighborsClassifier(n_neighbors=5)
        knn.fit(x_train, y_train.view(-1))

        # Predict the held-out samples and report this fold's accuracy.
        y_pred = knn.predict(x_test)
        print("Accuracy in loop:", metrics.accuracy_score(y_test, y_pred))

        # Store each prediction at the position of its original sample.
        for idx, predicted in zip(test_index, y_pred):
            pred_rpknn[idx] = predicted

    # Accuracy over all samples, using the out-of-fold predictions.
    print("Accuracy:", metrics.accuracy_score(data_set.y, pred_rpknn))
    print(pred_rpknn, data_set.y.reshape(1, -1))
Accuracy in loop: 1.0
Accuracy in loop: 0.8571428571428571
Accuracy in loop: 0.8571428571428571
Accuracy in loop: 1.0
Accuracy in loop: 0.9
Accuracy in loop: 0.9
Accuracy in loop: 0.95
Accuracy in loop: 1.0
Accuracy in loop: 0.9
Accuracy in loop: 1.0
Accuracy: 0.9359605911330049
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.