I've been able to successfully use SVR to predict a value on a data-set with one data entry. However, my data-set has 47 entries per "row" or "entry" or whatever you want to call it. I've uploaded my dataset csv and in my code I have commented out the other 46 entries in the get_data function.
All 47 data entries are relevant and affect x, the player's salary. I am trying to project a player's future salary using only the statistics available for the player before that player's salary is known. However, as I mentioned, many stats determine the salary, and at the moment I am only able to run the prediction on one stat entry.
import csv
import numpy as np
from sklearn.svm import SVR
import matplotlib.pyplot as plt
# Module-level accumulators filled in by get_data(): one salary per CSV row
# and, for every row, one value per selected stat column (stored flat, row
# after row, so the list can be reshaped to (n_rows, n_columns) afterwards).
salary = []
stats = []


def _to_number(text):
    """Convert a CSV cell to int when possible, otherwise to float."""
    try:
        return int(text)
    except ValueError:
        return float(text)


def get_data(filename, columns=(11,), salary_column=6):
    """Append the salary and the selected stat columns of every CSV row.

    The values are appended to the module-level ``salary`` and ``stats``
    lists. ``stats`` stays flat: each row contributes ``len(columns)``
    consecutive values, in the order given by ``columns``.

    Args:
        filename: Path of the CSV file to read (no header row expected).
        columns: Zero-based indices of the stat columns to collect.
            Defaults to column 11 only. To use every stat column listed in
            the original code, pass ``[4, 5, *range(8, 53)]``.
        salary_column: Zero-based index of the salary column.

    Raises:
        IndexError: if a non-empty row is shorter than a requested index.
        ValueError: if a selected cell is not numeric.
    """
    with open(filename, 'r', encoding='utf8', errors='ignore',
              newline='') as csvfile:
        for row in csv.reader(csvfile):
            if not row:
                # csv.reader yields [] for blank lines; nothing to collect.
                continue
            # Convert everything first so a bad cell cannot leave the two
            # lists out of step with each other.
            row_salary = float(row[salary_column])
            row_stats = [_to_number(row[index]) for index in columns]
            salary.append(row_salary)
            stats.extend(row_stats)
# Read the CSV so the module-level `salary` and `stats` lists are filled in.
get_data('dataset.csv')
def predict_salary(stats, salary, x):
    """Fit linear, RBF and polynomial SVR models and predict a salary for x.

    Args:
        stats: Feature values, either flat (row after row) or already shaped
            (n_samples, n_features).
        salary: One target value per sample.
        x: Feature values to predict for: a scalar (single-feature data), a
            sequence of n_features values, or an (n, n_features) array.

    Returns:
        Tuple ``(linear, rbf, poly)`` holding each model's prediction for
        the first sample in ``x``.

    Raises:
        ValueError: if ``salary`` is empty, or if ``stats`` / ``x`` cannot be
            split evenly into rows of n_features values.
    """
    n_samples = len(salary)
    if n_samples == 0:
        raise ValueError('salary must contain at least one value')

    # -1 lets numpy derive the feature count, so any number of stat columns
    # per sample is accepted without float division.
    stats = np.reshape(stats, (n_samples, -1))
    n_features = stats.shape[1]
    # The estimators are fitted on 2-D input, so a bare scalar or a flat
    # feature vector is turned into rows of n_features values before predict.
    x = np.reshape(x, (-1, n_features))

    models = (
        ('Linear model', 'green',
         SVR(kernel='linear', C=1e3, epsilon=0.2, cache_size=7000)),
        ('RBF model', 'red',
         SVR(kernel='rbf', C=1e3, gamma=0.1, cache_size=7000)),
        ('Polynomial model', 'blue',
         SVR(kernel='poly', C=1e3, degree=2, cache_size=7000)),
    )
    for _, _, model in models:
        model.fit(stats, salary)

    if n_features == 1:
        # One feature: draw the data and each fitted curve against it.
        # Sorting by the feature keeps the line plots from zigzagging.
        order = np.argsort(stats[:, 0])
        plt.scatter(stats[:, 0], salary, color='black', label='Data')
        for label, color, model in models:
            plt.plot(stats[order, 0], model.predict(stats[order]),
                     color=color, label=label)
        plt.xlabel('Stats')
        plt.ylabel('Salary')
    else:
        # Several features cannot share one x axis, so compare each model's
        # predictions with the actual salaries instead.
        for label, color, model in models:
            plt.scatter(salary, model.predict(stats),
                        color=color, label=label)
        plt.xlabel('Actual Salary')
        plt.ylabel('Salary Predictions')
    plt.title('Support Vector Regression')
    plt.legend()
    plt.show()

    return tuple(model.predict(x)[0] for _, _, model in models)
# Fit the three SVR models on the loaded data and predict the salary for a
# stat value of 1; the result is a (linear, rbf, poly) tuple.
projected_salary = predict_salary(stats, salary, 1)
print (projected_salary)
And here is the dataset.csv, I've only included 10 rows but what I have goes up to 200 rows of data:
N/A,N/A,player 1,team,3,26,1350000,508500,22,31,32,8,361,3,0.217,0,0,0,0,25,33,48,11,390,13,0.256,0,0,0,0,9,18,22,1,225,4,0.215,0,0,0,0,22,27,37,8,313,9,0.192,0,0,0,0,0
N/A,N/A,player 2,team,3,27,805000,508500,15,26,17,4,176,1,0.242,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,1,1,2,0,13,0,0.231,0,0,0,0,10,10,17,1,168,1,0.201,0,0,0,0,0
N/A,N/A,player 3,team,3,25,2625000,508500,25,17,69,3,460,58,0.26,0,0,0,0,15,28,56,4,454,57,0.226,0,0,0,0,39,48,72,6,611,56,0.25,0,0,0,0,2,1,9,0,22,13,0.368,2,0,0,0,0
N/A,N/A,player 4,team,3,26,3575000,508500,65,81,73,30,601,6,0.243,0,0,0,0,37,46,44,11,497,13,0.258,0,0,0,0,29,36,47,10,411,4,0.221,0,0,0,1,25,36,41,8,335,5,0.265,0,0,0,0,0
N/A,N/A,player 5,team,3,28,1950000,508500,23,34,45,7,324,4,0.255,0,0,0,0,35,45,56,2,509,8,0.28,1,0,0,0,32,29,68,4,492,12,0.281,0,0,0,0,5,14,15,0,144,1,0.25,0,0,0,0,0
N/A,N/A,player 6,team,2.5,30,700000,508500,3,0,7,0,141,0,0.174,0,0,0,0,28,49,38,11,355,0,0.234,0,0,0,0,18,28,22,9,275,0,0.207,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
N/A,N/A,player 7,team,2.5,26,2550000,508500,31,39,67,6,622,17,0.294,1,0,0,0,25,35,57,1,452,19,0.272,0,0,0,0,3,4,13,1,125,1,0.237,0,0,0,0,5,10,17,0,131,0,0.289,0,0,0,0,0
N/A,N/A,player 8,team,3,28,938000,508500,15,28,21,6,166,4,0.284,0,0,0,0,8,10,13,2,113,0,0.146,0,0,0,0,3,4,8,0,79,1,0.213,0,0,0,0,11,19,16,4,197,0,0.189,0,0,0,0,0
N/A,N/A,player 9,team,3,24,2300000,508500,40,49,52,5,466,21,0.277,0,0,0,0,36,43,59,4,552,16,0.227,0,0,0,0,27,26,34,6,332,8,0.261,0,0,0,0,5,5,5,0,61,2,0.291,0,0,0,0,0
N/A,N/A,player 10,team,3,27,3025000,508500,63,70,57,24,548,0,0.245,0,0,0,0,30,31,30,10,234,0,0.304,0,0,0,0,57,76,74,24,478,8,0.312,0,0,0,0,23,17,32,5,213,2,0.263,0,0,0,0,0
It's taken me a couple days to even get this working using 1 of the 47 entries and a couple more trying to figure out how to get it to analyze the entire set for each player. I am a beginner in python and have no statistical background so I am completely lost right now! Any help or guidance is appreciated, thanks!
I would use pandas, since the approach you are taking of commenting out lines is painful, to say the least.
import pandas

# Stat columns to use as features. The question only used column 11, but any
# number of column indices can be listed here.
columns_of_interest = [11, 15, 20, 26]
filename = 'dataset.csv'  # path to the CSV file from the question

# The file has no header row, so the columns are labelled 0, 1, 2, ...
df = pandas.read_csv(filename, header=None)
# Select the feature columns directly; indexing the frame with another frame
# (df[df[...]]) is a boolean-mask lookup, not a column selection.
stats = df[columns_of_interest].values
salary = df[6].values  # the salary is stored in column 6
Then, you could use sklearn's train_test_split. This will enable you to split the data into training and testing sets.
from sklearn.model_selection import train_test_split
# Hold out part of the rows for testing; the models are fitted on the rest.
x_train, x_test, y_train, y_test = train_test_split(stats, salary)
You can then pass these to your prediction function:
# Fit on the training split and predict salaries for the held-out test rows.
pred_lin, pred_rbf, pred_poly = predict_salary(x_train, y_train, x_test)
I've unpacked the result into three variables since the function returns three sets of predictions, one from each SVR model.
Also, I would just change the return statement of the function to:
# New return expression for predict_salary: every prediction, not only the first.
svr_lin.predict(x), svr_rbf.predict(x), svr_poly.predict(x)
This will return the entire set of predictions from the test set.
import csv
import numpy as np
from sklearn.svm import SVR
import matplotlib.pyplot as plt
import pandas
from sklearn.model_selection import train_test_split
def predict_salary(stats, salary, x):
    """Train three SVR variants on (stats, salary) and predict for x.

    A linear, an RBF and a degree-2 polynomial model are fitted on the same
    training data. Their predictions for that training data are shown in a
    scatter plot against the actual salaries before the function returns.

    Args:
        stats: Training features passed to each model's ``fit``.
        salary: Training targets passed to each model's ``fit``.
        x: Samples passed to each model's ``predict``.

    Returns:
        Tuple ``(linear, rbf, poly)`` with each model's predictions for x.
    """
    regressors = [
        ('green', 'Linear model',
         SVR(kernel='linear', C=1e3, epsilon=0.2, cache_size=7000)),
        ('red', 'RBF model',
         SVR(kernel='rbf', C=1e3, gamma=0.1, cache_size=7000)),
        ('blue', 'Polynomial model',
         SVR(kernel='poly', C=1e3, degree=2, cache_size=7000)),
    ]

    # Fit every model before any plotting starts.
    for _, _, regressor in regressors:
        regressor.fit(stats, salary)

    # Predicted vs. actual salary on the training data, one colour per model.
    for colour, caption, regressor in regressors:
        plt.scatter(salary, regressor.predict(stats),
                    color=colour, label=caption)

    plt.xlabel('Actual Salary')
    plt.ylabel('Salary Predictions')
    plt.title('Support Vector Regression')
    plt.legend()
    plt.show()

    return tuple(regressor.predict(x) for _, _, regressor in regressors)
# Input file and the stat columns used as features.
filename = '/Users/carlomazzaferro/Desktop/p.csv'
columns_of_interest = [11, 15, 20, 26]

# The CSV is read without a header row, so columns are addressed by position.
df = pandas.read_csv(filename, header=None)
stats = df.loc[:, columns_of_interest].to_numpy()  # feature matrix
salary = df.loc[:, 6].to_numpy()  # target: the salary in column 6

# Split into training and test parts and show the shape of each piece.
x_train, x_test, y_train, y_test = train_test_split(stats, salary)
print(*(part.shape for part in (x_train, x_test, y_train, y_test)))

# Fit on the training part; predict salaries for the test part.
pred_lin, pred_rbf, pred_poly = predict_salary(x_train, y_train, x_test)
The technical posts on this site are licensed under CC BY-SA 4.0. If you need to reprint, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.