简体   繁体   中英

R2 score is coming 0 for test and train data

I am trying to implement a lasso model for house pricing, but its predicting 0.00 for r2_score for test and train data. Can anyone please help me where I am going wrong.

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

train = pd.read_csv(r"C:\train - Copy.csv")
test = pd.read_csv(r"C:\test.csv")

# CHECKING SHAPE OF THE DATA
print("Train Data shape", train.shape,"\n Test Data shape",test.shape)

# Save the 'Id' column
train_ID = train['id']
test_ID = test['id']

# Droping COLUMNS WHICH HAS NO IMPACT ON DATA
train = train.drop(['id', 'thumbnail_url'], axis=1)
test = test.drop(['id', 'thumbnail_url'], axis=1)

# Check data size after dropping no impact variables
print("\nThe train data size after dropping features is : {} ".format(train.shape))
print("The test data size after dropping featurea is : {} ".format(test.shape))

# Checking Categorical Data
C_data = train.select_dtypes(include=['object']).columns
print("Categorical Data", C_data)

# Checking Numerical Data
N_data = train.select_dtypes(include=['int64', 'float64']).columns
print("Numerical Data", N_data)

# Combining Datasets
ntrain = train.shape[0]
ntest = test.shape[0]
#y_train = train.log_price.values
y = train.log_price.values

print(ntrain)
print(ntest)
print(y)

all_data = pd.concat((train, test),sort='true').reset_index(drop=True)
print(all_data.shape)
all_data = all_data.drop(['log_price'], axis=1)
print(all_data.shape)


# Find Missing Ratio of Dataset
null_values = all_data.isnull().sum()
# print(null_values)

# IMPUTING NULL VALUES
all_data = all_data.dropna(subset=['host_since'])
all_data['bathrooms'] = all_data['bathrooms'].fillna(all_data['bathrooms'].mean())
all_data['bedrooms'] = all_data['bedrooms'].fillna(all_data['bedrooms'].mean())
all_data['beds'] = all_data['beds'].fillna(all_data['beds'].mean())
all_data['review_scores_rating'] = all_data['review_scores_rating'].fillna(all_data['review_scores_rating'].mean())
all_data['host_response_rate'] = all_data['host_response_rate'].fillna('None')
all_data['neighbourhood'] = all_data['neighbourhood'].fillna('None')
all_data['host_has_profile_pic'] = all_data['host_has_profile_pic'].fillna('f')
all_data['host_identity_verified'] = all_data['host_identity_verified'].fillna('f')
all_data['description'] = all_data['description'].fillna('None')
all_data['first_review'] = all_data['first_review'].fillna('None')
all_data['last_review'] = all_data['last_review'].fillna('None')
all_data['name'] = all_data['name'].fillna('None')
all_data['zipcode'] = all_data['zipcode'].fillna('None')

# Check if Missing values left
post_null_values = all_data.isnull().sum().sum()
print("post_null_values\n", post_null_values)

print("-----------------------------------------------------------------------------------------------")

# apply LabelEncoder to categorical features
from sklearn.preprocessing import LabelEncoder
cols = ('property_type', 'room_type', 'amenities', 'bed_type',
       'cancellation_policy', 'city', 'description', 'first_review',
       'host_has_profile_pic', 'host_identity_verified', 'host_response_rate',
       'host_since', 'instant_bookable', 'last_review', 'name',
       'neighbourhood', 'zipcode')
for c in cols:
    lbl = LabelEncoder()
    lbl.fit(list(all_data[c].values))
    all_data[c] = lbl.transform(list(all_data[c].values))

# creating matrices for sklearn:
X = all_data[:ntrain]
test_values = all_data[ntrain:]

print("X col", X.columns, "X shape", X.shape)

# import train test split
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso


X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

#clf = LinearRegression()
clf = Lasso()

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
y_train_pred = clf.predict(X_train)

from sklearn.metrics import r2_score

print("Train acc: " , r2_score(y_train, y_train_pred))
print("Test acc: ", r2_score(y_test, y_pred))

from sklearn.metrics import mean_squared_error

print("Train acc: " , clf.score(X_train, y_train))
print("Test acc: ", clf.score(X_test, y_test))

Output: Train acc: 0.0001732000413904311 Test acc: 0.00011093390171657003 Train acc: 0.0001732000413904311 Test acc: 0.00011093390171657004

This is not unusual. Your regression is super bad. I would you suggest it to rework it again. And for regression also note that you can have negative r_2_score : How is the R2 value in Scikit learn calculated?

You can also use a dummy to see what is your baseline? How about to predict every y_test value es the mean of all y_train values. Which r_2 you get then?

Some ideas to tune it are: Play with the features(leave some out), pick another regression model and so on

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM