
Difference in the summary between sklearn and statsmodels OLS

The goal is to find and fix why the summary produced by my sklearn-based implementation does not match the results of statsmodels OLS. The only values that match are the beta coefficients.

import pandas as pd
import numpy as np
from statsmodels.regression.linear_model import OLS
from sklearn import linear_model
from scipy.stats import t


class LinearRegression(linear_model.LinearRegression):
    """
    LinearRegression class after sklearn's, but calculate t-statistics
    and p-values for model coefficients (betas).
    Additional attributes available after .fit()
    are `t` and `p` which are of the shape (y.shape[1], X.shape[1])
    which is (n_features, n_coefs)
    This class sets the intercept to 0 by default, since usually we include it
    in X.
    """

    def __init__(self, *args, **kwargs):
        if "fit_intercept" not in kwargs:
            kwargs['fit_intercept'] = False
        super(LinearRegression, self).__init__(*args, **kwargs)

    def fit(self, X, y):
        self = super(LinearRegression, self).fit(X, y)
       
        # standard errors of the coefficients
        uhat = y - (X @ self.coef_).ravel()   # residuals
        k = np.shape(X)[1]                    # number of estimated parameters
        s2 = (uhat.T @ uhat) / y.shape[0]     # error variance (note: divides by N, not N - k)
        var = s2 * np.linalg.inv(X.T @ X)
        self.se = np.sqrt(np.diag(var))

        # T-Stat
        self.t_stats = self.coef_/self.se

        # p-values
        self.df = y.shape[0] - k  # degrees of freedom: N minus number of parameters
        self.p_values = 2 * t.sf(np.abs(self.t_stats), self.df)

        # Rsquared
        tss = (y-np.mean(y)).T@(y-np.mean(y))
        rss = uhat.T@uhat
        self.rsq = 1 - rss/tss

        self.summary = pd.DataFrame({
            "beta": np.ravel(self.coef_),
            "se": np.ravel(self.se),
            "t_stats": np.ravel(self.t_stats),
            "p_values": np.ravel(self.p_values),
            })

        return self

Running the class on a toy dataset, we can test the results:

import statsmodels.api as sm
data = sm.datasets.longley.load_pandas()

# Estimate statsmodels OLS
model = OLS(endog=data.endog,exog=data.exog).fit()

# Estimate Sklearn with report like statsmodels OLS
model2 = LinearRegression(fit_intercept=False).fit(data.exog, np.array(data.endog))
model2.summary

I suspect one of my formulas does not match the correct one.
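For reference, statsmodels exposes every column of the report directly on the fitted results, which makes it easy to see exactly which quantities disagree (params, bse, tvalues and pvalues are standard OLSResults attributes):

print(model.params)    # beta coefficients -- these match
print(model.bse)       # standard errors   -- these do not
print(model.tvalues)   # t-statistics
print(model.pvalues)   # p-values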

Here is a class you can use to obtain a LinearRegression model summary with scikit-learn:

import numpy as np
import pandas as pd
from scipy.stats import t
from sklearn import linear_model

class LinearRegression(linear_model.LinearRegression):

    def __init__(self, *args, **kwargs):
        # fit an intercept by default; pass fit_intercept=False to disable it
        self.with_intercept = kwargs.get("fit_intercept", True)
        kwargs["fit_intercept"] = self.with_intercept
        super(LinearRegression, self).__init__(*args, **kwargs)

    def fit(self, X, y):
        self = super(LinearRegression, self).fit(X, y)

        y_pred = self.predict(X)
        residuals = y - y_pred
        residual_sum_of_squares = residuals.T @ residuals

        if self.with_intercept:
            self.coefficients = [self.intercept_] + list(self.coef_)

            p = len(X.columns) + 1  # plus one because an intercept is added
            new_X = np.empty(shape=(len(X), p), dtype=float)
            new_X[:, 0] = 1
            new_X[:, 1:p] = X.values
            
        else:
            self.coefficients = self.coef_

            p = len(X.columns)
            new_X = X.values

        # standard errors
        sigma_squared_hat = residual_sum_of_squares / (len(X) - p)
        var_beta_hat = np.linalg.inv(new_X.T @ new_X) * sigma_squared_hat
        self.std_errors = [var_beta_hat[p_, p_] ** 0.5 for p_ in range(p)]

        # t_values
        self.t_values = np.array(self.coefficients)/self.std_errors

        # p values
        freedom_degree = y.shape[0] - p  # N - p; p already counts the intercept when present
        self.p_values = 2 * t.sf(np.abs(self.t_values), freedom_degree)

        # summary
        self.summary = pd.DataFrame()
        self.summary['Coefficients'] = self.coefficients
        self.summary['Standard errors'] = self.std_errors
        self.summary['t values'] = self.t_values
        self.summary['p values'] = self.p_values
    
        return self
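The main reason the question's numbers disagree with statsmodels is the variance estimate: statsmodels divides the residual sum of squares by the degrees of freedom N - p (the unbiased estimator), while the question's code divides by N, which shrinks every standard error. A minimal sketch of the two choices (the function name and the unbiased flag are just for illustration):

import numpy as np

def ols_standard_errors(X, residuals, unbiased=True):
    # standard errors of the OLS coefficients from the design matrix and residuals
    n, p = X.shape
    rss = residuals @ residuals
    s2 = rss / (n - p) if unbiased else rss / n   # statsmodels uses n - p
    var_beta = s2 * np.linalg.inv(X.T @ X)
    return np.sqrt(np.diag(var_beta))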

This is how you can try the class and fit a model:

import statsmodels.api as sm

data = sm.datasets.longley.load_pandas()

X = pd.DataFrame(data.exog)
y = np.array(data.endog)

# in case you want to ignore the intercept include fit_intercept=False as a parameter
model_linear_regression = LinearRegression().fit(X, y)
model_linear_regression.summary

Result:

    Coefficients    Standard errors   t values   p values
0   -3.482259e+06   890420.379        -3.911     0.003
1   1.506190e+01    84.915            0.177      0.863
2   -3.580000e-02   0.033             -1.070     0.310
3   -2.020200e+00   0.488             -4.136     0.002
4   -1.033200e+00   0.214             -4.822     0.001
5   -5.110000e-02   0.226             -0.226     0.826
6   1.829151e+03    455.478           4.016      0.002

You can train the equivalent statsmodels OLS model, adding the intercept/constant if needed, as follows:

with_intercept = True
if with_intercept:
    p = len(X.columns) + 1  # plus one because an intercept is added
    new_X = np.empty(shape=(len(X), p), dtype=float)
    new_X[:, 0] = 1
    new_X[:, 1:p] = X.values
    model_statsmodel = sm.OLS(endog=y, exog=new_X).fit()
else:
    model_statsmodel = sm.OLS(endog=y, exog=X).fit()
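Equivalently, statsmodels provides a helper that prepends the constant column for you:

model_statsmodel = sm.OLS(endog=y, exog=sm.add_constant(X)).fit()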

To finish, you can verify the results are the same this way:

assert np.allclose(np.round(model_statsmodel.params, 3), np.round(model_linear_regression.coefficients, 3))
assert np.allclose(np.round(model_statsmodel.bse, 3), np.round(model_linear_regression.std_errors, 3))
assert np.allclose(np.round(model_statsmodel.tvalues, 3), np.round(model_linear_regression.t_values, 3))
# with the degrees of freedom computed as N - p, the p values match as well
assert np.allclose(np.round(model_statsmodel.pvalues, 3), np.round(model_linear_regression.p_values, 3))
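You can also compare against the full statsmodels report side by side:

print(model_statsmodel.summary())
print(model_linear_regression.summary)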

Hope it is helpful :)
