""" Weighted Least Squares example is extended to look at the meaning of rsquared in WLS, at outliers, compares with RLM and a short bootstrap """ from __future__ import print_function import numpy as np import statsmodels.api as sm import matplotlib.pyplot as plt data = sm.datasets.ccard.load() data.exog = sm.add_constant(data.exog, prepend=False) ols_fit = sm.OLS(data.endog, data.exog).fit() # perhaps the residuals from this fit depend on the square of income incomesq = data.exog[:,2] plt.scatter(incomesq, ols_fit.resid) #@savefig wls_resid_check.png plt.grid() # If we think that the variance is proportional to income**2 # we would want to weight the regression by income # the weights argument in WLS weights the regression by its square root # and since income enters the equation, if we have income/income # it becomes the constant, so we would want to perform # this type of regression without an explicit constant in the design #..data.exog = data.exog[:,:-1] wls_fit = sm.WLS(data.endog, data.exog[:,:-1], weights=1/incomesq).fit() # This however, leads to difficulties in interpreting the post-estimation # statistics. Statsmodels does not yet handle this elegantly, but # the following may be more appropriate # explained sum of squares ess = wls_fit.uncentered_tss - wls_fit.ssr # rsquared rsquared = ess/wls_fit.uncentered_tss # mean squared error of the model mse_model = ess/(wls_fit.df_model + 1) # add back the dof of the constant # f statistic fvalue = mse_model/wls_fit.mse_resid # adjusted r-squared rsquared_adj = 1 -(wls_fit.nobs)/(wls_fit.df_resid)*(1-rsquared) #Trying to figure out what's going on in this example #---------------------------------------------------- #JP: I need to look at this again. Even if I exclude the weight variable # from the regressors and keep the constant in then the reported rsquared # stays small. Below also compared using squared or sqrt of weight variable. # TODO: need to add 45 degree line to graphs wls_fit3 = sm.WLS(data.endog, data.exog[:,(0,1,3,4)], weights=1/incomesq).fit() print(wls_fit3.summary()) print('corrected rsquared') print((wls_fit3.uncentered_tss - wls_fit3.ssr)/wls_fit3.uncentered_tss) plt.figure(); plt.title('WLS dropping heteroscedasticity variable from regressors'); plt.plot(data.endog, wls_fit3.fittedvalues, 'o'); plt.xlim([0,2000]); #@savefig wls_drop_het.png plt.ylim([0,2000]); print('raw correlation of endog and fittedvalues') print(np.corrcoef(data.endog, wls_fit.fittedvalues)) print('raw correlation coefficient of endog and fittedvalues squared') print(np.corrcoef(data.endog, wls_fit.fittedvalues)[0,1]**2) # compare with robust regression, # heteroscedasticity correction downweights the outliers rlm_fit = sm.RLM(data.endog, data.exog).fit() plt.figure(); plt.title('using robust for comparison'); plt.plot(data.endog, rlm_fit.fittedvalues, 'o'); plt.xlim([0,2000]); #@savefig wls_robust_compare.png plt.ylim([0,2000]); #What is going on? 
# A more systematic look at the data
# ----------------------------------------------------

# two helper functions

def getrsq(fitresult):
    '''calculates rsquared, residual, total and explained sums of squares

    Parameters
    ----------
    fitresult : instance of Regression Result class, or tuple of (resid, endog) arrays
        regression residuals and endogenous variable

    Returns
    -------
    rsquared
    residual sum of squares
    (centered) total sum of squares
    explained sum of squares (for centered)
    '''
    if hasattr(fitresult, 'resid') and hasattr(fitresult, 'model'):
        resid = fitresult.resid
        endog = fitresult.model.endog
        nobs = fitresult.nobs
    else:
        resid = fitresult[0]
        endog = fitresult[1]
        nobs = resid.shape[0]

    rss = np.dot(resid, resid)
    tss = np.var(endog)*nobs
    return 1 - rss/tss, rss, tss, tss - rss


def index_trim_outlier(resid, k):
    '''returns indices to residual array with k outliers removed

    Parameters
    ----------
    resid : array_like, 1d
        data vector, usually residuals of a regression
    k : int
        number of outliers to remove

    Returns
    -------
    trimmed_index : array, 1d
        index array with k outliers removed
    outlier_index : array, 1d
        index array of k outliers

    Notes
    -----
    Outliers are defined as the k observations with the largest
    absolute values.
    '''
    sort_index = np.argsort(np.abs(resid))
    # index of non-outliers
    trimmed_index = np.sort(sort_index[:-k])
    outlier_index = np.sort(sort_index[-k:])
    return trimmed_index, outlier_index
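# Quick sanity check of the two helpers (not part of the original example):
# for OLS with a constant, the centered rsquared from getrsq should agree with
# the rsquared attribute of the results instance, and the two index arrays
# returned by index_trim_outlier should partition the observations.
print('getrsq check   :', getrsq(ols_fit)[0], ols_fit.rsquared)
keep_chk, outl_chk = index_trim_outlier(ols_fit.resid, 2)
print('partition check:', len(keep_chk) + len(outl_chk) == len(ols_fit.resid))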
# Comparing estimation results for ols, rlm and wls with and without outliers
# ----------------------------------------------------------------------------

#ols_test_fit = sm.OLS(data.endog, data.exog).fit()
olskeep, olsoutl = index_trim_outlier(ols_fit.resid, 2)
print('ols outliers', olsoutl, ols_fit.resid[olsoutl])
ols_fit_rm2 = sm.OLS(data.endog[olskeep], data.exog[olskeep, :]).fit()
rlm_fit_rm2 = sm.RLM(data.endog[olskeep], data.exog[olskeep, :]).fit()

#weights = 1/incomesq

results = [ols_fit, ols_fit_rm2, rlm_fit, rlm_fit_rm2]
# Note: I think incomesq is already the square of income
for weights in [1/incomesq, 1/incomesq**2, np.sqrt(incomesq)]:
    print('\nComparison OLS and WLS with and without outliers')
    wls_fit0 = sm.WLS(data.endog, data.exog, weights=weights).fit()
    wls_fit_rm2 = sm.WLS(data.endog[olskeep], data.exog[olskeep, :],
                         weights=weights[olskeep]).fit()
    wlskeep, wlsoutl = index_trim_outlier(wls_fit0.resid, 2)
    print('2 outlier candidates and residuals')
    print(wlsoutl, wls_fit0.resid[wlsoutl])
    # redundant because ols and wls outliers are the same:
    ##wls_fit_rm2_ = sm.WLS(data.endog[wlskeep], data.exog[wlskeep,:],
    ##                      weights=1/incomesq[wlskeep]).fit()

    print('outliers ols, wls:', olsoutl, wlsoutl)

    print('rsquared')
    print('ols vs ols rm2', ols_fit.rsquared, ols_fit_rm2.rsquared)
    print('wls vs wls rm2', wls_fit0.rsquared, wls_fit_rm2.rsquared)  #, wls_fit_rm2_.rsquared

    print('compare R2_resid versus R2_wresid')
    print('ols minus 2', getrsq(ols_fit_rm2)[0],)
    print(getrsq((ols_fit_rm2.wresid, ols_fit_rm2.model.wendog))[0])
    print('wls        ', getrsq(wls_fit)[0],)
    print(getrsq((wls_fit.wresid, wls_fit.model.wendog))[0])
    print('wls minus 2', getrsq(wls_fit_rm2)[0])
    # next is the same as wls_fit_rm2.rsquared for cross checking
    print(getrsq((wls_fit_rm2.wresid, wls_fit_rm2.model.wendog))[0])
    #print(getrsq(wls_fit_rm2_)[0])
    #print(getrsq((wls_fit_rm2_.wresid, wls_fit_rm2_.model.wendog))[0])
    results.extend([wls_fit0, wls_fit_rm2])

print('     ols             ols_rm2         rlm             rlm_rm2         '
      'wls (lin)       wls_rm2 (lin)   wls (squ)       wls_rm2 (squ)   '
      'wls (sqrt)      wls_rm2 (sqrt)')
print('Parameter estimates')
print(np.column_stack([r.params for r in results]))
print('R2 original data, next line R2 weighted data')
print(np.column_stack([getattr(r, 'rsquared', None) for r in results]))
print('Standard errors')
print(np.column_stack([getattr(r, 'bse', None) for r in results]))
print('Heteroscedasticity robust standard errors (with ols)')
print('with outliers')
print(np.column_stack([getattr(ols_fit, se, None) for se in
                       ['HC0_se', 'HC1_se', 'HC2_se', 'HC3_se']]))
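# The expected output below also lists heteroscedasticity robust standard
# errors after removing the 2 outliers, but the print statement for that table
# is not in the code above; presumably it was produced from ols_fit_rm2,
# roughly as in this sketch:
print('removing 2 outliers')
print(np.column_stack([getattr(ols_fit_rm2, se, None) for se in
                       ['HC0_se', 'HC1_se', 'HC2_se', 'HC3_se']]))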
#..'''
#..
#..        ols             ols_rm2         rlm             rlm_rm2         wls (lin)       wls_rm2 (lin)   wls (squ)       wls_rm2 (squ)   wls (sqrt)      wls_rm2 (sqrt)
#..Parameter estimates
#..[[  -3.08181404   -5.06103843   -4.98510966   -5.34410309   -2.69418516   -3.1305703    -1.43815462   -1.58893054   -3.57074829   -6.80053364]
#.. [ 234.34702702  115.08753715  129.85391456  109.01433492  158.42697752  128.38182357   60.95113284  100.25000841  254.82166855  103.75834726]
#.. [ -14.99684418   -5.77558429   -6.46204829   -4.77409191   -7.24928987   -7.41228893    6.84943071   -3.34972494  -16.40524256   -4.5924465 ]
#.. [  27.94090839   85.46566835   89.91389709   95.85086459   60.44877369   79.7759146    55.9884469    60.97199734   -3.8085159    84.69170048]
#.. [-237.1465136    39.51639838  -15.50014814   31.39771833 -114.10886935  -40.04207242   -6.41976501  -38.83583228 -260.72084271  117.20540179]]
#..
#..R2 original data, next line R2 weighted data
#..[[ 0.24357792  0.31745994  0.19220308  0.30527648  0.22861236  0.3112333   0.06573949  0.29366904  0.24114325  0.31218669]]
#..[[ 0.24357791  0.31745994  None        None        0.05936888  0.0679071   0.06661848  0.12769654  0.35326686  0.54681225]]
#..
#..-> R2 with weighted data is jumping all over
#..
#..standard errors
#..[[   5.51471653    3.31028758    2.61580069    2.39537089    3.80730631    2.90027255    2.71141739    2.46959477    6.37593755    3.39477842]
#.. [  80.36595035   49.35949263   38.12005692   35.71722666   76.39115431   58.35231328   87.18452039   80.30086861   86.99568216   47.58202096]
#.. [   7.46933695    4.55366113    3.54293763    3.29509357    9.72433732    7.41259156   15.15205888   14.10674821    7.18302629    3.91640711]
#.. [  82.92232357   50.54681754   39.33262384   36.57639175   58.55088753   44.82218676   43.11017757   39.31097542   96.4077482    52.57314209]
#.. [ 199.35166485  122.1287718    94.55866295   88.3741058   139.68749646  106.89445525  115.79258539  105.99258363  239.38105863  130.32619908]]
#..
#..robust standard errors (with ols)
#..with outliers
#..      HC0_se        HC1_se        HC2_se        HC3_se
#..[[   3.30166123    3.42264107    3.4477148     3.60462409]
#.. [  88.86635165   92.12260235   92.08368378   95.48159869]
#.. [   6.94456348    7.19902694    7.19953754    7.47634779]
#.. [  92.18777672   95.56573144   95.67211143   99.31427277]
#.. [ 212.9905298   220.79495237  221.08892661  229.57434782]]
#..
#..removing 2 outliers
#..[[  2.57840843   2.67574088   2.68958007   2.80968452]
#.. [ 36.21720995  37.58437497  37.69555106  39.51362437]
#.. [  3.1156149    3.23322638   3.27353882   3.49104794]
#.. [ 50.09789409  51.98904166  51.89530067  53.79478834]
#.. [ 94.27094886  97.82958699  98.25588281 102.60375381]]
#..
#..
#..'''


# a quick bootstrap analysis
# --------------------------
#
# (I didn't check whether this is fully correct statistically)

#**With OLS on full sample**

nobs, nvar = data.exog.shape
niter = 2000

bootres = np.zeros((niter, nvar*2))

for it in range(niter):
    rind = np.random.randint(nobs, size=nobs)
    endog = data.endog[rind]
    exog = data.exog[rind, :]
    res = sm.OLS(endog, exog).fit()
    bootres[it, :nvar] = res.params
    bootres[it, nvar:] = res.bse

np.set_printoptions(linewidth=200)
print('Bootstrap Results of parameters and parameter standard deviation  OLS')
print('Parameter estimates')
print('median', np.median(bootres[:, :5], 0))
print('mean  ', np.mean(bootres[:, :5], 0))
print('std   ', np.std(bootres[:, :5], 0))
print('Standard deviation of parameter estimates')
print('median', np.median(bootres[:, 5:], 0))
print('mean  ', np.mean(bootres[:, 5:], 0))
print('std   ', np.std(bootres[:, 5:], 0))

plt.figure()
for i in range(4):
    plt.subplot(2, 2, i+1)
    plt.hist(bootres[:, i], 50)
    plt.title('var%d' % i)
#@savefig wls_bootstrap.png
plt.figtext(0.5, 0.935, 'OLS Bootstrap', ha='center', color='black',
            weight='bold', size='large')

#**With WLS on sample with outliers removed**

data_endog = data.endog[olskeep]
data_exog = data.exog[olskeep, :]
incomesq_rm2 = incomesq[olskeep]

nobs, nvar = data_exog.shape
niter = 500  # a bit slow

bootreswls = np.zeros((niter, nvar*2))

for it in range(niter):
    rind = np.random.randint(nobs, size=nobs)
    endog = data_endog[rind]
    exog = data_exog[rind, :]
    # the weights also have to come from the outlier-trimmed sample
    res = sm.WLS(endog, exog, weights=1/incomesq_rm2[rind]).fit()
    bootreswls[it, :nvar] = res.params
    bootreswls[it, nvar:] = res.bse

print('Bootstrap Results of parameters and parameter standard deviation',)
print('WLS removed 2 outliers from sample')
print('Parameter estimates')
print('median', np.median(bootreswls[:, :5], 0))
print('mean  ', np.mean(bootreswls[:, :5], 0))
print('std   ', np.std(bootreswls[:, :5], 0))
print('Standard deviation of parameter estimates')
print('median', np.median(bootreswls[:, 5:], 0))
print('mean  ', np.mean(bootreswls[:, 5:], 0))
print('std   ', np.std(bootreswls[:, 5:], 0))

plt.figure()
for i in range(4):
    plt.subplot(2, 2, i+1)
    plt.hist(bootreswls[:, i], 50)
    plt.title('var%d' % i)
#@savefig wls_bootstrap_rm2.png
plt.figtext(0.5, 0.935, 'WLS rm2 Bootstrap', ha='center', color='black',
            weight='bold', size='large')

#..plt.show()
#..plt.close('all')

#::
#
#    The following are random variables, not fixed by a seed
#
#    Bootstrap Results of parameters and parameter standard deviation
#    OLS
#
#    Parameter estimates
#    median [  -3.26216383  228.52546429  -14.57239967   34.27155426 -227.02816597]
#    mean   [  -2.89855173  234.37139359  -14.98726881   27.96375666 -243.18361746]
#    std    [   3.78704907   97.35797802    9.16316538   94.65031973  221.79444244]
#
#    Standard deviation of parameter estimates
#    median [   5.44701033   81.96921398    7.58642431   80.64906783  200.19167735]
#    mean   [   5.44840542   86.02554883    8.56750041   80.41864084  201.81196849]
#    std    [   1.43425083   29.74806562    4.22063268   19.14973277   55.34848348]
#
#    Bootstrap Results of parameters and parameter standard deviation
#    WLS removed 2 outliers from sample
#
#    Parameter estimates
#    median [ -3.95876112  137.10419042   -9.29131131   88.40265447  -44.21091869]
#    mean   [ -3.67485724  135.42681207   -8.7499235    89.74703443  -46.38622848]
#    std    [  2.96908679   56.36648967    7.03870751   48.51201918  106.92466097]
#
#    Standard deviation of parameter estimates
#    median [  2.89349748  59.19454402   6.70583332  45.40987953  119.05241283]
#    mean   [  2.97600894  60.14540249   6.92102065  45.66077486  121.35519673]
#    std    [  0.55378808  11.77831934   1.69289179   7.4911526   23.72821085]
#
#
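# A quick cross-check of the claims in the conclusion below (not part of the
# original example): compare the conventional and the heteroscedasticity robust
# OLS standard errors with the bootstrap standard deviation of the OLS
# parameter estimates.
print('OLS bse           ', ols_fit.bse)
print('OLS HC3_se        ', ols_fit.HC3_se)
print('OLS bootstrap std ', np.std(bootres[:, :nvar], 0))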
# Conclusion: problem with outliers and possibly heteroscedasticity
# -----------------------------------------------------------------
#
# in the bootstrap results
#
# * bse in OLS underestimates the standard deviation of the parameters
#   compared to the standard deviation in the bootstrap
# * OLS heteroscedasticity corrected standard errors for the original
#   data (above) are close to the bootstrap std
# * using WLS with 2 outliers removed has a relatively good match between
#   the mean or median bse and the std of the parameter estimates in the
#   bootstrap
#
# We could also include rsquared in the bootstrap, and do it also for RLM.
# The problems could also mean that the linearity assumption is violated,
# e.g. try a non-linear transformation of the exog variables that is still
# linear in the parameters.
#
#
# for statsmodels
#
# * In this case rsquared for the original data looks less random/arbitrary.
# * Don't change the definition of rsquared from centered tss to uncentered
#   tss when calculating rsquared in WLS if the original exog contains
#   a constant. The increase in rsquared because of a change in definition
#   will be very misleading.
# * Whether there is a constant in the transformed exog, wexog, or not,
#   might also affect the degrees of freedom calculation, but I haven't
#   checked this. I would guess that df_model should stay the same,
#   but this needs to be verified with a textbook.
# * df_model has to be adjusted if the original data does not have a
#   constant, e.g. when regressing an endog on a single exog variable
#   without a constant. This case might also require a redefinition of
#   the rsquared and f statistic for the regression anova to use the
#   uncentered tss.
#   This can be done through a keyword parameter to model.__init__ or
#   through autodetection with hasconst = (exog.var(0) < 1e-10).any()
#   I'm not sure about fixed effects with a full dummy set but
#   without a constant. In this case autodetection wouldn't work this
#   way. Also, I'm not sure whether a ddof keyword parameter can also
#   handle the hasconst case.
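# A minimal sketch of the constant autodetection mentioned in the last bullet
# (hypothetical helper, not an existing statsmodels function):
def has_constant_column(exog):
    '''True if some column of exog is numerically constant across observations.'''
    exog = np.asarray(exog)
    return bool((exog.var(0) < 1e-10).any())

# the design used above contains an explicit constant column:
print('exog has constant column:', has_constant_column(data.exog))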