ols_table.py 2.0 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69
"""Example: statsmodels.OLS
"""

from statsmodels.datasets.longley import load
import statsmodels.api as sm
from statsmodels.iolib.table import SimpleTable, default_txt_fmt
import numpy as np

data = load()

data_orig = (data.endog.copy(), data.exog.copy())

#.. Note: In this example using zscored/standardized variables has no effect on
#..   regression estimates. Are there no numerical problems?

rescale = 0
#0: no rescaling, 1:demean, 2:standardize, 3:standardize and transform back
rescale_ratio = data.endog.std() / data.exog.std(0)
if rescale > 0:
    # rescaling
    data.endog -= data.endog.mean()
    data.exog -= data.exog.mean(0)
if rescale > 1:
    data.endog /= data.endog.std()
    data.exog /= data.exog.std(0)

#skip because mean has been removed, but dimension is hardcoded in table
data.exog = sm.tools.add_constant(data.exog, prepend=False)


ols_model = sm.OLS(data.endog, data.exog)
ols_results = ols_model.fit()

# the Longley dataset is well known to have high multicollinearity
# one way to find the condition number is as follows


#Find OLS parameters for model with one explanatory variable dropped

resparams = np.nan * np.ones((7, 7))
res = sm.OLS(data.endog, data.exog).fit()
resparams[:, 0] = res.params

indall = range(7)
for i in range(6):
    ind = indall[:]
    del ind[i]
    res = sm.OLS(data.endog, data.exog[:, ind]).fit()
    resparams[ind, i + 1] = res.params

if rescale == 1:
    pass
if rescale == 3:
    resparams[:-1, :] *= rescale_ratio[:, None]

txt_fmt1 = default_txt_fmt
numformat = '%10.4f'
txt_fmt1 = dict(data_fmts=[numformat])
rowstubs = data.names[1:] + ['const']
headers = ['all'] + ['drop %s' % name for name in data.names[1:]]
tabl = SimpleTable(resparams, headers, rowstubs, txt_fmt=txt_fmt1)

nanstring = numformat % np.nan
nn = len(nanstring)
nanrep = ' ' * (nn - 1)
nanrep = nanrep[:nn // 2] + '-' + nanrep[nn // 2:]

print('Longley data - sensitivity to dropping an explanatory variable')
print(str(tabl).replace(nanstring, nanrep))