{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Prediction (out of sample)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "%matplotlib inline\n", "\n", "from __future__ import print_function\n", "import numpy as np\n", "import statsmodels.api as sm" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Artificial data" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "nsample = 50\n", "sig = 0.25\n", "x1 = np.linspace(0, 20, nsample)\n", "X = np.column_stack((x1, np.sin(x1), (x1-5)**2))\n", "X = sm.add_constant(X)\n", "beta = [5., 0.5, 0.5, -0.02]\n", "y_true = np.dot(X, beta)\n", "y = y_true + sig * np.random.normal(size=nsample)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Estimation " ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "olsmod = sm.OLS(y, X)\n", "olsres = olsmod.fit()\n", "print(olsres.summary())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## In-sample prediction" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "ypred = olsres.predict(X)\n", "print(ypred)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Create a new sample of explanatory variables Xnew, predict and plot" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "x1n = np.linspace(20.5,25, 10)\n", "Xnew = np.column_stack((x1n, np.sin(x1n), (x1n-5)**2))\n", "Xnew = sm.add_constant(Xnew)\n", "ynewpred = olsres.predict(Xnew) # predict out of sample\n", "print(ynewpred)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Plot comparison" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", "\n", "fig, ax = plt.subplots()\n", "ax.plot(x1, y, 'o', label=\"Data\")\n", "ax.plot(x1, y_true, 'b-', label=\"True\")\n", "ax.plot(np.hstack((x1, x1n)), np.hstack((ypred, ynewpred)), 'r', label=\"OLS prediction\")\n", "ax.legend(loc=\"best\");" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Predicting with Formulas" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Using formulas can make both estimation and prediction a lot easier" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from statsmodels.formula.api import ols\n", "\n", "data = {\"x1\" : x1, \"y\" : y}\n", "\n", "res = ols(\"y ~ x1 + np.sin(x1) + I((x1-5)**2)\", data=data).fit()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We use the `I` to indicate use of the Identity transform. Ie., we don't want any expansion magic from using `**2`" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "res.params" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now we only have to pass the single variable and we get the transformed right-hand side variables automatically" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "res.predict(exog=dict(x1=x1n))" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.4.3" } }, "nbformat": 4, "nbformat_minor": 0 }