This very simple case-study is designed to get you up-and-running quickly with statsmodels. Starting from raw data, we will show the steps needed to estimate a statistical model and to draw a diagnostic plot. We will only use functions provided by statsmodels or its pandas and patsy dependencies.
After installing statsmodels and its dependencies, we load a few modules and functions:
In [1]: import statsmodels.api as sm
---------------------------------------------------------------------------
ImportError Traceback (most recent call last)
<ipython-input-1-6030a6549dc0> in <module>()
----> 1 import statsmodels.api as sm
/builddir/build/BUILD/statsmodels-0.5.0/statsmodels/api.py in <module>()
10 from .discrete.discrete_model import (Poisson, Logit, Probit, MNLogit,
11 NegativeBinomial)
---> 12 from .tsa import api as tsa
13 from .nonparametric import api as nonparametric
14 import distributions
/builddir/build/BUILD/statsmodels-0.5.0/statsmodels/tsa/api.py in <module>()
----> 1 from .ar_model import AR
2 from .arima_model import ARMA, ARIMA
3 import vector_ar as var
4 from .vector_ar.var_model import VAR
5 from .vector_ar.svar_model import SVAR
/builddir/build/BUILD/statsmodels-0.5.0/statsmodels/tsa/ar_model.py in <module>()
16 from statsmodels.tools.numdiff import (approx_fprime, approx_hess,
17 approx_hess_cs)
---> 18 from statsmodels.tsa.kalmanf.kalmanfilter import KalmanFilter
19 import statsmodels.base.wrapper as wrap
20 from statsmodels.tsa.vector_ar import util
/builddir/build/BUILD/statsmodels-0.5.0/statsmodels/tsa/kalmanf/__init__.py in <module>()
----> 1 from kalmanfilter import KalmanFilter
/builddir/build/BUILD/statsmodels-0.5.0/statsmodels/tsa/kalmanf/kalmanfilter.py in <module>()
30 from numpy.linalg import inv, pinv
31 from statsmodels.tools.tools import chain_dot
---> 32 from . import kalman_loglike
33
34 #Fast filtering and smoothing for multivariate state space models
ImportError: cannot import name kalman_loglike
In [2]: import pandas
In [3]: from patsy import dmatrices
pandas builds on numpy arrays to provide rich data structures and data analysis tools. The pandas.DataFrame function provides labelled arrays of (potentially heterogenous) data, similar to the R “data.frame”. The pandas.read_csv function can be used to convert a comma-separated values file to a DataFrame object.
patsy is a Python library for describing satistical models and building Design Matrices using R-like formulas.
We download the Guerry dataset, a collection of historical data used in support of Andre-Michel Guerry’s 1833 Essay on the Moral Statistics of France. The data set is hosted online in comma-separated values format (CSV) by the Rdatasets repository. We could download the file locally and then load it using read_csv, but pandas takes care of all of this automatically for us:
In [4]: url = "http://vincentarelbundock.github.com/Rdatasets/csv/HistData/Guerry.csv"
#the next two lines are not necessary with a recent version of pandas
In [5]: from urllib2 import urlopen
In [6]: url = urlopen(url)
---------------------------------------------------------------------------
URLError Traceback (most recent call last)
<ipython-input-6-baebfb7a69e3> in <module>()
----> 1 url = urlopen(url)
/usr/lib64/python2.7/urllib2.pyc in urlopen(url, data, timeout)
125 if _opener is None:
126 _opener = build_opener()
--> 127 return _opener.open(url, data, timeout)
128
129 def install_opener(opener):
/usr/lib64/python2.7/urllib2.pyc in open(self, fullurl, data, timeout)
402 req = meth(req)
403
--> 404 response = self._open(req, data)
405
406 # post-process response
/usr/lib64/python2.7/urllib2.pyc in _open(self, req, data)
420 protocol = req.get_type()
421 result = self._call_chain(self.handle_open, protocol, protocol +
--> 422 '_open', req)
423 if result:
424 return result
/usr/lib64/python2.7/urllib2.pyc in _call_chain(self, chain, kind, meth_name, *args)
380 func = getattr(handler, meth_name)
381
--> 382 result = func(*args)
383 if result is not None:
384 return result
/usr/lib64/python2.7/urllib2.pyc in http_open(self, req)
1214
1215 def http_open(self, req):
-> 1216 return self.do_open(httplib.HTTPConnection, req)
1217
1218 http_request = AbstractHTTPHandler.do_request_
/usr/lib64/python2.7/urllib2.pyc in do_open(self, http_class, req)
1184 except socket.error, err: # XXX what error?
1185 h.close()
-> 1186 raise URLError(err)
1187 else:
1188 try:
URLError: <urlopen error [Errno -3] Temporary failure in name resolution>
In [7]: df = pandas.read_csv(url)
---------------------------------------------------------------------------
URLError Traceback (most recent call last)
<ipython-input-7-4107a55c8bef> in <module>()
----> 1 df = pandas.read_csv(url)
/usr/lib64/python2.7/site-packages/pandas/io/parsers.pyc in parser_f(filepath_or_buffer, sep, dialect, compression, doublequote, escapechar, quotechar, quoting, skipinitialspace, lineterminator, header, index_col, names, prefix, skiprows, skipfooter, skip_footer, na_values, na_fvalues, true_values, false_values, delimiter, converters, dtype, usecols, engine, delim_whitespace, as_recarray, na_filter, compact_ints, use_unsigned, low_memory, buffer_lines, warn_bad_lines, error_bad_lines, keep_default_na, thousands, comment, decimal, parse_dates, keep_date_col, dayfirst, date_parser, memory_map, nrows, iterator, chunksize, verbose, encoding, squeeze, mangle_dupe_cols, tupleize_cols, infer_datetime_format)
450 infer_datetime_format=infer_datetime_format)
451
--> 452 return _read(filepath_or_buffer, kwds)
453
454 parser_f.__name__ = name
/usr/lib64/python2.7/site-packages/pandas/io/parsers.pyc in _read(filepath_or_buffer, kwds)
220
221 filepath_or_buffer, _ = get_filepath_or_buffer(filepath_or_buffer,
--> 222 encoding)
223
224 if kwds.get('date_parser', None) is not None:
/usr/lib64/python2.7/site-packages/pandas/io/common.pyc in get_filepath_or_buffer(filepath_or_buffer, encoding)
116
117 if _is_url(filepath_or_buffer):
--> 118 req = _urlopen(str(filepath_or_buffer))
119 return maybe_read_encoded_stream(req, encoding)
120
/usr/lib64/python2.7/urllib2.pyc in urlopen(url, data, timeout)
125 if _opener is None:
126 _opener = build_opener()
--> 127 return _opener.open(url, data, timeout)
128
129 def install_opener(opener):
/usr/lib64/python2.7/urllib2.pyc in open(self, fullurl, data, timeout)
402 req = meth(req)
403
--> 404 response = self._open(req, data)
405
406 # post-process response
/usr/lib64/python2.7/urllib2.pyc in _open(self, req, data)
420 protocol = req.get_type()
421 result = self._call_chain(self.handle_open, protocol, protocol +
--> 422 '_open', req)
423 if result:
424 return result
/usr/lib64/python2.7/urllib2.pyc in _call_chain(self, chain, kind, meth_name, *args)
380 func = getattr(handler, meth_name)
381
--> 382 result = func(*args)
383 if result is not None:
384 return result
/usr/lib64/python2.7/urllib2.pyc in http_open(self, req)
1214
1215 def http_open(self, req):
-> 1216 return self.do_open(httplib.HTTPConnection, req)
1217
1218 http_request = AbstractHTTPHandler.do_request_
/usr/lib64/python2.7/urllib2.pyc in do_open(self, http_class, req)
1184 except socket.error, err: # XXX what error?
1185 h.close()
-> 1186 raise URLError(err)
1187 else:
1188 try:
URLError: <urlopen error [Errno -3] Temporary failure in name resolution>
The Input/Output doc page shows how to import from various other formats.
We select the variables of interest and look at the bottom 5 rows:
In [8]: vars = ['Department', 'Lottery', 'Literacy', 'Wealth', 'Region']
In [9]: df = df[vars]
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-9-97b909d2ba47> in <module>()
----> 1 df = df[vars]
NameError: name 'df' is not defined
In [10]: df[-5:]
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-10-387a2dedaa82> in <module>()
----> 1 df[-5:]
NameError: name 'df' is not defined
Notice that there is one missing observation in the Region column. We eliminate it using a DataFrame method provided by pandas:
In [11]: df = df.dropna()
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-11-c783d194b823> in <module>()
----> 1 df = df.dropna()
NameError: name 'df' is not defined
In [12]: df[-5:]
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-12-387a2dedaa82> in <module>()
----> 1 df[-5:]
NameError: name 'df' is not defined
We want to know whether literacy rates in the 86 French departments are associated with per capita wagers on the Royal Lottery in the 1820s. We need to control for the level of wealth in each department, and we also want to include a series of dummy variables on the right-hand side of our regression equation to control for unobserved heterogeneity due to regional effects. The model is estimated using ordinary least squares regression (OLS).
To fit most of the models covered by statsmodels, you will need to create two design matrices. The first is a matrix of endogenous variable(s) (i.e. dependent, response, regressand, etc.). The second is a matrix of exogenous variable(s) (i.e. independent, predictor, regressor, etc.). The OLS coefficient estimates are calculated as usual:
\hat{\beta} = (X'X)^{-1} X'y
where y is an N \times 1 column of data on lottery wagers per capita (Lottery). X is N \times 7 with an intercept, the Literacy and Wealth variables, and 4 region binary variables.
The patsy module provides a convenient function to prepare design matrices using R-like formulas. You can find more information here: http://patsy.readthedocs.org
We use patsy‘s dmatrices function to create design matrices:
In [13]: y, X = dmatrices('Lottery ~ Literacy + Wealth + Region', data=df, return_type='dataframe')
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-13-b205df6907aa> in <module>()
----> 1 y, X = dmatrices('Lottery ~ Literacy + Wealth + Region', data=df, return_type='dataframe')
NameError: name 'df' is not defined
The resulting matrices/data frames look like this:
In [14]: y[:3]
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-14-319cfe32a629> in <module>()
----> 1 y[:3]
NameError: name 'y' is not defined
In [15]: X[:3]
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-15-a0b703194835> in <module>()
----> 1 X[:3]
NameError: name 'X' is not defined
Notice that dmatrices has
The above behavior can of course be altered. See the patsy doc pages.
Fitting a model in statsmodels typically involves 3 easy steps:
For OLS, this is achieved by:
In [16]: mod = sm.OLS(y, X) # Describe model
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-16-c6e56946df97> in <module>()
----> 1 mod = sm.OLS(y, X) # Describe model
NameError: name 'sm' is not defined
In [17]: res = mod.fit() # Fit model
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-17-6ff53364a7be> in <module>()
----> 1 res = mod.fit() # Fit model
NameError: name 'mod' is not defined
In [18]: print res.summary() # Summarize model
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-18-6ef19fe76b73> in <module>()
----> 1 print res.summary() # Summarize model
NameError: name 'res' is not defined
The res object has many useful attributes. For example, we can extract parameter estimates and r-squared by typing:
In [19]: res.params
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-19-cdb8a4b8734e> in <module>()
----> 1 res.params
NameError: name 'res' is not defined
In [20]: res.rsquared
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-20-375327da2eb6> in <module>()
----> 1 res.rsquared
NameError: name 'res' is not defined
Type dir(res) for a full list of attributes.
For more information and examples, see the Regression doc page
statsmodels allows you to conduct a range of useful regression diagnostics and specification tests. For instance, apply the Rainbow test for linearity (the null hypothesis is that the relationship is properly modelled as linear):
In [21]: sm.stats.linear_rainbow(res)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-21-d01346ddcfd4> in <module>()
----> 1 sm.stats.linear_rainbow(res)
NameError: name 'sm' is not defined
Admittedly, the output produced above is not very verbose, but we know from reading the docstring (also, print sm.stats.linear_rainbow.__doc__) that the first number is an F-statistic and that the second is the p-value.
statsmodels also provides graphics functions. For example, we can draw a plot of partial regression for a set of regressors by:
In [22]: from statsmodels.graphics.regressionplots import plot_partregress
---------------------------------------------------------------------------
ImportError Traceback (most recent call last)
<ipython-input-22-be48e8d8a761> in <module>()
----> 1 from statsmodels.graphics.regressionplots import plot_partregress
/builddir/build/BUILD/statsmodels-0.5.0/statsmodels/graphics/regressionplots.py in <module>()
17 from statsmodels.sandbox.regression.predstd import wls_prediction_std
18 from statsmodels.graphics import utils
---> 19 from statsmodels.nonparametric.smoothers_lowess import lowess
20 from statsmodels.tools.tools import maybe_unwrap_results
21
/builddir/build/BUILD/statsmodels-0.5.0/statsmodels/nonparametric/smoothers_lowess.py in <module>()
9
10 import numpy as np
---> 11 from ._smoothers_lowess import lowess as _lowess
12
13 def lowess(endog, exog, frac=2.0/3.0, it=3, delta=0.0, is_sorted=False,
ImportError: No module named _smoothers_lowess
In [23]: plot_partregress(res)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-23-e05de258c07b> in <module>()
----> 1 plot_partregress(res)
NameError: name 'plot_partregress' is not defined
Congratulations! You’re ready to move on to other topics in the Table of Contents