Statsmodels¶

In [1]:

import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:

# Load the "iris" example dataset through seaborn and preview its last rows.
iris = sns.load_dataset("iris")
iris.tail()

Out[2]:

	sepal_length	sepal_width	petal_length	petal_width	species
145	6.7	3.0	5.2	2.3	virginica
146	6.3	2.5	5.0	1.9	virginica
147	6.5	3.0	5.2	2.0	virginica
148	6.2	3.4	5.4	2.3	virginica
149	5.9	3.0	5.1	1.8	virginica

In [3]:

# Draw a grid of pairwise plots for the columns of iris to eyeball relationships
# before fitting anything.
sns.pairplot(iris)

Out[3]:

<seaborn.axisgrid.PairGrid at 0x1053c7358>

../../../_images/contents_notebooks_statsmodels_Statsmodel_3_1.png

In [4]:

# Ordinary least squares: sepal_length explained by sepal_width.
# NOTE(review): exog is passed as-is, without a constant column, so this fit has
# no intercept term (the summary reports a single coefficient and Df Model: 1).
# Confirm that a regression through the origin is intended; if not, wrap exog
# in sm.add_constant(...) before fitting.
model = sm.OLS(endog=iris["sepal_length"], exog=iris["sepal_width"])
fitted = model.fit()

In [5]:

fitted.summary()

Out[5]:

OLS Regression Results
Dep. Variable:	sepal_length	R-squared:	0.957
Model:	OLS	Adj. R-squared:	0.956
Method:	Least Squares	F-statistic:	3277.
Date:	Sun, 03 Apr 2016	Prob (F-statistic):	2.42e-103
Time:	01:31:50	Log-Likelihood:	-243.98
No. Observations:	150	AIC:	490.0
Df Residuals:	149	BIC:	493.0
Df Model:	1
Covariance Type:	nonrobust

	coef	std err	t	P>\|t\|	[95.0% Conf. Int.]
sepal_width	1.8690	0.033	57.246	0.000	1.804 1.934

Omnibus:	18.144	Durbin-Watson:	0.427
Prob(Omnibus):	0.000	Jarque-Bera (JB):	7.909
Skew:	-0.338	Prob(JB):	0.0192
Kurtosis:	2.101	Cond. No.	1.00

In [6]:

fitted.params

Out[6]:

sepal_width    1.869009
dtype: float64

In [7]:

def attrs(obj):
    """Return the public attribute names of ``obj``.

    Args:
        obj: Any Python object (module, class, instance, ...).

    Returns:
        A list of the names reported by ``dir(obj)``, in the same order,
        excluding every name that starts with an underscore.
    """
    public_names = []
    for name in dir(obj):
        if name.startswith("_"):
            # Skip private and dunder names.
            continue
        public_names.append(name)
    return public_names

In [8]:

%pprint

Pretty printing has been turned OFF

model と学習済みモデル¶

In [9]:

attrs(model)

Out[9]:

['data', 'df_model', 'df_resid', 'endog', 'endog_names', 'exog', 'exog_names', 'fit', 'fit_regularized', 'from_formula', 'hessian', 'information', 'initialize', 'k_constant', 'loglike', 'nobs', 'normalized_cov_params', 'pinv_wexog', 'predict', 'rank', 'score', 'weights', 'wendog', 'wexog', 'wexog_singular_values', 'whiten']

In [10]:

attrs(fitted)

Out[10]:

['HC0_se', 'HC1_se', 'HC2_se', 'HC3_se', 'aic', 'bic', 'bse', 'centered_tss', 'compare_f_test', 'compare_lm_test', 'compare_lr_test', 'condition_number', 'conf_int', 'conf_int_el', 'cov_HC0', 'cov_HC1', 'cov_HC2', 'cov_HC3', 'cov_kwds', 'cov_params', 'cov_type', 'df_model', 'df_resid', 'diagn', 'eigenvals', 'el_test', 'ess', 'f_pvalue', 'f_test', 'fittedvalues', 'fvalue', 'get_influence', 'get_robustcov_results', 'initialize', 'k_constant', 'llf', 'load', 'model', 'mse_model', 'mse_resid', 'mse_total', 'nobs', 'normalized_cov_params', 'outlier_test', 'params', 'predict', 'pvalues', 'remove_data', 'resid', 'resid_pearson', 'rsquared', 'rsquared_adj', 'save', 'scale', 'ssr', 'summary', 'summary2', 't_test', 'tvalues', 'uncentered_tss', 'use_t', 'wald_test', 'wresid']

statsmodels api¶

In [11]:

# Public names in statsmodels.api written entirely in upper case.
list(filter(str.isupper, attrs(sm)))

Out[11]:

['GEE', 'GLM', 'GLS', 'GLSAR', 'OLS', 'RLM', 'WLS']

In [12]:

# Public names in statsmodels.api whose first character is upper case
# (a superset of the all-upper-case names, adding CamelCase ones).
[name for name in attrs(sm) if name[0].isupper()]

Out[12]:

['GEE', 'GLM', 'GLS', 'GLSAR', 'Logit', 'MNLogit', 'MixedLM', 'NegativeBinomial', 'NominalGEE', 'OLS', 'OrdinalGEE', 'PHReg', 'Poisson', 'ProbPlot', 'Probit', 'QuantReg', 'RLM', 'WLS']

In [13]:

# Public names in statsmodels.api written entirely in lower case.
list(filter(str.islower, attrs(sm)))

Out[13]:

['add_constant', 'categorical', 'cov_struct', 'datasets', 'distributions', 'emplike', 'families', 'formula', 'genmod', 'graphics', 'iolib', 'load', 'nonparametric', 'qqline', 'qqplot', 'qqplot_2samples', 'regression', 'robust', 'show_versions', 'stats', 'test', 'tools', 'tsa', 'version', 'webdoc']

statsmodels formula api¶

In [14]:

# Public names in statsmodels.formula.api written entirely in upper case.
list(filter(str.isupper, attrs(smf)))

Out[14]:

['GEE', 'GLM', 'GLS', 'GLSAR', 'OLS', 'RLM', 'WLS']

In [15]:

# Public names in statsmodels.formula.api whose first character is upper case.
[name for name in attrs(smf) if name[0].isupper()]

Out[15]:

['GEE', 'GLM', 'GLS', 'GLSAR', 'Logit', 'MNLogit', 'MixedLM', 'NegativeBinomial', 'NominalGEE', 'OLS', 'OrdinalGEE', 'PHReg', 'Poisson', 'Probit', 'QuantReg', 'RLM', 'WLS']

In [16]:

# Public names in statsmodels.formula.api written entirely in lower case.
list(filter(str.islower, attrs(smf)))

Out[16]:

['gee', 'glm', 'gls', 'glsar', 'logit', 'mixedlm', 'mnlogit', 'negativebinomial', 'nominal_gee', 'ols', 'ordinal_gee', 'phreg', 'poisson', 'probit', 'quantreg', 'rlm', 'wls']

statsmodels¶

In [17]:

import statsmodels

In [18]:

# Public names in the top-level statsmodels package written entirely in upper case.
list(filter(str.isupper, attrs(statsmodels)))

Out[18]:

[]

In [19]:

# Public names in the top-level statsmodels package whose first character is upper case.
[name for name in attrs(statsmodels) if name[0].isupper()]

Out[19]:

['CacheWriteWarning', 'ConvergenceWarning', 'InvalidTestWarning', 'IterationLimitWarning', 'NoseWrapper', 'Tester']

In [20]:

# Public names in the top-level statsmodels package written entirely in lower case.
list(filter(str.islower, attrs(statsmodels)))

Out[20]:

['api', 'base', 'compat', 'datasets', 'discrete', 'distributions', 'duration', 'emplike', 'errstate', 'formula', 'genmod', 'graphics', 'info', 'iolib', 'nonparametric', 'print_function', 'regression', 'robust', 'sandbox', 'simplefilter', 'stats', 'test', 'tools', 'tsa', 'version']

In [21]:

# Every matplotlib rcParams key that mentions "text" or "size"; a set, so a key
# matching both substrings still appears only once.
{key for key in plt.rcParams.keys() if "text" in key or "size" in key}

Out[21]:

{'mathtext.bf', 'mathtext.rm', 'text.latex.preamble', 'text.hinting', 'ytick.minor.size', 'figure.titlesize', 'agg.path.chunksize', 'mathtext.fontset', 'mathtext.fallback_to_cm', 'text.hinting_factor', 'mathtext.tt', 'mathtext.cal', 'xtick.labelsize', 'mathtext.it', 'text.color', 'legend.handletextpad', 'legend.fontsize', 'axes.formatter.use_mathtext', 'figure.figsize', 'xtick.minor.size', 'text.dvipnghack', 'text.antialiased', 'errorbar.capsize', 'axes.titlesize', 'text.latex.unicode', 'lines.markersize', 'text.latex.preview', 'xtick.major.size', 'mathtext.default', 'axes.labelsize', 'ytick.labelsize', 'ps.papersize', 'boxplot.flierprops.markersize', 'mathtext.sf', 'ytick.major.size', 'font.size', 'text.usetex'}

In [26]:

from statsmodels.graphics.mosaicplot import mosaic

# Work on a copy so the extra column below does not modify the iris frame
# that earlier cells displayed.
iris_copy = iris.copy()
# Round sepal length to a whole number so it can act as a discrete category
# for the mosaic plot.
iris_copy["sepal_length_round"] = iris_copy.sepal_length.round()

# rcParams reference: http://matplotlib.org/users/customizing.html
# mosaic reference: http://statsmodels.sourceforge.net/stable/generated/statsmodels.graphics.mosaicplot.mosaic.html
# Temporary matplotlib settings, applied only inside the rc_context block below.
plot_context = {
    'figure.figsize': (10, 8),
    'font.size': 18,
    # NOTE(review): 18 mirrors the font sizes used here; check whether a bare
    # number is a meaningful value for font.stretch.
    'font.stretch': 18,
    'figure.dpi': 100,
    'axes.labelsize': 18,
    'axes.titlesize': 20,
    'xtick.labelsize': 15,
    'ytick.labelsize': 20,
    'legend.fontsize': 15,
    'lines.markersize': 15,
}
with plt.rc_context(plot_context):
    # Tiles are split first by rounded sepal length, then by species.
    fig, rects = mosaic(iris_copy, ["sepal_length_round", "species"], gap=0.01)
    # Saved while the context is still active; writes mosaic_cm.png to the
    # current working directory.
    plt.savefig("mosaic_cm.png")

# Show the tile geometry returned by mosaic as a table: one column per
# (sepal_length_round, species) pair, rows 0-3 as described below.
pd.DataFrame(rects)
# 0, 1: originally marked as unknown ("??"). From the displayed values they look
#       like the x and y offsets of each tile (row 0 is constant within a
#       sepal-length group and increases by the preceding widths plus the gap)
#       -- TODO confirm against the mosaic documentation.
# 2: share of the whole horizontal axis taken by the category (tile width)
# 3: share within each horizontal-axis category (tile height)

Out[26]:

	5.0			4.0			6.0			7.0			8.0
	setosa	versicolor	virginica	setosa	versicolor	virginica	setosa	versicolor	virginica	setosa	versicolor	virginica	setosa	versicolor	virginica
0	0.000000	0.000000	0.000000	0.310897	0.310897	0.310897	0.352564	0.352564	0.352564	0.798077	0.798077	0.798077	0.961538	0.961538	0.961538
1	0.000000	0.846445	0.979003	0.000000	0.993421	1.000000	0.000000	0.079141	0.608166	0.000000	0.006579	0.342105	0.000000	0.006579	0.013158
2	0.301282	0.301282	0.301282	0.032051	0.032051	0.032051	0.435897	0.435897	0.435897	0.153846	0.153846	0.153846	0.038462	0.038462	0.038462
3	0.839866	0.125980	0.020997	0.986842	0.000000	0.000000	0.072562	0.522446	0.391834	0.000000	0.328947	0.657895	0.000000	0.000000	0.986842

../../../_images/contents_notebooks_statsmodels_Statsmodel_26_1.png

In [32]:

# Draw the same mosaic twice, side by side: the left panel splits by rounded
# sepal length first and then species, the right panel uses the reverse order.
index = ["sepal_length_round", "species"]
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
for ax, split_order in zip(axes, (index, index[::-1])):
    mosaic(iris_copy, split_order, gap=0.01, ax=ax)
plt.tight_layout()

../../../_images/contents_notebooks_statsmodels_Statsmodel_27_0.png

In [27]:

mosaic?

本サイトでは、サイトの分析と改善のためにGoogleアナリティクスを使用しています。