corr_pair_scatter

In [1]:
# Reference (loading datasets in Python): http://qiita.com/ksomemo/items/69ae8eec98b795781586
In [2]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
In [3]:
from sklearn import datasets

iris = datasets.load_iris()
# Strip the " (cm)" unit suffix so the column labels stay short.
# Built as a list rather than a lazy `map` object: on Python 3 a map iterator
# is exhausted after one pass, which would leave `columns` empty for any
# later cell that reads it.
columns = [name.replace(" (cm)", "") for name in iris.feature_names[:4]]
iris_data = pd.DataFrame(iris.data, columns=columns)
iris_data["target"] = pd.Series(iris.target)
# Translate every integer class label to its species name in one vectorized
# lookup instead of a per-row Python lambda.
iris_data["target_names"] = np.asarray(iris.target_names)[iris.target]
iris_data.tail()
Out[3]:
sepal length sepal width petal length petal width target target_names
145 6.7 3.0 5.2 2.3 2 virginica
146 6.3 2.5 5.0 1.9 2 virginica
147 6.5 3.0 5.2 2.0 2 virginica
148 6.2 3.4 5.4 2.3 2 virginica
149 5.9 3.0 5.1 1.8 2 virginica
In [7]:
# Keep only the first four columns (the measurements); the label columns
# that follow them are left out.
feature_columns = iris_data.columns[:4]
iris_data_feature = iris_data[feature_columns]
iris_data_feature.tail()
Out[7]:
sepal length sepal width petal length petal width
145 6.7 3.0 5.2 2.3
146 6.3 2.5 5.0 1.9
147 6.5 3.0 5.2 2.0
148 6.2 3.4 5.4 2.3
149 5.9 3.0 5.1 1.8
In [20]:
# Summary statistics (count, mean, std, min, quartiles, max) per species.
by_species = iris_data.groupby("target_names")
by_species.describe()
Out[20]:
petal length petal width sepal length sepal width target
target_names
setosa count 50.000000 50.000000 50.000000 50.000000 50
mean 1.464000 0.244000 5.006000 3.418000 0
std 0.173511 0.107210 0.352490 0.381024 0
min 1.000000 0.100000 4.300000 2.300000 0
25% 1.400000 0.200000 4.800000 3.125000 0
50% 1.500000 0.200000 5.000000 3.400000 0
75% 1.575000 0.300000 5.200000 3.675000 0
max 1.900000 0.600000 5.800000 4.400000 0
versicolor count 50.000000 50.000000 50.000000 50.000000 50
mean 4.260000 1.326000 5.936000 2.770000 1
std 0.469911 0.197753 0.516171 0.313798 0
min 3.000000 1.000000 4.900000 2.000000 1
25% 4.000000 1.200000 5.600000 2.525000 1
50% 4.350000 1.300000 5.900000 2.800000 1
75% 4.600000 1.500000 6.300000 3.000000 1
max 5.100000 1.800000 7.000000 3.400000 1
virginica count 50.000000 50.000000 50.000000 50.000000 50
mean 5.552000 2.026000 6.588000 2.974000 2
std 0.551895 0.274650 0.635880 0.322497 0
min 4.500000 1.400000 4.900000 2.200000 2
25% 5.100000 1.800000 6.225000 2.800000 2
50% 5.550000 2.000000 6.500000 3.000000 2
75% 5.875000 2.300000 6.900000 3.175000 2
max 6.900000 2.500000 7.900000 3.800000 2
In [8]:
# Bar chart of the number of rows per species.
species_counts = iris_data["target_names"].value_counts()
species_counts.plot.bar()
Out[8]:
<matplotlib.axes._subplots.AxesSubplot at 0x118f846d8>
../../../_images/contents_notebooks_pandas_corr_pair_scatter_6_1.png
In [9]:
# Scatter plot of sepal width against sepal length.
iris_data_feature.plot(kind="scatter", x="sepal length", y="sepal width")
Out[9]:
<matplotlib.axes._subplots.AxesSubplot at 0x11900b0b8>
../../../_images/contents_notebooks_pandas_corr_pair_scatter_7_1.png
In [10]:
# Sepal length is drawn against the primary y-axis; sepal width is drawn
# against a secondary y-axis on the same figure.
sepal_length = iris_data_feature["sepal length"]
sepal_width = iris_data_feature["sepal width"]
sepal_length.plot.line()
sepal_width.plot.line(secondary_y=True)

Out[10]:
<matplotlib.axes._subplots.AxesSubplot at 0x1190bb4a8>
../../../_images/contents_notebooks_pandas_corr_pair_scatter_8_1.png
In [11]:
# Plot three features on one axes with pandas' `x_compat` plotting option
# switched on for the duration of the block.
# `plot_params` is reached through `pd.plotting`; the old top-level
# `pd.plot_params` alias is not available in current pandas releases.
with pd.plotting.plot_params.use("x_compat", True):
    iris_data_feature["sepal length"].plot(color="r")
    iris_data_feature["sepal width"].plot(color="g")
    iris_data_feature["petal length"].plot(color="b")
../../../_images/contents_notebooks_pandas_corr_pair_scatter_9_0.png
In [12]:
# One line plot per feature column, arranged in a 2x2 grid.
iris_data_feature.plot(
    subplots=True,
    layout=(2, 2),
    figsize=(16, 8),
)
Out[12]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x1195c12e8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11c4067b8>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x11c4324a8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11c542cf8>]], dtype=object)
../../../_images/contents_notebooks_pandas_corr_pair_scatter_10_1.png
In [13]:
# Pairwise Pearson correlation (the default method, spelled out here so it
# reads in parallel with the Kendall and Spearman variants).
iris_data_feature.corr(method="pearson")
Out[13]:
sepal length sepal width petal length petal width
sepal length 1.000000 -0.109369 0.871754 0.817954
sepal width -0.109369 1.000000 -0.420516 -0.356544
petal length 0.871754 -0.420516 1.000000 0.962757
petal width 0.817954 -0.356544 0.962757 1.000000
In [14]:
# Pairwise Kendall rank correlation between the feature columns.
kendall_corr = iris_data_feature.corr(method="kendall")
kendall_corr
Out[14]:
sepal length sepal width petal length petal width
sepal length 1.000000 -0.072112 0.717624 0.654960
sepal width -0.072112 1.000000 -0.182391 -0.146988
petal length 0.717624 -0.182391 1.000000 0.803014
petal width 0.654960 -0.146988 0.803014 1.000000
In [15]:
# Pairwise Spearman rank correlation between the feature columns.
spearman_corr = iris_data_feature.corr(method="spearman")
spearman_corr
Out[15]:
sepal length sepal width petal length petal width
sepal length 1.000000 -0.159457 0.881386 0.834421
sepal width -0.159457 1.000000 -0.303421 -0.277511
petal length 0.881386 -0.303421 1.000000 0.936003
petal width 0.834421 -0.277511 0.936003 1.000000
In [16]:
# `scatter_matrix` is imported from `pandas.plotting`; the former
# `pandas.tools.plotting` module no longer exists in current pandas releases.
from pandas.plotting import scatter_matrix

# Grid of pairwise scatter plots with a kernel density estimate on the diagonal.
scatter_matrix(iris_data_feature, alpha=0.2, figsize=(6, 6), diagonal="kde")
Out[16]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x11caccf28>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11cd9eb70>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11d3f25f8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11d42ce80>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x11d479630>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11d4bb208>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11d503f28>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11d543c88>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x11d595470>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11d5dccc0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11d72b390>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11d772b00>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x11d7b0d30>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11d7fbd68>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11d937c50>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11d988a20>]], dtype=object)
../../../_images/contents_notebooks_pandas_corr_pair_scatter_14_1.png
In [19]:
# Pairwise relationships between the four measurements, coloured by species.
# `vars` limits the grid to the measurement columns: the integer `target`
# column encodes the same information as the hue, so plotting it as a
# variable only adds uninformative panels.
sns.pairplot(iris_data, hue="target_names", vars=list(iris_data.columns[:4]))

Out[19]:
<seaborn.axisgrid.PairGrid at 0x11cac6550>
../../../_images/contents_notebooks_pandas_corr_pair_scatter_15_1.png
In [18]:
# Simulated random walk: cumulative sum of 150 standard-normal steps, indexed
# by 150 consecutive business days starting at the first one on or after
# 2000-01-01.
# A fixed-seed generator keeps the series identical on every
# Restart & Run All (the unseeded global generator did not).
rng = np.random.RandomState(0)
price = pd.Series(
    rng.randn(150).cumsum(),
    index=pd.date_range("2000-1-1", periods=150, freq="B"),
)