datascientist_mook_vol1¶

データサイエンティスト養成読本vol.1（Python 機械学習）¶

In [1]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

irisデータ読み込み(from pandas repogitory)¶

In [2]:

import os

iris_pandas_url = "https://raw.githubusercontent.com/pydata/pandas/master/doc/data/iris.data"
iris_data_file_name = "iris.data.csv"
if os.path.exists(iris_data_file_name):
    iris = pd.read_csv(iris_data_file_name)
else:
    iris = pd.read_csv(iris_pandas_url)
    iris.to_csv(iris_data_file_name, index=False)
iris.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
SepalLength    150 non-null float64
SepalWidth     150 non-null float64
PetalLength    150 non-null float64
PetalWidth     150 non-null float64
Name           150 non-null object
dtypes: float64(4), object(1)
memory usage: 5.9+ KB

In [3]:

setosa = iris[iris.Name == "Iris-setosa"]
versicolor = iris[iris.Name == "Iris-versicolor"]
virginica = iris[iris.Name == "Iris-virginica"]
len(setosa), len(versicolor), len(virginica),

Out[3]:

(50, 50, 50)

In [4]:

iris.pivot_table(index="Name", aggfunc=np.mean)

Out[4]:

	PetalLength	PetalWidth	SepalLength	SepalWidth
Name
Iris-setosa	1.464	0.244	5.006	3.418
Iris-versicolor	4.260	1.326	5.936	2.770
Iris-virginica	5.552	2.026	6.588	2.974

In [5]:

pd.concat(
    [
        setosa.sum(),
        setosa.mean(), setosa.median(),
        setosa.min(), setosa.max(),
        setosa.var(), setosa.std(),
    ], axis=1,
    keys=["sum", "mean", "median", "min", "max", "var", "std"]
)

Out[5]:

	sum	mean	median	min	max	var	std
Name	Iris-setosaIris-setosaIris-setosaIris-setosaIr...	NaN	NaN	Iris-setosa	Iris-setosa	NaN	NaN
PetalLength	73.2	1.464	1.5	1	1.9	0.030106	0.173511
PetalWidth	12.2	0.244	0.2	0.1	0.6	0.011494	0.107210
SepalLength	250.3	5.006	5.0	4.3	5.8	0.124249	0.352490
SepalWidth	170.9	3.418	3.4	2.3	4.4	0.145180	0.381024

In [6]:

setosa.describe().T

Out[6]:

	count	mean	std	min	25%	50%	75%	max
SepalLength	50.0	5.006	0.352490	4.3	4.800	5.0	5.200	5.8
SepalWidth	50.0	3.418	0.381024	2.3	3.125	3.4	3.675	4.4
PetalLength	50.0	1.464	0.173511	1.0	1.400	1.5	1.575	1.9
PetalWidth	50.0	0.244	0.107210	0.1	0.200	0.2	0.300	0.6

In [7]:

[
    np.arange(0, 1.1, 0.1),
    np.linspace(0, 1, 11)
]

Out[7]:

[array([ 0. ,  0.1,  0.2,  0.3,  0.4,  0.5,  0.6,  0.7,  0.8,  0.9,  1. ]),
 array([ 0. ,  0.1,  0.2,  0.3,  0.4,  0.5,  0.6,  0.7,  0.8,  0.9,  1. ])]

In [8]:

setosa.describe(include="all", percentiles=np.linspace(0.1, 0.9, 9)).T

Out[8]:

	count	unique	top	freq	mean	std	min	10%	20%	30.0%	40%	50%	60%	70%	80%	90%	max
SepalLength	50	NaN	NaN	NaN	5.006	0.35249	4.3	4.59	4.7	4.8	4.96	5	5.1	5.1	5.32	5.41	5.8
SepalWidth	50	NaN	NaN	NaN	3.418	0.381024	2.3	3	3.1	3.2	3.36	3.4	3.5	3.53	3.72	3.9	4.4
PetalLength	50	NaN	NaN	NaN	1.464	0.173511	1	1.3	1.3	1.4	1.4	1.5	1.5	1.5	1.6	1.7	1.9
PetalWidth	50	NaN	NaN	NaN	0.244	0.10721	0.1	0.1	0.2	0.2	0.2	0.2	0.2	0.3	0.3	0.4	0.6
Name	50	1	Iris-setosa	50	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

In [9]:

setosa.corr()

Out[9]:

	SepalLength	SepalWidth	PetalLength	PetalWidth
SepalLength	1.000000	0.746780	0.263874	0.279092
SepalWidth	0.746780	1.000000	0.176695	0.279973
PetalLength	0.263874	0.176695	1.000000	0.306308
PetalWidth	0.279092	0.279973	0.306308	1.000000

In [10]:

setosa.cov()

Out[10]:

	SepalLength	SepalWidth	PetalLength	PetalWidth
SepalLength	0.124249	0.100298	0.016139	0.010547
SepalWidth	0.100298	0.145180	0.011682	0.011437
PetalLength	0.016139	0.011682	0.030106	0.005698
PetalWidth	0.010547	0.011437	0.005698	0.011494

In [11]:

fig, axes = plt.subplots(1, 2, figsize=(8, 4))
iris.SepalLength.plot.hist(ax=axes[0])
axes[0].set_xlabel("SepalLength")

setosa.SepalLength.plot.hist(ax=axes[1])

plt.tight_layout()

../../../_images/contents_notebooks_books_datascientist_mook_vol1_12_0.png

In [12]:

pd.concat(
    {
        "setosa": setosa.SepalLength,
        "versicolor": versicolor.SepalLength,
        "virginica": virginica.SepalLength
    }, axis=1
).plot.box()

Out[12]:

<matplotlib.axes._subplots.AxesSubplot at 0x117375e10>

../../../_images/contents_notebooks_books_datascientist_mook_vol1_13_1.png

In [13]:

data = [setosa.SepalLength, versicolor.SepalLength, virginica.SepalLength]
plt.boxplot(data)
"dummy"

Out[13]:

'dummy'

../../../_images/contents_notebooks_books_datascientist_mook_vol1_14_1.png

In [14]:

setosa.plot.scatter(x="SepalLength",y="SepalWidth")

Out[14]:

<matplotlib.axes._subplots.AxesSubplot at 0x119eff4e0>

../../../_images/contents_notebooks_books_datascientist_mook_vol1_15_1.png

In [15]:

setosa.corr().ix["SepalLength", "SepalWidth"]

Out[15]:

0.74678037326392688

In [16]:

sns.pairplot(setosa)
plt.tight_layout()

../../../_images/contents_notebooks_books_datascientist_mook_vol1_17_0.png

In [17]:

from sklearn import linear_model
np.random.seed(0)

linear_regr = linear_model.LinearRegression()
X = setosa[["SepalLength"]]
Y = setosa[["SepalWidth"]]
linear_regr.fit(X, Y)

plt.scatter(X, Y)
# http://sucrose.hatenablog.com/entry/2013/03/16/162019
px = np.arange(X.min(), X.max(), 0.01)[:, np.newaxis]
py = linear_regr.predict(px)
# print(px.shape, py.shape)

plt.plot(px, py, color="blue", linewidth=3)
plt.xlabel("SepalLength")
plt.ylabel("SepalWidth")

linear_regr.coef_, linear_regr.intercept_, linear_regr.score(X, Y)

Out[17]:

(array([[ 0.80723367]]), array([-0.62301173]), 0.55768092589220974)

../../../_images/contents_notebooks_books_datascientist_mook_vol1_18_1.png

In [18]:

linear_regr = linear_model.LinearRegression()
X = setosa[["SepalLength", "PetalLength", "PetalWidth"]]
Y = setosa[["SepalWidth"]]
linear_regr.fit(X, Y)
linear_regr.coef_, linear_regr.intercept_, linear_regr.score(X, Y)

Out[18]:

(array([[ 0.79303981, -0.09677873,  0.31530122]]),
 array([-0.48720671]),
 0.56492621264573206)

In [20]:

import pyper
import os

image_file_name = "image.png"
r_source = """library(dplyr)
png("{0}", width = 480, height = 480, pointsize = 12, bg = "white", res = NA)
pairs(select(r_data, c(-Name)))
dev.off()
""".format(image_file_name)

r = pyper.R(use_pandas = "True")
r_source_file = "example.R"
r.assign("r_data", iris)
print(r("summary(r_data)"))

with open(r_source_file, "w", encoding="sjis") as f:
    f.write(r_source)
print(r("source(file='{}')".format(r_source_file)))
os.remove(r_source_file)

from IPython.core.display import Image
Image(image_file_name)
#os.remove(image_file_name)

try({summary(r_data)})
  SepalLength      SepalWidth     PetalLength      PetalWidth
 Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100
 1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300
 Median :5.800   Median :3.000   Median :4.350   Median :1.300
 Mean   :5.843   Mean   :3.054   Mean   :3.759   Mean   :1.199
 3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800
 Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500
              Name
 Iris-setosa    :50
 Iris-versicolor:50
 Iris-virginica :50




<pyper.R object at 0x11b7455f8>
try({source(file='example.R')})

 次のパッケージを付け加えます: ‘dplyr’

 以下のオブジェクトは ‘package:stats’ からマスクされています:

     filter, lag

 以下のオブジェクトは ‘package:base’ からマスクされています:

     intersect, setdiff, setequal, union

Out[20]:

../../../_images/contents_notebooks_books_datascientist_mook_vol1_20_1.png

In [25]:

# k-means
def category2int(x):
    category = {
        "Iris-setosa": 0,
        "Iris-versicolor": 1,
        "Iris-virginica": 2,
    }
    return category[x]

In [45]:

from sklearn.cluster import KMeans

X = iris[["SepalLength", "SepalWidth"]]
kmeansCls = KMeans(n_clusters=3)
kmeansCls.fit(X)

[print(a, "\n", getattr(kmeansCls, a)) for a in dir(kmeansCls) if not a.startswith("_") and a.endswith("_")]

Y = iris.Name.map(category2int)
xMin = X.SepalLength.min()
xMax = X.SepalLength.max()
yMin = X.SepalWidth.min()
yMax = X.SepalWidth.max()
xx, yy = np.meshgrid(np.arange(xMin, xMax, 0.01), np.arange(yMin, yMax, 0.01))
Z = kmeansCls.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Reds)
plt.scatter(X.SepalLength, X.SepalWidth, c=np.array(Y), cmap=plt.cm.Blues)

cluster_centers_
 [[ 5.006       3.418     ]
 [ 6.81276596  3.07446809]
 [ 5.77358491  2.69245283]]
inertia_
 37.1237021277
labels_
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 2 1 2 1 2 1 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2
 1 1 1 1 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 1 1 1 1 2 1 1 1 1
 1 1 2 2 1 1 1 1 2 1 2 1 2 1 1 2 2 1 1 1 1 1 2 2 1 1 1 2 1 1 1 2 1 1 1 2 1
 1 2]
n_iter_
 6

Out[45]:

<matplotlib.collections.PathCollection at 0x11cebd710>

../../../_images/contents_notebooks_books_datascientist_mook_vol1_22_2.png

In [47]:

plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Reds)
plt.scatter(X.SepalLength, X.SepalWidth, c=kmeansCls.labels_, cmap=plt.cm.Blues)

Out[47]:

<matplotlib.collections.PathCollection at 0x11afeb940>

../../../_images/contents_notebooks_books_datascientist_mook_vol1_23_1.png

In [42]:

kmeansCls.predict(np.c_[xx.ravel(), yy.ravel()])

Out[42]:

array([1, 1, 1, ..., 0, 0, 0], dtype=int32)

In [ ]:

In [ ]: