roc_auc_learning_curve

In [5]:
import numpy as np
from sklearn import metrics
import pandas as pd
from sklearn import datasets
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
In [3]:
y = np.array([1, 1, 2, 2])
pred = np.array([0.1, 0.4, 0.35, 0.8])
fpr, tpr, thresholds = metrics.roc_curve(y, pred, pos_label=2)

In [4]:
fpr
Out[4]:
array([ 0. ,  0.5,  0.5,  1. ])
In [5]:
tpr
Out[5]:
array([ 0.5,  0.5,  1. ,  1. ])
In [6]:
thresholds
Out[6]:
array([ 0.8 ,  0.4 ,  0.35,  0.1 ])
In [7]:
metrics.auc(fpr, tpr)

Out[7]:
0.75
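
`metrics.auc` computes the area with the trapezoidal rule, so the same 0.75 can be reproduced directly with `np.trapz` (a quick cross-check on the toy arrays above):

In [ ]:
# trapezoidal-rule area under the (fpr, tpr) points; should match metrics.auc
np.trapz(tpr, fpr)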
In [8]:
titanic = sns.load_dataset("titanic")
titanic = titanic.select_dtypes(include=["number"])
titanic.age = titanic.age.fillna(titanic.age.mean())
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 6 columns):
survived    891 non-null int64
pclass      891 non-null int64
age         891 non-null float64
sibsp       891 non-null int64
parch       891 non-null int64
fare        891 non-null float64
dtypes: float64(2), int64(4)
memory usage: 41.8 KB
In [10]:
titanic_features = titanic.drop("survived", axis=1)
titanic_target = titanic.survived
In [59]:
from sklearn import linear_model
from sklearn import model_selection
from sklearn import metrics
X_train, X_test, y_train, y_test = model_selection.train_test_split(titanic_features, titanic_target, test_size=0.3)
t_lr = linear_model.LogisticRegression().fit(X_train, y_train)
proba = t_lr.predict_proba(X_test)
predict = t_lr.predict(X_test)

fpr, tpr, thresholds = metrics.roc_curve(y_test, proba[:, 1])
metrics.auc(fpr, tpr), metrics.roc_auc_score(y_test, proba[:, 1])
Out[59]:
(0.72904761904761894, 0.72904761904761894)
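A single train/test split gives a fairly noisy AUC estimate. As a sketch (same default LogisticRegression; 5 folds is an arbitrary choice here), `cross_val_score` can average the AUC over folds:

In [ ]:
# cross-validated AUC; individual fold scores will vary with the split
scores = model_selection.cross_val_score(
    linear_model.LogisticRegression(), titanic_features, titanic_target,
    scoring="roc_auc", cv=5)
scores.mean(), scores.std()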
In [124]:
X_train.shape, X_test.shape
Out[124]:
((623, 5), (268, 5))
In [60]:
t_lr.classes_
Out[60]:
array([0, 1])
In [123]:
(proba[:, 1] > 0.5).sum(), proba.shape
Out[123]:
(59, (268, 2))
In [62]:
fpr.shape, tpr.shape, thresholds.shape
Out[62]:
((109,), (109,), (109,))
In [125]:
_fpr, _tpr, _thresholds = metrics.roc_curve(y_test, proba[:, 1], drop_intermediate=False)
_fpr.shape, _tpr.shape, _thresholds.shape, proba.shape[0] - _fpr.shape[0]
Out[125]:
((237,), (237,), (237,), 31)
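`drop_intermediate` only removes points that lie on straight segments of the plotted curve, so the area should be identical either way; a quick check using the arrays from above:

In [ ]:
# the dropped thresholds must not change the area under the curve
metrics.auc(fpr, tpr), metrics.auc(_fpr, _tpr)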
In [113]:
plt.plot(fpr)
plt.plot(tpr)
plt.plot(thresholds)

Out[113]:
[<matplotlib.lines.Line2D at 0x1161aad68>]
[figure: fpr, tpr, and thresholds plotted against their index]
In [120]:
plt.plot(_fpr)
plt.plot(_tpr)
plt.plot(_thresholds)

Out[120]:
[<matplotlib.lines.Line2D at 0x116310e10>]
[figure: the same three arrays with drop_intermediate=False]
In [305]:
# index-selection idiom resembling what roc_curve uses internally for
# drop_intermediate: keep flagged endpoints plus positions where either array changes
np.where(np.r_[True, False, True, np.logical_or([1, 0, 2], [0, 1, 1])])
Out[305]:
(array([0, 2, 3, 4, 5]),)
In [307]:
# Rebuild the ROC curve by hand: true-positive rate on the vertical axis,
# false-positive rate on the horizontal axis
roc_df = pd.DataFrame(np.c_[y_test, proba[:, 1], proba[:, 1] > 0.5])
roc_df.columns = ["y", "proba", "pred"]

roc_df = roc_df.sort_values("proba", ascending=False).reset_index()
roc_df["proba_diff"] = roc_df["proba"].diff()
#roc_df = roc_df.dropna()
roc_df = roc_df[roc_df["proba_diff"] != 0].reset_index(drop=True)
roc_df["tps"] = roc_df["y"].cumsum()
roc_df["fps"] = 1 + roc_df.index - roc_df["tps"]

roc_df["fps_diff"] = roc_df["fps"].diff().diff().shift(-1).fillna(1)
roc_df["tps_diff"] = roc_df["tps"].diff().diff().shift(-1).fillna(1)
roc_df["ps_diff_0"] = (roc_df["fps_diff"] != 0) | (roc_df["fps_diff"] != 0)

drop_intermediate = True
if drop_intermediate and len(roc_df["fps"]) > 2:
    roc_df = roc_df[roc_df["ps_diff_0"]].reset_index(drop=True)
else:
    roc_df["_fpr"] = _fpr
    roc_df["_tpr"] = _tpr
roc_df["tps_norm"] = roc_df["tps"] / roc_df["tps"].max()
roc_df["fps_norm"] = roc_df["fps"] / roc_df["fps"].max()

roc_df
#roc_df.plot(x="fps_norm", y="tps_norm"); plt.plot([0,1], [0,1])

Out[307]:
index y proba pred proba_diff tps fps fps_diff tps_diff ps_diff_0 tps_norm fps_norm
0 99 1.0 0.932104 1.0 NaN 1.0 0.0 1.0 1.0 True 0.010753 0.000000
1 98 1.0 0.926251 1.0 -0.005853 2.0 0.0 1.0 -1.0 True 0.021505 0.000000
2 93 0.0 0.872601 1.0 -0.033082 2.0 2.0 -1.0 1.0 True 0.021505 0.013889
3 257 1.0 0.714187 1.0 -0.002086 18.0 2.0 1.0 -1.0 True 0.193548 0.013889
4 189 0.0 0.711957 1.0 -0.002230 18.0 3.0 -1.0 1.0 True 0.193548 0.020833
5 104 1.0 0.675602 1.0 -0.022820 22.0 3.0 1.0 -1.0 True 0.236559 0.020833
6 163 0.0 0.661268 1.0 -0.014334 22.0 4.0 -1.0 1.0 True 0.236559 0.027778
7 77 1.0 0.655571 1.0 -0.005697 23.0 4.0 1.0 -1.0 True 0.247312 0.027778
8 222 0.0 0.649628 1.0 -0.000305 23.0 7.0 -1.0 1.0 True 0.247312 0.048611
9 182 1.0 0.647242 1.0 -0.001402 25.0 7.0 1.0 -1.0 True 0.268817 0.048611
10 44 0.0 0.646640 1.0 -0.000602 25.0 8.0 -1.0 1.0 True 0.268817 0.055556
11 61 1.0 0.622325 1.0 -0.001325 27.0 8.0 1.0 -1.0 True 0.290323 0.055556
12 195 0.0 0.611768 1.0 -0.010557 27.0 9.0 -1.0 1.0 True 0.290323 0.062500
13 33 1.0 0.610921 1.0 -0.000847 28.0 9.0 1.0 -1.0 True 0.301075 0.062500
14 186 0.0 0.601489 1.0 -0.009432 28.0 10.0 -1.0 1.0 True 0.301075 0.069444
15 51 1.0 0.579769 1.0 -0.008188 32.0 10.0 1.0 -1.0 True 0.344086 0.069444
16 31 0.0 0.578852 1.0 -0.000917 32.0 11.0 -1.0 1.0 True 0.344086 0.076389
17 229 1.0 0.542517 1.0 -0.001301 36.0 11.0 1.0 -1.0 True 0.387097 0.076389
18 187 0.0 0.542236 1.0 -0.000281 36.0 12.0 -1.0 1.0 True 0.387097 0.083333
19 264 1.0 0.540829 1.0 -0.001406 37.0 12.0 1.0 -1.0 True 0.397849 0.083333
20 227 0.0 0.540596 1.0 -0.000234 37.0 13.0 -1.0 1.0 True 0.397849 0.090278
21 190 1.0 0.528790 1.0 -0.000949 40.0 13.0 1.0 -1.0 True 0.430108 0.090278
22 96 0.0 0.521248 1.0 -0.007542 40.0 14.0 -1.0 1.0 True 0.430108 0.097222
23 4 1.0 0.520023 1.0 -0.001225 41.0 14.0 1.0 -1.0 True 0.440860 0.097222
24 220 0.0 0.519490 1.0 -0.000532 41.0 15.0 -1.0 1.0 True 0.440860 0.104167
25 78 1.0 0.499988 0.0 -0.004243 43.0 15.0 1.0 -1.0 True 0.462366 0.104167
26 237 0.0 0.488286 0.0 -0.001619 43.0 19.0 -1.0 1.0 True 0.462366 0.131944
27 204 1.0 0.486151 0.0 -0.002135 44.0 19.0 1.0 -1.0 True 0.473118 0.131944
28 232 0.0 0.454173 0.0 -0.009898 44.0 25.0 -1.0 1.0 True 0.473118 0.173611
29 217 1.0 0.444851 0.0 -0.009322 45.0 25.0 1.0 -1.0 True 0.483871 0.173611
... ... ... ... ... ... ... ... ... ... ... ... ...
65 166 1.0 0.265476 0.0 -0.011778 76.0 80.0 1.0 -1.0 True 0.817204 0.555556
66 152 0.0 0.264115 0.0 -0.001362 76.0 81.0 -1.0 1.0 True 0.817204 0.562500
67 75 1.0 0.262040 0.0 -0.002075 77.0 81.0 1.0 -1.0 True 0.827957 0.562500
68 141 0.0 0.250243 0.0 -0.002058 77.0 88.0 -1.0 1.0 True 0.827957 0.611111
69 169 1.0 0.250230 0.0 -0.000013 78.0 88.0 1.0 -1.0 True 0.838710 0.611111
70 11 0.0 0.242056 0.0 -0.000125 78.0 92.0 -1.0 1.0 True 0.838710 0.638889
71 83 1.0 0.242043 0.0 -0.000013 79.0 92.0 1.0 -1.0 True 0.849462 0.638889
72 7 0.0 0.242003 0.0 -0.000040 79.0 93.0 -1.0 1.0 True 0.849462 0.645833
73 258 1.0 0.241959 0.0 -0.000044 80.0 93.0 1.0 -1.0 True 0.860215 0.645833
74 25 0.0 0.241939 0.0 -0.000020 80.0 94.0 -1.0 1.0 True 0.860215 0.652778
75 107 1.0 0.241929 0.0 -0.000010 81.0 94.0 1.0 -1.0 True 0.870968 0.652778
76 8 0.0 0.236222 0.0 -0.000238 81.0 103.0 -1.0 1.0 True 0.870968 0.715278
77 225 1.0 0.233435 0.0 -0.000078 83.0 103.0 1.0 -1.0 True 0.892473 0.715278
78 191 0.0 0.232782 0.0 -0.000653 83.0 104.0 -1.0 1.0 True 0.892473 0.722222
79 0 1.0 0.229373 0.0 -0.003409 84.0 104.0 1.0 -1.0 True 0.903226 0.722222
80 221 0.0 0.225902 0.0 -0.003471 84.0 105.0 -1.0 1.0 True 0.903226 0.729167
81 245 1.0 0.224496 0.0 -0.001351 86.0 105.0 1.0 -1.0 True 0.924731 0.729167
82 43 0.0 0.220303 0.0 -0.000726 86.0 107.0 -1.0 1.0 True 0.924731 0.743056
83 192 1.0 0.219365 0.0 -0.000938 87.0 107.0 1.0 -1.0 True 0.935484 0.743056
84 50 0.0 0.219000 0.0 -0.000091 87.0 110.0 -1.0 1.0 True 0.935484 0.763889
85 254 1.0 0.214019 0.0 -0.004982 88.0 110.0 1.0 -1.0 True 0.946237 0.763889
86 157 0.0 0.208116 0.0 -0.000011 88.0 117.0 -1.0 1.0 True 0.946237 0.812500
87 162 1.0 0.207681 0.0 -0.000435 89.0 117.0 1.0 -1.0 True 0.956989 0.812500
88 60 0.0 0.183371 0.0 -0.005874 89.0 129.0 -1.0 1.0 True 0.956989 0.895833
89 260 1.0 0.177140 0.0 -0.006231 90.0 129.0 1.0 -1.0 True 0.967742 0.895833
90 247 0.0 0.175714 0.0 -0.001073 90.0 131.0 -1.0 1.0 True 0.967742 0.909722
91 22 1.0 0.172461 0.0 -0.003253 91.0 131.0 1.0 -1.0 True 0.978495 0.909722
92 246 0.0 0.154211 0.0 -0.004246 91.0 138.0 -1.0 1.0 True 0.978495 0.958333
93 6 1.0 0.149162 0.0 -0.004977 93.0 138.0 1.0 -1.0 True 1.000000 0.958333
94 200 0.0 0.078597 0.0 -0.006344 93.0 144.0 1.0 1.0 True 1.000000 1.000000

95 rows × 12 columns
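To see that the hand-rolled curve tracks sklearn's, overlay the two (a sketch assuming roc_df, fpr, and tpr from the cells above; small deviations at the start are possible since the manual version does not prepend an explicit (0, 0) point):

In [ ]:
# overlay the manual ROC reconstruction on sklearn's roc_curve output
plt.plot(roc_df["fps_norm"], roc_df["tps_norm"], label="manual")
plt.plot(fpr, tpr, "--", label="sklearn roc_curve")
plt.plot([0, 1], [0, 1], ":", color="gray")
plt.legend()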

In [154]:
## Confusion Matrix
confusion_matrix = metrics.confusion_matrix(y_test, predict)
print(confusion_matrix)
print(confusion_matrix / confusion_matrix.sum())
# sklearn's convention: rows = actual class, columns = predicted class; with 1 as
# the positive class the layout is [[TN, FP], [FN, TP]]
PNTF_labels = np.array([["TN", "FP"], ["FN", "TP"]])
sns.heatmap(confusion_matrix, annot=PNTF_labels, fmt="")
confusion_s = pd.Series(confusion_matrix.flatten(), index=PNTF_labels.flatten())
confusion_s
[[152  16]
 [ 57  43]]
[[ 0.56716418  0.05970149]
 [ 0.21268657  0.16044776]]
Out[154]:
TN    152
FP     16
FN     57
TP     43
dtype: int64
[figure: confusion-matrix heatmap annotated with TN/FP/FN/TP]
In [153]:
confusion_matrix.flatten()
Out[153]:
array([152,  16,  57,  43])

Reading the labels XY: X records whether the prediction was correct (T) or wrong (F), and Y is the predicted class, with P for positive and N for negative (cross-checked in code right after this list).

  • TP: predicted positive, and the prediction was correct (actual positive)
  • TN: predicted negative, and the prediction was correct (actual negative)
  • FP: predicted positive, but the prediction was wrong (actual negative)
  • FN: predicted negative, but the prediction was wrong (actual positive)
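A quick sanity check: the same four counts fall out of the raw labels and predictions (assuming y_test and predict from above):

In [ ]:
# re-derive the confusion-matrix entries from the raw predictions (positive class = 1)
tp = ((y_test == 1) & (predict == 1)).sum()
tn = ((y_test == 0) & (predict == 0)).sum()
fp = ((y_test == 0) & (predict == 1)).sum()
fn = ((y_test == 1) & (predict == 0)).sum()
tp, tn, fp, fn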
In [159]:
# The ROC curve plots TPR = TP / (TP + FN) (vertical axis) against
# FPR = FP / (FP + TN) (horizontal axis) as the decision threshold varies.
_cfs = confusion_s
[
    _cfs["TP"] / (_cfs["TP"] + _cfs["FN"]),  # of the actual positives, the fraction predicted correctly (recall / TPR)
    _cfs["FP"] / (_cfs["FP"] + _cfs["TN"]),  # of the actual negatives, the fraction wrongly predicted positive (false positive rate)
    _cfs["TP"] / (_cfs["TP"] + _cfs["FP"]),  # of the predicted positives, the fraction actually positive (precision)
    (_cfs["TP"] + _cfs["TN"]) / _cfs.sum(),  # overall accuracy
]

Out[159]:
[0.42999999999999999,
 0.095238095238095233,
 0.72881355932203384,
 0.72761194029850751]
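The same quantities are available as ready-made scorers; a cross-check against the manual arithmetic above:

In [ ]:
# recall, precision, and accuracy straight from sklearn (positive class = 1)
(metrics.recall_score(y_test, predict),
 metrics.precision_score(y_test, predict),
 metrics.accuracy_score(y_test, predict))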
In [145]:
# _cfmx: a labeled DataFrame view of the confusion matrix, built in a cell not
# shown here (apparently indexed by correctness T/F with P/N columns)
_cfmx.loc["T"].sum(), _cfmx.sum()

Out[145]:
(168, P    209
 N     59
 dtype: int64)
In [87]:
print(metrics.classification_report(y_test, predict))
             precision    recall  f1-score   support

          0       0.73      0.90      0.81       168
          1       0.73      0.43      0.54       100

avg / total       0.73      0.73      0.71       268

In [73]:
## ROC curve
In [109]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
ax1.plot(fpr, tpr, [0,1],[0,1])
#plt.plot(tpr, fpr, [0,1],[0,1])

pd.Series(tpr, index=fpr).plot(ax=ax2)
pd.Series([-0.05, 0, 1, 1.05], index=[-0.05, 0,1, 1.05]).plot(ax=ax2)

Out[109]:
<matplotlib.axes._subplots.AxesSubplot at 0x115b73c88>
[figure: ROC curve with the chance diagonal]
In [160]:
## Accuracy
[
    metrics.accuracy_score(y_test, predict),
    (y_test == (proba[:, 1] > 0.5).astype(int)).sum() / y_test.shape[0]
]
Out[160]:
[0.72761194029850751, 0.72761194029850751]
In [8]:
# learning curve
# http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/05.03-Hyperparameters-and-Model-Validation.ipynb

In [10]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
#from sklearn.learning_curve import learning_curve
from sklearn.model_selection import learning_curve

def PolynomialRegression(degree=2, **kwargs):
    return make_pipeline(PolynomialFeatures(degree),
                         LinearRegression(**kwargs))

def make_data(N, err=1.0, rseed=1):
    # randomly sample the data
    rng = np.random.RandomState(rseed)
    X = rng.rand(N, 1) ** 2
    y = 10 - 1. / (X.ravel() + 0.1)
    if err > 0:
        y += err * rng.randn(N)
    return X, y

X, y = make_data(40)

fig, axes = plt.subplots(1, 2, figsize=(16, 6))
fig.subplots_adjust(left=0.0625, right=0.95, wspace=0.1)

degrees = [2, 9]
for degree, ax in zip(degrees, axes):
    N, train_lc, val_lc = learning_curve(PolynomialRegression(degree),
                                         X, y, cv=7,
                                         train_sizes=np.linspace(0.3, 1, 25))

    ax.plot(N, np.mean(train_lc, 1), color='blue', label='training score')
    ax.plot(N, np.mean(val_lc, 1), color='red', label='validation score')
    ax.hlines(np.mean([train_lc[-1], val_lc[-1]]), N[0], N[-1],
                 color='gray', linestyle='dashed')

    ax.set_ylim(0, 1)
    ax.set_xlim(N[0], N[-1])
    ax.set_xlabel('training size')
    ax.set_ylabel('score')
    ax.set_title('degree = {0}'.format(degree), size=14)
    ax.legend(loc='best')
[figure: learning curves for degree = 2 and degree = 9]
In [9]:
# validation curve
In [11]:
#from sklearn.learning_curve import validation_curve
from sklearn.model_selection import validation_curve

degree = np.arange(0, 21)
train_score, val_score = validation_curve(PolynomialRegression(), X, y,
                                          param_name='polynomialfeatures__degree',
                                          param_range=degree, cv=7)

plt.plot(degree, np.median(train_score, 1), color='blue', label='training score')
plt.plot(degree, np.median(val_score, 1), color='red', label='validation score')
plt.legend(loc='best')
plt.ylim(0, 1)
plt.xlabel('degree')
plt.ylabel('score');
[figure: validation curve, training vs. validation score by degree]