import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
from sklearn import datasets
from sklearn import model_selection
# Fetch seaborn's example Titanic table, then hold out 30% of the rows as a
# test set. The target is the "survived" column; every other column stays in X.
# No random_state is passed, so the split differs from run to run.
# NOTE(review): the "alive" column (yes/no) kept in X looks like a copy of the
# target -- confirm, and drop it before fitting any model on these features.
titanic = sns.load_dataset("titanic")
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    titanic.drop("survived", axis=1),
    titanic["survived"],
    test_size=0.3,
)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 623 entries, 308 to 579
Data columns (total 14 columns):
pclass         623 non-null int64
sex            623 non-null object
age            511 non-null float64
sibsp          623 non-null int64
parch          623 non-null int64
fare           623 non-null float64
embarked       621 non-null object
class          623 non-null category
who            623 non-null object
adult_male     623 non-null bool
deck           146 non-null category
embark_town    621 non-null object
alive          623 non-null object
alone          623 non-null bool
dtypes: bool(2), category(2), float64(2), int64(3), object(5)
memory usage: 56.1+ KB
pclass sex age sibsp parch fare embarked class who adult_male deck embark_town alive alone
308 2 male 30.0 1 0 24.0 C Second man True NaN Cherbourg no False
417 2 female 18.0 0 2 13.0 S Second woman False NaN Southampton yes False
612 3 female NaN 1 0 15.5 Q Third woman False NaN Queenstown yes False
641 1 female 24.0 0 0 69.3 C First woman False B Cherbourg yes True
853 1 female 16.0 0 1 39.4 S First woman False D Southampton yes False
from sklearn import pipeline
from sklearn import preprocessing
TypeError                                 Traceback (most recent call last)
<ipython-input-9-5121c3fc1f23> in <module>()
      3 pipeline.Pipeline(
      4     [
----> 5         preprocessing
      6     ]
      7 )

/Users/knt/.pyenv/versions/anaconda3-4.2.0/lib/python3.5/site-packages/sklearn/pipeline.py in __init__(self, steps)
    151         # shallow copy of steps
    152         self.steps = tosequence(steps)
--> 153         self._validate_steps()
    155     def get_params(self, deep=True):

/Users/knt/.pyenv/versions/anaconda3-4.2.0/lib/python3.5/site-packages/sklearn/pipeline.py in _validate_steps(self)
    183     def _validate_steps(self):
--> 184         names, estimators = zip(*self.steps)
    186         # validate names

TypeError: zip argument #1 must support iteration
import numpy as np
from sklearn import datasets
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

# Tune a scale -> PCA -> SVC pipeline on the iris data with a grid search.
iris = datasets.load_iris()

# Each step is a (name, estimator) pair; the step names are reused as the
# prefixes of the grid keys below.
pipepline = Pipeline([
    ("scale", StandardScaler()),
    ("pca", PCA()),
    ("svc", SVC()),
])

# Keys follow the "<step name>__<parameter>" form so that each candidate value
# is routed to the matching pipeline step (2 * 4 * 2 = 16 combinations).
params = {
    "pca__n_components": (2, 3),
    "svc__C": (0.1, 0.5, 1.0, 1.5),
    "svc__kernel": ("linear", "rbf"),
}
grid_cv = GridSearchCV(pipepline, params, cv=3, scoring="accuracy")

# Hold out 30% of the samples; the seeded RandomState keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.3, random_state=np.random.RandomState(0))

# Search on the training portion only, so X_test/y_test stay unseen until the
# final evaluation.
grid_cv.fit(X_train, y_train)
#scores = cross_val_score(grid_cv, X_train, y_train, cv=5)

GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(steps=[('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('svc', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'svc__kernel': ('linear', 'rbf'), 'svc__C': (0.1, 0.5, 1.0, 1.5), 'pca__n_components': (2, 3)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=0)
Pipeline(steps=[('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, iterated_power='auto', n_components=3, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('svc', SVC(C=1.5, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])
{'pca__n_components': 3, 'svc__C': 1.5, 'svc__kernel': 'rbf'}
/Users/knt/.pyenv/versions/anaconda3-4.2.0/lib/python3.5/site-packages/sklearn/model_selection/_search.py:667: DeprecationWarning: The grid_scores_ attribute was deprecated in version 0.18 in favor of the more elaborate cv_results_ attribute. The grid_scores_ attribute will not be available from 0.20
[mean: 0.85714, std: 0.07686, params: {'svc__kernel': 'linear', 'svc__C': 0.1, 'pca__n_components': 2},
 mean: 0.83810, std: 0.02339, params: {'svc__kernel': 'rbf', 'svc__C': 0.1, 'pca__n_components': 2},
 mean: 0.86667, std: 0.06819, params: {'svc__kernel': 'linear', 'svc__C': 0.5, 'pca__n_components': 2},
 mean: 0.86667, std: 0.06819, params: {'svc__kernel': 'rbf', 'svc__C': 0.5, 'pca__n_components': 2},
 mean: 0.87619, std: 0.03329, params: {'svc__kernel': 'linear', 'svc__C': 1.0, 'pca__n_components': 2},
 mean: 0.88571, std: 0.02185, params: {'svc__kernel': 'rbf', 'svc__C': 1.0, 'pca__n_components': 2},
 mean: 0.88571, std: 0.02185, params: {'svc__kernel': 'linear', 'svc__C': 1.5, 'pca__n_components': 2},
 mean: 0.87619, std: 0.03329, params: {'svc__kernel': 'rbf', 'svc__C': 1.5, 'pca__n_components': 2},
 mean: 0.89524, std: 0.06433, params: {'svc__kernel': 'linear', 'svc__C': 0.1, 'pca__n_components': 3},
 mean: 0.85714, std: 0.00334, params: {'svc__kernel': 'rbf', 'svc__C': 0.1, 'pca__n_components': 3},
 mean: 0.94286, std: 0.02240, params: {'svc__kernel': 'linear', 'svc__C': 0.5, 'pca__n_components': 3},
 mean: 0.91429, std: 0.02212, params: {'svc__kernel': 'rbf', 'svc__C': 0.5, 'pca__n_components': 3},
 mean: 0.93333, std: 0.02704, params: {'svc__kernel': 'linear', 'svc__C': 1.0, 'pca__n_components': 3},
 mean: 0.94286, std: 0.04539, params: {'svc__kernel': 'rbf', 'svc__C': 1.0, 'pca__n_components': 3},
 mean: 0.93333, std: 0.02704, params: {'svc__kernel': 'linear', 'svc__C': 1.5, 'pca__n_components': 3},
 mean: 0.95238, std: 0.03497, params: {'svc__kernel': 'rbf', 'svc__C': 1.5, 'pca__n_components': 3}]
{'mean_fit_time': array([ 0.01003496,  0.00146937,  0.00133928,  0.0015343 ,  0.00181842,
         0.00131806,  0.00118907,  0.00153232,  0.00192833,  0.00210055,
         0.00125098,  0.00170763,  0.00132998,  0.00135334,  0.00126402,
 'mean_score_time': array([ 0.00115132,  0.00100772,  0.00045204,  0.00046635,  0.00045633,
         0.00044831,  0.00042756,  0.00055734,  0.00074673,  0.00062307,
         0.00050004,  0.00058532,  0.00043869,  0.00047898,  0.0004584 ,
 'mean_test_score': array([ 0.85714286,  0.83809524,  0.86666667,  0.86666667,  0.87619048,
         0.88571429,  0.88571429,  0.87619048,  0.8952381 ,  0.85714286,
         0.94285714,  0.91428571,  0.93333333,  0.94285714,  0.93333333,
 'mean_train_score': array([ 0.91889987,  0.87610785,  0.92842563,  0.92842563,  0.92842563,
         0.92842563,  0.92842563,  0.92842563,  0.94264427,  0.88597284,
         0.97135761,  0.97611952,  0.97611952,  0.97611952,  0.97135761,
 'param_pca__n_components': masked_array(data = [2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3],
              mask = [False False False False False False False False False False False False
  False False False False],
        fill_value = ?),
 'param_svc__C': masked_array(data = [0.1 0.1 0.5 0.5 1.0 1.0 1.5 1.5 0.1 0.1 0.5 0.5 1.0 1.0 1.5 1.5],
              mask = [False False False False False False False False False False False False
  False False False False],
        fill_value = ?),
 'param_svc__kernel': masked_array(data = ['linear' 'rbf' 'linear' 'rbf' 'linear' 'rbf' 'linear' 'rbf' 'linear' 'rbf'
  'linear' 'rbf' 'linear' 'rbf' 'linear' 'rbf'],
              mask = [False False False False False False False False False False False False
  False False False False],
        fill_value = ?),
 'params': ({'pca__n_components': 2, 'svc__C': 0.1, 'svc__kernel': 'linear'},
  {'pca__n_components': 2, 'svc__C': 0.1, 'svc__kernel': 'rbf'},
  {'pca__n_components': 2, 'svc__C': 0.5, 'svc__kernel': 'linear'},
  {'pca__n_components': 2, 'svc__C': 0.5, 'svc__kernel': 'rbf'},
  {'pca__n_components': 2, 'svc__C': 1.0, 'svc__kernel': 'linear'},
  {'pca__n_components': 2, 'svc__C': 1.0, 'svc__kernel': 'rbf'},
  {'pca__n_components': 2, 'svc__C': 1.5, 'svc__kernel': 'linear'},
  {'pca__n_components': 2, 'svc__C': 1.5, 'svc__kernel': 'rbf'},
  {'pca__n_components': 3, 'svc__C': 0.1, 'svc__kernel': 'linear'},
  {'pca__n_components': 3, 'svc__C': 0.1, 'svc__kernel': 'rbf'},
  {'pca__n_components': 3, 'svc__C': 0.5, 'svc__kernel': 'linear'},
  {'pca__n_components': 3, 'svc__C': 0.5, 'svc__kernel': 'rbf'},
  {'pca__n_components': 3, 'svc__C': 1.0, 'svc__kernel': 'linear'},
  {'pca__n_components': 3, 'svc__C': 1.0, 'svc__kernel': 'rbf'},
  {'pca__n_components': 3, 'svc__C': 1.5, 'svc__kernel': 'linear'},
  {'pca__n_components': 3, 'svc__C': 1.5, 'svc__kernel': 'rbf'}),
 'rank_test_score': array([14, 16, 12, 12, 10,  8,  8, 10,  7, 14,  2,  6,  4,  2,  4,  1], dtype=int32),
 'split0_test_score': array([ 0.75      ,  0.80555556,  0.77777778,  0.77777778,  0.83333333,
         0.86111111,  0.86111111,  0.83333333,  0.80555556,  0.86111111,
         0.91666667,  0.88888889,  0.91666667,  0.88888889,  0.91666667,
 'split0_train_score': array([ 0.89855072,  0.88405797,  0.91304348,  0.91304348,  0.91304348,
         0.91304348,  0.91304348,  0.91304348,  0.91304348,  0.92753623,
         0.97101449,  0.97101449,  0.97101449,  0.97101449,  0.97101449,
 'split1_test_score': array([ 0.91428571,  0.85714286,  0.94285714,  0.94285714,  0.91428571,
         0.91428571,  0.91428571,  0.91428571,  0.94285714,  0.85714286,
         0.97142857,  0.94285714,  0.97142857,  1.        ,  0.97142857,  1.        ]),
 'split1_train_score': array([ 0.92857143,  0.84285714,  0.92857143,  0.92857143,  0.92857143,
         0.92857143,  0.92857143,  0.92857143,  0.95714286,  0.85714286,
         0.95714286,  0.97142857,  0.97142857,  0.97142857,  0.95714286,
 'split2_test_score': array([ 0.91176471,  0.85294118,  0.88235294,  0.88235294,  0.88235294,
         0.88235294,  0.88235294,  0.88235294,  0.94117647,  0.85294118,
         0.94117647,  0.91176471,  0.91176471,  0.94117647,  0.91176471,
 'split2_train_score': array([ 0.92957746,  0.90140845,  0.94366197,  0.94366197,  0.94366197,
         0.94366197,  0.94366197,  0.94366197,  0.95774648,  0.87323944,
         0.98591549,  0.98591549,  0.98591549,  0.98591549,  0.98591549,
 'std_fit_time': array([  1.21561924e-02,   9.30547439e-05,   1.10284565e-04,
          3.03047198e-04,   6.95946432e-04,   3.71424893e-05,
          4.57914039e-05,   1.19908349e-04,   4.01103672e-04,
          7.74041020e-04,   9.07622507e-05,   4.54353185e-04,
          8.52150680e-05,   5.58674420e-05,   2.01641925e-05,
 'std_score_time': array([  9.94021775e-04,   7.42038836e-04,   2.80585344e-05,
          1.11471841e-05,   2.22151877e-05,   3.86568276e-06,
          1.91065713e-06,   1.07979550e-04,   6.34805604e-05,
          9.84837733e-05,   5.96352605e-05,   1.53933862e-04,
          1.06214518e-05,   1.49894325e-05,   2.65036923e-05,
 'std_test_score': array([ 0.07739765,  0.02356551,  0.06872894,  0.06872894,  0.0335527 ,
         0.02198419,  0.02198419,  0.0335527 ,  0.06478264,  0.003334  ,
         0.02254336,  0.02225554,  0.02701157,  0.04569405,  0.02701157,
 'std_train_score': array([ 0.01439488,  0.02455561,  0.01250037,  0.01250037,  0.01250037,
         0.01250037,  0.01250037,  0.01250037,  0.02093237,  0.03011546,
         0.01174888,  0.00692886,  0.00692886,  0.00692886,  0.01174888,
# 5-fold cross-validation of the selected pipeline on the training split,
# scored with the estimator's default scorer (no scoring= is given).
# NOTE(review): the hyperparameters were chosen on this same X_train, so these
# scores are likely optimistic; cross-validating grid_cv itself (nested CV)
# would avoid that.
cross_val_score(grid_cv.best_estimator_, X_train, y_train, cv=5)
array([ 0.90909091,  0.95454545,  1.        ,  0.95238095,  0.94736842])
# Fit the selected pipeline on the whole training split.
# NOTE(review): the search's repr shows refit=True, so grid_cv.fit(X_train,
# y_train) should already have refit best_estimator_ on this same data; this
# call appears to repeat that work -- confirm before removing.
grid_cv.best_estimator_.fit(X_train, y_train)
Pipeline(steps=[('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, iterated_power='auto', n_components=3, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('svc', SVC(C=1.5, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])
# Predict class labels for the held-out test samples with the tuned pipeline.
y_pred = grid_cv.best_estimator_.predict(X_test)
array([2, 1, 0, 2, 0, 2, 0, 1, 1, 1, 2, 1, 1, 1, 1, 0, 1, 1, 0, 0, 2, 1, 0,
       0, 2, 0, 0, 1, 1, 0, 2, 1, 0, 2, 2, 1, 0, 2, 1, 1, 2, 0, 2, 0, 0])
from sklearn.metrics import confusion_matrix, classification_report

# Per-class precision / recall / F1 on the test set; true labels are passed
# first, predictions second.
print(classification_report(y_test, y_pred))
# Confusion matrix for the same predictions. In the recorded output the row
# sums equal the per-class support, i.e. rows are true classes and columns are
# predicted classes.
confusion_matrix(y_test, y_pred)

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        16
          1       1.00      0.94      0.97        18
          2       0.92      1.00      0.96        11

avg / total       0.98      0.98      0.98        45

array([[16,  0,  0],
       [ 0, 17,  1],
       [ 0,  0, 11]])
from sklearn.metrics import scorer
{'accuracy': make_scorer(accuracy_score),
 'adjusted_rand_score': make_scorer(adjusted_rand_score),
 'average_precision': make_scorer(average_precision_score, needs_threshold=True),
 'f1': make_scorer(f1_score),
 'f1_macro': make_scorer(f1_score, average=macro, pos_label=None),
 'f1_micro': make_scorer(f1_score, average=micro, pos_label=None),
 'f1_samples': make_scorer(f1_score, average=samples, pos_label=None),
 'f1_weighted': make_scorer(f1_score, average=weighted, pos_label=None),
 'log_loss': make_scorer(log_loss, greater_is_better=False, needs_proba=True),
 'mean_absolute_error': make_scorer(mean_absolute_error, greater_is_better=False),
 'mean_squared_error': make_scorer(mean_squared_error, greater_is_better=False),
 'median_absolute_error': make_scorer(median_absolute_error, greater_is_better=False),
 'neg_log_loss': make_scorer(log_loss, greater_is_better=False, needs_proba=True),
 'neg_mean_absolute_error': make_scorer(mean_absolute_error, greater_is_better=False),
 'neg_mean_squared_error': make_scorer(mean_squared_error, greater_is_better=False),
 'neg_median_absolute_error': make_scorer(median_absolute_error, greater_is_better=False),
 'precision': make_scorer(precision_score),
 'precision_macro': make_scorer(precision_score, average=macro, pos_label=None),
 'precision_micro': make_scorer(precision_score, average=micro, pos_label=None),
 'precision_samples': make_scorer(precision_score, average=samples, pos_label=None),
 'precision_weighted': make_scorer(precision_score, average=weighted, pos_label=None),
 'r2': make_scorer(r2_score),
 'recall': make_scorer(recall_score),
 'recall_macro': make_scorer(recall_score, average=macro, pos_label=None),
 'recall_micro': make_scorer(recall_score, average=micro, pos_label=None),
 'recall_samples': make_scorer(recall_score, average=samples, pos_label=None),
 'recall_weighted': make_scorer(recall_score, average=weighted, pos_label=None),
 'roc_auc': make_scorer(roc_auc_score, needs_threshold=True)}