sklearn_processing

In [2]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
In [20]:
import math
In [5]:
# Two small sample vectors used as inputs by the examples in this notebook.
x1 = [1, 2, 0]
x2 = [1, -1, 2]
In [47]:
# http://scikit-learn.org/stable/modules/preprocessing.html
In [4]:
# http://www.dataminingblog.com/standardization-vs-normalization/
# http://blog.pengyifan.com/scale-standardize-and-normalize-data/
# http://webbeginner.hatenablog.com/entry/2014/04/28/214822
In [46]:
# Standardize x1 with scikit-learn and display the result together with its
# mean and std (the recorded output shows mean 0.0 and std ~1.0).
scaled = preprocessing.scale(x1)
scaled, scaled.mean(), scaled.std()
Out[46]:
(array([ 0.        ,  1.22474487, -1.22474487]), 0.0, 0.99999999999999989)
In [50]:
# Min-max scale x1 with scikit-learn and display the result together with its
# mean and std (the recorded output spans 0.0 to 1.0).
min_max_scaled = preprocessing.minmax_scale(x1)
min_max_scaled, min_max_scaled.mean(), min_max_scaled.std()
Out[50]:
(array([ 0.5,  1. ,  0. ]), 0.5, 0.40824829046386302)
In [9]:
# Show x2 converted to a 1-D NumPy array before it is passed to normalize().
np.array(x2)
Out[9]:
array([ 1, -1,  2])
In [8]:
# Normalize x2 passed as a 1-D array.
# NOTE(review): in the recorded run this emitted a DeprecationWarning stating
# that 1-D input raises ValueError from scikit-learn 0.19; the reshape(1, -1)
# call in the following cell is the form the warning recommends.
preprocessing.normalize(np.array(x2))
/Users/knt/.pyenv/versions/anaconda3-2.5.0/lib/python3.5/site-packages/sklearn/utils/validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and willraise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)
/Users/knt/.pyenv/versions/anaconda3-2.5.0/lib/python3.5/site-packages/sklearn/utils/validation.py:420: DataConversionWarning: Data with input dtype int64 was converted to float64 by the normalize function.
  warnings.warn(msg, DataConversionWarning)
Out[8]:
array([[ 0.40824829, -0.40824829,  0.81649658]])
In [11]:
# Reshape x2 into a single row (1, n) so normalize() receives 2-D input; in the
# recorded run only the int64 -> float64 DataConversionWarning remains.
preprocessing.normalize(np.array(x2).reshape(1, -1))
/Users/knt/.pyenv/versions/anaconda3-2.5.0/lib/python3.5/site-packages/sklearn/utils/validation.py:420: DataConversionWarning: Data with input dtype int64 was converted to float64 by the normalize function.
  warnings.warn(msg, DataConversionWarning)
Out[11]:
array([[ 0.40824829, -0.40824829,  0.81649658]])
In [13]:
# Cast to float64 up front as well as reshaping; the recorded run shows no
# warnings for this form and the same normalized values.
preprocessing.normalize(np.array(x2).astype(np.float64).reshape(1, -1))
Out[13]:
array([[ 0.40824829, -0.40824829,  0.81649658]])
In [42]:
def my_scale(x):
    """Min-max scale the values of ``x`` into the range [0, 1].

    Each value ``v`` is mapped to ``(v - min(x)) / (max(x) - min(x))``.

    Parameters
    ----------
    x : iterable of numbers
        Values to rescale. The input is read exactly once, so one-shot
        iterators (e.g. generators) are accepted as well as lists.

    Returns
    -------
    iterator of float
        Scaled values in input order; wrap in ``list(...)`` to materialize.
        Empty input yields nothing. When every value is identical the range
        is zero, so each value maps to ``0.0`` instead of raising
        ``ZeroDivisionError``.
    """
    # Materialize once: min(), max() and the final pass all need the data.
    values = list(x)
    if not values:
        return iter(())

    # Compute the bounds a single time instead of once per element.
    lowest = min(values)
    span = max(values) - lowest
    if span == 0:
        # Constant input: there is no range to scale over.
        return (0.0 for _ in values)
    return ((v - lowest) / span for v in values)
In [43]:
# Materialize the hand-written min-max scaling of x1 so it can be compared
# with the minmax_scale output shown earlier.
list(my_scale(x1))
Out[43]:
[0.5, 1.0, 0.0]
In [26]:
def my_norm(x):
    """Return the Euclidean (L2) norm of ``x``.

    The norm is the square root of the sum of the squared components of
    ``x``; an empty input gives ``0.0``.
    """
    squared_total = 0
    for component in x:
        squared_total += component ** 2
    return math.sqrt(squared_total)
In [27]:
def my_normalize(x):
    """Scale ``x`` to unit Euclidean length.

    Each value is divided by ``my_norm(x)``.

    Parameters
    ----------
    x : iterable of numbers
        The vector to normalize. It is read exactly once, so one-shot
        iterators are accepted as well as lists.

    Returns
    -------
    iterator of float
        The normalized components in input order; wrap in ``list(...)`` to
        materialize. A vector whose norm is zero (all zeros, or empty) is
        returned as zeros rather than raising ``ZeroDivisionError``.
    """
    # Materialize once: the norm and the division pass both need the data.
    values = list(x)

    # Compute the norm a single time instead of once per element.
    norm = my_norm(values)
    if norm == 0:
        # Nothing to rescale; dividing by 1.0 keeps the outputs as floats.
        norm = 1.0
    return (v / norm for v in values)
In [29]:
# Materialize the hand-written L2 normalization of x2 so it can be compared
# with the preprocessing.normalize output shown earlier.
list(my_normalize(x2))
Out[29]:
[0.4082482904638631, -0.4082482904638631, 0.8164965809277261]
In [32]:
def my_mean(x):
    """Return the arithmetic mean of ``x``: the sum of its values over their count."""
    total = sum(x)
    count = len(x)
    return total / count
In [40]:
def my_std(x):
    """Return the population standard deviation of ``x``.

    This is the square root of the mean squared deviation from
    ``my_mean(x)``, dividing by ``len(x)`` (not ``len(x) - 1``).

    Parameters
    ----------
    x : iterable of numbers
        The values to summarize; read exactly once.

    Returns
    -------
    float
        The population standard deviation.

    Raises
    ------
    ZeroDivisionError
        If ``x`` is empty.
    """
    values = list(x)
    # Compute the mean a single time instead of once per element.
    mean = my_mean(values)
    squared_deviations = sum((v - mean) ** 2 for v in values)
    return math.sqrt(squared_deviations / len(values))
In [37]:
def my_standardize(x):
    """Standardize ``x`` to zero mean and unit (population) standard deviation.

    Each value ``v`` is mapped to ``(v - my_mean(x)) / my_std(x)``.

    Parameters
    ----------
    x : iterable of numbers
        The values to standardize; read exactly once.

    Returns
    -------
    iterator of float
        Standardized values in input order; wrap in ``list(...)`` to
        materialize. Empty input yields nothing. When the standard deviation
        is zero (constant input) the centered values are returned unscaled
        instead of raising ``ZeroDivisionError``.
    """
    values = list(x)
    if not values:
        return iter(())

    # Compute both statistics once rather than once per element.
    mean = my_mean(values)
    std = my_std(values)
    if std == 0:
        # No spread to scale by; dividing by 1.0 keeps the outputs as floats.
        std = 1.0
    return ((v - mean) / std for v in values)
In [41]:
# Materialize the hand-written standardization of x1 so it can be compared
# with the preprocessing.scale output shown earlier.
list(my_standardize(x1))
Out[41]:
[0.0, 1.224744871391589, -1.224744871391589]