sklearn_processing¶

In [2]:

import numpy as np
import pandas as pd
from sklearn import preprocessing
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [20]:

import math

In [5]:

# Small sample vectors used as inputs by the cells below:
# x1 feeds the scaling/standardization examples, x2 the normalization examples.
x1 = [1, 2, 0]
x2 = [1, -1, 2]

In [47]:

# http://scikit-learn.org/stable/modules/preprocessing.html

In [4]:

# http://www.dataminingblog.com/standardization-vs-normalization/
# http://blog.pengyifan.com/scale-standardize-and-normalize-data/
# http://webbeginner.hatenablog.com/entry/2014/04/28/214822

In [46]:

# Standardize x1 with scikit-learn, then display the scaled values together
# with their mean and standard deviation.
scaled = preprocessing.scale(x1)
scaled, scaled.mean(), scaled.std()

Out[46]:

(array([ 0.        ,  1.22474487, -1.22474487]), 0.0, 0.99999999999999989)

In [50]:

# Min-max scale x1 with scikit-learn, then display the scaled values together
# with their mean and standard deviation.
min_max_scaled = preprocessing.minmax_scale(x1)
min_max_scaled, min_max_scaled.mean(), min_max_scaled.std()

Out[50]:

(array([ 0.5,  1. ,  0. ]), 0.5, 0.40824829046386302)

In [9]:

# Show x2 converted to a NumPy array (the form passed to normalize below).
np.array(x2)

Out[9]:

array([ 1, -1,  2])

In [8]:

# NOTE(review): x2 is passed as a 1-D array here. The sklearn version that
# produced the recorded output only warns, but that warning states it becomes
# a ValueError from 0.19. The next cell reshapes to a single row, (1, -1),
# which is the form the warning recommends for a single sample.
preprocessing.normalize(np.array(x2))

/Users/knt/.pyenv/versions/anaconda3-2.5.0/lib/python3.5/site-packages/sklearn/utils/validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and willraise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)
/Users/knt/.pyenv/versions/anaconda3-2.5.0/lib/python3.5/site-packages/sklearn/utils/validation.py:420: DataConversionWarning: Data with input dtype int64 was converted to float64 by the normalize function.
  warnings.warn(msg, DataConversionWarning)

Out[8]:

array([[ 0.40824829, -0.40824829,  0.81649658]])

In [11]:

# Reshape x2 into a single row (one sample) before normalizing, as the
# deprecation warning from the 1-D call recommends.
preprocessing.normalize(np.array(x2).reshape(1, -1))

/Users/knt/.pyenv/versions/anaconda3-2.5.0/lib/python3.5/site-packages/sklearn/utils/validation.py:420: DataConversionWarning: Data with input dtype int64 was converted to float64 by the normalize function.
  warnings.warn(msg, DataConversionWarning)

Out[11]:

array([[ 0.40824829, -0.40824829,  0.81649658]])

In [13]:

# Cast to float64 before reshaping so normalize receives floating-point input
# and does not have to convert the int64 data itself.
preprocessing.normalize(np.array(x2).astype(np.float64).reshape(1, -1))

Out[13]:

array([[ 0.40824829, -0.40824829,  0.81649658]])

In [42]:

def my_scale(x):
    """Min-max scale the values of ``x`` into the range [0, 1].

    Each value ``v`` is mapped to ``(v - min(x)) / (max(x) - min(x))``.

    Args:
        x: Iterable of real numbers. It is materialized once, so one-shot
            iterators and generators are accepted as well as sequences.

    Returns:
        An iterator over the scaled values, in input order. Empty input
        yields nothing. If every value is identical the range is zero, so
        each value is mapped to 0.0 instead of dividing by zero.
    """
    values = list(x)
    if not values:
        return iter(())
    # min/max do not change between elements: compute them once rather than
    # inside the per-element lambda (which made the function O(n^2)).
    low = min(values)
    span = max(values) - low
    if span == 0:
        # Constant input: no spread to scale by.
        return iter([0.0] * len(values))
    return map(lambda v: (v - low) / span, values)

In [43]:

# my_scale returns a lazy iterator; wrap it in list() to display the values.
list(my_scale(x1))

Out[43]:

[0.5, 1.0, 0.0]

In [26]:

def my_norm(x):
    """Return the Euclidean (L2) norm of ``x``.

    That is, the square root of the sum of the squared elements of ``x``.
    """
    squared_total = sum(component ** 2 for component in x)
    return math.sqrt(squared_total)

In [27]:

def my_normalize(x):
    """Scale ``x`` to unit Euclidean (L2) length.

    Each value is divided by ``my_norm(x)``.

    Args:
        x: Iterable of real numbers. It is materialized once, so one-shot
            iterators and generators are accepted as well as sequences.

    Returns:
        An iterator over the normalized values, in input order. A vector
        whose norm is zero (all zeros, or empty) is returned as zeros
        instead of raising ZeroDivisionError.
    """
    values = list(x)
    # The norm is the same for every element: compute it once rather than
    # inside the per-element lambda (which made the function O(n^2)).
    # A zero norm is replaced by 1.0 so a zero vector divides cleanly to zeros.
    norm = my_norm(values) or 1.0
    return map(lambda v: v / norm, values)

In [29]:

# my_normalize returns a lazy iterator; wrap it in list() to display the values.
list(my_normalize(x2))

Out[29]:

[0.4082482904638631, -0.4082482904638631, 0.8164965809277261]

In [32]:

def my_mean(x):
    """Return the arithmetic mean of ``x``: the sum of its values over their count.

    ``x`` must support ``len()``; an empty ``x`` raises ZeroDivisionError.
    """
    total = sum(x)
    count = len(x)
    return total / count

In [40]:

def my_std(x):
    """Return the population standard deviation of ``x``.

    Computed as the square root of the mean squared deviation from
    ``my_mean(x)``; the divisor is ``len(x)`` (not ``len(x) - 1``).

    Args:
        x: Sequence of real numbers (must support ``len()`` and repeated
            iteration).

    Returns:
        The standard deviation as a float.

    Raises:
        ZeroDivisionError: If ``x`` is empty.
    """
    # The mean is the same for every element: compute it once rather than
    # inside the per-element lambda (which made the function O(n^2)).
    mean = my_mean(x)
    squared_deviations = sum((v - mean) ** 2 for v in x)
    return math.sqrt(squared_deviations / len(x))

In [37]:

def my_standardize(x):
    """Standardize ``x`` to zero mean and unit (population) standard deviation.

    Each value ``v`` is mapped to ``(v - my_mean(x)) / my_std(x)``.

    Args:
        x: Iterable of real numbers. It is materialized once, so one-shot
            iterators and generators are accepted as well as sequences.

    Returns:
        An iterator over the standardized values, in input order. Empty
        input yields nothing. If every value is identical the standard
        deviation is zero, so each value is mapped to 0.0 instead of
        raising ZeroDivisionError.
    """
    values = list(x)
    if not values:
        return iter(())
    # Mean and std are the same for every element: compute them once rather
    # than inside the per-element lambda (my_std itself walks the data, so
    # the per-element form was roughly cubic in len(x)).
    mean = my_mean(values)
    # A zero std (constant input) is replaced by 1.0 so the result is zeros.
    std = my_std(values) or 1.0
    return map(lambda v: (v - mean) / std, values)

In [41]:

# my_standardize returns a lazy iterator; wrap it in list() to display the values.
list(my_standardize(x1))

Out[41]:

[0.0, 1.224744871391589, -1.224744871391589]