{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# sklearn_processing" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "from sklearn import preprocessing\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "\n", "%matplotlib inline" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import math" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [], "source": [ "x1 = [1, 2, 0]\n", "x2 = [1, -1, 2]" ] }, { "cell_type": "code", "execution_count": 47, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# http://scikit-learn.org/stable/modules/preprocessing.html" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# http://www.dataminingblog.com/standardization-vs-normalization/\n", "# http://blog.pengyifan.com/scale-standardize-and-normalize-data/\n", "# http://webbeginner.hatenablog.com/entry/2014/04/28/214822" ] }, { "cell_type": "code", "execution_count": 46, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "(array([ 0. , 1.22474487, -1.22474487]), 0.0, 0.99999999999999989)" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "scaled = preprocessing.scale(x1)\n", "scaled, scaled.mean(), scaled.std()" ] }, { "cell_type": "code", "execution_count": 50, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "(array([ 0.5, 1. , 0. ]), 0.5, 0.40824829046386302)" ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "min_max_scaled = preprocessing.minmax_scale(x1)\n", "min_max_scaled, min_max_scaled.mean(), min_max_scaled.std()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([ 1, -1, 2])" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.array(x2)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/knt/.pyenv/versions/anaconda3-2.5.0/lib/python3.5/site-packages/sklearn/utils/validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and willraise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.\n", " DeprecationWarning)\n", "/Users/knt/.pyenv/versions/anaconda3-2.5.0/lib/python3.5/site-packages/sklearn/utils/validation.py:420: DataConversionWarning: Data with input dtype int64 was converted to float64 by the normalize function.\n", " warnings.warn(msg, DataConversionWarning)\n" ] }, { "data": { "text/plain": [ "array([[ 0.40824829, -0.40824829, 0.81649658]])" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "preprocessing.normalize(np.array(x2))" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/knt/.pyenv/versions/anaconda3-2.5.0/lib/python3.5/site-packages/sklearn/utils/validation.py:420: DataConversionWarning: Data with input dtype int64 was converted to float64 by the normalize function.\n", " warnings.warn(msg, DataConversionWarning)\n" ] }, { "data": { "text/plain": [ "array([[ 0.40824829, -0.40824829, 0.81649658]])" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "preprocessing.normalize(np.array(x2).reshape(1, -1))" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([[ 0.40824829, -0.40824829, 0.81649658]])" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "preprocessing.normalize(np.array(x2).astype(np.float64).reshape(1, -1))" ] }, { "cell_type": "code", "execution_count": 42, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def my_scale(x):\n", " return map(lambda v: (v - min(x)) / (max(x) - min(x)), x)" ] }, { "cell_type": "code", "execution_count": 43, "metadata": { "collapsed": false, "scrolled": false }, "outputs": [ { "data": { "text/plain": [ "[0.5, 1.0, 0.0]" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "list(my_scale(x1))" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def my_norm(x):\n", " return math.sqrt(sum(map(lambda v: v ** 2, x)))" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def my_normalize(x):\n", " return map(lambda v: v / my_norm(x), x)" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[0.4082482904638631, -0.4082482904638631, 0.8164965809277261]" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "list(my_normalize(x2))" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def my_mean(x):\n", " return sum(x) / len(x)" ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def my_std(x):\n", " return math.sqrt(sum(map(lambda v: (v - my_mean(x)) ** 2, x)) / len(x))" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def my_standardize(x):\n", " return map(lambda v: (v - my_mean(x)) / my_std(x), x)" ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[0.0, 1.224744871391589, -1.224744871391589]" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "list(my_standardize(x1))" ] } ], "metadata": { "hide_input": false, "kernelspec": { "display_name": "Python [default]", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" }, "toc": { "toc_cell": false, "toc_number_sections": true, "toc_threshold": 6, "toc_window_display": false } }, "nbformat": 4, "nbformat_minor": 0 }