{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# cut_and_dummy" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import matplotlib\n", "import matplotlib.pyplot as plt\n", "plt.style.use('ggplot')\n", "\n", "%matplotlib inline" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# http://pandas.pydata.org/pandas-docs/stable/generated/pandas.cut.html\n", "# http://pandas.pydata.org/pandas-docs/stable/reshaping.html#computing-indicator-dummy-variables" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
age
99536
99689
99750
99880
99985
\n", "
" ], "text/plain": [ " age\n", "995 36\n", "996 89\n", "997 50\n", "998 80\n", "999 85" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Seed the global NumPy RNG so the synthetic ages are the same on every run.\n", "np.random.seed(0)\n", "# randint excludes the upper bound, so the ages are drawn from 1..98.\n", "df_for_cut = pd.DataFrame(\n", "    np.random.randint(low=1, high=99, size=1000),\n", "    columns=[\"age\"],\n", ")\n", "df_for_cut.tail()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false, "scrolled": false }, "outputs": [ { "data": { "text/plain": [ "[0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Bin edges every 10 years, from 0 up to and including 100.\n", "bins = [10 * decade for decade in range(11)]\n", "bins" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "['0 - 9',\n", " '10 - 19',\n", " '20 - 29',\n", " '30 - 39',\n", " '40 - 49',\n", " '50 - 59',\n", " '60 - 69',\n", " '70 - 79',\n", " '80 - 89',\n", " '90 - 99']" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# One label per bin: its lower edge and the last whole number below the next edge.\n", "bins_labels = [\n", "    \"{} - {}\".format(lower, upper - 1)\n", "    for lower, upper in zip(bins[:-1], bins[1:])\n", "]\n", "bins_labels" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ageage_groupage_group_rightage_group_label_Fage_group_labels
99536(30, 40][30, 40)330 - 39
99689(80, 90][80, 90)880 - 89
99750(40, 50][50, 60)450 - 59
99880(70, 80][80, 90)780 - 89
99985(80, 90][80, 90)880 - 89
\n", "
" ], "text/plain": [ " age age_group age_group_right age_group_label_F age_group_labels\n", "995 36 (30, 40] [30, 40) 3 30 - 39\n", "996 89 (80, 90] [80, 90) 8 80 - 89\n", "997 50 (40, 50] [50, 60) 4 50 - 59\n", "998 80 (70, 80] [80, 90) 7 80 - 89\n", "999 85 (80, 90] [80, 90) 8 80 - 89" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Default binning: intervals are closed on the right, e.g. (40, 50].\n", "df_for_cut[\"age_group\"] = pd.cut(df_for_cut.age, bins=bins)\n", "# right=False closes the intervals on the left instead, e.g. [50, 60).\n", "df_for_cut[\"age_group_right\"] = pd.cut(df_for_cut.age, bins=bins, right=False)\n", "# labels=False returns the integer position of the (right-closed) bin.\n", "df_for_cut[\"age_group_label_F\"] = pd.cut(df_for_cut.age, bins=bins, labels=False)\n", "# bins_labels name left-closed ranges (50 - 59 starts at 50), so this cut must use\n", "# right=False; with the default right-closed bins, age 50 was labelled 40 - 49.\n", "df_for_cut[\"age_group_labels\"] = pd.cut(df_for_cut.age, bins=bins, labels=bins_labels, right=False)\n", "df_for_cut.tail()\n" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "[(40, 50], (60, 70], (0, 10], (80, 90], (20, 30], (30, 40], (70, 80], (10, 20], (50, 60], (90, 100]]\n", "Categories (10, object): [(0, 10] < (10, 20] < (20, 30] < (30, 40] ... (60, 70] < (70, 80] < (80, 90] < (90, 100]]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_for_cut.age_group.unique()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "array([4, 6, 0, 8, 2, 3, 7, 1, 5, 9])" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_for_cut.age_group_label_F.unique()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ageage_groupage_group_rightage_group_label_Fage_group_labelsage_group_(0, 10]age_group_(10, 20]age_group_(20, 30]age_group_(30, 40]age_group_(40, 50]age_group_(50, 60]age_group_(60, 70]age_group_(70, 80]age_group_(80, 90]age_group_(90, 100]
99536(30, 40][30, 40)330 - 390001000000
99689(80, 90][80, 90)880 - 890000000010
99750(40, 50][50, 60)440 - 490000100000
99880(70, 80][80, 90)770 - 790000000100
99985(80, 90][80, 90)880 - 890000000010
\n", "
" ], "text/plain": [ " age age_group age_group_right age_group_label_F age_group_labels \\\n", "995 36 (30, 40] [30, 40) 3 30 - 39 \n", "996 89 (80, 90] [80, 90) 8 80 - 89 \n", "997 50 (40, 50] [50, 60) 4 40 - 49 \n", "998 80 (70, 80] [80, 90) 7 70 - 79 \n", "999 85 (80, 90] [80, 90) 8 80 - 89 \n", "\n", " age_group_(0, 10] age_group_(10, 20] age_group_(20, 30] \\\n", "995 0 0 0 \n", "996 0 0 0 \n", "997 0 0 0 \n", "998 0 0 0 \n", "999 0 0 0 \n", "\n", " age_group_(30, 40] age_group_(40, 50] age_group_(50, 60] \\\n", "995 1 0 0 \n", "996 0 0 0 \n", "997 0 1 0 \n", "998 0 0 0 \n", "999 0 0 0 \n", "\n", " age_group_(60, 70] age_group_(70, 80] age_group_(80, 90] \\\n", "995 0 0 0 \n", "996 0 0 1 \n", "997 0 0 0 \n", "998 0 1 0 \n", "999 0 0 1 \n", "\n", " age_group_(90, 100] \n", "995 0 \n", "996 0 \n", "997 0 \n", "998 0 \n", "999 0 " ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# One 0/1 indicator column per age_group interval, named age_group_<interval>.\n", "dummies = pd.get_dummies(df_for_cut[\"age_group\"], prefix=\"age_group\")\n", "# dummies shares df_for_cut's index, so an index join appends the columns row by row.\n", "df_for_cut_with_dummies = df_for_cut.join(dummies)\n", "df_for_cut_with_dummies.tail()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
a_Aa_Bb_Cb_D
01010
10101
\n", "
" ], "text/plain": [ " a_A a_B b_C b_D\n", "0 1 0 1 0\n", "1 0 1 0 1" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Each prefix is paired with a column in order: a -> a_A, a_B and b -> b_C, b_D.\n", "pd.get_dummies(\n", "    pd.DataFrame({\"a\": [\"A\", \"B\"], \"b\": [\"C\", \"D\"]}),\n", "    prefix=[\"a\", \"b\"],\n", ")" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0 B\n", "1 NaN\n", "2 a\n", "3 NaN\n", "4 123\n", "5 0.4\n", "6 inf\n", "dtype: object" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Strings, numbers, infinity and missing values mixed in one object Series.\n", "factors = pd.Series(\n", "    [\"B\", np.nan, \"a\", np.nan, 123, 0.4, np.inf]\n", ")\n", "factors" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "(array([ 0, -1, 1, -1, 2, 3, 4]),\n", " Index(['B', 'a', 123, 0.4, inf], dtype='object'))" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Returns (codes, uniques): codes follow first appearance and NaN is coded -1.\n", "factors.factorize()" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "995 (25, 49]\n", "996 (74, 98]\n", "997 (49, 74]\n", "998 (74, 98]\n", "999 (74, 98]\n", "Name: age, dtype: category\n", "Categories (4, object): [[1, 25] < (25, 49] < (49, 74] < (74, 98]]" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# q=4 bins the ages by quartile instead of by fixed edges.\n", "qcuted_4 = pd.qcut(df_for_cut.age, q=4)\n", "qcuted_4.tail()" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "995 (30, 39]\n", "996 (80, 89]\n", "997 (49, 59]\n", "998 (70, 80]\n", "999 (80, 89]\n", "Name: age, dtype: category\n", "Categories (10, object): [[1, 9] < (9, 20] < (20, 30] < (30, 39] ... 
(59, 70] < (70, 80] < (80, 89] < (89, 98]]" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# q=10 bins the ages by decile.\n", "qcuted_10 = pd.qcut(df_for_cut.age, q=10)\n", "qcuted_10.tail()" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "995 (25, 49]\n", "996 (74, 98]\n", "997 (49, 74]\n", "998 (74, 98]\n", "999 (74, 98]\n", "Name: age, dtype: category\n", "Categories (4, object): [[1, 25] < (25, 49] < (49, 74] < (74, 98]]" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Explicit quantile edges; these four equal steps give the same bins as q=4.\n", "q = [0, 0.25, 0.5, 0.75, 1]\n", "qcuted_list = pd.qcut(df_for_cut.age, q=q)\n", "qcuted_list.tail()" ] } ], "metadata": { "kernelspec": { "display_name": "Python [default]", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 0 }