{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# cut_and_dummy"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib\n",
"import matplotlib.pyplot as plt\n",
"plt.style.use('ggplot')\n",
"\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# http://pandas.pydata.org/pandas-docs/stable/generated/pandas.cut.html\n",
"# http://pandas.pydata.org/pandas-docs/stable/reshaping.html#computing-indicator-dummy-variables"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"
\n",
" \n",
" \n",
" | \n",
" age | \n",
"
\n",
" \n",
" \n",
" \n",
" 995 | \n",
" 36 | \n",
"
\n",
" \n",
" 996 | \n",
" 89 | \n",
"
\n",
" \n",
" 997 | \n",
" 50 | \n",
"
\n",
" \n",
" 998 | \n",
" 80 | \n",
"
\n",
" \n",
" 999 | \n",
" 85 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" age\n",
"995 36\n",
"996 89\n",
"997 50\n",
"998 80\n",
"999 85"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.random.seed(0)\n",
"df_for_cut = pd.DataFrame(np.random.randint(1, 99, 1000), columns=[\"age\"])\n",
"df_for_cut.tail()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false,
"scrolled": false
},
"outputs": [
{
"data": {
"text/plain": [
"[0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bins = list(range(0, 100+1, 10))\n",
"bins"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"['0 - 9',\n",
" '10 - 19',\n",
" '20 - 29',\n",
" '30 - 39',\n",
" '40 - 49',\n",
" '50 - 59',\n",
" '60 - 69',\n",
" '70 - 79',\n",
" '80 - 89',\n",
" '90 - 99']"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bins_labels = [str(b) + \" - \" + str(b + 10 - 1) for b in bins[:-1]]\n",
"bins_labels"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" age | \n",
" age_group | \n",
" age_group_right | \n",
" age_group_label_F | \n",
" age_group_labels | \n",
"
\n",
" \n",
" \n",
" \n",
" 995 | \n",
" 36 | \n",
" (30, 40] | \n",
" [30, 40) | \n",
" 3 | \n",
" 30 - 39 | \n",
"
\n",
" \n",
" 996 | \n",
" 89 | \n",
" (80, 90] | \n",
" [80, 90) | \n",
" 8 | \n",
" 80 - 89 | \n",
"
\n",
" \n",
" 997 | \n",
" 50 | \n",
" (40, 50] | \n",
" [50, 60) | \n",
" 4 | \n",
" 40 - 49 | \n",
"
\n",
" \n",
" 998 | \n",
" 80 | \n",
" (70, 80] | \n",
" [80, 90) | \n",
" 7 | \n",
" 70 - 79 | \n",
"
\n",
" \n",
" 999 | \n",
" 85 | \n",
" (80, 90] | \n",
" [80, 90) | \n",
" 8 | \n",
" 80 - 89 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" age age_group age_group_right age_group_label_F age_group_labels\n",
"995 36 (30, 40] [30, 40) 3 30 - 39\n",
"996 89 (80, 90] [80, 90) 8 80 - 89\n",
"997 50 (40, 50] [50, 60) 4 40 - 49\n",
"998 80 (70, 80] [80, 90) 7 70 - 79\n",
"999 85 (80, 90] [80, 90) 8 80 - 89"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_for_cut[\"age_group\"] = pd.cut(df_for_cut.age, bins=bins)\n",
"df_for_cut[\"age_group_right\"] = pd.cut(df_for_cut.age, bins=bins, right=False)\n",
"df_for_cut[\"age_group_label_F\"] = pd.cut(df_for_cut.age, bins=bins, labels=False)\n",
"df_for_cut[\"age_group_labels\"] = pd.cut(df_for_cut.age, bins=bins, labels=bins_labels)\n",
"df_for_cut.tail()\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"[(40, 50], (60, 70], (0, 10], (80, 90], (20, 30], (30, 40], (70, 80], (10, 20], (50, 60], (90, 100]]\n",
"Categories (10, object): [(0, 10] < (10, 20] < (20, 30] < (30, 40] ... (60, 70] < (70, 80] < (80, 90] < (90, 100]]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_for_cut.age_group.unique()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"array([4, 6, 0, 8, 2, 3, 7, 1, 5, 9])"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_for_cut.age_group_label_F.unique()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" age | \n",
" age_group | \n",
" age_group_right | \n",
" age_group_label_F | \n",
" age_group_labels | \n",
" age_group_(0, 10] | \n",
" age_group_(10, 20] | \n",
" age_group_(20, 30] | \n",
" age_group_(30, 40] | \n",
" age_group_(40, 50] | \n",
" age_group_(50, 60] | \n",
" age_group_(60, 70] | \n",
" age_group_(70, 80] | \n",
" age_group_(80, 90] | \n",
" age_group_(90, 100] | \n",
"
\n",
" \n",
" \n",
" \n",
" 995 | \n",
" 36 | \n",
" (30, 40] | \n",
" [30, 40) | \n",
" 3 | \n",
" 30 - 39 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 996 | \n",
" 89 | \n",
" (80, 90] | \n",
" [80, 90) | \n",
" 8 | \n",
" 80 - 89 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" 997 | \n",
" 50 | \n",
" (40, 50] | \n",
" [50, 60) | \n",
" 4 | \n",
" 40 - 49 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 998 | \n",
" 80 | \n",
" (70, 80] | \n",
" [80, 90) | \n",
" 7 | \n",
" 70 - 79 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 999 | \n",
" 85 | \n",
" (80, 90] | \n",
" [80, 90) | \n",
" 8 | \n",
" 80 - 89 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" age age_group age_group_right age_group_label_F age_group_labels \\\n",
"995 36 (30, 40] [30, 40) 3 30 - 39 \n",
"996 89 (80, 90] [80, 90) 8 80 - 89 \n",
"997 50 (40, 50] [50, 60) 4 40 - 49 \n",
"998 80 (70, 80] [80, 90) 7 70 - 79 \n",
"999 85 (80, 90] [80, 90) 8 80 - 89 \n",
"\n",
" age_group_(0, 10] age_group_(10, 20] age_group_(20, 30] \\\n",
"995 0 0 0 \n",
"996 0 0 0 \n",
"997 0 0 0 \n",
"998 0 0 0 \n",
"999 0 0 0 \n",
"\n",
" age_group_(30, 40] age_group_(40, 50] age_group_(50, 60] \\\n",
"995 1 0 0 \n",
"996 0 0 0 \n",
"997 0 1 0 \n",
"998 0 0 0 \n",
"999 0 0 0 \n",
"\n",
" age_group_(60, 70] age_group_(70, 80] age_group_(80, 90] \\\n",
"995 0 0 0 \n",
"996 0 0 1 \n",
"997 0 0 0 \n",
"998 0 1 0 \n",
"999 0 0 1 \n",
"\n",
" age_group_(90, 100] \n",
"995 0 \n",
"996 0 \n",
"997 0 \n",
"998 0 \n",
"999 0 "
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dummies = pd.get_dummies(df_for_cut['age_group'], prefix='age_group')\n",
"df_for_cut_with_dummies = pd.concat([df_for_cut, dummies], axis=1)\n",
"df_for_cut_with_dummies.tail()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" a_A | \n",
" a_B | \n",
" b_C | \n",
" b_D | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" a_A a_B b_C b_D\n",
"0 1 0 1 0\n",
"1 0 1 0 1"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.get_dummies(pd.DataFrame({\"a\": list(\"AB\"), \"b\": list(\"CD\")}), prefix=list(\"ab\")) "
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0 B\n",
"1 NaN\n",
"2 a\n",
"3 NaN\n",
"4 123\n",
"5 0.4\n",
"6 inf\n",
"dtype: object"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"factors = pd.Series([\"B\", np.nan, \"a\", np.nan, 123, 0.4, np.inf])\n",
"factors"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(array([ 0, -1, 1, -1, 2, 3, 4]),\n",
" Index(['B', 'a', 123, 0.4, inf], dtype='object'))"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"factors.factorize()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"995 (25, 49]\n",
"996 (74, 98]\n",
"997 (49, 74]\n",
"998 (74, 98]\n",
"999 (74, 98]\n",
"Name: age, dtype: category\n",
"Categories (4, object): [[1, 25] < (25, 49] < (49, 74] < (74, 98]]"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"qcuted_4 = pd.qcut(df_for_cut[\"age\"], q=4)\n",
"qcuted_4.tail()"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"995 (30, 39]\n",
"996 (80, 89]\n",
"997 (49, 59]\n",
"998 (70, 80]\n",
"999 (80, 89]\n",
"Name: age, dtype: category\n",
"Categories (10, object): [[1, 9] < (9, 20] < (20, 30] < (30, 39] ... (59, 70] < (70, 80] < (80, 89] < (89, 98]]"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"qcuted_10 = pd.qcut(df_for_cut[\"age\"], q=10)\n",
"qcuted_10.tail()"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"995 (25, 49]\n",
"996 (74, 98]\n",
"997 (49, 74]\n",
"998 (74, 98]\n",
"999 (74, 98]\n",
"Name: age, dtype: category\n",
"Categories (4, object): [[1, 25] < (25, 49] < (49, 74] < (74, 98]]"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"q = [0, .25, .5, .75, 1]\n",
"qcuted_list = pd.qcut(df_for_cut[\"age\"], q=q)\n",
"qcuted_list.tail()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [default]",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 0
}