{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# DecisionTree"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"ExecuteTime": {
"end_time": "2017-01-22T06:10:51.045093",
"start_time": "2017-01-22T06:10:49.156058"
},
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn import tree"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"ExecuteTime": {
"end_time": "2017-01-22T06:10:51.058013",
"start_time": "2017-01-22T06:10:51.047401"
},
"collapsed": true
},
"outputs": [],
"source": [
"# Toy \"play golf\" table: every string packs one single-character category per row (14 rows).\n",
"dataset = {\n",
"    column: list(packed_values)\n",
"    for column, packed_values in (\n",
"        (\"天気\", \"晴晴曇雨雨雨曇晴晴雨晴曇曇雨\"),\n",
"        (\"温度\", \"暑暑暑暖涼涼涼暖涼暖暖暖暑暖\"),\n",
"        (\"湿度\", \"高高高高普普普高普普普高普高\"),\n",
"        (\"風\", \"無有無無無有有無無無有有無有\"),\n",
"        (\"ゴルフプレイ\", \"☓☓◯◯◯☓◯☓◯◯◯◯◯☓\"),\n",
"    )\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"ExecuteTime": {
"end_time": "2017-01-22T06:10:52.386496",
"start_time": "2017-01-22T06:10:51.061630"
},
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"ExecuteTime": {
"end_time": "2017-01-22T06:10:52.455740",
"start_time": "2017-01-22T06:10:52.389165"
},
"collapsed": false,
"scrolled": false
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"
\n",
" \n",
" \n",
" | \n",
" ゴルフプレイ | \n",
" 天気 | \n",
" 温度 | \n",
" 湿度 | \n",
" 風 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" ☓ | \n",
" 晴 | \n",
" 暑 | \n",
" 高 | \n",
" 無 | \n",
"
\n",
" \n",
" 1 | \n",
" ☓ | \n",
" 晴 | \n",
" 暑 | \n",
" 高 | \n",
" 有 | \n",
"
\n",
" \n",
" 2 | \n",
" ◯ | \n",
" 曇 | \n",
" 暑 | \n",
" 高 | \n",
" 無 | \n",
"
\n",
" \n",
" 3 | \n",
" ◯ | \n",
" 雨 | \n",
" 暖 | \n",
" 高 | \n",
" 無 | \n",
"
\n",
" \n",
" 4 | \n",
" ◯ | \n",
" 雨 | \n",
" 涼 | \n",
" 普 | \n",
" 無 | \n",
"
\n",
" \n",
" 5 | \n",
" ☓ | \n",
" 雨 | \n",
" 涼 | \n",
" 普 | \n",
" 有 | \n",
"
\n",
" \n",
" 6 | \n",
" ◯ | \n",
" 曇 | \n",
" 涼 | \n",
" 普 | \n",
" 有 | \n",
"
\n",
" \n",
" 7 | \n",
" ☓ | \n",
" 晴 | \n",
" 暖 | \n",
" 高 | \n",
" 無 | \n",
"
\n",
" \n",
" 8 | \n",
" ◯ | \n",
" 晴 | \n",
" 涼 | \n",
" 普 | \n",
" 無 | \n",
"
\n",
" \n",
" 9 | \n",
" ◯ | \n",
" 雨 | \n",
" 暖 | \n",
" 普 | \n",
" 無 | \n",
"
\n",
" \n",
" 10 | \n",
" ◯ | \n",
" 晴 | \n",
" 暖 | \n",
" 普 | \n",
" 有 | \n",
"
\n",
" \n",
" 11 | \n",
" ◯ | \n",
" 曇 | \n",
" 暖 | \n",
" 高 | \n",
" 有 | \n",
"
\n",
" \n",
" 12 | \n",
" ◯ | \n",
" 曇 | \n",
" 暑 | \n",
" 普 | \n",
" 無 | \n",
"
\n",
" \n",
" 13 | \n",
" ☓ | \n",
" 雨 | \n",
" 暖 | \n",
" 高 | \n",
" 有 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" ゴルフプレイ 天気 温度 湿度 風\n",
"0 ☓ 晴 暑 高 無\n",
"1 ☓ 晴 暑 高 有\n",
"2 ◯ 曇 暑 高 無\n",
"3 ◯ 雨 暖 高 無\n",
"4 ◯ 雨 涼 普 無\n",
"5 ☓ 雨 涼 普 有\n",
"6 ◯ 曇 涼 普 有\n",
"7 ☓ 晴 暖 高 無\n",
"8 ◯ 晴 涼 普 無\n",
"9 ◯ 雨 暖 普 無\n",
"10 ◯ 晴 暖 普 有\n",
"11 ◯ 曇 暖 高 有\n",
"12 ◯ 曇 暑 普 無\n",
"13 ☓ 雨 暖 高 有"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# One DataFrame column per key of `dataset`; the bare name renders the whole 14-row table.\n",
"df = pd.DataFrame(data=dataset)\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"ExecuteTime": {
"end_time": "2017-01-22T06:10:52.484175",
"start_time": "2017-01-22T06:10:52.457737"
},
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" ゴルフプレイ | \n",
" 天気 | \n",
" 温度 | \n",
" 湿度 | \n",
" 風 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0 | \n",
" 晴 | \n",
" 暑 | \n",
" 高 | \n",
" 無 | \n",
"
\n",
" \n",
" 1 | \n",
" 0 | \n",
" 晴 | \n",
" 暑 | \n",
" 高 | \n",
" 有 | \n",
"
\n",
" \n",
" 2 | \n",
" 1 | \n",
" 曇 | \n",
" 暑 | \n",
" 高 | \n",
" 無 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" ゴルフプレイ 天気 温度 湿度 風\n",
"0 0 晴 暑 高 無\n",
"1 0 晴 暑 高 有\n",
"2 1 曇 暑 高 無"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn import preprocessing\n",
"# Encode the label as 0 (☓) / 1 (◯). `classes` is passed by keyword because newer\n",
"# scikit-learn releases no longer accept it positionally, and ravel() flattens the\n",
"# (n_samples, 1) array that label_binarize returns for a two-class problem so that a\n",
"# plain 1-D column is assigned.\n",
"# NOTE: the column is overwritten in place, so re-running this cell would try to\n",
"# re-encode values that are already 0/1.\n",
"df[\"ゴルフプレイ\"] = preprocessing.label_binarize(df[\"ゴルフプレイ\"], classes=list(\"☓◯\")).ravel()\n",
"df.head(3)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"ExecuteTime": {
"end_time": "2017-01-22T06:10:52.492889",
"start_time": "2017-01-22T06:10:52.486817"
},
"collapsed": true
},
"outputs": [],
"source": [
"import math\n",
"def entropy(t, f):\n",
"    \"\"\"Return the two-class entropy, in bits, for the counts ``t`` and ``f``.\n",
"\n",
"    When either count is zero the split is pure and the result is 0.0.\n",
"    \"\"\"\n",
"    if t == 0 or f == 0:\n",
"        return 0.0\n",
"    n = t + f\n",
"    p_t, p_f = t / n, f / n\n",
"    return -p_t * math.log2(p_t) - p_f * math.log2(p_f)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"ExecuteTime": {
"end_time": "2017-01-22T06:10:52.566429",
"start_time": "2017-01-22T06:10:52.498969"
},
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" ゴルフプレイ | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 天気 | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 晴 | \n",
" 3 | \n",
" 2 | \n",
"
\n",
" \n",
" 曇 | \n",
" 0 | \n",
" 4 | \n",
"
\n",
" \n",
" 雨 | \n",
" 2 | \n",
" 3 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
"ゴルフプレイ 0 1\n",
"天気 \n",
"晴 3 2\n",
"曇 0 4\n",
"雨 2 3"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Rows: 天気 categories; columns: the encoded ゴルフプレイ label (0 / 1); cells: row counts.\n",
"cross_tab = pd.crosstab(df[\"天気\"], df[\"ゴルフプレイ\"])\n",
"cross_tab\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"ExecuteTime": {
"end_time": "2017-01-22T06:10:52.577169",
"start_time": "2017-01-22T06:10:52.569736"
},
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(0.9709505944546686, 0.0, 0.9709505944546686)"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Entropy of each 天気 branch, using the count pairs read off the crosstab.\n",
"tuple(entropy(t, f) for t, f in [(2, 3), (4, 0), (3, 2)])"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"ExecuteTime": {
"end_time": "2017-01-22T06:10:52.605166",
"start_time": "2017-01-22T06:10:52.581752"
},
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0.6935361388961918"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Size-weighted mean of the branch entropies: the branches hold 5, 4 and 5 of the 14 rows.\n",
"(\n",
"    5/14 * entropy(2, 3)\n",
"    + 4/14 * entropy(4, 0)\n",
"    + 5/14 * entropy(3, 2)\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"ExecuteTime": {
"end_time": "2017-01-22T06:10:52.635062",
"start_time": "2017-01-22T06:10:52.611821"
},
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"1 9\n",
"0 5\n",
"Name: ゴルフプレイ, dtype: int64"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Class balance at the root node, i.e. before any split.\n",
"df.loc[:, \"ゴルフプレイ\"].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"ExecuteTime": {
"end_time": "2017-01-22T06:10:52.651800",
"start_time": "2017-01-22T06:10:52.642524"
},
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0.9402859586706311"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Entropy at the root, from the 9 vs 5 label counts reported by value_counts().\n",
"entropy(t=9, f=5)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"ExecuteTime": {
"end_time": "2017-01-22T06:10:52.691140",
"start_time": "2017-01-22T06:10:52.668917"
},
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[0.9709505944546688, 0.0, 0.9709505944546688, 0.0]"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Same entropies via scipy.stats.entropy. base=2 is passed on every call so all four\n",
"# results are in bits, like entropy(); previously the two pure-split calls used the\n",
"# default natural log and only agreed because their entropy is zero.\n",
"# scipy normalises its input, so raw counts and probabilities give the same value.\n",
"from scipy import stats\n",
"[\n",
"    stats.entropy([2/5, 3/5], base=2),\n",
"    stats.entropy([4/4, 0/4], base=2),\n",
"    stats.entropy([2, 3], base=2),\n",
"    stats.entropy([4, 0], base=2),\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 131,
"metadata": {
"ExecuteTime": {
"end_time": "2017-01-22T14:37:22.098276",
"start_time": "2017-01-22T14:37:22.086238"
},
"collapsed": false,
"scrolled": true
},
"outputs": [],
"source": [
"def gain(df, feature_name, label_name):\n",
"    \"\"\"Information gain, in bits, of splitting ``df`` on ``feature_name``.\n",
"\n",
"    Computed as the entropy of ``label_name`` over the whole frame minus the\n",
"    size-weighted entropy of ``label_name`` inside each ``feature_name`` group.\n",
"    \"\"\"\n",
"    def label_entropy(frame):\n",
"        # Raw class counts are enough: scipy normalises them into probabilities.\n",
"        return stats.entropy(frame[label_name].value_counts(), base=2)\n",
"\n",
"    n_rows = len(df)\n",
"    weighted_child_entropy = 0.0\n",
"    for _, subset in df.groupby(feature_name):\n",
"        weighted_child_entropy += len(subset) / n_rows * label_entropy(subset)\n",
"    return label_entropy(df) - weighted_child_entropy\n"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"ExecuteTime": {
"end_time": "2017-01-22T06:10:52.827845",
"start_time": "2017-01-22T06:10:52.718416"
},
"collapsed": false,
"scrolled": false
},
"outputs": [
{
"data": {
"text/plain": [
"[0.24674981977443899,\n",
" 0.029222565658954758,\n",
" 0.15183550136234159,\n",
" 0.048127030408269378]"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Information gain of each candidate split feature with respect to the ゴルフプレイ label.\n",
"[gain(df, feature, \"ゴルフプレイ\") for feature in (\"天気\", \"温度\", \"湿度\", \"風\")]\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"ExecuteTime": {
"end_time": "2017-01-23T22:08:40.162265",
"start_time": "2017-01-23T22:08:40.130457"
},
"collapsed": true
},
"source": [
"entropy\n",
"- https://ja.wikipedia.org/wiki/%E3%82%AB%E3%83%AB%E3%83%90%E3%83%83%E3%82%AF%E3%83%BB%E3%83%A9%E3%82%A4%E3%83%96%E3%83%A9%E3%83%BC%E6%83%85%E5%A0%B1%E9%87%8F\n",
"- https://en.wikipedia.org/wiki/Information_gain_in_decision_trees\n",
"- https://en.wikipedia.org/wiki/Mutual_information\n",
"\n",
"\n",
"\n",
"- http://nzw.hatenablog.jp/entry/2015/09/28/010244\n",
"- http://sucrose.hatenablog.com/entry/2013/11/10/001555\n",
"- http://surolog.hatenablog.com/entry/2014/12/26/134446\n",
"- http://neuralnet.hatenablog.jp/entry/2016/05/17/010816"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"ジニ係数\n",
"\n",
"- http://d.hatena.ne.jp/ksnt/20120218#1332079100"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 141,
"metadata": {
"ExecuteTime": {
"end_time": "2017-01-26T23:33:22.191191",
"start_time": "2017-01-26T23:33:22.162830"
},
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(OrderedDict([('a', 2), ('b', 1)]), OrderedDict([('b', 1), ('a', 2)]))"
]
},
"execution_count": 141,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from collections import OrderedDict\n",
"# Keyword arguments reach OrderedDict through **kwargs, whose order is not preserved on\n",
"# this kernel (Python 3.5), so o_dict1 need not come out as b, a. Passing the pairs as an\n",
"# ordered sequence keeps them in the order written.\n",
"o_dict1 = OrderedDict(b=1, a=2)\n",
"o_dict2 = OrderedDict([(\"b\", 1), (\"a\", 2)])\n",
"o_dict1, o_dict2"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"hide_input": false,
"kernelspec": {
"display_name": "Python [default]",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
},
"toc": {
"toc_cell": false,
"toc_number_sections": true,
"toc_threshold": 6,
"toc_window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 0
}