{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# DecisionTree" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2017-01-22T06:10:51.045093", "start_time": "2017-01-22T06:10:49.156058" }, "collapsed": true }, "outputs": [], "source": [ "from sklearn import tree" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2017-01-22T06:10:51.058013", "start_time": "2017-01-22T06:10:51.047401" }, "collapsed": true }, "outputs": [], "source": [ "dataset = {\n", " \"天気\": list(\"晴晴曇雨雨雨曇晴晴雨晴曇曇雨\"),\n", " \"温度\": list(\"暑暑暑暖涼涼涼暖涼暖暖暖暑暖\"),\n", " \"湿度\": list(\"高高高高普普普高普普普高普高\"),\n", " \"風\": list(\"無有無無無有有無無無有有無有\"),\n", " \"ゴルフプレイ\": list(\"☓☓◯◯◯☓◯☓◯◯◯◯◯☓\")\n", "}" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "ExecuteTime": { "end_time": "2017-01-22T06:10:52.386496", "start_time": "2017-01-22T06:10:51.061630" }, "collapsed": true }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "ExecuteTime": { "end_time": "2017-01-22T06:10:52.455740", "start_time": "2017-01-22T06:10:52.389165" }, "collapsed": false, "scrolled": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ゴルフプレイ天気温度湿度
0
1
2
3
4
5
6
7
8
9
10
11
12
13
\n", "
" ], "text/plain": [ " ゴルフプレイ 天気 温度 湿度 風\n", "0 ☓ 晴 暑 高 無\n", "1 ☓ 晴 暑 高 有\n", "2 ◯ 曇 暑 高 無\n", "3 ◯ 雨 暖 高 無\n", "4 ◯ 雨 涼 普 無\n", "5 ☓ 雨 涼 普 有\n", "6 ◯ 曇 涼 普 有\n", "7 ☓ 晴 暖 高 無\n", "8 ◯ 晴 涼 普 無\n", "9 ◯ 雨 暖 普 無\n", "10 ◯ 晴 暖 普 有\n", "11 ◯ 曇 暖 高 有\n", "12 ◯ 曇 暑 普 無\n", "13 ☓ 雨 暖 高 有" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame(dataset)\n", "df" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "ExecuteTime": { "end_time": "2017-01-22T06:10:52.484175", "start_time": "2017-01-22T06:10:52.457737" }, "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ゴルフプレイ天気温度湿度
00
10
21
\n", "
" ], "text/plain": [ " ゴルフプレイ 天気 温度 湿度 風\n", "0 0 晴 暑 高 無\n", "1 0 晴 暑 高 有\n", "2 1 曇 暑 高 無" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn import preprocessing\n", "df[\"ゴルフプレイ\"] = preprocessing.label_binarize(df[\"ゴルフプレイ\"], list(\"☓◯\"))\n", "df.head(3)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "ExecuteTime": { "end_time": "2017-01-22T06:10:52.492889", "start_time": "2017-01-22T06:10:52.486817" }, "collapsed": true }, "outputs": [], "source": [ "import math\n", "def entropy(t, f):\n", " if 0 in (t, f):\n", " return 0.0\n", " total = t + f\n", " return -t/total * math.log2(t/total) -f/total * math.log2(f/total)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "ExecuteTime": { "end_time": "2017-01-22T06:10:52.566429", "start_time": "2017-01-22T06:10:52.498969" }, "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ゴルフプレイ01
天気
32
04
23
\n", "
" ], "text/plain": [ "ゴルフプレイ 0 1\n", "天気 \n", "晴 3 2\n", "曇 0 4\n", "雨 2 3" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cross_tab = pd.crosstab(index=df[\"天気\"], columns=df[\"ゴルフプレイ\"])\n", "cross_tab\n" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "ExecuteTime": { "end_time": "2017-01-22T06:10:52.577169", "start_time": "2017-01-22T06:10:52.569736" }, "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "(0.9709505944546686, 0.0, 0.9709505944546686)" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "entropy(2, 3), entropy(4, 0), entropy(3, 2)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "ExecuteTime": { "end_time": "2017-01-22T06:10:52.605166", "start_time": "2017-01-22T06:10:52.581752" }, "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0.6935361388961918" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "5/14*entropy(2, 3) + 4/14*entropy(4, 0) + 5/14*entropy(3, 2)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "ExecuteTime": { "end_time": "2017-01-22T06:10:52.635062", "start_time": "2017-01-22T06:10:52.611821" }, "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "1 9\n", "0 5\n", "Name: ゴルフプレイ, dtype: int64" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# root\n", "df[\"ゴルフプレイ\"].value_counts()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "ExecuteTime": { "end_time": "2017-01-22T06:10:52.651800", "start_time": "2017-01-22T06:10:52.642524" }, "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0.9402859586706311" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "entropy(9, 5)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "ExecuteTime": { "end_time": "2017-01-22T06:10:52.691140", "start_time": "2017-01-22T06:10:52.668917" }, "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[0.9709505944546688, 0.0, 0.9709505944546688, 0.0]" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# using scipy\n", "from scipy import stats\n", "[\n", " stats.entropy([2/5, 3/5], base=2),\n", " stats.entropy([4/4, 0/4]),\n", " stats.entropy([2, 3], base=2),\n", " stats.entropy([4, 0])\n", "]" ] }, { "cell_type": "code", "execution_count": 131, "metadata": { "ExecuteTime": { "end_time": "2017-01-22T14:37:22.098276", "start_time": "2017-01-22T14:37:22.086238" }, "collapsed": false, "scrolled": true }, "outputs": [], "source": [ "def gain(df, feature_name, label_name):\n", " feature_e = 0.0\n", " for _, gdf in df.groupby(feature_name):\n", " feature_e += len(gdf) / len(df) * stats.entropy(gdf[label_name].value_counts(), base=2)\n", " root_e = stats.entropy(df[label_name].value_counts(), base=2)\n", " gain = root_e - feature_e\n", " return gain\n" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "ExecuteTime": { "end_time": "2017-01-22T06:10:52.827845", "start_time": "2017-01-22T06:10:52.718416" }, "collapsed": false, "scrolled": false }, "outputs": [ { "data": { "text/plain": [ "[0.24674981977443899,\n", " 0.029222565658954758,\n", " 0.15183550136234159,\n", " 0.048127030408269378]" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "[\n", " gain(df, \"天気\", \"ゴルフプレイ\"),\n", " gain(df, \"温度\", \"ゴルフプレイ\"),\n", " gain(df, \"湿度\", \"ゴルフプレイ\"),\n", " gain(df, \"風\", \"ゴルフプレイ\"),\n", "]\n" ] }, { "cell_type": "markdown", "metadata": { "ExecuteTime": { "end_time": "2017-01-23T22:08:40.162265", "start_time": "2017-01-23T22:08:40.130457" }, "collapsed": true }, "source": [ "entropy\n", "- https://ja.wikipedia.org/wiki/%E3%82%AB%E3%83%AB%E3%83%90%E3%83%83%E3%82%AF%E3%83%BB%E3%83%A9%E3%82%A4%E3%83%96%E3%83%A9%E3%83%BC%E6%83%85%E5%A0%B1%E9%87%8F\n", "- https://en.wikipedia.org/wiki/Information_gain_in_decision_trees\n", "- https://en.wikipedia.org/wiki/Mutual_information\n", "\n", "\n", "\n", "- http://nzw.hatenablog.jp/entry/2015/09/28/010244\n", "- http://sucrose.hatenablog.com/entry/2013/11/10/001555\n", "- http://surolog.hatenablog.com/entry/2014/12/26/134446\n", "- http://neuralnet.hatenablog.jp/entry/2016/05/17/010816\n", "- " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "ジニ係数\n", "\n", "- http://d.hatena.ne.jp/ksnt/20120218#1332079100" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 141, "metadata": { "ExecuteTime": { "end_time": "2017-01-26T23:33:22.191191", "start_time": "2017-01-26T23:33:22.162830" }, "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "(OrderedDict([('a', 2), ('b', 1)]), OrderedDict([('b', 1), ('a', 2)]))" ] }, "execution_count": 141, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from collections import OrderedDict\n", "o_dict1 = OrderedDict(b=1, a=2)\n", "o_dict2 = OrderedDict()\n", "o_dict2[\"b\"] = 1\n", "o_dict2[\"a\"] = 2\n", "o_dict1, o_dict2" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "hide_input": false, "kernelspec": { "display_name": "Python [default]", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" }, "toc": { "toc_cell": false, "toc_number_sections": true, "toc_threshold": 6, "toc_window_display": false } }, "nbformat": 4, "nbformat_minor": 0 }