DecisionTree

In [1]:
from sklearn import tree
In [2]:
dataset = {
    "天気": list("晴晴曇雨雨雨曇晴晴雨晴曇曇雨"),   # weather: 晴 sunny / 曇 overcast / 雨 rain
    "温度": list("暑暑暑暖涼涼涼暖涼暖暖暖暑暖"),   # temperature: 暑 hot / 暖 mild / 涼 cool
    "湿度": list("高高高高普普普高普普普高普高"),   # humidity: 高 high / 普 normal
    "風": list("無有無無無有有無無無有有無有"),     # wind: 無 none / 有 windy
    "ゴルフプレイ": list("☓☓◯◯◯☓◯☓◯◯◯◯◯☓")         # label, play golf: ◯ yes / ☓ no
}
In [3]:
import pandas as pd
import numpy as np
In [4]:
df = pd.DataFrame(dataset)
df
Out[4]:
    ゴルフプレイ 天気 温度 湿度 風
0   ☓            晴   暑   高   無
1   ☓            晴   暑   高   有
2   ◯            曇   暑   高   無
3   ◯            雨   暖   高   無
4   ◯            雨   涼   普   無
5   ☓            雨   涼   普   有
6   ◯            曇   涼   普   有
7   ☓            晴   暖   高   無
8   ◯            晴   涼   普   無
9   ◯            雨   暖   普   無
10  ◯            晴   暖   普   有
11  ◯            曇   暖   高   有
12  ◯            曇   暑   普   無
13  ☓            雨   暖   高   有
In [5]:
from sklearn import preprocessing
df["ゴルフプレイ"] = preprocessing.label_binarize(df["ゴルフプレイ"], list("☓◯"))
df.head(3)
Out[5]:
   ゴルフプレイ 天気 温度 湿度 風
0  0            晴   暑   高   無
1  0            晴   暑   高   有
2  1            曇   暑   高   無
In [6]:
import math
def entropy(t, f):
    """Two-class Shannon entropy (in bits) for t examples of one class and f of the other."""
    if 0 in (t, f):
        return 0.0
    total = t + f
    return -t/total * math.log2(t/total) - f/total * math.log2(f/total)
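For reference, this implements the binary entropy H(t, f) = -(t/n)·log2(t/n) - (f/n)·log2(f/n) with n = t + f; the early return handles the convention 0·log2(0) = 0 when one class is empty.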
In [7]:
cross_tab = pd.crosstab(index=df["天気"], columns=df["ゴルフプレイ"])
cross_tab

Out[7]:
ゴルフプレイ  0  1
天気
晴            3  2
曇            0  4
雨            2  3
In [8]:
# per-branch entropies for the three 天気 values (counts from the cross-tab above)
entropy(2, 3), entropy(4, 0), entropy(3, 2)
Out[8]:
(0.9709505944546686, 0.0, 0.9709505944546686)
In [9]:
# weighted average entropy of ゴルフプレイ over the three 天気 branches (each weighted by its share of the 14 rows)
5/14*entropy(2, 3) + 4/14*entropy(4, 0) + 5/14*entropy(3, 2)
Out[9]:
0.6935361388961918
In [10]:
# class counts at the root node (all 14 rows)
df["ゴルフプレイ"].value_counts()
Out[10]:
1    9
0    5
Name: ゴルフプレイ, dtype: int64
In [11]:
entropy(9, 5)
Out[11]:
0.9402859586706311
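The information gain for splitting on 天気 is the root entropy minus the weighted branch entropy computed in Out[9]:

In [ ]:
# information gain for 天気 = root entropy - weighted entropy after the split
entropy(9, 5) - (5/14*entropy(2, 3) + 4/14*entropy(4, 0) + 5/14*entropy(3, 2))
# ≈ 0.2467, the same value gain(df, "天気", "ゴルフプレイ") returns below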
In [12]:
# the same entropies using scipy: stats.entropy normalizes counts to probabilities (default base is e)
from scipy import stats
[
    stats.entropy([2/5, 3/5], base=2),
    stats.entropy([4/4, 0/4]),
    stats.entropy([2, 3], base=2),
    stats.entropy([4, 0])
]
Out[12]:
[0.9709505944546688, 0.0, 0.9709505944546688, 0.0]
In [131]:
def gain(df, feature_name, label_name):
    """Information gain of splitting df on feature_name with respect to label_name."""
    # weighted average entropy of the label within each branch (feature value)
    feature_e = 0.0
    for _, gdf in df.groupby(feature_name):
        feature_e += len(gdf) / len(df) * stats.entropy(gdf[label_name].value_counts(), base=2)
    # entropy of the label before the split, minus the weighted branch entropy
    root_e = stats.entropy(df[label_name].value_counts(), base=2)
    return root_e - feature_e

In [14]:
[
    gain(df, "天気", "ゴルフプレイ"),
    gain(df, "温度", "ゴルフプレイ"),
    gain(df, "湿度", "ゴルフプレイ"),
    gain(df, "風", "ゴルフプレイ"),
]

Out[14]:
[0.24674981977443899,
 0.029222565658954758,
 0.15183550136234159,
 0.048127030408269378]
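天気 has the largest information gain, so an ID3-style tree would split on it first. As a cross-check with the tree module imported at the top, here is a minimal sketch (not from the original notebook): it one-hot encodes the categorical features with pd.get_dummies, since DecisionTreeClassifier needs numeric input, and uses criterion="entropy" to match the calculation above.

In [ ]:
# minimal sketch: fit sklearn's decision tree on the same data
X = pd.get_dummies(df[["天気", "温度", "湿度", "風"]])  # one-hot encode the categorical features
y = df["ゴルフプレイ"]
clf = tree.DecisionTreeClassifier(criterion="entropy", random_state=0)
clf.fit(X, y)
print(tree.export_text(clf, feature_names=list(X.columns)))

sklearn builds binary (CART-style) splits over the one-hot columns rather than a multiway ID3 split, so the structure differs, but the root split should still land on a 天気 indicator.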

entropy
- https://ja.wikipedia.org/wiki/%E3%82%AB%E3%83%AB%E3%83%90%E3%83%83%E3%82%AF%E3%83%BB%E3%83%A9%E3%82%A4%E3%83%96%E3%83%A9%E3%83%BC%E6%83%85%E5%A0%B1%E9%87%8F
- https://en.wikipedia.org/wiki/Information_gain_in_decision_trees
- https://en.wikipedia.org/wiki/Mutual_information
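The information gain computed above is the mutual information between the feature and the label. As an illustrative check (not in the original), sklearn.metrics.mutual_info_score returns the same quantity in nats, so dividing by log 2 should reproduce gain(df, "天気", "ゴルフプレイ"):

In [ ]:
from sklearn.metrics import mutual_info_score
# mutual_info_score works in nats; convert to bits to compare with gain()
mutual_info_score(df["天気"], df["ゴルフプレイ"]) / np.log(2)
# expected ≈ 0.2467, matching Out[14]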

Gini coefficient (ジニ係数)
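
The section stops at the heading, so here is a minimal sketch (the names gini and gini_gain are illustrative, not from the original) of the Gini criterion used by CART, usually called Gini impurity: gini = 1 - Σ p_i². The gain is computed exactly like gain() above, with entropy replaced by Gini impurity.

In [ ]:
def gini(counts):
    """Gini impurity 1 - sum(p_i^2) for a vector of class counts."""
    p = np.asarray(counts) / np.sum(counts)
    return 1.0 - np.sum(p ** 2)

def gini_gain(df, feature_name, label_name):
    """Drop in Gini impurity obtained by splitting df on feature_name."""
    feature_g = 0.0
    for _, gdf in df.groupby(feature_name):
        feature_g += len(gdf) / len(df) * gini(gdf[label_name].value_counts())
    return gini(df[label_name].value_counts()) - feature_g

[gini_gain(df, col, "ゴルフプレイ") for col in ["天気", "温度", "湿度", "風"]]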

In [141]:
from collections import OrderedDict
# keyword arguments reach OrderedDict as a plain dict, which did not preserve
# insertion order before Python 3.6, so o_dict1's order is not guaranteed here
o_dict1 = OrderedDict(b=1, a=2)
# inserting keys one by one always preserves their order
o_dict2 = OrderedDict()
o_dict2["b"] = 1
o_dict2["a"] = 2
o_dict1, o_dict2
Out[141]:
(OrderedDict([('a', 2), ('b', 1)]), OrderedDict([('b', 1), ('a', 2)]))
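On Python 3.6 and later, keyword arguments preserve their order (PEP 468), and from 3.7 plain dicts do as well, so on a modern interpreter both constructions would come out in ('b', 1), ('a', 2) order; the output above reflects an older interpreter.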