cut_and_dummy¶
In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline
In [2]:
# http://pandas.pydata.org/pandas-docs/stable/generated/pandas.cut.html
# http://pandas.pydata.org/pandas-docs/stable/reshaping.html#computing-indicator-dummy-variables
In [3]:
np.random.seed(0)
df_for_cut = pd.DataFrame(np.random.randint(1, 99, 1000), columns=["age"])
df_for_cut.tail()
Out[3]:
age | |
---|---|
995 | 36 |
996 | 89 |
997 | 50 |
998 | 80 |
999 | 85 |
In [4]:
bins = list(range(0, 100+1, 10))
bins
Out[4]:
[0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
In [5]:
bins_labels = [str(b) + " - " + str(b + 10 - 1) for b in bins[:-1]]
bins_labels
Out[5]:
['0 - 9',
'10 - 19',
'20 - 29',
'30 - 39',
'40 - 49',
'50 - 59',
'60 - 69',
'70 - 79',
'80 - 89',
'90 - 99']
In [6]:
df_for_cut["age_group"] = pd.cut(df_for_cut.age, bins=bins)
df_for_cut["age_group_right"] = pd.cut(df_for_cut.age, bins=bins, right=False)
df_for_cut["age_group_label_F"] = pd.cut(df_for_cut.age, bins=bins, labels=False)
df_for_cut["age_group_labels"] = pd.cut(df_for_cut.age, bins=bins, labels=bins_labels)
df_for_cut.tail()
Out[6]:
age | age_group | age_group_right | age_group_label_F | age_group_labels | |
---|---|---|---|---|---|
995 | 36 | (30, 40] | [30, 40) | 3 | 30 - 39 |
996 | 89 | (80, 90] | [80, 90) | 8 | 80 - 89 |
997 | 50 | (40, 50] | [50, 60) | 4 | 40 - 49 |
998 | 80 | (70, 80] | [80, 90) | 7 | 70 - 79 |
999 | 85 | (80, 90] | [80, 90) | 8 | 80 - 89 |
In [7]:
df_for_cut.age_group.unique()
Out[7]:
[(40, 50], (60, 70], (0, 10], (80, 90], (20, 30], (30, 40], (70, 80], (10, 20], (50, 60], (90, 100]]
Categories (10, object): [(0, 10] < (10, 20] < (20, 30] < (30, 40] ... (60, 70] < (70, 80] < (80, 90] < (90, 100]]
In [8]:
df_for_cut.age_group_label_F.unique()
Out[8]:
array([4, 6, 0, 8, 2, 3, 7, 1, 5, 9])
In [9]:
dummies = pd.get_dummies(df_for_cut['age_group'], prefix='age_group')
df_for_cut_with_dummies = pd.concat([df_for_cut, dummies], axis=1)
df_for_cut_with_dummies.tail()
Out[9]:
age | age_group | age_group_right | age_group_label_F | age_group_labels | age_group_(0, 10] | age_group_(10, 20] | age_group_(20, 30] | age_group_(30, 40] | age_group_(40, 50] | age_group_(50, 60] | age_group_(60, 70] | age_group_(70, 80] | age_group_(80, 90] | age_group_(90, 100] | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
995 | 36 | (30, 40] | [30, 40) | 3 | 30 - 39 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
996 | 89 | (80, 90] | [80, 90) | 8 | 80 - 89 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
997 | 50 | (40, 50] | [50, 60) | 4 | 40 - 49 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
998 | 80 | (70, 80] | [80, 90) | 7 | 70 - 79 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
999 | 85 | (80, 90] | [80, 90) | 8 | 80 - 89 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
In [10]:
pd.get_dummies(pd.DataFrame({"a": list("AB"), "b": list("CD")}), prefix=list("ab"))
Out[10]:
a_A | a_B | b_C | b_D | |
---|---|---|---|---|
0 | 1 | 0 | 1 | 0 |
1 | 0 | 1 | 0 | 1 |
In [11]:
factors = pd.Series(["B", np.nan, "a", np.nan, 123, 0.4, np.inf])
factors
Out[11]:
0 B
1 NaN
2 a
3 NaN
4 123
5 0.4
6 inf
dtype: object
In [12]:
factors.factorize()
Out[12]:
(array([ 0, -1, 1, -1, 2, 3, 4]),
Index(['B', 'a', 123, 0.4, inf], dtype='object'))
In [19]:
qcuted_4 = pd.qcut(df_for_cut["age"], q=4)
qcuted_4.tail()
Out[19]:
995 (25, 49]
996 (74, 98]
997 (49, 74]
998 (74, 98]
999 (74, 98]
Name: age, dtype: category
Categories (4, object): [[1, 25] < (25, 49] < (49, 74] < (74, 98]]
In [20]:
qcuted_10 = pd.qcut(df_for_cut["age"], q=10)
qcuted_10.tail()
Out[20]:
995 (30, 39]
996 (80, 89]
997 (49, 59]
998 (70, 80]
999 (80, 89]
Name: age, dtype: category
Categories (10, object): [[1, 9] < (9, 20] < (20, 30] < (30, 39] ... (59, 70] < (70, 80] < (80, 89] < (89, 98]]
In [22]:
q = [0, .25, .5, .75, 1]
qcuted_list = pd.qcut(df_for_cut["age"], q=q)
qcuted_list.tail()
Out[22]:
995 (25, 49]
996 (74, 98]
997 (49, 74]
998 (74, 98]
999 (74, 98]
Name: age, dtype: category
Categories (4, object): [[1, 25] < (25, 49] < (49, 74] < (74, 98]]