cut_and_dummy¶

In [1]:

import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
plt.style.use('ggplot')

%matplotlib inline

In [2]:

# http://pandas.pydata.org/pandas-docs/stable/generated/pandas.cut.html
# http://pandas.pydata.org/pandas-docs/stable/reshaping.html#computing-indicator-dummy-variables

In [3]:

np.random.seed(0)
df_for_cut = pd.DataFrame(np.random.randint(1, 99, 1000), columns=["age"])
df_for_cut.tail()

Out[3]:

	age
995	36
996	89
997	50
998	80
999	85

In [4]:

bins = list(range(0, 100+1, 10))
bins

Out[4]:

[0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

In [5]:

bins_labels = [str(b) + " - " + str(b + 10 - 1) for b in bins[:-1]]
bins_labels

Out[5]:

['0 - 9',
 '10 - 19',
 '20 - 29',
 '30 - 39',
 '40 - 49',
 '50 - 59',
 '60 - 69',
 '70 - 79',
 '80 - 89',
 '90 - 99']

In [6]:

df_for_cut["age_group"] = pd.cut(df_for_cut.age, bins=bins)
df_for_cut["age_group_right"] = pd.cut(df_for_cut.age, bins=bins, right=False)
df_for_cut["age_group_label_F"] = pd.cut(df_for_cut.age, bins=bins, labels=False)
df_for_cut["age_group_labels"] = pd.cut(df_for_cut.age, bins=bins, labels=bins_labels)
df_for_cut.tail()

Out[6]:

	age	age_group	age_group_right	age_group_label_F	age_group_labels
995	36	(30, 40]	[30, 40)	3	30 - 39
996	89	(80, 90]	[80, 90)	8	80 - 89
997	50	(40, 50]	[50, 60)	4	40 - 49
998	80	(70, 80]	[80, 90)	7	70 - 79
999	85	(80, 90]	[80, 90)	8	80 - 89

In [7]:

df_for_cut.age_group.unique()

Out[7]:

[(40, 50], (60, 70], (0, 10], (80, 90], (20, 30], (30, 40], (70, 80], (10, 20], (50, 60], (90, 100]]
Categories (10, object): [(0, 10] < (10, 20] < (20, 30] < (30, 40] ... (60, 70] < (70, 80] < (80, 90] < (90, 100]]

In [8]:

df_for_cut.age_group_label_F.unique()

Out[8]:

array([4, 6, 0, 8, 2, 3, 7, 1, 5, 9])

In [9]:

dummies = pd.get_dummies(df_for_cut['age_group'], prefix='age_group')
df_for_cut_with_dummies = pd.concat([df_for_cut, dummies], axis=1)
df_for_cut_with_dummies.tail()

Out[9]:

	age	age_group	age_group_right	age_group_label_F	age_group_labels	age_group_(30, 40]	age_group_(40, 50]	age_group_(70, 80]	age_group_(80, 90]
995	36	(30, 40]	[30, 40)	3	30 - 39	1	0	0	0
996	89	(80, 90]	[80, 90)	8	80 - 89	0	0	0	1
997	50	(40, 50]	[50, 60)	4	40 - 49	0	1	0	0
998	80	(70, 80]	[80, 90)	7	70 - 79	0	0	1	0
999	85	(80, 90]	[80, 90)	8	80 - 89	0	0	0	1

In [10]:

pd.get_dummies(pd.DataFrame({"a": list("AB"), "b": list("CD")}), prefix=list("ab"))

Out[10]:

	a_A	a_B	b_C	b_D
0	1	0	1	0
1	0	1	0	1

In [11]:

factors = pd.Series(["B", np.nan, "a", np.nan, 123, 0.4, np.inf])
factors

Out[11]:

    B
  NaN
    a
  NaN
  123
  0.4
  inf
dtype: object

In [12]:

factors.factorize()

Out[12]:

(array([ 0, -1,  1, -1,  2,  3,  4]),
 Index(['B', 'a', 123, 0.4, inf], dtype='object'))

In [19]:

qcuted_4 = pd.qcut(df_for_cut["age"], q=4)
qcuted_4.tail()

Out[19]:

  (25, 49]
  (74, 98]
  (49, 74]
  (74, 98]
  (74, 98]
Name: age, dtype: category
Categories (4, object): [[1, 25] < (25, 49] < (49, 74] < (74, 98]]

In [20]:

qcuted_10 = pd.qcut(df_for_cut["age"], q=10)
qcuted_10.tail()

Out[20]:

  (30, 39]
  (80, 89]
  (49, 59]
  (70, 80]
  (80, 89]
Name: age, dtype: category
Categories (10, object): [[1, 9] < (9, 20] < (20, 30] < (30, 39] ... (59, 70] < (70, 80] < (80, 89] < (89, 98]]

In [22]:

q = [0, .25, .5, .75, 1]
qcuted_list = pd.qcut(df_for_cut["age"], q=q)
qcuted_list.tail()

Out[22]:

  (25, 49]
  (74, 98]
  (49, 74]
  (74, 98]
  (74, 98]
Name: age, dtype: category
Categories (4, object): [[1, 25] < (25, 49] < (49, 74] < (74, 98]]