window_function¶
In [8]:
import pandas as pd
import numpy as np
import seaborn as sns
In [3]:
pd.show_versions()
INSTALLED VERSIONS
------------------
commit: None
python: 3.5.1.final.0
python-bits: 64
OS: Darwin
OS-release: 15.6.0
machine: x86_64
processor: i386
byteorder: little
LC_ALL: None
LANG: ja_JP.UTF-8
pandas: 0.18.1
nose: 1.3.7
pip: 9.0.1
setuptools: 20.3
Cython: 0.23.4
numpy: 1.10.4
scipy: 0.17.0
statsmodels: 0.6.1
xarray: None
IPython: 5.1.0
sphinx: 1.3.5
patsy: 0.4.0
dateutil: 2.5.1
pytz: 2016.2
blosc: None
bottleneck: 1.0.0
tables: 3.2.2
numexpr: 2.5
matplotlib: 1.5.1
openpyxl: 2.3.2
xlrd: 0.9.4
xlwt: 1.0.0
xlsxwriter: 0.8.4
lxml: 3.6.0
bs4: 4.4.1
html5lib: 0.9999999
httplib2: None
apiclient: None
sqlalchemy: 1.0.12
pymysql: None
psycopg2: None
jinja2: 2.8
boto: 2.39.0
pandas_datareader: 0.2.1
In [12]:
iris = sns.load_dataset("iris")
In [13]:
iris = pd.concat([iris.head(3), iris.ix[50:52], iris.ix[100:102]]).reset_index(drop=True)
iris
Out[13]:
sepal_length | sepal_width | petal_length | petal_width | species | |
---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
3 | 7.0 | 3.2 | 4.7 | 1.4 | versicolor |
4 | 6.4 | 3.2 | 4.5 | 1.5 | versicolor |
5 | 6.9 | 3.1 | 4.9 | 1.5 | versicolor |
6 | 6.3 | 3.3 | 6.0 | 2.5 | virginica |
7 | 5.8 | 2.7 | 5.1 | 1.9 | virginica |
8 | 7.1 | 3.0 | 5.9 | 2.1 | virginica |
In [6]:
pd.concat([
pd.rolling_sum(iris.sepal_length, window=3),
pd.expanding_sum(iris.sepal_length),
iris.groupby("species")["sepal_length"].transform(pd.expanding_sum),
], axis=1).T
/Users/knt/.pyenv/versions/anaconda3-2.5.0/lib/python3.5/site-packages/ipykernel/__main__.py:2: FutureWarning: pd.rolling_sum is deprecated for Series and will be removed in a future version, replace with
Series.rolling(center=False,window=3).sum()
from ipykernel import kernelapp as app
/Users/knt/.pyenv/versions/anaconda3-2.5.0/lib/python3.5/site-packages/ipykernel/__main__.py:3: FutureWarning: pd.expanding_sum is deprecated for Series and will be removed in a future version, replace with
Series.expanding(min_periods=1).sum()
app.launch_new_instance()
/Users/knt/.pyenv/versions/anaconda3-2.5.0/lib/python3.5/site-packages/pandas/core/groupby.py:2746: FutureWarning: pd.expanding_sum is deprecated for Series and will be removed in a future version, replace with
Series.expanding(min_periods=1).sum()
wrapper = lambda x: func(x, *args, **kwargs)
Out[6]:
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | |
---|---|---|---|---|---|---|---|---|---|
sepal_length | NaN | NaN | 14.7 | 16.6 | 18.1 | 20.3 | 19.6 | 19.0 | 19.2 |
sepal_length | 5.1 | 10.0 | 14.7 | 21.7 | 28.1 | 35.0 | 41.3 | 47.1 | 54.2 |
sepal_length | 5.1 | 10.0 | 14.7 | 7.0 | 13.4 | 20.3 | 6.3 | 12.1 | 19.2 |
In [68]:
res = pd.concat([
iris.sepal_length,
iris.sepal_length.shift(),
iris.sepal_length.shift(periods=-1),
iris.sepal_length.cumsum(),
iris.sepal_length.rolling(window=3).sum(),
iris.sepal_length.rolling(window=3, center=True).sum(),
iris.sepal_length.rolling(window=4, center=True).sum(),
iris.sepal_length.expanding().sum(),
iris.groupby("species")["sepal_length"].transform(np.sum),
iris.groupby("species")["sepal_length"].transform(pd.Series.cumsum),
iris.groupby("species").apply(lambda x: x.expanding().sepal_length.sum()).reset_index(drop=True),
iris.groupby("species").expanding().sepal_length.sum().reset_index(level=0, drop=True),
iris.groupby("species").sepal_length.shift(),
iris.groupby("species").sepal_length.shift(periods=-1),
], axis=1)
res.columns = [
"default",
"shift_down",
"shift_up",
"cumsum",
"roll3_sum",
"roll3_sum_c",
"roll4_sum_c", # 偶数の場合, どの範囲になるかの確認のため
"ex_sum", # cumsum
"trans_sum", # SQL Window関数におけるGroup内すべてと同様
"trans_cumsum",# group内cumsum
"g_ex_old_sum",
"g_ex_sum", # group内cumsum
"g_shift_down",# group内shift
"g_shift_up",
]
import tabulate
print(tabulate.tabulate(res, headers="keys", tablefmt="pipe"))
| | default | shift_down | shift_up | cumsum | roll3_sum | roll3_sum_c | roll4_sum_c | ex_sum | trans_sum | trans_cumsum | g_ex_old_sum | g_ex_sum | g_shift_down | g_shift_up |
|---:|----------:|-------------:|-----------:|---------:|------------:|--------------:|--------------:|---------:|------------:|---------------:|---------------:|-----------:|---------------:|-------------:|
| 0 | 5.1 | nan | 4.9 | 5.1 | nan | nan | nan | 5.1 | 14.7 | 5.1 | 5.1 | 5.1 | nan | 4.9 |
| 1 | 4.9 | 5.1 | 4.7 | 10 | nan | 14.7 | nan | 10 | 14.7 | 10 | 10 | 10 | 5.1 | 4.7 |
| 2 | 4.7 | 4.9 | 7 | 14.7 | 14.7 | 16.6 | 21.7 | 14.7 | 14.7 | 14.7 | 14.7 | 14.7 | 4.9 | nan |
| 3 | 7 | 4.7 | 6.4 | 21.7 | 16.6 | 18.1 | 23 | 21.7 | 20.3 | 7 | 7 | 7 | nan | 6.4 |
| 4 | 6.4 | 7 | 6.9 | 28.1 | 18.1 | 20.3 | 25 | 28.1 | 20.3 | 13.4 | 13.4 | 13.4 | 7 | 6.9 |
| 5 | 6.9 | 6.4 | 6.3 | 35 | 20.3 | 19.6 | 26.6 | 35 | 20.3 | 20.3 | 20.3 | 20.3 | 6.4 | nan |
| 6 | 6.3 | 6.9 | 5.8 | 41.3 | 19.6 | 19 | 25.4 | 41.3 | 19.2 | 6.3 | 6.3 | 6.3 | nan | 5.8 |
| 7 | 5.8 | 6.3 | 7.1 | 47.1 | 19 | 19.2 | 26.1 | 47.1 | 19.2 | 12.1 | 12.1 | 12.1 | 6.3 | 7.1 |
| 8 | 7.1 | 5.8 | nan | 54.2 | 19.2 | nan | nan | 54.2 | 19.2 | 19.2 | 19.2 | 19.2 | 5.8 | nan |
In [77]:
s = pd.Series([1,2,2,3,4])
rank_df = pd.DataFrame().assign(value=s)
rank_df = rank_df.assign(
default=s.rank(),
first_rank=s.rank(method="first"),
dense=s.rank(method="dense"),
min_rank=s.rank(method="min"),
max_rank=s.rank(method="max")
).astype(int)
import tabulate
print(tabulate.tabulate(rank_df, headers="keys", tablefmt="pipe"))
rank_df
| | value | default | dense | first_rank | max_rank | min_rank |
|---:|--------:|----------:|--------:|-------------:|-----------:|-----------:|
| 0 | 1 | 1 | 1 | 1 | 1 | 1 |
| 1 | 2 | 2 | 2 | 2 | 3 | 2 |
| 2 | 2 | 2 | 2 | 3 | 3 | 2 |
| 3 | 3 | 4 | 3 | 4 | 4 | 4 |
| 4 | 4 | 5 | 4 | 5 | 5 | 5 |
Out[77]:
value | default | dense | first_rank | max_rank | min_rank | |
---|---|---|---|---|---|---|
0 | 1 | 1 | 1 | 1 | 1 | 1 |
1 | 2 | 2 | 2 | 2 | 3 | 2 |
2 | 2 | 2 | 2 | 3 | 3 | 2 |
3 | 3 | 4 | 3 | 4 | 4 | 4 |
4 | 4 | 5 | 4 | 5 | 5 | 5 |
In [78]:
df = pd.DataFrame(
{
"type": list("aaaabbbbcccc"),
"v": range(12),
}
)
print(df.groupby("type")["v"].apply(lambda x: x[x%2==0].sum()))
def _t(x):
x[x%2!=0] = 0
return x.cumsum()
conditional_df = pd.concat(
[
df,
df["v"].apply(lambda x: x if x%2==0 else 0),
df["v"].apply(lambda x: x if x%2==0 else 0).cumsum(),
df.groupby("type")["v"].transform(lambda x: x[x%2==0].sum()),
df.groupby("type")["v"].transform(lambda x: x[x%2==0].cumsum()),
df.groupby("type")["v"].transform(_t),
], axis=1)
conditional_df.columns = ["type", "v", "奇数は0(以後同様に0)", "cumsum", "group sum", "group cumsum(期待値と異なる)", "group cumsum"]
import tabulate
print(tabulate.tabulate(conditional_df, headers="keys", tablefmt="pipe"))
conditional_df
type
a 2
b 10
c 18
Name: v, dtype: int64
| | type | v | 奇数は0(以後同様に0) | cumsum | group sum | group cumsum(期待値と異なる) | group cumsum |
|---:|:-------|----:|---------------:|---------:|------------:|------------------------:|---------------:|
| 0 | a | 0 | 0 | 0 | 2 | 0 | 0 |
| 1 | a | 1 | 0 | 0 | 2 | 2 | 0 |
| 2 | a | 2 | 2 | 2 | 2 | 0 | 2 |
| 3 | a | 3 | 0 | 2 | 2 | 2 | 2 |
| 4 | b | 4 | 4 | 6 | 10 | 4 | 4 |
| 5 | b | 5 | 0 | 6 | 10 | 10 | 4 |
| 6 | b | 6 | 6 | 12 | 10 | 4 | 10 |
| 7 | b | 7 | 0 | 12 | 10 | 10 | 10 |
| 8 | c | 8 | 8 | 20 | 18 | 8 | 8 |
| 9 | c | 9 | 0 | 20 | 18 | 18 | 8 |
| 10 | c | 10 | 10 | 30 | 18 | 8 | 18 |
| 11 | c | 11 | 0 | 30 | 18 | 18 | 18 |
Out[78]:
type | v | 奇数は0(以後同様に0) | cumsum | group sum | group cumsum(期待値と異なる) | group cumsum | |
---|---|---|---|---|---|---|---|
0 | a | 0 | 0 | 0 | 2 | 0 | 0 |
1 | a | 1 | 0 | 0 | 2 | 2 | 0 |
2 | a | 2 | 2 | 2 | 2 | 0 | 2 |
3 | a | 3 | 0 | 2 | 2 | 2 | 2 |
4 | b | 4 | 4 | 6 | 10 | 4 | 4 |
5 | b | 5 | 0 | 6 | 10 | 10 | 4 |
6 | b | 6 | 6 | 12 | 10 | 4 | 10 |
7 | b | 7 | 0 | 12 | 10 | 10 | 10 |
8 | c | 8 | 8 | 20 | 18 | 8 | 8 |
9 | c | 9 | 0 | 20 | 18 | 18 | 8 |
10 | c | 10 | 10 | 30 | 18 | 8 | 18 |
11 | c | 11 | 0 | 30 | 18 | 18 | 18 |