window_function¶

In [8]:

import pandas as pd
import numpy as np
import seaborn as sns

In [3]:

pd.show_versions()


INSTALLED VERSIONS
------------------
commit: None
python: 3.5.1.final.0
python-bits: 64
OS: Darwin
OS-release: 15.6.0
machine: x86_64
processor: i386
byteorder: little
LC_ALL: None
LANG: ja_JP.UTF-8

pandas: 0.18.1
nose: 1.3.7
pip: 9.0.1
setuptools: 20.3
Cython: 0.23.4
numpy: 1.10.4
scipy: 0.17.0
statsmodels: 0.6.1
xarray: None
IPython: 5.1.0
sphinx: 1.3.5
patsy: 0.4.0
dateutil: 2.5.1
pytz: 2016.2
blosc: None
bottleneck: 1.0.0
tables: 3.2.2
numexpr: 2.5
matplotlib: 1.5.1
openpyxl: 2.3.2
xlrd: 0.9.4
xlwt: 1.0.0
xlsxwriter: 0.8.4
lxml: 3.6.0
bs4: 4.4.1
html5lib: 0.9999999
httplib2: None
apiclient: None
sqlalchemy: 1.0.12
pymysql: None
psycopg2: None
jinja2: 2.8
boto: 2.39.0
pandas_datareader: 0.2.1

In [12]:

iris = sns.load_dataset("iris")

In [13]:

iris = pd.concat([iris.head(3), iris.ix[50:52], iris.ix[100:102]]).reset_index(drop=True)
iris

Out[13]:

	sepal_length	sepal_width	petal_length	petal_width	species
0	5.1	3.5	1.4	0.2	setosa
1	4.9	3.0	1.4	0.2	setosa
2	4.7	3.2	1.3	0.2	setosa
3	7.0	3.2	4.7	1.4	versicolor
4	6.4	3.2	4.5	1.5	versicolor
5	6.9	3.1	4.9	1.5	versicolor
6	6.3	3.3	6.0	2.5	virginica
7	5.8	2.7	5.1	1.9	virginica
8	7.1	3.0	5.9	2.1	virginica

In [6]:

pd.concat([
    pd.rolling_sum(iris.sepal_length, window=3),
    pd.expanding_sum(iris.sepal_length),
    iris.groupby("species")["sepal_length"].transform(pd.expanding_sum),
], axis=1).T

/Users/knt/.pyenv/versions/anaconda3-2.5.0/lib/python3.5/site-packages/ipykernel/__main__.py:2: FutureWarning: pd.rolling_sum is deprecated for Series and will be removed in a future version, replace with
        Series.rolling(center=False,window=3).sum()
  from ipykernel import kernelapp as app
/Users/knt/.pyenv/versions/anaconda3-2.5.0/lib/python3.5/site-packages/ipykernel/__main__.py:3: FutureWarning: pd.expanding_sum is deprecated for Series and will be removed in a future version, replace with
        Series.expanding(min_periods=1).sum()
  app.launch_new_instance()
/Users/knt/.pyenv/versions/anaconda3-2.5.0/lib/python3.5/site-packages/pandas/core/groupby.py:2746: FutureWarning: pd.expanding_sum is deprecated for Series and will be removed in a future version, replace with
        Series.expanding(min_periods=1).sum()
  wrapper = lambda x: func(x, *args, **kwargs)

Out[6]:

	0	1	2	3	4	5	6	7	8
sepal_length	NaN	NaN	14.7	16.6	18.1	20.3	19.6	19.0	19.2
sepal_length	5.1	10.0	14.7	21.7	28.1	35.0	41.3	47.1	54.2
sepal_length	5.1	10.0	14.7	7.0	13.4	20.3	6.3	12.1	19.2

In [68]:

res = pd.concat([
        iris.sepal_length,
        iris.sepal_length.shift(),
        iris.sepal_length.shift(periods=-1),
        iris.sepal_length.cumsum(),
        iris.sepal_length.rolling(window=3).sum(),
        iris.sepal_length.rolling(window=3, center=True).sum(),
        iris.sepal_length.rolling(window=4, center=True).sum(),
        iris.sepal_length.expanding().sum(),
        iris.groupby("species")["sepal_length"].transform(np.sum),
        iris.groupby("species")["sepal_length"].transform(pd.Series.cumsum),
        iris.groupby("species").apply(lambda x: x.expanding().sepal_length.sum()).reset_index(drop=True),
        iris.groupby("species").expanding().sepal_length.sum().reset_index(level=0, drop=True),
        iris.groupby("species").sepal_length.shift(),
        iris.groupby("species").sepal_length.shift(periods=-1),
    ], axis=1)
res.columns = [
    "default",
    "shift_down",
    "shift_up",
    "cumsum",
    "roll3_sum",
    "roll3_sum_c",
    "roll4_sum_c", # 偶数の場合, どの範囲になるかの確認のため
    "ex_sum",      # cumsum
    "trans_sum",   # SQL Window関数におけるGroup内すべてと同様
    "trans_cumsum",# group内cumsum
    "g_ex_old_sum",
    "g_ex_sum",    # group内cumsum
    "g_shift_down",# group内shift
    "g_shift_up",
]

import tabulate
print(tabulate.tabulate(res, headers="keys", tablefmt="pipe"))

|    |   default |   shift_down |   shift_up |   cumsum |   roll3_sum |   roll3_sum_c |   roll4_sum_c |   ex_sum |   trans_sum |   trans_cumsum |   g_ex_old_sum |   g_ex_sum |   g_shift_down |   g_shift_up |
|---:|----------:|-------------:|-----------:|---------:|------------:|--------------:|--------------:|---------:|------------:|---------------:|---------------:|-----------:|---------------:|-------------:|
|  0 |       5.1 |        nan   |        4.9 |      5.1 |       nan   |         nan   |         nan   |      5.1 |        14.7 |            5.1 |            5.1 |        5.1 |          nan   |          4.9 |
|  1 |       4.9 |          5.1 |        4.7 |     10   |       nan   |          14.7 |         nan   |     10   |        14.7 |           10   |           10   |       10   |            5.1 |          4.7 |
|  2 |       4.7 |          4.9 |        7   |     14.7 |        14.7 |          16.6 |          21.7 |     14.7 |        14.7 |           14.7 |           14.7 |       14.7 |            4.9 |        nan   |
|  3 |       7   |          4.7 |        6.4 |     21.7 |        16.6 |          18.1 |          23   |     21.7 |        20.3 |            7   |            7   |        7   |          nan   |          6.4 |
|  4 |       6.4 |          7   |        6.9 |     28.1 |        18.1 |          20.3 |          25   |     28.1 |        20.3 |           13.4 |           13.4 |       13.4 |            7   |          6.9 |
|  5 |       6.9 |          6.4 |        6.3 |     35   |        20.3 |          19.6 |          26.6 |     35   |        20.3 |           20.3 |           20.3 |       20.3 |            6.4 |        nan   |
|  6 |       6.3 |          6.9 |        5.8 |     41.3 |        19.6 |          19   |          25.4 |     41.3 |        19.2 |            6.3 |            6.3 |        6.3 |          nan   |          5.8 |
|  7 |       5.8 |          6.3 |        7.1 |     47.1 |        19   |          19.2 |          26.1 |     47.1 |        19.2 |           12.1 |           12.1 |       12.1 |            6.3 |          7.1 |
|  8 |       7.1 |          5.8 |      nan   |     54.2 |        19.2 |         nan   |         nan   |     54.2 |        19.2 |           19.2 |           19.2 |       19.2 |            5.8 |        nan   |

In [77]:

s = pd.Series([1,2,2,3,4])
rank_df = pd.DataFrame().assign(value=s)
rank_df = rank_df.assign(
    default=s.rank(),
    first_rank=s.rank(method="first"),
    dense=s.rank(method="dense"),
    min_rank=s.rank(method="min"),
    max_rank=s.rank(method="max")
).astype(int)

import tabulate
print(tabulate.tabulate(rank_df, headers="keys", tablefmt="pipe"))
rank_df

|    |   value |   default |   dense |   first_rank |   max_rank |   min_rank |
|---:|--------:|----------:|--------:|-------------:|-----------:|-----------:|
|  0 |       1 |         1 |       1 |            1 |          1 |          1 |
|  1 |       2 |         2 |       2 |            2 |          3 |          2 |
|  2 |       2 |         2 |       2 |            3 |          3 |          2 |
|  3 |       3 |         4 |       3 |            4 |          4 |          4 |
|  4 |       4 |         5 |       4 |            5 |          5 |          5 |

Out[77]:

	value	default	dense	first_rank	max_rank	min_rank
0	1	1	1	1	1	1
1	2	2	2	2	3	2
2	2	2	2	3	3	2
3	3	4	3	4	4	4
4	4	5	4	5	5	5

In [78]:

df = pd.DataFrame(
    {
        "type": list("aaaabbbbcccc"),
        "v": range(12),
    }
)
print(df.groupby("type")["v"].apply(lambda x: x[x%2==0].sum()))

def _t(x):
    x[x%2!=0] = 0
    return x.cumsum()

conditional_df = pd.concat(
    [
        df,
        df["v"].apply(lambda x: x if x%2==0 else 0),
        df["v"].apply(lambda x: x if x%2==0 else 0).cumsum(),
        df.groupby("type")["v"].transform(lambda x: x[x%2==0].sum()),
        df.groupby("type")["v"].transform(lambda x: x[x%2==0].cumsum()),
        df.groupby("type")["v"].transform(_t),
    ], axis=1)
conditional_df.columns = ["type", "v", "奇数は0(以後同様に0)", "cumsum", "group sum",  "group cumsum(期待値と異なる)", "group cumsum"]

import tabulate
print(tabulate.tabulate(conditional_df, headers="keys", tablefmt="pipe"))
conditional_df

type
a     2
b    10
c    18
Name: v, dtype: int64
|    | type   |   v |   奇数は0(以後同様に0) |   cumsum |   group sum |   group cumsum(期待値と異なる) |   group cumsum |
|---:|:-------|----:|---------------:|---------:|------------:|------------------------:|---------------:|
|  0 | a      |   0 |              0 |        0 |           2 |                       0 |              0 |
|  1 | a      |   1 |              0 |        0 |           2 |                       2 |              0 |
|  2 | a      |   2 |              2 |        2 |           2 |                       0 |              2 |
|  3 | a      |   3 |              0 |        2 |           2 |                       2 |              2 |
|  4 | b      |   4 |              4 |        6 |          10 |                       4 |              4 |
|  5 | b      |   5 |              0 |        6 |          10 |                      10 |              4 |
|  6 | b      |   6 |              6 |       12 |          10 |                       4 |             10 |
|  7 | b      |   7 |              0 |       12 |          10 |                      10 |             10 |
|  8 | c      |   8 |              8 |       20 |          18 |                       8 |              8 |
|  9 | c      |   9 |              0 |       20 |          18 |                      18 |              8 |
| 10 | c      |  10 |             10 |       30 |          18 |                       8 |             18 |
| 11 | c      |  11 |              0 |       30 |          18 |                      18 |             18 |

Out[78]:

	type	v	奇数は0(以後同様に0)	cumsum	group sum	group cumsum(期待値と異なる)	group cumsum
0	a	0	0	0	2	0	0
1	a	1	0	0	2	2	0
2	a	2	2	2	2	0	2
3	a	3	0	2	2	2	2
4	b	4	4	6	10	4	4
5	b	5	0	6	10	10	4
6	b	6	6	12	10	4	10
7	b	7	0	12	10	10	10
8	c	8	8	20	18	8	8
9	c	9	0	20	18	18	8
10	c	10	10	30	18	8	18
11	c	11	0	30	18	18	18