{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# window_function"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"ExecuteTime": {
"end_time": "2017-01-19T06:32:33.183428",
"start_time": "2017-01-19T06:32:33.179574"
},
"collapsed": true,
"scrolled": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import seaborn as sns"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"ExecuteTime": {
"end_time": "2017-01-19T06:31:02.184084",
"start_time": "2017-01-19T06:31:00.358250"
},
"collapsed": false,
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"INSTALLED VERSIONS\n",
"------------------\n",
"commit: None\n",
"python: 3.5.1.final.0\n",
"python-bits: 64\n",
"OS: Darwin\n",
"OS-release: 15.6.0\n",
"machine: x86_64\n",
"processor: i386\n",
"byteorder: little\n",
"LC_ALL: None\n",
"LANG: ja_JP.UTF-8\n",
"\n",
"pandas: 0.18.1\n",
"nose: 1.3.7\n",
"pip: 9.0.1\n",
"setuptools: 20.3\n",
"Cython: 0.23.4\n",
"numpy: 1.10.4\n",
"scipy: 0.17.0\n",
"statsmodels: 0.6.1\n",
"xarray: None\n",
"IPython: 5.1.0\n",
"sphinx: 1.3.5\n",
"patsy: 0.4.0\n",
"dateutil: 2.5.1\n",
"pytz: 2016.2\n",
"blosc: None\n",
"bottleneck: 1.0.0\n",
"tables: 3.2.2\n",
"numexpr: 2.5\n",
"matplotlib: 1.5.1\n",
"openpyxl: 2.3.2\n",
"xlrd: 0.9.4\n",
"xlwt: 1.0.0\n",
"xlsxwriter: 0.8.4\n",
"lxml: 3.6.0\n",
"bs4: 4.4.1\n",
"html5lib: 0.9999999\n",
"httplib2: None\n",
"apiclient: None\n",
"sqlalchemy: 1.0.12\n",
"pymysql: None\n",
"psycopg2: None\n",
"jinja2: 2.8\n",
"boto: 2.39.0\n",
"pandas_datareader: 0.2.1\n"
]
}
],
"source": [
"pd.show_versions()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"ExecuteTime": {
"end_time": "2017-01-19T06:46:11.484097",
"start_time": "2017-01-19T06:46:11.476791"
},
"collapsed": false,
"scrolled": true
},
"outputs": [],
"source": [
"iris = sns.load_dataset(\"iris\")"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"ExecuteTime": {
"end_time": "2017-01-19T06:46:59.572691",
"start_time": "2017-01-19T06:46:59.549258"
},
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"
\n",
" \n",
" \n",
" | \n",
" sepal_length | \n",
" sepal_width | \n",
" petal_length | \n",
" petal_width | \n",
" species | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 5.1 | \n",
" 3.5 | \n",
" 1.4 | \n",
" 0.2 | \n",
" setosa | \n",
"
\n",
" \n",
" 1 | \n",
" 4.9 | \n",
" 3.0 | \n",
" 1.4 | \n",
" 0.2 | \n",
" setosa | \n",
"
\n",
" \n",
" 2 | \n",
" 4.7 | \n",
" 3.2 | \n",
" 1.3 | \n",
" 0.2 | \n",
" setosa | \n",
"
\n",
" \n",
" 3 | \n",
" 7.0 | \n",
" 3.2 | \n",
" 4.7 | \n",
" 1.4 | \n",
" versicolor | \n",
"
\n",
" \n",
" 4 | \n",
" 6.4 | \n",
" 3.2 | \n",
" 4.5 | \n",
" 1.5 | \n",
" versicolor | \n",
"
\n",
" \n",
" 5 | \n",
" 6.9 | \n",
" 3.1 | \n",
" 4.9 | \n",
" 1.5 | \n",
" versicolor | \n",
"
\n",
" \n",
" 6 | \n",
" 6.3 | \n",
" 3.3 | \n",
" 6.0 | \n",
" 2.5 | \n",
" virginica | \n",
"
\n",
" \n",
" 7 | \n",
" 5.8 | \n",
" 2.7 | \n",
" 5.1 | \n",
" 1.9 | \n",
" virginica | \n",
"
\n",
" \n",
" 8 | \n",
" 7.1 | \n",
" 3.0 | \n",
" 5.9 | \n",
" 2.1 | \n",
" virginica | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" sepal_length sepal_width petal_length petal_width species\n",
"0 5.1 3.5 1.4 0.2 setosa\n",
"1 4.9 3.0 1.4 0.2 setosa\n",
"2 4.7 3.2 1.3 0.2 setosa\n",
"3 7.0 3.2 4.7 1.4 versicolor\n",
"4 6.4 3.2 4.5 1.5 versicolor\n",
"5 6.9 3.1 4.9 1.5 versicolor\n",
"6 6.3 3.3 6.0 2.5 virginica\n",
"7 5.8 2.7 5.1 1.9 virginica\n",
"8 7.1 3.0 5.9 2.1 virginica"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"iris = pd.concat([iris.head(3), iris.ix[50:52], iris.ix[100:102]]).reset_index(drop=True)\n",
"iris\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"ExecuteTime": {
"end_time": "2017-01-19T06:31:41.527491",
"start_time": "2017-01-19T06:31:41.478574"
},
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/knt/.pyenv/versions/anaconda3-2.5.0/lib/python3.5/site-packages/ipykernel/__main__.py:2: FutureWarning: pd.rolling_sum is deprecated for Series and will be removed in a future version, replace with \n",
"\tSeries.rolling(center=False,window=3).sum()\n",
" from ipykernel import kernelapp as app\n",
"/Users/knt/.pyenv/versions/anaconda3-2.5.0/lib/python3.5/site-packages/ipykernel/__main__.py:3: FutureWarning: pd.expanding_sum is deprecated for Series and will be removed in a future version, replace with \n",
"\tSeries.expanding(min_periods=1).sum()\n",
" app.launch_new_instance()\n",
"/Users/knt/.pyenv/versions/anaconda3-2.5.0/lib/python3.5/site-packages/pandas/core/groupby.py:2746: FutureWarning: pd.expanding_sum is deprecated for Series and will be removed in a future version, replace with \n",
"\tSeries.expanding(min_periods=1).sum()\n",
" wrapper = lambda x: func(x, *args, **kwargs)\n"
]
},
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
" 5 | \n",
" 6 | \n",
" 7 | \n",
" 8 | \n",
"
\n",
" \n",
" \n",
" \n",
" sepal_length | \n",
" NaN | \n",
" NaN | \n",
" 14.7 | \n",
" 16.6 | \n",
" 18.1 | \n",
" 20.3 | \n",
" 19.6 | \n",
" 19.0 | \n",
" 19.2 | \n",
"
\n",
" \n",
" sepal_length | \n",
" 5.1 | \n",
" 10.0 | \n",
" 14.7 | \n",
" 21.7 | \n",
" 28.1 | \n",
" 35.0 | \n",
" 41.3 | \n",
" 47.1 | \n",
" 54.2 | \n",
"
\n",
" \n",
" sepal_length | \n",
" 5.1 | \n",
" 10.0 | \n",
" 14.7 | \n",
" 7.0 | \n",
" 13.4 | \n",
" 20.3 | \n",
" 6.3 | \n",
" 12.1 | \n",
" 19.2 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0 1 2 3 4 5 6 7 8\n",
"sepal_length NaN NaN 14.7 16.6 18.1 20.3 19.6 19.0 19.2\n",
"sepal_length 5.1 10.0 14.7 21.7 28.1 35.0 41.3 47.1 54.2\n",
"sepal_length 5.1 10.0 14.7 7.0 13.4 20.3 6.3 12.1 19.2"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.concat([\n",
" pd.rolling_sum(iris.sepal_length, window=3),\n",
" pd.expanding_sum(iris.sepal_length),\n",
" iris.groupby(\"species\")[\"sepal_length\"].transform(pd.expanding_sum),\n",
"], axis=1).T"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {
"ExecuteTime": {
"end_time": "2017-01-19T07:28:52.597858",
"start_time": "2017-01-19T07:28:52.527370"
},
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"| | default | shift_down | shift_up | cumsum | roll3_sum | roll3_sum_c | roll4_sum_c | ex_sum | trans_sum | trans_cumsum | g_ex_old_sum | g_ex_sum | g_shift_down | g_shift_up |\n",
"|---:|----------:|-------------:|-----------:|---------:|------------:|--------------:|--------------:|---------:|------------:|---------------:|---------------:|-----------:|---------------:|-------------:|\n",
"| 0 | 5.1 | nan | 4.9 | 5.1 | nan | nan | nan | 5.1 | 14.7 | 5.1 | 5.1 | 5.1 | nan | 4.9 |\n",
"| 1 | 4.9 | 5.1 | 4.7 | 10 | nan | 14.7 | nan | 10 | 14.7 | 10 | 10 | 10 | 5.1 | 4.7 |\n",
"| 2 | 4.7 | 4.9 | 7 | 14.7 | 14.7 | 16.6 | 21.7 | 14.7 | 14.7 | 14.7 | 14.7 | 14.7 | 4.9 | nan |\n",
"| 3 | 7 | 4.7 | 6.4 | 21.7 | 16.6 | 18.1 | 23 | 21.7 | 20.3 | 7 | 7 | 7 | nan | 6.4 |\n",
"| 4 | 6.4 | 7 | 6.9 | 28.1 | 18.1 | 20.3 | 25 | 28.1 | 20.3 | 13.4 | 13.4 | 13.4 | 7 | 6.9 |\n",
"| 5 | 6.9 | 6.4 | 6.3 | 35 | 20.3 | 19.6 | 26.6 | 35 | 20.3 | 20.3 | 20.3 | 20.3 | 6.4 | nan |\n",
"| 6 | 6.3 | 6.9 | 5.8 | 41.3 | 19.6 | 19 | 25.4 | 41.3 | 19.2 | 6.3 | 6.3 | 6.3 | nan | 5.8 |\n",
"| 7 | 5.8 | 6.3 | 7.1 | 47.1 | 19 | 19.2 | 26.1 | 47.1 | 19.2 | 12.1 | 12.1 | 12.1 | 6.3 | 7.1 |\n",
"| 8 | 7.1 | 5.8 | nan | 54.2 | 19.2 | nan | nan | 54.2 | 19.2 | 19.2 | 19.2 | 19.2 | 5.8 | nan |\n"
]
}
],
"source": [
"res = pd.concat([\n",
" iris.sepal_length,\n",
" iris.sepal_length.shift(),\n",
" iris.sepal_length.shift(periods=-1),\n",
" iris.sepal_length.cumsum(),\n",
" iris.sepal_length.rolling(window=3).sum(),\n",
" iris.sepal_length.rolling(window=3, center=True).sum(),\n",
" iris.sepal_length.rolling(window=4, center=True).sum(),\n",
" iris.sepal_length.expanding().sum(),\n",
" iris.groupby(\"species\")[\"sepal_length\"].transform(np.sum),\n",
" iris.groupby(\"species\")[\"sepal_length\"].transform(pd.Series.cumsum),\n",
" iris.groupby(\"species\").apply(lambda x: x.expanding().sepal_length.sum()).reset_index(drop=True),\n",
" iris.groupby(\"species\").expanding().sepal_length.sum().reset_index(level=0, drop=True),\n",
" iris.groupby(\"species\").sepal_length.shift(),\n",
" iris.groupby(\"species\").sepal_length.shift(periods=-1),\n",
" ], axis=1)\n",
"res.columns = [\n",
" \"default\",\n",
" \"shift_down\",\n",
" \"shift_up\",\n",
" \"cumsum\",\n",
" \"roll3_sum\",\n",
" \"roll3_sum_c\",\n",
" \"roll4_sum_c\", # 偶数の場合, どの範囲になるかの確認のため\n",
" \"ex_sum\", # cumsum\n",
" \"trans_sum\", # SQL Window関数におけるGroup内すべてと同様 \n",
" \"trans_cumsum\",# group内cumsum\n",
" \"g_ex_old_sum\",\n",
" \"g_ex_sum\", # group内cumsum\n",
" \"g_shift_down\",# group内shift\n",
" \"g_shift_up\",\n",
"]\n",
"\n",
"import tabulate\n",
"print(tabulate.tabulate(res, headers=\"keys\", tablefmt=\"pipe\"))"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {
"ExecuteTime": {
"end_time": "2017-01-19T07:38:21.637869",
"start_time": "2017-01-19T07:38:21.596351"
},
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"| | value | default | dense | first_rank | max_rank | min_rank |\n",
"|---:|--------:|----------:|--------:|-------------:|-----------:|-----------:|\n",
"| 0 | 1 | 1 | 1 | 1 | 1 | 1 |\n",
"| 1 | 2 | 2 | 2 | 2 | 3 | 2 |\n",
"| 2 | 2 | 2 | 2 | 3 | 3 | 2 |\n",
"| 3 | 3 | 4 | 3 | 4 | 4 | 4 |\n",
"| 4 | 4 | 5 | 4 | 5 | 5 | 5 |\n"
]
},
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" value | \n",
" default | \n",
" dense | \n",
" first_rank | \n",
" max_rank | \n",
" min_rank | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" 2 | \n",
" 2 | \n",
" 2 | \n",
" 3 | \n",
" 2 | \n",
"
\n",
" \n",
" 2 | \n",
" 2 | \n",
" 2 | \n",
" 2 | \n",
" 3 | \n",
" 3 | \n",
" 2 | \n",
"
\n",
" \n",
" 3 | \n",
" 3 | \n",
" 4 | \n",
" 3 | \n",
" 4 | \n",
" 4 | \n",
" 4 | \n",
"
\n",
" \n",
" 4 | \n",
" 4 | \n",
" 5 | \n",
" 4 | \n",
" 5 | \n",
" 5 | \n",
" 5 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" value default dense first_rank max_rank min_rank\n",
"0 1 1 1 1 1 1\n",
"1 2 2 2 2 3 2\n",
"2 2 2 2 3 3 2\n",
"3 3 4 3 4 4 4\n",
"4 4 5 4 5 5 5"
]
},
"execution_count": 77,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"s = pd.Series([1,2,2,3,4])\n",
"rank_df = pd.DataFrame().assign(value=s)\n",
"rank_df = rank_df.assign(\n",
" default=s.rank(),\n",
" first_rank=s.rank(method=\"first\"),\n",
" dense=s.rank(method=\"dense\"),\n",
" min_rank=s.rank(method=\"min\"),\n",
" max_rank=s.rank(method=\"max\")\n",
").astype(int)\n",
"\n",
"import tabulate\n",
"print(tabulate.tabulate(rank_df, headers=\"keys\", tablefmt=\"pipe\"))\n",
"rank_df"
]
},
{
"cell_type": "code",
"execution_count": 78,
"metadata": {
"ExecuteTime": {
"end_time": "2017-01-19T07:58:53.669019",
"start_time": "2017-01-19T07:58:53.569370"
},
"collapsed": false,
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"type\n",
"a 2\n",
"b 10\n",
"c 18\n",
"Name: v, dtype: int64\n",
"| | type | v | 奇数は0(以後同様に0) | cumsum | group sum | group cumsum(期待値と異なる) | group cumsum |\n",
"|---:|:-------|----:|---------------:|---------:|------------:|------------------------:|---------------:|\n",
"| 0 | a | 0 | 0 | 0 | 2 | 0 | 0 |\n",
"| 1 | a | 1 | 0 | 0 | 2 | 2 | 0 |\n",
"| 2 | a | 2 | 2 | 2 | 2 | 0 | 2 |\n",
"| 3 | a | 3 | 0 | 2 | 2 | 2 | 2 |\n",
"| 4 | b | 4 | 4 | 6 | 10 | 4 | 4 |\n",
"| 5 | b | 5 | 0 | 6 | 10 | 10 | 4 |\n",
"| 6 | b | 6 | 6 | 12 | 10 | 4 | 10 |\n",
"| 7 | b | 7 | 0 | 12 | 10 | 10 | 10 |\n",
"| 8 | c | 8 | 8 | 20 | 18 | 8 | 8 |\n",
"| 9 | c | 9 | 0 | 20 | 18 | 18 | 8 |\n",
"| 10 | c | 10 | 10 | 30 | 18 | 8 | 18 |\n",
"| 11 | c | 11 | 0 | 30 | 18 | 18 | 18 |\n"
]
},
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" type | \n",
" v | \n",
" 奇数は0(以後同様に0) | \n",
" cumsum | \n",
" group sum | \n",
" group cumsum(期待値と異なる) | \n",
" group cumsum | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" a | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 2 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" a | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 2 | \n",
" 2 | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" a | \n",
" 2 | \n",
" 2 | \n",
" 2 | \n",
" 2 | \n",
" 0 | \n",
" 2 | \n",
"
\n",
" \n",
" 3 | \n",
" a | \n",
" 3 | \n",
" 0 | \n",
" 2 | \n",
" 2 | \n",
" 2 | \n",
" 2 | \n",
"
\n",
" \n",
" 4 | \n",
" b | \n",
" 4 | \n",
" 4 | \n",
" 6 | \n",
" 10 | \n",
" 4 | \n",
" 4 | \n",
"
\n",
" \n",
" 5 | \n",
" b | \n",
" 5 | \n",
" 0 | \n",
" 6 | \n",
" 10 | \n",
" 10 | \n",
" 4 | \n",
"
\n",
" \n",
" 6 | \n",
" b | \n",
" 6 | \n",
" 6 | \n",
" 12 | \n",
" 10 | \n",
" 4 | \n",
" 10 | \n",
"
\n",
" \n",
" 7 | \n",
" b | \n",
" 7 | \n",
" 0 | \n",
" 12 | \n",
" 10 | \n",
" 10 | \n",
" 10 | \n",
"
\n",
" \n",
" 8 | \n",
" c | \n",
" 8 | \n",
" 8 | \n",
" 20 | \n",
" 18 | \n",
" 8 | \n",
" 8 | \n",
"
\n",
" \n",
" 9 | \n",
" c | \n",
" 9 | \n",
" 0 | \n",
" 20 | \n",
" 18 | \n",
" 18 | \n",
" 8 | \n",
"
\n",
" \n",
" 10 | \n",
" c | \n",
" 10 | \n",
" 10 | \n",
" 30 | \n",
" 18 | \n",
" 8 | \n",
" 18 | \n",
"
\n",
" \n",
" 11 | \n",
" c | \n",
" 11 | \n",
" 0 | \n",
" 30 | \n",
" 18 | \n",
" 18 | \n",
" 18 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" type v 奇数は0(以後同様に0) cumsum group sum group cumsum(期待値と異なる) \\\n",
"0 a 0 0 0 2 0 \n",
"1 a 1 0 0 2 2 \n",
"2 a 2 2 2 2 0 \n",
"3 a 3 0 2 2 2 \n",
"4 b 4 4 6 10 4 \n",
"5 b 5 0 6 10 10 \n",
"6 b 6 6 12 10 4 \n",
"7 b 7 0 12 10 10 \n",
"8 c 8 8 20 18 8 \n",
"9 c 9 0 20 18 18 \n",
"10 c 10 10 30 18 8 \n",
"11 c 11 0 30 18 18 \n",
"\n",
" group cumsum \n",
"0 0 \n",
"1 0 \n",
"2 2 \n",
"3 2 \n",
"4 4 \n",
"5 4 \n",
"6 10 \n",
"7 10 \n",
"8 8 \n",
"9 8 \n",
"10 18 \n",
"11 18 "
]
},
"execution_count": 78,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame(\n",
" {\n",
" \"type\": list(\"aaaabbbbcccc\"),\n",
" \"v\": range(12),\n",
" }\n",
")\n",
"print(df.groupby(\"type\")[\"v\"].apply(lambda x: x[x%2==0].sum()))\n",
"\n",
"def _t(x):\n",
" x[x%2!=0] = 0\n",
" return x.cumsum()\n",
"\n",
"conditional_df = pd.concat(\n",
" [\n",
" df,\n",
" df[\"v\"].apply(lambda x: x if x%2==0 else 0),\n",
" df[\"v\"].apply(lambda x: x if x%2==0 else 0).cumsum(),\n",
" df.groupby(\"type\")[\"v\"].transform(lambda x: x[x%2==0].sum()),\n",
" df.groupby(\"type\")[\"v\"].transform(lambda x: x[x%2==0].cumsum()),\n",
" df.groupby(\"type\")[\"v\"].transform(_t),\n",
" ], axis=1)\n",
"conditional_df.columns = [\"type\", \"v\", \"奇数は0(以後同様に0)\", \"cumsum\", \"group sum\", \"group cumsum(期待値と異なる)\", \"group cumsum\"]\n",
"\n",
"import tabulate\n",
"print(tabulate.tabulate(conditional_df, headers=\"keys\", tablefmt=\"pipe\"))\n",
"conditional_df\n"
]
}
],
"metadata": {
"hide_input": false,
"kernelspec": {
"display_name": "Python [default]",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
},
"toc": {
"toc_cell": false,
"toc_number_sections": true,
"toc_threshold": 6,
"toc_window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 0
}