{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# window_function" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "ExecuteTime": { "end_time": "2017-01-19T06:32:33.183428", "start_time": "2017-01-19T06:32:33.179574" }, "collapsed": true, "scrolled": true }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import seaborn as sns" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "ExecuteTime": { "end_time": "2017-01-19T06:31:02.184084", "start_time": "2017-01-19T06:31:00.358250" }, "collapsed": false, "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "INSTALLED VERSIONS\n", "------------------\n", "commit: None\n", "python: 3.5.1.final.0\n", "python-bits: 64\n", "OS: Darwin\n", "OS-release: 15.6.0\n", "machine: x86_64\n", "processor: i386\n", "byteorder: little\n", "LC_ALL: None\n", "LANG: ja_JP.UTF-8\n", "\n", "pandas: 0.18.1\n", "nose: 1.3.7\n", "pip: 9.0.1\n", "setuptools: 20.3\n", "Cython: 0.23.4\n", "numpy: 1.10.4\n", "scipy: 0.17.0\n", "statsmodels: 0.6.1\n", "xarray: None\n", "IPython: 5.1.0\n", "sphinx: 1.3.5\n", "patsy: 0.4.0\n", "dateutil: 2.5.1\n", "pytz: 2016.2\n", "blosc: None\n", "bottleneck: 1.0.0\n", "tables: 3.2.2\n", "numexpr: 2.5\n", "matplotlib: 1.5.1\n", "openpyxl: 2.3.2\n", "xlrd: 0.9.4\n", "xlwt: 1.0.0\n", "xlsxwriter: 0.8.4\n", "lxml: 3.6.0\n", "bs4: 4.4.1\n", "html5lib: 0.9999999\n", "httplib2: None\n", "apiclient: None\n", "sqlalchemy: 1.0.12\n", "pymysql: None\n", "psycopg2: None\n", "jinja2: 2.8\n", "boto: 2.39.0\n", "pandas_datareader: 0.2.1\n" ] } ], "source": [ "pd.show_versions()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "ExecuteTime": { "end_time": "2017-01-19T06:46:11.484097", "start_time": "2017-01-19T06:46:11.476791" }, "collapsed": false, "scrolled": true }, "outputs": [], "source": [ "iris = sns.load_dataset(\"iris\")" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "ExecuteTime": { "end_time": "2017-01-19T06:46:59.572691", "start_time": "2017-01-19T06:46:59.549258" }, "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sepal_lengthsepal_widthpetal_lengthpetal_widthspecies
05.13.51.40.2setosa
14.93.01.40.2setosa
24.73.21.30.2setosa
37.03.24.71.4versicolor
46.43.24.51.5versicolor
56.93.14.91.5versicolor
66.33.36.02.5virginica
75.82.75.11.9virginica
87.13.05.92.1virginica
\n", "
" ], "text/plain": [ " sepal_length sepal_width petal_length petal_width species\n", "0 5.1 3.5 1.4 0.2 setosa\n", "1 4.9 3.0 1.4 0.2 setosa\n", "2 4.7 3.2 1.3 0.2 setosa\n", "3 7.0 3.2 4.7 1.4 versicolor\n", "4 6.4 3.2 4.5 1.5 versicolor\n", "5 6.9 3.1 4.9 1.5 versicolor\n", "6 6.3 3.3 6.0 2.5 virginica\n", "7 5.8 2.7 5.1 1.9 virginica\n", "8 7.1 3.0 5.9 2.1 virginica" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "iris = pd.concat([iris.head(3), iris.ix[50:52], iris.ix[100:102]]).reset_index(drop=True)\n", "iris\n" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "ExecuteTime": { "end_time": "2017-01-19T06:31:41.527491", "start_time": "2017-01-19T06:31:41.478574" }, "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/knt/.pyenv/versions/anaconda3-2.5.0/lib/python3.5/site-packages/ipykernel/__main__.py:2: FutureWarning: pd.rolling_sum is deprecated for Series and will be removed in a future version, replace with \n", "\tSeries.rolling(center=False,window=3).sum()\n", " from ipykernel import kernelapp as app\n", "/Users/knt/.pyenv/versions/anaconda3-2.5.0/lib/python3.5/site-packages/ipykernel/__main__.py:3: FutureWarning: pd.expanding_sum is deprecated for Series and will be removed in a future version, replace with \n", "\tSeries.expanding(min_periods=1).sum()\n", " app.launch_new_instance()\n", "/Users/knt/.pyenv/versions/anaconda3-2.5.0/lib/python3.5/site-packages/pandas/core/groupby.py:2746: FutureWarning: pd.expanding_sum is deprecated for Series and will be removed in a future version, replace with \n", "\tSeries.expanding(min_periods=1).sum()\n", " wrapper = lambda x: func(x, *args, **kwargs)\n" ] }, { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
012345678
sepal_lengthNaNNaN14.716.618.120.319.619.019.2
sepal_length5.110.014.721.728.135.041.347.154.2
sepal_length5.110.014.77.013.420.36.312.119.2
\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6 7 8\n", "sepal_length NaN NaN 14.7 16.6 18.1 20.3 19.6 19.0 19.2\n", "sepal_length 5.1 10.0 14.7 21.7 28.1 35.0 41.3 47.1 54.2\n", "sepal_length 5.1 10.0 14.7 7.0 13.4 20.3 6.3 12.1 19.2" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.concat([\n", " pd.rolling_sum(iris.sepal_length, window=3),\n", " pd.expanding_sum(iris.sepal_length),\n", " iris.groupby(\"species\")[\"sepal_length\"].transform(pd.expanding_sum),\n", "], axis=1).T" ] }, { "cell_type": "code", "execution_count": 68, "metadata": { "ExecuteTime": { "end_time": "2017-01-19T07:28:52.597858", "start_time": "2017-01-19T07:28:52.527370" }, "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "| | default | shift_down | shift_up | cumsum | roll3_sum | roll3_sum_c | roll4_sum_c | ex_sum | trans_sum | trans_cumsum | g_ex_old_sum | g_ex_sum | g_shift_down | g_shift_up |\n", "|---:|----------:|-------------:|-----------:|---------:|------------:|--------------:|--------------:|---------:|------------:|---------------:|---------------:|-----------:|---------------:|-------------:|\n", "| 0 | 5.1 | nan | 4.9 | 5.1 | nan | nan | nan | 5.1 | 14.7 | 5.1 | 5.1 | 5.1 | nan | 4.9 |\n", "| 1 | 4.9 | 5.1 | 4.7 | 10 | nan | 14.7 | nan | 10 | 14.7 | 10 | 10 | 10 | 5.1 | 4.7 |\n", "| 2 | 4.7 | 4.9 | 7 | 14.7 | 14.7 | 16.6 | 21.7 | 14.7 | 14.7 | 14.7 | 14.7 | 14.7 | 4.9 | nan |\n", "| 3 | 7 | 4.7 | 6.4 | 21.7 | 16.6 | 18.1 | 23 | 21.7 | 20.3 | 7 | 7 | 7 | nan | 6.4 |\n", "| 4 | 6.4 | 7 | 6.9 | 28.1 | 18.1 | 20.3 | 25 | 28.1 | 20.3 | 13.4 | 13.4 | 13.4 | 7 | 6.9 |\n", "| 5 | 6.9 | 6.4 | 6.3 | 35 | 20.3 | 19.6 | 26.6 | 35 | 20.3 | 20.3 | 20.3 | 20.3 | 6.4 | nan |\n", "| 6 | 6.3 | 6.9 | 5.8 | 41.3 | 19.6 | 19 | 25.4 | 41.3 | 19.2 | 6.3 | 6.3 | 6.3 | nan | 5.8 |\n", "| 7 | 5.8 | 6.3 | 7.1 | 47.1 | 19 | 19.2 | 26.1 | 47.1 | 19.2 | 12.1 | 12.1 | 12.1 | 6.3 | 7.1 |\n", "| 8 | 7.1 | 5.8 | nan | 54.2 | 19.2 | nan | nan | 54.2 | 19.2 | 19.2 | 19.2 | 19.2 | 5.8 | nan |\n" ] } ], "source": [ "res = pd.concat([\n", " iris.sepal_length,\n", " iris.sepal_length.shift(),\n", " iris.sepal_length.shift(periods=-1),\n", " iris.sepal_length.cumsum(),\n", " iris.sepal_length.rolling(window=3).sum(),\n", " iris.sepal_length.rolling(window=3, center=True).sum(),\n", " iris.sepal_length.rolling(window=4, center=True).sum(),\n", " iris.sepal_length.expanding().sum(),\n", " iris.groupby(\"species\")[\"sepal_length\"].transform(np.sum),\n", " iris.groupby(\"species\")[\"sepal_length\"].transform(pd.Series.cumsum),\n", " iris.groupby(\"species\").apply(lambda x: x.expanding().sepal_length.sum()).reset_index(drop=True),\n", " iris.groupby(\"species\").expanding().sepal_length.sum().reset_index(level=0, drop=True),\n", " iris.groupby(\"species\").sepal_length.shift(),\n", " iris.groupby(\"species\").sepal_length.shift(periods=-1),\n", " ], axis=1)\n", "res.columns = [\n", " \"default\",\n", " \"shift_down\",\n", " \"shift_up\",\n", " \"cumsum\",\n", " \"roll3_sum\",\n", " \"roll3_sum_c\",\n", " \"roll4_sum_c\", # 偶数の場合, どの範囲になるかの確認のため\n", " \"ex_sum\", # cumsum\n", " \"trans_sum\", # SQL Window関数におけるGroup内すべてと同様 \n", " \"trans_cumsum\",# group内cumsum\n", " \"g_ex_old_sum\",\n", " \"g_ex_sum\", # group内cumsum\n", " \"g_shift_down\",# group内shift\n", " \"g_shift_up\",\n", "]\n", "\n", "import tabulate\n", "print(tabulate.tabulate(res, headers=\"keys\", tablefmt=\"pipe\"))" ] }, { "cell_type": "code", "execution_count": 77, "metadata": { "ExecuteTime": { "end_time": "2017-01-19T07:38:21.637869", "start_time": "2017-01-19T07:38:21.596351" }, "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "| | value | default | dense | first_rank | max_rank | min_rank |\n", "|---:|--------:|----------:|--------:|-------------:|-----------:|-----------:|\n", "| 0 | 1 | 1 | 1 | 1 | 1 | 1 |\n", "| 1 | 2 | 2 | 2 | 2 | 3 | 2 |\n", "| 2 | 2 | 2 | 2 | 3 | 3 | 2 |\n", "| 3 | 3 | 4 | 3 | 4 | 4 | 4 |\n", "| 4 | 4 | 5 | 4 | 5 | 5 | 5 |\n" ] }, { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
valuedefaultdensefirst_rankmax_rankmin_rank
0111111
1222232
2222332
3343444
4454555
\n", "
" ], "text/plain": [ " value default dense first_rank max_rank min_rank\n", "0 1 1 1 1 1 1\n", "1 2 2 2 2 3 2\n", "2 2 2 2 3 3 2\n", "3 3 4 3 4 4 4\n", "4 4 5 4 5 5 5" ] }, "execution_count": 77, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s = pd.Series([1,2,2,3,4])\n", "rank_df = pd.DataFrame().assign(value=s)\n", "rank_df = rank_df.assign(\n", " default=s.rank(),\n", " first_rank=s.rank(method=\"first\"),\n", " dense=s.rank(method=\"dense\"),\n", " min_rank=s.rank(method=\"min\"),\n", " max_rank=s.rank(method=\"max\")\n", ").astype(int)\n", "\n", "import tabulate\n", "print(tabulate.tabulate(rank_df, headers=\"keys\", tablefmt=\"pipe\"))\n", "rank_df" ] }, { "cell_type": "code", "execution_count": 78, "metadata": { "ExecuteTime": { "end_time": "2017-01-19T07:58:53.669019", "start_time": "2017-01-19T07:58:53.569370" }, "collapsed": false, "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "type\n", "a 2\n", "b 10\n", "c 18\n", "Name: v, dtype: int64\n", "| | type | v | 奇数は0(以後同様に0) | cumsum | group sum | group cumsum(期待値と異なる) | group cumsum |\n", "|---:|:-------|----:|---------------:|---------:|------------:|------------------------:|---------------:|\n", "| 0 | a | 0 | 0 | 0 | 2 | 0 | 0 |\n", "| 1 | a | 1 | 0 | 0 | 2 | 2 | 0 |\n", "| 2 | a | 2 | 2 | 2 | 2 | 0 | 2 |\n", "| 3 | a | 3 | 0 | 2 | 2 | 2 | 2 |\n", "| 4 | b | 4 | 4 | 6 | 10 | 4 | 4 |\n", "| 5 | b | 5 | 0 | 6 | 10 | 10 | 4 |\n", "| 6 | b | 6 | 6 | 12 | 10 | 4 | 10 |\n", "| 7 | b | 7 | 0 | 12 | 10 | 10 | 10 |\n", "| 8 | c | 8 | 8 | 20 | 18 | 8 | 8 |\n", "| 9 | c | 9 | 0 | 20 | 18 | 18 | 8 |\n", "| 10 | c | 10 | 10 | 30 | 18 | 8 | 18 |\n", "| 11 | c | 11 | 0 | 30 | 18 | 18 | 18 |\n" ] }, { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
typev奇数は0(以後同様に0)cumsumgroup sumgroup cumsum(期待値と異なる)group cumsum
0a000200
1a100220
2a222202
3a302222
4b4461044
5b50610104
6b661210410
7b7012101010
8c88201888
9c902018188
10c10103018818
11c11030181818
\n", "
" ], "text/plain": [ " type v 奇数は0(以後同様に0) cumsum group sum group cumsum(期待値と異なる) \\\n", "0 a 0 0 0 2 0 \n", "1 a 1 0 0 2 2 \n", "2 a 2 2 2 2 0 \n", "3 a 3 0 2 2 2 \n", "4 b 4 4 6 10 4 \n", "5 b 5 0 6 10 10 \n", "6 b 6 6 12 10 4 \n", "7 b 7 0 12 10 10 \n", "8 c 8 8 20 18 8 \n", "9 c 9 0 20 18 18 \n", "10 c 10 10 30 18 8 \n", "11 c 11 0 30 18 18 \n", "\n", " group cumsum \n", "0 0 \n", "1 0 \n", "2 2 \n", "3 2 \n", "4 4 \n", "5 4 \n", "6 10 \n", "7 10 \n", "8 8 \n", "9 8 \n", "10 18 \n", "11 18 " ] }, "execution_count": 78, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame(\n", " {\n", " \"type\": list(\"aaaabbbbcccc\"),\n", " \"v\": range(12),\n", " }\n", ")\n", "print(df.groupby(\"type\")[\"v\"].apply(lambda x: x[x%2==0].sum()))\n", "\n", "def _t(x):\n", " x[x%2!=0] = 0\n", " return x.cumsum()\n", "\n", "conditional_df = pd.concat(\n", " [\n", " df,\n", " df[\"v\"].apply(lambda x: x if x%2==0 else 0),\n", " df[\"v\"].apply(lambda x: x if x%2==0 else 0).cumsum(),\n", " df.groupby(\"type\")[\"v\"].transform(lambda x: x[x%2==0].sum()),\n", " df.groupby(\"type\")[\"v\"].transform(lambda x: x[x%2==0].cumsum()),\n", " df.groupby(\"type\")[\"v\"].transform(_t),\n", " ], axis=1)\n", "conditional_df.columns = [\"type\", \"v\", \"奇数は0(以後同様に0)\", \"cumsum\", \"group sum\", \"group cumsum(期待値と異なる)\", \"group cumsum\"]\n", "\n", "import tabulate\n", "print(tabulate.tabulate(conditional_df, headers=\"keys\", tablefmt=\"pipe\"))\n", "conditional_df\n" ] } ], "metadata": { "hide_input": false, "kernelspec": { "display_name": "Python [default]", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" }, "toc": { "toc_cell": false, "toc_number_sections": true, "toc_threshold": 6, "toc_window_display": false } }, "nbformat": 4, "nbformat_minor": 0 }