{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# groupby" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "ExecuteTime": { "end_time": "2017-02-06T00:50:22.541960", "start_time": "2017-02-06T00:50:22.537775" }, "collapsed": true }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import seaborn as sns\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2017-01-19T06:02:58.766618", "start_time": "2017-01-19T06:02:58.747046" }, "collapsed": true }, "outputs": [], "source": [ "iris = sns.load_dataset(\"iris\")" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "ExecuteTime": { "end_time": "2017-01-19T06:08:04.665538", "start_time": "2017-01-19T06:08:04.644876" }, "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sepal_lengthsepal_widthpetal_lengthpetal_width
species
setosa250.3171.473.112.3
versicolor296.8138.5213.066.3
virginica329.4148.7277.6101.3
\n", "
" ], "text/plain": [ " sepal_length sepal_width petal_length petal_width\n", "species \n", "setosa 250.3 171.4 73.1 12.3\n", "versicolor 296.8 138.5 213.0 66.3\n", "virginica 329.4 148.7 277.6 101.3" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "iris.groupby(iris.species).sum()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "ExecuteTime": { "end_time": "2017-01-19T06:08:43.162185", "start_time": "2017-01-19T06:08:43.116384" }, "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['setosa']\n", "['setosa']\n", "['versicolor']\n", "['virginica']\n" ] }, { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sepal_lengthsepal_widthpetal_lengthpetal_widthspecies
species
setosa0.00.00.00.00
versicolor296.8138.5213.066.3versicolorversicolorversicolorversicolorversic...
virginica329.4148.7277.6101.3virginicavirginicavirginicavirginicavirginicav...
\n", "
" ], "text/plain": [ " sepal_length sepal_width petal_length petal_width \\\n", "species \n", "setosa 0.0 0.0 0.0 0.0 \n", "versicolor 296.8 138.5 213.0 66.3 \n", "virginica 329.4 148.7 277.6 101.3 \n", "\n", " species \n", "species \n", "setosa 0 \n", "versicolor versicolorversicolorversicolorversicolorversic... \n", "virginica virginicavirginicavirginicavirginicavirginicav... " ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def f(df):\n", " print(df.species.unique())\n", " return df[df.species != \"setosa\"].sum()\n", "\n", "iris.groupby(iris.species).apply(f)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "ExecuteTime": { "end_time": "2017-01-19T06:17:54.281463", "start_time": "2017-01-19T06:17:54.239422" }, "collapsed": false, "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0 5.1\n", "1 4.9\n", "Name: sepal_length, dtype: float64\n", "0 3.5\n", "1 3.0\n", "Name: sepal_width, dtype: float64\n", "0 1.4\n", "1 1.4\n", "Name: petal_length, dtype: float64\n", "0 0.2\n", "1 0.2\n", "Name: petal_width, dtype: float64\n", " sepal_length sepal_width petal_length petal_width\n", "0 5.1 3.5 1.4 0.2\n", "1 4.9 3.0 1.4 0.2\n", "50 7.0\n", "51 6.4\n", "Name: sepal_length, dtype: float64\n", "50 3.2\n", "51 3.2\n", "Name: sepal_width, dtype: float64\n", "50 4.7\n", "51 4.5\n", "Name: petal_length, dtype: float64\n", "50 1.4\n", "51 1.5\n", "Name: petal_width, dtype: float64\n", "100 6.3\n", "101 5.8\n", "Name: sepal_length, dtype: float64\n", "100 3.3\n", "101 2.7\n", "Name: sepal_width, dtype: float64\n", "100 6.0\n", "101 5.1\n", "Name: petal_length, dtype: float64\n", "100 2.5\n", "101 1.9\n", "Name: petal_width, dtype: float64\n" ] }, { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sepal_lengthsepal_widthpetal_lengthpetal_width
0NaNNaNNaNNaN
1NaNNaNNaNNaN
\n", "
" ], "text/plain": [ " sepal_length sepal_width petal_length petal_width\n", "0 NaN NaN NaN NaN\n", "1 NaN NaN NaN NaN" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "iris.groupby(iris.species).transform(lambda x: print(x.head(2))).head(2)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "ExecuteTime": { "end_time": "2017-01-19T06:18:24.957929", "start_time": "2017-01-19T06:18:24.943867" }, "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0 5.1\n", "1 4.9\n", "Name: setosa, dtype: float64\n", "50 7.0\n", "51 6.4\n", "Name: versicolor, dtype: float64\n", "100 6.3\n", "101 5.8\n", "Name: virginica, dtype: float64\n" ] }, { "data": { "text/plain": [ "0 NaN\n", "1 NaN\n", "Name: sepal_length, dtype: float64" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "iris.groupby(iris.species)[\"sepal_length\"].transform(lambda x: print(x.head(2))).head(2)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "ExecuteTime": { "end_time": "2017-01-19T06:22:56.339744", "start_time": "2017-01-19T06:22:56.314668" }, "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sepal_lengthsepal_widthpetal_lengthpetal_width
0250.3171.473.112.3
1250.3171.473.112.3
50296.8138.5213.066.3
51296.8138.5213.066.3
100329.4148.7277.6101.3
101329.4148.7277.6101.3
\n", "
" ], "text/plain": [ " sepal_length sepal_width petal_length petal_width\n", "0 250.3 171.4 73.1 12.3\n", "1 250.3 171.4 73.1 12.3\n", "50 296.8 138.5 213.0 66.3\n", "51 296.8 138.5 213.0 66.3\n", "100 329.4 148.7 277.6 101.3\n", "101 329.4 148.7 277.6 101.3" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "iris.groupby(iris.species).transform(np.sum).iloc[[0,1,50,51,100,101]]" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "ExecuteTime": { "end_time": "2017-01-19T06:23:24.820159", "start_time": "2017-01-19T06:23:24.790263" }, "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sepal_lengthsepal_widthpetal_lengthpetal_width
05.13.51.40.2
110.06.52.80.4
507.03.24.71.4
5113.46.49.22.9
1006.33.36.02.5
10112.16.011.14.4
\n", "
" ], "text/plain": [ " sepal_length sepal_width petal_length petal_width\n", "0 5.1 3.5 1.4 0.2\n", "1 10.0 6.5 2.8 0.4\n", "50 7.0 3.2 4.7 1.4\n", "51 13.4 6.4 9.2 2.9\n", "100 6.3 3.3 6.0 2.5\n", "101 12.1 6.0 11.1 4.4" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "iris.groupby(iris.species).transform(pd.Series.cumsum).iloc[[0,1,50,51,100,101]]" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "ExecuteTime": { "end_time": "2017-01-19T06:24:22.300777", "start_time": "2017-01-19T06:24:22.289503" }, "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0 5.1\n", "1 10.0\n", "50 7.0\n", "51 13.4\n", "100 6.3\n", "101 12.1\n", "Name: sepal_length, dtype: float64" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "iris.groupby(iris.species)[\"sepal_length\"].transform(pd.Series.cumsum).iloc[[0,1,50,51,100,101]]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2017-02-06T00:49:46.229172", "start_time": "2017-02-06T00:49:43.818957" }, "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
methodAstrometryEclipse Timing VariationsImagingMicrolensingOrbital Brightness ModulationPulsar TimingPulsation Timing VariationsRadial VelocityTransitTransit Timing Variations
count2.000009.00000038.00000023.0000003.0000005.000001.0553.000000397.0000004.000000
mean2011.500002010.0000002009.1315792009.7826092011.6666671998.400002007.02007.5189872011.2367762012.500000
std2.121321.4142142.7819012.8596971.1547018.38451NaN4.2490522.0778671.290994
min2010.000002008.0000002004.0000002004.0000002011.0000001992.000002007.01989.0000002002.0000002011.000000
25%2010.750002009.0000002008.0000002008.0000002011.0000001992.000002007.02005.0000002010.0000002011.750000
50%2011.500002010.0000002009.0000002010.0000002011.0000001994.000002007.02009.0000002012.0000002012.500000
75%2012.250002011.0000002011.0000002012.0000002012.0000002003.000002007.02011.0000002013.0000002013.250000
max2013.000002012.0000002013.0000002013.0000002013.0000002011.000002007.02014.0000002014.0000002014.000000
\n", "
" ], "text/plain": [ "method Astrometry Eclipse Timing Variations Imaging Microlensing \\\n", "count 2.00000 9.000000 38.000000 23.000000 \n", "mean 2011.50000 2010.000000 2009.131579 2009.782609 \n", "std 2.12132 1.414214 2.781901 2.859697 \n", "min 2010.00000 2008.000000 2004.000000 2004.000000 \n", "25% 2010.75000 2009.000000 2008.000000 2008.000000 \n", "50% 2011.50000 2010.000000 2009.000000 2010.000000 \n", "75% 2012.25000 2011.000000 2011.000000 2012.000000 \n", "max 2013.00000 2012.000000 2013.000000 2013.000000 \n", "\n", "method Orbital Brightness Modulation Pulsar Timing \\\n", "count 3.000000 5.00000 \n", "mean 2011.666667 1998.40000 \n", "std 1.154701 8.38451 \n", "min 2011.000000 1992.00000 \n", "25% 2011.000000 1992.00000 \n", "50% 2011.000000 1994.00000 \n", "75% 2012.000000 2003.00000 \n", "max 2013.000000 2011.00000 \n", "\n", "method Pulsation Timing Variations Radial Velocity Transit \\\n", "count 1.0 553.000000 397.000000 \n", "mean 2007.0 2007.518987 2011.236776 \n", "std NaN 4.249052 2.077867 \n", "min 2007.0 1989.000000 2002.000000 \n", "25% 2007.0 2005.000000 2010.000000 \n", "50% 2007.0 2009.000000 2012.000000 \n", "75% 2007.0 2011.000000 2013.000000 \n", "max 2007.0 2014.000000 2014.000000 \n", "\n", "method Transit Timing Variations \n", "count 4.000000 \n", "mean 2012.500000 \n", "std 1.290994 \n", "min 2011.000000 \n", "25% 2011.750000 \n", "50% 2012.500000 \n", "75% 2013.250000 \n", "max 2014.000000 " ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import seaborn as sns\n", "planets = sns.load_dataset(\"planets\")\n", "planets.groupby('method')['year'].describe().unstack(level=0) #.unstack(\"method\")" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2017-02-06T00:49:55.837230", "start_time": "2017-02-06T00:49:55.810815" }, "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['setosa' 'versicolor' 'virginica']\n" ] }, { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sepal_lengthsepal_widthpetal_lengthpetal_width
other6.2622.8724.9061.676
setosa5.0063.4281.4620.246
\n", "
" ], "text/plain": [ " sepal_length sepal_width petal_length petal_width\n", "other 6.262 2.872 4.906 1.676\n", "setosa 5.006 3.428 1.462 0.246" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "iris = sns.load_dataset(\"iris\")\n", "print(iris.species.unique())\n", "iris.set_index(\"species\").groupby(\n", " {\n", " \"setosa\": \"setosa\",\n", " \"versicolor\": \"other\",\n", " \"virginica\": \"other\"\n", " }).mean()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "ExecuteTime": { "end_time": "2017-02-06T00:50:03.644430", "start_time": "2017-02-06T00:50:03.608807" }, "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sepal_lengthsepal_widthpetal_lengthpetal_width
species
False6.2622.8724.9061.676
True5.0063.4281.4620.246
\n", "
" ], "text/plain": [ " sepal_length sepal_width petal_length petal_width\n", "species \n", "False 6.262 2.872 4.906 1.676\n", "True 5.006 3.428 1.462 0.246" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "iris.groupby(iris.species == \"setosa\").mean()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 6, "metadata": { "ExecuteTime": { "end_time": "2017-02-06T00:50:26.808244", "start_time": "2017-02-06T00:50:26.800725" }, "collapsed": false, "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "Timedelta('1 days 12:34:56')" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.to_timedelta(\"1days 12:34:56\")" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "ExecuteTime": { "end_time": "2017-02-06T00:50:33.509220", "start_time": "2017-02-06T00:50:33.416660" }, "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 59 entries, 0 to 58\n", "Data columns (total 2 columns):\n", "da 59 non-null datetime64[ns]\n", "de 59 non-null timedelta64[ns]\n", "dtypes: datetime64[ns](1), timedelta64[ns](1)\n", "memory usage: 1.0 KB\n" ] }, { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
de
dada
2017-01-01110
2017-01-11110
2017-01-21110
2017-01-3111
29
2017-02-10210
2017-02-2029
\n", "
" ], "text/plain": [ " de\n", "da da \n", "2017-01-01 1 10\n", "2017-01-11 1 10\n", "2017-01-21 1 10\n", "2017-01-31 1 1\n", " 2 9\n", "2017-02-10 2 10\n", "2017-02-20 2 9" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dates = pd.DataFrame({\n", " \"da\": pd.date_range(\"2017-01-01\", \"2017-02-28\"),\n", " \"de\": pd.timedelta_range(\"1days 12:34:56.789012345\", freq=\"1H\", periods=31+28)\n", "})\n", "dates.info()\n", "dates.groupby(\n", " [\n", " pd.Grouper(freq=\"10d\", key=\"da\"),\n", " dates.da.dt.month,\n", " ]).count()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "ExecuteTime": { "end_time": "2017-02-06T00:50:41.765359", "start_time": "2017-02-06T00:50:41.743117" }, "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
dayshoursminutessecondsmillisecondsmicrosecondsnanoseconds
0112345678912345
1113345678912345
2114345678912345
3115345678912345
4116345678912345
\n", "
" ], "text/plain": [ " days hours minutes seconds milliseconds microseconds nanoseconds\n", "0 1 12 34 56 789 12 345\n", "1 1 13 34 56 789 12 345\n", "2 1 14 34 56 789 12 345\n", "3 1 15 34 56 789 12 345\n", "4 1 16 34 56 789 12 345" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dates.de.dt.components.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "hide_input": false, "kernelspec": { "display_name": "Python [default]", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" }, "toc": { "toc_cell": false, "toc_number_sections": true, "toc_threshold": 6, "toc_window_display": false } }, "nbformat": 4, "nbformat_minor": 0 }