groupby¶
In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
In [2]:
iris = sns.load_dataset("iris")
In [5]:
iris.groupby(iris.species).sum()
Out[5]:
sepal_length | sepal_width | petal_length | petal_width | |
---|---|---|---|---|
species | ||||
setosa | 250.3 | 171.4 | 73.1 | 12.3 |
versicolor | 296.8 | 138.5 | 213.0 | 66.3 |
virginica | 329.4 | 148.7 | 277.6 | 101.3 |
In [6]:
def f(df):
print(df.species.unique())
return df[df.species != "setosa"].sum()
iris.groupby(iris.species).apply(f)
['setosa']
['setosa']
['versicolor']
['virginica']
Out[6]:
sepal_length | sepal_width | petal_length | petal_width | species | |
---|---|---|---|---|---|
species | |||||
setosa | 0.0 | 0.0 | 0.0 | 0.0 | 0 |
versicolor | 296.8 | 138.5 | 213.0 | 66.3 | versicolorversicolorversicolorversicolorversic... |
virginica | 329.4 | 148.7 | 277.6 | 101.3 | virginicavirginicavirginicavirginicavirginicav... |
In [11]:
iris.groupby(iris.species).transform(lambda x: print(x.head(2))).head(2)
0 5.1
1 4.9
Name: sepal_length, dtype: float64
0 3.5
1 3.0
Name: sepal_width, dtype: float64
0 1.4
1 1.4
Name: petal_length, dtype: float64
0 0.2
1 0.2
Name: petal_width, dtype: float64
sepal_length sepal_width petal_length petal_width
0 5.1 3.5 1.4 0.2
1 4.9 3.0 1.4 0.2
50 7.0
51 6.4
Name: sepal_length, dtype: float64
50 3.2
51 3.2
Name: sepal_width, dtype: float64
50 4.7
51 4.5
Name: petal_length, dtype: float64
50 1.4
51 1.5
Name: petal_width, dtype: float64
100 6.3
101 5.8
Name: sepal_length, dtype: float64
100 3.3
101 2.7
Name: sepal_width, dtype: float64
100 6.0
101 5.1
Name: petal_length, dtype: float64
100 2.5
101 1.9
Name: petal_width, dtype: float64
Out[11]:
sepal_length | sepal_width | petal_length | petal_width | |
---|---|---|---|---|
0 | NaN | NaN | NaN | NaN |
1 | NaN | NaN | NaN | NaN |
In [13]:
iris.groupby(iris.species)["sepal_length"].transform(lambda x: print(x.head(2))).head(2)
0 5.1
1 4.9
Name: setosa, dtype: float64
50 7.0
51 6.4
Name: versicolor, dtype: float64
100 6.3
101 5.8
Name: virginica, dtype: float64
Out[13]:
0 NaN
1 NaN
Name: sepal_length, dtype: float64
In [16]:
iris.groupby(iris.species).transform(np.sum).iloc[[0,1,50,51,100,101]]
Out[16]:
sepal_length | sepal_width | petal_length | petal_width | |
---|---|---|---|---|
0 | 250.3 | 171.4 | 73.1 | 12.3 |
1 | 250.3 | 171.4 | 73.1 | 12.3 |
50 | 296.8 | 138.5 | 213.0 | 66.3 |
51 | 296.8 | 138.5 | 213.0 | 66.3 |
100 | 329.4 | 148.7 | 277.6 | 101.3 |
101 | 329.4 | 148.7 | 277.6 | 101.3 |
In [17]:
iris.groupby(iris.species).transform(pd.Series.cumsum).iloc[[0,1,50,51,100,101]]
Out[17]:
sepal_length | sepal_width | petal_length | petal_width | |
---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 |
1 | 10.0 | 6.5 | 2.8 | 0.4 |
50 | 7.0 | 3.2 | 4.7 | 1.4 |
51 | 13.4 | 6.4 | 9.2 | 2.9 |
100 | 6.3 | 3.3 | 6.0 | 2.5 |
101 | 12.1 | 6.0 | 11.1 | 4.4 |
In [18]:
iris.groupby(iris.species)["sepal_length"].transform(pd.Series.cumsum).iloc[[0,1,50,51,100,101]]
Out[18]:
0 5.1
1 10.0
50 7.0
51 13.4
100 6.3
101 12.1
Name: sepal_length, dtype: float64
In [ ]:
In [1]:
import seaborn as sns
planets = sns.load_dataset("planets")
planets.groupby('method')['year'].describe().unstack(level=0) #.unstack("method")
Out[1]:
method | Astrometry | Eclipse Timing Variations | Imaging | Microlensing | Orbital Brightness Modulation | Pulsar Timing | Pulsation Timing Variations | Radial Velocity | Transit | Transit Timing Variations |
---|---|---|---|---|---|---|---|---|---|---|
count | 2.00000 | 9.000000 | 38.000000 | 23.000000 | 3.000000 | 5.00000 | 1.0 | 553.000000 | 397.000000 | 4.000000 |
mean | 2011.50000 | 2010.000000 | 2009.131579 | 2009.782609 | 2011.666667 | 1998.40000 | 2007.0 | 2007.518987 | 2011.236776 | 2012.500000 |
std | 2.12132 | 1.414214 | 2.781901 | 2.859697 | 1.154701 | 8.38451 | NaN | 4.249052 | 2.077867 | 1.290994 |
min | 2010.00000 | 2008.000000 | 2004.000000 | 2004.000000 | 2011.000000 | 1992.00000 | 2007.0 | 1989.000000 | 2002.000000 | 2011.000000 |
25% | 2010.75000 | 2009.000000 | 2008.000000 | 2008.000000 | 2011.000000 | 1992.00000 | 2007.0 | 2005.000000 | 2010.000000 | 2011.750000 |
50% | 2011.50000 | 2010.000000 | 2009.000000 | 2010.000000 | 2011.000000 | 1994.00000 | 2007.0 | 2009.000000 | 2012.000000 | 2012.500000 |
75% | 2012.25000 | 2011.000000 | 2011.000000 | 2012.000000 | 2012.000000 | 2003.00000 | 2007.0 | 2011.000000 | 2013.000000 | 2013.250000 |
max | 2013.00000 | 2012.000000 | 2013.000000 | 2013.000000 | 2013.000000 | 2011.00000 | 2007.0 | 2014.000000 | 2014.000000 | 2014.000000 |
In [2]:
iris = sns.load_dataset("iris")
print(iris.species.unique())
iris.set_index("species").groupby(
{
"setosa": "setosa",
"versicolor": "other",
"virginica": "other"
}).mean()
['setosa' 'versicolor' 'virginica']
Out[2]:
sepal_length | sepal_width | petal_length | petal_width | |
---|---|---|---|---|
other | 6.262 | 2.872 | 4.906 | 1.676 |
setosa | 5.006 | 3.428 | 1.462 | 0.246 |
In [3]:
iris.groupby(iris.species == "setosa").mean()
Out[3]:
sepal_length | sepal_width | petal_length | petal_width | |
---|---|---|---|---|
species | ||||
False | 6.262 | 2.872 | 4.906 | 1.676 |
True | 5.006 | 3.428 | 1.462 | 0.246 |
In [ ]:
In [6]:
pd.to_timedelta("1days 12:34:56")
Out[6]:
Timedelta('1 days 12:34:56')
In [7]:
dates = pd.DataFrame({
"da": pd.date_range("2017-01-01", "2017-02-28"),
"de": pd.timedelta_range("1days 12:34:56.789012345", freq="1H", periods=31+28)
})
dates.info()
dates.groupby(
[
pd.Grouper(freq="10d", key="da"),
dates.da.dt.month,
]).count()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59 entries, 0 to 58
Data columns (total 2 columns):
da 59 non-null datetime64[ns]
de 59 non-null timedelta64[ns]
dtypes: datetime64[ns](1), timedelta64[ns](1)
memory usage: 1.0 KB
Out[7]:
de | ||
---|---|---|
da | da | |
2017-01-01 | 1 | 10 |
2017-01-11 | 1 | 10 |
2017-01-21 | 1 | 10 |
2017-01-31 | 1 | 1 |
2 | 9 | |
2017-02-10 | 2 | 10 |
2017-02-20 | 2 | 9 |
In [8]:
dates.de.dt.components.head()
Out[8]:
days | hours | minutes | seconds | milliseconds | microseconds | nanoseconds | |
---|---|---|---|---|---|---|---|
0 | 1 | 12 | 34 | 56 | 789 | 12 | 345 |
1 | 1 | 13 | 34 | 56 | 789 | 12 | 345 |
2 | 1 | 14 | 34 | 56 | 789 | 12 | 345 |
3 | 1 | 15 | 34 | 56 | 789 | 12 | 345 |
4 | 1 | 16 | 34 | 56 | 789 | 12 | 345 |
In [ ]: