groupby

In [5]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
iris = sns.load_dataset("iris")
In [5]:
iris.groupby(iris.species).sum()
Out[5]:
sepal_length sepal_width petal_length petal_width
species
setosa 250.3 171.4 73.1 12.3
versicolor 296.8 138.5 213.0 66.3
virginica 329.4 148.7 277.6 101.3
In [6]:
def f(df):
    print(df.species.unique())
    return df[df.species != "setosa"].sum()

iris.groupby(iris.species).apply(f)
['setosa']
['setosa']
['versicolor']
['virginica']
Out[6]:
sepal_length sepal_width petal_length petal_width species
species
setosa 0.0 0.0 0.0 0.0 0
versicolor 296.8 138.5 213.0 66.3 versicolorversicolorversicolorversicolorversic...
virginica 329.4 148.7 277.6 101.3 virginicavirginicavirginicavirginicavirginicav...
In [11]:
iris.groupby(iris.species).transform(lambda x: print(x.head(2))).head(2)
0    5.1
1    4.9
Name: sepal_length, dtype: float64
0    3.5
1    3.0
Name: sepal_width, dtype: float64
0    1.4
1    1.4
Name: petal_length, dtype: float64
0    0.2
1    0.2
Name: petal_width, dtype: float64
   sepal_length  sepal_width  petal_length  petal_width
0           5.1          3.5           1.4          0.2
1           4.9          3.0           1.4          0.2
50    7.0
51    6.4
Name: sepal_length, dtype: float64
50    3.2
51    3.2
Name: sepal_width, dtype: float64
50    4.7
51    4.5
Name: petal_length, dtype: float64
50    1.4
51    1.5
Name: petal_width, dtype: float64
100    6.3
101    5.8
Name: sepal_length, dtype: float64
100    3.3
101    2.7
Name: sepal_width, dtype: float64
100    6.0
101    5.1
Name: petal_length, dtype: float64
100    2.5
101    1.9
Name: petal_width, dtype: float64
Out[11]:
sepal_length sepal_width petal_length petal_width
0 NaN NaN NaN NaN
1 NaN NaN NaN NaN
In [13]:
iris.groupby(iris.species)["sepal_length"].transform(lambda x: print(x.head(2))).head(2)
0    5.1
1    4.9
Name: setosa, dtype: float64
50    7.0
51    6.4
Name: versicolor, dtype: float64
100    6.3
101    5.8
Name: virginica, dtype: float64
Out[13]:
0   NaN
1   NaN
Name: sepal_length, dtype: float64
In [16]:
iris.groupby(iris.species).transform(np.sum).iloc[[0,1,50,51,100,101]]
Out[16]:
sepal_length sepal_width petal_length petal_width
0 250.3 171.4 73.1 12.3
1 250.3 171.4 73.1 12.3
50 296.8 138.5 213.0 66.3
51 296.8 138.5 213.0 66.3
100 329.4 148.7 277.6 101.3
101 329.4 148.7 277.6 101.3
In [17]:
iris.groupby(iris.species).transform(pd.Series.cumsum).iloc[[0,1,50,51,100,101]]
Out[17]:
sepal_length sepal_width petal_length petal_width
0 5.1 3.5 1.4 0.2
1 10.0 6.5 2.8 0.4
50 7.0 3.2 4.7 1.4
51 13.4 6.4 9.2 2.9
100 6.3 3.3 6.0 2.5
101 12.1 6.0 11.1 4.4
In [18]:
iris.groupby(iris.species)["sepal_length"].transform(pd.Series.cumsum).iloc[[0,1,50,51,100,101]]
Out[18]:
0       5.1
1      10.0
50      7.0
51     13.4
100     6.3
101    12.1
Name: sepal_length, dtype: float64
In [ ]:

In [1]:
import seaborn as sns
planets = sns.load_dataset("planets")
planets.groupby('method')['year'].describe().unstack(level=0)  #.unstack("method")
Out[1]:
method Astrometry Eclipse Timing Variations Imaging Microlensing Orbital Brightness Modulation Pulsar Timing Pulsation Timing Variations Radial Velocity Transit Transit Timing Variations
count 2.00000 9.000000 38.000000 23.000000 3.000000 5.00000 1.0 553.000000 397.000000 4.000000
mean 2011.50000 2010.000000 2009.131579 2009.782609 2011.666667 1998.40000 2007.0 2007.518987 2011.236776 2012.500000
std 2.12132 1.414214 2.781901 2.859697 1.154701 8.38451 NaN 4.249052 2.077867 1.290994
min 2010.00000 2008.000000 2004.000000 2004.000000 2011.000000 1992.00000 2007.0 1989.000000 2002.000000 2011.000000
25% 2010.75000 2009.000000 2008.000000 2008.000000 2011.000000 1992.00000 2007.0 2005.000000 2010.000000 2011.750000
50% 2011.50000 2010.000000 2009.000000 2010.000000 2011.000000 1994.00000 2007.0 2009.000000 2012.000000 2012.500000
75% 2012.25000 2011.000000 2011.000000 2012.000000 2012.000000 2003.00000 2007.0 2011.000000 2013.000000 2013.250000
max 2013.00000 2012.000000 2013.000000 2013.000000 2013.000000 2011.00000 2007.0 2014.000000 2014.000000 2014.000000
In [2]:
iris = sns.load_dataset("iris")
print(iris.species.unique())
iris.set_index("species").groupby(
    {
        "setosa": "setosa",
        "versicolor": "other",
        "virginica": "other"
    }).mean()
['setosa' 'versicolor' 'virginica']
Out[2]:
sepal_length sepal_width petal_length petal_width
other 6.262 2.872 4.906 1.676
setosa 5.006 3.428 1.462 0.246
In [3]:
iris.groupby(iris.species == "setosa").mean()
Out[3]:
sepal_length sepal_width petal_length petal_width
species
False 6.262 2.872 4.906 1.676
True 5.006 3.428 1.462 0.246
In [ ]:

In [6]:
pd.to_timedelta("1days 12:34:56")
Out[6]:
Timedelta('1 days 12:34:56')
In [7]:
dates = pd.DataFrame({
    "da": pd.date_range("2017-01-01", "2017-02-28"),
    "de": pd.timedelta_range("1days 12:34:56.789012345", freq="1H", periods=31+28)
})
dates.info()
dates.groupby(
    [
        pd.Grouper(freq="10d", key="da"),
        dates.da.dt.month,
    ]).count()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59 entries, 0 to 58
Data columns (total 2 columns):
da    59 non-null datetime64[ns]
de    59 non-null timedelta64[ns]
dtypes: datetime64[ns](1), timedelta64[ns](1)
memory usage: 1.0 KB
Out[7]:
de
da da
2017-01-01 1 10
2017-01-11 1 10
2017-01-21 1 10
2017-01-31 1 1
2 9
2017-02-10 2 10
2017-02-20 2 9
In [8]:
dates.de.dt.components.head()
Out[8]:
days hours minutes seconds milliseconds microseconds nanoseconds
0 1 12 34 56 789 12 345
1 1 13 34 56 789 12 345
2 1 14 34 56 789 12 345
3 1 15 34 56 789 12 345
4 1 16 34 56 789 12 345
In [ ]: