Pandas tips

Pandas groupby
Author

Nipun Batra

Published

January 12, 2023

import pandas as pd
import numpy as np

Dataset creation

# Create a DataFrame for whether or not a person plays tennis. 
# It has discrete features and 14 rows.

df = pd.DataFrame({
    'Outlook': ['Sunny', 'Sunny', 'Overcast', 'Rain', 'Rain', 'Rain', 'Overcast', 'Sunny', 'Sunny', 'Rain', 'Sunny', 'Overcast', 'Overcast', 'Rain'],
    'Temperature': ['Hot', 'Hot', 'Hot', 'Mild', 'Cool', 'Cool', 'Cool', 'Mild', 'Cool', 'Mild', 'Mild', 'Mild', 'Hot', 'Mild'],
    'Humidity': ['High', 'High', 'High', 'High', 'Normal', 'Normal', 'Normal', 'High', 'Normal', 'Normal', 'Normal', 'High', 'Normal', 'High'],
    'Wind': ['Weak', 'Strong', 'Weak', 'Weak', 'Weak', 'Strong', 'Strong', 'Weak', 'Weak', 'Weak', 'Strong', 'Strong', 'Weak', 'Strong'],
    'PlayTennis': ['No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No']
})

Counting values of outcome variable (useful for calculating entropy)

ser = df['PlayTennis'].value_counts()
ser
Yes    9
No     5
Name: PlayTennis, dtype: int64

Using cross tab to quickly capture the relationship between two variables

df_out = pd.crosstab(df['Outlook'], df['PlayTennis'])
df_out
PlayTennis No Yes
Outlook
Overcast 0 4
Rain 2 3
Sunny 3 2
df_out.index, df_out.columns
(Index(['Overcast', 'Rain', 'Sunny'], dtype='object', name='Outlook'),
 Index(['No', 'Yes'], dtype='object', name='PlayTennis'))
e = df_out.sum(axis=1)
e
Outlook
Overcast    4
Rain        5
Sunny       5
dtype: int64
# Find the fraction of each row that is a 'Yes' and 'No' for PlayTennis
df_out['Yes'] = df_out['Yes'] / e
df_out['No'] = df_out['No'] / e
df_out
PlayTennis No Yes
Outlook
Overcast 0.0 1.0
Rain 0.4 0.6
Sunny 0.6 0.4

More efficient cross tabulation (using normalize)

pd.crosstab(df['Outlook'], df['PlayTennis'], normalize='index')
PlayTennis No Yes
Outlook
Overcast 0.0 1.0
Rain 0.4 0.6
Sunny 0.6 0.4

Using pd.groupby()

df.groupby(["Outlook"]).groups
{'Overcast': [2, 6, 11, 12], 'Rain': [3, 4, 5, 9, 13], 'Sunny': [0, 1, 7, 8, 10]}
df.groupby(["Outlook"]).get_group("Sunny")
Outlook Temperature Humidity Wind PlayTennis
0 Sunny Hot High Weak No
1 Sunny Hot High Strong No
7 Sunny Mild High Weak No
8 Sunny Cool Normal Weak Yes
10 Sunny Mild Normal Strong Yes
df.groupby(["Outlook"]).get_group("Sunny")["PlayTennis"]
0      No
1      No
7      No
8     Yes
10    Yes
Name: PlayTennis, dtype: object
df.groupby(["Outlook"]).get_group("Sunny")["PlayTennis"].value_counts()
No     3
Yes    2
Name: PlayTennis, dtype: int64
df.groupby(["Outlook", "PlayTennis"]).groups
{('Overcast', 'Yes'): [2, 6, 11, 12], ('Rain', 'No'): [5, 13], ('Rain', 'Yes'): [3, 4, 9], ('Sunny', 'No'): [0, 1, 7], ('Sunny', 'Yes'): [8, 10]}
df.groupby(["Outlook", "PlayTennis"]).get_group(("Sunny", "Yes"))
Outlook Temperature Humidity Wind PlayTennis
8 Sunny Cool Normal Weak Yes
10 Sunny Mild Normal Strong Yes
df.groupby(["Outlook", "PlayTennis"]).size()
Outlook   PlayTennis
Overcast  Yes           4
Rain      No            2
          Yes           3
Sunny     No            3
          Yes           2
dtype: int64
df.groupby(["Outlook", "PlayTennis"]).size().index
MultiIndex([('Overcast', 'Yes'),
            (    'Rain',  'No'),
            (    'Rain', 'Yes'),
            (   'Sunny',  'No'),
            (   'Sunny', 'Yes')],
           names=['Outlook', 'PlayTennis'])
df.groupby(["Outlook", "PlayTennis"]).size().unstack()
PlayTennis No Yes
Outlook
Overcast NaN 4.0
Rain 2.0 3.0
Sunny 3.0 2.0
df.groupby(["Outlook", "PlayTennis"]).size().unstack(fill_value=0)
PlayTennis No Yes
Outlook
Overcast 0 4
Rain 2 3
Sunny 3 2

Using pd.pivot_table

pivot_table = df.pivot_table(index='Outlook', columns='PlayTennis', aggfunc='size', fill_value=0)
pivot_table
PlayTennis No Yes
Outlook
Overcast 0 4
Rain 2 3
Sunny 3 2
df_out.plot(kind='bar', stacked=True)
<AxesSubplot:xlabel='Outlook'>

df_out.plot(kind='bar', stacked=False)
<AxesSubplot:xlabel='Outlook'>