import pandas as pd
import numpy as npPivot Table and Grouping in Pandas
Create a dataset
### Create a dataset
import pandas as pd
import numpy as np
# create categorical variables
age_group = pd.Categorical(["18-24", "25-34", "35-44", "45-54", "55-64", "65+", "18-24", "25-34", "35-44", "45-54", "55-64", "65+", "18-24", "25-34", "35-44"])
marital_status = pd.Categorical(["Married", "Single", "Married", "Single", "Widowed", "Divorced", "Married", "Single", "Married", "Single", "Widowed", "Divorced", "Married", "Single", "Married"])
employment_status = pd.Categorical(["Employed", "Unemployed", "Employed", "Unemployed", "Retired", "Employed", "Unemployed", "Employed", "Unemployed", "Employed", "Retired", "Unemployed", "Employed", "Unemployed", "Retired"])
gender = pd.Categorical(["Male", "Female", "Male", "Female", "Male", "Female", "Male", "Female", "Male", "Female", "Male", "Female", "Male", "Female", "Male"])
# create numeric variables
income = np.random.normal(50000, 10000, 15)
education_years = np.random.randint(8, 16, 15)
number_of_children = np.random.randint(0, 5, 15)
hours_worked_per_week = np.random.randint(20, 60, 15)
# create census dataset
census_data = pd.DataFrame({"age_group": age_group, "marital_status": marital_status, "employment_status": employment_status, "gender": gender, "income": income, "education_years": education_years, "number_of_children": number_of_children, "hours_worked_per_week": hours_worked_per_week})
census_data.head(3)| age_group | marital_status | employment_status | gender | income | education_years | number_of_children | hours_worked_per_week | |
|---|---|---|---|---|---|---|---|---|
| 0 | 18-24 | Married | Employed | Male | 54327.605016 | 12 | 2 | 45 | 
| 1 | 25-34 | Single | Unemployed | Female | 51258.164750 | 14 | 3 | 38 | 
| 2 | 35-44 | Married | Employed | Male | 52939.298442 | 9 | 2 | 42 | 
Pivot Table
# what is the average education years for each age_group and employment_status?(census_data
.pivot_table(index='age_group', columns='employment_status',
    values='education_years', aggfunc='mean')
)| employment_status | Employed | Retired | Unemployed | 
|---|---|---|---|
| age_group | |||
| 18-24 | 10.5 | NaN | 12.0 | 
| 25-34 | 8.0 | NaN | 14.5 | 
| 35-44 | 9.0 | 9.0 | 11.0 | 
| 45-54 | 8.0 | NaN | 15.0 | 
| 55-64 | NaN | 14.5 | NaN | 
| 65+ | 11.0 | NaN | 15.0 | 
# Using Max as aggregate function
(census_data
.pivot_table(index='age_group', columns='employment_status',
    values='education_years', aggfunc='max')
)| employment_status | Employed | Retired | Unemployed | 
|---|---|---|---|
| age_group | |||
| 18-24 | 12.0 | NaN | 12.0 | 
| 25-34 | 8.0 | NaN | 15.0 | 
| 35-44 | 9.0 | 9.0 | 11.0 | 
| 45-54 | 8.0 | NaN | 15.0 | 
| 55-64 | NaN | 15.0 | NaN | 
| 65+ | 11.0 | NaN | 15.0 | 
Using multiple aggregations
# Find the Min and Max Incomes for all the age groups
(census_data.
    pivot_table(index='age_group', 
         values='income', aggfunc=['min', 'max'])
)| min | max | |
|---|---|---|
| income | income | |
| age_group | ||
| 18-24 | 53176.552487 | 65210.548202 | 
| 25-34 | 32718.963301 | 63933.909067 | 
| 35-44 | 52939.298442 | 66377.414798 | 
| 45-54 | 38553.347657 | 61113.975298 | 
| 55-64 | 46070.260269 | 51285.443271 | 
| 65+ | 42717.383692 | 51510.970139 | 
(census_data.
    pivot_table(index='age_group', 
         values=['income','education_years'], aggfunc=['min', 'max'])
)| min | max | |||
|---|---|---|---|---|
| education_years | income | education_years | income | |
| age_group | ||||
| 18-24 | 9 | 53176.552487 | 12 | 65210.548202 | 
| 25-34 | 8 | 32718.963301 | 15 | 63933.909067 | 
| 35-44 | 9 | 52939.298442 | 11 | 66377.414798 | 
| 45-54 | 8 | 38553.347657 | 15 | 61113.975298 | 
| 55-64 | 14 | 46070.260269 | 15 | 51285.443271 | 
| 65+ | 11 | 42717.383692 | 15 | 51510.970139 | 
Using custom aggregate function in pivot_table
# We can provide a custom aggfunc function to the pivot_table 
# Let us calculate the percentage of people employment for a given age_group
def percentage_of_employment_status(ser):
    return ser.str.contains('Employed').sum() / len(ser) * 100
# We are not provide 'columns' here because we need a single column output
# The custom aggregate function will be applied on the values 'employment_status'
(census_data
.pivot_table(index='age_group', 
    values='employment_status', aggfunc=percentage_of_employment_status)
)| employment_status | |
|---|---|
| age_group | |
| 18-24 | 66.666667 | 
| 25-34 | 33.333333 | 
| 35-44 | 33.333333 | 
| 45-54 | 50.000000 | 
| 55-64 | 0.000000 | 
| 65+ | 50.000000 | 
Different Aggregations per column
# For each gender what is the average Income and max hours_worked_per_week
(census_data.
    pivot_table(index='gender',
        aggfunc={'income': 'mean', 'hours_worked_per_week': 'min'})
)| hours_worked_per_week | income | |
|---|---|---|
| gender | ||
| Female | 29 | 48829.530558 | 
| Male | 26 | 55967.094844 | 
Groupby

groupbymethod is lazy and does not perform an aggregation until we specify which aggregation to perform
# This will create a multi-Index series
# The columns in groupby will become the index of the series or dataframe
# we need to unstack the index to get the pivot table
census_data.groupby(['age_group','employment_status'])['education_years'].mean()age_group  employment_status
18-24      Employed             10.5
           Retired               NaN
           Unemployed           12.0
25-34      Employed              8.0
           Retired               NaN
           Unemployed           14.5
35-44      Employed              9.0
           Retired               9.0
           Unemployed           11.0
45-54      Employed              8.0
           Retired               NaN
           Unemployed           15.0
55-64      Employed              NaN
           Retired              14.5
           Unemployed            NaN
65+        Employed             11.0
           Retired               NaN
           Unemployed           15.0
Name: education_years, dtype: float64
# The below code will unstack the multi-Index and create a pivot table
# The column names are still multi-Index
# We can rename the column names for convineance
census_data.groupby(['age_group','employment_status'])['education_years'].mean().unstack('employment_status')| employment_status | Employed | Retired | Unemployed | 
|---|---|---|---|
| age_group | |||
| 18-24 | 10.5 | NaN | 12.0 | 
| 25-34 | 8.0 | NaN | 14.5 | 
| 35-44 | 9.0 | 9.0 | 11.0 | 
| 45-54 | 8.0 | NaN | 15.0 | 
| 55-64 | NaN | 14.5 | NaN | 
| 65+ | 11.0 | NaN | 15.0 | 
# better coding format
(census_data
.groupby(['age_group','employment_status'])
.education_years
.mean()
.unstack('employment_status')
)| employment_status | Employed | Retired | Unemployed | 
|---|---|---|---|
| age_group | |||
| 18-24 | 10.5 | NaN | 12.0 | 
| 25-34 | 8.0 | NaN | 14.5 | 
| 35-44 | 9.0 | 9.0 | 11.0 | 
| 45-54 | 8.0 | NaN | 15.0 | 
| 55-64 | NaN | 14.5 | NaN | 
| 65+ | 11.0 | NaN | 15.0 | 
Groupby using custom Aggregate function
(census_data   # dataframe on which to perform groupby
.groupby(['age_group']) # columns to group by
## Pull out the required column on which to perform aggregation
[['employment_status']] # column on which to perform aggregation operation. As we are using '[[' here, the output will be a dataframe. If using '[', the output will be series
.agg(percentage_of_employment_status) # using custom aggregate function
)| employment_status | |
|---|---|
| age_group | |
| 18-24 | 66.666667 | 
| 25-34 | 33.333333 | 
| 35-44 | 33.333333 | 
| 45-54 | 50.000000 | 
| 55-64 | 0.000000 | 
| 65+ | 50.000000 | 
groupby aggregation on multiple columns
# Find the Min and Max Incomes for all the age groups
(census_data.
    groupby(['age_group'])
    [['income', 'education_years']]
    .agg(['min', 'max'])
)| income | education_years | |||
|---|---|---|---|---|
| min | max | min | max | |
| age_group | ||||
| 18-24 | 53176.552487 | 65210.548202 | 9 | 12 | 
| 25-34 | 32718.963301 | 63933.909067 | 8 | 15 | 
| 35-44 | 52939.298442 | 66377.414798 | 9 | 11 | 
| 45-54 | 38553.347657 | 61113.975298 | 8 | 15 | 
| 55-64 | 46070.260269 | 51285.443271 | 14 | 15 | 
| 65+ | 42717.383692 | 51510.970139 | 11 | 15 | 
Different aggregations per column
# For each gender what is the average Income and max hours_worked_per_week
(census_data
    .groupby('gender')
    .agg({'income': ['min','max','mean'], 'hours_worked_per_week': 'min'})
)| income | hours_worked_per_week | |||
|---|---|---|---|---|
| min | max | mean | min | |
| gender | ||||
| Female | 32718.963301 | 63933.909067 | 48829.530558 | 29 | 
| Male | 46070.260269 | 66377.414798 | 55967.094844 | 26 | 
Named Aggregations
- We can use this functionality to name the columns when using groupby
 - We can use keyword parameter in 
aggfunction, which will turn into a column name - The input for keyword parameter will be a tuple - (column_name, agg_function)
 - Use Named Aggregations for flat column names
 - When performing groupby with multiple categorical columns, pandas will create the cartesian product of those columns even if there is no corresponding value.
 - This is not available for pivot_table
 
(census_data
.groupby('gender')
.agg(min_income=('income','min'),
    max_income=('income','max'),
    mean_income=('income','mean'),
    min_hours_worked_per_week=('hours_worked_per_week','min'),
    max_hours_worked_per_week=('hours_worked_per_week','max')
    )
)| min_income | max_income | mean_income | min_hours_worked_per_week | max_hours_worked_per_week | |
|---|---|---|---|---|---|
| gender | |||||
| Female | 32718.963301 | 63933.909067 | 48829.530558 | 29 | 54 | 
| Male | 46070.260269 | 66377.414798 | 55967.094844 | 26 | 59 | 
Grouping with Functions
def even_grouper(idx):
    return 'odd' if idx % 2  else 'even'census_data.pivot_table(index=even_grouper, aggfunc='size')even    8
odd     7
dtype: int64
(census_data
.groupby(even_grouper)
.size()
)even    8
odd     7
dtype: int64
Stacking and Unstacking
- Unstack moves an index into the columns
 - We use this on multi-index data, moving one of the indices into the columns
 - stack method does the reverse, moving multi-level column into the index
 
(census_data
.groupby(['age_group','employment_status'])
.size()
)age_group  employment_status
18-24      Employed             2
           Retired              0
           Unemployed           1
25-34      Employed             1
           Retired              0
           Unemployed           2
35-44      Employed             1
           Retired              1
           Unemployed           1
45-54      Employed             1
           Retired              0
           Unemployed           1
55-64      Employed             0
           Retired              2
           Unemployed           0
65+        Employed             1
           Retired              0
           Unemployed           1
dtype: int64
# Unstacking a series into a dataframe
(census_data
.groupby(['age_group','employment_status'])
.size()
.unstack('employment_status'))| employment_status | Employed | Retired | Unemployed | 
|---|---|---|---|
| age_group | |||
| 18-24 | 2 | 0 | 1 | 
| 25-34 | 1 | 0 | 2 | 
| 35-44 | 1 | 1 | 1 | 
| 45-54 | 1 | 0 | 1 | 
| 55-64 | 0 | 2 | 0 | 
| 65+ | 1 | 0 | 1 | 
# Unstacking a index of a dataframe into nested columns
(census_data
.groupby(['age_group','employment_status'])
.mean(numeric_only=True)
.unstack('employment_status'))| income | education_years | number_of_children | hours_worked_per_week | |||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| employment_status | Employed | Retired | Unemployed | Employed | Retired | Unemployed | Employed | Retired | Unemployed | Employed | Retired | Unemployed | 
| age_group | ||||||||||||
| 18-24 | 53752.078752 | NaN | 65210.548202 | 10.5 | NaN | 12.0 | 1.0 | NaN | 4.0 | 35.5 | NaN | 36.0 | 
| 25-34 | 63933.909067 | NaN | 41988.564026 | 8.0 | NaN | 14.5 | 4.0 | NaN | 1.5 | 33.0 | NaN | 46.0 | 
| 35-44 | 52939.298442 | 58349.636265 | 66377.414798 | 9.0 | 9.0 | 11.0 | 2.0 | 1.0 | 2.0 | 42.0 | 51.0 | 51.0 | 
| 45-54 | 61113.975298 | NaN | 38553.347657 | 8.0 | NaN | 15.0 | 0.0 | NaN | 0.0 | 34.0 | NaN | 29.0 | 
| 55-64 | NaN | 48677.851770 | NaN | NaN | 14.5 | NaN | NaN | 2.5 | NaN | NaN | 55.5 | NaN | 
| 65+ | 42717.383692 | NaN | 51510.970139 | 11.0 | NaN | 15.0 | 0.0 | NaN | 4.0 | 53.0 | NaN | 35.0 | 
# Stacking
# We take nested columns and send it to index
# This is the opposite of stacking
# The outermost column is 0 and the inner columns are 1
(census_data.
    pivot_table(index='age_group', 
          aggfunc={'income':['min','max'], 'hours_worked_per_week':['min','max']})
)| hours_worked_per_week | income | |||
|---|---|---|---|---|
| max | min | max | min | |
| age_group | ||||
| 18-24 | 45 | 26 | 65210.548202 | 53176.552487 | 
| 25-34 | 54 | 33 | 63933.909067 | 32718.963301 | 
| 35-44 | 51 | 42 | 66377.414798 | 52939.298442 | 
| 45-54 | 34 | 29 | 61113.975298 | 38553.347657 | 
| 55-64 | 59 | 52 | 51285.443271 | 46070.260269 | 
| 65+ | 53 | 35 | 51510.970139 | 42717.383692 | 
# Stacking the outer column
(census_data.
    pivot_table(index='age_group', 
          aggfunc={'income':['min','max'], 'hours_worked_per_week':['min','max']})
    .stack(0)
)| max | min | ||
|---|---|---|---|
| age_group | |||
| 18-24 | hours_worked_per_week | 45.000000 | 26.000000 | 
| income | 65210.548202 | 53176.552487 | |
| 25-34 | hours_worked_per_week | 54.000000 | 33.000000 | 
| income | 63933.909067 | 32718.963301 | |
| 35-44 | hours_worked_per_week | 51.000000 | 42.000000 | 
| income | 66377.414798 | 52939.298442 | |
| 45-54 | hours_worked_per_week | 34.000000 | 29.000000 | 
| income | 61113.975298 | 38553.347657 | |
| 55-64 | hours_worked_per_week | 59.000000 | 52.000000 | 
| income | 51285.443271 | 46070.260269 | |
| 65+ | hours_worked_per_week | 53.000000 | 35.000000 | 
| income | 51510.970139 | 42717.383692 | 
# Stacking the inner column
(census_data.
    pivot_table(index='age_group', 
          aggfunc={'income':['min','max'], 'hours_worked_per_week':['min','max']})
    .stack(1)
)| hours_worked_per_week | income | ||
|---|---|---|---|
| age_group | |||
| 18-24 | max | 45 | 65210.548202 | 
| min | 26 | 53176.552487 | |
| 25-34 | max | 54 | 63933.909067 | 
| min | 33 | 32718.963301 | |
| 35-44 | max | 51 | 66377.414798 | 
| min | 42 | 52939.298442 | |
| 45-54 | max | 34 | 61113.975298 | 
| min | 29 | 38553.347657 | |
| 55-64 | max | 59 | 51285.443271 | 
| min | 52 | 46070.260269 | |
| 65+ | max | 53 | 51510.970139 | 
| min | 35 | 42717.383692 | 
Flatenning the Hierarchial Index
# We can use 'reset_index' method to make the indexes columns of a df
(census_data
.groupby(['age_group','employment_status'])
['income']
.mean()
)age_group  employment_status
18-24      Employed             53752.078752
           Retired                       NaN
           Unemployed           65210.548202
25-34      Employed             63933.909067
           Retired                       NaN
           Unemployed           41988.564026
35-44      Employed             52939.298442
           Retired              58349.636265
           Unemployed           66377.414798
45-54      Employed             61113.975298
           Retired                       NaN
           Unemployed           38553.347657
55-64      Employed                      NaN
           Retired              48677.851770
           Unemployed                    NaN
65+        Employed             42717.383692
           Retired                       NaN
           Unemployed           51510.970139
Name: income, dtype: float64
# Using reset_index will make the index columns and flatten the hierarchy
(census_data
.groupby(['age_group','employment_status'])
['income']
.mean()
.reset_index()
)| age_group | employment_status | income | |
|---|---|---|---|
| 0 | 18-24 | Employed | 53752.078752 | 
| 1 | 18-24 | Retired | NaN | 
| 2 | 18-24 | Unemployed | 65210.548202 | 
| 3 | 25-34 | Employed | 63933.909067 | 
| 4 | 25-34 | Retired | NaN | 
| 5 | 25-34 | Unemployed | 41988.564026 | 
| 6 | 35-44 | Employed | 52939.298442 | 
| 7 | 35-44 | Retired | 58349.636265 | 
| 8 | 35-44 | Unemployed | 66377.414798 | 
| 9 | 45-54 | Employed | 61113.975298 | 
| 10 | 45-54 | Retired | NaN | 
| 11 | 45-54 | Unemployed | 38553.347657 | 
| 12 | 55-64 | Employed | NaN | 
| 13 | 55-64 | Retired | 48677.851770 | 
| 14 | 55-64 | Unemployed | NaN | 
| 15 | 65+ | Employed | 42717.383692 | 
| 16 | 65+ | Retired | NaN | 
| 17 | 65+ | Unemployed | 51510.970139 | 
Flattening the hierarchial columns
def flatten_cols(df):
    cols = ['_'.join(map(str, vals)) for vals in df.columns.to_flat_index()]
    df.columns = cols
    return df(census_data
.pivot_table(index='age_group', 
          aggfunc={'income':['min','max'], 'hours_worked_per_week':['min','max']})
.pipe(flatten_cols)    
)| hours_worked_per_week_max | hours_worked_per_week_min | income_max | income_min | |
|---|---|---|---|---|
| age_group | ||||
| 18-24 | 45 | 26 | 65210.548202 | 53176.552487 | 
| 25-34 | 54 | 33 | 63933.909067 | 32718.963301 | 
| 35-44 | 51 | 42 | 66377.414798 | 52939.298442 | 
| 45-54 | 34 | 29 | 61113.975298 | 38553.347657 | 
| 55-64 | 59 | 52 | 51285.443271 | 46070.260269 | 
| 65+ | 53 | 35 | 51510.970139 | 42717.383692 |