SSCC - Social Science Computing Cooperative Supporting Statistical Analysis for Research

6.3 Aggregating data

These examples use the Car.csv data set.

  1. Load the Car.csv data set.

    from pathlib import Path
    import os
    import numpy as np
    import pandas as pd
    car_path = Path('..') / 'datasets' / 'Car.csv'
    car_in = pd.read_csv(car_path)
    car_in = (
        car_in
            .rename(columns={
                'Unnamed: 0': 'id'}))
    
    car =  car_in.copy(deep=True)
    
    (car
       .head()
       .pipe(print))
       id   choice  college  hsg2  ...  station3 station4 station5 station6
    0   1  choice1        0     0  ...       0.3      0.3      1.0      1.0
    1   2  choice2        1     1  ...       0.1      0.1      1.0      1.0
    2   3  choice5        0     1  ...       0.3      0.3      1.0      1.0
    3   4  choice5        0     0  ...       0.7      0.7      0.1      0.1
    4   5  choice5        0     1  ...       0.7      0.7      1.0      1.0
    
    [5 rows x 71 columns]
  2. Create a categorical variable that identifies the body type that was selected first.

    gather_vars = ['type1', 'type2', 'type3', 'type4', 'type5', 'type6']
    non_gather_vars = ['id', 'choice', 'college', 'hsg2', 'coml5']
    car_type = (
        car
            .loc[:, 'id':'type6']
            .melt(
                id_vars=non_gather_vars,
                value_vars=gather_vars,
                var_name='type_order',
                value_name='type')
            .assign(
                order=lambda df: 
                    df['type_order'].str.extract('type([\\d]+)', expand=True),
                choice=lambda df: 
                    df['choice'].str.extract('choice([\\d]+)', expand=True))
            .drop(columns='type_order')
            .query('order == choice')
            .assign(
                type=lambda df: 
                    df['type'].astype('category'))
            .sort_values(by=['id']))
    
    (car_type
       .loc[:, ['id', 'choice', 'order',  'college', 'hsg2', 'coml5', 'type']]
       .head()
       .pipe(print))
           id choice order  college  hsg2  coml5    type
    0       1      1     1        0     0      0     van
    4655    2      2     2        1     1      1     van
    18618   3      5     5        0     1      0  regcar
    18619   4      5     5        0     0      1  regcar
    18620   5      5     5        0     1      0  regcar
  3. Create a table of the number of count of the selected types.

    car_type_tab = (
        car_type
            .groupby('type')
            ['type']
            .size())
    
    (car_type_tab
       .pipe(print))
    type
    regcar      2740
    sportcar     172
    sportuv      242
    stwagon      305
    truck        565
    van          630
    Name: type, dtype: int64
  4. Create a table of the number of trucks selected by college degree or not and greater than 2 in the family.

    Which of these subgroups selected truck as their first choice in greatest numbers.

    car_truck_tab = (
        car_type
            .query('type == "truck"')
            .groupby(by=['college', 'hsg2'])
            ['type']
            .size()
            .reset_index()
            .pivot(
                index='hsg2',
                columns='college',
                values='type')
            )
    
    (car_truck_tab
       .pipe(print))
    college    0    1
    hsg2             
    0        133  298
    1         52   82
  5. Create the same table with the cells reporting the proportion of these groups that select trucks.

    Which of these subgroups was most likely to select a truck as their first choice.

    car_truck_tab = (
        car_type
            .groupby(by=['college', 'hsg2'])
            ['type']
            .aggregate(lambda value: (value == "truck").mean())
            .reset_index()
            .pivot(
                index='hsg2',
                columns='college',
                values='type')
            )
    
    (car_truck_tab
       .pipe(print))
    college         0         1
    hsg2                       
    0        0.170513  0.104893
    1        0.173913  0.111717
  6. Create a variable that identifies the price of the first selected vehicle.

    gather_vars = ['price1', 'price2', 'price3', 'price4', 'price5', 'price6']
    non_gather_vars = list(set(list(car.columns)) - set(gather_vars))
    car = (
        car
            .melt(
                id_vars=non_gather_vars,
                value_vars=gather_vars,
                var_name='price_order',
                value_name='price')
            .assign(
                choice=lambda df: 
                    df['choice'].str.extract('choice([\\d]+)', expand=True),
                order=lambda df: 
                    df['price_order'].str.extract('price([\\d]+)', expand=True))
            .query('order == choice')
            .drop(columns=['price_order', 'order'])
            .sort_values(by=['id'])
            )
    
    print(car.shape)
    (4654, 66)
    (car
        .loc[:, ['id', 'choice', 'college', 'hsg2', 'coml5', 'price']]
        .head()
        .pipe(print))
           id choice  college  hsg2  coml5     price
    0       1      1        0     0      0  4.175345
    4655    2      2        1     1      1  3.310947
    18618   3      5        0     1      0  3.282154
    18619   4      5        0     0      1  5.460066
    18620   5      5        0     1      0  4.690508
  7. Create a table that provides the mean and standard deviation of price by college attendance or not groups.

    Note, income is not provide in the data set. So price can not be put back on to the original scale. What is being reported is the price normalized by income.

    car_price_tab = (
        car
            .groupby(by=['college'])
            ['price']
            .aggregate(['mean', 'std'])
            .reset_index()
            )
    
    (car_price_tab
       .pipe(print))
       college      mean       std
    0        0  4.060240  2.069075
    1        1  4.158786  1.811278