
6.3 Aggregating data
These examples use the Car.csv
data set.
Load the
Car.csv
data set.library(tidyverse)
car_path <- file.path("..", "datasets", "Car.csv") car_in <- read_csv(car_path, col_types = cols())
Warning: Missing column names filled in: 'X1' [1]
car <- car_in %>% rename( id = X1 ) glimpse(car)
Observations: 4,654 Variables: 71 $ id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ... $ choice <chr> "choice1", "choice2", "choice5", "choice5", "choice... $ college <dbl> 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, ... $ hsg2 <dbl> 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, ... $ coml5 <dbl> 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, ... $ type1 <chr> "van", "regcar", "regcar", "regcar", "regcar", "tru... $ type2 <chr> "regcar", "van", "truck", "truck", "truck", "regcar... $ type3 <chr> "van", "regcar", "regcar", "regcar", "regcar", "tru... $ type4 <chr> "stwagon", "stwagon", "van", "van", "van", "van", "... $ type5 <chr> "van", "regcar", "regcar", "regcar", "regcar", "tru... $ type6 <chr> "truck", "truck", "stwagon", "stwagon", "stwagon", ... $ fuel1 <chr> "cng", "methanol", "cng", "methanol", "cng", "cng",... $ fuel2 <chr> "cng", "methanol", "cng", "methanol", "cng", "cng",... $ fuel3 <chr> "electric", "cng", "electric", "cng", "electric", "... $ fuel4 <chr> "electric", "cng", "electric", "cng", "electric", "... $ fuel5 <chr> "gasoline", "gasoline", "gasoline", "electric", "ga... $ fuel6 <chr> "gasoline", "gasoline", "gasoline", "electric", "ga... $ price1 <dbl> 4.175345, 3.310947, 4.039574, 7.065968, 5.794157, 3... $ price2 <dbl> 4.175345, 3.310947, 4.039574, 7.065968, 5.794157, 3... $ price3 <dbl> 4.8177056, 3.5868590, 2.7772075, 7.3871485, 6.34598... $ price4 <dbl> 4.8177056, 3.5868590, 2.7772075, 7.3871485, 6.34598... $ price5 <dbl> 5.1388859, 4.4145957, 3.2821543, 5.4600663, 4.69050... $ price6 <dbl> 5.1388859, 4.4145957, 3.2821543, 5.4600663, 4.69050... $ range1 <dbl> 250, 125, 300, 200, 75, 300, 125, 125, 125, 200, 20... $ range2 <dbl> 250, 125, 300, 200, 75, 300, 125, 125, 125, 200, 20... $ range3 <dbl> 400, 300, 250, 75, 300, 250, 300, 300, 250, 75, 75,... $ range4 <dbl> 400, 300, 250, 75, 300, 250, 300, 300, 250, 75, 75,... $ range5 <dbl> 250, 300, 300, 300, 350, 300, 300, 300, 300, 300, 3... $ range6 <dbl> 250, 300, 300, 300, 350, 300, 300, 300, 300, 300, 3... $ acc1 <dbl> 4.0, 2.5, 6.0, 4.0, 4.0, 6.0, 6.0, 6.0, 2.5, 4.0, 4... $ acc2 <dbl> 4.0, 2.5, 6.0, 4.0, 4.0, 6.0, 6.0, 6.0, 2.5, 4.0, 4... $ acc3 <dbl> 6.0, 4.0, 2.5, 6.0, 6.0, 2.5, 2.5, 2.5, 4.0, 6.0, 6... $ acc4 <dbl> 6.0, 4.0, 2.5, 6.0, 6.0, 2.5, 2.5, 2.5, 4.0, 6.0, 6... $ acc5 <dbl> 2.5, 6.0, 4.0, 2.5, 2.5, 4.0, 4.0, 4.0, 6.0, 2.5, 2... $ acc6 <dbl> 2.5, 6.0, 4.0, 2.5, 2.5, 4.0, 4.0, 4.0, 6.0, 2.5, 2... $ speed1 <dbl> 95, 85, 140, 100, 85, 85, 100, 85, 65, 85, 55, 85, ... $ speed2 <dbl> 95, 85, 140, 100, 85, 85, 100, 85, 65, 85, 55, 85, ... $ speed3 <dbl> 110, 140, 85, 85, 95, 95, 85, 140, 140, 140, 95, 85... $ speed4 <dbl> 110, 140, 85, 85, 95, 95, 85, 140, 140, 140, 95, 85... $ speed5 <dbl> 140, 95, 95, 95, 110, 110, 110, 95, 85, 85, 110, 95... $ speed6 <dbl> 140, 95, 95, 95, 110, 110, 110, 95, 85, 85, 110, 95... $ pollution1 <dbl> 0.6, 0.0, 0.1, 0.0, 0.1, 0.6, 0.0, 0.0, 0.0, 0.0, 0... $ pollution2 <dbl> 0.6, 0.0, 0.1, 0.0, 0.1, 0.6, 0.0, 0.0, 0.0, 0.0, 0... $ pollution3 <dbl> 0.25, 0.40, 0.40, 0.25, 0.40, 0.25, 0.25, 0.40, 0.7... $ pollution4 <dbl> 0.25, 0.40, 0.40, 0.25, 0.40, 0.25, 0.25, 0.40, 0.7... $ pollution5 <dbl> 0.50, 0.25, 0.75, 0.60, 0.75, 0.50, 1.00, 0.25, 0.2... $ pollution6 <dbl> 0.50, 0.25, 0.75, 0.60, 0.75, 0.50, 1.00, 0.25, 0.2... $ size1 <dbl> 3, 3, 2, 2, 1, 3, 2, 3, 3, 2, 3, 3, 0, 2, 1, 3, 3, ... $ size2 <dbl> 3, 3, 2, 2, 1, 3, 2, 3, 3, 2, 3, 3, 0, 2, 1, 3, 3, ... $ size3 <dbl> 2, 3, 3, 3, 2, 2, 3, 3, 3, 3, 1, 1, 1, 3, 2, 1, 3, ... $ size4 <dbl> 2, 3, 3, 3, 2, 2, 3, 3, 3, 3, 1, 1, 1, 3, 2, 1, 3, ... $ size5 <dbl> 3, 2, 3, 3, 3, 3, 3, 2, 2, 1, 2, 2, 2, 3, 3, 2, 2, ... $ size6 <dbl> 3, 2, 3, 3, 3, 3, 3, 2, 2, 1, 2, 2, 2, 3, 3, 2, 2, ... $ space1 <dbl> 0.7, 0.7, 1.0, 1.0, 0.7, 1.0, 1.0, 1.0, 0.7, 1.0, 1... $ space2 <dbl> 0.7, 0.7, 1.0, 1.0, 0.7, 1.0, 1.0, 1.0, 0.7, 1.0, 1... $ space3 <dbl> 1.0, 1.0, 1.0, 0.7, 1.0, 1.0, 0.7, 0.7, 1.0, 0.7, 1... $ space4 <dbl> 1.0, 1.0, 1.0, 0.7, 1.0, 1.0, 0.7, 0.7, 1.0, 0.7, 1... $ space5 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... $ space6 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... $ cost1 <dbl> 4, 4, 6, 8, 6, 6, 8, 1, 4, 2, 2, 2, 8, 1, 1, 8, 8, ... $ cost2 <dbl> 4, 4, 6, 8, 6, 6, 8, 1, 4, 2, 2, 2, 8, 1, 1, 8, 8, ... $ cost3 <dbl> 6, 8, 8, 2, 8, 8, 4, 4, 4, 2, 4, 8, 4, 2, 4, 8, 2, ... $ cost4 <dbl> 6, 8, 8, 2, 8, 8, 4, 4, 4, 2, 4, 8, 4, 2, 4, 8, 2, ... $ cost5 <dbl> 8, 4, 2, 4, 2, 2, 8, 8, 6, 4, 6, 2, 8, 4, 6, 4, 6, ... $ cost6 <dbl> 8, 4, 2, 4, 2, 2, 8, 8, 6, 4, 6, 2, 8, 4, 6, 4, 6, ... $ station1 <dbl> 0.1, 0.0, 0.1, 0.0, 0.3, 0.1, 0.0, 0.0, 0.0, 0.0, 0... $ station2 <dbl> 0.1, 0.0, 0.1, 0.0, 0.3, 0.1, 0.0, 0.0, 0.0, 0.0, 0... $ station3 <dbl> 0.3, 0.1, 0.3, 0.7, 0.7, 0.3, 0.7, 0.7, 0.3, 0.7, 0... $ station4 <dbl> 0.3, 0.1, 0.3, 0.7, 0.7, 0.3, 0.7, 0.7, 0.3, 0.7, 0... $ station5 <dbl> 1.0, 1.0, 1.0, 0.1, 1.0, 1.0, 1.0, 1.0, 1.0, 0.1, 0... $ station6 <dbl> 1.0, 1.0, 1.0, 0.1, 1.0, 1.0, 1.0, 1.0, 1.0, 0.1, 0...
Create a categorical variable that identifies the body type that was selected first.
car <- car %>% gather(key = type_order, value = type, type1:type6) %>% separate(type_order, into = c("X1","order"), sep = "type") %>% separate(choice, into = c("X2","choice"), sep = "choice") %>% select(-X1, -X2) %>% filter(choice == order) %>% arrange(id, order) car %>% select(choice, id, college, hsg2, coml5, type) %>% head()
# A tibble: 6 x 6 choice id college hsg2 coml5 type <chr> <dbl> <dbl> <dbl> <dbl> <chr> 1 1 1 0 0 0 van 2 2 2 1 1 1 van 3 5 3 0 1 0 regcar 4 5 4 0 0 1 regcar 5 5 5 0 1 0 regcar 6 5 6 0 0 0 truck
Create a table of the number of count of the selected types.
car_type_tab <- car %>% group_by(type) %>% summarise(count = n()) %>% ungroup() car_type_tab %>% head()
# A tibble: 6 x 2 type count <chr> <int> 1 regcar 2740 2 sportcar 172 3 sportuv 242 4 stwagon 305 5 truck 565 6 van 630
Create a table of the number of trucks selected by college degree or not and greater than 2 in the family.
Which of these subgroups selected
truck
as their first choice in greatest numbers.car_truck_tab <- car %>% filter(type == "truck") %>% group_by(college, hsg2) %>% summarise(count = n()) %>% ungroup() %>% spread(key = college, value = count) car_truck_tab %>% head()
# A tibble: 2 x 3 hsg2 `0` `1` <dbl> <int> <int> 1 0 133 298 2 1 52 82
Create the same table with the cells reporting the proportion of these groups that select trucks.
Which of these subgroups was most likely to selected
truck
as their first choice.car_truck_tab <- car %>% group_by(college, hsg2) %>% summarise(proportion = sum(type == "truck") / n()) %>% ungroup() %>% spread(key = college, value = proportion) car_truck_tab %>% head()
# A tibble: 2 x 3 hsg2 `0` `1` <dbl> <dbl> <dbl> 1 0 0.171 0.105 2 1 0.174 0.112
Create a variable that identifies the price of the first selected vehicle.
car <- car %>% gather(key = price_order, value = price, price1:price6) %>% separate(price_order, into = c("X1", "order"), sep = "price") %>% select(-X1) %>% filter(choice == order) %>% arrange(id) car %>% select(id, college, hsg2, coml5, type, price) %>% head()
# A tibble: 6 x 6 id college hsg2 coml5 type price <dbl> <dbl> <dbl> <dbl> <chr> <dbl> 1 1 0 0 0 van 4.18 2 2 1 1 1 van 3.31 3 3 0 1 0 regcar 3.28 4 4 0 0 1 regcar 5.46 5 5 0 1 0 regcar 4.69 6 6 0 0 0 truck 4.82
Create a table that provides the mean and standard deviation of price by college attendance or not groups.
Note, income is not provide in the data set. So price can not be put back on to the original scale. What is being reported is the price normalized by income.
car_price_tab <- car %>% group_by(college) %>% summarise( mean = mean(price), st_dev = sd(price) ) %>% ungroup() car_price_tab
# A tibble: 2 x 3 college mean st_dev <dbl> <dbl> <dbl> 1 0 4.06 2.07 2 1 4.16 1.81