 Supporting Statistical Analysis for Research
  Supporting Statistical Analysis for Research
 6.3 Aggregating data
These examples use the Car.csv data set.
- Load the - Car.csvdata set.- library(tidyverse)- car_path <- file.path("..", "datasets", "Car.csv") car_in <- read_csv(car_path, col_types = cols())- Warning: Missing column names filled in: 'X1' [1]- car <- car_in %>% rename( id = X1 ) glimpse(car)- Observations: 4,654 Variables: 71 $ id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ... $ choice <chr> "choice1", "choice2", "choice5", "choice5", "choice... $ college <dbl> 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, ... $ hsg2 <dbl> 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, ... $ coml5 <dbl> 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, ... $ type1 <chr> "van", "regcar", "regcar", "regcar", "regcar", "tru... $ type2 <chr> "regcar", "van", "truck", "truck", "truck", "regcar... $ type3 <chr> "van", "regcar", "regcar", "regcar", "regcar", "tru... $ type4 <chr> "stwagon", "stwagon", "van", "van", "van", "van", "... $ type5 <chr> "van", "regcar", "regcar", "regcar", "regcar", "tru... $ type6 <chr> "truck", "truck", "stwagon", "stwagon", "stwagon", ... $ fuel1 <chr> "cng", "methanol", "cng", "methanol", "cng", "cng",... $ fuel2 <chr> "cng", "methanol", "cng", "methanol", "cng", "cng",... $ fuel3 <chr> "electric", "cng", "electric", "cng", "electric", "... $ fuel4 <chr> "electric", "cng", "electric", "cng", "electric", "... $ fuel5 <chr> "gasoline", "gasoline", "gasoline", "electric", "ga... $ fuel6 <chr> "gasoline", "gasoline", "gasoline", "electric", "ga... $ price1 <dbl> 4.175345, 3.310947, 4.039574, 7.065968, 5.794157, 3... $ price2 <dbl> 4.175345, 3.310947, 4.039574, 7.065968, 5.794157, 3... $ price3 <dbl> 4.8177056, 3.5868590, 2.7772075, 7.3871485, 6.34598... $ price4 <dbl> 4.8177056, 3.5868590, 2.7772075, 7.3871485, 6.34598... $ price5 <dbl> 5.1388859, 4.4145957, 3.2821543, 5.4600663, 4.69050... $ price6 <dbl> 5.1388859, 4.4145957, 3.2821543, 5.4600663, 4.69050... $ range1 <dbl> 250, 125, 300, 200, 75, 300, 125, 125, 125, 200, 20... $ range2 <dbl> 250, 125, 300, 200, 75, 300, 125, 125, 125, 200, 20... $ range3 <dbl> 400, 300, 250, 75, 300, 250, 300, 300, 250, 75, 75,... $ range4 <dbl> 400, 300, 250, 75, 300, 250, 300, 300, 250, 75, 75,... $ range5 <dbl> 250, 300, 300, 300, 350, 300, 300, 300, 300, 300, 3... $ range6 <dbl> 250, 300, 300, 300, 350, 300, 300, 300, 300, 300, 3... $ acc1 <dbl> 4.0, 2.5, 6.0, 4.0, 4.0, 6.0, 6.0, 6.0, 2.5, 4.0, 4... $ acc2 <dbl> 4.0, 2.5, 6.0, 4.0, 4.0, 6.0, 6.0, 6.0, 2.5, 4.0, 4... $ acc3 <dbl> 6.0, 4.0, 2.5, 6.0, 6.0, 2.5, 2.5, 2.5, 4.0, 6.0, 6... $ acc4 <dbl> 6.0, 4.0, 2.5, 6.0, 6.0, 2.5, 2.5, 2.5, 4.0, 6.0, 6... $ acc5 <dbl> 2.5, 6.0, 4.0, 2.5, 2.5, 4.0, 4.0, 4.0, 6.0, 2.5, 2... $ acc6 <dbl> 2.5, 6.0, 4.0, 2.5, 2.5, 4.0, 4.0, 4.0, 6.0, 2.5, 2... $ speed1 <dbl> 95, 85, 140, 100, 85, 85, 100, 85, 65, 85, 55, 85, ... $ speed2 <dbl> 95, 85, 140, 100, 85, 85, 100, 85, 65, 85, 55, 85, ... $ speed3 <dbl> 110, 140, 85, 85, 95, 95, 85, 140, 140, 140, 95, 85... $ speed4 <dbl> 110, 140, 85, 85, 95, 95, 85, 140, 140, 140, 95, 85... $ speed5 <dbl> 140, 95, 95, 95, 110, 110, 110, 95, 85, 85, 110, 95... $ speed6 <dbl> 140, 95, 95, 95, 110, 110, 110, 95, 85, 85, 110, 95... $ pollution1 <dbl> 0.6, 0.0, 0.1, 0.0, 0.1, 0.6, 0.0, 0.0, 0.0, 0.0, 0... $ pollution2 <dbl> 0.6, 0.0, 0.1, 0.0, 0.1, 0.6, 0.0, 0.0, 0.0, 0.0, 0... $ pollution3 <dbl> 0.25, 0.40, 0.40, 0.25, 0.40, 0.25, 0.25, 0.40, 0.7... $ pollution4 <dbl> 0.25, 0.40, 0.40, 0.25, 0.40, 0.25, 0.25, 0.40, 0.7... $ pollution5 <dbl> 0.50, 0.25, 0.75, 0.60, 0.75, 0.50, 1.00, 0.25, 0.2... $ pollution6 <dbl> 0.50, 0.25, 0.75, 0.60, 0.75, 0.50, 1.00, 0.25, 0.2... $ size1 <dbl> 3, 3, 2, 2, 1, 3, 2, 3, 3, 2, 3, 3, 0, 2, 1, 3, 3, ... $ size2 <dbl> 3, 3, 2, 2, 1, 3, 2, 3, 3, 2, 3, 3, 0, 2, 1, 3, 3, ... $ size3 <dbl> 2, 3, 3, 3, 2, 2, 3, 3, 3, 3, 1, 1, 1, 3, 2, 1, 3, ... $ size4 <dbl> 2, 3, 3, 3, 2, 2, 3, 3, 3, 3, 1, 1, 1, 3, 2, 1, 3, ... $ size5 <dbl> 3, 2, 3, 3, 3, 3, 3, 2, 2, 1, 2, 2, 2, 3, 3, 2, 2, ... $ size6 <dbl> 3, 2, 3, 3, 3, 3, 3, 2, 2, 1, 2, 2, 2, 3, 3, 2, 2, ... $ space1 <dbl> 0.7, 0.7, 1.0, 1.0, 0.7, 1.0, 1.0, 1.0, 0.7, 1.0, 1... $ space2 <dbl> 0.7, 0.7, 1.0, 1.0, 0.7, 1.0, 1.0, 1.0, 0.7, 1.0, 1... $ space3 <dbl> 1.0, 1.0, 1.0, 0.7, 1.0, 1.0, 0.7, 0.7, 1.0, 0.7, 1... $ space4 <dbl> 1.0, 1.0, 1.0, 0.7, 1.0, 1.0, 0.7, 0.7, 1.0, 0.7, 1... $ space5 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... $ space6 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... $ cost1 <dbl> 4, 4, 6, 8, 6, 6, 8, 1, 4, 2, 2, 2, 8, 1, 1, 8, 8, ... $ cost2 <dbl> 4, 4, 6, 8, 6, 6, 8, 1, 4, 2, 2, 2, 8, 1, 1, 8, 8, ... $ cost3 <dbl> 6, 8, 8, 2, 8, 8, 4, 4, 4, 2, 4, 8, 4, 2, 4, 8, 2, ... $ cost4 <dbl> 6, 8, 8, 2, 8, 8, 4, 4, 4, 2, 4, 8, 4, 2, 4, 8, 2, ... $ cost5 <dbl> 8, 4, 2, 4, 2, 2, 8, 8, 6, 4, 6, 2, 8, 4, 6, 4, 6, ... $ cost6 <dbl> 8, 4, 2, 4, 2, 2, 8, 8, 6, 4, 6, 2, 8, 4, 6, 4, 6, ... $ station1 <dbl> 0.1, 0.0, 0.1, 0.0, 0.3, 0.1, 0.0, 0.0, 0.0, 0.0, 0... $ station2 <dbl> 0.1, 0.0, 0.1, 0.0, 0.3, 0.1, 0.0, 0.0, 0.0, 0.0, 0... $ station3 <dbl> 0.3, 0.1, 0.3, 0.7, 0.7, 0.3, 0.7, 0.7, 0.3, 0.7, 0... $ station4 <dbl> 0.3, 0.1, 0.3, 0.7, 0.7, 0.3, 0.7, 0.7, 0.3, 0.7, 0... $ station5 <dbl> 1.0, 1.0, 1.0, 0.1, 1.0, 1.0, 1.0, 1.0, 1.0, 0.1, 0... $ station6 <dbl> 1.0, 1.0, 1.0, 0.1, 1.0, 1.0, 1.0, 1.0, 1.0, 0.1, 0...
- Create a categorical variable that identifies the body type that was selected first. - car <- car %>% gather(key = type_order, value = type, type1:type6) %>% separate(type_order, into = c("X1","order"), sep = "type") %>% separate(choice, into = c("X2","choice"), sep = "choice") %>% select(-X1, -X2) %>% filter(choice == order) %>% arrange(id, order) car %>% select(choice, id, college, hsg2, coml5, type) %>% head()- # A tibble: 6 x 6 choice id college hsg2 coml5 type <chr> <dbl> <dbl> <dbl> <dbl> <chr> 1 1 1 0 0 0 van 2 2 2 1 1 1 van 3 5 3 0 1 0 regcar 4 5 4 0 0 1 regcar 5 5 5 0 1 0 regcar 6 5 6 0 0 0 truck
- Create a table of the number of count of the selected types. - car_type_tab <- car %>% group_by(type) %>% summarise(count = n()) %>% ungroup() car_type_tab %>% head()- # A tibble: 6 x 2 type count <chr> <int> 1 regcar 2740 2 sportcar 172 3 sportuv 242 4 stwagon 305 5 truck 565 6 van 630
- Create a table of the number of trucks selected by college degree or not and greater than 2 in the family. - Which of these subgroups selected - truckas their first choice in greatest numbers.- car_truck_tab <- car %>% filter(type == "truck") %>% group_by(college, hsg2) %>% summarise(count = n()) %>% ungroup() %>% spread(key = college, value = count) car_truck_tab %>% head()- # A tibble: 2 x 3 hsg2 `0` `1` <dbl> <int> <int> 1 0 133 298 2 1 52 82
- Create the same table with the cells reporting the proportion of these groups that select trucks. - Which of these subgroups was most likely to selected - truckas their first choice.- car_truck_tab <- car %>% group_by(college, hsg2) %>% summarise(proportion = sum(type == "truck") / n()) %>% ungroup() %>% spread(key = college, value = proportion) car_truck_tab %>% head()- # A tibble: 2 x 3 hsg2 `0` `1` <dbl> <dbl> <dbl> 1 0 0.171 0.105 2 1 0.174 0.112
- Create a variable that identifies the price of the first selected vehicle. - car <- car %>% gather(key = price_order, value = price, price1:price6) %>% separate(price_order, into = c("X1", "order"), sep = "price") %>% select(-X1) %>% filter(choice == order) %>% arrange(id) car %>% select(id, college, hsg2, coml5, type, price) %>% head()- # A tibble: 6 x 6 id college hsg2 coml5 type price <dbl> <dbl> <dbl> <dbl> <chr> <dbl> 1 1 0 0 0 van 4.18 2 2 1 1 1 van 3.31 3 3 0 1 0 regcar 3.28 4 4 0 0 1 regcar 5.46 5 5 0 1 0 regcar 4.69 6 6 0 0 0 truck 4.82
- Create a table that provides the mean and standard deviation of price by college attendance or not groups. - Note, income is not provide in the data set. So price can not be put back on to the original scale. What is being reported is the price normalized by income. - car_price_tab <- car %>% group_by(college) %>% summarise( mean = mean(price), st_dev = sd(price) ) %>% ungroup() car_price_tab- # A tibble: 2 x 3 college mean st_dev <dbl> <dbl> <dbl> 1 0 4.06 2.07 2 1 4.16 1.81