![SSCC - Social Science Computing Cooperative SSCC - Social Science Computing Cooperative](sscc_logoBW.png)
6.2 Tidy data
These examples use the Car.csv
data set.
Load the
Car.csv
data set.library(tidyverse)
car_path <- file.path("..", "datasets", "Car.csv") car_in <- read_csv(car_path, col_types = cols())
Warning: Missing column names filled in: 'X1' [1]
car <- car_in %>% rename( id = X1 ) glimpse(car)
Observations: 4,654 Variables: 71 $ id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ... $ choice <chr> "choice1", "choice2", "choice5", "choice5", "choice... $ college <dbl> 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, ... $ hsg2 <dbl> 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, ... $ coml5 <dbl> 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, ... $ type1 <chr> "van", "regcar", "regcar", "regcar", "regcar", "tru... $ type2 <chr> "regcar", "van", "truck", "truck", "truck", "regcar... $ type3 <chr> "van", "regcar", "regcar", "regcar", "regcar", "tru... $ type4 <chr> "stwagon", "stwagon", "van", "van", "van", "van", "... $ type5 <chr> "van", "regcar", "regcar", "regcar", "regcar", "tru... $ type6 <chr> "truck", "truck", "stwagon", "stwagon", "stwagon", ... $ fuel1 <chr> "cng", "methanol", "cng", "methanol", "cng", "cng",... $ fuel2 <chr> "cng", "methanol", "cng", "methanol", "cng", "cng",... $ fuel3 <chr> "electric", "cng", "electric", "cng", "electric", "... $ fuel4 <chr> "electric", "cng", "electric", "cng", "electric", "... $ fuel5 <chr> "gasoline", "gasoline", "gasoline", "electric", "ga... $ fuel6 <chr> "gasoline", "gasoline", "gasoline", "electric", "ga... $ price1 <dbl> 4.175345, 3.310947, 4.039574, 7.065968, 5.794157, 3... $ price2 <dbl> 4.175345, 3.310947, 4.039574, 7.065968, 5.794157, 3... $ price3 <dbl> 4.8177056, 3.5868590, 2.7772075, 7.3871485, 6.34598... $ price4 <dbl> 4.8177056, 3.5868590, 2.7772075, 7.3871485, 6.34598... $ price5 <dbl> 5.1388859, 4.4145957, 3.2821543, 5.4600663, 4.69050... $ price6 <dbl> 5.1388859, 4.4145957, 3.2821543, 5.4600663, 4.69050... $ range1 <dbl> 250, 125, 300, 200, 75, 300, 125, 125, 125, 200, 20... $ range2 <dbl> 250, 125, 300, 200, 75, 300, 125, 125, 125, 200, 20... $ range3 <dbl> 400, 300, 250, 75, 300, 250, 300, 300, 250, 75, 75,... $ range4 <dbl> 400, 300, 250, 75, 300, 250, 300, 300, 250, 75, 75,... $ range5 <dbl> 250, 300, 300, 300, 350, 300, 300, 300, 300, 300, 3... $ range6 <dbl> 250, 300, 300, 300, 350, 300, 300, 300, 300, 300, 3... $ acc1 <dbl> 4.0, 2.5, 6.0, 4.0, 4.0, 6.0, 6.0, 6.0, 2.5, 4.0, 4... $ acc2 <dbl> 4.0, 2.5, 6.0, 4.0, 4.0, 6.0, 6.0, 6.0, 2.5, 4.0, 4... $ acc3 <dbl> 6.0, 4.0, 2.5, 6.0, 6.0, 2.5, 2.5, 2.5, 4.0, 6.0, 6... $ acc4 <dbl> 6.0, 4.0, 2.5, 6.0, 6.0, 2.5, 2.5, 2.5, 4.0, 6.0, 6... $ acc5 <dbl> 2.5, 6.0, 4.0, 2.5, 2.5, 4.0, 4.0, 4.0, 6.0, 2.5, 2... $ acc6 <dbl> 2.5, 6.0, 4.0, 2.5, 2.5, 4.0, 4.0, 4.0, 6.0, 2.5, 2... $ speed1 <dbl> 95, 85, 140, 100, 85, 85, 100, 85, 65, 85, 55, 85, ... $ speed2 <dbl> 95, 85, 140, 100, 85, 85, 100, 85, 65, 85, 55, 85, ... $ speed3 <dbl> 110, 140, 85, 85, 95, 95, 85, 140, 140, 140, 95, 85... $ speed4 <dbl> 110, 140, 85, 85, 95, 95, 85, 140, 140, 140, 95, 85... $ speed5 <dbl> 140, 95, 95, 95, 110, 110, 110, 95, 85, 85, 110, 95... $ speed6 <dbl> 140, 95, 95, 95, 110, 110, 110, 95, 85, 85, 110, 95... $ pollution1 <dbl> 0.6, 0.0, 0.1, 0.0, 0.1, 0.6, 0.0, 0.0, 0.0, 0.0, 0... $ pollution2 <dbl> 0.6, 0.0, 0.1, 0.0, 0.1, 0.6, 0.0, 0.0, 0.0, 0.0, 0... $ pollution3 <dbl> 0.25, 0.40, 0.40, 0.25, 0.40, 0.25, 0.25, 0.40, 0.7... $ pollution4 <dbl> 0.25, 0.40, 0.40, 0.25, 0.40, 0.25, 0.25, 0.40, 0.7... $ pollution5 <dbl> 0.50, 0.25, 0.75, 0.60, 0.75, 0.50, 1.00, 0.25, 0.2... $ pollution6 <dbl> 0.50, 0.25, 0.75, 0.60, 0.75, 0.50, 1.00, 0.25, 0.2... $ size1 <dbl> 3, 3, 2, 2, 1, 3, 2, 3, 3, 2, 3, 3, 0, 2, 1, 3, 3, ... $ size2 <dbl> 3, 3, 2, 2, 1, 3, 2, 3, 3, 2, 3, 3, 0, 2, 1, 3, 3, ... $ size3 <dbl> 2, 3, 3, 3, 2, 2, 3, 3, 3, 3, 1, 1, 1, 3, 2, 1, 3, ... $ size4 <dbl> 2, 3, 3, 3, 2, 2, 3, 3, 3, 3, 1, 1, 1, 3, 2, 1, 3, ... $ size5 <dbl> 3, 2, 3, 3, 3, 3, 3, 2, 2, 1, 2, 2, 2, 3, 3, 2, 2, ... $ size6 <dbl> 3, 2, 3, 3, 3, 3, 3, 2, 2, 1, 2, 2, 2, 3, 3, 2, 2, ... $ space1 <dbl> 0.7, 0.7, 1.0, 1.0, 0.7, 1.0, 1.0, 1.0, 0.7, 1.0, 1... $ space2 <dbl> 0.7, 0.7, 1.0, 1.0, 0.7, 1.0, 1.0, 1.0, 0.7, 1.0, 1... $ space3 <dbl> 1.0, 1.0, 1.0, 0.7, 1.0, 1.0, 0.7, 0.7, 1.0, 0.7, 1... $ space4 <dbl> 1.0, 1.0, 1.0, 0.7, 1.0, 1.0, 0.7, 0.7, 1.0, 0.7, 1... $ space5 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... $ space6 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... $ cost1 <dbl> 4, 4, 6, 8, 6, 6, 8, 1, 4, 2, 2, 2, 8, 1, 1, 8, 8, ... $ cost2 <dbl> 4, 4, 6, 8, 6, 6, 8, 1, 4, 2, 2, 2, 8, 1, 1, 8, 8, ... $ cost3 <dbl> 6, 8, 8, 2, 8, 8, 4, 4, 4, 2, 4, 8, 4, 2, 4, 8, 2, ... $ cost4 <dbl> 6, 8, 8, 2, 8, 8, 4, 4, 4, 2, 4, 8, 4, 2, 4, 8, 2, ... $ cost5 <dbl> 8, 4, 2, 4, 2, 2, 8, 8, 6, 4, 6, 2, 8, 4, 6, 4, 6, ... $ cost6 <dbl> 8, 4, 2, 4, 2, 2, 8, 8, 6, 4, 6, 2, 8, 4, 6, 4, 6, ... $ station1 <dbl> 0.1, 0.0, 0.1, 0.0, 0.3, 0.1, 0.0, 0.0, 0.0, 0.0, 0... $ station2 <dbl> 0.1, 0.0, 0.1, 0.0, 0.3, 0.1, 0.0, 0.0, 0.0, 0.0, 0... $ station3 <dbl> 0.3, 0.1, 0.3, 0.7, 0.7, 0.3, 0.7, 0.7, 0.3, 0.7, 0... $ station4 <dbl> 0.3, 0.1, 0.3, 0.7, 0.7, 0.3, 0.7, 0.7, 0.3, 0.7, 0... $ station5 <dbl> 1.0, 1.0, 1.0, 0.1, 1.0, 1.0, 1.0, 1.0, 1.0, 0.1, 0... $ station6 <dbl> 1.0, 1.0, 1.0, 0.1, 1.0, 1.0, 1.0, 1.0, 1.0, 0.1, 0...
Create a new data frame that uses only the variables for vehicle chosen, demographics, and body type options. Tidy this new data frame.
car_type <- car %>% select(id:type6) %>% gather(key = type_order, value = type, -id:-coml5) %>% separate(type_order, into = c("X1","order"), sep = "type") %>% select(-X1) %>% arrange(id, order) car_type %>% select(choice, order, college, hsg2, coml5, type) %>% head()
# A tibble: 6 x 6 choice order college hsg2 coml5 type <chr> <chr> <dbl> <dbl> <dbl> <chr> 1 choice1 1 0 0 0 van 2 choice1 2 0 0 0 regcar 3 choice1 3 0 0 0 van 4 choice1 4 0 0 0 stwagon 5 choice1 5 0 0 0 van 6 choice1 6 0 0 0 truck
Using the data set with all the variables on the vehicle options, Tidy the data set.
To gather multiple variables from multiple sets of columns, one gathers on all columns from all the sets. Then modify the variable created as the
key
to contain the names of the new variables. Finish by spreading using the column that contains the variable names. This is a common set of steps used when shapping a data frame. That is make the data frame as tall enough to contain everything that will be spread. Then spread to the desired columns.car_temp <- car %>% gather(key = type_order, value = value, -id:-coml5) %>% separate(type_order, into = c("variable", "order"), sep = -1) %>% spread(key = variable, value = value) %>% arrange(id, order) glimpse(car_temp)
Observations: 27,924 Variables: 17 $ id <dbl> 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3... $ choice <chr> "choice1", "choice1", "choice1", "choice1", "choice1... $ college <dbl> 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0... $ hsg2 <dbl> 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1... $ coml5 <dbl> 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0... $ order <chr> "1", "2", "3", "4", "5", "6", "1", "2", "3", "4", "5... $ acc <chr> "4", "4", "6", "6", "2.5", "2.5", "2.5", "2.5", "4",... $ cost <chr> "4", "4", "6", "6", "8", "8", "4", "4", "8", "8", "4... $ fuel <chr> "cng", "cng", "electric", "electric", "gasoline", "g... $ pollution <chr> "0.6", "0.6", "0.25", "0.25", "0.5", "0.5", "0", "0"... $ price <chr> "4.1753448", "4.1753448", "4.8177056", "4.8177056", ... $ range <chr> "250", "250", "400", "400", "250", "250", "125", "12... $ size <chr> "3", "3", "2", "2", "3", "3", "3", "3", "3", "3", "2... $ space <chr> "0.7", "0.7", "1", "1", "1", "1", "0.7", "0.7", "1",... $ speed <chr> "95", "95", "110", "110", "140", "140", "85", "85", ... $ station <chr> "0.1", "0.1", "0.3", "0.3", "1", "1", "0", "0", "0.1... $ type <chr> "van", "regcar", "van", "stwagon", "van", "truck", "...