6.2 Tidy data

SSCC - Social Science Computing Cooperative

Supporting Statistical Analysis for Research

These examples use the Car.csv data set.

Load the Car.csv data set.

library(tidyverse)

car_path <- file.path("..", "datasets", "Car.csv")
car_in <- read_csv(car_path, col_types = cols())

Warning: Missing column names filled in: 'X1' [1]

car <- 
  car_in %>%
  rename(
    id = X1
    )

glimpse(car)

Observations: 4,654
Variables: 71
$ id         <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ...
$ choice     <chr> "choice1", "choice2", "choice5", "choice5", "choice...
$ college    <dbl> 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, ...
$ hsg2       <dbl> 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, ...
$ coml5      <dbl> 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...
$ type1      <chr> "van", "regcar", "regcar", "regcar", "regcar", "tru...
$ type2      <chr> "regcar", "van", "truck", "truck", "truck", "regcar...
$ type3      <chr> "van", "regcar", "regcar", "regcar", "regcar", "tru...
$ type4      <chr> "stwagon", "stwagon", "van", "van", "van", "van", "...
$ type5      <chr> "van", "regcar", "regcar", "regcar", "regcar", "tru...
$ type6      <chr> "truck", "truck", "stwagon", "stwagon", "stwagon", ...
$ fuel1      <chr> "cng", "methanol", "cng", "methanol", "cng", "cng",...
$ fuel2      <chr> "cng", "methanol", "cng", "methanol", "cng", "cng",...
$ fuel3      <chr> "electric", "cng", "electric", "cng", "electric", "...
$ fuel4      <chr> "electric", "cng", "electric", "cng", "electric", "...
$ fuel5      <chr> "gasoline", "gasoline", "gasoline", "electric", "ga...
$ fuel6      <chr> "gasoline", "gasoline", "gasoline", "electric", "ga...
$ price1     <dbl> 4.175345, 3.310947, 4.039574, 7.065968, 5.794157, 3...
$ price2     <dbl> 4.175345, 3.310947, 4.039574, 7.065968, 5.794157, 3...
$ price3     <dbl> 4.8177056, 3.5868590, 2.7772075, 7.3871485, 6.34598...
$ price4     <dbl> 4.8177056, 3.5868590, 2.7772075, 7.3871485, 6.34598...
$ price5     <dbl> 5.1388859, 4.4145957, 3.2821543, 5.4600663, 4.69050...
$ price6     <dbl> 5.1388859, 4.4145957, 3.2821543, 5.4600663, 4.69050...
$ range1     <dbl> 250, 125, 300, 200, 75, 300, 125, 125, 125, 200, 20...
$ range2     <dbl> 250, 125, 300, 200, 75, 300, 125, 125, 125, 200, 20...
$ range3     <dbl> 400, 300, 250, 75, 300, 250, 300, 300, 250, 75, 75,...
$ range4     <dbl> 400, 300, 250, 75, 300, 250, 300, 300, 250, 75, 75,...
$ range5     <dbl> 250, 300, 300, 300, 350, 300, 300, 300, 300, 300, 3...
$ range6     <dbl> 250, 300, 300, 300, 350, 300, 300, 300, 300, 300, 3...
$ acc1       <dbl> 4.0, 2.5, 6.0, 4.0, 4.0, 6.0, 6.0, 6.0, 2.5, 4.0, 4...
$ acc2       <dbl> 4.0, 2.5, 6.0, 4.0, 4.0, 6.0, 6.0, 6.0, 2.5, 4.0, 4...
$ acc3       <dbl> 6.0, 4.0, 2.5, 6.0, 6.0, 2.5, 2.5, 2.5, 4.0, 6.0, 6...
$ acc4       <dbl> 6.0, 4.0, 2.5, 6.0, 6.0, 2.5, 2.5, 2.5, 4.0, 6.0, 6...
$ acc5       <dbl> 2.5, 6.0, 4.0, 2.5, 2.5, 4.0, 4.0, 4.0, 6.0, 2.5, 2...
$ acc6       <dbl> 2.5, 6.0, 4.0, 2.5, 2.5, 4.0, 4.0, 4.0, 6.0, 2.5, 2...
$ speed1     <dbl> 95, 85, 140, 100, 85, 85, 100, 85, 65, 85, 55, 85, ...
$ speed2     <dbl> 95, 85, 140, 100, 85, 85, 100, 85, 65, 85, 55, 85, ...
$ speed3     <dbl> 110, 140, 85, 85, 95, 95, 85, 140, 140, 140, 95, 85...
$ speed4     <dbl> 110, 140, 85, 85, 95, 95, 85, 140, 140, 140, 95, 85...
$ speed5     <dbl> 140, 95, 95, 95, 110, 110, 110, 95, 85, 85, 110, 95...
$ speed6     <dbl> 140, 95, 95, 95, 110, 110, 110, 95, 85, 85, 110, 95...
$ pollution1 <dbl> 0.6, 0.0, 0.1, 0.0, 0.1, 0.6, 0.0, 0.0, 0.0, 0.0, 0...
$ pollution2 <dbl> 0.6, 0.0, 0.1, 0.0, 0.1, 0.6, 0.0, 0.0, 0.0, 0.0, 0...
$ pollution3 <dbl> 0.25, 0.40, 0.40, 0.25, 0.40, 0.25, 0.25, 0.40, 0.7...
$ pollution4 <dbl> 0.25, 0.40, 0.40, 0.25, 0.40, 0.25, 0.25, 0.40, 0.7...
$ pollution5 <dbl> 0.50, 0.25, 0.75, 0.60, 0.75, 0.50, 1.00, 0.25, 0.2...
$ pollution6 <dbl> 0.50, 0.25, 0.75, 0.60, 0.75, 0.50, 1.00, 0.25, 0.2...
$ size1      <dbl> 3, 3, 2, 2, 1, 3, 2, 3, 3, 2, 3, 3, 0, 2, 1, 3, 3, ...
$ size2      <dbl> 3, 3, 2, 2, 1, 3, 2, 3, 3, 2, 3, 3, 0, 2, 1, 3, 3, ...
$ size3      <dbl> 2, 3, 3, 3, 2, 2, 3, 3, 3, 3, 1, 1, 1, 3, 2, 1, 3, ...
$ size4      <dbl> 2, 3, 3, 3, 2, 2, 3, 3, 3, 3, 1, 1, 1, 3, 2, 1, 3, ...
$ size5      <dbl> 3, 2, 3, 3, 3, 3, 3, 2, 2, 1, 2, 2, 2, 3, 3, 2, 2, ...
$ size6      <dbl> 3, 2, 3, 3, 3, 3, 3, 2, 2, 1, 2, 2, 2, 3, 3, 2, 2, ...
$ space1     <dbl> 0.7, 0.7, 1.0, 1.0, 0.7, 1.0, 1.0, 1.0, 0.7, 1.0, 1...
$ space2     <dbl> 0.7, 0.7, 1.0, 1.0, 0.7, 1.0, 1.0, 1.0, 0.7, 1.0, 1...
$ space3     <dbl> 1.0, 1.0, 1.0, 0.7, 1.0, 1.0, 0.7, 0.7, 1.0, 0.7, 1...
$ space4     <dbl> 1.0, 1.0, 1.0, 0.7, 1.0, 1.0, 0.7, 0.7, 1.0, 0.7, 1...
$ space5     <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
$ space6     <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
$ cost1      <dbl> 4, 4, 6, 8, 6, 6, 8, 1, 4, 2, 2, 2, 8, 1, 1, 8, 8, ...
$ cost2      <dbl> 4, 4, 6, 8, 6, 6, 8, 1, 4, 2, 2, 2, 8, 1, 1, 8, 8, ...
$ cost3      <dbl> 6, 8, 8, 2, 8, 8, 4, 4, 4, 2, 4, 8, 4, 2, 4, 8, 2, ...
$ cost4      <dbl> 6, 8, 8, 2, 8, 8, 4, 4, 4, 2, 4, 8, 4, 2, 4, 8, 2, ...
$ cost5      <dbl> 8, 4, 2, 4, 2, 2, 8, 8, 6, 4, 6, 2, 8, 4, 6, 4, 6, ...
$ cost6      <dbl> 8, 4, 2, 4, 2, 2, 8, 8, 6, 4, 6, 2, 8, 4, 6, 4, 6, ...
$ station1   <dbl> 0.1, 0.0, 0.1, 0.0, 0.3, 0.1, 0.0, 0.0, 0.0, 0.0, 0...
$ station2   <dbl> 0.1, 0.0, 0.1, 0.0, 0.3, 0.1, 0.0, 0.0, 0.0, 0.0, 0...
$ station3   <dbl> 0.3, 0.1, 0.3, 0.7, 0.7, 0.3, 0.7, 0.7, 0.3, 0.7, 0...
$ station4   <dbl> 0.3, 0.1, 0.3, 0.7, 0.7, 0.3, 0.7, 0.7, 0.3, 0.7, 0...
$ station5   <dbl> 1.0, 1.0, 1.0, 0.1, 1.0, 1.0, 1.0, 1.0, 1.0, 0.1, 0...
$ station6   <dbl> 1.0, 1.0, 1.0, 0.1, 1.0, 1.0, 1.0, 1.0, 1.0, 0.1, 0...

Create a new data frame that uses only the variables for vehicle chosen, demographics, and body type options. Tidy this new data frame.

car_type <- 
  car %>%
  select(id:type6) %>%
  gather(key = type_order, value = type, -id:-coml5) %>%
  separate(type_order, into = c("X1","order"), sep = "type") %>%
  select(-X1) %>%
  arrange(id, order)

car_type %>%
  select(choice, order,  college, hsg2, coml5, type) %>%
  head()

# A tibble: 6 x 6
  choice  order college  hsg2 coml5 type   
  <chr>   <chr>   <dbl> <dbl> <dbl> <chr>  
1 choice1 1           0     0     0 van    
2 choice1 2           0     0     0 regcar 
3 choice1 3           0     0     0 van    
4 choice1 4           0     0     0 stwagon
5 choice1 5           0     0     0 van    
6 choice1 6           0     0     0 truck

Using the data set with all the variables on the vehicle options, Tidy the data set.

To gather multiple variables from multiple sets of columns, one gathers on all columns from all the sets. Then modify the variable created as the key to contain the names of the new variables. Finish by spreading using the column that contains the variable names. This is a common set of steps used when shapping a data frame. That is make the data frame as tall enough to contain everything that will be spread. Then spread to the desired columns.

car_temp <- 
  car %>%
  gather(key = type_order, value = value, -id:-coml5) %>%
  separate(type_order, into = c("variable", "order"), sep = -1) %>%
  spread(key = variable, value = value) %>%
  arrange(id, order)

glimpse(car_temp)

Observations: 27,924
Variables: 17
$ id        <dbl> 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3...
$ choice    <chr> "choice1", "choice1", "choice1", "choice1", "choice1...
$ college   <dbl> 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0...
$ hsg2      <dbl> 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
$ coml5     <dbl> 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0...
$ order     <chr> "1", "2", "3", "4", "5", "6", "1", "2", "3", "4", "5...
$ acc       <chr> "4", "4", "6", "6", "2.5", "2.5", "2.5", "2.5", "4",...
$ cost      <chr> "4", "4", "6", "6", "8", "8", "4", "4", "8", "8", "4...
$ fuel      <chr> "cng", "cng", "electric", "electric", "gasoline", "g...
$ pollution <chr> "0.6", "0.6", "0.25", "0.25", "0.5", "0.5", "0", "0"...
$ price     <chr> "4.1753448", "4.1753448", "4.8177056", "4.8177056", ...
$ range     <chr> "250", "250", "400", "400", "250", "250", "125", "12...
$ size      <chr> "3", "3", "2", "2", "3", "3", "3", "3", "3", "3", "2...
$ space     <chr> "0.7", "0.7", "1", "1", "1", "1", "0.7", "0.7", "1",...
$ speed     <chr> "95", "95", "110", "110", "140", "140", "85", "85", ...
$ station   <chr> "0.1", "0.1", "0.3", "0.3", "1", "1", "0", "0", "0.1...
$ type      <chr> "van", "regcar", "van", "stwagon", "van", "truck", "...