SSCC - Social Science Computing Cooperative Supporting Statistical Analysis for Research

6.2 Tidy data

These examples use the Car.csv data set.

  1. Load the Car.csv data set.

    library(tidyverse)
    car_path <- file.path("..", "datasets", "Car.csv")
    car_in <- read_csv(car_path, col_types = cols())
    Warning: Missing column names filled in: 'X1' [1]
    car <- 
      car_in %>%
      rename(
        id = X1
        )
    
    glimpse(car)
    Observations: 4,654
    Variables: 71
    $ id         <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ...
    $ choice     <chr> "choice1", "choice2", "choice5", "choice5", "choice...
    $ college    <dbl> 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, ...
    $ hsg2       <dbl> 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, ...
    $ coml5      <dbl> 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...
    $ type1      <chr> "van", "regcar", "regcar", "regcar", "regcar", "tru...
    $ type2      <chr> "regcar", "van", "truck", "truck", "truck", "regcar...
    $ type3      <chr> "van", "regcar", "regcar", "regcar", "regcar", "tru...
    $ type4      <chr> "stwagon", "stwagon", "van", "van", "van", "van", "...
    $ type5      <chr> "van", "regcar", "regcar", "regcar", "regcar", "tru...
    $ type6      <chr> "truck", "truck", "stwagon", "stwagon", "stwagon", ...
    $ fuel1      <chr> "cng", "methanol", "cng", "methanol", "cng", "cng",...
    $ fuel2      <chr> "cng", "methanol", "cng", "methanol", "cng", "cng",...
    $ fuel3      <chr> "electric", "cng", "electric", "cng", "electric", "...
    $ fuel4      <chr> "electric", "cng", "electric", "cng", "electric", "...
    $ fuel5      <chr> "gasoline", "gasoline", "gasoline", "electric", "ga...
    $ fuel6      <chr> "gasoline", "gasoline", "gasoline", "electric", "ga...
    $ price1     <dbl> 4.175345, 3.310947, 4.039574, 7.065968, 5.794157, 3...
    $ price2     <dbl> 4.175345, 3.310947, 4.039574, 7.065968, 5.794157, 3...
    $ price3     <dbl> 4.8177056, 3.5868590, 2.7772075, 7.3871485, 6.34598...
    $ price4     <dbl> 4.8177056, 3.5868590, 2.7772075, 7.3871485, 6.34598...
    $ price5     <dbl> 5.1388859, 4.4145957, 3.2821543, 5.4600663, 4.69050...
    $ price6     <dbl> 5.1388859, 4.4145957, 3.2821543, 5.4600663, 4.69050...
    $ range1     <dbl> 250, 125, 300, 200, 75, 300, 125, 125, 125, 200, 20...
    $ range2     <dbl> 250, 125, 300, 200, 75, 300, 125, 125, 125, 200, 20...
    $ range3     <dbl> 400, 300, 250, 75, 300, 250, 300, 300, 250, 75, 75,...
    $ range4     <dbl> 400, 300, 250, 75, 300, 250, 300, 300, 250, 75, 75,...
    $ range5     <dbl> 250, 300, 300, 300, 350, 300, 300, 300, 300, 300, 3...
    $ range6     <dbl> 250, 300, 300, 300, 350, 300, 300, 300, 300, 300, 3...
    $ acc1       <dbl> 4.0, 2.5, 6.0, 4.0, 4.0, 6.0, 6.0, 6.0, 2.5, 4.0, 4...
    $ acc2       <dbl> 4.0, 2.5, 6.0, 4.0, 4.0, 6.0, 6.0, 6.0, 2.5, 4.0, 4...
    $ acc3       <dbl> 6.0, 4.0, 2.5, 6.0, 6.0, 2.5, 2.5, 2.5, 4.0, 6.0, 6...
    $ acc4       <dbl> 6.0, 4.0, 2.5, 6.0, 6.0, 2.5, 2.5, 2.5, 4.0, 6.0, 6...
    $ acc5       <dbl> 2.5, 6.0, 4.0, 2.5, 2.5, 4.0, 4.0, 4.0, 6.0, 2.5, 2...
    $ acc6       <dbl> 2.5, 6.0, 4.0, 2.5, 2.5, 4.0, 4.0, 4.0, 6.0, 2.5, 2...
    $ speed1     <dbl> 95, 85, 140, 100, 85, 85, 100, 85, 65, 85, 55, 85, ...
    $ speed2     <dbl> 95, 85, 140, 100, 85, 85, 100, 85, 65, 85, 55, 85, ...
    $ speed3     <dbl> 110, 140, 85, 85, 95, 95, 85, 140, 140, 140, 95, 85...
    $ speed4     <dbl> 110, 140, 85, 85, 95, 95, 85, 140, 140, 140, 95, 85...
    $ speed5     <dbl> 140, 95, 95, 95, 110, 110, 110, 95, 85, 85, 110, 95...
    $ speed6     <dbl> 140, 95, 95, 95, 110, 110, 110, 95, 85, 85, 110, 95...
    $ pollution1 <dbl> 0.6, 0.0, 0.1, 0.0, 0.1, 0.6, 0.0, 0.0, 0.0, 0.0, 0...
    $ pollution2 <dbl> 0.6, 0.0, 0.1, 0.0, 0.1, 0.6, 0.0, 0.0, 0.0, 0.0, 0...
    $ pollution3 <dbl> 0.25, 0.40, 0.40, 0.25, 0.40, 0.25, 0.25, 0.40, 0.7...
    $ pollution4 <dbl> 0.25, 0.40, 0.40, 0.25, 0.40, 0.25, 0.25, 0.40, 0.7...
    $ pollution5 <dbl> 0.50, 0.25, 0.75, 0.60, 0.75, 0.50, 1.00, 0.25, 0.2...
    $ pollution6 <dbl> 0.50, 0.25, 0.75, 0.60, 0.75, 0.50, 1.00, 0.25, 0.2...
    $ size1      <dbl> 3, 3, 2, 2, 1, 3, 2, 3, 3, 2, 3, 3, 0, 2, 1, 3, 3, ...
    $ size2      <dbl> 3, 3, 2, 2, 1, 3, 2, 3, 3, 2, 3, 3, 0, 2, 1, 3, 3, ...
    $ size3      <dbl> 2, 3, 3, 3, 2, 2, 3, 3, 3, 3, 1, 1, 1, 3, 2, 1, 3, ...
    $ size4      <dbl> 2, 3, 3, 3, 2, 2, 3, 3, 3, 3, 1, 1, 1, 3, 2, 1, 3, ...
    $ size5      <dbl> 3, 2, 3, 3, 3, 3, 3, 2, 2, 1, 2, 2, 2, 3, 3, 2, 2, ...
    $ size6      <dbl> 3, 2, 3, 3, 3, 3, 3, 2, 2, 1, 2, 2, 2, 3, 3, 2, 2, ...
    $ space1     <dbl> 0.7, 0.7, 1.0, 1.0, 0.7, 1.0, 1.0, 1.0, 0.7, 1.0, 1...
    $ space2     <dbl> 0.7, 0.7, 1.0, 1.0, 0.7, 1.0, 1.0, 1.0, 0.7, 1.0, 1...
    $ space3     <dbl> 1.0, 1.0, 1.0, 0.7, 1.0, 1.0, 0.7, 0.7, 1.0, 0.7, 1...
    $ space4     <dbl> 1.0, 1.0, 1.0, 0.7, 1.0, 1.0, 0.7, 0.7, 1.0, 0.7, 1...
    $ space5     <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
    $ space6     <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
    $ cost1      <dbl> 4, 4, 6, 8, 6, 6, 8, 1, 4, 2, 2, 2, 8, 1, 1, 8, 8, ...
    $ cost2      <dbl> 4, 4, 6, 8, 6, 6, 8, 1, 4, 2, 2, 2, 8, 1, 1, 8, 8, ...
    $ cost3      <dbl> 6, 8, 8, 2, 8, 8, 4, 4, 4, 2, 4, 8, 4, 2, 4, 8, 2, ...
    $ cost4      <dbl> 6, 8, 8, 2, 8, 8, 4, 4, 4, 2, 4, 8, 4, 2, 4, 8, 2, ...
    $ cost5      <dbl> 8, 4, 2, 4, 2, 2, 8, 8, 6, 4, 6, 2, 8, 4, 6, 4, 6, ...
    $ cost6      <dbl> 8, 4, 2, 4, 2, 2, 8, 8, 6, 4, 6, 2, 8, 4, 6, 4, 6, ...
    $ station1   <dbl> 0.1, 0.0, 0.1, 0.0, 0.3, 0.1, 0.0, 0.0, 0.0, 0.0, 0...
    $ station2   <dbl> 0.1, 0.0, 0.1, 0.0, 0.3, 0.1, 0.0, 0.0, 0.0, 0.0, 0...
    $ station3   <dbl> 0.3, 0.1, 0.3, 0.7, 0.7, 0.3, 0.7, 0.7, 0.3, 0.7, 0...
    $ station4   <dbl> 0.3, 0.1, 0.3, 0.7, 0.7, 0.3, 0.7, 0.7, 0.3, 0.7, 0...
    $ station5   <dbl> 1.0, 1.0, 1.0, 0.1, 1.0, 1.0, 1.0, 1.0, 1.0, 0.1, 0...
    $ station6   <dbl> 1.0, 1.0, 1.0, 0.1, 1.0, 1.0, 1.0, 1.0, 1.0, 0.1, 0...
  2. Create a new data frame that uses only the variables for vehicle chosen, demographics, and body type options. Tidy this new data frame.

    car_type <- 
      car %>%
      select(id:type6) %>%
      gather(key = type_order, value = type, -id:-coml5) %>%
      separate(type_order, into = c("X1","order"), sep = "type") %>%
      select(-X1) %>%
      arrange(id, order)
    
    car_type %>%
      select(choice, order,  college, hsg2, coml5, type) %>%
      head()
    # A tibble: 6 x 6
      choice  order college  hsg2 coml5 type   
      <chr>   <chr>   <dbl> <dbl> <dbl> <chr>  
    1 choice1 1           0     0     0 van    
    2 choice1 2           0     0     0 regcar 
    3 choice1 3           0     0     0 van    
    4 choice1 4           0     0     0 stwagon
    5 choice1 5           0     0     0 van    
    6 choice1 6           0     0     0 truck  
  3. Using the data set with all the variables on the vehicle options, Tidy the data set.

    To gather multiple variables from multiple sets of columns, one gathers on all columns from all the sets. Then modify the variable created as the key to contain the names of the new variables. Finish by spreading using the column that contains the variable names. This is a common set of steps used when shapping a data frame. That is make the data frame as tall enough to contain everything that will be spread. Then spread to the desired columns.

    car_temp <- 
      car %>%
      gather(key = type_order, value = value, -id:-coml5) %>%
      separate(type_order, into = c("variable", "order"), sep = -1) %>%
      spread(key = variable, value = value) %>%
      arrange(id, order)
    
    glimpse(car_temp)
    Observations: 27,924
    Variables: 17
    $ id        <dbl> 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3...
    $ choice    <chr> "choice1", "choice1", "choice1", "choice1", "choice1...
    $ college   <dbl> 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0...
    $ hsg2      <dbl> 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
    $ coml5     <dbl> 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0...
    $ order     <chr> "1", "2", "3", "4", "5", "6", "1", "2", "3", "4", "5...
    $ acc       <chr> "4", "4", "6", "6", "2.5", "2.5", "2.5", "2.5", "4",...
    $ cost      <chr> "4", "4", "6", "6", "8", "8", "4", "4", "8", "8", "4...
    $ fuel      <chr> "cng", "cng", "electric", "electric", "gasoline", "g...
    $ pollution <chr> "0.6", "0.6", "0.25", "0.25", "0.5", "0.5", "0", "0"...
    $ price     <chr> "4.1753448", "4.1753448", "4.8177056", "4.8177056", ...
    $ range     <chr> "250", "250", "400", "400", "250", "250", "125", "12...
    $ size      <chr> "3", "3", "2", "2", "3", "3", "3", "3", "3", "3", "2...
    $ space     <chr> "0.7", "0.7", "1", "1", "1", "1", "0.7", "0.7", "1",...
    $ speed     <chr> "95", "95", "110", "110", "140", "140", "85", "85", ...
    $ station   <chr> "0.1", "0.1", "0.3", "0.3", "1", "1", "0", "0", "0.1...
    $ type      <chr> "van", "regcar", "van", "stwagon", "van", "truck", "...