SSCC - Social Science Computing Cooperative Supporting Statistical Analysis for Research

4.7 Coding missing values

These exercises use the PSID.csv data set that was imported in the prior section.

  1. Import the PSID.csv data set.

    # Attach the tidyverse packages; read_csv(), rename(), select(), glimpse()
    # and the %>% pipe used in these exercises come from them.
    library(tidyverse)
    # Build the relative path ../datasets/PSID.csv from its components.
    psid_path <- file.path("..", "datasets", "PSID.csv")
    # Read the raw file; col_types = cols() supplies an empty column
    # specification, so the column types are guessed from the data.
    psid_in <- read_csv(psid_path, col_types = cols())
    Warning: Missing column names filled in: 'X1' [1]
    # Give the columns descriptive names. X1 is the name read_csv() filled in
    # for the unnamed first column of the file.
    psid_in <- psid_in %>%
      rename(
        obs_num = X1,
        intvw_num = intnum,
        person_id = persnum,
        marital_status = married
      )

    # Build the working data set without the observation-number column,
    # then show a compact summary of its columns.
    psid <- select(psid_in, -obs_num)
    glimpse(psid)
    Observations: 4,856
    Variables: 8
    $ intvw_num      <dbl> 4, 4, 4, 4, 5, 6, 6, 7, 7, 7, 10, 10, 10, 11, 1...
    $ person_id      <dbl> 4, 6, 7, 173, 2, 4, 172, 4, 170, 171, 3, 171, 1...
    $ age            <dbl> 39, 35, 33, 39, 47, 44, 38, 38, 39, 37, 48, 47,...
    $ educatn        <dbl> 12, 12, 12, 10, 9, 12, 16, 9, 12, 11, 13, 12, 1...
    $ earnings       <dbl> 77250, 12000, 8000, 15000, 6500, 6500, 7000, 50...
    $ hours          <dbl> 2940, 2040, 693, 1904, 1683, 2024, 1144, 2080, ...
    $ kids           <dbl> 2, 2, 1, 2, 5, 2, 3, 4, 3, 5, 98, 3, 0, 0, 2, 0...
    $ marital_status <chr> "married", "divorced", "married", "married", "m...
  2. Code NAs for the kids variable.

    In the preparatory exercises it was seen that the kids variable contains some very large values, larger than 90. Change these to NA.

    # Recode kids: values of 90 or more become NA, everything below 90 is
    # kept as is. NA_real_ keeps both if_else() branches of type double.
    psid <- mutate(
      psid,
      kids = if_else(kids >= 90, NA_real_, kids)
    )
  3. Display observations that contain missing values in the kids variable.

    # List the rows whose kids value is NA, restricted to the identifying and
    # demographic columns, ordered by person_id and then age. Only the first
    # 15 rows are printed.
    psid %>%
      select(intvw_num, person_id, age, educatn, kids, marital_status) %>%
      filter(is.na(kids)) %>%
      arrange(person_id, age) %>%
      print(n = 15)
    # A tibble: 118 x 6
       intvw_num person_id   age educatn  kids marital_status
           <dbl>     <dbl> <dbl>   <dbl> <dbl> <chr>         
     1      8937         1    36      12    NA never married 
     2      9302         1    37       8    NA divorced      
     3      7660         1    50       8    NA divorced      
     4      2704         2    39       0    NA no histories  
     5      5806         2    45       0    NA no histories  
     6       878         2    47      12    NA divorced      
     7      8444         2    48       8    NA no histories  
     8      8652         2    48       1    NA married       
     9      5474         2    49       5    NA married       
    10      7269         2    49      11    NA separated     
    11      9004         2    49      99    NA married       
    12      1709         2    50       0    NA no histories  
    13      5413         2    50      17    NA married       
    14      7207         2    50      99    NA married       
    15      8955         2    50      11    NA married       
    # ... with 103 more rows