5.7 Relationships between columns

SSCC - Social Science Computing Cooperative

Supporting Statistical Analysis for Research

These exercises use the Chile.csv data set.

Import the Chile.csv file.

library(tidyverse)

# Build the relative path to the data file from its components, then read
# it; the empty cols() specification leaves the column types to readr.
chile_path <- file.path("..", "datasets", "Chile.csv")
chile_in <-
  chile_path %>%
  read_csv(col_types = cols())

Warning: Missing column names filled in: 'X1' [1]

# Rename statusquo to the snake_case name status_quo.
chile_in <- rename(chile_in, status_quo = statusquo)

# Drop X1, the placeholder name that was filled in for the unnamed first
# column on import, and keep the result as the working data set.
chile <- select(chile_in, -X1)

# Show each remaining column with its type and first few values.
glimpse(chile)

Observations: 2,700
Variables: 8
$ region     <chr> "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "...
$ population <dbl> 175000, 175000, 175000, 175000, 175000, 175000, 175...
$ sex        <chr> "M", "M", "F", "F", "F", "F", "M", "F", "F", "M", "...
$ age        <dbl> 65, 29, 38, 49, 23, 28, 26, 24, 41, 41, 64, 19, 27,...
$ education  <chr> "P", "PS", "P", "P", "S", "P", "PS", "S", "P", "P",...
$ income     <dbl> 35000, 7500, 15000, 35000, 35000, 7500, 35000, 1500...
$ status_quo <dbl> 1.00820, -1.29617, 1.23072, -1.03163, -1.10496, -1....
$ vote       <chr> "Y", "N", "Y", "N", "N", "N", "N", "N", "U", "N", "...

Find all rows with a missing value in any column, using a related-columns method.

# Keep only the rows that contain at least one NA.
# pmap_lgl() walks the data frame row by row, passing the values of
# region through vote as arguments; c(...) gathers them into one vector
# and anyNA() reports whether any of them is missing.
chile_na_rows <-
  chile %>%
  filter(
    pmap_lgl(
      select(., region:vote),
      ~ anyNA(c(...))
    )
  )

# Preview the first rows that have a missing value.
head(chile_na_rows)

# A tibble: 6 x 8
  region population sex     age education income status_quo vote 
  <chr>       <dbl> <chr> <dbl> <chr>      <dbl>      <dbl> <chr>
1 N          175000 F        27 PS            NA      1.43  Y    
2 N          175000 M        36 PS         35000      1.49  <NA> 
3 N          175000 F        43 P             NA      0.155 A    
4 N          125000 F        32 S             NA     -0.850 N    
5 N          125000 F        34 P           2500      0.108 <NA> 
6 N          250000 F        46 S             NA      0.155 <NA>