2.3 More challenging csv and delimited files

SSCC - Social Science Computing Cooperative

Supporting Statistical Analysis for Research

2.3 More challenging csv and delimited files

Import the amis.csv data set.

library(tidyverse)

amis_path <- file.path("..", "datasets", "amis.csv")
amis <- read_csv(amis_path, col_types = cols())

Warning: 8440 parsing failures.
row col  expected    actual                   file
  3  -- 1 columns 2 columns '../datasets/amis.csv'
  9  -- 1 columns 3 columns '../datasets/amis.csv'
 10  -- 1 columns 5 columns '../datasets/amis.csv'
 11  -- 1 columns 5 columns '../datasets/amis.csv'
 12  -- 1 columns 5 columns '../datasets/amis.csv'
... ... ......... ......... ......................
See problems(...) for more details.

Are there any rows that need to be ignored in the amis data set? If so, modify your import to account for them.

head(amis)

# A tibble: 6 x 1
  speed                                                                    
  <chr>                                                                    
1 Speeds of cars (in miles per hour).                                      
2 period                                                                   
3 A numeric column indicating the time that the reading was taken. A value~
4 warning                                                                  
5 A numeric column indicating whether the location of the reading was chos~
6 pair

The first 11 lines of the file contain variable names and their descriptions (metadata) rather than the header row or data, so they need to be skipped on import.

amis <- read_csv(amis_path, skip = 11, col_types = cols())

Warning: Missing column names filled in: 'X1' [1]

Are there any special symbols that need to be set to missing in the amis data set? If so, modify your import to account for them.

glimpse(amis)

Observations: 8,437
Variables: 5
$ X1      <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
$ speed   <dbl> 26, 26, 26, 26, 27, 28, 28, 28, 28, 29, 29, 29, 29, 29...
$ period  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
$ warning <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
$ pair    <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...

There do not appear to be any special symbols used to indicate missing values in the data set.

Import the mifem.csv data set.

mifem_path <- file.path("..", "datasets", "mifem.csv")
mifem <- read_csv(mifem_path, col_types = cols())

Warning: Missing column names filled in: 'X1' [1]

Is there any metadata at the top or bottom of the mifem data set? You will need to determine how to view the bottom of a data set. If so, modify your import to account for it.

head(mifem)

# A tibble: 6 x 11
     X1 outcome   age yronset premi smstat diabetes highbp hichol angina
  <dbl> <chr>   <dbl>   <dbl> <chr> <chr>  <chr>    <chr>  <chr>  <chr> 
1     1 live       63      85 n     x      n        y      y      n     
2     6 live       55      85 n     c      n        y      y      n     
3     8 live       68      85 y     nk     nk       y      nk     y     
4    10 live       64      85 n     x      n        y      n      y     
5    11 dead       67      85 n     nk     nk       nk     nk     nk    
6    15 live       66      85 n     x      nk       nk     nk     nk    
# ... with 1 more variable: stroke <chr>

tail(mifem)

# A tibble: 6 x 11
     X1 outcome   age yronset premi smstat diabetes highbp hichol angina
  <dbl> <chr>   <dbl>   <dbl> <chr> <chr>  <chr>    <chr>  <chr>  <chr> 
1  6345 live       65      93 n     n      n        y      n      n     
2  6347 live       69      93 n     x      n        y      nk     n     
3  6359 live       54      93 n     n      n        y      y      n     
4  6360 live       64      93 n     n      n        y      y      n     
5  6361 live       36      93 n     c      n        n      y      n     
6  6366 live       65      93 n     n      n        n      nk     n     
# ... with 1 more variable: stroke <chr>

There does not appear to be any metadata at the top or bottom of the data frame.

Are there any special symbols that need to be set to missing in the mifem data set? If so, modify your import to account for them.

glimpse(mifem)

Observations: 1,295
Variables: 11
$ X1       <dbl> 1, 6, 8, 10, 11, 15, 21, 22, 23, 28, 36, 40, 41, 43, ...
$ outcome  <chr> "live", "live", "live", "live", "dead", "live", "live...
$ age      <dbl> 63, 55, 68, 64, 67, 66, 63, 68, 46, 66, 59, 63, 55, 5...
$ yronset  <dbl> 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 8...
$ premi    <chr> "n", "n", "y", "n", "n", "n", "n", "y", "n", "y", "n"...
$ smstat   <chr> "x", "c", "nk", "x", "nk", "x", "n", "n", "c", "c", "...
$ diabetes <chr> "n", "n", "nk", "n", "nk", "nk", "n", "n", "n", "n", ...
$ highbp   <chr> "y", "y", "y", "y", "nk", "nk", "y", "y", "y", "y", "...
$ hichol   <chr> "y", "y", "nk", "n", "nk", "nk", "n", "y", "nk", "n",...
$ angina   <chr> "n", "n", "y", "y", "nk", "nk", "n", "y", "nk", "n", ...
$ stroke   <chr> "n", "n", "n", "n", "nk", "nk", "n", "y", "n", "y", "...

There are values of nk in several of the variables. The data descriptions do not identify any other indicator for missing values, so nk is added to the set of strings that are read as NA.

mifem <- read_csv(mifem_path, na = c("", "NA", "nk"), col_types = cols())

Warning: Missing column names filled in: 'X1' [1]

glimpse(mifem)

Observations: 1,295
Variables: 11
$ X1       <dbl> 1, 6, 8, 10, 11, 15, 21, 22, 23, 28, 36, 40, 41, 43, ...
$ outcome  <chr> "live", "live", "live", "live", "dead", "live", "live...
$ age      <dbl> 63, 55, 68, 64, 67, 66, 63, 68, 46, 66, 59, 63, 55, 5...
$ yronset  <dbl> 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 8...
$ premi    <chr> "n", "n", "y", "n", "n", "n", "n", "y", "n", "y", "n"...
$ smstat   <chr> "x", "c", NA, "x", NA, "x", "n", "n", "c", "c", "c", ...
$ diabetes <chr> "n", "n", NA, "n", NA, NA, "n", "n", "n", "n", "n", "...
$ highbp   <chr> "y", "y", "y", "y", NA, NA, "y", "y", "y", "y", "y", ...
$ hichol   <chr> "y", "y", NA, "n", NA, NA, "n", "y", NA, "n", "n", "y...
$ angina   <chr> "n", "n", "y", "y", NA, NA, "n", "y", NA, "n", "n", "...
$ stroke   <chr> "n", "n", "n", "n", NA, NA, "n", "y", "n", "y", "n", ...