SSCC - Social Science Computing Cooperative Supporting Statistical Analysis for Research

4.6 Subsets of a data frame

  1. Import the PSID.csv data set that was imported in the prior section.

    library(tidyverse)
    psid_path <- file.path("..", "datasets", "PSID.csv")
    psid_in <- read_csv(psid_path, col_types = cols())
    Warning: Missing column names filled in: 'X1' [1]
    psid_in <-
      rename(
        psid_in,
        obs_num = X1,
        intvw_num = intnum,
        person_id = persnum,
        marital_status = married
        )
    
    psid <- psid_in
    glimpse(psid)
    Observations: 4,856
    Variables: 9
    $ obs_num        <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, ...
    $ intvw_num      <dbl> 4, 4, 4, 4, 5, 6, 6, 7, 7, 7, 10, 10, 10, 11, 1...
    $ person_id      <dbl> 4, 6, 7, 173, 2, 4, 172, 4, 170, 171, 3, 171, 1...
    $ age            <dbl> 39, 35, 33, 39, 47, 44, 38, 38, 39, 37, 48, 47,...
    $ educatn        <dbl> 12, 12, 12, 10, 9, 12, 16, 9, 12, 11, 13, 12, 1...
    $ earnings       <dbl> 77250, 12000, 8000, 15000, 6500, 6500, 7000, 50...
    $ hours          <dbl> 2940, 2040, 693, 1904, 1683, 2024, 1144, 2080, ...
    $ kids           <dbl> 2, 2, 1, 2, 5, 2, 3, 4, 3, 5, 98, 3, 0, 0, 2, 0...
    $ marital_status <chr> "married", "divorced", "married", "married", "m...

    The obs_num variable is retained for these examples. The examples in this section operate on row numbers, and this variable holds the row numbers.

  2. Display the last three rows of the data frame using positional values to subset.

    slice(psid, -1:-(n() - 3))
    # A tibble: 3 x 9
      obs_num intvw_num person_id   age educatn earnings hours  kids
        <dbl>     <dbl>     <dbl> <dbl>   <dbl>    <dbl> <dbl> <dbl>
    1    4854      9302         1    37       8    22045  2793    98
    2    4855      9305         2    40       6      134    30     3
    3    4856      9306         2    37      17    33000  2423     4
    # ... with 1 more variable: marital_status <chr>

    Display the end of the data frame using the tail() function to confirm that the correct three rows were displayed.

    tail(psid)
    # A tibble: 6 x 9
      obs_num intvw_num person_id   age educatn earnings hours  kids
        <dbl>     <dbl>     <dbl> <dbl>   <dbl>    <dbl> <dbl> <dbl>
    1    4851      9294         2    37      12        0     0     2
    2    4852      9297         2    42       2     3000  1040     4
    3    4853      9301         2    43      12        0     0     2
    4    4854      9302         1    37       8    22045  2793    98
    5    4855      9305         2    40       6      134    30     3
    6    4856      9306         2    37      17    33000  2423     4
    # ... with 1 more variable: marital_status <chr>
  3. Display the first, third, fifth, and seventh rows of columns two and three.

    slice(psid, c(1, 3, 5, 7))
    # A tibble: 4 x 9
      obs_num intvw_num person_id   age educatn earnings hours  kids
        <dbl>     <dbl>     <dbl> <dbl>   <dbl>    <dbl> <dbl> <dbl>
    1       1         4         4    39      12    77250  2940     2
    2       3         4         7    33      12     8000   693     1
    3       5         5         2    47       9     6500  1683     5
    4       7         6       172    38      16     7000  1144     3
    # ... with 1 more variable: marital_status <chr>

    or

    psid[c(1, 3, 5, 7), ]
    # A tibble: 4 x 9
      obs_num intvw_num person_id   age educatn earnings hours  kids
        <dbl>     <dbl>     <dbl> <dbl>   <dbl>    <dbl> <dbl> <dbl>
    1       1         4         4    39      12    77250  2940     2
    2       3         4         7    33      12     8000   693     1
    3       5         5         2    47       9     6500  1683     5
    4       7         6       172    38      16     7000  1144     3
    # ... with 1 more variable: marital_status <chr>
  4. Create a smaller data frame using the first 20 rows.

    psid_small <- slice(psid, 1:20)

    or

    psid_small <- psid[1:20, ]