Data Management II

Agenda

  • dplyr II
  • Overwriting data

Learning objectives

By the end of the lecture, you will be able to …

  • implement more basic variable manipulation
  • understand overwriting principles

Code-along 04

Download and open code-along-04.qmd

Packages

Load the standard packages.

library(tidyverse)
library(haven) # not core tidyverse
library(gssr)
library(gssrdoc) # load the GSS codebook
library(summarytools)

Load your data

# Get the data all survey years
data(gss_all)

dplyr II

select()

Use select() to choose columns in a dataframe.


my_data <- gss_all |>
  select(year, agekdbrn, childs, hrs1)
1
Save a new data frame
2
Keep only the listed variables

head() & tail()

head() looks at the first few column names and first few rows of your df.
tail() looks at the first few column names and last few rows of your df.


# look at the first rows & columns of the data frame
head(my_data)
# A tibble: 6 × 4
  year      agekdbrn    childs    hrs1       
  <dbl+lbl> <dbl+lbl>   <dbl+lbl> <dbl+lbl>  
1 1972      NA(i) [iap] 0         NA(i) [iap]
2 1972      NA(i) [iap] 5         NA(i) [iap]
3 1972      NA(i) [iap] 4         NA(i) [iap]
4 1972      NA(i) [iap] 0         NA(i) [iap]
5 1972      NA(i) [iap] 2         NA(i) [iap]
6 1972      NA(i) [iap] 0         NA(i) [iap]

Mini-task

Use tail() to look at the last few rows of my_data


# look at the first rows & LAST columns of the data frame
tail(my_data)
# A tibble: 6 × 4
  year      agekdbrn                              childs    hrs1       
  <dbl+lbl> <dbl+lbl>                             <dbl+lbl> <dbl+lbl>  
1 2024      NA(x) [not available in this release] 2         NA(i) [iap]
2 2024      NA(x) [not available in this release] 3            15      
3 2024      NA(x) [not available in this release] 4            18      
4 2024      NA(x) [not available in this release] 3         NA(i) [iap]
5 2024      NA(x) [not available in this release] 3            37      
6 2024      NA(x) [not available in this release] 2         NA(i) [iap]

Mini-task

Create a dataframe with the variables year, agekdbrn, childs, and hrs1, including only respondents from the year 2022 who have no missing values in any of these variables.


my_data <- gss_all |>
  select(year, agekdbrn, childs, hrs1) |>
  filter(year == 2022) |>
  drop_na()

head(my_data)
1
No need to list all variables inside drop_na()

Mini-task

# A tibble: 6 × 4
  year      agekdbrn  childs    hrs1     
  <dbl+lbl> <dbl+lbl> <dbl+lbl> <dbl+lbl>
1 2022      27        1         40       
2 2022      27        1         52       
3 2022      28        1         31       
4 2022      24        2         40       
5 2022      21        2         10       
6 2022      25        7         20       

mutate()

Use mutate() to add new columns or change existing ones.


# Use mutate to create a new variable: annual_hrs1
my_data <- my_data |>
  mutate(annual_hrs1 = hrs1 * 52)

head(my_data)
# A tibble: 6 × 5
  year      agekdbrn  childs    hrs1      annual_hrs1
  <dbl+lbl> <dbl+lbl> <dbl+lbl> <dbl+lbl>       <dbl>
1 2022      27        1         40               2080
2 2022      27        1         52               2704
3 2022      28        1         31               1612
4 2022      24        2         40               2080
5 2022      21        2         10                520
6 2022      25        7         20               1040

Mini-task

Use mutate(), mean(), and sd() to create a column with Z scores for hrs1


my_data <- my_data |>
  # Create Z-score for annual_hours
  mutate(z_hrs1 = (hrs1 - mean(hrs1)) /
    sd(hrs1))

head(my_data) # Check your work

Mini-task

# A tibble: 6 × 6
  year      agekdbrn  childs    hrs1      annual_hrs1   z_hrs1
  <dbl+lbl> <dbl+lbl> <dbl+lbl> <dbl+lbl>       <dbl>    <dbl>
1 2022      27        1         40               2080 -0.00756
2 2022      27        1         52               2704  0.820  
3 2022      28        1         31               1612 -0.628  
4 2022      24        2         40               2080 -0.00756
5 2022      21        2         10                520 -2.08   
6 2022      25        7         20               1040 -1.39   

mutate() & logical variables

my_data <- my_data |>
  mutate(teen_parent = (agekdbrn < 18) * 1)

head(my_data)
1
Returns a logical vector (TRUE, FALSE, or NA) and * 1 converts it to numeric
# A tibble: 6 × 7
  year      agekdbrn  childs    hrs1      annual_hrs1   z_hrs1 teen_parent
  <dbl+lbl> <dbl+lbl> <dbl+lbl> <dbl+lbl>       <dbl>    <dbl>       <dbl>
1 2022      27        1         40               2080 -0.00756           0
2 2022      27        1         52               2704  0.820             0
3 2022      28        1         31               1612 -0.628             0
4 2022      24        2         40               2080 -0.00756           0
5 2022      21        2         10                520 -2.08              0
6 2022      25        7         20               1040 -1.39              0

Code Meets Question

What proportion of new parents were teenagers (e.g., under 18 years old)?


my_data |>
  summarise(proportion = mean(teen_parent))
1
Mean of the variable, which equals the proportion of 1s
# A tibble: 1 × 1
  proportion
       <dbl>
1     0.0696

Mini-task

Use mutate() and summarise() to determine what proportion of new parents were older than 40 years old.


my_data |>
  mutate(older_parent = (agekdbrn > 40) * 1) |>
  summarise(proportion = mean(older_parent))
# A tibble: 1 × 1
  proportion
       <dbl>
1     0.0192

mutate() with case_when()

Use case_when() inside mutate() to create values based on conditions.


my_data <- my_data |>
  # Use mutate to create a new variable: childs_3cat
  mutate(childs_3cat = case_when(
    childs == 1 ~ "1 kid",
    childs == 2 ~ "2 kids",
    childs >= 3 ~ "3+ kids",
    TRUE ~ NA_character_
  ))

# Always double check your work!
table(my_data$childs, my_data$childs_3cat)

mutate() with case_when()

   
    1 kid 2 kids 3+ kids
  1   320      0       0
  2     0    497       0
  3     0      0     251
  4     0      0     101
  5     0      0      44
  6     0      0      16
  7     0      0       6
  8     0      0      15

Code Meets Question

What proportion of new parents had their first child in their teens, 20s, 30s, or after age 40?


my_data <- my_data |>
  mutate(age_groups = case_when(
    agekdbrn < 18 ~ "<18",
    agekdbrn >= 18 & agekdbrn <= 29 ~ "18–29",
    agekdbrn >= 30 & agekdbrn <= 39 ~ "30–39",
    agekdbrn >= 40 ~ "40+",
    TRUE ~ NA_character_
  ))

my_data |>
  freq(age_groups, report.nas = FALSE, headings = FALSE)

Code Meets Question


              Freq        %   % Cum.
----------- ------ -------- --------
        <18     87     6.96     6.96
      18–29    840    67.20    74.16
      30–39    291    23.28    97.44
        40+     32     2.56   100.00
      Total   1250   100.00   100.00

Overwriting data

Overwriting & assignment operators

Heads Up!

Overwriting datasets and variables can be intentional or unintentional.


Let’s make a tiny data frame to use as an example:

df <- tibble(
  x = c(1, 2, 3, 4, 5),
  y = c("a", "a", "b", "c", "c")
)

df
# A tibble: 5 × 2
      x y    
  <dbl> <chr>
1     1 a    
2     2 a    
3     3 b    
4     4 c    
5     5 c    

Suppose you run the following and then you inspect df.

Will the x variable have values 1, 2, 3, 4, 5 or 2, 4, 6, 8, 10?


df |>
  mutate(x = x * 2)
# A tibble: 5 × 2
      x y    
  <dbl> <chr>
1     2 a    
2     4 a    
3     6 b    
4     8 c    
5    10 c    
df
# A tibble: 5 × 2
      x y    
  <dbl> <chr>
1     1 a    
2     2 a    
3     3 b    
4     4 c    
5     5 c    

Do something and show me

Suppose you run the following and then you inspect df.

Will the x variable have values 1, 2, 3, 4, 5 or 2, 4, 6, 8, 10?


df <- df |>
  mutate(x = x * 2)
df
# A tibble: 5 × 2
      x y    
  <dbl> <chr>
1     2 a    
2     4 a    
3     6 b    
4     8 c    
5    10 c    

Do something, save result, overwriting original

Do something, save result, overwriting original

df <- tibble(
  x = c(1, 2, 3, 4, 5),
  y = c("a", "a", "b", "c", "c")
)
df <- df |>
  mutate(x = x * 2)
df
# A tibble: 5 × 2
      x y    
  <dbl> <chr>
1     2 a    
2     4 a    
3     6 b    
4     8 c    
5    10 c    

Do something, save result, not overwriting original

df <- tibble(
  x = c(1, 2, 3, 4, 5),
  y = c("a", "a", "b", "c", "c")
)
df_new <- df |>
  mutate(x = x * 2)
df
# A tibble: 5 × 2
      x y    
  <dbl> <chr>
1     1 a    
2     2 a    
3     3 b    
4     4 c    
5     5 c    

Do something, save result, overwriting original when you shouldn’t

df <- tibble(
  x = c(1, 2, 3, 4, 5),
  y = c("a", "a", "b", "c", "c")
)
df <- df |>
  group_by(y) |>
  summarize(mean_x = mean(x))
df
# A tibble: 3 × 2
  y     mean_x
  <chr>  <dbl>
1 a        1.5
2 b        3  
3 c        4.5

Do something, save result, not overwriting original when you shouldn’t

df <- tibble(
  x = c(1, 2, 3, 4, 5),
  y = c("a", "a", "b", "c", "c")
)
df_summary <- df |>
  group_by(y) |>
  summarize(mean_x = mean(x))
df
# A tibble: 5 × 2
      x y    
  <dbl> <chr>
1     1 a    
2     2 a    
3     3 b    
4     4 c    
5     5 c    

Do something, save result, overwriting original
data frame

df <- tibble(
  x = c(1, 2, 3, 4, 5),
  y = c("a", "a", "b", "c", "c")
)
df <- df |>
  mutate(z = x + 2)
df
# A tibble: 5 × 3
      x y         z
  <dbl> <chr> <dbl>
1     1 a         3
2     2 a         4
3     3 b         5
4     4 c         6
5     5 c         7

Do something, save result, overwriting original
column

df <- tibble(
  x = c(1, 2, 3, 4, 5),
  y = c("a", "a", "b", "c", "c")
)
df <- df |>
  mutate(x = x + 2)
df
# A tibble: 5 × 2
      x y    
  <dbl> <chr>
1     3 a    
2     4 a    
3     5 b    
4     6 c    
5     7 c    

Do something and show me

gss_all |>
  select(year, agekdbrn) |>
  filter(year == 2022) |>
  drop_na() |>
  mutate(age_groups = case_when(
    agekdbrn < 18 ~ "<18",
    agekdbrn >= 18 & agekdbrn <= 29 ~ "18–29",
    agekdbrn >= 30 & agekdbrn <= 39 ~ "30–39",
    agekdbrn >= 40 ~ "40+",
    TRUE ~ NA_character_
  )) |>
  group_by(age_groups) |>
  summarise(
    count = n(),
    proportion = round(count / sum(count), 3)
  )

Do something, save result, not overwriting original.

# Do something
gss_all <- gss_all |>
  mutate(age_groups = case_when(
    agekdbrn < 18 ~ "<18",
    agekdbrn >= 18 & agekdbrn <= 29 ~ "18–29",
    agekdbrn >= 30 & agekdbrn <= 39 ~ "30–39",
    agekdbrn >= 40 ~ "40+",
    TRUE ~ NA_character_
  ))

# Now show me
gss_all |>
  select(year, age_groups) |>
  filter(year == 2022) |>
  drop_na() |>
  group_by(age_groups) |>
  summarise(
    count = n(),
    proportion = round(count / sum(count), 3)
  )

Overwriting & raw data

Never overwrite your raw dataset!


Heads Up!

Always save transformed data as a new object, which should remain untouched for reference and reproducibility.

write_rds() & read_rds()

write_rds() saves an R object to a file
read_rds() loads an R object from a .rds file


write_rds(my_data, "my_data_file.rds")

read_rds("my_data_file.rds")
1
Saves your df in current working directory
2
Opens your df if in current working directory

Think Like a Statistician

In 2024, how did the average amount of television watched differ between married people and those who were never married? And how did it compare to people who were widowed, divorced, or separated?


How do we find out?

Use select(), filter(), mutate(), group_by(), & summarise()


Can you reproduce this table?

# A tibble: 3 × 5
  mar_cat       count  mean median    sd
  <chr>         <int> <dbl>  <dbl> <dbl>
1 Married         892  2.8       2  2.44
2 Never married   687  3.5       2  3.78
3 Widow/div/sep   565  3.87      3  3.73

Your Data Take


What’s your conclusion to our research question?