Presenting Data

Agenda

Table 01
Data Quality Essentials

Learning objectives

By the end of the lecture, you will be able to …

Use tbl_summary() to create a descriptive summary table
Detect and address missing values and outliers

Code-along 07

Download and open code-along-07.qmd

Packages

Load the standard packages & 1 new package: gtsummary

# Only need to install once per machine. 
# install.packages("gtsummary")

library(here)
library(tidyverse) 
library(haven) # not core tidyverse
library(gssr)
library(gssrdoc)
library(summarytools)
library(gtsummary) # load the new package

Load your data

# Fetch the 2022 wave of the General Social Survey via the gssr package.
gss22 <- gssr::gss_get_yr(year = 2022)

Variables

lifenow R’s rating of life overall now from 0-10
age Age of respondent
educ Respondent’s highest year of school completed
hrs1 how many hours did you work last week
fefam Better for man to work, woman tend home
race What race do you consider yourself
sex Respondent’s sex

Variable Management

Make a df with only the (pretty) categorical and continuous variables we’ll analyze.

# Categorical variables ----
# Keep the id plus the categorical items, turn the survey's special missing
# codes into plain NA, convert labelled columns to factors, and discard
# factor levels that are never used.
my_cat <- gss22 |>
  select(all_of(c("id", "fefam", "race", "sex"))) |>
  zap_missing() |>
  as_factor() |>
  droplevels()

# Continuous variables ----
# Keep the id plus the continuous items and coerce each item to a plain
# numeric vector.
my_con <- gss22 |>
  select(id, lifenow, age, educ, hrs1) |>
  mutate(across(c(lifenow, age, educ, hrs1), as.numeric))

# Analysis data ----
# Attach the continuous columns to the categorical ones, matching on id.
my_data <- my_cat |>
  left_join(my_con, by = "id")

Table 01

`tbl_summary()`

Calculates descriptive statistics for continuous, categorical, and dichotomous variables

# Default Table 01: summarise every column of my_data with the package
# defaults.
tbl_summary(my_data)

Characteristic	N = 4,149¹
respondent id number	2,075 (1,038, 3,112)
fefam
strongly agree	171 (6.3%)
agree	516 (19%)
disagree	1,182 (43%)
strongly disagree	866 (32%)
Unknown	1,414
race
white	2,651 (65%)
black	775 (19%)
other	659 (16%)
Unknown	64
sex
male	1,910 (46%)
female	2,216 (54%)
Unknown	23
lifenow	8.00 (7.00, 9.00)
Unknown	2,001
age	47 (33, 63)
Unknown	256
educ	14.00 (12.00, 16.00)
Unknown	29
hrs1	40 (36, 45)
Unknown	1,829
¹ Median (Q1, Q3); n (%)

# Same default table, but restricted to rows with no missing values
# (listwise deletion).
tbl_summary(drop_na(my_data))

# Complete cases only; report mean (SD) for continuous variables and the
# percentage alone for categorical variables.
my_data |>
  drop_na() |>
  tbl_summary(
    statistic = list(
      all_continuous() ~ "{mean} ({sd})",
      all_categorical() ~ "{p}"
    )
  )

# Drop incomplete rows, then keep only the analysis variables (this removes
# the id column and sets the row order of the table).
my_data |>
  drop_na() |>
  select(all_of(c("lifenow", "race", "sex", "fefam", "age", "hrs1", "educ"))) |>
  tbl_summary(
    statistic = list(
      all_continuous() ~ "{mean} ({sd})",
      all_categorical() ~ "{p}"
    )
  )

# Replace the raw variable names with reader-friendly row labels.
my_data |>
  drop_na() |>
  select(lifenow, race, sex, fefam, age, hrs1, educ) |>
  tbl_summary(
    statistic = list(
      all_continuous() ~ "{mean} ({sd})",
      all_categorical() ~ "{p}"
    ),
    label = list(
      lifenow = "Life satisfaction (0-10)",
      race    = "Race",
      sex     = "Gender",
      fefam   = "Better for man to work, woman tend home",
      age     = "Age",
      hrs1    = "Number of hours worked last week",
      educ    = "Years of education"
    )
  )

# Show sex as a single "Women" row instead of one row per category:
# declare it dichotomous and report the "female" level only.
my_data |>
  drop_na() |>
  select(lifenow, race, sex, fefam, age, hrs1, educ) |>
  tbl_summary(
    label = list(
      race     ~ "Race",
      sex      ~ "Women",
      fefam    ~ "Better for man to work, woman tend home",
      lifenow  ~ "Life satisfaction (0-10)",
      age      ~ "Age",
      hrs1     ~ "Number of hours worked last week",
      educ     ~ "Years of education"),
    type = list(sex ~ "dichotomous"),
    value = list(sex = "female"),
    # No trailing comma after the last argument: a dangling comma passes an
    # empty argument to tbl_summary().
    statistic = list(
      all_continuous() ~ "{mean} ({sd})",
      all_categorical() ~ "{p}")
  )

# Split the table into one column per race category. race is now the
# grouping variable, so it no longer gets its own row label.
my_data |>
  drop_na() |>
  select(lifenow, race, sex, fefam, age, hrs1, educ) |>
  tbl_summary(
    by = race,
    type = list(sex = "dichotomous"),
    value = list(sex = "female"),
    statistic = list(
      all_continuous() ~ "{mean} ({sd})",
      all_categorical() ~ "{p}"
    ),
    label = list(
      sex     = "Women",
      fefam   = "Better for man to work, woman tend home",
      lifenow = "Life satisfaction (0-10)",
      age     = "Age",
      hrs1    = "Number of hours worked last week",
      educ    = "Years of education"
    )
  )

# Table by race, plus a column summarising the whole sample.
my_data |>
  drop_na() |>
  select(lifenow, race, sex, fefam, age, hrs1, educ) |>
  tbl_summary(
    by = race,
    type = list(sex = "dichotomous"),
    value = list(sex = "female"),
    statistic = list(
      all_continuous() ~ "{mean} ({sd})",
      all_categorical() ~ "{p}"
    ),
    label = list(
      sex     = "Women",
      fefam   = "Better for man to work, woman tend home",
      lifenow = "Life satisfaction (0–10)",
      age     = "Age",
      hrs1    = "Number of hours worked last week",
      educ    = "Years of education"
    )
  ) |>
  # add_overall() contributes the Total column.
  add_overall()

# Table by race with a Total column; rename the first column header to a
# bold "Variable".
my_data |>
  drop_na() |>
  select(lifenow, race, sex, fefam, age, hrs1, educ) |>
  tbl_summary(
    by = race,
    type = list(sex = "dichotomous"),
    value = list(sex = "female"),
    statistic = list(
      all_continuous() ~ "{mean} ({sd})",
      all_categorical() ~ "{p}"
    ),
    label = list(
      sex     = "Women",
      fefam   = "Better for man to work, woman tend home",
      lifenow = "Life satisfaction (0–10)",
      age     = "Age",
      hrs1    = "Number of hours worked last week",
      educ    = "Years of education"
    )
  ) |>
  add_overall() |>
  modify_header(label = "**Variable**")

Change the YAML output format to PDF (Quarto: format: pdf; R Markdown: output: pdf_document)

# Final Table 01 for PDF output: the long labels carry explicit line breaks
# ("\n") and the result is converted to a flextable object.
my_data |>
  drop_na() |>
  select(lifenow, race, sex, fefam, age, hrs1, educ) |>
  tbl_summary(
    by = race,
    type = list(sex = "dichotomous"),
    value = list(sex = "female"),
    statistic = list(
      all_continuous() ~ "{mean} ({sd})",
      all_categorical() ~ "{p}"
    ),
    label = list(
      sex     = "Women",
      fefam   = "Better for man to work, \nwoman tend home",
      lifenow = "Life satisfaction (0–10)",
      age     = "Age",
      hrs1    = "Number of hours \nworked last week",
      educ    = "Years of education"
    )
  ) |>
  add_overall() |>
  modify_header(label = "**Variable**") |>
  as_flex_table()

Title & Notes

Add a title and footnotes to your Quarto document

Title:
> Table 01. Unweighted Descriptive Statistics by Race

Footnotes:
> Notes: Unweighted data from the 2022 U.S. General Social Survey

Data Quality Essentials

Missing Data:

Problem

Reduces sample size (and thus, statistical power)
Potential bias (and thus, non-representativeness)

Missing Data:

Detection

descriptive statistics of each variable
Ns change across regression models
ggplot2 warning: “Removed X rows containing missing values”.

Missing Data:

Solutions

drop or change variable(s)
treat missing as a category
recode values
listwise deletion
imputation

Missing Data: Solutions

Listwise deletion

# Listwise deletion: keep only the rows with no missing value in any column.
df_clean <- drop_na(df)

Imputation

# Mean imputation: keep observed values of var_A and fill each missing one
# with the mean of the observed values.
df <- mutate(
  df,
  var_A = if_else(!is.na(var_A), var_A, mean(var_A, na.rm = TRUE))
)

Outliers:

Problem

conclusions may be inaccurate or unreliable
may mask patterns or inflate summary statistics
can skew scales in figures

Outliers:

Detection

descriptive statistics of each variable
scatterplots

Outliers:

Solutions

correct or remove errors
cap or transform: limit their influence
transparent reporting (drop or keep)

Think Like a Statistician

1. Create Table 01 for your research project

2. Create a scatterplot with your DV & 1 IV

3. Create a bargraph with your DV & 1 IV