Presenting Data

Agenda

  • Table 01
  • Data Quality Essentials

Learning objectives

By the end of the lecture, you will be able to …

  • Use tbl_summary() to create a descriptive summary table
  • Detect and address missing values and outliers

Code-along 07

Download and open code-along-07.qmd

Packages

Load the standard packages & 1 new package: gtsummary

# Only need to install once per machine. 
# install.packages("gtsummary")

library(here)
library(tidyverse) 
library(haven) # not core tidyverse
library(gssr)
library(gssrdoc)
library(summarytools)
library(gtsummary) # load the new package

Load your data

# Get the data from the 2022 survey and store it as gss22; every later chunk
# builds on this object.
gss22 <- gss_get_yr(2022)

Variables

  • lifenow R’s rating of life overall now from 0-10
  • age Age of respondent
  • educ Respondent’s highest year of school completed
  • hrs1 how many hours did you work last week
  • fefam Better for man to work, woman tend home
  • race What race do you consider yourself
  • sex Respondent’s sex

Variable Management

Make a df with only the (pretty) categorical and continuous variables we’ll analyze.

# Categorical variables: keep id as the join key, convert special missing
# values to plain NA, turn labelled columns into factors, and discard factor
# levels that never occur.
my_cat <- gss22 |>
  select(id, fefam, race, sex) |>
  zap_missing() |>
  as_factor() |>
  droplevels()

# Continuous variables: keep id as the join key and coerce every measure to
# numeric in one step.
my_con <- gss22 |>
  select(id, lifenow, age, educ, hrs1) |>
  mutate(across(c(lifenow, age, educ, hrs1), as.numeric))

# Combine the two data frames: attach the continuous columns to the
# categorical ones, matching rows on id.
my_data <- my_cat |>
  left_join(my_con, by = "id")

Table 01

tbl_summary()

Calculates descriptive statistics for continuous, categorical, and dichotomous variables

# Default table: with no arguments every column of my_data is summarised
# (including id), and missing values are reported on an "Unknown" row.
my_data |>
  tbl_summary()
Characteristic N = 4,149¹
respondent id number 2,075 (1,038, 3,112)
fefam
    strongly agree 171 (6.3%)
    agree 516 (19%)
    disagree 1,182 (43%)
    strongly disagree 866 (32%)
    Unknown 1,414
race
    white 2,651 (65%)
    black 775 (19%)
    other 659 (16%)
    Unknown 64
sex
    male 1,910 (46%)
    female 2,216 (54%)
    Unknown 23
lifenow 8.00 (7.00, 9.00)
    Unknown 2,001
age 47 (33, 63)
    Unknown 256
educ 14.00 (12.00, 16.00)
    Unknown 29
hrs1 40 (36, 45)
    Unknown 1,829
¹ Median (Q1, Q3); n (%)

# Listwise deletion first: drop_na() removes every row that has an NA in any
# column, so the table is built from complete cases only.
my_data |>
  drop_na() |>
  tbl_summary()

# Override the default statistics: mean (SD) for continuous variables and the
# percentage only for categorical variables.
my_data |>
  drop_na() |>
  tbl_summary(
    statistic = list(
      all_continuous()  ~ "{mean} ({sd})",
      all_categorical() ~ "{p}"
    )
  )

# Keep only the analysis variables (this leaves id out of the table). Rows
# with any NA are removed before the columns are selected.
my_data |>
  drop_na() |>
  select(lifenow, race, sex, fefam, age, hrs1, educ) |>
  tbl_summary(
    statistic = list(
      all_continuous()  ~ "{mean} ({sd})",
      all_categorical() ~ "{p}"
    )
  )

# Replace the raw variable names with readable row labels. The labels are
# listed in the same order as the selected columns.
my_data |>
  drop_na() |>
  select(lifenow, race, sex, fefam, age, hrs1, educ) |>
  tbl_summary(
    statistic = list(
      all_continuous()  ~ "{mean} ({sd})",
      all_categorical() ~ "{p}"
    ),
    label = list(
      lifenow ~ "Life satisfaction (0-10)",
      race    ~ "Race",
      sex     ~ "Gender",
      fefam   ~ "Better for man to work, woman tend home",
      age     ~ "Age",
      hrs1    ~ "Number of hours worked last week",
      educ    ~ "Years of education"
    )
  )

# Show sex on a single "Women" row: declare it dichotomous and report only the
# share of respondents whose value is "female".
my_data |>
  drop_na() |>
  select(lifenow, race, sex, fefam, age, hrs1, educ) |>
  tbl_summary(
    label = list(
      race     ~ "Race",
      sex      ~ "Women",
      fefam    ~ "Better for man to work, woman tend home",
      lifenow  ~ "Life satisfaction (0-10)",
      age      ~ "Age",
      hrs1     ~ "Number of hours worked last week",
      educ     ~ "Years of education"),
    type  = list(sex ~ "dichotomous"),
    value = list(sex = "female"),
    # No trailing comma after the last argument: a dangling comma passes an
    # extra empty argument to tbl_summary().
    statistic = list(
      all_continuous() ~ "{mean} ({sd})",
      all_categorical() ~ "{p}")
  )

# Split the table into one column per race category. race is now the grouping
# variable, so it no longer gets a row label.
my_data |>
  drop_na() |>
  select(lifenow, race, sex, fefam, age, hrs1, educ) |>
  tbl_summary(
    by = race,
    type = list(sex ~ "dichotomous"),
    value = list(sex = "female"),
    statistic = list(
      all_continuous()  ~ "{mean} ({sd})",
      all_categorical() ~ "{p}"
    ),
    label = list(
      lifenow ~ "Life satisfaction (0-10)",
      sex     ~ "Women",
      fefam   ~ "Better for man to work, woman tend home",
      age     ~ "Age",
      hrs1    ~ "Number of hours worked last week",
      educ    ~ "Years of education"
    )
  )

# Same by-race table, followed by add_overall(), which adds a column that
# summarises all respondents together.
my_data |>
  drop_na() |>
  select(lifenow, race, sex, fefam, age, hrs1, educ) |>
  tbl_summary(
    by = race,
    type = list(sex ~ "dichotomous"),
    value = list(sex = "female"),
    statistic = list(
      all_continuous()  ~ "{mean} ({sd})",
      all_categorical() ~ "{p}"
    ),
    label = list(
      lifenow ~ "Life satisfaction (0–10)",
      sex     ~ "Women",
      fefam   ~ "Better for man to work, woman tend home",
      age     ~ "Age",
      hrs1    ~ "Number of hours worked last week",
      educ    ~ "Years of education"
    )
  ) |>
  add_overall()

# Same table with the overall column, plus a custom header for the label
# column: it now reads "Variable" in bold.
my_data |>
  drop_na() |>
  select(lifenow, race, sex, fefam, age, hrs1, educ) |>
  tbl_summary(
    by = race,
    type = list(sex ~ "dichotomous"),
    value = list(sex = "female"),
    statistic = list(
      all_continuous()  ~ "{mean} ({sd})",
      all_categorical() ~ "{p}"
    ),
    label = list(
      lifenow ~ "Life satisfaction (0–10)",
      sex     ~ "Women",
      fefam   ~ "Better for man to work, woman tend home",
      age     ~ "Age",
      hrs1    ~ "Number of hours worked last week",
      educ    ~ "Years of education"
    )
  ) |>
  add_overall() |>
  modify_header(label = "**Variable**")

Change the YAML output format to PDF (in a Quarto document: format: pdf)

# Final version for PDF output: the two longest labels contain "\n" line
# breaks, and as_flex_table() converts the finished table to a flextable.
my_data |>
  drop_na() |>
  select(lifenow, race, sex, fefam, age, hrs1, educ) |>
  tbl_summary(
    by = race,
    type = list(sex ~ "dichotomous"),
    value = list(sex = "female"),
    statistic = list(
      all_continuous()  ~ "{mean} ({sd})",
      all_categorical() ~ "{p}"
    ),
    label = list(
      lifenow ~ "Life satisfaction (0–10)",
      sex     ~ "Women",
      fefam   ~ "Better for man to work, \nwoman tend home",
      age     ~ "Age",
      hrs1    ~ "Number of hours \nworked last week",
      educ    ~ "Years of education"
    )
  ) |>
  add_overall() |>
  modify_header(label = "**Variable**") |>
  as_flex_table()

Title & Notes

Add a title and footnotes to your Quarto document


Title:
> Table 01. Unweighted Descriptive Statistics by Race


Footnotes:
> Notes: Unweighted data from the 2022 U.S. General Social Survey

Data Quality Essentials

Missing Data:

Problem

  • Reduces sample size (and thus, statistical power)
  • Potential bias (and thus, non-representativeness)

Missing Data:

Detection

  • descriptive statistics of each variable
  • Ns change across regression models
  • ggplot2 warning: “Removed X rows containing missing values”.

Missing Data:

Solutions

  • drop or change variable(s)
  • treat missing as a category
  • recode values
  • listwise deletion
  • imputation

Missing Data: Solutions

Listwise deletion

# Keep only the rows of df that have no missing value in any column.
df_clean <- drop_na(df)


Imputation

# Mean imputation: wherever var_A is missing, fill in the mean of its observed
# values; every other value is left unchanged.
df <- df |>
  mutate(
    var_A = if_else(
      condition = is.na(var_A),
      true = mean(var_A, na.rm = TRUE),
      false = var_A
    )
  )

Outliers:

Problem

  • conclusions may be inaccurate or unreliable
  • may mask patterns or inflate summary statistics
  • can skew scales in figures

Outliers:

Detection

  • descriptive statistics of each variable
  • scatterplots

Outliers:

Solutions

  • correct or remove errors
  • cap or transform: limit their influence
  • transparent reporting (drop or keep)

Think Like a Statistician

Think Like a Statistician

1. Create Table 01 for your research project


2. Create a scatterplot with your DV & 1 IV


3. Create a bargraph with your DV & 1 IV