# check if 'librarian' is installed and if not, install it
if (! "librarian" %in% rownames(installed.packages()) ){
install.packages("librarian")
}
# load packages if not already loaded
::shelf(
librarian/causalworkshop
tidyverse, broom, rsample, ggdag, causaldata, halfmoon, ggokabeito, malcolmbarrett-causal/propensity, gt, gtExtras)
, magrittr, ggplot2, estimatr, Formula, r
# set the default theme for plotting
theme_set(theme_bw(base_size = 18) + theme(legend.position = "top"))
Lab 7 - Causality: DAGs
SOLUTIONS
Introduction
In today’s lab, you’ll practice working with DAGs and building a causal workflow.
Learning goals
By the end of the lab you will…
- Be able to build DAGs to model causal assumptions and use the causal model to extract implications for answering causal questions.
- Be able to build a causal workflow to answer causal questions.
Packages
Exercise 1: DAGs and open paths
Find the open paths from D (treatment) to Y (outcome) in the four DAGs below.
You can examine the DAGS to identify (this is the recommended first step) and then use the code for the DAGs (in your repo), along with the function dagitty::paths
to confirm
Four example DAGs
# (1)
<-
dag_1 ::dagify(
ggdag~ D
A ~ A
, Y ~ A
, B coords = ggdag::time_ordered_coords(
, list(
"D" # time point 1
c("A","B") # time point 2
, "Y" # time point 3
,
)
),exposure = "D",
outcome = "Y"
)
# (2)
<-
dag_2 ::dagify(
ggdag~ A
D ~ D
, E ~ E
, Y ~ F
, E ~ F
, B ~ A
, B ~ B
, C coords = ggdag::time_ordered_coords(
, list(
c("A","D") # time point 1
c("B","F","E") # time point 2
, c("C","Y") # time point 3
,
)
),exposure = "D",
outcome = "Y"
)
# (3)
<-
dag_3 ::dagify(
ggdag~ D
A ~ D
, Y ~ B
, D ~ B
, A ~ B
, Y coords = ggdag::time_ordered_coords(
, list(
"D" # time point 1
c("B","A") # time point 2
, "Y" # time point 3
,
)
),exposure = "D",
outcome = "Y"
)
# (4)
<-
dag_4 ::dagify(
ggdag~ D
Y ~ C
, Y ~ B
, D ~ A
, D ~ C
, B ~ A
, B coords = ggdag::time_ordered_coords(
, list(
c("A","D") # time point 1
"B" # time point 2
, c("C","Y") # time point 3
,
)
),exposure = "D",
outcome = "Y"
) # %>%
# ggdag::ggdag(text = TRUE) +
# ggdag::theme_dag()
<-
dag_flows ::map(
purrrlist(dag_1 = dag_1, dag_2 = dag_2, dag_3 = dag_3, dag_4 = dag_4)
::tidy_dagitty
, ggdag|>
) ::map("data") |>
purrr::list_rbind(names_to = "dag") |>
purrr::mutate(dag = factor(dag, levels = c("dag_1", "dag_2", "dag_3", "dag_4")))
dplyr
|>
dag_flows ggplot(aes(x = x, y = y, xend = xend, yend = yend)) +
::geom_dag_edges(edge_width = 1) +
ggdag::geom_dag_point() +
ggdag::geom_dag_text() +
ggdagfacet_wrap(~ dag) +
::expand_plot(
ggdagexpand_x = expansion(c(0.2, 0.2)),
expand_y = expansion(c(0.2, 0.2))
+
) ::theme_dag() ggdag
Exercise 2: Building a DAG
You work for a company that sells a commodity to retail customers, and your management is interested in the relationship between your price and the demand for the commodity at your outlets. You have one competitor and your pricing tactic is to set your price at slightly less that your competitor’s. Your company surveys the competitors prices several times per day and once you know the competitor’s price, the pricing team resets your prices according to the pricing tactic. The public is well informed of both prices when they make their choice to buy.
You and your competitor buy from the wholesaler at a price that is set by the global market, and the wholesaler’s price is reset at the beginning of the each day according to the market price at the end of the day before. As the market is traded globally it reflects global demand for the commodity as well as other global and local economic shocks that you customers might be exposed to (interest rates, general business conditions, wages, etc.).
Your company has data on its prices, competitor prices and sales, and has asked you to do an analysis of the pricing tactics to increase demand.
To confirm your understanding of the business, perhaps identify missing data, and to inform your analysis, create a DAG describing the assumed relationships between the driving factors for this problem.
What data might be missing from dataset provided by the company?
Exercise 3: Inverse Probability Weights (IPWs)
In class we used the function propensity::wt_ate
to calculate inverse probability weights, starting from the propensity scores, as in the code below:
<- glm(
propensity_model ~ income + health + temperature,
net data = causalworkshop::net_data,
family = binomial()
)
Repeat the calculation of the IPWs, using the definition of the weight as the inverse probability, and show that your calculated weights are the same as those computed by propensity::wt_ate
Exercise 4: Randomized Controlled Trials
The essence of exchangeability is that the treated and untreated groups are very similar with respect to values of potential confounders. Randomization of treatment makes outcomes independent of treatment, and also makes the treated and untreated groups very similar with respect to values of potential confounders.
Show that this is the case for our mosquito net data by simulating random treatment assignment as follows:
# use this data - mosquito net data plus a row id numer
<- causalworkshop::net_data |>
smpl_dat ::rowid_to_column() tibble
- use
tidysmd::tidy_smd
withsmpl_dat
and group=net to calculate the standardized mean differences (SMDs) for the confounders income, health and temperature. - use
dplyr::slice_sample
to randomly sample fromsmpl_dat
, with proportion 0.5. Give this sample data a name. - mutate the sample to add a column with smpl = 1.
- take the data not in the first sample and form a second sample (start with the original data (smpl_dat) and remove the rows that appear in the sample of step 1. This is why we added a row id. Give this second sample data a name.
- mutate the second sample to add a column with smpl = 0.
- bind the two samples together by rows (e.g.
dplyr::bind_rows
). - use
tidysmd::tidy_smd
with the combined samples from step 6 and group=smpl to calculate the standardized mean differences (SMDs) for the confounders income, health and temperature.
Did randomization make the treatment groups more alike with respect to income, health and temperature?
Exercise 5: Frisch-Waugh-Lovell Theorem
Here we’ll look at credit and default-risk data. First we’ll load the data:
# load data
= readr::read_csv("data/risk_data.csv", show_col_types = FALSE) risk_data
The FWL Theorem states that a multivariate linear regression can be estimated all at once or in three separate steps. For example, you can regress default
on the financial variables credit_limit
, wage
, credit_score1
, and credit_score2
as follows:
# regress default on financial variabbles in the dataset
<- lm(default ~ credit_limit + wage + credit_score1 + credit_score2, data = risk_data)
model
|> broom::tidy(conf.int = TRUE) model
# A tibble: 5 × 7
term estimate std.error statistic p.value conf.low conf.high
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 0.404 0.00860 46.9 0 3.87e-1 4.21e-1
2 credit_limit 0.00000306 0.00000154 1.99 4.69e- 2 4.16e-8 6.08e-6
3 wage -0.0000882 0.00000607 -14.5 8.33e-48 -1.00e-4 -7.63e-5
4 credit_score1 -0.0000417 0.0000183 -2.28 2.28e- 2 -7.77e-5 -5.82e-6
5 credit_score2 -0.000304 0.0000152 -20.1 4.10e-89 -3.34e-4 -2.74e-4
Per FWL you can also break this down into
- a de-biasing step, where you regress the treatment (
credit_limit
) on the financial confounderswage
,credit_score1
, andcredit_score2
, obtaining the residuals - a de-noising step, where you regress the outcome (
default
) on the financial confounders, obtaining the residuals - an outcome model, where you regress the outcome residuals from step 2 on the treatment residuals of step 1.
Due to confounding, the data looks like this, with default percentage trending down by credit limit.
|>
risk_data ::group_by(credit_limit) |>
dplyr# add columns for number of measurements in the group, and the mean of the group
::mutate(size = n(), default = mean(default), ) |>
dplyr# pull ot the distict values
::distinct(default,credit_limit, size) |>
dplyrggplot(aes(x = credit_limit, y = default, size = size)) +
geom_point() +
labs(title = "Default Rate by Credit Limit", x = "credit_limit", y = "default") +
theme_minimal()
Step 1:
- Create the debiasing model, and
- add the residuals to the risk_data, saving the result in
risk_data_deb
in a columncredit_limit_res
. - plot the de-biased data
- regress the outcome (
default
) oncredit_limit_res
The de-biasing step is crucial for estimating the correct causal effect, while the de-noising step is nice to have, since it reduces the variance of the coefficient estimate and narrows the confidence interval.
Step 2:
- create the de-noising model,
- add the residuals to the de-biased data (risk_data_deb), saving the result in
risk_data_denoise
in a columndefault_res
.
Step 3:
- regress the default residuals (
default_res
) on the credit limit residuals (credit_limit_res
)
Exercise 6: Causal Modeling
In this guided exercise, we attempt to answer a causal question: does quitting smoking make you gain weight? Causal modeling has a special place in the history of smoking research: the studies that demonstrated that smoking causes lung cancer were observational. Thanks to other studies, we also know that, if you’re already a smoker, quitting smoking reduces your risk of lung cancer. However, some have observed that former smokers tend to gain weight. Is this the result of quitting smoking, or does something else explain this effect? In the book Causal Inference by Hernán and Robins, the authors analyze this question using several causal inference techniques.
To answer this question, we’ll use causal inference methods to examine the relationship between quitting smoking and gaining weight. First, we’ll draw our assumptions with a causal diagram (a directed acyclic graph, or DAG), which will guide our model. Then, we’ll use a modeling approach called inverse probability weighting–one of many causal modeling techniques–to estimate the causal effect we’re interested in.
We’ll use data from NHEFS to try to estimate the causal effect of quitting smoking on weight game. NHEFS is a longitudinal, observational study that has many of the variables we’ll need. Take a look at causaldata::nhefs_codebook
if you want to know more about the variables in this data set. These data are included in the {causaldata} package. We’ll use the causaldata::nhefs_complete
data set, but we’ll remove people who were lost to follow-up.
NHEFS data
<- causaldata::nhefs_complete |>
nhefs_complete_uc ::filter(censored == 0)
dplyr nhefs_complete_uc
# A tibble: 1,566 × 67
seqn qsmk death yrdth modth dadth sbp dbp sex age race income
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <dbl> <fct> <dbl>
1 233 0 0 NA NA NA 175 96 0 42 1 19
2 235 0 0 NA NA NA 123 80 0 36 0 18
3 244 0 0 NA NA NA 115 75 1 56 1 15
4 245 0 1 85 2 14 148 78 0 68 1 15
5 252 0 0 NA NA NA 118 77 0 40 0 18
6 257 0 0 NA NA NA 141 83 1 43 1 11
7 262 0 0 NA NA NA 132 69 1 56 0 19
8 266 0 0 NA NA NA 100 53 1 29 0 22
9 419 0 1 84 10 13 163 79 0 51 0 18
10 420 0 1 86 10 17 184 106 0 43 0 16
# ℹ 1,556 more rows
# ℹ 55 more variables: marital <dbl>, school <dbl>, education <fct>, ht <dbl>,
# wt71 <dbl>, wt82 <dbl>, wt82_71 <dbl>, birthplace <dbl>,
# smokeintensity <dbl>, smkintensity82_71 <dbl>, smokeyrs <dbl>,
# asthma <dbl>, bronch <dbl>, tb <dbl>, hf <dbl>, hbp <dbl>,
# pepticulcer <dbl>, colitis <dbl>, hepatitis <dbl>, chroniccough <dbl>,
# hayfever <dbl>, diabetes <dbl>, polio <dbl>, tumor <dbl>, …
Let’s look at the distribution of weight gain between the two groups.
weight gain distributions
|>
nhefs_complete_uc ggplot(aes(wt82_71, fill = factor(qsmk))) +
geom_vline(xintercept = 0, color = "grey60", linewidth = 1) +
geom_density(color = "white", alpha = .75, linewidth = .5) +
::scale_color_okabe_ito(order = c(1, 5)) +
ggokabeitotheme_minimal() +
theme(legend.position = "bottom") +
labs(
x = "change in weight (kg)",
fill = "quit smoking (1 = yes)"
)
There’s a difference–former smokers do seemed to have gained a bit more weight–but there’s also a lot of variation. Let’s look at the numeric summaries.
weight gain summaries
# ~4.5 kg gained for quit vs. not quit
|>
nhefs_complete_uc ::group_by(qsmk) |>
dplyr::summarize(
dplyrmean_weight_change = mean(wt82_71),
sd = sd(wt82_71),
.groups = "drop"
)
# A tibble: 2 × 3
qsmk mean_weight_change sd
<dbl> <dbl> <dbl>
1 0 1.98 7.45
2 1 4.53 8.75
Here, it looks like those who quit smoking gained, on average, 4.5 kg. But is there something else that could explain these results? There are many factors associated with both quitting smoking and gaining weight; could one of those factors explain away the results we’re seeing here?
To truly answer this question, we need to specify a causal diagram based on domain knowledge. Sadly, for most circumstances, there is no data-driven approach that consistently identify confounders. Only our causal assumptions can help us identify them. Causal diagrams are a visual expression of those assumptions linked to rigorous mathematics that allow us to understand what we need to account for in our model.
In R, we can visualize and analyze our DAGs with the {ggdag} package. {ggdag} uses {ggplot2} and {ggraph} to visualize diagrams and {dagitty} to analyze them. Let’s set up our assumptions. The dagify()
function takes formulas, much like lm()
and friends, to express assumptions. We have two basic causal structures: the causes of quitting smoking and the causes of gaining weight. Here, we’re assuming that the set of variables here affect both. Additionally, we’re adding qsmk
as a cause of wt82_71
, which is our causal question; we also identify these as our outcome and exposure. Finally, we’ll add some labels so the diagram is easier to understand. The result is a dagitty
object, and we can transform it to a tidy_dagitty
data set with tidy_dagitty()
.
tidy version of the DAG
# set up DAG
<- ggdag::dagify(
smk_wt_dag # specify causes of quitting smoking and weight gain:
~ sex + race + age + education +
qsmk + smokeyrs + exercise + active + wt71,
smokeintensity ~ qsmk + sex + race + age + education +
wt82_71 + smokeyrs + exercise + active + wt71,
smokeintensity # specify causal question:
exposure = "qsmk",
outcome = "wt82_71",
coords = ggdag::time_ordered_coords(),
# set up labels:
# here, I'll use the same variable names as the data set, but I'll label them
# with clearer names
labels = c(
# causal question
"qsmk" = "quit\nsmoking",
"wt82_71" = "change in\nweight",
# demographics
"age" = "age",
"sex" = "sex",
"race" = "race",
"education" = "education",
# health
"wt71" = "baseline\nweight",
"active" = "daily\nactivity\nlevel",
"exercise" = "exercise",
# smoking history
"smokeintensity" = "smoking\nintensity",
"smokeyrs" = "yrs of\nsmoking"
)|>
) ::tidy_dagitty()
ggdag
smk_wt_dag
# A DAG with 11 nodes and 19 edges
#
# Exposure: qsmk
# Outcome: wt82_71
#
# A tibble: 20 × 9
name x y direction to xend yend circular label
<chr> <int> <int> <fct> <chr> <int> <int> <lgl> <chr>
1 active 1 -4 -> qsmk 2 0 FALSE "daily\nac…
2 active 1 -4 -> wt82_71 3 0 FALSE "daily\nac…
3 age 1 -3 -> qsmk 2 0 FALSE "age"
4 age 1 -3 -> wt82_71 3 0 FALSE "age"
5 education 1 -2 -> qsmk 2 0 FALSE "education"
6 education 1 -2 -> wt82_71 3 0 FALSE "education"
7 exercise 1 -1 -> qsmk 2 0 FALSE "exercise"
8 exercise 1 -1 -> wt82_71 3 0 FALSE "exercise"
9 qsmk 2 0 -> wt82_71 3 0 FALSE "quit\nsmo…
10 race 1 0 -> qsmk 2 0 FALSE "race"
11 race 1 0 -> wt82_71 3 0 FALSE "race"
12 sex 1 1 -> qsmk 2 0 FALSE "sex"
13 sex 1 1 -> wt82_71 3 0 FALSE "sex"
14 smokeintensity 1 2 -> qsmk 2 0 FALSE "smoking\n…
15 smokeintensity 1 2 -> wt82_71 3 0 FALSE "smoking\n…
16 smokeyrs 1 3 -> qsmk 2 0 FALSE "yrs of\ns…
17 smokeyrs 1 3 -> wt82_71 3 0 FALSE "yrs of\ns…
18 wt71 1 4 -> qsmk 2 0 FALSE "baseline\…
19 wt71 1 4 -> wt82_71 3 0 FALSE "baseline\…
20 wt82_71 3 0 <NA> <NA> NA NA FALSE "change in…
Let’s visualize our assumptions with ggdag()
.
graphic version of the DAG
|>
smk_wt_dag ::ggdag(text = FALSE, use_labels = "label") + ggdag::theme_dag() ggdag
What do we need to control for to estimate an unbiased effect of quitting smoking on weight gain? In many DAGs, there will be many sets of variables–called adjustment sets–that will give us the right effect (assuming our DAG is correct–a big, unverifiable assumption!). ggdag_adjustment_set()
can help you visualize them. Here, there’s only one adjustment set: we need to control for everything! While we’re add it, since a {ggdag} plot is just a {ggplot2} plot, let’s clean it up a bit, too.
DAG adjustment sets
|>
smk_wt_dag ::ggdag_adjustment_set(text = FALSE, use_labels = "label") +
ggdag::theme_dag() +
ggdag::scale_color_okabe_ito(order = c(1, 5)) +
ggokabeito::scale_fill_okabe_ito(order = c(1, 5)) ggokabeito
Let’s fit a model with these variables. Note that we’ll fit all continuous variables with squared terms, as well, to allow them a bit of flexibility.
fit of data using all variables
lm(
~ qsmk + sex +
wt82_71+ age + I(age^2) + education +
race + I(smokeintensity^2) +
smokeintensity + I(smokeyrs^2) + exercise + active +
smokeyrs + I(wt71^2),
wt71 data = nhefs_complete_uc
|>
) ::tidy(conf.int = TRUE) |>
broom::filter(term == "qsmk") dplyr
# A tibble: 1 × 7
term estimate std.error statistic p.value conf.low conf.high
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 qsmk 3.46 0.438 7.90 5.36e-15 2.60 4.32
When we adjust for the variables in our DAG, we get an estimate of about 3.5 kg–people who quit smoking gained about this amount of weight. However, we are trying to answer a specific causal question: how much weight would a person gain if the quit smoking vs. if the same person did not quit smoking? Let’s use an inverse probability weighting model to try to estimate that effect at the population level (what if everyone quit smoking vs what if no one quit smoking).
For a simple IPW model, we have two modeling steps. First, we fit a propensity score model, which predicts the probability that you received a treatment or exposure (here, that a participant quit smoking). We use this model to calculate inverse probability weights–1 / your probability of treatment. Then, in the second step, we use this weights in the outcome model, which estimates the effect of exposure on the outcome (here, the effect of quitting smoking on gaining weight).
For the propensity score model, we’ll use logistic regression (since quitting smoking is a binary variable). The outcome is quitting smoking, and the variables in the model are all those included in our adjustment set. Then, we’ll use augment()
from {broom} (which calls predict()
on the inside) to calculate our weights using propensity::wt_ate()
and save it back into our data set.
propensity score calculations
<- glm(
propensity_model ~ sex +
qsmk + age + I(age^2) + education +
race + I(smokeintensity^2) +
smokeintensity + I(smokeyrs^2) + exercise + active +
smokeyrs + I(wt71^2),
wt71 family = binomial(),
data = nhefs_complete_uc
)
<- propensity_model |>
nhefs_complete_uc # predict whether quit smoking
::augment(type.predict = "response", data = nhefs_complete_uc) |>
broom# calculate inverse probability
::mutate(wts = propensity::wt_ate(.fitted, qsmk))
dplyr
|>
nhefs_complete_uc ::select(qsmk, .fitted, wts) dplyr
# A tibble: 1,566 × 3
qsmk .fitted wts
<dbl> <dbl> <dbl>
1 0 0.0987 1.11
2 0 0.140 1.16
3 0 0.126 1.14
4 0 0.400 1.67
5 0 0.294 1.42
6 0 0.170 1.20
7 0 0.220 1.28
8 0 0.345 1.53
9 0 0.283 1.40
10 0 0.265 1.36
# ℹ 1,556 more rows
Let’s look at the distribution of the weights.
propensity weight distribution
ggplot(nhefs_complete_uc, aes(wts)) +
geom_histogram(color = "white", fill = "#E69F00", bins = 50) +
# use a log scale for the x axis
scale_x_log10() +
theme_minimal(base_size = 20) +
xlab("Weights")
It looks a little skewed, particularly that there are some participants with much higher weights. There are a few techniques for dealing with this – trimming weights and stabilizing weights – but we’ll keep it simple for now and just use them as is.
The main goal here is to break the non-causal associations between quitting smoking and gaining weight–the other paths that might distort our results. In other words, if we succeed, there should be no differences in these variables between our two groups, those who quit smoking and those who didn’t. This is where randomized trials shine; you can often assume that there is no baseline differences among potential confounders between your treatment groups (of course, no study is perfect, and there’s a whole set of literature on dealing with this problem in randomized trials).
Standardized mean differences (SMD) are a simple measurement of differences that work across variable types. In general, the closer to 0 we are, the better job we have done eliminating the non-causal relationships we drew in our DAG. Note that low SMDs for everything we adjust for does not mean that there is not something else that might confound our study. Unmeasured confounders or misspecified DAGs can still distort our effects, even if our SMDs look great!
We’ll use the {halfmoon} package to calculate the SMDs, then visualize them.
Standardized mean differences plot
<- c(
vars "sex", "race", "age", "education",
"smokeintensity", "smokeyrs",
"exercise", "active", "wt71"
)
<- halfmoon::tidy_smd(
plot_df
nhefs_complete_uc,all_of(vars),
qsmk,
wts
)
ggplot(
data = plot_df,
mapping = aes(x = abs(smd), y = variable, group = method, color = method)
+
) ::geom_love() halfmoon
These look pretty good! Some variables are better than others, but weighting appears to have done a much better job eliminating these differences than an unadjusted analysis.
We can also use halfmoon’s geom_mirror_histogram()
to visualize the impact that the weights are having on our population.
Inverse Probability Weight distributions
|>
nhefs_complete_uc ::mutate(qsmk = factor(qsmk)) |>
dplyrggplot(aes(.fitted)) +
::geom_mirror_histogram(
halfmoonaes(group = qsmk),
bins = 50
+
) ::geom_mirror_histogram(
halfmoonaes(fill = qsmk, weight = wts),
bins = 50,
alpha = .5
+
) scale_y_continuous(labels = abs) +
labs(x = "propensity score") +
theme_minimal(base_size = 20)
Both groups are being upweighted so that their distributions of propensity scores are much more similar.
We could do more here to analyze our assumptions, but let’s move on to our second step: fitting the outcome model weighted by our inverse probabilities. Some researchers call these Marginal Structural Models, in part because the model is marginal; we only need to include our outcome (wt82_71
) and exposure (qsmk
). The other variables aren’t in the model; they are accounted for with the IPWs!
Fit using inverse probability weights
<- lm(
ipw_model ~ qsmk,
wt82_71 data = nhefs_complete_uc,
weights = wts # inverse probability weights
)
<- ipw_model |>
ipw_estimate ::tidy(conf.int = TRUE) |>
broom::filter(term == "qsmk")
dplyr
ipw_estimate
# A tibble: 1 × 7
term estimate std.error statistic p.value conf.low conf.high
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 qsmk 3.44 0.408 8.43 7.47e-17 2.64 4.24
This estimate is pretty similar to what we saw before, if a little smaller. In fact, for simple causal questions, this is often the case: adjusting for confounders directly in your regression model sometimes estimates the same effect as IPWs and other causal techniques. Causal techniques are special, though, in that they use counterfactual modeling, which allows you to deal with many circumstances, such as when you have selection bias or time-dependendent confounding. They also often have variance properties.
But we have other problem that we need to address. While we’re just using lm()
to estimate our IPW model, it doesn’t properly account for the weights. That means our standard error is too small, which will artificially narrow confidence intervals and artificially shrink p-values. There are many ways to address this, including robust estimators. We’ll focus on using the bootstrap via the {rsamples} package in this lab, but here’s one way to do it with robust standard errors:
Robust linear fit without bootstrap
# also see robustbase, survey, gee, and others
library(estimatr)
<- estimatr::lm_robust(
ipw_model_robust ~ qsmk,
wt82_71 data = nhefs_complete_uc,
weights = wts
)
<- ipw_model_robust |>
ipw_estimate_robust ::tidy(conf.int = TRUE) |>
broom::filter(term == "qsmk")
dplyr
ipw_estimate_robust
term estimate std.error statistic p.value conf.low conf.high df
1 qsmk 3.440535 0.5264638 6.535179 8.573524e-11 2.407886 4.473185 1564
outcome
1 wt82_71
Now let’s try the bootstrap. First, we need to wrap our model in a function so we can call it many times on our bootstrapped data. A function like this might be your instinct; however, it’s not quite right.
not quite right function for bootstrapped estimates
# fit ipw model for a single bootstrap sample
<- function(split, ...) {
fit_ipw_not_quite_rightly # get bootstrapped data sample with `rsample::analysis()`
<- rsample::analysis(split)
.df
# fit ipw model
lm(wt82_71 ~ qsmk, data = .df, weights = wts) |>
tidy()
}
The problem is that we need to account for the entire modeling process, so we need to include the first step of our analysis – fitting the inverse probability weights.
right function for bootstrapped estimates
<- function(split, ...) {
fit_ipw <- rsample::analysis(split)
.df
# fit propensity score model
<- glm(
propensity_model ~ sex +
qsmk + age + I(age^2) + education +
race + I(smokeintensity^2) +
smokeintensity + I(smokeyrs^2) + exercise + active +
smokeyrs + I(wt71^2),
wt71 family = binomial(),
data = .df
)
# calculate inverse probability weights
<- propensity_model |>
.df ::augment(type.predict = "response", data = .df) |>
broom::mutate(wts = propensity::wt_ate(
dplyr
.fitted,
qsmk, exposure_type = "binary"
))
# fit correctly bootstrapped ipw model
lm(wt82_71 ~ qsmk, data = .df, weights = wts) |>
tidy()
}
{rsample} makes the rest easy for us: bootstraps()
resamples our data 1000 times, then we can use purrr::map()
to apply our function to each resampled set (splits
). {rsample}’s int_*()
functions help us get confidence intervals for our estimate.
fit ipw model to bootstrapped estimates
# fit ipw model to bootstrapped samples
<- rsample::bootstraps(causaldata::nhefs_complete, 1000, apparent = TRUE) |>
ipw_results ::mutate(results = purrr::map(splits, fit_ipw))
dplyr
# get t-statistic-based CIs
<- rsample::int_t(ipw_results, results) |>
boot_estimate ::filter(term == "qsmk")
dplyr
boot_estimate
# A tibble: 1 × 6
term .lower .estimate .upper .alpha .method
<chr> <dbl> <dbl> <dbl> <dbl> <chr>
1 qsmk 2.47 3.46 4.45 0.05 student-t
Let’s compare to our naive weighted model that just used a single estimate from lm()
comparison of robust and naive weighted models
::bind_rows(
dplyr|>
ipw_estimate ::select(estimate, conf.low, conf.high) |>
dplyr::mutate(type = "ols"),
dplyr|>
ipw_estimate_robust ::select(estimate, conf.low, conf.high) |>
dplyr::mutate(type = "robust"),
dplyr|>
boot_estimate ::select(estimate = .estimate, conf.low = .lower, conf.high = .upper) |>
dplyr::mutate(type = "bootstrap")
dplyr|>
) # calculate CI width to sort by it
::mutate(width = conf.high - conf.low) |>
dplyr::arrange(width) |>
dplyr# fix the order of the model types for the plot
::mutate(type = forcats::fct_inorder(type)) |>
dplyrggplot(aes(x = type, y = estimate, ymin = conf.low, ymax = conf.high)) +
geom_pointrange(color = "#0172B1", size = 1, fatten = 3) +
coord_flip() +
theme_minimal(base_size = 20) +
theme(axis.title.y = element_blank())
Our bootstrapped confidence intervals are wider, which is expected; remember that they were artificially narrow in the naive OLS model!
So, we have a final estimate for our causal effect: on average, a person who quits smoking will gain 3.5 kg (95% CI 2.4 kg, 4.4 kg) versus if they had not quit smoking.
Questions:
- Please enumerate the steps in the causal analysis workflow
- What do you think? Is this estimate reliable? Did we do a good job addressing the assumptions we need to make for a causal effect, particularly that there is no confounding? How might you criticize this model, and what would you do differently?
Grading
Total points available: 30 points.
Component | Points |
---|---|
Ex 1 - 6 | 30 |