\listfiles
% \documentclass[a4paper]{article}
% \documentclass[article,nojss]{jss} % alternative:
\documentclass[nojss,article,shortnames,table]{jss}
% \usepackage[margin=2.5cm]{geometry}
\usepackage{float}
\usepackage{amssymb,amsmath,amsfonts}
% \usepackage{authblk}
\usepackage[multiple]{footmisc}
\usepackage[utf8]{inputenc}
% \usepackage[colorlinks=true,urlcolor=blue]{hyperref}
\usepackage{graphicx}
% \usepackage[font={scriptsize, it}]{caption}
\usepackage[english]{babel}
\usepackage{pdflscape}
\usepackage{booktabs}
\usepackage{ctable}
% \usepackage{color} % causes conflict with jss style:
\usepackage[]{xcolor}
% \usepackage[]{colortbl}
% \usepackage{listings}
% \lstset{breaklines=true}
% \lstset{
% language=R, % choose the language of the code
% basicstyle=\color{red},
% keywordstyle=\color{red}, % style for keywords
% numbers=none, % where to put the line-numbers
% numberstyle=\tiny, % the size of the fonts that are used for the line-numbers
% backgroundcolor=\color{gray90},
% showspaces=false, % show spaces adding particular underscores
% showstringspaces=false, % underline spaces within strings
% showtabs=false, % show tabs within strings adding particular underscores
% frame=single, % adds a frame around the code
% tabsize=2, % sets default tabsize to 2 spaces
% rulesepcolor=\color{gray90},
% rulecolor=\color{black},
% captionpos=b, % sets the caption-position to bottom
% breaklines=true, % sets automatic line breaking
% breakatwhitespace=false,
% }
\definecolor{blue}{rgb}{.2,.2,.7}
\definecolor{red}{rgb}{.7,.2,.2}
\definecolor{green}{rgb}{0,.6,.3}
\definecolor{gray}{rgb}{0.45,0.45,0.45}
\newcommand{\btext}[1]{\textcolor{blue}{#1}}
\newcommand{\rtext}[1]{\textcolor{red}{#1}}
\newcommand{\gtext}[1]{\textcolor{green}{#1}}
\newcommand{\wtext}[1]{\textcolor{white}{#1}}
\newcommand{\old}[1]{\textcolor{gray}{#1}}
\definecolor{gray90}{RGB}{229,229,229}
\definecolor{gray77}{RGB}{196,196,196}
\definecolor{gray60}{RGB}{153,153,153}
\renewcommand{\thefootnote}{\alph{footnote}}
%%\newcommand{\acronym}[1]{\textsc{#1}}
%%\newcommand{\class}[1]{\mbox{\textsf{#1}}}
% \newcommand{\code}[1]{\mbox{\texttt{#1}}}
% \newcommand{\pkg}[1]{{\normalfont\fontseries{b}\selectfont #1}}
% \newcommand{\proglang}[1]{\textsf{#1}}
\newcommand\XOR{\mathbin{\char`\^}}
\newcommand\independent{\protect\mathpalette{\protect\independenT}{\perp}}
\def\independenT#1#2{\mathrel{\rlap{$#1#2$}\mkern2mu{#1#2}}}
\newcommand{\specialcell}[2][c]{%
\begin{tabular}[#1]{@{}c@{}}#2\end{tabular}}
\author{Oleg Sofrygin\\Division of Research,\\Kaiser Permanente Northern California\\University of California, Berkeley\And
Mark J. van der Laan\\University of California, Berkeley\AND
Romain Neugebauer\\Division of Research,\\Kaiser Permanente Northern California}
\Plainauthor{Oleg Sofrygin, Mark J. van der Laan, Romain Neugebauer}
\title{\pkg{simcausal} Package: Technical Details and Extended Examples of Simulations with Complex Longitudinal Data}
\Plaintitle{simcausal Package: Technical Details and Extended Examples of Simulations with Complex Longitudinal Data}
\Shorttitle{\pkg{simcausal}: Causal Simulation Package} %% a short title (if necessary)
\Abstract{
The \pkg{simcausal} \proglang{R} package is a tool for specification and simulation of complex longitudinal data structures that are based on structural equation models.
The package aims to provide a flexible tool for simplifying the conduct of transparent and reproducible simulation studies, with a particular emphasis on the types of data and interventions frequently encountered in real-world causal inference problems, such as observational data with time-dependent confounding, selection bias, and random monitoring processes.
The package interface allows for concise expression of complex functional dependencies between a large number of nodes, where each node may represent a measurement at a specific time point.
The package allows for specification and simulation of counterfactual data under various user-specified interventions (e.g., static, dynamic, deterministic, or stochastic).
In particular, the interventions may represent exposures to treatment regimens and the occurrence or non-occurrence of right-censoring events or clinical monitoring events.
Finally, the package enables the computation of a selected set of user-specified features of the distribution of the counterfactual data that represent common causal quantities of interest, such as treatment-specific means, average treatment effects, and coefficients from working marginal structural models.
The applicability of \pkg{simcausal} is demonstrated by replicating the results of two published simulation studies.
}
\Keywords{causal inference, simulation, marginal structural model, structural equation model, directed acyclic graph, causal model, \proglang{R}}
\Plainkeywords{causal inference, simulation, marginal structural model, structural equation model, directed acyclic graph, causal model, R} %% without formatting
% \Address{
% Oleg Sofrygin\\
% Division of Research\\
% Kaiser Permanente Northern California\\
% Oakland, CA 94612 \\
% \emph{and}\\
% Division of Biostatistics, School of Public Health\\
% University of California, Berkeley\\
% Berkeley, CA 94720\\
% E-mail: \email{oleg.sofrygin@gmail.com}\\
% \\
% Mark J. van der Laan\\
% Division of Biostatistics, School of Public Health\\
% University of California, Berkeley\\
% Berkeley, CA 94720\\
% E-mail: \email{laan@berkeley.edu}\\
% \\
% Romain Neugebauer\\
% Division of Research\\
% Kaiser Permanente Northern California\\
% 2000 Broadway\\
% Oakland, CA 94612 \\
% E-mail: \email{Romain.S.Neugebauer@kp.org}\\
% }
%% need no \usepackage{Sweave.sty}
%\VignetteEngine{knitr::knitr}
%\VignetteIndexEntry{simcausal Package: Simulations with Complex Longitudinal Data (Technical Details and Extended Examples)}
\begin{document}
<<setup, include = FALSE, cache = FALSE>>=
exportTexOnly=FALSE # do not include any chunks in the final result (R code is still evaluated)
require(knitr)
require(simcausal)
cache_opt <- TRUE
opts_chunk$set(fig.path='figure/beamer-',fig.align='center',fig.show='hold',size='footnotesize',escape=TRUE)
# to crop white space in output figures:
knit_hooks$set(pdfcrop = hook_pdfcrop)
# To disable syntax color highlighting of R code in the entire document
# opts_chunk$set(highlight=FALSE)
# To change the background color on knitR output from default gray to white:
# opts_chunk$set(background='white')
opts_chunk$set(include=!exportTexOnly)
# options(width=80) # make the printing fit on the page
options(width=90) # make the printing fit on the page
set.seed(1121) # make the results repeatable
@
<<include = FALSE>>=
# Need to first load the packages that need to be cited:
# require(simcausal)
# require(knitr)
# require(data.table)
# require(ggplot2)
# require(igraph)
# require(lavaan); require(lavaan.survey); require(sem); require(semPLS);
# # require(OpenMx)
# require(gems); require(aftgee); require(survsim)
# #
# # Select packages to cite:
# citPkgs <- names(sessionInfo()$otherPkgs)
# citPkgs <- c(citPkgs, "base")
# # Write the bibtex file:
# write_bib(citPkgs, file = "R-Pckgs.bib")
@
<<include = FALSE>>=
# get MSM survival predictions from the full data.table in long format (melted),
# by time (t_vec) and by the MSM term (MSMtermName)
# predictions from the estimated msm model (based on observational data) can be
# obtained by passing the estimated msm model, est.msm
# given a vector of t (t_vec), results of MSM target.eval and an MSM term,
# get the survival table by action
survbyMSMterm <- function(MSMres, t_vec, MSMtermName, use_actions=NULL, est.msm=NULL) {
  library("data.table")
  # look up MSMtermName in MSMterm map: if it exists, use the new name,
  # if it doesn't exist, use MSMtermName
  if (!is.null(MSMres$S.msm.map)) {
    mapS_exprs <- as.character(MSMres$S.msm.map[,"S_exprs_vec"])
    XMSMterms <- as.character(MSMres$S.msm.map[,"XMSMterms"])
    map_idx <- which(mapS_exprs%in%MSMtermName)
    XMSMtermName <- XMSMterms[map_idx]
    if (!is.null(XMSMtermName)&&length(XMSMtermName)>0) {
      MSMtermName <- XMSMtermName
    }
  }
  print("MSMtermName used"); print(MSMtermName)
  t_dt <- data.table(t=as.integer(t_vec)); setkey(t_dt, t)
  get_predict <- function(actname) {
    # setkey(MSMres$df_long[[actname]], t)
    setkeyv(MSMres$df_long[[actname]], c("t", MSMtermName))
    # MSMterm_vals <- as.numeric(MSMres$df_long[[actname]][t_dt, mult="first"][[MSMtermName]])
    # print("MSMterm_vals"); print(MSMterm_vals)
    MSMterm_vals <- as.numeric(MSMres$df_long[[actname]][t_dt, mult="last"][[MSMtermName]])
    # print("MSMterm_vals last"); print(MSMterm_vals)
    newdata=data.frame(t=t_vec, MSMterm_vals=MSMterm_vals)
    colnames(newdata) <- c("t", MSMtermName)
    # print("newdata"); print(newdata)
    if (!is.null(est.msm)) {
      pred <- predict(est.msm, newdata=newdata, type="response")
    } else {
      pred <- predict(MSMres$m, newdata=newdata, type="response")
    }
    return(data.frame(t=t_vec, pred=pred))
  }
  action_names <-
names(MSMres$df_long)
  if (!is.null(use_actions)) {
    action_names <- action_names[action_names%in%use_actions]
  }
  surv <- lapply(action_names, function(actname) {
    res <- get_predict(actname)
    if (MSMres$hazard) {
      res$surv <- cumprod(1-res$pred)
    } else {
      res$surv <- 1-res$pred
    }
    res$pred <- NULL
    res$action <- actname
    res
  })
  # name the results by the actions actually used (naming by names(MSMres$df_long)
  # can mismatch when use_actions selects a subset of the actions):
  names(surv) <- action_names
  surv_melt <- do.call('rbind', surv)
  surv_melt$action <- factor(surv_melt$action, levels=unique(surv_melt$action), ordered=TRUE)
  surv_melt
}
plotsurvbyMSMterm <- function(surv_melt_dat) {
  library("ggplot2")
  f_ggplot_surv_wS <- ggplot(data = surv_melt_dat, aes(x = t, y = surv)) +
    geom_line(aes(group = action, color = action), size = .4, linetype = "dashed") +
    theme_bw()
  f_ggplot_surv_wS # return the plot object so the caller can print it
}
plotsurvbyMSMterm_facet <- function(surv_melt_dat1, surv_melt_dat2, msm_names = NULL) {
  library("ggplot2")
  if (is.null(msm_names)) {
    msm_names <- c("MSM1", "MSM2")
  }
  surv_melt_dat1$MSM <- msm_names[1]
  surv_melt_dat2$MSM <- msm_names[2]
  surv_melt_dat <- rbind(surv_melt_dat1, surv_melt_dat2)
  f_ggplot_surv_wS <- ggplot(data = surv_melt_dat, aes(x = t, y = surv)) +
    geom_line(aes(group = action, color = action), size = .4, linetype = "dashed") +
    theme_bw() +
    facet_wrap( ~ MSM)
  f_ggplot_surv_wS # return the plot object
}
@
<<include = FALSE>>=
# @param DAG Object specifying the directed acyclic graph for the observed data,
# must have a well-defined MSM target parameter (\code{set.targetMSM()})
# @param obs_df Simulated observational data
# @param Aname Generic names of the treatment nodes (can be time-varying)
# @param Cname Generic names of the censoring nodes (can be time-varying)
# @param Lnames Generic names of the time-varying covariates (can be time-varying)
# @param Ytvec Vector of time points for Y nodes
# @param ACLtvec Vector of time points for the A, C and L nodes
# @param actions Which actions (regimens) should be used in estimation from the observed simulated data.
# If NULL then all actions that were defined in DAG will be considered.
# @param package Character vector for R package name to use for estimation. Currently only "ltmle" is implemented.
# @param fun Character name of the R function to employ for estimation. Currently only "ltmleMSM" is implemented.
# @param ... Additional named arguments that will be passed on to the ltmleMSM function
est.targetMSM <- function(DAG, obs_df, Aname = "A", Cname = "C", Lnames, Ytvec, ACLtvec,
                          actions = NULL, package = "ltmle", fun = "ltmleMSM", ...)
{
  outnodes <- attr(DAG, "target")$outnodes
  param_name <- attr(DAG, "target")$param_name
  if (is.null(outnodes$t)) stop("estimation is only implemented for longitudinal data with t defined")
  if (!param_name%in%"MSM") stop("estimation is only implemented for MSM target parameters")
  if (is.null(actions)) {
    message("actions argument undefined, using all available actions")
    actions <- A(DAG)
  }
  # all time points actually used in the observed data
  t_all <- attr(obs_df, "tvals")
  tvec <- outnodes$t
  t_sel <- ACLtvec
  # ltmle allows for pooling Y's over smaller subset of t's (for example t=(2:8))
  # in this case summary measures HAVE TO MATCH the dimension of finYnodes, not t_sel
  # currently this is not supported, thus, if tvec is a subset of t_sel this will cause an error
  finYnodes <- outnodes$gen_name%+%"_"%+%Ytvec
  Ynodes <- outnodes$gen_name%+%"_"%+%Ytvec
  Anodes <- Aname%+%"_"%+%ACLtvec
  Cnodes <- Cname%+%"_"%+%ACLtvec
  Lnodes <- t(sapply(Lnames, function(Lname) Lname%+%"_"%+%ACLtvec[-1]))
  Lnodes <- as.vector(matrix(Lnodes, nrow=1, ncol=ncol(Lnodes)*length(Lnames), byrow=FALSE))
  Nobs <- nrow(obs_df)
  #------------------------------------------------------------
  # getting MSM params
  #------------------------------------------------------------
  params.MSM <- attr(DAG, "target")$params.MSM
  working.msm <- params.MSM$form
  msm.family <- params.MSM$family
  if (params.MSM$hazard) stop("ltmleMSM cannot estimate hazard MSMs...")
  # the number of attributes and their dimensionality have to match between different actions
  n_attrs <- length(attr(actions[[1]], "attnames"))
  #------------------------------------------------------------
  # define the final ltmle arrays
  regimens_arr <- array(dim = c(Nobs, length(ACLtvec), length(actions)))
  summeas_arr <- array(dim = c(length(actions), (n_attrs+1), length(Ytvec)))
  # loop over actions (regimes) creating counterfactual mtx of A's for each action:
  for (action_idx in seq(actions)) {
    # I) CREATE COUNTERFACTUAL TREATMENTS &
    # II) CREATE summary.measure that describes each attribute by time + regimen
    #------------------------------------------------------------
    # needs to assign observed treatments and replace the action timepoints with counterfactuals
    A_mtx_act <- as.matrix(obs_df[,Anodes])
    #------------------------------------------------------------
    action <- actions[[action_idx]]
    # action-spec. time-points
    t_act <- as.integer(attr(action, "acttimes"))
    # action-spec. attribute names
    attnames <- attr(action, "attnames")
    # list of action-spec.
attributes attrs <- attr(action, "attrs") # time points for which we need to evaluate the counterfactual treatment assignment as determined by action: #------------------------------------------------------------ # Action t's need always be the same subset of t_sel (outcome-based times), otherwise we are in big trouble t_act_idx <- which(t_sel%in%t_act) t_chg <- t_sel[t_act_idx] # modify only A's which are defined in this action out of all Anodes As_chg <- Anodes[t_act_idx] #------------------------------------------------------------ # creates summary measure array that is of dimension (length(t_chg)) - time-points only defined for this action # which t's are in the final pooled MSM => need to save the summary measures only for these ts t_vec_idx <- which(t_act%in%tvec) summeas_attr <- matrix(nrow=length(attnames), ncol=length(tvec)) #------------------------------------------------------------ # extract values of terms in MSM formula: get all attribute values from +action(...,attrs) #------------------------------------------------------------ obs_df_attr <- obs_df # add all action attributes to the observed data for (attr_idx in seq(attnames)) { # self-contained loop # grab values of the attributes, # loop over all attributes if (length(attrs[[attnames[attr_idx]]])>1) { attr_i <- attnames[attr_idx]%+%"_"%+%t_chg val_attr_i <- attrs[[attnames[attr_idx]]][t_act_idx] } else { attr_i <- attnames[attr_idx] val_attr_i <- attrs[[attnames[attr_idx]]] } # summary measures, for each action/measure summeas_attr[attr_idx,] <- matrix(val_attr_i, nrow=1, ncol=length(t_chg))[,t_vec_idx] # observed data values of the attribute df_attr_i <- matrix(val_attr_i, nrow=Nobs, ncol=length(val_attr_i), byrow=TRUE) # create the combined data.frame (attrs + O.dat) colnames(df_attr_i) <- attr_i; obs_df_attr <- cbind(data.frame(df_attr_i), obs_df_attr) } # end of loop summeas_attr <- rbind(summeas_attr, t_chg[t_vec_idx]) rownames(summeas_attr) <- c(attnames, "t") #------------------------------------------------------------ # add action specific summary measures to the full array summeas_arr[action_idx, , ] <- summeas_attr dimnames(summeas_arr)[[2]] <- rownames(summeas_attr) #------------------------------------------------------------ # GENERATING A MATRIX OF COUNTERFACTUAL TREATMENTS: for (Achange in As_chg) { # for each A defined in the action, evaluate its value applied to the observed data cur.node <- action[As_chg][[Achange]] t <- cur.node$t newAval <- with(obs_df_attr, { # for static no need to sample from a distr ANCHOR_VARS_OBSDF <- TRUE simcausal:::eval_nodeform(as.character(cur.node$dist_params$prob), cur.node)$evaled_expr }) if (length(newAval)==1) { newA <- rep(newAval, Nobs) } else { newA <- newAval } A_mtx_act[,which(Anodes%in%Achange)] <- newA } # Result matrix A_mtx_act has all treatments that were defined in that action replaced with # their counterfactual values #------------------------------------------------------------ # add action specific summary measures to the full array regimens_arr[, , action_idx] <- A_mtx_act #------------------------------------------------------------ } list(regimens_arr=regimens_arr, summeas_arr=summeas_arr) } @ \newpage{} \tableofcontents \newpage{} %********************************************************************* \section{Introduction}\label{secintro} \addcontentsline{toc}{section}{Introduction} This article describes the \pkg{simcausal} package \citep{R-simcausal}, a comprehensive set of tools for the specification and simulation of complex longitudinal data 
structures to study causal inference methodologies. The package is developed using the \proglang{R} system for statistical computing \citep{r} and is available from the Comprehensive \proglang{R} Archive Network (CRAN) at \url{http://CRAN.R-project.org/package=simcausal}. Our package is intended to provide a flexible tool to facilitate the process of conducting \emph{transparent} and \emph{reproducible} simulation studies, with a particular emphasis on the types of data and interventions frequently encountered in real-world causal inference problems. For example, our package simplifies the simulation of observational data based on random clinical monitoring to evaluate the effect of time-varying interventions in the presence of time-dependent confounding and sources of selection bias (e.g., informative right censoring). The package provides a novel user-interface that allows concise and intuitive expression of complex functional dependencies between a large number of nodes that may represent time-varying random variables (e.g., repeated measurements over time of the same subject-matter attribute, such as blood pressure).\\
In this package, a data generating distribution is defined via the specification of a structural equation model (SEM) \citep{pearl1995, Pearl2009, pearl2010}. Therefore, \pkg{simcausal} also allows for specification and simulation of counterfactual data under various user-specified interventions (e.g., static, dynamic, deterministic, or stochastic), which are referred to as ``\emph{actions}''. These actions may represent exposure to treatment regimens and the occurrence or non-occurrence of right-censoring events or clinical monitoring events (e.g., laboratory measurements based on which treatment decisions may be made). Finally, the package enables the computation of a selected set of user-specified features of the distribution of the counterfactual data that represent common causal quantities of interest, referred to as causal target parameters. Treatment-specific means, average treatment effects (ATE) (on the multiplicative or additive scale), and coefficients from working marginal structural models (MSM) \citep{robins1998marginal,neugebauer2007} are a few of the causal target parameters that can be evaluated by the package.\\
The CRAN system contains several \proglang{R} packages for conducting data simulations with various statistical applications. We reference some of these packages below. Our review is not intended to be exhaustive and we focus on two key aspects in which \pkg{simcausal} differs from these other simulation tools.\\
First, simulations in the \pkg{simcausal} package are based on data generating distributions that can be specified via \emph{general} structural equation models. By allowing the specification of a broad range of structural equations with independent or dependent errors, the set of possible distributions available to the analyst for simulating data is not meant to be overly restrictive. For instance, any sampling distribution that is currently available in \proglang{R} or that can be user-defined in the \proglang{R} programming environment can be used for defining the conditional distribution of a node given its parents. Some of the other \proglang{R} packages rely on alternative approaches for specifying and simulating data.
For example, the package \pkg{gems} \citep{R-gems} is based on the generalized multistate models, and the package \pkg{survsim} \citep{R-survsim} is based on the Weibull, log-logistic or log-normal models. Finally, the following \proglang{R} simulation packages rely on linear structural equation models: \pkg{lavaan} \citep{R-lavaan}, \pkg{lavaan.survey} \citep{R-lavaan.survey}, \pkg{sem} \citep{fox2006teacher,R-sem}, \pkg{semPLS} \citep{R-semPLS}, \pkg{OpenMx} \citep{openmx2011,R-OpenMx} and \pkg{simsem} \citep{R-simsem}.\\ Second, unlike the \pkg{simFrame} package, which is meant as a general object-oriented tool for designing simulation studies, the \pkg{simcausal} package is instead tailored to study causal inference methodologies and is particularly suited to investigate problems based on complex longitudinal data structures \citep{robins1998marginal}. Indeed, \pkg{simcausal} provides a single pipeline for performing the following common steps frequently encountered in simulation studies from the causal inference literature: defining the observed data distribution, defining intervention/counterfactual distributions, defining causal parameters, simulating observed and counterfactual data, and evaluating the true value of causal parameters. In addition, the package introduces an intuitive user-interface for specifying complex data-generating distributions to emulate realistic real-world longitudinal data studies characterized by a large number of repeated measurements of the same subject-matter attributes over time. In particular, the \pkg{simcausal} package was designed to facilitate the study of causal inference methods for investigating the effects of complex intervention regimens such as dynamic and stochastic interventions (not just the common static and deterministic intervention regimens), and summary measures of these effects defined by (working) marginal structural models. We note, however, that while the package was initially developed for this particular methodological research purpose, its utility can be extended to a broader range of causal inference research, e.g., to perform simulation-based power calculations for informing the design of real-world studies.\\ The rest of this vignette is organized as follows. In Section~\ref{sectechn}, we provide an overview of the technical details for a typical use of the package. In Section~\ref{simstudy.pt}, we describe a template workflow for a simple simulation study with single time point interventions. In Section~\ref{simstudy.timevar}, we describe the use of the package for a more realistic and complex simulation study example based on survival data with repeated measures and dynamic interventions at multiple time points. In Section~\ref{repsimstudy1}, we apply the \pkg{simcausal} package to replicate results of a previously published simulation study by \citet{neugebauer2014, neugebauer2015}. In Section~\ref{repsimstudy2}, we apply the \pkg{simcausal} package to replicate results of another published simulation study by \citet{lefebvre2008}. We conclude with a discussion in Section~\ref{secdiscus}. 
\newpage{}
%*********************************************************************
\section{Technical details}\label{sectechn}
\addcontentsline{toc}{section}{Technical details}
\subsection{The workflow}
\addcontentsline{toc}{subsection}{The workflow}
The following schematic shows the order in which \pkg{simcausal} routines would be utilized in a typical simulation study:
\[
\begin{array}{ccc}
D=\mathtt{DAG.empty}()\\
\downarrow\\
D=D+\mathtt{node}(\ldots)+\mathtt{node}(\ldots)\\
\downarrow\\
D=\mathtt{set.DAG}(D) & \longrightarrow & \mathtt{sim}(D)\\
\downarrow\\
D=D+\mathtt{action}(\ldots) & \longrightarrow & \mathtt{sim}(D,\mathtt{actions}=\ldots)\\
\downarrow\\
D=\mathtt{set.targetE}(D,\ldots)\\
D=\mathtt{set.targetMSM}(D,\ldots)\\
\downarrow\\
\mathtt{eval.target}(D)
\end{array}
\]
\paragraph{Data structures.} The package produces the following main types of output. \begin{description} \item [\emph{parameterized causal \code{DAG} model}] - object that specifies the structural equation model, along with interventions and the causal target parameter of interest. \item [\emph{observed data}] - data simulated from the (pre-intervention) distribution specified by the structural equation model. \item [\emph{counterfactual data}] - data simulated from one or more post-intervention distributions defined by actions on the structural equation model. \item [\emph{causal target parameter}] - the true value of the causal target parameter evaluated with counterfactual data. \end{description} \paragraph{Routines.} The following routines will generally be invoked by a user, in the same order as presented below. \begin{description} \item [{\code{DAG.empty}}] initiates an empty \code{DAG} object that contains no nodes. \item [{\code{node}}] defines a node in the structural equation model and its conditional distribution, i.e., the outcome of one equation in the structural equation model and the formula that links the outcome value to that of earlier covariates, referred to as parent nodes. A call to \code{node} can specify either a single node or multiple nodes at once, with \code{name} and \code{distr} being the only required arguments. To specify multiple nodes with a single \code{node} call, one must also provide an indexing vector of integers as an argument \code{t}. In this case, each node shares the same name, but is indexed by distinct values in \code{t}. The simultaneous specification of multiple nodes is particularly relevant for providing a shorthand syntax for defining a time-varying covariate, i.e., for defining repeated measurements over time of the same subject-matter attribute, as shown in the example in Section~\ref{node2}. \item [{\code{add.nodes} or \code{D + node}}] provide two equivalent ways of growing the structural equation model by adding new nodes and their conditional distributions. Informally, these routines are intended to be used to sequentially populate a \code{DAG} object with all the structural equations that make up the causal model of interest. See Sections~\ref{node1} and \ref{node2} for examples. \item [{\code{set.DAG}}] locks the \code{DAG} object in the sense that no additional nodes can be subsequently added to the structural equation model. In addition, this routine performs several consistency checks of the user-populated \code{DAG} object. In particular, the routine attempts to simulate observations to verify that all conditional distributions in the \code{DAG} object are well-defined.
\item [{\code{sim}}] simulates independent and identically distributed (iid) observations of the complete node sequence defined by a \code{DAG} object. The output dataset is stored as a \code{data.frame} and is referred to as the \emph{observed data}. It can be structured in one of two formats, as discussed in Section~\ref{longdat}. \item [{\code{add.action} or \code{D + action}}] provides two equivalent ways to define one or more actions. An action modifies the conditional distribution of one or more nodes of the structural equation model. The resulting data generating distribution is referred to as the post-intervention distribution. It is saved in the \code{DAG} object alongside the original structural equation model. See Sections~\ref{actions1} and \ref{actions2} for examples. \item [{\code{sim(..., actions = ...)}}] can also be used for simulating independent observations from one or more post-intervention distributions, as specified by the \code{actions} argument. The resulting output is a named list of \code{data.frame} objects, collectively referred to as the \emph{counterfactual data}. The number of \code{data.frame} objects in this list is equal to the number of post-intervention distributions specified in the \code{actions} argument, where each \code{data.frame} object is an iid sample from a particular post-intervention distribution. \item [{\code{set.targetE} and \code{set.targetMSM}}] define two distinct types of target causal parameters. The output from these routines is the input \code{DAG} object with the definition of the target causal parameter saved alongside the interventions. See Sections~\ref{targets1} and \ref{targets2} for examples defining various target parameters. \item [{\code{eval.target}}] evaluates the causal parameter of interest using simulated counterfactual data. As input, it can take previously simulated counterfactual data (i.e., the output of a call to the \code{sim(..., actions = ...)} function) or, alternatively, the user can specify the sample size \code{n}, based on which counterfactual data will be simulated first. \end{description} \subsection{Specifying a structural equation model}\label{SEMintro} \addcontentsline{toc}{subsection}{Specifying a structural equation model} The \pkg{simcausal} package encodes a structural equation model using a \code{DAG} object. The \code{DAG} object is a collection of nodes, each node represented by a \code{DAG.node} object that captures a single equation of the structural equation model. \code{DAG.node} objects are created by calling the \code{node} function. When the \code{node} function is used to simultaneously define multiple nodes, these nodes share the same name, but must be indexed by distinct user-specified integer values of the time variable \code{t}, as shown in the example in Section~\ref{node2}. We will refer to a collection of nodes defined simultaneously in this manner as a \emph{time-varying node} and we will refer to each node of such a collection as a measurement at a specific time point.\\ Each node is usually added to a growing \code{DAG} object by using either the \code{add.nodes} function or equivalently the \code{'+'} function, as shown in the example in Sections~\ref{node1} and \ref{node2}. Each new node added to a \code{DAG} object must be uniquely identified by its name or the combination of a name and a value for the time variable argument \code{t}.\\ The user may explicitly specify the temporal ordering of each node using the \code{order} argument of the \code{node()} function. 
However, if this argument is omitted, the \code{add.nodes} function assigns the temporal ordering to a node by using the actual order in which this node was added to the \code{DAG} object and, if applicable, the value of the time variable that indexes this node (earlier added nodes receive a lower order value, compared to those that are added later; nodes with a lower value for the \code{t} argument receive a lower order value, compared to those with a higher value of \code{t}).\\ The \code{node} function also defines the conditional distribution of a node, given its parents, with a combination of the sampling distribution specified by the \code{distr} argument and the distributional parameters specified as additional named arguments to the \code{node()} function. This \code{distr} argument can be set to the name of any \proglang{R} function that accepts an integer argument named \code{n} and returns a vector of size \code{n}. Examples of such distribution functions are provided in Section~\ref{custom}.\\ The distributional parameters are specified as additional named arguments of the \code{node()} function and can be either constants or some summary measures of the parent nodes. Their values can be set to any evaluable \code{R} expressions that may reference any standard or user-specified \code{R} function, and also, may invoke a novel and intuitive shorthand syntax for referencing specific measurements of time-varying parent nodes, i.e., nodes identified by the combination of a node name and a time point value \code{t}. The syntax for identifying specific measurements of time-varying nodes is based on a re-purposed \proglang{R} square-bracket vector subsetting function \code{'['}: e.g., writing the expression \code{sum(A[0:5])}, where \code{A} is the name of a previously defined time-varying node, defines the summary measure that is the sum of the node values over time points \code{t = 0},$\ldots$,\code{5}. This syntax may also be invoked to simultaneously define the conditional distribution of the measurements of a time-varying node over multiple time points \code{t} at once. For example, defining the conditional distribution of a time-varying node with the \proglang{R} expression \code{sum(A[max(0, t - 5):t]) + t} will resolve to different node formulas for each measurement of the time-varying node, depending on the value of \code{t}: \begin{enumerate} \item \code{A[0]} at \code{t = 0}; \item \code{sum(A[0:1]) + 1} at \code{t = 1}, $\ldots$, \code{sum(A[0:5]) + 5} at \code{t = 5}; \item \code{sum(A[1:6]) + 6} at \code{t = 6}, $\ldots$, \code{sum(A[5:10]) + 10} at \code{t = 10}. \end{enumerate} Concrete applications of this syntax are described in Section \ref{node2}, as well as in the documentation of the \code{node()} function (\code{?node}).\\ Note that the user can also define a causal model with one or more nodes that represent the occurrence of \emph{end of follow-up} (EFU) events (e.g., right-censoring events or failure events of interest). Such nodes are defined by calling the \code{node()} function with the \code{EFU} argument being set to \code{TRUE}. The EFU nodes encode binary random variables whose value of 1 indicates that, by default, all of the subsequent nodes (i.e., nodes with a higher temporal order value) are to be replaced with a constant \code{NA} (missing) value. 
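For instance, a right-censoring node could be declared as in the following minimal sketch (not evaluated; the node names \code{C} and \code{L}, the time points, and the coefficients are hypothetical):
<<sketchEFU, eval = FALSE>>=
# a sketch (not run): hypothetical right-censoring node "C", indexed by time
# points t = 0, ..., 10, whose hazard depends on a previously defined
# time-varying covariate "L"; EFU = TRUE marks it as an end of follow-up event
D <- D + node("C", t = 0:10, distr = "rbern",
              prob = plogis(-3 + 0.5 * L[t]), EFU = TRUE)
@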
As an alternative to this default behavior, the user may choose to impute missing values for the time-varying node that represents the failure event of interest using the \emph{last time point value carried forward} (LTCF) imputation method. This imputation procedure consists in replacing missing values for measurements of a time-varying node at time points \code{t} after an end of follow-up event with its last known measurement value prior to the occurrence of an end of follow-up event. Additional details about this imputation procedure are provided in Sections~\ref{introsim} and \ref{LTCFdat} and its relevance is demonstrated in Section~\ref{Ex1survMSM} (Example 1 of \code{set.targetMSM}).\\
Finally, we note that the package includes pre-written wrapper functions for random sampling from some commonly employed distributions. These routines can be passed directly to the \code{distr} argument of the node function with the relevant distributional parameters on which they depend. These built-in functions can be listed at any time by calling \code{distr.list()}. In particular, the routines \code{"rbern"}, \code{"rconst"}, and \code{"rcat.b1"} can be used for specifying a Bernoulli distribution, a degenerate distribution (constant at a given value), and a categorical distribution, respectively. One can also use any of the standard random generating \proglang{R} functions, e.g., \code{"rnorm"} for sampling from the normal distribution and \code{"runif"} for sampling from the uniform distribution, as demonstrated in Sections~\ref{node1} and \ref{custom}.
\subsection{Specifying interventions}
\addcontentsline{toc}{subsection}{Specifying interventions}
An intervention regimen (also referred to as action regimen) is defined as a sequence of conditional distributions that replace the original distributions of a subset of nodes in a \code{DAG} object. To specify an intervention regimen, the user must identify the set of nodes to be intervened upon and provide new node distributions for them. The user may define a static, dynamic, deterministic or stochastic intervention on any given node, depending on the type of distributions specified. A deterministic static intervention is characterized by replacing a node distribution with a degenerate distribution such that the node takes on a constant value. A deterministic dynamic intervention is characterized by a conditional degenerate distribution such that the node takes on a value that is only a function of the values of its parents (i.e., a decision rule). A stochastic intervention is characterized by a non-degenerate conditional distribution. A stochastic intervention is dynamic if it is characterized by a non-degenerate conditional distribution that is defined as a function of the parent nodes and it is static otherwise. Note that a particular intervention may span different types of nodes and consist of different types of distributions, e.g., an intervention on a monitoring node can be static, while the intervention on a treatment node from the same structural equation model may be dynamic.\\
To define an intervention the user must call \code{D + action(A, nodes = B)} (or equivalently \code{add.action(D, A, nodes = B)}), where \code{D} is a \code{DAG} object, \code{A} is a unique character string that represents the intervention name, and \code{B} is a list of \code{DAG.node} objects defining the intervention regimen.
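For example, a deterministic static intervention on a hypothetical treatment node \code{A} could be sketched as follows (not evaluated; it assumes \code{D} is a locked \code{DAG} object that contains a node named \code{A}):
<<sketchAction, eval = FALSE>>=
# a sketch (not run): a new action "A1" replaces the distribution of node "A"
# with the degenerate distribution at constant value 1
D <- D + action("A1", nodes = node("A", distr = "rbern", prob = 1))
@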
To construct \code{B} the user must first aggregate the output from one or more calls to \code{node} (using \code{c(..., ...)}), with the \code{name} argument of the \code{node} function call set to node names that already exist in the locked \code{DAG} object \code{D}. The example in Section~\ref{actions2} demonstrates this functionality. Alternatively, repeated calls to \code{add.action} or \code{D+action} with the same intervention name, e.g., \code{A = }\code{"A1"}, allow the incremental definition of an intervention regimen by passing each time a different \code{node} object, enabling iterative build-up of the collection \code{B} of the intervened nodes that define the intervention regimen. Note, however, that by calling \code{D + action} or \code{add.action(D, ...)} with a new action name, e.g., \code{action("A2", ...)}, the user initiates the definition of a new intervention regimen. \subsection{Specifying a target causal parameter} \addcontentsline{toc}{subsection}{Specifying a target causal parameter} The causal parameter of interest (possibly a vector) is defined by either calling the function \code{set.targetE} or \code{set.targetMSM}. The function \code{set.targetE} defines causal parameters as the expected value(s) of \code{DAG} node(s) under one post-intervention distribution or the contrast of such expected value(s) from two post-intervention distributions. The function \code{set.targetMSM} defines causal parameters based on a \textit{working} marginal structural model \citep{neugebauer2007}. In both cases, the true value of the causal parameter is defined by one or several post-intervention distributions and can thus be approximated using counterfactual data.\\ The following types of causal parameters can be defined with the function \code{set.targetE}: \begin{itemize} \item The expectation of an outcome node under an intervention regimen denoted by $d$, where the outcome under $d$ is denoted by $Y_{d}$. This parameter can be naturally generalized to a vector of measurements of a time-varying node, i.e., the collection of nodes $Y_{d}(t)$ sharing the same name, but indexed by distinct time points \code{t} that represents a sequence of repeated measurements of the same attribute (e.g., a CD4 count or the indicator of past occurrence of a given failure event): \[ E(Y_{d})\textrm{ or }(E(Y_{d}(t)))_{t=0,1,\ldots}. \] \item The difference between two expectations of an outcome node under two interventions, $d_{1}$ and $d_{0}$. This parameter can also be naturally generalized to a vector of measurements of a time-varying node: \[ E(Y_{d_{1}})-E(Y_{d_{0}})\textrm{ or }(E(Y_{d_{1}}(t))-E(Y_{d_{0}}(t)))_{t=0,1,\ldots}. \] \item The ratio of two expectations of an outcome node under two interventions. This parameter can also be naturally generalized to a vector of measurements of a time-varying node: \[ \frac{E(Y_{d_{1}})}{E(Y_{d_{0}})}\textrm{ or }\bigg(\frac{E(Y_{d_{1}}(t))}{E(Y_{d_{0}}(t))}\bigg)_{t=0,1,\ldots}. \] \end{itemize} Note that if the \code{DAG} object contains nodes of type \code{EFU = TRUE} other than the outcome nodes of interest $Y_{d}(t)$, the target parameter must be defined by intervention regimens that set all such nodes that precede all outcomes of interest $Y_{d}(t)$ to $0$. Also note that with such intervention regimens, if the outcome node is time-varying of type \code{EFU = TRUE} then the nodes $Y_{d}(t)$ remain well defined (equal to $1$) even after the time point when the simulated value for the outcome jumps to $1$ for the first time. 
The nodes $Y_{d}(t)$ can then be interpreted as indicators of past failures in the absence of right-censoring events. The specification of these target parameters is covered with examples in Sections~\ref{targets1NP} and \ref{targets2NP}.\\
When the definition of the target parameter is based on a working marginal structural model, the vector of coefficients (denoted by $\alpha$) of the working model defines the target parameter. The definition of these coefficients relies on the specification of a particular weighting function when the working model is not a correct model (see \citet{neugebauer2007} for details). This weighting function is set to the constant function $1$ in this package. The corresponding true value of the coefficients $\alpha$ can then be approximated by running standard (unweighted) regression routines on simulated counterfactual data observations. The following types of working models, denoted by $m()$, can be defined with the function \code{set.targetMSM}:
\begin{itemize}
\item The working linear or logistic model for the expectation of one outcome node under intervention $d$, possibly conditional on baseline node(s) $V$, where a baseline node is any node preceding the earliest node that is intervened upon, i.e., $E(Y_{d}\mid V)$:
\[
m(d,V\:|\:\alpha).
\]
Such a working model can, for example, be used to evaluate the effects of HIV treatment regimens on the mean CD4 count measured at one point in time.
\item The working linear or logistic model for the expectation vector of measurements of a time-varying outcome node, possibly conditional on baseline node(s) $V$, i.e., $E(Y_{d}(t)\mid V)$:
\[
m(t,d,V\:|\:\alpha)\textrm{, for }t=0,1,\ldots.
\]
Such a working model can, for example, be used to evaluate the effects of HIV treatment regimens on survival probabilities over time.
\item The logistic working model for discrete-time hazards, i.e., for the probabilities that a measurement of a time-varying outcome node of type \code{EFU=TRUE} is equal to $1$ under intervention $d$, given that the previous measurement of the time-varying outcome node under intervention $d$ is equal to $0$, possibly conditional on baseline node(s) $V$, i.e., $E(Y_{d}(t)\mid Y_{d}(t-1)=0,V)$:
\[
m(t,d,V\:|\:\alpha)\textrm{, for }t=0,1,\ldots.
\]
Such a working model can, for example, be used to evaluate the effects of HIV treatment regimens on discrete-time hazards of death over time.
\end{itemize}
Examples of the specification of the above target parameters are provided in Sections~\ref{targets1MSM} and \ref{targets2MSM}. As shown above, the working MSM formula $m()$ can be a function of $t$, $V$ and $d$, where $d$ is a unique identifier of each intervention regimen. In Sections~\ref{targets1MSM} and \ref{MSMwS.2} we describe in detail how to specify such identifiers for $d$ as part of the \code{action} function call. Also note that the working MSM formula, $m$, may reference time-varying nodes using the square-bracket syntax introduced in Section~\ref{SEMintro}, as long as all such instances are embedded within the syntax \code{S(...)}. This formula syntax can be used to define $V$, where $V$ is defined by a baseline measurement of a time-varying node. Example uses of this syntax are provided in Section \ref{MSMwS.1} (Example 6 of \code{set.targetMSM}) and Section \ref{MSMwS.2} (Example 7 of \code{set.targetMSM}).
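To fix ideas, the two specification styles can be sketched as follows (not evaluated; the outcome node \code{Y}, the time points, the action names and the action attribute \code{d} are hypothetical, with detailed examples in the sections referenced above):
<<sketchTarget, eval = FALSE>>=
# a sketch (not run): vector of differences E(Y_{d1}(t)) - E(Y_{d0}(t)) over t
D <- set.targetE(D, outcome = "Y", t = 0:10, param = "A1-A0")
# a sketch (not run): logistic working MSM for survival probabilities over t,
# with intervention regimens indexed by the action attribute d
D <- set.targetMSM(D, outcome = "Y", t = 0:10, form = "Y ~ d + t",
                   family = "binomial", hazard = FALSE)
@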
\subsection{Simulating data and evaluating the target causal parameter}\label{introsim} \addcontentsline{toc}{subsection}{Simulating data and evaluating the target causal parameter} The \pkg{simcausal} package can simulate two types of data: 1) observed data, sampled from the (pre-intervention) distribution specified by the structural equation model and 2) counterfactual data, sampled from one or more post-intervention distributions defined by actions on the structural equation model. Both types of data are simulated by invoking the \code{sim} function and the user can set the seed for the random number generator using the argument \code{rndseed}. The examples showing how to simulate observed data are provided in Sections~\ref{simobs1} and \ref{simobs2}, whereas the examples showing how to simulate counterfactual data are provided in Sections~\ref{simfull1} and \ref{simfull2}.\\ We note that two types of structural equation models can be encoded with the \code{DAG} object: 1) models where some or all nodes are defined by specifying the ``time'' argument \code{t} to the \code{node} function, or 2) models where the argument \code{t} is not used for any of the nodes. For the first type of structural equation models, the simulated data can be structured in either \emph{long} or \emph{wide} formats. A dataset is considered to be in wide format when each simulated observation of the complete sequence of nodes is represented by only one row of data, with each time-varying node represented by columns spanning distinct values of \code{t}. In contrast, for a dataset in long format, each simulated observation is typically represented by multiple rows of data indexed by distinct values of \code{t} and each time-varying node represented by a single column. The format of the output data is controlled by setting the argument \code{wide} of the \code{sim} function to \code{TRUE} or \code{FALSE}. The default setting for \code{sim} is to simulate data in wide format, i.e., \code{wide = TRUE}. An example describing these two formats is provided in Section~\ref{longdat}.\\ In addition, as previously described, for nodes representing the occurrence of end of follow-up events (i.e., censoring or outcome nodes declared with \code{EFU = TRUE}), the value of $1$ indicates that, during data simulation, by default, all values of subsequent nodes (including the outcome nodes) are set to missing (\code{NA}). To instead impute these missing values after a particular end of follow-up event occurs (typically the outcome event) with the \emph{last time point value carried forward} (LTCF) method, the user must set the argument \code{LTCF} of the \code{sim} function to the name of the \code{EFU}-type node that represents the end of follow-up event of interest. This will result in carrying forward the last observed measurement value for all time-varying nodes, after the value of the EFU node whose name is specified by the \code{LTCF} argument is observed to be $1$. For additional details see the package documentation for the function \code{sim}. Examples demonstrating the use of the \code{LTCF} argument for data simulation are provided in Sections~\ref{LTCFdat} and \ref{targets2NP} (Example 1 of \code{set.targetMSM}).\\ In the last step of a typical workflow, the function \code{eval.target} is generally invoked for estimation of the true value of a previously defined target causal parameter. The true value is estimated using counterfactual data simulated from post-intervention distributions. 
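For instance, these last two steps of the workflow can be sketched as follows (not evaluated; it assumes a \code{DAG} object \code{D} with two previously defined actions named \code{"A1"} and \code{"A0"} and a previously set target parameter):
<<sketchEval, eval = FALSE>>=
# a sketch (not run): simulate counterfactual data under two actions, then
# evaluate the previously defined target parameter on these data
Xdat <- sim(DAG = D, actions = c("A1", "A0"), n = 10000, rndseed = 1)
res <- eval.target(D, data = Xdat)
# equivalently, eval.target can simulate the counterfactual data internally:
res <- eval.target(D, n = 10000, rndseed = 1)
@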
The function \code{eval.target} can be called with either previously simulated counterfactual data, specified by the argument \code{data}, or a sample size value, specified by the argument \code{n}. In the latter case, counterfactual data with the user-specified sample size will be simulated first.
\newpage{}
%*********************************************************************
\section{Simulation study with single time point interventions}\label{simstudy.pt}
\addcontentsline{toc}{section}{Simulation study with single time point interventions}
This example describes a typical workflow for specifying a simple structural equation model, defining various interventions, simulating observed and counterfactual data, and calculating various causal target parameters. The structural equation model chosen here illustrates a common point treatment problem in which one is interested in evaluating the effect of an intervention on one treatment node on a single outcome node using observational data with confounding by baseline covariates. This example also demonstrates the use of a plotting functionality of the \pkg{simcausal} package that builds upon the \pkg{igraph} \proglang{R} package \citep{igraph} to visualize the Directed Acyclic Graph (DAG) \citep{pearl1995, Pearl2009, pearl2010} implied by the structural equation model encoded in the \code{DAG} object.
\subsection{Specifying the structural equation model}\label{node1}
\addcontentsline{toc}{subsection}{Specifying the structural equation model}
The example below shows how to specify a structural equation model with six nodes to represent a single time point intervention study: one categorical baseline covariate with 3 categories (\code{race}), one normally distributed baseline confounder (\code{W1}), one uniformly distributed baseline confounder (\code{W2}), one binary baseline confounder (\code{W3}), one binary exposure (\code{Anode}), and one binary outcome (\code{Y}). This example uses pre-defined \proglang{R} functions \code{rcat.b1}, \code{rbern}, \code{rnorm} and \code{runif} for sampling from the categorical, Bernoulli, normal, and uniform distributions, respectively. For details and examples on writing sampling functions for arbitrary distributions see Section~\ref{custom}. We also refer to Section~\ref{custom} for a description on how to specify node formulas (distributional parameters), such as with the use of \proglang{R} expressions specified by the \code{probs}, \code{mean} and \code{prob} arguments in the example below.
<<>>=
library("simcausal")
D <- DAG.empty()
D <- D +
  node("race", distr = "rcat.b1", probs = c(0.5, 0.25, 0.25)) +
  node("W1", distr = "rnorm",
    mean = ifelse(race == 1, 0, ifelse(race == 2, 3, 10)), sd = 1) +
  node("W2", distr = "runif", min = 0, max = 1) +
  node("W3", distr = "rbern", prob = plogis(-0.5 + 0.7 * W1 + 0.3 * W2)) +
  node("Anode", distr = "rbern",
    prob = plogis(-0.5 - 0.3 * W1 - 0.3 * W2 - 0.2 * W3)) +
  node("Y", distr = "rbern",
    prob = plogis(-0.1 + 1.2 * Anode + 0.1 * W1 + 0.3 * W2 + 0.2 * W3))
Dset <- set.DAG(D)
@
Running the code above results in implicitly assigning a sampling order (temporal order) to each node, based on the order in which the nodes were added to the \code{DAG} object \code{D}. Alternatively, one can use the optional \code{node()} argument \code{order} to explicitly specify the integer value of the sampling order of each node, as described in more detail in the documentation for the \code{node} function.
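For instance, a minimal sketch (not evaluated) of an explicit \code{order} specification for the first two nodes of the example above:
<<sketchOrder, eval = FALSE>>=
# a sketch (not run): the same node definitions with explicit sampling orders
D <- DAG.empty()
D <- D +
  node("race", distr = "rcat.b1", probs = c(0.5, 0.25, 0.25), order = 1) +
  node("W1", distr = "rnorm",
    mean = ifelse(race == 1, 0, ifelse(race == 2, 3, 10)), sd = 1, order = 2)
@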
The resulting internal representation of the structural equation model encoded by the \code{DAG} object \code{Dset} can be examined as follows (output showing only the first node of \code{Dset}).
<<>>=
str(Dset[1])
@
Figure~\ref{fig:DAG1t} shows the plot of the DAG that is generated by calling the function \code{plotDAG}. This DAG is implied by the structural equation model specified above and the plotting is accomplished by using the visualization functionality from the \pkg{igraph} package \citep{igraph}. The directional arrows represent the functional dependencies in the structural equation model. More specifically, the node of origin of each arrow is a node name extracted from the \emph{node formula(s)}. Note that the appearance of the resulting diagram can be customized with additional arguments, as shown in the example below and in the \code{plotDAG} example in Section~\ref{node2}.
<<eval = FALSE>>=
plotDAG(Dset, xjitter = 0.3, yjitter = 0.04,
  edge_attrs = list(width = 0.5, arrow.width = 0.4, arrow.size = 0.8),
  vertex_attrs = list(size = 12, label.cex = 0.8))
@
<<DAG1t, echo = FALSE, fig.cap = "Directed acyclic graph implied by the structural equation model specified above.">>=
plotDAG(Dset, xjitter = 0.3, yjitter = 0.03,
  edge_attrs = list(width = 0.5, arrow.width = 0.4, arrow.size = 0.8),
  vertex_attrs = list(size = 12, label.cex = 0.8))
@
\subsection[Simulating observed data (sim)]{Simulating observed data (\code{sim})}\label{simobs1}
\addcontentsline{toc}{subsection}{Simulating observed data (sim)}
Simulating observed data is accomplished by calling the function \code{sim} and specifying its arguments \code{DAG} and \code{n} that indicate the causal model and sample size of interest. Below is an example of how to simulate an observed dataset with 100 observations using the causal model defined in the previous section. The output is a \code{data.frame} object.
<<>>=
Odat <- sim(DAG = Dset, n = 100, rndseed = 123)
@
The format of the output dataset is easily understood by examining the first row of the \code{data.frame} object:
<<>>=
Odat[1,]
@
\subsection[Specifying interventions (+ action)]{Specifying interventions (\code{+ action})}\label{actions1}
\addcontentsline{toc}{subsection}{Specifying interventions (+ action)}
The example below defines two actions on the treatment node. The first action named \code{"A1"} consists in replacing the distribution of the treatment node \code{Anode} with the degenerate distribution at value 1. The second action named \code{"A0"} consists in replacing the distribution of the treatment node \code{Anode} with the degenerate distribution at value 0. As shown below, these interventions are defined by invoking the \code{ + action} syntax on the existing \code{DAG} object. This syntax automatically adds and saves the new intervention object within the original \code{DAG} object, without overwriting it.
<<>>=
A1 <- node("Anode", distr = "rbern", prob = 1)
Dset <- Dset + action("A1", nodes = A1)
A0 <- node("Anode", distr = "rbern", prob = 0)
Dset <- Dset + action("A0", nodes = A0)
@
The added actions can be examined by looking at the result of the call \code{A(Dset)}.
Note that \code{A(Dset)} returns a list of \code{DAG.action} objects, with each \code{DAG.action} encoding a particular post-intervention distribution, i.e., it is a modified copy of the original \code{DAG} object, where the original distribution of the node \code{Anode} is replaced with the degenerate distribution at value 0 or 1, for actions \code{"A0"} and \code{"A1"}, respectively.
<<>>=
names(A(Dset))
class(A(Dset)[["A0"]])
@
The following example shows how to display the modified distribution of the intervention node \code{Anode} under action \code{"A0"}:
<<>>=
A(Dset)[["A0"]]$Anode
@
To examine the complete internal representation of the \code{DAG.action} object for action \code{"A0"}, invoke the \code{str()} function as follows (output not shown):
<<results = "hide">>=
str(A(Dset)[["A0"]])
@
\subsection[Simulating counterfactual data (sim)]{Simulating counterfactual data (\code{sim})}\label{simfull1}
\addcontentsline{toc}{subsection}{Simulating counterfactual data (sim)}
Simulating counterfactual data is accomplished by calling the function \code{sim} and specifying its arguments \code{DAG}, \code{actions} and \code{n} to indicate the causal model, interventions, and sample size of interest. Counterfactual data can be simulated for all actions stored in the \code{DAG} object or a subset by setting the \code{actions} argument to the vector of the desired action names.\\
The example below shows how to use the \code{sim} function to simulate 1,000 observations for each of the two actions, \code{"A1"} and \code{"A0"}. These actions were defined as part of the \code{DAG} object \code{Dset} above. The call to \code{sim} below produces a list of two named \code{data.frame} objects, where each \code{data.frame} object contains observations simulated from the same post-intervention distribution defined by one particular action only.
<<>>=
Xdat1 <- sim(DAG = Dset, actions = c("A1", "A0"), n = 1000, rndseed = 123)
names(Xdat1)
nrow(Xdat1[["A1"]])
nrow(Xdat1[["A0"]])
@
The format of the output list is easily understood by examining the first row of each \code{data.frame} object:
<<>>=
Xdat1[["A1"]][1, ]
Xdat1[["A0"]][1, ]
@
\subsection{Defining and evaluating various causal target parameters}\label{targets1}
\addcontentsline{toc}{subsection}{Defining and evaluating various causal target parameters}
\subsubsection[Causal parameters defined with set.targetE]{Causal parameters defined with \code{set.targetE}}\label{targets1NP}
The first example below defines the causal quantity of interest as the expectation of node \code{Y} under action \code{"A1"}:
<<>>=
Dset <- set.targetE(Dset, outcome = "Y", param = "A1")
@
The true value of the above causal parameter is now evaluated by calling the function \code{eval.target} and passing the previously simulated counterfactual data object \code{Xdat1}.
<<>>=
eval.target(Dset, data = Xdat1)$res
@
Alternatively, \code{eval.target} can be called without the simulated counterfactual data, specifying the sample size argument \code{n} instead. In this case a counterfactual dataset with the user-specified sample size is simulated first.
<<>>=
eval.target(Dset, n = 1000, rndseed = 123)$res
@
The example below defines the causal target parameter as the ATE on the additive scale, i.e., the expectation of \code{Y} under action \code{"A1"} minus its expectation under action \code{"A0"}.
<<>>=
Dset <- set.targetE(Dset, outcome = "Y", param = "A1-A0")
eval.target(Dset, data = Xdat1)$res
@
Similarly, the ATE on the multiplicative scale can be evaluated as follows:
<<>>=
Dset <- set.targetE(Dset, outcome = "Y", param = "A1/A0")
eval.target(Dset, data = Xdat1)$res
@
\subsubsection[Causal parameters defined with set.targetMSM]{Causal parameters defined with \code{set.targetMSM}}\label{targets1MSM}
To specify an MSM target causal parameter, the user must provide the following arguments to \code{set.targetMSM}: (1) the \code{DAG} object that contains all and only the actions of interest; (2) \code{outcome}, the name of the outcome node (possibly time-varying); (3) for a time-varying outcome node, the vector of time points \code{t} that index the outcome measurements of interest; (4) \code{form}, the regression formula defining the working MSM; (5) \code{family}, the working model family that is passed on to \code{glm}, e.g., \code{family = "binomial"} or \code{family = "gaussian"} for a logistic or a linear working model; and (6) for time-to-event outcomes, the logical flag \code{hazard} that indicates whether the working MSM describes discrete-time hazards (\code{hazard = TRUE}) or survival probabilities (\code{hazard = FALSE}).\\
In the examples above, the two actions \code{"A1"} and \code{"A0"} are defined as deterministic static interventions on the node \code{Anode}, setting it to either constant 0 or 1. Thus, each of these two interventions is uniquely indexed by the post-intervention value of the node \code{Anode} itself. In the following example, we instead introduce the variable $d\in\{0,1\}$ to explicitly index each of the two post-intervention distributions when defining the two actions of interest. We then define the target causal parameter as the coefficients of the following linear marginal structural model $m(d\:|\:\alpha)=\alpha_{0}+\alpha_{1}d$. As expected, the estimated true value for $\alpha_{1}$ obtained below corresponds exactly to the estimated value for the ATE on the additive scale obtained above by running \code{set.targetE} with the parameter \code{param = "A1-A0"}.\\
As just described, we now redefine the actions \code{"A1"} and \code{"A0"} by indexing the intervention node formula (the distributional parameter \code{prob}) with the parameter \code{d}, before setting its value to $0$ or $1$ through an additional new argument named \code{d} in the \code{action} function call. This creates an action-specific attribute variable \code{d} whose value uniquely identifies each of the two actions and that will be included as an additional column in the simulated counterfactual data sets.
<<>>=
A1 <- node("Anode", distr = "rbern", prob = d)
Dset <- Dset + action("A1", nodes = A1, d = 1)
A0 <- node("Anode", distr = "rbern", prob = d)
Dset <- Dset + action("A0", nodes = A0, d = 0)
@
Creating such an action-specific attribute \code{d} allows it to be referenced in the MSM regression formula, as shown below:
<<>>=
msm.form <- "Y ~ d"
Dset <- set.targetMSM(Dset, outcome = "Y", form = msm.form, family = "gaussian")
msm.res <- eval.target(Dset, n = 1000, rndseed = 123)
msm.res$coef
@
\subsection{Defining node distributions and vectorizing node formula functions}\label{custom}
\addcontentsline{toc}{subsection}{Defining node distributions and vectorizing node formula functions}
To facilitate the comprehension of the following two subsections, we note that, in the \pkg{simcausal} package, simulation of observed or counterfactual data follows the temporal ordering of the nodes that define the DAG object and is \textbf{vectorized}. More specifically, the simulation of a dataset with sample size \code{n} is carried out by first sampling the vector of all \code{n} observations of the first node, then sampling the vector of all \code{n} observations of the second node, and so on, where the node ranking is defined by the temporal ordering that was explicitly or implicitly specified by the user during the creation of the DAG object (see Section~\ref{SEMintro} for a discussion of temporal ordering).
\subsubsection{Defining node distributions}
The distribution of a particular node is specified by passing the name of an evaluable \proglang{R} function to the \code{distr} argument of the function \code{node}. Such a distribution function must implement the mapping of \code{n} independent realizations of the parent nodes into \code{n} independent realizations of this node. In general, any node with a lower temporal ordering can be defined as a parent. Thus, such a distribution function requires an argument \code{n}, but will also typically rely on additional input arguments referred to as distributional parameters. In addition, the output of the distribution function must be a vector of length \code{n}. Distributional parameters must be either scalars or vectors of \code{n} realizations of summary measures of the parent nodes. The latter types of distributional arguments are referred to as the \emph{node formula(s)} because they are specified by evaluable \proglang{R} expressions. Distributional parameters are passed as named arguments to the \code{node} function so that they can be mapped uniquely to the corresponding arguments of the function specified by the \code{distr} argument of the \code{node} function call. The node formula(s) of any given node may invoke the name(s) of any other node(s) with a lower temporal order value. The parents of a particular node are thus defined as the collection of nodes that are referenced by its node formula(s). Note that, unlike the values of distributional parameters, the value of the argument \code{n} of the \code{distr} function is determined internally during data simulation and is set to the sample size value passed to the \code{sim} function by the user.\\
For example, as shown below, the pre-written wrapper function for the Bernoulli distribution \code{rbern} is defined with two arguments, \code{n} and \code{prob}.
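More generally, every \code{distr} function follows this same input--output convention. The following minimal sketch (not run) illustrates it with a hypothetical custom distribution function; the name \code{runif.shift} is purely illustrative and is not part of the package:
<<eval=FALSE>>=
## A minimal sketch of the general shape of a "distr" function: it takes
## the sample size n plus distributional parameters (scalars or vectors
## of length n) and returns a vector of n realizations.
runif.shift <- function(n, shift = 0) {
  runif(n) + shift  # "shift" recycles if scalar; elementwise if length n
}
@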
When defining a node with the \code{distr} argument set to \code{"rbern"}, only the second argument must be explicitly user-specified by a distributional parameter named \code{prob} in the call to the \code{node} function, e.g., \code{node("N1", distr="rbern", prob = 0.5)}. The argument \code{prob} can be either a numeric constant, as in the previous example, or an evaluable \proglang{R} expression. When \code{prob} is a numeric constant, the distribution function \code{rbern} returns \code{n} iid realizations of the Bernoulli random variable with probability \code{prob}. When \code{prob} is an \proglang{R} expression (e.g., see the definition of node \code{W3} in Section~\ref{node1}) that involves parent nodes, the \code{prob} argument passed to the \code{rbern} function becomes a vector of length \code{n}. The value of each of its components is determined by the \proglang{R} expression evaluated using one of the \code{n} iid realizations of the parent nodes simulated previously. Thus, the resulting simulated independent observations of the child node (e.g., \code{W3} in Section~\ref{node1}) are not necessarily identically distributed if the vector \code{prob} contains distinct values. We note that the \proglang{R} expression in the \code{prob} argument is evaluated in the environment containing the simulated observations of all previous nodes (i.e., nodes with a lower temporal order value).\\
To see the names of all pre-written distribution wrapper functions that are specifically optimized for use as \code{distr} functions in the \pkg{simcausal} package, invoke \code{distr.list()}, as shown below:
<<>>=
distr.list()
@
For a template on how to write a custom distribution function, see the documentation \code{?rdistr.template} and \code{rdistr.template}, as well as any of the pre-written distribution functions above. For example, the \code{rbern} function below simply wraps around the standard \proglang{R} function \code{rbinom} to define the Bernoulli random variable generator:
<<>>=
rbern
@
Another example demonstrates how to write a custom distribution function for a left-truncated normal distribution, based on the standard \proglang{R} function \code{rnorm} with arguments \code{mean} and \code{sd}. The truncation level is specified by an additional distributional parameter \code{minval}, with default value set to $0$.
<<>>=
rnorm_trunc <- function(n, mean, sd, minval = 0) {
  out <- rnorm(n = n, mean = mean, sd = sd)
  # minval may be a scalar or a vector of n parent-dependent values
  # (one truncation level per observation):
  minval <- rep_len(minval, n)
  out[out < minval] <- minval[out < minval]
  out
}
@
The example below makes use of this function to define the outcome node \code{Y} with positive values only:
<<>>=
Dmin0 <- DAG.empty()
Dmin0 <- Dmin0 +
  node("W", distr = "rbern", prob = plogis(-0.5)) +
  node("Anode", distr = "rbern", prob = plogis(-0.5 - 0.3 * W)) +
  node("Y", distr = "rnorm_trunc", mean = -0.1 + 1.2 * Anode + 0.3 * W,
       sd = 10)
Dmin0set <- set.DAG(Dmin0)
@
In the next example, we overwrite the previous definition of node \code{Y} to demonstrate how alternative values for the truncation parameter \code{minval} may be passed by the user as part of the \code{node} function call:
<<>>=
Dmin0 <- Dmin0 +
  node("Y", distr = "rnorm_trunc", mean = -0.1 + 1.2 * Anode + 0.3 * W,
       sd = 10, minval = 10)
Dmin10set <- set.DAG(Dmin0)
@
Finally, we illustrate how the \code{minval} argument can also be defined as a function of parent node realizations:
<<>>=
Dmin0 <- Dmin0 +
  node("Y", distr = "rnorm_trunc", mean = -0.1 + 1.2 * Anode + 0.3 * W,
       sd = 10, minval = ifelse(Anode == 0, 5, 10))
Dminset <- set.DAG(Dmin0)
@
\subsubsection[Vectorizing node formula functions vecfun.add]{Vectorizing node formula functions (\code{vecfun.add})}
As just described, the distributional parameters defining a particular node distribution can be evaluable \proglang{R} expressions referred to as \emph{node formulas}. These expressions can contain any built-in or user-defined \proglang{R} functions. By default, any function inside such an \proglang{R} expression is assumed non-vectorized, except for functions on the \pkg{simcausal} built-in list of known vectorized functions. To see this list, call the \code{vecfun.all.print} function:
<<>>=
vecfun.all.print()
@
When parsing node formulas, the default behavior implemented in \pkg{simcausal} is to replace each call to a function not on this list (i.e., a function not recognized as vectorized) with a call to the \code{apply} function, which loops over the rows of the dataset containing the simulated realizations of the parent nodes (stored in wide format) and calls the unrecognized function individually on every row of the data. For example, with the user-defined function \code{power2} below, the simulation of observations of the node \code{W3} first involves automatic replacement of the calls \code{power2(W1)} and \code{power2(W2)} in the node formulas \code{mean} and \code{sd} with the \code{apply} function calls \code{apply(W1, 1, power2)} and \code{apply(W2, 1, power2)}, respectively:
<<>>=
power2 <- function(arg) arg^2
D <- DAG.empty()
D <- D +
  node("W1", distr = "rnorm", mean = 0, sd = 1) +
  node("W2", distr = "rnorm", mean = 0, sd = 1) +
  node("W3", distr = "rnorm", mean = power2(W1), sd = power2(W2))
@
This behavior results in increased computing time that can be avoided as described here. The list of known vectorized functions can be easily expanded to include any function \code{funname} by simply invoking the call \code{vecfun.add(funname)} prior to data simulation with the \code{sim} function. Doing this avoids the often unnecessary calls to the \code{apply} loops shown above and typically results in a significant performance boost for data simulation.\\
The performance gain from vectorization is demonstrated with the \code{power2} function in the example below.
We first examine the simulation time when \code{power2} is not added to the list of recognized vectorized functions:
<<>>=
D1 <- set.DAG(D)
(tnonvec <- system.time(sim1nonvec <- simobs(D1, n = 1000000, rndseed = 123)))
@
Second, we examine the simulation time after adding \code{power2} to the list of recognized vectorized functions:
<<>>=
vecfun.add(c("power2"))
D1vec <- set.DAG(D)
(tvec <- system.time(sim1vec <- simobs(D1vec, n = 1000000, rndseed = 123)))
all.equal(sim1nonvec, sim1vec)
@
We note that data simulation is approximately \Sexpr{round(tnonvec['elapsed']/tvec['elapsed'],0)} times faster with the second approach above.\\
To see the names of user-added recognized vectorized functions, call \code{vecfun.print}. To reset this list, call \code{vecfun.reset}. Note that the built-in list of known vectorized functions cannot be modified/reset.
<<>>=
vecfun.print()
vecfun.reset()
vecfun.print()
@
It is important to note that all functions referenced in a node formula that are not on the list of recognized vectorized functions, such as \code{ifelse1} in the example below, must be declared with at most one argument. Trying to declare such a function with more than one argument will result in an error. Indeed, the default behavior when parsing node formulas in \pkg{simcausal} is to aggregate the arguments of a node formula function not on the list of recognized vectorized functions into a matrix using a call to the \code{cbind} function. This matrix is then passed as a single argument to the \code{apply} loop that replaces the call to the user-specified node formula function. For example, with the custom version of the \code{ifelse} function defined below, the call \code{ifelse1(c(W1, 0.5, 0.1))} is automatically replaced with a call to \code{apply(cbind(W1, 0.5, 0.1), 1, ifelse1)}:
<<>>=
vecfun.reset()
ifelse1 <- function(arg) {
  ifelse(arg[1], arg[2], arg[3])
}
D <- DAG.empty()
D <- D +
  node("W1", distr = "rbern", prob = 0.05) +
  node("W2", distr = "rbern", prob = ifelse1(c(W1, 0.5, 0.1)))
D2nonvec <- set.DAG(D)
@
The restriction on the number of arguments of node formula functions is easily avoided by adding functions such as \code{ifelse1} above to the list of recognized vectorized functions. Doing so allows the function to be declared with an arbitrary number of arguments. The example below demonstrates the use of a second custom version of \code{ifelse}, called \code{ifelse2}, which is declared with three arguments, added to the list of recognized vectorized functions, and then called as part of the node formula of \code{W2}:
<<>>=
ifelse2 <- function(arg, val1, val2) {
  ifelse(arg, val1, val2)
}
vecfun.add(c("ifelse2"))
D <- DAG.empty()
D <- D +
  node("W1", distr = "rbern", prob = 0.05) +
  node("W2", distr = "rbern", prob = ifelse2(W1, 0.5, 0.1))
D2vec <- set.DAG(D)
@
The performance gain from vectorization is demonstrated below for the previous two custom versions of the \code{ifelse} function:
<<>>=
(t2nonvec <- system.time(sim2nonvec <- simobs(D2nonvec, n = 100000, rndseed = 123)))
(t2vec <- system.time(sim2vec <- simobs(D2vec, n = 100000, rndseed = 123)))
all(unlist(lapply(seq(ncol(sim2nonvec)), function(coli)
  all.equal(sim2nonvec[, coli], sim2vec[, coli]))))
@
We note that the above data simulation is approximately \Sexpr{round(t2nonvec['elapsed']/t2vec['elapsed'],0)} times faster with the \code{ifelse2} function than with the \code{ifelse1} function. We also note that this performance gain from vectorization grows with the sample size \code{n}.
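As a final illustration of this mechanism, the following hedged sketch (not run) combines the two steps above: it defines a user-specified multi-argument function that bounds a linear predictor away from 0 and 1, registers it with \code{vecfun.add}, and then uses it inside a node formula. The function name \code{bound01} and the node definitions are purely illustrative and not part of the package:
<<eval=FALSE>>=
## bound01 is a hypothetical helper: it truncates its (vectorized) input
## into [lo, hi] so the result can be used as a Bernoulli probability.
bound01 <- function(x, lo = 0.01, hi = 0.99) pmin(pmax(x, lo), hi)
vecfun.add("bound01")  # register it, so no apply() loop is generated
Db <- DAG.empty()
Db <- Db +
  node("W", distr = "rnorm", mean = 0, sd = 1) +
  node("A", distr = "rbern", prob = bound01(0.3 + 0.2 * W))
Dbset <- set.DAG(Db)
@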
\newpage{}
%*********************************************************************
\section{Simulation study with multiple time point interventions}\label{simstudy.timevar}
\addcontentsline{toc}{section}{Simulation study with multiple time point interventions}
In this example, we replicate results from the longitudinal data simulation protocol used in two published manuscripts \citep{neugebauer2014, neugebauer2015}. We first describe the structural equation model that implies the data generating distribution of the observed data, with a time-to-event outcome, as reported in Section 5.1 of \citet{neugebauer2015}. We then show how to specify this model using the \pkg{simcausal} \proglang{R} interface, simulate observed data, define static and dynamic interventions, simulate counterfactual data, and calculate various causal parameters based on these interventions. In particular, we replicate estimates of the true counterfactual risk differences under the dynamic interventions reported in \citet{neugebauer2014}.
\subsection{Specifying the structural equation model}\label{node2}
\addcontentsline{toc}{subsection}{Specifying the structural equation model}
In this section, we demonstrate how to specify the structural equation model described by the following longitudinal data simulation protocol (Section 5.1 of \citet{neugebauer2015}):
\begin{enumerate}
\item $L_{2}(0)\sim\mathcal{B}(0.05)$, where $\mathcal{B}$ denotes the Bernoulli distribution (e.g., $L_{2}(0)$ represents a baseline value of a time-dependent variable such as low versus high A1c)
\item If $L_{2}(0)=1$ then $L_{1}(0)\sim\mathcal{B}(0.5)$, else $L_{1}(0)\sim\mathcal{B}(0.1)$ (e.g., $L_{1}(0)$ represents a time-independent variable such as history of cardiovascular disease at baseline)
\item If $(L_{1}(0),L_{2}(0))=(1,0)$ then $A_{1}(0)\sim\mathcal{B}(0.5)$, else if $(L_{1}(0),L_{2}(0))=(0,0)$ then $A_{1}(0)\sim\mathcal{B}(0.1)$, else if $(L_{1}(0),L_{2}(0))=(1,1)$ then $A_{1}(0)\sim\mathcal{B}(0.9)$, else if $(L_{1}(0),L_{2}(0))=(0,1)$ then $A_{1}(0)\sim\mathcal{B}(0.5)$ (e.g., $A_{1}(0)$ represents the binary exposure to an intensified type 2 diabetes pharmacotherapy)
\item $A_{2}(0)\sim\mathcal{B}(0)$ (e.g., $A_{2}(0)$ represents occurrence of a right-censoring event)
\item for $t=1,\ldots,16$ and as long as $Y(t-1)=0$ (by convention, $Y(0)=0$):
\begin{enumerate}
\item $Y(t)\sim\mathcal{B}\left(\frac{1}{1+\exp\left(-\left(-6.5+L_{1}(0)+4L_{2}(t-1)+0.05\sum_{j=0}^{t-1}I(L_{2}(j)=0)\right)\right)}\right)$ (e.g., $Y(t)$ represents the indicator of failure such as onset or progression of albuminuria)
\item If $A_{1}(t-1)=1$ then $L_{2}(t)\sim\mathcal{B}(0.1)$, else if $L_{2}(t-1)=1$ then $L_{2}(t)\sim\mathcal{B}(0.9)$, else $L_{2}(t)\sim\mathcal{B}(\min(1,0.1+t/16))$
\item If $A_{1}(t-1)=1$ then $A_{1}(t)=1$, else if $(L_{1}(0),L_{2}(t))=(1,0)$ then $A_{1}(t)\sim\mathcal{B}(0.3)$, else if $(L_{1}(0),L_{2}(t))=(0,0)$ then $A_{1}(t)\sim\mathcal{B}(0.1)$, else if $(L_{1}(0),L_{2}(t))=(1,1)$ then $A_{1}(t)\sim\mathcal{B}(0.7)$, else if $(L_{1}(0),L_{2}(t))=(0,1)$ then $A_{1}(t)\sim\mathcal{B}(0.5)$
\item If $t=16$ then $A_{2}(t)\sim\mathcal{B}(1)$ (e.g., administrative end of study), else $A_{2}(t)\sim\mathcal{B}(0)$ (e.g., no right-censoring).
\end{enumerate}
\end{enumerate}
\newpage{}
First, the example below shows how to define the nodes \code{L2}, \code{L1}, \code{A1} and \code{A2} at time point \code{t = 0} as Bernoulli random variables, using the distribution function \code{"rbern"}:
% get rid of the gray background:
% <>=
<<>>=
library("simcausal")
options(simcausal.verbose = FALSE)
D <- DAG.empty()
D <- D +
  node("L2", t = 0, distr = "rbern", prob = 0.05) +
  node("L1", t = 0, distr = "rbern", prob = ifelse(L2[0] == 1, 0.5, 0.1)) +
  node("A1", t = 0, distr = "rbern",
       prob = ifelse(L1[0] == 1 & L2[0] == 0, 0.5,
                ifelse(L1[0] == 0 & L2[0] == 0, 0.1,
                  ifelse(L1[0] == 1 & L2[0] == 1, 0.9, 0.5)))) +
  node("A2", t = 0, distr = "rbern", prob = 0, EFU = TRUE)
@
Second, the example below shows how one may use the \code{node} function with node formulas based on the square bracket function \code{'['} to easily define the time-varying nodes \code{Y}, \code{L2}, \code{A1} and \code{A2} simultaneously for all subsequent time points \code{t} ranging from 1 to 16:
<<>>=
t.end <- 16
D <- D +
  node("Y", t = 1:t.end, distr = "rbern",
       prob = plogis(-6.5 + L1[0] + 4 * L2[t-1] +
                     0.05 * sum(I(L2[0:(t-1)] == rep(0, t)))),
       EFU = TRUE) +
  node("L2", t = 1:t.end, distr = "rbern",
       prob = ifelse(A1[t-1] == 1, 0.1,
                ifelse(L2[t-1] == 1, 0.9, min(1, 0.1 + t / 16)))) +
  node("A1", t = 1:t.end, distr = "rbern",
       prob = ifelse(A1[t-1] == 1, 1,
                ifelse(L1[0] == 1 & L2[t] == 0, 0.3,
                  ifelse(L1[0] == 0 & L2[t] == 0, 0.1,
                    ifelse(L1[0] == 1 & L2[t] == 1, 0.7, 0.5))))) +
  node("A2", t = 1:t.end, distr = "rbern",
       prob = {if (t == 16) {1} else {0}}, EFU = TRUE)
lDAG <- set.DAG(D)
@
Note that the \code{node} formulas specified with the \code{prob} argument above use the generic time variable \code{t} both outside and inside the square-bracket vector syntax. For example, the conditional distribution of the time-varying node \code{Y} is defined by an \proglang{R} expression that contains the syntax \code{sum(I(L2[0:(t - 1)] == rep(0, t)))}, which evaluates to different \proglang{R} expressions as \code{t} ranges from 1 to 16:
\begin{enumerate}
\item \code{sum(I(L2[0] == 0))}, for \code{t = 1}; and
\item \code{sum(I(L2[0:1] == c(0, 0)))}, for \code{t = 2}, \ldots, \code{sum(I(L2[0:16] == c(0, ..., 0)))}, for \code{t = 16}.
\end{enumerate}
For more details on the specification of node formulas, see Section~\ref{custom}.\\
One can visualize the observed data generating distribution defined in the \code{lDAG} object as shown in Figures~\ref{fig:DAGlong} and \ref{fig:DAGlongtmax3} by calling \code{plotDAG}. The directional arrows represent the functional dependencies in the structural equation model. More specifically, the node of origin of each arrow is a node name extracted from the \emph{node formula(s)}. Note that the appearance of the resulting diagram can be customized with additional arguments, as shown in the following examples.
The argument \code{tmax} can be used to restrict the plotting of the DAG object to only the nodes indexed by time points below the user-specified \code{tmax} value, as shown in Figure~\ref{fig:DAGlongtmax3}.\\
Figure~\ref{fig:DAGlong} is created by running the following code:
<<>>=
plotDAG(lDAG, xjitter = 0.3, yjitter = 0.01)
@
Figure~\ref{fig:DAGlongtmax3} is created by running the following code:
<<>>=
plotDAG(lDAG, tmax = 3, xjitter = 0.3, yjitter = 0.03,
        edge_attrs = list(width = 0.5, arrow.width = 0.4, arrow.size = 0.8),
        vertex_attrs = list(size = 12, label.cex = 0.8))
@
<<>>=
plotDAG(lDAG, xjitter = 0.3, yjitter = 0.01)
@
<<>>=
plotDAG(lDAG, tmax = 3, xjitter = 0.3, yjitter = 0.03,
        edge_attrs = list(width = 0.5, arrow.width = 0.4, arrow.size = 0.8),
        vertex_attrs = list(size = 12, label.cex = 0.8))
@
\subsection[Simulating observed data (sim)]{Simulating observed data (\code{sim})}\label{simobs2}
\addcontentsline{toc}{subsection}{Simulating observed data (sim)}
Simulating observed data is accomplished by calling the function \code{sim} and specifying its arguments \code{DAG} and \code{n} that indicate the causal model and sample size of interest. Below is an example of how to simulate an observed dataset with $100$ observations using the causal model defined previously. The output is a \code{data.frame} object.
<<>>=
Odat <- sim(DAG = lDAG, n = 100, rndseed = 123)
@
The format of the output dataset is easily understood by examining the first 10 columns of the first row of the \code{data.frame} object:
<<>>=
Odat[1, 1:10]
@
\subsection[Specifying interventions (+ action)]{Specifying interventions (\code{+ action})}\label{actions2}
\addcontentsline{toc}{subsection}{Specifying interventions (+ action)}
\subsubsection{Dynamic interventions}\label{dynact}
The following two dynamic interventions on the time-varying node \code{A1} of the structural equation model encoded by the previously defined \code{lDAG} object were studied in \citet{neugebauer2014}: `\emph{Initiate treatment $A_{1}$ the first time $t$ that the covariate $L_{2}$ is greater than or equal to $\theta$ and continue treatment thereafter (i.e., $\bar{A}_{1}(t-1)=0$ and $A_{1}(t)=1, A_{1}(t+1)=1$, $\ldots$)}', for $\theta=0,1$. The example below demonstrates how to specify these two dynamic interventions.\\
First, we define the list of intervention nodes and their post-intervention distributions. Note that these distributions are indexed by the attribute \code{theta}, whose value is not yet defined:
<<>>=
act_theta <- c(node("A1", t = 0, distr = "rbern",
                    prob = ifelse(L2[0] >= theta, 1, 0)),
               node("A1", t = 1:(t.end), distr = "rbern",
                    prob = ifelse(A1[t-1] == 1, 1,
                             ifelse(L2[t] >= theta, 1, 0))))
@
Second, we add the two dynamic interventions to the \code{lDAG} object while defining the value of \code{theta} for each intervention:
<<>>=
Ddyn <- lDAG
Ddyn <- Ddyn + action("A1_th0", nodes = act_theta, theta = 0)
Ddyn <- Ddyn + action("A1_th1", nodes = act_theta, theta = 1)
@
We refer to the argument \code{theta} passed to the \code{+action} function as an \emph{action attribute}.\\
One can select and inspect particular actions saved in a DAG object by invoking the function \code{A()}:
<<>>=
class(A(Ddyn)[["A1_th0"]])
A(Ddyn)[["A1_th0"]]
@
Figure~\ref{fig:actDAGlongtmax3} shows the plot of the \code{DAG.action} object associated with the dynamic intervention named \code{"A1_th0"} (with plotting restricted to nodes indexed by time points lower than or equal to 3).
Note that the intervention nodes are marked in red and that the action attribute \code{theta} is represented as a separate node. The following code is used to generate Figure~\ref{fig:actDAGlongtmax3}:
<<>>=
plotDAG(A(Ddyn)[["A1_th0"]], tmax = 3, xjitter = 0.3, yjitter = 0.03,
        edge_attrs = list(width = 0.5, arrow.width = 0.4, arrow.size = 0.8),
        vertex_attrs = list(size = 15, label.cex = 0.7))
@
<<>>=
plotDAG(A(Ddyn)[["A1_th0"]], tmax = 3, xjitter = 0.3, yjitter = 0.03,
        edge_attrs = list(width = 0.5, arrow.width = 0.4, arrow.size = 0.8),
        vertex_attrs = list(size = 15, label.cex = 0.7))
@
The distribution of some or all of the intervention nodes that define an action saved within a \code{DAG} object can be modified by adding a new intervention object with the same action name to the \code{DAG} object. The new intervention object can involve actions on only a subset of the original intervention nodes, for a partial modification of the original action definition. For example, the code below demonstrates how the existing action \code{"A1\_th0"} with the previously defined dynamic and deterministic intervention on the node \code{A1[0]} is partially modified by replacing the intervention distribution for the node \code{A1[0]} with a deterministic and static intervention defined by a degenerate distribution at value $0$. Note that the other intervention nodes previously defined as part of the action \code{"A1\_th0"} remain unchanged.
<<>>=
A(Ddyn)[["A1_th0"]]$A1_0
Ddyntry <- Ddyn +
  action("A1_th0", nodes = node("A1", t = 0, distr = "rbern", prob = 0))
A(Ddyntry)[["A1_th0"]]$A1_0
@
Similarly, some or all of the action attributes that define an action saved within a \code{DAG} object can be modified by adding a new intervention object with the same action name but a different attribute value to the \code{DAG} object. This functionality is demonstrated with the example below, in which the previous value $0$ of the action attribute \code{theta} that defines the action named \code{"A1_th0"} is replaced with the value $1$, and in which a new attribute \code{newparam} is simultaneously added to the previously defined action \code{"A1_th0"}:
<<>>=
A(Ddyntry)[["A1_th0"]]
Ddyntry <- Ddyntry +
  action("A1_th0", nodes = act_theta, theta = 1, newparam = 100)
A(Ddyntry)[["A1_th0"]]
@
Finally, we note that an action attribute can also be defined as a time-varying vector, rather than a scalar, i.e., a vector of scalars that are each indexed by a time point. This functionality is demonstrated in the example below to define interventions that are indexed by an attribute \code{theta} whose value changes over time. Note that in this example the square-bracket syntax \code{theta[t]} is used for referencing the time-varying values of the action attribute \code{theta}. More details on the use of time-varying action attributes are provided in the next section.
<<>>=
act_theta_t <- c(node("A1", t = 0, distr = "rbern",
                      prob = ifelse(L2[0] >= theta[t], 1, 0)),
                 node("A1", t = 1:t.end, distr = "rbern",
                      prob = ifelse(A1[t-1] == 1, 1,
                               ifelse(L2[t] >= theta[t], 1, 0))))
Ddyntry <- Ddyntry +
  action("A1_th0", nodes = act_theta_t, theta = rep(0, t.end + 1))
A(Ddyntry)[["A1_th0"]]
@
\subsubsection{Static interventions}\label{staticact}
Here we diverge from the replication of simulation results presented in \citet{neugebauer2014}. Instead, we build on the structural equation model introduced in that paper to illustrate the specification of static interventions on the treatment nodes \code{A1}.
These static interventions are defined by earlier or later treatment initiation during follow-up, followed by treatment continuation thereafter. Each of these static interventions is thus uniquely identified by the time when the measurements of the time-varying node \code{A1} switch from value $0$ to $1$. The time of this value switch is represented by the parameter \code{tswitch} in the code below. Note that the value \code{tswitch = 16} identifies the static intervention corresponding to no treatment initiation during follow-up in our example, while the values $0$ through $15$ identify $16$ distinct interventions with increasingly delayed treatment initiation during follow-up.\\
First, we define the list of intervention nodes and their post-intervention distributions. Note that these distributions are indexed by the attribute \code{tswitch}, whose value is not yet defined:
<<>>=
`%+%` <- function(a, b) paste0(a, b)
Dstat <- lDAG
act_A1_tswitch <- node("A1", t = 0:(t.end), distr = "rbern",
                       prob = ifelse(t >= tswitch, 1, 0))
@
Second, we add the 17 static interventions to the \code{lDAG} object while defining the value of \code{tswitch} for each intervention:
<<>>=
tswitch_vec <- (0:t.end)
for (tswitch_i in tswitch_vec) {
  abar <- rep(0, length(tswitch_vec))
  abar[which(tswitch_vec >= tswitch_i)] <- 1
  Dstat <- Dstat + action("A1_ts"%+%tswitch_i, nodes = act_A1_tswitch,
                          tswitch = tswitch_i, abar = abar)
}
@
Note that, in addition to the action attribute \code{tswitch}, each intervention is also indexed by an additional action attribute \code{abar} that also uniquely identifies the intervention and that represents the actual sequence of treatment decisions that defines the intervention, i.e., $\bar{a}(tswitch-1)=0$, $a(tswitch)=1$, $\ldots$:
<<>>=
A(Dstat)[["A1_ts3"]]
@
The purpose of this additional action attribute \code{abar} will become clear when we illustrate the definition of target parameters defined by working MSMs based on these 17 static interventions in Section~\ref{MSMwS.2} (Example 7 of \code{set.targetMSM}).\\
Figure~\ref{fig:act2DAGlongtmax3} shows the plot of the \code{DAG.action} object associated with the action named \code{"A1_ts3"} (with plotting restricted to nodes indexed by time points lower than or equal to 3 and with exclusion of the action attribute \code{"abar"} from the plot). Note that the intervention nodes are marked in red by default. The following code is used to generate Figure~\ref{fig:act2DAGlongtmax3}:
<<>>=
plotDAG(A(Dstat)[["A1_ts3"]], tmax = 3, xjitter = 0.3, yjitter = 0.03,
        edge_attrs = list(width = 0.5, arrow.width = 0.4, arrow.size = 0.8),
        vertex_attrs = list(size = 15, label.cex = 0.7),
        excludeattrs = "abar")
@
<<>>=
plotDAG(A(Dstat)[["A1_ts3"]], tmax = 3, xjitter = 0.3, yjitter = 0.03,
        edge_attrs = list(width = 0.5, arrow.width = 0.4, arrow.size = 0.8),
        vertex_attrs = list(size = 15, label.cex = 0.7),
        excludeattrs = "abar")
@
\subsection[Simulating counterfactual data (sim)]{Simulating counterfactual data (\code{sim})}\label{simfull2}
\addcontentsline{toc}{subsection}{Simulating counterfactual data (sim)}
Simulating counterfactual data is accomplished by calling the function \code{sim} and specifying its arguments \code{DAG}, \code{actions} and \code{n} to indicate the causal model, interventions, and sample size of interest. The counterfactual data can be simulated for all actions stored in the \code{DAG} object or for a subset by setting the \code{actions} argument to the vector of the desired action names.
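For instance, the following minimal sketch (not run; the object name \code{Xsub} is illustrative) simulates counterfactual data for a single selected action by passing just that action's name:
<<eval=FALSE>>=
## Simulate counterfactual data for one action only, selected by name:
Xsub <- sim(Dstat, actions = "A1_ts3", n = 100, rndseed = 123)
@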
\subsubsection{Dynamic interventions}\label{simfull2.dyn}
The example below shows how to use the \code{sim} function to simulate 100 observations for each of the two dynamic actions, \code{"A1_th0"} and \code{"A1_th1"}, defined in Section~\ref{dynact}. The call to \code{sim} below produces a list of two named \code{data.frame} objects, where each \code{data.frame} object contains observations simulated from the post-intervention distribution defined by one particular action.
<<>>=
Xdyn <- sim(Ddyn, actions = c("A1_th0", "A1_th1"), n = 100, rndseed = 123)
nrow(Xdyn[["A1_th0"]])
nrow(Xdyn[["A1_th1"]])
names(Xdyn)
@
The default format of the output list generated by the \code{sim} function is easily understood by examining the first $15$ columns of the first row of each \code{data.frame} object:
<<>>=
Xdyn[["A1_th0"]][1, 1:15]
Xdyn[["A1_th1"]][1, 1:15]
@
\subsubsection{Static interventions}\label{simfull2.stat}
This example shows how to use the \code{sim} function to simulate 100 observations for each of the 17 static actions defined in Section~\ref{staticact}. The call to the \code{sim} function below produces a list of named \code{data.frame} objects, where each \code{data.frame} object contains observations simulated from the post-intervention distribution defined by one particular action.
<<>>=
Xstat <- sim(Dstat, actions = names(A(Dstat)), n = 100, rndseed = 123)
length(Xstat)
nrow(Xstat[["A1_ts3"]])
@
The default format of the output list generated by the \code{sim} function is easily understood by examining the first row of the \code{data.frame} object associated with the action \code{"A1_ts3"}:
<<>>=
Xstat[["A1_ts3"]][1, ]
@
\subsection[Converting datasets from wide to long format (DF.to.long)]{Converting a dataset from \emph{wide} to \emph{long} format (\code{DF.to.long})}\label{longdat}
\addcontentsline{toc}{subsection}{Converting datasets from wide to long format (DF.to.long)}
The specification of structural equation models based on time-varying nodes such as the one described in Section~\ref{node2} allows for simulated (observed or counterfactual) data to be structured in either long or wide format. Below, we illustrate these two alternatives. We note that, by default, simulated (observed or counterfactual) data from the \code{sim} function are stored in wide format.
The data output format from the \code{sim} function can, however, be changed to the long format by setting the \code{wide} argument of the \code{sim} function to \code{FALSE} or, equivalently, by applying the function \code{DF.to.long} to an existing simulated dataset in wide format.\\
The following code demonstrates the default data formatting behavior of the \code{sim} function and how this behavior can be modified to generate data in the long format:
<<>>=
Odat.wide <- sim(DAG = lDAG, n = 100, wide = TRUE, rndseed = 123)
Odat.wide[1:2, 1:18]
Odat.long <- sim(DAG = lDAG, n = 100, wide = FALSE, rndseed = 123)
Odat.long[1:5, ]
@
As with observed data, the default behavior of the \code{sim} function can be changed so that simulated counterfactual data are instead structured in long format:
<<>>=
lXdyn <- sim(Ddyn, actions = c("A1_th0", "A1_th1"), n = 1000,
             wide = FALSE, rndseed = 123)
head(lXdyn[["A1_th0"]], 5)
@
The following example demonstrates how the function \code{DF.to.longDT} can be used to convert simulated (counterfactual or observed) data stored in wide format to long format (note that \code{DF.to.longDT} internally uses the \pkg{data.table} package, but returns a \code{data.frame} by default):
<<>>=
Odat.long2 <- DF.to.longDT(Odat.wide)
Odat.long2[1:5, ]
@
\subsection[Implementing imputation by last time point value carried forward (doLTCF)]{Implementing imputation by \emph{last time point value carried forward} (\code{doLTCF})}\label{LTCFdat}
\addcontentsline{toc}{subsection}{Implementing imputation by last time point value carried forward (doLTCF)}
As described in Sections~\ref{SEMintro} and \ref{introsim}, the default behavior of the \code{sim} function consists in setting all nodes that temporally follow an \code{EFU} node whose simulated value is 1 to missing (i.e., \code{NA}). The argument \code{LTCF} of the \code{sim} function can, however, be used to change this default behavior and impute some of these missing values with \emph{last time point value carried forward} (LTCF). More specifically, only the missing values of time-varying nodes (i.e., those with a non-missing \code{t} argument) that follow the end of follow-up event encoded by the \code{EFU} node specified by the \code{LTCF} argument will be imputed. Equivalently, one can use the function \code{doLTCF} to apply the same \emph{last time point value carried forward} imputation to an existing simulated dataset obtained from the function \code{sim} that was called with its default imputation setting (i.e., with no \code{LTCF} argument). Illustration of the use of this LTCF imputation functionality is given in Section~\ref{Ex1survMSM} (Example 1 of \code{set.targetMSM}).\\
The following code demonstrates the default data format of the \code{sim} function after an end of follow-up event and how this behavior can be modified to generate data with LTCF imputation, by either using the \code{LTCF} argument of the \code{sim} function or by calling the \code{doLTCF} function.
<<>>=
Odat.wide <- sim(DAG = lDAG, n = 1000, rndseed = 123)
Odat.wide[c(11, 76), 1:18]
Odat.wideLTCF <- sim(DAG = lDAG, n = 1000, LTCF = "Y", rndseed = 123)
Odat.wideLTCF[c(11, 76), 1:18]
@
The code below demonstrates how to use the \code{doLTCF} function to perform LTCF imputation on already existing data simulated with the \code{sim} function based on its default non-imputation behavior:
<<>>=
Odat.wideLTCF2 <- doLTCF(data = Odat.wide, LTCF = "Y")
Odat.wideLTCF2[c(11, 76), 1:18]
@
\subsection{Defining and evaluating various causal target parameters}\label{targets2}
\addcontentsline{toc}{subsection}{Defining and evaluating various causal target parameters}
\subsubsection[Causal parameters defined with set.targetE]{Causal parameters defined with \code{set.targetE}}\label{targets2NP}
\paragraph{Example 1.} In the example below, we first define two causal target parameters as two vectors, each containing the expectations of the node \code{Y[t]}, for time points \code{t} = 1, $\ldots$, 16, under the post-intervention distribution defined by one of the two dynamic interventions \code{"A1_th0"} and \code{"A1_th1"} defined in Section~\ref{dynact}. Second, we evaluate these target parameters using the counterfactual data simulated previously in Section~\ref{simfull2.dyn} and we map the resulting estimates of cumulative risks into estimates of survival probabilities. We also plot the corresponding two counterfactual survival curves using the \pkg{simcausal} routine \code{plotSurvEst}, as shown in Figure~\ref{fig:survfig1}. Finally, we note that Figure~\ref{fig:survfig1} replicates the simulation study results reported in Figure 4 of \citet{neugebauer2014}.
<<>>=
Ddyn <- set.targetE(Ddyn, outcome = "Y", t = 1:16, param = "A1_th1")
surv_th1 <- 1 - eval.target(Ddyn, data = Xdyn)$res
Ddyn <- set.targetE(Ddyn, outcome = "Y", t = 1:16, param = "A1_th0")
surv_th0 <- 1 - eval.target(Ddyn, data = Xdyn)$res
@
<<>>=
plotSurvEst(surv = list(d_theta1 = surv_th1, d_theta0 = surv_th0),
            xindx = 1:16,
            ylab = "Counterfactual survival for each intervention",
            ylim = c(0.75, 1.0))
@
\paragraph{Example 2.} In the example below, we first define the causal target parameters as the ATE on the additive scale (cumulative risk difference) and the ATE on the multiplicative scale (cumulative risk ratio), for the two dynamic interventions (\code{"A1_th1"} and \code{"A1_th0"}) defined in Section~\ref{dynact} at time point \code{t = 12}. Second, we evaluate these target parameters using the previously simulated counterfactual data from Section~\ref{simfull2.dyn}.\\
ATE on the additive scale:
<<>>=
Ddyn <- set.targetE(Ddyn, outcome = "Y", t = 12, param = "A1_th1-A1_th0")
(psi <- eval.target(Ddyn, data = Xdyn)$res)
@
ATE on the multiplicative scale:
<<>>=
Ddyn <- set.targetE(Ddyn, outcome = "Y", t = 12, param = "A1_th0/A1_th1")
eval.target(Ddyn, data = Xdyn)$res
@
We also note that the above result for the ATE on the additive scale (\code{psi} = \Sexpr{psi}) replicates the simulation result reported for $\psi$ in Section 5.1 and Figure 4 of \citet{neugebauer2014}, where $\psi$ was defined as the difference between the cumulative risks of failure at $t_0 = 12$ for the two dynamic interventions $d_1$ and $d_0$.
\subsubsection[Causal parameters defined with set.targetMSM]{Causal parameters defined with \code{set.targetMSM}}\label{targets2MSM}
In Section~\ref{targets1MSM}, we described the arguments of the function \code{set.targetMSM} that the user must specify to define MSM target causal parameters.
They include the specification of the argument \code{form}, which encodes the working MSM formula. This formula can only be a function of the time index \code{t}, action attributes that uniquely identify each intervention of interest, and baseline nodes (defined as nodes that precede the earliest intervention node). Both baseline nodes that are measurements of time-varying nodes and time-varying action attributes must be referenced in the \proglang{R} expression passed to the \code{form} argument within the wrapping syntax \code{S(...)}, as illustrated in several examples below.
\paragraph{Example 1. Working dynamic MSM for survival probabilities over time.}\label{Ex1survMSM}
Here, we illustrate the evaluation of the counterfactual survival curves $1-E(Y_{d_{\theta}}(t))$ for $t=1,\ldots,16$ under the dynamic interventions $d_{\theta}$, for $\theta=0,1$, introduced in Section~\ref{dynact}, using the following pooled working logistic MSM for the cumulative risks $E(Y_{d_{\theta}}(t))$:
$$
\mbox{expit}\left(\alpha_{0}+\alpha_{1}\theta+\alpha_{2}t+\alpha_{3}t\theta\right),
$$
where the true values of the coefficients $(\alpha_{i},i=0,\ldots,3)$ define the target parameters of interest. First, we define these target parameters:
<<>>=
msm.form <- "Y ~ theta + t + I(theta*t)"
Ddyn <- set.targetMSM(Ddyn, outcome = "Y", t = 1:16, form = msm.form,
                      family = "binomial", hazard = FALSE)
@
Note that when the outcome is a time-varying node of type \code{EFU}, the argument \code{hazard = FALSE} indicates that the working MSM of interest is a model for survival probabilities. The argument \code{family = "binomial"} indicates that the working model is a logistic model. Second, we evaluate the coefficients of the working model:
<<>>=
MSMres <- eval.target(Ddyn, n = 1000, rndseed = 123)
MSMres$coef
@
We also note that no previously simulated counterfactual data were passed as an argument to the function \code{eval.target} above. Instead, the sample size argument \code{n} was specified and the routine thus first samples \code{n = 1000} observations from each of the two post-intervention distributions before fitting the working MSM with these counterfactual data to derive estimates of the true coefficient values. Alternatively, the user could have passed the previously simulated counterfactual data. Note, however, that in this case the user must either simulate counterfactual data by calling the \code{sim} function with the argument \code{LTCF = "Y"} or convert the previously simulated counterfactual data with the \emph{last time point value carried forward} imputation function \code{doLTCF}. Both approaches are described in Section~\ref{LTCFdat} and the latter approach is demonstrated in the example below, where we first impute the \code{EFU} outcome \code{Y} in the previously simulated counterfactual data \code{Xdyn}.
<<>>=
XdynLTCF <- lapply(Xdyn, doLTCF, LTCF = "Y")
eval.target(Ddyn, data = XdynLTCF)$coef
@
The resulting coefficient estimates can be mapped into estimates of the two counterfactual survival curves and plotted as shown in Figure~\ref{fig:survfig2} using the \code{plotSurvEst} function:
<<>>=
surv_th0 <- 1 - predict(MSMres$m,
                        newdata = data.frame(theta = rep(0, 16), t = 1:16),
                        type = "response")
surv_th1 <- 1 - predict(MSMres$m,
                        newdata = data.frame(theta = rep(1, 16), t = 1:16),
                        type = "response")
plotSurvEst(surv = list(MSM_theta1 = surv_th1, MSM_theta0 = surv_th0),
            xindx = 1:16, ylab = "MSM Survival, P(T>t)", ylim = c(0.75, 1.0))
@
\paragraph{Example 2.
More complex working dynamic MSM for survival probabilities over time.} The previous example can be modified to illustrate the evaluation of the survival curves of interest with a more flexible (i.e., more non-parametric) working MSM as follows:
<<>>=
msm.form <- "Y ~ theta + t + I(t^2) + I(t^3) + I(t^4) + I(t^5) + I(t*theta) + I(t^2*theta) + I(t^3*theta) + I(t^4*theta) + I(t^5*theta)"
Ddyn <- set.targetMSM(Ddyn, outcome = "Y", t = 1:16, form = msm.form,
                      family = "binomial", hazard = FALSE)
MSMres2 <- eval.target(Ddyn, n = 1000, rndseed = 123)
MSMres2$coef
@
<<>>=
surv_th0 <- 1 - predict(MSMres2$m,
                        newdata = data.frame(theta = rep(0, 16), t = 1:16),
                        type = "response")
surv_th1 <- 1 - predict(MSMres2$m,
                        newdata = data.frame(theta = rep(1, 16), t = 1:16),
                        type = "response")
plotSurvEst(surv = list(MSM_theta1 = surv_th1, MSM_theta0 = surv_th0),
            xindx = 1:16, ylab = "MSM Survival, P(T>t)", ylim = c(0.75, 1.0))
@
The resulting estimates of the survival curves shown in Figure~\ref{fig:survfig3} are indeed visually closer to the true survival curves reported in Figure 4 of \citet{neugebauer2014}.
\paragraph{Example 3. Saturated dynamic MSM for survival probabilities over time.}\label{saturMSM}
Here, we further modify the working model formula by specifying a saturated MSM to directly replicate the results reported in Figure 4 of \citet{neugebauer2014} that are based on a non-parametric MSM approach:
<<>>=
msm.form <- "Y ~ theta + as.factor(t) + as.factor(t):theta"
Ddyn <- set.targetMSM(Ddyn, outcome = "Y", t = 1:16, form = msm.form,
                      family = "binomial", hazard = FALSE)
MSMres3 <- eval.target(Ddyn, n = 1000, rndseed = 123)
MSMres3$coef
@
<<>>=
surv_th0 <- 1 - predict(MSMres3$m,
                        newdata = data.frame(theta = rep(0, 16), t = 1:16),
                        type = "response")
surv_th1 <- 1 - predict(MSMres3$m,
                        newdata = data.frame(theta = rep(1, 16), t = 1:16),
                        type = "response")
plotSurvEst(surv = list(MSM_theta1 = surv_th1, MSM_theta0 = surv_th0),
            xindx = 1:16, ylab = "MSM Survival, P(T>t)", ylim = c(0.75, 1.0))
@
The resulting estimates of the survival curves shown in Figure~\ref{fig:survfig4} approximate those reported in Figure 4 of \citet{neugebauer2014}. Differences can be explained by the relatively small sample size of the counterfactual data on which this plot is based (\code{n = 1000} here vs. \code{n = 1,000,000} in the simulation study from \citet{neugebauer2014}).
\paragraph{Example 4. Working dynamic MSM for discrete-time hazards over time.}
Here, we illustrate the evaluation of the discrete-time hazards $E(Y_{d_{\theta}}(t)\:|\:Y_{d_{\theta}}(t-1)=0)$, for $t=1,\ldots,16$, under the dynamic interventions $d_{\theta}$, for $\theta=0,1$, introduced in Section~\ref{dynact}, using the following pooled working logistic MSM:
$$
\mbox{expit}\left(\alpha_{0}+\alpha_{1}\theta+\alpha_{2}t+\alpha_{3}t\theta\right),
$$
where the true values of the coefficients $(\alpha_{i},i=0,\ldots,3)$ define the target parameters of interest.
First, we define these target parameters:
<<>>=
msm.form <- "Y ~ theta + t + I(theta*t)"
Ddyn <- set.targetMSM(Ddyn, outcome = "Y", t = 1:16, form = msm.form,
                      family = "binomial", hazard = TRUE)
@
Note that when the outcome is a time-varying node of type \code{EFU}, the argument \code{hazard = TRUE} indicates that the working MSM of interest is now a model for hazards.\\
Second, we evaluate the coefficients of the working model:
<<>>=
MSMres <- eval.target(Ddyn, n = 1000, rndseed = 123)
MSMres$coef
@
Note that no previously simulated counterfactual data were passed as an argument to the call to \code{eval.target} above. Instead, the sample size argument \code{n} was specified and the routine thus first samples \code{n = 1000} observations from each of the two post-intervention distributions before fitting the working MSM with these counterfactual data to derive estimates of the true coefficient values. Alternatively, the user could have passed the previously simulated counterfactual data using the argument \code{data = Xdyn}. The resulting coefficient estimates can be mapped into estimates of the two counterfactual hazard curves and plotted as shown in Figure~\ref{fig:survfig5} using the \code{plotSurvEst} function:
<<>>=
h_th0 <- predict(MSMres$m,
                 newdata = data.frame(theta = rep(0, 16), t = 1:16),
                 type = "response")
h_th1 <- predict(MSMres$m,
                 newdata = data.frame(theta = rep(1, 16), t = 1:16),
                 type = "response")
plotSurvEst(surv = list(MSM_theta1 = h_th1, MSM_theta0 = h_th0),
            xindx = 1:16, ylab = "MSM hazard function", ylim = c(0.0, 0.03))
@
Alternatively, the resulting coefficient estimates can also be mapped into estimates of the two counterfactual survival curves and plotted as shown in Figure~\ref{fig:survfig6} using the \code{plotSurvEst} function:
<<>>=
Surv_h_th0 <- cumprod(1 - h_th0)
Surv_h_th1 <- cumprod(1 - h_th1)
plotSurvEst(surv = list(MSM_theta1 = Surv_h_th1, MSM_theta0 = Surv_h_th0),
            xindx = 1:16,
            ylab = "Survival P(T>t) derived from MSM hazard",
            ylim = c(0.75, 1.0))
@
\paragraph{Example 5. Working dynamic MSM to evaluate effect modification by a time-independent covariate.}
The previous example can be modified to illustrate the evaluation of effect modification by a baseline covariate through the inclusion of an interaction term between $\theta$ and $L_1$ in the working logistic MSM for $E(Y_{d_{\theta}}(t)\:|\:Y_{d_{\theta}}(t-1)=0,L_{1})$:
$$
\mbox{expit}\left(\alpha_{0}+\alpha_{1}\theta+\alpha_{2}t+\alpha_{3}t\theta+\alpha_{4}\theta L_{1}\right),\ \mbox{for }t=1,\ldots,16 \textrm{ and } \theta=0,1,
$$
where the true values of the coefficients $\alpha_{i}$, for $i=0,\ldots,4$, define the target parameters of interest.
First, we define and estimate these target parameters:
<<>>=
msm.form_sum <- "Y ~ theta + t + I(theta*t) + I(theta*L1)"
Ddyn <- set.targetMSM(Ddyn, outcome = "Y", t = 1:16, form = msm.form_sum,
                      family = "binomial", hazard = TRUE)
MSMres <- eval.target(Ddyn, n = 1000, rndseed = 123)
MSMres$coef
@
Second, we map the coefficient estimates into counterfactual survival curves within subgroups defined by the levels of the baseline covariate \code{L1}:
<<>>=
get_haz <- function(thetaval, L1val) {
  predict(MSMres$m,
          newdata = data.frame(theta = rep(thetaval, 16), t = 1:16,
                               L1 = L1val),
          type = "response")
}
Sth0L1_0 <- cumprod(1 - get_haz(thetaval = 0, L1val = 0))
Sth1L1_0 <- cumprod(1 - get_haz(thetaval = 1, L1val = 0))
Sth0L1_1 <- cumprod(1 - get_haz(thetaval = 0, L1val = 1))
Sth1L1_1 <- cumprod(1 - get_haz(thetaval = 1, L1val = 1))
@
Third, we plot the resulting survival curves for each subgroup of interest as shown in Figure~\ref{fig:survfig7}:
<<>>=
par(mfrow = c(1, 2))
plotSurvEst(surv = list(MSM_theta1 = Sth1L1_0, MSM_theta0 = Sth0L1_0),
            xindx = 1:16, ylab = "Survival P(T>t), for L1=0",
            ylim = c(0.5, 1.0))
plotSurvEst(surv = list(MSM_theta1 = Sth1L1_1, MSM_theta0 = Sth0L1_1),
            xindx = 1:16, ylab = "Survival P(T>t), for L1=1",
            ylim = c(0.5, 1.0))
@
\paragraph{Example 6. Working dynamic MSM to evaluate effect modification by the baseline measurement of a time-dependent covariate.}\label{MSMwS.1}
Here, the previous example is modified to illustrate the evaluation of effect modification by the baseline measurement of the time-varying node \code{L2}, using the following working logistic MSM:
$$
\mbox{expit}\left(\alpha_{0}+\alpha_{1}\theta+\alpha_{2}t+\alpha_{3}t\theta+\alpha_{4}\theta L_{2}(0)\right),
$$
where the true values of the coefficients $\alpha_{i}$, for $i=0,\ldots,4$, define the target parameters of interest. First, we define and estimate these target parameters:
<<>>=
msm.form.correct <- "Y ~ theta + t + I(theta*t) + I(theta * S(L2[0]))"
Ddyn <- set.targetMSM(Ddyn, outcome = "Y", t = 1:16,
                      form = msm.form.correct, family = "binomial",
                      hazard = TRUE)
MSMres.correct <- eval.target(Ddyn, n = 1000, rndseed = 123)
MSMres.correct$coef
@
Note that the baseline measurement of the time-varying node \code{L2} must be referenced within the \code{S(...)} wrapper in the working MSM formula.
\paragraph{Example 7. Working static MSM for survival probabilities over time.}\label{MSMwS.2}
Here, we illustrate the evaluation of the discrete-time hazards $E(Y_{\bar{a}}(t)\:|\:Y_{\bar{a}}(t-1)=0)$, for $t=1,\ldots,16$, under the $17$ static interventions introduced in Section~\ref{staticact}, using the following pooled working logistic MSM:
$$
\mbox{expit}\big(\alpha_{0}+\alpha_{1}t+\alpha_{2}\frac{1}{t}\sum_{j=0}^{t-1}a(j) + \alpha_{3} \sum_{j=0}^{t-1}a(j) \big),
$$
where we use the notation $\bar{a}=(a(0),a(1),\ldots,a(16))$ to denote the $17$ static intervention regimens on the time-varying treatment node \code{A1}. Note that the time-varying action attribute \code{abar} introduced in Section~\ref{staticact} directly encodes the $17$ treatment regimen values $\bar{a}$ referenced in the MSM working model above.
To evaluate the target parameters $\alpha_{j}$ above, for $j=0,\ldots,3$, we first simulate counterfactual data for the $17$ static interventions of interest as follows:
<<>>=
Xts <- sim(Dstat, actions = names(A(Dstat)), n = 1000, rndseed = 123)
@
Second, we define the target parameters and estimate them using the counterfactual data just simulated as follows:
<<>>=
msm.form_1 <- "Y ~ t + S(mean(abar[0:(t-1)])) + I(t*S(mean(abar[0:(t-1)])))"
Dstat <- set.targetMSM(Dstat, outcome = "Y", t = 1:16, form = msm.form_1,
                       family = "binomial", hazard = TRUE)
MSMres <- eval.target(Dstat, data = Xts)
MSMres$coef
@
Note that working MSM formulas can reference arbitrary summary measures (functions) of time-varying action attributes such as \code{abar}. The square-bracket \code{'['} syntax can then be used to identify specific elements of the time-varying action attributes, in the same way it can be used in node formulas to reference particular measurements of time-varying nodes. For example, the term \code{sum(abar[0:t])} indicates a summation over the elements of the action attribute \code{abar} indexed by time points lower than or equal to the value \code{t}, and the syntax \code{S(abar[max(0, t - 2)])} creates a summary measure representing time-lagged values of \code{abar} that are equal to \code{abar[0]} if \code{t}$<2$ and to \code{abar[t-2]} if \code{t}$\geq2$. Note also that references to time-varying action attributes in the working MSM formula must be wrapped within a call to the \code{S(...)} function, e.g., \code{Y}$\sim$\code{t + S(mean(abar[0:t]))}.\\
The \code{eval.target} function returns a list with the following named attributes: the working MSM fit returned by a \code{glm} function call (\code{msm}), the coefficient estimates (\code{coef}), the mapping (\code{S.msm.map}) of the formula terms defined by expressions enclosed within the \code{S(...)} function into the corresponding variable names in the design matrix that was used to implement the regression, and the design matrix (\code{df_long}) stored as a list of \code{data.table} objects from the \proglang{R} package \pkg{data.table} \citep{R-data.table}. Each of these \code{data.table} objects contains counterfactual data indexed by a particular intervention. These counterfactual data are stored in long format, with possibly additional columns representing the terms in the working MSM formula defined by expressions enclosed within the \code{S()} function. The design matrix can be derived by row binding these \code{data.table} objects.
<<>>=
names(MSMres)
MSMres$S.msm.map
names(MSMres$df_long)
MSMres$df_long[["A1_ts2"]]
@
Finally, we plot the resulting counterfactual survival curve estimates using the function \code{survbyMSMterm} (source code provided in the appendix), as shown in Figure~\ref{fig:survfig8}.
<>=
survMSMh_wS <- survbyMSMterm(MSMres = MSMres, t_vec = 1:16,
                             MSMtermName = "mean(abar[0:(t - 1)])")
print(plotsurvbyMSMterm(survMSMh_wS))
@
\newpage{}
%*********************************************************************
\section[Replication study of the comparative performances of two estimators]{\protect\parbox{0.95\textwidth}{\protect\centering Replication study of the comparative performances of two estimators}}\label{repsimstudy1}
\addcontentsline{toc}{section}{Replication study of the comparative performances of two estimators}
In this section, we demonstrate how the \pkg{simcausal} package can be used to conduct a complex but transparent and reproducible simulation study to compare the finite sample properties (bias and relative efficiency) of two causal effect estimators. Specifically, we aim to replicate the results of the simulation study described by \citet{neugebauer2014} that compared targeted minimum loss-based estimation (TMLE) and inverse probability weighting (IPW) estimation of a causal risk difference defined by two dynamic treatment regimens (see ATE in \emph{Example 2} of \code{set.targetE} in Section~\ref{targets2NP}). We carried out this replication study using simulation protocol 3 described in \textbf{Section 5.3.} of \citet{neugebauer2014} and compared the bias and relative efficiency of TMLE and IPW estimation using the same performance metrics as those reported in \textbf{Table 6} of \citet{neugebauer2014}. The observed data were thus generated using the following slightly modified version of the structural equation model from Section~\ref{node2} to match the data generating distribution described in protocol 3 of \textbf{Section 5.3.} of \citet{neugebauer2014}:
<>=
t.end <- 12
D <- DAG.empty()
D <- D +
  node("L2", t = 0, distr = "rbern", prob = 0.05) +
  node("m1L2", t = 0, distr = "rconst", const = 1 - L2[0]) +
  node("L1", t = 0, distr = "rbern", prob = ifelse(L2[0] == 1, 0.8, 0.3)) +
  node("A1", t = 0, distr = "rbern",
       prob = ifelse(L1[0] == 1 & L2[0] == 0, 0.5,
                ifelse(L1[0] == 0 & L2[0] == 0, 0.2,
                  ifelse(L1[0] == 1 & L2[0] == 1, 0.8, 0.5)))) +
  node("A2", t = 0, distr = "rbern", prob = 0, EFU = TRUE)
D <- D +
  node("Y", t = 1:t.end, distr = "rbern",
       prob = plogis(-7 + 3 * L1[0] + 5 * L2[t-1] +
                     0.1 * sum(I(L2[0:(t-1)] == rep(0, t)))),
       EFU = TRUE) +
  node("L2", t = 1:t.end, distr = "rbern",
       prob = ifelse(A1[t-1] == 1, 0.1,
                ifelse(L2[t-1] == 1, 0.9, min(1, 0.1 + t/16)))) +
  node("m1L2", t = 1:t.end, distr = "rconst", const = 1 - L2[t]) +
  node("A1", t = 1:t.end, distr = "rbern",
       prob = ifelse(A1[t-1] == 1, 1,
                ifelse(L1[0] == 1 & L2[t] == 0, 0.4,
                  ifelse(L1[0] == 0 & L2[t] == 0, 0.2,
                    ifelse(L1[0] == 1 & L2[t] == 1, 0.8, 0.6))))) +
  node("A2", t = 1:t.end, distr = "rbern",
       prob = {if (t == t.end) {1} else {0}}, EFU = TRUE)
lDAG <- set.DAG(D)
Ddyn <- lDAG
@
<>=
anodes <- node("A1", t = (0:t.end), distr = "rbern",
               prob = {if (t == 0) {ifelse(L2[0] >= theta, 1, 0)}
                       else {ifelse(A1[t-1] == 1, 1,
                                    ifelse(L2[t] >= theta, 1, 0))}})
Ddyn <- Ddyn +
  action("A1_th0", nodes = anodes, theta = 0) +
  action("A1_th1", nodes = anodes, theta = 1)
@
The target causal parameter $\psi_{0}$ in this replication study is defined as the difference between the cumulative risks of failure at $t_{0}=11$ under the two dynamic regimes $d_{0}$ and $d_{1}$ introduced in Section~\ref{dynact}, i.e., $\psi_{0}=P(Y_{d_{1}}(t_{0}+1)=1)-P(Y_{d_{0}}(t_{0}+1)=1)$:\\
<>=
t0 <- 12
Ddyn <- set.targetE(Ddyn, outcome = "Y", t = (1:t0), param = "A1_th1-A1_th0")
getNP.truetarget <- function() {
  resNP <- eval.target(Ddyn, n = 150000, rndseed = 123)$res
  return(as.vector(resNP[paste0("Diff_Y_", t0)]))
}
f1name <- "vignette_dat/repstudy1_psi0.t0.NP.Rdata"
if (file.exists(f1name)) {
  load(f1name)
} else {
  psi0.t0.NP <- getNP.truetarget()
  save(list = "psi0.t0.NP", file = f1name)
}
psi0.t0.NP
@
The above estimate of the true value of $\psi_{0}$ closely matches the value reported in Figure 10 of \citet{neugebauer2014}. We note that the target parameter $\psi_{0}$ can also be defined and thus evaluated with the following saturated MSM:
$$
\mbox{logit}(P(Y_{d_{\theta}}(t+1)=1))=\alpha_{0}+\alpha_{0}^{1}\theta+\sum_{k=1}^{t_{0}}\alpha_{k}^{0}I(t=k)+\sum_{k=1}^{t_{0}}\alpha_{k}^{1}\theta I(t=k),
$$
for $t=0,1,\ldots,t_{0}$ and $\theta\in\{0,1\}$. Indeed, the risk difference $\psi_{0}$ can then be derived from the coefficients of this MSM as follows:
\begin{equation}
\label{eq:psi0MSM}
\psi_{0}=\mbox{expit}(\alpha_{0}+\alpha_{0}^{1}+\alpha_{t_{0}}^{0}+\alpha_{t_{0}}^{1})-\mbox{expit}(\alpha_{0}+\alpha_{t_{0}}^{0}).
\end{equation}
Evaluation of the target parameter $\psi_{0}$ can thus be implemented through fitting of a saturated MSM as shown below:
<>=
MSM_RD_t <- function(resMSM, t) {
  invlogit <- function(x) 1 / (1 + exp(-x))
  Riskth0 <- invlogit(resMSM["(Intercept)"] + resMSM[paste0("as.factor(t)", t)])
  Riskth1 <- invlogit(resMSM["(Intercept)"] + resMSM[paste0("as.factor(t)", t)] +
                      resMSM["theta"] + resMSM[paste0("theta:as.factor(t)", t)])
  return(as.vector(Riskth1 - Riskth0))
}
msm.form <- "Y ~ theta + as.factor(t) + as.factor(t):theta"
Ddyn <- set.targetMSM(Ddyn, outcome = "Y", t = (1:t0), formula = msm.form,
                      family = "binomial", hazard = FALSE)
getMSM.truetarget <- function() {
  resMSM <- eval.target(Ddyn, n = 150000, rndseed = 123)$coef
  return(as.vector(MSM_RD_t(resMSM = resMSM, t = t0)))
}
f2name <- "vignette_dat/repstudy1_psi0.t0.MSM.Rdata"
if (file.exists(f2name)) {
  load(f2name)
} else {
  psi0.t0.MSM <- getMSM.truetarget()
  save(list = "psi0.t0.MSM", file = f2name)
}
psi0.t0.MSM
all.equal(psi0.t0.NP, psi0.t0.MSM)
@
To evaluate the bias and relative efficiency of TMLE and IPW estimation of the risk difference $\psi_{0}$ with observed data, we generated $1,000$ observed data sets, each with sample size $50,000$. With each simulated observed data set, the coefficients of the saturated MSM were then estimated by TMLE and IPW estimation using the \code{ltmleMSM} function from the \pkg{ltmle} \proglang{R} package \citep{R-ltmle} as shown in the appendix. For an in-depth description of TMLE, we refer to \citet{ltmleMSMs2014} and the \pkg{ltmle} package manual. The IPW and TMLE estimates of $\psi_{0}$ were then derived from the estimated MSM coefficients $\alpha$ using formula~\eqref{eq:psi0MSM}. As in \citet{neugebauer2014}, both TMLE and IPW estimators of $\psi_{0}$ were derived using a correctly specified model for the treatment mechanism but also a misspecified model (the time-dependent covariate \code{L2} missing from the logistic models for the propensity scores).\\
We report our simulation results in Table~\ref{tab6aNeugebauer}. For each estimator, we report the empirical mean of the $1,000$ estimates ($\psi_{n}$), the corresponding bias, and the empirical standard deviation of the $1,000$ estimates ($\sigma_{emp}$). We also report the relative efficiency for IPW vs. TMLE, evaluated as the ratio of their respective empirical standard deviations.
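These performance metrics reduce to simple summaries of the replicate estimates. The following minimal illustrative sketch (not part of the replication code) makes this explicit; it assumes vectors \code{tmle_sims} and \code{iptw_sims} holding the $1,000$ replicate estimates of $\psi_{0}$ (as constructed in the chunks below), the true value \code{psi0.t0.MSM}, and a hypothetical helper \code{perf} of our own:
<>=
## Illustrative sketch only: performance metrics from replicate estimates.
## Assumes 'tmle_sims' and 'iptw_sims' hold 1,000 replicate estimates of
## psi_0 and 'psi0.t0.MSM' holds its true value (all defined elsewhere).
perf <- function(sims, psi0) {
  c(psi_n = mean(sims),        # empirical mean of the point estimates
    bias = mean(sims) - psi0,  # bias relative to the true value
    sigma_emp = sd(sims))      # empirical standard deviation
}
perf(tmle_sims, psi0.t0.MSM)
perf(iptw_sims, psi0.t0.MSM)
## relative efficiency: ratio of the empirical SDs (IPW vs. TMLE)
sd(iptw_sims) / sd(tmle_sims)
@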
The function \code{simrun_ltmleMSM()} (source code provided in the appendix) can be used to generate the results presented in Table~\ref{tab6aNeugebauer}.\\
Our TMLE results match those reported in \textbf{Table 6} by \citet{neugebauer2014}, while our IPW results differ. Specifically, in our simulations the IPW estimator had a smaller empirical standard deviation than that reported in \citet{neugebauer2014}, resulting in a relative efficiency of 12\%, compared to the 39\% reported by \citet{neugebauer2014}. We note, though, that the IPW estimator implemented by \citet{neugebauer2014} differs from the IPW estimator implemented in the \pkg{ltmle} package. The latter IPW estimator is defined by fitting a separate treatment mechanism model for each time point. Additionally, the IPW estimator for $\psi_{0}$ implemented with the \pkg{ltmle} package is based on a saturated MSM for the counterfactual \emph{survival} functions, whereas the IPW estimator implemented in \citet{neugebauer2014} was constructed using survival probability estimates derived from a saturated MSM for the counterfactual \emph{hazard} functions, i.e., with survival estimates obtained from the estimated hazards as $\hat{S}(t)=\prod_{k\leq t}(1-\hat{h}(k))$. To our knowledge, such an approach is not currently automated in an existing \proglang{R} package. Therefore, the IPW estimation performance in this simulation study should not be expected to match that reported in \citet{neugebauer2014}.
<>=
times <- c(0:(t.end-1))
timesm0 <- times[which(times > 0)]
# correctly specified g:
gforms <- c("A1_0 ~ L1_0 + L2_0", "A2_0 ~ L1_0")
gformm0 <- as.vector(sapply(timesm0, function(t)
  c("A1_"%+%t%+%" ~ A1_"%+%(t-1)%+%" + L1_0"%+%" + L2_"%+%t%+%
    " + I(L2_"%+%t%+%"*A1_"%+%(t-1)%+%") + I(L1_0*A1_"%+%(t-1)%+%")",
    "A2_"%+%t%+%" ~ L1_0")))
gforms <- c(gforms, gformm0)
# mis-specified g (no TV covar L2):
gforms_miss <- c("A1_0 ~ L1_0", "A2_0 ~ L1_0")
gformm0_miss <- as.vector(sapply(timesm0, function(t)
  c("A1_"%+%t%+%" ~ L1_0*A1_"%+%(t-1),
    "A2_"%+%t%+%" ~ L1_0")))
gforms_miss <- c(gforms_miss, gformm0_miss)
Qformallt <- "Q.kplus1 ~ L1_0"
Lterms <- function(var, tlast) {
  tstr <- c(0:tlast)
  strout <- paste(var%+%"_"%+%tstr, collapse = " + ")
  return(strout)
}
tY <- (0:11)
Ynames <- paste("Y_"%+%c(tY+1))
Qforms <- unlist(lapply(tY, function(t) {
  a <- Qformallt%+%" + I("%+%Lterms("m1L2", t)%+%") + "%+%Lterms("L2", t)
  return(a)
}))
names(Qforms) <- Ynames
survivalOutcome <- TRUE
stratify_Qg <- TRUE
mhte.iptw <- TRUE
Anodesnew <- "A1_"%+%(0:(t.end-1))
Cnodesnew <- "A2_"%+%(0:(t.end-1))
L2nodesnew <- "L2_"%+%(1:(t.end-1))
mL2nodesnew <- "m1L2_"%+%(1:(t.end-1))
Lnodesnew <- as.vector(rbind(L2nodesnew, mL2nodesnew))
Ynodesnew <- "Y_"%+%(1:t.end)
finYnodesnew <- Ynodesnew
dropnms <- c("ID", "L2_"%+%t.end, "m1L2_"%+%t.end,
             "A1_"%+%t.end, "A2_"%+%t.end)
pooledMSM <- FALSE
weight.msm <- FALSE
@
<>=
simrun_ltmleMSM <- function(sim, DAG, N, t0, gbounds, gforms) {
  library("ltmle")
  O_datnew <- sim(DAG = DAG, n = N)
  ltmleMSMparams <- est.targetMSM(DAG, O_datnew, Aname = "A1", Cname = "A2",
                                  Lnames = "L2", Ytvec = (1:t.end),
                                  ACLtvec = (0:t.end), package = "ltmle")
  summeas_arr <- ltmleMSMparams$summeas_arr
  regimens_arr <- ltmleMSMparams$regimens_arr[, c(1:t.end), ]
  O_datnewLTCF <- doLTCF(data = O_datnew, LTCF = "Y")
  O_dat_selCnew <- O_datnewLTCF[, -which(names(O_datnewLTCF) %in% dropnms)]
  O_dat_selCnew[, Cnodesnew] <- 1 - O_dat_selCnew[, Cnodesnew]
  reslTMLE.MSM <- ltmleMSM(data = O_dat_selCnew, Anodes = Anodesnew,
                           Cnodes = Cnodesnew, Lnodes = Lnodesnew,
                           Ynodes = Ynodesnew, survivalOutcome = survivalOutcome,
                           gform = gforms, Qform = Qforms, stratify = stratify_Qg,
                           mhte.iptw = mhte.iptw, iptw.only = FALSE,
                           working.msm = msm.form, pooledMSM = pooledMSM,
                           final.Ynodes = finYnodesnew, regimes = regimens_arr,
                           summary.measures = summeas_arr, weight.msm = weight.msm,
                           estimate.time = FALSE, gbounds = gbounds)
  iptwMSMcoef <- summary(reslTMLE.MSM, estimator = "iptw")$cmat[, 1]
  iptwRD <- MSM_RD_t(resMSM = iptwMSMcoef, t = t0)
  tmleMSMcoef <- summary(reslTMLE.MSM, estimator = "tmle")$cmat[, 1]
  tmleRD <- MSM_RD_t(resMSM = tmleMSMcoef, t = t0)
  return(c(simN = sim, iptwRD = iptwRD, tmleRD = tmleRD))
}
@
<>=
t0 <- 12
Nltmle <- 50000
Nsims <- 1000
source("./determineParallelBackend.R")
sim50K.stratQg.notrunc.g <- foreach(sim = seq(Nsims), .combine = 'rbind') %dopar% {
  simrun_ltmleMSM(sim = sim, DAG = Ddyn, N = Nltmle, t0 = t0,
                  gbounds = c(0.0000001, 1), gforms = gforms)
}
save(list = "sim50K.stratQg.notrunc.g",
     file = "vignette_dat/sim50K.stratQg.notrunc.g.Rdata")
sim50K.stratQg.notrunc.missg <- foreach(sim = seq(Nsims), .combine = 'rbind') %dopar% {
  simrun_ltmleMSM(sim = sim, DAG = Ddyn, N = Nltmle, t0 = t0,
                  gbounds = c(0.0000001, 1), gforms = gforms_miss)
}
save(list = "sim50K.stratQg.notrunc.missg",
     file = "vignette_dat/sim50K.stratQg.notrunc.missg.Rdata")
@
<>=
library("Hmisc")
load(file = "vignette_dat/repstudy1_psi0.t0.MSM.Rdata")
load(file = "vignette_dat/sim50K.stratQg.notrunc.g.Rdata")
iptw_sims <- sim50K.stratQg.notrunc.g[, "iptwRD"]
tmle_sims <- sim50K.stratQg.notrunc.g[, "tmleRD"]
getrestab <- function(iptw_sims, tmle_sims, origres, modelnm) {
  resrow <- function(sims, estname) {
    data.frame(estname,
               sprintf("%.3f", mean(sims)),
               sprintf("%.4f", mean(sims) - psi0.t0.MSM),
               sprintf("%.4f", sd(sims)),
               stringsAsFactors = FALSE)
  }
  tabcolnames <- c("Estimator", "$\\psi_{n}$", "Bias", "$\\sigma_{emp}$",
                   "$\\sigma_{emp}^{IPTW}/\\sigma_{emp}^{TMLE}$")
  restab <- rbind(resrow(tmle_sims, "TMLE "%+%modelnm),
                  resrow(iptw_sims, "IPTW "%+%modelnm))
  restab <- cbind(restab, c(round(sd(iptw_sims)/sd(tmle_sims), 2), ""))
  restab <- rbind(c("\\emph{Results from this replication study}",
                    "", "", "", ""), restab)
  restab <- rbind(restab, c("\\emph{Results reported by Neugebauer et al. (2014)}",
                            "", "", "", ""))
  colnames(restab) <- tabcolnames
  colnames(origres) <- tabcolnames
  restab <- rbind(restab, origres)
  restab
}
# Results reported by Neugebauer et al. for the correct model:
# TMLE (correct model), 0.108, 0.0010, 0.0060, 1.392
# IPTW (correct model), 0.108, 0.0011, 0.0078
tmleorigres <- c("TMLE (correct model)", "0.108", "0.0010", "0.0060", "1.392")
iptworigres <- c("IPTW (correct model)", "0.108", "0.0011", "0.0078", "")
restab_corrg <- getrestab(iptw_sims, tmle_sims,
                          origres = rbind(tmleorigres, iptworigres),
                          modelnm = "(correct model)")
@
<>=
load(file = "vignette_dat/sim50K.stratQg.notrunc.missg.Rdata")
iptw_sims <- sim50K.stratQg.notrunc.missg[, "iptwRD"]
tmle_sims <- sim50K.stratQg.notrunc.missg[, "tmleRD"]
# Results reported by Neugebauer et al. for incorrect model 1:
# TMLE (incorrect model 1), 0.109, 0.0019, 0.0074
# IPTW (incorrect model 1), 0.182, 0.0750, 0.0107
tmleorigres <- c("TMLE (incorrect model 1)", "0.109", "0.0019", "0.0074", "")
iptworigres <- c("IPTW (incorrect model 1)", "0.182", "0.0750", "0.0107", "")
restab_missg <- getrestab(iptw_sims, tmle_sims,
                          origres = rbind(tmleorigres, iptworigres),
                          modelnm = "(incorrect model 1)")
restab <- rbind(restab_corrg, restab_missg)
@
<>=
cat("\n")
latex(restab, file = "", where = "!htpb", caption.loc = 'bottom',
      caption = "Replication of the results from simulation protocol 3 reported in Table 6 of \\citet{neugebauer2014} based on two models for estimating the treatment mechanism: 1) a correctly specified model and 2) a misspecified model missing a term for the time-dependent variable \\code{L2}. $\\psi_{n}$ - mean point estimates over 1,000 simulated data sets; $\\sigma_{emp}$ - empirical standard deviation (SD) of point estimates over 1,000 simulated data sets; $\\sigma_{emp}^{IPTW}/\\sigma_{emp}^{TMLE}$ - the relative efficiency measured by the ratio of the empirical SDs associated with the IPW and TMLE point estimates.",
      label = 'tab6aNeugebauer', booktabs = TRUE, rowname = NULL,
      landscape = FALSE, col.just = c("l", rep("r", 4)), size = "small")
@
\newpage{}
%*********************************************************************
\section[Replication study of the impact of misspecification of propensity score models]{\protect\parbox{0.95\textwidth}{\protect\centering Replication study of the impact of misspecification of propensity score models}}\label{repsimstudy2}
\addcontentsline{toc}{section}{Replication study of the impact of misspecification of propensity score models}
In this section, we demonstrate how the \pkg{simcausal} package can be used to replicate a simulation study from \citet{lefebvre2008}. Specifically, we replicate the results first reported in Tables II and IV of that paper. We first specify the observed data generating distribution using the two structural equation models corresponding to Scenarios 1 and 3 described in \citet{lefebvre2008}. Second, for each scenario, we evaluate the true values of the coefficients of the MSM using counterfactual data and compare them to those reported by \citet{lefebvre2008}. Finally, for each scenario, we implement the same IPW estimators of these MSM coefficients and evaluate their performances using the same two metrics (bias and mean squared error) as in \citet{lefebvre2008}. Each IPW estimator is defined by a distinct model for the propensity scores.
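For concreteness, each IPW estimator in our replication code (provided in the appendix) fits the MSM by weighted least squares, with subject-specific stabilized weights of the form
$$
sw=\frac{\hat{P}(A(0),A(1))}{\hat{P}(A(0)\mid L(0))\,\hat{P}(A(1)\mid A(0),L(1))},
$$
where the numerator is the empirical joint distribution of the two treatment indicators, the denominator is the product of the two propensity scores estimated with logistic regression, and the estimators differ only in which covariates are included in $L(0)$ and $L(1)$.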
Our replication results are reported in Tables~\ref{tab2Lefebvre} and \ref{tab4Lefebvre}, and we show the simulation results as they were originally reported by \citet{lefebvre2008} in Tables~\ref{origtab2Lefebvre} and \ref{origtab4Lefebvre}.\\
To carry out the simulation study, we first define a new distribution function \code{rbivNorm} for simulating observations from a bivariate normal distribution with a user-specified mean vector (specified by the argument \code{mu}) and a user-specified covariance matrix (specified by the arguments \code{var1}, \code{var2}, and \code{rho}, which represent the two diagonal and the off-diagonal scalars, respectively). This new distribution function is based on the Cholesky decomposition of the covariance matrix applied to independent standard normal observations, which are provided by the input argument \code{norms}. Specifically, if $R$ denotes the upper triangular Cholesky factor of the $2\times2$ covariance matrix $\Sigma$ (i.e., $\Sigma=R^{\top}R$), then the $j$th component of the bivariate normal vector is computed as $X_{j}=\mu+R_{1j}Z_{1}+R_{2j}Z_{2}$, where $(Z_{1},Z_{2})$ are the independent standard normal observations in \code{norms}. The argument \code{whichbiv} indicates whether the function should return observations of the first or the second component of the bivariate normal vector.\\
<>=
rbivNorm <- function(n, whichbiv, norms, mu, var1 = 1, var2 = 1, rho = 0.7) {
  whichbiv <- whichbiv[1]; var1 <- var1[1]; var2 <- var2[1]; rho <- rho[1]
  # Cholesky factor of the 2 x 2 covariance matrix:
  sigma <- matrix(c(var1, rho, rho, var2), nrow = 2)
  Scol <- chol(sigma)[, whichbiv]
  # linear combination of the two independent standard normals in 'norms':
  bivX <- (Scol[1] * norms[, 1] + Scol[2] * norms[, 2]) + mu
  bivX
}
@
Second, using this distribution function, we define the structural equation model specified for data simulation according to Scenario 1 in \citet{lefebvre2008}.\\
<>=
`%+%` <- function(a, b) paste0(a, b)
Lnames <- c("LO1", "LO2", "LO3", "LC1")
D <- DAG.empty()
for (Lname in Lnames) {
  D <- D +
    node(Lname%+%".norm1", distr = "rnorm", mean = 0, sd = 1) +
    node(Lname%+%".norm2", distr = "rnorm", mean = 0, sd = 1)
}
D <- D +
  node("LO1", t = 0:1, distr = "rbivNorm", whichbiv = t + 1,
       norms = c(LO1.norm1, LO1.norm2), mu = 0) +
  node("LO2", t = 0:1, distr = "rbivNorm", whichbiv = t + 1,
       norms = c(LO2.norm1, LO2.norm2), mu = 0) +
  node("LO3", t = 0:1, distr = "rbivNorm", whichbiv = t + 1,
       norms = c(LO3.norm1, LO3.norm2), mu = 0) +
  node("LC1", t = 0:1, distr = "rbivNorm", whichbiv = t + 1,
       norms = c(LC1.norm1, LC1.norm2),
       mu = {if (t == 0) {0} else {-0.30 * A[t-1]}}) +
  node("alpha", t = 0:1, distr = "rconst",
       const = {if (t == 0) {log(0.6)} else {log(1.0)}}) +
  node("A", t = 0:1, distr = "rbern",
       prob = plogis(alpha[t] + log(5) * LC1[t] +
                     {if (t == 0) {0} else {log(5) * A[t-1]}})) +
  node("Y", t = 1, distr = "rnorm",
       mean = (0.98 * LO1[t] + 0.58 * LO2[t] + 0.33 * LO3[t] +
               0.98 * LC1[t] - 0.37 * A[t]),
       sd = 1)
DAGO.sc1 <- set.DAG(D)
@
Third, we define the target parameter as the coefficients $\beta_{1}$ and $\beta_{2}$ of the following correctly specified marginal structural model: $$E[Y_{a(0),a(1)}]=\beta_{0}+\beta_{1}a(0)+\beta_{2}a(1),$$ defined by the following four possible static and deterministic interventions ($a(0),a(1)$) on the treatment process ($A(0),A(1)$): $(0,0)$, $(1,0)$, $(0,1)$, and $(1,1)$.
<>=
defAct <- function(Dact) {
  act.At <- node("A", t = 0:1, distr = "rbern", prob = abar[t])
  Dact <- Dact +
    action("A00", nodes = act.At, abar = c(0, 0)) +
    action("A10", nodes = act.At, abar = c(1, 0)) +
    action("A01", nodes = act.At, abar = c(0, 1)) +
    action("A11", nodes = act.At, abar = c(1, 1))
  return(Dact)
}
Dact.sc1 <- defAct(DAGO.sc1)
msm.form <- "Y ~ S(abar[0]) + S(abar[1])"
Dact.sc1 <- set.targetMSM(Dact.sc1, outcome = "Y", t = 1,
                          form = msm.form, family = "gaussian")
@
Fourth, we evaluate the true values of these MSM coefficients using the \code{eval.target} function and note that our results closely match the true values of the MSM coefficients reported in Table II of \citet{lefebvre2008}:
<>=
repstudy2.sc1.truetarget <- function() {
  trueMSMreps.sc1 <- NULL
  reptrue <- 50
  for (i in (1:reptrue)) {
    res.sc1.i <- eval.target(Dact.sc1, n = 500000)$coef
    trueMSMreps.sc1 <- rbind(trueMSMreps.sc1, res.sc1.i)
  }
  return(trueMSMreps.sc1)
}
f1name <- "vignette_dat/trueMSMreps.sc1.Rdata"
if (file.exists(f1name)) {
  load(f1name)
} else {
  trueMSMreps.sc1 <- repstudy2.sc1.truetarget()
  save(list = "trueMSMreps.sc1", file = f1name)
}
(trueMSM.sc1 <- apply(trueMSMreps.sc1, 2, mean))
@
Note that the true values of the MSM coefficients above were obtained by averaging the coefficient estimates from several large simulated counterfactual data sets. This approach was implemented to avoid the memory limitation that can be encountered when trying to simulate a single very large counterfactual data set.\\
Finally, we replicate the IPW estimation results for Scenario 1 presented originally in Table II of \citet{lefebvre2008} using the source \proglang{R} code provided in the appendix. To estimate the propensity scores $P(A(0)|L(0))$ and $P(A(1)|A(0),L(1))$ that define each of the three IPW estimators considered, we used the same three models presented in Table I of \citet{lefebvre2008}. For the three sample sizes $N=300$; $1,000$; and $10,000$, we report the bias of each IPW estimator, multiplied by 10 (\emph{Bias*10}), and the mean squared error, also multiplied by 10 (\emph{MSE*10}), in Table~\ref{tab2Lefebvre}. We note that our results closely match those in Table~\ref{origtab2Lefebvre}, which were originally reported in \citet{lefebvre2008}.\\
<>=
runMSMsw <- function(DAGO, Lnames, trueA, nsamp, nsims) {
  Lnames_0 <- Lnames%+%"_0"
  Lnames_1 <- Lnames%+%"_1"
  # propensity score models for A(0) and A(1):
  gforms <- c("A_0 ~ "%+%paste(Lnames_0, collapse = " + "),
              "A_1 ~ A_0 + "%+%paste(Lnames_1, collapse = " + "))
  res_sw <- NULL
  for (sims in (1:nsims)) {
    datO <- sim(DAGO, n = nsamp)
    glmA_0 <- glm(datO[, c("A_0", Lnames_0)], formula = gforms[1],
                  family = "binomial")
    glmA_1 <- glm(datO[, c("A_1", "A_0", Lnames_0, Lnames_1)],
                  formula = gforms[2], family = "binomial")
    # inverse probability weights for each time point:
    probA0_1 <- predict(glmA_0, type = "response")
    weight_t0 <- 1 / (probA0_1^(datO$A_0) * (1-probA0_1)^(1-datO$A_0))
    probA1_1 <- predict(glmA_1, type = "response")
    weight_t1 <- 1 / (probA1_1^(datO$A_1) * (1-probA1_1)^(1-datO$A_1))
    sw1 <- weight_t0*weight_t1
    # stabilize the weights with the empirical joint distribution of (A_0, A_1):
    emp.pA1cA0 <- table(datO$A_1, datO$A_0)/nrow(datO)
    empPA1 <- data.frame(A_0 = c(0,0,1,1), A_1 = c(0,1,1,0))
    empPA1$empPA_1_cA_0 <- apply(empPA1, 1, function(rowA)
      emp.pA1cA0[as.character(rowA["A_1"]), as.character(rowA["A_0"])])
    empPA1 <- merge(datO[, c("ID", "A_0", "A_1")], empPA1, sort = FALSE)
    empPA1 <- empPA1[order(empPA1$ID), ]
    swts <- empPA1$empPA_1_cA_0*(weight_t0*weight_t1)
    datO$swts <- swts
    # fit the MSM by weighted least squares:
    MSMres_sw <- glm(datO, formula = "Y_1 ~ A_0 + A_1",
                     weights = swts, family = "gaussian")
    res_sw <- rbind(res_sw, coef(MSMres_sw))
  }
  # bias and MSE (x10) of the replicate MSM coefficient estimates:
  meanres <- apply(res_sw, 2, mean)
  Varres <- apply(res_sw, 2, var)
  bias <- c(meanres["A_0"]-trueA["A_0"], meanres["A_1"]-trueA["A_1"])
  MSE <- c(bias^2 + Varres[c("A_0", "A_1")])
  bias10 <- sprintf("%.3f", bias*10)
  MSE10 <- sprintf("%.3f", MSE*10)
  resrow <- c(bias10[1], MSE10[1], bias10[2], MSE10[2])
  col36names <- c("\\specialcell[t]{A(0)\\\\ Bias*10}",
                  "\\specialcell[t]{A(0)\\\\ MSE*10}",
                  "\\specialcell[t]{A(1)\\\\ Bias*10}",
                  "\\specialcell[t]{A(1)\\\\ MSE*10}")
  names(resrow) <- col36names
  return(resrow)
}
@
<>=
# recreating Tables 2 & 4 reported in Lefebvre et al.
nsamp <- c(300, 1000, 10000)
# Lefebvre et al. Tab 2:
covnmT2 <- c(c("\\emph{Lefebvre et al.}: Confounder(s) only", rep("", 2)),
             c("\\emph{Lefebvre et al.}: Confounder(s) &", "risk factors", rep("", 1)))
lefebvreT2 <- data.frame(
  covnm = covnmT2,
  N = rep(nsamp, 2),
  A0Bias10 = sprintf("%.3f", c(0.768, 0.265, 0.057, 0.757, 0.283, 0.056)),
  A0MSE10 = sprintf("%.3f", c(1.761, 0.761, 0.146, 1.642, 0.718, 0.139)),
  A1Bias10 = sprintf("%.3f", c(0.889, 0.312, 0.086, 0.836, 0.330, 0.081)),
  A1MSE10 = sprintf("%.3f", c(1.728, 0.723, 0.120, 1.505, 0.638, 0.114)),
  stringsAsFactors = FALSE)
# Lefebvre et al. Tab 4:
covnmT4 <- c(c("\\emph{Lefebvre et al.}: Confounder(s) only", rep("", 2)),
             c("\\emph{Lefebvre et al.}: Confounder(s) &", "risk factors", ""),
             c("\\emph{Lefebvre et al.}: Confounder(s) &", "IVs", ""),
             c("\\emph{Lefebvre et al.}: Confounder(s),", "IVs & risk factors", ""),
             c("\\emph{Lefebvre et al.}: Mis-specified", rep("", 2)),
             c("\\emph{Lefebvre et al.}: Full Model", rep("", 2)))
lefebvreT4 <- data.frame(
  covnm = covnmT4,
  N = rep(nsamp, 6),
  A0Bias10 = sprintf("%.3f", c(-0.080, -0.371, -0.368, -0.110, -0.330, -0.378,
                               1.611, 0.824, 0.241, 1.600, 0.867, 0.235,
                               3.146, 2.460, 2.364, 1.524, 0.878, 0.240)),
  A0MSE10 = sprintf("%.3f", c(1.170, 0.385, 0.056, 1.092, 0.340, 0.051,
                              3.538, 2.063, 0.684, 3.477, 2.053, 0.676,
                              3.326, 1.700, 0.832, 3.648, 2.099, 0.679)),
  A1Bias10 = sprintf("%.3f", c(0.099, -0.035, -0.203, 0.112, -0.108, -0.207,
                               2.069, 1.245, 0.379, 2.143, 1.170, 0.372,
                               5.591, 5.258, 4.943, 2.221, 1.185, 0.377)),
  A1MSE10 = sprintf("%.3f", c(1.155, 0.331, 0.043, 0.865, 0.245, 0.037,
                              3.841, 2.188, 0.622, 3.598, 2.043, 0.625,
                              5.494, 3.851, 2.705, 3.907, 2.099, 0.630)),
  stringsAsFactors = FALSE)
col1name <- "Covariates in $P(A|L)$"
colnames(lefebvreT2)[1] <- colnames(lefebvreT4)[1] <- col1name
col36names <- c("\\specialcell[t]{A(0)\\\\ Bias*10}",
                "\\specialcell[t]{A(0)\\\\ MSE*10}",
                "\\specialcell[t]{A(1)\\\\ Bias*10}",
                "\\specialcell[t]{A(1)\\\\ MSE*10}")
colnames(lefebvreT2)[3:6] <- colnames(lefebvreT4)[3:6] <- col36names
@
<>=
trueA <- c(A_0 = -0.294, A_1 = -0.370)
nsims <- 10000
restab <- NULL
runsim <- function(Lnames, DAGO) {
  for (nsamp in c(300, 1000, 10000)) {
    resSc <- runMSMsw(DAGO = DAGO, Lnames = Lnames, trueA = trueA,
                      nsamp = nsamp, nsims = nsims)
    restab <- rbind(restab, c(N = nsamp, resSc))
  }
  restab
}
Lnames <- c("LC1")
covnm <- c("Confounder(s) only", rep("", 2))
restab_1 <- cbind(covnm, runsim(Lnames, DAGO.sc1))
Lnames <- c("LC1", "LO1", "LO2", "LO3")
covnm <- c("Confounder(s) &", "risk factors", rep("", 1))
restab_2 <- cbind(covnm, runsim(Lnames, DAGO.sc1))
restab <- rbind(restab_1, restab_2)
col1name <- "Covariates in $P(A|L)$"
colnames(restab)[1] <- col1name
save(list = "restab", file = "vignette_dat/restabSc1_all_1Ksims.Rdata")
@
<>=
library("Hmisc")
load(file = "vignette_dat/restabSc1_all_1Ksims.Rdata")
cat("\n")
latex(restab, file = "", where = "!htpb", caption.loc = 'bottom',
      caption = "Replication of the simulation results from \\citet{lefebvre2008} for Scenario 1.",
      label = 'tab2Lefebvre', booktabs = TRUE, rowname = NULL, landscape = FALSE,
      col.just = c("l", rep("r", 5)), size = "small")
@
<>=
cat("\n")
latex(lefebvreT2, file = "", where = "!htpb", caption.loc = 'bottom',
      caption = "Simulation results for Scenario 1 as reported in Table II of \\citet{lefebvre2008}.",
      label = 'origtab2Lefebvre', booktabs = TRUE, rowname = NULL, landscape = FALSE,
      col.just = c("l", rep("r", 5)), size = "small")
@
Next, using the same approach described above, we replicate the simulation results for Scenario 3 reported in Table IV of \citet{lefebvre2008}.
We start by defining the structural equation model specified for data simulation according to Scenario 3 in \citet{lefebvre2008} as follows:
<>=
`%+%` <- function(a, b) paste0(a, b)
Lnames <- c("LO1", "LO2", "LO3", "LE1", "LE2", "LE3", "LC1", "LC2", "LC3")
D <- DAG.empty()
for (Lname in Lnames) {
  D <- D +
    node(Lname%+%".norm1", distr = "rnorm") +
    node(Lname%+%".norm2", distr = "rnorm")
}
@
<>=
coefAi <- c(-0.10, -0.20, -0.30)
sdLNi <- c(sqrt(1), sqrt(5), sqrt(10))
for (i in (1:3)) {
  D <- D +
    node("LO"%+%i, t = 0:1, distr = "rbivNorm", whichbiv = t + 1, mu = 0,
         params = list(norms = "c(LO"%+%i%+%".norm1, LO"%+%i%+%".norm2)")) +
    node("LE"%+%i, t = 0:1, distr = "rbivNorm", whichbiv = t + 1, mu = 0,
         var1 = 1, var2 = 1, rho = 0.7,
         params = list(norms = "c(LE"%+%i%+%".norm1, LE"%+%i%+%".norm2)")) +
    node("LC"%+%i, t = 0:1, distr = "rbivNorm", whichbiv = t + 1,
         mu = {if (t == 0) {0} else {.(coefAi[i]) * A[t-1]}},
         params = list(norms = "c(LC"%+%i%+%".norm1, LC"%+%i%+%".norm2)")) +
    node("LN"%+%i, t = 0:1, distr = "rnorm", mean = 0, sd = .(sdLNi[i]))
}
D <- D +
  node("alpha", t = 0:1, distr = "rconst",
       const = {if (t == 0) {log(0.6)} else {log(1.0)}}) +
  node("A", t = 0:1, distr = "rbern",
       prob = plogis(alpha[t] +
                     log(5) * LC1[t] + log(2) * LC2[t] + log(1.5) * LC3[t] +
                     log(5) * LE1[t] + log(2) * LE2[t] + log(1.5) * LE3[t] +
                     {if (t == 0) {0} else {log(5) * A[t-1]}})) +
  node("Y", t = 1, distr = "rnorm",
       mean = 0.98 * LO1[t] + 0.58 * LO2[t] + 0.33 * LO3[t] +
              0.98 * LC1[t] + 0.58 * LC2[t] + 0.33 * LC3[t] - 0.39 * A[t],
       sd = 1)
DAGO.sc3 <- set.DAG(D)
@
As in Scenario 1, we then define the same four actions on the new \code{DAG} object before defining and evaluating the causal target parameter of interest. We note that our results match the true values of the MSM coefficients reported in Table IV of \citet{lefebvre2008}:
<>=
Dact.sc3 <- defAct(DAGO.sc3)
msm.form <- "Y ~ S(abar[0]) + S(abar[1])"
Dact.sc3 <- set.targetMSM(Dact.sc3, outcome = "Y", t = 1,
                          form = msm.form, family = "gaussian")
repstudy2.sc3.truetarget <- function() {
  trueMSMreps.sc3 <- NULL
  reptrue <- 50
  for (i in (1:reptrue)) {
    res.sc3.i <- eval.target(Dact.sc3, n = 500000)$coef
    trueMSMreps.sc3 <- rbind(trueMSMreps.sc3, res.sc3.i)
  }
  return(trueMSMreps.sc3)
}
f2name <- "vignette_dat/trueMSMreps.sc3.Rdata"
if (file.exists(f2name)) {
  load(f2name)
} else {
  trueMSMreps.sc3 <- repstudy2.sc3.truetarget()
  save(list = "trueMSMreps.sc3", file = f2name)
}
(trueMSM.sc3 <- apply(trueMSMreps.sc3, 2, mean))
@
Finally, using the \proglang{R} code provided in the appendix, we replicate in Table~\ref{tab4Lefebvre} the IPW estimation results for Scenario 3 presented originally in Table IV of \citet{lefebvre2008}. We note that our simulation results closely match those in Table~\ref{origtab4Lefebvre}, which were originally reported by \citet{lefebvre2008}.\\
<>=
trueA <- c(A_0 = -0.316, A_1 = -0.390)
nsims <- 10000
restab <- NULL
runsim <- function(Lnames, DAGO) {
  for (nsamp in c(300, 1000, 10000)) {
    resSc <- runMSMsw(DAGO = DAGO, Lnames = Lnames, trueA = trueA,
                      nsamp = nsamp, nsims = nsims)
    restab <- rbind(restab, c(N = nsamp, resSc))
  }
  restab
}
Lnames <- c("LC1", "LC2", "LC3")
covnm <- c("Confounder(s) only", rep("", 2))
restab_1 <- cbind(covnm, runsim(Lnames, DAGO.sc3))
Lnames <- c("LO1", "LO2", "LO3", "LC1", "LC2", "LC3")
covnm <- c("Confounder(s) &", "risk factors", "")
restab_2 <- cbind(covnm, runsim(Lnames, DAGO.sc3))
Lnames <- c("LE1", "LE2", "LE3", "LC1", "LC2", "LC3")
covnm <- c("Confounder(s) &", "IVs", "")
restab_3 <- cbind(covnm, runsim(Lnames, DAGO.sc3))
Lnames <- c("LO1", "LO2", "LO3", "LE1", "LE2", "LE3", "LC1", "LC2", "LC3")
covnm <- c("Confounder(s),", "IVs & risk factors", "")
restab_4 <- cbind(covnm, runsim(Lnames, DAGO.sc3))
Lnames <- c("LE1", "LE2", "LE3", "LC1")
covnm <- c("Mis-specified", rep("", 2))
restab_5 <- cbind(covnm, runsim(Lnames, DAGO.sc3))
Lnames <- c("LO1", "LO2", "LO3", "LE1", "LE2", "LE3",
            "LC1", "LC2", "LC3", "LN1", "LN2", "LN3")
covnm <- c("Full Model", rep("", 2))
restab_6 <- cbind(covnm, runsim(Lnames, DAGO.sc3))
restab <- rbind(restab_1, restab_2, restab_3, restab_4, restab_5, restab_6)
col1name <- "Covariates in $P(A|L)$"
colnames(restab)[1] <- col1name
save(list = "restab", file = "vignette_dat/restabSc3_all_1Ksims.Rdata")
@
<>=
library("Hmisc")
load(file = "vignette_dat/restabSc3_all_1Ksims.Rdata")
cat("\n")
latex(restab, file = "", where = "!htpb", caption.loc = 'bottom',
      caption = "Replication of the simulation results from \\citet{lefebvre2008} for Scenario 3.",
      label = 'tab4Lefebvre', booktabs = TRUE, rowname = NULL, landscape = FALSE,
      col.just = c("l", rep("r", 5)), size = "small")
@
<>=
cat("\n")
latex(lefebvreT4, file = "", where = "!htpb", caption.loc = 'bottom',
      caption = "Simulation results for Scenario 3 as reported in Table IV of \\citet{lefebvre2008}.",
      label = 'origtab4Lefebvre', booktabs = TRUE, rowname = NULL, landscape = FALSE,
      col.just = c("l", rep("r", 5)), size = "small")
@
\newpage{}
%*********************************************************************
\section{Discussion}\label{secdiscus}
\addcontentsline{toc}{section}{Discussion}
We demonstrated that the \pkg{simcausal} \proglang{R} package is a flexible tool that facilitates the conduct of transparent and reproducible simulation studies to evaluate causal inference methodologies. The package allows the user to simulate complex longitudinal data structures based on structural equation models using a novel interface which allows concise and intuitive expression of complex functional dependencies for a large number of nodes. The package allows the user to specify and simulate counterfactual data under various interventions (e.g., static, dynamic, deterministic, or stochastic).
These interventions may represent exposures to treatment regimens, the occurrence or non-occurrence of right-censoring events, or of specific monitoring events. The package also enables the computation of a selected set of user-specified features of the distribution of the counterfactual data that represent common causal target parameters, such as, treatment-specific means, average treatment effects and coefficients from working marginal structural models. In addition, the package provides a flexible graphical component that produces plots of directed acyclic graphs for observed or post-intervention data generating distributions.\\
We demonstrated the functionality of the package with a single time point intervention simulation study in Section~\ref{simstudy.pt} and a complex multiple time point simulation study in Section~\ref{simstudy.timevar}. We also showed two real-world applications of the \pkg{simcausal} package in Sections~\ref{repsimstudy1} and \ref{repsimstudy2}: first, by replicating results of the simulation study by \citet{neugebauer2014,neugebauer2015} that evaluated the comparative performance of two estimation procedures, and second, by replicating results of the simulation study by \citet{lefebvre2008} that evaluated the impact of model misspecification of the treatment mechanism on IPW inferences about MSM coefficients.\\
Finally, we acknowledge that the \pkg{simcausal} package is in the early stages of its development and that implementation of additional functionalities in future releases of the package should further expand its utility for methods research. Among such possible improvements is the evaluation of additional causal parameters, e.g., the average treatment effect on the treated \citep{holland1986,imbens2004,ATETshpitser}, survivorship causal effects \citep{joffe2007defining,greene2013balanced} and direct/indirect effects \citep{PearlDirInd,petersen2006,vanderweele2009,VanderWeele2014,hafeman2011}. Additionally, future versions of the \pkg{simcausal} package may allow simulation of non-iid observations, to support the study of causal inference methodologies for data resulting from experiments with more complex sampling methodologies, e.g., survey-based sampled data \citep{sarndal2003model} or network-based sampled (dependent) data \citep{eckles2014design}.
\newpage{}
%*********************************************************************
\section*{Acknowledgments}
\textbf{FUNDING ACKNOWLEDGEMENT}: This study was partially funded through internal operational funds provided by the Kaiser Permanente Center for Effectiveness \& Safety Research (CESR). This work was also partially supported through a Patient-Centered Outcomes Research Institute (PCORI) Award (ME-1403-12506) and an NIH grant (R01 AI074345-07).\\
\textbf{DISCLAIMER}: All statements in this report, including its findings and conclusions, are solely those of the authors and do not necessarily represent the views of the Patient-Centered Outcomes Research Institute (PCORI), its Board of Governors or Methodology Committee.
\newpage{}
% \bibliographystyle{unsrt}
\bibliography{SimCausal_2014,R-Pckgs}
% doesn't work:
% \addcontentsline{toc}{section}{References}
% \bibliography{}
\newpage{}
\appendix
\section*{Source code}
% doesn't work:
% \addcontentsline{toc}{section}{Source code}
\subsection{Source code for obtaining and plotting MSM survival curves}
<>=
@
\newpage{}
\subsection{Source code for replicating the simulation study in Neugebauer et al., 2014}
\emph{Source code for the function that creates inputs for the \code{ltmleMSM} function of the \pkg{ltmle} package \citep{R-ltmle}.}
<>=
@
\emph{Source code for setting up some of the parameters of the \code{ltmleMSM} function of the \pkg{ltmle} package \citep{R-ltmle}.}
<>=
@
\emph{Source code for running the data simulation and estimation with the \code{ltmleMSM} function of the \pkg{ltmle} package \citep{R-ltmle}.}
<>=
@
<>=
@
\newpage{}
\subsection{Source code for replicating the simulation study in Lefebvre et al., 2008}
\emph{Source code for replicating the IPW estimator used by \citet{lefebvre2008}.}
<>=
@
\emph{Source code for recreating Tables II and IV from \citet{lefebvre2008}.}
<>=
@
\emph{Source code for replicating the simulation results for Scenario 1 in \citet{lefebvre2008}.}
<>=
@
<>=
@
<>=
@
\emph{Source code for replicating the simulation results for Scenario 3 in \citet{lefebvre2008}.}
<>=
@
<>=
@
<>=
@
\end{document}