# Written using Rversion 2.15.3 - security blanket
# Program by Rod Walker, Biostatistician, Group Health Research Institute
# Email: walker.rl@ghc.org
# Required R packages (not written by Walker)
library(osDesign)
library(ggplot2)
library(gridExtra)
# See osDesign documentation for detail on that package and two-phase analysis methods.
# Further, the osDesign package is geared for broader functionality in analysis of two-phase designs
# than this simulation tool. As such, it is recommended that the user review the osDesign package
# documentation to determine whether its suite of functions would better suit the user's needs.
# Also, it is recommended that the user of this simulation tool be familiar with simulations, R, and
# two-phase methodology, so as to understand the functionality, interpretation, and limitations of the code.
# Finally, feel free to alter or repurpose the code as needed.
# This simulation assumes a cohort study with binary exposure (X), outcome (Y), and 2 confounders (C1,C2).
# Assumes true relationship dictated by logistic regression model: logit(p) = b0 + bX*X + bC1*C1 + bC2*C2
# Y is assumed to be known at phase 1 and used in the formation of phase 1 strata.
# Options allow for knowledge of X, C1, C2, or surrogates (Xs, C1s, C2s) at phase 1.
# Phase 2 is asssumed to collect information on X, C1, and C2, using a balanced design.
# The goal of the simulation is to evaluate performance of a two-phase study design.
### BEGINNING OF INPUT SPECIFICATION
myseed <- 5743 # choose seed value to be able to reproduce simulation, if needed
set.seed(myseed)
tolFail <- log(1000) # exclusion tolerance
# NOTE: Some simulated data might yield unusally extreme model parameter estimates.
# If you want to exclude these extremes, you can set this tolFail value to a large number.
# For instance, setting tolFail=log(1000) tells the simulation to exclude any simulation trials
# that produce an estimate for bX, bC1, or bC2 that is outside the interval [-log(1000),log(1000)],
# i.e., it excludes if an odds ratio > 1000 or an odds ratio < 1/1000.
Ntrials <- 10000 # number of simulation trials to run
Ntot <- 150000 # Phase 1 sample size
pIIsampleN <- 250 # Phase 2 sample size
pY <- 0.01 # Outcome (Y) incidence
pX <- 0.20 # Exposure (X) prevalence
pC1 <- 0.10 # Confounder (C1) prevalence
pC2 <- 0.40 # Confounder (C2) prevalence
bX <- 1.00 # odds ratio for association between Y and X
bC1 <- 4.00 # odds ratio for association between Y and C1
bC2 <- 2.00 # odds ratio for association between Y and C2
aC1 <- 3.00 # odds ratio for association between X and C1
aC2 <- 2.00 # odds ratio for association between X and C2
sensX <- 1.00 # sensitivity of phase 1 proxy measure of X, if exists
specX <- 1.00 # specificity of phase 1 proxy measure of X, if exists
sensC1 <- 0.40 # sensitivity of phase 1 proxy measure of C1, if exists
specC1 <- 0.99 # specificity of phase 1 proxy measure of C1, if exists
sensC2 <- 1.00 # sensitivity of phase 1 proxy measure of C2, if exists
specC2 <- 1.00 # specificity of phase 1 proxy measure of C2, if exists
ph2method <- c("PL") # analysis method: "PL" for profile-likelihood or "WL" for weighted-likelihood
ph1svars <- c("X","C1s") # planned phase 1 stratification, in addition to Y
# NOTE: Do not include Y in ph1svars above, as this program already assumes
# it will be used in formation of the phase 1 strata. The possible variables
# that can be included are: "X" or "Xs", "C1" or "C1s", and "C2" or "C2s".
# The "s" versions of the variables are used to denote a phase 1 proxy measure.
# So, for instance, if phase 1 strata were going to be formed on the basis of
# known exposure X (in addition to outcome Y), then set ph1svars <- c("X").
# If phase 1 strata were going to be formed on the basis of known exposure X and
# known confounder C1 (in addition to outcome Y), then set ph1svars <- c("X","C1").
# If phase 1 strata were going to be formed on the basis of known exposure X and
# a proxy for confounder C1 (in addition to outcome Y), then set ph1svars <- c("X","C1s").
mypdf <- c("//HOME/walkrl3/MiniSentinel/DocumentSimulation.pdf") # path and name of output pdf report
# NOTE: This is the end of the required simulation inputs; however, you may need or want to adjust
# formatting, axes, etc. of the output tables and figures. Those can be adjusted in the
# last section of this program, the SUMMARIZING RESULTS section.
### END OF INPUT SPECIFICATION
### BEGINNING OF SIMULATION
# Setup phase 1 stratafication variables for proper formula syntax
strataForm <- as.formula(paste(c("Nobs", paste(c(ph1svars, "Y"), collapse="+")), collapse="~"))
NoYstrataForm <- as.formula(paste(c("Nobs", paste(ph1svars, collapse="+")), collapse="~"))
# Setup initial matrices and vectors to store simulation results
simsCoefSE <- matrix(NA,Ntrials,2)
colnames(simsCoefSE) <- c("logbX","se")
simsCIsW <- matrix(NA,Ntrials,2)
colnames(simsCIsW) <- c("ci_025","ci_975")
simsDatList <- as.list(rep(NA,Ntrials))
simsFail <- rep(0,Ntrials)
simsMaxCoef <- rep(NA,Ntrials)
# Setup alpha and beta parameters (log odds ratios) based on above inputs
alpha1 <- log(c(aC1, aC2))
beta1 <- log(c(bX, bC1, bC2))
# Setup initial design matrices used by program based on above inputs
var_grid <- expand.grid(X=c(0,1), C1=c(0,1), C2=c(0,1), Xs=c(0,1), C1s=c(0,1), C2s=c(0,1))
var_desmat <- as.matrix(cbind(Int=1,var_grid[,colnames(var_grid) %in% c('X','C1','C2')]))
var_desmatC <- as.matrix(cbind(Int=1,var_grid[,colnames(var_grid) %in% c('C1','C2')]))
var_grid$prev <- NA
var_grid <- within(var_grid, {
prev[C1 == 1] <- pC1
prev[C1 == 0] <- (1 - pC1)
prev[C1 == 1 & C1s == 1] <- prev[C1 == 1 & C1s == 1] * sensC1
prev[C1 == 1 & C1s == 0] <- prev[C1 == 1 & C1s == 0] * (1-sensC1)
prev[C1 == 0 & C1s == 0] <- prev[C1 == 0 & C1s == 0] * specC1
prev[C1 == 0 & C1s == 1] <- prev[C1 == 0 & C1s == 1] * (1-specC1)
prev[C2 == 1] <- prev[C2 == 1] * pC2
prev[C2 == 0] <- prev[C2 == 0] * (1 - pC2)
prev[C2 == 1 & C2s == 1] <- prev[C2 == 1 & C2s == 1] * sensC2
prev[C2 == 1 & C2s == 0] <- prev[C2 == 1 & C2s == 0] * (1-sensC2)
prev[C2 == 0 & C2s == 0] <- prev[C2 == 0 & C2s == 0] * specC2
prev[C2 == 0 & C2s == 1] <- prev[C2 == 0 & C2s == 1] * (1-specC2)
})
alpha1all <- c(beta0(alpha1, X=var_desmatC, N=var_grid$prev, rhoY=pX), alpha1)
prevX <- (exp(var_desmatC %*% alpha1all)/(1 + exp(var_desmatC %*% alpha1all)))
var_grid <- within(var_grid, {
prev[X == 1] <- prev[X == 1] * prevX[X == 1]
prev[X == 0] <- prev[X == 0] * (1 - prevX[X == 0])
prev[X == 1 & Xs == 1] <- prev[X == 1 & Xs == 1] * sensX
prev[X == 1 & Xs == 0] <- prev[X == 1 & Xs == 0] * (1-sensX)
prev[X == 0 & Xs == 0] <- prev[X == 0 & Xs == 0] * specX
prev[X == 0 & Xs == 1] <- prev[X == 0 & Xs == 1] * (1-specX)
})
beta1all <- c(beta0(beta1, X=var_desmat, N=var_grid$prev, rhoY=pY), beta1)
var_grid$probY <- (exp(var_desmat %*% beta1all)/(1 + exp(var_desmat %*% beta1all)))
var_grid$Nobs <- round(var_grid$prev * Ntot)
rm(prevX,var_desmatC)
# Trials start here
for(j in 0:Ntrials){
# Initialize data for simulation trial
Mdat <- var_grid
Mdat$NY1 <- 0
Mdat$NY0 <- 0
# In first iteration, compute expected number of cases and controls
# Otherwise, simulate number of cases and controls
if(j==0){
Mdat$NY1 <- round(Mdat$Nobs*Mdat$probY)
Mdat$NY0 <- Mdat$Nobs - Mdat$NY1
} else{
for(b in 1:nrow(Mdat)){
Mdat$NY1[b] <- rbinom(1,Mdat$Nobs[b],Mdat$probY[b])
Mdat$NY0[b] <- Mdat$Nobs[b] - Mdat$NY1[b]
}
}
Mdat <- rbind(cbind(Mdat,Y=0),cbind(Mdat,Y=1))
Mdat$Nobs <- NULL
Mdat$Nobs <- ifelse(Mdat$Y==1, Mdat$NY1, Mdat$NY0)
Mdat <- Mdat[,colnames(Mdat) %in% c('X','C1','C2','Xs','C1s','C2s','Y','Nobs')]
# Compute number of observations in phase 1 strata
aggMdat <- aggregate(formula=strataForm, data=Mdat, FUN=sum)
numstrata <- nrow(aggMdat)
aggMdat$strata <- 1:numstrata
# Compute the number of observations in each stratum that will be sampled for phase 2 data collection.
# We assume a balanced phase 2 design (i.e., attempt to select equal numbers from each stratum).
# If a stratum has less observations than were proposed to be sampled from that stratum,
# then the program will redistribute the surplus of the phase 2 sample to the other strata.
aggMdat$Ssize <- floor(pmin(pIIsampleN/nrow(aggMdat),aggMdat$Nobs))
extraN <- (pIIsampleN - sum(aggMdat$Ssize))
extraS <- sum((aggMdat$Nobs - aggMdat$Ssize)!=0)
while(extraN>=extraS){
aggMdat$Ssize <- aggMdat$Ssize + floor(pmin(extraN/extraS,aggMdat$Nobs - aggMdat$Ssize))
extraN <- (pIIsampleN - sum(aggMdat$Ssize))
extraS <- sum((aggMdat$Nobs - aggMdat$Ssize)!=0)
}
# In first iteration, stop here and store the expected phase 1 and 2 counts to display in table format for report
# Otherwise, continue with simulating the phase 2 data and analysis
if(j==0){
mystext <- tableGrob(aggMdat, show.rownames=FALSE)
} else{
# Put strata information into format in preparation for use of the tps function
NoYstrata <- aggregate(formula=NoYstrataForm, data=aggMdat, FUN=sum)
NoYstrata$tps_strata <- 1:nrow(NoYstrata)
NoYstrata$Nobs <- NULL
aggMdat <- merge(x=aggMdat, y=NoYstrata)
Controls <- aggMdat[aggMdat$Y==0,]
Cases <- aggMdat[aggMdat$Y==1,]
nn0_tps_strata <- Controls$Nobs[with(Controls, order(tps_strata))]
nn1_tps_strata <- Cases$Nobs[with(Cases, order(tps_strata))]
nII0_tps_strata <- Controls$Ssize[with(Controls, order(tps_strata))]
nII1_tps_strata <- Cases$Ssize[with(Cases, order(tps_strata))]
aggMdat$Nobs <- NULL
Mdat <- merge(x=Mdat, y=aggMdat)
# Simulate selection of the phase 2 sample
for(i in 1:numstrata){
Mdat$Sdrawn[Mdat$strata==i] <- rmvhyper(Mdat$Nobs[Mdat$strata==i],aggMdat$Ssize[aggMdat$strata==i])
}
# Store the sample that was generated for this trial of the simulation.
simsDatList[[j]] <- Mdat[Mdat$Nobs!=0,]
# Expand dataset to create one record per phase 2 observation
Mdat <- Mdat[rep(seq_len(nrow(Mdat)), Mdat$Sdrawn), 1:ncol(Mdat)]
# Perform phase 2 analysis using weighted likelihood approach
ModFit <- suppressWarnings(tps(formula=Y ~ X + C1 + C2, data=Mdat,
nn0=nn0_tps_strata,
nn1=nn1_tps_strata,
group=Mdat$tps_strata,
method=ph2method,
cohort=TRUE))
# Store estimated regression coefficient of X, i.e., the log odds ratio (bX)
# for the adjusted association between Y and X, along with the standard error.
# Also, store the Wald-based 95% confidence interval for this log odds ratio.
if(ph2method=='WL'){
tempCoefSE <- cbind(ModFit$coef,sqrt(diag(ModFit$cove)))[2,]
} else{
tempCoefSE <- cbind(ModFit$coef,sqrt(diag(ModFit$covm)))[2,]
}
tempCIsW <- c(tempCoefSE[1]-1.96*tempCoefSE[2],tempCoefSE[1]+1.96*tempCoefSE[2])
tempMaxCoef <- max(abs(ModFit$coef[-1]))
if(tempMaxCoef>tolFail){
simsFail[j] <- 1
simsMaxCoef[j] <- tempMaxCoef
} else{
simsCoefSE[j,] <- tempCoefSE
simsCIsW[j,] <- tempCIsW
}
rm(tempCoefSE,tempCIsW,tempMaxCoef)
}
print(j)
}
### END OF SIMULATION
### BEGINNING OF SUMMARIZING RESULTS
# Compute bias and MSE (on logOR and OR scales), as well as coverage probability
Bias_logbX <- mean(simsCoefSE[,1]-beta1all[2],na.rm=TRUE)
MSE_logbX <- mean((simsCoefSE[,1]-beta1all[2])^2,na.rm=TRUE)
Bias_bX <- mean(exp(simsCoefSE[,1])-exp(beta1all[2]),na.rm=TRUE)
MSE_bX <- mean((exp(simsCoefSE[,1])-exp(beta1all[2]))^2,na.rm=TRUE)
CoverW <- mean((simsCIsW[,1] < beta1all[2]) & (beta1all[2] < simsCIsW[,2]),na.rm=TRUE)
# Create histogram (i.e., sampling distribution) of the point estimates (for the odds ratio
# of the association between Y and X) generated across all simulated trials,
# along with the average (on the odds ratio scale).
# You may need or want to adjust axes for display in the figure below.
hplotdata <- as.data.frame(exp(simsCoefSE[,1]))
colnames(hplotdata) <- c("OR")
myhplot <- ggplot(hplotdata, aes(x = OR)) +
geom_histogram() +
xlim(0.0,2.0) +
xlab("Estimated OR (between Y and X)") +
ylab("Frequency") +
ggtitle(paste("ORs from all simulated trials, Avg =", round(mean(exp(simsCoefSE[,1]),na.rm=TRUE),2)))
# Create figure showing a sample (e.g., 25) of the 95% CIs (for the odds ratio of the association
# between Y and X) generated from the simulated trials, along with the average width
# of the CIs across all simulated trials (on the odds ratio scale).
# You may need or want to adjust axes for display in the figure below.
avgCIlength <- mean(exp(simsCIsW[,2])-exp(simsCIsW[,1]),na.rm=TRUE)
Ndisp <- 25
fplotdata <- as.data.frame(exp(cbind(simsCoefSE[1:Ndisp,1], simsCIsW[1:Ndisp,1], simsCIsW[1:Ndisp,2])))
colnames(fplotdata) <- c("y","ylo","yhi")
fplotdata$x <- seq(1:Ndisp)
forestplotCUST <- function(d, avelen, xlab="Estimated OR (between Y and X)", ylab="Simulation trial"){
require(ggplot2)
p <- ggplot(d, aes(x=x, y=y, ymin=ylo, ymax=yhi)) +
geom_pointrange() +
ylim(0,2.5) +
coord_flip() +
geom_hline(aes(yintercept=1), lty=2) +
ylab(xlab) +
xlab(ylab) +
ggtitle(paste("Sample CIs, Avg Width =",round(avelen,2)))
return(p)}
myfplot <- forestplotCUST(fplotdata,avgCIlength)
# Create table showing the proportion of the 95% CIs (for the odds ratio of the association
# between Y and X) across simulated trial that exclude certain values (e.g., 1.0 to 1.5).
# You may need or want to change the values shown in the table below.
counter <- 1
vals <- seq(1,1.5,by=0.1)
coveragevals <- matrix(NA,length(vals),2)
colnames(coveragevals) <- c("OR","Prop_CIs_Excluding")
for(val in vals){
coverageX <- ((log(val) > simsCIsW[,1]) & (log(val) < simsCIsW[,2]))
excludeX <- (1 - mean(coverageX,na.rm=TRUE))
coveragevals[counter,] <- c(val,round(excludeX,2))
counter <- counter + 1
}
coveragevals
myctext <- tableGrob(coveragevals, show.rownames=FALSE)
# Display summary of bias and mean squared error for the estimate of the confounder adjusted
# outcome/exposure association on the log odds (log_bX) and odds ratio (bX) scales.
# Display 95% Wald confidence interval coverage probability for this parameter.
# Display proportion of simulated trials for which extreme estimates were excluded due to exceeding tolFail.
SimResults <- as.vector(round(c(log(bX), Bias_logbX, MSE_logbX,
bX, Bias_bX, MSE_bX,
CoverW,
sum(simsFail)/Ntrials),4))
names(SimResults) <- c("True_logbX", "Bias_logbX", "MSE_logbX",
"True_bX", "Bias_bX", "MSE_bX",
"Wald95Coverage",
"pctFailed")
print(SimResults)
# Generate pdf containing the tables and figures summarizing simulation results.
pdf(mypdf,width=8,height=12)
grid.arrange(mystext,myhplot,myfplot,myctext, nrow=2, ncol=2)
dev.off()
### END OF SUMMARIZING RESULTS