################################################################################################# 
# Provides Code for Creating Different Confounding Situations and to Generate Data			#
# Created by: Andrea Cook, Robert Wellman, and Tracey Marsh for FDA Sentinel PRISM Project	#
# Date: November 30, 2010												#
#################################################################################################


#################################################################################################
#fun.scena: One Binary Confounders (sex: Z1~binary(0.5)							#
#		and Multiple Sites determined by length of Site_Dist						#
#	N: Number of Observations											#
#	Site_Dist: vector of the proportion of observations per site					#
#output: list with Z (matrix of one confounder) and S (matrix of number of sites-1 			#
#		indicator variables											#
#################################################################################################
fun.scena<-function(N,Site_Dist=c(1/3,1/3,1/3))
{
	#confounders
	z1<-rbinom(N,1,0.5)
	Z<-cbind(z1)

	#site distribution
	tmp<-site.gen(N,site.p=Site_Dist)
	S<-tmp$S

  return(list(Z=Z, S=S))
}

#################################################################################################
#fun.scenb: One Continous Confounder (10yr Age: z1~(unif(35,65)-50)/10))				#
#		and Multiple Sites determined by length of Site_Dist						#
#	N: Number of Observations											#
#	Site_Dist: vector of the proportion of observations per site					#
#output: list with Z (matrix of one confounder) and S (matrix of number of sites-1 			#
#		indicator variables											#
#################################################################################################
fun.scenb<-function(N,Site_Dist=c(1/3,1/3,1/3))
{
	#confounders
 	Z1_0<-runif(N,35,65)
  	z1<-(Z1_0-50)/10
  	Z<-z1

	#site distribution
	tmp<-site.gen(N,site.p=Site_Dist)
	S<-tmp$S

  return(list(Z=Z, S=S))
}

#################################################################################################
#fun.scenc: Two Confounders (sex: Z1~binary(0.5); 10yr Age: Z2~(unif(35,65)-50)/10)			#
#		and Multiple Sites determined by length of Site_Dist						#
#	N: Number of Observations											#
#	Site_Dist: vector of the proportion of observations per site					#
#output: list with Z (matrix of two confounders) and S (matrix of number of sites-1 		#
#		indicator variables											#
#################################################################################################
fun.scenc<-function(N,Site_Dist=c(1/3,1/3,1/3))
{
  z1<-rbinom(N,1,0.5)
  Z2_0<-runif(N,35,65)
  z2<-(Z2_0-50)/10
  Z<-cbind(z1,z2)

  #site distribution
  tmp<-site.gen(N,site.p=Site_Dist)
  S<-tmp$S

  return(list(Z=Z, S=S))
}

#################################################################################################
#fun.scend: Three Confounders (sex: z1~binary(0.5); 								#
#		10yr Age: z2~(unif(35,65)-50)/10; Obesity: z3~multinomial(0.45, 0.35, 0.20)		#
#		and Multiple Sites determined by length of Site_Dist						#
#	N: Number of Observations											#
#	Site_Dist: vector of the proportion of observations per site					#
#output: list with Z (matrix of two confounders) and S (matrix of number of sites-1 		#
#		indicator variables											#
#################################################################################################
fun.scend<-function(N,Site_Dist=c(1/3,1/3,1/3))
{
  z1<-rbinom(N,1,0.5)
  Z2_0<-runif(N,35,65)
  z2<-(Z2_0-50)/10
  z3<-model.matrix(~factor(rep(c(0,1,2), times = rmultinom(1,N,c(0.45,0.35,0.20)))))[,2:3]
  z3.vars<-paste(rep('z3_',dim(z3)[[2]]),as.character(1:dim(z3)[[2]]),sep="")
  colnames(z3)<-z3.vars

  Z<-cbind(z1,z2,z3)

  #site distribution
  tmp<-site.gen(N,site.p=Site_Dist)
  S<-tmp$S

  return(list(Z=Z, S=S))
}

###Create site indicators
site.gen<-function(N,site.p=c(1/3,1/3,1/3))
{
	s0<-sample(0:(length(site.p)-1),size=N,replace=T,prob=site.p)
	
	# Creates indicator matrix of sites #
	s.mat<-model.matrix(~factor(s0))[,-1]
	colnames(s.mat)<-paste(rep('s',dim(s.mat)[[2]]),as.character(1:dim(s.mat)[[2]]),sep="")
	S<-cbind(s.mat)

	return(list(S=S))
}
	
#################################################################################################
## simrun : Function to run a simulation evaluation of a single scenario   				#
#input:	Nobs: number of total observations									#
#		LTime: a vector of times of each look (length 1 if no sequential monitoring)		#
#		confound.fun: name of the function specifying confounder distribution (examples:	#
#			fun.scena, fun.scenb, fun.scenc, fun.scend	 					#
#		Beta_y: a vector of the log(OR) denoting the strength of confounding between		# 
#			z (confounders) and s (sites as indicator variables) on outcome			#
#		Beta_x: a vector of the log(OR) denoting the strength of confounding between		# 
#			z (confounders) and s (sites as indicator variables) on x				#
#		Site_Dist: a vector of the proportion of observations per site (e.g. c(1/3,1/3,1/3))#
#		py0: marginal probability of outcome, y, if x=0 P(Y|X=0)					#
#		py1: marginal probability of outcome, y, if x=1 P(Y|X=1)					#
#		px: marginal probability of x across all sites							#
#		Nsim: number of simulations to use in the permutation test (larger is better)		#
#		sumoutput: If True then provides summary output such as power, 				#
#			mean time to signal (sigtime), and mean time to study end (studyendtime). 	#
#			If false then provides all simulations results for each row if they		#
#			signaled or not (signal) and minimum of end of study or time of signal 		#
#			(endtime)												#
#output: sumoutput=T a list that returns py0, py1, RD (risk difference), px, Beta_y, Beta_x,	#
#		Site_Dist, outputIPW (power, sigtime, studyendtime) for unstratified IPW, 		#
#		outputIPW_s (power, sigtime, studyendtime) for stratified IPW estimator.		#
#	   sumoutput=F a list that returns py0, py1, RD (risk difference), px, Beta_y, Beta_x,	#
#		Site_Dist, outputIPWall (signal, endtime) for unstratified IPW,		 		#
#		outputIPW_sall (signal,endtime) for stratified IPW estimator.				#  
#################################################################################################
simrun<-function(Nobs,LTime,confound.fun=fun.scenc,Beta_y,Beta_x,Site_Dist,py0,py1,px,Nsim=1500,sumoutput=T)
{
	Beta_y<-as.matrix(Beta_y)
	Beta_x<-as.matrix(Beta_x)

	params<-calc_params(Nsim=1000, Nset=Nobs,confound.gen=confound.fun,Beta_y=Beta_y,Beta_x=Beta_x,Site_Dist=Site_Dist,
				tgt_p0=py0,tgt_p1=py1,tgt_px=px)
	
	outputIPWtemp<-NULL
	outputIPWtemp_s<-NULL
	for(i in 1:Nsim)
	{
		#Start date is uniform across study
		start<-floor(runif(Nobs,1,max(LTime)+1))
	
		#generate counfounders and sort according to start day
		confounders<-confound.fun(N=Nobs, Site_Dist=Site_Dist)
		z<-confounders$Z[order(start),]
		S<-confounders$S[order(start),]
		zs<-cbind(z,S)
		start<-start[order(start)]
	
		#generate individual exposures and outcomes for desired scenario#
		#DOI flag
		x<-rbinom(Nobs,1,1/(1+exp(- params$x0 - zs%*%Beta_x)))
		#Outcome
		y<-rbinom(Nobs,1,1/(1+exp(- params$y0 - params$y1*x - zs%*%Beta_y)))
	
		s<-as.numeric(apply(S,1,sum)==0)
		for(i in 1:ncol(S))
		{
			s[S[,i]==1]<-i+1
		}
		res<-IPWseq(y,x,z,s,start,LTime,alpha=0.05,delta=0.5,nperm=1500)
		outputIPWtemp<-rbind(outputIPWtemp,c(signal=res$signal,endtime=res$endtime))
		outputIPWtemp_s<-rbind(outputIPWtemp_s,c(signal=res$signal_s,endtime=res$endtime_s))
	}
	#Summary 
	if(sumoutput)
	{
		outputIPW<-cbind(power=mean(outputIPWtemp[,1]),sigtime=mean(outputIPWtemp[outputIPWtemp[,1]==1,2]),
				studyendtime=mean(outputIPWtemp[,2]))
		outputIPW_s<-cbind(power=mean(outputIPWtemp_s[,1]),sigtime=mean(outputIPWtemp_s[outputIPWtemp_s[,1]==1,2]),
				studyendtime=mean(outputIPWtemp_s[,2]))
		return(list(py0=py0,py1=py1,RD=py1-py0,px=px,Beta_y=Beta_y,Beta_x=Beta_x,Site_Dist=Site_Dist,
			outputIPW=outputIPW,outputIPW_s=outputIPW_s))
	}
	else
	{
		return(list(py0=py0,py1=py1,RD=py1-py0,px=px,Beta_y=Beta_y,Beta_x=Beta_x,Site_Dist=Site_Dist,outputIPWall=outputIPWtemp,
			outputIPW_sall=outputIPWtemp_s))
	}
}



#################################################################################################
## datagen : Function to a single dataset for a given scenario		   				#
#input:	Nobs: number of total observations									#
#		LTime: a vector of times of each look (length 1 if no sequential monitoring)		#
#		confound.fun: name of the function specifying confounder distribution (examples:	#
#			fun.scena, fun.scenb, fun.scenc, fun.scend	 					#
#		Beta_y: a vector of the log(OR) denoting the strength of confounding between		# 
#			z (confounders) and s (sites as indicator variables) on outcome			#
#		Beta_x: a vector of the log(OR) denoting the strength of confounding between		# 
#			z (confounders) and s (sites as indicator variables) on x				#
#		Site_Dist: a vector of the proportion of observations per site (e.g. c(1/3,1/3,1/3))#
#		py0: marginal probability of outcome, y, if x=0 P(Y|X=0)					#
#		py1: marginal probability of outcome, y, if x=1 P(Y|X=1)					#
#		px: marginal probability of x across all sites							#
#		studylength: length of time of study (e.g. 720 for a two year study)			#
#		Nsim: number of simulations to use to find parameters to get correct marginal probs	#
#output: a list with elements y (outcome), x (predictor of interest), start (time subject		#
#		enters study based on uniform distribution), z (matrix of confounders, and		# 
#		s (vector of unique site indicators)						 		#  
#################################################################################################
datagen<-function(Nobs,confound.fun=fun.scenc,Beta_y,Beta_x,Site_Dist,py0,py1,px,studylength=720,Nsim=1000)
{
	Beta_y<-as.matrix(Beta_y)
	Beta_x<-as.matrix(Beta_x)

	params<-calc_params(Nsim=Nsim, Nset=Nobs,confound.gen=confound.fun,Beta_y=Beta_y,Beta_x=Beta_x,Site_Dist=Site_Dist,
				tgt_p0=py0,tgt_p1=py1,tgt_px=px)

	#single time use = one day exposure - no need to define end day of exposure yet
	start<-floor(runif(Nobs,1,studylength+1))
	
	#generate counfounders and sort according to start day
	confounders<-confound.fun(N=Nobs, Site_Dist=Site_Dist)
	z<-confounders$Z[order(start),]
	S<-confounders$S[order(start),]
	zs<-cbind(z,S)
	start<-start[order(start)]
	
	#generate individual exposures and outcomes for desired scenario#
	#DOI flag
	x<-rbinom(Nobs,1,1/(1+exp(- params$x0 - zs%*%Beta_x)))
	#Outcome
	y<-rbinom(Nobs,1,1/(1+exp(- params$y0 - params$y1*x - zs%*%Beta_y)))
	
	s<-as.numeric(apply(S,1,sum)==0)
	for(i in 1:ncol(S))
	{
		s[S[,i]==1]<-i+1
	}

	return(list(y=y,x=x,start=start,z=z,s=s))
}