This replication material is written in R markdown. The advantages of an R markdown write up are (a) it allows for nice commenting of the code which makes it easier to follow the analysis strategy and (b) the write up requires an actual running of the code and production of all material so there is a kind of live validation that everything works as it should. In addition output can be easily produced in different forms, as a webpage, pdf file, or word file. The raw code can be either copied or downloaded as a separate file (grab the Rmd file that produced this document here, a file with R code only is here).

# 1 Getting started

We use two packages. The MCMCpack has a useful function for drawing from a Dirichlet distribution. The rstan package is used for the main analysis. Note that rstan is a little tricky to install. There is help here for this.

Here’s the code to load up the packages.

rm(list=ls(all=TRUE))
library(MCMCpack)
library(rstan)

# 2 BIQQ functions

Now lets set up the BIQQ functions. First we set up a generic BIQQ model in rstan that handles two clues on $b$$b$ and $d$$d$ types. This model’s inputs are the $X,Y,K$$X,Y,K$ data, recorded in the object XYKK and various parameters to capture priors. The vector alpha_prior gives the parameters for a Dirichlet distribution over the $\lambda$$\lambda$s. Vectors pi_alpha and pi_beta record the $\alpha$$\alpha$ and $\beta$$\beta$ parameters for Beta distributions on the $\pi$$\pi$s. Vectors q1bd1_alpha, q1bd1_beta record $\alpha$$\alpha$ and $\beta$$\beta$ parameters for Beta distributions on the first clue (each vector has two elements, one for ${\varphi }_{1b}^{1}$$\phi^1_{1b}$ and one for one for ${\varphi }_{1d}^{1}$$\phi^1_{1d}$). Similarly q2bd1_alpha q2bd1_beta recording $\alpha$$\alpha$ and $\beta$$\beta$ parameters for Beta distributions on the second clue (thus specifying priors over ${\varphi }_{1b}^{2}$$\phi^2_{1b}$ and ${\varphi }_{1d}^{2}$$\phi^2_{1d}$). This illustration differs from the baseline model by having up to two clues but only for a subset of cases: this part is seen in the defintion of the event probabilities w[5] and w[6].

stan.nr <- '
data {
int<lower=0> XYKK[6];       # summary data, providing number of XYK

vector[4] alpha_prior;      # initial hyperparameters for the Dirichlet distribution; provided as data
vector[4] pi_alpha;         # initial hyperparameters for the beta distributions for pi_a, pi_b  etc
vector[4] pi_beta ;         # initial hyperparameters for the beta distributions for pi_a, pi_b  etc
vector[2] q1bd1_alpha ;     # initial hyperparameters for first clue (X=Y=1 cases only)  etc
vector[2] q1bd1_beta ;      # initial hyperparameters for first clue (X=Y=1 cases only)  etc
vector[2] q2bd1_alpha ;     # initial hyperparameters for second clue
vector[2] q2bd1_beta ;      # initial hyperparameters for second clue

}

parameters {
simplex[4]  abcd;               # Parameters of interest: share of causal types in population
real<lower=0,upper=1> pi_a;     #
real<lower=0,upper=1> pi_b;     #
real<lower=0,upper=1> pi_c;     #
real<lower=0,upper=1> pi_d;     #
real<lower=0,upper=1> q1_b1;    #
real<lower=0,upper=1> q1_d1;    #
real<lower=0,upper=1> q2_b1;    #
real<lower=0,upper=1> q2_d1;    #

}

transformed parameters {
# Parameters determine the multinomial event probabilities
# POssibilities are 00 01 10 110 1110 1111

simplex[6] w;

w[1]<- (1-pi_b)*abcd[2]          +(1-pi_c)*abcd[3]          ;    #00- from b and c types - clue 1 not sought
w[2]<- (1-pi_a)*abcd[1]          +(1-pi_d)*abcd[4]          ;    #01- from a and d types - clue 1 not sought
w[3]<- pi_a*abcd[1]              +pi_c*abcd[3]              ;    #10- from a and c types - clue 1 not sought
w[4]<- pi_b*abcd[2]*(1-q1_b1)     +pi_d*abcd[4]*(1-q1_d1)   ;    #110- from b and d types - clue 1  sought but not found
w[5]<- pi_b*abcd[2]*(q1_b1)*(1-q2_b1) +pi_d*abcd[4]*(q1_d1)*(1-q2_d1) ; #1110- clue 1 found but not clue 2
w[6]<- pi_b*abcd[2]*(q1_b1)*q2_b1     +pi_d*abcd[4]*(q1_d1)*q2_d1     ; #1111- clue 1 and clue 2 sought and found
}

model {
abcd ~ dirichlet(alpha_prior);            # Priors for causal types
pi_a ~ beta(pi_alpha[1], pi_beta[1]);     # Priors for ps
pi_b ~ beta(pi_alpha[2], pi_beta[2]);
pi_c ~ beta(pi_alpha[3], pi_beta[3]);
pi_d ~ beta(pi_alpha[4], pi_beta[4]);

q1_b1 ~ beta(q1bd1_alpha[1], q1bd1_beta[1]);
q1_d1 ~ beta(q1bd1_alpha[2], q1bd1_beta[2]);

q2_b1 ~ beta(q2bd1_alpha[1], q2bd1_beta[1]);
q2_d1 ~ beta(q2bd1_alpha[2], q2bd1_beta[2]);

XYKK ~  multinomial(w);                   # Likelihood Part 1: (XY and w have 4 elements)

}
'

We “initiate” the stan model by running it once and creating a basic model called sim.biqq.nr. Once it is run we can modify it.

sim.biqq.nr <- stan(model_code = stan.nr,
data       = list(
XYKK         = c(  1,  1,  1,
1,  1,  1),
alpha_prior = c(  1,  1,  1,  1),
pi_alpha    = c(  1,  1,  1,  1),
pi_beta     = c(  1,  1,  1,  1),
q1bd1_alpha    = c(  1,  1),
q1bd1_beta     = c(  1,  1),
q2bd1_alpha    = c(  1,  1),
q2bd1_beta     = c(  1,  1)),
iter   = 500,
chains = 4,
seed   = 1000
)

Now we will make a function that provides arbitrary data to the stan function. This is done by defining a function biqq.nr that modifies sim.biqq.nr using different data and priors. With this we will be able to quickly run models with different data and priors.

# BIQQ Function for drawing from posterior distribution using STAN
biqq.nr <- function(  fit         = sim.biqq.nr,
XYKK         = c(  61,  6,  64,
1,  1,  3),
alpha_prior = c(  1,  1,  1,  1),
pi_alpha    = c(  1,  1,  1,  1),
pi_beta     = c(  1,  1,  1,  1),
q1bd1_alpha    = c(  1,  1),
q1bd1_beta     = c(  1,  1),
q2bd1_alpha    = c(  1,  1),
q2bd1_beta     = c(  1,  1),
iter        = 20000,
chains      = 4,
warmup      = 5000,
seed        = 1000
)  {

data <- list(
XYKK         = XYKK,
alpha_prior = alpha_prior,
pi_alpha    = pi_alpha,
pi_beta     = pi_beta,
q1bd1_alpha    = q1bd1_alpha,
q1bd1_beta     = q1bd1_beta,
q2bd1_alpha    = q2bd1_alpha,
q2bd1_beta     = q2bd1_beta
)
#  suppressWarnings can also be added to reduce print out
posterior <- suppressMessages(
stan(fit     = fit,
data    = data,
iter    = iter,
chains  = chains,
warmup  = warmup,
verbose = FALSE,
refresh = 0)
)

return(posterior)
}                    

We are now ready to go.

# 3 Run the Models

We run four versions of the analysis with different priors. Note in all cases we use a flat priors on the $\lambda$$\lambda$s by setting alpha_prior = c(1,1,1,1) and in all those cases in which parameters for Beta distributions on ${\varphi }_{jx}$$\phi_{jx}$ are not set they default to $\left(1,1\right)$$(1,1)$, for example q1bd1_alpha defaults to c(1,1) which corresponds to a uniform prior over $\varphi$$\phi$ check. You can do any of this differently by putting in different values for alpha_prior or for the various Beta distribution parameters.

KNN  <- biqq.nr()
KNY  <- biqq.nr(q2bd1_alpha    = c(10000,1),    q2bd1_beta     = c(1,10000))
KYN  <- biqq.nr(q1bd1_alpha    = c(10000,5000), q1bd1_beta     = c(1,5000))
KYY  <- biqq.nr(q1bd1_alpha    = c(10000,5000), q1bd1_beta     = c(1,5000),
q2bd1_alpha    = c(10000,1),    q2bd1_beta     = c(1,10000))

# 4 Report the output

Output can be reported in tables or graphs. Lets start with a table (corresponding to Table 6 in the paper).

stats = function(x1,x2){c(
mean.ab  = mean(x1),
sd.ab    = sd(x1),
lower.ab = quantile(x1,probs = .025),
upper.ab = quantile(x1,probs = 1-.025),
mean.c   = mean(x2),
sd.c     = sd(x2))
}

row.makr <- function(out){
d <- extract(out)
stats(d$abcd[,2]-d$abcd[,1],d$pi_c) } abcd <- rdirichlet(n = 1000000,c(1,1,1,1)) # Draws from prior for first row: c.pi <- rbeta(n = 1000000,1,1) row.1 <- stats(abcd[,2]-abcd[,1], c.pi) row.2 <- row.makr(KNN) # "Posteriors | Both clues weak" row.3 <- row.makr(KNY) # "Posteriors | Expert clue weak" row.4 <- row.makr(KYN) # "Posteriors | Mechanisms clue weak"" row.5 <- row.makr(KYY) # "Posteriors | Both clues strong" tab <- round(rbind(row.1,row.2,row.3,row.4,row.5),3) row.names(tab) <- c("No clues", "K2 uninformative, K1 Uninformative", "K2 informative, K1 Uninformative", "K2 uninformative, K1 Informative", "K2 informative, K1 Informative") Here’s how the table looks: knitr::kable(tab, caption="Replication of Table 6.") Replication of Table 6. mean.ab sd.ab lower.ab.2.5% upper.ab.97.5% mean.c sd.c No clues 0.000 0.316 -0.631 0.631 0.500 0.289 K2 uninformative, K1 Uninformative -0.018 0.195 -0.404 0.367 0.504 0.215 K2 informative, K1 Uninformative -0.006 0.193 -0.390 0.373 0.510 0.216 K2 uninformative, K1 Informative -0.015 0.195 -0.403 0.371 0.507 0.215 K2 informative, K1 Informative -0.009 0.193 -0.393 0.374 0.511 0.215 To graph the output we make a function which takes a model and a title and produces graphs of the form used in the paper. gr.nr = function(Model, main =""){ R <- extract(Model) s <- seq(-1,1,.005) plot(R$abcd[,2]-R$abcd[,1],R$pi_c, col = rgb(.5,.5,0,.2), pch = 16, cex=.33,
xlab = paste("ate (posterior mean = ",round(mean(R$abcd[,2] - R$abcd[,1]),3),")"),
ylab = "Assignment propensity for c types", main=main)
h = sapply(s, function(i)  sum((R$abcd[,2]-R$abcd[,1])> i & (R$abcd[,2]-R$abcd[,1])<=(i+.1)))
lines(s,.5*h/max(h), lwd=2, col = rgb(.7,0,.1,.5))
}

Now we will make a panel and do two of these plots (this corresponds to Figure 7 in the Supplementary Materials, with very small differences due to simulation error). Here’s how they look:

  par(mfrow=c(1,2))
gr.nr(KNN, main = "Posteriors | Both clues weak")
gr.nr(KYY, main = "Posteriors | Both clues strong")

# 5 Archive

The code below creates an R file from this Rmd file so that you can run it on its own. The output is saved as nr_replication.R (posted here).

library(knitr)
purl("nr_replication.Rmd", output = "nr_replication.R", documentation = 2)