This replication material is written in R markdown. The advantages of an R markdown write up are (a) it allows for nice commenting of the code which makes it easier to follow the analysis strategy and (b) the write up requires an actual running of the code and production of all material so there is a kind of live validation that everything works as it should. In addition output can be easily produced in different forms, as a webpage, pdf file, or word file. The raw code can be either copied or downloaded as a separate file (grab the Rmd file that produced this document here, a file with R code only is here).

1 Getting started

We use two packages. The MCMCpack has a useful function for drawing from a Dirichlet distribution. The rstan package is used for the main analysis. Note that rstan is a little tricky to install. There is help here for this.

Here’s the code to load up the packages.

rm(list=ls(all=TRUE))
library(MCMCpack)
library(rstan)

2 BIQQ functions

Now lets set up the BIQQ functions. First we set up a generic BIQQ model in rstan that handles two clues on b and d types. This model’s inputs are the X,Y,K data, recorded in the object XYKK and various parameters to capture priors. The vector alpha_prior gives the parameters for a Dirichlet distribution over the λs. Vectors pi_alpha and pi_beta record the α and β parameters for Beta distributions on the πs. Vectors q1bd1_alpha, q1bd1_beta record α and β parameters for Beta distributions on the first clue (each vector has two elements, one for ϕ1b1 and one for one for ϕ1d1). Similarly q2bd1_alpha q2bd1_beta recording α and β parameters for Beta distributions on the second clue (thus specifying priors over ϕ1b2 and ϕ1d2). This illustration differs from the baseline model by having up to two clues but only for a subset of cases: this part is seen in the defintion of the event probabilities w[5] and w[6].

stan.nr <- '
 data {
  int<lower=0> XYKK[6];       # summary data, providing number of XYK 
  
  vector[4] alpha_prior;      # initial hyperparameters for the Dirichlet distribution; provided as data
  vector[4] pi_alpha;         # initial hyperparameters for the beta distributions for pi_a, pi_b  etc
  vector[4] pi_beta ;         # initial hyperparameters for the beta distributions for pi_a, pi_b  etc
  vector[2] q1bd1_alpha ;     # initial hyperparameters for first clue (X=Y=1 cases only)  etc
  vector[2] q1bd1_beta ;      # initial hyperparameters for first clue (X=Y=1 cases only)  etc
  vector[2] q2bd1_alpha ;     # initial hyperparameters for second clue
  vector[2] q2bd1_beta ;      # initial hyperparameters for second clue
  
  }
  
 parameters {
  simplex[4]  abcd;               # Parameters of interest: share of causal types in population
  real<lower=0,upper=1> pi_a;     #  
  real<lower=0,upper=1> pi_b;     #  
  real<lower=0,upper=1> pi_c;     #  
  real<lower=0,upper=1> pi_d;     #  
  real<lower=0,upper=1> q1_b1;    #  
  real<lower=0,upper=1> q1_d1;    #  
  real<lower=0,upper=1> q2_b1;    #  
  real<lower=0,upper=1> q2_d1;    #  
  
  }

 transformed parameters {                        
  # Parameters determine the multinomial event probabilities
  # POssibilities are 00 01 10 110 1110 1111
  
  simplex[6] w;  
  
  w[1]<- (1-pi_b)*abcd[2]          +(1-pi_c)*abcd[3]          ;    #00- from b and c types - clue 1 not sought
  w[2]<- (1-pi_a)*abcd[1]          +(1-pi_d)*abcd[4]          ;    #01- from a and d types - clue 1 not sought
  w[3]<- pi_a*abcd[1]              +pi_c*abcd[3]              ;    #10- from a and c types - clue 1 not sought
  w[4]<- pi_b*abcd[2]*(1-q1_b1)     +pi_d*abcd[4]*(1-q1_d1)   ;    #110- from b and d types - clue 1  sought but not found
  w[5]<- pi_b*abcd[2]*(q1_b1)*(1-q2_b1) +pi_d*abcd[4]*(q1_d1)*(1-q2_d1) ; #1110- clue 1 found but not clue 2
  w[6]<- pi_b*abcd[2]*(q1_b1)*q2_b1     +pi_d*abcd[4]*(q1_d1)*q2_d1     ; #1111- clue 1 and clue 2 sought and found
  }

 model {
  abcd ~ dirichlet(alpha_prior);            # Priors for causal types        
  pi_a ~ beta(pi_alpha[1], pi_beta[1]);     # Priors for ps
  pi_b ~ beta(pi_alpha[2], pi_beta[2]);
  pi_c ~ beta(pi_alpha[3], pi_beta[3]);
  pi_d ~ beta(pi_alpha[4], pi_beta[4]);
  
  q1_b1 ~ beta(q1bd1_alpha[1], q1bd1_beta[1]);
  q1_d1 ~ beta(q1bd1_alpha[2], q1bd1_beta[2]);
  
  q2_b1 ~ beta(q2bd1_alpha[1], q2bd1_beta[1]);
  q2_d1 ~ beta(q2bd1_alpha[2], q2bd1_beta[2]);
  
  XYKK ~  multinomial(w);                   # Likelihood Part 1: (XY and w have 4 elements)
  
  }
'

We “initiate” the stan model by running it once and creating a basic model called sim.biqq.nr. Once it is run we can modify it.

sim.biqq.nr <- stan(model_code = stan.nr, 
                      data       = list(
                           XYKK         = c(  1,  1,  1, 
                                              1,  1,  1), 
                           alpha_prior = c(  1,  1,  1,  1), 
                           pi_alpha    = c(  1,  1,  1,  1), 
                           pi_beta     = c(  1,  1,  1,  1), 
                           q1bd1_alpha    = c(  1,  1), 
                           q1bd1_beta     = c(  1,  1), 
                           q2bd1_alpha    = c(  1,  1), 
                           q2bd1_beta     = c(  1,  1)), 
                      iter   = 500, 
                      chains = 4,
                      seed   = 1000
)

Now we will make a function that provides arbitrary data to the stan function. This is done by defining a function biqq.nr that modifies sim.biqq.nr using different data and priors. With this we will be able to quickly run models with different data and priors.

# BIQQ Function for drawing from posterior distribution using STAN
biqq.nr <- function(  fit         = sim.biqq.nr,                       
                      XYKK         = c(  61,  6,  64, 
                                         1,  1,  3), 
                      alpha_prior = c(  1,  1,  1,  1), 
                      pi_alpha    = c(  1,  1,  1,  1), 
                      pi_beta     = c(  1,  1,  1,  1), 
                      q1bd1_alpha    = c(  1,  1), 
                      q1bd1_beta     = c(  1,  1), 
                      q2bd1_alpha    = c(  1,  1), 
                      q2bd1_beta     = c(  1,  1), 
                      iter        = 20000, 
                      chains      = 4,
                      warmup      = 5000,
                      seed        = 1000
)  {
     
     data <- list(
          XYKK         = XYKK, 
          alpha_prior = alpha_prior, 
          pi_alpha    = pi_alpha, 
          pi_beta     = pi_beta, 
          q1bd1_alpha    = q1bd1_alpha, 
          q1bd1_beta     = q1bd1_beta, 
          q2bd1_alpha    = q2bd1_alpha, 
          q2bd1_beta     = q2bd1_beta 
     )
     #  suppressWarnings can also be added to reduce print out
     posterior <- suppressMessages(
          stan(fit     = fit, 
               data    = data, 
               iter    = iter, 
               chains  = chains,
               warmup  = warmup,
               verbose = FALSE,
               refresh = 0)
     )
         
     return(posterior)
}                    

We are now ready to go.

3 Run the Models

We run four versions of the analysis with different priors. Note in all cases we use a flat priors on the λs by setting alpha_prior = c(1,1,1,1) and in all those cases in which parameters for Beta distributions on ϕjx are not set they default to (1,1), for example q1bd1_alpha defaults to c(1,1) which corresponds to a uniform prior over ϕ check. You can do any of this differently by putting in different values for alpha_prior or for the various Beta distribution parameters.

KNN  <- biqq.nr()
KNY  <- biqq.nr(q2bd1_alpha    = c(10000,1),    q2bd1_beta     = c(1,10000))
KYN  <- biqq.nr(q1bd1_alpha    = c(10000,5000), q1bd1_beta     = c(1,5000))
KYY  <- biqq.nr(q1bd1_alpha    = c(10000,5000), q1bd1_beta     = c(1,5000),  
                q2bd1_alpha    = c(10000,1),    q2bd1_beta     = c(1,10000))

4 Report the output

Output can be reported in tables or graphs. Lets start with a table (corresponding to Table 6 in the paper).

stats = function(x1,x2){c(
  mean.ab  = mean(x1),
  sd.ab    = sd(x1),
  lower.ab = quantile(x1,probs = .025),
  upper.ab = quantile(x1,probs = 1-.025),
  mean.c   = mean(x2),
  sd.c     = sd(x2))
 }
  
row.makr <- function(out){
  d <- extract(out)  
  stats(d$abcd[,2]-d$abcd[,1],d$pi_c)
}

abcd  <- rdirichlet(n = 1000000,c(1,1,1,1)) # Draws from prior for first row: 
c.pi  <- rbeta(n = 1000000,1,1) 
row.1 <- stats(abcd[,2]-abcd[,1], c.pi)
row.2 <- row.makr(KNN)  # "Posteriors | Both clues weak"
row.3 <- row.makr(KNY)  # "Posteriors | Expert clue weak"
row.4 <- row.makr(KYN)  # "Posteriors | Mechanisms clue weak""
row.5 <- row.makr(KYY)  # "Posteriors | Both clues strong"

tab <- round(rbind(row.1,row.2,row.3,row.4,row.5),3)
row.names(tab) <- c("No clues", 
                    "K2 uninformative, K1 Uninformative", "K2 informative, K1 Uninformative", 
                    "K2 uninformative, K1 Informative", "K2 informative, K1 Informative")

Here’s how the table looks:

knitr::kable(tab, caption="Replication of Table 6.")
Replication of Table 6.
mean.ab sd.ab lower.ab.2.5% upper.ab.97.5% mean.c sd.c
No clues 0.000 0.316 -0.631 0.631 0.500 0.289
K2 uninformative, K1 Uninformative -0.018 0.195 -0.404 0.367 0.504 0.215
K2 informative, K1 Uninformative -0.006 0.193 -0.390 0.373 0.510 0.216
K2 uninformative, K1 Informative -0.015 0.195 -0.403 0.371 0.507 0.215
K2 informative, K1 Informative -0.009 0.193 -0.393 0.374 0.511 0.215

To graph the output we make a function which takes a model and a title and produces graphs of the form used in the paper.

gr.nr = function(Model, main =""){
  R <- extract(Model)
  s  <- seq(-1,1,.005)
  plot(R$abcd[,2]-R$abcd[,1],R$pi_c, col = rgb(.5,.5,0,.2), pch = 16, cex=.33, 
          xlab = paste("ate (posterior mean = ",round(mean(R$abcd[,2] - R$abcd[,1]),3),")"), 
          ylab = "Assignment propensity for c types", main=main)
     h = sapply(s, function(i)  sum((R$abcd[,2]-R$abcd[,1])> i & (R$abcd[,2]-R$abcd[,1])<=(i+.1)))
     lines(s,.5*h/max(h), lwd=2, col = rgb(.7,0,.1,.5))  
}

Now we will make a panel and do two of these plots (this corresponds to Figure 7 in the Supplementary Materials, with very small differences due to simulation error). Here’s how they look:

  par(mfrow=c(1,2))
  gr.nr(KNN, main = "Posteriors | Both clues weak")
  gr.nr(KYY, main = "Posteriors | Both clues strong")

5 Archive

The code below creates an R file from this Rmd file so that you can run it on its own. The output is saved as nr_replication.R (posted here).

library(knitr)
purl("nr_replication.Rmd", output = "nr_replication.R", documentation = 2)