Part 5 Guide to existing packages

5.1 RDS package + service multiplier

  • This package as far as I can see only offers prevalence (proportion estimation), so it is not suitable for estimation of hidden population size

  • That said it can be used for estimation of denominator for service multiplier method as per (Johnston et al. 2013)

  • Below are examples of data structure required for estimation


library(RDS)

data(faux)
as_tibble(faux)

# RDS::RDS.I.estimates(rds.data=faux,outcome.variable='X')
# RDS::RDS.I.estimates(rds.data=faux,outcome.variable='X',smoothed=TRUE)
# 
# RDS::RDS.II.estimates(rds.data=faux,outcome.variable='X')
# RDS::RDS.II.estimates(rds.data=faux,outcome.variable='X',subset= Y!="blue")

data(fauxmadrona)
as_tibble(fauxmadrona)


# RDS::RDS.SS.estimates(rds.data=fauxmadrona,outcome.variable="disease",N=1000)

5.2 sspse package

  • This package implements approach proposed in (Handcock, Gile, and Mar 2014) and allows for population size estimation
  • Estimation is done using Bayesian model described in the paper

library(sspse)

N0 <- 200
n <- 100
K <- 10

# Create probabilities for a Waring distribution 
# with scaling parameter 3 and mean 5, but truncated at K=10.
probs <- c(0.33333333,0.19047619,0.11904762,0.07936508,0.05555556,
           0.04040404,0.03030303,0.02331002,0.01831502,0.01465201)
probs <- probs / sum(probs)

#
# Create a sample
#
set.seed(1)
pop<-sample(1:K, size=N0, replace = TRUE, prob = probs)
s<-sample(pop, size=n, replace = FALSE, prob = pop)
 
# Here interval=1 so that it will run faster. It should be higher in a 
# real application.
out <- sspse:: posteriorsize(s=s,interval=1)
plot(out, HPD.level=0.9,data=pop[s])
summary(out, HPD.level=0.9)
# Let's look at some MCMC diagnostics
plot(out, HPD.level=0.9,mcmc=TRUE)

5.3 chords package

  • This package implements approach based on timing of enrollment proposed in (Berchenko, Rosenblatt, and Frost 2017) and allows for average degree and population size estimation

  • The estimation can be done using MLE, parametric, leave-\(d\)-out resampling, and maximum-a-posteriori (MAP) with Jeffreys priors methods among others


library(chords)

chords::initializeRdsObject()

data(brazil)
rds.object2<- initializeRdsObject(brazil)
see <- function(x) plot(x$estimates$Nk.estimates, type='h')

# Maximum likelihood
rds.object <- Estimate.b.k(rds.object = rds.object2, type = "jeffreys")
(rds.object$estimates$Nk.estimates)

see(rds.object)

5.4 networkreporting package

  • This package implements NSUM method by (Salganik et al. 2011) and allows for both degree and population size estimation
  • Guide to estimation can be found here and standard errors are calculated using bootstrap

library(networkreporting)

head(example.survey)

kp.vec <- df.to.kpvec(example.knownpop.dat, kp.var="known.popn", kp.value="size")

kp.vec

# total size of the population
tot.pop.size <- 10e6

d.hat <- 
  suppressWarnings(kp.degree.estimator(survey.data = example.survey,
                                       known.popns = kp.vec/10e6,
                                       total.popn.size = 1,
                                       missing="complete.obs"))


nsum.estimator(survey.data=example.survey,
               d.hat.vals=d.hat,
               total.popn.size=1,
               killworth.se = TRUE,
               y.vals="idu",
               missing="complete.obs")

# idu.est <- 
#   surveybootstrap::bootstrap.estimates(## this describes the sampling design of the
#     ## survey; here, the PSUs are given by the
#     ## variable cluster, and the strata are given
#     ## by the variable region
#     survey.design = ~ cluster + strata(region),
#     ## the number of bootstrap resamples to obtain
#     ## (NOTE: in practice, you should use more than 100.
#     ##  this keeps building the package relatively fast)
#     num.reps=100,
#     ## this is the name of the function
#     ## we want to use to produce an estimate
#     ## from each bootstrapped dataset
#     estimator.fn="nsum.estimator",
#     ## these are the sampling weights
#     weights="indweight",
#     ## this is the name of the type of bootstrap
#     ## we wish to use
#     bootstrap.fn="rescaled.bootstrap.sample",
#     ## our dataset
#     survey.data=example.survey,
#     ## other parameters we need to pass
#     ## to the nsum.estimator function
#     d.hat.vals=d.hat,
#     total.popn.size=tot.pop.size,
#     y.vals="idu",
#     missing="complete.obs")

5.5 NSUM package


library(NSUM)

## Killworth

## load data
data(McCarty)

## simulate from model with barrier effects
sim.bar <- with(McCarty, nsum.simulate(100, known, unknown, N, model="barrier",
                                       mu, sigma, rho))

## estimate unknown population sizes
dat.bar <- sim.bar$y
NK.killworth <- with(McCarty, killworth(dat.bar, known, N))

## Maltiel

## load data
data(McCarty)

## simulate from model with barrier effects
sim.bar <- with(McCarty, nsum.simulate(100, known, unknown, N, model="barrier", 
                                       mu, sigma, rho))

## estimate unknown population size
dat.bar <- sim.bar$y
mcmc <- with(McCarty, nsum.mcmc(dat.bar, known, N, model="barrier", iterations=100, 
                                burnin=50))

## view posterior distribution of subpopulation sizes for the first subpopulation
hist(mcmc$NK.values[1,])

## view posterior distribution of barrier effect parameters for the first subpopulation
hist(mcmc$rho.values[1,])

5.6 Rcapture package

  • Allows for capture-recapture population size estimation for closed and open populations [Will we be using this method?]

library(Rcapture)

# hare data set
hare.closedp <- closedp.t(hare)
hare.closedp
boxplot(hare.closedp)
# Third primary period of mvole data set
period3 <- mvole[, 11:15]
closedp.t(period3)
# BBS2001 data set
BBS.closedp <- closedp.0(BBS2001, dfreq = TRUE, dtype = "nbcap", t = 50, t0 = 20)
BBS.closedp
plot(BBS.closedp)
### Seber (1982) p.107
# When there is 2 capture occasions, the heterogeneity models cannot be fitted
X <- matrix(c(1,1,167,1,0,781,0,1,254), byrow = TRUE, ncol = 3)
closedp.t(X, dfreq = TRUE)
### Example of captures in continuous time
# Illegal immigrants data set
closedp.0(ill, dtype = "nbcap", dfreq = TRUE, t = Inf)

Service Multiplier

Berchenko, Yakir, Jonathan D. Rosenblatt, and Simon D. W. Frost. 2017. “Modeling and Analyzing Respondent-Driven Sampling as a Counting Process.” Biometrics 73 (4): 1189–98. https://doi.org/10.1111/biom.12678.

Handcock, Mark S., Krista J. Gile, and Corinne M. Mar. 2014. “Estimating Hidden Population Size Using Respondent-Driven Sampling Data.” Electronic Journal of Statistics 8 (1): 1491–1521. https://doi.org/10.1214/14-EJS923.

Johnston, Lisa G., Dimitri Prybylski, H. Fisher Raymond, Ali Mirzazadeh, Chomnad Manopaiboon, and Willi McFarland. 2013. “Incorporating the Service Multiplier Method in Respondent-Driven Sampling Surveys to Estimate the Size of Hidden and Hard-to-Reach Populations: Case Studies from Around the World.” Sexually Transmitted Diseases 40 (4): 304310. https://doi.org/10.1097/OLQ.0b013e31827fd650.

Salganik, Matthew J., Dimitri Fazito, Neilane Bertoni, Alexandre H. Abdo, Maeve B. Mello, and Francisco I. Bastos. 2011. “Assessing Network Scale-up Estimates for Groups Most at Risk of Hiv/Aids: Evidence from a Multiple-Method Study of Heavy Drug Users in Curitiba, Brazil.” American Journal of Epidemiology 174 (10): 11901196.