Part 5 Guide to existing packages
5.1 RDS
package + service multiplier
This package as far as I can see only offers prevalence (proportion estimation), so it is not suitable for estimation of hidden population size
That said it can be used for estimation of denominator for service multiplier method as per (Johnston et al. 2013)
Below are examples of data structure required for estimation
library(RDS)
data(faux)
as_tibble(faux)
# RDS::RDS.I.estimates(rds.data=faux,outcome.variable='X')
# RDS::RDS.I.estimates(rds.data=faux,outcome.variable='X',smoothed=TRUE)
#
# RDS::RDS.II.estimates(rds.data=faux,outcome.variable='X')
# RDS::RDS.II.estimates(rds.data=faux,outcome.variable='X',subset= Y!="blue")
data(fauxmadrona)
as_tibble(fauxmadrona)
# RDS::RDS.SS.estimates(rds.data=fauxmadrona,outcome.variable="disease",N=1000)
5.2 sspse
package
- This package implements approach proposed in (Handcock, Gile, and Mar 2014) and allows for population size estimation
- Estimation is done using Bayesian model described in the paper
library(sspse)
N0 <- 200
n <- 100
K <- 10
# Create probabilities for a Waring distribution
# with scaling parameter 3 and mean 5, but truncated at K=10.
probs <- c(0.33333333,0.19047619,0.11904762,0.07936508,0.05555556,
0.04040404,0.03030303,0.02331002,0.01831502,0.01465201)
probs <- probs / sum(probs)
#
# Create a sample
#
set.seed(1)
pop<-sample(1:K, size=N0, replace = TRUE, prob = probs)
s<-sample(pop, size=n, replace = FALSE, prob = pop)
# Here interval=1 so that it will run faster. It should be higher in a
# real application.
out <- sspse:: posteriorsize(s=s,interval=1)
plot(out, HPD.level=0.9,data=pop[s])
summary(out, HPD.level=0.9)
# Let's look at some MCMC diagnostics
plot(out, HPD.level=0.9,mcmc=TRUE)
5.3 chords
package
This package implements approach based on timing of enrollment proposed in (Berchenko, Rosenblatt, and Frost 2017) and allows for average degree and population size estimation
The estimation can be done using MLE, parametric, leave-\(d\)-out resampling, and maximum-a-posteriori (MAP) with Jeffreys priors methods among others
library(chords)
chords::initializeRdsObject()
data(brazil)
rds.object2<- initializeRdsObject(brazil)
see <- function(x) plot(x$estimates$Nk.estimates, type='h')
# Maximum likelihood
rds.object <- Estimate.b.k(rds.object = rds.object2, type = "jeffreys")
(rds.object$estimates$Nk.estimates)
see(rds.object)
5.4 networkreporting
package
- This package implements NSUM method by (Salganik et al. 2011) and allows for both degree and population size estimation
- Guide to estimation can be found here and standard errors are calculated using bootstrap
library(networkreporting)
head(example.survey)
kp.vec <- df.to.kpvec(example.knownpop.dat, kp.var="known.popn", kp.value="size")
kp.vec
# total size of the population
tot.pop.size <- 10e6
d.hat <-
suppressWarnings(kp.degree.estimator(survey.data = example.survey,
known.popns = kp.vec/10e6,
total.popn.size = 1,
missing="complete.obs"))
nsum.estimator(survey.data=example.survey,
d.hat.vals=d.hat,
total.popn.size=1,
killworth.se = TRUE,
y.vals="idu",
missing="complete.obs")
# idu.est <-
# surveybootstrap::bootstrap.estimates(## this describes the sampling design of the
# ## survey; here, the PSUs are given by the
# ## variable cluster, and the strata are given
# ## by the variable region
# survey.design = ~ cluster + strata(region),
# ## the number of bootstrap resamples to obtain
# ## (NOTE: in practice, you should use more than 100.
# ## this keeps building the package relatively fast)
# num.reps=100,
# ## this is the name of the function
# ## we want to use to produce an estimate
# ## from each bootstrapped dataset
# estimator.fn="nsum.estimator",
# ## these are the sampling weights
# weights="indweight",
# ## this is the name of the type of bootstrap
# ## we wish to use
# bootstrap.fn="rescaled.bootstrap.sample",
# ## our dataset
# survey.data=example.survey,
# ## other parameters we need to pass
# ## to the nsum.estimator function
# d.hat.vals=d.hat,
# total.popn.size=tot.pop.size,
# y.vals="idu",
# missing="complete.obs")
5.5 NSUM
package
library(NSUM)
## Killworth
## load data
data(McCarty)
## simulate from model with barrier effects
sim.bar <- with(McCarty, nsum.simulate(100, known, unknown, N, model="barrier",
mu, sigma, rho))
## estimate unknown population sizes
dat.bar <- sim.bar$y
NK.killworth <- with(McCarty, killworth(dat.bar, known, N))
## Maltiel
## load data
data(McCarty)
## simulate from model with barrier effects
sim.bar <- with(McCarty, nsum.simulate(100, known, unknown, N, model="barrier",
mu, sigma, rho))
## estimate unknown population size
dat.bar <- sim.bar$y
mcmc <- with(McCarty, nsum.mcmc(dat.bar, known, N, model="barrier", iterations=100,
burnin=50))
## view posterior distribution of subpopulation sizes for the first subpopulation
hist(mcmc$NK.values[1,])
## view posterior distribution of barrier effect parameters for the first subpopulation
hist(mcmc$rho.values[1,])
5.6 Rcapture
package
- Allows for capture-recapture population size estimation for closed and open populations [Will we be using this method?]
library(Rcapture)
# hare data set
hare.closedp <- closedp.t(hare)
hare.closedp
boxplot(hare.closedp)
# Third primary period of mvole data set
period3 <- mvole[, 11:15]
closedp.t(period3)
# BBS2001 data set
BBS.closedp <- closedp.0(BBS2001, dfreq = TRUE, dtype = "nbcap", t = 50, t0 = 20)
BBS.closedp
plot(BBS.closedp)
### Seber (1982) p.107
# When there is 2 capture occasions, the heterogeneity models cannot be fitted
X <- matrix(c(1,1,167,1,0,781,0,1,254), byrow = TRUE, ncol = 3)
closedp.t(X, dfreq = TRUE)
### Example of captures in continuous time
# Illegal immigrants data set
closedp.0(ill, dtype = "nbcap", dfreq = TRUE, t = Inf)
Service Multiplier
Berchenko, Yakir, Jonathan D. Rosenblatt, and Simon D. W. Frost. 2017. “Modeling and Analyzing Respondent-Driven Sampling as a Counting Process.” Biometrics 73 (4): 1189–98. https://doi.org/10.1111/biom.12678.
Handcock, Mark S., Krista J. Gile, and Corinne M. Mar. 2014. “Estimating Hidden Population Size Using Respondent-Driven Sampling Data.” Electronic Journal of Statistics 8 (1): 1491–1521. https://doi.org/10.1214/14-EJS923.
Johnston, Lisa G., Dimitri Prybylski, H. Fisher Raymond, Ali Mirzazadeh, Chomnad Manopaiboon, and Willi McFarland. 2013. “Incorporating the Service Multiplier Method in Respondent-Driven Sampling Surveys to Estimate the Size of Hidden and Hard-to-Reach Populations: Case Studies from Around the World.” Sexually Transmitted Diseases 40 (4): 304310. https://doi.org/10.1097/OLQ.0b013e31827fd650.
Salganik, Matthew J., Dimitri Fazito, Neilane Bertoni, Alexandre H. Abdo, Maeve B. Mello, and Francisco I. Bastos. 2011. “Assessing Network Scale-up Estimates for Groups Most at Risk of Hiv/Aids: Evidence from a Multiple-Method Study of Heavy Drug Users in Curitiba, Brazil.” American Journal of Epidemiology 174 (10): 11901196.