Skip to content

Commit

Permalink
WIP of adding Beaumont-Emond estimator as an option for generalized r…
Browse files Browse the repository at this point in the history
…eplication methods.
  • Loading branch information
bschneidr committed Sep 9, 2024
1 parent 7e8dadd commit 868e776
Show file tree
Hide file tree
Showing 9 changed files with 153 additions and 32 deletions.
7 changes: 7 additions & 0 deletions R/fays_generalized_replication.R
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,8 @@ make_fays_gen_rep_factors <- function(
#' \item\strong{"SD2"}: \cr The circular successive-differences variance estimator described by Ash (2014).
#' This estimator is the basis of the "successive-differences replication" estimator commonly used
#' for variance estimation for systematic sampling.
#' \item\strong{"Beaumont-Emond"}: \cr The variance estimator of Beaumont and Emond (2022)
#' for multistage unequal-probability sampling without replacement.
#' }
#' @param aux_var_names (Only used if \code{variance_estimator = "Deville-Tille")}.
#' A vector of the names of auxiliary variables used in sampling.
Expand Down Expand Up @@ -367,6 +369,11 @@ make_fays_gen_rep_factors <- function(
#' - Ash, S. (2014). "\emph{Using successive difference replication for estimating variances}."
#' \strong{Survey Methodology}, Statistics Canada, 40(1), 47–59.
#' \cr \cr
#' - Beaumont, J.-F.; Émond, N. (2022).
#' "\emph{A Bootstrap Variance Estimation Method for Multistage Sampling and Two-Phase Sampling When Poisson Sampling Is Used at the Second Phase}."
#' \strong{Stats}, \emph{5}: 339–357.
#' https://doi.org/10.3390/stats5020019
#' \cr \cr
#' - Deville, J.‐C., and Tillé, Y. (2005). "\emph{Variance approximation under balanced sampling.}"
#' \strong{Journal of Statistical Planning and Inference}, 128, 569–591.
#' \cr \cr
Expand Down
78 changes: 52 additions & 26 deletions R/quadratic_forms.R
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@
#' for variance estimation for systematic sampling.
#' \item \strong{"Deville-Tille"}: The estimator of Deville and Tillé (2005),
#' developed for balanced sampling using the cube method.
#' \item \strong{"Beaumont-Emond"}: The variance estimator of Beaumont and Emond (2022)
#' for multistage unequal-probability sampling without replacement.
#' }
#' @param probs Required if \code{variance_estimator} equals \code{"Deville-1"}, \code{"Deville-2"}, or \code{"Breidt-Chauvet"}.
#' This should be a matrix or data frame of sampling probabilities.
Expand Down Expand Up @@ -83,6 +85,7 @@
#' | SD2 | | | Required | Optional | Optional | Required | |
#' | Deville-1 | Required | | Required | Optional | | | |
#' | Deville-2 | Required | | Required | Optional | | | |
#' | Beaumont-Emond | Required | | Required | Optional | | | |
#' | Deville-Tille | Required | | Required | Optional | | | Required |
#' | Yates-Grundy | | Required | | | | | |
#' | Horvitz-Thompson | | Required | | | | | |
Expand Down Expand Up @@ -178,7 +181,7 @@ make_quad_form_matrix <- function(variance_estimator = "Yates-Grundy",
accepted_variance_estimators <- c(
"Yates-Grundy", "Horvitz-Thompson",
"Ultimate Cluster", "Stratified Multistage SRS",
"SD1", "SD2", "Deville-1", "Deville-2", "Deville-Tille"
"SD1", "SD2", "Deville-1", "Deville-2", "Beaumont-Emond", "Deville-Tille"
)

if (length(variance_estimator) > 1) {
Expand All @@ -205,20 +208,20 @@ make_quad_form_matrix <- function(variance_estimator = "Yates-Grundy",

# Check inputs and assemble all necessary information
# for estimators of stratified/clustered designs
if (variance_estimator %in% c("Stratified Multistage SRS", "Ultimate Cluster", "SD1", "SD2", "Deville-1", "Deville-2", "Deville-Tille")) {
if (variance_estimator %in% c("Stratified Multistage SRS", "Ultimate Cluster", "SD1", "SD2", "Deville-1", "Deville-2", "Deville-Tille", "Beaumont-Emond")) {

use_sparse_matrix <- TRUE

# Ensure the minimal set of inputs is supplied
if (variance_estimator %in% c("Stratified Multistage SRS", "Ultimate Cluster", "Deville-1", "Deville-2", "Deville-Tille")) {
if (variance_estimator %in% c("Stratified Multistage SRS", "Ultimate Cluster", "Deville-1", "Deville-2", "Deville-Tille", "Beaumont-Emond")) {
if (is.null(cluster_ids) || is.null(strata_ids)) {
sprintf(
"For `variance_estimator='%s'`, must supply a matrix or data frame to both `strata_ids` and `cluster_ids`",
variance_estimator
) |> stop()
}
}
if (variance_estimator %in% c("Deville-1", "Deville-2", "Deville-Tille")) {
if (variance_estimator %in% c("Deville-1", "Deville-2", "Deville-Tille", "Beaumont-Emond")) {
if (is.null(probs)) {
sprintf(
"For `variance_estimator='%s'`, must supply a matrix or data frame to `probs`.",
Expand All @@ -232,7 +235,7 @@ make_quad_form_matrix <- function(variance_estimator = "Yates-Grundy",
stop("For `variance_estimator='Stratified Multistage SRS'`, must supply a matrix or data frame to `strata_pop_sizes`.")
}
}
if (variance_estimator %in% c("SD1", "SD2", "Deville-1", "Deville-2", "Deville-Tille")) {
if (variance_estimator %in% c("SD1", "SD2", "Deville-1", "Deville-2", "Deville-Tille", "Beaumont-Emond")) {
if (is.null(cluster_ids)) {
sprintf(
"For `variance_estimator='%s'`, must supply a matrix or data frame to `cluster_ids`",
Expand Down Expand Up @@ -355,7 +358,7 @@ make_quad_form_matrix <- function(variance_estimator = "Yates-Grundy",
}
}

if (variance_estimator %in% c("Stratified Multistage SRS", "Deville-1", "Deville-2", "Deville-Tille")) {
if (variance_estimator %in% c("Stratified Multistage SRS", "Deville-1", "Deville-2", "Deville-Tille", "Beaumont-Emond")) {
quad_form_matrix <- Matrix::Matrix(data = 0,
nrow = number_of_ultimate_units,
ncol = number_of_ultimate_units) |>
Expand Down Expand Up @@ -404,7 +407,7 @@ make_quad_form_matrix <- function(variance_estimator = "Yates-Grundy",
prev_stages_samp_prob <- prev_samp_fraction
}

if (variance_estimator %in% c("Deville-1", "Deville-2")) {
if (variance_estimator %in% c("Deville-1", "Deville-2", "Beaumont-Emond")) {

# Get quadratic form at current stage
current_cluster_ids <- cluster_ids[stratum_indices, stage, drop = TRUE]
Expand Down Expand Up @@ -639,16 +642,17 @@ make_srswor_matrix <- function(n, f = 0) {
#' \itemize{
#' \item "Deville-1"
#' \item "Deville-2"
#' \item "Beaumont-Emond"
#' }

#' @return A symmetric matrix whose dimension matches the length of \code{probs}.
#' @details
#' These variance estimators have been shown to be effective
#' @section Deville's Estimators:
#' The "Deville-1" and "Deville-2" approximations have been shown to be effective
#' for designs that use a fixed sample size with a high-entropy sampling method.
#' This includes most PPSWOR sampling methods,
#' but unequal-probability systematic sampling is an important exception.
#'
#' These variance estimators generally take the following form:
#' Deville's variance estimators generally take the following form:
#' \deqn{
#' \hat{v}(\hat{Y}) = \sum_{i=1}^{n} c_i (\breve{y}_i - \frac{1}{\sum_{i=k}^{n}c_k}\sum_{k=1}^{n}c_k \breve{y}_k)^2
#' }
Expand Down Expand Up @@ -677,6 +681,17 @@ make_srswor_matrix <- function(n, f = 0) {
#' Horvitz-Thompson and Yates-Grundy variance estimators.
#' In the case of simple random sampling without replacement (SRSWOR),
#' these estimators are identical to the usual Horvitz-Thompson variance estimator.
#'
#' @section Beaumont-Emond Estimator:
#' Beaumont and Emond (2022) proposed a variance estimator for unequal probability
#' sampling without replacement. This estimator is simply the Horvitz-Thompson
#' variance estimator with the following approximation for the joint inclusion
#' probabilities.
#' \deqn{
#' \pi_{kl} \approx \pi_k \pi_l \frac{n - 1}{(n-1) + \sqrt{(1-\pi_k)(1-\pi_l)}}
#' }
#' In the case of cluster sampling, this approximation should be
#' applied to the clusters rather than the units within clusters.
#'
#' @references
#' Matei, Alina, and Yves Tillé. 2005.
Expand All @@ -690,23 +705,34 @@ make_ppswor_approx_matrix <- function(probs, method = "Deville-1") {
n <- length(probs)
one_minus_pi <- 1 - probs

if (method == "Deville-1") {
c_k <- (1 - probs) * (n/(n-1))
}
if (method == "Deville-2") {
c_k <- (1 - probs) / (
1 - sum( (one_minus_pi/sum(one_minus_pi))^2 )
)
if (method %in% c("Deville-1", "Deville-2")) {
if (method == "Deville-1") {
c_k <- (1 - probs) * (n/(n-1))
}
if (method == "Deville-2") {
c_k <- (1 - probs) / (
1 - sum( (one_minus_pi/sum(one_minus_pi))^2 )
)
}

c_sum <- sum(c_k)

if ((n == 1) || (c_sum == 0)) {
Sigma <- Matrix::Matrix(0, nrow = length(c_k), ncol = length(c_k))
} else {
Sigma <- outer(c_k, -c_k) / c_sum
diag(Sigma) <- c_k*(1 - c_k/c_sum)
Sigma <- as(Sigma, "symmetricMatrix")
}
}

c_sum <- sum(c_k)

if ((n == 1) || (c_sum == 0)) {
Sigma <- Matrix::Matrix(0, nrow = length(c_k), ncol = length(c_k))
} else {
Sigma <- outer(c_k, -c_k) / c_sum
diag(Sigma) <- c_k*(1 - c_k/c_sum)
Sigma <- as(Sigma, "symmetricMatrix")
if (method == "Beaumont-Emond") {
if ((n == 1)) {
Sigma <- Matrix::Matrix(0, n, n)
} else {
Sigma <- - tcrossprod(sqrt(one_minus_pi)) / (n-1)
diag(Sigma) <- one_minus_pi
Sigma <- as(Sigma, "symmetricMatrix")
}
}

return(Sigma)
Expand Down
8 changes: 5 additions & 3 deletions R/quadratic_forms_of_survey_design_objects.R
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@
#' \item\strong{"SD2"}: \cr The circular successive-differences variance estimator described by Ash (2014).
#' This estimator is the basis of the "successive-differences replication" estimator commonly used
#' for variance estimation for systematic sampling.
#' \item \strong{"Beaumont-Emond"}: The variance estimator of Beaumont and Emond (2022)
#' for multistage unequal-probability sampling without replacement.
#' }
#' @param ensure_psd If \code{TRUE} (the default), ensures
#' that the result is a positive semidefinite matrix. This
Expand Down Expand Up @@ -159,7 +161,7 @@ get_design_quad_form.survey.design <- function(design, variance_estimator,
"Yates-Grundy", "Horvitz-Thompson",
"Poisson Horvitz-Thompson",
"Ultimate Cluster", "Stratified Multistage SRS",
"SD1", "SD2", "Deville-1", "Deville-2", "Deville-Tille"
"SD1", "SD2", "Deville-1", "Deville-2", "Deville-Tille", "Beaumont-Emond"
)

if (is.null(variance_estimator)) {
Expand Down Expand Up @@ -225,7 +227,7 @@ get_design_quad_form.survey.design <- function(design, variance_estimator,
sort_order = NULL
)
}
if (variance_estimator %in% c("Deville-1", "Deville-2")) {
if (variance_estimator %in% c("Deville-1", "Deville-2", "Beaumont-Emond")) {
Sigma <- make_quad_form_matrix(
variance_estimator = variance_estimator,
cluster_ids = design$cluster,
Expand Down Expand Up @@ -283,7 +285,7 @@ get_design_quad_form.twophase2 <- function(design, variance_estimator,
"Yates-Grundy", "Horvitz-Thompson",
"Poisson Horvitz-Thompson",
"Ultimate Cluster", "Stratified Multistage SRS",
"SD1", "SD2", "Deville-1", "Deville-2", "Deville-Tille"
"SD1", "SD2", "Deville-1", "Deville-2", "Deville-Tille", "Beaumont-Emond"
)
accepted_phase2_estimators <- c(
"Ultimate Cluster", "Stratified Multistage SRS",
Expand Down
28 changes: 28 additions & 0 deletions R/variance-estimators.R
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,30 @@
#' of sampling fractions from earlier stages of sampling. For example, at a third stage of sampling,
#' the variance estimate from a third-stage stratum is multiplied by \eqn{\frac{n_1}{N_1}\frac{n_2}{N_2}},
#' which is the product of sampling fractions from the first-stage stratum and second-stage stratum.
#' @section Beaumont-Emond:
#' The \strong{"Beaumont-Emond"} variance estimator was proposed by Beaumont and Emond (2022),
#' intended for designs that use fixed-size, unequal-probability random sampling without replacement.
#' The variance estimator is simply the Horvitz-Thompson
#' variance estimator with the following approximation for the joint inclusion
#' probabilities.
#' \deqn{
#' \pi_{kl} \approx \pi_k \pi_l \frac{n - 1}{(n-1) + \sqrt{(1-\pi_k)(1-\pi_l)}}
#' }
#' In the case of cluster sampling, this approximation is
#' applied to the clusters rather than the units within clusters,
#' with \eqn{n} denoting the number of sampled clusters. and the probabilities \eqn{\pi}
#' referring to the cluster's sampling probability. For stratified samples,
#' the joint probability for units \eqn{k} and \eqn{l} in different strata
#' is simply the product of \eqn{\pi_k} and \eqn{\pi_l}.
#'
#' For multistage samples, this approximation is applied to the clusters at each stage, separately by stratum.
#' For later stages of sampling, the variance estimate from a stratum is multiplied by the product
#' of sampling probabilities from earlier stages of sampling. For example, at a third stage of sampling,
#' the variance estimate from a third-stage stratum is multiplied by \eqn{\pi_1 \times \pi_{(2 | 1)}},
#' where \eqn{\pi_1} is the sampling probability of the first-stage unit
#' and \eqn{\pi_{(2|1)}} is the sampling probability of the second-stage unit
#' within the first-stage unit.
#'
#' @section Deville 1 and Deville 2:
#' The \strong{"Deville-1"} and \strong{"Deville-2"} variance estimators
#' are clearly described in Matei and Tillé (2005),
Expand Down Expand Up @@ -144,6 +168,10 @@
#' @references
#' Ash, S. (2014). "\emph{Using successive difference replication for estimating variances}."
#' \strong{Survey Methodology}, Statistics Canada, 40(1), 47–59.
#'
#' Beaumont, J.-F.; Émond, N. (2022). "\emph{A Bootstrap Variance Estimation Method for Multistage Sampling and Two-Phase Sampling When Poisson Sampling Is Used at the Second Phase}."
#' \strong{Stats}, \emph{5}: 339–357.
#' https://doi.org/10.3390/stats5020019
#'
#' Bellhouse, D.R. (1985). "\emph{Computing Methods for Variance Estimation in Complex Surveys}."
#' \strong{Journal of Official Statistics}, Vol.1, No.3.
Expand Down
7 changes: 7 additions & 0 deletions man/as_fays_gen_rep_design.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions man/get_design_quad_form.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

22 changes: 19 additions & 3 deletions man/make_ppswor_approx_matrix.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions man/make_quad_form_matrix.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 868e776

Please sign in to comment.