Issue 184: Add outputs to generate_simulated_data() fxn and package…

… data (#220)
CDCgov · Oct 31, 2024 · 3ff2d1e · 3ff2d1e
1 parent 3e58bec
commit 3ff2d1e
Show file tree

Hide file tree

Showing 19 changed files with 482 additions and 22 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -1,11 +1,11 @@
 # wwinference 0.1.0.99 (dev)
 
 ## User-visible changes
-
+- Add wastewater data into the forecast period to output in `generate_simulated_data()` function and as package data. Also adds subpopulation-level
+hospital admissions to output of function and package data. ([#184](https://github.com/CDCgov/ww-inference-model/issues/184))
 - `wwinference` now checks whether `site_pop` is fixed per site (see issue [#223](https://github.com/CDCgov/ww-inference-model/issues/226) reported by [@akeyel](https://github.com/akeyel)).
 
 ## Internal changes
-
 - Updated the workflow for posting the pages artifact to PRs (issue [#229](https://github.com/CDCgov/ww-inference-model/issues/229)).
 - Modify `plot_forecasted_counts()` so that it does not require an evaluation dataset ([#218](https://github.com/CDCgov/ww-inference-model/pull/218))
 

diff --git a/R/data.R b/R/data.R
@@ -39,6 +39,47 @@
 #' @source vignette_data.R
 "ww_data"
 
+#' Example evaluation wastewater dataset.
+#'
+#' A dataset containing the simulated retrospective wastewater concentrations
+#' (labeled here as `log_genome_copies_per_ml_eval`) by sample collection date
+#' (`date`), the site where the sample was collected (`site`) and the lab
+#' where the samples were processed (`lab`). Additional columns that are
+#' required attributes needed for the model are the limit of detection for
+#' that lab on each day (labeled here as `log_lod`) and the population size of
+#' the wastewater catchment area represented by the wastewater concentrations
+#' in each `site`.
+#'
+#' This data is generated via the default values in the
+#'  `generate_simulated_data()` function. They represent the bare minumum
+#'  required fields needed to pass to the model, and we recommend that users
+#'  try to format their own data to match this format.
+#'
+#' The variables are as follows:
+#'
+#' @format ## ww_data_eval
+#' A tibble with 126 rows and 6 columns
+#' \describe{
+#'   \item{date}{Sample collection date, formatted in ISO8601 standards as
+#'   YYYY-MM-DD}
+#'   \item{site}{The wastewater treatment plant where the sample was collected}
+#'   \item{lab}{The lab where the sample was processed}
+#'   \item{log_genome_copies_per_ml_eval}{The natural log of the wastewater
+#'   concentration measured on the date specified, collected in the site
+#'   specified, and processed in the lab specified. The package expects
+#'   this quantity in units of log estimated genome copies per mL.}
+#'   \item{log_lod}{The log of the limit of detection in the site and lab on a
+#'   particular day of the quantification device (e.g. PCR).  This should be in
+#'    units of log estimated genome copies per mL.}
+#'   \item{site_pop}{The population size of the wastewater catchment area
+#'   represented by the site variable}
+#'   \item{location}{ A string indicating the location that all of the
+#'   data is coming from. This is not a necessary column, but instead is
+#'   included to more realistically mirror a typical workflow}
+#'   }
+#' @source vignette_data.R
+"ww_data_eval"
+
 
 
 
@@ -57,9 +98,9 @@
 #'  to match this format.
 #'
 #' This data is generated via the default values in the
-#'  `generate_simulated_data()` function. They represent the bare minumum
+#'  `generate_simulated_data()` function. They represent the bare minimum
 #'  required fields needed to pass to the model, and we recommend that users
-#'  try to format their own data to match this formate.
+#'  try to format their own data to match this format.
 #'
 #' The variables are as follows:
 #' \describe{
@@ -132,6 +173,77 @@
 #' @source vignette_data.R
 "hosp_data_eval"
 
+
+
+
+#' Example subpopulation level hospital admissions dataset
+#'
+#'  A dataset containing the simulated daily hospital admissions
+#' (labeled here as `daily_hosp_admits`) by date of admission (`date`) in
+#'  each subpopulation.
+#'  Additional columns that are the population size of the
+#'  population contributing to the hospital admissions. In this instance,
+#'  the subpopulations here are each of the wastewater catchment areas plus
+#'  an additional subpopulation for the portion of the population not captured
+#'  by wastewater surveillance. The data generated are daily hospital
+#'  admissions but they could be any other epidemiological count dataset e.g.
+#'  cases. This data should only contain hospital admissions that would have
+#'  been available as of the date that the forecast was made.
+#'
+#' This data is generated via the default values in the
+#'  `generate_simulated_data()` function.
+#'
+#' The variables are as follows:
+#' \describe{
+#'   \item{date}{Date the hospital admissions occurred, formatted in ISO8601
+#'   standards as YYYY-MM-DD}
+#'   \item{subpop_name}{A string indicating the subpopulation the hospital
+#'   admissiosn corresponds to. This is either a wastewater site, or the
+#'   remainder of the population}
+#'   \item{daily_hosp_admits}{The number of individuals admitted to the
+#'   hospital on that date, available as of the forecast date}
+#'   \item{subpop_pop}{The number of people contributing to the daily hospital
+#'   admissions in each subpopulation}
+#'   }
+#' @source vignette_data.R
+"subpop_hosp_data"
+
+
+#' Example subpopulation level retrospective hospital admissions dataset
+#'
+#'  A dataset containing the simulated daily hospital admissions
+#' (labeled here as `daily_hosp_admits`) by date of admission (`date`) in
+#'  each subpopulation observed retrospectively.
+#'  Additional columns that are required are the population size of the
+#'  population contributing to the hospital admissions. In this instance,
+#'  the subpopulations here are each of the wastewater catchment areas plus
+#'  an additional subpopulation for the portion of the population not captured
+#'  by wastewater surveillance. The data generated are daily hospital
+#'  admissions but they could be any other epidemiological count dataset e.g.
+#'  cases.This data should contain hospital admissions retrospectively beyond
+#'  the forecast date in order to evaluate the forecasts.
+#'
+#'  This data is generated via the default values in the
+#'  `generate_simulated_data()` function. They represent the bare minimumum
+#'  required fields needed to pass to the model, and we recommend that users
+#'  try to format their own data to match this format.
+#'
+#' The variables are as follows:
+#' \describe{
+#'   \item{date}{Date the hospital admissions occurred, formatted in ISO8601
+#'   standards as YYYY-MM-DD}
+#'   \item{subpop_name}{A string indicating the subpopulation the hospital
+#'   admissions corresponds to. This is either a wastewater site, or the
+#'   remainder of the population}
+#'   \item{daily_hosp_admits_for_eval}{The number of individuals admitted to the
+#'   hospital on that date, available as of the forecast date}
+#'   \item{subpop_pop}{The number of people contributing to the daily hospital
+#'   admissions in each subpopulation}
+#'   }
+#' @source vignette_data.R
+"subpop_hosp_data_eval"
+
+
 #' COVID-19 post-Omicron generation interval probability mass function
 #'
 #' \describe{

diff --git a/R/generate_simulated_data.R b/R/generate_simulated_data.R
@@ -59,6 +59,9 @@
 #' infection feedback into the infection process, default is `FALSE`, which
 #' sets the strength of the infection feedback to 0.
 #' If `TRUE`, this will apply an infection feedback drawn from the prior.
+#' @param subpop_phi Vector of numeric values indicating the overdispersion
+#' parameter phi in the hospital admissions observation process in each
+#' subpopulation
 #' @param input_params_path path to the toml file with the parameters to use
 #' to generate the simulated data
 #'
@@ -121,6 +124,7 @@ generate_simulated_data <- function(r_in_weeks = # nolint
                                     sigma_eps = 0.05,
                                     sd_i0_over_n = 0.5,
                                     if_feedback = FALSE,
+                                    subpop_phi = c(25, 50, 70, 40, 100),
                                     input_params_path =
                                       fs::path_package("extdata",
                                         "example_params.toml",
@@ -322,25 +326,72 @@ generate_simulated_data <- function(r_in_weeks = # nolint
   )
 
   ## Latent per capita admissions--------------------------------------------
+  # This won't be used other than for the unit test
   model_hosp_over_n <- model$functions$convolve_dot_product(
     p_hosp_days * new_i_over_n, # individuals who will be hospitalized
     rev(inf_to_hosp),
     uot + ot + ht
   )[(uot + 1):(uot + ot + ht)]
 
+  # Also compute per capita hosps for each subpopulation
+  model_hosp_subpop_over_n <- matrix(
+    nrow = n_subpops,
+    ncol = (ot + ht)
+  )
+  for (i in 1:n_subpops) {
+    model_hosp_subpop_over_n[i, ] <- model$functions$convolve_dot_product(
+      p_hosp_days * new_i_over_n_site[i, ],
+      rev(inf_to_hosp),
+      uot + ot + ht
+    )[(uot + 1):(uot + ot + ht)]
+  }
+
+  # unit test to make sure these are equivalent
+  if (!all.equal(
+    colSums(model_hosp_subpop_over_n * pop_fraction),
+    model_hosp_over_n,
+    tolerance = 1e-8
+  )) {
+    cli::cli_abort("Sum of convolutions not equal to convolution of sums")
+  }
+
 
   ## Add weekday effect on hospital admissions-------------------------------
   pred_hosp <- pop_size * model$functions$day_of_week_effect(
     model_hosp_over_n,
     day_of_week_vector,
     hosp_wday_effect
   )
+
+  pred_hosp_subpop <- matrix(
+    nrow = n_subpops,
+    ncol = (ot + ht)
+  )
+  for (i in 1:n_subpops) {
+    pred_hosp_subpop[i, ] <- pop_fraction[i] * pop_size *
+      model$functions$day_of_week_effect(
+        model_hosp_subpop_over_n[i, ],
+        day_of_week_vector,
+        hosp_wday_effect
+      )
+  }
+
+
   ## Add observation error---------------------------------------------------
-  # This is negative binomial but could swap out for a different obs error
-  pred_obs_hosp <- rnbinom(
-    n = length(pred_hosp), mu = pred_hosp,
-    size = 1 / ((params$inv_sqrt_phi_prior_mean)^2)
+  # Use negative binomial but could swap out for a different obs error.
+  # Each subpopulation has its own dispersion parameter, then we sum
+  # the observations to get the population total
+  pred_obs_hosp_subpop <- matrix(
+    nrow = n_subpops,
+    ncol = (ot + ht)
   )
+  for (i in 1:n_subpops) {
+    pred_obs_hosp_subpop[i, ] <- rnbinom(
+      n = length(pred_hosp_subpop[i, ]), mu = pred_hosp_subpop[i, ],
+      size = subpop_phi[i]
+    )
+  }
+  pred_obs_hosp <- colSums(pred_obs_hosp_subpop)
 
 
 
@@ -381,6 +432,18 @@ generate_simulated_data <- function(r_in_weeks = # nolint
     lab_site_reporting_latency = lab_site_reporting_latency
   )
 
+  # Create evaluation data with same reporting freq but go through the entire
+  # time period
+  log_obs_conc_lab_site_eval <- downsample_ww_obs(
+    log_conc_lab_site = log_conc_lab_site,
+    n_lab_sites = n_lab_sites,
+    ot = ot + ht,
+    ht = 0,
+    nt = 0,
+    lab_site_reporting_freq = lab_site_reporting_freq,
+    lab_site_reporting_latency = rep(0, n_lab_sites)
+  )
+
 
 
   # Global adjusted R(t) --------------------------------------------------
@@ -406,6 +469,18 @@ generate_simulated_data <- function(r_in_weeks = # nolint
     lod_lab_site = lod_lab_site
   )
 
+  ww_data_eval <- format_ww_data(
+    log_obs_conc_lab_site = log_obs_conc_lab_site_eval,
+    ot = ot + ht,
+    ht = 0,
+    date_df = date_df,
+    site_lab_map = site_lab_map,
+    lod_lab_site = lod_lab_site
+  ) |>
+    dplyr::rename(
+      "log_genome_copies_per_ml_eval" = "log_genome_copies_per_ml"
+    )
+
   # Artificially add values below the LOD----------------------------------
   # Replace it with an NA, will be used as an example of how to format data
   # properly.
@@ -419,16 +494,27 @@ generate_simulated_data <- function(r_in_weeks = # nolint
           TRUE ~ .data$log_genome_copies_per_ml
         )
     )
+  ww_data_eval <- ww_data_eval |>
+    dplyr::mutate(
+      "log_genome_copies_per_ml_eval" =
+        dplyr::case_when(
+          .data$log_genome_copies_per_ml_eval ==
+            !!min_ww_val ~ 0.5 * .data$log_lod,
+          TRUE ~ .data$log_genome_copies_per_ml_eval
+        )
+    )
 
 
   # Make a hospital admissions dataframe for model calibration
-  hosp_data <- format_hosp_data(pred_obs_hosp,
+  hosp_data <- format_hosp_data(
+    pred_obs_hosp = pred_obs_hosp,
     dur_obs = ot,
     pop_size = pop_size,
     date_df = date_df
   )
 
-  hosp_data_eval <- format_hosp_data(pred_obs_hosp,
+  hosp_data_eval <- format_hosp_data(
+    pred_obs_hosp = pred_obs_hosp,
     dur_obs = (ot + ht),
     pop_size = pop_size,
     date_df = date_df
@@ -437,6 +523,36 @@ generate_simulated_data <- function(r_in_weeks = # nolint
       "daily_hosp_admits_for_eval" = "daily_hosp_admits"
     )
 
+  # Make a subpopulation level hospital admissions data
+  # For now this will only be used for evaluation, eventually, can add
+  # feature to use this in calibration
+  subpop_map <- tibble::tibble(
+    subpop_index = as.character(1:n_subpops),
+    subpop_pop = pop_size * pop_fraction,
+    subpop_name = c(1:n_sites, NA)
+  ) |>
+    dplyr::mutate(subpop_name = ifelse(!is.na(subpop_name),
+      glue::glue("Site: {subpop_name}"),
+      "remainder of population"
+    ))
+
+  subpop_hosp_data <- format_subpop_hosp_data(
+    pred_obs_hosp_subpop = pred_obs_hosp_subpop,
+    dur_obs = ot,
+    subpop_map = subpop_map,
+    date_df = date_df
+  )
+
+  subpop_hosp_data_eval <- format_subpop_hosp_data(
+    pred_obs_hosp_subpop = pred_obs_hosp_subpop,
+    dur_obs = (ot + ht),
+    subpop_map = subpop_map,
+    date_df = date_df
+  ) |>
+    dplyr::rename(
+      "daily_hosp_admits_for_eval" = "daily_hosp_admits"
+    )
+
   # Global R(t)
   true_rt <- tibble::tibble(
     unadj_rt_daily = as.numeric(unadj_r_daily),
@@ -453,8 +569,11 @@ generate_simulated_data <- function(r_in_weeks = # nolint
 
   example_data <- list(
     ww_data = ww_data,
+    ww_data_eval = ww_data_eval,
     hosp_data = hosp_data,
     hosp_data_eval = hosp_data_eval,
+    subpop_hosp_data = subpop_hosp_data,
+    subpop_hosp_data_eval = subpop_hosp_data_eval,
     true_global_rt = true_rt
   )