updates to codebase to reflect recent dev (#93)

CDCgov · Jun 26, 2024 · b3a3f1a · b3a3f1a
1 parent 5e6eb64
commit b3a3f1a
Show file tree

Hide file tree

Showing 26 changed files with 4,000 additions and 65 deletions.
diff --git a/.gitignore b/.gitignore
@@ -15,12 +15,6 @@ secrets.yaml
 # Azure batch configuration
 batch_config.toml
 
-# batch
-batch/
-Makefile
-Containerfile
-.containerignore
-
 # rendered documents ignored by default
 *.html
 *.pdf

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -19,7 +19,26 @@ repos:
     hooks:
     -   id: style-files
     -   id: lintr
-#####
+
+########
+# Python
+-   repo: https://github.com/psf/black
+    rev: 23.10.0
+    hooks:
+    -   id: black
+        args: ['--line-length', '79']
+-   repo: https://github.com/PyCQA/isort
+    rev: 5.12.0
+    hooks:
+    -   id: isort
+        args: ['--profile', 'black',
+               '--line-length', '79']
+-   repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.1.0
+    hooks:
+    -   id: ruff
+
+#########
 # Secrets
 -   repo: https://github.com/Yelp/detect-secrets
     rev: v1.4.0

diff --git a/_targets_eval.R b/_targets_eval.R
@@ -869,7 +869,8 @@ downstream_targets <- list(
     name = plot_summarized_scores_w_data,
     command = get_plot_scores_w_data(
       grouped_submission_scores,
-      eval_hosp_data
+      eval_hosp_data,
+      eval_config$figure_dir
     ),
     pattern = map(grouped_submission_scores),
     iteration = "list",
@@ -900,7 +901,8 @@ downstream_targets <- list(
     name = plot_quantile_comparison,
     command = get_plot_quantile_comparison(
       all_hosp_quantiles,
-      eval_hosp_data
+      eval_hosp_data,
+      eval_config$figure_dir
     ),
     pattern = map(all_hosp_quantiles),
     iteration = "list"

diff --git a/_targets_eval_postprocessing.R b/_targets_eval_postprocessing.R
@@ -105,6 +105,15 @@ upstream_targets <- list(
       last_hosp_data_date = eval_config$eval_date,
       ww_data_mapping = eval_config$ww_data_mapping
     )
+  ),
+  # Returns a dataframe with each location and date and a corresponding
+  # epidemic phase
+  tar_target(
+    name = epidemic_phases,
+    command = get_epidemic_phases_from_rt(
+      locations = unique(eval_config$location_ww),
+      retro_rt_path = eval_config$retro_rt_path
+    )
   )
 )
 
@@ -141,14 +150,15 @@ combined_targets <- list(
   ## Flags------------------------------------------------------------------
   tar_target(
     name = all_flags_ww,
-    command = combine_outputs(
-      output_type = "flags",
-      scenarios = eval_config$scenario,
-      forecast_dates = eval_config$forecast_dates,
-      locations = eval_config$location_ww,
-      eval_output_subdir = eval_config$output_dir,
-      model_type = "ww"
-    )
+    command =
+      combine_outputs(
+        output_type = "flags",
+        scenarios = eval_config$scenario,
+        forecast_dates = eval_config$forecast_date_ww,
+        locations = eval_config$location_ww,
+        eval_output_subdir = eval_config$output_dir,
+        model_type = "ww"
+      )
   ),
   tar_target(
     name = all_flags_hosp,
@@ -161,6 +171,25 @@ combined_targets <- list(
       model_type = "hosp"
     )
   ),
+  tar_target(
+    name = all_flags,
+    command = dplyr::bind_rows(all_flags_ww, all_flags_hosp)
+  ),
+  tar_target(
+    name = convergence_df_ww,
+    command = get_convergence_df(
+      all_flags_ww,
+      default_scenario = "status_quo"
+    ) |>
+      dplyr::rename(any_flags_ww = any_flags)
+  ),
+  tar_target(
+    name = convergence_df_hosp,
+    command = get_convergence_df(all_flags_hosp,
+      default_scenario = "no_wastewater"
+    ) |>
+      dplyr::rename(any_flags_hosp = any_flags)
+  ),
   ### Scores from quantiles-------------------------------------------------
   tar_target(
     name = all_ww_scores_quantiles,
@@ -169,7 +198,7 @@ combined_targets <- list(
       scenarios = eval_config$scenario,
       forecast_dates = eval_config$forecast_date_ww,
       locations = eval_config$location_ww,
-      eval_output_subdir = file.path("output", "eval"),
+      eval_output_subdir = eval_config$output_dir,
       model_type = "ww"
     )
   ),
@@ -180,7 +209,7 @@ combined_targets <- list(
       scenarios = "no_wastewater",
       forecast_dates = eval_config$forecast_date_hosp,
       locations = eval_config$location_hosp,
-      eval_output_subdir = file.path("output", "eval"),
+      eval_output_subdir = eval_config$output_dir,
       model_type = "hosp"
     )
   ),
@@ -192,7 +221,7 @@ combined_targets <- list(
       scenarios = eval_config$scenario,
       forecast_dates = eval_config$forecast_date_ww,
       locations = eval_config$location_ww,
-      eval_output_subdir = file.path("output", "eval"),
+      eval_output_subdir = eval_config$output_dir,
       model_type = "ww"
     )
   ),
@@ -203,7 +232,7 @@ combined_targets <- list(
       scenarios = "no_wastewater",
       forecast_dates = eval_config$forecast_date_hosp,
       locations = eval_config$location_hosp,
-      eval_output_subdir = file.path("output", "eval"),
+      eval_output_subdir = eval_config$output_dir,
       model_type = "hosp"
     )
   ),
@@ -214,7 +243,7 @@ combined_targets <- list(
       scenarios = eval_config$scenario,
       forecast_dates = eval_config$forecast_date_ww,
       locations = eval_config$location_ww,
-      eval_output_subdir = file.path("output", "eval"),
+      eval_output_subdir = eval_config$output_dir,
       model_type = "ww"
     )
   ),
@@ -226,7 +255,7 @@ combined_targets <- list(
       scenarios = eval_config$scenario,
       forecast_dates = eval_config$forecast_date_ww,
       locations = eval_config$location_ww,
-      eval_output_subdir = file.path("output", "eval"),
+      eval_output_subdir = eval_config$output_dir,
       model_type = "ww"
     )
   ),
@@ -237,15 +266,131 @@ combined_targets <- list(
       scenarios = "no_wastewater",
       forecast_dates = eval_config$forecast_date_hosp,
       locations = eval_config$location_hosp,
-      eval_output_subdir = file.path("output", "eval"),
+      eval_output_subdir = eval_config$output_dir,
       model_type = "hosp"
     )
   )
 )
 
+# Head-to-head comparison targets-------------------------------------------
+# This set of targets will be conditioned on the presence of sufficient
+# wastewater, whereas the below targets assume that for every location and
+# forecast date we had to submit a forecast, and so we used the hospital
+# admissions only model if wastewater was missing.
+# These are only relevant for the status quo scenario
+head_to_head_targets <- list(
+  tar_target(
+    name = all_ww_quantiles_sq,
+    command = all_ww_quantiles |>
+      dplyr::filter(scenario == "status_quo")
+  ),
+  # Get a table of locations and forecast dates with sufficient wastewater
+  tar_target(
+    name = table_of_loc_dates_w_ww,
+    command = get_table_sufficient_ww(all_ww_quantiles)
+  ),
+  # Get a table indicating whether there are locations and forecast dates with
+  # convergence issues
+  tar_target(
+    name = convergence_df,
+    command = convergence_df_hosp |>
+      dplyr::left_join(convergence_df_ww,
+        by = c("location", "forecast_date")
+      )
+  ),
+  # Get the full set of quantiles, filtered down to only states and
+  # forecast dates with sufficient wastewater for both ww model and hosp only
+  # model. Then join the convergence df
+  tar_target(
+    name = hosp_quantiles_filtered,
+    command = dplyr::bind_rows(
+      all_ww_hosp_quantiles,
+      all_hosp_model_quantiles
+    ) |>
+      dplyr::left_join(table_of_loc_dates_w_ww,
+        by = c("location", "forecast_date")
+      ) |>
+      dplyr::filter(
+        ww_sufficient # filters to location forecast dates with sufficient ww
+      ) |>
+      dplyr::left_join(
+        convergence_df,
+        by = c(
+          "location",
+          "forecast_date"
+        )
+      ) |>
+      dplyr::left_join(
+        epidemic_phases,
+        by = c(
+          "location" = "state_abbr",
+          "date" = "reference_date"
+        )
+      )
+  ),
+  # Do the same thing for the sampled scores, combining ww and hosp under
+  # the status quo scenario, filtering to the locations and forecast dates
+  # with sufficient wastewater, and then joining the convergence flags
+  tar_target(
+    name = scores_filtered,
+    command = dplyr::bind_rows(
+      all_hosp_scores,
+      all_ww_scores |>
+        dplyr::filter(scenario == "status_quo")
+    ) |>
+      dplyr::left_join(table_of_loc_dates_w_ww,
+        by = c("location", "forecast_date")
+      ) |>
+      dplyr::filter(ww_sufficient) |>
+      dplyr::left_join(
+        convergence_df,
+        by = c(
+          "location",
+          "forecast_date"
+        )
+      ) |>
+      dplyr::left_join(
+        epidemic_phases,
+        by = c(
+          "location" = "state_abbr",
+          "date" = "reference_date"
+        )
+      )
+  ),
+  # Repeat for the quantile-based scores
+  tar_target(
+    name = scores_quantiles_filtered,
+    command = dplyr::bind_rows(
+      all_hosp_scores_quantiles,
+      all_ww_scores_quantiles |>
+        dplyr::filter(scenario == "status_quo")
+    ) |>
+      dplyr::left_join(table_of_loc_dates_w_ww,
+        by = c("location", "forecast_date")
+      ) |>
+      dplyr::filter(
+        isTRUE(ww_sufficient)
+      ) |>
+      dplyr::left_join(
+        convergence_df,
+        by = c(
+          "location",
+          "forecast_date"
+        )
+      ) |>
+      dplyr::left_join(
+        epidemic_phases,
+        by = c(
+          "location" = "state_abbr",
+          "date" = "reference_date"
+        )
+      )
+  )
+)
+
 
-# Head-to-head-scenario targets------------------------------------------------
-head_to_head_scenario_targets <- list(
+# Scenario targets------------------------------------------------
+scenario_targets <- list(
   tar_target(
     name = all_raw_scores,
     command = data.table::as.data.table(
@@ -262,18 +407,7 @@ head_to_head_scenario_targets <- list(
     name = all_errors,
     command = dplyr::bind_rows(all_hosp_errors, all_ww_errors)
   ),
-  tar_target(
-    name = all_flags,
-    command = dplyr::bind_rows(all_flags_ww, all_flags_hosp)
-  ),
-  tar_target(
-    name = nonconverge_df,
-    command = all_flags |> distinct(
-      location, forecast_date, model, scenario
-    ) |>
-      dplyr::mutate(convergence = FALSE) # Add a column that indicates did not pass
-    # convergence. We will use this in the head-to-head comparison to statify by convergence
-  ),
+
 
   ## Raw scores-----------------------------------------
   # These are the scores from each scenario and location without buffering
@@ -653,7 +787,8 @@ hub_comparison_plots <- list(
 list(
   upstream_targets,
   combined_targets,
-  head_to_head_scenario_targets,
+  head_to_head_targets,
+  scenario_targets,
   hub_targets,
   hub_comparison_plots
 )
diff --git a/model_definition.md b/model_definition.md
@@ -119,7 +119,9 @@ where $\gamma$ is the _infection feedback term_ controlling the strength of the
 
 Following other semi-mechanistic renewal frameworks, we model the _expected_ hospital admissions per capita $H(t)$ as a convolution of the _expected_ latent incident infections per capita $I(t)$, and a discrete infection to hospitalization distribution $d(\tau)$, scaled by the probability of being hospitalized $p_\mathrm{hosp}(t)$.
 
-To account for day-of-week effects in hospital reporting, we use an estimated _weekday effect_ $\omega(t)$. If $t$ and $t'$ are the same day of the week, $\omega(t) = \omega(t')$. The seven values that $\omega(t)$ takes on are constrained to have mean 1.
+To account for day-of-week effects in hospital reporting, we use an estimated _weekday effect_ $\omega(t)$. If $t$ and $t'$ are the same day of the week, $\omega(t) = \omega(t')$.
+The seven values that $\omega(t)$ takes on are constrained to be non-negative and have a mean of 1.
+This allows us to model the possibility that certain days of the week could have systematically high or low admissions reporting while holding the predicted weekly total reported admissions constant (i.e. the predicted weekly total is the same with and without these day-of-week reporting effects).
 
 $$H(t) = \omega(t) p_\mathrm{hosp}(t) \sum_{\tau = 0}^{T_d} d(\tau) I(t-\tau)$$
 

diff --git a/scratch/ar1_example.R b/scratch/ar1_example.R
@@ -0,0 +1,24 @@
+ar1 <- function(mu, ac, sd, z) {
+  n <- length(z)
+  x <- rep(0, n)
+  tvd <- rep(0, n)
+
+  tvd[1] <- sd * z[1]
+  x[1] <- mu[1] + tvd[1]
+
+  for (i in 2:n) {
+    tvd[i] <- ac * tvd[i - 1] + sd * z[i]
+    x[i] <- mu[i] + tvd[i]
+  }
+  return(x)
+}
+
+p_hosp_mean <- rep(0.01, 26)
+p_hosp_logit <- qlogis(p_hosp_mean)
+ac <- 0.01
+sd <- 0.3
+z <- rnorm(26)
+
+p_hosp_t_logit <- ar1(p_hosp_logit, ac, sd, z)
+
+plot(plogis(p_hosp_t_logit))