diff --git a/_targets.R b/_targets.R index aa304e3d..e1edef34 100644 --- a/_targets.R +++ b/_targets.R @@ -349,6 +349,27 @@ list( iteration = "list", deployment = "main" ), + tar_target( + name = plot_p_hosp_t, + command = get_plot_p_hosp_t( + grouped_df_id, + figure_output_subdirectory, + params_id + ), + pattern = map(grouped_df_id), + iteration = "list", + deployment = "main" + ), + tar_target( + name = plot_change_in_p_hosp_t, + command = get_plot_change_in_p_hosp_t( + grouped_df_id, + figure_output_subdirectory + ), + pattern = map(grouped_df_id), + iteration = "list", + deployment = "main" + ), tar_target( name = plot_rt_subpop_level, command = get_rt_subpop_level( @@ -369,6 +390,28 @@ list( iteration = "list", deployment = "main" ), + tar_target( + name = plot_p_hosp_sd_id, + command = get_plot_single_param( + grouped_df_id, + param_name = "p_hosp_sd", + figure_output_subdirectory + ), + pattern = map(grouped_df_id), + iteration = "list", + deployment = "main" + ), + tar_target( + name = plot_p_hosp_mean, + command = get_plot_single_param( + grouped_df_id, + param_name = "p_hosp_mean", + figure_output_subdirectory + ), + pattern = map(grouped_df_id), + iteration = "list", + deployment = "main" + ), # Hospital admissions only model for all states for a comparison----------- tar_target( @@ -520,6 +563,7 @@ list( deployment = "main" ), + # National aggregated model for the US only-------------------------------- tar_target( name = config_vars_sa, @@ -600,7 +644,6 @@ list( command = get_params(param_file_path_sa), deployment = "main" ), - ## Fit the model ------------------------------------------------------------ # get a stacked long dataframe containing the quantiles(estimated # from all draws) and 100 samples of the draws from the posterior for the @@ -646,6 +689,17 @@ list( iteration = "list", deployment = "main" ), + tar_target( + name = plot_p_hosp_t_us, + command = get_plot_p_hosp_t( + grouped_df_sa, + figure_output_subdirectory, + params_sa + ), + pattern = map(grouped_df_id), + iteration = "list", + deployment = "main" + ), tar_target( name = plot_single_location_hosp_draws_log_sa, command = get_plot_draws(grouped_df_sa, diff --git a/cfaforecastrenewalww/NAMESPACE b/cfaforecastrenewalww/NAMESPACE index 35d9eb4a..9a5421bd 100644 --- a/cfaforecastrenewalww/NAMESPACE +++ b/cfaforecastrenewalww/NAMESPACE @@ -50,11 +50,14 @@ export(get_params) export(get_pars) export(get_pdf_output_subdirectory) export(get_pipeline_metadata) +export(get_plot_change_in_p_hosp_t) export(get_plot_covidhub_submission) export(get_plot_draws) export(get_plot_labsite_ww_draws) export(get_plot_metadata) +export(get_plot_p_hosp_t) export(get_plot_param_distribs) +export(get_plot_single_param) export(get_ppa_long) export(get_prior_pred_hosps_df) export(get_prior_pred_ww_df) diff --git a/cfaforecastrenewalww/R/data.R b/cfaforecastrenewalww/R/data.R index 4d74bb5d..1947ca68 100644 --- a/cfaforecastrenewalww/R/data.R +++ b/cfaforecastrenewalww/R/data.R @@ -1,30 +1,65 @@ -#' Example input data set +#' Input dataframe for fitting the site-level infection dynamics wastewater +#' model #' -#' SOME HIGH-LEVEL DESCRIPTION OF THE DATA. MAYBE JUST POINT READERS TO -#' THE FUNCTION THAT GENERATES IT? +#' A dataset containing the daily hospital admissons, observed wastewater +#' concentrations and other attributes needed for the model to be fit. +#' Generated via the defaults in `generate_simulated_data.R` +#' The variables are as follows: #' #' @format ## example_df #' A data frame with 635 rows and 13 columns #' \describe{ -#' \item{t}{Time} -#' \item{lab_wwtp_unique_id}{Unique identifier for each unique combination -#' of sampling site and testing lab} -#' \item{log_conc}{Genome copied per liter, log10} -#' ETC. +#' \item{t}{The time index in days} +#' \item{lab_wwtp_unique_id}{The unique identifier of the wastewater site-lab +#' combination. For this example, there are 5 unique combinations, so the +#' indices range from 1 to 5 but would vary for the number of unique +#' combinations in your input dataset if trying to construct something mirroring +#' this structure} +#' \item{log_conc}{The log scale viral genomes per mL collected from the +#' lab-site specified on day date. +#' NAs indicate days where a sample wasn't reported in that lab-site.} +#' \item{date}{Sample collection date, formatted as YYYY-MM-DD} +#' \item{lod_sewage}{The log scaled limit of detection reported by the +#' lab-site on that date} +#' \item{below_LOD}{An indicator (0,1) indicating whether the wastewater +#' observation on that date is +#' above (0) or below (1) the LOD. NAs for all days with wastewater +#' observations in that lab-site} +#' \item{daily_hosp_admits}{The number of individuals admitted to the +#' hospital on that date, +#' available as of the forecast date. Note this is repeated for each site-lab.} +#' \item{daily_hosp_admits_for_eval}{The number of individuals admitted to +#' the hospital on that date, +#' available retrospectively (so including after the forecast date). +#' Note this is repeated for each site-lab} +#' \item{pop}{The number of people in the "state" or whatever population +#' contributes to the daily hospital admissions} +#' \item{forecast_date}{The date the forecast is made, formatted as YYYY-MM-DD} +#' \item{hosp_calibration_time}{The duration of time in days to calibrate the +#' model to hospital admissions data} +#' \item{site}{The unique identifier for the site (also referred to as a +#' wastewater catchment area). There can be more than +#' one lab per site if samples are sent to different/overlapping labs.} +#' \item{ww_pop}{The population size of the site (wastewater catchment area)} #' } +#' @source generate_simulated_data.R "example_df" -#' Example input parameters +#' Input parameters used to generate `example_df` #' -#' SOME HIGH-LEVEL DESCRIPTION OF THE DATA, INCLUDING HOW DERIVED +#' A few key parameters that were created by the default arguments in +#' `generate_simulated_data.R` #' #' @format ## param_df #' A data frame with 305 rows and 4 columns #' \describe{ #' \item{name}{Parameter name} -#' \item{true_value}{} -#' \item{index_rows}{} -#' \item{index_cols}{} +#' \item{true_value}{ The value that was assigned to this parameter in the +#' data generation process} +#' \item{index_rows}{if this is a parameter array, the array row index for +#' this particular value; for scalar parameters it is always NA } +#' \item{index_cols}{if this is a parameter array, the array column index for +#' this particular value; for scalar parameters it is always NA) } #' } -#' @source wwww.wherever.com +#' @source generate_simulated_data.R "param_df" diff --git a/cfaforecastrenewalww/R/get_data.R b/cfaforecastrenewalww/R/get_data.R index 606e0ee5..20b99755 100644 --- a/cfaforecastrenewalww/R/get_data.R +++ b/cfaforecastrenewalww/R/get_data.R @@ -102,9 +102,7 @@ get_stan_data <- function(train_data, # matrix to convert R(t) from weekly to daily ind_m <- get_ind_m(ot + ht, n_weeks) - - # matrix to transform p_hosp RW from weekly to daily - p_hosp_m <- get_ind_m_cum_sum(uot + ot + ht, tot_weeks) + p_hosp_m <- get_ind_m(uot + ot + ht, tot_weeks) # Get other variables needed from data pop <- train_data %>% @@ -164,6 +162,8 @@ get_stan_data <- function(train_data, # duration shedding autoreg_rt_a = autoreg_rt_a, autoreg_rt_b = autoreg_rt_b, + autoreg_p_hosp_a = autoreg_p_hosp_a, + autoreg_p_hosp_b = autoreg_p_hosp_b, inv_sqrt_phi_prior_mean = inv_sqrt_phi_prior_mean, inv_sqrt_phi_prior_sd = inv_sqrt_phi_prior_sd, r_prior_mean = r_prior_mean, @@ -178,7 +178,7 @@ get_stan_data <- function(train_data, initial_growth_prior_sd = initial_growth_prior_sd, sigma_ww_prior_mean = sigma_ww_site_prior_mean_mean, eta_sd_sd = eta_sd_sd, - p_hosp_mean = p_hosp_mean, + p_hosp_prior_mean = p_hosp_mean, p_hosp_sd_logit = p_hosp_sd_logit, p_hosp_w_sd_sd = p_hosp_w_sd_sd, inf_feedback_prior_logmean = infection_feedback_prior_logmean, @@ -378,9 +378,7 @@ get_stan_data_site_level_model <- function(train_data, # matrix to transform from weekly to daily ind_m <- get_ind_m(ot + ht, n_weeks) - - # matrix to transform p_hosp RW from weekly to daily - p_hosp_m <- get_ind_m_cum_sum(uot + ot + ht, tot_weeks) + p_hosp_m <- get_ind_m(uot + ot + ht, tot_weeks) # Get other variables needed from data pop <- train_data %>% @@ -475,6 +473,8 @@ get_stan_data_site_level_model <- function(train_data, # duration shedding autoreg_rt_a = autoreg_rt_a, autoreg_rt_b = autoreg_rt_b, + autoreg_p_hosp_a = autoreg_p_hosp_a, + autoreg_p_hosp_b = autoreg_p_hosp_b, inv_sqrt_phi_prior_mean = inv_sqrt_phi_prior_mean, inv_sqrt_phi_prior_sd = inv_sqrt_phi_prior_sd, r_prior_mean = r_prior_mean, @@ -494,7 +494,7 @@ get_stan_data_site_level_model <- function(train_data, eta_sd_sd = eta_sd_sd, sigma_i0_prior_mode = sigma_i0_prior_mode, sigma_i0_prior_sd = sigma_i0_prior_sd, - p_hosp_mean = p_hosp_mean, + p_hosp_prior_mean = p_hosp_mean, p_hosp_sd_logit = p_hosp_sd_logit, p_hosp_w_sd_sd = p_hosp_w_sd_sd, ww_site_mod_sd_sd = ww_site_mod_sd_sd, @@ -566,9 +566,10 @@ state_agg_inits <- function(train_data, params, stan_data) { initial_growth = rnorm(1, 0, 0.001), inv_sqrt_phi_h = 1 / (sqrt(200)) + rnorm(1, 1 / 10000, 1 / 10000), sigma_ww = abs(rnorm(1, 0, 0.5)), - p_hosp_int = rnorm(1, qlogis(p_hosp_mean), 0.01), - p_hosp_w = if (include_ww == 1) rnorm(tot_weeks - 1, 0, 0.01) else rep(0, 0), + p_hosp_mean = rnorm(1, qlogis(p_hosp_mean), 0.01), + p_hosp_w = if (include_ww == 1) rnorm(tot_weeks, 0, 0.01) else rep(0, 0), p_hosp_w_sd = if (include_ww == 1) abs(rnorm(1, 0.01, 0.001)) else rep(0, 0), + autoreg_p_hosp = if (include_ww == 1) abs(rnorm(1, 1 / 100, 0.001)) else rep(0, 0), t_peak = rnorm(1, t_peak_mean, 0.1 * t_peak_sd), viral_peak = rnorm(1, viral_peak_mean, 0.1 * viral_peak_sd), dur_shed = rnorm(1, duration_shedding_mean, 0.1 * duration_shedding_sd), @@ -749,6 +750,7 @@ site_level_inf_inits <- function(train_data, params, stan_data) { log_r_mu_intercept = rnorm(1, convert_to_logmean(1, 0.1), convert_to_logsd(1, 0.1)), error_site = matrix(rnorm(n_subpops * n_weeks, mean = 0, sd = 0.1), n_subpops, n_weeks), autoreg_rt_site = abs(rnorm(1, 0.5, 0.05)), + autoreg_p_hosp = abs(rnorm(1, 1 / 100, 0.001)), sigma_rt = abs(rnorm(1, 0, 0.01)), i0_over_n = plogis(rnorm(1, qlogis(i0 / pop), 0.05)), initial_growth = rnorm(1, 0, 0.001), @@ -762,8 +764,8 @@ site_level_inf_inits <- function(train_data, params, stan_data) { 0.1 * sigma_ww_site_prior_sd_sd )), sigma_ww_site_raw = abs(rnorm(n_ww_lab_sites, 0, 0.05)), - p_hosp_int = rnorm(1, qlogis(p_hosp_mean), 0.01), - p_hosp_w = rnorm(tot_weeks - 1, 0, 0.01), + p_hosp_mean = rnorm(1, qlogis(p_hosp_mean), 0.01), + p_hosp_w = rnorm(tot_weeks, 0, 0.01), p_hosp_w_sd = abs(rnorm(1, 0.01, 0.001)), t_peak = rnorm(1, t_peak_mean, 0.1 * t_peak_sd), viral_peak = rnorm(1, viral_peak_mean, 0.1 * viral_peak_sd), diff --git a/cfaforecastrenewalww/R/model_params.R b/cfaforecastrenewalww/R/model_params.R index 2659ce0c..1aef6595 100644 --- a/cfaforecastrenewalww/R/model_params.R +++ b/cfaforecastrenewalww/R/model_params.R @@ -72,7 +72,7 @@ get_model_param_df <- function(x) { static_params <- c( "eta_sd", "autoreg_rt", "autoreg_conc", "log_R", "sigma_log_conc", "i0_over_n", "initial_growth", "inv_sqrt_phi_h", - "sigma_ww_site_mean", "sigma_ww_site_sd", "p_hosp_int", "p_hosp_w_sd", + "sigma_ww_site_mean", "sigma_ww_site_sd", "p_hosp_mean", "p_hosp_w_sd", "t_peak", "viral_peak", "dur_shed", "log10_g", "ww_site_mod_sd", "infection_feedback" ) @@ -97,7 +97,7 @@ get_model_param_df <- function(x) { "autoreg_rt_site", "i0_over_n", "sigma_i0", "sigma_growth", "initial_growth", "inv_sqrt_phi_h", "sigma_ww_site_mean", "sigma_ww_site_sd", "p_hosp_w_sd", "t_peak", "dur_shed", "ww_site_mod_sd", - "infection_feedback" + "infection_feedback", "p_hosp_mean" ) daily_params <- c("rt", "state_inf_per_capita", "p_hosp") weekly_params <- c("w", "p_hosp_w") @@ -119,7 +119,7 @@ get_model_param_df <- function(x) { "i0_over_n", "initial_growth", "inv_sqrt_phi_h", "sigma_ww", "p_hosp_w_sd[1]", "t_peak", "dur_shed", - "infection_feedback" + "infection_feedback", "p_hosp_mean" ) daily_params <- c("rt", "new_i", "p_hosp") weekly_params <- c("w", "p_hosp_w") diff --git a/cfaforecastrenewalww/R/plots.R b/cfaforecastrenewalww/R/plots.R index f8b560aa..69980ab8 100644 --- a/cfaforecastrenewalww/R/plots.R +++ b/cfaforecastrenewalww/R/plots.R @@ -632,6 +632,92 @@ get_plot_param_distribs <- function(df, return(p) } +#' Get plot single parameter +#' +#' @param df df of filepaths or of draws themselves +#' @param param_name the name of the parameter to plot +#' @param figure_file_path path to save figure +#' @param from_full_df whether or not df is a df of filepaths or of draws, +#' default = FALSE which means it is of filepaths +#' @param write_files whether or not to write files (default = TRUE) +#' @param ... +#' +#' @return +#' @export +#' +#' @examples +get_plot_single_param <- function(df, + param_name, + figure_file_path, + from_full_df = FALSE, + write_files = TRUE, ...) { + if (isFALSE(from_full_df)) { + draws <- arrow::read_parquet( + file = + df$parameters_file_path + ) + } else { + draws <- df + } + + + posterior_params <- draws %>% + filter(name == param_name) + + # get metadata for filename and title + plot_metadata <- get_plot_metadata(draws) + location <- plot_metadata$location + forecast_date <- plot_metadata$forecast_date + hosp_reporting_delay <- plot_metadata$hosp_reporting_delay + model_type <- plot_metadata$model_type + model_file_name <- plot_metadata$model_file_name + + + p <- ggplot(posterior_params) + + geom_histogram(aes(x = value), alpha = 0.3) + + facet_wrap(~name, scales = "free_x", nrow = 2) + + theme_bw() + + xlab("Parameter value") + + ylab("Frequency") + + ggtitle( + glue::glue( + "Parameter draws for {model_type} in {location}", + " from forecast on {forecast_date}" + ) + ) + + theme( + axis.text.x = element_text( + size = 8, vjust = 1, + hjust = 1, angle = 45 + ), + plot.title = element_text( + size = 10, + vjust = 0.5, hjust = 0.5 + ) + ) + + if (isTRUE(write_files)) { + full_file_path <- file.path( + figure_file_path, location + ) + create_dir(full_file_path) + + ggsave( + file.path( + full_file_path, + glue::glue("posterior_{param_name}_{model_file_name}.png") + ), + plot = p, + width = 10, + height = 5, + bg = "white" + ) + } + + + return(p) +} + #' Plot of site-level Rt #' @@ -754,7 +840,7 @@ get_rt_subpop_level <- function(df, #' Get R(t) for a single location #' -#' @param quantiles dataframe of forecasts with quantiles including for R(t) +#' @param df dataframe of either model draws or filepath to model draws #' @param figure_file_path higher level directory where data from this pipeline will be saved #' @param from_full_df if TRUE then df is draws object, if FALSE then df is a #' dataframe of filepaths to load in the parameter object @@ -864,6 +950,242 @@ get_rt_from_draws <- function(df, } +#' Get plot of IHR(t) +#' +#' @param df dataframe either of model draws or file path to model draws +#' @param figure_file_path where to save the figure +#' @param params input params, used to plot the prior +#' @param from_full_df whether df is a dataframe of draws or filepath to draws, +#' default is FALSE +#' @param write_files whether to save the figure, default is TRUE +#' +#' @return +#' @export +#' +#' @examples +get_plot_p_hosp_t <- function(df, + figure_file_path, + params, + from_full_df = FALSE, + write_files = TRUE) { + if (isFALSE(from_full_df)) { + model_draws <- arrow::read_parquet( + file = + df$model_draws_file_path + ) + } else { + model_draws <- df + } + + # Get the median for plotting + p_hosp_draws <- model_draws %>% + filter(name == "p_hosp") + + p_hosp_draws <- p_hosp_draws %>% + left_join( + p_hosp_draws %>% + group_by(date) %>% + summarise( + median_p_hosp = quantile(value, 0.5, na.rm = TRUE) + ) + ) + + # Subset to 25 draws + + p_hosp_draws <- p_hosp_draws %>% + filter(draw %in% sample(unique(p_hosp_draws$draw), 25)) + + + + plot_metadata <- get_plot_metadata(model_draws) + location <- plot_metadata$location + include_ww <- plot_metadata$include_ww + forecast_date <- plot_metadata$forecast_date + hosp_reporting_delay <- plot_metadata$hosp_reporting_delay + model_type <- plot_metadata$model_type + model_file_name <- plot_metadata$model_file_name + + title <- glue::glue("IHR(t) as of {forecast_date} in {location} with {model_type}") + + p <- ggplot(p_hosp_draws) + + geom_line(aes(x = date, y = value, group = draw, color = period), + alpha = 0.2, + show.legend = FALSE + ) + + geom_line(aes(x = date, y = median_p_hosp), + show.legend = FALSE + ) + + geom_vline(aes(xintercept = forecast_date), linetype = "dashed") + + geom_hline(aes(yintercept = params$p_hosp_mean), linetype = "dashed", color = "darkgray") + + xlab("") + + ylab("IHR(t)") + + theme_bw() + + scale_x_date( + date_breaks = "2 weeks", + labels = scales::date_format("%Y-%m-%d") + ) + + ggtitle(title) + + theme( + axis.text.x = element_text( + size = 10, vjust = 1, + hjust = 1, angle = 45 + ), + axis.title.x = element_text(size = 10), + axis.title.y = element_text(size = 10), + plot.title = element_text( + size = 12, + vjust = 0.5, hjust = 0.5 + ) + ) + + p + + if (isTRUE(write_files)) { + full_file_path <- file.path( + figure_file_path, location + ) + create_dir(full_file_path) + ggsave( + file.path( + full_file_path, + glue::glue("IHR_t.png") + ), + plot = p, + width = 7, + height = 7, + units = "in", + bg = "white" + ) + } + + return(p) +} + +#' Get plot of the change in IHR over time +#' +#' @param df dataframe of filepaths +#' @param figure_file_path where to write the figure to +#' @param write_files whether or not to save the figure, default is TRUE +#' +#' @return a plot of the change in the IHR over time (IHR intercept - IHR(t)) +#' @export +#' +#' @examples +get_plot_change_in_p_hosp_t <- function(df, + figure_file_path, + write_files = TRUE) { + model_draws <- arrow::read_parquet( + file = + df$model_draws_file_path + ) + param_draws <- arrow::read_parquet( + file = + df$parameters_file_path + ) + + p_hosp_mean_draws <- param_draws %>% + filter(name == "p_hosp_mean") %>% + filter(draw %in% unique(model_draws$draw)) %>% + select(value, draw) %>% + rename(p_hosp_mean = value) + sampled_draws <- model_draws %>% + filter(draw %in% unique(p_hosp_mean_draws$draw)) + p_hosp_final_draws <- sampled_draws %>% + filter(t == max(t)) %>% + select(value, draw) %>% + rename(p_hosp_final = value) + + + # Get the median for plotting + p_hosp_draws <- sampled_draws %>% + filter(name == "p_hosp") %>% + select(date, t, forecast_date, draw, name, value) %>% + tidyr::pivot_wider( + id_cols = c(date, forecast_date, t, draw), + names_from = name, + values_from = value + ) %>% + left_join(p_hosp_mean_draws, by = "draw") %>% + left_join(p_hosp_final_draws, by = "draw") %>% + mutate(delta_p_hosp = p_hosp_final - p_hosp) + + + + p_hosp_draws <- p_hosp_draws %>% + left_join( + p_hosp_draws %>% + group_by(date) %>% + summarise( + median_delta = quantile(delta_p_hosp, 0.5, na.rm = TRUE) + ) + ) + + + + plot_metadata <- get_plot_metadata(model_draws) + location <- plot_metadata$location + include_ww <- plot_metadata$include_ww + forecast_date <- plot_metadata$forecast_date + hosp_reporting_delay <- plot_metadata$hosp_reporting_delay + model_type <- plot_metadata$model_type + model_file_name <- plot_metadata$model_file_name + + title <- glue::glue("Change in IHR(t) as of {forecast_date} in {location} with {model_type}") + + p <- ggplot(p_hosp_draws) + + geom_line(aes(x = date, y = delta_p_hosp, group = draw), + alpha = 0.2, + show.legend = FALSE + ) + + geom_line(aes(x = date, y = median_delta), + show.legend = FALSE + ) + + geom_vline(aes(xintercept = forecast_date), linetype = "dashed") + + xlab("") + + ylab("IHR(t)") + + theme_bw() + + scale_x_date( + date_breaks = "2 weeks", + labels = scales::date_format("%Y-%m-%d") + ) + + ggtitle(title) + + theme( + axis.text.x = element_text( + size = 10, vjust = 1, + hjust = 1, angle = 45 + ), + axis.title.x = element_text(size = 10), + axis.title.y = element_text(size = 10), + plot.title = element_text( + size = 12, + vjust = 0.5, hjust = 0.5 + ) + ) + + p + + if (isTRUE(write_files)) { + full_file_path <- file.path( + figure_file_path, location + ) + create_dir(full_file_path) + ggsave( + file.path( + full_file_path, + glue::glue("delta_IHR_t.png") + ), + plot = p, + width = 7, + height = 7, + units = "in", + bg = "white" + ) + } + + return(p) +} + + #' Title #' diff --git a/cfaforecastrenewalww/R/process_model_outputs.R b/cfaforecastrenewalww/R/process_model_outputs.R index 3ddb17e5..5e936137 100644 --- a/cfaforecastrenewalww/R/process_model_outputs.R +++ b/cfaforecastrenewalww/R/process_model_outputs.R @@ -1028,10 +1028,23 @@ get_raw_param_draws <- function(stan_output_draws, model_type, ) %>% rename(value = p_hosp_w_sd) %>% select(name, t, value, draw) + + p_hosp_mean <- stan_output_draws %>% + spread_draws(p_hosp_mean) %>% + sample_draws(ndraws = n_draws) %>% + mutate(draw = `.draw`) %>% + mutate( + name = "p_hosp_mean", + t = NA + ) %>% + rename(value = p_hosp_mean) %>% + select(name, t, value, draw) + posterior_params <- rbind( phi_h, eta_sd, log10_g, i0, infection_feedback, - autoreg_rt, initial_growth, p_hosp_sd + autoreg_rt, initial_growth, p_hosp_sd, + p_hosp_mean ) } else if (model_type == "hospital admissions only") { posterior_params <- rbind( @@ -1052,10 +1065,22 @@ get_raw_param_draws <- function(stan_output_draws, model_type, rename(value = p_hosp_w_sd) %>% select(name, t, value, draw) + p_hosp_mean <- stan_output_draws %>% + spread_draws(p_hosp_mean) %>% + sample_draws(ndraws = n_draws) %>% + mutate(draw = `.draw`) %>% + mutate( + name = "p_hosp_mean", + t = NA + ) %>% + mutate(value = plogis(p_hosp_mean)) %>% + select(name, t, value, draw) + posterior_params <- rbind( phi_h, eta_sd, log10_g, i0, infection_feedback, - autoreg_rt, initial_growth, p_hosp_sd + autoreg_rt, initial_growth, p_hosp_sd, + p_hosp_mean ) } @@ -1145,13 +1170,7 @@ get_loc_model_map <- function(df_of_filepaths, TRUE ~ model_type ) ) |> - dplyr::mutate( - keep_state = dplyr::case_when( - (location %in% exclude_states) ~ FALSE, - TRUE ~ TRUE - ) - ) |> - dplyr::filter(keep_state == TRUE) |> + dplyr::filter(!(location %in% exclude_states)) |> dplyr::select(location, model_type) us_df <- data.frame(location = "US", model_type = us_model_type) diff --git a/cfaforecastrenewalww/R/sysdata.rda b/cfaforecastrenewalww/R/sysdata.rda index 971754ff..e0a50729 100755 Binary files a/cfaforecastrenewalww/R/sysdata.rda and b/cfaforecastrenewalww/R/sysdata.rda differ diff --git a/cfaforecastrenewalww/data/example_df.rda b/cfaforecastrenewalww/data/example_df.rda index f8cc566f..553640ea 100755 Binary files a/cfaforecastrenewalww/data/example_df.rda and b/cfaforecastrenewalww/data/example_df.rda differ diff --git a/cfaforecastrenewalww/data/param_df.rda b/cfaforecastrenewalww/data/param_df.rda index c462b650..727d9fa5 100755 Binary files a/cfaforecastrenewalww/data/param_df.rda and b/cfaforecastrenewalww/data/param_df.rda differ diff --git a/cfaforecastrenewalww/inst/extdata/example_params.toml b/cfaforecastrenewalww/inst/extdata/example_params.toml index 520ef7ca..f2fb45f2 100644 --- a/cfaforecastrenewalww/inst/extdata/example_params.toml +++ b/cfaforecastrenewalww/inst/extdata/example_params.toml @@ -18,6 +18,8 @@ initial_growth_prior_sd = 0.01 autoreg_rt_a = 2 # shape1 parameter of autoreg term on Rt trend autoreg_rt_b = 40 # shape2 parameter of autoreg on Rt trend # mean = a/(a+b) = 0.05, stdv = sqrt(a)/b = sqrt(2)/40 = 0.035 +autoreg_p_hosp_a = 1 # shape1 parameter of autoreg term on IHR(t) trend +autoreg_p_hosp_b = 100 # shape2 parameter of autoreg term on IHR(t) trend eta_sd_sd = 0.01 infection_feedback_prior_logmean = 6.37408 # log(mode) + q^2 mode = 500, q = 0.4 infection_feedback_prior_logsd = 0.4 @@ -33,8 +35,6 @@ p_hosp_w_sd_sd = 0.01 inv_sqrt_phi_prior_mean = 0.1 # 1 / sqrt(100) inv_sqrt_phi_prior_sd = 0.1414214 # 1 / sqrt(50) -phi_w_prior_mean = 100 -phi_w_prior_sd = 30 wday_effect_prior_mean = 0.1428571 # 1 / 7 wday_effect_prior_sd = 0.05 @@ -51,16 +51,11 @@ log10_g_prior_sd = 2 log_g_prior_mean = 27.63102 # 12 * log(10) log_g_prior_sd = 4.60517 # 2 * log(10) -sigma_ww_prior_sd = 0.5 - sigma_ww_site_prior_mean_mean = 0.5 sigma_ww_site_prior_mean_sd = 1 sigma_ww_site_prior_sd_mean = 0 sigma_ww_site_prior_sd_sd = 1 -autoreg_conc_a = 1 -# shape1 parameter of Beta dist for autoreg term on Ct trend -autoreg_conc_b = 2 -# shape 2 parameter of Beta dist for autoreg term on Ct trend + ww_site_mod_sd_sd = 0.25 log_phi_g_prior_mean = -2.302585 # log(0.1) # prior mean in individual level dispersion diff --git a/cfaforecastrenewalww/inst/stan/functions/hospitalization.stan b/cfaforecastrenewalww/inst/stan/functions/hospitalization.stan index f0687a04..4cf4db5d 100644 --- a/cfaforecastrenewalww/inst/stan/functions/hospitalization.stan +++ b/cfaforecastrenewalww/inst/stan/functions/hospitalization.stan @@ -6,12 +6,24 @@ * @param p_hosp_m matrix to handle conversion from weekly to daily * @param p_hosp_int intercept/first value, on unconstrained scale * @param p_hosp_w_sd, scaling factor for/SD of random walk + * @param autoreg_p_hosp, autoreg parameter for IHR * @param p_hosp_w weekly deviations of the random walk, on unconstrained scale + * @param tot_weeks, number of total weeks in calibration and forecast period + * @param is_stat, whether or not AR(1) process starts at stationarity * @return A vector, daily probabilities of hospitalization */ -vector assemble_p_hosp(matrix p_hosp_m, real p_hosp_int, real p_hosp_w_sd, vector p_hosp_w) { +vector assemble_p_hosp(matrix p_hosp_m, real p_hosp_mean, real p_hosp_w_sd, + real autoreg_p_hosp, + vector p_hosp_w, int tot_weeks, int is_stat) { vector[dims(p_hosp_m)[1]] p_hosp; - p_hosp = p_hosp_m * append_row(p_hosp_int, p_hosp_w_sd * p_hosp_w); + vector[tot_weeks] p_hosp_in_weeks; + + p_hosp_in_weeks = ar1(rep_vector(p_hosp_mean, tot_weeks), + autoreg_p_hosp, + p_hosp_w_sd, + p_hosp_w, + is_stat); + p_hosp = p_hosp_m * p_hosp_in_weeks; p_hosp = inv_logit(p_hosp); return p_hosp; } diff --git a/cfaforecastrenewalww/inst/stan/renewal_ww_hosp.stan b/cfaforecastrenewalww/inst/stan/renewal_ww_hosp.stan index 39cca671..c9e82061 100644 --- a/cfaforecastrenewalww/inst/stan/renewal_ww_hosp.stan +++ b/cfaforecastrenewalww/inst/stan/renewal_ww_hosp.stan @@ -43,13 +43,14 @@ data { vector[6] viral_shedding_pars; // tpeak, viral peak, shedding duration mean and sd real autoreg_rt_a; real autoreg_rt_b; + real autoreg_p_hosp_a; + real autoreg_p_hosp_b; real inv_sqrt_phi_prior_mean; real inv_sqrt_phi_prior_sd; real r_prior_mean; real r_prior_sd; real log10_g_prior_mean; real log10_g_prior_sd; - real i0_over_n_prior_a; real i0_over_n_prior_b; real wday_effect_prior_mean; @@ -58,7 +59,7 @@ data { real initial_growth_prior_sd; real sigma_ww_prior_mean; real eta_sd_sd; - real p_hosp_mean; + real p_hosp_prior_mean; real p_hosp_sd_logit; real p_hosp_w_sd_sd; real inf_feedback_prior_logmean; @@ -87,17 +88,16 @@ parameters { vector[n_weeks-1] w; // weekly random walk real eta_sd; // step size of random walk real autoreg_rt;// coefficient on AR process in R(t) + array [(include_ww==1) ? 1 : 0] real autoreg_p_hosp; real log_r; // baseline reproduction number estimate (log) real i0_over_n; // Per capita incident infections // on day -uot before first observation day real initial_growth; // initial growth from I0 to first observed time real inv_sqrt_phi_h; real sigma_ww; - real p_hosp_int; // Estimated IHR - vector[(include_ww==1) ? tot_weeks -1 : 0] p_hosp_w; + real p_hosp_mean; // Estimated IHR + vector[(include_ww==1) ? tot_weeks : 0] p_hosp_w; array [(include_ww==1) ? 1 : 0] real p_hosp_w_sd; - //vector[tot_weeks -1] p_hosp_w; // weekly random walk for IHR - // real p_hosp_w_sd; // Estimated IHR sd real t_peak; // time to viral load peak in shedding real viral_peak; // log10 peak viral load shed /mL real dur_shed; // duration of detectable viral shedding @@ -110,6 +110,7 @@ parameters { transformed parameters { vector[ot + uot + ht] new_i; // daily incident infections /n vector [(include_ww ==1) ? ot + uot + ht: 1] p_hosp; // probability of hospitalization + vector[(include_ww == 1) ? tot_weeks-1: 0] p_hosp_in_weeks; // the weekly vector of probability of hospital admissions vector[ot + uot + ht] model_hosp; // model estimated hospital admissions vector[oht] exp_obs_hosp; // expected observed hospital admissions vector[ot] exp_obs_hosp_no_wday_effect; // expected observed hospital admissions before weekday effect @@ -146,11 +147,12 @@ transformed parameters { // Expected hospitalizations: // generates all hospitalizations, across unobserved time, observed time, and forecast time if(include_ww==1){ - p_hosp = assemble_p_hosp(p_hosp_m, p_hosp_int, p_hosp_w_sd[1], p_hosp_w); + p_hosp = assemble_p_hosp(p_hosp_m, p_hosp_mean, p_hosp_w_sd[1], + autoreg_p_hosp[1], p_hosp_w, tot_weeks, 1); model_hosp = convolve_dot_product(p_hosp .* new_i, reverse(inf_to_hosp), ot + uot + ht); }else{ - p_hosp[1] = inv_logit(p_hosp_int); + p_hosp[1] = inv_logit(p_hosp_mean); // generates all hospitalizations, across unobserved time, observed time, and forecast time model_hosp = convolve_dot_product(p_hosp[1] * new_i, reverse(inf_to_hosp), ot + uot + ht); @@ -185,6 +187,7 @@ model { w ~ std_normal(); eta_sd ~ normal(0, eta_sd_sd); autoreg_rt ~ beta(autoreg_rt_a, autoreg_rt_b); + autoreg_p_hosp ~ beta(autoreg_p_hosp_a, autoreg_p_hosp_b); log_r ~ normal(r_logmean, r_logsd); i0_over_n ~ beta(i0_over_n_prior_a, i0_over_n_prior_b); initial_growth ~ normal(initial_growth_prior_mean, initial_growth_prior_sd); @@ -192,7 +195,7 @@ model { sigma_ww ~ normal(0, sigma_ww_prior_mean); log10_g ~ normal(log10_g_prior_mean, log10_g_prior_sd); hosp_wday_effect ~ normal(effect_mean, wday_effect_prior_sd); - p_hosp_int ~ normal(logit(p_hosp_mean), p_hosp_sd_logit); + p_hosp_mean ~ normal(logit(p_hosp_prior_mean), p_hosp_sd_logit); p_hosp_w ~ std_normal(); p_hosp_w_sd ~ normal(0, p_hosp_w_sd_sd); t_peak ~ normal(t_peak_mean, t_peak_sd); diff --git a/cfaforecastrenewalww/inst/stan/renewal_ww_hosp_site_level_inf_dynamics.stan b/cfaforecastrenewalww/inst/stan/renewal_ww_hosp_site_level_inf_dynamics.stan index fa56c445..adfc27a6 100644 --- a/cfaforecastrenewalww/inst/stan/renewal_ww_hosp_site_level_inf_dynamics.stan +++ b/cfaforecastrenewalww/inst/stan/renewal_ww_hosp_site_level_inf_dynamics.stan @@ -54,6 +54,8 @@ data { vector[6] viral_shedding_pars;// tpeak, viral peak, shedding duration mean and sd real autoreg_rt_a; real autoreg_rt_b; + real autoreg_p_hosp_a; + real autoreg_p_hosp_b; real inv_sqrt_phi_prior_mean; real inv_sqrt_phi_prior_sd; real r_prior_mean; @@ -73,7 +75,7 @@ data { real sigma_ww_site_prior_sd_mean; real sigma_ww_site_prior_sd_sd; real eta_sd_sd; - real p_hosp_mean; + real p_hosp_prior_mean; real p_hosp_sd_logit; real p_hosp_w_sd_sd; real ww_site_mod_sd_sd; @@ -111,7 +113,8 @@ parameters { real log_r_mu_intercept; // state-level mean baseline reproduction number estimate (log) at t=0 real sigma_rt; // magnitude of site level variation from state level real autoreg_rt_site; - matrix[n_subpops, n_weeks] error_site; // matrix of w_sites + real autoreg_p_hosp; + matrix[n_subpops, n_weeks] error_site; // matrix of subpopulations real i0_over_n; // initial per capita // infection incidence vector[n_subpops] eta_i0; // z-score on logit scale of state @@ -125,8 +128,8 @@ parameters { real sigma_ww_site_mean; //mean of site level stdev real sigma_ww_site_sd; // stdev of site level stdev vector[n_ww_lab_sites]sigma_ww_site_raw; // let each lab-site combo have its own observation error - real p_hosp_int; // Estimated IHR - vector[tot_weeks -1] p_hosp_w; // weekly random walk for IHR + real p_hosp_mean; // Estimated mean IHR + vector[tot_weeks] p_hosp_w; // weekly random walk for IHR real p_hosp_w_sd; // Estimated IHR sd real t_peak; // time to viral load peak in shedding real viral_peak; // log10 peak viral load shed /mL @@ -196,13 +199,13 @@ transformed parameters { tuple(vector[num_elements(state_inf_per_capita)], vector[num_elements(unadj_r)]) output; output = generate_infections( to_vector(unadj_r_site_t), - uot, - gt_rev_pmf, - log(i0_site_over_n[i]), - growth_site[i], - ht, + uot, + gt_rev_pmf, + log(i0_site_over_n[i]), + growth_site[i], + ht, infection_feedback, - infection_feedback_rev_pmf + infection_feedback_rev_pmf ); new_i_site = to_row_vector(output.1); r_site_t[i] = to_row_vector(output.2); @@ -223,8 +226,9 @@ transformed parameters { } - // Assemble a daily vector of p_hosp from the initial value, random walk increments, and scaling SD - p_hosp = assemble_p_hosp(p_hosp_m, p_hosp_int, p_hosp_w_sd, p_hosp_w); + // Set up p_hosp as an AR(1) process that regresses back towards the initial value of p_hosp + p_hosp = assemble_p_hosp(p_hosp_m, p_hosp_mean, p_hosp_w_sd, + autoreg_p_hosp, p_hosp_w, tot_weeks, 1); // Expected hospital admissions per capita: // This is a convolution of incident infections and the hospital-admission delay distribution @@ -269,6 +273,7 @@ model { eta_sd ~ normal(0, eta_sd_sd); autoreg_rt_site ~ beta(autoreg_rt_a, autoreg_rt_b); autoreg_rt ~ beta(autoreg_rt_a, autoreg_rt_b); + autoreg_p_hosp ~ beta(autoreg_p_hosp_a, autoreg_p_hosp_b); log_r_mu_intercept ~ normal(r_logmean, r_logsd); to_vector(error_site) ~ std_normal(); sigma_rt ~ normal(0, sigma_rt_prior); @@ -286,7 +291,7 @@ model { sigma_ww_site_raw ~ std_normal(); log10_g ~ normal(log10_g_prior_mean, log10_g_prior_sd); hosp_wday_effect ~ normal(effect_mean, wday_effect_prior_sd); - p_hosp_int ~ normal(logit(p_hosp_mean), p_hosp_sd_logit); // logit scale + p_hosp_mean ~ normal(logit(p_hosp_prior_mean), p_hosp_sd_logit); // logit scale p_hosp_w ~ std_normal(); p_hosp_w_sd ~ normal(0, p_hosp_w_sd_sd); t_peak ~ normal(t_peak_mean, t_peak_sd); diff --git a/cfaforecastrenewalww/man/example_df.Rd b/cfaforecastrenewalww/man/example_df.Rd index 058a7fe4..6cef1afc 100644 --- a/cfaforecastrenewalww/man/example_df.Rd +++ b/cfaforecastrenewalww/man/example_df.Rd @@ -3,23 +3,56 @@ \docType{data} \name{example_df} \alias{example_df} -\title{Example input data set} +\title{Input dataframe for fitting the site-level infection dynamics wastewater +model} \format{ ## example_df A data frame with 635 rows and 13 columns \describe{ - \item{t}{Time} - \item{lab_wwtp_unique_id}{Unique identifier for each unique combination - of sampling site and testing lab} - \item{log_conc}{Genome copied per liter, log10} - ETC. + \item{t}{The time index in days} + \item{lab_wwtp_unique_id}{The unique identifier of the wasteawter site-lab + combination (1-5) in this instance, + but would vary for the number of unique combinations in your inptu dataset + if trying to construct something mirroring + this structure} + \item{log_conc}{The log scaled viral genomes per mL collected from the + lab-site specified on day date. + NAs indicate days where a sample wasn't reported in that lab-site.} + \item{date}{Sample collection date, formatted as YYYY-MM-DD} + \item{lod_sewage}{The log scaled limit of detection reported by the + lab-site on that date} + \item{below_LOD}{An indicator (0,1) indicating whether the wastewater + observation on that date is + above (0) or below (1) the LOD. NAs for all days with wastewater + observations in that lab-site} + \item{daily_hosp_admits}{The number of individuals admitted to the + hospital on that date, + available as of the forecast date. Note this is repeated for each site-lab.} + \item{daily_hosp_admits_for_eval}{The number of individuals admitted to + the hospital on that date, + available retrospectively (so including after the forecast date). + Note this is repeated for each site-lab} + \item{pop}{The number of people in the "state" or whatever population + contributes to the daily hospital admissions} + \item{forecast_date}{The date the forecast is made, formatted as YYYY-MM-DD} + \item{hosp_calibration_time}{The duration of time in days to calibrate the + model to hospital admissions data} + \item{site}{The unique identifier for the site (also referred to as a + wastewater catchment area). There can be more than + one lab per site if samples are sent to different/overlapping labs.} + \item{ww_pop}{The population size of the site (wastewater catchment area)} } } +\source{ +generate_simulated_data.R +} \usage{ example_df } \description{ -SOME HIGH-LEVEL DESCRIPTION OF THE DATA. MAYBE JUST POINT READERS TO -THE FUNCTION THAT GENERATES IT? +A dataset containing the daily hospital admissons, observed wastewater +concentrations and other attributes needed for the model to be fit. +Generated via the defaults in `generate_simulated_data.R` + The variables are as follows: } \keyword{datasets} diff --git a/cfaforecastrenewalww/man/example_params.Rd b/cfaforecastrenewalww/man/example_params.Rd new file mode 100644 index 00000000..ae34ae0c --- /dev/null +++ b/cfaforecastrenewalww/man/example_params.Rd @@ -0,0 +1,146 @@ +\name{example_params} +\alias{example_params} +\docType{data} +\title{Example model parameters and hyperparameters for priors} +\description{ +The `example_params.toml` file contains modifiable model inputs that get passed +to stan to fit the model. The parameters used here have been chosen by the +authors to best reflect SARS-CoV-2 infection, hospital admissions, and +wastewater components. See the [`Model Parameters`](https://github.com/CDCgov/wastewater-informed-covid-forecasting/blob/prod/model_definition.md#model-parameters) +section of the model_definition.md in our Github for more information/justification of these +prior/parameter choices. + +If running the model on SARS-CoV-2 hospital admissions and wastewater concentrations, +it may be reasonable to use these example parameters as a starting place for your +data (aka none are dataset specific). The values that will need tweaking for a +specific dataset are defined in the documentation for `example_df`. +} +\usage{example_params <- cfaforecastrenewalww::get_params(fs::path_package( + "extdata", "example_params.toml", + package = "cfaforecastrenewalww" + )) + } +\format{ + A .toml file that gets formatted as a 1 row dataframe where each column is a + parameter name and its value is indicated in the first row +\describe{ + \item{\code{uot}}{"unoubserved time" an integer that dictates the duration + of time (in days) that the model is initialized with exponential growth + before any observations are available.} + \item{\code{r_prior_mean}}{Prior on mean R(t)} + \item{\code{r_prior_sd}}{Prior on standard deviation in the mean R(t)} + \item{\code{sigma_rt_prior}}{Prior on the standard deviation between + subpopulation level R(t)s} + \item{\code{i0_certainty}}{Prior on the certainty in initial infections + I0/N, representing the effective number of binomial trials in the beta + prior centered on I0/N} + \item{\code{initial_growth_prior_mean}}{Prior on mean exponential growth + rate(daily) in the unobserved time during infection initialization} + \item{\code{initial_growth_prior_sd}}{Prior on the standard deviation in + the daily exponential growth rate during the unobserved time} + \item{\code{autoreg_rt_a}}{Prior on the autoregulation term on the R(t) + trend and the AR(1) process in subpopulation level R(t), shape1 parameter of + a beta prior} + \item{\code{autoreg_rt_b}}{Prior on the autoregulation term on the R(t) + trend and the AR(1) process in subpopulation level R(t), shape2 parameter of + a beta prior} + \item{\code{eta_sd_sd}}{Prior on standard deviation in the deviation in the + R(t) trend} + \item{\code{infection_feedback_prior_logmean}}{Prior on the log mean + infection feedback term} + \item{\code{infection_feedback_prior_logsd}}{Prior on log standard + deviation of the infection feedback term)} + \item{\code{p_hosp_mean}}{Prior on the mean infection hospital admissions + rate} + \item{\code{p_hosp_sd_logit}}{Prior on the logit scale of the standard deviation + of the mean infection hospital admissions rate} + \item{\code{p_hosp_sd_sd}}{Prior on the standard deviation in the time-varying + deviation in the infection hospital admissions rate} + \item{\code{inv_sqrt_phi_prior_mean}}{Prior on the mean of the inverse square + root of the negative binomial `phi` of the hospital admissions + observation process} + \item{\code{inv_sqrt_phi_prior_sd}}{Prior on the standard deviation of the + inverse square root of the negative binomial `phi` of the hospital + admissions observation process} + \item{\code{wday_effect_prior_mean}}{Prior on the mean amount of weight + assigned to each day of the week proportional to the whole week} + \item{\code{wday_effect_prior_sd}}{Prior on the standard deviation in the + amount of weight assigned to each day of the week proportional to the whole + week} + \item{\code{ml_of_ww_per_person_day}}{Set value of the estimated number + of mL of wastewter produced per person per day} + \item{\code{t_peak_mean}}{Prior on the mean time of peak fecal shedding in + wastewater} + \item{\code{t_peak_sd}}{Prior on the standard deviation in the time of peak + fecal shedding in wastewater} + \item{\code{viral_peak_mean}}{Prior on mean peak viral load (in log10 scale) + of fecal shedding of viral genomes in wastewaster} + \item{\code{viral_peak_sd}}{Prior on the standard deviation in the peak viral + load (in log10 scale) of fecal shedding of viral genomes in wasteawter} + \item{\code{duration_shedding_mean}}{Prior on mean duration of fecal shedding + in wastewater} + \item{\code{duration_shedding_sd}}{Prior on the standard deviation of + fecal shedding in wastewater} + \item{\code{log10_g_prior_mean}}{Prior on the mean number of genomes shed + per infected individual in log10 scale} + \item{\code{log10_g_prior_sd}}{Prior on standard deviation in the number of + genomes shed per infected individual in log10 scale} + \item{\code{log_g_prior_mean}}{Prior on the mean number of genomes shed + per infected individual in log scale} + \item{\code{log_g_prior_sd}}{Prior on standard deviation in the number of + genomes shed per infected individual in log scale} + \item{\code{sigma_ww_site_prior_mean_mean}}{Prior on mean of the mean + site-lab-level observation error} + \item{\code{sigma_ww_site_prior_mean_sd}}{Prior on standard deviation of the + mean site-lab-level observation error} + \item{\code{sigma_ww_site_prior_sd_mean}}{Prior on the mean of the standard + deviation across sites in site-lab-level observation errors} + \item{\code{sigma_ww_site_prior_sd_sd}}{Prior on standard deviaiton of the + standard deviation across sites in site-lab level observation errors} + \item{\code{ww_site_mod_sd_sd}}{Prior on the standard deviation in the + standard deviation between site-lab level multipliers} + \item{\code{log_phi_g_prior_mean}}{Prior on mean individual level dispersion + in number of genomes shed per infection, not currently used in the model} + \item{\code{log_phi_g_prior_sd}}{Prior on the standard deviation in + individual level dispersion in number of genomes shed per infection, not + currently used in the model} + \item{\code{mu_gi}}{Set parameter for the log mean of the generation + interval} + \item{\code{sigma_gi}}{Set parameter for the log standard deviaiton of the + generation interval} + \item{\code{gt_max}}{Integer indicating the maximum number of days + for which secondary transmission can occur} + \item{\code{r}}{Exponential growth rate on the correction factor applied to + the incubation period estimate} + \item{\code{backward_shape}}{Set parameter for the backward shape parameter + of the Weibull distribution for the incubation period} + \item{\code{backward_scale}}{Set parameter for the backward scale parameter + of the Weibull distribution for the incubation period} + \item{\code{neg_binom_mu}}{Set parameter for the mean of the negative + binomially distributed time from symptom onset to hospital admission} + \item{\code{neg_binom_size}}{Set parameter for the size parameter of the + negative binomially distributed time from symptom onset to hospital + admissions} +} +} +\details{ +These parameters specify the priors on key parameters, except where indicated +as set parameters. In these instances (generation interval, incubation period, +symptom onset to hospital admissions) we pre-define the delay distrubions using +estimates from the literature. +} +\source{ +The set parameters for the generation interval and incubation period were +obtained from: +Park, Sang Woo, et al. "Inferring the differences in incubation-period +and generation-interval distributions of the Delta and Omicron variants of +SARS-CoV-2." Proceedings of the National Academy of Sciences 120.22 (2023): +e2221887120. + +See [Model Parameters]([`Model Parameters`](https://github.com/CDCgov/wastewater-informed-covid-forecasting/blob/prod/model_definition.md#model-parameters) +for the full set of references for each prior. +} +\references{ +%% ~~ possibly secondary sources and usages ~~ +} +\keyword{datasets} diff --git a/cfaforecastrenewalww/man/get_plot_change_in_p_hosp_t.Rd b/cfaforecastrenewalww/man/get_plot_change_in_p_hosp_t.Rd new file mode 100644 index 00000000..bb0e2c49 --- /dev/null +++ b/cfaforecastrenewalww/man/get_plot_change_in_p_hosp_t.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plots.R +\name{get_plot_change_in_p_hosp_t} +\alias{get_plot_change_in_p_hosp_t} +\title{Get plot of the change in IHR over time} +\usage{ +get_plot_change_in_p_hosp_t(df, figure_file_path, write_files = TRUE) +} +\arguments{ +\item{df}{dataframe of filepaths} + +\item{figure_file_path}{where to write the figure to} + +\item{write_files}{whether or not to save the figure, default is TRUE} +} +\value{ +a plot of the change in the IHR over time (IHR intercept - IHR(t)) +} +\description{ +Get plot of the change in IHR over time +} diff --git a/cfaforecastrenewalww/man/get_plot_p_hosp_t.Rd b/cfaforecastrenewalww/man/get_plot_p_hosp_t.Rd new file mode 100644 index 00000000..765480eb --- /dev/null +++ b/cfaforecastrenewalww/man/get_plot_p_hosp_t.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plots.R +\name{get_plot_p_hosp_t} +\alias{get_plot_p_hosp_t} +\title{Get plot of IHR(t)} +\usage{ +get_plot_p_hosp_t( + df, + figure_file_path, + params, + from_full_df = FALSE, + write_files = TRUE +) +} +\arguments{ +\item{df}{dataframe either of model draws or file path to model draws} + +\item{figure_file_path}{where to save the figure} + +\item{params}{input params, used to plot the prior} + +\item{from_full_df}{whether df is a dataframe of draws or filepath to draws, +default is FALSE} + +\item{write_files}{whether to save the figure, default is TRUE} +} +\description{ +Get plot of IHR(t) +} diff --git a/cfaforecastrenewalww/man/get_plot_single_param.Rd b/cfaforecastrenewalww/man/get_plot_single_param.Rd new file mode 100644 index 00000000..9a8c8077 --- /dev/null +++ b/cfaforecastrenewalww/man/get_plot_single_param.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plots.R +\name{get_plot_single_param} +\alias{get_plot_single_param} +\title{Get plot single parameter} +\usage{ +get_plot_single_param( + df, + param_name, + figure_file_path, + from_full_df = FALSE, + write_files = TRUE, + ... +) +} +\arguments{ +\item{df}{df of filepaths or of draws themselves} + +\item{param_name}{the name of the parameter to plot} + +\item{figure_file_path}{path to save figure} + +\item{from_full_df}{whether or not df is a df of filepaths or of draws, +default = FALSE which means it is of filepaths} + +\item{write_files}{whether or not to write files (default = TRUE)} + +\item{...}{} +} +\description{ +Get plot single parameter +} diff --git a/cfaforecastrenewalww/man/get_rt_from_draws.Rd b/cfaforecastrenewalww/man/get_rt_from_draws.Rd index e86058d6..86c88646 100644 --- a/cfaforecastrenewalww/man/get_rt_from_draws.Rd +++ b/cfaforecastrenewalww/man/get_rt_from_draws.Rd @@ -12,6 +12,8 @@ get_rt_from_draws( ) } \arguments{ +\item{df}{dataframe of either model draws or filepath to model draws} + \item{figure_file_path}{higher level directory where data from this pipeline will be saved} \item{from_full_df}{if TRUE then df is draws object, if FALSE then df is a @@ -19,8 +21,6 @@ dataframe of filepaths to load in the parameter object} \item{write_files}{if TRUE then write files to specified location, if FALSE then don't} - -\item{quantiles}{dataframe of forecasts with quantiles including for R(t)} } \value{ plot of R(t) for a single location, model, and forecast date diff --git a/cfaforecastrenewalww/man/param_df.Rd b/cfaforecastrenewalww/man/param_df.Rd index 34654aea..7957828c 100644 --- a/cfaforecastrenewalww/man/param_df.Rd +++ b/cfaforecastrenewalww/man/param_df.Rd @@ -3,24 +3,27 @@ \docType{data} \name{param_df} \alias{param_df} -\title{Example input parameters} +\title{Input parameters used to generate `example_df`} \format{ ## param_df A data frame with 305 rows and 4 columns \describe{ \item{name}{Parameter name} - \item{true_value}{} - \item{index_rows}{} - \item{index_cols}{} + \item{true_value}{ The value that was assigned to this parameter in the + data generation process} + \item{index_rows}{If the data is dynamic, the other indexing } + \item{index_cols}{If the data is dynamic the indexing (e.g. time in + days or weeks) } } } \source{ -wwww.wherever.com +generate_simulated_data.R } \usage{ param_df } \description{ -SOME HIGH-LEVEL DESCRIPTION OF THE DATA, INCLUDING HOW DERIVED +A few key parameters that were created by the default arguments in +`generate_simulated_data.R` } \keyword{datasets} diff --git a/cfaforecastrenewalww/tests/testthat/test_hosp_transformation.R b/cfaforecastrenewalww/tests/testthat/test_hosp_transformation.R index 8d5592cc..191aa853 100644 --- a/cfaforecastrenewalww/tests/testthat/test_hosp_transformation.R +++ b/cfaforecastrenewalww/tests/testthat/test_hosp_transformation.R @@ -7,24 +7,26 @@ test_that("Test logit-scale random walk on probability of being hospitalized in # Make sure we cover a wide range sigma <- 0.5 - lower <- -20 - upper <- 20 - delta <- ((upper - lower) / (nweeks - 1)) + ac <- 0.1 + std_normal <- rnorm((nweeks)) + mu <- rep(logit_fn(0.01), (nweeks)) - # Build the vector ourselves - p_hosp_r <- numeric(nweeks) - p_hosp_r <- lower + c(0, cumsum(rep(delta, nweeks - 1))) - p_hosp_r <- inv_logit_fn(p_hosp_r) - p_hosp_r <- sort(rep(p_hosp_r, 7)) # Make daily, will overshoot + # Build the vector ourselves using the AR1 function from stan + p_hosp_r <- model$functions$ar1(mu, ac, sigma, std_normal, 1) + p_hosp_r <- rep(p_hosp_r, each = 7) # In R, expand weekly to daily + p_hosp_r <- inv_logit_fn(p_hosp_r) # convert to natural scale p_hosp_r <- p_hosp_r[1:ndays] # Trim to size + # Get vector from stan and compare - delta <- delta / sigma # make sure stan SD != 1, but canceling them out makes comparing easy p_hosp_stan <- model$functions$assemble_p_hosp( - toy_stan_data_id$p_hosp_m, # cum sum and expand matrix - lower, # intercept + toy_stan_data_id$p_hosp_m, # matrix to expand from weekly to daily + mu[1], # intercept to regress back to sigma, # SD - rep(delta, nweeks - 1) # noise terms + ac, # autocorrelation factor + std_normal, + nweeks, + 1 ) testthat::expect_equal( diff --git a/cfaforecastrenewalww/vignettes/toy_data_vignette.Rmd b/cfaforecastrenewalww/vignettes/toy_data_vignette.Rmd index 8b9f2bb0..a954ea24 100644 --- a/cfaforecastrenewalww/vignettes/toy_data_vignette.Rmd +++ b/cfaforecastrenewalww/vignettes/toy_data_vignette.Rmd @@ -9,7 +9,10 @@ This vignette steps through the pre-processing and fitting procedure to forecast state-level hospital admissions using simulated data. The code can be adapted for your own datasets by replacing the joined hospital admissions + site-lab level wastewater dataset, here called, `example_df`, with your own dataset and -varying the parameters that are pre-specified. +varying the parameters that are pre-specified. See the documentation for +`example_df` to format your data in a similar manner. For an example of how to +do this from the raw generated outputs, see the `generate_simulated_data.R` +function. ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) @@ -28,26 +31,26 @@ library(tidybayes) # by calling `generate_simulated_data()` # See function documentation to modify R(t), number of sites, etc. example_df <- cfaforecastrenewalww::example_df -param_df <- cfaforecastrenewalww::param_df ``` The `generate_simulated_data.R` file can be used to generate this simulated -data, using the most complex model, where individual sites -exhibit their own site-level infection dynamics, site-level time-varying -concentrations of wastewater, and lab-site level measurement effects. There -are options to -turn off all but the lab-site measurement effects. Setting `n_sites` to one +data where individual sites exhibit their own site-level infection dynamics. +Setting `n_sites` to one allows the user to generate data amenable to the state-level aggregated wastewater model, or alternatively, site-level data can be generated and then aggregated using population weighted averaging of the concentration to generate -a single state-level wastewater data stream. +a single state-level wastewater data stream. However, this vignette assumes you +have wastewater data from one or more sites that represent some subset of the +population for which you have hospital admissions data for. # Data exploration -The example dataframe is a tidy data frame with hospital admissions alongside +The example dataframe is a long tidy data frame with hospital admissions alongside site-level wastewater data, corresponding to a single observation for each lab-site-day. We assign an arbitrary date to the time series so that we can assign a week day to each day. We additionally include a column containing -the daily hospital admissions through the forecast period. This will +the daily hospital admissions through the forecast period +`daily_hosp_admits_for_eval`. This will be used for evaluating our simulated data inference procedure. +See documentation of `example_df` for a full description of each column. ```{r} head(example_df) ``` @@ -154,7 +157,7 @@ train_data <- flag_ww_outliers(train_data_raw) # delay distribution to pass to stan. # Use the same values as we do in the # generation of these vectors in the generate simulated data function.... -# See Song Woo Park et al +# See Park et al # https://www.medrxiv.org/content/10.1101/2024.01.12.24301247v1 for why # we use a double-censored pmf here generation_interval <- simulate_double_censored_pmf( @@ -328,6 +331,8 @@ full_param_df <- get_full_param_distrib( ) # Compare to the knows parameters +param_df <- cfaforecastrenewalww::param_df # A set of the set static and dynamic +# from the `generate_simulated_data()` function default args. This is package data comp_df <- full_param_df %>% left_join(param_df, by = c("name", "index_rows", "index_cols") diff --git a/input/params.toml b/input/params.toml index 520ef7ca..ab37e68b 100644 --- a/input/params.toml +++ b/input/params.toml @@ -18,6 +18,8 @@ initial_growth_prior_sd = 0.01 autoreg_rt_a = 2 # shape1 parameter of autoreg term on Rt trend autoreg_rt_b = 40 # shape2 parameter of autoreg on Rt trend # mean = a/(a+b) = 0.05, stdv = sqrt(a)/b = sqrt(2)/40 = 0.035 +autoreg_p_hosp_a = 1 # shape1 parameter of autoreg term on IHR(t) trend +autoreg_p_hosp_b = 100 # shape2 parameter of autoreg term on IHR(t) trend eta_sd_sd = 0.01 infection_feedback_prior_logmean = 6.37408 # log(mode) + q^2 mode = 500, q = 0.4 infection_feedback_prior_logsd = 0.4 diff --git a/model_definition.md b/model_definition.md index 84e5caf4..499e9789 100644 --- a/model_definition.md +++ b/model_definition.md @@ -126,17 +126,24 @@ We define the discrete hospital admissions delay distribution $d(\tau)$ as a con #### Infection-hospitalization rate In the models that include fits to wastewater data, we allow the population-level infection-hospitalization rate (IHR) to change over time. An inferred change in the IHR could reflect either a true change in the rate at which infections result in hospital admissions (e.g. the age distribution of cases could shift, a more or less severe variant could emerge, or vaccine coverage could shift) or a change in the relationship between infections and genomes shed in wastewater $G$ (which we currently treat as fixed, but which could change in time if, for example, immunity reduces wastewater shedding without reducing transmission, or a variant emerges with a different per infection wastewater shedding profile). -Therefore, we model the hospitalization proportion $p_\mathrm{hosp}(t)$ as a piecewise-constant function with weekly change points. +Therefore, we model the proportion of infections that give rise to hospital admissions $p_\mathrm{hosp}(t)$ as a piecewise-constant function with weekly change points. If $t$ and $t'$ are two days in the same week, then $p_\mathrm{hosp}(t) = p_\mathrm{hosp}(t')$. -The values $p_\mathrm{hosp}(t)$ follow a random walk. -For $t_1$ and $t_2$ in two successive weeks: +The values $p_\mathrm{hosp}(t)$ follow a logit-scale AR(1) process with linear scale median $\mu_{p_\mathrm{hosp}}$ -$$ \mathrm{logit} (p_{\mathrm{hosp}}(t_2)) \sim \mathrm{Normal}(\mathrm{logit}(p_{\mathrm{hosp}}(t_1)), \sigma_p)$$ -The process is initialized at the calibration start time $t_0$ via a prior distribution (see [Prior Distributions](#prior-distributions) below). +$$ \mathrm{logit} (p_{\mathrm{hosp}}(t)) = \mathrm{logit} (\mu_{p_{\mathrm{hosp}}}(t)) + \delta_{H}(t)$$ -In hospital admissions only models, we model the IHR as constant. We assign this constant IHR the same prior distribution we assign to the intercept of the time-varying IHR in the wastewater-informed models. +where $\delta_{H}(t)$ is the time-varying site effect on $\mathrm{logit} (p_{\mathrm{hosp}}(t))$, modeled as, + +$$\delta_{H}(t) = \varphi_H \delta_{H}(t-1) + \epsilon_{Ht}$$ + +where $0 < \varphi_H < 1$ and $\epsilon_{Ht} \sim \mathrm{Normal}(0, \sigma_{H\delta})$. + +We chose a relatively strong prior of $\varphi_H \sim \mathrm{beta}(1,100)$ to impose minimal autocorrelation, and similarly chose a small prior on $\sigma_{H\delta} \sim \mathrm{Normal}(0, 0.01)$ to impose our believe that the IHR should only change in time if the data strongly suggests it. +See [Prior Distributions](#prior-distributions) for the specified prior on $\mu_{p_\mathrm{hosp}}$. + +In hospital admissions only models, we model the IHR as constant. We assign this constant IHR the same prior distribution that we assign $\mu_{p_\mathrm{hosp}}$ in the wastewater model. ### Hospital admissions observation model We model the observed hospital admission counts $h_t$ as: @@ -222,15 +229,19 @@ $$ where $\delta_k(t)$ is the time-varying subpopulation effect on $\mathcal{R}(t)$, modeled as, -$$\delta_k(t) = \varphi \delta_k(t-1) + \epsilon_{kt}$$ +$$\delta_k(t) = \varphi_{R(t)} \delta_k(t-1) + \epsilon_{kt}$$ -where $0 < \varphi < 1$ and $\epsilon_{kt} \sim \mathrm{Normal}(0, \sigma_\delta)$. +where $0 < \varphi_{R(t)} < 1$ and $\epsilon_{kt} \sim \mathrm{Normal}(0, \sigma_{R(t)\delta})$. + +We chose a prior of $\varphi_{R(t)} \sim \mathrm{beta}(2,40)$ to impose limited autocorrelation in the week-by-week deviations. +We set a weakly informative prior $\sigma_{R(t)\delta} \sim \mathrm{Normal}(0, 0.3)$ to allow for either limited or substantial site-site heterogeneity in $\mathcal{R}(t)$, with the degree of heterogeneity inferred from the data. The subpopulation $\mathcal{R}_{k}(t)$ is subject to the infection feedback described above such that: ```math \mathcal{R}_k(t) = \mathcal{R}^\mathrm{u}_k(t) \exp \left(-\gamma \sum_{\tau = 1}^{T_f} I_k(t-\tau) g(\tau) \right) ``` + From $\mathcal{R}_{k}(t)$, we generate values of the supopulation-level _expected_ latent incident infections per capita $I_k(t)$ using the renewal process described in [the infection component](#infection-component). To obtain the number of statewide infections per capita $I(t)$, we sum the $K_\mathrm{total}$ subpopulation per capita infection counts $I_k(t)$ weighted by their population sizes: @@ -238,6 +249,7 @@ To obtain the number of statewide infections per capita $I(t)$, we sum the $K_\m ```math I(t) = \frac{1}{\sum\nolimits_{k=1}^{K_\mathrm{total}} n_k} \sum_{k=1}^{K_\mathrm{total}} n_k I_k(t) ``` + We infer the site level initial per capita incidence $I_k(0)$ hierarchically. Specifically, we treat $\mathrm{logit}[I_k(0)]$ as Normally distributed about the corresponding state-level value $\mathrm{logit}[I(0)]$, with an estimated standard deviation $\sigma_{i0}$: $$ @@ -367,7 +379,7 @@ This resulting infection to hospital admission delay distribution has a mean of Our framework is an extension of the widely used [^CDCRtestimates] [^CDCtechnicalblog], semi-mechanistic renewal framework `{EpiNow2}` [^epinow2paper][^EpiNow2], using a Bayesian latent variable approach implemented in the probabilistic programming language Stan [^stan] using [^cmdstanr] to interface with R. For submission to the [COVID-19 Forecast Hub](https://github.com/reichlab/covid19-forecast-hub/tree/master), the model is run on Saturday to generate forecasts each Monday. -For each location, we run 4 chains for 250 warm-up iterations and 500 sampling iterations, with a target average acceptance probability of 99% and a maximum tree depth of 12. +For each location, we run 4 chains for 750 warm-up iterations and 500 sampling iterations, with a target average acceptance probability of 0.95 and a maximum tree depth of 12. To generate forecasts per the hub submission guidelines, we calculate the necessary quantiles from the 2,000 draws from the posterior of the expected observed hospital admissions 28 days ahead of the Monday forecast date. ## Appendix: Notation diff --git a/src/write_config.R b/src/write_config.R index 4eb0d31a..0f0f5e08 100644 --- a/src/write_config.R +++ b/src/write_config.R @@ -103,22 +103,38 @@ write_config <- function(run_id, dates_for_hosp_removal <- c( seq( - from = lubridate::ymd("2024-03-13"), - to = lubridate::ymd("2024-03-14"), + from = lubridate::ymd("2024-02-27"), + to = lubridate::ymd("2024-02-29"), by = "days" - ), # OR exclude two repeats + ), # NM exclude repeats seq( from = lubridate::ymd("2024-02-27"), - to = lubridate::ymd("2024-02-29"), + to = lubridate::ymd("2024-03-02"), + by = "days" + ), # TX anomalies + seq( + from = lubridate::ymd("2024-02-27"), + to = lubridate::ymd("2024-03-02"), + by = "days" + ), # US exclusions bc of TX anomalies + seq( + from = lubridate::ymd("2024-03-18"), + to = lubridate::ymd("2024-03-19"), + by = "days" + ), # NJ repeats + seq( + from = lubridate::ymd("2024-03-16"), + to = lubridate::ymd("2024-03-22"), by = "days" - ), # NM exclude repeats, - lubridate::ymd("2023-03-14") # second to last MS data point + ) # OR anomalies ) states_for_hosp_removal <- c( - rep("OR", 2), rep("NM", 3), - "MS" + rep("TX", 5), + rep("US", 5), + rep("NJ", 2), + rep("OR", 7) )