diff --git a/NAMESPACE b/NAMESPACE index b3e0bc8ae..459db88c1 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -34,6 +34,8 @@ S3method(extrapolate_quantiles,dist_default) S3method(extrapolate_quantiles,dist_quantiles) S3method(extrapolate_quantiles,distribution) S3method(fit,epi_workflow) +S3method(flusight_hub_formatter,canned_epipred) +S3method(flusight_hub_formatter,data.frame) S3method(format,dist_quantiles) S3method(is.na,dist_quantiles) S3method(is.na,distribution) @@ -126,6 +128,7 @@ export(fit) export(flatline) export(flatline_args_list) export(flatline_forecaster) +export(flusight_hub_formatter) export(frosting) export(get_test_data) export(grab_names) diff --git a/R/cdc_baseline_forecaster.R b/R/cdc_baseline_forecaster.R index 62e5172cb..ea8e35a06 100644 --- a/R/cdc_baseline_forecaster.R +++ b/R/cdc_baseline_forecaster.R @@ -12,13 +12,13 @@ #' This forecaster is meant to produce exactly the CDC Baseline used for #' [COVID19ForecastHub](https://covid19forecasthub.org) #' -#' @param epi_data An [epiprocess::epi_df] +#' @param epi_data An [`epiprocess::epi_df`] #' @param outcome A scalar character for the column name we wish to predict. #' @param args_list A list of additional arguments as created by the #' [cdc_baseline_args_list()] constructor function. #' -#' @return A data frame of point and interval forecasts at for all -#' aheads (unique horizons) for each unique combination of `key_vars`. +#' @return A data frame of point and interval forecasts for all aheads (unique +#' horizons) for each unique combination of `key_vars`. #' @export #' #' @examples @@ -26,7 +26,7 @@ #' weekly_deaths <- case_death_rate_subset %>% #' select(geo_value, time_value, death_rate) %>% #' left_join(state_census %>% select(pop, abbr), by = c("geo_value" = "abbr")) %>% -#' mutate(deaths = pmax(death_rate / 1e5 * pop, 0)) %>% +#' mutate(deaths = pmax(death_rate / 1e5 * pop * 7, 0)) %>% #' select(-pop, -death_rate) %>% #' group_by(geo_value) %>% #' epi_slide(~ sum(.$deaths), before = 6, new_col_name = "deaths") %>% diff --git a/R/flusight_hub_formatter.R b/R/flusight_hub_formatter.R new file mode 100644 index 000000000..d433ab2a7 --- /dev/null +++ b/R/flusight_hub_formatter.R @@ -0,0 +1,131 @@ +abbr_to_fips <- function(abbr) { + fi <- dplyr::left_join( + tibble::tibble(abbr = tolower(abbr)), + state_census, by = "abbr") %>% + dplyr::mutate(fips = as.character(fips), fips = case_when( + fips == "0" ~ "US", + nchar(fips) < 2L ~ paste0("0", fips), + TRUE ~ fips + )) %>% + pull(.data$fips) + names(fi) <- NULL + fi +} + +#' Format predictions for submission to FluSight forecast Hub +#' +#' This function converts predictions from any of the included forecasters into +#' a format (nearly) ready for submission to the 2023-24 +#' [FluSight-forecast-hub](https://github.com/cdcepi/FluSight-forecast-hub). +#' See there for documentation of the required columns. Currently, only +#' "quantile" forcasts are supported, but the intention is to support both +#' "quantile" and "pmf". For this reason, adding the `output_type` column should +#' be done via the `...` argument. See the examples below. The specific required +#' format for this forecast task is [here](https://github.com/cdcepi/FluSight-forecast-hub/blob/main/model-output/README.md). +#' +#' @param object a data.frame of predictions or an object of class +#' `canned_epipred` as created by, e.g., [arx_forecaster()] +#' @param ... <[`dynamic-dots`][rlang::dyn-dots]> Name = value pairs of constant +#' columns (or mutations) to perform to the results. See examples. +#' @param .fcast_period Control whether the `horizon` should represent days or +#' weeks. Depending on whether the forecaster output has target dates +#' from [layer_add_target_date()] or not, we may need to compute the horizon +#' and/or the `target_end_date` from the other available columns in the predictions. +#' When both `ahead` and `target_date` are available, this is ignored. If only +#' `ahead` or `aheads` exists, then the target date may need to be multiplied +#' if the `ahead` represents weekly forecasts. Alternatively, if only, the +#' `target_date` is available, then the `horizon` will be in days, unless +#' this argument is `"weekly"`. Note that these can be adjusted later by the +#' `...` argument. +#' +#' @return A [tibble::tibble]. If `...` is empty, the result will contain the +#' columns `reference_date`, `horizon`, `target_end_date`, `location`, +#' `output_type_id`, and `value`. The `...` can perform mutations on any of +#' these. +#' @export +#' +#' @examples +#' library(dplyr) +#' weekly_deaths <- case_death_rate_subset %>% +#' select(geo_value, time_value, death_rate) %>% +#' left_join(state_census %>% select(pop, abbr), by = c("geo_value" = "abbr")) %>% +#' mutate(deaths = pmax(death_rate / 1e5 * pop * 7, 0)) %>% +#' select(-pop, -death_rate) %>% +#' group_by(geo_value) %>% +#' epi_slide(~ sum(.$deaths), before = 6, new_col_name = "deaths") %>% +#' ungroup() %>% +#' filter(weekdays(time_value) == "Saturday") +#' +#' cdc <- cdc_baseline_forecaster(weekly_deaths, "deaths") +#' flusight_hub_formatter(cdc) +#' flusight_hub_formatter(cdc, target = "wk inc covid deaths") +#' flusight_hub_formatter(cdc, target = paste(horizon, "wk inc covid deaths")) +#' flusight_hub_formatter(cdc, target = "wk inc covid deaths", output_type = "quantile") +flusight_hub_formatter <- function( + object, ..., + .fcast_period = c("daily", "weekly")) { + UseMethod("flusight_hub_formatter") +} + +#' @export +flusight_hub_formatter.canned_epipred <- function( + object, ..., + .fcast_period = c("daily", "weekly")) { + flusight_hub_formatter(object$predictions, ..., .fcast_period = .fcast_period) +} + +#' @export +flusight_hub_formatter.data.frame <- function( + object, ..., + .fcast_period = c("daily", "weekly")) { + required_names <- c(".pred", ".pred_distn", "forecast_date", "geo_value") + optional_names <- c("ahead", "target_date") + hardhat::validate_column_names(object, required_names) + if (!any(optional_names %in% names(object))) { + cli::cli_abort("At least one of {.val {optional_names}} must be present.") + } + + dots <- enquos(..., .named = TRUE) + names <- names(dots) + + object <- object %>% + # combine the predictions and the distribution + dplyr::mutate(.pred_distn = nested_quantiles(.pred_distn)) %>% + dplyr::rowwise() %>% + dplyr::mutate( + .pred_distn = list(add_row(.pred_distn, q = .pred, tau = NA)), + .pred = NULL + ) %>% + tidyr::unnest(.pred_distn) %>% + # now we create the correct column names + dplyr::rename( + value = q, + output_type_id = tau, + reference_date = forecast_date + ) %>% + # convert to fips codes, and add any constant cols passed in ... + dplyr::mutate(location = abbr_to_fips(tolower(geo_value)), geo_value = NULL) + + # create target_end_date / horizon, depending on what is available + pp <- ifelse(match.arg(.fcast_period) == "daily", 1L, 7L) + has_ahead <- charmatch("ahead", names(object)) + if ("target_date" %in% names(object) && !is.na(has_ahead)) { + object <- object %>% + dplyr::rename( + target_end_date = target_date, + horizon = !!names(object)[has_ahead] + ) + } else if (!is.na(has_ahead)) { # ahead present, not target date + object <- object %>% + dplyr::rename(horizon = !!names(object)[has_ahead]) %>% + dplyr::mutate(target_end_date = horizon * pp + reference_date) + } else { # target_date present, not ahead + object <- object %>% + dplyr::rename(target_end_date = target_date) %>% + dplyr::mutate(horizon = as.integer((target_end_date - reference_date)) / pp) + } + object %>% dplyr::relocate( + reference_date, horizon, target_end_date, location, output_type_id, value + ) %>% + dplyr::mutate(!!!dots) +} diff --git a/R/layer_cdc_flatline_quantiles.R b/R/layer_cdc_flatline_quantiles.R index bd2af0bf6..a024905cd 100644 --- a/R/layer_cdc_flatline_quantiles.R +++ b/R/layer_cdc_flatline_quantiles.R @@ -22,20 +22,30 @@ #' @inheritParams layer_residual_quantiles #' @param aheads Numeric vector of desired forecast horizons. These should be #' given in the "units of the training data". So, for example, for data -#' typically observed daily (possibly with missing values), but -#' with weekly forecast targets, you would use `c(7, 14, 21, 28)`. But with -#' weekly data, you would use `1:4`. +#' typically observed daily (possibly with missing values), but with weekly +#' forecast targets, you would use `c(7, 14, 21, 28)`. But with weekly data, +#' you would use `1:4`. #' @param quantile_levels Numeric vector of probabilities with values in (0,1) #' referring to the desired predictive intervals. The default is the standard #' set for the COVID Forecast Hub. #' @param nsims Positive integer. The number of draws from the empirical CDF. -#' These samples are spaced evenly on the (0, 1) scale, F_X(x) resulting -#' in linear interpolation on the X scale. This is achieved with +#' These samples are spaced evenly on the (0, 1) scale, F_X(x) resulting in +#' linear interpolation on the X scale. This is achieved with #' [stats::quantile()] Type 7 (the default for that function). -#' @param nonneg Logical. Force all predictive intervals be non-negative. -#' Because non-negativity is forced _before_ propagating forward, this -#' has slightly different behaviour than would occur if using -#' [layer_threshold()]. +#' @param symmetrize Scalar logical. If `TRUE`, does two things: (i) forces the +#' "empirical" CDF of residuals to be symmetric by pretending that for every +#' actually-observed residual X we also observed another residual -X, and (ii) +#' at each ahead, forces the median simulated value to be equal to the point +#' prediction by adding or subtracting the same amount to every simulated +#' value. Adjustments in (ii) take place before propagating forward and +#' simulating the next ahead. This forces any 1-ahead predictive intervals to +#' be symmetric about the point prediction, and encourages larger aheads to be +#' more symmetric. +#' @param nonneg Scalar logical. Force all predictive intervals be non-negative. +#' Because non-negativity is forced _before_ propagating forward, this has +#' slightly different behaviour than would occur if using [layer_threshold()]. +#' Thresholding at each ahead takes place after any shifting from +#' `symmetrize`. #' #' @return an updated `frosting` postprocessor. Calling [predict()] will result #' in an additional `` named `.pred_distn_all` containing 2-column @@ -213,7 +223,7 @@ slather.layer_cdc_flatline_quantiles <- res <- dplyr::left_join(p, r, by = avail_grps) %>% dplyr::rowwise() %>% dplyr::mutate( - .pred_distn_all = propogate_samples( + .pred_distn_all = propagate_samples( .resid, .pred, object$quantile_levels, object$aheads, object$nsim, object$symmetrize, object$nonneg ) @@ -229,10 +239,14 @@ slather.layer_cdc_flatline_quantiles <- components } -propogate_samples <- function( +propagate_samples <- function( r, p, quantile_levels, aheads, nsim, symmetrize, nonneg) { max_ahead <- max(aheads) - samp <- quantile(r, probs = c(0, seq_len(nsim - 1)) / (nsim - 1), na.rm = TRUE) + if (symmetrize) { + r <- c(r, -r) + } + samp <- quantile(r, probs = c(0, seq_len(nsim - 1)) / (nsim - 1), + na.rm = TRUE, names = FALSE) res <- list() raw <- samp + p diff --git a/man/cdc_baseline_forecaster.Rd b/man/cdc_baseline_forecaster.Rd index 7e62a0521..122903649 100644 --- a/man/cdc_baseline_forecaster.Rd +++ b/man/cdc_baseline_forecaster.Rd @@ -11,7 +11,7 @@ cdc_baseline_forecaster( ) } \arguments{ -\item{epi_data}{An \link[epiprocess:epi_df]{epiprocess::epi_df}} +\item{epi_data}{An \code{\link[epiprocess:epi_df]{epiprocess::epi_df}}} \item{outcome}{A scalar character for the column name we wish to predict.} @@ -19,8 +19,8 @@ cdc_baseline_forecaster( \code{\link[=cdc_baseline_args_list]{cdc_baseline_args_list()}} constructor function.} } \value{ -A data frame of point and interval forecasts at for all -aheads (unique horizons) for each unique combination of \code{key_vars}. +A data frame of point and interval forecasts for all aheads (unique +horizons) for each unique combination of \code{key_vars}. } \description{ This is a simple forecasting model for @@ -41,7 +41,7 @@ library(dplyr) weekly_deaths <- case_death_rate_subset \%>\% select(geo_value, time_value, death_rate) \%>\% left_join(state_census \%>\% select(pop, abbr), by = c("geo_value" = "abbr")) \%>\% - mutate(deaths = pmax(death_rate / 1e5 * pop, 0)) \%>\% + mutate(deaths = pmax(death_rate / 1e5 * pop * 7, 0)) \%>\% select(-pop, -death_rate) \%>\% group_by(geo_value) \%>\% epi_slide(~ sum(.$deaths), before = 6, new_col_name = "deaths") \%>\% diff --git a/man/flusight_hub_formatter.Rd b/man/flusight_hub_formatter.Rd new file mode 100644 index 000000000..d8a4571f4 --- /dev/null +++ b/man/flusight_hub_formatter.Rd @@ -0,0 +1,60 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/flusight_hub_formatter.R +\name{flusight_hub_formatter} +\alias{flusight_hub_formatter} +\title{Format predictions for submission to FluSight forecast Hub} +\usage{ +flusight_hub_formatter(object, ..., .fcast_period = c("daily", "weekly")) +} +\arguments{ +\item{object}{a data.frame of predictions or an object of class +\code{canned_epipred} as created by, e.g., \code{\link[=arx_forecaster]{arx_forecaster()}}} + +\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}> Name = value pairs of constant +columns (or mutations) to perform to the results. See examples.} + +\item{.fcast_period}{Control whether the \code{horizon} should represent days or +weeks. Depending on whether the forecaster output has target dates +from \code{\link[=layer_add_target_date]{layer_add_target_date()}} or not, we may need to compute the horizon +and/or the \code{target_end_date} from the other available columns in the predictions. +When both \code{ahead} and \code{target_date} are available, this is ignored. If only +\code{ahead} or \code{aheads} exists, then the target date may need to be multiplied +if the \code{ahead} represents weekly forecasts. Alternatively, if only, the +\code{target_date} is available, then the \code{horizon} will be in days, unless +this argument is \code{"weekly"}. Note that these can be adjusted later by the +\code{...} argument.} +} +\value{ +A \link[tibble:tibble]{tibble::tibble}. If \code{...} is empty, the result will contain the +columns \code{reference_date}, \code{horizon}, \code{target_end_date}, \code{location}, +\code{output_type_id}, and \code{value}. The \code{...} can perform mutations on any of +these. +} +\description{ +This function converts predictions from any of the included forecasters into +a format (nearly) ready for submission to the 2023-24 +\href{https://github.com/cdcepi/FluSight-forecast-hub}{FluSight-forecast-hub}. +See there for documentation of the required columns. Currently, only +"quantile" forcasts are supported, but the intention is to support both +"quantile" and "pmf". For this reason, adding the \code{output_type} column should +be done via the \code{...} argument. See the examples below. The specific required +format for this forecast task is \href{https://github.com/cdcepi/FluSight-forecast-hub/blob/main/model-output/README.md}{here}. +} +\examples{ +library(dplyr) +weekly_deaths <- case_death_rate_subset \%>\% + select(geo_value, time_value, death_rate) \%>\% + left_join(state_census \%>\% select(pop, abbr), by = c("geo_value" = "abbr")) \%>\% + mutate(deaths = pmax(death_rate / 1e5 * pop * 7, 0)) \%>\% + select(-pop, -death_rate) \%>\% + group_by(geo_value) \%>\% + epi_slide(~ sum(.$deaths), before = 6, new_col_name = "deaths") \%>\% + ungroup() \%>\% + filter(weekdays(time_value) == "Saturday") + +cdc <- cdc_baseline_forecaster(weekly_deaths, "deaths") +flusight_hub_formatter(cdc) +flusight_hub_formatter(cdc, target = "wk inc covid deaths") +flusight_hub_formatter(cdc, target = paste(horizon, "wk inc covid deaths")) +flusight_hub_formatter(cdc, target = "wk inc covid deaths", output_type = "quantile") +} diff --git a/man/layer_cdc_flatline_quantiles.Rd b/man/layer_cdc_flatline_quantiles.Rd index 594c63afb..1340698d4 100644 --- a/man/layer_cdc_flatline_quantiles.Rd +++ b/man/layer_cdc_flatline_quantiles.Rd @@ -23,28 +23,37 @@ layer_cdc_flatline_quantiles( \item{aheads}{Numeric vector of desired forecast horizons. These should be given in the "units of the training data". So, for example, for data -typically observed daily (possibly with missing values), but -with weekly forecast targets, you would use \code{c(7, 14, 21, 28)}. But with -weekly data, you would use \code{1:4}.} +typically observed daily (possibly with missing values), but with weekly +forecast targets, you would use \code{c(7, 14, 21, 28)}. But with weekly data, +you would use \code{1:4}.} \item{quantile_levels}{Numeric vector of probabilities with values in (0,1) referring to the desired predictive intervals. The default is the standard set for the COVID Forecast Hub.} \item{nsims}{Positive integer. The number of draws from the empirical CDF. -These samples are spaced evenly on the (0, 1) scale, F_X(x) resulting -in linear interpolation on the X scale. This is achieved with +These samples are spaced evenly on the (0, 1) scale, F_X(x) resulting in +linear interpolation on the X scale. This is achieved with \code{\link[stats:quantile]{stats::quantile()}} Type 7 (the default for that function).} \item{by_key}{A character vector of keys to group the residuals by before calculating quantiles. The default, \code{c()} performs no grouping.} -\item{symmetrize}{logical. If \code{TRUE} then interval will be symmetric.} +\item{symmetrize}{Scalar logical. If \code{TRUE}, does two things: (i) forces the +"empirical" CDF of residuals to be symmetric by pretending that for every +actually-observed residual X we also observed another residual -X, and (ii) +at each ahead, forces the median simulated value to be equal to the point +prediction by adding or subtracting the same amount to every simulated +value. Adjustments in (ii) take place before propagating forward and +simulating the next ahead. This forces any 1-ahead predictive intervals to +be symmetric about the point prediction, and encourages larger aheads to be +more symmetric.} -\item{nonneg}{Logical. Force all predictive intervals be non-negative. -Because non-negativity is forced \emph{before} propagating forward, this -has slightly different behaviour than would occur if using -\code{\link[=layer_threshold]{layer_threshold()}}.} +\item{nonneg}{Scalar logical. Force all predictive intervals be non-negative. +Because non-negativity is forced \emph{before} propagating forward, this has +slightly different behaviour than would occur if using \code{\link[=layer_threshold]{layer_threshold()}}. +Thresholding at each ahead takes place after any shifting from +\code{symmetrize}.} \item{id}{a random id string} } diff --git a/tests/testthat/test-propogate_samples.R b/tests/testthat/test-propagate_samples.R similarity index 72% rename from tests/testthat/test-propogate_samples.R rename to tests/testthat/test-propagate_samples.R index b8a1ff82d..5278ab385 100644 --- a/tests/testthat/test-propogate_samples.R +++ b/tests/testthat/test-propagate_samples.R @@ -1,4 +1,4 @@ -test_that("propogate_samples", { +test_that("propagate_samples", { r <- -30:50 p <- 40 quantiles <- 1:9 / 10