diff --git a/.Rbuildignore b/.Rbuildignore index 5139bcabe..cb36bb9d2 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -12,3 +12,4 @@ ^musings$ ^data-raw$ ^vignettes/articles$ +^.git-blame-ignore-revs$ diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index c4bcd6b68..eff7367ec 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -4,9 +4,9 @@ # Created with usethis + edited to use API key. on: push: - branches: [main, master] + branches: [main, master, v0.0.6] pull_request: - branches: [main, master] + branches: [main, master, v0.0.6] name: R-CMD-check diff --git a/DESCRIPTION b/DESCRIPTION index 75602f072..eb6405df4 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -32,6 +32,7 @@ Imports: generics, glue, hardhat (>= 1.3.0), + lifecycle, magrittr, methods, quantreg, diff --git a/NAMESPACE b/NAMESPACE index b22ec53a5..a24e5844b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -52,6 +52,7 @@ S3method(print,alist) S3method(print,arx_class) S3method(print,arx_fcast) S3method(print,canned_epipred) +S3method(print,cdc_baseline_fcast) S3method(print,epi_workflow) S3method(print,flat_fcast) S3method(print,flatline) @@ -79,6 +80,7 @@ S3method(residuals,flatline) S3method(run_mold,default_epi_recipe_blueprint) S3method(slather,layer_add_forecast_date) S3method(slather,layer_add_target_date) +S3method(slather,layer_cdc_flatline_quantiles) S3method(slather,layer_naomit) S3method(slather,layer_point_from_distn) S3method(slather,layer_population_scaling) @@ -106,6 +108,8 @@ export(arx_classifier) export(arx_fcast_epi_workflow) export(arx_forecaster) export(bake) +export(cdc_baseline_args_list) +export(cdc_baseline_forecaster) export(create_layer) export(default_epi_recipe_blueprint) export(detect_layer) @@ -131,6 +135,7 @@ export(is_layer) export(layer) export(layer_add_forecast_date) export(layer_add_target_date) +export(layer_cdc_flatline_quantiles) export(layer_naomit) export(layer_point_from_distn) export(layer_population_scaling) @@ -143,7 +148,8 @@ export(layer_unnest) export(nested_quantiles) export(new_default_epi_recipe_blueprint) export(new_epi_recipe_blueprint) -export(pivot_quantiles) +export(pivot_quantiles_longer) +export(pivot_quantiles_wider) export(prep) export(quantile_reg) export(remove_frosting) @@ -167,6 +173,7 @@ importFrom(generics,augment) importFrom(generics,fit) importFrom(hardhat,refresh_blueprint) importFrom(hardhat,run_mold) +importFrom(lifecycle,deprecated) importFrom(magrittr,"%>%") importFrom(methods,is) importFrom(quantreg,rq) @@ -181,6 +188,7 @@ importFrom(rlang,caller_env) importFrom(rlang,is_empty) importFrom(rlang,is_null) importFrom(rlang,quos) +importFrom(smoothqr,smooth_qr) importFrom(stats,as.formula) importFrom(stats,family) importFrom(stats,lm) diff --git a/NEWS.md b/NEWS.md index fa99c8bcd..cba55a67d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -7,7 +7,8 @@ * canned forecasters get a class * fixed quantile bug in `flatline_forecaster()` * add functionality to output the unfit workflow from the canned forecasters -* add `pivot_quantiles()` for easier plotting +* add `pivot_quantiles_wider()` for easier plotting +* add complement `pivot_quantiles_longer()` # epipredict 0.0.4 diff --git a/R/cdc_baseline_forecaster.R b/R/cdc_baseline_forecaster.R new file mode 100644 index 000000000..62e5172cb --- /dev/null +++ b/R/cdc_baseline_forecaster.R @@ -0,0 +1,228 @@ +#' Predict the future with the most recent value +#' +#' This is a simple forecasting model for +#' [epiprocess::epi_df] data. 
It uses the most recent observation as the
+#' forecast for any future date, and produces intervals by shuffling the quantiles
+#' of the residuals of such a "flatline" forecast and incrementing these
+#' forward over all available training data.
+#'
+#' By default, the predictive intervals are computed separately for each
+#' combination of `geo_value` in the `epi_data` argument.
+#'
+#' This forecaster is meant to produce exactly the CDC Baseline used for
+#' [COVID19ForecastHub](https://covid19forecasthub.org).
+#'
+#' @param epi_data An [epiprocess::epi_df]
+#' @param outcome A scalar character for the column name we wish to predict.
+#' @param args_list A list of additional arguments as created by the
+#' [cdc_baseline_args_list()] constructor function.
+#'
+#' @return A data frame of point and interval forecasts for all
+#' aheads (unique horizons) for each unique combination of `key_vars`.
+#' @export
+#'
+#' @examples
+#' library(dplyr)
+#' weekly_deaths <- case_death_rate_subset %>%
+#' select(geo_value, time_value, death_rate) %>%
+#' left_join(state_census %>% select(pop, abbr), by = c("geo_value" = "abbr")) %>%
+#' mutate(deaths = pmax(death_rate / 1e5 * pop, 0)) %>%
+#' select(-pop, -death_rate) %>%
+#' group_by(geo_value) %>%
+#' epi_slide(~ sum(.$deaths), before = 6, new_col_name = "deaths") %>%
+#' ungroup() %>%
+#' filter(weekdays(time_value) == "Saturday")
+#'
+#' cdc <- cdc_baseline_forecaster(weekly_deaths, "deaths")
+#' preds <- pivot_quantiles_wider(cdc$predictions, .pred_distn)
+#'
+#' if (require(ggplot2)) {
+#' forecast_date <- unique(preds$forecast_date)
+#' four_states <- c("ca", "pa", "wa", "ny")
+#' preds %>%
+#' filter(geo_value %in% four_states) %>%
+#' ggplot(aes(target_date)) +
+#' geom_ribbon(aes(ymin = `0.1`, ymax = `0.9`), fill = blues9[3]) +
+#' geom_ribbon(aes(ymin = `0.25`, ymax = `0.75`), fill = blues9[6]) +
+#' geom_line(aes(y = .pred), color = "orange") +
+#' geom_line(
+#' data = weekly_deaths %>% filter(geo_value %in% four_states),
+#' aes(x = time_value, y = deaths)
+#' ) +
+#' scale_x_date(limits = c(forecast_date - 90, forecast_date + 30)) +
+#' labs(x = "Date", y = "Weekly deaths") +
+#' facet_wrap(~geo_value, scales = "free_y") +
+#' theme_bw() +
+#' geom_vline(xintercept = forecast_date)
+#' }
+cdc_baseline_forecaster <- function(
+ epi_data,
+ outcome,
+ args_list = cdc_baseline_args_list()) {
+ validate_forecaster_inputs(epi_data, outcome, "time_value")
+ if (!inherits(args_list, c("cdc_baseline_fcast", "alist"))) {
+ cli_stop("args_list was not created using `cdc_baseline_args_list()`.")
+ }
+ keys <- epi_keys(epi_data)
+ ek <- kill_time_value(keys)
+ outcome <- rlang::sym(outcome)
+
+
+ r <- epi_recipe(epi_data) %>%
+ step_epi_ahead(!!outcome, ahead = args_list$data_frequency, skip = TRUE) %>%
+ recipes::update_role(!!outcome, new_role = "predictor") %>%
+ recipes::add_role(tidyselect::all_of(keys), new_role = "predictor") %>%
+ step_training_window(n_recent = args_list$n_training)
+
+ forecast_date <- args_list$forecast_date %||% max(epi_data$time_value)
+ # target_date <- args_list$target_date %||% forecast_date + args_list$ahead
+
+
+ latest <- get_test_data(
+ epi_recipe(epi_data), epi_data, TRUE, args_list$nafill_buffer,
+ forecast_date
+ )
+
+ f <- frosting() %>%
+ layer_predict() %>%
+ layer_cdc_flatline_quantiles(
+ aheads = args_list$aheads,
+ quantile_levels = args_list$quantile_levels,
+ nsims = args_list$nsims,
+ by_key = args_list$quantile_by_key,
+ symmetrize = args_list$symmetrize,
+ nonneg = args_list$nonneg
+ ) %>%
layer_add_forecast_date(forecast_date = forecast_date) %>%
+ layer_unnest(.pred_distn_all)
+ # layer_add_target_date(target_date = target_date)
+ if (args_list$nonneg) f <- layer_threshold(f, ".pred")
+
+ eng <- parsnip::linear_reg() %>% parsnip::set_engine("flatline")
+
+ wf <- epi_workflow(r, eng, f)
+ wf <- generics::fit(wf, epi_data)
+ preds <- suppressWarnings(predict(wf, new_data = latest)) %>%
+ tibble::as_tibble() %>%
+ dplyr::select(-time_value) %>%
+ dplyr::mutate(target_date = forecast_date + ahead * args_list$data_frequency)
+
+ structure(
+ list(
+ predictions = preds,
+ epi_workflow = wf,
+ metadata = list(
+ training = attr(epi_data, "metadata"),
+ forecast_created = Sys.time()
+ )
+ ),
+ class = c("cdc_baseline_fcast", "canned_epipred")
+ )
+}
+
+
+
+#' CDC baseline forecaster argument constructor
+#'
+#' Constructs a list of arguments for [cdc_baseline_forecaster()].
+#'
+#' @inheritParams arx_args_list
+#' @param data_frequency Integer or string. This describes the frequency of the
+#' input `epi_df`. For typical FluSight forecasts, this would be `"1 week"`.
+#' Allowable arguments are integers (taken to mean numbers of days) or a
+#' string like `"7 days"` or `"2 weeks"`. Currently, all other periods
+#' (other than days or weeks) result in an error.
+#' @param aheads Integer vector. Unlike [arx_forecaster()], this doesn't have
+#' any effect on the predicted values.
+#' Predictions are always the most recent observation. This determines the
+#' set of prediction horizons for [layer_cdc_flatline_quantiles()]. It interacts
+#' with the `data_frequency` argument. So, for example, if the data is daily
+#' and you want forecasts for 1:4 days ahead, then you would use `1:4`. However,
+#' if you want one-week predictions, you would set this as `c(7, 14, 21, 28)`.
+#' But if `data_frequency` is `"1 week"`, then you would set it as `1:4`.
+#' @param quantile_levels Vector or `NULL`. A vector of probabilities to produce
+#' prediction intervals. These are created by computing the quantiles of
+#' training residuals. A `NULL` value will result in point forecasts only.
+#' @param nsims Positive integer. The number of draws from the empirical CDF.
+#' These samples are spaced evenly on the (0, 1) scale of F_X(x), resulting
+#' in linear interpolation on the X scale. This is achieved with
+#' [stats::quantile()] Type 7 (the default for that function).
+#' @param nonneg Logical. Force all predictive intervals to be non-negative.
+#' Because non-negativity is forced _before_ propagating forward, this
+#' has slightly different behaviour than would occur if using
+#' [layer_threshold()].
+#'
+#' @return A list containing updated parameter choices with class `cdc_baseline_fcast`.
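A quick reviewer's sketch of how `data_frequency` and `aheads` combine, using only functions this diff adds; the forecast date below is a made-up Saturday chosen for illustration:

args <- cdc_baseline_args_list(data_frequency = "1 week", aheads = 1:4)
args$data_frequency # 7L: parse_period() converts the period string to days
forecast_date <- as.Date("2021-12-04") # hypothetical Saturday forecast date
# target dates follow forecast_date + ahead * data_frequency, as in the
# forecaster body above:
forecast_date + args$aheads * args$data_frequency
# "2021-12-11" "2021-12-18" "2021-12-25" "2022-01-01"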
+#' @export
+#'
+#' @examples
+#' cdc_baseline_args_list()
+#' cdc_baseline_args_list(symmetrize = FALSE)
+#' cdc_baseline_args_list(quantile_levels = c(.1, .3, .7, .9), n_training = 120)
+cdc_baseline_args_list <- function(
+ data_frequency = "1 week",
+ aheads = 1:4,
+ n_training = Inf,
+ forecast_date = NULL,
+ quantile_levels = c(.01, .025, 1:19 / 20, .975, .99),
+ nsims = 1e3L,
+ symmetrize = TRUE,
+ nonneg = TRUE,
+ quantile_by_key = "geo_value",
+ nafill_buffer = Inf) {
+ arg_is_scalar(n_training, nsims, data_frequency)
+ data_frequency <- parse_period(data_frequency)
+ arg_is_pos_int(data_frequency)
+ arg_is_chr(quantile_by_key, allow_empty = TRUE)
+ arg_is_scalar(forecast_date, allow_null = TRUE)
+ arg_is_date(forecast_date, allow_null = TRUE)
+ arg_is_nonneg_int(aheads, nsims)
+ arg_is_lgl(symmetrize, nonneg)
+ arg_is_probabilities(quantile_levels, allow_null = TRUE)
+ arg_is_pos(n_training)
+ if (is.finite(n_training)) arg_is_pos_int(n_training)
+ if (is.finite(nafill_buffer)) arg_is_pos_int(nafill_buffer, allow_null = TRUE)
+
+ structure(
+ enlist(
+ data_frequency,
+ aheads,
+ n_training,
+ forecast_date,
+ quantile_levels,
+ nsims,
+ symmetrize,
+ nonneg,
+ quantile_by_key,
+ nafill_buffer
+ ),
+ class = c("cdc_baseline_fcast", "alist")
+ )
+}
+
+#' @export
+print.cdc_baseline_fcast <- function(x, ...) {
+ name <- "CDC Baseline"
+ NextMethod(name = name, ...)
+}
+
+parse_period <- function(x) {
+ arg_is_scalar(x)
+ if (is.character(x)) {
+ x <- unlist(strsplit(x, " "))
+ if (length(x) == 1L) x <- as.numeric(x)
+ if (length(x) == 2L) {
+ mult <- substr(x[2], 1, 3)
+ mult <- switch(
+ mult,
+ day = 1L,
+ wee = 7L,
+ cli::cli_abort("incompatible timespan in `data_frequency`.")
+ )
+ x <- as.numeric(x[1]) * mult
+ }
+ if (length(x) > 2L) cli::cli_abort("incompatible timespan in `data_frequency`.")
+ }
+ stopifnot(rlang::is_integerish(x))
+ as.integer(x)
+}
diff --git a/R/compat-purrr.R b/R/compat-purrr.R
index 7e28bd630..712926f73 100644
--- a/R/compat-purrr.R
+++ b/R/compat-purrr.R
@@ -32,6 +32,11 @@ map_chr <- function(.x, .f, ...) {
.rlang_purrr_map_mold(.x, .f, character(1), ...)
}
+map_vec <- function(.x, .f, ...) {
+ out <- map(.x, .f, ...)
+ vctrs::list_unchop(out)
+}
+
map_dfr <- function(.x, .f, ..., .id = NULL) {
.f <- rlang::as_function(.f, env = rlang::global_env())
res <- map(.x, .f, ...)
diff --git a/R/dist_quantiles.R b/R/dist_quantiles.R
index 032a4d96c..ff14d6733 100644
--- a/R/dist_quantiles.R
+++ b/R/dist_quantiles.R
@@ -116,93 +116,6 @@ is_dist_quantiles <- function(x) {
}
-
-#' Turn a vector of quantile distributions into a list-col
-#'
-#' @param x a `distribution` containing `dist_quantiles`
-#'
-#' @return a list-col
-#' @export
-#'
-#' @examples
-#' edf <- case_death_rate_subset[1:3, ]
-#' edf$q <- dist_quantiles(list(1:5, 2:4, 3:10), list(1:5 / 6, 2:4 / 5, 3:10 / 11))
-#'
-#' edf_nested <- edf %>% dplyr::mutate(q = nested_quantiles(q))
-#' edf_nested %>% tidyr::unnest(q)
-nested_quantiles <- function(x) {
- stopifnot(is_dist_quantiles(x))
- distributional:::dist_apply(x, .f = function(z) {
- tibble::as_tibble(vec_data(z)) %>%
- dplyr::mutate(dplyr::across(tidyselect::everything(), as.double)) %>%
- list_of()
- })
-}
-
-
-#' Pivot columns containing `dist_quantile` wider
-#'
-#' Any selected columns that contain `dist_quantiles` will be "widened" with
-#' the "taus" (quantile) serving as names and the values in the data frame.
-#' When pivoting multiple columns, the original column name will be used as
-#' a prefix.
-#' -#' @param .data A data frame, or a data frame extension such as a tibble or -#' epi_df. -#' @param ... <[`tidy-select`][dplyr::dplyr_tidy_select]> One or more unquoted -#' expressions separated by commas. Variable names can be used as if they -#' were positions in the data frame, so expressions like `x:y` can -#' be used to select a range of variables. Any selected columns should -#' -#' @return An object of the same class as `.data` -#' @export -#' -#' @examples -#' d1 <- c(dist_quantiles(1:3, 1:3 / 4), dist_quantiles(2:4, 1:3 / 4)) -#' d2 <- c(dist_quantiles(2:4, 2:4 / 5), dist_quantiles(3:5, 2:4 / 5)) -#' tib <- tibble::tibble(g = c("a", "b"), d1 = d1, d2 = d2) -#' -#' pivot_quantiles(tib, c("d1", "d2")) -#' pivot_quantiles(tib, tidyselect::starts_with("d")) -#' pivot_quantiles(tib, d2) -pivot_quantiles <- function(.data, ...) { - expr <- rlang::expr(c(...)) - cols <- names(tidyselect::eval_select(expr, .data)) - dqs <- map_lgl(cols, ~ is_dist_quantiles(.data[[.x]])) - if (!all(dqs)) { - nms <- cols[!dqs] - cli::cli_abort( - "Variables(s) {.var {nms}} are not `dist_quantiles`. Cannot pivot them." - ) - } - .data <- .data %>% - dplyr::mutate(dplyr::across(tidyselect::all_of(cols), nested_quantiles)) - checks <- map_lgl(cols, ~ diff(range(vctrs::list_sizes(.data[[.x]]))) == 0L) - if (!all(checks)) { - nms <- cols[!checks] - cli::cli_abort( - c("Quantiles must be the same length and have the same set of taus.", - i = "Check failed for variables(s) {.var {nms}}." - ) - ) - } - if (length(cols) > 1L) { - for (col in cols) { - .data <- .data %>% - tidyr::unnest(tidyselect::all_of(col)) %>% - tidyr::pivot_wider( - names_from = "tau", values_from = "q", - names_prefix = paste0(col, "_") - ) - } - } else { - .data <- .data %>% - tidyr::unnest(tidyselect::all_of(cols)) %>% - tidyr::pivot_wider(names_from = "tau", values_from = "q") - } - .data -} - - #' @export diff --git a/R/epipredict-package.R b/R/epipredict-package.R index 51478065b..da4991feb 100644 --- a/R/epipredict-package.R +++ b/R/epipredict-package.R @@ -1,5 +1,8 @@ +## usethis namespace: start #' @importFrom tibble tibble #' @importFrom rlang := !! #' @importFrom stats poly predict lm residuals quantile +#' @importFrom lifecycle deprecated #' @import epiprocess parsnip +## usethis namespace: end NULL diff --git a/R/flatline_forecaster.R b/R/flatline_forecaster.R index e437f50ea..197c8cca5 100644 --- a/R/flatline_forecaster.R +++ b/R/flatline_forecaster.R @@ -94,6 +94,12 @@ flatline_forecaster <- function( #' Constructs a list of arguments for [flatline_forecaster()]. #' #' @inheritParams arx_args_list +#' @param ahead Integer. Unlike [arx_forecaster()], this doesn't have any effect +#' on the predicted values. Predictions are always the most recent observation. +#' However, this _does_ impact the residuals stored in the object. Residuals +#' are calculated based on this number to mimic how badly you would have done. +#' So for example, `ahead = 7` will create residuals by comparing values +#' 7 days apart. #' #' @return A list containing updated parameter choices with class `flatline_alist`. #' @export diff --git a/R/layer_cdc_flatline_quantiles.R b/R/layer_cdc_flatline_quantiles.R new file mode 100644 index 000000000..bd2af0bf6 --- /dev/null +++ b/R/layer_cdc_flatline_quantiles.R @@ -0,0 +1,266 @@ +#' CDC Flatline Forecast Quantiles +#' +#' This layer creates quantile forecasts by taking a sample from the +#' interpolated CDF of the flatline residuals, and shuffling them. +#' These are then added on to the point prediction. 
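A minimal standalone sketch of the mechanism just described, in base R; the residuals, point forecast, and draw count are invented stand-ins, not package internals:

set.seed(1)
resids <- rnorm(100) # stand-in for flatline residuals
point <- 10 # stand-in for the point prediction
# evenly spaced draws from the empirical CDF (stats::quantile() type 7):
samp <- quantile(resids, probs = 0:999 / 999, na.rm = TRUE)
h1 <- point + samp # horizon 1
h2 <- h1 + sample(samp) # horizon 2: shuffle the draws, add them on again
quantile(h1, c(.1, .9)) # narrower
quantile(h2, c(.1, .9)) # wider: uncertainty grows with the horizon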
+#'
+#' @details
+#' This layer is intended to be used in concert with [flatline()]. But it can
+#' also be used with anything else. As long as residuals are available in the
+#' fitted model, this layer could be useful. Like
+#' [layer_residual_quantiles()], it only uses the residuals from the fitted model
+#' object. However, it propagates these forward for *all* aheads, by
+#' iteratively shuffling them (randomly), and then adding them to the previous
+#' set. This is in contrast to what happens with the [flatline_forecaster()].
+#' When using [flatline()] as the underlying engine (here), both will result in the
+#' same predictions (the most recent observed value), but that model calculates
+#' separate residuals for each `ahead` by comparing to observations further into
+#' the future. This version continues to use the same set of residuals, and
+#' adds them on to produce wider intervals as `ahead` increases.
+#'
+#'
+#' @inheritParams layer_residual_quantiles
+#' @param aheads Numeric vector of desired forecast horizons. These should be
+#' given in the "units of the training data". So, for example, for data
+#' typically observed daily (possibly with missing values), but
+#' with weekly forecast targets, you would use `c(7, 14, 21, 28)`. But with
+#' weekly data, you would use `1:4`.
+#' @param quantile_levels Numeric vector of probabilities with values in (0,1)
+#' referring to the desired predictive intervals. The default is the standard
+#' set for the COVID Forecast Hub.
+#' @param nsims Positive integer. The number of draws from the empirical CDF.
+#' These samples are spaced evenly on the (0, 1) scale of F_X(x), resulting
+#' in linear interpolation on the X scale. This is achieved with
+#' [stats::quantile()] Type 7 (the default for that function).
+#' @param nonneg Logical. Force all predictive intervals to be non-negative.
+#' Because non-negativity is forced _before_ propagating forward, this
+#' has slightly different behaviour than would occur if using
+#' [layer_threshold()].
+#'
+#' @return an updated `frosting` postprocessor. Calling [predict()] will result
+#' in an additional `<list-col>` named `.pred_distn_all` containing 2-column
+#' [tibble::tibble()]'s. For each
+#' desired combination of `key`'s, the tibble will contain one row per ahead
+#' with the associated [dist_quantiles()].
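To make that return shape concrete, here is a hand-built stand-in; the quoted placeholders mark where real `dist_quantiles` vectors would sit:

library(tibble)
tibble(
  geo_value = c("ca", "ny"),
  .pred = c(10, 20),
  .pred_distn_all = list(
    tibble(ahead = c(7, 14), .pred_distn = list("<dist_quantiles>", "<dist_quantiles>")),
    tibble(ahead = c(7, 14), .pred_distn = list("<dist_quantiles>", "<dist_quantiles>"))
  )
)
# tidyr::unnest(.pred_distn_all) then yields one row per (geo_value, ahead).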
+#' @export +#' +#' @examples +#' r <- epi_recipe(case_death_rate_subset) %>% +#' # data is "daily", so we fit this to 1 ahead, the result will contain +#' # 1 day ahead residuals +#' step_epi_ahead(death_rate, ahead = 1L, skip = TRUE) %>% +#' recipes::update_role(death_rate, new_role = "predictor") %>% +#' recipes::add_role(time_value, geo_value, new_role = "predictor") +#' +#' forecast_date <- max(case_death_rate_subset$time_value) +#' +#' latest <- get_test_data( +#' epi_recipe(case_death_rate_subset), case_death_rate_subset +#' ) +#' +#' f <- frosting() %>% +#' layer_predict() %>% +#' layer_cdc_flatline_quantiles(aheads = c(7, 14, 21, 28), symmetrize = TRUE) +#' +#' eng <- parsnip::linear_reg() %>% parsnip::set_engine("flatline") +#' +#' wf <- epi_workflow(r, eng, f) %>% fit(case_death_rate_subset) +#' preds <- suppressWarnings(predict(wf, new_data = latest)) %>% +#' dplyr::select(-time_value) %>% +#' dplyr::mutate(forecast_date = forecast_date) +#' preds +#' +#' preds <- preds %>% +#' unnest(.pred_distn_all) %>% +#' pivot_quantiles_wider(.pred_distn) %>% +#' mutate(target_date = forecast_date + ahead) +#' +#' if (require("ggplot2")) { +#' four_states <- c("ca", "pa", "wa", "ny") +#' preds %>% +#' filter(geo_value %in% four_states) %>% +#' ggplot(aes(target_date)) + +#' geom_ribbon(aes(ymin = `0.1`, ymax = `0.9`), fill = blues9[3]) + +#' geom_ribbon(aes(ymin = `0.25`, ymax = `0.75`), fill = blues9[6]) + +#' geom_line(aes(y = .pred), color = "orange") + +#' geom_line( +#' data = case_death_rate_subset %>% filter(geo_value %in% four_states), +#' aes(x = time_value, y = death_rate) +#' ) + +#' scale_x_date(limits = c(forecast_date - 90, forecast_date + 30)) + +#' labs(x = "Date", y = "Death rate") + +#' facet_wrap(~geo_value, scales = "free_y") + +#' theme_bw() + +#' geom_vline(xintercept = forecast_date) +#' } +layer_cdc_flatline_quantiles <- function( + frosting, + ..., + aheads = 1:4, + quantile_levels = c(.01, .025, 1:19 / 20, .975, .99), + nsims = 1e3, + by_key = "geo_value", + symmetrize = FALSE, + nonneg = TRUE, + id = rand_id("cdc_baseline_bands")) { + rlang::check_dots_empty() + + arg_is_int(aheads) + arg_is_probabilities(quantile_levels, allow_null = TRUE) + arg_is_pos_int(nsims) + arg_is_scalar(nsims) + arg_is_chr_scalar(id) + arg_is_lgl_scalar(symmetrize, nonneg) + arg_is_chr(by_key, allow_null = TRUE, allow_na = TRUE, allow_empty = TRUE) + + add_layer( + frosting, + layer_cdc_flatline_quantiles_new( + aheads = aheads, + quantile_levels = quantile_levels, + nsims = nsims, + by_key = by_key, + symmetrize = symmetrize, + nonneg = nonneg, + id = id + ) + ) +} + +layer_cdc_flatline_quantiles_new <- function( + aheads, + quantile_levels, + nsims, + by_key, + symmetrize, + nonneg, + id) { + layer( + "cdc_flatline_quantiles", + aheads = aheads, + quantile_levels = quantile_levels, + nsims = nsims, + by_key = by_key, + symmetrize = symmetrize, + nonneg = nonneg, + id = id + ) +} + +#' @export +slather.layer_cdc_flatline_quantiles <- + function(object, components, workflow, new_data, ...) { + if (is.null(object$quantile_levels)) return(components) + the_fit <- workflows::extract_fit_parsnip(workflow) + if (!inherits(the_fit, "_flatline")) { + cli::cli_warn( + c( + "Predictions for this workflow were not produced by the {.cls flatline}", + "{.pkg parsnip} engine. Results may be unexpected. See {.fn epipredict::flatline}." 
+ )
+ )
+ }
+ p <- components$predictions
+ ek <- kill_time_value(epi_keys_mold(components$mold))
+ r <- grab_residuals(the_fit, components)
+
+ avail_grps <- character(0L)
+ if (length(object$by_key) > 0L) {
+ cols_in_preds <- hardhat::check_column_names(p, object$by_key)
+ if (!cols_in_preds$ok) {
+ cli::cli_warn(c(
+ "Predicted values are missing key columns: {.val {cols_in_preds$missing_names}}.",
+ "Ignoring these."
+ ))
+ }
+ if (inherits(the_fit, "_flatline")) {
+ cols_in_resids <- hardhat::check_column_names(r, object$by_key)
+ if (!cols_in_resids$ok) {
+ cli::cli_warn(c(
+ "Existing residuals are missing key columns: {.val {cols_in_resids$missing_names}}.",
+ "Ignoring these."
+ ))
+ }
+ # use only the keys that are in the predictions and requested.
+ avail_grps <- intersect(ek, setdiff(
+ object$by_key,
+ c(cols_in_preds$missing_names, cols_in_resids$missing_names)
+ ))
+ } else { # not flatline, but we'll try
+ key_cols <- dplyr::bind_cols(
+ geo_value = components$mold$extras$roles$geo_value,
+ components$mold$extras$roles$key
+ )
+ cols_in_resids <- hardhat::check_column_names(key_cols, object$by_key)
+ if (!cols_in_resids$ok) {
+ cli::cli_warn(c(
+ "Requested residuals are missing key columns: {.val {cols_in_resids$missing_names}}.",
+ "Ignoring these."
+ ))
+ }
+ avail_grps <- intersect(names(key_cols), setdiff(
+ object$by_key,
+ c(cols_in_preds$missing_names, cols_in_resids$missing_names)
+ ))
+ r <- dplyr::bind_cols(key_cols, r)
+ }
+ }
+ r <- r %>%
+ dplyr::select(tidyselect::all_of(c(avail_grps, ".resid"))) %>%
+ dplyr::group_by(!!!rlang::syms(avail_grps)) %>%
+ dplyr::summarise(.resid = list(.resid), .groups = "drop")
+
+ res <- dplyr::left_join(p, r, by = avail_grps) %>%
+ dplyr::rowwise() %>%
+ dplyr::mutate(
+ .pred_distn_all = propagate_samples(
+ .resid, .pred, object$quantile_levels,
+ object$aheads, object$nsims, object$symmetrize, object$nonneg
+ )
+ ) %>%
+ dplyr::select(tidyselect::all_of(c(avail_grps, ".pred_distn_all")))
+
+ # res <- check_pname(res, components$predictions, object)
+ components$predictions <- dplyr::left_join(
+ components$predictions,
+ res,
+ by = avail_grps
+ )
+ components
+ }
+
+propagate_samples <- function(
+ r, p, quantile_levels, aheads, nsim, symmetrize, nonneg) {
+ max_ahead <- max(aheads)
+ samp <- quantile(r, probs = c(0, seq_len(nsim - 1)) / (nsim - 1), na.rm = TRUE)
+ res <- list()
+
+ raw <- samp + p
+ if (nonneg) raw <- pmax(0, raw)
+ res[[1]] <- raw
+ if (max_ahead > 1L) {
+ for (iter in 2:max_ahead) {
+ samp <- shuffle(samp)
+ raw <- raw + samp
+ if (symmetrize) {
+ symmetric <- raw - (median(raw) - p)
+ } else {
+ symmetric <- raw
+ }
+ if (nonneg) symmetric <- pmax(0, symmetric)
+ res[[iter]] <- symmetric
+ }
+ }
+ res <- res[aheads]
+ list(tibble::tibble(
+ ahead = aheads,
+ .pred_distn = map_vec(
+ res, ~ dist_quantiles(quantile(.x, quantile_levels), quantile_levels)
+ )
+ ))
+}
+
+shuffle <- function(x) {
+ stopifnot(is.vector(x))
+ sample(x, length(x), replace = FALSE)
+}
diff --git a/R/layer_residual_quantiles.R b/R/layer_residual_quantiles.R
index a9a8cab24..b9a71e265 100644
--- a/R/layer_residual_quantiles.R
+++ b/R/layer_residual_quantiles.R
@@ -141,8 +141,8 @@ grab_residuals <- function(the_fit, components) {
if (".resid" %in% names(r)) { # success
return(r)
} else { # failure
- rlang::warn(c(
- "The `residuals()` method for objects of class {cl} results in",
+ cli::cli_warn(c(
+ "The `residuals()` method for {.cls {cl}} objects results in",
"a data frame without a column named `.resid`.",
i = "Residual quantiles
will be calculated directly from the",
i = "difference between predictions and observations.",
@@ -152,8 +152,8 @@
} else if (is.vector(drop(r))) { # also success
return(tibble(.resid = drop(r)))
} else { # failure
- rlang::warn(c(
- "The `residuals()` method for objects of class {cl} results in an",
+ cli::cli_warn(c(
+ "The `residuals()` method for {.cls {cl}} objects results in an",
"object that is neither a data frame with a column named `.resid`,",
"nor something coercible to a vector.",
i = "Residual quantiles will be calculated directly from the",
diff --git a/R/make_smooth_quantile_reg.R b/R/make_smooth_quantile_reg.R
index 6eab2a132..7d25a8c6b 100644
--- a/R/make_smooth_quantile_reg.R
+++ b/R/make_smooth_quantile_reg.R
@@ -21,7 +21,7 @@
#'
#' @seealso [fit.model_spec()], [set_engine()]
#'
-#' @importFrom quantreg rq
+#' @importFrom smoothqr smooth_qr
#' @examples
#' tib <- data.frame(
#' y1 = rnorm(100), y2 = rnorm(100), y3 = rnorm(100),
@@ -61,7 +61,8 @@
#' lines(pl$x, pl$`0.2`, col = "blue")
#' lines(pl$x, pl$`0.8`, col = "blue")
#' lines(pl$x, pl$`0.5`, col = "red")
-#' \dontrun{
+#'
+#' if (require("ggplot2")) {
#' ggplot(data.frame(x = x, y = y), aes(x)) +
#' geom_ribbon(data = pl, aes(ymin = `0.2`, ymax = `0.8`), fill = "lightblue") +
#' geom_point(aes(y = y), colour = "grey") + # observed data
diff --git a/R/pivot_quantiles.R b/R/pivot_quantiles.R
new file mode 100644
index 000000000..a156bcf90
--- /dev/null
+++ b/R/pivot_quantiles.R
@@ -0,0 +1,160 @@
+#' Turn a vector of quantile distributions into a list-col
+#'
+#' @param x a `distribution` containing `dist_quantiles`
+#'
+#' @return a list-col
+#' @export
+#'
+#' @examples
+#' edf <- case_death_rate_subset[1:3, ]
+#' edf$q <- dist_quantiles(list(1:5, 2:4, 3:10), list(1:5 / 6, 2:4 / 5, 3:10 / 11))
+#'
+#' edf_nested <- edf %>% dplyr::mutate(q = nested_quantiles(q))
+#' edf_nested %>% tidyr::unnest(q)
+nested_quantiles <- function(x) {
+ stopifnot(is_dist_quantiles(x))
+ distributional:::dist_apply(x, .f = function(z) {
+ tibble::as_tibble(vec_data(z)) %>%
+ dplyr::mutate(dplyr::across(tidyselect::everything(), as.double)) %>%
+ list_of()
+ })
+}
+
+
+#' Pivot columns containing `dist_quantile` longer
+#'
+#' Selected columns that contain `dist_quantiles` will be "lengthened" with
+#' the quantile levels serving as one column and the values as another. If
+#' multiple columns are selected, these will be prefixed with the column name.
+#'
+#' @param .data A data frame, or a data frame extension such as a tibble or
+#' epi_df.
+#' @param ... <[`tidy-select`][dplyr::dplyr_tidy_select]> One or more unquoted
+#' expressions separated by commas. Variable names can be used as if they
+#' were positions in the data frame, so expressions like `x:y` can
+#' be used to select a range of variables.
+#' @param .ignore_length_check If multiple columns are selected, as long as
+#' each row contains the same number of quantiles, the result will be
+#' reasonable. But if, for example, `var1[1]` has 5 quantiles while `var2[1]`
+#' has 7, then the only option would be to recycle everything, creating a
+#' _very_ long result. By default, this would throw an error. But if this is
+#' really the goal, then the error can be bypassed by setting this argument
+#' to `TRUE`. The quantiles in the first selected column will vary the fastest.
+#'
+#' @return An object of the same class as `.data`.
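Concretely, the long format looks roughly like this (toy data mirroring the examples below; the printed shape is an assumption based on the `q`/`tau` columns produced by `nested_quantiles()`):

d1 <- c(dist_quantiles(1:3, 1:3 / 4), dist_quantiles(2:4, 1:3 / 4))
tib <- tibble::tibble(g = c("a", "b"), d1 = d1)
pivot_quantiles_longer(tib, d1)
# A tibble of 6 rows (one per quantile per original row), with columns
# g, q (values), and tau (quantile levels).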
+#' @export
+#'
+#' @examples
+#' d1 <- c(dist_quantiles(1:3, 1:3 / 4), dist_quantiles(2:4, 1:3 / 4))
+#' d2 <- c(dist_quantiles(2:4, 2:4 / 5), dist_quantiles(3:5, 2:4 / 5))
+#' tib <- tibble::tibble(g = c("a", "b"), d1 = d1, d2 = d2)
+#'
+#' pivot_quantiles_longer(tib, "d1")
+#' pivot_quantiles_longer(tib, tidyselect::ends_with("1"))
+#' pivot_quantiles_longer(tib, d1, d2)
+pivot_quantiles_longer <- function(.data, ..., .ignore_length_check = FALSE) {
+ cols <- validate_pivot_quantiles(.data, ...)
+ .data <- .data %>%
+ dplyr::mutate(dplyr::across(tidyselect::all_of(cols), nested_quantiles))
+ if (length(cols) > 1L) {
+ lengths_check <- .data %>%
+ dplyr::transmute(dplyr::across(
+ tidyselect::all_of(cols),
+ ~ map_int(.x, vctrs::vec_size)
+ )) %>%
+ as.matrix() %>%
+ apply(1, function(x) dplyr::n_distinct(x) == 1L) %>%
+ all()
+ if (lengths_check) {
+ .data <- tidyr::unnest(.data, tidyselect::all_of(cols), names_sep = "_")
+ } else {
+ if (.ignore_length_check) {
+ for (col in cols) {
+ .data <- .data %>%
+ tidyr::unnest(tidyselect::all_of(col), names_sep = "_")
+ }
+ } else {
+ cli::cli_abort(c(
+ "Some selected columns contain different numbers of quantiles.",
+ "The result would be a {.emph very} long {.cls tibble}.",
+ "To do this anyway, rerun with `.ignore_length_check = TRUE`."
+ ))
+ }
+ }
+ } else {
+ .data <- .data %>% tidyr::unnest(tidyselect::all_of(cols))
+ }
+ .data
+}
+
+#' Pivot columns containing `dist_quantile` wider
+#'
+#' Any selected columns that contain `dist_quantiles` will be "widened" with
+#' the "taus" (quantile levels) serving as names and the values as entries
+#' in the data frame.
+#' When pivoting multiple columns, the original column name will be used as
+#' a prefix.
+#'
+#' @param .data A data frame, or a data frame extension such as a tibble or
+#' epi_df.
+#' @param ... <[`tidy-select`][dplyr::dplyr_tidy_select]> One or more unquoted
+#' expressions separated by commas. Variable names can be used as if they
+#' were positions in the data frame, so expressions like `x:y` can
+#' be used to select a range of variables.
+#'
+#' @return An object of the same class as `.data`
+#' @export
+#'
+#' @examples
+#' d1 <- c(dist_quantiles(1:3, 1:3 / 4), dist_quantiles(2:4, 1:3 / 4))
+#' d2 <- c(dist_quantiles(2:4, 2:4 / 5), dist_quantiles(3:5, 2:4 / 5))
+#' tib <- tibble::tibble(g = c("a", "b"), d1 = d1, d2 = d2)
+#'
+#' pivot_quantiles_wider(tib, c("d1", "d2"))
+#' pivot_quantiles_wider(tib, tidyselect::starts_with("d"))
+#' pivot_quantiles_wider(tib, d2)
+pivot_quantiles_wider <- function(.data, ...) {
+ cols <- validate_pivot_quantiles(.data, ...)
+ .data <- .data %>%
+ dplyr::mutate(dplyr::across(tidyselect::all_of(cols), nested_quantiles))
+ checks <- map_lgl(cols, ~ diff(range(vctrs::list_sizes(.data[[.x]]))) == 0L)
+ if (!all(checks)) {
+ nms <- cols[!checks]
+ cli::cli_abort(
+ c("Quantiles must be the same length and have the same set of taus.",
+ i = "Check failed for variable(s) {.var {nms}}."
+ )
+ )
+ }
+ if (length(cols) > 1L) {
+ for (col in cols) {
+ .data <- .data %>%
+ tidyr::unnest(tidyselect::all_of(col)) %>%
+ tidyr::pivot_wider(
+ names_from = "tau", values_from = "q",
+ names_prefix = paste0(col, "_")
+ )
+ }
+ } else {
+ .data <- .data %>%
+ tidyr::unnest(tidyselect::all_of(cols)) %>%
+ tidyr::pivot_wider(names_from = "tau", values_from = "q")
+ }
+ .data
+}
+
+pivot_quantiles <- function(.data, ...) {
+ lifecycle::deprecate_stop("0.0.6", "pivot_quantiles()", "pivot_quantiles_wider()")
+}
+
+validate_pivot_quantiles <- function(.data, ...)
{
+ expr <- rlang::expr(c(...))
+ cols <- names(tidyselect::eval_select(expr, .data))
+ dqs <- map_lgl(cols, ~ is_dist_quantiles(.data[[.x]]))
+ if (!all(dqs)) {
+ nms <- cols[!dqs]
+ cli::cli_abort(
+ "Variable(s) {.var {nms}} are not `dist_quantiles`. Cannot pivot them."
+ )
+ }
+ cols
+}
diff --git a/R/step_growth_rate.R b/R/step_growth_rate.R
index f6ad29a5b..74cfff284 100644
--- a/R/step_growth_rate.R
+++ b/R/step_growth_rate.R
@@ -42,20 +42,19 @@
#' recipes::prep() %>%
#' recipes::bake(case_death_rate_subset)
step_growth_rate <-
- function(
- recipe,
- ...,
- role = "predictor",
- trained = FALSE,
- horizon = 7,
- method = c("rel_change", "linear_reg", "smooth_spline", "trend_filter"),
- log_scale = FALSE,
- replace_Inf = NA,
- prefix = "gr_",
- columns = NULL,
- skip = FALSE,
- id = rand_id("growth_rate"),
- additional_gr_args_list = list()) {
+ function(recipe,
+ ...,
+ role = "predictor",
+ trained = FALSE,
+ horizon = 7,
+ method = c("rel_change", "linear_reg", "smooth_spline", "trend_filter"),
+ log_scale = FALSE,
+ replace_Inf = NA,
+ prefix = "gr_",
+ columns = NULL,
+ skip = FALSE,
+ id = rand_id("growth_rate"),
+ additional_gr_args_list = list()) {
if (!is_epi_recipe(recipe)) {
rlang::abort("This recipe step can only operate on an `epi_recipe`.")
}
diff --git a/R/step_lag_difference.R b/R/step_lag_difference.R
index 2482be46a..21878eaa7 100644
--- a/R/step_lag_difference.R
+++ b/R/step_lag_difference.R
@@ -23,16 +23,15 @@
#' recipes::prep() %>%
#' recipes::bake(case_death_rate_subset)
step_lag_difference <-
- function(
- recipe,
- ...,
- role = "predictor",
- trained = FALSE,
- horizon = 7,
- prefix = "lag_diff_",
- columns = NULL,
- skip = FALSE,
- id = rand_id("lag_diff")) {
+ function(recipe,
+ ...,
+ role = "predictor",
+ trained = FALSE,
+ horizon = 7,
+ prefix = "lag_diff_",
+ columns = NULL,
+ skip = FALSE,
+ id = rand_id("lag_diff")) {
if (!is_epi_recipe(recipe)) {
rlang::abort("This recipe step can only operate on an `epi_recipe`.")
}
diff --git a/R/utils-enframer.R b/R/utils-enframer.R
index 387d04356..0a8152906 100644
--- a/R/utils-enframer.R
+++ b/R/utils-enframer.R
@@ -13,10 +13,11 @@ enframer <- function(df, x, fill = NA) {
}
enlist <- function(...) {
- # in epiprocess
- x <- list(...)
- n <- as.character(sys.call())[-1]
- if (!is.null(n0 <- names(x))) n[n0 != ""] <- n0[n0 != ""]
- names(x) <- n
- x
+ # converted to a thin wrapper around rlang::dots_list()
+ rlang::dots_list(
+ ...,
+ .homonyms = "error",
+ .named = TRUE,
+ .check_assign = TRUE
+ )
}
diff --git a/_pkgdown.yml b/_pkgdown.yml
index 2ad03c277..ac66b8208 100644
--- a/_pkgdown.yml
+++ b/_pkgdown.yml
@@ -34,6 +34,7 @@ reference:
contents:
- contains("flatline")
- contains("arx")
+ - contains("cdc")
- title: Parsnip engines
desc: Prediction methods not available elsewhere
contents:
@@ -67,7 +68,7 @@ reference:
- dist_quantiles
- extrapolate_quantiles
- nested_quantiles
- - pivot_quantiles
+ - starts_with("pivot_quantiles")
- title: Included datasets
contents:
- case_death_rate_subset
diff --git a/man/add_frosting.Rd b/man/add_frosting.Rd
index d7d217777..4d77572a1 100644
--- a/man/add_frosting.Rd
+++ b/man/add_frosting.Rd
@@ -35,7 +35,9 @@ latest <- jhu \%>\%
dplyr::filter(time_value >= max(time_value) - 14)
# Add frosting to a workflow and predict
-f <- frosting() \%>\% layer_predict() \%>\% layer_naomit(.pred)
+f <- frosting() \%>\%
+ layer_predict() \%>\%
+ layer_naomit(.pred)
wf1 <- wf \%>\% add_frosting(f)
p1 <- predict(wf1, latest)
p1
diff --git a/man/arx_fcast_epi_workflow.Rd b/man/arx_fcast_epi_workflow.Rd
index fdd309959..7a6b66305 100644
--- a/man/arx_fcast_epi_workflow.Rd
+++ b/man/arx_fcast_epi_workflow.Rd
@@ -41,12 +41,16 @@ use \code{\link[=quantile_reg]{quantile_reg()}}) but can be omitted.
jhu <- case_death_rate_subset \%>\%
dplyr::filter(time_value >= as.Date("2021-12-01"))
-arx_fcast_epi_workflow(jhu, "death_rate",
- c("case_rate", "death_rate"))
+arx_fcast_epi_workflow(
+ jhu, "death_rate",
+ c("case_rate", "death_rate")
+)
arx_fcast_epi_workflow(jhu, "death_rate",
- c("case_rate", "death_rate"), trainer = quantile_reg(),
- args_list = arx_args_list(levels = 1:9 / 10))
+ c("case_rate", "death_rate"),
+ trainer = quantile_reg(),
+ args_list = arx_args_list(levels = 1:9 / 10)
+)
}
\seealso{
\code{\link[=arx_forecaster]{arx_forecaster()}}
diff --git a/man/arx_forecaster.Rd b/man/arx_forecaster.Rd
index d4866aa0e..e121f272c 100644
--- a/man/arx_forecaster.Rd
+++ b/man/arx_forecaster.Rd
@@ -41,12 +41,16 @@ that it estimates a model for a particular target horizon.
jhu <- case_death_rate_subset \%>\%
dplyr::filter(time_value >= as.Date("2021-12-01"))
-out <- arx_forecaster(jhu, "death_rate",
- c("case_rate", "death_rate"))
+out <- arx_forecaster(
+ jhu, "death_rate",
+ c("case_rate", "death_rate")
+)
out <- arx_forecaster(jhu, "death_rate",
- c("case_rate", "death_rate"), trainer = quantile_reg(),
- args_list = arx_args_list(levels = 1:9 / 10))
+ c("case_rate", "death_rate"),
+ trainer = quantile_reg(),
+ args_list = arx_args_list(levels = 1:9 / 10)
+)
}
\seealso{
\code{\link[=arx_fcast_epi_workflow]{arx_fcast_epi_workflow()}}, \code{\link[=arx_args_list]{arx_args_list()}}
diff --git a/man/cdc_baseline_args_list.Rd b/man/cdc_baseline_args_list.Rd
new file mode 100644
index 000000000..2f6546f74
--- /dev/null
+++ b/man/cdc_baseline_args_list.Rd
@@ -0,0 +1,85 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/cdc_baseline_forecaster.R
+\name{cdc_baseline_args_list}
+\alias{cdc_baseline_args_list}
+\title{CDC baseline forecaster argument constructor}
+\usage{
+cdc_baseline_args_list(
+ data_frequency = "1 week",
+ aheads = 1:4,
+ n_training = Inf,
+ forecast_date = NULL,
+ quantile_levels = c(0.01, 0.025, 1:19/20, 0.975, 0.99),
+ nsims = 1000L,
+ symmetrize = TRUE,
+ nonneg = TRUE,
+ quantile_by_key = "geo_value",
+ nafill_buffer = Inf
+)
+}
+\arguments{
+\item{data_frequency}{Integer or string. This describes the frequency of the
+input \code{epi_df}. For typical FluSight forecasts, this would be \code{"1 week"}.
+Allowable arguments are integers (taken to mean numbers of days) or a
+string like \code{"7 days"} or \code{"2 weeks"}. Currently, all other periods
+(other than days or weeks) result in an error.}

+\item{aheads}{Integer vector. Unlike \code{\link[=arx_forecaster]{arx_forecaster()}}, this doesn't have
+any effect on the predicted values.
+Predictions are always the most recent observation. This determines the
+set of prediction horizons for \code{\link[=layer_cdc_flatline_quantiles]{layer_cdc_flatline_quantiles()}}.
+It interacts with the \code{data_frequency} argument. So, for example, if the
+data is daily and you want forecasts for 1:4 days ahead, then you would use
+\code{1:4}. However, if you want one-week predictions, you would set this as
+\code{c(7, 14, 21, 28)}. But if \code{data_frequency} is \code{"1 week"}, then you
+would set it as \code{1:4}.}

+\item{n_training}{Integer. An upper limit for the number of rows per
+key that are used for training
+(in the time unit of the \code{epi_df}).}

+\item{forecast_date}{Date. The date on which the forecast is created.
+The default \code{NULL} will attempt to determine this automatically.}

+\item{quantile_levels}{Vector or \code{NULL}. A vector of probabilities to produce
+prediction intervals. These are created by computing the quantiles of
+training residuals. A \code{NULL} value will result in point forecasts only.}

+\item{nsims}{Positive integer. The number of draws from the empirical CDF.
+These samples are spaced evenly on the (0, 1) scale of F_X(x), resulting
+in linear interpolation on the X scale. This is achieved with
+\code{\link[stats:quantile]{stats::quantile()}} Type 7 (the default for that function).}

+\item{symmetrize}{Logical. The default \code{TRUE} calculates
+symmetric prediction intervals. This argument only applies when
+residual quantiles are used. It is not applicable with
+\code{trainer = quantile_reg()}, for example.}

+\item{nonneg}{Logical. Force all predictive intervals to be non-negative.
+Because non-negativity is forced \emph{before} propagating forward, this
+has slightly different behaviour than would occur if using
+\code{\link[=layer_threshold]{layer_threshold()}}.}

+\item{quantile_by_key}{Character vector. Groups residuals by listed keys
+before calculating residual quantiles. See the \code{by_key} argument to
+\code{\link[=layer_residual_quantiles]{layer_residual_quantiles()}} for more information. The default,
+\code{character(0)}, performs no grouping. This argument only applies when
+residual quantiles are used. It is not applicable with
+\code{trainer = quantile_reg()}, for example.}

+\item{nafill_buffer}{At predict time, recent values of the training data
+are used to create a forecast. However, these can be \code{NA} due to, e.g.,
+data latency issues. By default, any missing values will get filled with
+less recent data. Setting this value to \code{NULL} will result in 1 extra
+recent row (beyond those required for lag creation) to be used. Note that
+we require at least \code{min(lags)} rows of recent data per \code{geo_value} to
+create a prediction. For this reason, setting \code{nafill_buffer < min(lags)}
+will be treated as \emph{additional} allowed recent data rather than the
+total amount of recent data to examine.}
+}
+\value{
+A list containing updated parameter choices with class \code{cdc_baseline_fcast}.
+}
+\description{
+Constructs a list of arguments for \code{\link[=cdc_baseline_forecaster]{cdc_baseline_forecaster()}}.
+}
+\examples{
+cdc_baseline_args_list()
+cdc_baseline_args_list(symmetrize = FALSE)
+cdc_baseline_args_list(quantile_levels = c(.1, .3, .7, .9), n_training = 120)
+}
diff --git a/man/cdc_baseline_forecaster.Rd b/man/cdc_baseline_forecaster.Rd
new file mode 100644
index 000000000..7e62a0521
--- /dev/null
+++ b/man/cdc_baseline_forecaster.Rd
@@ -0,0 +1,73 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/cdc_baseline_forecaster.R
+\name{cdc_baseline_forecaster}
+\alias{cdc_baseline_forecaster}
+\title{Predict the future with the most recent value}
+\usage{
+cdc_baseline_forecaster(
+ epi_data,
+ outcome,
+ args_list = cdc_baseline_args_list()
+)
+}
+\arguments{
+\item{epi_data}{An \link[epiprocess:epi_df]{epiprocess::epi_df}}

+\item{outcome}{A scalar character for the column name we wish to predict.}

+\item{args_list}{A list of additional arguments as created by the
+\code{\link[=cdc_baseline_args_list]{cdc_baseline_args_list()}} constructor function.}
+}
+\value{
+A data frame of point and interval forecasts for all
+aheads (unique horizons) for each unique combination of \code{key_vars}.
+}
+\description{
+This is a simple forecasting model for
+\link[epiprocess:epi_df]{epiprocess::epi_df} data. It uses the most recent observation as the
+forecast for any future date, and produces intervals by shuffling the quantiles
+of the residuals of such a "flatline" forecast and incrementing these
+forward over all available training data.
+}
+\details{
+By default, the predictive intervals are computed separately for each
+combination of \code{geo_value} in the \code{epi_data} argument.
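A toy sketch of what computing intervals "separately for each combination of `geo_value`" amounts to; nothing below comes from the package itself:

resids <- data.frame(
  geo_value = rep(c("ca", "ny"), each = 50),
  .resid = c(rnorm(50, sd = 1), rnorm(50, sd = 5)) # ny noisier than ca
)
# grouping by the key before taking quantiles gives each geo_value its own
# interval width:
tapply(resids$.resid, resids$geo_value, quantile, probs = c(.1, .9))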
+
+This forecaster is meant to produce exactly the CDC Baseline used for
+\href{https://covid19forecasthub.org}{COVID19ForecastHub}.
+}
+\examples{
+library(dplyr)
+weekly_deaths <- case_death_rate_subset \%>\%
+ select(geo_value, time_value, death_rate) \%>\%
+ left_join(state_census \%>\% select(pop, abbr), by = c("geo_value" = "abbr")) \%>\%
+ mutate(deaths = pmax(death_rate / 1e5 * pop, 0)) \%>\%
+ select(-pop, -death_rate) \%>\%
+ group_by(geo_value) \%>\%
+ epi_slide(~ sum(.$deaths), before = 6, new_col_name = "deaths") \%>\%
+ ungroup() \%>\%
+ filter(weekdays(time_value) == "Saturday")
+
+cdc <- cdc_baseline_forecaster(weekly_deaths, "deaths")
+preds <- pivot_quantiles_wider(cdc$predictions, .pred_distn)
+
+if (require(ggplot2)) {
+forecast_date <- unique(preds$forecast_date)
+four_states <- c("ca", "pa", "wa", "ny")
+preds \%>\%
+ filter(geo_value \%in\% four_states) \%>\%
+ ggplot(aes(target_date)) +
+ geom_ribbon(aes(ymin = `0.1`, ymax = `0.9`), fill = blues9[3]) +
+ geom_ribbon(aes(ymin = `0.25`, ymax = `0.75`), fill = blues9[6]) +
+ geom_line(aes(y = .pred), color = "orange") +
+ geom_line(
+ data = weekly_deaths \%>\% filter(geo_value \%in\% four_states),
+ aes(x = time_value, y = deaths)
+ ) +
+ scale_x_date(limits = c(forecast_date - 90, forecast_date + 30)) +
+ labs(x = "Date", y = "Weekly deaths") +
+ facet_wrap(~geo_value, scales = "free_y") +
+ theme_bw() +
+ geom_vline(xintercept = forecast_date)
+}
+}
diff --git a/man/create_layer.Rd b/man/create_layer.Rd
index 399d62efa..d36385fb2 100644
--- a/man/create_layer.Rd
+++ b/man/create_layer.Rd
@@ -20,9 +20,9 @@ fill in the name of the layer, and open the file.
\examples{
\dontrun{
- # Note: running this will write `layer_strawberry.R` to
- # the `R/` directory of your current project
- create_layer("strawberry")
+# Note: running this will write `layer_strawberry.R` to
+# the `R/` directory of your current project
+create_layer("strawberry")
}
}
diff --git a/man/dist_quantiles.Rd b/man/dist_quantiles.Rd
index 50f00dc32..739bae5a8 100644
--- a/man/dist_quantiles.Rd
+++ b/man/dist_quantiles.Rd
@@ -15,7 +15,7 @@ dist_quantiles(x, tau)
A distribution parameterized by a set of quantiles
}
\examples{
-dstn <- dist_quantiles(list(1:4, 8:11), list(c(.2,.4,.6,.8)))
+dstn <- dist_quantiles(list(1:4, 8:11), list(c(.2, .4, .6, .8)))
quantile(dstn, p = c(.1, .25, .5, .9))
median(dstn)
diff --git a/man/extrapolate_quantiles.Rd b/man/extrapolate_quantiles.Rd
index 985d7cae8..cc6cb2c3c 100644
--- a/man/extrapolate_quantiles.Rd
+++ b/man/extrapolate_quantiles.Rd
@@ -24,12 +24,14 @@ library(distributional)
dstn <- dist_normal(c(10, 2), c(5, 10))
extrapolate_quantiles(dstn, p = c(.25, 0.5, .75))
-dstn <- dist_quantiles(list(1:4, 8:11), list(c(.2,.4,.6,.8)))
+dstn <- dist_quantiles(list(1:4, 8:11), list(c(.2, .4, .6, .8)))
# because this distribution is already quantiles, any extra quantiles are
# appended
extrapolate_quantiles(dstn, p = c(.25, 0.5, .75))
-dstn <- c(dist_normal(c(10, 2), c(5, 10)),
- dist_quantiles(list(1:4, 8:11), list(c(.2,.4,.6,.8))))
+dstn <- c(
+ dist_normal(c(10, 2), c(5, 10)),
+ dist_quantiles(list(1:4, 8:11), list(c(.2, .4, .6, .8)))
+)
extrapolate_quantiles(dstn, p = c(.25, 0.5, .75))
}
diff --git a/man/figures/lifecycle-archived.svg b/man/figures/lifecycle-archived.svg
new file mode 100644
index 000000000..745ab0c78
--- /dev/null
+++ b/man/figures/lifecycle-archived.svg
@@ -0,0 +1,21 @@
+[standard lifecycle "archived" badge SVG; markup omitted]
diff --git a/man/figures/lifecycle-defunct.svg b/man/figures/lifecycle-defunct.svg
new file mode 100644
index 000000000..d5c9559ed
--- /dev/null
+++ b/man/figures/lifecycle-defunct.svg
@@ -0,0 +1,21 @@
+[standard lifecycle "defunct" badge SVG; markup omitted]
diff --git a/man/figures/lifecycle-deprecated.svg b/man/figures/lifecycle-deprecated.svg
new file mode 100644
index 000000000..b61c57c3f
--- /dev/null
+++ b/man/figures/lifecycle-deprecated.svg
@@ -0,0 +1,21 @@
+[standard lifecycle "deprecated" badge SVG; markup omitted]
diff --git a/man/figures/lifecycle-experimental.svg b/man/figures/lifecycle-experimental.svg
new file mode 100644
index 000000000..5d88fc2c6
--- /dev/null
+++ b/man/figures/lifecycle-experimental.svg
@@ -0,0 +1,21 @@
+[standard lifecycle "experimental" badge SVG; markup omitted]
diff --git a/man/figures/lifecycle-maturing.svg b/man/figures/lifecycle-maturing.svg
new file mode 100644
index 000000000..897370ecf
--- /dev/null
+++ b/man/figures/lifecycle-maturing.svg
@@ -0,0 +1,21 @@
+[standard lifecycle "maturing" badge SVG; markup omitted]
diff --git a/man/figures/lifecycle-questioning.svg b/man/figures/lifecycle-questioning.svg
new file mode 100644
index 000000000..7c1721d05
--- /dev/null
+++ b/man/figures/lifecycle-questioning.svg
@@ -0,0 +1,21 @@
+[standard lifecycle "questioning" badge SVG; markup omitted]
diff --git a/man/figures/lifecycle-soft-deprecated.svg b/man/figures/lifecycle-soft-deprecated.svg
new file mode 100644
index 000000000..9c166ff30
--- /dev/null
+++ b/man/figures/lifecycle-soft-deprecated.svg
@@ -0,0 +1,21 @@
+[standard lifecycle "soft-deprecated" badge SVG; markup omitted]
diff --git a/man/figures/lifecycle-stable.svg b/man/figures/lifecycle-stable.svg
new file mode 100644
index 000000000..9bf21e76b
--- /dev/null
+++ b/man/figures/lifecycle-stable.svg
@@ -0,0 +1,29 @@
+[standard lifecycle "stable" badge SVG; markup omitted]
diff --git a/man/figures/lifecycle-superseded.svg b/man/figures/lifecycle-superseded.svg
new file mode 100644
index 000000000..db8d757f7
--- /dev/null
+++ b/man/figures/lifecycle-superseded.svg
@@ -0,0 +1,21 @@
+[standard lifecycle "superseded" badge SVG; markup omitted]
diff --git a/man/fit-epi_workflow.Rd b/man/fit-epi_workflow.Rd
index fb1c3af28..3dfa0029a 100644
--- a/man/fit-epi_workflow.Rd
+++ b/man/fit-epi_workflow.Rd
@@ -29,7 +29,7 @@ preprocessing the data and fitting the underlying parsnip model.
}
\examples{
jhu <- case_death_rate_subset \%>\%
-filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny"))
+ filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny"))
r <- epi_recipe(jhu) \%>\%
step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\%
diff --git a/man/flatline.Rd b/man/flatline.Rd
index a396cfeb9..c353ff163 100644
--- a/man/flatline.Rd
+++ b/man/flatline.Rd
@@ -38,8 +38,10 @@ This is an internal function that is used to create a \code{\link[parsnip:linear
model. It has somewhat odd behaviour (see below).
} \examples{ -tib <- data.frame(y = runif(100), - expand.grid(k = letters[1:4], j = letters[5:9], time_value = 1:5)) \%>\% +tib <- data.frame( + y = runif(100), + expand.grid(k = letters[1:4], j = letters[5:9], time_value = 1:5) +) \%>\% dplyr::group_by(k, j) \%>\% dplyr::mutate(y2 = dplyr::lead(y, 2)) # predict 2 steps ahead flat <- flatline(y2 ~ j + k + y, tib) # predictions for 20 locations diff --git a/man/flatline_args_list.Rd b/man/flatline_args_list.Rd index 55d93c1db..dcae448f1 100644 --- a/man/flatline_args_list.Rd +++ b/man/flatline_args_list.Rd @@ -17,8 +17,12 @@ flatline_args_list( ) } \arguments{ -\item{ahead}{Integer. Number of time steps ahead (in days) of the forecast -date for which forecasts should be produced.} +\item{ahead}{Integer. Unlike \code{\link[=arx_forecaster]{arx_forecaster()}}, this doesn't have any effect +on the predicted values. Predictions are always the most recent observation. +However, this \emph{does} impact the residuals stored in the object. Residuals +are calculated based on this number to mimic how badly you would have done. +So for example, \code{ahead = 7} will create residuals by comparing values +7 days apart.} \item{n_training}{Integer. An upper limit for the number of rows per key that are used for training diff --git a/man/frosting.Rd b/man/frosting.Rd index 83a8d6a9d..362c40a4f 100644 --- a/man/frosting.Rd +++ b/man/frosting.Rd @@ -24,8 +24,8 @@ The arguments are currently placeholders and must be NULL \examples{ # Toy example to show that frosting can be created and added for postprocessing - f <- frosting() - wf <- epi_workflow() \%>\% add_frosting(f) +f <- frosting() +wf <- epi_workflow() \%>\% add_frosting(f) # A more realistic example jhu <- case_death_rate_subset \%>\% diff --git a/man/layer_add_forecast_date.Rd b/man/layer_add_forecast_date.Rd index 421978eb5..4e173d662 100644 --- a/man/layer_add_forecast_date.Rd +++ b/man/layer_add_forecast_date.Rd @@ -46,15 +46,17 @@ wf <- epi_workflow(r, parsnip::linear_reg()) \%>\% fit(jhu) latest <- jhu \%>\% dplyr::filter(time_value >= max(time_value) - 14) - # Don't specify `forecast_date` (by default, this should be last date in latest) -f <- frosting() \%>\% layer_predict() \%>\% - layer_naomit(.pred) +# Don't specify `forecast_date` (by default, this should be last date in latest) +f <- frosting() \%>\% + layer_predict() \%>\% + layer_naomit(.pred) wf0 <- wf \%>\% add_frosting(f) p0 <- predict(wf0, latest) p0 # Specify a `forecast_date` that is greater than or equal to `as_of` date -f <- frosting() \%>\% layer_predict() \%>\% +f <- frosting() \%>\% + layer_predict() \%>\% layer_add_forecast_date(forecast_date = "2022-05-31") \%>\% layer_naomit(.pred) wf1 <- wf \%>\% add_frosting(f) @@ -73,7 +75,7 @@ p2 <- predict(wf2, latest) p2 # Do not specify a forecast_date - f3 <- frosting() \%>\% +f3 <- frosting() \%>\% layer_predict() \%>\% layer_add_forecast_date() \%>\% layer_naomit(.pred) diff --git a/man/layer_add_target_date.Rd b/man/layer_add_target_date.Rd index 58ff7770f..3c2884e10 100644 --- a/man/layer_add_target_date.Rd +++ b/man/layer_add_target_date.Rd @@ -48,7 +48,8 @@ wf <- epi_workflow(r, parsnip::linear_reg()) \%>\% fit(jhu) latest <- get_test_data(r, jhu) # Use ahead + forecast date -f <- frosting() \%>\% layer_predict() \%>\% +f <- frosting() \%>\% + layer_predict() \%>\% layer_add_forecast_date(forecast_date = "2022-05-31") \%>\% layer_add_target_date() \%>\% layer_naomit(.pred) @@ -59,7 +60,8 @@ p # Use ahead + max time value from pre, fit, post # which is the same if include 
`layer_add_forecast_date()`
-f2 <- frosting() \%>\% layer_predict() \%>\%
+f2 <- frosting() \%>\%
+ layer_predict() \%>\%
layer_add_target_date() \%>\%
layer_naomit(.pred)
wf2 <- wf \%>\% add_frosting(f2)
diff --git a/man/layer_cdc_flatline_quantiles.Rd b/man/layer_cdc_flatline_quantiles.Rd
new file mode 100644
index 000000000..594c63afb
--- /dev/null
+++ b/man/layer_cdc_flatline_quantiles.Rd
@@ -0,0 +1,126 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/layer_cdc_flatline_quantiles.R
+\name{layer_cdc_flatline_quantiles}
+\alias{layer_cdc_flatline_quantiles}
+\title{CDC Flatline Forecast Quantiles}
+\usage{
+layer_cdc_flatline_quantiles(
+ frosting,
+ ...,
+ aheads = 1:4,
+ quantile_levels = c(0.01, 0.025, 1:19/20, 0.975, 0.99),
+ nsims = 1000,
+ by_key = "geo_value",
+ symmetrize = FALSE,
+ nonneg = TRUE,
+ id = rand_id("cdc_baseline_bands")
+)
+}
+\arguments{
+\item{frosting}{a \code{frosting} postprocessor}

+\item{...}{Unused, include for consistency with other layers.}

+\item{aheads}{Numeric vector of desired forecast horizons. These should be
+given in the "units of the training data". So, for example, for data
+typically observed daily (possibly with missing values), but
+with weekly forecast targets, you would use \code{c(7, 14, 21, 28)}. But with
+weekly data, you would use \code{1:4}.}

+\item{quantile_levels}{Numeric vector of probabilities with values in (0,1)
+referring to the desired predictive intervals. The default is the standard
+set for the COVID Forecast Hub.}

+\item{nsims}{Positive integer. The number of draws from the empirical CDF.
+These samples are spaced evenly on the (0, 1) scale of F_X(x), resulting
+in linear interpolation on the X scale. This is achieved with
+\code{\link[stats:quantile]{stats::quantile()}} Type 7 (the default for that function).}

+\item{by_key}{A character vector of keys to group the residuals by before
+calculating quantiles. The default, \code{c()}, performs no grouping.}

+\item{symmetrize}{Logical. If \code{TRUE}, the interval will be symmetric.}

+\item{nonneg}{Logical. Force all predictive intervals to be non-negative.
+Because non-negativity is forced \emph{before} propagating forward, this
+has slightly different behaviour than would occur if using
+\code{\link[=layer_threshold]{layer_threshold()}}.}

+\item{id}{a random id string}
+}
+\value{
+an updated \code{frosting} postprocessor. Calling \code{\link[=predict]{predict()}} will result
+in an additional \verb{<list-col>} named \code{.pred_distn_all} containing 2-column
+\code{\link[tibble:tibble]{tibble::tibble()}}'s. For each
+desired combination of \code{key}'s, the tibble will contain one row per ahead
+with the associated \code{\link[=dist_quantiles]{dist_quantiles()}}.
+}
+\description{
+This layer creates quantile forecasts by taking a sample from the
+interpolated CDF of the flatline residuals, and shuffling them.
+These are then added on to the point prediction.
+}
+\details{
+This layer is intended to be used in concert with \code{\link[=flatline]{flatline()}}. But it can
+also be used with anything else. As long as residuals are available in the
+fitted model, this layer could be useful. Like
+\code{\link[=layer_residual_quantiles]{layer_residual_quantiles()}}, it only uses the residuals from the fitted model
+object. However, it propagates these forward for \emph{all} aheads, by
+iteratively shuffling them (randomly), and then adding them to the previous
+set.
This is in contrast to what happens with the \code{\link[=flatline_forecaster]{flatline_forecaster()}}. +When using \code{\link[=flatline]{flatline()}} as the underlying engine (here), both will result in the +same predictions (the most recent observed value), but \code{flatline_forecaster()} calculates +separate residuals for each \code{ahead} by comparing to observations further into +the future. This version continues to use the same set of residuals, and +adds them on to produce wider intervals as \code{ahead} increases. +} +\examples{ +r <- epi_recipe(case_death_rate_subset) \%>\% + # data is "daily", so we fit this to 1 ahead, the result will contain + # 1 day ahead residuals + step_epi_ahead(death_rate, ahead = 1L, skip = TRUE) \%>\% + recipes::update_role(death_rate, new_role = "predictor") \%>\% + recipes::add_role(time_value, geo_value, new_role = "predictor") + +forecast_date <- max(case_death_rate_subset$time_value) + +latest <- get_test_data( + epi_recipe(case_death_rate_subset), case_death_rate_subset +) + +f <- frosting() \%>\% + layer_predict() \%>\% + layer_cdc_flatline_quantiles(aheads = c(7, 14, 21, 28), symmetrize = TRUE) + +eng <- parsnip::linear_reg() \%>\% parsnip::set_engine("flatline") + +wf <- epi_workflow(r, eng, f) \%>\% fit(case_death_rate_subset) +preds <- suppressWarnings(predict(wf, new_data = latest)) \%>\% + dplyr::select(-time_value) \%>\% + dplyr::mutate(forecast_date = forecast_date) +preds + +preds <- preds \%>\% + unnest(.pred_distn_all) \%>\% + pivot_quantiles_wider(.pred_distn) \%>\% + mutate(target_date = forecast_date + ahead) + +if (require("ggplot2")) { +four_states <- c("ca", "pa", "wa", "ny") +preds \%>\% + filter(geo_value \%in\% four_states) \%>\% + ggplot(aes(target_date)) + + geom_ribbon(aes(ymin = `0.1`, ymax = `0.9`), fill = blues9[3]) + + geom_ribbon(aes(ymin = `0.25`, ymax = `0.75`), fill = blues9[6]) + + geom_line(aes(y = .pred), color = "orange") + + geom_line( + data = case_death_rate_subset \%>\% filter(geo_value \%in\% four_states), + aes(x = time_value, y = death_rate) + ) + + scale_x_date(limits = c(forecast_date - 90, forecast_date + 30)) + + labs(x = "Date", y = "Death rate") + + facet_wrap(~geo_value, scales = "free_y") + + theme_bw() + + geom_vline(xintercept = forecast_date) +} +} diff --git a/man/layer_population_scaling.Rd b/man/layer_population_scaling.Rd index e841e9a50..179d6862c 100644 --- a/man/layer_population_scaling.Rd +++ b/man/layer_population_scaling.Rd @@ -78,13 +78,15 @@ jhu <- epiprocess::jhu_csse_daily_subset \%>\% dplyr::filter(time_value > "2021-11-01", geo_value \%in\% c("ca", "ny")) \%>\% dplyr::select(geo_value, time_value, cases) -pop_data = data.frame(states = c("ca", "ny"), value = c(20000, 30000)) +pop_data <- data.frame(states = c("ca", "ny"), value = c(20000, 30000)) r <- epi_recipe(jhu) \%>\% - step_population_scaling(df = pop_data, - df_pop_col = "value", - by = c("geo_value" = "states"), - cases, suffix = "_scaled") \%>\% + step_population_scaling( + df = pop_data, + df_pop_col = "value", + by = c("geo_value" = "states"), + cases, suffix = "_scaled" + ) \%>\% step_epi_lag(cases_scaled, lag = c(0, 7, 14)) \%>\% step_epi_ahead(cases_scaled, ahead = 7, role = "outcome") \%>\% step_epi_naomit() @@ -93,9 +95,11 @@ f <- frosting() \%>\% layer_predict() \%>\% layer_threshold(.pred) \%>\% layer_naomit(.pred) \%>\% - layer_population_scaling(.pred, df = pop_data, - by = c("geo_value" = "states"), - df_pop_col = "value") + layer_population_scaling(.pred, + df = pop_data, + by = c("geo_value" = "states"), + df_pop_col =
"value" + ) wf <- epi_workflow(r, parsnip::linear_reg()) \%>\% fit(jhu) \%>\% @@ -104,9 +108,12 @@ wf <- epi_workflow(r, parsnip::linear_reg()) \%>\% latest <- get_test_data( recipe = r, x = epiprocess::jhu_csse_daily_subset \%>\% - dplyr::filter(time_value > "2021-11-01", - geo_value \%in\% c("ca", "ny")) \%>\% - dplyr::select(geo_value, time_value, cases)) + dplyr::filter( + time_value > "2021-11-01", + geo_value \%in\% c("ca", "ny") + ) \%>\% + dplyr::select(geo_value, time_value, cases) +) predict(wf, latest) } diff --git a/man/layer_predict.Rd b/man/layer_predict.Rd index 1326dfe75..03473053f 100644 --- a/man/layer_predict.Rd +++ b/man/layer_predict.Rd @@ -62,9 +62,9 @@ jhu <- case_death_rate_subset \%>\% filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) r <- epi_recipe(jhu) \%>\% - step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% - step_epi_ahead(death_rate, ahead = 7) \%>\% - step_epi_naomit() + step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% + step_epi_ahead(death_rate, ahead = 7) \%>\% + step_epi_naomit() wf <- epi_workflow(r, parsnip::linear_reg()) \%>\% fit(jhu) latest <- jhu \%>\% filter(time_value >= max(time_value) - 14) diff --git a/man/nested_quantiles.Rd b/man/nested_quantiles.Rd index 1a2824041..143532650 100644 --- a/man/nested_quantiles.Rd +++ b/man/nested_quantiles.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/dist_quantiles.R +% Please edit documentation in R/pivot_quantiles.R \name{nested_quantiles} \alias{nested_quantiles} \title{Turn a vector of quantile distributions into a list-col} @@ -16,8 +16,8 @@ a list-col Turn a vector of quantile distributions into a list-col } \examples{ -edf <- case_death_rate_subset[1:3,] -edf$q <- dist_quantiles(list(1:5, 2:4, 3:10), list(1:5/6, 2:4/5, 3:10/11)) +edf <- case_death_rate_subset[1:3, ] +edf$q <- dist_quantiles(list(1:5, 2:4, 3:10), list(1:5 / 6, 2:4 / 5, 3:10 / 11)) edf_nested <- edf \%>\% dplyr::mutate(q = nested_quantiles(q)) edf_nested \%>\% tidyr::unnest(q) diff --git a/man/pivot_quantiles_longer.Rd b/man/pivot_quantiles_longer.Rd new file mode 100644 index 000000000..f73e6deaf --- /dev/null +++ b/man/pivot_quantiles_longer.Rd @@ -0,0 +1,42 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pivot_quantiles.R +\name{pivot_quantiles_longer} +\alias{pivot_quantiles_longer} +\title{Pivot columns containing \code{dist_quantile} longer} +\usage{ +pivot_quantiles_longer(.data, ..., .ignore_length_check = FALSE) +} +\arguments{ +\item{.data}{A data frame, or a data frame extension such as a tibble or +epi_df.} + +\item{...}{<\code{\link[dplyr:dplyr_tidy_select]{tidy-select}}> One or more unquoted +expressions separated by commas. Variable names can be used as if they +were positions in the data frame, so expressions like \code{x:y} can +be used to select a range of variables.} + +\item{.ignore_length_check}{If multiple columns are selected, as long as +each row has contains the same number of quantiles, the result will be +reasonable. But if, for example, \code{var1[1]} has 5 quantiles while \code{var2[1]} +has 7, then the only option would be to recycle everything, creating a +\emph{very} long result. By default, this would throw an error. But if this is +really the goal, then the error can be bypassed by setting this argument +to \code{TRUE}. The quantiles in the first selected column will vary the fastest.} +} +\value{ +An object of the same class as \code{.data}. 
+} +\description{ +Selected columns that contain \code{dist_quantiles} will be "lengthened" with +the quantile levels serving as one column and the values as another. If +multiple columns are selected, the new columns are prefixed with the originating column name. +} +\examples{ +d1 <- c(dist_quantiles(1:3, 1:3 / 4), dist_quantiles(2:4, 1:3 / 4)) +d2 <- c(dist_quantiles(2:4, 2:4 / 5), dist_quantiles(3:5, 2:4 / 5)) +tib <- tibble::tibble(g = c("a", "b"), d1 = d1, d2 = d2) + +pivot_quantiles_longer(tib, "d1") +pivot_quantiles_longer(tib, tidyselect::ends_with("1")) +pivot_quantiles_longer(tib, d1, d2) +} diff --git a/man/pivot_quantiles.Rd b/man/pivot_quantiles_wider.Rd similarity index 75% rename from man/pivot_quantiles.Rd rename to man/pivot_quantiles_wider.Rd index 0ed6588ed..02a33bb2f 100644 --- a/man/pivot_quantiles.Rd +++ b/man/pivot_quantiles_wider.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/dist_quantiles.R -\name{pivot_quantiles} -\alias{pivot_quantiles} +% Please edit documentation in R/pivot_quantiles.R +\name{pivot_quantiles_wider} +\alias{pivot_quantiles_wider} \title{Pivot columns containing \code{dist_quantile} wider} \usage{ -pivot_quantiles(.data, ...) +pivot_quantiles_wider(.data, ...) } \arguments{ \item{.data}{A data frame, or a data frame extension such as a tibble or @@ -13,7 +13,7 @@ epi_df.} \item{...}{<\code{\link[dplyr:dplyr_tidy_select]{tidy-select}}> One or more unquoted expressions separated by commas. Variable names can be used as if they were positions in the data frame, so expressions like \code{x:y} can -be used to select a range of variables. Any selected columns should} +be used to select a range of variables.} } \value{ An object of the same class as \code{.data} @@ -29,7 +29,7 @@ d1 <- c(dist_quantiles(1:3, 1:3 / 4), dist_quantiles(2:4, 1:3 / 4)) d2 <- c(dist_quantiles(2:4, 2:4 / 5), dist_quantiles(3:5, 2:4 / 5)) tib <- tibble::tibble(g = c("a", "b"), d1 = d1, d2 = d2) -pivot_quantiles(tib, c("d1", "d2")) -pivot_quantiles(tib, tidyselect::starts_with("d")) -pivot_quantiles(tib, d2) +pivot_quantiles_wider(tib, c("d1", "d2")) +pivot_quantiles_wider(tib, tidyselect::starts_with("d")) +pivot_quantiles_wider(tib, d2) } diff --git a/man/smooth_quantile_reg.Rd b/man/smooth_quantile_reg.Rd index 293999876..b938541f1 100644 --- a/man/smooth_quantile_reg.Rd +++ b/man/smooth_quantile_reg.Rd @@ -39,9 +39,10 @@ only supported engine is \code{\link[smoothqr:smooth_qr]{smoothqr::smooth_qr()}} tib <- data.frame( y1 = rnorm(100), y2 = rnorm(100), y3 = rnorm(100), y4 = rnorm(100), y5 = rnorm(100), y6 = rnorm(100), - x1 = rnorm(100), x2 = rnorm(100)) + x1 = rnorm(100), x2 = rnorm(100) +) qr_spec <- smooth_quantile_reg(tau = c(.2, .5, .8), outcome_locations = 1:6) -ff <- qr_spec \%>\% fit(cbind(y1, y2 , y3 , y4 , y5 , y6) ~ ., data = tib) +ff <- qr_spec \%>\% fit(cbind(y1, y2, y3, y4, y5, y6) ~ ., data = tib) p <- predict(ff, new_data = tib) x <- -99:99 / 100 * 2 * pi @@ -50,28 +51,31 @@ fd <- x[length(x) - 20] XY <- smoothqr::lagmat(y[1:(length(y) - 20)], c(-20:20)) XY <- tibble::as_tibble(XY) qr_spec <- smooth_quantile_reg(tau = c(.2, .5, .8), outcome_locations = 20:1) -tt <- qr_spec \%>\% fit_xy(x = XY[,21:41], y = XY[,1:20]) +tt <- qr_spec \%>\% fit_xy(x = XY[, 21:41], y = XY[, 1:20]) library(tidyr) library(dplyr) pl <- predict( - object = tt, - new_data = XY[max(which(complete.cases(XY[,21:41]))), 21:41] - ) + object = tt, + new_data = XY[max(which(complete.cases(XY[, 21:41]))), 21:41] +) pl <- pl \%>\% - unnest(.pred) \%>\% -
mutate(distn = nested_quantiles(distn)) \%>\% - unnest(distn) \%>\% - mutate(x = x[length(x) - 20] + ahead / 100 * 2 * pi, - ahead = NULL) \%>\% - pivot_wider(names_from = tau, values_from = q) + unnest(.pred) \%>\% + mutate(distn = nested_quantiles(distn)) \%>\% + unnest(distn) \%>\% + mutate( + x = x[length(x) - 20] + ahead / 100 * 2 * pi, + ahead = NULL + ) \%>\% + pivot_wider(names_from = tau, values_from = q) plot(x, y, pch = 16, xlim = c(pi, 2 * pi), col = "lightgrey") curve(sin(x), add = TRUE) abline(v = fd, lty = 2) lines(pl$x, pl$`0.2`, col = "blue") lines(pl$x, pl$`0.8`, col = "blue") lines(pl$x, pl$`0.5`, col = "red") -\dontrun{ + +if (require("ggplot2")) { ggplot(data.frame(x = x, y = y), aes(x)) + geom_ribbon(data = pl, aes(ymin = `0.2`, ymax = `0.8`), fill = "lightblue") + geom_point(aes(y = y), colour = "grey") + # observed data diff --git a/man/step_epi_shift.Rd b/man/step_epi_shift.Rd index ca8609b1e..bf135346e 100644 --- a/man/step_epi_shift.Rd +++ b/man/step_epi_shift.Rd @@ -90,7 +90,7 @@ are always set to \code{"ahead_"} and \code{"epi_ahead"} respectively, while for \examples{ r <- epi_recipe(case_death_rate_subset) \%>\% step_epi_ahead(death_rate, ahead = 7) \%>\% - step_epi_lag(death_rate, lag = c(0,7,14)) + step_epi_lag(death_rate, lag = c(0, 7, 14)) r } \seealso{ diff --git a/man/step_growth_rate.Rd b/man/step_growth_rate.Rd index 0449b887c..b409135b1 100644 --- a/man/step_growth_rate.Rd +++ b/man/step_growth_rate.Rd @@ -87,7 +87,9 @@ r <- epi_recipe(case_death_rate_subset) \%>\% step_growth_rate(case_rate, death_rate) r -r \%>\% recipes::prep() \%>\% recipes::bake(case_death_rate_subset) +r \%>\% + recipes::prep() \%>\% + recipes::bake(case_death_rate_subset) } \seealso{ Other row operation steps: diff --git a/man/step_lag_difference.Rd b/man/step_lag_difference.Rd index d69c25faa..b06abe43c 100644 --- a/man/step_lag_difference.Rd +++ b/man/step_lag_difference.Rd @@ -59,7 +59,9 @@ r <- epi_recipe(case_death_rate_subset) \%>\% step_lag_difference(case_rate, death_rate, horizon = c(7, 14)) r -r \%>\% recipes::prep() \%>\% recipes::bake(case_death_rate_subset) +r \%>\% + recipes::prep() \%>\% + recipes::bake(case_death_rate_subset) } \seealso{ Other row operation steps: diff --git a/man/step_population_scaling.Rd b/man/step_population_scaling.Rd index 2964c6912..1a9564563 100644 --- a/man/step_population_scaling.Rd +++ b/man/step_population_scaling.Rd @@ -104,13 +104,15 @@ jhu <- epiprocess::jhu_csse_daily_subset \%>\% dplyr::filter(time_value > "2021-11-01", geo_value \%in\% c("ca", "ny")) \%>\% dplyr::select(geo_value, time_value, cases) -pop_data = data.frame(states = c("ca", "ny"), value = c(20000, 30000)) +pop_data <- data.frame(states = c("ca", "ny"), value = c(20000, 30000)) r <- epi_recipe(jhu) \%>\% - step_population_scaling(df = pop_data, - df_pop_col = "value", - by = c("geo_value" = "states"), - cases, suffix = "_scaled") \%>\% + step_population_scaling( + df = pop_data, + df_pop_col = "value", + by = c("geo_value" = "states"), + cases, suffix = "_scaled" + ) \%>\% step_epi_lag(cases_scaled, lag = c(0, 7, 14)) \%>\% step_epi_ahead(cases_scaled, ahead = 7, role = "outcome") \%>\% step_epi_naomit() @@ -119,9 +121,11 @@ f <- frosting() \%>\% layer_predict() \%>\% layer_threshold(.pred) \%>\% layer_naomit(.pred) \%>\% - layer_population_scaling(.pred, df = pop_data, - by = c("geo_value" = "states"), - df_pop_col = "value") + layer_population_scaling(.pred, + df = pop_data, + by = c("geo_value" = "states"), + df_pop_col = "value" + ) wf <- epi_workflow(r, 
parsnip::linear_reg()) \%>\% fit(jhu) \%>\% @@ -130,8 +134,10 @@ wf <- epi_workflow(r, parsnip::linear_reg()) \%>\% latest <- get_test_data( recipe = r, epiprocess::jhu_csse_daily_subset \%>\% - dplyr::filter(time_value > "2021-11-01", - geo_value \%in\% c("ca", "ny")) \%>\% + dplyr::filter( + time_value > "2021-11-01", + geo_value \%in\% c("ca", "ny") + ) \%>\% dplyr::select(geo_value, time_value, cases) ) diff --git a/man/step_training_window.Rd b/man/step_training_window.Rd index 7861f27ea..ce7c0fc74 100644 --- a/man/step_training_window.Rd +++ b/man/step_training_window.Rd @@ -50,9 +50,12 @@ after any filtering step. tib <- tibble::tibble( x = 1:10, y = 1:10, - time_value = rep(seq(as.Date("2020-01-01"), by = 1, - length.out = 5), times = 2), - geo_value = rep(c("ca", "hi"), each = 5)) \%>\% + time_value = rep(seq(as.Date("2020-01-01"), + by = 1, + length.out = 5 + ), times = 2), + geo_value = rep(c("ca", "hi"), each = 5) +) \%>\% as_epi_df() epi_recipe(y ~ x, data = tib) \%>\% diff --git a/tests/testthat/test-parse_period.R b/tests/testthat/test-parse_period.R new file mode 100644 index 000000000..0adbcec3d --- /dev/null +++ b/tests/testthat/test-parse_period.R @@ -0,0 +1,12 @@ +test_that("parse_period works", { + expect_error(parse_period(c(1, 2))) + expect_error(parse_period(c(1.3))) + expect_error(parse_period("1 year")) + expect_error(parse_period("2 weeks later")) + expect_identical(parse_period(1), 1L) + expect_identical(parse_period("1 day"), 1L) + expect_identical(parse_period("1 days"), 1L) + expect_identical(parse_period("1 week"), 7L) + expect_identical(parse_period("1 weeks"), 7L) + expect_identical(parse_period("2 weeks"), 14L) +}) diff --git a/tests/testthat/test-pivot_quantiles.R b/tests/testthat/test-pivot_quantiles.R index 85694aace..9928c5e09 100644 --- a/tests/testthat/test-pivot_quantiles.R +++ b/tests/testthat/test-pivot_quantiles.R @@ -1,26 +1,68 @@ -test_that("quantile pivotting behaves", { +test_that("quantile pivoting wider behaves", { tib <- tibble::tibble(a = 1:5, b = 6:10) - expect_error(pivot_quantiles(tib, a)) + expect_error(pivot_quantiles_wider(tib, a)) tib$c <- rep(dist_normal(), 5) - expect_error(pivot_quantiles(tib, c)) + expect_error(pivot_quantiles_wider(tib, c)) d1 <- c(dist_quantiles(1:3, 1:3 / 4), dist_quantiles(2:5, 1:4 / 5)) # different quantiles tib <- tib[1:2, ] tib$d1 <- d1 - expect_error(pivot_quantiles(tib, d1)) + expect_error(pivot_quantiles_wider(tib, d1)) d1 <- c(dist_quantiles(1:3, 1:3 / 4), dist_quantiles(2:4, 2:4 / 4)) tib$d1 <- d1 # would want to error (mismatched quantiles), but hard to check efficiently - expect_silent(pivot_quantiles(tib, d1)) + expect_silent(pivot_quantiles_wider(tib, d1)) d1 <- c(dist_quantiles(1:3, 1:3 / 4), dist_quantiles(2:4, 1:3 / 4)) d2 <- c(dist_quantiles(2:4, 2:4 / 5), dist_quantiles(3:5, 2:4 / 5)) tib <- tibble::tibble(g = c("a", "b"), d1 = d1, d2 = d2) - expect_length(pivot_quantiles(tib, c("d1", "d2")), 7L) - expect_length(pivot_quantiles(tib, tidyselect::starts_with("d")), 7L) - expect_length(pivot_quantiles(tib, d2), 5L) + expect_length(pivot_quantiles_wider(tib, c("d1", "d2")), 7L) + expect_length(pivot_quantiles_wider(tib, tidyselect::starts_with("d")), 7L) + expect_length(pivot_quantiles_wider(tib, d2), 5L) +}) + + +test_that("quantile pivoting longer behaves", { + tib <- tibble::tibble(a = 1:5, b = 6:10) + expect_error(pivot_quantiles_longer(tib, a)) + tib$c <- rep(dist_normal(), 5) + expect_error(pivot_quantiles_longer(tib, c)) + + d1 <- c(dist_quantiles(1:3, 1:3 / 4), dist_quantiles(2:5,
1:4 / 5)) + # different quantiles + tib <- tib[1:2, ] + tib$d1 <- d1 + expect_length(pivot_quantiles_longer(tib, d1), 5L) + expect_identical(nrow(pivot_quantiles_longer(tib, d1)), 7L) + expect_identical(pivot_quantiles_longer(tib, d1)$q, as.double(c(1:3, 2:5))) + + d1 <- c(dist_quantiles(1:3, 1:3 / 4), dist_quantiles(2:4, 2:4 / 4)) + tib$d1 <- d1 + expect_silent(pivot_quantiles_longer(tib, d1)) + + d1 <- c(dist_quantiles(1:3, 1:3 / 4), dist_quantiles(2:4, 1:3 / 4)) + d2 <- c(dist_quantiles(2:4, 2:4 / 5), dist_quantiles(3:5, 2:4 / 5)) + tib <- tibble::tibble(g = c("a", "b"), d1 = d1, d2 = d2) + + + expect_length(pivot_quantiles_longer(tib, c("d1", "d2")), 5L) + expect_identical(nrow(pivot_quantiles_longer(tib, c("d1", "d2"))), 6L) + expect_silent(pivot_quantiles_longer(tib, tidyselect::starts_with("d"))) + expect_length(pivot_quantiles_longer(tib, d2), 4L) + + tib$d3 <- c(dist_quantiles(2:5, 2:5 / 6), dist_quantiles(3:6, 2:5 / 6)) + # now the cols have different numbers of quantiles + expect_error(pivot_quantiles_longer(tib, d1, d3)) + expect_length( + pivot_quantiles_longer(tib, d1, d3, .ignore_length_check = TRUE), + 6L + ) + expect_identical( + pivot_quantiles_longer(tib, d1, d3, .ignore_length_check = TRUE)$d1_q, + as.double(rep(c(1:3, 2:4), each = 4)) + ) }) diff --git a/tests/testthat/test-propogate_samples.R b/tests/testthat/test-propogate_samples.R new file mode 100644 index 000000000..b8a1ff82d --- /dev/null +++ b/tests/testthat/test-propogate_samples.R @@ -0,0 +1,7 @@ +test_that("propogate_samples", { + r <- -30:50 + p <- 40 + quantiles <- 1:9 / 10 + aheads <- c(2, 4, 7) + nsim <- 100 +}) diff --git a/tests/testthat/test-shuffle.R b/tests/testthat/test-shuffle.R new file mode 100644 index 000000000..94bc1aa3b --- /dev/null +++ b/tests/testthat/test-shuffle.R @@ -0,0 +1,5 @@ +test_that("shuffle works", { + expect_error(shuffle(matrix(NA, 2, 2))) + expect_length(shuffle(1:10), 10L) + expect_identical(sort(shuffle(1:10)), 1:10) +}) diff --git a/vignettes/articles/sliding.Rmd b/vignettes/articles/sliding.Rmd index 67af7289d..e889f0b74 100644 --- a/vignettes/articles/sliding.Rmd +++ b/vignettes/articles/sliding.Rmd @@ -129,7 +129,7 @@ fc <- bind_rows( ) ) %>% list_rbind() ) %>% - pivot_quantiles(fc_.pred_distn) + pivot_quantiles_wider(fc_.pred_distn) ``` Here, `arx_forecaster()` does all the heavy lifting. It creates leads of the @@ -225,7 +225,7 @@ can_fc <- bind_rows( ) ) %>% list_rbind() ) %>% - pivot_quantiles(fc_.pred_distn) + pivot_quantiles_wider(fc_.pred_distn) ``` The figures below show the results for all of the provinces. @@ -327,7 +327,7 @@ k_week_version_aware <- function(ahead = 7, version_aware = TRUE) { fc <- bind_rows( map(aheads, ~ k_week_version_aware(.x, TRUE)) %>% list_rbind(), map(aheads, ~ k_week_version_aware(.x, FALSE)) %>% list_rbind() -) %>% pivot_quantiles(fc_.pred_distn) +) %>% pivot_quantiles_wider(fc_.pred_distn) ``` Now we can plot the results on top of the latest case rates. As before, we will focus on the results for FL and CA for simplicity.
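For reference, a minimal sketch of the `pivot_quantiles()` split introduced by this patch, adapted from the examples in `man/pivot_quantiles_wider.Rd` and `man/pivot_quantiles_longer.Rd` (it assumes a build of epipredict that includes these changes):

```r
library(epipredict)

# Two dist_quantiles columns with one distribution per row, as in the man pages
d1 <- c(dist_quantiles(1:3, 1:3 / 4), dist_quantiles(2:4, 1:3 / 4))
d2 <- c(dist_quantiles(2:4, 2:4 / 5), dist_quantiles(3:5, 2:4 / 5))
tib <- tibble::tibble(g = c("a", "b"), d1 = d1, d2 = d2)

# Wider: one column per quantile level, handy for ggplot2 ribbons
pivot_quantiles_wider(tib, d2)

# Longer: one row per quantile level; with multiple selected columns,
# the new columns are prefixed with the originating column name
pivot_quantiles_longer(tib, d1, d2)
```

Both helpers return an object of the same class as their input, so the result can be piped straight into plotting code, as in `vignettes/articles/sliding.Rmd` above.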