diff --git a/NAMESPACE b/NAMESPACE
index 5ee13f73..e7e720f1 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -242,6 +242,7 @@ importFrom(dplyr,filter)
 importFrom(dplyr,full_join)
 importFrom(dplyr,group_by)
 importFrom(dplyr,group_by_at)
+importFrom(dplyr,inner_join)
 importFrom(dplyr,join_by)
 importFrom(dplyr,left_join)
 importFrom(dplyr,mutate)
@@ -273,6 +274,7 @@ importFrom(hardhat,extract_recipe)
 importFrom(hardhat,refresh_blueprint)
 importFrom(hardhat,run_mold)
 importFrom(magrittr,"%>%")
+importFrom(magrittr,extract2)
 importFrom(recipes,bake)
 importFrom(recipes,detect_step)
 importFrom(recipes,prep)
diff --git a/NEWS.md b/NEWS.md
index 3fcf4dc0..8be7d7c0 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -15,6 +15,9 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.0.x will indicat

 ## Improvements
 - Add `step_adjust_latency`, which give several methods to adjust the forecast if the `forecast_date` is after the last day of data.
+- Fix `layer_population_scaling` default `by` with `other_keys`.
+- Make key column inference more consistent within the package and with current `epiprocess`.
+- Fix `quantile_reg()` producing error when asked to output just median-level predictions.
 - (temporary) ahead negative is allowed for `step_epi_ahead` until we have `step_epi_shift`

 ## Bug fixes
diff --git a/R/autoplot.R b/R/autoplot.R
index 4f422297..870dcb8d 100644
--- a/R/autoplot.R
+++ b/R/autoplot.R
@@ -127,11 +127,10 @@ autoplot.epi_workflow <- function(
   if (!is.null(shift)) {
     edf <- mutate(edf, time_value = time_value + shift)
   }
-  extra_keys <- setdiff(key_colnames(object), c("geo_value", "time_value"))
-  if (length(extra_keys) == 0L) extra_keys <- NULL
+  other_keys <- setdiff(key_colnames(object), c("geo_value", "time_value"))
   edf <- as_epi_df(edf,
     as_of = object$fit$meta$as_of,
-    other_keys = extra_keys %||% character()
+    other_keys = other_keys
   )
   if (is.null(predictions)) {
     return(autoplot(
diff --git a/R/epipredict-package.R b/R/epipredict-package.R
index 3dee263e..b4b9973b 100644
--- a/R/epipredict-package.R
+++ b/R/epipredict-package.R
@@ -7,7 +7,9 @@
 #' @importFrom cli cli_abort cli_warn
 #' @importFrom dplyr arrange across all_of any_of bind_cols bind_rows group_by
 #' @importFrom dplyr full_join relocate summarise everything
+#' @importFrom dplyr inner_join
 #' @importFrom dplyr summarize filter mutate select left_join rename ungroup
+#' @importFrom magrittr extract2
 #' @importFrom rlang := !! %||% as_function global_env set_names !!! caller_arg
 #' @importFrom rlang is_logical is_true inject enquo enquos expr sym arg_match
 #' @importFrom stats poly predict lm residuals quantile
diff --git a/R/key_colnames.R b/R/key_colnames.R
index b9ebde5d..9e0d44dc 100644
--- a/R/key_colnames.R
+++ b/R/key_colnames.R
@@ -1,20 +1,25 @@
 #' @export
-key_colnames.recipe <- function(x, ...) {
+key_colnames.recipe <- function(x, ..., exclude = character()) {
   geo_key <- x$var_info$variable[x$var_info$role %in% "geo_value"]
   time_key <- x$var_info$variable[x$var_info$role %in% "time_value"]
   keys <- x$var_info$variable[x$var_info$role %in% "key"]
-  c(geo_key, keys, time_key) %||% character(0L)
+  full_key <- c(geo_key, keys, time_key) %||% character(0L)
+  full_key[!full_key %in% exclude]
 }

 #' @export
-key_colnames.epi_workflow <- function(x, ...) {
+key_colnames.epi_workflow <- function(x, ..., exclude = character()) {
   # safer to look at the mold than the preprocessor
   mold <- hardhat::extract_mold(x)
-  molded_names <- names(mold$extras$roles)
-  geo_key <- names(mold$extras$roles[molded_names %in% "geo_value"]$geo_value)
-  time_key <- names(mold$extras$roles[molded_names %in% "time_value"]$time_value)
-  keys <- names(mold$extras$roles[molded_names %in% "key"]$key)
-  c(geo_key, keys, time_key) %||% character(0L)
+  molded_roles <- mold$extras$roles
+  extras <- bind_cols(molded_roles$geo_value, molded_roles$key, molded_roles$time_value)
+  full_key <- names(extras)
+  if (length(full_key) == 0L) {
+    # No epikeytime role assignment; infer from all columns:
+    potential_keys <- c("geo_value", "time_value")
+    full_key <- potential_keys[potential_keys %in% names(bind_cols(molded_roles))]
+  }
+  full_key[!full_key %in% exclude]
 }

 kill_time_value <- function(v) {
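As a quick illustration of the new `exclude =` argument on both `key_colnames()` methods, a sketch patterned on `tests/testthat/test-key_colnames.R` further down in this diff; the recipe and engine here are illustrative, not taken from the diff:

    library(epipredict)
    wf <- epi_workflow(
      epi_recipe(epidatasets::covid_case_death_rates) %>%
        step_epi_lag(death_rate, lag = c(0, 7)) %>%
        step_epi_ahead(death_rate, ahead = 7),
      parsnip::linear_reg()
    ) %>%
      fit(data = epidatasets::covid_case_death_rates)
    key_colnames(wf)                        # c("geo_value", "time_value")
    key_colnames(wf, exclude = "geo_value") # "time_value"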
diff --git a/R/layer_population_scaling.R b/R/layer_population_scaling.R
index 7ec16882..badd073f 100644
--- a/R/layer_population_scaling.R
+++ b/R/layer_population_scaling.R
@@ -19,12 +19,17 @@
 #'   inverting the existing scaling.
 #' @param by A (possibly named) character vector of variables to join by.
 #'
-#' If `NULL`, the default, the function will perform a natural join, using all
-#' variables in common across the `epi_df` produced by the `predict()` call
-#' and the user-provided dataset.
-#' If columns in that `epi_df` and `df` have the same name (and aren't
-#' included in `by`), `.df` is added to the one from the user-provided data
-#' to disambiguate.
+#' If `NULL`, the default, the function will try to infer a reasonable set of
+#' columns. First, it will try to join by all variables in the test data with
+#' roles `"geo_value"`, `"key"`, or `"time_value"` that also appear in `df`;
+#' these roles are automatically set if you are using an `epi_df`, or you can
+#' use, e.g., `update_role`. If no such roles are set, it will try to perform a
+#' natural join, using variables in common between the training/test data and
+#' population data.
+#'
+#' If columns in the training/testing data and `df` have the same name (and
+#' aren't included in `by`), a `.df` suffix is added to the one from the
+#' user-provided data to disambiguate.
 #'
 #' To join by different variables on the `epi_df` and `df`, use a named vector.
 #' For example, `by = c("geo_value" = "states")` will match `epi_df$geo_value`
@@ -135,6 +140,26 @@ slather.layer_population_scaling <-
     )
     rlang::check_dots_empty()

+    if (is.null(object$by)) {
+      # Assume `layer_predict` has calculated the prediction keys and other
+      # layers don't change the prediction key colnames:
+      prediction_key_colnames <- names(components$keys)
+      lhs_potential_keys <- prediction_key_colnames
+      rhs_potential_keys <- colnames(select(object$df, !object$df_pop_col))
+      object$by <- intersect(lhs_potential_keys, rhs_potential_keys)
+      suggested_min_keys <- kill_time_value(lhs_potential_keys)
+      if (!all(suggested_min_keys %in% object$by)) {
+        cli_warn(c(
+          "{setdiff(suggested_min_keys, object$by)} {?was an/were} epikey column{?s} in the predictions,
+           but {?wasn't/weren't} found in the population `df`.",
+          "i" = "Defaulting to join by {object$by}",
+          ">" = "Double-check whether column names on the population `df` match those expected in your predictions",
+          ">" = "Consider using population data with breakdowns by {suggested_min_keys}",
+          ">" = "Manually specify `by =` to silence"
+        ), class = "epipredict__layer_population_scaling__default_by_missing_suggested_keys")
+      }
+    }
+
     object$by <- object$by %||% intersect(
       epi_keys_only(components$predictions),
       colnames(select(object$df, !object$df_pop_col))
     )
@@ -152,10 +177,12 @@ slather.layer_population_scaling <-
     suffix <- ifelse(object$create_new, object$suffix, "")
     col_to_remove <- setdiff(colnames(object$df), colnames(components$predictions))

-    components$predictions <- left_join(
+    components$predictions <- inner_join(
       components$predictions,
       object$df,
       by = object$by,
+      relationship = "many-to-one",
+      unmatched = c("error", "drop"),
       suffix = c("", ".df")
     ) %>%
       mutate(across(
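To see what the inferred default `by` gives end to end, a sketch with toy data loosely following the new tests in `test-population_scaling.R` later in this diff (values illustrative; assumes `dplyr` is attached):

    library(epipredict)
    library(dplyr)
    dat <- tibble(geo_value = c("ca", "ny"), time_value = 1, y = c(15, 77)) %>%
      as_epi_df()
    pop <- tibble(geo_value = c("ca", "ny"), population = c(5e6, 11e6))
    # No `by =` anywhere: both the step and the layer infer `by = "geo_value"`,
    # the only epikeytime-role column shared with `pop`.
    rec <- epi_recipe(dat) %>%
      step_population_scaling(y, df = pop, df_pop_col = "population") %>%
      step_epi_ahead(y_scaled, ahead = 0)
    f <- frosting() %>%
      layer_predict() %>%
      layer_population_scaling(.pred, df = pop, df_pop_col = "population")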
diff --git a/R/make_quantile_reg.R b/R/make_quantile_reg.R
index 9e653184..1388dd85 100644
--- a/R/make_quantile_reg.R
+++ b/R/make_quantile_reg.R
@@ -112,7 +112,7 @@ make_quantile_reg <- function() {
         # can't make a method because object is second
         out <- switch(type,
-          rq = dist_quantiles(unname(as.list(x)), object$quantile_levels), # one quantile
+          rq = dist_quantiles(unname(as.list(x)), object$tau), # one quantile
           rqs = {
             x <- lapply(vctrs::vec_chop(x), function(x) sort(drop(x)))
             dist_quantiles(x, list(object$tau))
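This `rq`-vs-`rqs` branch is what the median-only NEWS item refers to: with a single quantile level, `quantreg::rq()` returns an object of class `"rq"`, and the predictions are now wrapped with the fit's `tau`. A minimal check, as a sketch on toy data:

    library(epipredict)
    toy <- data.frame(x = 1:20, y = 1:20 + rnorm(20))
    median_fit <- fit(quantile_reg(quantile_levels = 0.5), y ~ x, data = toy)
    predict(median_fit, new_data = toy[1:3, ]) # previously errored; now one median per row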
diff --git a/R/step_population_scaling.R b/R/step_population_scaling.R
index 7bdb0092..6047f5c4 100644
--- a/R/step_population_scaling.R
+++ b/R/step_population_scaling.R
@@ -16,12 +16,17 @@
 #'   inverting the existing scaling.
 #' @param by A (possibly named) character vector of variables to join by.
 #'
-#' If `NULL`, the default, the function will perform a natural join, using all
-#' variables in common across the `epi_df` produced by the `predict()` call
-#' and the user-provided dataset.
-#' If columns in that `epi_df` and `df` have the same name (and aren't
-#' included in `by`), `.df` is added to the one from the user-provided data
-#' to disambiguate.
+#' If `NULL`, the default, the function will try to infer a reasonable set of
+#' columns. First, it will try to join by all variables in the training/test
+#' data with roles `"geo_value"`, `"key"`, or `"time_value"` that also appear in
+#' `df`; these roles are automatically set if you are using an `epi_df`, or you
+#' can use, e.g., `update_role`. If no such roles are set, it will try to
+#' perform a natural join, using variables in common between the training/test
+#' data and population data.
+#'
+#' If columns in the training/testing data and `df` have the same name (and
+#' aren't included in `by`), a `.df` suffix is added to the one from the
+#' user-provided data to disambiguate.
 #'
 #' To join by different variables on the `epi_df` and `df`, use a named vector.
 #' For example, `by = c("geo_value" = "states")` will match `epi_df$geo_value`
@@ -29,7 +34,7 @@
 #' For example, `by = c("geo_value" = "states", "county" = "county")` will match
 #' `epi_df$geo_value` to `df$states` and `epi_df$county` to `df$county`.
 #'
-#' See [dplyr::left_join()] for more details.
+#' See [dplyr::inner_join()] for more details.
 #' @param df_pop_col the name of the column in the data frame `df` that
 #'   contains the population data and will be used for scaling.
 #'   This should be one column.
@@ -89,13 +94,25 @@ step_population_scaling <-
            suffix = "_scaled",
            skip = FALSE,
            id = rand_id("population_scaling")) {
-    arg_is_scalar(role, df_pop_col, rate_rescaling, create_new, suffix, id)
-    arg_is_lgl(create_new, skip)
-    arg_is_chr(df_pop_col, suffix, id)
+    if (rlang::dots_n(...) == 0L) {
+      cli_abort(c(
+        "`...` must not be empty.",
+        ">" = "Please provide one or more tidyselect expressions in `...`
+               specifying the columns to which scaling should be applied.",
+        ">" = "If you really want to list `step_population_scaling` in your
+               recipe but not have it do anything, you can use a tidyselection
+               that selects zero variables, such as `c()`."
+      ))
+    }
+    arg_is_scalar(role, df_pop_col, rate_rescaling, create_new, suffix, skip, id)
+    arg_is_chr(role, df_pop_col, suffix, id)
+    hardhat::validate_column_names(df, df_pop_col)
     arg_is_chr(by, allow_null = TRUE)
+    arg_is_numeric(rate_rescaling)
     if (rate_rescaling <= 0) {
       cli_abort("`rate_rescaling` must be a positive number.")
     }
+    arg_is_lgl(create_new, skip)

     recipes::add_step(
       recipe,
@@ -138,6 +155,42 @@ step_population_scaling_new <-

 #' @export
 prep.step_population_scaling <- function(x, training, info = NULL, ...) {
+  if (is.null(x$by)) {
+    rhs_potential_keys <- setdiff(colnames(x$df), x$df_pop_col)
+    lhs_potential_keys <- info %>%
+      filter(role %in% c("geo_value", "key", "time_value")) %>%
+      extract2("variable") %>%
+      unique() # in case of weird var with multiple of above roles
+    if (length(lhs_potential_keys) == 0L) {
+      # We're working with a recipe and tibble, and *_role hasn't set up any of
+      # the above roles. Let's say any column could actually act as a key, and
+      # lean on `intersect` below to make this something reasonable.
+      lhs_potential_keys <- names(training)
+    }
+    suggested_min_keys <- info %>%
+      filter(role %in% c("geo_value", "key")) %>%
+      extract2("variable") %>%
+      unique()
+    # (0 suggested keys if we weren't given any epikeytime var info.)
+    x$by <- intersect(lhs_potential_keys, rhs_potential_keys)
+    if (length(x$by) == 0L) {
+      cli_abort(c(
+        "Couldn't guess a default for `by`.",
+        ">" = "Please rename columns in your population data to match those in your training data,
+               or manually specify `by =` in `step_population_scaling()`."
+      ), class = "epipredict__step_population_scaling__default_by_no_intersection")
+    }
+    if (!all(suggested_min_keys %in% x$by)) {
+      cli_warn(c(
+        "{setdiff(suggested_min_keys, x$by)} {?was an/were} epikey column{?s} in the training data,
+         but {?wasn't/weren't} found in the population `df`.",
+        "i" = "Defaulting to join by {x$by}.",
+        ">" = "Double-check whether column names on the population `df` match those for your training data.",
+        ">" = "Consider using population data with breakdowns by {suggested_min_keys}.",
+        ">" = "Manually specify `by =` to silence."
+      ), class = "epipredict__step_population_scaling__default_by_missing_suggested_keys")
+    }
+  }
   step_population_scaling_new(
     terms = x$terms,
     role = x$role,
@@ -156,10 +209,14 @@ prep.step_population_scaling <- function(x, training, info = NULL, ...) {

 #' @export
 bake.step_population_scaling <- function(object, new_data, ...) {
-  object$by <- object$by %||% intersect(
-    epi_keys_only(new_data),
-    colnames(select(object$df, !object$df_pop_col))
-  )
+  if (is.null(object$by)) {
+    cli::cli_abort(c(
+      "`by` was not set and no default was filled in.",
+      ">" = "If this was a fit recipe generated from an older version
+             of epipredict that you loaded in from a file,
+             please regenerate with the current version of epipredict."
+    ))
+  }
   joinby <- list(x = names(object$by) %||% object$by, y = object$by)
   hardhat::validate_column_names(new_data, joinby$x)
   hardhat::validate_column_names(object$df, joinby$y)
@@ -177,7 +234,10 @@ bake.step_population_scaling <- function(object, new_data, ...) {
   suffix <- ifelse(object$create_new, object$suffix, "")
   col_to_remove <- setdiff(colnames(object$df), colnames(new_data))

-  left_join(new_data, object$df, by = object$by, suffix = c("", ".df")) %>%
+  inner_join(new_data, object$df,
+    by = object$by, relationship = "many-to-one", unmatched = c("error", "drop"),
+    suffix = c("", ".df")
+  ) %>%
     mutate(
       across(
         all_of(object$columns),
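A sketch of the new prep-time inference plus its guardrail, mirroring the `dat1b`/`pop1b` test later in this diff: the training data carries an `age_group` key that the population table lacks, so `prep()` succeeds but warns that the join is coarser than the full epikey:

    library(epipredict)
    library(dplyr)
    dat <- tibble(geo_value = 1, age_group = 1:2, time_value = 1, y = c(15, 77)) %>%
      as_epi_df(other_keys = "age_group")
    pop <- tibble(geo_value = 1, population = 5e6)
    rec <- epi_recipe(dat) %>%
      step_population_scaling(y, df = pop, df_pop_col = "population")
    prep(rec, dat)
    # Warning: age_group was an epikey column in the training data, but wasn't
    # found in the population `df`. Defaulting to join by geo_value. ...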
diff --git a/R/utils-misc.R b/R/utils-misc.R
index a1e0f025..fec70791 100644
--- a/R/utils-misc.R
+++ b/R/utils-misc.R
@@ -31,30 +31,76 @@ check_pname <- function(res, preds, object, newname = NULL) {
   res
 }

+# Copied from `epiprocess`:
+
+#' "Format" a character vector of column/variable names for cli interpolation
+#'
+#' Designed to give good output if interpolated with cli. Main purpose is to add
+#' backticks around variable names when necessary, and something other than an
+#' empty string if length 0.
+#'
+#' @param x `chr`; e.g., `colnames` of some data frame
+#' @param empty string; what should be output if `x` is of length 0?
+#' @return `chr`
+#' @keywords internal
+format_varnames <- function(x, empty = "*none*") {
+  if (length(x) == 0L) {
+    empty
+  } else {
+    as.character(syms(x))
+  }
+}

 grab_forged_keys <- function(forged, workflow, new_data) {
-  forged_roles <- names(forged$extras$roles)
-  extras <- bind_cols(forged$extras$roles[forged_roles %in% c("geo_value", "time_value", "key")])
-  # 1. these are the keys in the test data after prep/bake
-  new_keys <- names(extras)
-  # 2. these are the keys in the training data
+  # 1. keys in the training data post-prep, based on roles:
   old_keys <- key_colnames(workflow)
-  # 3. these are the keys in the test data as input
-  new_df_keys <- key_colnames(new_data, extra_keys = setdiff(new_keys, c("geo_value", "time_value")))
-  if (!(setequal(old_keys, new_df_keys) && setequal(new_keys, new_df_keys))) {
-    cli_warn(paste(
-      "Not all epi keys that were present in the training data are available",
-      "in `new_data`. Predictions will have only the available keys."
+  # 2. keys in the test data post-bake, based on roles & structure:
+  forged_roles <- forged$extras$roles
+  new_key_tbl <- bind_cols(forged_roles$geo_value, forged_roles$key, forged_roles$time_value)
+  new_keys <- names(new_key_tbl)
+  if (length(new_keys) == 0L) {
+    # No epikeytime role assignment; infer from all columns:
+    potential_new_keys <- c("geo_value", "time_value")
+    forged_tbl <- bind_cols(forged$extras$roles)
+    new_keys <- potential_new_keys[potential_new_keys %in% names(forged_tbl)]
+    new_key_tbl <- forged_tbl[new_keys]
+  }
+  # Softly validate:
+  if (!(setequal(old_keys, new_keys))) {
+    cli_warn(c(
+      "Inconsistent epikeytime identifier columns specified/inferred in training vs. in testing data.",
+      "i" = "training epikeytime columns, based on roles post-mold/prep: {format_varnames(old_keys)}",
+      "i" = "testing epikeytime columns, based on roles post-forge/bake: {format_varnames(new_keys)}",
+      "*" = "",
+      ">" = 'Some mismatches can be addressed by using `epi_df`s instead of tibbles, or by using `update_role`
+             to assign pre-`prep` columns the "geo_value", "key", and "time_value" roles.'
     ))
   }
-  if (is_epi_df(new_data)) {
-    meta <- attr(new_data, "metadata")
-    extras <- as_epi_df(extras, as_of = meta$as_of, other_keys = meta$other_keys %||% character())
-  } else if (all(c("geo_value", "time_value") %in% new_keys)) {
-    if (length(new_keys) > 2) other_keys <- new_keys[!new_keys %in% c("geo_value", "time_value")]
-    extras <- as_epi_df(extras, other_keys = other_keys %||% character())
+  # Convert `new_key_tbl` to `epi_df` if not renaming columns nor violating
+  # `epi_df` invariants. Require that our key is a unique key in any case.
+  if (all(c("geo_value", "time_value") %in% new_keys)) {
+    maybe_as_of <- attr(new_data, "metadata")$as_of # NULL if wasn't epi_df
+    try(return(as_epi_df(new_key_tbl, other_keys = new_keys, as_of = maybe_as_of)),
+      silent = TRUE
+    )
+  }
+  if (anyDuplicated(new_key_tbl)) {
+    duplicate_key_tbl <- new_key_tbl %>% filter(.by = everything(), dplyr::n() > 1L)
+    error_part1 <- cli::format_error(
+      c(
+        "Specified/inferred key columns had repeated combinations in the forged/baked test data.",
+        "i" = "Key columns: {format_varnames(new_keys)}",
+        "Duplicated keys:"
+      )
+    )
+    error_part2 <- capture.output(print(duplicate_key_tbl))
+    rlang::abort(
+      paste(collapse = "\n", c(error_part1, error_part2)),
+      class = "epipredict__grab_forged_keys__nonunique_key"
+    )
+  } else {
+    return(new_key_tbl)
   }
-  extras
 }

 get_parsnip_mode <- function(trainer) {
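What `format_varnames()` yields when interpolated; it is internal (hence `:::` in this sketch), and the backticks come from deparsing non-syntactic names:

    epipredict:::format_varnames(c("geo_value", "age group"))
    #> [1] "geo_value"   "`age group`"
    epipredict:::format_varnames(character(0))
    #> [1] "*none*"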
diff --git a/inst/other-vignettes/symptom-surveys.Rmd b/inst/other-vignettes/symptom-surveys.Rmd
index af692726..cf801357 100644
--- a/inst/other-vignettes/symptom-surveys.Rmd
+++ b/inst/other-vignettes/symptom-surveys.Rmd
@@ -155,6 +155,7 @@ library(purrr)
 library(epipredict)
 library(recipes)

+case_num <- 200
 z <- epidatasets::county_smoothed_cli_comparison
 ```
diff --git a/man/format_varnames.Rd b/man/format_varnames.Rd
new file mode 100644
index 00000000..59fe218a
--- /dev/null
+++ b/man/format_varnames.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils-misc.R
+\name{format_varnames}
+\alias{format_varnames}
+\title{"Format" a character vector of column/variable names for cli interpolation}
+\usage{
+format_varnames(x, empty = "*none*")
+}
+\arguments{
+\item{x}{\code{chr}; e.g., \code{colnames} of some data frame}
+
+\item{empty}{string; what should be output if \code{x} is of length 0?}
+}
+\value{
+\code{chr}
+}
+\description{
+Designed to give good output if interpolated with cli. Main purpose is to add
+backticks around variable names when necessary, and something other than an
+empty string if length 0.
+}
+\keyword{internal}
diff --git a/man/layer_population_scaling.Rd b/man/layer_population_scaling.Rd
index 3a22dbee..083dd6b4 100644
--- a/man/layer_population_scaling.Rd
+++ b/man/layer_population_scaling.Rd
@@ -28,12 +28,17 @@ inverting the existing scaling.}

 \item{by}{A (possibly named) character vector of variables to join by.

-If \code{NULL}, the default, the function will perform a natural join, using all
-variables in common across the \code{epi_df} produced by the \code{predict()} call
-and the user-provided dataset.
-If columns in that \code{epi_df} and \code{df} have the same name (and aren't
-included in \code{by}), \code{.df} is added to the one from the user-provided data
-to disambiguate.
+If \code{NULL}, the default, the function will try to infer a reasonable set of
+columns. First, it will try to join by all variables in the test data with
+roles \code{"geo_value"}, \code{"key"}, or \code{"time_value"} that also appear in \code{df};
+these roles are automatically set if you are using an \code{epi_df}, or you can
+use, e.g., \code{update_role}. If no such roles are set, it will try to perform a
+natural join, using variables in common between the training/test data and
+population data.
+
+If columns in the training/testing data and \code{df} have the same name (and
+aren't included in \code{by}), a \code{.df} suffix is added to the one from the
+user-provided data to disambiguate.

 To join by different variables on the \code{epi_df} and \code{df}, use a named vector.
 For example, \code{by = c("geo_value" = "states")} will match \code{epi_df$geo_value}
diff --git a/man/step_population_scaling.Rd b/man/step_population_scaling.Rd
index 6a0c5dea..c41ec7fc 100644
--- a/man/step_population_scaling.Rd
+++ b/man/step_population_scaling.Rd
@@ -33,12 +33,17 @@ inverting the existing scaling.}

 \item{by}{A (possibly named) character vector of variables to join by.

-If \code{NULL}, the default, the function will perform a natural join, using all
-variables in common across the \code{epi_df} produced by the \code{predict()} call
-and the user-provided dataset.
-If columns in that \code{epi_df} and \code{df} have the same name (and aren't
-included in \code{by}), \code{.df} is added to the one from the user-provided data
-to disambiguate.
+If \code{NULL}, the default, the function will try to infer a reasonable set of
+columns. First, it will try to join by all variables in the training/test
+data with roles \code{"geo_value"}, \code{"key"}, or \code{"time_value"} that also appear in
+\code{df}; these roles are automatically set if you are using an \code{epi_df}, or you
+can use, e.g., \code{update_role}. If no such roles are set, it will try to
+perform a natural join, using variables in common between the training/test
+data and population data.
+
+If columns in the training/testing data and \code{df} have the same name (and
+aren't included in \code{by}), a \code{.df} suffix is added to the one from the
+user-provided data to disambiguate.

 To join by different variables on the \code{epi_df} and \code{df}, use a named vector.
 For example, \code{by = c("geo_value" = "states")} will match \code{epi_df$geo_value}
@@ -46,7 +51,7 @@ to \code{df$states}.
 To join by multiple variables, use a vector with length > 1
 For example, \code{by = c("geo_value" = "states", "county" = "county")} will match
 \code{epi_df$geo_value} to \code{df$states} and \code{epi_df$county} to \code{df$county}.

-See \code{\link[dplyr:mutate-joins]{dplyr::left_join()}} for more details.}
+See \code{\link[dplyr:mutate-joins]{dplyr::inner_join()}} for more details.}

 \item{df_pop_col}{the name of the column in the data frame \code{df} that
 contains the population data and will be used for scaling.
diff --git a/tests/testthat/test-key_colnames.R b/tests/testthat/test-key_colnames.R
index d94daaec..021bbb50 100644
--- a/tests/testthat/test-key_colnames.R
+++ b/tests/testthat/test-key_colnames.R
@@ -17,6 +17,9 @@ test_that("key_colnames extracts time_value and geo_value, but not raw", {
     fit(data = covid_case_death_rates)

   expect_identical(key_colnames(my_workflow), c("geo_value", "time_value"))
+
+  # `exclude =` works:
+  expect_identical(key_colnames(my_workflow, exclude = "geo_value"), c("time_value"))
 })

 test_that("key_colnames extracts additional keys when they are present", {
@@ -49,4 +52,7 @@ test_that("key_colnames extracts additional keys when they are present", {

   # order of the additional keys may be different
   expect_equal(key_colnames(my_workflow), c("geo_value", "state", "pol", "time_value"))
+
+  # `exclude =` works:
+  expect_equal(key_colnames(my_workflow, exclude = c("time_value", "pol")), c("geo_value", "state"))
 })
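The named-`by` form documented above always bypasses the inference; a sketch with made-up population column names:

    library(epipredict)
    library(dplyr)
    dat <- tibble(geo_value = c("ca", "ny"), time_value = 1, cases = c(390, 190)) %>%
      as_epi_df()
    pops <- tibble(states = c("ca", "ny"), pop = c(39e6, 19e6))
    rec <- epi_recipe(dat) %>%
      step_population_scaling(cases,
        df = pops, df_pop_col = "pop",
        by = c("geo_value" = "states")
      )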
diff --git a/tests/testthat/test-population_scaling.R b/tests/testthat/test-population_scaling.R
index 1f356c0c..6e4cd5df 100644
--- a/tests/testthat/test-population_scaling.R
+++ b/tests/testthat/test-population_scaling.R
@@ -304,6 +304,296 @@ test_that("test joining by default columns", {
 })


+test_that("test joining by default columns with less common keys/classes", {
+  # Make a model spec that expects no predictor columns and outputs a fixed
+  # (rate) prediction. Based on combining two linear inequalities.
+  fixed_rate_prediction <- 2e-6
+  model_spec <- quantile_reg(quantile_levels = 0.5, method = "fnc") %>%
+    set_engine(
+      "rq",
+      R = matrix(c(1, -1), 2, 1), r = c(1, -1) * fixed_rate_prediction,
+      eps = fixed_rate_prediction * 1e-6 # prevent early stop
+    )
+
+  # Here's the typical setup
+  dat1 <- tibble::tibble(geo_value = 1:2, time_value = 1, y = c(3 * 5, 7 * 11)) %>%
+    as_epi_df()
+  pop1 <- tibble::tibble(geo_value = 1:2, population = c(5e6, 11e6))
+  ewf1 <- epi_workflow(
+    epi_recipe(dat1) %>%
+      step_population_scaling(y, df = pop1, df_pop_col = "population") %>%
+      step_epi_ahead(y_scaled, ahead = 0),
+    model_spec,
+    frosting() %>%
+      layer_predict() %>%
+      layer_population_scaling(.pred, df = pop1, df_pop_col = "population", create_new = FALSE)
+  )
+  expect_equal(
+    extract_recipe(ewf1, estimated = FALSE) %>%
+      prep(dat1) %>%
+      bake(new_data = NULL),
+    dat1 %>%
+      mutate(y_scaled = c(3e-6, 7e-6), ahead_0_y_scaled = y_scaled)
+  )
+  expect_equal(
+    forecast(fit(ewf1, dat1)) %>%
+      pivot_quantiles_wider(.pred),
+    dat1 %>%
+      select(!"y") %>%
+      as_tibble() %>%
+      mutate(`0.5` = c(2 * 5, 2 * 11))
+  )
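The constrained-`rq` trick in `model_spec` above may not be obvious: `method = "fnc"` makes `quantreg::rq()` honor linear inequality constraints `R %*% beta >= r`, and the two rows of `R` encode `beta >= p` and `-beta >= -p`, pinning the intercept-only fit to exactly `fixed_rate_prediction`. A standalone sketch, assuming only `quantreg`:

    library(quantreg)
    p <- 2e-6
    d <- data.frame(y = rnorm(10))
    # Intercept-only median regression whose coefficient is constrained to p:
    rq_fit <- rq(y ~ 1,
      tau = 0.5, data = d, method = "fnc",
      R = matrix(c(1, -1), 2, 1), r = c(1, -1) * p
    )
    coef(rq_fit) # ~= p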
+
+  # With geo x age in time series but only geo in population data:
+  dat1b <- dat1 %>%
+    as_tibble() %>%
+    mutate(age_group = geo_value, geo_value = 1) %>%
+    as_epi_df(other_keys = "age_group")
+  pop1b <- pop1
+  ewf1b <- epi_workflow(
+    epi_recipe(dat1b) %>%
+      step_population_scaling(y, df = pop1b, df_pop_col = "population") %>%
+      step_epi_ahead(y_scaled, ahead = 0),
+    model_spec,
+    frosting() %>%
+      layer_predict() %>%
+      layer_population_scaling(.pred, df = pop1b, df_pop_col = "population", create_new = FALSE)
+  )
+  expect_warning(
+    expect_equal(
+      extract_recipe(ewf1b, estimated = FALSE) %>%
+        prep(dat1b) %>%
+        bake(new_data = NULL),
+      dat1b %>%
+        # geo 1 scaling used for both:
+        mutate(y_scaled = c(3e-6, 7 * 11 / 5e6), ahead_0_y_scaled = y_scaled)
+    ),
+    class = "epipredict__step_population_scaling__default_by_missing_suggested_keys"
+  )
+  expect_warning(
+    expect_warning(
+      expect_equal(
+        forecast(fit(ewf1b, dat1b)) %>%
+          pivot_quantiles_wider(.pred),
+        dat1b %>%
+          select(!"y") %>%
+          as_tibble() %>%
+          # geo 1 scaling used for both:
+          mutate(`0.5` = c(2 * 5, 2 * 5))
+      ),
+      class = "epipredict__step_population_scaling__default_by_missing_suggested_keys"
+    ),
+    class = "epipredict__layer_population_scaling__default_by_missing_suggested_keys"
+  )
+
+  # Same thing but with time series in tibble, but with role hints:
+  dat1b2 <- dat1 %>%
+    as_tibble() %>%
+    mutate(age_group = geo_value, geo_value = 1)
+  pop1b2 <- pop1b
+  ewf1b2 <- epi_workflow(
+    # Can't use epi_recipe or step_epi_ahead; adjust.
+    recipe(dat1b2) %>%
+      update_role("geo_value", new_role = "geo_value") %>%
+      update_role("age_group", new_role = "key") %>%
+      update_role("time_value", new_role = "time_value") %>%
+      step_population_scaling(y, df = pop1b2, df_pop_col = "population", role = "outcome"),
+    model_spec,
+    frosting() %>%
+      layer_predict() %>%
+      layer_population_scaling(.pred, df = pop1b2, df_pop_col = "population", create_new = FALSE)
+  )
+  expect_warning(
+    expect_equal(
+      extract_recipe(ewf1b2, estimated = FALSE) %>%
+        prep(dat1b2) %>%
+        bake(new_data = NULL),
+      dat1b2 %>%
+        # geo 1 scaling used for both:
+        mutate(y_scaled = c(3e-6, 7 * 11 / 5e6))
+    ),
+    class = "epipredict__step_population_scaling__default_by_missing_suggested_keys"
+  )
+  expect_warning(
+    expect_warning(
+      expect_equal(
+        # get_test_data doesn't work with non-`epi_df`s, so provide test data manually:
+        predict(fit(ewf1b2, dat1b2), dat1b2) %>%
+          pivot_quantiles_wider(.pred),
+        dat1b2 %>%
+          select(!"y") %>%
+          as_tibble() %>%
+          # geo 1 scaling used for both:
+          mutate(`0.5` = c(2 * 5, 2 * 5)) %>%
+          select(geo_value, age_group, time_value, `0.5`)
+      ),
+      class = "epipredict__step_population_scaling__default_by_missing_suggested_keys"
+    ),
+    class = "epipredict__layer_population_scaling__default_by_missing_suggested_keys"
+  )
+
+  # Same thing but with time series in tibble, but no role hints -> incorrect
+  # key guess -> we prep without warning, but error when we realize we don't
+  # actually have a unique key for our predictions:
+  dat1b3 <- dat1b2
+  pop1b3 <- pop1b2
+  ewf1b3 <- epi_workflow(
+    # Can't use epi_recipe or step_epi_ahead; adjust.
+    recipe(dat1b3) %>%
+      step_population_scaling(y, df = pop1b3, df_pop_col = "population", role = "outcome"),
+    model_spec,
+    frosting() %>%
+      layer_predict() %>%
+      layer_population_scaling(.pred, df = pop1b3, df_pop_col = "population", create_new = FALSE)
+  )
+  expect_equal(
+    extract_recipe(ewf1b3, estimated = FALSE) %>%
+      prep(dat1b3) %>%
+      bake(new_data = NULL),
+    dat1b3 %>%
+      # geo 1 scaling used for both:
+      mutate(y_scaled = c(3e-6, 7 * 11 / 5e6))
+  )
+  expect_error(
+    predict(fit(ewf1b3, dat1b3), dat1b3) %>%
+      pivot_quantiles_wider(.pred),
+    class = "epipredict__grab_forged_keys__nonunique_key"
+  )
+
+  # With geo x age_group breakdown on both:
+  dat2 <- dat1 %>%
+    as_tibble() %>%
+    mutate(age_group = geo_value, geo_value = 1) %>%
+    as_epi_df(other_keys = "age_group")
+  pop2 <- pop1 %>%
+    mutate(age_group = geo_value, geo_value = 1)
+  ewf2 <- epi_workflow(
+    epi_recipe(dat2) %>%
+      step_population_scaling(y, df = pop2, df_pop_col = "population") %>%
+      step_epi_ahead(y_scaled, ahead = 0),
+    model_spec,
+    frosting() %>%
+      layer_predict() %>%
+      layer_population_scaling(.pred, df = pop2, df_pop_col = "population", create_new = FALSE)
+  )
+  expect_equal(
+    extract_recipe(ewf2, estimated = FALSE) %>%
+      prep(dat2) %>%
+      bake(new_data = NULL),
+    dat2 %>%
+      mutate(y_scaled = c(3e-6, 7e-6), ahead_0_y_scaled = y_scaled)
+  )
+  expect_equal(
+    forecast(fit(ewf2, dat2)) %>%
+      pivot_quantiles_wider(.pred),
+    dat2 %>%
+      select(!"y") %>%
+      as_tibble() %>%
+      mutate(`0.5` = c(2 * 5, 2 * 11))
+  )
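The `dat1b3` case above is exactly what the new hard error in `grab_forged_keys()` guards against: with no role hints, the inferred key is just (`geo_value`, `time_value`), which no longer uniquely identifies rows once `age_group` is the real discriminator. In miniature (assumes `dplyr`):

    library(dplyr)
    dup <- tibble(geo_value = 1, time_value = 1, age_group = 1:2)
    anyDuplicated(dup[c("geo_value", "time_value")]) # nonzero -> nonunique key -> abort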
+
+  # With only an age column in population data:
+  dat2b <- dat2
+  pop2b <- pop1 %>%
+    mutate(age_group = geo_value, geo_value = NULL)
+  ewf2b <- epi_workflow(
+    epi_recipe(dat2b) %>%
+      step_population_scaling(y, df = pop2b, df_pop_col = "population") %>%
+      step_epi_ahead(y_scaled, ahead = 0),
+    model_spec,
+    frosting() %>%
+      layer_predict() %>%
+      layer_population_scaling(.pred, df = pop2b, df_pop_col = "population", create_new = FALSE)
+  )
+  expect_warning(
+    expect_equal(
+      extract_recipe(ewf2b, estimated = FALSE) %>%
+        prep(dat2b) %>%
+        bake(new_data = NULL),
+      dat2b %>%
+        mutate(y_scaled = c(3e-6, 7e-6), ahead_0_y_scaled = y_scaled)
+    ),
+    class = "epipredict__step_population_scaling__default_by_missing_suggested_keys"
+  )
+  expect_warning(
+    expect_warning(
+      expect_equal(
+        forecast(fit(ewf2b, dat2b)) %>%
+          pivot_quantiles_wider(.pred),
+        dat2b %>%
+          select(!"y") %>%
+          as_tibble() %>%
+          mutate(`0.5` = c(2 * 5, 2 * 11))
+      ),
+      class = "epipredict__step_population_scaling__default_by_missing_suggested_keys"
+    ),
+    class = "epipredict__layer_population_scaling__default_by_missing_suggested_keys"
+  )
+
+  # with geo x time_value breakdown instead:
+  dat3 <- dat1 %>%
+    as_tibble() %>%
+    mutate(time_value = geo_value, geo_value = 1) %>%
+    as_epi_df()
+  pop3 <- pop1 %>%
+    mutate(time_value = geo_value, geo_value = 1)
+  ewf3 <- epi_workflow(
+    epi_recipe(dat3) %>%
+      step_population_scaling(y, df = pop3, df_pop_col = "population") %>%
+      step_epi_ahead(y_scaled, ahead = 0),
+    model_spec,
+    frosting() %>%
+      layer_predict() %>%
+      layer_population_scaling(.pred, df = pop3, df_pop_col = "population", create_new = FALSE)
+  )
+  expect_equal(
+    extract_recipe(ewf3, estimated = FALSE) %>%
+      prep(dat3) %>%
+      bake(new_data = NULL),
+    dat3 %>%
+      mutate(y_scaled = c(3e-6, 7e-6), ahead_0_y_scaled = y_scaled)
+  )
+  expect_equal(
+    forecast(fit(ewf3, dat3)) %>%
+      pivot_quantiles_wider(.pred),
+    # slightly edited copy-pasta due to test time selection:
+    dat3 %>%
+      select(!"y") %>%
+      as_tibble() %>%
+      slice_max(by = geo_value, time_value) %>%
+      mutate(`0.5` = 2 * 11)
+  )
+
+  # With alternative geo naming... we're able to successfully prep (and we're
+  # not missing a warning as in 1b3 since we're actually in a "correct" setup),
+  # but still like 1b3, we fail at prediction time as key roles have not been
+  # set up and inference fails:
+  dat4 <- dat1 %>% rename(geo = geo_value)
+  pop4 <- pop1 %>% rename(geo = geo_value)
+  ewf4 <- epi_workflow(
+    recipe(dat4) %>%
+      step_population_scaling(y, df = pop4, df_pop_col = "population", role = "outcome"),
+    model_spec,
+    frosting() %>%
+      layer_predict() %>%
+      layer_population_scaling(.pred, df = pop4, df_pop_col = "population", create_new = FALSE)
+  )
+  expect_equal(
+    extract_recipe(ewf4, estimated = FALSE) %>%
+      prep(dat4) %>%
+      bake(new_data = NULL),
+    dat4 %>%
+      mutate(y_scaled = c(3e-6, 7e-6))
+  )
+  expect_error(
+    # get_test_data doesn't work with non-`epi_df`s, so provide test data manually:
+    predict(fit(ewf4, dat4), dat4) %>%
+      pivot_quantiles_wider(.pred),
+    class = "epipredict__grab_forged_keys__nonunique_key"
+  )
+})
+
+
 test_that("expect error if `by` selector does not match", {
   jhu <- covid_case_death_rates %>%
     dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ca", "ny")) %>%
diff --git a/vignettes/articles/smooth-qr.Rmd b/vignettes/articles/smooth-qr.Rmd
index 89a23c33..ec07272a 100644
--- a/vignettes/articles/smooth-qr.Rmd
+++ b/vignettes/articles/smooth-qr.Rmd
@@ -97,7 +97,7 @@ state cases and deaths. This sample data ranges from Dec. 31, 2020 to
 Dec. 31, 2021.

 ```{r}
-edf <- covid_case_death_rates
+edf <- epidatasets::covid_case_death_rates
 ```

 We will set the forecast date to be November 30, 2021 so that we can produce