From 3ab495600da832c4958a25548e26c1b48b18dcd2 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Tue, 22 Oct 2024 10:32:14 -0700 Subject: [PATCH 01/12] Change `key_colnames(extra_keys =)` to supported `other_keys =` Recent/current epiprocess versions silently ignore `extra_keys =`. Pending epiprocess changes will soft-deprecate and route it to `other_keys =` instead, plus add some stricter checks on `other_keys =`. --- R/utils-misc.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/utils-misc.R b/R/utils-misc.R index a1e0f025f..bb6c09e76 100644 --- a/R/utils-misc.R +++ b/R/utils-misc.R @@ -40,7 +40,7 @@ grab_forged_keys <- function(forged, workflow, new_data) { # 2. these are the keys in the training data old_keys <- key_colnames(workflow) # 3. these are the keys in the test data as input - new_df_keys <- key_colnames(new_data, extra_keys = setdiff(new_keys, c("geo_value", "time_value"))) + new_df_keys <- key_colnames(new_data, other_keys = setdiff(new_keys, c("geo_value", "time_value"))) if (!(setequal(old_keys, new_df_keys) && setequal(new_keys, new_df_keys))) { cli_warn(paste( "Not all epi keys that were present in the training data are available", From 06f54588457fd3d717e45d15646ea14e0b3b6d6c Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Tue, 22 Oct 2024 10:41:48 -0700 Subject: [PATCH 02/12] Add `key_colnames(exclude =)` support for epipredict objects --- R/key_colnames.R | 10 ++++++---- tests/testthat/test-key_colnames.R | 6 ++++++ 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/R/key_colnames.R b/R/key_colnames.R index b9ebde5dc..b8d07ce82 100644 --- a/R/key_colnames.R +++ b/R/key_colnames.R @@ -1,20 +1,22 @@ #' @export -key_colnames.recipe <- function(x, ...) { +key_colnames.recipe <- function(x, ..., exclude = character()) { geo_key <- x$var_info$variable[x$var_info$role %in% "geo_value"] time_key <- x$var_info$variable[x$var_info$role %in% "time_value"] keys <- x$var_info$variable[x$var_info$role %in% "key"] - c(geo_key, keys, time_key) %||% character(0L) + full_key <- c(geo_key, keys, time_key) %||% character(0L) + full_key[!full_key %in% exclude] } #' @export -key_colnames.epi_workflow <- function(x, ...) { +key_colnames.epi_workflow <- function(x, ..., exclude = character()) { # safer to look at the mold than the preprocessor mold <- hardhat::extract_mold(x) molded_names <- names(mold$extras$roles) geo_key <- names(mold$extras$roles[molded_names %in% "geo_value"]$geo_value) time_key <- names(mold$extras$roles[molded_names %in% "time_value"]$time_value) keys <- names(mold$extras$roles[molded_names %in% "key"]$key) - c(geo_key, keys, time_key) %||% character(0L) + full_key <- c(geo_key, keys, time_key) %||% character(0L) + full_key[!full_key %in% exclude] } kill_time_value <- function(v) { diff --git a/tests/testthat/test-key_colnames.R b/tests/testthat/test-key_colnames.R index d94daaec4..021bbb50c 100644 --- a/tests/testthat/test-key_colnames.R +++ b/tests/testthat/test-key_colnames.R @@ -17,6 +17,9 @@ test_that("key_colnames extracts time_value and geo_value, but not raw", { fit(data = covid_case_death_rates) expect_identical(key_colnames(my_workflow), c("geo_value", "time_value")) + + # `exclude =` works: + expect_identical(key_colnames(my_workflow, exclude = "geo_value"), c("time_value")) }) test_that("key_colnames extracts additional keys when they are present", { @@ -49,4 +52,7 @@ test_that("key_colnames extracts additional keys when they are present", { # order of the additional keys may be different expect_equal(key_colnames(my_workflow), c("geo_value", "state", "pol", "time_value")) + + # `exclude =` works: + expect_equal(key_colnames(my_workflow, exclude = c("time_value", "pol")), c("geo_value", "state")) }) From f406a0f9c5aac23af80fe0da86419c7f44012755 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Tue, 22 Oct 2024 11:12:53 -0700 Subject: [PATCH 03/12] Refactor: rename some `extra_keys`, remove some NULL hedges - Rename `extra_keys` -> `other_keys` when we are going to feed it into `other_keys =`. - Remove some `$other_keys %||% character()` hedges now that current epiprocess standardizes to character() not NULL and example `epi_df` objects have been updated to that standard. --- R/autoplot.R | 5 ++--- R/utils-misc.R | 6 +++--- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/R/autoplot.R b/R/autoplot.R index 4f4222979..870dcb8d8 100644 --- a/R/autoplot.R +++ b/R/autoplot.R @@ -127,11 +127,10 @@ autoplot.epi_workflow <- function( if (!is.null(shift)) { edf <- mutate(edf, time_value = time_value + shift) } - extra_keys <- setdiff(key_colnames(object), c("geo_value", "time_value")) - if (length(extra_keys) == 0L) extra_keys <- NULL + other_keys <- setdiff(key_colnames(object), c("geo_value", "time_value")) edf <- as_epi_df(edf, as_of = object$fit$meta$as_of, - other_keys = extra_keys %||% character() + other_keys = other_keys ) if (is.null(predictions)) { return(autoplot( diff --git a/R/utils-misc.R b/R/utils-misc.R index bb6c09e76..7f1eaf843 100644 --- a/R/utils-misc.R +++ b/R/utils-misc.R @@ -49,10 +49,10 @@ grab_forged_keys <- function(forged, workflow, new_data) { } if (is_epi_df(new_data)) { meta <- attr(new_data, "metadata") - extras <- as_epi_df(extras, as_of = meta$as_of, other_keys = meta$other_keys %||% character()) + extras <- as_epi_df(extras, as_of = meta$as_of, other_keys = meta$other_keys) } else if (all(c("geo_value", "time_value") %in% new_keys)) { - if (length(new_keys) > 2) other_keys <- new_keys[!new_keys %in% c("geo_value", "time_value")] - extras <- as_epi_df(extras, other_keys = other_keys %||% character()) + other_keys <- new_keys[!new_keys %in% c("geo_value", "time_value")] + extras <- as_epi_df(extras, other_keys = other_keys) } extras } From e2cb1a1147de1bcc3e4c965ef3a821631b604efd Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Thu, 24 Oct 2024 08:40:05 -0700 Subject: [PATCH 04/12] Fix `process_rq_preds` on single-level predictions --- R/make_quantile_reg.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/make_quantile_reg.R b/R/make_quantile_reg.R index 9e653184c..1388dd859 100644 --- a/R/make_quantile_reg.R +++ b/R/make_quantile_reg.R @@ -112,7 +112,7 @@ make_quantile_reg <- function() { # can't make a method because object is second out <- switch(type, - rq = dist_quantiles(unname(as.list(x)), object$quantile_levels), # one quantile + rq = dist_quantiles(unname(as.list(x)), object$tau), # one quantile rqs = { x <- lapply(vctrs::vec_chop(x), function(x) sort(drop(x))) dist_quantiles(x, list(object$tau)) From f0ace50ff6d74c4175f177af20b46fb041081d5d Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Tue, 22 Oct 2024 17:29:06 -0700 Subject: [PATCH 05/12] population_scaling: fix w/ other_keys, upstream updates --- R/layer_population_scaling.R | 23 ++- R/step_population_scaling.R | 27 +++- tests/testthat/test-population_scaling.R | 193 +++++++++++++++++++++++ 3 files changed, 237 insertions(+), 6 deletions(-) diff --git a/R/layer_population_scaling.R b/R/layer_population_scaling.R index 7ec168822..741a5adf8 100644 --- a/R/layer_population_scaling.R +++ b/R/layer_population_scaling.R @@ -135,6 +135,25 @@ slather.layer_population_scaling <- ) rlang::check_dots_empty() + if (is.null(object$by)) { + # Assume `layer_predict` has calculated the prediction keys and other layers + # don't change the prediction key colnames: + prediction_key_colnames <- names(components$keys) + lhs_potential_keys <- prediction_key_colnames + rhs_potential_keys <- colnames(select(object$df, !object$df_pop_col)) + object$by <- intersect(lhs_potential_keys, rhs_potential_keys) + suggested_min_keys <- kill_time_value(lhs_potential_keys) + if (!all(suggested_min_keys %in% object$by)) { + cli_warn(c( + "Couldn't find {setdiff(suggested_min_keys, object$by)} in population `df`", + "i" = "Defaulting to join by {object$by}", + ">" = "Double-check whether column names on the population `df` match those expected in your predictions", + ">" = "Consider using population data with breakdowns by {suggested_min_keys}", + ">" = "Manually specify `by =` to silence" + ), class = "epipredict__layer_population_scaling__default_by_missing_suggested_keys") + } + } + object$by <- object$by %||% intersect( epi_keys_only(components$predictions), colnames(select(object$df, !object$df_pop_col)) @@ -152,10 +171,12 @@ slather.layer_population_scaling <- suffix <- ifelse(object$create_new, object$suffix, "") col_to_remove <- setdiff(colnames(object$df), colnames(components$predictions)) - components$predictions <- left_join( + components$predictions <- inner_join( components$predictions, object$df, by = object$by, + relationship = "many-to-one", + unmatched = c("error", "drop"), suffix = c("", ".df") ) %>% mutate(across( diff --git a/R/step_population_scaling.R b/R/step_population_scaling.R index 7bdb00925..901d80867 100644 --- a/R/step_population_scaling.R +++ b/R/step_population_scaling.R @@ -156,10 +156,25 @@ prep.step_population_scaling <- function(x, training, info = NULL, ...) { #' @export bake.step_population_scaling <- function(object, new_data, ...) { - object$by <- object$by %||% intersect( - epi_keys_only(new_data), - colnames(select(object$df, !object$df_pop_col)) - ) + if (is.null(object$by)) { + rhs_potential_keys <- colnames(select(object$df, !object$df_pop_col)) + if (is_epi_df(new_data)) { + lhs_potential_keys <- key_colnames(new_data) + object$by <- intersect(lhs_potential_keys, rhs_potential_keys) + suggested_min_keys <- kill_time_value(lhs_potential_keys) + if (!all(suggested_min_keys %in% object$by)) { + cli_warn(c( + "Couldn't find {setdiff(suggested_min_keys, object$by)} in population `df`", + "i" = "Defaulting to join by {object$by}", + ">" = "Double-check whether column names on the population `df` match those for your time series", + ">" = "Consider using population data with breakdowns by {suggested_min_keys}", + ">" = "Manually specify `by =` to silence" + ), class = "epipredict__step_population_scaling__default_by_missing_suggested_keys") + } + } else { + object$by <- intersect(names(new_data), rhs_potential_keys) + } + } joinby <- list(x = names(object$by) %||% object$by, y = object$by) hardhat::validate_column_names(new_data, joinby$x) hardhat::validate_column_names(object$df, joinby$y) @@ -177,7 +192,9 @@ bake.step_population_scaling <- function(object, new_data, ...) { suffix <- ifelse(object$create_new, object$suffix, "") col_to_remove <- setdiff(colnames(object$df), colnames(new_data)) - left_join(new_data, object$df, by = object$by, suffix = c("", ".df")) %>% + inner_join(new_data, object$df, + by = object$by, relationship = "many-to-one", unmatched = c("error", "drop"), + suffix = c("", ".df")) %>% mutate( across( all_of(object$columns), diff --git a/tests/testthat/test-population_scaling.R b/tests/testthat/test-population_scaling.R index 1f356c0cc..9de0c7d6d 100644 --- a/tests/testthat/test-population_scaling.R +++ b/tests/testthat/test-population_scaling.R @@ -304,6 +304,199 @@ test_that("test joining by default columns", { }) +test_that("test joining by default columns with less common keys/classes", { + # Make a model spec that expects no predictor columns and outputs a fixed + # (rate) prediction. Based on combining two linear inequalities. + fixed_rate_prediction <- 2e-6 + model_spec <- quantile_reg(quantile_levels = 0.5, method = "fnc") %>% + set_engine( + "rq", + R = matrix(c(1, -1), 2, 1), r = c(1, -1) * fixed_rate_prediction, + eps = fixed_rate_prediction * 1e-6 # prevent early stop + ) + + # Here's the typical setup + dat1 <- tibble::tibble(geo_value = 1:2, time_value = 1, y = c(3 * 5, 7 * 11)) %>% + as_epi_df() + pop1 <- tibble::tibble(geo_value = 1:2, population = c(5e6, 11e6)) + ewf1 <- epi_workflow( + epi_recipe(dat1) %>% + step_population_scaling(y, df = pop1, df_pop_col = "population") %>% + step_epi_ahead(y_scaled, ahead = 0), + model_spec, + frosting() %>% + layer_predict() %>% + layer_population_scaling(.pred, df = pop1, df_pop_col = "population", create_new = FALSE) + ) + expect_equal( + extract_recipe(ewf1, estimated = FALSE) %>% + prep(dat1) %>% + bake(new_data = NULL), + dat1 %>% + mutate(y_scaled = c(3e-6, 7e-6), ahead_0_y_scaled = y_scaled) + ) + expect_equal( + forecast(fit(ewf1, dat1)) %>% + pivot_quantiles_wider(.pred), + dat1 %>% + select(!"y") %>% + as_tibble() %>% + mutate(`0.5` = c(2 * 5, 2 * 11)) + ) + + # With geo x age in time series but only geo in population data: + dat1b <- dat1 %>% + as_tibble() %>% + mutate(age_group = geo_value, geo_value = 1) %>% + as_epi_df(other_keys = "age_group") + pop1b <- pop1 + ewf1b <- epi_workflow( + epi_recipe(dat1b) %>% + step_population_scaling(y, df = pop1b, df_pop_col = "population") %>% + step_epi_ahead(y_scaled, ahead = 0), + model_spec, + frosting() %>% + layer_predict() %>% + layer_population_scaling(.pred, df = pop1b, df_pop_col = "population", create_new = FALSE) + ) + expect_warning( + expect_equal( + extract_recipe(ewf1b, estimated = FALSE) %>% + prep(dat1b) %>% + bake(new_data = NULL), + dat1b %>% + # geo 1 scaling used for both: + mutate(y_scaled = c(3e-6, 7 * 11 / 5e6), ahead_0_y_scaled = y_scaled) + ), + class = "epipredict__step_population_scaling__default_by_missing_suggested_keys" + ) + expect_warning( + expect_warning( + expect_equal( + forecast(fit(ewf1b, dat1b)) %>% + pivot_quantiles_wider(.pred), + dat1b %>% + select(!"y") %>% + as_tibble() %>% + # geo 1 scaling used for both: + mutate(`0.5` = c(2 * 5, 2 * 5)) + ), + class = "epipredict__step_population_scaling__default_by_missing_suggested_keys" + ), + class = "epipredict__layer_population_scaling__default_by_missing_suggested_keys" + ) + + # With geo x age_group breakdown on both: + dat2 <- dat1 %>% + as_tibble() %>% + mutate(age_group = geo_value, geo_value = 1) %>% + as_epi_df(other_keys = "age_group") + pop2 <- pop1 %>% + mutate(age_group = geo_value, geo_value = 1) + ewf2 <- epi_workflow( + epi_recipe(dat2) %>% + step_population_scaling(y, df = pop2, df_pop_col = "population") %>% + step_epi_ahead(y_scaled, ahead = 0), + model_spec, + frosting() %>% + layer_predict() %>% + layer_population_scaling(.pred, df = pop2, df_pop_col = "population", create_new = FALSE) + ) + expect_equal( + extract_recipe(ewf2, estimated = FALSE) %>% + prep(dat2) %>% + bake(new_data = NULL), + dat2 %>% + mutate(y_scaled = c(3e-6, 7e-6), ahead_0_y_scaled = y_scaled) + ) + expect_equal( + forecast(fit(ewf2, dat2)) %>% + pivot_quantiles_wider(.pred), + dat2 %>% + select(!"y") %>% + as_tibble() %>% + mutate(`0.5` = c(2 * 5, 2 * 11)) + ) + + # With only an age column in population data: + dat2b <- dat2 + pop2b <- pop1 %>% + mutate(age_group = geo_value, geo_value = NULL) + ewf2b <- epi_workflow( + epi_recipe(dat2b) %>% + step_population_scaling(y, df = pop2b, df_pop_col = "population") %>% + step_epi_ahead(y_scaled, ahead = 0), + model_spec, + frosting() %>% + layer_predict() %>% + layer_population_scaling(.pred, df = pop2b, df_pop_col = "population", create_new = FALSE) + ) + expect_warning( + expect_equal( + extract_recipe(ewf2b, estimated = FALSE) %>% + prep(dat2b) %>% + bake(new_data = NULL), + dat2b %>% + mutate(y_scaled = c(3e-6, 7e-6), ahead_0_y_scaled = y_scaled) + ), + class = "epipredict__step_population_scaling__default_by_missing_suggested_keys" + ) + expect_warning( + expect_warning( + expect_equal( + forecast(fit(ewf2b, dat2b)) %>% + pivot_quantiles_wider(.pred), + dat2b %>% + select(!"y") %>% + as_tibble() %>% + mutate(`0.5` = c(2 * 5, 2 * 11)) + ), + class = "epipredict__step_population_scaling__default_by_missing_suggested_keys" + ), + class = "epipredict__layer_population_scaling__default_by_missing_suggested_keys" + ) + + # with geo x time_value breakdown instead: + dat3 <- dat1 %>% + as_tibble() %>% + mutate(time_value = geo_value, geo_value = 1) %>% + as_epi_df() + pop3 <- pop1 %>% + mutate(time_value = geo_value, geo_value = 1) + ewf3 <- epi_workflow( + epi_recipe(dat3) %>% + step_population_scaling(y, df = pop3, df_pop_col = "population") %>% + step_epi_ahead(y_scaled, ahead = 0), + model_spec, + frosting() %>% + layer_predict() %>% + layer_population_scaling(.pred, df = pop3, df_pop_col = "population", create_new = FALSE) + ) + expect_equal( + extract_recipe(ewf3, estimated = FALSE) %>% + prep(dat3) %>% + bake(new_data = NULL), + dat3 %>% + mutate(y_scaled = c(3e-6, 7e-6), ahead_0_y_scaled = y_scaled) + ) + expect_equal( + forecast(fit(ewf3, dat3)) %>% + pivot_quantiles_wider(.pred), + # slightly edited copy-pasta due to test time selection: + dat3 %>% + select(!"y") %>% + as_tibble() %>% + slice_max(by = geo_value, time_value) %>% + mutate(`0.5` = 2 * 11) + ) + + # TODO non-`epi_df` scaling? + + # TODO multikey scaling? + +}) + + test_that("expect error if `by` selector does not match", { jhu <- covid_case_death_rates %>% dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ca", "ny")) %>% From 29a320458d7025fcbac7499f6b9703c281c6c854 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Tue, 29 Oct 2024 11:04:16 -0700 Subject: [PATCH 06/12] Move validation to prep and make it rely on roles - Roles should already factor in key_colnames. --- NAMESPACE | 2 + R/epipredict-package.R | 2 + R/layer_population_scaling.R | 4 +- R/step_population_scaling.R | 59 +++++++++++++++++------- tests/testthat/test-population_scaling.R | 39 ++++++++++++++++ 5 files changed, 87 insertions(+), 19 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 5ee13f730..e7e720f11 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -242,6 +242,7 @@ importFrom(dplyr,filter) importFrom(dplyr,full_join) importFrom(dplyr,group_by) importFrom(dplyr,group_by_at) +importFrom(dplyr,inner_join) importFrom(dplyr,join_by) importFrom(dplyr,left_join) importFrom(dplyr,mutate) @@ -273,6 +274,7 @@ importFrom(hardhat,extract_recipe) importFrom(hardhat,refresh_blueprint) importFrom(hardhat,run_mold) importFrom(magrittr,"%>%") +importFrom(magrittr,extract2) importFrom(recipes,bake) importFrom(recipes,detect_step) importFrom(recipes,prep) diff --git a/R/epipredict-package.R b/R/epipredict-package.R index 3dee263e2..b4b9973bf 100644 --- a/R/epipredict-package.R +++ b/R/epipredict-package.R @@ -7,7 +7,9 @@ #' @importFrom cli cli_abort cli_warn #' @importFrom dplyr arrange across all_of any_of bind_cols bind_rows group_by #' @importFrom dplyr full_join relocate summarise everything +#' @importFrom dplyr inner_join #' @importFrom dplyr summarize filter mutate select left_join rename ungroup +#' @importFrom magrittr extract2 #' @importFrom rlang := !! %||% as_function global_env set_names !!! caller_arg #' @importFrom rlang is_logical is_true inject enquo enquos expr sym arg_match #' @importFrom stats poly predict lm residuals quantile diff --git a/R/layer_population_scaling.R b/R/layer_population_scaling.R index 741a5adf8..5a982d344 100644 --- a/R/layer_population_scaling.R +++ b/R/layer_population_scaling.R @@ -136,8 +136,8 @@ slather.layer_population_scaling <- rlang::check_dots_empty() if (is.null(object$by)) { - # Assume `layer_predict` has calculated the prediction keys and other layers - # don't change the prediction key colnames: + # Assume `layer_predict` has calculated the prediction keys and other + # layers don't change the prediction key colnames: prediction_key_colnames <- names(components$keys) lhs_potential_keys <- prediction_key_colnames rhs_potential_keys <- colnames(select(object$df, !object$df_pop_col)) diff --git a/R/step_population_scaling.R b/R/step_population_scaling.R index 901d80867..fe07c2808 100644 --- a/R/step_population_scaling.R +++ b/R/step_population_scaling.R @@ -138,6 +138,42 @@ step_population_scaling_new <- #' @export prep.step_population_scaling <- function(x, training, info = NULL, ...) { + hardhat::validate_column_names(x$df, x$df_pop_col) + if (is.null(x$by)) { + rhs_potential_keys <- setdiff(colnames(x$df), x$df_pop_col) + lhs_potential_keys <- info %>% + filter(role %in% c("geo_value", "key", "time_value")) %>% + extract2("variable") %>% + unique() # in case of weird var with multiple of above roles + if (length(lhs_potential_keys) == 0L) { + # We're working with a recipe and tibble, and *_role hasn't set up any of + # the above roles. Let's say any column could actually act as a key, and + # lean on `intersect` below to make this something reasonable. + lhs_potential_keys <- names(training) + } + suggested_min_keys <- info %>% + filter(role %in% c("geo_value", "key")) %>% + extract2("variable") %>% + unique() + # (0 suggested keys if we weren't given any epikeytime var info.) + x$by <- intersect(lhs_potential_keys, rhs_potential_keys) + if (length(x$by) == 0L) { + cli_stop(c( + "Couldn't guess a default for `by`", + ">" = "Please rename columns in your population data to match those in your training data, + or manually specify `by =` in `step_population_scaling()`." + ), class = "epipredict__step_population_scaling__default_by_no_intersection") + } + if (!all(suggested_min_keys %in% x$by)) { + cli_warn(c( + "Couldn't find {setdiff(suggested_min_keys, x$by)} in population `df`.", + "i" = "Defaulting to join by {x$by}.", + ">" = "Double-check whether column names on the population `df` match those for your time series.", + ">" = "Consider using population data with breakdowns by {suggested_min_keys}.", + ">" = "Manually specify `by =` to silence." + ), class = "epipredict__step_population_scaling__default_by_missing_suggested_keys") + } + } step_population_scaling_new( terms = x$terms, role = x$role, @@ -157,23 +193,12 @@ prep.step_population_scaling <- function(x, training, info = NULL, ...) { #' @export bake.step_population_scaling <- function(object, new_data, ...) { if (is.null(object$by)) { - rhs_potential_keys <- colnames(select(object$df, !object$df_pop_col)) - if (is_epi_df(new_data)) { - lhs_potential_keys <- key_colnames(new_data) - object$by <- intersect(lhs_potential_keys, rhs_potential_keys) - suggested_min_keys <- kill_time_value(lhs_potential_keys) - if (!all(suggested_min_keys %in% object$by)) { - cli_warn(c( - "Couldn't find {setdiff(suggested_min_keys, object$by)} in population `df`", - "i" = "Defaulting to join by {object$by}", - ">" = "Double-check whether column names on the population `df` match those for your time series", - ">" = "Consider using population data with breakdowns by {suggested_min_keys}", - ">" = "Manually specify `by =` to silence" - ), class = "epipredict__step_population_scaling__default_by_missing_suggested_keys") - } - } else { - object$by <- intersect(names(new_data), rhs_potential_keys) - } + cli::cli_abort(c( + "`by` was not set and no default was filled in", + ">" = "If this was a fit recipe generated from an older version + of epipredict that you loaded in from a file, + please regenerate with the current version of epipredict." + )) } joinby <- list(x = names(object$by) %||% object$by, y = object$by) hardhat::validate_column_names(new_data, joinby$x) diff --git a/tests/testthat/test-population_scaling.R b/tests/testthat/test-population_scaling.R index 9de0c7d6d..0a16dff6f 100644 --- a/tests/testthat/test-population_scaling.R +++ b/tests/testthat/test-population_scaling.R @@ -386,6 +386,45 @@ test_that("test joining by default columns with less common keys/classes", { class = "epipredict__layer_population_scaling__default_by_missing_suggested_keys" ) + # Same thing but with time series in tibble: + dat1bb <- dat1 %>% + as_tibble() %>% + mutate(age_group = geo_value, geo_value = 1) + pop1bb <- pop1b + ewf1bb <- epi_workflow( + # Can't use epi_recipe or step_epi_ahead; adjust. + recipe(dat1bb) %>% + update_role("geo_value", new_role = "geo_value") %>% + update_role("age_group", new_role = "key") %>% + update_role("time_value", new_role = "time_value") %>% + step_population_scaling(y, df = pop1bb, df_pop_col = "population", role = "outcome") %>% + # XXX key_colnames inference differs at fit vs. predict time, so we also + # need to manually provide some key role settings to not have trouble at + # predict time. + {.}, + model_spec, + frosting() %>% + layer_predict() %>% + layer_population_scaling(.pred, df = pop1bb, df_pop_col = "population", create_new = FALSE) + ) + expect_equal( + extract_recipe(ewf1bb, estimated = FALSE) %>% + prep(dat1bb) %>% + bake(new_data = NULL), + dat1bb %>% + # geo 1 scaling used for both: + mutate(y_scaled = c(3e-6, 7 * 11 / 5e6)) + ) + expect_equal( + predict(fit(ewf1bb, dat1bb), dat1bb) %>% + pivot_quantiles_wider(.pred), + dat1bb %>% + select(!"y") %>% + as_tibble() %>% + # geo 1 scaling used for both: + mutate(`0.5` = c(2 * 5, 2 * 5)) + ) + # With geo x age_group breakdown on both: dat2 <- dat1 %>% as_tibble() %>% From 194735fd992b7ad0b55d27b5ed384537c16dab9b Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Mon, 28 Oct 2024 05:06:37 -0700 Subject: [PATCH 07/12] population_scaling: do more validation on creation --- R/step_population_scaling.R | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/R/step_population_scaling.R b/R/step_population_scaling.R index fe07c2808..d7b7893e3 100644 --- a/R/step_population_scaling.R +++ b/R/step_population_scaling.R @@ -89,13 +89,25 @@ step_population_scaling <- suffix = "_scaled", skip = FALSE, id = rand_id("population_scaling")) { - arg_is_scalar(role, df_pop_col, rate_rescaling, create_new, suffix, id) - arg_is_lgl(create_new, skip) - arg_is_chr(df_pop_col, suffix, id) + if (rlang::dots_n(...) == 0L) { + cli_abort(c( + "`...` must not be empty.", + ">" = "Please provide one or more tidyselect expressions in `...` + specifying the columns to which scaling should be applied.", + ">" = "If you really want to list `step_population_scaling` in your + recipe but not have it do anything, you can use a tidyselection + that selects zero variables, such as `c()`." + )) + } + arg_is_scalar(role, df_pop_col, rate_rescaling, create_new, suffix, skip, id) + arg_is_chr(role, df_pop_col, suffix, id) + hardhat::validate_column_names(df, df_pop_col) arg_is_chr(by, allow_null = TRUE) + arg_is_numeric(rate_rescaling) if (rate_rescaling <= 0) { cli_abort("`rate_rescaling` must be a positive number.") } + arg_is_lgl(create_new, skip) recipes::add_step( recipe, @@ -138,7 +150,6 @@ step_population_scaling_new <- #' @export prep.step_population_scaling <- function(x, training, info = NULL, ...) { - hardhat::validate_column_names(x$df, x$df_pop_col) if (is.null(x$by)) { rhs_potential_keys <- setdiff(colnames(x$df), x$df_pop_col) lhs_potential_keys <- info %>% From 7090cf0580a37a57b676943409dd3c9df5ea5bae Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Mon, 28 Oct 2024 15:21:01 -0700 Subject: [PATCH 08/12] Adjust message on `grab_forged_keys` mismatches to apply to testing situation --- R/utils-misc.R | 49 ++++++++++++--- tests/testthat/test-population_scaling.R | 79 ++++++++++++++++++------ 2 files changed, 100 insertions(+), 28 deletions(-) diff --git a/R/utils-misc.R b/R/utils-misc.R index 7f1eaf843..7ab4ad953 100644 --- a/R/utils-misc.R +++ b/R/utils-misc.R @@ -31,26 +31,59 @@ check_pname <- function(res, preds, object, newname = NULL) { res } +# Copied from `epiprocess`: + +#' "Format" a character vector of column/variable names for cli interpolation +#' +#' Designed to give good output if interpolated with cli. Main purpose is to add +#' backticks around variable names when necessary, and something other than an +#' empty string if length 0. +#' +#' @param x `chr`; e.g., `colnames` of some data frame +#' @param empty string; what should be output if `x` is of length 0? +#' @return `chr` +#' @keywords internal +format_varnames <- function(x, empty = "*none*") { + if (length(x) == 0L) { + empty + } else { + as.character(syms(x)) + } +} grab_forged_keys <- function(forged, workflow, new_data) { + # 1. keys in the training data post-prep, based on roles: + old_keys <- key_colnames(workflow) + # 3. keys in the test data post-bake, based on roles: forged_roles <- names(forged$extras$roles) extras <- bind_cols(forged$extras$roles[forged_roles %in% c("geo_value", "time_value", "key")]) - # 1. these are the keys in the test data after prep/bake new_keys <- names(extras) - # 2. these are the keys in the training data - old_keys <- key_colnames(workflow) - # 3. these are the keys in the test data as input + if (length(new_keys) == 0L) { + # No epikeytime role assignment; infer from all columns: + potential_keys <- c("geo_value", "time_value") + new_keys <- potential_keys[potential_keys %in% names(bind_cols(forged$extras$roles))] + } + # 2. keys in the test data pre-bake based on data structure + post-bake roles: new_df_keys <- key_colnames(new_data, other_keys = setdiff(new_keys, c("geo_value", "time_value"))) - if (!(setequal(old_keys, new_df_keys) && setequal(new_keys, new_df_keys))) { - cli_warn(paste( - "Not all epi keys that were present in the training data are available", - "in `new_data`. Predictions will have only the available keys." + # Softly validate, assuming that no steps change epikeytime role assignments: + if (!(setequal(old_keys, new_df_keys) && setequal(new_df_keys, new_keys))) { + cli_warn(c( + "Inconsistent epikeytime identifier columns specified/inferred.", + "i" = "training epikeytime columns, based on roles post-mold/prep: {format_varnames(old_keys)}", + "i" = " testing epikeytime columns, based on data structure pre-bake and roles post-forge/bake: {format_varnames(new_df_keys)}", + "i" = " testing epikeytime columns, based on roles post-forge/bake: {format_varnames(new_keys)}", + "*" = "Keys will be set giving preference to test-time `epi_df` metadata followed by test-time + post-bake role settings.", + ">" = 'Some mismatches can be addressed by using `epi_df`s instead of tibbles, or by using `update_role` + to assign pre-`prep` columns the "geo_value", "key", and "time_value" roles.' )) } if (is_epi_df(new_data)) { + # Inference based on test data pre-bake data structure "wins": meta <- attr(new_data, "metadata") extras <- as_epi_df(extras, as_of = meta$as_of, other_keys = meta$other_keys) } else if (all(c("geo_value", "time_value") %in% new_keys)) { + # Inference based on test data post-bake roles "wins": other_keys <- new_keys[!new_keys %in% c("geo_value", "time_value")] extras <- as_epi_df(extras, other_keys = other_keys) } diff --git a/tests/testthat/test-population_scaling.R b/tests/testthat/test-population_scaling.R index 0a16dff6f..eabca9f7e 100644 --- a/tests/testthat/test-population_scaling.R +++ b/tests/testthat/test-population_scaling.R @@ -386,43 +386,82 @@ test_that("test joining by default columns with less common keys/classes", { class = "epipredict__layer_population_scaling__default_by_missing_suggested_keys" ) - # Same thing but with time series in tibble: - dat1bb <- dat1 %>% + # Same thing but with time series in tibble, but with role hints: + dat1b2 <- dat1 %>% as_tibble() %>% mutate(age_group = geo_value, geo_value = 1) - pop1bb <- pop1b - ewf1bb <- epi_workflow( + pop1b2 <- pop1b + ewf1b2 <- epi_workflow( # Can't use epi_recipe or step_epi_ahead; adjust. - recipe(dat1bb) %>% - update_role("geo_value", new_role = "geo_value") %>% - update_role("age_group", new_role = "key") %>% - update_role("time_value", new_role = "time_value") %>% - step_population_scaling(y, df = pop1bb, df_pop_col = "population", role = "outcome") %>% - # XXX key_colnames inference differs at fit vs. predict time, so we also - # need to manually provide some key role settings to not have trouble at - # predict time. - {.}, + recipe(dat1b2) %>% + update_role("geo_value", new_role = "geo_value") %>% + update_role("age_group", new_role = "key") %>% + update_role("time_value", new_role = "time_value") %>% + step_population_scaling(y, df = pop1b2, df_pop_col = "population", role = "outcome") %>% + {.}, model_spec, frosting() %>% layer_predict() %>% - layer_population_scaling(.pred, df = pop1bb, df_pop_col = "population", create_new = FALSE) + layer_population_scaling(.pred, df = pop1b2, df_pop_col = "population", create_new = FALSE) + ) + expect_warning( + expect_equal( + extract_recipe(ewf1b2, estimated = FALSE) %>% + prep(dat1b2) %>% + bake(new_data = NULL), + dat1b2 %>% + # geo 1 scaling used for both: + mutate(y_scaled = c(3e-6, 7 * 11 / 5e6)) + ), + class = "epipredict__step_population_scaling__default_by_missing_suggested_keys" + ) + expect_warning( + expect_warning( + expect_equal( + predict(fit(ewf1b2, dat1b2), dat1b2) %>% + pivot_quantiles_wider(.pred), + dat1b2 %>% + select(!"y") %>% + as_tibble() %>% + # geo 1 scaling used for both: + mutate(`0.5` = c(2 * 5, 2 * 5)) %>% + select(geo_value, age_group, time_value, `0.5`) + ), + class = "epipredict__step_population_scaling__default_by_missing_suggested_keys" + ), + class = "epipredict__layer_population_scaling__default_by_missing_suggested_keys" + ) + + # Same thing but with time series in tibble, but no role hints -> different inference&messaging: + dat1b3 <- dat1b2 + pop1b3 <- pop1b2 + ewf1b3 <- epi_workflow( + # Can't use epi_recipe or step_epi_ahead; adjust. + recipe(dat1b3) %>% + step_population_scaling(y, df = pop1b3, df_pop_col = "population", role = "outcome") %>% + {.}, + model_spec, + frosting() %>% + layer_predict() %>% + layer_population_scaling(.pred, df = pop1b3, df_pop_col = "population", create_new = FALSE) ) expect_equal( - extract_recipe(ewf1bb, estimated = FALSE) %>% - prep(dat1bb) %>% + extract_recipe(ewf1b3, estimated = FALSE) %>% + prep(dat1b3) %>% bake(new_data = NULL), - dat1bb %>% + dat1b3 %>% # geo 1 scaling used for both: mutate(y_scaled = c(3e-6, 7 * 11 / 5e6)) ) expect_equal( - predict(fit(ewf1bb, dat1bb), dat1bb) %>% + predict(fit(ewf1b3, dat1b3), dat1b3) %>% pivot_quantiles_wider(.pred), - dat1bb %>% + dat1b3 %>% select(!"y") %>% as_tibble() %>% # geo 1 scaling used for both: - mutate(`0.5` = c(2 * 5, 2 * 5)) + mutate(`0.5` = c(2 * 5, 2 * 5)) %>% + select(geo_value, age_group, time_value, `0.5`) ) # With geo x age_group breakdown on both: From 2e307577fb0c611a22d11806898483fd2795b194 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Mon, 28 Oct 2024 17:08:51 -0700 Subject: [PATCH 09/12] Make key inference more consistent; allow non-`epi_df` forged data --- R/key_colnames.R | 13 +++-- R/layer_population_scaling.R | 3 +- R/step_population_scaling.R | 10 ++-- R/utils-misc.R | 61 ++++++++++++++---------- tests/testthat/test-population_scaling.R | 20 ++++---- 5 files changed, 62 insertions(+), 45 deletions(-) diff --git a/R/key_colnames.R b/R/key_colnames.R index b8d07ce82..9e0d44dcf 100644 --- a/R/key_colnames.R +++ b/R/key_colnames.R @@ -11,11 +11,14 @@ key_colnames.recipe <- function(x, ..., exclude = character()) { key_colnames.epi_workflow <- function(x, ..., exclude = character()) { # safer to look at the mold than the preprocessor mold <- hardhat::extract_mold(x) - molded_names <- names(mold$extras$roles) - geo_key <- names(mold$extras$roles[molded_names %in% "geo_value"]$geo_value) - time_key <- names(mold$extras$roles[molded_names %in% "time_value"]$time_value) - keys <- names(mold$extras$roles[molded_names %in% "key"]$key) - full_key <- c(geo_key, keys, time_key) %||% character(0L) + molded_roles <- mold$extras$roles + extras <- bind_cols(molded_roles$geo_value, molded_roles$key, molded_roles$time_value) + full_key <- names(extras) + if (length(full_key) == 0L) { + # No epikeytime role assignment; infer from all columns: + potential_keys <- c("geo_value", "time_value") + full_key <- potential_keys[potential_keys %in% names(bind_cols(molded_roles))] + } full_key[!full_key %in% exclude] } diff --git a/R/layer_population_scaling.R b/R/layer_population_scaling.R index 5a982d344..f47e3f292 100644 --- a/R/layer_population_scaling.R +++ b/R/layer_population_scaling.R @@ -145,7 +145,8 @@ slather.layer_population_scaling <- suggested_min_keys <- kill_time_value(lhs_potential_keys) if (!all(suggested_min_keys %in% object$by)) { cli_warn(c( - "Couldn't find {setdiff(suggested_min_keys, object$by)} in population `df`", + "{setdiff(suggested_min_keys, object$by)} {?was an/were} epikey column{?s} in the predictions, + but {?wasn't/weren't} found in the population `df`.", "i" = "Defaulting to join by {object$by}", ">" = "Double-check whether column names on the population `df` match those expected in your predictions", ">" = "Consider using population data with breakdowns by {suggested_min_keys}", diff --git a/R/step_population_scaling.R b/R/step_population_scaling.R index d7b7893e3..297c30727 100644 --- a/R/step_population_scaling.R +++ b/R/step_population_scaling.R @@ -177,9 +177,10 @@ prep.step_population_scaling <- function(x, training, info = NULL, ...) { } if (!all(suggested_min_keys %in% x$by)) { cli_warn(c( - "Couldn't find {setdiff(suggested_min_keys, x$by)} in population `df`.", + "{setdiff(suggested_min_keys, x$by)} {?was an/were} epikey column{?s} in the training data, + but {?wasn't/weren't} found in the population `df`.", "i" = "Defaulting to join by {x$by}.", - ">" = "Double-check whether column names on the population `df` match those for your time series.", + ">" = "Double-check whether column names on the population `df` match those for your training data.", ">" = "Consider using population data with breakdowns by {suggested_min_keys}.", ">" = "Manually specify `by =` to silence." ), class = "epipredict__step_population_scaling__default_by_missing_suggested_keys") @@ -229,8 +230,9 @@ bake.step_population_scaling <- function(object, new_data, ...) { col_to_remove <- setdiff(colnames(object$df), colnames(new_data)) inner_join(new_data, object$df, - by = object$by, relationship = "many-to-one", unmatched = c("error", "drop"), - suffix = c("", ".df")) %>% + by = object$by, relationship = "many-to-one", unmatched = c("error", "drop"), + suffix = c("", ".df") + ) %>% mutate( across( all_of(object$columns), diff --git a/R/utils-misc.R b/R/utils-misc.R index 7ab4ad953..fec707913 100644 --- a/R/utils-misc.R +++ b/R/utils-misc.R @@ -54,40 +54,53 @@ format_varnames <- function(x, empty = "*none*") { grab_forged_keys <- function(forged, workflow, new_data) { # 1. keys in the training data post-prep, based on roles: old_keys <- key_colnames(workflow) - # 3. keys in the test data post-bake, based on roles: - forged_roles <- names(forged$extras$roles) - extras <- bind_cols(forged$extras$roles[forged_roles %in% c("geo_value", "time_value", "key")]) - new_keys <- names(extras) + # 2. keys in the test data post-bake, based on roles & structure: + forged_roles <- forged$extras$roles + new_key_tbl <- bind_cols(forged_roles$geo_value, forged_roles$key, forged_roles$time_value) + new_keys <- names(new_key_tbl) if (length(new_keys) == 0L) { # No epikeytime role assignment; infer from all columns: - potential_keys <- c("geo_value", "time_value") - new_keys <- potential_keys[potential_keys %in% names(bind_cols(forged$extras$roles))] + potential_new_keys <- c("geo_value", "time_value") + forged_tbl <- bind_cols(forged$extras$roles) + new_keys <- potential_new_keys[potential_new_keys %in% names(forged_tbl)] + new_key_tbl <- forged_tbl[new_keys] } - # 2. keys in the test data pre-bake based on data structure + post-bake roles: - new_df_keys <- key_colnames(new_data, other_keys = setdiff(new_keys, c("geo_value", "time_value"))) - # Softly validate, assuming that no steps change epikeytime role assignments: - if (!(setequal(old_keys, new_df_keys) && setequal(new_df_keys, new_keys))) { + # Softly validate: + if (!(setequal(old_keys, new_keys))) { cli_warn(c( - "Inconsistent epikeytime identifier columns specified/inferred.", + "Inconsistent epikeytime identifier columns specified/inferred in training vs. in testing data.", "i" = "training epikeytime columns, based on roles post-mold/prep: {format_varnames(old_keys)}", - "i" = " testing epikeytime columns, based on data structure pre-bake and roles post-forge/bake: {format_varnames(new_df_keys)}", - "i" = " testing epikeytime columns, based on roles post-forge/bake: {format_varnames(new_keys)}", - "*" = "Keys will be set giving preference to test-time `epi_df` metadata followed by test-time - post-bake role settings.", + "i" = "testing epikeytime columns, based on roles post-forge/bake: {format_varnames(new_keys)}", + "*" = "", ">" = 'Some mismatches can be addressed by using `epi_df`s instead of tibbles, or by using `update_role` to assign pre-`prep` columns the "geo_value", "key", and "time_value" roles.' )) } - if (is_epi_df(new_data)) { - # Inference based on test data pre-bake data structure "wins": - meta <- attr(new_data, "metadata") - extras <- as_epi_df(extras, as_of = meta$as_of, other_keys = meta$other_keys) - } else if (all(c("geo_value", "time_value") %in% new_keys)) { - # Inference based on test data post-bake roles "wins": - other_keys <- new_keys[!new_keys %in% c("geo_value", "time_value")] - extras <- as_epi_df(extras, other_keys = other_keys) + # Convert `new_key_tbl` to `epi_df` if not renaming columns nor violating + # `epi_df` invariants. Require that our key is a unique key in any case. + if (all(c("geo_value", "time_value") %in% new_keys)) { + maybe_as_of <- attr(new_data, "metadata")$as_of # NULL if wasn't epi_df + try(return(as_epi_df(new_key_tbl, other_keys = new_keys, as_of = maybe_as_of)), + silent = TRUE + ) + } + if (anyDuplicated(new_key_tbl)) { + duplicate_key_tbl <- new_key_tbl %>% filter(.by = everything(), dplyr::n() > 1L) + error_part1 <- cli::format_error( + c( + "Specified/inferred key columns had repeated combinations in the forged/baked test data.", + "i" = "Key columns: {format_varnames(new_keys)}", + "Duplicated keys:" + ) + ) + error_part2 <- capture.output(print(duplicate_key_tbl)) + rlang::abort( + paste(collapse = "\n", c(error_part1, error_part2)), + class = "epipredict__grab_forged_keys__nonunique_key" + ) + } else { + return(new_key_tbl) } - extras } get_parsnip_mode <- function(trainer) { diff --git a/tests/testthat/test-population_scaling.R b/tests/testthat/test-population_scaling.R index eabca9f7e..63c23a388 100644 --- a/tests/testthat/test-population_scaling.R +++ b/tests/testthat/test-population_scaling.R @@ -398,7 +398,9 @@ test_that("test joining by default columns with less common keys/classes", { update_role("age_group", new_role = "key") %>% update_role("time_value", new_role = "time_value") %>% step_population_scaling(y, df = pop1b2, df_pop_col = "population", role = "outcome") %>% - {.}, + { + . + }, model_spec, frosting() %>% layer_predict() %>% @@ -432,14 +434,16 @@ test_that("test joining by default columns with less common keys/classes", { class = "epipredict__layer_population_scaling__default_by_missing_suggested_keys" ) - # Same thing but with time series in tibble, but no role hints -> different inference&messaging: + # Same thing but with time series in tibble, but no role hints -> different behavior: dat1b3 <- dat1b2 pop1b3 <- pop1b2 ewf1b3 <- epi_workflow( # Can't use epi_recipe or step_epi_ahead; adjust. recipe(dat1b3) %>% step_population_scaling(y, df = pop1b3, df_pop_col = "population", role = "outcome") %>% - {.}, + { + . + }, model_spec, frosting() %>% layer_predict() %>% @@ -453,15 +457,10 @@ test_that("test joining by default columns with less common keys/classes", { # geo 1 scaling used for both: mutate(y_scaled = c(3e-6, 7 * 11 / 5e6)) ) - expect_equal( + expect_error( predict(fit(ewf1b3, dat1b3), dat1b3) %>% pivot_quantiles_wider(.pred), - dat1b3 %>% - select(!"y") %>% - as_tibble() %>% - # geo 1 scaling used for both: - mutate(`0.5` = c(2 * 5, 2 * 5)) %>% - select(geo_value, age_group, time_value, `0.5`) + class = "epipredict__grab_forged_keys__nonunique_key" ) # With geo x age_group breakdown on both: @@ -571,7 +570,6 @@ test_that("test joining by default columns with less common keys/classes", { # TODO non-`epi_df` scaling? # TODO multikey scaling? - }) From 8dae68b464425fbd46edb345463ed9c9c06d326b Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Tue, 29 Oct 2024 10:34:58 -0700 Subject: [PATCH 10/12] Add a renamed-geo_value pop scaling test, minor cleanup --- tests/testthat/test-population_scaling.R | 45 +++++++++++++++++------- 1 file changed, 33 insertions(+), 12 deletions(-) diff --git a/tests/testthat/test-population_scaling.R b/tests/testthat/test-population_scaling.R index 63c23a388..6e4cd5df2 100644 --- a/tests/testthat/test-population_scaling.R +++ b/tests/testthat/test-population_scaling.R @@ -397,10 +397,7 @@ test_that("test joining by default columns with less common keys/classes", { update_role("geo_value", new_role = "geo_value") %>% update_role("age_group", new_role = "key") %>% update_role("time_value", new_role = "time_value") %>% - step_population_scaling(y, df = pop1b2, df_pop_col = "population", role = "outcome") %>% - { - . - }, + step_population_scaling(y, df = pop1b2, df_pop_col = "population", role = "outcome"), model_spec, frosting() %>% layer_predict() %>% @@ -420,6 +417,7 @@ test_that("test joining by default columns with less common keys/classes", { expect_warning( expect_warning( expect_equal( + # get_test_data doesn't work with non-`epi_df`s, so provide test data manually: predict(fit(ewf1b2, dat1b2), dat1b2) %>% pivot_quantiles_wider(.pred), dat1b2 %>% @@ -434,16 +432,15 @@ test_that("test joining by default columns with less common keys/classes", { class = "epipredict__layer_population_scaling__default_by_missing_suggested_keys" ) - # Same thing but with time series in tibble, but no role hints -> different behavior: + # Same thing but with time series in tibble, but no role hints -> incorrect + # key guess -> we prep without warning, but error when we realize we don't + # actually have a unique key for our predictions: dat1b3 <- dat1b2 pop1b3 <- pop1b2 ewf1b3 <- epi_workflow( # Can't use epi_recipe or step_epi_ahead; adjust. recipe(dat1b3) %>% - step_population_scaling(y, df = pop1b3, df_pop_col = "population", role = "outcome") %>% - { - . - }, + step_population_scaling(y, df = pop1b3, df_pop_col = "population", role = "outcome"), model_spec, frosting() %>% layer_predict() %>% @@ -567,9 +564,33 @@ test_that("test joining by default columns with less common keys/classes", { mutate(`0.5` = 2 * 11) ) - # TODO non-`epi_df` scaling? - - # TODO multikey scaling? + # With alternative geo naming... we're able to successfully prep (and we're + # not missing a warning as in 1b3 since we're actually in a "correct" setup), + # but still like 1b3, we fail at prediction time as key roles have not been + # set up and inference fails: + dat4 <- dat1 %>% rename(geo = geo_value) + pop4 <- pop1 %>% rename(geo = geo_value) + ewf4 <- epi_workflow( + recipe(dat4) %>% + step_population_scaling(y, df = pop4, df_pop_col = "population", role = "outcome"), + model_spec, + frosting() %>% + layer_predict() %>% + layer_population_scaling(.pred, df = pop1, df_pop_col = "population", create_new = FALSE) + ) + expect_equal( + extract_recipe(ewf4, estimated = FALSE) %>% + prep(dat4) %>% + bake(new_data = NULL), + dat4 %>% + mutate(y_scaled = c(3e-6, 7e-6)) + ) + expect_error( + # get_test_data doesn't work with non-`epi_df`s, so provide test data manually: + predict(fit(ewf4, dat4), dat4) %>% + pivot_quantiles_wider(.pred), + class = "epipredict__grab_forged_keys__nonunique_key" + ) }) From 3685e67e0dd686e3445590d3285ea515858c4f5c Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Tue, 29 Oct 2024 15:22:47 -0700 Subject: [PATCH 11/12] Update docs, news for `*_population_scaling(by =)` and keys updates --- DESCRIPTION | 2 +- NEWS.md | 3 +++ R/layer_population_scaling.R | 17 +++++++++++------ R/step_population_scaling.R | 19 ++++++++++++------- man/format_varnames.Rd | 22 ++++++++++++++++++++++ man/layer_population_scaling.Rd | 17 +++++++++++------ man/step_population_scaling.Rd | 19 ++++++++++++------- 7 files changed, 72 insertions(+), 27 deletions(-) create mode 100644 man/format_varnames.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 2ae9d3337..456cdb54f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: epipredict Title: Basic epidemiology forecasting methods -Version: 0.1.3 +Version: 0.1.4 Authors@R: c( person("Daniel J.", "McDonald", , "daniel@stat.ubc.ca", role = c("aut", "cre")), person("Ryan", "Tibshirani", , "ryantibs@cmu.edu", role = "aut"), diff --git a/NEWS.md b/NEWS.md index 3fcf4dc0c..8be7d7c0d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -15,6 +15,9 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.0.x will indicat ## Improvements - Add `step_adjust_latency`, which give several methods to adjust the forecast if the `forecast_date` is after the last day of data. +- Fix `layer_population_scaling` default `by` with `other_keys`. +- Make key column inference more consistent within the package and with current `epiprocess`. +- Fix `quantile_reg()` producing error when asked to output just median-level predictions. - (temporary) ahead negative is allowed for `step_epi_ahead` until we have `step_epi_shift` ## Bug fixes diff --git a/R/layer_population_scaling.R b/R/layer_population_scaling.R index f47e3f292..badd073f8 100644 --- a/R/layer_population_scaling.R +++ b/R/layer_population_scaling.R @@ -19,12 +19,17 @@ #' inverting the existing scaling. #' @param by A (possibly named) character vector of variables to join by. #' -#' If `NULL`, the default, the function will perform a natural join, using all -#' variables in common across the `epi_df` produced by the `predict()` call -#' and the user-provided dataset. -#' If columns in that `epi_df` and `df` have the same name (and aren't -#' included in `by`), `.df` is added to the one from the user-provided data -#' to disambiguate. +#' If `NULL`, the default, the function will try to infer a reasonable set of +#' columns. First, it will try to join by all variables in the test data with +#' roles `"geo_value"`, `"key"`, or `"time_value"` that also appear in `df`; +#' these roles are automatically set if you are using an `epi_df`, or you can +#' use, e.g., `update_role`. If no such roles are set, it will try to perform a +#' natural join, using variables in common between the training/test data and +#' population data. +#' +#' If columns in the training/testing data and `df` have the same name (and +#' aren't included in `by`), a `.df` suffix is added to the one from the +#' user-provided data to disambiguate. #' #' To join by different variables on the `epi_df` and `df`, use a named vector. #' For example, `by = c("geo_value" = "states")` will match `epi_df$geo_value` diff --git a/R/step_population_scaling.R b/R/step_population_scaling.R index 297c30727..6047f5c4c 100644 --- a/R/step_population_scaling.R +++ b/R/step_population_scaling.R @@ -16,12 +16,17 @@ #' inverting the existing scaling. #' @param by A (possibly named) character vector of variables to join by. #' -#' If `NULL`, the default, the function will perform a natural join, using all -#' variables in common across the `epi_df` produced by the `predict()` call -#' and the user-provided dataset. -#' If columns in that `epi_df` and `df` have the same name (and aren't -#' included in `by`), `.df` is added to the one from the user-provided data -#' to disambiguate. +#' If `NULL`, the default, the function will try to infer a reasonable set of +#' columns. First, it will try to join by all variables in the training/test +#' data with roles `"geo_value"`, `"key"`, or `"time_value"` that also appear in +#' `df`; these roles are automatically set if you are using an `epi_df`, or you +#' can use, e.g., `update_role`. If no such roles are set, it will try to +#' perform a natural join, using variables in common between the training/test +#' data and population data. +#' +#' If columns in the training/testing data and `df` have the same name (and +#' aren't included in `by`), a `.df` suffix is added to the one from the +#' user-provided data to disambiguate. #' #' To join by different variables on the `epi_df` and `df`, use a named vector. #' For example, `by = c("geo_value" = "states")` will match `epi_df$geo_value` @@ -29,7 +34,7 @@ #' For example, `by = c("geo_value" = "states", "county" = "county")` will match #' `epi_df$geo_value` to `df$states` and `epi_df$county` to `df$county`. #' -#' See [dplyr::left_join()] for more details. +#' See [dplyr::inner_join()] for more details. #' @param df_pop_col the name of the column in the data frame `df` that #' contains the population data and will be used for scaling. #' This should be one column. diff --git a/man/format_varnames.Rd b/man/format_varnames.Rd new file mode 100644 index 000000000..59fe218af --- /dev/null +++ b/man/format_varnames.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils-misc.R +\name{format_varnames} +\alias{format_varnames} +\title{"Format" a character vector of column/variable names for cli interpolation} +\usage{ +format_varnames(x, empty = "*none*") +} +\arguments{ +\item{x}{\code{chr}; e.g., \code{colnames} of some data frame} + +\item{empty}{string; what should be output if \code{x} is of length 0?} +} +\value{ +\code{chr} +} +\description{ +Designed to give good output if interpolated with cli. Main purpose is to add +backticks around variable names when necessary, and something other than an +empty string if length 0. +} +\keyword{internal} diff --git a/man/layer_population_scaling.Rd b/man/layer_population_scaling.Rd index 3a22dbee5..083dd6b48 100644 --- a/man/layer_population_scaling.Rd +++ b/man/layer_population_scaling.Rd @@ -28,12 +28,17 @@ inverting the existing scaling.} \item{by}{A (possibly named) character vector of variables to join by. -If \code{NULL}, the default, the function will perform a natural join, using all -variables in common across the \code{epi_df} produced by the \code{predict()} call -and the user-provided dataset. -If columns in that \code{epi_df} and \code{df} have the same name (and aren't -included in \code{by}), \code{.df} is added to the one from the user-provided data -to disambiguate. +If \code{NULL}, the default, the function will try to infer a reasonable set of +columns. First, it will try to join by all variables in the test data with +roles \code{"geo_value"}, \code{"key"}, or \code{"time_value"} that also appear in \code{df}; +these roles are automatically set if you are using an \code{epi_df}, or you can +use, e.g., \code{update_role}. If no such roles are set, it will try to perform a +natural join, using variables in common between the training/test data and +population data. + +If columns in the training/testing data and \code{df} have the same name (and +aren't included in \code{by}), a \code{.df} suffix is added to the one from the +user-provided data to disambiguate. To join by different variables on the \code{epi_df} and \code{df}, use a named vector. For example, \code{by = c("geo_value" = "states")} will match \code{epi_df$geo_value} diff --git a/man/step_population_scaling.Rd b/man/step_population_scaling.Rd index 6a0c5dea9..c41ec7fcc 100644 --- a/man/step_population_scaling.Rd +++ b/man/step_population_scaling.Rd @@ -33,12 +33,17 @@ inverting the existing scaling.} \item{by}{A (possibly named) character vector of variables to join by. -If \code{NULL}, the default, the function will perform a natural join, using all -variables in common across the \code{epi_df} produced by the \code{predict()} call -and the user-provided dataset. -If columns in that \code{epi_df} and \code{df} have the same name (and aren't -included in \code{by}), \code{.df} is added to the one from the user-provided data -to disambiguate. +If \code{NULL}, the default, the function will try to infer a reasonable set of +columns. First, it will try to join by all variables in the training/test +data with roles \code{"geo_value"}, \code{"key"}, or \code{"time_value"} that also appear in +\code{df}; these roles are automatically set if you are using an \code{epi_df}, or you +can use, e.g., \code{update_role}. If no such roles are set, it will try to +perform a natural join, using variables in common between the training/test +data and population data. + +If columns in the training/testing data and \code{df} have the same name (and +aren't included in \code{by}), a \code{.df} suffix is added to the one from the +user-provided data to disambiguate. To join by different variables on the \code{epi_df} and \code{df}, use a named vector. For example, \code{by = c("geo_value" = "states")} will match \code{epi_df$geo_value} @@ -46,7 +51,7 @@ to \code{df$states}. To join by multiple variables, use a vector with length > 1 For example, \code{by = c("geo_value" = "states", "county" = "county")} will match \code{epi_df$geo_value} to \code{df$states} and \code{epi_df$county} to \code{df$county}. -See \code{\link[dplyr:mutate-joins]{dplyr::left_join()}} for more details.} +See \code{\link[dplyr:mutate-joins]{dplyr::inner_join()}} for more details.} \item{df_pop_col}{the name of the column in the data frame \code{df} that contains the population data and will be used for scaling. From 202a226cf388b6dd52d4ac87023260e3764eec6c Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 30 Oct 2024 15:51:10 -0700 Subject: [PATCH 12/12] define case_num var even when pulling data from epidatasets --- vignettes/articles/smooth-qr.Rmd | 2 +- vignettes/articles/symptom-surveys.Rmd | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/vignettes/articles/smooth-qr.Rmd b/vignettes/articles/smooth-qr.Rmd index 801934e8f..a73ffd221 100644 --- a/vignettes/articles/smooth-qr.Rmd +++ b/vignettes/articles/smooth-qr.Rmd @@ -97,7 +97,7 @@ state cases and deaths. This sample data ranges from Dec. 31, 2020 to Dec. 31, 2021. ```{r} -edf <- covid_case_death_rates +edf <- epidatasets::covid_case_death_rates ``` We will set the forecast date to be November 30, 2021 so that we can produce diff --git a/vignettes/articles/symptom-surveys.Rmd b/vignettes/articles/symptom-surveys.Rmd index af692726e..cf8013572 100644 --- a/vignettes/articles/symptom-surveys.Rmd +++ b/vignettes/articles/symptom-surveys.Rmd @@ -155,6 +155,7 @@ library(purrr) library(epipredict) library(recipes) +case_num <- 200 z <- epidatasets::county_smoothed_cli_comparison ```