From 3ab495600da832c4958a25548e26c1b48b18dcd2 Mon Sep 17 00:00:00 2001
From: "Logan C. Brooks" <lcbrooks+github@andrew.cmu.edu>
Date: Tue, 22 Oct 2024 10:32:14 -0700
Subject: [PATCH 01/12] Change `key_colnames(extra_keys =)` to supported
 `other_keys =`

Recent/current epiprocess versions silently ignore `extra_keys =`. Pending
epiprocess changes will soft-deprecate and route it to `other_keys =` instead,
plus add some stricter checks on `other_keys =`.
---
 R/utils-misc.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/utils-misc.R b/R/utils-misc.R
index a1e0f025f..bb6c09e76 100644
--- a/R/utils-misc.R
+++ b/R/utils-misc.R
@@ -40,7 +40,7 @@ grab_forged_keys <- function(forged, workflow, new_data) {
   # 2. these are the keys in the training data
   old_keys <- key_colnames(workflow)
   # 3. these are the keys in the test data as input
-  new_df_keys <- key_colnames(new_data, extra_keys = setdiff(new_keys, c("geo_value", "time_value")))
+  new_df_keys <- key_colnames(new_data, other_keys = setdiff(new_keys, c("geo_value", "time_value")))
   if (!(setequal(old_keys, new_df_keys) && setequal(new_keys, new_df_keys))) {
     cli_warn(paste(
       "Not all epi keys that were present in the training data are available",

From 06f54588457fd3d717e45d15646ea14e0b3b6d6c Mon Sep 17 00:00:00 2001
From: "Logan C. Brooks" <lcbrooks+github@andrew.cmu.edu>
Date: Tue, 22 Oct 2024 10:41:48 -0700
Subject: [PATCH 02/12] Add `key_colnames(exclude =)` support for epipredict
 objects

---
 R/key_colnames.R                   | 10 ++++++----
 tests/testthat/test-key_colnames.R |  6 ++++++
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/R/key_colnames.R b/R/key_colnames.R
index b9ebde5dc..b8d07ce82 100644
--- a/R/key_colnames.R
+++ b/R/key_colnames.R
@@ -1,20 +1,22 @@
 #' @export
-key_colnames.recipe <- function(x, ...) {
+key_colnames.recipe <- function(x, ..., exclude = character()) {
   geo_key <- x$var_info$variable[x$var_info$role %in% "geo_value"]
   time_key <- x$var_info$variable[x$var_info$role %in% "time_value"]
   keys <- x$var_info$variable[x$var_info$role %in% "key"]
-  c(geo_key, keys, time_key) %||% character(0L)
+  full_key <- c(geo_key, keys, time_key) %||% character(0L)
+  full_key[!full_key %in% exclude]
 }
 
 #' @export
-key_colnames.epi_workflow <- function(x, ...) {
+key_colnames.epi_workflow <- function(x, ..., exclude = character()) {
   # safer to look at the mold than the preprocessor
   mold <- hardhat::extract_mold(x)
   molded_names <- names(mold$extras$roles)
   geo_key <- names(mold$extras$roles[molded_names %in% "geo_value"]$geo_value)
   time_key <- names(mold$extras$roles[molded_names %in% "time_value"]$time_value)
   keys <- names(mold$extras$roles[molded_names %in% "key"]$key)
-  c(geo_key, keys, time_key) %||% character(0L)
+  full_key <- c(geo_key, keys, time_key) %||% character(0L)
+  full_key[!full_key %in% exclude]
 }
 
 kill_time_value <- function(v) {
diff --git a/tests/testthat/test-key_colnames.R b/tests/testthat/test-key_colnames.R
index d94daaec4..021bbb50c 100644
--- a/tests/testthat/test-key_colnames.R
+++ b/tests/testthat/test-key_colnames.R
@@ -17,6 +17,9 @@ test_that("key_colnames extracts time_value and geo_value, but not raw", {
     fit(data = covid_case_death_rates)
 
   expect_identical(key_colnames(my_workflow), c("geo_value", "time_value"))
+
+  # `exclude =` works:
+  expect_identical(key_colnames(my_workflow, exclude = "geo_value"), c("time_value"))
 })
 
 test_that("key_colnames extracts additional keys when they are present", {
@@ -49,4 +52,7 @@ test_that("key_colnames extracts additional keys when they are present", {
 
   # order of the additional keys may be different
   expect_equal(key_colnames(my_workflow), c("geo_value", "state", "pol", "time_value"))
+
+  # `exclude =` works:
+  expect_equal(key_colnames(my_workflow, exclude = c("time_value", "pol")), c("geo_value", "state"))
 })

From f406a0f9c5aac23af80fe0da86419c7f44012755 Mon Sep 17 00:00:00 2001
From: "Logan C. Brooks" <lcbrooks+github@andrew.cmu.edu>
Date: Tue, 22 Oct 2024 11:12:53 -0700
Subject: [PATCH 03/12] Refactor: rename some `extra_keys`, remove some NULL
 hedges

- Rename `extra_keys` -> `other_keys` when we are going to feed it into
  `other_keys =`.
- Remove some `<edf metadata>$other_keys %||% character()` hedges now that
  current epiprocess standardizes to character() not NULL and example `epi_df`
  objects have been updated to that standard.
---
 R/autoplot.R   | 5 ++---
 R/utils-misc.R | 6 +++---
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/R/autoplot.R b/R/autoplot.R
index 4f4222979..870dcb8d8 100644
--- a/R/autoplot.R
+++ b/R/autoplot.R
@@ -127,11 +127,10 @@ autoplot.epi_workflow <- function(
   if (!is.null(shift)) {
     edf <- mutate(edf, time_value = time_value + shift)
   }
-  extra_keys <- setdiff(key_colnames(object), c("geo_value", "time_value"))
-  if (length(extra_keys) == 0L) extra_keys <- NULL
+  other_keys <- setdiff(key_colnames(object), c("geo_value", "time_value"))
   edf <- as_epi_df(edf,
     as_of = object$fit$meta$as_of,
-    other_keys = extra_keys %||% character()
+    other_keys = other_keys
   )
   if (is.null(predictions)) {
     return(autoplot(
diff --git a/R/utils-misc.R b/R/utils-misc.R
index bb6c09e76..7f1eaf843 100644
--- a/R/utils-misc.R
+++ b/R/utils-misc.R
@@ -49,10 +49,10 @@ grab_forged_keys <- function(forged, workflow, new_data) {
   }
   if (is_epi_df(new_data)) {
     meta <- attr(new_data, "metadata")
-    extras <- as_epi_df(extras, as_of = meta$as_of, other_keys = meta$other_keys %||% character())
+    extras <- as_epi_df(extras, as_of = meta$as_of, other_keys = meta$other_keys)
   } else if (all(c("geo_value", "time_value") %in% new_keys)) {
-    if (length(new_keys) > 2) other_keys <- new_keys[!new_keys %in% c("geo_value", "time_value")]
-    extras <- as_epi_df(extras, other_keys = other_keys %||% character())
+    other_keys <- new_keys[!new_keys %in% c("geo_value", "time_value")]
+    extras <- as_epi_df(extras, other_keys = other_keys)
   }
   extras
 }

From e2cb1a1147de1bcc3e4c965ef3a821631b604efd Mon Sep 17 00:00:00 2001
From: "Logan C. Brooks" <lcbrooks+github@andrew.cmu.edu>
Date: Thu, 24 Oct 2024 08:40:05 -0700
Subject: [PATCH 04/12] Fix `process_rq_preds` on single-level predictions

---
 R/make_quantile_reg.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/make_quantile_reg.R b/R/make_quantile_reg.R
index 9e653184c..1388dd859 100644
--- a/R/make_quantile_reg.R
+++ b/R/make_quantile_reg.R
@@ -112,7 +112,7 @@ make_quantile_reg <- function() {
 
     # can't make a method because object is second
     out <- switch(type,
-      rq = dist_quantiles(unname(as.list(x)), object$quantile_levels), # one quantile
+      rq = dist_quantiles(unname(as.list(x)), object$tau), # one quantile
       rqs = {
         x <- lapply(vctrs::vec_chop(x), function(x) sort(drop(x)))
         dist_quantiles(x, list(object$tau))

From f0ace50ff6d74c4175f177af20b46fb041081d5d Mon Sep 17 00:00:00 2001
From: "Logan C. Brooks" <lcbrooks+github@andrew.cmu.edu>
Date: Tue, 22 Oct 2024 17:29:06 -0700
Subject: [PATCH 05/12] population_scaling: fix w/ other_keys, upstream updates

---
 R/layer_population_scaling.R             |  23 ++-
 R/step_population_scaling.R              |  27 +++-
 tests/testthat/test-population_scaling.R | 193 +++++++++++++++++++++++
 3 files changed, 237 insertions(+), 6 deletions(-)

diff --git a/R/layer_population_scaling.R b/R/layer_population_scaling.R
index 7ec168822..741a5adf8 100644
--- a/R/layer_population_scaling.R
+++ b/R/layer_population_scaling.R
@@ -135,6 +135,25 @@ slather.layer_population_scaling <-
     )
     rlang::check_dots_empty()
 
+    if (is.null(object$by)) {
+      # Assume `layer_predict` has calculated the prediction keys and other layers
+      # don't change the prediction key colnames:
+      prediction_key_colnames <- names(components$keys)
+      lhs_potential_keys <- prediction_key_colnames
+      rhs_potential_keys <- colnames(select(object$df, !object$df_pop_col))
+      object$by <- intersect(lhs_potential_keys, rhs_potential_keys)
+      suggested_min_keys <- kill_time_value(lhs_potential_keys)
+      if (!all(suggested_min_keys %in% object$by)) {
+        cli_warn(c(
+          "Couldn't find {setdiff(suggested_min_keys, object$by)} in population `df`",
+          "i" = "Defaulting to join by {object$by}",
+          ">" = "Double-check whether column names on the population `df` match those expected in your predictions",
+          ">" = "Consider using population data with breakdowns by {suggested_min_keys}",
+          ">" = "Manually specify `by =` to silence"
+        ), class = "epipredict__layer_population_scaling__default_by_missing_suggested_keys")
+      }
+    }
+
     object$by <- object$by %||% intersect(
       epi_keys_only(components$predictions),
       colnames(select(object$df, !object$df_pop_col))
@@ -152,10 +171,12 @@ slather.layer_population_scaling <-
     suffix <- ifelse(object$create_new, object$suffix, "")
     col_to_remove <- setdiff(colnames(object$df), colnames(components$predictions))
 
-    components$predictions <- left_join(
+    components$predictions <- inner_join(
       components$predictions,
       object$df,
       by = object$by,
+      relationship = "many-to-one",
+      unmatched = c("error", "drop"),
       suffix = c("", ".df")
     ) %>%
       mutate(across(
diff --git a/R/step_population_scaling.R b/R/step_population_scaling.R
index 7bdb00925..901d80867 100644
--- a/R/step_population_scaling.R
+++ b/R/step_population_scaling.R
@@ -156,10 +156,25 @@ prep.step_population_scaling <- function(x, training, info = NULL, ...) {
 
 #' @export
 bake.step_population_scaling <- function(object, new_data, ...) {
-  object$by <- object$by %||% intersect(
-    epi_keys_only(new_data),
-    colnames(select(object$df, !object$df_pop_col))
-  )
+  if (is.null(object$by)) {
+    rhs_potential_keys <- colnames(select(object$df, !object$df_pop_col))
+    if (is_epi_df(new_data)) {
+      lhs_potential_keys <- key_colnames(new_data)
+      object$by <- intersect(lhs_potential_keys, rhs_potential_keys)
+      suggested_min_keys <- kill_time_value(lhs_potential_keys)
+      if (!all(suggested_min_keys %in% object$by)) {
+        cli_warn(c(
+          "Couldn't find {setdiff(suggested_min_keys, object$by)} in population `df`",
+          "i" = "Defaulting to join by {object$by}",
+          ">" = "Double-check whether column names on the population `df` match those for your time series",
+          ">" = "Consider using population data with breakdowns by {suggested_min_keys}",
+          ">" = "Manually specify `by =` to silence"
+        ), class = "epipredict__step_population_scaling__default_by_missing_suggested_keys")
+      }
+    } else {
+      object$by <- intersect(names(new_data), rhs_potential_keys)
+    }
+  }
   joinby <- list(x = names(object$by) %||% object$by, y = object$by)
   hardhat::validate_column_names(new_data, joinby$x)
   hardhat::validate_column_names(object$df, joinby$y)
@@ -177,7 +192,9 @@ bake.step_population_scaling <- function(object, new_data, ...) {
   suffix <- ifelse(object$create_new, object$suffix, "")
   col_to_remove <- setdiff(colnames(object$df), colnames(new_data))
 
-  left_join(new_data, object$df, by = object$by, suffix = c("", ".df")) %>%
+  inner_join(new_data, object$df,
+             by = object$by, relationship = "many-to-one", unmatched = c("error", "drop"),
+             suffix = c("", ".df")) %>%
     mutate(
       across(
         all_of(object$columns),
diff --git a/tests/testthat/test-population_scaling.R b/tests/testthat/test-population_scaling.R
index 1f356c0cc..9de0c7d6d 100644
--- a/tests/testthat/test-population_scaling.R
+++ b/tests/testthat/test-population_scaling.R
@@ -304,6 +304,199 @@ test_that("test joining by default columns", {
 })
 
 
+test_that("test joining by default columns with less common keys/classes", {
+  # Make a model spec that expects no predictor columns and outputs a fixed
+  # (rate) prediction. Based on combining two linear inequalities.
+  fixed_rate_prediction <- 2e-6
+  model_spec <- quantile_reg(quantile_levels = 0.5, method = "fnc") %>%
+    set_engine(
+      "rq",
+      R = matrix(c(1, -1), 2, 1), r = c(1, -1) * fixed_rate_prediction,
+      eps = fixed_rate_prediction * 1e-6 # prevent early stop
+    )
+
+  # Here's the typical setup
+  dat1 <- tibble::tibble(geo_value = 1:2, time_value = 1, y = c(3 * 5, 7 * 11)) %>%
+    as_epi_df()
+  pop1 <- tibble::tibble(geo_value = 1:2, population = c(5e6, 11e6))
+  ewf1 <- epi_workflow(
+    epi_recipe(dat1) %>%
+      step_population_scaling(y, df = pop1, df_pop_col = "population") %>%
+      step_epi_ahead(y_scaled, ahead = 0),
+    model_spec,
+    frosting() %>%
+      layer_predict() %>%
+      layer_population_scaling(.pred, df = pop1, df_pop_col = "population", create_new = FALSE)
+  )
+  expect_equal(
+    extract_recipe(ewf1, estimated = FALSE) %>%
+      prep(dat1) %>%
+      bake(new_data = NULL),
+    dat1 %>%
+      mutate(y_scaled = c(3e-6, 7e-6), ahead_0_y_scaled = y_scaled)
+  )
+  expect_equal(
+    forecast(fit(ewf1, dat1)) %>%
+      pivot_quantiles_wider(.pred),
+    dat1 %>%
+      select(!"y") %>%
+      as_tibble() %>%
+      mutate(`0.5` = c(2 * 5, 2 * 11))
+  )
+
+  # With geo x age in time series but only geo in population data:
+  dat1b <- dat1 %>%
+    as_tibble() %>%
+    mutate(age_group = geo_value, geo_value = 1) %>%
+    as_epi_df(other_keys = "age_group")
+  pop1b <- pop1
+  ewf1b <- epi_workflow(
+    epi_recipe(dat1b) %>%
+      step_population_scaling(y, df = pop1b, df_pop_col = "population") %>%
+      step_epi_ahead(y_scaled, ahead = 0),
+    model_spec,
+    frosting() %>%
+      layer_predict() %>%
+      layer_population_scaling(.pred, df = pop1b, df_pop_col = "population", create_new = FALSE)
+  )
+  expect_warning(
+    expect_equal(
+      extract_recipe(ewf1b, estimated = FALSE) %>%
+        prep(dat1b) %>%
+        bake(new_data = NULL),
+      dat1b %>%
+        # geo 1 scaling used for both:
+        mutate(y_scaled = c(3e-6, 7 * 11 / 5e6), ahead_0_y_scaled = y_scaled)
+    ),
+    class = "epipredict__step_population_scaling__default_by_missing_suggested_keys"
+  )
+  expect_warning(
+    expect_warning(
+      expect_equal(
+        forecast(fit(ewf1b, dat1b)) %>%
+          pivot_quantiles_wider(.pred),
+        dat1b %>%
+          select(!"y") %>%
+          as_tibble() %>%
+          # geo 1 scaling used for both:
+          mutate(`0.5` = c(2 * 5, 2 * 5))
+      ),
+      class = "epipredict__step_population_scaling__default_by_missing_suggested_keys"
+    ),
+    class = "epipredict__layer_population_scaling__default_by_missing_suggested_keys"
+  )
+
+  # With geo x age_group breakdown on both:
+  dat2 <- dat1 %>%
+    as_tibble() %>%
+    mutate(age_group = geo_value, geo_value = 1) %>%
+    as_epi_df(other_keys = "age_group")
+  pop2 <- pop1 %>%
+    mutate(age_group = geo_value, geo_value = 1)
+  ewf2 <- epi_workflow(
+    epi_recipe(dat2) %>%
+      step_population_scaling(y, df = pop2, df_pop_col = "population") %>%
+      step_epi_ahead(y_scaled, ahead = 0),
+    model_spec,
+    frosting() %>%
+      layer_predict() %>%
+      layer_population_scaling(.pred, df = pop2, df_pop_col = "population", create_new = FALSE)
+  )
+  expect_equal(
+    extract_recipe(ewf2, estimated = FALSE) %>%
+      prep(dat2) %>%
+      bake(new_data = NULL),
+    dat2 %>%
+      mutate(y_scaled = c(3e-6, 7e-6), ahead_0_y_scaled = y_scaled)
+  )
+  expect_equal(
+    forecast(fit(ewf2, dat2)) %>%
+      pivot_quantiles_wider(.pred),
+    dat2 %>%
+      select(!"y") %>%
+      as_tibble() %>%
+      mutate(`0.5` = c(2 * 5, 2 * 11))
+  )
+
+  # With only an age column in population data:
+  dat2b <- dat2
+  pop2b <- pop1 %>%
+    mutate(age_group = geo_value, geo_value = NULL)
+  ewf2b <- epi_workflow(
+    epi_recipe(dat2b) %>%
+      step_population_scaling(y, df = pop2b, df_pop_col = "population") %>%
+      step_epi_ahead(y_scaled, ahead = 0),
+    model_spec,
+    frosting() %>%
+      layer_predict() %>%
+      layer_population_scaling(.pred, df = pop2b, df_pop_col = "population", create_new = FALSE)
+  )
+  expect_warning(
+    expect_equal(
+      extract_recipe(ewf2b, estimated = FALSE) %>%
+        prep(dat2b) %>%
+        bake(new_data = NULL),
+      dat2b %>%
+        mutate(y_scaled = c(3e-6, 7e-6), ahead_0_y_scaled = y_scaled)
+    ),
+    class = "epipredict__step_population_scaling__default_by_missing_suggested_keys"
+  )
+  expect_warning(
+    expect_warning(
+      expect_equal(
+        forecast(fit(ewf2b, dat2b)) %>%
+          pivot_quantiles_wider(.pred),
+        dat2b %>%
+          select(!"y") %>%
+          as_tibble() %>%
+          mutate(`0.5` = c(2 * 5, 2 * 11))
+      ),
+      class = "epipredict__step_population_scaling__default_by_missing_suggested_keys"
+    ),
+    class = "epipredict__layer_population_scaling__default_by_missing_suggested_keys"
+  )
+
+  # with geo x time_value breakdown instead:
+  dat3 <- dat1 %>%
+    as_tibble() %>%
+    mutate(time_value = geo_value, geo_value = 1) %>%
+    as_epi_df()
+  pop3 <- pop1 %>%
+    mutate(time_value = geo_value, geo_value = 1)
+  ewf3 <- epi_workflow(
+    epi_recipe(dat3) %>%
+      step_population_scaling(y, df = pop3, df_pop_col = "population") %>%
+      step_epi_ahead(y_scaled, ahead = 0),
+    model_spec,
+    frosting() %>%
+      layer_predict() %>%
+      layer_population_scaling(.pred, df = pop3, df_pop_col = "population", create_new = FALSE)
+  )
+  expect_equal(
+    extract_recipe(ewf3, estimated = FALSE) %>%
+      prep(dat3) %>%
+      bake(new_data = NULL),
+    dat3 %>%
+      mutate(y_scaled = c(3e-6, 7e-6), ahead_0_y_scaled = y_scaled)
+  )
+  expect_equal(
+    forecast(fit(ewf3, dat3)) %>%
+      pivot_quantiles_wider(.pred),
+    # slightly edited copy-pasta due to test time selection:
+    dat3 %>%
+      select(!"y") %>%
+      as_tibble() %>%
+      slice_max(by = geo_value, time_value) %>%
+      mutate(`0.5` = 2 * 11)
+  )
+
+  # TODO non-`epi_df` scaling?
+
+  # TODO multikey scaling?
+
+})
+
+
 test_that("expect error if `by` selector does not match", {
   jhu <- covid_case_death_rates %>%
     dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ca", "ny")) %>%

From 29a320458d7025fcbac7499f6b9703c281c6c854 Mon Sep 17 00:00:00 2001
From: "Logan C. Brooks" <lcbrooks+github@andrew.cmu.edu>
Date: Tue, 29 Oct 2024 11:04:16 -0700
Subject: [PATCH 06/12] Move validation to prep and make it rely on roles

- Roles should already factor in key_colnames.
---
 NAMESPACE                                |  2 +
 R/epipredict-package.R                   |  2 +
 R/layer_population_scaling.R             |  4 +-
 R/step_population_scaling.R              | 59 +++++++++++++++++-------
 tests/testthat/test-population_scaling.R | 39 ++++++++++++++++
 5 files changed, 87 insertions(+), 19 deletions(-)

diff --git a/NAMESPACE b/NAMESPACE
index 5ee13f730..e7e720f11 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -242,6 +242,7 @@ importFrom(dplyr,filter)
 importFrom(dplyr,full_join)
 importFrom(dplyr,group_by)
 importFrom(dplyr,group_by_at)
+importFrom(dplyr,inner_join)
 importFrom(dplyr,join_by)
 importFrom(dplyr,left_join)
 importFrom(dplyr,mutate)
@@ -273,6 +274,7 @@ importFrom(hardhat,extract_recipe)
 importFrom(hardhat,refresh_blueprint)
 importFrom(hardhat,run_mold)
 importFrom(magrittr,"%>%")
+importFrom(magrittr,extract2)
 importFrom(recipes,bake)
 importFrom(recipes,detect_step)
 importFrom(recipes,prep)
diff --git a/R/epipredict-package.R b/R/epipredict-package.R
index 3dee263e2..b4b9973bf 100644
--- a/R/epipredict-package.R
+++ b/R/epipredict-package.R
@@ -7,7 +7,9 @@
 #' @importFrom cli cli_abort cli_warn
 #' @importFrom dplyr arrange across all_of any_of bind_cols bind_rows group_by
 #' @importFrom dplyr full_join relocate summarise everything
+#' @importFrom dplyr inner_join
 #' @importFrom dplyr summarize filter mutate select left_join rename ungroup
+#' @importFrom magrittr extract2
 #' @importFrom rlang := !! %||% as_function global_env set_names !!! caller_arg
 #' @importFrom rlang is_logical is_true inject enquo enquos expr sym arg_match
 #' @importFrom stats poly predict lm residuals quantile
diff --git a/R/layer_population_scaling.R b/R/layer_population_scaling.R
index 741a5adf8..5a982d344 100644
--- a/R/layer_population_scaling.R
+++ b/R/layer_population_scaling.R
@@ -136,8 +136,8 @@ slather.layer_population_scaling <-
     rlang::check_dots_empty()
 
     if (is.null(object$by)) {
-      # Assume `layer_predict` has calculated the prediction keys and other layers
-      # don't change the prediction key colnames:
+      # Assume `layer_predict` has calculated the prediction keys and other
+      # layers don't change the prediction key colnames:
       prediction_key_colnames <- names(components$keys)
       lhs_potential_keys <- prediction_key_colnames
       rhs_potential_keys <- colnames(select(object$df, !object$df_pop_col))
diff --git a/R/step_population_scaling.R b/R/step_population_scaling.R
index 901d80867..fe07c2808 100644
--- a/R/step_population_scaling.R
+++ b/R/step_population_scaling.R
@@ -138,6 +138,42 @@ step_population_scaling_new <-
 
 #' @export
 prep.step_population_scaling <- function(x, training, info = NULL, ...) {
+  hardhat::validate_column_names(x$df, x$df_pop_col)
+  if (is.null(x$by)) {
+    rhs_potential_keys <- setdiff(colnames(x$df), x$df_pop_col)
+    lhs_potential_keys <- info %>%
+      filter(role %in% c("geo_value", "key", "time_value")) %>%
+      extract2("variable") %>%
+      unique() # in case of weird var with multiple of above roles
+    if (length(lhs_potential_keys) == 0L) {
+      # We're working with a recipe and tibble, and *_role hasn't set up any of
+      # the above roles. Let's say any column could actually act as a key, and
+      # lean on `intersect` below to make this something reasonable.
+      lhs_potential_keys <- names(training)
+    }
+    suggested_min_keys <- info %>%
+      filter(role %in% c("geo_value", "key")) %>%
+      extract2("variable") %>%
+      unique()
+    # (0 suggested keys if we weren't given any epikeytime var info.)
+    x$by <- intersect(lhs_potential_keys, rhs_potential_keys)
+    if (length(x$by) == 0L) {
+      cli_stop(c(
+        "Couldn't guess a default for `by`",
+        ">" = "Please rename columns in your population data to match those in your training data,
+               or manually specify `by =` in `step_population_scaling()`."
+      ), class = "epipredict__step_population_scaling__default_by_no_intersection")
+    }
+    if (!all(suggested_min_keys %in% x$by)) {
+      cli_warn(c(
+        "Couldn't find {setdiff(suggested_min_keys, x$by)} in population `df`.",
+        "i" = "Defaulting to join by {x$by}.",
+        ">" = "Double-check whether column names on the population `df` match those for your time series.",
+        ">" = "Consider using population data with breakdowns by {suggested_min_keys}.",
+        ">" = "Manually specify `by =` to silence."
+      ), class = "epipredict__step_population_scaling__default_by_missing_suggested_keys")
+    }
+  }
   step_population_scaling_new(
     terms = x$terms,
     role = x$role,
@@ -157,23 +193,12 @@ prep.step_population_scaling <- function(x, training, info = NULL, ...) {
 #' @export
 bake.step_population_scaling <- function(object, new_data, ...) {
   if (is.null(object$by)) {
-    rhs_potential_keys <- colnames(select(object$df, !object$df_pop_col))
-    if (is_epi_df(new_data)) {
-      lhs_potential_keys <- key_colnames(new_data)
-      object$by <- intersect(lhs_potential_keys, rhs_potential_keys)
-      suggested_min_keys <- kill_time_value(lhs_potential_keys)
-      if (!all(suggested_min_keys %in% object$by)) {
-        cli_warn(c(
-          "Couldn't find {setdiff(suggested_min_keys, object$by)} in population `df`",
-          "i" = "Defaulting to join by {object$by}",
-          ">" = "Double-check whether column names on the population `df` match those for your time series",
-          ">" = "Consider using population data with breakdowns by {suggested_min_keys}",
-          ">" = "Manually specify `by =` to silence"
-        ), class = "epipredict__step_population_scaling__default_by_missing_suggested_keys")
-      }
-    } else {
-      object$by <- intersect(names(new_data), rhs_potential_keys)
-    }
+    cli::cli_abort(c(
+      "`by` was not set and no default was filled in",
+      ">" = "If this was a fit recipe generated from an older version
+             of epipredict that you loaded in from a file,
+             please regenerate with the current version of epipredict."
+    ))
   }
   joinby <- list(x = names(object$by) %||% object$by, y = object$by)
   hardhat::validate_column_names(new_data, joinby$x)
diff --git a/tests/testthat/test-population_scaling.R b/tests/testthat/test-population_scaling.R
index 9de0c7d6d..0a16dff6f 100644
--- a/tests/testthat/test-population_scaling.R
+++ b/tests/testthat/test-population_scaling.R
@@ -386,6 +386,45 @@ test_that("test joining by default columns with less common keys/classes", {
     class = "epipredict__layer_population_scaling__default_by_missing_suggested_keys"
   )
 
+  # Same thing but with time series in tibble:
+  dat1bb <- dat1 %>%
+    as_tibble() %>%
+    mutate(age_group = geo_value, geo_value = 1)
+  pop1bb <- pop1b
+  ewf1bb <- epi_workflow(
+    # Can't use epi_recipe or step_epi_ahead; adjust.
+    recipe(dat1bb) %>%
+    update_role("geo_value", new_role = "geo_value") %>%
+    update_role("age_group", new_role = "key") %>%
+    update_role("time_value", new_role = "time_value") %>%
+    step_population_scaling(y, df = pop1bb, df_pop_col = "population", role = "outcome") %>%
+      # XXX key_colnames inference differs at fit vs. predict time, so we also
+      # need to manually provide some key role settings to not have trouble at
+      # predict time.
+   {.},
+    model_spec,
+    frosting() %>%
+      layer_predict() %>%
+      layer_population_scaling(.pred, df = pop1bb, df_pop_col = "population", create_new = FALSE)
+  )
+  expect_equal(
+    extract_recipe(ewf1bb, estimated = FALSE) %>%
+      prep(dat1bb) %>%
+      bake(new_data = NULL),
+    dat1bb %>%
+      # geo 1 scaling used for both:
+      mutate(y_scaled = c(3e-6, 7 * 11 / 5e6))
+  )
+  expect_equal(
+    predict(fit(ewf1bb, dat1bb), dat1bb) %>%
+      pivot_quantiles_wider(.pred),
+    dat1bb %>%
+      select(!"y") %>%
+      as_tibble() %>%
+      # geo 1 scaling used for both:
+      mutate(`0.5` = c(2 * 5, 2 * 5))
+  )
+
   # With geo x age_group breakdown on both:
   dat2 <- dat1 %>%
     as_tibble() %>%

From 194735fd992b7ad0b55d27b5ed384537c16dab9b Mon Sep 17 00:00:00 2001
From: "Logan C. Brooks" <lcbrooks+github@andrew.cmu.edu>
Date: Mon, 28 Oct 2024 05:06:37 -0700
Subject: [PATCH 07/12] population_scaling: do more validation on creation

---
 R/step_population_scaling.R | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/R/step_population_scaling.R b/R/step_population_scaling.R
index fe07c2808..d7b7893e3 100644
--- a/R/step_population_scaling.R
+++ b/R/step_population_scaling.R
@@ -89,13 +89,25 @@ step_population_scaling <-
            suffix = "_scaled",
            skip = FALSE,
            id = rand_id("population_scaling")) {
-    arg_is_scalar(role, df_pop_col, rate_rescaling, create_new, suffix, id)
-    arg_is_lgl(create_new, skip)
-    arg_is_chr(df_pop_col, suffix, id)
+    if (rlang::dots_n(...) == 0L) {
+      cli_abort(c(
+        "`...` must not be empty.",
+        ">" = "Please provide one or more tidyselect expressions in `...`
+               specifying the columns to which scaling should be applied.",
+        ">" = "If you really want to list `step_population_scaling` in your
+               recipe but not have it do anything, you can use a tidyselection
+               that selects zero variables, such as `c()`."
+      ))
+    }
+    arg_is_scalar(role, df_pop_col, rate_rescaling, create_new, suffix, skip, id)
+    arg_is_chr(role, df_pop_col, suffix, id)
+    hardhat::validate_column_names(df, df_pop_col)
     arg_is_chr(by, allow_null = TRUE)
+    arg_is_numeric(rate_rescaling)
     if (rate_rescaling <= 0) {
       cli_abort("`rate_rescaling` must be a positive number.")
     }
+    arg_is_lgl(create_new, skip)
 
     recipes::add_step(
       recipe,
@@ -138,7 +150,6 @@ step_population_scaling_new <-
 
 #' @export
 prep.step_population_scaling <- function(x, training, info = NULL, ...) {
-  hardhat::validate_column_names(x$df, x$df_pop_col)
   if (is.null(x$by)) {
     rhs_potential_keys <- setdiff(colnames(x$df), x$df_pop_col)
     lhs_potential_keys <- info %>%

From 7090cf0580a37a57b676943409dd3c9df5ea5bae Mon Sep 17 00:00:00 2001
From: "Logan C. Brooks" <lcbrooks+github@andrew.cmu.edu>
Date: Mon, 28 Oct 2024 15:21:01 -0700
Subject: [PATCH 08/12] Adjust message on `grab_forged_keys` mismatches to
 apply to testing situation

---
 R/utils-misc.R                           | 49 ++++++++++++---
 tests/testthat/test-population_scaling.R | 79 ++++++++++++++++++------
 2 files changed, 100 insertions(+), 28 deletions(-)

diff --git a/R/utils-misc.R b/R/utils-misc.R
index 7f1eaf843..7ab4ad953 100644
--- a/R/utils-misc.R
+++ b/R/utils-misc.R
@@ -31,26 +31,59 @@ check_pname <- function(res, preds, object, newname = NULL) {
   res
 }
 
+# Copied from `epiprocess`:
+
+#' "Format" a character vector of column/variable names for cli interpolation
+#'
+#' Designed to give good output if interpolated with cli. Main purpose is to add
+#' backticks around variable names when necessary, and something other than an
+#' empty string if length 0.
+#'
+#' @param x `chr`; e.g., `colnames` of some data frame
+#' @param empty string; what should be output if `x` is of length 0?
+#' @return `chr`
+#' @keywords internal
+format_varnames <- function(x, empty = "*none*") {
+  if (length(x) == 0L) {
+    empty
+  } else {
+    as.character(syms(x))
+  }
+}
 
 grab_forged_keys <- function(forged, workflow, new_data) {
+  # 1. keys in the training data post-prep, based on roles:
+  old_keys <- key_colnames(workflow)
+  # 3. keys in the test data post-bake, based on roles:
   forged_roles <- names(forged$extras$roles)
   extras <- bind_cols(forged$extras$roles[forged_roles %in% c("geo_value", "time_value", "key")])
-  # 1. these are the keys in the test data after prep/bake
   new_keys <- names(extras)
-  # 2. these are the keys in the training data
-  old_keys <- key_colnames(workflow)
-  # 3. these are the keys in the test data as input
+  if (length(new_keys) == 0L) {
+    # No epikeytime role assignment; infer from all columns:
+    potential_keys <- c("geo_value", "time_value")
+    new_keys <- potential_keys[potential_keys %in% names(bind_cols(forged$extras$roles))]
+  }
+  # 2. keys in the test data pre-bake based on data structure + post-bake roles:
   new_df_keys <- key_colnames(new_data, other_keys = setdiff(new_keys, c("geo_value", "time_value")))
-  if (!(setequal(old_keys, new_df_keys) && setequal(new_keys, new_df_keys))) {
-    cli_warn(paste(
-      "Not all epi keys that were present in the training data are available",
-      "in `new_data`. Predictions will have only the available keys."
+  # Softly validate, assuming that no steps change epikeytime role assignments:
+  if (!(setequal(old_keys, new_df_keys) && setequal(new_df_keys, new_keys))) {
+    cli_warn(c(
+      "Inconsistent epikeytime identifier columns specified/inferred.",
+      "i" = "training epikeytime columns, based on roles post-mold/prep: {format_varnames(old_keys)}",
+      "i" = " testing epikeytime columns, based on data structure pre-bake and roles post-forge/bake: {format_varnames(new_df_keys)}",
+      "i" = " testing epikeytime columns, based on roles post-forge/bake: {format_varnames(new_keys)}",
+      "*" = "Keys will be set giving preference to test-time `epi_df` metadata followed by test-time
+             post-bake role settings.",
+      ">" = 'Some mismatches can be addressed by using `epi_df`s instead of tibbles, or by using `update_role`
+             to assign pre-`prep` columns the "geo_value", "key", and "time_value" roles.'
     ))
   }
   if (is_epi_df(new_data)) {
+    # Inference based on test data pre-bake data structure "wins":
     meta <- attr(new_data, "metadata")
     extras <- as_epi_df(extras, as_of = meta$as_of, other_keys = meta$other_keys)
   } else if (all(c("geo_value", "time_value") %in% new_keys)) {
+    # Inference based on test data post-bake roles "wins":
     other_keys <- new_keys[!new_keys %in% c("geo_value", "time_value")]
     extras <- as_epi_df(extras, other_keys = other_keys)
   }
diff --git a/tests/testthat/test-population_scaling.R b/tests/testthat/test-population_scaling.R
index 0a16dff6f..eabca9f7e 100644
--- a/tests/testthat/test-population_scaling.R
+++ b/tests/testthat/test-population_scaling.R
@@ -386,43 +386,82 @@ test_that("test joining by default columns with less common keys/classes", {
     class = "epipredict__layer_population_scaling__default_by_missing_suggested_keys"
   )
 
-  # Same thing but with time series in tibble:
-  dat1bb <- dat1 %>%
+  # Same thing but with time series in tibble, but with role hints:
+  dat1b2 <- dat1 %>%
     as_tibble() %>%
     mutate(age_group = geo_value, geo_value = 1)
-  pop1bb <- pop1b
-  ewf1bb <- epi_workflow(
+  pop1b2 <- pop1b
+  ewf1b2 <- epi_workflow(
     # Can't use epi_recipe or step_epi_ahead; adjust.
-    recipe(dat1bb) %>%
-    update_role("geo_value", new_role = "geo_value") %>%
-    update_role("age_group", new_role = "key") %>%
-    update_role("time_value", new_role = "time_value") %>%
-    step_population_scaling(y, df = pop1bb, df_pop_col = "population", role = "outcome") %>%
-      # XXX key_colnames inference differs at fit vs. predict time, so we also
-      # need to manually provide some key role settings to not have trouble at
-      # predict time.
-   {.},
+    recipe(dat1b2) %>%
+      update_role("geo_value", new_role = "geo_value") %>%
+      update_role("age_group", new_role = "key") %>%
+      update_role("time_value", new_role = "time_value") %>%
+      step_population_scaling(y, df = pop1b2, df_pop_col = "population", role = "outcome") %>%
+      {.},
     model_spec,
     frosting() %>%
       layer_predict() %>%
-      layer_population_scaling(.pred, df = pop1bb, df_pop_col = "population", create_new = FALSE)
+      layer_population_scaling(.pred, df = pop1b2, df_pop_col = "population", create_new = FALSE)
+  )
+  expect_warning(
+    expect_equal(
+      extract_recipe(ewf1b2, estimated = FALSE) %>%
+        prep(dat1b2) %>%
+        bake(new_data = NULL),
+      dat1b2 %>%
+        # geo 1 scaling used for both:
+        mutate(y_scaled = c(3e-6, 7 * 11 / 5e6))
+    ),
+    class = "epipredict__step_population_scaling__default_by_missing_suggested_keys"
+  )
+  expect_warning(
+    expect_warning(
+      expect_equal(
+        predict(fit(ewf1b2, dat1b2), dat1b2) %>%
+          pivot_quantiles_wider(.pred),
+        dat1b2 %>%
+          select(!"y") %>%
+          as_tibble() %>%
+          # geo 1 scaling used for both:
+          mutate(`0.5` = c(2 * 5, 2 * 5)) %>%
+          select(geo_value, age_group, time_value, `0.5`)
+      ),
+      class = "epipredict__step_population_scaling__default_by_missing_suggested_keys"
+    ),
+    class = "epipredict__layer_population_scaling__default_by_missing_suggested_keys"
+  )
+
+  # Same thing but with time series in tibble, but no role hints -> different inference&messaging:
+  dat1b3 <- dat1b2
+  pop1b3 <- pop1b2
+  ewf1b3 <- epi_workflow(
+    # Can't use epi_recipe or step_epi_ahead; adjust.
+    recipe(dat1b3) %>%
+      step_population_scaling(y, df = pop1b3, df_pop_col = "population", role = "outcome") %>%
+      {.},
+    model_spec,
+    frosting() %>%
+      layer_predict() %>%
+      layer_population_scaling(.pred, df = pop1b3, df_pop_col = "population", create_new = FALSE)
   )
   expect_equal(
-    extract_recipe(ewf1bb, estimated = FALSE) %>%
-      prep(dat1bb) %>%
+    extract_recipe(ewf1b3, estimated = FALSE) %>%
+      prep(dat1b3) %>%
       bake(new_data = NULL),
-    dat1bb %>%
+    dat1b3 %>%
       # geo 1 scaling used for both:
       mutate(y_scaled = c(3e-6, 7 * 11 / 5e6))
   )
   expect_equal(
-    predict(fit(ewf1bb, dat1bb), dat1bb) %>%
+    predict(fit(ewf1b3, dat1b3), dat1b3) %>%
       pivot_quantiles_wider(.pred),
-    dat1bb %>%
+    dat1b3 %>%
       select(!"y") %>%
       as_tibble() %>%
       # geo 1 scaling used for both:
-      mutate(`0.5` = c(2 * 5, 2 * 5))
+      mutate(`0.5` = c(2 * 5, 2 * 5)) %>%
+      select(geo_value, age_group, time_value, `0.5`)
   )
 
   # With geo x age_group breakdown on both:

From 2e307577fb0c611a22d11806898483fd2795b194 Mon Sep 17 00:00:00 2001
From: "Logan C. Brooks" <lcbrooks+github@andrew.cmu.edu>
Date: Mon, 28 Oct 2024 17:08:51 -0700
Subject: [PATCH 09/12] Make key inference more consistent; allow non-`epi_df`
 forged data

---
 R/key_colnames.R                         | 13 +++--
 R/layer_population_scaling.R             |  3 +-
 R/step_population_scaling.R              | 10 ++--
 R/utils-misc.R                           | 61 ++++++++++++++----------
 tests/testthat/test-population_scaling.R | 20 ++++----
 5 files changed, 62 insertions(+), 45 deletions(-)

diff --git a/R/key_colnames.R b/R/key_colnames.R
index b8d07ce82..9e0d44dcf 100644
--- a/R/key_colnames.R
+++ b/R/key_colnames.R
@@ -11,11 +11,14 @@ key_colnames.recipe <- function(x, ..., exclude = character()) {
 key_colnames.epi_workflow <- function(x, ..., exclude = character()) {
   # safer to look at the mold than the preprocessor
   mold <- hardhat::extract_mold(x)
-  molded_names <- names(mold$extras$roles)
-  geo_key <- names(mold$extras$roles[molded_names %in% "geo_value"]$geo_value)
-  time_key <- names(mold$extras$roles[molded_names %in% "time_value"]$time_value)
-  keys <- names(mold$extras$roles[molded_names %in% "key"]$key)
-  full_key <- c(geo_key, keys, time_key) %||% character(0L)
+  molded_roles <- mold$extras$roles
+  extras <- bind_cols(molded_roles$geo_value, molded_roles$key, molded_roles$time_value)
+  full_key <- names(extras)
+  if (length(full_key) == 0L) {
+    # No epikeytime role assignment; infer from all columns:
+    potential_keys <- c("geo_value", "time_value")
+    full_key <- potential_keys[potential_keys %in% names(bind_cols(molded_roles))]
+  }
   full_key[!full_key %in% exclude]
 }
 
diff --git a/R/layer_population_scaling.R b/R/layer_population_scaling.R
index 5a982d344..f47e3f292 100644
--- a/R/layer_population_scaling.R
+++ b/R/layer_population_scaling.R
@@ -145,7 +145,8 @@ slather.layer_population_scaling <-
       suggested_min_keys <- kill_time_value(lhs_potential_keys)
       if (!all(suggested_min_keys %in% object$by)) {
         cli_warn(c(
-          "Couldn't find {setdiff(suggested_min_keys, object$by)} in population `df`",
+          "{setdiff(suggested_min_keys, object$by)} {?was an/were} epikey column{?s} in the predictions,
+           but {?wasn't/weren't} found in the population `df`.",
           "i" = "Defaulting to join by {object$by}",
           ">" = "Double-check whether column names on the population `df` match those expected in your predictions",
           ">" = "Consider using population data with breakdowns by {suggested_min_keys}",
diff --git a/R/step_population_scaling.R b/R/step_population_scaling.R
index d7b7893e3..297c30727 100644
--- a/R/step_population_scaling.R
+++ b/R/step_population_scaling.R
@@ -177,9 +177,10 @@ prep.step_population_scaling <- function(x, training, info = NULL, ...) {
     }
     if (!all(suggested_min_keys %in% x$by)) {
       cli_warn(c(
-        "Couldn't find {setdiff(suggested_min_keys, x$by)} in population `df`.",
+        "{setdiff(suggested_min_keys, x$by)} {?was an/were} epikey column{?s} in the training data,
+         but {?wasn't/weren't} found in the population `df`.",
         "i" = "Defaulting to join by {x$by}.",
-        ">" = "Double-check whether column names on the population `df` match those for your time series.",
+        ">" = "Double-check whether column names on the population `df` match those for your training data.",
         ">" = "Consider using population data with breakdowns by {suggested_min_keys}.",
         ">" = "Manually specify `by =` to silence."
       ), class = "epipredict__step_population_scaling__default_by_missing_suggested_keys")
@@ -229,8 +230,9 @@ bake.step_population_scaling <- function(object, new_data, ...) {
   col_to_remove <- setdiff(colnames(object$df), colnames(new_data))
 
   inner_join(new_data, object$df,
-             by = object$by, relationship = "many-to-one", unmatched = c("error", "drop"),
-             suffix = c("", ".df")) %>%
+    by = object$by, relationship = "many-to-one", unmatched = c("error", "drop"),
+    suffix = c("", ".df")
+  ) %>%
     mutate(
       across(
         all_of(object$columns),
diff --git a/R/utils-misc.R b/R/utils-misc.R
index 7ab4ad953..fec707913 100644
--- a/R/utils-misc.R
+++ b/R/utils-misc.R
@@ -54,40 +54,53 @@ format_varnames <- function(x, empty = "*none*") {
 grab_forged_keys <- function(forged, workflow, new_data) {
   # 1. keys in the training data post-prep, based on roles:
   old_keys <- key_colnames(workflow)
-  # 3. keys in the test data post-bake, based on roles:
-  forged_roles <- names(forged$extras$roles)
-  extras <- bind_cols(forged$extras$roles[forged_roles %in% c("geo_value", "time_value", "key")])
-  new_keys <- names(extras)
+  # 2. keys in the test data post-bake, based on roles & structure:
+  forged_roles <- forged$extras$roles
+  new_key_tbl <- bind_cols(forged_roles$geo_value, forged_roles$key, forged_roles$time_value)
+  new_keys <- names(new_key_tbl)
   if (length(new_keys) == 0L) {
     # No epikeytime role assignment; infer from all columns:
-    potential_keys <- c("geo_value", "time_value")
-    new_keys <- potential_keys[potential_keys %in% names(bind_cols(forged$extras$roles))]
+    potential_new_keys <- c("geo_value", "time_value")
+    forged_tbl <- bind_cols(forged$extras$roles)
+    new_keys <- potential_new_keys[potential_new_keys %in% names(forged_tbl)]
+    new_key_tbl <- forged_tbl[new_keys]
   }
-  # 2. keys in the test data pre-bake based on data structure + post-bake roles:
-  new_df_keys <- key_colnames(new_data, other_keys = setdiff(new_keys, c("geo_value", "time_value")))
-  # Softly validate, assuming that no steps change epikeytime role assignments:
-  if (!(setequal(old_keys, new_df_keys) && setequal(new_df_keys, new_keys))) {
+  # Softly validate:
+  if (!(setequal(old_keys, new_keys))) {
     cli_warn(c(
-      "Inconsistent epikeytime identifier columns specified/inferred.",
+      "Inconsistent epikeytime identifier columns specified/inferred in training vs. in testing data.",
       "i" = "training epikeytime columns, based on roles post-mold/prep: {format_varnames(old_keys)}",
-      "i" = " testing epikeytime columns, based on data structure pre-bake and roles post-forge/bake: {format_varnames(new_df_keys)}",
-      "i" = " testing epikeytime columns, based on roles post-forge/bake: {format_varnames(new_keys)}",
-      "*" = "Keys will be set giving preference to test-time `epi_df` metadata followed by test-time
-             post-bake role settings.",
+      "i" = "testing epikeytime columns, based on roles post-forge/bake: {format_varnames(new_keys)}",
+      "*" = "",
       ">" = 'Some mismatches can be addressed by using `epi_df`s instead of tibbles, or by using `update_role`
              to assign pre-`prep` columns the "geo_value", "key", and "time_value" roles.'
     ))
   }
-  if (is_epi_df(new_data)) {
-    # Inference based on test data pre-bake data structure "wins":
-    meta <- attr(new_data, "metadata")
-    extras <- as_epi_df(extras, as_of = meta$as_of, other_keys = meta$other_keys)
-  } else if (all(c("geo_value", "time_value") %in% new_keys)) {
-    # Inference based on test data post-bake roles "wins":
-    other_keys <- new_keys[!new_keys %in% c("geo_value", "time_value")]
-    extras <- as_epi_df(extras, other_keys = other_keys)
+  # Convert `new_key_tbl` to `epi_df` if not renaming columns nor violating
+  # `epi_df` invariants.  Require that our key is a unique key in any case.
+  if (all(c("geo_value", "time_value") %in% new_keys)) {
+    maybe_as_of <- attr(new_data, "metadata")$as_of # NULL if wasn't epi_df
+    try(return(as_epi_df(new_key_tbl, other_keys = new_keys, as_of = maybe_as_of)),
+      silent = TRUE
+    )
+  }
+  if (anyDuplicated(new_key_tbl)) {
+    duplicate_key_tbl <- new_key_tbl %>% filter(.by = everything(), dplyr::n() > 1L)
+    error_part1 <- cli::format_error(
+      c(
+        "Specified/inferred key columns had repeated combinations in the forged/baked test data.",
+        "i" = "Key columns: {format_varnames(new_keys)}",
+        "Duplicated keys:"
+      )
+    )
+    error_part2 <- capture.output(print(duplicate_key_tbl))
+    rlang::abort(
+      paste(collapse = "\n", c(error_part1, error_part2)),
+      class = "epipredict__grab_forged_keys__nonunique_key"
+    )
+  } else {
+    return(new_key_tbl)
   }
-  extras
 }
 
 get_parsnip_mode <- function(trainer) {
diff --git a/tests/testthat/test-population_scaling.R b/tests/testthat/test-population_scaling.R
index eabca9f7e..63c23a388 100644
--- a/tests/testthat/test-population_scaling.R
+++ b/tests/testthat/test-population_scaling.R
@@ -398,7 +398,9 @@ test_that("test joining by default columns with less common keys/classes", {
       update_role("age_group", new_role = "key") %>%
       update_role("time_value", new_role = "time_value") %>%
       step_population_scaling(y, df = pop1b2, df_pop_col = "population", role = "outcome") %>%
-      {.},
+      {
+        .
+      },
     model_spec,
     frosting() %>%
       layer_predict() %>%
@@ -432,14 +434,16 @@ test_that("test joining by default columns with less common keys/classes", {
     class = "epipredict__layer_population_scaling__default_by_missing_suggested_keys"
   )
 
-  # Same thing but with time series in tibble, but no role hints -> different inference&messaging:
+  # Same thing but with time series in tibble, but no role hints -> different behavior:
   dat1b3 <- dat1b2
   pop1b3 <- pop1b2
   ewf1b3 <- epi_workflow(
     # Can't use epi_recipe or step_epi_ahead; adjust.
     recipe(dat1b3) %>%
       step_population_scaling(y, df = pop1b3, df_pop_col = "population", role = "outcome") %>%
-      {.},
+      {
+        .
+      },
     model_spec,
     frosting() %>%
       layer_predict() %>%
@@ -453,15 +457,10 @@ test_that("test joining by default columns with less common keys/classes", {
       # geo 1 scaling used for both:
       mutate(y_scaled = c(3e-6, 7 * 11 / 5e6))
   )
-  expect_equal(
+  expect_error(
     predict(fit(ewf1b3, dat1b3), dat1b3) %>%
       pivot_quantiles_wider(.pred),
-    dat1b3 %>%
-      select(!"y") %>%
-      as_tibble() %>%
-      # geo 1 scaling used for both:
-      mutate(`0.5` = c(2 * 5, 2 * 5)) %>%
-      select(geo_value, age_group, time_value, `0.5`)
+    class = "epipredict__grab_forged_keys__nonunique_key"
   )
 
   # With geo x age_group breakdown on both:
@@ -571,7 +570,6 @@ test_that("test joining by default columns with less common keys/classes", {
   # TODO non-`epi_df` scaling?
 
   # TODO multikey scaling?
-
 })
 
 

From 8dae68b464425fbd46edb345463ed9c9c06d326b Mon Sep 17 00:00:00 2001
From: "Logan C. Brooks" <lcbrooks+github@andrew.cmu.edu>
Date: Tue, 29 Oct 2024 10:34:58 -0700
Subject: [PATCH 10/12] Add a renamed-geo_value pop scaling test, minor cleanup

---
 tests/testthat/test-population_scaling.R | 45 +++++++++++++++++-------
 1 file changed, 33 insertions(+), 12 deletions(-)

diff --git a/tests/testthat/test-population_scaling.R b/tests/testthat/test-population_scaling.R
index 63c23a388..6e4cd5df2 100644
--- a/tests/testthat/test-population_scaling.R
+++ b/tests/testthat/test-population_scaling.R
@@ -397,10 +397,7 @@ test_that("test joining by default columns with less common keys/classes", {
       update_role("geo_value", new_role = "geo_value") %>%
       update_role("age_group", new_role = "key") %>%
       update_role("time_value", new_role = "time_value") %>%
-      step_population_scaling(y, df = pop1b2, df_pop_col = "population", role = "outcome") %>%
-      {
-        .
-      },
+      step_population_scaling(y, df = pop1b2, df_pop_col = "population", role = "outcome"),
     model_spec,
     frosting() %>%
       layer_predict() %>%
@@ -420,6 +417,7 @@ test_that("test joining by default columns with less common keys/classes", {
   expect_warning(
     expect_warning(
       expect_equal(
+        # get_test_data doesn't work with non-`epi_df`s, so provide test data manually:
         predict(fit(ewf1b2, dat1b2), dat1b2) %>%
           pivot_quantiles_wider(.pred),
         dat1b2 %>%
@@ -434,16 +432,15 @@ test_that("test joining by default columns with less common keys/classes", {
     class = "epipredict__layer_population_scaling__default_by_missing_suggested_keys"
   )
 
-  # Same thing but with time series in tibble, but no role hints -> different behavior:
+  # Same thing but with time series in tibble, but no role hints -> incorrect
+  # key guess -> we prep without warning, but error when we realize we don't
+  # actually have a unique key for our predictions:
   dat1b3 <- dat1b2
   pop1b3 <- pop1b2
   ewf1b3 <- epi_workflow(
     # Can't use epi_recipe or step_epi_ahead; adjust.
     recipe(dat1b3) %>%
-      step_population_scaling(y, df = pop1b3, df_pop_col = "population", role = "outcome") %>%
-      {
-        .
-      },
+      step_population_scaling(y, df = pop1b3, df_pop_col = "population", role = "outcome"),
     model_spec,
     frosting() %>%
       layer_predict() %>%
@@ -567,9 +564,33 @@ test_that("test joining by default columns with less common keys/classes", {
       mutate(`0.5` = 2 * 11)
   )
 
-  # TODO non-`epi_df` scaling?
-
-  # TODO multikey scaling?
+  # With alternative geo naming... we're able to successfully prep (and we're
+  # not missing a warning as in 1b3 since we're actually in a "correct" setup),
+  # but still like 1b3, we fail at prediction time as key roles have not been
+  # set up and inference fails:
+  dat4 <- dat1 %>% rename(geo = geo_value)
+  pop4 <- pop1 %>% rename(geo = geo_value)
+  ewf4 <- epi_workflow(
+    recipe(dat4) %>%
+      step_population_scaling(y, df = pop4, df_pop_col = "population", role = "outcome"),
+    model_spec,
+    frosting() %>%
+      layer_predict() %>%
+      layer_population_scaling(.pred, df = pop1, df_pop_col = "population", create_new = FALSE)
+  )
+  expect_equal(
+    extract_recipe(ewf4, estimated = FALSE) %>%
+      prep(dat4) %>%
+      bake(new_data = NULL),
+    dat4 %>%
+      mutate(y_scaled = c(3e-6, 7e-6))
+  )
+  expect_error(
+    # get_test_data doesn't work with non-`epi_df`s, so provide test data manually:
+    predict(fit(ewf4, dat4), dat4) %>%
+      pivot_quantiles_wider(.pred),
+    class = "epipredict__grab_forged_keys__nonunique_key"
+  )
 })
 
 

From 3685e67e0dd686e3445590d3285ea515858c4f5c Mon Sep 17 00:00:00 2001
From: "Logan C. Brooks" <lcbrooks+github@andrew.cmu.edu>
Date: Tue, 29 Oct 2024 15:22:47 -0700
Subject: [PATCH 11/12] Update docs, news for `*_population_scaling(by =)` and
 keys updates

---
 DESCRIPTION                     |  2 +-
 NEWS.md                         |  3 +++
 R/layer_population_scaling.R    | 17 +++++++++++------
 R/step_population_scaling.R     | 19 ++++++++++++-------
 man/format_varnames.Rd          | 22 ++++++++++++++++++++++
 man/layer_population_scaling.Rd | 17 +++++++++++------
 man/step_population_scaling.Rd  | 19 ++++++++++++-------
 7 files changed, 72 insertions(+), 27 deletions(-)
 create mode 100644 man/format_varnames.Rd

diff --git a/DESCRIPTION b/DESCRIPTION
index 2ae9d3337..456cdb54f 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: epipredict
 Title: Basic epidemiology forecasting methods
-Version: 0.1.3
+Version: 0.1.4
 Authors@R: c(
     person("Daniel J.", "McDonald", , "daniel@stat.ubc.ca", role = c("aut", "cre")),
     person("Ryan", "Tibshirani", , "ryantibs@cmu.edu", role = "aut"),
diff --git a/NEWS.md b/NEWS.md
index 3fcf4dc0c..8be7d7c0d 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -15,6 +15,9 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.0.x will indicat
 ## Improvements
 
 - Add `step_adjust_latency`, which give several methods to adjust the forecast if the `forecast_date` is after the last day of data.
+- Fix `layer_population_scaling` default `by` with `other_keys`.
+- Make key column inference more consistent within the package and with current `epiprocess`.
+- Fix `quantile_reg()` producing error when asked to output just median-level predictions.
 - (temporary) ahead negative is allowed for `step_epi_ahead` until we have `step_epi_shift`
 
 ## Bug fixes
diff --git a/R/layer_population_scaling.R b/R/layer_population_scaling.R
index f47e3f292..badd073f8 100644
--- a/R/layer_population_scaling.R
+++ b/R/layer_population_scaling.R
@@ -19,12 +19,17 @@
 #'   inverting the existing scaling.
 #' @param by A (possibly named) character vector of variables to join by.
 #'
-#' If `NULL`, the default, the function will perform a natural join, using all
-#' variables in common across the `epi_df` produced by the `predict()` call
-#' and the user-provided dataset.
-#' If columns in that `epi_df` and `df` have the same name (and aren't
-#' included in `by`), `.df` is added to the one from the user-provided data
-#' to disambiguate.
+#' If `NULL`, the default, the function will try to infer a reasonable set of
+#' columns. First, it will try to join by all variables in the test data with
+#' roles `"geo_value"`, `"key"`, or `"time_value"` that also appear in `df`;
+#' these roles are automatically set if you are using an `epi_df`, or you can
+#' use, e.g., `update_role`. If no such roles are set, it will try to perform a
+#' natural join, using variables in common between the training/test data and
+#' population data.
+#'
+#' If columns in the training/testing data and `df` have the same name (and
+#' aren't included in `by`), a `.df` suffix is added to the one from the
+#' user-provided data to disambiguate.
 #'
 #' To join by different variables on the `epi_df` and `df`, use a named vector.
 #' For example, `by = c("geo_value" = "states")` will match `epi_df$geo_value`
diff --git a/R/step_population_scaling.R b/R/step_population_scaling.R
index 297c30727..6047f5c4c 100644
--- a/R/step_population_scaling.R
+++ b/R/step_population_scaling.R
@@ -16,12 +16,17 @@
 #'   inverting the existing scaling.
 #' @param by A (possibly named) character vector of variables to join by.
 #'
-#' If `NULL`, the default, the function will perform a natural join, using all
-#' variables in common across the `epi_df` produced by the `predict()` call
-#' and the user-provided dataset.
-#' If columns in that `epi_df` and `df` have the same name (and aren't
-#' included in `by`), `.df` is added to the one from the user-provided data
-#' to disambiguate.
+#' If `NULL`, the default, the function will try to infer a reasonable set of
+#' columns. First, it will try to join by all variables in the training/test
+#' data with roles `"geo_value"`, `"key"`, or `"time_value"` that also appear in
+#' `df`; these roles are automatically set if you are using an `epi_df`, or you
+#' can use, e.g., `update_role`. If no such roles are set, it will try to
+#' perform a natural join, using variables in common between the training/test
+#' data and population data.
+#'
+#' If columns in the training/testing data and `df` have the same name (and
+#' aren't included in `by`), a `.df` suffix is added to the one from the
+#' user-provided data to disambiguate.
 #'
 #' To join by different variables on the `epi_df` and `df`, use a named vector.
 #' For example, `by = c("geo_value" = "states")` will match `epi_df$geo_value`
@@ -29,7 +34,7 @@
 #' For example, `by = c("geo_value" = "states", "county" = "county")` will match
 #' `epi_df$geo_value` to `df$states` and `epi_df$county` to `df$county`.
 #'
-#' See [dplyr::left_join()] for more details.
+#' See [dplyr::inner_join()] for more details.
 #' @param df_pop_col the name of the column in the data frame `df` that
 #' contains the population data and will be used for scaling.
 #' This should be one column.
diff --git a/man/format_varnames.Rd b/man/format_varnames.Rd
new file mode 100644
index 000000000..59fe218af
--- /dev/null
+++ b/man/format_varnames.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils-misc.R
+\name{format_varnames}
+\alias{format_varnames}
+\title{"Format" a character vector of column/variable names for cli interpolation}
+\usage{
+format_varnames(x, empty = "*none*")
+}
+\arguments{
+\item{x}{\code{chr}; e.g., \code{colnames} of some data frame}
+
+\item{empty}{string; what should be output if \code{x} is of length 0?}
+}
+\value{
+\code{chr}
+}
+\description{
+Designed to give good output if interpolated with cli. Main purpose is to add
+backticks around variable names when necessary, and something other than an
+empty string if length 0.
+}
+\keyword{internal}
diff --git a/man/layer_population_scaling.Rd b/man/layer_population_scaling.Rd
index 3a22dbee5..083dd6b48 100644
--- a/man/layer_population_scaling.Rd
+++ b/man/layer_population_scaling.Rd
@@ -28,12 +28,17 @@ inverting the existing scaling.}
 
 \item{by}{A (possibly named) character vector of variables to join by.
 
-If \code{NULL}, the default, the function will perform a natural join, using all
-variables in common across the \code{epi_df} produced by the \code{predict()} call
-and the user-provided dataset.
-If columns in that \code{epi_df} and \code{df} have the same name (and aren't
-included in \code{by}), \code{.df} is added to the one from the user-provided data
-to disambiguate.
+If \code{NULL}, the default, the function will try to infer a reasonable set of
+columns. First, it will try to join by all variables in the test data with
+roles \code{"geo_value"}, \code{"key"}, or \code{"time_value"} that also appear in \code{df};
+these roles are automatically set if you are using an \code{epi_df}, or you can
+use, e.g., \code{update_role}. If no such roles are set, it will try to perform a
+natural join, using variables in common between the training/test data and
+population data.
+
+If columns in the training/testing data and \code{df} have the same name (and
+aren't included in \code{by}), a \code{.df} suffix is added to the one from the
+user-provided data to disambiguate.
 
 To join by different variables on the \code{epi_df} and \code{df}, use a named vector.
 For example, \code{by = c("geo_value" = "states")} will match \code{epi_df$geo_value}
diff --git a/man/step_population_scaling.Rd b/man/step_population_scaling.Rd
index 6a0c5dea9..c41ec7fcc 100644
--- a/man/step_population_scaling.Rd
+++ b/man/step_population_scaling.Rd
@@ -33,12 +33,17 @@ inverting the existing scaling.}
 
 \item{by}{A (possibly named) character vector of variables to join by.
 
-If \code{NULL}, the default, the function will perform a natural join, using all
-variables in common across the \code{epi_df} produced by the \code{predict()} call
-and the user-provided dataset.
-If columns in that \code{epi_df} and \code{df} have the same name (and aren't
-included in \code{by}), \code{.df} is added to the one from the user-provided data
-to disambiguate.
+If \code{NULL}, the default, the function will try to infer a reasonable set of
+columns. First, it will try to join by all variables in the training/test
+data with roles \code{"geo_value"}, \code{"key"}, or \code{"time_value"} that also appear in
+\code{df}; these roles are automatically set if you are using an \code{epi_df}, or you
+can use, e.g., \code{update_role}. If no such roles are set, it will try to
+perform a natural join, using variables in common between the training/test
+data and population data.
+
+If columns in the training/testing data and \code{df} have the same name (and
+aren't included in \code{by}), a \code{.df} suffix is added to the one from the
+user-provided data to disambiguate.
 
 To join by different variables on the \code{epi_df} and \code{df}, use a named vector.
 For example, \code{by = c("geo_value" = "states")} will match \code{epi_df$geo_value}
@@ -46,7 +51,7 @@ to \code{df$states}. To join by multiple variables, use a vector with length > 1
 For example, \code{by = c("geo_value" = "states", "county" = "county")} will match
 \code{epi_df$geo_value} to \code{df$states} and \code{epi_df$county} to \code{df$county}.
 
-See \code{\link[dplyr:mutate-joins]{dplyr::left_join()}} for more details.}
+See \code{\link[dplyr:mutate-joins]{dplyr::inner_join()}} for more details.}
 
 \item{df_pop_col}{the name of the column in the data frame \code{df} that
 contains the population data and will be used for scaling.

From 202a226cf388b6dd52d4ac87023260e3764eec6c Mon Sep 17 00:00:00 2001
From: Nat DeFries <42820733+nmdefries@users.noreply.github.com>
Date: Wed, 30 Oct 2024 15:51:10 -0700
Subject: [PATCH 12/12] define case_num var even when pulling data from
 epidatasets

---
 vignettes/articles/smooth-qr.Rmd       | 2 +-
 vignettes/articles/symptom-surveys.Rmd | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/vignettes/articles/smooth-qr.Rmd b/vignettes/articles/smooth-qr.Rmd
index 801934e8f..a73ffd221 100644
--- a/vignettes/articles/smooth-qr.Rmd
+++ b/vignettes/articles/smooth-qr.Rmd
@@ -97,7 +97,7 @@ state cases and deaths. This sample data ranges from Dec. 31, 2020 to
 Dec. 31, 2021.
 
 ```{r}
-edf <- covid_case_death_rates
+edf <- epidatasets::covid_case_death_rates
 ```
 
 We will set the forecast date to be November 30, 2021 so that we can produce
diff --git a/vignettes/articles/symptom-surveys.Rmd b/vignettes/articles/symptom-surveys.Rmd
index af692726e..cf8013572 100644
--- a/vignettes/articles/symptom-surveys.Rmd
+++ b/vignettes/articles/symptom-surveys.Rmd
@@ -155,6 +155,7 @@ library(purrr)
 library(epipredict)
 library(recipes)
 
+case_num <- 200
 z <- epidatasets::county_smoothed_cli_comparison
 ```