diff --git a/NAMESPACE b/NAMESPACE
index 86b77716b..2dbdf3178 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -25,6 +25,7 @@ S3method(bake,step_epi_lag)
S3method(bake,step_epi_slide)
S3method(bake,step_growth_rate)
S3method(bake,step_lag_difference)
+S3method(bake,step_pivot_wider)
S3method(bake,step_population_scaling)
S3method(bake,step_training_window)
S3method(detect_layer,frosting)
@@ -65,6 +66,7 @@ S3method(prep,step_epi_lag)
S3method(prep,step_epi_slide)
S3method(prep,step_growth_rate)
S3method(prep,step_lag_difference)
+S3method(prep,step_pivot_wider)
S3method(prep,step_population_scaling)
S3method(prep,step_training_window)
S3method(print,alist)
@@ -96,6 +98,7 @@ S3method(print,step_epi_slide)
S3method(print,step_growth_rate)
S3method(print,step_lag_difference)
S3method(print,step_naomit)
+S3method(print,step_pivot_wider)
S3method(print,step_population_scaling)
S3method(print,step_training_window)
S3method(quantile,dist_quantiles)
@@ -205,6 +208,7 @@ export(step_epi_naomit)
export(step_epi_slide)
export(step_growth_rate)
export(step_lag_difference)
+export(step_pivot_wider)
export(step_population_scaling)
export(step_training_window)
export(tibble)
diff --git a/R/step_pivot_wider.R b/R/step_pivot_wider.R
new file mode 100644
index 000000000..edadd29db
--- /dev/null
+++ b/R/step_pivot_wider.R
@@ -0,0 +1,175 @@
+
+
+#' Create new variables by pivotting data
+#'
+#' This function typically creates new predictors by sharing values across keys.
+#' So in the most basic case (see examples below), the values of a signal in
+#' one `geo_value` would be used as predictors in all the other locations.
+#'
+#' @inheritParams step_growth_rate
+#' @param ... <[`tidy-select`][tidyr_tidy_select]> One or more selector
+#' functions to choose variables
+#' values to pivot. These are the `values_from` argument for [tidyr::pivot_wider()].
+#' See [recipes::selections()] for more details.
+#' @param names_from A selector function to choose which column (or columns) to
+#' get the name of the output columns from. This is typically `geo_value`
+#' (the default), and possibly any additional keys in the training data.
+#' @param id_cols <[`tidy-select`][tidyr_tidy_select]> A selector function
+#' providing a set of columns that uniquely identifies each observation.
+#' The typical use is for this to be `time_value` and any additional keys
+#' not selected by `names_from` (this is the default behaviour).
+#' @inheritParams tidyr::pivot_wider
+#'
+#' @template step-return
+#' @export
+#'
+#' @examples
+#' jhu <- case_death_rate_subset %>%
+#' filter(geo_value %in% c("ca", "ny", "pa"), time_value > "2021-12-01")
+#' r <- epi_recipe(jhu)
+#'
+#' r1 <- r %>% step_pivot_wider("death_rate")
+#' bake(prep(r1, jhu), new_data = NULL)
+#'
+#' r2 <- r %>% step_pivot_wider(dplyr::ends_with("rate"))
+#' bake(prep(r2, jhu), new_data = NULL)
+step_pivot_wider <- function(
+ recipe,
+ ...,
+ names_from = "geo_value",
+ role = "predictor",
+ id_cols = "time_value",
+ id_expand = FALSE,
+ values_fill = NA,
+ values_fn = NULL,
+ skip = FALSE,
+ id = rand_id("pivot_wider")
+) {
+
+ arg_is_chr_scalar(role, id)
+
+ id_cols <- enquos(id_cols)
+ names_from <- enquos(names_from)
+
+ add_step(
+ recipe,
+ step_pivot_wider_new(
+ terms = enquos(...),
+ role = role,
+ trained = FALSE,
+ user_id_cols = id_cols,
+ edf_id_cols = key_colnames(recipe),
+ id_expand = id_expand,
+ names_from = names_from,
+ values_fill = values_fill,
+ values_fn = values_fn,
+ values_from = NULL,
+ skip = skip,
+ id = id
+ )
+ )
+}
+
+step_pivot_wider_new <- function(
+ terms, role, trained, user_id_cols, edf_id_cols,
+ id_expand, names_from, values_fill,
+ values_fn, values_from, skip, id) {
+ step(
+ subclass = "pivot_wider",
+ terms = terms,
+ role = role,
+ trained = trained,
+ user_id_cols = user_id_cols,
+ edf_id_cols = edf_id_cols,
+ id_expand = id_expand,
+ names_from = names_from,
+ values_fill = values_fill,
+ values_fn = values_fn,
+ values_from = values_from,
+ skip = skip,
+ id = id
+ )
+}
+
+#' @export
+prep.step_pivot_wider <- function(x, training, info = NULL, ...) {
+ user_id_cols <- recipes_eval_select(x$user_id_cols, training, info)
+ hardhat::validate_column_names(training, user_id_cols)
+ names_from <- recipes_eval_select(x$names_from, training, info)
+ remaining_ids <- setdiff(
+ union(user_id_cols, names_from), # keys from user
+ key_colnames(training) # all edf keys
+ )
+ all_id_cols <- union(user_id_cols, remaining_ids)
+ step_pivot_wider_new(
+ terms = x$terms,
+ role = x$role,
+ trained = TRUE,
+ user_id_cols = user_id_cols,
+ edf_id_cols = all_id_cols,
+ id_expand = x$id_expand,
+ names_from = names_from,
+ values_fill = x$values_fill,
+ values_fn = x$values_fn,
+ values_from = recipes_eval_select(x$terms, training, info),
+ skip = x$skip,
+ id = x$id
+ )
+}
+
+#' @export
+bake.step_pivot_wider <- function(object, new_data, ...) {
+ id_cols <- object$edf_id_cols
+ names_from <- object$names_from
+ values_from <- object$values_from
+ browser()
+ hardhat::validate_column_names(new_data, id_cols)
+ hardhat::validate_column_names(new_data, names_from)
+ hardhat::validate_column_names(new_data, values_from)
+ if (length(id_cols) == 0L) {
+ pivotted <- tidyr::pivot_wider(
+ new_data[, c(names_from, values_from)],
+ id_expand = object$id_expand,
+ names_from = unname(names_from),
+ values_from = unname(values_from),
+ values_fill = object$values_fill,
+ values_fn = object$values_fn,
+ names_repair = "unique"
+ )
+ joinby <- intersect(names(pivotted), names(new_data))
+ } else {
+ pivotted <- tidyr::pivot_wider(
+ new_data[, c(id_cols, names_from, values_from)],
+ id_cols = unname(id_cols),
+ id_expand = object$id_expand,
+ names_from = unname(names_from),
+ values_from = unname(values_from),
+ values_fill = object$values_fill,
+ values_fn = object$values_fn,
+ names_repair = "unique"
+ )
+ joinby <- id_cols
+ }
+ if (length(joinby) > 0L) {
+ new_data <- left_join(new_data, pivotted, by = joinby)
+ } else if (length(joinby) == 0L && nrow(pivotted) == nrow(new_data)) {
+ new_data <- bind_cols(new_data, pivotted, .name_repair = "unique")
+ } else {
+ cli_abort(c(
+ "Unable to join variables created by `step_pivot_wider()`.",
+ i = "You may want to pass in `id_cols` on step creation."
+ ))
+ }
+ new_data
+}
+
+#' @export
+print.step_pivot_wider <- function(x, width = max(20, options()$width - 30), ...) {
+ print_epi_step(x$values_from, x$terms, x$trained,
+ title = "Pivotting variables",
+ conjunction = "by",
+ extra_text = x$names_from
+ )
+ invisible(x)
+}
+
diff --git a/R/utils-arg.R b/R/utils-arg.R
index 081d153fb..d953906d6 100644
--- a/R/utils-arg.R
+++ b/R/utils-arg.R
@@ -199,3 +199,26 @@ arg_to_date <- function(x, allow_null = FALSE) {
arg_is_date(x, allow_null = allow_null)
x
}
+
+check_tidyselect_cols_exist <- function(selection, data, call = caller_env()) {
+ name_pos <- tidyselect::eval_select(selection, data, error_call = call)
+ if (length(name_pos) == 0L) {
+ return(list(ok = TRUE, missing_names = selection))
+ }
+ hardhat::check_column_names(data, names(name_pos))
+}
+
+validate_tidyselect_cols_exist <- function(selection, data, call = caller_env()) {
+ check <- check_tidyselect_cols_exist(selection, data, call)
+ if (!check$ok) {
+ missing_names <- glue::glue_collapse(
+ glue::single_quote(check$missing_names),
+ sep = ", "
+ )
+ message <- glue::glue(
+ "The {selection} results in missing columns: {missing_names}."
+ )
+ cli_abort(message, call = call)
+ }
+ invisible(data)
+}
diff --git a/man/step_adjust_latency.Rd b/man/step_adjust_latency.Rd
index f0ee41390..678c2d38f 100644
--- a/man/step_adjust_latency.Rd
+++ b/man/step_adjust_latency.Rd
@@ -260,8 +260,8 @@ while this will not:
\if{html}{\out{
}}\preformatted{toy_recipe <- epi_recipe(toy_df) \%>\%
step_epi_lag(a, lag=0) \%>\%
step_adjust_latency(a, method = "extend_lags")
-#> Warning: If `method` is "extend_lags" or "locf", then the previous `step_epi_lag`s won't work with
-#> modified data.
+#> Warning: If `method` is "extend_lags" or "locf", then the previous `step_epi_lag`s won't
+#> work with modified data.
}\if{html}{\out{
}}
If you create columns that you then apply lags to (such as
diff --git a/man/step_pivot_wider.Rd b/man/step_pivot_wider.Rd
new file mode 100644
index 000000000..668c4f4dd
--- /dev/null
+++ b/man/step_pivot_wider.Rd
@@ -0,0 +1,88 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/step_pivot_wider.R
+\name{step_pivot_wider}
+\alias{step_pivot_wider}
+\title{Create new variables by pivotting data}
+\usage{
+step_pivot_wider(
+ recipe,
+ ...,
+ names_from = "geo_value",
+ role = "predictor",
+ id_cols = "time_value",
+ id_expand = FALSE,
+ values_fill = NA,
+ values_fn = NULL,
+ skip = FALSE,
+ id = rand_id("pivot_wider")
+)
+}
+\arguments{
+\item{recipe}{A recipe object. The step will be added to the
+sequence of operations for this recipe.}
+
+\item{...}{<\code{\link[=tidyr_tidy_select]{tidy-select}}> One or more selector
+functions to choose variables
+values to pivot. These are the \code{values_from} argument for \code{\link[tidyr:pivot_wider]{tidyr::pivot_wider()}}.
+See \code{\link[recipes:selections]{recipes::selections()}} for more details.}
+
+\item{names_from}{A selector function to choose which column (or columns) to
+get the name of the output columns from. This is typically \code{geo_value}
+(the default), and possibly any additional keys in the training data.}
+
+\item{role}{For model terms created by this step, what analysis role should
+they be assigned? \code{lag} is default a predictor while \code{ahead} is an outcome.}
+
+\item{id_cols}{<\code{\link[=tidyr_tidy_select]{tidy-select}}> A selector function
+providing a set of columns that uniquely identifies each observation.
+The typical use is for this to be \code{time_value} and any additional keys
+not selected by \code{names_from} (this is the default behaviour).}
+
+\item{id_expand}{Should the values in the \code{id_cols} columns be expanded by
+\code{\link[tidyr:expand]{expand()}} before pivoting? This results in more rows, the output will
+contain a complete expansion of all possible values in \code{id_cols}. Implicit
+factor levels that aren't represented in the data will become explicit.
+Additionally, the row values corresponding to the expanded \code{id_cols} will
+be sorted.}
+
+\item{values_fill}{Optionally, a (scalar) value that specifies what each
+\code{value} should be filled in with when missing.
+
+This can be a named list if you want to apply different fill values to
+different value columns.}
+
+\item{values_fn}{Optionally, a function applied to the value in each cell
+in the output. You will typically use this when the combination of
+\code{id_cols} and \code{names_from} columns does not uniquely identify an
+observation.
+
+This can be a named list if you want to apply different aggregations
+to different \code{values_from} columns.}
+
+\item{skip}{A logical. Should the step be skipped when the
+recipe is baked by \code{\link[=bake]{bake()}}? While all operations are baked
+when \code{\link[=prep]{prep()}} is run, some operations may not be able to be
+conducted on new data (e.g. processing the outcome variable(s)).
+Care should be taken when using \code{skip = TRUE} as it may affect
+the computations for subsequent operations.}
+
+\item{id}{A unique identifier for the step}
+}
+\value{
+An updated version of \code{recipe} with the new step added to the
+sequence of any existing operations.
+}
+\description{
+Create new variables by pivotting data
+}
+\examples{
+jhu <- case_death_rate_subset \%>\%
+ filter(geo_value \%in\% c("ca", "ny", "pa"), time_value > "2021-12-01")
+r <- epi_recipe(jhu)
+
+r1 <- r \%>\% step_pivot_wider("death_rate")
+bake(prep(r1, jhu), new_data = NULL)
+
+r2 <- r \%>\% step_pivot_wider(dplyr::ends_with("rate"))
+bake(prep(r2, jhu), new_data = NULL)
+}
diff --git a/tests/testthat/test-step_pivot_wider.R b/tests/testthat/test-step_pivot_wider.R
new file mode 100644
index 000000000..ce16cc77b
--- /dev/null
+++ b/tests/testthat/test-step_pivot_wider.R
@@ -0,0 +1,38 @@
+library(tidyr)
+tib <- expand_grid(
+ tv = 1:4,
+ gv = letters[1:2],
+ cl = letters[2:4]
+)
+tib$val1 <- 1:nrow(tib) + .1
+tib$val2 <- nrow(tib):1 - .1
+
+
+test_that("works with recipe, various possible pivots", {
+ out <- recipe(tib) %>%
+ step_pivot_wider(val1, names_from = cl) %>%
+ prep(training = tib) %>% bake(new_data = NULL)
+ expect_snapshot(out)
+
+ out <- recipe(tib) %>%
+ step_pivot_wider(val1, names_from = cl, values_fill = 0) %>%
+ prep(training = tib) %>% bake(new_data = NULL)
+ expect_snapshot(out)
+
+ out <- recipe(tib) %>%
+ step_pivot_wider(starts_with("val"), names_from = cl) %>%
+ prep(training = tib) %>% bake(new_data = NULL)
+ expect_snapshot(out)
+
+ out <- recipe(tib) %>%
+ step_pivot_wider(val1, names_from = gv:cl) %>%
+ prep(training = tib) %>% bake(new_data = NULL)
+ expect_snapshot(out)
+
+ #fails
+ out <- recipe(tib) %>%
+ step_pivot_wider(val1, id_cols = tv:cl, names_from = cl) %>%
+ prep(training = tib) %>% bake(new_data = NULL)
+
+ edf <- tib %>% as_epi_df()
+})
diff --git a/vignettes/articles/scorecaster.Rmd b/vignettes/articles/scorecaster.Rmd
new file mode 100644
index 000000000..960b83f87
--- /dev/null
+++ b/vignettes/articles/scorecaster.Rmd
@@ -0,0 +1,97 @@
+---
+title: "Implementing a scorecaster for quantile calibration"
+---
+
+```{r setup, include = FALSE}
+knitr::opts_chunk$set(
+ collapse = TRUE,
+ comment = "#>",
+ warning = FALSE,
+ message = FALSE,
+ cache = TRUE
+)
+```
+
+
+```{r packages}
+library(tidyverse)
+library(epipredict)
+```
+
+First we get some forecasts.
+
+```{r forecast-output}
+jhu <- case_death_rate_subset
+fc_time_values <- seq(as.Date("2021-03-09"), as.Date("2021-12-01"), by = "4 weeks")
+q_levels <- c(1, 2, 5, 8, 9) / 10
+forecaster <- function(x, aheads = 7) {
+ map(aheads, ~ arx_forecaster(
+ x, "death_rate", c("case_rate", "death_rate"),
+ quantile_reg(quantile_levels = q_levels),
+ arx_args_list(ahead = .x, quantile_levels = q_levels)
+ )$predictions |>
+ mutate(ahead = .x)
+ ) |> list_rbind()
+}
+
+out <- map(
+ .x = fc_time_values,
+ .f = ~forecaster(jhu %>% filter(time_value <= .x), c(7, 14, 21, 28)),
+ .progress = TRUE
+)
+
+out <- out %>% list_rbind()
+out <- left_join(
+ out,
+ jhu,
+ by = c("target_date" = "time_value", "geo_value")
+)
+```
+
+Now we set up the "quantile conformal score" and the tangent integrator.
+
+```{r necessary-funs}
+quantile_conformal_score <- function(x, actual) {
+ UseMethod("quantile_conformal_score")
+}
+quantile_conformal_score.distribution <- function(x, actual) {
+ l <- vctrs::vec_recycle_common(x = x, actual = actual)
+ map2(
+ .x = vctrs::vec_data(l$x),
+ .y = l$actual,
+ .f = quantile_conformal_score
+ )
+}
+quantile_conformal_score.dist_quantiles <- function(x, actual) {
+ values <- vctrs::field(x, "values")
+ quantile_levels <- vctrs::field(x, "quantile_levels")
+ errs <- (actual - values) * (quantile_levels > 0.5) +
+ (values - actual) * (quantile_levels < 0.5) +
+ abs(actual - values) * (quantile_levels == 0.5)
+ errs
+}
+
+tangent_integrator <- function(x, t, KI = 1000, Csat = 2) {
+ # defaults from https://github.com/aangelopoulos/conformal-time-series/blob/b729c3f5ff633bfc43f0f7ca08199b549c2573ac/tests/configs/ca-COVID-deaths-4wk.yaml#L41
+ x <- x * log(t + 1) / (Csat * (t + 1))
+ up <- x >= pi / 2
+ down <- x <= -pi / 2
+ x[up] <- Inf
+ x[down] <- -Inf
+ mid <- !up & !down
+ x[mid] <- KI * tan(x[mid])
+}
+```
+
+Score the forecasts.
+
+```{r score-fcasts}
+out <- out |>
+ mutate(qc_scores = quantile_conformal_score(.pred_distn, death_rate))
+```
+
+Now we would need a "scorecaster". The paper has code here:
+https://github.com/aangelopoulos/conformal-time-series/blob/b729c3f5ff633bfc43f0f7ca08199b549c2573ac/tests/datasets/covid-ts-proc/statewide/death-forecasting-perstate-lasso-qr.ipynb
+
+Not quite sure what the model is. Note that `epipredict::quantile_reg()` may work
+(without the $\ell_1$ penalty).