diff --git a/NAMESPACE b/NAMESPACE
index 86b77716b..2dbdf3178 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -25,6 +25,7 @@ S3method(bake,step_epi_lag)
 S3method(bake,step_epi_slide)
 S3method(bake,step_growth_rate)
 S3method(bake,step_lag_difference)
+S3method(bake,step_pivot_wider)
 S3method(bake,step_population_scaling)
 S3method(bake,step_training_window)
 S3method(detect_layer,frosting)
@@ -65,6 +66,7 @@ S3method(prep,step_epi_lag)
 S3method(prep,step_epi_slide)
 S3method(prep,step_growth_rate)
 S3method(prep,step_lag_difference)
+S3method(prep,step_pivot_wider)
 S3method(prep,step_population_scaling)
 S3method(prep,step_training_window)
 S3method(print,alist)
@@ -96,6 +98,7 @@ S3method(print,step_epi_slide)
 S3method(print,step_growth_rate)
 S3method(print,step_lag_difference)
 S3method(print,step_naomit)
+S3method(print,step_pivot_wider)
 S3method(print,step_population_scaling)
 S3method(print,step_training_window)
 S3method(quantile,dist_quantiles)
@@ -205,6 +208,7 @@ export(step_epi_naomit)
 export(step_epi_slide)
 export(step_growth_rate)
 export(step_lag_difference)
+export(step_pivot_wider)
 export(step_population_scaling)
 export(step_training_window)
 export(tibble)
diff --git a/R/step_pivot_wider.R b/R/step_pivot_wider.R
new file mode 100644
index 000000000..edadd29db
--- /dev/null
+++ b/R/step_pivot_wider.R
@@ -0,0 +1,175 @@
+#' Create new variables by pivoting data
+#'
+#' This function typically creates new predictors by sharing values across
+#' keys. So in the most basic case (see the examples below), the values of a
+#' signal in one `geo_value` become predictors in all the other locations.
+#'
+#' @inheritParams step_growth_rate
+#' @param ... <[`tidy-select`][tidyr_tidy_select]> One or more selector
+#'   functions to choose the variables whose values will be pivoted. These
+#'   form the `values_from` argument for [tidyr::pivot_wider()].
+#'   See [recipes::selections()] for more details.
+#' @param names_from A selector function to choose which column (or columns) to
+#'   get the names of the output columns from. This is typically `geo_value`
+#'   (the default), and possibly any additional keys in the training data.
+#' @param id_cols <[`tidy-select`][tidyr_tidy_select]> A selector function
+#'   providing a set of columns that uniquely identifies each observation.
+#'   The typical use is for this to be `time_value` and any additional keys
+#'   not selected by `names_from` (this is the default behaviour).
+#' @inheritParams tidyr::pivot_wider
+#' @param role For model terms created by this step, what analysis role should
+#'   they be assigned? By default, the new columns are predictors.
+#'
+#' @template step-return
+#' @export
+#'
+#' @examples
+#' jhu <- case_death_rate_subset %>%
+#'   filter(geo_value %in% c("ca", "ny", "pa"), time_value > "2021-12-01")
+#' r <- epi_recipe(jhu)
+#'
+#' r1 <- r %>% step_pivot_wider("death_rate")
+#' bake(prep(r1, jhu), new_data = NULL)
+#'
+#' r2 <- r %>% step_pivot_wider(dplyr::ends_with("rate"))
+#' bake(prep(r2, jhu), new_data = NULL)
+step_pivot_wider <- function(
+    recipe,
+    ...,
+    names_from = "geo_value",
+    role = "predictor",
+    id_cols = "time_value",
+    id_expand = FALSE,
+    values_fill = NA,
+    values_fn = NULL,
+    skip = FALSE,
+    id = rand_id("pivot_wider")) {
+  arg_is_chr_scalar(role, id)
+
+  id_cols <- enquos(id_cols)
+  names_from <- enquos(names_from)
+
+  add_step(
+    recipe,
+    step_pivot_wider_new(
+      terms = enquos(...),
+      role = role,
+      trained = FALSE,
+      user_id_cols = id_cols,
+      edf_id_cols = key_colnames(recipe),
+      id_expand = id_expand,
+      names_from = names_from,
+      values_fill = values_fill,
+      values_fn = values_fn,
+      values_from = NULL,
+      skip = skip,
+      id = id
+    )
+  )
+}
+
+step_pivot_wider_new <- function(
+    terms, role, trained, user_id_cols, edf_id_cols,
+    id_expand, names_from, values_fill,
+    values_fn, values_from, skip, id) {
+  step(
+    subclass = "pivot_wider",
+    terms = terms,
+    role = role,
+    trained = trained,
+    user_id_cols = user_id_cols,
+    edf_id_cols = edf_id_cols,
+    id_expand = id_expand,
+    names_from = names_from,
+    values_fill = values_fill,
+    values_fn = values_fn,
+    values_from = values_from,
+    skip = skip,
+    id = id
+  )
+}
+
+#' @export
+prep.step_pivot_wider <- function(x, training, info = NULL, ...) {
+  user_id_cols <- recipes_eval_select(x$user_id_cols, training, info)
+  hardhat::validate_column_names(training, user_id_cols)
+  names_from <- recipes_eval_select(x$names_from, training, info)
+  # the id columns are the user's selection plus any edf keys
+  # not already used to name the new columns
+  remaining_ids <- setdiff(
+    key_colnames(training), # all edf keys
+    names_from # keys used for naming
+  )
+  all_id_cols <- union(user_id_cols, remaining_ids)
+  step_pivot_wider_new(
+    terms = x$terms,
+    role = x$role,
+    trained = TRUE,
+    user_id_cols = user_id_cols,
+    edf_id_cols = all_id_cols,
+    id_expand = x$id_expand,
+    names_from = names_from,
+    values_fill = x$values_fill,
+    values_fn = x$values_fn,
+    values_from = recipes_eval_select(x$terms, training, info),
+    skip = x$skip,
+    id = x$id
+  )
+}
+
+#' @export
+bake.step_pivot_wider <- function(object, new_data, ...) {
+  id_cols <- object$edf_id_cols
+  names_from <- object$names_from
+  values_from <- object$values_from
+  hardhat::validate_column_names(new_data, id_cols)
+  hardhat::validate_column_names(new_data, names_from)
+  hardhat::validate_column_names(new_data, values_from)
+  if (length(id_cols) == 0L) {
+    # no id columns: pivot the selected columns alone and join back on
+    # whatever columns the pivot and the original data still share
+    pivoted <- tidyr::pivot_wider(
+      new_data[, c(names_from, values_from)],
+      id_expand = object$id_expand,
+      names_from = unname(names_from),
+      values_from = unname(values_from),
+      values_fill = object$values_fill,
+      values_fn = object$values_fn,
+      names_repair = "unique"
+    )
+    joinby <- intersect(names(pivoted), names(new_data))
+  } else {
+    # pivot with explicit id columns and join back on those
+    pivoted <- tidyr::pivot_wider(
+      new_data[, c(id_cols, names_from, values_from)],
+      id_cols = unname(id_cols),
+      id_expand = object$id_expand,
+      names_from = unname(names_from),
+      values_from = unname(values_from),
+      values_fill = object$values_fill,
+      values_fn = object$values_fn,
+      names_repair = "unique"
+    )
+    joinby <- id_cols
+  }
+  if (length(joinby) > 0L) {
+    new_data <- left_join(new_data, pivoted, by = joinby)
+  } else if (length(joinby) == 0L && nrow(pivoted) == nrow(new_data)) {
+    new_data <- bind_cols(new_data, pivoted, .name_repair = "unique")
+  } else {
+    cli_abort(c(
+      "Unable to join variables created by `step_pivot_wider()`.",
+      i = "You may want to pass in `id_cols` on step creation."
+    ))
+  }
+  new_data
+}
+
+#' @export
+print.step_pivot_wider <- function(x, width = max(20, options()$width - 30), ...) {
+  print_epi_step(x$values_from, x$terms, x$trained,
+    title = "Pivoting variables",
+    conjunction = "by",
+    extra_text = x$names_from
+  )
+  invisible(x)
+}
diff --git a/R/utils-arg.R b/R/utils-arg.R
index 081d153fb..d953906d6 100644
--- a/R/utils-arg.R
+++ b/R/utils-arg.R
@@ -199,3 +199,26 @@ arg_to_date <- function(x, allow_null = FALSE) {
   arg_is_date(x, allow_null = allow_null)
   x
 }
+
+check_tidyselect_cols_exist <- function(selection, data, call = caller_env()) {
+  name_pos <- tidyselect::eval_select(selection, data, error_call = call)
+  if (length(name_pos) == 0L) {
+    # an empty selection means the requested columns are absent
+    return(list(ok = FALSE, missing_names = rlang::as_label(selection)))
+  }
+  hardhat::check_column_names(data, names(name_pos))
+}
+
+validate_tidyselect_cols_exist <- function(selection, data, call = caller_env()) {
+  check <- check_tidyselect_cols_exist(selection, data, call)
+  if (!check$ok) {
+    missing_names <- glue::glue_collapse(
+      glue::single_quote(check$missing_names),
+      sep = ", "
+    )
+    message <- glue::glue(
+      "The selection results in missing columns: {missing_names}."
+    )
+    cli_abort(message, call = call)
+  }
+  invisible(data)
+}
diff --git a/man/step_adjust_latency.Rd b/man/step_adjust_latency.Rd
index f0ee41390..678c2d38f 100644
--- a/man/step_adjust_latency.Rd
+++ b/man/step_adjust_latency.Rd
@@ -260,8 +260,8 @@ while this will not:
 \if{html}{\out{<div class="sourceCode r">}}\preformatted{toy_recipe <- epi_recipe(toy_df) \%>\%
   step_epi_lag(a, lag=0) \%>\%
   step_adjust_latency(a, method = "extend_lags")
-#> Warning: If `method` is "extend_lags" or "locf", then the previous `step_epi_lag`s won't work with
-#> modified data.
+#> Warning: If `method` is "extend_lags" or "locf", then the previous `step_epi_lag`s won't
+#> work with modified data.
 }\if{html}{\out{</div>}}
 
 If you create columns that you then apply lags to (such as
diff --git a/man/step_pivot_wider.Rd b/man/step_pivot_wider.Rd
new file mode 100644
index 000000000..668c4f4dd
--- /dev/null
+++ b/man/step_pivot_wider.Rd
@@ -0,0 +1,88 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/step_pivot_wider.R
+\name{step_pivot_wider}
+\alias{step_pivot_wider}
+\title{Create new variables by pivoting data}
+\usage{
+step_pivot_wider(
+  recipe,
+  ...,
+  names_from = "geo_value",
+  role = "predictor",
+  id_cols = "time_value",
+  id_expand = FALSE,
+  values_fill = NA,
+  values_fn = NULL,
+  skip = FALSE,
+  id = rand_id("pivot_wider")
+)
+}
+\arguments{
+\item{recipe}{A recipe object. The step will be added to the
+sequence of operations for this recipe.}
+
+\item{...}{<\code{\link[=tidyr_tidy_select]{tidy-select}}> One or more selector
+functions to choose the variables whose values will be pivoted. These form
+the \code{values_from} argument for \code{\link[tidyr:pivot_wider]{tidyr::pivot_wider()}}.
+See \code{\link[recipes:selections]{recipes::selections()}} for more details.}
+
+\item{names_from}{A selector function to choose which column (or columns) to
+get the names of the output columns from. This is typically \code{geo_value}
+(the default), and possibly any additional keys in the training data.}
+
+\item{role}{For model terms created by this step, what analysis role should
+they be assigned? By default, the new columns are predictors.}
+
+\item{id_cols}{<\code{\link[=tidyr_tidy_select]{tidy-select}}> A selector function
+providing a set of columns that uniquely identifies each observation.
+The typical use is for this to be \code{time_value} and any additional keys
+not selected by \code{names_from} (this is the default behaviour).}
+
+\item{id_expand}{Should the values in the \code{id_cols} columns be expanded by
+\code{\link[tidyr:expand]{expand()}} before pivoting? This results in more rows; the output will
+contain a complete expansion of all possible values in \code{id_cols}. Implicit
+factor levels that aren't represented in the data will become explicit.
+Additionally, the row values corresponding to the expanded \code{id_cols} will
+be sorted.}
+
+\item{values_fill}{Optionally, a (scalar) value that specifies what each
+\code{value} should be filled in with when missing.
+
+This can be a named list if you want to apply different fill values to
+different value columns.}
+
+\item{values_fn}{Optionally, a function applied to the value in each cell
+in the output. You will typically use this when the combination of
+\code{id_cols} and \code{names_from} columns does not uniquely identify an
+observation.
+
+This can be a named list if you want to apply different aggregations
+to different \code{values_from} columns.}
+
+\item{skip}{A logical. Should the step be skipped when the
+recipe is baked by \code{\link[=bake]{bake()}}? While all operations are baked
+when \code{\link[=prep]{prep()}} is run, some operations may not be able to be
+conducted on new data (e.g. processing the outcome variable(s)).
+Care should be taken when using \code{skip = TRUE} as it may affect
+the computations for subsequent operations.}
+
+\item{id}{A unique identifier for the step}
+}
+\value{
+An updated version of \code{recipe} with the new step added to the
+sequence of any existing operations.
+}
+\description{
+Create new variables by pivoting data
+}
+\examples{
+jhu <- case_death_rate_subset \%>\%
+  filter(geo_value \%in\% c("ca", "ny", "pa"), time_value > "2021-12-01")
+r <- epi_recipe(jhu)
+
+r1 <- r \%>\% step_pivot_wider("death_rate")
+bake(prep(r1, jhu), new_data = NULL)
+
+r2 <- r \%>\% step_pivot_wider(dplyr::ends_with("rate"))
+bake(prep(r2, jhu), new_data = NULL)
+}
diff --git a/tests/testthat/test-step_pivot_wider.R b/tests/testthat/test-step_pivot_wider.R
new file mode 100644
index 000000000..ce16cc77b
--- /dev/null
+++ b/tests/testthat/test-step_pivot_wider.R
@@ -0,0 +1,38 @@
+library(tidyr)
+tib <- expand_grid(
+  tv = 1:4,
+  gv = letters[1:2],
+  cl = letters[2:4]
+)
+tib$val1 <- 1:nrow(tib) + 0.1
+tib$val2 <- nrow(tib):1 - 0.1
+
+test_that("works with recipe, various possible pivots", {
+  out <- recipe(tib) %>%
+    step_pivot_wider(val1, names_from = cl) %>%
+    prep(training = tib) %>%
+    bake(new_data = NULL)
+  expect_snapshot(out)
+
+  out <- recipe(tib) %>%
+    step_pivot_wider(val1, names_from = cl, values_fill = 0) %>%
+    prep(training = tib) %>%
+    bake(new_data = NULL)
+  expect_snapshot(out)
+
+  out <- recipe(tib) %>%
+    step_pivot_wider(starts_with("val"), names_from = cl) %>%
+    prep(training = tib) %>%
+    bake(new_data = NULL)
+  expect_snapshot(out)
+
+  out <- recipe(tib) %>%
+    step_pivot_wider(val1, names_from = gv:cl) %>%
+    prep(training = tib) %>%
+    bake(new_data = NULL)
+  expect_snapshot(out)
+
+  # `id_cols` overlapping `names_from` is not supported by `tidyr::pivot_wider()`
+  expect_error(
+    recipe(tib) %>%
+      step_pivot_wider(val1, id_cols = tv:cl, names_from = cl) %>%
+      prep(training = tib) %>%
+      bake(new_data = NULL)
+  )
+})
diff --git a/vignettes/articles/scorecaster.Rmd b/vignettes/articles/scorecaster.Rmd
new file mode 100644
index 000000000..960b83f87
--- /dev/null
+++ b/vignettes/articles/scorecaster.Rmd
@@ -0,0 +1,97 @@
+---
+title: "Implementing a scorecaster for quantile calibration"
+---
+
+```{r setup, include = FALSE}
+knitr::opts_chunk$set(
+  collapse = TRUE,
+  comment = "#>",
+  warning = FALSE,
+  message = FALSE,
+  cache = TRUE
+)
+```
+
+```{r packages}
+library(tidyverse)
+library(epipredict)
+```
+
+First we get some forecasts.
+
+```{r forecast-output}
+jhu <- case_death_rate_subset
+fc_time_values <- seq(as.Date("2021-03-09"), as.Date("2021-12-01"), by = "4 weeks")
+q_levels <- c(1, 2, 5, 8, 9) / 10
+forecaster <- function(x, aheads = 7) {
+  map(aheads, ~ arx_forecaster(
+    x, "death_rate", c("case_rate", "death_rate"),
+    quantile_reg(quantile_levels = q_levels),
+    arx_args_list(ahead = .x, quantile_levels = q_levels)
+  )$predictions |>
+    mutate(ahead = .x)) |>
+    list_rbind()
+}
+
+out <- map(
+  .x = fc_time_values,
+  .f = ~ forecaster(jhu %>% filter(time_value <= .x), c(7, 14, 21, 28)),
+  .progress = TRUE
+)
+
+out <- out %>% list_rbind()
+out <- left_join(
+  out,
+  jhu,
+  by = c("target_date" = "time_value", "geo_value")
+)
+```
+
+Now we set up the "quantile conformal score" and the tangent integrator. As
+implemented below, the score of each quantile estimate is its signed distance
+from the realized value, oriented so that positive scores indicate the value
+fell on the uncovered side of that quantile.
+```{r necessary-funs}
+quantile_conformal_score <- function(x, actual) {
+  UseMethod("quantile_conformal_score")
+}
+quantile_conformal_score.distribution <- function(x, actual) {
+  l <- vctrs::vec_recycle_common(x = x, actual = actual)
+  map2(
+    .x = vctrs::vec_data(l$x),
+    .y = l$actual,
+    .f = quantile_conformal_score
+  )
+}
+quantile_conformal_score.dist_quantiles <- function(x, actual) {
+  values <- vctrs::field(x, "values")
+  quantile_levels <- vctrs::field(x, "quantile_levels")
+  errs <- (actual - values) * (quantile_levels > 0.5) +
+    (values - actual) * (quantile_levels < 0.5) +
+    abs(actual - values) * (quantile_levels == 0.5)
+  errs
+}
+
+tangent_integrator <- function(x, t, KI = 1000, Csat = 2) {
+  # defaults from https://github.com/aangelopoulos/conformal-time-series/blob/b729c3f5ff633bfc43f0f7ca08199b549c2573ac/tests/configs/ca-COVID-deaths-4wk.yaml#L41
+  x <- x * log(t + 1) / (Csat * (t + 1))
+  up <- x >= pi / 2
+  down <- x <= -pi / 2
+  x[up] <- Inf
+  x[down] <- -Inf
+  mid <- !up & !down
+  x[mid] <- KI * tan(x[mid])
+  x
+}
+```
+
+Score the forecasts.
+
+```{r score-fcasts}
+out <- out |>
+  mutate(qc_scores = quantile_conformal_score(.pred_distn, death_rate))
+```
+
+Now we would need a "scorecaster". The paper has code here:
+https://github.com/aangelopoulos/conformal-time-series/blob/b729c3f5ff633bfc43f0f7ca08199b549c2573ac/tests/datasets/covid-ts-proc/statewide/death-forecasting-perstate-lasso-qr.ipynb
+
+Not quite sure what the model is. Note that `epipredict::quantile_reg()` may
+work (without the $\ell_1$ penalty).
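+
+As a stopgap, here is a minimal sketch of a scorecaster. Everything about it
+is an assumption rather than a translation of that notebook: each forecast's
+vector of scores is collapsed to its maximum, three lags of that summary serve
+as the features, and `quantile_reg()` is fit at a single high level, assuming
+`fit()` and `predict()` behave for a bare `quantile_reg()` spec as they do for
+other parsnip models.
+
+```{r scorecaster-sketch}
+# Sketch only: summarize each forecast's score vector by its maximum,
+# lag it within each state, and forecast it with quantile regression.
+scores_df <- out |>
+  filter(ahead == 7) |>
+  transmute(geo_value, target_date, score = map_dbl(qc_scores, max)) |>
+  arrange(geo_value, target_date) |>
+  group_by(geo_value) |>
+  mutate(lag1 = lag(score), lag2 = lag(score, 2), lag3 = lag(score, 3)) |>
+  ungroup() |>
+  drop_na()
+
+scorecaster <- quantile_reg(quantile_levels = 0.9) |>
+  fit(score ~ lag1 + lag2 + lag3, data = scores_df)
+
+# predicted scores at the 0.9 level, one per row of `scores_df`
+predict(scorecaster, new_data = scores_df)
+```
+
+Roughly following the paper's decomposition, these predicted scores would then
+be combined with an integrator term (the running coverage errors passed through
+`tangent_integrator()`) to widen or narrow subsequent prediction intervals.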