From 740d4387e79524c22414a37298e6eeaed2c441ea Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Fri, 2 Aug 2024 18:33:13 -0700 Subject: [PATCH] refactor: step_epi_slide * validate_slide_fun now rejects formula f * remove warning about optimized slide functions until that PR * fix tests * remove try_period and replace with epiprocess internal * remove slider dependency * update documentation --- DESCRIPTION | 5 +- NEWS.md | 2 +- R/step_epi_slide.R | 127 ++++++++++----------------- man/add_epi_recipe.Rd | 2 +- man/arx_classifier.Rd | 2 +- man/arx_forecaster.Rd | 2 +- man/cdc_baseline_forecaster.Rd | 4 +- man/epi_slide_wrapper.Rd | 2 +- man/flatline_forecaster.Rd | 4 +- man/get_test_data.Rd | 2 +- man/grad_employ_subset.Rd | 2 +- man/predict-epi_workflow.Rd | 2 +- man/step_epi_slide.Rd | 24 +---- tests/testthat/test-step_epi_slide.R | 112 +++++------------------ 14 files changed, 86 insertions(+), 206 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index c16aff98e..3e8db4117 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: epipredict Title: Basic epidemiology forecasting methods -Version: 0.0.17 +Version: 0.0.18 Authors@R: c( person("Daniel", "McDonald", , "daniel@stat.ubc.ca", role = c("aut", "cre")), person("Ryan", "Tibshirani", , "ryantibs@cmu.edu", role = "aut"), @@ -35,12 +35,10 @@ Imports: ggplot2, glue, hardhat (>= 1.3.0), - lubridate, magrittr, quantreg, recipes (>= 1.0.4), rlang (>= 1.0.0), - slider, smoothqr, stats, tibble, @@ -55,6 +53,7 @@ Suggests: epidatr (>= 1.0.0), fs, knitr, + lubridate, poissonreg, purrr, ranger, diff --git a/NEWS.md b/NEWS.md index 4cb3e092a..d02e246f0 100644 --- a/NEWS.md +++ b/NEWS.md @@ -52,4 +52,4 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.0.x will indicat `...` args intended for `predict.model_fit()` - `bake.epi_recipe()` will now re-infer the geo and time type in case baking the steps has changed the appropriate values -- Add a step to produce generic sliding computations over an `epi_df` +- Add `step_epi_slide` to produce generic sliding computations over an `epi_df` diff --git a/R/step_epi_slide.R b/R/step_epi_slide.R index 25ef3973c..637d31a54 100644 --- a/R/step_epi_slide.R +++ b/R/step_epi_slide.R @@ -4,7 +4,6 @@ #' that will generate one or more new columns of derived data by "sliding" #' a computation along existing data. #' -#' #' @inheritParams step_epi_lag #' @param .f A function in one of the following formats: #' 1. An unquoted function name with no arguments, e.g., `mean` @@ -20,27 +19,15 @@ #' argument must be named `.x`. A common, though very difficult to debug #' error is using something like `function(x) mean`. This will not work #' because it returns the function mean, rather than `mean(x)` +#' @param before,after the size of the sliding window on the left and the right +#' of the center. Usually non-negative integers for data indexed by date, but +#' more restrictive in other cases (see [epiprocess::epi_slide()] for details). +#' @param prefix A character string that will be prefixed to the new column. #' @param f_name a character string of at most 20 characters that describes #' the function. This will be combined with `prefix` and the columns in `...` #' to name the result using `{prefix}{f_name}_{column}`. By default it will be determined #' automatically using `clean_f_name()`. -#' @param before,after non-negative integers. -#' How far `before` and `after` each `time_value` should -#' the sliding window extend? Any value provided for either -#' argument must be a single, non-`NA`, non-negative, -#' [integer-compatible][vctrs::vec_cast] number of time steps. Endpoints of -#' the window are inclusive. Common settings: -#' * For trailing/right-aligned windows from `time_value - time_step(k)` to -#' `time_value`, use `before=k, after=0`. This is the most likely use case -#' for the purposes of forecasting. -#' * For center-aligned windows from `time_value - time_step(k)` to -#' `time_value + time_step(k)`, use `before=k, after=k`. -#' * For leading/left-aligned windows from `time_value` to -#' `time_value + time_step(k)`, use `after=k, after=0`. #' -#' You may also pass a [lubridate::period], like `lubridate::weeks(1)` or a -#' character string that is coercible to a [lubridate::period], like -#' `"2 weeks"`. #' @template step-return #' #' @export @@ -69,9 +56,8 @@ step_epi_slide <- rlang::abort("This recipe step can only operate on an `epi_recipe`.") } .f <- validate_slide_fun(.f) - arg_is_scalar(before, after) - before <- try_period(before) - after <- try_period(after) + epiprocess:::validate_slide_window_arg(before, attributes(recipe$template)$metadata$time_type) + epiprocess:::validate_slide_window_arg(after, attributes(recipe$template)$metadata$time_type) arg_is_chr_scalar(role, prefix, id) arg_is_lgl_scalar(skip) @@ -126,7 +112,6 @@ step_epi_slide_new <- } - #' @export prep.step_epi_slide <- function(x, training, info = NULL, ...) { col_names <- recipes::recipes_eval_select(x$terms, data = training, info = info) @@ -150,7 +135,6 @@ prep.step_epi_slide <- function(x, training, info = NULL, ...) { } - #' @export bake.step_epi_slide <- function(object, new_data, ...) { recipes::check_new_data(names(object$columns), object, new_data) @@ -170,12 +154,16 @@ bake.step_epi_slide <- function(object, new_data, ...) { class = "epipredict__step__name_collision_error" ) } - if (any(vapply(c(mean, sum), \(x) identical(x, object$.f), logical(1L)))) { - cli_warn( - c("There is an optimized version of both mean and sum. See `step_epi_slide_mean`, `step_epi_slide_sum`, or `step_epi_slide_opt`."), - class = "epipredict__step_epi_slide__optimized_version" - ) - } + # TODO: Uncomment this whenever we make the optimized versions available. + # if (any(vapply(c(mean, sum), \(x) identical(x, object$.f), logical(1L)))) { + # cli_warn( + # c( + # "There is an optimized version of both mean and sum. See `step_epi_slide_mean`, `step_epi_slide_sum`, + # or `step_epi_slide_opt`." + # ), + # class = "epipredict__step_epi_slide__optimized_version" + # ) + # } epi_slide_wrapper( new_data, object$before, @@ -187,48 +175,51 @@ bake.step_epi_slide <- function(object, new_data, ...) { object$prefix ) } -#' wrapper to handle epi_slide particulars + + +#' Wrapper to handle epi_slide particulars +#' #' @description #' This should simplify somewhat in the future when we can run `epi_slide` on #' columns. Surprisingly, lapply is several orders of magnitude faster than #' using roughly equivalent tidy select style. +#' #' @param fns vector of functions, even if it's length 1. #' @param group_keys the keys to group by. likely `epi_keys[-1]` (to remove time_value) +#' #' @importFrom tidyr crossing #' @importFrom dplyr bind_cols group_by ungroup #' @importFrom epiprocess epi_slide #' @keywords internal epi_slide_wrapper <- function(new_data, before, after, columns, fns, fn_names, group_keys, name_prefix) { cols_fns <- tidyr::crossing(col_name = columns, fn_name = fn_names, fn = fns) + # Iterate over the rows of cols_fns. For each row number, we will output a + # transformed column. The first result returns all the original columns along + # with the new column. The rest just return the new column. seq_len(nrow(cols_fns)) %>% - lapply( # iterate over the rows of cols_fns - # takes in the row number, outputs the transformed column - function(comp_i) { - # extract values from the row - col_name <- cols_fns[[comp_i, "col_name"]] - fn_name <- cols_fns[[comp_i, "fn_name"]] - fn <- cols_fns[[comp_i, "fn"]][[1L]] - result_name <- paste(name_prefix, fn_name, col_name, sep = "_") - result <- new_data %>% - group_by(across(all_of(group_keys))) %>% - epi_slide( - before = before, - after = after, - new_col_name = result_name, - f = function(slice, geo_key, ref_time_value) { - fn(slice[[col_name]]) - } - ) %>% - ungroup() - # the first result needs to include all of the original columns - if (comp_i == 1L) { - result - } else { - # everything else just needs that column transformed - result[result_name] - } + lapply(function(comp_i) { + col_name <- cols_fns[[comp_i, "col_name"]] + fn_name <- cols_fns[[comp_i, "fn_name"]] + fn <- cols_fns[[comp_i, "fn"]][[1L]] + result_name <- paste(name_prefix, fn_name, col_name, sep = "_") + result <- new_data %>% + group_by(across(all_of(group_keys))) %>% + epi_slide( + before = before, + after = after, + new_col_name = result_name, + f = function(slice, geo_key, ref_time_value) { + fn(slice[[col_name]]) + } + ) %>% + ungroup() + + if (comp_i == 1L) { + result + } else { + result[result_name] } - ) %>% + }) %>% bind_cols() } @@ -286,9 +277,7 @@ validate_slide_fun <- function(.f) { cli_abort("In, `step_epi_slide()`, `.f` may not be missing.") } if (rlang::is_formula(.f, scoped = TRUE)) { - if (!is.null(rlang::f_lhs(.f))) { - cli_abort("In, `step_epi_slide()`, `.f` must be a one-sided formula.") - } + cli_abort("In, `step_epi_slide()`, `.f` cannot be a formula.") } else if (rlang::is_character(.f)) { .f <- rlang::as_function(.f) } else if (!rlang::is_function(.f)) { @@ -296,23 +285,3 @@ validate_slide_fun <- function(.f) { } .f } - -try_period <- function(x) { - err <- is.na(x) - if (!err) { - if (is.numeric(x)) { - err <- !rlang::is_integerish(x) || x < 0 - } else { - x <- lubridate::as.period(x) - err <- is.na(x) - } - } - if (err) { - cli_abort(paste( - "The value supplied to `before` or `after` must be a non-negative integer", - "a {.cls lubridate::period} or a character scalar that can be coerced", - 'as a {.cls lubridate::period}, e.g., `"1 week"`.' - ), ) - } - x -} diff --git a/man/add_epi_recipe.Rd b/man/add_epi_recipe.Rd index 10c282209..0da2d55b3 100644 --- a/man/add_epi_recipe.Rd +++ b/man/add_epi_recipe.Rd @@ -35,7 +35,7 @@ Add an \code{epi_recipe} to a workflow \details{ \code{add_epi_recipe} has the same behaviour as \code{\link[workflows:add_recipe]{workflows::add_recipe()}} but sets a different -default blueprint to automatically handle \link[epiprocess:as_epi_df]{epiprocess::epi_df} data. +default blueprint to automatically handle \link[epiprocess:epi_df]{epiprocess::epi_df} data. } \examples{ library(dplyr) diff --git a/man/arx_classifier.Rd b/man/arx_classifier.Rd index eb36f0872..350352ae9 100644 --- a/man/arx_classifier.Rd +++ b/man/arx_classifier.Rd @@ -44,7 +44,7 @@ workflow } \description{ This is an autoregressive classification model for -\link[epiprocess:as_epi_df]{epiprocess::epi_df} data. It does "direct" forecasting, meaning +\link[epiprocess:epi_df]{epiprocess::epi_df} data. It does "direct" forecasting, meaning that it estimates a class at a particular target horizon. } \examples{ diff --git a/man/arx_forecaster.Rd b/man/arx_forecaster.Rd index 2395f5659..af05c0682 100644 --- a/man/arx_forecaster.Rd +++ b/man/arx_forecaster.Rd @@ -37,7 +37,7 @@ workflow } \description{ This is an autoregressive forecasting model for -\link[epiprocess:as_epi_df]{epiprocess::epi_df} data. It does "direct" forecasting, meaning +\link[epiprocess:epi_df]{epiprocess::epi_df} data. It does "direct" forecasting, meaning that it estimates a model for a particular target horizon. } \examples{ diff --git a/man/cdc_baseline_forecaster.Rd b/man/cdc_baseline_forecaster.Rd index db7bbc41c..cd3c4ed67 100644 --- a/man/cdc_baseline_forecaster.Rd +++ b/man/cdc_baseline_forecaster.Rd @@ -11,7 +11,7 @@ cdc_baseline_forecaster( ) } \arguments{ -\item{epi_data}{An \code{\link[epiprocess:as_epi_df]{epiprocess::epi_df}}} +\item{epi_data}{An \code{\link[epiprocess:epi_df]{epiprocess::epi_df}}} \item{outcome}{A scalar character for the column name we wish to predict.} @@ -24,7 +24,7 @@ horizons) for each unique combination of \code{key_vars}. } \description{ This is a simple forecasting model for -\link[epiprocess:as_epi_df]{epiprocess::epi_df} data. It uses the most recent observation as the +\link[epiprocess:epi_df]{epiprocess::epi_df} data. It uses the most recent observation as the forecast for any future date, and produces intervals by shuffling the quantiles of the residuals of such a "flatline" forecast and incrementing these forward over all available training data. diff --git a/man/epi_slide_wrapper.Rd b/man/epi_slide_wrapper.Rd index e1afbf599..583d2eacf 100644 --- a/man/epi_slide_wrapper.Rd +++ b/man/epi_slide_wrapper.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/step_epi_slide.R \name{epi_slide_wrapper} \alias{epi_slide_wrapper} -\title{wrapper to handle epi_slide particulars} +\title{Wrapper to handle epi_slide particulars} \usage{ epi_slide_wrapper( new_data, diff --git a/man/flatline_forecaster.Rd b/man/flatline_forecaster.Rd index 9c37b4943..052dd6428 100644 --- a/man/flatline_forecaster.Rd +++ b/man/flatline_forecaster.Rd @@ -7,7 +7,7 @@ flatline_forecaster(epi_data, outcome, args_list = flatline_args_list()) } \arguments{ -\item{epi_data}{An \link[epiprocess:as_epi_df]{epiprocess::epi_df}} +\item{epi_data}{An \link[epiprocess:epi_df]{epiprocess::epi_df}} \item{outcome}{A scalar character for the column name we wish to predict.} @@ -20,7 +20,7 @@ ahead (unique horizon) for each unique combination of \code{key_vars}. } \description{ This is a simple forecasting model for -\link[epiprocess:as_epi_df]{epiprocess::epi_df} data. It uses the most recent observation as the +\link[epiprocess:epi_df]{epiprocess::epi_df} data. It uses the most recent observation as the forcast for any future date, and produces intervals based on the quantiles of the residuals of such a "flatline" forecast over all available training data. diff --git a/man/get_test_data.Rd b/man/get_test_data.Rd index a88dcc4c7..b18685d89 100644 --- a/man/get_test_data.Rd +++ b/man/get_test_data.Rd @@ -37,7 +37,7 @@ keys, as well other variables in the original dataset. } \description{ Based on the longest lag period in the recipe, -\code{get_test_data()} creates an \link[epiprocess:as_epi_df]{epi_df} +\code{get_test_data()} creates an \link[epiprocess:epi_df]{epi_df} with columns \code{geo_value}, \code{time_value} and other variables in the original dataset, which will be used to create features necessary to produce forecasts. diff --git a/man/grad_employ_subset.Rd b/man/grad_employ_subset.Rd index 8e7a230f7..46ba36913 100644 --- a/man/grad_employ_subset.Rd +++ b/man/grad_employ_subset.Rd @@ -5,7 +5,7 @@ \alias{grad_employ_subset} \title{Subset of Statistics Canada median employment income for postsecondary graduates} \format{ -An \link[epiprocess:as_epi_df]{epiprocess::epi_df} with 10193 rows and 8 variables: +An \link[epiprocess:epi_df]{epiprocess::epi_df} with 10193 rows and 8 variables: \describe{ \item{geo_value}{The province in Canada associated with each row of measurements.} diff --git a/man/predict-epi_workflow.Rd b/man/predict-epi_workflow.Rd index 2ecdf0102..130279249 100644 --- a/man/predict-epi_workflow.Rd +++ b/man/predict-epi_workflow.Rd @@ -60,7 +60,7 @@ workflow was created and fit. This is accomplished using \code{\link[recipes:bake]{recipes::bake()}} if a recipe was supplied. \item Call \code{\link[parsnip:predict.model_fit]{parsnip::predict.model_fit()}} for you using the underlying fit parsnip model. -\item Ensure that the returned object is an \link[epiprocess:as_epi_df]{epiprocess::epi_df} where +\item Ensure that the returned object is an \link[epiprocess:epi_df]{epiprocess::epi_df} where possible. Specifically, the output will have \code{time_value} and \code{geo_value} columns as well as the prediction. } diff --git a/man/step_epi_slide.Rd b/man/step_epi_slide.Rd index fb83942ae..46bb386ad 100644 --- a/man/step_epi_slide.Rd +++ b/man/step_epi_slide.Rd @@ -41,30 +41,14 @@ argument must be named \code{.x}. A common, though very difficult to debug error is using something like \code{function(x) mean}. This will not work because it returns the function mean, rather than \code{mean(x)}} -\item{before, after}{non-negative integers. -How far \code{before} and \code{after} each \code{time_value} should -the sliding window extend? Any value provided for either -argument must be a single, non-\code{NA}, non-negative, -\link[vctrs:vec_cast]{integer-compatible} number of time steps. Endpoints of -the window are inclusive. Common settings: -\itemize{ -\item For trailing/right-aligned windows from \code{time_value - time_step(k)} to -\code{time_value}, use \verb{before=k, after=0}. This is the most likely use case -for the purposes of forecasting. -\item For center-aligned windows from \code{time_value - time_step(k)} to -\code{time_value + time_step(k)}, use \verb{before=k, after=k}. -\item For leading/left-aligned windows from \code{time_value} to -\code{time_value + time_step(k)}, use \verb{after=k, after=0}. -} - -You may also pass a \link[lubridate:period]{lubridate::period}, like \code{lubridate::weeks(1)} or a -character string that is coercible to a \link[lubridate:period]{lubridate::period}, like -\code{"2 weeks"}.} +\item{before, after}{the size of the sliding window on the left and the right +of the center. Usually non-negative integers for data indexed by date, but +more restrictive in other cases (see \code{\link[epiprocess:epi_slide]{epiprocess::epi_slide()}} for details).} \item{role}{For model terms created by this step, what analysis role should they be assigned? \code{lag} is default a predictor while \code{ahead} is an outcome.} -\item{prefix}{A prefix to indicate what type of variable this is} +\item{prefix}{A character string that will be prefixed to the new column.} \item{f_name}{a character string of at most 20 characters that describes the function. This will be combined with \code{prefix} and the columns in \code{...} diff --git a/tests/testthat/test-step_epi_slide.R b/tests/testthat/test-step_epi_slide.R index 1f284b721..29e046eae 100644 --- a/tests/testthat/test-step_epi_slide.R +++ b/tests/testthat/test-step_epi_slide.R @@ -1,3 +1,5 @@ +library(dplyr) + tt <- seq(as.Date("2022-01-01"), by = "1 day", length.out = 20) edf <- data.frame( time_value = c(tt, tt), @@ -7,17 +9,14 @@ edf <- data.frame( as_epi_df() r <- epi_recipe(edf) - -test_that("try_period works", { - expect_error(try_period("1 jeff")) - expect_error(try_period(lubridate::period("1 jeff"))) - expect_error(try_period(NA)) - expect_error(try_period(1.5)) - res <- lubridate::weeks(1) - expect_identical(try_period("1 week"), res) - expect_identical(try_period(lubridate::period("1 week")), res) - expect_identical(try_period(1L), 1L) -}) +rolled_before <- edf %>% + group_by(geo_value) %>% + epi_slide(value = mean(value), before = 3L) %>% + pull(value) +rolled_after <- edf %>% + group_by(geo_value) %>% + epi_slide(value = mean(value), after = 3L) %>% + pull(value) test_that("epi_slide errors when needed", { @@ -43,92 +42,21 @@ test_that("epi_slide errors when needed", { expect_error(r %>% step_epi_slide(value, .f = 1)) }) -library(dplyr) -rolled_before <- edf %>% - group_by(geo_value) %>% - epi_slide(value = mean(value), before = 3L) %>% - pull(value) -rolled_after <- edf %>% - group_by(geo_value) %>% - epi_slide(value = mean(value), after = 3L) %>% - pull(value) - - -test_that("epi_slide handles classed before/after", { - expect_warning( - baseline <- r %>% - step_epi_slide(value, .f = mean, before = 3L) %>% - prep(edf) %>% - bake(new_data = NULL), - regexp = "There is an optimized version" - ) - expect_equal(baseline[[4]], rolled_before) - - expect_warning( - pbefore <- r %>% - step_epi_slide(value, .f = mean, before = lubridate::period("3 days")) %>% - prep(edf) %>% - bake(new_data = NULL), - regexp = "There is an optimized version" - ) - expect_warning( - cbefore <- r %>% - step_epi_slide(value, .f = mean, before = "3 days") %>% - prep(edf) %>% - bake(new_data = NULL), - regexp = "There is an optimized version" - ) - expect_equal(baseline, pbefore) - expect_equal(baseline, cbefore) - - expect_warning( - baseline <- r %>% - step_epi_slide(value, .f = mean, after = 3L) %>% - prep(edf) %>% - bake(new_data = NULL), - regexp = "There is an optimized version" - ) - expect_equal(baseline[[4]], rolled_after) - expect_warning( - pafter <- r %>% - step_epi_slide(value, .f = mean, after = lubridate::period("3 days")) %>% - prep(edf) %>% - bake(new_data = NULL), - regexp = "There is an optimized version" - ) - expect_warning( - cafter <- r %>% - step_epi_slide(value, .f = mean, after = "3 days") %>% - prep(edf) %>% - bake(new_data = NULL), - regexp = "There is an optimized version" - ) - expect_equal(baseline, pafter) - expect_equal(baseline, cafter) -}) - test_that("epi_slide handles different function specs", { - expect_warning( - cfun <- r %>% - step_epi_slide(value, .f = "mean", before = 3L) %>% - prep(edf) %>% - bake(new_data = NULL), - regexp = "There is an optimized version" - ) - expect_warning( - ffun <- r %>% - step_epi_slide(value, .f = mean, before = 3L) %>% - prep(edf) %>% - bake(new_data = NULL), - regexp = "There is an optimized version" - ) + cfun <- r %>% + step_epi_slide(value, .f = "mean", before = 3L) %>% + prep(edf) %>% + bake(new_data = NULL) + ffun <- r %>% + step_epi_slide(value, .f = mean, before = 3L) %>% + prep(edf) %>% + bake(new_data = NULL) # formula NOT currently supported expect_error( lfun <- r %>% - step_epi_slide(value, .f = ~ mean(.x, na.rm = TRUE), before = 3L) %>% - prep(edf) %>% - bake(new_data = NULL) + step_epi_slide(value, .f = ~ mean(.x, na.rm = TRUE), before = 3L), + regexp = "cannot be a formula." ) blfun <- r %>% step_epi_slide(value, .f = function(x) mean(x, na.rm = TRUE), before = 3L) %>%