diff --git a/.Rbuildignore b/.Rbuildignore
index f1a8c3636..510725267 100644
--- a/.Rbuildignore
+++ b/.Rbuildignore
@@ -19,4 +19,5 @@
 ^DEVELOPMENT\.md$
 ^doc$
 ^Meta$
-^.lintr$
\ No newline at end of file
+^.lintr$
+^.venv$
\ No newline at end of file
diff --git a/DESCRIPTION b/DESCRIPTION
index a7df5ba5a..37134c160 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: epipredict
 Title: Basic epidemiology forecasting methods
-Version: 0.0.20
+Version: 0.0.21
 Authors@R: c(
     person("Daniel", "McDonald", , "daniel@stat.ubc.ca", role = c("aut", "cre")),
     person("Ryan", "Tibshirani", , "ryantibs@cmu.edu", role = "aut"),
@@ -23,8 +23,7 @@ URL: https://github.com/cmu-delphi/epipredict/,
     https://cmu-delphi.github.io/epipredict
 BugReports: https://github.com/cmu-delphi/epipredict/issues/
 Depends:
-    epiprocess (>= 0.8.0),
-    epiprocess (< 0.9.0),
+    epiprocess (>= 0.9.0),
     parsnip (>= 1.0.0),
     R (>= 3.5.0)
 Imports:
diff --git a/R/autoplot.R b/R/autoplot.R
index d35850fd6..648c74e33 100644
--- a/R/autoplot.R
+++ b/R/autoplot.R
@@ -131,7 +131,7 @@ autoplot.epi_workflow <- function(
     if (length(extra_keys) == 0L) extra_keys <- NULL
     edf <- as_epi_df(edf,
       as_of = object$fit$meta$as_of,
-      additional_metadata = list(other_keys = extra_keys)
+      other_keys = extra_keys %||% character()
     )
     if (is.null(predictions)) {
       return(autoplot(
diff --git a/R/cdc_baseline_forecaster.R b/R/cdc_baseline_forecaster.R
index 74af5e443..b2e7434e2 100644
--- a/R/cdc_baseline_forecaster.R
+++ b/R/cdc_baseline_forecaster.R
@@ -29,11 +29,11 @@
 #'   mutate(deaths = pmax(death_rate / 1e5 * pop * 7, 0)) %>%
 #'   select(-pop, -death_rate) %>%
 #'   group_by(geo_value) %>%
-#'   epi_slide(~ sum(.$deaths), before = 6, new_col_name = "deaths") %>%
+#'   epi_slide(~ sum(.$deaths), .window_size = 7, .new_col_name = "deaths_7dsum") %>%
 #'   ungroup() %>%
 #'   filter(weekdays(time_value) == "Saturday")
 #'
-#' cdc <- cdc_baseline_forecaster(weekly_deaths, "deaths")
+#' cdc <- cdc_baseline_forecaster(weekly_deaths, "deaths_7dsum")
 #' preds <- pivot_quantiles_wider(cdc$predictions, .pred_distn)
 #'
 #' if (require(ggplot2)) {
@@ -47,7 +47,7 @@
 #'     geom_line(aes(y = .pred), color = "orange") +
 #'     geom_line(
 #'       data = weekly_deaths %>% filter(geo_value %in% four_states),
-#'       aes(x = time_value, y = deaths)
+#'       aes(x = time_value, y = deaths_7dsum)
 #'     ) +
 #'     scale_x_date(limits = c(forecast_date - 90, forecast_date + 30)) +
 #'     labs(x = "Date", y = "Weekly deaths") +
diff --git a/R/epi_recipe.R b/R/epi_recipe.R
index 88ba605cd..0ff9efee8 100644
--- a/R/epi_recipe.R
+++ b/R/epi_recipe.R
@@ -95,7 +95,7 @@ epi_recipe.epi_df <-
     keys <- key_colnames(x) # we know x is an epi_df

     var_info <- tibble(variable = vars)
-    key_roles <- c("geo_value", "time_value", rep("key", length(keys) - 2))
+    key_roles <- c("geo_value", rep("key", length(keys) - 2), "time_value")

     ## Check and add roles when available
     if (!is.null(roles)) {
@@ -499,8 +499,11 @@ prep.epi_recipe <- function(
   if (!is_epi_df(training)) {
     # tidymodels killed our class
     # for now, we only allow step_epi_* to alter the metadata
-    training <- dplyr::dplyr_reconstruct(
-      as_epi_df(training), before_template
+    metadata <- attr(before_template, "metadata")
+    training <- as_epi_df(
+      training,
+      as_of = metadata$as_of,
+      other_keys = metadata$other_keys %||% character()
     )
   }
   training <- dplyr::relocate(training, all_of(key_colnames(training)))
@@ -579,8 +582,7 @@ bake.epi_recipe <- function(object, new_data, ..., composition = "epi_df") {
     new_data <- as_epi_df(
       new_data,
       as_of = meta$as_of,
-      # avoid NULL if meta is from saved older epi_df:
-      additional_metadata = meta$additional_metadata %||% list()
+      other_keys = meta$other_keys %||% character()
     )
   }
   new_data
diff --git a/R/epi_workflow.R b/R/epi_workflow.R
index b059a81d0..f448f4aff 100644
--- a/R/epi_workflow.R
+++ b/R/epi_workflow.R
@@ -98,7 +98,8 @@ is_epi_workflow <- function(x) {
 fit.epi_workflow <- function(object, data, ..., control = workflows::control_workflow()) {
   object$fit$meta <- list(
     max_time_value = max(data$time_value),
-    as_of = attributes(data)$metadata$as_of
+    as_of = attr(data, "metadata")$as_of,
+    other_keys = attr(data, "metadata")$other_keys
   )

   object$original_data <- data
diff --git a/R/flusight_hub_formatter.R b/R/flusight_hub_formatter.R
index c91f738ae..3e0eb1aaa 100644
--- a/R/flusight_hub_formatter.R
+++ b/R/flusight_hub_formatter.R
@@ -67,11 +67,11 @@ abbr_to_location <- function(abbr) {
 #'   mutate(deaths = pmax(death_rate / 1e5 * pop * 7, 0)) %>%
 #'   select(-pop, -death_rate) %>%
 #'   group_by(geo_value) %>%
-#'   epi_slide(~ sum(.$deaths), before = 6, new_col_name = "deaths") %>%
+#'   epi_slide(~ sum(.$deaths), .window_size = 7, .new_col_name = "deaths_7dsum") %>%
 #'   ungroup() %>%
 #'   filter(weekdays(time_value) == "Saturday")
 #'
-#' cdc <- cdc_baseline_forecaster(weekly_deaths, "deaths")
+#' cdc <- cdc_baseline_forecaster(weekly_deaths, "deaths_7dsum")
 #' flusight_hub_formatter(cdc)
 #' flusight_hub_formatter(cdc, target = "wk inc covid deaths")
 #' flusight_hub_formatter(cdc, target = paste(horizon, "wk inc covid deaths"))
diff --git a/R/key_colnames.R b/R/key_colnames.R
index c69d1a628..b9ebde5dc 100644
--- a/R/key_colnames.R
+++ b/R/key_colnames.R
@@ -1,19 +1,20 @@
 #' @export
 key_colnames.recipe <- function(x, ...) {
-  possible_keys <- c("geo_value", "time_value", "key")
-  keys <- x$var_info$variable[x$var_info$role %in% possible_keys]
-  keys[order(match(keys, possible_keys))] %||% character(0L)
+  geo_key <- x$var_info$variable[x$var_info$role %in% "geo_value"]
+  time_key <- x$var_info$variable[x$var_info$role %in% "time_value"]
+  keys <- x$var_info$variable[x$var_info$role %in% "key"]
+  c(geo_key, keys, time_key) %||% character(0L)
 }

 #' @export
 key_colnames.epi_workflow <- function(x, ...) {
   # safer to look at the mold than the preprocessor
   mold <- hardhat::extract_mold(x)
-  possible_keys <- c("geo_value", "time_value", "key")
   molded_names <- names(mold$extras$roles)
-  keys <- map(mold$extras$roles[molded_names %in% possible_keys], names)
-  keys <- unname(unlist(keys))
-  keys[order(match(keys, possible_keys))] %||% character(0L)
+  geo_key <- names(mold$extras$roles[molded_names %in% "geo_value"]$geo_value)
+  time_key <- names(mold$extras$roles[molded_names %in% "time_value"]$time_value)
+  keys <- names(mold$extras$roles[molded_names %in% "key"]$key)
+  c(geo_key, keys, time_key) %||% character(0L)
 }

 kill_time_value <- function(v) {
diff --git a/R/step_epi_slide.R b/R/step_epi_slide.R
index 9714971fa..c7d3f9fbd 100644
--- a/R/step_epi_slide.R
+++ b/R/step_epi_slide.R
@@ -19,13 +19,18 @@
 #'   argument must be named `.x`. A common, though very difficult to debug
 #'   error is using something like `function(x) mean`. This will not work
 #'   because it returns the function mean, rather than `mean(x)`
-#' @param before,after the size of the sliding window on the left and the right
-#'   of the center. Usually non-negative integers for data indexed by date, but
-#'   more restrictive in other cases (see [epiprocess::epi_slide()] for details).
-#' @param f_name a character string of at most 20 characters that describes
-#'   the function. This will be combined with `prefix` and the columns in `...`
-#'   to name the result using `{prefix}{f_name}_{column}`. By default it will be determined
-#'   automatically using `clean_f_name()`.
+#' @param .window_size the size of the sliding window, required. Usually a
+#'   non-negative integer will suffice (e.g. for data indexed by date, but more
+#'   restrictive in other time_type cases (see [epiprocess::epi_slide()] for
+#'   details). For example, set to 7 for a 7-day window.
+#' @param .align a character string indicating how the window should be aligned.
+#'   By default, this is "right", meaning the slide_window will be anchored with
+#'   its right end point on the reference date. (see [epiprocess::epi_slide()]
+#'   for details).
+#' @param f_name a character string of at most 20 characters that describes the
+#'   function. This will be combined with `prefix` and the columns in `...` to
+#'   name the result using `{prefix}{f_name}_{column}`. By default it will be
+#'   determined automatically using `clean_f_name()`.
 #'
 #' @template step-return
 #'
@@ -37,53 +42,55 @@
 #' rec <- epi_recipe(jhu) %>%
 #'   step_epi_slide(case_rate, death_rate,
 #'     .f = \(x) mean(x, na.rm = TRUE),
-#'     before = 6L
+#'     .window_size = 7L
 #'   )
 #' bake(prep(rec, jhu), new_data = NULL)
-step_epi_slide <-
-  function(recipe,
-           ...,
-           .f,
-           before = 0L,
-           after = 0L,
-           role = "predictor",
-           prefix = "epi_slide_",
-           f_name = clean_f_name(.f),
-           skip = FALSE,
-           id = rand_id("epi_slide")) {
-    if (!is_epi_recipe(recipe)) {
-      cli_abort("This recipe step can only operate on an {.cls epi_recipe}.")
-    }
-    .f <- validate_slide_fun(.f)
-    epiprocess:::validate_slide_window_arg(before, attributes(recipe$template)$metadata$time_type)
-    epiprocess:::validate_slide_window_arg(after, attributes(recipe$template)$metadata$time_type)
-    arg_is_chr_scalar(role, prefix, id)
-    arg_is_lgl_scalar(skip)
+step_epi_slide <- function(recipe,
+                           ...,
+                           .f,
+                           .window_size = NULL,
+                           .align = c("right", "center", "left"),
+                           role = "predictor",
+                           prefix = "epi_slide_",
+                           f_name = clean_f_name(.f),
+                           skip = FALSE,
+                           id = rand_id("epi_slide")) {
+  if (!is_epi_recipe(recipe)) {
+    cli_abort("This recipe step can only operate on an {.cls epi_recipe}.")
+  }
+  .f <- validate_slide_fun(.f)
+  if (is.null(.window_size)) {
+    cli_abort("step_epi_slide: `.window_size` must be specified.")
+  }
+  epiprocess:::validate_slide_window_arg(.window_size, attributes(recipe$template)$metadata$time_type)
+  .align <- rlang::arg_match(.align)
+  arg_is_chr_scalar(role, prefix, id)
+  arg_is_lgl_scalar(skip)

-    recipes::add_step(
-      recipe,
-      step_epi_slide_new(
-        terms = enquos(...),
-        before = before,
-        after = after,
-        .f = .f,
-        f_name = f_name,
-        role = role,
-        trained = FALSE,
-        prefix = prefix,
-        keys = key_colnames(recipe),
-        columns = NULL,
-        skip = skip,
-        id = id
-      )
+  recipes::add_step(
+    recipe,
+    step_epi_slide_new(
+      terms = enquos(...),
+      .window_size = .window_size,
+      .align = .align,
+      .f = .f,
+      f_name = f_name,
+      role = role,
+      trained = FALSE,
+      prefix = prefix,
+      keys = key_colnames(recipe),
+      columns = NULL,
+      skip = skip,
+      id = id
     )
-  }
+  )
+}

 step_epi_slide_new <-
   function(terms,
-           before,
-           after,
+           .window_size,
+           .align,
            .f,
            f_name,
            role,
@@ -96,8 +103,8 @@ step_epi_slide_new <-
     recipes::step(
       subclass = "epi_slide",
       terms = terms,
-      before = before,
-      after = after,
+      .window_size = .window_size,
+      .align = .align,
       .f = .f,
       f_name = f_name,
       role = role,
@@ -119,8 +126,8 @@ prep.step_epi_slide <- function(x, training, info = NULL, ...) {
   step_epi_slide_new(
     terms = x$terms,
-    before = x$before,
-    after = x$after,
+    .window_size = x$.window_size,
+    .align = x$.align,
     .f = x$.f,
     f_name = x$f_name,
     role = x$role,
@@ -165,8 +172,8 @@ bake.step_epi_slide <- function(object, new_data, ...) {
   # }
   epi_slide_wrapper(
     new_data,
-    object$before,
-    object$after,
+    object$.window_size,
+    object$.align,
    object$columns,
    c(object$.f),
    object$f_name,
@@ -190,7 +197,7 @@ bake.step_epi_slide <- function(object, new_data, ...) {
 #' @importFrom dplyr bind_cols group_by ungroup
 #' @importFrom epiprocess epi_slide
 #' @keywords internal
-epi_slide_wrapper <- function(new_data, before, after, columns, fns, fn_names, group_keys, name_prefix) {
+epi_slide_wrapper <- function(new_data, .window_size, .align, columns, fns, fn_names, group_keys, name_prefix) {
   cols_fns <- tidyr::crossing(col_name = columns, fn_name = fn_names, fn = fns)
   # Iterate over the rows of cols_fns. For each row number, we will output a
   # transformed column. The first result returns all the original columns along
@@ -204,10 +211,10 @@ epi_slide_wrapper <- function(new_data, before, after, columns, fns, fn_names, g
       result <- new_data %>%
         group_by(across(all_of(group_keys))) %>%
         epi_slide(
-          before = before,
-          after = after,
-          new_col_name = result_name,
-          f = function(slice, geo_key, ref_time_value) {
+          .window_size = .window_size,
+          .align = .align,
+          .new_col_name = result_name,
+          .f = function(slice, geo_key, ref_time_value) {
            fn(slice[[col_name]])
          }
        ) %>%
diff --git a/R/utils-misc.R b/R/utils-misc.R
index af064b37c..b4d1c28b7 100644
--- a/R/utils-misc.R
+++ b/R/utils-misc.R
@@ -33,15 +33,14 @@ check_pname <- function(res, preds, object, newname = NULL) {
 grab_forged_keys <- function(forged, workflow, new_data) {
-  keys <- c("geo_value", "time_value", "key")
   forged_roles <- names(forged$extras$roles)
-  extras <- dplyr::bind_cols(forged$extras$roles[forged_roles %in% keys])
+  extras <- dplyr::bind_cols(forged$extras$roles[forged_roles %in% c("geo_value", "time_value", "key")])
   # 1. these are the keys in the test data after prep/bake
   new_keys <- names(extras)
   # 2. these are the keys in the training data
   old_keys <- key_colnames(workflow)
   # 3. these are the keys in the test data as input
-  new_df_keys <- key_colnames(new_data, extra_keys = setdiff(new_keys, keys[1:2]))
+  new_df_keys <- key_colnames(new_data, extra_keys = setdiff(new_keys, c("geo_value", "time_value")))
   if (!(setequal(old_keys, new_df_keys) && setequal(new_keys, new_df_keys))) {
     cli::cli_warn(c(
       "Not all epi keys that were present in the training data are available",
     ))
   }
@@ -49,12 +48,11 @@ grab_forged_keys <- function(forged, workflow, new_data) {
   if (is_epi_df(new_data)) {
-    extras <- as_epi_df(extras)
-    attr(extras, "metadata") <- attr(new_data, "metadata")
-  } else if (all(keys[1:2] %in% new_keys)) {
-    l <- list()
-    if (length(new_keys) > 2) l <- list(other_keys = new_keys[-c(1:2)])
-    extras <- as_epi_df(extras, additional_metadata = l)
+    meta <- attr(new_data, "metadata")
+    extras <- as_epi_df(extras, as_of = meta$as_of, other_keys = meta$other_keys %||% character())
+  } else if (all(c("geo_value", "time_value") %in% new_keys)) {
+    if (length(new_keys) > 2) other_keys <- new_keys[!new_keys %in% c("geo_value", "time_value")]
+    extras <- as_epi_df(extras, other_keys = other_keys %||% character())
   }
   extras
 }
diff --git a/data-raw/grad_employ_subset.R b/data-raw/grad_employ_subset.R
index ae063d22f..38719a02e 100644
--- a/data-raw/grad_employ_subset.R
+++ b/data-raw/grad_employ_subset.R
@@ -101,6 +101,6 @@ ncol(gemploy)
 grad_employ_subset <- gemploy %>%
   as_epi_df(
     as_of = "2022-07-19",
-    additional_metadata = list(other_keys = c("age_group", "edu_qual"))
+    other_keys = c("age_group", "edu_qual")
   )
 usethis::use_data(grad_employ_subset, overwrite = TRUE)
diff --git a/data/grad_employ_subset.rda b/data/grad_employ_subset.rda
index 3d74741cb..9380b43b5 100644
Binary files a/data/grad_employ_subset.rda and b/data/grad_employ_subset.rda differ
diff --git a/man/autoplot-epipred.Rd b/man/autoplot-epipred.Rd
index 27bfdf5f7..10236eb98 100644
--- a/man/autoplot-epipred.Rd
+++ b/man/autoplot-epipred.Rd
@@ -121,4 +121,6 @@ arx <- arx_forecaster(jhu, "death_rate", c("case_rate", "death_rate"),
   args_list = arx_args_list(ahead = 14L)
 )
 autoplot(arx, .max_facets = 6)
+NULL
+
 }
diff --git a/man/cdc_baseline_forecaster.Rd b/man/cdc_baseline_forecaster.Rd
index cd3c4ed67..0c7f1e436 100644
--- a/man/cdc_baseline_forecaster.Rd
+++ b/man/cdc_baseline_forecaster.Rd
@@ -44,11 +44,11 @@ weekly_deaths <- case_death_rate_subset \%>\%
   mutate(deaths = pmax(death_rate / 1e5 * pop * 7, 0)) \%>\%
   select(-pop, -death_rate) \%>\%
   group_by(geo_value) \%>\%
-  epi_slide(~ sum(.$deaths), before = 6, new_col_name = "deaths") \%>\%
+  epi_slide(~ sum(.$deaths), .window_size = 7, .new_col_name = "deaths_7dsum") \%>\%
   ungroup() \%>\%
   filter(weekdays(time_value) == "Saturday")

-cdc <- cdc_baseline_forecaster(weekly_deaths, "deaths")
+cdc <- cdc_baseline_forecaster(weekly_deaths, "deaths_7dsum")
 preds <- pivot_quantiles_wider(cdc$predictions, .pred_distn)

 if (require(ggplot2)) {
@@ -62,7 +62,7 @@ if (require(ggplot2)) {
     geom_line(aes(y = .pred), color = "orange") +
     geom_line(
       data = weekly_deaths \%>\% filter(geo_value \%in\% four_states),
-      aes(x = time_value, y = deaths)
+      aes(x = time_value, y = deaths_7dsum)
     ) +
     scale_x_date(limits = c(forecast_date - 90, forecast_date + 30)) +
     labs(x = "Date", y = "Weekly deaths") +
diff --git a/man/epi_slide_wrapper.Rd b/man/epi_slide_wrapper.Rd
index 0c05b7650..d67db1c88 100644
--- a/man/epi_slide_wrapper.Rd
+++ b/man/epi_slide_wrapper.Rd
@@ -6,8 +6,8 @@
\usage{
 epi_slide_wrapper(
   new_data,
-  before,
-  after,
+  .window_size,
+  .align,
   columns,
   fns,
   fn_names,
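For illustration only (not part of this patch): a minimal sketch of how a `step_epi_slide()` call migrates to the `.window_size`/`.align` arguments introduced above, assuming `epipredict` and `dplyr` are attached; the filter values are made up for the example, and the old call is kept as a comment for comparison.

```r
library(epipredict)
library(dplyr)

# Small illustrative training set drawn from the package's example data.
jhu <- case_death_rate_subset %>%
  filter(time_value >= as.Date("2021-11-01"), geo_value %in% c("ca", "ny"))

# Old signature (epipredict 0.0.20, epiprocess < 0.9.0): window given as `before`/`after`.
# rec <- epi_recipe(jhu) %>%
#   step_epi_slide(case_rate, .f = mean, before = 6L)

# New signature (this patch): `.window_size` is required and `.align` defaults to
# "right", so a trailing 7-day window anchored on the reference date is:
rec <- epi_recipe(jhu) %>%
  step_epi_slide(case_rate, .f = mean, .window_size = 7L)

bake(prep(rec, jhu), new_data = NULL)
```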
diff --git a/man/flusight_hub_formatter.Rd b/man/flusight_hub_formatter.Rd
index b43bc0ac2..b2be9b4fe 100644
--- a/man/flusight_hub_formatter.Rd
+++ b/man/flusight_hub_formatter.Rd
@@ -52,11 +52,11 @@ weekly_deaths <- case_death_rate_subset \%>\%
   mutate(deaths = pmax(death_rate / 1e5 * pop * 7, 0)) \%>\%
   select(-pop, -death_rate) \%>\%
   group_by(geo_value) \%>\%
-  epi_slide(~ sum(.$deaths), before = 6, new_col_name = "deaths") \%>\%
+  epi_slide(~ sum(.$deaths), .window_size = 7, .new_col_name = "deaths_7dsum") \%>\%
   ungroup() \%>\%
   filter(weekdays(time_value) == "Saturday")

-cdc <- cdc_baseline_forecaster(weekly_deaths, "deaths")
+cdc <- cdc_baseline_forecaster(weekly_deaths, "deaths_7dsum")
 flusight_hub_formatter(cdc)
 flusight_hub_formatter(cdc, target = "wk inc covid deaths")
 flusight_hub_formatter(cdc, target = paste(horizon, "wk inc covid deaths"))
diff --git a/man/step_epi_slide.Rd b/man/step_epi_slide.Rd
index 46bb386ad..242f8e312 100644
--- a/man/step_epi_slide.Rd
+++ b/man/step_epi_slide.Rd
@@ -8,8 +8,8 @@ step_epi_slide(
   recipe,
   ...,
   .f,
-  before = 0L,
-  after = 0L,
+  .window_size = NULL,
+  .align = c("right", "center", "left"),
   role = "predictor",
   prefix = "epi_slide_",
   f_name = clean_f_name(.f),
@@ -41,19 +41,25 @@ argument must be named \code{.x}. A common, though very difficult to debug
 error is using something like \code{function(x) mean}. This will not work
 because it returns the function mean, rather than \code{mean(x)}}

-\item{before, after}{the size of the sliding window on the left and the right
-of the center. Usually non-negative integers for data indexed by date, but
-more restrictive in other cases (see \code{\link[epiprocess:epi_slide]{epiprocess::epi_slide()}} for details).}
+\item{.window_size}{the size of the sliding window, required. Usually a
+non-negative integer will suffice (e.g. for data indexed by date, but more
+restrictive in other time_type cases (see \code{\link[epiprocess:epi_slide]{epiprocess::epi_slide()}} for
+details). For example, set to 7 for a 7-day window.}
+
+\item{.align}{a character string indicating how the window should be aligned.
+By default, this is "right", meaning the slide_window will be anchored with
+its right end point on the reference date. (see \code{\link[epiprocess:epi_slide]{epiprocess::epi_slide()}}
+for details).}

 \item{role}{For model terms created by this step, what analysis role should
 they be assigned? \code{lag} is default a predictor while \code{ahead} is an
 outcome.}

 \item{prefix}{A character string that will be prefixed to the new column.}

-\item{f_name}{a character string of at most 20 characters that describes
-the function. This will be combined with \code{prefix} and the columns in \code{...}
-to name the result using \verb{\{prefix\}\{f_name\}_\{column\}}. By default it will be determined
-automatically using \code{clean_f_name()}.}
+\item{f_name}{a character string of at most 20 characters that describes the
+function. This will be combined with \code{prefix} and the columns in \code{...} to
+name the result using \verb{\{prefix\}\{f_name\}_\{column\}}. By default it will be
+determined automatically using \code{clean_f_name()}.}

 \item{skip}{A logical. Should the step be skipped when the recipe is baked
 by \code{\link[=bake]{bake()}}? While all operations are baked
@@ -80,7 +86,7 @@ jhu <- case_death_rate_subset \%>\%
 rec <- epi_recipe(jhu) \%>\%
   step_epi_slide(case_rate, death_rate,
     .f = \(x) mean(x, na.rm = TRUE),
-    before = 6L
+    .window_size = 7L
   )
 bake(prep(rec, jhu), new_data = NULL)
 }
diff --git a/tests/testthat/test-epi_recipe.R b/tests/testthat/test-epi_recipe.R
index ed27d88c0..a4cbb00b4 100644
--- a/tests/testthat/test-epi_recipe.R
+++ b/tests/testthat/test-epi_recipe.R
@@ -59,7 +59,7 @@ test_that("epi_recipe formula works", {
     time_value = seq(as.Date("2020-01-01"), by = 1, length.out = 5),
     geo_value = "ca",
     z = "dummy_key"
-  ) %>% epiprocess::as_epi_df(additional_metadata = list(other_keys = "z"))
+  ) %>% epiprocess::as_epi_df(other_keys = "z")

   # with an additional key
   r <- epi_recipe(y ~ x + geo_value, tib)
diff --git a/tests/testthat/test-key_colnames.R b/tests/testthat/test-key_colnames.R
index d55a515ca..3b3118740 100644
--- a/tests/testthat/test-key_colnames.R
+++ b/tests/testthat/test-key_colnames.R
@@ -30,12 +30,12 @@ test_that("key_colnames extracts additional keys when they are present", {
     value = 1:length(geo_value) + 0.01 * rnorm(length(geo_value))
   ) %>%
     as_epi_df(
-      additional_metadata = list(other_keys = c("state", "pol"))
+      other_keys = c("state", "pol")
     )

   expect_identical(
     key_colnames(my_data),
-    c("geo_value", "time_value", "state", "pol")
+    c("geo_value", "state", "pol", "time_value")
   )

   my_recipe <- epi_recipe(my_data) %>%
@@ -43,16 +43,10 @@
     step_epi_naomit()

   # order of the additional keys may be different
-  expect_setequal(
-    key_colnames(my_recipe),
-    c("geo_value", "time_value", "state", "pol")
-  )
+  expect_equal(key_colnames(my_recipe), c("geo_value", "state", "pol", "time_value"))

   my_workflow <- epi_workflow(my_recipe, linear_reg()) %>% fit(my_data)

   # order of the additional keys may be different
-  expect_setequal(
-    key_colnames(my_workflow),
-    c("geo_value", "time_value", "state", "pol")
-  )
+  expect_equal(key_colnames(my_workflow), c("geo_value", "state", "pol", "time_value"))
 })
diff --git a/tests/testthat/test-layer_add_forecast_date.R b/tests/testthat/test-layer_add_forecast_date.R
index 9595b47b6..915804d6b 100644
--- a/tests/testthat/test-layer_add_forecast_date.R
+++ b/tests/testthat/test-layer_add_forecast_date.R
@@ -93,6 +93,8 @@ test_that("forecast date works for daily", {
     unclass() %>%
     as.data.frame() %>%
     mutate(time_value = as.POSIXlt(time_value)$year + 1900L) %>%
+    group_by(geo_value, time_value) %>%
+    summarize(case_rate = mean(case_rate), death_rate = mean(death_rate), .groups="drop") %>%
     as_epi_df()

   expect_error(predict(wf1, latest_yearly))
diff --git a/tests/testthat/test-layer_add_target_date.R b/tests/testthat/test-layer_add_target_date.R
index e5349839b..f1fa3f217 100644
--- a/tests/testthat/test-layer_add_target_date.R
+++ b/tests/testthat/test-layer_add_target_date.R
@@ -104,6 +104,8 @@ test_that("target date works for daily and yearly", {
     unclass() %>%
     as.data.frame() %>%
     mutate(time_value = as.POSIXlt(time_value)$year + 1900L) %>%
+    group_by(geo_value, time_value) %>%
+    summarize(case_rate = mean(case_rate), death_rate = mean(death_rate), .groups = "drop") %>%
     as_epi_df()

   expect_error(predict(wf1, latest_bad))
diff --git a/tests/testthat/test-pad_to_end.R b/tests/testthat/test-pad_to_end.R
index 0ea6244b0..6949f06ac 100644
--- a/tests/testthat/test-pad_to_end.R
+++ b/tests/testthat/test-pad_to_end.R
@@ -32,6 +32,6 @@ test_that("test set padding works", {
   # make sure it maintains the epi_df
   dat <- dat %>%
     dplyr::rename(geo_value = gr1) %>%
-    as_epi_df()
+    as_epi_df(other_keys = "gr2")
   expect_s3_class(pad_to_end(dat, "geo_value", 2), "epi_df")
 })
diff --git a/tests/testthat/test-step_epi_slide.R b/tests/testthat/test-step_epi_slide.R
index 29e046eae..e90c317f8 100644
--- a/tests/testthat/test-step_epi_slide.R
+++ b/tests/testthat/test-step_epi_slide.R
@@ -7,32 +7,23 @@ edf <- data.frame(
   value = c(2:21, 3:22)
 ) %>%
   as_epi_df()
-
 r <- epi_recipe(edf)
-rolled_before <- edf %>%
-  group_by(geo_value) %>%
-  epi_slide(value = mean(value), before = 3L) %>%
-  pull(value)
-rolled_after <- edf %>%
-  group_by(geo_value) %>%
-  epi_slide(value = mean(value), after = 3L) %>%
-  pull(value)

 test_that("epi_slide errors when needed", {
   # not an epi_recipe
-  expect_error(recipe(edf) %>% step_epi_slide(value, .f = mean, before = 6L))
+  expect_error(recipe(edf) %>% step_epi_slide(value, .f = mean, .window_size = 7L))

   # non-scalar args
-  expect_error(r %>% step_epi_slide(value, .f = mean, before = c(3L, 6L)))
-  expect_error(r %>% step_epi_slide(value, .f = mean, after = c(3L, 6L)))
+  expect_error(r %>% step_epi_slide(value, .f = mean, .window_size = c(3L, 6L)))
+  expect_error(r %>% step_epi_slide(value, .f = mean, .align = c("right", "left")))
   expect_error(r %>% step_epi_slide(value, .f = mean, skip = c(TRUE, FALSE)))
   expect_error(r %>% step_epi_slide(value, .f = mean, role = letters[1:2]))
   expect_error(r %>% step_epi_slide(value, .f = mean, prefix = letters[1:2]))
   expect_error(r %>% step_epi_slide(value, .f = mean, id = letters[1:2]))

   # wrong types
-  expect_error(r %>% step_epi_slide(value, .f = mean, before = 1.5))
-  expect_error(r %>% step_epi_slide(value, .f = mean, after = 1.5))
+  expect_error(r %>% step_epi_slide(value, .f = mean, .window_size = 1.5))
+  expect_error(r %>% step_epi_slide(value, .f = mean, .align = 1.5))
   expect_error(r %>% step_epi_slide(value, .f = mean, skip = "a"))
   expect_error(r %>% step_epi_slide(value, .f = mean, role = 1))
   expect_error(r %>% step_epi_slide(value, .f = mean, prefix = 1))
@@ -45,31 +36,40 @@ test_that("epi_slide errors when needed", {
 test_that("epi_slide handles different function specs", {
   cfun <- r %>%
-    step_epi_slide(value, .f = "mean", before = 3L) %>%
+    step_epi_slide(value, .f = "mean", .window_size = 4L) %>%
     prep(edf) %>%
     bake(new_data = NULL)
+  expected_out <- edf %>%
+    group_by(geo_value) %>%
+    epi_slide(~ mean(.x$value), .window_size = 4L) %>%
+    ungroup() %>%
+    rename(epi_slide__.f_value = slide_value)
+  expect_equal(cfun, expected_out)
   ffun <- r %>%
-    step_epi_slide(value, .f = mean, before = 3L) %>%
+    step_epi_slide(value, .f = mean, .window_size = 4L) %>%
     prep(edf) %>%
     bake(new_data = NULL)
+  expect_equal(ffun, expected_out)
   # formula NOT currently supported
   expect_error(
     lfun <- r %>%
-      step_epi_slide(value, .f = ~ mean(.x, na.rm = TRUE), before = 3L),
+      step_epi_slide(value, .f = ~ mean(.x, na.rm = TRUE), .window_size = 4L),
     regexp = "cannot be a formula."
   )
+  # expect_equal(lfun, rolled_before)
   blfun <- r %>%
-    step_epi_slide(value, .f = function(x) mean(x, na.rm = TRUE), before = 3L) %>%
+    step_epi_slide(value, .f = function(x) mean(x, na.rm = TRUE), .window_size = 4L) %>%
     prep(edf) %>%
     bake(new_data = NULL)
+  expected_out <- edf %>%
+    group_by(geo_value) %>%
+    epi_slide(~ mean(.x$value, na.rm = TRUE), .window_size = 4L) %>%
+    ungroup() %>%
+    rename(epi_slide__.f_value = slide_value)
+  expect_equal(blfun, expected_out)
   nblfun <- r %>%
-    step_epi_slide(value, .f = \(x) mean(x, na.rm = TRUE), before = 3L) %>%
+    step_epi_slide(value, .f = \(x) mean(x, na.rm = TRUE), .window_size = 4L) %>%
     prep(edf) %>%
     bake(new_data = NULL)
-
-  expect_equal(cfun[[4]], rolled_before)
-  expect_equal(ffun[[4]], rolled_before)
-  # expect_equal(lfun[[4]], rolled_before)
-  expect_equal(blfun[[4]], rolled_before)
-  expect_equal(nblfun[[4]], rolled_before)
+  expect_equal(nblfun, expected_out)
 })
diff --git a/tests/testthat/test-step_training_window.R b/tests/testthat/test-step_training_window.R
index f49668a40..a9f2170d3 100644
--- a/tests/testthat/test-step_training_window.R
+++ b/tests/testthat/test-step_training_window.R
@@ -84,7 +84,7 @@ test_that("step_training_window works with multiple keys", {
   expect_equal(nrow(p4), 12L)
   expect_equal(ncol(p4), 5L)
   expect_s3_class(p4, "epi_df")
-  expect_named(p4, c("geo_value", "time_value", "additional_key", "x", "y"))
+  expect_named(p4, c("geo_value", "additional_key", "time_value", "x", "y"))
   expect_equal(
     p4$time_value,
     rep(c(
diff --git a/vignettes/articles/sliding.Rmd b/vignettes/articles/sliding.Rmd
index ec6f67359..31cc7b9b0 100644
--- a/vignettes/articles/sliding.Rmd
+++ b/vignettes/articles/sliding.Rmd
@@ -54,7 +54,7 @@ applying `epi_slide()` to the latest snapshot of the data.
 First, we download the version history (ie. archive) of the percentage of
 doctor's visits with CLI (COVID-like illness) computed from medical insurance
 claims and the number of new confirmed COVID-19 cases per 100,000 population
-(daily) for all 50 states from the COVIDcast API. 
+(daily) for all 50 states from the COVIDcast API.
@@ -69,7 +69,6 @@ versions for the less up-to-date input archive.
 theme_set(theme_bw())

 y <- readRDS("all_states_covidcast_signals.rds")
-
 y <- purrr::map(y, ~ select(.x, geo_value, time_value, version = issue, value))

 x <- epix_merge(
@@ -93,15 +92,11 @@ output.

 ```{r arx-kweek-preliminaries, warning = FALSE}
 # Latest snapshot of data, and forecast dates
-x_latest <- epix_as_of(x, max_version = max(x$versions_end))
-fc_time_values <- seq(
-  from = as.Date("2020-08-01"),
-  to = as.Date("2021-11-01"),
-  by = "1 month"
-)
+x_latest <- epix_as_of(x, version = max(x$versions_end))
+fc_time_values <- seq(from = as.Date("2021-11-01"), to = as.Date("2021-11-01"), by = "1 month")
 aheads <- c(7, 14, 21, 28)

-k_week_ahead <- function(epi_df, outcome, predictors, ahead = 7, engine) {
+forecast_k_week_ahead <- function(epi_df, outcome, predictors, ahead = 7, engine) {
   epi_slide(
     epi_df,
     ~ arx_forecaster(
@@ -109,9 +104,9 @@ k_week_ahead <- function(epi_df, outcome, predictors, ahead = 7, engine) {
       args_list = arx_args_list(ahead = ahead)
     )$predictions %>% select(-geo_value),
-    before = 120 - 1,
-    ref_time_values = fc_time_values,
-    new_col_name = "fc"
+    .window_size = 120,
+    .ref_time_values = fc_time_values,
+    .new_col_name = "fc"
   ) %>%
     select(geo_value, time_value, starts_with("fc")) %>%
     mutate(engine_type = engine$engine)
@@ -121,22 +116,22 @@ k_week_ahead <- function(epi_df, outcome, predictors, ahead = 7, engine) {

 ```{r make-arx-kweek}
 # Generate the forecasts and bind them together
 fc <- bind_rows(
-  map(
-    aheads,
-    ~ k_week_ahead(
-      x_latest, "case_rate", c("case_rate", "percent_cli"), .x,
-      engine = linear_reg()
-    )
-  ) %>% list_rbind(),
-  map(
-    aheads,
-    ~ k_week_ahead(
-      x_latest, "case_rate", c("case_rate", "percent_cli"), .x,
-      engine = rand_forest(mode = "regression")
-    )
-  ) %>% list_rbind()
-) %>%
-  pivot_quantiles_wider(fc_.pred_distn)
+  map(aheads, ~ forecast_k_week_ahead(
+    x_latest,
+    outcome = "case_rate",
+    predictors = c("case_rate", "percent_cli"),
+    ahead = .x,
+    engine = linear_reg()
+  )),
+  map(aheads, ~ forecast_k_week_ahead(
+    x_latest,
+    outcome = "case_rate",
+    predictors = c("case_rate", "percent_cli"),
+    ahead = .x,
+    engine = rand_forest(mode = "regression")
+  ))
+)
+pivot_quantiles_wider(fc_.pred_distn)
 ```

 Here, `arx_forecaster()` does all the heavy lifting. It creates leads of the
@@ -235,11 +230,11 @@ can_latest <- epix_as_of(can, max_version = max(can$DT$version))
 can_fc <- bind_rows(
   map(
     aheads,
-    ~ k_week_ahead(can_latest, "cr_7dav", "cr_7dav", .x, linear_reg())
+    ~ forecast_k_week_ahead(can_latest, "cr_7dav", "cr_7dav", .x, linear_reg())
   ) %>% list_rbind(),
   map(
     aheads,
-    ~ k_week_ahead(
+    ~ forecast_k_week_ahead(
       can_latest, "cr_7dav", "cr_7dav", .x,
       boost_tree(mode = "regression", trees = 20)
     )
diff --git a/vignettes/arx-classifier.Rmd b/vignettes/arx-classifier.Rmd
index ae1641cce..b2a2bbf8e 100644
--- a/vignettes/arx-classifier.Rmd
+++ b/vignettes/arx-classifier.Rmd
@@ -50,10 +50,7 @@ jhu <- case_death_rate_subset %>%
     geo_value %in% c("ca", "fl", "tx", "ny", "nj")
   )

-out <- arx_classifier(jhu,
-  outcome = "case_rate",
-  predictors = "case_rate"
-)
+out <- arx_classifier(jhu, outcome = "case_rate", predictors = "case_rate")

 out$predictions
 ```
@@ -93,7 +90,8 @@ relying on the default of 0.25. We can do this by passing 0.5 to the
 `breaks` argument in `arx_class_args_list()` as follows:

 ```{r}
-out_break_0.5 <- arx_classifier(jhu,
+out_break_0.5 <- arx_classifier(
+  jhu,
   outcome = "case_rate",
   predictors = "case_rate",
   args_list = arx_class_args_list(
@@ -142,8 +140,8 @@ the present? To answer this question, we can create a predictive model
 for upswings and downswings of case rates rather than the raw case rates
 themselves. For this situation, our target is to predict whether there is an
 increase in case rates
-or not. Following 
-[McDonald, Bien, Green, Hu, et al.(2021)](https://www.pnas.org/doi/full/10.1073/pnas.2111453118), 
+or not. Following
+[McDonald, Bien, Green, Hu, et al.(2021)](https://www.pnas.org/doi/full/10.1073/pnas.2111453118),
 we look at the relative change between $Y_{l,t}$ and
 $Y_{l, t+a}$, where the former is the case rate at location $l$ at time $t$ and
 the latter is the rate for that location at
@@ -152,7 +150,7 @@ with two classes

 $$\begin{align}
 Z_{l,t} = \left\{\begin{matrix}
-\text{up,} & \text{if } Y_{l,t}^\Delta > 0.25\\ 
+\text{up,} & \text{if } Y_{l,t}^\Delta > 0.25\\
 \text{not up,} & \text{otherwise}
 \end{matrix}\right.
 \end{align}$$
@@ -166,7 +164,7 @@
 $$\begin{align}
 \pi_{\text{up}}(x) &= Pr(Z_{l, t} = \text{up}|x) = \frac{e^{g_{\text{up}}(x)}}{1 + e^{g_{\text{up}}(x)}}, \\
 \pi_{\text{not up}}(x)&= Pr(Z_{l, t} = \text{not up}|x) = 1 - Pr(Z_{l, t} = \text{up}|x) = \frac{1}{1 + e^{g_{\text{up}}(x)}}
 \end{align}$$
-where 
+where

 $$
 g_{\text{up}}(x) = \log\left ( \frac{\Pr(Z_{l, t} = \text{up} \vert x)}{\Pr(Z_{l, t} = \text{not up} \vert x)} \right ) = \beta_{10} + \beta_{11}Y_{l,t}^\Delta + \beta_{12}Y_{l,t-7}^\Delta + \beta_{13}Y_{l,t-14}^\Delta.
@@ -223,7 +221,7 @@ require access to the training data.

 The other optional arguments for controlling the growth rate calculation
 (that can be inputted as `additional_gr_args`) can be found in the documentation for
-`epiprocess::growth_rate()` and the related 
+`epiprocess::growth_rate()` and the related
 `vignette("growth_rate", package = "epiprocess")`.

 ### Visualizing the results
@@ -280,4 +278,4 @@ to start with using the built-in classifier for ostensibly simple projects and
 begin to implement your own when the modelling project takes a complicated turn.
 To get some practice on coding up a classifier by hand, consider translating
 this binary classification model example to an `epi_workflow`, akin to that in
-`vignette("preprocessing-and-models")`. 
+`vignette("preprocessing-and-models")`.
diff --git a/vignettes/epipredict.Rmd b/vignettes/epipredict.Rmd
index 7e24b04c6..1925de2fb 100644
--- a/vignettes/epipredict.Rmd
+++ b/vignettes/epipredict.Rmd
@@ -110,8 +110,6 @@ the *same set* of `geo_value`'s and `time_value`'s could actually be different.
 For more details, see
 [`{epiprocess}`](https://cmu-delphi.github.io/epiprocess/articles/epiprocess.html).

-
-
 ## Why doesn't this package already exist?

 As described above:
@@ -121,7 +119,7 @@ preprocessing, training, and prediction, bound together, through a package calle
 `{workflows}`. We built `{epipredict}` on top of that setup. In this way, you
 CAN use almost everything they provide.

-* However, `{workflows}` doesn't do postprocessing. And nothing in the -verse 
+* However, `{workflows}` doesn't do postprocessing. And nothing in the -verse
   handles _panel data_.

 * The tidy-team doesn't have plans to do either of these things. (We checked).
@@ -131,7 +129,7 @@ handles _panel data_.
 etc.[^2] Our group has not prioritized these sorts of models for epidemic
 forecasting, but one could also integrate these methods into our framework.

-[^2]: These are [`{timetk}`](https://business-science.github.io/timetk/index.html) 
+[^2]: These are [`{timetk}`](https://business-science.github.io/timetk/index.html)
 and [`{modeltime}`](https://business-science.github.io/timetk/index.html). There
 are *lots* of useful methods there than can be used to do fairly complex machine
 learning methodology, though not directly for panel data and not for direct
@@ -327,6 +325,7 @@ the `time_value`, `geo_value`, and any additional keys so that these are availab
 when necessary.

 The `epi_recipe` from `out_gb` can be extracted from the result:
+
 ```{r}
 extract_recipe(out_gb$epi_workflow)
 ```
@@ -441,7 +440,7 @@ But ideally, a user could create their own forecasters by building up the
 components we provide. In other vignettes, we try to walk through some of
 these customizations.

-To illustrate everything above, here is (roughly) the code for the 
+To illustrate everything above, here is (roughly) the code for the
 `flatline_forecaster()` applied to the `case_rate`.

 ```{r}
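For illustration only (not part of this patch): a minimal sketch of the epiprocess 0.9.0 argument renames that this change tracks throughout, namely `as_epi_df(additional_metadata = list(other_keys = ...))` becoming `as_epi_df(other_keys = ...)` and `epi_slide(before =, new_col_name =)` becoming `epi_slide(.window_size =, .new_col_name =)`. It assumes `epiprocess` and `dplyr` are attached; the tibble columns and key names are made up for the example, and the old calls are kept as comments.

```r
library(epiprocess)
library(dplyr)

# Toy panel with one extra key column besides geo_value/time_value.
tib <- tibble::tibble(
  geo_value = "ca",
  age_group = "0-17", # illustrative extra key
  time_value = seq(as.Date("2020-01-01"), by = 1, length.out = 10),
  value = 1:10
)

# Old: as_epi_df(tib, additional_metadata = list(other_keys = "age_group"))
edf <- as_epi_df(tib, other_keys = "age_group")

# Old: epi_slide(~ sum(.x$value), before = 6, new_col_name = "value_7dsum")
edf %>%
  group_by(geo_value, age_group) %>%
  epi_slide(~ sum(.x$value), .window_size = 7, .new_col_name = "value_7dsum") %>%
  ungroup()
```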