From 7a8ec75b56f1ce577768fc8264945bd88f1dfa48 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Thu, 7 Nov 2024 17:01:46 -0800 Subject: [PATCH] WIP --- NAMESPACE | 2 ++ R/epiprocess-package.R | 2 ++ R/slide.R | 72 ++++++++++++++++++++++-------------------- R/utils.R | 67 ++++++++++++++++++++++++++++++++++++++- man/epi_slide.Rd | 4 +-- man/epi_slide_opt.Rd | 2 +- 6 files changed, 110 insertions(+), 39 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 10cc594f0..e044739b8 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -200,6 +200,7 @@ importFrom(rlang,env) importFrom(rlang,expr_label) importFrom(rlang,f_env) importFrom(rlang,f_rhs) +importFrom(rlang,is_bare_integerish) importFrom(rlang,is_environment) importFrom(rlang,is_formula) importFrom(rlang,is_function) @@ -233,3 +234,4 @@ importFrom(tidyselect,starts_with) importFrom(tsibble,as_tsibble) importFrom(utils,capture.output) importFrom(utils,tail) +importFrom(vctrs,vec_data) diff --git a/R/epiprocess-package.R b/R/epiprocess-package.R index c481df076..b640a0b5f 100644 --- a/R/epiprocess-package.R +++ b/R/epiprocess-package.R @@ -17,6 +17,8 @@ #' @importFrom dplyr select #' @importFrom lifecycle deprecated #' @importFrom rlang %||% +#' @importFrom rlang is_bare_integerish +#' @importFrom vctrs vec_data ## usethis namespace: end NULL diff --git a/R/slide.R b/R/slide.R index 9077769dd..bdefba434 100644 --- a/R/slide.R +++ b/R/slide.R @@ -537,7 +537,7 @@ get_before_after_from_window <- function(window_size, align, time_type) { #' #' @template basic-slide-params #' @param .col_names <[`tidy-select`][dplyr_tidy_select]> An unquoted column -#' name(e.g., `cases`), multiple column names (e.g., `c(cases, deaths)`), +#' name (e.g., `cases`), multiple column names (e.g., `c(cases, deaths)`), #' [other tidy-select expression][tidyselect::language], or a vector of #' characters (e.g. `c("cases", "deaths")`). Variable names can be used as if #' they were positions in the data frame, so expressions like `x:y` can be @@ -692,6 +692,36 @@ epi_slide_opt <- function( } f_from_package <- f_info$package + user_provided_rtvs <- !is.null(.ref_time_values) + if (!user_provided_rtvs) { + .ref_time_values <- unique(.x$time_value) + } else { + assert_numeric(.ref_time_values, min.len = 1L, null.ok = FALSE, any.missing = FALSE) + if (!test_subset(.ref_time_values, unique(.x$time_value))) { + cli_abort( + "`ref_time_values` must be a unique subset of the time values in `x`.", + class = "epiprocess__epi_slide_opt_invalid_ref_time_values" + ) + } + if (anyDuplicated(.ref_time_values) != 0L) { + cli_abort( + "`ref_time_values` must not contain any duplicates; use `unique` if appropriate.", + class = "epiprocess__epi_slide_opt_invalid_ref_time_values" + ) + } + } + ref_time_values <- sort(.ref_time_values) + + # Handle window arguments + .align <- rlang::arg_match(.align) + time_type <- attr(.x, "metadata")$time_type + if (is.null(.window_size)) { + cli_abort("epi_slide_opt: `.window_size` must be specified.") + } + validate_slide_window_arg(.window_size, time_type) + window_args <- get_before_after_from_window(.window_size, .align, time_type) + + # Handle output naming assert_string(.prefix, null.ok = TRUE) assert_string(.suffix, null.ok = TRUE) assert_character(.new_col_names, len = length(col_names_chr), null.ok = TRUE) @@ -701,21 +731,22 @@ epi_slide_opt <- function( ) } if (is.null(.prefix) && is.null(.suffix) && is.null(.new_col_names)) { - .suffix <- "_{.window_size}{.time_unit}{.f_abbr}" + .suffix <- "_{.n}{.time_unit}{.f_abbr}" } if (!is.null(.prefix) || !is.null(.suffix)) { .prefix <- .prefix %||% "" .suffix <- .suffix %||% "" + # FIXME alignment marker glue_env <- rlang::env( - .window_size = .window_size, # FIXME typing - .time_unit = "d", # FIXME + .n = time_delta_to_n_steps(.window_size, time_type), # FIXME Inf... + .time_unit = time_type_unit_abbr(time_type), .f_abbr = f_info$abbr, quo_get_env(col_names_quo) ) .new_col_names <- unclass( glue(.prefix, .envir = glue_env) + - col_names_chr + - glue(.suffix, .envir = glue_env) + col_names_chr + + glue(.suffix, .envir = glue_env) ) } else { # `.new_col_names` was provided by user; we don't need to do anything. @@ -728,35 +759,6 @@ epi_slide_opt <- function( } result_col_names <- .new_col_names - user_provided_rtvs <- !is.null(.ref_time_values) - if (!user_provided_rtvs) { - .ref_time_values <- unique(.x$time_value) - } else { - assert_numeric(.ref_time_values, min.len = 1L, null.ok = FALSE, any.missing = FALSE) - if (!test_subset(.ref_time_values, unique(.x$time_value))) { - cli_abort( - "`ref_time_values` must be a unique subset of the time values in `x`.", - class = "epiprocess__epi_slide_opt_invalid_ref_time_values" - ) - } - if (anyDuplicated(.ref_time_values) != 0L) { - cli_abort( - "`ref_time_values` must not contain any duplicates; use `unique` if appropriate.", - class = "epiprocess__epi_slide_opt_invalid_ref_time_values" - ) - } - } - ref_time_values <- sort(.ref_time_values) - - # Handle window arguments - .align <- rlang::arg_match(.align) - time_type <- attr(.x, "metadata")$time_type - if (is.null(.window_size)) { - cli_abort("epi_slide_opt: `.window_size` must be specified.") - } - validate_slide_window_arg(.window_size, time_type) - window_args <- get_before_after_from_window(.window_size, .align, time_type) - # Make a complete date sequence between min(.x$time_value) and max(.x$time_value). date_seq_list <- full_date_seq(.x, window_args$before, window_args$after, time_type) all_dates <- date_seq_list$all_dates diff --git a/R/utils.R b/R/utils.R index 1bfd21292..9caa6ad5b 100644 --- a/R/utils.R +++ b/R/utils.R @@ -640,7 +640,7 @@ guess_time_type <- function(time_value, time_value_arg = rlang::caller_arg(time_ return("day") } else if (inherits(time_value, "yearmonth")) { return("yearmonth") - } else if (rlang::is_integerish(time_value)) { + } else if (is_bare_integerish(time_value)) { return("integer") } @@ -1109,3 +1109,68 @@ validate_slide_window_arg <- function(arg, time_type, lower = 1, allow_inf = TRU ) } } + + +#' Convert a time delta to a compatible integerish number of steps between time values +#' +#' @param time_delta a vector that can be added to time values of time type +#' `time_type` to arrive at other time values of that time type, or +#' `r lifecycle::badge("experimental")` such a vector with Inf/-Inf entries mixed +#' in, if supported by the class of `time_delta`, even if `time_type` doesn't +#' necessarily support Inf/-Inf entries. Basically a slide window arg but +#' without sign and length restrictions. +#' @param time_type as in [`validate_slide_window_arg`] +#' @return [bare integerish][rlang::is_integerish] vector (with possible +#' infinite values) that produces the same result as `time_delta` when added +#' to time values of time type `time_type`. If the given time type does not +#' support infinite values, then it should produce +Inf or -Inf for analogous +#' entries of `time_delta`, and match the addition result match the addition +#' result for non-infinite values, and product +Inf / -Inf when match the sign +#' and of `time_delta`. +#' +#' @keywords internal +time_delta_to_n_steps <- function(time_delta, time_type) { + # could be S3 if we're willing to export + if (inherits(time_delta, "difftime")) { + output_units <- switch(time_type, + day = "days", + week = "weeks", + cli_abort("difftime objects not supported for time_type {format_chr_with_quotes(time_type)}") + ) + units(time_delta) <- output_units # converts number accordingly, doesn't just set attr + n_steps <- vec_data(time_delta) + if (!is_bare_integerish(n_steps)) { + cli_abort("`time_delta` did not appear to contain only integerish numbers + of steps between time values of time type {format_chr_with_quotes(time_type)}") + } + n_steps + } else if (is_bare_integerish(time_delta)) { # (allows infinite values) + switch(time_type, + day = , + week = , + yearmonth = , + integer = time_delta, + cli_abort("Invalid or unsupported time_type {format_chr_with_quotes(time_type)}") + ) + } else { + cli_abort("Invalid or unsupported kind of `time_delta`") + } +} + +# Using these unit abbreviations happens to make our automatic slide output +# naming look like taking ISO-8601 duration designations, removing the P, and +# lowercasing any characters. Fortnightly or sub-daily time types would need an +# adjustment to remain consistent. +time_type_unit_abbrs <- c( + day = "d", + week = "w", + yearmon = "m" +) + +time_type_unit_abbr <- function(time_type) { + maybe_unit_abbr <- time_type_unit_abbrs[time_type] + if (is.na(maybe_unit_abbr)) { + cli_abort("Cannot determine the units of time type {format_chr_with_quotes(time_type)}") + } + maybe_unit_abbr +} diff --git a/man/epi_slide.Rd b/man/epi_slide.Rd index 10d389578..1c399d353 100644 --- a/man/epi_slide.Rd +++ b/man/epi_slide.Rd @@ -99,8 +99,8 @@ This is useful for computations like rolling averages. The function supports many ways to specify the computation, but by far the most common use case is as follows: -\if{html}{\out{
}}\preformatted{# Create new column `cases_7dm` that contains a 7-day trailing median of cases -epi_slide(edf, cases_7dav = median(cases), .window_size = 7) +\if{html}{\out{
}}\preformatted{# Create new column `cases_7dmed` that contains a 7-day trailing median of cases +epi_slide(edf, cases_7dmed = median(cases), .window_size = 7) }\if{html}{\out{
}} For two very common use cases, we provide optimized functions that are much diff --git a/man/epi_slide_opt.Rd b/man/epi_slide_opt.Rd index b5e923f47..64498fc7e 100644 --- a/man/epi_slide_opt.Rd +++ b/man/epi_slide_opt.Rd @@ -46,7 +46,7 @@ columns in \code{other_keys}. If grouped, we make sure the grouping is by \code{geo_value} and \code{other_keys}.} \item{.col_names}{<\code{\link[=dplyr_tidy_select]{tidy-select}}> An unquoted column -name(e.g., \code{cases}), multiple column names (e.g., \code{c(cases, deaths)}), +name (e.g., \code{cases}), multiple column names (e.g., \code{c(cases, deaths)}), \link[tidyselect:language]{other tidy-select expression}, or a vector of characters (e.g. \code{c("cases", "deaths")}). Variable names can be used as if they were positions in the data frame, so expressions like \code{x:y} can be