From 25322782366b855aa39ce109cebb8282083296bc Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 6 Nov 2024 04:22:53 -0800 Subject: [PATCH] WIP on .prefix =, .suffix =, .new_col_names = args --- NAMESPACE | 3 ++ R/epi_df.R | 11 +++-- R/epiprocess-package.R | 1 + R/slide.R | 91 ++++++++++++++++++++++++++++++++---------- man/epi_slide_opt.Rd | 3 ++ 5 files changed, 81 insertions(+), 28 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 04e8bf615..10cc594f0 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -114,6 +114,7 @@ importFrom(checkmate,assert_list) importFrom(checkmate,assert_logical) importFrom(checkmate,assert_numeric) importFrom(checkmate,assert_scalar) +importFrom(checkmate,assert_string) importFrom(checkmate,checkInt) importFrom(checkmate,check_atomic) importFrom(checkmate,check_data_frame) @@ -176,6 +177,7 @@ importFrom(dplyr,summarize) importFrom(dplyr,tibble) importFrom(dplyr,ungroup) importFrom(ggplot2,autoplot) +importFrom(glue,glue) importFrom(lifecycle,deprecated) importFrom(lubridate,as.period) importFrom(lubridate,days) @@ -206,6 +208,7 @@ importFrom(rlang,is_quosure) importFrom(rlang,list2) importFrom(rlang,missing_arg) importFrom(rlang,new_function) +importFrom(rlang,quo_get_env) importFrom(rlang,quo_is_missing) importFrom(rlang,sym) importFrom(rlang,syms) diff --git a/R/epi_df.R b/R/epi_df.R index bcf9e56fd..5cf379e2a 100644 --- a/R/epi_df.R +++ b/R/epi_df.R @@ -232,7 +232,6 @@ as_epi_df.tbl_df <- function( as_of, other_keys = character(), ...) { - # possible standard substitutions for time_value x <- rename(x, ...) x <- guess_column_name(x, "time_value", time_column_names()) x <- guess_column_name(x, "geo_value", geo_column_names()) @@ -282,11 +281,11 @@ as_epi_df.tbl_df <- function( cli_abort("as_epi_df: `other_keys` can't include \".time_value_counts\"") } - duplicated_time_values <- x %>% - group_by(across(all_of(c("geo_value", "time_value", other_keys)))) %>% - filter(dplyr::n() > 1) %>% - ungroup() - if (nrow(duplicated_time_values) > 0) { + if (anyDuplicated(x[c("geo_value", "time_value", other_keys)])) { + duplicated_time_values <- x %>% + group_by(across(all_of(c("geo_value", "time_value", other_keys)))) %>% + filter(dplyr::n() > 1) %>% + ungroup() bad_data <- capture.output(duplicated_time_values) cli_abort( "as_epi_df: some groups in the data have duplicated time values. epi_df requires a unique time_value per group.", diff --git a/R/epiprocess-package.R b/R/epiprocess-package.R index e28cec0f3..c481df076 100644 --- a/R/epiprocess-package.R +++ b/R/epiprocess-package.R @@ -6,6 +6,7 @@ #' @importFrom checkmate anyInfinite anyMissing assert assert_character #' @importFrom checkmate assert_class assert_data_frame assert_int assert_list #' @importFrom checkmate assert_logical assert_numeric assert_scalar checkInt +#' @importFrom checkmate assert_string #' @importFrom checkmate check_atomic check_data_frame expect_class test_int #' @importFrom checkmate check_names #' @importFrom checkmate test_subset test_set_equal vname diff --git a/R/slide.R b/R/slide.R index e7325e666..f5a0ac69d 100644 --- a/R/slide.R +++ b/R/slide.R @@ -564,8 +564,9 @@ get_before_after_from_window <- function(window_size, align, time_type) { #' functions). #' #' @importFrom dplyr bind_rows mutate %>% arrange tibble select all_of -#' @importFrom rlang enquo expr_label caller_arg +#' @importFrom rlang enquo expr_label caller_arg quo_get_env #' @importFrom tidyselect eval_select +#' @importFrom glue glue #' @importFrom purrr map map_lgl #' @importFrom data.table frollmean frollsum frollapply #' @importFrom lubridate as.period @@ -593,6 +594,7 @@ get_before_after_from_window <- function(window_size, align, time_type) { epi_slide_opt <- function( .x, .col_names, .f, ..., .window_size = NULL, .align = c("right", "center", "left"), + .prefix = NULL, .suffix = NULL, .new_col_names = NULL, .ref_time_values = NULL, .all_rows = FALSE) { assert_class(.x, "epi_df") @@ -644,21 +646,37 @@ epi_slide_opt <- function( ) } + # The position of a given column can be differ between input `.x` and + # `.data_group` since the grouping step by default drops grouping columns. + # To avoid rerunning `eval_select` for every `.data_group`, convert + # positions of user-provided `col_names` into string column names. We avoid + # using `names(pos)` directly for robustness and in case we later want to + # allow users to rename fields via tidyselection. + col_names_quo <- enquo(.col_names) + pos <- eval_select(col_names_quo, data = .x, allow_rename = FALSE) + col_names_chr <- names(.x)[pos] + # Check that slide function `.f` is one of those short-listed from # `data.table` and `slider` (or a function that has the exact same # definition, e.g. if the function has been reexported or defined # locally). - if (any(map_lgl( - list(frollmean, frollsum, frollapply), - ~ identical(.f, .x) - ))) { - f_from_package <- "data.table" - } else if (any(map_lgl( - list(slide_sum, slide_prod, slide_mean, slide_min, slide_max, slide_all, slide_any), - ~ identical(.f, .x) - ))) { - f_from_package <- "slider" - } else { + f_possibilities <- + tibble::tribble( + ~f, ~package, ~abbr, + frollmean, "data.table", "av", + frollsum, "data.table", "sum", + frollapply, "data.table", "slide", + slide_sum, "slider", "sum", + slide_prod, "slider", "prod", + slide_mean, "slider", "av", + slide_min, "slider", "min", + slide_max, "slider", "max", + slide_all, "slider", "all", + slide_any, "slider", "any", + ) + f_info <- f_possibilities %>% + filter(map_lgl(.data$f, ~ identical(.f, .x))) + if (nrow(f_info) == 0L) { # `f` is from somewhere else and not supported cli_abort( c( @@ -672,6 +690,45 @@ epi_slide_opt <- function( epiprocess__f = .f ) } + f_from_package <- f_info$package + + assert_string(.prefix, null.ok = TRUE) + assert_string(.suffix, null.ok = TRUE) + assert_character(.new_col_names, len = length(col_names_chr), null.ok = TRUE) + if ((!is.null(.prefix) || !is.null(.suffix)) && !is.null(.new_col_names)) { + cli_abort( + "Can't use both .prefix/.suffix and .new_col_names at the same time." + ) + } + if (is.null(.prefix) && is.null(.suffix) && is.null(.new_col_names)) { + .suffix <- "_{.window_size}{.time_unit}{.f_abbr}" + } + if (!is.null(.prefix) || !is.null(.suffix)) { + .prefix <- .prefix %||% "" + .suffix <- .suffix %||% "" + glue_env <- rlang::env( + .window_size = .window_size, + .time_unit = "d", # FIXME + .f_abbr = f_info$abbr, + quo_get_env(col_names_quo) + ) + .new_col_names <- unclass( + glue(.prefix, .envir = glue_env) + + col_names_chr + + glue(.suffix, .envir = glue_env) + ) + } else { + # .new_col_names was provided by user; we don't need to do anything. + # + # XXX or do some gluing? + } + if (any(.new_col_names %in% names(.x))) { + cli_abort(c( + "Naming conflict between new columns and existing columns", + "x" = "Overlapping names: {format_varnames(intersect(.new_col_names, names(.x)))}" + )) + } + result_col_names <- .new_col_names user_provided_rtvs <- !is.null(.ref_time_values) if (!user_provided_rtvs) { @@ -708,16 +765,6 @@ epi_slide_opt <- function( pad_early_dates <- date_seq_list$pad_early_dates pad_late_dates <- date_seq_list$pad_late_dates - # The position of a given column can be differ between input `.x` and - # `.data_group` since the grouping step by default drops grouping columns. - # To avoid rerunning `eval_select` for every `.data_group`, convert - # positions of user-provided `col_names` into string column names. We avoid - # using `names(pos)` directly for robustness and in case we later want to - # allow users to rename fields via tidyselection. - pos <- eval_select(enquo(.col_names), data = .x, allow_rename = FALSE) - col_names_chr <- names(.x)[pos] - # Always rename results to "slide_value_". - result_col_names <- paste0("slide_value_", col_names_chr) slide_one_grp <- function(.data_group, .group_key, ...) { missing_times <- all_dates[!(all_dates %in% .data_group$time_value)] # `frollmean` requires a full window to compute a result. Add NA values diff --git a/man/epi_slide_opt.Rd b/man/epi_slide_opt.Rd index cd293ee10..b5e923f47 100644 --- a/man/epi_slide_opt.Rd +++ b/man/epi_slide_opt.Rd @@ -13,6 +13,9 @@ epi_slide_opt( ..., .window_size = NULL, .align = c("right", "center", "left"), + .prefix = NULL, + .suffix = NULL, + .new_col_names = NULL, .ref_time_values = NULL, .all_rows = FALSE )