diff --git a/DESCRIPTION b/DESCRIPTION index e14bc7c6..333bf13c 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -79,6 +79,7 @@ Collate: 'correlation.R' 'data.R' 'epi_df.R' + 'epi_df_forbidden_methods.R' 'epiprocess.R' 'group_by_epi_df_methods.R' 'methods-epi_archive.R' diff --git a/NAMESPACE b/NAMESPACE index a417837f..a9544763 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,6 +2,7 @@ S3method("[",epi_df) S3method("names<-",epi_df) +S3method(Summary,epi_df) S3method(arrange_canonical,default) S3method(arrange_canonical,epi_df) S3method(as_epi_df,data.frame) @@ -36,6 +37,7 @@ S3method(key_colnames,data.frame) S3method(key_colnames,default) S3method(key_colnames,epi_archive) S3method(key_colnames,epi_df) +S3method(mean,epi_df) S3method(next_after,Date) S3method(next_after,integer) S3method(print,epi_archive) @@ -148,6 +150,7 @@ importFrom(dplyr,everything) importFrom(dplyr,filter) importFrom(dplyr,group_by) importFrom(dplyr,group_by_drop_default) +importFrom(dplyr,group_map) importFrom(dplyr,group_modify) importFrom(dplyr,group_vars) importFrom(dplyr,groups) diff --git a/R/archive.R b/R/archive.R index 5cf55ff6..07394d9d 100644 --- a/R/archive.R +++ b/R/archive.R @@ -693,9 +693,9 @@ group_by.epi_archive <- function(.data, ..., .add = FALSE, grouping_cols <- as.list(detailed_mutate[["archive"]][["DT"]])[detailed_mutate[["request_names"]]] grouping_col_is_factor <- purrr::map_lgl(grouping_cols, is.factor) # ^ Use `as.list` to try to avoid any possibility of a deep copy. - if (!any(grouping_col_is_factor)) { + if (length(grouping_cols) != 0L && !any(grouping_col_is_factor)) { cli_warn( - "`.drop=FALSE` but there are no factor grouping columns; + "`.drop=FALSE` but none of the grouping columns are factors; did you mean to convert one of the columns to a factor beforehand?", class = "epiprocess__group_by_epi_archive__drop_FALSE_no_factors" ) @@ -703,10 +703,10 @@ group_by.epi_archive <- function(.data, ..., .add = FALSE, cli_warn( "`.drop=FALSE` but there are one or more non-factor grouping columns listed after a factor grouping column; this may produce groups with `NA`s for these - columns; see https://github.com/tidyverse/dplyr/issues/5369#issuecomment-683762553; + non-factor columns; see https://github.com/tidyverse/dplyr/issues/5369#issuecomment-683762553; depending on how you want completion to work, you might instead want to convert all grouping columns to factors beforehand, specify the non-factor grouping columns first, - or use `.drop=TRUE` and add a call to `tidyr::complete`.", + or use `.drop=TRUE` and add a call to `tidyr::complete()`.", class = "epiprocess__group_by_epi_archive__drop_FALSE_nonfactor_after_factor" ) } diff --git a/R/epi_df_forbidden_methods.R b/R/epi_df_forbidden_methods.R new file mode 100644 index 00000000..254713d6 --- /dev/null +++ b/R/epi_df_forbidden_methods.R @@ -0,0 +1,48 @@ +# Methods in this file are used to +# * Disable problematic inherited behavior (e.g., mean on epi_dfs) +# * Provide better error messaging if possible for things that already abort +# when they should (e.g., sum on epi_dfs) + + +# Disable mean on epi_dfs, to prevent `epi_slide(~ mean(.x), ....)` bad output: + +#' @export +mean.epi_df <- function(x, ...) { + cli_abort(c( + "`mean` shouldn't be used on entire `epi_df`s", + "x" = "{rlang::caller_arg(x)} was an `epi_df`", + "i" = "If you encountered this while trying to take a rolling mean + of a column using `epi_slide`, you probably forgot to + specify the column name (e.g., ~ mean(.x$colname)). You may + also prefer to use the specialized `epi_slide_mean` method." + ), class = "epiprocess__summarizer_on_entire_epi_df") +} + +# Similarly, provide better error messages for some other potentially-common +# slide operations (sum, prod, min, max, all, any, range): + +#' @export +Summary.epi_df <- function(..., na.rm = FALSE) { + # cli uses dot prefixes for special purpose; use alias to avoid confusion during interpolation + generic <- .Generic + opt_pointer <- switch(.Generic, + sum = "You may also prefer to use the specialized `epi_slide_sum` method.", + prod = , + min = , + max = , + all = , + any = "You may also prefer to use the specialized `epi_slide_opt` method.", + range = "", + cli_abort("Unrecognized .Generic: {generic}") + ) + cli_abort(c( + "`{generic}` shouldn't be used on entire `epi_df`s", + # We'd like to quote user input in the error message, but `caller_arg(..1)` is + # just "..1" and (eagerness/S4/unnamedness?) issues thwart some alternatives; just + # use something generic: + "x" = "`{generic}`'s first argument was an `epi_df`", + "i" = "If you encountered this while trying to take a rolling {generic} + of a column using `epi_slide`, you probably forgot to + specify the column name (e.g., ~ {generic}(.x$colname)). {opt_pointer}" + ), class = "epiprocess__summarizer_on_entire_epi_df") +} diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R index 2b1fd5c3..0524b48b 100644 --- a/R/grouped_epi_archive.R +++ b/R/grouped_epi_archive.R @@ -341,30 +341,75 @@ epix_slide.grouped_epi_archive <- function( ", class = "epiprocess__invalid_slide_comp_value") } + .group_key_label <- if (nrow(.group_key) == 0L) { + # Edge case: we'll get here if a requested `.version` had 0 rows and we + # grouped by a nonzero number of columns using the default `.drop = TRUE` + # (or on all non-factor columns with `.drop = FALSE` for some reason, + # probably a user bug). Mimicking `dplyr`, we'll let `.group_key` provided + # to the computation be 0 rows, but then label it using NAs. (In the + # bizarre situation of grouping by a mix of factor and non-factor with + # `.drop = FALSE`, `.group_key` will already have 1 row. For ungrouped + # epix_slides and 0-variable-grouped epix_slides with either `.drop` + # setting, we will have a 1x0 .group_key, although perhaps for the latter + # this should be 0x0.) + vctrs::vec_cast(NA, .group_key) + } else { + .group_key + } + # Construct result first as list, then tibble-ify, to try to avoid some - # redundant work. `group_modify()` provides the group key, we provide the - # ref time value (appropriately recycled) and comp_value (appropriately - # named / unpacked, for quick feedback) - res <- list(version = vctrs::vec_rep(.version, vctrs::vec_size(comp_value))) + # redundant work. However, we will sacrifice some performance here doing + # checks here in the inner loop, in order to provide immediate feedback on + # some formatting errors. + res <- c( + list(), # get list output; a bit faster than `as.list()`-ing `.group_key_label` + .group_key_label, + list(version = .version) + ) + res <- vctrs::vec_recycle_common(!!!res, .size = vctrs::vec_size(comp_value)) if (is.null(.new_col_name)) { if (inherits(comp_value, "data.frame")) { - # unpack into separate columns (without name prefix): - res <- c(res, comp_value) + # Sometimes comp_value can parrot back columns already in `res`; allow + # this, but balk if a column has the same name as one in `res` but a + # different value: + comp_nms <- names(comp_value) + overlaps_label_names <- comp_nms %in% names(res) + for (comp_i in which(overlaps_label_names)) { + if (!identical(comp_value[[comp_i]], res[[comp_nms[[comp_i]]]])) { + lines <- c( + cli::format_error(c( + "conflict detected between slide value computation labels and output:", + "i" = "we are labeling slide computations with the following columns: {syms(names(res))}", + "x" = "a slide computation output included a column {syms(comp_nms[[comp_i]])} that didn't match the label" + )), + capture.output(print(waldo::compare(res[[comp_nms[[comp_i]]]], comp_value[[comp_i]], x_arg = "label", y_arg = "comp output"))), + cli::format_message(c("You likely want to rename or remove this column in your output, or debug why it has a different value.")) + ) + rlang::abort(paste(collapse = "\n", lines), + class = "epiprocess__epix_slide_label_vs_output_column_conflict" + ) + } + } + # Unpack into separate columns (without name prefix). If there are + # columns duplicating label columns, de-dupe and order them as if they + # didn't exist in comp_value. + res <- c(res, comp_value[!overlaps_label_names]) } else { - # apply default name (to vector or packed data.frame-type column): + # Apply default name (to vector or packed data.frame-type column): res[["slide_value"]] <- comp_value + # TODO check for bizarre conflicting `slide_value` label col name. + # Either here or on entry to `epix_slide` (even if there we don't know + # whether vecs will be output). Or just turn this into a special case of + # the preceding branch and let the checking code there generate a + # complaint. } } else { - # vector or packed data.frame-type column (note: .new_col_name of - # "version" is disallowed): + # vector or packed data.frame-type column (note: overlaps with label + # column names should already be forbidden by earlier validation): res[[.new_col_name]] <- comp_value } - # Stop on naming conflicts (names() fine here, non-NULL). Not the - # friendliest error messages though. - vctrs::vec_as_names(names(res), repair = "check_unique") - # Fast conversion: return(validate_tibble(new_tibble(res))) } @@ -380,18 +425,19 @@ epix_slide.grouped_epi_archive <- function( # Set: # * `as_of_df`, the data.frame/tibble/epi_df/etc. that we will - # `group_modify` as the `.data` argument. Might or might not + # `group_map` as the `.data` argument. Might or might not # include version column. - # * `group_modify_fn`, the corresponding `.f` argument + # * `group_map_fn`, the corresponding `.f` argument for `group_map` + # (not our `.f`) if (!.all_versions) { as_of_df <- as_of_raw - group_modify_fn <- comp_one_grp + group_map_fn <- comp_one_grp } else { as_of_archive <- as_of_raw # We essentially want to `group_modify` the archive, but # haven't implemented this method yet. Next best would be # `group_modify` on its `$DT`, but that has different - # behavior based on whether or not `dtplyr` is loaded. + # behavior based on whether or not `dtplyr` < 1.3.0 is loaded. # Instead, go through an ordinary data frame, trying to avoid # copies. if (address(as_of_archive$DT) == address(.x$private$ungrouped$DT)) { @@ -408,10 +454,10 @@ epix_slide.grouped_epi_archive <- function( data.table::setDF(as_of_df) # Convert each subgroup chunk to an archive before running the calculation. - group_modify_fn <- function(.data_group, .group_key, - .slide_comp, ..., - .version, - .new_col_name) { + group_map_fn <- function(.data_group, .group_key, + .slide_comp, ..., + .version, + .new_col_name) { # .data_group is coming from as_of_df as a tibble, but we # want to feed `comp_one_grp` an `epi_archive` backed by a # DT; convert and wrap: @@ -428,14 +474,14 @@ epix_slide.grouped_epi_archive <- function( } return( - dplyr::group_modify( + dplyr::bind_rows(dplyr::group_map( # note: output will be ungrouped dplyr::group_by(as_of_df, !!!syms(.x$private$vars), .drop = .x$private$drop), - group_modify_fn, + group_map_fn, .slide_comp = .slide_comp, ..., .version = .version, .new_col_name = .new_col_name, .keep = TRUE - ) + )) ) }) # Combine output into a single tibble (allowing for packed columns) diff --git a/R/slide.R b/R/slide.R index 62059fa1..df2e9c91 100644 --- a/R/slide.R +++ b/R/slide.R @@ -36,7 +36,7 @@ #' @template basic-slide-details #' #' @importFrom lubridate days weeks -#' @importFrom dplyr bind_rows group_vars filter select +#' @importFrom dplyr bind_rows group_map group_vars filter select #' @importFrom rlang .data .env !! enquos sym env missing_arg #' @export #' @seealso [`epi_slide_opt`] [`epi_slide_mean`] [`epi_slide_sum`] @@ -127,7 +127,7 @@ epi_slide <- function( assert_numeric(.ref_time_values, min.len = 1L, null.ok = FALSE, any.missing = FALSE) if (!test_subset(.ref_time_values, unique(.x$time_value))) { cli_abort( - "`ref_time_values` must be a unique subset of the time values in `x`.", + "`.ref_time_values` must be a unique subset of the time values in `.x`.", class = "epi_slide__invalid_ref_time_values" ) } @@ -260,8 +260,8 @@ epi_slide <- function( ) # If this wasn't a tidyeval computation, we still need to check the output - # types. We'll let `list_unchop` deal with checking for type compatibility - # between the outputs. + # types. We'll let `list_unchop`/`bind_rows` deal with checking for type + # compatibility between the outputs. if (!used_data_masking && !all(vapply(slide_values_list, function(comp_value) { # vctrs considers data.frames to be vectors, but we still check @@ -288,8 +288,26 @@ epi_slide <- function( dplyr::count(.data$time_value) %>% `[[`("n") - slide_values <- vctrs::list_unchop(slide_values_list) + if (length(slide_values_list) == 0L) { + # We don't know what .ptype we should be outputting, and we won't try to + # infer it by running a dummy computation. We should just output something + # that will combine well with what computations exist. In some edge cases + # (zero rows in .x, zero .ref_time_values) we may end up just not adding + # any columns, but those edge cases are currently explicitly handled + # earlier (outputting zero columns and aborting, respectively). + + # To combine well, we want something of a "super"-.ptype of all possible + # values. `NULL` almost works but can't be `vec_rep`'d. We'll use a 0-col + # data.frame instead, but will have to ensure it's unpacked into its 0 + # columns in case other computations return vectors by introducing a + # .group_new_col_name. + .group_new_col_name <- NULL + slide_values_list <- vctrs::new_list_of(slide_values_list, data.frame()) + } else { + .group_new_col_name <- .new_col_name + } + slide_values <- vctrs::list_unchop(slide_values_list) if ( all(purrr::map_int(slide_values_list, vctrs::vec_size) == 1L) && @@ -318,26 +336,70 @@ epi_slide <- function( .data_group <- filter(.data_group, o) } - result <- - if (is.null(.new_col_name)) { - if (inherits(slide_values, "data.frame")) { - # unpack into separate columns (without name prefix) and, if there are - # re-bindings, make the last one win for determining column value & - # column placement: - mutate(.data_group, slide_values) - } else { - # apply default name: - mutate(.data_group, slide_value = slide_values) + # To label the result, we will parallel some code from `epix_slide`, though + # some logic is different and some optimizations are less likely to be + # needed as we're at a different loop depth. + + # Unlike `epix_slide`, we will not every have to deal with a 0-row + # `.group_key`: we return early if `epi_slide`'s `.x` has 0 rows, and our + # loop over groups is the outer loop (>= 1 row into the group loop ensures + # we will have only 1-row `.group_key`s). Further, unlike `epix_slide`, we + # actually will be using `.group_data` rather than work with `.group_key` at + # all, in order to keep the pre-existing non-key columns. We will also try + # to work directly with `epi_df`s instead of listified tibbles; since we're + # not in as tight of a loop, the increased overhead hopefully won't matter. + # We'll need to use `bind_cols` rather than `c` to avoid losing + # `epi_df`ness. + + res <- .data_group + + if (is.null(.group_new_col_name)) { + if (inherits(slide_values, "data.frame")) { + # Sometimes slide_values can parrot back columns already in `res`; allow + # this, but balk if a column has the same name as one in `res` but a + # different value: + comp_nms <- names(slide_values) + overlaps_existing_names <- comp_nms %in% names(res) + for (comp_i in which(overlaps_existing_names)) { + if (!identical(slide_values[[comp_i]], res[[comp_nms[[comp_i]]]])) { + lines <- c( + cli::format_error(c( + "conflict detected between existing columns and slide computation output:", + "i" = "pre-existing columns: {syms(names(res))}", + "x" = "slide computation output included a column {syms(comp_nms[[comp_i]])} that didn't match the pre-existing value" + )), + capture.output(print(waldo::compare(res[[comp_nms[[comp_i]]]], slide_values[[comp_i]], x_arg = "existing", y_arg = "comp output"))), + cli::format_message(c("You likely want to rename or remove this column from your slide computation's output, or debug why it has a different value.")) + ) + rlang::abort(paste(collapse = "\n", lines), + class = "epiprocess__epi_slide_existing_vs_output_column_conflict" + ) + } } + # Unpack into separate columns (without name prefix). If there are + # columns duplicating existing columns, de-dupe and order them as if they + # didn't exist in slide_values. + res <- bind_cols(res, slide_values[!overlaps_existing_names]) } else { - # vector or packed data.frame-type column: - mutate(.data_group, !!.new_col_name := slide_values) + # Apply default name (to vector or packed data.frame-type column): + res[["slide_value"]] <- slide_values + # TODO check for bizarre conflicting `slide_value` existing col name. + # Either here or on entry to `epi_slide` (even if there we don't know + # whether vecs will be output). Or just turn this into a special case of + # the preceding branch and let the checking code there generate a + # complaint. } + } else { + # vector or packed data.frame-type column (note: overlaps with existing + # column names should already be forbidden by earlier validation): + res[[.group_new_col_name]] <- slide_values + } - return(result) + return(res) } - .x <- group_modify(.x, slide_one_grp, + .x_groups <- groups(.x) + .x <- bind_rows(group_map(.x, slide_one_grp, ..., .slide_comp_factory = slide_comp_wrapper_factory, .starts = .starts, @@ -345,9 +407,9 @@ epi_slide <- function( .ref_time_values = .ref_time_values, .all_rows = .all_rows, .new_col_name = .new_col_name, - .keep = FALSE - ) - + .keep = TRUE + )) + .x <- group_by(.x, !!!.x_groups) return(.x) } diff --git a/tests/testthat/_snaps/archive.md b/tests/testthat/_snaps/archive.md index 9eab6e9f..6e010da0 100644 --- a/tests/testthat/_snaps/archive.md +++ b/tests/testthat/_snaps/archive.md @@ -2,7 +2,7 @@ Code res <- dumb_ex %>% as_epi_archive() - Condition + Condition Warning: Found rows that appear redundant based on last (version of each) observation carried forward; these rows have been removed to 'compactify' and save space: Key: diff --git a/tests/testthat/_snaps/epi_df_forbidden_methods.md b/tests/testthat/_snaps/epi_df_forbidden_methods.md new file mode 100644 index 00000000..12dc3d48 --- /dev/null +++ b/tests/testthat/_snaps/epi_df_forbidden_methods.md @@ -0,0 +1,40 @@ +# Forbidden epi_df methods have decent error messages + + Code + edf %>% epi_slide(.window_size = 7L, ~ mean(.x)) + Condition + Error in `mean()`: + ! `mean` shouldn't be used on entire `epi_df`s + x .x was an `epi_df` + i If you encountered this while trying to take a rolling mean of a column using `epi_slide`, you probably forgot to specify the column name (e.g., ~ mean(.x$colname)). You may also prefer to use the specialized `epi_slide_mean` method. + +--- + + Code + edf %>% epi_slide(.window_size = 7L, ~ sum(.x)) + Condition + Error in `.slide_comp()`: + ! `sum` shouldn't be used on entire `epi_df`s + x `sum`'s first argument was an `epi_df` + i If you encountered this while trying to take a rolling sum of a column using `epi_slide`, you probably forgot to specify the column name (e.g., ~ sum(.x$colname)). You may also prefer to use the specialized `epi_slide_sum` method. + +--- + + Code + edf %>% epi_slide(.window_size = 7L, ~ min(.x)) + Condition + Error in `.slide_comp()`: + ! `min` shouldn't be used on entire `epi_df`s + x `min`'s first argument was an `epi_df` + i If you encountered this while trying to take a rolling min of a column using `epi_slide`, you probably forgot to specify the column name (e.g., ~ min(.x$colname)). You may also prefer to use the specialized `epi_slide_opt` method. + +--- + + Code + edf %>% epi_slide(.window_size = 7L, ~ range(.x)) + Condition + Error in `.slide_comp()`: + ! `range` shouldn't be used on entire `epi_df`s + x `range`'s first argument was an `epi_df` + i If you encountered this while trying to take a rolling range of a column using `epi_slide`, you probably forgot to specify the column name (e.g., ~ range(.x$colname)). + diff --git a/tests/testthat/test-archive.R b/tests/testthat/test-archive.R index 1791d870..7bde9b46 100644 --- a/tests/testthat/test-archive.R +++ b/tests/testthat/test-archive.R @@ -55,7 +55,7 @@ dumb_ex <- data.frame( version = as.Date(c("2020-01-01", "2020-01-02")) ) test_that("new_epi_archive correctly detects and warns about compactification", { - expect_snapshot(res <- dumb_ex %>% as_epi_archive()) + expect_snapshot(res <- dumb_ex %>% as_epi_archive(), cnd_class = TRUE) }) test_that("other_keys can only contain names of the data.frame columns", { diff --git a/tests/testthat/test-epi_df_forbidden_methods.R b/tests/testthat/test-epi_df_forbidden_methods.R new file mode 100644 index 00000000..62d7cba0 --- /dev/null +++ b/tests/testthat/test-epi_df_forbidden_methods.R @@ -0,0 +1,23 @@ +edf <- as_epi_df(tibble( + geo_value = rep("nd", 10L), + time_value = as.Date("2020-01-01") + 1:10 - 1L, + value = 1:10 +)) + +test_that("Forbidden epi_df methods catches omitted column names in slide comp", { + for (f in list(mean, sum, prod, min, max, all, any, range)) { + expect_error(edf %>% epi_slide(.window_size = 7L, ~ f(.x)), + class = "epiprocess__summarizer_on_entire_epi_df" + ) + expect_error(edf %>% group_by(geo_value) %>% epi_slide(.window_size = 7L, ~ f(.x)), + class = "epiprocess__summarizer_on_entire_epi_df" + ) + } +}) + +test_that("Forbidden epi_df methods have decent error messages", { + expect_snapshot(error = TRUE, edf %>% epi_slide(.window_size = 7L, ~ mean(.x))) + expect_snapshot(error = TRUE, edf %>% epi_slide(.window_size = 7L, ~ sum(.x))) + expect_snapshot(error = TRUE, edf %>% epi_slide(.window_size = 7L, ~ min(.x))) + expect_snapshot(error = TRUE, edf %>% epi_slide(.window_size = 7L, ~ range(.x))) +}) diff --git a/tests/testthat/test-epix_slide.R b/tests/testthat/test-epix_slide.R index 179d9427..3589ed77 100644 --- a/tests/testthat/test-epix_slide.R +++ b/tests/testthat/test-epix_slide.R @@ -770,3 +770,12 @@ test_that("`epix_slide` works with .before = Inf", { pull(sum_binary) ) }) + +test_that("`epix_slide` de-dupes labeling & value columns", { + expect_identical( + xx %>% epix_slide(version = .version), + xx$DT %>% as.data.frame() %>% as_tibble() %>% distinct(version) %>% arrange(version) + ) + expect_error(xx %>% epix_slide(version = .version + 1)) + # FIXME more tests +}) diff --git a/tests/testthat/test-grouped_epi_archive.R b/tests/testthat/test-grouped_epi_archive.R index 1e953d6f..8ed5ea02 100644 --- a/tests/testthat/test-grouped_epi_archive.R +++ b/tests/testthat/test-grouped_epi_archive.R @@ -37,9 +37,7 @@ test_that("Grouping, regrouping, and ungrouping archives works as intended", { expect_error(toy_archive %>% group_by(.drop = "bogus"), regexp = "Must be of type 'logical', not 'character'" ) - expect_warning(toy_archive %>% group_by(.drop = FALSE), - class = "epiprocess__group_by_epi_archive__drop_FALSE_no_factors" - ) + expect_no_warning(toy_archive %>% group_by(.drop = FALSE)) expect_warning(toy_archive %>% group_by(geo_value, .drop = FALSE), class = "epiprocess__group_by_epi_archive__drop_FALSE_no_factors" )