Merge pull request #521 from cmu-delphi/lcb/slide-unnest-dedupe-cols

cmu-delphi · Sep 16, 2024 · 9ab9731 · 9ab9731
2 parents ef2639e + 16cf8d7
commit 9ab9731
Show file tree

Hide file tree

Showing 12 changed files with 285 additions and 55 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -79,6 +79,7 @@ Collate:
     'correlation.R'
     'data.R'
     'epi_df.R'
+    'epi_df_forbidden_methods.R'
     'epiprocess.R'
     'group_by_epi_df_methods.R'
     'methods-epi_archive.R'

diff --git a/NAMESPACE b/NAMESPACE
@@ -2,6 +2,7 @@
 
 S3method("[",epi_df)
 S3method("names<-",epi_df)
+S3method(Summary,epi_df)
 S3method(arrange_canonical,default)
 S3method(arrange_canonical,epi_df)
 S3method(as_epi_df,data.frame)
@@ -36,6 +37,7 @@ S3method(key_colnames,data.frame)
 S3method(key_colnames,default)
 S3method(key_colnames,epi_archive)
 S3method(key_colnames,epi_df)
+S3method(mean,epi_df)
 S3method(next_after,Date)
 S3method(next_after,integer)
 S3method(print,epi_archive)
@@ -148,6 +150,7 @@ importFrom(dplyr,everything)
 importFrom(dplyr,filter)
 importFrom(dplyr,group_by)
 importFrom(dplyr,group_by_drop_default)
+importFrom(dplyr,group_map)
 importFrom(dplyr,group_modify)
 importFrom(dplyr,group_vars)
 importFrom(dplyr,groups)

diff --git a/R/archive.R b/R/archive.R
@@ -693,20 +693,20 @@ group_by.epi_archive <- function(.data, ..., .add = FALSE,
     grouping_cols <- as.list(detailed_mutate[["archive"]][["DT"]])[detailed_mutate[["request_names"]]]
     grouping_col_is_factor <- purrr::map_lgl(grouping_cols, is.factor)
     # ^ Use `as.list` to try to avoid any possibility of a deep copy.
-    if (!any(grouping_col_is_factor)) {
+    if (length(grouping_cols) != 0L && !any(grouping_col_is_factor)) {
       cli_warn(
-        "`.drop=FALSE` but there are no factor grouping columns;
+        "`.drop=FALSE` but none of the grouping columns are factors;
         did you mean to convert one of the columns to a factor beforehand?",
         class = "epiprocess__group_by_epi_archive__drop_FALSE_no_factors"
       )
     } else if (any(diff(grouping_col_is_factor) == -1L)) {
       cli_warn(
         "`.drop=FALSE` but there are one or more non-factor grouping columns listed
         after a factor grouping column; this may produce groups with `NA`s for these
-        columns; see https://github.com/tidyverse/dplyr/issues/5369#issuecomment-683762553;
+        non-factor columns; see https://github.com/tidyverse/dplyr/issues/5369#issuecomment-683762553;
         depending on how you want completion to work, you might instead want to convert all
         grouping columns to factors beforehand, specify the non-factor grouping columns first,
-        or use `.drop=TRUE` and add a call to `tidyr::complete`.",
+        or use `.drop=TRUE` and add a call to `tidyr::complete()`.",
         class = "epiprocess__group_by_epi_archive__drop_FALSE_nonfactor_after_factor"
       )
     }

diff --git a/R/epi_df_forbidden_methods.R b/R/epi_df_forbidden_methods.R
@@ -0,0 +1,48 @@
+# Methods in this file are used to
+# * Disable problematic inherited behavior (e.g., mean on epi_dfs)
+# * Provide better error messaging if possible for things that already abort
+#   when they should (e.g., sum on epi_dfs)
+
+
+# Disable mean on epi_dfs, to prevent `epi_slide(~ mean(.x), ....)` bad output:
+
+#' @export
+mean.epi_df <- function(x, ...) {
+  cli_abort(c(
+    "`mean` shouldn't be used on entire `epi_df`s",
+    "x" = "{rlang::caller_arg(x)} was an `epi_df`",
+    "i" = "If you encountered this while trying to take a rolling mean
+           of a column using `epi_slide`, you probably forgot to
+           specify the column name (e.g., ~ mean(.x$colname)). You may
+           also prefer to use the specialized `epi_slide_mean` method."
+  ), class = "epiprocess__summarizer_on_entire_epi_df")
+}
+
+# Similarly, provide better error messages for some other potentially-common
+# slide operations (sum, prod, min, max, all, any, range):
+
+#' @export
+Summary.epi_df <- function(..., na.rm = FALSE) {
+  # cli uses dot prefixes for special purpose; use alias to avoid confusion during interpolation
+  generic <- .Generic
+  opt_pointer <- switch(.Generic,
+    sum = "You may also prefer to use the specialized `epi_slide_sum` method.",
+    prod = ,
+    min = ,
+    max = ,
+    all = ,
+    any = "You may also prefer to use the specialized `epi_slide_opt` method.",
+    range = "",
+    cli_abort("Unrecognized .Generic: {generic}")
+  )
+  cli_abort(c(
+    "`{generic}` shouldn't be used on entire `epi_df`s",
+    # We'd like to quote user input in the error message, but `caller_arg(..1)` is
+    # just "..1" and (eagerness/S4/unnamedness?) issues thwart some alternatives; just
+    # use something generic:
+    "x" = "`{generic}`'s first argument was an `epi_df`",
+    "i" = "If you encountered this while trying to take a rolling {generic}
+           of a column using `epi_slide`, you probably forgot to
+           specify the column name (e.g., ~ {generic}(.x$colname)). {opt_pointer}"
+  ), class = "epiprocess__summarizer_on_entire_epi_df")
+}
diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R
@@ -341,30 +341,75 @@ epix_slide.grouped_epi_archive <- function(
       ", class = "epiprocess__invalid_slide_comp_value")
     }
 
+    .group_key_label <- if (nrow(.group_key) == 0L) {
+      # Edge case: we'll get here if a requested `.version` had 0 rows and we
+      # grouped by a nonzero number of columns using the default `.drop = TRUE`
+      # (or on all non-factor columns with `.drop = FALSE` for some reason,
+      # probably a user bug). Mimicking `dplyr`, we'll let `.group_key` provided
+      # to the computation be 0 rows, but then label it using NAs. (In the
+      # bizarre situation of grouping by a mix of factor and non-factor with
+      # `.drop = FALSE`, `.group_key` will already have 1 row. For ungrouped
+      # epix_slides and 0-variable-grouped epix_slides with either `.drop`
+      # setting, we will have a 1x0 .group_key, although perhaps for the latter
+      # this should be 0x0.)
+      vctrs::vec_cast(NA, .group_key)
+    } else {
+      .group_key
+    }
+
     # Construct result first as list, then tibble-ify, to try to avoid some
-    # redundant work. `group_modify()` provides the group key, we provide the
-    # ref time value (appropriately recycled) and comp_value (appropriately
-    # named / unpacked, for quick feedback)
-    res <- list(version = vctrs::vec_rep(.version, vctrs::vec_size(comp_value)))
+    # redundant work. However, we will sacrifice some performance here doing
+    # checks here in the inner loop, in order to provide immediate feedback on
+    # some formatting errors.
+    res <- c(
+      list(), # get list output; a bit faster than `as.list()`-ing `.group_key_label`
+      .group_key_label,
+      list(version = .version)
+    )
+    res <- vctrs::vec_recycle_common(!!!res, .size = vctrs::vec_size(comp_value))
 
     if (is.null(.new_col_name)) {
       if (inherits(comp_value, "data.frame")) {
-        # unpack into separate columns (without name prefix):
-        res <- c(res, comp_value)
+        # Sometimes comp_value can parrot back columns already in `res`; allow
+        # this, but balk if a column has the same name as one in `res` but a
+        # different value:
+        comp_nms <- names(comp_value)
+        overlaps_label_names <- comp_nms %in% names(res)
+        for (comp_i in which(overlaps_label_names)) {
+          if (!identical(comp_value[[comp_i]], res[[comp_nms[[comp_i]]]])) {
+            lines <- c(
+              cli::format_error(c(
+                "conflict detected between slide value computation labels and output:",
+                "i" = "we are labeling slide computations with the following columns: {syms(names(res))}",
+                "x" = "a slide computation output included a column {syms(comp_nms[[comp_i]])} that didn't match the label"
+              )),
+              capture.output(print(waldo::compare(res[[comp_nms[[comp_i]]]], comp_value[[comp_i]], x_arg = "label", y_arg = "comp output"))),
+              cli::format_message(c("You likely want to rename or remove this column in your output, or debug why it has a different value."))
+            )
+            rlang::abort(paste(collapse = "\n", lines),
+              class = "epiprocess__epix_slide_label_vs_output_column_conflict"
+            )
+          }
+        }
+        # Unpack into separate columns (without name prefix). If there are
+        # columns duplicating label columns, de-dupe and order them as if they
+        # didn't exist in comp_value.
+        res <- c(res, comp_value[!overlaps_label_names])
       } else {
-        # apply default name (to vector or packed data.frame-type column):
+        # Apply default name (to vector or packed data.frame-type column):
         res[["slide_value"]] <- comp_value
+        # TODO check for bizarre conflicting `slide_value` label col name.
+        # Either here or on entry to `epix_slide` (even if there we don't know
+        # whether vecs will be output). Or just turn this into a special case of
+        # the preceding branch and let the checking code there generate a
+        # complaint.
       }
     } else {
-      # vector or packed data.frame-type column (note: .new_col_name of
-      # "version" is disallowed):
+      # vector or packed data.frame-type column (note: overlaps with label
+      # column names should already be forbidden by earlier validation):
       res[[.new_col_name]] <- comp_value
     }
 
-    # Stop on naming conflicts (names() fine here, non-NULL). Not the
-    # friendliest error messages though.
-    vctrs::vec_as_names(names(res), repair = "check_unique")
-
     # Fast conversion:
     return(validate_tibble(new_tibble(res)))
   }
@@ -380,18 +425,19 @@ epix_slide.grouped_epi_archive <- function(
 
     # Set:
     # * `as_of_df`, the data.frame/tibble/epi_df/etc. that we will
-    #    `group_modify` as the `.data` argument. Might or might not
+    #    `group_map` as the `.data` argument. Might or might not
     #    include version column.
-    # * `group_modify_fn`, the corresponding `.f` argument
+    # * `group_map_fn`, the corresponding `.f` argument for `group_map`
+    #   (not our `.f`)
     if (!.all_versions) {
       as_of_df <- as_of_raw
-      group_modify_fn <- comp_one_grp
+      group_map_fn <- comp_one_grp
     } else {
       as_of_archive <- as_of_raw
       # We essentially want to `group_modify` the archive, but
       # haven't implemented this method yet. Next best would be
       # `group_modify` on its `$DT`, but that has different
-      # behavior based on whether or not `dtplyr` is loaded.
+      # behavior based on whether or not `dtplyr` < 1.3.0 is loaded.
       # Instead, go through an ordinary data frame, trying to avoid
       # copies.
       if (address(as_of_archive$DT) == address(.x$private$ungrouped$DT)) {
@@ -408,10 +454,10 @@ epix_slide.grouped_epi_archive <- function(
       data.table::setDF(as_of_df)
 
       # Convert each subgroup chunk to an archive before running the calculation.
-      group_modify_fn <- function(.data_group, .group_key,
-                                  .slide_comp, ...,
-                                  .version,
-                                  .new_col_name) {
+      group_map_fn <- function(.data_group, .group_key,
+                               .slide_comp, ...,
+                               .version,
+                               .new_col_name) {
         # .data_group is coming from as_of_df as a tibble, but we
         # want to feed `comp_one_grp` an `epi_archive` backed by a
         # DT; convert and wrap:
@@ -428,14 +474,14 @@ epix_slide.grouped_epi_archive <- function(
     }
 
     return(
-      dplyr::group_modify(
+      dplyr::bind_rows(dplyr::group_map( # note: output will be ungrouped
         dplyr::group_by(as_of_df, !!!syms(.x$private$vars), .drop = .x$private$drop),
-        group_modify_fn,
+        group_map_fn,
         .slide_comp = .slide_comp, ...,
         .version = .version,
         .new_col_name = .new_col_name,
         .keep = TRUE
-      )
+      ))
     )
   })
   # Combine output into a single tibble (allowing for packed columns)