fix: aggregate is now sum_groups_epi_df and other review changes

* duplicated time values check in epi_df constructor improved * dplyr warning and unnecessary if in sum_groups_epi_df fixed * args in epi_slide are now validated in order of func signature * simplify deprecated check * error if .new_col_name is "geo_value" or "time_value" * better TODO comment over last part of epi_slide * comment about yearmonth - Inf weirdness * change tests few tests * remove complete_only and auto complete Co-authored-by: brookslogan <[email protected]>
cmu-delphi · Sep 12, 2024 · dfd49f5 · dfd49f5
1 parent dd5c769
commit dfd49f5
Show file tree

Hide file tree

Showing 10 changed files with 347 additions and 380 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -45,7 +45,6 @@ S3method(ungroup,epi_df)
 S3method(ungroup,grouped_epi_archive)
 S3method(unnest,epi_df)
 export("%>%")
-export(aggregate_epi_df)
 export(archive_cases_dv_subset)
 export(arrange)
 export(arrange_canonical)
@@ -87,6 +86,7 @@ export(relocate)
 export(rename)
 export(revision_summary)
 export(slice)
+export(sum_groups_epi_df)
 export(time_column_names)
 export(ungroup)
 export(unnest)

diff --git a/R/epi_df.R b/R/epi_df.R
@@ -278,11 +278,13 @@ as_epi_df.tbl_df <- function(
 
   assert_character(other_keys)
 
-  # Check one time_value per group
+  if (".time_value_counts" %in% other_keys) {
+    cli_abort("as_epi_df: `other_keys` can't include \".time_value_counts\"")
+  }
   duplicated_time_values <- x %>%
     group_by(across(all_of(c("geo_value", "time_value", other_keys)))) %>%
-    dplyr::summarize(n = dplyr::n(), .groups = "drop") %>%
-    filter(n > 1)
+    filter(dplyr::n() > 1) %>%
+    ungroup()
   if (nrow(duplicated_time_values) > 0) {
     bad_data <- capture.output(duplicated_time_values)
     cli_abort(
@@ -325,5 +327,5 @@ is_epi_df <- function(x) {
 }
 
 group_epi_df <- function(x) {
-  x %>% group_by(group_by(across(all_of(kill_time_value(key_colnames(.))))))
+  x %>% group_by(across(all_of(kill_time_value(key_colnames(.)))))
 }
diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R
@@ -275,7 +275,7 @@ epix_slide.grouped_epi_archive <- function(
   checkmate::assert_string(.new_col_name, null.ok = TRUE)
   if (identical(.new_col_name, "time_value")) {
     cli_abort(
-      '`new_col_name` must not be `"time_value"`; `epix_slide()` uses that column name
+      '`.new_col_name` must not be `"time_value"`; `epix_slide()` uses that column name
       to attach the `ref_time_value` associated with each slide computation'
     )
   }

diff --git a/R/methods-epi_df.R b/R/methods-epi_df.R
@@ -255,9 +255,10 @@ group_modify.epi_df <- function(.data, .f, ..., .keep = FALSE) {
 
 #' Complete epi_df
 #'
-#' A [tidyr::complete()] analogue for `epi_df` objects. This function fills in
-#' missing combinations of `geo_value` and `time_value` with `NA` values. See
-#' the examples for usage details.
+#' A ‘tidyr::complete()’ analogue for ‘epi_df’ objects. This function
+#' can be used, for example, to add rows for missing combinations
+#' of ‘geo_value’ and ‘time_value’, filling other columns with `NA`s.
+#' See the examples for usage details.
 #'
 #' @param data an `epi_df`
 #' @param ... see [`tidyr::complete`]
@@ -391,28 +392,46 @@ arrange_canonical.epi_df <- function(x, ...) {
 #' the resulting `epi_df` will have `geo_value` set to `"total"`.
 #'
 #' @param .x an `epi_df`
-#' @param value_col character name of the column to aggregate
-#' @param group_cols character vector of column names to group by
+#' @param value_col character vector of the columns to aggregate
+#' @param group_cols character vector of column names to group by. "time_value" is
+#' included by default.
 #' @return an `epi_df` object
 #'
 #' @export
-aggregate_epi_df <- function(.x, value_col = "value", group_cols = "time_value") {
+sum_groups_epi_df <- function(.x, sum_cols = "value", group_cols = character()) {
   assert_class(.x, "epi_df")
-  assert_character(value_col, len = 1)
+  assert_character(sum_cols)
   assert_character(group_cols)
-  checkmate::assert_subset(value_col, names(.x))
-  checkmate::assert_subset(group_cols, names(.x))
+  checkmate::assert_subset(sum_cols, setdiff(names(.x), key_colnames(.x)))
+  checkmate::assert_subset(group_cols, key_colnames(.x))
+  if (!"time_value" %in% group_cols) {
+    group_cols <- c("time_value", group_cols)
+  }
 
-  .x %>%
+  out <- .x %>%
     group_by(across(all_of(group_cols))) %>%
-    dplyr::summarize(!!(value_col) := sum(!!sym(value_col))) %>%
-    ungroup() %>%
-    {
-      if (!"geo_value" %in% group_cols) {
-        mutate(., geo_value = "total") %>% relocate(geo_value, .before = 1)
-      } else {
-        .
-      }
-    } %>%
-    as_epi_df(as_of = attr(.x, "metadata")$as_of)
+    dplyr::summarize(across(all_of(sum_cols), sum), .groups = "drop")
+
+  # To preserve epi_df-ness, we need to ensure that the `geo_value` column is
+  # present.
+  out <- if (!"geo_value" %in% group_cols) {
+    out %>%
+      mutate(geo_value = "total") %>%
+      relocate(geo_value, .before = 1)
+  } else {
+    out
+  }
+
+  # The `geo_type` will be correctly inherited here by the following logic:
+  # - if `geo_value` is in `group_cols`, then the constructor will see the
+  #   geo_value here and will correctly read the existing values
+  # - if `geo_value` is not in `group_cols`, then the constructor will see
+  #   the unrecognizeable "total" value and will correctly infer the "custom"
+  #   geo_type.
+  out %>%
+    as_epi_df(
+      as_of = attr(.x, "metadata")$as_of,
+      other_keys = intersect(attr(.x, "metadata")$other_keys, group_cols)
+    ) %>%
+    arrange_canonical()
 }