Merge pull request #519 from cmu-delphi/ds/epi-slide-group

undefined
cmu-delphi · Sep 18, 2024 · 214100d · 214100d
2 parents c167ddf + 409dcac
commit 214100d
Show file tree

Hide file tree

Showing 14 changed files with 1,039 additions and 937 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -5,6 +5,10 @@ S3method("names<-",epi_df)
 S3method(Summary,epi_df)
 S3method(arrange_canonical,default)
 S3method(arrange_canonical,epi_df)
+S3method(arrange_col_canonical,default)
+S3method(arrange_col_canonical,epi_df)
+S3method(arrange_row_canonical,default)
+S3method(arrange_row_canonical,epi_df)
 S3method(as_epi_df,data.frame)
 S3method(as_epi_df,epi_df)
 S3method(as_epi_df,tbl_df)
@@ -76,6 +80,7 @@ export(filter)
 export(full_seq)
 export(geo_column_names)
 export(group_by)
+export(group_epi_df)
 export(group_modify)
 export(growth_rate)
 export(guess_period)
@@ -91,6 +96,7 @@ export(relocate)
 export(rename)
 export(revision_summary)
 export(slice)
+export(sum_groups_epi_df)
 export(time_column_names)
 export(ungroup)
 export(unnest)

diff --git a/R/epi_df.R b/R/epi_df.R
@@ -245,10 +245,10 @@ as_epi_df.tbl_df <- function(
     )
   }
   if (lifecycle::is_present(geo_type)) {
-    cli_warn("epi_archive constructor argument `geo_type` is now ignored. Consider removing.")
+    cli_warn("epi_df constructor argument `geo_type` is now ignored. Consider removing.")
   }
   if (lifecycle::is_present(time_type)) {
-    cli_warn("epi_archive constructor argument `time_type` is now ignored. Consider removing.")
+    cli_warn("epi_df constructor argument `time_type` is now ignored. Consider removing.")
   }
 
   # If geo type is missing, then try to guess it
@@ -277,6 +277,22 @@ as_epi_df.tbl_df <- function(
   }
 
   assert_character(other_keys)
+
+  if (".time_value_counts" %in% other_keys) {
+    cli_abort("as_epi_df: `other_keys` can't include \".time_value_counts\"")
+  }
+  duplicated_time_values <- x %>%
+    group_by(across(all_of(c("geo_value", "time_value", other_keys)))) %>%
+    filter(dplyr::n() > 1) %>%
+    ungroup()
+  if (nrow(duplicated_time_values) > 0) {
+    bad_data <- capture.output(duplicated_time_values)
+    cli_abort(
+      "as_epi_df: some groups in the data have duplicated time values. epi_df requires a unique time_value per group.",
+      body = c("Sample groups:", bad_data)
+    )
+  }
+
   new_epi_df(x, geo_type, time_type, as_of, other_keys)
 }
 

diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R
@@ -278,7 +278,7 @@ epix_slide.grouped_epi_archive <- function(
     .versions <- sort(.versions)
   }
 
-  validate_slide_window_arg(.before, .x$private$ungrouped$time_type)
+  validate_slide_window_arg(.before, .x$private$ungrouped$time_type, lower = 0) # nolint: object_usage_linter
 
   checkmate::assert_string(.new_col_name, null.ok = TRUE)
   if (!is.null(.new_col_name)) {
@@ -292,7 +292,8 @@ epix_slide.grouped_epi_archive <- function(
       ))
     }
     if (identical(.new_col_name, "version")) {
-      cli_abort('`.new_col_name` must not be `"version"`; `epix_slide()` uses that column name to attach the element of `.versions` associated with each slide computation') # nolint: line_length_linter
+      cli_abort('`.new_col_name` must not be `"version"`; `epix_slide()` uses that column name to attach the element
+      of `.versions` associated with each slide computation')
     }
   }
 

diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R
@@ -623,26 +623,29 @@ epix_detailed_restricted_mutate <- function(.data, ...) {
 #' @param .f Function, formula, or missing; together with `...` specifies the
 #'   computation to slide. To "slide" means to apply a computation over a
 #'   sliding (a.k.a. "rolling") time window for each data group. The window is
-#'   determined by the `before` parameter described below. One time step is
-#'   typically one day or one week; see [`epi_slide`] details for more
-#'   explanation. If a function, `.f` must take an `epi_df` with the same
-#'   column names as the archive's `DT`, minus the `version` column; followed
-#'   by a one-row tibble containing the values of the grouping variables for
-#'   the associated group; followed by a reference time value, usually as a
-#'   `Date` object; followed by any number of named arguments. If a formula,
-#'   `.f` can operate directly on columns accessed via `.x$var` or `.$var`, as
-#'   in `~ mean (.x$var)` to compute a mean of a column `var` for each
-#'   group-`ref_time_value` combination. The group key can be accessed via
-#'   `.y` or `.group_key`, and the reference time value can be accessed via
-#'   `.z` or `.ref_time_value`. If `.f` is missing, then `...` will specify the
+#'   determined by the `.before` parameter (see details for more). If a
+#'   function, `.f` must have the form `function(x, g, t, ...)`, where
+#'
+#'   - "x" is an epi_df with the same column names as the archive's `DT`, minus
+#'     the `version` column
+#'   - "g" is a one-row tibble containing the values of the grouping variables
+#'   for the associated group
+#'   - "t" is the ref_time_value for the current window
+#'   - "..." are additional arguments
+#'
+#'   If a formula, `.f` can operate directly on columns accessed via `.x$var` or
+#'   `.$var`, as in `~ mean (.x$var)` to compute a mean of a column `var` for
+#'   each group-`ref_time_value` combination. The group key can be accessed via
+#'   `.y` or `.group_key`, and the reference time value can be accessed via `.z`
+#'   or `.ref_time_value`. If `.f` is missing, then `...` will specify the
 #'   computation.
 #' @param ... Additional arguments to pass to the function or formula specified
-#'   via `f`. Alternatively, if `.f` is missing, then the `...` is interpreted as
-#'   a ["data-masking"][rlang::args_data_masking] expression or expressions for
-#'   tidy evaluation; in addition to referring columns directly by name, the
+#'   via `f`. Alternatively, if `.f` is missing, then the `...` is interpreted
+#'   as a ["data-masking"][rlang::args_data_masking] expression or expressions
+#'   for tidy evaluation; in addition to referring columns directly by name, the
 #'   expressions have access to `.data` and `.env` pronouns as in `dplyr` verbs,
-#'   and can also refer to `.x`, `.group_key`, and `.ref_time_value`. See
-#'   details.
+#'   and can also refer to `.x` (not the same as the input epi_archive),
+#'   `.group_key`, and `.ref_time_value`. See details for more.
 #' @param .before How many time values before the `.ref_time_value`
 #'   should each snapshot handed to the function `.f` contain? If provided, it
 #'   should be a single value that is compatible with the time_type of the

diff --git a/R/methods-epi_df.R b/R/methods-epi_df.R
@@ -255,9 +255,10 @@ group_modify.epi_df <- function(.data, .f, ..., .keep = FALSE) {
 
 #' Complete epi_df
 #'
-#' A [tidyr::complete()] analogue for `epi_df` objects. This function fills in
-#' missing combinations of `geo_value` and `time_value` with `NA` values. See
-#' the examples for usage details.
+#' A ‘tidyr::complete()’ analogue for ‘epi_df’ objects. This function
+#' can be used, for example, to add rows for missing combinations
+#' of ‘geo_value’ and ‘time_value’, filling other columns with `NA`s.
+#' See the examples for usage details.
 #'
 #' @param data an `epi_df`
 #' @param ... see [`tidyr::complete`]
@@ -378,8 +379,101 @@ arrange_canonical.default <- function(x, ...) {
 #' @export
 arrange_canonical.epi_df <- function(x, ...) {
   rlang::check_dots_empty()
-  keys <- key_colnames(x)
   x %>%
-    dplyr::relocate(dplyr::all_of(keys), .before = 1) %>%
-    dplyr::arrange(dplyr::across(dplyr::all_of(keys)))
+    arrange_row_canonical() %>%
+    arrange_col_canonical()
+}
+
+arrange_row_canonical <- function(x, ...) {
+  UseMethod("arrange_row_canonical")
+}
+
+#' @export
+arrange_row_canonical.default <- function(x, ...) {
+  rlang::check_dots_empty()
+  cli::cli_abort(c(
+    "`arrange_row_canonical()` is only meaningful for an {.cls epi_df}."
+  ))
+  return(x)
+}
+
+#' @export
+arrange_row_canonical.epi_df <- function(x, ...) {
+  rlang::check_dots_empty()
+  x %>% dplyr::arrange(dplyr::across(dplyr::all_of(key_colnames(.))))
+}
+
+arrange_col_canonical <- function(x, ...) {
+  UseMethod("arrange_col_canonical")
+}
+
+#' @export
+arrange_col_canonical.default <- function(x, ...) {
+  rlang::check_dots_empty()
+  cli::cli_abort(c(
+    "`arrange_col_canonical()` is only meaningful for an {.cls epi_df}."
+  ))
+  return(x)
+}
+
+#' @export
+arrange_col_canonical.epi_df <- function(x, ...) {
+  rlang::check_dots_empty()
+  x %>% dplyr::relocate(dplyr::all_of(key_colnames(.)), .before = 1)
+}
+
+#' @export
+group_epi_df <- function(x) {
+  x %>% group_by(across(all_of(kill_time_value(key_colnames(.)))))
+}
+
+#' Aggregate an `epi_df` object
+#'
+#' Aggregates an `epi_df` object by the specified group columns, summing the
+#' `value` column, and returning an `epi_df`. If aggregating over `geo_value`,
+#' the resulting `epi_df` will have `geo_value` set to `"total"`.
+#'
+#' @param .x an `epi_df`
+#' @param value_col character vector of the columns to aggregate
+#' @param group_cols character vector of column names to group by. "time_value" is
+#' included by default.
+#' @return an `epi_df` object
+#'
+#' @export
+sum_groups_epi_df <- function(.x, sum_cols = "value", group_cols = character()) {
+  assert_class(.x, "epi_df")
+  assert_character(sum_cols)
+  assert_character(group_cols)
+  checkmate::assert_subset(sum_cols, setdiff(names(.x), key_colnames(.x)))
+  checkmate::assert_subset(group_cols, key_colnames(.x))
+  if (!"time_value" %in% group_cols) {
+    group_cols <- c("time_value", group_cols)
+  }
+
+  out <- .x %>%
+    group_by(across(all_of(group_cols))) %>%
+    dplyr::summarize(across(all_of(sum_cols), sum), .groups = "drop")
+
+  # To preserve epi_df-ness, we need to ensure that the `geo_value` column is
+  # present.
+  out <- if (!"geo_value" %in% group_cols) {
+    out %>%
+      mutate(geo_value = "total") %>%
+      relocate(geo_value, .before = 1)
+  } else {
+    out
+  }
+
+  # The `geo_type` will be correctly inherited here by the following logic:
+  # - if `geo_value` is in `group_cols`, then the constructor will see the
+  #   geo_value here and will correctly read the existing values
+  # - if `geo_value` is not in `group_cols`, then the constructor will see
+  #   the unrecognizeable "total" value and will correctly infer the "custom"
+  #   geo_type.
+  out %>%
+    as_epi_df(
+      as_of = attr(.x, "metadata")$as_of,
+      other_keys = intersect(attr(.x, "metadata")$other_keys, group_cols)
+    ) %>%
+    arrange_canonical()
 }