refactor+doc: key_colnames and vignettes

* key_colnames order change
* replace kill_time_value with exclude arg in key_colnames
* move duplicate time_values check in epi_slide
cmu-delphi · Sep 26, 2024 · 7f5094d · 7f5094d
1 parent dd19428
commit 7f5094d
Show file tree

Hide file tree

Showing 32 changed files with 616 additions and 765 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -16,3 +16,5 @@
 ^.lintr$
 ^DEVELOPMENT.md$
 man-roxygen
+^.venv$
+^sandbox.R$
diff --git a/.gitignore b/.gitignore
@@ -13,3 +13,4 @@ docs
 renv/
 renv.lock
 .Rprofile
+sandbox.R
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -50,7 +50,8 @@ Imports:
     tidyselect (>= 1.2.0),
     tsibble,
     utils,
-    vctrs
+    vctrs,
+    waldo
 Suggests:
     covidcast,
     devtools,

diff --git a/R/autoplot.R b/R/autoplot.R
@@ -55,7 +55,7 @@ autoplot.epi_df <- function(
 
   key_cols <- key_colnames(object)
   non_key_cols <- setdiff(names(object), key_cols)
-  geo_and_other_keys <- kill_time_value(key_cols)
+  geo_and_other_keys <- key_colnames(object, exclude = "time_value")
 
   # --- check for numeric variables
   allowed <- purrr::map_lgl(object[non_key_cols], is.numeric)

diff --git a/R/epi_df.R b/R/epi_df.R
@@ -184,18 +184,14 @@ new_epi_df <- function(x = tibble::tibble(geo_value = character(), time_value =
   metadata$other_keys <- other_keys
 
   # Reorder columns (geo_value, time_value, ...)
-  if (sum(dim(x)) != 0) {
-    cols_to_put_first <- c("geo_value", "time_value", other_keys)
-    x <- x[, c(
-      cols_to_put_first,
-      # All other columns
-      names(x)[!(names(x) %in% cols_to_put_first)]
-    )]
+  if (nrow(x) > 0) {
+    x <- x %>% relocate(all_of(c("geo_value", other_keys, "time_value")), .before = 1)
   }
 
   # Apply epi_df class, attach metadata, and return
   class(x) <- c("epi_df", class(x))
   attributes(x)$metadata <- metadata
+
   return(x)
 }
 
@@ -281,6 +277,7 @@ as_epi_df.tbl_df <- function(
   if (".time_value_counts" %in% other_keys) {
     cli_abort("as_epi_df: `other_keys` can't include \".time_value_counts\"")
   }
+
   duplicated_time_values <- x %>%
     group_by(across(all_of(c("geo_value", "time_value", other_keys)))) %>%
     filter(dplyr::n() > 1) %>%

diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R
@@ -397,8 +397,8 @@ epix_slide.grouped_epi_archive <- function(
               )),
               capture.output(print(waldo::compare(
                 res[[comp_nms[[comp_i]]]], comp_value[[comp_i]],
-                x_arg = rlang::expr_deparse(expr(`$`(label, !!sym(comp_nms[[comp_i]])))),
-                y_arg = rlang::expr_deparse(expr(`$`(comp_value, !!sym(comp_nms[[comp_i]]))))
+                x_arg = rlang::expr_deparse(dplyr::expr(`$`(label, !!sym(comp_nms[[comp_i]])))), # nolint: object_usage_linter
+                y_arg = rlang::expr_deparse(dplyr::expr(`$`(comp_value, !!sym(comp_nms[[comp_i]]))))
               ))),
               cli::format_message(c(
                 "You likely want to rename or remove this column in your output, or debug why it has a different value."

diff --git a/R/key_colnames.R b/R/key_colnames.R
@@ -2,39 +2,46 @@
 #'
 #' @param x a data.frame, tibble, or epi_df
 #' @param ... additional arguments passed on to methods
-#'
-#' @return If an `epi_df`, this returns all "keys". Otherwise `NULL`
+#' @param other_keys an optional character vector of other keys to include
+#' @param exclude an optional character vector of keys to exclude
+#' @return If an `epi_df`, this returns all "keys". Otherwise `NULL`.
 #' @keywords internal
 #' @export
 key_colnames <- function(x, ...) {
   UseMethod("key_colnames")
 }
 
+#' @rdname key_colnames
+#' @method key_colnames default
 #' @export
 key_colnames.default <- function(x, ...) {
   character(0L)
 }
 
+#' @rdname key_colnames
+#' @method key_colnames data.frame
 #' @export
-key_colnames.data.frame <- function(x, other_keys = character(0L), ...) {
+key_colnames.data.frame <- function(x, other_keys = character(0L), exclude = character(0L), ...) {
   assert_character(other_keys)
-  nm <- c("geo_value", "time_value", other_keys)
+  assert_character(exclude)
+  nm <- setdiff(c("geo_value", other_keys, "time_value"), exclude)
   intersect(nm, colnames(x))
 }
 
+#' @rdname key_colnames
+#' @method key_colnames epi_df
 #' @export
-key_colnames.epi_df <- function(x, ...) {
+key_colnames.epi_df <- function(x, exclude = character(0L), ...) {
+  assert_character(exclude)
   other_keys <- attr(x, "metadata")$other_keys
-  c("geo_value", "time_value", other_keys)
+  setdiff(c("geo_value", other_keys, "time_value"), exclude)
 }
 
+#' @rdname key_colnames
+#' @method key_colnames epi_archive
 #' @export
-key_colnames.epi_archive <- function(x, ...) {
+key_colnames.epi_archive <- function(x, exclude = character(0L), ...) {
+  assert_character(exclude)
   other_keys <- attr(x, "metadata")$other_keys
-  c("geo_value", "time_value", other_keys)
-}
-
-kill_time_value <- function(v) {
-  assert_character(v)
-  v[v != "time_value"]
+  setdiff(c("geo_value", other_keys, "time_value"), exclude)
 }
diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R
@@ -731,7 +731,7 @@ epix_detailed_restricted_mutate <- function(.data, ...) {
 #' library(dplyr)
 #'
 #' # Reference time points for which we want to compute slide values:
-#' versions <- seq(as.Date("2020-06-01"),
+#' versions <- seq(as.Date("2020-06-02"),
 #'   as.Date("2020-06-15"),
 #'   by = "1 day"
 #' )
@@ -780,7 +780,7 @@ epix_detailed_restricted_mutate <- function(.data, ...) {
 #'     .versions = versions
 #'   ) %>%
 #'   ungroup() %>%
-#'   arrange(geo_value, time_value)
+#'   arrange(geo_value, version)
 #'
 #' # --- Advanced: ---
 #'

diff --git a/R/methods-epi_df.R b/R/methods-epi_df.R
@@ -41,10 +41,13 @@ as_tibble.epi_df <- function(x, ...) {
 #' @export
 as_tsibble.epi_df <- function(x, key, ...) {
   if (missing(key)) key <- c("geo_value", attributes(x)$metadata$other_keys)
-  return(as_tsibble(tibble::as_tibble(x),
-    key = tidyselect::all_of(key), index = "time_value",
-    ...
-  ))
+  return(
+    as_tsibble(
+      tibble::as_tibble(x),
+      key = tidyselect::all_of(key), index = "time_value",
+      ...
+    )
+  )
 }
 
 #' Base S3 methods for an `epi_df` object
@@ -150,10 +153,10 @@ dplyr_reconstruct.epi_df <- function(data, template) {
   # keep any grouping that has been applied:
   res <- NextMethod()
 
-  cn <- names(res)
+  col_names <- names(res)
 
   # Duplicate columns, cli_abort
-  dup_col_names <- cn[duplicated(cn)]
+  dup_col_names <- col_names[duplicated(col_names)]
   if (length(dup_col_names) != 0) {
     cli_abort(c(
       "Duplicate column names are not allowed",
@@ -163,7 +166,7 @@ dplyr_reconstruct.epi_df <- function(data, template) {
     ))
   }
 
-  not_epi_df <- !("time_value" %in% cn) || !("geo_value" %in% cn)
+  not_epi_df <- !("time_value" %in% col_names) || !("geo_value" %in% col_names)
 
   if (not_epi_df) {
     # If we're calling on an `epi_df` from one of our own functions, we need to
@@ -182,7 +185,7 @@ dplyr_reconstruct.epi_df <- function(data, template) {
 
   # Amend additional metadata if some other_keys cols are dropped in the subset
   old_other_keys <- attr(template, "metadata")$other_keys
-  attr(res, "metadata")$other_keys <- old_other_keys[old_other_keys %in% cn]
+  attr(res, "metadata")$other_keys <- old_other_keys[old_other_keys %in% col_names]
 
   res
 }
@@ -424,9 +427,13 @@ arrange_col_canonical.epi_df <- function(x, ...) {
   x %>% dplyr::relocate(dplyr::all_of(cols), .before = 1)
 }
 
+#' Group an `epi_df` object by default keys
+#' @param x an `epi_df`
+#' @param exclude character vector of column names to exclude from grouping
+#' @return a grouped `epi_df`
 #' @export
-group_epi_df <- function(x) {
-  cols <- kill_time_value(key_colnames(x))
+group_epi_df <- function(x, exclude = character()) {
+  cols <- key_colnames(x, exclude = exclude)
   x %>% group_by(across(all_of(cols)))
 }
 
@@ -437,7 +444,7 @@ group_epi_df <- function(x) {
 #' the resulting `epi_df` will have `geo_value` set to `"total"`.
 #'
 #' @param .x an `epi_df`
-#' @param value_col character vector of the columns to aggregate
+#' @param sum_cols character vector of the columns to aggregate
 #' @param group_cols character vector of column names to group by. "time_value" is
 #' included by default.
 #' @return an `epi_df` object

diff --git a/R/outliers.R b/R/outliers.R
@@ -161,8 +161,7 @@ detect_outlr <- function(x = seq_along(y), y,
 #'   group_by(geo_value) %>%
 #'   mutate(outlier_info = detect_outlr_rm(
 #'     x = time_value, y = cases
-#'   )) %>%
-#'   unnest(outlier_info)
+#'   ))
 detect_outlr_rm <- function(x = seq_along(y), y, n = 21,
                             log_transform = FALSE,
                             detect_negatives = FALSE,
@@ -189,7 +188,7 @@ detect_outlr_rm <- function(x = seq_along(y), y, n = 21,
 
   # Calculate lower and upper thresholds and replacement value
   z <- z %>%
-    epi_slide(fitted = median(y), .window_size = n, .align = "center") %>%
+    epi_slide(fitted = median(y, na.rm = TRUE), .window_size = n, .align = "center") %>%
     dplyr::mutate(resid = y - fitted) %>%
     roll_iqr(
       n = n,
@@ -256,9 +255,8 @@ detect_outlr_rm <- function(x = seq_along(y), y, n = 21,
 #'   group_by(geo_value) %>%
 #'   mutate(outlier_info = detect_outlr_stl(
 #'     x = time_value, y = cases,
-#'     seasonal_period = 7
-#'   )) %>% # weekly seasonality for daily data
-#'   unnest(outlier_info)
+#'     seasonal_period = 7 # weekly seasonality for daily data
+#'   ))
 detect_outlr_stl <- function(x = seq_along(y), y,
                              n_trend = 21,
                              n_seasonal = 21,
@@ -359,7 +357,7 @@ roll_iqr <- function(z, n, detection_multiplier, min_radius,
 
   z %>%
     epi_slide(
-      roll_iqr = stats::IQR(resid),
+      roll_iqr = stats::IQR(resid, na.rm = TRUE),
       .window_size = n, .align = "center"
     ) %>%
     dplyr::mutate(

diff --git a/R/revision_analysis.R b/R/revision_analysis.R
@@ -81,8 +81,8 @@ revision_summary <- function(epi_arch,
                              should_compactify = TRUE) {
   arg <- names(eval_select(rlang::expr(c(...)), allow_rename = FALSE, data = epi_arch$DT))
   if (length(arg) == 0) {
-    first_non_key <- !(names(epi_arch$DT) %in% c(key_colnames(epi_arch), "version"))
-    arg <- names(epi_arch$DT)[first_non_key][1]
+    # Choose the first column that's not a key or version
+    arg <- setdiff(names(epi_arch$DT), c(key_colnames(epi_arch), "version"))[[1]]
   } else if (length(arg) > 1) {
     cli_abort("Not currently implementing more than one column at a time. Run each separately")
   }
@@ -99,11 +99,9 @@ revision_summary <- function(epi_arch,
   #
   # revision_tibble
   keys <- key_colnames(epi_arch)
-  names(epi_arch$DT)
 
-  revision_behavior <-
-    epi_arch$DT %>%
-    select(c(geo_value, time_value, all_of(keys), version, !!arg))
+  revision_behavior <- epi_arch$DT %>%
+    select(all_of(unique(c("geo_value", "time_value", keys, "version", arg))))
   if (!is.null(min_waiting_period)) {
     revision_behavior <- revision_behavior %>%
       filter(abs(time_value - as.Date(epi_arch$versions_end)) >= min_waiting_period)

diff --git a/R/slide.R b/R/slide.R
@@ -122,8 +122,7 @@ epi_slide <- function(
   assert_class(.x, "epi_df")
   if (checkmate::test_class(.x, "grouped_df")) {
     expected_group_keys <- .x %>%
-      key_colnames() %>%
-      kill_time_value() %>%
+      key_colnames(exclude = "time_value") %>%
       sort()
     if (!identical(.x %>% group_vars() %>% sort(), expected_group_keys)) {
       cli_abort(
@@ -134,12 +133,11 @@ epi_slide <- function(
       )
     }
   } else {
-    .x <- group_epi_df(.x)
+    .x <- group_epi_df(.x, exclude = "time_value")
   }
   if (nrow(.x) == 0L) {
     return(.x)
   }
-
   # If `.f` is missing, interpret ... as an expression for tidy evaluation
   if (missing(.f)) {
     used_data_masking <- TRUE
@@ -191,6 +189,20 @@ epi_slide <- function(
 
   assert_logical(.all_rows, len = 1)
 
+  # Check for duplicated time values within groups
+  duplicated_time_values <- .x %>%
+    group_epi_df() %>%
+    filter(dplyr::n() > 1) %>%
+    ungroup()
+  if (nrow(duplicated_time_values) > 0) {
+    bad_data <- capture.output(duplicated_time_values)
+    cli_abort(
+      "as_epi_df: some groups in a resulting dplyr computation have duplicated time values.
+      epi_df requires a unique time_value per group.",
+      body = c("Sample groups:", bad_data)
+    )
+  }
+
   # Begin handling completion. This will create a complete time index between
   # the smallest and largest time values in the data. This is used to ensure
   # that the slide function is called with a complete window of data. Each slide
@@ -241,7 +253,7 @@ epi_slide <- function(
     .keep = TRUE
   ) %>%
     bind_rows() %>%
-    filter(.data$.real) %>%
+    filter(.real) %>%
     select(-.real) %>%
     arrange_col_canonical() %>%
     group_by(!!!.x_groups)
@@ -275,11 +287,16 @@ epi_slide_one_group <- function(
   missing_times <- all_dates[!(all_dates %in% .data_group$time_value)]
   .data_group <- bind_rows(
     .data_group,
-    tibble(time_value = c(
-      missing_times,
-      .date_seq_list$pad_early_dates,
-      .date_seq_list$pad_late_dates
-    ), .real = FALSE)
+    dplyr::bind_cols(
+      .group_key,
+      tibble(
+        time_value = c(
+          missing_times,
+          .date_seq_list$pad_early_dates,
+          .date_seq_list$pad_late_dates
+        ), .real = FALSE
+      )
+    )
   ) %>%
     arrange(.data$time_value)
 
@@ -405,8 +422,8 @@ epi_slide_one_group <- function(
             )),
             capture.output(print(waldo::compare(
               res[[comp_nms[[comp_i]]]], slide_values[[comp_i]],
-              x_arg = rlang::expr_deparse(expr(`$`(existing, !!sym(comp_nms[[comp_i]])))),
-              y_arg = rlang::expr_deparse(expr(`$`(comp_value, !!sym(comp_nms[[comp_i]]))))
+              x_arg = rlang::expr_deparse(dplyr::expr(`$`(existing, !!sym(comp_nms[[comp_i]])))), # nolint: object_usage_linter
+              y_arg = rlang::expr_deparse(dplyr::expr(`$`(comp_value, !!sym(comp_nms[[comp_i]])))) # nolint: object_usage_linter
             ))),
             cli::format_message(c(
               ">" = "You likely want to rename or remove this column from your slide
@@ -711,7 +728,7 @@ epi_slide_opt <- function(
   # positions of user-provided `col_names` into string column names. We avoid
   # using `names(pos)` directly for robustness and in case we later want to
   # allow users to rename fields via tidyselection.
-  if (class(quo_get_expr(enquo(.col_names))) == "character") {
+  if (inherits(quo_get_expr(enquo(.col_names)), "character")) {
     pos <- eval_select(dplyr::all_of(.col_names), data = .x, allow_rename = FALSE)
   } else {
     pos <- eval_select(enquo(.col_names), data = .x, allow_rename = FALSE)