Bring back slide unique-key checks, make as_epi_df ukey checks faster

cmu-delphi · Nov 26, 2024 · 95b485f · 95b485f
1 parent 0352d7b
commit 95b485f
Show file tree

Hide file tree

Showing 10 changed files with 154 additions and 23 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -109,13 +109,16 @@ importFrom(checkmate,assert)
 importFrom(checkmate,assert_character)
 importFrom(checkmate,assert_class)
 importFrom(checkmate,assert_data_frame)
+importFrom(checkmate,assert_false)
 importFrom(checkmate,assert_function)
 importFrom(checkmate,assert_int)
 importFrom(checkmate,assert_list)
 importFrom(checkmate,assert_logical)
 importFrom(checkmate,assert_numeric)
 importFrom(checkmate,assert_scalar)
 importFrom(checkmate,assert_string)
+importFrom(checkmate,assert_subset)
+importFrom(checkmate,assert_tibble)
 importFrom(checkmate,checkInt)
 importFrom(checkmate,check_atomic)
 importFrom(checkmate,check_data_frame)
@@ -165,6 +168,7 @@ importFrom(dplyr,groups)
 importFrom(dplyr,if_all)
 importFrom(dplyr,if_any)
 importFrom(dplyr,if_else)
+importFrom(dplyr,is_grouped_df)
 importFrom(dplyr,lag)
 importFrom(dplyr,mutate)
 importFrom(dplyr,near)
@@ -236,3 +240,4 @@ importFrom(tsibble,as_tsibble)
 importFrom(utils,capture.output)
 importFrom(utils,tail)
 importFrom(vctrs,vec_data)
+importFrom(vctrs,vec_equal)
diff --git a/NEWS.md b/NEWS.md
@@ -18,6 +18,7 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat
   additional arguments for specifying names: `.prefix`, `.suffix`,
   `.new_col_names`. To obtain the old naming behavior, use `.prefix =
   "slide_value_"`.
+- `as_epi_df` now removes any grouping that `x` had applied.
 
 ## Improvements
 
@@ -31,6 +32,7 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat
   by `geo_value` and any `other_keys` for the slide operation rather than raise
   an error about duplicated time values. `epi_slide`'s analogous automatic
   grouping has been made temporary in order to match.
+- Improved speed of key-uniqueness checks.
 
 ## Bug fixes
 

diff --git a/R/epi_df.R b/R/epi_df.R
@@ -279,22 +279,20 @@ as_epi_df.tbl_df <- function(
   }
 
   assert_character(other_keys)
+  assert_subset(other_keys, names(x))
+  # Fix up if given more than just other keys, at least until epipredict#428
+  # merged:
+  other_keys <- other_keys[!other_keys %in% c("geo_value", "time_value")]
 
   if (".time_value_counts" %in% other_keys) {
     cli_abort("as_epi_df: `other_keys` can't include \".time_value_counts\"")
   }
 
-  if (anyDuplicated(x[c("geo_value", "time_value", other_keys)])) {
-    duplicated_time_values <- x %>%
-      group_by(across(all_of(c("geo_value", "time_value", other_keys)))) %>%
-      filter(dplyr::n() > 1) %>%
-      ungroup()
-    bad_data <- capture.output(duplicated_time_values)
-    cli_abort(
-      "as_epi_df: some groups in the data have duplicated time values. epi_df requires a unique time_value per group.",
-      body = c("Sample groups:", bad_data)
-    )
-  }
+  assert(check_ukey_unique(x, c("geo_value", other_keys, "time_value"), c(
+    ">" = "If this is line list data, convert it to counts/rates first.",
+    ">" = "If this contains a demographic breakdown, check that you have
+           specified appropriate `other_keys`" # . from checkmate
+  )))
 
   new_epi_df(x, geo_type, time_type, as_of, other_keys)
 }

diff --git a/R/epiprocess-package.R b/R/epiprocess-package.R
@@ -5,20 +5,26 @@
 #' @import epidatasets
 #' @importFrom checkmate anyInfinite anyMissing assert assert_character
 #' @importFrom checkmate assert_class assert_data_frame assert_int assert_list
+#' @importFrom checkmate assert_false
 #' @importFrom checkmate assert_logical assert_numeric assert_scalar checkInt
 #' @importFrom checkmate assert_string
+#' @importFrom checkmate assert_subset
+#' @importFrom checkmate assert_tibble
 #' @importFrom checkmate check_atomic check_data_frame expect_class test_int
 #' @importFrom checkmate check_names
 #' @importFrom checkmate test_subset test_set_equal vname
 #' @importFrom cli cli_abort cli_warn
 #' @importFrom data.table as.data.table
 #' @importFrom data.table key
 #' @importFrom data.table setkeyv
+#' @importFrom dplyr arrange
+#' @importFrom dplyr is_grouped_df
 #' @importFrom dplyr select
 #' @importFrom lifecycle deprecated
 #' @importFrom rlang %||%
 #' @importFrom rlang is_bare_integerish
 #' @importFrom vctrs vec_data
+#' @importFrom vctrs vec_equal
 ## usethis namespace: end
 NULL
 

diff --git a/R/slide.R b/R/slide.R
@@ -259,18 +259,7 @@ epi_slide <- function(
   assert_logical(.all_rows, len = 1)
 
   # Check for duplicated time values within groups
-  duplicated_time_values <- .x %>%
-    group_epi_df() %>%
-    filter(dplyr::n() > 1) %>%
-    ungroup()
-  if (nrow(duplicated_time_values) > 0) {
-    bad_data <- capture.output(duplicated_time_values)
-    cli_abort(
-      "as_epi_df: some groups in a resulting dplyr computation have duplicated time values.
-      epi_df requires a unique time_value per group.",
-      body = c("Sample groups:", bad_data)
-    )
-  }
+  assert(check_ukey_unique(ungroup(.x), c(group_vars(.x), "time_value")))
 
   # Begin handling completion. This will create a complete time index between
   # the smallest and largest time values in the data. This is used to ensure
@@ -752,6 +741,9 @@ epi_slide_opt <- function(
     )
   }
 
+  # Check for duplicated time values within groups
+  assert(check_ukey_unique(ungroup(.x), c(group_vars(.x), "time_value")))
+
   # The position of a given column can be differ between input `.x` and
   # `.data_group` since the grouping step by default drops grouping columns.
   # To avoid rerunning `eval_select` for every `.data_group`, convert

diff --git a/R/utils.R b/R/utils.R
@@ -1195,7 +1195,61 @@ time_type_unit_abbr <- function(time_type) {
   maybe_unit_abbr
 }
 
+#' Extract singular element of a length-1 unnamed list (validated)
+#'
+#' Inverse of `list(elt)`.
+#'
+#' @param x a length-1 list
+#' @return x[[1L]], if x actually was a length-1 list; error otherwise
+#'
+#' @keywords internal
 unwrap <- function(x) {
   checkmate::assert_list(x, len = 1L, names = "unnamed")
   x[[1L]]
 }
+
+#' Check that a unique key is indeed unique in a tibble (TRUE/str)
+#'
+#' A `checkmate`-style check function.
+#'
+#' @param x a tibble, with no particular row or column order (if you have a
+#'   guaranteed row order based on the ukey you can probably do something more
+#'   efficient)
+#' @param ukey_names character vector; subset of column names of `x` denoting a
+#'   unique key.
+#' @param end_cli_message optional character vector, a cli message format
+#'   string/vector; information/advice to tack onto any error messages.
+#' @return `TRUE` if no ukey is duplicated (i.e., `x[ukey_names]` has no
+#'   duplicated rows); string with an error message if there are errors.
+#'
+#' @keywords internal
+check_ukey_unique <- function(x, ukey_names, end_cli_message = character()) {
+  assert_tibble(x) # to not have to think about `data.table` perf, xface
+  assert_false(is_grouped_df(x)) # to not have to think about `grouped_df` perf, xface
+  assert_character(ukey_names)
+  assert_subset(ukey_names, names(x))
+  #
+  if (nrow(x) <= 1L) {
+    TRUE
+  } else {
+    # Fast check, slow error message.
+    arranged_ukeys <- arrange(x[ukey_names], across(all_of(ukey_names)))
+    if (!any(vec_equal(arranged_ukeys[-1L, ], arranged_ukeys[-nrow(arranged_ukeys), ]))) {
+      TRUE
+    } else {
+      bad_data <- x %>%
+        group_by(across(all_of(ukey_names))) %>%
+        filter(dplyr::n() > 1) %>%
+        ungroup()
+      lines <- c(
+        cli::format_error("
+          There cannot be more than one row with the same combination of
+          {format_varnames(ukey_names)}.  Problematic rows:
+        "),
+        capture.output(bad_data),
+        cli::format_message(end_cli_message)
+      )
+      paste(collapse = "\n", lines)
+    }
+  }
+}
diff --git a/man/check_ukey_unique.Rd b/man/check_ukey_unique.Rd
diff --git a/man/unwrap.Rd b/man/unwrap.Rd
diff --git a/tests/testthat/_snaps/epi_df.md b/tests/testthat/_snaps/epi_df.md
@@ -0,0 +1,15 @@
+# as_epi_df errors on nonunique epikeytime
+
+    Code
+      as_epi_df(tibble::tibble(geo_value = 1, time_value = 1, value = 1:2), as_of = 5)
+    Condition
+      Error:
+      ! Assertion on 'x' failed: There cannot be more than one row with the same combination of geo_value and time_value.  Problematic rows:
+      # A tibble: 2 x 3
+        geo_value time_value value
+            <dbl>      <dbl> <int>
+      1         1          1     1
+      2         1          1     2
+      > If this is line list data, convert it to counts/rates first.
+      > If this contains a demographic breakdown, check that you have specified appropriate `other_keys`.
+
diff --git a/tests/testthat/test-epi_df.R b/tests/testthat/test-epi_df.R
@@ -40,6 +40,20 @@ test_that("as_epi_df errors for non-character other_keys", {
   expect_silent(as_epi_df(ex_input, other_keys = c("state", "pol")))
 })
 
+test_that("as_epi_df errors on nonunique epikeytime", {
+  expect_snapshot(
+    as_epi_df(tibble::tibble(
+      geo_value = 1, time_value = 1, value = 1:2
+    ), as_of = 5),
+    error = TRUE
+  )
+  expect_no_error(
+    as_epi_df(tibble::tibble(
+      geo_value = 1, age_group = 1:2, time_value = 1, value = 1:2
+    ), other_keys = "age_group", as_of = 5)
+  )
+})
+
 test_that("as_epi_df works for nonstandard input", {
   tib <- tibble::tibble(
     x = 1:10, y = 1:10,