From 081bd22f69c5924c9c173050a5e6a9a71cd886da Mon Sep 17 00:00:00 2001
From: "Logan C. Brooks" <lcbrooks+github@andrew.cmu.edu>
Date: Tue, 26 Nov 2024 04:54:03 -0800
Subject: [PATCH] Bring back slide unique-key checks, make as_epi_df ukey
 checks faster

---
 DESCRIPTION                     |  2 +-
 NAMESPACE                       |  5 +++
 NEWS.md                         |  2 ++
 R/epi_df.R                      | 20 ++++++------
 R/epiprocess-package.R          |  6 ++++
 R/slide.R                       | 16 +++-------
 R/utils.R                       | 54 +++++++++++++++++++++++++++++++++
 man/check_ukey_unique.Rd        | 27 +++++++++++++++++
 man/unwrap.Rd                   | 18 +++++++++++
 tests/testthat/_snaps/epi_df.md | 15 +++++++++
 tests/testthat/test-epi_df.R    | 14 +++++++++
 11 files changed, 155 insertions(+), 24 deletions(-)
 create mode 100644 man/check_ukey_unique.Rd
 create mode 100644 man/unwrap.Rd
 create mode 100644 tests/testthat/_snaps/epi_df.md

diff --git a/DESCRIPTION b/DESCRIPTION
index fd09aa57..c3f67c08 100755
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: epiprocess
 Type: Package
 Title: Tools for basic signal processing in epidemiology
-Version: 0.9.6
+Version: 0.9.7
 Authors@R: c(
     person("Jacob", "Bien", role = "ctb"),
     person("Logan", "Brooks", , "lcbrooks@andrew.cmu.edu", role = c("aut", "cre")),
diff --git a/NAMESPACE b/NAMESPACE
index e214d8f3..1f5180fb 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -109,6 +109,7 @@ importFrom(checkmate,assert)
 importFrom(checkmate,assert_character)
 importFrom(checkmate,assert_class)
 importFrom(checkmate,assert_data_frame)
+importFrom(checkmate,assert_false)
 importFrom(checkmate,assert_function)
 importFrom(checkmate,assert_int)
 importFrom(checkmate,assert_list)
@@ -116,6 +117,8 @@ importFrom(checkmate,assert_logical)
 importFrom(checkmate,assert_numeric)
 importFrom(checkmate,assert_scalar)
 importFrom(checkmate,assert_string)
+importFrom(checkmate,assert_subset)
+importFrom(checkmate,assert_tibble)
 importFrom(checkmate,checkInt)
 importFrom(checkmate,check_atomic)
 importFrom(checkmate,check_data_frame)
@@ -165,6 +168,7 @@ importFrom(dplyr,groups)
 importFrom(dplyr,if_all)
 importFrom(dplyr,if_any)
 importFrom(dplyr,if_else)
+importFrom(dplyr,is_grouped_df)
 importFrom(dplyr,lag)
 importFrom(dplyr,mutate)
 importFrom(dplyr,near)
@@ -236,3 +240,4 @@ importFrom(tsibble,as_tsibble)
 importFrom(utils,capture.output)
 importFrom(utils,tail)
 importFrom(vctrs,vec_data)
+importFrom(vctrs,vec_equal)
diff --git a/NEWS.md b/NEWS.md
index 436b0c7d..ba6826da 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -18,6 +18,7 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat
   additional arguments for specifying names: `.prefix`, `.suffix`,
   `.new_col_names`. To obtain the old naming behavior, use `.prefix =
   "slide_value_"`.
+- `as_epi_df` now removes any grouping that `x` had applied.
 
 ## Improvements
 
@@ -31,6 +32,7 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat
   by `geo_value` and any `other_keys` for the slide operation rather than raise
   an error about duplicated time values. `epi_slide`'s analogous automatic
   grouping has been made temporary in order to match.
+- Improved speed of key-uniqueness checks.
 
 ## Bug fixes
 
diff --git a/R/epi_df.R b/R/epi_df.R
index 83cca073..6cae22dd 100644
--- a/R/epi_df.R
+++ b/R/epi_df.R
@@ -279,22 +279,20 @@ as_epi_df.tbl_df <- function(
   }
 
   assert_character(other_keys)
+  assert_subset(other_keys, names(x))
+  # Fix up if given more than just other keys, at least until epipredict#428
+  # merged:
+  other_keys <- other_keys[!other_keys %in% c("geo_value", "time_value")]
 
   if (".time_value_counts" %in% other_keys) {
     cli_abort("as_epi_df: `other_keys` can't include \".time_value_counts\"")
   }
 
-  if (anyDuplicated(x[c("geo_value", "time_value", other_keys)])) {
-    duplicated_time_values <- x %>%
-      group_by(across(all_of(c("geo_value", "time_value", other_keys)))) %>%
-      filter(dplyr::n() > 1) %>%
-      ungroup()
-    bad_data <- capture.output(duplicated_time_values)
-    cli_abort(
-      "as_epi_df: some groups in the data have duplicated time values. epi_df requires a unique time_value per group.",
-      body = c("Sample groups:", bad_data)
-    )
-  }
+  assert(check_ukey_unique(x, c("geo_value", other_keys, "time_value"), c(
+    ">" = "If this is line list data, convert it to counts/rates first.",
+    ">" = "If this contains a demographic breakdown, check that you have
+           specified appropriate `other_keys`" # . from checkmate
+  )))
 
   new_epi_df(x, geo_type, time_type, as_of, other_keys)
 }
diff --git a/R/epiprocess-package.R b/R/epiprocess-package.R
index 68b7b9b5..675d000d 100644
--- a/R/epiprocess-package.R
+++ b/R/epiprocess-package.R
@@ -5,8 +5,11 @@
 #' @import epidatasets
 #' @importFrom checkmate anyInfinite anyMissing assert assert_character
 #' @importFrom checkmate assert_class assert_data_frame assert_int assert_list
+#' @importFrom checkmate assert_false
 #' @importFrom checkmate assert_logical assert_numeric assert_scalar checkInt
 #' @importFrom checkmate assert_string
+#' @importFrom checkmate assert_subset
+#' @importFrom checkmate assert_tibble
 #' @importFrom checkmate check_atomic check_data_frame expect_class test_int
 #' @importFrom checkmate check_names
 #' @importFrom checkmate test_subset test_set_equal vname
@@ -14,11 +17,14 @@
 #' @importFrom data.table as.data.table
 #' @importFrom data.table key
 #' @importFrom data.table setkeyv
+#' @importFrom dplyr arrange
+#' @importFrom dplyr is_grouped_df
 #' @importFrom dplyr select
 #' @importFrom lifecycle deprecated
 #' @importFrom rlang %||%
 #' @importFrom rlang is_bare_integerish
 #' @importFrom vctrs vec_data
+#' @importFrom vctrs vec_equal
 ## usethis namespace: end
 NULL
 
diff --git a/R/slide.R b/R/slide.R
index 0c747d17..7342e0fd 100644
--- a/R/slide.R
+++ b/R/slide.R
@@ -259,18 +259,7 @@ epi_slide <- function(
   assert_logical(.all_rows, len = 1)
 
   # Check for duplicated time values within groups
-  duplicated_time_values <- .x %>%
-    group_epi_df() %>%
-    filter(dplyr::n() > 1) %>%
-    ungroup()
-  if (nrow(duplicated_time_values) > 0) {
-    bad_data <- capture.output(duplicated_time_values)
-    cli_abort(
-      "as_epi_df: some groups in a resulting dplyr computation have duplicated time values.
-      epi_df requires a unique time_value per group.",
-      body = c("Sample groups:", bad_data)
-    )
-  }
+  assert(check_ukey_unique(ungroup(.x), c(group_vars(.x), "time_value")))
 
   # Begin handling completion. This will create a complete time index between
   # the smallest and largest time values in the data. This is used to ensure
@@ -752,6 +741,9 @@ epi_slide_opt <- function(
     )
   }
 
+  # Check for duplicated time values within groups
+  assert(check_ukey_unique(ungroup(.x), c(group_vars(.x), "time_value")))
+
   # The position of a given column can be differ between input `.x` and
   # `.data_group` since the grouping step by default drops grouping columns.
   # To avoid rerunning `eval_select` for every `.data_group`, convert
diff --git a/R/utils.R b/R/utils.R
index 55022677..e350ade2 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -1195,7 +1195,61 @@ time_type_unit_abbr <- function(time_type) {
   maybe_unit_abbr
 }
 
+#' Extract singular element of a length-1 unnamed list (validated)
+#'
+#' Inverse of `list(elt)`.
+#'
+#' @param x a length-1 list
+#' @return x[[1L]], if x actually was a length-1 list; error otherwise
+#'
+#' @keywords internal
 unwrap <- function(x) {
   checkmate::assert_list(x, len = 1L, names = "unnamed")
   x[[1L]]
 }
+
+#' Check that a unique key is indeed unique in a tibble (TRUE/str)
+#'
+#' A `checkmate`-style check function.
+#'
+#' @param x a tibble, with no particular row or column order (if you have a
+#'   guaranteed row order based on the ukey you can probably do something more
+#'   efficient)
+#' @param ukey_names character vector; subset of column names of `x` denoting a
+#'   unique key.
+#' @param end_cli_message optional character vector, a cli message format
+#'   string/vector; information/advice to tack onto any error messages.
+#' @return `TRUE` if no ukey is duplicated (i.e., `x[ukey_names]` has no
+#'   duplicated rows); string with an error message if there are errors.
+#'
+#' @keywords internal
+check_ukey_unique <- function(x, ukey_names, end_cli_message = character()) {
+  assert_tibble(x) # to not have to think about `data.table` perf, xface
+  assert_false(is_grouped_df(x)) # to not have to think about `grouped_df` perf, xface
+  assert_character(ukey_names)
+  assert_subset(ukey_names, names(x))
+  #
+  if (nrow(x) <= 1L) {
+    TRUE
+  } else {
+    # Fast check, slow error message.
+    arranged_ukeys <- arrange(x[ukey_names], across(all_of(ukey_names)))
+    if (!any(vec_equal(arranged_ukeys[-1L, ], arranged_ukeys[-nrow(arranged_ukeys), ]))) {
+      TRUE
+    } else {
+      bad_data <- x %>%
+        group_by(across(all_of(ukey_names))) %>%
+        filter(dplyr::n() > 1) %>%
+        ungroup()
+      lines <- c(
+        cli::format_error("
+          There cannot be more than one row with the same combination of
+          {format_varnames(ukey_names)}.  Problematic rows:
+        "),
+        capture.output(bad_data),
+        cli::format_message(end_cli_message)
+      )
+      paste(collapse = "\n", lines)
+    }
+  }
+}
diff --git a/man/check_ukey_unique.Rd b/man/check_ukey_unique.Rd
new file mode 100644
index 00000000..c6306f07
--- /dev/null
+++ b/man/check_ukey_unique.Rd
@@ -0,0 +1,27 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils.R
+\name{check_ukey_unique}
+\alias{check_ukey_unique}
+\title{Check that a unique key is indeed unique in a tibble (TRUE/str)}
+\usage{
+check_ukey_unique(x, ukey_names, end_cli_message = character())
+}
+\arguments{
+\item{x}{a tibble, with no particular row or column order (if you have a
+guaranteed row order based on the ukey you can probably do something more
+efficient)}
+
+\item{ukey_names}{character vector; subset of column names of \code{x} denoting a
+unique key.}
+
+\item{end_cli_message}{optional character vector, a cli message format
+string/vector; information/advice to tack onto any error messages.}
+}
+\value{
+\code{TRUE} if no ukey is duplicated (i.e., \code{x[ukey_names]} has no
+duplicated rows); string with an error message if there are errors.
+}
+\description{
+A \code{checkmate}-style check function.
+}
+\keyword{internal}
diff --git a/man/unwrap.Rd b/man/unwrap.Rd
new file mode 100644
index 00000000..dad0b441
--- /dev/null
+++ b/man/unwrap.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils.R
+\name{unwrap}
+\alias{unwrap}
+\title{Extract singular element of a length-1 unnamed list (validated)}
+\usage{
+unwrap(x)
+}
+\arguments{
+\item{x}{a length-1 list}
+}
+\value{
+x[\link{1L}], if x actually was a length-1 list; error otherwise
+}
+\description{
+Inverse of \code{list(elt)}.
+}
+\keyword{internal}
diff --git a/tests/testthat/_snaps/epi_df.md b/tests/testthat/_snaps/epi_df.md
new file mode 100644
index 00000000..29280bf8
--- /dev/null
+++ b/tests/testthat/_snaps/epi_df.md
@@ -0,0 +1,15 @@
+# as_epi_df errors on nonunique epikeytime
+
+    Code
+      as_epi_df(tibble::tibble(geo_value = 1, time_value = 1, value = 1:2), as_of = 5)
+    Condition
+      Error:
+      ! Assertion on 'x' failed: There cannot be more than one row with the same combination of geo_value and time_value.  Problematic rows:
+      # A tibble: 2 x 3
+        geo_value time_value value
+            <dbl>      <dbl> <int>
+      1         1          1     1
+      2         1          1     2
+      > If this is line list data, convert it to counts/rates first.
+      > If this contains a demographic breakdown, check that you have specified appropriate `other_keys`.
+
diff --git a/tests/testthat/test-epi_df.R b/tests/testthat/test-epi_df.R
index 44bb62e2..c3e51aa2 100644
--- a/tests/testthat/test-epi_df.R
+++ b/tests/testthat/test-epi_df.R
@@ -40,6 +40,20 @@ test_that("as_epi_df errors for non-character other_keys", {
   expect_silent(as_epi_df(ex_input, other_keys = c("state", "pol")))
 })
 
+test_that("as_epi_df errors on nonunique epikeytime", {
+  expect_snapshot(
+    as_epi_df(tibble::tibble(
+      geo_value = 1, time_value = 1, value = 1:2
+    ), as_of = 5),
+    error = TRUE
+  )
+  expect_no_error(
+    as_epi_df(tibble::tibble(
+      geo_value = 1, age_group = 1:2, time_value = 1, value = 1:2
+    ), other_keys = "age_group", as_of = 5)
+  )
+})
+
 test_that("as_epi_df works for nonstandard input", {
   tib <- tibble::tibble(
     x = 1:10, y = 1:10,