From 081bd22f69c5924c9c173050a5e6a9a71cd886da Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Tue, 26 Nov 2024 04:54:03 -0800 Subject: [PATCH] Bring back slide unique-key checks, make as_epi_df ukey checks faster --- DESCRIPTION | 2 +- NAMESPACE | 5 +++ NEWS.md | 2 ++ R/epi_df.R | 20 ++++++------ R/epiprocess-package.R | 6 ++++ R/slide.R | 16 +++------- R/utils.R | 54 +++++++++++++++++++++++++++++++++ man/check_ukey_unique.Rd | 27 +++++++++++++++++ man/unwrap.Rd | 18 +++++++++++ tests/testthat/_snaps/epi_df.md | 15 +++++++++ tests/testthat/test-epi_df.R | 14 +++++++++ 11 files changed, 155 insertions(+), 24 deletions(-) create mode 100644 man/check_ukey_unique.Rd create mode 100644 man/unwrap.Rd create mode 100644 tests/testthat/_snaps/epi_df.md diff --git a/DESCRIPTION b/DESCRIPTION index fd09aa57..c3f67c08 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: epiprocess Type: Package Title: Tools for basic signal processing in epidemiology -Version: 0.9.6 +Version: 0.9.7 Authors@R: c( person("Jacob", "Bien", role = "ctb"), person("Logan", "Brooks", , "lcbrooks@andrew.cmu.edu", role = c("aut", "cre")), diff --git a/NAMESPACE b/NAMESPACE index e214d8f3..1f5180fb 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -109,6 +109,7 @@ importFrom(checkmate,assert) importFrom(checkmate,assert_character) importFrom(checkmate,assert_class) importFrom(checkmate,assert_data_frame) +importFrom(checkmate,assert_false) importFrom(checkmate,assert_function) importFrom(checkmate,assert_int) importFrom(checkmate,assert_list) @@ -116,6 +117,8 @@ importFrom(checkmate,assert_logical) importFrom(checkmate,assert_numeric) importFrom(checkmate,assert_scalar) importFrom(checkmate,assert_string) +importFrom(checkmate,assert_subset) +importFrom(checkmate,assert_tibble) importFrom(checkmate,checkInt) importFrom(checkmate,check_atomic) importFrom(checkmate,check_data_frame) @@ -165,6 +168,7 @@ importFrom(dplyr,groups) importFrom(dplyr,if_all) importFrom(dplyr,if_any) importFrom(dplyr,if_else) +importFrom(dplyr,is_grouped_df) importFrom(dplyr,lag) importFrom(dplyr,mutate) importFrom(dplyr,near) @@ -236,3 +240,4 @@ importFrom(tsibble,as_tsibble) importFrom(utils,capture.output) importFrom(utils,tail) importFrom(vctrs,vec_data) +importFrom(vctrs,vec_equal) diff --git a/NEWS.md b/NEWS.md index 436b0c7d..ba6826da 100644 --- a/NEWS.md +++ b/NEWS.md @@ -18,6 +18,7 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat additional arguments for specifying names: `.prefix`, `.suffix`, `.new_col_names`. To obtain the old naming behavior, use `.prefix = "slide_value_"`. +- `as_epi_df` now removes any grouping that `x` had applied. ## Improvements @@ -31,6 +32,7 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat by `geo_value` and any `other_keys` for the slide operation rather than raise an error about duplicated time values. `epi_slide`'s analogous automatic grouping has been made temporary in order to match. +- Improved speed of key-uniqueness checks. ## Bug fixes diff --git a/R/epi_df.R b/R/epi_df.R index 83cca073..6cae22dd 100644 --- a/R/epi_df.R +++ b/R/epi_df.R @@ -279,22 +279,20 @@ as_epi_df.tbl_df <- function( } assert_character(other_keys) + assert_subset(other_keys, names(x)) + # Fix up if given more than just other keys, at least until epipredict#428 + # merged: + other_keys <- other_keys[!other_keys %in% c("geo_value", "time_value")] if (".time_value_counts" %in% other_keys) { cli_abort("as_epi_df: `other_keys` can't include \".time_value_counts\"") } - if (anyDuplicated(x[c("geo_value", "time_value", other_keys)])) { - duplicated_time_values <- x %>% - group_by(across(all_of(c("geo_value", "time_value", other_keys)))) %>% - filter(dplyr::n() > 1) %>% - ungroup() - bad_data <- capture.output(duplicated_time_values) - cli_abort( - "as_epi_df: some groups in the data have duplicated time values. epi_df requires a unique time_value per group.", - body = c("Sample groups:", bad_data) - ) - } + assert(check_ukey_unique(x, c("geo_value", other_keys, "time_value"), c( + ">" = "If this is line list data, convert it to counts/rates first.", + ">" = "If this contains a demographic breakdown, check that you have + specified appropriate `other_keys`" # . from checkmate + ))) new_epi_df(x, geo_type, time_type, as_of, other_keys) } diff --git a/R/epiprocess-package.R b/R/epiprocess-package.R index 68b7b9b5..675d000d 100644 --- a/R/epiprocess-package.R +++ b/R/epiprocess-package.R @@ -5,8 +5,11 @@ #' @import epidatasets #' @importFrom checkmate anyInfinite anyMissing assert assert_character #' @importFrom checkmate assert_class assert_data_frame assert_int assert_list +#' @importFrom checkmate assert_false #' @importFrom checkmate assert_logical assert_numeric assert_scalar checkInt #' @importFrom checkmate assert_string +#' @importFrom checkmate assert_subset +#' @importFrom checkmate assert_tibble #' @importFrom checkmate check_atomic check_data_frame expect_class test_int #' @importFrom checkmate check_names #' @importFrom checkmate test_subset test_set_equal vname @@ -14,11 +17,14 @@ #' @importFrom data.table as.data.table #' @importFrom data.table key #' @importFrom data.table setkeyv +#' @importFrom dplyr arrange +#' @importFrom dplyr is_grouped_df #' @importFrom dplyr select #' @importFrom lifecycle deprecated #' @importFrom rlang %||% #' @importFrom rlang is_bare_integerish #' @importFrom vctrs vec_data +#' @importFrom vctrs vec_equal ## usethis namespace: end NULL diff --git a/R/slide.R b/R/slide.R index 0c747d17..7342e0fd 100644 --- a/R/slide.R +++ b/R/slide.R @@ -259,18 +259,7 @@ epi_slide <- function( assert_logical(.all_rows, len = 1) # Check for duplicated time values within groups - duplicated_time_values <- .x %>% - group_epi_df() %>% - filter(dplyr::n() > 1) %>% - ungroup() - if (nrow(duplicated_time_values) > 0) { - bad_data <- capture.output(duplicated_time_values) - cli_abort( - "as_epi_df: some groups in a resulting dplyr computation have duplicated time values. - epi_df requires a unique time_value per group.", - body = c("Sample groups:", bad_data) - ) - } + assert(check_ukey_unique(ungroup(.x), c(group_vars(.x), "time_value"))) # Begin handling completion. This will create a complete time index between # the smallest and largest time values in the data. This is used to ensure @@ -752,6 +741,9 @@ epi_slide_opt <- function( ) } + # Check for duplicated time values within groups + assert(check_ukey_unique(ungroup(.x), c(group_vars(.x), "time_value"))) + # The position of a given column can be differ between input `.x` and # `.data_group` since the grouping step by default drops grouping columns. # To avoid rerunning `eval_select` for every `.data_group`, convert diff --git a/R/utils.R b/R/utils.R index 55022677..e350ade2 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1195,7 +1195,61 @@ time_type_unit_abbr <- function(time_type) { maybe_unit_abbr } +#' Extract singular element of a length-1 unnamed list (validated) +#' +#' Inverse of `list(elt)`. +#' +#' @param x a length-1 list +#' @return x[[1L]], if x actually was a length-1 list; error otherwise +#' +#' @keywords internal unwrap <- function(x) { checkmate::assert_list(x, len = 1L, names = "unnamed") x[[1L]] } + +#' Check that a unique key is indeed unique in a tibble (TRUE/str) +#' +#' A `checkmate`-style check function. +#' +#' @param x a tibble, with no particular row or column order (if you have a +#' guaranteed row order based on the ukey you can probably do something more +#' efficient) +#' @param ukey_names character vector; subset of column names of `x` denoting a +#' unique key. +#' @param end_cli_message optional character vector, a cli message format +#' string/vector; information/advice to tack onto any error messages. +#' @return `TRUE` if no ukey is duplicated (i.e., `x[ukey_names]` has no +#' duplicated rows); string with an error message if there are errors. +#' +#' @keywords internal +check_ukey_unique <- function(x, ukey_names, end_cli_message = character()) { + assert_tibble(x) # to not have to think about `data.table` perf, xface + assert_false(is_grouped_df(x)) # to not have to think about `grouped_df` perf, xface + assert_character(ukey_names) + assert_subset(ukey_names, names(x)) + # + if (nrow(x) <= 1L) { + TRUE + } else { + # Fast check, slow error message. + arranged_ukeys <- arrange(x[ukey_names], across(all_of(ukey_names))) + if (!any(vec_equal(arranged_ukeys[-1L, ], arranged_ukeys[-nrow(arranged_ukeys), ]))) { + TRUE + } else { + bad_data <- x %>% + group_by(across(all_of(ukey_names))) %>% + filter(dplyr::n() > 1) %>% + ungroup() + lines <- c( + cli::format_error(" + There cannot be more than one row with the same combination of + {format_varnames(ukey_names)}. Problematic rows: + "), + capture.output(bad_data), + cli::format_message(end_cli_message) + ) + paste(collapse = "\n", lines) + } + } +} diff --git a/man/check_ukey_unique.Rd b/man/check_ukey_unique.Rd new file mode 100644 index 00000000..c6306f07 --- /dev/null +++ b/man/check_ukey_unique.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{check_ukey_unique} +\alias{check_ukey_unique} +\title{Check that a unique key is indeed unique in a tibble (TRUE/str)} +\usage{ +check_ukey_unique(x, ukey_names, end_cli_message = character()) +} +\arguments{ +\item{x}{a tibble, with no particular row or column order (if you have a +guaranteed row order based on the ukey you can probably do something more +efficient)} + +\item{ukey_names}{character vector; subset of column names of \code{x} denoting a +unique key.} + +\item{end_cli_message}{optional character vector, a cli message format +string/vector; information/advice to tack onto any error messages.} +} +\value{ +\code{TRUE} if no ukey is duplicated (i.e., \code{x[ukey_names]} has no +duplicated rows); string with an error message if there are errors. +} +\description{ +A \code{checkmate}-style check function. +} +\keyword{internal} diff --git a/man/unwrap.Rd b/man/unwrap.Rd new file mode 100644 index 00000000..dad0b441 --- /dev/null +++ b/man/unwrap.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{unwrap} +\alias{unwrap} +\title{Extract singular element of a length-1 unnamed list (validated)} +\usage{ +unwrap(x) +} +\arguments{ +\item{x}{a length-1 list} +} +\value{ +x[\link{1L}], if x actually was a length-1 list; error otherwise +} +\description{ +Inverse of \code{list(elt)}. +} +\keyword{internal} diff --git a/tests/testthat/_snaps/epi_df.md b/tests/testthat/_snaps/epi_df.md new file mode 100644 index 00000000..29280bf8 --- /dev/null +++ b/tests/testthat/_snaps/epi_df.md @@ -0,0 +1,15 @@ +# as_epi_df errors on nonunique epikeytime + + Code + as_epi_df(tibble::tibble(geo_value = 1, time_value = 1, value = 1:2), as_of = 5) + Condition + Error: + ! Assertion on 'x' failed: There cannot be more than one row with the same combination of geo_value and time_value. Problematic rows: + # A tibble: 2 x 3 + geo_value time_value value + + 1 1 1 1 + 2 1 1 2 + > If this is line list data, convert it to counts/rates first. + > If this contains a demographic breakdown, check that you have specified appropriate `other_keys`. + diff --git a/tests/testthat/test-epi_df.R b/tests/testthat/test-epi_df.R index 44bb62e2..c3e51aa2 100644 --- a/tests/testthat/test-epi_df.R +++ b/tests/testthat/test-epi_df.R @@ -40,6 +40,20 @@ test_that("as_epi_df errors for non-character other_keys", { expect_silent(as_epi_df(ex_input, other_keys = c("state", "pol"))) }) +test_that("as_epi_df errors on nonunique epikeytime", { + expect_snapshot( + as_epi_df(tibble::tibble( + geo_value = 1, time_value = 1, value = 1:2 + ), as_of = 5), + error = TRUE + ) + expect_no_error( + as_epi_df(tibble::tibble( + geo_value = 1, age_group = 1:2, time_value = 1, value = 1:2 + ), other_keys = "age_group", as_of = 5) + ) +}) + test_that("as_epi_df works for nonstandard input", { tib <- tibble::tibble( x = 1:10, y = 1:10,