Skip to content

Commit

Permalink
Bring back slide unique-key checks, make as_epi_df ukey checks faster
Browse files Browse the repository at this point in the history
  • Loading branch information
brookslogan committed Nov 26, 2024
1 parent 0352d7b commit 95b485f
Show file tree
Hide file tree
Showing 10 changed files with 154 additions and 23 deletions.
5 changes: 5 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -109,13 +109,16 @@ importFrom(checkmate,assert)
importFrom(checkmate,assert_character)
importFrom(checkmate,assert_class)
importFrom(checkmate,assert_data_frame)
importFrom(checkmate,assert_false)
importFrom(checkmate,assert_function)
importFrom(checkmate,assert_int)
importFrom(checkmate,assert_list)
importFrom(checkmate,assert_logical)
importFrom(checkmate,assert_numeric)
importFrom(checkmate,assert_scalar)
importFrom(checkmate,assert_string)
importFrom(checkmate,assert_subset)
importFrom(checkmate,assert_tibble)
importFrom(checkmate,checkInt)
importFrom(checkmate,check_atomic)
importFrom(checkmate,check_data_frame)
Expand Down Expand Up @@ -165,6 +168,7 @@ importFrom(dplyr,groups)
importFrom(dplyr,if_all)
importFrom(dplyr,if_any)
importFrom(dplyr,if_else)
importFrom(dplyr,is_grouped_df)
importFrom(dplyr,lag)
importFrom(dplyr,mutate)
importFrom(dplyr,near)
Expand Down Expand Up @@ -236,3 +240,4 @@ importFrom(tsibble,as_tsibble)
importFrom(utils,capture.output)
importFrom(utils,tail)
importFrom(vctrs,vec_data)
importFrom(vctrs,vec_equal)
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat
additional arguments for specifying names: `.prefix`, `.suffix`,
`.new_col_names`. To obtain the old naming behavior, use `.prefix =
"slide_value_"`.
- `as_epi_df` now removes any grouping that `x` had applied.

## Improvements

Expand All @@ -31,6 +32,7 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat
by `geo_value` and any `other_keys` for the slide operation rather than raise
an error about duplicated time values. `epi_slide`'s analogous automatic
grouping has been made temporary in order to match.
- Improved speed of key-uniqueness checks.

## Bug fixes

Expand Down
20 changes: 9 additions & 11 deletions R/epi_df.R
Original file line number Diff line number Diff line change
Expand Up @@ -279,22 +279,20 @@ as_epi_df.tbl_df <- function(
}

assert_character(other_keys)
assert_subset(other_keys, names(x))
# Fix up if given more than just other keys, at least until epipredict#428
# merged:
other_keys <- other_keys[!other_keys %in% c("geo_value", "time_value")]

if (".time_value_counts" %in% other_keys) {
cli_abort("as_epi_df: `other_keys` can't include \".time_value_counts\"")
}

if (anyDuplicated(x[c("geo_value", "time_value", other_keys)])) {
duplicated_time_values <- x %>%
group_by(across(all_of(c("geo_value", "time_value", other_keys)))) %>%
filter(dplyr::n() > 1) %>%
ungroup()
bad_data <- capture.output(duplicated_time_values)
cli_abort(
"as_epi_df: some groups in the data have duplicated time values. epi_df requires a unique time_value per group.",
body = c("Sample groups:", bad_data)
)
}
assert(check_ukey_unique(x, c("geo_value", other_keys, "time_value"), c(
">" = "If this is line list data, convert it to counts/rates first.",
">" = "If this contains a demographic breakdown, check that you have
specified appropriate `other_keys`" # . from checkmate
)))

new_epi_df(x, geo_type, time_type, as_of, other_keys)
}
Expand Down
6 changes: 6 additions & 0 deletions R/epiprocess-package.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,26 @@
#' @import epidatasets
#' @importFrom checkmate anyInfinite anyMissing assert assert_character
#' @importFrom checkmate assert_class assert_data_frame assert_int assert_list
#' @importFrom checkmate assert_false
#' @importFrom checkmate assert_logical assert_numeric assert_scalar checkInt
#' @importFrom checkmate assert_string
#' @importFrom checkmate assert_subset
#' @importFrom checkmate assert_tibble
#' @importFrom checkmate check_atomic check_data_frame expect_class test_int
#' @importFrom checkmate check_names
#' @importFrom checkmate test_subset test_set_equal vname
#' @importFrom cli cli_abort cli_warn
#' @importFrom data.table as.data.table
#' @importFrom data.table key
#' @importFrom data.table setkeyv
#' @importFrom dplyr arrange
#' @importFrom dplyr is_grouped_df
#' @importFrom dplyr select
#' @importFrom lifecycle deprecated
#' @importFrom rlang %||%
#' @importFrom rlang is_bare_integerish
#' @importFrom vctrs vec_data
#' @importFrom vctrs vec_equal
## usethis namespace: end
NULL

Expand Down
16 changes: 4 additions & 12 deletions R/slide.R
Original file line number Diff line number Diff line change
Expand Up @@ -259,18 +259,7 @@ epi_slide <- function(
assert_logical(.all_rows, len = 1)

# Check for duplicated time values within groups
duplicated_time_values <- .x %>%
group_epi_df() %>%
filter(dplyr::n() > 1) %>%
ungroup()
if (nrow(duplicated_time_values) > 0) {
bad_data <- capture.output(duplicated_time_values)
cli_abort(
"as_epi_df: some groups in a resulting dplyr computation have duplicated time values.
epi_df requires a unique time_value per group.",
body = c("Sample groups:", bad_data)
)
}
assert(check_ukey_unique(ungroup(.x), c(group_vars(.x), "time_value")))

# Begin handling completion. This will create a complete time index between
# the smallest and largest time values in the data. This is used to ensure
Expand Down Expand Up @@ -752,6 +741,9 @@ epi_slide_opt <- function(
)
}

# Check for duplicated time values within groups
assert(check_ukey_unique(ungroup(.x), c(group_vars(.x), "time_value")))

# The position of a given column can be differ between input `.x` and
# `.data_group` since the grouping step by default drops grouping columns.
# To avoid rerunning `eval_select` for every `.data_group`, convert
Expand Down
54 changes: 54 additions & 0 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -1195,7 +1195,61 @@ time_type_unit_abbr <- function(time_type) {
maybe_unit_abbr
}

#' Extract singular element of a length-1 unnamed list (validated)
#'
#' Inverse of `list(elt)`.
#'
#' @param x a length-1 list
#' @return x[[1L]], if x actually was a length-1 list; error otherwise
#'
#' @keywords internal
unwrap <- function(x) {
checkmate::assert_list(x, len = 1L, names = "unnamed")
x[[1L]]
}

#' Check that a unique key is indeed unique in a tibble (TRUE/str)
#'
#' A `checkmate`-style check function.
#'
#' @param x a tibble, with no particular row or column order (if you have a
#' guaranteed row order based on the ukey you can probably do something more
#' efficient)
#' @param ukey_names character vector; subset of column names of `x` denoting a
#' unique key.
#' @param end_cli_message optional character vector, a cli message format
#' string/vector; information/advice to tack onto any error messages.
#' @return `TRUE` if no ukey is duplicated (i.e., `x[ukey_names]` has no
#' duplicated rows); string with an error message if there are errors.
#'
#' @keywords internal
check_ukey_unique <- function(x, ukey_names, end_cli_message = character()) {
assert_tibble(x) # to not have to think about `data.table` perf, xface
assert_false(is_grouped_df(x)) # to not have to think about `grouped_df` perf, xface
assert_character(ukey_names)
assert_subset(ukey_names, names(x))
#
if (nrow(x) <= 1L) {
TRUE
} else {
# Fast check, slow error message.
arranged_ukeys <- arrange(x[ukey_names], across(all_of(ukey_names)))
if (!any(vec_equal(arranged_ukeys[-1L, ], arranged_ukeys[-nrow(arranged_ukeys), ]))) {
TRUE
} else {
bad_data <- x %>%
group_by(across(all_of(ukey_names))) %>%
filter(dplyr::n() > 1) %>%
ungroup()
lines <- c(
cli::format_error("
There cannot be more than one row with the same combination of
{format_varnames(ukey_names)}. Problematic rows:
"),
capture.output(bad_data),
cli::format_message(end_cli_message)
)
paste(collapse = "\n", lines)
}
}
}
27 changes: 27 additions & 0 deletions man/check_ukey_unique.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

18 changes: 18 additions & 0 deletions man/unwrap.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

15 changes: 15 additions & 0 deletions tests/testthat/_snaps/epi_df.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# as_epi_df errors on nonunique epikeytime

Code
as_epi_df(tibble::tibble(geo_value = 1, time_value = 1, value = 1:2), as_of = 5)
Condition
Error:
! Assertion on 'x' failed: There cannot be more than one row with the same combination of geo_value and time_value. Problematic rows:
# A tibble: 2 x 3
geo_value time_value value
<dbl> <dbl> <int>
1 1 1 1
2 1 1 2
> If this is line list data, convert it to counts/rates first.
> If this contains a demographic breakdown, check that you have specified appropriate `other_keys`.

14 changes: 14 additions & 0 deletions tests/testthat/test-epi_df.R
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,20 @@ test_that("as_epi_df errors for non-character other_keys", {
expect_silent(as_epi_df(ex_input, other_keys = c("state", "pol")))
})

test_that("as_epi_df errors on nonunique epikeytime", {
expect_snapshot(
as_epi_df(tibble::tibble(
geo_value = 1, time_value = 1, value = 1:2
), as_of = 5),
error = TRUE
)
expect_no_error(
as_epi_df(tibble::tibble(
geo_value = 1, age_group = 1:2, time_value = 1, value = 1:2
), other_keys = "age_group", as_of = 5)
)
})

test_that("as_epi_df works for nonstandard input", {
tib <- tibble::tibble(
x = 1:10, y = 1:10,
Expand Down

0 comments on commit 95b485f

Please sign in to comment.