Skip to content

Commit

Permalink
Merge pull request #484 from cmu-delphi/lcb/fix-guess-period-datetimes
Browse files Browse the repository at this point in the history
Fix guess_period on datetimes, make it more precise + generic
  • Loading branch information
brookslogan authored Jul 19, 2024
2 parents 69ea5e4 + 8948868 commit a2c5154
Show file tree
Hide file tree
Showing 6 changed files with 122 additions and 37 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Type: Package
Package: epiprocess
Title: Tools for basic signal processing in epidemiology
Version: 0.7.13
Version: 0.7.14
Authors@R: c(
person("Jacob", "Bien", role = "ctb"),
person("Logan", "Brooks", email = "[email protected]", role = c("aut", "cre")),
Expand Down
4 changes: 4 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ S3method(group_by,grouped_epi_archive)
S3method(group_by_drop_default,grouped_epi_archive)
S3method(group_modify,epi_df)
S3method(groups,grouped_epi_archive)
S3method(guess_period,Date)
S3method(guess_period,POSIXt)
S3method(guess_period,default)
S3method(key_colnames,data.frame)
S3method(key_colnames,default)
S3method(key_colnames,epi_archive)
Expand Down Expand Up @@ -65,6 +68,7 @@ export(geo_column_names)
export(group_by)
export(group_modify)
export(growth_rate)
export(guess_period)
export(is_epi_df)
export(is_grouped_epi_archive)
export(key_colnames)
Expand Down
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat
of default conversions, see `time_column_names` for a list of columns that
will automatically be recognized and converted to `time_value` column (there
are similar functions for `geo` and `version`).
- Fixed bug where `epix_slide_ref_time_values_default()` on datetimes would
output a huge number of `ref_time_values` spaced apart by mere seconds.

## Cleanup
- Resolved some linting messages in package checks (#468).
Expand Down
72 changes: 49 additions & 23 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -769,28 +769,54 @@ gcd_num <- function(dividends, ..., rrtol = 1e-6, pqlim = 1e6, irtol = 1e-6) {
vctrs::vec_cast(numeric_gcd, dividends)
}

#' Use max valid period as guess for `period` of `ref_time_values`
#'
#' @param ref_time_values Vector containing time-interval-like or time-like
#' data, with at least two distinct values, [`diff`]-able (e.g., a
#' `time_value` or `version` column), and should have a sensible result from
#' adding `is.numeric` versions of its `diff` result (via `as.integer` if its
#' `typeof` is `"integer"`, otherwise via `as.numeric`).
#' @param ref_time_values_arg Optional, string; name to give `ref_time_values`
#' in error messages. Defaults to quoting the expression the caller fed into
#' the `ref_time_values` argument.
#' @return `is.numeric`, length 1; attempts to match `typeof(ref_time_values)`
guess_period <- function(ref_time_values, ref_time_values_arg = rlang::caller_arg(ref_time_values)) {
sorted_distinct_ref_time_values <- sort(unique(ref_time_values))
if (length(sorted_distinct_ref_time_values) < 2L) {
cli_abort("Not enough distinct values in {.code {ref_time_values_arg}} to guess the period.", ref_time_values_arg)
#' Use max valid period as guess for `period` of `time_values`
#'
#' `r lifecycle::badge("experimental")`
#'
#' @param time_values Vector containing time-interval-like or time-point-like
#' data, with at least two distinct values.
#' @param time_values_arg Optional, string; name to give `time_values` in error
#' messages. Defaults to quoting the expression the caller fed into the
#' `time_values` argument.
#' @param ... Should be empty, there to satisfy the S3 generic.
#' @return length-1 vector; `r lifecycle::badge("experimental")` class will
#' either be the same class as [`base::diff()`] on such time values, an
#' integer, or a double, such that all `time_values` can be exactly obtained
#' by adding `k * result` for an integer k, and such that there is no smaller
#' `result` that can achieve this.
#'
#' @export
guess_period <- function(time_values, time_values_arg = rlang::caller_arg(time_values), ...) {
UseMethod("guess_period")
}

#' @export
guess_period.default <- function(time_values, time_values_arg = rlang::caller_arg(time_values), ...) {
rlang::check_dots_empty()
sorted_distinct_time_values <- sort(unique(time_values))
if (length(sorted_distinct_time_values) < 2L) {
cli_abort("Not enough distinct values in {.code {time_values_arg}} to guess the period.",
class = "epiprocess__guess_period__not_enough_times",
time_values = time_values
)
}
skips <- diff(sorted_distinct_ref_time_values)
decayed_skips <-
if (typeof(skips) == "integer") {
as.integer(skips)
} else {
as.numeric(skips)
}
gcd_num(decayed_skips)
skips <- diff(sorted_distinct_time_values)
# Certain diff results have special classes or attributes; use vctrs to try to
# appropriately destructure for gcd_num, then restore to their original class
# & attributes.
skips_data <- vctrs::vec_data(skips)
period_data <- gcd_num(skips_data, rrtol = 0)
vctrs::vec_restore(period_data, skips)
}

# `full_seq()` doesn't like difftimes, so convert to the natural units of some time types:

#' @export
guess_period.Date <- function(time_values, time_values_arg = rlang::caller_arg(time_values), ...) {
as.numeric(NextMethod(), units = "days")
}

#' @export
guess_period.POSIXt <- function(time_values, time_values_arg = rlang::caller_arg(time_values), ...) {
as.numeric(NextMethod(), units = "secs")
}
30 changes: 17 additions & 13 deletions man/guess_period.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

49 changes: 49 additions & 0 deletions tests/testthat/test-utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -231,3 +231,52 @@ test_that("as_slide_computation raises errors as expected", {
class = "epiprocess__as_slide_computation__cant_convert_catchall"
)
})

test_that("guess_period works", {
# Error cases:
expect_error(guess_period(numeric(0L)), class = "epiprocess__guess_period__not_enough_times")
expect_error(guess_period(c(1)), class = "epiprocess__guess_period__not_enough_times")
# Different numeric classes and cases:
expect_identical(guess_period(c(1, 8)), 7)
expect_identical(guess_period(c(1, 8, 15)), 7)
expect_identical(guess_period(c(1L, 8L, 15L)), 7L)
expect_identical(guess_period(c(0, 7, 14, 15)), 1)
# We currently allow the guessed frequency to not appear in the diffs, but
# this might not be a good idea as it likely indicates an issue with the data
# (#485).
expect_identical(guess_period(c(0, 2, 5)), 1)
expect_identical(guess_period(c(0, 4, 10)), 2)
# On Dates:
daily_dates <- seq(as.Date("2020-01-01"), as.Date("2020-01-15"), by = "day")
weekly_dates <- seq(as.Date("2020-01-01"), as.Date("2020-01-15"), by = "week")
expect_identical(
daily_dates[[1L]] + guess_period(daily_dates) * (seq_along(daily_dates) - 1L),
daily_dates
)
expect_identical(
weekly_dates[[1L]] + guess_period(weekly_dates) * (seq_along(weekly_dates) - 1L),
weekly_dates
)
# On POSIXcts:
daily_posixcts <- as.POSIXct(daily_dates, tz = "ET") + 3600
weekly_posixcts <- as.POSIXct(weekly_dates, tz = "ET") + 3600
expect_identical(
daily_posixcts[[1L]] + guess_period(daily_posixcts) * (seq_along(daily_posixcts) - 1L),
daily_posixcts
)
expect_identical(
weekly_posixcts[[1L]] + guess_period(weekly_posixcts) * (seq_along(weekly_posixcts) - 1L),
weekly_posixcts
)
# On POSIXlts:
daily_posixlts <- as.POSIXlt(daily_dates, tz = "ET") + 3600
weekly_posixlts <- as.POSIXlt(weekly_dates, tz = "ET") + 3600
expect_identical(
daily_posixlts[[1L]] + guess_period(daily_posixlts) * (seq_along(daily_posixlts) - 1L),
daily_posixlts
)
expect_identical(
weekly_posixlts[[1L]] + guess_period(weekly_posixlts) * (seq_along(weekly_posixlts) - 1L),
weekly_posixlts
)
})

0 comments on commit a2c5154

Please sign in to comment.