From 7a50d9d43c4cf8feaf9f8fdf445de94de7317a7a Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Thu, 20 Jun 2024 18:47:12 -0700 Subject: [PATCH] refactor(time_types): refactor time types * guess time_type of "day", "week", "yearmonth", "integer" from time column and warn if not compatible * restrict `before` and `after` to types compatible with time column * deprecate geo_type and time_type constructor arguments, infer only * improve documentation on geo and time types * enforce time_value and version being same type in epi_archive * update vignettes * move arg validation from new_epi_df to as_epi_df.tbl_df to match epi_archive a --- NAMESPACE | 1 + NEWS.md | 19 +- R/archive.R | 114 ++- R/epi_df.R | 250 +++--- R/epiprocess.R | 1 + R/grouped_epi_archive.R | 31 +- R/methods-epi_archive.R | 40 +- R/methods-epi_df.R | 18 +- R/slide.R | 240 ++---- R/utils.R | 102 ++- man-roxygen/basic-slide-details.R | 34 + man-roxygen/basic-slide-params.R | 41 +- man-roxygen/epi_df-params.R | 12 +- man-roxygen/opt-slide-details.R | 31 +- man/as_epi_df.Rd | 148 ---- man/epi_archive.Rd | 88 +- man/epi_df.Rd | 178 +++- man/epi_slide.Rd | 79 +- man/epi_slide_mean.Rd | 75 +- man/epi_slide_opt.Rd | 75 +- man/epi_slide_sum.Rd | 75 +- man/epix_as_of.Rd | 2 +- man/epix_slide.Rd | 21 +- man/new_epi_df.Rd | 45 - tests/testthat/test-archive-version-bounds.R | 2 +- tests/testthat/test-archive.R | 63 +- tests/testthat/test-autoplot.R | 16 +- tests/testthat/test-compactify.R | 7 +- tests/testthat/test-correlation.R | 2 +- tests/testthat/test-epi_df.R | 22 +- tests/testthat/test-epi_slide.R | 788 +++++++----------- .../testthat/test-epix_fill_through_version.R | 45 +- tests/testthat/test-epix_merge.R | 130 ++- tests/testthat/test-epix_slide.R | 200 ++--- tests/testthat/test-methods-epi_df.R | 4 +- tests/testthat/test-utils.R | 56 +- vignettes/aggregation.Rmd | 33 - vignettes/archive.Rmd | 5 +- vignettes/epiprocess.Rmd | 30 +- 39 files changed, 1326 insertions(+), 1797 deletions(-) create mode 100644 man-roxygen/basic-slide-details.R delete mode 100644 man/as_epi_df.Rd delete mode 100644 man/new_epi_df.Rd diff --git a/NAMESPACE b/NAMESPACE index e571858f..f8610226 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -102,6 +102,7 @@ importFrom(checkmate,check_atomic) importFrom(checkmate,check_data_frame) importFrom(checkmate,check_names) importFrom(checkmate,expect_class) +importFrom(checkmate,test_int) importFrom(checkmate,test_set_equal) importFrom(checkmate,test_subset) importFrom(checkmate,vname) diff --git a/NEWS.md b/NEWS.md index 3d7ea718..e186b8fe 100644 --- a/NEWS.md +++ b/NEWS.md @@ -5,9 +5,10 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat # epiprocess 0.8 ## Breaking changes + - `detect_outlr_stl(seasonal_period = NULL)` is no longer accepted. Use `detect_outlr_stl(seasonal_period = , seasonal_as_residual = TRUE)` - instead. See `?detect_outlr_stl` for more details. + instead. See `?detect_outlr_stl` for more details. ## Improvements @@ -49,15 +50,23 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat output a huge number of `ref_time_values` spaced apart by mere seconds. ## Cleanup -- Resolved some linting messages in package checks (#468). -## Cleanup +- Resolved some linting messages in package checks (#468). - Added optional `decay_to_tibble` attribute controlling `as_tibble()` behavior of `epi_df`s to let `{epipredict}` work more easily with other libraries (#471). - -## Cleanup - Removed some external package dependencies. +## Breaking Changes + +- `epi_df`'s are now more strict about what types they allow in the time column. + Namely, we are explicit about only supporting `Date` at the daily and weekly + cadence and generic integer types (for yearly cadence). +- `epi_slide` `before` and `after` arguments are now require the user to + specific time units in certain cases. The `time_step` argument has been + removed. +- `epix_slide` `before` argument now defaults to `Inf`, and requires the user to + specify units in some cases. The `time_step` argument has been removed. + # epiprocess 0.7.0 ## Breaking changes: diff --git a/R/archive.R b/R/archive.R index 780279b0..052f2776 100644 --- a/R/archive.R +++ b/R/archive.R @@ -170,11 +170,8 @@ NULL #' The data table `DT` has key variables `geo_value`, `time_value`, `version`, #' as well as any others (these can be specified when instantiating the #' `epi_archive` object via the `other_keys` argument, and/or set by operating -#' on `DT` directly). Refer to the documentation for `as_epi_archive()` for -#' information and examples of relevant parameter names for an `epi_archive` -#' object. Note that there can only be a single row per unique combination of -#' key variables, and thus the key variables are critical for figuring out how -#' to generate a snapshot of data from the archive, as of a given version. +#' on `DT` directly). Note that there can only be a single row per unique +#' combination of key variables. #' #' @section Metadata: #' The following pieces of metadata are included as fields in an `epi_archive` @@ -184,18 +181,15 @@ NULL #' * `time_type`: the type for the time values. #' * `additional_metadata`: list of additional metadata for the data archive. #' -#' Unlike an `epi_df` object, metadata for an `epi_archive` object `x` can be -#' accessed (and altered) directly, as in `x$geo_type` or `x$time_type`, -#' etc. Like an `epi_df` object, the `geo_type` and `time_type` fields in the -#' metadata of an `epi_archive` object are not currently used by any -#' downstream functions in the `epiprocess` package, and serve only as useful -#' bits of information to convey about the data set at hand. +#' While this metadata is not protected, it is generally recommended to treat it +#' as read-only, and to use the `epi_archive` methods to interact with the data +#' archive. Unexpected behavior may result from modifying the metadata +#' directly. #' #' @section Generating Snapshots: #' An `epi_archive` object can be used to generate a snapshot of the data in -#' `epi_df` format, which represents the most up-to-date values of the signal -#' variables, as of the specified version. This is accomplished by calling -#' `epix_as_of()`. +#' `epi_df` format, which represents the most up-to-date time series values up +#' to a point in time. This is accomplished by calling `epix_as_of()`. #' #' @section Sliding Computations: #' We can run a sliding computation over an `epi_archive` object, much like @@ -208,19 +202,18 @@ NULL #' #' @param x A data.frame, data.table, or tibble, with columns `geo_value`, #' `time_value`, `version`, and then any additional number of columns. -#' @param geo_type Type for the geo values. If missing, then the function will -#' attempt to infer it from the geo values present; if this fails, then it -#' will be set to "custom". -#' @param time_type Type for the time values. If missing, then the function will -#' attempt to infer it from the time values present; if this fails, then it -#' will be set to "custom". +#' @param geo_type DEPRECATED Has no effect. Geo value type is inferred from the +#' location column and set to "custom" if not recognized. +#' @param time_type DEPRECATED Has no effect. Time value type inferred from the time +#' column and set to "custom" if not recognized. Unpredictable behavior may result +#' if the time type is not recognized. #' @param other_keys Character vector specifying the names of variables in `x` #' that should be considered key variables (in the language of `data.table`) #' apart from "geo_value", "time_value", and "version". #' @param additional_metadata List of additional metadata to attach to the -#' `epi_archive` object. The metadata will have `geo_type` and `time_type` -#' fields; named entries from the passed list or will be included as well. -#' @param compactify Optional; Boolean or `NULL`. `TRUE` will remove some +#' `epi_archive` object. The metadata will have the `geo_type` field; named +#' entries from the passed list or will be included as well. +#' @param compactify Optional; Boolean. `TRUE` will remove some #' redundant rows, `FALSE` will not, and missing or `NULL` will remove #' redundant rows, but issue a warning. See more information at `compactify`. #' @param clobberable_versions_start Optional; `length`-1; either a value of the @@ -269,10 +262,7 @@ NULL #' value = rnorm(10, mean = 2, sd = 1) #' ) #' -#' toy_epi_archive <- tib %>% as_epi_archive( -#' geo_type = "state", -#' time_type = "day" -#' ) +#' toy_epi_archive <- tib %>% as_epi_archive() #' toy_epi_archive #' #' # Ex. with an additional key for county @@ -295,21 +285,17 @@ NULL #' cases_rate = c(0.01, 0.02, 0.01, 0.05) #' ) #' -#' x <- df %>% as_epi_archive( -#' geo_type = "state", -#' time_type = "day", -#' other_keys = "county" -#' ) +#' x <- df %>% as_epi_archive(other_keys = "county") #' new_epi_archive <- function( x, - geo_type = NULL, - time_type = NULL, - other_keys = NULL, - additional_metadata = NULL, - compactify = NULL, - clobberable_versions_start = NULL, - versions_end = NULL) { + geo_type, + time_type, + other_keys, + additional_metadata, + compactify, + clobberable_versions_start, + versions_end) { # Create the data table; if x was an un-keyed data.table itself, # then the call to as.data.table() will fail to set keys, so we # need to check this, then do it manually if needed @@ -398,13 +384,11 @@ new_epi_archive <- function( #' @export validate_epi_archive <- function( x, - geo_type = NULL, - time_type = NULL, - other_keys = NULL, - additional_metadata = NULL, - compactify = NULL, - clobberable_versions_start = NULL, - versions_end = NULL) { + other_keys, + additional_metadata, + compactify, + clobberable_versions_start, + versions_end) { # Finish off with small checks on keys variables and metadata if (!test_subset(other_keys, names(x))) { cli_abort("`other_keys` must be contained in the column names of `x`.") @@ -413,12 +397,20 @@ validate_epi_archive <- function( cli_abort("`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\".") } if (any(names(additional_metadata) %in% c("geo_type", "time_type"))) { - cli_warn("`additional_metadata` names overlap with existing metadata fields \"geo_type\", \"time_type\".") + cli_warn("`additional_metadata` names overlap with existing metadata fields \"geo_type\" or \"time_type\".") } # Conduct checks and apply defaults for `compactify` assert_logical(compactify, len = 1, any.missing = FALSE, null.ok = TRUE) + # Make sure `time_value` and `version` have the same time type + if (!identical(class(x[["time_value"]]), class(x[["version"]]))) { + cli_abort( + "`time_value` and `version` must have the same class.", + class = "epiprocess__time_value_version_mismatch" + ) + } + # Apply defaults and conduct checks for # `clobberable_versions_start`, `versions_end`: validate_version_bound(clobberable_versions_start, x, na_ok = TRUE) @@ -453,13 +445,13 @@ validate_epi_archive <- function( #' @export as_epi_archive <- function( x, - geo_type = NULL, - time_type = NULL, - other_keys = NULL, - additional_metadata = NULL, + geo_type = deprecated(), + time_type = deprecated(), + other_keys = character(0L), + additional_metadata = list(), compactify = NULL, - clobberable_versions_start = NULL, - .versions_end = NULL, ..., + clobberable_versions_start = NA, + .versions_end = max_version_with_row_in(x), ..., versions_end = .versions_end) { assert_data_frame(x) x <- rename(x, ...) @@ -477,16 +469,18 @@ as_epi_archive <- function( if (anyMissing(x$version)) { cli_abort("Column `version` must not contain missing values.") } + if (lifecycle::is_present(geo_type)) { + cli_warn("epi_archive constructor argument `geo_type` is now ignored. Consider removing.") + } + if (lifecycle::is_present(time_type)) { + cli_warn("epi_archive constructor argument `time_type` is now ignored. Consider removing.") + } - geo_type <- geo_type %||% guess_geo_type(x$geo_value) - time_type <- time_type %||% guess_time_type(x$time_value) - other_keys <- other_keys %||% character(0L) - additional_metadata <- additional_metadata %||% list() - clobberable_versions_start <- clobberable_versions_start %||% NA - versions_end <- versions_end %||% max_version_with_row_in(x) + geo_type <- guess_geo_type(x$geo_value) + time_type <- guess_time_type(x$time_value) validate_epi_archive( - x, geo_type, time_type, other_keys, additional_metadata, + x, other_keys, additional_metadata, compactify, clobberable_versions_start, versions_end ) new_epi_archive( diff --git a/R/epi_df.R b/R/epi_df.R index 707944f6..37f26b87 100644 --- a/R/epi_df.R +++ b/R/epi_df.R @@ -1,30 +1,39 @@ -#' @title `epi_df` object +#' `epi_df` object #' -#' @description An `epi_df` is a tibble with certain minimal column structure -#' and metadata. It can be seen as a snapshot of a data set that contains the -#' most up-to-date values of some signal variables of interest, as of a given -#' time. +#' An `epi_df` is a tibble with certain minimal column structure and metadata. +#' It can be seen as a snapshot of a data set that contains the most +#' up-to-date values of some signal variables of interest, as of a given time. #' #' @details An `epi_df` is a tibble with (at least) the following columns: #' -#' * `geo_value`: the geographic value associated with each row of measurements. -#' * `time_value`: the time value associated with each row of measurements. +#' - `geo_value`: A character vector representing the geographical unit of +#' observation. This could be a country code, a state name, a county code, +#' etc. +#' - `time_value`: A date or integer vector representing the time of observation. #' #' Other columns can be considered as measured variables, which we also refer to #' as signal variables. An `epi_df` object also has metadata with (at least) #' the following fields: #' #' * `geo_type`: the type for the geo values. -#' * `time_type`: the type for the time values. #' * `as_of`: the time value at which the given data were available. #' +#' Most users should use `as_epi_df`. The input tibble `x` to the constructor +#' must contain the columns `geo_value` and `time_value`. All other columns +#' will be preserved as is, and treated as measured variables. If `as_of` is +#' missing, then the function will try to guess it from an `as_of`, `issue`, +#' or `version` column of `x` (if any of these are present), or from as an +#' `as_of` field in its metadata (stored in its attributes); if this fails, +#' then the current day-time will be used. The `new_epi_df` constructor +#' assumes its arguments have already been validated, so it should mainly be +#' used by advanced users. +#' #' Metadata for an `epi_df` object `x` can be accessed (and altered) via -#' `attributes(x)$metadata`. The first two fields in the above list, -#' `geo_type` and `time_type`, can usually be inferred from the `geo_value` -#' and `time_value` columns, respectively. They are not currently used by any -#' downstream functions in the `epiprocess` package, and serve only as useful -#' bits of information to convey about the data set at hand. More information -#' on their coding is given below. +#' `attributes(x)$metadata`. The first field in the above list, `geo_type`, +#' can usually be inferred from the `geo_value` columns. They are not +#' currently used by any downstream functions in the `epiprocess` package, +#' and serve only as useful bits of information to convey about the data set +#' at hand. More information on their coding is given below. #' #' The last field in the above list, `as_of`, is one of the most unique aspects #' of an `epi_df` object. In brief, we can think of an `epi_df` object as a @@ -61,109 +70,19 @@ #' @section Time Types: #' The following time types are recognized in an `epi_df`. #' -#' * `"day-time"`: each observation corresponds to a time on a given day -#' (measured to the second); coded as a `POSIXct` object, as in -#' `as.POSIXct("2022-01-31 18:45:40")`. #' * `"day"`: each observation corresponds to a day; coded as a `Date` object, #' as in `as.Date("2022-01-31")`. #' * `"week"`: each observation corresponds to a week; the alignment can be #' arbitrary (as to whether a week starts on a Monday, Tuesday); coded as a #' `Date` object, representing the start date of week. -#' * `"yearweek"`: each observation corresponds to a week; the alignment can be -#' arbitrary; coded as a `tsibble::yearweek` object, where the alignment is -#' stored in the `week_start` field of its attributes. #' * `"yearmonth"`: each observation corresponds to a month; coded as a #' `tsibble::yearmonth` object. -#' * `"yearquarter"`: each observation corresponds to a quarter; coded as a -#' `tsibble::yearquarter` object. -#' * `"year"`: each observation corresponds to a year; coded as an integer -#' greater than or equal to 1582. +#' * `"integer"`: a generic integer index (e.g. years or something else). #' #' An unrecognizable time type is labeled "custom". #' -#' @name epi_df -NULL - - -#' Creates an `epi_df` object -#' -#' Creates a new `epi_df` object. By default, builds an empty tibble with the -#' correct metadata for an `epi_df` object (ie. `geo_type`, `time_type`, and `as_of`). -#' Refer to the below info. about the arguments for more details. -#' -#' @template epi_df-params -#' -#' @export -new_epi_df <- function(x = tibble::tibble(), geo_type, time_type, as_of, - additional_metadata = list()) { - assert_data_frame(x) - assert_list(additional_metadata) - - additional_metadata[["other_keys"]] <- additional_metadata[["other_keys"]] %||% character(0L) - - # If geo type is missing, then try to guess it - if (missing(geo_type)) { - geo_type <- guess_geo_type(x$geo_value) - } - - # If time type is missing, then try to guess it - if (missing(time_type)) { - time_type <- guess_time_type(x$time_value) - } - - # If as_of is missing, then try to guess it - if (missing(as_of)) { - # First check the metadata for an as_of field - if ( - "metadata" %in% names(attributes(x)) && - "as_of" %in% names(attributes(x)$metadata) - ) { - as_of <- attributes(x)$metadata$as_of - } else if ("as_of" %in% names(x)) { - # Next check for as_of, issue, or version columns - as_of <- max(x$as_of) - } else if ("issue" %in% names(x)) { - as_of <- max(x$issue) - } else if ("version" %in% names(x)) { - as_of <- max(x$version) - } else { - # If we got here then we failed - as_of <- Sys.time() - } # Use the current day-time - } - - # Define metadata fields - metadata <- list() - metadata$geo_type <- geo_type - metadata$time_type <- time_type - metadata$as_of <- as_of - metadata <- c(metadata, additional_metadata) - - # Reorder columns (geo_value, time_value, ...) - if (sum(dim(x)) != 0) { - cols_to_put_first <- c("geo_value", "time_value") - x <- x[, c( - cols_to_put_first, - # All other columns - names(x)[!(names(x) %in% cols_to_put_first)] - )] - } - - # Apply epi_df class, attach metadata, and return - class(x) <- c("epi_df", class(x)) - attributes(x)$metadata <- metadata - return(x) -} - -#' Convert to `epi_df` format -#' -#' Converts a data frame or tibble into an `epi_df` object. See the [getting -#' started -#' guide](https://cmu-delphi.github.io/epiprocess/articles/epiprocess.html) for -#' examples. -#' -#' @param ... Additional arguments passed to methods. #' @template epi_df-params +#' @rdname epi_df #' #' @export #' @examples @@ -186,11 +105,10 @@ new_epi_df <- function(x = tibble::tibble(), geo_type, time_type, as_of, #' #' # The `other_keys` metadata (`"county_code"` in this case) is automatically #' # inferred from the `tsibble`'s `key`: -#' ex1 <- as_epi_df(x = ex1_input, geo_type = "state", time_type = "day", as_of = "2020-06-03") +#' ex1 <- as_epi_df(x = ex1_input, as_of = "2020-06-03") #' attr(ex1, "metadata")[["other_keys"]] #' #' -#' #' # Dealing with misspecified column names: #' # Geographical and temporal information must be provided in columns named #' # `geo_value` and `time_value`; if we start from a data frame with a @@ -211,14 +129,13 @@ new_epi_df <- function(x = tibble::tibble(), geo_type, time_type, as_of, #' ex2 <- ex2_input %>% #' dplyr::rename(geo_value = state, time_value = reported_date) %>% #' as_epi_df( -#' geo_type = "state", as_of = "2020-06-03", +#' as_of = "2020-06-03", #' additional_metadata = list(other_keys = "pol") #' ) #' #' attr(ex2, "metadata") #' #' -#' #' # Adding additional keys to an `epi_df` object #' #' ex3_input <- jhu_csse_county_level_subset %>% @@ -237,36 +154,57 @@ new_epi_df <- function(x = tibble::tibble(), geo_type, time_type, as_of, #' as_epi_df(additional_metadata = list(other_keys = c("state", "pol"))) #' #' attr(ex3, "metadata") +new_epi_df <- function(x = tibble::tibble(), geo_type, time_type, as_of, + additional_metadata = list()) { + # Define metadata fields + metadata <- list() + metadata$geo_type <- geo_type + metadata$time_type <- time_type + metadata$as_of <- as_of + metadata <- c(metadata, additional_metadata) + + # Reorder columns (geo_value, time_value, ...) + if (sum(dim(x)) != 0) { + cols_to_put_first <- c("geo_value", "time_value") + x <- x[, c( + cols_to_put_first, + # All other columns + names(x)[!(names(x) %in% cols_to_put_first)] + )] + } + + # Apply epi_df class, attach metadata, and return + class(x) <- c("epi_df", class(x)) + attributes(x)$metadata <- metadata + return(x) +} + +#' @rdname epi_df +#' @export as_epi_df <- function(x, ...) { UseMethod("as_epi_df") } #' @method as_epi_df epi_df -#' @describeIn as_epi_df Simply returns the `epi_df` object unchanged. +#' @rdname epi_df #' @export as_epi_df.epi_df <- function(x, ...) { return(x) } #' @method as_epi_df tbl_df -#' @describeIn as_epi_df The input tibble `x` must contain the columns -#' `geo_value` and `time_value`, or column names that uniquely map onto these -#' (e.g. `date` or `province`). Alternatively, you can specify the conversion -#' explicitly (`time_value = someWeirdColumnName`). All other columns not -#' specified as `other_keys` will be preserved as is, and treated as measured -#' variables. -#' -#' If `as_of` is missing, then the function will try to guess it from an -#' `as_of`, `issue`, or `version` column of `x` (if any of these are present), -#' or from as an `as_of` field in its metadata (stored in its attributes); if -#' this fails, then the current day-time will be used. +#' @rdname epi_df #' @importFrom rlang .data #' @importFrom tidyselect any_of #' @importFrom cli cli_inform #' @export -as_epi_df.tbl_df <- function(x, geo_type, time_type, as_of, - additional_metadata = list(), - ...) { +as_epi_df.tbl_df <- function( + x, + geo_type = deprecated(), + time_type = deprecated(), + as_of, + additional_metadata = list(), + ...) { # possible standard substitutions for time_value x <- rename(x, ...) x <- guess_column_name(x, "time_value", time_column_names()) @@ -280,41 +218,61 @@ as_epi_df.tbl_df <- function(x, geo_type, time_type, as_of, ) } - new_epi_df( - x, geo_type, time_type, as_of, - additional_metadata - ) + if (lifecycle::is_present(geo_type)) { + cli_warn("epi_archive constructor argument `geo_type` is now ignored. Consider removing.") + } + if (lifecycle::is_present(time_type)) { + cli_warn("epi_archive constructor argument `time_type` is now ignored. Consider removing.") + } + + # If geo type is missing, then try to guess it + geo_type <- guess_geo_type(x$geo_value) + time_type <- guess_time_type(x$time_value) + + # If as_of is missing, then try to guess it + if (missing(as_of)) { + # First check the metadata for an as_of field + if ( + "metadata" %in% names(attributes(x)) && + "as_of" %in% names(attributes(x)$metadata) + ) { + as_of <- attributes(x)$metadata$as_of + } else if ("as_of" %in% names(x)) { + # Next check for as_of, issue, or version columns + as_of <- max(x$as_of) + } else if ("issue" %in% names(x)) { + as_of <- max(x$issue) + } else if ("version" %in% names(x)) { + as_of <- max(x$version) + } else { + # If we got here then we failed + as_of <- Sys.time() + } # Use the current day-time + } + + assert_list(additional_metadata) + additional_metadata[["other_keys"]] <- additional_metadata[["other_keys"]] %||% character(0L) + new_epi_df(x, geo_type, time_type, as_of, additional_metadata) } #' @method as_epi_df data.frame -#' @describeIn as_epi_df Works analogously to `as_epi_df.tbl_df()`. +#' @rdname epi_df #' @export -as_epi_df.data.frame <- function(x, geo_type, time_type, as_of, - additional_metadata = list(), ...) { - as_epi_df.tbl_df( - tibble::as_tibble(x), geo_type, time_type, as_of, - additional_metadata, ... - ) +as_epi_df.data.frame <- function(x, as_of, additional_metadata = list(), ...) { + as_epi_df.tbl_df(x = tibble::as_tibble(x), as_of = as_of, additional_metadata = additional_metadata, ...) } #' @method as_epi_df tbl_ts -#' @describeIn as_epi_df Works analogously to `as_epi_df.tbl_df()`, except that -#' the `tbl_ts` class is dropped, and any key variables (other than -#' "geo_value") are added to the metadata of the returned object, under the -#' `other_keys` field. +#' @rdname epi_df #' @export -as_epi_df.tbl_ts <- function(x, geo_type, time_type, as_of, - additional_metadata = list(), ...) { +as_epi_df.tbl_ts <- function(x, as_of, additional_metadata = list(), ...) { tsibble_other_keys <- setdiff(tsibble::key_vars(x), "geo_value") if (length(tsibble_other_keys) != 0) { additional_metadata$other_keys <- unique( c(additional_metadata$other_keys, tsibble_other_keys) ) } - as_epi_df.tbl_df( - tibble::as_tibble(x), geo_type, time_type, as_of, - additional_metadata, ... - ) + as_epi_df.tbl_df(x = tibble::as_tibble(x), as_of = as_of, additional_metadata = additional_metadata, ...) } #' Test for `epi_df` format diff --git a/R/epiprocess.R b/R/epiprocess.R index 40c3ce8a..ba072a2d 100644 --- a/R/epiprocess.R +++ b/R/epiprocess.R @@ -8,6 +8,7 @@ #' assert_logical assert_list assert_character assert_class #' assert_int assert_numeric check_data_frame vname check_atomic #' anyInfinite test_subset test_set_equal checkInt expect_class +#' test_int #' @importFrom cli cli_abort cli_warn #' @importFrom rlang %||% #' @name epiprocess diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R index c6326751..d0418eea 100644 --- a/R/grouped_epi_archive.R +++ b/R/grouped_epi_archive.R @@ -204,10 +204,16 @@ ungroup.grouped_epi_archive <- function(x, ...) { #' @importFrom rlang !! !!! enquo quo_is_missing enquos is_quosure sym syms #' env missing_arg #' @export -epix_slide.grouped_epi_archive <- function(x, f, ..., before, ref_time_values, - time_step, new_col_name = "slide_value", - as_list_col = FALSE, names_sep = "_", - all_versions = FALSE) { +epix_slide.grouped_epi_archive <- function( + x, + f, + ..., + before = Inf, + ref_time_values = NULL, + new_col_name = "slide_value", + as_list_col = FALSE, + names_sep = "_", + all_versions = FALSE) { # Perform some deprecated argument checks without using ` = # deprecated()` in the function signature, because they are from # early development versions and much more likely to be clutter than @@ -231,7 +237,7 @@ epix_slide.grouped_epi_archive <- function(x, f, ..., before, ref_time_values, ", class = "epiprocess__epix_slide_all_rows_parameter_deprecated") } - if (missing(ref_time_values)) { + if (is.null(ref_time_values)) { ref_time_values <- epix_slide_ref_time_values_default(x$private$ungrouped) } else { assert_numeric(ref_time_values, min.len = 1L, null.ok = FALSE, any.missing = FALSE) @@ -246,20 +252,7 @@ epix_slide.grouped_epi_archive <- function(x, f, ..., before, ref_time_values, ref_time_values <- sort(ref_time_values) } - # Validate and pre-process `before`: - if (missing(before)) { - cli_abort("`before` is required (and must be passed by name); - if you did not want to apply a sliding window but rather - to map `epix_as_of` and `f` across various `ref_time_values`, - pass a large `before` value (e.g., if time steps are days, - `before=365000`).") - } - before <- vctrs::vec_cast(before, integer()) - assert_int(before, lower = 0L, null.ok = FALSE, na.ok = FALSE) - - # If a custom time step is specified, then redefine units - - if (!missing(time_step)) before <- time_step(before) + validate_slide_window_arg(before, x$private$ungrouped$time_type) # Symbolize column name new_col <- sym(new_col_name) diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index 891cc064..8363fa2e 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -50,7 +50,7 @@ #' # (a.k.a. "hotfixed", "clobbered", etc.): #' clobberable_versions_start = max(archive_cases_dv_subset$DT$version), #' # Suppose today is the following day, and there are no updates out yet: -#' versions_end <- max(archive_cases_dv_subset$DT$version) + 1L, +#' versions_end = max(archive_cases_dv_subset$DT$version) + 1L, #' compactify = TRUE #' ) #' @@ -111,8 +111,6 @@ epix_as_of <- function(x, max_version, min_time_value = -Inf, all_versions = FAL tibble::as_tibble() %>% dplyr::select(-"version") %>% as_epi_df( - geo_type = x$geo_type, - time_type = x$time_type, as_of = max_version, additional_metadata = c( x$additional_metadata, @@ -271,7 +269,7 @@ epix_merge <- function(x, y, } if (!identical(x$time_type, y$time_type)) { - cli_abort("`x` and `y` must have the same `$time_type`") + cli_abort("`x` and `y` must share data type on their `time_value` column.") } if (length(x$additional_metadata) != 0L) { @@ -307,11 +305,11 @@ epix_merge <- function(x, y, y_dt <- y$DT } } else if (sync %in% c("na", "locf")) { - new_versions_end <- max(x$versions_end, y$versions_end) + new_versions_end <- max(c(x$versions_end, y$versions_end)) x_dt <- epix_fill_through_version(x, new_versions_end, sync)$DT y_dt <- epix_fill_through_version(y, new_versions_end, sync)$DT } else if (sync == "truncate") { - new_versions_end <- min(x$versions_end, y$versions_end) + new_versions_end <- min(c(x$versions_end, y$versions_end)) x_dt <- x$DT[x[["DT"]][["version"]] <= new_versions_end, names(x$DT), with = FALSE] y_dt <- y$DT[y[["DT"]][["version"]] <= new_versions_end, names(y$DT), with = FALSE] } else { @@ -450,8 +448,6 @@ epix_merge <- function(x, y, return(as_epi_archive( result_dt[], # clear data.table internal invisibility flag if set - geo_type = x$geo_type, - time_type = x$time_type, other_keys = setdiff(key(result_dt), c("geo_value", "time_value", "version")), additional_metadata = result_additional_metadata, # It'd probably be better to pre-compactify before the merge, and might be @@ -610,12 +606,6 @@ epix_detailed_restricted_mutate <- function(.data, ...) { #' set to a regularly-spaced sequence of values set to cover the range of #' `version`s in the `DT` plus the `versions_end`; the spacing of values will #' be guessed (using the GCD of the skips between values). -#' @param time_step Optional function used to define the meaning of one time -#' step, which if specified, overrides the default choice based on the -#' `time_value` column. This function must take a positive integer and return -#' an object of class `lubridate::period`. For example, we can use `time_step -#' = lubridate::hours` in order to set the time step to be one hour (this -#' would only be meaningful if `time_value` is of class `POSIXct`). #' @param new_col_name String indicating the name of the new column that will #' contain the derivative values. Default is "slide_value"; note that setting #' `new_col_name` equal to an existing column name will overwrite this column. @@ -790,9 +780,8 @@ epix_slide <- function( x, f, ..., - before, - ref_time_values, - time_step, + before = Inf, + ref_time_values = NULL, new_col_name = "slide_value", as_list_col = FALSE, names_sep = "_", @@ -803,10 +792,16 @@ epix_slide <- function( #' @rdname epix_slide #' @export -epix_slide.epi_archive <- function(x, f, ..., before, ref_time_values, - time_step, new_col_name = "slide_value", - as_list_col = FALSE, names_sep = "_", - all_versions = FALSE) { +epix_slide.epi_archive <- function( + x, + f, + ..., + before = Inf, + ref_time_values = NULL, + new_col_name = "slide_value", + as_list_col = FALSE, + names_sep = "_", + all_versions = FALSE) { # For an "ungrouped" slide, treat all rows as belonging to one big # group (group by 0 vars), like `dplyr::summarize`, and let the # resulting `grouped_epi_archive` handle the slide: @@ -814,8 +809,7 @@ epix_slide.epi_archive <- function(x, f, ..., before, ref_time_values, group_by(x), f, ..., - before = before, ref_time_values = ref_time_values, - time_step = time_step, new_col_name = new_col_name, + before = before, ref_time_values = ref_time_values, new_col_name = new_col_name, as_list_col = as_list_col, names_sep = names_sep, all_versions = all_versions ) %>% diff --git a/R/methods-epi_df.R b/R/methods-epi_df.R index cc532021..daccabd8 100644 --- a/R/methods-epi_df.R +++ b/R/methods-epi_df.R @@ -83,7 +83,6 @@ print.epi_df <- function(x, ...) { summary.epi_df <- function(object, ...) { cat("An `epi_df` x, with metadata:\n") cat(sprintf("* %-9s = %s\n", "geo_type", attributes(object)$metadata$geo_type)) - cat(sprintf("* %-9s = %s\n", "time_type", attributes(object)$metadata$time_type)) cat(sprintf("* %-9s = %s\n", "as_of", attributes(object)$metadata$as_of)) cat("----------\n") cat(sprintf("* %-27s = %s\n", "min time value", min(object$time_value))) @@ -118,15 +117,14 @@ decay_epi_df <- function(x) { } # Implementing `dplyr_extending`: we have a few metadata attributes to consider: -# `as_of` is an attribute doesn't depend on the rows or columns, `geo_type` and -# `time_type` are scalar attributes dependent on columns, and `other_keys` acts -# like an attribute vectorized over columns; `dplyr_extending` advice at time of -# writing says to implement `dplyr_reconstruct`, 1d `[`, `dplyr_col_modify`, and -# `names<-`, but not `dplyr_row_slice`; however, we'll also implement -# `dplyr_row_slice` anyway to prevent a `arrange` on grouped `epi_df`s from -# dropping the `epi_df` class. We'll implement `[` to allow either 1d or 2d. -# We'll also implement some other methods where we want to (try to) maintain an -# `epi_df`. +# `as_of` is an attribute doesn't depend on the rows or columns, `geo_type` is a +# scalar attribute dependent on columns, and `other_keys` acts like an attribute +# vectorized over columns; `dplyr_extending` advice at time of writing says to +# implement `dplyr_reconstruct`, 1d `[`, `dplyr_col_modify`, and `names<-`, but +# not `dplyr_row_slice`; however, we'll also implement `dplyr_row_slice` anyway +# to prevent a `arrange` on grouped `epi_df`s from dropping the `epi_df` class. +# We'll implement `[` to allow either 1d or 2d. We'll also implement some other +# methods where we want to (try to) maintain an `epi_df`. #' @param data tibble or `epi_df` (`dplyr` feeds in former, but we may #' directly feed in latter from our other methods) diff --git a/R/slide.R b/R/slide.R index 9d26174a..be8d895b 100644 --- a/R/slide.R +++ b/R/slide.R @@ -35,49 +35,7 @@ #' the names of the resulting columns are given by prepending `new_col_name` #' to the names of the list elements. #' -#' @details To "slide" means to apply a function or formula over a rolling -#' window of time steps for each data group, where the window is centered at a -#' reference time and left and right endpoints are given by the `before` and -#' `after` arguments. The unit (the meaning of one time step) is implicitly -#' defined by the way the `time_value` column treats addition and subtraction; -#' for example, if the time values are coded as `Date` objects, then one time -#' step is one day, since `as.Date("2022-01-01") + 1` equals -#' `as.Date("2022-01-02")`. Alternatively, the time step can be set explicitly -#' using the `time_step` argument (which if specified would override the -#' default choice based on `time_value` column). If there are not enough time -#' steps available to complete the window at any given reference time, then -#' `epi_slide()` still attempts to perform the computation anyway (it does not -#' require a complete window). The issue of what to do with partial -#' computations (those run on incomplete windows) is therefore left up to the -#' user, either through the specified function or formula `f`, or through -#' post-processing. For a centrally-aligned slide of `n` `time_value`s in a -#' sliding window, set `before = (n-1)/2` and `after = (n-1)/2` when the -#' number of `time_value`s in a sliding window is odd and `before = n/2-1` and -#' `after = n/2` when `n` is even. -#' -#' Sometimes, we want to experiment with various trailing or leading window -#' widths and compare the slide outputs. In the (uncommon) case where -#' zero-width windows are considered, manually pass both the `before` and -#' `after` arguments in order to prevent potential warnings. (E.g., `before=k` -#' with `k=0` and `after` missing may produce a warning. To avoid warnings, -#' use `before=k, after=0` instead; otherwise, it looks too much like a -#' leading window was intended, but the `after` argument was forgotten or -#' misspelled.) -#' -#' If `f` is missing, then an expression for tidy evaluation can be specified, -#' for example, as in: -#' ``` -#' epi_slide(x, cases_7dav = mean(cases), before = 6) -#' ``` -#' which would be equivalent to: -#' ``` -#' epi_slide(x, function(x, g) mean(x$cases), before = 6, -#' new_col_name = "cases_7dav") -#' ``` -#' Thus, to be clear, when the computation is specified via an expression for -#' tidy evaluation (first example, above), then the name for the new column is -#' inferred from the given expression and overrides any name passed explicitly -#' through the `new_col_name` argument. +#' @template basic-slide-details #' #' @importFrom lubridate days weeks #' @importFrom dplyr bind_rows group_vars filter select @@ -130,13 +88,16 @@ #' before = 1, as_list_col = TRUE #' ) %>% #' ungroup() -epi_slide <- function(x, f, ..., before, after, ref_time_values, - time_step, +epi_slide <- function(x, f, ..., before = NULL, after = NULL, ref_time_values = NULL, new_col_name = "slide_value", as_list_col = FALSE, names_sep = "_", all_rows = FALSE) { assert_class(x, "epi_df") - if (missing(ref_time_values)) { + if (nrow(x) == 0L) { + return(x) + } + + if (is.null(ref_time_values)) { ref_time_values <- unique(x$time_value) } else { assert_numeric(ref_time_values, min.len = 1L, null.ok = FALSE, any.missing = FALSE) @@ -151,40 +112,22 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, } ref_time_values <- sort(ref_time_values) - # Validate and pre-process `before`, `after`: - if (!missing(before)) { - before <- vctrs::vec_cast(before, integer()) - assert_int(before, lower = 0, null.ok = FALSE, na.ok = FALSE) - } - if (!missing(after)) { - after <- vctrs::vec_cast(after, integer()) - assert_int(after, lower = 0, null.ok = FALSE, na.ok = FALSE) - } - if (missing(before)) { - if (missing(after)) { - cli_abort("Either or both of `before`, `after` must be provided.") - } else if (after == 0L) { - cli_warn("`before` missing, `after==0`; maybe this was intended to be some - non-zero-width trailing window, but since `before` appears to be - missing, it's interpreted as a zero-width window (`before=0, - after=0`).") - } - before <- 0L - } else if (missing(after)) { - if (before == 0L) { - cli_warn("`before==0`, `after` missing; maybe this was intended to be some - non-zero-width leading window, but since `after` appears to be - missing, it's interpreted as a zero-width window (`before=0, - after=0`).") + if (is.null(before) && !is.null(after)) { + if (inherits(after, "difftime")) { + before <- as.difftime(0, units = units(after)) + } else { + before <- 0 } - after <- 0L } - - # If a custom time step is specified, then redefine units - if (!missing(time_step)) { - before <- time_step(before) - after <- time_step(after) + if (is.null(after) && !is.null(before)) { + if (inherits(before, "difftime")) { + after <- as.difftime(0, units = units(before)) + } else { + after <- 0 + } } + validate_slide_window_arg(before, attr(x, "metadata")$time_type) + validate_slide_window_arg(after, attr(x, "metadata")$time_type) # Arrange by increasing time_value x <- arrange(x, .data$time_value) @@ -434,8 +377,7 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' # Remove a nonessential var. to ensure new col is printed #' dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) %>% #' ungroup() -epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values, - time_step, +epi_slide_opt <- function(x, col_names, f, ..., before = NULL, after = NULL, ref_time_values = NULL, new_col_name = NULL, as_list_col = NULL, names_sep = NULL, all_rows = FALSE) { assert_class(x, "epi_df") @@ -504,7 +446,7 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values, ) } - user_provided_rtvs <- !missing(ref_time_values) + user_provided_rtvs <- !is.null(ref_time_values) if (!user_provided_rtvs) { ref_time_values <- unique(x$time_value) } else { @@ -520,37 +462,25 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values, } ref_time_values <- sort(ref_time_values) - # Validate and pre-process `before`, `after`: - if (!missing(before)) { - before <- vctrs::vec_cast(before, integer()) - assert_int(before, lower = 0, null.ok = FALSE, na.ok = FALSE) - } - if (!missing(after)) { - after <- vctrs::vec_cast(after, integer()) - assert_int(after, lower = 0, null.ok = FALSE, na.ok = FALSE) - } - if (missing(before)) { - if (missing(after)) { - cli_abort("Either or both of `before`, `after` must be provided.") - } else if (after == 0L) { - cli_warn("`before` missing, `after==0`; maybe this was intended to be some - non-zero-width trailing window, but since `before` appears to be - missing, it's interpreted as a zero-width window (`before=0, - after=0`).") + if (is.null(before) && !is.null(after)) { + if (inherits(after, "difftime")) { + before <- as.difftime(0, units = units(after)) + } else { + before <- 0 } - before <- 0L - } else if (missing(after)) { - if (before == 0L) { - cli_warn("`before==0`, `after` missing; maybe this was intended to be some - non-zero-width leading window, but since `after` appears to be - missing, it's interpreted as a zero-width window (`before=0, - after=0`).") + } + if (is.null(after) && !is.null(before)) { + if (inherits(before, "difftime")) { + after <- as.difftime(0, units = units(before)) + } else { + after <- 0 } - after <- 0L } + validate_slide_window_arg(before, attr(x, "metadata")$time_type) + validate_slide_window_arg(after, attr(x, "metadata")$time_type) # Make a complete date sequence between min(x$time_value) and max(x$time_value). - date_seq_list <- full_date_seq(x, before, after, time_step) + date_seq_list <- full_date_seq(x, before, after, attr(x, "metadata")$time_type) all_dates <- date_seq_list$all_dates pad_early_dates <- date_seq_list$pad_early_dates pad_late_dates <- date_seq_list$pad_late_dates @@ -632,7 +562,10 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values, } else if (f_from_package == "slider") { for (i in seq_along(col_names_chr)) { .data_group[, result_col_names[i]] <- f( - x = .data_group[[col_names_chr[i]]], before = before, after = after, ... + x = .data_group[[col_names_chr[i]]], + before = as.numeric(before), + after = as.numeric(after), + ... ) } } @@ -725,8 +658,7 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values, #' # Remove a nonessential var. to ensure new col is printed #' dplyr::select(geo_value, time_value, cases, cases_14dav = slide_value_cases) %>% #' ungroup() -epi_slide_mean <- function(x, col_names, ..., before, after, ref_time_values, - time_step, +epi_slide_mean <- function(x, col_names, ..., before = NULL, after = NULL, ref_time_values = NULL, new_col_name = NULL, as_list_col = NULL, names_sep = NULL, all_rows = FALSE) { epi_slide_opt( @@ -737,7 +669,6 @@ epi_slide_mean <- function(x, col_names, ..., before, after, ref_time_values, before = before, after = after, ref_time_values = ref_time_values, - time_step = time_step, new_col_name = new_col_name, as_list_col = as_list_col, names_sep = names_sep, @@ -772,8 +703,7 @@ epi_slide_mean <- function(x, col_names, ..., before, after, ref_time_values, #' # Remove a nonessential var. to ensure new col is printed #' dplyr::select(geo_value, time_value, cases, cases_7dsum = slide_value_cases) %>% #' ungroup() -epi_slide_sum <- function(x, col_names, ..., before, after, ref_time_values, - time_step, +epi_slide_sum <- function(x, col_names, ..., before = NULL, after = NULL, ref_time_values = NULL, new_col_name = NULL, as_list_col = NULL, names_sep = NULL, all_rows = FALSE) { epi_slide_opt( @@ -784,7 +714,6 @@ epi_slide_sum <- function(x, col_names, ..., before, after, ref_time_values, before = before, after = after, ref_time_values = ref_time_values, - time_step = time_step, new_col_name = new_col_name, as_list_col = as_list_col, names_sep = names_sep, @@ -796,24 +725,25 @@ epi_slide_sum <- function(x, col_names, ..., before, after, ref_time_values, #' (x$time_value). Produce lists of dates before min(x$time_value) and after #' max(x$time_value) for padding initial and final windows to size `n`. #' -#' `before` and `after` inputs here should be raw (numeric) values; -#' `time_step` function should NOT have been applied. `full_date_seq` applies -#' `time_step` as needed. +#' `before` and `after` args are assumed to have been validated by the calling +#' function (using `validate_slide_window_arg`). #' #' @importFrom checkmate assert_function #' @noRd -full_date_seq <- function(x, before, after, time_step) { +full_date_seq <- function(x, before, after, time_type) { + if (!time_type %in% c("day", "week", "yearmonth", "integer")) { + cli_abort( + "time_type must be one of 'day', 'week', or 'integer'." + ) + } + pad_early_dates <- c() pad_late_dates <- c() - # If dates are one of tsibble-provided classes, can step by numeric. `tsibble` - # defines a step of 1 to automatically be the quantum (smallest resolvable - # unit) of the date class. For example, one step = 1 quarter for `yearquarter`. - # - # `tsibble` classes apparently can't be added to in different units, so even - # if `time_step` is provided by the user, use a value-1 unitless step. - if (inherits(x$time_value, c("yearquarter", "yearweek", "yearmonth")) || - is.numeric(x$time_value)) { # nolint: indentation_linter + # `tsibble` time types have their own behavior, where adding 1 corresponds to + # incrementing by a quantum (smallest resolvable unit) of the date class. For + # example, one step = 1 quarter for `yearquarter`. + if (time_type %in% c("yearmonth", "integer")) { all_dates <- seq(min(x$time_value), max(x$time_value), by = 1L) if (before != 0) { @@ -822,70 +752,24 @@ full_date_seq <- function(x, before, after, time_step) { if (after != 0) { pad_late_dates <- all_dates[length(all_dates)] + 1:after } - } else if (missing(time_step)) { - # Guess what `by` should be based on the epi_df's `time_type`. - ttype <- attributes(x)$metadata$time_type - by <- switch(ttype, + } else { + by <- switch(time_type, day = "days", week = "weeks", - yearweek = "weeks", - yearmonth = "months", - yearquarter = "quarters", - year = "years", - NA # default value for "custom", "day-time" ) - if (is.na(by)) { - cli_abort( - c( - "`frollmean` requires a full window to compute a result, but the - `time_type` associated with the epi_df was not mappable to a period - type valid for creating a date sequence.", - "i" = c("The input data's `time_type` was probably `custom` or `day-time`. - These require also passing a `time_step` function.") - ), - class = "epiprocess__full_date_seq__unmappable_time_type", - epiprocess__time_type = ttype - ) - } - - # `seq.Date` `by` arg can be any of `c("days", "weeks", "months", "quarters", "years")`. all_dates <- seq(min(x$time_value), max(x$time_value), by = by) - if (before != 0) { - # Use `seq.Date` here to avoid having to map `epi_df` `time_type` to - # `time_step` functions. - # - # The first element `seq.Date` returns is always equal to the provided - # `from` date (`from + 0`). The full return value is equivalent to - # `from + 0:n`. In our case, we `from + 1:n`, so drop the first - # element. - # - # Adding "-1" to the `by` arg makes `seq.Date` go backwards in time. + # The behavior is analogous to the branch with tsibble types above. For + # more detail, note that the function `seq.Date(from, ..., length.out = + # n)` returns `from + 0:n`. Since we want `from + 1:n`, we drop the first + # element. Adding "-1" to the `by` arg makes `seq.Date` go backwards in + # time. pad_early_dates <- sort(seq(all_dates[1L], by = paste("-1", by), length.out = before + 1)[-1]) } if (after != 0) { pad_late_dates <- seq(all_dates[length(all_dates)], by = by, length.out = after + 1)[-1] } - } else { - # A custom time step is specified. - assert_function(time_step) - - # Calculate the number of `time_step`s required to go between min and max time - # values. This is roundabout because difftime objects, lubridate::period objects, - # and Dates are hard to convert to the same time scale and add. - t_elapsed_s <- difftime(max(x$time_value), min(x$time_value), units = "secs") - step_size_s <- lubridate::as.period(time_step(1), unit = "secs") - n_steps <- ceiling(as.numeric(t_elapsed_s) / as.numeric(step_size_s)) - - all_dates <- min(x$time_value) + time_step(0:n_steps) - - if (before != 0) { - pad_early_dates <- all_dates[1L] - time_step(before:1) - } - if (after != 0) { - pad_late_dates <- all_dates[length(all_dates)] + time_step(1:after) - } } return(list( diff --git a/R/utils.R b/R/utils.R index 46a5cdc5..8c1c622f 100644 --- a/R/utils.R +++ b/R/utils.R @@ -395,56 +395,38 @@ guess_geo_type <- function(geo_value) { } } - # If we got here then we failed return("custom") } -guess_time_type <- function(time_value) { - # Convert character time values to Date or POSIXct - if (is.character(time_value)) { - if (nchar(time_value[1]) <= 10L) { - new_time_value <- tryCatch( - { - as.Date(time_value) - }, - error = function(e) NULL - ) - } else { - new_time_value <- tryCatch( - { - as.POSIXct(time_value) - }, - error = function(e) NULL - ) - } - if (!is.null(new_time_value)) time_value <- new_time_value - } - # Now, if a POSIXct class, then use "day-time" - if (inherits(time_value, "POSIXct")) { - return("day-time") - } else if (inherits(time_value, "Date")) { - # Else, if a Date class, then use "week" or "day" depending on gaps - # Convert to numeric so we can use the modulo operator. +guess_time_type <- function(time_value, time_value_arg = rlang::caller_arg(time_value)) { + if (inherits(time_value, "Date")) { unique_time_gaps <- as.numeric(diff(sort(unique(time_value)))) - # We need to check the modulus of `unique_time_gaps` in case there are - # missing dates. Gaps in a weekly date sequence will cause some diffs to - # be larger than 7 days. If we just check if `diffs == 7`, it will fail - # unless the weekly date sequence is already complete. - return(ifelse(all(unique_time_gaps %% 7 == 0), "week", "day")) - } else if (inherits(time_value, "yearweek")) { - # Else, check whether it's one of the tsibble classes - return("yearweek") + # Gaps in a weekly date sequence will cause some diffs to be larger than 7 + # days, so check modulo 7 equality, rather than equality with 7. + if (all(unique_time_gaps %% 7 == 0)) { + return("week") + } + if (all(unique_time_gaps >= 28)) { + cli_abort( + "Found a monthly or longer cadence in the time column `{time_value_arg}`. + Consider using tsibble::yearmonth for monthly data and 'YYYY' integers for year data." + ) + } + return("day") } else if (inherits(time_value, "yearmonth")) { return("yearmonth") - } else if (inherits(time_value, "yearquarter")) { - return("yearquarter") - } else if (rlang::is_integerish(time_value) && - all(nchar(as.character(time_value)) == 4L)) { # nolint: indentation_linter - return("year") + } else if (rlang::is_integerish(time_value)) { + return("integer") } - # If we got here then we failed + cli_warn( + "Unsupported time type in column `{time_value_arg}`, with class {.code {class(time_value)}}. + Time-related functionality may have unexpected behavior. + ", + class = "epiprocess__guess_time_type__unknown_time_type", + epiprocess__time_value = time_value + ) return("custom") } @@ -820,3 +802,41 @@ guess_period.Date <- function(time_values, time_values_arg = rlang::caller_arg(t guess_period.POSIXt <- function(time_values, time_values_arg = rlang::caller_arg(time_values), ...) { as.numeric(NextMethod(), units = "secs") } + + +validate_slide_window_arg <- function(arg, time_type, arg_name = rlang::caller_arg(arg)) { + if (is.null(arg)) { + cli_abort("`{arg_name}` is a required argument.") + } + + if (!checkmate::test_scalar(arg)) { + cli_abort("Expected `{arg_name}` to be a scalar value.") + } + + if (time_type == "custom") { + cli_abort("Unsure how to interpret slide units with a custom time type. Consider converting your time + column to a Date, yearmonth, or integer type.") + } + + if (!identical(arg, Inf)) { + if (time_type == "day") { + if (!test_int(arg, lower = 0L) && !(inherits(arg, "difftime") && units(arg) == "days")) { + cli_abort("Expected `{arg_name}` to be a difftime with units in days or a non-negative integer.") + } + } else if (time_type == "week") { + if (!(inherits(arg, "difftime") && units(arg) == "weeks")) { + cli_abort("Expected `{arg_name}` to be a difftime with units in weeks.") + } + } else if (time_type == "yearmonth") { + if (!test_int(arg, lower = 0L) || inherits(arg, "difftime")) { + cli_abort("Expected `{arg_name}` to be a non-negative integer.") + } + } else if (time_type == "integer") { + if (!test_int(arg, lower = 0L) || inherits(arg, "difftime")) { + cli_abort("Expected `{arg_name}` to be a non-negative integer.") + } + } else { + cli_abort("Expected `{arg_name}` to be Inf, an appropriate a difftime, or a non-negative integer.") + } + } +} diff --git a/man-roxygen/basic-slide-details.R b/man-roxygen/basic-slide-details.R new file mode 100644 index 00000000..f8f6792d --- /dev/null +++ b/man-roxygen/basic-slide-details.R @@ -0,0 +1,34 @@ +#' @details To "slide" means to apply a function or formula over a rolling +#' window of time steps for each data group, where the window is centered at a +#' reference time and left and right endpoints are given by the `before` and +#' `after` arguments. +#' +#' If there are not enough time steps available to complete the window at any +#' given reference time, then `epi_slide()` still attempts to perform the +#' computation anyway (it does not require a complete window). The issue of +#' what to do with partial computations (those run on incomplete windows) is +#' therefore left up to the user, either through the specified function or +#' formula `f`, or through post-processing. For a centrally-aligned slide of +#' `n` `time_value`s in a sliding window, set `before = (n-1)/2` and `after = +#' (n-1)/2` when the number of `time_value`s in a sliding window is odd and +#' `before = n/2-1` and `after = n/2` when `n` is even. +#' +#' Sometimes, we want to experiment with various trailing or leading window +#' widths and compare the slide outputs. In the (uncommon) case where +#' zero-width windows are considered, manually pass both the `before` and +#' `after` arguments. +#' +#' If `f` is missing, then an expression for tidy evaluation can be specified, +#' for example, as in: +#' ``` +#' epi_slide(x, cases_7dav = mean(cases), before = 6) +#' ``` +#' which would be equivalent to: +#' ``` +#' epi_slide(x, function(x, g) mean(x$cases), before = 6, +#' new_col_name = "cases_7dav") +#' ``` +#' Thus, to be clear, when the computation is specified via an expression for +#' tidy evaluation (first example, above), then the name for the new column is +#' inferred from the given expression and overrides any name passed explicitly +#' through the `new_col_name` argument. diff --git a/man-roxygen/basic-slide-params.R b/man-roxygen/basic-slide-params.R index 383c102d..7e169af6 100644 --- a/man-roxygen/basic-slide-params.R +++ b/man-roxygen/basic-slide-params.R @@ -3,31 +3,32 @@ #' single data group. #' @param before,after How far `before` and `after` each `ref_time_value` should #' the sliding window extend? At least one of these two arguments must be -#' provided; the other's default will be 0. Any value provided for either -#' argument must be a single, non-`NA`, non-negative, -#' [integer-compatible][vctrs::vec_cast] number of time steps. Endpoints of -#' the window are inclusive. Common settings: -#' * For trailing/right-aligned windows from `ref_time_value - time_step -#' (k)` to `ref_time_value`: either pass `before=k` by itself, or pass -#' `before=k, after=0`. -#' * For center-aligned windows from `ref_time_value - time_step(k)` to -#' `ref_time_value + time_step(k)`: pass `before=k, after=k`. -#' * For leading/left-aligned windows from `ref_time_value` to -#' `ref_time_value + time_step(k)`: either pass pass `after=k` by itself, +#' provided; the other's default will be 0. The accepted values for these +#' depend on the type of the `time_value` column: +#' +#' - if it is a Date and the cadence is daily, then they can be integers +#' (which will be interpreted in units of days) or difftimes with units +#' "days" +#' - if it is a Date and the cadence is weekly, then they must be difftimes +#' with units "weeks" +#' - if it is an integer, then they must be integers +#' +#' Endpoints of the window are inclusive. Common settings: +#' +#' - For trailing/right-aligned windows from `ref_time_value - k` to +#' `ref_time_value`: either pass `before=k` by itself, or pass `before=k, +#' after=0`. +#' - For center-aligned windows from `ref_time_value - k` to +#' `ref_time_value + k`: pass `before=k, after=k`. +#' - For leading/left-aligned windows from `ref_time_value` to +#' `ref_time_value + k`: either pass pass `after=k` by itself, #' or pass `before=0, after=k`. -#' See "Details:" about the definition of a time step,(non)treatment of -#' missing rows within the window, and avoiding warnings about -#' `before`&`after` settings for a certain uncommon use case. +#' +#' See "Details:" on how missing rows are handled within the window. #' @param ref_time_values Time values for sliding computations, meaning, each #' element of this vector serves as the reference time point for one sliding #' window. If missing, then this will be set to all unique time values in the #' underlying data table, by default. -#' @param time_step Optional function used to define the meaning of one time -#' step, which if specified, overrides the default choice based on the -#' `time_value` column. This function must take a non-negative integer and -#' return an object of class [lubridate::period]. For example, we can use -#' `time_step = lubridate::hours` in order to set the time step to be one hour -#' (this would only be meaningful if `time_value` is of class `POSIXct`). #' @param names_sep String specifying the separator to use in `tidyr::unnest()` #' when `as_list_col = FALSE`. Default is "_". Using `NULL` drops the prefix #' from `new_col_name` entirely. diff --git a/man-roxygen/epi_df-params.R b/man-roxygen/epi_df-params.R index 59c51603..bedcb7d4 100644 --- a/man-roxygen/epi_df-params.R +++ b/man-roxygen/epi_df-params.R @@ -1,10 +1,9 @@ #' @param x A data.frame, [tibble::tibble], or [tsibble::tsibble] to be converted -#' @param geo_type Type for the geo values. If missing, then the function will -#' attempt to infer it from the geo values present; if this fails, then it -#' will be set to "custom". -#' @param time_type Type for the time values. If missing, then the function will -#' attempt to infer it from the time values present; if this fails, then it -#' will be set to "custom". +#' @param geo_type DEPRECATED Has no effect. Geo value type is inferred from the +#' location column and set to "custom" if not recognized. +#' @param time_type DEPRECATED Has no effect. Time value type inferred from the time +#' column and set to "custom" if not recognized. Unpredictable behavior may result +#' if the time type is not recognized. #' @param as_of Time value representing the time at which the given data were #' available. For example, if `as_of` is January 31, 2022, then the `epi_df` #' object that is created would represent the most up-to-date version of the @@ -15,4 +14,5 @@ #' `as_of` fields; named entries from the passed list will be included as #' well. If your tibble has additional keys, be sure to specify them as a #' character vector in the `other_keys` component of `additional_metadata`. +#' @param ... Additional arguments passed to methods. #' @return An `epi_df` object. diff --git a/man-roxygen/opt-slide-details.R b/man-roxygen/opt-slide-details.R index 33fb437c..5e8876d2 100644 --- a/man-roxygen/opt-slide-details.R +++ b/man-roxygen/opt-slide-details.R @@ -1,25 +1,16 @@ #' @details To "slide" means to apply a function over a rolling window of time -#' steps for each data group, where the window is centered at a reference -#' time and left and right endpoints are given by the `before` and `after` -#' arguments. The unit (the meaning of one time step) is implicitly defined -#' by the way the `time_value` column treats addition and subtraction; for -#' example, if the time values are coded as `Date` objects, then one time -#' step is one day, since `as.Date("2022-01-01") + 1` equals `as.Date -#' ("2022-01-02")`. Alternatively, the time step can be set explicitly using -#' the `time_step` argument (which if specified would override the default -#' choice based on `time_value` column). If there are not enough time steps -#' available to complete the window at any given reference time, then -#' `epi_slide_*()` will fail; it requires a complete window to perform the -#' computation. For a centrally-aligned slide of `n` `time_value`s in a -#' sliding window, set `before = (n-1)/2` and `after = (n-1)/2` when the -#' number of `time_value`s in a sliding window is odd and `before = n/2-1` -#' and `after = n/2` when `n` is even. +#' steps for each data group, where the window is centered at a reference time +#' and left and right endpoints are given by the `before` and `after` +#' arguments. + +#' If there are not enough time steps available to complete the window at any +#' given reference time, then `epi_slide_*()` will fail; it requires a +#' complete window to perform the computation. For a centrally-aligned slide +#' of `n` `time_value`s in a sliding window, set `before = (n-1)/2` and `after +#' = (n-1)/2` when the number of `time_value`s in a sliding window is odd and +#' `before = n/2-1` and `after = n/2` when `n` is even. #' #' Sometimes, we want to experiment with various trailing or leading window #' widths and compare the slide outputs. In the (uncommon) case where #' zero-width windows are considered, manually pass both the `before` and -#' `after` arguments in order to prevent potential warnings. (E.g., `before=k` -#' with `k=0` and `after` missing may produce a warning. To avoid warnings, -#' use `before=k, after=0` instead; otherwise, it looks too much like a -#' leading window was intended, but the `after` argument was forgotten or -#' misspelled.) +#' `after` arguments. diff --git a/man/as_epi_df.Rd b/man/as_epi_df.Rd deleted file mode 100644 index 98cdbb83..00000000 --- a/man/as_epi_df.Rd +++ /dev/null @@ -1,148 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/epi_df.R -\name{as_epi_df} -\alias{as_epi_df} -\alias{as_epi_df.epi_df} -\alias{as_epi_df.tbl_df} -\alias{as_epi_df.data.frame} -\alias{as_epi_df.tbl_ts} -\title{Convert to \code{epi_df} format} -\usage{ -as_epi_df(x, ...) - -\method{as_epi_df}{epi_df}(x, ...) - -\method{as_epi_df}{tbl_df}(x, geo_type, time_type, as_of, additional_metadata = list(), ...) - -\method{as_epi_df}{data.frame}(x, geo_type, time_type, as_of, additional_metadata = list(), ...) - -\method{as_epi_df}{tbl_ts}(x, geo_type, time_type, as_of, additional_metadata = list(), ...) -} -\arguments{ -\item{x}{A data.frame, \link[tibble:tibble]{tibble::tibble}, or \link[tsibble:tsibble]{tsibble::tsibble} to be converted} - -\item{...}{Additional arguments passed to methods.} - -\item{geo_type}{Type for the geo values. If missing, then the function will -attempt to infer it from the geo values present; if this fails, then it -will be set to "custom".} - -\item{time_type}{Type for the time values. If missing, then the function will -attempt to infer it from the time values present; if this fails, then it -will be set to "custom".} - -\item{as_of}{Time value representing the time at which the given data were -available. For example, if \code{as_of} is January 31, 2022, then the \code{epi_df} -object that is created would represent the most up-to-date version of the -data available as of January 31, 2022. If the \code{as_of} argument is missing, -then the current day-time will be used.} - -\item{additional_metadata}{List of additional metadata to attach to the -\code{epi_df} object. The metadata will have \code{geo_type}, \code{time_type}, and -\code{as_of} fields; named entries from the passed list will be included as -well. If your tibble has additional keys, be sure to specify them as a -character vector in the \code{other_keys} component of \code{additional_metadata}.} -} -\value{ -An \code{epi_df} object. -} -\description{ -Converts a data frame or tibble into an \code{epi_df} object. See the \href{https://cmu-delphi.github.io/epiprocess/articles/epiprocess.html}{getting started guide} for -examples. -} -\section{Methods (by class)}{ -\itemize{ -\item \code{as_epi_df(epi_df)}: Simply returns the \code{epi_df} object unchanged. - -\item \code{as_epi_df(tbl_df)}: The input tibble \code{x} must contain the columns -\code{geo_value} and \code{time_value}, or column names that uniquely map onto these -(e.g. \code{date} or \code{province}). Alternatively, you can specify the conversion -explicitly (\code{time_value = someWeirdColumnName}). All other columns not -specified as \code{other_keys} will be preserved as is, and treated as measured -variables. - -If \code{as_of} is missing, then the function will try to guess it from an -\code{as_of}, \code{issue}, or \code{version} column of \code{x} (if any of these are present), -or from as an \code{as_of} field in its metadata (stored in its attributes); if -this fails, then the current day-time will be used. - -\item \code{as_epi_df(data.frame)}: Works analogously to \code{as_epi_df.tbl_df()}. - -\item \code{as_epi_df(tbl_ts)}: Works analogously to \code{as_epi_df.tbl_df()}, except that -the \code{tbl_ts} class is dropped, and any key variables (other than -"geo_value") are added to the metadata of the returned object, under the -\code{other_keys} field. - -}} -\examples{ -# Convert a `tsibble` that has county code as an extra key -# Notice that county code should be a character string to preserve any leading zeroes - -ex1_input <- tibble::tibble( - geo_value = rep(c("ca", "fl", "pa"), each = 3), - county_code = c( - "06059", "06061", "06067", - "12111", "12113", "12117", - "42101", "42103", "42105" - ), - time_value = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"), - by = "day" - ), length.out = length(geo_value)), - value = 1:length(geo_value) + 0.01 * rnorm(length(geo_value)) -) \%>\% - tsibble::as_tsibble(index = time_value, key = c(geo_value, county_code)) - -# The `other_keys` metadata (`"county_code"` in this case) is automatically -# inferred from the `tsibble`'s `key`: -ex1 <- as_epi_df(x = ex1_input, geo_type = "state", time_type = "day", as_of = "2020-06-03") -attr(ex1, "metadata")[["other_keys"]] - - - -# Dealing with misspecified column names: -# Geographical and temporal information must be provided in columns named -# `geo_value` and `time_value`; if we start from a data frame with a -# different format, it must be converted to use `geo_value` and `time_value` -# before calling `as_epi_df`. - -ex2_input <- tibble::tibble( - state = rep(c("ca", "fl", "pa"), each = 3), # misnamed - pol = rep(c("blue", "swing", "swing"), each = 3), # extra key - reported_date = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"), - by = "day" - ), length.out = length(state)), # misnamed - value = 1:length(state) + 0.01 * rnorm(length(state)) -) - -print(ex2_input) - -ex2 <- ex2_input \%>\% - dplyr::rename(geo_value = state, time_value = reported_date) \%>\% - as_epi_df( - geo_type = "state", as_of = "2020-06-03", - additional_metadata = list(other_keys = "pol") - ) - -attr(ex2, "metadata") - - - -# Adding additional keys to an `epi_df` object - -ex3_input <- jhu_csse_county_level_subset \%>\% - dplyr::filter(time_value > "2021-12-01", state_name == "Massachusetts") \%>\% - dplyr::slice_tail(n = 6) - -ex3 <- ex3_input \%>\% - tsibble::as_tsibble() \%>\% # needed to add the additional metadata - # add 2 extra keys - dplyr::mutate( - state = rep("MA", 6), - pol = rep(c("blue", "swing", "swing"), each = 2) - ) \%>\% - # the 2 extra keys we added have to be specified in the other_keys - # component of additional_metadata. - as_epi_df(additional_metadata = list(other_keys = c("state", "pol"))) - -attr(ex3, "metadata") -} diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd index 99203052..74591693 100644 --- a/man/epi_archive.Rd +++ b/man/epi_archive.Rd @@ -9,35 +9,33 @@ \usage{ new_epi_archive( x, - geo_type = NULL, - time_type = NULL, - other_keys = NULL, - additional_metadata = NULL, - compactify = NULL, - clobberable_versions_start = NULL, - versions_end = NULL + geo_type, + time_type, + other_keys, + additional_metadata, + compactify, + clobberable_versions_start, + versions_end ) validate_epi_archive( x, - geo_type = NULL, - time_type = NULL, - other_keys = NULL, - additional_metadata = NULL, - compactify = NULL, - clobberable_versions_start = NULL, - versions_end = NULL + other_keys, + additional_metadata, + compactify, + clobberable_versions_start, + versions_end ) as_epi_archive( x, - geo_type = NULL, - time_type = NULL, - other_keys = NULL, - additional_metadata = NULL, + geo_type = deprecated(), + time_type = deprecated(), + other_keys = character(0L), + additional_metadata = list(), compactify = NULL, - clobberable_versions_start = NULL, - .versions_end = NULL, + clobberable_versions_start = NA, + .versions_end = max_version_with_row_in(x), ..., versions_end = .versions_end ) @@ -46,23 +44,22 @@ as_epi_archive( \item{x}{A data.frame, data.table, or tibble, with columns \code{geo_value}, \code{time_value}, \code{version}, and then any additional number of columns.} -\item{geo_type}{Type for the geo values. If missing, then the function will -attempt to infer it from the geo values present; if this fails, then it -will be set to "custom".} +\item{geo_type}{DEPRECATED Has no effect. Geo value type is inferred from the +location column and set to "custom" if not recognized.} -\item{time_type}{Type for the time values. If missing, then the function will -attempt to infer it from the time values present; if this fails, then it -will be set to "custom".} +\item{time_type}{DEPRECATED Has no effect. Time value type inferred from the time +column and set to "custom" if not recognized. Unpredictable behavior may result +if the time type is not recognized.} \item{other_keys}{Character vector specifying the names of variables in \code{x} that should be considered key variables (in the language of \code{data.table}) apart from "geo_value", "time_value", and "version".} \item{additional_metadata}{List of additional metadata to attach to the -\code{epi_archive} object. The metadata will have \code{geo_type} and \code{time_type} -fields; named entries from the passed list or will be included as well.} +\code{epi_archive} object. The metadata will have the \code{geo_type} field; named +entries from the passed list or will be included as well.} -\item{compactify}{Optional; Boolean or \code{NULL}. \code{TRUE} will remove some +\item{compactify}{Optional; Boolean. \code{TRUE} will remove some redundant rows, \code{FALSE} will not, and missing or \code{NULL} will remove redundant rows, but issue a warning. See more information at \code{compactify}.} @@ -126,11 +123,8 @@ later. The data table \code{DT} has key variables \code{geo_value}, \code{time_value}, \code{version}, as well as any others (these can be specified when instantiating the \code{epi_archive} object via the \code{other_keys} argument, and/or set by operating -on \code{DT} directly). Refer to the documentation for \code{as_epi_archive()} for -information and examples of relevant parameter names for an \code{epi_archive} -object. Note that there can only be a single row per unique combination of -key variables, and thus the key variables are critical for figuring out how -to generate a snapshot of data from the archive, as of a given version. +on \code{DT} directly). Note that there can only be a single row per unique +combination of key variables. } \section{Metadata}{ @@ -142,20 +136,17 @@ object: \item \code{additional_metadata}: list of additional metadata for the data archive. } -Unlike an \code{epi_df} object, metadata for an \code{epi_archive} object \code{x} can be -accessed (and altered) directly, as in \code{x$geo_type} or \code{x$time_type}, -etc. Like an \code{epi_df} object, the \code{geo_type} and \code{time_type} fields in the -metadata of an \code{epi_archive} object are not currently used by any -downstream functions in the \code{epiprocess} package, and serve only as useful -bits of information to convey about the data set at hand. +While this metadata is not protected, it is generally recommended to treat it +as read-only, and to use the \code{epi_archive} methods to interact with the data +archive. Unexpected behavior may result from modifying the metadata +directly. } \section{Generating Snapshots}{ An \code{epi_archive} object can be used to generate a snapshot of the data in -\code{epi_df} format, which represents the most up-to-date values of the signal -variables, as of the specified version. This is accomplished by calling -\code{epix_as_of()}. +\code{epi_df} format, which represents the most up-to-date time series values up +to a point in time. This is accomplished by calling \code{epix_as_of()}. } \section{Sliding Computations}{ @@ -182,10 +173,7 @@ tib <- tibble::tibble( value = rnorm(10, mean = 2, sd = 1) ) -toy_epi_archive <- tib \%>\% as_epi_archive( - geo_type = "state", - time_type = "day" -) +toy_epi_archive <- tib \%>\% as_epi_archive() toy_epi_archive # Ex. with an additional key for county @@ -208,10 +196,6 @@ df <- data.frame( cases_rate = c(0.01, 0.02, 0.01, 0.05) ) -x <- df \%>\% as_epi_archive( - geo_type = "state", - time_type = "day", - other_keys = "county" -) +x <- df \%>\% as_epi_archive(other_keys = "county") } diff --git a/man/epi_df.Rd b/man/epi_df.Rd index 4e5af146..dbb4a917 100644 --- a/man/epi_df.Rd +++ b/man/epi_df.Rd @@ -1,19 +1,78 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/epi_df.R -\name{epi_df} -\alias{epi_df} +\name{new_epi_df} +\alias{new_epi_df} +\alias{as_epi_df} +\alias{as_epi_df.epi_df} +\alias{as_epi_df.tbl_df} +\alias{as_epi_df.data.frame} +\alias{as_epi_df.tbl_ts} \title{\code{epi_df} object} +\usage{ +new_epi_df( + x = tibble::tibble(), + geo_type, + time_type, + as_of, + additional_metadata = list() +) + +as_epi_df(x, ...) + +\method{as_epi_df}{epi_df}(x, ...) + +\method{as_epi_df}{tbl_df}( + x, + geo_type = deprecated(), + time_type = deprecated(), + as_of, + additional_metadata = list(), + ... +) + +\method{as_epi_df}{data.frame}(x, as_of, additional_metadata = list(), ...) + +\method{as_epi_df}{tbl_ts}(x, as_of, additional_metadata = list(), ...) +} +\arguments{ +\item{x}{A data.frame, \link[tibble:tibble]{tibble::tibble}, or \link[tsibble:tsibble]{tsibble::tsibble} to be converted} + +\item{geo_type}{DEPRECATED Has no effect. Geo value type is inferred from the +location column and set to "custom" if not recognized.} + +\item{time_type}{DEPRECATED Has no effect. Time value type inferred from the time +column and set to "custom" if not recognized. Unpredictable behavior may result +if the time type is not recognized.} + +\item{as_of}{Time value representing the time at which the given data were +available. For example, if \code{as_of} is January 31, 2022, then the \code{epi_df} +object that is created would represent the most up-to-date version of the +data available as of January 31, 2022. If the \code{as_of} argument is missing, +then the current day-time will be used.} + +\item{additional_metadata}{List of additional metadata to attach to the +\code{epi_df} object. The metadata will have \code{geo_type}, \code{time_type}, and +\code{as_of} fields; named entries from the passed list will be included as +well. If your tibble has additional keys, be sure to specify them as a +character vector in the \code{other_keys} component of \code{additional_metadata}.} + +\item{...}{Additional arguments passed to methods.} +} +\value{ +An \code{epi_df} object. +} \description{ -An \code{epi_df} is a tibble with certain minimal column structure -and metadata. It can be seen as a snapshot of a data set that contains the -most up-to-date values of some signal variables of interest, as of a given -time. +An \code{epi_df} is a tibble with certain minimal column structure and metadata. +It can be seen as a snapshot of a data set that contains the most +up-to-date values of some signal variables of interest, as of a given time. } \details{ An \code{epi_df} is a tibble with (at least) the following columns: \itemize{ -\item \code{geo_value}: the geographic value associated with each row of measurements. -\item \code{time_value}: the time value associated with each row of measurements. +\item \code{geo_value}: A character vector representing the geographical unit of +observation. This could be a country code, a state name, a county code, +etc. +\item \code{time_value}: A date or integer vector representing the time of observation. } Other columns can be considered as measured variables, which we also refer to @@ -21,17 +80,25 @@ as signal variables. An \code{epi_df} object also has metadata with (at least) the following fields: \itemize{ \item \code{geo_type}: the type for the geo values. -\item \code{time_type}: the type for the time values. \item \code{as_of}: the time value at which the given data were available. } +Most users should use \code{as_epi_df}. The input tibble \code{x} to the constructor +must contain the columns \code{geo_value} and \code{time_value}. All other columns +will be preserved as is, and treated as measured variables. If \code{as_of} is +missing, then the function will try to guess it from an \code{as_of}, \code{issue}, +or \code{version} column of \code{x} (if any of these are present), or from as an +\code{as_of} field in its metadata (stored in its attributes); if this fails, +then the current day-time will be used. The \code{new_epi_df} constructor +assumes its arguments have already been validated, so it should mainly be +used by advanced users. + Metadata for an \code{epi_df} object \code{x} can be accessed (and altered) via -\code{attributes(x)$metadata}. The first two fields in the above list, -\code{geo_type} and \code{time_type}, can usually be inferred from the \code{geo_value} -and \code{time_value} columns, respectively. They are not currently used by any -downstream functions in the \code{epiprocess} package, and serve only as useful -bits of information to convey about the data set at hand. More information -on their coding is given below. +\code{attributes(x)$metadata}. The first field in the above list, \code{geo_type}, +can usually be inferred from the \code{geo_value} columns. They are not +currently used by any downstream functions in the \code{epiprocess} package, +and serve only as useful bits of information to convey about the data set +at hand. More information on their coding is given below. The last field in the above list, \code{as_of}, is one of the most unique aspects of an \code{epi_df} object. In brief, we can think of an \code{epi_df} object as a @@ -72,25 +139,86 @@ An unrecognizable geo type is labeled "custom". The following time types are recognized in an \code{epi_df}. \itemize{ -\item \code{"day-time"}: each observation corresponds to a time on a given day -(measured to the second); coded as a \code{POSIXct} object, as in -\code{as.POSIXct("2022-01-31 18:45:40")}. \item \code{"day"}: each observation corresponds to a day; coded as a \code{Date} object, as in \code{as.Date("2022-01-31")}. \item \code{"week"}: each observation corresponds to a week; the alignment can be arbitrary (as to whether a week starts on a Monday, Tuesday); coded as a \code{Date} object, representing the start date of week. -\item \code{"yearweek"}: each observation corresponds to a week; the alignment can be -arbitrary; coded as a \code{tsibble::yearweek} object, where the alignment is -stored in the \code{week_start} field of its attributes. \item \code{"yearmonth"}: each observation corresponds to a month; coded as a \code{tsibble::yearmonth} object. -\item \code{"yearquarter"}: each observation corresponds to a quarter; coded as a -\code{tsibble::yearquarter} object. -\item \code{"year"}: each observation corresponds to a year; coded as an integer -greater than or equal to 1582. +\item \code{"integer"}: a generic integer index (e.g. years or something else). } An unrecognizable time type is labeled "custom". } +\examples{ +# Convert a `tsibble` that has county code as an extra key +# Notice that county code should be a character string to preserve any leading zeroes + +ex1_input <- tibble::tibble( + geo_value = rep(c("ca", "fl", "pa"), each = 3), + county_code = c( + "06059", "06061", "06067", + "12111", "12113", "12117", + "42101", "42103", "42105" + ), + time_value = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"), + by = "day" + ), length.out = length(geo_value)), + value = 1:length(geo_value) + 0.01 * rnorm(length(geo_value)) +) \%>\% + tsibble::as_tsibble(index = time_value, key = c(geo_value, county_code)) + +# The `other_keys` metadata (`"county_code"` in this case) is automatically +# inferred from the `tsibble`'s `key`: +ex1 <- as_epi_df(x = ex1_input, as_of = "2020-06-03") +attr(ex1, "metadata")[["other_keys"]] + + +# Dealing with misspecified column names: +# Geographical and temporal information must be provided in columns named +# `geo_value` and `time_value`; if we start from a data frame with a +# different format, it must be converted to use `geo_value` and `time_value` +# before calling `as_epi_df`. + +ex2_input <- tibble::tibble( + state = rep(c("ca", "fl", "pa"), each = 3), # misnamed + pol = rep(c("blue", "swing", "swing"), each = 3), # extra key + reported_date = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"), + by = "day" + ), length.out = length(state)), # misnamed + value = 1:length(state) + 0.01 * rnorm(length(state)) +) + +print(ex2_input) + +ex2 <- ex2_input \%>\% + dplyr::rename(geo_value = state, time_value = reported_date) \%>\% + as_epi_df( + as_of = "2020-06-03", + additional_metadata = list(other_keys = "pol") + ) + +attr(ex2, "metadata") + + +# Adding additional keys to an `epi_df` object + +ex3_input <- jhu_csse_county_level_subset \%>\% + dplyr::filter(time_value > "2021-12-01", state_name == "Massachusetts") \%>\% + dplyr::slice_tail(n = 6) + +ex3 <- ex3_input \%>\% + tsibble::as_tsibble() \%>\% # needed to add the additional metadata + # add 2 extra keys + dplyr::mutate( + state = rep("MA", 6), + pol = rep(c("blue", "swing", "swing"), each = 2) + ) \%>\% + # the 2 extra keys we added have to be specified in the other_keys + # component of additional_metadata. + as_epi_df(additional_metadata = list(other_keys = c("state", "pol"))) + +attr(ex3, "metadata") +} diff --git a/man/epi_slide.Rd b/man/epi_slide.Rd index a1319f99..5f4db7b4 100644 --- a/man/epi_slide.Rd +++ b/man/epi_slide.Rd @@ -8,10 +8,9 @@ epi_slide( x, f, ..., - before, - after, - ref_time_values, - time_step, + before = NULL, + after = NULL, + ref_time_values = NULL, new_col_name = "slide_value", as_list_col = FALSE, names_sep = "_", @@ -47,35 +46,35 @@ as in \code{dplyr} verbs, and can also refer to \code{.x}, \code{.group_key}, an \item{before, after}{How far \code{before} and \code{after} each \code{ref_time_value} should the sliding window extend? At least one of these two arguments must be -provided; the other's default will be 0. Any value provided for either -argument must be a single, non-\code{NA}, non-negative, -\link[vctrs:vec_cast]{integer-compatible} number of time steps. Endpoints of -the window are inclusive. Common settings: +provided; the other's default will be 0. The accepted values for these +depend on the type of the \code{time_value} column: \itemize{ -\item For trailing/right-aligned windows from \code{ref_time_value - time_step (k)} to \code{ref_time_value}: either pass \code{before=k} by itself, or pass -\verb{before=k, after=0}. -\item For center-aligned windows from \code{ref_time_value - time_step(k)} to -\code{ref_time_value + time_step(k)}: pass \verb{before=k, after=k}. +\item if it is a Date and the cadence is daily, then they can be integers +(which will be interpreted in units of days) or difftimes with units +"days" +\item if it is a Date and the cadence is weekly, then they must be difftimes +with units "weeks" +\item if it is an integer, then they must be integers +} + +Endpoints of the window are inclusive. Common settings: +\itemize{ +\item For trailing/right-aligned windows from \code{ref_time_value - k} to +\code{ref_time_value}: either pass \code{before=k} by itself, or pass \verb{before=k, after=0}. +\item For center-aligned windows from \code{ref_time_value - k} to +\code{ref_time_value + k}: pass \verb{before=k, after=k}. \item For leading/left-aligned windows from \code{ref_time_value} to -\code{ref_time_value + time_step(k)}: either pass pass \code{after=k} by itself, +\code{ref_time_value + k}: either pass pass \code{after=k} by itself, or pass \verb{before=0, after=k}. -See "Details:" about the definition of a time step,(non)treatment of -missing rows within the window, and avoiding warnings about -\code{before}&\code{after} settings for a certain uncommon use case. -}} +} + +See "Details:" on how missing rows are handled within the window.} \item{ref_time_values}{Time values for sliding computations, meaning, each element of this vector serves as the reference time point for one sliding window. If missing, then this will be set to all unique time values in the underlying data table, by default.} -\item{time_step}{Optional function used to define the meaning of one time -step, which if specified, overrides the default choice based on the -\code{time_value} column. This function must take a non-negative integer and -return an object of class \link[lubridate:period]{lubridate::period}. For example, we can use -\code{time_step = lubridate::hours} in order to set the time step to be one hour -(this would only be meaningful if \code{time_value} is of class \code{POSIXct}).} - \item{new_col_name}{String indicating the name of the new column that will contain the derivative values. Default is "slide_value"; note that setting \code{new_col_name} equal to an existing column name will overwrite this column.} @@ -115,31 +114,21 @@ for examples. To "slide" means to apply a function or formula over a rolling window of time steps for each data group, where the window is centered at a reference time and left and right endpoints are given by the \code{before} and -\code{after} arguments. The unit (the meaning of one time step) is implicitly -defined by the way the \code{time_value} column treats addition and subtraction; -for example, if the time values are coded as \code{Date} objects, then one time -step is one day, since \code{as.Date("2022-01-01") + 1} equals -\code{as.Date("2022-01-02")}. Alternatively, the time step can be set explicitly -using the \code{time_step} argument (which if specified would override the -default choice based on \code{time_value} column). If there are not enough time -steps available to complete the window at any given reference time, then -\code{epi_slide()} still attempts to perform the computation anyway (it does not -require a complete window). The issue of what to do with partial -computations (those run on incomplete windows) is therefore left up to the -user, either through the specified function or formula \code{f}, or through -post-processing. For a centrally-aligned slide of \code{n} \code{time_value}s in a -sliding window, set \code{before = (n-1)/2} and \code{after = (n-1)/2} when the -number of \code{time_value}s in a sliding window is odd and \code{before = n/2-1} and -\code{after = n/2} when \code{n} is even. +\code{after} arguments. + +If there are not enough time steps available to complete the window at any +given reference time, then \code{epi_slide()} still attempts to perform the +computation anyway (it does not require a complete window). The issue of +what to do with partial computations (those run on incomplete windows) is +therefore left up to the user, either through the specified function or +formula \code{f}, or through post-processing. For a centrally-aligned slide of +\code{n} \code{time_value}s in a sliding window, set \code{before = (n-1)/2} and \code{after = (n-1)/2} when the number of \code{time_value}s in a sliding window is odd and +\code{before = n/2-1} and \code{after = n/2} when \code{n} is even. Sometimes, we want to experiment with various trailing or leading window widths and compare the slide outputs. In the (uncommon) case where zero-width windows are considered, manually pass both the \code{before} and -\code{after} arguments in order to prevent potential warnings. (E.g., \code{before=k} -with \code{k=0} and \code{after} missing may produce a warning. To avoid warnings, -use \verb{before=k, after=0} instead; otherwise, it looks too much like a -leading window was intended, but the \code{after} argument was forgotten or -misspelled.) +\code{after} arguments. If \code{f} is missing, then an expression for tidy evaluation can be specified, for example, as in: diff --git a/man/epi_slide_mean.Rd b/man/epi_slide_mean.Rd index 850a45a1..aeb56729 100644 --- a/man/epi_slide_mean.Rd +++ b/man/epi_slide_mean.Rd @@ -8,10 +8,9 @@ epi_slide_mean( x, col_names, ..., - before, - after, - ref_time_values, - time_step, + before = NULL, + after = NULL, + ref_time_values = NULL, new_col_name = NULL, as_list_col = NULL, names_sep = NULL, @@ -41,35 +40,35 @@ passed the data \code{x} to operate on, the window size \code{n}, and the alignm \item{before, after}{How far \code{before} and \code{after} each \code{ref_time_value} should the sliding window extend? At least one of these two arguments must be -provided; the other's default will be 0. Any value provided for either -argument must be a single, non-\code{NA}, non-negative, -\link[vctrs:vec_cast]{integer-compatible} number of time steps. Endpoints of -the window are inclusive. Common settings: +provided; the other's default will be 0. The accepted values for these +depend on the type of the \code{time_value} column: \itemize{ -\item For trailing/right-aligned windows from \code{ref_time_value - time_step (k)} to \code{ref_time_value}: either pass \code{before=k} by itself, or pass -\verb{before=k, after=0}. -\item For center-aligned windows from \code{ref_time_value - time_step(k)} to -\code{ref_time_value + time_step(k)}: pass \verb{before=k, after=k}. +\item if it is a Date and the cadence is daily, then they can be integers +(which will be interpreted in units of days) or difftimes with units +"days" +\item if it is a Date and the cadence is weekly, then they must be difftimes +with units "weeks" +\item if it is an integer, then they must be integers +} + +Endpoints of the window are inclusive. Common settings: +\itemize{ +\item For trailing/right-aligned windows from \code{ref_time_value - k} to +\code{ref_time_value}: either pass \code{before=k} by itself, or pass \verb{before=k, after=0}. +\item For center-aligned windows from \code{ref_time_value - k} to +\code{ref_time_value + k}: pass \verb{before=k, after=k}. \item For leading/left-aligned windows from \code{ref_time_value} to -\code{ref_time_value + time_step(k)}: either pass pass \code{after=k} by itself, +\code{ref_time_value + k}: either pass pass \code{after=k} by itself, or pass \verb{before=0, after=k}. -See "Details:" about the definition of a time step,(non)treatment of -missing rows within the window, and avoiding warnings about -\code{before}&\code{after} settings for a certain uncommon use case. -}} +} + +See "Details:" on how missing rows are handled within the window.} \item{ref_time_values}{Time values for sliding computations, meaning, each element of this vector serves as the reference time point for one sliding window. If missing, then this will be set to all unique time values in the underlying data table, by default.} -\item{time_step}{Optional function used to define the meaning of one time -step, which if specified, overrides the default choice based on the -\code{time_value} column. This function must take a non-negative integer and -return an object of class \link[lubridate:period]{lubridate::period}. For example, we can use -\code{time_step = lubridate::hours} in order to set the time step to be one hour -(this would only be meaningful if \code{time_value} is of class \code{POSIXct}).} - \item{new_col_name}{Character vector indicating the name(s) of the new column(s) that will contain the derivative values. Default is "slide_value"; note that setting \code{new_col_name} equal to any existing @@ -105,29 +104,19 @@ examples. Wrapper around \code{epi_slide_opt} with \code{f = datatable::frollmean}. To "slide" means to apply a function over a rolling window of time -steps for each data group, where the window is centered at a reference -time and left and right endpoints are given by the \code{before} and \code{after} -arguments. The unit (the meaning of one time step) is implicitly defined -by the way the \code{time_value} column treats addition and subtraction; for -example, if the time values are coded as \code{Date} objects, then one time -step is one day, since \code{as.Date("2022-01-01") + 1} equals \code{as.Date ("2022-01-02")}. Alternatively, the time step can be set explicitly using -the \code{time_step} argument (which if specified would override the default -choice based on \code{time_value} column). If there are not enough time steps -available to complete the window at any given reference time, then -\verb{epi_slide_*()} will fail; it requires a complete window to perform the -computation. For a centrally-aligned slide of \code{n} \code{time_value}s in a -sliding window, set \code{before = (n-1)/2} and \code{after = (n-1)/2} when the -number of \code{time_value}s in a sliding window is odd and \code{before = n/2-1} -and \code{after = n/2} when \code{n} is even. +steps for each data group, where the window is centered at a reference time +and left and right endpoints are given by the \code{before} and \code{after} +arguments. +If there are not enough time steps available to complete the window at any +given reference time, then \verb{epi_slide_*()} will fail; it requires a +complete window to perform the computation. For a centrally-aligned slide +of \code{n} \code{time_value}s in a sliding window, set \code{before = (n-1)/2} and \code{after = (n-1)/2} when the number of \code{time_value}s in a sliding window is odd and +\code{before = n/2-1} and \code{after = n/2} when \code{n} is even. Sometimes, we want to experiment with various trailing or leading window widths and compare the slide outputs. In the (uncommon) case where zero-width windows are considered, manually pass both the \code{before} and -\code{after} arguments in order to prevent potential warnings. (E.g., \code{before=k} -with \code{k=0} and \code{after} missing may produce a warning. To avoid warnings, -use \verb{before=k, after=0} instead; otherwise, it looks too much like a -leading window was intended, but the \code{after} argument was forgotten or -misspelled.) +\code{after} arguments. } \examples{ # slide a 7-day trailing average formula on cases diff --git a/man/epi_slide_opt.Rd b/man/epi_slide_opt.Rd index 4b011c16..629134d5 100644 --- a/man/epi_slide_opt.Rd +++ b/man/epi_slide_opt.Rd @@ -9,10 +9,9 @@ epi_slide_opt( col_names, f, ..., - before, - after, - ref_time_values, - time_step, + before = NULL, + after = NULL, + ref_time_values = NULL, new_col_name = NULL, as_list_col = NULL, names_sep = NULL, @@ -62,35 +61,35 @@ points \code{before} and \code{after} to use in the computation.} \item{before, after}{How far \code{before} and \code{after} each \code{ref_time_value} should the sliding window extend? At least one of these two arguments must be -provided; the other's default will be 0. Any value provided for either -argument must be a single, non-\code{NA}, non-negative, -\link[vctrs:vec_cast]{integer-compatible} number of time steps. Endpoints of -the window are inclusive. Common settings: +provided; the other's default will be 0. The accepted values for these +depend on the type of the \code{time_value} column: \itemize{ -\item For trailing/right-aligned windows from \code{ref_time_value - time_step (k)} to \code{ref_time_value}: either pass \code{before=k} by itself, or pass -\verb{before=k, after=0}. -\item For center-aligned windows from \code{ref_time_value - time_step(k)} to -\code{ref_time_value + time_step(k)}: pass \verb{before=k, after=k}. +\item if it is a Date and the cadence is daily, then they can be integers +(which will be interpreted in units of days) or difftimes with units +"days" +\item if it is a Date and the cadence is weekly, then they must be difftimes +with units "weeks" +\item if it is an integer, then they must be integers +} + +Endpoints of the window are inclusive. Common settings: +\itemize{ +\item For trailing/right-aligned windows from \code{ref_time_value - k} to +\code{ref_time_value}: either pass \code{before=k} by itself, or pass \verb{before=k, after=0}. +\item For center-aligned windows from \code{ref_time_value - k} to +\code{ref_time_value + k}: pass \verb{before=k, after=k}. \item For leading/left-aligned windows from \code{ref_time_value} to -\code{ref_time_value + time_step(k)}: either pass pass \code{after=k} by itself, +\code{ref_time_value + k}: either pass pass \code{after=k} by itself, or pass \verb{before=0, after=k}. -See "Details:" about the definition of a time step,(non)treatment of -missing rows within the window, and avoiding warnings about -\code{before}&\code{after} settings for a certain uncommon use case. -}} +} + +See "Details:" on how missing rows are handled within the window.} \item{ref_time_values}{Time values for sliding computations, meaning, each element of this vector serves as the reference time point for one sliding window. If missing, then this will be set to all unique time values in the underlying data table, by default.} -\item{time_step}{Optional function used to define the meaning of one time -step, which if specified, overrides the default choice based on the -\code{time_value} column. This function must take a non-negative integer and -return an object of class \link[lubridate:period]{lubridate::period}. For example, we can use -\code{time_step = lubridate::hours} in order to set the time step to be one hour -(this would only be meaningful if \code{time_value} is of class \code{POSIXct}).} - \item{new_col_name}{Character vector indicating the name(s) of the new column(s) that will contain the derivative values. Default is "slide_value"; note that setting \code{new_col_name} equal to any existing @@ -126,29 +125,19 @@ for examples. } \details{ To "slide" means to apply a function over a rolling window of time -steps for each data group, where the window is centered at a reference -time and left and right endpoints are given by the \code{before} and \code{after} -arguments. The unit (the meaning of one time step) is implicitly defined -by the way the \code{time_value} column treats addition and subtraction; for -example, if the time values are coded as \code{Date} objects, then one time -step is one day, since \code{as.Date("2022-01-01") + 1} equals \code{as.Date ("2022-01-02")}. Alternatively, the time step can be set explicitly using -the \code{time_step} argument (which if specified would override the default -choice based on \code{time_value} column). If there are not enough time steps -available to complete the window at any given reference time, then -\verb{epi_slide_*()} will fail; it requires a complete window to perform the -computation. For a centrally-aligned slide of \code{n} \code{time_value}s in a -sliding window, set \code{before = (n-1)/2} and \code{after = (n-1)/2} when the -number of \code{time_value}s in a sliding window is odd and \code{before = n/2-1} -and \code{after = n/2} when \code{n} is even. +steps for each data group, where the window is centered at a reference time +and left and right endpoints are given by the \code{before} and \code{after} +arguments. +If there are not enough time steps available to complete the window at any +given reference time, then \verb{epi_slide_*()} will fail; it requires a +complete window to perform the computation. For a centrally-aligned slide +of \code{n} \code{time_value}s in a sliding window, set \code{before = (n-1)/2} and \code{after = (n-1)/2} when the number of \code{time_value}s in a sliding window is odd and +\code{before = n/2-1} and \code{after = n/2} when \code{n} is even. Sometimes, we want to experiment with various trailing or leading window widths and compare the slide outputs. In the (uncommon) case where zero-width windows are considered, manually pass both the \code{before} and -\code{after} arguments in order to prevent potential warnings. (E.g., \code{before=k} -with \code{k=0} and \code{after} missing may produce a warning. To avoid warnings, -use \verb{before=k, after=0} instead; otherwise, it looks too much like a -leading window was intended, but the \code{after} argument was forgotten or -misspelled.) +\code{after} arguments. } \examples{ # slide a 7-day trailing average formula on cases. This can also be done with `epi_slide_mean` diff --git a/man/epi_slide_sum.Rd b/man/epi_slide_sum.Rd index 8c835bdb..7bf92e23 100644 --- a/man/epi_slide_sum.Rd +++ b/man/epi_slide_sum.Rd @@ -8,10 +8,9 @@ epi_slide_sum( x, col_names, ..., - before, - after, - ref_time_values, - time_step, + before = NULL, + after = NULL, + ref_time_values = NULL, new_col_name = NULL, as_list_col = NULL, names_sep = NULL, @@ -41,35 +40,35 @@ passed the data \code{x} to operate on, the window size \code{n}, and the alignm \item{before, after}{How far \code{before} and \code{after} each \code{ref_time_value} should the sliding window extend? At least one of these two arguments must be -provided; the other's default will be 0. Any value provided for either -argument must be a single, non-\code{NA}, non-negative, -\link[vctrs:vec_cast]{integer-compatible} number of time steps. Endpoints of -the window are inclusive. Common settings: +provided; the other's default will be 0. The accepted values for these +depend on the type of the \code{time_value} column: \itemize{ -\item For trailing/right-aligned windows from \code{ref_time_value - time_step (k)} to \code{ref_time_value}: either pass \code{before=k} by itself, or pass -\verb{before=k, after=0}. -\item For center-aligned windows from \code{ref_time_value - time_step(k)} to -\code{ref_time_value + time_step(k)}: pass \verb{before=k, after=k}. +\item if it is a Date and the cadence is daily, then they can be integers +(which will be interpreted in units of days) or difftimes with units +"days" +\item if it is a Date and the cadence is weekly, then they must be difftimes +with units "weeks" +\item if it is an integer, then they must be integers +} + +Endpoints of the window are inclusive. Common settings: +\itemize{ +\item For trailing/right-aligned windows from \code{ref_time_value - k} to +\code{ref_time_value}: either pass \code{before=k} by itself, or pass \verb{before=k, after=0}. +\item For center-aligned windows from \code{ref_time_value - k} to +\code{ref_time_value + k}: pass \verb{before=k, after=k}. \item For leading/left-aligned windows from \code{ref_time_value} to -\code{ref_time_value + time_step(k)}: either pass pass \code{after=k} by itself, +\code{ref_time_value + k}: either pass pass \code{after=k} by itself, or pass \verb{before=0, after=k}. -See "Details:" about the definition of a time step,(non)treatment of -missing rows within the window, and avoiding warnings about -\code{before}&\code{after} settings for a certain uncommon use case. -}} +} + +See "Details:" on how missing rows are handled within the window.} \item{ref_time_values}{Time values for sliding computations, meaning, each element of this vector serves as the reference time point for one sliding window. If missing, then this will be set to all unique time values in the underlying data table, by default.} -\item{time_step}{Optional function used to define the meaning of one time -step, which if specified, overrides the default choice based on the -\code{time_value} column. This function must take a non-negative integer and -return an object of class \link[lubridate:period]{lubridate::period}. For example, we can use -\code{time_step = lubridate::hours} in order to set the time step to be one hour -(this would only be meaningful if \code{time_value} is of class \code{POSIXct}).} - \item{new_col_name}{Character vector indicating the name(s) of the new column(s) that will contain the derivative values. Default is "slide_value"; note that setting \code{new_col_name} equal to any existing @@ -105,29 +104,19 @@ examples. Wrapper around \code{epi_slide_opt} with \code{f = datatable::frollsum}. To "slide" means to apply a function over a rolling window of time -steps for each data group, where the window is centered at a reference -time and left and right endpoints are given by the \code{before} and \code{after} -arguments. The unit (the meaning of one time step) is implicitly defined -by the way the \code{time_value} column treats addition and subtraction; for -example, if the time values are coded as \code{Date} objects, then one time -step is one day, since \code{as.Date("2022-01-01") + 1} equals \code{as.Date ("2022-01-02")}. Alternatively, the time step can be set explicitly using -the \code{time_step} argument (which if specified would override the default -choice based on \code{time_value} column). If there are not enough time steps -available to complete the window at any given reference time, then -\verb{epi_slide_*()} will fail; it requires a complete window to perform the -computation. For a centrally-aligned slide of \code{n} \code{time_value}s in a -sliding window, set \code{before = (n-1)/2} and \code{after = (n-1)/2} when the -number of \code{time_value}s in a sliding window is odd and \code{before = n/2-1} -and \code{after = n/2} when \code{n} is even. +steps for each data group, where the window is centered at a reference time +and left and right endpoints are given by the \code{before} and \code{after} +arguments. +If there are not enough time steps available to complete the window at any +given reference time, then \verb{epi_slide_*()} will fail; it requires a +complete window to perform the computation. For a centrally-aligned slide +of \code{n} \code{time_value}s in a sliding window, set \code{before = (n-1)/2} and \code{after = (n-1)/2} when the number of \code{time_value}s in a sliding window is odd and +\code{before = n/2-1} and \code{after = n/2} when \code{n} is even. Sometimes, we want to experiment with various trailing or leading window widths and compare the slide outputs. In the (uncommon) case where zero-width windows are considered, manually pass both the \code{before} and -\code{after} arguments in order to prevent potential warnings. (E.g., \code{before=k} -with \code{k=0} and \code{after} missing may produce a warning. To avoid warnings, -use \verb{before=k, after=0} instead; otherwise, it looks too much like a -leading window was intended, but the \code{after} argument was forgotten or -misspelled.) +\code{after} arguments. } \examples{ # slide a 7-day trailing sum formula on cases diff --git a/man/epix_as_of.Rd b/man/epix_as_of.Rd index 42b121fa..4ab23882 100644 --- a/man/epix_as_of.Rd +++ b/man/epix_as_of.Rd @@ -62,7 +62,7 @@ archive_cases_dv_subset2 <- as_epi_archive( # (a.k.a. "hotfixed", "clobbered", etc.): clobberable_versions_start = max(archive_cases_dv_subset$DT$version), # Suppose today is the following day, and there are no updates out yet: - versions_end <- max(archive_cases_dv_subset$DT$version) + 1L, + versions_end = max(archive_cases_dv_subset$DT$version) + 1L, compactify = TRUE ) diff --git a/man/epix_slide.Rd b/man/epix_slide.Rd index c8f09594..2789cb01 100644 --- a/man/epix_slide.Rd +++ b/man/epix_slide.Rd @@ -10,9 +10,8 @@ epix_slide( x, f, ..., - before, - ref_time_values, - time_step, + before = Inf, + ref_time_values = NULL, new_col_name = "slide_value", as_list_col = FALSE, names_sep = "_", @@ -23,9 +22,8 @@ epix_slide( x, f, ..., - before, - ref_time_values, - time_step, + before = Inf, + ref_time_values = NULL, new_col_name = "slide_value", as_list_col = FALSE, names_sep = "_", @@ -36,9 +34,8 @@ epix_slide( x, f, ..., - before, - ref_time_values, - time_step, + before = Inf, + ref_time_values = NULL, new_col_name = "slide_value", as_list_col = FALSE, names_sep = "_", @@ -97,12 +94,6 @@ set to a regularly-spaced sequence of values set to cover the range of \code{version}s in the \code{DT} plus the \code{versions_end}; the spacing of values will be guessed (using the GCD of the skips between values).} -\item{time_step}{Optional function used to define the meaning of one time -step, which if specified, overrides the default choice based on the -\code{time_value} column. This function must take a positive integer and return -an object of class \code{lubridate::period}. For example, we can use \code{time_step = lubridate::hours} in order to set the time step to be one hour (this -would only be meaningful if \code{time_value} is of class \code{POSIXct}).} - \item{new_col_name}{String indicating the name of the new column that will contain the derivative values. Default is "slide_value"; note that setting \code{new_col_name} equal to an existing column name will overwrite this column.} diff --git a/man/new_epi_df.Rd b/man/new_epi_df.Rd deleted file mode 100644 index 8010b700..00000000 --- a/man/new_epi_df.Rd +++ /dev/null @@ -1,45 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/epi_df.R -\name{new_epi_df} -\alias{new_epi_df} -\title{Creates an \code{epi_df} object} -\usage{ -new_epi_df( - x = tibble::tibble(), - geo_type, - time_type, - as_of, - additional_metadata = list() -) -} -\arguments{ -\item{x}{A data.frame, \link[tibble:tibble]{tibble::tibble}, or \link[tsibble:tsibble]{tsibble::tsibble} to be converted} - -\item{geo_type}{Type for the geo values. If missing, then the function will -attempt to infer it from the geo values present; if this fails, then it -will be set to "custom".} - -\item{time_type}{Type for the time values. If missing, then the function will -attempt to infer it from the time values present; if this fails, then it -will be set to "custom".} - -\item{as_of}{Time value representing the time at which the given data were -available. For example, if \code{as_of} is January 31, 2022, then the \code{epi_df} -object that is created would represent the most up-to-date version of the -data available as of January 31, 2022. If the \code{as_of} argument is missing, -then the current day-time will be used.} - -\item{additional_metadata}{List of additional metadata to attach to the -\code{epi_df} object. The metadata will have \code{geo_type}, \code{time_type}, and -\code{as_of} fields; named entries from the passed list will be included as -well. If your tibble has additional keys, be sure to specify them as a -character vector in the \code{other_keys} component of \code{additional_metadata}.} -} -\value{ -An \code{epi_df} object. -} -\description{ -Creates a new \code{epi_df} object. By default, builds an empty tibble with the -correct metadata for an \code{epi_df} object (ie. \code{geo_type}, \code{time_type}, and \code{as_of}). -Refer to the below info. about the arguments for more details. -} diff --git a/tests/testthat/test-archive-version-bounds.R b/tests/testthat/test-archive-version-bounds.R index d36fcab1..d12c4060 100644 --- a/tests/testthat/test-archive-version-bounds.R +++ b/tests/testthat/test-archive-version-bounds.R @@ -89,7 +89,7 @@ test_that("`validate_version_bound` validate and class checks together allow and test_that("archive version bounds args work as intended", { measurement_date <- as.Date("2000-01-01") update_tbl <- tibble::tibble( - geo_value = "g1", + geo_value = "ak", time_value = measurement_date, version = measurement_date + 1:5, value = 1:5 diff --git a/tests/testthat/test-archive.R b/tests/testthat/test-archive.R index d437c983..ac5aee8d 100644 --- a/tests/testthat/test-archive.R +++ b/tests/testthat/test-archive.R @@ -67,21 +67,22 @@ test_that("other_keys cannot contain names geo_value, time_value or version", { ) }) -test_that("Warning thrown when other_metadata contains overlapping names with geo_type or time_type fields", { +test_that("Warning thrown when other_metadata contains overlapping names with geo_type field", { expect_warning(as_epi_archive(dt, additional_metadata = list(geo_type = 1), compactify = FALSE), - regexp = "`additional_metadata` names overlap with existing metadata fields \"geo_type\", \"time_type\"." + regexp = "`additional_metadata` names overlap with existing metadata fields" ) expect_warning(as_epi_archive(dt, additional_metadata = list(time_type = 1), compactify = FALSE), - regexp = "`additional_metadata` names overlap with existing metadata fields \"geo_type\", \"time_type\"." + regexp = "`additional_metadata` names overlap with existing metadata fields" ) }) test_that("epi_archives are correctly instantiated with a variety of data types", { + d <- as.Date("2020-01-01") # Data frame df <- data.frame( geo_value = "ca", - time_value = as.Date("2020-01-01"), - version = as.Date("2020-01-01") + 0:19, + time_value = d, + version = d + 0:19, value = 1:20 ) @@ -107,8 +108,8 @@ test_that("epi_archives are correctly instantiated with a variety of data types" # Keyed data.table kdt <- data.table::data.table( geo_value = "ca", - time_value = as.Date("2020-01-01"), - version = as.Date("2020-01-01") + 0:19, + time_value = d, + version = d + 0:19, value = 1:20, code = "CA", key = "code" @@ -127,8 +128,8 @@ test_that("epi_archives are correctly instantiated with a variety of data types" # Unkeyed data.table udt <- data.table::data.table( geo_value = "ca", - time_value = as.Date("2020-01-01"), - version = as.Date("2020-01-01") + 0:19, + time_value = d, + version = d + 0:19, value = 1:20, code = "CA" ) @@ -157,7 +158,7 @@ test_that("epi_archives are correctly instantiated with a variety of data types" # Keyed epi_df edf2 <- data.frame( geo_value = "al", - time_value = rep(as.Date("2020-01-01") + 0:9, 2), + time_value = rep(d + 0:9, 2), version = c( rep(as.Date("2020-01-25"), 10), rep(as.Date("2020-01-26"), 10) @@ -177,14 +178,13 @@ test_that("epi_archives are correctly instantiated with a variety of data types" }) test_that("`epi_archive` rejects nonunique keys", { - toy_update_tbl <- - tibble::tribble( - ~geo_value, ~age_group, ~time_value, ~version, ~value, - "us", "adult", "2000-01-01", "2000-01-02", 121, - "us", "adult", "2000-01-01", "2000-01-03", 125, # (revision) - "us", "adult", "2000-01-02", "2000-01-03", 130, - "us", "pediatric", "2000-01-01", "2000-01-02", 5 - ) %>% + toy_update_tbl <- tibble::tribble( + ~geo_value, ~age_group, ~time_value, ~version, ~value, + "us", "adult", "2000-01-01", "2000-01-02", 121, + "us", "adult", "2000-01-01", "2000-01-03", 125, # (revision) + "us", "adult", "2000-01-02", "2000-01-03", 130, + "us", "pediatric", "2000-01-01", "2000-01-02", 5 + ) %>% mutate( age_group = ordered(age_group, c("pediatric", "adult")), time_value = as.Date(time_value), @@ -199,3 +199,30 @@ test_that("`epi_archive` rejects nonunique keys", { as_epi_archive(toy_update_tbl, other_keys = "age_group"), ) }) + +test_that("`epi_archive` rejects dataframes where time_value and version columns don't share type", { + tbl1 <- tibble::tribble( + ~geo_value, ~age_group, ~time_value, ~version, ~value, + "us", "adult", as.Date("2000-01-01"), as.Date("2000-01-02"), 121, + ) %>% + mutate( + age_group = ordered(age_group, c("pediatric", "adult")), + ) + expect_no_error(as_epi_archive(tbl1)) + tbl2 <- tibble::tribble( + ~geo_value, ~age_group, ~time_value, ~version, ~value, + "us", "adult", as.Date("2000-01-01"), 2022, 121, + ) %>% + mutate( + age_group = ordered(age_group, c("pediatric", "adult")), + ) + expect_error(as_epi_archive(tbl2), class = "epiprocess__time_value_version_mismatch") + tbl3 <- tibble::tribble( + ~geo_value, ~age_group, ~time_value, ~version, ~value, + "us", "adult", as.Date("2000-01-01"), as.POSIXct("2000-01-01"), 121, + ) %>% + mutate( + age_group = ordered(age_group, c("pediatric", "adult")), + ) + expect_error(as_epi_archive(tbl3), class = "epiprocess__time_value_version_mismatch") +}) diff --git a/tests/testthat/test-autoplot.R b/tests/testthat/test-autoplot.R index 0e4654eb..3b7d9c1f 100644 --- a/tests/testthat/test-autoplot.R +++ b/tests/testthat/test-autoplot.R @@ -1,17 +1,17 @@ -d <- as.Date("2020-01-01") +test_date <- as.Date("2020-01-01") raw_df_chr <- dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = "a"), - dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = "d") + dplyr::tibble(geo_value = "ak", time_value = test_date + 1:5, value = "a"), + dplyr::tibble(geo_value = "al", time_value = test_date + 1:5, value = "d") ) -ungrouped_chr <- as_epi_df(raw_df_chr, as_of = d + 6) +ungrouped_chr <- as_epi_df(raw_df_chr, as_of = test_date + 6) grouped_chr <- ungrouped_chr %>% group_by(geo_value) raw_df_num <- dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15), - dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = 1:5) + dplyr::tibble(geo_value = "ak", time_value = test_date + 1:5, value = 11:15), + dplyr::tibble(geo_value = "al", time_value = test_date + 1:5, value = 1:5) ) -ungrouped_num <- as_epi_df(raw_df_num, as_of = d + 6) +ungrouped_num <- as_epi_df(raw_df_num, as_of = test_date + 6) grouped_num <- ungrouped_num %>% group_by(geo_value) @@ -33,7 +33,7 @@ test_that("autoplot fails if no non-key columns are numeric", { # A numeric column is available, but is a key not a value. testdf <- mutate(raw_df_chr, key1 = c(1:5, 5:9)) %>% as_tsibble(index = time_value, key = c(geo_value, key1)) %>% - as_epi_df(as_of = d + 6) + as_epi_df(as_of = test_date + 6) expect_error(autoplot(testdf), class = "epiprocess__no_numeric_vars_available" ) diff --git a/tests/testthat/test-compactify.R b/tests/testthat/test-compactify.R index 042a69ea..d05fe0b3 100644 --- a/tests/testthat/test-compactify.R +++ b/tests/testthat/test-compactify.R @@ -87,10 +87,11 @@ test_that("as_of produces the same results with compactify=TRUE as with compacti }) test_that("compactify does not alter the default clobberable and observed version bounds", { + d <- as.Date("2000-01-01") x <- tibble::tibble( - geo_value = "geo1", - time_value = as.Date("2000-01-01"), - version = as.Date("2000-01-01") + 1:5, + geo_value = "ak", + time_value = d, + version = d + 1:5, value = 42L ) ea_true <- as_epi_archive(x, compactify = TRUE) diff --git a/tests/testthat/test-correlation.R b/tests/testthat/test-correlation.R index 98507434..886d94c4 100644 --- a/tests/testthat/test-correlation.R +++ b/tests/testthat/test-correlation.R @@ -21,7 +21,7 @@ test_that("epi_cor functions as intended", { ) edf <- as_epi_df(data.frame( - geo_value = rep("asdf", 20), + geo_value = rep("ak", 20), time_value = as.Date("2020-01-01") + 1:20, pos = 1:20, neg = -(1:20) diff --git a/tests/testthat/test-epi_df.R b/tests/testthat/test-epi_df.R index 1c5e527f..a49855aa 100644 --- a/tests/testthat/test-epi_df.R +++ b/tests/testthat/test-epi_df.R @@ -1,14 +1,8 @@ test_that("new_epi_df works as intended", { - # Empty tibble - wmsg <- capture_warnings(a <- new_epi_df()) - expect_match( - wmsg[1], - "Unknown or uninitialised column: `geo_value`." - ) - expect_match( - wmsg[2], - "Unknown or uninitialised column: `time_value`." - ) + # Empty call fails + expect_error(new_epi_df(), "argument \"geo_type\" is missing") + # Empty tibble works, but requires metadata + a <- new_epi_df(tibble(), geo_type = "custom", time_type = "custom", as_of = as.POSIXct("2020-01-01")) expect_true(is_epi_df(a)) expect_identical(attributes(a)$metadata$geo_type, "custom") expect_identical(attributes(a)$metadata$time_type, "custom") @@ -21,7 +15,7 @@ test_that("new_epi_df works as intended", { geo_value = rep(c("ca", "hi"), each = 5) ) - epi_tib <- new_epi_df(tib) + epi_tib <- new_epi_df(tib, geo_type = "state", time_type = "day", as_of = as.POSIXct("2020-01-01")) expect_true(is_epi_df(epi_tib)) expect_length(epi_tib, 4L) expect_identical(attributes(epi_tib)$metadata$geo_type, "state") @@ -82,13 +76,12 @@ test_that("as_epi_df works for nonstandard input", { }) # select fixes - tib <- tibble::tibble( x = 1:10, y = 1:10, time_value = rep(seq(as.Date("2020-01-01"), by = 1, length.out = 5), times = 2), geo_value = rep(c("ca", "hi"), each = 5) ) -epi_tib <- epiprocess::new_epi_df(tib) +epi_tib <- epiprocess::as_epi_df(tib) test_that("grouped epi_df maintains type for select", { grouped_epi <- epi_tib %>% group_by(geo_value) selected_df <- grouped_epi %>% select(-y) @@ -115,10 +108,9 @@ test_that("grouped epi_df handles extra keys correctly", { geo_value = rep(c("ca", "hi"), each = 5), extra_key = rep(seq(as.Date("2020-01-01"), by = 1, length.out = 5), times = 2) ) - epi_tib <- epiprocess::new_epi_df(tib, + epi_tib <- epiprocess::as_epi_df(tib, additional_metadata = list(other_keys = "extra_key") ) - attributes(epi_tib) grouped_epi <- epi_tib %>% group_by(geo_value) selected_df <- grouped_epi %>% select(-extra_key) expect_true(inherits(selected_df, "epi_df")) diff --git a/tests/testthat/test-epi_slide.R b/tests/testthat/test-epi_slide.R index 9aa67603..f369fe15 100644 --- a/tests/testthat/test-epi_slide.R +++ b/tests/testthat/test-epi_slide.R @@ -1,205 +1,180 @@ ## Create an epi. df and a function to test epi_slide with -d <- as.Date("2020-01-01") +test_date <- as.Date("2020-01-01") +days_dt <- as.difftime(1, units = "days") +weeks_dt <- as.difftime(1, units = "weeks") ungrouped <- dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:200, value = 1:200), - dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5)) + dplyr::tibble(geo_value = "ak", time_value = test_date + 1:200, value = 1:200), + dplyr::tibble(geo_value = "al", time_value = test_date + 1:5, value = -(1:5)) ) %>% as_epi_df() grouped <- ungrouped %>% group_by(geo_value) small_x <- dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15), - dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5)) + dplyr::tibble(geo_value = "ak", time_value = test_date + 1:5, value = 11:15), + dplyr::tibble(geo_value = "al", time_value = test_date + 1:5, value = -(1:5)) ) %>% - as_epi_df(as_of = d + 6) %>% + as_epi_df(as_of = test_date + 6) %>% group_by(geo_value) - f <- function(x, g, t) dplyr::tibble(value = mean(x$value), count = length(x$value)) toy_edf <- tibble::tribble( ~geo_value, ~time_value, ~value, - "a", 1:10, 2L^(1:10), - "b", 1:10, 2L^(11:20), + "a", test_date + 1:10, 2L^(1:10), + "b", test_date + 1:10, 2L^(11:20), ) %>% tidyr::unchop(c(time_value, value)) %>% - as_epi_df(as_of = 100) + as_epi_df(as_of = test_date + 100) # nolint start: line_length_linter. basic_sum_result <- tibble::tribble( ~geo_value, ~time_value, ~value, ~slide_value, - "a", 1:10, 2L^(1:10), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE), - "b", 1:10, 2L^(11:20), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE), + "a", test_date + 1:10, 2L^(1:10), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE), + "b", test_date + 1:10, 2L^(11:20), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE), ) %>% tidyr::unchop(c(time_value, value, slide_value)) %>% dplyr::arrange(time_value) %>% - as_epi_df(as_of = 100) + as_epi_df(as_of = test_date + 100) basic_mean_result <- tibble::tribble( ~geo_value, ~time_value, ~value, ~slide_value, - "a", 1:10, 2L^(1:10), data.table::frollmean(2L^(1:10), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE), + "a", test_date + 1:10, 2L^(1:10), data.table::frollmean(2L^(1:10), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE), ) %>% tidyr::unchop(c(time_value, value, slide_value)) %>% dplyr::arrange(time_value) %>% - as_epi_df(as_of = 100) + as_epi_df(as_of = test_date + 100) # nolint end: line_length_linter. ## --- These cases generate errors (or not): --- test_that("`before` and `after` are both vectors of length 1", { expect_error( - epi_slide(grouped, f, before = c(0, 1), after = 0, ref_time_values = d + 3), - "Assertion on 'before' failed: Must have length 1" + epi_slide(grouped, f, before = c(0, 1), after = 0, ref_time_values = test_date + 3), + "Expected `before` to be a scalar value." ) expect_error( - epi_slide(grouped, f, before = 1, after = c(0, 1), ref_time_values = d + 3), - "Assertion on 'after' failed: Must have length 1" + epi_slide(grouped, f, before = 1, after = c(0, 1), ref_time_values = test_date + 3), + "Expected `after` to be a scalar value." ) - expect_error( - epi_slide_mean(grouped, col_names = value, before = c(0, 1), after = 0, ref_time_values = d + 3), - "Assertion on 'before' failed: Must have length 1" + epi_slide_mean(grouped, col_names = value, before = c(0, 1), after = 0, ref_time_values = test_date + 3), + "Expected `before` to be a scalar value." ) expect_error( - epi_slide_mean(grouped, col_names = value, before = 1, after = c(0, 1), ref_time_values = d + 3), - "Assertion on 'after' failed: Must have length 1" + epi_slide_mean(grouped, col_names = value, before = 1, after = c(0, 1), ref_time_values = test_date + 3), + "Expected `after` to be a scalar value." ) }) test_that("Test errors/warnings for discouraged features", { expect_error( - epi_slide(grouped, f, ref_time_values = d + 1), - "Either or both of `before`, `after` must be provided." - ) - expect_warning( - epi_slide(grouped, f, before = 0L, ref_time_values = d + 1), - "`before==0`, `after` missing" - ) - expect_warning( - epi_slide(grouped, f, after = 0L, ref_time_values = d + 1), - "`before` missing, `after==0`" + epi_slide(grouped, f, ref_time_values = test_date + 1), + "`before` is a required argument." ) expect_error( - epi_slide_mean(grouped, col_names = value, ref_time_values = d + 1), - "Either or both of `before`, `after` must be provided." - ) - expect_warning( - epi_slide_mean(grouped, col_names = value, before = 0L, ref_time_values = d + 1), - "`before==0`, `after` missing" - ) - expect_warning( - epi_slide_mean(grouped, col_names = value, after = 0L, ref_time_values = d + 1), - "`before` missing, `after==0`" + epi_slide_mean(grouped, col_names = value, ref_time_values = test_date + 1), + "`before` is a required argument." ) - # Below cases should raise no errors/warnings: expect_no_warning( - ref1 <- epi_slide(grouped, f, before = 1L, ref_time_values = d + 2) + ref1 <- epi_slide(grouped, f, before = days_dt, ref_time_values = test_date + 2) ) expect_no_warning( - ref2 <- epi_slide(grouped, f, after = 1L, ref_time_values = d + 2) - ) - expect_no_warning( - ref3 <- epi_slide(grouped, f, - before = 0L, after = 0L, ref_time_values = d + 2 - ) + ref2 <- epi_slide(grouped, f, after = days_dt, ref_time_values = test_date + 2) ) expect_no_warning( opt1 <- epi_slide_mean(grouped, col_names = value, - before = 1L, ref_time_values = d + 2, na.rm = TRUE + before = days_dt, ref_time_values = test_date + 2, na.rm = TRUE ) ) expect_no_warning( opt2 <- epi_slide_mean(grouped, col_names = value, - after = 1L, ref_time_values = d + 2, na.rm = TRUE - ) - ) - expect_no_warning( - opt3 <- epi_slide_mean(grouped, - col_names = value, - before = 0L, after = 0L, ref_time_values = d + 2, na.rm = TRUE + after = days_dt, ref_time_values = test_date + 2, na.rm = TRUE ) ) # Results from epi_slide and epi_slide_mean should match expect_equal(select(ref1, -slide_value_count), opt1) expect_equal(select(ref2, -slide_value_count), opt2) - expect_equal(select(ref3, -slide_value_count), opt3) }) test_that("Both `before` and `after` must be non-NA, non-negative, integer-compatible", { expect_error( - epi_slide(grouped, f, before = -1L, ref_time_values = d + 2L), - "Assertion on 'before' failed: Element 1 is not >= 0" + epi_slide(grouped, f, before = -1L, ref_time_values = test_date + 2L), + "Expected `before` to be a difftime with units in days or a non-negative integer." ) expect_error( - epi_slide(grouped, f, before = 2L, after = -1L, ref_time_values = d + 2L), - "Assertion on 'after' failed: Element 1 is not >= 0" + epi_slide(grouped, f, after = -1L, ref_time_values = test_date + 2L), + "Expected `after` to be a difftime with units in days or a non-negative integer." ) - expect_error(epi_slide(grouped, f, before = "a", ref_time_values = d + 2L), - regexp = "before", class = "vctrs_error_incompatible_type" + expect_error(epi_slide(grouped, f, before = "a", after = days_dt, ref_time_values = test_date + 2L), + regexp = "Expected `before` to be a difftime with units in days or a non-negative integer." ) - expect_error(epi_slide(grouped, f, before = 1L, after = "a", ref_time_values = d + 2L), - regexp = "after", class = "vctrs_error_incompatible_type" + expect_error(epi_slide(grouped, f, before = days_dt, after = "a", ref_time_values = test_date + 2L), + regexp = "Expected `after` to be a difftime with units in days or a non-negative integer." ) - expect_error(epi_slide(grouped, f, before = 0.5, ref_time_values = d + 2L), - regexp = "before", class = "vctrs_error_incompatible_type" + expect_error(epi_slide(grouped, f, before = 0.5, after = days_dt, ref_time_values = test_date + 2L), + regexp = "Expected `before` to be a difftime with units in days or a non-negative integer." ) - expect_error(epi_slide(grouped, f, before = 1L, after = 0.5, ref_time_values = d + 2L), - regexp = "after", class = "vctrs_error_incompatible_type" + expect_error(epi_slide(grouped, f, before = days_dt, after = 0.5, ref_time_values = test_date + 2L), + regexp = "Expected `after` to be a difftime with units in days or a non-negative integer." ) expect_error( - epi_slide(grouped, f, before = NA, after = 1L, ref_time_values = d + 2L), - "Assertion on 'before' failed: May not be NA" + epi_slide(grouped, f, before = NA, after = 1L, ref_time_values = test_date + 2L), + "Expected `before` to be a scalar value." ) expect_error( - epi_slide(grouped, f, before = 1L, after = NA, ref_time_values = d + 2L), - "Assertion on 'after' failed: May not be NA" + epi_slide(grouped, f, before = days_dt, after = NA, ref_time_values = test_date + 2L), + "Expected `after` to be a scalar value." ) expect_error( - epi_slide_mean(grouped, col_names = value, before = -1L, ref_time_values = d + 2L), - "Assertion on 'before' failed: Element 1 is not >= 0" + epi_slide_mean(grouped, col_names = value, before = -1L, ref_time_values = test_date + 2L), + "Expected `before` to be a difftime with units in days or a non-negative integer." ) expect_error( - epi_slide_mean(grouped, col_names = value, before = 2L, after = -1L, ref_time_values = d + 2L), - "Assertion on 'after' failed: Element 1 is not >= 0" + epi_slide_mean(grouped, col_names = value, after = -1L, ref_time_values = test_date + 2L), + "Expected `after` to be a difftime with units in days or a non-negative integer." ) - expect_error(epi_slide_mean(grouped, col_names = value, before = "a", ref_time_values = d + 2L), - regexp = "before", class = "vctrs_error_incompatible_type" + expect_error( + epi_slide_mean(grouped, col_names = value, before = "a", ref_time_values = test_date + 2L), + regexp = "Expected `before` to be a difftime with units in days or a non-negative integer." ) - expect_error(epi_slide_mean(grouped, col_names = value, before = 1L, after = "a", ref_time_values = d + 2L), - regexp = "after", class = "vctrs_error_incompatible_type" + expect_error( + epi_slide_mean(grouped, col_names = value, after = "a", ref_time_values = test_date + 2L), + regexp = "Expected `after` to be a difftime with units in days or a non-negative integer." ) - expect_error(epi_slide_mean(grouped, col_names = value, before = 0.5, ref_time_values = d + 2L), - regexp = "before", class = "vctrs_error_incompatible_type" + expect_error( + epi_slide_mean(grouped, col_names = value, before = 0.5, ref_time_values = test_date + 2L), + regexp = "Expected `before` to be a difftime with units in days or a non-negative integer." ) - expect_error(epi_slide_mean(grouped, col_names = value, before = 1L, after = 0.5, ref_time_values = d + 2L), - regexp = "after", class = "vctrs_error_incompatible_type" + expect_error( + epi_slide_mean(grouped, col_names = value, after = 0.5, ref_time_values = test_date + 2L), + regexp = "Expected `after` to be a difftime with units in days or a non-negative integer." ) expect_error( - epi_slide_mean(grouped, col_names = value, before = NA, after = 1L, ref_time_values = d + 2L), - "Assertion on 'before' failed: May not be NA" + epi_slide_mean(grouped, col_names = value, before = NA, after = days_dt, ref_time_values = test_date + 2L), + "Expected `before` to be a scalar value." ) expect_error( - epi_slide_mean(grouped, col_names = value, before = 1L, after = NA, ref_time_values = d + 2L), - "Assertion on 'after' failed: May not be NA" + epi_slide_mean(grouped, col_names = value, before = days_dt, after = NA, ref_time_values = test_date + 2L), + "Expected `after` to be a scalar value." ) # Non-integer-class but integer-compatible values are allowed: expect_no_error( - ref <- epi_slide(grouped, f, before = 1, after = 1, ref_time_values = d + 2L) + ref <- epi_slide(grouped, f, before = days_dt, after = days_dt, ref_time_values = test_date + 2L) ) expect_no_error(opt <- epi_slide_mean( grouped, - col_names = value, before = 1, after = 1, - ref_time_values = d + 2L, na.rm = TRUE + col_names = value, before = days_dt, after = days_dt, + ref_time_values = test_date + 2L, na.rm = TRUE )) # Results from epi_slide and epi_slide_mean should match @@ -208,20 +183,25 @@ test_that("Both `before` and `after` must be non-NA, non-negative, integer-compa test_that("`ref_time_values` + `before` + `after` that result in no slide data, generate the error", { expect_error( - epi_slide(grouped, f, before = 2L, ref_time_values = d), + epi_slide(grouped, f, before = 2 * days_dt, ref_time_values = test_date), "`ref_time_values` must be a unique subset of the time values in `x`." ) # before the first, no data in the slide windows expect_error( - epi_slide(grouped, f, before = 2L, ref_time_values = d + 207L), + epi_slide(grouped, f, before = 2 * days_dt, ref_time_values = test_date + 207L), "`ref_time_values` must be a unique subset of the time values in `x`." ) # beyond the last, no data in window expect_error( - epi_slide_mean(grouped, col_names = value, before = 2L, ref_time_values = d), + epi_slide_mean(grouped, col_names = value, before = 2 * days_dt, ref_time_values = test_date), "`ref_time_values` must be a unique subset of the time values in `x`." ) # before the first, no data in the slide windows expect_error( - epi_slide_mean(grouped, col_names = value, before = 2L, ref_time_values = d + 207L), + epi_slide_mean( + grouped, + col_names = value, + before = 2 * days_dt, + ref_time_values = test_date + 207L + ), "`ref_time_values` must be a unique subset of the time values in `x`." ) # beyond the last, no data in window }) @@ -234,51 +214,25 @@ test_that( ), { expect_error( - epi_slide(grouped, f, before = 0L, after = 2L, ref_time_values = d), + epi_slide(grouped, f, after = 2 * days_dt, ref_time_values = test_date), "`ref_time_values` must be a unique subset of the time values in `x`." ) # before the first, but we'd expect there to be data in the window expect_error( - epi_slide(grouped, f, before = 2L, ref_time_values = d + 201L), + epi_slide(grouped, f, before = 2 * days_dt, ref_time_values = test_date + 201L), "`ref_time_values` must be a unique subset of the time values in `x`." ) # beyond the last, but still with data in window expect_error( - epi_slide_mean(grouped, value, before = 0L, after = 2L, ref_time_values = d), + epi_slide_mean(grouped, value, after = 2 * days_dt, ref_time_values = test_date), "`ref_time_values` must be a unique subset of the time values in `x`." ) # before the first, but we'd expect there to be data in the window expect_error( - epi_slide_mean(grouped, value, before = 2L, ref_time_values = d + 201L), + epi_slide_mean(grouped, value, before = 2 * days_dt, ref_time_values = test_date + 201L), "`ref_time_values` must be a unique subset of the time values in `x`." ) # beyond the last, but still with data in window } ) -## --- These cases generate warnings (or not): --- -test_that("Warn user against having a blank `before`", { - expect_no_warning(ref1 <- epi_slide( - grouped, f, - after = 1L, ref_time_values = d + 1L - )) - expect_no_warning(ref2 <- epi_slide( - grouped, f, - before = 0L, after = 1L, ref_time_values = d + 1L - )) - - expect_no_warning(opt1 <- epi_slide_mean( - grouped, value, - after = 1L, ref_time_values = d + 1L, na.rm = TRUE - )) - expect_no_warning(opt2 <- epi_slide_mean( - grouped, value, - before = 0L, after = 1L, - ref_time_values = d + 1L, na.rm = TRUE - )) - - # Results from epi_slide and epi_slide_mean should match - expect_equal(select(ref1, -slide_value_count), opt1) - expect_equal(select(ref2, -slide_value_count), opt2) -}) - ## --- These cases doesn't generate the error: --- test_that( c( @@ -287,26 +241,32 @@ test_that( ), { expect_equal( - epi_slide(grouped, f, before = 2L, ref_time_values = d + 200L) %>% + epi_slide(grouped, f, before = 2 * days_dt, ref_time_values = test_date + 200L) %>% ungroup() %>% dplyr::select("geo_value", "slide_value_value"), dplyr::tibble(geo_value = "ak", slide_value_value = 199) ) # out of range for one group expect_equal( - epi_slide(grouped, f, before = 2L, ref_time_values = d + 3) %>% + epi_slide(grouped, f, before = 2 * days_dt, ref_time_values = test_date + 3) %>% ungroup() %>% dplyr::select("geo_value", "slide_value_value"), dplyr::tibble(geo_value = c("ak", "al"), slide_value_value = c(2, -2)) ) # not out of range for either group expect_equal( - epi_slide_mean(grouped, value, before = 2L, ref_time_values = d + 200L, na.rm = TRUE) %>% + epi_slide_mean( + grouped, value, + before = 2 * days_dt, ref_time_values = test_date + 200L, na.rm = TRUE + ) %>% ungroup() %>% dplyr::select("geo_value", "slide_value_value"), dplyr::tibble(geo_value = "ak", slide_value_value = 199) ) # out of range for one group expect_equal( - epi_slide_mean(grouped, value, before = 2L, ref_time_values = d + 3, na.rm = TRUE) %>% + epi_slide_mean( + grouped, value, + before = 2 * days_dt, ref_time_values = test_date + 3, na.rm = TRUE + ) %>% ungroup() %>% dplyr::select("geo_value", "slide_value_value"), dplyr::tibble(geo_value = c("ak", "al"), slide_value_value = c(2, -2)) @@ -318,19 +278,23 @@ test_that("computation output formats x as_list_col", { # See `toy_edf` and `basic_sum_result` definitions at top of file. # We'll try 7d sum with a few formats. expect_equal( - toy_edf %>% epi_slide(before = 6L, ~ sum(.x$value)), + toy_edf %>% + epi_slide(before = 6 * days_dt, ~ sum(.x$value)), basic_sum_result ) expect_equal( - toy_edf %>% epi_slide(before = 6L, ~ sum(.x$value), as_list_col = TRUE), + toy_edf %>% + epi_slide(before = 6 * days_dt, ~ sum(.x$value), as_list_col = TRUE), basic_sum_result %>% dplyr::mutate(slide_value = as.list(slide_value)) ) expect_equal( - toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value))), + toy_edf %>% + epi_slide(before = 6 * days_dt, ~ data.frame(value = sum(.x$value))), basic_sum_result %>% rename(slide_value_value = slide_value) ) expect_equal( - toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)), as_list_col = TRUE), + toy_edf %>% + epi_slide(before = 6 * days_dt, ~ data.frame(value = sum(.x$value)), as_list_col = TRUE), basic_sum_result %>% mutate(slide_value = purrr::map(slide_value, ~ data.frame(value = .x))) ) @@ -347,7 +311,7 @@ test_that("epi_slide_mean errors when `as_list_col` non-NULL", { ) %>% epi_slide_mean( value, - before = 6L, na.rm = TRUE + before = 6 * days_dt, na.rm = TRUE ), basic_mean_result %>% dplyr::mutate( slide_value_value = slide_value @@ -361,7 +325,7 @@ test_that("epi_slide_mean errors when `as_list_col` non-NULL", { ) %>% epi_slide_mean( value, - before = 6L, as_list_col = TRUE, na.rm = TRUE + before = 6 * days_dt, as_list_col = TRUE, na.rm = TRUE ), class = "epiprocess__epi_slide_opt__list_not_supported" ) @@ -372,7 +336,7 @@ test_that("nested dataframe output names are controllable", { expect_equal( toy_edf %>% epi_slide( - before = 6L, ~ data.frame(value = sum(.x$value)), + before = 6 * days_dt, ~ data.frame(value = sum(.x$value)), new_col_name = "result" ), basic_sum_result %>% rename(result_value = slide_value) @@ -380,7 +344,7 @@ test_that("nested dataframe output names are controllable", { expect_equal( toy_edf %>% epi_slide( - before = 6L, ~ data.frame(value_sum = sum(.x$value)), + before = 6 * days_dt, ~ data.frame(value_sum = sum(.x$value)), names_sep = NULL ), basic_sum_result %>% rename(value_sum = slide_value) @@ -392,27 +356,29 @@ test_that("non-size-1 outputs are recycled", { # nolint start: line_length_linter. basic_result_from_size2 <- tibble::tribble( ~geo_value, ~time_value, ~value, ~slide_value, - "a", 1:10, 2L^(1:10), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE), - "b", 1:10, 2L^(11:20), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE) + 1L, + "a", test_date + 1:10, 2L^(1:10), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE), + "b", test_date + 1:10, 2L^(11:20), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE) + 1L, ) %>% tidyr::unchop(c(time_value, value, slide_value)) %>% dplyr::arrange(time_value) %>% - as_epi_df(as_of = 100) + as_epi_df(as_of = test_date + 100) # nolint end expect_equal( - toy_edf %>% epi_slide(before = 6L, ~ sum(.x$value) + 0:1), + toy_edf %>% epi_slide(before = 6 * days_dt, ~ sum(.x$value) + 0:1), basic_result_from_size2 ) expect_equal( - toy_edf %>% epi_slide(before = 6L, ~ sum(.x$value) + 0:1, as_list_col = TRUE), + toy_edf %>% epi_slide(before = 6 * days_dt, ~ sum(.x$value) + 0:1, as_list_col = TRUE), basic_result_from_size2 %>% dplyr::mutate(slide_value = as.list(slide_value)) ) expect_equal( - toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value) + 0:1)), + toy_edf %>% epi_slide(before = 6 * days_dt, ~ data.frame(value = sum(.x$value) + 0:1)), basic_result_from_size2 %>% rename(slide_value_value = slide_value) ) expect_equal( - toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value) + 0:1), as_list_col = TRUE), + toy_edf %>% epi_slide( + before = 6 * days_dt, ~ data.frame(value = sum(.x$value) + 0:1), as_list_col = TRUE + ), basic_result_from_size2 %>% mutate(slide_value = purrr::map(slide_value, ~ data.frame(value = .x))) ) @@ -421,11 +387,17 @@ test_that("non-size-1 outputs are recycled", { test_that("epi_slide alerts if the provided f doesn't take enough args", { f_xgt <- function(x, g, t) dplyr::tibble(value = mean(x$value), count = length(x$value)) # If `regexp` is NA, asserts that there should be no errors/messages. - expect_error(epi_slide(grouped, f_xgt, before = 1L, ref_time_values = d + 1), regexp = NA) - expect_warning(epi_slide(grouped, f_xgt, before = 1L, ref_time_values = d + 1), regexp = NA) + expect_error( + epi_slide(grouped, f_xgt, before = days_dt, ref_time_values = test_date + 1), + regexp = NA + ) + expect_warning( + epi_slide(grouped, f_xgt, before = days_dt, ref_time_values = test_date + 1), + regexp = NA + ) f_x_dots <- function(x, ...) dplyr::tibble(value = mean(x$value), count = length(x$value)) - expect_warning(epi_slide(grouped, f_x_dots, before = 1L, ref_time_values = d + 1), + expect_warning(epi_slide(grouped, f_x_dots, before = days_dt, ref_time_values = test_date + 1), class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots" ) }) @@ -436,32 +408,32 @@ test_that("`ref_time_values` + `all_rows = TRUE` works", { # nolint start: line_length_linter. basic_full_result <- tibble::tribble( ~geo_value, ~time_value, ~value, ~slide_value, - "a", 1:10, 2L^(1:10), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE), - "b", 1:10, 2L^(11:20), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE), + "a", test_date + 1:10, 2L^(1:10), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE), + "b", test_date + 1:10, 2L^(11:20), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE), ) %>% tidyr::unchop(c(time_value, value, slide_value)) %>% dplyr::arrange(time_value) %>% - as_epi_df(as_of = 100) + as_epi_df(as_of = test_date + 100) # nolint end # slide computations returning atomic vecs: expect_equal( - toy_edf %>% epi_slide(before = 6L, ~ sum(.x$value)), + toy_edf %>% epi_slide(before = 6 * days_dt, ~ sum(.x$value)), basic_full_result ) expect_equal( toy_edf %>% epi_slide( - before = 6L, ~ sum(.x$value), - ref_time_values = c(2L, 8L) + before = 6 * days_dt, ~ sum(.x$value), + ref_time_values = test_date + c(2L, 8L) ), - basic_full_result %>% dplyr::filter(time_value %in% c(2L, 8L)) + basic_full_result %>% dplyr::filter(time_value %in% (test_date + c(2L, 8L))) ) expect_equal( toy_edf %>% epi_slide( - before = 6L, ~ sum(.x$value), - ref_time_values = c(2L, 8L), all_rows = TRUE + before = 6 * days_dt, ~ sum(.x$value), + ref_time_values = test_date + c(2L, 8L), all_rows = TRUE ), basic_full_result %>% - dplyr::mutate(slide_value = dplyr::if_else(time_value %in% c(2L, 8L), + dplyr::mutate(slide_value = dplyr::if_else(time_value %in% (test_date + c(2L, 8L)), slide_value, NA_integer_ )) ) @@ -472,7 +444,7 @@ test_that("`ref_time_values` + `all_rows = TRUE` works", { ) %>% epi_slide_mean( value, - before = 6L, names_sep = NULL, na.rm = TRUE + before = 6 * days_dt, names_sep = NULL, na.rm = TRUE ), basic_mean_result %>% rename(slide_value_value = slide_value) @@ -483,10 +455,10 @@ test_that("`ref_time_values` + `all_rows = TRUE` works", { ) %>% epi_slide_mean( value, - before = 6L, ref_time_values = c(2L, 8L), + before = 6 * days_dt, ref_time_values = test_date + c(2L, 8L), names_sep = NULL, na.rm = TRUE ), - filter(basic_mean_result, time_value %in% c(2L, 8L)) %>% + filter(basic_mean_result, time_value %in% (test_date + c(2L, 8L))) %>% rename(slide_value_value = slide_value) ) expect_equal( @@ -495,11 +467,11 @@ test_that("`ref_time_values` + `all_rows = TRUE` works", { ) %>% epi_slide_mean( value, - before = 6L, ref_time_values = c(2L, 8L), all_rows = TRUE, + before = 6 * days_dt, ref_time_values = test_date + c(2L, 8L), all_rows = TRUE, names_sep = NULL, na.rm = TRUE ), basic_mean_result %>% - dplyr::mutate(slide_value_value = dplyr::if_else(time_value %in% c(2L, 8L), + dplyr::mutate(slide_value_value = dplyr::if_else(time_value %in% (test_date + c(2L, 8L)), slide_value, NA_integer_ )) %>% select(-slide_value) @@ -507,25 +479,25 @@ test_that("`ref_time_values` + `all_rows = TRUE` works", { # slide computations returning data frames: expect_equal( - toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value))), + toy_edf %>% epi_slide(before = 6 * days_dt, ~ data.frame(value = sum(.x$value))), basic_full_result %>% dplyr::rename(slide_value_value = slide_value) ) expect_equal( toy_edf %>% epi_slide( - before = 6L, ~ data.frame(value = sum(.x$value)), - ref_time_values = c(2L, 8L) + before = 6 * days_dt, ~ data.frame(value = sum(.x$value)), + ref_time_values = test_date + c(2L, 8L) ), basic_full_result %>% - dplyr::filter(time_value %in% c(2L, 8L)) %>% + dplyr::filter(time_value %in% (test_date + c(2L, 8L))) %>% dplyr::rename(slide_value_value = slide_value) ) expect_equal( toy_edf %>% epi_slide( - before = 6L, ~ data.frame(value = sum(.x$value)), - ref_time_values = c(2L, 8L), all_rows = TRUE + before = 6 * days_dt, ~ data.frame(value = sum(.x$value)), + ref_time_values = test_date + c(2L, 8L), all_rows = TRUE ), basic_full_result %>% - dplyr::mutate(slide_value = dplyr::if_else(time_value %in% c(2L, 8L), + dplyr::mutate(slide_value = dplyr::if_else(time_value %in% (test_date + c(2L, 8L)), slide_value, NA_integer_ )) %>% dplyr::rename(slide_value_value = slide_value) @@ -533,7 +505,7 @@ test_that("`ref_time_values` + `all_rows = TRUE` works", { # slide computations returning data frames with `as_list_col=TRUE`: expect_equal( toy_edf %>% epi_slide( - before = 6L, ~ data.frame(value = sum(.x$value)), + before = 6 * days_dt, ~ data.frame(value = sum(.x$value)), as_list_col = TRUE ), basic_full_result %>% @@ -541,30 +513,30 @@ test_that("`ref_time_values` + `all_rows = TRUE` works", { ) expect_equal( toy_edf %>% epi_slide( - before = 6L, ~ data.frame(value = sum(.x$value)), - ref_time_values = c(2L, 8L), + before = 6 * days_dt, ~ data.frame(value = sum(.x$value)), + ref_time_values = test_date + c(2L, 8L), as_list_col = TRUE ), basic_full_result %>% dplyr::mutate(slide_value = purrr::map(slide_value, ~ data.frame(value = .x))) %>% - dplyr::filter(time_value %in% c(2L, 8L)) + dplyr::filter(time_value %in% (test_date + c(2L, 8L))) ) expect_equal( toy_edf %>% epi_slide( - before = 6L, ~ data.frame(value = sum(.x$value)), - ref_time_values = c(2L, 8L), all_rows = TRUE, + before = 6 * days_dt, ~ data.frame(value = sum(.x$value)), + ref_time_values = test_date + c(2L, 8L), all_rows = TRUE, as_list_col = TRUE ), basic_full_result %>% dplyr::mutate(slide_value = purrr::map(slide_value, ~ data.frame(value = .x))) %>% - dplyr::mutate(slide_value = dplyr::if_else(time_value %in% c(2L, 8L), + dplyr::mutate(slide_value = dplyr::if_else(time_value %in% (test_date + c(2L, 8L)), slide_value, list(NULL) )) ) # slide computations returning data frames, `as_list_col = TRUE`, `unnest`: expect_equal( toy_edf %>% epi_slide( - before = 6L, ~ data.frame(value = sum(.x$value)), + before = 6 * days_dt, ~ data.frame(value = sum(.x$value)), as_list_col = TRUE ) %>% unnest(slide_value, names_sep = "_"), @@ -572,19 +544,19 @@ test_that("`ref_time_values` + `all_rows = TRUE` works", { ) expect_equal( toy_edf %>% epi_slide( - before = 6L, ~ data.frame(value = sum(.x$value)), - ref_time_values = c(2L, 8L), + before = 6 * days_dt, ~ data.frame(value = sum(.x$value)), + ref_time_values = test_date + c(2L, 8L), as_list_col = TRUE ) %>% unnest(slide_value, names_sep = "_"), basic_full_result %>% - dplyr::filter(time_value %in% c(2L, 8L)) %>% + dplyr::filter(time_value %in% (test_date + c(2L, 8L))) %>% dplyr::rename(slide_value_value = slide_value) ) expect_equal( toy_edf %>% epi_slide( - before = 6L, ~ data.frame(value = sum(.x$value)), - ref_time_values = c(2L, 8L), all_rows = TRUE, + before = 6 * days_dt, ~ data.frame(value = sum(.x$value)), + ref_time_values = test_date + c(2L, 8L), all_rows = TRUE, as_list_col = TRUE ) %>% unnest(slide_value, names_sep = "_"), @@ -592,7 +564,7 @@ test_that("`ref_time_values` + `all_rows = TRUE` works", { # XXX unclear exactly what we want in this case. Current approach is # compatible with `vctrs::vec_detect_missing` but breaks `tidyr::unnest` # compatibility - dplyr::filter(time_value %in% c(2L, 8L)) %>% + dplyr::filter(time_value %in% (test_date + c(2L, 8L))) %>% dplyr::rename(slide_value_value = slide_value) ) rework_nulls <- function(slide_values_list) { @@ -604,14 +576,14 @@ test_that("`ref_time_values` + `all_rows = TRUE` works", { } expect_equal( toy_edf %>% epi_slide( - before = 6L, ~ data.frame(value = sum(.x$value)), - ref_time_values = c(2L, 8L), all_rows = TRUE, + before = 6 * days_dt, ~ data.frame(value = sum(.x$value)), + ref_time_values = test_date + c(2L, 8L), all_rows = TRUE, as_list_col = TRUE ) %>% mutate(slide_value = rework_nulls(slide_value)) %>% unnest(slide_value, names_sep = "_"), basic_full_result %>% - dplyr::mutate(slide_value = dplyr::if_else(time_value %in% c(2L, 8L), + dplyr::mutate(slide_value = dplyr::if_else(time_value %in% (test_date + c(2L, 8L)), slide_value, NA_integer_ )) %>% dplyr::rename(slide_value_value = slide_value) @@ -621,7 +593,7 @@ test_that("`ref_time_values` + `all_rows = TRUE` works", { test_that("`epi_slide` doesn't decay date output", { expect_true( ungrouped %>% - epi_slide(before = 5L, ~ as.Date("2020-01-01")) %>% + epi_slide(before = 5 * days_dt, ~ as.Date("2020-01-01")) %>% `[[`("slide_value") %>% inherits("Date") ) @@ -629,34 +601,34 @@ test_that("`epi_slide` doesn't decay date output", { test_that("basic grouped epi_slide computation produces expected output", { expected_output <- dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value = cumsum(11:15)), - dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value = cumsum(-(1:5))) + dplyr::tibble(geo_value = "ak", time_value = test_date + 1:5, value = 11:15, slide_value = cumsum(11:15)), + dplyr::tibble(geo_value = "al", time_value = test_date + 1:5, value = -(1:5), slide_value = cumsum(-(1:5))) ) %>% group_by(geo_value) %>% - as_epi_df(as_of = d + 6) + as_epi_df(as_of = test_date + 6) # formula - result1 <- epi_slide(small_x, f = ~ sum(.x$value), before = 50) + result1 <- epi_slide(small_x, f = ~ sum(.x$value), before = 50 * days_dt) expect_equal(result1, expected_output) # function - result2 <- epi_slide(small_x, f = function(x, g, t) sum(x$value), before = 50) + result2 <- epi_slide(small_x, f = function(x, g, t) sum(x$value), before = 50 * days_dt) expect_equal(result2, expected_output) # dots - result3 <- epi_slide(small_x, slide_value = sum(value), before = 50) + result3 <- epi_slide(small_x, slide_value = sum(value), before = 50 * days_dt) expect_equal(result3, expected_output) }) test_that("basic grouped epi_slide_mean computation produces expected output", { expected_output <- dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value = cumsum(11:15) / 1:5), - dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value = cumsum(-(1:5)) / 1:5) + dplyr::tibble(geo_value = "ak", time_value = test_date + 1:5, value = 11:15, slide_value = cumsum(11:15) / 1:5), + dplyr::tibble(geo_value = "al", time_value = test_date + 1:5, value = -(1:5), slide_value = cumsum(-(1:5)) / 1:5) ) %>% group_by(geo_value) %>% - as_epi_df(as_of = d + 6) + as_epi_df(as_of = test_date + 6) - result1 <- epi_slide_mean(small_x, value, before = 50, names_sep = NULL, na.rm = TRUE) + result1 <- epi_slide_mean(small_x, value, before = 50 * days_dt, names_sep = NULL, na.rm = TRUE) expect_equal(result1, expected_output %>% rename(slide_value_value = slide_value)) }) @@ -665,7 +637,7 @@ test_that("ungrouped epi_slide computation completes successfully", { small_x %>% ungroup() %>% epi_slide( - before = 2, + before = 2 * days_dt, slide_value = sum(.x$value) ) ) @@ -673,15 +645,15 @@ test_that("ungrouped epi_slide computation completes successfully", { test_that("basic ungrouped epi_slide computation produces expected output", { expected_output <- dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value = cumsum(11:15)) + dplyr::tibble(geo_value = "ak", time_value = test_date + 1:5, value = 11:15, slide_value = cumsum(11:15)) ) %>% - as_epi_df(as_of = d + 6) + as_epi_df(as_of = test_date + 6) result1 <- small_x %>% ungroup() %>% filter(geo_value == "ak") %>% epi_slide( - before = 50, + before = 50 * days_dt, slide_value = sum(.x$value) ) expect_equal(result1, expected_output) @@ -689,19 +661,19 @@ test_that("basic ungrouped epi_slide computation produces expected output", { # Ungrouped with multiple geos expected_output <- dplyr::bind_rows( dplyr::tibble( - geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value = cumsum(11:15) + cumsum(-(1:5)) + geo_value = "ak", time_value = test_date + 1:5, value = 11:15, slide_value = cumsum(11:15) + cumsum(-(1:5)) ), dplyr::tibble( - geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value = cumsum(11:15) + cumsum(-(1:5)) + geo_value = "al", time_value = test_date + 1:5, value = -(1:5), slide_value = cumsum(11:15) + cumsum(-(1:5)) ) ) %>% - as_epi_df(as_of = d + 6) %>% + as_epi_df(as_of = test_date + 6) %>% arrange(time_value) result2 <- small_x %>% ungroup() %>% epi_slide( - before = 50, + before = 50 * days_dt, slide_value = sum(.x$value) ) expect_equal(result2, expected_output) @@ -709,37 +681,37 @@ test_that("basic ungrouped epi_slide computation produces expected output", { test_that("basic ungrouped epi_slide_mean computation produces expected output", { expected_output <- dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value = cumsum(11:15) / 1:5), + dplyr::tibble(geo_value = "ak", time_value = test_date + 1:5, value = 11:15, slide_value = cumsum(11:15) / 1:5), ) %>% - as_epi_df(as_of = d + 6) + as_epi_df(as_of = test_date + 6) result1 <- small_x %>% ungroup() %>% filter(geo_value == "ak") %>% - epi_slide_mean(value, before = 50, names_sep = NULL, na.rm = TRUE) + epi_slide_mean(value, before = 50 * days_dt, names_sep = NULL, na.rm = TRUE) expect_equal(result1, expected_output %>% rename(slide_value_value = slide_value)) # Ungrouped with multiple geos # epi_slide_mean fails when input data groups contain duplicate time_values, # e.g. aggregating across geos expect_error( - small_x %>% ungroup() %>% epi_slide_mean(value, before = 6L), + small_x %>% ungroup() %>% epi_slide_mean(value, before = 6 * days_dt), class = "epiprocess__epi_slide_opt__duplicate_time_values" ) }) test_that("epi_slide computation via formula can use ref_time_value", { expected_output <- dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value = d + 1:5), - dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value = d + 1:5) + dplyr::tibble(geo_value = "ak", time_value = test_date + 1:5, value = 11:15, slide_value = test_date + 1:5), + dplyr::tibble(geo_value = "al", time_value = test_date + 1:5, value = -(1:5), slide_value = test_date + 1:5) ) %>% group_by(geo_value) %>% - as_epi_df(as_of = d + 6) + as_epi_df(as_of = test_date + 6) result1 <- small_x %>% epi_slide( f = ~.ref_time_value, - before = 50 + before = 50 * days_dt ) expect_equal(result1, expected_output) @@ -747,7 +719,7 @@ test_that("epi_slide computation via formula can use ref_time_value", { result2 <- small_x %>% epi_slide( f = ~.z, - before = 50 + before = 50 * days_dt ) expect_equal(result2, expected_output) @@ -755,40 +727,40 @@ test_that("epi_slide computation via formula can use ref_time_value", { result3 <- small_x %>% epi_slide( f = ~..3, - before = 50 + before = 50 * days_dt ) expect_equal(result3, expected_output) # Ungrouped with multiple geos expected_output <- dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value = d + 1:5), - dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value = d + 1:5) + dplyr::tibble(geo_value = "ak", time_value = test_date + 1:5, value = 11:15, slide_value = test_date + 1:5), + dplyr::tibble(geo_value = "al", time_value = test_date + 1:5, value = -(1:5), slide_value = test_date + 1:5) ) %>% - as_epi_df(as_of = d + 6) %>% + as_epi_df(as_of = test_date + 6) %>% arrange(time_value) result4 <- small_x %>% ungroup() %>% epi_slide( f = ~.ref_time_value, - before = 50 + before = 50 * days_dt ) expect_equal(result4, expected_output) }) test_that("epi_slide computation via function can use ref_time_value", { expected_output <- dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value = d + 1:5), - dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value = d + 1:5) + dplyr::tibble(geo_value = "ak", time_value = test_date + 1:5, value = 11:15, slide_value = test_date + 1:5), + dplyr::tibble(geo_value = "al", time_value = test_date + 1:5, value = -(1:5), slide_value = test_date + 1:5) ) %>% group_by(geo_value) %>% - as_epi_df(as_of = d + 6) + as_epi_df(as_of = test_date + 6) result1 <- small_x %>% epi_slide( f = function(x, g, t) t, - before = 2 + before = 2 * days_dt ) expect_equal(result1, expected_output) @@ -797,15 +769,15 @@ test_that("epi_slide computation via function can use ref_time_value", { test_that("epi_slide computation via dots can use ref_time_value and group", { # ref_time_value expected_output <- dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value = d + 1:5), - dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value = d + 1:5) + dplyr::tibble(geo_value = "ak", time_value = test_date + 1:5, value = 11:15, slide_value = test_date + 1:5), + dplyr::tibble(geo_value = "al", time_value = test_date + 1:5, value = -(1:5), slide_value = test_date + 1:5) ) %>% group_by(geo_value) %>% - as_epi_df(as_of = d + 6) + as_epi_df(as_of = test_date + 6) result1 <- small_x %>% epi_slide( - before = 50, + before = 50 * days_dt, slide_value = .ref_time_value ) @@ -815,22 +787,22 @@ test_that("epi_slide computation via dots can use ref_time_value and group", { # `.env`. expect_error(small_x %>% epi_slide( - before = 50, + before = 50 * days_dt, slide_value = .env$.ref_time_value )) # group_key # Use group_key column expected_output <- dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value = "ak"), - dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value = "al") + dplyr::tibble(geo_value = "ak", time_value = test_date + 1:5, value = 11:15, slide_value = "ak"), + dplyr::tibble(geo_value = "al", time_value = test_date + 1:5, value = -(1:5), slide_value = "al") ) %>% group_by(geo_value) %>% - as_epi_df(as_of = d + 6) + as_epi_df(as_of = test_date + 6) result3 <- small_x %>% epi_slide( - before = 2, + before = 2 * days_dt, slide_value = .group_key$geo_value ) @@ -838,15 +810,15 @@ test_that("epi_slide computation via dots can use ref_time_value and group", { # Use entire group_key object expected_output <- dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value = 1L), - dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value = 1L) + dplyr::tibble(geo_value = "ak", time_value = test_date + 1:5, value = 11:15, slide_value = 1L), + dplyr::tibble(geo_value = "al", time_value = test_date + 1:5, value = -(1:5), slide_value = 1L) ) %>% group_by(geo_value) %>% - as_epi_df(as_of = d + 6) + as_epi_df(as_of = test_date + 6) result4 <- small_x %>% epi_slide( - before = 2, + before = 2 * days_dt, slide_value = nrow(.group_key) ) @@ -854,16 +826,16 @@ test_that("epi_slide computation via dots can use ref_time_value and group", { # Ungrouped with multiple geos expected_output <- dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value = d + 1:5), - dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value = d + 1:5) + dplyr::tibble(geo_value = "ak", time_value = test_date + 1:5, value = 11:15, slide_value = test_date + 1:5), + dplyr::tibble(geo_value = "al", time_value = test_date + 1:5, value = -(1:5), slide_value = test_date + 1:5) ) %>% - as_epi_df(as_of = d + 6) %>% + as_epi_df(as_of = test_date + 6) %>% arrange(time_value) result5 <- small_x %>% ungroup() %>% epi_slide( - before = 50, + before = 50 * days_dt, slide_value = .ref_time_value ) expect_equal(result5, expected_output) @@ -872,14 +844,14 @@ test_that("epi_slide computation via dots can use ref_time_value and group", { test_that("epi_slide computation via dots outputs the same result using col names and the data var", { expected_output <- small_x %>% epi_slide( - before = 2, + before = 2 * days_dt, slide_value = max(time_value) ) %>% - as_epi_df(as_of = d + 6) + as_epi_df(as_of = test_date + 6) result1 <- small_x %>% epi_slide( - before = 2, + before = 2 * days_dt, slide_value = max(.x$time_value) ) @@ -887,7 +859,7 @@ test_that("epi_slide computation via dots outputs the same result using col name result2 <- small_x %>% epi_slide( - before = 2, + before = 2 * days_dt, slide_value = max(.data$time_value) ) @@ -896,7 +868,9 @@ test_that("epi_slide computation via dots outputs the same result using col name test_that("`epi_slide` can access objects inside of helper functions", { helper <- function(archive_haystack, time_value_needle) { - archive_haystack %>% epi_slide(has_needle = time_value_needle %in% time_value, before = 365000L) + archive_haystack %>% epi_slide( + has_needle = time_value_needle %in% time_value, before = 365000L * days_dt + ) } expect_error( helper(small_x, as.Date("2021-01-01")), @@ -906,45 +880,50 @@ test_that("`epi_slide` can access objects inside of helper functions", { test_that("basic slide behavior is correct when groups have non-overlapping date ranges", { small_x_misaligned_dates <- dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15), - dplyr::tibble(geo_value = "al", time_value = d + 151:155, value = -(1:5)) + dplyr::tibble(geo_value = "ak", time_value = test_date + 1:5, value = 11:15), + dplyr::tibble(geo_value = "al", time_value = test_date + 151:155, value = -(1:5)) ) %>% - as_epi_df(as_of = d + 6) %>% + as_epi_df(as_of = test_date + 6) %>% group_by(geo_value) expected_output <- dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value = cumsum(11:15) / 1:5), - dplyr::tibble(geo_value = "al", time_value = d + 151:155, value = -(1:5), slide_value = cumsum(-(1:5)) / 1:5) + dplyr::tibble(geo_value = "ak", time_value = test_date + 1:5, value = 11:15, slide_value = cumsum(11:15) / 1:5), + dplyr::tibble( + geo_value = "al", time_value = test_date + 151:155, value = -(1:5), slide_value = cumsum(-(1:5)) / 1:5 + ) ) %>% group_by(geo_value) %>% - as_epi_df(as_of = d + 6) + as_epi_df(as_of = test_date + 6) - result1 <- epi_slide(small_x_misaligned_dates, f = ~ mean(.x$value), before = 50) + result1 <- epi_slide(small_x_misaligned_dates, f = ~ mean(.x$value), before = 50 * days_dt) expect_equal(result1, expected_output) - result2 <- epi_slide_mean(small_x_misaligned_dates, value, before = 50, names_sep = NULL, na.rm = TRUE) + result2 <- epi_slide_mean( + small_x_misaligned_dates, value, + before = 50 * days_dt, names_sep = NULL, na.rm = TRUE + ) expect_equal(result2, expected_output %>% rename(slide_value_value = slide_value)) }) test_that("epi_slide gets correct ref_time_value when groups have non-overlapping date ranges", { small_x_misaligned_dates <- dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15), - dplyr::tibble(geo_value = "al", time_value = d + 151:155, value = -(1:5)) + dplyr::tibble(geo_value = "ak", time_value = test_date + 1:5, value = 11:15), + dplyr::tibble(geo_value = "al", time_value = test_date + 151:155, value = -(1:5)) ) %>% - as_epi_df(as_of = d + 6) %>% + as_epi_df(as_of = test_date + 6) %>% group_by(geo_value) expected_output <- dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value = d + 1:5), - dplyr::tibble(geo_value = "al", time_value = d + 151:155, value = -(1:5), slide_value = d + 151:155) + dplyr::tibble(geo_value = "ak", time_value = test_date + 1:5, value = 11:15, slide_value = test_date + 1:5), + dplyr::tibble(geo_value = "al", time_value = test_date + 151:155, value = -(1:5), slide_value = test_date + 151:155) ) %>% group_by(geo_value) %>% - as_epi_df(as_of = d + 6) + as_epi_df(as_of = test_date + 6) result1 <- small_x_misaligned_dates %>% epi_slide( - before = 50, + before = 50 * days_dt, slide_value = .ref_time_value ) @@ -952,7 +931,7 @@ test_that("epi_slide gets correct ref_time_value when groups have non-overlappin }) test_that("results for different `before`s and `after`s match between epi_slide and epi_slide_mean", { - test_time_type_mean <- function(dates, vals, before = 6L, after = 0L, n, m, n_obs, k, ...) { + test_time_type_mean <- function(dates, vals, before = 6 * days_dt, after = 0 * days_dt, n, m, n_obs, k, ...) { # Three states, with 2 variables. a is linear, going up in one state and down in the other # b is just random. last (m-1):(n-1) dates are missing epi_data <- epiprocess::as_epi_df(rbind(tibble( @@ -975,10 +954,7 @@ test_that("results for different `before`s and `after`s match between epi_slide ), before = before, after = after, names_sep = NULL, ... ) - result2 <- epi_slide_mean(epi_data, - col_names = c(a, b), na.rm = TRUE, - before = before, after = after, ... - ) + result2 <- epi_slide_mean(epi_data, col_names = c(a, b), na.rm = TRUE, before = before, after = after, ...) expect_equal(result1, result2) } @@ -994,12 +970,12 @@ test_that("results for different `before`s and `after`s match between epi_slide # Basic time type days <- as.Date("2022-01-01") + k - test_time_type_mean(days, rand_vals, before = 6, after = 0, n = n, m = m, n_obs = n_obs, k = k) - test_time_type_mean(days, rand_vals, before = 6, after = 1, n = n, m = m, n_obs = n_obs, k = k) - test_time_type_mean(days, rand_vals, before = 6, after = 6, n = n, m = m, n_obs = n_obs, k = k) - test_time_type_mean(days, rand_vals, before = 1, after = 6, n = n, m = m, n_obs = n_obs, k = k) - test_time_type_mean(days, rand_vals, before = 0, after = 6, n = n, m = m, n_obs = n_obs, k = k) - test_time_type_mean(days, rand_vals, before = 0, after = 1, n = n, m = m, n_obs = n_obs, k = k) + test_time_type_mean(days, rand_vals, before = 6 * days_dt, n = n, m = m, n_obs = n_obs, k = k) + test_time_type_mean(days, rand_vals, before = 6 * days_dt, after = 1 * days_dt, n = n, m = m, n_obs = n_obs, k = k) + test_time_type_mean(days, rand_vals, before = 6 * days_dt, after = 6 * days_dt, n = n, m = m, n_obs = n_obs, k = k) + test_time_type_mean(days, rand_vals, before = 1 * days_dt, after = 6 * days_dt, n = n, m = m, n_obs = n_obs, k = k) + test_time_type_mean(days, rand_vals, after = 6 * days_dt, n = n, m = m, n_obs = n_obs, k = k) + test_time_type_mean(days, rand_vals, after = 1 * days_dt, n = n, m = m, n_obs = n_obs, k = k) # Without any missing dates n <- 15 # Max date index @@ -1011,12 +987,12 @@ test_that("results for different `before`s and `after`s match between epi_slide # Basic time type days <- as.Date("2022-01-01") + k - test_time_type_mean(days, rand_vals, before = 6, after = 0, n = n, m = m, n_obs = n_obs, k = k) - test_time_type_mean(days, rand_vals, before = 6, after = 1, n = n, m = m, n_obs = n_obs, k = k) - test_time_type_mean(days, rand_vals, before = 6, after = 6, n = n, m = m, n_obs = n_obs, k = k) - test_time_type_mean(days, rand_vals, before = 1, after = 6, n = n, m = m, n_obs = n_obs, k = k) - test_time_type_mean(days, rand_vals, before = 0, after = 6, n = n, m = m, n_obs = n_obs, k = k) - test_time_type_mean(days, rand_vals, before = 0, after = 1, n = n, m = m, n_obs = n_obs, k = k) + test_time_type_mean(days, rand_vals, before = 6 * days_dt, n = n, m = m, n_obs = n_obs, k = k) + test_time_type_mean(days, rand_vals, before = 6 * days_dt, after = 1 * days_dt, n = n, m = m, n_obs = n_obs, k = k) + test_time_type_mean(days, rand_vals, before = 6 * days_dt, after = 6 * days_dt, n = n, m = m, n_obs = n_obs, k = k) + test_time_type_mean(days, rand_vals, before = 1 * days_dt, after = 6 * days_dt, n = n, m = m, n_obs = n_obs, k = k) + test_time_type_mean(days, rand_vals, after = 6 * days_dt, n = n, m = m, n_obs = n_obs, k = k) + test_time_type_mean(days, rand_vals, after = 1 * days_dt, n = n, m = m, n_obs = n_obs, k = k) }) test_that("results for different time_types match between epi_slide and epi_slide_mean", { @@ -1047,26 +1023,11 @@ test_that("results for different time_types match between epi_slide and epi_slid )), ...) } - # Basic time type + # Basic time type, require before and after in difftimes days <- as.Date("2022-01-01") + k - - # Require lubridate::period function to be passed as `time_step` - day_times_minute <- lubridate::ydm_h("2022-01-01-15") + lubridate::minutes(k) # needs time_step = lubridate::minutes - day_times_hour <- lubridate::ydm_h("2022-01-01-15") + lubridate::hours(k) # needs time_step = lubridate::hours - weeks <- as.Date("2022-01-01") + 7L * k # needs time_step = lubridate::weeks - - # Don't require a `time_step` fn - yearweeks <- tsibble::yearweek(10L + k) + weeks <- as.Date("2022-01-01") + 7L * k yearmonths <- tsibble::yearmonth(10L + k) - yearquarters <- tsibble::yearquarter(10L + k) - years <- 2000L + k # does NOT need time_step = lubridate::years because dates are numeric, not a special date format - - # Not supported - custom_dates <- c( - "January 1, 2022", "January 2, 2022", "January 3, 2022", - "January 4, 2022", "January 5, 2022", "January 6, 2022" - ) - not_dates <- c("a", "b", "c", "d", "e", "f") + integers <- 2000L + k ref_epi_data <- generate_special_date_data(days) %>% group_by(geo_value) @@ -1075,10 +1036,10 @@ test_that("results for different time_types match between epi_slide and epi_slid slide_value_a = mean(.x$a, rm.na = TRUE), slide_value_b = mean(.x$b, rm.na = TRUE) ), - before = 6L, after = 0L, names_sep = NULL + before = 6 * days_dt, names_sep = NULL ) - test_time_type_mean <- function(dates, before = 6L, after = 0L, ...) { + test_time_type_mean <- function(dates, before) { # Three states, with 2 variables. a is linear, going up in one state and down in the other # b is just random. date 10 is missing epi_data <- generate_special_date_data(dates) %>% @@ -1088,11 +1049,10 @@ test_that("results for different time_types match between epi_slide and epi_slid slide_value_a = mean(.x$a, rm.na = TRUE), slide_value_b = mean(.x$b, rm.na = TRUE) ), - before = before, after = after, names_sep = NULL, ... + before = before, names_sep = NULL ) result2 <- epi_slide_mean(epi_data, - col_names = c(a, b), na.rm = TRUE, - before = before, after = after, ... + col_names = c(a, b), na.rm = TRUE, before = before ) expect_equal(result1, result2) @@ -1101,14 +1061,10 @@ test_that("results for different time_types match between epi_slide and epi_slid expect_equal(select(ref_result, -time_value), select(result2, -time_value)) } - test_time_type_mean(days) - test_time_type_mean(yearweeks) - test_time_type_mean(yearmonths) - test_time_type_mean(yearquarters) - test_time_type_mean(years) - test_time_type_mean(day_times_minute, time_step = lubridate::minutes) - test_time_type_mean(day_times_hour, time_step = lubridate::hours) - test_time_type_mean(weeks, time_step = lubridate::weeks) + test_time_type_mean(days, before = 6 * days_dt) + test_time_type_mean(weeks, before = 6 * weeks_dt) + test_time_type_mean(yearmonths, before = 6) + test_time_type_mean(integers, before = 6) # `epi_slide_mean` can also handle `weeks` without `time_step` being # provided, but `epi_slide` can't @@ -1116,47 +1072,11 @@ test_that("results for different time_types match between epi_slide and epi_slid group_by(geo_value) result2 <- epi_slide_mean(epi_data, col_names = c(a, b), na.rm = TRUE, - before = 6L, after = 0L + before = 6 * weeks_dt ) expect_equal(select(ref_result, -time_value), select(result2, -time_value)) }) -test_that("special time_types without time_step fail in epi_slide_mean", { - n_obs <- 6 - k <- 1:n_obs - - day_times_minute <- lubridate::ydm_h("2022-01-01-15") + lubridate::minutes(k) # needs time_step = lubridate::minutes - day_times_hour <- lubridate::ydm_h("2022-01-01-15") + lubridate::hours(k) # needs time_step = lubridate::hours - - # Not supported - custom_dates <- c( - "January 1, 2022", "January 2, 2022", "January 3, 2022", - "January 4, 2022", "January 5, 2022", "January 6, 2022" - ) - not_dates <- c("a", "b", "c", "d", "e", "f") - - test_time_type_mean <- function(dates, before = 6L, after = 0L, ...) { - epi_data <- epiprocess::as_epi_df(tibble( - geo_value = "al", - time_value = dates, - a = 1:n_obs - )) - - expect_error( - epi_slide_mean(epi_data, - col_names = a, - before = before, after = after - ), - class = "epiprocess__full_date_seq__unmappable_time_type" - ) - } - - test_time_type_mean(custom_dates) - test_time_type_mean(not_dates) - test_time_type_mean(day_times_minute) - test_time_type_mean(day_times_hour) -}) - test_that("helper `full_date_seq` returns expected date values", { n <- 6L # Max date index m <- 1L # Number of missing dates @@ -1185,19 +1105,11 @@ test_that("helper `full_date_seq` returns expected date values", { )), ...) } - # Basic time type + # Basic time type, require before and after in difftimes days <- as.Date("2022-01-01") + k - - # Require lubridate::period function to be passed as `time_step` - day_times_minute <- lubridate::ydm_h("2022-01-01-15") + lubridate::minutes(k) # needs time_step = lubridate::minutes - day_times_hour <- lubridate::ydm_h("2022-01-01-15") + lubridate::hours(k) # needs time_step = lubridate::hours - weeks <- as.Date("2022-01-01") + 7L * k # needs time_step = lubridate::weeks - - # Don't require a `time_step` fn - yearweeks <- tsibble::yearweek(10L + k) + weeks <- as.Date("2022-01-01") + 7L * k yearmonths <- tsibble::yearmonth(10L + k) - yearquarters <- tsibble::yearquarter(10L + k) - years <- 2000L + k # does NOT need time_step = lubridate::years because dates are numeric, not a special date format + integers <- 2000L + k before <- 2L after <- 1L @@ -1205,7 +1117,7 @@ test_that("helper `full_date_seq` returns expected date values", { expect_identical( full_date_seq( generate_special_date_data(days), - before = before, after = after + before = before * days_dt, after = after * days_dt, time_type = "day" ), list( all_dates = as.Date(c( @@ -1217,21 +1129,18 @@ test_that("helper `full_date_seq` returns expected date values", { ) ) expect_identical( - full_date_seq( - generate_special_date_data(yearweeks), - before = before, after = after - ), + full_date_seq(generate_special_date_data(weeks), before = before, after = after, time_type = "week"), list( - all_dates = tsibble::yearweek(10:16), - pad_early_dates = tsibble::yearweek(8:9), - pad_late_dates = tsibble::yearweek(17) + all_dates = as.Date(c( + "2022-01-01", "2022-01-08", "2022-01-15", "2022-01-22", + "2022-01-29", "2022-02-05", "2022-02-12" + )), + pad_early_dates = as.Date(c("2021-12-18", "2021-12-25")), + pad_late_dates = as.Date(c("2022-02-19")) ) ) expect_identical( - full_date_seq( - generate_special_date_data(yearmonths), - before = before, after = after - ), + full_date_seq(generate_special_date_data(yearmonths), before = before, after = after, time_type = "yearmonth"), list( all_dates = tsibble::yearmonth(10:16), pad_early_dates = tsibble::yearmonth(8:9), @@ -1239,81 +1148,13 @@ test_that("helper `full_date_seq` returns expected date values", { ) ) expect_identical( - full_date_seq( - generate_special_date_data(yearquarters), - before = before, after = after - ), - list( - all_dates = tsibble::yearquarter(10:16), - pad_early_dates = tsibble::yearquarter(8:9), - pad_late_dates = tsibble::yearquarter(17) - ) - ) - expect_identical( - full_date_seq( - generate_special_date_data(years), - before = before, after = after - ), + full_date_seq(generate_special_date_data(integers), before = before, after = after, time_type = "integer"), list( all_dates = 2000L:2006L, pad_early_dates = 1998L:1999L, pad_late_dates = 2007L ) ) - expect_identical( - full_date_seq( - generate_special_date_data(day_times_minute), - before = before, after = after, - time_step = lubridate::minutes - ), - list( - all_dates = lubridate::ydm_h("2022-01-01-15") + lubridate::minutes(0:6), - pad_early_dates = lubridate::ydm_h("2022-01-01-15") - lubridate::minutes(2:1), - pad_late_dates = lubridate::ydm_h("2022-01-01-15") + lubridate::minutes(7) - ) - ) - expect_identical( - full_date_seq( - generate_special_date_data(day_times_hour), - before = before, after = after, - time_step = lubridate::hours - ), - list( - all_dates = lubridate::ydm_h("2022-01-01-15") + lubridate::hours(0:6), - pad_early_dates = lubridate::ydm_h("2022-01-01-15") - lubridate::hours(2:1), - pad_late_dates = lubridate::ydm_h("2022-01-01-15") + lubridate::hours(7) - ) - ) - expect_identical( - full_date_seq( - generate_special_date_data(weeks), - before = before, after = after, - time_step = lubridate::weeks - ), - list( - all_dates = as.Date(c( - "2022-01-01", "2022-01-08", "2022-01-15", "2022-01-22", - "2022-01-29", "2022-02-05", "2022-02-12" - )), - pad_early_dates = as.Date(c("2021-12-18", "2021-12-25")), - pad_late_dates = as.Date(c("2022-02-19")) - ) - ) - # Check the middle branch (`if (missing(time_step))`) of `full_date_seq`. - expect_identical( - full_date_seq( - generate_special_date_data(weeks, time_type = "week"), - before = before, after = after - ), - list( - all_dates = as.Date(c( - "2022-01-01", "2022-01-08", "2022-01-15", "2022-01-22", - "2022-01-29", "2022-02-05", "2022-02-12" - )), - pad_early_dates = as.Date(c("2021-12-18", "2021-12-25")), - pad_late_dates = as.Date(c("2022-02-19")) - ) - ) # Other before/after values before <- 5L @@ -1322,7 +1163,7 @@ test_that("helper `full_date_seq` returns expected date values", { expect_identical( full_date_seq( generate_special_date_data(days), - before = before, after = after + before = before * days_dt, after = after * days_dt, time_type = "day" ), list( all_dates = as.Date(c( @@ -1343,7 +1184,7 @@ test_that("helper `full_date_seq` returns expected date values", { expect_identical( full_date_seq( generate_special_date_data(days), - before = before, after = after + before = before * days_dt, after = after * days_dt, time_type = "day" ), list( all_dates = as.Date(c( @@ -1358,49 +1199,44 @@ test_that("helper `full_date_seq` returns expected date values", { ) }) -test_that("`epi_slide_mean` errors when passed `time_values` with closer than expected spacing", { - time_df <- tibble( - geo_value = 1, - value = c(0:7, 3.5, 10, 20), - # Adding the value 3.5 creates a time that has fractional seconds, which - # doesn't follow the expected 1-second spacing of the `time_values`. - # This leads to `frollmean` using obs spanning less than the expected - # time frame for some computation windows. - time_value = Sys.time() + value - ) %>% - as_epi_df() - expect_error( - epi_slide_mean(time_df, value, before = 6L, time_step = lubridate::seconds), - class = "epiprocess__epi_slide_opt__unexpected_row_number" - ) -}) - test_that("epi_slide_mean produces same output as epi_slide_opt", { - result1 <- epi_slide_mean(small_x, value, before = 50, names_sep = NULL, na.rm = TRUE) - result2 <- epi_slide_opt(small_x, value, + result1 <- epi_slide_mean( + small_x, + value, + before = 50 * days_dt, + names_sep = NULL, + na.rm = TRUE + ) + result2 <- epi_slide_opt( + small_x, + value, f = data.table::frollmean, - before = 50, names_sep = NULL, na.rm = TRUE + before = 50 * days_dt, + names_sep = NULL, + na.rm = TRUE ) expect_equal(result1, result2) - - result3 <- epi_slide_opt(small_x, value, + result3 <- epi_slide_opt( + small_x, + value, f = slider::slide_mean, - before = 50, names_sep = NULL, na_rm = TRUE + before = 50 * days_dt, + names_sep = NULL, + na_rm = TRUE ) expect_equal(result1, result3) }) test_that("epi_slide_sum produces same output as epi_slide_opt", { - result1 <- epi_slide_sum(small_x, value, before = 50, names_sep = NULL, na.rm = TRUE) + result1 <- epi_slide_sum(small_x, value, before = 50 * days_dt, names_sep = NULL, na.rm = TRUE) result2 <- epi_slide_opt(small_x, value, f = data.table::frollsum, - before = 50, names_sep = NULL, na.rm = TRUE + before = 50 * days_dt, names_sep = NULL, na.rm = TRUE ) expect_equal(result1, result2) - result3 <- epi_slide_opt(small_x, value, f = slider::slide_sum, - before = 50, names_sep = NULL, na_rm = TRUE + before = 50 * days_dt, names_sep = NULL, na_rm = TRUE ) expect_equal(result1, result3) }) @@ -1410,31 +1246,29 @@ test_that("`epi_slide_opt` errors when passed non-`data.table`, non-`slider` fun epi_slide_opt( grouped, col_names = value, f = data.table::frollmean, - before = 1L, after = 0L, ref_time_values = d + 1 + before = days_dt, ref_time_values = test_date + 1 ) ) expect_no_error( epi_slide_opt( grouped, col_names = value, f = slider::slide_min, - before = 1L, after = 0L, ref_time_values = d + 1 + before = days_dt, ref_time_values = test_date + 1 ) ) - reexport_frollmean <- data.table::frollmean expect_no_error( epi_slide_opt( grouped, col_names = value, f = reexport_frollmean, - before = 1L, after = 0L, ref_time_values = d + 1 + before = days_dt, ref_time_values = test_date + 1 ) ) - expect_error( epi_slide_opt( grouped, col_names = value, f = mean, - before = 1L, after = 0L, ref_time_values = d + 1 + before = days_dt, ref_time_values = test_date + 1 ), class = "epiprocess__epi_slide_opt__unsupported_slide_function" ) diff --git a/tests/testthat/test-epix_fill_through_version.R b/tests/testthat/test-epix_fill_through_version.R index b87b26ed..7da6c6be 100644 --- a/tests/testthat/test-epix_fill_through_version.R +++ b/tests/testthat/test-epix_fill_through_version.R @@ -1,9 +1,11 @@ +test_date <- as.Date("2020-01-01") + test_that("epix_fill_through_version mirrors input when it is sufficiently up to date", { ea_orig <- as_epi_archive(data.table::data.table( - geo_value = "g1", time_value = as.Date("2020-01-01"), - version = 1:5, value = 1:5 + geo_value = "ak", time_value = test_date, + version = test_date + 1:5, value = 1:5 )) - some_earlier_observed_version <- 2L + some_earlier_observed_version <- test_date + 2L ea_trivial_fill_na1 <- epix_fill_through_version(ea_orig, some_earlier_observed_version, "na") ea_trivial_fill_na2 <- epix_fill_through_version(ea_orig, ea_orig$versions_end, "na") ea_trivial_fill_locf <- epix_fill_through_version(ea_orig, some_earlier_observed_version, "locf") @@ -15,13 +17,13 @@ test_that("epix_fill_through_version mirrors input when it is sufficiently up to test_that("epix_fill_through_version can extend observed versions, gives expected `as_of`s", { ea_orig <- as_epi_archive(data.table::data.table( - geo_value = "g1", - time_value = as.Date("2020-01-01") + c(rep(0L, 5L), 1L), - version = c(1:5, 2L), + geo_value = "ak", + time_value = test_date + c(rep(0L, 5L), 1L), + version = test_date + c(1:5, 2L), value = 1:6 )) - first_unobserved_version <- 6L - later_unobserved_version <- 10L + first_unobserved_version <- test_date + 6L + later_unobserved_version <- test_date + 10L ea_fill_na <- epix_fill_through_version(ea_orig, later_unobserved_version, "na") ea_fill_locf <- epix_fill_through_version(ea_orig, later_unobserved_version, "locf") @@ -29,7 +31,7 @@ test_that("epix_fill_through_version can extend observed versions, gives expecte { expect_identical(ea_fill_na$versions_end, later_unobserved_version) expect_identical(tibble::as_tibble(epix_as_of(ea_fill_na, first_unobserved_version)), - tibble::tibble(geo_value = "g1", time_value = as.Date("2020-01-01") + 0:1, value = rep(NA_integer_, 2L)), + tibble::tibble(geo_value = "ak", time_value = test_date + 0:1, value = rep(NA_integer_, 2L)), ignore_attr = TRUE ) expect_identical(ea_fill_locf$versions_end, later_unobserved_version) @@ -50,38 +52,39 @@ test_that("epix_fill_through_version does not mutate x", { for (ea_orig in list( # vanilla case as_epi_archive(data.table::data.table( - geo_value = "g1", time_value = as.Date("2020-01-01"), - version = 1:5, value = 1:5 + geo_value = "ak", time_value = test_date, + version = test_date + 1:5, value = 1:5 )), # data.table unique yielding original DT by reference special case (maybe # having only 1 row is the trigger? having no revisions of initial values # doesn't seem sufficient to trigger) - as_epi_archive(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, value = 10L)) + as_epi_archive(tibble::tibble(geo_value = "ak", time_value = test_date + 1, version = test_date + 1, value = 10L)) )) { ea_orig_before <- clone(ea_orig) - some_unobserved_version <- 8L - ea_fill_na <- epix_fill_through_version(ea_orig, some_unobserved_version, "na") + ea_fill_na <- epix_fill_through_version(ea_orig, test_date + 8, "na") expect_identical(ea_orig_before, ea_orig) - ea_fill_locf <- epix_fill_through_version(ea_orig, some_unobserved_version, "locf") + ea_fill_locf <- epix_fill_through_version(ea_orig, test_date + 8, "locf") expect_identical(ea_orig_before, ea_orig) } }) test_that("epix_fill_through_version return with expected visibility", { ea <- as_epi_archive(data.table::data.table( - geo_value = "g1", time_value = as.Date("2020-01-01"), - version = 1:5, value = 1:5 + geo_value = "ak", time_value = test_date, + version = test_date + 1:5, value = 1:5 )) - expect_true(withVisible(epix_fill_through_version(ea, 10L, "na"))[["visible"]]) + expect_true(withVisible(epix_fill_through_version(ea, test_date + 10L, "na"))[["visible"]]) }) test_that("epix_fill_through_version returns same key & doesn't mutate old DT or its key", { - ea <- as_epi_archive(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, value = 10L)) + ea <- as_epi_archive( + tibble::tibble(geo_value = "ak", time_value = test_date + 1, version = test_date + 1, value = 10L) + ) old_dt_copy <- data.table::copy(ea$DT) old_key <- data.table::key(ea$DT) - expect_identical(data.table::key(epix_fill_through_version(ea, 5L, "na")$DT), old_key) - expect_identical(data.table::key(epix_fill_through_version(ea, 5L, "locf")$DT), old_key) + expect_identical(data.table::key(epix_fill_through_version(ea, test_date + 5L, "na")$DT), old_key) + expect_identical(data.table::key(epix_fill_through_version(ea, test_date + 5L, "locf")$DT), old_key) expect_identical(data.table::key(ea$DT), old_key) }) diff --git a/tests/testthat/test-epix_merge.R b/tests/testthat/test-epix_merge.R index c29301b8..d336622f 100644 --- a/tests/testthat/test-epix_merge.R +++ b/tests/testthat/test-epix_merge.R @@ -1,3 +1,5 @@ +test_date <- as.Date("2020-01-01") + test_that("epix_merge requires forbids on invalid `y`", { ea <- archive_cases_dv_subset expect_error(epix_merge(ea, data.frame(x = 1))) @@ -9,20 +11,20 @@ test_that("epix_merge merges and carries forward updates properly", { tibble::tribble( ~geo_value, ~time_value, ~version, ~x_value, # same version set for x and y - "g1", 1L, 1:3, paste0("XA", 1:3), + "ak", test_date + 1, test_date + 1:3, paste0("XA", 1:3), # versions of x surround those of y + this measurement has # max update version beyond some others - "g1", 2L, 1:5, paste0("XB", 1:5), + "ak", test_date + 2, test_date + 1:5, paste0("XB", 1:5), # mirror case - "g1", 3L, 2L, paste0("XC", 2L), + "ak", test_date + 3, test_date + 2L, paste0("XC", 2L), # x has 1 version, y has 0 - "g1", 4L, 1L, paste0("XD", 1L), + "ak", test_date + 4, test_date + 1L, paste0("XD", 1L), # non-NA values that should be carried forward # (version-wise LOCF) in other versions, plus NAs that # should (similarly) be carried forward as NA (latter # wouldn't work with an ordinary merge + post-processing # with `data.table::nafill`) - "g1", 6L, c(1L, 3L, 5L), paste0("XE", c(1L, NA, 5L)) + "ak", test_date + 6, test_date + c(1L, 3L, 5L), paste0("XE", c(1L, NA, 5L)) ) %>% tidyr::unchop(c(version, x_value)) %>% dplyr::mutate(dplyr::across(c(x_value), ~ dplyr::if_else(grepl("NA", .x), NA_character_, .x))) @@ -32,11 +34,11 @@ test_that("epix_merge merges and carries forward updates properly", { data.table::as.data.table( tibble::tribble( ~geo_value, ~time_value, ~version, ~y_value, - "g1", 1L, 1:3, paste0("YA", 1:3), - "g1", 2L, 2L, paste0("YB", 2L), - "g1", 3L, 1:5, paste0("YC", 1:5), - "g1", 5L, 1L, paste0("YD", 1L), - "g1", 6L, 1:5, paste0("YE", 1:5), + "ak", test_date + 1, test_date + 1:3, paste0("YA", 1:3), + "ak", test_date + 2, test_date + 2L, paste0("YB", 2L), + "ak", test_date + 3, test_date + 1:5, paste0("YC", 1:5), + "ak", test_date + 5, test_date + 1L, paste0("YD", 1L), + "ak", test_date + 6, test_date + 1:5, paste0("YE", 1:5), ) %>% tidyr::unchop(c(version, y_value)) %>% dplyr::mutate(dplyr::across(c(y_value), ~ dplyr::if_else(grepl("NA", .x), NA_character_, .x))) @@ -47,12 +49,12 @@ test_that("epix_merge merges and carries forward updates properly", { data.table::as.data.table( tibble::tribble( ~geo_value, ~time_value, ~version, ~x_value, ~y_value, - "g1", 1L, 1:3, paste0("XA", 1:3), paste0("YA", 1:3), - "g1", 2L, 1:5, paste0("XB", 1:5), paste0("YB", c(NA, 2L, 2L, 2L, 2L)), - "g1", 3L, 1:5, paste0("XC", c(NA, 2L, 2L, 2L, 2L)), paste0("YC", 1:5), - "g1", 4L, 1L, paste0("XD", 1L), paste0("YD", NA), - "g1", 5L, 1L, paste0("XD", NA), paste0("YD", 1L), - "g1", 6L, 1:5, paste0("XE", c(1L, 1L, NA, NA, 5L)), paste0("YE", 1:5), + "ak", test_date + 1, test_date + 1:3, paste0("XA", 1:3), paste0("YA", 1:3), + "ak", test_date + 2, test_date + 1:5, paste0("XB", 1:5), paste0("YB", c(NA, 2L, 2L, 2L, 2L)), + "ak", test_date + 3, test_date + 1:5, paste0("XC", c(NA, 2L, 2L, 2L, 2L)), paste0("YC", 1:5), + "ak", test_date + 4, test_date + 1L, paste0("XD", 1L), paste0("YD", NA), + "ak", test_date + 5, test_date + 1L, paste0("XD", NA), paste0("YD", 1L), + "ak", test_date + 6, test_date + 1:5, paste0("XE", c(1L, 1L, NA, NA, 5L)), paste0("YE", 1:5), ) %>% tidyr::unchop(c(version, x_value, y_value)) %>% dplyr::mutate(dplyr::across(c(x_value, y_value), ~ dplyr::if_else(grepl("NA", .x), NA_character_, .x))) @@ -65,39 +67,41 @@ test_that("epix_merge merges and carries forward updates properly", { test_that("epix_merge forbids and warns on metadata and naming issues", { expect_error( epix_merge( - as_epi_archive(tibble::tibble(geo_value = "tx", time_value = 1L, version = 1L, x_value = 1L)), - as_epi_archive(tibble::tibble(geo_value = "us", time_value = 1L, version = 5L, y_value = 2L)) + as_epi_archive(tibble::tibble(geo_value = "tx", time_value = test_date, version = test_date + 1L, x_value = 1L)), + as_epi_archive(tibble::tibble(geo_value = "us", time_value = test_date, version = test_date + 5L, y_value = 2L)) ), regexp = "must have the same.*geo_type" ) expect_error( epix_merge( - as_epi_archive(tibble::tibble(geo_value = "pa", time_value = 1L, version = 1L, x_value = 1L)), - as_epi_archive(tibble::tibble(geo_value = "pa", time_value = as.Date("2020-01-01"), version = 5L, y_value = 2L)) + as_epi_archive(tibble::tibble(geo_value = "pa", time_value = test_date, version = test_date + 1L, x_value = 1L)), + as_epi_archive( + tibble::tibble(geo_value = "pa", time_value = 1L, version = 2L, y_value = 2L) + ) ), - regexp = "must have the same.*time_type" + regexp = "must share data type on their `time_value` column." ) expect_error( epix_merge( - as_epi_archive(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, value = 1L)), - as_epi_archive(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, value = 2L)) + as_epi_archive(tibble::tibble(geo_value = "ak", time_value = test_date, version = test_date + 1L, value = 1L)), + as_epi_archive(tibble::tibble(geo_value = "ak", time_value = test_date, version = test_date + 1L, value = 2L)) ), regexp = "overlapping.*names" ) expect_warning( epix_merge( - as_epi_archive(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, x_value = 1L), + as_epi_archive(tibble::tibble(geo_value = "ak", time_value = test_date, version = test_date + 1L, x_value = 1L), additional_metadata = list("updates_fetched" = lubridate::ymd_hms("2022-05-01 16:00:00", tz = "UTC")) ), - as_epi_archive(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, y_value = 2L)) + as_epi_archive(tibble::tibble(geo_value = "ak", time_value = test_date, version = test_date + 1L, y_value = 2L)) ), regexp = "x\\$additional_metadata", class = "epiprocess__epix_merge_ignores_additional_metadata" ) expect_warning( epix_merge( - as_epi_archive(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, x_value = 1L)), - as_epi_archive(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, y_value = 2L), + as_epi_archive(tibble::tibble(geo_value = "ak", time_value = test_date, version = test_date + 1L, x_value = 1L)), + as_epi_archive(tibble::tibble(geo_value = "ak", time_value = test_date, version = test_date + 1L, y_value = 2L), additional_metadata = list("updates_fetched" = lubridate::ymd_hms("2022-05-01 16:00:00", tz = "UTC")) ) ), @@ -109,30 +113,30 @@ test_that("epix_merge forbids and warns on metadata and naming issues", { # use `local` to prevent accidentally using the x, y, xy bindings here # elsewhere, while allowing reuse across a couple tests local({ - x <- as_epi_archive(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, x_value = 1L), - clobberable_versions_start = 1L, versions_end = 10L + x <- as_epi_archive(tibble::tibble(geo_value = "ak", time_value = test_date, version = test_date + 1L, x_value = 1L), + clobberable_versions_start = test_date + 1L, versions_end = test_date + 10L ) - y <- as_epi_archive(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, y_value = 2L), - clobberable_versions_start = 3L, versions_end = 10L + y <- as_epi_archive(tibble::tibble(geo_value = "ak", time_value = test_date, version = test_date + 1L, y_value = 2L), + clobberable_versions_start = test_date + 3L, versions_end = test_date + 10L ) xy <- epix_merge(x, y) test_that("epix_merge considers partially-clobberable row to be clobberable", { - expect_identical(xy$clobberable_versions_start, 1L) + expect_identical(xy$clobberable_versions_start, test_date + 1L) }) test_that("epix_merge result uses versions_end metadata not max version val", { - expect_identical(xy$versions_end, 10L) + expect_identical(xy$versions_end, test_date + 10L) }) }) local({ x <- as_epi_archive( - tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, x_value = 10L), - clobberable_versions_start = 1L, - versions_end = 3L + tibble::tibble(geo_value = "ak", time_value = test_date, version = test_date + 1L, x_value = 10L), + clobberable_versions_start = test_date + 1L, + versions_end = test_date + 3L ) y <- as_epi_archive( - tibble::tibble(geo_value = 1L, time_value = 1L, version = 5L, y_value = 20L), - clobberable_versions_start = 1L + tibble::tibble(geo_value = "ak", time_value = test_date, version = test_date + 5L, y_value = 20L), + clobberable_versions_start = test_date + 1L ) test_that('epix_merge forbids on sync default or "forbid"', { expect_error(epix_merge(x, y), @@ -147,12 +151,12 @@ local({ epix_merge(x, y, sync = "na"), as_epi_archive(tibble::tribble( ~geo_value, ~time_value, ~version, ~x_value, ~y_value, - 1L, 1L, 1L, 10L, NA_integer_, # x updated, y not observed yet - 1L, 1L, 4L, NA_integer_, NA_integer_, # NA-ing out x, y not observed yet - 1L, 1L, 5L, NA_integer_, 20L, # x still NA, y updated + "ak", test_date, test_date + 1L, 10L, NA_integer_, # x updated, y not observed yet + "ak", test_date, test_date + 4L, NA_integer_, NA_integer_, # NA-ing out x, y not observed yet + "ak", test_date, test_date + 5L, NA_integer_, 20L, # x still NA, y updated # (we should not have a y vals -> NA update here; version 5 should be # the `versions_end` of the result) - ), clobberable_versions_start = 1L) + ), clobberable_versions_start = test_date + 1L) ) }) test_that('epix_merge sync="locf" works', { @@ -160,9 +164,9 @@ local({ epix_merge(x, y, sync = "locf"), as_epi_archive(tibble::tribble( ~geo_value, ~time_value, ~version, ~x_value, ~y_value, - 1L, 1L, 1L, 10L, NA_integer_, # x updated, y not observed yet - 1L, 1L, 5L, 10L, 20L, # x LOCF'd, y updated - ), clobberable_versions_start = 1L) + "ak", test_date, test_date + 1L, 10L, NA_integer_, # x updated, y not observed yet + "ak", test_date, test_date + 5L, 10L, 20L, # x LOCF'd, y updated + ), clobberable_versions_start = test_date + 1L) ) }) test_that('epix_merge sync="truncate" works', { @@ -170,16 +174,20 @@ local({ epix_merge(x, y, sync = "truncate"), as_epi_archive(tibble::tribble( ~geo_value, ~time_value, ~version, ~x_value, ~y_value, - 1L, 1L, 1L, 10L, NA_integer_, # x updated, y not observed yet + "ak", test_date, test_date + 1L, 10L, NA_integer_, # x updated, y not observed yet # y's update beyond x's last update has been truncated - ), clobberable_versions_start = 1L, versions_end = 3L) + ), clobberable_versions_start = test_date + 1L, versions_end = test_date + 3L) ) }) - x_no_conflict <- as_epi_archive(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, x_value = 10L)) - y_no_conflict <- as_epi_archive(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, y_value = 20L)) + x_no_conflict <- as_epi_archive( + tibble::tibble(geo_value = "ak", time_value = test_date, version = test_date + 1L, x_value = 10L) + ) + y_no_conflict <- as_epi_archive( + tibble::tibble(geo_value = "ak", time_value = test_date, version = test_date + 1L, y_value = 20L) + ) xy_no_conflict_expected <- as_epi_archive(tibble::tribble( ~geo_value, ~time_value, ~version, ~x_value, ~y_value, - 1L, 1L, 1L, 10L, 20L, # x updated, y not observed yet + "ak", test_date, test_date + 1L, 10L, 20L, # x updated, y not observed yet )) test_that('epix_merge sync="forbid" on no-conflict works', { expect_equal( @@ -209,25 +217,3 @@ local({ ) }) }) - - -test_that('epix_merge sync="na" balks if do not know next_after', { - expect_error( - epix_merge( - as_epi_archive(tibble::tibble( - geo_value = 1L, - time_value = 1L, - version = as.POSIXct(as.Date("2020-01-01")), - x_value = 10L - )), - as_epi_archive(tibble::tibble( - geo_value = 1L, - time_value = 1L, - version = as.POSIXct(as.Date("2020-01-02")), - y_value = 20L - )), - sync = "na" - ), - regexp = "no applicable method.*next_after" - ) -}) diff --git a/tests/testthat/test-epix_slide.R b/tests/testthat/test-epix_slide.R index a5b72cbf..cb7b3bdc 100644 --- a/tests/testthat/test-epix_slide.R +++ b/tests/testthat/test-epix_slide.R @@ -1,19 +1,21 @@ library(dplyr) +test_date <- as.Date("2020-01-01") + test_that("epix_slide only works on an epi_archive", { expect_error(epix_slide(data.frame(x = 1))) }) x <- tibble::tribble( ~version, ~time_value, ~binary, - 4, c(1:3), 2^(1:3), - 5, c(1:2, 4), 2^(4:6), - 6, c(1:2, 4:5), 2^(7:10), - 7, 2:6, 2^(11:15) + test_date + 4, test_date + c(1:3), 2^(1:3), + test_date + 5, test_date + c(1:2, 4), 2^(4:6), + test_date + 6, test_date + c(1:2, 4:5), 2^(7:10), + test_date + 7, test_date + 2:6, 2^(11:15) ) %>% tidyr::unnest(c(time_value, binary)) -xx <- bind_cols(geo_value = rep("x", 15), x) %>% +xx <- bind_cols(geo_value = rep("ak", 15), x) %>% as_epi_archive() test_that("epix_slide works as intended", { @@ -26,8 +28,8 @@ test_that("epix_slide works as intended", { ) xx2 <- tibble( - geo_value = rep("x", 4), - time_value = c(4, 5, 6, 7), + geo_value = rep("ak", 4), + time_value = test_date + c(4, 5, 6, 7), sum_binary = c( 2^3 + 2^2, 2^6 + 2^3, @@ -77,10 +79,9 @@ test_that("epix_slide works as intended with `as_list_col=TRUE`", { before = 2, as_list_col = TRUE ) - xx_dfrow2 <- tibble( - geo_value = rep("x", 4), - time_value = c(4, 5, 6, 7), + geo_value = rep("ak", 4), + time_value = test_date + c(4, 5, 6, 7), slide_value = c( 2^3 + 2^2, @@ -90,7 +91,6 @@ test_that("epix_slide works as intended with `as_list_col=TRUE`", { ) %>% purrr::map(~ data.frame(bin_sum = .x)) ) %>% group_by(geo_value) - expect_identical(xx_dfrow1, xx_dfrow2) # * xx_dfrow3 <- xx %>% @@ -100,7 +100,6 @@ test_that("epix_slide works as intended with `as_list_col=TRUE`", { before = 2, as_list_col = TRUE ) - expect_identical(xx_dfrow1, xx_dfrow3) # This and * Imply xx_dfrow2 and xx_dfrow3 are identical xx_df1 <- xx %>% @@ -110,10 +109,9 @@ test_that("epix_slide works as intended with `as_list_col=TRUE`", { before = 2, as_list_col = TRUE ) - xx_df2 <- tibble( - geo_value = rep("x", 4), - time_value = c(4, 5, 6, 7), + geo_value = rep("ak", 4), + time_value = test_date + c(4, 5, 6, 7), slide_value = list( c(2^3, 2^2), @@ -123,7 +121,6 @@ test_that("epix_slide works as intended with `as_list_col=TRUE`", { ) %>% purrr::map(~ data.frame(bin = rev(.x))) ) %>% group_by(geo_value) - expect_identical(xx_df1, xx_df2) xx_scalar1 <- xx %>% @@ -133,10 +130,9 @@ test_that("epix_slide works as intended with `as_list_col=TRUE`", { before = 2, as_list_col = TRUE ) - xx_scalar2 <- tibble( - geo_value = rep("x", 4), - time_value = c(4, 5, 6, 7), + geo_value = rep("ak", 4), + time_value = test_date + c(4, 5, 6, 7), slide_value = list( 2^3 + 2^2, @@ -146,7 +142,6 @@ test_that("epix_slide works as intended with `as_list_col=TRUE`", { ) ) %>% group_by(geo_value) - expect_identical(xx_scalar1, xx_scalar2) xx_vec1 <- xx %>% @@ -156,10 +151,9 @@ test_that("epix_slide works as intended with `as_list_col=TRUE`", { before = 2, as_list_col = TRUE ) - xx_vec2 <- tibble( - geo_value = rep("x", 4), - time_value = c(4, 5, 6, 7), + geo_value = rep("ak", 4), + time_value = test_date + c(4, 5, 6, 7), slide_value = list( c(2^3, 2^2), @@ -169,48 +163,26 @@ test_that("epix_slide works as intended with `as_list_col=TRUE`", { ) %>% purrr::map(rev) ) %>% group_by(geo_value) - expect_identical(xx_vec1, xx_vec2) }) test_that("epix_slide `before` validation works", { - expect_error( - xx %>% epix_slide(f = ~ sum(.x$binary)), - "`before` is required" - ) expect_error( xx %>% epix_slide(f = ~ sum(.x$binary), before = NA), - "Assertion on 'before' failed: May not be NA" + "Expected `before` to be a scalar value." ) expect_error( xx %>% epix_slide(f = ~ sum(.x$binary), before = -1), - "Assertion on 'before' failed: Element 1 is not >= 0" + "Expected `before` to be a difftime with units in days or a non-negative integer." ) expect_error( xx %>% epix_slide(f = ~ sum(.x$binary), before = 1.5), - regexp = "before", - class = "vctrs_error_incompatible_type" + "Expected `before` to be a difftime with units in days or a non-negative integer." ) - # We might want to allow this at some point (issue #219): - expect_error( - xx %>% epix_slide(f = ~ sum(.x$binary), before = Inf), - regexp = "before", - class = "vctrs_error_incompatible_type" - ) - expect_error(xx %>% epix_slide(f = ~ sum(.x$binary)), "`before` is required") # These `before` values should be accepted: - expect_error( - xx %>% epix_slide(f = ~ sum(.x$binary), before = 0), - NA - ) - expect_error( - xx %>% epix_slide(f = ~ sum(.x$binary), before = 2L), - NA - ) - expect_error( - xx %>% epix_slide(f = ~ sum(.x$binary), before = 365000), - NA - ) + expect_no_error(xx %>% epix_slide(f = ~ sum(.x$binary), before = 0)) + expect_no_error(xx %>% epix_slide(f = ~ sum(.x$binary), before = 2)) + expect_no_error(xx %>% epix_slide(f = ~ sum(.x$binary), before = as.difftime(365000, units = "days"))) }) test_that("quosure passing issue in epix_slide is resolved + other potential issues", { @@ -349,15 +321,15 @@ test_that("quosure passing issue in epix_slide is resolved + other potential iss ea <- tibble::tribble( ~version, ~time_value, ~binary, - 2, 1:1, 2^(1:1), - 3, 1:2, 2^(2:1), - 4, 1:3, 2^(3:1), - 5, 1:4, 2^(4:1), - 6, 1:5, 2^(5:1), - 7, 1:6, 2^(6:1) + test_date + 2, test_date + 1:1, 2^(1:1), + test_date + 3, test_date + 1:2, 2^(2:1), + test_date + 4, test_date + 1:3, 2^(3:1), + test_date + 5, test_date + 1:4, 2^(4:1), + test_date + 6, test_date + 1:5, 2^(5:1), + test_date + 7, test_date + 1:6, 2^(6:1) ) %>% tidyr::unnest(c(time_value, binary)) %>% - mutate(geo_value = "x") %>% + mutate(geo_value = "ak") %>% as_epi_archive() test_that("epix_slide with all_versions option has access to all older versions", { @@ -385,12 +357,12 @@ test_that("epix_slide with all_versions option has access to all older versions" result2 <- tibble::tribble( ~time_value, ~n_versions, ~n_row, ~dt_class1, ~dt_key, - 2, 1L, sum(1:1), "data.table", key(ea$DT), - 3, 2L, sum(1:2), "data.table", key(ea$DT), - 4, 3L, sum(1:3), "data.table", key(ea$DT), - 5, 4L, sum(1:4), "data.table", key(ea$DT), - 6, 5L, sum(1:5), "data.table", key(ea$DT), - 7, 6L, sum(1:6), "data.table", key(ea$DT), + test_date + 2, 1L, sum(1:1), "data.table", key(ea$DT), + test_date + 3, 2L, sum(1:2), "data.table", key(ea$DT), + test_date + 4, 3L, sum(1:3), "data.table", key(ea$DT), + test_date + 5, 4L, sum(1:4), "data.table", key(ea$DT), + test_date + 6, 5L, sum(1:5), "data.table", key(ea$DT), + test_date + 7, 6L, sum(1:6), "data.table", key(ea$DT), ) expect_identical(result1, result2) # * @@ -437,21 +409,24 @@ test_that("epix_slide with all_versions option has access to all older versions" test_that("epix_as_of and epix_slide with long enough window are compatible", { # For all_versions = FALSE: - f1 <- function(x, gk, rtv) { tibble( diff_mean = mean(diff(x$binary)) ) } - ref_time_value1 <- 5 + ref_time_value1 <- test_date expect_identical( ea %>% epix_as_of(ref_time_value1) %>% f1() %>% mutate(time_value = ref_time_value1, .before = 1L), - ea %>% epix_slide(f1, before = 1000L, ref_time_values = ref_time_value1, names_sep = NULL) + ea %>% epix_slide( + f1, + before = 1000, + ref_time_values = ref_time_value1, + names_sep = NULL + ) ) # For all_versions = TRUE: - f2 <- function(x, gk, rtv) { x %>% # extract time&version-lag-1 data: @@ -473,33 +448,45 @@ test_that("epix_as_of and epix_slide with long enough window are compatible", { ) %>% summarize(mean_abs_delta = mean(abs(binary - lag1))) } - ref_time_value2 <- 5 + ref_time_value2 <- test_date + 5 expect_identical( ea %>% epix_as_of(ref_time_value2, all_versions = TRUE) %>% f2() %>% mutate(time_value = ref_time_value2, .before = 1L), - ea %>% epix_slide(f2, before = 1000L, ref_time_values = ref_time_value2, all_versions = TRUE, names_sep = NULL) + ea %>% epix_slide( + f2, + before = 1000, + ref_time_values = ref_time_value2, + all_versions = TRUE, + names_sep = NULL + ) ) # Test the same sort of thing when grouping by geo in an archive with multiple geos. ea_multigeo <- ea ea_multigeo$DT <- rbind( ea_multigeo$DT, - copy(ea_multigeo$DT)[, geo_value := "y"][, binary := -binary][] + copy(ea_multigeo$DT)[, geo_value := "ak"][, binary := -binary][] ) setkeyv(ea_multigeo$DT, key(ea$DT)) expect_identical( ea_multigeo %>% group_by(geo_value) %>% - epix_slide(f2, before = 1000L, ref_time_values = ref_time_value2, all_versions = TRUE, names_sep = NULL) %>% - filter(geo_value == "x"), + epix_slide( + f2, + before = 1000, + ref_time_values = ref_time_value2, + all_versions = TRUE, + names_sep = NULL + ) %>% + filter(geo_value == "ak"), ea %>% # using `ea` here is like filtering `ea_multigeo` to `geo_value=="x"` epix_as_of(ref_time_value2, all_versions = TRUE) %>% f2() %>% - transmute(geo_value = "x", time_value = ref_time_value2, mean_abs_delta) %>% + transmute(geo_value = "ak", time_value = ref_time_value2, mean_abs_delta) %>% group_by(geo_value) ) }) @@ -515,7 +502,7 @@ test_that("epix_slide `f` is passed an ungrouped `epi_archive` when `all_version epix_slide( f = slide_fn, before = 1, - ref_time_values = 5, + ref_time_values = test_date + 5, new_col_name = "out", all_versions = TRUE ) @@ -532,8 +519,8 @@ test_that("epix_slide with all_versions option works as intended", { ) xx2 <- tibble( - geo_value = rep("x", 4), - time_value = c(4, 5, 6, 7), + geo_value = rep("ak", 4), + time_value = test_date + c(4, 5, 6, 7), sum_binary = c( 2^3 + 2^2, 2^6 + 2^3, @@ -582,7 +569,7 @@ test_that("epix_slide with all_versions option works as intended", { test_that("epix_slide works with 0-row computation outputs", { epix_slide_empty <- function(ea, ...) { ea %>% - epix_slide(before = 5L, ..., function(x, gk, rtv) { + epix_slide(before = 5, ..., function(x, gk, rtv) { tibble::tibble() }) } @@ -601,8 +588,6 @@ test_that("epix_slide works with 0-row computation outputs", { geo_value = ea$DT$geo_value[integer(0)], time_value = ea$DT$version[integer(0)] ) %>% - # new_epi_df(geo_type = ea$geo_type, time_type = ea$time_type, - # as_of = ea$versions_end) %>% group_by(geo_value) ) # with `all_versions=TRUE`, we have something similar but never get an @@ -637,11 +622,11 @@ test_that("epix_slide works with 0-row computation outputs", { # tibble::tibble(value = 42) # }, names_sep = NULL), # tibble::tibble( -# geo_value = "x", +# geo_value = "ak", # time_value = epix_slide_ref_time_values_default(ea), # value = 42 # ) %>% -# new_epi_df(as_of = ea$versions_end) +# as_epi_df(as_of = ea$versions_end) # ) # }) # nolint end @@ -649,20 +634,20 @@ test_that("epix_slide works with 0-row computation outputs", { test_that("epix_slide alerts if the provided f doesn't take enough args", { f_xgt <- function(x, g, t) dplyr::tibble(value = mean(x$binary), count = length(x$binary)) # If `regexp` is NA, asserts that there should be no errors/messages. - expect_error(epix_slide(xx, f = f_xgt, before = 2L), regexp = NA) - expect_warning(epix_slide(xx, f = f_xgt, before = 2L), regexp = NA) + expect_error(epix_slide(xx, f = f_xgt, before = 2), regexp = NA) + expect_warning(epix_slide(xx, f = f_xgt, before = 2), regexp = NA) f_x_dots <- function(x, ...) dplyr::tibble(value = mean(x$binary), count = length(x$binary)) - expect_warning(epix_slide(xx, f_x_dots, before = 2L), + expect_warning(epix_slide(xx, f_x_dots, before = 2), class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots" ) }) test_that("epix_slide computation via formula can use ref_time_value", { xx_ref <- tibble( - geo_value = rep("x", 4), - time_value = c(4, 5, 6, 7), - slide_value = c(4, 5, 6, 7) + geo_value = rep("ak", 4), + time_value = test_date + c(4, 5, 6, 7), + slide_value = test_date + c(4, 5, 6, 7) ) %>% group_by(geo_value) @@ -696,9 +681,9 @@ test_that("epix_slide computation via formula can use ref_time_value", { test_that("epix_slide computation via function can use ref_time_value", { xx_ref <- tibble( - geo_value = rep("x", 4), - time_value = c(4, 5, 6, 7), - slide_value = c(4, 5, 6, 7) + geo_value = rep("ak", 4), + time_value = test_date + c(4, 5, 6, 7), + slide_value = test_date + c(4, 5, 6, 7) ) %>% group_by(geo_value) @@ -715,9 +700,9 @@ test_that("epix_slide computation via function can use ref_time_value", { test_that("epix_slide computation via dots can use ref_time_value and group", { # ref_time_value xx_ref <- tibble( - geo_value = rep("x", 4), - time_value = c(4, 5, 6, 7), - slide_value = c(4, 5, 6, 7) + geo_value = rep("ak", 4), + time_value = test_date + c(4, 5, 6, 7), + slide_value = test_date + c(4, 5, 6, 7) ) %>% group_by(geo_value) @@ -732,9 +717,9 @@ test_that("epix_slide computation via dots can use ref_time_value and group", { # group_key xx_ref <- tibble( - geo_value = rep("x", 4), - time_value = c(4, 5, 6, 7), - slide_value = "x" + geo_value = rep("ak", 4), + time_value = test_date + c(4, 5, 6, 7), + slide_value = "ak" ) %>% group_by(geo_value) @@ -765,14 +750,14 @@ test_that("epix_slide computation via dots outputs the same result using col nam group_by(.data$geo_value) %>% epix_slide( before = 2, - sum_binary = sum(time_value) + sum_binary = sum(binary) ) xx1 <- xx %>% group_by(.data$geo_value) %>% epix_slide( before = 2, - sum_binary = sum(.x$time_value) + sum_binary = sum(.x$binary) ) expect_identical(xx1, xx_ref) @@ -781,7 +766,7 @@ test_that("epix_slide computation via dots outputs the same result using col nam group_by(.data$geo_value) %>% epix_slide( before = 2, - sum_binary = sum(.data$time_value) + sum_binary = sum(.data$binary) ) expect_identical(xx2, xx_ref) @@ -791,9 +776,8 @@ test_that("`epix_slide` doesn't decay date output", { expect_true( xx$DT %>% as_tibble() %>% - mutate(across(c(time_value, version), ~ as.Date("2000-01-01") + .x - 1L)) %>% as_epi_archive() %>% - epix_slide(before = 5L, ~ attr(.x, "metadata")$as_of) %>% + epix_slide(before = 5, ~ attr(.x, "metadata")$as_of) %>% `[[`("slide_value") %>% inherits("Date") ) @@ -801,14 +785,8 @@ test_that("`epix_slide` doesn't decay date output", { test_that("`epix_slide` can access objects inside of helper functions", { helper <- function(archive_haystack, time_value_needle) { - archive_haystack %>% epix_slide(has_needle = time_value_needle %in% time_value, before = 365000L) + archive_haystack %>% epix_slide(has_needle = time_value_needle %in% time_value, before = Inf) } - expect_error( - helper(archive_cases_dv_subset, as.Date("2021-01-01")), - NA - ) - expect_error( - helper(xx, 3L), - NA - ) + expect_no_error(helper(archive_cases_dv_subset, as.Date("2021-01-01"))) + expect_no_error(helper(xx, 3L)) }) diff --git a/tests/testthat/test-methods-epi_df.R b/tests/testthat/test-methods-epi_df.R index 5ba66ed2..27e9097c 100644 --- a/tests/testthat/test-methods-epi_df.R +++ b/tests/testthat/test-methods-epi_df.R @@ -142,7 +142,7 @@ test_that("Grouping are dropped by `as_tibble`", { }) test_that("Renaming columns gives appropriate colnames and metadata", { - edf <- tibble::tibble(geo_value = 1, time_value = 1, age = 1, value = 1) %>% + edf <- tibble::tibble(geo_value = "ak", time_value = as.Date("2020-01-01"), age = 1, value = 1) %>% as_epi_df(additional_metadata = list(other_keys = "age")) # renaming using base R renamed_edf1 <- edf %>% @@ -158,7 +158,7 @@ test_that("Renaming columns gives appropriate colnames and metadata", { }) test_that("Renaming columns while grouped gives appropriate colnames and metadata", { - gedf <- tibble::tibble(geo_value = 1, time_value = 1, age = 1, value = 1) %>% + gedf <- tibble::tibble(geo_value = "ak", time_value = as.Date("2020-01-01"), age = 1, value = 1) %>% as_epi_df(additional_metadata = list(other_keys = "age")) %>% group_by(geo_value) # renaming using base R diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R index 3067ba8a..e220af16 100644 --- a/tests/testthat/test-utils.R +++ b/tests/testthat/test-utils.R @@ -40,48 +40,30 @@ test_that("guess_geo_type tests for different types of geo_value's", { test_that("guess_time_type works for different types", { days <- as.Date("2022-01-01") + 0:6 weeks <- as.Date("2022-01-01") + 7 * 0:6 - - yearweeks <- tsibble::yearweek(10) yearmonths <- tsibble::yearmonth(10) - yearquarters <- tsibble::yearquarter(10) - - years <- c(1999, 2000) - ambiguous_yearweeks <- c(199901, 199902) # -> "custom" - - daytimes <- as.POSIXct(c("2022-01-01 05:00:00", "2022-01-01 15:0:00"), tz = "UTC") - daytimes_chr <- as.character(daytimes) + integers <- c(1999, 2000) # YYYY-MM-DD is the accepted format not_ymd1 <- "January 1, 2022" not_ymd2 <- "1 January 2022" not_ymd3 <- "1 Jan 2022" - not_a_date <- "asdf" expect_equal(guess_time_type(days), "day") expect_equal(guess_time_type(weeks), "week") - - expect_equal(guess_time_type(yearweeks), "yearweek") expect_equal(guess_time_type(yearmonths), "yearmonth") - expect_equal(guess_time_type(yearquarters), "yearquarter") - - expect_equal(guess_time_type(years), "year") - expect_equal(guess_time_type(ambiguous_yearweeks), "custom") - - expect_equal(guess_time_type(daytimes), "day-time") - expect_equal(guess_time_type(daytimes_chr), "day-time") + expect_equal(guess_time_type(integers), "integer") - expect_equal(guess_time_type(not_ymd1), "custom") - expect_equal(guess_time_type(not_ymd2), "custom") - expect_equal(guess_time_type(not_ymd3), "custom") - expect_equal(guess_time_type(not_a_date), "custom") + expect_warning(guess_time_type(not_ymd1), "Unsupported time type in column") + expect_warning(guess_time_type(not_ymd2), "Unsupported time type in column") + expect_warning(guess_time_type(not_ymd3), "Unsupported time type in column") + expect_warning(guess_time_type(not_a_date), "Unsupported time type in column") }) test_that("guess_time_type works with gaps", { - days_gaps <- as.Date("2022-01-01") + c(0, 1, 3, 4, 8, 8 + 7) - weeks_gaps <- as.Date("2022-01-01") + 7 * c(0, 1, 3, 4, 8, 8 + 7) - expect_equal(guess_time_type(days_gaps), "day") - expect_equal(guess_time_type(weeks_gaps), "week") + gaps <- c(1:6, 8, 9, 11, 8 + 7) + expect_equal(guess_time_type(as.Date("2022-01-01") + gaps), "day") + expect_equal(guess_time_type(as.Date("2022-01-01") + 7 * gaps), "week") }) test_that("enlist works", { @@ -280,3 +262,23 @@ test_that("guess_period works", { weekly_posixlts ) }) + + +test_that("validate_slide_window_arg works", { + for (time_type in c("day", "week", "integer", "yearmonth")) { + expect_no_error(validate_slide_window_arg(Inf, time_type)) + } + expect_no_error(validate_slide_window_arg(as.difftime(1, units = "days"), "day")) + expect_no_error(validate_slide_window_arg(1, "day")) + expect_no_error(validate_slide_window_arg(as.difftime(1, units = "weeks"), "day")) + + expect_no_error(validate_slide_window_arg(as.difftime(1, units = "weeks"), "week")) + expect_error(validate_slide_window_arg(1, "week")) + + expect_no_error(validate_slide_window_arg(1, "integer")) + expect_error(validate_slide_window_arg(as.difftime(1, units = "days"), "integer")) + expect_error(validate_slide_window_arg(as.difftime(1, units = "weeks"), "integer")) + + expect_no_error(validate_slide_window_arg(1, "yearmonth")) + expect_error(validate_slide_window_arg(as.difftime(1, units = "weeks"), "yearmonth")) +}) diff --git a/vignettes/aggregation.Rmd b/vignettes/aggregation.Rmd index dca595ff..ec5f36af 100644 --- a/vignettes/aggregation.Rmd +++ b/vignettes/aggregation.Rmd @@ -214,39 +214,6 @@ xt_filled %>% print(n = 7) ``` -## Aggregate to different time scales - -Continuing on with useful `tsibble` functionality, we can aggregate to different -time scales using `index_by()` from `tsibble`, which modifies the index variable -in the given object by applying a suitable time-coarsening transformation (say, -moving from days to weeks, or weeks to months, and so on). The most common use -case would be to follow up with a call to a `dplyr` verb like `summarize()` in -order to perform some kind of aggregation of our measured variables over the new -index variable. - -Below, we use the functions `yearweek()` and `yearmonth()` that are provided in -the `tsibble` package in order to aggregate to weekly and monthly resolutions. -In the former call, we set `week_start = 7` to coincide with the CDC definition -of an epiweek (epidemiological week). - -```{r} -# Aggregate to weekly -xt_filled_week <- xt_filled %>% - index_by(epiweek = ~ yearweek(., week_start = 7)) %>% - group_by(geo_value) %>% - summarize(cases = sum(cases, na.rm = TRUE)) - -head(xt_filled_week) - -# Aggregate to monthly -xt_filled_month <- xt_filled_week %>% - index_by(month = ~ yearmonth(.)) %>% - group_by(geo_value) %>% - summarize(cases = sum(cases, na.rm = TRUE)) - -head(xt_filled_month) -``` - ## Geographic aggregation TODO diff --git a/vignettes/archive.Rmd b/vignettes/archive.Rmd index a34429d9..686f558f 100644 --- a/vignettes/archive.Rmd +++ b/vignettes/archive.Rmd @@ -93,7 +93,7 @@ print(x) An `epi_archive` is consists of a primary field `DT`, which is a data table (from the `data.table` package) that has the columns `geo_value`, `time_value`, `version` (and possibly additional ones), and other metadata fields, such as -`geo_type` and `time_type`. +`geo_type`. ```{r} class(x$DT) @@ -119,11 +119,10 @@ The following pieces of metadata are included as fields in an `epi_archive` object: * `geo_type`: the type for the geo values. -* `time_type`: the type for the time values. * `additional_metadata`: list of additional metadata for the data archive. Metadata for an `epi_archive` object `x` can be accessed (and altered) directly, -as in `x$geo_type` or `x$time_type`, etc. Just like `as_epi_df()`, the function +as in `x$geo_type`, etc. Just like `as_epi_df()`, the function `as_epi_archive()` attempts to guess metadata fields when an `epi_archive` object is instantiated, if they are not explicitly specified in the function call (as it did in the case above). diff --git a/vignettes/epiprocess.Rmd b/vignettes/epiprocess.Rmd index c0cb0011..24a98505 100644 --- a/vignettes/epiprocess.Rmd +++ b/vignettes/epiprocess.Rmd @@ -129,8 +129,6 @@ frame into `epi_df` format. ```{r, message = FALSE} x <- as_epi_df(cases, - geo_type = "state", - time_type = "day", as_of = max(cases$issue) ) %>% select(geo_value, time_value, total_cases = value) @@ -146,15 +144,13 @@ attributes(x)$metadata In general, an `epi_df` object has the following fields in its metadata: * `geo_type`: the type for the geo values. -* `time_type`: the type for the time values. * `as_of`: the time value at which the given data were available. Metadata for an `epi_df` object `x` can be accessed (and altered) via -`attributes(x)$metadata`. The first two fields here, `geo_type` and `time_type`, -are not currently used by any downstream functions in the `epiprocess` package, -and serve only as useful bits of information to convey about the data set at -hand. The last field here, `as_of`, is one of the most unique aspects of an -`epi_df` object. +`attributes(x)$metadata`. The field, `geo_type`,is not currently used by any +downstream functions in the `epiprocess` package, and serve only as useful bits +of information to convey about the data set at hand. The last field here, +`as_of`, is one of the most unique aspects of an `epi_df` object. In brief, we can think of an `epi_df` object as a single snapshot of a data set that contains the most up-to-date values of some signals of interest, as of the @@ -166,11 +162,11 @@ data set. See the [archive vignette](https://cmu-delphi.github.io/epiprocess/articles/archive.html) for more. -If any of the `geo_type`, `time_type`, or `as_of` arguments are missing in a -call to `as_epi_df()`, then this function will try to infer them from the passed -object. Usually, `geo_type` and `time_type` can be inferred from the `geo_value` -and `time_value` columns, respectively, but inferring the `as_of` field is not -as easy. See the documentation for `as_epi_df()` more details. +If `geo_type` or `as_of` arguments are missing in a call to `as_epi_df()`, then +this function will try to infer them from the passed object. Usually, `geo_type` +can be inferred from the `geo_value` columns, respectively, but inferring the +`as_of` field is not as easy. See the documentation for `as_epi_df()` more +details. ```{r} x <- as_epi_df(cases, as_of = as.Date("2024-03-20")) %>% @@ -196,7 +192,7 @@ ex1 <- tibble( ) %>% as_tsibble(index = time_value, key = c(geo_value, county_code)) -ex1 <- as_epi_df(x = ex1, geo_type = "state", time_type = "day", as_of = "2020-06-03") +ex1 <- as_epi_df(x = ex1, as_of = "2020-06-03") ``` The metadata now includes `county_code` as an extra key. @@ -237,7 +233,7 @@ head(ex2) ex2 <- ex2 %>% rename(geo_value = state, time_value = reported_date) %>% as_epi_df( - geo_type = "state", as_of = "2020-06-03", + as_of = "2020-06-03", additional_metadata = list(other_keys = "pol") ) @@ -304,7 +300,7 @@ cases in Canada in 2003, from the x <- outbreaks::sars_canada_2003 %>% mutate(geo_value = "ca") %>% select(geo_value, time_value = date, starts_with("cases")) %>% - as_epi_df(geo_type = "nation", as_of = as.Date("2024-03-20")) + as_epi_df(as_of = as.Date("2024-03-20")) head(x) @@ -352,7 +348,7 @@ x <- outbreaks::ebola_sierraleone_2014 %>% time_value = full_seq(time_value, period = 1), fill = list(cases = 0) ) %>% - as_epi_df(geo_type = "province", as_of = as.Date("2024-03-20")) + as_epi_df(as_of = as.Date("2024-03-20")) ggplot(x, aes(x = time_value, y = cases)) + geom_col(aes(fill = geo_value), show.legend = FALSE) +