From 0dcb5a6489ecbdc8c03ebb8f5bf715c8ca708502 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Thu, 8 Feb 2024 18:04:37 -0800 Subject: [PATCH] wip+feat: refactor epi_archive to use S3 * A/B testing branch * make sure the parallel set of tests pass --- DESCRIPTION | 3 + NAMESPACE | 26 + R/archive.R | 3 - R/archive_new.R | 1115 +++++++++++++++++ R/grouped_archive_new.R | 456 +++++++ R/methods-epi_archive_new.R | 830 ++++++++++++ man/as_epi_archive2.Rd | 142 +++ man/as_of.epi_archive2.Rd | 33 + man/epi_archive.Rd | 106 +- man/epix_as_of2.Rd | 96 ++ man/epix_fill_through_version2.Rd | 48 + man/epix_merge2.Rd | 73 ++ man/epix_slide2.Rd | 283 +++++ man/epix_truncate_versions_after.Rd | 10 +- ...ate_versions_after.grouped_epi_archive2.Rd | 11 + man/fill_through_version.epi_archive2.Rd | 21 + man/group_by.epi_archive.Rd | 23 +- man/group_by.epi_archive2.Rd | 147 +++ man/is_epi_archive2.Rd | 35 + man/max_version_with_row_in.Rd | 9 +- man/merge_epi_archive2.Rd | 30 + man/new_epi_archive2.Rd | 69 + man/next_after.Rd | 8 +- man/print.epi_archive2.Rd | 17 + man/slide.epi_archive2.Rd | 101 ++ man/slide.grouped_epi_archive2.Rd | 24 + man/truncate_versions_after.epi_archive2.Rd | 19 + ...ate_versions_after.grouped_epi_archive2.Rd | 18 + tests/testthat/test-archive_new.R | 173 +++ tests/testthat/test-compactify_new.R | 110 ++ .../test-epix_fill_through_version_new.R | 109 ++ tests/testthat/test-epix_merge_new.R | 228 ++++ tests/testthat/test-epix_slide_new.R | 810 ++++++++++++ tests/testthat/test-grouped_epi_archive_new.R | 104 ++ tests/testthat/test-methods-epi_archive_new.R | 138 ++ 35 files changed, 5420 insertions(+), 8 deletions(-) create mode 100644 R/archive_new.R create mode 100644 R/grouped_archive_new.R create mode 100644 R/methods-epi_archive_new.R create mode 100644 man/as_epi_archive2.Rd create mode 100644 man/as_of.epi_archive2.Rd create mode 100644 man/epix_as_of2.Rd create mode 100644 man/epix_fill_through_version2.Rd create mode 100644 man/epix_merge2.Rd create 
mode 100644 man/epix_slide2.Rd create mode 100644 man/epix_truncate_versions_after.grouped_epi_archive2.Rd create mode 100644 man/fill_through_version.epi_archive2.Rd create mode 100644 man/group_by.epi_archive2.Rd create mode 100644 man/is_epi_archive2.Rd create mode 100644 man/merge_epi_archive2.Rd create mode 100644 man/new_epi_archive2.Rd create mode 100644 man/print.epi_archive2.Rd create mode 100644 man/slide.epi_archive2.Rd create mode 100644 man/slide.grouped_epi_archive2.Rd create mode 100644 man/truncate_versions_after.epi_archive2.Rd create mode 100644 man/truncate_versions_after.grouped_epi_archive2.Rd create mode 100644 tests/testthat/test-archive_new.R create mode 100644 tests/testthat/test-compactify_new.R create mode 100644 tests/testthat/test-epix_fill_through_version_new.R create mode 100644 tests/testthat/test-epix_merge_new.R create mode 100644 tests/testthat/test-epix_slide_new.R create mode 100644 tests/testthat/test-grouped_epi_archive_new.R create mode 100644 tests/testthat/test-methods-epi_archive_new.R diff --git a/DESCRIPTION b/DESCRIPTION index 71d95969..538fd023 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -73,6 +73,7 @@ Depends: URL: https://cmu-delphi.github.io/epiprocess/ Collate: 'archive.R' + 'archive_new.R' 'autoplot.R' 'correlation.R' 'data.R' @@ -80,9 +81,11 @@ Collate: 'epiprocess.R' 'group_by_epi_df_methods.R' 'methods-epi_archive.R' + 'grouped_archive_new.R' 'grouped_epi_archive.R' 'growth_rate.R' 'key_colnames.R' + 'methods-epi_archive_new.R' 'methods-epi_df.R' 'outliers.R' 'reexports.R' diff --git a/NAMESPACE b/NAMESPACE index 03e0e41d..51c4091c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -6,40 +6,57 @@ S3method(as_epi_df,data.frame) S3method(as_epi_df,epi_df) S3method(as_epi_df,tbl_df) S3method(as_epi_df,tbl_ts) +S3method(as_of,epi_archive2) S3method(as_tibble,epi_df) S3method(as_tsibble,epi_df) S3method(autoplot,epi_df) +S3method(clone,epi_archive2) +S3method(clone,grouped_epi_archive2) 
S3method(dplyr_col_modify,col_modify_recorder_df) S3method(dplyr_col_modify,epi_df) S3method(dplyr_reconstruct,epi_df) S3method(dplyr_row_slice,epi_df) S3method(epix_truncate_versions_after,epi_archive) +S3method(epix_truncate_versions_after,epi_archive2) S3method(epix_truncate_versions_after,grouped_epi_archive) +S3method(epix_truncate_versions_after,grouped_epi_archive2) S3method(group_by,epi_archive) +S3method(group_by,epi_archive2) S3method(group_by,epi_df) S3method(group_by,grouped_epi_archive) +S3method(group_by,grouped_epi_archive2) S3method(group_by_drop_default,grouped_epi_archive) +S3method(group_by_drop_default,grouped_epi_archive2) S3method(group_modify,epi_df) S3method(groups,grouped_epi_archive) +S3method(groups,grouped_epi_archive2) S3method(key_colnames,data.frame) S3method(key_colnames,default) S3method(key_colnames,epi_archive) S3method(key_colnames,epi_df) S3method(next_after,Date) S3method(next_after,integer) +S3method(print,epi_archive2) S3method(print,epi_df) +S3method(print,grouped_epi_archive2) S3method(select,epi_df) +S3method(slide,grouped_epi_archive2) S3method(summary,epi_df) +S3method(truncate_versions_after,grouped_epi_archive2) S3method(ungroup,epi_df) S3method(ungroup,grouped_epi_archive) +S3method(ungroup,grouped_epi_archive2) S3method(unnest,epi_df) export("%>%") export(archive_cases_dv_subset) export(arrange) export(as_epi_archive) +export(as_epi_archive2) export(as_epi_df) +export(as_of) export(as_tsibble) export(autoplot) +export(clone) export(detect_outlr) export(detect_outlr_rm) export(detect_outlr_stl) @@ -47,24 +64,33 @@ export(epi_archive) export(epi_cor) export(epi_slide) export(epix_as_of) +export(epix_as_of2) export(epix_merge) +export(epix_merge2) export(epix_slide) +export(epix_slide2) export(epix_truncate_versions_after) +export(fill_through_version) export(filter) export(group_by) export(group_modify) export(growth_rate) export(is_epi_archive) +export(is_epi_archive2) export(is_epi_df) export(is_grouped_epi_archive) 
+export(is_grouped_epi_archive2) export(key_colnames) export(max_version_with_row_in) export(mutate) +export(new_epi_archive2) export(new_epi_df) export(next_after) export(relocate) export(rename) export(slice) +export(slide) +export(truncate_versions_after) export(ungroup) export(unnest) importFrom(R6,R6Class) diff --git a/R/archive.R b/R/archive.R index ff3bc20c..a530cc05 100644 --- a/R/archive.R +++ b/R/archive.R @@ -514,9 +514,6 @@ epi_archive <- fromLast = TRUE ) %>% tibble::as_tibble() %>% - # (`as_tibble` should de-alias the DT and its columns in any edge - # cases where they are aliased. We don't say we guarantee this - # though.) dplyr::select(-"version") %>% as_epi_df( geo_type = self$geo_type, diff --git a/R/archive_new.R b/R/archive_new.R new file mode 100644 index 00000000..bcfa84c3 --- /dev/null +++ b/R/archive_new.R @@ -0,0 +1,1115 @@ +# We use special features of data.table's `[`. The data.table package has a +# compatibility feature that disables some/all of these features if it thinks we +# might expect `data.frame`-compatible behavior instead. We can signal that we +# want the special behavior via `.datatable.aware = TRUE` or by importing any +# `data.table` package member. Do both to prevent surprises if we decide to use +# `data.table::` everywhere and not importing things. +.datatable.aware <- TRUE + +#' Validate a version bound arg +#' +#' Expected to be used on `clobberable_versions_start`, `versions_end`, +#' and similar arguments. Some additional context-specific checks may be needed. +#' +#' @param version_bound the version bound to validate +#' @param x a data frame containing a version column with which to check +#' compatibility +#' @param na_ok Boolean; is `NA` an acceptable "bound"? (If so, `NA` will +#' have a special context-dependent meaning.) 
+#' @param version_bound_arg optional string; what to call the version bound in
+#'   error messages
+#' @param x_arg optional string; what to call `x` in error messages
+#'
+#' @section Side effects: raises an error if version bound appears invalid
+#'
+#' @noRd
+validate_version_bound <- function(version_bound, x, na_ok = FALSE,
+                                   version_bound_arg = rlang::caller_arg(version_bound),
+                                   x_arg = rlang::caller_arg(x)) {
+  if (is.null(version_bound)) {
+    cli_abort(
+      "{version_bound_arg} cannot be NULL"
+    )
+  }
+  # A bound is a single version. Reject other lengths here so that the scalar
+  # `&&` / `if` logic below (and in callers) never sees a vector.
+  if (length(version_bound) != 1L) {
+    cli_abort(
+      "{version_bound_arg} must have length 1"
+    )
+  }
+  if (na_ok && is.na(version_bound)) {
+    return(invisible(NULL))
+  }
+  if (!test_set_equal(class(version_bound), class(x[["version"]]))) {
+    cli_abort(
+      "{version_bound_arg} must have the same classes as {x_arg}$version,
+        which is {class(x[['version']])}"
+    )
+  }
+  if (!test_set_equal(typeof(version_bound), typeof(x[["version"]]))) {
+    cli_abort(
+      "{version_bound_arg} must have the same types as {x_arg}$version,
+        which is {typeof(x[['version']])}"
+    )
+  }
+  # Only reachable with `na_ok = FALSE`: an `NA` that already has the right
+  # class and type (e.g. `as.Date(NA)`) would otherwise pass validation and
+  # break later comparisons against the bound.
+  if (is.na(version_bound)) {
+    cli_abort(
+      "{version_bound_arg} cannot be NA"
+    )
+  }
+
+  return(invisible(NULL))
+}
+
+#' `max(x$version)`, with error if `x` has 0 rows
+#'
+#' Exported to make defaults more easily copyable.
+#'
+#' @param x `x` argument of [`as_epi_archive`]
+#'
+#' @return `max(x$version)` if it has any rows; raises error if it has 0 rows,
+#'   no `version` column, or an `NA` version value
+#'
+#' @export
+max_version_with_row_in <- function(x) {
+  if (nrow(x) == 0L) {
+    cli_abort(
+      "`nrow(x)==0L`, representing a data set history with no row up through the
+       latest observed version, but we don't have a sensible guess at what version
+       that is, or whether any of the empty versions might be clobbered in the
+       future; if we use `x` to form an `epi_archive`, then
+       `clobberable_versions_start` and `versions_end` must be manually specified.",
+      class = "epiprocess__max_version_cannot_be_used"
+    )
+  }
+
+  # `[[` gives NULL for a missing column; check for that explicitly rather
+  # than letting `max(NULL)` produce `-Inf` with only a warning.
+  version_col <- x[["version"]]
+  if (is.null(version_col)) {
+    cli_abort("`x` must contain a `version` column",
+      class = "epiprocess__max_version_cannot_be_used"
+    )
+  }
+  if (anyNA(version_col)) {
+    cli_abort("version values cannot be NA",
+      class = "epiprocess__version_values_must_not_be_na"
+    )
+  }
+
+  # Plain expression (not an assignment) so that the result is visible.
+  max(version_col)
+}
+
+#' Get the next possible value greater than `x` of the same type
+#'
+#' @param x the starting "value"(s)
+#' @return same class, typeof, and length as `x`
+#'
+#' @export
+next_after <- function(x) UseMethod("next_after")
+
+#' @export
+next_after.integer <- function(x) x + 1L
+
+#' @export
+next_after.Date <- function(x) x + 1L
+
+
+
+#' epi archive
+#' @title `epi_archive` object
+#'
+#' @description An `epi_archive` is an R6 class which contains a data table
+#'   along with several relevant pieces of metadata. The data table can be seen
+#'   as the full archive (version history) for some signal variables of
+#'   interest.
+#'
+#' @details An `epi_archive` is an R6 class which contains a data table `DT`, of
+#'   class `data.table` from the `data.table` package, with (at least) the
+#'   following columns:
+#'
+#' * `geo_value`: the geographic value associated with each row of measurements.
+#' * `time_value`: the time value associated with each row of measurements.
+#' * `version`: the time value specifying the version for each row of +#' measurements. For example, if in a given row the `version` is January 15, +#' 2022 and `time_value` is January 14, 2022, then this row contains the +#' measurements of the data for January 14, 2022 that were available one day +#' later. +#' +#' The data table `DT` has key variables `geo_value`, `time_value`, `version`, +#' as well as any others (these can be specified when instantiating the +#' `epi_archive` object via the `other_keys` argument, and/or set by operating +#' on `DT` directly). Refer to the documentation for [as_epi_archive()] for +#' information and examples of relevant parameter names for an `epi_archive` object. +#' Note that there can only be a single row per unique combination of +#' key variables, and thus the key variables are critical for figuring out how +#' to generate a snapshot of data from the archive, as of a given version. +#' +#' In general, the last version of each observation is carried forward (LOCF) to +#' fill in data between recorded versions, and between the last recorded +#' update and the `versions_end`. One consequence is that the `DT` +#' doesn't have to contain a full snapshot of every version (although this +#' generally works), but can instead contain only the rows that are new or +#' changed from the previous version (see `compactify`, which does this +#' automatically). Currently, deletions must be represented as revising a row +#' to a special state (e.g., making the entries `NA` or including a special +#' column that flags the data as removed and performing some kind of +#' post-processing), and the archive is unaware of what this state is. 
Note +#' that `NA`s *can* be introduced by `epi_archive` methods for other reasons, +#' e.g., in [`epix_fill_through_version`] and [`epix_merge`], if requested, to +#' represent potential update data that we do not yet have access to; or in +#' [`epix_merge`] to represent the "value" of an observation before the +#' version in which it was first released, or if no version of that +#' observation appears in the archive data at all. +#' +#' **A word of caution:** R6 objects, unlike most other objects in R, have +#' reference semantics. A primary consequence of this is that objects are not +#' copied when modified. You can read more about this in Hadley Wickham's +#' [Advanced R](https://adv-r.hadley.nz/r6.html#r6-semantics) book. In order +#' to construct a modified archive while keeping the original intact, first +#' make a clone using the `$clone` method, then overwrite the clone's `DT` +#' field with `data.table::copy(clone$DT)`, and finally perform the +#' modifications on the clone. +#' +#' @section Metadata: +#' The following pieces of metadata are included as fields in an `epi_archive` +#' object: +#' +#' * `geo_type`: the type for the geo values. +#' * `time_type`: the type for the time values. +#' * `additional_metadata`: list of additional metadata for the data archive. +#' +#' Unlike an `epi_df` object, metadata for an `epi_archive` object `x` can be +#' accessed (and altered) directly, as in `x$geo_type` or `x$time_type`, +#' etc. Like an `epi_df` object, the `geo_type` and `time_type` fields in the +#' metadata of an `epi_archive` object are not currently used by any +#' downstream functions in the `epiprocess` package, and serve only as useful +#' bits of information to convey about the data set at hand. +#' +#' @section Generating Snapshots: +#' An `epi_archive` object can be used to generate a snapshot of the data in +#' `epi_df` format, which represents the most up-to-date values of the signal +#' variables, as of the specified version. 
This is accomplished by calling the +#' `as_of()` method for an `epi_archive` object `x`. More details on this +#' method are documented in the wrapper function [`epix_as_of()`]. +#' +#' @section Sliding Computations: +#' We can run a sliding computation over an `epi_archive` object, much like +#' `epi_slide()` does for an `epi_df` object. This is accomplished by calling +#' the `slide()` method for an `epi_archive` object, which works similarly to +#' the way `epi_slide()` works for an `epi_df` object, but with one key +#' difference: it is version-aware. That is, for an `epi_archive` object, the +#' sliding computation at any given reference time point t is performed on +#' **data that would have been available as of t**. More details on `slide()` +#' are documented in the wrapper function [`epix_slide()`]. +#' +#' @export +#' @examples +#' tib <- tibble::tibble( +#' geo_value = rep(c("ca", "hi"), each = 5), +#' time_value = rep(seq(as.Date("2020-01-01"), +#' by = 1, length.out = 5 +#' ), times = 2), +#' version = rep(seq(as.Date("2020-01-02"), +#' by = 1, length.out = 5 +#' ), times = 2), +#' value = rnorm(10, mean = 2, sd = 1) +#' ) +#' +#' toy_epi_archive <- tib %>% epi_archive$new( +#' geo_type = "state", +#' time_type = "day" +#' ) +#' toy_epi_archive +#' @name epi_archive +# TODO: Figure out where to actually put this documentation +NULL + +#' New epi archive +#' @description Creates a new `epi_archive` object. +#' @param x A data.frame, data.table, or tibble, with columns `geo_value`, +#' `time_value`, `version`, and then any additional number of columns. +#' @param geo_type Type for the geo values. If missing, then the function will +#' attempt to infer it from the geo values present; if this fails, then it +#' will be set to "custom". +#' @param time_type Type for the time values. If missing, then the function will +#' attempt to infer it from the time values present; if this fails, then it +#' will be set to "custom". 
+#' @param other_keys Character vector specifying the names of variables in `x`
+#'   that should be considered key variables (in the language of `data.table`)
+#'   apart from "geo_value", "time_value", and "version".
+#' @param additional_metadata List of additional metadata to attach to the
+#'   `epi_archive` object. The metadata will have `geo_type` and `time_type`
+#'   fields; named entries from the passed list will be included as well.
+#' @param compactify Optional; Boolean or `NULL`: should we remove rows that are
+#'   considered redundant for the purposes of `epi_archive`'s built-in methods
+#'   such as `as_of`? As these methods use the last version of each observation
+#'   carried forward (LOCF) to interpolate between the version data provided,
+#'   rows that don't change these LOCF results can potentially be omitted to
+#'   save space while maintaining the same behavior (with the help of the
+#'   `clobberable_versions_start` and `versions_end` fields in some edge cases).
+#'   `TRUE` will remove these rows, `FALSE` will not, and missing or `NULL` will
+#'   remove these rows and issue a warning. Generally, this can be set to
+#'   `TRUE`, but if you directly inspect or edit the fields of the `epi_archive`
+#'   such as its `DT`, or rely on redundant updates to achieve a certain
+#'   behavior of the `ref_time_values` default in `epix_slide`, you will have to
+#'   determine whether `compactify=TRUE` will produce the desired results. If
+#'   compactification here is removing a large proportion of the rows, this may
+#'   indicate a potential for space, time, or bandwidth savings upstream in the
+#'   data pipeline, e.g., by avoiding fetching, storing, or processing these
+#'   rows of `x`.
+#' @param clobberable_versions_start Optional; as in [`as_epi_archive`]
+#' @param versions_end Optional; as in [`as_epi_archive`]
+#' @return An `epi_archive` object.
+#' @importFrom data.table as.data.table key setkeyv
+#'
+#' @details
+#' Refer to the documentation for [as_epi_archive()] for more information
+#' and examples of parameter names.
+#' @export
+new_epi_archive2 <- function(
+    x,
+    geo_type = NULL,
+    time_type = NULL,
+    other_keys = NULL,
+    additional_metadata = NULL,
+    compactify = NULL,
+    clobberable_versions_start = NA,
+    versions_end = NULL) {
+  assert_data_frame(x)
+  if (!test_subset(c("geo_value", "time_value", "version"), names(x))) {
+    cli_abort(
+      "Columns `geo_value`, `time_value`, and `version` must be present in `x`."
+    )
+  }
+  if (anyMissing(x$version)) {
+    cli_abort("Column `version` must not contain missing values.")
+  }
+
+  # If geo type is missing, then try to guess it
+  if (missing(geo_type) || is.null(geo_type)) {
+    geo_type <- guess_geo_type(x$geo_value)
+  }
+
+  # If time type is missing, then try to guess it
+  if (missing(time_type) || is.null(time_type)) {
+    time_type <- guess_time_type(x$time_value)
+  }
+
+  # Finish off with small checks on keys variables and metadata
+  if (missing(other_keys)) other_keys <- NULL
+  if (missing(additional_metadata) || is.null(additional_metadata)) additional_metadata <- list()
+  if (!test_subset(other_keys, names(x))) {
+    cli_abort("`other_keys` must be contained in the column names of `x`.")
+  }
+  if (any(c("geo_value", "time_value", "version") %in% other_keys)) {
+    cli_abort("`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\".")
+  }
+  if (any(names(additional_metadata) %in% c("geo_type", "time_type"))) {
+    cli_warn("`additional_metadata` names overlap with existing metadata fields \"geo_type\", \"time_type\".")
+  }
+
+  # Conduct checks and apply defaults for `compactify`. `NA` is rejected here
+  # with a clear message instead of failing later inside an `if ()`.
+  if (missing(compactify)) {
+    compactify <- NULL
+  }
+  assert_logical(compactify, len = 1, any.missing = FALSE, null.ok = TRUE)
+
+  # Apply defaults and conduct checks for
+  # `clobberable_versions_start`, `versions_end`:
+  if (missing(clobberable_versions_start)) {
+    clobberable_versions_start <- NA
+  }
+  if (missing(versions_end) || is.null(versions_end)) {
+    versions_end <- max_version_with_row_in(x)
+  }
+  validate_version_bound(clobberable_versions_start, x, na_ok = TRUE)
+  validate_version_bound(versions_end, x, na_ok = FALSE)
+  if (nrow(x) > 0L && versions_end < max(x[["version"]])) {
+    cli_abort(
+      sprintf(
+        "`versions_end` was %s, but `x` contained
+         updates for a later version or versions, up through %s",
+        versions_end, max(x[["version"]])
+      ),
+      class = "epiprocess__versions_end_earlier_than_updates"
+    )
+  }
+  if (!is.na(clobberable_versions_start) && clobberable_versions_start > versions_end) {
+    cli_abort(
+      sprintf(
+        "`versions_end` was %s, but a `clobberable_versions_start`
+         of %s indicated that there were later observed versions",
+        versions_end, clobberable_versions_start
+      ),
+      class = "epiprocess__versions_end_earlier_than_clobberable_versions_start"
+    )
+  }
+
+  # --- End of validation and replacing missing args with defaults ---
+
+  # Create the data table; if x was an un-keyed data.table itself,
+  # then the call to as.data.table() will fail to set keys, so we
+  # need to check this, then do it manually if needed
+  key_vars <- c("geo_value", "time_value", other_keys, "version")
+  DT <- as.data.table(x, key = key_vars)
+  if (!identical(key_vars, key(DT))) setkeyv(DT, cols = key_vars)
+
+  maybe_first_duplicate_key_row_index <- anyDuplicated(DT, by = key(DT))
+  if (maybe_first_duplicate_key_row_index != 0L) {
+    cli_abort("`x` must have one row per unique combination of the key variables. If you
+            have additional key variables other than `geo_value`, `time_value`, and
+            `version`, such as an age group column, please specify them in `other_keys`.
+            Otherwise, check for duplicate rows and/or conflicting values for the same
+            measurement.",
+      class = "epiprocess__epi_archive_requires_unique_key"
+    )
+  }
+
+  # Checks to see if a value in a vector is LOCF, i.e., equal to the value
+  # just before it (two `NA`s count as equal)
+  is_locf <- function(vec) {
+    dplyr::if_else(!is.na(vec) & !is.na(dplyr::lag(vec)),
+      vec == dplyr::lag(vec),
+      is.na(vec) & is.na(dplyr::lag(vec))
+    )
+  }
+
+  # A row is LOCF-redundant when all of its values except for the version
+  # are the same as their respective lag values (keys included, as `DT` is
+  # sorted by key)
+
+  # Removes LOCF-redundant rows from a data frame
+  rm_locf <- function(df) {
+    dplyr::filter(df, if_any(c(everything(), -version), ~ !is_locf(.)))
+  }
+
+  # Keeps only the LOCF-redundant rows, such as to be printed
+  keep_locf <- function(df) {
+    dplyr::filter(df, if_all(c(everything(), -version), ~ is_locf(.)))
+  }
+
+  # Runs compactify on data frame
+  if (is.null(compactify) || isTRUE(compactify)) {
+    elim <- keep_locf(DT)
+    DT <- rm_locf(DT)
+  } else {
+    # Create empty data frame for nrow(elim) to be 0
+    elim <- tibble::tibble()
+  }
+
+  # Warns about redundant rows
+  if (is.null(compactify) && nrow(elim) > 0) {
+    warning_intro <- cli::format_inline(
+      "Found rows that appear redundant based on
+            last (version of each) observation carried forward;
+            these rows have been removed to 'compactify' and save space:",
+      keep_whitespace = FALSE
+    )
+    warning_data <- paste(collapse = "\n", capture.output(print(elim, topn = 3L, nrows = 7L)))
+    warning_outro <- cli::format_inline(
+      "Built-in `epi_archive` functionality should be unaffected,
+            but results may change if you work directly with its fields (such as `DT`).
+            See `?as_epi_archive` for details.
+            To silence this warning but keep compactification,
+            you can pass `compactify=TRUE` when constructing the archive.",
+      keep_whitespace = FALSE
+    )
+    warning_message <- paste(sep = "\n", warning_intro, warning_data, warning_outro)
+    rlang::warn(warning_message, class = "epiprocess__compactify_default_removed_rows")
+  }
+
+  structure(
+    list(
+      DT = DT,
+      geo_type = geo_type,
+      time_type = time_type,
+      additional_metadata = additional_metadata,
+      clobberable_versions_start = clobberable_versions_start,
+      versions_end = versions_end,
+      private = list() # TODO: to be encapsulated with guard-rails later
+    ),
+    class = "epi_archive2"
+  )
+}
+
+#' Print information about an `epi_archive` object
+#' @param epi_archive The archive to print
+#' @param class Boolean; whether to print the class label header
+#' @param methods Boolean; whether to print all available methods of
+#'   the archive (only shown when the object actually carries a
+#'   `public_methods` element)
+#' @return The archive's `DT`, invisibly (after printing it)
+#' @importFrom cli cli_inform
+#' @export
+print.epi_archive2 <- function(epi_archive, class = TRUE, methods = TRUE) {
+  cli_inform(
+    c(
+      ">" = if (class) "An `epi_archive` object, with metadata:",
+      "i" = if (length(setdiff(key(epi_archive$DT), c("geo_value", "time_value", "version"))) > 0) {
+        "Non-standard DT keys: {setdiff(key(epi_archive$DT), c('geo_value', 'time_value', 'version'))}"
+      },
+      "i" = "Min/max time values: {min(epi_archive$DT$time_value)} / {max(epi_archive$DT$time_value)}",
+      "i" = "First/last version with update: {min(epi_archive$DT$version)} / {max(epi_archive$DT$version)}",
+      "i" = if (!is.na(epi_archive$clobberable_versions_start)) {
+        "Clobberable versions start: {epi_archive$clobberable_versions_start}"
+      },
+      "i" = "Versions end: {epi_archive$versions_end}",
+      # The list-based archive built by `new_epi_archive2()` has no
+      # `public_methods`; skip the bullet rather than print an empty list.
+      "i" = if (methods && length(names(epi_archive$public_methods)) > 0) {
+        "Public R6 methods: {names(epi_archive$public_methods)}"
+      },
+      "i" = "A preview of the table ({nrow(epi_archive$DT)} rows x {ncol(epi_archive$DT)} columns):"
+    )
+  )
+
+  return(invisible(epi_archive$DT %>% print()))
+}
+
+
+#' @export
+as_of <- function(x, ...) {
+  UseMethod("as_of")
+}
+
+
+#' As of epi_archive
+#' @description Generates a snapshot in `epi_df` format as of a given version.
+#' See the documentation for the wrapper function [`epix_as_of()`] for
+#' details. The parameter descriptions below are copied from there
+#' @param epi_archive An `epi_archive` object
+#' @param max_version Version specifying the max version to permit in the
+#'   snapshot. That is, the snapshot will comprise the unique rows of the
+#'   current archive data that represent the most up-to-date signal values, as
+#'   of the specified `max_version` (and whose `time_value`s are at least
+#'   `min_time_value`).
+#' @param min_time_value Time value specifying the min `time_value` to permit in
+#'   the snapshot. Default is `-Inf`, which effectively means that there is no
+#'   minimum considered.
+#' @param all_versions Boolean; If `all_versions = TRUE`, then the output will be in
+#'   `epi_archive` format, and contain rows in the specified `time_value` range
+#'   having `version <= max_version`. The resulting object will cover a
+#'   potentially narrower `version` and `time_value` range than `x`, depending
+#'   on user-provided arguments. Otherwise, there will be one row in the output
+#'   for the `max_version` of each `time_value`. Default is `FALSE`.
+#' @importFrom data.table between key
+#' @export
+as_of.epi_archive2 <- function(epi_archive, max_version, min_time_value = -Inf, all_versions = FALSE) {
+  other_keys <- setdiff(
+    key(epi_archive$DT),
+    c("geo_value", "time_value", "version")
+  )
+  if (length(other_keys) == 0) other_keys <- NULL
+
+  # Check a few things on max_version
+  if (!test_set_equal(class(max_version), class(epi_archive$DT$version))) {
+    cli_abort(
+      "`max_version` must have the same classes as `epi_archive$DT$version`."
+    )
+  }
+  if (!test_set_equal(typeof(max_version), typeof(epi_archive$DT$version))) {
+    cli_abort(
+      "`max_version` must have the same types as `epi_archive$DT$version`."
+    )
+  }
+  assert_scalar(max_version, na.ok = FALSE)
+  if (max_version > epi_archive$versions_end) {
+    cli_abort("`max_version` must be at most `epi_archive$versions_end`.")
+  }
+  assert_logical(all_versions, len = 1)
+  if (!is.na(epi_archive$clobberable_versions_start) && max_version >= epi_archive$clobberable_versions_start) {
+    cli_warn(
+      'Getting data as of some recent version which could still be
+      overwritten (under routine circumstances) without assigning a new
+      version number (a.k.a. "clobbered"). Thus, the snapshot that we
+      produce here should not be expected to be reproducible later. See
+      `?epi_archive` for more info and `?epix_as_of` on how to muffle.',
+      class = "epiprocess__snapshot_as_of_clobberable_version"
+    )
+  }
+
+  # Filter by version and return
+  if (all_versions) {
+    # epi_archive is copied into result, so we can modify result directly
+    result <- epix_truncate_versions_after(epi_archive, max_version)
+    result$DT <- result$DT[time_value >= min_time_value, ]
+    return(result)
+  }
+
+  # Record the extra key columns as ONE `other_keys` metadata entry. Passing
+  # the character vector straight to `c()` would spread several keys over
+  # separate `other_keys1`, `other_keys2`, ... entries.
+  snapshot_metadata <- epi_archive$additional_metadata
+  if (!is.null(other_keys)) {
+    snapshot_metadata <- c(snapshot_metadata, list(other_keys = other_keys))
+  }
+
+  # Make sure to use data.table ways of filtering and selecting
+  as_of_epi_df <- epi_archive$DT[time_value >= min_time_value & version <= max_version, ] %>%
+    unique(
+      by = c("geo_value", "time_value", other_keys),
+      fromLast = TRUE
+    ) %>%
+    tibble::as_tibble() %>%
+    dplyr::select(-"version") %>%
+    as_epi_df(
+      geo_type = epi_archive$geo_type,
+      time_type = epi_archive$time_type,
+      as_of = max_version,
+      additional_metadata = snapshot_metadata
+    )
+
+  return(as_of_epi_df)
+}
+
+
+#' @export
+fill_through_version <- function(x, ...) {
+  UseMethod("fill_through_version")
+}
+
+
+#' Fill through version
+#' @description Fill in unobserved history using requested scheme by mutating
+#'   the given object and potentially reseating its fields. See
+#'   [`epix_fill_through_version`], which doesn't mutate the input archive but
+#'   might alias its fields.
+#'
+#' @param epi_archive an `epi_archive` object
+#' @param fill_versions_end as in [`epix_fill_through_version`]
+#' @param how as in [`epix_fill_through_version`]
+#'
+#' @return The updated archive, [invisibly][base::invisible]. The archive is
+#'   a plain list here, so the fields are reseated on this function's own copy:
+#'   callers must use the return value to see the filled archive.
+#'
+#' @importFrom data.table key setkeyv := address copy
+#' @importFrom rlang arg_match
+#' @export
+fill_through_version.epi_archive2 <- function(
+    epi_archive,
+    fill_versions_end,
+    how = c("na", "locf")) {
+  validate_version_bound(fill_versions_end, epi_archive$DT, na_ok = FALSE)
+  how <- arg_match(how)
+  if (epi_archive$versions_end < fill_versions_end) {
+    new_DT <- switch(how,
+      "na" = {
+        # old DT + a version consisting of all NA observations
+        # immediately after the last currently/actually-observed
+        # version. Note that this NA-observation version must only be
+        # added if `epi_archive` is outdated.
+        nonversion_key_cols <- setdiff(key(epi_archive$DT), "version")
+        nonkey_cols <- setdiff(names(epi_archive$DT), key(epi_archive$DT))
+        next_version_tag <- next_after(epi_archive$versions_end)
+        if (next_version_tag > fill_versions_end) {
+          cli_abort(sprintf(paste(
+            "Apparent problem with `next_after` method:",
+            "archive contained observations through version %s",
+            "and the next possible version was supposed to be %s,",
+            "but this appeared to jump from a version < %3$s",
+            "to one > %3$s, implying at least one version in between."
+          ), epi_archive$versions_end, next_version_tag, fill_versions_end))
+        }
+        nonversion_key_vals_ever_recorded <- unique(epi_archive$DT, by = nonversion_key_cols)
+        # In edge cases, the `unique` result can alias the original
+        # DT; detect and copy if necessary (the `:=` calls below modify
+        # their target by reference, and must not touch the original DT):
+        if (identical(address(epi_archive$DT), address(nonversion_key_vals_ever_recorded))) {
+          nonversion_key_vals_ever_recorded <- copy(nonversion_key_vals_ever_recorded)
+        }
+        next_version_DT <- nonversion_key_vals_ever_recorded[
+          , version := next_version_tag
+        ][
+          # this makes the class of these columns logical (`NA` is a
+          # logical NA; we're relying on the rbind below to convert to
+          # the proper class&typeof)
+          , (nonkey_cols) := NA
+        ]
+        # full result DT:
+        setkeyv(rbind(epi_archive$DT, next_version_DT), key(epi_archive$DT))[]
+      },
+      "locf" = {
+        # just the old DT; LOCF is built into other methods:
+        epi_archive$DT
+      }
+    )
+    new_versions_end <- fill_versions_end
+    # Update `epi_archive` all at once with simple, error-free operations +
+    # return below:
+    epi_archive$DT <- new_DT
+    epi_archive$versions_end <- new_versions_end
+  } else {
+    # Already sufficiently up to date; nothing to do.
+  }
+  return(invisible(epi_archive))
+}
+
+
+#' @export
+truncate_versions_after <- function(x, ...) {
+  UseMethod("truncate_versions_after")
+}
+
+
+#' Truncate versions after
+#' @description Filter to keep only older versions, mutating the archive by
+#'   potentially reseating but not mutating some fields. `DT` is likely, but not
+#'   guaranteed, to be copied. Returns the mutated archive
+#'   [invisibly][base::invisible].
+#' @param epi_archive as in [`epix_truncate_versions_after`]
+#' @param max_version as in [`epix_truncate_versions_after`]
+truncate_versions_after.epi_archive2 <- function(
+    epi_archive,
+    max_version) {
+  # `max_version` has to be comparable with the `version` column: identical
+  # class vector, identical storage type, and exactly one non-missing value.
+  recorded_versions <- epi_archive$DT$version
+  if (!test_set_equal(class(max_version), class(recorded_versions))) {
+    cli_abort("`max_version` must have the same classes as `epi_archive$DT$version`.")
+  }
+  if (!test_set_equal(typeof(max_version), typeof(recorded_versions))) {
+    cli_abort("`max_version` must have the same types as `epi_archive$DT$version`.")
+  }
+  assert_scalar(max_version, na.ok = FALSE)
+  # Truncation may only shorten the archive, never extend it.
+  if (max_version > epi_archive$versions_end) {
+    cli_abort("`max_version` must be at most `epi_archive$versions_end`.")
+  }
+  # Keep every column, but only the rows recorded at or before `max_version`.
+  # (This filter operation seems to always copy the DT, even if it keeps every
+  # entry; we don't guarantee this behavior in documentation, though, so we
+  # could change to alias in this case.)
+  epi_archive$DT <- epi_archive$DT[
+    epi_archive$DT$version <= max_version,
+    colnames(epi_archive$DT),
+    with = FALSE
+  ]
+  # A clobberable range starting after the new end is reset to "none" (`NA`).
+  clobber_start <- epi_archive$clobberable_versions_start
+  if (!is.na(clobber_start) && clobber_start > max_version) {
+    epi_archive$clobberable_versions_start <- NA
+  }
+  epi_archive$versions_end <- max_version
+  invisible(epi_archive)
+}
+
+
+#' Merge epi archive
+#' @description Merges another `epi_archive` with the current one, mutating the
+#' current one by reseating its `DT` and several other fields, but avoiding
+#' mutation of the old `DT`; returns the current archive
+#' [invisibly][base::invisible].
See [`epix_merge`] for a full description
+#' of the non-R6-method version, which does not mutate either archive, and
+#' does not alias either archive's `DT`.
+#' @param x as in [`epix_merge`]
+#' @param y as in [`epix_merge`]
+#' @param sync as in [`epix_merge`]
+#' @param compactify as in [`epix_merge`]
+merge_epi_archive2 <- function(
+    x,
+    y,
+    sync = c("forbid", "na", "locf", "truncate"),
+    compactify = TRUE) {
+  # NOTE(review): this delegates to `epix_merge`, while this patch's file list
+  # also adds `man/epix_merge2.Rd`; confirm which merge function is intended
+  # for `epi_archive2` inputs.
+  result <- epix_merge(x, y,
+    sync = sync,
+    compactify = compactify
+  )
+
+  # TODO: Use encapsulating methods instead.
+  if (length(x$private_fields) != 0L) {
+    cli_abort("expected no private fields in x",
+      internal = TRUE
+    )
+  }
+
+  # Decide which fields of `x` to overwrite with the merge result. If `x`
+  # carries a `public_fields` entry, use its names as before. Otherwise (an
+  # object without that entry) fall back to the fields `x` and `result` have in
+  # common, so that the merge result is not silently discarded.
+  if (is.null(x$public_fields)) {
+    field_names <- intersect(names(x), names(result))
+  } else {
+    field_names <- names(x$public_fields)
+  }
+
+  # Mutate fields all at once, trying to avoid any potential errors:
+  for (field_name in field_names) {
+    x[[field_name]] <- result[[field_name]]
+  }
+
+  return(invisible(x))
+}
+
+
+#' `group_by` and related methods for `epi_archive`, `grouped_epi_archive`
+#'
+#' @param .data An `epi_archive` or `grouped_epi_archive`
+#' @param ... Similar to [`dplyr::group_by`] (see "Details:" for edge cases);
+#' * For `group_by`: unquoted variable name(s) or other
+#' ["data masking"][dplyr::dplyr_data_masking] expression(s). It's possible to
+#' use [`dplyr::mutate`]-like syntax here to calculate new columns on which to
+#' perform grouping, but note that, if you are regrouping an already-grouped
+#' `.data` object, the calculations will be carried out ignoring such grouping
+#' (same as [in dplyr][dplyr::group_by]).
+#' * For `ungroup`: either
+#' * empty, in order to remove the grouping and output an `epi_archive`; or
+#' * variable name(s) or other ["tidy-select"][dplyr::dplyr_tidy_select]
+#' expression(s), in order to remove the matching variables from the list of
+#' grouping variables, and output another `grouped_epi_archive`.
+#' @param .add Boolean.
If `FALSE`, the default, the output will be grouped by +#' the variable selection from `...` only; if `TRUE`, the output will be +#' grouped by the current grouping variables plus the variable selection from +#' `...`. +#' @param .drop As described in [`dplyr::group_by`]; determines treatment of +#' factor columns. +#' @param x For `groups` or `ungroup`: a `grouped_epi_archive`; for +#' `is_grouped_epi_archive`: any object +#' @param .tbl (For `group_by_drop_default`:) an `epi_archive` or +#' `grouped_epi_archive` (`epi_archive` dispatches to the S3 default method; +#' `grouped_epi_archive` dispatches its own S3 method) +#' +#' @details +#' +#' To match `dplyr`, `group_by` allows "data masking" (also referred to as +#' "tidy evaluation") expressions `...`, not just column names, in a way similar +#' to `mutate`. Note that replacing or removing key columns with these +#' expressions is disabled. +#' +#' `archive %>% group_by()` and other expressions that group or regroup by zero +#' columns (indicating that all rows should be treated as part of one large +#' group) will output a `grouped_epi_archive`, in order to enable the use of +#' `grouped_epi_archive` methods on the result. This is in slight contrast to +#' the same operations on tibbles and grouped tibbles, which will *not* output a +#' `grouped_df` in these circumstances. +#' +#' Using `group_by` with `.add=FALSE` to override the existing grouping is +#' disabled; instead, `ungroup` first then `group_by`. +#' +#' Mutation and aliasing: `group_by` tries to use a shallow copy of the `DT`, +#' introducing column-level aliasing between its input and its result. This +#' doesn't follow the general model for most `data.table` operations, which +#' seems to be that, given a nonaliased (i.e., unique) pointer to a +#' `data.table` object, its pointers to its columns should also be nonaliased.
+#' If you mutate any of the columns of either the input or result, first ensure
+#' that it is fine if columns of the other are also mutated, but do not rely on
+#' such behavior to occur. Additionally, never perform mutation on the key
+#' columns at all (except for strictly increasing transformations), as this will
+#' invalidate sortedness assumptions about the rows.
+#'
+#' `group_by_drop_default` on (ungrouped) `epi_archive`s is expected to dispatch
+#' to `group_by_drop_default.default` (but there is a dedicated method for
+#' `grouped_epi_archive`s).
+#'
+#' @examples
+#'
+#' grouped_archive <- archive_cases_dv_subset %>% group_by(geo_value)
+#'
+#' # `print` for metadata and method listing:
+#' grouped_archive %>% print()
+#'
+#' # The primary use for grouping is to perform a grouped `epix_slide`:
+#'
+#' archive_cases_dv_subset %>%
+#'   group_by(geo_value) %>%
+#'   epix_slide(
+#'     f = ~ mean(.x$case_rate_7d_av),
+#'     before = 2,
+#'     ref_time_values = as.Date("2020-06-11") + 0:2,
+#'     new_col_name = "case_rate_3d_av"
+#'   ) %>%
+#'   ungroup()
+#'
+#' # -----------------------------------------------------------------
+#'
+#' # Advanced: some other features of dplyr grouping are implemented:
+#'
+#' library(dplyr)
+#' toy_archive <-
+#'   tribble(
+#'     ~geo_value, ~age_group, ~time_value, ~version, ~value,
+#'     "us", "adult", "2000-01-01", "2000-01-02", 121,
+#'     "us", "pediatric", "2000-01-02", "2000-01-03", 5, # (addition)
+#'     "us", "adult", "2000-01-01", "2000-01-03", 125, # (revision)
+#'     "us", "adult", "2000-01-02", "2000-01-03", 130 # (addition)
+#'   ) %>%
+#'   mutate(
+#'     age_group = ordered(age_group, c("pediatric", "adult")),
+#'     time_value = as.Date(time_value),
+#'     version = as.Date(version)
+#'   ) %>%
+#'   as_epi_archive(other_keys = "age_group")
+#'
+#' # The following are equivalent:
+#' toy_archive %>% group_by(geo_value, age_group)
+#' toy_archive %>%
+#'   group_by(geo_value) %>%
+#'   group_by(age_group, .add = TRUE)
+#' grouping_cols <- c("geo_value", "age_group")
+#' toy_archive %>% group_by(across(all_of(grouping_cols)))
+#'
+#' # And these are equivalent:
+#' toy_archive %>% group_by(geo_value)
+#' toy_archive %>%
+#'   group_by(geo_value, age_group) %>%
+#'   ungroup(age_group)
+#'
+#' # To get the grouping variable names as a `list` of `name`s (a.k.a. symbols):
+#' toy_archive %>%
+#'   group_by(geo_value) %>%
+#'   groups()
+#'
+#' toy_archive %>%
+#'   group_by(geo_value, age_group, .drop = FALSE) %>%
+#'   epix_slide(f = ~ sum(.x$value), before = 20) %>%
+#'   ungroup()
+#'
+#' @importFrom dplyr group_by
+#' @export
+#'
+#' @aliases grouped_epi_archive
+group_by.epi_archive2 <- function(epi_archive, ..., .add = FALSE, .drop = dplyr::group_by_drop_default(epi_archive)) {
+  # `add` makes no difference; this is an ungrouped `epi_archive`.
+  #
+  # The helper's result is read below through two entries: `"archive"` (passed
+  # on as the archive to wrap) and `"request_names"` (passed on as the grouping
+  # variable names).
+  detailed_mutate <- epix_detailed_restricted_mutate2(epi_archive, ...)
+  # `.drop` is used directly as an `if` condition below, so it must be a single
+  # non-missing logical, not just any logical vector.
+  assert_logical(.drop, len = 1L, any.missing = FALSE)
+  if (!.drop) {
+    grouping_cols <- as.list(detailed_mutate[["archive"]][["DT"]])[detailed_mutate[["request_names"]]]
+    grouping_col_is_factor <- purrr::map_lgl(grouping_cols, is.factor)
+    # ^ Use `as.list` to try to avoid any possibility of a deep copy.
+    if (!any(grouping_col_is_factor)) {
+      cli_warn(
+        "`.drop=FALSE` but there are no factor grouping columns;
+        did you mean to convert one of the columns to a factor beforehand?",
+        class = "epiprocess__group_by_epi_archive__drop_FALSE_no_factors"
+      )
+    } else if (any(diff(grouping_col_is_factor) == -1L)) {
+      # A difference of -1 between consecutive flags means a factor column is
+      # immediately followed by a non-factor column.
+      cli_warn(
+        "`.drop=FALSE` but there are one or more non-factor grouping columns listed
+        after a factor grouping column; this may produce groups with `NA`s for these
+        columns; see https://github.com/tidyverse/dplyr/issues/5369#issuecomment-683762553;
+        depending on how you want completion to work, you might instead want to convert all
+        grouping columns to factors beforehand, specify the non-factor grouping columns first,
+        or use `.drop=TRUE` and add a call to `tidyr::complete`.",
+        class = "epiprocess__group_by_epi_archive__drop_FALSE_nonfactor_after_factor"
+      )
+    }
+  }
+  new_grouped_epi_archive(detailed_mutate[["archive"]],
+    detailed_mutate[["request_names"]],
+    drop = .drop
+  )
+}
+
+
+#' @export
+slide <- function(.data, ...) {
+  UseMethod("slide")
+}
+
+
+#' Slide over epi archive
+#' @description Slides a given function over variables in an `epi_archive`
+#' object. See the documentation for the wrapper function [`epix_slide()`] for
+#' details. The parameter descriptions below are copied from there
+#' @importFrom data.table key
+#' @importFrom rlang !! !!! enquo quo_is_missing enquos is_quosure sym syms
+#' @param f Function, formula, or missing; together with `...` specifies the
+#' computation to slide. To "slide" means to apply a computation over a
+#' sliding (a.k.a. "rolling") time window for each data group. The window is
+#' determined by the `before` parameter described below. One time step is
+#' typically one day or one week; see [`epi_slide`] details for more
+#' explanation.
If a function, `f` must take an `epi_df` with the same +#' column names as the archive's `DT`, minus the `version` column; followed +#' by a one-row tibble containing the values of the grouping variables for +#' the associated group; followed by a reference time value, usually as a +#' `Date` object; followed by any number of named arguments. If a formula, +#' `f` can operate directly on columns accessed via `.x$var` or `.$var`, as +#' in `~ mean (.x$var)` to compute a mean of a column `var` for each +#' group-`ref_time_value` combination. The group key can be accessed via +#' `.y` or `.group_key`, and the reference time value can be accessed via +#' `.z` or `.ref_time_value`. If `f` is missing, then `...` will specify the +#' computation. +#' @param ... Additional arguments to pass to the function or formula specified +#' via `f`. Alternatively, if `f` is missing, then `...` is interpreted as an +#' expression for tidy evaluation; in addition to referring to columns +#' directly by name, the expression has access to `.data` and `.env` pronouns +#' as in `dplyr` verbs, and can also refer to the `.group_key` and +#' `.ref_time_value`. See details of [`epi_slide`]. +#' @param before How far `before` each `ref_time_value` should the sliding +#' window extend? If provided, should be a single, non-NA, +#' [integer-compatible][vctrs::vec_cast] number of time steps. This window +#' endpoint is inclusive. For example, if `before = 7`, and one time step is +#' one day, then to produce a value for a `ref_time_value` of January 8, we +#' apply the given function or formula to data (for each group present) with +#' `time_value`s from January 1 onward, as they were reported on January 8. +#' For typical disease surveillance sources, this will not include any data +#' with a `time_value` of January 8, and, depending on the amount of reporting +#' latency, may not include January 7 or even earlier `time_value`s. 
(If +#' instead the archive were to hold nowcasts instead of regular surveillance +#' data, then we would indeed expect data for `time_value` January 8. If it +#' were to hold forecasts, then we would expect data for `time_value`s after +#' January 8, and the sliding window would extend as far after each +#' `ref_time_value` as needed to include all such `time_value`s.) +#' @param ref_time_values Reference time values / versions for sliding +#' computations; each element of this vector serves both as the anchor point +#' for the `time_value` window for the computation and the `max_version` +#' `as_of` which we fetch data in this window. If missing, then this will be set +#' to a regularly-spaced sequence of values set to cover the range of +#' `version`s in the `DT` plus the `versions_end`; the spacing of values will +#' be guessed (using the GCD of the skips between values). +#' @param time_step Optional function used to define the meaning of one time +#' step, which if specified, overrides the default choice based on the +#' `time_value` column. This function must take a positive integer and return +#' an object of class `lubridate::period`. For example, we can use `time_step +#' = lubridate::hours` in order to set the time step to be one hour (this +#' would only be meaningful if `time_value` is of class `POSIXct`). +#' @param new_col_name String indicating the name of the new column that will +#' contain the derivative values. Default is "slide_value"; note that setting +#' `new_col_name` equal to an existing column name will overwrite this column. +#' @param as_list_col Should the slide results be held in a list column, or be +#' [unchopped][tidyr::unchop]/[unnested][tidyr::unnest]?
Default is `FALSE`,
+#' in which case a list object returned by `f` would be unnested (using
+#' [`tidyr::unnest()`]), and, if the slide computations output data frames,
+#' the names of the resulting columns are given by prepending `new_col_name`
+#' to the names of the list elements.
+#' @param names_sep String specifying the separator to use in `tidyr::unnest()`
+#' when `as_list_col = FALSE`. Default is "_". Using `NULL` drops the prefix
+#' from `new_col_name` entirely.
+#' @param all_versions (Not the same as `all_rows` parameter of `epi_slide`.) If
+#' `all_versions = TRUE`, then `f` will be passed the version history (all
+#' `version <= ref_time_value`) for rows having `time_value` between
+#' `ref_time_value - before` and `ref_time_value`. Otherwise, `f` will be
+#' passed only the most recent `version` for every unique `time_value`.
+#' Default is `FALSE`.
+slide.epi_archive2 <- function(epi_archive, f, ..., before, ref_time_values,
+                               time_step, new_col_name = "slide_value",
+                               as_list_col = FALSE, names_sep = "_",
+                               all_versions = FALSE) {
+  # An "ungrouped" slide is a slide over one big group: group by zero
+  # variables (like `dplyr::summarize` does) and hand the work to the `slide`
+  # method of the resulting `grouped_epi_archive`.
+  trivially_grouped <- group_by(epi_archive)
+  slid <- slide(
+    trivially_grouped,
+    f,
+    ...,
+    before = before,
+    ref_time_values = ref_time_values,
+    time_step = time_step,
+    new_col_name = new_col_name,
+    as_list_col = as_list_col,
+    names_sep = names_sep,
+    all_versions = all_versions
+  )
+  # A slide on an ungrouped archive should output something ungrouped rather
+  # than retaining the trivial (0-variable) grouping applied above, so we
+  # `ungroup()`. However, the current `dplyr` implementation automatically
+  # ignores/drops trivial groupings, so this is just a no-op for now.
+  ungroup(slid)
+}
+
+
+#' Convert to `epi_archive` format
+#'
+#' Converts a data frame, data table, or tibble into an `epi_archive`
+#' object.
See the [archive +#' vignette](https://cmu-delphi.github.io/epiprocess/articles/archive.html) for +#' examples. The parameter descriptions below are copied from there +#' +#' @param x A data frame, data table, or tibble, with columns `geo_value`, +#' `time_value`, `version`, and then any additional number of columns. +#' @param geo_type Type for the geo values. If missing, then the function will +#' attempt to infer it from the geo values present; if this fails, then it +#' will be set to "custom". +#' @param time_type Type for the time values. If missing, then the function will +#' attempt to infer it from the time values present; if this fails, then it +#' will be set to "custom". +#' @param other_keys Character vector specifying the names of variables in `x` +#' that should be considered key variables (in the language of `data.table`) +#' apart from "geo_value", "time_value", and "version". +#' @param additional_metadata List of additional metadata to attach to the +#' `epi_archive` object. The metadata will have `geo_type` and `time_type` +#' fields; named entries from the passed list will be included as well. +#' @param compactify Optional; Boolean or `NULL`: should we remove rows that are +#' considered redundant for the purposes of `epi_archive`'s built-in methods +#' such as `as_of`? As these methods use the last version of each observation +#' carried forward (LOCF) to interpolate between the version data provided, +#' rows that don't change these LOCF results can potentially be omitted to +#' save space. `TRUE` will remove these rows, `FALSE` will not, and missing or +#' `NULL` will remove these rows and issue a warning. Generally, this can be +#' set to `TRUE`, but if you directly inspect or edit the fields of the +#' `epi_archive` such as its `DT`, you will have to determine whether +#' `compactify=TRUE` will produce the desired results.
If compactification +#' here is removing a large proportion of the rows, this may indicate a +#' potential for space, time, or bandwidth savings upstream the data pipeline, +#' e.g., when fetching, storing, or preparing the input data `x` +#' @param clobberable_versions_start Optional; `length`-1; either a value of the +#' same `class` and `typeof` as `x$version`, or an `NA` of any `class` and +#' `typeof`: specifically, either (a) the earliest version that could be +#' subject to "clobbering" (being overwritten with different update data, but +#' using the *same* version tag as the old update data), or (b) `NA`, to +#' indicate that no versions are clobberable. There are a variety of reasons +#' why versions could be clobberable under routine circumstances, such as (a) +#' today's version of one/all of the columns being published after initially +#' being filled with `NA` or LOCF, (b) a buggy version of today's data being +#' published but then fixed and republished later in the day, or (c) data +#' pipeline delays (e.g., publisher uploading, periodic scraping, database +#' syncing, periodic fetching, etc.) that make events (a) or (b) reflected +#' later in the day (or even on a different day) than expected; potential +#' causes vary between different data pipelines. The default value is `NA`, +#' which doesn't consider any versions to be clobberable. Another setting that +#' may be appropriate for some pipelines is `max_version_with_row_in(x)`. +#' @param versions_end Optional; length-1, same `class` and `typeof` as +#' `x$version`: what is the last version we have observed? The default is +#' `max_version_with_row_in(x)`, but values greater than this could also be +#' valid, and would indicate that we observed additional versions of the data +#' beyond `max(x$version)`, but they all contained empty updates. 
(The default +#' value of `clobberable_versions_start` does not fully trust these empty +#' updates, and assumes that any version `>= max(x$version)` could be +#' clobbered.) If `nrow(x) == 0`, then this argument is mandatory. +#' @return An `epi_archive` object. +#' +#' @details This is simply a wrapper around the `new()` method of the `epi_archive` +#' class, so for example: +#' ``` +#' x <- as_epi_archive(df, geo_type = "state", time_type = "day") +#' ``` +#' would be equivalent to: +#' ``` +#' x <- epi_archive$new(df, geo_type = "state", time_type = "day") +#' ``` +#' +#' @export +#' @examples +#' # Simple ex. with necessary keys +#' tib <- tibble::tibble( +#' geo_value = rep(c("ca", "hi"), each = 5), +#' time_value = rep(seq(as.Date("2020-01-01"), +#' by = 1, length.out = 5 +#' ), times = 2), +#' version = rep(seq(as.Date("2020-01-02"), +#' by = 1, length.out = 5 +#' ), times = 2), +#' value = rnorm(10, mean = 2, sd = 1) +#' ) +#' +#' toy_epi_archive <- tib %>% as_epi_archive( +#' geo_type = "state", +#' time_type = "day" +#' ) +#' toy_epi_archive +#' +#' # Ex.
with an additional key for county
+#' df <- data.frame(
+#'   geo_value = c(replicate(2, "ca"), replicate(2, "fl")),
+#'   county = c(1, 3, 2, 5),
+#'   time_value = c(
+#'     "2020-06-01",
+#'     "2020-06-02",
+#'     "2020-06-01",
+#'     "2020-06-02"
+#'   ),
+#'   version = c(
+#'     "2020-06-02",
+#'     "2020-06-03",
+#'     "2020-06-02",
+#'     "2020-06-03"
+#'   ),
+#'   cases = c(1, 2, 3, 4),
+#'   cases_rate = c(0.01, 0.02, 0.01, 0.05)
+#' )
+#'
+#' x <- df %>% as_epi_archive(
+#'   geo_type = "state",
+#'   time_type = "day",
+#'   other_keys = "county"
+#' )
+as_epi_archive2 <- function(x, geo_type, time_type, other_keys,
+                            additional_metadata = list(),
+                            compactify = NULL,
+                            clobberable_versions_start = NA,
+                            versions_end = max_version_with_row_in(x)) {
+  # Thin wrapper: every argument is forwarded, in order, to `new_epi_archive2`.
+  new_epi_archive2(
+    x, geo_type, time_type, other_keys, additional_metadata,
+    compactify, clobberable_versions_start, versions_end
+  )
+}
+
+#' Test for `epi_archive` format
+#'
+#' @param x An object.
+#' @param grouped_okay Optional; Boolean; should a `grouped_epi_archive` also
+#' count? Default is `FALSE`.
+#' @return `TRUE` if the object inherits from `epi_archive`.
+#'
+#' @export
+#' @examples
+#' is_epi_archive(jhu_csse_daily_subset) # FALSE (this is an epi_df, not epi_archive)
+#' is_epi_archive(archive_cases_dv_subset) # TRUE
+#'
+#' # By default, grouped_epi_archives don't count as epi_archives, as they may
+#' # support a different set of operations from regular `epi_archives`. This
+#' # behavior can be controlled by `grouped_okay`.
+#' grouped_archive <- archive_cases_dv_subset$group_by(geo_value)
+#' is_epi_archive(grouped_archive) # FALSE
+#' is_epi_archive(grouped_archive, grouped_okay = TRUE) # TRUE
+#'
+#' @seealso [`is_grouped_epi_archive`]
+is_epi_archive2 <- function(x, grouped_okay = FALSE) {
+  # Grouped archives built by `new_grouped_epi_archive` carry both
+  # "grouped_epi_archive2" and "epi_archive2" in their class vector, so the
+  # grouped case has to be decided first; otherwise they would count as
+  # archives even when `grouped_okay = FALSE`.
+  if (inherits(x, "grouped_epi_archive2")) {
+    return(isTRUE(grouped_okay))
+  }
+  inherits(x, "epi_archive2")
+}
+
+
+#' @export
+clone <- function(x, ...) {
+  # S3 generic; dispatches on the class of `x`.
+  UseMethod("clone")
+}
+
+
+#' @export
+clone.epi_archive2 <- function(epi_archive, deep = FALSE) {
+  # TODO: Finish.
+  # Only a deep clone gets its own copy of `DT`; a shallow clone returns the
+  # archive with its `DT` entry untouched.
+  if (deep) {
+    epi_archive$DT <- copy(epi_archive$DT)
+  }
+  return(epi_archive)
+}
diff --git a/R/grouped_archive_new.R b/R/grouped_archive_new.R
new file mode 100644
index 00000000..c0e6c35e
--- /dev/null
+++ b/R/grouped_archive_new.R
@@ -0,0 +1,456 @@
+#'
+#' Convenience function for performing a `tidy_select` on dots according to its
+#' docs, and taking the names (rather than the integer indices).
+#'
+#' @param ... tidyselect-syntax selection description
+#' @param .data named vector / data frame; context for the description / the
+#' object to which the selections apply
+#' @return character vector containing names of entries/columns of
+#' `names(.data)` denoting the selection
+#'
+#' @noRd
+eval_pure_select_names_from_dots <- function(..., .data) {
+  # `?tidyselect::eval_select` tells us to use this form when we take in dots.
+  # It seems a bit peculiar, since the expr doesn't pack with it a way to get at
+  # the environment for the dots, but it looks like `eval_select` will assume
+  # the caller env (our `environment()`) when given an expr, and thus have
+  # access to the dots.
+  #
+  # If we were allowing renaming, we'd need to be careful about which names (new
+  # vs. old vs. both) to return here.
+  names(tidyselect::eval_select(rlang::expr(c(...)), .data, allow_rename = FALSE))
+}
+
+#' Get names of dots without forcing the dots
+#'
+#' For use in functions that use nonstandard evaluation (NSE) on the dots; we
+#' can't use the pattern `names(list(...))` in this case because it will attempt
+#' to force/(standard-)evaluate the dots, and we want to avoid attempted forcing of the
+#' dots if we're using NSE.
+#'
+#' @noRd
+nse_dots_names <- function(...) {
+  names(rlang::call_match())
+}
+# Same as `nse_dots_names`, but goes through `rlang::names2` instead of `names`.
+nse_dots_names2 <- function(...) {
+  rlang::names2(rlang::call_match())
+}
+
+#' @importFrom dplyr group_by_drop_default
+#' @noRd
+new_grouped_epi_archive <- function(ungrouped, vars, drop) {
+  # Constructor: wraps an ungrouped archive together with the grouping
+  # variable names `vars` and the `drop` setting.
+  #
+  # Reject already-grouped input. The class assigned at the bottom of this
+  # constructor is "grouped_epi_archive2", so that name must be checked here
+  # (the un-suffixed name is kept as well); otherwise a grouped archive would
+  # slip through, since it also inherits from "epi_archive2".
+  if (inherits(ungrouped, c("grouped_epi_archive", "grouped_epi_archive2"))) {
+    cli_abort(
+      "`ungrouped` must not already be grouped (neither automatic regrouping
+      nor nested grouping is supported). Either use `group_by` with `.add=TRUE`,
+      or `ungroup` first.",
+      class = "epiprocess__grouped_epi_archive__ungrouped_arg_is_already_grouped",
+      epiprocess__ungrouped_class = class(ungrouped),
+      epiprocess__ungrouped_groups = groups(ungrouped)
+    )
+  }
+  assert_class(ungrouped, "epi_archive2")
+  assert_character(vars)
+  if (!test_subset(vars, names(ungrouped$DT))) {
+    cli_abort(
+      "All grouping variables `vars` must be present in the data."
+    )
+  }
+  if ("version" %in% vars) {
+    cli_abort("`version` has a special interpretation and cannot be used by itself as a grouping variable")
+  }
+  assert_logical(drop, len = 1)
+
+  # -----
+  private <- list()
+  private$ungrouped <- ungrouped
+  private$vars <- vars
+  private$drop <- drop
+
+  return(structure(
+    list(
+      private = private
+    ),
+    class = c("grouped_epi_archive2", "epi_archive2")
+  ))
+}
+
+#' @export
+print.grouped_epi_archive2 <- function(grouped_epi_archive, class = TRUE) {
+  if (class) cat("A `grouped_epi_archive` object:\n")
+  writeLines(wrap_varnames(grouped_epi_archive$private$vars, initial = "* Groups: "))
+  # If none of the grouping vars is a factor, then $drop doesn't seem
+  # relevant, so try to be less verbose and don't message about it.
+  #
+  # Below map-then-extract may look weird, but the more natural
+  # extract-then-map appears to trigger copies of the extracted columns
+  # since we are working with a `data.table` (unless we go through
+  # `as.list`, but its current column-aliasing behavior is probably not
+  # something to rely too much on), while map functions currently appear
+  # to avoid column copies.
+  if (any(purrr::map_lgl(grouped_epi_archive$private$ungrouped$DT, is.factor)[grouped_epi_archive$private$vars])) {
+    # (`initial` is spelled out rather than relying on partial matching.)
+    cat(strwrap(initial = "* ", prefix = " ", sprintf(
+      "%s groups formed by factor levels that don't appear in the data",
+      if (grouped_epi_archive$private$drop) "Drops" else "Does not drop"
+    )))
+    cat("\n")
+  }
+  cat("It wraps an ungrouped `epi_archive`, with metadata:\n")
+  print(grouped_epi_archive$private$ungrouped, class = FALSE)
+  # Return self invisibly for convenience in `$`-"pipe":
+  invisible(grouped_epi_archive)
+}
+
+#' @include methods-epi_archive.R
+#' @rdname group_by.epi_archive
+#'
+#' @importFrom dplyr group_by
+#' @export
+group_by.grouped_epi_archive2 <- function(
+    grouped_epi_archive,
+    ...,
+    .add = FALSE,
+    .drop = dplyr::group_by_drop_default(grouped_epi_archive)) {
+  assert_logical(.add, len = 1)
+  if (!.add) {
+    cli_abort('`group_by` on a `grouped_epi_archive` with `.add=FALSE` is forbidden
+      (neither automatic regrouping nor nested grouping is supported).
+      If you want to "regroup", replacing the existing grouping vars, `ungroup` first and then `group_by`.
+      If you want to add to the existing grouping vars, call `group_by` specifying `.add=TRUE`.
+      ',
+      class = "epiprocess__grouped_epi_archive_group_by_with_add_FALSE"
+    )
+  } else {
+    # `group_by` `...` computations are performed on ungrouped data (see
+    # `?dplyr::group_by`)
+    detailed_mutate <- epix_detailed_restricted_mutate2(grouped_epi_archive$private$ungrouped, ...)
+    out_ungrouped <- detailed_mutate[["archive"]]
+    vars_from_dots <- detailed_mutate[["request_names"]]
+    vars <- union(grouped_epi_archive$private$vars, vars_from_dots)
+    # Wrap the archive returned by the mutate step (not the pre-mutate one), so
+    # that any columns created in `...` are present for the new grouping vars.
+    new_grouped_epi_archive(out_ungrouped, vars, .drop)
+  }
+}
+
+#' @include methods-epi_archive.R
+#' @rdname group_by.epi_archive
+#'
+#' @export
+group_by_drop_default.grouped_epi_archive2 <- function(grouped_epi_archive) {
+  grouped_epi_archive$private$drop
+}
+
+#' @include methods-epi_archive.R
+#' @rdname group_by.epi_archive
+#'
+#' @importFrom dplyr groups
+#' @export
+groups.grouped_epi_archive2 <- function(grouped_epi_archive) {
+  rlang::syms(grouped_epi_archive$private$vars)
+}
+
+#' @include methods-epi_archive.R
+#' @rdname group_by.epi_archive
+#'
+#' @importFrom dplyr ungroup
+#' @export
+ungroup.grouped_epi_archive2 <- function(grouped_epi_archive, ...) {
+  if (rlang::dots_n(...) == 0L) {
+    # No dots = special behavior: remove all grouping vars and convert to
+    # an ungrouped class, as with `grouped_df`s.
+    grouped_epi_archive$private$ungrouped
+  } else {
+    exclude_vars <- eval_pure_select_names_from_dots(..., .data = grouped_epi_archive$private$ungrouped$DT)
+    # (requiring a pure selection here is a little stricter than dplyr
+    # implementations, but passing a renaming selection into `ungroup`
+    # seems pretty weird.)
+    result_vars <- grouped_epi_archive$private$vars[!grouped_epi_archive$private$vars %in% exclude_vars]
+    # `vars` might be length 0 if the user's tidyselection removed all
+    # grouping vars. Unlike with tibble, opt here to keep the result as a
+    # grouped_epi_archive, for output class consistency when `...` is
+    # provided.
+    new_grouped_epi_archive(grouped_epi_archive$private$ungrouped, result_vars, grouped_epi_archive$private$drop)
+  }
+}
+
+#' Truncate versions after a given version, grouped
+#' @description Filter to keep only older versions by mutating the underlying
+#' `epi_archive` using `$truncate_versions_after`.
Returns the mutated
+#' `grouped_epi_archive` [invisibly][base::invisible].
+#' @param grouped_epi_archive a grouped archive; plays the role of `x` in [`epix_truncate_versions_after`]
+#' @param max_version as in [`epix_truncate_versions_after`]
+#' @export
+truncate_versions_after.grouped_epi_archive2 <- function(grouped_epi_archive, max_version) {
+  # The grouping is irrelevant for this method, so delegate to the ungrouped
+  # method. Store its result back: if the ungrouped archive is a copy-on-modify
+  # S3 object, discarding the return value would leave it untruncated.
+  grouped_epi_archive$private$ungrouped <- truncate_versions_after(grouped_epi_archive$private$ungrouped, max_version)
+  return(invisible(grouped_epi_archive))
+}
+
+#' Truncate versions after a given version, grouped
+#' @export
+epix_truncate_versions_after.grouped_epi_archive2 <- function(grouped_epi_archive, max_version) {
+  cloned_group_epi_archive <- clone(grouped_epi_archive, deep = TRUE)
+  return((truncate_versions_after(cloned_group_epi_archive, max_version)))
+  # ^ second set of parens drops invisibility
+}
+
+
+#' Slide over grouped epi archive
+#' @description Slides a given function over variables in a `grouped_epi_archive`
+#' object. See the documentation for the wrapper function [`epix_slide()`] for
+#' details.
+#' @importFrom data.table key address rbindlist setDF
+#' @importFrom tibble as_tibble new_tibble validate_tibble
+#' @importFrom dplyr group_by groups
+#' @importFrom rlang !! !!! enquo quo_is_missing enquos is_quosure sym syms
+#'  env missing_arg
+#' @export
+slide.grouped_epi_archive2 <- function(grouped_epi_archive, f, ..., before, ref_time_values,
+                                       time_step, new_col_name = "slide_value",
+                                       as_list_col = FALSE, names_sep = "_",
+                                       all_versions = FALSE) {
+  # Perform some deprecated argument checks without using `arg =
+  # deprecated()` in the function signature, because they are from
+  # early development versions and much more likely to be clutter than
+  # informative in the signature.
+  if ("group_by" %in% nse_dots_names(...)) {
+    cli_abort("
+      The `group_by` argument to `slide` has been removed; please use
+      the `group_by` S3 generic function or `$group_by` R6 method
+      before the slide instead. (If you were instead trying to pass a
+      `group_by` argument to `f` or create a column named `group_by`,
+      this check is a false positive, but you will still need to use a
+      different column name here and rename the resulting column after
+      the slide.)
+    ", class = "epiprocess__epix_slide_group_by_parameter_deprecated")
+  }
+  if ("all_rows" %in% nse_dots_names(...)) {
+    cli_abort("
+      The `all_rows` argument has been removed from `epix_slide` (but
+      is still supported in `epi_slide`). Add rows for excluded
+      results with a manual join instead.
+    ", class = "epiprocess__epix_slide_all_rows_parameter_deprecated")
+  }
+
+  if (missing(ref_time_values)) {
+    ref_time_values <- epix_slide_ref_time_values_default(grouped_epi_archive$private$ungrouped)
+  } else {
+    assert_numeric(ref_time_values, min.len = 1L, null.ok = FALSE, any.missing = FALSE)
+    if (any(ref_time_values > grouped_epi_archive$private$ungrouped$versions_end)) {
+      cli_abort("Some `ref_time_values` are greater than the latest version in the archive.")
+    }
+    if (anyDuplicated(ref_time_values) != 0L) {
+      cli_abort("Some `ref_time_values` are duplicated.")
+    }
+    # Sort, for consistency with `epi_slide`, although the current
+    # implementation doesn't take advantage of it.
+    ref_time_values <- sort(ref_time_values)
+  }
+
+  # Validate and pre-process `before`:
+  if (missing(before)) {
+    cli_abort("`before` is required (and must be passed by name);
+                if you did not want to apply a sliding window but rather
+                to map `as_of` and `f` across various `ref_time_values`,
+                pass a large `before` value (e.g., if time steps are days,
+                `before=365000`).")
+  }
+  before <- vctrs::vec_cast(before, integer())
+  assert_int(before, lower = 0L, null.ok = FALSE, na.ok = FALSE)
+
+  # If a custom time step is specified, then redefine units
+
+  if (!missing(time_step)) before <- time_step(before)
+
+  # Symbolize column name
+  new_col <- sym(new_col_name)
+
+  # Validate rest of parameters:
+  assert_logical(as_list_col, len = 1L)
+  assert_logical(all_versions, len = 1L)
+  assert_character(names_sep, len = 1L, null.ok = TRUE)
+
+  # Computation for one group, one time value
+  comp_one_grp <- function(.data_group, .group_key,
+                           f, ...,
+                           ref_time_value,
+                           new_col) {
+    # Carry out the specified computation
+    comp_value <- f(.data_group, .group_key, ref_time_value, ...)
+
+    if (all_versions) {
+      # Extract data from archive so we can do length checks below. When
+      # `all_versions = TRUE`, `.data_group` will always be an ungrouped
+      # archive because of the preceding `as_of` step.
+      .data_group <- .data_group$DT
+    }
+
+    assert(
+      check_atomic(comp_value, any.missing = TRUE),
+      check_data_frame(comp_value),
+      combine = "or",
+      .var.name = vname(comp_value)
+    )
+
+    # Label every result row with the `ref_time_value`
+    res <- list(time_value = ref_time_value)
+
+    # Wrap the computation output in a list and unchop/unnest later if
+    # `as_list_col = FALSE`. This approach means that we will get a
+    # list-class col rather than a data.frame-class col when
+    # `as_list_col = TRUE` and the computations outputs are data
+    # frames.
+    res[[new_col]] <- list(comp_value)
+
+    # Convert the list to a tibble all at once for speed.
+    return(validate_tibble(new_tibble(res)))
+  }
+
+  # If `f` is missing, interpret ... as an expression for tidy evaluation
+  if (missing(f)) {
+    quos <- enquos(...)
+    if (length(quos) == 0) {
+      cli_abort("If `f` is missing then a computation must be specified via `...`.")
+    }
+    if (length(quos) > 1) {
+      cli_abort("If `f` is missing then only a single computation can be specified via `...`.")
+    }
+
+    f <- quos[[1]]
+    new_col <- sym(names(rlang::quos_auto_name(quos)))
+    ... <- missing_arg() # magic value that passes zero args as dots in calls below
+  }
+
+  f <- as_slide_computation(f, ...)
+  x <- lapply(ref_time_values, function(ref_time_value) {
+    # Ungrouped as-of data; `epi_df` if `all_versions` is `FALSE`,
+    # `epi_archive` if `all_versions` is `TRUE`:
+    as_of_raw <- as_of(grouped_epi_archive$private$ungrouped,
+      ref_time_value,
+      min_time_value = ref_time_value - before,
+      all_versions = all_versions
+    )
+
+    # Set:
+    # * `as_of_df`, the data.frame/tibble/epi_df/etc. that we will
+    #   `group_modify` as the `.data` argument. Might or might not
+    #   include version column.
+    # * `group_modify_fn`, the corresponding `.f` argument
+    if (!all_versions) {
+      as_of_df <- as_of_raw
+      group_modify_fn <- comp_one_grp
+    } else {
+      as_of_archive <- as_of_raw
+      # We essentially want to `group_modify` the archive, but
+      # haven't implemented this method yet. Next best would be
+      # `group_modify` on its `$DT`, but that has different
+      # behavior based on whether or not `dtplyr` is loaded.
+      # Instead, go through an ordinary data frame, trying to avoid
+      # copies.
+      if (address(as_of_archive$DT) == address(grouped_epi_archive$private$ungrouped$DT)) {
+        # `as_of` aliased the full `$DT`; copy before mutating:
+        #
+        # Note: this step is probably unneeded; we're fine with
+        # aliasing of the DT or its columns: vanilla operations aren't
+        # going to mutate them in-place if they are aliases, and we're
+        # not performing mutation (unlike the situation with
+        # `fill_through_version` where we do mutate a `DT` and don't
+        # want aliasing).
+        as_of_archive$DT <- copy(as_of_archive$DT)
+      }
+      dt_key <- data.table::key(as_of_archive$DT)
+      as_of_df <- as_of_archive$DT
+      data.table::setDF(as_of_df)
+
+      # Convert each subgroup chunk to an archive before running the calculation.
+      group_modify_fn <- function(.data_group, .group_key,
+                                  f, ...,
+                                  ref_time_value,
+                                  new_col) {
+        # .data_group is coming from as_of_df as a tibble, but we
+        # want to feed `comp_one_grp` an `epi_archive` backed by a
+        # DT; convert and wrap:
+        data.table::setattr(.data_group, "sorted", dt_key)
+        data.table::setDT(.data_group, key = dt_key)
+        .data_group_archive <- clone(as_of_archive)
+        .data_group_archive$DT <- .data_group
+        comp_one_grp(.data_group_archive, .group_key,
+          f = f, ...,
+          ref_time_value = ref_time_value,
+          new_col = new_col
+        )
+      }
+    }
+
+    return(
+      dplyr::group_modify(
+        dplyr::group_by(as_of_df, !!!syms(grouped_epi_archive$private$vars), .drop = grouped_epi_archive$private$drop),
+        group_modify_fn,
+        f = f, ...,
+        ref_time_value = ref_time_value,
+        new_col = new_col,
+        .keep = TRUE
+      )
+    )
+  })
+  # Combine output into a single tibble
+  x <- as_tibble(setDF(rbindlist(x)))
+  # Reconstruct groups
+  x <- group_by(x, !!!syms(grouped_epi_archive$private$vars), .drop = grouped_epi_archive$private$drop)
+
+  # Unchop/unnest if we need to
+  if (!as_list_col) {
+    x <- tidyr::unnest(x, !!new_col, names_sep = names_sep)
+  }
+
+  # if (is_epi_df(x)) {
+  #   # The analogue of `epi_df`'s `as_of` metadata for an archive is
+  #   # `$versions_end`, at least in the current absence of
+  #   # separate fields/columns denoting the "archive version" with a
+  #   # different resolution, or from the perspective of a different
+  #   # stage of a data pipeline. The `as_of` that is automatically
+  #   # derived won't always match; override:
+  #   attr(x, "metadata")[["as_of"]] <- private$ungrouped$versions_end
+  # }
+
+  # XXX We need to work out when we want to return an `epi_df` and how
+  # to get appropriate keys (see #290, #223, #163). We'll probably
+  # need the commented-out code above if we ever output an `epi_df`.
+  # However, as a stopgap measure to have some more consistency across
+  # different ways of calling `epix_slide`, and to prevent `epi_df`
+  # output with invalid metadata, always output a (grouped or
+  # ungrouped) tibble.
+  x <- decay_epi_df(x)
+
+  return(x)
+}
+
+
+# At time of writing, roxygen parses content in collation order, impacting the
+# presentation of .Rd files that document multiple functions (see
+# https://github.com/r-lib/roxygen2/pull/324). Use @include tags (determining
+# `Collate:`) and ordering of functions within each file in order to get the
+# desired ordering.
+
+
+
+#' @include methods-epi_archive.R
+#' @rdname group_by.epi_archive
+#'
+#' @export
+is_grouped_epi_archive2 <- function(x) {
+  inherits(x, "grouped_epi_archive2")
+}
+
+
+#' @export
+clone.grouped_epi_archive2 <- function(x, deep = FALSE) {
+  # TODO: Finish.
+  if (deep) {
+    ungrouped <- clone(x$private$ungrouped, deep = TRUE)
+  } else {
+    ungrouped <- x$private$ungrouped
+  }
+  new_grouped_epi_archive(ungrouped, x$private$vars, x$private$drop)
+}
diff --git a/R/methods-epi_archive_new.R b/R/methods-epi_archive_new.R
new file mode 100644
index 00000000..3af3056f
--- /dev/null
+++ b/R/methods-epi_archive_new.R
@@ -0,0 +1,830 @@
+#' Generate a snapshot from an `epi_archive` object
+#'
+#' Generates a snapshot in `epi_df` format from an `epi_archive` object, as of a
+#' given version.
See the [archive
+#' vignette](https://cmu-delphi.github.io/epiprocess/articles/archive.html) for
+#' examples.
+#'
+#' @param epi_archive An `epi_archive` object
+#' @param max_version Time value specifying the max version to permit in the
+#'   snapshot. That is, the snapshot will comprise the unique rows of the
+#'   current archive data that represent the most up-to-date signal values, as
+#'   of the specified `max_version` (and whose time values are at least
+#'   `min_time_value`).
+#' @param min_time_value Time value specifying the min time value to permit in
+#'   the snapshot. Default is `-Inf`, which effectively means that there is no
+#'   minimum considered.
+#' @param all_versions If `all_versions = TRUE`, then the output will be in
+#'   `epi_archive` format, and contain rows in the specified `time_value` range
+#'   having `version <= max_version`. The resulting object will cover a
+#'   potentially narrower `version` and `time_value` range than `epi_archive`,
+#'   depending on user-provided arguments. Otherwise, there will be one row in
+#'   the output for the `max_version` of each `time_value`. Default is `FALSE`.
+#' @return An `epi_df` object, or an `epi_archive` if `all_versions = TRUE`.
+#'
+#' @details This is simply a wrapper around the `as_of()` method of the
+#'   `epi_archive` class, so if `epi_archive` is an `epi_archive` object, then:
+#' ```
+#' epix_as_of2(epi_archive, max_version = v)
+#' ```
+#' is equivalent to:
+#' ```
+#' as_of(epi_archive, max_version = v)
+#' ```
+#'
+#' Mutation and aliasing: `epix_as_of` and `$as_of` will not mutate the input
+#' archives, but may in some edge cases alias parts of the inputs, so copy the
+#' outputs if needed before using mutating operations like `data.table`'s `:=`
+#' operator. Currently, the only situation where there is potentially aliasing
+#' is of the `DT` in edge cases with `all_versions = TRUE`, but this may change
+#' in the future.
+#'
+#' @examples
+#' # warning message of data latency shown
+#' epix_as_of2(
+#'   epi_archive = archive_cases_dv_subset,
+#'   max_version = max(archive_cases_dv_subset$DT$version)
+#' )
+#'
+#' @examples
+#'
+#' range(archive_cases_dv_subset$DT$version) # 2020-06-02 -- 2021-12-01
+#'
+#' epix_as_of2(
+#'   epi_archive = archive_cases_dv_subset,
+#'   max_version = as.Date("2020-06-12")
+#' )
+#'
+#' # When fetching a snapshot as of the latest version with update data in the
+#' # archive, a warning is issued by default, as this update data might not yet
+#' # be finalized (for example, if data versions are labeled with dates, these
+#' # versions might be overwritten throughout the corresponding days with
+#' # additional data or "hotfixes" of erroneous data; when we build an archive
+#' # based on database queries, the latest available update might still be
+#' # subject to change, but previous versions should be finalized). We can
+#' # muffle such warnings with the following pattern:
+#' withCallingHandlers(
+#'   {
+#'     epix_as_of2(
+#'       epi_archive = archive_cases_dv_subset,
+#'       max_version = max(archive_cases_dv_subset$DT$version)
+#'     )
+#'   },
+#'   epiprocess__snapshot_as_of_clobberable_version = function(wrn) invokeRestart("muffleWarning")
+#' )
+#' # Since R 4.0, there is a `globalCallingHandlers` function that can be used
+#' # to globally toggle these warnings.
+#'
+#' @export
+epix_as_of2 <- function(epi_archive, max_version, min_time_value = -Inf, all_versions = FALSE) {
+  # Validate the class up front, then delegate to the `as_of` generic.
+  assert_class(epi_archive, "epi_archive2")
+  return(as_of(epi_archive, max_version, min_time_value, all_versions = all_versions))
+}
+
+#' `epi_archive` with unobserved history filled in (won't mutate, might alias)
+#'
+#' Sometimes, due to upstream data pipeline issues, we have to work with a
+#' version history that isn't completely up to date, but with functions that
+#' expect archives that are completely up to date, or equally as up-to-date as
+#' another archive. This function provides one way to approach such mismatches:
+#' pretend that we've "observed" additional versions, filling in these versions
+#' with NAs or extrapolated values.
+#'
+#' `epix_fill_through_version` will not mutate its `epi_archive` argument, but
+#' its result might alias fields of it (e.g., mutating the result's `DT` might
+#' mutate `epi_archive$DT`). The R6 method variant, `x$fill_through_version`, will mutate `x` to
+#' give the result, but might reseat its fields (e.g., references to the old
+#' `x$DT` might not be updated by this function or subsequent operations on
+#' `x`), and returns the updated `x` [invisibly][base::invisible].
+#'
+#' @param epi_archive An `epi_archive`
+#' @param fill_versions_end Length-1, same class&type as `epi_archive$DT$version`: the
+#'   version through which to fill in missing version history; this will be the
+#'   result's `$versions_end` unless it already had a later
+#'   `$versions_end`.
+#' @param how Optional; `"na"` or `"locf"`: `"na"` will fill in any missing
+#'   required version history with `NA`s, by inserting (if necessary) an update
+#'   immediately after the current `$versions_end` that revises all
+#'   existing measurements to be `NA` (this is only supported for `version`
+#'   classes with a `next_after` implementation); `"locf"` will fill in missing
+#'   version history with the last version of each observation carried forward
+#'   (LOCF), by leaving the update `$DT` alone (other `epi_archive` methods are
+#'   based on LOCF). Default is `"na"`.
+#' @return An `epi_archive`
+epix_fill_through_version2 <- function(epi_archive, fill_versions_end,
+                                       how = c("na", "locf")) {
+  assert_class(epi_archive, "epi_archive2")
+  cloned_epi_archive <- clone(epi_archive)
+  # Enclosing parentheses drop the invisibility flag. See description above of
+  # potential mutation and aliasing behavior.
+  (fill_through_version(cloned_epi_archive, fill_versions_end, how = how))
+}
+
+#' Merge two `epi_archive` objects
+#'
+#' Merges two `epi_archive`s that share a common `geo_value`, `time_value`, and
+#' set of key columns. When they also share a common `versions_end`,
+#' using `$as_of` on the result should be the same as using `$as_of` on `x` and
+#' `y` individually, then performing a full join of the `DT`s on the non-version
+#' key columns (potentially consolidating multiple warnings about clobberable
+#' versions). If the `versions_end` values differ, the
+#' `sync` parameter controls what is done.
+#'
+#' This function, [`epix_merge`], does not mutate its inputs and will not alias
+#' either archive's `DT`, but may alias other fields; `x$merge` will overwrite
+#' `x` with the result of the merge, reseating its `DT` and several other fields
+#' (making them point to different objects), but avoiding mutation of the
+#' contents of the old `DT` (only relevant if you have another reference to the
+#' old `DT` in another object).
+#'
+#' @param x,y Two `epi_archive` objects to join together.
+#' @param sync Optional; `"forbid"`, `"na"`, `"locf"`, or `"truncate"`; in the
+#'   case that `x$versions_end` doesn't match `y$versions_end`, what do we do?:
+#'   `"forbid"`: emit an error; `"na"`: use `max(x$versions_end, y$versions_end)`
+#'   as the result's `versions_end`, but ensure that, if we request a snapshot
+#'   as of a version after `min(x$versions_end, y$versions_end)`, the
+#'   observation columns from the less up-to-date archive will be all NAs (i.e.,
+#'   imagine there was an update immediately after its `versions_end` which
+#'   revised all observations to be `NA`); `"locf"`: use `max(x$versions_end,
+#'   y$versions_end)` as the result's `versions_end`, allowing the last version
+#'   of each observation to be carried forward to extrapolate unavailable
+#'   versions for the less up-to-date input archive (i.e., imagining that the
+#'   less up-to-date archive's data set remained unchanged between its actual
+#'   `versions_end` and the other archive's `versions_end`); or `"truncate"`:
+#'   use `min(x$versions_end, y$versions_end)` as the result's `versions_end`,
+#'   and discard any update rows for later versions.
+#' @param compactify Optional; `TRUE`, `FALSE`, or `NULL`; should the result be
+#'   compactified? See [`as_epi_archive`] for an explanation of what this means.
+#'   Default here is `TRUE`.
+#' @return the resulting `epi_archive`
+#'
+#' @details In all cases, `additional_metadata` will be an empty list, and
+#'   `clobberable_versions_start` will be set to the earliest version that could
+#'   be clobbered in either input archive.
+#'
+#' @examples
+#' # create two example epi_archive datasets
+#' x <- archive_cases_dv_subset$DT %>%
+#'   dplyr::select(geo_value, time_value, version, case_rate_7d_av) %>%
+#'   as_epi_archive(compactify = TRUE)
+#' y <- archive_cases_dv_subset$DT %>%
+#'   dplyr::select(geo_value, time_value, version, percent_cli) %>%
+#'   as_epi_archive(compactify = TRUE)
+#' # merge results stored in a third object:
+#' xy <- epix_merge(x, y)
+#' # vs. mutating x to hold the merge result:
+#' x$merge(y)
+#'
+#' @importFrom data.table key set setkeyv
+#' @export
+epix_merge2 <- function(x, y,
+                        sync = c("forbid", "na", "locf", "truncate"),
+                        compactify = TRUE) {
+  assert_class(x, "epi_archive2")
+  assert_class(y, "epi_archive2")
+  sync <- rlang::arg_match(sync)
+
+  if (!identical(x$geo_type, y$geo_type)) {
+    cli_abort("`x` and `y` must have the same `$geo_type`")
+  }
+
+  if (!identical(x$time_type, y$time_type)) {
+    cli_abort("`x` and `y` must have the same `$time_type`")
+  }
+
+  # Input metadata is never carried over; warn so the loss is not silent.
+  if (length(x$additional_metadata) != 0L) {
+    cli_warn("x$additional_metadata won't appear in merge result",
+      class = "epiprocess__epix_merge_ignores_additional_metadata"
+    )
+  }
+  if (length(y$additional_metadata) != 0L) {
+    cli_warn("y$additional_metadata won't appear in merge result",
+      class = "epiprocess__epix_merge_ignores_additional_metadata"
+    )
+  }
+  result_additional_metadata <- list()
+
+  # NA when neither input has a clobberable start; otherwise combine the two
+  # via `Min` (a helper defined elsewhere; presumably an NA-tolerant minimum
+  # -- TODO confirm).
+  result_clobberable_versions_start <-
+    if (all(is.na(c(x$clobberable_versions_start, y$clobberable_versions_start)))) {
+      NA # (any type of NA is fine here)
+    } else {
+      Min(c(x$clobberable_versions_start, y$clobberable_versions_start))
+    }
+
+  # The actual merge below may not succeed 100% of the time, so do this
+  # preprocessing using non-mutating (but potentially aliasing) functions. This
+  # approach potentially uses more memory, but won't leave behind a
+  # partially-mutated `x` on failure.
+  if (sync == "forbid") {
+    if (!identical(x$versions_end, y$versions_end)) {
+      cli_abort(paste(
+        "`x` and `y` were not equally up to date version-wise:",
+        "`x$versions_end` was not identical to `y$versions_end`;",
+        "either ensure that `x` and `y` are equally up to date before merging,",
+        "or specify how to deal with this using `sync`"
+      ), class = "epiprocess__epix_merge_unresolved_sync")
+    } else {
+      new_versions_end <- x$versions_end
+      x_DT <- x$DT
+      y_DT <- y$DT
+    }
+  } else if (sync %in% c("na", "locf")) {
+    new_versions_end <- max(x$versions_end, y$versions_end)
+    x_DT <- epix_fill_through_version2(x, new_versions_end, sync)$DT
+    y_DT <- epix_fill_through_version2(y, new_versions_end, sync)$DT
+  } else if (sync == "truncate") {
+    new_versions_end <- min(x$versions_end, y$versions_end)
+    x_DT <- x$DT[x[["DT"]][["version"]] <= new_versions_end, names(x$DT), with = FALSE]
+    y_DT <- y$DT[y[["DT"]][["version"]] <= new_versions_end, names(y$DT), with = FALSE]
+  } else {
+    cli_abort("unimplemented")
+  }
+
+  # key(x_DT) should be the same as key(x$DT) and key(y_DT) should be the same
+  # as key(y$DT). Below, we only use {x,y}_DT in the code (making it easier to
+  # split the code into separate functions if we wish), but still refer to
+  # {x,y}$DT in the error messages (further relying on this assumption).
+  #
+  # Check & ensure that the above assumption holds; if it didn't already hold,
+  # we likely have a bug in the preprocessing, a weird/invalid archive as input,
+  # and/or a data.table version with different semantics (which may break other
+  # parts of our code).
+  x_DT_key_as_expected <- identical(key(x$DT), key(x_DT))
+  y_DT_key_as_expected <- identical(key(y$DT), key(y_DT))
+  if (!x_DT_key_as_expected || !y_DT_key_as_expected) {
+    cli_warn("
+      `epiprocess` internal warning (please report): pre-processing for
+      epix_merge unexpectedly resulted in an intermediate data table (or
+      tables) with a different key than the corresponding input archive.
+      Manually setting intermediate data table keys to the expected values.
+    ", internal = TRUE)
+    setkeyv(x_DT, key(x$DT))
+    setkeyv(y_DT, key(y$DT))
+  }
+  # Without some sort of annotations of what various columns represent, we can't
+  # do something that makes sense when merging archives with mismatched keys.
+  # E.g., even if we assume extra keys represent demographic breakdowns, a
+  # sensible default treatment of count-type and rate-type value columns would
+  # differ.
+  if (!identical(sort(key(x_DT)), sort(key(y_DT)))) {
+    cli_abort("
+      The archives must have the same set of key column names; if the
+      key columns represent the same things, just with different
+      names, please retry after manually renaming to match; if they
+      represent different things (e.g., x has an age breakdown
+      but y does not), please retry after processing them to share
+      the same key (e.g., by summarizing x to remove the age breakdown,
+      or by applying a static age breakdown to y).
+    ", class = "epiprocess__epix_merge_x_y_must_have_same_key_set")
+  }
+  # `by` cols = result (and each input's) `key` cols, and determine
+  # the row set, determined using a full join via `merge`
+  #
+  # non-`by` cols = "value"-ish cols, and are looked up with last
+  # version carried forward via rolling joins
+  by <- key(x_DT) # = some perm of key(y_DT)
+  if (!all(c("geo_value", "time_value", "version") %in% key(x_DT))) {
+    cli_abort('Invalid `by`; `by` is currently set to the common `key` of
+                the two archives, and is expected to contain
+                "geo_value", "time_value", and "version".',
+      class = "epiprocess__epi_archive_must_have_required_key_cols"
+    )
+  }
+  if (length(by) < 1L || utils::tail(by, 1L) != "version") {
+    cli_abort('Invalid `by`; `by` is currently set to the common `key` of
+                the two archives, and is expected to have a "version" as
+                the last key col.',
+      class = "epiprocess__epi_archive_must_have_version_at_end_of_key"
+    )
+  }
+  x_nonby_colnames <- setdiff(names(x_DT), by)
+  y_nonby_colnames <- setdiff(names(y_DT), by)
+  if (length(intersect(x_nonby_colnames, y_nonby_colnames)) != 0L) {
+    cli_abort("
+      `x` and `y` DTs have overlapping non-by column names;
+      this is currently not supported; please manually fix up first:
+      any overlapping columns that can are key-like should be
+      incorporated into the key, and other columns should be renamed.
+    ", class = "epiprocess__epix_merge_x_y_must_not_have_overlapping_nonby_colnames")
+  }
+  x_by_vals <- x_DT[, by, with = FALSE]
+  if (anyDuplicated(x_by_vals) != 0L) {
+    cli_abort("
+      The `by` columns must uniquely determine rows of `x$DT`;
+      the `by` is currently set to the common `key` of the two
+      archives, so this can be resolved by adding key-like columns
+      to `x`'s key (to get a unique key).
+    ", class = "epiprocess__epix_merge_by_cols_must_act_as_unique_key")
+  }
+  y_by_vals <- y_DT[, by, with = FALSE]
+  if (anyDuplicated(y_by_vals) != 0L) {
+    cli_abort("
+      The `by` columns must uniquely determine rows of `y$DT`;
+      the `by` is currently set to the common `key` of the two
+      archives, so this can be resolved by adding key-like columns
+      to `y`'s key (to get a unique key).
+    ", class = "epiprocess__epix_merge_by_cols_must_act_as_unique_key")
+  }
+  # Full outer join of the two key sets: one result row per key combination
+  # that appears in either archive. Value columns are attached afterwards.
+  result_DT <- merge(x_by_vals, y_by_vals,
+    by = by,
+    # We must have `all=TRUE` or we may skip updates
+    # from x and/or y and corrupt the history
+    all = TRUE,
+    # We don't want Cartesian products, but the
+    # by-is-unique-key check above already ensures
+    # this. (Note that `allow.cartesian=FALSE` doesn't
+    # actually catch all Cartesian products anyway.)
+    # Disable superfluous check:
+    allow.cartesian = TRUE
+  )
+  set(
+    result_DT, , x_nonby_colnames,
+    x_DT[result_DT[, by, with = FALSE], x_nonby_colnames,
+      with = FALSE,
+      # It's good practice to specify `on`, and we must
+      # explicitly specify `on` if there's a potential key vs.
+      # by order mismatch (not possible currently for x
+      # with by = key(x$DT), but possible for y):
+      on = by,
+      # last version carried forward:
+      roll = TRUE,
+      # requesting non-version key that doesn't exist in the other archive,
+      # or before its first version, should result in NA
+      nomatch = NA,
+      # see note on `allow.cartesian` above; currently have a
+      # similar story here.
+      allow.cartesian = TRUE
+    ]
+  )
+  # Same rolling lookup for `y`'s value columns.
+  set(
+    result_DT, , y_nonby_colnames,
+    y_DT[result_DT[, by, with = FALSE], y_nonby_colnames,
+      with = FALSE,
+      on = by,
+      roll = TRUE,
+      nomatch = NA,
+      allow.cartesian = TRUE
+    ]
+  )
+  # The key could be unset in case of a key vs. by order mismatch as
+  # noted above. Ensure that we keep it:
+  setkeyv(result_DT, by)
+
+  return(as_epi_archive2(
+    result_DT[], # clear data.table internal invisibility flag if set
+    geo_type = x$geo_type,
+    time_type = x$time_type,
+    other_keys = setdiff(key(result_DT), c("geo_value", "time_value", "version")),
+    additional_metadata = result_additional_metadata,
+    # It'd probably be better to pre-compactify before the merge, and might be
+    # guaranteed not to be necessary to compactify the merge result if the
+    # inputs are already compactified, but at time of writing we don't have
+    # compactify in its own method or field, and it seems like it should be
+    # pretty fast anyway.
+    compactify = compactify,
+    clobberable_versions_start = result_clobberable_versions_start,
+    versions_end = new_versions_end
+  ))
+}
+
+# Helpers for `group_by`:
+
+#' Make non-testing mock to get [`dplyr::dplyr_col_modify`] input
+#'
+#' A workaround for `dplyr:::mutate_cols` not being exported and directly
+#' applying test mock libraries likely being impossible (due to mocking another
+#' package's S3 generic or method).
+#'
+#' Use solely with a single call to the [`dplyr::mutate`] function and then
+#' `destructure_col_modify_recorder_df`; other applicable operations from
+#' [dplyr::dplyr_extending] have not been implemented.
+#'
+#' @param parent_df the "parent class" data frame to wrap
+#' @return a `col_modify_recorder_df`
+#'
+#' @noRd
+new_col_modify_recorder_df <- function(parent_df) {
+  assert_class(parent_df, "data.frame")
+  # Prepend the recorder class so that `dplyr_col_modify` dispatches to the
+  # recording method below.
+  class(parent_df) <- c("col_modify_recorder_df", class(parent_df))
+  parent_df
+}
+
+#' Extract unchanged parent-class data frame from a `new_col_modify_recorder_df`
+#'
+#' @param col_modify_recorder_df an instance of a `col_modify_recorder_df`
+#' @return named list with elements `unchanged_parent_df`, `cols`; `cols` is the
+#'   input to [`dplyr::dplyr_col_modify`] that this class was designed to record
+#'
+#' @noRd
+destructure_col_modify_recorder_df <- function(col_modify_recorder_df) {
+  assert_class(col_modify_recorder_df, "col_modify_recorder_df")
+  # Read the recorded `cols` first, then strip the recording attribute and the
+  # recorder class from a copy to recover the parent-class data frame.
+  recorded_cols <- attr(col_modify_recorder_df,
+    "epiprocess::col_modify_recorder_df::cols",
+    exact = TRUE
+  )
+  parent_df <- col_modify_recorder_df
+  attr(parent_df, "epiprocess::col_modify_recorder_df::cols") <- NULL
+  class(parent_df) <- setdiff(class(parent_df), "col_modify_recorder_df")
+  list(
+    unchanged_parent_df = parent_df,
+    cols = recorded_cols
+  )
+}
+
+#' `dplyr_col_modify` method that simply records the `cols` argument
+#'
+#' Must export S3 methods in R >= 4.0, even if they're only designed to be
+#' package internals, and must import any corresponding upstream S3 generic
+#' functions:
+#' @importFrom dplyr dplyr_col_modify
+#' @export
+#' @noRd
+dplyr_col_modify.col_modify_recorder_df <- function(data, cols) {
+  # Recording is one-shot: refuse to overwrite a previously recorded `cols`.
+  previously_recorded <- attr(data, "epiprocess::col_modify_recorder_df::cols", exact = TRUE)
+  if (!is.null(previously_recorded)) {
+    cli_abort("`col_modify_recorder_df` can only record `cols` once",
+      internal = TRUE
+    )
+  }
+  attr(data, "epiprocess::col_modify_recorder_df::cols") <- cols
+  data
+}
+
+#' A more detailed but restricted `mutate` for use in `group_by.epi_archive`
+#'
+#' More detailed: provides the names of the "requested" columns in addition to
+#' the output expected from a regular `mutate` method.
+#'
+#' Restricted: doesn't allow replacing or removing key cols, where a sort is
+#' potentially required at best and what the output key should be is unclear at
+#' worst. (The originally expected restriction was that the `mutate` parameters
+#' not present in `group_by` would not be recognized, but the current
+#' implementation just lets `mutate` handle these anyway, even if they're
+#' not part of the regular `group_by` parameters; these arguments would have to
+#' be passed by names with dot prefixes, so just hope that the user means to use
+#' them here if provided.)
+#'
+#' This can introduce column-level aliasing in `data.table`s, which isn't really
+#' intended in the `data.table` user model but we can make it part of our user
+#' model (see
+#' https://stackoverflow.com/questions/45925482/make-a-shallow-copy-in-data-table
+#' and links).
+#'
+#' Don't export this without cleaning up language of "mutate" as in side effects
+#' vs. "mutate" as in `dplyr::mutate`.
+#'
+#' @param .data object with a `$DT` `data.table` field (an archive); the result
+#'   is built on a `clone()` of it
+#' @param ... forwarded to `mutate`
+#' @return named list with elements `archive` (the clone of `.data` with its
+#'   `DT` replaced by the mutated table) and `request_names` (names of the
+#'   columns that `mutate` asked `dplyr_col_modify` to set)
+#' @noRd
+epix_detailed_restricted_mutate2 <- function(.data, ...) {
+  # We don't want to directly use `dplyr::mutate` on the `$DT`, as:
+  # - `mutate` behavior, including the output class, changes depending on
+  #   whether `dtplyr` < 1.3.0 is loaded and would require post-processing
+  # - behavior with `dtplyr` isn't fully compatible
+  # - it doesn't give the desired details, and `rlang::exprs_auto_name` does not
+  #   appropriately handle the `= NULL` and `= ` tidyeval cases
+  # Instead:
+  # - Use `as.list` to get a shallow copy (undocumented, but apparently
+  #   intended, behavior), then `as_tibble` (also shallow, given a list) to get
+  #   back to something that will use `dplyr`'s included `mutate` method(s),
+  #   then convert this using shallow operations into a `data.table`.
+  # - Use `col_modify_recorder_df` to get the desired details.
+  in_tbl <- tibble::as_tibble(as.list(.data$DT), .name_repair = "minimal")
+  # Run `mutate` on the recorder wrapper purely to capture the `cols` it would
+  # hand to `dplyr_col_modify`; the mutated result itself is discarded here.
+  col_modify_cols <-
+    destructure_col_modify_recorder_df(
+      mutate(new_col_modify_recorder_df(in_tbl), ...)
+    )[["cols"]]
+  # A key column counts as invalidated when it was requested and the requested
+  # value is not the very same object as the existing column.
+  invalidated_key_col_is <-
+    which(purrr::map_lgl(key(.data$DT), function(key_colname) {
+      key_colname %in% names(col_modify_cols) &&
+        !rlang::is_reference(in_tbl[[key_colname]], col_modify_cols[[key_colname]])
+    }))
+  if (length(invalidated_key_col_is) != 0L) {
+    rlang::abort(paste_lines(c(
+      "Key columns must not be replaced or removed.",
+      wrap_varnames(key(.data$DT)[invalidated_key_col_is],
+        initial = "Flagged key cols: "
+      )
+    )))
+  } else {
+    # Have `dplyr` do the `dplyr_col_modify`, keeping the column-level-aliasing
+    # and must-copy-on-write-if-refcount-more-than-1 model, obtaining a tibble,
+    # then convert it into a `data.table`. The key should still be valid
+    # (assuming that the user did not explicitly alter `key(.data$DT)` or the
+    # columns by reference somehow within `...` tidyeval-style computations, or
+    # trigger refcount-1 alterations due to still having >1 refcounts on the
+    # columns), set the "sorted" attribute accordingly to prevent attempted
+    # sorting (including potential extra copies) or sortedness checking, then
+    # `setDT` (rather than `as.data.table`, in order to prevent column copying
+    # to establish ownership according to `data.table`'s memory model).
+    out_DT <- dplyr::dplyr_col_modify(in_tbl, col_modify_cols) %>%
+      data.table::setattr("sorted", data.table::key(.data$DT)) %>%
+      data.table::setDT(key = key(.data$DT))
+    out_archive <- clone(.data)
+    out_archive$DT <- out_DT
+    request_names <- names(col_modify_cols)
+    return(list(
+      archive = out_archive,
+      request_names = request_names
+    ))
+    # (We might also consider special-casing when `mutate` hands back something
+    # equivalent (in some sense) to the input (probably only encountered when
+    # we're dealing with `group_by`), and using just `$DT`, not a shallow copy,
+    # in the result, primarily in order to hedge against `as.list` or `setDT`
+    # changing their behavior and generating deep copies somehow. This could
+    # also prevent storage, and perhaps also generation, of shallow copies, but
+    # this seems unlikely to be a major gain unless it helps enable some
+    # in-place modifications of refcount-1 columns (although detecting this case
+    # seems to be common across `group_by` implementations; maybe there is
+    # something there).)
+  }
+}
+
+
+#' Slide a function over variables in an `epi_archive` or `grouped_epi_archive`
+#'
+#' Slides a given function over variables in an `epi_archive` object. This
+#' behaves similarly to `epi_slide()`, with the key exception that it is
+#' version-aware: the sliding computation at any given reference time t is
+#' performed on **data that would have been available as of t**. See the
+#' [archive
+#' vignette](https://cmu-delphi.github.io/epiprocess/articles/archive.html) for
+#' examples.
+#'
+#' @param x An [`epi_archive`] or [`grouped_epi_archive`] object. If ungrouped,
+#'   all data in `x` will be treated as part of a single data group.
+#' @param f Function, formula, or missing; together with `...` specifies the
+#'   computation to slide. To "slide" means to apply a computation over a
+#'   sliding (a.k.a. "rolling") time window for each data group. The window is
+#'   determined by the `before` parameter described below.
One time step is +#' typically one day or one week; see [`epi_slide`] details for more +#' explanation. If a function, `f` must take an `epi_df` with the same +#' column names as the archive's `DT`, minus the `version` column; followed +#' by a one-row tibble containing the values of the grouping variables for +#' the associated group; followed by a reference time value, usually as a +#' `Date` object; followed by any number of named arguments. If a formula, +#' `f` can operate directly on columns accessed via `.x$var` or `.$var`, as +#' in `~ mean (.x$var)` to compute a mean of a column `var` for each +#' group-`ref_time_value` combination. The group key can be accessed via +#' `.y` or `.group_key`, and the reference time value can be accessed via +#' `.z` or `.ref_time_value`. If `f` is missing, then `...` will specify the +#' computation. +#' @param ... Additional arguments to pass to the function or formula specified +#' via `f`. Alternatively, if `f` is missing, then `...` is interpreted as an +#' expression for tidy evaluation; in addition to referring to columns +#' directly by name, the expression has access to `.data` and `.env` pronouns +#' as in `dplyr` verbs, and can also refer to the `.group_key` and +#' `.ref_time_value`. See details of [`epi_slide`]. +#' @param before How far `before` each `ref_time_value` should the sliding +#' window extend? If provided, should be a single, non-NA, +#' [integer-compatible][vctrs::vec_cast] number of time steps. This window +#' endpoint is inclusive. For example, if `before = 7`, and one time step is +#' one day, then to produce a value for a `ref_time_value` of January 8, we +#' apply the given function or formula to data (for each group present) with +#' `time_value`s from January 1 onward, as they were reported on January 8. 
+#' For typical disease surveillance sources, this will not include any data +#' with a `time_value` of January 8, and, depending on the amount of reporting +#' latency, may not include January 7 or even earlier `time_value`s. (If +#' instead the archive were to hold nowcasts instead of regular surveillance +#' data, then we would indeed expect data for `time_value` January 8. If it +#' were to hold forecasts, then we would expect data for `time_value`s after +#' January 8, and the sliding window would extend as far after each +#' `ref_time_value` as needed to include all such `time_value`s.) +#' @param ref_time_values Reference time values / versions for sliding +#' computations; each element of this vector serves both as the anchor point +#' for the `time_value` window for the computation and the `max_version` +#' `as_of` which we fetch data in this window. If missing, then this will be set +#' to a regularly-spaced sequence of values set to cover the range of +#' `version`s in the `DT` plus the `versions_end`; the spacing of values will +#' be guessed (using the GCD of the skips between values). +#' @param time_step Optional function used to define the meaning of one time +#' step, which if specified, overrides the default choice based on the +#' `time_value` column. This function must take a positive integer and return +#' an object of class `lubridate::period`. For example, we can use `time_step +#' = lubridate::hours` in order to set the time step to be one hour (this +#' would only be meaningful if `time_value` is of class `POSIXct`). +#' @param new_col_name String indicating the name of the new column that will +#' contain the derivative values. Default is "slide_value"; note that setting +#' `new_col_name` equal to an existing column name will overwrite this column. +#' @param as_list_col Should the slide results be held in a list column, or be +#' [unchopped][tidyr::unchop]/[unnested][tidyr::unnest]? 
Default is `FALSE`, +#' in which case a list object returned by `f` would be unnested (using +#' [`tidyr::unnest()`]), and, if the slide computations output data frames, +#' the names of the resulting columns are given by prepending `new_col_name` +#' to the names of the list elements. +#' @param names_sep String specifying the separator to use in `tidyr::unnest()` +#' when `as_list_col = FALSE`. Default is "_". Using `NULL` drops the prefix +#' from `new_col_name` entirely. +#' @param all_versions (Not the same as `all_rows` parameter of `epi_slide`.) If +#' `all_versions = TRUE`, then `f` will be passed the version history (all +#' `version <= ref_time_value`) for rows having `time_value` between +#' `ref_time_value - before` and `ref_time_value`. Otherwise, `f` will be +#' passed only the most recent `version` for every unique `time_value`. +#' Default is `FALSE`. +#' @return A tibble whose columns are: the grouping variables, `time_value`, +#' containing the reference time values for the slide computation, and a +#' column named according to the `new_col_name` argument, containing the slide +#' values. +#' +#' @details A few key distinctions between the current function and `epi_slide()`: +#' 1. In `f` functions for `epix_slide`, one should not assume that the input +#' data to contain any rows with `time_value` matching the computation's +#' `ref_time_value` (accessible via `attributes()$metadata$as_of`); for +#' typical epidemiological surveillance data, observations pertaining to a +#' particular time period (`time_value`) are first reported `as_of` some +#' instant after that time period has ended. +#' 2. 
`epix_slide()` doesn't accept an `after` argument; its windows extend +#' from `before` time steps before a given `ref_time_value` through the last +#' `time_value` available as of version `ref_time_value` (typically, this +#' won't include `ref_time_value` itself, as observations about a particular +#' time interval (e.g., day) are only published after that time interval +#' ends); `epi_slide` windows extend from `before` time steps before a +#' `ref_time_value` through `after` time steps after `ref_time_value`. +#' 3. The input class and columns are similar but different: `epix_slide` +#' (with the default `all_versions=FALSE`) keeps all columns and the +#' `epi_df`-ness of the first argument to each computation; `epi_slide` only +#' provides the grouping variables in the second input, and will convert the +#' first input into a regular tibble if the grouping variables include the +#' essential `geo_value` column. (With `all_versions=TRUE`, `epix_slide` will +#' provide an `epi_archive` rather than an `epi_df` to each +#' computation.) +#' 4. The output class and columns are similar but different: `epix_slide()` +#' returns a tibble containing only the grouping variables, `time_value`, and +#' the new column(s) from the slide computations, whereas `epi_slide()` +#' returns an `epi_df` with all original variables plus the new columns from +#' the slide computations. (Both will mirror the grouping or ungroupedness of +#' their input, with one exception: `epi_archive`s can have trivial +#' (zero-variable) groupings, but these will be dropped in `epix_slide` +#' results as they are not supported by tibbles.) +#' 5. There are no size stability checks or element/row recycling to maintain +#' size stability in `epix_slide`, unlike in `epi_slide`. (`epix_slide` is +#' roughly analogous to [`dplyr::group_modify`], while `epi_slide` is roughly +#' analogous to `dplyr::mutate` followed by `dplyr::arrange`) This is detailed +#' in the "advanced" vignette. +#' 6. 
`all_rows` is not supported in `epix_slide`; since the slide +#' computations are allowed more flexibility in their outputs than in +#' `epi_slide`, we can't guess a good representation for missing computations +#' for excluded group-`ref_time_value` pairs. +#' 7. The `ref_time_values` default for `epix_slide` is based on making an +#' evenly-spaced sequence out of the `version`s in the `DT` plus the +#' `versions_end`, rather than the `time_value`s. +#' +#' Apart from the above distinctions, the interfaces between `epix_slide()` and +#' `epi_slide()` are the same. +#' +#' Furthermore, the current function can be considerably slower than +#' `epi_slide()`, for two reasons: (1) it must repeatedly fetch +#' properly-versioned snapshots from the data archive (via its `as_of()` +#' method), and (2) it performs a "manual" sliding of sorts, and does not +#' benefit from the highly efficient `slider` package. For this reason, it +#' should never be used in place of `epi_slide()`, and only used when +#' version-aware sliding is necessary (as is its purpose). +#' +#' Finally, this is simply a wrapper around the `slide()` method of the +#' `epi_archive` and `grouped_epi_archive` classes, so if `x` is an +#' object of either of these classes, then: +#' ``` +#' epix_slide(x, new_var = comp(old_var), before = 119) +#' ``` +#' is equivalent to: +#' ``` +#' x$slide(new_var = comp(old_var), before = 119) +#' ``` +#' +#' Mutation and aliasing: `epix_slide` and `$slide` will not perform in-place +#' mutation of the input archives on their own. In some edge cases the inputs it +#' feeds to the slide computations may alias parts of the input archive, so copy +#' the slide computation inputs if needed before using mutating operations like +#' `data.table`'s `:=` operator. Similarly, in some edge cases, the output of +#' the slide operation may alias parts of the input archive, so similarly, make +#' sure to clone and/or copy appropriately before using in-place mutation. 
+#' +#' @examples +#' library(dplyr) +#' +#' # Reference time points for which we want to compute slide values: +#' ref_time_values <- seq(as.Date("2020-06-01"), +#' as.Date("2020-06-15"), +#' by = "1 day" +#' ) +#' +#' # A simple (but not very useful) example (see the archive vignette for a more +#' # realistic one): +#' archive_cases_dv_subset %>% +#' group_by(geo_value) %>% +#' epix_slide( +#' f = ~ mean(.x$case_rate_7d_av), +#' before = 2, +#' ref_time_values = ref_time_values, +#' new_col_name = "case_rate_7d_av_recent_av" +#' ) %>% +#' ungroup() +#' # We requested time windows that started 2 days before the corresponding time +#' # values. The actual number of `time_value`s in each computation depends on +#' # the reporting latency of the signal and `time_value` range covered by the +#' # archive (2020-06-01 -- 2021-11-30 in this example). In this case, we have +#' # * 0 `time_value`s, for ref time 2020-06-01 --> the result is automatically +#' # discarded +#' # * 1 `time_value`, for ref time 2020-06-02 +#' # * 2 `time_value`s, for the rest of the results +#' # * never the 3 `time_value`s we would get from `epi_slide`, since, because +#' # of data latency, we'll never have an observation +#' # `time_value == ref_time_value` as of `ref_time_value`. +#' # The example below shows this type of behavior in more detail. +#' +#' # Examining characteristics of the data passed to each computation with +#' # `all_versions=FALSE`. 
+#' archive_cases_dv_subset %>% +#' group_by(geo_value) %>% +#' epix_slide( +#' function(x, gk, rtv) { +#' tibble( +#' time_range = if (nrow(x) == 0L) { +#' "0 `time_value`s" +#' } else { +#' sprintf("%s -- %s", min(x$time_value), max(x$time_value)) +#' }, +#' n = nrow(x), +#' class1 = class(x)[[1L]] +#' ) +#' }, +#' before = 5, all_versions = FALSE, +#' ref_time_values = ref_time_values, names_sep = NULL +#' ) %>% +#' ungroup() %>% +#' arrange(geo_value, time_value) +#' +#' # --- Advanced: --- +#' +#' # `epix_slide` with `all_versions=FALSE` (the default) applies a +#' # version-unaware computation to several versions of the data. We can also +#' # use `all_versions=TRUE` to apply a version-*aware* computation to several +#' # versions of the data, again looking at characteristics of the data passed +#' # to each computation. In this case, each computation should expect an +#' # `epi_archive` containing the relevant version data: +#' +#' archive_cases_dv_subset %>% +#' group_by(geo_value) %>% +#' epix_slide( +#' function(x, gk, rtv) { +#' tibble( +#' versions_start = if (nrow(x$DT) == 0L) { +#' "NA (0 rows)" +#' } else { +#' toString(min(x$DT$version)) +#' }, +#' versions_end = x$versions_end, +#' time_range = if (nrow(x$DT) == 0L) { +#' "0 `time_value`s" +#' } else { +#' sprintf("%s -- %s", min(x$DT$time_value), max(x$DT$time_value)) +#' }, +#' n = nrow(x$DT), +#' class1 = class(x)[[1L]] +#' ) +#' }, +#' before = 5, all_versions = TRUE, +#' ref_time_values = ref_time_values, names_sep = NULL +#' ) %>% +#' ungroup() %>% +#' # Focus on one geo_value so we can better see the columns above: +#' filter(geo_value == "ca") %>% +#' select(-geo_value) +#' +#' @importFrom rlang enquo !!! 
+#' @export +epix_slide2 <- function(x, f, ..., before, ref_time_values, + time_step, new_col_name = "slide_value", + as_list_col = FALSE, names_sep = "_", + all_versions = FALSE) { + if (!is_epi_archive2(x, grouped_okay = TRUE)) { + cli_abort("`x` must be of class `epi_archive` or `grouped_epi_archive`.") + } + return(slide(x, f, ..., + before = before, + ref_time_values = ref_time_values, + time_step = time_step, + new_col_name = new_col_name, + as_list_col = as_list_col, + names_sep = names_sep, + all_versions = all_versions + )) +} + + +#' Filter an `epi_archive` object to keep only older versions +#' +#' Generates a filtered `epi_archive` from an `epi_archive` object, keeping +#' only rows with `version` falling on or before a specified date. +#' +#' @param x An `epi_archive` object +#' @param max_version Time value specifying the max version to permit in the +#' filtered archive. That is, the output archive will comprise rows of the +#' current archive data having `version` less than or equal to the +#' specified `max_version` +#' @return An `epi_archive` object +#' +#' @export +epix_truncate_versions_after <- function(x, max_version) { + UseMethod("epix_truncate_versions_after") +} + +#' @export +epix_truncate_versions_after.epi_archive2 <- function(x, max_version) { + cloned_epi_archive <- clone(x) + return((truncate_versions_after(cloned_epi_archive, max_version))) + # ^ uses the clone so `x` is untouched; outer parens drop invisibility +} diff --git a/man/as_epi_archive2.Rd b/man/as_epi_archive2.Rd new file mode 100644 index 00000000..090b455a --- /dev/null +++ b/man/as_epi_archive2.Rd @@ -0,0 +1,142 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/archive_new.R +\name{as_epi_archive2} +\alias{as_epi_archive2} +\title{Convert to \code{epi_archive} format} +\usage{ +as_epi_archive2( + x, + geo_type, + time_type, + other_keys, + additional_metadata = list(), + compactify = NULL, + clobberable_versions_start = NA, + versions_end = max_version_with_row_in(x) +) +} 
+\arguments{ +\item{x}{A data frame, data table, or tibble, with columns \code{geo_value}, +\code{time_value}, \code{version}, and then any additional number of columns.} + +\item{geo_type}{Type for the geo values. If missing, then the function will +attempt to infer it from the geo values present; if this fails, then it +will be set to "custom".} + +\item{time_type}{Type for the time values. If missing, then the function will +attempt to infer it from the time values present; if this fails, then it +will be set to "custom".} + +\item{other_keys}{Character vector specifying the names of variables in \code{x} +that should be considered key variables (in the language of \code{data.table}) +apart from "geo_value", "time_value", and "version".} + +\item{additional_metadata}{List of additional metadata to attach to the +\code{epi_archive} object. The metadata will have \code{geo_type} and \code{time_type} +fields; named entries from the passed list will be included as well.} + +\item{compactify}{Optional; Boolean or \code{NULL}: should we remove rows that are +considered redundant for the purposes of \code{epi_archive}'s built-in methods +such as \code{as_of}? As these methods use the last version of each observation +carried forward (LOCF) to interpolate between the version data provided, +rows that don't change these LOCF results can potentially be omitted to +save space. \code{TRUE} will remove these rows, \code{FALSE} will not, and missing or +\code{NULL} will remove these rows and issue a warning. Generally, this can be +set to \code{TRUE}, but if you directly inspect or edit the fields of the +\code{epi_archive} such as its \code{DT}, you will have to determine whether +\code{compactify=TRUE} will produce the desired results. 
If compactification +here is removing a large proportion of the rows, this may indicate a +potential for space, time, or bandwidth savings upstream the data pipeline, +e.g., when fetching, storing, or preparing the input data \code{x}} + +\item{clobberable_versions_start}{Optional; \code{length}-1; either a value of the +same \code{class} and \code{typeof} as \code{x$version}, or an \code{NA} of any \code{class} and +\code{typeof}: specifically, either (a) the earliest version that could be +subject to "clobbering" (being overwritten with different update data, but +using the \emph{same} version tag as the old update data), or (b) \code{NA}, to +indicate that no versions are clobberable. There are a variety of reasons +why versions could be clobberable under routine circumstances, such as (a) +today's version of one/all of the columns being published after initially +being filled with \code{NA} or LOCF, (b) a buggy version of today's data being +published but then fixed and republished later in the day, or (c) data +pipeline delays (e.g., publisher uploading, periodic scraping, database +syncing, periodic fetching, etc.) that make events (a) or (b) reflected +later in the day (or even on a different day) than expected; potential +causes vary between different data pipelines. The default value is \code{NA}, +which doesn't consider any versions to be clobberable. Another setting that +may be appropriate for some pipelines is \code{max_version_with_row_in(x)}.} + +\item{versions_end}{Optional; length-1, same \code{class} and \code{typeof} as +\code{x$version}: what is the last version we have observed? The default is +\code{max_version_with_row_in(x)}, but values greater than this could also be +valid, and would indicate that we observed additional versions of the data +beyond \code{max(x$version)}, but they all contained empty updates. 
(The default +value of \code{clobberable_versions_start} does not fully trust these empty +updates, and assumes that any version \verb{>= max(x$version)} could be +clobbered.) If \code{nrow(x) == 0}, then this argument is mandatory.} +} +\value{ +An \code{epi_archive} object. +} +\description{ +Converts a data frame, data table, or tibble into an \code{epi_archive} +object. See the \href{https://cmu-delphi.github.io/epiprocess/articles/archive.html}{archive vignette} for +examples. The parameter descriptions below are copied from there +} +\details{ +This simply a wrapper around the \code{new()} method of the \code{epi_archive} +class, so for example: + +\if{html}{\out{
<div class="sourceCode">}}\preformatted{x <- as_epi_archive(df, geo_type = "state", time_type = "day") +}\if{html}{\out{</div>
}} + +would be equivalent to: + +\if{html}{\out{
<div class="sourceCode">}}\preformatted{x <- epi_archive$new(df, geo_type = "state", time_type = "day") +}\if{html}{\out{</div>
}} +} +\examples{ +# Simple ex. with necessary keys +tib <- tibble::tibble( + geo_value = rep(c("ca", "hi"), each = 5), + time_value = rep(seq(as.Date("2020-01-01"), + by = 1, length.out = 5 + ), times = 2), + version = rep(seq(as.Date("2020-01-02"), + by = 1, length.out = 5 + ), times = 2), + value = rnorm(10, mean = 2, sd = 1) +) + +toy_epi_archive <- tib \%>\% as_epi_archive( + geo_type = "state", + time_type = "day" +) +toy_epi_archive + +# Ex. with an additional key for county +df <- data.frame( + geo_value = c(replicate(2, "ca"), replicate(2, "fl")), + county = c(1, 3, 2, 5), + time_value = c( + "2020-06-01", + "2020-06-02", + "2020-06-01", + "2020-06-02" + ), + version = c( + "2020-06-02", + "2020-06-03", + "2020-06-02", + "2020-06-03" + ), + cases = c(1, 2, 3, 4), + cases_rate = c(0.01, 0.02, 0.01, 0.05) +) + +x <- df \%>\% as_epi_archive( + geo_type = "state", + time_type = "day", + other_keys = "county" +) +} diff --git a/man/as_of.epi_archive2.Rd b/man/as_of.epi_archive2.Rd new file mode 100644 index 00000000..21a4cfc1 --- /dev/null +++ b/man/as_of.epi_archive2.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/archive_new.R +\name{as_of.epi_archive2} +\alias{as_of.epi_archive2} +\title{As of epi_archive} +\usage{ +\method{as_of}{epi_archive2}(epi_archive, max_version, min_time_value = -Inf, all_versions = FALSE) +} +\arguments{ +\item{epi_archive}{An \code{epi_archive} object} + +\item{max_version}{Version specifying the max version to permit in the +snapshot. That is, the snapshot will comprise the unique rows of the +current archive data that represent the most up-to-date signal values, as +of the specified \code{max_version} (and whose \code{time_value}s are at least +\code{min_time_value}).} + +\item{min_time_value}{Time value specifying the min \code{time_value} to permit in +the snapshot. 
Default is \code{-Inf}, which effectively means that there is no +minimum considered.} + +\item{all_versions}{Boolean; If \code{all_versions = TRUE}, then the output will be in +\code{epi_archive} format, and contain rows in the specified \code{time_value} range +having \code{version <= max_version}. The resulting object will cover a +potentially narrower \code{version} and \code{time_value} range than \code{x}, depending +on user-provided arguments. Otherwise, there will be one row in the output +for the \code{max_version} of each \code{time_value}. Default is \code{FALSE}.} +} +\description{ +Generates a snapshot in \code{epi_df} format as of a given version. +See the documentation for the wrapper function \code{\link[=epix_as_of]{epix_as_of()}} for +details. The parameter descriptions below are copied from there +} diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd index 6a25b2af..efe5d5ba 100644 --- a/man/epi_archive.Rd +++ b/man/epi_archive.Rd @@ -1,9 +1,14 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/archive.R +% Please edit documentation in R/archive.R, R/archive_new.R \name{epi_archive} \alias{epi_archive} \title{\code{epi_archive} object} \description{ +An \code{epi_archive} is an R6 class which contains a data table +along with several relevant pieces of metadata. The data table can be seen +as the full archive (version history) for some signal variables of +interest. + An \code{epi_archive} is an R6 class which contains a data table along with several relevant pieces of metadata. The data table can be seen as the full archive (version history) for some signal variables of @@ -49,6 +54,56 @@ represent potential update data that we do not yet have access to; or in version in which it was first released, or if no version of that observation appears in the archive data at all. +\strong{A word of caution:} R6 objects, unlike most other objects in R, have +reference semantics. 
A primary consequence of this is that objects are not +copied when modified. You can read more about this in Hadley Wickham's +\href{https://adv-r.hadley.nz/r6.html#r6-semantics}{Advanced R} book. In order +to construct a modified archive while keeping the original intact, first +make a clone using the \verb{$clone} method, then overwrite the clone's \code{DT} +field with \code{data.table::copy(clone$DT)}, and finally perform the +modifications on the clone. + +epi archive + +An \code{epi_archive} is an R6 class which contains a data table \code{DT}, of +class \code{data.table} from the \code{data.table} package, with (at least) the +following columns: +\itemize{ +\item \code{geo_value}: the geographic value associated with each row of measurements. +\item \code{time_value}: the time value associated with each row of measurements. +\item \code{version}: the time value specifying the version for each row of +measurements. For example, if in a given row the \code{version} is January 15, +2022 and \code{time_value} is January 14, 2022, then this row contains the +measurements of the data for January 14, 2022 that were available one day +later. +} + +The data table \code{DT} has key variables \code{geo_value}, \code{time_value}, \code{version}, +as well as any others (these can be specified when instantiating the +\code{epi_archive} object via the \code{other_keys} argument, and/or set by operating +on \code{DT} directly). Refer to the documentation for \code{\link[=as_epi_archive]{as_epi_archive()}} for +information and examples of relevant parameter names for an \code{epi_archive} object. +Note that there can only be a single row per unique combination of +key variables, and thus the key variables are critical for figuring out how +to generate a snapshot of data from the archive, as of a given version. 
+ +In general, the last version of each observation is carried forward (LOCF) to +fill in data between recorded versions, and between the last recorded +update and the \code{versions_end}. One consequence is that the \code{DT} +doesn't have to contain a full snapshot of every version (although this +generally works), but can instead contain only the rows that are new or +changed from the previous version (see \code{compactify}, which does this +automatically). Currently, deletions must be represented as revising a row +to a special state (e.g., making the entries \code{NA} or including a special +column that flags the data as removed and performing some kind of +post-processing), and the archive is unaware of what this state is. Note +that \code{NA}s \emph{can} be introduced by \code{epi_archive} methods for other reasons, +e.g., in \code{\link{epix_fill_through_version}} and \code{\link{epix_merge}}, if requested, to +represent potential update data that we do not yet have access to; or in +\code{\link{epix_merge}} to represent the "value" of an observation before the +version in which it was first released, or if no version of that +observation appears in the archive data at all. + \strong{A word of caution:} R6 objects, unlike most other objects in R, have reference semantics. A primary consequence of this is that objects are not copied when modified. You can read more about this in Hadley Wickham's @@ -60,6 +115,22 @@ modifications on the clone. } \section{Metadata}{ +The following pieces of metadata are included as fields in an \code{epi_archive} +object: +\itemize{ +\item \code{geo_type}: the type for the geo values. +\item \code{time_type}: the type for the time values. +\item \code{additional_metadata}: list of additional metadata for the data archive. +} + +Unlike an \code{epi_df} object, metadata for an \code{epi_archive} object \code{x} can be +accessed (and altered) directly, as in \code{x$geo_type} or \code{x$time_type}, +etc. 
Like an \code{epi_df} object, the \code{geo_type} and \code{time_type} fields in the +metadata of an \code{epi_archive} object are not currently used by any +downstream functions in the \code{epiprocess} package, and serve only as useful +bits of information to convey about the data set at hand. + + The following pieces of metadata are included as fields in an \code{epi_archive} object: \itemize{ @@ -78,6 +149,13 @@ bits of information to convey about the data set at hand. \section{Generating Snapshots}{ +An \code{epi_archive} object can be used to generate a snapshot of the data in +\code{epi_df} format, which represents the most up-to-date values of the signal +variables, as of the specified version. This is accomplished by calling the +\code{as_of()} method for an \code{epi_archive} object \code{x}. More details on this +method are documented in the wrapper function \code{\link[=epix_as_of]{epix_as_of()}}. + + An \code{epi_archive} object can be used to generate a snapshot of the data in \code{epi_df} format, which represents the most up-to-date values of the signal variables, as of the specified version. This is accomplished by calling the @@ -87,6 +165,16 @@ method are documented in the wrapper function \code{\link[=epix_as_of]{epix_as_o \section{Sliding Computations}{ +We can run a sliding computation over an \code{epi_archive} object, much like +\code{epi_slide()} does for an \code{epi_df} object. This is accomplished by calling +the \code{slide()} method for an \code{epi_archive} object, which works similarly to +the way \code{epi_slide()} works for an \code{epi_df} object, but with one key +difference: it is version-aware. That is, for an \code{epi_archive} object, the +sliding computation at any given reference time point t is performed on +\strong{data that would have been available as of t}. More details on \code{slide()} +are documented in the wrapper function \code{\link[=epix_slide]{epix_slide()}}. 
+ + We can run a sliding computation over an \code{epi_archive} object, much like \code{epi_slide()} does for an \code{epi_df} object. This is accomplished by calling the \code{slide()} method for an \code{epi_archive} object, which works similarly to @@ -109,6 +197,22 @@ tib <- tibble::tibble( value = rnorm(10, mean = 2, sd = 1) ) +toy_epi_archive <- tib \%>\% epi_archive$new( + geo_type = "state", + time_type = "day" +) +toy_epi_archive +tib <- tibble::tibble( + geo_value = rep(c("ca", "hi"), each = 5), + time_value = rep(seq(as.Date("2020-01-01"), + by = 1, length.out = 5 + ), times = 2), + version = rep(seq(as.Date("2020-01-02"), + by = 1, length.out = 5 + ), times = 2), + value = rnorm(10, mean = 2, sd = 1) +) + toy_epi_archive <- tib \%>\% epi_archive$new( geo_type = "state", time_type = "day" diff --git a/man/epix_as_of2.Rd b/man/epix_as_of2.Rd new file mode 100644 index 00000000..6c3db717 --- /dev/null +++ b/man/epix_as_of2.Rd @@ -0,0 +1,96 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/methods-epi_archive_new.R +\name{epix_as_of2} +\alias{epix_as_of2} +\title{Generate a snapshot from an \code{epi_archive} object} +\usage{ +epix_as_of2( + epi_archive, + max_version, + min_time_value = -Inf, + all_versions = FALSE +) +} +\arguments{ +\item{max_version}{Time value specifying the max version to permit in the +snapshot. That is, the snapshot will comprise the unique rows of the +current archive data that represent the most up-to-date signal values, as +of the specified \code{max_version} (and whose time values are at least +\code{min_time_value}.)} + +\item{min_time_value}{Time value specifying the min time value to permit in +the snapshot. 
Default is \code{-Inf}, which effectively means that there is no +minimum considered.} + +\item{all_versions}{If \code{all_versions = TRUE}, then the output will be in +\code{epi_archive} format, and contain rows in the specified \code{time_value} range +having \code{version <= max_version}. The resulting object will cover a +potentially narrower \code{version} and \code{time_value} range than \code{x}, depending +on user-provided arguments. Otherwise, there will be one row in the output +for the \code{max_version} of each \code{time_value}. Default is \code{FALSE}.} + +\item{x}{An \code{epi_archive} object} +} +\value{ +An \code{epi_df} object. +} +\description{ +Generates a snapshot in \code{epi_df} format from an \code{epi_archive} object, as of a +given version. See the \href{https://cmu-delphi.github.io/epiprocess/articles/archive.html}{archive vignette} for +examples. +} +\details{ +This is simply a wrapper around the \code{as_of()} method of the +\code{epi_archive} class, so if \code{x} is an \code{epi_archive} object, then: + +\if{html}{\out{
<div class="sourceCode">}}\preformatted{epix_as_of(x, max_version = v) +}\if{html}{\out{</div>
}} + +is equivalent to: + +\if{html}{\out{
<div class="sourceCode">}}\preformatted{x$as_of(max_version = v) +}\if{html}{\out{</div>
}} + +Mutation and aliasing: \code{epix_as_of} and \verb{$as_of} will not mutate the input +archives, but may in some edge cases alias parts of the inputs, so copy the +outputs if needed before using mutating operations like \code{data.table}'s \verb{:=} +operator. Currently, the only situation where there is potentially aliasing +is of the \code{DT} in edge cases with \code{all_versions = TRUE}, but this may change +in the future. +} +\examples{ +# warning message of data latency shown +epix_as_of2( + x = archive_cases_dv_subset, + max_version = max(archive_cases_dv_subset$DT$version) +) + + +range(archive_cases_dv_subset$DT$version) # 2020-06-02 -- 2021-12-01 + +epix_as_of2( + x = archive_cases_dv_subset, + max_version = as.Date("2020-06-12") +) + +# When fetching a snapshot as of the latest version with update data in the +# archive, a warning is issued by default, as this update data might not yet +# be finalized (for example, if data versions are labeled with dates, these +# versions might be overwritten throughout the corresponding days with +# additional data or "hotfixes" of erroneous data; when we build an archive +# based on database queries, the latest available update might still be +# subject to change, but previous versions should be finalized). We can +# muffle such warnings with the following pattern: +withCallingHandlers( + { + epix_as_of2( + x = archive_cases_dv_subset, + max_version = max(archive_cases_dv_subset$DT$version) + ) + }, + epiprocess__snapshot_as_of_clobberable_version = function(wrn) invokeRestart("muffleWarning") +) +# Since R 4.0, there is a `globalCallingHandlers` function that can be used +# to globally toggle these warnings. 
+ +} diff --git a/man/epix_fill_through_version2.Rd b/man/epix_fill_through_version2.Rd new file mode 100644 index 00000000..7389388a --- /dev/null +++ b/man/epix_fill_through_version2.Rd @@ -0,0 +1,48 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/methods-epi_archive_new.R +\name{epix_fill_through_version2} +\alias{epix_fill_through_version2} +\title{\code{epi_archive} with unobserved history filled in (won't mutate, might alias)} +\usage{ +epix_fill_through_version2( + epi_archive, + fill_versions_end, + how = c("na", "locf") +) +} +\arguments{ +\item{fill_versions_end}{Length-1, same class&type as \code{x$version}: the +version through which to fill in missing version history; this will be the +result's \verb{$versions_end} unless it already had a later +\verb{$versions_end}.} + +\item{how}{Optional; \code{"na"} or \code{"locf"}: \code{"na"} will fill in any missing +required version history with \code{NA}s, by inserting (if necessary) an update +immediately after the current \verb{$versions_end} that revises all +existing measurements to be \code{NA} (this is only supported for \code{version} +classes with a \code{next_after} implementation); \code{"locf"} will fill in missing +version history with the last version of each observation carried forward +(LOCF), by leaving the update \verb{$DT} alone (other \code{epi_archive} methods are +based on LOCF). Default is \code{"na"}.} + +\item{x}{An \code{epi_archive}} +} +\value{ +An \code{epi_archive} +} +\description{ +Sometimes, due to upstream data pipeline issues, we have to work with a +version history that isn't completely up to date, but with functions that +expect archives that are completely up to date, or equally as up-to-date as +another archive. This function provides one way to approach such mismatches: +pretend that we've "observed" additional versions, filling in these versions +with NAs or extrapolated values. 
+} +\details{ +\code{epix_fill_through_version} will not mutate its \code{x} argument, but its result +might alias fields of \code{x} (e.g., mutating the result's \code{DT} might mutate +\code{x$DT}). The R6 method variant, \code{x$fill_through_version}, will mutate \code{x} to +give the result, but might reseat its fields (e.g., references to the old +\code{x$DT} might not be updated by this function or subsequent operations on +\code{x}), and returns the updated \code{x} \link[base:invisible]{invisibly}. +} diff --git a/man/epix_merge2.Rd b/man/epix_merge2.Rd new file mode 100644 index 00000000..a42e53e4 --- /dev/null +++ b/man/epix_merge2.Rd @@ -0,0 +1,73 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/methods-epi_archive_new.R +\name{epix_merge2} +\alias{epix_merge2} +\title{Merge two \code{epi_archive} objects} +\usage{ +epix_merge2( + x, + y, + sync = c("forbid", "na", "locf", "truncate"), + compactify = TRUE +) +} +\arguments{ +\item{x, y}{Two \code{epi_archive} objects to join together.} + +\item{sync}{Optional; \code{"forbid"}, \code{"na"}, \code{"locf"}, or \code{"truncate"}; in the +case that \code{x$versions_end} doesn't match \code{y$versions_end}, what do we do?: +\code{"forbid"}: emit an error; \code{"na"}: use \code{max(x$versions_end, y$versions_end)} +as the result's \code{versions_end}, but ensure that, if we request a snapshot +as of a version after \code{min(x$versions_end, y$versions_end)}, the +observation columns from the less up-to-date archive will be all NAs (i.e., +imagine there was an update immediately after its \code{versions_end} which +revised all observations to be \code{NA}); \code{"locf"}: use \code{max(x$versions_end, y$versions_end)} as the result's \code{versions_end}, allowing the last version +of each observation to be carried forward to extrapolate unavailable +versions for the less up-to-date input archive (i.e., imagining that the +less up-to-date archive's data set remained unchanged 
between its actual +\code{versions_end} and the other archive's \code{versions_end}); or \code{"truncate"}: +use \code{min(x$versions_end, y$versions_end)} as the result's \code{versions_end}, +and discard any rows containing update rows for later versions.} + +\item{compactify}{Optional; \code{TRUE}, \code{FALSE}, or \code{NULL}; should the result be +compactified? See \code{\link{as_epi_archive}} for an explanation of what this means. +Default here is \code{TRUE}.} +} +\value{ +the resulting \code{epi_archive} +} +\description{ +Merges two \code{epi_archive}s that share a common \code{geo_value}, \code{time_value}, and +set of key columns. When they also share a common \code{versions_end}, +using \verb{$as_of} on the result should be the same as using \verb{$as_of} on \code{x} and +\code{y} individually, then performing a full join of the \code{DT}s on the non-version +key columns (potentially consolidating multiple warnings about clobberable +versions). If the \code{versions_end} values differ, the +\code{sync} parameter controls what is done. +} +\details{ +This function, \code{\link{epix_merge}}, does not mutate its inputs and will not alias +either archive's \code{DT}, but may alias other fields; \code{x$merge} will overwrite +\code{x} with the result of the merge, reseating its \code{DT} and several other fields +(making them point to different objects), but avoiding mutation of the +contents of the old \code{DT} (only relevant if you have another reference to the +old \code{DT} in another object). + +In all cases, \code{additional_metadata} will be an empty list, and +\code{clobberable_versions_start} will be set to the earliest version that could +be clobbered in either input archive. 
+} +\examples{ +# create two example epi_archive datasets +x <- archive_cases_dv_subset$DT \%>\% + dplyr::select(geo_value, time_value, version, case_rate_7d_av) \%>\% + as_epi_archive(compactify = TRUE) +y <- archive_cases_dv_subset$DT \%>\% + dplyr::select(geo_value, time_value, version, percent_cli) \%>\% + as_epi_archive(compactify = TRUE) +# merge results stored in a third object: +xy <- epix_merge(x, y) +# vs. mutating x to hold the merge result: +x$merge(y) + +} diff --git a/man/epix_slide2.Rd b/man/epix_slide2.Rd new file mode 100644 index 00000000..71d3a11c --- /dev/null +++ b/man/epix_slide2.Rd @@ -0,0 +1,283 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/methods-epi_archive_new.R +\name{epix_slide2} +\alias{epix_slide2} +\title{Slide a function over variables in an \code{epi_archive} or \code{grouped_epi_archive}} +\usage{ +epix_slide2( + x, + f, + ..., + before, + ref_time_values, + time_step, + new_col_name = "slide_value", + as_list_col = FALSE, + names_sep = "_", + all_versions = FALSE +) +} +\arguments{ +\item{x}{An \code{\link{epi_archive}} or \code{\link{grouped_epi_archive}} object. If ungrouped, +all data in \code{x} will be treated as part of a single data group.} + +\item{f}{Function, formula, or missing; together with \code{...} specifies the +computation to slide. To "slide" means to apply a computation over a +sliding (a.k.a. "rolling") time window for each data group. The window is +determined by the \code{before} parameter described below. One time step is +typically one day or one week; see \code{\link{epi_slide}} details for more +explanation. If a function, \code{f} must take an \code{epi_df} with the same +column names as the archive's \code{DT}, minus the \code{version} column; followed +by a one-row tibble containing the values of the grouping variables for +the associated group; followed by a reference time value, usually as a +\code{Date} object; followed by any number of named arguments. 
If a formula, +\code{f} can operate directly on columns accessed via \code{.x$var} or \code{.$var}, as +in \code{~ mean (.x$var)} to compute a mean of a column \code{var} for each +group-\code{ref_time_value} combination. The group key can be accessed via +\code{.y} or \code{.group_key}, and the reference time value can be accessed via +\code{.z} or \code{.ref_time_value}. If \code{f} is missing, then \code{...} will specify the +computation.} + +\item{...}{Additional arguments to pass to the function or formula specified +via \code{f}. Alternatively, if \code{f} is missing, then \code{...} is interpreted as an +expression for tidy evaluation; in addition to referring to columns +directly by name, the expression has access to \code{.data} and \code{.env} pronouns +as in \code{dplyr} verbs, and can also refer to the \code{.group_key} and +\code{.ref_time_value}. See details of \code{\link{epi_slide}}.} + +\item{before}{How far \code{before} each \code{ref_time_value} should the sliding +window extend? If provided, should be a single, non-NA, +\link[vctrs:vec_cast]{integer-compatible} number of time steps. This window +endpoint is inclusive. For example, if \code{before = 7}, and one time step is +one day, then to produce a value for a \code{ref_time_value} of January 8, we +apply the given function or formula to data (for each group present) with +\code{time_value}s from January 1 onward, as they were reported on January 8. +For typical disease surveillance sources, this will not include any data +with a \code{time_value} of January 8, and, depending on the amount of reporting +latency, may not include January 7 or even earlier \code{time_value}s. (If +instead the archive were to hold nowcasts instead of regular surveillance +data, then we would indeed expect data for \code{time_value} January 8. 
If it +were to hold forecasts, then we would expect data for \code{time_value}s after +January 8, and the sliding window would extend as far after each +\code{ref_time_value} as needed to include all such \code{time_value}s.)} + +\item{ref_time_values}{Reference time values / versions for sliding +computations; each element of this vector serves both as the anchor point +for the \code{time_value} window for the computation and the \code{max_version} +\code{as_of} which we fetch data in this window. If missing, then this will be set +to a regularly-spaced sequence of values set to cover the range of +\code{version}s in the \code{DT} plus the \code{versions_end}; the spacing of values will +be guessed (using the GCD of the skips between values).} + +\item{time_step}{Optional function used to define the meaning of one time +step, which if specified, overrides the default choice based on the +\code{time_value} column. This function must take a positive integer and return +an object of class \code{lubridate::period}. For example, we can use \code{time_step = lubridate::hours} in order to set the time step to be one hour (this +would only be meaningful if \code{time_value} is of class \code{POSIXct}).} + +\item{new_col_name}{String indicating the name of the new column that will +contain the derivative values. Default is "slide_value"; note that setting +\code{new_col_name} equal to an existing column name will overwrite this column.} + +\item{as_list_col}{Should the slide results be held in a list column, or be +\link[tidyr:chop]{unchopped}/\link[tidyr:unnest]{unnested}? 
Default is \code{FALSE}, +in which case a list object returned by \code{f} would be unnested (using +\code{\link[tidyr:unnest]{tidyr::unnest()}}), and, if the slide computations output data frames, +the names of the resulting columns are given by prepending \code{new_col_name} +to the names of the list elements.} + +\item{names_sep}{String specifying the separator to use in \code{tidyr::unnest()} +when \code{as_list_col = FALSE}. Default is "_". Using \code{NULL} drops the prefix +from \code{new_col_name} entirely.} + +\item{all_versions}{(Not the same as \code{all_rows} parameter of \code{epi_slide}.) If +\code{all_versions = TRUE}, then \code{f} will be passed the version history (all +\code{version <= ref_time_value}) for rows having \code{time_value} between +\code{ref_time_value - before} and \code{ref_time_value}. Otherwise, \code{f} will be +passed only the most recent \code{version} for every unique \code{time_value}. +Default is \code{FALSE}.} +} +\value{ +A tibble whose columns are: the grouping variables, \code{time_value}, +containing the reference time values for the slide computation, and a +column named according to the \code{new_col_name} argument, containing the slide +values. +} +\description{ +Slides a given function over variables in an \code{epi_archive} object. This +behaves similarly to \code{epi_slide()}, with the key exception that it is +version-aware: the sliding computation at any given reference time t is +performed on \strong{data that would have been available as of t}. See the +\href{https://cmu-delphi.github.io/epiprocess/articles/archive.html}{archive vignette} for +examples. 
+} +\details{ +A few key distinctions between the current function and \code{epi_slide()}: +\enumerate{ +\item In \code{f} functions for \code{epix_slide}, one should not assume that the input +data will contain any rows with \code{time_value} matching the computation's +\code{ref_time_value} (accessible via \verb{attributes()$metadata$as_of}); for +typical epidemiological surveillance data, observations pertaining to a +particular time period (\code{time_value}) are first reported \code{as_of} some +instant after that time period has ended. +\item \code{epix_slide()} doesn't accept an \code{after} argument; its windows extend +from \code{before} time steps before a given \code{ref_time_value} through the last +\code{time_value} available as of version \code{ref_time_value} (typically, this +won't include \code{ref_time_value} itself, as observations about a particular +time interval (e.g., day) are only published after that time interval +ends); \code{epi_slide} windows extend from \code{before} time steps before a +\code{ref_time_value} through \code{after} time steps after \code{ref_time_value}. +\item The input class and columns are similar but different: \code{epix_slide} +(with the default \code{all_versions=FALSE}) keeps all columns and the +\code{epi_df}-ness of the first argument to each computation; \code{epi_slide} only +provides the grouping variables in the second input, and will convert the +first input into a regular tibble if the grouping variables include the +essential \code{geo_value} column. (With \code{all_versions=TRUE}, \code{epix_slide} will +provide an \code{epi_archive} rather than an \code{epi_df} to each +computation.) 
+\item The output class and columns are similar but different: \code{epix_slide()} +returns a tibble containing only the grouping variables, \code{time_value}, and +the new column(s) from the slide computations, whereas \code{epi_slide()} +returns an \code{epi_df} with all original variables plus the new columns from +the slide computations. (Both will mirror the grouping or ungroupedness of +their input, with one exception: \code{epi_archive}s can have trivial +(zero-variable) groupings, but these will be dropped in \code{epix_slide} +results as they are not supported by tibbles.) +\item There are no size stability checks or element/row recycling to maintain +size stability in \code{epix_slide}, unlike in \code{epi_slide}. (\code{epix_slide} is +roughly analogous to \code{\link[dplyr:group_map]{dplyr::group_modify}}, while \code{epi_slide} is roughly +analogous to \code{dplyr::mutate} followed by \code{dplyr::arrange}) This is detailed +in the "advanced" vignette. +\item \code{all_rows} is not supported in \code{epix_slide}; since the slide +computations are allowed more flexibility in their outputs than in +\code{epi_slide}, we can't guess a good representation for missing computations +for excluded group-\code{ref_time_value} pairs. +\item The \code{ref_time_values} default for \code{epix_slide} is based on making an +evenly-spaced sequence out of the \code{version}s in the \code{DT} plus the +\code{versions_end}, rather than the \code{time_value}s. +} + +Apart from the above distinctions, the interfaces between \code{epix_slide()} and +\code{epi_slide()} are the same. + +Furthermore, the current function can be considerably slower than +\code{epi_slide()}, for two reasons: (1) it must repeatedly fetch +properly-versioned snapshots from the data archive (via its \code{as_of()} +method), and (2) it performs a "manual" sliding of sorts, and does not +benefit from the highly efficient \code{slider} package. 
For this reason, it +should never be used in place of \code{epi_slide()}, and only used when +version-aware sliding is necessary (as is its purpose). + +Finally, this is simply a wrapper around the \code{slide()} method of the +\code{epi_archive} and \code{grouped_epi_archive} classes, so if \code{x} is an +object of either of these classes, then: + +\if{html}{\out{
}}\preformatted{epix_slide(x, new_var = comp(old_var), before = 119) +}\if{html}{\out{
}} + +is equivalent to: + +\if{html}{\out{
}}\preformatted{x$slide(new_var = comp(old_var), before = 119) +}\if{html}{\out{
}} + +Mutation and aliasing: \code{epix_slide} and \verb{$slide} will not perform in-place +mutation of the input archives on their own. In some edge cases the inputs it +feeds to the slide computations may alias parts of the input archive, so copy +the slide computation inputs if needed before using mutating operations like +\code{data.table}'s \verb{:=} operator. Similarly, in some edge cases, the output of +the slide operation may alias parts of the input archive, so similarly, make +sure to clone and/or copy appropriately before using in-place mutation. +} +\examples{ +library(dplyr) + +# Reference time points for which we want to compute slide values: +ref_time_values <- seq(as.Date("2020-06-01"), + as.Date("2020-06-15"), + by = "1 day" +) + +# A simple (but not very useful) example (see the archive vignette for a more +# realistic one): +archive_cases_dv_subset \%>\% + group_by(geo_value) \%>\% + epix_slide( + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = ref_time_values, + new_col_name = "case_rate_7d_av_recent_av" + ) \%>\% + ungroup() +# We requested time windows that started 2 days before the corresponding time +# values. The actual number of `time_value`s in each computation depends on +# the reporting latency of the signal and `time_value` range covered by the +# archive (2020-06-01 -- 2021-11-30 in this example). In this case, we have +# * 0 `time_value`s, for ref time 2020-06-01 --> the result is automatically +# discarded +# * 1 `time_value`, for ref time 2020-06-02 +# * 2 `time_value`s, for the rest of the results +# * never the 3 `time_value`s we would get from `epi_slide`, since, because +# of data latency, we'll never have an observation +# `time_value == ref_time_value` as of `ref_time_value`. +# The example below shows this type of behavior in more detail. + +# Examining characteristics of the data passed to each computation with +# `all_versions=FALSE`. 
+archive_cases_dv_subset \%>\% + group_by(geo_value) \%>\% + epix_slide( + function(x, gk, rtv) { + tibble( + time_range = if (nrow(x) == 0L) { + "0 `time_value`s" + } else { + sprintf("\%s -- \%s", min(x$time_value), max(x$time_value)) + }, + n = nrow(x), + class1 = class(x)[[1L]] + ) + }, + before = 5, all_versions = FALSE, + ref_time_values = ref_time_values, names_sep = NULL + ) \%>\% + ungroup() \%>\% + arrange(geo_value, time_value) + +# --- Advanced: --- + +# `epix_slide` with `all_versions=FALSE` (the default) applies a +# version-unaware computation to several versions of the data. We can also +# use `all_versions=TRUE` to apply a version-*aware* computation to several +# versions of the data, again looking at characteristics of the data passed +# to each computation. In this case, each computation should expect an +# `epi_archive` containing the relevant version data: + +archive_cases_dv_subset \%>\% + group_by(geo_value) \%>\% + epix_slide( + function(x, gk, rtv) { + tibble( + versions_start = if (nrow(x$DT) == 0L) { + "NA (0 rows)" + } else { + toString(min(x$DT$version)) + }, + versions_end = x$versions_end, + time_range = if (nrow(x$DT) == 0L) { + "0 `time_value`s" + } else { + sprintf("\%s -- \%s", min(x$DT$time_value), max(x$DT$time_value)) + }, + n = nrow(x$DT), + class1 = class(x)[[1L]] + ) + }, + before = 5, all_versions = TRUE, + ref_time_values = ref_time_values, names_sep = NULL + ) \%>\% + ungroup() \%>\% + # Focus on one geo_value so we can better see the columns above: + filter(geo_value == "ca") \%>\% + select(-geo_value) + +} diff --git a/man/epix_truncate_versions_after.Rd b/man/epix_truncate_versions_after.Rd index 8f741418..f30be07f 100644 --- a/man/epix_truncate_versions_after.Rd +++ b/man/epix_truncate_versions_after.Rd @@ -1,9 +1,12 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/methods-epi_archive.R +% Please edit documentation in R/methods-epi_archive.R, +% R/methods-epi_archive_new.R 
\name{epix_truncate_versions_after} \alias{epix_truncate_versions_after} \title{Filter an \code{epi_archive} object to keep only older versions} \usage{ +epix_truncate_versions_after(x, max_version) + epix_truncate_versions_after(x, max_version) } \arguments{ @@ -15,9 +18,14 @@ current archive data having \code{version} less than or equal to the specified \code{max_version}} } \value{ +An \code{epi_archive} object + An \code{epi_archive} object } \description{ +Generates a filtered \code{epi_archive} from an \code{epi_archive} object, keeping +only rows with \code{version} falling on or before a specified date. + Generates a filtered \code{epi_archive} from an \code{epi_archive} object, keeping only rows with \code{version} falling on or before a specified date. } diff --git a/man/epix_truncate_versions_after.grouped_epi_archive2.Rd b/man/epix_truncate_versions_after.grouped_epi_archive2.Rd new file mode 100644 index 00000000..5fba48fb --- /dev/null +++ b/man/epix_truncate_versions_after.grouped_epi_archive2.Rd @@ -0,0 +1,11 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/grouped_archive_new.R +\name{epix_truncate_versions_after.grouped_epi_archive2} +\alias{epix_truncate_versions_after.grouped_epi_archive2} +\title{Truncate versions after a given version, grouped} +\usage{ +\method{epix_truncate_versions_after}{grouped_epi_archive2}(grouped_epi_archive, max_version) +} +\description{ +Truncate versions after a given version, grouped +} diff --git a/man/fill_through_version.epi_archive2.Rd b/man/fill_through_version.epi_archive2.Rd new file mode 100644 index 00000000..48afb864 --- /dev/null +++ b/man/fill_through_version.epi_archive2.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/archive_new.R +\name{fill_through_version.epi_archive2} +\alias{fill_through_version.epi_archive2} +\title{Fill through version} +\usage{ +\method{fill_through_version}{epi_archive2}(epi_archive, 
fill_versions_end, how = c("na", "locf")) +} +\arguments{ +\item{epi_archive}{an \code{epi_archive} object} + +\item{fill_versions_end}{as in \code{\link{epix_fill_through_version}}} + +\item{how}{as in \code{\link{epix_fill_through_version}}} +} +\description{ +Fill in unobserved history using requested scheme by mutating +the given object and potentially reseating its fields. See +\code{\link{epix_fill_through_version}}, which doesn't mutate the input archive but +might alias its fields. +} diff --git a/man/group_by.epi_archive.Rd b/man/group_by.epi_archive.Rd index 5e867bf3..f157e834 100644 --- a/man/group_by.epi_archive.Rd +++ b/man/group_by.epi_archive.Rd @@ -1,8 +1,14 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/methods-epi_archive.R, R/grouped_epi_archive.R +% Please edit documentation in R/methods-epi_archive.R, R/grouped_archive_new.R, +% R/grouped_epi_archive.R \name{group_by.epi_archive} \alias{group_by.epi_archive} \alias{grouped_epi_archive} +\alias{group_by.grouped_epi_archive2} +\alias{group_by_drop_default.grouped_epi_archive2} +\alias{groups.grouped_epi_archive2} +\alias{ungroup.grouped_epi_archive2} +\alias{is_grouped_epi_archive2} \alias{group_by.grouped_epi_archive} \alias{groups.grouped_epi_archive} \alias{ungroup.grouped_epi_archive} @@ -12,6 +18,21 @@ \usage{ \method{group_by}{epi_archive}(.data, ..., .add = FALSE, .drop = dplyr::group_by_drop_default(.data)) +\method{group_by}{grouped_epi_archive2}( + grouped_epi_archive, + ..., + .add = FALSE, + .drop = dplyr::group_by_drop_default(grouped_epi_archive) +) + +\method{group_by_drop_default}{grouped_epi_archive2}(grouped_epi_archive) + +\method{groups}{grouped_epi_archive2}(grouped_epi_archive) + +\method{ungroup}{grouped_epi_archive2}(grouped_epi_archive, ...) 
+ +is_grouped_epi_archive2(x) + \method{group_by}{grouped_epi_archive}(.data, ..., .add = FALSE, .drop = dplyr::group_by_drop_default(.data)) \method{groups}{grouped_epi_archive}(x) diff --git a/man/group_by.epi_archive2.Rd b/man/group_by.epi_archive2.Rd new file mode 100644 index 00000000..3191b134 --- /dev/null +++ b/man/group_by.epi_archive2.Rd @@ -0,0 +1,147 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/archive_new.R +\name{group_by.epi_archive2} +\alias{group_by.epi_archive2} +\alias{grouped_epi_archive} +\title{\code{group_by} and related methods for \code{epi_archive}, \code{grouped_epi_archive}} +\usage{ +\method{group_by}{epi_archive2}( + epi_archive, + ..., + .add = FALSE, + .drop = dplyr::group_by_drop_default(epi_archive) +) +} +\arguments{ +\item{...}{Similar to \code{\link[dplyr:group_by]{dplyr::group_by}} (see "Details:" for edge cases); +\itemize{ +\item For \code{group_by}: unquoted variable name(s) or other +\link[dplyr:dplyr_data_masking]{"data masking"} expression(s). It's possible to +use \code{\link[dplyr:mutate]{dplyr::mutate}}-like syntax here to calculate new columns on which to +perform grouping, but note that, if you are regrouping an already-grouped +\code{.data} object, the calculations will be carried out ignoring such grouping +(same as \link[dplyr:group_by]{in dplyr}). +\item For \code{ungroup}: either +\itemize{ +\item empty, in order to remove the grouping and output an \code{epi_archive}; or +\item variable name(s) or other \link[dplyr:dplyr_tidy_select]{"tidy-select"} +expression(s), in order to remove the matching variables from the list of +grouping variables, and output another \code{grouped_epi_archive}. +} +}} + +\item{.add}{Boolean. 
If \code{FALSE}, the default, the output will be grouped by +the variable selection from \code{...} only; if \code{TRUE}, the output will be +grouped by the current grouping variables plus the variable selection from +\code{...}.} + +\item{.drop}{As described in \code{\link[dplyr:group_by]{dplyr::group_by}}; determines treatment of +factor columns.} + +\item{.data}{An \code{epi_archive} or \code{grouped_epi_archive}} + +\item{x}{For \code{groups} or \code{ungroup}: a \code{grouped_epi_archive}; for +\code{is_grouped_epi_archive}: any object} + +\item{.tbl}{(For \code{group_by_drop_default}:) an \code{epi_archive} or +\code{grouped_epi_archive} (\code{epi_archive} dispatches to the S3 default method; +\code{grouped_epi_archive} dispatches its own S3 method)} +} +\description{ +\code{group_by} and related methods for \code{epi_archive}, \code{grouped_epi_archive} +} +\details{ +To match \code{dplyr}, \code{group_by} allows "data masking" (also referred to as +"tidy evaluation") expressions \code{...}, not just column names, in a way similar +to \code{mutate}. Note that replacing or removing key columns with these +expressions is disabled. + +\code{archive \%>\% group_by()} and other expressions that group or regroup by zero +columns (indicating that all rows should be treated as part of one large +group) will output a \code{grouped_epi_archive}, in order to enable the use of +\code{grouped_epi_archive} methods on the result. This is in slight contrast to +the same operations on tibbles and grouped tibbles, which will \emph{not} output a +\code{grouped_df} in these circumstances. + +Using \code{group_by} with \code{.add=FALSE} to override the existing grouping is +disabled; instead, \code{ungroup} first then \code{group_by}. + +Mutation and aliasing: \code{group_by} tries to use a shallow copy of the \code{DT}, +introducing column-level aliasing between its input and its result. 
This +doesn't follow the general model for most \code{data.table} operations, which +seems to be that, given a nonaliased (i.e., unique) pointer to a +\code{data.table} object, its pointers to its columns should also be nonaliased. +If you mutate any of the columns of either the input or result, first ensure +that it is fine if columns of the other are also mutated, but do not rely on +such behavior to occur. Additionally, never perform mutation on the key +columns at all (except for strictly increasing transformations), as this will +invalidate sortedness assumptions about the rows. + +\code{group_by_drop_default} on (ungrouped) \code{epi_archive}s is expected to dispatch +to \code{group_by_drop_default.default} (but there is a dedicated method for +\code{grouped_epi_archive}s). +} +\examples{ + +grouped_archive <- archive_cases_dv_subset \%>\% group_by(geo_value) + +# `print` for metadata and method listing: +grouped_archive \%>\% print() + +# The primary use for grouping is to perform a grouped `epix_slide`: + +archive_cases_dv_subset \%>\% + group_by(geo_value) \%>\% + epix_slide( + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = as.Date("2020-06-11") + 0:2, + new_col_name = "case_rate_3d_av" + ) \%>\% + ungroup() + +# ----------------------------------------------------------------- + +# Advanced: some other features of dplyr grouping are implemented: + +library(dplyr) +toy_archive <- + tribble( + ~geo_value, ~age_group, ~time_value, ~version, ~value, + "us", "adult", "2000-01-01", "2000-01-02", 121, + "us", "pediatric", "2000-01-02", "2000-01-03", 5, # (addition) + "us", "adult", "2000-01-01", "2000-01-03", 125, # (revision) + "us", "adult", "2000-01-02", "2000-01-03", 130 # (addition) + ) \%>\% + mutate( + age_group = ordered(age_group, c("pediatric", "adult")), + time_value = as.Date(time_value), + version = as.Date(version) + ) \%>\% + as_epi_archive(other_keys = "age_group") + +# The following are equivalent: +toy_archive \%>\% 
group_by(geo_value, age_group) +toy_archive \%>\% + group_by(geo_value) \%>\% + group_by(age_group, .add = TRUE) +grouping_cols <- c("geo_value", "age_group") +toy_archive \%>\% group_by(across(all_of(grouping_cols))) + +# And these are equivalent: +toy_archive \%>\% group_by(geo_value) +toy_archive \%>\% + group_by(geo_value, age_group) \%>\% + ungroup(age_group) + +# To get the grouping variable names as a `list` of `name`s (a.k.a. symbols): +toy_archive \%>\% + group_by(geo_value) \%>\% + groups() + +toy_archive \%>\% + group_by(geo_value, age_group, .drop = FALSE) \%>\% + epix_slide(f = ~ sum(.x$value), before = 20) \%>\% + ungroup() + +} diff --git a/man/is_epi_archive2.Rd b/man/is_epi_archive2.Rd new file mode 100644 index 00000000..fd2f0a1f --- /dev/null +++ b/man/is_epi_archive2.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/archive_new.R +\name{is_epi_archive2} +\alias{is_epi_archive2} +\title{Test for \code{epi_archive} format} +\usage{ +is_epi_archive2(x, grouped_okay = FALSE) +} +\arguments{ +\item{x}{An object.} + +\item{grouped_okay}{Optional; Boolean; should a \code{grouped_epi_archive} also +count? Default is \code{FALSE}.} +} +\value{ +\code{TRUE} if the object inherits from \code{epi_archive}. +} +\description{ +Test for \code{epi_archive} format +} +\examples{ +is_epi_archive(jhu_csse_daily_subset) # FALSE (this is an epi_df, not epi_archive) +is_epi_archive(archive_cases_dv_subset) # TRUE + +# By default, grouped_epi_archives don't count as epi_archives, as they may +# support a different set of operations from regular `epi_archives`. This +# behavior can be controlled by `grouped_okay`. 
+grouped_archive <- archive_cases_dv_subset$group_by(geo_value) +is_epi_archive(grouped_archive) # FALSE +is_epi_archive(grouped_archive, grouped_okay = TRUE) # TRUE + +} +\seealso{ +\code{\link{is_grouped_epi_archive}} +} diff --git a/man/max_version_with_row_in.Rd b/man/max_version_with_row_in.Rd index cca554fa..6f0d35b3 100644 --- a/man/max_version_with_row_in.Rd +++ b/man/max_version_with_row_in.Rd @@ -1,18 +1,25 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/archive.R +% Please edit documentation in R/archive.R, R/archive_new.R \name{max_version_with_row_in} \alias{max_version_with_row_in} \title{\code{max(x$version)}, with error if \code{x} has 0 rows} \usage{ +max_version_with_row_in(x) + max_version_with_row_in(x) } \arguments{ \item{x}{\code{x} argument of \code{\link{as_epi_archive}}} } \value{ +\code{max(x$version)} if it has any rows; raises error if it has 0 rows or +an \code{NA} version value + \code{max(x$version)} if it has any rows; raises error if it has 0 rows or an \code{NA} version value } \description{ +Exported to make defaults more easily copyable. + Exported to make defaults more easily copyable. 
} diff --git a/man/merge_epi_archive2.Rd b/man/merge_epi_archive2.Rd new file mode 100644 index 00000000..dd1e671e --- /dev/null +++ b/man/merge_epi_archive2.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/archive_new.R +\name{merge_epi_archive2} +\alias{merge_epi_archive2} +\title{Merge epi archive} +\usage{ +merge_epi_archive2( + x, + y, + sync = c("forbid", "na", "locf", "truncate"), + compactify = TRUE +) +} +\arguments{ +\item{x}{as in \code{\link{epix_merge}}} + +\item{y}{as in \code{\link{epix_merge}}} + +\item{sync}{as in \code{\link{epix_merge}}} + +\item{compactify}{as in \code{\link{epix_merge}}} +} +\description{ +Merges another \code{epi_archive} with the current one, mutating the +current one by reseating its \code{DT} and several other fields, but avoiding +mutation of the old \code{DT}; returns the current archive +\link[base:invisible]{invisibly}. See \code{\link{epix_merge}} for a full description +of the non-R6-method version, which does not mutate either archive, and +does not alias either archive's \code{DT}. +} diff --git a/man/new_epi_archive2.Rd b/man/new_epi_archive2.Rd new file mode 100644 index 00000000..52141190 --- /dev/null +++ b/man/new_epi_archive2.Rd @@ -0,0 +1,69 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/archive_new.R +\name{new_epi_archive2} +\alias{new_epi_archive2} +\title{New epi archive} +\usage{ +new_epi_archive2( + x, + geo_type = NULL, + time_type = NULL, + other_keys = NULL, + additional_metadata = NULL, + compactify = NULL, + clobberable_versions_start = NA, + versions_end = NULL +) +} +\arguments{ +\item{x}{A data.frame, data.table, or tibble, with columns \code{geo_value}, +\code{time_value}, \code{version}, and then any additional number of columns.} + +\item{geo_type}{Type for the geo values. 
If missing, then the function will +attempt to infer it from the geo values present; if this fails, then it +will be set to "custom".} + +\item{time_type}{Type for the time values. If missing, then the function will +attempt to infer it from the time values present; if this fails, then it +will be set to "custom".} + +\item{other_keys}{Character vector specifying the names of variables in \code{x} +that should be considered key variables (in the language of \code{data.table}) +apart from "geo_value", "time_value", and "version".} + +\item{additional_metadata}{List of additional metadata to attach to the +\code{epi_archive} object. The metadata will have \code{geo_type} and \code{time_type} +fields; named entries from the passed list will be included as well.} + +\item{compactify}{Optional; Boolean or \code{NULL}: should we remove rows that are +considered redundant for the purposes of \code{epi_archive}'s built-in methods +such as \code{as_of}? As these methods use the last version of each observation +carried forward (LOCF) to interpolate between the version data provided, +rows that don't change these LOCF results can potentially be omitted to +save space while maintaining the same behavior (with the help of the +\code{clobberable_versions_start} and \code{versions_end} fields in some edge cases). +\code{TRUE} will remove these rows, \code{FALSE} will not, and missing or \code{NULL} will +remove these rows and issue a warning. Generally, this can be set to +\code{TRUE}, but if you directly inspect or edit the fields of the \code{epi_archive} +such as its \code{DT}, or rely on redundant updates to achieve a certain +behavior of the \code{ref_time_values} default in \code{epix_slide}, you will have to +determine whether \code{compactify=TRUE} will produce the desired results. 
If +compactification here is removing a large proportion of the rows, this may +indicate a potential for space, time, or bandwidth savings upstream the +data pipeline, e.g., by avoiding fetching, storing, or processing these +rows of \code{x}.} + +\item{clobberable_versions_start}{Optional; as in \code{\link{as_epi_archive}}} + +\item{versions_end}{Optional; as in \code{\link{as_epi_archive}}} +} +\value{ +An \code{epi_archive} object. +} +\description{ +Creates a new \code{epi_archive} object. +} +\details{ +Refer to the documentation for \code{\link[=as_epi_archive]{as_epi_archive()}} for more information +and examples of parameter names. +} diff --git a/man/next_after.Rd b/man/next_after.Rd index 5170e8d9..82fd3ebb 100644 --- a/man/next_after.Rd +++ b/man/next_after.Rd @@ -1,17 +1,23 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/archive.R +% Please edit documentation in R/archive.R, R/archive_new.R \name{next_after} \alias{next_after} \title{Get the next possible value greater than \code{x} of the same type} \usage{ +next_after(x) + next_after(x) } \arguments{ \item{x}{the starting "value"(s)} } \value{ +same class, typeof, and length as \code{x} + same class, typeof, and length as \code{x} } \description{ +Get the next possible value greater than \code{x} of the same type + Get the next possible value greater than \code{x} of the same type } diff --git a/man/print.epi_archive2.Rd b/man/print.epi_archive2.Rd new file mode 100644 index 00000000..0105c47e --- /dev/null +++ b/man/print.epi_archive2.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/archive_new.R +\name{print.epi_archive2} +\alias{print.epi_archive2} +\title{Print information about an \code{epi_archive} object} +\usage{ +\method{print}{epi_archive2}(epi_archive, class = TRUE, methods = TRUE) +} +\arguments{ +\item{class}{Boolean; whether to print the class label header} + +\item{methods}{Boolean; whether to print 
all available methods of +the archive} +} +\description{ +Print information about an \code{epi_archive} object +} diff --git a/man/slide.epi_archive2.Rd b/man/slide.epi_archive2.Rd new file mode 100644 index 00000000..54db5636 --- /dev/null +++ b/man/slide.epi_archive2.Rd @@ -0,0 +1,101 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/archive_new.R +\name{slide.epi_archive2} +\alias{slide.epi_archive2} +\title{Slide over epi archive} +\usage{ +\method{slide}{epi_archive2}( + epi_archive, + f, + ..., + before, + ref_time_values, + time_step, + new_col_name = "slide_value", + as_list_col = FALSE, + names_sep = "_", + all_versions = FALSE +) +} +\arguments{ +\item{f}{Function, formula, or missing; together with \code{...} specifies the +computation to slide. To "slide" means to apply a computation over a +sliding (a.k.a. "rolling") time window for each data group. The window is +determined by the \code{before} parameter described below. One time step is +typically one day or one week; see \code{\link{epi_slide}} details for more +explanation. If a function, \code{f} must take an \code{epi_df} with the same +column names as the archive's \code{DT}, minus the \code{version} column; followed +by a one-row tibble containing the values of the grouping variables for +the associated group; followed by a reference time value, usually as a +\code{Date} object; followed by any number of named arguments. If a formula, +\code{f} can operate directly on columns accessed via \code{.x$var} or \code{.$var}, as +in \code{~ mean (.x$var)} to compute a mean of a column \code{var} for each +group-\code{ref_time_value} combination. The group key can be accessed via +\code{.y} or \code{.group_key}, and the reference time value can be accessed via +\code{.z} or \code{.ref_time_value}. If \code{f} is missing, then \code{...} will specify the +computation.} + +\item{...}{Additional arguments to pass to the function or formula specified +via \code{f}. 
Alternatively, if \code{f} is missing, then \code{...} is interpreted as an +expression for tidy evaluation; in addition to referring to columns +directly by name, the expression has access to \code{.data} and \code{.env} pronouns +as in \code{dplyr} verbs, and can also refer to the \code{.group_key} and +\code{.ref_time_value}. See details of \code{\link{epi_slide}}.} + +\item{before}{How far \code{before} each \code{ref_time_value} should the sliding +window extend? If provided, should be a single, non-NA, +\link[vctrs:vec_cast]{integer-compatible} number of time steps. This window +endpoint is inclusive. For example, if \code{before = 7}, and one time step is +one day, then to produce a value for a \code{ref_time_value} of January 8, we +apply the given function or formula to data (for each group present) with +\code{time_value}s from January 1 onward, as they were reported on January 8. +For typical disease surveillance sources, this will not include any data +with a \code{time_value} of January 8, and, depending on the amount of reporting +latency, may not include January 7 or even earlier \code{time_value}s. (If +instead the archive were to hold nowcasts instead of regular surveillance +data, then we would indeed expect data for \code{time_value} January 8. If it +were to hold forecasts, then we would expect data for \code{time_value}s after +January 8, and the sliding window would extend as far after each +\code{ref_time_value} as needed to include all such \code{time_value}s.)} + +\item{ref_time_values}{Reference time values / versions for sliding +computations; each element of this vector serves both as the anchor point +for the \code{time_value} window for the computation and the \code{max_version} +\code{as_of} which we fetch data in this window. 
If missing, then this will be set +to a regularly-spaced sequence of values set to cover the range of +\code{version}s in the \code{DT} plus the \code{versions_end}; the spacing of values will +be guessed (using the GCD of the skips between values).} + +\item{time_step}{Optional function used to define the meaning of one time +step, which if specified, overrides the default choice based on the +\code{time_value} column. This function must take a positive integer and return +an object of class \code{lubridate::period}. For example, we can use \code{time_step = lubridate::hours} in order to set the time step to be one hour (this +would only be meaningful if \code{time_value} is of class \code{POSIXct}).} + +\item{new_col_name}{String indicating the name of the new column that will +contain the derivative values. Default is "slide_value"; note that setting +\code{new_col_name} equal to an existing column name will overwrite this column.} + +\item{as_list_col}{Should the slide results be held in a list column, or be +\link[tidyr:chop]{unchopped}/\link[tidyr:unnest]{unnested}? Default is \code{FALSE}, +in which case a list object returned by \code{f} would be unnested (using +\code{\link[tidyr:unnest]{tidyr::unnest()}}), and, if the slide computations output data frames, +the names of the resulting columns are given by prepending \code{new_col_name} +to the names of the list elements.} + +\item{names_sep}{String specifying the separator to use in \code{tidyr::unnest()} +when \code{as_list_col = FALSE}. Default is "_". Using \code{NULL} drops the prefix +from \code{new_col_name} entirely.} + +\item{all_versions}{(Not the same as \code{all_rows} parameter of \code{epi_slide}.) If +\code{all_versions = TRUE}, then \code{f} will be passed the version history (all +\code{version <= ref_time_value}) for rows having \code{time_value} between +\code{ref_time_value - before} and \code{ref_time_value}. 
Otherwise, \code{f} will be +passed only the most recent \code{version} for every unique \code{time_value}. +Default is \code{FALSE}.} +} +\description{ +Slides a given function over variables in an \code{epi_archive} +object. See the documentation for the wrapper function \code{\link[=epix_slide]{epix_slide()}} for +details. The parameter descriptions below are copied from there +} diff --git a/man/slide.grouped_epi_archive2.Rd b/man/slide.grouped_epi_archive2.Rd new file mode 100644 index 00000000..b5aac24c --- /dev/null +++ b/man/slide.grouped_epi_archive2.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/grouped_archive_new.R +\name{slide.grouped_epi_archive2} +\alias{slide.grouped_epi_archive2} +\title{Slide over grouped epi archive} +\usage{ +\method{slide}{grouped_epi_archive2}( + grouped_epi_archive, + f, + ..., + before, + ref_time_values, + time_step, + new_col_name = "slide_value", + as_list_col = FALSE, + names_sep = "_", + all_versions = FALSE +) +} +\description{ +Slides a given function over variables in a \code{grouped_epi_archive} +object. See the documentation for the wrapper function \code{\link[=epix_slide]{epix_slide()}} for +details. 
+} diff --git a/man/truncate_versions_after.epi_archive2.Rd b/man/truncate_versions_after.epi_archive2.Rd new file mode 100644 index 00000000..08ae40d4 --- /dev/null +++ b/man/truncate_versions_after.epi_archive2.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/archive_new.R +\name{truncate_versions_after.epi_archive2} +\alias{truncate_versions_after.epi_archive2} +\title{Truncate versions after} +\usage{ +\method{truncate_versions_after}{epi_archive2}(epi_archive, max_version) +} +\arguments{ +\item{epi_archive}{as in \code{\link{epix_truncate_versions_after}}} + +\item{max_version}{as in \code{\link{epix_truncate_versions_after}}} +} +\description{ +Filter to keep only older versions, mutating the archive by +potentially reseating but not mutating some fields. \code{DT} is likely, but not +guaranteed, to be copied. Returns the mutated archive +\link[base:invisible]{invisibly}. +} diff --git a/man/truncate_versions_after.grouped_epi_archive2.Rd b/man/truncate_versions_after.grouped_epi_archive2.Rd new file mode 100644 index 00000000..7c25950f --- /dev/null +++ b/man/truncate_versions_after.grouped_epi_archive2.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/grouped_archive_new.R +\name{truncate_versions_after.grouped_epi_archive2} +\alias{truncate_versions_after.grouped_epi_archive2} +\title{Truncate versions after a given version, grouped} +\usage{ +\method{truncate_versions_after}{grouped_epi_archive2}(grouped_epi_archive, max_version) +} +\arguments{ +\item{max_version}{as in \code{\link{epix_truncate_versions_after}}} + +\item{grouped_epi_archive}{as in \code{\link{epix_truncate_versions_after}}} +} +\description{ +Filter to keep only older versions by mutating the underlying +\code{epi_archive} using \verb{$truncate_versions_after}. Returns the mutated +\code{grouped_epi_archive} \link[base:invisible]{invisibly}. 
+} diff --git a/tests/testthat/test-archive_new.R b/tests/testthat/test-archive_new.R new file mode 100644 index 00000000..f2d0bde5 --- /dev/null +++ b/tests/testthat/test-archive_new.R @@ -0,0 +1,173 @@ +library(dplyr) + +test_that("first input must be a data.frame", { + expect_error(as_epi_archive2(c(1, 2, 3), compactify = FALSE), + regexp = "Must be of type 'data.frame'." + ) +}) + +dt <- archive_cases_dv_subset$DT + +test_that("data.frame must contain geo_value, time_value and version columns", { + expect_error(as_epi_archive2(select(dt, -geo_value), compactify = FALSE), + regexp = "Columns `geo_value`, `time_value`, and `version` must be present in `x`." + ) + expect_error(as_epi_archive2(select(dt, -time_value), compactify = FALSE), + regexp = "Columns `geo_value`, `time_value`, and `version` must be present in `x`." + ) + expect_error(as_epi_archive2(select(dt, -version), compactify = FALSE), + regexp = "Columns `geo_value`, `time_value`, and `version` must be present in `x`." + ) +}) + +test_that("other_keys can only contain names of the data.frame columns", { + expect_error(as_epi_archive2(dt, other_keys = "xyz", compactify = FALSE), + regexp = "`other_keys` must be contained in the column names of `x`." + ) + expect_error(as_epi_archive2(dt, other_keys = "percent_cli", compactify = FALSE), NA) +}) + +test_that("other_keys cannot contain names geo_value, time_value or version", { + expect_error(as_epi_archive2(dt, other_keys = "geo_value", compactify = FALSE), + regexp = "`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\"." + ) + expect_error(as_epi_archive2(dt, other_keys = "time_value", compactify = FALSE), + regexp = "`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\"." + ) + expect_error(as_epi_archive2(dt, other_keys = "version", compactify = FALSE), + regexp = "`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\"." 
+ ) +}) + +test_that("Warning thrown when other_metadata contains overlapping names with geo_type or time_type fields", { + expect_warning(as_epi_archive2(dt, additional_metadata = list(geo_type = 1), compactify = FALSE), + regexp = "`additional_metadata` names overlap with existing metadata fields \"geo_type\", \"time_type\"." + ) + expect_warning(as_epi_archive2(dt, additional_metadata = list(time_type = 1), compactify = FALSE), + regexp = "`additional_metadata` names overlap with existing metadata fields \"geo_type\", \"time_type\"." + ) +}) + +test_that("epi_archives are correctly instantiated with a variety of data types", { + # Data frame + df <- data.frame( + geo_value = "ca", + time_value = as.Date("2020-01-01"), + version = as.Date("2020-01-01") + 0:19, + value = 1:20 + ) + + ea1 <- as_epi_archive2(df, compactify = FALSE) + expect_equal(key(ea1$DT), c("geo_value", "time_value", "version")) + expect_equal(ea1$additional_metadata, list()) + + ea2 <- as_epi_archive2(df, other_keys = "value", additional_metadata = list(value = df$value), compactify = FALSE) + expect_equal(key(ea2$DT), c("geo_value", "time_value", "value", "version")) + expect_equal(ea2$additional_metadata, list(value = df$value)) + + # Tibble + tib <- tibble::tibble(df, code = "x") + + ea3 <- as_epi_archive2(tib, compactify = FALSE) + expect_equal(key(ea3$DT), c("geo_value", "time_value", "version")) + expect_equal(ea3$additional_metadata, list()) + + ea4 <- as_epi_archive2(tib, other_keys = "code", additional_metadata = list(value = df$value), compactify = FALSE) + expect_equal(key(ea4$DT), c("geo_value", "time_value", "code", "version")) + expect_equal(ea4$additional_metadata, list(value = df$value)) + + # Keyed data.table + kdt <- data.table::data.table( + geo_value = "ca", + time_value = as.Date("2020-01-01"), + version = as.Date("2020-01-01") + 0:19, + value = 1:20, + code = "CA", + key = "code" + ) + + ea5 <- as_epi_archive2(kdt, compactify = FALSE) + # Key from data.table isn't absorbed 
when as_epi_archive2 is used + expect_equal(key(ea5$DT), c("geo_value", "time_value", "version")) + expect_equal(ea5$additional_metadata, list()) + + ea6 <- as_epi_archive2(kdt, other_keys = "value", additional_metadata = list(value = df$value), compactify = FALSE) + # Mismatched keys, but the one from as_epi_archive2 overrides + expect_equal(key(ea6$DT), c("geo_value", "time_value", "value", "version")) + expect_equal(ea6$additional_metadata, list(value = df$value)) + + # Unkeyed data.table + udt <- data.table::data.table( + geo_value = "ca", + time_value = as.Date("2020-01-01"), + version = as.Date("2020-01-01") + 0:19, + value = 1:20, + code = "CA" + ) + + ea7 <- as_epi_archive2(udt, compactify = FALSE) + expect_equal(key(ea7$DT), c("geo_value", "time_value", "version")) + expect_equal(ea7$additional_metadata, list()) + + ea8 <- as_epi_archive2(udt, other_keys = "code", additional_metadata = list(value = df$value), compactify = FALSE) + expect_equal(key(ea8$DT), c("geo_value", "time_value", "code", "version")) + expect_equal(ea8$additional_metadata, list(value = df$value)) + + # epi_df + edf1 <- jhu_csse_daily_subset %>% + select(geo_value, time_value, cases) %>% + mutate(version = max(time_value), code = "USA") + + ea9 <- as_epi_archive2(edf1, compactify = FALSE) + expect_equal(key(ea9$DT), c("geo_value", "time_value", "version")) + expect_equal(ea9$additional_metadata, list()) + + ea10 <- as_epi_archive2(edf1, other_keys = "code", additional_metadata = list(value = df$value), compactify = FALSE) + expect_equal(key(ea10$DT), c("geo_value", "time_value", "code", "version")) + expect_equal(ea10$additional_metadata, list(value = df$value)) + + # Keyed epi_df + edf2 <- data.frame( + geo_value = "al", + time_value = rep(as.Date("2020-01-01") + 0:9, 2), + version = c( + rep(as.Date("2020-01-25"), 10), + rep(as.Date("2020-01-26"), 10) + ), + cases = 1:20, + misc = "USA" + ) %>% + as_epi_df(additional_metadata = list(other_keys = "misc")) + + ea11 <- 
as_epi_archive2(edf2, compactify = FALSE) + expect_equal(key(ea11$DT), c("geo_value", "time_value", "version")) + expect_equal(ea11$additional_metadata, list()) + + ea12 <- as_epi_archive2(edf2, other_keys = "misc", additional_metadata = list(value = df$misc), compactify = FALSE) + expect_equal(key(ea12$DT), c("geo_value", "time_value", "misc", "version")) + expect_equal(ea12$additional_metadata, list(value = df$misc)) +}) + +test_that("`epi_archive` rejects nonunique keys", { + toy_update_tbl <- + tibble::tribble( + ~geo_value, ~age_group, ~time_value, ~version, ~value, + "us", "adult", "2000-01-01", "2000-01-02", 121, + "us", "adult", "2000-01-01", "2000-01-03", 125, # (revision) + "us", "adult", "2000-01-02", "2000-01-03", 130, + "us", "pediatric", "2000-01-01", "2000-01-02", 5 + ) %>% + mutate( + age_group = ordered(age_group, c("pediatric", "adult")), + time_value = as.Date(time_value), + version = as.Date(version) + ) + expect_error( + as_epi_archive2(toy_update_tbl), + class = "epiprocess__epi_archive_requires_unique_key" + ) + expect_error( + regexp = NA, + as_epi_archive2(toy_update_tbl, other_keys = "age_group"), + ) +}) diff --git a/tests/testthat/test-compactify_new.R b/tests/testthat/test-compactify_new.R new file mode 100644 index 00000000..f2887eaf --- /dev/null +++ b/tests/testthat/test-compactify_new.R @@ -0,0 +1,110 @@ +library(epiprocess) +library(data.table) +library(dplyr) + +dt <- archive_cases_dv_subset$DT +dt <- filter(dt, geo_value == "ca") %>% + filter(version <= "2020-06-15") %>% + select(-case_rate_7d_av) + +test_that("Input for compactify must be NULL or a boolean", { + expect_error(as_epi_archive2(dt, compactify = "no")) +}) + +dt$percent_cli <- c(1:80) +dt$case_rate <- c(1:80) + +row_replace <- function(dt, row, x, y) { + # (This way of "replacing" elements appears to use copy-on-write even though + # we are working with a data.table.) 
+ dt[row, 4] <- x + dt[row, 5] <- y + dt +} + +# Note that compactify is working on version-wise LOCF (last version of each +# observation carried forward) + +# Rows 1 should not be eliminated even if NA +dt <- row_replace(dt, 1, NA, NA) # Not LOCF + +# NOTE! We are assuming that there are no NA's in geo_value, time_value, +# and version. Even though compactify may erroneously remove the first row +# if it has all NA's, we are not testing this behaviour for now as this dataset +# has problems beyond the scope of this test + +# Rows 11 and 12 correspond to different time_values +dt <- row_replace(dt, 12, 11, 11) # Not LOCF + +# Rows 20 and 21 only differ in version +dt <- row_replace(dt, 21, 20, 20) # LOCF + +# Rows 21 and 22 only differ in version +dt <- row_replace(dt, 22, 20, 20) # LOCF + +# Row 39 comprises the first NA's +dt <- row_replace(dt, 39, NA, NA) # Not LOCF + +# Row 40 has two NA's, just like its lag, row 39 +dt <- row_replace(dt, 40, NA, NA) # LOCF + +# Row 62's values already exist in row 15, but row 15 is not a preceding row +dt <- row_replace(dt, 62, 15, 15) # Not LOCF + +# Row 73 only has one value carried over +dt <- row_replace(dt, 74, 73, 74) # Not LOCF + +dt_true <- as_tibble(as_epi_archive2(dt, compactify = TRUE)$DT) +dt_false <- as_tibble(as_epi_archive2(dt, compactify = FALSE)$DT) +dt_null <- suppressWarnings(as_tibble(as_epi_archive2(dt, compactify = NULL)$DT)) + +test_that("Warning for LOCF with compactify as NULL", { + expect_warning(as_epi_archive2(dt, compactify = NULL)) +}) + +test_that("No warning when there is no LOCF", { + expect_warning(as_epi_archive2(dt[1:5], compactify = NULL), NA) +}) + +test_that("LOCF values are ignored with compactify=FALSE", { + expect_identical(nrow(dt), nrow(dt_false)) +}) + +test_that("LOCF values are taken out with compactify=TRUE", { + dt_test <- as_tibble(as_epi_archive2(dt[-c(21, 22, 40), ], compactify = FALSE)$DT) + + expect_identical(dt_true, dt_null) + expect_identical(dt_null, dt_test) +}) + 
+test_that("as_of produces the same results with compactify=TRUE as with compactify=FALSE", { + ea_true <- as_epi_archive2(dt, compactify = TRUE) + ea_false <- as_epi_archive2(dt, compactify = FALSE) + + # Row 22, an LOCF row corresponding to the latest version, is omitted in + # ea_true + latest_version <- max(ea_false$DT$version) + as_of_true <- as_of(ea_true, latest_version) + as_of_false <- as_of(ea_false, latest_version) + + expect_identical(as_of_true, as_of_false) +}) + +test_that("compactify does not alter the default clobberable and observed version bounds", { + x <- tibble::tibble( + geo_value = "geo1", + time_value = as.Date("2000-01-01"), + version = as.Date("2000-01-01") + 1:5, + value = 42L + ) + ea_true <- as_epi_archive2(x, compactify = TRUE) + ea_false <- as_epi_archive2(x, compactify = FALSE) + # We say that we base the bounds on the user's `x` arg. We might mess up or + # change our minds and base things on the `DT` field (or a temporary `DT` + # variable, post-compactify) instead. 
Check that this test would trigger + # in that case: + expect_true(max(ea_true$DT$version) != max(ea_false$DT$version)) + # The actual test: + expect_identical(ea_true$clobberable_versions_start, ea_false$clobberable_versions_start) + expect_identical(ea_true$versions_end, ea_false$versions_end) +}) diff --git a/tests/testthat/test-epix_fill_through_version_new.R b/tests/testthat/test-epix_fill_through_version_new.R new file mode 100644 index 00000000..2b76a851 --- /dev/null +++ b/tests/testthat/test-epix_fill_through_version_new.R @@ -0,0 +1,109 @@ +test_that("epix_fill_through_version2 mirrors input when it is sufficiently up to date", { + ea_orig <- as_epi_archive2(data.table::data.table( + geo_value = "g1", time_value = as.Date("2020-01-01"), + version = 1:5, value = 1:5 + )) + some_earlier_observed_version <- 2L + ea_trivial_fill_na1 <- epix_fill_through_version2(ea_orig, some_earlier_observed_version, "na") + ea_trivial_fill_na2 <- epix_fill_through_version2(ea_orig, ea_orig$versions_end, "na") + ea_trivial_fill_locf <- epix_fill_through_version2(ea_orig, some_earlier_observed_version, "locf") + # Below, we want R6 objects to be compared based on contents rather than + # addresses. We appear to get this with `expect_identical` in `testthat` + # edition 3, which is based on `waldo::compare` rather than `base::identical`; + # `waldo::compare` in waldo >=0.3.1 appears (as of 0.4.0) to compare R6 + # objects by contents rather than address (in a way that is tested but maybe + # not guaranteed via user docs). Use `testthat::local_edition` to ensure we + # use testthat edition 3 here (use `testthat::` to prevent ambiguity with + # `readr`). 
+ testthat::local_edition(3) + expect_identical(ea_orig, ea_trivial_fill_na1) + expect_identical(ea_orig, ea_trivial_fill_na2) + expect_identical(ea_orig, ea_trivial_fill_locf) +}) + +test_that("epix_fill_through_version2 can extend observed versions, gives expected `as_of`s", { + ea_orig <- as_epi_archive2(data.table::data.table( + geo_value = "g1", + time_value = as.Date("2020-01-01") + c(rep(0L, 5L), 1L), + version = c(1:5, 2L), + value = 1:6 + )) + first_unobserved_version <- 6L + later_unobserved_version <- 10L + ea_fill_na <- epix_fill_through_version2(ea_orig, later_unobserved_version, "na") + ea_fill_locf <- epix_fill_through_version2(ea_orig, later_unobserved_version, "locf") + + # We use testthat edition 3 features here, passing `ignore_attr` to + # `waldo::compare`. Ensure we are using edition 3: + testthat::local_edition(3) + withCallingHandlers( + { + expect_identical(ea_fill_na$versions_end, later_unobserved_version) + expect_identical(tibble::as_tibble(as_of(ea_fill_na, first_unobserved_version)), + tibble::tibble(geo_value = "g1", time_value = as.Date("2020-01-01") + 0:1, value = rep(NA_integer_, 2L)), + ignore_attr = TRUE + ) + expect_identical(ea_fill_locf$versions_end, later_unobserved_version) + expect_identical( + as_of(ea_fill_locf, first_unobserved_version), + as_of(ea_fill_locf, ea_orig$versions_end) %>% + { + attr(., "metadata")$as_of <- first_unobserved_version + . + } + ) + }, + epiprocess__snapshot_as_of_clobberable_version = function(wrn) invokeRestart("muffleWarning") + ) +}) + +test_that("epix_fill_through_version2 does not mutate x", { + for (ea_orig in list( + # vanilla case + as_epi_archive2(data.table::data.table( + geo_value = "g1", time_value = as.Date("2020-01-01"), + version = 1:5, value = 1:5 + )), + # data.table unique yielding original DT by reference special case (maybe + # having only 1 row is the trigger? 
having no revisions of initial values + # doesn't seem sufficient to trigger) + as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, value = 10L)) + )) { + # We want to perform a strict comparison of the contents of `ea_orig` before + # and `ea_orig` after. `clone` + `expect_identical` based on waldo would + # sort of work, but we might want something stricter. `as.list` + + # `identical` plus a check of the DT seems to do the trick. + ea_orig_before_as_list <- as.list(ea_orig) + ea_orig_DT_before_copy <- data.table::copy(ea_orig$DT) + some_unobserved_version <- 8L + # + ea_fill_na <- epix_fill_through_version2(ea_orig, some_unobserved_version, "na") + ea_orig_after_as_list <- as.list(ea_orig) + # use identical, not expect_identical, for the R6-as-list test; latter isn't as strict + expect_true(identical(ea_orig_before_as_list, ea_orig_after_as_list)) + expect_identical(ea_orig_DT_before_copy, ea_orig$DT) + # + ea_fill_locf <- epix_fill_through_version2(ea_orig, some_unobserved_version, "locf") + ea_orig_after_as_list <- as.list(ea_orig) + expect_true(identical(ea_orig_before_as_list, ea_orig_after_as_list)) + expect_identical(ea_orig_DT_before_copy, ea_orig$DT) + } +}) + +test_that("epix_fill_through_version return with expected visibility", { + ea <- as_epi_archive(data.table::data.table( + geo_value = "g1", time_value = as.Date("2020-01-01"), + version = 1:5, value = 1:5 + )) + expect_true(withVisible(epix_fill_through_version(ea, 10L, "na"))[["visible"]]) +}) + +test_that("epix_fill_through_version2 returns same key & doesn't mutate old DT or its key", { + ea <- as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, value = 10L)) + old_DT <- ea$DT + old_DT_copy <- data.table::copy(old_DT) + old_key <- data.table::key(ea$DT) + expect_identical(data.table::key(epix_fill_through_version2(ea, 5L, "na")$DT), old_key) + expect_identical(data.table::key(epix_fill_through_version2(ea, 5L, "locf")$DT), old_key) + 
expect_identical(data.table::key(ea$DT), old_key) +}) diff --git a/tests/testthat/test-epix_merge_new.R b/tests/testthat/test-epix_merge_new.R new file mode 100644 index 00000000..594b7b5e --- /dev/null +++ b/tests/testthat/test-epix_merge_new.R @@ -0,0 +1,228 @@ +test_that("epix_merge requires forbids on invalid `y`", { + ea <- archive_cases_dv_subset$DT %>% + as_epi_archive2() %>% + clone() %>% + suppressWarnings() + expect_error(epix_merge2(ea, data.frame(x = 1))) +}) + +test_that("epix_merge merges and carries forward updates properly", { + x <- as_epi_archive2( + data.table::as.data.table( + tibble::tribble( + ~geo_value, ~time_value, ~version, ~x_value, + # same version set for x and y + "g1", 1L, 1:3, paste0("XA", 1:3), + # versions of x surround those of y + this measurement has + # max update version beyond some others + "g1", 2L, 1:5, paste0("XB", 1:5), + # mirror case + "g1", 3L, 2L, paste0("XC", 2L), + # x has 1 version, y has 0 + "g1", 4L, 1L, paste0("XD", 1L), + # non-NA values that should be carried forward + # (version-wise LOCF) in other versions, plus NAs that + # should (similarly) be carried forward as NA (latter + # wouldn't work with an ordinary merge + post-processing + # with `data.table::nafill`) + "g1", 6L, c(1L, 3L, 5L), paste0("XE", c(1L, NA, 5L)) + ) %>% + tidyr::unchop(c(version, x_value)) %>% + dplyr::mutate(dplyr::across(c(x_value), ~ dplyr::if_else(grepl("NA", .x), NA_character_, .x))) + ) + ) + y <- as_epi_archive2( + data.table::as.data.table( + tibble::tribble( + ~geo_value, ~time_value, ~version, ~y_value, + "g1", 1L, 1:3, paste0("YA", 1:3), + "g1", 2L, 2L, paste0("YB", 2L), + "g1", 3L, 1:5, paste0("YC", 1:5), + "g1", 5L, 1L, paste0("YD", 1L), + "g1", 6L, 1:5, paste0("YE", 1:5), + ) %>% + tidyr::unchop(c(version, y_value)) %>% + dplyr::mutate(dplyr::across(c(y_value), ~ dplyr::if_else(grepl("NA", .x), NA_character_, .x))) + ) + ) + xy <- epix_merge2(x, y) + xy_expected <- as_epi_archive2( + data.table::as.data.table( + 
tibble::tribble( + ~geo_value, ~time_value, ~version, ~x_value, ~y_value, + "g1", 1L, 1:3, paste0("XA", 1:3), paste0("YA", 1:3), + "g1", 2L, 1:5, paste0("XB", 1:5), paste0("YB", c(NA, 2L, 2L, 2L, 2L)), + "g1", 3L, 1:5, paste0("XC", c(NA, 2L, 2L, 2L, 2L)), paste0("YC", 1:5), + "g1", 4L, 1L, paste0("XD", 1L), paste0("YD", NA), + "g1", 5L, 1L, paste0("XD", NA), paste0("YD", 1L), + "g1", 6L, 1:5, paste0("XE", c(1L, 1L, NA, NA, 5L)), paste0("YE", 1:5), + ) %>% + tidyr::unchop(c(version, x_value, y_value)) %>% + dplyr::mutate(dplyr::across(c(x_value, y_value), ~ dplyr::if_else(grepl("NA", .x), NA_character_, .x))) + ) + ) + # We rely on testthat edition 3 expect_identical using waldo, not identical. See + # test-epix_fill_through_version.R comments for details. + testthat::local_edition(3) + expect_identical(xy, xy_expected) +}) + +test_that("epix_merge forbids and warns on metadata and naming issues", { + expect_error( + epix_merge2( + as_epi_archive2(tibble::tibble(geo_value = "tx", time_value = 1L, version = 1L, x_value = 1L)), + as_epi_archive2(tibble::tibble(geo_value = "us", time_value = 1L, version = 5L, y_value = 2L)) + ), + regexp = "must have the same.*geo_type" + ) + expect_error( + epix_merge2( + as_epi_archive2(tibble::tibble(geo_value = "pa", time_value = 1L, version = 1L, x_value = 1L)), + as_epi_archive2(tibble::tibble(geo_value = "pa", time_value = as.Date("2020-01-01"), version = 5L, y_value = 2L)) + ), + regexp = "must have the same.*time_type" + ) + expect_error( + epix_merge2( + as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, value = 1L)), + as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, value = 2L)) + ), + regexp = "overlapping.*names" + ) + expect_warning( + epix_merge2( + as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, x_value = 1L), + additional_metadata = list("updates_fetched" = lubridate::ymd_hms("2022-05-01 16:00:00", tz = "UTC")) + ), + 
as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, y_value = 2L)) + ), + regexp = "x\\$additional_metadata", + class = "epiprocess__epix_merge_ignores_additional_metadata" + ) + expect_warning( + epix_merge2( + as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, x_value = 1L)), + as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, y_value = 2L), + additional_metadata = list("updates_fetched" = lubridate::ymd_hms("2022-05-01 16:00:00", tz = "UTC")) + ) + ), + regexp = "y\\$additional_metadata", + class = "epiprocess__epix_merge_ignores_additional_metadata" + ) +}) + +# use `local` to prevent accidentally using the x, y, xy bindings here +# elsewhere, while allowing reuse across a couple tests +local({ + x <- as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, x_value = 1L), + clobberable_versions_start = 1L, versions_end = 10L + ) + y <- as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, y_value = 2L), + clobberable_versions_start = 3L, versions_end = 10L + ) + xy <- epix_merge2(x, y) + test_that("epix_merge considers partially-clobberable row to be clobberable", { + expect_identical(xy$clobberable_versions_start, 1L) + }) + test_that("epix_merge result uses versions_end metadata not max version val", { + expect_identical(xy$versions_end, 10L) + }) +}) + +local({ + x <- as_epi_archive2( + tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, x_value = 10L), + clobberable_versions_start = 1L, + versions_end = 3L + ) + y <- as_epi_archive2( + tibble::tibble(geo_value = 1L, time_value = 1L, version = 5L, y_value = 20L), + clobberable_versions_start = 1L + ) + test_that('epix_merge forbids on sync default or "forbid"', { + expect_error(epix_merge2(x, y), + class = "epiprocess__epix_merge_unresolved_sync" + ) + expect_error(epix_merge2(x, y, sync = "forbid"), + class = "epiprocess__epix_merge_unresolved_sync" + ) + }) + 
test_that('epix_merge sync="na" works', { + expect_equal( + epix_merge2(x, y, sync = "na"), + as_epi_archive2(tibble::tribble( + ~geo_value, ~time_value, ~version, ~x_value, ~y_value, + 1L, 1L, 1L, 10L, NA_integer_, # x updated, y not observed yet + 1L, 1L, 4L, NA_integer_, NA_integer_, # NA-ing out x, y not observed yet + 1L, 1L, 5L, NA_integer_, 20L, # x still NA, y updated + # (we should not have a y vals -> NA update here; version 5 should be + # the `versions_end` of the result) + ), clobberable_versions_start = 1L) + ) + }) + test_that('epix_merge sync="locf" works', { + expect_equal( + epix_merge2(x, y, sync = "locf"), + as_epi_archive2(tibble::tribble( + ~geo_value, ~time_value, ~version, ~x_value, ~y_value, + 1L, 1L, 1L, 10L, NA_integer_, # x updated, y not observed yet + 1L, 1L, 5L, 10L, 20L, # x LOCF'd, y updated + ), clobberable_versions_start = 1L) + ) + }) + test_that('epix_merge sync="truncate" works', { + expect_equal( + epix_merge2(x, y, sync = "truncate"), + as_epi_archive2(tibble::tribble( + ~geo_value, ~time_value, ~version, ~x_value, ~y_value, + 1L, 1L, 1L, 10L, NA_integer_, # x updated, y not observed yet + # y's update beyond x's last update has been truncated + ), clobberable_versions_start = 1L, versions_end = 3L) + ) + }) + x_no_conflict <- as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, x_value = 10L)) + y_no_conflict <- as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, y_value = 20L)) + xy_no_conflict_expected <- as_epi_archive2(tibble::tribble( + ~geo_value, ~time_value, ~version, ~x_value, ~y_value, + 1L, 1L, 1L, 10L, 20L, # x and y both updated at version 1 + )) + test_that('epix_merge sync="forbid" on no-conflict works', { + expect_equal( + epix_merge2(x_no_conflict, y_no_conflict, sync = "forbid"), + xy_no_conflict_expected + ) + }) + test_that('epix_merge sync="na" on no-conflict works', { + # This test is the main reason for these no-conflict tests.
We want to make + # sure that we don't add an unnecessary NA-ing-out version beyond a common + # versions_end. + expect_equal( + epix_merge2(x_no_conflict, y_no_conflict, sync = "na"), + xy_no_conflict_expected + ) + }) + test_that('epix_merge sync="locf" on no-conflict works', { + expect_equal( + epix_merge2(x_no_conflict, y_no_conflict, sync = "locf"), + xy_no_conflict_expected + ) + }) + test_that('epix_merge sync="truncate" on no-conflict works', { + expect_equal( + epix_merge2(x_no_conflict, y_no_conflict, sync = "truncate"), + xy_no_conflict_expected + ) + }) +}) + + +test_that('epix_merge sync="na" balks if do not know next_after', { + expect_error( + epix_merge2( + as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = as.POSIXct(as.Date("2020-01-01")), x_value = 10L)), + as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = as.POSIXct(as.Date("2020-01-02")), y_value = 20L)), + sync = "na" + ), + regexp = "no applicable method.*next_after" + ) +}) diff --git a/tests/testthat/test-epix_slide_new.R b/tests/testthat/test-epix_slide_new.R new file mode 100644 index 00000000..f748231a --- /dev/null +++ b/tests/testthat/test-epix_slide_new.R @@ -0,0 +1,810 @@ +library(dplyr) + +test_that("epix_slide2 only works on an epi_archive", { + expect_error(epix_slide2(data.frame(x = 1))) +}) + +x <- tibble::tribble( + ~version, ~time_value, ~binary, + 4, c(1:3), 2^(1:3), + 5, c(1:2, 4), 2^(4:6), + 6, c(1:2, 4:5), 2^(7:10), + 7, 2:6, 2^(11:15) +) %>% + tidyr::unnest(c(time_value, binary)) + +xx <- bind_cols(geo_value = rep("x", 15), x) %>% + as_epi_archive2() + +test_that("epix_slide2 works as intended", { + xx1 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + f = ~ sum(.x$binary), + before = 2, + new_col_name = "sum_binary" + ) + + xx2 <- tibble( + geo_value = rep("x", 4), + time_value = c(4, 5, 6, 7), + sum_binary = c( + 2^3 + 2^2, + 2^6 + 2^3, + 2^10 + 2^9, + 2^15 + 2^14 + ) + ) %>% + group_by(geo_value) + + 
expect_identical(xx1, xx2) # * + + xx3 <- xx %>% + group_by( + dplyr::across(dplyr::all_of("geo_value")) + ) %>% + slide( + f = ~ sum(.x$binary), + before = 2, + new_col_name = "sum_binary" + ) + + expect_identical(xx1, xx3) # This and * imply xx2 and xx3 are identical + + # function interface + xx4 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2(f = function(x, gk, rtv) { + tibble::tibble(sum_binary = sum(x$binary)) + }, before = 2, names_sep = NULL) + + expect_identical(xx1, xx4) + + # tidyeval interface + xx5 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + sum_binary = sum(binary), + before = 2 + ) + + expect_identical(xx1, xx5) +}) + +test_that("epix_slide2 works as intended with `as_list_col=TRUE`", { + xx_dfrow1 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + f = ~ data.frame(bin_sum = sum(.x$binary)), + before = 2, + as_list_col = TRUE + ) + + xx_dfrow2 <- tibble( + geo_value = rep("x", 4), + time_value = c(4, 5, 6, 7), + slide_value = + c( + 2^3 + 2^2, + 2^6 + 2^3, + 2^10 + 2^9, + 2^15 + 2^14 + ) %>% + purrr::map(~ data.frame(bin_sum = .x)) + ) %>% + group_by(geo_value) + + expect_identical(xx_dfrow1, xx_dfrow2) # * + + xx_dfrow3 <- xx %>% + group_by(dplyr::across(dplyr::all_of("geo_value"))) %>% + slide( + f = ~ data.frame(bin_sum = sum(.x$binary)), + before = 2, + as_list_col = TRUE + ) + + expect_identical(xx_dfrow1, xx_dfrow3) # This and * Imply xx_dfrow2 and xx_dfrow3 are identical + + xx_df1 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + f = ~ data.frame(bin = .x$binary), + before = 2, + as_list_col = TRUE + ) + + xx_df2 <- tibble( + geo_value = rep("x", 4), + time_value = c(4, 5, 6, 7), + slide_value = + list( + c(2^3, 2^2), + c(2^6, 2^3), + c(2^10, 2^9), + c(2^15, 2^14) + ) %>% + purrr::map(~ data.frame(bin = rev(.x))) + ) %>% + group_by(geo_value) + + expect_identical(xx_df1, xx_df2) + + xx_scalar1 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + f = ~ sum(.x$binary), + before = 2, + as_list_col = 
TRUE + ) + + xx_scalar2 <- tibble( + geo_value = rep("x", 4), + time_value = c(4, 5, 6, 7), + slide_value = + list( + 2^3 + 2^2, + 2^6 + 2^3, + 2^10 + 2^9, + 2^15 + 2^14 + ) + ) %>% + group_by(geo_value) + + expect_identical(xx_scalar1, xx_scalar2) + + xx_vec1 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + f = ~ .x$binary, + before = 2, + as_list_col = TRUE + ) + + xx_vec2 <- tibble( + geo_value = rep("x", 4), + time_value = c(4, 5, 6, 7), + slide_value = + list( + c(2^3, 2^2), + c(2^6, 2^3), + c(2^10, 2^9), + c(2^15, 2^14) + ) %>% + purrr::map(rev) + ) %>% + group_by(geo_value) + + expect_identical(xx_vec1, xx_vec2) +}) + +test_that("epix_slide2 `before` validation works", { + expect_error( + slide(xx, f = ~ sum(.x$binary)), + "`before` is required" + ) + expect_error( + slide(xx, f = ~ sum(.x$binary), before = NA), + "Assertion on 'before' failed: May not be NA" + ) + expect_error( + slide(xx, f = ~ sum(.x$binary), before = -1), + "Assertion on 'before' failed: Element 1 is not >= 0" + ) + expect_error(slide(xx, f = ~ sum(.x$binary), before = 1.5), + regexp = "before", + class = "vctrs_error_incompatible_type" + ) + # We might want to allow this at some point (issue #219): + expect_error(slide(xx, f = ~ sum(.x$binary), before = Inf), + regexp = "before", + class = "vctrs_error_incompatible_type" + ) + # (wrapper shouldn't introduce a value:) + expect_error(epix_slide2(xx, f = ~ sum(.x$binary)), "`before` is required") + # These `before` values should be accepted: + expect_error( + slide(xx, f = ~ sum(.x$binary), before = 0), + NA + ) + expect_error( + slide(xx, f = ~ sum(.x$binary), before = 2L), + NA + ) + expect_error( + slide(xx, f = ~ sum(.x$binary), before = 365000), + NA + ) +}) + +test_that("quosure passing issue in epix_slide2 is resolved + other potential issues", { + # (First part adapted from @examples) + time_values <- seq(as.Date("2020-06-01"), + as.Date("2020-06-02"), + by = "1 day" + ) + # We only have one non-version, non-time key in 
the example archive. Add + # another so that we don't accidentally pass tests due to accidentally + # matching the default grouping. + ea <- as_epi_archive2( + archive_cases_dv_subset$DT %>% + dplyr::mutate(modulus = seq_len(nrow(.)) %% 5L), + other_keys = "modulus", + compactify = TRUE + ) + reference_by_modulus <- ea %>% + group_by(modulus) %>% + epix_slide2( + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ) + reference_by_neither <- ea %>% + group_by() %>% + epix_slide2( + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ) + # test the passing-something-that-must-be-enquosed behavior: + # + # (S3 group_by behavior for this case is the `reference_by_modulus`) + expect_identical( + ea %>% group_by(modulus) %>% slide( + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ), + reference_by_modulus + ) + # test the .data pronoun behavior: + expect_identical( + epix_slide2( + x = ea %>% group_by(.data$modulus), + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ), + reference_by_modulus + ) + expect_identical( + ea %>% group_by(.data$modulus) %>% slide( + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ), + reference_by_modulus + ) + # test the passing across-all-of-string-literal behavior: + expect_identical( + epix_slide2( + x = ea %>% group_by(dplyr::across(all_of("modulus"))), + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ), + reference_by_modulus + ) + expect_identical( + ea %>% group_by(across(all_of("modulus"))) %>% slide( + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ), + 
reference_by_modulus + ) + # test the passing-across-all-of-string-var behavior: + my_group_by <- "modulus" + expect_identical( + epix_slide2( + x = ea %>% group_by(dplyr::across(tidyselect::all_of(my_group_by))), + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ), + reference_by_modulus + ) + expect_identical( + ea %>% group_by(dplyr::across(tidyselect::all_of(my_group_by))) %>% slide( + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ), + reference_by_modulus + ) + # test the default behavior (default in this case should just be grouping by neither): + expect_identical( + epix_slide2( + x = ea, + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ), + reference_by_neither + ) + expect_identical( + ea %>% slide( + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ), + reference_by_neither + ) +}) + +ea <- tibble::tribble( + ~version, ~time_value, ~binary, + 2, 1:1, 2^(1:1), + 3, 1:2, 2^(2:1), + 4, 1:3, 2^(3:1), + 5, 1:4, 2^(4:1), + 6, 1:5, 2^(5:1), + 7, 1:6, 2^(6:1) +) %>% + tidyr::unnest(c(time_value, binary)) %>% + mutate(geo_value = "x") %>% + as_epi_archive2() + +test_that("epix_slide2 with all_versions option has access to all older versions", { + library(data.table) + # Make sure we're using testthat edition 3, where `expect_identical` doesn't + # actually mean `base::identical` but something more content-based using + # `waldo` package: + testthat::local_edition(3) + + slide_fn <- function(x, gk, rtv) { + return(tibble( + n_versions = length(unique(x$DT$version)), + n_row = nrow(x$DT), + dt_class1 = class(x$DT)[[1L]], + dt_key = list(key(x$DT)) + )) + } + + ea_orig_mirror <- ea %>% clone(deep = TRUE) + ea_orig_mirror$DT <- copy(ea_orig_mirror$DT) + + result1 <- ea %>% + group_by() %>% + 
epix_slide2( + f = slide_fn, + before = 10^3, + names_sep = NULL, + all_versions = TRUE + ) + + expect_true(inherits(result1, "tbl_df")) + + result2 <- tibble::tribble( + ~time_value, ~n_versions, ~n_row, ~dt_class1, ~dt_key, + 2, 1L, sum(1:1), "data.table", key(ea$DT), + 3, 2L, sum(1:2), "data.table", key(ea$DT), + 4, 3L, sum(1:3), "data.table", key(ea$DT), + 5, 4L, sum(1:4), "data.table", key(ea$DT), + 6, 5L, sum(1:5), "data.table", key(ea$DT), + 7, 6L, sum(1:6), "data.table", key(ea$DT), + ) + + expect_identical(result1, result2) # * + + result3 <- ea %>% + group_by() %>% + slide( + f = slide_fn, + before = 10^3, + names_sep = NULL, + all_versions = TRUE + ) + + expect_identical(result1, result3) # This and * Imply result2 and result3 are identical + + # formula interface + result4 <- ea %>% + group_by() %>% + epix_slide2( + f = ~ slide_fn(.x, .y), + before = 10^3, + names_sep = NULL, + all_versions = TRUE + ) + + expect_identical(result1, result4) # This and * Imply result2 and result4 are identical + + # tidyeval interface + result5 <- ea %>% + group_by() %>% + epix_slide2( + data = slide_fn( + .x, + stop("slide_fn doesn't use group key, no need to prepare it") + ), + before = 10^3, + names_sep = NULL, + all_versions = TRUE + ) + + expect_identical(result1, result5) # This and * Imply result2 and result5 are identical + expect_identical(ea, ea_orig_mirror) # We shouldn't have mutated ea +}) + +test_that("as_of and epix_slide2 with long enough window are compatible", { + library(data.table) + testthat::local_edition(3) + + # For all_versions = FALSE: + + f1 <- function(x, gk, rtv) { + tibble( + diff_mean = mean(diff(x$binary)) + ) + } + ref_time_value1 <- 5 + + expect_identical( + ea %>% as_of(ref_time_value1) %>% f1() %>% mutate(time_value = ref_time_value1, .before = 1L), + ea %>% slide(f1, before = 1000L, ref_time_values = ref_time_value1, names_sep = NULL) + ) + + # For all_versions = TRUE: + + f2 <- function(x, gk, rtv) { + x %>% + # extract 
time&version-lag-1 data: + epix_slide2( + function(subx, subgk, rtv) { + tibble(data = list( + subx %>% + filter(time_value == attr(subx, "metadata")$as_of - 1) %>% + rename(real_time_value = time_value, lag1 = binary) + )) + }, + before = 1, names_sep = NULL + ) %>% + # assess as nowcast: + unnest(data) %>% + inner_join(x %>% as_of(x$versions_end), by = setdiff(key(x$DT), c("version"))) %>% + summarize(mean_abs_delta = mean(abs(binary - lag1))) + } + ref_time_value2 <- 5 + + expect_identical( + ea %>% as_of(ref_time_value2, all_versions = TRUE) %>% f2() %>% mutate(time_value = ref_time_value2, .before = 1L), + ea %>% slide(f2, before = 1000L, ref_time_values = ref_time_value2, all_versions = TRUE, names_sep = NULL) + ) + + # Test the same sort of thing when grouping by geo in an archive with multiple geos. + ea_multigeo <- ea %>% clone() + ea_multigeo$DT <- rbind( + ea_multigeo$DT, + copy(ea_multigeo$DT)[, geo_value := "y"][, binary := -binary][] + ) + setkeyv(ea_multigeo$DT, key(ea$DT)) + + expect_identical( + ea_multigeo %>% + group_by(geo_value) %>% + epix_slide2(f2, before = 1000L, ref_time_values = ref_time_value2, all_versions = TRUE, names_sep = NULL) %>% + filter(geo_value == "x"), + ea %>% # using `ea` here is like filtering `ea_multigeo` to `geo_value=="x"` + epix_as_of2(ref_time_value2, all_versions = TRUE) %>% + f2() %>% + transmute(geo_value = "x", time_value = ref_time_value2, mean_abs_delta) %>% + group_by(geo_value) + ) +}) + +test_that("epix_slide2 `f` is passed an ungrouped `epi_archive` when `all_versions=TRUE`", { + slide_fn <- function(x, gk, rtv) { + expect_true(is_epi_archive2(x)) + return(NA) + } + + ea %>% + group_by() %>% + epix_slide2( + f = slide_fn, + before = 1, + ref_time_values = 5, + new_col_name = "out", + all_versions = TRUE + ) +}) + +test_that("epix_slide2 with all_versions option works as intended", { + xx1 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + f = ~ sum(.x$DT$binary), + before = 2, + new_col_name = 
"sum_binary", + all_versions = TRUE + ) + + xx2 <- tibble( + geo_value = rep("x", 4), + time_value = c(4, 5, 6, 7), + sum_binary = c( + 2^3 + 2^2, + 2^6 + 2^3, + 2^10 + 2^9 + 2^6, + 2^15 + 2^14 + 2^10 + ) + ) %>% + group_by(geo_value) + + expect_identical(xx1, xx2) # * + + xx3 <- xx %>% + group_by(dplyr::across(dplyr::all_of("geo_value"))) %>% + slide( + f = ~ sum(.x$DT$binary), + before = 2, + new_col_name = "sum_binary", + all_versions = TRUE + ) + + expect_identical(xx1, xx3) # This and * Imply xx2 and xx3 are identical +}) + +# XXX currently, we're using a stopgap measure of having `epix_slide2` always +# output a (grouped/ungrouped) tibble while we think about the class, columns, +# and attributes of `epix_slide2` output more carefully. We might bring this test +# back depending on the decisions there: +# +# test_that("`epix_slide2` uses `versions_end` as a resulting `epi_df`'s `as_of`", { +# ea_updated_stale = ea$clone() +# ea_updated_stale$versions_end <- ea_updated_stale$versions_end + 3 # (dbl) +# # +# expect_identical( +# ea_updated_stale %>% +# group_by(geo_value) %>% +# epix_slide2(~ slice_head(.x, n = 1L), before = 10L) %>% +# ungroup() %>% +# attr("metadata") %>% +# .$as_of, +# 10 +# ) +# }) + +test_that("epix_slide2 works with 0-row computation outputs", { + epix_slide_empty <- function(ea, ...) 
{ + ea %>% + epix_slide2(before = 5L, ..., function(x, gk, rtv) { + tibble::tibble() + }) + } + expect_identical( + ea %>% + epix_slide_empty(), + tibble::tibble( + time_value = ea$DT$version[integer(0)] + ) + ) + expect_identical( + ea %>% + group_by(geo_value) %>% + epix_slide_empty(), + tibble::tibble( + geo_value = ea$DT$geo_value[integer(0)], + time_value = ea$DT$version[integer(0)] + ) %>% + # new_epi_df(geo_type = ea$geo_type, time_type = ea$time_type, + # as_of = ea$versions_end) %>% + group_by(geo_value) + ) + # with `all_versions=TRUE`, we have something similar but never get an + # `epi_df`: + expect_identical( + ea %>% + epix_slide_empty(all_versions = TRUE), + tibble::tibble( + time_value = ea$DT$version[integer(0)] + ) + ) + expect_identical( + ea %>% + group_by(geo_value) %>% + epix_slide_empty(all_versions = TRUE), + tibble::tibble( + geo_value = ea$DT$geo_value[integer(0)], + time_value = ea$DT$version[integer(0)] + ) %>% + group_by(geo_value) + ) +}) + +# test_that("epix_slide grouped by geo can produce `epi_df` output", { +# # This is a characterization test. Not sure we actually want this behavior; +# # https://github.com/cmu-delphi/epiprocess/pull/290#issuecomment-1489099157 +# expect_identical( +# ea %>% +# group_by(geo_value) %>% +# epix_slide(before = 5L, function(x,g) { +# tibble::tibble(value = 42) +# }, names_sep = NULL), +# tibble::tibble( +# geo_value = "x", +# time_value = epix_slide_ref_time_values_default(ea), +# value = 42 +# ) %>% +# new_epi_df(as_of = ea$versions_end) +# ) +# }) + +test_that("epix_slide alerts if the provided f doesn't take enough args", { + f_xgt <- function(x, g, t) dplyr::tibble(value = mean(x$binary), count = length(x$binary)) + # If `regexp` is NA, asserts that there should be no errors/messages. + expect_error(epix_slide2(xx, f = f_xgt, before = 2L), regexp = NA) + expect_warning(epix_slide2(xx, f = f_xgt, before = 2L), regexp = NA) + + f_x_dots <- function(x, ...) 
dplyr::tibble(value = mean(x$binary), count = length(x$binary)) + expect_warning(epix_slide2(xx, f_x_dots, before = 2L), + class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots" + ) +}) + +test_that("epix_slide2 computation via formula can use ref_time_value", { + xx_ref <- tibble( + geo_value = rep("x", 4), + time_value = c(4, 5, 6, 7), + slide_value = c(4, 5, 6, 7) + ) %>% + group_by(geo_value) + + xx1 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + f = ~.ref_time_value, + before = 2 + ) + + expect_identical(xx1, xx_ref) + + xx2 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + f = ~.z, + before = 2 + ) + + expect_identical(xx2, xx_ref) + + xx3 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + f = ~..3, + before = 2 + ) + + expect_identical(xx3, xx_ref) +}) + +test_that("epix_slide2 computation via function can use ref_time_value", { + xx_ref <- tibble( + geo_value = rep("x", 4), + time_value = c(4, 5, 6, 7), + slide_value = c(4, 5, 6, 7) + ) %>% + group_by(geo_value) + + xx1 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + f = function(x, g, t) t, + before = 2 + ) + + expect_identical(xx1, xx_ref) +}) + +test_that("epix_slide2 computation via dots can use ref_time_value and group", { + # ref_time_value + xx_ref <- tibble( + geo_value = rep("x", 4), + time_value = c(4, 5, 6, 7), + slide_value = c(4, 5, 6, 7) + ) %>% + group_by(geo_value) + + xx1 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + before = 2, + slide_value = .ref_time_value + ) + + expect_identical(xx1, xx_ref) + + # group_key + xx_ref <- tibble( + geo_value = rep("x", 4), + time_value = c(4, 5, 6, 7), + slide_value = "x" + ) %>% + group_by(geo_value) + + # Use group_key column + xx3 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + before = 2, + slide_value = .group_key$geo_value + ) + + expect_identical(xx3, xx_ref) + + # Use entire group_key object + expect_error( + xx %>% + group_by(.data$geo_value) %>% + 
epix_slide2( + before = 2, + slide_value = nrow(.group_key) + ), + NA + ) +}) + +test_that("epix_slide2 computation via dots outputs the same result using col names and the data var", { + xx_ref <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + before = 2, + sum_binary = sum(time_value) + ) + + xx1 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + before = 2, + sum_binary = sum(.x$time_value) + ) + + expect_identical(xx1, xx_ref) + + xx2 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + before = 2, + sum_binary = sum(.data$time_value) + ) + + expect_identical(xx2, xx_ref) +}) + +test_that("`epix_slide2` doesn't decay date output", { + expect_true( + xx$DT %>% + as_tibble() %>% + mutate(across(c(time_value, version), ~ as.Date("2000-01-01") + .x - 1L)) %>% + as_epi_archive2() %>% + epix_slide2(before = 5L, ~ attr(.x, "metadata")$as_of) %>% + `[[`("slide_value") %>% + inherits("Date") + ) +}) + +test_that("`epix_slide2` can access objects inside of helper functions", { + helper <- function(archive_haystack, time_value_needle) { + archive_haystack %>% epix_slide2(has_needle = time_value_needle %in% time_value, before = 365000L) + } + expect_error( + helper(suppressWarnings(as_epi_archive2(archive_cases_dv_subset$DT)), as.Date("2021-01-01")), + NA + ) + expect_error( + helper(xx, 3L), + NA + ) +}) diff --git a/tests/testthat/test-grouped_epi_archive_new.R b/tests/testthat/test-grouped_epi_archive_new.R new file mode 100644 index 00000000..8f0133b9 --- /dev/null +++ b/tests/testthat/test-grouped_epi_archive_new.R @@ -0,0 +1,104 @@ +test_that("Grouping, regrouping, and ungrouping archives works as intended", { + # From an example: + library(dplyr) + toy_archive <- + tribble( + ~geo_value, ~age_group, ~time_value, ~version, ~value, + "us", "adult", "2000-01-01", "2000-01-02", 121, + "us", "pediatric", "2000-01-02", "2000-01-03", 5, # (addition) + "us", "adult", "2000-01-01", "2000-01-03", 125, # (revision) + "us", "adult", "2000-01-02", 
"2000-01-03", 130 # (addition) + ) %>% + mutate( + age_group = ordered(age_group, c("pediatric", "adult")), + time_value = as.Date(time_value), + version = as.Date(version) + ) %>% + as_epi_archive2(other_keys = "age_group") + + # Ensure that we're using testthat edition 3's idea of "identical", which is + # not as strict as `identical`: + testthat::local_edition(3) + + # Test equivalency claims in example: + by_both_keys <- toy_archive %>% group_by(geo_value, age_group) + expect_identical( + by_both_keys, + toy_archive %>% group_by(geo_value) %>% group_by(age_group, .add = TRUE) + ) + grouping_cols <- c("geo_value", "age_group") + expect_identical( + by_both_keys, + toy_archive %>% group_by(across(all_of(grouping_cols))) + ) + + expect_identical( + toy_archive %>% group_by(geo_value), + toy_archive %>% group_by(geo_value, age_group) %>% ungroup(age_group) + ) + + # Test `.drop` behavior: + expect_error(toy_archive %>% group_by(.drop = "bogus"), + regexp = "Must be of type 'logical', not 'character'" + ) + expect_warning(toy_archive %>% group_by(.drop = FALSE), + class = "epiprocess__group_by_epi_archive__drop_FALSE_no_factors" + ) + expect_warning(toy_archive %>% group_by(geo_value, .drop = FALSE), + class = "epiprocess__group_by_epi_archive__drop_FALSE_no_factors" + ) + expect_warning( + grouped_factor_then_nonfactor <- + toy_archive %>% group_by(age_group, geo_value, .drop = FALSE), + class = "epiprocess__group_by_epi_archive__drop_FALSE_nonfactor_after_factor" + ) + expect_identical( + grouped_factor_then_nonfactor %>% + epix_slide2(before = 10, s = sum(value)), + tibble::tribble( + ~age_group, ~geo_value, ~time_value, ~s, + "pediatric", NA_character_, "2000-01-02", 0, + "adult", "us", "2000-01-02", 121, + "pediatric", "us", "2000-01-03", 5, + "adult", "us", "2000-01-03", 255 + ) %>% + mutate( + age_group = ordered(age_group, c("pediatric", "adult")), + time_value = as.Date(time_value) + ) %>% + # # See + # # 
https://github.com/cmu-delphi/epiprocess/pull/290#issuecomment-1489099157 + # # and + # # https://github.com/cmu-delphi/epiprocess/pull/311#issuecomment-1535149256 + # # for why this is commented out, pending some design + # # decisions. + # # + # as_epi_df(geo_type = "nation", # bug; want "custom" from NA; issue #242 + # as_of = as.Date("2000-01-03"), + # additional_metadata = list(other_keys = "age_group")) %>% + # # put back in expected order; see issue #166: + # select(age_group, geo_value, time_value, s) %>% + group_by(age_group, geo_value, .drop = FALSE) + ) + expect_identical( + toy_archive %>% + group_by(geo_value, age_group, .drop = FALSE) %>% + epix_slide2(before = 10, s = sum(value)), + tibble::tribble( + ~geo_value, ~age_group, ~time_value, ~s, + "us", "pediatric", "2000-01-02", 0, + "us", "adult", "2000-01-02", 121, + "us", "pediatric", "2000-01-03", 5, + "us", "adult", "2000-01-03", 255 + ) %>% + mutate( + age_group = ordered(age_group, c("pediatric", "adult")), + time_value = as.Date(time_value) + ) %>% + # as_epi_df(as_of = as.Date("2000-01-03"), + # additional_metadata = list(other_keys = "age_group")) %>% + # # put back in expected order; see issue #166: + # select(geo_value, age_group, time_value, s) %>% + group_by(geo_value, age_group, .drop = FALSE) + ) +}) diff --git a/tests/testthat/test-methods-epi_archive_new.R b/tests/testthat/test-methods-epi_archive_new.R new file mode 100644 index 00000000..a267ba58 --- /dev/null +++ b/tests/testthat/test-methods-epi_archive_new.R @@ -0,0 +1,138 @@ +library(dplyr) + +ea <- archive_cases_dv_subset$DT %>% + as_epi_archive2() %>% + clone() %>% + suppressWarnings() + +ea2_data <- tibble::tribble( + ~geo_value, ~time_value, ~version, ~cases, + "ca", "2020-06-01", "2020-06-01", 1, + "ca", "2020-06-01", "2020-06-02", 2, + # + "ca", "2020-06-02", "2020-06-02", 0, + "ca", "2020-06-02", "2020-06-03", 1, + "ca", "2020-06-02", "2020-06-04", 2, + # + "ca", "2020-06-03", "2020-06-03", 1, + # + "ca", "2020-06-04", 
"2020-06-04", 4, +) %>% + dplyr::mutate(dplyr::across(c(time_value, version), as.Date)) + +# epix_as_of tests +test_that("epix_as_of behaves identically to as_of method", { + expect_identical( + epix_as_of2(ea, max_version = min(ea$DT$version)), + ea %>% as_of(max_version = min(ea$DT$version)) + ) +}) + +test_that("Errors are thrown due to bad as_of inputs", { + # max_version cannot be of string class rather than date class + expect_error(ea %>% as_of("2020-01-01")) + # max_version cannot be later than latest version + expect_error(ea %>% as_of(as.Date("2025-01-01"))) + # max_version cannot be a vector + expect_error(ea %>% as_of(c(as.Date("2020-01-01"), as.Date("2020-01-02")))) +}) + +test_that("Warning against max_version being clobberable", { + # none by default + expect_warning(regexp = NA, ea %>% as_of(max_version = max(ea$DT$version))) + expect_warning(regexp = NA, ea %>% as_of(max_version = min(ea$DT$version))) + # but with `clobberable_versions_start` non-`NA`, yes + ea_with_clobberable <- ea %>% clone() + ea_with_clobberable$clobberable_versions_start <- max(ea_with_clobberable$DT$version) + expect_warning(ea_with_clobberable %>% as_of(max_version = max(ea$DT$version))) + expect_warning(regexp = NA, ea_with_clobberable %>% as_of(max_version = min(ea$DT$version))) +}) + +test_that("as_of properly grabs the data and doesn't mutate key", { + d <- as.Date("2020-06-01") + + ea2 <- ea2_data %>% + as_epi_archive2() + + old_key <- data.table::key(ea2$DT) + + edf_as_of <- ea2 %>% + epix_as_of2(max_version = as.Date("2020-06-03")) + + edf_expected <- as_epi_df(tibble( + geo_value = "ca", + time_value = d + 0:2, + cases = c(2, 1, 1) + ), as_of = as.Date("2020-06-03")) + + expect_equal(edf_as_of, edf_expected, ignore_attr = c(".internal.selfref", "sorted")) + expect_equal(data.table::key(ea2$DT), old_key) +}) + +test_that("Errors are thrown due to bad epix_truncate_versions_after inputs", { + # x must be an archive + 
expect_error(epix_truncate_versions_after(data.frame(), as.Date("2020-01-01"))) + # max_version cannot be of string class rather than date class + expect_error(epix_truncate_versions_after(ea, "2020-01-01")) + # max_version cannot be a vector + expect_error(epix_truncate_versions_after(ea, c(as.Date("2020-01-01"), as.Date("2020-01-02")))) + # max_version cannot be missing + expect_error(epix_truncate_versions_after(ea, as.Date(NA))) + # max_version cannot be after latest version in archive + expect_error(epix_truncate_versions_after(ea, as.Date("2025-01-01"))) +}) + +test_that("epix_truncate_versions_after properly grabs the data and doesn't mutate key", { + ea2 <- ea2_data %>% + as_epi_archive2() + + old_key <- data.table::key(ea2$DT) + + ea_as_of <- ea2 %>% + epix_truncate_versions_after(max_version = as.Date("2020-06-02")) + + ea_expected <- ea2_data[1:3, ] %>% + as_epi_archive2() + + expect_equal(ea_as_of, ea_expected, ignore_attr = c(".internal.selfref", "sorted")) + expect_equal(data.table::key(ea2$DT), old_key) +}) + +test_that("epix_truncate_versions_after doesn't filter if max_version at latest version", { + ea2 <- ea2_data %>% + as_epi_archive2() + + ea_expected <- ea2 %>% clone() + + ea_as_of <- ea2 %>% + epix_truncate_versions_after(max_version = as.Date("2020-06-04")) + expect_equal(ea_as_of, ea_expected, ignore_attr = c(".internal.selfref", "sorted")) +}) + +test_that("epix_truncate_versions_after returns the same grouping type as input epi_archive", { + ea2 <- ea2_data %>% + as_epi_archive2() + + ea_as_of <- ea2 %>% + epix_truncate_versions_after(max_version = as.Date("2020-06-04")) + expect_true(is_epi_archive2(ea_as_of, grouped_okay = FALSE)) + + ea2_grouped <- ea2 %>% group_by(geo_value) + + ea_as_of <- ea2_grouped %>% + epix_truncate_versions_after(max_version = as.Date("2020-06-04")) + expect_true(is_grouped_epi_archive2(ea_as_of)) +}) + + +test_that("epix_truncate_versions_after returns the same groups as input grouped_epi_archive", { + ea2 <- 
ea2_data %>% + as_epi_archive2() + ea2 <- ea2 %>% group_by(geo_value) + + ea_expected <- ea2 %>% clone() + + ea_as_of <- ea2 %>% + epix_truncate_versions_after(max_version = as.Date("2020-06-04")) + expect_equal(ea_as_of %>% groups(), ea_expected %>% groups()) +})