From 58ed6b45f77eb252ee5f721ff0ae8ef93888e4a6 Mon Sep 17 00:00:00 2001
From: Dmitry Shemetov
Date: Thu, 16 Nov 2023 11:00:14 -0800
Subject: [PATCH 1/3] Update R/epi_df.R

Co-authored-by: brookslogan
---
 R/epi_df.R | 190 +++++++++++++++++++++++++++++------------------
 1 file changed, 104 insertions(+), 86 deletions(-)

diff --git a/R/epi_df.R b/R/epi_df.R
index 53dca62b..91e6c9d9 100644
--- a/R/epi_df.R
+++ b/R/epi_df.R
@@ -87,7 +87,7 @@ NULL
 #' Creates an `epi_df` object
 #'
-#' Creates a new `epi_df` object. By default, builds an empty tibble with the 
+#' Creates a new `epi_df` object. By default, builds an empty tibble with the
 #' correct metadata for an `epi_df` object (i.e., `geo_type`, `time_type`, and `as_of`).
 #' Refer to the information below about the arguments for more details.
 #'
@@ -107,18 +107,18 @@ NULL
 #'   `epi_df` object. The metadata will have `geo_type`, `time_type`, and
 #'   `as_of` fields; named entries from the passed list will be included as
 #'   well. If your tibble has additional keys, be sure to specify them as a
-#'   character vector in the `other_keys` component of `additional_metadata`. 
+#'   character vector in the `other_keys` component of `additional_metadata`.
 #' @param ... Additional arguments passed to methods.
 #' @return An `epi_df` object.
-#' 
+#'
 #' @export
-new_epi_df = function(x = tibble::tibble(), geo_type, time_type, as_of,
-                      additional_metadata = list(), ...) {
+new_epi_df <- function(x = tibble::tibble(), geo_type, time_type, as_of,
+                       additional_metadata = list(), ...) {
   # Check that we have a data frame
   if (!is.data.frame(x)) {
     Abort("`x` must be a data frame.")
   }
-  
+
   if (!is.list(additional_metadata)) {
     Abort("`additional_metadata` must be a list type.")
   }
@@ -128,52 +128,55 @@ new_epi_df = function(x = tibble::tibble(), geo_type, time_type, as_of,
 
   # If geo type is missing, then try to guess it
   if (missing(geo_type)) {
-    geo_type = guess_geo_type(x$geo_value)
+    geo_type <- guess_geo_type(x$geo_value)
   }
-  
+
   # If time type is missing, then try to guess it
   if (missing(time_type)) {
-    time_type = guess_time_type(x$time_value)
+    time_type <- guess_time_type(x$time_value)
   }
-  
+
   # If as_of is missing, then try to guess it
   if (missing(as_of)) {
     # First check the metadata for an as_of field
     if ("metadata" %in% names(attributes(x)) &&
-        "as_of" %in% names(attributes(x)$metadata)) {
-      as_of = attributes(x)$metadata$as_of
+      "as_of" %in% names(attributes(x)$metadata)) {
+      as_of <- attributes(x)$metadata$as_of
     }
-    
+
     # Next check for as_of, issue, or version columns
-    else if ("as_of" %in% names(x)) as_of = max(x$as_of)
-    else if ("issue" %in% names(x)) as_of = max(x$issue)
-    else if ("version" %in% names(x)) as_of = max(x$version)
-
-    # If we got here then we failed
-    else as_of = Sys.time() # Use the current day-time
+    else if ("as_of" %in% names(x)) {
+      as_of <- max(x$as_of)
+    } else if ("issue" %in% names(x)) {
+      as_of <- max(x$issue)
+    } else if ("version" %in% names(x)) {
+      as_of <- max(x$version)
+    } # If we got here then we failed
+    else {
+      as_of <- Sys.time()
+    } # Use the current day-time
   }
-  
+
   # Define metadata fields
-  metadata = list()
-  metadata$geo_type = geo_type
-  metadata$time_type = time_type
-  metadata$as_of = as_of
-  metadata = c(metadata, additional_metadata)
-  
+  metadata <- list()
+  metadata$geo_type <- geo_type
+  metadata$time_type <- time_type
+  metadata$as_of <- as_of
+  metadata <- c(metadata, additional_metadata)
+
   # Reorder columns (geo_value, time_value, ...)
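   # (A hedged illustration of the step below: a hypothetical input tibble with
   # columns (value, geo_value, time_value) comes out as (geo_value, time_value,
   # value); all remaining columns keep their relative order after the two key
   # columns.)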
-  if(sum(dim(x)) != 0){
+  if (sum(dim(x)) != 0) {
     cols_to_put_first <- c("geo_value", "time_value")
     x <- x[, c(
       cols_to_put_first,
       # All other columns
       names(x)[!(names(x) %in% cols_to_put_first)]
-      )
-    ]
+    )]
   }
-  
+
   # Apply epi_df class, attach metadata, and return
-  class(x) = c("epi_df", class(x))
-  attributes(x)$metadata = metadata
+  class(x) <- c("epi_df", class(x))
+  attributes(x)$metadata <- metadata
   return(x)
 }
 
@@ -205,77 +208,85 @@ new_epi_df = function(x = tibble::tibble(), geo_type, time_type, as_of,
 #' @return An `epi_df` object.
 #'
 #' @export
-#' @examples 
+#' @examples
 #' # Convert a `tsibble` that has county code as an extra key
 #' # Notice that county code should be a character string to preserve any leading zeroes
-#' 
+#'
 #' ex1_input <- tibble::tibble(
 #'   geo_value = rep(c("ca", "fl", "pa"), each = 3),
-#'   county_code = c("06059","06061","06067",
-#'                   "12111","12113","12117",
-#'                   "42101", "42103","42105"),
+#'   county_code = c(
+#'     "06059", "06061", "06067",
+#'     "12111", "12113", "12117",
+#'     "42101", "42103", "42105"
+#'   ),
 #'   time_value = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"),
-#'                        by = "day"), length.out = length(geo_value)),
+#'     by = "day"
+#'   ), length.out = length(geo_value)),
 #'   value = 1:length(geo_value) + 0.01 * rnorm(length(geo_value))
-#' ) %>% 
+#' ) %>%
 #'   tsibble::as_tsibble(index = time_value, key = c(geo_value, county_code))
-#' 
+#'
 #' # The `other_keys` metadata (`"county_code"` in this case) is automatically
 #' # inferred from the `tsibble`'s `key`:
 #' ex1 <- as_epi_df(x = ex1_input, geo_type = "state", time_type = "day", as_of = "2020-06-03")
-#' attr(ex1,"metadata")[["other_keys"]]
-#' 
-#' 
-#' 
+#' attr(ex1, "metadata")[["other_keys"]]
+#'
+#'
+#'
 #' # Dealing with misspecified column names:
 #' # Geographical and temporal information must be provided in columns named
 #' # `geo_value` and `time_value`; if we start from a data frame with a
 #' # different format, it must be converted to use `geo_value` and `time_value`
 #' # before calling `as_epi_df`.
-#' 
+#'
 #' ex2_input <- tibble::tibble(
 #'   state = rep(c("ca", "fl", "pa"), each = 3), # misnamed
 #'   pol = rep(c("blue", "swing", "swing"), each = 3), # extra key
 #'   reported_date = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"),
-#'                           by = "day"), length.out = length(state)), # misnamed
+#'     by = "day"
+#'   ), length.out = length(state)), # misnamed
 #'   value = 1:length(state) + 0.01 * rnorm(length(state))
-#' ) 
-#' 
+#' )
+#'
 #' print(ex2_input)
-#' 
-#' ex2 <- ex2_input %>% dplyr::rename(geo_value = state, time_value = reported_date) %>%
-#'   as_epi_df(geo_type = "state", as_of = "2020-06-03",
-#'             additional_metadata = list(other_keys = "pol"))
-#' 
-#' attr(ex2,"metadata")
-#' 
-#' 
-#' 
+#'
+#' ex2 <- ex2_input %>%
+#'   dplyr::rename(geo_value = state, time_value = reported_date) %>%
+#'   as_epi_df(
+#'     geo_type = "state", as_of = "2020-06-03",
+#'     additional_metadata = list(other_keys = "pol")
+#'   )
+#'
+#' attr(ex2, "metadata")
+#'
+#'
+#'
 #' # Adding additional keys to an `epi_df` object
-#' 
+#'
 #' ex3_input <- jhu_csse_county_level_subset %>%
 #'   dplyr::filter(time_value > "2021-12-01", state_name == "Massachusetts") %>%
-#'   dplyr::slice_tail(n = 6) 
-#' 
-#' ex3 <- ex3_input %>% 
+#'   dplyr::slice_tail(n = 6)
+#'
+#' ex3 <- ex3_input %>%
 #'   tsibble::as_tsibble() %>% # needed to add the additional metadata
 #'   # add 2 extra keys
 #'   dplyr::mutate(
-#'     state = rep("MA",6),
-#'     pol = rep(c("blue", "swing", "swing"), each = 2)) %>%
-#'   # the 2 extra keys we added have to be specified in the other_keys 
+#'     state = rep("MA", 6),
+#'     pol = rep(c("blue", "swing", "swing"), each = 2)
+#'   ) %>%
+#'   # the 2 extra keys we added have to be specified in the other_keys
 #'   # component of additional_metadata.
 #'   as_epi_df(additional_metadata = list(other_keys = c("state", "pol")))
-#' 
-#' attr(ex3,"metadata")
-as_epi_df = function(x, ...) {
+#'
+#' attr(ex3, "metadata")
+as_epi_df <- function(x, ...) {
   UseMethod("as_epi_df")
 }
 
 #' @method as_epi_df epi_df
 #' @describeIn as_epi_df Simply returns the `epi_df` object unchanged.
 #' @export
-as_epi_df.epi_df = function(x, ...) {
+as_epi_df.epi_df <- function(x, ...) {
   return(x)
 }
 
@@ -289,8 +300,8 @@ as_epi_df.epi_df = function(x, ...) {
 #'   be used.
 #' @importFrom rlang .data
 #' @export
-as_epi_df.tbl_df = function(x, geo_type, time_type, as_of,
-                            additional_metadata = list(), ...) {
+as_epi_df.tbl_df <- function(x, geo_type, time_type, as_of,
+                             additional_metadata = list(), ...) {
   # Check that we have geo_value and time_value columns
   if (!("geo_value" %in% names(x))) {
     Abort("`x` must contain a `geo_value` column.")
@@ -298,18 +309,22 @@ as_epi_df.tbl_df = function(x, geo_type, time_type, as_of,
   if (!("time_value" %in% names(x))) {
     Abort("`x` must contain a `time_value` column.")
   }
-  
-  new_epi_df(x, geo_type, time_type, as_of,
-             additional_metadata, ...)
+
+  new_epi_df(
+    x, geo_type, time_type, as_of,
+    additional_metadata, ...
+  )
 }
 
 #' @method as_epi_df data.frame
 #' @describeIn as_epi_df Works analogously to `as_epi_df.tbl_df()`.
 #' @export
-as_epi_df.data.frame = function(x, geo_type, time_type, as_of,
-                                additional_metadata = list(), ...) {
-  as_epi_df.tbl_df(tibble::as_tibble(x), geo_type, time_type, as_of,
-                   additional_metadata, ...)
+as_epi_df.data.frame <- function(x, geo_type, time_type, as_of,
+                                 additional_metadata = list(), ...) {
+  as_epi_df.tbl_df(
+    tibble::as_tibble(x), geo_type, time_type, as_of,
+    additional_metadata, ...
+  )
 }
 
 #' @method as_epi_df tbl_ts
@@ -318,23 +333,26 @@ as_epi_df.data.frame = function(x, geo_type, time_type, as_of,
 #'   "geo_value") are added to the metadata of the returned object, under the
 #'   `other_keys` field.
 #' @export
-as_epi_df.tbl_ts = function(x, geo_type, time_type, as_of,
-                            additional_metadata = list(), ...) {
-  tsibble_other_keys = setdiff(tsibble::key_vars(x), "geo_value")
+as_epi_df.tbl_ts <- function(x, geo_type, time_type, as_of,
+                             additional_metadata = list(), ...) {
+  tsibble_other_keys <- setdiff(tsibble::key_vars(x), "geo_value")
   if (length(tsibble_other_keys) != 0) {
-    additional_metadata$other_keys = unique(
-      c(additional_metadata$other_keys, tsibble_other_keys))
+    additional_metadata$other_keys <- unique(
+      c(additional_metadata$other_keys, tsibble_other_keys)
+    )
   }
-  as_epi_df.tbl_df(tibble::as_tibble(x), geo_type, time_type, as_of,
-                   additional_metadata, ...)
+  as_epi_df.tbl_df(
+    tibble::as_tibble(x), geo_type, time_type, as_of,
+    additional_metadata, ...
+  )
 }
 
 #' Test for `epi_df` format
 #'
 #' @param x An object.
 #' @return `TRUE` if the object inherits from `epi_df`.
-#' 
+#'
 #' @export
-is_epi_df = function(x) {
+is_epi_df <- function(x) {
   inherits(x, "epi_df")
 }

From c65876078a6f9525952b305eaea2fca003adf907 Mon Sep 17 00:00:00 2001
From: Dmitry Shemetov
Date: Wed, 17 Jan 2024 13:51:27 -0800
Subject: [PATCH 2/3] style: styler

---
 R/archive.R                                   | 1094 +++++++++--------
 R/correlation.R                               |  109 +-
 R/data.R                                      |  102 +-
 R/epiprocess.R                                |    2 +-
 R/grouped_epi_archive.R                       |  500 ++++----
 R/growth_rate.R                               |  199 +--
 R/methods-epi_archive.R                       |  396 +++---
 R/methods-epi_df.R                            |  104 +-
 R/outliers.R                                  |  263 ++--
 R/slide.R                                     |  180 +--
 R/utils.R                                     |  356 +++---
 tests/testthat/test-archive-version-bounds.R  |  146 ++-
 tests/testthat/test-archive.R                 |  246 ++--
 tests/testthat/test-compactify.R              |   66 +-
 tests/testthat/test-correlation.R             |   55 +-
 tests/testthat/test-data.R                    |   54 +-
 tests/testthat/test-deprecations.R            |   19 +-
 tests/testthat/test-epi_slide.R               |  482 +++++---
 .../testthat/test-epix_fill_through_version.R |  107 +-
 tests/testthat/test-epix_merge.R              |  176 +--
 tests/testthat/test-epix_slide.R              |  674 +++++-----
 tests/testthat/test-grouped_epi_archive.R     |  136 +-
 tests/testthat/test-methods-epi_archive.R     |   81 +-
 tests/testthat/test-methods-epi_df.R          |   77 +-
 tests/testthat/test-utils.R                   |  241 ++--
 vignettes/advanced.Rmd                        |  205 +--
 vignettes/compactify.Rmd                      |   25 +-
 vignettes/growth_rate.Rmd                     |  116 +-
 28 files changed, 3429 insertions(+), 2782 deletions(-)

diff --git a/R/archive.R b/R/archive.R
index 7a7d8d82..1908b77c 100644
--- a/R/archive.R
+++ b/R/archive.R
@@ -4,7 +4,7 @@
 # want the special behavior via `.datatable.aware = TRUE` or by importing any
 # `data.table` package member. Do both to prevent surprises if we decide to use
 # `data.table::` everywhere and not importing things.
-.datatable.aware = TRUE
+.datatable.aware <- TRUE
 
 #' Validate a version bound arg
 #'
@@ -22,16 +22,20 @@
 #' @section Side effects: raises an error if version bound appears invalid
 #'
 #' @noRd
-validate_version_bound = function(version_bound, x, na_ok,
-                                  version_bound_arg = rlang::caller_arg(version_bound),
-                                  x_arg = rlang::caller_arg(version_bound)) {
+validate_version_bound <- function(version_bound, x, na_ok,
                                   version_bound_arg = rlang::caller_arg(version_bound),
                                   x_arg = rlang::caller_arg(version_bound)) {
   # We might want some (optional?) validation here to detect internal bugs.
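   # (A sketch of what the checks below enforce, assuming a hypothetical `x`
   # whose `version` column has class `Date`:
   #   validate_version_bound(as.Date("2020-01-01"), x, na_ok = FALSE) # passes
   #   validate_version_bound("2020-01-01", x, na_ok = FALSE) # errors: class/typeof mismatch
   #   validate_version_bound(NA, x, na_ok = TRUE) # passes; NA allowed when `na_ok`
   # )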
   if (length(version_bound) != 1L) {
     # Check for length-1-ness fairly early so we don't have to worry as much
     # about our `if`s receiving non-length-1 "Boolean"s.
-    Abort(sprintf("`version_bound` must have length 1, but instead was length %d",
-                  length(version_bound)),
-          class=sprintf("epiprocess__%s_is_not_length_1", version_bound_arg))
+    Abort(
+      sprintf(
+        "`version_bound` must have length 1, but instead was length %d",
+        length(version_bound)
+      ),
+      class = sprintf("epiprocess__%s_is_not_length_1", version_bound_arg)
+    )
   } else if (is.na(version_bound)) {
     # Check for NA before class&type, as any-class&type NA should be fine for
     # our purposes, and some version classes&types might not have their own NA
@@ -41,20 +45,20 @@
       return(invisible(NULL))
     } else {
       Abort(sprintf(
-        '`%s` must not satisfy `is.na` (NAs are not allowed for this kind of version bound)',
+        "`%s` must not satisfy `is.na` (NAs are not allowed for this kind of version bound)",
         version_bound_arg
-      ), class=sprintf("epiprocess__%s_is_na", version_bound_arg))
+      ), class = sprintf("epiprocess__%s_is_na", version_bound_arg))
     }
   } else if (!identical(class(version_bound), class(x[["version"]])) ||
-             !identical(typeof(version_bound), typeof(x[["version"]]))) {
+    !identical(typeof(version_bound), typeof(x[["version"]]))) {
     Abort(sprintf(
-      '`class(%1$s)` must be identical to `class(%2$s)` and `typeof(%1$s)` must be identical to `typeof(%2$s)`',
+      "`class(%1$s)` must be identical to `class(%2$s)` and `typeof(%1$s)` must be identical to `typeof(%2$s)`",
       version_bound_arg,
       # '{x_arg}[["version"]]' except adding parentheses if needed:
       rlang::expr_deparse(rlang::new_call(
         quote(`[[`), rlang::pairlist2(rlang::parse_expr(x_arg), "version")
       ))
-    ), class=sprintf("epiprocess__%s_has_invalid_class_or_typeof", version_bound_arg))
+    ), class = sprintf("epiprocess__%s_has_invalid_class_or_typeof", version_bound_arg))
   } else {
     # Looks like a valid version bound; exit without error.
     return(invisible(NULL))
@@ -71,15 +75,17 @@ validate_version_bound = function(version_bound, x, na_ok,
 #'   an `NA` version value
 #'
 #' @export
-max_version_with_row_in = function(x) {
+max_version_with_row_in <- function(x) {
   if (nrow(x) == 0L) {
     Abort(sprintf("`nrow(x)==0L`, representing a data set history with no row up through the latest observed version, but we don't have a sensible guess at what version that is, or whether any of the empty versions might be clobbered in the future; if we use `x` to form an `epi_archive`, then `clobberable_versions_start` and `versions_end` must be manually specified."),
-          class="epiprocess__max_version_cannot_be_used")
+      class = "epiprocess__max_version_cannot_be_used"
+    )
   } else {
-    version_col = purrr::pluck(x, "version") # error not NULL if doesn't exist
+    version_col <- purrr::pluck(x, "version") # error not NULL if doesn't exist
     if (anyNA(version_col)) {
       Abort("version values cannot be NA",
-            class="epiprocess__version_values_must_not_be_na")
+        class = "epiprocess__version_values_must_not_be_na"
+      )
     } else {
       version_bound <- max(version_col)
     }
@@ -92,13 +98,13 @@ max_version_with_row_in = function(x) {
 #' @return same class, typeof, and length as `x`
 #'
 #' @export
-next_after = function(x) UseMethod("next_after")
+next_after <- function(x) UseMethod("next_after")
 
 #' @export
-next_after.integer = function(x) x + 1L
+next_after.integer <- function(x) x + 1L
 
 #' @export
-next_after.Date = function(x) x + 1L
+next_after.Date <- function(x) x + 1L
 
 #' @title `epi_archive` object
 #'
@@ -110,7 +116,7 @@ next_after.Date = function(x) x + 1L
 #' @details An `epi_archive` is an R6 class which contains a data table `DT`, of
 #'   class `data.table` from the `data.table` package, with (at least) the
 #'   following columns:
-#' 
+#'
 #' * `geo_value`: the geographic value associated with each row of measurements.
 #' * `time_value`: the time value associated with each row of measurements.
 #' * `version`: the time value specifying the version for each row of
 #'   measurements. For example, if in a given row the `version` is January 15,
 #'   2022 and `time_value` is January 14, 2022, then this row contains the
 #'   measurements of the data for January 14, 2022 that were available one day
 #'   later.
 #'
 #' The data table `DT` has key variables `geo_value`, `time_value`, `version`,
 #' as well as any others (these can be specified when instantiating the
 #' `epi_archive` object via the `other_keys` argument, and/or set by operating
-#' on `DT` directly). Refer to the documentation for [as_epi_archive()] for 
+#' on `DT` directly). Refer to the documentation for [as_epi_archive()] for
 #' information and examples of relevant parameter names for an `epi_archive` object.
 #' Note that there can only be a single row per unique combination of
 #' key variables, and thus the key variables are critical for figuring out how
 #' to generate a snapshot of data from the archive, as of a given version.
-#' 
+#'
 #' In general, the last version of each observation is carried forward (LOCF) to
 #' fill in data between recorded versions, and between the last recorded
 #' update and the `versions_end`. One consequence is that the `DT`
 #' doesn't have to contain a full snapshot of every version (although this
 #' generally works), but can instead contain only the rows that are new or
 #' changed from the previous version (see `compactify`, which does this
 #' automatically). Currently, deletions must be represented as revising a row
 #' to a special state (e.g., making the entries `NA` or including a special
 #' column that flags the data as removed and performing some kind of
 #' post-processing), and the archive is unaware of what this state is.
 #'
 #' **A word of caution:** R6 objects, unlike most other objects in R, have
 #' reference semantics. A primary consequence of this is that objects are not
 #' copied when modified. You can read more about this in Hadley Wickham's
 #' [Advanced R](https://adv-r.hadley.nz/r6.html#r6-semantics) book. In order
 #' to construct a modified archive while keeping the original intact, first
 #' make a clone using the `$clone` method, then overwrite the clone's `DT`
 #' field with `data.table::copy(clone$DT)`, and finally perform the
 #' modifications on the clone.
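 #'
 #' A minimal sketch of that copying pattern, where `archive` is a hypothetical
 #' `epi_archive` standing in for any real one:
 #'
 #'     archive2 <- archive$clone() # copies fields, but `DT` is still aliased
 #'     archive2$DT <- data.table::copy(archive2$DT) # now safe to modify archive2$DT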
-#' 
+#'
 #' @section Metadata:
 #' The following pieces of metadata are included as fields in an `epi_archive`
 #' object:
@@ -191,478 +197,520 @@ next_after.Date = function(x) x + 1L
 #' @examples
 #' tib <- tibble::tibble(
 #'   geo_value = rep(c("ca", "hi"), each = 5),
-#'   time_value = rep(seq(as.Date("2020-01-01"),
-#'                        by = 1, length.out = 5), times = 2),
-#'   version = rep(seq(as.Date("2020-01-02"),
-#'                     by = 1, length.out = 5), times = 2),
+#'   time_value = rep(seq(as.Date("2020-01-01"),
+#'     by = 1, length.out = 5
+#'   ), times = 2),
+#'   version = rep(seq(as.Date("2020-01-02"),
+#'     by = 1, length.out = 5
+#'   ), times = 2),
 #'   value = rnorm(10, mean = 2, sd = 1)
 #' )
-#' 
-#' toy_epi_archive <- tib %>% epi_archive$new(geo_type = "state", 
-#'                                            time_type = "day") 
-#' toy_epi_archive 
-epi_archive = 
+#'
+#' toy_epi_archive <- tib %>% epi_archive$new(
+#'   geo_type = "state",
+#'   time_type = "day"
+#' )
+#' toy_epi_archive
+epi_archive <-
   R6::R6Class(
-  classname = "epi_archive",
-  #####
-  public = list(
-    DT = NULL,
-    geo_type = NULL,
-    time_type = NULL,
-    additional_metadata = NULL,
-    clobberable_versions_start = NULL,
-    versions_end = NULL,
-#' @description Creates a new `epi_archive` object.
-#' @param x A data frame, data table, or tibble, with columns `geo_value`,
-#'   `time_value`, `version`, and then any additional number of columns.
-#' @param geo_type Type for the geo values. If missing, then the function will
-#'   attempt to infer it from the geo values present; if this fails, then it
-#'   will be set to "custom".
-#' @param time_type Type for the time values. If missing, then the function will
-#'   attempt to infer it from the time values present; if this fails, then it
-#'   will be set to "custom".
-#' @param other_keys Character vector specifying the names of variables in `x`
-#'   that should be considered key variables (in the language of `data.table`)
-#'   apart from "geo_value", "time_value", and "version".
-#' @param additional_metadata List of additional metadata to attach to the
-#'   `epi_archive` object. The metadata will have `geo_type` and `time_type`
-#'   fields; named entries from the passed list or will be included as well.
-#' @param compactify Optional; Boolean or `NULL`: should we remove rows that are
-#'   considered redundant for the purposes of `epi_archive`'s built-in methods
-#'   such as `as_of`? As these methods use the last version of each observation
-#'   carried forward (LOCF) to interpolate between the version data provided,
-#'   rows that don't change these LOCF results can potentially be omitted to
-#'   save space while maintaining the same behavior (with the help of the
-#'   `clobberable_versions_start` and `versions_end` fields in some edge cases).
-#'   `TRUE` will remove these rows, `FALSE` will not, and missing or `NULL` will
-#'   remove these rows and issue a warning. Generally, this can be set to
-#'   `TRUE`, but if you directly inspect or edit the fields of the `epi_archive`
-#'   such as its `DT`, or rely on redundant updates to achieve a certain
-#'   behavior of the `ref_time_values` default in `epix_slide`, you will have to
-#'   determine whether `compactify=TRUE` will produce the desired results. If
-#'   compactification here is removing a large proportion of the rows, this may
-#'   indicate a potential for space, time, or bandwidth savings upstream the
-#'   data pipeline, e.g., by avoiding fetching, storing, or processing these
-#'   rows of `x`.
-#' @param clobberable_versions_start Optional; as in [`as_epi_archive`]
-#' @param versions_end Optional; as in [`as_epi_archive`]
-#' @return An `epi_archive` object.
-#' @importFrom data.table as.data.table key setkeyv
-#'
-#' @details
-#' Refer to the documentation for [as_epi_archive()] for more information
-#' and examples of parameter names.
-    initialize = function(x, geo_type, time_type, other_keys,
-                          additional_metadata, compactify,
-                          clobberable_versions_start, versions_end) {
-      # Check that we have a data frame
-      if (!is.data.frame(x)) {
-        Abort("`x` must be a data frame.")
-      }
-
-      # Check that we have geo_value, time_value, version columns
-      if (!("geo_value" %in% names(x))) {
-        Abort("`x` must contain a `geo_value` column.")
-      }
-      if (!("time_value" %in% names(x))) {
-        Abort("`x` must contain a `time_value` column.")
-      }
-      if (!("version" %in% names(x))) {
-        Abort("`x` must contain a `version` column.")
-      }
-      if (anyNA(x$version)) {
-        Abort("`x$version` must not contain `NA`s",
-              class = "epiprocess__version_values_must_not_be_na")
-      }
-
-      # If geo type is missing, then try to guess it
-      if (missing(geo_type)) {
-        geo_type = guess_geo_type(x$geo_value)
-      }
-
-      # If time type is missing, then try to guess it
-      if (missing(time_type)) {
-        time_type = guess_time_type(x$time_value)
-      }
-
-      # Finish off with small checks on keys variables and metadata
-      if (missing(other_keys)) other_keys = NULL
-      if (missing(additional_metadata)) additional_metadata = list()
-      if (!all(other_keys %in% names(x))) {
-        Abort("`other_keys` must be contained in the column names of `x`.")
-      }
-      if (any(c("geo_value", "time_value", "version") %in% other_keys)) {
-        Abort("`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\".")
-      }
-      if (any(names(additional_metadata) %in%
-              c("geo_type", "time_type"))) {
-        Warn("`additional_metadata` names overlap with existing metadata fields \"geo_type\", \"time_type\".")
-      }
-
-      # Conduct checks and apply defaults for `compactify`
-      if (missing(compactify)) {
-        compactify = NULL
-      } else if (!rlang::is_bool(compactify) &&
-                 !rlang::is_null(compactify)) {
-        Abort("compactify must be boolean or null.")
-      }
-
-      # Apply defaults and conduct checks for
-      # `clobberable_versions_start`, `versions_end`:
-      if (missing(clobberable_versions_start)) {
-        clobberable_versions_start <- NA
-      }
-      if (missing(versions_end)) {
-        versions_end <- max_version_with_row_in(x)
-      }
-      validate_version_bound(clobberable_versions_start, x, na_ok=TRUE)
-      validate_version_bound(versions_end, x, na_ok=FALSE)
-      if (nrow(x) > 0L && versions_end < max(x[["version"]])) {
-        Abort(sprintf("`versions_end` was %s, but `x` contained
                       updates for a later version or versions, up through %s",
-                      versions_end, max(x[["version"]])),
-              class="epiprocess__versions_end_earlier_than_updates")
-      }
-      if (!is.na(clobberable_versions_start) && clobberable_versions_start > versions_end) {
-        Abort(sprintf("`versions_end` was %s, but a `clobberable_versions_start`
                       of %s indicated that there were later observed versions",
-                      versions_end, clobberable_versions_start),
-              class="epiprocess__versions_end_earlier_than_clobberable_versions_start")
-      }
+    classname = "epi_archive",
+    #####
+    public = list(
+      DT = NULL,
+      geo_type = NULL,
+      time_type = NULL,
+      additional_metadata = NULL,
+      clobberable_versions_start = NULL,
+      versions_end = NULL,
+      #' @description Creates a new `epi_archive` object.
+      #' @param x A data frame, data table, or tibble, with columns `geo_value`,
+      #'   `time_value`, `version`, and then any additional number of columns.
+      #' @param geo_type Type for the geo values. If missing, then the function will
+      #'   attempt to infer it from the geo values present; if this fails, then it
+      #'   will be set to "custom".
+      #' @param time_type Type for the time values. If missing, then the function will
+      #'   attempt to infer it from the time values present; if this fails, then it
+      #'   will be set to "custom".
+      #' @param other_keys Character vector specifying the names of variables in `x`
+      #'   that should be considered key variables (in the language of `data.table`)
+      #'   apart from "geo_value", "time_value", and "version".
+      #' @param additional_metadata List of additional metadata to attach to the
+      #'   `epi_archive` object. The metadata will have `geo_type` and `time_type`
+      #'   fields; named entries from the passed list will be included as well.
+      #' @param compactify Optional; Boolean or `NULL`: should we remove rows that are
+      #'   considered redundant for the purposes of `epi_archive`'s built-in methods
+      #'   such as `as_of`? As these methods use the last version of each observation
+      #'   carried forward (LOCF) to interpolate between the version data provided,
+      #'   rows that don't change these LOCF results can potentially be omitted to
+      #'   save space while maintaining the same behavior (with the help of the
+      #'   `clobberable_versions_start` and `versions_end` fields in some edge cases).
+      #'   `TRUE` will remove these rows, `FALSE` will not, and missing or `NULL` will
+      #'   remove these rows and issue a warning. Generally, this can be set to
+      #'   `TRUE`, but if you directly inspect or edit the fields of the `epi_archive`
+      #'   such as its `DT`, or rely on redundant updates to achieve a certain
+      #'   behavior of the `ref_time_values` default in `epix_slide`, you will have to
+      #'   determine whether `compactify=TRUE` will produce the desired results. If
+      #'   compactification here is removing a large proportion of the rows, this may
+      #'   indicate a potential for space, time, or bandwidth savings upstream the
+      #'   data pipeline, e.g., by avoiding fetching, storing, or processing these
+      #'   rows of `x`.
+      #' @param clobberable_versions_start Optional; as in [`as_epi_archive`]
+      #' @param versions_end Optional; as in [`as_epi_archive`]
+      #' @return An `epi_archive` object.
+      #' @importFrom data.table as.data.table key setkeyv
+      #'
+      #' @details
+      #' Refer to the documentation for [as_epi_archive()] for more information
+      #' and examples of parameter names.
+      initialize = function(x, geo_type, time_type, other_keys,
+                            additional_metadata, compactify,
+                            clobberable_versions_start, versions_end) {
+        # Check that we have a data frame
+        if (!is.data.frame(x)) {
+          Abort("`x` must be a data frame.")
+        }
+
+        # Check that we have geo_value, time_value, version columns
+        if (!("geo_value" %in% names(x))) {
+          Abort("`x` must contain a `geo_value` column.")
+        }
+        if (!("time_value" %in% names(x))) {
+          Abort("`x` must contain a `time_value` column.")
+        }
+        if (!("version" %in% names(x))) {
+          Abort("`x` must contain a `version` column.")
+        }
+        if (anyNA(x$version)) {
+          Abort("`x$version` must not contain `NA`s",
+            class = "epiprocess__version_values_must_not_be_na"
+          )
+        }
+
+        # If geo type is missing, then try to guess it
+        if (missing(geo_type)) {
+          geo_type <- guess_geo_type(x$geo_value)
+        }
+
+        # If time type is missing, then try to guess it
+        if (missing(time_type)) {
+          time_type <- guess_time_type(x$time_value)
+        }
+
+        # Finish off with small checks on key variables and metadata
+        if (missing(other_keys)) other_keys <- NULL
+        if (missing(additional_metadata)) additional_metadata <- list()
+        if (!all(other_keys %in% names(x))) {
+          Abort("`other_keys` must be contained in the column names of `x`.")
+        }
+        if (any(c("geo_value", "time_value", "version") %in% other_keys)) {
+          Abort("`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\".")
+        }
+        if (any(names(additional_metadata) %in%
+          c("geo_type", "time_type"))) {
+          Warn("`additional_metadata` names overlap with existing metadata fields \"geo_type\", \"time_type\".")
+        }
+
+        # Conduct checks and apply defaults for `compactify`
+        if (missing(compactify)) {
+          compactify <- NULL
+        } else if (!rlang::is_bool(compactify) &&
+          !rlang::is_null(compactify)) {
+          Abort("compactify must be boolean or null.")
+        }
+
+        # Apply defaults and conduct checks for
+        # `clobberable_versions_start`, `versions_end`:
+        if (missing(clobberable_versions_start)) {
+          clobberable_versions_start <- NA
+        }
+        if (missing(versions_end)) {
+          versions_end <- max_version_with_row_in(x)
+        }
+        validate_version_bound(clobberable_versions_start, x, na_ok = TRUE)
+        validate_version_bound(versions_end, x, na_ok = FALSE)
+        if (nrow(x) > 0L && versions_end < max(x[["version"]])) {
+          Abort(
+            sprintf(
+              "`versions_end` was %s, but `x` contained
                       updates for a later version or versions, up through %s",
+              versions_end, max(x[["version"]])
+            ),
+            class = "epiprocess__versions_end_earlier_than_updates"
+          )
+        }
+        if (!is.na(clobberable_versions_start) && clobberable_versions_start > versions_end) {
+          Abort(
            sprintf(
+              "`versions_end` was %s, but a `clobberable_versions_start`
                       of %s indicated that there were later observed versions",
+              versions_end, clobberable_versions_start
+            ),
+            class = "epiprocess__versions_end_earlier_than_clobberable_versions_start"
+          )
+        }
 
-      # --- End of validation and replacing missing args with defaults ---
+        # --- End of validation and replacing missing args with defaults ---
 
-      # Create the data table; if x was an un-keyed data.table itself,
-      # then the call to as.data.table() will fail to set keys, so we
-      # need to check this, then do it manually if needed
-      key_vars = c("geo_value", "time_value", other_keys, "version")
-      DT = as.data.table(x, key = key_vars)
-      if (!identical(key_vars, key(DT))) setkeyv(DT, cols = key_vars)
+        # Create the data table; if x was an un-keyed data.table itself,
+        # then the call to as.data.table() will fail to set keys, so we
+        # need to check this, then do it manually if needed
+        key_vars <- c("geo_value", "time_value", other_keys, "version")
+        DT <- as.data.table(x, key = key_vars)
+        if (!identical(key_vars, key(DT))) setkeyv(DT, cols = key_vars)
 
-      maybe_first_duplicate_key_row_index = anyDuplicated(DT, by=key(DT))
-      if (maybe_first_duplicate_key_row_index != 0L) {
-        Abort("`x` must have one row per unique combination of the key variables. If you have additional key variables other than `geo_value`, `time_value`, and `version`, such as an age group column, please specify them in `other_keys`. Otherwise, check for duplicate rows and/or conflicting values for the same measurement.",
-              class = "epiprocess__epi_archive_requires_unique_key")
-      }
+        maybe_first_duplicate_key_row_index <- anyDuplicated(DT, by = key(DT))
+        if (maybe_first_duplicate_key_row_index != 0L) {
+          Abort("`x` must have one row per unique combination of the key variables. If you have additional key variables other than `geo_value`, `time_value`, and `version`, such as an age group column, please specify them in `other_keys`. Otherwise, check for duplicate rows and/or conflicting values for the same measurement.",
+            class = "epiprocess__epi_archive_requires_unique_key"
+          )
+        }
 
-      # Checks to see if a value in a vector is LOCF
-      is_locf <- function(vec) {
-        dplyr::if_else(!is.na(vec) & !is.na(dplyr::lag(vec)),
-                       vec == dplyr::lag(vec),
-                       is.na(vec) & is.na(dplyr::lag(vec)))
-      }
-
-      # LOCF is defined by a row where all values except for the version
-      # differ from their respective lag values
-
-      # Checks for LOCF's in a data frame
-      rm_locf <- function(df) {
-        dplyr::filter(df,if_any(c(everything(),-version),~ !is_locf(.)))
-      }
-
-      # Keeps LOCF values, such as to be printed
-      keep_locf <- function(df) {
-        dplyr::filter(df,if_all(c(everything(),-version),~ is_locf(.)))
-      }
-
-      # Runs compactify on data frame
-      if (is.null(compactify) || compactify == TRUE) {
-        elim = keep_locf(DT)
-        DT = rm_locf(DT)
-      } else {
-        # Create empty data frame for nrow(elim) to be 0
-        elim = tibble::tibble()
-      }
-
-      # Warns about redundant rows
-      if (is.null(compactify) && nrow(elim) > 0) {
-        warning_intro <- break_str(paste(
-          'Found rows that appear redundant based on',
-          'last (version of each) observation carried forward;',
-          'these rows have been removed to "compactify" and save space:'
-        ))
-
-        warning_data <- paste(collapse="\n", capture.output(print(elim, topn=3L, nrows=7L)))
-
-        warning_outro <- break_str(paste(
-          "Built-in `epi_archive` functionality should be unaffected,",
-          "but results may change if you work directly with its fields (such as `DT`).",
-          "See `?as_epi_archive` for details.",
-          "To silence this warning but keep compactification,",
-          "you can pass `compactify=TRUE` when constructing the archive."
-        ))
-
-        warning_message <- paste(sep="\n", warning_intro, warning_data, warning_outro)
-
-        rlang::warn(warning_message, class="epiprocess__compactify_default_removed_rows")
-      }
-
-      # Instantiate all self variables
-      self$DT = DT
-      self$geo_type = geo_type
-      self$time_type = time_type
-      self$additional_metadata = additional_metadata
-      self$clobberable_versions_start = clobberable_versions_start
-      self$versions_end = versions_end
-    },
-    print = function(class = TRUE, methods = TRUE) {
-      if (class) cat("An `epi_archive` object, with metadata:\n")
-      cat(sprintf("* %-9s = %s\n", "geo_type", self$geo_type))
-      cat(sprintf("* %-9s = %s\n", "time_type", self$time_type))
-      if (!is.null(self$additional_metadata)) {
-        sapply(self$additional_metadata, function(m) {
-          cat(sprintf("* %-9s = %s\n", names(m), m))
-        })
-      }
-      cat("----------\n")
-      if (length(self$DT$time_value) == 0 || all(is.na(self$DT$time_value))) {
-        min_time = max_time = NA
-      } else {
-        min_time = Min(self$DT$time_value)
-        max_time = Max(self$DT$time_value)
-      }
-      cat(sprintf("* %-14s = %s\n", "min time value", min_time))
-      cat(sprintf("* %-14s = %s\n", "max time value", max_time))
-      cat(sprintf("* %-14s = %s\n", "first version with update",
-                  min(self$DT$version)))
-      cat(sprintf("* %-14s = %s\n", "last version with update",
-                  max(self$DT$version)))
-      if (is.na(self$clobberable_versions_start)) {
-        cat("* No clobberable versions\n")
-      } else {
-        cat(sprintf("* %-14s = %s\n", "clobberable versions start",
-                    self$clobberable_versions_start))
-      }
-      cat(sprintf("* %-14s = %s\n", "versions end",
-                  self$versions_end))
-      cat("----------\n")
-      cat(sprintf("Data archive (stored in DT field): %i x %i\n",
-                  nrow(self$DT), ncol(self$DT)))
-      cat(sprintf("Columns in DT: %s\n", paste(ifelse(length(
-        colnames(self$DT)) <= 4, paste(colnames(self$DT), collapse = ", "),
-        paste(paste(colnames(self$DT)[1:4], collapse = ", "), "and",
-              length(colnames(self$DT)[5:length(colnames(self$DT))]), "more columns")))))
-      if (methods) {
-        cat("----------\n")
-        writeLines(wrap_varnames(initial = "Public R6 methods: ",
-                                 names(epi_archive$public_methods)))
-      }
-    },
-    #####
-#' @description Generates a snapshot in `epi_df` format as of a given version.
-#' See the documentation for the wrapper function [`epix_as_of()`] for details.
-#' @importFrom data.table between key
-    as_of = function(max_version, min_time_value = -Inf, all_versions = FALSE) {
-      # Self max version and other keys
-      other_keys = setdiff(key(self$DT),
-                           c("geo_value", "time_value", "version"))
-      if (length(other_keys) == 0) other_keys = NULL
-
-      # Check a few things on max_version
-      if (!identical(class(max_version), class(self$DT$version)) ||
-          !identical(typeof(max_version), typeof(self$DT$version))) {
-        Abort("`max_version` and `DT$version` must have same `class` and `typeof`.")
-      }
-      if (length(max_version) != 1) {
-        Abort("`max_version` cannot be a vector.")
-      }
-      if (is.na(max_version)) {
-        Abort("`max_version` must not be NA.")
-      }
-      if (max_version > self$versions_end) {
-        Abort("`max_version` must be at most `self$versions_end`.")
-      }
-      if (!rlang::is_bool(all_versions)) {
-        Abort("`all_versions` must be TRUE or FALSE.")
-      }
-      if (!is.na(self$clobberable_versions_start) && max_version >= self$clobberable_versions_start) {
-        Warn('Getting data as of some recent version which could still be overwritten (under routine circumstances) without assigning a new version number (a.k.a. "clobbered"). Thus, the snapshot that we produce here should not be expected to be reproducible later. See `?epi_archive` for more info and `?epix_as_of` on how to muffle.',
-             class="epiprocess__snapshot_as_of_clobberable_version")
-      }
-
-      # Filter by version and return
-      if (all_versions) {
-        result = epix_truncate_versions_after(self, max_version)
-        # `self` has already been `clone`d in `epix_truncate_versions_after`
-        # so we can modify the new archive's DT directly.
-        result$DT = result$DT[time_value >= min_time_value, ]
-        return(result)
-      }
-
-      return(
-        # Make sure to use data.table ways of filtering and selecting
-        self$DT[time_value >= min_time_value &
-                  version <= max_version, ] %>%
-          unique(by = c("geo_value", "time_value", other_keys),
-                 fromLast = TRUE) %>%
-          tibble::as_tibble() %>%
-          # (`as_tibble` should de-alias the DT and its columns in any edge
-          # cases where they are aliased. We don't say we guarantee this
-          # though.)
-          dplyr::select(-"version") %>%
-          as_epi_df(geo_type = self$geo_type,
-                    time_type = self$time_type,
-                    as_of = max_version,
-                    additional_metadata = c(self$additional_metadata,
-                                            other_keys = other_keys))
-      )
-    },
-    #####
-#' @description Fill in unobserved history using requested scheme by mutating
-#' `self` and potentially reseating its fields. See
-#' [`epix_fill_through_version`] for a full description of the non-R6-method
-#' version, which doesn't mutate the input archive but might alias its fields.
-#'
-#' @param fill_versions_end as in [`epix_fill_through_version`]
-#' @param how as in [`epix_fill_through_version`]
-#'
-#' @importFrom data.table key setkeyv := address copy
-#' @importFrom rlang arg_match
-    fill_through_version = function(fill_versions_end,
-                                    how=c("na", "locf")) {
-      validate_version_bound(fill_versions_end, self$DT, na_ok=FALSE)
-      how <- arg_match(how)
-      if (self$versions_end < fill_versions_end) {
-        new_DT = switch(
-          how,
-          "na" = {
-            # old DT + a version consisting of all NA observations
-            # immediately after the last currently/actually-observed
-            # version. Note that this NA-observation version must only be
-            # added if `self` is outdated.
-            nonversion_key_cols = setdiff(key(self$DT), "version")
-            nonkey_cols = setdiff(names(self$DT), key(self$DT))
-            next_version_tag = next_after(self$versions_end)
-            if (next_version_tag > fill_versions_end) {
-              Abort(sprintf(paste(
-                "Apparent problem with `next_after` method:",
-                "archive contained observations through version %s",
-                "and the next possible version was supposed to be %s,",
-                "but this appeared to jump from a version < %3$s",
-                "to one > %3$s, implying at least one version in between."
-              ), self$versions_end, next_version_tag, fill_versions_end))
-            }
-            nonversion_key_vals_ever_recorded = unique(self$DT, by=nonversion_key_cols)
-            # In edge cases, the `unique` result can alias the original
-            # DT; detect and copy if necessary:
-            if (identical(address(self$DT), address(nonversion_key_vals_ever_recorded))) {
-              nonversion_key_vals_ever_recorded <- copy(nonversion_key_vals_ever_recorded)
-            }
-            next_version_DT = nonversion_key_vals_ever_recorded[
-              , version := next_version_tag][
-              # this makes the class of these columns logical (`NA` is a
-              # logical NA; we're relying on the rbind below to convert to
-              # the proper class&typeof)
-              , (nonkey_cols) := NA]
-            # full result DT:
-            setkeyv(rbind(self$DT, next_version_DT), key(self$DT))[]
-          },
-          "locf" = {
-            # just the old DT; LOCF is built into other methods:
-            self$DT
-          }
-        )
-        new_versions_end = fill_versions_end
-        # Update `self` all at once with simple, error-free operations +
-        # return below:
-        self$DT <- new_DT
-        self$versions_end <- new_versions_end
-      } else {
-        # Already sufficiently up to date; nothing to do.
-      }
-      return (invisible(self))
-    },
-    #####
-#' @description Filter to keep only older versions, mutating the archive by
-#' potentially reseating but not mutating some fields. `DT` is likely, but not
-#' guaranteed, to be copied. Returns the mutated archive
-#' [invisibly][base::invisible].
-#' @param x as in [`epix_truncate_versions_after`]
-#' @param max_version as in [`epix_truncate_versions_after`]
-    truncate_versions_after = function(max_version) {
-      if (length(max_version) != 1) {
-        Abort("`max_version` cannot be a vector.")
-      }
-      if (is.na(max_version)) {
-        Abort("`max_version` must not be NA.")
-      }
-      if (!identical(class(max_version), class(self$DT$version)) ||
-          !identical(typeof(max_version), typeof(self$DT$version))) {
-        Abort("`max_version` and `DT$version` must have same `class` and `typeof`.")
-      }
-      if (max_version > self$versions_end) {
-        Abort("`max_version` must be at most `self$versions_end`.")
-      }
-      self$DT <- self$DT[self$DT$version <= max_version, colnames(self$DT), with=FALSE]
-      # (^ this filter operation seems to always copy the DT, even if it
-      # keeps every entry; we don't guarantee this behavior in
-      # documentation, though, so we could change to alias in this case)
-      if (!is.na(self$clobberable_versions_start) &&
-          self$clobberable_versions_start > max_version) {
-        self$clobberable_versions_start <- NA
-      }
-      self$versions_end <- max_version
-      return (invisible(self))
-    },
-    #####
-#' @description Merges another `epi_archive` with the current one, mutating the
-#' current one by reseating its `DT` and several other fields, but avoiding
-#' mutation of the old `DT`; returns the current archive
-#' [invisibly][base::invisible]. See [`epix_merge`] for a full description
-#' of the non-R6-method version, which does not mutate either archive, and
-#' does not alias either archive's `DT`.
-#' @param y as in [`epix_merge`]
-#' @param sync as in [`epix_merge`]
-#' @param compactify as in [`epix_merge`]
-    merge = function(y, sync = c("forbid","na","locf","truncate"), compactify = TRUE) {
-      result = epix_merge(self, y,
-                          sync = sync,
-                          compactify = compactify)
-
-      if (length(epi_archive$private_fields) != 0L) {
-        Abort("expected no private fields in epi_archive",
-              internal=TRUE)
-      }
-
-      # Mutate fields all at once, trying to avoid any potential errors:
-      for (field_name in names(epi_archive$public_fields)) {
-        self[[field_name]] <- result[[field_name]]
-      }
-
-      return (invisible(self))
-    },
-    #####
-    group_by = function(..., .add = FALSE, .drop = dplyr::group_by_drop_default(self)) {
-      group_by.epi_archive(self, ..., .add=.add, .drop=.drop)
-    },
-#' @description Slides a given function over variables in an `epi_archive`
-#' object. See the documentation for the wrapper function [`epix_slide()`] for
-#' details.
-#' @importFrom data.table key
-#' @importFrom rlang !! !!! enquo quo_is_missing enquos is_quosure sym syms
-    slide = function(f, ..., before, ref_time_values,
-                     time_step, new_col_name = "slide_value",
-                     as_list_col = FALSE, names_sep = "_",
-                     all_versions = FALSE) {
-      # For an "ungrouped" slide, treat all rows as belonging to one big
-      # group (group by 0 vars), like `dplyr::summarize`, and let the
-      # resulting `grouped_epi_archive` handle the slide:
-      self$group_by()$slide(
-        f, ...,
-        before = before, ref_time_values = ref_time_values,
-        time_step = time_step, new_col_name = new_col_name,
-        as_list_col = as_list_col, names_sep = names_sep,
-        all_versions = all_versions
-      ) %>%
-        # We want a slide on ungrouped archives to output something
-        # ungrouped, rather than retaining the trivial (0-variable)
-        # grouping applied above. So we `ungroup()`. However, the current
-        # `dplyr` implementation automatically ignores/drops trivial
-        # groupings, so this is just a no-op for now.
-        ungroup()
-    }
-  )
-  )
+        # Checks to see if a value in a vector is LOCF
+        is_locf <- function(vec) {
+          dplyr::if_else(!is.na(vec) & !is.na(dplyr::lag(vec)),
+            vec == dplyr::lag(vec),
+            is.na(vec) & is.na(dplyr::lag(vec))
+          )
+        }
+
+        # LOCF is defined by a row where all values except for the version
+        # differ from their respective lag values
+
+        # Checks for LOCF's in a data frame
+        rm_locf <- function(df) {
+          dplyr::filter(df, if_any(c(everything(), -version), ~ !is_locf(.)))
+        }
+
+        # Keeps LOCF values, such as to be printed
+        keep_locf <- function(df) {
+          dplyr::filter(df, if_all(c(everything(), -version), ~ is_locf(.)))
+        }
+
+        # Runs compactify on data frame
+        if (is.null(compactify) || compactify == TRUE) {
+          elim <- keep_locf(DT)
+          DT <- rm_locf(DT)
+        } else {
+          # Create empty data frame for nrow(elim) to be 0
+          elim <- tibble::tibble()
+        }
+
+        # Warns about redundant rows
+        if (is.null(compactify) && nrow(elim) > 0) {
+          warning_intro <- break_str(paste(
+            "Found rows that appear redundant based on",
+            "last (version of each) observation carried forward;",
+            'these rows have been removed to "compactify" and save space:'
+          ))
+
+          warning_data <- paste(collapse = "\n", capture.output(print(elim, topn = 3L, nrows = 7L)))
+
+          warning_outro <- break_str(paste(
+            "Built-in `epi_archive` functionality should be unaffected,",
+            "but results may change if you work directly with its fields (such as `DT`).",
+            "See `?as_epi_archive` for details.",
+            "To silence this warning but keep compactification,",
+            "you can pass `compactify=TRUE` when constructing the archive."
+          ))
+
+          warning_message <- paste(sep = "\n", warning_intro, warning_data, warning_outro)
+
+          rlang::warn(warning_message, class = "epiprocess__compactify_default_removed_rows")
+        }
+
+        # Instantiate all self variables
+        self$DT <- DT
+        self$geo_type <- geo_type
+        self$time_type <- time_type
+        self$additional_metadata <- additional_metadata
+        self$clobberable_versions_start <- clobberable_versions_start
+        self$versions_end <- versions_end
+      },
+      print = function(class = TRUE, methods = TRUE) {
+        if (class) cat("An `epi_archive` object, with metadata:\n")
+        cat(sprintf("* %-9s = %s\n", "geo_type", self$geo_type))
+        cat(sprintf("* %-9s = %s\n", "time_type", self$time_type))
+        if (!is.null(self$additional_metadata)) {
+          sapply(self$additional_metadata, function(m) {
+            cat(sprintf("* %-9s = %s\n", names(m), m))
+          })
+        }
+        cat("----------\n")
+        if (length(self$DT$time_value) == 0 || all(is.na(self$DT$time_value))) {
+          min_time <- max_time <- NA
+        } else {
+          min_time <- Min(self$DT$time_value)
+          max_time <- Max(self$DT$time_value)
+        }
+        cat(sprintf("* %-14s = %s\n", "min time value", min_time))
+        cat(sprintf("* %-14s = %s\n", "max time value", max_time))
+        cat(sprintf(
+          "* %-14s = %s\n", "first version with update",
+          min(self$DT$version)
+        ))
+        cat(sprintf(
+          "* %-14s = %s\n", "last version with update",
+          max(self$DT$version)
+        ))
+        if (is.na(self$clobberable_versions_start)) {
+          cat("* No clobberable versions\n")
+        } else {
+          cat(sprintf(
+            "* %-14s = %s\n", "clobberable versions start",
+            self$clobberable_versions_start
+          ))
+        }
+        cat(sprintf(
+          "* %-14s = %s\n", "versions end",
+          self$versions_end
+        ))
+        cat("----------\n")
+        cat(sprintf(
+          "Data archive (stored in DT field): %i x %i\n",
+          nrow(self$DT), ncol(self$DT)
+        ))
+        cat(sprintf("Columns in DT: %s\n", paste(ifelse(length(
+          colnames(self$DT)
+        ) <= 4, paste(colnames(self$DT), collapse = ", "),
+        paste(
+          paste(colnames(self$DT)[1:4], collapse = ", "), "and",
+          length(colnames(self$DT)[5:length(colnames(self$DT))]), "more columns"
+        )
+        ))))
+        if (methods) {
+          cat("----------\n")
+          writeLines(wrap_varnames(
+            initial = "Public R6 methods: ",
+            names(epi_archive$public_methods)
+          ))
+        }
+      },
+      #####
+      #' @description Generates a snapshot in `epi_df` format as of a given version.
+      #' See the documentation for the wrapper function [`epix_as_of()`] for details.
+      #' @importFrom data.table between key
+      as_of = function(max_version, min_time_value = -Inf, all_versions = FALSE) {
+        # Self max version and other keys
+        other_keys <- setdiff(
+          key(self$DT),
+          c("geo_value", "time_value", "version")
+        )
+        if (length(other_keys) == 0) other_keys <- NULL
+
+        # Check a few things on max_version
+        if (!identical(class(max_version), class(self$DT$version)) ||
+          !identical(typeof(max_version), typeof(self$DT$version))) {
+          Abort("`max_version` and `DT$version` must have same `class` and `typeof`.")
+        }
+        if (length(max_version) != 1) {
+          Abort("`max_version` cannot be a vector.")
+        }
+        if (is.na(max_version)) {
+          Abort("`max_version` must not be NA.")
+        }
+        if (max_version > self$versions_end) {
+          Abort("`max_version` must be at most `self$versions_end`.")
+        }
+        if (!rlang::is_bool(all_versions)) {
+          Abort("`all_versions` must be TRUE or FALSE.")
+        }
+        if (!is.na(self$clobberable_versions_start) && max_version >= self$clobberable_versions_start) {
+          Warn('Getting data as of some recent version which could still be overwritten (under routine circumstances) without assigning a new version number (a.k.a. "clobbered"). Thus, the snapshot that we produce here should not be expected to be reproducible later. See `?epi_archive` for more info and `?epix_as_of` on how to muffle.',
+            class = "epiprocess__snapshot_as_of_clobberable_version"
+          )
+        }
+
+        # Filter by version and return
+        if (all_versions) {
+          result <- epix_truncate_versions_after(self, max_version)
+          # `self` has already been `clone`d in `epix_truncate_versions_after`
+          # so we can modify the new archive's DT directly.
+          result$DT <- result$DT[time_value >= min_time_value, ]
+          return(result)
+        }
+
+        return(
+          # Make sure to use data.table ways of filtering and selecting
+          self$DT[time_value >= min_time_value &
+            version <= max_version, ] %>%
+            unique(
+              by = c("geo_value", "time_value", other_keys),
+              fromLast = TRUE
+            ) %>%
+            tibble::as_tibble() %>%
+            # (`as_tibble` should de-alias the DT and its columns in any edge
+            # cases where they are aliased. We don't say we guarantee this
+            # though.)
+            dplyr::select(-"version") %>%
+            as_epi_df(
+              geo_type = self$geo_type,
+              time_type = self$time_type,
+              as_of = max_version,
+              additional_metadata = c(self$additional_metadata,
+                other_keys = other_keys
+              )
+            )
+        )
+      },
+      #####
+      #' @description Fill in unobserved history using requested scheme by mutating
+      #' `self` and potentially reseating its fields. See
+      #' [`epix_fill_through_version`] for a full description of the non-R6-method
+      #' version, which doesn't mutate the input archive but might alias its fields.
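+      #'
+      #' A hedged usage sketch (`archive` is a hypothetical `epi_archive` whose
+      #' versions are `Date`s):
+      #'
+      #'     archive$fill_through_version(as.Date("2020-06-10"), how = "na")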
+      #'
+      #' @param fill_versions_end as in [`epix_fill_through_version`]
+      #' @param how as in [`epix_fill_through_version`]
+      #'
+      #' @importFrom data.table key setkeyv := address copy
+      #' @importFrom rlang arg_match
+      fill_through_version = function(fill_versions_end,
+                                      how = c("na", "locf")) {
+        validate_version_bound(fill_versions_end, self$DT, na_ok = FALSE)
+        how <- arg_match(how)
+        if (self$versions_end < fill_versions_end) {
+          new_DT <- switch(how,
+            "na" = {
+              # old DT + a version consisting of all NA observations
+              # immediately after the last currently/actually-observed
+              # version. Note that this NA-observation version must only be
+              # added if `self` is outdated.
+              nonversion_key_cols <- setdiff(key(self$DT), "version")
+              nonkey_cols <- setdiff(names(self$DT), key(self$DT))
+              next_version_tag <- next_after(self$versions_end)
+              if (next_version_tag > fill_versions_end) {
+                Abort(sprintf(paste(
+                  "Apparent problem with `next_after` method:",
+                  "archive contained observations through version %s",
+                  "and the next possible version was supposed to be %s,",
+                  "but this appeared to jump from a version < %3$s",
+                  "to one > %3$s, implying at least one version in between."
+                ), self$versions_end, next_version_tag, fill_versions_end))
+              }
+              nonversion_key_vals_ever_recorded <- unique(self$DT, by = nonversion_key_cols)
+              # In edge cases, the `unique` result can alias the original
+              # DT; detect and copy if necessary:
+              if (identical(address(self$DT), address(nonversion_key_vals_ever_recorded))) {
+                nonversion_key_vals_ever_recorded <- copy(nonversion_key_vals_ever_recorded)
+              }
+              next_version_DT <- nonversion_key_vals_ever_recorded[
+                , version := next_version_tag
+              ][
+                # this makes the class of these columns logical (`NA` is a
+                # logical NA; we're relying on the rbind below to convert to
+                # the proper class&typeof)
+                , (nonkey_cols) := NA
+              ]
+              # full result DT:
+              setkeyv(rbind(self$DT, next_version_DT), key(self$DT))[]
+            },
+            "locf" = {
+              # just the old DT; LOCF is built into other methods:
+              self$DT
+            }
+          )
+          new_versions_end <- fill_versions_end
+          # Update `self` all at once with simple, error-free operations +
+          # return below:
+          self$DT <- new_DT
+          self$versions_end <- new_versions_end
+        } else {
+          # Already sufficiently up to date; nothing to do.
+        }
+        return(invisible(self))
+      },
+      #####
+      #' @description Filter to keep only older versions, mutating the archive by
+      #' potentially reseating but not mutating some fields. `DT` is likely, but not
+      #' guaranteed, to be copied. Returns the mutated archive
+      #' [invisibly][base::invisible].
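+      #'
+      #' A hedged usage sketch (`archive` is again a hypothetical `epi_archive`):
+      #'
+      #'     archive$truncate_versions_after(as.Date("2020-06-15"))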
+      #' @param x as in [`epix_truncate_versions_after`]
+      #' @param max_version as in [`epix_truncate_versions_after`]
+      truncate_versions_after = function(max_version) {
+        if (length(max_version) != 1) {
+          Abort("`max_version` cannot be a vector.")
+        }
+        if (is.na(max_version)) {
+          Abort("`max_version` must not be NA.")
+        }
+        if (!identical(class(max_version), class(self$DT$version)) ||
+          !identical(typeof(max_version), typeof(self$DT$version))) {
+          Abort("`max_version` and `DT$version` must have same `class` and `typeof`.")
+        }
+        if (max_version > self$versions_end) {
+          Abort("`max_version` must be at most `self$versions_end`.")
+        }
+        self$DT <- self$DT[self$DT$version <= max_version, colnames(self$DT), with = FALSE]
+        # (^ this filter operation seems to always copy the DT, even if it
+        # keeps every entry; we don't guarantee this behavior in
+        # documentation, though, so we could change to alias in this case)
+        if (!is.na(self$clobberable_versions_start) &&
+          self$clobberable_versions_start > max_version) {
+          self$clobberable_versions_start <- NA
+        }
+        self$versions_end <- max_version
+        return(invisible(self))
+      },
+      #####
+      #' @description Merges another `epi_archive` with the current one, mutating the
+      #' current one by reseating its `DT` and several other fields, but avoiding
+      #' mutation of the old `DT`; returns the current archive
+      #' [invisibly][base::invisible]. See [`epix_merge`] for a full description
+      #' of the non-R6-method version, which does not mutate either archive, and
+      #' does not alias either archive's `DT`.
+      #' @param y as in [`epix_merge`]
+      #' @param sync as in [`epix_merge`]
+      #' @param compactify as in [`epix_merge`]
+      merge = function(y, sync = c("forbid", "na", "locf", "truncate"), compactify = TRUE) {
+        result <- epix_merge(self, y,
+          sync = sync,
+          compactify = compactify
+        )
+
+        if (length(epi_archive$private_fields) != 0L) {
+          Abort("expected no private fields in epi_archive",
+            internal = TRUE
+          )
+        }
+
+        # Mutate fields all at once, trying to avoid any potential errors:
+        for (field_name in names(epi_archive$public_fields)) {
+          self[[field_name]] <- result[[field_name]]
+        }
+
+        return(invisible(self))
+      },
+      #####
+      group_by = function(..., .add = FALSE, .drop = dplyr::group_by_drop_default(self)) {
+        group_by.epi_archive(self, ..., .add = .add, .drop = .drop)
+      },
+      #' @description Slides a given function over variables in an `epi_archive`
+      #' object. See the documentation for the wrapper function [`epix_slide()`] for
+      #' details.
+      #' @importFrom data.table key
+      #' @importFrom rlang !! !!! enquo quo_is_missing enquos is_quosure sym syms
+      slide = function(f, ..., before, ref_time_values,
+                       time_step, new_col_name = "slide_value",
+                       as_list_col = FALSE, names_sep = "_",
+                       all_versions = FALSE) {
+        # For an "ungrouped" slide, treat all rows as belonging to one big
+        # group (group by 0 vars), like `dplyr::summarize`, and let the
+        # resulting `grouped_epi_archive` handle the slide:
+        self$group_by()$slide(
+          f, ...,
+          before = before, ref_time_values = ref_time_values,
+          time_step = time_step, new_col_name = new_col_name,
+          as_list_col = as_list_col, names_sep = names_sep,
+          all_versions = all_versions
+        ) %>%
+          # We want a slide on ungrouped archives to output something
+          # ungrouped, rather than retaining the trivial (0-variable)
+          # grouping applied above. So we `ungroup()`. However, the current
+          # `dplyr` implementation automatically ignores/drops trivial
+          # groupings, so this is just a no-op for now.
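+          # (In other words, an ungrouped `archive$slide(...)` behaves like
+          # `archive$group_by()$slide(...) %>% ungroup()`.)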
+ ungroup() + } + ) + ) #' Convert to `epi_archive` format #' @@ -739,41 +787,55 @@ epi_archive = #' # Simple ex. with necessary keys #' tib <- tibble::tibble( #' geo_value = rep(c("ca", "hi"), each = 5), -#' time_value = rep(seq(as.Date("2020-01-01"), -#' by = 1, length.out = 5), times = 2), -#' version = rep(seq(as.Date("2020-01-02"), -#' by = 1, length.out = 5), times = 2), +#' time_value = rep(seq(as.Date("2020-01-01"), +#' by = 1, length.out = 5 +#' ), times = 2), +#' version = rep(seq(as.Date("2020-01-02"), +#' by = 1, length.out = 5 +#' ), times = 2), #' value = rnorm(10, mean = 2, sd = 1) #' ) -#' -#' toy_epi_archive <- tib %>% as_epi_archive(geo_type = "state", -#' time_type = "day") -#' toy_epi_archive -#' +#' +#' toy_epi_archive <- tib %>% as_epi_archive( +#' geo_type = "state", +#' time_type = "day" +#' ) +#' toy_epi_archive +#' #' # Ex. with an additional key for county -#' df <- data.frame (geo_value = c(replicate(2, "ca"), replicate(2, "fl")), -#' county = c(1, 3, 2, 5), -#' time_value = c("2020-06-01", -#' "2020-06-02", -#' "2020-06-01", -#' "2020-06-02"), -#' version = c("2020-06-02", -#' "2020-06-03", -#' "2020-06-02", -#' "2020-06-03"), -#' cases = c(1, 2, 3, 4), -#' cases_rate = c(0.01, 0.02, 0.01, 0.05)) -#' -#' x <- df %>% as_epi_archive(geo_type = "state", -#' time_type = "day", -#' other_keys = "county") -as_epi_archive = function(x, geo_type, time_type, other_keys, - additional_metadata = list(), - compactify = NULL, - clobberable_versions_start = NA, - versions_end = max_version_with_row_in(x)) { - epi_archive$new(x, geo_type, time_type, other_keys, additional_metadata, - compactify, clobberable_versions_start, versions_end) +#' df <- data.frame( +#' geo_value = c(replicate(2, "ca"), replicate(2, "fl")), +#' county = c(1, 3, 2, 5), +#' time_value = c( +#' "2020-06-01", +#' "2020-06-02", +#' "2020-06-01", +#' "2020-06-02" +#' ), +#' version = c( +#' "2020-06-02", +#' "2020-06-03", +#' "2020-06-02", +#' "2020-06-03" +#' ), +#' cases = c(1, 2, 3, 4), +#' cases_rate = c(0.01, 0.02, 0.01, 0.05) +#' ) +#' +#' x <- df %>% as_epi_archive( +#' geo_type = "state", +#' time_type = "day", +#' other_keys = "county" +#' ) +as_epi_archive <- function(x, geo_type, time_type, other_keys, + additional_metadata = list(), + compactify = NULL, + clobberable_versions_start = NA, + versions_end = max_version_with_row_in(x)) { + epi_archive$new( + x, geo_type, time_type, other_keys, additional_metadata, + compactify, clobberable_versions_start, versions_end + ) } #' Test for `epi_archive` format @@ -782,7 +844,7 @@ as_epi_archive = function(x, geo_type, time_type, other_keys, #' @param grouped_okay Optional; Boolean; should a `grouped_epi_archive` also #' count? Default is `FALSE`. #' @return `TRUE` if the object inherits from `epi_archive`. -#' +#' #' @export #' @examples #' is_epi_archive(jhu_csse_daily_subset) # FALSE (this is an epi_df, not epi_archive) @@ -791,11 +853,11 @@ as_epi_archive = function(x, geo_type, time_type, other_keys, #' # By default, grouped_epi_archives don't count as epi_archives, as they may #' # support a different set of operations from regular `epi_archives`. This #' # behavior can be controlled by `grouped_okay`. 
-#' grouped_archive = archive_cases_dv_subset$group_by(geo_value)
+#' grouped_archive <- archive_cases_dv_subset$group_by(geo_value)
 #' is_epi_archive(grouped_archive) # FALSE
-#' is_epi_archive(grouped_archive, grouped_okay=TRUE) # TRUE
+#' is_epi_archive(grouped_archive, grouped_okay = TRUE) # TRUE
 #'
 #' @seealso [`is_grouped_epi_archive`]
-is_epi_archive = function(x, grouped_okay=FALSE) {
+is_epi_archive <- function(x, grouped_okay = FALSE) {
   inherits(x, "epi_archive") || grouped_okay && inherits(x, "grouped_epi_archive")
 }
 
diff --git a/R/correlation.R b/R/correlation.R
index 62d024bd..a4a56d1e 100644
--- a/R/correlation.R
+++ b/R/correlation.R
@@ -4,11 +4,11 @@
 #' grouping by geo value, time value, or any other variables. See the
 #' [correlation
 #' vignette](https://cmu-delphi.github.io/epiprocess/articles/correlation.html)
-#' for examples. 
+#' for examples.
 #'
 #' @param x The `epi_df` object under consideration.
 #' @param var1,var2 The variables in `x` to correlate.
-#' @param dt1,dt2 Time shifts to consider for the two variables, respectively, 
+#' @param dt1,dt2 Time shifts to consider for the two variables, respectively,
 #'   before computing correlations. Negative shifts translate into a lag
 #'   value and positive shifts into a lead value; for example, if `dt = -1`,
 #'   then the new value on June 2 is the original value on June 1; if `dt = 1`,
@@ -34,51 +34,59 @@
 #'   `method` (same as `cor()`).
 #'
 #' @return A tibble with the grouping columns first (`geo_value`, `time_value`,
-#'   or possibly others), and then a column `cor`, which gives the correlation. 
-#' 
+#'   or possibly others), and then a column `cor`, which gives the correlation.
+#'
 #' @importFrom stats cor
 #' @importFrom rlang .data !! !!! enquo syms
 #' @importFrom tidyselect eval_select
 #' @export
 #' @examples
-#' 
+#'
 #' # linear association of case and death rates on any given day
-#' epi_cor(x = jhu_csse_daily_subset, 
-#'         var1 = case_rate_7d_av, 
-#'         var2 = death_rate_7d_av, 
-#'         cor_by = "time_value")
-#' 
+#' epi_cor(
+#'   x = jhu_csse_daily_subset,
+#'   var1 = case_rate_7d_av,
+#'   var2 = death_rate_7d_av,
+#'   cor_by = "time_value"
+#' )
+#'
 #' # correlation of death rates and lagged case rates
-#' epi_cor(x = jhu_csse_daily_subset, 
-#'         var1 = case_rate_7d_av, 
-#'         var2 = death_rate_7d_av, 
-#'         cor_by = time_value, 
-#'         dt1 = -2)
-#' 
-#' # correlation grouped by location 
-#' epi_cor(x = jhu_csse_daily_subset, 
-#'         var1 = case_rate_7d_av, 
-#'         var2 = death_rate_7d_av, 
-#'         cor_by = geo_value)
-#' 
+#' epi_cor(
+#'   x = jhu_csse_daily_subset,
+#'   var1 = case_rate_7d_av,
+#'   var2 = death_rate_7d_av,
+#'   cor_by = time_value,
+#'   dt1 = -2
+#' )
+#'
+#' # correlation grouped by location
+#' epi_cor(
+#'   x = jhu_csse_daily_subset,
+#'   var1 = case_rate_7d_av,
+#'   var2 = death_rate_7d_av,
+#'   cor_by = geo_value
+#' )
+#'
 #' # correlation grouped by location, incorporating lagged case rates
-#' epi_cor(x = jhu_csse_daily_subset, 
-#'         var1 = case_rate_7d_av, 
-#'         var2 = death_rate_7d_av, 
-#'         cor_by = geo_value, 
-#'         dt1 = -2)
-epi_cor = function(x, var1, var2, dt1 = 0, dt2 = 0, shift_by = geo_value, 
-                   cor_by = geo_value, use = "na.or.complete", 
-                   method = c("pearson", "kendall", "spearman")) {
+#' epi_cor(
+#'   x = jhu_csse_daily_subset,
+#'   var1 = case_rate_7d_av,
+#'   var2 = death_rate_7d_av,
+#'   cor_by = geo_value,
+#'   dt1 = -2
+#' )
+epi_cor <- function(x, var1, var2, dt1 = 0, dt2 = 0, shift_by = geo_value,
+                    cor_by = geo_value, use = "na.or.complete",
+                    method = c("pearson", "kendall", "spearman")) {
   # Check we have an `epi_df` object
   if
(!inherits(x, "epi_df")) Abort("`x` must be of class `epi_df`.")
 
   # Check that we have variables to do computations on
   if (missing(var1)) Abort("`var1` must be specified.")
   if (missing(var2)) Abort("`var2` must be specified.")
-  var1 = enquo(var1)
-  var2 = enquo(var2)
-  
+  var1 <- enquo(var1)
+  var2 <- enquo(var2)
+
   # Defuse grouping variables. This looks a bit more involved since we want to
   # accommodate the option of specifying multiple variables for each grouping.
   # Hence use the power of tidyselect::eval_select(), which can accommodate any
@@ -88,26 +96,33 @@ epi_cor = function(x, var1, var2, dt1 = 0, dt2 = 0, shift_by = geo_value,
   #   * cor_by = c(a, b)
   #   * cor_by = c("a", "b")
   # and so on, and similarly for shift_by. Note: make sure to follow with !!!
-  cor_by = syms(names(eval_select(enquo(cor_by), x)))
-  shift_by = syms(names(eval_select(enquo(shift_by), x)))
+  cor_by <- syms(names(eval_select(enquo(cor_by), x)))
+  shift_by <- syms(names(eval_select(enquo(shift_by), x)))
 
   # Which method?
-  method = match.arg(method)
+  method <- match.arg(method)
 
   # Perform time shifts, then compute appropriate correlations and return
   return(x %>%
-           dplyr::group_by(!!!shift_by) %>%
-           dplyr::arrange(.data$time_value) %>%
-           dplyr::mutate(var1 = shift(!!var1, n = dt1),
-                         var2 = shift(!!var2, n = dt2)) %>%
-           dplyr::ungroup() %>%
-           dplyr::group_by(!!!cor_by) %>%
-           dplyr::summarize(cor = cor(x = .data$var1, y = .data$var2,
-                                      use = use, method = method)))
+    dplyr::group_by(!!!shift_by) %>%
+    dplyr::arrange(.data$time_value) %>%
+    dplyr::mutate(
+      var1 = shift(!!var1, n = dt1),
+      var2 = shift(!!var2, n = dt2)
+    ) %>%
+    dplyr::ungroup() %>%
+    dplyr::group_by(!!!cor_by) %>%
+    dplyr::summarize(cor = cor(
+      x = .data$var1, y = .data$var2,
+      use = use, method = method
+    )))
 }
 
 # Function to perform time shifts, lag or lead
-shift = function(var, n) {
-  if (n < 0) return(dplyr::lag(var, -n))
-  else return(dplyr::lead(var, n))
+shift <- function(var, n) {
+  if (n < 0) {
+    return(dplyr::lag(var, -n))
+  } else {
+    return(dplyr::lead(var, n))
+  }
 }
diff --git a/R/data.R b/R/data.R
index 248288eb..ead3dfdd 100644
--- a/R/data.R
+++ b/R/data.R
@@ -1,41 +1,41 @@
-#' Subset of JHU daily state cases and deaths 
+#' Subset of JHU daily state cases and deaths
 #'
 #' This data source of confirmed COVID-19 cases and deaths
 #' is based on reports made available by the Center for
 #' Systems Science and Engineering at Johns Hopkins University.
-#' This example data ranges from Mar 1, 2020 to Dec 31, 2021, and is limited to 
+#' This example data ranges from Mar 1, 2020 to Dec 31, 2021, and is limited to
 #' California, Florida, Texas, New York, Georgia, and Pennsylvania.
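A minimal sketch of the `shift` helper from R/correlation.R above: negative `n` lags the series and positive `n` leads it, which is exactly how `epi_cor`'s `dt1`/`dt2` arguments are applied:

shift(1:5, n = -1) # c(NA, 1, 2, 3, 4), via dplyr::lag
shift(1:5, n = 1) # c(2, 3, 4, 5, NA), via dplyr::lead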
 #'
 #' @format A tibble with 4026 rows and 6 variables:
 #' \describe{
-#'   \item{geo_value}{the geographic value associated with each row 
+#'   \item{geo_value}{the geographic value associated with each row
 #'   of measurements.}
 #'   \item{time_value}{the time value associated with each row of measurements.}
-#'   \item{case_rate_7d_av}{7-day average signal of number of new 
+#'   \item{case_rate_7d_av}{7-day average signal of number of new
 #'   confirmed COVID-19 cases per 100,000 population, daily}
-#'   \item{death_rate_7d_av}{7-day average signal of number of new confirmed 
+#'   \item{death_rate_7d_av}{7-day average signal of number of new confirmed
 #'   deaths due to COVID-19 per 100,000 population, daily}
 #'   \item{cases}{Number of new confirmed COVID-19 cases, daily}
-#'   \item{cases_7d_av}{7-day average signal of number of new confirmed 
+#'   \item{cases_7d_av}{7-day average signal of number of new confirmed
 #'   COVID-19 cases, daily}
 #' }
-#' @source This object contains a modified part of the 
+#' @source This object contains a modified part of the
 #'   \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University}
-#'   as \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{republished in the COVIDcast Epidata API}. 
+#'   as \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{republished in the COVIDcast Epidata API}.
 #'   This data set is licensed under the terms of the
 #'   \href{https://creativecommons.org/licenses/by/4.0/}{Creative Commons Attribution 4.0 International license}
 #'   by the Johns Hopkins University on behalf of its Center for Systems Science
 #'   and Engineering. Copyright Johns Hopkins University 2020.
 #'
 #' Modifications:
-#' * \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{From the COVIDcast Epidata API}: 
-#'   These signals are taken directly from the JHU CSSE 
-#'   \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 GitHub repository} 
-#'   without changes. The 7-day average signals are computed by Delphi by 
-#'   calculating moving averages of the preceding 7 days, so the signal for 
-#'   June 7 is the average of the underlying data for June 1 through 7, 
+#' * \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{From the COVIDcast Epidata API}:
+#'   These signals are taken directly from the JHU CSSE
+#'   \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 GitHub repository}
+#'   without changes. The 7-day average signals are computed by Delphi by
+#'   calculating moving averages of the preceding 7 days, so the signal for
+#'   June 7 is the average of the underlying data for June 1 through 7,
 #'   inclusive.
-#' * Furthermore, the data has been limited to a very small number of rows, 
+#' * Furthermore, the data has been limited to a very small number of rows,
 #'   the signal names slightly altered, and formatted into a tibble.
 "jhu_csse_daily_subset"
 
@@ -46,7 +46,7 @@
 #' provided to us by health system partners, and also contains confirmed
 #' COVID-19 cases based on reports made available by the Center for
 #' Systems Science and Engineering at Johns Hopkins University.
-#' This example data ranges from June 1, 2020 to Dec 1, 2021, and 
+#' This example data ranges from June 1, 2020 to Dec 1, 2021, and
 #' is also limited to California, Florida, Texas, and New York.
 #'
 #' @format An `epi_archive` data format.
The data table DT has 129,638 rows and 5 columns:
@@ -88,23 +88,23 @@
 #' @return Boolean
 #'
 #' @noRd
-some_package_is_being_unregistered = function(parent_n = 0L) {
-  calls = sys.calls()
+some_package_is_being_unregistered <- function(parent_n = 0L) {
+  calls <- sys.calls()
   # `calls` will include the call to this function; strip out this call plus
   # `parent_n` additional requested calls to make it like we're reasoning about
   # the desired call. This could prevent potential false positives from
   # triggering if, in a later version, we decide to loosen the `call_name`
   # checks below to something that would be `TRUE` for the name of this function
   # or one of the undesired call ancestors.
-  calls_to_inspect = utils::head(calls, n = -(parent_n + 1L))
+  calls_to_inspect <- utils::head(calls, n = -(parent_n + 1L))
   # Note that `utils::head(sys.calls(), n=-1L)` isn't equivalent, due to lazy
   # argument evaluation. Note that copy-pasting the body of this function
   # without this `utils::head` operation isn't always equivalent to calling it;
   # e.g., within the `value` argument of a package-level `delayedAssign`,
   # `sys.calls()` will return `NULL` in some or all cases, including when its
   # evaluation has been triggered via `unregister`.
-  simple_call_names = purrr::map_chr(calls_to_inspect, function(call) {
-    maybe_simple_call_name = rlang::call_name(call)
+  simple_call_names <- purrr::map_chr(calls_to_inspect, function(call) {
+    maybe_simple_call_name <- rlang::call_name(call)
     if (is.null(maybe_simple_call_name)) NA_character_ else maybe_simple_call_name
   })
   # `pkgload::unregister` is an (the?) exported function that forces
@@ -127,11 +127,11 @@ some_package_is_being_unregistered = function(parent_n = 0L) {
 #'   different than when using `delayedAssign` directly.
 #'
 #' @noRd
-delayed_assign_with_unregister_awareness = function(x, value,
-                                                    eval.env = rlang::caller_env(),
-                                                    assign.env = rlang::caller_env()) {
-  value_quosure = rlang::as_quosure(rlang::enexpr(value), eval.env)
-  this_env = environment()
+delayed_assign_with_unregister_awareness <- function(x, value,
+                                                     eval.env = rlang::caller_env(),
+                                                     assign.env = rlang::caller_env()) {
+  value_quosure <- rlang::as_quosure(rlang::enexpr(value), eval.env)
+  this_env <- environment()
   delayedAssign(x, eval.env = this_env, assign.env = assign.env, value = {
     if (some_package_is_being_unregistered()) {
       withCallingHandlers(
@@ -144,26 +144,30 @@ delayed_assign_with_unregister_awareness = function(x, value,
         #   all.)
         rlang::eval_bare(rlang::quo_get_expr(value_quosure), rlang::quo_get_env(value_quosure)),
         error = function(err) {
-          Abort(paste("An error was raised while attempting to evaluate a promise",
-                      "(prepared with `delayed_assign_with_unregister_awareness`)",
-                      "while an `unregister` or `unregister_namespace` call",
-                      "was being evaluated.",
-                      "This can happen, for example, when `devtools::load_all`",
-                      "reloads a package that contains a buggy promise,",
-                      "because reloading can cause old package-level promises to",
-                      "be forced via `pkgload::unregister` and",
-                      "`pkgload:::unregister_namespace`, due to",
-                      "https://github.com/r-lib/pkgload/pull/157.",
-                      "If this is the current situation, you might be able to",
-                      "be successfully reload the package again after",
-                      "`unloadNamespace`-ing it (but this situation will",
-                      "keep re-occurring every other `devtools::load`",
-                      "and every `devtools:document` until the bug or situation",
-                      "generating the promise's error has been resolved)."
-          ),
-                class = "epiprocess__promise_evaluation_error_during_unregister",
-                parent = err)
-        })
+          Abort(
+            paste(
+              "An error was raised while attempting to evaluate a promise",
+              "(prepared with `delayed_assign_with_unregister_awareness`)",
+              "while an `unregister` or `unregister_namespace` call",
+              "was being evaluated.",
+              "This can happen, for example, when `devtools::load_all`",
+              "reloads a package that contains a buggy promise,",
+              "because reloading can cause old package-level promises to",
+              "be forced via `pkgload::unregister` and",
+              "`pkgload:::unregister_namespace`, due to",
+              "https://github.com/r-lib/pkgload/pull/157.",
+              "If this is the current situation, you might be able to",
+              "successfully reload the package again after",
+              "`unloadNamespace`-ing it (but this situation will",
+              "keep re-occurring every other `devtools::load`",
+              "and every `devtools::document` until the bug or situation",
+              "generating the promise's error has been resolved)."
+            ),
+            class = "epiprocess__promise_evaluation_error_during_unregister",
+            parent = err
+          )
+        }
+      )
     } else {
       rlang::eval_bare(rlang::quo_get_expr(value_quosure), rlang::quo_get_env(value_quosure))
     }
@@ -189,14 +193,14 @@ delayed_assign_with_unregister_awareness = function(x, value,
 # binding may have been created with the same name as the package promise, and
 # this binding will stick around even when the package is reloaded, and will
 # need to be `rm`-d to easily access the refreshed package promise.
-delayed_assign_with_unregister_awareness("archive_cases_dv_subset", as_epi_archive(archive_cases_dv_subset_dt, compactify=FALSE))
+delayed_assign_with_unregister_awareness("archive_cases_dv_subset", as_epi_archive(archive_cases_dv_subset_dt, compactify = FALSE))
 
 #' Subset of JHU daily cases from California and Florida
 #'
 #' This data source of confirmed COVID-19 cases
 #' is based on reports made available by the Center for
 #' Systems Science and Engineering at Johns Hopkins University.
-#' This example data is a snapshot as of Oct 28, 2021 and captures the cases 
+#' This example data is a snapshot as of Oct 28, 2021 and captures the cases
 #' from June 1, 2020 to May 31, 2021
 #' and is limited to California and Florida.
 #'
@@ -222,7 +226,7 @@ delayed_assign_with_unregister_awareness("archive_cases_dv_subset", as_epi_archi
 #' This data source of confirmed COVID-19 cases and deaths
 #' is based on reports made available by the Center for
 #' Systems Science and Engineering at Johns Hopkins University.
-#' This example data ranges from Mar 1, 2020 to Dec 31, 2021, 
+#' This example data ranges from Mar 1, 2020 to Dec 31, 2021,
 #' and is limited to Massachusetts and Vermont.
 #'
 #' @format A tibble with 16,212 rows and 5 variables:
diff --git a/R/epiprocess.R b/R/epiprocess.R
index e047de8c..bbdcf4f3 100644
--- a/R/epiprocess.R
+++ b/R/epiprocess.R
@@ -2,7 +2,7 @@
 #'
 #' This package introduces a common data structure for epidemiological data sets
 #' measured over space and time, and offers associated utilities to perform
-#' basic signal processing tasks. 
+#' basic signal processing tasks.
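Stepping back to the helper above, a usage sketch for `delayed_assign_with_unregister_awareness` (hypothetical names; `build_expensive_dataset()` is a stand-in, not a function in this package):

# Evaluation happens on first access of `my_lazy_dataset`; if it is instead
# forced during a `pkgload::unregister` call, the clearer error constructed
# above is signaled rather than an opaque promise failure:
delayed_assign_with_unregister_awareness(
  "my_lazy_dataset",
  build_expensive_dataset()
)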
#' #' @docType package #' @name epiprocess diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R index b11bf821..f083cf93 100644 --- a/R/grouped_epi_archive.R +++ b/R/grouped_epi_archive.R @@ -1,4 +1,3 @@ - #' Get var names from select-only `tidy_select`ing `...` in `.data` #' #' Convenience function for performing a `tidy_select` on dots according to its @@ -11,7 +10,7 @@ #' `names(.data)` denoting the selection #' #' @noRd -eval_pure_select_names_from_dots = function(..., .data) { +eval_pure_select_names_from_dots <- function(..., .data) { # `?tidyselect::eval_select` tells us to use this form when we take in dots. # It seems a bit peculiar, since the expr doesn't pack with it a way to get at # the environment for the dots, but it looks like `eval_select` will assume @@ -20,7 +19,7 @@ eval_pure_select_names_from_dots = function(..., .data) { # # If we were allowing renaming, we'd need to be careful about which names (new # vs. old vs. both) to return here. - names(tidyselect::eval_select(rlang::expr(c(...)), .data, allow_rename=FALSE)) + names(tidyselect::eval_select(rlang::expr(c(...)), .data, allow_rename = FALSE)) } #' Get names of dots without forcing the dots @@ -31,16 +30,16 @@ eval_pure_select_names_from_dots = function(..., .data) { #' dots if we're using NSE. #' #' @noRd -nse_dots_names = function(...) { +nse_dots_names <- function(...) { names(rlang::call_match()) } -nse_dots_names2 = function(...) { +nse_dots_names2 <- function(...) { rlang::names2(rlang::call_match()) } #' @importFrom dplyr group_by_drop_default #' @noRd -grouped_epi_archive = +grouped_epi_archive <- R6::R6Class( classname = "grouped_epi_archive", # (We don't R6-inherit `epi_archive` or S3-multiclass with "epi_archive"; @@ -55,34 +54,39 @@ grouped_epi_archive = initialize = function(ungrouped, vars, drop) { if (inherits(ungrouped, "grouped_epi_archive")) { Abort("`ungrouped` must not already be grouped (neither automatic regrouping nor nested grouping is supported). 
Either use `group_by` with `.add=TRUE`, or `ungroup` first.", - class="epiprocess__grouped_epi_archive__ungrouped_arg_is_already_grouped", - epiprocess__ungrouped_class = class(ungrouped), - epiprocess__ungrouped_groups = groups(ungrouped)) + class = "epiprocess__grouped_epi_archive__ungrouped_arg_is_already_grouped", + epiprocess__ungrouped_class = class(ungrouped), + epiprocess__ungrouped_groups = groups(ungrouped) + ) } if (!inherits(ungrouped, "epi_archive")) { Abort("`ungrouped` must be an epi_archive", - class="epiprocess__grouped_epi_archive__ungrouped_arg_is_not_epi_archive", - epiprocess__ungrouped_class = class(ungrouped)) + class = "epiprocess__grouped_epi_archive__ungrouped_arg_is_not_epi_archive", + epiprocess__ungrouped_class = class(ungrouped) + ) } if (!is.character(vars)) { Abort("`vars` must be a character vector (any tidyselection should have already occurred in a helper method).", - class="epiprocess__grouped_epi_archive__vars_is_not_chr", - epiprocess__vars_class = class(vars), - epiprocess__vars_type = typeof(vars)) + class = "epiprocess__grouped_epi_archive__vars_is_not_chr", + epiprocess__vars_class = class(vars), + epiprocess__vars_type = typeof(vars) + ) } if (!all(vars %in% names(ungrouped$DT))) { Abort("`vars` must be selected from the names of columns of `ungrouped$DT`", - class="epiprocess__grouped_epi_archive__vars_contains_invalid_entries", - epiprocess__vars = vars, - epiprocess__DT_names = names(ungrouped$DT)) + class = "epiprocess__grouped_epi_archive__vars_contains_invalid_entries", + epiprocess__vars = vars, + epiprocess__DT_names = names(ungrouped$DT) + ) } if ("version" %in% vars) { Abort("`version` has a special interpretation and cannot be used by itself as a grouping variable") } if (!rlang::is_bool(drop)) { Abort("`drop` must be a Boolean", - class="epiprocess__grouped_epi_archive__drop_is_not_bool", - epiprocess__drop = drop) + class = "epiprocess__grouped_epi_archive__drop_is_not_bool", + epiprocess__drop = drop + ) } # ----- private$ungrouped <- ungrouped @@ -91,7 +95,7 @@ grouped_epi_archive = }, print = function(class = TRUE, methods = TRUE) { if (class) cat("A `grouped_epi_archive` object:\n") - writeLines(wrap_varnames(private$vars, initial="* Groups: ")) + writeLines(wrap_varnames(private$vars, initial = "* Groups: ")) # If none of the grouping vars is a factor, then $drop doesn't seem # relevant, so try to be less verbose and don't message about it. # @@ -102,7 +106,7 @@ grouped_epi_archive = # something to rely too much on), while map functions currently appear # to avoid column copies. 
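      # For example (sketch): when grouping by a factor column with an unused
      # level, the message below reports whether those empty groups are
      # dropped or kept, mirroring `dplyr::group_by(..., .drop = ...)`.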
      if (any(purrr::map_lgl(private$ungrouped$DT, is.factor)[private$vars])) {
-        cat(strwrap(init="* ", prefix="  ", sprintf(
+        cat(strwrap(init = "* ", prefix = "  ", sprintf(
           "%s groups formed by factor levels that don't appear in the data",
           if (private$drop) "Drops" else "Does not drop"
         )))
@@ -113,14 +117,20 @@
       if (methods) {
         cat("----------\n")
         cat("Public `grouped_epi_archive` R6 methods:\n")
-        grouped_method_names = names(grouped_epi_archive$public_methods)
-        ungrouped_method_names = names(epi_archive$public_methods)
-        writeLines(wrap_varnames(initial = "\u2022 Specialized `epi_archive` methods: ",
-                                 intersect(grouped_method_names, ungrouped_method_names)))
-        writeLines(wrap_varnames(initial = "\u2022 Exclusive to `grouped_epi_archive`: ",
-                                 setdiff(grouped_method_names, ungrouped_method_names)))
-        writeLines(wrap_varnames(initial = "\u2022 `ungroup` to use: ",
-                                 setdiff(ungrouped_method_names, grouped_method_names)))
+        grouped_method_names <- names(grouped_epi_archive$public_methods)
+        ungrouped_method_names <- names(epi_archive$public_methods)
+        writeLines(wrap_varnames(
+          initial = "\u2022 Specialized `epi_archive` methods: ",
+          intersect(grouped_method_names, ungrouped_method_names)
+        ))
+        writeLines(wrap_varnames(
+          initial = "\u2022 Exclusive to `grouped_epi_archive`: ",
+          setdiff(grouped_method_names, ungrouped_method_names)
+        ))
+        writeLines(wrap_varnames(
+          initial = "\u2022 `ungroup` to use: ",
+          setdiff(ungrouped_method_names, grouped_method_names)
+        ))
       }
       # Return self invisibly for convenience in `$`-"pipe":
       invisible(self)
@@ -135,14 +145,15 @@
         If you want to "regroup", replacing the existing grouping vars, `ungroup` first and then `group_by`.
         If you want to add to the existing grouping vars, call `group_by` specifying `.add=TRUE`.
       ',
-              class = "epiprocess__grouped_epi_archive_group_by_with_add_FALSE")
+          class = "epiprocess__grouped_epi_archive_group_by_with_add_FALSE"
+        )
       } else {
         # `group_by` `...` computations are performed on ungrouped data (see
         # `?dplyr::group_by`)
-        detailed_mutate = epix_detailed_restricted_mutate(private$ungrouped, ...)
-        out_ungrouped = detailed_mutate[["archive"]]
-        vars_from_dots = detailed_mutate[["request_names"]]
-        vars = union(private$vars, vars_from_dots)
+        detailed_mutate <- epix_detailed_restricted_mutate(private$ungrouped, ...)
+        out_ungrouped <- detailed_mutate[["archive"]]
+        vars_from_dots <- detailed_mutate[["request_names"]]
+        vars <- union(private$vars, vars_from_dots)
         grouped_epi_archive$new(out_ungrouped, vars, .drop)
       }
     },
@@ -158,11 +169,11 @@
       # an ungrouped class, as with `grouped_df`s.
       private$ungrouped
     } else {
-      exclude_vars = eval_pure_select_names_from_dots(..., .data=private$ungrouped$DT)
+      exclude_vars <- eval_pure_select_names_from_dots(..., .data = private$ungrouped$DT)
       # (requiring a pure selection here is a little stricter than dplyr
       # implementations, but passing a renaming selection into `ungroup`
       # seems pretty weird.)
-      result_vars = private$vars[! private$vars %in% exclude_vars]
+      result_vars <- private$vars[!private$vars %in% exclude_vars]
      # `vars` might be length 0 if the user's tidyselection removed all
      # grouping vars.
Unlike with tibble, opt here to keep the result as a # grouped_epi_archive, for output class consistency when `...` is @@ -170,36 +181,36 @@ grouped_epi_archive = grouped_epi_archive$new(private$ungrouped, result_vars, private$drop) } }, -#' @description Filter to keep only older versions by mutating the underlying -#' `epi_archive` using `$truncate_versions_after`. Returns the mutated -#' `grouped_epi_archive` [invisibly][base::invisible]. -#' @param x as in [`epix_truncate_versions_after`] -#' @param max_version as in [`epix_truncate_versions_after`] + #' @description Filter to keep only older versions by mutating the underlying + #' `epi_archive` using `$truncate_versions_after`. Returns the mutated + #' `grouped_epi_archive` [invisibly][base::invisible]. + #' @param x as in [`epix_truncate_versions_after`] + #' @param max_version as in [`epix_truncate_versions_after`] truncate_versions_after = function(max_version) { # The grouping is irrelevant for this method; if we were to split into # groups and recombine appropriately, we should get the same result as # just leveraging the ungrouped method, so just do the latter: private$ungrouped$truncate_versions_after(max_version) - return (invisible(self)) + return(invisible(self)) }, -#' @description Slides a given function over variables in a `grouped_epi_archive` -#' object. See the documentation for the wrapper function [`epix_slide()`] for -#' details. -#' @importFrom data.table key address rbindlist setDF -#' @importFrom tibble as_tibble new_tibble validate_tibble -#' @importFrom dplyr group_by groups -#' @importFrom rlang !! !!! enquo quo_is_missing enquos is_quosure sym syms -#' env missing_arg - slide = function(f, ..., before, ref_time_values, - time_step, new_col_name = "slide_value", - as_list_col = FALSE, names_sep = "_", - all_versions = FALSE) { - # Perform some deprecated argument checks without using ` = - # deprecated()` in the function signature, because they are from - # early development versions and much more likely to be clutter than - # informative in the signature. - if ("group_by" %in% nse_dots_names(...)) { - Abort(" + #' @description Slides a given function over variables in a `grouped_epi_archive` + #' object. See the documentation for the wrapper function [`epix_slide()`] for + #' details. + #' @importFrom data.table key address rbindlist setDF + #' @importFrom tibble as_tibble new_tibble validate_tibble + #' @importFrom dplyr group_by groups + #' @importFrom rlang !! !!! enquo quo_is_missing enquos is_quosure sym syms + #' env missing_arg + slide = function(f, ..., before, ref_time_values, + time_step, new_col_name = "slide_value", + as_list_col = FALSE, names_sep = "_", + all_versions = FALSE) { + # Perform some deprecated argument checks without using ` = + # deprecated()` in the function signature, because they are from + # early development versions and much more likely to be clutter than + # informative in the signature. + if ("group_by" %in% nse_dots_names(...)) { + Abort(" The `group_by` argument to `slide` has been removed; please use the `group_by` S3 generic function or `$group_by` R6 method before the slide instead. (If you were instead trying to pass a @@ -208,208 +219,209 @@ grouped_epi_archive = different column name here and rename the resulting column after the slide.) 
", class = "epiprocess__epix_slide_group_by_parameter_deprecated") - } - if ("all_rows" %in% nse_dots_names(...)) { - Abort(" + } + if ("all_rows" %in% nse_dots_names(...)) { + Abort(" The `all_rows` argument has been removed from `epix_slide` (but is still supported in `epi_slide`). Add rows for excluded results with a manual join instead. ", class = "epiprocess__epix_slide_all_rows_parameter_deprecated") - } - - if (missing(ref_time_values)) { - ref_time_values = epix_slide_ref_time_values_default(private$ungrouped) - } else if (length(ref_time_values) == 0L) { - Abort("`ref_time_values` must have at least one element.") - } else if (any(is.na(ref_time_values))) { - Abort("`ref_time_values` must not include `NA`.") - } else if (anyDuplicated(ref_time_values) != 0L) { - Abort("`ref_time_values` must not contain any duplicates; use `unique` if appropriate.") - } else if (any(ref_time_values > private$ungrouped$versions_end)) { - Abort("All `ref_time_values` must be `<=` the `versions_end`.") - } else { - # Sort, for consistency with `epi_slide`, although the current - # implementation doesn't take advantage of it. - ref_time_values = sort(ref_time_values) - } + } - # Validate and pre-process `before`: - if (missing(before)) { - Abort("`before` is required (and must be passed by name); + if (missing(ref_time_values)) { + ref_time_values <- epix_slide_ref_time_values_default(private$ungrouped) + } else if (length(ref_time_values) == 0L) { + Abort("`ref_time_values` must have at least one element.") + } else if (any(is.na(ref_time_values))) { + Abort("`ref_time_values` must not include `NA`.") + } else if (anyDuplicated(ref_time_values) != 0L) { + Abort("`ref_time_values` must not contain any duplicates; use `unique` if appropriate.") + } else if (any(ref_time_values > private$ungrouped$versions_end)) { + Abort("All `ref_time_values` must be `<=` the `versions_end`.") + } else { + # Sort, for consistency with `epi_slide`, although the current + # implementation doesn't take advantage of it. + ref_time_values <- sort(ref_time_values) + } + + # Validate and pre-process `before`: + if (missing(before)) { + Abort("`before` is required (and must be passed by name); if you did not want to apply a sliding window but rather to map `as_of` and `f` across various `ref_time_values`, pass a large `before` value (e.g., if time steps are days, `before=365000`).") - } - before <- vctrs::vec_cast(before, integer()) - if (length(before) != 1L || is.na(before) || before < 0L) { - Abort("`before` must be length-1, non-NA, non-negative.") - } + } + before <- vctrs::vec_cast(before, integer()) + if (length(before) != 1L || is.na(before) || before < 0L) { + Abort("`before` must be length-1, non-NA, non-negative.") + } - # If a custom time step is specified, then redefine units - - if (!missing(time_step)) before <- time_step(before) - - # Symbolize column name - new_col = sym(new_col_name) + # If a custom time step is specified, then redefine units - # Validate rest of parameters: - if (!rlang::is_bool(as_list_col)) { - Abort("`as_list_col` must be TRUE or FALSE.") - } - if (! 
(rlang::is_string(names_sep) || is.null(names_sep)) ) { - Abort("`names_sep` must be a (single) string or NULL.") - } - if (!rlang::is_bool(all_versions)) { - Abort("`all_versions` must be TRUE or FALSE.") - } + if (!missing(time_step)) before <- time_step(before) - # Computation for one group, one time value - comp_one_grp = function(.data_group, .group_key, - f, ..., - ref_time_value, - new_col) { - # Carry out the specified computation - comp_value = f(.data_group, .group_key, ref_time_value, ...) + # Symbolize column name + new_col <- sym(new_col_name) - if (all_versions) { - # Extract data from archive so we can do length checks below. When - # `all_versions = TRUE`, `.data_group` will always be an ungrouped - # archive because of the preceding `as_of` step. - .data_group = .data_group$DT - } + # Validate rest of parameters: + if (!rlang::is_bool(as_list_col)) { + Abort("`as_list_col` must be TRUE or FALSE.") + } + if (!(rlang::is_string(names_sep) || is.null(names_sep))) { + Abort("`names_sep` must be a (single) string or NULL.") + } + if (!rlang::is_bool(all_versions)) { + Abort("`all_versions` must be TRUE or FALSE.") + } - if (! (is.atomic(comp_value) || is.data.frame(comp_value))) { - Abort("The slide computation must return an atomic vector or a data frame.") - } + # Computation for one group, one time value + comp_one_grp <- function(.data_group, .group_key, + f, ..., + ref_time_value, + new_col) { + # Carry out the specified computation + comp_value <- f(.data_group, .group_key, ref_time_value, ...) - # Label every result row with the `ref_time_value` - res <- list(time_value = ref_time_value) + if (all_versions) { + # Extract data from archive so we can do length checks below. When + # `all_versions = TRUE`, `.data_group` will always be an ungrouped + # archive because of the preceding `as_of` step. + .data_group <- .data_group$DT + } - # Wrap the computation output in a list and unchop/unnest later if - # `as_list_col = FALSE`. This approach means that we will get a - # list-class col rather than a data.frame-class col when - # `as_list_col = TRUE` and the computations outputs are data - # frames. - res[[new_col]] <- list(comp_value) + if (!(is.atomic(comp_value) || is.data.frame(comp_value))) { + Abort("The slide computation must return an atomic vector or a data frame.") + } - # Convert the list to a tibble all at once for speed. - return(validate_tibble(new_tibble(res))) - } - - # If `f` is missing, interpret ... as an expression for tidy evaluation - if (missing(f)) { - quos = enquos(...) - if (length(quos) == 0) { - Abort("If `f` is missing then a computation must be specified via `...`.") - } - if (length(quos) > 1) { - Abort("If `f` is missing then only a single computation can be specified via `...`.") - } - - f = quos[[1]] - new_col = sym(names(rlang::quos_auto_name(quos))) - ... = missing_arg() # magic value that passes zero args as dots in calls below - } + # Label every result row with the `ref_time_value` + res <- list(time_value = ref_time_value) - f = as_slide_computation(f, ...) - x = lapply(ref_time_values, function(ref_time_value) { - # Ungrouped as-of data; `epi_df` if `all_versions` is `FALSE`, - # `epi_archive` if `all_versions` is `TRUE`: - as_of_raw = private$ungrouped$as_of(ref_time_value, min_time_value = ref_time_value - before, all_versions = all_versions) + # Wrap the computation output in a list and unchop/unnest later if + # `as_list_col = FALSE`. 
This approach means that we will get a
+          # list-class col rather than a data.frame-class col when
+          # `as_list_col = TRUE` and the computation outputs are data
+          # frames.
+          res[[new_col]] <- list(comp_value)
+
+          # Convert the list to a tibble all at once for speed.
+          return(validate_tibble(new_tibble(res)))
+        }
+
+        # If `f` is missing, interpret ... as an expression for tidy evaluation
+        if (missing(f)) {
+          quos <- enquos(...)
+          if (length(quos) == 0) {
+            Abort("If `f` is missing then a computation must be specified via `...`.")
+          }
+          if (length(quos) > 1) {
+            Abort("If `f` is missing then only a single computation can be specified via `...`.")
+          }
+
+          f <- quos[[1]]
+          new_col <- sym(names(rlang::quos_auto_name(quos)))
+          ... <- missing_arg() # magic value that passes zero args as dots in calls below
+        }
+
+        f <- as_slide_computation(f, ...)
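+        # For example (a sketch of the tidy-evaluation path above): a caller
+        # might write
+        #   archive %>% group_by(geo_value) %>%
+        #     epix_slide(mean_value = mean(value), before = 6)
+        # (hypothetical `archive` with a `value` column); the single quosure
+        # is captured into `f` and `new_col` becomes "mean_value".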
+        x <- lapply(ref_time_values, function(ref_time_value) {
+          # Ungrouped as-of data; `epi_df` if `all_versions` is `FALSE`,
+          # `epi_archive` if `all_versions` is `TRUE`:
+          as_of_raw <- private$ungrouped$as_of(ref_time_value, min_time_value = ref_time_value - before, all_versions = all_versions)
+
+          # Set:
+          # * `as_of_df`, the data.frame/tibble/epi_df/etc. that we will
+          #   `group_modify` as the `.data` argument. Might or might not
+          #   include version column.
+          # * `group_modify_fn`, the corresponding `.f` argument
+          if (!all_versions) {
+            as_of_df <- as_of_raw
+            group_modify_fn <- comp_one_grp
+          } else {
+            as_of_archive <- as_of_raw
+            # We essentially want to `group_modify` the archive, but
+            # haven't implemented this method yet. Next best would be
+            # `group_modify` on its `$DT`, but that has different
+            # behavior based on whether or not `dtplyr` is loaded.
+            # Instead, go through an ordinary data frame, trying to avoid
+            # copies.
+            if (address(as_of_archive$DT) == address(private$ungrouped$DT)) {
+              # `as_of` aliased the full `$DT`; copy before mutating:
+              #
+              # Note: this step is probably unneeded; we're fine with
+              # aliasing of the DT or its columns: vanilla operations aren't
+              # going to mutate them in-place if they are aliases, and we're
+              # not performing mutation (unlike the situation with
+              # `fill_through_version` where we do mutate a `DT` and don't
+              # want aliasing).
+              as_of_archive$DT <- copy(as_of_archive$DT)
+            }
+            dt_key <- data.table::key(as_of_archive$DT)
+            as_of_df <- as_of_archive$DT
+            data.table::setDF(as_of_df)
+
+            # Convert each subgroup chunk to an archive before running the calculation.
+            group_modify_fn <- function(.data_group, .group_key,
+                                        f, ...,
+                                        ref_time_value,
+                                        new_col) {
+              # .data_group is coming from as_of_df as a tibble, but we
+              # want to feed `comp_one_grp` an `epi_archive` backed by a
+              # DT; convert and wrap:
+              data.table::setattr(.data_group, "sorted", dt_key)
+              data.table::setDT(.data_group, key = dt_key)
+              .data_group_archive <- as_of_archive$clone()
+              .data_group_archive$DT <- .data_group
+              comp_one_grp(.data_group_archive, .group_key,
+                f = f, ...,
+                ref_time_value = ref_time_value,
+                new_col = new_col
+              )
+            }
+          }
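+
+          # (Sketch of the flow below: each per-`ref_time_value` snapshot is
+          # grouped by the grouping variables and `group_modify_fn` is run
+          # on each group; the per-snapshot results are then row-bound and
+          # re-grouped across all `ref_time_values`.)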
- x <- decay_epi_df(x) + return( + dplyr::group_modify( + dplyr::group_by(as_of_df, !!!syms(private$vars), .drop = private$drop), + group_modify_fn, + f = f, ..., + ref_time_value = ref_time_value, + new_col = new_col, + .keep = TRUE + ) + ) + }) + # Combine output into a single tibble + x <- as_tibble(setDF(rbindlist(x))) + # Reconstruct groups + x <- group_by(x, !!!syms(private$vars), .drop = private$drop) - return(x) - } + # Unchop/unnest if we need to + if (!as_list_col) { + x <- tidyr::unnest(x, !!new_col, names_sep = names_sep) + } + + # if (is_epi_df(x)) { + # # The analogue of `epi_df`'s `as_of` metadata for an archive is + # # `$versions_end`, at least in the current absence of + # # separate fields/columns denoting the "archive version" with a + # # different resolution, or from the perspective of a different + # # stage of a data pipeline. The `as_of` that is automatically + # # derived won't always match; override: + # attr(x, "metadata")[["as_of"]] <- private$ungrouped$versions_end + # } + + # XXX We need to work out when we want to return an `epi_df` and how + # to get appropriate keys (see #290, #223, #163). We'll probably + # need the commented-out code above if we ever output an `epi_df`. + # However, as a stopgap measure to have some more consistency across + # different ways of calling `epix_slide`, and to prevent `epi_df` + # output with invalid metadata, always output a (grouped or + # ungrouped) tibble. + x <- decay_epi_df(x) + + return(x) + } ) ) @@ -424,8 +436,8 @@ grouped_epi_archive = #' #' @importFrom dplyr group_by #' @export -group_by.grouped_epi_archive = function(.data, ..., .add=FALSE, .drop=dplyr::group_by_drop_default(.data)) { - .data$group_by(..., .add=.add, .drop=.drop) +group_by.grouped_epi_archive <- function(.data, ..., .add = FALSE, .drop = dplyr::group_by_drop_default(.data)) { + .data$group_by(..., .add = .add, .drop = .drop) } #' @include methods-epi_archive.R @@ -433,7 +445,7 @@ group_by.grouped_epi_archive = function(.data, ..., .add=FALSE, .drop=dplyr::gro #' #' @importFrom dplyr groups #' @export -groups.grouped_epi_archive = function(x) { +groups.grouped_epi_archive <- function(x) { x$groups() } @@ -442,7 +454,7 @@ groups.grouped_epi_archive = function(x) { #' #' @importFrom dplyr ungroup #' @export -ungroup.grouped_epi_archive = function(x, ...) { +ungroup.grouped_epi_archive <- function(x, ...) { x$ungroup(...) } @@ -450,7 +462,7 @@ ungroup.grouped_epi_archive = function(x, ...) { #' @rdname group_by.epi_archive #' #' @export -is_grouped_epi_archive = function(x) { +is_grouped_epi_archive <- function(x) { inherits(x, "grouped_epi_archive") } @@ -458,12 +470,12 @@ is_grouped_epi_archive = function(x) { #' @rdname group_by.epi_archive #' #' @export -group_by_drop_default.grouped_epi_archive = function(.tbl) { +group_by_drop_default.grouped_epi_archive <- function(.tbl) { .tbl$group_by_drop_default() } #' @export -epix_truncate_versions_after.grouped_epi_archive = function(x, max_version) { - return ((x$clone()$truncate_versions_after(max_version))) +epix_truncate_versions_after.grouped_epi_archive <- function(x, max_version) { + return((x$clone()$truncate_versions_after(max_version))) # ^ second set of parens drops invisibility } diff --git a/R/growth_rate.R b/R/growth_rate.R index 17c4ec74..f54d1277 100644 --- a/R/growth_rate.R +++ b/R/growth_rate.R @@ -1,5 +1,5 @@ #' Estimate growth rate -#' +#' #' Estimates the growth rate of a signal at given points along the underlying #' sequence. 
Several methodologies are available; see the [growth rate #' vignette](https://cmu-delphi.github.io/epiprocess/articles/growth_rate.html) @@ -10,7 +10,7 @@ #' `y`). #' @param y Signal values. #' @param x0 Points at which we should estimate the growth rate. Must be a -#' subset of `x` (no extrapolation allowed). Default is `x`. +#' subset of `x` (no extrapolation allowed). Default is `x`. #' @param method Either "rel_change", "linear_reg", "smooth_spline", or #' "trend_filter", indicating the method to use for the growth rate #' calculation. The first two are local methods: they are run in a sliding @@ -28,7 +28,7 @@ #' @param na_rm Should missing values be removed before the computation? Default #' is `FALSE`. #' @param ... Additional arguments to pass to the method used to estimate the -#' derivative. +#' derivative. #' @return Vector of growth rate estimates at the specified points `x0`. #' #' @details The growth rate of a function f defined over a continuously-valued @@ -39,8 +39,8 @@ #' the signal value itself (or possibly a smoothed version of the signal #' value). #' -#' The following methods are available for estimating the growth rate: -#' +#' The following methods are available for estimating the growth rate: +#' #' * "rel_change": uses (B/A - 1) / h, where B is the average of `y` over the #' second half of a sliding window of bandwidth h centered at the reference #' point `x0`, and A the average over the first half. This can be seen as @@ -51,11 +51,11 @@ #' * "smooth_spline": uses the estimated derivative at `x0` from a smoothing #' spline fit to `x` and `y`, via `stats::smooth.spline()`, divided by the #' fitted value of the spline at `x0`. -#' * "trend_filter": uses the estimated derivative at `x0` from polynomial trend +#' * "trend_filter": uses the estimated derivative at `x0` from polynomial trend #' filtering (a discrete spline) fit to `x` and `y`, via #' `genlasso::trendfilter()`, divided by the fitted value of the discrete #' spline at `x0`. -#' +#' #' @section Log Scale: #' An alternative view for the growth rate of a function f in general is given #' by defining g(t) = log(f(t)), and then observing that g'(t) = f'(t) / @@ -74,7 +74,7 @@ #' `Date` objects, `h = 7`, and the reference point is January 7, then the #' sliding window contains all data in between January 1 and 14 (matching the #' behavior of `epi_slide()` with `before = h - 1` and `after = h`). -#' +#' #' @section Additional Arguments: #' For the global methods, "smooth_spline" and "trend_filter", additional #' arguments can be specified via `...` for the underlying estimation @@ -89,7 +89,7 @@ #' * `maxsteps`: maximum number of steps to take in the solution path before #' terminating. Default is 1000. #' * `cv`: should cross-validation be used to choose an effective degrees of -#' freedom for the fit? Default is `TRUE`. +#' freedom for the fit? Default is `TRUE`. #' * `k`: number of folds if cross-validation is to be used. Default is 3. #' * `df`: desired effective degrees of freedom for the trend filtering fit. If #' `cv = FALSE`, then `df` must be a positive integer; if `cv = TRUE`, then @@ -98,153 +98,166 @@ #' rule, respectively. Default is "min" (going along with the default `cv = #' TRUE`). Note that if `cv = FALSE`, then we require `df` to be set by the #' user. 
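To make the "rel_change" formula concrete, a hand-rolled sketch for a single reference point (toy data; this mirrors the windowing in the implementation below, with the default `h = 7`):

x <- 1:20
y <- 2^(x / 7) # toy signal doubling every 7 time steps
x_ref <- 10
ii <- (x > x_ref - 7) & (x <= x_ref + 7) # sliding window of bandwidth h = 7
b <- mean(y[ii][x[ii] > x_ref]) # average over second half of window
a <- mean(y[ii][x[ii] <= x_ref]) # average over first half
hh <- mean(x[ii][x[ii] > x_ref]) - mean(x[ii][x[ii] <= x_ref])
(b / a - 1) / hh # matches growth_rate(x, y, x0 = 10, method = "rel_change")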
-#' +#' #' @export #' @examples #' # COVID cases growth rate by state using default method relative change -#' jhu_csse_daily_subset %>% -#' group_by(geo_value) %>% -#' mutate(cases_gr = growth_rate(x = time_value, y = cases)) -#' +#' jhu_csse_daily_subset %>% +#' group_by(geo_value) %>% +#' mutate(cases_gr = growth_rate(x = time_value, y = cases)) +#' #' # Log scale, degree 4 polynomial and 6-fold cross validation -#' jhu_csse_daily_subset %>% -#' group_by(geo_value) %>% -#' mutate(gr_poly = growth_rate( x = time_value, y = cases, log_scale = TRUE, ord = 4, k = 6)) - -growth_rate = function(x = seq_along(y), y, x0 = x, - method = c("rel_change", "linear_reg", - "smooth_spline", "trend_filter"), - h = 7, log_scale = FALSE, - dup_rm = FALSE, na_rm = FALSE, ...) { +#' jhu_csse_daily_subset %>% +#' group_by(geo_value) %>% +#' mutate(gr_poly = growth_rate(x = time_value, y = cases, log_scale = TRUE, ord = 4, k = 6)) +growth_rate <- function(x = seq_along(y), y, x0 = x, + method = c( + "rel_change", "linear_reg", + "smooth_spline", "trend_filter" + ), + h = 7, log_scale = FALSE, + dup_rm = FALSE, na_rm = FALSE, ...) { # Check x, y, x0 if (length(x) != length(y)) Abort("`x` and `y` must have the same length.") if (!all(x0 %in% x)) Abort("`x0` must be a subset of `x`.") - + # Check the method - method = match.arg(method) - + method <- match.arg(method) + # Arrange in increasing order of x - o = order(x) - x = x[o] - y = y[o] - + o <- order(x) + x <- x[o] + y <- y[o] + # Convert to log(y) if we need to - y = as.numeric(y) - if (log_scale) y = log(y) + y <- as.numeric(y) + if (log_scale) y <- log(y) # Remove duplicates if we need to if (dup_rm) { - o = !duplicated(x) + o <- !duplicated(x) if (any(!o)) { Warn("`x` contains duplicate values. (If being run on a column in an `epi_df`, did you group by relevant key variables?)") } - x = x[o] - y = y[o] + x <- x[o] + y <- y[o] } - - + + # Remove NAs if we need to if (na_rm) { - o = !(is.na(x) & is.na(y)) - x = x[o] - y = y[o] + o <- !(is.na(x) & is.na(y)) + x <- x[o] + y <- y[o] } # Useful indices for later - i0 = x %in% x0 + i0 <- x %in% x0 # Local methods - if (method == "rel_change" || method == "linear_reg") { - g = purrr::map_dbl(x, function(x_ref) { + if (method == "rel_change" || method == "linear_reg") { + g <- purrr::map_dbl(x, function(x_ref) { # Form the local window - ii = (x > x_ref - h) & (x <= x_ref + h) - xx = x[ii] - yy = y[ii] + ii <- (x > x_ref - h) & (x <= x_ref + h) + xx <- x[ii] + yy <- y[ii] # Convert to numerics - x_ref = as.numeric(x_ref) - xx = as.numeric(xx) - + x_ref <- as.numeric(x_ref) + xx <- as.numeric(xx) + # Relative change if (method == "rel_change") { - right = xx > x_ref - left = xx <= x_ref - b = mean(yy[right]) - a = mean(yy[left]) - hh = mean(xx[right]) - mean(xx[left]) - if (log_scale) return((b-a) / hh) - else return((b/a - 1) / hh) + right <- xx > x_ref + left <- xx <= x_ref + b <- mean(yy[right]) + a <- mean(yy[left]) + hh <- mean(xx[right]) - mean(xx[left]) + if (log_scale) { + return((b - a) / hh) + } else { + return((b / a - 1) / hh) + } } # Linear regression else { - xm = xx - mean(xx) - ym = yy - mean(yy) - b = sum(xm * ym) / sum(xm^2) - a = mean(yy - b * xx) - if (log_scale) return(b) - else return(b / (a + b * x_ref)) + xm <- xx - mean(xx) + ym <- yy - mean(yy) + b <- sum(xm * ym) / sum(xm^2) + a <- mean(yy - b * xx) + if (log_scale) { + return(b) + } else { + return(b / (a + b * x_ref)) + } } }) - + return(g[i0]) } - + # Global methods if (method == "smooth_spline" || method == "trend_filter") { # 
Convert to numerics - x = as.numeric(x) - x0 = as.numeric(x0) - + x <- as.numeric(x) + x0 <- as.numeric(x0) + # Collect parameters - params = list(...) + params <- list(...) # Smoothing spline if (method == "smooth_spline") { - params$x = x - params$y = y - obj = do.call(stats::smooth.spline, params) - f0 = stats::predict(obj, x = x0)$y - d0 = stats::predict(obj, x = x0, deriv = 1)$y - if (log_scale) return(d0) - else return(d0 / f0) + params$x <- x + params$y <- y + obj <- do.call(stats::smooth.spline, params) + f0 <- stats::predict(obj, x = x0)$y + d0 <- stats::predict(obj, x = x0, deriv = 1)$y + if (log_scale) { + return(d0) + } else { + return(d0 / f0) + } } # Trend filtering else { - ord = params$ord - maxsteps = params$maxsteps - cv = params$cv - df = params$df - k = params$k + ord <- params$ord + maxsteps <- params$maxsteps + cv <- params$cv + df <- params$df + k <- params$k # Default parameters - if (is.null(ord)) ord = 3 - if (is.null(maxsteps)) maxsteps = 1000 - if (is.null(cv)) cv = TRUE - if (is.null(df)) df = "min" - if (is.null(k)) k = 3 + if (is.null(ord)) ord <- 3 + if (is.null(maxsteps)) maxsteps <- 1000 + if (is.null(cv)) cv <- TRUE + if (is.null(df)) df <- "min" + if (is.null(k)) k <- 3 # Check cv and df combo - if (is.numeric(df)) cv = FALSE + if (is.numeric(df)) cv <- FALSE if (!cv && !(is.numeric(df) && df == round(df))) { Abort("If `cv = FALSE`, then `df` must be an integer.") } # Compute trend filtering path - obj = genlasso::trendfilter(y = y, pos = x, ord = ord, max = maxsteps) + obj <- genlasso::trendfilter(y = y, pos = x, ord = ord, max = maxsteps) # Use CV to find df, if we need to if (cv) { - cv_obj = quiet(genlasso::cv.trendfilter(obj, k = k, mode = "df")) - df = ifelse(df == "min", cv_obj$df.min, cv_obj$df.1se) + cv_obj <- quiet(genlasso::cv.trendfilter(obj, k = k, mode = "df")) + df <- ifelse(df == "min", cv_obj$df.min, cv_obj$df.1se) } # Estimate growth rate and return - f = genlasso::coef.genlasso(obj, df = df)$beta - d = ExtendR(diff(f) / diff(x)) - if (log_scale) return(d[i0]) - else return((d / f)[i0]) + f <- genlasso::coef.genlasso(obj, df = df)$beta + d <- ExtendR(diff(f) / diff(x)) + if (log_scale) { + return(d[i0]) + } else { + return((d / f)[i0]) + } } } } diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index 367fe759..45db2855 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -41,15 +41,19 @@ #' #' @examples #' # warning message of data latency shown -#' epix_as_of(x = archive_cases_dv_subset, -#' max_version = max(archive_cases_dv_subset$DT$version)) -#' +#' epix_as_of( +#' x = archive_cases_dv_subset, +#' max_version = max(archive_cases_dv_subset$DT$version) +#' ) +#' #' @examples #' #' range(archive_cases_dv_subset$DT$version) # 2020-06-02 -- 2021-12-01 #' -#' epix_as_of(x = archive_cases_dv_subset, -#' max_version = as.Date("2020-06-12")) +#' epix_as_of( +#' x = archive_cases_dv_subset, +#' max_version = as.Date("2020-06-12") +#' ) #' #' # When fetching a snapshot as of the latest version with update data in the #' # archive, a warning is issued by default, as this update data might not yet @@ -59,15 +63,20 @@ #' # based on database queries, the latest available update might still be #' # subject to change, but previous versions should be finalized). 
We can #' # muffle such warnings with the following pattern: -#' withCallingHandlers({ -#' epix_as_of(x = archive_cases_dv_subset, -#' max_version = max(archive_cases_dv_subset$DT$version)) -#' }, epiprocess__snapshot_as_of_clobberable_version = function(wrn) invokeRestart("muffleWarning")) +#' withCallingHandlers( +#' { +#' epix_as_of( +#' x = archive_cases_dv_subset, +#' max_version = max(archive_cases_dv_subset$DT$version) +#' ) +#' }, +#' epiprocess__snapshot_as_of_clobberable_version = function(wrn) invokeRestart("muffleWarning") +#' ) #' # Since R 4.0, there is a `globalCallingHandlers` function that can be used #' # to globally toggle these warnings. #' #' @export -epix_as_of = function(x, max_version, min_time_value = -Inf, all_versions = FALSE) { +epix_as_of <- function(x, max_version, min_time_value = -Inf, all_versions = FALSE) { if (!inherits(x, "epi_archive")) Abort("`x` must be of class `epi_archive`.") return(x$as_of(max_version, min_time_value, all_versions = all_versions)) } @@ -102,12 +111,12 @@ epix_as_of = function(x, max_version, min_time_value = -Inf, all_versions = FALS #' (LOCF), by leaving the update `$DT` alone (other `epi_archive` methods are #' based on LOCF). Default is `"na"`. #' @return An `epi_archive` -epix_fill_through_version = function(x, fill_versions_end, - how=c("na", "locf")) { +epix_fill_through_version <- function(x, fill_versions_end, + how = c("na", "locf")) { if (!inherits(x, "epi_archive")) Abort("`x` must be of class `epi_archive`.") # Enclosing parentheses drop the invisibility flag. See description above of # potential mutation and aliasing behavior. - ( x$clone()$fill_through_version(fill_versions_end, how=how) ) + (x$clone()$fill_through_version(fill_versions_end, how = how)) } #' Merge two `epi_archive` objects @@ -155,21 +164,21 @@ epix_fill_through_version = function(x, fill_versions_end, #' @examples #' # create two example epi_archive datasets #' x <- archive_cases_dv_subset$DT %>% -#' dplyr::select(geo_value,time_value,version,case_rate_7d_av) %>% -#' as_epi_archive(compactify=TRUE) +#' dplyr::select(geo_value, time_value, version, case_rate_7d_av) %>% +#' as_epi_archive(compactify = TRUE) #' y <- archive_cases_dv_subset$DT %>% -#' dplyr::select(geo_value,time_value,version,percent_cli) %>% -#' as_epi_archive(compactify=TRUE) +#' dplyr::select(geo_value, time_value, version, percent_cli) %>% +#' as_epi_archive(compactify = TRUE) #' # merge results stored in a third object: -#' xy = epix_merge(x, y) +#' xy <- epix_merge(x, y) #' # vs. 
mutating x to hold the merge result: #' x$merge(y) #' #' @importFrom data.table key set setkeyv #' @export -epix_merge = function(x, y, - sync = c("forbid","na","locf","truncate"), - compactify = TRUE) { +epix_merge <- function(x, y, + sync = c("forbid", "na", "locf", "truncate"), + compactify = TRUE) { if (!inherits(x, "epi_archive")) { Abort("`x` must be of class `epi_archive`.") } @@ -190,15 +199,17 @@ epix_merge = function(x, y, if (length(x$additional_metadata) != 0L) { Warn("x$additional_metadata won't appear in merge result", - class = "epiprocess__epix_merge_ignores_additional_metadata") + class = "epiprocess__epix_merge_ignores_additional_metadata" + ) } if (length(y$additional_metadata) != 0L) { Warn("y$additional_metadata won't appear in merge result", - class = "epiprocess__epix_merge_ignores_additional_metadata") + class = "epiprocess__epix_merge_ignores_additional_metadata" + ) } - result_additional_metadata = list() + result_additional_metadata <- list() - result_clobberable_versions_start = + result_clobberable_versions_start <- if (all(is.na(c(x$clobberable_versions_start, y$clobberable_versions_start)))) { NA # (any type of NA is fine here) } else { @@ -216,21 +227,23 @@ epix_merge = function(x, y, "`x$versions_end` was not identical to `y$versions_end`;", "either ensure that `x` and `y` are equally up to date before merging,", "or specify how to deal with this using `sync`" - ), class="epiprocess__epix_merge_unresolved_sync") + ), class = "epiprocess__epix_merge_unresolved_sync") } else { - new_versions_end = x$versions_end - x_DT = x$DT - y_DT = y$DT + new_versions_end <- x$versions_end + x_DT <- x$DT + y_DT <- y$DT } } else if (sync %in% c("na", "locf")) { - new_versions_end = max(x$versions_end, y$versions_end) - x_DT = epix_fill_through_version(x, new_versions_end, sync)$DT - y_DT = epix_fill_through_version(y, new_versions_end, sync)$DT + new_versions_end <- max(x$versions_end, y$versions_end) + x_DT <- epix_fill_through_version(x, new_versions_end, sync)$DT + y_DT <- epix_fill_through_version(y, new_versions_end, sync)$DT } else if (sync == "truncate") { - new_versions_end = min(x$versions_end, y$versions_end) - x_DT = x$DT[x[["DT"]][["version"]] <= new_versions_end, names(x$DT), with=FALSE] - y_DT = y$DT[y[["DT"]][["version"]] <= new_versions_end, names(y$DT), with=FALSE] - } else Abort("unimplemented") + new_versions_end <- min(x$versions_end, y$versions_end) + x_DT <- x$DT[x[["DT"]][["version"]] <= new_versions_end, names(x$DT), with = FALSE] + y_DT <- y$DT[y[["DT"]][["version"]] <= new_versions_end, names(y$DT), with = FALSE] + } else { + Abort("unimplemented") + } # key(x_DT) should be the same as key(x$DT) and key(y_DT) should be the same # as key(y$DT). Below, we only use {x,y}_DT in the code (making it easier to @@ -241,15 +254,15 @@ epix_merge = function(x, y, # have a bug in the preprocessing, a weird/invalid archive as input, and/or a # data.table version with different semantics (which may break other parts of # our code). - x_DT_key_as_expected = identical(key(x$DT), key(x_DT)) - y_DT_key_as_expected = identical(key(y$DT), key(y_DT)) + x_DT_key_as_expected <- identical(key(x$DT), key(x_DT)) + y_DT_key_as_expected <- identical(key(y$DT), key(y_DT)) if (!x_DT_key_as_expected || !y_DT_key_as_expected) { Warn(" `epiprocess` internal warning (please report): pre-processing for epix_merge unexpectedly resulted in an intermediate data table (or tables) with a different key than the corresponding input archive. 
Manually setting intermediate data table keys to the expected values. - ", internal=TRUE) + ", internal = TRUE) setkeyv(x_DT, key(x$DT)) setkeyv(y_DT, key(y$DT)) } @@ -267,94 +280,106 @@ epix_merge = function(x, y, but y does not), please retry after processing them to share the same key (e.g., by summarizing x to remove the age breakdown, or by applying a static age breakdown to y). - ", class="epiprocess__epix_merge_x_y_must_have_same_key_set") + ", class = "epiprocess__epix_merge_x_y_must_have_same_key_set") } # `by` cols = result (and each input's) `key` cols, and determine # the row set, determined using a full join via `merge` # # non-`by` cols = "value"-ish cols, and are looked up with last # version carried forward via rolling joins - by = key(x_DT) # = some perm of key(y_DT) - if (!all(c("geo_value","time_value","version") %in% key(x_DT))) { + by <- key(x_DT) # = some perm of key(y_DT) + if (!all(c("geo_value", "time_value", "version") %in% key(x_DT))) { Abort('Invalid `by`; `by` is currently set to the common `key` of the two archives, and is expected to contain "geo_value", "time_value", and "version".', - class="epiprocess__epi_archive_must_have_required_key_cols") + class = "epiprocess__epi_archive_must_have_required_key_cols" + ) } if (length(by) < 1L || utils::tail(by, 1L) != "version") { Abort('Invalid `by`; `by` is currently set to the common `key` of the two archives, and is expected to have a "version" as the last key col.', - class="epiprocess__epi_archive_must_have_version_at_end_of_key") + class = "epiprocess__epi_archive_must_have_version_at_end_of_key" + ) } - x_nonby_colnames = setdiff(names(x_DT), by) - y_nonby_colnames = setdiff(names(y_DT), by) + x_nonby_colnames <- setdiff(names(x_DT), by) + y_nonby_colnames <- setdiff(names(y_DT), by) if (length(intersect(x_nonby_colnames, y_nonby_colnames)) != 0L) { Abort(" `x` and `y` DTs have overlapping non-by column names; this is currently not supported; please manually fix up first: any overlapping columns that can are key-like should be incorporated into the key, and other columns should be renamed. - ", class="epiprocess__epix_merge_x_y_must_not_have_overlapping_nonby_colnames") + ", class = "epiprocess__epix_merge_x_y_must_not_have_overlapping_nonby_colnames") } - x_by_vals = x_DT[, by, with=FALSE] + x_by_vals <- x_DT[, by, with = FALSE] if (anyDuplicated(x_by_vals) != 0L) { Abort(" The `by` columns must uniquely determine rows of `x$DT`; the `by` is currently set to the common `key` of the two archives, so this can be resolved by adding key-like columns to `x`'s key (to get a unique key). - ", class="epiprocess__epix_merge_by_cols_must_act_as_unique_key") + ", class = "epiprocess__epix_merge_by_cols_must_act_as_unique_key") } - y_by_vals = y_DT[, by, with=FALSE] + y_by_vals <- y_DT[, by, with = FALSE] if (anyDuplicated(y_by_vals) != 0L) { Abort(" The `by` columns must uniquely determine rows of `y$DT`; the `by` is currently set to the common `key` of the two archives, so this can be resolved by adding key-like columns to `y`'s key (to get a unique key). - ", class="epiprocess__epix_merge_by_cols_must_act_as_unique_key") + ", class = "epiprocess__epix_merge_by_cols_must_act_as_unique_key") } - result_DT = merge(x_by_vals, y_by_vals, by=by, - # We must have `all=TRUE` or we may skip updates - # from x and/or y and corrupt the history - all=TRUE, - # We don't want Cartesian products, but the - # by-is-unique-key check above already ensures - # this. 
(Note that `allow.cartesian=FALSE` doesn't - # actually catch all Cartesian products anyway.) - # Disable superfluous check: - allow.cartesian=TRUE) - set(result_DT,, x_nonby_colnames, - x_DT[result_DT[, by, with=FALSE], x_nonby_colnames, with=FALSE, - # It's good practice to specify `on`, and we must - # explicitly specify `on` if there's a potential key vs. - # by order mismatch (not possible currently for x - # with by = key(x$DT), but possible for y): - on = by, - # last version carried forward: - roll=TRUE, - # requesting non-version key that doesn't exist in the other archive, - # or before its first version, should result in NA - nomatch=NA, - # see note on `allow.cartesian` above; currently have a - # similar story here. - allow.cartesian=TRUE]) - set(result_DT,, y_nonby_colnames, - y_DT[result_DT[, by, with=FALSE], y_nonby_colnames, with=FALSE, - on = by, - roll=TRUE, - nomatch=NA, - allow.cartesian=TRUE]) + result_DT <- merge(x_by_vals, y_by_vals, + by = by, + # We must have `all=TRUE` or we may skip updates + # from x and/or y and corrupt the history + all = TRUE, + # We don't want Cartesian products, but the + # by-is-unique-key check above already ensures + # this. (Note that `allow.cartesian=FALSE` doesn't + # actually catch all Cartesian products anyway.) + # Disable superfluous check: + allow.cartesian = TRUE + ) + set( + result_DT, , x_nonby_colnames, + x_DT[result_DT[, by, with = FALSE], x_nonby_colnames, + with = FALSE, + # It's good practice to specify `on`, and we must + # explicitly specify `on` if there's a potential key vs. + # by order mismatch (not possible currently for x + # with by = key(x$DT), but possible for y): + on = by, + # last version carried forward: + roll = TRUE, + # requesting non-version key that doesn't exist in the other archive, + # or before its first version, should result in NA + nomatch = NA, + # see note on `allow.cartesian` above; currently have a + # similar story here. + allow.cartesian = TRUE + ] + ) + set( + result_DT, , y_nonby_colnames, + y_DT[result_DT[, by, with = FALSE], y_nonby_colnames, + with = FALSE, + on = by, + roll = TRUE, + nomatch = NA, + allow.cartesian = TRUE + ] + ) # The key could be unset in case of a key vs. by order mismatch as # noted above. 
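  # (For intuition about the rolling lookups just performed, with
  # hypothetical values: if `x_DT` records updates for
  # ("ca", "2020-06-01") at versions 2020-06-02 (value 10) and
  # 2020-06-05 (value 12), then `roll = TRUE` looks up 10 at a requested
  # version of 2020-06-03, carrying the last version forward, looks up
  # 12 at 2020-06-06, and `nomatch = NA` yields NA at 2020-06-01, before
  # the first update.)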
Ensure that we keep it: setkeyv(result_DT, by) - return (as_epi_archive( + return(as_epi_archive( result_DT[], # clear data.table internal invisibility flag if set geo_type = x$geo_type, time_type = x$time_type, - other_keys = setdiff(key(result_DT), c("geo_value","time_value","version")), + other_keys = setdiff(key(result_DT), c("geo_value", "time_value", "version")), additional_metadata = result_additional_metadata, # It'd probably be better to pre-compactify before the merge, and might be # guaranteed not to be necessary to compactify the merge result if the @@ -383,10 +408,11 @@ epix_merge = function(x, y, #' @return a `col_modify_recorder_df` #' #' @noRd -new_col_modify_recorder_df = function(parent_df) { +new_col_modify_recorder_df <- function(parent_df) { if (!inherits(parent_df, "data.frame")) { Abort('`parent_df` must inherit class `"data.frame"`', - internal=TRUE) + internal = TRUE + ) } `class<-`(parent_df, c("col_modify_recorder_df", class(parent_df))) } @@ -398,17 +424,20 @@ new_col_modify_recorder_df = function(parent_df) { #' input to [`dplyr::dplyr_col_modify`] that this class was designed to record #' #' @noRd -destructure_col_modify_recorder_df = function(col_modify_recorder_df) { +destructure_col_modify_recorder_df <- function(col_modify_recorder_df) { if (!inherits(col_modify_recorder_df, "col_modify_recorder_df")) { Abort('`col_modify_recorder_df` must inherit class `"col_modify_recorder_df"`', - internal=TRUE) + internal = TRUE + ) } list( unchanged_parent_df = col_modify_recorder_df %>% `attr<-`("epiprocess::col_modify_recorder_df::cols", NULL) %>% `class<-`(setdiff(class(.), "col_modify_recorder_df")), cols = attr(col_modify_recorder_df, - "epiprocess::col_modify_recorder_df::cols", exact=TRUE) + "epiprocess::col_modify_recorder_df::cols", + exact = TRUE + ) ) } @@ -420,10 +449,11 @@ destructure_col_modify_recorder_df = function(col_modify_recorder_df) { #' @importFrom dplyr dplyr_col_modify #' @export #' @noRd -dplyr_col_modify.col_modify_recorder_df = function(data, cols) { - if (!is.null(attr(data, "epiprocess::col_modify_recorder_df::cols", exact=TRUE))) { +dplyr_col_modify.col_modify_recorder_df <- function(data, cols) { + if (!is.null(attr(data, "epiprocess::col_modify_recorder_df::cols", exact = TRUE))) { Abort("`col_modify_recorder_df` can only record `cols` once", - internal=TRUE) + internal = TRUE + ) } attr(data, "epiprocess::col_modify_recorder_df::cols") <- cols data @@ -452,7 +482,7 @@ dplyr_col_modify.col_modify_recorder_df = function(data, cols) { #' Don't export this without cleaning up language of "mutate" as in side effects #' vs. "mutate" as in `dplyr::mutate`. #' @noRd -epix_detailed_restricted_mutate = function(.data, ...) { +epix_detailed_restricted_mutate <- function(.data, ...) { # We don't want to directly use `dplyr::mutate` on the `$DT`, as: # - this likely copies the entire table # - `mutate` behavior, including the output class, changes depending on @@ -466,12 +496,12 @@ epix_detailed_restricted_mutate = function(.data, ...) { # back to something that will use `dplyr`'s included `mutate` method(s), # then convert this using shallow operations into a `data.table`. # - Use `col_modify_recorder_df` to get the desired details. - in_tbl = tibble::as_tibble(as.list(.data$DT), .name_repair="minimal") - col_modify_cols = + in_tbl <- tibble::as_tibble(as.list(.data$DT), .name_repair = "minimal") + col_modify_cols <- destructure_col_modify_recorder_df( mutate(new_col_modify_recorder_df(in_tbl), ...) 
)[["cols"]] - invalidated_key_col_is = + invalidated_key_col_is <- which(purrr::map_lgl(key(.data$DT), function(key_colname) { key_colname %in% names(col_modify_cols) && !rlang::is_reference(in_tbl[[key_colname]], col_modify_cols[[key_colname]]) @@ -480,7 +510,8 @@ epix_detailed_restricted_mutate = function(.data, ...) { rlang::abort(paste_lines(c( "Key columns must not be replaced or removed.", wrap_varnames(key(.data$DT)[invalidated_key_col_is], - initial="Flagged key cols: ") + initial = "Flagged key cols: " + ) ))) } else { # Have `dplyr` do the `dplyr_col_modify`, keeping the column-level-aliasing @@ -493,12 +524,12 @@ epix_detailed_restricted_mutate = function(.data, ...) { # sorting (including potential extra copies) or sortedness checking, then # `setDT` (rather than `as.data.table`, in order to prevent column copying # to establish ownership according to `data.table`'s memory model). - out_DT = dplyr::dplyr_col_modify(in_tbl, col_modify_cols) %>% + out_DT <- dplyr::dplyr_col_modify(in_tbl, col_modify_cols) %>% data.table::setattr("sorted", data.table::key(.data$DT)) %>% - data.table::setDT(key=key(.data$DT)) - out_archive = .data$clone() + data.table::setDT(key = key(.data$DT)) + out_archive <- .data$clone() out_archive$DT <- out_DT - request_names = names(col_modify_cols) + request_names <- names(col_modify_cols) return(list( archive = out_archive, request_names = request_names @@ -577,7 +608,7 @@ epix_detailed_restricted_mutate = function(.data, ...) { #' #' @examples #' -#' grouped_archive = archive_cases_dv_subset %>% group_by(geo_value) +#' grouped_archive <- archive_cases_dv_subset %>% group_by(geo_value) #' #' # `print` for metadata and method listing: #' grouped_archive %>% print() @@ -586,10 +617,12 @@ epix_detailed_restricted_mutate = function(.data, ...) { #' #' archive_cases_dv_subset %>% #' group_by(geo_value) %>% -#' epix_slide(f = ~ mean(.x$case_rate_7d_av), -#' before = 2, -#' ref_time_values = as.Date("2020-06-11") + 0:2, -#' new_col_name = 'case_rate_3d_av') %>% +#' epix_slide( +#' f = ~ mean(.x$case_rate_7d_av), +#' before = 2, +#' ref_time_values = as.Date("2020-06-11") + 0:2, +#' new_col_name = "case_rate_3d_av" +#' ) %>% #' ungroup() #' #' # ----------------------------------------------------------------- @@ -597,34 +630,42 @@ epix_detailed_restricted_mutate = function(.data, ...) 
{ #' # Advanced: some other features of dplyr grouping are implemented: #' #' library(dplyr) -#' toy_archive = +#' toy_archive <- #' tribble( -#' ~geo_value, ~age_group, ~time_value, ~version, ~value, -#' "us", "adult", "2000-01-01", "2000-01-02", 121, -#' "us", "pediatric", "2000-01-02", "2000-01-03", 5, # (addition) -#' "us", "adult", "2000-01-01", "2000-01-03", 125, # (revision) -#' "us", "adult", "2000-01-02", "2000-01-03", 130 # (addition) +#' ~geo_value, ~age_group, ~time_value, ~version, ~value, +#' "us", "adult", "2000-01-01", "2000-01-02", 121, +#' "us", "pediatric", "2000-01-02", "2000-01-03", 5, # (addition) +#' "us", "adult", "2000-01-01", "2000-01-03", 125, # (revision) +#' "us", "adult", "2000-01-02", "2000-01-03", 130 # (addition) +#' ) %>% +#' mutate( +#' age_group = ordered(age_group, c("pediatric", "adult")), +#' time_value = as.Date(time_value), +#' version = as.Date(version) #' ) %>% -#' mutate(age_group = ordered(age_group, c("pediatric", "adult")), -#' time_value = as.Date(time_value), -#' version = as.Date(version)) %>% #' as_epi_archive(other_keys = "age_group") #' #' # The following are equivalent: #' toy_archive %>% group_by(geo_value, age_group) -#' toy_archive %>% group_by(geo_value) %>% group_by(age_group, .add=TRUE) -#' grouping_cols = c("geo_value", "age_group") +#' toy_archive %>% +#' group_by(geo_value) %>% +#' group_by(age_group, .add = TRUE) +#' grouping_cols <- c("geo_value", "age_group") #' toy_archive %>% group_by(across(all_of(grouping_cols))) #' #' # And these are equivalent: #' toy_archive %>% group_by(geo_value) -#' toy_archive %>% group_by(geo_value, age_group) %>% ungroup(age_group) +#' toy_archive %>% +#' group_by(geo_value, age_group) %>% +#' ungroup(age_group) #' #' # To get the grouping variable names as a `list` of `name`s (a.k.a. symbols): -#' toy_archive %>% group_by(geo_value) %>% groups() +#' toy_archive %>% +#' group_by(geo_value) %>% +#' groups() #' #' toy_archive %>% -#' group_by(geo_value, age_group, .drop=FALSE) %>% +#' group_by(geo_value, age_group, .drop = FALSE) %>% #' epix_slide(f = ~ sum(.x$value), before = 20) %>% #' ungroup() #' @@ -632,27 +673,30 @@ epix_detailed_restricted_mutate = function(.data, ...) { #' @export #' #' @aliases grouped_epi_archive -group_by.epi_archive = function(.data, ..., .add=FALSE, .drop=dplyr::group_by_drop_default(.data)) { +group_by.epi_archive <- function(.data, ..., .add = FALSE, .drop = dplyr::group_by_drop_default(.data)) { # `add` makes no difference; this is an ungrouped `epi_archive`. - detailed_mutate = epix_detailed_restricted_mutate(.data, ...) + detailed_mutate <- epix_detailed_restricted_mutate(.data, ...) if (!rlang::is_bool(.drop)) { Abort("`.drop` must be TRUE or FALSE") } if (!.drop) { - grouping_cols = as.list(detailed_mutate[["archive"]][["DT"]])[detailed_mutate[["request_names"]]] - grouping_col_is_factor = purrr::map_lgl(grouping_cols, is.factor) + grouping_cols <- as.list(detailed_mutate[["archive"]][["DT"]])[detailed_mutate[["request_names"]]] + grouping_col_is_factor <- purrr::map_lgl(grouping_cols, is.factor) # ^ Use `as.list` to try to avoid any possibility of a deep copy. 
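  # For intuition (hypothetical data): with `.drop = FALSE` and a factor
  # grouping column, dplyr keeps empty factor levels as groups, so e.g.
  #   toy_archive %>% group_by(geo_value, age_group, .drop = FALSE)
  # still yields an "age_group = pediatric" group at versions where only
  # "adult" rows have updates. The checks below warn when `.drop = FALSE`
  # cannot have that effect (no factor grouping columns) or may behave
  # surprisingly (non-factor grouping columns listed after a factor one).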
if (!any(grouping_col_is_factor)) { Warn("`.drop=FALSE` but there are no factor grouping columns; did you mean to convert one of the columns to a factor beforehand?", - class = "epiprocess__group_by_epi_archive__drop_FALSE_no_factors") + class = "epiprocess__group_by_epi_archive__drop_FALSE_no_factors" + ) } else if (any(diff(grouping_col_is_factor) == -1L)) { Warn("`.drop=FALSE` but there are one or more non-factor grouping columns listed after a factor grouping column; this may produce groups with `NA`s for these columns; see https://github.com/tidyverse/dplyr/issues/5369#issuecomment-683762553; depending on how you want completion to work, you might instead want to convert all grouping columns to factors beforehand, specify the non-factor grouping columns first, or use `.drop=TRUE` and add a call to `tidyr::complete`.", - class = "epiprocess__group_by_epi_archive__drop_FALSE_nonfactor_after_factor") + class = "epiprocess__group_by_epi_archive__drop_FALSE_nonfactor_after_factor" + ) } } grouped_epi_archive$new(detailed_mutate[["archive"]], - detailed_mutate[["request_names"]], - drop = .drop) + detailed_mutate[["request_names"]], + drop = .drop + ) } #' Slide a function over variables in an `epi_archive` or `grouped_epi_archive` @@ -818,17 +862,20 @@ group_by.epi_archive = function(.data, ..., .add=FALSE, .drop=dplyr::group_by_dr #' #' # Reference time points for which we want to compute slide values: #' ref_time_values <- seq(as.Date("2020-06-01"), -#' as.Date("2020-06-15"), -#' by = "1 day") +#' as.Date("2020-06-15"), +#' by = "1 day" +#' ) #' #' # A simple (but not very useful) example (see the archive vignette for a more #' # realistic one): #' archive_cases_dv_subset %>% #' group_by(geo_value) %>% -#' epix_slide(f = ~ mean(.x$case_rate_7d_av), -#' before = 2, -#' ref_time_values = ref_time_values, -#' new_col_name = 'case_rate_7d_av_recent_av') %>% +#' epix_slide( +#' f = ~ mean(.x$case_rate_7d_av), +#' before = 2, +#' ref_time_values = ref_time_values, +#' new_col_name = "case_rate_7d_av_recent_av" +#' ) %>% #' ungroup() #' # We requested time windows that started 2 days before the corresponding time #' # values. The actual number of `time_value`s in each computation depends on @@ -846,23 +893,24 @@ group_by.epi_archive = function(.data, ..., .add=FALSE, .drop=dplyr::group_by_dr #' # Examining characteristics of the data passed to each computation with #' # `all_versions=FALSE`. 
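#' # A condensed sketch first (the column name `n_rows` is illustrative;
#' # the call pattern mirrors the grouped example above): count the rows
#' # visible to each computation at a single reference time.
#' archive_cases_dv_subset %>%
#'   group_by(geo_value) %>%
#'   epix_slide(
#'     f = ~ nrow(.x),
#'     before = 5,
#'     ref_time_values = as.Date("2020-06-12"),
#'     new_col_name = "n_rows"
#'   ) %>%
#'   ungroup()
#'
#' # More fully, recording each window's time range, row count, and class: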
#' archive_cases_dv_subset %>% -#' group_by(geo_value) %>% -#' epix_slide( -#' function(x, gk, rtv) { -#' tibble( -#' time_range = if(nrow(x) == 0L) { -#' "0 `time_value`s" -#' } else { -#' sprintf("%s -- %s", min(x$time_value), max(x$time_value)) -#' }, -#' n = nrow(x), -#' class1 = class(x)[[1L]] -#' ) -#' }, -#' before = 5, all_versions = FALSE, -#' ref_time_values = ref_time_values, names_sep=NULL) %>% -#' ungroup() %>% -#' arrange(geo_value, time_value) +#' group_by(geo_value) %>% +#' epix_slide( +#' function(x, gk, rtv) { +#' tibble( +#' time_range = if (nrow(x) == 0L) { +#' "0 `time_value`s" +#' } else { +#' sprintf("%s -- %s", min(x$time_value), max(x$time_value)) +#' }, +#' n = nrow(x), +#' class1 = class(x)[[1L]] +#' ) +#' }, +#' before = 5, all_versions = FALSE, +#' ref_time_values = ref_time_values, names_sep = NULL +#' ) %>% +#' ungroup() %>% +#' arrange(geo_value, time_value) #' #' # --- Advanced: --- #' @@ -884,7 +932,7 @@ group_by.epi_archive = function(.data, ..., .add=FALSE, .drop=dplyr::group_by_dr #' toString(min(x$DT$version)) #' }, #' versions_end = x$versions_end, -#' time_range = if(nrow(x$DT) == 0L) { +#' time_range = if (nrow(x$DT) == 0L) { #' "0 `time_value`s" #' } else { #' sprintf("%s -- %s", min(x$DT$time_value), max(x$DT$time_value)) @@ -894,7 +942,8 @@ group_by.epi_archive = function(.data, ..., .add=FALSE, .drop=dplyr::group_by_dr #' ) #' }, #' before = 5, all_versions = TRUE, -#' ref_time_values = ref_time_values, names_sep=NULL) %>% +#' ref_time_values = ref_time_values, names_sep = NULL +#' ) %>% #' ungroup() %>% #' # Focus on one geo_value so we can better see the columns above: #' filter(geo_value == "ca") %>% @@ -902,30 +951,31 @@ group_by.epi_archive = function(.data, ..., .add=FALSE, .drop=dplyr::group_by_dr #' #' @importFrom rlang enquo !!! 
#' @export -epix_slide = function(x, f, ..., before, ref_time_values, - time_step, new_col_name = "slide_value", - as_list_col = FALSE, names_sep = "_", - all_versions = FALSE) { - if (!is_epi_archive(x, grouped_okay=TRUE)) { +epix_slide <- function(x, f, ..., before, ref_time_values, + time_step, new_col_name = "slide_value", + as_list_col = FALSE, names_sep = "_", + all_versions = FALSE) { + if (!is_epi_archive(x, grouped_okay = TRUE)) { Abort("`x` must be of class `epi_archive` or `grouped_epi_archive`.") } - return(x$slide(f, ..., before = before, - ref_time_values = ref_time_values, - time_step = time_step, - new_col_name = new_col_name, - as_list_col = as_list_col, - names_sep = names_sep, - all_versions = all_versions - )) + return(x$slide(f, ..., + before = before, + ref_time_values = ref_time_values, + time_step = time_step, + new_col_name = new_col_name, + as_list_col = as_list_col, + names_sep = names_sep, + all_versions = all_versions + )) } #' Default value for `ref_time_values` in an `epix_slide` #' #' @noRd -epix_slide_ref_time_values_default = function(ea) { - versions_with_updates = c(ea$DT$version, ea$versions_end) - ref_time_values = tidyr::full_seq(versions_with_updates, guess_period(versions_with_updates)) - return (ref_time_values) +epix_slide_ref_time_values_default <- function(ea) { + versions_with_updates <- c(ea$DT$version, ea$versions_end) + ref_time_values <- tidyr::full_seq(versions_with_updates, guess_period(versions_with_updates)) + return(ref_time_values) } #' Filter an `epi_archive` object to keep only older versions @@ -941,12 +991,12 @@ epix_slide_ref_time_values_default = function(ea) { #' @return An `epi_archive` object #' #' @export -epix_truncate_versions_after = function(x, max_version) { +epix_truncate_versions_after <- function(x, max_version) { UseMethod("epix_truncate_versions_after") } #' @export -epix_truncate_versions_after.epi_archive = function(x, max_version) { - return ((x$clone()$truncate_versions_after(max_version))) +epix_truncate_versions_after.epi_archive <- function(x, max_version) { + return((x$clone()$truncate_versions_after(max_version))) # ^ second set of parens drops invisibility } diff --git a/R/methods-epi_df.R b/R/methods-epi_df.R index 6e4666e7..7e002320 100644 --- a/R/methods-epi_df.R +++ b/R/methods-epi_df.R @@ -8,7 +8,7 @@ #' #' @importFrom tibble as_tibble #' @export -as_tibble.epi_df = function(x, ...) { +as_tibble.epi_df <- function(x, ...) { # Decaying drops the class and metadata. `as_tibble.grouped_df` drops the # grouping and should be called by `NextMethod()` in the current design. # See #223 for discussion of alternatives. @@ -16,22 +16,23 @@ as_tibble.epi_df = function(x, ...) { } #' Convert to tsibble format -#' -#' Converts an `epi_df` object into a tsibble, where the index is taken to be +#' +#' Converts an `epi_df` object into a tsibble, where the index is taken to be #' `time_value`, and the key variables taken to be `geo_value` along with any -#' others in the `other_keys` field of the metadata, or else explicitly set. +#' others in the `other_keys` field of the metadata, or else explicitly set. #' #' @method as_tsibble epi_df #' @param x The `epi_df` object. -#' @param key Optional. Any additional keys (other than `geo_value`) to add to +#' @param key Optional. Any additional keys (other than `geo_value`) to add to #' the `tsibble`. #' @param ... additional arguments passed on to `tsibble::as_tsibble()` #' @export -as_tsibble.epi_df = function(x, key, ...) 
{ - if (missing(key)) key = c("geo_value", attributes(x)$metadata$other_keys) +as_tsibble.epi_df <- function(x, key, ...) { + if (missing(key)) key <- c("geo_value", attributes(x)$metadata$other_keys) return(as_tsibble(tibble::as_tibble(x), - key = tidyselect::all_of(key), index = "time_value", - ...)) + key = tidyselect::all_of(key), index = "time_value", + ... + )) } #' Base S3 methods for an `epi_df` object @@ -43,9 +44,11 @@ as_tsibble.epi_df = function(x, key, ...) { #' #' @method print epi_df #' @export -print.epi_df = function(x, ...) { - cat("An `epi_df` object,", prettyNum(nrow(x),","), "x", - prettyNum(ncol(x),","), "with metadata:\n") +print.epi_df <- function(x, ...) { + cat( + "An `epi_df` object,", prettyNum(nrow(x), ","), "x", + prettyNum(ncol(x), ","), "with metadata:\n" + ) cat(sprintf("* %-9s = %s\n", "geo_type", attributes(x)$metadata$geo_type)) cat(sprintf("* %-9s = %s\n", "time_type", attributes(x)$metadata$time_type)) cat(sprintf("* %-9s = %s\n", "as_of", attributes(x)$metadata$as_of)) @@ -67,7 +70,7 @@ print.epi_df = function(x, ...) { #' @importFrom rlang .data #' @importFrom stats median #' @export -summary.epi_df = function(object, ...) { +summary.epi_df <- function(object, ...) { cat("An `epi_df` x, with metadata:\n") cat(sprintf("* %-9s = %s\n", "geo_type", attributes(object)$metadata$geo_type)) cat(sprintf("* %-9s = %s\n", "time_type", attributes(object)$metadata$time_type)) @@ -75,10 +78,12 @@ summary.epi_df = function(object, ...) { cat("----------\n") cat(sprintf("* %-27s = %s\n", "min time value", min(object$time_value))) cat(sprintf("* %-27s = %s\n", "max time value", max(object$time_value))) - cat(sprintf("* %-27s = %i\n", "average rows per time value", - as.integer(object %>% dplyr::group_by(.data$time_value) %>% - dplyr::summarize(num = dplyr::n()) %>% - dplyr::summarize(mean(.data$num))))) + cat(sprintf( + "* %-27s = %i\n", "average rows per time value", + as.integer(object %>% dplyr::group_by(.data$time_value) %>% + dplyr::summarize(num = dplyr::n()) %>% + dplyr::summarize(mean(.data$num))) + )) } #' Drop any `epi_df` metadata and class on a data frame @@ -93,7 +98,7 @@ summary.epi_df = function(object, ...) { #' present, dropped #' #' @noRd -decay_epi_df = function(x) { +decay_epi_df <- function(x) { attributes(x)$metadata <- NULL class(x) <- class(x)[class(x) != "epi_df"] x @@ -117,23 +122,26 @@ decay_epi_df = function(x) { #' @importFrom dplyr dplyr_reconstruct #' @export #' @noRd -dplyr_reconstruct.epi_df = function(data, template) { +dplyr_reconstruct.epi_df <- function(data, template) { # Start from a reconstruction for the backing S3 classes; this ensures that we # keep any grouping that has been applied: res <- NextMethod() - + cn <- names(res) # Duplicate columns, Abort - dup_col_names = cn[duplicated(cn)] + dup_col_names <- cn[duplicated(cn)] if (length(dup_col_names) != 0) { - Abort(paste0("Column name(s) ", - paste(unique(dup_col_names), - collapse = ", "), " must not be duplicated.")) + Abort(paste0( + "Column name(s) ", + paste(unique(dup_col_names), + collapse = ", " + ), " must not be duplicated." + )) } - + not_epi_df <- !("time_value" %in% cn) || !("geo_value" %in% cn) - + if (not_epi_df) { # If we're calling on an `epi_df` from one of our own functions, we need to # decay to a non-`epi_df` result. If `dplyr` is calling, `x` is a tibble, @@ -142,43 +150,45 @@ dplyr_reconstruct.epi_df = function(data, template) { # should work in both cases. 
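    # For example: `dplyr::select(edf, -time_value)` (with `edf` a
    # hypothetical `epi_df`) drops a required column, so we land here and
    # hand back a decayed, plain-tibble result.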
return(decay_epi_df(res)) } - + res <- reclass(res, attr(template, "metadata")) # XXX we may want verify the `geo_type` and `time_type` here. If it's # significant overhead, we may also want to keep this less strict version # around and implement some extra S3 methods that use it, when appropriate. - + # Amend additional metadata if some other_keys cols are dropped in the subset - old_other_keys = attr(template, "metadata")$other_keys + old_other_keys <- attr(template, "metadata")$other_keys attr(res, "metadata")$other_keys <- old_other_keys[old_other_keys %in% cn] - + res } #' @export `[.epi_df` <- function(x, i, j, drop = FALSE) { res <- NextMethod() - - if (!is.data.frame(res)) return(res) - + + if (!is.data.frame(res)) { + return(res) + } + dplyr::dplyr_reconstruct(res, x) } #' @importFrom dplyr dplyr_col_modify #' @export -dplyr_col_modify.epi_df = function(data, cols) { +dplyr_col_modify.epi_df <- function(data, cols) { dplyr::dplyr_reconstruct(NextMethod(), data) } #' @importFrom dplyr dplyr_row_slice #' @export -dplyr_row_slice.epi_df = function(data, i, ...) { +dplyr_row_slice.epi_df <- function(data, i, ...) { dplyr::dplyr_reconstruct(NextMethod(), data) } #' @export -`names<-.epi_df` = function(x, value) { +`names<-.epi_df` <- function(x, value) { old_names <- names(x) old_metadata <- attr(x, "metadata") old_other_keys <- old_metadata[["other_keys"]] @@ -193,18 +203,18 @@ dplyr_row_slice.epi_df = function(data, i, ...) { #' @method group_by epi_df #' @rdname print.epi_df #' @export -group_by.epi_df = function(.data, ...) { - metadata = attributes(.data)$metadata - .data = NextMethod() +group_by.epi_df <- function(.data, ...) { + metadata <- attributes(.data)$metadata + .data <- NextMethod() reclass(.data, metadata) } #' @method ungroup epi_df #' @rdname print.epi_df #' @export -ungroup.epi_df = function(x, ...) { - metadata = attributes(x)$metadata - x = NextMethod() +ungroup.epi_df <- function(x, ...) { + metadata <- attributes(x)$metadata + x <- NextMethod() reclass(x, metadata) } @@ -214,7 +224,7 @@ ungroup.epi_df = function(x, ...) { #' @param .f function or formula; see [`dplyr::group_modify`] #' @param .keep Boolean; see [`dplyr::group_modify`] #' @export -group_modify.epi_df = function(.data, .f, ..., .keep = FALSE) { +group_modify.epi_df <- function(.data, .f, ..., .keep = FALSE) { dplyr::dplyr_reconstruct(NextMethod(), .data) } @@ -222,13 +232,13 @@ group_modify.epi_df = function(.data, .f, ..., .keep = FALSE) { #' @rdname print.epi_df #' @param data The `epi_df` object. #' @export -unnest.epi_df = function(data, ...) { +unnest.epi_df <- function(data, ...) 
{ dplyr::dplyr_reconstruct(NextMethod(), data) } # Simple reclass function -reclass = function(x, metadata) { - class(x) = unique(c("epi_df", class(x))) - attributes(x)$metadata = metadata +reclass <- function(x, metadata) { + class(x) <- unique(c("epi_df", class(x))) + attributes(x)$metadata <- metadata return(x) } diff --git a/R/outliers.R b/R/outliers.R index e5fd8765..1eb3ea01 100644 --- a/R/outliers.R +++ b/R/outliers.R @@ -46,38 +46,54 @@ #' @export #' @importFrom dplyr select #' @examples -#' detection_methods = dplyr::bind_rows( -#' dplyr::tibble(method = "rm", -#' args = list(list(detect_negatives = TRUE, -#' detection_multiplier = 2.5)), -#' abbr = "rm"), -#' dplyr::tibble(method = "stl", -#' args = list(list(detect_negatives = TRUE, -#' detection_multiplier = 2.5, -#' seasonal_period = 7)), -#' abbr = "stl_seasonal"), -#' dplyr::tibble(method = "stl", -#' args = list(list(detect_negatives = TRUE, -#' detection_multiplier = 2.5, -#' seasonal_period = NULL)), -#' abbr = "stl_nonseasonal")) +#' detection_methods <- dplyr::bind_rows( +#' dplyr::tibble( +#' method = "rm", +#' args = list(list( +#' detect_negatives = TRUE, +#' detection_multiplier = 2.5 +#' )), +#' abbr = "rm" +#' ), +#' dplyr::tibble( +#' method = "stl", +#' args = list(list( +#' detect_negatives = TRUE, +#' detection_multiplier = 2.5, +#' seasonal_period = 7 +#' )), +#' abbr = "stl_seasonal" +#' ), +#' dplyr::tibble( +#' method = "stl", +#' args = list(list( +#' detect_negatives = TRUE, +#' detection_multiplier = 2.5, +#' seasonal_period = NULL +#' )), +#' abbr = "stl_nonseasonal" +#' ) +#' ) #' -#' x <- incidence_num_outlier_example %>% -#' dplyr::select(geo_value,time_value,cases) %>% -#' as_epi_df() %>% -#' group_by(geo_value) %>% -#' mutate(outlier_info = detect_outlr( -#' x = time_value, y = cases, -#' methods = detection_methods, -#' combiner = "median")) %>% -#' unnest(outlier_info) -detect_outlr = function(x = seq_along(y), y, - methods = tibble::tibble(method = "rm", - args = list(list()), - abbr = "rm"), - combiner = c("median", "mean", "none")) { +#' x <- incidence_num_outlier_example %>% +#' dplyr::select(geo_value, time_value, cases) %>% +#' as_epi_df() %>% +#' group_by(geo_value) %>% +#' mutate(outlier_info = detect_outlr( +#' x = time_value, y = cases, +#' methods = detection_methods, +#' combiner = "median" +#' )) %>% +#' unnest(outlier_info) +detect_outlr <- function(x = seq_along(y), y, + methods = tibble::tibble( + method = "rm", + args = list(list()), + abbr = "rm" + ), + combiner = c("median", "mean", "none")) { # Validate combiner - combiner = match.arg(combiner) + combiner <- match.arg(combiner) # Validate that x contains all distinct values if (any(duplicated(x))) { @@ -85,32 +101,33 @@ detect_outlr = function(x = seq_along(y), y, } # Run all outlier detection methods - results = purrr::pmap_dfc(methods, function(method, args, abbr) { - if (is.character(method)) method = paste0("detect_outlr_", method) + results <- purrr::pmap_dfc(methods, function(method, args, abbr) { + if (is.character(method)) method <- paste0("detect_outlr_", method) # Call the method - results = do.call(method, args = c(list("x" = x, "y" = y), args)) + results <- do.call(method, args = c(list("x" = x, "y" = y), args)) - # Validate the output + # Validate the output if (!is.data.frame(results) || - !all(c("lower", "upper", "replacement") %in% colnames(results))) { + !all(c("lower", "upper", "replacement") %in% colnames(results))) { Abort("Outlier detection method must return a data frame with columns `lower`, `upper`, 
and `replacement`.") } # Update column names with model abbreviation - colnames(results) = paste(abbr, colnames(results), sep = "_") + colnames(results) <- paste(abbr, colnames(results), sep = "_") return(results) }) # Combine information about detected outliers if (combiner != "none") { - if (combiner == "mean") combine_fun = mean - else if (combiner == "median") combine_fun = median + if (combiner == "mean") { + combine_fun <- mean + } else if (combiner == "median") combine_fun <- median for (target in c("lower", "upper", "replacement")) { - results[[paste0("combined_", target)]] = apply( + results[[paste0("combined_", target)]] <- apply( results %>% - dplyr::select(dplyr::ends_with(target)), 1, combine_fun + dplyr::select(dplyr::ends_with(target)), 1, combine_fun ) } } @@ -154,48 +171,54 @@ detect_outlr = function(x = seq_along(y), y, #' @examples #' # Detect outliers based on a rolling median #' incidence_num_outlier_example %>% -#' dplyr::select(geo_value,time_value,cases) %>% +#' dplyr::select(geo_value, time_value, cases) %>% #' as_epi_df() %>% #' group_by(geo_value) %>% -#' mutate(outlier_info = detect_outlr_rm( -#' x = time_value, y = cases)) %>% +#' mutate(outlier_info = detect_outlr_rm( +#' x = time_value, y = cases +#' )) %>% #' unnest(outlier_info) -detect_outlr_rm = function(x = seq_along(y), y, n = 21, - log_transform = FALSE, - detect_negatives = FALSE, - detection_multiplier = 2, - min_radius = 0, - replacement_multiplier = 0) { +detect_outlr_rm <- function(x = seq_along(y), y, n = 21, + log_transform = FALSE, + detect_negatives = FALSE, + detection_multiplier = 2, + min_radius = 0, + replacement_multiplier = 0) { # Transform if requested if (log_transform) { # Replace all negative values with 0 - y = pmax(0, y) - offset = as.integer(any(y == 0)) - y = log(y + offset) + y <- pmax(0, y) + offset <- as.integer(any(y == 0)) + y <- log(y + offset) } # Detect negatives if requested - if (detect_negatives && !log_transform) min_lower = 0 - else min_lower = -Inf + if (detect_negatives && !log_transform) { + min_lower <- 0 + } else { + min_lower <- -Inf + } # Make an epi_df for easy sliding - z = as_epi_df(tibble::tibble(geo_value = 0, time_value = x, y = y)) + z <- as_epi_df(tibble::tibble(geo_value = 0, time_value = x, y = y)) # Calculate lower and upper thresholds and replacement value - z = z %>% - epi_slide(fitted = median(y), before = floor((n-1)/2), after = ceiling((n-1)/2)) %>% + z <- z %>% + epi_slide(fitted = median(y), before = floor((n - 1) / 2), after = ceiling((n - 1) / 2)) %>% dplyr::mutate(resid = y - fitted) %>% - roll_iqr(n = n, - detection_multiplier = detection_multiplier, - min_radius = min_radius, - replacement_multiplier = replacement_multiplier, - min_lower = min_lower) + roll_iqr( + n = n, + detection_multiplier = detection_multiplier, + min_radius = min_radius, + replacement_multiplier = replacement_multiplier, + min_lower = min_lower + ) # Undo log transformation if necessary if (log_transform) { - z$lower = exp(z$lower) - offset - z$upper = exp(z$upper) - offset - z$replacement = exp(z$replacement) - offset + z$lower <- exp(z$lower) - offset + z$upper <- exp(z$upper) - offset + z$replacement <- exp(z$replacement) - offset } return(z) @@ -251,100 +274,116 @@ detect_outlr_rm = function(x = seq_along(y), y, n = 21, #' @examples #' # Detects outliers based on a seasonal-trend decomposition using LOESS #' incidence_num_outlier_example %>% -#' dplyr::select(geo_value,time_value,cases) %>% +#' dplyr::select(geo_value, time_value, cases) %>% #' as_epi_df() %>% 
#' group_by(geo_value) %>% -#' mutate(outlier_info = detect_outlr_stl( +#' mutate(outlier_info = detect_outlr_stl( #' x = time_value, y = cases, -#' seasonal_period = 7 )) %>% # weekly seasonality for daily data +#' seasonal_period = 7 +#' )) %>% # weekly seasonality for daily data #' unnest(outlier_info) -detect_outlr_stl = function(x = seq_along(y), y, - n_trend = 21, - n_seasonal = 21, - n_threshold = 21, - seasonal_period = NULL, - log_transform = FALSE, - detect_negatives = FALSE, - detection_multiplier = 2, - min_radius = 0, - replacement_multiplier = 0) { +detect_outlr_stl <- function(x = seq_along(y), y, + n_trend = 21, + n_seasonal = 21, + n_threshold = 21, + seasonal_period = NULL, + log_transform = FALSE, + detect_negatives = FALSE, + detection_multiplier = 2, + min_radius = 0, + replacement_multiplier = 0) { # Transform if requested if (log_transform) { # Replace all negative values with 0 - y = pmax(0, y) - offset = as.integer(any(y == 0)) - y = log(y + offset) + y <- pmax(0, y) + offset <- as.integer(any(y == 0)) + y <- log(y + offset) } # Make a tsibble for fabletools, setup and run STL - z_tsibble = tsibble::tsibble(x = x, y = y, index = x) + z_tsibble <- tsibble::tsibble(x = x, y = y, index = x) - stl_formula = y ~ trend(window = n_trend) + + stl_formula <- y ~ trend(window = n_trend) + season(period = seasonal_period, window = n_seasonal) - stl_components = z_tsibble %>% + stl_components <- z_tsibble %>% fabletools::model(feasts::STL(stl_formula, robust = TRUE)) %>% generics::components() %>% tibble::as_tibble() %>% dplyr::select(trend:remainder) %>% - dplyr::rename_with(~ "seasonal", tidyselect::starts_with("season")) %>% + dplyr::rename_with(~"seasonal", tidyselect::starts_with("season")) %>% dplyr::rename(resid = remainder) # Allocate the seasonal term from STL to either fitted or resid if (!is.null(seasonal_period)) { - stl_components = stl_components %>% + stl_components <- stl_components %>% dplyr::mutate( - fitted = trend + seasonal) + fitted = trend + seasonal + ) } else { - stl_components = stl_components %>% + stl_components <- stl_components %>% dplyr::mutate( fitted = trend, - resid = seasonal + resid) + resid = seasonal + resid + ) } # Detect negatives if requested - if (detect_negatives && !log_transform) min_lower = 0 - else min_lower = -Inf + if (detect_negatives && !log_transform) { + min_lower <- 0 + } else { + min_lower <- -Inf + } # Make an epi_df for easy sliding - z = as_epi_df(tibble::tibble(geo_value = 0, time_value = x, y = y)) + z <- as_epi_df(tibble::tibble(geo_value = 0, time_value = x, y = y)) # Calculate lower and upper thresholds and replacement value - z = z %>% + z <- z %>% dplyr::mutate( fitted = stl_components$fitted, - resid = stl_components$resid) %>% - roll_iqr(n = n_threshold, - detection_multiplier = detection_multiplier, - min_radius = min_radius, - replacement_multiplier = replacement_multiplier, - min_lower = min_lower) + resid = stl_components$resid + ) %>% + roll_iqr( + n = n_threshold, + detection_multiplier = detection_multiplier, + min_radius = min_radius, + replacement_multiplier = replacement_multiplier, + min_lower = min_lower + ) # Undo log transformation if necessary if (log_transform) { - z$lower = exp(z$lower) - offset - z$upper = exp(z$upper) - offset - z$replacement = exp(z$replacement) - offset + z$lower <- exp(z$lower) - offset + z$upper <- exp(z$upper) - offset + z$replacement <- exp(z$replacement) - offset } return(z) } # Common function for rolling IQR, using fitted and resid variables -roll_iqr = function(z, 
n, detection_multiplier, min_radius, - replacement_multiplier, min_lower) { - if (typeof(z$y) == "integer") as_type = as.integer - else as_type = as.numeric +roll_iqr <- function(z, n, detection_multiplier, min_radius, + replacement_multiplier, min_lower) { + if (typeof(z$y) == "integer") { + as_type <- as.integer + } else { + as_type <- as.numeric + } - epi_slide(z, roll_iqr = stats::IQR(resid), before = floor((n-1)/2), after = ceiling((n-1)/2)) %>% + epi_slide(z, roll_iqr = stats::IQR(resid), before = floor((n - 1) / 2), after = ceiling((n - 1) / 2)) %>% dplyr::mutate( - lower = pmax(min_lower, - fitted - pmax(min_radius, detection_multiplier * roll_iqr)), + lower = pmax( + min_lower, + fitted - pmax(min_radius, detection_multiplier * roll_iqr) + ), upper = fitted + pmax(min_radius, detection_multiplier * roll_iqr), replacement = dplyr::case_when( - (y < lower) ~ as_type(fitted - replacement_multiplier * roll_iqr), - (y > upper) ~ as_type(fitted + replacement_multiplier * roll_iqr), - TRUE ~ y)) %>% + (y < lower) ~ as_type(fitted - replacement_multiplier * roll_iqr), + (y > upper) ~ as_type(fitted + replacement_multiplier * roll_iqr), + TRUE ~ y + ) + ) %>% dplyr::select(lower, upper, replacement) %>% tibble::as_tibble() } diff --git a/R/slide.R b/R/slide.R index 0feb689a..2a10efce 100644 --- a/R/slide.R +++ b/R/slide.R @@ -76,7 +76,7 @@ #' `NA` marker. #' @return An `epi_df` object given by appending a new column to `x`, named #' according to the `new_col_name` argument. -#' +#' #' @details To "slide" means to apply a function or formula over a rolling #' window of time steps for each data group, where the window is entered at a #' reference time and left and right endpoints are given by the `before` and @@ -117,21 +117,21 @@ #' new_col_name = "cases_7dav") #' ``` #' Thus, to be clear, when the computation is specified via an expression for -#' tidy evaluation (first example, above), then the name for the new column is -#' inferred from the given expression and overrides any name passed explicitly +#' tidy evaluation (first example, above), then the name for the new column is +#' inferred from the given expression and overrides any name passed explicitly #' through the `new_col_name` argument. -#' +#' #' @importFrom lubridate days weeks #' @importFrom dplyr bind_rows group_vars filter select #' @importFrom rlang .data .env !! enquo enquos sym env missing_arg #' @export -#' @examples +#' @examples #' # slide a 7-day trailing average formula on cases #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide(cases_7dav = mean(cases), before = 6) %>% +#' epi_slide(cases_7dav = mean(cases), before = 6) %>% #' # rmv a nonessential var. to ensure new col is printed -#' dplyr::select(-death_rate_7d_av) +#' dplyr::select(-death_rate_7d_av) #' #' # slide a 7-day leading average #' jhu_csse_daily_subset %>% @@ -143,35 +143,38 @@ #' # slide a 7-day centre-aligned average #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide(cases_7dav = mean(cases), before = 3, after = 3) %>% +#' epi_slide(cases_7dav = mean(cases), before = 3, after = 3) %>% #' # rmv a nonessential var. to ensure new col is printed -#' dplyr::select(-death_rate_7d_av) +#' dplyr::select(-death_rate_7d_av) #' #' # slide a 14-day centre-aligned average #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide(cases_7dav = mean(cases), before = 6, after = 7) %>% +#' epi_slide(cases_7dav = mean(cases), before = 6, after = 7) %>% #' # rmv a nonessential var. 
to ensure new col is printed -#' dplyr::select(-death_rate_7d_av) +#' dplyr::select(-death_rate_7d_av) #' #' # nested new columns #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide(a = data.frame(cases_2dav = mean(cases), -#' cases_2dma = mad(cases)), -#' before = 1, as_list_col = TRUE) -epi_slide = function(x, f, ..., before, after, ref_time_values, - time_step, - new_col_name = "slide_value", as_list_col = FALSE, - names_sep = "_", all_rows = FALSE) { - +#' epi_slide( +#' a = data.frame( +#' cases_2dav = mean(cases), +#' cases_2dma = mad(cases) +#' ), +#' before = 1, as_list_col = TRUE +#' ) +epi_slide <- function(x, f, ..., before, after, ref_time_values, + time_step, + new_col_name = "slide_value", as_list_col = FALSE, + names_sep = "_", all_rows = FALSE) { # Check we have an `epi_df` object if (!inherits(x, "epi_df")) Abort("`x` must be of class `epi_df`.") - + if (missing(ref_time_values)) { - ref_time_values = unique(x$time_value) + ref_time_values <- unique(x$time_value) } - + # Some of these `ref_time_values` checks and processing steps also apply to # the `ref_time_values` default; for simplicity, just apply all the steps # regardless of whether we are working with a default or user-provided @@ -185,9 +188,9 @@ epi_slide = function(x, f, ..., before, after, ref_time_values, } else if (!all(ref_time_values %in% unique(x$time_value))) { Abort("All `ref_time_values` must appear in `x$time_value`.") } else { - ref_time_values = sort(ref_time_values) + ref_time_values <- sort(ref_time_values) } - + # Validate and pre-process `before`, `after`: if (!missing(before)) { before <- vctrs::vec_cast(before, integer()) @@ -227,20 +230,20 @@ epi_slide = function(x, f, ..., before, after, ref_time_values, after <- time_step(after) } - min_ref_time_values = ref_time_values - before + min_ref_time_values <- ref_time_values - before min_ref_time_values_not_in_x <- min_ref_time_values[!(min_ref_time_values %in% unique(x$time_value))] # Do set up to let us recover `ref_time_value`s later. # A helper column marking real observations. - x$.real = TRUE + x$.real <- TRUE # Create df containing phony data. Df has the same columns and attributes as # `x`, but filled with `NA`s aside from grouping columns. Number of rows is # equal to the number of `min_ref_time_values_not_in_x` we have * the # number of unique levels seen in the grouping columns. - before_time_values_df = data.frame(time_value=min_ref_time_values_not_in_x) + before_time_values_df <- data.frame(time_value = min_ref_time_values_not_in_x) if (length(group_vars(x)) != 0) { - before_time_values_df = dplyr::cross_join( + before_time_values_df <- dplyr::cross_join( # Get unique combinations of grouping columns seen in real data. unique(x[, group_vars(x)]), before_time_values_df @@ -248,69 +251,71 @@ epi_slide = function(x, f, ..., before, after, ref_time_values, } # Automatically fill in all other columns from `x` with `NA`s, and carry # attributes over to new df. 
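  # For intuition (hypothetical sizes): with grouping levels "ca" and
  # "fl" observed in `x` and two entries in
  # `min_ref_time_values_not_in_x`, the cross join above gives a 4-row
  # frame (2 levels x 2 missing window starts), and `bind_rows(x[0, ], .)`
  # below fills every other column with `NA` while carrying over `x`'s
  # column types and attributes.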
- before_time_values_df <- bind_rows(x[0,], before_time_values_df) + before_time_values_df <- bind_rows(x[0, ], before_time_values_df) before_time_values_df$.real <- FALSE x <- bind_rows(before_time_values_df, x) # Arrange by increasing time_value - x = arrange(x, time_value) + x <- arrange(x, time_value) # Now set up starts and stops for sliding/hopping - time_range = range(unique(x$time_value)) - starts = in_range(ref_time_values - before, time_range) - stops = in_range(ref_time_values + after, time_range) - - if( length(starts) == 0 || length(stops) == 0 ) { + time_range <- range(unique(x$time_value)) + starts <- in_range(ref_time_values - before, time_range) + stops <- in_range(ref_time_values + after, time_range) + + if (length(starts) == 0 || length(stops) == 0) { Abort("The starting and/or stopping times for sliding are out of bounds with respect to the range of times in your data. Check your settings for ref_time_values and align (and before, if specified).") } # Symbolize new column name - new_col = sym(new_col_name) + new_col <- sym(new_col_name) # Computation for one group, all time values - slide_one_grp = function(.data_group, - f, ..., - starts, - stops, - time_values, - all_rows, - new_col) { + slide_one_grp <- function(.data_group, + f, ..., + starts, + stops, + time_values, + all_rows, + new_col) { # Figure out which reference time values appear in the data group in the # first place (we need to do this because it could differ based on the # group, hence the setup/checks for the reference time values based on all # the data could still be off) - o = time_values %in% .data_group$time_value - starts = starts[o] - stops = stops[o] - time_values = time_values[o] - - # Compute the slide values - slide_values_list = slider::hop_index(.x = .data_group, - .i = .data_group$time_value, - .f = f, ..., - .starts = starts, - .stops = stops) + o <- time_values %in% .data_group$time_value + starts <- starts[o] + stops <- stops[o] + time_values <- time_values[o] + + # Compute the slide values + slide_values_list <- slider::hop_index( + .x = .data_group, + .i = .data_group$time_value, + .f = f, ..., + .starts = starts, + .stops = stops + ) # Now figure out which rows in the data group are in the reference time # values; this will be useful for all sorts of checks that follow - o = .data_group$time_value %in% time_values - num_ref_rows = sum(o) - + o <- .data_group$time_value %in% time_values + num_ref_rows <- sum(o) + # Count the number of appearances of each reference time value (these # appearances should all be real for now, but if we allow ref time values # outside of .data_group's time values): - counts = dplyr::filter(.data_group, .data$time_value %in% time_values) %>% + counts <- dplyr::filter(.data_group, .data$time_value %in% time_values) %>% dplyr::count(.data$time_value) %>% dplyr::pull(n) if (!all(purrr::map_lgl(slide_values_list, is.atomic)) && - !all(purrr::map_lgl(slide_values_list, is.data.frame))) { + !all(purrr::map_lgl(slide_values_list, is.data.frame))) { Abort("The slide computations must return always atomic vectors or data frames (and not a mix of these two structures).") } # Unlist if appropriate: - slide_values = + slide_values <- if (as_list_col) { slide_values_list } else { @@ -318,16 +323,16 @@ epi_slide = function(x, f, ..., before, after, ref_time_values, } if (all(purrr::map_int(slide_values_list, vctrs::vec_size) == 1L) && - length(slide_values_list) != 0L) { + length(slide_values_list) != 0L) { # Recycle to make size stable (one slide value per ref time 
value). # (Length-0 case also could be handled here, but causes difficulties; # leave it to the next branch, where it also belongs.) - slide_values = vctrs::vec_rep_each(slide_values, times = counts) + slide_values <- vctrs::vec_rep_each(slide_values, times = counts) } else { # Split and flatten if appropriate, perform a (loose) check on number of # rows. if (as_list_col) { - slide_values = purrr::list_flatten(purrr::map( + slide_values <- purrr::list_flatten(purrr::map( slide_values, ~ vctrs::vec_split(.x, seq_len(vctrs::vec_size(.x)))[["val"]] )) } @@ -338,60 +343,61 @@ epi_slide = function(x, f, ..., before, after, ref_time_values, # If all rows, then pad slide values with NAs, else filter down data group if (all_rows) { - orig_values = slide_values - slide_values = vctrs::vec_rep(vctrs::vec_cast(NA, orig_values), nrow(.data_group)) + orig_values <- slide_values + slide_values <- vctrs::vec_rep(vctrs::vec_cast(NA, orig_values), nrow(.data_group)) # ^ using vctrs::vec_init would be shorter but docs don't guarantee it # fills with NA equivalent. - vctrs::vec_slice(slide_values, o) = orig_values + vctrs::vec_slice(slide_values, o) <- orig_values } else { # This implicitly removes phony (`.real` == FALSE) observations. - .data_group = filter(.data_group, o) + .data_group <- filter(.data_group, o) } return(mutate(.data_group, !!new_col := slide_values)) } # If `f` is missing, interpret ... as an expression for tidy evaluation if (missing(f)) { - quos = enquos(...) + quos <- enquos(...) if (length(quos) == 0) { Abort("If `f` is missing then a computation must be specified via `...`.") } if (length(quos) > 1) { Abort("If `f` is missing then only a single computation can be specified via `...`.") } - - f = quos[[1]] - new_col = sym(names(rlang::quos_auto_name(quos))) - ... = missing_arg() # magic value that passes zero args as dots in calls below + + f <- quos[[1]] + new_col <- sym(names(rlang::quos_auto_name(quos))) + ... <- missing_arg() # magic value that passes zero args as dots in calls below } - f = as_slide_computation(f, ...) + f <- as_slide_computation(f, ...) # Create a wrapper that calculates and passes `.ref_time_value` to the # computation. - f_wrapper = function(.x, .group_key, ...) { - .ref_time_value = min(.x$time_value) + before - .x <- .x[.x$.real,] + f_wrapper <- function(.x, .group_key, ...) { + .ref_time_value <- min(.x$time_value) + before + .x <- .x[.x$.real, ] .x$.real <- NULL f(.x, .group_key, .ref_time_value, ...) } - x = group_modify(x, slide_one_grp, - f = f_wrapper, ..., - starts = starts, - stops = stops, - time_values = ref_time_values, - all_rows = all_rows, - new_col = new_col, - .keep = FALSE) - + x <- group_modify(x, slide_one_grp, + f = f_wrapper, ..., + starts = starts, + stops = stops, + time_values = ref_time_values, + all_rows = all_rows, + new_col = new_col, + .keep = FALSE + ) + # Unnest if we need to, and return if (!as_list_col) { - x = unnest(x, !!new_col, names_sep = names_sep) + x <- unnest(x, !!new_col, names_sep = names_sep) } # Remove any remaining phony observations. When `all_rows` is TRUE, phony # observations aren't necessarily removed in `slide_one_grp`. if (all_rows) { - x <- x[x$.real,] + x <- x[x$.real, ] } # Drop helper column `.real`. 
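As a quick cross-check of the trailing-window alignment handled above, here is a
minimal sketch (the names `toy` and `mean_3d` are illustrative, not part of the
package): a 3-day trailing mean with `before = 2` covers times t-2 through t, so
the value at the third day of a 1, 2, 3, ... series should be mean(1:3) = 2.

library(epiprocess)
library(dplyr)

# Toy daily series for one geo: values 1..10 on consecutive days.
toy <- tibble::tibble(
  geo_value = "ca",
  time_value = as.Date("2020-03-01") + 0:9,
  value = 1:10
) %>%
  as_epi_df(as_of = as.Date("2020-03-11"))

# Trailing 3-day mean; the window at each time t is [t - 2, t].
toy %>%
  group_by(geo_value) %>%
  epi_slide(mean_3d = mean(value), before = 2)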
diff --git a/R/utils.R b/R/utils.R
index 471fb053..9cc707a6 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -1,6 +1,6 @@
-break_str = function(str, nchar = 79, init = "") {
-  str = paste(strwrap(str, nchar, initial = init), collapse = "\n")
-  str[1] = substring(str, nchar(init)+1)
+break_str <- function(str, nchar = 79, init = "") {
+  str <- paste(strwrap(str, nchar, initial = init), collapse = "\n")
+  str[1] <- substring(str, nchar(init) + 1)
   return(str)
 }
 
@@ -29,9 +29,9 @@ break_str = function(str, nchar = 79, init = "") {
 #' @return `chr`; to print, use [`base::writeLines`].
 #'
 #' @noRd
-wrap_symbolics = function(symbolics,
-                          initial = "", common_prefix = "", none_str = "",
-                          width = getOption("width", 80L)) {
+wrap_symbolics <- function(symbolics,
+                           initial = "", common_prefix = "", none_str = "",
+                           width = getOption("width", 80L)) {
   if (!all(purrr::map_lgl(symbolics, rlang::is_symbolic))) {
     Abort("`symbolics` must be a list of symbolic objects")
   }
@@ -44,14 +44,16 @@ wrap_symbolics = function(symbolics,
   if (!rlang::is_string(none_str)) {
     Abort("`none_str` must be a string")
   }
-  prefix = strrep(" ", nchar(initial, type="width"))
-  full_initial = paste0(common_prefix, initial)
-  full_prefix = paste0(common_prefix, prefix)
-  full_initial_width = nchar(full_initial, type="width")
-  minimum_reasonable_line_width_for_syms = 20L
-  line_width_for_syms = max(width - full_initial_width,
-                            minimum_reasonable_line_width_for_syms)
-  unprefixed_lines =
+  prefix <- strrep(" ", nchar(initial, type = "width"))
+  full_initial <- paste0(common_prefix, initial)
+  full_prefix <- paste0(common_prefix, prefix)
+  full_initial_width <- nchar(full_initial, type = "width")
+  minimum_reasonable_line_width_for_syms <- 20L
+  line_width_for_syms <- max(
+    width - full_initial_width,
+    minimum_reasonable_line_width_for_syms
+  )
+  unprefixed_lines <-
    if (length(symbolics) == 0L) {
      none_str
    } else {
@@ -60,12 +62,14 @@ wrap_symbolics = function(symbolics,
      # `paste0` already takes care of necessary backquotes. `cat` with
      # `fill=TRUE` takes care of spacing + line wrapping exclusively
      # between elements. We need to add commas appropriately.
-      cat(paste0(symbolics, c(rep(",", times=length(symbolics)-1L), "")), fill=TRUE)
+      cat(paste0(symbolics, c(rep(",", times = length(symbolics) - 1L), "")), fill = TRUE)
    })
    )
  }
-  lines = paste0(c(full_initial, rep(full_prefix, times=length(unprefixed_lines)-1L)),
-                 unprefixed_lines)
+  lines <- paste0(
+    c(full_initial, rep(full_prefix, times = length(unprefixed_lines) - 1L)),
+    unprefixed_lines
+  )
  lines
 }
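As an illustration of what this internal wrapper produces (a hypothetical call — `wrap_symbolics` is `@noRd`, so this is not exported API, and the exact wrapping depends on the terminal width):

    writeLines(wrap_symbolics(rlang::syms(c("geo_value", "time_value", "age_group")),
      initial = "Required columns: "
    ))
    # Roughly:
    # Required columns: `geo_value`, `time_value`, `age_group`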
@@ -76,15 +80,15 @@ wrap_symbolics = function(symbolics,
 #' @return `chr`; to print, use [`base::writeLines`].
 #'
 #' @noRd
-wrap_varnames = function(nms,
-                         initial = "", common_prefix = "", none_str = "",
-                         width = getOption("width", 80L)) {
+wrap_varnames <- function(nms,
+                          initial = "", common_prefix = "", none_str = "",
+                          width = getOption("width", 80L)) {
   # (Repeating parameter names and default args here for better autocomplete.
   # Using `...` instead would require less upkeep, but have worse autocomplete.)
   if (!rlang::is_character(nms)) {
     Abort("`nms` must be a character vector")
   }
-  wrap_symbolics(rlang::syms(nms), initial=initial, common_prefix=common_prefix, none_str=none_str, width=width)
+  wrap_symbolics(rlang::syms(nms), initial = initial, common_prefix = common_prefix, none_str = none_str, width = width)
 }
 
 #' Paste `chr` entries (lines) together with `"\n"` separators, trailing `"\n"`
@@ -93,12 +97,12 @@ wrap_varnames = function(nms,
 #' @return string
 #'
 #' @noRd
-paste_lines = function(lines) {
-  paste(paste0(lines,"\n"), collapse="")
+paste_lines <- function(lines) {
+  paste(paste0(lines, "\n"), collapse = "")
 }
 
-Abort = function(msg, ...) rlang::abort(break_str(msg, init = "Error: "), ...)
-Warn = function(msg, ...) rlang::warn(break_str(msg, init = "Warning: "), ...)
+Abort <- function(msg, ...) rlang::abort(break_str(msg, init = "Error: "), ...)
+Warn <- function(msg, ...) rlang::warn(break_str(msg, init = "Warning: "), ...)
 
 #' Assert that a sliding computation function takes enough args
 #'
@@ -115,17 +119,17 @@ Warn = function(msg, ...) rlang::warn(break_str(msg, init = "Warning: "), ...)
 assert_sufficient_f_args <- function(f, ...) {
   mandatory_f_args_labels <- c("window data", "group key", "reference time value")
   n_mandatory_f_args <- length(mandatory_f_args_labels)
-  args = formals(args(f))
-  args_names = names(args)
+  args <- formals(args(f))
+  args_names <- names(args)
   # Remove named arguments forwarded from `epi[x]_slide`'s `...`:
-  forwarded_dots_names = names(rlang::call_match(dots_expand = FALSE)[["..."]])
-  args_matched_in_dots =
+  forwarded_dots_names <- names(rlang::call_match(dots_expand = FALSE)[["..."]])
+  args_matched_in_dots <-
    # positional calling args will skip over args matched by named calling args
    args_names %in% forwarded_dots_names &
-    # extreme edge case: `epi[x]_slide(, dot = 1, `...` = 2)`
-    args_names != "..."
-  remaining_args = args[!args_matched_in_dots]
-  remaining_args_names = names(remaining_args)
+      # extreme edge case: `epi[x]_slide(, dot = 1, `...` = 2)`
+      args_names != "..."
+  remaining_args <- args[!args_matched_in_dots]
+  remaining_args_names <- names(remaining_args)
  # note that this doesn't include unnamed args forwarded through `...`.
  dots_i <- which(remaining_args_names == "...") # integer(0) if no match
  n_f_args_before_dots <- dots_i - 1L
@@ -134,7 +138,7 @@ assert_sufficient_f_args <- function(f, ...) {
   mandatory_args_mapped_names <- remaining_args_names[seq_len(n_f_args_before_dots)]
 
   if (n_f_args_before_dots < n_mandatory_f_args) {
-    mandatory_f_args_in_f_dots =
+    mandatory_f_args_in_f_dots <-
      tail(mandatory_f_args_labels, n_mandatory_f_args - n_f_args_before_dots)
    cli::cli_warn(
      "`f` might not have enough positional arguments before its `...`; in the current `epi[x]_slide` call, the {mandatory_f_args_in_f_dots} will be included in `f`'s `...`; if `f` doesn't expect those arguments, it may produce confusing error messages",
@@ -149,13 +153,15 @@ assert_sufficient_f_args <- function(f, ...) {
     if (rlang::dots_n(...) == 0L) { # common case; try for friendlier error message
       Abort(sprintf("`f` must take at least %s arguments", n_mandatory_f_args),
-            class = "epiprocess__assert_sufficient_f_args__f_needs_min_args",
-            epiprocess__f = f)
+        class = "epiprocess__assert_sufficient_f_args__f_needs_min_args",
+        epiprocess__f = f
+      )
    } else { # less common; highlight that they are (accidentally?) using dots forwarding
      Abort(sprintf("`f` must take at least %s arguments plus the %s arguments forwarded through `epi[x]_slide`'s `...`, or a named argument to `epi[x]_slide` was misspelled", n_mandatory_f_args, rlang::dots_n(...)),
-            class = "epiprocess__assert_sufficient_f_args__f_needs_min_args_plus_forwarded",
-            epiprocess__f = f)
+        class = "epiprocess__assert_sufficient_f_args__f_needs_min_args_plus_forwarded",
+        epiprocess__f = f
+      )
    }
  }
 }
@@ -163,21 +169,22 @@ assert_sufficient_f_args <- function(f, ...) {
   # calling args. If `f` has fewer than n_mandatory_f_args before `...`, then we
   # only need to check those args for defaults. Note that `n_f_args_before_dots` is
   # length 0 if `f` doesn't accept `...`.
-  n_remaining_args_for_default_check = min(c(n_f_args_before_dots, n_mandatory_f_args))
-  default_check_args = remaining_args[seq_len(n_remaining_args_for_default_check)]
-  default_check_args_names = names(default_check_args)
-  has_default_replaced_by_mandatory = map_lgl(default_check_args, ~!is_missing(.x))
+  n_remaining_args_for_default_check <- min(c(n_f_args_before_dots, n_mandatory_f_args))
+  default_check_args <- remaining_args[seq_len(n_remaining_args_for_default_check)]
+  default_check_args_names <- names(default_check_args)
+  has_default_replaced_by_mandatory <- map_lgl(default_check_args, ~ !is_missing(.x))
   if (any(has_default_replaced_by_mandatory)) {
-    default_check_mandatory_args_labels =
+    default_check_mandatory_args_labels <-
      mandatory_f_args_labels[seq_len(n_remaining_args_for_default_check)]
    # ^ excludes any mandatory args absorbed by f's `...`'s:
-    mandatory_args_replacing_defaults =
+    mandatory_args_replacing_defaults <-
      default_check_mandatory_args_labels[has_default_replaced_by_mandatory]
-    args_with_default_replaced_by_mandatory =
+    args_with_default_replaced_by_mandatory <-
      rlang::syms(default_check_args_names[has_default_replaced_by_mandatory])
    cli::cli_abort("`epi[x]_slide` would pass the {mandatory_args_replacing_defaults} to `f`'s {args_with_default_replaced_by_mandatory} argument{?s}, which {?has a/have} default value{?s}; we suspect that `f` doesn't expect {?this arg/these args} at all and may produce confusing error messages. Please add additional arguments to `f` or remove defaults as appropriate.",
-      class = "epiprocess__assert_sufficient_f_args__required_args_contain_defaults",
-      epiprocess__f = f)
+      class = "epiprocess__assert_sufficient_f_args__required_args_contain_defaults",
+      epiprocess__f = f
+    )
  }
 }
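To make the intent of these checks concrete, a sketch of how they behave (the three mandatory positions are the window data, group key, and reference time value named above; behavior inferred from the checks in this hunk):

    # Enough positional args before any `...`:
    assert_sufficient_f_args(function(x, g, t) NULL)          # passes silently
    # Mandatory args would be absorbed by `f`'s `...`:
    assert_sufficient_f_args(function(x, ...) NULL)           # warns
    # Too few args and no dots to absorb them:
    try(assert_sufficient_f_args(function(x) NULL))           # aborts
    # A default on a mandatory position suggests `f` wasn't written for slides:
    try(assert_sufficient_f_args(function(x, g, t = 5) NULL)) # aborts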
@@ -276,24 +283,24 @@ assert_sufficient_f_args <- function(f, ...) {
 #'
 #' @noRd
 as_slide_computation <- function(f, ...) {
-  arg = caller_arg(f)
-  call = caller_env()
+  arg <- caller_arg(f)
+  call <- caller_env()
 
   # A quosure is a type of formula, so be careful with the order and contents
   # of the conditional logic here.
   if (is_quosure(f)) {
-    fn = function(.x, .group_key, .ref_time_value) {
+    fn <- function(.x, .group_key, .ref_time_value) {
       # Convert to environment to standardize between tibble and R6
       # based inputs. In both cases, we should get a simple
       # environment with the empty environment as its parent.
-      data_env = rlang::as_environment(.x)
-      data_mask = rlang::new_data_mask(bottom = data_env, top = data_env)
+      data_env <- rlang::as_environment(.x)
+      data_mask <- rlang::new_data_mask(bottom = data_env, top = data_env)
       data_mask$.data <- rlang::as_data_pronoun(data_mask)
       # We'll also install `.x` directly, not as an `rlang_data_pronoun`, so
       # that we can, e.g., use more dplyr and epiprocess operations.
-      data_mask$.x = .x
-      data_mask$.group_key = .group_key
-      data_mask$.ref_time_value = .ref_time_value
+      data_mask$.x <- .x
+      data_mask$.group_key <- .group_key
+      data_mask$.ref_time_value <- .ref_time_value
       rlang::eval_tidy(f, data_mask)
     }
 
@@ -309,24 +316,27 @@ as_slide_computation <- function(f, ...) {
   if (is_formula(f)) {
     if (length(f) > 2) {
       Abort(sprintf("%s must be a one-sided formula", arg),
-            class = "epiprocess__as_slide_computation__formula_is_twosided",
-            epiprocess__f = f,
-            call = call)
+        class = "epiprocess__as_slide_computation__formula_is_twosided",
+        epiprocess__f = f,
+        call = call
+      )
     }
 
     if (rlang::dots_n(...) > 0L) {
       Abort("No arguments can be passed via `...` when `f` is a formula, or there are unrecognized/misspelled parameter names.",
-            class = "epiprocess__as_slide_computation__formula_with_dots",
-            epiprocess__f = f,
-            epiprocess__enquos_dots = enquos(...))
+        class = "epiprocess__as_slide_computation__formula_with_dots",
+        epiprocess__f = f,
+        epiprocess__enquos_dots = enquos(...)
+      )
     }
 
     env <- f_env(f)
     if (!is_environment(env)) {
       Abort("Formula must carry an environment.",
-            class = "epiprocess__as_slide_computation__formula_has_no_env",
-            epiprocess__f = f,
-            epiprocess__f_env = env,
-            arg = arg, call = call)
+        class = "epiprocess__as_slide_computation__formula_has_no_env",
+        epiprocess__f = f,
+        epiprocess__f_env = env,
+        arg = arg, call = call
+      )
     }
 
     args <- list(
@@ -340,103 +350,121 @@ as_slide_computation <- function(f, ...) {
     return(fn)
   }
 
-  Abort(sprintf("Can't convert an object of class %s to a slide computation", paste(collapse=" ", deparse(class(f)))),
-        class = "epiprocess__as_slide_computation__cant_convert_catchall",
-        epiprocess__f = f,
-        epiprocess__f_class = class(f),
-        arg = arg,
-        call = call)
+  Abort(sprintf("Can't convert an object of class %s to a slide computation", paste(collapse = " ", deparse(class(f)))),
+    class = "epiprocess__as_slide_computation__cant_convert_catchall",
+    epiprocess__f = f,
+    epiprocess__f_class = class(f),
+    arg = arg,
+    call = call
+  )
 }
 
 ##########
 
-in_range = function(x, rng) pmin(pmax(x, rng[1]), rng[2])
+in_range <- function(x, rng) pmin(pmax(x, rng[1]), rng[2])
 
 ##########
 
-Min = function(x) min(x, na.rm = TRUE)
-Max = function(x) max(x, na.rm = TRUE)
-Sum = function(x) sum(x, na.rm = TRUE)
-Mean = function(x) mean(x, na.rm = TRUE)
-Median = function(x) median(x, na.rm = TRUE)
+Min <- function(x) min(x, na.rm = TRUE)
+Max <- function(x) max(x, na.rm = TRUE)
+Sum <- function(x) sum(x, na.rm = TRUE)
+Mean <- function(x) mean(x, na.rm = TRUE)
+Median <- function(x) median(x, na.rm = TRUE)
 
 ##########
 
-Start = function(x) x[1]
-End = function(x) x[length(x)]
-MiddleL = function(x) x[floor((length(x)+1)/2)]
-MiddleR = function(x) x[ceiling((length(x)+1)/2)]
-ExtendL = function(x) c(Start(x), x)
-ExtendR = function(x) c(x, End(x))
+Start <- function(x) x[1]
+End <- function(x) x[length(x)]
+MiddleL <- function(x) x[floor((length(x) + 1) / 2)]
+MiddleR <- function(x) x[ceiling((length(x) + 1) / 2)]
+ExtendL <- function(x) c(Start(x), x)
+ExtendR <- function(x) c(x, End(x))
 
-guess_geo_type = function(geo_value) {
+guess_geo_type <- function(geo_value) {
   if (is.character(geo_value)) {
     # Convert geo values to lowercase
-    geo_value = tolower(geo_value)
-
-    # If all geo values are state abbreviations, then use "state"
-    state_values = c(tolower(datasets::state.abb),
-                     "as", "dc", "gu", "mp", "pr", "vi")
-    if (all(geo_value %in% state_values)) return("state")
-
-    # Else if all geo values are 2 letters, then use "nation"
-    else if (all(grepl("[a-z]{2}", geo_value))
-             & !any(grepl("[a-z]{3}", geo_value))) return("nation")
-
-    # Else if all geo values are 5 numbers, then use "county"
-    else if (all(grepl("[0-9]{5}", geo_value)) &
-             !any(grepl("[0-9]{6}", geo_value))) return("county")
-  }
-  else if (is.numeric(geo_value)) {
+    geo_value <- tolower(geo_value)
+
+    # If all geo values are state abbreviations, then use "state"
+    state_values <- c(
+      tolower(datasets::state.abb),
+      "as", "dc", "gu", "mp", "pr", "vi"
+    )
+    if (all(geo_value %in% state_values)) {
+      return("state")
+    } # Else if all geo values are 2 letters, then use "nation"
+    else if (all(grepl("[a-z]{2}", geo_value)) &
+      !any(grepl("[a-z]{3}", geo_value))) {
+      return("nation")
+    } # Else if all geo values are 5 numbers, then use "county"
+    else if (all(grepl("[0-9]{5}", geo_value)) &
+      !any(grepl("[0-9]{6}", geo_value))) {
+      return("county")
+    }
+  } else if (is.numeric(geo_value)) {
     # Convert geo values to integers
-    geo_value = as.integer(geo_value)
+    geo_value <- as.integer(geo_value)
 
     # If the max geo value is at most 10, then use "hhs"
-    if (max(geo_value) <= 10) return("hhs")
-
+    if (max(geo_value) <= 10) {
+      return("hhs")
+    }
+
     # Else if the max geo value is at most 457, then use "hrr"
-    if (max(geo_value) <= 457) return("hrr")
+    if (max(geo_value) <= 457) {
+      return("hrr")
+    }
   }
 
   # If we got here then we failed
   return("custom")
 }
 
-guess_time_type = function(time_value) {
+guess_time_type <- function(time_value) {
   # Convert character time values to Date or POSIXct
   if (is.character(time_value)) {
    if (nchar(time_value[1]) <= 10) {
-      new_time_value = tryCatch({ as.Date(time_value) },
-                                error = function(e) NULL)
-    }
-    else {
-      new_time_value = tryCatch({ as.POSIXct(time_value) },
-                                error = function(e) NULL)
+      new_time_value <- tryCatch(
+        {
+          as.Date(time_value)
+        },
+        error = function(e) NULL
+      )
+    } else {
+      new_time_value <- tryCatch(
+        {
+          as.POSIXct(time_value)
+        },
+        error = function(e) NULL
+      )
    }
-    if (!is.null(new_time_value)) time_value = new_time_value
+    if (!is.null(new_time_value)) time_value <- new_time_value
  }
-
-  # Now, if a POSIXct class, then use "day-time"
-  if (inherits(time_value, "POSIXct")) return("day-time")
-
-  # Else, if a Date class, then use "week" or "day" depending on gaps
+
+  # Now, if a POSIXct class, then use "day-time"
+  if (inherits(time_value, "POSIXct")) {
+    return("day-time")
+  } # Else, if a Date class, then use "week" or "day" depending on gaps
  else if (inherits(time_value, "Date")) {
    return(ifelse(all(diff(sort(time_value)) == 7), "week", "day"))
  }
 
  # Else, check whether it's one of the tsibble classes
-  else if (inherits(time_value, "yearweek")) return("yearweek")
-  else if (inherits(time_value, "yearmonth")) return("yearmonth")
-  else if (inherits(time_value, "yearquarter")) return("yearquarter")
+  else if (inherits(time_value, "yearweek")) {
+    return("yearweek")
+  } else if (inherits(time_value, "yearmonth")) {
+    return("yearmonth")
+  } else if (inherits(time_value, "yearquarter")) {
+    return("yearquarter")
+  }
 
  # Else, if it's an integer that's at least 1582, then use "year"
  if (is.numeric(time_value) &&
-      all(time_value == as.integer(time_value)) &&
-      all(time_value >= 1582)) {
+    all(time_value == as.integer(time_value)) &&
+    all(time_value >= 1582)) {
    return("year")
  }
-
+
  # If we got here then we failed
  return("custom")
 }
@@ -444,29 +472,29 @@ guess_time_type = function(time_value) {
 
 ##########
 
-quiet = function(x) {
-  sink(tempfile())
-  on.exit(sink())
-  invisible(force(x))
+quiet <- function(x) {
+  sink(tempfile())
+  on.exit(sink())
+  invisible(force(x))
 }
 
 ##########
 
 # Create an auto-named list
-enlist = function(...) {
-  x = list(...)
-  n = as.character(sys.call())[-1]
+enlist <- function(...) {
+  x <- list(...)
+  n <- as.character(sys.call())[-1]
   if (!is.null(n0 <- names(x))) {
-    n[n0 != ""] = n0[n0 != ""]
+    n[n0 != ""] <- n0[n0 != ""]
   }
-  names(x) = n
-  return(x)
+  names(x) <- n
+  return(x)
 }
 
-# Variable assignment from a list. NOT USED. Something is broken, this doesn't
+# Variable assignment from a list. NOT USED. Something is broken, this doesn't
 # seem to work completely as expected: the variables it defines don't propagate
-# down to child environments
-list2var = function(x) {
+# down to child environments
+list2var <- function(x) {
   list2env(x, envir = parent.frame())
 }
 
@@ -485,7 +513,7 @@ list2var = function(x) {
 #'
 #' @examples
 #'
-#' fn = function(x = deprecated()) {
+#' fn <- function(x = deprecated()) {
 #'   deprecated_quo_is_present(rlang::enquo(x))
 #' }
 #'
@@ -497,10 +525,10 @@ list2var = function(x) {
 #' # argument that has already been defused into a quosure, `!!quo`). (This is
 #' # already how NSE arguments that will be enquosed should be forwarded.)
 #'
-#' wrapper1 = function(x=deprecated()) fn({{x}})
-#' wrapper2 = function(x=lifecycle::deprecated()) fn({{x}})
-#' wrapper3 = function(x) fn({{x}})
-#' wrapper4 = function(x) fn(!!rlang::enquo(x))
+#' wrapper1 <- function(x = deprecated()) fn({{ x }})
+#' wrapper2 <- function(x = lifecycle::deprecated()) fn({{ x }})
+#' wrapper3 <- function(x) fn({{ x }})
+#' wrapper4 <- function(x) fn(!!rlang::enquo(x))
 #'
 #' wrapper1() # FALSE
 #' wrapper2() # FALSE
@@ -509,27 +537,28 @@ list2var = function(x) {
 #'
 #' # More advanced: wrapper that receives an already-enquosed arg:
 #'
-#' inner_wrapper = function(quo) fn(!!quo)
-#' outer_wrapper1 = function(x=deprecated()) inner_wrapper(rlang::enquo(x))
+#' inner_wrapper <- function(quo) fn(!!quo)
+#' outer_wrapper1 <- function(x = deprecated()) inner_wrapper(rlang::enquo(x))
 #'
 #' outer_wrapper1() # FALSE
 #'
 #' # Improper argument forwarding from a wrapper function will cause this
 #' # function to produce incorrect results.
-#' bad_wrapper1 = function(x) fn(x)
+#' bad_wrapper1 <- function(x) fn(x)
 #' bad_wrapper1() # TRUE, bad
 #'
 #' @noRd
-deprecated_quo_is_present = function(quo) {
+deprecated_quo_is_present <- function(quo) {
   if (!rlang::is_quosure(quo)) {
     Abort("`quo` must be a quosure; `enquo` the arg first",
-          internal=TRUE)
+      internal = TRUE
+    )
   } else if (rlang::quo_is_missing(quo)) {
     FALSE
   } else {
-    quo_expr = rlang::get_expr(quo)
+    quo_expr <- rlang::get_expr(quo)
     if (identical(quo_expr, rlang::expr(deprecated())) ||
-        identical(quo_expr, rlang::expr(lifecycle::deprecated()))) {
+      identical(quo_expr, rlang::expr(lifecycle::deprecated()))) {
      FALSE
    } else {
      TRUE
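`enlist`, defined in a hunk above, is easiest to understand from a small example (a sketch of the auto-naming behavior it implements):

    a <- 1
    b <- 2
    enlist(a, b, c = 3)
    # Equivalent to list(a = a, b = b, c = 3): unnamed arguments are named
    # after the expressions fed in, and explicit names win.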
@@ -577,7 +606,7 @@ deprecated_quo_is_present = function(quo) {
 #'   be an integer.
 #'
 #' @noRd
-gcd2num = function(a, b, rrtol=1e-6, pqlim=1e6, irtol=1e-6) {
+gcd2num <- function(a, b, rrtol = 1e-6, pqlim = 1e6, irtol = 1e-6) {
   if (!is.numeric(a) || length(a) != 1L) {
     Abort("`a` must satisfy `is.numeric`, have `length` 1.")
   }
@@ -593,21 +622,21 @@ gcd2num = function(a, b, rrtol=1e-6, pqlim=1e6, irtol=1e-6) {
   if (!is.numeric(irtol) || length(irtol) != 1L || irtol < 0) {
     Abort("`irtol` must satisfy `is.numeric`, have `length` 1, and be non-negative.")
   }
-  if (is.na(a) || is.na(b) || a == 0 || b == 0 || abs(a/b) >= pqlim || abs(b/a) >= pqlim) {
+  if (is.na(a) || is.na(b) || a == 0 || b == 0 || abs(a / b) >= pqlim || abs(b / a) >= pqlim) {
     Abort("`a` and/or `b` is either `NA` or exactly zero, or one is so much smaller than the other that it looks like it's supposed to be zero; see `pqlim` setting.")
   }
-  iatol = irtol * max(a,b)
-  a_curr = a
-  b_curr = b
+  iatol <- irtol * max(a, b)
+  a_curr <- a
+  b_curr <- b
   while (TRUE) {
     # `b_curr` is the candidate GCD / iterand; check first if it seems too small:
     if (abs(b_curr) <= iatol) {
-      Abort('No GCD found; remaining potential Gads are all too small relative to one/both of the original inputs; see `irtol` setting.')
+      Abort("No GCD found; remaining potential GCDs are all too small relative to one/both of the original inputs; see `irtol` setting.")
    }
-    remainder = a_curr - round(a_curr / b_curr) * b_curr
+    remainder <- a_curr - round(a_curr / b_curr) * b_curr
    if (abs(remainder / b_curr) <= rrtol) {
      # We consider `a_curr` divisible by `b_curr`; `b_curr` is the GCD or its negation
-      return (abs(b_curr))
+      return(abs(b_curr))
    }
    a_curr <- b_curr
    b_curr <- remainder
@@ -625,7 +654,7 @@ gcd2num = function(a, b, rrtol=1e-6, pqlim=1e6, irtol=1e-6) {
 #'   error.)
 #'
 #' @noRd
-gcd_num = function(dividends, ..., rrtol=1e-6, pqlim=1e6, irtol=1e-6) {
+gcd_num <- function(dividends, ..., rrtol = 1e-6, pqlim = 1e6, irtol = 1e-6) {
   if (!is.numeric(dividends) || length(dividends) == 0L) {
     Abort("`dividends` must satisfy `is.numeric`, and have `length` > 0")
   }
@@ -637,7 +666,7 @@ gcd_num = function(dividends, ..., rrtol=1e-6, pqlim=1e6, irtol=1e-6) {
   # workload. Also take `abs` early on as another form of deduplication and to
   # make the sort simpler. Use `na.last=FALSE` in the sort to preserve presence
   # of `NA`s in order to get a better error message in this case.
-  optimized_dividends = sort(unique(abs(dividends)), na.last=FALSE)
+  optimized_dividends <- sort(unique(abs(dividends)), na.last = FALSE)
   # Note that taking the prime factorizations of a set of integers, and
   # calculating the minimum power for each prime across all these
   # factorizations, yields the prime factorization of the GCD of the set of
@@ -656,8 +685,9 @@ gcd_num = function(dividends, ..., rrtol=1e-6, pqlim=1e6, irtol=1e-6) {
   #   gcd2real(gcd_int(X/gcd_real(XUY))*gcd_real(XUY),
   #            gcd_int(Y/gcd_real(XUY))*gcd_real(XUY)) = gcd2real(gcd_real(X),
   #   gcd_real(Y)). So "gcd_real" should also be `reduce`-compatible.
-  numeric_gcd = purrr::reduce(optimized_dividends, gcd2num,
-                              rrtol=rrtol, pqlim=pqlim, irtol=irtol)
+  numeric_gcd <- purrr::reduce(optimized_dividends, gcd2num,
+    rrtol = rrtol, pqlim = pqlim, irtol = irtol
+  )
   vctrs::vec_cast(numeric_gcd, dividends)
 }
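A few worked calls may help here; `gcd2num`/`gcd_num` implement a tolerance-based Euclidean algorithm over the reals (both are internal helpers, and results for non-integers hold only up to the `rrtol`/`irtol` tolerances):

    gcd2num(7, 14)          # 7
    gcd_num(c(14, 35, 77))  # 7
    gcd_num(c(0.7, 1.4))    # 0.7, up to floating-point error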
@@ -672,13 +702,13 @@ gcd_num = function(dividends, ..., rrtol=1e-6, pqlim=1e6, irtol=1e-6) {
 #'   in error messages. Defaults to quoting the expression the caller fed into
 #'   the `ref_time_values` argument.
 #' @return `is.numeric`, length 1; attempts to match `typeof(ref_time_values)`
-guess_period = function(ref_time_values, ref_time_values_arg = rlang::caller_arg(ref_time_values)) {
-  sorted_distinct_ref_time_values = sort(unique(ref_time_values))
+guess_period <- function(ref_time_values, ref_time_values_arg = rlang::caller_arg(ref_time_values)) {
+  sorted_distinct_ref_time_values <- sort(unique(ref_time_values))
   if (length(sorted_distinct_ref_time_values) < 2L) {
     Abort(sprintf("Not enough distinct values in `%s` to guess the period.", ref_time_values_arg))
   }
-  skips = diff(sorted_distinct_ref_time_values)
-  decayed_skips =
+  skips <- diff(sorted_distinct_ref_time_values)
+  decayed_skips <-
    if (typeof(skips) == "integer") {
      as.integer(skips)
    } else {
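`guess_period` then reduces the gaps between sorted distinct reference time values with this GCD; a sketch of the intended behavior:

    guess_period(as.Date("2020-01-01") + c(0, 7, 14, 28)) # 7: weekly spacing
    guess_period(c(10L, 20L, 25L))                        # 5L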
diff --git a/tests/testthat/test-archive-version-bounds.R b/tests/testthat/test-archive-version-bounds.R
index 598825f6..4000727a 100644
--- a/tests/testthat/test-archive-version-bounds.R
+++ b/tests/testthat/test-archive-version-bounds.R
@@ -1,8 +1,9 @@
 test_that("`validate_version_bound` allows/catches `NA` as requested", {
-  my_version_bound = NA
-  validate_version_bound(my_version_bound, na_ok=TRUE)
-  expect_error(validate_version_bound(my_version_bound, na_ok=FALSE),
-               class="epiprocess__my_version_bound_is_na")
+  my_version_bound <- NA
+  validate_version_bound(my_version_bound, na_ok = TRUE)
+  expect_error(validate_version_bound(my_version_bound, na_ok = FALSE),
+    class = "epiprocess__my_version_bound_is_na"
+  )
   # Note that if the error class name changes, this test may produce some
   # confusing output along the following lines:
   #
@@ -11,23 +12,26 @@ test_that("`validate_version_bound` allows/catches `NA` as requested", {
 })
 
 test_that("`validate_version_bound` catches bounds that are the wrong length", {
-  my_version_bound1a = NULL
-  expect_error(validate_version_bound(my_version_bound1a, na_ok=TRUE),
-               class="epiprocess__my_version_bound1a_is_not_length_1")
-  my_version_bound1b = integer(0L)
-  expect_error(validate_version_bound(my_version_bound1b, na_ok=TRUE),
-               class="epiprocess__my_version_bound1b_is_not_length_1")
-  my_version_bound2 = c(2, 10)
-  expect_error(validate_version_bound(my_version_bound2, na_ok=TRUE),
-               class="epiprocess__my_version_bound2_is_not_length_1")
+  my_version_bound1a <- NULL
+  expect_error(validate_version_bound(my_version_bound1a, na_ok = TRUE),
+    class = "epiprocess__my_version_bound1a_is_not_length_1"
+  )
+  my_version_bound1b <- integer(0L)
+  expect_error(validate_version_bound(my_version_bound1b, na_ok = TRUE),
+    class = "epiprocess__my_version_bound1b_is_not_length_1"
+  )
+  my_version_bound2 <- c(2, 10)
+  expect_error(validate_version_bound(my_version_bound2, na_ok = TRUE),
+    class = "epiprocess__my_version_bound2_is_not_length_1"
+  )
 })
 
 test_that("`validate_version_bound` validate and class checks together allow and catch as intended", {
-  my_int = 5L
-  my_dbl = 5
-  my_list = list(5L)
-  my_date = as.Date("2000-01-01")
-  my_datetime = vctrs::vec_cast(my_date, as.POSIXct(as.Date("1900-01-01")))
+  my_int <- 5L
+  my_dbl <- 5
+  my_list <- list(5L)
+  my_date <- as.Date("2000-01-01")
+  my_datetime <- vctrs::vec_cast(my_date, as.POSIXct(as.Date("1900-01-01")))
   # When first drafted, this validate function was a (validate+)cast function,
   # which used vctrs::vec_cast inside. However, the initial implementation
   # didn't actually allow casting to occur, and it was easier to change to the
@@ -41,30 +45,36 @@ test_that("`validate_version_bound` validate and class checks together allow and
   expect_identical(vctrs::vec_cast(my_datetime, my_date), my_date)
   expect_identical(vctrs::vec_cast(my_date, my_datetime), my_datetime)
   #
-  x_int = tibble::tibble(version = my_int)
-  x_dbl = tibble::tibble(version = my_dbl)
-  x_list = tibble::tibble(version = my_list)
-  x_date = tibble::tibble(version = my_date)
-  x_datetime = tibble::tibble(version = my_datetime)
+  x_int <- tibble::tibble(version = my_int)
+  x_dbl <- tibble::tibble(version = my_dbl)
+  x_list <- tibble::tibble(version = my_list)
+  x_date <- tibble::tibble(version = my_date)
+  x_datetime <- tibble::tibble(version = my_datetime)
   # Custom classes matter (test vectors and non-vctrs-specialized lists separately):
-  my_version_bound1 = `class<-`(24, "c1")
-  expect_error(validate_version_bound(my_version_bound1, x_int, na_ok=FALSE),
-               class="epiprocess__my_version_bound1_has_invalid_class_or_typeof")
-  my_version_bound2 = `class<-`(list(12), c("c2a","c2b","c2c"))
-  expect_error(validate_version_bound(my_version_bound2, x_list, na_ok=FALSE),
-               class="epiprocess__my_version_bound2_has_invalid_class_or_typeof")
+  my_version_bound1 <- `class<-`(24, "c1")
+  expect_error(validate_version_bound(my_version_bound1, x_int, na_ok = FALSE),
+    class = "epiprocess__my_version_bound1_has_invalid_class_or_typeof"
+  )
+  my_version_bound2 <- `class<-`(list(12), c("c2a", "c2b", "c2c"))
+  expect_error(validate_version_bound(my_version_bound2, x_list, na_ok = FALSE),
+    class = "epiprocess__my_version_bound2_has_invalid_class_or_typeof"
+  )
   # Want no error matching date to date or datetime to datetime, but no interop due to tz issues:
-  validate_version_bound(my_date, x_date, version_bound_arg="vb")
-  validate_version_bound(my_datetime, x_datetime, version_bound_arg="vb")
-  expect_error(validate_version_bound(my_datetime, x_date, na_ok=TRUE, version_bound_arg="vb"),
-               class="epiprocess__vb_has_invalid_class_or_typeof")
-  expect_error(validate_version_bound(my_date, x_datetime, na_ok=TRUE, version_bound_arg="vb"),
-               class="epiprocess__vb_has_invalid_class_or_typeof")
+  validate_version_bound(my_date, x_date, version_bound_arg = "vb")
+  validate_version_bound(my_datetime, x_datetime, version_bound_arg = "vb")
+  expect_error(validate_version_bound(my_datetime, x_date, na_ok = TRUE, version_bound_arg = "vb"),
+    class = "epiprocess__vb_has_invalid_class_or_typeof"
+  )
+  expect_error(validate_version_bound(my_date, x_datetime, na_ok = TRUE, version_bound_arg = "vb"),
+    class = "epiprocess__vb_has_invalid_class_or_typeof"
+  )
   # Bad:
   expect_error(validate_version_bound(3.5, x_int, TRUE, "vb"))
   expect_error(validate_version_bound(.Machine$integer.max, x_dbl, TRUE, "vb"))
-  expect_error(validate_version_bound(`class<-`(list(2), "clazz"),
-                                      tibble::tibble(version=`class<-`(5L, "clazz")), TRUE, "vb"))
+  expect_error(validate_version_bound(
+    `class<-`(list(2), "clazz"),
+    tibble::tibble(version = `class<-`(5L, "clazz")), TRUE, "vb"
+  ))
  # Maybe questionable:
  expect_error(validate_version_bound(3, x_int, TRUE, "vb"))
  expect_error(validate_version_bound(3L, x_dbl, TRUE, "vb"))
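The `vec_cast` expectations above pin down the one case that round-trips losslessly — a `Date` and the corresponding midnight-UTC `POSIXct` — which is presumably why the validator can insist on matching classes (avoiding time-zone interop) while the test still records this equivalence; a sketch:

    my_date <- as.Date("2000-01-01")
    my_datetime <- vctrs::vec_cast(my_date, as.POSIXct(as.Date("1900-01-01")))
    vctrs::vec_cast(my_datetime, my_date) # back to the same Date, nothing lost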
@@ -77,34 +87,48 @@ test_that("`validate_version_bound` validate and class checks together allow and
 })
 
 test_that("archive version bounds args work as intended", {
-  measurement_date = as.Date("2000-01-01")
-  update_tbl = tibble::tibble(
+  measurement_date <- as.Date("2000-01-01")
+  update_tbl <- tibble::tibble(
     geo_value = "g1",
     time_value = measurement_date,
     version = measurement_date + 1:5,
     value = 1:5
   )
-  expect_error(as_epi_archive(update_tbl,
-                              clobberable_versions_start = 1241,
-                              versions_end = measurement_date),
-               class="epiprocess__clobberable_versions_start_has_invalid_class_or_typeof")
-  expect_error(as_epi_archive(update_tbl[integer(0L),]),
-               class="epiprocess__max_version_cannot_be_used")
-  expect_error(as_epi_archive(update_tbl,
-                              clobberable_versions_start = NA,
-                              versions_end = measurement_date),
-               class="epiprocess__versions_end_earlier_than_updates")
-  expect_error(as_epi_archive(update_tbl,
-                              clobberable_versions_start=measurement_date+6L,
-                              versions_end = measurement_date+5L),
-               class="epiprocess__versions_end_earlier_than_clobberable_versions_start")
+  expect_error(
+    as_epi_archive(update_tbl,
+      clobberable_versions_start = 1241,
+      versions_end = measurement_date
+    ),
+    class = "epiprocess__clobberable_versions_start_has_invalid_class_or_typeof"
+  )
+  expect_error(as_epi_archive(update_tbl[integer(0L), ]),
+    class = "epiprocess__max_version_cannot_be_used"
+  )
+  expect_error(
+    as_epi_archive(update_tbl,
+      clobberable_versions_start = NA,
+      versions_end = measurement_date
+    ),
+    class = "epiprocess__versions_end_earlier_than_updates"
+  )
+  expect_error(
+    as_epi_archive(update_tbl,
+      clobberable_versions_start = measurement_date + 6L,
+      versions_end = measurement_date + 5L
+    ),
+    class = "epiprocess__versions_end_earlier_than_clobberable_versions_start"
+  )
   expect_error(as_epi_archive(update_tbl, versions_end = NA),
-               regexp="versions_end.*must not satisfy.*is.na")
-  ea_default = as_epi_archive(update_tbl)
-  ea_default$as_of(measurement_date+4L)
-  expect_warning(regexp=NA,
-                 ea_default$as_of(measurement_date+5L),
-                 class = "epiprocess__snapshot_as_of_clobberable_version")
-  expect_error(ea_default$as_of(measurement_date+6L),
-               regexp = "max_version.*at most.*versions_end")
+    regexp = "versions_end.*must not satisfy.*is.na"
+  )
+  ea_default <- as_epi_archive(update_tbl)
+  ea_default$as_of(measurement_date + 4L)
+  expect_warning(
+    regexp = NA,
+    ea_default$as_of(measurement_date + 5L),
+    class = "epiprocess__snapshot_as_of_clobberable_version"
+  )
+  expect_error(ea_default$as_of(measurement_date + 6L),
+    regexp = "max_version.*at most.*versions_end"
+  )
 })
diff --git a/tests/testthat/test-archive.R b/tests/testthat/test-archive.R
index 790ac65f..73f0e166 100644
--- a/tests/testthat/test-archive.R
+++ b/tests/testthat/test-archive.R
@@ -1,145 +1,167 @@
 library(dplyr)
 
-test_that("first input must be a data.frame",{
-  expect_error(as_epi_archive(c(1,2,3),compactify=FALSE),
-               regexp="`x` must be a data frame.")
+test_that("first input must be a data.frame", {
+  expect_error(as_epi_archive(c(1, 2, 3), compactify = FALSE),
+    regexp = "`x` must be a data frame."
+  )
 })
 
 dt <- archive_cases_dv_subset$DT
 
-test_that("data.frame must contain geo_value, time_value and version columns",{
-  expect_error(as_epi_archive(select(dt,-geo_value), compactify=FALSE),
-               regexp="`x` must contain a `geo_value` column.")
-  expect_error(as_epi_archive(select(dt,-time_value), compactify=FALSE),
-               regexp="`x` must contain a `time_value` column.")
-  expect_error(as_epi_archive(select(dt,-version), compactify=FALSE),
-               regexp="`x` must contain a `version` column.")
+test_that("data.frame must contain geo_value, time_value and version columns", {
+  expect_error(as_epi_archive(select(dt, -geo_value), compactify = FALSE),
+    regexp = "`x` must contain a `geo_value` column."
+  )
+  expect_error(as_epi_archive(select(dt, -time_value), compactify = FALSE),
+    regexp = "`x` must contain a `time_value` column."
+  )
+  expect_error(as_epi_archive(select(dt, -version), compactify = FALSE),
+    regexp = "`x` must contain a `version` column."
+  )
 })
 
-test_that("other_keys can only contain names of the data.frame columns",{
-  expect_error(as_epi_archive(dt,other_keys = "xyz", compactify=FALSE),
-               regexp="`other_keys` must be contained in the column names of `x`.")
-  expect_error(as_epi_archive(dt,other_keys = "percent_cli", compactify=FALSE),NA)
+test_that("other_keys can only contain names of the data.frame columns", {
+  expect_error(as_epi_archive(dt, other_keys = "xyz", compactify = FALSE),
+    regexp = "`other_keys` must be contained in the column names of `x`."
+  )
+  expect_error(as_epi_archive(dt, other_keys = "percent_cli", compactify = FALSE), NA)
 })
 
-test_that("other_keys cannot contain names geo_value, time_value or version",{
-  expect_error(as_epi_archive(dt,other_keys = "geo_value", compactify=FALSE),
-               regexp="`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\".")
-  expect_error(as_epi_archive(dt,other_keys = "time_value", compactify=FALSE),
-               regexp="`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\".")
-  expect_error(as_epi_archive(dt,other_keys = "version", compactify=FALSE),
-               regexp="`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\".")
+test_that("other_keys cannot contain names geo_value, time_value or version", {
+  expect_error(as_epi_archive(dt, other_keys = "geo_value", compactify = FALSE),
+    regexp = "`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\"."
+  )
+  expect_error(as_epi_archive(dt, other_keys = "time_value", compactify = FALSE),
+    regexp = "`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\"."
+  )
+  expect_error(as_epi_archive(dt, other_keys = "version", compactify = FALSE),
+    regexp = "`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\"."
+  )
 })
 
-test_that("Warning thrown when other_metadata contains overlapping names with geo_type or time_type fields",{
-  expect_warning(as_epi_archive(dt,additional_metadata = list(geo_type = 1), compactify=FALSE),
-                 regexp="`additional_metadata` names overlap with existing metadata fields\n\"geo_type\", \"time_type\".")
-  expect_warning(as_epi_archive(dt,additional_metadata = list(time_type = 1), compactify=FALSE),
-                 regexp="`additional_metadata` names overlap with existing metadata fields\n\"geo_type\", \"time_type\".")
+test_that("Warning thrown when other_metadata contains overlapping names with geo_type or time_type fields", {
+  expect_warning(as_epi_archive(dt, additional_metadata = list(geo_type = 1), compactify = FALSE),
+    regexp = "`additional_metadata` names overlap with existing metadata fields\n\"geo_type\", \"time_type\"."
+  )
+  expect_warning(as_epi_archive(dt, additional_metadata = list(time_type = 1), compactify = FALSE),
+    regexp = "`additional_metadata` names overlap with existing metadata fields\n\"geo_type\", \"time_type\"."
+  )
 })
 
-test_that("epi_archives are correctly instantiated with a variety of data types",{
+test_that("epi_archives are correctly instantiated with a variety of data types", {
   # Data frame
-  df <- data.frame(geo_value="ca",
-                   time_value=as.Date("2020-01-01"),
-                   version = as.Date("2020-01-01") + 0:19,
-                   value=1:20)
-
-  ea1 <- as_epi_archive(df, compactify=FALSE)
-  expect_equal(key(ea1$DT),c("geo_value","time_value","version"))
-  expect_equal(ea1$additional_metadata,list())
-
-  ea2 <- as_epi_archive(df, other_keys="value", additional_metadata=list(value=df$value), compactify=FALSE)
-  expect_equal(key(ea2$DT),c("geo_value","time_value","value","version"))
-  expect_equal(ea2$additional_metadata,list(value=df$value))
-
+  df <- data.frame(
+    geo_value = "ca",
+    time_value = as.Date("2020-01-01"),
+    version = as.Date("2020-01-01") + 0:19,
+    value = 1:20
+  )
+
+  ea1 <- as_epi_archive(df, compactify = FALSE)
+  expect_equal(key(ea1$DT), c("geo_value", "time_value", "version"))
+  expect_equal(ea1$additional_metadata, list())
+
+  ea2 <- as_epi_archive(df, other_keys = "value", additional_metadata = list(value = df$value), compactify = FALSE)
+  expect_equal(key(ea2$DT), c("geo_value", "time_value", "value", "version"))
+  expect_equal(ea2$additional_metadata, list(value = df$value))
+
   # Tibble
-  tib <- tibble::tibble(df, code="x")
-
-  ea3 <- as_epi_archive(tib, compactify=FALSE)
-  expect_equal(key(ea3$DT),c("geo_value","time_value","version"))
-  expect_equal(ea3$additional_metadata,list())
-
-  ea4 <- as_epi_archive(tib, other_keys="code", additional_metadata=list(value=df$value), compactify=FALSE)
-  expect_equal(key(ea4$DT),c("geo_value","time_value","code","version"))
-  expect_equal(ea4$additional_metadata,list(value=df$value))
-
+  tib <- tibble::tibble(df, code = "x")
+
+  ea3 <- as_epi_archive(tib, compactify = FALSE)
+  expect_equal(key(ea3$DT), c("geo_value", "time_value", "version"))
+  expect_equal(ea3$additional_metadata, list())
+
+  ea4 <- as_epi_archive(tib, other_keys = "code", additional_metadata = list(value = df$value), compactify = FALSE)
+  expect_equal(key(ea4$DT), c("geo_value", "time_value", "code", "version"))
+  expect_equal(ea4$additional_metadata, list(value = df$value))
+
   # Keyed data.table
-  kdt <- data.table::data.table(geo_value="ca",
-                                time_value=as.Date("2020-01-01"),
-                                version = as.Date("2020-01-01") + 0:19,
-                                value = 1:20,
-                                code = "CA",
-                                key = "code")
-
-  ea5 <- as_epi_archive(kdt, compactify=FALSE)
+  kdt <- data.table::data.table(
+    geo_value = "ca",
+    time_value = as.Date("2020-01-01"),
+    version = as.Date("2020-01-01") + 0:19,
+    value = 1:20,
+    code = "CA",
+    key = "code"
+  )
+
+  ea5 <- as_epi_archive(kdt, compactify = FALSE)
   # Key from data.table isn't absorbed when as_epi_archive is used
-  expect_equal(key(ea5$DT),c("geo_value","time_value","version"))
-  expect_equal(ea5$additional_metadata,list())
-
-  ea6 <- as_epi_archive(kdt,other_keys="value", additional_metadata=list(value=df$value), compactify=FALSE)
+  expect_equal(key(ea5$DT), c("geo_value", "time_value", "version"))
+  expect_equal(ea5$additional_metadata, list())
+
+  ea6 <- as_epi_archive(kdt, other_keys = "value", additional_metadata = list(value = df$value), compactify = FALSE)
   # Mismatched keys, but the one from as_epi_archive overrides
-  expect_equal(key(ea6$DT),c("geo_value","time_value","value","version"))
-  expect_equal(ea6$additional_metadata,list(value=df$value))
+  expect_equal(key(ea6$DT), c("geo_value", "time_value", "value", "version"))
+  expect_equal(ea6$additional_metadata, list(value = df$value))
+
   # Unkeyed data.table
-  udt <- data.table::data.table(geo_value="ca",
-                                time_value=as.Date("2020-01-01"),
-                                version = as.Date("2020-01-01") + 0:19,
-                                value=1:20,
-                                code = "CA")
-
-  ea7 <- as_epi_archive(udt, compactify=FALSE)
-  expect_equal(key(ea7$DT),c("geo_value","time_value","version"))
-  expect_equal(ea7$additional_metadata,list())
-
-  ea8 <- as_epi_archive(udt,other_keys="code", additional_metadata=list(value=df$value), compactify=FALSE)
-  expect_equal(key(ea8$DT),c("geo_value","time_value","code","version"))
-  expect_equal(ea8$additional_metadata,list(value=df$value))
-
+  udt <- data.table::data.table(
+    geo_value = "ca",
+    time_value = as.Date("2020-01-01"),
+    version = as.Date("2020-01-01") + 0:19,
+    value = 1:20,
+    code = "CA"
+  )
+
+  ea7 <- as_epi_archive(udt, compactify = FALSE)
+  expect_equal(key(ea7$DT), c("geo_value", "time_value", "version"))
+  expect_equal(ea7$additional_metadata, list())
+
+  ea8 <- as_epi_archive(udt, other_keys = "code", additional_metadata = list(value = df$value), compactify = FALSE)
+  expect_equal(key(ea8$DT), c("geo_value", "time_value", "code", "version"))
+  expect_equal(ea8$additional_metadata, list(value = df$value))
+
   # epi_df
   edf1 <- jhu_csse_daily_subset %>%
-    select(geo_value,time_value,cases) %>%
+    select(geo_value, time_value, cases) %>%
     mutate(version = max(time_value), code = "USA")
-
-  ea9 <- as_epi_archive(edf1, compactify=FALSE)
-  expect_equal(key(ea9$DT),c("geo_value","time_value","version"))
-  expect_equal(ea9$additional_metadata,list())
-
-  ea10 <- as_epi_archive(edf1,other_keys="code", additional_metadata=list(value=df$value), compactify=FALSE)
-  expect_equal(key(ea10$DT),c("geo_value","time_value","code","version"))
-  expect_equal(ea10$additional_metadata,list(value=df$value))
-
+
+  ea9 <- as_epi_archive(edf1, compactify = FALSE)
+  expect_equal(key(ea9$DT), c("geo_value", "time_value", "version"))
+  expect_equal(ea9$additional_metadata, list())
+
+  ea10 <- as_epi_archive(edf1, other_keys = "code", additional_metadata = list(value = df$value), compactify = FALSE)
+  expect_equal(key(ea10$DT), c("geo_value", "time_value", "code", "version"))
+  expect_equal(ea10$additional_metadata, list(value = df$value))
+
   # Keyed epi_df
-  edf2 <- data.frame(geo_value = "al",
-                     time_value = rep(as.Date("2020-01-01") + 0:9,2),
-                     version = c(rep(as.Date("2020-01-25"),10),
-                                 rep(as.Date("2020-01-26"),10)),
-                     cases = 1:20,
-                     misc = "USA") %>%
+  edf2 <- data.frame(
+    geo_value = "al",
+    time_value = rep(as.Date("2020-01-01") + 0:9, 2),
+    version = c(
+      rep(as.Date("2020-01-25"), 10),
+      rep(as.Date("2020-01-26"), 10)
+    ),
+    cases = 1:20,
+    misc = "USA"
+  ) %>%
     as_epi_df(additional_metadata = list(other_keys = "misc"))
-
-  ea11 <- as_epi_archive(edf2, compactify=FALSE)
-  expect_equal(key(ea11$DT),c("geo_value","time_value","version"))
-  expect_equal(ea11$additional_metadata,list())
-
-  ea12 <- as_epi_archive(edf2,other_keys="misc", additional_metadata=list(value=df$misc), compactify=FALSE)
-  expect_equal(key(ea12$DT),c("geo_value","time_value","misc","version"))
-  expect_equal(ea12$additional_metadata,list(value=df$misc))
+
+  ea11 <- as_epi_archive(edf2, compactify = FALSE)
+  expect_equal(key(ea11$DT), c("geo_value", "time_value", "version"))
+  expect_equal(ea11$additional_metadata, list())
+
+  ea12 <- as_epi_archive(edf2, other_keys = "misc", additional_metadata = list(value = df$misc), compactify = FALSE)
+  expect_equal(key(ea12$DT), c("geo_value", "time_value", "misc", "version"))
+  expect_equal(ea12$additional_metadata, list(value = df$misc))
 })
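The pattern across `ea1`–`ea12` above is the same throughout: `as_epi_archive` always keys the `DT` by `geo_value`, `time_value`, and `version`, and any `other_keys` are spliced in just before `version`. A condensed sketch:

    df <- data.frame(
      geo_value = "ca",
      time_value = as.Date("2020-01-01"),
      version = as.Date("2020-01-01") + 0:2,
      value = 1:3
    )
    ea <- as_epi_archive(df, other_keys = "value", compactify = FALSE)
    data.table::key(ea$DT) # "geo_value" "time_value" "value" "version"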
 
 test_that("`epi_archive` rejects nonunique keys", {
-  toy_update_tbl =
+  toy_update_tbl <-
     tibble::tribble(
-      ~geo_value, ~age_group, ~time_value, ~version, ~value,
-      "us", "adult", "2000-01-01", "2000-01-02", 121,
-      "us", "adult", "2000-01-01", "2000-01-03", 125, # (revision)
-      "us", "adult", "2000-01-02", "2000-01-03", 130,
-      "us", "pediatric", "2000-01-01", "2000-01-02", 5
+      ~geo_value, ~age_group, ~time_value, ~version, ~value,
+      "us", "adult", "2000-01-01", "2000-01-02", 121,
+      "us", "adult", "2000-01-01", "2000-01-03", 125, # (revision)
+      "us", "adult", "2000-01-02", "2000-01-03", 130,
+      "us", "pediatric", "2000-01-01", "2000-01-02", 5
     ) %>%
-    mutate(age_group = ordered(age_group, c("pediatric", "adult")),
-           time_value = as.Date(time_value),
-           version = as.Date(version))
+    mutate(
+      age_group = ordered(age_group, c("pediatric", "adult")),
+      time_value = as.Date(time_value),
+      version = as.Date(version)
+    )
   expect_error(
     as_epi_archive(toy_update_tbl),
     class = "epiprocess__epi_archive_requires_unique_key"
diff --git a/tests/testthat/test-compactify.R b/tests/testthat/test-compactify.R
index f8d956c0..4400c94a 100644
--- a/tests/testthat/test-compactify.R
+++ b/tests/testthat/test-compactify.R
@@ -3,22 +3,22 @@ library(data.table)
 library(dplyr)
 
 dt <- archive_cases_dv_subset$DT
-dt <- filter(dt,geo_value == "ca") %>%
+dt <- filter(dt, geo_value == "ca") %>%
   filter(version <= "2020-06-15") %>%
   select(-case_rate_7d_av)
 
 test_that("Input for compactify must be NULL or a boolean", {
-  expect_error(as_epi_archive(dt,compactify="no"))
+  expect_error(as_epi_archive(dt, compactify = "no"))
 })
 
 dt$percent_cli <- c(1:80)
 dt$case_rate <- c(1:80)
 
-row_replace <- function(dt,row,x,y) {
+row_replace <- function(dt, row, x, y) {
   # (This way of "replacing" elements appears to use copy-on-write even though
   # we are working with a data.table.)
-  dt[row,4] <- x
-  dt[row,5] <- y
+  dt[row, 4] <- x
+  dt[row, 5] <- y
   dt
 }
 
@@ -26,7 +26,7 @@ row_replace <- function(dt, row, x, y) {
 # observation carried forward)
 
 # Rows 1 should not be eliminated even if NA
-dt <- row_replace(dt,1,NA,NA) # Not LOCF
+dt <- row_replace(dt, 1, NA, NA) # Not LOCF
 
 # NOTE! We are assuming that there are no NA's in geo_value, time_value,
 # and version. Even though compactify may erroneously remove the first row
@@ -34,71 +34,71 @@ dt <- row_replace(dt,1,NA,NA) # Not LOCF
 # has problems beyond the scope of this test
 
 # Rows 11 and 12 correspond to different time_values
-dt <- row_replace(dt,12,11,11) # Not LOCF
+dt <- row_replace(dt, 12, 11, 11) # Not LOCF
 
 # Rows 20 and 21 only differ in version
-dt <- row_replace(dt,21,20,20) # LOCF
+dt <- row_replace(dt, 21, 20, 20) # LOCF
 
 # Rows 21 and 22 only differ in version
-dt <- row_replace(dt,22,20,20) # LOCF
+dt <- row_replace(dt, 22, 20, 20) # LOCF
 
 # Row 39 comprises the first NA's
-dt <- row_replace(dt,39,NA,NA) # Not LOCF
+dt <- row_replace(dt, 39, NA, NA) # Not LOCF
 
 # Row 40 has two NA's, just like its lag, row 39
-dt <- row_replace(dt,40,NA,NA) # LOCF
+dt <- row_replace(dt, 40, NA, NA) # LOCF
 
 # Row 62's values already exist in row 15, but row 15 is not a preceding row
-dt <- row_replace(dt,62,15,15) # Not LOCF
+dt <- row_replace(dt, 62, 15, 15) # Not LOCF
 
 # Row 73 only has one value carried over
-dt <- row_replace(dt,74,73,74) # Not LOCF
+dt <- row_replace(dt, 74, 73, 74) # Not LOCF
 
-dt_true <- as_tibble(as_epi_archive(dt,compactify=TRUE)$DT)
-dt_false <- as_tibble(as_epi_archive(dt,compactify=FALSE)$DT)
-dt_null <- suppressWarnings(as_tibble(as_epi_archive(dt,compactify=NULL)$DT))
+dt_true <- as_tibble(as_epi_archive(dt, compactify = TRUE)$DT)
+dt_false <- as_tibble(as_epi_archive(dt, compactify = FALSE)$DT)
+dt_null <- suppressWarnings(as_tibble(as_epi_archive(dt, compactify = NULL)$DT))
 
 test_that("Warning for LOCF with compactify as NULL", {
-  expect_warning(as_epi_archive(dt,compactify=NULL))
+  expect_warning(as_epi_archive(dt, compactify = NULL))
 })
 
 test_that("No warning when there is no LOCF", {
-  expect_warning(as_epi_archive(dt[1:5],compactify=NULL),NA)
+  expect_warning(as_epi_archive(dt[1:5], compactify = NULL), NA)
 })
 
 test_that("LOCF values are ignored with compactify=FALSE", {
-  expect_identical(nrow(dt),nrow(dt_false))
+  expect_identical(nrow(dt), nrow(dt_false))
 })
 
 test_that("LOCF values are taken out with compactify=TRUE", {
-  dt_test <- as_tibble(as_epi_archive(dt[-c(21,22,40),],compactify=FALSE)$DT)
-
-  expect_identical(dt_true,dt_null)
-  expect_identical(dt_null,dt_test)
+  dt_test <- as_tibble(as_epi_archive(dt[-c(21, 22, 40), ], compactify = FALSE)$DT)
+
+  expect_identical(dt_true, dt_null)
+  expect_identical(dt_null, dt_test)
 })
 
 test_that("as_of produces the same results with compactify=TRUE as with compactify=FALSE", {
-  ea_true <- as_epi_archive(dt,compactify=TRUE)
-  ea_false <- as_epi_archive(dt,compactify=FALSE)
-
+  ea_true <- as_epi_archive(dt, compactify = TRUE)
+  ea_false <- as_epi_archive(dt, compactify = FALSE)
+
   # Row 22, an LOCF row corresponding to the latest version, is omitted in
   # ea_true
-  latest_version = max(ea_false$DT$version)
-  as_of_true <- ea_true$as_of(latest_version)
+  latest_version <- max(ea_false$DT$version)
+  as_of_true <- ea_true$as_of(latest_version)
   as_of_false <- ea_false$as_of(latest_version)
-
-  expect_identical(as_of_true,as_of_false)
+
+  expect_identical(as_of_true, as_of_false)
 })
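The LOCF rule these tests exercise: compactification drops an update row when it exactly repeats the immediately preceding version's values for the same key, since replaying the archive reproduces it anyway. A minimal sketch:

    x <- tibble::tibble(
      geo_value = "g1",
      time_value = as.Date("2020-01-01"),
      version = as.Date("2020-01-01") + 1:3,
      value = c(1, 1, 2) # the second row is a pure LOCF repeat
    )
    nrow(as_epi_archive(x, compactify = TRUE)$DT)  # 2
    nrow(as_epi_archive(x, compactify = FALSE)$DT) # 3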
 
 test_that("compactify does not alter the default clobberable and observed version bounds", {
-  x = tibble::tibble(
+  x <- tibble::tibble(
     geo_value = "geo1",
     time_value = as.Date("2000-01-01"),
     version = as.Date("2000-01-01") + 1:5,
     value = 42L
   )
-  ea_true <- as_epi_archive(x, compactify=TRUE)
-  ea_false <- as_epi_archive(x, compactify=FALSE)
+  ea_true <- as_epi_archive(x, compactify = TRUE)
+  ea_false <- as_epi_archive(x, compactify = FALSE)
   # We say that we base the bounds on the user's `x` arg. We might mess up or
   # change our minds and base things on the `DT` field (or a temporary `DT`
   # variable, post-compactify) instead. Check that this test would trigger
diff --git a/tests/testthat/test-correlation.R b/tests/testthat/test-correlation.R
index b7b22dc6..fe129616 100644
--- a/tests/testthat/test-correlation.R
+++ b/tests/testthat/test-correlation.R
@@ -1,34 +1,39 @@
 library(tibble)
 
-test_that("epi_cor throws an error for a non-epi_df for its first argument",{
-  expect_error(epi_cor(1:10,1,1))
-  expect_error(epi_cor(data.frame(x=1:10),1,1))
+test_that("epi_cor throws an error for a non-epi_df for its first argument", {
+  expect_error(epi_cor(1:10, 1, 1))
+  expect_error(epi_cor(data.frame(x = 1:10), 1, 1))
 })
 
-test_that("epi_cor requires two var arguments, var1 and var2",{
-  expect_error(epi_cor(archive_cases_dv_subset$DT,var2=1))
-  expect_error(epi_cor(archive_cases_dv_subset$DT,var1=1))
+test_that("epi_cor requires two var arguments, var1 and var2", {
+  expect_error(epi_cor(archive_cases_dv_subset$DT, var2 = 1))
+  expect_error(epi_cor(archive_cases_dv_subset$DT, var1 = 1))
 })
 
-test_that("epi_cor functions as intended",{
-  expect_equal(epi_cor(x = jhu_csse_daily_subset,
-                       var1 = case_rate_7d_av,
-                       var2 = death_rate_7d_av,
-                       cor_by = geo_value,
-                       dt1 = -2)[1],
-               tibble(geo_value = unique(jhu_csse_daily_subset$geo_value))
+test_that("epi_cor functions as intended", {
+  expect_equal(
+    epi_cor(
+      x = jhu_csse_daily_subset,
+      var1 = case_rate_7d_av,
+      var2 = death_rate_7d_av,
+      cor_by = geo_value,
+      dt1 = -2
+    )[1],
+    tibble(geo_value = unique(jhu_csse_daily_subset$geo_value))
   )
-
-  edf <- as_epi_df(data.frame(geo_value=rep("asdf",20),
-                              time_value=as.Date("2020-01-01") + 1:20,
-                              pos=1:20,
-                              neg=-(1:20)))
-  expect_equal(epi_cor(edf, pos, pos)[[2]],1)
-  expect_equal(epi_cor(edf, pos, neg)[[2]],-1)
+
+  edf <- as_epi_df(data.frame(
+    geo_value = rep("asdf", 20),
+    time_value = as.Date("2020-01-01") + 1:20,
+    pos = 1:20,
+    neg = -(1:20)
+  ))
+  expect_equal(epi_cor(edf, pos, pos)[[2]], 1)
+  expect_equal(epi_cor(edf, pos, neg)[[2]], -1)
 })
 
-test_that("shift works as intended",{
-  expect_identical(epiprocess:::shift(1:100,1),dplyr::lead(1:100))
-  expect_identical(epiprocess:::shift(1:100,0),1:100)
-  expect_identical(epiprocess:::shift(1:100,-1),dplyr::lag(1:100))
-})
\ No newline at end of file
+test_that("shift works as intended", {
+  expect_identical(epiprocess:::shift(1:100, 1), dplyr::lead(1:100))
+  expect_identical(epiprocess:::shift(1:100, 0), 1:100)
+  expect_identical(epiprocess:::shift(1:100, -1), dplyr::lag(1:100))
+})
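For orientation on the `dt1` argument exercised above: it shifts `var1` in time before correlating within each `cor_by` group (internally via the same `shift` helper the last test checks against `dplyr::lead`/`lag`), so `dt1 = -2` correlates cases from two days earlier against same-day deaths. A sketch:

    epi_cor(jhu_csse_daily_subset,
      case_rate_7d_av, death_rate_7d_av,
      cor_by = geo_value, dt1 = -2
    )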
diff --git a/tests/testthat/test-data.R b/tests/testthat/test-data.R
index f3d4c9d7..511cc8d7 100644
--- a/tests/testthat/test-data.R
+++ b/tests/testthat/test-data.R
@@ -5,12 +5,17 @@ test_that("`archive_cases_dv_subset` is formed successfully", {
 test_that("`delayed_assign_with_unregister_awareness` works as expected on good promises", {
   # Since we're testing environment stuff, use some "my_" prefixes to try to
   # prevent naming coincidences from changing behavior.
-  my_eval_env = rlang::new_environment(list(x=40L, n_evals=0L), parent=rlang::base_env())
-  my_assign_env = rlang::new_environment()
-  delayed_assign_with_unregister_awareness("good1", {
-    n_evals <- n_evals + 1L
-    x + 2L
-  }, my_eval_env, my_assign_env)
+  my_eval_env <- rlang::new_environment(list(x = 40L, n_evals = 0L), parent = rlang::base_env())
+  my_assign_env <- rlang::new_environment()
+  delayed_assign_with_unregister_awareness(
+    "good1",
+    {
+      n_evals <- n_evals + 1L
+      x + 2L
+    },
+    my_eval_env,
+    my_assign_env
+  )
   force(my_assign_env[["good1"]])
   force(my_assign_env[["good1"]])
   force(my_assign_env[["good1"]])
@@ -19,12 +24,17 @@ test_that("`delayed_assign_with_unregister_awareness` works as expected on good
 })
 
 test_that("original `delayedAssign` works as expected on good promises", {
-  my_eval_env = rlang::new_environment(list(x=40L, n_evals=0L), parent=rlang::base_env())
-  my_assign_env = rlang::new_environment()
-  delayedAssign("good1", {
-    n_evals <- n_evals + 1L
-    x + 2L
-  }, my_eval_env, my_assign_env)
+  my_eval_env <- rlang::new_environment(list(x = 40L, n_evals = 0L), parent = rlang::base_env())
+  my_assign_env <- rlang::new_environment()
+  delayedAssign(
+    "good1",
+    {
+      n_evals <- n_evals + 1L
+      x + 2L
+    },
+    my_eval_env,
+    my_assign_env
+  )
   force(my_assign_env[["good1"]])
   force(my_assign_env[["good1"]])
   force(my_assign_env[["good1"]])
@@ -33,34 +43,34 @@ test_that("original `delayedAssign` works as expected on good promises", {
 })
 
 test_that("`delayed_assign_with_unregister_awareness` doesn't wrap a buggy promise if not unregistering", {
-  delayed_assign_with_unregister_awareness("x", Abort("msg", class="original_error_class"))
-  expect_error(force(x), class="original_error_class")
+  delayed_assign_with_unregister_awareness("x", Abort("msg", class = "original_error_class"))
+  expect_error(force(x), class = "original_error_class")
 })
 
 test_that("`delayed_assign_with_unregister_awareness` doesn't wrap a buggy promise if not unregistering", {
-  delayed_assign_with_unregister_awareness("x", Abort("msg", class="original_error_class"))
+  delayed_assign_with_unregister_awareness("x", Abort("msg", class = "original_error_class"))
   # Take advantage of a false positive / hedge against package renaming: make
   # our own `unregister` function to trigger the special error message.
-  unregister = function(y) y
-  expect_error(unregister(force(x)), class="epiprocess__promise_evaluation_error_during_unregister")
+  unregister <- function(y) y
+  expect_error(unregister(force(x)), class = "epiprocess__promise_evaluation_error_during_unregister")
 })
 
 test_that("`delayed_assign_with_unregister_awareness` injection support works", {
-  my_exprs = rlang::exprs(a = b + c, d = e)
+  my_exprs <- rlang::exprs(a = b + c, d = e)
   delayed_assign_with_unregister_awareness(
     "good2", list(!!!my_exprs),
-    eval.env=rlang::new_environment(list(b=2L, c=3L, e=4L), rlang::base_env())
+    eval.env = rlang::new_environment(list(b = 2L, c = 3L, e = 4L), rlang::base_env())
   )
   force(good2)
-  expect_identical(good2, list(a=5L, d=4L))
+  expect_identical(good2, list(a = 5L, d = 4L))
 })
 
 test_that("`some_package_is_being_unregistered` doesn't fail in response to non-simple calls", {
   # Prerequisite for current implementation to work (testing here to help debug
   # in case some R version doesn't obey):
   expect_false(NA_character_ %in% letters)
-  f = function() function() some_package_is_being_unregistered()
-  my_expr = rlang::expr(f()())
+  f <- function() function() some_package_is_being_unregistered()
+  my_expr <- rlang::expr(f()())
   # Prerequisite for this test to actually be testing on non-simple calls:
   expect_false(rlang::is_call_simple(my_expr))
   # Actual test (`FALSE` is correct; `NA` or error is not):
diff --git a/tests/testthat/test-deprecations.R b/tests/testthat/test-deprecations.R
index 334b4488..5be3824e 100644
--- a/tests/testthat/test-deprecations.R
+++ b/tests/testthat/test-deprecations.R
@@ -1,48 +1,47 @@
-
-test_that("epix_slide group_by= deprecation works",{
+test_that("epix_slide group_by= deprecation works", {
   expect_error(
     archive_cases_dv_subset %>%
-      epix_slide(function(...) {}, before=2L, group_by=c()),
+      epix_slide(function(...) {}, before = 2L, group_by = c()),
     class = "epiprocess__epix_slide_group_by_parameter_deprecated"
   )
   expect_error(
     archive_cases_dv_subset$
-      slide(function(...) {}, before=2L, group_by=c()),
+      slide(function(...) {}, before = 2L, group_by = c()),
     class = "epiprocess__epix_slide_group_by_parameter_deprecated"
   )
   expect_error(
     archive_cases_dv_subset %>%
       group_by(geo_value) %>%
-      epix_slide(function(...) {}, before=2L, group_by=c()),
+      epix_slide(function(...) {}, before = 2L, group_by = c()),
     class = "epiprocess__epix_slide_group_by_parameter_deprecated"
   )
   expect_error(
     archive_cases_dv_subset$
       group_by(geo_value)$
-      slide(function(...) {}, before=2L, group_by=c()),
+      slide(function(...) {}, before = 2L, group_by = c()),
     class = "epiprocess__epix_slide_group_by_parameter_deprecated"
   )
   #
   expect_error(
     archive_cases_dv_subset %>%
-      epix_slide(function(...) {}, before=2L, all_rows=TRUE),
+      epix_slide(function(...) {}, before = 2L, all_rows = TRUE),
     class = "epiprocess__epix_slide_all_rows_parameter_deprecated"
   )
   expect_error(
     archive_cases_dv_subset$
-      slide(function(...) {}, before=2L, all_rows=TRUE),
+      slide(function(...) {}, before = 2L, all_rows = TRUE),
     class = "epiprocess__epix_slide_all_rows_parameter_deprecated"
   )
   expect_error(
     archive_cases_dv_subset %>%
       group_by(geo_value) %>%
-      epix_slide(function(...) {}, before=2L, all_rows=TRUE),
+      epix_slide(function(...) {}, before = 2L, all_rows = TRUE),
     class = "epiprocess__epix_slide_all_rows_parameter_deprecated"
   )
   expect_error(
     archive_cases_dv_subset$
       group_by(geo_value)$
-      slide(function(...) {}, before=2L, all_rows=TRUE),
{}, before = 2L, all_rows = TRUE), class = "epiprocess__epix_slide_all_rows_parameter_deprecated" ) }) diff --git a/tests/testthat/test-epi_slide.R b/tests/testthat/test-epi_slide.R index 2e61e088..e2bbc040 100644 --- a/tests/testthat/test-epi_slide.R +++ b/tests/testthat/test-epi_slide.R @@ -2,116 +2,154 @@ d <- as.Date("2020-01-01") -ungrouped = dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:200, value=1:200), - dplyr::tibble(geo_value = "al", time_value = d + 1:5, value=-(1:5)) +ungrouped <- dplyr::bind_rows( + dplyr::tibble(geo_value = "ak", time_value = d + 1:200, value = 1:200), + dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5)) ) %>% as_epi_df() -grouped = ungrouped %>% +grouped <- ungrouped %>% group_by(geo_value) -small_x = dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value=11:15), - dplyr::tibble(geo_value = "al", time_value = d + 1:5, value=-(1:5)) +small_x <- dplyr::bind_rows( + dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15), + dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5)) ) %>% as_epi_df(as_of = d + 6) %>% group_by(geo_value) -f = function(x, g, t) dplyr::tibble(value=mean(x$value), count=length(x$value)) +f <- function(x, g, t) dplyr::tibble(value = mean(x$value), count = length(x$value)) -toy_edf = tibble::tribble( - ~geo_value, ~time_value, ~value , - "a" , 1:10 , 2L^( 1:10), - "b" , 1:10 , 2L^(11:20), - ) %>% +toy_edf <- tibble::tribble( + ~geo_value, ~time_value, ~value, + "a", 1:10, 2L^(1:10), + "b", 1:10, 2L^(11:20), +) %>% tidyr::unchop(c(time_value, value)) %>% as_epi_df(as_of = 100) ## --- These cases generate errors (or not): --- test_that("`before` and `after` are both vectors of length 1", { - expect_error(epi_slide(grouped, f, before = c(0,1), after = 0, ref_time_values = d+3), - "`before`.*length-1") - expect_error(epi_slide(grouped, f, before = 1, after = c(0,1), ref_time_values = d+3), - "`after`.*length-1") + expect_error( + epi_slide(grouped, f, before = c(0, 1), after = 0, ref_time_values = d + 3), + "`before`.*length-1" + ) + expect_error( + epi_slide(grouped, f, before = 1, after = c(0, 1), ref_time_values = d + 3), + "`after`.*length-1" + ) }) test_that("Test errors/warnings for discouraged features", { - expect_error(epi_slide(grouped, f, ref_time_values = d+1), - "Either or both of `before`, `after` must be provided.") - expect_warning(epi_slide(grouped, f, before = 0L, ref_time_values = d+1), - "`before==0`, `after` missing") - expect_warning(epi_slide(grouped, f, after = 0L, ref_time_values = d+1), - "`before` missing, `after==0`") + expect_error( + epi_slide(grouped, f, ref_time_values = d + 1), + "Either or both of `before`, `after` must be provided." 
+  )
+  expect_warning(
+    epi_slide(grouped, f, before = 0L, ref_time_values = d + 1),
+    "`before==0`, `after` missing"
+  )
+  expect_warning(
+    epi_slide(grouped, f, after = 0L, ref_time_values = d + 1),
+    "`before` missing, `after==0`"
+  )
   # Below cases should raise no errors/warnings:
-  expect_warning(epi_slide(grouped, f, before = 1L, ref_time_values = d+2),NA)
-  expect_warning(epi_slide(grouped, f, after = 1L, ref_time_values = d+2),NA)
-  expect_warning(epi_slide(grouped, f, before = 0L, after = 0L, ref_time_values = d+2),NA)
+  expect_warning(epi_slide(grouped, f, before = 1L, ref_time_values = d + 2), NA)
+  expect_warning(epi_slide(grouped, f, after = 1L, ref_time_values = d + 2), NA)
+  expect_warning(epi_slide(grouped, f, before = 0L, after = 0L, ref_time_values = d + 2), NA)
 })
 
-test_that("Both `before` and `after` must be non-NA, non-negative, integer-compatible",{
-  expect_error(epi_slide(grouped, f, before = -1L, ref_time_values = d+2L),
-               "`before`.*non-negative")
-  expect_error(epi_slide(grouped, f, before = 2L, after = -1L, ref_time_values = d+2L),
-               "`after`.*non-negative")
-  expect_error(epi_slide(grouped, f, before = "a", ref_time_values = d+2L),
-               regexp="before", class="vctrs_error_incompatible_type")
-  expect_error(epi_slide(grouped, f, before = 1L, after = "a", ref_time_values = d+2L),
-               regexp="after", class="vctrs_error_incompatible_type")
-  expect_error(epi_slide(grouped, f, before = 0.5, ref_time_values = d+2L),
-               regexp="before", class="vctrs_error_incompatible_type")
-  expect_error(epi_slide(grouped, f, before = 1L, after = 0.5, ref_time_values = d+2L),
-               regexp="after", class="vctrs_error_incompatible_type")
-  expect_error(epi_slide(grouped, f, before = NA, after = 1L, ref_time_values = d+2L),
-               "`before`.*non-NA")
-  expect_error(epi_slide(grouped, f, before = 1L, after = NA, ref_time_values = d+2L),
-               "`after`.*non-NA")
+test_that("Both `before` and `after` must be non-NA, non-negative, integer-compatible", {
+  expect_error(
+    epi_slide(grouped, f, before = -1L, ref_time_values = d + 2L),
+    "`before`.*non-negative"
+  )
+  expect_error(
+    epi_slide(grouped, f, before = 2L, after = -1L, ref_time_values = d + 2L),
+    "`after`.*non-negative"
+  )
+  expect_error(epi_slide(grouped, f, before = "a", ref_time_values = d + 2L),
+    regexp = "before", class = "vctrs_error_incompatible_type"
+  )
+  expect_error(epi_slide(grouped, f, before = 1L, after = "a", ref_time_values = d + 2L),
+    regexp = "after", class = "vctrs_error_incompatible_type"
+  )
+  expect_error(epi_slide(grouped, f, before = 0.5, ref_time_values = d + 2L),
+    regexp = "before", class = "vctrs_error_incompatible_type"
+  )
+  expect_error(epi_slide(grouped, f, before = 1L, after = 0.5, ref_time_values = d + 2L),
+    regexp = "after", class = "vctrs_error_incompatible_type"
+  )
+  expect_error(
+    epi_slide(grouped, f, before = NA, after = 1L, ref_time_values = d + 2L),
+    "`before`.*non-NA"
+  )
+  expect_error(
+    epi_slide(grouped, f, before = 1L, after = NA, ref_time_values = d + 2L),
+    "`after`.*non-NA"
+  )
   # Non-integer-class but integer-compatible values are allowed:
-  expect_error(epi_slide(grouped, f, before = 1, after = 1, ref_time_values = d+2L),NA)
+  expect_error(epi_slide(grouped, f, before = 1, after = 1, ref_time_values = d + 2L), NA)
 })
 
 test_that("`ref_time_values` + `before` + `after` that result in no slide data generate the error", {
-  expect_error(epi_slide(grouped, f, before=2L, ref_time_values = d),
-               "All `ref_time_values` must appear in `x\\$time_value`.") # before the first, no data in the slide windows
-  expect_error(epi_slide(grouped, f, before=2L, ref_time_values = d+207L),
-               "All `ref_time_values` must appear in `x\\$time_value`.") # beyond the last, no data in window
+  expect_error(
+    epi_slide(grouped, f, before = 2L, ref_time_values = d),
+    "All `ref_time_values` must appear in `x\\$time_value`."
+  ) # before the first, no data in the slide windows
+  expect_error(
+    epi_slide(grouped, f, before = 2L, ref_time_values = d + 207L),
+    "All `ref_time_values` must appear in `x\\$time_value`."
+  ) # beyond the last, no data in window
 })
 
 test_that("`ref_time_values` + `before` + `after` that have some slide data, but generate the error due to ref. time being out of time range (would also happen if they were in between `time_value`s)", {
-  expect_error(epi_slide(grouped, f, before=0L, after=2L, ref_time_values = d),
-               "All `ref_time_values` must appear in `x\\$time_value`.") # before the first, but we'd expect there to be data in the window
-  expect_error(epi_slide(grouped, f, before=2L, ref_time_values = d+201L),
-               "All `ref_time_values` must appear in `x\\$time_value`.") # beyond the last, but still with data in window
+  expect_error(
+    epi_slide(grouped, f, before = 0L, after = 2L, ref_time_values = d),
+    "All `ref_time_values` must appear in `x\\$time_value`."
+  ) # before the first, but we'd expect there to be data in the window
+  expect_error(
+    epi_slide(grouped, f, before = 2L, ref_time_values = d + 201L),
+    "All `ref_time_values` must appear in `x\\$time_value`."
+  ) # beyond the last, but still with data in window
 })
 
 ## --- These cases generate warnings (or not): ---
-test_that("Warn user against having a blank `before`",{
-  expect_warning(epi_slide(grouped, f, after = 1L,
-                           ref_time_values = d+1L), NA)
-  expect_warning(epi_slide(grouped, f, before = 0L, after = 1L,
-                           ref_time_values = d+1L), NA)
+test_that("Warn user against having a blank `before`", {
+  expect_warning(epi_slide(grouped, f,
+    after = 1L,
+    ref_time_values = d + 1L
+  ), NA)
+  expect_warning(epi_slide(grouped, f,
+    before = 0L, after = 1L,
+    ref_time_values = d + 1L
+  ), NA)
 })
 
 ## --- These cases don't generate the error: ---
 test_that("these don't produce an error; the error appears only if the ref time values are out of the range for every group", {
-  expect_identical(epi_slide(grouped, f, before=2L, ref_time_values = d+200L) %>%
-                     ungroup() %>%
-                     dplyr::select("geo_value","slide_value_value"),
-                   dplyr::tibble(geo_value = "ak", slide_value_value = 199)) # out of range for one group
-  expect_identical(epi_slide(grouped, f, before=2L, ref_time_values=d+3) %>%
-                     ungroup() %>%
-                     dplyr::select("geo_value","slide_value_value"),
-                   dplyr::tibble(geo_value = c("ak", "al"), slide_value_value = c(2, -2))) # not out of range for either group
+  expect_identical(
+    epi_slide(grouped, f, before = 2L, ref_time_values = d + 200L) %>%
+      ungroup() %>%
+      dplyr::select("geo_value", "slide_value_value"),
+    dplyr::tibble(geo_value = "ak", slide_value_value = 199)
+  ) # out of range for one group
+  expect_identical(
+    epi_slide(grouped, f, before = 2L, ref_time_values = d + 3) %>%
+      ungroup() %>%
+      dplyr::select("geo_value", "slide_value_value"),
+    dplyr::tibble(geo_value = c("ak", "al"), slide_value_value = c(2, -2))
+  ) # not out of range for either group
 })
 
 test_that("computation output formats x as_list_col", {
   # See `toy_edf` definition at top of file.
   # We'll try 7d sum with a few formats.
- basic_result_from_size1 = tibble::tribble( - ~geo_value, ~time_value, ~value , ~slide_value , - "a" , 1:10 , 2L^( 1:10), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7,rep(7L, 3L)), adaptive=TRUE, na.rm=TRUE), - "b" , 1:10 , 2L^(11:20), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7,rep(7L, 3L)), adaptive=TRUE, na.rm=TRUE), - ) %>% + basic_result_from_size1 <- tibble::tribble( + ~geo_value, ~time_value, ~value, ~slide_value, + "a", 1:10, 2L^(1:10), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE), + "b", 1:10, 2L^(11:20), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE), + ) %>% tidyr::unchop(c(time_value, value, slide_value)) %>% dplyr::arrange(time_value) %>% as_epi_df(as_of = 100) @@ -134,21 +172,25 @@ test_that("computation output formats x as_list_col", { ) # output naming functionality: expect_identical( - toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)), - new_col_name = "result"), + toy_edf %>% epi_slide( + before = 6L, ~ data.frame(value = sum(.x$value)), + new_col_name = "result" + ), basic_result_from_size1 %>% rename(result_value = slide_value) ) expect_identical( - toy_edf %>% epi_slide(before = 6L, ~ data.frame(value_sum = sum(.x$value)), - names_sep = NULL), + toy_edf %>% epi_slide( + before = 6L, ~ data.frame(value_sum = sum(.x$value)), + names_sep = NULL + ), basic_result_from_size1 %>% rename(value_sum = slide_value) ) # trying with non-size-1 computation outputs: - basic_result_from_size2 = tibble::tribble( - ~geo_value, ~time_value, ~value , ~slide_value , - "a" , 1:10 , 2L^( 1:10), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7,rep(7L, 3L)), adaptive=TRUE, na.rm=TRUE), - "b" , 1:10 , 2L^(11:20), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7,rep(7L, 3L)), adaptive=TRUE, na.rm=TRUE) + 1L, - ) %>% + basic_result_from_size2 <- tibble::tribble( + ~geo_value, ~time_value, ~value, ~slide_value, + "a", 1:10, 2L^(1:10), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE), + "b", 1:10, 2L^(11:20), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE) + 1L, + ) %>% tidyr::unchop(c(time_value, value, slide_value)) %>% dplyr::arrange(time_value) %>% as_epi_df(as_of = 100) @@ -172,24 +214,25 @@ test_that("computation output formats x as_list_col", { }) test_that("epi_slide alerts if the provided f doesn't take enough args", { - f_xgt = function(x, g, t) dplyr::tibble(value=mean(x$value), count=length(x$value)) + f_xgt <- function(x, g, t) dplyr::tibble(value = mean(x$value), count = length(x$value)) # If `regexp` is NA, asserts that there should be no errors/messages. - expect_error(epi_slide(grouped, f_xgt, before = 1L, ref_time_values = d+1), regexp = NA) - expect_warning(epi_slide(grouped, f_xgt, before = 1L, ref_time_values = d+1), regexp = NA) + expect_error(epi_slide(grouped, f_xgt, before = 1L, ref_time_values = d + 1), regexp = NA) + expect_warning(epi_slide(grouped, f_xgt, before = 1L, ref_time_values = d + 1), regexp = NA) - f_x_dots = function(x, ...) dplyr::tibble(value=mean(x$value), count=length(x$value)) - expect_warning(epi_slide(grouped, f_x_dots, before = 1L, ref_time_values = d+1), - class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots") + f_x_dots <- function(x, ...) 
dplyr::tibble(value = mean(x$value), count = length(x$value)) + expect_warning(epi_slide(grouped, f_x_dots, before = 1L, ref_time_values = d + 1), + class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots" + ) }) test_that("`ref_time_values` + `all_rows = TRUE` works", { # See `toy_edf` definition at top of file. We'll do variants of a slide # returning the following: - basic_full_result = tibble::tribble( - ~geo_value, ~time_value, ~value , ~slide_value , - "a" , 1:10 , 2L^( 1:10), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7,rep(7L, 3L)), adaptive=TRUE, na.rm=TRUE), - "b" , 1:10 , 2L^(11:20), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7,rep(7L, 3L)), adaptive=TRUE, na.rm=TRUE), - ) %>% + basic_full_result <- tibble::tribble( + ~geo_value, ~time_value, ~value, ~slide_value, + "a", 1:10, 2L^(1:10), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE), + "b", 1:10, 2L^(11:20), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE), + ) %>% tidyr::unchop(c(time_value, value, slide_value)) %>% dplyr::arrange(time_value) %>% as_epi_df(as_of = 100) @@ -199,16 +242,21 @@ test_that("`ref_time_values` + `all_rows = TRUE` works", { basic_full_result ) expect_identical( - toy_edf %>% epi_slide(before = 6L, ~ sum(.x$value), - ref_time_values = c(2L, 8L)), + toy_edf %>% epi_slide( + before = 6L, ~ sum(.x$value), + ref_time_values = c(2L, 8L) + ), basic_full_result %>% dplyr::filter(time_value %in% c(2L, 8L)) ) expect_identical( - toy_edf %>% epi_slide(before = 6L, ~ sum(.x$value), - ref_time_values = c(2L, 8L), all_rows = TRUE), + toy_edf %>% epi_slide( + before = 6L, ~ sum(.x$value), + ref_time_values = c(2L, 8L), all_rows = TRUE + ), basic_full_result %>% dplyr::mutate(slide_value = dplyr::if_else(time_value %in% c(2L, 8L), - slide_value, NA_integer_)) + slide_value, NA_integer_ + )) ) # slide computations returning data frames: expect_identical( @@ -216,64 +264,82 @@ test_that("`ref_time_values` + `all_rows = TRUE` works", { basic_full_result %>% dplyr::rename(slide_value_value = slide_value) ) expect_identical( - toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)), - ref_time_values = c(2L, 8L)), + toy_edf %>% epi_slide( + before = 6L, ~ data.frame(value = sum(.x$value)), + ref_time_values = c(2L, 8L) + ), basic_full_result %>% dplyr::filter(time_value %in% c(2L, 8L)) %>% dplyr::rename(slide_value_value = slide_value) ) expect_identical( - toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)), - ref_time_values = c(2L, 8L), all_rows = TRUE), + toy_edf %>% epi_slide( + before = 6L, ~ data.frame(value = sum(.x$value)), + ref_time_values = c(2L, 8L), all_rows = TRUE + ), basic_full_result %>% dplyr::mutate(slide_value = dplyr::if_else(time_value %in% c(2L, 8L), - slide_value, NA_integer_)) %>% + slide_value, NA_integer_ + )) %>% dplyr::rename(slide_value_value = slide_value) ) # slide computations returning data frames with `as_list_col=TRUE`: expect_identical( - toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)), - as_list_col = TRUE), + toy_edf %>% epi_slide( + before = 6L, ~ data.frame(value = sum(.x$value)), + as_list_col = TRUE + ), basic_full_result %>% dplyr::mutate(slide_value = purrr::map(slide_value, ~ data.frame(value = .x))) ) expect_identical( - toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)), - ref_time_values = c(2L, 8L), - as_list_col = TRUE), + toy_edf %>% epi_slide( + before = 6L, ~ 
data.frame(value = sum(.x$value)), + ref_time_values = c(2L, 8L), + as_list_col = TRUE + ), basic_full_result %>% dplyr::mutate(slide_value = purrr::map(slide_value, ~ data.frame(value = .x))) %>% dplyr::filter(time_value %in% c(2L, 8L)) ) expect_identical( - toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)), - ref_time_values = c(2L, 8L), all_rows = TRUE, - as_list_col = TRUE), + toy_edf %>% epi_slide( + before = 6L, ~ data.frame(value = sum(.x$value)), + ref_time_values = c(2L, 8L), all_rows = TRUE, + as_list_col = TRUE + ), basic_full_result %>% dplyr::mutate(slide_value = purrr::map(slide_value, ~ data.frame(value = .x))) %>% dplyr::mutate(slide_value = dplyr::if_else(time_value %in% c(2L, 8L), - slide_value, list(NULL))) + slide_value, list(NULL) + )) ) # slide computations returning data frames, `as_list_col = TRUE`, `unnest`: expect_identical( - toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)), - as_list_col = TRUE) %>% + toy_edf %>% epi_slide( + before = 6L, ~ data.frame(value = sum(.x$value)), + as_list_col = TRUE + ) %>% unnest(slide_value, names_sep = "_"), basic_full_result %>% dplyr::rename(slide_value_value = slide_value) ) expect_identical( - toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)), - ref_time_values = c(2L, 8L), - as_list_col = TRUE) %>% + toy_edf %>% epi_slide( + before = 6L, ~ data.frame(value = sum(.x$value)), + ref_time_values = c(2L, 8L), + as_list_col = TRUE + ) %>% unnest(slide_value, names_sep = "_"), basic_full_result %>% dplyr::filter(time_value %in% c(2L, 8L)) %>% dplyr::rename(slide_value_value = slide_value) ) expect_identical( - toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)), - ref_time_values = c(2L, 8L), all_rows = TRUE, - as_list_col = TRUE) %>% + toy_edf %>% epi_slide( + before = 6L, ~ data.frame(value = sum(.x$value)), + ref_time_values = c(2L, 8L), all_rows = TRUE, + as_list_col = TRUE + ) %>% unnest(slide_value, names_sep = "_"), basic_full_result %>% # XXX unclear exactly what we want in this case. Current approach is @@ -282,7 +348,7 @@ test_that("`ref_time_values` + `all_rows = TRUE` works", { dplyr::filter(time_value %in% c(2L, 8L)) %>% dplyr::rename(slide_value_value = slide_value) ) - rework_nulls = function(slide_values_list) { + rework_nulls <- function(slide_values_list) { vctrs::vec_assign( slide_values_list, vctrs::vec_detect_missing(slide_values_list), @@ -290,14 +356,17 @@ test_that("`ref_time_values` + `all_rows = TRUE` works", { ) } expect_identical( - toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)), - ref_time_values = c(2L, 8L), all_rows = TRUE, - as_list_col = TRUE) %>% + toy_edf %>% epi_slide( + before = 6L, ~ data.frame(value = sum(.x$value)), + ref_time_values = c(2L, 8L), all_rows = TRUE, + as_list_col = TRUE + ) %>% mutate(slide_value = rework_nulls(slide_value)) %>% unnest(slide_value, names_sep = "_"), basic_full_result %>% dplyr::mutate(slide_value = dplyr::if_else(time_value %in% c(2L, 8L), - slide_value, NA_integer_)) %>% + slide_value, NA_integer_ + )) %>% dplyr::rename(slide_value_value = slide_value) ) }) @@ -314,56 +383,60 @@ test_that("`epi_slide` doesn't decay date output", { test_that("basic grouped epi_slide computation produces expected output", { # Also checks that we correctly remove extra rows and columns (`.real`) used # to recover `ref_time_value`s. 
- expected_output = dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value=cumsum(11:15)), - dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value=cumsum(-(1:5))) + expected_output <- dplyr::bind_rows( + dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value = cumsum(11:15)), + dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value = cumsum(-(1:5))) ) %>% group_by(geo_value) %>% as_epi_df(as_of = d + 6) # formula - result1 <- epi_slide(small_x, f = ~sum(.x$value), before=50) + result1 <- epi_slide(small_x, f = ~ sum(.x$value), before = 50) expect_identical(result1, expected_output) # function - result2 <- epi_slide(small_x, f = function(x, g, t) sum(x$value), before=50) + result2 <- epi_slide(small_x, f = function(x, g, t) sum(x$value), before = 50) expect_identical(result2, expected_output) # dots - result3 <- epi_slide(small_x, slide_value = sum(value), before=50) + result3 <- epi_slide(small_x, slide_value = sum(value), before = 50) expect_identical(result3, expected_output) }) test_that("ungrouped epi_slide computation completes successfully", { expect_error( small_x %>% - ungroup() %>% - epi_slide(before = 2, - slide_value = sum(.x$value)), - regexp=NA + ungroup() %>% + epi_slide( + before = 2, + slide_value = sum(.x$value) + ), + regexp = NA ) }) test_that("basic ungrouped epi_slide computation produces expected output", { - expected_output = dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value=cumsum(11:15)) + expected_output <- dplyr::bind_rows( + dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value = cumsum(11:15)) ) %>% as_epi_df(as_of = d + 6) result1 <- small_x %>% ungroup() %>% filter(geo_value == "ak") %>% - epi_slide(before = 50, - slide_value = sum(.x$value)) + epi_slide( + before = 50, + slide_value = sum(.x$value) + ) expect_identical(result1, expected_output) # Ungrouped with multiple geos - expected_output = dplyr::bind_rows( + expected_output <- dplyr::bind_rows( dplyr::tibble( - geo_value = "ak", time_value = d + 1:5, value=11:15, slide_value=cumsum(11:15) + cumsum(-(1:5) - )), + geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value = cumsum(11:15) + cumsum(-(1:5)) + ), dplyr::tibble( - geo_value = "al", time_value = d + 1:5, value=-(1:5), slide_value=cumsum(11:15) + cumsum(-(1:5)) + geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value = cumsum(11:15) + cumsum(-(1:5)) ) ) %>% as_epi_df(as_of = d + 6) %>% @@ -371,154 +444,181 @@ test_that("basic ungrouped epi_slide computation produces expected output", { result2 <- small_x %>% ungroup() %>% - epi_slide(before = 50, - slide_value = sum(.x$value)) + epi_slide( + before = 50, + slide_value = sum(.x$value) + ) expect_identical(result2, expected_output) }) test_that("epi_slide computation via formula can use ref_time_value", { - expected_output = dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value=d + 1:5), - dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value=d + 1:5) + expected_output <- dplyr::bind_rows( + dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value = d + 1:5), + dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value = d + 1:5) ) %>% group_by(geo_value) %>% as_epi_df(as_of = d + 6) result1 <- small_x %>% - epi_slide(f = ~ .ref_time_value, - before = 50) 
+ epi_slide( + f = ~.ref_time_value, + before = 50 + ) expect_identical(result1, expected_output) result2 <- small_x %>% - epi_slide(f = ~ .z, - before = 50) + epi_slide( + f = ~.z, + before = 50 + ) expect_identical(result2, expected_output) result3 <- small_x %>% - epi_slide(f = ~ ..3, - before = 50) + epi_slide( + f = ~..3, + before = 50 + ) expect_identical(result3, expected_output) # Ungrouped with multiple geos - expected_output = dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value=d + 1:5), - dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value=d + 1:5) + expected_output <- dplyr::bind_rows( + dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value = d + 1:5), + dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value = d + 1:5) ) %>% as_epi_df(as_of = d + 6) %>% arrange(time_value) result4 <- small_x %>% ungroup() %>% - epi_slide(f = ~ .ref_time_value, - before = 50) + epi_slide( + f = ~.ref_time_value, + before = 50 + ) expect_identical(result4, expected_output) }) test_that("epi_slide computation via function can use ref_time_value", { - expected_output = dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value=d + 1:5), - dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value=d + 1:5) + expected_output <- dplyr::bind_rows( + dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value = d + 1:5), + dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value = d + 1:5) ) %>% group_by(geo_value) %>% as_epi_df(as_of = d + 6) result1 <- small_x %>% - epi_slide(f = function(x, g, t) t, - before = 2) + epi_slide( + f = function(x, g, t) t, + before = 2 + ) expect_identical(result1, expected_output) }) test_that("epi_slide computation via dots can use ref_time_value and group", { # ref_time_value - expected_output = dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value=d + 1:5), - dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value=d + 1:5) + expected_output <- dplyr::bind_rows( + dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value = d + 1:5), + dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value = d + 1:5) ) %>% group_by(geo_value) %>% as_epi_df(as_of = d + 6) result1 <- small_x %>% - epi_slide(before = 50, - slide_value = .ref_time_value) + epi_slide( + before = 50, + slide_value = .ref_time_value + ) expect_identical(result1, expected_output) # `.{x,group_key,ref_time_value}` should be inaccessible from `.data` and # `.env`. 
expect_error(small_x %>% - epi_slide(before = 50, - slide_value = .env$.ref_time_value) - ) + epi_slide( + before = 50, + slide_value = .env$.ref_time_value + )) # group_key # Use group_key column - expected_output = dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value="ak"), - dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value="al") + expected_output <- dplyr::bind_rows( + dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value = "ak"), + dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value = "al") ) %>% group_by(geo_value) %>% as_epi_df(as_of = d + 6) result3 <- small_x %>% - epi_slide(before = 2, - slide_value = .group_key$geo_value) + epi_slide( + before = 2, + slide_value = .group_key$geo_value + ) expect_identical(result3, expected_output) # Use entire group_key object - expected_output = dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value=1L), - dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value=1L) + expected_output <- dplyr::bind_rows( + dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value = 1L), + dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value = 1L) ) %>% group_by(geo_value) %>% as_epi_df(as_of = d + 6) result4 <- small_x %>% - epi_slide(before = 2, - slide_value = nrow(.group_key)) + epi_slide( + before = 2, + slide_value = nrow(.group_key) + ) expect_identical(result4, expected_output) # Ungrouped with multiple geos - expected_output = dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value=d + 1:5), - dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value=d + 1:5) + expected_output <- dplyr::bind_rows( + dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value = d + 1:5), + dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value = d + 1:5) ) %>% as_epi_df(as_of = d + 6) %>% arrange(time_value) result5 <- small_x %>% ungroup() %>% - epi_slide(before = 50, - slide_value = .ref_time_value) + epi_slide( + before = 50, + slide_value = .ref_time_value + ) expect_identical(result5, expected_output) }) test_that("epi_slide computation via dots outputs the same result using col names and the data var", { expected_output <- small_x %>% - epi_slide(before = 2, - slide_value = max(time_value)) %>% + epi_slide( + before = 2, + slide_value = max(time_value) + ) %>% as_epi_df(as_of = d + 6) result1 <- small_x %>% - epi_slide(before = 2, - slide_value = max(.x$time_value)) + epi_slide( + before = 2, + slide_value = max(.x$time_value) + ) expect_identical(result1, expected_output) result2 <- small_x %>% - epi_slide(before = 2, - slide_value = max(.data$time_value)) + epi_slide( + before = 2, + slide_value = max(.data$time_value) + ) expect_identical(result2, expected_output) }) test_that("`epi_slide` can access objects inside of helper functions", { - helper = function(archive_haystack, time_value_needle) { + helper <- function(archive_haystack, time_value_needle) { archive_haystack %>% epi_slide(has_needle = time_value_needle %in% time_value, before = 365000L) } expect_error( diff --git a/tests/testthat/test-epix_fill_through_version.R b/tests/testthat/test-epix_fill_through_version.R index 1d78bf49..6b113545 100644 --- a/tests/testthat/test-epix_fill_through_version.R +++ 
b/tests/testthat/test-epix_fill_through_version.R @@ -1,11 +1,12 @@ - test_that("epix_fill_through_version mirrors input when it is sufficiently up to date", { - ea_orig = as_epi_archive(data.table::data.table(geo_value = "g1", time_value = as.Date("2020-01-01"), - version = 1:5, value = 1:5)) - some_earlier_observed_version = 2L - ea_trivial_fill_na1 = epix_fill_through_version(ea_orig, some_earlier_observed_version, "na") - ea_trivial_fill_na2 = epix_fill_through_version(ea_orig, ea_orig$versions_end, "na") - ea_trivial_fill_locf = epix_fill_through_version(ea_orig, some_earlier_observed_version, "locf") + ea_orig <- as_epi_archive(data.table::data.table( + geo_value = "g1", time_value = as.Date("2020-01-01"), + version = 1:5, value = 1:5 + )) + some_earlier_observed_version <- 2L + ea_trivial_fill_na1 <- epix_fill_through_version(ea_orig, some_earlier_observed_version, "na") + ea_trivial_fill_na2 <- epix_fill_through_version(ea_orig, ea_orig$versions_end, "na") + ea_trivial_fill_locf <- epix_fill_through_version(ea_orig, some_earlier_observed_version, "locf") # Below, we want R6 objects to be compared based on contents rather than # addresses. We appear to get this with `expect_identical` in `testthat` # edition 3, which is based on `waldo::compare` rather than `base::identical`; @@ -21,86 +22,102 @@ test_that("epix_fill_through_version mirrors input when it is sufficiently up to }) test_that("epix_fill_through_version can extend observed versions, gives expected `as_of`s", { - ea_orig = as_epi_archive(data.table::data.table( + ea_orig <- as_epi_archive(data.table::data.table( geo_value = "g1", - time_value = as.Date("2020-01-01") + c(rep(0L,5L), 1L), + time_value = as.Date("2020-01-01") + c(rep(0L, 5L), 1L), version = c(1:5, 2L), - value = 1:6)) - first_unobserved_version = 6L - later_unobserved_version = 10L - ea_fill_na = epix_fill_through_version(ea_orig, later_unobserved_version, "na") - ea_fill_locf = epix_fill_through_version(ea_orig, later_unobserved_version, "locf") + value = 1:6 + )) + first_unobserved_version <- 6L + later_unobserved_version <- 10L + ea_fill_na <- epix_fill_through_version(ea_orig, later_unobserved_version, "na") + ea_fill_locf <- epix_fill_through_version(ea_orig, later_unobserved_version, "locf") # We use testthat edition 3 features here, passing `ignore_attr` to # `waldo::compare`. 
Ensure we are using edition 3: testthat::local_edition(3) - withCallingHandlers({ - expect_identical(ea_fill_na$versions_end, later_unobserved_version) - expect_identical(tibble::as_tibble(ea_fill_na$as_of(first_unobserved_version)), - tibble::tibble(geo_value="g1", time_value=as.Date("2020-01-01")+0:1, value=rep(NA_integer_, 2L)), - ignore_attr = TRUE) - expect_identical(ea_fill_locf$versions_end, later_unobserved_version) - expect_identical(ea_fill_locf$as_of(first_unobserved_version), - ea_fill_locf$as_of(ea_orig$versions_end) %>% - {attr(., "metadata")$as_of <- first_unobserved_version; .}) - }, epiprocess__snapshot_as_of_clobberable_version = function(wrn) invokeRestart("muffleWarning")) + withCallingHandlers( + { + expect_identical(ea_fill_na$versions_end, later_unobserved_version) + expect_identical(tibble::as_tibble(ea_fill_na$as_of(first_unobserved_version)), + tibble::tibble(geo_value = "g1", time_value = as.Date("2020-01-01") + 0:1, value = rep(NA_integer_, 2L)), + ignore_attr = TRUE + ) + expect_identical(ea_fill_locf$versions_end, later_unobserved_version) + expect_identical( + ea_fill_locf$as_of(first_unobserved_version), + ea_fill_locf$as_of(ea_orig$versions_end) %>% + { + attr(., "metadata")$as_of <- first_unobserved_version + . + } + ) + }, + epiprocess__snapshot_as_of_clobberable_version = function(wrn) invokeRestart("muffleWarning") + ) }) test_that("epix_fill_through_version does not mutate x", { for (ea_orig in list( # vanilla case - as_epi_archive(data.table::data.table(geo_value = "g1", time_value = as.Date("2020-01-01"), - version = 1:5, value = 1:5)), + as_epi_archive(data.table::data.table( + geo_value = "g1", time_value = as.Date("2020-01-01"), + version = 1:5, value = 1:5 + )), # data.table unique yielding original DT by reference special case (maybe # having only 1 row is the trigger? having no revisions of initial values # doesn't seem sufficient to trigger) - as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=1L, value=10L)) + as_epi_archive(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, value = 10L)) )) { # We want to perform a strict comparison of the contents of `ea_orig` before # and `ea_orig` after. `clone` + `expect_identical` based on waldo would # sort of work, but we might want something stricter. `as.list` + # `identical` plus a check of the DT seems to do the trick. 
- ea_orig_before_as_list = as.list(ea_orig) - ea_orig_DT_before_copy = data.table::copy(ea_orig$DT) - some_unobserved_version = 8L + ea_orig_before_as_list <- as.list(ea_orig) + ea_orig_DT_before_copy <- data.table::copy(ea_orig$DT) + some_unobserved_version <- 8L # - ea_fill_na = epix_fill_through_version(ea_orig, some_unobserved_version, "na") - ea_orig_after_as_list = as.list(ea_orig) + ea_fill_na <- epix_fill_through_version(ea_orig, some_unobserved_version, "na") + ea_orig_after_as_list <- as.list(ea_orig) # use identical, not expect_identical, for the R6-as-list test; latter isn't as strict expect_true(identical(ea_orig_before_as_list, ea_orig_after_as_list)) expect_identical(ea_orig_DT_before_copy, ea_orig$DT) # - ea_fill_locf = epix_fill_through_version(ea_orig, some_unobserved_version, "locf") - ea_orig_after_as_list = as.list(ea_orig) + ea_fill_locf <- epix_fill_through_version(ea_orig, some_unobserved_version, "locf") + ea_orig_after_as_list <- as.list(ea_orig) expect_true(identical(ea_orig_before_as_list, ea_orig_after_as_list)) expect_identical(ea_orig_DT_before_copy, ea_orig$DT) } }) test_that("x$fill_through_version mutates x (if needed)", { - ea = as_epi_archive(data.table::data.table(geo_value = "g1", time_value = as.Date("2020-01-01"), - version = 1:5, value = 1:5)) + ea <- as_epi_archive(data.table::data.table( + geo_value = "g1", time_value = as.Date("2020-01-01"), + version = 1:5, value = 1:5 + )) # We want the contents to change in a substantial way that makes waldo compare # different (if the contents need to change). - ea_before_copies_as_list = lapply(ea, data.table::copy) - some_unobserved_version = 8L + ea_before_copies_as_list <- lapply(ea, data.table::copy) + some_unobserved_version <- 8L ea$fill_through_version(some_unobserved_version, "na") - ea_after_copies_as_list = lapply(ea, data.table::copy) + ea_after_copies_as_list <- lapply(ea, data.table::copy) expect_failure(expect_identical(ea_before_copies_as_list, ea_after_copies_as_list)) }) test_that("{epix_,$}fill_through_version return with expected visibility", { - ea = as_epi_archive(data.table::data.table(geo_value = "g1", time_value = as.Date("2020-01-01"), - version = 1:5, value = 1:5)) + ea <- as_epi_archive(data.table::data.table( + geo_value = "g1", time_value = as.Date("2020-01-01"), + version = 1:5, value = 1:5 + )) expect_true(withVisible(epix_fill_through_version(ea, 10L, "na"))[["visible"]]) expect_false(withVisible(ea$fill_through_version(15L, "na"))[["visible"]]) }) test_that("epix_fill_through_version returns same key & doesn't mutate old DT or its key", { - ea = as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=1L, value=10L)) - old_DT = ea$DT - old_DT_copy = data.table::copy(old_DT) - old_key = data.table::key(ea$DT) + ea <- as_epi_archive(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, value = 10L)) + old_DT <- ea$DT + old_DT_copy <- data.table::copy(old_DT) + old_key <- data.table::key(ea$DT) expect_identical(data.table::key(epix_fill_through_version(ea, 5L, "na")$DT), old_key) expect_identical(data.table::key(epix_fill_through_version(ea, 5L, "locf")$DT), old_key) expect_identical(data.table::key(ea$DT), old_key) diff --git a/tests/testthat/test-epix_merge.R b/tests/testthat/test-epix_merge.R index 265263f0..0ae428e4 100644 --- a/tests/testthat/test-epix_merge.R +++ b/tests/testthat/test-epix_merge.R @@ -1,57 +1,59 @@ - -test_that("epix_merge requires forbids on invalid `y`",{ - ea = archive_cases_dv_subset$clone() - expect_error(epix_merge(ea, 
data.frame(x=1)))
+test_that("epix_merge errors on invalid `y`", {
+  ea <- archive_cases_dv_subset$clone()
+  expect_error(epix_merge(ea, data.frame(x = 1)))
 })
 
 test_that("epix_merge merges and carries forward updates properly", {
-  x = as_epi_archive(
+  x <- as_epi_archive(
     data.table::as.data.table(
-      tibble::tribble(~geo_value, ~time_value, ~version, ~x_value,
-                      # same version set for x and y
-                      "g1", 1L, 1:3, paste0("XA", 1:3),
-                      # versions of x surround those of y + this measurement has
-                      # max update version beyond some others
-                      "g1", 2L, 1:5, paste0("XB", 1:5),
-                      # mirror case
-                      "g1", 3L, 2L, paste0("XC", 2L),
-                      # x has 1 version, y has 0
-                      "g1", 4L, 1L, paste0("XD", 1L),
-                      # non-NA values that should be carried forward
-                      # (version-wise LOCF) in other versions, plus NAs that
-                      # should (similarly) be carried forward as NA (latter
-                      # wouldn't work with an ordinary merge + post-processing
-                      # with `data.table::nafill`)
-                      "g1", 6L, c(1L,3L,5L), paste0("XE", c(1L, NA, 5L))
-      ) %>%
+      tibble::tribble(
+        ~geo_value, ~time_value, ~version, ~x_value,
+        # same version set for x and y
+        "g1", 1L, 1:3, paste0("XA", 1:3),
+        # versions of x surround those of y + this measurement has
+        # max update version beyond some others
+        "g1", 2L, 1:5, paste0("XB", 1:5),
+        # mirror case
+        "g1", 3L, 2L, paste0("XC", 2L),
+        # x has 1 version, y has 0
+        "g1", 4L, 1L, paste0("XD", 1L),
+        # non-NA values that should be carried forward
+        # (version-wise LOCF) in other versions, plus NAs that
+        # should (similarly) be carried forward as NA (latter
+        # wouldn't work with an ordinary merge + post-processing
+        # with `data.table::nafill`)
+        "g1", 6L, c(1L, 3L, 5L), paste0("XE", c(1L, NA, 5L))
+      ) %>%
         tidyr::unchop(c(version, x_value)) %>%
         dplyr::mutate(dplyr::across(c(x_value), ~ dplyr::if_else(grepl("NA", .x), NA_character_, .x)))
     )
   )
-  y = as_epi_archive(
+  y <- as_epi_archive(
     data.table::as.data.table(
-      tibble::tribble(~geo_value, ~time_value, ~version, ~y_value,
-                      "g1", 1L, 1:3, paste0("YA", 1:3),
-                      "g1", 2L, 2L, paste0("YB", 2L),
-                      "g1", 3L, 1:5, paste0("YC", 1:5),
-                      "g1", 5L, 1L, paste0("YD", 1L),
-                      "g1", 6L, 1:5, paste0("YE", 1:5),
-      ) %>%
+      tibble::tribble(
+        ~geo_value, ~time_value, ~version, ~y_value,
+        "g1", 1L, 1:3, paste0("YA", 1:3),
+        "g1", 2L, 2L, paste0("YB", 2L),
+        "g1", 3L, 1:5, paste0("YC", 1:5),
+        "g1", 5L, 1L, paste0("YD", 1L),
+        "g1", 6L, 1:5, paste0("YE", 1:5),
+      ) %>%
         tidyr::unchop(c(version, y_value)) %>%
         dplyr::mutate(dplyr::across(c(y_value), ~ dplyr::if_else(grepl("NA", .x), NA_character_, .x)))
     )
   )
-  xy = epix_merge(x, y)
-  xy_expected = as_epi_archive(
+  xy <- epix_merge(x, y)
+  xy_expected <- as_epi_archive(
    data.table::as.data.table(
-      tibble::tribble(~geo_value, ~time_value, ~version, ~x_value, ~y_value,
-                      "g1", 1L, 1:3, paste0("XA", 1:3), paste0("YA", 1:3),
-                      "g1", 2L, 1:5, paste0("XB", 1:5), paste0("YB", c(NA,2L,2L,2L,2L)),
-                      "g1", 3L, 1:5, paste0("XC", c(NA,2L,2L,2L,2L)), paste0("YC", 1:5),
-                      "g1", 4L, 1L, paste0("XD", 1L), paste0("YD", NA),
-                      "g1", 5L, 1L, paste0("XD", NA), paste0("YD", 1L),
-                      "g1", 6L, 1:5, paste0("XE", c(1L,1L,NA,NA,5L)), paste0("YE", 1:5),
-      ) %>%
+      tibble::tribble(
+        ~geo_value, ~time_value, ~version, ~x_value, ~y_value,
+        "g1", 1L, 1:3, paste0("XA", 1:3), paste0("YA", 1:3),
+        "g1", 2L, 1:5, paste0("XB", 1:5), paste0("YB", c(NA, 2L, 2L, 2L, 2L)),
+        "g1", 3L, 1:5, paste0("XC", c(NA, 2L, 2L, 2L, 2L)), paste0("YC", 1:5),
+        "g1", 4L, 1L, paste0("XD", 1L), paste0("YD", NA),
+        "g1", 5L, 1L, paste0("XD", NA), paste0("YD", 1L),
+        "g1", 6L, 1:5, paste0("XE",
c(1L, 1L, NA, NA, 5L)), paste0("YE", 1:5), + ) %>% tidyr::unchop(c(version, x_value, y_value)) %>% dplyr::mutate(dplyr::across(c(x_value, y_value), ~ dplyr::if_else(grepl("NA", .x), NA_character_, .x))) ) @@ -62,42 +64,44 @@ test_that("epix_merge merges and carries forward updates properly", { expect_identical(xy, xy_expected) }) -test_that('epix_merge forbids and warns on metadata and naming issues', { +test_that("epix_merge forbids and warns on metadata and naming issues", { expect_error( epix_merge( - as_epi_archive(tibble::tibble(geo_value="tx", time_value=1L, version=1L, x_value=1L)), - as_epi_archive(tibble::tibble(geo_value="us", time_value=1L, version=5L, y_value=2L)) + as_epi_archive(tibble::tibble(geo_value = "tx", time_value = 1L, version = 1L, x_value = 1L)), + as_epi_archive(tibble::tibble(geo_value = "us", time_value = 1L, version = 5L, y_value = 2L)) ), regexp = "must have the same.*geo_type" ) expect_error( epix_merge( - as_epi_archive(tibble::tibble(geo_value="pa", time_value=1L, version=1L, x_value=1L)), - as_epi_archive(tibble::tibble(geo_value="pa", time_value=as.Date("2020-01-01"), version=5L, y_value=2L)) + as_epi_archive(tibble::tibble(geo_value = "pa", time_value = 1L, version = 1L, x_value = 1L)), + as_epi_archive(tibble::tibble(geo_value = "pa", time_value = as.Date("2020-01-01"), version = 5L, y_value = 2L)) ), regexp = "must have the same.*time_type" ) expect_error( epix_merge( - as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=1L, value=1L)), - as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=1L, value=2L)) + as_epi_archive(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, value = 1L)), + as_epi_archive(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, value = 2L)) ), regexp = "overlapping.*names" ) expect_warning( epix_merge( - as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=1L, x_value=1L), - additional_metadata=list("updates_fetched"=lubridate::ymd_hms("2022-05-01 16:00:00", tz="UTC"))), - as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=1L, y_value=2L)) + as_epi_archive(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, x_value = 1L), + additional_metadata = list("updates_fetched" = lubridate::ymd_hms("2022-05-01 16:00:00", tz = "UTC")) + ), + as_epi_archive(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, y_value = 2L)) ), regexp = "x\\$additional_metadata", class = "epiprocess__epix_merge_ignores_additional_metadata" ) expect_warning( epix_merge( - as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=1L, x_value=1L)), - as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=1L, y_value=2L), - additional_metadata=list("updates_fetched"=lubridate::ymd_hms("2022-05-01 16:00:00", tz="UTC"))) + as_epi_archive(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, x_value = 1L)), + as_epi_archive(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, y_value = 2L), + additional_metadata = list("updates_fetched" = lubridate::ymd_hms("2022-05-01 16:00:00", tz = "UTC")) + ) ), regexp = "y\\$additional_metadata", class = "epiprocess__epix_merge_ignores_additional_metadata" @@ -107,74 +111,78 @@ test_that('epix_merge forbids and warns on metadata and naming issues', { # use `local` to prevent accidentally using the x, y, xy bindings here # elsewhere, while allowing reuse across a couple tests local({ - x = as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=1L, x_value=1L), - 
clobberable_versions_start=1L, versions_end = 10L) - y = as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=1L, y_value=2L), - clobberable_versions_start=3L, versions_end = 10L) - xy = epix_merge(x,y) - test_that('epix_merge considers partially-clobberable row to be clobberable', { + x <- as_epi_archive(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, x_value = 1L), + clobberable_versions_start = 1L, versions_end = 10L + ) + y <- as_epi_archive(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, y_value = 2L), + clobberable_versions_start = 3L, versions_end = 10L + ) + xy <- epix_merge(x, y) + test_that("epix_merge considers partially-clobberable row to be clobberable", { expect_identical(xy$clobberable_versions_start, 1L) }) - test_that('epix_merge result uses versions_end metadata not max version val', { + test_that("epix_merge result uses versions_end metadata not max version val", { expect_identical(xy$versions_end, 10L) }) }) local({ - x = as_epi_archive( - tibble::tibble(geo_value=1L, time_value=1L, version=1L, x_value=10L), + x <- as_epi_archive( + tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, x_value = 10L), clobberable_versions_start = 1L, versions_end = 3L ) - y = as_epi_archive( - tibble::tibble(geo_value=1L, time_value=1L, version=5L, y_value=20L), + y <- as_epi_archive( + tibble::tibble(geo_value = 1L, time_value = 1L, version = 5L, y_value = 20L), clobberable_versions_start = 1L ) test_that('epix_merge forbids on sync default or "forbid"', { - expect_error(epix_merge(x,y), - class="epiprocess__epix_merge_unresolved_sync") - expect_error(epix_merge(x,y, sync = "forbid"), - class="epiprocess__epix_merge_unresolved_sync") + expect_error(epix_merge(x, y), + class = "epiprocess__epix_merge_unresolved_sync" + ) + expect_error(epix_merge(x, y, sync = "forbid"), + class = "epiprocess__epix_merge_unresolved_sync" + ) }) test_that('epix_merge sync="na" works', { expect_equal( - epix_merge(x,y, sync = "na"), + epix_merge(x, y, sync = "na"), as_epi_archive(tibble::tribble( ~geo_value, ~time_value, ~version, ~x_value, ~y_value, - 1L, 1L, 1L, 10L, NA_integer_, # x updated, y not observed yet + 1L, 1L, 1L, 10L, NA_integer_, # x updated, y not observed yet 1L, 1L, 4L, NA_integer_, NA_integer_, # NA-ing out x, y not observed yet - 1L, 1L, 5L, NA_integer_, 20L, # x still NA, y updated + 1L, 1L, 5L, NA_integer_, 20L, # x still NA, y updated # (we should not have a y vals -> NA update here; version 5 should be # the `versions_end` of the result) - ), clobberable_versions_start=1L) + ), clobberable_versions_start = 1L) ) }) test_that('epix_merge sync="locf" works', { expect_equal( - epix_merge(x,y, sync = "locf"), + epix_merge(x, y, sync = "locf"), as_epi_archive(tibble::tribble( ~geo_value, ~time_value, ~version, ~x_value, ~y_value, - 1L, 1L, 1L, 10L, NA_integer_, # x updated, y not observed yet + 1L, 1L, 1L, 10L, NA_integer_, # x updated, y not observed yet 1L, 1L, 5L, 10L, 20L, # x LOCF'd, y updated - ), clobberable_versions_start=1L) + ), clobberable_versions_start = 1L) ) }) test_that('epix_merge sync="truncate" works', { expect_equal( - epix_merge(x,y, sync = "truncate"), + epix_merge(x, y, sync = "truncate"), as_epi_archive(tibble::tribble( ~geo_value, ~time_value, ~version, ~x_value, ~y_value, 1L, 1L, 1L, 10L, NA_integer_, # x updated, y not observed yet # y's update beyond x's last update has been truncated - ), clobberable_versions_start=1L, versions_end=3L) + ), clobberable_versions_start = 1L, versions_end = 3L) ) }) - 
x_no_conflict = as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=1L, x_value=10L))
-  y_no_conflict = as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=1L, y_value=20L))
-  xy_no_conflict_expected = as_epi_archive(tibble::tribble(
+  x_no_conflict <- as_epi_archive(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, x_value = 10L))
+  y_no_conflict <- as_epi_archive(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, y_value = 20L))
+  xy_no_conflict_expected <- as_epi_archive(tibble::tribble(
     ~geo_value, ~time_value, ~version, ~x_value, ~y_value,
-    1L, 1L, 1L, 10L, 20L, # x updated, y not observed yet
-  ))
+    1L, 1L, 1L, 10L, 20L, # x and y both updated at version 1
+  ))
 
 test_that('epix_merge sync="forbid" on no-conflict works', {
     expect_equal(
       epix_merge(x_no_conflict, y_no_conflict, sync = "forbid"),
@@ -208,8 +216,8 @@ local({
 test_that('epix_merge sync="na" balks if do not know next_after', {
   expect_error(
     epix_merge(
-      as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=as.POSIXct(as.Date("2020-01-01")), x_value=10L)),
-      as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=as.POSIXct(as.Date("2020-01-02")), y_value=20L)),
+      as_epi_archive(tibble::tibble(geo_value = 1L, time_value = 1L, version = as.POSIXct(as.Date("2020-01-01")), x_value = 10L)),
+      as_epi_archive(tibble::tibble(geo_value = 1L, time_value = 1L, version = as.POSIXct(as.Date("2020-01-02")), y_value = 20L)),
       sync = "na"
     ),
     regexp = "no applicable method.*next_after"
diff --git a/tests/testthat/test-epix_slide.R b/tests/testthat/test-epix_slide.R
index 9e091642..b3fff13d 100644
--- a/tests/testthat/test-epix_slide.R
+++ b/tests/testthat/test-epix_slide.R
@@ -1,45 +1,55 @@
 library(dplyr)
 
 test_that("epix_slide only works on an epi_archive", {
-  expect_error(epix_slide(data.frame(x=1)))
+  expect_error(epix_slide(data.frame(x = 1)))
 })
 
-x <- tibble::tribble(~version, ~time_value, ~binary,
-                     4, c(1:3), 2^(1:3),
-                     5, c(1:2,4), 2^(4:6),
-                     6, c(1:2,4:5), 2^(7:10),
-                     7, 2:6, 2^(11:15)) %>%
-  tidyr::unnest(c(time_value,binary))
+x <- tibble::tribble(
+  ~version, ~time_value, ~binary,
+  4, c(1:3), 2^(1:3),
+  5, c(1:2, 4), 2^(4:6),
+  6, c(1:2, 4:5), 2^(7:10),
+  7, 2:6, 2^(11:15)
+) %>%
+  tidyr::unnest(c(time_value, binary))
 
-xx <- bind_cols(geo_value = rep("x",15), x) %>%
+xx <- bind_cols(geo_value = rep("x", 15), x) %>%
   as_epi_archive()
 
 test_that("epix_slide works as intended", {
   xx1 <- xx %>%
     group_by(.data$geo_value) %>%
-    epix_slide(f = ~ sum(.x$binary),
-               before = 2,
-               new_col_name = "sum_binary")
-
-  xx2 <- tibble(geo_value = rep("x",4),
-                time_value = c(4,5,6,7),
-                sum_binary = c(2^3+2^2,
-                               2^6+2^3,
-                               2^10+2^9,
-                               2^15+2^14)) %>%
+    epix_slide(
+      f = ~ sum(.x$binary),
+      before = 2,
+      new_col_name = "sum_binary"
+    )
+
+  xx2 <- tibble(
+    geo_value = rep("x", 4),
+    time_value = c(4, 5, 6, 7),
+    sum_binary = c(
+      2^3 + 2^2,
+      2^6 + 2^3,
+      2^10 + 2^9,
+      2^15 + 2^14
+    )
+  ) %>%
     group_by(geo_value)
-
-  expect_identical(xx1,xx2) # *
-
+
+  expect_identical(xx1, xx2) # *
+
   xx3 <- (
     xx
     $group_by(dplyr::across(dplyr::all_of("geo_value")))
-    $slide(f = ~ sum(.x$binary),
-           before = 2,
-           new_col_name = 'sum_binary')
+    $slide(
+      f = ~ sum(.x$binary),
+      before = 2,
+      new_col_name = "sum_binary"
+    )
   )
-
-  expect_identical(xx1,xx3) # This and * imply xx2 and xx3 are identical
+
+  expect_identical(xx1, xx3) # This and * imply xx2 and xx3 are identical
 
   # function interface
   xx4 <- xx %>%
@@ -47,158 +57,199 @@ test_that("epix_slide works as intended", {
     epix_slide(f =
function(x, gk, rtv) { tibble::tibble(sum_binary = sum(x$binary)) }, before = 2, names_sep = NULL) - - expect_identical(xx1,xx4) + + expect_identical(xx1, xx4) # tidyeval interface xx5 <- xx %>% group_by(.data$geo_value) %>% - epix_slide(sum_binary = sum(binary), - before = 2) - - expect_identical(xx1,xx5) + epix_slide( + sum_binary = sum(binary), + before = 2 + ) + + expect_identical(xx1, xx5) }) -test_that("epix_slide works as intended with `as_list_col=TRUE`",{ +test_that("epix_slide works as intended with `as_list_col=TRUE`", { xx_dfrow1 <- xx %>% group_by(.data$geo_value) %>% - epix_slide(f = ~ data.frame(bin_sum = sum(.x$binary)), - before = 2, - as_list_col = TRUE) - + epix_slide( + f = ~ data.frame(bin_sum = sum(.x$binary)), + before = 2, + as_list_col = TRUE + ) + xx_dfrow2 <- tibble( - geo_value = rep("x",4), - time_value = c(4,5,6,7), + geo_value = rep("x", 4), + time_value = c(4, 5, 6, 7), slide_value = - c(2^3+2^2, - 2^6+2^3, - 2^10+2^9, - 2^15+2^14) %>% - purrr::map(~ data.frame(bin_sum = .x)) + c( + 2^3 + 2^2, + 2^6 + 2^3, + 2^10 + 2^9, + 2^15 + 2^14 + ) %>% + purrr::map(~ data.frame(bin_sum = .x)) ) %>% group_by(geo_value) - - expect_identical(xx_dfrow1,xx_dfrow2) # * - + + expect_identical(xx_dfrow1, xx_dfrow2) # * + xx_dfrow3 <- ( xx $group_by(dplyr::across(dplyr::all_of("geo_value"))) - $slide(f = ~ data.frame(bin_sum = sum(.x$binary)), - before = 2, - as_list_col = TRUE) + $slide( + f = ~ data.frame(bin_sum = sum(.x$binary)), + before = 2, + as_list_col = TRUE + ) ) - - expect_identical(xx_dfrow1,xx_dfrow3) # This and * Imply xx_dfrow2 and xx_dfrow3 are identical - + + expect_identical(xx_dfrow1, xx_dfrow3) # This and * Imply xx_dfrow2 and xx_dfrow3 are identical + xx_df1 <- xx %>% group_by(.data$geo_value) %>% - epix_slide(f = ~ data.frame(bin = .x$binary), - before = 2, - as_list_col = TRUE) - + epix_slide( + f = ~ data.frame(bin = .x$binary), + before = 2, + as_list_col = TRUE + ) + xx_df2 <- tibble( - geo_value = rep("x",4), - time_value = c(4,5,6,7), + geo_value = rep("x", 4), + time_value = c(4, 5, 6, 7), slide_value = - list(c(2^3,2^2), - c(2^6,2^3), - c(2^10,2^9), - c(2^15,2^14)) %>% - purrr::map(~ data.frame(bin = rev(.x))) + list( + c(2^3, 2^2), + c(2^6, 2^3), + c(2^10, 2^9), + c(2^15, 2^14) + ) %>% + purrr::map(~ data.frame(bin = rev(.x))) ) %>% group_by(geo_value) - - expect_identical(xx_df1,xx_df2) + + expect_identical(xx_df1, xx_df2) xx_scalar1 <- xx %>% group_by(.data$geo_value) %>% - epix_slide(f = ~ sum(.x$binary), - before = 2, - as_list_col = TRUE) - + epix_slide( + f = ~ sum(.x$binary), + before = 2, + as_list_col = TRUE + ) + xx_scalar2 <- tibble( - geo_value = rep("x",4), - time_value = c(4,5,6,7), + geo_value = rep("x", 4), + time_value = c(4, 5, 6, 7), slide_value = - list(2^3+2^2, - 2^6+2^3, - 2^10+2^9, - 2^15+2^14) + list( + 2^3 + 2^2, + 2^6 + 2^3, + 2^10 + 2^9, + 2^15 + 2^14 + ) ) %>% group_by(geo_value) - - expect_identical(xx_scalar1,xx_scalar2) - + + expect_identical(xx_scalar1, xx_scalar2) + xx_vec1 <- xx %>% group_by(.data$geo_value) %>% - epix_slide(f = ~ .x$binary, - before = 2, - as_list_col = TRUE) - + epix_slide( + f = ~ .x$binary, + before = 2, + as_list_col = TRUE + ) + xx_vec2 <- tibble( - geo_value = rep("x",4), - time_value = c(4,5,6,7), - slide_value = - list(c(2^3,2^2), - c(2^6,2^3), - c(2^10,2^9), - c(2^15,2^14)) %>% - purrr::map(rev) + geo_value = rep("x", 4), + time_value = c(4, 5, 6, 7), + slide_value = + list( + c(2^3, 2^2), + c(2^6, 2^3), + c(2^10, 2^9), + c(2^15, 2^14) + ) %>% + purrr::map(rev) ) %>% group_by(geo_value) 
- - expect_identical(xx_vec1,xx_vec2) + + expect_identical(xx_vec1, xx_vec2) }) test_that("epix_slide `before` validation works", { - expect_error(xx$slide(f = ~ sum(.x$binary)), - "`before` is required") - expect_error(xx$slide(f = ~ sum(.x$binary), before=NA), - "`before`.*NA") - expect_error(xx$slide(f = ~ sum(.x$binary), before=-1), - "`before`.*negative") - expect_error(xx$slide(f = ~ sum(.x$binary), before=1.5), - regexp="before", - class="vctrs_error_incompatible_type") + expect_error( + xx$slide(f = ~ sum(.x$binary)), + "`before` is required" + ) + expect_error( + xx$slide(f = ~ sum(.x$binary), before = NA), + "`before`.*NA" + ) + expect_error( + xx$slide(f = ~ sum(.x$binary), before = -1), + "`before`.*negative" + ) + expect_error(xx$slide(f = ~ sum(.x$binary), before = 1.5), + regexp = "before", + class = "vctrs_error_incompatible_type" + ) # We might want to allow this at some point (issue #219): - expect_error(xx$slide(f = ~ sum(.x$binary), before=Inf), - regexp="before", - class="vctrs_error_incompatible_type") + expect_error(xx$slide(f = ~ sum(.x$binary), before = Inf), + regexp = "before", + class = "vctrs_error_incompatible_type" + ) # (wrapper shouldn't introduce a value:) expect_error(epix_slide(xx, f = ~ sum(.x$binary)), "`before` is required") # These `before` values should be accepted: - expect_error(xx$slide(f = ~ sum(.x$binary), before=0), - NA) - expect_error(xx$slide(f = ~ sum(.x$binary), before=2L), - NA) - expect_error(xx$slide(f = ~ sum(.x$binary), before=365000), - NA) + expect_error( + xx$slide(f = ~ sum(.x$binary), before = 0), + NA + ) + expect_error( + xx$slide(f = ~ sum(.x$binary), before = 2L), + NA + ) + expect_error( + xx$slide(f = ~ sum(.x$binary), before = 365000), + NA + ) }) test_that("quosure passing issue in epix_slide is resolved + other potential issues", { # (First part adapted from @examples) time_values <- seq(as.Date("2020-06-01"), - as.Date("2020-06-02"), - by = "1 day") + as.Date("2020-06-02"), + by = "1 day" + ) # We only have one non-version, non-time key in the example archive. Add # another so that we don't accidentally pass tests due to accidentally # matching the default grouping. 
- ea = as_epi_archive(archive_cases_dv_subset$DT %>% - dplyr::mutate(modulus = seq_len(nrow(.)) %% 5L), - other_keys = "modulus", - compactify = TRUE) - reference_by_modulus = ea %>% + ea <- as_epi_archive( + archive_cases_dv_subset$DT %>% + dplyr::mutate(modulus = seq_len(nrow(.)) %% 5L), + other_keys = "modulus", + compactify = TRUE + ) + reference_by_modulus <- ea %>% group_by(modulus) %>% - epix_slide(f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = time_values, - new_col_name = 'case_rate_3d_av') - reference_by_neither = ea %>% + epix_slide( + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ) + reference_by_neither <- ea %>% group_by() %>% - epix_slide(f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = time_values, - new_col_name = 'case_rate_3d_av') + epix_slide( + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ) # test the passing-something-that-must-be-enquosed behavior: # # (S3 group_by behavior for this case is the `reference_by_modulus`) @@ -207,17 +258,19 @@ test_that("quosure passing issue in epix_slide is resolved + other potential iss f = ~ mean(.x$case_rate_7d_av), before = 2, ref_time_values = time_values, - new_col_name = 'case_rate_3d_av' + new_col_name = "case_rate_3d_av" ), reference_by_modulus ) # test the .data pronoun behavior: expect_identical( - epix_slide(x = ea %>% group_by(.data$modulus), - f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = time_values, - new_col_name = 'case_rate_3d_av'), + epix_slide( + x = ea %>% group_by(.data$modulus), + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ), reference_by_modulus ) expect_identical( @@ -225,17 +278,19 @@ test_that("quosure passing issue in epix_slide is resolved + other potential iss f = ~ mean(.x$case_rate_7d_av), before = 2, ref_time_values = time_values, - new_col_name = 'case_rate_3d_av' + new_col_name = "case_rate_3d_av" ), reference_by_modulus ) # test the passing across-all-of-string-literal behavior: expect_identical( - epix_slide(x = ea %>% group_by(dplyr::across(all_of("modulus"))), - f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = time_values, - new_col_name = 'case_rate_3d_av'), + epix_slide( + x = ea %>% group_by(dplyr::across(all_of("modulus"))), + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ), reference_by_modulus ) expect_identical( @@ -243,18 +298,20 @@ test_that("quosure passing issue in epix_slide is resolved + other potential iss f = ~ mean(.x$case_rate_7d_av), before = 2, ref_time_values = time_values, - new_col_name = 'case_rate_3d_av' + new_col_name = "case_rate_3d_av" ), reference_by_modulus ) # test the passing-across-all-of-string-var behavior: - my_group_by = "modulus" + my_group_by <- "modulus" expect_identical( - epix_slide(x = ea %>% group_by(dplyr::across(tidyselect::all_of(my_group_by))), - f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = time_values, - new_col_name = 'case_rate_3d_av'), + epix_slide( + x = ea %>% group_by(dplyr::across(tidyselect::all_of(my_group_by))), + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ), reference_by_modulus ) expect_identical( @@ -262,17 +319,19 @@ test_that("quosure passing issue in epix_slide is resolved + other potential 
iss f = ~ mean(.x$case_rate_7d_av), before = 2, ref_time_values = time_values, - new_col_name = 'case_rate_3d_av' + new_col_name = "case_rate_3d_av" ), reference_by_modulus ) # test the default behavior (default in this case should just be grouping by neither): expect_identical( - epix_slide(x = ea, - f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = time_values, - new_col_name = 'case_rate_3d_av'), + epix_slide( + x = ea, + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ), reference_by_neither ) expect_identical( @@ -280,20 +339,22 @@ test_that("quosure passing issue in epix_slide is resolved + other potential iss f = ~ mean(.x$case_rate_7d_av), before = 2, ref_time_values = time_values, - new_col_name = 'case_rate_3d_av' + new_col_name = "case_rate_3d_av" ), reference_by_neither ) }) -ea <- tibble::tribble(~version, ~time_value, ~binary, - 2, 1:1, 2^(1:1), - 3, 1:2, 2^(2:1), - 4, 1:3, 2^(3:1), - 5, 1:4, 2^(4:1), - 6, 1:5, 2^(5:1), - 7, 1:6, 2^(6:1)) %>% - tidyr::unnest(c(time_value,binary)) %>% +ea <- tibble::tribble( + ~version, ~time_value, ~binary, + 2, 1:1, 2^(1:1), + 3, 1:2, 2^(2:1), + 4, 1:3, 2^(3:1), + 5, 1:4, 2^(4:1), + 6, 1:5, 2^(5:1), + 7, 1:6, 2^(6:1) +) %>% + tidyr::unnest(c(time_value, binary)) %>% mutate(geo_value = "x") %>% as_epi_archive() @@ -305,67 +366,79 @@ test_that("epix_slide with all_versions option has access to all older versions" testthat::local_edition(3) slide_fn <- function(x, gk, rtv) { - return(tibble(n_versions = length(unique(x$DT$version)), - n_row = nrow(x$DT), - dt_class1 = class(x$DT)[[1L]], - dt_key = list(key(x$DT)))) + return(tibble( + n_versions = length(unique(x$DT$version)), + n_row = nrow(x$DT), + dt_class1 = class(x$DT)[[1L]], + dt_key = list(key(x$DT)) + )) } - ea_orig_mirror = ea$clone(deep=TRUE) + ea_orig_mirror <- ea$clone(deep = TRUE) ea_orig_mirror$DT <- copy(ea_orig_mirror$DT) - result1 <- ea %>% group_by() %>% - epix_slide(f = slide_fn, - before = 10^3, - names_sep = NULL, - all_versions = TRUE) + result1 <- ea %>% + group_by() %>% + epix_slide( + f = slide_fn, + before = 10^3, + names_sep = NULL, + all_versions = TRUE + ) expect_true(inherits(result1, "tbl_df")) result2 <- tibble::tribble( - ~time_value, ~n_versions, ~n_row, ~dt_class1, ~dt_key, - 2, 1L, sum(1:1), "data.table", key(ea$DT), - 3, 2L, sum(1:2), "data.table", key(ea$DT), - 4, 3L, sum(1:3), "data.table", key(ea$DT), - 5, 4L, sum(1:4), "data.table", key(ea$DT), - 6, 5L, sum(1:5), "data.table", key(ea$DT), - 7, 6L, sum(1:6), "data.table", key(ea$DT), - ) + ~time_value, ~n_versions, ~n_row, ~dt_class1, ~dt_key, + 2, 1L, sum(1:1), "data.table", key(ea$DT), + 3, 2L, sum(1:2), "data.table", key(ea$DT), + 4, 3L, sum(1:3), "data.table", key(ea$DT), + 5, 4L, sum(1:4), "data.table", key(ea$DT), + 6, 5L, sum(1:5), "data.table", key(ea$DT), + 7, 6L, sum(1:6), "data.table", key(ea$DT), + ) - expect_identical(result1,result2) # * + expect_identical(result1, result2) # * result3 <- ( ea $group_by() - $slide(f = slide_fn, - before = 10^3, - names_sep = NULL, - all_versions = TRUE) + $slide( + f = slide_fn, + before = 10^3, + names_sep = NULL, + all_versions = TRUE + ) ) - expect_identical(result1,result3) # This and * Imply result2 and result3 are identical + expect_identical(result1, result3) # This and * Imply result2 and result3 are identical # formula interface - result4 <- ea %>% group_by() %>% - epix_slide(f = ~ slide_fn(.x, .y), - before = 10^3, - names_sep = NULL, - all_versions = TRUE) + result4 
<- ea %>% + group_by() %>% + epix_slide( + f = ~ slide_fn(.x, .y), + before = 10^3, + names_sep = NULL, + all_versions = TRUE + ) - expect_identical(result1,result4) # This and * Imply result2 and result4 are identical + expect_identical(result1, result4) # This and * Imply result2 and result4 are identical # tidyeval interface result5 <- ea %>% group_by() %>% - epix_slide(data = slide_fn( - .data$clone(), # hack to convert from pronoun back to archive - stop("slide_fn doesn't use group key, no need to prepare it") - ), - before = 10^3, - names_sep = NULL, - all_versions = TRUE) + epix_slide( + data = slide_fn( + .data$clone(), # hack to convert from pronoun back to archive + stop("slide_fn doesn't use group key, no need to prepare it") + ), + before = 10^3, + names_sep = NULL, + all_versions = TRUE + ) - expect_identical(result1,result5) # This and * Imply result2 and result5 are identical + expect_identical(result1, result5) # This and * Imply result2 and result5 are identical expect_identical(ea, ea_orig_mirror) # We shouldn't have mutated ea }) @@ -376,21 +449,21 @@ test_that("as_of and epix_slide with long enough window are compatible", { # For all_versions = FALSE: - f1 = function(x, gk, rtv) { + f1 <- function(x, gk, rtv) { tibble( diff_mean = mean(diff(x$binary)) ) } - ref_time_value1 = 5 + ref_time_value1 <- 5 expect_identical( - ea$as_of(ref_time_value1) %>% f1() %>% mutate(time_value = ref_time_value1, .before=1L), - ea$slide(f1, before=1000L, ref_time_values=ref_time_value1, names_sep=NULL) + ea$as_of(ref_time_value1) %>% f1() %>% mutate(time_value = ref_time_value1, .before = 1L), + ea$slide(f1, before = 1000L, ref_time_values = ref_time_value1, names_sep = NULL) ) # For all_versions = TRUE: - f2 = function(x, gk, rtv) { + f2 <- function(x, gk, rtv) { x %>% # extract time&version-lag-1 data: epix_slide( @@ -400,81 +473,95 @@ test_that("as_of and epix_slide with long enough window are compatible", { filter(time_value == attr(subx, "metadata")$as_of - 1) %>% rename(real_time_value = time_value, lag1 = binary) )) - }, before = 1, names_sep = NULL + }, + before = 1, names_sep = NULL ) %>% # assess as nowcast: unnest(data) %>% inner_join(x$as_of(x$versions_end), by = setdiff(key(x$DT), c("version"))) %>% summarize(mean_abs_delta = mean(abs(binary - lag1))) } - ref_time_value2 = 5 + ref_time_value2 <- 5 expect_identical( - ea$as_of(ref_time_value2, all_versions=TRUE) %>% f2() %>% mutate(time_value = ref_time_value2, .before=1L), - ea$slide(f2, before=1000L, ref_time_values=ref_time_value2, all_versions=TRUE, names_sep=NULL) + ea$as_of(ref_time_value2, all_versions = TRUE) %>% f2() %>% mutate(time_value = ref_time_value2, .before = 1L), + ea$slide(f2, before = 1000L, ref_time_values = ref_time_value2, all_versions = TRUE, names_sep = NULL) ) # Test the same sort of thing when grouping by geo in an archive with multiple geos. 
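# (The second geo below reuses the same rows with `binary` negated, so any
# accidental pooling of the two groups would shift the within-group
# computations and make the comparison fail.)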
- ea_multigeo = ea$clone() - ea_multigeo$DT <- rbind(ea_multigeo$DT, - copy(ea_multigeo$DT)[,geo_value:="y"][,binary:=-binary][]) + ea_multigeo <- ea$clone() + ea_multigeo$DT <- rbind( + ea_multigeo$DT, + copy(ea_multigeo$DT)[, geo_value := "y"][, binary := -binary][] + ) setkeyv(ea_multigeo$DT, key(ea$DT)) expect_identical( ea_multigeo %>% group_by(geo_value) %>% - epix_slide(f2, before=1000L, ref_time_values=ref_time_value2, all_versions=TRUE, names_sep=NULL) %>% + epix_slide(f2, before = 1000L, ref_time_values = ref_time_value2, all_versions = TRUE, names_sep = NULL) %>% filter(geo_value == "x"), ea %>% # using `ea` here is like filtering `ea_multigeo` to `geo_value=="x"` - epix_as_of(ref_time_value2, all_versions=TRUE) %>% + epix_as_of(ref_time_value2, all_versions = TRUE) %>% f2() %>% transmute(geo_value = "x", time_value = ref_time_value2, mean_abs_delta) %>% group_by(geo_value) ) }) -test_that("epix_slide `f` is passed an ungrouped `epi_archive` when `all_versions=TRUE`",{ +test_that("epix_slide `f` is passed an ungrouped `epi_archive` when `all_versions=TRUE`", { slide_fn <- function(x, gk, rtv) { expect_true(is_epi_archive(x)) return(NA) } - ea %>% group_by() %>% - epix_slide(f = slide_fn, - before = 1, - ref_time_values = 5, - new_col_name = "out", - all_versions = TRUE) + ea %>% + group_by() %>% + epix_slide( + f = slide_fn, + before = 1, + ref_time_values = 5, + new_col_name = "out", + all_versions = TRUE + ) }) test_that("epix_slide with all_versions option works as intended", { xx1 <- xx %>% group_by(.data$geo_value) %>% - epix_slide(f = ~ sum(.x$DT$binary), - before = 2, - new_col_name = "sum_binary", - all_versions = TRUE) - - xx2 <- tibble(geo_value = rep("x",4), - time_value = c(4,5,6,7), - sum_binary = c(2^3+2^2, - 2^6+2^3, - 2^10+2^9+2^6, - 2^15+2^14+2^10)) %>% + epix_slide( + f = ~ sum(.x$DT$binary), + before = 2, + new_col_name = "sum_binary", + all_versions = TRUE + ) + + xx2 <- tibble( + geo_value = rep("x", 4), + time_value = c(4, 5, 6, 7), + sum_binary = c( + 2^3 + 2^2, + 2^6 + 2^3, + 2^10 + 2^9 + 2^6, + 2^15 + 2^14 + 2^10 + ) + ) %>% group_by(geo_value) - expect_identical(xx1,xx2) # * + expect_identical(xx1, xx2) # * xx3 <- ( xx $group_by(dplyr::across(dplyr::all_of("geo_value"))) - $slide(f = ~ sum(.x$DT$binary), - before = 2, - new_col_name = 'sum_binary', - all_versions = TRUE) + $slide( + f = ~ sum(.x$DT$binary), + before = 2, + new_col_name = "sum_binary", + all_versions = TRUE + ) ) - expect_identical(xx1,xx3) # This and * Imply xx2 and xx3 are identical + expect_identical(xx1, xx3) # This and * Imply xx2 and xx3 are identical }) # XXX currently, we're using a stopgap measure of having `epix_slide` always @@ -498,7 +585,7 @@ test_that("epix_slide with all_versions option works as intended", { # }) test_that("epix_slide works with 0-row computation outputs", { - epix_slide_empty = function(ea, ...) { + epix_slide_empty <- function(ea, ...) 
{ ea %>% epix_slide(before = 5L, ..., function(x, gk, rtv) { tibble::tibble() @@ -521,13 +608,13 @@ test_that("epix_slide works with 0-row computation outputs", { ) %>% # new_epi_df(geo_type = ea$geo_type, time_type = ea$time_type, # as_of = ea$versions_end) %>% - group_by(geo_value) + group_by(geo_value) ) # with `all_versions=TRUE`, we have something similar but never get an # `epi_df`: expect_identical( ea %>% - epix_slide_empty(all_versions=TRUE), + epix_slide_empty(all_versions = TRUE), tibble::tibble( time_value = ea$DT$version[integer(0)] ) @@ -535,7 +622,7 @@ test_that("epix_slide works with 0-row computation outputs", { expect_identical( ea %>% group_by(geo_value) %>% - epix_slide_empty(all_versions=TRUE), + epix_slide_empty(all_versions = TRUE), tibble::tibble( geo_value = ea$DT$geo_value[integer(0)], time_value = ea$DT$version[integer(0)] @@ -563,87 +650,104 @@ test_that("epix_slide works with 0-row computation outputs", { # }) test_that("epix_slide alerts if the provided f doesn't take enough args", { - f_xgt = function(x, g, t) dplyr::tibble(value=mean(x$binary), count=length(x$binary)) + f_xgt <- function(x, g, t) dplyr::tibble(value = mean(x$binary), count = length(x$binary)) # If `regexp` is NA, asserts that there should be no errors/messages. expect_error(epix_slide(xx, f = f_xgt, before = 2L), regexp = NA) expect_warning(epix_slide(xx, f = f_xgt, before = 2L), regexp = NA) - f_x_dots = function(x, ...) dplyr::tibble(value=mean(x$binary), count=length(x$binary)) + f_x_dots <- function(x, ...) dplyr::tibble(value = mean(x$binary), count = length(x$binary)) expect_warning(epix_slide(xx, f_x_dots, before = 2L), - class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots") + class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots" + ) }) test_that("epix_slide computation via formula can use ref_time_value", { - xx_ref <- tibble(geo_value = rep("x",4), - time_value = c(4,5,6,7), - slide_value = c(4,5,6,7) - ) %>% + xx_ref <- tibble( + geo_value = rep("x", 4), + time_value = c(4, 5, 6, 7), + slide_value = c(4, 5, 6, 7) + ) %>% group_by(geo_value) xx1 <- xx %>% group_by(.data$geo_value) %>% - epix_slide(f = ~ .ref_time_value, - before = 2) + epix_slide( + f = ~.ref_time_value, + before = 2 + ) expect_identical(xx1, xx_ref) xx2 <- xx %>% group_by(.data$geo_value) %>% - epix_slide(f = ~ .z, - before = 2) + epix_slide( + f = ~.z, + before = 2 + ) expect_identical(xx2, xx_ref) xx3 <- xx %>% group_by(.data$geo_value) %>% - epix_slide(f = ~ ..3, - before = 2) + epix_slide( + f = ~..3, + before = 2 + ) expect_identical(xx3, xx_ref) }) test_that("epix_slide computation via function can use ref_time_value", { - xx_ref <- tibble(geo_value = rep("x",4), - time_value = c(4,5,6,7), - slide_value = c(4,5,6,7) - ) %>% + xx_ref <- tibble( + geo_value = rep("x", 4), + time_value = c(4, 5, 6, 7), + slide_value = c(4, 5, 6, 7) + ) %>% group_by(geo_value) xx1 <- xx %>% group_by(.data$geo_value) %>% - epix_slide(f = function(x, g, t) t, - before = 2) + epix_slide( + f = function(x, g, t) t, + before = 2 + ) expect_identical(xx1, xx_ref) }) test_that("epix_slide computation via dots can use ref_time_value and group", { # ref_time_value - xx_ref <- tibble(geo_value = rep("x",4), - time_value = c(4,5,6,7), - slide_value = c(4,5,6,7) - ) %>% + xx_ref <- tibble( + geo_value = rep("x", 4), + time_value = c(4, 5, 6, 7), + slide_value = c(4, 5, 6, 7) + ) %>% group_by(geo_value) xx1 <- xx %>% group_by(.data$geo_value) %>% - epix_slide(before = 2, - slide_value = 
.ref_time_value) + epix_slide( + before = 2, + slide_value = .ref_time_value + ) expect_identical(xx1, xx_ref) # group_key - xx_ref <- tibble(geo_value = rep("x",4), - time_value = c(4,5,6,7), - slide_value = "x" - ) %>% + xx_ref <- tibble( + geo_value = rep("x", 4), + time_value = c(4, 5, 6, 7), + slide_value = "x" + ) %>% group_by(geo_value) # Use group_key column xx3 <- xx %>% group_by(.data$geo_value) %>% - epix_slide(before = 2, - slide_value = .group_key$geo_value) + epix_slide( + before = 2, + slide_value = .group_key$geo_value + ) expect_identical(xx3, xx_ref) @@ -651,8 +755,10 @@ test_that("epix_slide computation via dots can use ref_time_value and group", { expect_error( xx %>% group_by(.data$geo_value) %>% - epix_slide(before = 2, - slide_value = nrow(.group_key)), + epix_slide( + before = 2, + slide_value = nrow(.group_key) + ), NA ) }) @@ -660,20 +766,26 @@ test_that("epix_slide computation via dots can use ref_time_value and group", { test_that("epix_slide computation via dots outputs the same result using col names and the data var", { xx_ref <- xx %>% group_by(.data$geo_value) %>% - epix_slide(before = 2, - sum_binary = sum(time_value)) + epix_slide( + before = 2, + sum_binary = sum(time_value) + ) xx1 <- xx %>% group_by(.data$geo_value) %>% - epix_slide(before = 2, - sum_binary = sum(.x$time_value)) + epix_slide( + before = 2, + sum_binary = sum(.x$time_value) + ) expect_identical(xx1, xx_ref) xx2 <- xx %>% group_by(.data$geo_value) %>% - epix_slide(before = 2, - sum_binary = sum(.data$time_value)) + epix_slide( + before = 2, + sum_binary = sum(.data$time_value) + ) expect_identical(xx2, xx_ref) }) @@ -691,7 +803,7 @@ test_that("`epix_slide` doesn't decay date output", { }) test_that("`epix_slide` can access objects inside of helper functions", { - helper = function(archive_haystack, time_value_needle) { + helper <- function(archive_haystack, time_value_needle) { archive_haystack %>% epix_slide(has_needle = time_value_needle %in% time_value, before = 365000L) } expect_error( diff --git a/tests/testthat/test-grouped_epi_archive.R b/tests/testthat/test-grouped_epi_archive.R index 0423352e..68e7c76d 100644 --- a/tests/testthat/test-grouped_epi_archive.R +++ b/tests/testthat/test-grouped_epi_archive.R @@ -1,17 +1,19 @@ test_that("Grouping, regrouping, and ungrouping archives works as intended", { # From an example: library(dplyr) - toy_archive = + toy_archive <- tribble( - ~geo_value, ~age_group, ~time_value, ~version, ~value, - "us", "adult", "2000-01-01", "2000-01-02", 121, - "us", "pediatric", "2000-01-02", "2000-01-03", 5, # (addition) - "us", "adult", "2000-01-01", "2000-01-03", 125, # (revision) - "us", "adult", "2000-01-02", "2000-01-03", 130 # (addition) + ~geo_value, ~age_group, ~time_value, ~version, ~value, + "us", "adult", "2000-01-01", "2000-01-02", 121, + "us", "pediatric", "2000-01-02", "2000-01-03", 5, # (addition) + "us", "adult", "2000-01-01", "2000-01-03", 125, # (revision) + "us", "adult", "2000-01-02", "2000-01-03", 130 # (addition) + ) %>% + mutate( + age_group = ordered(age_group, c("pediatric", "adult")), + time_value = as.Date(time_value), + version = as.Date(version) ) %>% - mutate(age_group = ordered(age_group, c("pediatric", "adult")), - time_value = as.Date(time_value), - version = as.Date(version)) %>% as_epi_archive(other_keys = "age_group") # Ensure that we're using testthat edition 3's idea of "identical", which is @@ -19,12 +21,12 @@ test_that("Grouping, regrouping, and ungrouping archives works as intended", { testthat::local_edition(3) # 
Test equivalency claims in example: - by_both_keys = toy_archive %>% group_by(geo_value, age_group) + by_both_keys <- toy_archive %>% group_by(geo_value, age_group) expect_identical( by_both_keys, - toy_archive %>% group_by(geo_value) %>% group_by(age_group, .add=TRUE) + toy_archive %>% group_by(geo_value) %>% group_by(age_group, .add = TRUE) ) - grouping_cols = c("geo_value", "age_group") + grouping_cols <- c("geo_value", "age_group") expect_identical( by_both_keys, toy_archive %>% group_by(across(all_of(grouping_cols))) @@ -37,52 +39,66 @@ test_that("Grouping, regrouping, and ungrouping archives works as intended", { # Test `.drop` behavior: expect_error(toy_archive %>% group_by(.drop = "bogus"), - regexp = "\\.drop.*TRUE or FALSE") - expect_warning(toy_archive %>% group_by(.drop=FALSE), - class="epiprocess__group_by_epi_archive__drop_FALSE_no_factors") - expect_warning(toy_archive %>% group_by(geo_value, .drop=FALSE), - class="epiprocess__group_by_epi_archive__drop_FALSE_no_factors") - expect_warning(grouped_factor_then_nonfactor <- - toy_archive %>% group_by(age_group, geo_value, .drop=FALSE), - class="epiprocess__group_by_epi_archive__drop_FALSE_nonfactor_after_factor") - expect_identical(grouped_factor_then_nonfactor %>% - epix_slide(before = 10, s = sum(value)), - tibble::tribble( - ~age_group, ~geo_value, ~time_value, ~s, - "pediatric", NA_character_, "2000-01-02", 0, - "adult", "us", "2000-01-02", 121, - "pediatric", "us", "2000-01-03", 5, - "adult", "us", "2000-01-03", 255) %>% - mutate(age_group = ordered(age_group, c("pediatric", "adult")), - time_value = as.Date(time_value)) %>% - # # See - # # https://github.com/cmu-delphi/epiprocess/pull/290#issuecomment-1489099157 - # # and - # # https://github.com/cmu-delphi/epiprocess/pull/311#issuecomment-1535149256 - # # for why this is commented out, pending some design - # # decisions. 
- # # - # as_epi_df(geo_type = "nation", # bug; want "custom" from NA; issue #242 - # as_of = as.Date("2000-01-03"), - # additional_metadata = list(other_keys = "age_group")) %>% - # # put back in expected order; see issue #166: - # select(age_group, geo_value, time_value, s) %>% - group_by(age_group, geo_value, .drop=FALSE)) - expect_identical(toy_archive %>% - group_by(geo_value, age_group, .drop=FALSE) %>% - epix_slide(before = 10, s = sum(value)), - tibble::tribble( - ~geo_value, ~age_group, ~time_value, ~s, - "us", "pediatric", "2000-01-02", 0, - "us", "adult", "2000-01-02", 121, - "us", "pediatric", "2000-01-03", 5, - "us", "adult", "2000-01-03", 255) %>% - mutate(age_group = ordered(age_group, c("pediatric", "adult")), - time_value = as.Date(time_value)) %>% - # as_epi_df(as_of = as.Date("2000-01-03"), - # additional_metadata = list(other_keys = "age_group")) %>% - # # put back in expected order; see issue #166: - # select(geo_value, age_group, time_value, s) %>% - group_by(geo_value, age_group, .drop=FALSE) - ) + regexp = "\\.drop.*TRUE or FALSE" + ) + expect_warning(toy_archive %>% group_by(.drop = FALSE), + class = "epiprocess__group_by_epi_archive__drop_FALSE_no_factors" + ) + expect_warning(toy_archive %>% group_by(geo_value, .drop = FALSE), + class = "epiprocess__group_by_epi_archive__drop_FALSE_no_factors" + ) + expect_warning( + grouped_factor_then_nonfactor <- + toy_archive %>% group_by(age_group, geo_value, .drop = FALSE), + class = "epiprocess__group_by_epi_archive__drop_FALSE_nonfactor_after_factor" + ) + expect_identical( + grouped_factor_then_nonfactor %>% + epix_slide(before = 10, s = sum(value)), + tibble::tribble( + ~age_group, ~geo_value, ~time_value, ~s, + "pediatric", NA_character_, "2000-01-02", 0, + "adult", "us", "2000-01-02", 121, + "pediatric", "us", "2000-01-03", 5, + "adult", "us", "2000-01-03", 255 + ) %>% + mutate( + age_group = ordered(age_group, c("pediatric", "adult")), + time_value = as.Date(time_value) + ) %>% + # # See + # # https://github.com/cmu-delphi/epiprocess/pull/290#issuecomment-1489099157 + # # and + # # https://github.com/cmu-delphi/epiprocess/pull/311#issuecomment-1535149256 + # # for why this is commented out, pending some design + # # decisions. 
+ # # + # as_epi_df(geo_type = "nation", # bug; want "custom" from NA; issue #242 + # as_of = as.Date("2000-01-03"), + # additional_metadata = list(other_keys = "age_group")) %>% + # # put back in expected order; see issue #166: + # select(age_group, geo_value, time_value, s) %>% + group_by(age_group, geo_value, .drop = FALSE) + ) + expect_identical( + toy_archive %>% + group_by(geo_value, age_group, .drop = FALSE) %>% + epix_slide(before = 10, s = sum(value)), + tibble::tribble( + ~geo_value, ~age_group, ~time_value, ~s, + "us", "pediatric", "2000-01-02", 0, + "us", "adult", "2000-01-02", 121, + "us", "pediatric", "2000-01-03", 5, + "us", "adult", "2000-01-03", 255 + ) %>% + mutate( + age_group = ordered(age_group, c("pediatric", "adult")), + time_value = as.Date(time_value) + ) %>% + # as_epi_df(as_of = as.Date("2000-01-03"), + # additional_metadata = list(other_keys = "age_group")) %>% + # # put back in expected order; see issue #166: + # select(geo_value, age_group, time_value, s) %>% + group_by(geo_value, age_group, .drop = FALSE) + ) }) diff --git a/tests/testthat/test-methods-epi_archive.R b/tests/testthat/test-methods-epi_archive.R index 3b692475..7ab63f19 100644 --- a/tests/testthat/test-methods-epi_archive.R +++ b/tests/testthat/test-methods-epi_archive.R @@ -3,27 +3,29 @@ library(dplyr) ea <- archive_cases_dv_subset$clone() ea2_data <- tibble::tribble( - ~geo_value, ~time_value, ~version, ~cases, - "ca", "2020-06-01", "2020-06-01", 1, - "ca", "2020-06-01", "2020-06-02", 2, - # - "ca", "2020-06-02", "2020-06-02", 0, - "ca", "2020-06-02", "2020-06-03", 1, - "ca", "2020-06-02", "2020-06-04", 2, - # - "ca", "2020-06-03", "2020-06-03", 1, - # - "ca", "2020-06-04", "2020-06-04", 4, - ) %>% - dplyr::mutate(dplyr::across(c(time_value, version), as.Date)) + ~geo_value, ~time_value, ~version, ~cases, + "ca", "2020-06-01", "2020-06-01", 1, + "ca", "2020-06-01", "2020-06-02", 2, + # + "ca", "2020-06-02", "2020-06-02", 0, + "ca", "2020-06-02", "2020-06-03", 1, + "ca", "2020-06-02", "2020-06-04", 2, + # + "ca", "2020-06-03", "2020-06-03", 1, + # + "ca", "2020-06-04", "2020-06-04", 4, +) %>% + dplyr::mutate(dplyr::across(c(time_value, version), as.Date)) # epix_as_of tests -test_that("epix_as_of behaves identically to as_of method",{ - expect_identical(epix_as_of(ea,max_version = min(ea$DT$version)), - ea$as_of(max_version = min(ea$DT$version))) +test_that("epix_as_of behaves identically to as_of method", { + expect_identical( + epix_as_of(ea, max_version = min(ea$DT$version)), + ea$as_of(max_version = min(ea$DT$version)) + ) }) -test_that("Errors are thrown due to bad as_of inputs",{ +test_that("Errors are thrown due to bad as_of inputs", { # max_version cannot be of string class rather than date class expect_error(ea$as_of("2020-01-01")) # max_version cannot be later than latest version @@ -32,25 +34,24 @@ test_that("Errors are thrown due to bad as_of inputs",{ expect_error(ea$as_of(c(as.Date("2020-01-01"), as.Date("2020-01-02")))) }) -test_that("Warning against max_version being clobberable",{ +test_that("Warning against max_version being clobberable", { # none by default expect_warning(regexp = NA, ea$as_of(max_version = max(ea$DT$version))) expect_warning(regexp = NA, ea$as_of(max_version = min(ea$DT$version))) # but with `clobberable_versions_start` non-`NA`, yes - ea_with_clobberable = ea$clone() - ea_with_clobberable$clobberable_versions_start = max(ea_with_clobberable$DT$version) + ea_with_clobberable <- ea$clone() + ea_with_clobberable$clobberable_versions_start <- 
max(ea_with_clobberable$DT$version) expect_warning(ea_with_clobberable$as_of(max_version = max(ea$DT$version))) expect_warning(regexp = NA, ea_with_clobberable$as_of(max_version = min(ea$DT$version))) }) -test_that("as_of properly grabs the data and doesn't mutate key",{ - +test_that("as_of properly grabs the data and doesn't mutate key", { d <- as.Date("2020-06-01") - ea2 = ea2_data %>% + ea2 <- ea2_data %>% as_epi_archive() - old_key = data.table::key(ea2$DT) + old_key <- data.table::key(ea2$DT) edf_as_of <- ea2 %>% epix_as_of(max_version = as.Date("2020-06-03")) @@ -58,14 +59,14 @@ test_that("as_of properly grabs the data and doesn't mutate key",{ edf_expected <- as_epi_df(tibble( geo_value = "ca", time_value = d + 0:2, - cases = c(2,1,1) + cases = c(2, 1, 1) ), as_of = as.Date("2020-06-03")) - expect_equal(edf_as_of, edf_expected, ignore_attr=c(".internal.selfref", "sorted")) + expect_equal(edf_as_of, edf_expected, ignore_attr = c(".internal.selfref", "sorted")) expect_equal(data.table::key(ea2$DT), old_key) }) -test_that("Errors are thrown due to bad epix_truncate_versions_after inputs",{ +test_that("Errors are thrown due to bad epix_truncate_versions_after inputs", { # x must be an archive expect_error(epix_truncate_versions_after(data.frame(), as.Date("2020-01-01"))) # max_version cannot be of string class rather than date class @@ -79,44 +80,41 @@ test_that("Errors are thrown due to bad epix_truncate_versions_after inputs",{ }) test_that("epix_truncate_version_after properly grabs the data and doesn't mutate key", { - - ea2 = ea2_data %>% + ea2 <- ea2_data %>% as_epi_archive() - old_key = data.table::key(ea2$DT) + old_key <- data.table::key(ea2$DT) ea_as_of <- ea2 %>% epix_truncate_versions_after(max_version = as.Date("2020-06-02")) - ea_expected <- ea2_data[1:3,] %>% + ea_expected <- ea2_data[1:3, ] %>% as_epi_archive() - expect_equal(ea_as_of, ea_expected, ignore_attr=c(".internal.selfref", "sorted")) + expect_equal(ea_as_of, ea_expected, ignore_attr = c(".internal.selfref", "sorted")) expect_equal(data.table::key(ea2$DT), old_key) }) test_that("epix_truncate_version_after doesn't filter if max_verion at latest version", { - - ea2 = ea2_data %>% + ea2 <- ea2_data %>% as_epi_archive() ea_expected <- ea2$clone() ea_as_of <- ea2 %>% epix_truncate_versions_after(max_version = as.Date("2020-06-04")) - expect_equal(ea_as_of, ea_expected, ignore_attr=c(".internal.selfref", "sorted")) + expect_equal(ea_as_of, ea_expected, ignore_attr = c(".internal.selfref", "sorted")) }) test_that("epix_truncate_version_after returns the same grouping type as input epi_archive", { - - ea2 = ea2_data %>% + ea2 <- ea2_data %>% as_epi_archive() ea_as_of <- ea2 %>% epix_truncate_versions_after(max_version = as.Date("2020-06-04")) - expect_true(is_epi_archive(ea_as_of, grouped_okay=FALSE)) + expect_true(is_epi_archive(ea_as_of, grouped_okay = FALSE)) - ea2_grouped = ea2$group_by(geo_value) + ea2_grouped <- ea2$group_by(geo_value) ea_as_of <- ea2_grouped %>% epix_truncate_versions_after(max_version = as.Date("2020-06-04")) @@ -125,10 +123,9 @@ test_that("epix_truncate_version_after returns the same grouping type as input e test_that("epix_truncate_version_after returns the same groups as input grouped_epi_archive", { - - ea2 = ea2_data %>% + ea2 <- ea2_data %>% as_epi_archive() - ea2 = ea2$group_by(geo_value) + ea2 <- ea2$group_by(geo_value) ea_expected <- ea2$clone() diff --git a/tests/testthat/test-methods-epi_df.R b/tests/testthat/test-methods-epi_df.R index aeb08ced..c2a6d956 100644 --- 
a/tests/testthat/test-methods-epi_df.R +++ b/tests/testthat/test-methods-epi_df.R @@ -7,17 +7,19 @@ toy_epi_df <- tibble::tibble( length.out = 5 ), times = 2), geo_value = rep(c("ca", "hi"), each = 5), - indic_var1 = as.factor(rep(1:2, times = 5)), + indic_var1 = as.factor(rep(1:2, times = 5)), indic_var2 = as.factor(rep(letters[1:5], times = 2)) -) %>% as_epi_df(additional_metadata = - list(other_keys = c("indic_var1", "indic_var2"))) +) %>% as_epi_df( + additional_metadata = + list(other_keys = c("indic_var1", "indic_var2")) +) -att_toy = attr(toy_epi_df, "metadata") +att_toy <- attr(toy_epi_df, "metadata") test_that("Head and tail do not drop the epi_df class", { - att_head = attr(head(toy_epi_df), "metadata") - att_tail = attr(tail(toy_epi_df), "metadata") - + att_head <- attr(head(toy_epi_df), "metadata") + att_tail <- attr(tail(toy_epi_df), "metadata") + expect_true(is_epi_df(head(toy_epi_df))) expect_true(is_epi_df(tail(toy_epi_df))) expect_identical(att_head$geo_type, att_toy$geo_type) @@ -32,11 +34,10 @@ test_that("Head and tail do not drop the epi_df class", { test_that("Subsetting drops & does not drop the epi_df class appropriately", { - # Row subset - should be epi_df - row_subset = toy_epi_df[1:2, ] - att_row_subset = attr(row_subset, "metadata") - + row_subset <- toy_epi_df[1:2, ] + att_row_subset <- attr(row_subset, "metadata") + expect_true(is_epi_df(row_subset)) expect_equal(nrow(row_subset), 2L) expect_equal(ncol(row_subset), 6L) @@ -44,34 +45,34 @@ test_that("Subsetting drops & does not drop the epi_df class appropriately", { expect_identical(att_row_subset$time_type, att_toy$time_type) expect_identical(att_row_subset$as_of, att_toy$as_of) expect_identical(att_row_subset$other_keys, att_toy$other_keys) - + # Row and col single value - shouldn't be an epi_df - row_col_subset1 = toy_epi_df[1,2] + row_col_subset1 <- toy_epi_df[1, 2] expect_false(is_epi_df(row_col_subset1)) expect_true(tibble::is_tibble(row_col_subset1)) expect_equal(nrow(row_col_subset1), 1L) expect_equal(ncol(row_col_subset1), 1L) - + # Col subset with no time_value - shouldn't be an epi_df - col_subset1 = toy_epi_df[, c(1,3)] - + col_subset1 <- toy_epi_df[, c(1, 3)] + expect_false(is_epi_df(col_subset1)) expect_true(tibble::is_tibble(col_subset1)) expect_equal(nrow(col_subset1), 10L) expect_equal(ncol(col_subset1), 2L) - + # Col subset with no geo_value - shouldn't be an epi_df - col_subset2 = toy_epi_df[, 2:3] - + col_subset2 <- toy_epi_df[, 2:3] + expect_false(is_epi_df(col_subset2)) expect_true(tibble::is_tibble(col_subset2)) expect_equal(nrow(col_subset2), 10L) expect_equal(ncol(col_subset2), 2L) - + # Row and col subset that contains geo_value and time_value - should be epi_df - row_col_subset2 = toy_epi_df[2:3,1:3] - att_row_col_subset2 = attr(row_col_subset2, "metadata") - + row_col_subset2 <- toy_epi_df[2:3, 1:3] + att_row_col_subset2 <- attr(row_col_subset2, "metadata") + expect_true(is_epi_df(row_col_subset2)) expect_equal(nrow(row_col_subset2), 2L) expect_equal(ncol(row_col_subset2), 3L) @@ -82,17 +83,21 @@ test_that("Subsetting drops & does not drop the epi_df class appropriately", { }) test_that("When duplicate cols in subset should abort", { - expect_error(toy_epi_df[, c(2,2:3,4,4,4)], - "Column name(s) time_value, y must not be duplicated.", fixed = T) - expect_error(toy_epi_df[1:4, c(1,2:4,1)], - "Column name(s) geo_value must not be duplicated.", fixed = T) + expect_error(toy_epi_df[, c(2, 2:3, 4, 4, 4)], + "Column name(s) time_value, y must not be duplicated.", + fixed = T + ) + 
expect_error(toy_epi_df[1:4, c(1, 2:4, 1)], + "Column name(s) geo_value must not be duplicated.", + fixed = T + ) }) test_that("Correct metadata when subset includes some of other_keys", { # Only include other_var of indic_var1 - only_indic_var1 = toy_epi_df[, 1:5] - att_only_indic_var1 = attr(only_indic_var1, "metadata") - + only_indic_var1 <- toy_epi_df[, 1:5] + att_only_indic_var1 <- attr(only_indic_var1, "metadata") + expect_true(is_epi_df(only_indic_var1)) expect_equal(nrow(only_indic_var1), 10L) expect_equal(ncol(only_indic_var1), 5L) @@ -100,11 +105,11 @@ test_that("Correct metadata when subset includes some of other_keys", { expect_identical(att_only_indic_var1$time_type, att_toy$time_type) expect_identical(att_only_indic_var1$as_of, att_toy$as_of) expect_identical(att_only_indic_var1$other_keys, att_toy$other_keys[-2]) - + # Only include other_var of indic_var2 - only_indic_var2 = toy_epi_df[, c(1:4,6)] - att_only_indic_var2 = attr(only_indic_var2, "metadata") - + only_indic_var2 <- toy_epi_df[, c(1:4, 6)] + att_only_indic_var2 <- attr(only_indic_var2, "metadata") + expect_true(is_epi_df(only_indic_var2)) expect_equal(nrow(only_indic_var2), 10L) expect_equal(ncol(only_indic_var2), 5L) @@ -112,12 +117,12 @@ test_that("Correct metadata when subset includes some of other_keys", { expect_identical(att_only_indic_var2$time_type, att_toy$time_type) expect_identical(att_only_indic_var2$as_of, att_toy$as_of) expect_identical(att_only_indic_var2$other_keys, att_toy$other_keys[-1]) - + # Including both original other_keys was already tested above }) test_that("Metadata and grouping are dropped by `as_tibble`", { - grouped_converted = toy_epi_df %>% + grouped_converted <- toy_epi_df %>% group_by(geo_value) %>% as_tibble() expect_true( diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R index 8460a5e8..2319d045 100644 --- a/tests/testthat/test-utils.R +++ b/tests/testthat/test-utils.R @@ -1,117 +1,119 @@ -test_that("break_string works properly",{ - expect_equal(break_str("A dog is here", 6),"A dog\nis\nhere") +test_that("break_string works properly", { + expect_equal(break_str("A dog is here", 6), "A dog\nis\nhere") }) -test_that("Abort and Warn work",{ +test_that("Abort and Warn work", { expect_error(Abort("abort")) expect_warning(Warn("warn")) }) -test_that("in_range works",{ - expect_equal(in_range(1,c(2,4)),2) - expect_equal(in_range(3,c(2,4)),3) - expect_equal(in_range(5,c(2,4)),4) +test_that("in_range works", { + expect_equal(in_range(1, c(2, 4)), 2) + expect_equal(in_range(3, c(2, 4)), 3) + expect_equal(in_range(5, c(2, 4)), 4) }) -test_that("new summarizing functions work",{ - x <- c(3,4,5,9,NA) - expect_equal(Min(x),3) - expect_equal(Max(x),9) - expect_equal(Sum(x),21) - expect_equal(Mean(x),5.25) - expect_equal(Median(x),4.5) +test_that("new summarizing functions work", { + x <- c(3, 4, 5, 9, NA) + expect_equal(Min(x), 3) + expect_equal(Max(x), 9) + expect_equal(Sum(x), 21) + expect_equal(Mean(x), 5.25) + expect_equal(Median(x), 4.5) }) -test_that("Other capital letter functions work",{ - x <- c(1,2,3,4,5) - expect_equal(Start(x),1) - expect_equal(End(x),5) - expect_equal(MiddleL(x),3) - expect_equal(MiddleR(x),3) - expect_equal(MiddleL(x[-5]),2) - expect_equal(MiddleR(x[-5]),3) - expect_equal(ExtendL(x),c(1,1,2,3,4,5)) - expect_equal(ExtendR(x),c(1,2,3,4,5,5)) +test_that("Other capital letter functions work", { + x <- c(1, 2, 3, 4, 5) + expect_equal(Start(x), 1) + expect_equal(End(x), 5) + expect_equal(MiddleL(x), 3) + expect_equal(MiddleR(x), 3) + 
expect_equal(MiddleL(x[-5]), 2) + expect_equal(MiddleR(x[-5]), 3) + expect_equal(ExtendL(x), c(1, 1, 2, 3, 4, 5)) + expect_equal(ExtendR(x), c(1, 2, 3, 4, 5, 5)) }) -test_that("guess_geo_type tests for different types of geo_value's",{ +test_that("guess_geo_type tests for different types of geo_value's", { # California, New York - states <- c("ca","ny") - + states <- c("ca", "ny") + # Canada, USA, United Kingdom - nations <- c("ca","us","uk") - + nations <- c("ca", "us", "uk") + # Note: These are just five-number names that may not necessarily be existent # counties - counties <- c("12345","67890") - + counties <- c("12345", "67890") + # HHS regions hhs <- c(1:3) - + # HRR regions - hrr <- c(100,200) - + hrr <- c(100, 200) + # Long numbers should be custom - long_nums <- c(123456789,111222333) - + long_nums <- c(123456789, 111222333) + # Health regions in British Columbia - bc <- c("Vancouver Coastal","Interior","Fraser", - "Northern","Vancouver Island") - + bc <- c( + "Vancouver Coastal", "Interior", "Fraser", + "Northern", "Vancouver Island" + ) + # Long numbers as strings should also be custom - long_num_strings <- c("123456789","111222333") - - expect_equal(guess_geo_type(states),"state") - expect_equal(guess_geo_type(nations),"nation") - expect_equal(guess_geo_type(counties),"county") - expect_equal(guess_geo_type(hhs),"hhs") - expect_equal(guess_geo_type(hrr),"hrr") - expect_equal(guess_geo_type(long_num_strings),"custom") - expect_equal(guess_geo_type(bc),"custom") - expect_equal(guess_geo_type(long_nums),"custom") + long_num_strings <- c("123456789", "111222333") + + expect_equal(guess_geo_type(states), "state") + expect_equal(guess_geo_type(nations), "nation") + expect_equal(guess_geo_type(counties), "county") + expect_equal(guess_geo_type(hhs), "hhs") + expect_equal(guess_geo_type(hrr), "hrr") + expect_equal(guess_geo_type(long_num_strings), "custom") + expect_equal(guess_geo_type(bc), "custom") + expect_equal(guess_geo_type(long_nums), "custom") }) -test_that("guess_time_type works for different types",{ +test_that("guess_time_type works for different types", { days <- as.Date("2022-01-01") + 0:6 weeks <- as.Date("2022-01-01") + 7 * 0:6 yearweeks <- tsibble::yearweek(10) yearmonths <- tsibble::yearmonth(10) yearquarters <- tsibble::yearquarter(10) - - years <- c(1999,2000) - + + years <- c(1999, 2000) + # YYYY-MM-DD is the accepted format not_ymd1 <- "January 1, 2022" not_ymd2 <- "1 January 2022" not_ymd3 <- "1 Jan 2022" - + not_a_date <- "asdf" - - expect_equal(guess_time_type(days),"day") - expect_equal(guess_time_type(weeks),"week") - - expect_equal(guess_time_type(yearweeks),"yearweek") - expect_equal(guess_time_type(yearmonths),"yearmonth") - expect_equal(guess_time_type(yearquarters),"yearquarter") - - expect_equal(guess_time_type(years),"year") - - expect_equal(guess_time_type(not_ymd1),"custom") - expect_equal(guess_time_type(not_ymd2),"custom") - expect_equal(guess_time_type(not_ymd3),"custom") - expect_equal(guess_time_type(not_a_date),"custom") + + expect_equal(guess_time_type(days), "day") + expect_equal(guess_time_type(weeks), "week") + + expect_equal(guess_time_type(yearweeks), "yearweek") + expect_equal(guess_time_type(yearmonths), "yearmonth") + expect_equal(guess_time_type(yearquarters), "yearquarter") + + expect_equal(guess_time_type(years), "year") + + expect_equal(guess_time_type(not_ymd1), "custom") + expect_equal(guess_time_type(not_ymd2), "custom") + expect_equal(guess_time_type(not_ymd3), "custom") + expect_equal(guess_time_type(not_a_date), "custom") }) 
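# A sketch of how these guesses surface for users (illustrative, not part
# of the test file): `as_epi_df()` consults the same helpers to fill in
# metadata when `geo_type`/`time_type` are not supplied.
edf <- as_epi_df(tibble::tibble(
  geo_value = c("ca", "ny"), # two-letter codes, guessed as "state"
  time_value = as.Date("2022-01-01") + 0:1, # Dates a day apart, guessed as "day"
  value = c(1, 2)
))
attr(edf, "metadata")$geo_type # "state"
attr(edf, "metadata")$time_type # "day"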
-test_that("enlist works",{ - my_list <- enlist(x=1,y=2,z=3) - expect_equal(my_list$x,1) - expect_true(inherits(my_list,"list")) +test_that("enlist works", { + my_list <- enlist(x = 1, y = 2, z = 3) + expect_equal(my_list$x, 1) + expect_true(inherits(my_list, "list")) }) test_that("assert_sufficient_f_args alerts if the provided f doesn't take enough args", { - f_xgt = function(x, g, t) dplyr::tibble(value=mean(x$binary), count=length(x$binary)) - f_xgt_dots = function(x, g, t, ...) dplyr::tibble(value=mean(x$binary), count=length(x$binary)) + f_xgt <- function(x, g, t) dplyr::tibble(value = mean(x$binary), count = length(x$binary)) + f_xgt_dots <- function(x, g, t, ...) dplyr::tibble(value = mean(x$binary), count = length(x$binary)) # If `regexp` is NA, asserts that there should be no errors/messages. expect_error(assert_sufficient_f_args(f_xgt), regexp = NA) @@ -119,63 +121,76 @@ test_that("assert_sufficient_f_args alerts if the provided f doesn't take enough expect_error(assert_sufficient_f_args(f_xgt_dots), regexp = NA) expect_warning(assert_sufficient_f_args(f_xgt_dots), regexp = NA) - f_x_dots = function(x, ...) dplyr::tibble(value=mean(x$binary), count=length(x$binary)) - f_dots = function(...) dplyr::tibble(value=c(5), count=c(2)) - f_x = function(x) dplyr::tibble(value=mean(x$binary), count=length(x$binary)) - f = function() dplyr::tibble(value=c(5), count=c(2)) + f_x_dots <- function(x, ...) dplyr::tibble(value = mean(x$binary), count = length(x$binary)) + f_dots <- function(...) dplyr::tibble(value = c(5), count = c(2)) + f_x <- function(x) dplyr::tibble(value = mean(x$binary), count = length(x$binary)) + f <- function() dplyr::tibble(value = c(5), count = c(2)) expect_warning(assert_sufficient_f_args(f_x_dots), regexp = ", the group key and reference time value will be included", - class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots") + class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots" + ) expect_warning(assert_sufficient_f_args(f_dots), regexp = ", the window data, group key, and reference time value will be included", - class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots") + class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots" + ) expect_error(assert_sufficient_f_args(f_x), - class = "epiprocess__assert_sufficient_f_args__f_needs_min_args") + class = "epiprocess__assert_sufficient_f_args__f_needs_min_args" + ) expect_error(assert_sufficient_f_args(f), - class = "epiprocess__assert_sufficient_f_args__f_needs_min_args") + class = "epiprocess__assert_sufficient_f_args__f_needs_min_args" + ) - f_xs_dots = function(x, setting="a", ...) dplyr::tibble(value=mean(x$binary), count=length(x$binary)) - f_xs = function(x, setting="a") dplyr::tibble(value=mean(x$binary), count=length(x$binary)) - expect_warning(assert_sufficient_f_args(f_xs_dots, setting="b"), - class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots") - expect_error(assert_sufficient_f_args(f_xs, setting="b"), - class = "epiprocess__assert_sufficient_f_args__f_needs_min_args_plus_forwarded") + f_xs_dots <- function(x, setting = "a", ...) 
dplyr::tibble(value = mean(x$binary), count = length(x$binary)) + f_xs <- function(x, setting = "a") dplyr::tibble(value = mean(x$binary), count = length(x$binary)) + expect_warning(assert_sufficient_f_args(f_xs_dots, setting = "b"), + class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots" + ) + expect_error(assert_sufficient_f_args(f_xs, setting = "b"), + class = "epiprocess__assert_sufficient_f_args__f_needs_min_args_plus_forwarded" + ) expect_error(assert_sufficient_f_args(f_xgt, "b"), - class = "epiprocess__assert_sufficient_f_args__f_needs_min_args_plus_forwarded") + class = "epiprocess__assert_sufficient_f_args__f_needs_min_args_plus_forwarded" + ) }) test_that("assert_sufficient_f_args alerts if the provided f has defaults for the required args", { - f_xgt = function(x, g=1, t) dplyr::tibble(value=mean(x$binary), count=length(x$binary)) - f_xgt_dots = function(x=1, g, t, ...) dplyr::tibble(value=mean(x$binary), count=length(x$binary)) - f_x_dots = function(x=1, ...) dplyr::tibble(value=mean(x$binary), count=length(x$binary)) + f_xgt <- function(x, g = 1, t) dplyr::tibble(value = mean(x$binary), count = length(x$binary)) + f_xgt_dots <- function(x = 1, g, t, ...) dplyr::tibble(value = mean(x$binary), count = length(x$binary)) + f_x_dots <- function(x = 1, ...) dplyr::tibble(value = mean(x$binary), count = length(x$binary)) expect_error(assert_sufficient_f_args(f_xgt), regexp = "pass the group key to `f`'s g argument,", - class = "epiprocess__assert_sufficient_f_args__required_args_contain_defaults") + class = "epiprocess__assert_sufficient_f_args__required_args_contain_defaults" + ) expect_error(assert_sufficient_f_args(f_xgt_dots), regexp = "pass the window data to `f`'s x argument,", - class = "epiprocess__assert_sufficient_f_args__required_args_contain_defaults") + class = "epiprocess__assert_sufficient_f_args__required_args_contain_defaults" + ) expect_error(suppressWarnings(assert_sufficient_f_args(f_x_dots)), - class = "epiprocess__assert_sufficient_f_args__required_args_contain_defaults") + class = "epiprocess__assert_sufficient_f_args__required_args_contain_defaults" + ) - f_xsgt = function(x, setting="a", g, t) dplyr::tibble(value=mean(x$binary), count=length(x$binary)) - f_xsgt_dots = function(x, setting="a", g, t, ...) dplyr::tibble(value=mean(x$binary), count=length(x$binary)) - f_xs_dots = function(x=1, setting="a", ...) dplyr::tibble(value=mean(x$binary), count=length(x$binary)) + f_xsgt <- function(x, setting = "a", g, t) dplyr::tibble(value = mean(x$binary), count = length(x$binary)) + f_xsgt_dots <- function(x, setting = "a", g, t, ...) dplyr::tibble(value = mean(x$binary), count = length(x$binary)) + f_xs_dots <- function(x = 1, setting = "a", ...) 
dplyr::tibble(value = mean(x$binary), count = length(x$binary)) # forwarding named dots should prevent some complaints: expect_no_error(assert_sufficient_f_args(f_xsgt, setting = "b")) expect_no_error(assert_sufficient_f_args(f_xsgt_dots, setting = "b")) expect_error(suppressWarnings(assert_sufficient_f_args(f_xs_dots, setting = "b")), regexp = "pass the window data to `f`'s x argument", - class = "epiprocess__assert_sufficient_f_args__required_args_contain_defaults") + class = "epiprocess__assert_sufficient_f_args__required_args_contain_defaults" + ) # forwarding unnamed dots should not: expect_error(assert_sufficient_f_args(f_xsgt, "b"), - class = "epiprocess__assert_sufficient_f_args__required_args_contain_defaults") + class = "epiprocess__assert_sufficient_f_args__required_args_contain_defaults" + ) expect_error(assert_sufficient_f_args(f_xsgt_dots, "b"), - class = "epiprocess__assert_sufficient_f_args__required_args_contain_defaults") + class = "epiprocess__assert_sufficient_f_args__required_args_contain_defaults" + ) expect_error( expect_warning( assert_sufficient_f_args(f_xs_dots, "b"), @@ -198,36 +213,42 @@ test_that("assert_sufficient_f_args alerts if the provided f has defaults for th test_that("computation formula-derived functions take all argument types", { # positional expect_identical(as_slide_computation(~ ..2 + ..3)(1, 2, 3), 5) - expect_identical(as_slide_computation(~ ..1)(1, 2, 3), 1) + expect_identical(as_slide_computation(~..1)(1, 2, 3), 1) # Matching rlang, purr, dplyr usage expect_identical(as_slide_computation(~ .x + .z)(1, 2, 3), 4) expect_identical(as_slide_computation(~ .x + .y)(1, 2, 3), 3) # named expect_identical(as_slide_computation(~ . + .ref_time_value)(1, 2, 3), 4) - expect_identical(as_slide_computation(~ .group_key)(1, 2, 3), 2) + expect_identical(as_slide_computation(~.group_key)(1, 2, 3), 2) }) test_that("as_slide_computation passes functions unaltered", { - f <- function(a, b, c) {a * b * c + 5} + f <- function(a, b, c) { + a * b * c + 5 + } expect_identical(as_slide_computation(f), f) }) test_that("as_slide_computation raises errors as expected", { # Formulas must be one-sided expect_error(as_slide_computation(y ~ ..1), - class="epiprocess__as_slide_computation__formula_is_twosided") + class = "epiprocess__as_slide_computation__formula_is_twosided" + ) # Formulas can't be paired with ... 
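# (That is, supplying further arguments alongside a formula computation
# should be rejected with the dedicated error class asserted below, rather
# than silently ignored.)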
- expect_error(as_slide_computation(~ ..1, method = "fn"), - class="epiprocess__as_slide_computation__formula_with_dots") + expect_error(as_slide_computation(~..1, method = "fn"), + class = "epiprocess__as_slide_computation__formula_with_dots" + ) # `f_env` must be an environment - formula_without_env <- stats::as.formula(~ ..1) + formula_without_env <- stats::as.formula(~..1) rlang::f_env(formula_without_env) <- 5 expect_error(as_slide_computation(formula_without_env), - class="epiprocess__as_slide_computation__formula_has_no_env") + class = "epiprocess__as_slide_computation__formula_has_no_env" + ) # `f` must be a function, formula, or string expect_error(as_slide_computation(5), - class="epiprocess__as_slide_computation__cant_convert_catchall") + class = "epiprocess__as_slide_computation__cant_convert_catchall" + ) }) diff --git a/vignettes/advanced.Rmd b/vignettes/advanced.Rmd index 812cb711..567975a5 100644 --- a/vignettes/advanced.Rmd +++ b/vignettes/advanced.Rmd @@ -88,7 +88,8 @@ library(dplyr) edf <- tibble( geo_value = rep(c("ca", "fl", "pa"), each = 3), time_value = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"), - by = "day"), length.out = length(geo_value)), + by = "day" + ), length.out = length(geo_value)), x = 1:length(geo_value) + 0.01 * rnorm(length(geo_value)), ) %>% as_epi_df() @@ -157,8 +158,10 @@ object returned by `epi_slide()` has a list column containing the slide values. ```{r} edf2 <- edf %>% group_by(geo_value) %>% - epi_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), - before = 1, as_list_col = TRUE) %>% + epi_slide( + a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), + before = 1, as_list_col = TRUE + ) %>% ungroup() class(edf2$a) @@ -176,8 +179,10 @@ slide computation (here `x_2dav` and `x_2dma`) separated by "_". ```{r} edf %>% group_by(geo_value) %>% - epi_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), - before = 1, as_list_col = FALSE) %>% + epi_slide( + a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), + before = 1, as_list_col = FALSE + ) %>% ungroup() ``` @@ -187,8 +192,10 @@ the prefix associated with list column name, in naming the unnested columns. ```{r} edf %>% group_by(geo_value) %>% - epi_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), - before = 1, as_list_col = FALSE, names_sep = NULL) %>% + epi_slide( + a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), + before = 1, as_list_col = FALSE, names_sep = NULL + ) %>% ungroup() ``` @@ -197,24 +204,30 @@ order to make the result size stable, just like the case for atomic values. 
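(Size stability here means one output row per input row. A quick sketch of the
check, assuming the same `edf` as above:)

stopifnot(
  nrow(edf %>%
    epi_slide(
      a = data.frame(x_2dav = mean(x), x_2dma = mad(x)),
      before = 1, as_list_col = FALSE, names_sep = NULL
    )) == nrow(edf)
)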
```{r} edf %>% - epi_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), - before = 1, as_list_col = FALSE, names_sep = NULL) + epi_slide( + a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), + before = 1, as_list_col = FALSE, names_sep = NULL + ) ``` ```{r, include = FALSE} # More checks (not included) edf %>% - epi_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), - ref_time_values = as.Date("2020-06-02"), - before = 1, as_list_col = FALSE, names_sep = NULL) + epi_slide( + a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), + ref_time_values = as.Date("2020-06-02"), + before = 1, as_list_col = FALSE, names_sep = NULL + ) edf %>% mutate(version = time_value) %>% as_epi_archive() %>% group_by(geo_value) %>% - epix_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), - ref_time_values = as.Date("2020-06-02"), - before = 1, as_list_col = FALSE, names_sep = NULL) %>% + epix_slide( + a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), + ref_time_values = as.Date("2020-06-02"), + before = 1, as_list_col = FALSE, names_sep = NULL + ) %>% ungroup() ``` @@ -241,11 +254,14 @@ edf %>% obj <- lm(y ~ x, data = d) return( as.data.frame( - predict(obj, newdata = d %>% - group_by(geo_value) %>% - filter(time_value == max(time_value)), - interval = "prediction", level = 0.9) - )) + predict(obj, + newdata = d %>% + group_by(geo_value) %>% + filter(time_value == max(time_value)), + interval = "prediction", level = 0.9 + ) + ) + ) }, before = 1, new_col_name = "fc", names_sep = NULL) ``` @@ -303,17 +319,18 @@ x <- y1 %>% version = issue, percent_cli = value ) %>% - as_epi_archive(compactify=FALSE) + as_epi_archive(compactify = FALSE) # mutating merge operation: -x$merge(y2 %>% - select(geo_value, time_value, - version = issue, - case_rate_7d_av = value - ) %>% - as_epi_archive(compactify=FALSE), +x$merge( + y2 %>% + select(geo_value, time_value, + version = issue, + case_rate_7d_av = value + ) %>% + as_epi_archive(compactify = FALSE), sync = "locf", - compactify=FALSE + compactify = FALSE ) ``` @@ -323,9 +340,8 @@ library(ggplot2) theme_set(theme_bw()) x <- archive_cases_dv_subset$DT %>% - filter(geo_value %in% c("ca","fl")) %>% + filter(geo_value %in% c("ca", "fl")) %>% as_epi_archive(compactify = FALSE) - ``` Next, we extend the ARX function to handle multiple geo values, since in the @@ -347,28 +363,35 @@ prob_arx_args <- function(lags = c(0, 7, 14), symmetrize = TRUE, intercept = FALSE, nonneg = TRUE) { - return(list(lags = lags, - ahead = ahead, - min_train_window = min_train_window, - lower_level = lower_level, - upper_level = upper_level, - symmetrize = symmetrize, - intercept = intercept, - nonneg = nonneg)) + return(list( + lags = lags, + ahead = ahead, + min_train_window = min_train_window, + lower_level = lower_level, + upper_level = upper_level, + symmetrize = symmetrize, + intercept = intercept, + nonneg = nonneg + )) } prob_arx <- function(x, y, geo_value, time_value, args = prob_arx_args()) { # Return NA if insufficient training data if (length(y) < args$min_train_window + max(args$lags) + args$ahead) { - return(data.frame(geo_value = unique(geo_value), # Return geo value! - point = NA, lower = NA, upper = NA)) + return(data.frame( + geo_value = unique(geo_value), # Return geo value! 
+ point = NA, lower = NA, upper = NA + )) } # Set up x, y, lags list - if (!missing(x)) x <- data.frame(x, y) - else x <- data.frame(y) + if (!missing(x)) { + x <- data.frame(x, y) + } else { + x <- data.frame(y) + } if (!is.list(args$lags)) args$lags <- list(args$lags) - args$lags = rep(args$lags, length.out = ncol(x)) + args$lags <- rep(args$lags, length.out = ncol(x)) # Build features and response for the AR model, and then fit it dat <- @@ -377,19 +400,24 @@ prob_arx <- function(x, y, geo_value, time_value, args = prob_arx_args()) { mutate(name = paste0("x", 1:nrow(.))) %>% # One list element for each lagged feature pmap(function(i, lag, name) { - tibble(geo_value = geo_value, - time_value = time_value + lag, # Shift back - !!name := x[,i]) + tibble( + geo_value = geo_value, + time_value = time_value + lag, # Shift back + !!name := x[, i] + ) }) %>% # One list element for the response vector c(list( - tibble(geo_value = geo_value, - time_value = time_value - args$ahead, # Shift forward - y = y))) %>% + tibble( + geo_value = geo_value, + time_value = time_value - args$ahead, # Shift forward + y = y + ) + )) %>% # Combine them together into one data frame reduce(full_join, by = c("geo_value", "time_value")) %>% arrange(time_value) - if (args$intercept) dat$x0 = rep(1, nrow(dat)) + if (args$intercept) dat$x0 <- rep(1, nrow(dat)) obj <- lm(y ~ . + 0, data = select(dat, -geo_value, -time_value)) # Use LOCF to fill NAs in the latest feature values (do this by geo value) @@ -398,10 +426,10 @@ prob_arx <- function(x, y, geo_value, time_value, args = prob_arx_args()) { dat[, (cols) := nafill(.SD, type = "locf"), .SDcols = cols, by = "geo_value"] # Make predictions - test_time_value = max(time_value) + test_time_value <- max(time_value) point <- predict(obj, newdata = dat %>% - dplyr::group_by(geo_value) %>% - dplyr::filter(time_value == test_time_value)) + dplyr::group_by(geo_value) %>% + dplyr::filter(time_value == test_time_value)) # Compute bands r <- residuals(obj) @@ -412,12 +440,14 @@ prob_arx <- function(x, y, geo_value, time_value, args = prob_arx_args()) { # Clip at zero if we need to, then return if (args$nonneg) { - point = pmax(point, 0) - lower = pmax(lower, 0) - upper = pmax(upper, 0) + point <- pmax(point, 0) + lower <- pmax(lower, 0) + upper <- pmax(upper, 0) } - return(data.frame(geo_value = unique(geo_value), # Return geo value! - point = point, lower = lower, upper = upper)) + return(data.frame( + geo_value = unique(geo_value), # Return geo value! + point = point, lower = lower, upper = upper + )) } ``` @@ -428,44 +458,57 @@ data. 
# Latest snapshot of data, and forecast dates x_latest <- epix_as_of(x, max_version = max(x$DT$version)) fc_time_values <- seq(as.Date("2020-08-01"), - as.Date("2021-11-30"), - by = "1 month") + as.Date("2021-11-30"), + by = "1 month" +) # Simple function to produce forecasts k weeks ahead k_week_ahead <- function(x, ahead = 7, as_of = TRUE) { if (as_of) { x %>% - epix_slide(fc = prob_arx(percent_cli, case_rate_7d_av, geo_value, time_value, - args = prob_arx_args(ahead = ahead)), - before = 119, ref_time_values = fc_time_values) %>% - mutate(target_date = time_value + ahead, as_of = TRUE, - geo_value = fc_geo_value) - } - else { + epix_slide( + fc = prob_arx(percent_cli, case_rate_7d_av, geo_value, time_value, + args = prob_arx_args(ahead = ahead) + ), + before = 119, ref_time_values = fc_time_values + ) %>% + mutate( + target_date = time_value + ahead, as_of = TRUE, + geo_value = fc_geo_value + ) + } else { x_latest %>% - epi_slide(fc = prob_arx(percent_cli, case_rate_7d_av, geo_value, time_value, - args = prob_arx_args(ahead = ahead)), - before = 119, ref_time_values = fc_time_values) %>% + epi_slide( + fc = prob_arx(percent_cli, case_rate_7d_av, geo_value, time_value, + args = prob_arx_args(ahead = ahead) + ), + before = 119, ref_time_values = fc_time_values + ) %>% mutate(target_date = time_value + ahead, as_of = FALSE) } } # Generate the forecasts, and bind them together -fc <- bind_rows(k_week_ahead(x, ahead = 7, as_of = TRUE), - k_week_ahead(x, ahead = 14, as_of = TRUE), - k_week_ahead(x, ahead = 21, as_of = TRUE), - k_week_ahead(x, ahead = 28, as_of = TRUE), - k_week_ahead(x, ahead = 7, as_of = FALSE), - k_week_ahead(x, ahead = 14, as_of = FALSE), - k_week_ahead(x, ahead = 21, as_of = FALSE), - k_week_ahead(x, ahead = 28, as_of = FALSE)) +fc <- bind_rows( + k_week_ahead(x, ahead = 7, as_of = TRUE), + k_week_ahead(x, ahead = 14, as_of = TRUE), + k_week_ahead(x, ahead = 21, as_of = TRUE), + k_week_ahead(x, ahead = 28, as_of = TRUE), + k_week_ahead(x, ahead = 7, as_of = FALSE), + k_week_ahead(x, ahead = 14, as_of = FALSE), + k_week_ahead(x, ahead = 21, as_of = FALSE), + k_week_ahead(x, ahead = 28, as_of = FALSE) +) # Plot them, on top of latest COVID-19 case rates ggplot(fc, aes(x = target_date, group = time_value, fill = as_of)) + geom_ribbon(aes(ymin = fc_lower, ymax = fc_upper), alpha = 0.4) + - geom_line(data = x_latest, aes(x = time_value, y = case_rate_7d_av), - inherit.aes = FALSE, color = "gray50") + - geom_line(aes(y = fc_point)) + geom_point(aes(y = fc_point), size = 0.5) + + geom_line( + data = x_latest, aes(x = time_value, y = case_rate_7d_av), + inherit.aes = FALSE, color = "gray50" + ) + + geom_line(aes(y = fc_point)) + + geom_point(aes(y = fc_point), size = 0.5) + geom_vline(aes(xintercept = time_value), linetype = 2, alpha = 0.5) + facet_grid(vars(geo_value), vars(as_of), scales = "free") + scale_x_date(minor_breaks = "month", date_labels = "%b %y") + diff --git a/vignettes/compactify.Rmd b/vignettes/compactify.Rmd index 8ad3d1cd..3e97b6b9 100644 --- a/vignettes/compactify.Rmd +++ b/vignettes/compactify.Rmd @@ -35,7 +35,7 @@ library(dplyr) dt <- archive_cases_dv_subset$DT locf_omitted <- as_epi_archive(dt) -locf_included <- as_epi_archive(dt,compactify = FALSE) +locf_included <- as_epi_archive(dt, compactify = FALSE) head(locf_omitted$DT) head(locf_included$DT) @@ -46,10 +46,10 @@ LOCF-redundant values can mar the performance of dataset operations. As the colu `percent_cli` column for comparing performance. 
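As a rough check (a sketch, not part of the original vignette; it assumes the `locf_included` and `locf_omitted` archives built above), the share of LOCF-redundant rows can be estimated by comparing row counts directly:

```{r}
# Rows in the uncompactified archive: every reported version update
n_all <- nrow(locf_included$DT)
# Rows after compactify: only updates that changed at least one value
n_kept <- nrow(locf_omitted$DT)
# Fraction of version updates that carried no new information
(n_all - n_kept) / n_all
```

The exact fraction depends on the dataset, but for signals dominated by carry-forward reporting it can be substantial, which is what motivates the performance comparison below.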
```{r} -dt2 <- select(dt,-percent_cli) +dt2 <- select(dt, -percent_cli) -locf_included_2 <- as_epi_archive(dt2,compactify=FALSE) -locf_omitted_2 <- as_epi_archive(dt2,compactify=TRUE) +locf_included_2 <- as_epi_archive(dt2, compactify = FALSE) +locf_omitted_2 <- as_epi_archive(dt2, compactify = TRUE) ``` In this example, a huge proportion of the original version update data were @@ -70,13 +70,13 @@ the LOCF values are omitted. # Performance of filtering iterate_filter <- function(my_ea) { for (i in 1:1000) { - filter(my_ea$DT,version >= as.Date("2020-01-01") + i) + filter(my_ea$DT, version >= as.Date("2020-01-01") + i) } } elapsed_time <- function(fx) c(system.time(fx))[[3]] -speed_test <- function(f,name) { +speed_test <- function(f, name) { data.frame( operation = name, locf = elapsed_time(f(locf_included_2)), @@ -84,8 +84,7 @@ speed_test <- function(f,name) { ) } -speeds <- speed_test(iterate_filter,"filter_1000x") - +speeds <- speed_test(iterate_filter, "filter_1000x") ``` We would also like to measure the speed of `epi_archive` methods. @@ -98,22 +97,22 @@ iterate_as_of <- function(my_ea) { } } -speeds <- rbind(speeds, speed_test(iterate_as_of,"as_of_1000x")) +speeds <- rbind(speeds, speed_test(iterate_as_of, "as_of_1000x")) # Performance of slide slide_median <- function(my_ea) { - my_ea$slide(median = median(case_rate_7d_av), before = 7) + my_ea$slide(median = median(case_rate_7d_av), before = 7) } -speeds <- rbind(speeds, speed_test(slide_median,"slide_median")) +speeds <- rbind(speeds, speed_test(slide_median, "slide_median")) ``` Here is a detailed performance comparison: ```{r} -speeds_tidy <- tidyr::gather(speeds,key="is_locf",value="time_in_s",locf,no_locf) +speeds_tidy <- tidyr::gather(speeds, key = "is_locf", value = "time_in_s", locf, no_locf) library(ggplot2) ggplot(speeds_tidy) + - geom_bar(aes(x=is_locf,y=time_in_s,fill=operation),stat = "identity") + geom_bar(aes(x = is_locf, y = time_in_s, fill = operation), stat = "identity") ``` diff --git a/vignettes/growth_rate.Rmd b/vignettes/growth_rate.Rmd index 4fb4eda5..abef646f 100644 --- a/vignettes/growth_rate.Rmd +++ b/vignettes/growth_rate.Rmd @@ -43,7 +43,7 @@ The data has 1,158 rows and 3 columns. data(jhu_csse_daily_subset) x <- jhu_csse_daily_subset %>% select(geo_value, time_value, cases = cases_7d_av) %>% - filter(geo_value %in% c("pa","ga") & time_value >= "2020-06-01") %>% + filter(geo_value %in% c("pa", "ga") & time_value >= "2020-06-01") %>% arrange(geo_value, time_value) %>% as_epi_df() ``` @@ -104,16 +104,20 @@ red) and below -1% (in blue), faceting by geo value. 
library(ggplot2) theme_set(theme_bw()) -upper = 0.01 -lower = -0.01 +upper <- 0.01 +lower <- -0.01 ggplot(x, aes(x = time_value, y = cases)) + - geom_tile(data = x %>% filter(cases_gr1 >= upper), - aes(x = time_value, y = 0, width = 7, height = Inf), - fill = 2, alpha = 0.08) + - geom_tile(data = x %>% filter(cases_gr1 <= lower), - aes(x = time_value, y = 0, width = 7, height = Inf), - fill = 4, alpha = 0.08) + + geom_tile( + data = x %>% filter(cases_gr1 >= upper), + aes(x = time_value, y = 0, width = 7, height = Inf), + fill = 2, alpha = 0.08 + ) + + geom_tile( + data = x %>% filter(cases_gr1 <= lower), + aes(x = time_value, y = 0, width = 7, height = Inf), + fill = 4, alpha = 0.08 + ) + geom_line() + facet_wrap(vars(geo_value), scales = "free_y") + scale_x_date(minor_breaks = "month", date_labels = "%b %y") + @@ -128,7 +132,7 @@ ggplot(x, aes(x = time_value, y = cases_gr1)) + geom_line(aes(col = geo_value)) + geom_hline(yintercept = upper, linetype = 2, col = 2) + geom_hline(yintercept = lower, linetype = 2, col = 4) + - scale_color_manual(values = c(3,6)) + + scale_color_manual(values = c(3, 6)) + scale_x_date(minor_breaks = "month", date_labels = "%b %y") + labs(x = "Date", y = "Growth rate", col = "State") ``` @@ -154,15 +158,18 @@ x <- x %>% mutate(cases_gr2 = growth_rate(time_value, cases, method = "linear_reg")) x %>% - pivot_longer(cols = starts_with("cases_gr"), - names_to = "method", - values_to = "gr") %>% + pivot_longer( + cols = starts_with("cases_gr"), + names_to = "method", + values_to = "gr" + ) %>% mutate(method = recode(method, - cases_gr1 = "rel_change", - cases_gr2 = "linear_reg")) %>% + cases_gr1 = "rel_change", + cases_gr2 = "linear_reg" + )) %>% ggplot(aes(x = time_value, y = gr)) + geom_line(aes(col = method)) + - scale_color_manual(values = c(2,4)) + + scale_color_manual(values = c(2, 4)) + facet_wrap(vars(geo_value), scales = "free_y", ncol = 1) + scale_x_date(minor_breaks = "month", date_labels = "%b %y") + labs(x = "Date", y = "Growth rate", col = "Method") @@ -183,20 +190,25 @@ details.) ```{r, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 7} x <- x %>% group_by(geo_value) %>% - mutate(cases_gr3 = growth_rate(time_value, cases, method = "smooth_spline"), - cases_gr4 = growth_rate(time_value, cases, method = "trend_filter")) + mutate( + cases_gr3 = growth_rate(time_value, cases, method = "smooth_spline"), + cases_gr4 = growth_rate(time_value, cases, method = "trend_filter") + ) x %>% select(geo_value, time_value, cases_gr3, cases_gr4) %>% - pivot_longer(cols = starts_with("cases_gr"), - names_to = "method", - values_to = "gr") %>% + pivot_longer( + cols = starts_with("cases_gr"), + names_to = "method", + values_to = "gr" + ) %>% mutate(method = recode(method, - cases_gr3 = "smooth_spline", - cases_gr4 = "trend_filter")) %>% + cases_gr3 = "smooth_spline", + cases_gr4 = "trend_filter" + )) %>% ggplot(aes(x = time_value, y = gr)) + geom_line(aes(col = method)) + - scale_color_manual(values = c(3,6)) + + scale_color_manual(values = c(3, 6)) + facet_wrap(vars(geo_value), scales = "free_y", ncol = 1) + scale_x_date(minor_breaks = "month", date_labels = "%b %y") + labs(x = "Date", y = "Growth rate", col = "Method") @@ -227,41 +239,57 @@ the call to `growth_rate()`. 
```{r, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 7} x <- x %>% group_by(geo_value) %>% - mutate(cases_gr5 = growth_rate(time_value, cases, method = "rel_change", - log_scale = TRUE), - cases_gr6 = growth_rate(time_value, cases, method = "linear_reg", - log_scale = TRUE), - cases_gr7 = growth_rate(time_value, cases, method = "smooth_spline", - log_scale = TRUE), - cases_gr8 = growth_rate(time_value, cases, method = "trend_filter", - log_scale = TRUE)) + mutate( + cases_gr5 = growth_rate(time_value, cases, + method = "rel_change", + log_scale = TRUE + ), + cases_gr6 = growth_rate(time_value, cases, + method = "linear_reg", + log_scale = TRUE + ), + cases_gr7 = growth_rate(time_value, cases, + method = "smooth_spline", + log_scale = TRUE + ), + cases_gr8 = growth_rate(time_value, cases, + method = "trend_filter", + log_scale = TRUE + ) + ) x %>% select(geo_value, time_value, cases_gr5, cases_gr6) %>% - pivot_longer(cols = starts_with("cases_gr"), - names_to = "method", - values_to = "gr") %>% + pivot_longer( + cols = starts_with("cases_gr"), + names_to = "method", + values_to = "gr" + ) %>% mutate(method = recode(method, - cases_gr5 = "rel_change_log", - cases_gr6 = "linear_reg_log")) %>% + cases_gr5 = "rel_change_log", + cases_gr6 = "linear_reg_log" + )) %>% ggplot(aes(x = time_value, y = gr)) + geom_line(aes(col = method)) + - scale_color_manual(values = c(2,4)) + + scale_color_manual(values = c(2, 4)) + facet_wrap(vars(geo_value), scales = "free_y", ncol = 1) + scale_x_date(minor_breaks = "month", date_labels = "%b %y") + labs(x = "Date", y = "Growth rate", col = "Method") x %>% select(geo_value, time_value, cases_gr7, cases_gr8) %>% - pivot_longer(cols = starts_with("cases_gr"), - names_to = "method", - values_to = "gr") %>% + pivot_longer( + cols = starts_with("cases_gr"), + names_to = "method", + values_to = "gr" + ) %>% mutate(method = recode(method, - cases_gr7 = "smooth_spline_log", - cases_gr8 = "trend_filter_log")) %>% + cases_gr7 = "smooth_spline_log", + cases_gr8 = "trend_filter_log" + )) %>% ggplot(aes(x = time_value, y = gr)) + geom_line(aes(col = method)) + - scale_color_manual(values = c(3,6)) + + scale_color_manual(values = c(3, 6)) + facet_wrap(vars(geo_value), scales = "free_y", ncol = 1) + scale_x_date(minor_breaks = "month", date_labels = "%b %y") + labs(x = "Date", y = "Growth rate", col = "Method") From c02d2013e71fe78659bea0d29683c68f8ff55ca5 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Wed, 17 Jan 2024 13:52:17 -0800 Subject: [PATCH 3/3] repo: ignore styling commit --- .git-blame-ignore-revs | 1 + 1 file changed, 1 insertion(+) create mode 100644 .git-blame-ignore-revs diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs new file mode 100644 index 00000000..a3d36061 --- /dev/null +++ b/.git-blame-ignore-revs @@ -0,0 +1 @@ +c65876078a6f9525952b305eaea2fca003adf907 \ No newline at end of file
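A usage note on the `.git-blame-ignore-revs` file added above (standard git behavior, not specific to this repo): git does not consult the file automatically. Since git 2.23, `git blame --ignore-revs-file .git-blame-ignore-revs <file>` skips the listed commit for a single invocation, and `git config blame.ignoreRevsFile .git-blame-ignore-revs` enables it once per clone, after which `git blame` attributes lines to their last substantive change rather than to the styling commit c65876078a6f9525952b305eaea2fca003adf907.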