diff --git a/.Rbuildignore b/.Rbuildignore index 0582014a..cb0b7ed2 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -16,3 +16,5 @@ ^.lintr$ ^DEVELOPMENT.md$ man-roxygen +^.venv$ +^sandbox.R$ \ No newline at end of file diff --git a/.gitignore b/.gitignore index de393a31..8dc001be 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,4 @@ docs renv/ renv.lock .Rprofile +sandbox.R \ No newline at end of file diff --git a/DESCRIPTION b/DESCRIPTION index f03a92ee..790b36a5 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,10 +1,10 @@ Type: Package Package: epiprocess Title: Tools for basic signal processing in epidemiology -Version: 0.8.0 +Version: 0.9.0 Authors@R: c( person("Jacob", "Bien", role = "ctb"), - person("Logan", "Brooks", email = "lcbrooks@andrew.cmu.edu", role = c("aut", "cre")), + person("Logan", "Brooks", , "lcbrooks@andrew.cmu.edu", role = c("aut", "cre")), person("Rafael", "Catoia", role = "ctb"), person("Nat", "DeFries", role = "ctb"), person("Daniel", "McDonald", role = "aut"), @@ -15,23 +15,30 @@ Authors@R: c( person("Evan", "Ray", role = "aut"), person("Dmitry", "Shemetov", role = "ctb"), person("Ryan", "Tibshirani", role = "aut"), - person("Lionel", "Henry", role = "ctb", comment = "Author of included rlang fragments"), - person("Hadley", "Wickham", role = "ctb", comment = "Author of included rlang fragments"), - person("Posit", role = "cph", comment = "Copyright holder of included rlang fragments") + person("Lionel", "Henry", role = "ctb", + comment = "Author of included rlang fragments"), + person("Hadley", "Wickham", role = "ctb", + comment = "Author of included rlang fragments"), + person("Posit", role = "cph", + comment = "Copyright holder of included rlang fragments") ) -Description: This package introduces a common data structure for epidemiological - data reported by location and time, provides another data structure to - work with revisions to these data sets over time, and offers associated - utilities to perform basic signal processing tasks. +Description: This package introduces a common data structure for + epidemiological data reported by location and time, provides another + data structure to work with revisions to these data sets over time, + and offers associated utilities to perform basic signal processing + tasks. 
License: MIT + file LICENSE -Copyright: file inst/COPYRIGHTS +URL: https://cmu-delphi.github.io/epiprocess/ +Depends: + R (>= 3.6) Imports: checkmate, cli, data.table, - dplyr (>= 1.0.0), + dplyr (>= 1.0.8), genlasso, ggplot2, + glue, lifecycle (>= 1.0.1), lubridate, magrittr, @@ -43,7 +50,8 @@ Imports: tidyselect (>= 1.2.0), tsibble, utils, - vctrs + vctrs, + waldo Suggests: covidcast, devtools, @@ -57,22 +65,22 @@ VignetteBuilder: knitr Remotes: cmu-delphi/epidatr, - reconverse/outbreaks, - glmgen/genlasso + glmgen/genlasso, + reconverse/outbreaks Config/testthat/edition: 3 +Config/testthat/parallel: true +Copyright: file inst/COPYRIGHTS Encoding: UTF-8 LazyData: true Roxygen: list(markdown = TRUE) RoxygenNote: 7.3.2 -Depends: - R (>= 2.10) -URL: https://cmu-delphi.github.io/epiprocess/ Collate: 'archive.R' 'autoplot.R' 'correlation.R' 'data.R' 'epi_df.R' + 'epi_df_forbidden_methods.R' 'epiprocess.R' 'group_by_epi_df_methods.R' 'methods-epi_archive.R' @@ -82,6 +90,7 @@ Collate: 'methods-epi_df.R' 'outliers.R' 'reexports.R' + 'revision_analysis.R' 'slide.R' 'utils.R' 'utils_pipe.R' diff --git a/NAMESPACE b/NAMESPACE index f8610226..904b2d24 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,6 +2,13 @@ S3method("[",epi_df) S3method("names<-",epi_df) +S3method(Summary,epi_df) +S3method(arrange_canonical,default) +S3method(arrange_canonical,epi_df) +S3method(arrange_col_canonical,default) +S3method(arrange_col_canonical,epi_df) +S3method(arrange_row_canonical,default) +S3method(arrange_row_canonical,epi_df) S3method(as_epi_df,data.frame) S3method(as_epi_df,epi_df) S3method(as_epi_df,tbl_df) @@ -11,6 +18,7 @@ S3method(as_tsibble,epi_df) S3method(autoplot,epi_df) S3method(clone,epi_archive) S3method(clone,grouped_epi_archive) +S3method(complete,epi_df) S3method(dplyr_col_modify,col_modify_recorder_df) S3method(dplyr_col_modify,epi_df) S3method(dplyr_reconstruct,epi_df) @@ -24,6 +32,7 @@ S3method(group_by,epi_df) S3method(group_by,grouped_epi_archive) S3method(group_by_drop_default,grouped_epi_archive) S3method(group_modify,epi_df) +S3method(group_vars,grouped_epi_archive) S3method(groups,grouped_epi_archive) S3method(guess_period,Date) S3method(guess_period,POSIXt) @@ -32,12 +41,12 @@ S3method(key_colnames,data.frame) S3method(key_colnames,default) S3method(key_colnames,epi_archive) S3method(key_colnames,epi_df) +S3method(mean,epi_df) S3method(next_after,Date) S3method(next_after,integer) S3method(print,epi_archive) S3method(print,epi_df) S3method(print,grouped_epi_archive) -S3method(select,epi_df) S3method(summary,epi_df) S3method(ungroup,epi_df) S3method(ungroup,grouped_epi_archive) @@ -45,11 +54,13 @@ S3method(unnest,epi_df) export("%>%") export(archive_cases_dv_subset) export(arrange) +export(arrange_canonical) export(as_epi_archive) export(as_epi_df) export(as_tsibble) export(autoplot) export(clone) +export(complete) export(detect_outlr) export(detect_outlr_rm) export(detect_outlr_stl) @@ -64,8 +75,10 @@ export(epix_merge) export(epix_slide) export(epix_truncate_versions_after) export(filter) +export(full_seq) export(geo_column_names) export(group_by) +export(group_epi_df) export(group_modify) export(growth_rate) export(guess_period) @@ -79,7 +92,9 @@ export(new_epi_df) export(next_after) export(relocate) export(rename) +export(revision_summary) export(slice) +export(sum_groups_epi_df) export(time_column_names) export(ungroup) export(unnest) @@ -109,6 +124,7 @@ importFrom(checkmate,vname) importFrom(cli,cat_line) importFrom(cli,cli_abort) importFrom(cli,cli_inform) +importFrom(cli,cli_li) 
 importFrom(cli,cli_vec)
 importFrom(cli,cli_warn)
 importFrom(cli,format_message)
@@ -126,8 +142,11 @@ importFrom(data.table,set)
 importFrom(data.table,setDF)
 importFrom(data.table,setkeyv)
 importFrom(dplyr,"%>%")
+importFrom(dplyr,across)
+importFrom(dplyr,all_of)
 importFrom(dplyr,arrange)
 importFrom(dplyr,bind_rows)
+importFrom(dplyr,c_across)
 importFrom(dplyr,dplyr_col_modify)
 importFrom(dplyr,dplyr_reconstruct)
 importFrom(dplyr,dplyr_row_slice)
@@ -135,16 +154,23 @@ importFrom(dplyr,everything)
 importFrom(dplyr,filter)
 importFrom(dplyr,group_by)
 importFrom(dplyr,group_by_drop_default)
+importFrom(dplyr,group_map)
 importFrom(dplyr,group_modify)
 importFrom(dplyr,group_vars)
 importFrom(dplyr,groups)
 importFrom(dplyr,if_all)
 importFrom(dplyr,if_any)
+importFrom(dplyr,if_else)
+importFrom(dplyr,lag)
 importFrom(dplyr,mutate)
+importFrom(dplyr,near)
+importFrom(dplyr,pick)
+importFrom(dplyr,pull)
 importFrom(dplyr,relocate)
 importFrom(dplyr,rename)
 importFrom(dplyr,select)
 importFrom(dplyr,slice)
+importFrom(dplyr,summarize)
 importFrom(dplyr,tibble)
 importFrom(dplyr,ungroup)
 importFrom(ggplot2,autoplot)
@@ -176,6 +202,7 @@ importFrom(rlang,is_formula)
 importFrom(rlang,is_function)
 importFrom(rlang,is_missing)
 importFrom(rlang,is_quosure)
+importFrom(rlang,list2)
 importFrom(rlang,missing_arg)
 importFrom(rlang,new_function)
 importFrom(rlang,quo_get_expr)
@@ -194,6 +221,8 @@ importFrom(stats,median)
 importFrom(tibble,as_tibble)
 importFrom(tibble,new_tibble)
 importFrom(tibble,validate_tibble)
+importFrom(tidyr,complete)
+importFrom(tidyr,full_seq)
 importFrom(tidyr,unnest)
 importFrom(tidyselect,any_of)
 importFrom(tidyselect,eval_select)
diff --git a/NEWS.md b/NEWS.md
index e1c6a3ee..ee04b7f3 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -4,10 +4,74 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat
 
 # epiprocess 0.9
 
+## Breaking changes
+
+- `epi_slide` interface has major breaking changes (migration sketch below).
+  - All arguments are now dot-prefixed to be more consistent with tidyverse
+    style for functions that allow tidyeval.
+  - The `before/after` arguments have been replaced with the `.window_size` and
+    `.align` arguments.
+  - `names_sep` has been removed. If you return data frames from your
+    computations:
+    - without a name, they will be unpacked into separate columns without name
+      prefixes
+    - with a name, they will become packed data.frame-class columns (see
+      `tidyr::pack`).
+  - `as_list_col` has been removed. You can now directly return a list from your
+    slide computations instead. If you were using `as_list_col=TRUE`, you will
+    need to wrap your output in a list.
+  - Ungrouped slides are no longer allowed in `epi_slide`. If you used this for
+    geographic aggregation up to the national level, consider using
+    `sum_groups_epi_df`.
+  - Added `sum_groups_epi_df` to allow aggregation across key columns prior to
+    sliding.
+- `epix_slide` interface has major breaking changes.
+  - All arguments are now dot-prefixed to be more consistent with tidyverse
+    style for functions that allow tidyeval.
+  - `names_sep` has been removed. If you return data frames from your
+    computations:
+    - without a name, they will be unpacked into separate columns without name
+      prefixes
+    - with a name, they will become packed data.frame-class columns (see
+      `tidyr::pack`).
+  - `as_list_col` has been removed. You can now directly return a list from your
+    slide computations instead. If you were using `as_list_col=TRUE`, you will
+    need to wrap your output in a list.
+- `as_epi_df()` now checks that every group has unique time values and errors if
+  this is not the case. The same check is performed at the beginning of
+  `epi_slide()`. This check is currently not enforced in dplyr operations (like
+  for joins, mutates, or select), but we plan to add it in the future.
+- `as_epi_df()` or `as_epi_archive()` no longer accept `additional_metadata`.
+  Use the new `other_keys` arg to specify additional key columns, such as age
+  group columns or other demographic breakdowns. Miscellaneous metadata are no
+  longer handled by `epiprocess`, but you can use R's built-in `attr<-` instead
+  for a similar feature.
+
+## Improvements
+
+- Added `complete.epi_df`, which fills in missing values in an `epi_df` with
+  `NA`s. Uses `tidyr::complete` underneath and preserves `epi_df` metadata
+  (sketch below).
+- Added `revision_summary()` to provide basic revision information for
+  `epi_archive`s out of the box (sketch below). (#492)
+
+## Bug fixes
+
+- Fix `epi_slide_opt` (and related functions) to correctly handle `before=Inf`.
+  Also allow multiple columns specified as a list of strings.
+- Disallow `after=Inf` in slide functions, since it doesn't seem like a likely
+  use case and complicates code.
+
 # epiprocess 0.8
 
 ## Breaking changes
 
+- `epi_df`s are now more strict about what types they allow in the time column.
+  Namely, we are explicit about only supporting `Date` at the daily and weekly
+  cadence and generic integer types (for yearly cadence).
+- `epi_slide` `before` and `after` arguments now require the user to specify
+  time units in certain cases. The `time_step` argument has been removed.
+- `epix_slide` `before` argument now defaults to `Inf`, and requires the user to
+  specify units in some cases. The `time_step` argument has been removed.
 - `detect_outlr_stl(seasonal_period = NULL)` is no longer accepted. Use
   `detect_outlr_stl(seasonal_period = , seasonal_as_residual = TRUE)` instead.
   See `?detect_outlr_stl` for more details.
@@ -50,6 +114,12 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat
   are similar functions for `geo` and `version`).
 - Fixed bug where `epix_slide_ref_time_values_default()` on datetimes would
   output a huge number of `ref_time_values` spaced apart by mere seconds.
+- In `epi_slide()` and `epix_slide()`:
+  - Multiple "data-masking" tidy evaluation expressions can be passed in via
+    `...`, rather than just one.
+  - Additional tidy evaluation features from `dplyr::mutate` are supported:
+    `!!name_var := value`, unnamed expressions evaluating to data frames, and
+    `= NULL`; see `?epi_slide` for more details.
 
 ## Cleanup
 
@@ -58,17 +128,6 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat
   of `epi_df`s to let `{epipredict}` work more easily with other libraries
   (#471).
 - Removed some external package dependencies.
 
-## Breaking Changes
-
-- `epi_df`'s are now more strict about what types they allow in the time column.
-  Namely, we are explicit about only supporting `Date` at the daily and weekly
-  cadence and generic integer types (for yearly cadence).
-- `epi_slide` `before` and `after` arguments are now require the user to
-  specific time units in certain cases. The `time_step` argument has been
-  removed.
-- `epix_slide` `before` argument now defaults to `Inf`, and requires the user to
-  specify units in some cases. The `time_step` argument has been removed.
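
[Editorial note: a minimal migration sketch for the `epi_slide()` interface changes described in the 0.9 notes above. It assumes a hypothetical daily `epi_df` named `edf` with a `cases` column; the object and column names are illustrative and not part of this changeset.]

```r
library(dplyr)
library(epiprocess)

# 0.8-style call (no longer works in 0.9):
# edf %>%
#   group_by(geo_value) %>%
#   epi_slide(~ mean(.x$cases), before = 6, new_col_name = "cases_7dav")

# 0.9-style call: arguments are dot-prefixed, and `.window_size` (with the
# default right alignment, covering the reference time value and the 6
# values before it) replaces `before`/`after`. Named data-masking
# expressions replace `new_col_name`:
edf %>%
  group_by(geo_value) %>%
  epi_slide(cases_7dav = mean(cases), .window_size = 7)
```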
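[Editorial note: a sketch of the new `complete.epi_df` mentioned under Improvements, again using the hypothetical `edf`. `full_seq()` is re-exported from `tidyr`, per the NAMESPACE changes above; the grouped usage follows ordinary `tidyr::complete` semantics.]

```r
# Add explicit rows (with NA values) for any dates missing from each geo's
# series, while keeping the epi_df class and metadata intact:
edf %>%
  group_by(geo_value) %>%
  complete(time_value = full_seq(time_value, period = 1)) %>%
  ungroup()
```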
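[Editorial note: a sketch of `revision_summary()` (#492) on the bundled `archive_cases_dv_subset` archive. Selecting the value column by passing `percent_cli` is an assumption about the function's tidyselect interface, which this diff does not pin down.]

```r
# Summarize how much percent_cli values were revised across versions:
revision_summary(archive_cases_dv_subset, percent_cli)
```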
- # epiprocess 0.7.0 ## Breaking changes: diff --git a/R/archive.R b/R/archive.R index 052f2776..e877d397 100644 --- a/R/archive.R +++ b/R/archive.R @@ -49,7 +49,7 @@ validate_version_bound <- function(version_bound, x, na_ok = FALSE, if (!identical(class(version_bound), class(x[["version"]]))) { cli_abort( "{version_bound_arg} must have the same `class` vector as x$version, - which has a `class` of {paste(collapse = ' ', deparse(class(x$version)))}", + which has a `class` of {format_class_vec(class(x$version))}", class = "epiprocess__version_bound_mismatched_class" ) } @@ -179,7 +179,8 @@ NULL #' #' * `geo_type`: the type for the geo values. #' * `time_type`: the type for the time values. -#' * `additional_metadata`: list of additional metadata for the data archive. +#' * `other_keys`: any additional keys as a character vector. +#' Typical examples are "age" or sub-geographies. #' #' While this metadata is not protected, it is generally recommended to treat it #' as read-only, and to use the `epi_archive` methods to interact with the data @@ -209,10 +210,8 @@ NULL #' if the time type is not recognized. #' @param other_keys Character vector specifying the names of variables in `x` #' that should be considered key variables (in the language of `data.table`) -#' apart from "geo_value", "time_value", and "version". -#' @param additional_metadata List of additional metadata to attach to the -#' `epi_archive` object. The metadata will have the `geo_type` field; named -#' entries from the passed list or will be included as well. +#' apart from "geo_value", "time_value", and "version". Typical examples +#' are "age" or more granular geographies. #' @param compactify Optional; Boolean. `TRUE` will remove some #' redundant rows, `FALSE` will not, and missing or `NULL` will remove #' redundant rows, but issue a warning. See more information at `compactify`. @@ -240,6 +239,7 @@ NULL #' value of `clobberable_versions_start` does not fully trust these empty #' updates, and assumes that any version `>= max(x$version)` could be #' clobbered.) If `nrow(x) == 0`, then this argument is mandatory. +#' @param compactify_tol double. the tolerance used to detect approximate equality for compactification #' @return An `epi_archive` object. #' #' @importFrom data.table as.data.table key setkeyv @@ -292,18 +292,18 @@ new_epi_archive <- function( geo_type, time_type, other_keys, - additional_metadata, compactify, clobberable_versions_start, - versions_end) { + versions_end, + compactify_tol = .Machine$double.eps^0.5) { # Create the data table; if x was an un-keyed data.table itself, # then the call to as.data.table() will fail to set keys, so we # need to check this, then do it manually if needed key_vars <- c("geo_value", "time_value", other_keys, "version") - DT <- as.data.table(x, key = key_vars) # nolint: object_name_linter - if (!identical(key_vars, key(DT))) setkeyv(DT, cols = key_vars) + data_table <- as.data.table(x, key = key_vars) # nolint: object_name_linter + if (!identical(key_vars, key(data_table))) setkeyv(data_table, cols = key_vars) - if (anyDuplicated(DT, by = key(DT)) != 0L) { + if (anyDuplicated(data_table, by = key(data_table)) != 0L) { cli_abort("`x` must have one row per unique combination of the key variables. If you have additional key variables other than `geo_value`, `time_value`, and `version`, such as an age group column, please specify them in `other_keys`. 
@@ -313,38 +313,17 @@ new_epi_archive <- function(
     )
   }
 
-  # Checks to see if a value in a vector is LOCF
-  is_locf <- function(vec) { # nolint: object_usage_linter
-    dplyr::if_else(!is.na(vec) & !is.na(dplyr::lag(vec)),
-      vec == dplyr::lag(vec),
-      is.na(vec) & is.na(dplyr::lag(vec))
-    )
-  }
-
-  # LOCF is defined by a row where all values except for the version
-  # differ from their respective lag values
-
-  # Checks for LOCF's in a data frame
-  rm_locf <- function(df) {
-    dplyr::filter(df, if_any(c(everything(), -version), ~ !is_locf(.))) # nolint: object_usage_linter
-  }
-
-  # Keeps LOCF values, such as to be printed
-  keep_locf <- function(df) {
-    dplyr::filter(df, if_all(c(everything(), -version), ~ is_locf(.))) # nolint: object_usage_linter
-  }
-
+  nrow_before_compactify <- nrow(data_table)
   # Runs compactify on data frame
   if (is.null(compactify) || compactify == TRUE) {
-    elim <- keep_locf(DT)
-    DT <- rm_locf(DT) # nolint: object_name_linter
+    compactified <- apply_compactify(data_table, key_vars, compactify_tol)
   } else {
-    # Create empty data frame for nrow(elim) to be 0
-    elim <- tibble::tibble()
+    compactified <- data_table
  }
-
-  # Warns about redundant rows
-  if (is.null(compactify) && nrow(elim) > 0) {
+  # Warns about redundant rows if the number of rows decreased, and we didn't
+  # explicitly say to compactify
+  if (is.null(compactify) && nrow(compactified) < nrow_before_compactify) {
+    elim <- removed_by_compactify(data_table, key_vars, compactify_tol)
     warning_intro <- cli::format_inline(
       "Found rows that appear redundant based on
       last (version of each) observation carried forward;
@@ -366,10 +345,10 @@ new_epi_archive <- function(
 
   structure(
     list(
-      DT = DT,
+      DT = compactified,
       geo_type = geo_type,
       time_type = time_type,
-      additional_metadata = additional_metadata,
+      other_keys = other_keys,
       clobberable_versions_start = clobberable_versions_start,
       versions_end = versions_end
     ),
@@ -377,6 +356,63 @@ new_epi_archive <- function(
   )
 }
 
+#' Given a tibble as would be found in an `epi_archive`, remove duplicate entries.
+#' @description
+#' Works by shifting all rows except the version, then comparing values to see
+#' if they've changed. We need to arrange in ascending order, but note that
+#' we don't need to group, since at least one column other than version has
+#' changed, and so is kept.
+#' @keywords internal
+#' @importFrom dplyr filter
+apply_compactify <- function(df, keys, tolerance = .Machine$double.eps^.5) {
+  df %>%
+    arrange(!!!keys) %>%
+    filter(if_any(
+      c(everything(), -version), # all non-version columns
+      ~ !is_locf(., tolerance)
+    ))
+}
+
+#' Get the entries that `compactify` would remove
+#' @keywords internal
+#' @importFrom dplyr filter if_all everything
+removed_by_compactify <- function(df, keys, tolerance) {
+  df %>%
+    arrange(!!!keys) %>%
+    filter(if_all(
+      c(everything(), -version),
+      ~ is_locf(., tolerance)
+    )) # nolint: object_usage_linter
+}
+
+#' Checks to see if a value in a vector is LOCF
+#' @description
+#' LOCF means last observation carried forward. Lags the vector by 1, then
+#' compares it with itself. For doubles it uses float comparison via
+#' [`dplyr::near`], otherwise it uses equality. `NA`'s and `NaN`'s are
+#' considered equal to themselves and each other.
+#' @importFrom dplyr lag if_else near +#' @keywords internal +is_locf <- function(vec, tolerance) { # nolint: object_usage_linter + lag_vec <- dplyr::lag(vec) + if (typeof(vec) == "double") { + res <- if_else( + !is.na(vec) & !is.na(lag_vec), + near(vec, lag_vec, tol = tolerance), + is.na(vec) & is.na(lag_vec) + ) + return(res) + } else { + res <- if_else( + !is.na(vec) & !is.na(lag_vec), + vec == lag_vec, + is.na(vec) & is.na(lag_vec) + ) + return(res) + } +} + + #' `validate_epi_archive` ensures correctness of arguments fed to `as_epi_archive`. #' #' @rdname epi_archive @@ -385,7 +421,6 @@ new_epi_archive <- function( validate_epi_archive <- function( x, other_keys, - additional_metadata, compactify, clobberable_versions_start, versions_end) { @@ -396,9 +431,6 @@ validate_epi_archive <- function( if (any(c("geo_value", "time_value", "version") %in% other_keys)) { cli_abort("`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\".") } - if (any(names(additional_metadata) %in% c("geo_type", "time_type"))) { - cli_warn("`additional_metadata` names overlap with existing metadata fields \"geo_type\" or \"time_type\".") - } # Conduct checks and apply defaults for `compactify` assert_logical(compactify, len = 1, any.missing = FALSE, null.ok = TRUE) @@ -447,8 +479,7 @@ as_epi_archive <- function( x, geo_type = deprecated(), time_type = deprecated(), - other_keys = character(0L), - additional_metadata = list(), + other_keys = character(), compactify = NULL, clobberable_versions_start = NA, .versions_end = max_version_with_row_in(x), ..., @@ -480,11 +511,10 @@ as_epi_archive <- function( time_type <- guess_time_type(x$time_value) validate_epi_archive( - x, other_keys, additional_metadata, - compactify, clobberable_versions_start, versions_end + x, other_keys, compactify, clobberable_versions_start, versions_end ) new_epi_archive( - x, geo_type, time_type, other_keys, additional_metadata, + x, geo_type, time_type, other_keys, compactify, clobberable_versions_start, versions_end ) } @@ -513,7 +543,7 @@ print.epi_archive <- function(x, ..., class = TRUE, methods = TRUE) { c( ">" = if (class) "An `epi_archive` object, with metadata:", "i" = if (length(setdiff(key(x$DT), c("geo_value", "time_value", "version"))) > 0) { - "Non-standard DT keys: {setdiff(key(x$DT), c('geo_value', 'time_value', 'version'))}" + "Other DT keys: {setdiff(key(x$DT), c('geo_value', 'time_value', 'version'))}" }, "i" = if (nrow(x$DT) != 0L) { "Min/max time values: {min(x$DT$time_value)} / {max(x$DT$time_value)}" @@ -555,8 +585,8 @@ print.epi_archive <- function(x, ..., class = TRUE, methods = TRUE) { #' `...`. #' @param .drop As described in [`dplyr::group_by`]; determines treatment of #' factor columns. 
-#' @param x For `groups` or `ungroup`: a `grouped_epi_archive`; for -#' `is_grouped_epi_archive`: any object +#' @param x For `groups`, `group_vars`, or `ungroup`: a `grouped_epi_archive`; +#' for `is_grouped_epi_archive`: any object #' @param .tbl (For `group_by_drop_default`:) an `epi_archive` or #' `grouped_epi_archive` (`epi_archive` dispatches to the S3 default method; #' `grouped_epi_archive` dispatches its own S3 method) @@ -594,10 +624,10 @@ print.epi_archive <- function(x, ..., class = TRUE, methods = TRUE) { #' archive_cases_dv_subset %>% #' group_by(geo_value) %>% #' epix_slide( -#' f = ~ mean(.x$case_rate_7d_av), -#' before = 2, -#' ref_time_values = as.Date("2020-06-11") + 0:2, -#' new_col_name = "case_rate_3d_av" +#' .f = ~ mean(.x$case_rate_7d_av), +#' .before = 2, +#' .versions = as.Date("2020-06-11") + 0:2, +#' .new_col_name = "case_rate_3d_av" #' ) %>% #' ungroup() #' @@ -635,6 +665,11 @@ print.epi_archive <- function(x, ..., class = TRUE, methods = TRUE) { #' group_by(geo_value, age_group) %>% #' ungroup(age_group) #' +#' # To get the grouping variable names as a character vector: +#' toy_archive %>% +#' group_by(geo_value) %>% +#' group_vars() +#' #' # To get the grouping variable names as a `list` of `name`s (a.k.a. symbols): #' toy_archive %>% #' group_by(geo_value) %>% @@ -642,14 +677,15 @@ print.epi_archive <- function(x, ..., class = TRUE, methods = TRUE) { #' #' toy_archive %>% #' group_by(geo_value, age_group, .drop = FALSE) %>% -#' epix_slide(f = ~ sum(.x$value), before = 20) %>% +#' epix_slide(.f = ~ sum(.x$value), .before = 20) %>% #' ungroup() #' #' @importFrom dplyr group_by #' @export #' #' @aliases grouped_epi_archive -group_by.epi_archive <- function(.data, ..., .add = FALSE, .drop = dplyr::group_by_drop_default(.data)) { +group_by.epi_archive <- function(.data, ..., .add = FALSE, + .drop = dplyr::group_by_drop_default(.data)) { # `add` makes no difference; this is an ungrouped `epi_archive`. detailed_mutate <- epix_detailed_restricted_mutate(.data, ...) assert_logical(.drop) @@ -657,9 +693,9 @@ group_by.epi_archive <- function(.data, ..., .add = FALSE, .drop = dplyr::group_ grouping_cols <- as.list(detailed_mutate[["archive"]][["DT"]])[detailed_mutate[["request_names"]]] grouping_col_is_factor <- purrr::map_lgl(grouping_cols, is.factor) # ^ Use `as.list` to try to avoid any possibility of a deep copy. 
- if (!any(grouping_col_is_factor)) { + if (length(grouping_cols) != 0L && !any(grouping_col_is_factor)) { cli_warn( - "`.drop=FALSE` but there are no factor grouping columns; + "`.drop=FALSE` but none of the grouping columns are factors; did you mean to convert one of the columns to a factor beforehand?", class = "epiprocess__group_by_epi_archive__drop_FALSE_no_factors" ) @@ -667,10 +703,10 @@ group_by.epi_archive <- function(.data, ..., .add = FALSE, .drop = dplyr::group_ cli_warn( "`.drop=FALSE` but there are one or more non-factor grouping columns listed after a factor grouping column; this may produce groups with `NA`s for these - columns; see https://github.com/tidyverse/dplyr/issues/5369#issuecomment-683762553; + non-factor columns; see https://github.com/tidyverse/dplyr/issues/5369#issuecomment-683762553; depending on how you want completion to work, you might instead want to convert all grouping columns to factors beforehand, specify the non-factor grouping columns first, - or use `.drop=TRUE` and add a call to `tidyr::complete`.", + or use `.drop=TRUE` and add a call to `tidyr::complete()`.", class = "epiprocess__group_by_epi_archive__drop_FALSE_nonfactor_after_factor" ) } diff --git a/R/autoplot.R b/R/autoplot.R index 7443628b..eef5aa12 100644 --- a/R/autoplot.R +++ b/R/autoplot.R @@ -47,15 +47,15 @@ autoplot.epi_df <- function( .facet_by = c(".response", "other_keys", "all_keys", "geo_value", "all", "none"), .base_color = "#3A448F", .max_facets = Inf) { - .color_by <- match.arg(.color_by) - .facet_by <- match.arg(.facet_by) + .color_by <- rlang::arg_match(.color_by) + .facet_by <- rlang::arg_match(.facet_by) assert(anyInfinite(.max_facets), checkInt(.max_facets), combine = "or") assert_character(.base_color, len = 1) key_cols <- key_colnames(object) non_key_cols <- setdiff(names(object), key_cols) - geo_and_other_keys <- kill_time_value(key_cols) + geo_and_other_keys <- key_colnames(object, exclude = "time_value") # --- check for numeric variables allowed <- purrr::map_lgl(object[non_key_cols], is.numeric) diff --git a/R/correlation.R b/R/correlation.R index 5e9694c4..e86ad373 100644 --- a/R/correlation.R +++ b/R/correlation.R @@ -99,7 +99,7 @@ epi_cor <- function(x, var1, var2, dt1 = 0, dt2 = 0, shift_by = geo_value, # nol shift_by <- syms(names(eval_select(enquo(shift_by), x))) # Which method? - method <- match.arg(method) + method <- rlang::arg_match(method) # Perform time shifts, then compute appropriate correlations and return return(x %>% diff --git a/R/epi_df.R b/R/epi_df.R index 37f26b87..c8d052d9 100644 --- a/R/epi_df.R +++ b/R/epi_df.R @@ -81,10 +81,7 @@ #' #' An unrecognizable time type is labeled "custom". #' -#' @template epi_df-params -#' @rdname epi_df -#' -#' @export +#' @name epi_df #' @examples #' # Convert a `tsibble` that has county code as an extra key #' # Notice that county code should be a character string to preserve any leading zeroes @@ -130,7 +127,7 @@ #' dplyr::rename(geo_value = state, time_value = reported_date) %>% #' as_epi_df( #' as_of = "2020-06-03", -#' additional_metadata = list(other_keys = "pol") +#' other_keys = "pol" #' ) #' #' attr(ex2, "metadata") @@ -149,61 +146,87 @@ #' state = rep("MA", 6), #' pol = rep(c("blue", "swing", "swing"), each = 2) #' ) %>% -#' # the 2 extra keys we added have to be specified in the other_keys -#' # component of additional_metadata. 
-#'   as_epi_df(additional_metadata = list(other_keys = c("state", "pol")))
+#'   as_epi_df(other_keys = c("state", "pol"))
 #'
 #' attr(ex3, "metadata")
-new_epi_df <- function(x = tibble::tibble(), geo_type, time_type, as_of,
-                       additional_metadata = list()) {
+NULL
+
+#' @describeIn epi_df Lower-level constructor for `epi_df` object
+#' @order 2
+#' @param geo_type `r lifecycle::badge("deprecated")` in `as_epi_df()`, has no
+#'   effect; the geo value type is inferred from the location column and set to
+#'   "custom" if not recognized. In `new_epi_df()`, should be set to the same
+#'   value that would be inferred.
+#' @param time_type `r lifecycle::badge("deprecated")` in `as_epi_df()`, has no
+#'   effect; the time value type is inferred from the time column and set to
+#'   "custom" if not recognized. Unpredictable behavior may result if the time
+#'   type is not recognized. In `new_epi_df()`, should be set to the same value
+#'   that would be inferred.
+#' @param as_of Time value representing the time at which the given data were
+#'   available. For example, if `as_of` is January 31, 2022, then the `epi_df`
+#'   object that is created would represent the most up-to-date version of the
+#'   data available as of January 31, 2022. If the `as_of` argument is missing,
+#'   then the current day-time will be used.
+#' @param other_keys If your tibble has additional keys, be sure to specify them
+#'   as a character vector here (typical examples are "age" or sub-geographies).
+#' @param ... Additional arguments passed to methods.
+#' @return An `epi_df` object.
+#'
+#' @export
new_epi_df <- function(x = tibble::tibble(geo_value = character(), time_value = as.Date(integer())),
                       geo_type, time_type, as_of,
                       other_keys = character(), ...) {
   # Define metadata fields
   metadata <- list()
   metadata$geo_type <- geo_type
   metadata$time_type <- time_type
   metadata$as_of <- as_of
-  metadata <- c(metadata, additional_metadata)
+  metadata$other_keys <- other_keys
 
   # Reorder columns (geo_value, time_value, ...)
-  if (sum(dim(x)) != 0) {
-    cols_to_put_first <- c("geo_value", "time_value")
-    x <- x[, c(
-      cols_to_put_first,
-      # All other columns
-      names(x)[!(names(x) %in% cols_to_put_first)]
-    )]
+  if (nrow(x) > 0) {
+    x <- x %>% relocate(all_of(c("geo_value", other_keys, "time_value")), .before = 1)
   }
 
   # Apply epi_df class, attach metadata, and return
   class(x) <- c("epi_df", class(x))
   attributes(x)$metadata <- metadata
+
   return(x)
 }
 
-#' @rdname epi_df
+#' @describeIn epi_df The preferred way of constructing `epi_df`s
+#' @order 1
+#' @param x An `epi_df`, `data.frame`, [tibble::tibble], or [tsibble::tsibble]
+#'   to be converted
+#' @param ... Used for specifying column names, as in [`dplyr::rename`]. For
+#'   example, `geo_value = STATEFP, time_value = end_date`.
 #' @export
 as_epi_df <- function(x, ...) {
   UseMethod("as_epi_df")
 }
 
-#' @method as_epi_df epi_df
 #' @rdname epi_df
+#' @order 1
+#' @method as_epi_df epi_df
 #' @export
 as_epi_df.epi_df <- function(x, ...) {
   return(x)
 }
 
-#' @method as_epi_df tbl_df
 #' @rdname epi_df
+#' @order 1
 #' @importFrom rlang .data
 #' @importFrom tidyselect any_of
 #' @importFrom cli cli_inform
+#' @method as_epi_df tbl_df
 #' @export
 as_epi_df.tbl_df <- function(
     x,
     geo_type = deprecated(),
     time_type = deprecated(),
     as_of,
-    additional_metadata = list(),
+    other_keys = character(),
     ...) {
   # possible standard substitutions for time_value
   x <- rename(x, ...)
@@ -217,12 +240,11 @@ as_epi_df.tbl_df <- function(
       must be present in `x`."
) } - if (lifecycle::is_present(geo_type)) { - cli_warn("epi_archive constructor argument `geo_type` is now ignored. Consider removing.") + cli_warn("epi_df constructor argument `geo_type` is now ignored. Consider removing.") } if (lifecycle::is_present(time_type)) { - cli_warn("epi_archive constructor argument `time_type` is now ignored. Consider removing.") + cli_warn("epi_df constructor argument `time_type` is now ignored. Consider removing.") } # If geo type is missing, then try to guess it @@ -250,29 +272,45 @@ as_epi_df.tbl_df <- function( } # Use the current day-time } - assert_list(additional_metadata) - additional_metadata[["other_keys"]] <- additional_metadata[["other_keys"]] %||% character(0L) - new_epi_df(x, geo_type, time_type, as_of, additional_metadata) + assert_character(other_keys) + + if (".time_value_counts" %in% other_keys) { + cli_abort("as_epi_df: `other_keys` can't include \".time_value_counts\"") + } + + duplicated_time_values <- x %>% + group_by(across(all_of(c("geo_value", "time_value", other_keys)))) %>% + filter(dplyr::n() > 1) %>% + ungroup() + if (nrow(duplicated_time_values) > 0) { + bad_data <- capture.output(duplicated_time_values) + cli_abort( + "as_epi_df: some groups in the data have duplicated time values. epi_df requires a unique time_value per group.", + body = c("Sample groups:", bad_data) + ) + } + + new_epi_df(x, geo_type, time_type, as_of, other_keys) } -#' @method as_epi_df data.frame #' @rdname epi_df +#' @order 1 +#' @method as_epi_df data.frame #' @export -as_epi_df.data.frame <- function(x, as_of, additional_metadata = list(), ...) { - as_epi_df.tbl_df(x = tibble::as_tibble(x), as_of = as_of, additional_metadata = additional_metadata, ...) +as_epi_df.data.frame <- function(x, as_of, other_keys = character(), ...) { + as_epi_df.tbl_df(x = tibble::as_tibble(x), as_of = as_of, other_keys = other_keys, ...) } -#' @method as_epi_df tbl_ts #' @rdname epi_df +#' @order 1 +#' @method as_epi_df tbl_ts #' @export -as_epi_df.tbl_ts <- function(x, as_of, additional_metadata = list(), ...) { +as_epi_df.tbl_ts <- function(x, as_of, other_keys = character(), ...) { tsibble_other_keys <- setdiff(tsibble::key_vars(x), "geo_value") - if (length(tsibble_other_keys) != 0) { - additional_metadata$other_keys <- unique( - c(additional_metadata$other_keys, tsibble_other_keys) - ) + if (length(tsibble_other_keys) > 0) { + other_keys <- unique(c(other_keys, tsibble_other_keys)) } - as_epi_df.tbl_df(x = tibble::as_tibble(x), as_of = as_of, additional_metadata = additional_metadata, ...) + as_epi_df.tbl_df(x = tibble::as_tibble(x), as_of = as_of, other_keys = other_keys, ...) } #' Test for `epi_df` format diff --git a/R/epi_df_forbidden_methods.R b/R/epi_df_forbidden_methods.R new file mode 100644 index 00000000..86997daa --- /dev/null +++ b/R/epi_df_forbidden_methods.R @@ -0,0 +1,48 @@ +# Methods in this file are used to +# * Disable problematic inherited behavior (e.g., mean on epi_dfs) +# * Provide better error messaging if possible for things that already abort +# when they should (e.g., sum on epi_dfs) + + +# Disable mean on epi_dfs, to prevent `epi_slide(~ mean(.x), ....)` bad output: + +#' @export +mean.epi_df <- function(x, ...) { + cli_abort(c( + "`mean` shouldn't be used on entire `epi_df`s", + "x" = "{rlang::caller_arg(x)} was an `epi_df`", + "i" = "If you encountered this while trying to take a rolling mean + of a column using `epi_slide`, you probably forgot to + specify the column name (e.g., ~ mean(.x$colname)). 
You may + also prefer to use the specialized `epi_slide_mean` method." + ), class = "epiprocess__summarizer_on_entire_epi_df") +} + +# Similarly, provide better error messages for some other potentially-common +# slide operations (sum, prod, min, max, all, any, range): + +#' @export +Summary.epi_df <- function(..., na.rm = FALSE) { # nolint: object_name_linter + # cli uses dot prefixes for special purpose; use alias to avoid confusion during interpolation + generic <- .Generic # nolint: object_usage_linter + opt_pointer <- switch(generic, # nolint: object_usage_linter + sum = "You may also prefer to use the specialized `epi_slide_sum` method.", + prod = , + min = , + max = , + all = , + any = "You may also prefer to use the specialized `epi_slide_opt` method.", + range = "", + cli_abort("Unrecognized .Generic: {generic}") + ) + cli_abort(c( + "`{generic}` shouldn't be used on entire `epi_df`s", + # We'd like to quote user input in the error message, but `caller_arg(..1)` is + # just "..1" and (eagerness/S4/unnamedness?) issues thwart some alternatives; just + # use something generic: + "x" = "`{generic}`'s first argument was an `epi_df`", + "i" = "If you encountered this while trying to take a rolling {generic} + of a column using `epi_slide`, you probably forgot to + specify the column name (e.g., ~ {generic}(.x$colname)). {opt_pointer}" + ), class = "epiprocess__summarizer_on_entire_epi_df") +} diff --git a/R/epiprocess.R b/R/epiprocess.R index ba072a2d..5c76f882 100644 --- a/R/epiprocess.R +++ b/R/epiprocess.R @@ -11,9 +11,11 @@ #' test_int #' @importFrom cli cli_abort cli_warn #' @importFrom rlang %||% +#' @importFrom lifecycle deprecated #' @name epiprocess "_PACKAGE" utils::globalVariables(c( ".x", ".group_key", ".ref_time_value", "resid", - "fitted", ".response", "geo_value", "time_value" + "fitted", ".response", "geo_value", "time_value", + "value", ".real" )) diff --git a/R/group_by_epi_df_methods.R b/R/group_by_epi_df_methods.R index 949cc914..b3b92208 100644 --- a/R/group_by_epi_df_methods.R +++ b/R/group_by_epi_df_methods.R @@ -3,11 +3,3 @@ # `epi_df`s. It would be nice if there were a way to replace these with a # generic core that automatically fixed all misbehaving methods; see # brainstorming within Issue #223. - -#' @importFrom dplyr select -#' @export -select.epi_df <- function(.data, ...) 
{
-  selected <- NextMethod(.data)
-  might_decay <- reclass(selected, attr(selected, "metadata"))
-  return(dplyr_reconstruct(might_decay, might_decay))
-}
diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R
index d0418eea..b592cd91 100644
--- a/R/grouped_epi_archive.R
+++ b/R/grouped_epi_archive.R
@@ -55,7 +55,7 @@ new_grouped_epi_archive <- function(x, vars, drop) {
                or `ungroup` first.",
       class = "epiprocess__grouped_epi_archive__ungrouped_arg_is_already_grouped",
       epiprocess__ungrouped_class = class(x),
-      epiprocess__ungrouped_groups = groups(x)
+      epiprocess__ungrouped_group_vars = group_vars(x)
     )
   }
   assert_class(x, "epi_archive")
@@ -160,6 +160,14 @@ group_by_drop_default.grouped_epi_archive <- function(.tbl) {
   .tbl$private$drop
 }
 
+#' @include methods-epi_archive.R
+#' @rdname group_by.epi_archive
+#'
+#' @importFrom dplyr group_vars
+#' @export
+group_vars.grouped_epi_archive <- function(x) {
+  x$private$vars
+}
 
 #' @include methods-epi_archive.R
 #' @rdname group_by.epi_archive
@@ -170,7 +178,6 @@ groups.grouped_epi_archive <- function(x) {
   rlang::syms(x$private$vars)
 }
 
-
 #' @include methods-epi_archive.R
 #' @rdname group_by.epi_archive
 #'
@@ -205,20 +212,31 @@ ungroup.grouped_epi_archive <- function(x, ...) {
 #'   env missing_arg
 #' @export
 epix_slide.grouped_epi_archive <- function(
-    x,
-    f,
+    .x,
+    .f,
     ...,
-    before = Inf,
-    ref_time_values = NULL,
-    new_col_name = "slide_value",
-    as_list_col = FALSE,
-    names_sep = "_",
-    all_versions = FALSE) {
+    .before = Inf,
+    .versions = NULL,
+    .new_col_name = NULL,
+    .all_versions = FALSE) {
   # Perform some deprecated argument checks without using ` =
   # deprecated()` in the function signature, because they are from
   # early development versions and much more likely to be clutter than
   # informative in the signature.
-  if ("group_by" %in% nse_dots_names(...)) {
+  provided_args <- rlang::call_args_names(rlang::call_match())
+  if (any(provided_args %in% c("x", "f", "before", "new_col_name", "all_versions"))) {
+    cli::cli_abort(
+      "epix_slide: you are using one of the following old argument names: `x`, `f`, `before`,
+      `new_col_name`, `all_versions`. Please use the new names: `.x`, `.f`, `.before`,
+      `.new_col_name`, `.all_versions`."
+    )
+  }
+  if (any(provided_args %in% c("ref_time_values", ".ref_time_values"))) {
+    cli::cli_abort(
+      "epix_slide: the argument `ref_time_values` is deprecated. Please use `.versions` instead."
+    )
+  }
+  if ("group_by" %in% provided_args) {
     cli_abort("
       The `group_by` argument to `slide` has been removed; please use
      the `group_by()` S3 generic function
@@ -229,120 +247,219 @@ epix_slide.grouped_epi_archive <- function(
      the slide.)
     ", class = "epiprocess__epix_slide_group_by_parameter_deprecated")
   }
-  if ("all_rows" %in% nse_dots_names(...)) {
+  if ("all_rows" %in% provided_args) {
     cli_abort("
       The `all_rows` argument has been removed from `epix_slide` (but
      is still supported in `epi_slide`). Add rows for excluded
      results with a manual join instead.
     ", class = "epiprocess__epix_slide_all_rows_parameter_deprecated")
   }
+  if ("as_list_col" %in% provided_args) {
+    cli::cli_abort(
+      "epix_slide: the argument `as_list_col` is deprecated. If FALSE, you can just remove it.
+      If TRUE, have your computation wrap its result using `list(result)` instead."
+    )
+  }
+  if ("names_sep" %in% provided_args) {
+    cli::cli_abort(
+      "epix_slide: the argument `names_sep` is deprecated. If NULL, you can remove it; it is now the default.
+      If a string, please manually prefix your column names instead."
+    )
+  }
 
-  if (is.null(ref_time_values)) {
-    ref_time_values <- epix_slide_ref_time_values_default(x$private$ungrouped)
+  # Argument validation
+  if (is.null(.versions)) {
+    .versions <- epix_slide_versions_default(.x$private$ungrouped)
   } else {
-    assert_numeric(ref_time_values, min.len = 1L, null.ok = FALSE, any.missing = FALSE)
-    if (any(ref_time_values > x$private$ungrouped$versions_end)) {
-      cli_abort("Some `ref_time_values` are greater than the latest version in the archive.")
+    assert_numeric(.versions, min.len = 1L, null.ok = FALSE, any.missing = FALSE)
+    if (any(.versions > .x$private$ungrouped$versions_end)) {
+      cli_abort("All `.versions` must be less than or equal to the latest version in the archive.")
     }
-    if (anyDuplicated(ref_time_values) != 0L) {
-      cli_abort("Some `ref_time_values` are duplicated.")
+    if (anyDuplicated(.versions) != 0L) {
+      cli_abort("All `.versions` must be unique.")
     }
     # Sort, for consistency with `epi_slide`, although the current
     # implementation doesn't take advantage of it.
-    ref_time_values <- sort(ref_time_values)
+    .versions <- sort(.versions)
   }
 
-  validate_slide_window_arg(before, x$private$ungrouped$time_type)
+  validate_slide_window_arg(.before, .x$private$ungrouped$time_type, lower = 0) # nolint: object_usage_linter
+
+  checkmate::assert_string(.new_col_name, null.ok = TRUE)
+  if (!is.null(.new_col_name)) {
+    if (.new_col_name %in% .x$private$vars) {
+      cli_abort(c("`.new_col_name` must not be one of the grouping column name(s);
+                   `epix_slide()` uses these column name(s) to label what group
+                   each slide computation came from.",
+        "i" = "{cli::qty(length(.x$private$vars))} grouping column name{?s}
+               {?was/were} {format_chr_with_quotes(.x$private$vars)}",
+        "x" = "`.new_col_name` was {format_chr_with_quotes(.new_col_name)}"
+      ))
+    }
+    if (identical(.new_col_name, "version")) {
+      cli_abort('`.new_col_name` must not be `"version"`; `epix_slide()` uses that column name to attach the element
+                 of `.versions` associated with each slide computation')
+    }
+  }
 
-  # Symbolize column name
-  new_col <- sym(new_col_name)
+  assert_logical(.all_versions, len = 1L)
 
-  # Validate rest of parameters:
-  assert_logical(as_list_col, len = 1L)
-  assert_logical(all_versions, len = 1L)
-  assert_character(names_sep, len = 1L, null.ok = TRUE)
+  # If `.f` is missing, interpret ... as an expression for tidy evaluation
+  if (missing(.f)) {
+    used_data_masking <- TRUE
+    quosures <- enquos(...)
+    if (length(quosures) == 0) {
+      cli_abort("If `.f` is missing then a computation must be specified via `...`.")
+    }
+
+    .slide_comp <- as_diagonal_slide_computation(quosures)
+    # Magic value that passes zero args as dots in calls below. Equivalent to
+    # `... <- missing_arg()`, but use `assign` to avoid warning about
+    # improper use of dots.
+    assign("...", missing_arg())
+  } else {
+    used_data_masking <- FALSE
+    .slide_comp <- as_diagonal_slide_computation(.f, ...)
+  }
 
   # Computation for one group, one time value
   comp_one_grp <- function(.data_group, .group_key,
-                           f, ...,
-                           ref_time_value,
-                           new_col) {
+                           .slide_comp, ...,
+                           .version,
+                           .new_col_name) {
     # Carry out the specified computation
-    comp_value <- f(.data_group, .group_key, ref_time_value, ...)
+    comp_value <- .slide_comp(.data_group, .group_key, .version, ...)
+
+    # If this wasn't a tidyeval computation, we still need to check the output
+    # types. We'll let `bind_rows` and `vec_rbind` deal with checking for
+    # type compatibility between the outputs.
+ if (!used_data_masking && !( + # vctrs considers data.frames to be vectors, but we still check + # separately for them because certain base operations output data frames + # with rownames, which we will allow (but might drop) + is.data.frame(comp_value) || + vctrs::obj_is_vector(comp_value) && is.null(vctrs::vec_names(comp_value)) + )) { + cli_abort(" + the slide computations must always return data frames or unnamed vectors + (as determined by the vctrs package) (and not a mix of these two + structures). + ", class = "epiprocess__invalid_slide_comp_value") + } - if (all_versions) { - # Extract data from archive so we can do length checks below. When - # `all_versions = TRUE`, `.data_group` will always be an ungrouped - # archive because of the preceding `epix_as_of` step. - .data_group <- .data_group$DT + .group_key_label <- if (nrow(.group_key) == 0L) { + # Edge case: we'll get here if a requested `.version` had 0 rows and we + # grouped by a nonzero number of columns using the default `.drop = TRUE` + # (or on all non-factor columns with `.drop = FALSE` for some reason, + # probably a user bug). Mimicking `dplyr`, we'll let `.group_key` provided + # to the computation be 0 rows, but then label it using NAs. (In the + # bizarre situation of grouping by a mix of factor and non-factor with + # `.drop = FALSE`, `.group_key` will already have 1 row. For ungrouped + # epix_slides and 0-variable-grouped epix_slides with either `.drop` + # setting, we will have a 1x0 .group_key, although perhaps for the latter + # this should be 0x0.) + vctrs::vec_cast(NA, .group_key) + } else { + .group_key } - assert( - check_atomic(comp_value, any.missing = TRUE), - check_data_frame(comp_value), - combine = "or", - .var.name = vname(comp_value) + # Construct result first as list, then tibble-ify, to try to avoid some + # redundant work. However, we will sacrifice some performance here doing + # checks here in the inner loop, in order to provide immediate feedback on + # some formatting errors. + res <- c( + list(), # get list output; a bit faster than `as.list()`-ing `.group_key_label` + .group_key_label, + list(version = .version) ) - - # Label every result row with the `ref_time_value` - res <- list(time_value = ref_time_value) - - # Wrap the computation output in a list and unchop/unnest later if - # `as_list_col = FALSE`. This approach means that we will get a - # list-class col rather than a data.frame-class col when - # `as_list_col = TRUE` and the computations outputs are data - # frames. - res[[new_col]] <- list(comp_value) - - # Convert the list to a tibble all at once for speed. - return(validate_tibble(new_tibble(res))) - } - - # If `f` is missing, interpret ... as an expression for tidy evaluation - if (missing(f)) { - quos <- enquos(...) 
- if (length(quos) == 0) { - cli_abort("If `f` is missing then a computation must be specified via `...`.") - } - if (length(quos) > 1) { - cli_abort("If `f` is missing then only a single computation can be specified via `...`.") + res <- vctrs::vec_recycle_common(!!!res, .size = vctrs::vec_size(comp_value)) + + if (is.null(.new_col_name)) { + if (inherits(comp_value, "data.frame")) { + # Sometimes comp_value can parrot back columns already in `res`; allow + # this, but balk if a column has the same name as one in `res` but a + # different value: + comp_nms <- names(comp_value) + overlaps_label_names <- comp_nms %in% names(res) + for (comp_i in which(overlaps_label_names)) { + if (!identical(comp_value[[comp_i]], res[[comp_nms[[comp_i]]]])) { + lines <- c( + cli::format_error(c( + "New column and labeling column clash", + "i" = "`epix_slide` is attaching labeling columns + {format_varnames(names(res))}", + "x" = "slide computation output included a + {format_varname(comp_nms[[comp_i]])} column, but it + didn't match the labeling column", + "i" = "Here are examples of differing values, for a computation + where the labels were: + {format_tibble_row(as_tibble(res)[1L,])}:" + )), + capture.output(print(waldo::compare( + res[[comp_nms[[comp_i]]]], comp_value[[comp_i]], + x_arg = rlang::expr_deparse(dplyr::expr(`$`(label, !!sym(comp_nms[[comp_i]])))), # nolint: object_usage_linter + y_arg = rlang::expr_deparse(dplyr::expr(`$`(comp_value, !!sym(comp_nms[[comp_i]])))) + ))), + cli::format_message(c( + "You likely want to rename or remove this column in your output, or debug why it has a different value." + )) + ) + rlang::abort(paste(collapse = "\n", lines), + class = "epiprocess__epix_slide_output_vs_label_column_conflict" + ) + } + } + # Unpack into separate columns (without name prefix). If there are + # columns duplicating label columns, de-dupe and order them as if they + # didn't exist in comp_value. + res <- c(res, comp_value[!overlaps_label_names]) + } else { + # Apply default name (to vector or packed data.frame-type column): + if ("slide_value" %in% names(res)) { + cli_abort(c("Cannot guess a good column name for your output", + "x" = "`slide_value` already exists in `.x`", + ">" = "Please provide a `.new_col_name`." + )) + } + res[["slide_value"]] <- comp_value + } + } else { + # Vector or packed data.frame-type column (note: overlaps with label + # column names should already be forbidden by earlier validation): + res[[.new_col_name]] <- comp_value } - f <- quos[[1]] - new_col <- sym(names(rlang::quos_auto_name(quos))) - # Magic value that passes zero args as dots in calls below. Equivalent to - # `... <- missing_arg()`, but use `assign` to avoid warning about - # improper use of dots. - assign("...", missing_arg()) + # Fast conversion: + return(validate_tibble(new_tibble(res))) } - f <- as_slide_computation(f, ...) - out <- lapply(ref_time_values, function(ref_time_value) { + out <- lapply(.versions, function(.version) { # Ungrouped as-of data; `epi_df` if `all_versions` is `FALSE`, # `epi_archive` if `all_versions` is `TRUE`: - as_of_raw <- x$private$ungrouped %>% epix_as_of( - ref_time_value, - min_time_value = ref_time_value - before, - all_versions = all_versions + as_of_raw <- .x$private$ungrouped %>% epix_as_of( + .version, + min_time_value = .version - .before, + all_versions = .all_versions ) # Set: # * `as_of_df`, the data.frame/tibble/epi_df/etc. that we will - # `group_modify` as the `.data` argument. Might or might not + # `group_map` as the `.data` argument. 
Might or might not # include version column. - # * `group_modify_fn`, the corresponding `.f` argument - if (!all_versions) { + # * `group_map_fn`, the corresponding `.f` argument for `group_map` + # (not our `.f`) + if (!.all_versions) { as_of_df <- as_of_raw - group_modify_fn <- comp_one_grp + group_map_fn <- comp_one_grp } else { as_of_archive <- as_of_raw # We essentially want to `group_modify` the archive, but # haven't implemented this method yet. Next best would be # `group_modify` on its `$DT`, but that has different - # behavior based on whether or not `dtplyr` is loaded. + # behavior based on whether or not `dtplyr` < 1.3.0 is loaded. # Instead, go through an ordinary data frame, trying to avoid # copies. - if (address(as_of_archive$DT) == address(x$private$ungrouped$DT)) { + if (address(as_of_archive$DT) == address(.x$private$ungrouped$DT)) { # `as_of` aliased its the full `$DT`; copy before mutating: # # Note: this step is probably unneeded; we're fine with @@ -356,10 +473,10 @@ epix_slide.grouped_epi_archive <- function( data.table::setDF(as_of_df) # Convert each subgroup chunk to an archive before running the calculation. - group_modify_fn <- function(.data_group, .group_key, - f, ..., - ref_time_value, - new_col) { + group_map_fn <- function(.data_group, .group_key, + .slide_comp, ..., + .version, + .new_col_name) { # .data_group is coming from as_of_df as a tibble, but we # want to feed `comp_one_grp` an `epi_archive` backed by a # DT; convert and wrap: @@ -368,33 +485,28 @@ epix_slide.grouped_epi_archive <- function( .data_group_archive <- as_of_archive .data_group_archive$DT <- .data_group comp_one_grp(.data_group_archive, .group_key, - f = f, ..., - ref_time_value = ref_time_value, - new_col = new_col + .slide_comp = .slide_comp, ..., + .version = .version, + .new_col_name = .new_col_name ) } } return( - dplyr::group_modify( - dplyr::group_by(as_of_df, !!!syms(x$private$vars), .drop = x$private$drop), - group_modify_fn, - f = f, ..., - ref_time_value = ref_time_value, - new_col = new_col, + dplyr::bind_rows(dplyr::group_map( # note: output will be ungrouped + dplyr::group_by(as_of_df, !!!syms(.x$private$vars), .drop = .x$private$drop), + group_map_fn, + .slide_comp = .slide_comp, ..., + .version = .version, + .new_col_name = .new_col_name, .keep = TRUE - ) + )) ) }) - # Combine output into a single tibble - out <- as_tibble(setDF(rbindlist(out))) + # Combine output into a single tibble (allowing for packed columns) + out <- vctrs::vec_rbind(!!!out) # Reconstruct groups - out <- group_by(out, !!!syms(x$private$vars), .drop = x$private$drop) - - # Unchop/unnest if we need to - if (!as_list_col) { - out <- tidyr::unnest(out, !!new_col, names_sep = names_sep) - } + out <- group_by(out, !!!syms(.x$private$vars), .drop = .x$private$drop) # nolint start: commented_code_linter. # if (is_epi_df(x)) { diff --git a/R/growth_rate.R b/R/growth_rate.R index 4537375d..d8264fd2 100644 --- a/R/growth_rate.R +++ b/R/growth_rate.R @@ -120,7 +120,7 @@ growth_rate <- function(x = seq_along(y), y, x0 = x, # Check x, y, x0 if (length(x) != length(y)) cli_abort("`x` and `y` must have the same length.") if (!all(x0 %in% x)) cli_abort("`x0` must be a subset of `x`.") - method <- match.arg(method) + method <- rlang::arg_match(method) # Arrange in increasing order of x o <- order(x) diff --git a/R/key_colnames.R b/R/key_colnames.R index 99d8a9ed..49c32674 100644 --- a/R/key_colnames.R +++ b/R/key_colnames.R @@ -2,39 +2,46 @@ #' #' @param x a data.frame, tibble, or epi_df #' @param ... 
additional arguments passed on to methods
-#'
-#' @return If an `epi_df`, this returns all "keys". Otherwise `NULL`
+#' @param other_keys an optional character vector of other keys to include
+#' @param exclude an optional character vector of keys to exclude
+#' @return If an `epi_df`, this returns all "keys". Otherwise `NULL`.
 #' @keywords internal
 #' @export
 key_colnames <- function(x, ...) {
   UseMethod("key_colnames")
 }
 
+#' @rdname key_colnames
+#' @method key_colnames default
 #' @export
 key_colnames.default <- function(x, ...) {
   character(0L)
 }
 
+#' @rdname key_colnames
+#' @method key_colnames data.frame
 #' @export
-key_colnames.data.frame <- function(x, other_keys = character(0L), ...) {
+key_colnames.data.frame <- function(x, other_keys = character(0L), exclude = character(0L), ...) {
   assert_character(other_keys)
-  nm <- c("time_value", "geo_value", other_keys)
+  assert_character(exclude)
+  nm <- setdiff(c("geo_value", other_keys, "time_value"), exclude)
   intersect(nm, colnames(x))
 }
 
+#' @rdname key_colnames
+#' @method key_colnames epi_df
 #' @export
-key_colnames.epi_df <- function(x, ...) {
+key_colnames.epi_df <- function(x, exclude = character(0L), ...) {
+  assert_character(exclude)
   other_keys <- attr(x, "metadata")$other_keys
-  c("time_value", "geo_value", other_keys)
+  setdiff(c("geo_value", other_keys, "time_value"), exclude)
 }
 
+#' @rdname key_colnames
+#' @method key_colnames epi_archive
 #' @export
-key_colnames.epi_archive <- function(x, ...) {
+key_colnames.epi_archive <- function(x, exclude = character(0L), ...) {
+  assert_character(exclude)
   other_keys <- attr(x, "metadata")$other_keys
-  c("time_value", "geo_value", other_keys)
-}
-
-kill_time_value <- function(v) {
-  assert_character(v)
-  v[v != "time_value"]
+  setdiff(c("geo_value", other_keys, "time_value"), exclude)
 }
diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R
index 8363fa2e..0304d9a6 100644
--- a/R/methods-epi_archive.R
+++ b/R/methods-epi_archive.R
@@ -6,26 +6,28 @@
 #' examples.
 #'
 #' @param x An `epi_archive` object
-#' @param max_version Time value specifying the max version to permit in the
+#' @param version Time value specifying the max version to permit in the
 #'   snapshot. That is, the snapshot will comprise the unique rows of the
 #'   current archive data that represent the most up-to-date signal values, as
-#'   of the specified `max_version` (and whose time values are at least
+#'   of the specified `version` (and whose time values are at least
 #'   `min_time_value`.)
 #' @param min_time_value Time value specifying the min time value to permit in
 #'   the snapshot. Default is `-Inf`, which effectively means that there is no
 #'   minimum considered.
 #' @param all_versions If `all_versions = TRUE`, then the output will be in
 #'   `epi_archive` format, and contain rows in the specified `time_value` range
-#'   having `version <= max_version`. The resulting object will cover a
+#'   having `version` at most the requested `version`. The resulting object will cover a
 #'   potentially narrower `version` and `time_value` range than `x`, depending
 #'   on user-provided arguments. Otherwise, there will be one row in the output
-#'   for the `max_version` of each `time_value`. Default is `FALSE`.
+#'   for the latest `version` of each `time_value`. Default is `FALSE`.
+#' @param max_version `r lifecycle::badge("deprecated")` Please use the `version`
+#'   argument instead.
 #' @return An `epi_df` object.
#' #' @examples #' epix_as_of( #' archive_cases_dv_subset, -#' max_version = max(archive_cases_dv_subset$DT$version) +#' version = max(archive_cases_dv_subset$DT$version) #' ) #' #' range(archive_cases_dv_subset$DT$version) # 2020-06-02 -- 2021-12-01 @@ -58,32 +60,37 @@ #' #' @importFrom data.table between key #' @export -epix_as_of <- function(x, max_version, min_time_value = -Inf, all_versions = FALSE) { +epix_as_of <- function(x, version, min_time_value = -Inf, all_versions = FALSE, + max_version = deprecated()) { assert_class(x, "epi_archive") + if (lifecycle::is_present(max_version)) { + lifecycle::deprecate_warn("0.8.1", "epix_as_of(max_version =)", "epix_as_of(version =)") + version <- max_version + } + other_keys <- setdiff( key(x$DT), c("geo_value", "time_value", "version") ) - if (length(other_keys) == 0) other_keys <- NULL - # Check a few things on max_version - if (!identical(class(max_version), class(x$DT$version))) { + # Check a few things on version + if (!identical(class(version), class(x$DT$version))) { cli_abort( - "`max_version` must have the same `class` vector as `epi_archive$DT$version`." + "`version` must have the same `class` vector as `epi_archive$DT$version`." ) } - if (!identical(typeof(max_version), typeof(x$DT$version))) { + if (!identical(typeof(version), typeof(x$DT$version))) { cli_abort( - "`max_version` must have the same `typeof` as `epi_archive$DT$version`." + "`version` must have the same `typeof` as `epi_archive$DT$version`." ) } - assert_scalar(max_version, na.ok = FALSE) - if (max_version > x$versions_end) { - cli_abort("`max_version` must be at most `epi_archive$versions_end`.") + assert_scalar(version, na.ok = FALSE) + if (version > x$versions_end) { + cli_abort("`version` must be at most `epi_archive$versions_end`.") } assert_logical(all_versions, len = 1) - if (!is.na(x$clobberable_versions_start) && max_version >= x$clobberable_versions_start) { + if (!is.na(x$clobberable_versions_start) && version >= x$clobberable_versions_start) { cli_warn( 'Getting data as of some recent version which could still be overwritten (under routine circumstances) without assigning a new @@ -94,16 +101,25 @@ epix_as_of <- function(x, max_version, min_time_value = -Inf, all_versions = FAL ) } + # We can't disable nonstandard evaluation nor use the `..` feature in the `i` + # argument of `[.data.table` below; try to avoid problematic names and abort + # if we fail to do so: + .min_time_value <- min_time_value + .version <- version + if (any(c(".min_time_value", ".version") %in% names(x$DT))) { + cli_abort("epi_archives can't contain a `.min_time_value` or `.version` column") + } + # Filter by version and return if (all_versions) { # epi_archive is copied into result, so we can modify result directly - result <- epix_truncate_versions_after(x, max_version) - result$DT <- result$DT[time_value >= min_time_value, ] # nolint: object_usage_linter + result <- epix_truncate_versions_after(x, version) + result$DT <- result$DT[time_value >= .min_time_value, ] # nolint: object_usage_linter return(result) } # Make sure to use data.table ways of filtering and selecting - as_of_epi_df <- x$DT[time_value >= min_time_value & version <= max_version, ] %>% # nolint: object_usage_linter + as_of_epi_df <- x$DT[time_value >= .min_time_value & version <= .version, ] %>% # nolint: object_usage_linter unique( by = c("geo_value", "time_value", other_keys), fromLast = TRUE @@ -111,11 +127,8 @@ epix_as_of <- function(x, max_version, min_time_value = -Inf, all_versions = FAL tibble::as_tibble() 
%>% dplyr::select(-"version") %>% as_epi_df( - as_of = max_version, - additional_metadata = c( - x$additional_metadata, - list(other_keys = other_keys) - ) + as_of = version, + other_keys = other_keys ) return(as_of_epi_df) @@ -240,21 +253,79 @@ epix_fill_through_version <- function(x, fill_versions_end, #' Default here is `TRUE`. #' @return the resulting `epi_archive` #' -#' @details In all cases, `additional_metadata` will be an empty list, and -#' `clobberable_versions_start` will be set to the earliest version that could -#' be clobbered in either input archive. +#' @details In all cases, `clobberable_versions_start` will be set to the +#' earliest version that could be clobbered in either input archive. #' #' @examples -#' # create two example epi_archive datasets -#' x <- archive_cases_dv_subset$DT %>% -#' dplyr::select(geo_value, time_value, version, case_rate_7d_av) %>% -#' as_epi_archive(compactify = TRUE) -#' y <- archive_cases_dv_subset$DT %>% -#' dplyr::select(geo_value, time_value, version, percent_cli) %>% -#' as_epi_archive(compactify = TRUE) -#' # merge results stored in a third object: -#' xy <- epix_merge(x, y) +#' # Example 1 +#' # The s1 signal at August 1st gets revised from 10 to 11 on August 2nd +#' s1 <- tibble::tibble( +#' geo_value = c("ca", "ca", "ca"), +#' time_value = as.Date(c("2024-08-01", "2024-08-01", "2024-08-02")), +#' version = as.Date(c("2024-08-01", "2024-08-02", "2024-08-02")), +#' signal1 = c(10, 11, 7) +#' ) +#' +#' s2 <- tibble::tibble( +#' geo_value = c("ca", "ca"), +#' time_value = as.Date(c("2024-08-01", "2024-08-02")), +#' version = as.Date(c("2024-08-03", "2024-08-03")), +#' signal2 = c(2, 3) +#' ) +#' +#' +#' s1 <- s1 %>% as_epi_archive() +#' s2 <- s2 %>% as_epi_archive() +#' +#' merged <- epix_merge(s1, s2, sync = "locf") +#' merged[["DT"]] +#' +#' # Example 2 +#' # The s1 signal at August 1st gets revised from 12 to 13 on August 3rd +#' s1 <- tibble::tibble( +#' geo_value = c("ca", "ca", "ca", "ca"), +#' time_value = as.Date(c("2024-08-01", "2024-08-01", "2024-08-02", "2024-08-03")), +#' version = as.Date(c("2024-08-01", "2024-08-03", "2024-08-03", "2024-08-03")), +#' signal1 = c(12, 13, 22, 19) +#' ) +#' +#' s2 <- tibble::tibble( +#' geo_value = c("ca", "ca"), +#' time_value = as.Date(c("2024-08-01", "2024-08-02")), +#' version = as.Date(c("2024-08-02", "2024-08-02")), +#' signal2 = c(4, 5), +#' ) +#' +#' +#' s1 <- s1 %>% as_epi_archive() +#' s2 <- s2 %>% as_epi_archive() +#' +#' merged <- epix_merge(s1, s2, sync = "locf") +#' merged[["DT"]] +#' +#' +#' # Example 3: +#' s1 <- tibble::tibble( +#' geo_value = c("ca", "ca", "ca"), +#' time_value = as.Date(c("2024-08-01", "2024-08-02", "2024-08-03")), +#' version = as.Date(c("2024-08-01", "2024-08-02", "2024-08-03")), +#' signal1 = c(14, 11, 9) +#' ) #' +#' # The s2 signal at August 1st gets revised from 3 to 5 on August 3rd +#' s2 <- tibble::tibble( +#' geo_value = c("ca", "ca", "ca"), +#' time_value = as.Date(c("2024-08-01", "2024-08-01", "2024-08-02")), +#' version = as.Date(c("2024-08-02", "2024-08-03", "2024-08-03")), +#' signal2 = c(3, 5, 2), +#' ) +#' +#' s1 <- s1 %>% as_epi_archive() +#' s2 <- s2 %>% as_epi_archive() +#' +#' # Some LOCF for signal 1 as signal 2 gets updated +#' merged <- epix_merge(s1, s2, sync = "locf") +#' merged[["DT"]] #' @importFrom data.table key set setkeyv #' @export epix_merge <- function(x, y, @@ -272,18 +343,6 @@ epix_merge <- function(x, y, cli_abort("`x` and `y` must share data type on their `time_value` column.") } - if (length(x$additional_metadata) != 
0L) { - cli_warn("x$additional_metadata won't appear in merge result", - class = "epiprocess__epix_merge_ignores_additional_metadata" - ) - } - if (length(y$additional_metadata) != 0L) { - cli_warn("y$additional_metadata won't appear in merge result", - class = "epiprocess__epix_merge_ignores_additional_metadata" - ) - } - result_additional_metadata <- list() - result_clobberable_versions_start <- if (all(is.na(c(x$clobberable_versions_start, y$clobberable_versions_start)))) { NA # (any type of NA is fine here) @@ -449,7 +508,6 @@ epix_merge <- function(x, y, return(as_epi_archive( result_dt[], # clear data.table internal invisibility flag if set other_keys = setdiff(key(result_dt), c("geo_value", "time_value", "version")), - additional_metadata = result_additional_metadata, # It'd probably be better to pre-compactify before the merge, and might be # guaranteed not to be necessary to compactify the merge result if the # inputs are already compactified, but at time of writing we don't have @@ -560,98 +618,84 @@ epix_detailed_restricted_mutate <- function(.data, ...) { #' vignette](https://cmu-delphi.github.io/epiprocess/articles/archive.html) for #' examples. #' -#' @param x An [`epi_archive`] or [`grouped_epi_archive`] object. If ungrouped, +#' @param .x An [`epi_archive`] or [`grouped_epi_archive`] object. If ungrouped, #' all data in `x` will be treated as part of a single data group. -#' @param f Function, formula, or missing; together with `...` specifies the +#' @param .f Function, formula, or missing; together with `...` specifies the #' computation to slide. To "slide" means to apply a computation over a #' sliding (a.k.a. "rolling") time window for each data group. The window is -#' determined by the `before` parameter described below. One time step is -#' typically one day or one week; see [`epi_slide`] details for more -#' explanation. If a function, `f` must take an `epi_df` with the same -#' column names as the archive's `DT`, minus the `version` column; followed -#' by a one-row tibble containing the values of the grouping variables for -#' the associated group; followed by a reference time value, usually as a -#' `Date` object; followed by any number of named arguments. If a formula, -#' `f` can operate directly on columns accessed via `.x$var` or `.$var`, as -#' in `~ mean (.x$var)` to compute a mean of a column `var` for each -#' group-`ref_time_value` combination. The group key can be accessed via -#' `.y` or `.group_key`, and the reference time value can be accessed via -#' `.z` or `.ref_time_value`. If `f` is missing, then `...` will specify the +#' determined by the `.before` parameter (see details for more). If a +#' function, `.f` must have the form `function(x, g, t, ...)`, where +#' +#' - "x" is an epi_df with the same column names as the archive's `DT`, minus +#' the `version` column +#' - "g" is a one-row tibble containing the values of the grouping variables +#' for the associated group +#' - "t" is the ref_time_value for the current window +#' - "..." are additional arguments +#' +#' If a formula, `.f` can operate directly on columns accessed via `.x$var` or +#' `.$var`, as in `~ mean (.x$var)` to compute a mean of a column `var` for +#' each group-`ref_time_value` combination. The group key can be accessed via +#' `.y` or `.group_key`, and the reference time value can be accessed via `.z` +#' or `.ref_time_value`. If `.f` is missing, then `...` will specify the #' computation. #' @param ... 
Additional arguments to pass to the function or formula specified
-#' via `f`. Alternatively, if `f` is missing, then `...` is interpreted as an
-#' expression for tidy evaluation; in addition to referring to columns
-#' directly by name, the expression has access to `.data` and `.env` pronouns
-#' as in `dplyr` verbs, and can also refer to the `.group_key` and
-#' `.ref_time_value`. See details of [`epi_slide`].
-#' @param before How far `before` each `ref_time_value` should the sliding
-#' window extend? If provided, should be a single, non-NA,
-#' [integer-compatible][vctrs::vec_cast] number of time steps. This window
-#' endpoint is inclusive. For example, if `before = 7`, and one time step is
-#' one day, then to produce a value for a `ref_time_value` of January 8, we
-#' apply the given function or formula to data (for each group present) with
-#' `time_value`s from January 1 onward, as they were reported on January 8.
-#' For typical disease surveillance sources, this will not include any data
-#' with a `time_value` of January 8, and, depending on the amount of reporting
-#' latency, may not include January 7 or even earlier `time_value`s. (If
-#' instead the archive were to hold nowcasts instead of regular surveillance
-#' data, then we would indeed expect data for `time_value` January 8. If it
-#' were to hold forecasts, then we would expect data for `time_value`s after
-#' January 8, and the sliding window would extend as far after each
-#' `ref_time_value` as needed to include all such `time_value`s.)
-#' @param ref_time_values Reference time values / versions for sliding
+#' via `.f`. Alternatively, if `.f` is missing, then the `...` is interpreted
+#' as a ["data-masking"][rlang::args_data_masking] expression or expressions
+#' for tidy evaluation; in addition to referring to columns directly by name,
+#' the expressions have access to `.data` and `.env` pronouns as in `dplyr`
+#' verbs, and can also refer to `.x` (not the same as the input epi_archive),
+#' `.group_key`, and `.ref_time_value`. See details for more.
+#' @param .before How many time values before the `.ref_time_value`
+#' should each snapshot handed to the function `.f` contain? If provided, it
+#' should be a single value that is compatible with the time_type of the
+#' time_value column (more below), but most commonly an integer. This window
+#' endpoint is inclusive. For example, if `.before = 7`, `time_type`
+#' in the archive is "day", and the `.ref_time_value` is January 8, then the
+#' smallest time_value in the snapshot will be January 1. If missing, then the
+#' default is no limit on the time values, so the full snapshot is given.
+#' @param .versions Reference time values / versions for sliding
 #' computations; each element of this vector serves both as the anchor point
 #' for the `time_value` window for the computation and the `max_version` of
 #' the `epix_as_of` with which we fetch data in this window. If missing, then
 #' this will be set to a regularly-spaced sequence of values set to cover the
 #' range of `version`s in the `DT` plus the `versions_end`; the spacing of
 #' values will be guessed (using the GCD of the skips between values).
-#' @param new_col_name String indicating the name of the new column that will
-#' contain the derivative values. Default is "slide_value"; note that setting
-#' `new_col_name` equal to an existing column name will overwrite this column.
-#' @param as_list_col Should the slide results be held in a list column, or be
-#' [unchopped][tidyr::unchop]/[unnested][tidyr::unnest]?
Default is `FALSE`,
-#' in which case a list object returned by `f` would be unnested (using
-#' [`tidyr::unnest()`]), and, if the slide computations output data frames,
-#' the names of the resulting columns are given by prepending `new_col_name`
-#' to the names of the list elements.
-#' @param names_sep String specifying the separator to use in `tidyr::unnest()`
-#' when `as_list_col = FALSE`. Default is "_". Using `NULL` drops the prefix
-#' from `new_col_name` entirely.
-#' @param all_versions (Not the same as `all_rows` parameter of `epi_slide`.) If
-#' `all_versions = TRUE`, then `f` will be passed the version history (all
-#' `version <= ref_time_value`) for rows having `time_value` between
-#' `ref_time_value - before` and `ref_time_value`. Otherwise, `f` will be
-#' passed only the most recent `version` for every unique `time_value`.
-#' Default is `FALSE`.
+#' @param .new_col_name Either `NULL` or a string indicating the name of the new
+#' column that will contain the derived values. The default, `NULL`, will use
+#' the name "slide_value" unless your slide computations output data frames,
+#' in which case they will be unpacked into the constituent columns and those
+#' names used. If the resulting column name(s) overlap with the column names
+#' used for labeling the computations, which are `group_vars(x)` and
+#' `"version"`, then the values for these columns must be identical to the
+#' labels we assign.
+#' @param .all_versions (Not the same as `.all_rows` parameter of `epi_slide`.)
+#' If `.all_versions = TRUE`, then the slide computation will be passed the
+#' version history (all `version <= .version` where `.version` is one of the
+#' requested `.versions`) for rows having a `time_value` of at least `.version
+#' - .before`. Otherwise, the slide computation will be passed only the most
+#' recent `version` for every unique `time_value`. Default is `FALSE`.
 #' @return A tibble whose columns are: the grouping variables, `version`,
 #' containing the reference versions for the slide computation, and a
-#' column named according to the `new_col_name` argument, containing the slide
+#' column named according to the `.new_col_name` argument, containing the slide
 #' values.
 #'
 #' @details A few key distinctions between the current function and `epi_slide()`:
-#' 1. In `f` functions for `epix_slide`, one should not assume that the input
+#' 1. In `.f` functions for `epix_slide`, one should not assume the input
 #' data to contain any rows with `time_value` matching the computation's
-#' `ref_time_value` (accessible via `attributes()$metadata$as_of`); for
+#' `.ref_time_value` (accessible via `attributes()$metadata$as_of`); for
 #' typical epidemiological surveillance data, observations pertaining to a
 #' particular time period (`time_value`) are first reported `as_of` some
 #' instant after that time period has ended.
-#' 2. `epix_slide()` doesn't accept an `after` argument; its windows extend
-#' from `before` time steps before a given `ref_time_value` through the last
-#' `time_value` available as of version `ref_time_value` (typically, this
-#' won't include `ref_time_value` itself, as observations about a particular
-#' time interval (e.g., day) are only published after that time interval
-#' ends); `epi_slide` windows extend from `before` time steps before a
-#' `ref_time_value` through `after` time steps after `ref_time_value`.
-#' 3.
The input class and columns are similar but different: `epix_slide`
-#' (with the default `all_versions=FALSE`) keeps all columns and the
+#' 2. The input class and columns are similar but different: `epix_slide`
+#' (with the default `.all_versions=FALSE`) keeps all columns and the
 #' `epi_df`-ness of the first argument to each computation; `epi_slide` only
 #' provides the grouping variables in the second input, and will convert the
 #' first input into a regular tibble if the grouping variables include the
-#' essential `geo_value` column. (With `all_versions=TRUE`, `epix_slide` will
+#' essential `geo_value` column. (With `.all_versions=TRUE`, `epix_slide`
 #' will provide an `epi_archive` rather than an `epi_df` to each
 #' computation.)
-#' 4. The output class and columns are similar but different: `epix_slide()`
+#' 3. The output class and columns are similar but different: `epix_slide()`
 #' returns a tibble containing only the grouping variables, `version`, and
 #' the new column(s) from the slide computations, whereas `epi_slide()`
 #' returns an `epi_df` with all original variables plus the new columns from
@@ -659,16 +703,16 @@ epix_detailed_restricted_mutate <- function(.data, ...) {
 #' their input, with one exception: `epi_archive`s can have trivial
 #' (zero-variable) groupings, but these will be dropped in `epix_slide`
 #' results as they are not supported by tibbles.)
-#' 5. There are no size stability checks or element/row recycling to maintain
+#' 4. There are no size stability checks or element/row recycling to maintain
 #' size stability in `epix_slide`, unlike in `epi_slide`. (`epix_slide` is
 #' roughly analogous to [`dplyr::group_modify`], while `epi_slide` is roughly
 #' analogous to `dplyr::mutate` followed by `dplyr::arrange`.) This is detailed
 #' in the "advanced" vignette.
-#' 6. `all_rows` is not supported in `epix_slide`; since the slide
+#' 5. `.all_rows` is not supported in `epix_slide`; since the slide
 #' computations are allowed more flexibility in their outputs than in
 #' `epi_slide`, we can't guess a good representation for missing computations
-#' for excluded group-`ref_time_value` pairs.
-#' 7. The `ref_time_values` default for `epix_slide` is based on making an
+#' for excluded group-`.ref_time_value` pairs.
+#' 6. The `.versions` default for `epix_slide` is based on making an
 #' evenly-spaced sequence out of the `version`s in the `DT` plus the
 #' `versions_end`, rather than the `time_value`s.
 #'
@@ -687,7 +731,7 @@ epix_detailed_restricted_mutate <- function(.data, ...) {
 #' library(dplyr)
 #'
 #' # Reference time points for which we want to compute slide values:
-#' ref_time_values <- seq(as.Date("2020-06-01"),
+#' versions <- seq(as.Date("2020-06-02"),
 #'   as.Date("2020-06-15"),
 #'   by = "1 day"
 #' )
@@ -697,10 +741,10 @@ epix_detailed_restricted_mutate <- function(.data, ...) {
 #' archive_cases_dv_subset %>%
 #'   group_by(geo_value) %>%
 #'   epix_slide(
-#'     f = ~ mean(.x$case_rate_7d_av),
-#'     before = 2,
-#'     ref_time_values = ref_time_values,
-#'     new_col_name = "case_rate_7d_av_recent_av"
+#'     .f = ~ mean(.x$case_rate_7d_av),
+#'     .before = 2,
+#'     .versions = versions,
+#'     .new_col_name = "case_rate_7d_av_recent_av"
 #'   ) %>%
 #'   ungroup()
 #' # We requested time windows that started 2 days before the corresponding time
@@ -713,7 +757,7 @@ epix_detailed_restricted_mutate <- function(.data, ...)
{ #' # * 2 `time_value`s, for the rest of the results #' # * never the 3 `time_value`s we would get from `epi_slide`, since, because #' # of data latency, we'll never have an observation -#' # `time_value == ref_time_value` as of `ref_time_value`. +#' # `time_value == .ref_time_value` as of `.ref_time_value`. #' # The example below shows this type of behavior in more detail. #' #' # Examining characteristics of the data passed to each computation with @@ -732,17 +776,17 @@ epix_detailed_restricted_mutate <- function(.data, ...) { #' class1 = class(x)[[1L]] #' ) #' }, -#' before = 5, all_versions = FALSE, -#' ref_time_values = ref_time_values, names_sep = NULL +#' .before = 5, .all_versions = FALSE, +#' .versions = versions #' ) %>% #' ungroup() %>% -#' arrange(geo_value, time_value) +#' arrange(geo_value, version) #' #' # --- Advanced: --- #' #' # `epix_slide` with `all_versions=FALSE` (the default) applies a #' # version-unaware computation to several versions of the data. We can also -#' # use `all_versions=TRUE` to apply a version-*aware* computation to several +#' # use `.all_versions=TRUE` to apply a version-*aware* computation to several #' # versions of the data, again looking at characteristics of the data passed #' # to each computation. In this case, each computation should expect an #' # `epi_archive` containing the relevant version data: @@ -767,8 +811,8 @@ epix_detailed_restricted_mutate <- function(.data, ...) { #' class1 = class(x)[[1L]] #' ) #' }, -#' before = 5, all_versions = TRUE, -#' ref_time_values = ref_time_values, names_sep = NULL +#' .before = 5, .all_versions = TRUE, +#' .versions = versions #' ) %>% #' ungroup() %>% #' # Focus on one geo_value so we can better see the columns above: @@ -777,15 +821,13 @@ epix_detailed_restricted_mutate <- function(.data, ...) 
{
 #'
 #' @export
 epix_slide <- function(
-  x,
-  f,
+  .x,
+  .f,
   ...,
-  before = Inf,
-  ref_time_values = NULL,
-  new_col_name = "slide_value",
-  as_list_col = FALSE,
-  names_sep = "_",
-  all_versions = FALSE) {
+  .before = Inf,
+  .versions = NULL,
+  .new_col_name = NULL,
+  .all_versions = FALSE) {
   UseMethod("epix_slide")
 }
@@ -793,25 +835,22 @@ epix_slide <- function(
 #' @rdname epix_slide
 #' @export
 epix_slide.epi_archive <- function(
-  x,
-  f,
+  .x,
+  .f,
   ...,
-  before = Inf,
-  ref_time_values = NULL,
-  new_col_name = "slide_value",
-  as_list_col = FALSE,
-  names_sep = "_",
-  all_versions = FALSE) {
+  .before = Inf,
+  .versions = NULL,
+  .new_col_name = NULL,
+  .all_versions = FALSE) {
   # For an "ungrouped" slide, treat all rows as belonging to one big
   # group (group by 0 vars), like `dplyr::summarize`, and let the
   # resulting `grouped_epi_archive` handle the slide:
   epix_slide(
-    group_by(x),
-    f,
+    group_by(.x),
+    .f,
     ...,
-    before = before, ref_time_values = ref_time_values, new_col_name = new_col_name,
-    as_list_col = as_list_col, names_sep = names_sep,
-    all_versions = all_versions
+    .before = .before, .versions = .versions,
+    .new_col_name = .new_col_name, .all_versions = .all_versions
   ) %>%
     # We want a slide on ungrouped archives to output something
    # ungrouped, rather than retaining the trivial (0-variable)
@@ -825,7 +864,7 @@ epix_slide.epi_archive <- function(
 #' Default value for `ref_time_values` in an `epix_slide`
 #'
 #' @noRd
-epix_slide_ref_time_values_default <- function(ea) {
+epix_slide_versions_default <- function(ea) {
   versions_with_updates <- c(ea$DT$version, ea$versions_end)
   ref_time_values <- tidyr::full_seq(versions_with_updates, guess_period(versions_with_updates))
   return(ref_time_values)
 }
diff --git a/R/methods-epi_df.R b/R/methods-epi_df.R
index daccabd8..901b9b32 100644
--- a/R/methods-epi_df.R
+++ b/R/methods-epi_df.R
@@ -8,19 +8,23 @@
 #' use `attr(your_epi_df, "decay_to_tibble") <- FALSE` beforehand.
 #'
 #' @template x
-#' @param ... additional arguments to forward to `NextMethod()`
+#'
+#' @inheritParams tibble::as_tibble
 #'
 #' @importFrom tibble as_tibble
 #' @export
 as_tibble.epi_df <- function(x, ...) {
-  # Decaying drops the class and metadata. `as_tibble.grouped_df` drops the
-  # grouping and should be called by `NextMethod()` in the current design.
-  # See #223 for discussion of alternatives.
+  # Note that some versions of `tsibble` overwrite `as_tibble.grouped_df`, which
+  # also impacts grouped `epi_df`s; don't rely on `NextMethod()`. Destructure
+  # first instead.
+  destructured <- tibble::as_tibble(vctrs::vec_data(x), ...)
   if (attr(x, "decay_to_tibble") %||% TRUE) {
-    return(decay_epi_df(NextMethod()))
+    return(destructured)
+  } else {
+    # We specially requested via attr not to decay epi_df-ness but to drop any
+    # grouping.
+    reclass(destructured, attr(x, "metadata"))
   }
-  metadata <- attr(x, "metadata")
-  reclass(NextMethod(), metadata)
 }

 #' Convert to tsibble format
@@ -37,10 +41,13 @@ as_tibble.epi_df <- function(x, ...) {
 #' @export
 as_tsibble.epi_df <- function(x, key, ...) {
   if (missing(key)) key <- c("geo_value", attributes(x)$metadata$other_keys)
-  return(as_tsibble(tibble::as_tibble(x),
-    key = tidyselect::all_of(key), index = "time_value",
-    ...
-  ))
+  return(
+    as_tsibble(
+      tibble::as_tibble(x),
+      key = tidyselect::all_of(key), index = "time_value",
+      ...
+    )
+  )
 }
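+# A rough sketch of the intended `as_tibble()` behavior (illustrative only;
+# `edf` stands in for any `epi_df`, and the snippet is a comment so it is not
+# run as package code):
+#
+#   as_tibble(edf)                          # decays to a plain tibble
+#   attr(edf, "decay_to_tibble") <- FALSE
+#   as_tibble(edf)                          # stays an `epi_df`; grouping dropped
+#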
 #' Base S3 methods for an `epi_df` object
 #'
 #' Print and summary functions for an `epi_df` object.
 #'
 #' @template x
-#' @param ... additional arguments to forward to `NextMethod()`
 #'
 #' @method print epi_df
+#' @param ... additional arguments to forward to `NextMethod()`, or unused
 #' @export
 print.epi_df <- function(x, ...) {
   cat(
@@ -59,6 +66,10 @@ print.epi_df <- function(x, ...) {
   )
   cat(sprintf("* %-9s = %s\n", "geo_type", attributes(x)$metadata$geo_type))
   cat(sprintf("* %-9s = %s\n", "time_type", attributes(x)$metadata$time_type))
+  ok <- attributes(x)$metadata$other_keys
+  if (length(ok) > 0) {
+    cat(sprintf("* %-9s = %s\n", "other_keys", paste(ok, collapse = ", ")))
+  }
   cat(sprintf("* %-9s = %s\n", "as_of", attributes(x)$metadata$as_of))
   # Conditional output (silent if attribute is NULL):
   cat(sprintf("* %-9s = %s\n", "decay_to_tibble", attr(x, "decay_to_tibble")))
@@ -76,13 +87,16 @@ print.epi_df <- function(x, ...) {
 #'   Currently unused.
 #'
 #' @method summary epi_df
-#' @rdname print.epi_df
 #' @importFrom rlang .data
 #' @importFrom stats median
 #' @export
 summary.epi_df <- function(object, ...) {
   cat("An `epi_df` x, with metadata:\n")
   cat(sprintf("* %-9s = %s\n", "geo_type", attributes(object)$metadata$geo_type))
+  ok <- attributes(object)$metadata$other_keys
+  if (length(ok) > 0) {
+    cat(sprintf("* %-9s = %s\n", "other_keys", paste(ok, collapse = ", ")))
+  }
   cat(sprintf("* %-9s = %s\n", "as_of", attributes(object)$metadata$as_of))
   cat("----------\n")
   cat(sprintf("* %-27s = %s\n", "min time value", min(object$time_value)))
@@ -139,10 +153,10 @@ dplyr_reconstruct.epi_df <- function(data, template) {
   # keep any grouping that has been applied:
   res <- NextMethod()

-  cn <- names(res)
+  col_names <- names(res)

   # Duplicate columns, cli_abort
-  dup_col_names <- cn[duplicated(cn)]
+  dup_col_names <- col_names[duplicated(col_names)]
   if (length(dup_col_names) != 0) {
     cli_abort(c(
       "Duplicate column names are not allowed",
@@ -152,7 +166,7 @@ dplyr_reconstruct.epi_df <- function(data, template) {
     ))
   }

-  not_epi_df <- !("time_value" %in% cn) || !("geo_value" %in% cn)
+  not_epi_df <- !("time_value" %in% col_names) || !("geo_value" %in% col_names)

   if (not_epi_df) {
     # If we're calling on an `epi_df` from one of our own functions, we need to
@@ -171,7 +185,7 @@ dplyr_reconstruct.epi_df <- function(data, template) {

   # Amend additional metadata if some other_keys cols are dropped in the subset
   old_other_keys <- attr(template, "metadata")$other_keys
-  attr(res, "metadata")$other_keys <- old_other_keys[old_other_keys %in% cn]
+  attr(res, "metadata")$other_keys <- old_other_keys[old_other_keys %in% col_names]

   res
 }
@@ -203,12 +217,13 @@ dplyr_row_slice.epi_df <- function(data, i, ...) {
 `names<-.epi_df` <- function(x, value) {
   old_names <- names(x)
   old_metadata <- attr(x, "metadata")
-  old_other_keys <- old_metadata[["other_keys"]]
-  new_other_keys <- value[match(old_other_keys, old_names)]
   new_metadata <- old_metadata
-  new_metadata[["other_keys"]] <- new_other_keys
+  old_other_keys <- old_metadata[["other_keys"]]
+  if (!is.null(old_other_keys)) {
+    new_other_keys <- value[match(old_other_keys, old_names)]
+    new_metadata[["other_keys"]] <- new_other_keys
+  }
   result <- reclass(NextMethod(), new_metadata)
-
   # decay to non-`epi_df` if needed:
   dplyr::dplyr_reconstruct(result, result)
 }
@@ -241,6 +256,90 @@ group_modify.epi_df <- function(.data, .f, ..., .keep = FALSE) {
   dplyr::dplyr_reconstruct(NextMethod(), .data)
 }

+#' Complete epi_df
+#'
+#' A `tidyr::complete()` analogue for `epi_df` objects.
This function +#' can be used, for example, to add rows for missing combinations +#' of ‘geo_value’ and ‘time_value’, filling other columns with `NA`s. +#' See the examples for usage details. +#' +#' @param data an `epi_df` +#' @param ... see [`tidyr::complete`] +#' @param fill see [`tidyr::complete`] +#' @param explicit see [`tidyr::complete`] +#' +#' @method complete epi_df +#' @importFrom tidyr complete +#' +#' @examples +#' start_date <- as.Date("2020-01-01") +#' daily_edf <- tibble::tribble( +#' ~geo_value, ~time_value, ~value, +#' 1, start_date + 1, 1, +#' 1, start_date + 3, 3, +#' 2, start_date + 2, 2, +#' 2, start_date + 3, 3, +#' ) %>% +#' as_epi_df(as_of = start_date + 3) +#' # Complete without grouping puts all the geo_values on the same min and max +#' # time_value index +#' daily_edf %>% +#' complete(geo_value, time_value = full_seq(time_value, period = 1)) +#' # Complete with grouping puts all the geo_values on individual min and max +#' # time_value indices +#' daily_edf %>% +#' group_by(geo_value) %>% +#' complete(time_value = full_seq(time_value, period = 1)) +#' # Complete has explicit=TRUE by default, but if it's FALSE, then complete +#' # only fills the implicit gaps, not those that are explicitly NA +#' daily_edf <- tibble::tribble( +#' ~geo_value, ~time_value, ~value, +#' 1, start_date + 1, 1, +#' 1, start_date + 2, NA, +#' 1, start_date + 3, 3, +#' 2, start_date + 2, 2, +#' 2, start_date + 3, 3, +#' ) %>% +#' as_epi_df(as_of = start_date + 3) +#' daily_edf %>% +#' complete( +#' geo_value, +#' time_value = full_seq(time_value, period = 1), +#' fill = list(value = 0), +#' explicit = FALSE +#' ) +#' # Complete works for weekly data and can take a fill value +#' # No grouping +#' weekly_edf <- tibble::tribble( +#' ~geo_value, ~time_value, ~value, +#' 1, start_date + 1, 1, +#' 1, start_date + 15, 3, +#' 2, start_date + 8, 2, +#' 2, start_date + 15, 3, +#' ) %>% +#' as_epi_df(as_of = start_date + 3) +#' weekly_edf %>% +#' complete( +#' geo_value, +#' time_value = full_seq(time_value, period = 7), +#' fill = list(value = 0) +#' ) +#' # With grouping +#' weekly_edf %>% +#' group_by(geo_value) %>% +#' complete( +#' time_value = full_seq(time_value, period = 7), +#' fill = list(value = 0) +#' ) +#' @export +complete.epi_df <- function(data, ..., fill = list(), explicit = TRUE) { + result <- dplyr::dplyr_reconstruct(NextMethod(), data) + if ("time_value" %in% names(rlang::call_match(dots_expand = FALSE)[["..."]])) { + attr(result, "metadata")$time_type <- guess_time_type(result$time_value) + } + result +} + #' @method unnest epi_df #' @rdname print.epi_df #' @param data an `epi_df` @@ -255,3 +354,136 @@ reclass <- function(x, metadata) { attributes(x)$metadata <- metadata return(x) } + +#' Arrange an epi_df into a standard order +#' +#' Moves [key_colnames()] to the left, then arranges rows based on that +#' ordering. This function is mainly for use in tests and so that +#' other function output will be in predictable order, where necessary. +#' +#' @param x an `epi_df`. Other objects will produce a warning and return as is. +#' @param ... not used +#' +#' @keywords internal +#' @export +arrange_canonical <- function(x, ...) { + UseMethod("arrange_canonical") +} + +#' @export +arrange_canonical.default <- function(x, ...) { + rlang::check_dots_empty() + cli::cli_abort(c( + "`arrange_canonical()` is only meaningful for an {.cls epi_df}." + )) + return(x) +} + +#' @export +arrange_canonical.epi_df <- function(x, ...) 
{ + rlang::check_dots_empty() + x %>% + arrange_row_canonical() %>% + arrange_col_canonical() +} + +arrange_row_canonical <- function(x, ...) { + UseMethod("arrange_row_canonical") +} + +#' @export +arrange_row_canonical.default <- function(x, ...) { + rlang::check_dots_empty() + cli::cli_abort(c( + "`arrange_row_canonical()` is only meaningful for an {.cls epi_df}." + )) + return(x) +} + +#' @export +arrange_row_canonical.epi_df <- function(x, ...) { + rlang::check_dots_empty() + cols <- key_colnames(x) + x %>% dplyr::arrange(dplyr::across(dplyr::all_of(cols))) +} + +arrange_col_canonical <- function(x, ...) { + UseMethod("arrange_col_canonical") +} + +#' @export +arrange_col_canonical.default <- function(x, ...) { + rlang::check_dots_empty() + cli::cli_abort(c( + "`arrange_col_canonical()` is only meaningful for an {.cls epi_df}." + )) + return(x) +} + +#' @export +arrange_col_canonical.epi_df <- function(x, ...) { + rlang::check_dots_empty() + cols <- key_colnames(x) + x %>% dplyr::relocate(dplyr::all_of(cols), .before = 1) +} + +#' Group an `epi_df` object by default keys +#' @param x an `epi_df` +#' @param exclude character vector of column names to exclude from grouping +#' @return a grouped `epi_df` +#' @export +group_epi_df <- function(x, exclude = character()) { + cols <- key_colnames(x, exclude = exclude) + x %>% group_by(across(all_of(cols))) +} + +#' Aggregate an `epi_df` object +#' +#' Aggregates an `epi_df` object by the specified group columns, summing the +#' `value` column, and returning an `epi_df`. If aggregating over `geo_value`, +#' the resulting `epi_df` will have `geo_value` set to `"total"`. +#' +#' @param .x an `epi_df` +#' @param sum_cols character vector of the columns to aggregate +#' @param group_cols character vector of column names to group by. "time_value" is +#' included by default. +#' @return an `epi_df` object +#' +#' @export +sum_groups_epi_df <- function(.x, sum_cols = "value", group_cols = character()) { + assert_class(.x, "epi_df") + assert_character(sum_cols) + assert_character(group_cols) + checkmate::assert_subset(sum_cols, setdiff(names(.x), key_colnames(.x))) + checkmate::assert_subset(group_cols, key_colnames(.x)) + if (!"time_value" %in% group_cols) { + group_cols <- c("time_value", group_cols) + } + + out <- .x %>% + group_by(across(all_of(group_cols))) %>% + dplyr::summarize(across(all_of(sum_cols), sum), .groups = "drop") + + # To preserve epi_df-ness, we need to ensure that the `geo_value` column is + # present. + out <- if (!"geo_value" %in% group_cols) { + out %>% + mutate(geo_value = "total") %>% + relocate(geo_value, .before = 1) + } else { + out + } + + # The `geo_type` will be correctly inherited here by the following logic: + # - if `geo_value` is in `group_cols`, then the constructor will see the + # geo_value here and will correctly read the existing values + # - if `geo_value` is not in `group_cols`, then the constructor will see + # the unrecognizeable "total" value and will correctly infer the "custom" + # geo_type. 
+ out %>% + as_epi_df( + as_of = attr(.x, "metadata")$as_of, + other_keys = intersect(attr(.x, "metadata")$other_keys, group_cols) + ) %>% + arrange_canonical() +} diff --git a/R/outliers.R b/R/outliers.R index 3d0ff5e5..c2187de0 100644 --- a/R/outliers.R +++ b/R/outliers.R @@ -89,7 +89,7 @@ detect_outlr <- function(x = seq_along(y), y, ), combiner = c("median", "mean", "none")) { # Validate combiner - combiner <- match.arg(combiner) + combiner <- rlang::arg_match(combiner) # Validate that x contains all distinct values if (any(duplicated(x))) { @@ -161,8 +161,7 @@ detect_outlr <- function(x = seq_along(y), y, #' group_by(geo_value) %>% #' mutate(outlier_info = detect_outlr_rm( #' x = time_value, y = cases -#' )) %>% -#' unnest(outlier_info) +#' )) detect_outlr_rm <- function(x = seq_along(y), y, n = 21, log_transform = FALSE, detect_negatives = FALSE, @@ -189,7 +188,7 @@ detect_outlr_rm <- function(x = seq_along(y), y, n = 21, # Calculate lower and upper thresholds and replacement value z <- z %>% - epi_slide(fitted = median(y), before = floor((n - 1) / 2), after = ceiling((n - 1) / 2)) %>% + epi_slide(fitted = median(y, na.rm = TRUE), .window_size = n, .align = "center") %>% dplyr::mutate(resid = y - fitted) %>% roll_iqr( n = n, @@ -256,9 +255,8 @@ detect_outlr_rm <- function(x = seq_along(y), y, n = 21, #' group_by(geo_value) %>% #' mutate(outlier_info = detect_outlr_stl( #' x = time_value, y = cases, -#' seasonal_period = 7 -#' )) %>% # weekly seasonality for daily data -#' unnest(outlier_info) +#' seasonal_period = 7 # weekly seasonality for daily data +#' )) detect_outlr_stl <- function(x = seq_along(y), y, n_trend = 21, n_seasonal = 21, @@ -359,9 +357,8 @@ roll_iqr <- function(z, n, detection_multiplier, min_radius, z %>% epi_slide( - roll_iqr = stats::IQR(resid), - before = floor((n - 1) / 2), - after = ceiling((n - 1) / 2) + roll_iqr = stats::IQR(resid, na.rm = TRUE), + .window_size = n, .align = "center" ) %>% dplyr::mutate( lower = pmax( diff --git a/R/reexports.R b/R/reexports.R index 02f5af53..00ac83c2 100644 --- a/R/reexports.R +++ b/R/reexports.R @@ -57,6 +57,19 @@ dplyr::slice tidyr::unnest +#' @importFrom tidyr complete +#' @export +tidyr::complete + +# We don't provide a method for full_seq, but complete-ing using +# full_seq(time_value) is still needed to make some downstream things behave +# nicely. So make that more ergonomic/discoverable with a re-export: + +#' @importFrom tidyr full_seq +#' @export +tidyr::full_seq + + # ggplot2 ----------------------------------------------------------------- #' @importFrom ggplot2 autoplot diff --git a/R/revision_analysis.R b/R/revision_analysis.R new file mode 100644 index 00000000..7be0cd24 --- /dev/null +++ b/R/revision_analysis.R @@ -0,0 +1,264 @@ +#' A function to describe revision behavior for an archive +#' @description +#' `revision_summary` removes all missing values (if requested), and then +#' computes some basic statistics about the revision behavior of an archive, +#' returning a tibble summarizing the revisions per time_value+epi_key features. If `print_inform` is true, it +#' prints a concise summary. The columns returned are: +#' 1. `n_revisions`: the total number of revisions for that entry +#' 2. `min_lag`: the minimum time to any value (if `drop_nas=FALSE`, this +#' includes `NA`'s) +#' 3. `max_lag`: the amount of time until the final (new) version (same caveat +#' for `drop_nas=FALSE`, though it is far less likely to matter) +#' 4. `min_value`: the minimum value across revisions +#' 5. 
`max_value`: the maximum value across revisions
+#' 6. `median_value`: the median value across revisions
+#' 7. `spread`: the difference between the smallest and largest values (this
+#' always excludes `NA` values)
+#' 8. `rel_spread`: `spread` divided by the largest value (so it will
+#' always be less than 1). Note that this need not be the final value. It will
+#' be `NA` whenever `spread` is 0.
+#' 9. `time_near_latest`: This gives the lag when the value is within
+#' `within_latest` (default 20%) of the value at the latest time. For example,
+#' consider the series (0, 20, 99, 150, 102, 100); then `time_near_latest` is
+#' the 5th index, since even though 99 is within 20%, it is outside the window
+#' afterwards at 150.
+#' @param epi_arch an epi_archive to be analyzed
+#' @param ... <[`tidyselect`][dplyr_tidy_select]>, used to choose the column to
+#' summarize. If empty, it chooses the first. Currently only implemented for
+#' one column at a time.
+#' @param drop_nas bool, drop any `NA` values from the archive? After dropping
+#' `NA`s, compactify is run again to make sure there are no duplicate values
+#' from occasions when the signal is revised to `NA`, and then back to its
+#' immediately-preceding value.
+#' @param print_inform bool, determines whether to print summary information, or
+#' only return the full summary tibble.
+#' @param min_waiting_period `difftime`, integer or `NULL`. Sets a cutoff: any
+#' time_values not earlier than `min_waiting_period` before `versions_end` are
+#' removed. `min_waiting_period` should characterize the typical time during
+#' which revisions occur. The default of 60 days corresponds to a typical
+#' final value for case counts as reported in the context of insurance claims.
+#' To avoid this filtering, either set to `NULL` or 0.
+#' @param within_latest double between 0 and 1. Determines the threshold
+#' used for the `time_near_latest` calculation.
+#' @param quick_revision difftime or integer (integer is treated as days), for
+#' the printed summary, the amount of time between the final revision and the
+#' actual time_value to consider the revision quickly resolved. Default of 3
+#' days.
+#' @param few_revisions integer, for the printed summary, the upper bound on the
+#' number of revisions to consider "few". Default is 3.
+#' @param abs_spread_threshold numeric, for the printed summary, the maximum
+#' spread used to characterize revisions which don't actually change very
+#' much. Default is 5% of the maximum value in the dataset, but this is the
+#' most unit dependent of values, and likely needs to be chosen appropriately
+#' for the scale of the dataset.
+#' @param rel_spread_threshold float between 0 and 1, for the printed summary,
+#' the relative spread fraction used to characterize revisions which don't
+#' actually change very much. Default is .1, or 10% of the final value.
+#' @param compactify_tol float, used if `drop_nas=TRUE`, it determines the
+#' threshold for when two floats are considered identical.
+#' @param should_compactify bool. Compactify if `TRUE`.
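+#' @details For intuition on `time_near_latest`, a hand-worked sketch of the
+#' series from above (values illustrative) with `within_latest = 0.2`:
+#' ```r
+#' values <- c(0, 20, 99, 150, 102, 100) # latest value is 100
+#' abs(values - 100) < 0.2 * 100         # FALSE FALSE TRUE FALSE TRUE TRUE
+#' # The last unbroken run of TRUEs starts at position 5, so `time_near_latest`
+#' # reports the lag at that position.
+#' ```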
+#' @examples +#' +#' revision_example <- revision_summary(archive_cases_dv_subset, percent_cli) +#' +#' revision_example %>% arrange(desc(spread)) +#' @export +#' @importFrom cli cli_inform cli_abort cli_li +#' @importFrom rlang list2 syms +#' @importFrom dplyr mutate group_by arrange filter if_any all_of across pull pick c_across +#' everything ungroup summarize if_else %>% +revision_summary <- function(epi_arch, + ..., + drop_nas = TRUE, + print_inform = TRUE, + min_waiting_period = as.difftime(60, units = "days"), + within_latest = 0.2, + quick_revision = as.difftime(3, units = "days"), + few_revisions = 3, + abs_spread_threshold = NULL, + rel_spread_threshold = 0.1, + compactify_tol = .Machine$double.eps^0.5, + should_compactify = TRUE) { + arg <- names(eval_select(rlang::expr(c(...)), allow_rename = FALSE, data = epi_arch$DT)) + if (length(arg) == 0) { + # Choose the first column that's not a key or version + arg <- setdiff(names(epi_arch$DT), c(key_colnames(epi_arch), "version"))[[1]] + } else if (length(arg) > 1) { + cli_abort("Not currently implementing more than one column at a time. Run each separately") + } + if (is.null(abs_spread_threshold)) { + abs_spread_threshold <- .05 * epi_arch$DT %>% + pull(...) %>% + max(na.rm = TRUE) + } + # for each time_value, get + # the number of revisions + # the maximum spread in value (both absolute and relative) + # the min lag + # the max lag + # + # revision_tibble + keys <- key_colnames(epi_arch) + + revision_behavior <- epi_arch$DT %>% + select(all_of(unique(c("geo_value", "time_value", keys, "version", arg)))) + if (!is.null(min_waiting_period)) { + revision_behavior <- revision_behavior %>% + filter(abs(time_value - as.Date(epi_arch$versions_end)) >= min_waiting_period) + } + + if (drop_nas) { + # if we're dropping NA's, we should recompactify + revision_behavior <- + revision_behavior %>% + filter(!is.na(c_across(!!arg))) + } else { + revision_behavior <- epi_arch$DT + } + if (should_compactify) { + revision_behavior <- revision_behavior %>% + arrange(across(c(geo_value, time_value, all_of(keys), version))) %>% # need to sort before compactifying + apply_compactify(c(keys, version), compactify_tol) + } + revision_behavior <- + revision_behavior %>% + mutate(lag = as.integer(version) - as.integer(time_value)) %>% # nolint: object_usage_linter + group_by(across(all_of(keys))) %>% # group by all the keys + summarize( + n_revisions = dplyr::n() - 1, + min_lag = min(lag), # nolint: object_usage_linter + max_lag = max(lag), # nolint: object_usage_linter + min_value = f_no_na(min, pick(!!arg)), + max_value = f_no_na(max, pick(!!arg)), + median_value = f_no_na(median, pick(!!arg)), + time_to = time_within_x_latest(lag, pick(!!arg), prop = within_latest), # nolint: object_usage_linter + .groups = "drop" + ) %>% + mutate( + spread = max_value - min_value, # nolint: object_usage_linter + rel_spread = spread / max_value, # nolint: object_usage_linter + # TODO the units here may be a problem + min_lag = as.difftime(min_lag, units = "days"), # nolint: object_usage_linter + max_lag = as.difftime(max_lag, units = "days"), # nolint: object_usage_linter + time_near_latest = as.difftime(time_to, units = "days") # nolint: object_usage_linter + ) %>% + select(-time_to) %>% + relocate( + time_value, geo_value, all_of(keys), n_revisions, min_lag, max_lag, # nolint: object_usage_linter + time_near_latest, spread, rel_spread, min_value, max_value, median_value # nolint: object_usage_linter + ) + if (print_inform) { + cli_inform("Min lag (time to first 
version):") + difftime_summary(revision_behavior$min_lag) %>% print() + if (!drop_nas) { + total_na <- epi_arch$DT %>% + filter(is.na(c_across(!!arg))) %>% # nolint: object_usage_linter + nrow() + cli_inform("Fraction of all versions that are `NA`:") + cli_li(num_percent(total_na, nrow(epi_arch$DT), "")) + cli_inform("") + } + cli_inform("Fraction of epi_key+time_values with") + total_num <- nrow(revision_behavior) # nolint: object_usage_linter + total_num_unrevised <- sum(revision_behavior$n_revisions == 0) # nolint: object_usage_linter + cli_inform("No revisions:") + cli_li(num_percent(total_num_unrevised, total_num, "")) + total_quickly_revised <- sum( # nolint: object_usage_linter + revision_behavior$max_lag <= + as.difftime(quick_revision, units = "days") + ) + cli_inform("Quick revisions (last revision within {quick_revision} +{units(quick_revision)} of the `time_value`):") + cli_li(num_percent(total_quickly_revised, total_num, "")) + total_barely_revised <- sum( # nolint: object_usage_linter + revision_behavior$n_revisions <= + few_revisions + ) + cli_inform("Few revisions (At most {few_revisions} revisions for that `time_value`):") + cli_li(num_percent(total_barely_revised, total_num, "")) + cli_inform("") + cli_inform("Fraction of revised epi_key+time_values which have:") + + real_revisions <- revision_behavior %>% filter(n_revisions > 0) # nolint: object_usage_linter + n_real_revised <- nrow(real_revisions) # nolint: object_usage_linter + rel_spread <- sum( # nolint: object_usage_linter + real_revisions$rel_spread < + rel_spread_threshold, + na.rm = TRUE + ) + sum(is.na(real_revisions$rel_spread)) + cli_inform("Less than {rel_spread_threshold} spread in relative value:") + cli_li(num_percent(rel_spread, n_real_revised, "")) + abs_spread <- sum( # nolint: object_usage_linter + real_revisions$spread > + abs_spread_threshold + ) # nolint: object_usage_linter + cli_inform("Spread of more than {abs_spread_threshold} in actual value (when revised):") + cli_li(num_percent(abs_spread, n_real_revised, "")) + + cli_inform("{units(quick_revision)} until within {within_latest*100}% of the latest value:") + difftime_summary(revision_behavior[["time_near_latest"]]) %>% print() + } + return(revision_behavior) +} + +#' pull the value from lags when values starts indefinitely being within prop of it's last value. +#' @param values this should be a 1 column tibble. errors may occur otherwise +#' @keywords internal +time_within_x_latest <- function(lags, values, prop = .2) { + values <- values[[1]] + latest_value <- values[[length(values)]] + close_enough <- abs(values - latest_value) < prop * latest_value + # we want to ignore any stretches where it's close, but goes farther away later + return(get_last_run(close_enough, lags)) +} + +#' return the first value in values_from from the last string of trues in bool_vec +#' @description +#' the point of this operation is to get the value in values_from which occurs +#' at the same index as the start of the last run of true values in bool_vec. 
+
+#' Return the first value in `values_from` corresponding to the last run of
+#' `TRUE`s in `bool_vec`
+#' @description
+#' The point of this operation is to get the value in `values_from` which occurs
+#' at the same index as the start of the last run of `TRUE` values in `bool_vec`.
+#' For example, in c(1, 1, 0, 1, 1), we want the 4th entry, since there's a 0
+#' breaking the run.
+#' @keywords internal
+get_last_run <- function(bool_vec, values_from) {
+  runs <- rle(bool_vec)
+  values_from[[length(bool_vec) - tail(runs$lengths, n = 1) + 1]]
+}
+
+#' Use when the default behavior returns a warning on empty lists, which we do
+#' not want, and there is no super clean way of preventing this
+#' @keywords internal
+f_no_na <- function(f, x) {
+  x <- x[!is.na(x)]
+  if (length(x) == 0) {
+    return(Inf)
+  } else {
+    return(f(x))
+  }
+}
+
+
+#' Simple util for printing a fraction and its percentage
+#' @keywords internal
+num_percent <- function(a, b, b_description) {
+  glue::glue("{prettyNum(a, big.mark=',')} out of {prettyNum(b, big.mark=',')} {b_description}
+({round(a/b*100,digits=2)}%)")
+}
+
+#' `summary` doesn't work on difftimes
+#' @keywords internal
+difftime_summary <- function(diff_time_val) {
+  if (length(diff_time_val) > 0) {
+    res <- data.frame(
+      min = min(diff_time_val),
+      median = median(diff_time_val),
+      mean = round(mean(diff_time_val), 1),
+      max = max(diff_time_val),
+      row.names = " ",
+      check.names = FALSE
+    )
+    return(res)
+  } else {
+    return(data.frame())
+  }
+}
diff --git a/R/slide.R b/R/slide.R
index be8d895b..5a7fbd6a 100644
--- a/R/slide.R
+++ b/R/slide.R
@@ -5,41 +5,43 @@
 #' for examples.
 #'
 #' @template basic-slide-params
-#' @param f Function, formula, or missing; together with `...` specifies the
+#' @param .f Function, formula, or missing; together with `...` specifies the
 #' computation to slide. To "slide" means to apply a computation within a
 #' sliding (a.k.a. "rolling") time window for each data group. The window is
-#' determined by the `before` and `after` parameters described below. One time
-#' step is typically one day or one week; see details for more explanation. If
-#' a function, `f` must take a data frame with the same column names as
-#' the original object, minus any grouping variables, containing the time
-#' window data for one group-`ref_time_value` combination; followed by a
-#' one-row tibble containing the values of the grouping variables for the
-#' associated group; followed by any number of named arguments. If a formula,
-#' `f` can operate directly on columns accessed via `.x$var` or `.$var`, as
-#' in `~mean(.x$var)` to compute a mean of a column `var` for each
+#' determined by the `.window_size` and `.align` parameters; see the details
+#' section for more. If a function, `.f` must have the form `function(x, g, t,
+#' ...)`, where
+#'
+#' - `x` is a data frame with the same column names as the original object,
+#'   minus any grouping variables, with only the windowed data for one
+#'   group-`.ref_time_value` combination
+#' - `g` is a one-row tibble containing the values of the grouping variables
+#'   for the associated group
+#' - `t` is the `.ref_time_value` for the current window
+#' - `...` are additional arguments
+#'
+#' If a formula, `.f` can operate directly on columns accessed via `.x$var` or
+#' `.$var`, as in `~mean(.x$var)` to compute a mean of a column `var` for each
 #' `ref_time_value`-group combination. The group key can be accessed via `.y`.
-#' If `f` is missing, then `...` will specify the computation.
+#' If `.f` is missing, then `...` will specify the computation.
 #' @param ... Additional arguments to pass to the function or formula specified
-#' via `f`.
Alternatively, if `f` is missing, then the `...` is interpreted as -#' an expression for tidy evaluation; in addition to referring to columns -#' directly by name, the expression has access to `.data` and `.env` pronouns -#' as in `dplyr` verbs, and can also refer to `.x`, `.group_key`, and -#' `.ref_time_value`. See details. -#' @param new_col_name String indicating the name of the new column that will -#' contain the derivative values. Default is "slide_value"; note that setting -#' `new_col_name` equal to an existing column name will overwrite this column. -#' @param as_list_col Should the slide results be held in a list column, or be -#' [unchopped][tidyr::unchop]/[unnested][tidyr::unnest]? Default is `FALSE`, -#' in which case a list object returned by `f` would be unnested (using -#' [`tidyr::unnest()`]), and, if the slide computations output data frames, -#' the names of the resulting columns are given by prepending `new_col_name` -#' to the names of the list elements. +#' via `.f`. Alternatively, if `.f` is missing, then the `...` is interpreted +#' as a ["data-masking"][rlang::args_data_masking] expression or expressions +#' for tidy evaluation; in addition to referring columns directly by name, the +#' expressions have access to `.data` and `.env` pronouns as in `dplyr` verbs, +#' and can also refer to `.x` (not the same as the input epi_df), +#' `.group_key`, and `.ref_time_value`. See details. +#' @param .new_col_name String indicating the name of the new column that will +#' contain the derivative values. The default is "slide_value" unless your +#' slide computations output data frames, in which case they will be unpacked +#' into the constituent columns and those names used. New columns should not +#' be given names that clash with the existing columns of `.x`; see details. #' #' @template basic-slide-details #' #' @importFrom lubridate days weeks -#' @importFrom dplyr bind_rows group_vars filter select -#' @importFrom rlang .data .env !! enquo enquos sym env missing_arg +#' @importFrom dplyr bind_rows group_map group_vars filter select +#' @importFrom rlang .data .env !! enquos sym env missing_arg #' @export #' @seealso [`epi_slide_opt`] [`epi_slide_mean`] [`epi_slide_sum`] #' @examples @@ -48,32 +50,28 @@ #' # the `epi_slide_mean` and `epi_slide_sum` functions instead. #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide(cases_7dav = mean(cases), before = 6) %>% -#' # Remove a nonessential var. to ensure new col is printed +#' epi_slide(cases_7dav = mean(cases), .window_size = 7) %>% #' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% #' ungroup() #' #' # slide a 7-day leading average #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide(cases_7dav = mean(cases), after = 6) %>% -#' # Remove a nonessential var. to ensure new col is printed +#' epi_slide(cases_7dav = mean(cases), .window_size = 7, .align = "left") %>% #' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% #' ungroup() #' -#' # slide a 7-day centre-aligned average +#' # slide a 7-day center-aligned average #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide(cases_7dav = mean(cases), before = 3, after = 3) %>% -#' # Remove a nonessential var. 
to ensure new col is printed +#' epi_slide(cases_7dav = mean(cases), .window_size = 7, .align = "center") %>% #' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% #' ungroup() #' -#' # slide a 14-day centre-aligned average +#' # slide a 14-day center-aligned average #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide(cases_14dav = mean(cases), before = 6, after = 7) %>% -#' # Remove a nonessential var. to ensure new col is printed +#' epi_slide(cases_14dav = mean(cases), .window_size = 14, .align = "center") %>% #' dplyr::select(geo_value, time_value, cases, cases_14dav) %>% #' ungroup() #' @@ -81,212 +79,419 @@ #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% #' epi_slide( -#' a = data.frame( +#' cases_2d = list(data.frame( #' cases_2dav = mean(cases), #' cases_2dma = mad(cases) -#' ), -#' before = 1, as_list_col = TRUE +#' )), +#' .window_size = 2 #' ) %>% #' ungroup() -epi_slide <- function(x, f, ..., before = NULL, after = NULL, ref_time_values = NULL, - new_col_name = "slide_value", as_list_col = FALSE, - names_sep = "_", all_rows = FALSE) { - assert_class(x, "epi_df") - - if (nrow(x) == 0L) { - return(x) +epi_slide <- function( + .x, .f, ..., + .window_size = NULL, .align = c("right", "center", "left"), + .ref_time_values = NULL, .new_col_name = NULL, .all_rows = FALSE) { + # Deprecated argument handling + provided_args <- rlang::call_args_names(rlang::call_match()) + if (any(provided_args %in% c("x", "f", "ref_time_values", "new_col_name", "all_rows"))) { + cli::cli_abort( + "epi_slide: you are using one of the following old argument names: `x`, `f`, `ref_time_values`, + `new_col_name`, or `all_rows`. Please use the new dot-prefixed names: `.x`, `.f`, `.ref_time_values`, + `.new_col_name`, `.all_rows`." + ) + } + if ("as_list_col" %in% provided_args) { + cli::cli_abort( + "epi_slide: the argument `as_list_col` is deprecated. If FALSE, you can just remove it. + If TRUE, have your given computation wrap its result using `list(result)` instead." + ) + } + if ("names_sep" %in% provided_args) { + cli::cli_abort( + "epi_slide: the argument `names_sep` is deprecated. If NULL, you can remove it, it is now default. + If a string, please manually prefix your column names instead." + ) + } + if ("before" %in% provided_args || "after" %in% provided_args) { + cli::cli_abort( + "epi_slide: `before` and `after` are deprecated for `epi_slide`. Use `.window_size` and `.align` instead. + See the slide documentation for more details." + ) } - if (is.null(ref_time_values)) { - ref_time_values <- unique(x$time_value) - } else { - assert_numeric(ref_time_values, min.len = 1L, null.ok = FALSE, any.missing = FALSE) - if (!test_subset(ref_time_values, unique(x$time_value))) { + # Validate arguments + assert_class(.x, "epi_df") + if (checkmate::test_class(.x, "grouped_df")) { + expected_group_keys <- .x %>% + key_colnames(exclude = "time_value") %>% + sort() + if (!identical(.x %>% group_vars() %>% sort(), expected_group_keys)) { cli_abort( - "`ref_time_values` must be a unique subset of the time values in `x`." + "epi_slide: `.x` must be either grouped by {expected_group_keys}. (Or you can just ungroup + `.x` and we'll do this grouping automatically.) 
You may need to aggregate your data first, + see aggregate_epi_df().", + class = "epiprocess__epi_slide__invalid_grouping" ) } - if (anyDuplicated(ref_time_values) != 0L) { - cli_abort("`ref_time_values` must not contain any duplicates; use `unique` if appropriate.") - } + } else { + .x <- group_epi_df(.x, exclude = "time_value") } - ref_time_values <- sort(ref_time_values) - - if (is.null(before) && !is.null(after)) { - if (inherits(after, "difftime")) { - before <- as.difftime(0, units = units(after)) - } else { - before <- 0 - } + if (nrow(.x) == 0L) { + return(.x) } - if (is.null(after) && !is.null(before)) { - if (inherits(before, "difftime")) { - after <- as.difftime(0, units = units(before)) - } else { - after <- 0 + # If `.f` is missing, interpret ... as an expression for tidy evaluation + if (missing(.f)) { + used_data_masking <- TRUE + quosures <- enquos(...) + if (length(quosures) == 0) { + cli_abort("If `.f` is missing then a computation must be specified via `...`.") } - } - validate_slide_window_arg(before, attr(x, "metadata")$time_type) - validate_slide_window_arg(after, attr(x, "metadata")$time_type) - - # Arrange by increasing time_value - x <- arrange(x, .data$time_value) - - # Now set up starts and stops for sliding/hopping - starts <- ref_time_values - before - stops <- ref_time_values + after - - # Symbolize new column name - new_col <- sym(new_col_name) - - # Computation for one group, all time values - slide_one_grp <- function(.data_group, - .group_key, # see `?group_modify` - ..., # `...` to `epi_slide` forwarded here - f_factory, - starts, - stops, - ref_time_values, - all_rows, - new_col) { - # Figure out which reference time values appear in the data group in the - # first place (we need to do this because it could differ based on the - # group, hence the setup/checks for the reference time values based on all - # the data could still be off): - o <- ref_time_values %in% .data_group$time_value - starts <- starts[o] - stops <- stops[o] - kept_ref_time_values <- ref_time_values[o] - - f <- f_factory(kept_ref_time_values) - - # Compute the slide values - slide_values_list <- slider::hop_index( - .x = .data_group, - .i = .data_group$time_value, - .starts = starts, - .stops = stops, - .f = f, - .group_key, ... - ) - # Now figure out which rows in the data group are in the reference time - # values; this will be useful for all sorts of checks that follow - o <- .data_group$time_value %in% kept_ref_time_values - num_ref_rows <- sum(o) + .f <- quosures + # Magic value that passes zero args as dots in calls below. Equivalent to + # `... <- missing_arg()`, but `assign` avoids warning about improper use of + # dots. + assign("...", missing_arg()) + } else { + used_data_masking <- FALSE + } + .slide_comp <- as_time_slide_computation(.f, ...) - # Count the number of appearances of each kept reference time value. 
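For concreteness, the two interfaces handled above can be compared directly. A minimal sketch, reusing the `jhu_csse_daily_subset` dataset from the examples: both calls compute the same 7-day trailing mean and differ only in the output column name ("cases_7dav" vs. the default "slide_value").

library(dplyr)
library(epiprocess)

# Data-masking interface: `.f` is missing, so `...` is captured as quosures.
jhu_csse_daily_subset %>%
  group_by(geo_value) %>%
  epi_slide(cases_7dav = mean(cases), .window_size = 7)

# Function interface: `.f` is called with the window data, the group key,
# and the reference time value.
jhu_csse_daily_subset %>%
  group_by(geo_value) %>%
  epi_slide(.f = function(x, gk, rtv) mean(x$cases), .window_size = 7)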
- counts <- dplyr::filter(.data_group, .data$time_value %in% kept_ref_time_values) %>% - dplyr::count(.data$time_value) %>% - `[[`("n") + .align <- rlang::arg_match(.align) + time_type <- attr(.x, "metadata")$time_type + if (is.null(.window_size)) { + cli_abort("epi_slide: `.window_size` must be specified.") + } + validate_slide_window_arg(.window_size, time_type) + window_args <- get_before_after_from_window(.window_size, .align, time_type) - if ( - !all(purrr::map_lgl(slide_values_list, is.atomic)) && - !all(purrr::map_lgl(slide_values_list, is.data.frame)) - ) { + if (is.null(.ref_time_values)) { + .ref_time_values <- unique(.x$time_value) + } else { + assert_numeric(.ref_time_values, min.len = 1L, null.ok = FALSE, any.missing = FALSE, unique = TRUE) + if (!test_subset(.ref_time_values, unique(.x$time_value))) { cli_abort( - "The slide computations must return always atomic vectors - or data frames (and not a mix of these two structures)." + "epi_slide: `ref_time_values` must be a unique subset of the time values in `x`.", + class = "epiprocess__epi_slide_invalid_ref_time_values" ) } + } + .ref_time_values <- sort(.ref_time_values) - # Unlist if appropriate: - slide_values <- - if (as_list_col) { - slide_values_list - } else { - vctrs::list_unchop(slide_values_list) - } - - if ( - all(purrr::map_int(slide_values_list, vctrs::vec_size) == 1L) && - length(slide_values_list) != 0L - ) { - # Recycle to make size stable (one slide value per ref time value). - # (Length-0 case also could be handled here, but causes difficulties; - # leave it to the next branch, where it also belongs.) - slide_values <- vctrs::vec_rep_each(slide_values, times = counts) - } else { - # Split and flatten if appropriate, perform a (loose) check on number of - # rows. - if (as_list_col) { - slide_values <- purrr::list_flatten(purrr::map( - slide_values, ~ vctrs::vec_split(.x, seq_len(vctrs::vec_size(.x)))[["val"]] - )) - } - if (vctrs::vec_size(slide_values) != num_ref_rows) { - cli_abort( - "The slide computations must either (a) output a single element/row each, or - (b) one element/row per appearance of the reference time value in the local window." - ) - } - } - - # If all rows, then pad slide values with NAs, else filter down data group - if (all_rows) { - orig_values <- slide_values - slide_values <- vctrs::vec_rep(vctrs::vec_cast(NA, orig_values), nrow(.data_group)) - # ^ using vctrs::vec_init would be shorter but docs don't guarantee it - # fills with NA equivalent. - vctrs::vec_slice(slide_values, o) <- orig_values - } else { - .data_group <- filter(.data_group, o) + assert_character(.new_col_name, null.ok = TRUE) + if (!is.null(.new_col_name)) { + if (.new_col_name %in% names(.x)) { + cli_abort(c("`.new_col_name` cannot overlap with existing column names", + "x" = "{sym(.new_col_name)} already exists in `.x`", + ">" = "Try using a different `.new_col_name` instead." + )) } - return(mutate(.data_group, !!new_col := slide_values)) } - # If `f` is missing, interpret ... as an expression for tidy evaluation - if (missing(f)) { - quos <- enquos(...) - if (length(quos) == 0) { - cli_abort("If `f` is missing then a computation must be specified via `...`.") - } - if (length(quos) > 1) { - cli_abort("If `f` is missing then only a single computation can be specified via `...`.") - } + assert_logical(.all_rows, len = 1) - f <- quos[[1]] - new_col <- sym(names(rlang::quos_auto_name(quos))) - # Magic value that passes zero args as dots in calls below. Equivalent to - # `... 
<- missing_arg()`, but use `assign` to avoid warning about - # improper use of dots. - assign("...", missing_arg()) + # Check for duplicated time values within groups + duplicated_time_values <- .x %>% + group_epi_df() %>% + filter(dplyr::n() > 1) %>% + ungroup() + if (nrow(duplicated_time_values) > 0) { + bad_data <- capture.output(duplicated_time_values) + cli_abort( + "as_epi_df: some groups in a resulting dplyr computation have duplicated time values. + epi_df requires a unique time_value per group.", + body = c("Sample groups:", bad_data) + ) } - f <- as_slide_computation(f, ...) + # Begin handling completion. This will create a complete time index between + # the smallest and largest time values in the data. This is used to ensure + # that the slide function is called with a complete window of data. Each slide + # group will filter this down to between its min and max time values. We also + # mark which dates were in the data and which were added by our completion. + date_seq_list <- full_date_seq(.x, window_args$before, window_args$after, time_type) + .x$.real <- TRUE + # Create a wrapper that calculates and passes `.ref_time_value` to the - # computation. `i` is contained in the `f_wrapper_factory` environment such - # that when called within `slide_one_grp` `i` is reset for every group. - f_wrapper_factory <- function(kept_ref_time_values) { - # Use `i` to advance through list of start dates. + # computation. `i` is contained in the `slide_comp_wrapper_factory` + # environment such that when called within `slide_one_grp` `i` advances + # through the list of reference time values within a group and then resets + # back to 1 when switching groups. + slide_comp_wrapper_factory <- function(kept_ref_time_values) { i <- 1L - f_wrapper <- function(.x, .group_key, ...) { + slide_comp_wrapper <- function(.x, .group_key, ...) { .ref_time_value <- kept_ref_time_values[[i]] i <<- i + 1L - f(.x, .group_key, .ref_time_value, ...) + .slide_comp(.x, .group_key, .ref_time_value, ...) } - return(f_wrapper) + return(slide_comp_wrapper) + } + + # - If .x is not grouped, then the trivial group is applied: + # https://dplyr.tidyverse.org/reference/group_map.html + # - We create a lambda that forwards the necessary slide arguments to + # `epi_slide_one_group`. + # - `...` from top of `epi_slide` are forwarded to `.f` here through + # group_modify and through the lambda. + .x_groups <- groups(.x) + result <- group_map( + .x, + .f = function(.data_group, .group_key, ...) { + epi_slide_one_group( + .data_group, .group_key, ..., + .slide_comp_factory = slide_comp_wrapper_factory, + .before = window_args$before, + .after = window_args$after, + .ref_time_values = .ref_time_values, + .all_rows = .all_rows, + .new_col_name = .new_col_name, + .used_data_masking = used_data_masking, + .time_type = time_type, + .date_seq_list = date_seq_list + ) + }, + ..., + .keep = TRUE + ) %>% + bind_rows() %>% + filter(.real) %>% + select(-.real) %>% + arrange_col_canonical() %>% + group_by(!!!.x_groups) + + # If every group in epi_slide_one_group takes the + # length(available_ref_time_values) == 0 branch then we end up here. + if (ncol(result) == ncol(.x %>% select(-.real))) { + cli_abort( + "epi_slide: no new columns were created. This can happen if every group has no available ref_time_values. 
+ This is likely a mistake in your data, in the slide computation, or in the ref_time_values argument.", + class = "epiprocess__epi_slide_no_new_columns" + ) } - x <- group_modify(x, slide_one_grp, + return(result) +} + +# Slide applied to one group. See `?group_modify` for the expected structure. The dots +# `...` forward their inputs to the function `f`. +epi_slide_one_group <- function( + .data_group, .group_key, ..., - f_factory = f_wrapper_factory, - starts = starts, - stops = stops, - ref_time_values = ref_time_values, - all_rows = all_rows, - new_col = new_col, - .keep = FALSE + .slide_comp_factory, .before, .after, .ref_time_values, .all_rows, + .new_col_name, .used_data_masking, .time_type, .date_seq_list) { + available_ref_time_values <- .ref_time_values[ + .ref_time_values >= min(.data_group$time_value) & .ref_time_values <= max(.data_group$time_value) + ] + + # Unpack the date_seq_list argument and complete the data group with missing + # time values, padding on the left and right as needed. + all_dates <- .date_seq_list$all_dates + missing_times <- all_dates[!(all_dates %in% .data_group$time_value)] + .data_group <- bind_rows( + .data_group, + dplyr::bind_cols( + .group_key, + tibble( + time_value = c( + missing_times, + .date_seq_list$pad_early_dates, + .date_seq_list$pad_late_dates + ), .real = FALSE + ) + ) + ) %>% + arrange(.data$time_value) + + # If the data group does not contain any of the reference time values, return + # the original .data_group without slide columns and let bind_rows at the end + # of group_modify handle filling the empty data frame with NA values. + if (length(available_ref_time_values) == 0L) { + if (.all_rows) { + return(.data_group) + } + return(.data_group %>% filter(FALSE)) + } + + # Get stateful function that tracks ref_time_value per group and sends it to + # `f` when called. + .slide_comp <- .slide_comp_factory(available_ref_time_values) + + if (.time_type == "yearmonth" && identical(.before, Inf)) { + # - Inf is NA(s) rather than -Inf as a yearmonth; feed in -Inf manually + # (it will successfully be cast to -Inf as a yearmonth) + starts <- rep(-Inf, length(available_ref_time_values)) + stops <- available_ref_time_values + .after + } else { + starts <- available_ref_time_values - .before + stops <- available_ref_time_values + .after + } + + # Compute the slide values. slider::hop_index will return a list of f outputs + # e.g. list(f(.slide_group_1, .group_key, .ref_time_value_1), + # f(.slide_group_1, .group_key, .ref_time_value_2), ...) + slide_values_list <- slider::hop_index( + .x = .data_group, + .i = .data_group$time_value, + .starts = starts, + .stops = stops, + .f = .slide_comp, + .group_key, ... ) - # Unnest if we need to, and return - if (!as_list_col) { - x <- unnest(x, !!new_col, names_sep = names_sep) + # Validate returned values. This used to only happen when + # .used_data_masking=FALSE, so if it seems too slow, consider bringing that + # back. + return_types <- purrr::map_chr(slide_values_list, function(x) { + if (is.data.frame(x)) { + return("data.frame") + } else if (vctrs::obj_is_vector(x) && is.null(vctrs::vec_names(x))) { + return("vector") + } else { + return("other") + } + }) %>% unique() + # Returned values must be data.frame or vector. 
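A minimal sketch of what the return-type checks below accept and reject, assuming a hypothetical grouped `epi_df` named `edf`:

edf %>% epi_slide(~ mean(.x$cases), .window_size = 7) # scalar: ok
edf %>% epi_slide(~ data.frame(m = mean(.x$cases)), .window_size = 7) # one-row data frame: ok
edf %>% epi_slide(~ list(.x$cases), .window_size = 7) # size-1 list (becomes a list column): ok
edf %>% epi_slide(~ c(m = mean(.x$cases)), .window_size = 7) # named vector: "other", error
edf %>% epi_slide(~ .x$cases, .window_size = 7) # more than one element per window: error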
+ if ("other" %in% return_types) { + cli_abort( + "epi_slide: slide computations must always return either data frames without rownames + or unnamed vectors (as determined by the vctrs package).", + class = "epiprocess__invalid_slide_comp_value" + ) + } + # Returned values must all be the same type. + if (length(return_types) != 1L) { + cli_abort( + "epi_slide: slide computations must always return either a data.frame or a vector (as determined by the + vctrs package), but not a mix of the two.", + class = "epiprocess__invalid_slide_comp_value" + ) + } + # Returned values must always be a scalar vector or a data frame with one row. + if (any(vctrs::list_sizes(slide_values_list) != 1L)) { + cli_abort( + "epi_slide: slide computations must return a single element (e.g. a scalar value, a single data.frame row, + or a list).", + class = "epiprocess__invalid_slide_comp_value" + ) + } + # Flatten the output list. This will also error if the user's slide function + # returned inconsistent types. + slide_values <- slide_values_list %>% vctrs::list_unchop() + + # If all rows, then pad slide values with NAs, else filter down data group + if (.all_rows) { + orig_values <- slide_values + slide_values <- vctrs::vec_rep(vctrs::vec_cast(NA, orig_values), nrow(.data_group)) + vctrs::vec_slice(slide_values, .data_group$time_value %in% available_ref_time_values) <- orig_values + } else { + .data_group <- .data_group %>% filter(time_value %in% available_ref_time_values) + } + + # To label the result, we will parallel some code from `epix_slide`, though + # some logic is different and some optimizations are less likely to be + # needed as we're at a different loop depth. + + # Unlike `epix_slide`, we will not every have to deal with a 0-row + # `.group_key`: we return early if `epi_slide`'s `.x` has 0 rows, and our + # loop over groups is the outer loop (>= 1 row into the group loop ensures + # we will have only 1-row `.group_key`s). Further, unlike `epix_slide`, we + # actually will be using `.group_data` rather than work with `.group_key` at + # all, in order to keep the pre-existing non-key columns. We will also try + # to work directly with `epi_df`s instead of listified tibbles; since we're + # not in as tight of a loop, the increased overhead hopefully won't matter. + # We'll need to use `bind_cols` rather than `c` to avoid losing + # `epi_df`ness. 
+ + res <- .data_group + + if (is.null(.new_col_name)) { + if (inherits(slide_values, "data.frame")) { + # Sometimes slide_values can parrot back columns already in `res`; allow + # this, but balk if a column has the same name as one in `res` but a + # different value: + comp_nms <- names(slide_values) + overlaps_existing_names <- comp_nms %in% names(res) + for (comp_i in which(overlaps_existing_names)) { + if (!identical(slide_values[[comp_i]], res[[comp_nms[[comp_i]]]])) { + lines <- c( + cli::format_error(c( + "New column and old column clash", + "x" = "slide computation output included a + {format_varname(comp_nms[[comp_i]])} column, but `.x` already had a + {format_varname(comp_nms[[comp_i]])} column with differing values", + "Here are examples of differing values, where the grouping variables were + {format_tibble_row(.group_key)}:" + )), + capture.output(print(waldo::compare( + res[[comp_nms[[comp_i]]]], slide_values[[comp_i]], + x_arg = rlang::expr_deparse(dplyr::expr(`$`(existing, !!sym(comp_nms[[comp_i]])))), # nolint: object_usage_linter + y_arg = rlang::expr_deparse(dplyr::expr(`$`(comp_value, !!sym(comp_nms[[comp_i]])))) # nolint: object_usage_linter + ))), + cli::format_message(c( + ">" = "You likely want to rename or remove this column from your slide + computation's output, or debug why it has a different value." + )) + ) + rlang::abort(paste(collapse = "\n", lines), + class = "epiprocess__epi_slide_output_vs_existing_column_conflict" + ) + } + } + # Unpack into separate columns (without name prefix). If there are + # columns duplicating existing columns, de-dupe and order them as if they + # didn't exist in slide_values. + res <- dplyr::bind_cols(res, slide_values[!overlaps_existing_names]) + } else { + # Apply default name (to vector or packed data.frame-type column): + if ("slide_value" %in% names(res)) { + cli_abort(c("Cannot guess a good column name for your output", + "x" = "`slide_value` already exists in `.x`", + ">" = "Please provide a `.new_col_name`." + )) + } + res[["slide_value"]] <- slide_values + } + } else { + # Vector or packed data.frame-type column (note: overlaps with existing + # column names should already be forbidden by earlier validation): + res[[.new_col_name]] <- slide_values } - return(x) + return(res) +} + +get_before_after_from_window <- function(window_size, align, time_type) { + if (identical(window_size, Inf)) { + if (align == "right") { + before <- Inf + # styler: off + after <- switch(time_type, + day = , week = as.difftime(0, units = glue::glue("{time_type}s")), + yearmonth = , integer = 0L, + cli_abort("Unrecognized time_type: {time_type}.") + ) + # styler: on + } else { + cli_abort( + "`epi_slide`: center and left alignment are not supported with an infinite window size." + ) + } + } else { + if (align == "right") { + before <- window_size - 1 + after <- 0 + } else if (align == "center") { + # For window_size = 5, before = 2, after = 2. For window_size = 4, before = 2, after = 1. + before <- floor(window_size / 2) + after <- window_size - before - 1 + } else if (align == "left") { + before <- 0 + after <- window_size - 1 + } + } + return(list(before = before, after = after)) } -#' Optimized slide function for performing common rolling computations on an `epi_df` object +#' Optimized slide function for performing common rolling computations on an +#' `epi_df` object #' #' Slides an n-timestep [data.table::froll] or [slider::summary-slide] function #' over variables in an `epi_df` object. 
See the @@ -295,33 +500,25 @@ epi_slide <- function(x, f, ..., before = NULL, after = NULL, ref_time_values = #' #' @template basic-slide-params #' @template opt-slide-params -#' @param f Function; together with `...` specifies the computation to slide. -#' `f` must be one of `data.table`'s rolling functions +#' @param .f Function; together with `...` specifies the computation to slide. +#' `.f` must be one of `data.table`'s rolling functions #' (`frollmean`, `frollsum`, `frollapply`. See [data.table::roll]) or one #' of `slider`'s specialized sliding functions (`slide_mean`, `slide_sum`, -#' etc. See [slider::summary-slide]). To "slide" means to apply a -#' computation within a sliding (a.k.a. "rolling") time window for each data -#' group. The window is determined by the `before` and `after` parameters -#' described below. One time step is typically one day or one week; see -#' details for more explanation. +#' etc. See [slider::summary-slide]). #' #' The optimized `data.table` and `slider` functions can't be directly passed -#' as the computation function in `epi_slide` without careful handling to -#' make sure each computation group is made up of the `n` dates rather than -#' `n` points. `epi_slide_opt` (and wrapper functions `epi_slide_mean` and -#' `epi_slide_sum`) take care of window completion automatically to prevent -#' associated errors. -#' @param ... Additional arguments to pass to the slide computation `f`, for -#' example, `na.rm` and `algo` if `f` is a `data.table` function. If `f` is -#' a `data.table` function, it is automatically passed the data `x` to -#' operate on, the window size `n`, and the alignment `align`. Providing -#' these args via `...` will cause an error. If `f` is a `slider` function, -#' it is automatically passed the data `x` to operate on, and number of -#' points `before` and `after` to use in the computation. -#' +#' as the computation function in `epi_slide` without careful handling to make +#' sure each computation group is made up of the `.window_size` dates rather +#' than `.window_size` points. `epi_slide_opt` (and wrapper functions +#' `epi_slide_mean` and `epi_slide_sum`) take care of window completion +#' automatically to prevent associated errors. +#' @param ... Additional arguments to pass to the slide computation `.f`, for +#' example, `algo` or `na.rm` in data.table functions. You don't need to +#' specify `.x`, `.window_size`, or `.align` (or `before`/`after` for slider +#' functions). #' @template opt-slide-details #' -#' @importFrom dplyr bind_rows mutate %>% arrange tibble select +#' @importFrom dplyr bind_rows mutate %>% arrange tibble select all_of #' @importFrom rlang enquo quo_get_expr as_label expr_label caller_arg #' @importFrom tidyselect eval_select #' @importFrom purrr map map_lgl @@ -337,7 +534,7 @@ epi_slide <- function(x, f, ..., before = NULL, after = NULL, ref_time_values = #' group_by(geo_value) %>% #' epi_slide_opt( #' cases, -#' f = data.table::frollmean, before = 6 +#' .f = data.table::frollmean, .window_size = 7 #' ) %>% #' # Remove a nonessential var. 
to ensure new col is printed, and rename new col #' dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) %>% @@ -349,9 +546,9 @@ epi_slide <- function(x, f, ..., before = NULL, after = NULL, ref_time_values = #' group_by(geo_value) %>% #' epi_slide_opt( #' cases, -#' f = data.table::frollmean, before = 6, +#' .f = data.table::frollmean, .window_size = 7, #' # `frollmean` options -#' na.rm = TRUE, algo = "exact", hasNA = TRUE +#' algo = "exact", hasNA = TRUE, na.rm = TRUE #' ) %>% #' dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) %>% #' ungroup() @@ -361,74 +558,88 @@ epi_slide <- function(x, f, ..., before = NULL, after = NULL, ref_time_values = #' group_by(geo_value) %>% #' epi_slide_opt( #' cases, -#' f = slider::slide_mean, after = 6 +#' .f = slider::slide_mean, .window_size = 7, .align = "left" #' ) %>% #' # Remove a nonessential var. to ensure new col is printed #' dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) %>% #' ungroup() #' -#' # slide a 7-day centre-aligned sum. This can also be done with `epi_slide_sum` +#' # slide a 7-day center-aligned sum. This can also be done with `epi_slide_sum` #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% #' epi_slide_opt( #' cases, -#' f = data.table::frollsum, before = 3, after = 3 +#' .f = data.table::frollsum, .window_size = 6, .align = "center" #' ) %>% #' # Remove a nonessential var. to ensure new col is printed #' dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) %>% #' ungroup() -epi_slide_opt <- function(x, col_names, f, ..., before = NULL, after = NULL, ref_time_values = NULL, - new_col_name = NULL, as_list_col = NULL, - names_sep = NULL, all_rows = FALSE) { - assert_class(x, "epi_df") +epi_slide_opt <- function( + .x, .col_names, .f, ..., + .window_size = 1, .align = c("right", "center", "left"), + .ref_time_values = NULL, .all_rows = FALSE) { + assert_class(.x, "epi_df") - if (nrow(x) == 0L) { - cli_abort( - c( - "input data `x` unexpectedly has 0 rows", - "i" = "If this computation is occuring within an `epix_slide` call, - check that `epix_slide` `ref_time_values` argument was set appropriately" - ), - class = "epiprocess__epi_slide_opt__0_row_input", - epiprocess__x = x + # Deprecated argument handling + provided_args <- rlang::call_args_names(rlang::call_match()) + if (any(purrr::map_lgl(provided_args, ~ .x %in% c("x", "col_names", "f", "ref_time_values", "all_rows")))) { + cli::cli_abort( + "epi_slide_opt: you are using one of the following old argument names: `x`, `col_names`, `f`, `ref_time_values`, + or `all_rows`. Please use the new dot-prefixed names: `.x`, `.col_names`, `.f`, + `.ref_time_values`, `.all_rows`." ) } - - if (!is.null(as_list_col)) { - cli_abort( - "`as_list_col` is not supported for `epi_slide_[opt/mean/sum]`", - class = "epiprocess__epi_slide_opt__list_not_supported" + if ("as_list_col" %in% provided_args) { + cli::cli_abort( + "epi_slide_opt: the argument `as_list_col` is deprecated. If FALSE, you can just remove it. + If TRUE, have your given computation wrap its result using `list(result)` instead." ) } - if (!is.null(new_col_name)) { - cli_abort( - "`new_col_name` is not supported for `epi_slide_[opt/mean/sum]`", + if ("before" %in% provided_args || "after" %in% provided_args) { + cli::cli_abort( + "epi_slide_opt: `before` and `after` are deprecated for `epi_slide`. Use `.window_size` and `.align` instead. + See the slide documentation for more details." 
+    )
+  }
+  if ("new_col_name" %in% provided_args || ".new_col_name" %in% provided_args) {
+    cli::cli_abort(
+      "epi_slide_opt: the argument `new_col_name` is not supported for `epi_slide_opt`. If you want to customize
+      the output column names, use `dplyr::rename` after the slide.",
       class = "epiprocess__epi_slide_opt__new_name_not_supported"
     )
   }
-  if (!is.null(names_sep)) {
-    cli_abort(
-      "`names_sep` is not supported for `epi_slide_[opt/mean/sum]`",
+  if ("names_sep" %in% provided_args || ".names_sep" %in% provided_args) {
+    cli::cli_abort(
+      "epi_slide_opt: the argument `names_sep` is not supported for `epi_slide_opt`. If you want to customize
+      the output column names, use `dplyr::rename` after the slide.",
       class = "epiprocess__epi_slide_opt__name_sep_not_supported"
     )
   }
-  # Check that slide function `f` is one of those short-listed from
+  if (nrow(.x) == 0L) {
+    cli_abort(
+      c(
+        "input data `.x` unexpectedly has 0 rows",
+        "i" = "If this computation is occurring within an `epix_slide` call,
+          check that `epix_slide` `.versions` argument was set appropriately"
+      ),
+      class = "epiprocess__epi_slide_opt__0_row_input",
+      epiprocess__x = .x
+    )
+  }
+
+  # Check that slide function `.f` is one of those short-listed from
   # `data.table` and `slider` (or a function that has the exact same
   # definition, e.g. if the function has been reexported or defined
   # locally).
   if (any(map_lgl(
     list(frollmean, frollsum, frollapply),
-    function(roll_fn) {
-      identical(f, roll_fn)
-    }
+    ~ identical(.f, .x)
   ))) {
     f_from_package <- "data.table"
   } else if (any(map_lgl(
     list(slide_sum, slide_prod, slide_mean, slide_min, slide_max, slide_all, slide_any),
-    function(roll_fn) {
-      identical(f, roll_fn)
-    }
+    ~ identical(.f, .x)
   ))) {
     f_from_package <- "slider"
   } else {
@@ -442,66 +653,91 @@ epi_slide_opt <- function(x, col_names, f, ..., before = NULL, after = NULL, ref
         etc. See `?slider::\`summary-slide\`` for more options)."
       ),
       class = "epiprocess__epi_slide_opt__unsupported_slide_function",
-      epiprocess__f = f
+      epiprocess__f = .f
     )
   }
 
-  user_provided_rtvs <- !is.null(ref_time_values)
+  user_provided_rtvs <- !is.null(.ref_time_values)
   if (!user_provided_rtvs) {
-    ref_time_values <- unique(x$time_value)
+    .ref_time_values <- unique(.x$time_value)
   } else {
-    assert_numeric(ref_time_values, min.len = 1L, null.ok = FALSE, any.missing = FALSE)
-    if (!test_subset(ref_time_values, unique(x$time_value))) {
+    assert_numeric(.ref_time_values, min.len = 1L, null.ok = FALSE, any.missing = FALSE)
+    if (!test_subset(.ref_time_values, unique(.x$time_value))) {
       cli_abort(
-        "`ref_time_values` must be a unique subset of the time values in `x`."
+ "`ref_time_values` must be a unique subset of the time values in `x`.", + class = "epiprocess__epi_slide_opt_invalid_ref_time_values" ) } - if (anyDuplicated(ref_time_values) != 0L) { - cli_abort("`ref_time_values` must not contain any duplicates; use `unique` if appropriate.") + if (anyDuplicated(.ref_time_values) != 0L) { + cli_abort( + "`ref_time_values` must not contain any duplicates; use `unique` if appropriate.", + class = "epiprocess__epi_slide_opt_invalid_ref_time_values" + ) } } - ref_time_values <- sort(ref_time_values) + ref_time_values <- sort(.ref_time_values) - if (is.null(before) && !is.null(after)) { - if (inherits(after, "difftime")) { - before <- as.difftime(0, units = units(after)) + # Handle window arguments + align <- rlang::arg_match(.align) + time_type <- attr(.x, "metadata")$time_type + validate_slide_window_arg(.window_size, time_type) + if (identical(.window_size, Inf)) { + if (align == "right") { + before <- Inf + if (time_type %in% c("day", "week")) { + after <- as.difftime(0, units = glue::glue("{time_type}s")) + } else { + after <- 0 + } } else { - before <- 0 + cli_abort( + "`epi_slide`: center and left alignment are not supported with an infinite window size." + ) } - } - if (is.null(after) && !is.null(before)) { - if (inherits(before, "difftime")) { - after <- as.difftime(0, units = units(before)) - } else { - after <- 0 + } else { + if (align == "right") { + before <- .window_size - 1 + if (time_type %in% c("day", "week")) { + after <- as.difftime(0, units = glue::glue("{time_type}s")) + } else { + after <- 0 + } + } else if (align == "center") { + # For .window_size = 5, before = 2, after = 2. For .window_size = 4, before = 2, after = 1. + before <- floor(.window_size / 2) + after <- .window_size - before - 1 + } else if (align == "left") { + if (time_type %in% c("day", "week")) { + before <- as.difftime(0, units = glue::glue("{time_type}s")) + } else { + before <- 0 + } + after <- .window_size - 1 } } - validate_slide_window_arg(before, attr(x, "metadata")$time_type) - validate_slide_window_arg(after, attr(x, "metadata")$time_type) - # Make a complete date sequence between min(x$time_value) and max(x$time_value). - date_seq_list <- full_date_seq(x, before, after, attr(x, "metadata")$time_type) + # Make a complete date sequence between min(.x$time_value) and max(.x$time_value). + date_seq_list <- full_date_seq(.x, before, after, time_type) all_dates <- date_seq_list$all_dates pad_early_dates <- date_seq_list$pad_early_dates pad_late_dates <- date_seq_list$pad_late_dates - # `frollmean` is 1-indexed, so create a new window width based on our - # `before` and `after` params. - window_size <- before + after + 1L - - # The position of a given column can be differ between input `x` and + # The position of a given column can be differ between input `.x` and # `.data_group` since the grouping step by default drops grouping columns. # To avoid rerunning `eval_select` for every `.data_group`, convert # positions of user-provided `col_names` into string column names. We avoid # using `names(pos)` directly for robustness and in case we later want to # allow users to rename fields via tidyselection. 
- pos <- eval_select(rlang::enquo(col_names), data = x, allow_rename = FALSE) - col_names_chr <- names(x)[pos] + if (inherits(quo_get_expr(enquo(.col_names)), "character")) { + pos <- eval_select(dplyr::all_of(.col_names), data = .x, allow_rename = FALSE) + } else { + pos <- eval_select(enquo(.col_names), data = .x, allow_rename = FALSE) + } + col_names_chr <- names(.x)[pos] # Always rename results to "slide_value_". result_col_names <- paste0("slide_value_", col_names_chr) slide_one_grp <- function(.data_group, .group_key, ...) { missing_times <- all_dates[!(all_dates %in% .data_group$time_value)] - # `frollmean` requires a full window to compute a result. Add NA values # to beginning and end of the group so that we get results for the # first `before` and last `after` elements. @@ -511,57 +747,63 @@ epi_slide_opt <- function(x, col_names, f, ..., before = NULL, after = NULL, ref ) %>% arrange(.data$time_value) - # If a group contains duplicate time values, `frollmean` will still only - # use the last `k` obs. It isn't looking at dates, it just goes in row - # order. So if the computation is aggregating across multiple obs for the - # same date, `epi_slide_opt` and derivates will produce incorrect - # results; `epi_slide` should be used instead. - if (anyDuplicated(.data_group$time_value) != 0L) { - cli_abort( - c( - "group contains duplicate time values. Using `epi_slide_[opt/mean/sum]` on this - group will result in incorrect results", - "i" = "Please change the grouping structure of the input data so that - each group has non-duplicate time values (e.g. `x %>% group_by(geo_value) - %>% epi_slide_opt(f = frollmean)`)", - "i" = "Use `epi_slide` to aggregate across groups" - ), - class = "epiprocess__epi_slide_opt__duplicate_time_values", - epiprocess__data_group = .data_group, - epiprocess__group_key = .group_key - ) - } - if (nrow(.data_group) != length(c(all_dates, pad_early_dates, pad_late_dates))) { - cli_abort( - c( - "group contains an unexpected number of rows", - "i" = c("Input data may contain `time_values` closer together than the - expected `time_step` size") - ), - class = "epiprocess__epi_slide_opt__unexpected_row_number", - epiprocess__data_group = .data_group, - epiprocess__group_key = .group_key - ) - } - if (f_from_package == "data.table") { - roll_output <- f( - x = .data_group[, col_names_chr], n = window_size, align = "right", ... - ) + # If a group contains duplicate time values, `frollmean` will still only + # use the last `k` obs. It isn't looking at dates, it just goes in row + # order. So if the computation is aggregating across multiple obs for the + # same date, `epi_slide_opt` and derivates will produce incorrect results; + # `epi_slide` should be used instead. + if (anyDuplicated(.data_group$time_value) != 0L) { + cli_abort( + c( + "group contains duplicate time values. Using `epi_slide_[opt/mean/sum]` on this + group will result in incorrect results", + "i" = "Please change the grouping structure of the input data so that + each group has non-duplicate time values (e.g. 
`x %>% group_by(geo_value) + %>% epi_slide_opt(.f = frollmean)`)", + "i" = "Use `epi_slide` to aggregate across groups" + ), + class = "epiprocess__epi_slide_opt__duplicate_time_values", + epiprocess__data_group = .data_group, + epiprocess__group_key = .group_key + ) + } + if (nrow(.data_group) != length(c(all_dates, pad_early_dates, pad_late_dates))) { + cli_abort( + c( + "group contains an unexpected number of rows", + "i" = c("Input data may contain `time_values` closer together than the + expected `time_step` size") + ), + class = "epiprocess__epi_slide_opt__unexpected_row_number", + epiprocess__data_group = .data_group, + epiprocess__group_key = .group_key + ) + } + + # `frollmean` is 1-indexed, so create a new window width based on our + # `before` and `after` params. Right-aligned `frollmean` results' + # `ref_time_value`s will be `after` timesteps ahead of where they should + # be; shift results to the left by `after` timesteps. + if (before != Inf) { + window_size <- before + after + 1L + roll_output <- .f(x = .data_group[, col_names_chr], n = window_size, ...) + } else { + window_size <- list(seq_along(.data_group$time_value)) + roll_output <- .f(x = .data_group[, col_names_chr], n = window_size, adaptive = TRUE, ...) + } if (after >= 1) { - # Right-aligned `frollmean` results' `ref_time_value`s will be `after` - # timesteps ahead of where they should be. Shift results to the left by - # `after` timesteps. .data_group[, result_col_names] <- purrr::map(roll_output, function(.x) { c(.x[(after + 1L):length(.x)], rep(NA, after)) }) } else { .data_group[, result_col_names] <- roll_output } - } else if (f_from_package == "slider") { + } + if (f_from_package == "slider") { for (i in seq_along(col_names_chr)) { - .data_group[, result_col_names[i]] <- f( + .data_group[, result_col_names[i]] <- .f( x = .data_group[[col_names_chr[i]]], before = as.numeric(before), after = as.numeric(after), @@ -573,22 +815,22 @@ epi_slide_opt <- function(x, col_names, f, ..., before = NULL, after = NULL, ref return(.data_group) } - result <- mutate(x, .real = TRUE) %>% - group_modify(slide_one_grp, ..., .keep = FALSE) - - result <- result[result$.real, ] - result$.real <- NULL + result <- mutate(.x, .real = TRUE) %>% + group_modify(slide_one_grp, ..., .keep = FALSE) %>% + filter(.data$.real) %>% + select(-.real) %>% + arrange_col_canonical() - if (all_rows) { + if (.all_rows) { result[!(result$time_value %in% ref_time_values), result_col_names] <- NA } else if (user_provided_rtvs) { result <- result[result$time_value %in% ref_time_values, ] } if (!is_epi_df(result)) { - # `all_rows`handling strips epi_df format and metadata. + # `.all_rows` handling strips epi_df format and metadata. # Restore them. - result <- reclass(result, attributes(x)$metadata) + result <- reclass(result, attributes(.x)$metadata) } return(result) @@ -600,14 +842,14 @@ epi_slide_opt <- function(x, col_names, f, ..., before = NULL, after = NULL, ref #' vignette](https://cmu-delphi.github.io/epiprocess/articles/slide.html) for #' examples. #' -#' Wrapper around `epi_slide_opt` with `f = datatable::frollmean`. +#' Wrapper around `epi_slide_opt` with `.f = datatable::frollmean`. #' #' @template basic-slide-params #' @template opt-slide-params -#' @param ... Additional arguments to pass to `data.table::frollmean`, for -#' example, `na.rm` and `algo`. `data.table::frollmean` is automatically -#' passed the data `x` to operate on, the window size `n`, and the alignment -#' `align`. Providing these args via `...` will cause an error. 
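A minimal sketch of why `slide_one_grp` above shifts right-aligned `frollmean` results left by `after` steps, assuming only `data.table`:

library(data.table)
x <- c(1, 2, 3, 4, 5, 6, 7)
before <- 1
after <- 1
y <- frollmean(x, n = before + after + 1)
#> NA NA 2 3 4 5 6   (right-aligned: result i uses x[(i - 2):i])
shifted <- c(y[(after + 1):length(y)], rep(NA, after))
#> NA 2 3 4 5 6 NA   (result i now uses x[(i - 1):(i + 1)], a centered window)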
+#' @param ... Additional arguments to pass to the slide computation `.f`, for
+#'   example, `algo` or `na.rm` in data.table functions. You don't need to
+#'   specify `.x`, `.window_size`, or `.align` (or `before`/`after` for slider
+#'   functions).
 #'
 #' @template opt-slide-details
 #'
@@ -617,7 +859,7 @@ epi_slide_opt <- function(x, col_names, f, ..., before = NULL, after = NULL, ref
 #' # slide a 7-day trailing average formula on cases
 #' jhu_csse_daily_subset %>%
 #'   group_by(geo_value) %>%
-#'   epi_slide_mean(cases, before = 6) %>%
+#'   epi_slide_mean(cases, .window_size = 7) %>%
 #'   # Remove a nonessential var. to ensure new col is printed
 #'   dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) %>%
 #'   ungroup()
@@ -628,7 +870,7 @@ epi_slide_opt <- function(x, col_names, f, ..., before = NULL, after = NULL, ref
 #'   group_by(geo_value) %>%
 #'   epi_slide_mean(
 #'     cases,
-#'     before = 6,
+#'     .window_size = 7,
 #'     # `frollmean` options
 #'     na.rm = TRUE, algo = "exact", hasNA = TRUE
 #'   ) %>%
@@ -638,41 +880,73 @@ epi_slide_opt <- function(x, col_names, f, ..., before = NULL, after = NULL, ref
 #' # slide a 7-day leading average
 #' jhu_csse_daily_subset %>%
 #'   group_by(geo_value) %>%
-#'   epi_slide_mean(cases, after = 6) %>%
+#'   epi_slide_mean(cases, .window_size = 7, .align = "left") %>%
 #'   # Remove a nonessential var. to ensure new col is printed
 #'   dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) %>%
 #'   ungroup()
 #'
-#' # slide a 7-day centre-aligned average
+#' # slide a 7-day center-aligned average
 #' jhu_csse_daily_subset %>%
 #'   group_by(geo_value) %>%
-#'   epi_slide_mean(cases, before = 3, after = 3) %>%
+#'   epi_slide_mean(cases, .window_size = 7, .align = "center") %>%
 #'   # Remove a nonessential var. to ensure new col is printed
 #'   dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) %>%
 #'   ungroup()
 #'
-#' # slide a 14-day centre-aligned average
+#' # slide a 14-day center-aligned average
 #' jhu_csse_daily_subset %>%
 #'   group_by(geo_value) %>%
-#'   epi_slide_mean(cases, before = 6, after = 7) %>%
+#'   epi_slide_mean(cases, .window_size = 14, .align = "center") %>%
 #'   # Remove a nonessential var. to ensure new col is printed
 #'   dplyr::select(geo_value, time_value, cases, cases_14dav = slide_value_cases) %>%
 #'   ungroup()
-epi_slide_mean <- function(x, col_names, ..., before = NULL, after = NULL, ref_time_values = NULL,
-                           new_col_name = NULL, as_list_col = NULL,
-                           names_sep = NULL, all_rows = FALSE) {
+epi_slide_mean <- function(
+    .x, .col_names, ...,
+    .window_size = 1, .align = c("right", "center", "left"),
+    .ref_time_values = NULL, .all_rows = FALSE) {
+  # Deprecated argument handling
+  provided_args <- rlang::call_args_names(rlang::call_match())
+  if (any(purrr::map_lgl(provided_args, ~ .x %in% c("x", "col_names", "f", "ref_time_values", "all_rows")))) {
+    cli::cli_abort(
+      "epi_slide_mean: you are using one of the following old argument names: `x`, `col_names`, `f`, `ref_time_values`,
+      or `all_rows`. Please use the new dot-prefixed names: `.x`, `.col_names`, `.f`,
+      `.ref_time_values`, `.all_rows`."
+    )
+  }
+  if ("as_list_col" %in% provided_args) {
+    cli::cli_abort(
+      "epi_slide_mean: the argument `as_list_col` is deprecated. If FALSE, you can just remove it.
+      If TRUE, have your given computation wrap its result using `list(result)` instead."
+    )
+  }
+  if ("before" %in% provided_args || "after" %in% provided_args) {
+    cli::cli_abort(
+      "epi_slide_mean: `before` and `after` are deprecated for `epi_slide`.
Use `.window_size` and `.align` instead. + See the slide documentation for more details." + ) + } + if ("new_col_name" %in% provided_args || ".new_col_name" %in% provided_args) { + cli::cli_abort( + "epi_slide_mean: the argument `new_col_name` is not supported. If you want to customize + the output column names, use `dplyr::rename` after the slide." + ) + } + if ("names_sep" %in% provided_args || ".names_sep" %in% provided_args) { + cli::cli_abort( + "epi_slide_mean: the argument `names_sep` is not supported. If you want to customize + the output column names, use `dplyr::rename` after the slide." + ) + } + epi_slide_opt( - x = x, - col_names = {{ col_names }}, - f = data.table::frollmean, + .x = .x, + .col_names = {{ .col_names }}, + .f = data.table::frollmean, ..., - before = before, - after = after, - ref_time_values = ref_time_values, - new_col_name = new_col_name, - as_list_col = as_list_col, - names_sep = names_sep, - all_rows = all_rows + .window_size = .window_size, + .align = .align, + .ref_time_values = .ref_time_values, + .all_rows = .all_rows ) } @@ -682,14 +956,14 @@ epi_slide_mean <- function(x, col_names, ..., before = NULL, after = NULL, ref_t #' vignette](https://cmu-delphi.github.io/epiprocess/articles/slide.html) for #' examples. #' -#' Wrapper around `epi_slide_opt` with `f = datatable::frollsum`. +#' Wrapper around `epi_slide_opt` with `.f = datatable::frollsum`. #' #' @template basic-slide-params #' @template opt-slide-params -#' @param ... Additional arguments to pass to `data.table::frollsum`, for -#' example, `na.rm` and `algo`. `data.table::frollsum` is automatically -#' passed the data `x` to operate on, the window size `n`, and the alignment -#' `align`. Providing these args via `...` will cause an error. +#' @param ... Additional arguments to pass to the slide computation `.f`, for +#' example, `algo` or `na.rm` in data.table functions. You don't need to +#' specify `.x`, `.window_size`, or `.align` (or `before`/`after` for slider +#' functions). #' #' @template opt-slide-details #' @@ -699,25 +973,56 @@ epi_slide_mean <- function(x, col_names, ..., before = NULL, after = NULL, ref_t #' # slide a 7-day trailing sum formula on cases #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide_sum(cases, before = 6) %>% +#' epi_slide_sum(cases, .window_size = 7) %>% #' # Remove a nonessential var. to ensure new col is printed #' dplyr::select(geo_value, time_value, cases, cases_7dsum = slide_value_cases) %>% #' ungroup() -epi_slide_sum <- function(x, col_names, ..., before = NULL, after = NULL, ref_time_values = NULL, - new_col_name = NULL, as_list_col = NULL, - names_sep = NULL, all_rows = FALSE) { +epi_slide_sum <- function( + .x, .col_names, ..., + .window_size = 1, .align = c("right", "center", "left"), + .ref_time_values = NULL, .all_rows = FALSE) { + # Deprecated argument handling + provided_args <- rlang::call_args_names(rlang::call_match()) + if (any(purrr::map_lgl(provided_args, ~ .x %in% c("x", "col_names", "f", "ref_time_values", "all_rows")))) { + cli::cli_abort( + "epi_slide_sum: you are using one of the following old argument names: `x`, `col_names`, `f`, `ref_time_values`, + or `all_rows`. Please use the new dot-prefixed names: `.x`, `.col_names`, `.f`, + `.ref_time_values`, `.all_rows`." + ) + } + if ("as_list_col" %in% provided_args) { + cli::cli_abort( + "epi_slide_sum: the argument `as_list_col` is deprecated. If FALSE, you can just remove it. + If TRUE, have your given computation wrap its result using `list(result)` instead." 
+ ) + } + if ("before" %in% provided_args || "after" %in% provided_args) { + cli::cli_abort( + "epi_slide_sum: `before` and `after` are deprecated for `epi_slide`. Use `.window_size` and `.align` instead. + See the slide documentation for more details." + ) + } + if ("new_col_name" %in% provided_args || ".new_col_name" %in% provided_args) { + cli::cli_abort( + "epi_slide_sum: the argument `new_col_name` is not supported. If you want to customize + the output column names, use `dplyr::rename` after the slide." + ) + } + if ("names_sep" %in% provided_args || ".names_sep" %in% provided_args) { + cli::cli_abort( + "epi_slide_sum: the argument `names_sep` is not supported. If you want to customize + the output column names, use `dplyr::rename` after the slide." + ) + } epi_slide_opt( - x = x, - col_names = {{ col_names }}, - f = data.table::frollsum, + .x = .x, + .col_names = {{ .col_names }}, + .f = data.table::frollsum, ..., - before = before, - after = after, - ref_time_values = ref_time_values, - new_col_name = new_col_name, - as_list_col = as_list_col, - names_sep = names_sep, - all_rows = all_rows + .window_size = .window_size, + .align = .align, + .ref_time_values = .ref_time_values, + .all_rows = .all_rows ) } @@ -746,7 +1051,7 @@ full_date_seq <- function(x, before, after, time_type) { if (time_type %in% c("yearmonth", "integer")) { all_dates <- seq(min(x$time_value), max(x$time_value), by = 1L) - if (before != 0) { + if (before != 0 && before != Inf) { pad_early_dates <- all_dates[1L] - before:1 } if (after != 0) { @@ -759,7 +1064,7 @@ full_date_seq <- function(x, before, after, time_type) { ) all_dates <- seq(min(x$time_value), max(x$time_value), by = by) - if (before != 0) { + if (before != 0 && before != Inf) { # The behavior is analogous to the branch with tsibble types above. For # more detail, note that the function `seq.Date(from, ..., length.out = # n)` returns `from + 0:n`. Since we want `from + 1:n`, we drop the first diff --git a/R/sysdata.rda b/R/sysdata.rda index d100711d..8e8dc5ff 100644 Binary files a/R/sysdata.rda and b/R/sysdata.rda differ diff --git a/R/utils.R b/R/utils.R index 8c1c622f..bb30264a 100644 --- a/R/utils.R +++ b/R/utils.R @@ -22,7 +22,7 @@ #' `initial` is long or the printing width is very narrow. #' @return `chr`; to print, use [`base::writeLines`]. #' -#' @noRd +#' @keywords internal wrap_symbolics <- function(symbolics, initial = "", common_prefix = "", none_str = "", width = getOption("width", 80L)) { @@ -69,7 +69,7 @@ wrap_symbolics <- function(symbolics, #' @inheritParams wrap_symbolics #' @return `chr`; to print, use [`base::writeLines`]. #' -#' @noRd +#' @keywords internal wrap_varnames <- function(nms, initial = "", common_prefix = "", none_str = "", width = getOption("width", 80L)) { @@ -84,28 +84,109 @@ wrap_varnames <- function(nms, #' @param lines `chr` #' @return string #' -#' @noRd +#' @keywords internal paste_lines <- function(lines) { paste(paste0(lines, "\n"), collapse = "") } +#' Format a class vector as a string via deparsing it +#' +#' @param class_vec `chr`; output of `class(object)` for some `object` +#' @return string +#' @keywords internal +format_class_vec <- function(class_vec) { + paste(collapse = "", deparse(class_vec)) +} + +#' Format a character vector as a string via deparsing/quoting each +#' +#' @param x `chr`; e.g., `colnames` of some data frame +#' @param empty string; what should be output if `x` is of length 0? 
+#' @return string +#' @keywords internal +format_chr_with_quotes <- function(x, empty = "*none*") { + if (length(x) == 0L) { + empty + } else { + # Deparse to get quoted + escape-sequenced versions of varnames; collapse to + # single line (assuming no newlines in `x`). Though if we hand this to cli + # it may insert them (even in middle of quotes) while wrapping lines. + deparsed_collapsed <- paste(collapse = "", deparse(x)) + if (length(x) == 1L) { + deparsed_collapsed + } else { + # remove surrounding `c()`: + substr(deparsed_collapsed, 3L, nchar(deparsed_collapsed) - 1L) + } + } +} + +#' "Format" a character vector of column/variable names for cli interpolation +#' +#' Designed to give good output if interpolated with cli. Main purpose is to add +#' backticks around variable names when necessary, and something other than an +#' empty string if length 0. +#' +#' @param x `chr`; e.g., `colnames` of some data frame +#' @param empty string; what should be output if `x` is of length 0? +#' @return `chr` +#' @keywords internal +format_varnames <- function(x, empty = "*none*") { + if (length(x) == 0L) { + empty + } else { + as.character(syms(x)) + } +} + +#' "Format" column/variable name for cli interpolation +#' +#' Designed to give good output if interpolated with cli. Main purpose is to add +#' backticks around variable names when necessary. +#' +#' @param x string; e.g., a colname +#' @return string +#' @keywords internal +format_varname <- function(x) { + # `syms` provides backticks if necessary; `sym` does not + as.character(syms(x)) +} + +#' Format a tibble row as chr +#' +#' @param x a tibble with a single row +#' @return `chr` with one entry per column, of form " = " +#' @keywords internal +format_tibble_row <- function(x, empty = "*none*") { + if (length(x) == 0L) { + empty + } else { + formatted_names <- as.character(syms(names(x))) + formatted_values <- purrr::map_chr(x, function(binding_value) { + paste(collapse = "\n", format(binding_value)) + }) + formatted_x <- paste(formatted_names, "=", formatted_values) + formatted_x + } +} #' Assert that a sliding computation function takes enough args #' #' @param f Function; specifies a computation to slide over an `epi_df` or -#' `epi_archive` in `epi_slide` or `epix_slide`. +#' `epi_archive` in `epi_slide` or `epix_slide`. #' @param ... Dots that will be forwarded to `f` from the dots of `epi_slide` or #' `epix_slide`. +#' @template ref-time-value-label #' #' @importFrom rlang is_missing #' @importFrom purrr map_lgl #' @importFrom utils tail #' #' @noRd -assert_sufficient_f_args <- function(f, ...) { - mandatory_f_args_labels <- c("window data", "group key", "reference time value") +assert_sufficient_f_args <- function(.f, ..., .ref_time_value_label) { + mandatory_f_args_labels <- c("window data", "group key", .ref_time_value_label) n_mandatory_f_args <- length(mandatory_f_args_labels) - args <- formals(args(f)) + args <- formals(args(.f)) args_names <- names(args) # Remove named arguments forwarded from `epi[x]_slide`'s `...`: forwarded_dots_names <- names(rlang::call_match(dots_expand = FALSE)[["..."]]) @@ -119,7 +200,7 @@ assert_sufficient_f_args <- function(f, ...) 
{ dots_i <- which(remaining_args_names == "...") # integer(0) if no match n_f_args_before_dots <- dots_i - 1L if (length(dots_i) != 0L) { - # `f` has a dots "arg" + # `.f` has a dots "arg" # Keep all arg names before `...` mandatory_args_mapped_names <- remaining_args_names[seq_len(n_f_args_before_dots)] # nolint: object_usage_linter @@ -128,40 +209,40 @@ assert_sufficient_f_args <- function(f, ...) { tail(mandatory_f_args_labels, n_mandatory_f_args - n_f_args_before_dots) cli::cli_warn( - "`f` might not have enough positional arguments before its `...`; in + "`.f` might not have enough positional arguments before its `...`; in the current `epi[x]_slide` call, the {mandatory_f_args_in_f_dots} will - be included in `f`'s `...`; if `f` doesn't expect those arguments, it + be included in `.f`'s `...`; if `.f` doesn't expect those arguments, it may produce confusing error messages", class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots", - epiprocess__f = f, + epiprocess__f = .f, epiprocess__mandatory_f_args_in_f_dots = mandatory_f_args_in_f_dots ) } - } else { # `f` doesn't have a dots "arg" + } else { # `.f` doesn't have a dots "arg" if (length(args_names) < n_mandatory_f_args + rlang::dots_n(...)) { - # `f` doesn't take enough args. + # `.f` doesn't take enough args. if (rlang::dots_n(...) == 0L) { # common case; try for friendlier error message - cli_abort("`f` must take at least {n_mandatory_f_args} arguments", + cli_abort("`.f` must take at least {n_mandatory_f_args} arguments", class = "epiprocess__assert_sufficient_f_args__f_needs_min_args", - epiprocess__f = f + epiprocess__f = .f ) } else { # less common; highlight that they are (accidentally?) using dots forwarding cli_abort( - "`f` must take at least {n_mandatory_f_args} arguments plus the + "`.f` must take at least {n_mandatory_f_args} arguments plus the {rlang::dots_n(...)} arguments forwarded through `epi[x]_slide`'s `...`, or a named argument to `epi[x]_slide` was misspelled", class = "epiprocess__assert_sufficient_f_args__f_needs_min_args_plus_forwarded", - epiprocess__f = f + epiprocess__f = .f ) } } } # Check for args with defaults that are filled with mandatory positional - # calling args. If `f` has fewer than n_mandatory_f_args before `...`, then we + # calling args. If `.f` has fewer than n_mandatory_f_args before `...`, then we # only need to check those args for defaults. Note that `n_f_args_before_dots` is - # length 0 if `f` doesn't accept `...`. + # length 0 if `.f` doesn't accept `...`. n_remaining_args_for_default_check <- min(c(n_f_args_before_dots, n_mandatory_f_args)) default_check_args <- remaining_args[seq_len(n_remaining_args_for_default_check)] default_check_args_names <- names(default_check_args) @@ -169,18 +250,18 @@ assert_sufficient_f_args <- function(f, ...) 
{ if (any(has_default_replaced_by_mandatory)) { default_check_mandatory_args_labels <- mandatory_f_args_labels[seq_len(n_remaining_args_for_default_check)] - # ^ excludes any mandatory args absorbed by f's `...`'s: + # ^ excludes any mandatory args absorbed by .f's `...`'s: mandatory_args_replacing_defaults <- default_check_mandatory_args_labels[has_default_replaced_by_mandatory] # nolint: object_usage_linter args_with_default_replaced_by_mandatory <- rlang::syms(default_check_args_names[has_default_replaced_by_mandatory]) # nolint: object_usage_linter cli::cli_abort( "`epi[x]_slide` would pass the {mandatory_args_replacing_defaults} to - `f`'s {args_with_default_replaced_by_mandatory} argument{?s}, which - {?has a/have} default value{?s}; we suspect that `f` doesn't expect + `.f`'s {args_with_default_replaced_by_mandatory} argument{?s}, which + {?has a/have} default value{?s}; we suspect that `.f` doesn't expect {?this arg/these args} at all and may produce confusing error messages. - Please add additional arguments to `f` or remove defaults as + Please add additional arguments to `.f` or remove defaults as appropriate.", class = "epiprocess__assert_sufficient_f_args__required_args_contain_defaults", - epiprocess__f = f + epiprocess__f = .f ) } } @@ -265,101 +346,197 @@ assert_sufficient_f_args <- function(f, ...) { #' @param ... Additional arguments to pass to the function or formula #' specified via `x`. If `x` is a quosure, any arguments passed via `...` #' will be ignored. -#' @examples -#' f <- as_slide_computation(~ .x + 1) -#' f(10) #' -#' g <- as_slide_computation(~ -1 * .) -#' g(4) +#' @param .ref_time_value_long_varnames `r lifecycle::badge("experimental")` +#' Character vector. What variable names should we allow formulas and +#' data-masking tidy evaluation to use to refer to `ref_time_value` for the +#' computation (in addition to `.z` in formulas)? E.g., `".ref_time_value"` or +#' `c(".ref_time_value", ".version")`. #' -#' h <- as_slide_computation(~ .x - .group_key) -#' h(6, 3) +#' @template ref-time-value-label #' #' @importFrom rlang is_function new_function f_env is_environment missing_arg #' f_rhs is_formula caller_arg caller_env -#' -#' @noRd -as_slide_computation <- function(f, ...) { - arg <- caller_arg(f) +#' @keywords internal +as_slide_computation <- function(.f, ..., .ref_time_value_long_varnames, .ref_time_value_label) { + arg <- caller_arg(.f) call <- caller_env() - # A quosure is a type of formula, so be careful with the order and contents - # of the conditional logic here. - if (is_quosure(f)) { + if (rlang::is_quosures(.f)) { + quosures <- rlang::quos_auto_name(.f) # resolves := among other things + nms <- names(quosures) + manually_named <- rlang::names2(.f) != "" | vapply(.f, function(quosure) { + expression <- rlang::quo_get_expr(quosure) + is.call(expression) && expression[[1L]] == rlang::sym(":=") + }, FUN.VALUE = logical(1L)) fn <- function(.x, .group_key, .ref_time_value) { - # Convert to environment to standardize between tibble and R6 - # based inputs. In both cases, we should get a simple - # environment with the empty environment as its parent. 
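A few hypothetical calls sketching the arity rules enforced above: a slide computation must accept the window data, the group key, and the reference time value, or absorb them via `...`.

edf %>% epi_slide(.f = function(x) mean(x$cases), .window_size = 7)
#> Error: `.f` must take at least 3 arguments
edf %>% epi_slide(.f = function(x, ...) mean(x$cases), .window_size = 7)
#> Warning: `.f` might not have enough positional arguments before its `...` ...
edf %>% epi_slide(.f = function(x, gk, rtv) mean(x$cases), .window_size = 7)
#> ok: three positional arguments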
- data_env <- rlang::as_environment(.x) - data_mask <- rlang::new_data_mask(bottom = data_env, top = data_env) + x_as_env <- rlang::as_environment(.x) + results_env <- new.env(parent = x_as_env) + data_mask <- rlang::new_data_mask(bottom = results_env, top = x_as_env) data_mask$.data <- rlang::as_data_pronoun(data_mask) # We'll also install `.x` directly, not as an `rlang_data_pronoun`, so - # that we can, e.g., use more dplyr and epiprocess operations. + # that we can, e.g., use more dplyr and epiprocess operations. It won't be + # (and doesn't make sense nrow-wise to be) updated with results as we loop + # through the quosures. data_mask$.x <- .x data_mask$.group_key <- .group_key - data_mask$.ref_time_value <- .ref_time_value - rlang::eval_tidy(f, data_mask) + for (ref_time_value_long_varname in .ref_time_value_long_varnames) { + data_mask[[ref_time_value_long_varname]] <- .ref_time_value + } + common_size <- NULL + # The data mask is an environment; it doesn't track the binding order. + # We'll track that separately. For efficiency, we'll use `c` to add to + # this order, and deal with binding redefinitions at the end. We'll + # reflect deletions immediately (current implementation of `new_tibble` + # seems like it would exclude `NULL` bindings for us but `?new_tibble` + # doesn't reflect this behavior). + results_multiorder <- character(0L) + for (quosure_i in seq_along(.f)) { + quosure_result_raw <- rlang::eval_tidy(quosures[[quosure_i]], data_mask) + if (is.null(quosure_result_raw)) { + nm <- nms[[quosure_i]] + results_multiorder <- results_multiorder[results_multiorder != nm] + rlang::env_unbind(results_env, nm) + } else if ( + # vctrs considers data.frames to be vectors, but we still check + # separately for them because certain base operations output data frames + # with rownames, which we will allow (but might drop) + is.data.frame(quosure_result_raw) || + vctrs::obj_is_vector(quosure_result_raw) && is.null(vctrs::vec_names(quosure_result_raw)) + ) { + # We want something like `dplyr_col_modify()` but allowing recycling + # of previous computations and updating `results_env` and unpacking + # tibbles if not manually named. + if (!is.null(common_size)) { + # XXX could improve error messages here + quosure_result_recycled <- vctrs::vec_recycle(quosure_result_raw, common_size) + } else { + quosure_result_recycled <- quosure_result_raw + quosure_result_size <- vctrs::vec_size(quosure_result_raw) + if (quosure_result_size != 1L) { + common_size <- quosure_result_size + for (previous_result_nm in names(results_env)) { + results_env[[previous_result_nm]] <- vctrs::vec_recycle(results_env[[previous_result_nm]], common_size) + } + } # else `common_size` remains NULL + } + if (inherits(quosure_result_recycled, "data.frame") && !manually_named[[quosure_i]]) { + new_results_multiorder <- names(quosure_result_recycled) + results_multiorder <- c(results_multiorder, new_results_multiorder) + for (new_result_i in seq_along(quosure_result_recycled)) { + results_env[[new_results_multiorder[[new_result_i]]]] <- quosure_result_recycled[[new_result_i]] + } + } else { + nm <- nms[[quosure_i]] + results_multiorder <- c(results_multiorder, nm) + results_env[[nm]] <- quosure_result_recycled + } + } else { + cli_abort(" + Problem with output of {.code + {rlang::expr_deparse(rlang::quo_get_expr(.f[[quosure_i]]))}}; it + produced a result that was neither NULL, a data.frame, nor a vector + without unnamed entries (as determined by the vctrs package). 
+ ", class = "epiprocess__invalid_slide_comp_tidyeval_output") + } + } + # If a binding was defined and redefined, we may have duplications within + # `results_multiorder`. `unique(results_multiorder, fromLast = TRUE)` is + # actually quite slow, so we'll keep the duplicates (--> duplicate result + # columns) and leave it to various `mutate` in epi[x]_slide to resolve + # this to the appropriate placement: + validate_tibble(new_tibble(as.list(results_env, all.names = TRUE)[results_multiorder])) } return(fn) } - if (is_function(f)) { - # Check that `f` takes enough args - assert_sufficient_f_args(f, ...) - return(f) + if (is_function(.f)) { + # Check that `.f` takes enough args + assert_sufficient_f_args(.f, ..., .ref_time_value_label = .ref_time_value_label) + return(.f) } - if (is_formula(f)) { - if (length(f) > 2) { + if (is_formula(.f)) { + if (is_quosure(.f)) { + cli_abort("`.f` argument to `as_slide_computation()` cannot be a `quosure`; it should probably be a `quosures`. This is likely an internal bug in `{{epiprocess}}`.") # nolint: line_length_linter + } + + if (length(.f) > 2) { cli_abort("{.code {arg}} must be a one-sided formula", class = "epiprocess__as_slide_computation__formula_is_twosided", - epiprocess__f = f, + epiprocess__f = .f, call = call ) } if (rlang::dots_n(...) > 0L) { cli_abort( "No arguments can be passed via `...` when `f` is a formula, or there - are unrecognized/misspelled parameter names.", + are unrecognized/misspelled parameter names, or there is a trailing + comma in the `epi[x]_slide()` call.", class = "epiprocess__as_slide_computation__formula_with_dots", - epiprocess__f = f, + epiprocess__f = .f, epiprocess__enquos_dots = enquos(...) ) } - env <- f_env(f) + env <- f_env(.f) if (!is_environment(env)) { cli_abort("Formula must carry an environment.", class = "epiprocess__as_slide_computation__formula_has_no_env", - epiprocess__f = f, + epiprocess__f = .f, epiprocess__f_env = env, arg = arg, call = call ) } - args <- list( - ... = missing_arg(), - .x = quote(..1), .y = quote(..2), .z = quote(..3), - . = quote(..1), .group_key = quote(..2), .ref_time_value = quote(..3) + args <- c( + list( + ... = missing_arg(), + .x = quote(..1), .y = quote(..2), .z = quote(..3), + . = quote(..1), .group_key = quote(..2) + ), + `names<-`( + rep(list(quote(..3)), length(.ref_time_value_long_varnames)), + .ref_time_value_long_varnames + ) ) - fn <- new_function(args, f_rhs(f), env) - fn <- structure(fn, class = c("epiprocess_slide_computation", "function")) + fn <- new_function(args, f_rhs(.f), env) + fn <- structure(fn, class = c("epiprocess_formula_slide_computation", "function")) return(fn) } cli_abort( - "Can't convert an object of class {paste(collapse = ' ', deparse(class(f)))} + "Can't convert an object of class {format_class_vec(class(.f))} to a slide computation", class = "epiprocess__as_slide_computation__cant_convert_catchall", - epiprocess__f = f, - epiprocess__f_class = class(f), + epiprocess__f = .f, + epiprocess__f_class = class(.f), arg = arg, call = call ) } +#' @rdname as_slide_computation +#' @keywords internal +as_time_slide_computation <- function(.f, ...) { + as_slide_computation( + .f, ..., + .ref_time_value_long_varnames = ".ref_time_value", + .ref_time_value_label = "reference time value" + ) +} + +#' @rdname as_slide_computation +#' @keywords internal +as_diagonal_slide_computation <- function(.f, ...) 
{ + as_slide_computation( + .f, ..., + .ref_time_value_long_varnames = c(".version", ".ref_time_value"), + .ref_time_value_label = "version" + ) +} guess_geo_type <- function(geo_value) { if (is.character(geo_value)) { @@ -403,11 +580,13 @@ guess_time_type <- function(time_value, time_value_arg = rlang::caller_arg(time_ if (inherits(time_value, "Date")) { unique_time_gaps <- as.numeric(diff(sort(unique(time_value)))) # Gaps in a weekly date sequence will cause some diffs to be larger than 7 - # days, so check modulo 7 equality, rather than equality with 7. - if (all(unique_time_gaps %% 7 == 0)) { + # days, so check modulo 7 equality, rather than equality with 7. The length + # check is there so that we don't classify epi_df with a single data point + # per geo as "week". + if (all(unique_time_gaps %% 7 == 0) && length(unique_time_gaps) > 0) { return("week") } - if (all(unique_time_gaps >= 28)) { + if (all(unique_time_gaps >= 28) && length(unique_time_gaps) > 0) { cli_abort( "Found a monthly or longer cadence in the time column `{time_value_arg}`. Consider using tsibble::yearmonth for monthly data and 'YYYY' integers for year data." @@ -803,40 +982,52 @@ guess_period.POSIXt <- function(time_values, time_values_arg = rlang::caller_arg as.numeric(NextMethod(), units = "secs") } - -validate_slide_window_arg <- function(arg, time_type, arg_name = rlang::caller_arg(arg)) { - if (is.null(arg)) { - cli_abort("`{arg_name}` is a required argument.") - } - - if (!checkmate::test_scalar(arg)) { - cli_abort("Expected `{arg_name}` to be a scalar value.") +validate_slide_window_arg <- function(arg, time_type, lower = 1, allow_inf = TRUE, arg_name = rlang::caller_arg(arg)) { + if (!checkmate::test_scalar(arg) || arg < lower) { + cli_abort( + "Slide function expected `{arg_name}` to be a non-null, scalar integer >= {lower}.", + class = "epiprocess__validate_slide_window_arg" + ) } if (time_type == "custom") { - cli_abort("Unsure how to interpret slide units with a custom time type. Consider converting your time - column to a Date, yearmonth, or integer type.") + cli_abort( + "Unsure how to interpret slide units with a custom time type. 
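A minimal sketch of how the rewritten validation behaves, based only on the branches above (`validate_slide_window_arg()` is internal to epiprocess, so these calls are purely illustrative):

validate_slide_window_arg(7, "day")                                # passes: integer, interpreted as days
validate_slide_window_arg(as.difftime(2, units = "weeks"), "week") # passes: weekly difftime
validate_slide_window_arg(Inf, "day")                              # passes: `allow_inf` defaults to TRUE
try(validate_slide_window_arg(Inf, "day", allow_inf = FALSE))      # errors
try(validate_slide_window_arg(7, "week"))                          # errors: weekly cadence needs a difftime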
diff --git a/_pkgdown.yml b/_pkgdown.yml
index b95a6386..1bc7f795 100644
--- a/_pkgdown.yml
+++ b/_pkgdown.yml
@@ -48,7 +48,6 @@ articles:
   - aggregation
   - outliers
   - archive
-  - advanced
   - compactify

 repo:
@@ -83,6 +82,7 @@ reference:
   desc: Details on `epi_archive`, and basic functionality.
 - contents:
   - matches("archive")
+  - revision_summary
 - title: "`epix_*()` functions"
   desc: Functions that act on an `epi_archive` and/or `grouped_epi_archive` object.
 - contents:
diff --git a/data/incidence_num_outlier_example.rda b/data/incidence_num_outlier_example.rda
index e898b5ea..96288982 100644
Binary files a/data/incidence_num_outlier_example.rda and b/data/incidence_num_outlier_example.rda differ
diff --git a/data/jhu_csse_county_level_subset.rda b/data/jhu_csse_county_level_subset.rda
index aca0983d..bc31b493 100644
Binary files a/data/jhu_csse_county_level_subset.rda and b/data/jhu_csse_county_level_subset.rda differ
diff --git a/data/jhu_csse_daily_subset.rda b/data/jhu_csse_daily_subset.rda
index 12fd5f15..e4dbdc9f 100644
Binary files a/data/jhu_csse_daily_subset.rda and b/data/jhu_csse_daily_subset.rda differ
diff --git a/man-roxygen/basic-slide-details.R b/man-roxygen/basic-slide-details.R
index f8f6792d..df87f882 100644
--- a/man-roxygen/basic-slide-details.R
+++ b/man-roxygen/basic-slide-details.R
@@ -1,34 +1,69 @@
 #' @details To "slide" means to apply a function or formula over a rolling
-#' window of time steps for each data group, where the window is centered at a
-#' reference time and left and right endpoints are given by the `before` and
-#' `after` arguments.
-#'
-#' If there are not enough time steps available to complete the window at any
-#' given reference time, then `epi_slide()` still attempts to perform the
-#' computation anyway (it does not require a complete window). The issue of
-#' what to do with partial computations (those run on incomplete windows) is
-#' therefore left up to the user, either through the specified function or
-#' formula `f`, or through post-processing. For a centrally-aligned slide of
-#' `n` `time_value`s in a sliding window, set `before = (n-1)/2` and `after =
-#' (n-1)/2` when the number of `time_value`s in a sliding window is odd and
-#' `before = n/2-1` and `after = n/2` when `n` is even.
-#'
-#' Sometimes, we want to experiment with various trailing or leading window
-#' widths and compare the slide outputs. In the (uncommon) case where
-#' zero-width windows are considered, manually pass both the `before` and
-#' `after` arguments.
-#'
-#' If `f` is missing, then an expression for tidy evaluation can be specified,
-#' for example, as in:
+#' window. The `.window_size` arg determines the width of the window
+#' (including the reference time) and the `.align` arg governs how the window
+#' is aligned (see below for examples). The `.ref_time_values` arg controls
+#' which time values to consider for the slide and `.all_rows` allows you to
+#' keep NAs around.
+#'
+#' `epi_slide()` does not require a complete window (such as on the left
+#' boundary of the dataset) and will attempt to perform the computation
+#' anyway. The issue of what to do with partial computations (those run on
+#' incomplete windows) is therefore left up to the user, either through the
+#' specified function or formula, or through post-processing.
+#'
+#' Let's look at some window examples, assuming that the reference time value
+#' is "tv". With .align = "right" and .window_size = 3, the window will be:
+#'
+#' time_values: tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3
+#' window: tv - 2, tv - 1, tv
+#'
+#' With .align = "center" and .window_size = 3, the window will be:
+#'
+#' time_values: tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3
+#' window: tv - 1, tv, tv + 1
+#'
+#' With .align = "center" and .window_size = 4, the window will be:
+#'
+#' time_values: tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3
+#' window: tv - 2, tv - 1, tv, tv + 1
+#'
+#' With .align = "left" and .window_size = 3, the window will be:
+#'
+#' time_values: tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3
+#' window: tv, tv + 1, tv + 2
+#'
+#' If `.f` is missing, then ["data-masking"][rlang::args_data_masking]
+#' expression(s) for tidy evaluation can be specified, for example, as in:
 #' ```
-#' epi_slide(x, cases_7dav = mean(cases), before = 6)
+#' epi_slide(x, cases_7dav = mean(cases), .window_size = 7)
 #' ```
 #' which would be equivalent to:
 #' ```
-#' epi_slide(x, function(x, g) mean(x$cases), before = 6,
-#'   new_col_name = "cases_7dav")
+#' epi_slide(x, function(x, g, t) mean(x$cases), .window_size = 7,
+#'   .new_col_name = "cases_7dav")
 #' ```
-#' Thus, to be clear, when the computation is specified via an expression for
-#' tidy evaluation (first example, above), then the name for the new column is
-#' inferred from the given expression and overrides any name passed explicitly
-#' through the `new_col_name` argument.
+#' In a manner similar to [`dplyr::mutate`]:
+#' * Expressions evaluating to length-1 vectors will be recycled to
+#' appropriate lengths.
+#' * `, name_var := value` can be used to set the output column name based on
+#' a variable `name_var` rather than requiring you to use a hard-coded
+#' name. (The leading comma is needed to make sure that `.f` is treated as
+#' missing.)
+#' * `= NULL` can be used to remove results from previous expressions (though
+#' we don't allow it to remove pre-existing columns).
+#' * `, fn_returning_a_data_frame(.x)` will unpack the output of the function
+#' into multiple columns in the result.
+#' * Named expressions evaluating to data frames will be placed into
+#' [`tidyr::pack`]ed columns.
+#'
+#' In addition to [`.data`] and [`.env`], we make some additional
+#' "pronoun"-like bindings available:
+#' * .x, which is like `.x` in [`dplyr::group_modify`]; an ordinary object
+#' like an `epi_df` rather than an rlang [pronoun][rlang::as_data_pronoun]
+#' like [`.data`]; this allows you to use additional `dplyr`, `tidyr`, and
+#' `epiprocess` operations. If you have multiple expressions in `...`, this
+#' won't let you refer to the output of the earlier expressions, but `.data`
+#' will.
+#' * .group_key, which is like `.y` in [`dplyr::group_modify`].
+#' * .ref_time_value, which is the element of `.ref_time_values` that
+#' determined the time window for the current computation.
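A minimal sketch of the windowing and tidy-evaluation behavior documented above, assuming a hypothetical daily, grouped `epi_df` named `edf` with a `cases` column:

edf %>%
  group_by(geo_value) %>%
  epi_slide(
    cases_7dav = mean(cases),             # length-1 result, recycled as needed
    cases_7dav_2x = .data$cases_7dav * 2, # later expressions reach earlier results via `.data`
    .window_size = 7, .align = "right"    # window at tv: tv - 6, ..., tv
  )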
diff --git a/man-roxygen/basic-slide-params.R b/man-roxygen/basic-slide-params.R
index 7e169af6..8a63a817 100644
--- a/man-roxygen/basic-slide-params.R
+++ b/man-roxygen/basic-slide-params.R
@@ -1,46 +1,35 @@
-#' @param x The `epi_df` object under consideration, [grouped][dplyr::group_by]
-#' or ungrouped. If ungrouped, all data in `x` will be treated as part of a
+#' @param .x The `epi_df` object under consideration, [grouped][dplyr::group_by]
+#' or ungrouped. If ungrouped, all data in `.x` will be treated as part of a
 #' single data group.
-#' @param before,after How far `before` and `after` each `ref_time_value` should
-#' the sliding window extend? At least one of these two arguments must be
-#' provided; the other's default will be 0. The accepted values for these
-#' depend on the type of the `time_value` column:
+#' @param .window_size The size of the sliding window. By default, this is 1,
+#' meaning that only the current ref_time_value is included. The accepted values
+#' here depend on the `time_value` column:
 #'
-#' - if it is a Date and the cadence is daily, then they can be integers
-#'   (which will be interpreted in units of days) or difftimes with units
-#'   "days"
-#' - if it is a Date and the cadence is weekly, then they must be difftimes
-#'   with units "weeks"
-#' - if it is an integer, then they must be integers
+#' - if time_type is Date and the cadence is daily, then `.window_size` can be
+#'   an integer (which will be interpreted in units of days) or a difftime
+#'   with units "days"
+#' - if time_type is Date and the cadence is weekly, then `.window_size` must
+#'   be a difftime with units "weeks"
+#' - if time_type is an integer, then `.window_size` must be an integer
 #'
-#' Endpoints of the window are inclusive. Common settings:
-#'
-#' - For trailing/right-aligned windows from `ref_time_value - k` to
-#'   `ref_time_value`: either pass `before=k` by itself, or pass `before=k,
-#'   after=0`.
-#' - For center-aligned windows from `ref_time_value - k` to
-#'   `ref_time_value + k`: pass `before=k, after=k`.
-#' - For leading/left-aligned windows from `ref_time_value` to
-#'   `ref_time_value + k`: either pass pass `after=k` by itself,
-#'   or pass `before=0, after=k`.
-#'
-#' See "Details:" on how missing rows are handled within the window.
-#' @param ref_time_values Time values for sliding computations, meaning, each
+#' @param .align The alignment of the sliding window. If `right` (default), then
+#' the window has its end at the reference time; if `center`, then the window is
+#' centered at the reference time; if `left`, then the window has its start at
+#' the reference time. If the alignment is `center` and the window size is odd,
+#' then the window will have floor(window_size/2) points before and after the
+#' reference time. If the window size is even, then the window will be
+#' asymmetric and have one less value on the right side of the reference time
+#' (assuming time increases from left to right).
+#' @param .ref_time_values Time values for sliding computations, meaning, each
 #' element of this vector serves as the reference time point for one sliding
 #' window. If missing, then this will be set to all unique time values in the
 #' underlying data table, by default.
-#' @param names_sep String specifying the separator to use in `tidyr::unnest()`
-#' when `as_list_col = FALSE`. Default is "_". Using `NULL` drops the prefix
-#' from `new_col_name` entirely.
-#' @param all_rows If `all_rows = TRUE`, then all rows of `x` will be kept in
-#' the output even with `ref_time_values` provided, with some type of missing
+#' @param .all_rows If `.all_rows = TRUE`, then all rows of `.x` will be kept in
+#' the output even with `.ref_time_values` provided, with some type of missing
 #' value marker for the slide computation output column(s) for `time_value`s
-#' outside `ref_time_values`; otherwise, there will be one row for each row in
-#' `x` that had a `time_value` in `ref_time_values`. Default is `FALSE`. The
+#' outside `.ref_time_values`; otherwise, there will be one row for each row in
+#' `.x` that had a `time_value` in `.ref_time_values`. Default is `FALSE`. The
 #' missing value marker is the result of `vctrs::vec_cast`ing `NA` to the type
-#' of the slide computation output. If using `as_list_col = TRUE`, note that
-#' the missing marker is a `NULL` entry in the list column; for certain
-#' operations, you might want to replace these `NULL` entries with a different
-#' `NA` marker.
-#' @return An `epi_df` object given by appending one or more new columns to
-#' `x`, named according to the `new_col_name` argument.
+#' of the slide computation output.
+#' @return An `epi_df` object given by appending one or more new columns to `.x`,
+#' named according to the `.new_col_name` argument.
diff --git a/man-roxygen/epi_df-params.R b/man-roxygen/epi_df-params.R
deleted file mode 100644
index bedcb7d4..00000000
--- a/man-roxygen/epi_df-params.R
+++ /dev/null
@@ -1,18 +0,0 @@
-#' @param x A data.frame, [tibble::tibble], or [tsibble::tsibble] to be converted
-#' @param geo_type DEPRECATED Has no effect. Geo value type is inferred from the
-#' location column and set to "custom" if not recognized.
-#' @param time_type DEPRECATED Has no effect. Time value type inferred from the time
-#' column and set to "custom" if not recognized. Unpredictable behavior may result
-#' if the time type is not recognized.
-#' @param as_of Time value representing the time at which the given data were
-#' available. For example, if `as_of` is January 31, 2022, then the `epi_df`
-#' object that is created would represent the most up-to-date version of the
-#' data available as of January 31, 2022. If the `as_of` argument is missing,
-#' then the current day-time will be used.
-#' @param additional_metadata List of additional metadata to attach to the
-#' `epi_df` object. The metadata will have `geo_type`, `time_type`, and
-#' `as_of` fields; named entries from the passed list will be included as
-#' well. If your tibble has additional keys, be sure to specify them as a
-#' character vector in the `other_keys` component of `additional_metadata`.
-#' @param ... Additional arguments passed to methods.
-#' @return An `epi_df` object.
diff --git a/man-roxygen/opt-slide-details.R b/man-roxygen/opt-slide-details.R
index 5e8876d2..a8d93d93 100644
--- a/man-roxygen/opt-slide-details.R
+++ b/man-roxygen/opt-slide-details.R
@@ -1,16 +1,33 @@
-#' @details To "slide" means to apply a function over a rolling window of time
-#' steps for each data group, where the window is centered at a reference time
-#' and left and right endpoints are given by the `before` and `after`
-#' arguments.
-
-#' If there are not enough time steps available to complete the window at any
-#' given reference time, then `epi_slide_*()` will fail; it requires a
-#' complete window to perform the computation. For a centrally-aligned slide
-#' of `n` `time_value`s in a sliding window, set `before = (n-1)/2` and `after
-#' = (n-1)/2` when the number of `time_value`s in a sliding window is odd and
-#' `before = n/2-1` and `after = n/2` when `n` is even.
-#'
-#' Sometimes, we want to experiment with various trailing or leading window
-#' widths and compare the slide outputs. In the (uncommon) case where
-#' zero-width windows are considered, manually pass both the `before` and
-#' `after` arguments.
+#' @details To "slide" means to apply a function or formula over a rolling
+#' window. The `.window_size` arg determines the width of the window
+#' (including the reference time) and the `.align` arg governs how the window
+#' is aligned (see below for examples). The `.ref_time_values` arg controls
+#' which time values to consider for the slide and `.all_rows` allows you to
+#' keep NAs around.
+#'
+#' `epi_slide_*()` does not require a complete window (such as on the left
+#' boundary of the dataset) and will attempt to perform the computation
+#' anyway. The issue of what to do with partial computations (those run on
+#' incomplete windows) is therefore left up to the user, either through the
+#' specified function or formula `.f`, or through post-processing.
+#'
+#' Let's look at some window examples, assuming that the reference time value
+#' is "tv". With .align = "right" and .window_size = 3, the window will be:
+#'
+#' time_values: tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3
+#' window: tv - 2, tv - 1, tv
+#'
+#' With .align = "center" and .window_size = 3, the window will be:
+#'
+#' time_values: tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3
+#' window: tv - 1, tv, tv + 1
+#'
+#' With .align = "center" and .window_size = 4, the window will be:
+#'
+#' time_values: tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3
+#' window: tv - 2, tv - 1, tv, tv + 1
+#'
+#' With .align = "left" and .window_size = 3, the window will be:
+#'
+#' time_values: tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3
+#' window: tv, tv + 1, tv + 2
diff --git a/man-roxygen/opt-slide-params.R b/man-roxygen/opt-slide-params.R
index d13921b2..ba4b4877 100644
--- a/man-roxygen/opt-slide-params.R
+++ b/man-roxygen/opt-slide-params.R
@@ -1,16 +1,10 @@
-#' @param col_names <[`tidy-select`][dplyr_tidy_select]> An unquoted column
-#' name(e.g., `cases`), multiple column names (e.g., `c(cases, deaths)`), or
-#' [other tidy-select expression][tidyselect::language]. Variable names can
-#' be used as if they were positions in the data frame, so expressions like
-#' `x:y` can be used to select a range of variables. If you have the desired
-#' column names stored in a vector `vars`, use `col_names = all_of(vars)`.
+#' @param .col_names <[`tidy-select`][dplyr_tidy_select]> An unquoted column
+#' name (e.g., `cases`), multiple column names (e.g., `c(cases, deaths)`),
+#' [other tidy-select expression][tidyselect::language], or a vector of
+#' characters (e.g. `c("cases", "deaths")`). Variable names can be used as if
+#' they were positions in the data frame, so expressions like `x:y` can be
+#' used to select a range of variables.
 #'
 #' The tidy-selection renaming interface is not supported, and cannot be used
 #' to provide output column names; if you want to customize the output column
 #' names, use [`dplyr::rename`] after the slide.
-#' @param as_list_col Not supported. Included to match `epi_slide` interface.
-#' @param new_col_name Character vector indicating the name(s) of the new
-#' column(s) that will contain the derivative values. Default
-#' is "slide_value"; note that setting `new_col_name` equal to any existing
-#' column names will overwrite those columns. If `names_sep` is `NULL`,
-#' `new_col_name` must be the same length as `col_names`.
diff --git a/man-roxygen/ref-time-value-label.R b/man-roxygen/ref-time-value-label.R
new file mode 100644
index 00000000..c81615b9
--- /dev/null
+++ b/man-roxygen/ref-time-value-label.R
@@ -0,0 +1,2 @@
+#' @param .ref_time_value_label String; how to describe/label the `ref_time_value` in
+#' error messages; e.g., "reference time value" or "version".
diff --git a/man/apply_compactify.Rd b/man/apply_compactify.Rd
new file mode 100644
index 00000000..14b884c6
--- /dev/null
+++ b/man/apply_compactify.Rd
@@ -0,0 +1,15 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/archive.R
+\name{apply_compactify}
+\alias{apply_compactify}
+\title{Given a tibble as would be found in an epi_archive, remove duplicate entries.}
+\usage{
+apply_compactify(df, keys, tolerance = .Machine$double.eps^0.5)
+}
+\description{
+Works by shifting all rows except the version, then comparing values to see
+if they've changed. We need to arrange in descending order, but note that
+we don't need to group: if at least one column other than version has
+changed, the row is kept.
+}
+\keyword{internal}
diff --git a/man/arrange_canonical.Rd b/man/arrange_canonical.Rd
new file mode 100644
index 00000000..3d29c2af
--- /dev/null
+++ b/man/arrange_canonical.Rd
@@ -0,0 +1,19 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/methods-epi_df.R
+\name{arrange_canonical}
+\alias{arrange_canonical}
+\title{Arrange an epi_df into a standard order}
+\usage{
+arrange_canonical(x, ...)
+}
+\arguments{
+\item{x}{an \code{epi_df}. Other objects will produce a warning and return as is.}
+
+\item{...}{not used}
+}
+\description{
+Moves \code{\link[=key_colnames]{key_colnames()}} to the left, then arranges rows based on that
+ordering. This function is mainly for use in tests and to make other
+function output predictably ordered, where necessary.
+}
+\keyword{internal}
diff --git a/man/as_slide_computation.Rd b/man/as_slide_computation.Rd
new file mode 100644
index 00000000..3db5a940
--- /dev/null
+++ b/man/as_slide_computation.Rd
@@ -0,0 +1,113 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils.R
+\name{as_slide_computation}
+\alias{as_slide_computation}
+\alias{as_time_slide_computation}
+\alias{as_diagonal_slide_computation}
+\title{Generate an \verb{epi[x]_slide} computation function from a function, formula, or quosure}
+\source{
+This code and documentation are based on
+\href{https://github.com/r-lib/rlang/blob/c55f6027928d3104ed449e591e8a225fcaf55e13/R/fn.R#L343-L427}{\code{as_function}}
+from Hadley Wickham's \code{rlang} package.
+
+Below is the original license for the \code{rlang} package.
+
+MIT License
+
+Copyright (c) 2020 rlang authors
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+Portions of the original code used in this adaptation:
+\enumerate{
+\item Much of the documentation and examples
+\item The general flow of the function, including branching conditions
+\item Error conditions and wording
+\item The chunk converting a formula into a function, see
+https://github.com/r-lib/rlang/blob/c55f6027928d3104ed449e591e8a225fcaf55e13/R/fn.R#L411-L418
+}
+
+Changes made include:
+\enumerate{
+\item Updates to documentation due to new functionality
+\item The removal of function-as-string processing logic and helper arg
+\code{env}
+\item The addition of an output function wrapper that defines a data mask
+for evaluating quosures
+\item Calling an argument-checking function
+\item Replacing rlang error functions with internal error functions
+}
+}
+\usage{
+as_slide_computation(
+  .f,
+  ...,
+  .ref_time_value_long_varnames,
+  .ref_time_value_label
+)
+
+as_time_slide_computation(.f, ...)
+
+as_diagonal_slide_computation(.f, ...)
+}
+\arguments{
+\item{...}{Additional arguments to pass to the function or formula
+specified via \code{x}. If \code{x} is a quosure, any arguments passed via \code{...}
+will be ignored.}
+
+\item{.ref_time_value_long_varnames}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}}
+Character vector. What variable names should we allow formulas and
+data-masking tidy evaluation to use to refer to \code{ref_time_value} for the
+computation (in addition to \code{.z} in formulas)? E.g., \code{".ref_time_value"} or
+\code{c(".ref_time_value", ".version")}.}
+
+\item{.ref_time_value_label}{String; how to describe/label the \code{ref_time_value} in
+error messages; e.g., "reference time value" or "version".}
+
+\item{.f}{A function, one-sided formula, or quosure.
+
+If a \strong{function}, the function is returned as-is, with no
+modifications.
+
+If a \strong{formula}, e.g. \code{~ mean(.x$cases)}, it is converted to a function
+with up to three arguments: \code{.x} (single argument), or \code{.x} and \code{.y}
+(two arguments), or \code{.x}, \code{.y}, and \code{.z} (three arguments). The \code{.}
+placeholder can be used instead of \code{.x}, \code{.group_key} can be used in
+place of \code{.y}, and \code{.ref_time_value} can be used in place of \code{.z}. This
+allows you to create very compact anonymous functions (lambdas) with up
+to three inputs. Functions created from formulas have a special class. Use
+\code{inherits(fn, "epiprocess_formula_slide_computation")} to test for it.
+
+If a \strong{quosure}, in the case that \code{.f} was not provided to the parent
+\verb{epi[x]_slide} call and the \code{...} is interpreted as an expression for
+tidy evaluation, it is evaluated within a wrapper function. The wrapper
+sets up object access via a data mask.}
+}
+\description{
+\code{as_slide_computation()} transforms a one-sided formula or a
+quosure into a function; functions are returned as-is or with light
+modifications to calculate \code{ref_time_value}.
+
+This code extends \code{rlang::as_function} to create functions that take three
+arguments. The arguments can be accessed via the idiomatic \code{.}, \code{.x}, and
+\code{.y}, extended to include \code{.z}; positional references \code{..1} and \code{..2},
+extended to include \code{..3}; and also by \verb{epi[x]_slide}-specific names
+\code{.group_key} and \code{.ref_time_value}.
+}
+\keyword{internal}
diff --git a/man/as_tibble.epi_df.Rd b/man/as_tibble.epi_df.Rd
index 174768e5..9d016cd6 100644
--- a/man/as_tibble.epi_df.Rd
+++ b/man/as_tibble.epi_df.Rd
@@ -9,7 +9,7 @@
 \arguments{
 \item{x}{an \code{epi_df}}

-\item{...}{additional arguments to forward to \code{NextMethod()}}
+\item{...}{Unused, for extensibility.}
 }
 \description{
 Converts an \code{epi_df} object into a tibble, dropping metadata and any
diff --git a/man/complete.epi_df.Rd b/man/complete.epi_df.Rd
new file mode 100644
index 00000000..9d791fb7
--- /dev/null
+++ b/man/complete.epi_df.Rd
@@ -0,0 +1,84 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/methods-epi_df.R
+\name{complete.epi_df}
+\alias{complete.epi_df}
+\title{Complete epi_df}
+\usage{
+\method{complete}{epi_df}(data, ..., fill = list(), explicit = TRUE)
+}
+\arguments{
+\item{data}{an \code{epi_df}}
+
+\item{...}{see \code{\link[tidyr:complete]{tidyr::complete}}}
+
+\item{fill}{see \code{\link[tidyr:complete]{tidyr::complete}}}
+
+\item{explicit}{see \code{\link[tidyr:complete]{tidyr::complete}}}
+}
+\description{
+A \code{tidyr::complete()} analogue for \code{epi_df} objects. This function
+can be used, for example, to add rows for missing combinations
+of \code{geo_value} and \code{time_value}, filling other columns with \code{NA}s.
+See the examples for usage details.
+}
+\examples{
+start_date <- as.Date("2020-01-01")
+daily_edf <- tibble::tribble(
+  ~geo_value, ~time_value, ~value,
+  1, start_date + 1, 1,
+  1, start_date + 3, 3,
+  2, start_date + 2, 2,
+  2, start_date + 3, 3,
+) \%>\%
+  as_epi_df(as_of = start_date + 3)
+# Complete without grouping puts all the geo_values on the same min and max
+# time_value index
+daily_edf \%>\%
+  complete(geo_value, time_value = full_seq(time_value, period = 1))
+# Complete with grouping puts all the geo_values on individual min and max
+# time_value indices
+daily_edf \%>\%
+  group_by(geo_value) \%>\%
+  complete(time_value = full_seq(time_value, period = 1))
+# Complete has explicit=TRUE by default, but if it's FALSE, then complete
+# only fills the implicit gaps, not those that are explicitly NA
+daily_edf <- tibble::tribble(
+  ~geo_value, ~time_value, ~value,
+  1, start_date + 1, 1,
+  1, start_date + 2, NA,
+  1, start_date + 3, 3,
+  2, start_date + 2, 2,
+  2, start_date + 3, 3,
+) \%>\%
+  as_epi_df(as_of = start_date + 3)
+daily_edf \%>\%
+  complete(
+    geo_value,
+    time_value = full_seq(time_value, period = 1),
+    fill = list(value = 0),
+    explicit = FALSE
+  )
+# Complete works for weekly data and can take a fill value
+# No grouping
+weekly_edf <- tibble::tribble(
+  ~geo_value, ~time_value, ~value,
+  1, start_date + 1, 1,
+  1, start_date + 15, 3,
+  2, start_date + 8, 2,
+  2, start_date + 15, 3,
+) \%>\%
+  as_epi_df(as_of = start_date + 3)
+weekly_edf \%>\%
+  complete(
+    geo_value,
+    time_value = full_seq(time_value, period = 7),
+    fill = list(value = 0)
+  )
+# With grouping
+weekly_edf \%>\%
+  group_by(geo_value) \%>\%
+  complete(
+    time_value = full_seq(time_value, period = 7),
+    fill = list(value = 0)
+  )
+}
diff --git a/man/detect_outlr_rm.Rd b/man/detect_outlr_rm.Rd
index 333c4a7b..b57c4445 100644
--- a/man/detect_outlr_rm.Rd
+++ b/man/detect_outlr_rm.Rd
@@ -65,6 +65,5 @@ incidence_num_outlier_example \%>\%
   group_by(geo_value) \%>\%
   mutate(outlier_info = detect_outlr_rm(
     x = time_value, y = cases
-  )) \%>\%
-  unnest(outlier_info)
+  ))
 }
diff --git a/man/detect_outlr_stl.Rd b/man/detect_outlr_stl.Rd
index 695c2de7..fb69e8da 100644
--- a/man/detect_outlr_stl.Rd
+++ b/man/detect_outlr_stl.Rd
@@ -96,7 +96,6 @@ incidence_num_outlier_example \%>\%
   group_by(geo_value) \%>\%
   mutate(outlier_info = detect_outlr_stl(
     x = time_value, y = cases,
-    seasonal_period = 7
-  )) \%>\% # weekly seasonality for daily data
-  unnest(outlier_info)
+    seasonal_period = 7 # weekly seasonality for daily data
+  ))
 }
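With `unnest()` dropped from the examples above, `outlier_info` stays as a packed data-frame column; a sketch of recovering flat columns, mirroring the removed example lines:

incidence_num_outlier_example %>%
  group_by(geo_value) %>%
  mutate(outlier_info = detect_outlr_rm(x = time_value, y = cases)) %>%
  unnest(outlier_info) # optional post-processing, as in the old example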
diff --git a/man/difftime_summary.Rd b/man/difftime_summary.Rd
new file mode 100644
index 00000000..ef153f3d
--- /dev/null
+++ b/man/difftime_summary.Rd
@@ -0,0 +1,12 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/revision_analysis.R
+\name{difftime_summary}
+\alias{difftime_summary}
+\title{Summarize a difftime (base \code{summary} doesn't work on difftimes)}
+\usage{
+difftime_summary(diff_time_val)
+}
+\description{
+Summarize a difftime (base \code{summary} doesn't work on difftimes)
+}
+\keyword{internal}
diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd
index 74591693..a5055f4e 100644
--- a/man/epi_archive.Rd
+++ b/man/epi_archive.Rd
@@ -12,16 +12,15 @@ new_epi_archive(
   geo_type,
   time_type,
   other_keys,
-  additional_metadata,
   compactify,
   clobberable_versions_start,
-  versions_end
+  versions_end,
+  compactify_tol = .Machine$double.eps^0.5
 )

 validate_epi_archive(
   x,
   other_keys,
-  additional_metadata,
   compactify,
   clobberable_versions_start,
   versions_end
@@ -31,8 +30,7 @@ as_epi_archive(
   x,
   geo_type = deprecated(),
   time_type = deprecated(),
-  other_keys = character(0L),
-  additional_metadata = list(),
+  other_keys = character(),
   compactify = NULL,
   clobberable_versions_start = NA,
   .versions_end = max_version_with_row_in(x),
@@ -53,11 +51,8 @@ if the time type is not recognized.}

 \item{other_keys}{Character vector specifying the names of variables in \code{x}
 that should be considered key variables (in the language of \code{data.table})
-apart from "geo_value", "time_value", and "version".}
-
-\item{additional_metadata}{List of additional metadata to attach to the
-\code{epi_archive} object. The metadata will have the \code{geo_type} field; named
-entries from the passed list or will be included as well.}
+apart from "geo_value", "time_value", and "version". Typical examples
+are "age" or more granular geographies.}

 \item{compactify}{Optional; Boolean. \code{TRUE} will remove some
 redundant rows, \code{FALSE} will not, and missing or \code{NULL} will remove
@@ -89,6 +84,8 @@ value of \code{clobberable_versions_start} does not fully trust these empty
 updates, and assumes that any version \verb{>= max(x$version)} could be
 clobbered.) If \code{nrow(x) == 0}, then this argument is mandatory.}

+\item{compactify_tol}{Double. The tolerance used to detect approximate equality for compactification.}
+
 \item{.versions_end}{location based versions_end, used to avoid prefix
 \code{version = issue} from being assigned to \code{versions_end} instead of
 being used to rename columns.}
@@ -133,7 +130,8 @@ object:
 \itemize{
 \item \code{geo_type}: the type for the geo values.
 \item \code{time_type}: the type for the time values.
-\item \code{additional_metadata}: list of additional metadata for the data archive.
+\item \code{other_keys}: any additional keys as a character vector.
+Typical examples are "age" or sub-geographies.
 }

 While this metadata is not protected, it is generally recommended to treat it
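A sketch of what the new `compactify_tol` enables, with hypothetical values: a re-reported value within the tolerance is treated as redundant and compactified away.

tbl <- tibble::tibble(
  geo_value = "ak",
  time_value = as.Date("2020-01-01"),
  version = as.Date("2020-01-01") + 0:1,
  value = c(1, 1 + 1e-10) # within the default tolerance .Machine$double.eps^0.5
)
as_epi_archive(tbl, compactify = TRUE) # keeps only the version-1 row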
diff --git a/man/epi_df.Rd b/man/epi_df.Rd
index dbb4a917..38f923c5 100644
--- a/man/epi_df.Rd
+++ b/man/epi_df.Rd
@@ -1,22 +1,15 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/epi_df.R
-\name{new_epi_df}
-\alias{new_epi_df}
+\name{as_epi_df}
 \alias{as_epi_df}
 \alias{as_epi_df.epi_df}
 \alias{as_epi_df.tbl_df}
 \alias{as_epi_df.data.frame}
 \alias{as_epi_df.tbl_ts}
+\alias{new_epi_df}
+\alias{epi_df}
 \title{\code{epi_df} object}
 \usage{
-new_epi_df(
-  x = tibble::tibble(),
-  geo_type,
-  time_type,
-  as_of,
-  additional_metadata = list()
-)
-
 as_epi_df(x, ...)

 \method{as_epi_df}{epi_df}(x, ...)
@@ -26,23 +19,39 @@ as_epi_df(x, ...)
   geo_type = deprecated(),
   time_type = deprecated(),
   as_of,
-  additional_metadata = list(),
+  other_keys = character(),
   ...
 )

-\method{as_epi_df}{data.frame}(x, as_of, additional_metadata = list(), ...)
+\method{as_epi_df}{data.frame}(x, as_of, other_keys = character(), ...)
+
+\method{as_epi_df}{tbl_ts}(x, as_of, other_keys = character(), ...)

-\method{as_epi_df}{tbl_ts}(x, as_of, additional_metadata = list(), ...)
+new_epi_df(
+  x = tibble::tibble(geo_value = character(), time_value = as.Date(integer())),
+  geo_type,
+  time_type,
+  as_of,
+  other_keys = character(),
+  ...
+)
 }
 \arguments{
-\item{x}{A data.frame, \link[tibble:tibble]{tibble::tibble}, or \link[tsibble:tsibble]{tsibble::tsibble} to be converted}
+\item{x}{An \code{epi_df}, \code{data.frame}, \link[tibble:tibble]{tibble::tibble}, or \link[tsibble:tsibble]{tsibble::tsibble}
+to be converted}
+
+\item{...}{Additional arguments passed to methods.}

-\item{geo_type}{DEPRECATED Has no effect. Geo value type is inferred from the
-location column and set to "custom" if not recognized.}
+\item{geo_type}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} in \code{as_epi_df()}, has no
+effect; the geo value type is inferred from the location column and set to
+"custom" if not recognized. In \code{new_epi_df()}, should be set to the same
+value that would be inferred.}

-\item{time_type}{DEPRECATED Has no effect. Time value type inferred from the time
-column and set to "custom" if not recognized. Unpredictable behavior may result
-if the time type is not recognized.}
+\item{time_type}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} in \code{as_epi_df()}, has no
+effect; the time value type is inferred from the time column and set to
+"custom" if not recognized. Unpredictable behavior may result if the time
+type is not recognized. In \code{new_epi_df()}, should be set to the same value
+that would be inferred.}

 \item{as_of}{Time value representing the time at which the given data were
 available. For example, if \code{as_of} is January 31, 2022, then the \code{epi_df}
 object that is created would represent the most up-to-date version of the
 data available as of January 31, 2022. If the \code{as_of} argument is missing,
 then the current day-time will be used.}

-\item{additional_metadata}{List of additional metadata to attach to the
-\code{epi_df} object. The metadata will have \code{geo_type}, \code{time_type}, and
-\code{as_of} fields; named entries from the passed list will be included as
-well. If your tibble has additional keys, be sure to specify them as a
-character vector in the \code{other_keys} component of \code{additional_metadata}.}
-
-\item{...}{Additional arguments passed to methods.}
+\item{other_keys}{If your tibble has additional keys, be sure to specify them
+as a character vector here (typical examples are "age" or sub-geographies).}
 }
 \value{
 An \code{epi_df} object.
@@ -114,6 +118,13 @@ data versioning works in the \code{epiprocess} package (including how to
 generate \code{epi_df} objects, as data snapshots, from an \code{epi_archive}
 object).
 }
+\section{Functions}{
+\itemize{
+\item \code{as_epi_df()}: The preferred way of constructing \code{epi_df}s
+
+\item \code{new_epi_df()}: Lower-level constructor for an \code{epi_df} object
+
+}}
 \section{Geo Types}{

 The following geo types are recognized in an \code{epi_df}.
@@ -197,7 +208,7 @@ ex2 <- ex2_input \%>\%
   dplyr::rename(geo_value = state, time_value = reported_date) \%>\%
   as_epi_df(
     as_of = "2020-06-03",
-    additional_metadata = list(other_keys = "pol")
+    other_keys = "pol"
   )

 attr(ex2, "metadata")
@@ -216,9 +227,7 @@ ex3 <- ex3_input \%>\%
   dplyr::mutate(
     state = rep("MA", 6),
     pol = rep(c("blue", "swing", "swing"), each = 2)
   ) \%>\%
-  # the 2 extra keys we added have to be specified in the other_keys
-  # component of additional_metadata.
-  as_epi_df(additional_metadata = list(other_keys = c("state", "pol")))
+  as_epi_df(other_keys = c("state", "pol"))

 attr(ex3, "metadata")
 }
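The `additional_metadata`-to-`other_keys` migration in a nutshell (hypothetical tibble `x` with an extra "age" key column):

# old: as_epi_df(x, additional_metadata = list(other_keys = "age"))
as_epi_df(x, other_keys = "age")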
- as_epi_df(additional_metadata = list(other_keys = c("state", "pol"))) + as_epi_df(other_keys = c("state", "pol")) attr(ex3, "metadata") } diff --git a/man/epi_slide.Rd b/man/epi_slide.Rd index 5f4db7b4..74929eb1 100644 --- a/man/epi_slide.Rd +++ b/man/epi_slide.Rd @@ -5,105 +5,92 @@ \title{Slide a function over variables in an \code{epi_df} object} \usage{ epi_slide( - x, - f, + .x, + .f, ..., - before = NULL, - after = NULL, - ref_time_values = NULL, - new_col_name = "slide_value", - as_list_col = FALSE, - names_sep = "_", - all_rows = FALSE + .window_size = NULL, + .align = c("right", "center", "left"), + .ref_time_values = NULL, + .new_col_name = NULL, + .all_rows = FALSE ) } \arguments{ -\item{x}{The \code{epi_df} object under consideration, \link[dplyr:group_by]{grouped} -or ungrouped. If ungrouped, all data in \code{x} will be treated as part of a +\item{.x}{The \code{epi_df} object under consideration, \link[dplyr:group_by]{grouped} +or ungrouped. If ungrouped, all data in \code{.x} will be treated as part of a single data group.} -\item{f}{Function, formula, or missing; together with \code{...} specifies the +\item{.f}{Function, formula, or missing; together with \code{...} specifies the computation to slide. To "slide" means to apply a computation within a sliding (a.k.a. "rolling") time window for each data group. The window is -determined by the \code{before} and \code{after} parameters described below. One time -step is typically one day or one week; see details for more explanation. If -a function, \code{f} must take a data frame with the same column names as -the original object, minus any grouping variables, containing the time -window data for one group-\code{ref_time_value} combination; followed by a -one-row tibble containing the values of the grouping variables for the -associated group; followed by any number of named arguments. If a formula, -\code{f} can operate directly on columns accessed via \code{.x$var} or \code{.$var}, as -in \code{~mean(.x$var)} to compute a mean of a column \code{var} for each -\code{ref_time_value}-group combination. The group key can be accessed via \code{.y}. -If \code{f} is missing, then \code{...} will specify the computation.} - -\item{...}{Additional arguments to pass to the function or formula specified -via \code{f}. Alternatively, if \code{f} is missing, then the \code{...} is interpreted as -an expression for tidy evaluation; in addition to referring to columns -directly by name, the expression has access to \code{.data} and \code{.env} pronouns -as in \code{dplyr} verbs, and can also refer to \code{.x}, \code{.group_key}, and -\code{.ref_time_value}. See details.} - -\item{before, after}{How far \code{before} and \code{after} each \code{ref_time_value} should -the sliding window extend? At least one of these two arguments must be -provided; the other's default will be 0. The accepted values for these -depend on the type of the \code{time_value} column: +determined by the \code{.window_size} and \code{.align} parameters, see the details +section for more. 
If a function, \code{.f} must have the form \verb{function(x, g, t, ...)}, where \itemize{ -\item if it is a Date and the cadence is daily, then they can be integers -(which will be interpreted in units of days) or difftimes with units -"days" -\item if it is a Date and the cadence is weekly, then they must be difftimes -with units "weeks" -\item if it is an integer, then they must be integers +\item \code{x} is a data frame with the same column names as the original object, +minus any grouping variables, with only the windowed data for one +group-\code{.ref_time_value} combination +\item \code{g} is a one-row tibble containing the values of the grouping variables +for the associated group +\item \code{t} is the \code{.ref_time_value} for the current window +\item \code{...} are additional arguments } -Endpoints of the window are inclusive. Common settings: -\itemize{ -\item For trailing/right-aligned windows from \code{ref_time_value - k} to -\code{ref_time_value}: either pass \code{before=k} by itself, or pass \verb{before=k, after=0}. -\item For center-aligned windows from \code{ref_time_value - k} to -\code{ref_time_value + k}: pass \verb{before=k, after=k}. -\item For leading/left-aligned windows from \code{ref_time_value} to -\code{ref_time_value + k}: either pass pass \code{after=k} by itself, -or pass \verb{before=0, after=k}. -} - -See "Details:" on how missing rows are handled within the window.} +If a formula, \code{.f} can operate directly on columns accessed via \code{.x$var} or +\code{.$var}, as in \code{~mean(.x$var)} to compute a mean of a column \code{var} for each +\code{ref_time_value}-group combination. The group key can be accessed via \code{.y}. +If \code{.f} is missing, then \code{...} will specify the computation.} -\item{ref_time_values}{Time values for sliding computations, meaning, each +\item{...}{Additional arguments to pass to the function or formula specified +via \code{.f}. Alternatively, if \code{.f} is missing, then the \code{...} is interpreted +as a \link[rlang:args_data_masking]{"data-masking"} expression or expressions +for tidy evaluation; in addition to referring columns directly by name, the +expressions have access to \code{.data} and \code{.env} pronouns as in \code{dplyr} verbs, +and can also refer to \code{.x} (not the same as the input epi_df), +\code{.group_key}, and \code{.ref_time_value}. See details.} + +\item{.window_size}{The size of the sliding window. By default, this is 1, +meaning that only the current ref_time_value is included. The accepted values +here depend on the \code{time_value} column: +\itemize{ +\item if time_type is Date and the cadence is daily, then \code{.window_size} can be +an integer (which will be interpreted in units of days) or a difftime +with units "days" +\item if time_type is Date and the cadence is weekly, then \code{.window_size} must +be a difftime with units "weeks" +\item if time_type is an integer, then \code{.window_size} must be an integer +}} + +\item{.align}{The alignment of the sliding window. If \code{right} (default), then +the window has its end at the reference time; if \code{center}, then the window is +centered at the reference time; if \code{left}, then the window has its start at +the reference time. If the alignment is \code{center} and the window size is odd, +then the window will have floor(window_size/2) points before and after the +reference time. 
If the window size is even, then the window will be +asymmetric and have one less value on the right side of the reference time +(assuming time increases from left to right).} + +\item{.ref_time_values}{Time values for sliding computations, meaning, each element of this vector serves as the reference time point for one sliding window. If missing, then this will be set to all unique time values in the underlying data table, by default.} -\item{new_col_name}{String indicating the name of the new column that will -contain the derivative values. Default is "slide_value"; note that setting -\code{new_col_name} equal to an existing column name will overwrite this column.} - -\item{as_list_col}{Should the slide results be held in a list column, or be -\link[tidyr:chop]{unchopped}/\link[tidyr:unnest]{unnested}? Default is \code{FALSE}, -in which case a list object returned by \code{f} would be unnested (using -\code{\link[tidyr:unnest]{tidyr::unnest()}}), and, if the slide computations output data frames, -the names of the resulting columns are given by prepending \code{new_col_name} -to the names of the list elements.} - -\item{names_sep}{String specifying the separator to use in \code{tidyr::unnest()} -when \code{as_list_col = FALSE}. Default is "_". Using \code{NULL} drops the prefix -from \code{new_col_name} entirely.} +\item{.new_col_name}{String indicating the name of the new column that will +contain the derivative values. The default is "slide_value" unless your +slide computations output data frames, in which case they will be unpacked +into the constituent columns and those names used. New columns should not +be given names that clash with the existing columns of \code{.x}; see details.} -\item{all_rows}{If \code{all_rows = TRUE}, then all rows of \code{x} will be kept in -the output even with \code{ref_time_values} provided, with some type of missing +\item{.all_rows}{If \code{.all_rows = TRUE}, then all rows of \code{.x} will be kept in +the output even with \code{.ref_time_values} provided, with some type of missing value marker for the slide computation output column(s) for \code{time_value}s -outside \code{ref_time_values}; otherwise, there will be one row for each row in -\code{x} that had a \code{time_value} in \code{ref_time_values}. Default is \code{FALSE}. The +outside \code{.ref_time_values}; otherwise, there will be one row for each row in +\code{.x} that had a \code{time_value} in \code{.ref_time_values}. Default is \code{FALSE}. The missing value marker is the result of \code{vctrs::vec_cast}ing \code{NA} to the type -of the slide computation output. If using \code{as_list_col = TRUE}, note that -the missing marker is a \code{NULL} entry in the list column; for certain -operations, you might want to replace these \code{NULL} entries with a different -\code{NA} marker.} +of the slide computation output.} } \value{ -An \code{epi_df} object given by appending one or more new columns to -\code{x}, named according to the \code{new_col_name} argument. +An \code{epi_df} object given by appending one or more new columns to \code{.x}, +named according to the \code{.new_col_name} argument. } \description{ Slides a given function over variables in an \code{epi_df} object. See the @@ -112,40 +99,80 @@ for examples. } \details{ To "slide" means to apply a function or formula over a rolling -window of time steps for each data group, where the window is centered at a -reference time and left and right endpoints are given by the \code{before} and -\code{after} arguments. 
- -If there are not enough time steps available to complete the window at any -given reference time, then \code{epi_slide()} still attempts to perform the -computation anyway (it does not require a complete window). The issue of -what to do with partial computations (those run on incomplete windows) is -therefore left up to the user, either through the specified function or -formula \code{f}, or through post-processing. For a centrally-aligned slide of -\code{n} \code{time_value}s in a sliding window, set \code{before = (n-1)/2} and \code{after = (n-1)/2} when the number of \code{time_value}s in a sliding window is odd and -\code{before = n/2-1} and \code{after = n/2} when \code{n} is even. - -Sometimes, we want to experiment with various trailing or leading window -widths and compare the slide outputs. In the (uncommon) case where -zero-width windows are considered, manually pass both the \code{before} and -\code{after} arguments. - -If \code{f} is missing, then an expression for tidy evaluation can be specified, -for example, as in: - -\if{html}{\out{
}}\preformatted{epi_slide(x, cases_7dav = mean(cases), before = 6)
+window. The \code{.window_size} arg determines the width of the window
+(including the reference time) and the \code{.align} arg governs how the window
+is aligned (see below for examples). The \code{.ref_time_values} arg controls
+which time values to consider for the slide and \code{.all_rows} allows you to
+keep NAs around.
+
+\code{epi_slide()} does not require a complete window (such as on the left
+boundary of the dataset) and will attempt to perform the computation
+anyway. The issue of what to do with partial computations (those run on
+incomplete windows) is therefore left up to the user, either through the
+specified function or formula, or through post-processing.
+
+Let's look at some window examples, assuming that the reference time value
+is "tv". With .align = "right" and .window_size = 3, the window will be:
+
+time_values: tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3
+window: tv - 2, tv - 1, tv
+
+With .align = "center" and .window_size = 3, the window will be:
+
+time_values: tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3
+window: tv - 1, tv, tv + 1
+
+With .align = "center" and .window_size = 4, the window will be:
+
+time_values: tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3
+window: tv - 2, tv - 1, tv, tv + 1
+
+With .align = "left" and .window_size = 3, the window will be:
+
+time_values: tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3
+window: tv, tv + 1, tv + 2
+
+If \code{.f} is missing, then \link[rlang:args_data_masking]{"data-masking"}
+expression(s) for tidy evaluation can be specified, for example, as in:
+
+\if{html}{\out{
}}\preformatted{epi_slide(x, cases_7dav = mean(cases), .window_size = 7) }\if{html}{\out{
}} which would be equivalent to: -\if{html}{\out{
}}\preformatted{epi_slide(x, function(x, g) mean(x$cases), before = 6, - new_col_name = "cases_7dav") +\if{html}{\out{
}}\preformatted{epi_slide(x, function(x, g, t) mean(x$cases), .window_size = 7, + .new_col_name = "cases_7dav") }\if{html}{\out{
}} -Thus, to be clear, when the computation is specified via an expression for -tidy evaluation (first example, above), then the name for the new column is -inferred from the given expression and overrides any name passed explicitly -through the \code{new_col_name} argument. +In a manner similar to \code{\link[dplyr:mutate]{dplyr::mutate}}: +\itemize{ +\item Expressions evaluating to length-1 vectors will be recycled to +appropriate lengths. +\item \verb{, name_var := value} can be used to set the output column name based on +a variable \code{name_var} rather than requiring you to use a hard-coded +name. (The leading comma is needed to make sure that \code{.f} is treated as +missing.) +\item \verb{= NULL} can be used to remove results from previous expressions (though +we don't allow it to remove pre-existing columns). +\item \verb{, fn_returning_a_data_frame(.x)} will unpack the output of the function +into multiple columns in the result. +\item Named expressions evaluating to data frames will be placed into +\code{\link[tidyr:pack]{tidyr::pack}}ed columns. +} + +In addition to \code{\link{.data}} and \code{\link{.env}}, we make some additional +"pronoun"-like bindings available: +\itemize{ +\item .x, which is like \code{.x} in \code{\link[dplyr:group_map]{dplyr::group_modify}}; an ordinary object +like an \code{epi_df} rather than an rlang \link[rlang:as_data_mask]{pronoun} +like \code{\link{.data}}; this allows you to use additional \code{dplyr}, \code{tidyr}, and +\code{epiprocess} operations. If you have multiple expressions in \code{...}, this +won't let you refer to the output of the earlier expressions, but \code{.data} +will. +\item .group_key, which is like \code{.y} in \code{\link[dplyr:group_map]{dplyr::group_modify}}. +\item .ref_time_value, which is the element of \code{.ref_time_values} that +determined the time window for the current computation. +} } \examples{ # slide a 7-day trailing average formula on cases @@ -153,32 +180,28 @@ through the \code{new_col_name} argument. # the `epi_slide_mean` and `epi_slide_sum` functions instead. jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide(cases_7dav = mean(cases), before = 6) \%>\% - # Remove a nonessential var. to ensure new col is printed + epi_slide(cases_7dav = mean(cases), .window_size = 7) \%>\% dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% ungroup() # slide a 7-day leading average jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide(cases_7dav = mean(cases), after = 6) \%>\% - # Remove a nonessential var. to ensure new col is printed + epi_slide(cases_7dav = mean(cases), .window_size = 7, .align = "left") \%>\% dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% ungroup() -# slide a 7-day centre-aligned average +# slide a 7-day center-aligned average jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide(cases_7dav = mean(cases), before = 3, after = 3) \%>\% - # Remove a nonessential var. to ensure new col is printed + epi_slide(cases_7dav = mean(cases), .window_size = 7, .align = "center") \%>\% dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% ungroup() -# slide a 14-day centre-aligned average +# slide a 14-day center-aligned average jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide(cases_14dav = mean(cases), before = 6, after = 7) \%>\% - # Remove a nonessential var. 
to ensure new col is printed + epi_slide(cases_14dav = mean(cases), .window_size = 14, .align = "center") \%>\% dplyr::select(geo_value, time_value, cases, cases_14dav) \%>\% ungroup() @@ -186,11 +209,11 @@ jhu_csse_daily_subset \%>\% jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% epi_slide( - a = data.frame( + cases_2d = list(data.frame( cases_2dav = mean(cases), cases_2dma = mad(cases) - ), - before = 1, as_list_col = TRUE + )), + .window_size = 2 ) \%>\% ungroup() } diff --git a/man/epi_slide_mean.Rd b/man/epi_slide_mean.Rd index aeb56729..09faefb6 100644 --- a/man/epi_slide_mean.Rd +++ b/man/epi_slide_mean.Rd @@ -5,124 +5,120 @@ \title{Optimized slide function for performing rolling averages on an \code{epi_df} object} \usage{ epi_slide_mean( - x, - col_names, + .x, + .col_names, ..., - before = NULL, - after = NULL, - ref_time_values = NULL, - new_col_name = NULL, - as_list_col = NULL, - names_sep = NULL, - all_rows = FALSE + .window_size = 1, + .align = c("right", "center", "left"), + .ref_time_values = NULL, + .all_rows = FALSE ) } \arguments{ -\item{x}{The \code{epi_df} object under consideration, \link[dplyr:group_by]{grouped} -or ungrouped. If ungrouped, all data in \code{x} will be treated as part of a +\item{.x}{The \code{epi_df} object under consideration, \link[dplyr:group_by]{grouped} +or ungrouped. If ungrouped, all data in \code{.x} will be treated as part of a single data group.} -\item{col_names}{<\code{\link[=dplyr_tidy_select]{tidy-select}}> An unquoted column -name(e.g., \code{cases}), multiple column names (e.g., \code{c(cases, deaths)}), or -\link[tidyselect:language]{other tidy-select expression}. Variable names can -be used as if they were positions in the data frame, so expressions like -\code{x:y} can be used to select a range of variables. If you have the desired -column names stored in a vector \code{vars}, use \code{col_names = all_of(vars)}. +\item{.col_names}{<\code{\link[=dplyr_tidy_select]{tidy-select}}> An unquoted column +name(e.g., \code{cases}), multiple column names (e.g., \code{c(cases, deaths)}), +\link[tidyselect:language]{other tidy-select expression}, or a vector of +characters (e.g. \code{c("cases", "deaths")}). Variable names can be used as if +they were positions in the data frame, so expressions like \code{x:y} can be +used to select a range of variables. The tidy-selection renaming interface is not supported, and cannot be used to provide output column names; if you want to customize the output column names, use \code{\link[dplyr:rename]{dplyr::rename}} after the slide.} -\item{...}{Additional arguments to pass to \code{data.table::frollmean}, for -example, \code{na.rm} and \code{algo}. \code{data.table::frollmean} is automatically -passed the data \code{x} to operate on, the window size \code{n}, and the alignment -\code{align}. Providing these args via \code{...} will cause an error.} +\item{...}{Additional arguments to pass to the slide computation \code{.f}, for +example, \code{algo} or \code{na.rm} in data.table functions. You don't need to +specify \code{.x}, \code{.window_size}, or \code{.align} (or \code{before}/\code{after} for slider +functions).} -\item{before, after}{How far \code{before} and \code{after} each \code{ref_time_value} should -the sliding window extend? At least one of these two arguments must be -provided; the other's default will be 0. The accepted values for these -depend on the type of the \code{time_value} column: +\item{.window_size}{The size of the sliding window. 
By default, this is 1, +meaning that only the current ref_time_value is included. The accepted values +here depend on the \code{time_value} column: \itemize{ -\item if it is a Date and the cadence is daily, then they can be integers -(which will be interpreted in units of days) or difftimes with units -"days" -\item if it is a Date and the cadence is weekly, then they must be difftimes -with units "weeks" -\item if it is an integer, then they must be integers -} - -Endpoints of the window are inclusive. Common settings: -\itemize{ -\item For trailing/right-aligned windows from \code{ref_time_value - k} to -\code{ref_time_value}: either pass \code{before=k} by itself, or pass \verb{before=k, after=0}. -\item For center-aligned windows from \code{ref_time_value - k} to -\code{ref_time_value + k}: pass \verb{before=k, after=k}. -\item For leading/left-aligned windows from \code{ref_time_value} to -\code{ref_time_value + k}: either pass pass \code{after=k} by itself, -or pass \verb{before=0, after=k}. -} - -See "Details:" on how missing rows are handled within the window.} - -\item{ref_time_values}{Time values for sliding computations, meaning, each +\item if time_type is Date and the cadence is daily, then \code{.window_size} can be +an integer (which will be interpreted in units of days) or a difftime +with units "days" +\item if time_type is Date and the cadence is weekly, then \code{.window_size} must +be a difftime with units "weeks" +\item if time_type is an integer, then \code{.window_size} must be an integer +}} + +\item{.align}{The alignment of the sliding window. If \code{right} (default), then +the window has its end at the reference time; if \code{center}, then the window is +centered at the reference time; if \code{left}, then the window has its start at +the reference time. If the alignment is \code{center} and the window size is odd, +then the window will have floor(window_size/2) points before and after the +reference time. If the window size is even, then the window will be +asymmetric and have one less value on the right side of the reference time +(assuming time increases from left to right).} + +\item{.ref_time_values}{Time values for sliding computations, meaning, each element of this vector serves as the reference time point for one sliding window. If missing, then this will be set to all unique time values in the underlying data table, by default.} -\item{new_col_name}{Character vector indicating the name(s) of the new -column(s) that will contain the derivative values. Default -is "slide_value"; note that setting \code{new_col_name} equal to any existing -column names will overwrite those columns. If \code{names_sep} is \code{NULL}, -\code{new_col_name} must be the same length as \code{col_names}.} - -\item{as_list_col}{Not supported. Included to match \code{epi_slide} interface.} - -\item{names_sep}{String specifying the separator to use in \code{tidyr::unnest()} -when \code{as_list_col = FALSE}. Default is "_". 
Using \code{NULL} drops the prefix
-from \code{new_col_name} entirely.}
-
-\item{all_rows}{If \code{all_rows = TRUE}, then all rows of \code{x} will be kept in
-the output even with \code{ref_time_values} provided, with some type of missing
+\item{.all_rows}{If \code{.all_rows = TRUE}, then all rows of \code{.x} will be kept in
+the output even with \code{.ref_time_values} provided, with some type of missing
value marker for the slide computation output column(s) for \code{time_value}s
-outside \code{ref_time_values}; otherwise, there will be one row for each row in
-\code{x} that had a \code{time_value} in \code{ref_time_values}. Default is \code{FALSE}. The
+outside \code{.ref_time_values}; otherwise, there will be one row for each row in
+\code{.x} that had a \code{time_value} in \code{.ref_time_values}. Default is \code{FALSE}. The
missing value marker is the result of \code{vctrs::vec_cast}ing \code{NA} to the type
-of the slide computation output. If using \code{as_list_col = TRUE}, note that
-the missing marker is a \code{NULL} entry in the list column; for certain
-operations, you might want to replace these \code{NULL} entries with a different
-\code{NA} marker.}
+of the slide computation output.}
}
\value{
-An \code{epi_df} object given by appending one or more new columns to
-\code{x}, named according to the \code{new_col_name} argument.
+An \code{epi_df} object given by appending one or more new columns to \code{.x},
+named \code{slide_value_*} based on the columns selected by \code{.col_names}.
}
\description{
Slides an n-timestep mean over variables in an \code{epi_df} object. See the
\href{https://cmu-delphi.github.io/epiprocess/articles/slide.html}{slide vignette}
for examples.
}
\details{
-Wrapper around \code{epi_slide_opt} with \code{f = datatable::frollmean}.
-
-To "slide" means to apply a function over a rolling window of time
-steps for each data group, where the window is centered at a reference time
-and left and right endpoints are given by the \code{before} and \code{after}
-arguments.
-If there are not enough time steps available to complete the window at any
-given reference time, then \verb{epi_slide_*()} will fail; it requires a
-complete window to perform the computation. For a centrally-aligned slide
-of \code{n} \code{time_value}s in a sliding window, set \code{before = (n-1)/2} and \code{after = (n-1)/2} when the number of \code{time_value}s in a sliding window is odd and
-\code{before = n/2-1} and \code{after = n/2} when \code{n} is even.
-
-Sometimes, we want to experiment with various trailing or leading window
-widths and compare the slide outputs. In the (uncommon) case where
-zero-width windows are considered, manually pass both the \code{before} and
-\code{after} arguments.
+Wrapper around \code{epi_slide_opt} with \code{.f = data.table::frollmean}.
+
+To "slide" means to apply a function or formula over a rolling
+window. The \code{.window_size} arg determines the width of the window
+(including the reference time) and the \code{.align} arg governs how the window
+is aligned (see below for examples). The \code{.ref_time_values} arg controls
+which time values to consider for the slide and \code{.all_rows} allows you to
+keep NAs around.
+
+\verb{epi_slide_*()} does not require a complete window (such as on the left
+boundary of the dataset) and will attempt to perform the computation
+anyway.
The issue of what to do with partial computations (those run on
+incomplete windows) is therefore left up to the user, either through the
+specified function or formula \code{.f}, or through post-processing.
+
+Let's look at some window examples, assuming that the reference time value
+is "tv". With .align = "right" and .window_size = 3, the window will be:
+
+time_values: tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3
+window: tv - 2, tv - 1, tv
+
+With .align = "center" and .window_size = 3, the window will be:
+
+time_values: tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3
+window: tv - 1, tv, tv + 1
+
+With .align = "center" and .window_size = 4, the window will be:
+
+time_values: tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3
+window: tv - 2, tv - 1, tv, tv + 1
+
+With .align = "left" and .window_size = 3, the window will be:
+
+time_values: tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3
+window: tv, tv + 1, tv + 2
}
\examples{
# slide a 7-day trailing average formula on cases
jhu_csse_daily_subset \%>\%
group_by(geo_value) \%>\%
- epi_slide_mean(cases, before = 6) \%>\%
+ epi_slide_mean(cases, .window_size = 7) \%>\%
# Remove a nonessential var. to ensure new col is printed
dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) \%>\%
ungroup()
@@ -133,7 +129,7 @@ jhu_csse_daily_subset \%>\%
group_by(geo_value) \%>\%
epi_slide_mean(
cases,
- before = 6,
+ .window_size = 7,
# `frollmean` options
na.rm = TRUE, algo = "exact", hasNA = TRUE
) \%>\%
@@ -143,23 +139,23 @@ jhu_csse_daily_subset \%>\%
# slide a 7-day leading average
jhu_csse_daily_subset \%>\%
group_by(geo_value) \%>\%
- epi_slide_mean(cases, after = 6) \%>\%
+ epi_slide_mean(cases, .window_size = 7, .align = "left") \%>\%
# Remove a nonessential var. to ensure new col is printed
dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) \%>\%
ungroup()
-# slide a 7-day centre-aligned average
+# slide a 7-day center-aligned average
jhu_csse_daily_subset \%>\%
group_by(geo_value) \%>\%
- epi_slide_mean(cases, before = 3, after = 3) \%>\%
+ epi_slide_mean(cases, .window_size = 7, .align = "center") \%>\%
# Remove a nonessential var. to ensure new col is printed
dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) \%>\%
ungroup()
-# slide a 14-day centre-aligned average
+# slide a 14-day center-aligned average
jhu_csse_daily_subset \%>\%
group_by(geo_value) \%>\%
- epi_slide_mean(cases, before = 6, after = 7) \%>\%
+ epi_slide_mean(cases, .window_size = 14, .align = "center") \%>\%
- # Remove a nonessential var.
to ensure new col is printed dplyr::select(geo_value, time_value, cases, cases_14dav = slide_value_cases) \%>\% ungroup() diff --git a/man/epi_slide_opt.Rd b/man/epi_slide_opt.Rd index 629134d5..dcaab3f8 100644 --- a/man/epi_slide_opt.Rd +++ b/man/epi_slide_opt.Rd @@ -2,120 +2,91 @@ % Please edit documentation in R/slide.R \name{epi_slide_opt} \alias{epi_slide_opt} -\title{Optimized slide function for performing common rolling computations on an \code{epi_df} object} +\title{Optimized slide function for performing common rolling computations on an +\code{epi_df} object} \usage{ epi_slide_opt( - x, - col_names, - f, + .x, + .col_names, + .f, ..., - before = NULL, - after = NULL, - ref_time_values = NULL, - new_col_name = NULL, - as_list_col = NULL, - names_sep = NULL, - all_rows = FALSE + .window_size = 1, + .align = c("right", "center", "left"), + .ref_time_values = NULL, + .all_rows = FALSE ) } \arguments{ -\item{x}{The \code{epi_df} object under consideration, \link[dplyr:group_by]{grouped} -or ungrouped. If ungrouped, all data in \code{x} will be treated as part of a +\item{.x}{The \code{epi_df} object under consideration, \link[dplyr:group_by]{grouped} +or ungrouped. If ungrouped, all data in \code{.x} will be treated as part of a single data group.} -\item{col_names}{<\code{\link[=dplyr_tidy_select]{tidy-select}}> An unquoted column -name(e.g., \code{cases}), multiple column names (e.g., \code{c(cases, deaths)}), or -\link[tidyselect:language]{other tidy-select expression}. Variable names can -be used as if they were positions in the data frame, so expressions like -\code{x:y} can be used to select a range of variables. If you have the desired -column names stored in a vector \code{vars}, use \code{col_names = all_of(vars)}. +\item{.col_names}{<\code{\link[=dplyr_tidy_select]{tidy-select}}> An unquoted column +name(e.g., \code{cases}), multiple column names (e.g., \code{c(cases, deaths)}), +\link[tidyselect:language]{other tidy-select expression}, or a vector of +characters (e.g. \code{c("cases", "deaths")}). Variable names can be used as if +they were positions in the data frame, so expressions like \code{x:y} can be +used to select a range of variables. The tidy-selection renaming interface is not supported, and cannot be used to provide output column names; if you want to customize the output column names, use \code{\link[dplyr:rename]{dplyr::rename}} after the slide.} -\item{f}{Function; together with \code{...} specifies the computation to slide. -\code{f} must be one of \code{data.table}'s rolling functions +\item{.f}{Function; together with \code{...} specifies the computation to slide. +\code{.f} must be one of \code{data.table}'s rolling functions (\code{frollmean}, \code{frollsum}, \code{frollapply}. See \link[data.table:froll]{data.table::roll}) or one of \code{slider}'s specialized sliding functions (\code{slide_mean}, \code{slide_sum}, -etc. See \link[slider:summary-slide]{slider::summary-slide}). To "slide" means to apply a -computation within a sliding (a.k.a. "rolling") time window for each data -group. The window is determined by the \code{before} and \code{after} parameters -described below. One time step is typically one day or one week; see -details for more explanation. +etc. See \link[slider:summary-slide]{slider::summary-slide}). 
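For instance, the two families of \code{.f} are interchangeable here (a
sketch mirroring the \examples later in this file; \code{jhu_csse_daily_subset}
is the package's example dataset, and the argument names are the renamed ones
documented in this diff):

library(dplyr)
library(epiprocess)

# data.table-backed 7-day trailing mean
jhu_csse_daily_subset %>%
  group_by(geo_value) %>%
  epi_slide_opt(cases, .f = data.table::frollmean, .window_size = 7)

# slider-backed version of the same computation
jhu_csse_daily_subset %>%
  group_by(geo_value) %>%
  epi_slide_opt(cases, .f = slider::slide_mean, .window_size = 7)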
The optimized \code{data.table} and \code{slider} functions can't be directly passed -as the computation function in \code{epi_slide} without careful handling to -make sure each computation group is made up of the \code{n} dates rather than -\code{n} points. \code{epi_slide_opt} (and wrapper functions \code{epi_slide_mean} and -\code{epi_slide_sum}) take care of window completion automatically to prevent -associated errors.} - -\item{...}{Additional arguments to pass to the slide computation \code{f}, for -example, \code{na.rm} and \code{algo} if \code{f} is a \code{data.table} function. If \code{f} is -a \code{data.table} function, it is automatically passed the data \code{x} to -operate on, the window size \code{n}, and the alignment \code{align}. Providing -these args via \code{...} will cause an error. If \code{f} is a \code{slider} function, -it is automatically passed the data \code{x} to operate on, and number of -points \code{before} and \code{after} to use in the computation.} - -\item{before, after}{How far \code{before} and \code{after} each \code{ref_time_value} should -the sliding window extend? At least one of these two arguments must be -provided; the other's default will be 0. The accepted values for these -depend on the type of the \code{time_value} column: +as the computation function in \code{epi_slide} without careful handling to make +sure each computation group is made up of the \code{.window_size} dates rather +than \code{.window_size} points. \code{epi_slide_opt} (and wrapper functions +\code{epi_slide_mean} and \code{epi_slide_sum}) take care of window completion +automatically to prevent associated errors.} + +\item{...}{Additional arguments to pass to the slide computation \code{.f}, for +example, \code{algo} or \code{na.rm} in data.table functions. You don't need to +specify \code{.x}, \code{.window_size}, or \code{.align} (or \code{before}/\code{after} for slider +functions).} + +\item{.window_size}{The size of the sliding window. By default, this is 1, +meaning that only the current ref_time_value is included. The accepted values +here depend on the \code{time_value} column: \itemize{ -\item if it is a Date and the cadence is daily, then they can be integers -(which will be interpreted in units of days) or difftimes with units -"days" -\item if it is a Date and the cadence is weekly, then they must be difftimes -with units "weeks" -\item if it is an integer, then they must be integers -} - -Endpoints of the window are inclusive. Common settings: -\itemize{ -\item For trailing/right-aligned windows from \code{ref_time_value - k} to -\code{ref_time_value}: either pass \code{before=k} by itself, or pass \verb{before=k, after=0}. -\item For center-aligned windows from \code{ref_time_value - k} to -\code{ref_time_value + k}: pass \verb{before=k, after=k}. -\item For leading/left-aligned windows from \code{ref_time_value} to -\code{ref_time_value + k}: either pass pass \code{after=k} by itself, -or pass \verb{before=0, after=k}. 
-}
-
-See "Details:" on how missing rows are handled within the window.}
-
-\item{ref_time_values}{Time values for sliding computations, meaning, each
+\item if time_type is Date and the cadence is daily, then \code{.window_size} can be
+an integer (which will be interpreted in units of days) or a difftime
+with units "days"
+\item if time_type is Date and the cadence is weekly, then \code{.window_size} must
+be a difftime with units "weeks"
+\item if time_type is an integer, then \code{.window_size} must be an integer
+}}
+
+\item{.align}{The alignment of the sliding window. If \code{right} (default), then
+the window has its end at the reference time; if \code{center}, then the window is
+centered at the reference time; if \code{left}, then the window has its start at
+the reference time. If the alignment is \code{center} and the window size is odd,
+then the window will have floor(window_size/2) points before and after the
+reference time. If the window size is even, then the window will be
+asymmetric and have one less value on the right side of the reference time
+(assuming time increases from left to right).}
+
+\item{.ref_time_values}{Time values for sliding computations, meaning, each
element of this vector serves as the reference time point for one sliding
window. If missing, then this will be set to all unique time values in the
underlying data table, by default.}
-\item{new_col_name}{Character vector indicating the name(s) of the new
-column(s) that will contain the derivative values. Default
-is "slide_value"; note that setting \code{new_col_name} equal to any existing
-column names will overwrite those columns. If \code{names_sep} is \code{NULL},
-\code{new_col_name} must be the same length as \code{col_names}.}
-
-\item{as_list_col}{Not supported. Included to match \code{epi_slide} interface.}
-
-\item{names_sep}{String specifying the separator to use in \code{tidyr::unnest()}
-when \code{as_list_col = FALSE}. Default is "_". Using \code{NULL} drops the prefix
-from \code{new_col_name} entirely.}
-
-\item{all_rows}{If \code{all_rows = TRUE}, then all rows of \code{x} will be kept in
-the output even with \code{ref_time_values} provided, with some type of missing
+\item{.all_rows}{If \code{.all_rows = TRUE}, then all rows of \code{.x} will be kept in
+the output even with \code{.ref_time_values} provided, with some type of missing
value marker for the slide computation output column(s) for \code{time_value}s
-outside \code{ref_time_values}; otherwise, there will be one row for each row in
-\code{x} that had a \code{time_value} in \code{ref_time_values}. Default is \code{FALSE}. The
+outside \code{.ref_time_values}; otherwise, there will be one row for each row in
+\code{.x} that had a \code{time_value} in \code{.ref_time_values}. Default is \code{FALSE}. The
missing value marker is the result of \code{vctrs::vec_cast}ing \code{NA} to the type
-of the slide computation output. If using \code{as_list_col = TRUE}, note that
-the missing marker is a \code{NULL} entry in the list column; for certain
-operations, you might want to replace these \code{NULL} entries with a different
-\code{NA} marker.}
+of the slide computation output.}
}
\value{
-An \code{epi_df} object given by appending one or more new columns to
-\code{x}, named according to the \code{new_col_name} argument.
+An \code{epi_df} object given by appending one or more new columns to \code{.x},
+named \code{slide_value_*} based on the columns selected by \code{.col_names}.
}
\description{
Slides an n-timestep \link[data.table:froll]{data.table::froll} or \link[slider:summary-slide]{slider::summary-slide} function
@@ -124,20 +95,39 @@ over variables in an \code{epi_df} object. See the
for examples.
}
\details{
-To "slide" means to apply a function over a rolling window of time
-steps for each data group, where the window is centered at a reference time
-and left and right endpoints are given by the \code{before} and \code{after}
-arguments.
-If there are not enough time steps available to complete the window at any
-given reference time, then \verb{epi_slide_*()} will fail; it requires a
-complete window to perform the computation. For a centrally-aligned slide
-of \code{n} \code{time_value}s in a sliding window, set \code{before = (n-1)/2} and \code{after = (n-1)/2} when the number of \code{time_value}s in a sliding window is odd and
-\code{before = n/2-1} and \code{after = n/2} when \code{n} is even.
-
-Sometimes, we want to experiment with various trailing or leading window
-widths and compare the slide outputs. In the (uncommon) case where
-zero-width windows are considered, manually pass both the \code{before} and
-\code{after} arguments.
+To "slide" means to apply a function or formula over a rolling
+window. The \code{.window_size} arg determines the width of the window
+(including the reference time) and the \code{.align} arg governs how the window
+is aligned (see below for examples). The \code{.ref_time_values} arg controls
+which time values to consider for the slide and \code{.all_rows} allows you to
+keep NAs around.
+
+\verb{epi_slide_*()} does not require a complete window (such as on the left
+boundary of the dataset) and will attempt to perform the computation
+anyway. The issue of what to do with partial computations (those run on
+incomplete windows) is therefore left up to the user, either through the
+specified function or formula \code{.f}, or through post-processing.
+
+Let's look at some window examples, assuming that the reference time value
+is "tv". With .align = "right" and .window_size = 3, the window will be:
+
+time_values: tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3
+window: tv - 2, tv - 1, tv
+
+With .align = "center" and .window_size = 3, the window will be:
+
+time_values: tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3
+window: tv - 1, tv, tv + 1
+
+With .align = "center" and .window_size = 4, the window will be:
+
+time_values: tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3
+window: tv - 2, tv - 1, tv, tv + 1
+
+With .align = "left" and .window_size = 3, the window will be:
+
+time_values: tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3
+window: tv, tv + 1, tv + 2
}
\examples{
# slide a 7-day trailing average formula on cases. This can also be done with `epi_slide_mean`
jhu_csse_daily_subset \%>\%
group_by(geo_value) \%>\%
epi_slide_opt(
cases,
- f = data.table::frollmean, before = 6
+ .f = data.table::frollmean, .window_size = 7
) \%>\%
# Remove a nonessential var.
to ensure new col is printed, and rename new col
dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) \%>\%
@@ -157,9 +147,9 @@ jhu_csse_daily_subset \%>\%
group_by(geo_value) \%>\%
epi_slide_opt(
cases,
- f = data.table::frollmean, before = 6,
+ .f = data.table::frollmean, .window_size = 7,
# `frollmean` options
- na.rm = TRUE, algo = "exact", hasNA = TRUE
+ algo = "exact", hasNA = TRUE, na.rm = TRUE
) \%>\%
dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) \%>\%
ungroup()
@@ -169,18 +159,18 @@ jhu_csse_daily_subset \%>\%
group_by(geo_value) \%>\%
epi_slide_opt(
cases,
- f = slider::slide_mean, after = 6
+ .f = slider::slide_mean, .window_size = 7, .align = "left"
) \%>\%
# Remove a nonessential var. to ensure new col is printed
dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) \%>\%
ungroup()
-# slide a 7-day centre-aligned sum. This can also be done with `epi_slide_sum`
+# slide a 7-day center-aligned sum. This can also be done with `epi_slide_sum`
jhu_csse_daily_subset \%>\%
group_by(geo_value) \%>\%
epi_slide_opt(
cases,
- f = data.table::frollsum, before = 3, after = 3
+ .f = data.table::frollsum, .window_size = 7, .align = "center"
) \%>\%
# Remove a nonessential var. to ensure new col is printed
dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) \%>\%
diff --git a/man/epi_slide_sum.Rd b/man/epi_slide_sum.Rd
index 7bf92e23..0c83c432 100644
--- a/man/epi_slide_sum.Rd
+++ b/man/epi_slide_sum.Rd
@@ -5,124 +5,120 @@
\title{Optimized slide function for performing rolling sums on an \code{epi_df} object}
\usage{
epi_slide_sum(
- x,
- col_names,
+ .x,
+ .col_names,
...,
- before = NULL,
- after = NULL,
- ref_time_values = NULL,
- new_col_name = NULL,
- as_list_col = NULL,
- names_sep = NULL,
- all_rows = FALSE
+ .window_size = 1,
+ .align = c("right", "center", "left"),
+ .ref_time_values = NULL,
+ .all_rows = FALSE
)
}
\arguments{
-\item{x}{The \code{epi_df} object under consideration, \link[dplyr:group_by]{grouped}
-or ungrouped. If ungrouped, all data in \code{x} will be treated as part of a
+\item{.x}{The \code{epi_df} object under consideration, \link[dplyr:group_by]{grouped}
+or ungrouped. If ungrouped, all data in \code{.x} will be treated as part of a
single data group.}
-\item{col_names}{<\code{\link[=dplyr_tidy_select]{tidy-select}}> An unquoted column
-name(e.g., \code{cases}), multiple column names (e.g., \code{c(cases, deaths)}), or
-\link[tidyselect:language]{other tidy-select expression}. Variable names can
-be used as if they were positions in the data frame, so expressions like
-\code{x:y} can be used to select a range of variables. If you have the desired
-column names stored in a vector \code{vars}, use \code{col_names = all_of(vars)}.
+\item{.col_names}{<\code{\link[=dplyr_tidy_select]{tidy-select}}> An unquoted column
+name(e.g., \code{cases}), multiple column names (e.g., \code{c(cases, deaths)}),
+\link[tidyselect:language]{other tidy-select expression}, or a vector of
+characters (e.g. \code{c("cases", "deaths")}). Variable names can be used as if
+they were positions in the data frame, so expressions like \code{x:y} can be
+used to select a range of variables.
The tidy-selection renaming interface is not supported, and cannot be used to provide output column names; if you want to customize the output column names, use \code{\link[dplyr:rename]{dplyr::rename}} after the slide.} -\item{...}{Additional arguments to pass to \code{data.table::frollsum}, for -example, \code{na.rm} and \code{algo}. \code{data.table::frollsum} is automatically -passed the data \code{x} to operate on, the window size \code{n}, and the alignment -\code{align}. Providing these args via \code{...} will cause an error.} +\item{...}{Additional arguments to pass to the slide computation \code{.f}, for +example, \code{algo} or \code{na.rm} in data.table functions. You don't need to +specify \code{.x}, \code{.window_size}, or \code{.align} (or \code{before}/\code{after} for slider +functions).} -\item{before, after}{How far \code{before} and \code{after} each \code{ref_time_value} should -the sliding window extend? At least one of these two arguments must be -provided; the other's default will be 0. The accepted values for these -depend on the type of the \code{time_value} column: +\item{.window_size}{The size of the sliding window. By default, this is 1, +meaning that only the current ref_time_value is included. The accepted values +here depend on the \code{time_value} column: \itemize{ -\item if it is a Date and the cadence is daily, then they can be integers -(which will be interpreted in units of days) or difftimes with units -"days" -\item if it is a Date and the cadence is weekly, then they must be difftimes -with units "weeks" -\item if it is an integer, then they must be integers -} +\item if time_type is Date and the cadence is daily, then \code{.window_size} can be +an integer (which will be interpreted in units of days) or a difftime +with units "days" +\item if time_type is Date and the cadence is weekly, then \code{.window_size} must +be a difftime with units "weeks" +\item if time_type is an integer, then \code{.window_size} must be an integer +}} -Endpoints of the window are inclusive. Common settings: -\itemize{ -\item For trailing/right-aligned windows from \code{ref_time_value - k} to -\code{ref_time_value}: either pass \code{before=k} by itself, or pass \verb{before=k, after=0}. -\item For center-aligned windows from \code{ref_time_value - k} to -\code{ref_time_value + k}: pass \verb{before=k, after=k}. -\item For leading/left-aligned windows from \code{ref_time_value} to -\code{ref_time_value + k}: either pass pass \code{after=k} by itself, -or pass \verb{before=0, after=k}. -} +\item{.align}{The alignment of the sliding window. If \code{right} (default), then +the window has its end at the reference time; if \code{center}, then the window is +centered at the reference time; if \code{left}, then the window has its start at +the reference time. If the alignment is \code{center} and the window size is odd, +then the window will have floor(window_size/2) points before and after the +reference time. If the window size is even, then the window will be +asymmetric and have one less value on the right side of the reference time +(assuming time increases from left to right).} -See "Details:" on how missing rows are handled within the window.} - -\item{ref_time_values}{Time values for sliding computations, meaning, each +\item{.ref_time_values}{Time values for sliding computations, meaning, each element of this vector serves as the reference time point for one sliding window. 
If missing, then this will be set to all unique time values in the
underlying data table, by default.}
-\item{new_col_name}{Character vector indicating the name(s) of the new
-column(s) that will contain the derivative values. Default
-is "slide_value"; note that setting \code{new_col_name} equal to any existing
-column names will overwrite those columns. If \code{names_sep} is \code{NULL},
-\code{new_col_name} must be the same length as \code{col_names}.}
-
-\item{as_list_col}{Not supported. Included to match \code{epi_slide} interface.}
-
-\item{names_sep}{String specifying the separator to use in \code{tidyr::unnest()}
-when \code{as_list_col = FALSE}. Default is "_". Using \code{NULL} drops the prefix
-from \code{new_col_name} entirely.}
-
-\item{all_rows}{If \code{all_rows = TRUE}, then all rows of \code{x} will be kept in
-the output even with \code{ref_time_values} provided, with some type of missing
+\item{.all_rows}{If \code{.all_rows = TRUE}, then all rows of \code{.x} will be kept in
+the output even with \code{.ref_time_values} provided, with some type of missing
value marker for the slide computation output column(s) for \code{time_value}s
-outside \code{ref_time_values}; otherwise, there will be one row for each row in
-\code{x} that had a \code{time_value} in \code{ref_time_values}. Default is \code{FALSE}. The
+outside \code{.ref_time_values}; otherwise, there will be one row for each row in
+\code{.x} that had a \code{time_value} in \code{.ref_time_values}. Default is \code{FALSE}. The
missing value marker is the result of \code{vctrs::vec_cast}ing \code{NA} to the type
-of the slide computation output. If using \code{as_list_col = TRUE}, note that
-the missing marker is a \code{NULL} entry in the list column; for certain
-operations, you might want to replace these \code{NULL} entries with a different
-\code{NA} marker.}
+of the slide computation output.}
}
\value{
-An \code{epi_df} object given by appending one or more new columns to
-\code{x}, named according to the \code{new_col_name} argument.
+An \code{epi_df} object given by appending one or more new columns to \code{.x},
+named \code{slide_value_*} based on the columns selected by \code{.col_names}.
}
\description{
Slides an n-timestep sum over variables in an \code{epi_df} object. See the
\href{https://cmu-delphi.github.io/epiprocess/articles/slide.html}{slide vignette}
for examples.
}
\details{
-Wrapper around \code{epi_slide_opt} with \code{f = datatable::frollsum}.
-
-To "slide" means to apply a function over a rolling window of time
-steps for each data group, where the window is centered at a reference time
-and left and right endpoints are given by the \code{before} and \code{after}
-arguments.
-If there are not enough time steps available to complete the window at any
-given reference time, then \verb{epi_slide_*()} will fail; it requires a
-complete window to perform the computation. For a centrally-aligned slide
-of \code{n} \code{time_value}s in a sliding window, set \code{before = (n-1)/2} and \code{after = (n-1)/2} when the number of \code{time_value}s in a sliding window is odd and
-\code{before = n/2-1} and \code{after = n/2} when \code{n} is even.
-
-Sometimes, we want to experiment with various trailing or leading window
-widths and compare the slide outputs. In the (uncommon) case where
-zero-width windows are considered, manually pass both the \code{before} and
-\code{after} arguments.
+Wrapper around \code{epi_slide_opt} with \code{.f = data.table::frollsum}.
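Concretely, the wrapper relationship means these two pipelines should produce
the same result (a sketch assembled from the examples in this file;
\code{jhu_csse_daily_subset} is the package's example dataset):

library(dplyr)
library(epiprocess)

# 7-day trailing sums via the wrapper...
jhu_csse_daily_subset %>%
  group_by(geo_value) %>%
  epi_slide_sum(cases, .window_size = 7)

# ...and via epi_slide_opt directly
jhu_csse_daily_subset %>%
  group_by(geo_value) %>%
  epi_slide_opt(cases, .f = data.table::frollsum, .window_size = 7)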
+ +To "slide" means to apply a function or formula over a rolling +window. The \code{.window_size} arg determines the width of the window +(including the reference time) and the \code{.align} arg governs how the window +is aligned (see below for examples). The \code{.ref_time_values} arg controls +which time values to consider for the slide and \code{.all_rows} allows you to +keep NAs around. + +\verb{epi_slide_*()} does not require a complete window (such as on the left +boundary of the dataset) and will attempt to perform the computation +anyway. The issue of what to do with partial computations (those run on +incomplete windows) is therefore left up to the user, either through the +specified function or formula \code{f}, or through post-processing. + +Let's look at some window examples, assuming that the reference time value +is "tv". With .align = "right" and .window_size = 3, the window will be: + +time_values: tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3 +window: tv - 2, tv - 1, tv + +With .align = "center" and .window_size = 3, the window will be: + +time_values: tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3 +window: tv - 1, tv, tv + 1 + +With .align = "center" and .window_size = 4, the window will be: + +time_values: tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3 +window: tv - 2, tv - 1, tv, tv + 1 + +With .align = "left" and .window_size = 3, the window will be: + +time_values: ttv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3 +window: tv, tv + 1, tv + 2 } \examples{ # slide a 7-day trailing sum formula on cases jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide_sum(cases, before = 6) \%>\% + epi_slide_sum(cases, .window_size = 7) \%>\% # Remove a nonessential var. to ensure new col is printed dplyr::select(geo_value, time_value, cases, cases_7dsum = slide_value_cases) \%>\% ungroup() diff --git a/man/epix_as_of.Rd b/man/epix_as_of.Rd index 4ab23882..c3682489 100644 --- a/man/epix_as_of.Rd +++ b/man/epix_as_of.Rd @@ -4,15 +4,21 @@ \alias{epix_as_of} \title{Generate a snapshot from an \code{epi_archive} object} \usage{ -epix_as_of(x, max_version, min_time_value = -Inf, all_versions = FALSE) +epix_as_of( + x, + version, + min_time_value = -Inf, + all_versions = FALSE, + max_version = deprecated() +) } \arguments{ \item{x}{An \code{epi_archive} object} -\item{max_version}{Time value specifying the max version to permit in the +\item{version}{Time value specifying the max version to permit in the snapshot. That is, the snapshot will comprise the unique rows of the current archive data that represent the most up-to-date signal values, as -of the specified \code{max_version} (and whose time values are at least +of the specified \code{version} (and whose time values are at least \code{min_time_value}.)} \item{min_time_value}{Time value specifying the min time value to permit in @@ -21,10 +27,13 @@ minimum considered.} \item{all_versions}{If \code{all_versions = TRUE}, then the output will be in \code{epi_archive} format, and contain rows in the specified \code{time_value} range -having \code{version <= max_version}. The resulting object will cover a +having \code{version <= version}. The resulting object will cover a potentially narrower \code{version} and \code{time_value} range than \code{x}, depending on user-provided arguments. Otherwise, there will be one row in the output -for the \code{max_version} of each \code{time_value}. Default is \code{FALSE}.} +for the \code{version} of each \code{time_value}. 
Default is \code{FALSE}.} + +\item{max_version}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} please use \code{version} +argument instead.} } \value{ An \code{epi_df} object. @@ -37,7 +46,7 @@ examples. \examples{ epix_as_of( archive_cases_dv_subset, - max_version = max(archive_cases_dv_subset$DT$version) + version = max(archive_cases_dv_subset$DT$version) ) range(archive_cases_dv_subset$DT$version) # 2020-06-02 -- 2021-12-01 diff --git a/man/epix_merge.Rd b/man/epix_merge.Rd index ea0d2444..564a1fdc 100644 --- a/man/epix_merge.Rd +++ b/man/epix_merge.Rd @@ -46,19 +46,77 @@ clobberable versions). If the \code{versions_end} values differ, the \code{sync} parameter controls what is done. } \details{ -In all cases, \code{additional_metadata} will be an empty list, and -\code{clobberable_versions_start} will be set to the earliest version that could -be clobbered in either input archive. +In all cases, \code{clobberable_versions_start} will be set to the +earliest version that could be clobbered in either input archive. } \examples{ -# create two example epi_archive datasets -x <- archive_cases_dv_subset$DT \%>\% - dplyr::select(geo_value, time_value, version, case_rate_7d_av) \%>\% - as_epi_archive(compactify = TRUE) -y <- archive_cases_dv_subset$DT \%>\% - dplyr::select(geo_value, time_value, version, percent_cli) \%>\% - as_epi_archive(compactify = TRUE) -# merge results stored in a third object: -xy <- epix_merge(x, y) +# Example 1 +# The s1 signal at August 1st gets revised from 10 to 11 on August 2nd +s1 <- tibble::tibble( + geo_value = c("ca", "ca", "ca"), + time_value = as.Date(c("2024-08-01", "2024-08-01", "2024-08-02")), + version = as.Date(c("2024-08-01", "2024-08-02", "2024-08-02")), + signal1 = c(10, 11, 7) +) + +s2 <- tibble::tibble( + geo_value = c("ca", "ca"), + time_value = as.Date(c("2024-08-01", "2024-08-02")), + version = as.Date(c("2024-08-03", "2024-08-03")), + signal2 = c(2, 3) +) + + +s1 <- s1 \%>\% as_epi_archive() +s2 <- s2 \%>\% as_epi_archive() + +merged <- epix_merge(s1, s2, sync = "locf") +merged[["DT"]] + +# Example 2 +# The s1 signal at August 1st gets revised from 12 to 13 on August 3rd +s1 <- tibble::tibble( + geo_value = c("ca", "ca", "ca", "ca"), + time_value = as.Date(c("2024-08-01", "2024-08-01", "2024-08-02", "2024-08-03")), + version = as.Date(c("2024-08-01", "2024-08-03", "2024-08-03", "2024-08-03")), + signal1 = c(12, 13, 22, 19) +) + +s2 <- tibble::tibble( + geo_value = c("ca", "ca"), + time_value = as.Date(c("2024-08-01", "2024-08-02")), + version = as.Date(c("2024-08-02", "2024-08-02")), + signal2 = c(4, 5), +) + + +s1 <- s1 \%>\% as_epi_archive() +s2 <- s2 \%>\% as_epi_archive() + +merged <- epix_merge(s1, s2, sync = "locf") +merged[["DT"]] + + +# Example 3: +s1 <- tibble::tibble( + geo_value = c("ca", "ca", "ca"), + time_value = as.Date(c("2024-08-01", "2024-08-02", "2024-08-03")), + version = as.Date(c("2024-08-01", "2024-08-02", "2024-08-03")), + signal1 = c(14, 11, 9) +) + +# The s2 signal at August 1st gets revised from 3 to 5 on August 3rd +s2 <- tibble::tibble( + geo_value = c("ca", "ca", "ca"), + time_value = as.Date(c("2024-08-01", "2024-08-01", "2024-08-02")), + version = as.Date(c("2024-08-02", "2024-08-03", "2024-08-03")), + signal2 = c(3, 5, 2), +) + +s1 <- s1 \%>\% as_epi_archive() +s2 <- s2 \%>\% as_epi_archive() +# Some LOCF for signal 1 as signal 2 gets updated +merged <- epix_merge(s1, s2, sync = "locf") 
+merged[["DT"]] } diff --git a/man/epix_slide.Rd b/man/epix_slide.Rd index 2789cb01..1326cc18 100644 --- a/man/epix_slide.Rd +++ b/man/epix_slide.Rd @@ -7,86 +7,78 @@ \title{Slide a function over variables in an \code{epi_archive} or \code{grouped_epi_archive}} \usage{ epix_slide( - x, - f, + .x, + .f, ..., - before = Inf, - ref_time_values = NULL, - new_col_name = "slide_value", - as_list_col = FALSE, - names_sep = "_", - all_versions = FALSE + .before = Inf, + .versions = NULL, + .new_col_name = NULL, + .all_versions = FALSE ) \method{epix_slide}{epi_archive}( - x, - f, + .x, + .f, ..., - before = Inf, - ref_time_values = NULL, - new_col_name = "slide_value", - as_list_col = FALSE, - names_sep = "_", - all_versions = FALSE + .before = Inf, + .versions = NULL, + .new_col_name = NULL, + .all_versions = FALSE ) \method{epix_slide}{grouped_epi_archive}( - x, - f, + .x, + .f, ..., - before = Inf, - ref_time_values = NULL, - new_col_name = "slide_value", - as_list_col = FALSE, - names_sep = "_", - all_versions = FALSE + .before = Inf, + .versions = NULL, + .new_col_name = NULL, + .all_versions = FALSE ) } \arguments{ -\item{x}{An \code{\link{epi_archive}} or \code{\link{grouped_epi_archive}} object. If ungrouped, +\item{.x}{An \code{\link{epi_archive}} or \code{\link{grouped_epi_archive}} object. If ungrouped, all data in \code{x} will be treated as part of a single data group.} -\item{f}{Function, formula, or missing; together with \code{...} specifies the +\item{.f}{Function, formula, or missing; together with \code{...} specifies the computation to slide. To "slide" means to apply a computation over a sliding (a.k.a. "rolling") time window for each data group. The window is -determined by the \code{before} parameter described below. One time step is -typically one day or one week; see \code{\link{epi_slide}} details for more -explanation. If a function, \code{f} must take an \code{epi_df} with the same -column names as the archive's \code{DT}, minus the \code{version} column; followed -by a one-row tibble containing the values of the grouping variables for -the associated group; followed by a reference time value, usually as a -\code{Date} object; followed by any number of named arguments. If a formula, -\code{f} can operate directly on columns accessed via \code{.x$var} or \code{.$var}, as -in \code{~ mean (.x$var)} to compute a mean of a column \code{var} for each -group-\code{ref_time_value} combination. The group key can be accessed via -\code{.y} or \code{.group_key}, and the reference time value can be accessed via -\code{.z} or \code{.ref_time_value}. If \code{f} is missing, then \code{...} will specify the +determined by the \code{.before} parameter (see details for more). If a +function, \code{.f} must have the form \verb{function(x, g, t, ...)}, where +\itemize{ +\item "x" is an epi_df with the same column names as the archive's \code{DT}, minus +the \code{version} column +\item "g" is a one-row tibble containing the values of the grouping variables +for the associated group +\item "t" is the ref_time_value for the current window +\item "..." are additional arguments +} + +If a formula, \code{.f} can operate directly on columns accessed via \code{.x$var} or +\code{.$var}, as in \code{~ mean (.x$var)} to compute a mean of a column \code{var} for +each group-\code{ref_time_value} combination. The group key can be accessed via +\code{.y} or \code{.group_key}, and the reference time value can be accessed via \code{.z} +or \code{.ref_time_value}. 
If \code{.f} is missing, then \code{...} will specify the computation.}
\item{...}{Additional arguments to pass to the function or formula specified
-via \code{f}. Alternatively, if \code{f} is missing, then \code{...} is interpreted as an
-expression for tidy evaluation; in addition to referring to columns
-directly by name, the expression has access to \code{.data} and \code{.env} pronouns
-as in \code{dplyr} verbs, and can also refer to the \code{.group_key} and
-\code{.ref_time_value}. See details of \code{\link{epi_slide}}.}
+via \code{.f}. Alternatively, if \code{.f} is missing, then the \code{...} is interpreted
+as a \link[rlang:args_data_masking]{"data-masking"} expression or expressions
+for tidy evaluation; in addition to referring to columns directly by name, the
+expressions have access to \code{.data} and \code{.env} pronouns as in \code{dplyr} verbs,
+and can also refer to \code{.x} (not the same as the input epi_archive),
+\code{.group_key}, and \code{.ref_time_value}. See details for more.}
-\item{before}{How far \code{before} each \code{ref_time_value} should the sliding
-window extend? If provided, should be a single, non-NA,
-\link[vctrs:vec_cast]{integer-compatible} number of time steps. This window
-endpoint is inclusive. For example, if \code{before = 7}, and one time step is
-one day, then to produce a value for a \code{ref_time_value} of January 8, we
-apply the given function or formula to data (for each group present) with
-\code{time_value}s from January 1 onward, as they were reported on January 8.
-For typical disease surveillance sources, this will not include any data
-with a \code{time_value} of January 8, and, depending on the amount of reporting
-latency, may not include January 7 or even earlier \code{time_value}s. (If
-instead the archive were to hold nowcasts instead of regular surveillance
-data, then we would indeed expect data for \code{time_value} January 8. If it
-were to hold forecasts, then we would expect data for \code{time_value}s after
-January 8, and the sliding window would extend as far after each
-\code{ref_time_value} as needed to include all such \code{time_value}s.)}
+\item{.before}{How many time values before the \code{.ref_time_value}
+should each snapshot handed to the function \code{.f} contain? If provided, it
+should be a single value that is compatible with the time_type of the
+time_value column (more below), but most commonly an integer. This window
+endpoint is inclusive. For example, if \code{.before = 7}, \code{time_type}
+in the archive is "day", and the \code{.ref_time_value} is January 8, then the
+smallest time_value in the snapshot will be January 1. If missing, then the
+default is no limit on the time values, so the full snapshot is given.}
-\item{ref_time_values}{Reference time values / versions for sliding
+\item{.versions}{Reference time values / versions for sliding
computations; each element of this vector serves both as the anchor point
for the \code{time_value} window for the computation and the \code{max_version}
\code{epix_as_of} which we fetch data in this window. If missing, then this will
@@ -94,32 +86,27 @@ set to a regularly-spaced sequence of values set to cover the range of
\code{version}s in the \code{DT} plus the \code{versions_end}; the spacing of values will
be guessed (using the GCD of the skips between values).}
-\item{new_col_name}{String indicating the name of the new column that will
-contain the derivative values.
Default is "slide_value"; note that setting -\code{new_col_name} equal to an existing column name will overwrite this column.} - -\item{as_list_col}{Should the slide results be held in a list column, or be -\link[tidyr:chop]{unchopped}/\link[tidyr:unnest]{unnested}? Default is \code{FALSE}, -in which case a list object returned by \code{f} would be unnested (using -\code{\link[tidyr:unnest]{tidyr::unnest()}}), and, if the slide computations output data frames, -the names of the resulting columns are given by prepending \code{new_col_name} -to the names of the list elements.} - -\item{names_sep}{String specifying the separator to use in \code{tidyr::unnest()} -when \code{as_list_col = FALSE}. Default is "_". Using \code{NULL} drops the prefix -from \code{new_col_name} entirely.} +\item{.new_col_name}{Either \code{NULL} or a string indicating the name of the new +column that will contain the derived values. The default, \code{NULL}, will use +the name "slide_value" unless your slide computations output data frames, +in which case they will be unpacked into the constituent columns and those +names used. If the resulting column name(s) overlap with the column names +used for labeling the computations, which are \code{group_vars(x)} and +\code{"version"}, then the values for these columns must be identical to the +labels we assign.} -\item{all_versions}{(Not the same as \code{all_rows} parameter of \code{epi_slide}.) If -\code{all_versions = TRUE}, then \code{f} will be passed the version history (all -\code{version <= ref_time_value}) for rows having \code{time_value} between -\code{ref_time_value - before} and \code{ref_time_value}. Otherwise, \code{f} will be -passed only the most recent \code{version} for every unique \code{time_value}. -Default is \code{FALSE}.} +\item{.all_versions}{(Not the same as \code{.all_rows} parameter of \code{epi_slide}.) +If \code{.all_versions = TRUE}, then the slide computation will be passed the +version history (all \code{version <= .version} where \code{.version} is one of the +requested \code{.versions}) for rows having a \code{time_value} of at least `.version +\itemize{ +\item before\verb{. Otherwise, the slide computation will be passed only the most recent }version\verb{for every unique}time_value\verb{. Default is }FALSE`. +}} } \value{ A tibble whose columns are: the grouping variables, \code{time_value}, containing the reference time values for the slide computation, and a -column named according to the \code{new_col_name} argument, containing the slide +column named according to the \code{.new_col_name} argument, containing the slide values. } \description{ @@ -133,26 +120,18 @@ examples. \details{ A few key distinctions between the current function and \code{epi_slide()}: \enumerate{ -\item In \code{f} functions for \code{epix_slide}, one should not assume that the input +\item In \code{.f} functions for \code{epix_slide}, one should not assume that the input data to contain any rows with \code{time_value} matching the computation's -\code{ref_time_value} (accessible via \verb{attributes()$metadata$as_of}); for +\code{.ref_time_value} (accessible via \verb{attributes()$metadata$as_of}); for typical epidemiological surveillance data, observations pertaining to a particular time period (\code{time_value}) are first reported \code{as_of} some instant after that time period has ended. 
-\item \code{epix_slide()} doesn't accept an \code{after} argument; its windows extend
-from \code{before} time steps before a given \code{ref_time_value} through the last
-\code{time_value} available as of version \code{ref_time_value} (typically, this
-won't include \code{ref_time_value} itself, as observations about a particular
-time interval (e.g., day) are only published after that time interval
-ends); \code{epi_slide} windows extend from \code{before} time steps before a
-\code{ref_time_value} through \code{after} time steps after \code{ref_time_value}.
\item The input class and columns are similar but different: \code{epix_slide}
-(with the default \code{all_versions=FALSE}) keeps all columns and the
+(with the default \code{.all_versions=FALSE}) keeps all columns and the
 \code{epi_df}-ness of the first argument to each computation; \code{epi_slide} only
 provides the grouping variables in the second input, and will convert the
 first input into a regular tibble if the grouping variables include the
-essential \code{geo_value} column. (With \code{all_versions=TRUE}, \code{epix_slide} will
-will provide an \code{epi_archive} rather than an \code{epi-df} to each
+essential \code{geo_value} column. (With \code{.all_versions=TRUE}, \code{epix_slide}
+will provide an \code{epi_archive} rather than an \code{epi_df} to each
 computation.)
\item The output class and columns are similar but different: \code{epix_slide()}
returns a tibble containing only the grouping variables, \code{time_value}, and
@@ -167,11 +146,11 @@ size stability in \code{epix_slide}, unlike in \code{epi_slide}. (\code{epix_sli
roughly analogous to \code{\link[dplyr:group_map]{dplyr::group_modify}}, while
\code{epi_slide} is roughly analogous to \code{dplyr::mutate} followed by
\code{dplyr::arrange}.) This is detailed in the "advanced" vignette.
-\item \code{all_rows} is not supported in \code{epix_slide}; since the slide
+\item \code{.all_rows} is not supported in \code{epix_slide}; since the slide
 computations are allowed more flexibility in their outputs than in
 \code{epi_slide}, we can't guess a good representation for missing computations
-for excluded group-\code{ref_time_value} pairs.
-\item The \code{ref_time_values} default for \code{epix_slide} is based on making an
+for excluded group-\code{.ref_time_value} pairs.
+\item The \code{.versions} default for \code{epix_slide} is based on making an
 evenly-spaced sequence out of the \code{version}s in the \code{DT} plus the
 \code{versions_end}, rather than the \code{time_value}s.
}
@@ -191,7 +170,7 @@ necessary (as is its purpose).
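+# A rough sketch (a hedged illustration, not necessarily the package's
+# internal code) of the `.versions` default described in the details above:
+# an evenly spaced sequence covering the `version`s in the `DT` through
+# `versions_end`, spaced by the GCD of the skips between observed versions.
+gcd2 <- function(a, b) if (b == 0) a else gcd2(b, a \%\% b)
+vs <- sort(unique(as.integer(archive_cases_dv_subset$DT$version)))
+versions_default <- seq(
+  min(archive_cases_dv_subset$DT$version),
+  archive_cases_dv_subset$versions_end,
+  by = Reduce(gcd2, diff(vs))
+)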
library(dplyr) # Reference time points for which we want to compute slide values: -ref_time_values <- seq(as.Date("2020-06-01"), +versions <- seq(as.Date("2020-06-02"), as.Date("2020-06-15"), by = "1 day" ) @@ -201,10 +180,10 @@ ref_time_values <- seq(as.Date("2020-06-01"), archive_cases_dv_subset \%>\% group_by(geo_value) \%>\% epix_slide( - f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = ref_time_values, - new_col_name = "case_rate_7d_av_recent_av" + .f = ~ mean(.x$case_rate_7d_av), + .before = 2, + .versions = versions, + .new_col_name = "case_rate_7d_av_recent_av" ) \%>\% ungroup() # We requested time windows that started 2 days before the corresponding time @@ -217,7 +196,7 @@ archive_cases_dv_subset \%>\% # * 2 `time_value`s, for the rest of the results # * never the 3 `time_value`s we would get from `epi_slide`, since, because # of data latency, we'll never have an observation -# `time_value == ref_time_value` as of `ref_time_value`. +# `time_value == .ref_time_value` as of `.ref_time_value`. # The example below shows this type of behavior in more detail. # Examining characteristics of the data passed to each computation with @@ -236,17 +215,17 @@ archive_cases_dv_subset \%>\% class1 = class(x)[[1L]] ) }, - before = 5, all_versions = FALSE, - ref_time_values = ref_time_values, names_sep = NULL + .before = 5, .all_versions = FALSE, + .versions = versions ) \%>\% ungroup() \%>\% - arrange(geo_value, time_value) + arrange(geo_value, version) # --- Advanced: --- # `epix_slide` with `all_versions=FALSE` (the default) applies a # version-unaware computation to several versions of the data. We can also -# use `all_versions=TRUE` to apply a version-*aware* computation to several +# use `.all_versions=TRUE` to apply a version-*aware* computation to several # versions of the data, again looking at characteristics of the data passed # to each computation. 
In this case, each computation should expect an
# `epi_archive` containing the relevant version data:
@@ -271,8 +250,8 @@ archive_cases_dv_subset \%>\%
 class1 = class(x)[[1L]]
 )
 },
-    before = 5, all_versions = TRUE,
-    ref_time_values = ref_time_values, names_sep = NULL
+    .before = 5, .all_versions = TRUE,
+    .versions = versions
  ) \%>\%
 ungroup() \%>\%
 # Focus on one geo_value so we can better see the columns above:
diff --git a/man/f_no_na.Rd b/man/f_no_na.Rd
new file mode 100644
index 00000000..9a832d72
--- /dev/null
+++ b/man/f_no_na.Rd
@@ -0,0 +1,14 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/revision_analysis.R
+\name{f_no_na}
+\alias{f_no_na}
+\title{Use when the default behavior returns a warning on empty lists, which we do
+not want, and there is no clean way of preventing this}
+\usage{
+f_no_na(f, x)
+}
+\description{
+Use when the default behavior returns a warning on empty lists, which we do
+not want, and there is no clean way of preventing this
+}
+\keyword{internal}
diff --git a/man/format_chr_with_quotes.Rd b/man/format_chr_with_quotes.Rd
new file mode 100644
index 00000000..49beffb0
--- /dev/null
+++ b/man/format_chr_with_quotes.Rd
@@ -0,0 +1,20 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils.R
+\name{format_chr_with_quotes}
+\alias{format_chr_with_quotes}
+\title{Format a character vector as a string via deparsing/quoting each}
+\usage{
+format_chr_with_quotes(x, empty = "*none*")
+}
+\arguments{
+\item{x}{\code{chr}; e.g., \code{colnames} of some data frame}
+
+\item{empty}{string; what should be output if \code{x} is of length 0?}
+}
+\value{
+string
+}
+\description{
+Format a character vector as a string via deparsing/quoting each
+}
+\keyword{internal}
diff --git a/man/format_class_vec.Rd b/man/format_class_vec.Rd
new file mode 100644
index 00000000..2c7ae4b7
--- /dev/null
+++ b/man/format_class_vec.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils.R
+\name{format_class_vec}
+\alias{format_class_vec}
+\title{Format a class vector as a string via deparsing it}
+\usage{
+format_class_vec(class_vec)
+}
+\arguments{
+\item{class_vec}{\code{chr}; output of \code{class(object)} for some \code{object}}
+}
+\value{
+string
+}
+\description{
+Format a class vector as a string via deparsing it
+}
+\keyword{internal}
diff --git a/man/format_tibble_row.Rd b/man/format_tibble_row.Rd
new file mode 100644
index 00000000..c43bd4a9
--- /dev/null
+++ b/man/format_tibble_row.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils.R
+\name{format_tibble_row}
+\alias{format_tibble_row}
+\title{Format a tibble row as chr}
+\usage{
+format_tibble_row(x, empty = "*none*")
+}
+\arguments{
+\item{x}{a tibble with a single row}
+}
+\value{
+\code{chr} with one entry per column, of form \verb{<name> = <value>}
+}
+\description{
+Format a tibble row as chr
+}
+\keyword{internal}
diff --git a/man/format_varname.Rd b/man/format_varname.Rd
new file mode 100644
index 00000000..fa9d3583
--- /dev/null
+++ b/man/format_varname.Rd
@@ -0,0 +1,19 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils.R
+\name{format_varname}
+\alias{format_varname}
+\title{"Format" column/variable name for cli interpolation}
+\usage{
+format_varname(x)
+}
+\arguments{
+\item{x}{string; e.g., a colname}
+}
+\value{
+string
+}
+\description{
+Designed to give good output if
interpolated with cli. Main purpose is to add
+backticks around variable names when necessary.
+}
+\keyword{internal}
diff --git a/man/format_varnames.Rd b/man/format_varnames.Rd
new file mode 100644
index 00000000..d25eb713
--- /dev/null
+++ b/man/format_varnames.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils.R
+\name{format_varnames}
+\alias{format_varnames}
+\title{"Format" a character vector of column/variable names for cli interpolation}
+\usage{
+format_varnames(x, empty = "*none*")
+}
+\arguments{
+\item{x}{\code{chr}; e.g., \code{colnames} of some data frame}
+
+\item{empty}{string; what should be output if \code{x} is of length 0?}
+}
+\value{
+\code{chr}
+}
+\description{
+Designed to give good output if interpolated with cli. Main purpose is to add
+backticks around variable names when necessary, and something other than an
+empty string if length 0.
+}
+\keyword{internal}
diff --git a/man/get_last_run.Rd b/man/get_last_run.Rd
new file mode 100644
index 00000000..53c10699
--- /dev/null
+++ b/man/get_last_run.Rd
@@ -0,0 +1,15 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/revision_analysis.R
+\name{get_last_run}
+\alias{get_last_run}
+\title{Return the first value in values_from from the last run of TRUEs in bool_vec}
+\usage{
+get_last_run(bool_vec, values_from)
+}
+\description{
+The point of this operation is to get the value in values_from which occurs
+at the same index as the start of the last run of TRUE values in bool_vec.
+For example, in c(1, 1, 0, 1, 1), we want the 4th entry, since there's a 0
+breaking the run.
+}
+\keyword{internal}
diff --git a/man/group_by.epi_archive.Rd b/man/group_by.epi_archive.Rd
index 782d5f3f..169bd455 100644
--- a/man/group_by.epi_archive.Rd
+++ b/man/group_by.epi_archive.Rd
@@ -5,6 +5,7 @@
 \alias{grouped_epi_archive}
 \alias{group_by.grouped_epi_archive}
 \alias{group_by_drop_default.grouped_epi_archive}
+\alias{group_vars.grouped_epi_archive}
 \alias{groups.grouped_epi_archive}
 \alias{ungroup.grouped_epi_archive}
 \alias{is_grouped_epi_archive}
@@ -16,6 +17,8 @@
 \method{group_by_drop_default}{grouped_epi_archive}(.tbl)
 
+\method{group_vars}{grouped_epi_archive}(x)
+
 \method{groups}{grouped_epi_archive}(x)
 
 \method{ungroup}{grouped_epi_archive}(x, ...)
@@ -52,8 +55,8 @@ factor columns.}
 \item{.tbl}{A \code{grouped_epi_archive} object.}
 
-\item{x}{For \code{groups} or \code{ungroup}: a \code{grouped_epi_archive}; for
-\code{is_grouped_epi_archive}: any object}
+\item{x}{For \code{groups}, \code{group_vars}, or \code{ungroup}: a \code{grouped_epi_archive};
+for \code{is_grouped_epi_archive}: any object}
}
\description{
\code{group_by} and related methods for \code{epi_archive}, \code{grouped_epi_archive}
@@ -90,10 +93,10 @@ grouped_archive \%>\% print()
 archive_cases_dv_subset \%>\%
 group_by(geo_value) \%>\%
 epix_slide(
-    f = ~ mean(.x$case_rate_7d_av),
-    before = 2,
-    ref_time_values = as.Date("2020-06-11") + 0:2,
-    new_col_name = "case_rate_3d_av"
+    .f = ~ mean(.x$case_rate_7d_av),
+    .before = 2,
+    .versions = as.Date("2020-06-11") + 0:2,
+    .new_col_name = "case_rate_3d_av"
 ) \%>\%
 ungroup()
@@ -131,6 +134,11 @@ toy_archive \%>\%
 group_by(geo_value, age_group) \%>\%
 ungroup(age_group)
 
+# To get the grouping variable names as a character vector:
+toy_archive \%>\%
+  group_by(geo_value) \%>\%
+  group_vars()
+
 # To get the grouping variable names as a `list` of `name`s (a.k.a.
symbols):
 toy_archive \%>\%
 group_by(geo_value) \%>\%
@@ -138,7 +146,7 @@ toy_archive \%>\%
 group_by(geo_value, age_group, .drop = FALSE) \%>\%
- epix_slide(f = ~ sum(.x$value), before = 20) \%>\%
+ epix_slide(.f = ~ sum(.x$value), .before = 20) \%>\%
 ungroup()
}
diff --git a/man/group_epi_df.Rd b/man/group_epi_df.Rd
new file mode 100644
index 00000000..5895a52f
--- /dev/null
+++ b/man/group_epi_df.Rd
@@ -0,0 +1,19 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/methods-epi_df.R
+\name{group_epi_df}
+\alias{group_epi_df}
+\title{Group an \code{epi_df} object by default keys}
+\usage{
+group_epi_df(x, exclude = character())
+}
+\arguments{
+\item{x}{an \code{epi_df}}
+
+\item{exclude}{character vector of column names to exclude from grouping}
+}
+\value{
+a grouped \code{epi_df}
+}
+\description{
+Group an \code{epi_df} object by default keys
+}
diff --git a/man/is_locf.Rd b/man/is_locf.Rd
new file mode 100644
index 00000000..8efeecfd
--- /dev/null
+++ b/man/is_locf.Rd
@@ -0,0 +1,15 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/archive.R
+\name{is_locf}
+\alias{is_locf}
+\title{Checks to see if a value in a vector is LOCF}
+\usage{
+is_locf(vec, tolerance)
+}
+\description{
+LOCF means last observation carried forward. It lags the vector by 1, then
+compares the result with the original vector. For doubles it uses float
+comparison via \code{\link[dplyr:near]{dplyr::near}}, otherwise it uses
+equality. \code{NA}s and \code{NaN}s are considered equal to themselves and
+each other.
+}
+\keyword{internal}
diff --git a/man/key_colnames.Rd b/man/key_colnames.Rd
index fbaa3c11..f5e13837 100644
--- a/man/key_colnames.Rd
+++ b/man/key_colnames.Rd
@@ -2,17 +2,33 @@
 % Please edit documentation in R/key_colnames.R
 \name{key_colnames}
 \alias{key_colnames}
+\alias{key_colnames.default}
+\alias{key_colnames.data.frame}
+\alias{key_colnames.epi_df}
+\alias{key_colnames.epi_archive}
 \title{Grab any keys associated with an epi_df}
 \usage{
 key_colnames(x, ...)
+
+\method{key_colnames}{default}(x, ...)
+
+\method{key_colnames}{data.frame}(x, other_keys = character(0L), exclude = character(0L), ...)
+
+\method{key_colnames}{epi_df}(x, exclude = character(0L), ...)
+
+\method{key_colnames}{epi_archive}(x, exclude = character(0L), ...)
}
\arguments{
\item{x}{a data.frame, tibble, or epi_df}

\item{...}{additional arguments passed on to methods}
+
+\item{other_keys}{an optional character vector of other keys to include}
+
+\item{exclude}{an optional character vector of keys to exclude}
}
\value{
-If an \code{epi_df}, this returns all "keys". Otherwise \code{NULL}
+If an \code{epi_df}, this returns all "keys". Otherwise \code{NULL}.
}
\description{
Grab any keys associated with an epi_df
diff --git a/man/num_percent.Rd b/man/num_percent.Rd
new file mode 100644
index 00000000..f5a82ac3
--- /dev/null
+++ b/man/num_percent.Rd
@@ -0,0 +1,12 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/revision_analysis.R
+\name{num_percent}
+\alias{num_percent}
+\title{Simple utility for printing a fraction and its percent}
+\usage{
+num_percent(a, b, b_description)
+}
+\description{
+Simple utility for printing a fraction and its percent
+}
+\keyword{internal}
diff --git a/man/paste_lines.Rd b/man/paste_lines.Rd
new file mode 100644
index 00000000..bab1e90b
--- /dev/null
+++ b/man/paste_lines.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils.R
+\name{paste_lines}
+\alias{paste_lines}
+\title{Paste \code{chr} entries (lines) together with \code{"\\n"} separators, trailing \code{"\\n"}}
+\usage{
+paste_lines(lines)
+}
+\arguments{
+\item{lines}{\code{chr}}
+}
+\value{
+string
+}
+\description{
+Paste \code{chr} entries (lines) together with \code{"\\n"} separators, trailing \code{"\\n"}
+}
+\keyword{internal}
diff --git a/man/print.epi_df.Rd b/man/print.epi_df.Rd
index 5a232de0..d1664cd7 100644
--- a/man/print.epi_df.Rd
+++ b/man/print.epi_df.Rd
@@ -2,7 +2,6 @@
 % Please edit documentation in R/methods-epi_df.R
 \name{print.epi_df}
 \alias{print.epi_df}
-\alias{summary.epi_df}
 \alias{group_by.epi_df}
 \alias{ungroup.epi_df}
 \alias{group_modify.epi_df}
@@ -11,8 +10,6 @@
 \usage{
 \method{print}{epi_df}(x, ...)
 
-\method{summary}{epi_df}(object, ...)
-
 \method{group_by}{epi_df}(.data, ...)
 
 \method{ungroup}{epi_df}(x, ...)
@@ -24,10 +21,7 @@
 \arguments{
 \item{x}{an \code{epi_df}}
 
-\item{...}{Additional arguments, for compatibility with \code{summary()}.
-Currently unused.}
-
-\item{object}{an \code{epi_df}}
+\item{...}{additional arguments to forward to \code{NextMethod()}, or unused}
 
 \item{.data}{an \code{epi_df}}
 
@@ -39,7 +33,4 @@
 }
 \description{
 Print and summary functions for an \code{epi_df} object.
-
-Prints a variety of summary statistics about the \code{epi_df} object, such as
-the time range included and geographic coverage.
 }
diff --git a/man/reexports.Rd b/man/reexports.Rd
index fdda2925..ba6ab976 100644
--- a/man/reexports.Rd
+++ b/man/reexports.Rd
@@ -14,6 +14,8 @@
 \alias{rename}
 \alias{slice}
 \alias{unnest}
+\alias{complete}
+\alias{full_seq}
 \alias{autoplot}
 \title{Objects exported from other packages}
 \keyword{internal}
@@ -26,7 +28,7 @@ below to see their documentation.
 \item{ggplot2}{\code{\link[ggplot2]{autoplot}}}
 
-  \item{tidyr}{\code{\link[tidyr]{unnest}}}
+  \item{tidyr}{\code{\link[tidyr]{complete}}, \code{\link[tidyr]{full_seq}}, \code{\link[tidyr]{unnest}}}
 
 \item{tsibble}{\code{\link[tsibble:as-tsibble]{as_tsibble}}}
}}
diff --git a/man/removed_by_compactify.Rd b/man/removed_by_compactify.Rd
new file mode 100644
index 00000000..2f129888
--- /dev/null
+++ b/man/removed_by_compactify.Rd
@@ -0,0 +1,12 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/archive.R
+\name{removed_by_compactify}
+\alias{removed_by_compactify}
+\title{Get the entries that \code{compactify} would remove}
+\usage{
+removed_by_compactify(df, keys, tolerance)
+}
+\description{
+Get the entries that \code{compactify} would remove
+}
+\keyword{internal}
diff --git a/man/revision_summary.Rd b/man/revision_summary.Rd
new file mode 100644
index 00000000..590a1ed5
--- /dev/null
+++ b/man/revision_summary.Rd
@@ -0,0 +1,101 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/revision_analysis.R
+\name{revision_summary}
+\alias{revision_summary}
+\title{A function to describe revision behavior for an archive}
+\usage{
+revision_summary(
+  epi_arch,
+  ...,
+  drop_nas = TRUE,
+  print_inform = TRUE,
+  min_waiting_period = as.difftime(60, units = "days"),
+  within_latest = 0.2,
+  quick_revision = as.difftime(3, units = "days"),
+  few_revisions = 3,
+  abs_spread_threshold = NULL,
+  rel_spread_threshold = 0.1,
+  compactify_tol = .Machine$double.eps^0.5,
+  should_compactify = TRUE
+)
+}
+\arguments{
+\item{epi_arch}{an epi_archive to be analyzed}
+
+\item{...}{<\code{\link[=dplyr_tidy_select]{tidyselect}}>, used to choose the column to
+summarize. If empty, it chooses the first. Currently only implemented for
+one column at a time.}
+
+\item{drop_nas}{bool, drop any \code{NA} values from the archive? After dropping
+\code{NA}s, compactify is run again to make sure there are no duplicate values
+from occasions when the signal is revised to \code{NA}, and then back to its
+immediately-preceding value.}
+
+\item{print_inform}{bool, determines whether to print summary information, or
+only return the full summary tibble}
+
+\item{min_waiting_period}{\code{difftime}, integer or \code{NULL}. Sets a cutoff: any
+time_values not earlier than \code{min_waiting_period} before \code{versions_end} are
+removed. \code{min_waiting_period} should characterize the typical time during
+which revisions occur. The default of 60 days corresponds to a typical
+final value for case counts as reported in the context of insurance. To
+avoid this filtering, either set to \code{NULL} or 0.}
+
+\item{within_latest}{double between 0 and 1. Determines the threshold
+used for the \code{time_near_latest} statistic.}
+
+\item{quick_revision}{difftime or integer (integer is treated as days), for
+the printed summary, the amount of time between the final revision and the
+actual time_value to consider the revision quickly resolved. Default is 3
+days.}
+
+\item{few_revisions}{integer, for the printed summary, the upper bound on the
+number of revisions to consider "few". Default is 3.}
+
+\item{abs_spread_threshold}{numeric, for the printed summary, the maximum
+spread used to characterize revisions which don't actually change very
+much.
Default is 5\% of the maximum value in the dataset, but this is the
+most unit-dependent of these values, and likely needs to be chosen
+appropriately for the scale of the dataset.}
+
+\item{rel_spread_threshold}{float between 0 and 1, for the printed summary,
+the relative spread fraction used to characterize revisions which don't
+actually change very much. Default is .1, or 10\% of the final value.}
+
+\item{compactify_tol}{float; used if \code{drop_nas=TRUE} to determine the
+threshold for when two floats are considered identical.}
+
+\item{should_compactify}{bool. Compactify if \code{TRUE}.}
+}
+\description{
+\code{revision_summary} removes all missing values (if requested), and then
+computes some basic statistics about the revision behavior of an archive,
+returning a tibble summarizing the revisions per time_value+epi_key
+combination. If \code{print_inform} is \code{TRUE}, it prints a concise summary. The
+columns returned are:
+\enumerate{
+\item \code{n_revisions}: the total number of revisions for that entry
+\item \code{min_lag}: the minimum time to any value (if \code{drop_nas=FALSE}, this
+includes \code{NA}s)
+\item \code{max_lag}: the amount of time until the final (new) version (same caveat
+for \code{drop_nas=FALSE}, though it is far less likely to matter)
+\item \code{min_value}: the minimum value across revisions
+\item \code{max_value}: the maximum value across revisions
+\item \code{median_value}: the median value across revisions
+\item \code{spread}: the difference between the smallest and largest values (this
+always excludes \code{NA} values)
+\item \code{rel_spread}: \code{spread} divided by the largest value (so it will
+always be less than 1). Note that the largest value need not be the final
+value. It will be \code{NA} whenever \code{spread} is 0.
+\item \code{time_near_latest}: This gives the lag when the value is within
+\code{within_latest} (default 20\%) of the value at the latest time. For example,
+consider the series (0, 20, 99, 150, 102, 100); then \code{time_near_latest} is
+the 5th index, since even though 99 is within 20\%, it is outside the window
+afterwards at 150.
+}
+}
+\examples{
+
+revision_example <- revision_summary(archive_cases_dv_subset, percent_cli)
+
+revision_example \%>\% arrange(desc(spread))
+}
diff --git a/man/sum_groups_epi_df.Rd b/man/sum_groups_epi_df.Rd
new file mode 100644
index 00000000..f1ba8474
--- /dev/null
+++ b/man/sum_groups_epi_df.Rd
@@ -0,0 +1,24 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/methods-epi_df.R
+\name{sum_groups_epi_df}
+\alias{sum_groups_epi_df}
+\title{Aggregate an \code{epi_df} object}
+\usage{
+sum_groups_epi_df(.x, sum_cols = "value", group_cols = character())
+}
+\arguments{
+\item{.x}{an \code{epi_df}}
+
+\item{sum_cols}{character vector of the columns to aggregate}
+
+\item{group_cols}{character vector of column names to group by. "time_value" is
+included by default.}
+}
+\value{
+an \code{epi_df} object
+}
+\description{
+Aggregates an \code{epi_df} object by the specified group columns, summing the
+\code{sum_cols} columns (by default, \code{value}), and returning an \code{epi_df}. If
+aggregating over \code{geo_value}, the resulting \code{epi_df} will have \code{geo_value} set
+to \code{"total"}.
+}
diff --git a/man/summary.epi_df.Rd b/man/summary.epi_df.Rd
new file mode 100644
index 00000000..831d4d4e
--- /dev/null
+++ b/man/summary.epi_df.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/methods-epi_df.R
+\name{summary.epi_df}
+\alias{summary.epi_df}
+\title{Summarize \code{epi_df} object}
+\usage{
+\method{summary}{epi_df}(object, ...)
+}
+\arguments{
+\item{object}{an \code{epi_df}}
+
+\item{...}{Additional arguments, for compatibility with \code{summary()}.
+Currently unused.}
+}
+\description{
+Prints a variety of summary statistics about the \code{epi_df} object, such as
+the time range included and geographic coverage.
+}
diff --git a/man/time_within_x_latest.Rd b/man/time_within_x_latest.Rd
new file mode 100644
index 00000000..1dd7e801
--- /dev/null
+++ b/man/time_within_x_latest.Rd
@@ -0,0 +1,15 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/revision_analysis.R
+\name{time_within_x_latest}
+\alias{time_within_x_latest}
+\title{Pull the value from lags when values starts being indefinitely within prop of its last value}
+\usage{
+time_within_x_latest(lags, values, prop = 0.2)
+}
+\arguments{
+\item{values}{this should be a one-column tibble; errors may occur otherwise}
+}
+\description{
+Pull the value from lags when values starts being indefinitely within prop of
+its last value.
+}
+\keyword{internal}
diff --git a/man/wrap_symbolics.Rd b/man/wrap_symbolics.Rd
new file mode 100644
index 00000000..cfee2dcf
--- /dev/null
+++ b/man/wrap_symbolics.Rd
@@ -0,0 +1,42 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils.R
+\name{wrap_symbolics}
+\alias{wrap_symbolics}
+\title{Line wrap list holding \link[rlang:is_expression]{symbolic}, with prefix&indent}
+\usage{
+wrap_symbolics(
+  symbolics,
+  initial = "",
+  common_prefix = "",
+  none_str = "",
+  width = getOption("width", 80L)
+)
+}
+\arguments{
+\item{symbolics}{List of \link[rlang:is_expression]{symbolic} objects: the variable
+names (potentially empty)}
+
+\item{initial}{Optional; single string: a prefix for the initial line in the
+result; e.g., "Variable names: ". Defaults to "". Any non-initial lines
+will be indented with whitespace matching the (estimated) visual width of
+\code{initial}.}
+
+\item{common_prefix}{Optional; single string: a prefix for every line (will
+appear before \code{initial}); e.g., "# ". Defaults to "".}
+
+\item{none_str}{Optional; single string: what to display when given
+\code{length}-0 input. Will be combined with \code{common_prefix} and \code{initial}.}
+
+\item{width}{Optional; single integer: desired maximum formatted line width.
+The formatted output may not obey this setting if \code{common_prefix} plus
+\code{initial} is long or the printing width is very narrow.}
+}
+\value{
+\code{chr}; to print, use \code{\link[base:writeLines]{base::writeLines}}.
+}
+\description{
+Helps pretty-print these objects. Adds backticks, commas, prefixes, and
+indentation. Wraps lines, but won't insert line breaks in the middle of any
+name while doing so.
+} +\keyword{internal} diff --git a/man/wrap_varnames.Rd b/man/wrap_varnames.Rd new file mode 100644 index 00000000..8c3e1246 --- /dev/null +++ b/man/wrap_varnames.Rd @@ -0,0 +1,39 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{wrap_varnames} +\alias{wrap_varnames} +\title{Line wrap \code{chr} holding variable/column/other names, with prefix&indent} +\usage{ +wrap_varnames( + nms, + initial = "", + common_prefix = "", + none_str = "", + width = getOption("width", 80L) +) +} +\arguments{ +\item{nms}{Character vector: the variable names (potentially empty)} + +\item{initial}{Optional; single string: a prefix for the initial line in the +result; e.g., "Variable names: ". Defaults to "". Any non-initial lines +will be indented with whitespace matching the (estimated) visual width of +\code{initial}.} + +\item{common_prefix}{Optional; single string: a prefix for every line (will +appear before \code{initial}); e.g., "# ". Defaults to "".} + +\item{none_str}{Optional; single string: what to display when given +\code{length}-0 input. Will be combined with \code{common_prefix} and \code{initial}.} + +\item{width}{Optional; single integer: desired maximum formatted line width. +The formatted output may not obey this setting if \code{common_prefix} plus +\code{initial} is long or the printing width is very narrow.} +} +\value{ +\code{chr}; to print, use \code{\link[base:writeLines]{base::writeLines}}. +} +\description{ +Line wrap \code{chr} holding variable/column/other names, with prefix&indent +} +\keyword{internal} diff --git a/tests/testthat/_snaps/archive.md b/tests/testthat/_snaps/archive.md new file mode 100644 index 00000000..6e010da0 --- /dev/null +++ b/tests/testthat/_snaps/archive.md @@ -0,0 +1,13 @@ +# new_epi_archive correctly detects and warns about compactification + + Code + res <- dumb_ex %>% as_epi_archive() + Condition + Warning: + Found rows that appear redundant based on last (version of each) observation carried forward; these rows have been removed to 'compactify' and save space: + Key: + geo_value time_value value version + + 1: ca 2020-01-01 1 2020-01-02 + Built-in `epi_archive` functionality should be unaffected, but results may change if you work directly with its fields (such as `DT`). See `?as_epi_archive` for details. To silence this warning but keep compactification, you can pass `compactify=TRUE` when constructing the archive. + diff --git a/tests/testthat/_snaps/epi_df_forbidden_methods.md b/tests/testthat/_snaps/epi_df_forbidden_methods.md new file mode 100644 index 00000000..12dc3d48 --- /dev/null +++ b/tests/testthat/_snaps/epi_df_forbidden_methods.md @@ -0,0 +1,40 @@ +# Forbidden epi_df methods have decent error messages + + Code + edf %>% epi_slide(.window_size = 7L, ~ mean(.x)) + Condition + Error in `mean()`: + ! `mean` shouldn't be used on entire `epi_df`s + x .x was an `epi_df` + i If you encountered this while trying to take a rolling mean of a column using `epi_slide`, you probably forgot to specify the column name (e.g., ~ mean(.x$colname)). You may also prefer to use the specialized `epi_slide_mean` method. + +--- + + Code + edf %>% epi_slide(.window_size = 7L, ~ sum(.x)) + Condition + Error in `.slide_comp()`: + ! `sum` shouldn't be used on entire `epi_df`s + x `sum`'s first argument was an `epi_df` + i If you encountered this while trying to take a rolling sum of a column using `epi_slide`, you probably forgot to specify the column name (e.g., ~ sum(.x$colname)). 
You may also prefer to use the specialized `epi_slide_sum` method. + +--- + + Code + edf %>% epi_slide(.window_size = 7L, ~ min(.x)) + Condition + Error in `.slide_comp()`: + ! `min` shouldn't be used on entire `epi_df`s + x `min`'s first argument was an `epi_df` + i If you encountered this while trying to take a rolling min of a column using `epi_slide`, you probably forgot to specify the column name (e.g., ~ min(.x$colname)). You may also prefer to use the specialized `epi_slide_opt` method. + +--- + + Code + edf %>% epi_slide(.window_size = 7L, ~ range(.x)) + Condition + Error in `.slide_comp()`: + ! `range` shouldn't be used on entire `epi_df`s + x `range`'s first argument was an `epi_df` + i If you encountered this while trying to take a rolling range of a column using `epi_slide`, you probably forgot to specify the column name (e.g., ~ range(.x$colname)). + diff --git a/tests/testthat/_snaps/revision-latency-functions.md b/tests/testthat/_snaps/revision-latency-functions.md new file mode 100644 index 00000000..1ac21469 --- /dev/null +++ b/tests/testthat/_snaps/revision-latency-functions.md @@ -0,0 +1,94 @@ +# revision_summary works for a dummy dataset + + Code + dummy_ex %>% revision_summary() %>% print(n = 10, width = 300) + Message + Min lag (time to first version): + Output + min median mean max + 0 days 1 days 1.6 days 4 days + Message + Fraction of epi_key+time_values with + No revisions: + * 3 out of 7 (42.86%) + Quick revisions (last revision within 3 days of the `time_value`): + * 4 out of 7 (57.14%) + Few revisions (At most 3 revisions for that `time_value`): + * 6 out of 7 (85.71%) + Fraction of revised epi_key+time_values which have: + Less than 0.1 spread in relative value: + * 1 out of 4 (25%) + Spread of more than 5.1 in actual value (when revised): + * 3 out of 4 (75%) + days until within 20% of the latest value: + Output + min median mean max + 0 days 3 days 6.9 days 19 days + # A tibble: 7 x 11 + time_value geo_value n_revisions min_lag max_lag time_near_latest spread + + 1 2020-01-01 ak 4 2 days 19 days 19 days 101 + 2 2020-01-02 ak 1 4 days 5 days 4 days 9 + 3 2020-01-03 ak 0 3 days 3 days 3 days 0 + 4 2020-01-01 al 1 0 days 19 days 19 days 99 + 5 2020-01-02 al 0 0 days 0 days 0 days 0 + 6 2020-01-03 al 1 1 days 2 days 2 days 3 + 7 2020-01-04 al 0 1 days 1 days 1 days 0 + rel_spread min_value max_value median_value + + 1 0.990 1 102 6 + 2 0.09 91 100 95.5 + 3 NaN 0 0 0 + 4 0.99 1 100 50.5 + 5 0 1 1 1 + 6 0.75 1 4 2.5 + 7 0 9 9 9 + +--- + + Code + dummy_ex %>% revision_summary(drop_nas = FALSE) %>% print(n = 10, width = 300) + Message + Min lag (time to first version): + Output + min median mean max + 0 days 1 days 1.4 days 4 days + Message + Fraction of all versions that are `NA`: + * 2 out of 19 (10.53%) + Fraction of epi_key+time_values with + No revisions: + * 2 out of 7 (28.57%) + Quick revisions (last revision within 3 days of the `time_value`): + * 4 out of 7 (57.14%) + Few revisions (At most 3 revisions for that `time_value`): + * 6 out of 7 (85.71%) + Fraction of revised epi_key+time_values which have: + Less than 0.1 spread in relative value: + * 2 out of 5 (40%) + Spread of more than 5.1 in actual value (when revised): + * 3 out of 5 (60%) + days until within 20% of the latest value: + Output + min median mean max + 0 days 3 days 6.9 days 19 days + # A tibble: 7 x 11 + time_value geo_value n_revisions min_lag max_lag time_near_latest spread + + 1 2020-01-01 ak 6 2 days 19 days 19 days 101 + 2 2020-01-02 ak 1 4 days 5 days 4 days 9 + 3 2020-01-03 ak 0 3 
days 3 days 3 days 0 + 4 2020-01-01 al 1 0 days 19 days 19 days 99 + 5 2020-01-02 al 0 0 days 0 days 0 days 0 + 6 2020-01-03 al 1 1 days 2 days 2 days 3 + 7 2020-01-04 al 1 0 days 1 days 1 days 0 + rel_spread min_value max_value median_value + + 1 0.990 1 102 5.5 + 2 0.09 91 100 95.5 + 3 NaN 0 0 0 + 4 0.99 1 100 50.5 + 5 0 1 1 1 + 6 0.75 1 4 2.5 + 7 0 9 9 9 + diff --git a/tests/testthat/test-archive.R b/tests/testthat/test-archive.R index ac5aee8d..4232697e 100644 --- a/tests/testthat/test-archive.R +++ b/tests/testthat/test-archive.R @@ -4,16 +4,16 @@ test_that("first input must be a data.frame", { ) }) -dt <- archive_cases_dv_subset$DT +archive_data <- archive_cases_dv_subset$DT test_that("data.frame must contain geo_value, time_value and version columns", { - expect_error(as_epi_archive(select(dt, -geo_value), compactify = FALSE), + expect_error(as_epi_archive(select(archive_data, -geo_value), compactify = FALSE), regexp = "There is no geo_value column or similar name" ) - expect_error(as_epi_archive(select(dt, -time_value), compactify = FALSE), + expect_error(as_epi_archive(select(archive_data, -time_value), compactify = FALSE), regexp = "There is no time_value column or similar name" ) - expect_error(as_epi_archive(select(dt, -version), compactify = FALSE), + expect_error(as_epi_archive(select(archive_data, -version), compactify = FALSE), regexp = "There is no version column or similar name" ) }) @@ -21,60 +21,62 @@ test_that("data.frame must contain geo_value, time_value and version columns", { test_that("as_epi_archive custom name mapping works correctly", { # custom name works correctly expect_equal( - as_epi_archive(rename(dt, weirdName = version), + as_epi_archive(rename(archive_data, weirdName = version), version = weirdName, compactify = TRUE ), - as_epi_archive(dt, compactify = TRUE) + as_epi_archive(archive_data, compactify = TRUE) ) expect_equal( - as_epi_archive(rename(dt, weirdName = geo_value), + as_epi_archive(rename(archive_data, weirdName = geo_value), geo_value = weirdName, compactify = TRUE ), - as_epi_archive(dt, compactify = TRUE) + as_epi_archive(archive_data, compactify = TRUE) ) expect_equal( - as_epi_archive(rename(dt, weirdName = time_value), + as_epi_archive(rename(archive_data, weirdName = time_value), time_value = weirdName, compactify = TRUE ), - as_epi_archive(dt, compactify = TRUE) + as_epi_archive(archive_data, compactify = TRUE) ) expect_error( as_epi_archive( - rename(dt, weirdName = version), + rename(archive_data, weirdName = version), version = weirdName, version = time_value ), "Names must be unique" ) }) +dumb_ex <- data.frame( + geo_value = c("ca", "ca"), + time_value = as.Date(c("2020-01-01", "2020-01-01")), + value = c(1, 1), + version = as.Date(c("2020-01-01", "2020-01-02")) +) +test_that("new_epi_archive correctly detects and warns about compactification", { + expect_snapshot(res <- dumb_ex %>% as_epi_archive(), cnd_class = TRUE) +}) + test_that("other_keys can only contain names of the data.frame columns", { - expect_error(as_epi_archive(dt, other_keys = "xyz", compactify = FALSE), + expect_error(as_epi_archive(archive_data, other_keys = "xyz", compactify = FALSE), regexp = "`other_keys` must be contained in the column names of `x`." 
) - expect_error(as_epi_archive(dt, other_keys = "percent_cli", compactify = FALSE), NA) + expect_error(as_epi_archive(archive_data, other_keys = "percent_cli", compactify = FALSE), NA) }) test_that("other_keys cannot contain names geo_value, time_value or version", { - expect_error(as_epi_archive(dt, other_keys = "geo_value", compactify = FALSE), + expect_error(as_epi_archive(archive_data, other_keys = "geo_value", compactify = FALSE), regexp = "`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\"." ) - expect_error(as_epi_archive(dt, other_keys = "time_value", compactify = FALSE), + expect_error(as_epi_archive(archive_data, other_keys = "time_value", compactify = FALSE), regexp = "`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\"." ) - expect_error(as_epi_archive(dt, other_keys = "version", compactify = FALSE), + expect_error(as_epi_archive(archive_data, other_keys = "version", compactify = FALSE), regexp = "`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\"." ) }) -test_that("Warning thrown when other_metadata contains overlapping names with geo_type field", { - expect_warning(as_epi_archive(dt, additional_metadata = list(geo_type = 1), compactify = FALSE), - regexp = "`additional_metadata` names overlap with existing metadata fields" - ) - expect_warning(as_epi_archive(dt, additional_metadata = list(time_type = 1), compactify = FALSE), - regexp = "`additional_metadata` names overlap with existing metadata fields" - ) -}) test_that("epi_archives are correctly instantiated with a variety of data types", { d <- as.Date("2020-01-01") @@ -88,22 +90,22 @@ test_that("epi_archives are correctly instantiated with a variety of data types" ea1 <- as_epi_archive(df, compactify = FALSE) expect_equal(key(ea1$DT), c("geo_value", "time_value", "version")) - expect_equal(ea1$additional_metadata, list()) + expect_null(ea1$additional_metadata) - ea2 <- as_epi_archive(df, other_keys = "value", additional_metadata = list(value = df$value), compactify = FALSE) + ea2 <- as_epi_archive(df, other_keys = "value", compactify = FALSE) expect_equal(key(ea2$DT), c("geo_value", "time_value", "value", "version")) - expect_equal(ea2$additional_metadata, list(value = df$value)) + expect_null(ea2$additional_metadata) # Tibble tib <- tibble::tibble(df, code = "x") ea3 <- as_epi_archive(tib, compactify = FALSE) expect_equal(key(ea3$DT), c("geo_value", "time_value", "version")) - expect_equal(ea3$additional_metadata, list()) + expect_null(ea3$additional_metadata) - ea4 <- as_epi_archive(tib, other_keys = "code", additional_metadata = list(value = df$value), compactify = FALSE) + ea4 <- as_epi_archive(tib, other_keys = "code", compactify = FALSE) expect_equal(key(ea4$DT), c("geo_value", "time_value", "code", "version")) - expect_equal(ea4$additional_metadata, list(value = df$value)) + expect_null(ea4$additional_metadata) # Keyed data.table kdt <- data.table::data.table( @@ -118,12 +120,12 @@ test_that("epi_archives are correctly instantiated with a variety of data types" ea5 <- as_epi_archive(kdt, compactify = FALSE) # Key from data.table isn't absorbed when as_epi_archive is used expect_equal(key(ea5$DT), c("geo_value", "time_value", "version")) - expect_equal(ea5$additional_metadata, list()) + expect_null(ea5$additional_metadata) - ea6 <- as_epi_archive(kdt, other_keys = "value", additional_metadata = list(value = df$value), compactify = FALSE) + ea6 <- as_epi_archive(kdt, other_keys = "value", compactify = FALSE) # Mismatched keys, but the one from 
as_epi_archive overrides expect_equal(key(ea6$DT), c("geo_value", "time_value", "value", "version")) - expect_equal(ea6$additional_metadata, list(value = df$value)) + expect_null(ea6$additional_metadata) # Unkeyed data.table udt <- data.table::data.table( @@ -136,11 +138,11 @@ test_that("epi_archives are correctly instantiated with a variety of data types" ea7 <- as_epi_archive(udt, compactify = FALSE) expect_equal(key(ea7$DT), c("geo_value", "time_value", "version")) - expect_equal(ea7$additional_metadata, list()) + expect_null(ea7$additional_metadata) - ea8 <- as_epi_archive(udt, other_keys = "code", additional_metadata = list(value = df$value), compactify = FALSE) + ea8 <- as_epi_archive(udt, other_keys = "code", compactify = FALSE) expect_equal(key(ea8$DT), c("geo_value", "time_value", "code", "version")) - expect_equal(ea8$additional_metadata, list(value = df$value)) + expect_null(ea8$additional_metadata) # epi_df edf1 <- jhu_csse_daily_subset %>% @@ -149,15 +151,15 @@ test_that("epi_archives are correctly instantiated with a variety of data types" ea9 <- as_epi_archive(edf1, compactify = FALSE) expect_equal(key(ea9$DT), c("geo_value", "time_value", "version")) - expect_equal(ea9$additional_metadata, list()) + expect_null(ea9$additional_metadata) - ea10 <- as_epi_archive(edf1, other_keys = "code", additional_metadata = list(value = df$value), compactify = FALSE) + ea10 <- as_epi_archive(edf1, other_keys = "code", compactify = FALSE) expect_equal(key(ea10$DT), c("geo_value", "time_value", "code", "version")) - expect_equal(ea10$additional_metadata, list(value = df$value)) + expect_null(ea10$additional_metadata) # Keyed epi_df edf2 <- data.frame( - geo_value = "al", + geo_value = c(rep("al", 10), rep("ak", 10)), time_value = rep(d + 0:9, 2), version = c( rep(as.Date("2020-01-25"), 10), @@ -166,15 +168,15 @@ test_that("epi_archives are correctly instantiated with a variety of data types" cases = 1:20, misc = "USA" ) %>% - as_epi_df(additional_metadata = list(other_keys = "misc")) + as_epi_df(other_keys = "misc") ea11 <- as_epi_archive(edf2, compactify = FALSE) expect_equal(key(ea11$DT), c("geo_value", "time_value", "version")) - expect_equal(ea11$additional_metadata, list()) + expect_null(ea11$additional_metadata) - ea12 <- as_epi_archive(edf2, other_keys = "misc", additional_metadata = list(value = df$misc), compactify = FALSE) + ea12 <- as_epi_archive(edf2, other_keys = "misc", compactify = FALSE) expect_equal(key(ea12$DT), c("geo_value", "time_value", "misc", "version")) - expect_equal(ea12$additional_metadata, list(value = df$misc)) + expect_null(ea12$additional_metadata) }) test_that("`epi_archive` rejects nonunique keys", { @@ -226,3 +228,9 @@ test_that("`epi_archive` rejects dataframes where time_value and version columns ) expect_error(as_epi_archive(tbl3), class = "epiprocess__time_value_version_mismatch") }) + +test_that("is_locf works as expected", { + vec <- c(1, 1, 1e-10, 1.1e-10, NA, NA, NaN, NaN) + is_repeated <- c(0, 1, 0, 1, 0, 1, 1, 1) + expect_equal(is_locf(vec, .Machine$double.eps^0.5), as.logical(is_repeated)) +}) diff --git a/tests/testthat/test-arrange-canonical.R b/tests/testthat/test-arrange-canonical.R new file mode 100644 index 00000000..24d3f5f9 --- /dev/null +++ b/tests/testthat/test-arrange-canonical.R @@ -0,0 +1,20 @@ +test_that("canonical arrangement works", { + tib <- tibble( + x = 1:8, + demo_grp = rep(c("b", "b", "a", "a"), times = 2), + geo_value = rep(c("ga", "ca"), each = 4), + time_value = rep(2:1, times = 4) + ) + 
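+  # `tib` deliberately shuffles keys and uses descending time_values. A plain
+  # tibble has no canonical ordering, so the next call errors; after the
+  # conversion to an `epi_df` below (with `demo_grp` as an extra key), rows
+  # sort by (geo_value, demo_grp, time_value).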
expect_error(arrange_canonical(tib)) + + tib <- tib %>% as_epi_df(other_keys = "demo_grp") + expect_equal(names(tib), c("geo_value", "demo_grp", "time_value", "x")) + + tib_sorted <- tib %>% + arrange_canonical() + expect_equal(names(tib_sorted), c("geo_value", "demo_grp", "time_value", "x")) + expect_equal(tib_sorted$geo_value, rep(c("ca", "ga"), each = 4)) + expect_equal(tib_sorted$time_value, c(1, 2, 1, 2, 1, 2, 1, 2)) + expect_equal(tib_sorted$demo_grp, c("a", "a", "b", "b", "a", "a", "b", "b")) + expect_equal(tib_sorted$x, c(8, 7, 6, 5, 4, 3, 2, 1)) +}) diff --git a/tests/testthat/test-as_tibble-decay.R b/tests/testthat/test-as_tibble-decay.R index 488ace63..d2248a6d 100644 --- a/tests/testthat/test-as_tibble-decay.R +++ b/tests/testthat/test-as_tibble-decay.R @@ -8,8 +8,6 @@ test_that("as_tibble checks an attr to avoid decay to tibble", { }) test_that("as_tibble ungroups if needed", { - # tsibble is doing some method piracy, and overwriting as_tibble.grouped_df as of 1.1.5 - skip_if(packageVersion("tsibble") > "1.1.4") edf <- jhu_csse_daily_subset %>% group_by(geo_value) # removes the grouped_df class expect_identical(class(as_tibble(edf)), c("tbl_df", "tbl", "data.frame")) diff --git a/tests/testthat/test-deprecations.R b/tests/testthat/test-deprecations.R index 7d29149b..3a82f615 100644 --- a/tests/testthat/test-deprecations.R +++ b/tests/testthat/test-deprecations.R @@ -1,47 +1,47 @@ test_that("epix_slide group_by= deprecation works", { expect_error( archive_cases_dv_subset %>% - epix_slide(function(...) {}, before = 2L, group_by = c()), + epix_slide(function(...) {}, .before = 2L, group_by = c()), class = "epiprocess__epix_slide_group_by_parameter_deprecated" ) expect_error( archive_cases_dv_subset %>% - epix_slide(function(...) {}, before = 2L, group_by = c()), + epix_slide(function(...) {}, .before = 2L, group_by = c()), class = "epiprocess__epix_slide_group_by_parameter_deprecated" ) expect_error( archive_cases_dv_subset %>% group_by(geo_value) %>% - epix_slide(function(...) {}, before = 2L, group_by = c()), + epix_slide(function(...) {}, .before = 2L, group_by = c()), class = "epiprocess__epix_slide_group_by_parameter_deprecated" ) expect_error( archive_cases_dv_subset %>% group_by(geo_value) %>% - epix_slide(function(...) {}, before = 2L, group_by = c()), + epix_slide(function(...) {}, .before = 2L, group_by = c()), class = "epiprocess__epix_slide_group_by_parameter_deprecated" ) # expect_error( archive_cases_dv_subset %>% - epix_slide(function(...) {}, before = 2L, all_rows = TRUE), + epix_slide(function(...) {}, .before = 2L, all_rows = TRUE), class = "epiprocess__epix_slide_all_rows_parameter_deprecated" ) expect_error( archive_cases_dv_subset %>% - epix_slide(function(...) {}, before = 2L, all_rows = TRUE), + epix_slide(function(...) {}, .before = 2L, all_rows = TRUE), class = "epiprocess__epix_slide_all_rows_parameter_deprecated" ) expect_error( archive_cases_dv_subset %>% group_by(geo_value) %>% - epix_slide(function(...) {}, before = 2L, all_rows = TRUE), + epix_slide(function(...) {}, .before = 2L, all_rows = TRUE), class = "epiprocess__epix_slide_all_rows_parameter_deprecated" ) expect_error( archive_cases_dv_subset %>% group_by(geo_value) %>% - epix_slide(function(...) {}, before = 2L, all_rows = TRUE), + epix_slide(function(...) 
{}, .before = 2L, all_rows = TRUE), class = "epiprocess__epix_slide_all_rows_parameter_deprecated" ) }) diff --git a/tests/testthat/test-epi_df.R b/tests/testthat/test-epi_df.R index a49855aa..2444a87a 100644 --- a/tests/testthat/test-epi_df.R +++ b/tests/testthat/test-epi_df.R @@ -23,8 +23,7 @@ test_that("new_epi_df works as intended", { expect_true(lubridate::is.POSIXt(attributes(epi_tib)$metadata$as_of)) }) -test_that("as_epi_df errors when additional_metadata is not a list", { - # This is the 3rd example from as_epi_df +test_that("as_epi_df errors for non-character other_keys", { ex_input <- jhu_csse_county_level_subset %>% dplyr::filter(time_value > "2021-12-01", state_name == "Massachusetts") %>% dplyr::slice_tail(n = 6) %>% @@ -35,9 +34,10 @@ test_that("as_epi_df errors when additional_metadata is not a list", { ) expect_error( - as_epi_df(ex_input, additional_metadata = c(other_keys = "state", "pol")), - "Must be of type 'list', not 'character'." + as_epi_df(ex_input, other_keys = list()), + "Must be of type 'character'" ) + expect_silent(as_epi_df(ex_input, other_keys = c("state", "pol"))) }) test_that("as_epi_df works for nonstandard input", { @@ -81,7 +81,7 @@ tib <- tibble::tibble( time_value = rep(seq(as.Date("2020-01-01"), by = 1, length.out = 5), times = 2), geo_value = rep(c("ca", "hi"), each = 5) ) -epi_tib <- epiprocess::as_epi_df(tib) +epi_tib <- as_epi_df(tib) test_that("grouped epi_df maintains type for select", { grouped_epi <- epi_tib %>% group_by(geo_value) selected_df <- grouped_epi %>% select(-y) @@ -108,9 +108,7 @@ test_that("grouped epi_df handles extra keys correctly", { geo_value = rep(c("ca", "hi"), each = 5), extra_key = rep(seq(as.Date("2020-01-01"), by = 1, length.out = 5), times = 2) ) - epi_tib <- epiprocess::as_epi_df(tib, - additional_metadata = list(other_keys = "extra_key") - ) + epi_tib <- as_epi_df(tib, other_keys = "extra_key") grouped_epi <- epi_tib %>% group_by(geo_value) selected_df <- grouped_epi %>% select(-extra_key) expect_true(inherits(selected_df, "epi_df")) diff --git a/tests/testthat/test-epi_df_forbidden_methods.R b/tests/testthat/test-epi_df_forbidden_methods.R new file mode 100644 index 00000000..62d7cba0 --- /dev/null +++ b/tests/testthat/test-epi_df_forbidden_methods.R @@ -0,0 +1,23 @@ +edf <- as_epi_df(tibble( + geo_value = rep("nd", 10L), + time_value = as.Date("2020-01-01") + 1:10 - 1L, + value = 1:10 +)) + +test_that("Forbidden epi_df methods catches omitted column names in slide comp", { + for (f in list(mean, sum, prod, min, max, all, any, range)) { + expect_error(edf %>% epi_slide(.window_size = 7L, ~ f(.x)), + class = "epiprocess__summarizer_on_entire_epi_df" + ) + expect_error(edf %>% group_by(geo_value) %>% epi_slide(.window_size = 7L, ~ f(.x)), + class = "epiprocess__summarizer_on_entire_epi_df" + ) + } +}) + +test_that("Forbidden epi_df methods have decent error messages", { + expect_snapshot(error = TRUE, edf %>% epi_slide(.window_size = 7L, ~ mean(.x))) + expect_snapshot(error = TRUE, edf %>% epi_slide(.window_size = 7L, ~ sum(.x))) + expect_snapshot(error = TRUE, edf %>% epi_slide(.window_size = 7L, ~ min(.x))) + expect_snapshot(error = TRUE, edf %>% epi_slide(.window_size = 7L, ~ range(.x))) +}) diff --git a/tests/testthat/test-epi_slide.R b/tests/testthat/test-epi_slide.R index f369fe15..d644e9a7 100644 --- a/tests/testthat/test-epi_slide.R +++ b/tests/testthat/test-epi_slide.R @@ -1,1123 +1,608 @@ -## Create an epi. 
df and a function to test epi_slide with - -test_date <- as.Date("2020-01-01") -days_dt <- as.difftime(1, units = "days") -weeks_dt <- as.difftime(1, units = "weeks") - -ungrouped <- dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = test_date + 1:200, value = 1:200), - dplyr::tibble(geo_value = "al", time_value = test_date + 1:5, value = -(1:5)) -) %>% - as_epi_df() -grouped <- ungrouped %>% - group_by(geo_value) - -small_x <- dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = test_date + 1:5, value = 11:15), - dplyr::tibble(geo_value = "al", time_value = test_date + 1:5, value = -(1:5)) -) %>% - as_epi_df(as_of = test_date + 6) %>% - group_by(geo_value) - -f <- function(x, g, t) dplyr::tibble(value = mean(x$value), count = length(x$value)) - -toy_edf <- tibble::tribble( - ~geo_value, ~time_value, ~value, - "a", test_date + 1:10, 2L^(1:10), - "b", test_date + 1:10, 2L^(11:20), -) %>% - tidyr::unchop(c(time_value, value)) %>% - as_epi_df(as_of = test_date + 100) - -# nolint start: line_length_linter. -basic_sum_result <- tibble::tribble( - ~geo_value, ~time_value, ~value, ~slide_value, - "a", test_date + 1:10, 2L^(1:10), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE), - "b", test_date + 1:10, 2L^(11:20), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE), -) %>% - tidyr::unchop(c(time_value, value, slide_value)) %>% - dplyr::arrange(time_value) %>% - as_epi_df(as_of = test_date + 100) +library(cli) +library(dplyr) +library(purrr) + +num_rows_per_group <- 30 +get_test_date <- function(time_type = "day") { + switch(time_type, + day = as.Date("2020-01-01"), + week = as.Date("2020-01-01"), + yearmonth = tsibble::make_yearmonth(year = 2022, month = 1), + integer = 2022L + ) +} +get_test_units <- function(time_type = "day") { + switch(time_type, + day = as.difftime(1, units = "days"), + week = as.difftime(1, units = "weeks"), + yearmonth = 1L, + integer = 1L + ) +} +# Returns a tibble with two geos on the same time index and one geo with a +# different but overlapping time index. Each geo has a missing value somewhere +# in the middle and a separate reported NA elsewhere. +get_test_dataset <- function(n, time_type = "day", other_keys = FALSE) { + checkmate::assert_integerish(n, lower = 1) + checkmate::assert_character(time_type) + checkmate::assert_logical(other_keys) + # Do this to actually get n rows per group. + n_ <- n - 1 + + values <- vctrs::vec_assign(0:n_, floor(n * 2 / 3), value = NA_real_) + test_date <- get_test_date(time_type) + units <- get_test_units(time_type) + df <- tibble::tribble( + ~geo_value, ~time_value, ~value, + "a", test_date + units * 0:n_, values**2, + "b", test_date + units * 0:n_, (10 * n + values)**2, + "c", test_date + units * (floor(n / 2) + 0:n_), (100 * n + values)**2, + ) %>% + tidyr::unnest_longer(c("time_value", "value")) %>% + slice(-10) -basic_mean_result <- tibble::tribble( - ~geo_value, ~time_value, ~value, ~slide_value, - "a", test_date + 1:10, 2L^(1:10), data.table::frollmean(2L^(1:10), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE), -) %>% - tidyr::unchop(c(time_value, value, slide_value)) %>% - dplyr::arrange(time_value) %>% - as_epi_df(as_of = test_date + 100) -# nolint end: line_length_linter. 
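+  # When `other_keys = TRUE`, stack two value-shifted copies of the panel,
+  # distinguished by a new key column `x`, so the resulting `epi_df` carries
+  # an extra key beyond `geo_value` and `time_value` (see the `as_epi_df`
+  # call below with `other_keys = "x"`).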
+ if (other_keys) { + df <- bind_rows( + df %>% mutate(x = 1, value = .data$value + 1), + df %>% mutate(x = 2, value = .data$value + 2), + ) %>% + as_epi_df(as_of = test_date + n, other_keys = "x") + } else { + df <- df %>% + as_epi_df(as_of = test_date + n) + } + df %>% + arrange_canonical() %>% + group_epi_df(exclude = "time_value") +} +test_data <- get_test_dataset(num_rows_per_group, "day") + +epi_slide_sum_test <- function( + .x, + .window_size = 7, .align = "right", .ref_time_values = NULL, .all_rows = FALSE) { + checkmate::assert_class(.x, "epi_df") + if (!(checkmate::test_integerish(.window_size, lower = 1, upper = Inf) || identical(as.numeric(.window_size), Inf))) { + cli::cli_abort("`.window_size` must be a positive integer or Inf.") + } + checkmate::assert_character(.align) + checkmate::assert_subset(.align, c("right", "center", "left")) + checkmate::assert( + checkmate::checkClass(.ref_time_values, "Date", null.ok = TRUE), + checkmate::checkClass(.ref_time_values, "yearmonth"), + checkmate::checkClass(.ref_time_values, "numeric") + ) + checkmate::assert_logical(.all_rows) + + time_type <- attr(.x, "metadata")$time_type + window_args <- get_before_after_from_window(.window_size, .align, time_type) + date_seq_list <- full_date_seq(.x, window_args$before, window_args$after, time_type) + if (is.null(.ref_time_values)) { + .ref_time_values <- date_seq_list$all_dates + } -## --- These cases generate errors (or not): --- -test_that("`before` and `after` are both vectors of length 1", { - expect_error( - epi_slide(grouped, f, before = c(0, 1), after = 0, ref_time_values = test_date + 3), - "Expected `before` to be a scalar value." - ) - expect_error( - epi_slide(grouped, f, before = 1, after = c(0, 1), ref_time_values = test_date + 3), - "Expected `after` to be a scalar value." - ) - expect_error( - epi_slide_mean(grouped, col_names = value, before = c(0, 1), after = 0, ref_time_values = test_date + 3), - "Expected `before` to be a scalar value." - ) - expect_error( - epi_slide_mean(grouped, col_names = value, before = 1, after = c(0, 1), ref_time_values = test_date + 3), - "Expected `after` to be a scalar value." - ) + .x %>% + mutate(.real = TRUE) %>% + group_epi_df(exclude = "time_value") %>% + complete(time_value = vctrs::vec_c(!!!date_seq_list, .name_spec = rlang::zap())) %>% + arrange_canonical() %>% + group_epi_df(exclude = "time_value") %>% + mutate( + slide_value = slider::slide_index_sum( + .data$value, + .data$time_value, + before = window_args$before, + after = window_args$after + ) + ) %>% + # If .all_rows = TRUE, we need to keep all rows and NA out the ones not in + # the ref_time_values. Otherwise, we need to return only the rows in + # ref_time_values. 
+    group_modify(~ {
+      available_ref_time_values <- .ref_time_values[.ref_time_values %in% .$time_value]
+
+      if (.all_rows) {
+        dplyr::mutate(., slide_value = dplyr::if_else(time_value %in% available_ref_time_values, slide_value, NA))
+      } else {
+        dplyr::filter(., time_value %in% available_ref_time_values)
+      }
+    }) %>%
+    dplyr::filter(.data$.real) %>%
+    select(-.real) %>%
+    relocate(all_of(key_colnames(.x)), .before = 1)
+}
+concatenate_list_params <- function(p) {
+  paste(paste0(names(p), "=", p), collapse = "\n")
+}
+is_null_or_na <- function(x) {
+  is.null(x) ||
+    (is.na(x) && (is.logical(x) || is.double(x))) ||
+    identical(x, list(NULL)) ||
+    identical(x, list(NA)) ||
+    identical(x, list(NA_real_))
+}
+test_that("is_null_or_na works", {
+  x1 <- NULL
+  x2 <- NA
+  x3 <- NA_real_
+  x4 <- 1
+  x5 <- "NA"
+  x6 <- list(NULL)
+  x7 <- list(NA)
+  x8 <- list(NA_real_)
+
+  expect_true(is_null_or_na(x1))
+  expect_true(is_null_or_na(x2))
+  expect_true(is_null_or_na(x3))
+  expect_false(is_null_or_na(x4))
+  expect_false(is_null_or_na(x5))
+  expect_true(is_null_or_na(x6))
+  expect_true(is_null_or_na(x7))
+  expect_true(is_null_or_na(x8))
+})
+expect_equal_handle_null <- function(x, y) {
+  x_na_mask <- purrr::map_lgl(x, is_null_or_na)
+  y_na_mask <- purrr::map_lgl(y, is_null_or_na)
+  testthat::expect_equal(x_na_mask, y_na_mask)
+  testthat::expect_equal(x[!x_na_mask], y[!y_na_mask])
+}
+
+
+# Core functionality tests across an exhaustive combination of parameters on
+# non-trivial data sets with three geo_groups, with non-identical time indices,
+# with missing time values, and with reported NA values.
+#
+# .ref_time_values can be:
+# - NULL is a special case where we just use all the unique time_values in the
+#   data.
+# - c(1, 2) corresponds to test_date + 1 * units and test_date + 2 * units.
+#   This is outside the time_value index for group c and is close to the
+#   left edge for a and b, so if window_size = 7, the output should be
+#   either empty or NA (depending on whether .all_rows is TRUE).
+# - c(8, 9) corresponds to test_date + 8 * units and test_date + 9 * units.
+#   In this case, groups a and b have values, but c does not.
+#
+# We filter down to reduce the number of combinations:
+# - Since time_types only interact with .ref_time_values, we fix all the other
+#   parameters to a single common value.
+# - We separate out .window_size=Inf, because it is only defined for
+#   .align="right".
+# - We test .align and .all_rows separately, with a fixed .time_type and
+#   .other_keys.
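+# Worked example of the encoding above: with .time_type = "day", `units` is a
+# one-day difftime and `test_date` is 2020-01-01, so .ref_time_values = c(1, 2)
+# is translated (via `test_date + units * p$.ref_time_values` in the loop
+# below) to as.Date(c("2020-01-02", "2020-01-03")).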
+param_combinations <- bind_rows( + tidyr::expand_grid( + .time_type = c("day", "week", "yearmonth", "integer"), + .other_keys = c(TRUE), + .ref_time_values = list(NULL, c(1, 2), c(8, 9)), + .all_rows = c(TRUE), + .align = c("right"), + .window_size = c(7), + ), + tidyr::expand_grid( + .time_type = c("day", "week", "yearmonth", "integer"), + .other_keys = c(TRUE), + .ref_time_values = list(NULL, c(1, 2), c(8, 9)), + .all_rows = c(TRUE), + .align = c("right"), + .window_size = c(Inf), + ), + tidyr::expand_grid( + .time_type = c("day"), + .other_keys = c(FALSE), + .ref_time_values = list(NULL, c(1, 2), c(8, 9)), + .all_rows = c(FALSE, TRUE), + .align = c("right", "center", "left"), + .window_size = c(7), + ), +) +for (p in (param_combinations %>% transpose())) { + test_data <- get_test_dataset(num_rows_per_group, p$.time_type, p$.other_keys) + units <- get_test_units(p$.time_type) + test_date <- get_test_date(p$.time_type) + p$.window_size <- p$.window_size * units + if (!is.null(p$.ref_time_values)) { + p$.ref_time_values <- test_date + units * p$.ref_time_values + } + slide_args <- p[setdiff(names(p), c(".time_type", ".other_keys"))] -test_that("Test errors/warnings for discouraged features", { - expect_error( - epi_slide(grouped, f, ref_time_values = test_date + 1), - "`before` is a required argument." - ) - - expect_error( - epi_slide_mean(grouped, col_names = value, ref_time_values = test_date + 1), - "`before` is a required argument." - ) - - expect_no_warning( - ref1 <- epi_slide(grouped, f, before = days_dt, ref_time_values = test_date + 2) - ) - expect_no_warning( - ref2 <- epi_slide(grouped, f, after = days_dt, ref_time_values = test_date + 2) + test_that( + format_inline( + "epi_slide works correctly with formula vector output and params:\n", + concatenate_list_params(p) + ), + { + out <- rlang::inject(epi_slide(test_data, .f = ~ sum(.x$value), !!!slide_args)) + expected_out <- rlang::inject(epi_slide_sum_test(test_data, !!!slide_args)) + expect_equal( + out, + expected_out + ) + } ) - expect_no_warning( - opt1 <- epi_slide_mean(grouped, - col_names = value, - before = days_dt, ref_time_values = test_date + 2, na.rm = TRUE - ) - ) - expect_no_warning( - opt2 <- epi_slide_mean(grouped, - col_names = value, - after = days_dt, ref_time_values = test_date + 2, na.rm = TRUE - ) + test_that( + format_inline( + "epi_slide works correctly with formula data.frame output and params:\n", + concatenate_list_params(p) + ), + { + out <- rlang::inject(epi_slide(test_data, .f = ~ data.frame(slide_value = sum(.x$value)), !!!slide_args)) + expected_out <- rlang::inject(epi_slide_sum_test(test_data, !!!slide_args)) + expect_equal( + out, + expected_out + ) + } ) - # Results from epi_slide and epi_slide_mean should match - expect_equal(select(ref1, -slide_value_count), opt1) - expect_equal(select(ref2, -slide_value_count), opt2) -}) + test_that( + format_inline( + "epi_slide works correctly with formula list output and params:\n", + concatenate_list_params(p) + ), + { + out <- rlang::inject(epi_slide(test_data, .f = ~ list(sum(.x$value)), !!!slide_args)) + expected_out <- rlang::inject(epi_slide_sum_test(test_data, !!!slide_args)) %>% + rowwise() %>% + mutate(slide_value = list(slide_value)) %>% + ungroup() %>% + as_epi_df(as_of = attr(test_data, "metadata")$as_of, other_keys = attr(test_data, "metadata")$other_keys) %>% + group_epi_df(exclude = "time_value") -test_that("Both `before` and `after` must be non-NA, non-negative, integer-compatible", { - expect_error( - epi_slide(grouped, f, before = 
-1L, ref_time_values = test_date + 2L), - "Expected `before` to be a difftime with units in days or a non-negative integer." - ) - expect_error( - epi_slide(grouped, f, after = -1L, ref_time_values = test_date + 2L), - "Expected `after` to be a difftime with units in days or a non-negative integer." - ) - expect_error(epi_slide(grouped, f, before = "a", after = days_dt, ref_time_values = test_date + 2L), - regexp = "Expected `before` to be a difftime with units in days or a non-negative integer." - ) - expect_error(epi_slide(grouped, f, before = days_dt, after = "a", ref_time_values = test_date + 2L), - regexp = "Expected `after` to be a difftime with units in days or a non-negative integer." - ) - expect_error(epi_slide(grouped, f, before = 0.5, after = days_dt, ref_time_values = test_date + 2L), - regexp = "Expected `before` to be a difftime with units in days or a non-negative integer." - ) - expect_error(epi_slide(grouped, f, before = days_dt, after = 0.5, ref_time_values = test_date + 2L), - regexp = "Expected `after` to be a difftime with units in days or a non-negative integer." - ) - expect_error( - epi_slide(grouped, f, before = NA, after = 1L, ref_time_values = test_date + 2L), - "Expected `before` to be a scalar value." - ) - expect_error( - epi_slide(grouped, f, before = days_dt, after = NA, ref_time_values = test_date + 2L), - "Expected `after` to be a scalar value." + expect_equal( + out %>% select(-slide_value), + expected_out %>% select(-slide_value) + ) + expect_equal_handle_null(out$slide_value, expected_out$slide_value) + } ) - expect_error( - epi_slide_mean(grouped, col_names = value, before = -1L, ref_time_values = test_date + 2L), - "Expected `before` to be a difftime with units in days or a non-negative integer." - ) - expect_error( - epi_slide_mean(grouped, col_names = value, after = -1L, ref_time_values = test_date + 2L), - "Expected `after` to be a difftime with units in days or a non-negative integer." - ) - expect_error( - epi_slide_mean(grouped, col_names = value, before = "a", ref_time_values = test_date + 2L), - regexp = "Expected `before` to be a difftime with units in days or a non-negative integer." - ) - expect_error( - epi_slide_mean(grouped, col_names = value, after = "a", ref_time_values = test_date + 2L), - regexp = "Expected `after` to be a difftime with units in days or a non-negative integer." - ) - expect_error( - epi_slide_mean(grouped, col_names = value, before = 0.5, ref_time_values = test_date + 2L), - regexp = "Expected `before` to be a difftime with units in days or a non-negative integer." - ) - expect_error( - epi_slide_mean(grouped, col_names = value, after = 0.5, ref_time_values = test_date + 2L), - regexp = "Expected `after` to be a difftime with units in days or a non-negative integer." - ) - expect_error( - epi_slide_mean(grouped, col_names = value, before = NA, after = days_dt, ref_time_values = test_date + 2L), - "Expected `before` to be a scalar value." - ) - expect_error( - epi_slide_mean(grouped, col_names = value, before = days_dt, after = NA, ref_time_values = test_date + 2L), - "Expected `after` to be a scalar value." 
+ test_that( + format_inline( + "epi_slide works correctly with formula tibble list output and params:\n", + concatenate_list_params(p) + ), + { + out <- rlang::inject(epi_slide(test_data, .f = ~ tibble(slide_value = list(sum(.x$value))), !!!slide_args)) + expected_out <- rlang::inject(epi_slide_sum_test(test_data, !!!slide_args)) %>% + rowwise() %>% + mutate(slide_value = list(slide_value)) %>% + ungroup() %>% + as_epi_df(as_of = attr(test_data, "metadata")$as_of, other_keys = attr(test_data, "metadata")$other_keys) %>% + group_epi_df(exclude = "time_value") + expect_equal( + out %>% select(-slide_value), + expected_out %>% select(-slide_value) + ) + expect_equal_handle_null(out$slide_value, expected_out$slide_value) + } ) - # Non-integer-class but integer-compatible values are allowed: - expect_no_error( - ref <- epi_slide(grouped, f, before = days_dt, after = days_dt, ref_time_values = test_date + 2L) + test_that( + format_inline( + "epi_slide works with unnamed data-masking data.frame and params:\n", + concatenate_list_params(p) + ), + { + expected_out <- rlang::inject(epi_slide_sum_test(test_data, !!!slide_args)) + expect_equal( + rlang::inject(epi_slide( + test_data, , data.frame(slide_value = sum(.x$value)), + !!!slide_args + )), + expected_out + ) + } ) - expect_no_error(opt <- epi_slide_mean( - grouped, - col_names = value, before = days_dt, after = days_dt, - ref_time_values = test_date + 2L, na.rm = TRUE - )) - - # Results from epi_slide and epi_slide_mean should match - expect_equal(select(ref, -slide_value_count), opt) -}) - -test_that("`ref_time_values` + `before` + `after` that result in no slide data, generate the error", { - expect_error( - epi_slide(grouped, f, before = 2 * days_dt, ref_time_values = test_date), - "`ref_time_values` must be a unique subset of the time values in `x`." - ) # before the first, no data in the slide windows - expect_error( - epi_slide(grouped, f, before = 2 * days_dt, ref_time_values = test_date + 207L), - "`ref_time_values` must be a unique subset of the time values in `x`." - ) # beyond the last, no data in window - expect_error( - epi_slide_mean(grouped, col_names = value, before = 2 * days_dt, ref_time_values = test_date), - "`ref_time_values` must be a unique subset of the time values in `x`." - ) # before the first, no data in the slide windows - expect_error( - epi_slide_mean( - grouped, - col_names = value, - before = 2 * days_dt, - ref_time_values = test_date + 207L + test_that( + format_inline( + "epi_slide and epi_slide_opt/sum/mean outputs are consistent. Params:\n", + concatenate_list_params(p) ), - "`ref_time_values` must be a unique subset of the time values in `x`." - ) # beyond the last, no data in window -}) - -test_that( - c( - "`ref_time_values` + `before` + `after` that have some slide data, but - generate the error due to ref. time being out of time range (would - also happen if they were in between `time_value`s)" - ), - { - expect_error( - epi_slide(grouped, f, after = 2 * days_dt, ref_time_values = test_date), - "`ref_time_values` must be a unique subset of the time values in `x`." - ) # before the first, but we'd expect there to be data in the window - expect_error( - epi_slide(grouped, f, before = 2 * days_dt, ref_time_values = test_date + 201L), - "`ref_time_values` must be a unique subset of the time values in `x`." 
- ) # beyond the last, but still with data in window - - expect_error( - epi_slide_mean(grouped, value, after = 2 * days_dt, ref_time_values = test_date), - "`ref_time_values` must be a unique subset of the time values in `x`." - ) # before the first, but we'd expect there to be data in the window - expect_error( - epi_slide_mean(grouped, value, before = 2 * days_dt, ref_time_values = test_date + 201L), - "`ref_time_values` must be a unique subset of the time values in `x`." - ) # beyond the last, but still with data in window - } -) - -## --- These cases doesn't generate the error: --- -test_that( - c( - "these doesn't produce an error; the error appears only if the ref time - values are out of the range for every group" - ), - { - expect_equal( - epi_slide(grouped, f, before = 2 * days_dt, ref_time_values = test_date + 200L) %>% - ungroup() %>% - dplyr::select("geo_value", "slide_value_value"), - dplyr::tibble(geo_value = "ak", slide_value_value = 199) - ) # out of range for one group - expect_equal( - epi_slide(grouped, f, before = 2 * days_dt, ref_time_values = test_date + 3) %>% - ungroup() %>% - dplyr::select("geo_value", "slide_value_value"), - dplyr::tibble(geo_value = c("ak", "al"), slide_value_value = c(2, -2)) - ) # not out of range for either group - - expect_equal( - epi_slide_mean( - grouped, value, - before = 2 * days_dt, ref_time_values = test_date + 200L, na.rm = TRUE - ) %>% - ungroup() %>% - dplyr::select("geo_value", "slide_value_value"), - dplyr::tibble(geo_value = "ak", slide_value_value = 199) - ) # out of range for one group - expect_equal( - epi_slide_mean( - grouped, value, - before = 2 * days_dt, ref_time_values = test_date + 3, na.rm = TRUE - ) %>% - ungroup() %>% - dplyr::select("geo_value", "slide_value_value"), - dplyr::tibble(geo_value = c("ak", "al"), slide_value_value = c(2, -2)) - ) # not out of range for either group - } -) - -test_that("computation output formats x as_list_col", { - # See `toy_edf` and `basic_sum_result` definitions at top of file. - # We'll try 7d sum with a few formats. 
- expect_equal( - toy_edf %>% - epi_slide(before = 6 * days_dt, ~ sum(.x$value)), - basic_sum_result - ) - expect_equal( - toy_edf %>% - epi_slide(before = 6 * days_dt, ~ sum(.x$value), as_list_col = TRUE), - basic_sum_result %>% dplyr::mutate(slide_value = as.list(slide_value)) - ) - expect_equal( - toy_edf %>% - epi_slide(before = 6 * days_dt, ~ data.frame(value = sum(.x$value))), - basic_sum_result %>% rename(slide_value_value = slide_value) - ) - expect_equal( - toy_edf %>% - epi_slide(before = 6 * days_dt, ~ data.frame(value = sum(.x$value)), as_list_col = TRUE), - basic_sum_result %>% - mutate(slide_value = purrr::map(slide_value, ~ data.frame(value = .x))) + { + out_sum <- rlang::inject(epi_slide(test_data, ~ sum(.x$value), !!!slide_args)) %>% + rename(slide_value_value = slide_value) + out_mean <- rlang::inject(epi_slide(test_data, ~ mean(.x$value), !!!slide_args)) %>% + rename(slide_value_value = slide_value) + + expect_equal( + out_sum, + rlang::inject(epi_slide_opt(test_data, value, .f = data.table::frollsum, !!!slide_args)) + ) + expect_equal( + out_sum, + rlang::inject(epi_slide_opt(test_data, value, .f = slider::slide_sum, !!!slide_args)) + ) + expect_equal( + out_sum, + rlang::inject(epi_slide_sum(test_data, value, !!!slide_args)) + ) + expect_equal( + out_mean, + rlang::inject(epi_slide_opt(test_data, value, .f = data.table::frollmean, !!!slide_args)) + ) + expect_equal( + out_mean, + rlang::inject(epi_slide_opt(test_data, value, .f = slider::slide_mean, !!!slide_args)) + ) + expect_equal( + out_mean, + rlang::inject(epi_slide_mean(test_data, value, !!!slide_args)) + ) + } ) -}) +} -test_that("epi_slide_mean errors when `as_list_col` non-NULL", { - # See `toy_edf` and `basic_mean_result` definitions at top of file. - # We'll try 7d avg with a few formats. - # Warning: not exactly the same naming behavior as `epi_slide`. 
+test_that(".window_size as integer works", { expect_equal( - toy_edf %>% - filter( - geo_value == "a" - ) %>% - epi_slide_mean( - value, - before = 6 * days_dt, na.rm = TRUE - ), - basic_mean_result %>% dplyr::mutate( - slide_value_value = slide_value - ) %>% - select(-slide_value) - ) - expect_error( - toy_edf %>% - filter( - geo_value == "a" - ) %>% - epi_slide_mean( - value, - before = 6 * days_dt, as_list_col = TRUE, na.rm = TRUE - ), - class = "epiprocess__epi_slide_opt__list_not_supported" + epi_slide(test_data, ~ sum(.x$value), .window_size = 7), + epi_slide_sum_test(test_data, .window_size = 7) ) - # `epi_slide_mean` doesn't return dataframe columns }) -test_that("nested dataframe output names are controllable", { - expect_equal( - toy_edf %>% - epi_slide( - before = 6 * days_dt, ~ data.frame(value = sum(.x$value)), - new_col_name = "result" - ), - basic_sum_result %>% rename(result_value = slide_value) - ) - expect_equal( - toy_edf %>% - epi_slide( - before = 6 * days_dt, ~ data.frame(value_sum = sum(.x$value)), - names_sep = NULL - ), - basic_sum_result %>% rename(value_sum = slide_value) +bad_values <- list( + "a", 0.5, -1L, -1.5, 1.5, NA, c(0, 1) +) +for (bad_value in bad_values) { + test_that( + format_inline("`.window_size` fails on {bad_value}"), + { + expect_error( + epi_slide(test_data, ~ sum(.x), .window_size = bad_value), + class = "epiprocess__validate_slide_window_arg" + ) + expect_error( + epi_slide_mean(test_data, ~ sum(.x), .col_names = value, .window_size = bad_value), + class = "epiprocess__validate_slide_window_arg" + ) + } ) -}) +} -test_that("non-size-1 outputs are recycled", { - # trying with non-size-1 computation outputs: - # nolint start: line_length_linter. - basic_result_from_size2 <- tibble::tribble( - ~geo_value, ~time_value, ~value, ~slide_value, - "a", test_date + 1:10, 2L^(1:10), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE), - "b", test_date + 1:10, 2L^(11:20), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE) + 1L, - ) %>% - tidyr::unchop(c(time_value, value, slide_value)) %>% - dplyr::arrange(time_value) %>% - as_epi_df(as_of = test_date + 100) - # nolint end - expect_equal( - toy_edf %>% epi_slide(before = 6 * days_dt, ~ sum(.x$value) + 0:1), - basic_result_from_size2 - ) - expect_equal( - toy_edf %>% epi_slide(before = 6 * days_dt, ~ sum(.x$value) + 0:1, as_list_col = TRUE), - basic_result_from_size2 %>% dplyr::mutate(slide_value = as.list(slide_value)) - ) - expect_equal( - toy_edf %>% epi_slide(before = 6 * days_dt, ~ data.frame(value = sum(.x$value) + 0:1)), - basic_result_from_size2 %>% rename(slide_value_value = slide_value) +test_that(format_inline("epi_slide should fail when `.ref_time_values` is out of range for all groups "), { + bad_values <- c(min(test_data$time_value) - 1, max(test_data$time_value) + 1) + expect_error( + epi_slide(test_data, ~ sum(.x), .ref_time_values = bad_values, .window_size = 7), + class = "epiprocess__epi_slide_invalid_ref_time_values" ) - expect_equal( - toy_edf %>% epi_slide( - before = 6 * days_dt, ~ data.frame(value = sum(.x$value) + 0:1), as_list_col = TRUE - ), - basic_result_from_size2 %>% - mutate(slide_value = purrr::map(slide_value, ~ data.frame(value = .x))) + expect_error( + epi_slide_mean(test_data, .col_names = value, .ref_time_values = bad_values, .window_size = 7), + class = "epiprocess__epi_slide_opt_invalid_ref_time_values" ) }) test_that("epi_slide alerts if the provided f doesn't take enough 
args", { - f_xgt <- function(x, g, t) dplyr::tibble(value = mean(x$value), count = length(x$value)) - # If `regexp` is NA, asserts that there should be no errors/messages. - expect_error( - epi_slide(grouped, f_xgt, before = days_dt, ref_time_values = test_date + 1), - regexp = NA + f_tib_avg_count <- function(x, g, t) dplyr::tibble(avg = mean(x$value), count = length(x$value)) + expect_no_error( + epi_slide(test_data, f_tib_avg_count, .window_size = 7), ) - expect_warning( - epi_slide(grouped, f_xgt, before = days_dt, ref_time_values = test_date + 1), - regexp = NA + expect_no_warning( + epi_slide(test_data, f_tib_avg_count, .window_size = 7), ) - f_x_dots <- function(x, ...) dplyr::tibble(value = mean(x$value), count = length(x$value)) - expect_warning(epi_slide(grouped, f_x_dots, before = days_dt, ref_time_values = test_date + 1), + f_x_dots <- function(x, ...) dplyr::tibble(mean_value = mean(x$value), count = length(x$value)) + expect_warning( + epi_slide(test_data, f_x_dots, .window_size = 7), class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots" ) }) -test_that("`ref_time_values` + `all_rows = TRUE` works", { - # See `toy_edf` definition at top of file. We'll do variants of a slide - # returning the following: - # nolint start: line_length_linter. - basic_full_result <- tibble::tribble( - ~geo_value, ~time_value, ~value, ~slide_value, - "a", test_date + 1:10, 2L^(1:10), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE), - "b", test_date + 1:10, 2L^(11:20), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE), - ) %>% - tidyr::unchop(c(time_value, value, slide_value)) %>% - dplyr::arrange(time_value) %>% - as_epi_df(as_of = test_date + 100) - # nolint end - # slide computations returning atomic vecs: - expect_equal( - toy_edf %>% epi_slide(before = 6 * days_dt, ~ sum(.x$value)), - basic_full_result - ) - expect_equal( - toy_edf %>% epi_slide( - before = 6 * days_dt, ~ sum(.x$value), - ref_time_values = test_date + c(2L, 8L) - ), - basic_full_result %>% dplyr::filter(time_value %in% (test_date + c(2L, 8L))) - ) +test_that("epi_slide computation via f can use ref_time_value", { + expected_out <- test_data %>% mutate(slide_value = time_value) expect_equal( - toy_edf %>% epi_slide( - before = 6 * days_dt, ~ sum(.x$value), - ref_time_values = test_date + c(2L, 8L), all_rows = TRUE - ), - basic_full_result %>% - dplyr::mutate(slide_value = dplyr::if_else(time_value %in% (test_date + c(2L, 8L)), - slide_value, NA_integer_ - )) + epi_slide(test_data, ~.ref_time_value, .window_size = 7), + expected_out ) - expect_equal( - toy_edf %>% filter( - geo_value == "a" - ) %>% - epi_slide_mean( - value, - before = 6 * days_dt, names_sep = NULL, na.rm = TRUE - ), - basic_mean_result %>% - rename(slide_value_value = slide_value) + epi_slide(test_data, ~.z, .window_size = 7), + expected_out ) expect_equal( - toy_edf %>% filter( - geo_value == "a" - ) %>% - epi_slide_mean( - value, - before = 6 * days_dt, ref_time_values = test_date + c(2L, 8L), - names_sep = NULL, na.rm = TRUE - ), - filter(basic_mean_result, time_value %in% (test_date + c(2L, 8L))) %>% - rename(slide_value_value = slide_value) + epi_slide(test_data, ~..3, .window_size = 7), + expected_out ) expect_equal( - toy_edf %>% filter( - geo_value == "a" - ) %>% - epi_slide_mean( - value, - before = 6 * days_dt, ref_time_values = test_date + c(2L, 8L), all_rows = TRUE, - names_sep = NULL, na.rm = TRUE - ), - basic_mean_result %>% 
- dplyr::mutate(slide_value_value = dplyr::if_else(time_value %in% (test_date + c(2L, 8L)), - slide_value, NA_integer_ - )) %>% - select(-slide_value) + epi_slide(test_data, .f = function(x, g, t) t, .window_size = 7), + expected_out ) +}) - # slide computations returning data frames: +test_that("epi_slide computation via f can use group", { + expected_out <- test_data %>% mutate(slide_value = geo_value) expect_equal( - toy_edf %>% epi_slide(before = 6 * days_dt, ~ data.frame(value = sum(.x$value))), - basic_full_result %>% dplyr::rename(slide_value_value = slide_value) + epi_slide(test_data, .f = ~ .group_key$geo_value, .window_size = 7), + expected_out ) expect_equal( - toy_edf %>% epi_slide( - before = 6 * days_dt, ~ data.frame(value = sum(.x$value)), - ref_time_values = test_date + c(2L, 8L) - ), - basic_full_result %>% - dplyr::filter(time_value %in% (test_date + c(2L, 8L))) %>% - dplyr::rename(slide_value_value = slide_value) + epi_slide(test_data, .f = ~ .y$geo_value, .window_size = 7), + expected_out ) expect_equal( - toy_edf %>% epi_slide( - before = 6 * days_dt, ~ data.frame(value = sum(.x$value)), - ref_time_values = test_date + c(2L, 8L), all_rows = TRUE - ), - basic_full_result %>% - dplyr::mutate(slide_value = dplyr::if_else(time_value %in% (test_date + c(2L, 8L)), - slide_value, NA_integer_ - )) %>% - dplyr::rename(slide_value_value = slide_value) - ) - # slide computations returning data frames with `as_list_col=TRUE`: - expect_equal( - toy_edf %>% epi_slide( - before = 6 * days_dt, ~ data.frame(value = sum(.x$value)), - as_list_col = TRUE - ), - basic_full_result %>% - dplyr::mutate(slide_value = purrr::map(slide_value, ~ data.frame(value = .x))) - ) - expect_equal( - toy_edf %>% epi_slide( - before = 6 * days_dt, ~ data.frame(value = sum(.x$value)), - ref_time_values = test_date + c(2L, 8L), - as_list_col = TRUE - ), - basic_full_result %>% - dplyr::mutate(slide_value = purrr::map(slide_value, ~ data.frame(value = .x))) %>% - dplyr::filter(time_value %in% (test_date + c(2L, 8L))) - ) - expect_equal( - toy_edf %>% epi_slide( - before = 6 * days_dt, ~ data.frame(value = sum(.x$value)), - ref_time_values = test_date + c(2L, 8L), all_rows = TRUE, - as_list_col = TRUE - ), - basic_full_result %>% - dplyr::mutate(slide_value = purrr::map(slide_value, ~ data.frame(value = .x))) %>% - dplyr::mutate(slide_value = dplyr::if_else(time_value %in% (test_date + c(2L, 8L)), - slide_value, list(NULL) - )) + epi_slide(test_data, .f = ~ ..2$geo_value, .window_size = 7), + expected_out ) - # slide computations returning data frames, `as_list_col = TRUE`, `unnest`: expect_equal( - toy_edf %>% epi_slide( - before = 6 * days_dt, ~ data.frame(value = sum(.x$value)), - as_list_col = TRUE - ) %>% - unnest(slide_value, names_sep = "_"), - basic_full_result %>% dplyr::rename(slide_value_value = slide_value) + epi_slide(test_data, .f = function(x, g, t) g$geo_value, .window_size = 7), + expected_out ) +}) + +test_that("epi_slide computation via dots can use ref_time_value", { expect_equal( - toy_edf %>% epi_slide( - before = 6 * days_dt, ~ data.frame(value = sum(.x$value)), - ref_time_values = test_date + c(2L, 8L), - as_list_col = TRUE - ) %>% - unnest(slide_value, names_sep = "_"), - basic_full_result %>% - dplyr::filter(time_value %in% (test_date + c(2L, 8L))) %>% - dplyr::rename(slide_value_value = slide_value) + epi_slide(test_data, slide_value = .ref_time_value, .window_size = 7), + mutate(test_data, slide_value = time_value) ) +}) + +test_that("epi_slide computation via dots can use group", { 
expect_equal( - toy_edf %>% epi_slide( - before = 6 * days_dt, ~ data.frame(value = sum(.x$value)), - ref_time_values = test_date + c(2L, 8L), all_rows = TRUE, - as_list_col = TRUE - ) %>% - unnest(slide_value, names_sep = "_"), - basic_full_result %>% - # XXX unclear exactly what we want in this case. Current approach is - # compatible with `vctrs::vec_detect_missing` but breaks `tidyr::unnest` - # compatibility - dplyr::filter(time_value %in% (test_date + c(2L, 8L))) %>% - dplyr::rename(slide_value_value = slide_value) + epi_slide(test_data, slide_value = nrow(.group_key), .window_size = 7), + mutate(test_data, slide_value = 1L) ) - rework_nulls <- function(slide_values_list) { - vctrs::vec_assign( - slide_values_list, - vctrs::vec_detect_missing(slide_values_list), - list(vctrs::vec_cast(NA, vctrs::vec_ptype_common(!!!slide_values_list))) - ) - } expect_equal( - toy_edf %>% epi_slide( - before = 6 * days_dt, ~ data.frame(value = sum(.x$value)), - ref_time_values = test_date + c(2L, 8L), all_rows = TRUE, - as_list_col = TRUE - ) %>% - mutate(slide_value = rework_nulls(slide_value)) %>% - unnest(slide_value, names_sep = "_"), - basic_full_result %>% - dplyr::mutate(slide_value = dplyr::if_else(time_value %in% (test_date + c(2L, 8L)), - slide_value, NA_integer_ - )) %>% - dplyr::rename(slide_value_value = slide_value) + epi_slide(test_data, slide_value = .group_key$geo_value, .window_size = 7), + mutate(test_data, slide_value = geo_value) ) }) -test_that("`epi_slide` doesn't decay date output", { - expect_true( - ungrouped %>% - epi_slide(before = 5 * days_dt, ~ as.Date("2020-01-01")) %>% - `[[`("slide_value") %>% - inherits("Date") - ) +test_that("epi_slide computation should not allow access from .data and .env", { + expect_error(epi_slide(test_data, slide_value = .env$.ref_time_value, .window_size = 7)) + expect_error(epi_slide(test_data, slide_value = .data$.ref_time_value, .window_size = 7)) + expect_error(epi_slide(test_data, slide_value = .env$.group_key, .window_size = 7)) + expect_error(epi_slide(test_data, slide_value = .data$.group_key, .window_size = 7)) }) -test_that("basic grouped epi_slide computation produces expected output", { - expected_output <- dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = test_date + 1:5, value = 11:15, slide_value = cumsum(11:15)), - dplyr::tibble(geo_value = "al", time_value = test_date + 1:5, value = -(1:5), slide_value = cumsum(-(1:5))) - ) %>% - group_by(geo_value) %>% - as_epi_df(as_of = test_date + 6) - - # formula - result1 <- epi_slide(small_x, f = ~ sum(.x$value), before = 50 * days_dt) - expect_equal(result1, expected_output) - - # function - result2 <- epi_slide(small_x, f = function(x, g, t) sum(x$value), before = 50 * days_dt) - expect_equal(result2, expected_output) - - # dots - result3 <- epi_slide(small_x, slide_value = sum(value), before = 50 * days_dt) - expect_equal(result3, expected_output) -}) - -test_that("basic grouped epi_slide_mean computation produces expected output", { - expected_output <- dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = test_date + 1:5, value = 11:15, slide_value = cumsum(11:15) / 1:5), - dplyr::tibble(geo_value = "al", time_value = test_date + 1:5, value = -(1:5), slide_value = cumsum(-(1:5)) / 1:5) - ) %>% - group_by(geo_value) %>% - as_epi_df(as_of = test_date + 6) - - result1 <- epi_slide_mean(small_x, value, before = 50 * days_dt, names_sep = NULL, na.rm = TRUE) - expect_equal(result1, expected_output %>% rename(slide_value_value = slide_value)) -}) 
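+# A note on the pronoun aliases exercised by the tests above (a summary of
+# what those tests already assert, not additional guarantees): in a formula
+# computation, `.group_key`/`.y`/`..2` all name the one-row grouping tibble
+# and `.ref_time_value`/`.z`/`..3` all name the reference time, while a
+# function computation receives the window data, group key, and reference
+# time as its first three arguments, e.g.:
+#
+#   epi_slide(test_data, .f = function(x, g, t) t, .window_size = 7)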
+test_that("epi_slide computation via dots outputs the same result using col names and the data var", { + expected_output <- epi_slide(test_data, slide_value = max(time_value), .window_size = 7) -test_that("ungrouped epi_slide computation completes successfully", { - expect_no_error( - small_x %>% - ungroup() %>% - epi_slide( - before = 2 * days_dt, - slide_value = sum(.x$value) - ) + expect_equal( + epi_slide(test_data, slide_value = max(.x$time_value), .window_size = 7), + expected_output ) -}) - -test_that("basic ungrouped epi_slide computation produces expected output", { - expected_output <- dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = test_date + 1:5, value = 11:15, slide_value = cumsum(11:15)) - ) %>% - as_epi_df(as_of = test_date + 6) - - result1 <- small_x %>% - ungroup() %>% - filter(geo_value == "ak") %>% - epi_slide( - before = 50 * days_dt, - slide_value = sum(.x$value) - ) - expect_equal(result1, expected_output) - - # Ungrouped with multiple geos - expected_output <- dplyr::bind_rows( - dplyr::tibble( - geo_value = "ak", time_value = test_date + 1:5, value = 11:15, slide_value = cumsum(11:15) + cumsum(-(1:5)) - ), - dplyr::tibble( - geo_value = "al", time_value = test_date + 1:5, value = -(1:5), slide_value = cumsum(11:15) + cumsum(-(1:5)) - ) - ) %>% - as_epi_df(as_of = test_date + 6) %>% - arrange(time_value) - - result2 <- small_x %>% - ungroup() %>% - epi_slide( - before = 50 * days_dt, - slide_value = sum(.x$value) - ) - expect_equal(result2, expected_output) -}) - -test_that("basic ungrouped epi_slide_mean computation produces expected output", { - expected_output <- dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = test_date + 1:5, value = 11:15, slide_value = cumsum(11:15) / 1:5), - ) %>% - as_epi_df(as_of = test_date + 6) - - result1 <- small_x %>% - ungroup() %>% - filter(geo_value == "ak") %>% - epi_slide_mean(value, before = 50 * days_dt, names_sep = NULL, na.rm = TRUE) - expect_equal(result1, expected_output %>% rename(slide_value_value = slide_value)) - - # Ungrouped with multiple geos - # epi_slide_mean fails when input data groups contain duplicate time_values, - # e.g. 
aggregating across geos - expect_error( - small_x %>% ungroup() %>% epi_slide_mean(value, before = 6 * days_dt), - class = "epiprocess__epi_slide_opt__duplicate_time_values" + expect_equal( + epi_slide(test_data, slide_value = max(.data$time_value), .window_size = 7), + expected_output ) }) -test_that("epi_slide computation via formula can use ref_time_value", { - expected_output <- dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = test_date + 1:5, value = 11:15, slide_value = test_date + 1:5), - dplyr::tibble(geo_value = "al", time_value = test_date + 1:5, value = -(1:5), slide_value = test_date + 1:5) - ) %>% - group_by(geo_value) %>% - as_epi_df(as_of = test_date + 6) - - result1 <- small_x %>% - epi_slide( - f = ~.ref_time_value, - before = 50 * days_dt - ) - - expect_equal(result1, expected_output) - - result2 <- small_x %>% - epi_slide( - f = ~.z, - before = 50 * days_dt - ) - - expect_equal(result2, expected_output) - - result3 <- small_x %>% - epi_slide( - f = ~..3, - before = 50 * days_dt - ) - - expect_equal(result3, expected_output) - - # Ungrouped with multiple geos - expected_output <- dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = test_date + 1:5, value = 11:15, slide_value = test_date + 1:5), - dplyr::tibble(geo_value = "al", time_value = test_date + 1:5, value = -(1:5), slide_value = test_date + 1:5) - ) %>% - as_epi_df(as_of = test_date + 6) %>% - arrange(time_value) - - result4 <- small_x %>% - ungroup() %>% - epi_slide( - f = ~.ref_time_value, - before = 50 * days_dt - ) - expect_equal(result4, expected_output) -}) - -test_that("epi_slide computation via function can use ref_time_value", { - expected_output <- dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = test_date + 1:5, value = 11:15, slide_value = test_date + 1:5), - dplyr::tibble(geo_value = "al", time_value = test_date + 1:5, value = -(1:5), slide_value = test_date + 1:5) - ) %>% - group_by(geo_value) %>% - as_epi_df(as_of = test_date + 6) - - result1 <- small_x %>% - epi_slide( - f = function(x, g, t) t, - before = 2 * days_dt - ) - - expect_equal(result1, expected_output) +test_that("`epi_slide` can access objects inside of helper functions", { + helper <- function(archive_haystack, time_value_needle) { + epi_slide(archive_haystack, has_needle = time_value_needle %in% time_value, .window_size = 7) + } + expect_no_error(helper(test_data, as.Date("2021-01-01"))) }) -test_that("epi_slide computation via dots can use ref_time_value and group", { - # ref_time_value - expected_output <- dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = test_date + 1:5, value = 11:15, slide_value = test_date + 1:5), - dplyr::tibble(geo_value = "al", time_value = test_date + 1:5, value = -(1:5), slide_value = test_date + 1:5) - ) %>% - group_by(geo_value) %>% - as_epi_df(as_of = test_date + 6) - - result1 <- small_x %>% - epi_slide( - before = 50 * days_dt, - slide_value = .ref_time_value - ) - - expect_equal(result1, expected_output) - - # `.{x,group_key,ref_time_value}` should be inaccessible from `.data` and - # `.env`. 
- expect_error(small_x %>% - epi_slide( - before = 50 * days_dt, - slide_value = .env$.ref_time_value - )) - - # group_key - # Use group_key column - expected_output <- dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = test_date + 1:5, value = 11:15, slide_value = "ak"), - dplyr::tibble(geo_value = "al", time_value = test_date + 1:5, value = -(1:5), slide_value = "al") +test_that("epi_slide can use sequential data masking expressions including NULL", { + edf_a <- tibble::tibble( + geo_value = 1, + time_value = 1:10, + value = 1:10 * 1.0 ) %>% - group_by(geo_value) %>% - as_epi_df(as_of = test_date + 6) + as_epi_df(as_of = 12L) - result3 <- small_x %>% - epi_slide( - before = 2 * days_dt, - slide_value = .group_key$geo_value - ) - - expect_equal(result3, expected_output) - - # Use entire group_key object - expected_output <- dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = test_date + 1:5, value = 11:15, slide_value = 1L), - dplyr::tibble(geo_value = "al", time_value = test_date + 1:5, value = -(1:5), slide_value = 1L) - ) %>% + out <- edf_a %>% group_by(geo_value) %>% - as_epi_df(as_of = test_date + 6) - - result4 <- small_x %>% - epi_slide( - before = 2 * days_dt, - slide_value = nrow(.group_key) - ) - - expect_equal(result4, expected_output) - - # Ungrouped with multiple geos - expected_output <- dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = test_date + 1:5, value = 11:15, slide_value = test_date + 1:5), - dplyr::tibble(geo_value = "al", time_value = test_date + 1:5, value = -(1:5), slide_value = test_date + 1:5) - ) %>% - as_epi_df(as_of = test_date + 6) %>% - arrange(time_value) - - result5 <- small_x %>% - ungroup() %>% epi_slide( - before = 50 * days_dt, - slide_value = .ref_time_value - ) - expect_equal(result5, expected_output) -}) - -test_that("epi_slide computation via dots outputs the same result using col names and the data var", { - expected_output <- small_x %>% - epi_slide( - before = 2 * days_dt, - slide_value = max(time_value) + .window_size = 5L, .align = "center", + m1 = .x$value[1], + m5 = .x$value[5], + derived_m5 = m1 + 4, + m1 = NULL ) %>% - as_epi_df(as_of = test_date + 6) - - result1 <- small_x %>% - epi_slide( - before = 2 * days_dt, - slide_value = max(.x$time_value) - ) - - expect_equal(result1, expected_output) - - result2 <- small_x %>% - epi_slide( - before = 2 * days_dt, - slide_value = max(.data$time_value) - ) - - expect_equal(result2, expected_output) + ungroup() %>% + as_epi_df(as_of = 12L) + na_mask <- !is.na(out$m5) & !is.na(out$derived_m5) + expect_equal(out$m5[na_mask], out$derived_m5[na_mask]) + expect_true(!"m1" %in% names(out)) }) -test_that("`epi_slide` can access objects inside of helper functions", { - helper <- function(archive_haystack, time_value_needle) { - archive_haystack %>% epi_slide( - has_needle = time_value_needle %in% time_value, before = 365000L * days_dt - ) - } +test_that("epi_slide complains on invalid computation outputs", { expect_error( - helper(small_x, as.Date("2021-01-01")), - NA + epi_slide(test_data, .f = ~ lm(value ~ time_value, .x), .window_size = 7), + class = "epiprocess__invalid_slide_comp_value" ) -}) - -test_that("basic slide behavior is correct when groups have non-overlapping date ranges", { - small_x_misaligned_dates <- dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = test_date + 1:5, value = 11:15), - dplyr::tibble(geo_value = "al", time_value = test_date + 151:155, value = -(1:5)) - ) %>% - as_epi_df(as_of = test_date + 6) %>% - 
group_by(geo_value) - - expected_output <- dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = test_date + 1:5, value = 11:15, slide_value = cumsum(11:15) / 1:5), - dplyr::tibble( - geo_value = "al", time_value = test_date + 151:155, value = -(1:5), slide_value = cumsum(-(1:5)) / 1:5 - ) - ) %>% - group_by(geo_value) %>% - as_epi_df(as_of = test_date + 6) - - result1 <- epi_slide(small_x_misaligned_dates, f = ~ mean(.x$value), before = 50 * days_dt) - expect_equal(result1, expected_output) - - result2 <- epi_slide_mean( - small_x_misaligned_dates, value, - before = 50 * days_dt, names_sep = NULL, na.rm = TRUE + expect_no_error( + epi_slide(test_data, .f = ~ list(lm(value ~ time_value, .x)), .window_size = 7), + class = "epiprocess__invalid_slide_comp_value" + ) + expect_error( + epi_slide(test_data, model = lm(value ~ time_value, .x), .window_size = 7), + class = "epiprocess__invalid_slide_comp_tidyeval_output" + ) + expect_no_error( + epi_slide(test_data, model = list(lm(value ~ time_value, .x)), .window_size = 7), + class = "epiprocess__invalid_slide_comp_tidyeval_output" + ) + expect_error( + epi_slide(test_data, .f = ~ sum(.x$value) + c(0, 0, 0), .window_size = 7), + class = "epiprocess__invalid_slide_comp_value" + ) + expect_error( + epi_slide(test_data, .f = ~ as.list(sum(.x$value) + c(0, 0, 0)), .window_size = 7), + class = "epiprocess__invalid_slide_comp_value" + ) + expect_error( + epi_slide(test_data, .f = ~ data.frame(slide_value = sum(.x$value) + c(0, 0, 0)), .window_size = 7), + class = "epiprocess__invalid_slide_comp_value" ) - expect_equal(result2, expected_output %>% rename(slide_value_value = slide_value)) }) - -test_that("epi_slide gets correct ref_time_value when groups have non-overlapping date ranges", { - small_x_misaligned_dates <- dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = test_date + 1:5, value = 11:15), - dplyr::tibble(geo_value = "al", time_value = test_date + 151:155, value = -(1:5)) - ) %>% - as_epi_df(as_of = test_date + 6) %>% - group_by(geo_value) - - expected_output <- dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = test_date + 1:5, value = 11:15, slide_value = test_date + 1:5), - dplyr::tibble(geo_value = "al", time_value = test_date + 151:155, value = -(1:5), slide_value = test_date + 151:155) - ) %>% - group_by(geo_value) %>% - as_epi_df(as_of = test_date + 6) - - result1 <- small_x_misaligned_dates %>% - epi_slide( - before = 50 * days_dt, - slide_value = .ref_time_value - ) - - expect_equal(result1, expected_output) +test_that("epi_slide can use {nm} :=", { + nm <- "slide_value" + expect_identical( + # unfortunately, we can't pass this directly as `f` and need an extra comma + epi_slide(test_data, , !!nm := sum(value), .window_size = 7), + epi_slide_sum_test(test_data, .window_size = 7) + ) }) -test_that("results for different `before`s and `after`s match between epi_slide and epi_slide_mean", { - test_time_type_mean <- function(dates, vals, before = 6 * days_dt, after = 0 * days_dt, n, m, n_obs, k, ...) { - # Three states, with 2 variables. a is linear, going up in one state and down in the other - # b is just random. last (m-1):(n-1) dates are missing - epi_data <- epiprocess::as_epi_df(rbind(tibble( - geo_value = "al", - time_value = dates, - a = 1:n_obs, - b = vals - ), tibble( - geo_value = "ca", - time_value = dates, - a = n_obs:1, - b = vals + 10 - ))) %>% - group_by(geo_value) - - # Use the `epi_slide` result as a reference. 
- result1 <- epi_slide(epi_data, ~ data.frame( - slide_value_a = mean(.x$a, rm.na = TRUE), - slide_value_b = mean(.x$b, rm.na = TRUE) - ), - before = before, after = after, names_sep = NULL, ... - ) - result2 <- epi_slide_mean(epi_data, col_names = c(a, b), na.rm = TRUE, before = before, after = after, ...) - expect_equal(result1, result2) - } - - set.seed(0) - - # 3 missing dates - n <- 15 # Max date index - m <- 3 # Number of missing dates - n_obs <- n + 1 - m # Number of obs created - k <- c(0:(n - (m + 1)), n) # Date indices - - rand_vals <- rnorm(n_obs) - # Basic time type - days <- as.Date("2022-01-01") + k - - test_time_type_mean(days, rand_vals, before = 6 * days_dt, n = n, m = m, n_obs = n_obs, k = k) - test_time_type_mean(days, rand_vals, before = 6 * days_dt, after = 1 * days_dt, n = n, m = m, n_obs = n_obs, k = k) - test_time_type_mean(days, rand_vals, before = 6 * days_dt, after = 6 * days_dt, n = n, m = m, n_obs = n_obs, k = k) - test_time_type_mean(days, rand_vals, before = 1 * days_dt, after = 6 * days_dt, n = n, m = m, n_obs = n_obs, k = k) - test_time_type_mean(days, rand_vals, after = 6 * days_dt, n = n, m = m, n_obs = n_obs, k = k) - test_time_type_mean(days, rand_vals, after = 1 * days_dt, n = n, m = m, n_obs = n_obs, k = k) - - # Without any missing dates - n <- 15 # Max date index - m <- 0 # Number of missing dates - n_obs <- n + 1 - m # Number of obs created - k <- c(0:(n - (m + 1)), n) # Date indices - - rand_vals <- rnorm(n_obs) - # Basic time type - days <- as.Date("2022-01-01") + k - - test_time_type_mean(days, rand_vals, before = 6 * days_dt, n = n, m = m, n_obs = n_obs, k = k) - test_time_type_mean(days, rand_vals, before = 6 * days_dt, after = 1 * days_dt, n = n, m = m, n_obs = n_obs, k = k) - test_time_type_mean(days, rand_vals, before = 6 * days_dt, after = 6 * days_dt, n = n, m = m, n_obs = n_obs, k = k) - test_time_type_mean(days, rand_vals, before = 1 * days_dt, after = 6 * days_dt, n = n, m = m, n_obs = n_obs, k = k) - test_time_type_mean(days, rand_vals, after = 6 * days_dt, n = n, m = m, n_obs = n_obs, k = k) - test_time_type_mean(days, rand_vals, after = 1 * days_dt, n = n, m = m, n_obs = n_obs, k = k) +test_that("epi_slide can produce packed outputs", { + packed_basic_result <- epi_slide_sum_test(test_data, .window_size = 7) %>% + tidyr::pack(container = c(slide_value)) %>% + dplyr_reconstruct(epi_slide_sum_test(test_data, .window_size = 7)) + expect_identical( + test_data %>% + epi_slide(~ tibble::tibble(slide_value = sum(.x$value)), .new_col_name = "container", .window_size = 7), + packed_basic_result + ) + expect_identical( + test_data %>% + epi_slide(container = tibble::tibble(slide_value = sum(.x$value)), .window_size = 7), + packed_basic_result + ) + expect_identical( + test_data %>% + epi_slide(, tibble::tibble(slide_value = sum(.x$value)), .new_col_name = "container", .window_size = 7), + packed_basic_result + ) }) -test_that("results for different time_types match between epi_slide and epi_slide_mean", { - n <- 6L # Max date index - m <- 1L # Number of missing dates - n_obs <- n + 1L - m # Number of obs created - k <- c(0L:(n - (m + 1L)), n) # Date indices - - set.seed(0) - rand_vals <- rnorm(n_obs) - - generate_special_date_data <- function(date_seq, ...) 
{ - epiprocess::as_epi_df(rbind(tibble( - geo_value = "al", - time_value = date_seq, - a = seq_along(date_seq), - b = rand_vals - ), tibble( - geo_value = "ca", - time_value = date_seq, - a = rev(seq_along(date_seq)), - b = rand_vals + 10 - ), tibble( - geo_value = "fl", - time_value = date_seq, - a = rev(seq_along(date_seq)), - b = rand_vals * 2 - )), ...) - } - - # Basic time type, require before and after in difftimes - days <- as.Date("2022-01-01") + k - weeks <- as.Date("2022-01-01") + 7L * k - yearmonths <- tsibble::yearmonth(10L + k) - integers <- 2000L + k - - ref_epi_data <- generate_special_date_data(days) %>% - group_by(geo_value) - - ref_result <- epi_slide(ref_epi_data, ~ data.frame( - slide_value_a = mean(.x$a, rm.na = TRUE), - slide_value_b = mean(.x$b, rm.na = TRUE) - ), - before = 6 * days_dt, names_sep = NULL +test_that("nested dataframe output names are controllable", { + expect_equal( + test_data %>% epi_slide(~ data.frame(result = sum(.x$value)), .window_size = 7), + epi_slide_sum_test(test_data, .window_size = 7) %>% rename(result = slide_value) ) - - test_time_type_mean <- function(dates, before) { - # Three states, with 2 variables. a is linear, going up in one state and down in the other - # b is just random. date 10 is missing - epi_data <- generate_special_date_data(dates) %>% - group_by(geo_value) - - result1 <- epi_slide(epi_data, ~ data.frame( - slide_value_a = mean(.x$a, rm.na = TRUE), - slide_value_b = mean(.x$b, rm.na = TRUE) - ), - before = before, names_sep = NULL - ) - result2 <- epi_slide_mean(epi_data, - col_names = c(a, b), na.rm = TRUE, before = before - ) - expect_equal(result1, result2) - - # All fields except dates - expect_equal(select(ref_result, -time_value), select(result1, -time_value)) - expect_equal(select(ref_result, -time_value), select(result2, -time_value)) - } - - test_time_type_mean(days, before = 6 * days_dt) - test_time_type_mean(weeks, before = 6 * weeks_dt) - test_time_type_mean(yearmonths, before = 6) - test_time_type_mean(integers, before = 6) - - # `epi_slide_mean` can also handle `weeks` without `time_step` being - # provided, but `epi_slide` can't - epi_data <- generate_special_date_data(weeks) %>% - group_by(geo_value) - result2 <- epi_slide_mean(epi_data, - col_names = c(a, b), na.rm = TRUE, - before = 6 * weeks_dt + expect_equal( + test_data %>% epi_slide(~ data.frame(value_sum = sum(.x$value)), .window_size = 7), + epi_slide_sum_test(test_data, .window_size = 7) %>% rename(value_sum = slide_value) ) - expect_equal(select(ref_result, -time_value), select(result2, -time_value)) }) -test_that("helper `full_date_seq` returns expected date values", { - n <- 6L # Max date index - m <- 1L # Number of missing dates - n_obs <- n + 1L - m # Number of obs created - k <- c(0L:(n - (m + 1L)), n) # Date indices +test_that("`epi_slide` doesn't lose Date class output", { + expect_true( + test_data %>% + epi_slide(.window_size = 7, ~ as.Date("2020-01-01")) %>% + `[[`("slide_value") %>% + inherits("Date") + ) +}) +test_that("epi_slide_opt helper `full_date_seq` returns expected date values", { set.seed(0) - rand_vals <- rnorm(n_obs) - - generate_special_date_data <- function(date_seq, ...) 
{ - epiprocess::as_epi_df(rbind(tibble( - geo_value = "al", - time_value = date_seq, - a = seq_along(date_seq), - b = rand_vals - ), tibble( - geo_value = "ca", - time_value = date_seq, - a = rev(seq_along(date_seq)), - b = rand_vals + 10 - ), tibble( - geo_value = "fl", - time_value = date_seq, - a = rev(seq_along(date_seq)), - b = rand_vals * 2 - )), ...) - } - - # Basic time type, require before and after in difftimes - days <- as.Date("2022-01-01") + k - weeks <- as.Date("2022-01-01") + 7L * k - yearmonths <- tsibble::yearmonth(10L + k) - integers <- 2000L + k + n <- 7 + epi_data_missing <- rbind( + tibble(geo_value = "al", a = 1:n, b = rnorm(n)), + tibble(geo_value = "ca", a = n:1, b = rnorm(n) + 10), + tibble(geo_value = "fl", a = n:1, b = rnorm(n) * 2) + ) %>% + mutate( + days = rep(as.Date("2022-01-01") - 1 + 1:n, 3), + weeks = rep(as.Date("2022-01-01") - 7 + 7L * 1:n, 3), + yearmonths = rep(tsibble::yearmonth(10L - 1 + 1:n), 3), + integers = rep(2000L - 1 + 1:n, 3) + ) %>% + slice(1:4, 6:7) before <- 2L after <- 1L expect_identical( full_date_seq( - generate_special_date_data(days), - before = before * days_dt, after = after * days_dt, time_type = "day" + epi_data_missing %>% + mutate(time_value = days) %>% + as_epi_df() %>% + group_by(geo_value), + before = before, after = after, time_type = "day" ), list( all_dates = as.Date(c( @@ -1129,7 +614,13 @@ test_that("helper `full_date_seq` returns expected date values", { ) ) expect_identical( - full_date_seq(generate_special_date_data(weeks), before = before, after = after, time_type = "week"), + full_date_seq( + epi_data_missing %>% + mutate(time_value = weeks) %>% + as_epi_df() %>% + group_by(geo_value), + before = before, after = after, time_type = "week" + ), list( all_dates = as.Date(c( "2022-01-01", "2022-01-08", "2022-01-15", "2022-01-22", @@ -1140,7 +631,13 @@ test_that("helper `full_date_seq` returns expected date values", { ) ) expect_identical( - full_date_seq(generate_special_date_data(yearmonths), before = before, after = after, time_type = "yearmonth"), + full_date_seq( + epi_data_missing %>% + mutate(time_value = yearmonths) %>% + as_epi_df() %>% + group_by(geo_value), + before = before, after = after, time_type = "yearmonth" + ), list( all_dates = tsibble::yearmonth(10:16), pad_early_dates = tsibble::yearmonth(8:9), @@ -1148,11 +645,17 @@ test_that("helper `full_date_seq` returns expected date values", { ) ) expect_identical( - full_date_seq(generate_special_date_data(integers), before = before, after = after, time_type = "integer"), + full_date_seq( + epi_data_missing %>% + mutate(time_value = integers) %>% + as_epi_df() %>% + group_by(geo_value), + before = before, after = after, time_type = "integer" + ), list( - all_dates = 2000L:2006L, - pad_early_dates = 1998L:1999L, - pad_late_dates = 2007L + all_dates = as.double(2000:2006), + pad_early_dates = as.double(1998:1999), + pad_late_dates = 2007 ) ) @@ -1162,8 +665,11 @@ test_that("helper `full_date_seq` returns expected date values", { expect_identical( full_date_seq( - generate_special_date_data(days), - before = before * days_dt, after = after * days_dt, time_type = "day" + epi_data_missing %>% + mutate(time_value = days) %>% + as_epi_df() %>% + group_by(geo_value), + before = before, after = after, time_type = "day" ), list( all_dates = as.Date(c( @@ -1183,8 +689,11 @@ test_that("helper `full_date_seq` returns expected date values", { expect_identical( full_date_seq( - generate_special_date_data(days), - before = before * days_dt, after = after * days_dt, 
time_type = "day" + epi_data_missing %>% + mutate(time_value = days) %>% + as_epi_df() %>% + group_by(geo_value), + before = before, after = after, time_type = "day" ), list( all_dates = as.Date(c( @@ -1199,77 +708,44 @@ test_that("helper `full_date_seq` returns expected date values", { ) }) -test_that("epi_slide_mean produces same output as epi_slide_opt", { - result1 <- epi_slide_mean( - small_x, - value, - before = 50 * days_dt, - names_sep = NULL, - na.rm = TRUE - ) - result2 <- epi_slide_opt( - small_x, - value, - f = data.table::frollmean, - before = 50 * days_dt, - names_sep = NULL, - na.rm = TRUE - ) - expect_equal(result1, result2) - result3 <- epi_slide_opt( - small_x, - value, - f = slider::slide_mean, - before = 50 * days_dt, - names_sep = NULL, - na_rm = TRUE - ) - expect_equal(result1, result3) -}) - -test_that("epi_slide_sum produces same output as epi_slide_opt", { - result1 <- epi_slide_sum(small_x, value, before = 50 * days_dt, names_sep = NULL, na.rm = TRUE) - result2 <- epi_slide_opt(small_x, value, - f = data.table::frollsum, - before = 50 * days_dt, names_sep = NULL, na.rm = TRUE - ) - expect_equal(result1, result2) - result3 <- epi_slide_opt(small_x, value, - f = slider::slide_sum, - before = 50 * days_dt, names_sep = NULL, na_rm = TRUE - ) - expect_equal(result1, result3) -}) test_that("`epi_slide_opt` errors when passed non-`data.table`, non-`slider` functions", { - expect_no_error( - epi_slide_opt( - grouped, - col_names = value, f = data.table::frollmean, - before = days_dt, ref_time_values = test_date + 1 - ) - ) - expect_no_error( - epi_slide_opt( - grouped, - col_names = value, f = slider::slide_min, - before = days_dt, ref_time_values = test_date + 1 - ) - ) reexport_frollmean <- data.table::frollmean expect_no_error( epi_slide_opt( - grouped, - col_names = value, f = reexport_frollmean, - before = days_dt, ref_time_values = test_date + 1 + test_data, + .col_names = value, .f = reexport_frollmean ) ) expect_error( epi_slide_opt( - grouped, - col_names = value, f = mean, - before = days_dt, ref_time_values = test_date + 1 + test_data, + .col_names = value, .f = mean ), class = "epiprocess__epi_slide_opt__unsupported_slide_function" ) }) + +test_that("no dplyr warnings from selecting multiple columns", { + multi_columns <- dplyr::bind_rows( + dplyr::tibble(geo_value = "ak", time_value = test_date + 1:200, value = 1:200, value2 = -1:-200), + dplyr::tibble(geo_value = "al", time_value = test_date + 1:5, value = -(1:5), value2 = 1:5) + ) %>% + as_epi_df() %>% + group_by(geo_value) + expect_no_warning( + multi_slid <- epi_slide_mean(multi_columns, .col_names = c("value", "value2"), .window_size = 7) + ) + expect_equal( + names(multi_slid), + c("geo_value", "time_value", "value", "value2", "slide_value_value", "slide_value_value2") + ) + expect_no_warning( + multi_slid_select <- epi_slide_mean(multi_columns, c(value, value2), .window_size = 7) + ) + expect_equal(multi_slid_select, multi_slid) + expect_no_warning( + multi_slid_select <- epi_slide_mean(multi_columns, starts_with("value"), .window_size = 7) + ) + expect_equal(multi_slid_select, multi_slid) +}) diff --git a/tests/testthat/test-epix_merge.R b/tests/testthat/test-epix_merge.R index d336622f..5b3de284 100644 --- a/tests/testthat/test-epix_merge.R +++ b/tests/testthat/test-epix_merge.R @@ -62,6 +62,95 @@ test_that("epix_merge merges and carries forward updates properly", { ) expect_identical(xy, xy_expected) + + + s1 <- tibble( + geo_value = c("ca", "ca", "ca"), + time_value = as.Date(c("2024-08-01", 
"2024-08-01", "2024-08-02")), + version = as.Date(c("2024-08-01", "2024-08-02", "2024-08-02")), + signal1 = c("XA", "XB", "XC") + ) + + s2 <- tibble( + geo_value = c("ca", "ca"), + time_value = as.Date(c("2024-08-01", "2024-08-02")), + version = as.Date(c("2024-08-03", "2024-08-03")), + signal2 = c("YA", "YB") + ) + + s1 <- s1 %>% as_epi_archive() + s2 <- s2 %>% as_epi_archive() + + merge1_expected <- tibble( + geo_value = rep("ca", 5), + time_value = as.Date(c("2024-08-01", "2024-08-01", "2024-08-01", "2024-08-02", "2024-08-02")), + version = as.Date(c("2024-08-01", "2024-08-02", "2024-08-03", "2024-08-02", "2024-08-03")), + signal1 = c("XA", "XB", "XB", "XC", "XC"), + signal2 = c(NA, NA, "YA", NA, "YB") + ) %>% as_epi_archive() + + merged1 <- epix_merge(s1, s2, sync = "locf") + + expect_identical(merged1, merge1_expected) + + s1 <- tibble( + geo_value = c("ca", "ca", "ca", "ca"), + time_value = as.Date(c("2024-08-01", "2024-08-01", "2024-08-02", "2024-08-03")), + version = as.Date(c("2024-08-01", "2024-08-03", "2024-08-03", "2024-08-03")), + signal1 = c("XA", "XB", "XC", "XD") + ) + + s2 <- tibble( + geo_value = c("ca", "ca"), + time_value = as.Date(c("2024-08-01", "2024-08-02")), + version = as.Date(c("2024-08-02", "2024-08-02")), + signal2 = c("YA", "YB"), + ) + + + s1 <- s1 %>% as_epi_archive() + s2 <- s2 %>% as_epi_archive() + + merge2_expected <- tibble( + geo_value = rep("ca", 6), + time_value = as.Date(c("2024-08-01", "2024-08-01", "2024-08-01", "2024-08-02", "2024-08-02", "2024-08-03")), + version = as.Date(c("2024-08-01", "2024-08-02", "2024-08-03", "2024-08-02", "2024-08-03", "2024-08-03")), + signal1 = c("XA", "XA", "XB", NA, "XC", "XD"), + signal2 = c(NA, "YA", "YA", "YB", "YB", NA) + ) %>% as_epi_archive() + + merged2 <- epix_merge(s1, s2, sync = "locf") + + expect_identical(merged2, merge2_expected) + + s1 <- tibble( + geo_value = c("ca", "ca", "ca"), + time_value = as.Date(c("2024-08-01", "2024-08-02", "2024-08-03")), + version = as.Date(c("2024-08-01", "2024-08-02", "2024-08-03")), + signal1 = c("XA", "XB", "XC") + ) + + s2 <- tibble( + geo_value = c("ca", "ca", "ca"), + time_value = as.Date(c("2024-08-01", "2024-08-01", "2024-08-02")), + version = as.Date(c("2024-08-02", "2024-08-03", "2024-08-03")), + signal2 = c("YA", "YB", "YC"), + ) + + s1 <- s1 %>% as_epi_archive() + s2 <- s2 %>% as_epi_archive() + + merge3_expected <- tibble( + geo_value = rep("ca", 6), + time_value = as.Date(c("2024-08-01", "2024-08-01", "2024-08-01", "2024-08-02", "2024-08-02", "2024-08-03")), + version = as.Date(c("2024-08-01", "2024-08-02", "2024-08-03", "2024-08-02", "2024-08-03", "2024-08-03")), + signal1 = c("XA", "XA", "XA", "XB", "XB", "XC"), + signal2 = c(NA, "YA", "YB", NA, "YC", NA) + ) %>% as_epi_archive() + + merged3 <- epix_merge(s1, s2, sync = "locf") + + expect_identical(merged3, merge3_expected) }) test_that("epix_merge forbids and warns on metadata and naming issues", { @@ -88,26 +177,6 @@ test_that("epix_merge forbids and warns on metadata and naming issues", { ), regexp = "overlapping.*names" ) - expect_warning( - epix_merge( - as_epi_archive(tibble::tibble(geo_value = "ak", time_value = test_date, version = test_date + 1L, x_value = 1L), - additional_metadata = list("updates_fetched" = lubridate::ymd_hms("2022-05-01 16:00:00", tz = "UTC")) - ), - as_epi_archive(tibble::tibble(geo_value = "ak", time_value = test_date, version = test_date + 1L, y_value = 2L)) - ), - regexp = "x\\$additional_metadata", - class = "epiprocess__epix_merge_ignores_additional_metadata" - ) - 
expect_warning( - epix_merge( - as_epi_archive(tibble::tibble(geo_value = "ak", time_value = test_date, version = test_date + 1L, x_value = 1L)), - as_epi_archive(tibble::tibble(geo_value = "ak", time_value = test_date, version = test_date + 1L, y_value = 2L), - additional_metadata = list("updates_fetched" = lubridate::ymd_hms("2022-05-01 16:00:00", tz = "UTC")) - ) - ), - regexp = "y\\$additional_metadata", - class = "epiprocess__epix_merge_ignores_additional_metadata" - ) }) # use `local` to prevent accidentally using the x, y, xy bindings here diff --git a/tests/testthat/test-epix_slide.R b/tests/testthat/test-epix_slide.R index cb7b3bdc..c0d752dc 100644 --- a/tests/testthat/test-epix_slide.R +++ b/tests/testthat/test-epix_slide.R @@ -1,4 +1,4 @@ -library(dplyr) +suppressPackageStartupMessages(library(dplyr)) test_date <- as.Date("2020-01-01") @@ -13,7 +13,7 @@ x <- tibble::tribble( test_date + 6, test_date + c(1:2, 4:5), 2^(7:10), test_date + 7, test_date + 2:6, 2^(11:15) ) %>% - tidyr::unnest(c(time_value, binary)) + tidyr::unchop(c(time_value, binary)) xx <- bind_cols(geo_value = rep("ak", 15), x) %>% as_epi_archive() @@ -22,14 +22,14 @@ test_that("epix_slide works as intended", { xx1 <- xx %>% group_by(.data$geo_value) %>% epix_slide( - f = ~ sum(.x$binary), - before = 2, - new_col_name = "sum_binary" + .f = ~ sum(.x$binary), + .before = 2, + .new_col_name = "sum_binary" ) xx2 <- tibble( geo_value = rep("ak", 4), - time_value = test_date + c(4, 5, 6, 7), + version = test_date + c(4, 5, 6, 7), sum_binary = c( 2^3 + 2^2, 2^6 + 2^3, @@ -44,9 +44,9 @@ test_that("epix_slide works as intended", { xx3 <- xx %>% group_by(dplyr::across(dplyr::all_of("geo_value"))) %>% epix_slide( - f = ~ sum(.x$binary), - before = 2, - new_col_name = "sum_binary" + .f = ~ sum(.x$binary), + .before = 2, + .new_col_name = "sum_binary" ) expect_identical(xx1, xx3) # This and * imply xx2 and xx3 are identical @@ -54,9 +54,9 @@ test_that("epix_slide works as intended", { # function interface xx4 <- xx %>% group_by(.data$geo_value) %>% - epix_slide(f = function(x, gk, rtv) { + epix_slide(.f = function(x, gk, rtv) { tibble::tibble(sum_binary = sum(x$binary)) - }, before = 2, names_sep = NULL) + }, .before = 2) expect_identical(xx1, xx4) @@ -65,23 +65,22 @@ test_that("epix_slide works as intended", { group_by(.data$geo_value) %>% epix_slide( sum_binary = sum(binary), - before = 2 + .before = 2 ) expect_identical(xx1, xx5) }) -test_that("epix_slide works as intended with `as_list_col=TRUE`", { +test_that("epix_slide works as intended with list cols", { xx_dfrow1 <- xx %>% group_by(.data$geo_value) %>% epix_slide( - f = ~ data.frame(bin_sum = sum(.x$binary)), - before = 2, - as_list_col = TRUE + .f = ~ list(data.frame(bin_sum = sum(.x$binary))), + .before = 2 ) xx_dfrow2 <- tibble( geo_value = rep("ak", 4), - time_value = test_date + c(4, 5, 6, 7), + version = test_date + c(4, 5, 6, 7), slide_value = c( 2^3 + 2^2, @@ -96,22 +95,20 @@ test_that("epix_slide works as intended with `as_list_col=TRUE`", { xx_dfrow3 <- xx %>% group_by(dplyr::across(dplyr::all_of("geo_value"))) %>% epix_slide( - f = ~ data.frame(bin_sum = sum(.x$binary)), - before = 2, - as_list_col = TRUE + .f = ~ list(data.frame(bin_sum = sum(.x$binary))), + .before = 2 ) expect_identical(xx_dfrow1, xx_dfrow3) # This and * Imply xx_dfrow2 and xx_dfrow3 are identical xx_df1 <- xx %>% group_by(.data$geo_value) %>% epix_slide( - f = ~ data.frame(bin = .x$binary), - before = 2, - as_list_col = TRUE + .f = ~ list(data.frame(bin = .x$binary)), + .before = 2 ) 
xx_df2 <- tibble( geo_value = rep("ak", 4), - time_value = test_date + c(4, 5, 6, 7), + version = test_date + c(4, 5, 6, 7), slide_value = list( c(2^3, 2^2), @@ -126,13 +123,12 @@ test_that("epix_slide works as intended with `as_list_col=TRUE`", { xx_scalar1 <- xx %>% group_by(.data$geo_value) %>% epix_slide( - f = ~ sum(.x$binary), - before = 2, - as_list_col = TRUE + .f = ~ list(sum(.x$binary)), + .before = 2 ) xx_scalar2 <- tibble( geo_value = rep("ak", 4), - time_value = test_date + c(4, 5, 6, 7), + version = test_date + c(4, 5, 6, 7), slide_value = list( 2^3 + 2^2, @@ -147,13 +143,12 @@ test_that("epix_slide works as intended with `as_list_col=TRUE`", { xx_vec1 <- xx %>% group_by(.data$geo_value) %>% epix_slide( - f = ~ .x$binary, - before = 2, - as_list_col = TRUE + .f = ~ list(.x$binary), + .before = 2 ) xx_vec2 <- tibble( geo_value = rep("ak", 4), - time_value = test_date + c(4, 5, 6, 7), + version = test_date + c(4, 5, 6, 7), slide_value = list( c(2^3, 2^2), @@ -166,28 +161,28 @@ test_that("epix_slide works as intended with `as_list_col=TRUE`", { expect_identical(xx_vec1, xx_vec2) }) -test_that("epix_slide `before` validation works", { +test_that("epix_slide `.before` validation works", { expect_error( - xx %>% epix_slide(f = ~ sum(.x$binary), before = NA), - "Expected `before` to be a scalar value." + xx %>% epix_slide(.f = ~ sum(.x$binary), .before = NA), + class = "epiprocess__validate_slide_window_arg" ) expect_error( - xx %>% epix_slide(f = ~ sum(.x$binary), before = -1), - "Expected `before` to be a difftime with units in days or a non-negative integer." + xx %>% epix_slide(.f = ~ sum(.x$binary), .before = -1), + class = "epiprocess__validate_slide_window_arg" ) expect_error( - xx %>% epix_slide(f = ~ sum(.x$binary), before = 1.5), - "Expected `before` to be a difftime with units in days or a non-negative integer." 
+ xx %>% epix_slide(.f = ~ sum(.x$binary), .before = 1.5), + class = "epiprocess__validate_slide_window_arg" ) # These `before` values should be accepted: - expect_no_error(xx %>% epix_slide(f = ~ sum(.x$binary), before = 0)) - expect_no_error(xx %>% epix_slide(f = ~ sum(.x$binary), before = 2)) - expect_no_error(xx %>% epix_slide(f = ~ sum(.x$binary), before = as.difftime(365000, units = "days"))) + expect_no_error(xx %>% epix_slide(.f = ~ sum(.x$binary), .before = 0)) + expect_no_error(xx %>% epix_slide(.f = ~ sum(.x$binary), .before = 2)) + expect_no_error(xx %>% epix_slide(.f = ~ sum(.x$binary), .before = as.difftime(365000, units = "days"))) }) test_that("quosure passing issue in epix_slide is resolved + other potential issues", { # (First part adapted from @examples) - time_values <- seq(as.Date("2020-06-01"), + versions <- seq(as.Date("2020-06-01"), as.Date("2020-06-02"), by = "1 day" ) @@ -203,18 +198,18 @@ test_that("quosure passing issue in epix_slide is resolved + other potential iss reference_by_modulus <- ea %>% group_by(modulus) %>% epix_slide( - f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = time_values, - new_col_name = "case_rate_3d_av" + .f = ~ mean(.x$case_rate_7d_av), + .before = 2, + .versions = versions, + .new_col_name = "case_rate_3d_av" ) reference_by_neither <- ea %>% group_by() %>% epix_slide( - f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = time_values, - new_col_name = "case_rate_3d_av" + .f = ~ mean(.x$case_rate_7d_av), + .before = 2, + .versions = versions, + .new_col_name = "case_rate_3d_av" ) # test the passing-something-that-must-be-enquosed behavior: # @@ -223,21 +218,21 @@ test_that("quosure passing issue in epix_slide is resolved + other potential iss ea %>% group_by(modulus) %>% epix_slide( - f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = time_values, - new_col_name = "case_rate_3d_av" + .f = ~ mean(.x$case_rate_7d_av), + .before = 2, + .versions = versions, + .new_col_name = "case_rate_3d_av" ), reference_by_modulus ) # test the .data pronoun behavior: expect_identical( epix_slide( - x = ea %>% group_by(.data$modulus), - f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = time_values, - new_col_name = "case_rate_3d_av" + .x = ea %>% group_by(.data$modulus), + .f = ~ mean(.x$case_rate_7d_av), + .before = 2, + .versions = versions, + .new_col_name = "case_rate_3d_av" ), reference_by_modulus ) @@ -245,21 +240,21 @@ test_that("quosure passing issue in epix_slide is resolved + other potential iss ea %>% group_by(.data$modulus) %>% epix_slide( - f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = time_values, - new_col_name = "case_rate_3d_av" + .f = ~ mean(.x$case_rate_7d_av), + .before = 2, + .versions = versions, + .new_col_name = "case_rate_3d_av" ), reference_by_modulus ) # test the passing across-all-of-string-literal behavior: expect_identical( epix_slide( - x = ea %>% group_by(dplyr::across(all_of("modulus"))), - f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = time_values, - new_col_name = "case_rate_3d_av" + .x = ea %>% group_by(dplyr::across(all_of("modulus"))), + .f = ~ mean(.x$case_rate_7d_av), + .before = 2, + .versions = versions, + .new_col_name = "case_rate_3d_av" ), reference_by_modulus ) @@ -267,10 +262,10 @@ test_that("quosure passing issue in epix_slide is resolved + other potential iss ea %>% group_by(across(all_of("modulus"))) %>% epix_slide( - f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = time_values, - 
new_col_name = "case_rate_3d_av" + .f = ~ mean(.x$case_rate_7d_av), + .before = 2, + .versions = versions, + .new_col_name = "case_rate_3d_av" ), reference_by_modulus ) @@ -278,11 +273,11 @@ test_that("quosure passing issue in epix_slide is resolved + other potential iss my_group_by <- "modulus" expect_identical( epix_slide( - x = ea %>% group_by(dplyr::across(tidyselect::all_of(my_group_by))), - f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = time_values, - new_col_name = "case_rate_3d_av" + .x = ea %>% group_by(dplyr::across(tidyselect::all_of(my_group_by))), + .f = ~ mean(.x$case_rate_7d_av), + .before = 2, + .versions = versions, + .new_col_name = "case_rate_3d_av" ), reference_by_modulus ) @@ -290,30 +285,30 @@ test_that("quosure passing issue in epix_slide is resolved + other potential iss ea %>% group_by(dplyr::across(tidyselect::all_of(my_group_by))) %>% epix_slide( - f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = time_values, - new_col_name = "case_rate_3d_av" + .f = ~ mean(.x$case_rate_7d_av), + .before = 2, + .versions = versions, + .new_col_name = "case_rate_3d_av" ), reference_by_modulus ) # test the default behavior (default in this case should just be grouping by neither): expect_identical( epix_slide( - x = ea, - f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = time_values, - new_col_name = "case_rate_3d_av" + .x = ea, + .f = ~ mean(.x$case_rate_7d_av), + .before = 2, + .versions = versions, + .new_col_name = "case_rate_3d_av" ), reference_by_neither ) expect_identical( ea %>% epix_slide( - f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = time_values, - new_col_name = "case_rate_3d_av" + .f = ~ mean(.x$case_rate_7d_av), + .before = 2, + .versions = versions, + .new_col_name = "case_rate_3d_av" ), reference_by_neither ) @@ -328,11 +323,11 @@ ea <- tibble::tribble( test_date + 6, test_date + 1:5, 2^(5:1), test_date + 7, test_date + 1:6, 2^(6:1) ) %>% - tidyr::unnest(c(time_value, binary)) %>% + tidyr::unchop(c(time_value, binary)) %>% mutate(geo_value = "ak") %>% as_epi_archive() -test_that("epix_slide with all_versions option has access to all older versions", { +test_that("epix_slide with .all_versions option has access to all older versions", { slide_fn <- function(x, gk, rtv) { return(tibble( n_versions = length(unique(x$DT$version)), @@ -347,16 +342,15 @@ test_that("epix_slide with all_versions option has access to all older versions" result1 <- ea %>% group_by() %>% epix_slide( - f = slide_fn, - before = 10^3, - names_sep = NULL, - all_versions = TRUE + .f = slide_fn, + .before = 10^3, + .all_versions = TRUE ) expect_true(inherits(result1, "tbl_df")) result2 <- tibble::tribble( - ~time_value, ~n_versions, ~n_row, ~dt_class1, ~dt_key, + ~version, ~n_versions, ~n_row, ~dt_class1, ~dt_key, test_date + 2, 1L, sum(1:1), "data.table", key(ea$DT), test_date + 3, 2L, sum(1:2), "data.table", key(ea$DT), test_date + 4, 3L, sum(1:3), "data.table", key(ea$DT), @@ -370,10 +364,9 @@ test_that("epix_slide with all_versions option has access to all older versions" result3 <- ea %>% group_by() %>% epix_slide( - f = slide_fn, - before = 10^3, - names_sep = NULL, - all_versions = TRUE + .f = slide_fn, + .before = 10^3, + .all_versions = TRUE ) expect_identical(result1, result3) # This and * Imply result2 and result3 are identical @@ -382,10 +375,9 @@ test_that("epix_slide with all_versions option has access to all older versions" result4 <- ea %>% group_by() %>% epix_slide( - f = ~ slide_fn(.x, .y), - before = 10^3, - 
names_sep = NULL, - all_versions = TRUE + .f = ~ slide_fn(.x, .y), + .before = 10^3, + .all_versions = TRUE ) expect_identical(result1, result4) # This and * Imply result2 and result4 are identical @@ -394,13 +386,11 @@ test_that("epix_slide with all_versions option has access to all older versions" result5 <- ea %>% group_by() %>% epix_slide( - data = slide_fn( - .x, - stop("slide_fn doesn't use group key, no need to prepare it") - ), - before = 10^3, - names_sep = NULL, - all_versions = TRUE + # unfortunately, we can't pass this directly as `f` and need an extra comma + , + slide_fn(.x, .group_key, .version), + .before = 10^3, + .all_versions = TRUE ) expect_identical(result1, result5) # This and * Imply result2 and result5 are identical @@ -408,37 +398,36 @@ test_that("epix_slide with all_versions option has access to all older versions" }) test_that("epix_as_of and epix_slide with long enough window are compatible", { - # For all_versions = FALSE: + # For .all_versions = FALSE: f1 <- function(x, gk, rtv) { tibble( diff_mean = mean(diff(x$binary)) ) } - ref_time_value1 <- test_date + version1 <- test_date expect_identical( - ea %>% epix_as_of(ref_time_value1) %>% f1() %>% mutate(time_value = ref_time_value1, .before = 1L), + ea %>% epix_as_of(version1) %>% f1() %>% mutate(version = version1, .before = 1L), ea %>% epix_slide( f1, - before = 1000, - ref_time_values = ref_time_value1, - names_sep = NULL + .before = 1000, + .versions = version1 ) ) - # For all_versions = TRUE: + # For .all_versions = TRUE: f2 <- function(x, gk, rtv) { x %>% # extract time&version-lag-1 data: epix_slide( - function(subx, subgk, rtv) { + function(subx, subgk, version) { tibble(data = list( subx %>% - filter(time_value == attr(subx, "metadata")$as_of - 1) %>% - rename(real_time_value = time_value, lag1 = binary) + filter(time_value == version - 1) %>% + rename(lag1 = binary) )) }, - before = 1, names_sep = NULL + .before = 1 ) %>% # assess as nowcast: unnest(data) %>% @@ -448,19 +437,18 @@ test_that("epix_as_of and epix_slide with long enough window are compatible", { ) %>% summarize(mean_abs_delta = mean(abs(binary - lag1))) } - ref_time_value2 <- test_date + 5 + version2 <- test_date + 5 expect_identical( ea %>% - epix_as_of(ref_time_value2, all_versions = TRUE) %>% + epix_as_of(version2, all_versions = TRUE) %>% f2() %>% - mutate(time_value = ref_time_value2, .before = 1L), + mutate(version = version2, .before = 1L), ea %>% epix_slide( f2, - before = 1000, - ref_time_values = ref_time_value2, - all_versions = TRUE, - names_sep = NULL + .before = 1000, + .versions = version2, + .all_versions = TRUE ) ) @@ -477,21 +465,20 @@ test_that("epix_as_of and epix_slide with long enough window are compatible", { group_by(geo_value) %>% epix_slide( f2, - before = 1000, - ref_time_values = ref_time_value2, - all_versions = TRUE, - names_sep = NULL + .before = 1000, + .versions = version2, + .all_versions = TRUE ) %>% filter(geo_value == "ak"), ea %>% # using `ea` here is like filtering `ea_multigeo` to `geo_value=="x"` - epix_as_of(ref_time_value2, all_versions = TRUE) %>% + epix_as_of(version2, all_versions = TRUE) %>% f2() %>% - transmute(geo_value = "ak", time_value = ref_time_value2, mean_abs_delta) %>% + transmute(geo_value = "ak", version = version2, mean_abs_delta) %>% group_by(geo_value) ) }) -test_that("epix_slide `f` is passed an ungrouped `epi_archive` when `all_versions=TRUE`", { +test_that("epix_slide `f` is passed an ungrouped `epi_archive` when `.all_versions=TRUE`", { slide_fn <- function(x, gk, rtv) { 
expect_class(x, "epi_archive") return(NA) @@ -500,27 +487,27 @@ test_that("epix_slide `f` is passed an ungrouped `epi_archive` when `all_version ea %>% group_by() %>% epix_slide( - f = slide_fn, - before = 1, - ref_time_values = test_date + 5, - new_col_name = "out", - all_versions = TRUE + .f = slide_fn, + .before = 1, + .versions = test_date + 5, + .new_col_name = "out", + .all_versions = TRUE ) }) -test_that("epix_slide with all_versions option works as intended", { +test_that("epix_slide with .all_versions option works as intended", { xx1 <- xx %>% group_by(.data$geo_value) %>% epix_slide( - f = ~ sum(.x$DT$binary), - before = 2, - new_col_name = "sum_binary", - all_versions = TRUE + .f = ~ sum(.x$DT$binary), + .before = 2, + .new_col_name = "sum_binary", + .all_versions = TRUE ) xx2 <- tibble( geo_value = rep("ak", 4), - time_value = test_date + c(4, 5, 6, 7), + version = test_date + c(4, 5, 6, 7), sum_binary = c( 2^3 + 2^2, 2^6 + 2^3, @@ -535,10 +522,10 @@ test_that("epix_slide with all_versions option works as intended", { xx3 <- xx %>% group_by(dplyr::across(dplyr::all_of("geo_value"))) %>% epix_slide( - f = ~ sum(.x$DT$binary), - before = 2, - new_col_name = "sum_binary", - all_versions = TRUE + .f = ~ sum(.x$DT$binary), + .before = 2, + .new_col_name = "sum_binary", + .all_versions = TRUE ) expect_identical(xx1, xx3) # This and * Imply xx2 and xx3 are identical @@ -557,7 +544,7 @@ test_that("epix_slide with all_versions option works as intended", { # expect_identical( # ea_updated_stale %>% # group_by(geo_value) %>% -# epix_slide(~ slice_head(.x, n = 1L), before = 10L) %>% +# epix_slide(~ slice_head(.x, n = 1L), .before = 10L) %>% # ungroup() %>% # attr("metadata") %>% # .$as_of, @@ -569,7 +556,7 @@ test_that("epix_slide with all_versions option works as intended", { test_that("epix_slide works with 0-row computation outputs", { epix_slide_empty <- function(ea, ...) { ea %>% - epix_slide(before = 5, ..., function(x, gk, rtv) { + epix_slide(.before = 5, ..., function(x, gk, rtv) { tibble::tibble() }) } @@ -577,7 +564,7 @@ test_that("epix_slide works with 0-row computation outputs", { ea %>% epix_slide_empty(), tibble::tibble( - time_value = ea$DT$version[integer(0)] + version = ea$DT$version[integer(0)] ) ) expect_identical( @@ -586,67 +573,47 @@ test_that("epix_slide works with 0-row computation outputs", { epix_slide_empty(), tibble::tibble( geo_value = ea$DT$geo_value[integer(0)], - time_value = ea$DT$version[integer(0)] + version = ea$DT$version[integer(0)] ) %>% group_by(geo_value) ) - # with `all_versions=TRUE`, we have something similar but never get an + # with `.all_versions=TRUE`, we have something similar but never get an # `epi_df`: expect_identical( ea %>% - epix_slide_empty(all_versions = TRUE), + epix_slide_empty(.all_versions = TRUE), tibble::tibble( - time_value = ea$DT$version[integer(0)] + version = ea$DT$version[integer(0)] ) ) expect_identical( ea %>% group_by(geo_value) %>% - epix_slide_empty(all_versions = TRUE), + epix_slide_empty(.all_versions = TRUE), tibble::tibble( geo_value = ea$DT$geo_value[integer(0)], - time_value = ea$DT$version[integer(0)] + version = ea$DT$version[integer(0)] ) %>% group_by(geo_value) ) }) -# nolint start: commented_code_linter. -# test_that("epix_slide grouped by geo can produce `epi_df` output", { -# # This is a characterization test. 
Not sure we actually want this behavior; -# # https://github.com/cmu-delphi/epiprocess/pull/290#issuecomment-1489099157 -# expect_identical( -# ea %>% -# group_by(geo_value) %>% -# epix_slide(before = 5L, function(x,g) { -# tibble::tibble(value = 42) -# }, names_sep = NULL), -# tibble::tibble( -# geo_value = "ak", -# time_value = epix_slide_ref_time_values_default(ea), -# value = 42 -# ) %>% -# as_epi_df(as_of = ea$versions_end) -# ) -# }) -# nolint end - test_that("epix_slide alerts if the provided f doesn't take enough args", { f_xgt <- function(x, g, t) dplyr::tibble(value = mean(x$binary), count = length(x$binary)) # If `regexp` is NA, asserts that there should be no errors/messages. - expect_error(epix_slide(xx, f = f_xgt, before = 2), regexp = NA) - expect_warning(epix_slide(xx, f = f_xgt, before = 2), regexp = NA) + expect_error(epix_slide(xx, .f = f_xgt, .before = 2), regexp = NA) + expect_warning(epix_slide(xx, .f = f_xgt, .before = 2), regexp = NA) f_x_dots <- function(x, ...) dplyr::tibble(value = mean(x$binary), count = length(x$binary)) - expect_warning(epix_slide(xx, f_x_dots, before = 2), + expect_warning(epix_slide(xx, f_x_dots, .before = 2), class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots" ) }) -test_that("epix_slide computation via formula can use ref_time_value", { +test_that("epix_slide computation via formula can use version", { xx_ref <- tibble( geo_value = rep("ak", 4), - time_value = test_date + c(4, 5, 6, 7), + version = test_date + c(4, 5, 6, 7), slide_value = test_date + c(4, 5, 6, 7) ) %>% group_by(geo_value) @@ -654,8 +621,8 @@ test_that("epix_slide computation via formula can use ref_time_value", { xx1 <- xx %>% group_by(.data$geo_value) %>% epix_slide( - f = ~.ref_time_value, - before = 2 + .f = ~.version, + .before = 2 ) expect_identical(xx1, xx_ref) @@ -663,8 +630,8 @@ test_that("epix_slide computation via formula can use ref_time_value", { xx2 <- xx %>% group_by(.data$geo_value) %>% epix_slide( - f = ~.z, - before = 2 + .f = ~.z, + .before = 2 ) expect_identical(xx2, xx_ref) @@ -672,17 +639,17 @@ test_that("epix_slide computation via formula can use ref_time_value", { xx3 <- xx %>% group_by(.data$geo_value) %>% epix_slide( - f = ~..3, - before = 2 + .f = ~..3, + .before = 2 ) expect_identical(xx3, xx_ref) }) -test_that("epix_slide computation via function can use ref_time_value", { +test_that("epix_slide computation via function can use version", { xx_ref <- tibble( geo_value = rep("ak", 4), - time_value = test_date + c(4, 5, 6, 7), + version = test_date + c(4, 5, 6, 7), slide_value = test_date + c(4, 5, 6, 7) ) %>% group_by(geo_value) @@ -690,18 +657,18 @@ test_that("epix_slide computation via function can use ref_time_value", { xx1 <- xx %>% group_by(.data$geo_value) %>% epix_slide( - f = function(x, g, t) t, - before = 2 + .f = function(x, g, t) t, + .before = 2 ) expect_identical(xx1, xx_ref) }) -test_that("epix_slide computation via dots can use ref_time_value and group", { - # ref_time_value +test_that("epix_slide computation via dots can use version and group", { + # version xx_ref <- tibble( geo_value = rep("ak", 4), - time_value = test_date + c(4, 5, 6, 7), + version = test_date + c(4, 5, 6, 7), slide_value = test_date + c(4, 5, 6, 7) ) %>% group_by(geo_value) @@ -709,8 +676,8 @@ test_that("epix_slide computation via dots can use ref_time_value and group", { xx1 <- xx %>% group_by(.data$geo_value) %>% epix_slide( - before = 2, - slide_value = .ref_time_value + .before = 2, + slide_value = .version ) 
expect_identical(xx1, xx_ref) @@ -718,7 +685,7 @@ test_that("epix_slide computation via dots can use ref_time_value and group", { # group_key xx_ref <- tibble( geo_value = rep("ak", 4), - time_value = test_date + c(4, 5, 6, 7), + version = test_date + c(4, 5, 6, 7), slide_value = "ak" ) %>% group_by(geo_value) @@ -727,7 +694,7 @@ test_that("epix_slide computation via dots can use ref_time_value and group", { xx3 <- xx %>% group_by(.data$geo_value) %>% epix_slide( - before = 2, + .before = 2, slide_value = .group_key$geo_value ) @@ -738,7 +705,7 @@ test_that("epix_slide computation via dots can use ref_time_value and group", { xx %>% group_by(.data$geo_value) %>% epix_slide( - before = 2, + .before = 2, slide_value = nrow(.group_key) ), NA @@ -749,14 +716,14 @@ test_that("epix_slide computation via dots outputs the same result using col nam xx_ref <- xx %>% group_by(.data$geo_value) %>% epix_slide( - before = 2, + .before = 2, sum_binary = sum(binary) ) xx1 <- xx %>% group_by(.data$geo_value) %>% epix_slide( - before = 2, + .before = 2, sum_binary = sum(.x$binary) ) @@ -765,7 +732,7 @@ test_that("epix_slide computation via dots outputs the same result using col nam xx2 <- xx %>% group_by(.data$geo_value) %>% epix_slide( - before = 2, + .before = 2, sum_binary = sum(.data$binary) ) @@ -777,7 +744,7 @@ test_that("`epix_slide` doesn't decay date output", { xx$DT %>% as_tibble() %>% as_epi_archive() %>% - epix_slide(before = 5, ~ attr(.x, "metadata")$as_of) %>% + epix_slide(.before = 5, ~ attr(.x, "metadata")$as_of) %>% `[[`("slide_value") %>% inherits("Date") ) @@ -785,8 +752,93 @@ test_that("`epix_slide` doesn't decay date output", { test_that("`epix_slide` can access objects inside of helper functions", { helper <- function(archive_haystack, time_value_needle) { - archive_haystack %>% epix_slide(has_needle = time_value_needle %in% time_value, before = Inf) + archive_haystack %>% epix_slide(has_needle = time_value_needle %in% time_value) } expect_no_error(helper(archive_cases_dv_subset, as.Date("2021-01-01"))) expect_no_error(helper(xx, 3L)) }) + +test_that("`epix_slide` works with .before = Inf", { + expect_equal( + xx %>% + group_by(geo_value) %>% + epix_slide(sum_binary = sum(binary), .before = Inf) %>% + pull(sum_binary), + xx %>% + group_by(geo_value) %>% + epix_slide(sum_binary = sum(binary), .before = 365000) %>% + pull(sum_binary) + ) +}) + +test_that("`epix_slide` de-dupes labeling & value columns", { + # Deduping `version`: + # When comp is formula -> unpacked tibble: + forecasts1a <- xx %>% epix_slide(~ tibble( + version = .version, + geo_value = "not a group label, can be anything", + time_value = .version + c(0, 7), + value = 42 + )) + # When comp is data-masking: + forecasts1b <- xx %>% epix_slide( + version = .version, + geo_value = "not a group label, can be anything", + time_value = .version + c(0, 7), + value = 42 + ) + # Expected value: + forecasts1ref <- tibble( + version = rep(test_date + 4:7, each = 2L), + geo_value = "not a group label, can be anything", + time_value = version + c(0, 7), + value = 42 + ) + expect_equal(forecasts1a, forecasts1ref) + expect_equal(forecasts1b, forecasts1ref) + # Mismatch not accepted: + expect_error(xx %>% epix_slide( + version = .version - 1, + time_value = version + c(0, 7), + value = 42 + )) + # Solely parroting back values without any new columns seems likely to be + # nonsense (though this example would sort of act like a `distinct` + # operation if we accepted it): + expect_error(xx %>% epix_slide(~.version, .new_col_name = 
"version")) + + # Deduping group label: + # When comp is formula -> unpacked tibble: + forecasts2a <- xx %>% + group_by(geo_value) %>% + epix_slide(~ tibble( + geo_value = .group_key$geo_value, + version = .version, + time_value = .version + c(0, 7), + value = 42 + )) + # When comp is data-masking: + forecasts2b <- xx %>% + group_by(geo_value) %>% + epix_slide( + geo_value = .group_key$geo_value, + version = .version, + time_value = .version + c(0, 7), + value = 42 + ) + # Expected value: + forecasts2ref <- tibble( + geo_value = "ak", + version = rep(test_date + 4:7, each = 2L), + time_value = version + c(0, 7), + value = 42 + ) %>% group_by(geo_value) + expect_equal(forecasts2a, forecasts2ref) + expect_equal(forecasts2b, forecasts2ref) + # Mismatch not accepted: + expect_error(xx %>% group_by(geo_value) %>% epix_slide(geo_value = "bogus")) + # Solely parroting back values without any new columns seems likely to be + # nonsense (though this example would sort of act like a `distinct` + # operation if we accepted it): + expect_error(xx %>% group_by(geo_value) %>% epix_slide(~ .group_key$geo_value, .new_col_name = "geo_value")) +}) diff --git a/tests/testthat/test-grouped_epi_archive.R b/tests/testthat/test-grouped_epi_archive.R index 413741aa..8ed5ea02 100644 --- a/tests/testthat/test-grouped_epi_archive.R +++ b/tests/testthat/test-grouped_epi_archive.R @@ -1,6 +1,6 @@ test_that("Grouping, regrouping, and ungrouping archives works as intended", { # From an example: - library(dplyr) + suppressPackageStartupMessages(library(dplyr)) toy_archive <- tribble( ~geo_value, ~age_group, ~time_value, ~version, ~value, @@ -37,9 +37,7 @@ test_that("Grouping, regrouping, and ungrouping archives works as intended", { expect_error(toy_archive %>% group_by(.drop = "bogus"), regexp = "Must be of type 'logical', not 'character'" ) - expect_warning(toy_archive %>% group_by(.drop = FALSE), - class = "epiprocess__group_by_epi_archive__drop_FALSE_no_factors" - ) + expect_no_warning(toy_archive %>% group_by(.drop = FALSE)) expect_warning(toy_archive %>% group_by(geo_value, .drop = FALSE), class = "epiprocess__group_by_epi_archive__drop_FALSE_no_factors" ) @@ -50,9 +48,9 @@ test_that("Grouping, regrouping, and ungrouping archives works as intended", { ) expect_identical( grouped_factor_then_nonfactor %>% - epix_slide(before = 10, s = sum(value)), + epix_slide(.before = 10, s = sum(value)), tibble::tribble( - ~age_group, ~geo_value, ~time_value, ~s, + ~age_group, ~geo_value, ~version, ~s, "pediatric", NA_character_, "2000-01-02", 0, "adult", "us", "2000-01-02", 121, "pediatric", "us", "2000-01-03", 5, @@ -60,30 +58,16 @@ test_that("Grouping, regrouping, and ungrouping archives works as intended", { ) %>% mutate( age_group = ordered(age_group, c("pediatric", "adult")), - time_value = as.Date(time_value) + version = as.Date(version) ) %>% - # nolint start: commented_code_linter. - # # See - # # https://github.com/cmu-delphi/epiprocess/pull/290#issuecomment-1489099157 - # # and - # # https://github.com/cmu-delphi/epiprocess/pull/311#issuecomment-1535149256 - # # for why this is commented out, pending some design - # # decisions. 
- # # - # as_epi_df(geo_type = "nation", # bug; want "custom" from NA; issue #242 - # as_of = as.Date("2000-01-03"), - # additional_metadata = list(other_keys = "age_group")) %>% - # # put back in expected order; see issue #166: - # select(age_group, geo_value, time_value, s) %>% - # nolint end group_by(age_group, geo_value, .drop = FALSE) ) expect_identical( toy_archive %>% group_by(geo_value, age_group, .drop = FALSE) %>% - epix_slide(before = 10, s = sum(value)), + epix_slide(.before = 10, s = sum(value)), tibble::tribble( - ~geo_value, ~age_group, ~time_value, ~s, + ~geo_value, ~age_group, ~version, ~s, "us", "pediatric", "2000-01-02", 0, "us", "adult", "2000-01-02", 121, "us", "pediatric", "2000-01-03", 5, @@ -91,10 +75,10 @@ test_that("Grouping, regrouping, and ungrouping archives works as intended", { ) %>% mutate( age_group = ordered(age_group, c("pediatric", "adult")), - time_value = as.Date(time_value) + version = as.Date(version) ) %>% # as_epi_df(as_of = as.Date("2000-01-03"), - # additional_metadata = list(other_keys = "age_group")) %>% + # other_keys = "age_group") %>% # # put back in expected order; see issue #166: # select(geo_value, age_group, time_value, s) %>% group_by(geo_value, age_group, .drop = FALSE) diff --git a/tests/testthat/test-methods-epi_archive.R b/tests/testthat/test-methods-epi_archive.R index 6686400b..45ba6ea1 100644 --- a/tests/testthat/test-methods-epi_archive.R +++ b/tests/testthat/test-methods-epi_archive.R @@ -25,13 +25,13 @@ test_that("Errors are thrown due to bad epix_as_of inputs", { test_that("Warning against max_version being clobberable", { # none by default - expect_warning(regexp = NA, ea %>% epix_as_of(max_version = max(ea$DT$version))) - expect_warning(regexp = NA, ea %>% epix_as_of(max_version = min(ea$DT$version))) + expect_warning(regexp = NA, ea %>% epix_as_of(max(ea$DT$version))) + expect_warning(regexp = NA, ea %>% epix_as_of(min(ea$DT$version))) # but with `clobberable_versions_start` non-`NA`, yes ea_with_clobberable <- ea ea_with_clobberable$clobberable_versions_start <- max(ea_with_clobberable$DT$version) - expect_warning(ea_with_clobberable %>% epix_as_of(max_version = max(ea$DT$version))) - expect_warning(regexp = NA, ea_with_clobberable %>% epix_as_of(max_version = min(ea$DT$version))) + expect_warning(ea_with_clobberable %>% epix_as_of(max(ea$DT$version))) + expect_warning(regexp = NA, ea_with_clobberable %>% epix_as_of(min(ea$DT$version))) }) test_that("epix_as_of properly grabs the data and doesn't mutate key", { @@ -43,7 +43,7 @@ test_that("epix_as_of properly grabs the data and doesn't mutate key", { old_key <- data.table::key(ea2$DT) edf_as_of <- ea2 %>% - epix_as_of(max_version = as.Date("2020-06-03")) + epix_as_of(as.Date("2020-06-03")) edf_expected <- as_epi_df(tibble( geo_value = "ca", @@ -110,7 +110,6 @@ test_that("epix_truncate_version_after returns the same grouping type as input e expect_true(is_grouped_epi_archive(ea_as_of)) }) - test_that("epix_truncate_version_after returns the same groups as input grouped_epi_archive", { ea2 <- ea2_data %>% as_epi_archive() @@ -122,3 +121,10 @@ test_that("epix_truncate_version_after returns the same groups as input grouped_ epix_truncate_versions_after(max_version = as.Date("2020-06-04")) expect_equal(ea_as_of %>% groups(), ea_expected %>% groups()) }) + +test_that("group_vars works as expected", { + expect_equal( + ea2_data %>% as_epi_archive() %>% group_by(geo_value) %>% group_vars(), + "geo_value" + ) +}) diff --git a/tests/testthat/test-methods-epi_df.R 
b/tests/testthat/test-methods-epi_df.R index 27e9097c..3e5c180b 100644 --- a/tests/testthat/test-methods-epi_df.R +++ b/tests/testthat/test-methods-epi_df.R @@ -10,8 +10,7 @@ toy_epi_df <- tibble::tibble( indic_var1 = as.factor(rep(1:2, times = 5)), indic_var2 = as.factor(rep(letters[1:5], times = 2)) ) %>% as_epi_df( - additional_metadata = - list(other_keys = c("indic_var1", "indic_var2")) + other_keys = c("indic_var1", "indic_var2") ) att_toy <- attr(toy_epi_df, "metadata") @@ -70,21 +69,20 @@ test_that("Subsetting drops & does not drop the epi_df class appropriately", { expect_equal(ncol(col_subset2), 2L) # Row and col subset that contains geo_value and time_value - should be epi_df - row_col_subset2 <- toy_epi_df[2:3, 1:3] + row_col_subset2 <- toy_epi_df[2:3, c(1, 4)] att_row_col_subset2 <- attr(row_col_subset2, "metadata") expect_true(is_epi_df(row_col_subset2)) expect_equal(nrow(row_col_subset2), 2L) - expect_equal(ncol(row_col_subset2), 3L) + expect_equal(ncol(row_col_subset2), 2L) expect_identical(att_row_col_subset2$geo_type, att_toy$geo_type) expect_identical(att_row_col_subset2$time_type, att_toy$time_type) expect_identical(att_row_col_subset2$as_of, att_toy$as_of) - expect_identical(att_row_col_subset2$other_keys, character(0)) }) test_that("When duplicate cols in subset should abort", { expect_error(toy_epi_df[, c(2, 2:3, 4, 4, 4)], - "Duplicated column names: time_value, y", + "Duplicated column names: indic_var1, time_value", fixed = TRUE ) expect_error(toy_epi_df[1:4, c(1, 2:4, 1)], @@ -95,7 +93,7 @@ test_that("When duplicate cols in subset should abort", { test_that("Correct metadata when subset includes some of other_keys", { # Only include other_var of indic_var1 - only_indic_var1 <- toy_epi_df[, 1:5] + only_indic_var1 <- toy_epi_df[, c(1:2, 4:6)] att_only_indic_var1 <- attr(only_indic_var1, "metadata") expect_true(is_epi_df(only_indic_var1)) @@ -107,7 +105,7 @@ test_that("Correct metadata when subset includes some of other_keys", { expect_identical(att_only_indic_var1$other_keys, att_toy$other_keys[-2]) # Only include other_var of indic_var2 - only_indic_var2 <- toy_epi_df[, c(1:4, 6)] + only_indic_var2 <- toy_epi_df[, c(1, 3:6)] att_only_indic_var2 <- attr(only_indic_var2, "metadata") expect_true(is_epi_df(only_indic_var2)) @@ -131,19 +129,18 @@ test_that("Metadata is dropped by `as_tibble`", { }) test_that("Grouping are dropped by `as_tibble`", { - # tsibble is doing some method piracy, and overwriting as_tibble.grouped_df as of 1.1.5 - skip_if(packageVersion("tsibble") > "1.1.4") grouped_converted <- toy_epi_df %>% group_by(geo_value) %>% as_tibble() expect_true( !any(c("metadata", "groups") %in% names(attributes(grouped_converted))) ) + expect_s3_class(grouped_converted, class(tibble()), exact = TRUE) }) test_that("Renaming columns gives appropriate colnames and metadata", { edf <- tibble::tibble(geo_value = "ak", time_value = as.Date("2020-01-01"), age = 1, value = 1) %>% - as_epi_df(additional_metadata = list(other_keys = "age")) + as_epi_df(other_keys = "age") # renaming using base R renamed_edf1 <- edf %>% `[`(c("geo_value", "time_value", "age", "value")) %>% @@ -152,14 +149,14 @@ test_that("Renaming columns gives appropriate colnames and metadata", { expect_identical(attr(renamed_edf1, "metadata")$other_keys, c("age_group")) # renaming using select renamed_edf2 <- edf %>% - as_epi_df(additional_metadata = list(other_keys = "age")) %>% + as_epi_df(other_keys = "age") %>% select(geo_value, time_value, age_group = age, value) expect_identical(renamed_edf1, 
renamed_edf2) }) test_that("Renaming columns while grouped gives appropriate colnames and metadata", { gedf <- tibble::tibble(geo_value = "ak", time_value = as.Date("2020-01-01"), age = 1, value = 1) %>% - as_epi_df(additional_metadata = list(other_keys = "age")) %>% + as_epi_df(other_keys = "age") %>% group_by(geo_value) # renaming using base R renamed_gedf1 <- gedf %>% @@ -173,7 +170,161 @@ test_that("Renaming columns while grouped gives appropriate colnames and metadat expect_identical(attr(renamed_gedf1, "metadata")$other_keys, c("age_group")) # renaming using select renamed_gedf2 <- gedf %>% - as_epi_df(additional_metadata = list(other_keys = "age")) %>% select(geo_value, time_value, age_group = age, value) expect_identical(renamed_gedf1, renamed_gedf2) }) + +test_that("Additional `select` on `epi_df` tests", { + edf <- tibble::tibble(geo_value = "ak", time_value = as.Date("2020-01-01"), age = 1, value = 1) %>% + as_epi_df(other_keys = "age") + + # Dropping a non-geo_value epikey column doesn't decay, though maybe it + # should, since you'd expect that to possibly result in multiple rows per + # epikey (though not in this toy case), and while we don't require that, we + # sort of expect it: + edf_not_decayed <- edf %>% + select(geo_value, time_value, value) + expect_class(edf_not_decayed, "epi_df") + expect_identical(attr(edf_not_decayed, "metadata")$other_keys, character(0L)) + + # Dropping geo_value does decay: + edf_decayed <- edf %>% + select(age, time_value, value) + expect_false(inherits(edf_decayed, "epi_df")) + expect_identical(attr(edf_decayed, "metadata"), NULL) +}) + +test_that("complete.epi_df works", { + start_date <- as.Date("2020-01-01") + daily_edf <- tibble::tribble( + ~geo_value, ~time_value, ~value, + 1, start_date + 1, 1, + 1, start_date + 3, 3, + 2, start_date + 2, 2, + 2, start_date + 3, 3, + ) %>% + as_epi_df(as_of = start_date + 3) + # Complete without grouping puts all the geo_values on the same min and max + # time_value index + expect_identical( + daily_edf %>% + complete(geo_value, time_value = full_seq(time_value, period = 1)), + tibble::tribble( + ~geo_value, ~time_value, ~value, + 1, start_date + 1, 1, + 1, start_date + 2, NA, + 1, start_date + 3, 3, + 2, start_date + 1, NA, + 2, start_date + 2, 2, + 2, start_date + 3, 3, + ) %>% + as_epi_df(as_of = start_date + 3) + ) + # Complete with grouping puts all the geo_values on individual min and max + # time_value indices + expect_identical( + daily_edf %>% + group_by(geo_value) %>% + complete(time_value = full_seq(time_value, period = 1)), + tibble::tribble( + ~geo_value, ~time_value, ~value, + 1, start_date + 1, 1, + 1, start_date + 2, NA, + 1, start_date + 3, 3, + 2, start_date + 2, 2, + 2, start_date + 3, 3, + ) %>% + as_epi_df(as_of = start_date + 3) %>% + group_by(geo_value) + ) + # Complete has explicit=TRUE by default, but if it's FALSE, then complete only fills the implicit gaps + # not those that are explicitly NA + daily_edf <- tibble::tribble( + ~geo_value, ~time_value, ~value, + 1, start_date + 1, 1, + 1, start_date + 2, NA, + 1, start_date + 3, 3, + 2, start_date + 2, 2, + 2, start_date + 3, 3, + ) %>% + as_epi_df(as_of = start_date + 3) + expect_identical( + daily_edf %>% + complete(geo_value, time_value = full_seq(time_value, period = 1), fill = list(value = 0), explicit = FALSE), + tibble::tribble( + ~geo_value, ~time_value, ~value, + 1, start_date + 1, 1, + 1, start_date + 2, NA, + 1, start_date + 3, 3, + 2, start_date + 1, 0, + 2, start_date + 2, 2, + 2, start_date + 3, 3, + ) %>% + 
as_epi_df(as_of = start_date + 3) + ) + # Complete works for weekly data and can take a fill value + # No grouping + weekly_edf <- tibble::tribble( + ~geo_value, ~time_value, ~value, + 1, start_date + 1, 1, + 1, start_date + 15, 3, + 2, start_date + 8, 2, + 2, start_date + 15, 3, + ) %>% + as_epi_df(as_of = start_date + 3) + expect_identical( + weekly_edf %>% + complete(geo_value, + time_value = full_seq(time_value, period = 7), + fill = list(value = 0) + ), + tibble::tribble( + ~geo_value, ~time_value, ~value, + 1, start_date + 1, 1, + 1, start_date + 8, 0, + 1, start_date + 15, 3, + 2, start_date + 1, 0, + 2, start_date + 8, 2, + 2, start_date + 15, 3, + ) %>% + as_epi_df(as_of = start_date + 3) + ) + # With grouping + expect_identical( + weekly_edf %>% + group_by(geo_value) %>% + complete( + time_value = full_seq(time_value, period = 7), + fill = list(value = 0) + ), + tibble::tribble( + ~geo_value, ~time_value, ~value, + 1, start_date + 1, 1, + 1, start_date + 8, 0, + 1, start_date + 15, 3, + 2, start_date + 8, 2, + 2, start_date + 15, 3, + ) %>% + as_epi_df(as_of = start_date + 3) %>% + group_by(geo_value) + ) +}) + +test_that("sum_groups_epi_df works", { + out <- toy_epi_df %>% sum_groups_epi_df(sum_cols = "x") + expected_out <- toy_epi_df %>% + group_by(time_value) %>% + summarize(x = sum(x)) %>% + mutate(geo_value = "total") %>% + as_epi_df(as_of = attr(toy_epi_df, "metadata")$as_of) + expect_equal(out, expected_out) + + out <- toy_epi_df %>% + sum_groups_epi_df(sum_cols = c("x", "y"), group_cols = c("time_value", "geo_value", "indic_var1")) + expected_out <- toy_epi_df %>% + group_by(time_value, geo_value, indic_var1) %>% + summarize(x = sum(x), y = sum(y), .groups = "drop") %>% + as_epi_df(as_of = attr(toy_epi_df, "metadata")$as_of, other_keys = "indic_var1") %>% + arrange_canonical() + expect_equal(out, expected_out) +}) diff --git a/tests/testthat/test-revision-latency-functions.R b/tests/testthat/test-revision-latency-functions.R new file mode 100644 index 00000000..ff722068 --- /dev/null +++ b/tests/testthat/test-revision-latency-functions.R @@ -0,0 +1,41 @@ +dummy_ex <- tibble::tribble( + ~geo_value, ~time_value, ~version, ~value, + # al 1 has 1 real revision, a lag of 0, and changes by 99 + "al", as.Date("2020-01-01"), as.Date("2020-01-01"), 1, + "al", as.Date("2020-01-01"), as.Date("2020-01-10"), 1, + "al", as.Date("2020-01-01"), as.Date("2020-01-20"), 100, + # al 2 has no revision, a min lag of 0, and a rel_spread of 0 + "al", as.Date("2020-01-02"), as.Date("2020-01-02"), 1, + # al 3 has 1 revision and a min lag of 1, and a change of 3 + "al", as.Date("2020-01-03"), as.Date("2020-01-04"), 1, + "al", as.Date("2020-01-03"), as.Date("2020-01-05"), 4, + # al 4 has 1 revision if NA's are included (none if not), a lag of 0/1, and changes of 0 + "al", as.Date("2020-01-04"), as.Date("2020-01-04"), NA, + "al", as.Date("2020-01-04"), as.Date("2020-01-05"), 9, + # ak 1 has 4 revisions w/out NAs, but 6 with NAs, + # a min lag of 2, and a change of 101 + "ak", as.Date("2020-01-01"), as.Date("2020-01-03"), 1, + "ak", as.Date("2020-01-01"), as.Date("2020-01-05"), NA, + "ak", as.Date("2020-01-01"), as.Date("2020-01-06"), 1, + "ak", as.Date("2020-01-01"), as.Date("2020-01-07"), 5, + "ak", as.Date("2020-01-01"), as.Date("2020-01-08"), 6, + "ak", as.Date("2020-01-01"), as.Date("2020-01-09"), 7, + "ak", as.Date("2020-01-01"), as.Date("2020-01-20"), 102, + # ak 2 has 1 revision, a min lag of 4, a change of 9, and a rel change of 9% + "ak", as.Date("2020-01-02"), as.Date("2020-01-06"), 100, + "ak",
as.Date("2020-01-02"), as.Date("2020-01-07"), 91, + # ak 3 has 0 revisions, and a value of zero, and thus a rel_spread of NaN + "ak", as.Date("2020-01-03"), as.Date("2020-01-06"), 0, + "ak", as.Date("2020-01-03"), as.Date("2020-01-07"), 0, +) %>% + as_epi_archive(versions_end = as.Date("2022-01-01"), compactify = FALSE) + +test_that("revision_summary works for a dummy dataset", { + expect_snapshot(dummy_ex %>% revision_summary() %>% print(n = 10, width = 300)) + expect_snapshot(dummy_ex %>% revision_summary(drop_nas = FALSE) %>% print(n = 10, width = 300)) +}) +test_that("tidyselect is functional", { + expect_no_error(revision_summary(dummy_ex, value)) + expect_no_error(revision_summary(dummy_ex, starts_with("val"))) +}) +test_that("revision_summary works for various timetypes", {}) diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R index e220af16..a159f98e 100644 --- a/tests/testthat/test-utils.R +++ b/tests/testthat/test-utils.R @@ -77,41 +77,56 @@ test_that("assert_sufficient_f_args alerts if the provided f doesn't take enough f_xgt_dots <- function(x, g, t, ...) dplyr::tibble(value = mean(x$binary), count = length(x$binary)) # If `regexp` is NA, asserts that there should be no errors/messages. - expect_error(assert_sufficient_f_args(f_xgt), regexp = NA) - expect_warning(assert_sufficient_f_args(f_xgt), regexp = NA) - expect_error(assert_sufficient_f_args(f_xgt_dots), regexp = NA) - expect_warning(assert_sufficient_f_args(f_xgt_dots), regexp = NA) + expect_no_error(assert_sufficient_f_args(f_xgt, .ref_time_value_label = "reference time value")) + expect_no_warning(assert_sufficient_f_args(f_xgt, .ref_time_value_label = "reference time value")) + expect_no_error(assert_sufficient_f_args(f_xgt_dots, .ref_time_value_label = "reference time value")) + expect_no_warning(assert_sufficient_f_args(f_xgt_dots, .ref_time_value_label = "reference time value")) f_x_dots <- function(x, ...) dplyr::tibble(value = mean(x$binary), count = length(x$binary)) f_dots <- function(...) 
dplyr::tibble(value = c(5), count = c(2)) f_x <- function(x) dplyr::tibble(value = mean(x$binary), count = length(x$binary)) f <- function() dplyr::tibble(value = c(5), count = c(2)) - expect_warning(assert_sufficient_f_args(f_x_dots), + expect_warning(assert_sufficient_f_args(f_x_dots, .ref_time_value_label = "reference time value"), regexp = ", the group key and reference time value will be included", class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots" ) - expect_warning(assert_sufficient_f_args(f_dots), + expect_warning(assert_sufficient_f_args(f_dots, .ref_time_value_label = "reference time value"), regexp = ", the window data, group key, and reference time value will be included", class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots" ) - expect_error(assert_sufficient_f_args(f_x), + expect_error(assert_sufficient_f_args(f_x, .ref_time_value_label = "reference time value"), class = "epiprocess__assert_sufficient_f_args__f_needs_min_args" ) - expect_error(assert_sufficient_f_args(f), + expect_error(assert_sufficient_f_args(f, .ref_time_value_label = "reference time value"), class = "epiprocess__assert_sufficient_f_args__f_needs_min_args" ) + # Make sure we generate the same sort of conditions on some external functions + # that have caused surprises in the past: + expect_warning(assert_sufficient_f_args(mean, .ref_time_value_label = "reference time value"), + regexp = ", the group key and reference time value will be included", + class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots" + ) + expect_warning(assert_sufficient_f_args(sum, .ref_time_value_label = "reference time value"), + regexp = ", the window data, group key, and reference time value will be included", + class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots" + ) + expect_warning(assert_sufficient_f_args(dplyr::slice, .ref_time_value_label = "reference time value"), + regexp = ", the group key and reference time value will be included", + class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots" + ) + f_xs_dots <- function(x, setting = "a", ...) dplyr::tibble(value = mean(x$binary), count = length(x$binary)) f_xs <- function(x, setting = "a") dplyr::tibble(value = mean(x$binary), count = length(x$binary)) - expect_warning(assert_sufficient_f_args(f_xs_dots, setting = "b"), + expect_warning(assert_sufficient_f_args(f_xs_dots, setting = "b", .ref_time_value_label = "reference time value"), class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots" ) - expect_error(assert_sufficient_f_args(f_xs, setting = "b"), + expect_error(assert_sufficient_f_args(f_xs, setting = "b", .ref_time_value_label = "reference time value"), class = "epiprocess__assert_sufficient_f_args__f_needs_min_args_plus_forwarded" ) - expect_error(assert_sufficient_f_args(f_xgt, "b"), + expect_error(assert_sufficient_f_args(f_xgt, "b", .ref_time_value_label = "reference time value"), class = "epiprocess__assert_sufficient_f_args__f_needs_min_args_plus_forwarded" ) }) @@ -121,15 +136,15 @@ test_that("assert_sufficient_f_args alerts if the provided f has defaults for th f_xgt_dots <- function(x = 1, g, t, ...) dplyr::tibble(value = mean(x$binary), count = length(x$binary)) f_x_dots <- function(x = 1, ...) 
dplyr::tibble(value = mean(x$binary), count = length(x$binary)) - expect_error(assert_sufficient_f_args(f_xgt), - regexp = "pass the group key to `f`'s g argument,", + expect_error(assert_sufficient_f_args(f_xgt, .ref_time_value_label = "reference time value"), + regexp = "pass the group key to `\\.f`'s g argument,", class = "epiprocess__assert_sufficient_f_args__required_args_contain_defaults" ) - expect_error(assert_sufficient_f_args(f_xgt_dots), - regexp = "pass the window data to `f`'s x argument,", + expect_error(assert_sufficient_f_args(f_xgt_dots, .ref_time_value_label = "reference time value"), + regexp = "pass the window data to `\\.f`'s x argument,", class = "epiprocess__assert_sufficient_f_args__required_args_contain_defaults" ) - expect_error(suppressWarnings(assert_sufficient_f_args(f_x_dots)), + expect_error(suppressWarnings(assert_sufficient_f_args(f_x_dots, .ref_time_value_label = "reference time value")), class = "epiprocess__assert_sufficient_f_args__required_args_contain_defaults" ) @@ -138,23 +153,26 @@ test_that("assert_sufficient_f_args alerts if the provided f has defaults for th f_xs_dots <- function(x = 1, setting = "a", ...) dplyr::tibble(value = mean(x$binary), count = length(x$binary)) # forwarding named dots should prevent some complaints: - expect_no_error(assert_sufficient_f_args(f_xsgt, setting = "b")) - expect_no_error(assert_sufficient_f_args(f_xsgt_dots, setting = "b")) - expect_error(suppressWarnings(assert_sufficient_f_args(f_xs_dots, setting = "b")), - regexp = "pass the window data to `f`'s x argument", + expect_no_error(assert_sufficient_f_args(f_xsgt, setting = "b", .ref_time_value_label = "reference time value")) + expect_no_error(assert_sufficient_f_args(f_xsgt_dots, setting = "b", .ref_time_value_label = "reference time value")) + expect_error( + suppressWarnings( + assert_sufficient_f_args(f_xs_dots, setting = "b", .ref_time_value_label = "reference time value") + ), + regexp = "pass the window data to `\\.f`'s x argument", class = "epiprocess__assert_sufficient_f_args__required_args_contain_defaults" ) # forwarding unnamed dots should not: - expect_error(assert_sufficient_f_args(f_xsgt, "b"), + expect_error(assert_sufficient_f_args(f_xsgt, "b", .ref_time_value_label = "reference time value"), class = "epiprocess__assert_sufficient_f_args__required_args_contain_defaults" ) - expect_error(assert_sufficient_f_args(f_xsgt_dots, "b"), + expect_error(assert_sufficient_f_args(f_xsgt_dots, "b", .ref_time_value_label = "reference time value"), class = "epiprocess__assert_sufficient_f_args__required_args_contain_defaults" ) expect_error( expect_warning( - assert_sufficient_f_args(f_xs_dots, "b"), + assert_sufficient_f_args(f_xs_dots, "b", .ref_time_value_label = "reference time value"), class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots" ), class = "epiprocess__assert_sufficient_f_args__required_args_contain_defaults" @@ -163,57 +181,75 @@ test_that("assert_sufficient_f_args alerts if the provided f has defaults for th # forwarding no dots should produce a different error message in some cases: expect_error( expect_warning( - assert_sufficient_f_args(f_xs_dots), + assert_sufficient_f_args(f_xs_dots, .ref_time_value_label = "reference time value"), class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots" ), - regexp = "window data and group key to `f`'s x and setting argument", + regexp = "window data and group key to `\\.f`'s x and setting argument", class = 
"epiprocess__assert_sufficient_f_args__required_args_contain_defaults" ) }) test_that("computation formula-derived functions take all argument types", { # positional - expect_identical(as_slide_computation(~ ..2 + ..3)(1, 2, 3), 5) - expect_identical(as_slide_computation(~..1)(1, 2, 3), 1) + expect_identical(as_time_slide_computation(~ ..2 + ..3)(1, 2, 3), 5) + expect_identical(as_time_slide_computation(~..1)(1, 2, 3), 1) # Matching rlang, purr, dplyr usage - expect_identical(as_slide_computation(~ .x + .z)(1, 2, 3), 4) - expect_identical(as_slide_computation(~ .x + .y)(1, 2, 3), 3) + expect_identical(as_time_slide_computation(~ .x + .z)(1, 2, 3), 4) + expect_identical(as_time_slide_computation(~ .x + .y)(1, 2, 3), 3) # named - expect_identical(as_slide_computation(~ . + .ref_time_value)(1, 2, 3), 4) - expect_identical(as_slide_computation(~.group_key)(1, 2, 3), 2) + expect_identical(as_time_slide_computation(~ . + .ref_time_value)(1, 2, 3), 4) + expect_identical(as_time_slide_computation(~.group_key)(1, 2, 3), 2) }) test_that("as_slide_computation passes functions unaltered", { f <- function(a, b, c) { a * b * c + 5 } - expect_identical(as_slide_computation(f), f) + expect_identical(as_time_slide_computation(f), f) }) test_that("as_slide_computation raises errors as expected", { # Formulas must be one-sided - expect_error(as_slide_computation(y ~ ..1), + expect_error(as_time_slide_computation(y ~ ..1), class = "epiprocess__as_slide_computation__formula_is_twosided" ) # Formulas can't be paired with ... - expect_error(as_slide_computation(~..1, method = "fn"), + expect_error(as_time_slide_computation(~..1, method = "fn"), class = "epiprocess__as_slide_computation__formula_with_dots" ) # `f_env` must be an environment formula_without_env <- stats::as.formula(~..1) rlang::f_env(formula_without_env) <- 5 - expect_error(as_slide_computation(formula_without_env), + expect_error(as_time_slide_computation(formula_without_env), class = "epiprocess__as_slide_computation__formula_has_no_env" ) - # `f` must be a function, formula, or string - expect_error(as_slide_computation(5), + # `.f` must be a function, formula, or string + expect_error(as_time_slide_computation(5), class = "epiprocess__as_slide_computation__cant_convert_catchall" ) }) +test_that("as_slide_computation works", { + f1 <- as_slide_computation(~ .z - .x$time_value, + .ref_time_value_long_varnames = character(0L), + .ref_time_value_label = "third argument" + ) + expect_equal(f1(tibble::tibble(time_value = 10), tibble::tibble(), 12), 2) + f2 <- as_time_slide_computation(~ .ref_time_value - .x$time_value) + expect_equal(f2(tibble::tibble(time_value = 10), tibble::tibble(), 12), 2) + f3 <- as_diagonal_slide_computation(~ .version - .x$time_value) + expect_equal(f3(tibble::tibble(time_value = 10), tibble::tibble(), 12), 2) + f4 <- as_diagonal_slide_computation(~ .ref_time_value - .x$time_value) + expect_equal(f4(tibble::tibble(time_value = 10), tibble::tibble(), 12), 2) + g <- as_time_slide_computation(~ -1 * .) 
+ expect_equal(g(4), -4) + h <- as_time_slide_computation(~ .x - .group_key) + expect_equal(h(6, 3), 3) +}) + test_that("guess_period works", { # Error cases: expect_error(guess_period(numeric(0L)), class = "epiprocess__guess_period__not_enough_times") @@ -240,8 +276,8 @@ test_that("guess_period works", { weekly_dates ) # On POSIXcts: - daily_posixcts <- as.POSIXct(daily_dates, tz = "ET") + 3600 - weekly_posixcts <- as.POSIXct(weekly_dates, tz = "ET") + 3600 + daily_posixcts <- as.POSIXct(daily_dates, tz = "US/Aleutian") + 3600 + weekly_posixcts <- as.POSIXct(weekly_dates, tz = "US/Aleutian") + 3600 expect_identical( daily_posixcts[[1L]] + guess_period(daily_posixcts) * (seq_along(daily_posixcts) - 1L), daily_posixcts ) @@ -251,8 +287,8 @@ test_that("guess_period works", { weekly_posixcts ) # On POSIXlts: - daily_posixlts <- as.POSIXlt(daily_dates, tz = "ET") + 3600 - weekly_posixlts <- as.POSIXlt(weekly_dates, tz = "ET") + 3600 + daily_posixlts <- as.POSIXlt(daily_dates, tz = "UTC") + 3600 + weekly_posixlts <- as.POSIXlt(weekly_dates, tz = "UTC") + 3600 expect_identical( daily_posixlts[[1L]] + guess_period(daily_posixlts) * (seq_along(daily_posixlts) - 1L), daily_posixlts diff --git a/vignettes/advanced.Rmd b/vignettes/advanced.Rmd deleted file mode 100644 index 1ea13c5f..00000000 --- a/vignettes/advanced.Rmd +++ /dev/null @@ -1,530 +0,0 @@ ---- -title: Advanced sliding with nonstandard outputs -output: rmarkdown::html_vignette -vignette: > - %\VignetteIndexEntry{Advanced sliding with nonstandard outputs} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -In this vignette, we discuss how to use the sliding functionality in the -`epiprocess` package with less common grouping schemes or with computations that -have advanced output structures. The output of a slide computation should either -be an atomic value/vector, or a data frame. This data frame can have multiple -columns, multiple rows, or both. - -During basic usage (e.g., when all optional arguments are set to their defaults): - -* `epi_slide(edf, <computation>, .....)`: - * keeps **all** columns of `edf`, adds computed column(s) - * outputs **one row per row in `edf`** (recycling outputs from - computations appropriately if there are multiple time series bundled - together inside any group(s)) - * maintains the grouping or ungroupedness of `edf` - * is roughly analogous to (the non-sliding) **`dplyr::mutate` followed by - `dplyr::arrange(time_value, .by_group = TRUE)`** - * outputs an **`epi_df`** if the required columns are present, otherwise a - tibble -* `epix_slide(ea, <computation>, .....)`: - * keeps **grouping and `time_value`** columns of `ea`, adds computed - column(s) - * outputs **any number of rows** (computations are allowed to output any - number of elements/rows, and no recycling is performed) - * maintains the grouping or ungroupedness of `ea`, unless it was explicitly - grouped by zero variables; this isn't supported by `grouped_df` and it will - automatically turn into an ungrouped tibble - * is roughly analogous to (the non-sliding) **`dplyr::group_modify`** - * outputs a **tibble** - -These differences in basic behavior make some common slide operations require less boilerplate: - -* predictors and targets calculated with `epi_slide` are automatically lined up - with each other and with the signals from which they were calculated; and -* computations for an `epix_slide` can output data frames with any number of - rows, containing models, forecasts, evaluations, etc., and will not be - recycled (see the sketch below).
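-
-For example, here is a minimal sketch of that last point. The toy archive and
-the `ahead`/`fc` column names are purely illustrative (not package API); each
-computation returns two "forecast" rows per reference time, and `epix_slide()`
-keeps them all without recycling:
-
-```{r}
-library(epiprocess)
-library(dplyr)
-
-toy_ea <- tibble(
-  geo_value = "ca",
-  time_value = as.Date("2020-06-01") + 0:4,
-  version = time_value + 1, # pretend each observation is reported a day late
-  x = 1:5
-) %>%
-  as_epi_archive()
-
-# two rows out per reference time: no size-stability check, no recycling
-toy_ea %>%
-  epix_slide(
-    f = ~ data.frame(ahead = 1:2, fc = mean(.x$x)),
-    before = 2, names_sep = NULL
-  )
-```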
- -When using more advanced features, more complex rules apply: - -* Generalization: `epi_slide(edf, ....., ref_time_values=my_ref_time_values)` - will output one row for every row in `edf` with `time_value` appearing inside - `my_ref_time_values`, and is analogous to a `dplyr::mutate`&`dplyr::arrange` - followed by `dplyr::filter` to those `ref_time_values`. We call this property - **size stability**, and describe how it is achieved in the following sections. - The default behavior described above is a special case of this general rule - based on a default value of `ref_time_values`. -* Exception/feature: `epi_slide(edf, ....., ref_time_values=my_ref_time_values, - all_rows=TRUE)` will not just output rows for `my_ref_time_values`, but - instead will output one row per row in `edf`. -* Exception/feature: `epi_slide(edf, ....., as_list_col=TRUE)` will format the - output to add a single list-class computed column. -* Exception/feature: `epix_slide(ea, ....., as_list_col=TRUE)` will format the - output to have one row per computation and a single list-class computed column - (in addition to the grouping variables and `time_value`), as if we had used - `tidyr::chop()` or `tidyr::nest()`. -* Clarification: `ea %>% group_by(....., .drop=FALSE) %>% - epix_slide(, .....)` will call the computation on any missing - groups according to `dplyr`'s `.drop=FALSE` rules, resulting in additional - output rows. - -Below we demonstrate some advanced use cases of sliding with different output -structures. We focus on `epi_slide()` for the most part, though some of the -behavior we demonstrate also carries over to `epix_slide()`. - -## Recycling outputs - -When a computation returns a single atomic value, `epi_slide()` will internally -try to recycle the output so that it is size stable (in the sense described -above). We can use this to our advantage, for example, in order to compute a -trailing average marginally over geo values, which we demonstrate below in a -simple synthetic example. - -```{r message = FALSE} -library(epiprocess) -library(dplyr) -set.seed(123) - -edf <- tibble( - geo_value = rep(c("ca", "fl", "pa"), each = 3), - time_value = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"), by = "day"), length.out = length(geo_value)), - x = seq_along(geo_value) + 0.01 * rnorm(length(geo_value)), -) %>% - as_epi_df(as_of = as.Date("2024-03-20")) - -# 2-day trailing average, per geo value -edf %>% - group_by(geo_value) %>% - epi_slide(x_2dav = mean(x), before = 1) %>% - ungroup() - -# 2-day trailing average, marginally -edf %>% - epi_slide(x_2dav = mean(x), before = 1) -``` - -```{r, include = FALSE} -# More checks (not included) -edf %>% - epi_slide(x_2dav = mean(x), before = 1, ref_time_values = as.Date("2020-06-02")) - -edf %>% - # pretend that observations about time_value t are reported in version t (nowcasts) - mutate(version = time_value) %>% - as_epi_archive() %>% - group_by(geo_value) %>% - epix_slide(x_2dav = mean(x), before = 1, ref_time_values = as.Date("2020-06-02")) %>% - ungroup() - -edf %>% - # pretend that observations about time_value t are reported in version t (nowcasts) - mutate(version = time_value) %>% - as_epi_archive() %>% - group_by(geo_value) %>% - epix_slide(~ mean(.x$x), before = 1, ref_time_values = as.Date("2020-06-02")) %>% - ungroup() -``` - -When the slide computation returns an atomic vector (rather than a single value) -`epi_slide()` checks whether its return length ensures size stability, and if -so, uses it to fill the new column. 
For example, this next computation gives the -same result as the last one. - -```{r} -edf %>% - epi_slide(y_2dav = rep(mean(x), 3), before = 1) -``` - -However, if the output is an atomic vector (rather than a single value) and it -is *not* size stable, then `epi_slide()` throws an error. For example, below we -are trying to return 2 things for 3 states. - -```{r, error = TRUE} -edf %>% - epi_slide(x_2dav = rep(mean(x), 2), before = 1) -``` - -## Multi-column outputs - -Now we move on to outputs that are data frames with a single row but multiple -columns. Working with this type of output structure has in fact has already been -demonstrated in the [slide -vignette](https://cmu-delphi.github.io/epiprocess/articles/slide.html). When -we set `as_list_col = TRUE` in the call to `epi_slide()`, the resulting `epi_df` -object returned by `epi_slide()` has a list column containing the slide values. - -```{r} -edf2 <- edf %>% - group_by(geo_value) %>% - epi_slide( - a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), - before = 1, as_list_col = TRUE - ) %>% - ungroup() - -class(edf2$a) -length(edf2$a) -edf2$a[[2]] -``` - -When we use `as_list_col = FALSE` (the default in `epi_slide()`), the function -unnests (in the sense of `tidyr::unnest()`) the list column `a`, so that the -resulting `epi_df` has multiple new columns containing the slide values. The -default is to name these unnested columns by prefixing the name assigned to the -list column (here `a`) onto the column names of the output data frame from the -slide computation (here `x_2dav` and `x_2dma`) separated by "_". - -```{r} -edf %>% - group_by(geo_value) %>% - epi_slide( - a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), - before = 1, as_list_col = FALSE - ) %>% - ungroup() -``` - -We can use `names_sep = NULL` (which gets passed to `tidyr::unnest()`) to drop -the prefix associated with list column name, in naming the unnested columns. - -```{r} -edf %>% - group_by(geo_value) %>% - epi_slide( - a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), - before = 1, as_list_col = FALSE, names_sep = NULL - ) %>% - ungroup() -``` - -Furthermore, `epi_slide()` will recycle the single row data frame as needed in -order to make the result size stable, just like the case for atomic values. - -```{r} -edf %>% - epi_slide( - a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), - before = 1, as_list_col = FALSE, names_sep = NULL - ) -``` - -```{r, include = FALSE} -# More checks (not included) -edf %>% - epi_slide( - a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), - ref_time_values = as.Date("2020-06-02"), - before = 1, as_list_col = FALSE, names_sep = NULL - ) - -edf %>% - mutate(version = time_value) %>% - as_epi_archive() %>% - group_by(geo_value) %>% - epix_slide( - a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), - ref_time_values = as.Date("2020-06-02"), - before = 1, as_list_col = FALSE, names_sep = NULL - ) %>% - ungroup() -``` - -## Multi-row outputs - -For a slide computation that outputs a data frame with more than one row, the -behavior is analogous to a slide computation that outputs an atomic vector. -Meaning, `epi_slide()` will check that the result is size stable, and if so, -will fill the new column(s) in the resulting `epi_df` object appropriately. - -This can be convenient for modeling in the following sense: we can, for example, -fit a sliding, data-versioning-unaware nowcasting or forecasting model by -pooling data from different locations, and then return separate forecasts from -this common model for each location. 
We use our synthetic example to demonstrate -this idea abstractly but simply by forecasting (actually, nowcasting) `y` from -`x` by fitting a time-windowed linear model that pooling data across all -locations. - -```{r} -edf$y <- 2 * edf$x + 0.05 * rnorm(length(edf$x)) - -edf %>% - epi_slide(function(d, ...) { - obj <- lm(y ~ x, data = d) - return( - as.data.frame( - predict(obj, - newdata = d %>% - group_by(geo_value) %>% - filter(time_value == max(time_value)), - interval = "prediction", level = 0.9 - ) - ) - ) - }, before = 1, new_col_name = "fc", names_sep = NULL) -``` - -The above example focused on simplicity to show how to work with multi-row -outputs. Note however, the following issues in this example: - -* The `lm` fitting data includes the testing instances, as no training-test split was performed. -* Adding a simple training-test split would not factor in reporting latency properly. -* Data revisions are not taken into account. - -All three of these factors contribute to unrealistic retrospective forecasts and -overly optimistic retrospective performance evaluations. Instead, one should -favor an `epix_slide` for more realistic "pseudoprospective" forecasts. Using -`epix_slide` also makes it easier to express certain types of forecasts; while -in `epi_slide`, forecasts for additional aheads or quantile levels would need to -be expressed as additional columns, or nested inside list columns, `epix_slide` -does not perform size stability checks or recycling, allowing computations to -output any number of rows. - -## Version-aware forecasting, revisited - -We revisit the COVID-19 forecasting example from the [archive -vignette](https://cmu-delphi.github.io/epiprocess/articles/slide.html) in order -to demonstrate the preceding points regarding forecast evaluation in a more -realistic setting. First, we fetch the versioned data and build the archive. - -```{r, message = FALSE, warning = FALSE, eval =FALSE} -library(epidatr) -library(data.table) -library(ggplot2) -theme_set(theme_bw()) - -y1 <- pub_covidcast( - source = "doctor-visits", - signals = "smoothed_adj_cli", - geo_type = "state", - time_type = "day", - geo_values = "ca,fl", - time_value = epirange(20200601, 20211201), - issues = epirange(20200601, 20211201) -) - -y2 <- pub_covidcast( - source = "jhu-csse", - signal = "confirmed_7dav_incidence_prop", - geo_type = "state", - time_type = "day", - geo_values = "ca,fl", - time_value = epirange(20200601, 20211201), - issues = epirange(20200601, 20211201) -) - -x <- y1 %>% - select(geo_value, time_value, - version = issue, - percent_cli = value - ) %>% - as_epi_archive(compactify = FALSE) - -# mutating merge operation: -x <- epix_merge( - x, - y2 %>% - select(geo_value, time_value, - version = issue, - case_rate_7d_av = value - ) %>% - as_epi_archive(compactify = FALSE), - sync = "locf", - compactify = FALSE -) -``` - -```{r, message = FALSE, echo =FALSE} -library(data.table) -library(ggplot2) -theme_set(theme_bw()) - -x <- archive_cases_dv_subset$DT %>% - filter(geo_value %in% c("ca", "fl")) %>% - as_epi_archive(compactify = FALSE) -``` - -Next, we extend the ARX function to handle multiple geo values, since in the -present case, we will not be grouping by geo value and each slide computation -will be run on multiple geo values at once. Note that, because `epix_slide()` -only returns the grouping variables, `time_value`, and the slide computations in -the eventual returned tibble, we need to include `geo_value` as a column in the -output data frame from our ARX computation. 
- -```{r} -library(tidyr) -library(purrr) - -prob_arx_args <- function(lags = c(0, 7, 14), - ahead = 7, - min_train_window = 20, - lower_level = 0.05, - upper_level = 0.95, - symmetrize = TRUE, - intercept = FALSE, - nonneg = TRUE) { - return(list( - lags = lags, - ahead = ahead, - min_train_window = min_train_window, - lower_level = lower_level, - upper_level = upper_level, - symmetrize = symmetrize, - intercept = intercept, - nonneg = nonneg - )) -} - -prob_arx <- function(x, y, geo_value, time_value, args = prob_arx_args()) { - # Return NA if insufficient training data - if (length(y) < args$min_train_window + max(args$lags) + args$ahead) { - return(data.frame( - geo_value = unique(geo_value), # Return geo value! - point = NA, lower = NA, upper = NA - )) - } - - # Set up x, y, lags list - if (!missing(x)) { - x <- data.frame(x, y) - } else { - x <- data.frame(y) - } - if (!is.list(args$lags)) args$lags <- list(args$lags) - args$lags <- rep(args$lags, length.out = ncol(x)) - - # Build features and response for the AR model, and then fit it - dat <- - tibble(i = seq_len(ncol(x)), lag = args$lags) %>% - unnest(lag) %>% - mutate(name = paste0("x", seq_len(nrow(.)))) %>% # nolint: object_usage_linter - # One list element for each lagged feature - pmap(function(i, lag, name) { - tibble( - geo_value = geo_value, - time_value = time_value + lag, # Shift back - !!name := x[, i] - ) - }) %>% - # One list element for the response vector - c(list( - tibble( - geo_value = geo_value, - time_value = time_value - args$ahead, # Shift forward - y = y - ) - )) %>% - # Combine them together into one data frame - reduce(full_join, by = c("geo_value", "time_value")) %>% - arrange(time_value) - if (args$intercept) dat$x0 <- rep(1, nrow(dat)) - obj <- lm(y ~ . + 0, data = select(dat, -geo_value, -time_value)) - - # Use LOCF to fill NAs in the latest feature values (do this by geo value) - setDT(dat) # Convert to a data.table object by reference - cols <- setdiff(names(dat), c("geo_value", "time_value")) - dat[, (cols) := nafill(.SD, type = "locf"), .SDcols = cols, by = "geo_value"] - - # Make predictions - test_time_value <- max(time_value) - point <- predict( - obj, - newdata = dat %>% - dplyr::group_by(geo_value) %>% - dplyr::filter(time_value == test_time_value) - ) - - # Compute bands - r <- residuals(obj) - s <- ifelse(args$symmetrize, -1, NA) # Should the residuals be symmetrized? - q <- quantile(c(r, s * r), probs = c(args$lower, args$upper), na.rm = TRUE) - lower <- point + q[1] - upper <- point + q[2] - - # Clip at zero if we need to, then return - if (args$nonneg) { - point <- pmax(point, 0) - lower <- pmax(lower, 0) - upper <- pmax(upper, 0) - } - return(data.frame( - geo_value = unique(geo_value), # Return geo value! - point = point, lower = lower, upper = upper - )) -} -``` - -We now make forecasts on the archive and compare to forecasts on the latest -data. 
- -```{r, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 6} -# Latest snapshot of data, and forecast dates -x_latest <- epix_as_of(x, max_version = max(x$DT$version)) -fc_time_values <- seq(as.Date("2020-08-01"), - as.Date("2021-11-30"), - by = "1 month" -) - -# Simple function to produce forecasts k weeks ahead -k_week_ahead <- function(x, ahead = 7, as_of = TRUE) { - if (as_of) { - x %>% - epix_slide( - fc = prob_arx(.data$percent_cli, .data$case_rate_7d_av, .data$geo_value, .data$time_value, - args = prob_arx_args(ahead = ahead) - ), - before = 119, ref_time_values = fc_time_values - ) %>% - mutate( - target_date = .data$time_value + ahead, as_of = TRUE, - geo_value = .data$fc_geo_value - ) - } else { - x_latest %>% - epi_slide( - fc = prob_arx(.data$percent_cli, .data$case_rate_7d_av, .data$geo_value, .data$time_value, - args = prob_arx_args(ahead = ahead) - ), - before = 119, ref_time_values = fc_time_values - ) %>% - mutate(target_date = .data$time_value + ahead, as_of = FALSE) - } -} - -# Generate the forecasts, and bind them together -fc <- bind_rows( - k_week_ahead(x, ahead = 7, as_of = TRUE), - k_week_ahead(x, ahead = 14, as_of = TRUE), - k_week_ahead(x, ahead = 21, as_of = TRUE), - k_week_ahead(x, ahead = 28, as_of = TRUE), - k_week_ahead(x, ahead = 7, as_of = FALSE), - k_week_ahead(x, ahead = 14, as_of = FALSE), - k_week_ahead(x, ahead = 21, as_of = FALSE), - k_week_ahead(x, ahead = 28, as_of = FALSE) -) - -# Plot them, on top of latest COVID-19 case rates -ggplot(fc, aes(x = target_date, group = time_value, fill = as_of)) + - geom_ribbon(aes(ymin = fc_lower, ymax = fc_upper), alpha = 0.4) + - geom_line( - data = x_latest, aes(x = time_value, y = case_rate_7d_av), - inherit.aes = FALSE, color = "gray50" - ) + - geom_line(aes(y = fc_point)) + - geom_point(aes(y = fc_point), size = 0.5) + - geom_vline(aes(xintercept = time_value), linetype = 2, alpha = 0.5) + - facet_grid(vars(geo_value), vars(as_of), scales = "free") + - scale_x_date(minor_breaks = "month", date_labels = "%b %y") + - labs(x = "Date", y = "Reported COVID-19 case rates") + - theme(legend.position = "none") -``` - -We can see that these forecasts, which come from training an ARX model jointly -over CA and FL, exhibit generally less variability and wider prediction bands -compared to the ones from the archive vignette, which come from training a -separate ARX model on each state. As in the archive vignette, we can see a -difference between version-aware (right column) and -unaware (left column) -forecasting, as well. - -## Attribution -The `case_rate_7d_av` data used in this document is a modified part of the [COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19) as [republished in the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html). This data set is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/) by the Johns Hopkins University on behalf of its Center for Systems Science in Engineering. Copyright Johns Hopkins University 2020. - -The `percent_cli` data is a modified part of the [COVIDcast Epidata API Doctor Visits data](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/doctor-visits.html). This dataset is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/). 
Copyright Delphi Research Group at Carnegie Mellon University 2020. diff --git a/vignettes/aggregation.Rmd b/vignettes/aggregation.Rmd index ec5f36af..9d205f53 100644 --- a/vignettes/aggregation.Rmd +++ b/vignettes/aggregation.Rmd @@ -52,13 +52,12 @@ x <- jhu_csse_county_level_subset ## Converting to `tsibble` format For manipulating and wrangling time series data, the -[`tsibble`](https://tsibble.tidyverts.org/index.html) already provides a whole -bunch of useful tools. A tsibble object (formerly, of class `tbl_ts`) is -basically a tibble (data frame) but with two specially-marked columns: an -**index** column representing the time variable (defining an order from past to -present), and a **key** column identifying a unique observational unit for each -time point. In fact, the key can be made up of any number of columns, not just a -single one. +[`tsibble`](https://tsibble.tidyverts.org/index.html) already provides a host of +useful tools. A tsibble object (formerly, of class `tbl_ts`) is basically a +tibble (data frame) but with two specially-marked columns: an **index** column +representing the time variable (defining an order from past to present), and a +**key** column identifying a unique observational unit for each time point. In +fact, the key can be made up of any number of columns, not just a single one. In an `epi_df` object, the index variable is `time_value`, and the key variable is typically `geo_value` (though this need not always be the case: for example, @@ -113,11 +112,13 @@ Let's first remove certain dates from our data set to create gaps: ```{r} # First make geo value more readable for tables, plots, etc. x <- x %>% - mutate(geo_value = paste( - substr(county_name, 1, nchar(county_name) - 7), - name_to_abbr(state_name), - sep = ", " - )) %>% + mutate( + geo_value = paste( + substr(county_name, 1, nchar(county_name) - 7), + name_to_abbr(state_name), + sep = ", " + ) + ) %>% select(geo_value, time_value, cases) xt <- as_tsibble(x) %>% filter(cases >= 3) @@ -182,7 +183,7 @@ Explicit imputation for missingness (zero-filling in our case) can be important for protecting against bugs in all sorts of downstream tasks. For example, even something as simple as a 7-day trailing average is complicated by missingness. The function `epi_slide()` looks for all rows within a window of 7 days anchored -on the right at the reference time point (when `before = 6`). +on the right at the reference time point (when `.window_size = 7`). But when some days in a given week are missing because they were censored because they had small case counts, taking an average of the observed case counts can be misleading and is unintentionally biased upwards. 
Meanwhile, @@ -194,7 +195,7 @@ running `epi_slide()` on the zero-filled data brings these trailing averages ```{r} xt %>% as_epi_df(as_of = as.Date("2024-03-20")) %>% group_by(geo_value) %>% - epi_slide(cases_7dav = mean(cases), before = 6) %>% + epi_slide(cases_7dav = mean(cases), .window_size = 7) %>% ungroup() %>% filter( geo_value == "Plymouth, MA", @@ -205,7 +206,7 @@ xt %>% xt_filled %>% as_epi_df(as_of = as.Date("2024-03-20")) %>% group_by(geo_value) %>% - epi_slide(cases_7dav = mean(cases), before = 6) %>% + epi_slide(cases_7dav = mean(cases), .window_size = 7) %>% ungroup() %>% filter( geo_value == "Plymouth, MA", diff --git a/vignettes/archive.Rmd b/vignettes/archive.Rmd index 686f558f..62eea2aa 100644 --- a/vignettes/archive.Rmd +++ b/vignettes/archive.Rmd @@ -51,6 +51,10 @@ library(data.table) library(dplyr) library(purrr) library(ggplot2) +dv <- archive_cases_dv_subset$DT %>% + select(-case_rate_7d_av) %>% + rename(issue = version, value = percent_cli) %>% + tibble() ``` ## Getting data into `epi_archive` format @@ -72,7 +76,7 @@ format, with `issue` playing the role of `version`. We can now use redundant version updates in `as_epi_archive` using compactify, please refer to the [compactify vignette](articles/compactify.html). -```{r, eval=FALSE} +```{r} x <- dv %>% select(geo_value, time_value, version = issue, percent_cli = value) %>% as_epi_archive(compactify = TRUE) @@ -81,15 +85,6 @@ class(x) print(x) ``` -```{r, echo=FALSE, message=FALSE, warning=FALSE} -x <- archive_cases_dv_subset$DT %>% - select(geo_value, time_value, version, percent_cli) %>% - as_epi_archive(compactify = TRUE) - -class(x) -print(x) -``` - An `epi_archive` consists of a primary field `DT`, which is a data table (from the `data.table` package) that has the columns `geo_value`, `time_value`, `version` (and possibly additional ones), and other metadata fields, such as @@ -119,7 +114,6 @@ The following pieces of metadata are included as fields in an `epi_archive` object: * `geo_type`: the type for the geo values. -* `additional_metadata`: list of additional metadata for the data archive. Metadata for an `epi_archive` object `x` can be accessed (and altered) directly, as in `x$geo_type`, etc. Just like `as_epi_df()`, the function @@ -127,14 +121,53 @@ as in `x$geo_type`, etc. Just like `as_epi_df()`, the function object is instantiated, if they are not explicitly specified in the function call (as it did in the case above). +## Summarizing revision behavior + +There are many ways to examine how signals change across revisions. The +simplest one included directly in epiprocess is `revision_summary()`, which +computes simple summary statistics for each key (by default, +`(geo_value,time_value)` pairs), such as the lag to the first value +(latency). In addition to the per-key summary, it also returns an overall +summary: + +```{r} +revision_details <- revision_summary(x, print_inform = TRUE) +``` + +So, as was mentioned at the top, this is clearly a data set where nearly +everything is revised at some point: only 0.37% of values have no revision at +all, and only 0.92% have fewer than 3 revisions. Over 94% change by more than +10%. On the other hand, most values land within plus or minus 20% of their +final value within 5-9 days, so the revisions converge relatively quickly, even +if the revisions continue for longer.
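+ +As a quick aside: if you just want the per-key tibble without the long printed +report (e.g., when re-running in a script), setting `print_inform = FALSE` in +the call shown above should skip the printed report and simply return the +tibble: + +```{r} +revision_details <- revision_summary(x, print_inform = FALSE) +```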
+ +To do more detailed analysis than is possible with the above printing, we have +`revision_details`: + +```{r} +revision_details %>% + group_by(geo_value) %>% + summarise( + n_rev = mean(n_revisions), + min_lag = min(min_lag), + max_lag = max(max_lag), + spread = mean(spread), + rel_spread = mean(rel_spread), + time_near_latest = mean(time_near_latest) + ) +``` + +Most of the states have similar stats on most of these features, except for +Florida, which takes nearly double the amount of time to get close to the right +value, with California not too far behind. + ## Producing snapshots in `epi_df` form -A key method of an `epi_archive` class is `epix_as_of()`, which generates a snapshot -of the archive in `epi_df` format. This represents the most up-to-date values of -the signal variables as of a given version. +A key method of an `epi_archive` class is `epix_as_of()`, which generates a +snapshot of the archive in `epi_df` format. This represents the most up-to-date +values of the signal variables as of a given version. ```{r} -x_snapshot <- epix_as_of(x, max_version = as.Date("2021-06-01")) +x_snapshot <- epix_as_of(x, as.Date("2021-06-01")) class(x_snapshot) head(x_snapshot) max(x_snapshot$time_value) @@ -147,16 +180,6 @@ date was June 1, 2021. From this we can infer that the doctor's visits signal was 2 days latent on June 1. Also, we can see that the metadata in the `epi_df` object has the version date recorded in the `as_of` field. -By default, using the maximum of the `version` column in the underlying data table in an -`epi_archive` object itself generates a snapshot of the latest values of signal -variables in the entire archive. The `epix_as_of()` function issues a warning in -this case, since updates to the current version may still come in at a later -point in time, due to various reasons, such as synchronization issues. - -```{r} -x_latest <- epix_as_of(x, max_version = max(x$DT$version)) -``` - Below, we pull several snapshots from the archive, spaced one month apart. We overlay the corresponding signal curves as colored lines, with the version dates marked by dotted vertical lines, and draw the latest curve in black (from the @@ -165,10 +188,11 @@ latest snapshot `x_latest` that the archive can provide). ```{r, fig.width = 8, fig.height = 7} theme_set(theme_bw()) +x_latest <- epix_as_of(x, x$versions_end) self_max <- max(x$DT$version) versions <- seq(as.Date("2020-06-01"), self_max - 1, by = "1 month") snapshots <- map_dfr(versions, function(v) { - epix_as_of(x, max_version = v) %>% mutate(version = v) + epix_as_of(x, v) %>% mutate(version = v) }) %>% bind_rows( x_latest %>% mutate(version = self_max) @@ -222,7 +246,7 @@ When merging archives, unless the archives have identical data release patterns, download the currently available version data for one of the archives, but not the other). -```{r, message = FALSE, warning = FALSE,eval=FALSE} +```{r, message = FALSE, warning = FALSE, eval=FALSE} y <- pub_covidcast( source = "jhu-csse", signals = "confirmed_7dav_incidence_prop", @@ -322,29 +346,28 @@ Next we slide this forecaster over the working `epi_archive` object, in order to forecast COVID-19 case rates 7 days into the future. 
```{r} -fc_time_values <- seq(as.Date("2020-08-01"), - as.Date("2021-11-30"), - by = "1 month" -) +fc_time_values <- seq(as.Date("2020-08-01"), as.Date("2021-11-30"), by = "1 month") z <- x %>% group_by(geo_value) %>% epix_slide( - fc = prob_arx(x = percent_cli, y = case_rate_7d_av), before = 119, - ref_time_values = fc_time_values + fc = prob_arx(x = percent_cli, y = case_rate_7d_av, ahead = 7), + .before = 119, + .versions = fc_time_values ) %>% ungroup() head(z, 10) ``` -We get back a tibble `z` with the grouping variables (here geo value), the time -values, and three columns `fc_point`, `fc_lower`, and `fc_upper` produced by the -slide computation that correspond to the point forecast, and the lower and upper -endpoints of the 95\% prediction band, respectively. (If instead we had set -`as_list_col = TRUE` in the call to `epix_slide()`, then we would have gotten a -list column `fc`, where each element of `fc` is a data frame with named columns -`point`, `lower`, and `upper`.) +We get back a tibble `z` with the grouping variables (here geo value), the +(reference) time values, and a ["packed"][tidyr::pack] data frame column `fc` +containing `fc$point`, `fc$lower`, and `fc$upper` that correspond to the point +forecast, and the lower and upper endpoints of the 90\% prediction band, +respectively. (We could also have used `, prob_arx(...)` to get three +separate columns `point`, `lower`, and `upper`, or used `fc = +list(prob_arx(...))` to make an `fc` column with a ["nested"][tidyr::nest] +format (list of data frames) instead.) On the whole, `epix_slide()` works similarly to `epi_slide()`, though there are a few notable differences, even apart from the version-aware aspect. You can @@ -356,25 +379,25 @@ points in time and forecast horizons. The former comes from using `epi_slide()` to the latest snapshot of the data `x_latest`.
```{r, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 6} -x_latest <- epix_as_of(x, max_version = max(x$DT$version)) +x_latest <- epix_as_of(x, x$versions_end) # Simple function to produce forecasts k weeks ahead -k_week_ahead <- function(x, ahead = 7, as_of = TRUE) { +forecast_k_week_ahead <- function(x, ahead = 7, as_of = TRUE) { if (as_of) { x %>% - group_by(.data$geo_value) %>% + group_by(geo_value) %>% epix_slide( - fc = prob_arx(.data$percent_cli, .data$case_rate_7d_av, ahead = ahead), before = 119, - ref_time_values = fc_time_values + fc = prob_arx(.data$percent_cli, .data$case_rate_7d_av, ahead = ahead), .before = 119, + .versions = fc_time_values ) %>% - mutate(target_date = .data$time_value + ahead, as_of = TRUE) %>% + mutate(target_date = .data$version + ahead, as_of = TRUE) %>% ungroup() } else { x_latest %>% - group_by(.data$geo_value) %>% + group_by(geo_value) %>% epi_slide( - fc = prob_arx(.data$percent_cli, .data$case_rate_7d_av, ahead = ahead), before = 119, - ref_time_values = fc_time_values + fc = prob_arx(.data$percent_cli, .data$case_rate_7d_av, ahead = ahead), .window_size = 120, + .ref_time_values = fc_time_values ) %>% mutate(target_date = .data$time_value + ahead, as_of = FALSE) %>% ungroup() @@ -383,25 +406,25 @@ k_week_ahead <- function(x, ahead = 7, as_of = TRUE) { # Generate the forecasts, and bind them together fc <- bind_rows( - k_week_ahead(x, ahead = 7, as_of = TRUE), - k_week_ahead(x, ahead = 14, as_of = TRUE), - k_week_ahead(x, ahead = 21, as_of = TRUE), - k_week_ahead(x, ahead = 28, as_of = TRUE), - k_week_ahead(x, ahead = 7, as_of = FALSE), - k_week_ahead(x, ahead = 14, as_of = FALSE), - k_week_ahead(x, ahead = 21, as_of = FALSE), - k_week_ahead(x, ahead = 28, as_of = FALSE) + forecast_k_week_ahead(x, ahead = 7, as_of = TRUE), + forecast_k_week_ahead(x, ahead = 14, as_of = TRUE), + forecast_k_week_ahead(x, ahead = 21, as_of = TRUE), + forecast_k_week_ahead(x, ahead = 28, as_of = TRUE), + forecast_k_week_ahead(x, ahead = 7, as_of = FALSE), + forecast_k_week_ahead(x, ahead = 14, as_of = FALSE), + forecast_k_week_ahead(x, ahead = 21, as_of = FALSE), + forecast_k_week_ahead(x, ahead = 28, as_of = FALSE) ) # Plot them, on top of latest COVID-19 case rates ggplot(fc, aes(x = target_date, group = time_value, fill = as_of)) + - geom_ribbon(aes(ymin = fc_lower, ymax = fc_upper), alpha = 0.4) + + geom_ribbon(aes(ymin = fc$lower, ymax = fc$upper), alpha = 0.4) + geom_line( data = x_latest, aes(x = time_value, y = case_rate_7d_av), inherit.aes = FALSE, color = "gray50" ) + - geom_line(aes(y = fc_point)) + - geom_point(aes(y = fc_point), size = 0.5) + + geom_line(aes(y = fc$point)) + + geom_point(aes(y = fc$point), size = 0.5) + geom_vline(aes(xintercept = time_value), linetype = 2, alpha = 0.5) + facet_grid(vars(geo_value), vars(as_of), scales = "free") + scale_x_date(minor_breaks = "month", date_labels = "%b %y") + @@ -429,9 +452,250 @@ to look for more robust forecasting methodology. The forecasters that appear in the vignettes in the current package are only meant to demo the slide functionality with some of the most basic forecasting methodology possible. +## Sliding version-aware computations with geo-pooling + +First, we fetch the versioned data and build the archive. 
+ +```{r, message = FALSE, warning = FALSE, eval = FALSE} +library(epidatr) +library(data.table) +library(ggplot2) +theme_set(theme_bw()) + +y1 <- pub_covidcast( + source = "doctor-visits", + signals = "smoothed_adj_cli", + geo_type = "state", + time_type = "day", + geo_values = "ca,fl", + time_values = epirange(20200601, 20211201), + issues = epirange(20200601, 20211201) +) + +y2 <- pub_covidcast( + source = "jhu-csse", + signals = "confirmed_7dav_incidence_prop", + geo_type = "state", + time_type = "day", + geo_values = "ca,fl", + time_values = epirange(20200601, 20211201), + issues = epirange(20200601, 20211201) +) + +x <- y1 %>% + select(geo_value, time_value, + version = issue, + percent_cli = value + ) %>% + as_epi_archive(compactify = FALSE) + +# mutating merge operation: +x <- epix_merge( + x, + y2 %>% + select(geo_value, time_value, + version = issue, + case_rate_7d_av = value + ) %>% + as_epi_archive(compactify = FALSE), + sync = "locf", + compactify = FALSE +) +``` + +```{r, message = FALSE, echo = FALSE} +library(data.table) +library(ggplot2) +theme_set(theme_bw()) + +x <- archive_cases_dv_subset$DT %>% + filter(geo_value %in% c("ca", "fl")) %>% + as_epi_archive(compactify = FALSE) +``` + +Next, we extend the ARX function to handle multiple geo values, since in the +present case, we will not be grouping by geo value and each slide computation +will be run on multiple geo values at once. Note that, because `epix_slide()` +only returns the grouping variables, `version`, and the slide computations in +the eventual returned tibble, we need to include `geo_value` as a column in the +output data frame from our ARX computation. + +```{r} +library(tidyr) +library(purrr) + +prob_arx_args <- function(lags = c(0, 7, 14), + ahead = 7, + min_train_window = 20, + lower_level = 0.05, + upper_level = 0.95, + symmetrize = TRUE, + intercept = FALSE, + nonneg = TRUE) { + return(list( + lags = lags, + ahead = ahead, + min_train_window = min_train_window, + lower_level = lower_level, + upper_level = upper_level, + symmetrize = symmetrize, + intercept = intercept, + nonneg = nonneg + )) +} + +prob_arx <- function(x, y, geo_value, time_value, args = prob_arx_args()) { + # Return NA if insufficient training data + if (length(y) < args$min_train_window + max(args$lags) + args$ahead) { + return(data.frame( + geo_value = unique(geo_value), # Return geo value! + point = NA, lower = NA, upper = NA + )) + } + + # Set up x, y, lags list + if (!missing(x)) { + x <- data.frame(x, y) + } else { + x <- data.frame(y) + } + if (!is.list(args$lags)) args$lags <- list(args$lags) + args$lags <- rep(args$lags, length.out = ncol(x)) + + # Build features and response for the AR model, and then fit it + dat <- tibble(i = seq_len(ncol(x)), lag = args$lags) %>% + unnest(lag) %>% + mutate(name = paste0("x", seq_len(nrow(.)))) %>% # nolint: object_usage_linter + # One list element for each lagged feature + pmap(function(i, lag, name) { + tibble( + geo_value = geo_value, + time_value = time_value + lag, # Shift back + !!name := x[, i] + ) + }) %>% + # One list element for the response vector + c(list( + tibble( + geo_value = geo_value, + time_value = time_value - args$ahead, # Shift forward + y = y + ) + )) %>% + # Combine them together into one data frame + reduce(full_join, by = c("geo_value", "time_value")) %>% + arrange(time_value) + if (args$intercept) dat$x0 <- rep(1, nrow(dat)) + obj <- lm(y ~ .
+ 0, data = select(dat, -geo_value, -time_value)) + + # Use LOCF to fill NAs in the latest feature values (do this by geo value) + setDT(dat) # Convert to a data.table object by reference + cols <- setdiff(names(dat), c("geo_value", "time_value")) + dat[, (cols) := nafill(.SD, type = "locf"), .SDcols = cols, by = "geo_value"] + + # Make predictions + test_time_value <- max(time_value) + point <- predict( + obj, + newdata = dat %>% + dplyr::group_by(geo_value) %>% + dplyr::filter(time_value == test_time_value) + ) + + # Compute bands + r <- residuals(obj) + s <- ifelse(args$symmetrize, -1, NA) # Should the residuals be symmetrized? + q <- quantile(c(r, s * r), probs = c(args$lower_level, args$upper_level), na.rm = TRUE) + lower <- point + q[1] + upper <- point + q[2] + + # Clip at zero if we need to, then return + if (args$nonneg) { + point <- pmax(point, 0) + lower <- pmax(lower, 0) + upper <- pmax(upper, 0) + } + return(data.frame( + geo_value = unique(geo_value), # Return geo value! + point = point, lower = lower, upper = upper + )) +} +``` + +We now make forecasts on the archive and compare to forecasts on the latest +data. + +```{r, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 6} +# Latest snapshot of data, and forecast dates +x_latest <- epix_as_of(x, version = max(x$DT$version)) +fc_time_values <- seq(as.Date("2020-08-01"), + as.Date("2021-11-30"), + by = "1 month" +) + +# Simple function to produce forecasts k weeks ahead +forecast_k_week_ahead <- function(x, ahead = 7, as_of = TRUE) { + if (as_of) { + x %>% + epix_slide( + fc = prob_arx(.data$percent_cli, .data$case_rate_7d_av, .data$geo_value, .data$time_value, + args = prob_arx_args(ahead = ahead) + ), + .before = 219, .versions = fc_time_values + ) %>% + mutate( + target_date = .data$version + ahead, as_of = TRUE, + geo_value = .data$fc$geo_value + ) + } else { + x_latest %>% + epi_slide( + fc = prob_arx(.data$percent_cli, .data$case_rate_7d_av, .data$geo_value, .data$time_value, + args = prob_arx_args(ahead = ahead) + ), + .window_size = 220, .ref_time_values = fc_time_values + ) %>% + mutate(target_date = .data$time_value + ahead, as_of = FALSE) + } +} + +# Generate the forecasts, and bind them together +fc <- bind_rows( + forecast_k_week_ahead(x, ahead = 7, as_of = TRUE), + forecast_k_week_ahead(x, ahead = 14, as_of = TRUE), + forecast_k_week_ahead(x, ahead = 21, as_of = TRUE), + forecast_k_week_ahead(x, ahead = 28, as_of = TRUE), + forecast_k_week_ahead(x, ahead = 7, as_of = FALSE), + forecast_k_week_ahead(x, ahead = 14, as_of = FALSE), + forecast_k_week_ahead(x, ahead = 21, as_of = FALSE), + forecast_k_week_ahead(x, ahead = 28, as_of = FALSE) +) + +# Plot them, on top of latest COVID-19 case rates +ggplot(fc, aes(x = target_date, group = time_value, fill = as_of)) + + geom_ribbon(aes(ymin = fc$lower, ymax = fc$upper), alpha = 0.4) + + geom_line( + data = x_latest, aes(x = time_value, y = case_rate_7d_av), + inherit.aes = FALSE, color = "gray50" + ) + + geom_line(aes(y = fc$point)) + + geom_point(aes(y = fc$point), size = 0.5) + + geom_vline(aes(xintercept = time_value), linetype = 2, alpha = 0.5) + + facet_grid(vars(geo_value), vars(as_of), scales = "free") + + scale_x_date(minor_breaks = "month", date_labels = "%b %y") + + labs(x = "Date", y = "Reported COVID-19 case rates") + + theme(legend.position = "none") +``` + +We can see that these forecasts, which come from training an ARX model jointly +over CA and FL, exhibit generally less variability and wider prediction bands +compared to the ones from the earlier part of this
vignette, which come from training a +separate ARX model on each state. As in the archive vignette, we can see a +difference between version-aware (right column) and -unaware (left column) +forecasting, as well. + ## Attribution + This document contains a dataset that is a modified part of the [COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19) as [republished in the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html). This data set is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/) by the Johns Hopkins University on behalf of its Center for Systems Science in Engineering. Copyright Johns Hopkins University 2020. The `percent_cli` data is a modified part of the [COVIDcast Epidata API Doctor Visits data](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/doctor-visits.html). This dataset is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/). Copyright Delphi Research Group at Carnegie Mellon University 2020. - - diff --git a/vignettes/compactify.Rmd b/vignettes/compactify.Rmd index 8579be6a..72a2d266 100644 --- a/vignettes/compactify.Rmd +++ b/vignettes/compactify.Rmd @@ -101,7 +101,7 @@ speeds <- rbind(speeds, speed_test(iterate_as_of, "as_of_1000x")) # Performance of slide slide_median <- function(my_ea) { - my_ea %>% epix_slide(median = median(.data$case_rate_7d_av), before = 7) + my_ea %>% epix_slide(median = median(.data$case_rate_7d_av), .before = 7) } speeds <- rbind(speeds, speed_test(slide_median, "slide_median")) diff --git a/vignettes/epiprocess.Rmd b/vignettes/epiprocess.Rmd index 24a98505..b1840bb2 100644 --- a/vignettes/epiprocess.Rmd +++ b/vignettes/epiprocess.Rmd @@ -128,9 +128,7 @@ columns required for an `epi_df` object (along with many others). We can use frame into `epi_df` format. ```{r, message = FALSE} -x <- as_epi_df(cases, - as_of = max(cases$issue) -) %>% +x <- as_epi_df(cases, as_of = max(cases$issue)) %>% select(geo_value, time_value, total_cases = value) class(x) @@ -176,9 +174,11 @@ attributes(x)$metadata ``` ## Using additional key columns in `epi_df` + In the following examples we will show how to create an `epi_df` with additional keys. ### Converting a `tsibble` that has county code as an extra key + ```{r} ex1 <- tibble( geo_value = rep(c("ca", "fl", "pa"), each = 3), @@ -200,10 +200,10 @@ The metadata now includes `county_code` as an extra key. attr(ex1, "metadata") ``` - ### Dealing with misspecified column names `epi_df` requires there to be columns `geo_value` and `time_value`, if they do not exist then `as_epi_df()` throws an error. + ```{r, error = TRUE} data.frame( # misnamed @@ -211,12 +211,13 @@ data.frame( # extra key pol = rep(c("blue", "swing", "swing"), each = 3), # misnamed - reported_date = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"), by = "day"), length.out = length(geo_value)), - value = seq_along(geo_value) + 0.01 * withr::with_rng_version("3.0.0", withr::with_seed(42, length(geo_value))) + reported_date = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"), by = "day"), length.out = 9), + value = 1:9 + 0.01 * withr::with_rng_version("3.0.0", withr::with_seed(42, 9)) ) %>% as_epi_df(as_of = as.Date("2024-03-20")) ``` The columns can be renamed to match `epi_df` format. 
In the example below, notice there is also an additional key `pol`. + ```{r} ex2 <- tibble( # misnamed @@ -234,13 +235,12 @@ ex2 <- ex2 %>% rename(geo_value = state, time_value = reported_date) %>% as_epi_df( as_of = "2020-06-03", - additional_metadata = list(other_keys = "pol") + other_keys = "pol" ) attr(ex2, "metadata") ``` - ### Adding additional keys to an `epi_df` object In the above examples, all the keys are added to objects that are not `epi_df` objects. We illustrate how to add keys to an `epi_df` object. @@ -264,12 +264,12 @@ ex3 <- ex3 %>% state = rep(tolower("MA"), 6), pol = rep(c("blue", "swing", "swing"), each = 2) ) %>% - as_epi_df(additional_metadata = list(other_keys = c("state", "pol")), as_of = as.Date("2024-03-20")) + as_epi_df(other_keys = c("state", "pol"), as_of = as.Date("2024-03-20")) attr(ex3, "metadata") ``` -Note that the two additional keys we added, `state` and `pol`, are specified as a character vector in the `other_keys` component of the `additional_metadata` list. They must be specified in this manner so that downstream actions on the `epi_df`, like model fitting and prediction, can recognize and use these keys. +Note that the two additional keys we added, `state` and `pol`, are specified as a character vector in the `other_keys` argument. They must be specified in this manner so that downstream actions on the `epi_df`, like model fitting and prediction, can recognize and use these keys. Currently `other_keys` metadata in `epi_df` doesn't impact `epi_slide()`, contrary to `other_keys` in `as_epi_archive` which affects how the update data is interpreted. diff --git a/vignettes/growth_rate.Rmd b/vignettes/growth_rate.Rmd index abef646f..acbb53ee 100644 --- a/vignettes/growth_rate.Rmd +++ b/vignettes/growth_rate.Rmd @@ -22,6 +22,7 @@ library(tidyr) ``` The data is fetched with the following query: + ```{r, message = FALSE, eval=F} x <- pub_covidcast( source = "jhu-csse", @@ -38,7 +39,6 @@ x <- pub_covidcast( The data has 1,158 rows and 3 columns. - ```{r, echo=FALSE} data(jhu_csse_daily_subset) x <- jhu_csse_daily_subset %>% diff --git a/vignettes/outliers.Rmd b/vignettes/outliers.Rmd index ea3c30ac..1a2cfa41 100644 --- a/vignettes/outliers.Rmd +++ b/vignettes/outliers.Rmd @@ -127,11 +127,14 @@ vote across the base methods to determine whether a value is an outlier. ```{r} x <- x %>% group_by(geo_value) %>% - mutate(outlier_info = detect_outlr( - x = time_value, y = cases, - methods = detection_methods, - combiner = "median" - )) %>% + mutate( + outlier_info = detect_outlr( + x = time_value, + y = cases, + methods = detection_methods, + combiner = "median" + ) + ) %>% ungroup() %>% unnest(outlier_info) @@ -240,10 +243,9 @@ ggplot(y, aes(x = time_value)) + More advanced correction functionality will be coming at some point in the future. - ## Attribution + This document contains a dataset that is a modified part of the [COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19) as [republished in the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html). This data set is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/) by the Johns Hopkins University on behalf of its Center for Systems Science in Engineering. Copyright Johns Hopkins University 2020. 
[From the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html): These signals are taken directly from the JHU CSSE [COVID-19 GitHub repository](https://github.com/CSSEGISandData/COVID-19) without changes. - diff --git a/vignettes/slide.Rmd b/vignettes/slide.Rmd index 92590fb1..92d8456d 100644 --- a/vignettes/slide.Rmd +++ b/vignettes/slide.Rmd @@ -11,22 +11,19 @@ A central tool in the `epiprocess` package is `epi_slide()`, which is based on the powerful functionality provided in the [`slider`](https://cran.r-project.org/web/packages/slider) package. In `epiprocess`, to "slide" means to apply a computation---represented as a -function or formula---over a sliding/rolling data window. Suitable -groupings can always be achieved by a preliminary call to `group_by()`. - -By default, the meaning of one time step is inferred from the `time_value` -column of the `epi_df` object under consideration, based on the way this column -understands addition and subtraction. For example, if the time values are coded -as `Date` objects, then one time step is one day, since `as.Date("2022-01-01") + -1` equals `as.Date("2022-01-02")`. Alternatively, the time step can be specified -manually in the call to `epi_slide()`; you can read the documentation for more -details. Furthermore, the alignment of the running window used in `epi_slide()` -is specified by `before` and `after`. +function or formula---over a sliding/rolling data window. The slide is always +applied within each group; if you do not group the `epi_df` with `group_by()`, +the default grouping across all of its group keys is used. + +By default, the `.window_size` units depend on the `time_type` of the `epi_df`, +which is determined from the type of the `time_value` column. See "Details" in +the `epi_slide()` documentation for more. As in the getting started guide, we'll fetch daily reported COVID-19 cases from CA, FL, NY, and TX (note: here we're using new, not cumulative cases) using the -[`epidatr`](https://github.com/cmu-delphi/epidatr) package, -and then convert this to `epi_df` format. +[`epidatr`](https://github.com/cmu-delphi/epidatr) package, and then convert +this to `epi_df` format. ```{r, message = FALSE, warning=FALSE} library(epidatr) @@ -35,8 +32,9 @@ library(dplyr) ``` The data is fetched with the following query: + ```{r, message = FALSE, eval=F} -x <- pub_covidcast( +edf <- pub_covidcast( source = "jhu-csse", signals = "confirmed_incidence_num", geo_type = "state", @@ -53,117 +51,126 @@ The data has 2,684 rows and 3 columns. ```{r, echo=FALSE} data(jhu_csse_daily_subset) -x <- jhu_csse_daily_subset %>% +edf <- jhu_csse_daily_subset %>% select(geo_value, time_value, cases) %>% arrange(geo_value, time_value) %>% as_epi_df() ``` -## Optimized rolling mean +## Optimized rolling means and sums -We first demonstrate how to apply a 7-day trailing average to the daily cases -in order to smooth the signal, by passing in the name of the column(s) we -want to average for the first argument of `epi_slide_mean()`. `epi_slide_mean -()` can only be used for averaging. To do this computation per state, we -first call `group_by()`. +For the two most common sliding operations, we offer two optimized versions: +`epi_slide_mean()` and `epi_slide_sum()`. This example computes the 7-day +trailing average of the daily cases. Note that the name of the column(s) that we +want to average is specified as the first argument of `epi_slide_mean()`.
```{r} -x %>% +edf %>% group_by(geo_value) %>% - epi_slide_mean("cases", before = 6) %>% + epi_slide_mean("cases", .window_size = 7, na.rm = TRUE) %>% ungroup() %>% head(10) ``` -The calculation is done using `data.table::frollmean`, whose behavior can be -adjusted by passing relevant arguments via `...`. +Note that we passed `na.rm = TRUE` through to `data.table::frollmean()` via the +`...` argument of `epi_slide_mean()`. + +The following computes the 7-day trailing sum of daily cases (and passes `na.rm` +to `data.table::frollsum()` similarly): + +```{r} +edf %>% + group_by(geo_value) %>% + epi_slide_sum("cases", .window_size = 7, na.rm = TRUE) %>% + ungroup() %>% + head(10) +``` -## Slide with a formula +## General sliding with a formula -The previous computation can also be performed using `epi_slide()`, which is -more flexible but quite a bit slower than `epi_slide_mean()`. It is -recommended to use `epi_slide_mean()` when possible. +The previous computations can also be performed using `epi_slide()`, which can +be used for more general sliding computations (but is much slower for the +specific cases of mean and sum). The same 7-day trailing average of daily cases can be computed by passing in a -formula for the first argument of `epi_slide()`. To do this per state, we -first call `group_by()`. +formula for the first argument of `epi_slide()`: ```{r} -x %>% +edf %>% group_by(geo_value) %>% - epi_slide(~ mean(.x$cases), before = 6) %>% + epi_slide(~ mean(.x$cases, na.rm = TRUE), .window_size = 7) %>% ungroup() %>% head(10) ``` -The formula specified has access to all non-grouping columns present in the -original `epi_df` object (and must refer to them with the prefix `.x$`). As we -can see, the function `epi_slide()` returns an `epi_df` object with a new column -appended that contains the results (from sliding), named `slide_value` as the -default. We can of course change this post hoc, or we can instead specify a new -name up front using the `new_col_name` argument: +If your formula returns a data.frame, then the columns of the data.frame +will be unpacked into the resulting `epi_df`. For example, the following +computes the 7-day trailing average of daily cases and the 7-day trailing sum of +daily cases: ```{r} -x <- x %>% +edf %>% group_by(geo_value) %>% - epi_slide(~ mean(.x$cases), before = 6, new_col_name = "cases_7dav") %>% - ungroup() - -head(x, 10) + epi_slide( + ~ data.frame(cases_mean = mean(.x$cases, na.rm = TRUE), cases_sum = sum(.x$cases, na.rm = TRUE)), + .window_size = 7 + ) %>% + ungroup() %>% + head(10) ``` +Note that this formula has access to all non-grouping columns present in the +original `epi_df` object and must refer to them with the prefix `.x$`. As we +can see, the function `epi_slide()` returns an `epi_df` object with a new column +appended that contains the results (from sliding), named `slide_value` as the +default. + Some other information is available in additional variables: * `.group_key` is a one-row tibble containing the values of the grouping variables for the associated group * `.ref_time_value` is the reference time value the time window was based on -Like in `group_modify()`, there are alternative names for these variables as -well: `.` can be used instead of `.x`, `.y` instead of `.group_key`, and `.z` -instead of `.ref_time_value`. - -## Slide with a function - -We can also pass a function for the first argument in `epi_slide()`.
In this -case, the passed function must accept the following arguments: - -In this case, the passed function `f` must accept the following arguments: a -data frame with the same column names as the original object, minus any grouping -variables, containing the time window data for one group-`ref_time_value` -combination; followed by a one-row tibble containing the values of the grouping -variables for the associated group; followed by the associated `ref_time_value`. -It can accept additional arguments; `epi_slide()` will forward any `...` args it -receives to `f`. - -Recreating the last example of a 7-day trailing average: - ```{r} -x <- x %>% +# Returning geo_value in the formula +edf %>% group_by(geo_value) %>% - epi_slide(function(x, gk, rtv) mean(x$cases), before = 6, new_col_name = "cases_7dav") %>% - ungroup() + epi_slide(~ .x$geo_value[[1]], .window_size = 7) %>% + ungroup() %>% + head(10) -head(x, 10) +# Returning time_value in the formula +edf %>% + group_by(geo_value) %>% + epi_slide(~ .x$time_value[[1]], .window_size = 7) %>% + ungroup() %>% + head(10) ``` +While the computations above do not look very useful, they can be used as +building blocks for computations that do something different depending on the +geo_value or ref_time_value. + ## Slide the tidy way Perhaps the most convenient way to set up a computation in `epi_slide()` is to pass in an expression for tidy evaluation. In this case, we can simply define the name of the new column directly as part of the expression, setting it equal -to a computation in which we can access any columns of `x` by name, just as we +to a computation in which we can access any columns of `.x` by name, just as we would in a call to `dplyr::mutate()`, or any of the `dplyr` verbs. For example: ```{r} -x <- x %>% +slide_output <- edf %>% group_by(geo_value) %>% - epi_slide(cases_7dav = mean(cases), before = 6) %>% - ungroup() - -head(x, 10) + epi_slide(cases_7dav = mean(cases, na.rm = TRUE), .window_size = 7) %>% + ungroup() %>% + head(10) ``` - -In addition to referring to individual columns by name, you can refer to the -time window data as an `epi_df` or `tibble` using `.x`. Similarly, the other arguments of the function format are available through the magic names `.group_key` and `.ref_time_value`, and the tidyverse "pronouns" `.data` and `.env` can also be used. + +In addition to referring to individual columns by name, you can refer to the +`epi_df` time window as `.x` (`.group_key` and `.ref_time_value` are still +available). The tidyverse "pronouns" `.data` and `.env` can also be used if you +need to distinguish between the data and the environment. As a simple sanity check, we visualize the 7-day trailing averages computed on top of the original counts: @@ -172,7 +179,7 @@ top of the original counts: library(ggplot2) theme_set(theme_bw()) -ggplot(x, aes(x = time_value)) + +ggplot(slide_output, aes(x = time_value)) + geom_col(aes(y = cases, fill = geo_value), alpha = 0.5, show.legend = FALSE) + geom_line(aes(y = cases_7dav, col = geo_value), show.legend = FALSE) + facet_wrap(~geo_value, scales = "free_y") + @@ -183,18 +190,40 @@ ggplot(x, aes(x = time_value)) + As we can see from the top right panel, it looks like Texas moved to weekly reporting of COVID-19 cases in summer of 2021. -## Running a local forecaster +## Slide with a function + +We can also pass a function to the second argument in `epi_slide()`.
In this +case, the passed function `.f` must have the form `function(x, g, t, ...)`, +where -As a more complex example, we create a forecaster based on a local (in time) -autoregression or AR model. AR models can be fit in numerous ways (using base R -functions and various packages), but here we define it "by hand" both because it -provides a more advanced example of sliding a function over an `epi_df` object, -and because it allows us to be a bit more flexible in defining a *probabilistic* -forecaster: one that outputs not just a point prediction, but a notion of -uncertainty around this. In particular, our forecaster will output a point -prediction along with an 90\% uncertainty band, represented by a predictive -quantiles at the 5\% and 95\% levels (lower and upper endpoints of the -uncertainty band). +- "x" is an `epi_df` containing the time window data for one +group-`ref_time_value` combination, with the same column names as the +original object, minus any grouping variables +- "g" is a one-row tibble containing the values of the grouping variables +for the associated group +- "t" is the ref_time_value for the current window +- "..." are additional arguments + +Recreating the last example of a 7-day trailing average: + +```{r} +edf %>% + group_by(geo_value) %>% + epi_slide(function(x, g, t) mean(x$cases, na.rm = TRUE), .window_size = 7) %>% + ungroup() %>% + head(10) +``` + +## Running a simple autoregressive forecaster + +As a more complex example, we create a forecaster based on an autoregression or +AR model. AR models can be fit in numerous ways (using base R functions and +various packages), but here we define it "by hand" both because it provides a +more advanced example of sliding a function over an `epi_df` object, and because +it allows us to be a bit more flexible in defining a *probabilistic* forecaster: +one that outputs not just a point prediction, but a notion of uncertainty around +this. In particular, our forecaster will output a point prediction along with a +90\% uncertainty band, represented by predictive quantiles at the 5\% and 95\% +levels (lower and upper endpoints of the uncertainty band). The function defined below, `prob_ar()`, is our probabilistic AR forecaster. The `lags` argument indicates which lags to use in the model, and `ahead` indicates @@ -211,6 +240,9 @@ prob_ar <- function(y, lags = c(0, 7, 14), ahead = 6, min_train_window = 20, return(data.frame(point = NA, lower = NA, upper = NA)) } + # Filter out the edge NAs + y <- y[!is.na(y)] + # Build features and response for the AR model dat <- do.call( data.frame, @@ -247,28 +279,21 @@ scale of smoothed COVID-19 cases. This is clearly equivalent, up to a constant, to modeling weekly sums of COVID-19 cases. ```{r} -fc_time_values <- seq(as.Date("2020-06-01"), - as.Date("2021-12-01"), - by = "1 months" -) -x %>% +fc_time_values <- seq(as.Date("2020-06-01"), as.Date("2021-12-01"), by = "1 months") +edf %>% group_by(geo_value) %>% - epi_slide( - fc = prob_ar(cases_7dav), before = 119, - ref_time_values = fc_time_values - ) %>% + epi_slide(cases_7dav = mean(.data$cases, na.rm = TRUE), .window_size = 7) %>% + epi_slide(fc = prob_ar(.data$cases_7dav), .window_size = 120, .ref_time_values = fc_time_values) %>% ungroup() %>% head(10) ``` -Note that here we have utilized an argument `ref_time_values` to perform the +Note that here we have utilized an argument `.ref_time_values` to perform the sliding computation (here, compute a forecast) at a specific subset of reference -time values.
+## Running a simple autoregressive forecaster
+
+As a more complex example, we create a forecaster based on an autoregression or
+AR model. AR models can be fit in numerous ways (using base R functions and
+various packages), but here we define it "by hand" both because it provides a
+more advanced example of sliding a function over an `epi_df` object, and because
+it allows us to be a bit more flexible in defining a *probabilistic* forecaster:
+one that outputs not just a point prediction, but a notion of uncertainty around
+this. In particular, our forecaster will output a point prediction along with a
+90\% uncertainty band, represented by predictive quantiles at the 5\% and 95\%
+levels (lower and upper endpoints of the uncertainty band).

The function defined below, `prob_ar()`, is our probabilistic AR forecaster.
The `lags` argument indicates which lags to use in the model, and `ahead` indicates
@@ -211,6 +240,9 @@ prob_ar <- function(y, lags = c(0, 7, 14), ahead = 6, min_train_window = 20,
    return(data.frame(point = NA, lower = NA, upper = NA))
  }

+  # Filter out NAs at the window edges
+  y <- y[!is.na(y)]
+
  # Build features and response for the AR model
  dat <- do.call(
    data.frame,
@@ -247,28 +279,21 @@ scale of smoothed COVID-19 cases. This is clearly equivalent, up to a constant,
to modeling weekly sums of COVID-19 cases.

```{r}
-fc_time_values <- seq(as.Date("2020-06-01"),
-  as.Date("2021-12-01"),
-  by = "1 months"
-)
-x %>%
+fc_time_values <- seq(as.Date("2020-06-01"), as.Date("2021-12-01"), by = "1 months")
+edf %>%
  group_by(geo_value) %>%
-  epi_slide(
-    fc = prob_ar(cases_7dav), before = 119,
-    ref_time_values = fc_time_values
-  ) %>%
+  epi_slide(cases_7dav = mean(.data$cases, na.rm = TRUE), .window_size = 7) %>%
+  epi_slide(
+    fc = prob_ar(.data$cases_7dav),
+    .window_size = 120, .ref_time_values = fc_time_values
+  ) %>%
  ungroup() %>%
  head(10)
```

-Note that here we have utilized an argument `ref_time_values` to perform the
+Note that here we have utilized an argument `.ref_time_values` to perform the
sliding computation (here, compute a forecast) at a specific subset of reference
-time values. We get out three columns `fc_point`, `fc_lower`, and `fc_upper`
-that correspond to the point forecast, and the lower and upper endpoints of the
-95\% prediction band, respectively. (If instead we had set `as_list_col = TRUE`
-in the call to `epi_slide()`, then we would have gotten a list column `fc`,
-where each element of `fc` is a data frame with named columns `point`, `lower`,
-and `upper`.)
+time values (the start of every month from mid-2020 to the end of 2021). The
+resulting `epi_df` now contains three new columns: `fc$point`, `fc$lower`, and
+`fc$upper`, corresponding to the point forecast and the lower and upper
+endpoints of the 90\% prediction band, respectively.
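+
+The `fc` column here is a packed data frame column. If you would rather have
+flat columns (`fc_point` and so on, as in older versions of this vignette),
+one option is to unpack it. A minimal sketch, assuming the `tidyr` package is
+available:
+
+```{r}
+edf %>%
+  group_by(geo_value) %>%
+  epi_slide(cases_7dav = mean(.data$cases, na.rm = TRUE), .window_size = 7) %>%
+  epi_slide(
+    fc = prob_ar(.data$cases_7dav),
+    .window_size = 120, .ref_time_values = fc_time_values
+  ) %>%
+  ungroup() %>%
+  # Unpack fc into fc_point, fc_lower, fc_upper
+  tidyr::unpack(fc, names_sep = "_") %>%
+  head(10)
+```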

To finish off, we plot the forecasts at some times (spaced out by a few months)
over the last year, at multiple horizons: 7, 14, 21, and 28 days ahead. To do
@@ -276,13 +301,16 @@ so, we encapsulate the process of generating forecasts into a simple function,
so that we can call it a few times.

```{r, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 6}
-# Note the use of all_rows = TRUE (keeps all original rows in the output)
+# Note the use of .all_rows = TRUE (keeps all original rows in the output)
k_week_ahead <- function(x, ahead = 7) {
  x %>%
-    group_by(.data$geo_value) %>%
+    group_by(geo_value) %>%
+    epi_slide(cases_7dav = mean(.data$cases, na.rm = TRUE), .window_size = 7) %>%
    epi_slide(
-      fc = prob_ar(.data$cases_7dav, ahead = ahead), before = 119,
-      ref_time_values = fc_time_values, all_rows = TRUE
+      fc = prob_ar(.data$cases_7dav, ahead = ahead),
+      .window_size = 120,
+      .ref_time_values = fc_time_values,
+      .all_rows = TRUE
    ) %>%
    ungroup() %>%
    mutate(target_date = .data$time_value + ahead)
@@ -290,21 +318,21 @@ k_week_ahead <- function(x, ahead = 7) {

# First generate the forecasts, and bind them together
z <- bind_rows(
-  k_week_ahead(x, ahead = 7),
-  k_week_ahead(x, ahead = 14),
-  k_week_ahead(x, ahead = 21),
-  k_week_ahead(x, ahead = 28)
+  k_week_ahead(edf, ahead = 7),
+  k_week_ahead(edf, ahead = 14),
+  k_week_ahead(edf, ahead = 21),
+  k_week_ahead(edf, ahead = 28)
)

# Now plot them, on top of actual COVID-19 case counts
ggplot(z) +
  geom_line(aes(x = time_value, y = cases_7dav), color = "gray50") +
  geom_ribbon(aes(
-    x = target_date, ymin = fc_lower, ymax = fc_upper,
+    x = target_date, ymin = fc$lower, ymax = fc$upper,
    group = time_value
  ), fill = 6, alpha = 0.4) +
-  geom_line(aes(x = target_date, y = fc_point, group = time_value)) +
-  geom_point(aes(x = target_date, y = fc_point, group = time_value),
+  geom_line(aes(x = target_date, y = fc$point, group = time_value)) +
+  geom_point(aes(x = target_date, y = fc$point, group = time_value),
    size = 0.5
  ) +
  geom_vline(
@@ -341,8 +369,10 @@ example in the
[archive vignette](https://cmu-delphi.github.io/epiprocess/articles/archive.html).

## Attribution
+
+The `percent_cli` data is a modified part of the [COVIDcast Epidata API Doctor Visits data](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/doctor-visits.html). This dataset is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/). Copyright Delphi Research Group at Carnegie Mellon University 2020.
+
This document contains a dataset that is a modified part of the [COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19) as [republished in the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html). This data set is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/) by the Johns Hopkins University on behalf of its Center for Systems Science and Engineering. Copyright Johns Hopkins University 2020.

[From the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html):

- These signals are taken directly from the JHU CSSE [COVID-19 GitHub repository](https://github.com/CSSEGISandData/COVID-19) without changes.
-
+These signals are taken directly from the JHU CSSE [COVID-19 GitHub repository](https://github.com/CSSEGISandData/COVID-19) without changes.