From 76e516de064231b2ce7725ebfac4e7cf570b66dd Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Thu, 24 Oct 2024 10:45:34 -0700 Subject: [PATCH 01/39] doc: update from Rachel's comments --- R/methods-epi_df.R | 2 +- man/complete.epi_df.Rd | 4 +++- vignettes/epiprocess.Rmd | 10 +++++----- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/R/methods-epi_df.R b/R/methods-epi_df.R index 84a75e46..7d99bd49 100644 --- a/R/methods-epi_df.R +++ b/R/methods-epi_df.R @@ -258,7 +258,7 @@ group_modify.epi_df <- function(.data, .f, ..., .keep = FALSE) { #' Complete epi_df #' -#' A `tidyr::complete()` analogue for `epi_df`` objects. This function +#' A `tidyr::complete()` analogue for `epi_df` objects. This function #' can be used, for example, to add rows for missing combinations #' of `geo_value` and `time_value`, filling other columns with `NA`s. #' See the examples for usage details. diff --git a/man/complete.epi_df.Rd b/man/complete.epi_df.Rd index 71dbcb38..7645a01f 100644 --- a/man/complete.epi_df.Rd +++ b/man/complete.epi_df.Rd @@ -16,7 +16,9 @@ \item{explicit}{see \code{\link[tidyr:complete]{tidyr::complete}}} } \description{ -A \code{tidyr::complete()} analogue for \verb{epi_df`` objects. This function can be used, for example, to add rows for missing combinations of }geo_value\code{and}time_value\verb{, filling other columns with }NA`s. +A \code{tidyr::complete()} analogue for \code{epi_df} objects. This function +can be used, for example, to add rows for missing combinations +of \code{geo_value} and \code{time_value}, filling other columns with \code{NA}s. See the examples for usage details. } \examples{ diff --git a/vignettes/epiprocess.Rmd b/vignettes/epiprocess.Rmd index 613f9146..b9648e00 100644 --- a/vignettes/epiprocess.Rmd +++ b/vignettes/epiprocess.Rmd @@ -86,7 +86,7 @@ edf %>% ``` We can compute the 7 day moving average of the confirmed daily cases for each -geo_value by using the `epi_slide_mean()` function. For a more in-depth guide to +`geo_value` by using the `epi_slide_mean()` function. For a more in-depth guide to sliding, see `vignette("epi_df")`. ```{r} @@ -96,7 +96,7 @@ edf %>% ``` We can compute the growth rate of the confirmed cumulative cases for each -geo_value. For a more in-depth guide to growth rates, see `vignette("growth_rate")`. +`geo_value`. For a more in-depth guide to growth rates, see `vignette("growth_rate")`. ```{r} edf %>% @@ -104,7 +104,7 @@ edf %>% mutate(cases_growth = growth_rate(x = time_value, y = cases_cumulative, method = "rel_change", h = 7)) ``` -Detect outliers in daily reported cases for each geo_value. For a more in-depth +Detect outliers in daily reported cases for each `geo_value`. For a more in-depth guide to outlier detection, see `vignette("outliers")`. ```{r message=FALSE} @@ -114,8 +114,8 @@ edf %>% ungroup() ``` -Add a column to the epi_df object with the daily deaths for each geo_value and -compute the correlations between cases and deaths for each geo_value. For a more +Add a column to the epi_df object with the daily deaths for each `geo_value` and +compute the correlations between cases and deaths for each `geo_value`. For a more in-depth guide to correlations, see `vignette("correlation")`. ```{r} From 63bdc1da6428e1c248445427c865cf963c1c1fb4 Mon Sep 17 00:00:00 2001 From: "Logan C. 
Brooks" Date: Thu, 24 Oct 2024 15:53:37 -0700 Subject: [PATCH 02/39] docs(complete.epi_df): fix backticks, tweak 1-line summary --- R/methods-epi_df.R | 4 ++-- man/complete.epi_df.Rd | 6 ++++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/R/methods-epi_df.R b/R/methods-epi_df.R index 84a75e46..ffbfa9c5 100644 --- a/R/methods-epi_df.R +++ b/R/methods-epi_df.R @@ -256,9 +256,9 @@ group_modify.epi_df <- function(.data, .f, ..., .keep = FALSE) { dplyr::dplyr_reconstruct(NextMethod(), .data) } -#' Complete epi_df +#' Complete an `epi_df` with additional rows for missing key combinations #' -#' A `tidyr::complete()` analogue for `epi_df`` objects. This function +#' A `tidyr::complete()` analogue for `epi_df` objects. This function #' can be used, for example, to add rows for missing combinations #' of `geo_value` and `time_value`, filling other columns with `NA`s. #' See the examples for usage details. diff --git a/man/complete.epi_df.Rd b/man/complete.epi_df.Rd index 71dbcb38..02323a5c 100644 --- a/man/complete.epi_df.Rd +++ b/man/complete.epi_df.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/methods-epi_df.R \name{complete.epi_df} \alias{complete.epi_df} -\title{Complete epi_df} +\title{Complete an \code{epi_df} with additional rows for missing key combinations} \usage{ \method{complete}{epi_df}(data, ..., fill = list(), explicit = TRUE) } @@ -16,7 +16,9 @@ \item{explicit}{see \code{\link[tidyr:complete]{tidyr::complete}}} } \description{ -A \code{tidyr::complete()} analogue for \verb{epi_df`` objects. This function can be used, for example, to add rows for missing combinations of }geo_value\code{and}time_value\verb{, filling other columns with }NA`s. +A \code{tidyr::complete()} analogue for \code{epi_df} objects. This function +can be used, for example, to add rows for missing combinations +of \code{geo_value} and \code{time_value}, filling other columns with \code{NA}s. See the examples for usage details. } \examples{ From c3be3acc34f69c55465b2d272dc49b2a9af5d5c3 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Fri, 25 Oct 2024 11:19:18 -0700 Subject: [PATCH 03/39] docs(complete.epi_df): further tweak summary line --- R/methods-epi_df.R | 2 +- man/complete.epi_df.Rd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/methods-epi_df.R b/R/methods-epi_df.R index ffbfa9c5..6e19f753 100644 --- a/R/methods-epi_df.R +++ b/R/methods-epi_df.R @@ -256,7 +256,7 @@ group_modify.epi_df <- function(.data, .f, ..., .keep = FALSE) { dplyr::dplyr_reconstruct(NextMethod(), .data) } -#' Complete an `epi_df` with additional rows for missing key combinations +#' "Complete" an `epi_df`, adding missing rows and/or replacing `NA`s #' #' A `tidyr::complete()` analogue for `epi_df` objects. This function #' can be used, for example, to add rows for missing combinations diff --git a/man/complete.epi_df.Rd b/man/complete.epi_df.Rd index 02323a5c..38c9a6fe 100644 --- a/man/complete.epi_df.Rd +++ b/man/complete.epi_df.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/methods-epi_df.R \name{complete.epi_df} \alias{complete.epi_df} -\title{Complete an \code{epi_df} with additional rows for missing key combinations} +\title{"Complete" an \code{epi_df}, adding missing rows and/or replacing \code{NA}s} \usage{ \method{complete}{epi_df}(data, ..., fill = list(), explicit = TRUE) } From ac12ec185886a294f95bef0d38d9b3587bb5eaa7 Mon Sep 17 00:00:00 2001 From: "Logan C. 
Brooks" Date: Wed, 6 Nov 2024 03:56:53 -0800 Subject: [PATCH 04/39] refactor(epi_slide_opt): simplify tidyselect --- NAMESPACE | 2 -- R/slide.R | 8 ++------ 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 1fd65d37..04e8bf61 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -189,7 +189,6 @@ importFrom(rlang,"%||%") importFrom(rlang,.data) importFrom(rlang,.env) importFrom(rlang,arg_match) -importFrom(rlang,as_label) importFrom(rlang,caller_arg) importFrom(rlang,caller_env) importFrom(rlang,check_dots_empty) @@ -207,7 +206,6 @@ importFrom(rlang,is_quosure) importFrom(rlang,list2) importFrom(rlang,missing_arg) importFrom(rlang,new_function) -importFrom(rlang,quo_get_expr) importFrom(rlang,quo_is_missing) importFrom(rlang,sym) importFrom(rlang,syms) diff --git a/R/slide.R b/R/slide.R index f936916e..e7325e66 100644 --- a/R/slide.R +++ b/R/slide.R @@ -564,7 +564,7 @@ get_before_after_from_window <- function(window_size, align, time_type) { #' functions). #' #' @importFrom dplyr bind_rows mutate %>% arrange tibble select all_of -#' @importFrom rlang enquo quo_get_expr as_label expr_label caller_arg +#' @importFrom rlang enquo expr_label caller_arg #' @importFrom tidyselect eval_select #' @importFrom purrr map map_lgl #' @importFrom data.table frollmean frollsum frollapply @@ -714,11 +714,7 @@ epi_slide_opt <- function( # positions of user-provided `col_names` into string column names. We avoid # using `names(pos)` directly for robustness and in case we later want to # allow users to rename fields via tidyselection. - if (inherits(quo_get_expr(enquo(.col_names)), "character")) { - pos <- eval_select(dplyr::all_of(.col_names), data = .x, allow_rename = FALSE) - } else { - pos <- eval_select(enquo(.col_names), data = .x, allow_rename = FALSE) - } + pos <- eval_select(enquo(.col_names), data = .x, allow_rename = FALSE) col_names_chr <- names(.x)[pos] # Always rename results to "slide_value_". result_col_names <- paste0("slide_value_", col_names_chr) From c75de3829a5e07c4858030c987919bbe4ce9e75d Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Thu, 7 Nov 2024 14:49:45 -0800 Subject: [PATCH 05/39] Fix typo + adjust naming of sliding median example --- R/slide.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/slide.R b/R/slide.R index e7325e66..3064b087 100644 --- a/R/slide.R +++ b/R/slide.R @@ -6,8 +6,8 @@ #' as follows: #' #' ``` -#' # Create new column `cases_7dm` that contains a 7-day trailing median of cases -#' epi_slide(edf, cases_7dav = median(cases), .window_size = 7) +#' # Create new column `cases_7dmed` that contains a 7-day trailing median of cases +#' epi_slide(edf, cases_7dmed = median(cases), .window_size = 7) #' ``` #' #' For two very common use cases, we provide optimized functions that are much From a2f2f448c639334767c7fbaa2e28bd7f69b0bd53 Mon Sep 17 00:00:00 2001 From: "Daniel J. 
McDonald" Date: Mon, 11 Nov 2024 16:19:58 -0800 Subject: [PATCH 06/39] delphidocs on dev --- .github/workflows/pkgdown.yaml | 26 +++---------------- _pkgdown.yml | 46 +++++++++++----------------------- 2 files changed, 18 insertions(+), 54 deletions(-) diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index 886055ae..3a1ff927 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -28,7 +28,7 @@ jobs: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} DELPHI_EPIDATA_KEY: ${{ secrets.SECRET_EPIPROCESS_GHACTIONS_DELPHI_EPIDATA_KEY }} steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: r-lib/actions/setup-pandoc@v2 @@ -42,32 +42,12 @@ jobs: needs: website - name: Build site - # - target_ref gets the ref from a different variable, depending on the event - # - override allows us to set the pkgdown mode and version_label - # - mode: release is the standard build mode, devel places the site in /dev - # - version_label: 'light' and 'success' are CSS labels for Bootswatch: Cosmo - # https://bootswatch.com/cosmo/ - # - we use pkgdown:::build_github_pages to build the site because of an issue in pkgdown - # https://github.com/r-lib/pkgdown/issues/2257 - run: | - target_ref <- "${{ github.event_name == 'pull_request' && github.base_ref || github.ref }}" - override <- if (target_ref == "main" || target_ref == "refs/heads/main") { - list(development = list(mode = "release", version_label = "light")) - } else if (target_ref == "dev" || target_ref == "refs/heads/dev") { - list(development = list(mode = "devel", version_label = "success")) - } else { - stop("Unexpected target_ref: ", target_ref) - } - pkg <- pkgdown::as_pkgdown(".", override = override) - cli::cli_rule("Cleaning files from old site...") - pkgdown::clean_site(pkg) - pkgdown::build_site(pkg, preview = FALSE, install = FALSE, new_process = FALSE) - pkgdown:::build_github_pages(pkg) + run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) shell: Rscript {0} - name: Deploy to GitHub pages ๐Ÿš€ if: github.event_name != 'pull_request' - uses: JamesIves/github-pages-deploy-action@v4.4.1 + uses: JamesIves/github-pages-deploy-action@v4.5.0 with: clean: false branch: gh-pages diff --git a/_pkgdown.yml b/_pkgdown.yml index 2214df7c..743adc4b 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -1,40 +1,25 @@ -# Colors should stay consistent across epipredict, epiprocess, and epidatr, -# using Carnegie Red -# https://www.cmu.edu/brand/brand-guidelines/visual-identity/colors.html - -# This is to give a default value to the `mode` parameter in the -# `pkgdown::build_site` function. This is useful when building the site locally, -# as it will default to `devel` mode. In practice, this should all be handled -# dynamically by the CI/CD pipeline. 
development: - mode: devel - version_label: success + mode: auto template: - bootstrap: 5 - bootswatch: cosmo - bslib: - font_scale: 1.0 - primary: "#C41230" - success: "#B4D43C" - link-color: "#C41230" - -navbar: - bg: primary - type: light + package: delphidocs url: https://cmu-delphi.github.io/epiprocess/ home: links: - - text: Introduction to Delphi's Tooling Work - href: https://cmu-delphi.github.io/delphi-tooling-book/ - - text: Get the epipredict R package + - text: The epipredict package href: https://cmu-delphi.github.io/epipredict/ - - text: Get the epidatr R package - href: https://github.com/cmu-delphi/epidatr - - text: Get the epidatasets R package - href: https://cmu-delphi.github.io/epidatasets/ + sidebar: + structure: [links, license, community, citation, authors, dev, related] + components: + related: + title: Delphi packages and resources + text: | + * [The epidatr package](https://github.com/cmu-delphi/epidatr/) + * [The epipredict package](https://cmu-delphi.github.io/epipredict/) + * [The epidatasets package](https://cmu-delphi.github.io/epidatasets/) + * [Introduction to Delphi's Tooling Work](https://cmu-delphi.github.io/delphi-tooling-book/) articles: - title: Using the package @@ -54,10 +39,9 @@ articles: repo: url: - home: https://github.com/cmu-delphi/epiprocess/tree/main/ - source: https://github.com/cmu-delphi/epiprocess/blob/main/ + home: https://github.com/cmu-delphi/epiprocess/ + source: https://github.com/cmu-delphi/epiprocess/ issue: https://github.com/cmu-delphi/epiprocess/issues - user: https://github.com/ reference: - title: "`epi_df` basics" From fab1c088e22cc68562aac0182941eff3639db050 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 6 Nov 2024 04:22:53 -0800 Subject: [PATCH 07/39] feat(epi_slide_opt)!: add `.prefix =, .suffix =, .new_col_names =` - BREAKING CHANGE: adjust default output column naming scheme, disallow overwriting columns. 
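  An illustrative sketch of the naming change (assuming `epiprocess` and `dplyr`
  are attached and `edf` is a daily `epi_df` with a `cases` column; `edf` is a
  placeholder name, not an object introduced by this commit):

      library(dplyr)
      library(epiprocess)

      edf %>%
        group_by(geo_value) %>%
        epi_slide_mean(cases, .window_size = 7)
      # previously appended a `slide_value_cases` column; with the new default
      # `.suffix = "_{.n}{.time_unit_abbr}{.align_abbr}{.f_abbr}"` the result
      # column is named `cases_7dav` instead

      edf %>%
        group_by(geo_value) %>%
        epi_slide_mean(cases, .window_size = 7, .prefix = "smoothed_")
      # appends a `smoothed_cases` column; names that clash with existing
      # columns now raise an error rather than overwriting them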
--- NAMESPACE | 5 + R/epi_df.R | 11 +- R/epiprocess-package.R | 3 + R/slide.R | 172 +++++++++++++++++++++++++------- R/utils.R | 67 ++++++++++++- README.Rmd | 3 +- README.md | 3 +- man/epi_slide.Rd | 4 +- man/epi_slide_opt.Rd | 57 +++++++++-- man/time_delta_to_n_steps.Rd | 31 ++++++ tests/testthat/test-epi_slide.R | 155 ++++++++++++++++++++++++++-- 11 files changed, 439 insertions(+), 72 deletions(-) create mode 100644 man/time_delta_to_n_steps.Rd diff --git a/NAMESPACE b/NAMESPACE index 04e8bf61..e044739b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -114,6 +114,7 @@ importFrom(checkmate,assert_list) importFrom(checkmate,assert_logical) importFrom(checkmate,assert_numeric) importFrom(checkmate,assert_scalar) +importFrom(checkmate,assert_string) importFrom(checkmate,checkInt) importFrom(checkmate,check_atomic) importFrom(checkmate,check_data_frame) @@ -176,6 +177,7 @@ importFrom(dplyr,summarize) importFrom(dplyr,tibble) importFrom(dplyr,ungroup) importFrom(ggplot2,autoplot) +importFrom(glue,glue) importFrom(lifecycle,deprecated) importFrom(lubridate,as.period) importFrom(lubridate,days) @@ -198,6 +200,7 @@ importFrom(rlang,env) importFrom(rlang,expr_label) importFrom(rlang,f_env) importFrom(rlang,f_rhs) +importFrom(rlang,is_bare_integerish) importFrom(rlang,is_environment) importFrom(rlang,is_formula) importFrom(rlang,is_function) @@ -206,6 +209,7 @@ importFrom(rlang,is_quosure) importFrom(rlang,list2) importFrom(rlang,missing_arg) importFrom(rlang,new_function) +importFrom(rlang,quo_get_env) importFrom(rlang,quo_is_missing) importFrom(rlang,sym) importFrom(rlang,syms) @@ -230,3 +234,4 @@ importFrom(tidyselect,starts_with) importFrom(tsibble,as_tsibble) importFrom(utils,capture.output) importFrom(utils,tail) +importFrom(vctrs,vec_data) diff --git a/R/epi_df.R b/R/epi_df.R index bcf9e56f..5cf379e2 100644 --- a/R/epi_df.R +++ b/R/epi_df.R @@ -232,7 +232,6 @@ as_epi_df.tbl_df <- function( as_of, other_keys = character(), ...) { - # possible standard substitutions for time_value x <- rename(x, ...) x <- guess_column_name(x, "time_value", time_column_names()) x <- guess_column_name(x, "geo_value", geo_column_names()) @@ -282,11 +281,11 @@ as_epi_df.tbl_df <- function( cli_abort("as_epi_df: `other_keys` can't include \".time_value_counts\"") } - duplicated_time_values <- x %>% - group_by(across(all_of(c("geo_value", "time_value", other_keys)))) %>% - filter(dplyr::n() > 1) %>% - ungroup() - if (nrow(duplicated_time_values) > 0) { + if (anyDuplicated(x[c("geo_value", "time_value", other_keys)])) { + duplicated_time_values <- x %>% + group_by(across(all_of(c("geo_value", "time_value", other_keys)))) %>% + filter(dplyr::n() > 1) %>% + ungroup() bad_data <- capture.output(duplicated_time_values) cli_abort( "as_epi_df: some groups in the data have duplicated time values. 
epi_df requires a unique time_value per group.", diff --git a/R/epiprocess-package.R b/R/epiprocess-package.R index e28cec0f..b640a0b5 100644 --- a/R/epiprocess-package.R +++ b/R/epiprocess-package.R @@ -6,6 +6,7 @@ #' @importFrom checkmate anyInfinite anyMissing assert assert_character #' @importFrom checkmate assert_class assert_data_frame assert_int assert_list #' @importFrom checkmate assert_logical assert_numeric assert_scalar checkInt +#' @importFrom checkmate assert_string #' @importFrom checkmate check_atomic check_data_frame expect_class test_int #' @importFrom checkmate check_names #' @importFrom checkmate test_subset test_set_equal vname @@ -16,6 +17,8 @@ #' @importFrom dplyr select #' @importFrom lifecycle deprecated #' @importFrom rlang %||% +#' @importFrom rlang is_bare_integerish +#' @importFrom vctrs vec_data ## usethis namespace: end NULL diff --git a/R/slide.R b/R/slide.R index 3064b087..9d383446 100644 --- a/R/slide.R +++ b/R/slide.R @@ -537,7 +537,7 @@ get_before_after_from_window <- function(window_size, align, time_type) { #' #' @template basic-slide-params #' @param .col_names <[`tidy-select`][dplyr_tidy_select]> An unquoted column -#' name(e.g., `cases`), multiple column names (e.g., `c(cases, deaths)`), +#' name (e.g., `cases`), multiple column names (e.g., `c(cases, deaths)`), #' [other tidy-select expression][tidyselect::language], or a vector of #' characters (e.g. `c("cases", "deaths")`). Variable names can be used as if #' they were positions in the data frame, so expressions like `x:y` can be @@ -559,13 +559,40 @@ get_before_after_from_window <- function(window_size, align, time_type) { #' `epi_slide_mean` and `epi_slide_sum`) take care of window completion #' automatically to prevent associated errors. #' @param ... Additional arguments to pass to the slide computation `.f`, for -#' example, `algo` or `na.rm` in data.table functions. You don't need to -#' specify `.x`, `.window_size`, or `.align` (or `before`/`after` for slider -#' functions). +#' example, `algo` or `na.rm` in data.table functions. You don't need to +#' specify `.x`, `.window_size`, or `.align` (or `before`/`after` for slider +#' functions). +#' @param .prefix Optional [`glue::glue`] format string; name the slide result +#' column(s) by attaching this prefix to the corresponding input column(s). +#' Some shorthand is supported for basing the output names on `.window_size` +#' or other arguments; see "Prefix and suffix shorthand" below. +#' @param .suffix Optional [`glue::glue`] format string; like `.prefix`. The +#' default naming behavior is equivalent to `.suffix = +#' "_{.n}{.time_unit_abbr}{.align_abbr}{.f_abbr}"`. Can be used in combination +#' with `.prefix`. +#' @param .new_col_names Optional character vector with length matching the +#' number of input columns from `.col_names`; name the slide result column(s) +#' with these names. Cannot be used in combination with `.prefix` and/or +#' `.suffix`. +#' +#' @section Prefix and suffix shorthand: +#' +#' [`glue::glue`] format strings specially interpret content within curly +#' braces. E.g., `glue::glue("ABC{2 + 2}")` evaluates to `"ABC4"`. For `.prefix` +#' and `.suffix`, we provide `glue` with some additional variable bindings: +#' +#' - `{.n}` will be the number of time steps in the computation +#' corresponding to the `.window_size`. 
+#' - `{.time_unit_abbr}` will be a lower-case letter corresponding to the +#' `time_type` of `.x` +#' - `{.align_abbr}` will be `""` if `.align` is the default of `"right"`; +#' otherwise, it will be the first letter of `.align` +#' - `{.f_abbr}` will be a short string based on what `.f` #' #' @importFrom dplyr bind_rows mutate %>% arrange tibble select all_of -#' @importFrom rlang enquo expr_label caller_arg +#' @importFrom rlang enquo expr_label caller_arg quo_get_env #' @importFrom tidyselect eval_select +#' @importFrom glue glue #' @importFrom purrr map map_lgl #' @importFrom data.table frollmean frollsum frollapply #' @importFrom lubridate as.period @@ -577,8 +604,7 @@ get_before_after_from_window <- function(window_size, align, time_type) { #' # Compute a 7-day trailing average on cases. #' cases_deaths_subset %>% #' group_by(geo_value) %>% -#' epi_slide_opt(cases, .f = data.table::frollmean, .window_size = 7) %>% -#' dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) +#' epi_slide_opt(cases, .f = data.table::frollmean, .window_size = 7) #' #' # Same as above, but adjust `frollmean` settings for speed, accuracy, and #' # to allow partially-missing windows. @@ -588,11 +614,11 @@ get_before_after_from_window <- function(window_size, align, time_type) { #' cases, #' .f = data.table::frollmean, .window_size = 7, #' algo = "exact", hasNA = TRUE, na.rm = TRUE -#' ) %>% -#' dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) +#' ) epi_slide_opt <- function( .x, .col_names, .f, ..., .window_size = NULL, .align = c("right", "center", "left"), + .prefix = NULL, .suffix = NULL, .new_col_names = NULL, .ref_time_values = NULL, .all_rows = FALSE) { assert_class(.x, "epi_df") @@ -620,7 +646,7 @@ epi_slide_opt <- function( if ("new_col_name" %in% provided_args || ".new_col_name" %in% provided_args) { cli::cli_abort( "epi_slide_opt: the argument `new_col_name` is not supported for `epi_slide_opt`. If you want to customize - the output column names, use `dplyr::rename` after the slide.", + the output column names, use `.prefix =`, `.suffix =`, or `.new_col_**names** =`.", class = "epiprocess__epi_slide_opt__new_name_not_supported" ) } @@ -644,21 +670,37 @@ epi_slide_opt <- function( ) } + # The position of a given column can be differ between input `.x` and + # `.data_group` since the grouping step by default drops grouping columns. + # To avoid rerunning `eval_select` for every `.data_group`, convert + # positions of user-provided `col_names` into string column names. We avoid + # using `names(pos)` directly for robustness and in case we later want to + # allow users to rename fields via tidyselection. + col_names_quo <- enquo(.col_names) + pos <- eval_select(col_names_quo, data = .x, allow_rename = FALSE) + col_names_chr <- names(.x)[pos] + # Check that slide function `.f` is one of those short-listed from # `data.table` and `slider` (or a function that has the exact same # definition, e.g. if the function has been reexported or defined # locally). 
- if (any(map_lgl( - list(frollmean, frollsum, frollapply), - ~ identical(.f, .x) - ))) { - f_from_package <- "data.table" - } else if (any(map_lgl( - list(slide_sum, slide_prod, slide_mean, slide_min, slide_max, slide_all, slide_any), - ~ identical(.f, .x) - ))) { - f_from_package <- "slider" - } else { + f_possibilities <- + tibble::tribble( + ~f, ~package, ~abbr, + frollmean, "data.table", "av", + frollsum, "data.table", "sum", + frollapply, "data.table", "slide", + slide_sum, "slider", "sum", + slide_prod, "slider", "prod", + slide_mean, "slider", "av", + slide_min, "slider", "min", + slide_max, "slider", "max", + slide_all, "slider", "all", + slide_any, "slider", "any", + ) + f_info <- f_possibilities %>% + filter(map_lgl(.data$f, ~ identical(.f, .x))) + if (nrow(f_info) == 0L) { # `f` is from somewhere else and not supported cli_abort( c( @@ -672,6 +714,7 @@ epi_slide_opt <- function( epiprocess__f = .f ) } + f_from_package <- f_info$package user_provided_rtvs <- !is.null(.ref_time_values) if (!user_provided_rtvs) { @@ -702,22 +745,72 @@ epi_slide_opt <- function( validate_slide_window_arg(.window_size, time_type) window_args <- get_before_after_from_window(.window_size, .align, time_type) + # Handle output naming + if ((!is.null(.prefix) || !is.null(.suffix)) && !is.null(.new_col_names)) { + cli_abort( + "Can't use both .prefix/.suffix and .new_col_names at the same time.", + class = "epiprocess__epi_slide_opt_incompatible_naming_args" + ) + } + assert_string(.prefix, null.ok = TRUE) + assert_string(.suffix, null.ok = TRUE) + assert_character(.new_col_names, len = length(col_names_chr), null.ok = TRUE) + if (is.null(.prefix) && is.null(.suffix) && is.null(.new_col_names)) { + .suffix <- "_{.n}{.time_unit_abbr}{.align_abbr}{.f_abbr}" + # ^ does not account for any arguments specified to underlying functions via + # `...` such as `na.rm =`, nor does it distinguish between functions from + # different packages accomplishing the same type of computation. Those are + # probably only set one way per task, so this probably produces cleaner + # names without clashes (though maybe some confusion if switching between + # code with different settings). + } + if (!is.null(.prefix) || !is.null(.suffix)) { + .prefix <- .prefix %||% "" + .suffix <- .suffix %||% "" + if (identical(.window_size, Inf)) { + n <- "running_" + time_unit_abbr <- "" + align_abbr <- "" + } else { + n <- time_delta_to_n_steps(.window_size, time_type) + time_unit_abbr <- time_type_unit_abbr(time_type) + align_abbr <- c(right = "", center = "c", left = "l")[[.align]] + } + glue_env <- rlang::env( + .n = n, + .time_unit_abbr = time_unit_abbr, + .align_abbr = align_abbr, + .f_abbr = f_info$abbr, + quo_get_env(col_names_quo) + ) + .new_col_names <- unclass( + glue(.prefix, .envir = glue_env) + + col_names_chr + + glue(.suffix, .envir = glue_env) + ) + } else { + # `.new_col_names` was provided by user; we don't need to do anything. 
+ } + if (any(.new_col_names %in% names(.x))) { + cli_abort(c( + "Naming conflict between new columns and existing columns", + "x" = "Overlapping names: {format_varnames(intersect(.new_col_names, names(.x)))}" + ), class = "epiprocess__epi_slide_opt_old_new_name_conflict") + } + if (anyDuplicated(.new_col_names)) { + cli_abort(c( + "New column names contain duplicates", + "x" = "Duplicated names: {format_varnames(unique(.new_col_names[duplicated(.new_col_names)]))}" + ), class = "epiprocess__epi_slide_opt_new_name_duplicated") + } + result_col_names <- .new_col_names + # Make a complete date sequence between min(.x$time_value) and max(.x$time_value). date_seq_list <- full_date_seq(.x, window_args$before, window_args$after, time_type) all_dates <- date_seq_list$all_dates pad_early_dates <- date_seq_list$pad_early_dates pad_late_dates <- date_seq_list$pad_late_dates - # The position of a given column can be differ between input `.x` and - # `.data_group` since the grouping step by default drops grouping columns. - # To avoid rerunning `eval_select` for every `.data_group`, convert - # positions of user-provided `col_names` into string column names. We avoid - # using `names(pos)` directly for robustness and in case we later want to - # allow users to rename fields via tidyselection. - pos <- eval_select(enquo(.col_names), data = .x, allow_rename = FALSE) - col_names_chr <- names(.x)[pos] - # Always rename results to "slide_value_". - result_col_names <- paste0("slide_value_", col_names_chr) slide_one_grp <- function(.data_group, .group_key, ...) { missing_times <- all_dates[!(all_dates %in% .data_group$time_value)] # `frollmean` requires a full window to compute a result. Add NA values @@ -827,8 +920,7 @@ epi_slide_opt <- function( #' # Compute a 7-day trailing average on cases. #' cases_deaths_subset %>% #' group_by(geo_value) %>% -#' epi_slide_mean(cases, .window_size = 7) %>% -#' dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) +#' epi_slide_mean(cases, .window_size = 7) #' #' # Same as above, but adjust `frollmean` settings for speed, accuracy, and #' # to allow partially-missing windows. @@ -838,11 +930,11 @@ epi_slide_opt <- function( #' cases, #' .window_size = 7, #' na.rm = TRUE, algo = "exact", hasNA = TRUE -#' ) %>% -#' dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) +#' ) epi_slide_mean <- function( .x, .col_names, ..., .window_size = NULL, .align = c("right", "center", "left"), + .prefix = NULL, .suffix = NULL, .new_col_names = NULL, .ref_time_values = NULL, .all_rows = FALSE) { # Deprecated argument handling provided_args <- rlang::call_args_names(rlang::call_match()) @@ -885,6 +977,9 @@ epi_slide_mean <- function( ..., .window_size = .window_size, .align = .align, + .prefix = .prefix, + .suffix = .suffix, + .new_col_names = .new_col_names, .ref_time_values = .ref_time_values, .all_rows = .all_rows ) @@ -899,11 +994,11 @@ epi_slide_mean <- function( #' # Compute a 7-day trailing sum on cases. 
#' cases_deaths_subset %>% #' group_by(geo_value) %>% -#' epi_slide_sum(cases, .window_size = 7) %>% -#' dplyr::select(geo_value, time_value, cases, cases_7dsum = slide_value_cases) +#' epi_slide_sum(cases, .window_size = 7) epi_slide_sum <- function( .x, .col_names, ..., .window_size = NULL, .align = c("right", "center", "left"), + .prefix = NULL, .suffix = NULL, .new_col_names = NULL, .ref_time_values = NULL, .all_rows = FALSE) { # Deprecated argument handling provided_args <- rlang::call_args_names(rlang::call_match()) @@ -945,6 +1040,9 @@ epi_slide_sum <- function( ..., .window_size = .window_size, .align = .align, + .prefix = .prefix, + .suffix = .suffix, + .new_col_names = .new_col_names, .ref_time_values = .ref_time_values, .all_rows = .all_rows ) diff --git a/R/utils.R b/R/utils.R index 1bfd2129..06876f08 100644 --- a/R/utils.R +++ b/R/utils.R @@ -640,7 +640,7 @@ guess_time_type <- function(time_value, time_value_arg = rlang::caller_arg(time_ return("day") } else if (inherits(time_value, "yearmonth")) { return("yearmonth") - } else if (rlang::is_integerish(time_value)) { + } else if (is_bare_integerish(time_value)) { return("integer") } @@ -1109,3 +1109,68 @@ validate_slide_window_arg <- function(arg, time_type, lower = 1, allow_inf = TRU ) } } + + +#' Convert a time delta to a compatible integerish number of steps between time values +#' +#' @param time_delta a vector that can be added to time values of time type +#' `time_type` to arrive at other time values of that time type, or +#' `r lifecycle::badge("experimental")` such a vector with Inf/-Inf entries mixed +#' in, if supported by the class of `time_delta`, even if `time_type` doesn't +#' necessarily support Inf/-Inf entries. Basically a slide window arg but +#' without sign and length restrictions. +#' @param time_type as in [`validate_slide_window_arg`] +#' @return [bare integerish][rlang::is_integerish] vector (with possible +#' infinite values) that produces the same result as `time_delta` when added +#' to time values of time type `time_type`. If the given time type does not +#' support infinite values, then it should produce +Inf or -Inf for analogous +#' entries of `time_delta`, and match the addition result match the addition +#' result for non-infinite values, and product +Inf / -Inf when match the sign +#' and of `time_delta`. +#' +#' @keywords internal +time_delta_to_n_steps <- function(time_delta, time_type) { + # could be S3 if we're willing to export + if (inherits(time_delta, "difftime")) { + output_units <- switch(time_type, + day = "days", + week = "weeks", + cli_abort("difftime objects not supported for time_type {format_chr_with_quotes(time_type)}") + ) + units(time_delta) <- output_units # converts number to represent same duration; not just attr<- + n_steps <- vec_data(time_delta) + if (!is_bare_integerish(n_steps)) { + cli_abort("`time_delta` did not appear to contain only integerish numbers + of steps between time values of time type {format_chr_with_quotes(time_type)}") + } + n_steps + } else if (is_bare_integerish(time_delta)) { # (allows infinite values) + switch(time_type, + day = , + week = , + yearmonth = , + integer = time_delta, + cli_abort("Invalid or unsupported time_type {format_chr_with_quotes(time_type)}") + ) + } else { + cli_abort("Invalid or unsupported kind of `time_delta`") + } +} + +# Using these unit abbreviations happens to make our automatic slide output +# naming look like taking ISO-8601 duration designations, removing the P, and +# lowercasing any characters. 
Fortnightly or sub-daily time types would need an +# adjustment to remain consistent. +time_type_unit_abbrs <- c( + day = "d", + week = "w", + yearmonth = "m" +) + +time_type_unit_abbr <- function(time_type) { + maybe_unit_abbr <- time_type_unit_abbrs[time_type] + if (is.na(maybe_unit_abbr)) { + cli_abort("Cannot determine the units of time type {format_chr_with_quotes(time_type)}") + } + maybe_unit_abbr +} diff --git a/README.Rmd b/README.Rmd index 82d60fc7..0e8756d3 100644 --- a/README.Rmd +++ b/README.Rmd @@ -113,8 +113,7 @@ Compute the 7 day moving average of the confirmed daily cases for each geo_value ```{r} edf <- edf %>% group_by(geo_value) %>% - epi_slide_mean(cases_daily, .window_size = 7, na.rm = TRUE) %>% - rename(smoothed_cases_daily = slide_value_cases_daily) + epi_slide_mean(cases_daily, .window_size = 7, na.rm = TRUE, .prefix = "smoothed_") edf ``` diff --git a/README.md b/README.md index 7c14443d..af8c24e9 100644 --- a/README.md +++ b/README.md @@ -136,8 +136,7 @@ geo\_value ``` r edf <- edf %>% group_by(geo_value) %>% - epi_slide_mean(cases_daily, .window_size = 7, na.rm = TRUE) %>% - rename(smoothed_cases_daily = slide_value_cases_daily) + epi_slide_mean(cases_daily, .window_size = 7, na.rm = TRUE, .prefix = "smoothed_") edf #> An `epi_df` object, 2,808 x 5 with metadata: #> * geo_type = state diff --git a/man/epi_slide.Rd b/man/epi_slide.Rd index 10d38957..1c399d35 100644 --- a/man/epi_slide.Rd +++ b/man/epi_slide.Rd @@ -99,8 +99,8 @@ This is useful for computations like rolling averages. The function supports many ways to specify the computation, but by far the most common use case is as follows: -\if{html}{\out{
}}\preformatted{# Create new column `cases_7dm` that contains a 7-day trailing median of cases -epi_slide(edf, cases_7dav = median(cases), .window_size = 7) +\if{html}{\out{
}}\preformatted{# Create new column `cases_7dmed` that contains a 7-day trailing median of cases +epi_slide(edf, cases_7dmed = median(cases), .window_size = 7) }\if{html}{\out{
}} For two very common use cases, we provide optimized functions that are much diff --git a/man/epi_slide_opt.Rd b/man/epi_slide_opt.Rd index cd293ee1..7f250cf4 100644 --- a/man/epi_slide_opt.Rd +++ b/man/epi_slide_opt.Rd @@ -13,6 +13,9 @@ epi_slide_opt( ..., .window_size = NULL, .align = c("right", "center", "left"), + .prefix = NULL, + .suffix = NULL, + .new_col_names = NULL, .ref_time_values = NULL, .all_rows = FALSE ) @@ -23,6 +26,9 @@ epi_slide_mean( ..., .window_size = NULL, .align = c("right", "center", "left"), + .prefix = NULL, + .suffix = NULL, + .new_col_names = NULL, .ref_time_values = NULL, .all_rows = FALSE ) @@ -33,6 +39,9 @@ epi_slide_sum( ..., .window_size = NULL, .align = c("right", "center", "left"), + .prefix = NULL, + .suffix = NULL, + .new_col_names = NULL, .ref_time_values = NULL, .all_rows = FALSE ) @@ -43,7 +52,7 @@ columns in \code{other_keys}. If grouped, we make sure the grouping is by \code{geo_value} and \code{other_keys}.} \item{.col_names}{<\code{\link[=dplyr_tidy_select]{tidy-select}}> An unquoted column -name(e.g., \code{cases}), multiple column names (e.g., \code{c(cases, deaths)}), +name (e.g., \code{cases}), multiple column names (e.g., \code{c(cases, deaths)}), \link[tidyselect:language]{other tidy-select expression}, or a vector of characters (e.g. \code{c("cases", "deaths")}). Variable names can be used as if they were positions in the data frame, so expressions like \code{x:y} can be @@ -96,6 +105,20 @@ window will be asymmetric and have one more value before the reference time than after. }} +\item{.prefix}{Optional \code{\link[glue:glue]{glue::glue}} format string; name the slide result +column(s) by attaching this prefix to the corresponding input column(s). +Some shorthand is supported for basing the output names on \code{.window_size} +or other arguments; see "Prefix and suffix shorthand" below.} + +\item{.suffix}{Optional \code{\link[glue:glue]{glue::glue}} format string; like \code{.prefix}. The +default naming behavior is equivalent to \code{.suffix = "_{.n}{.time_unit_abbr}{.align_abbr}{.f_abbr}"}. Can be used in combination +with \code{.prefix}.} + +\item{.new_col_names}{Optional character vector with length matching the +number of input columns from \code{.col_names}; name the slide result column(s) +with these names. Cannot be used in combination with \code{.prefix} and/or +\code{.suffix}.} + \item{.ref_time_values}{The time values at which to compute the slides values. By default, this is all the unique time values in \code{.x}.} @@ -119,12 +142,28 @@ These functions tend to be much faster than \code{epi_slide()}. See \code{epi_slide_sum} is a wrapper around \code{epi_slide_opt} with \code{.f = datatable::frollsum}. } +\section{Prefix and suffix shorthand}{ + + +\code{\link[glue:glue]{glue::glue}} format strings specially interpret content within curly +braces. E.g., \code{glue::glue("ABC{2 + 2}")} evaluates to \code{"ABC4"}. For \code{.prefix} +and \code{.suffix}, we provide \code{glue} with some additional variable bindings: +\itemize{ +\item \code{{.n}} will be the number of time steps in the computation +corresponding to the \code{.window_size}. 
+\item \code{{.time_unit_abbr}} will be a lower-case letter corresponding to the +\code{time_type} of \code{.x} +\item \code{{.align_abbr}} will be \code{""} if \code{.align} is the default of \code{"right"}; +otherwise, it will be the first letter of \code{.align} +\item \code{{.f_abbr}} will be a short string based on what \code{.f} +} +} + \examples{ # Compute a 7-day trailing average on cases. cases_deaths_subset \%>\% group_by(geo_value) \%>\% - epi_slide_opt(cases, .f = data.table::frollmean, .window_size = 7) \%>\% - dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) + epi_slide_opt(cases, .f = data.table::frollmean, .window_size = 7) # Same as above, but adjust `frollmean` settings for speed, accuracy, and # to allow partially-missing windows. @@ -134,13 +173,11 @@ cases_deaths_subset \%>\% cases, .f = data.table::frollmean, .window_size = 7, algo = "exact", hasNA = TRUE, na.rm = TRUE - ) \%>\% - dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) + ) # Compute a 7-day trailing average on cases. cases_deaths_subset \%>\% group_by(geo_value) \%>\% - epi_slide_mean(cases, .window_size = 7) \%>\% - dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) + epi_slide_mean(cases, .window_size = 7) # Same as above, but adjust `frollmean` settings for speed, accuracy, and # to allow partially-missing windows. @@ -150,13 +187,11 @@ cases_deaths_subset \%>\% cases, .window_size = 7, na.rm = TRUE, algo = "exact", hasNA = TRUE - ) \%>\% - dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) + ) # Compute a 7-day trailing sum on cases. cases_deaths_subset \%>\% group_by(geo_value) \%>\% - epi_slide_sum(cases, .window_size = 7) \%>\% - dplyr::select(geo_value, time_value, cases, cases_7dsum = slide_value_cases) + epi_slide_sum(cases, .window_size = 7) } \seealso{ \code{\link{epi_slide}} for the more general slide function diff --git a/man/time_delta_to_n_steps.Rd b/man/time_delta_to_n_steps.Rd new file mode 100644 index 00000000..0f9325be --- /dev/null +++ b/man/time_delta_to_n_steps.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{time_delta_to_n_steps} +\alias{time_delta_to_n_steps} +\title{Convert a time delta to a compatible integerish number of steps between time values} +\usage{ +time_delta_to_n_steps(time_delta, time_type) +} +\arguments{ +\item{time_delta}{a vector that can be added to time values of time type +\code{time_type} to arrive at other time values of that time type, or +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} such a vector with Inf/-Inf entries mixed +in, if supported by the class of \code{time_delta}, even if \code{time_type} doesn't +necessarily support Inf/-Inf entries. Basically a slide window arg but +without sign and length restrictions.} + +\item{time_type}{as in \code{\link{validate_slide_window_arg}}} +} +\value{ +\link[rlang:is_integerish]{bare integerish} vector (with possible +infinite values) that produces the same result as \code{time_delta} when added +to time values of time type \code{time_type}. 
If the given time type does not +support infinite values, then it should produce +Inf or -Inf for analogous +entries of \code{time_delta}, and match the addition result match the addition +result for non-infinite values, and product +Inf / -Inf when match the sign +and of \code{time_delta}. +} +\description{ +Convert a time delta to a compatible integerish number of steps between time values +} +\keyword{internal} diff --git a/tests/testthat/test-epi_slide.R b/tests/testthat/test-epi_slide.R index f658bcf4..e8c318c7 100644 --- a/tests/testthat/test-epi_slide.R +++ b/tests/testthat/test-epi_slide.R @@ -300,34 +300,48 @@ for (p in (param_combinations %>% transpose())) { concatenate_list_params(p) ), { - out_sum <- rlang::inject(epi_slide(test_data, ~ sum(.x$value), !!!slide_args)) %>% - rename(slide_value_value = slide_value) - out_mean <- rlang::inject(epi_slide(test_data, ~ mean(.x$value), !!!slide_args)) %>% - rename(slide_value_value = slide_value) + out_sum <- rlang::inject(epi_slide(test_data, ~ sum(.x$value), !!!slide_args)) + out_mean <- rlang::inject(epi_slide(test_data, ~ mean(.x$value), !!!slide_args)) expect_equal( out_sum, - rlang::inject(epi_slide_opt(test_data, value, .f = data.table::frollsum, !!!slide_args)) + rlang::inject(epi_slide_opt(test_data, value, + .f = data.table::frollsum, !!!slide_args, + .new_col_names = "slide_value" + )) ) expect_equal( out_sum, - rlang::inject(epi_slide_opt(test_data, value, .f = slider::slide_sum, !!!slide_args)) + rlang::inject(epi_slide_opt(test_data, value, + .f = slider::slide_sum, !!!slide_args, + .new_col_names = "slide_value" + )) ) expect_equal( out_sum, - rlang::inject(epi_slide_sum(test_data, value, !!!slide_args)) + rlang::inject(epi_slide_sum(test_data, value, !!!slide_args, + .new_col_names = "slide_value" + )) ) expect_equal( out_mean, - rlang::inject(epi_slide_opt(test_data, value, .f = data.table::frollmean, !!!slide_args)) + rlang::inject(epi_slide_opt(test_data, value, + .f = data.table::frollmean, !!!slide_args, + .new_col_names = "slide_value" + )) ) expect_equal( out_mean, - rlang::inject(epi_slide_opt(test_data, value, .f = slider::slide_mean, !!!slide_args)) + rlang::inject(epi_slide_opt(test_data, value, + .f = slider::slide_mean, !!!slide_args, + .new_col_names = "slide_value" + )) ) expect_equal( out_mean, - rlang::inject(epi_slide_mean(test_data, value, !!!slide_args)) + rlang::inject(epi_slide_mean(test_data, value, !!!slide_args, + .new_col_names = "slide_value" + )) ) } ) @@ -730,7 +744,7 @@ test_that("no dplyr warnings from selecting multiple columns", { ) expect_equal( names(multi_slid), - c("geo_value", "time_value", "value", "value2", "slide_value_value", "slide_value_value2") + c("geo_value", "time_value", "value", "value2", "value_7dav", "value2_7dav") ) expect_no_warning( multi_slid_select <- epi_slide_mean(multi_columns, c(value, value2), .window_size = 7) @@ -741,3 +755,122 @@ test_that("no dplyr warnings from selecting multiple columns", { ) expect_equal(multi_slid_select, multi_slid) }) + +test_that("epi_slide_opt output naming features", { + multi_columns <- dplyr::bind_rows( + dplyr::tibble(geo_value = "ak", time_value = test_date + 1:200, value = 1:200, value2 = -1:-200), + dplyr::tibble(geo_value = "al", time_value = test_date + 1:5, value = -(1:5), value2 = 1:5) + ) %>% + as_epi_df() %>% + group_by(geo_value) + multi_columns_weekly <- dplyr::bind_rows( + dplyr::tibble(geo_value = "ak", time_value = test_date + 7 * (1:200), value = 1:200, value2 = -1:-200), + dplyr::tibble(geo_value = "al", 
time_value = test_date + 7 * (1:5), value = -(1:5), value2 = 1:5) + ) %>% + as_epi_df() %>% + group_by(geo_value) + yearmonthly <- + tibble::tibble( + geo_value = 1, + time_value = tsibble::make_yearmonth(2000, 1) + 1:30 - 1, + value = 1:30 %% 2 == 0 + ) %>% + as_epi_df() %>% + group_by(geo_value) + + # Auto-naming: + # * Changing .f and .window_size: + expect_equal( + multi_columns %>% epi_slide_opt(value2, frollmean, .window_size = 14) %>% names(), + c(names(multi_columns), "value2_14dav") + ) + expect_equal( + multi_columns %>% epi_slide_opt(value2, slide_mean, .window_size = as.difftime(14, units = "days")) %>% names(), + c(names(multi_columns), "value2_14dav") + ) + expect_equal( + multi_columns %>% epi_slide_opt(value2, slide_sum, .window_size = Inf) %>% names(), + c(names(multi_columns), "value2_running_sum") + ) + # * Changing .f and .align: + expect_equal( + multi_columns %>% epi_slide_opt(value2, slide_min, .window_size = 14, .align = "center") %>% names(), + c(names(multi_columns), "value2_14dcmin") + ) + expect_equal( + multi_columns %>% epi_slide_opt(value2, slide_max, .window_size = 14, .align = "left") %>% names(), + c(names(multi_columns), "value2_14dlmax") + ) + # * Changing .f, time_type(, .window_size): + expect_equal( + multi_columns_weekly %>% + epi_slide_opt(value2, slide_prod, .window_size = as.difftime(2, units = "weeks")) %>% + names(), + c(names(multi_columns_weekly), "value2_2wprod") + ) + expect_equal( + yearmonthly %>% epi_slide_opt(value, slide_any, .window_size = 3) %>% names(), + c(names(yearmonthly), "value_3many") # not the best name, but super unlikely anyway + ) + + # Manual naming: + expect_equal( + multi_columns %>% + epi_slide_opt(starts_with("value"), slide_sum, .window_size = 7, .suffix = "_s{.n}") %>% + names(), + c(names(multi_columns), "value_s7", "value2_s7") + ) + expect_equal( + multi_columns %>% + epi_slide_opt(starts_with("value"), slide_sum, .window_size = 7, .prefix = "{.f_abbr}_", .suffix = "_{.n}") %>% + names(), + c(names(multi_columns), "sum_value_7", "sum_value2_7") + ) + expect_equal( + multi_columns %>% + epi_slide_opt(starts_with("value"), slide_sum, .window_size = 7, .prefix = "slide_value_") %>% + names(), + c(names(multi_columns), "slide_value_value", "slide_value_value2") + ) + expect_equal( + multi_columns %>% + epi_slide_opt(starts_with("value"), slide_sum, .window_size = 7, .new_col_names = c("slide_value", "sv2")) %>% + names(), + c(names(multi_columns), "slide_value", "sv2") + ) + + # Validation errors: + # * Wrong sizes: + expect_error( + multi_columns %>% epi_slide_opt(starts_with("value"), slide_sum, + .window_size = 7, + .suffix = c("a", "b") + ) + ) + expect_error( + multi_columns %>% epi_slide_opt(starts_with("value"), slide_sum, + .window_size = 7, + .new_col_names = "slide_value" + ) + ) + expect_error( + multi_columns %>% epi_slide_mean(starts_with("value"), .window_size = 7, .new_col_names = "output") + ) + # * Incompatible args: + expect_error( + multi_columns %>% epi_slide_opt(value, slide_sum, + .window_size = 7, + .prefix = "a", .suffix = "b", .new_col_names = "slide_value" + ), + class = "epiprocess__epi_slide_opt_incompatible_naming_args" + ) + # * Bad resulting output names: + expect_error( + multi_columns %>% epi_slide_mean(value, .window_size = 7, .new_col_names = "value"), + class = "epiprocess__epi_slide_opt_old_new_name_conflict" + ) + expect_error( + multi_columns %>% epi_slide_mean(value:value2, .window_size = 7, .new_col_names = c("output", "output")), + class = 
"epiprocess__epi_slide_opt_new_name_duplicated" + ) +}) From 79ed2f399f82669ed17ab2a0f3a5a7047f6777dc Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Tue, 12 Nov 2024 11:38:41 -0800 Subject: [PATCH 08/39] Use different automatic names for slides on logical columns --- R/slide.R | 34 +++++++++++++++++---------------- R/utils.R | 5 +++++ man/epi_slide_opt.Rd | 3 ++- tests/testthat/test-epi_slide.R | 25 ++++++++++++++++++++++++ 4 files changed, 50 insertions(+), 17 deletions(-) diff --git a/R/slide.R b/R/slide.R index 9d383446..96beb828 100644 --- a/R/slide.R +++ b/R/slide.R @@ -587,7 +587,8 @@ get_before_after_from_window <- function(window_size, align, time_type) { #' `time_type` of `.x` #' - `{.align_abbr}` will be `""` if `.align` is the default of `"right"`; #' otherwise, it will be the first letter of `.align` -#' - `{.f_abbr}` will be a short string based on what `.f` +#' - `{.f_abbr}` will be a character vector containing a short abbreviation +#' for `.f` factoring in the input column type(s) for `.col_names` #' #' @importFrom dplyr bind_rows mutate %>% arrange tibble select all_of #' @importFrom rlang enquo expr_label caller_arg quo_get_env @@ -681,22 +682,23 @@ epi_slide_opt <- function( col_names_chr <- names(.x)[pos] # Check that slide function `.f` is one of those short-listed from - # `data.table` and `slider` (or a function that has the exact same - # definition, e.g. if the function has been reexported or defined - # locally). + # `data.table` and `slider` (or a function that has the exact same definition, + # e.g. if the function has been reexported or defined locally). Extract some + # metadata. `namer` will be mapped over columns (.x will be a column, not the + # entire edf). f_possibilities <- tibble::tribble( - ~f, ~package, ~abbr, - frollmean, "data.table", "av", - frollsum, "data.table", "sum", - frollapply, "data.table", "slide", - slide_sum, "slider", "sum", - slide_prod, "slider", "prod", - slide_mean, "slider", "av", - slide_min, "slider", "min", - slide_max, "slider", "max", - slide_all, "slider", "all", - slide_any, "slider", "any", + ~f, ~package, ~namer, + frollmean, "data.table", ~ if (is.logical(.x)) "prop" else "av", + frollsum, "data.table", ~ if (is.logical(.x)) "count" else "sum", + frollapply, "data.table", ~"slide", + slide_sum, "slider", ~ if (is.logical(.x)) "count" else "sum", + slide_prod, "slider", ~"prod", + slide_mean, "slider", ~ if (is.logical(.x)) "prop" else "av", + slide_min, "slider", ~"min", + slide_max, "slider", ~"max", + slide_all, "slider", ~"all", + slide_any, "slider", ~"any", ) f_info <- f_possibilities %>% filter(map_lgl(.data$f, ~ identical(.f, .x))) @@ -780,7 +782,7 @@ epi_slide_opt <- function( .n = n, .time_unit_abbr = time_unit_abbr, .align_abbr = align_abbr, - .f_abbr = f_info$abbr, + .f_abbr = purrr::map_chr(.x[col_names_chr], unwrap(f_info$namer)), quo_get_env(col_names_quo) ) .new_col_names <- unclass( diff --git a/R/utils.R b/R/utils.R index 06876f08..6625e5c3 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1174,3 +1174,8 @@ time_type_unit_abbr <- function(time_type) { } maybe_unit_abbr } + +unwrap <- function(x) { + checkmate::assert_list(x, len = 1L, names = "unnamed") + x[[1L]] +} diff --git a/man/epi_slide_opt.Rd b/man/epi_slide_opt.Rd index 7f250cf4..c4526c0e 100644 --- a/man/epi_slide_opt.Rd +++ b/man/epi_slide_opt.Rd @@ -155,7 +155,8 @@ corresponding to the \code{.window_size}. 
\code{time_type} of \code{.x} \item \code{{.align_abbr}} will be \code{""} if \code{.align} is the default of \code{"right"}; otherwise, it will be the first letter of \code{.align} -\item \code{{.f_abbr}} will be a short string based on what \code{.f} +\item \code{{.f_abbr}} will be a character vector containing a short abbreviation +for \code{.f} factoring in the input column type(s) for \code{.col_names} } } diff --git a/tests/testthat/test-epi_slide.R b/tests/testthat/test-epi_slide.R index e8c318c7..2cb04eec 100644 --- a/tests/testthat/test-epi_slide.R +++ b/tests/testthat/test-epi_slide.R @@ -812,8 +812,24 @@ test_that("epi_slide_opt output naming features", { yearmonthly %>% epi_slide_opt(value, slide_any, .window_size = 3) %>% names(), c(names(yearmonthly), "value_3many") # not the best name, but super unlikely anyway ) + # * Through forwarding functions: + expect_equal( + # XXX perhaps this should be an auto-naming feature? + yearmonthly %>% + epi_slide_mean(value, .window_size = Inf) %>% + names(), + c(names(yearmonthly), "value_running_prop") + ) + expect_equal( + # XXX perhaps this should be an auto-naming feature? + yearmonthly %>% + epi_slide_sum(value, .window_size = Inf) %>% + names(), + c(names(yearmonthly), "value_running_count") + ) # Manual naming: + # * Various combinations of args: expect_equal( multi_columns %>% epi_slide_opt(starts_with("value"), slide_sum, .window_size = 7, .suffix = "_s{.n}") %>% @@ -838,6 +854,15 @@ test_that("epi_slide_opt output naming features", { names(), c(names(multi_columns), "slide_value", "sv2") ) + # * Through forwarding functions: + expect_equal( + yearmonthly %>% epi_slide_mean(value, .window_size = Inf, .suffix = "_{.f_abbr}") %>% names(), + c(names(yearmonthly), "value_prop") + ) + expect_equal( + yearmonthly %>% epi_slide_sum(value, .window_size = Inf, .suffix = "_{.f_abbr}") %>% names(), + c(names(yearmonthly), "value_count") + ) # Validation errors: # * Wrong sizes: From 97d17e8859588c81ae9ca662186e93dad5472ccd Mon Sep 17 00:00:00 2001 From: "Daniel J. 
McDonald" Date: Wed, 13 Nov 2024 08:02:50 -0800 Subject: [PATCH 09/39] back to our custom workflow, no color handling --- .github/workflows/pkgdown.yaml | 17 ++++++++++++++++- _pkgdown.yml | 2 +- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index 3a1ff927..e9bfcc71 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -42,7 +42,22 @@ jobs: needs: website - name: Build site - run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) +# our versioning system+dev branch doesn't match the requirements for + # develop mode = auto + run: | + target_ref <- "${{ github.event_name == 'pull_request' && github.base_ref || github.ref }}" + override <- if (target_ref == "main" || target_ref == "refs/heads/main") { + list(development = list(mode = "release")) + } else if (target_ref == "dev" || target_ref == "refs/heads/dev") { + list(development = list(mode = "devel")) + } else { + stop("Unexpected target_ref: ", target_ref) + } + pkg <- pkgdown::as_pkgdown(".", override = override) + cli::cli_rule("Cleaning files from old site...") + pkgdown::clean_site(pkg) + pkgdown::build_site(pkg, preview = FALSE, install = FALSE, new_process = FALSE) + pkgdown:::build_github_pages(pkg) shell: Rscript {0} - name: Deploy to GitHub pages ๐Ÿš€ diff --git a/_pkgdown.yml b/_pkgdown.yml index 743adc4b..a7d700e1 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -1,5 +1,5 @@ development: - mode: auto + mode: devel template: package: delphidocs From f858ff9a5ced4b0db8e7740621b061baa71af6ab Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Wed, 13 Nov 2024 08:03:58 -0800 Subject: [PATCH 10/39] rm spaces --- .github/workflows/pkgdown.yaml | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index e9bfcc71..19542ace 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -45,19 +45,19 @@ jobs: # our versioning system+dev branch doesn't match the requirements for # develop mode = auto run: | - target_ref <- "${{ github.event_name == 'pull_request' && github.base_ref || github.ref }}" - override <- if (target_ref == "main" || target_ref == "refs/heads/main") { - list(development = list(mode = "release")) - } else if (target_ref == "dev" || target_ref == "refs/heads/dev") { - list(development = list(mode = "devel")) - } else { - stop("Unexpected target_ref: ", target_ref) - } - pkg <- pkgdown::as_pkgdown(".", override = override) - cli::cli_rule("Cleaning files from old site...") - pkgdown::clean_site(pkg) - pkgdown::build_site(pkg, preview = FALSE, install = FALSE, new_process = FALSE) - pkgdown:::build_github_pages(pkg) + target_ref <- "${{ github.event_name == 'pull_request' && github.base_ref || github.ref }}" + override <- if (target_ref == "main" || target_ref == "refs/heads/main") { + list(development = list(mode = "release")) + } else if (target_ref == "dev" || target_ref == "refs/heads/dev") { + list(development = list(mode = "devel")) + } else { + stop("Unexpected target_ref: ", target_ref) + } + pkg <- pkgdown::as_pkgdown(".", override = override) + cli::cli_rule("Cleaning files from old site...") + pkgdown::clean_site(pkg) + pkgdown::build_site(pkg, preview = FALSE, install = FALSE, new_process = FALSE) + pkgdown:::build_github_pages(pkg) shell: Rscript {0} - name: Deploy to GitHub pages ๐Ÿš€ From 2eaa5eb8b6dd2f7269e1187692010dad3fff3c08 Mon Sep 17 
00:00:00 2001 From: "Daniel J. McDonald" Date: Wed, 13 Nov 2024 08:08:11 -0800 Subject: [PATCH 11/39] delphidocs in description --- DESCRIPTION | 2 ++ 1 file changed, 2 insertions(+) diff --git a/DESCRIPTION b/DESCRIPTION index fd09aa57..8917f800 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -72,10 +72,12 @@ Suggests: VignetteBuilder: knitr Remotes: + cmu-delphi/delphidocs, cmu-delphi/epidatasets, cmu-delphi/epidatr, glmgen/genlasso, reconverse/outbreaks +Config/Needs/website: cmu-delphi/delphidocs Config/testthat/edition: 3 Config/testthat/parallel: true Copyright: file inst/COPYRIGHTS From 4970ff4a7809ecc309e485e48677bbcc1d1f3f93 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 13 Nov 2024 16:28:03 -0800 Subject: [PATCH 12/39] style(.../workflows/pkgdown.yaml): indentation --- .github/workflows/pkgdown.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index 19542ace..1381682d 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -42,7 +42,7 @@ jobs: needs: website - name: Build site -# our versioning system+dev branch doesn't match the requirements for + # our versioning system+dev branch doesn't match the requirements for # develop mode = auto run: | target_ref <- "${{ github.event_name == 'pull_request' && github.base_ref || github.ref }}" From 05a0ca8900187068b13d744d66a55d0128f99691 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Thu, 14 Nov 2024 11:40:12 -0800 Subject: [PATCH 13/39] Update epi_slide_{sum,mean} examples w/ naming options, cleanup - Show naming options, including with multi-column selections when we have accommodating example data sets - Select away the pre-existing 7d aggregations in the example data set - Ungroup output --- R/slide.R | 51 ++++++++++++++++++++++++++++++++++++-------- man/epi_slide.Rd | 1 + man/epi_slide_opt.Rd | 50 +++++++++++++++++++++++++++++++++++-------- 3 files changed, 84 insertions(+), 18 deletions(-) diff --git a/R/slide.R b/R/slide.R index 96beb828..5a6d8264 100644 --- a/R/slide.R +++ b/R/slide.R @@ -114,6 +114,7 @@ #' }, #' .window_size = 7 #' ) %>% +#' ungroup() %>% #' dplyr::select(geo_value, time_value, cases, cases_7sd, cases_7dav) #' #' # Use the geo_value or the ref_time_value in the slide computation @@ -605,7 +606,8 @@ get_before_after_from_window <- function(window_size, align, time_type) { #' # Compute a 7-day trailing average on cases. #' cases_deaths_subset %>% #' group_by(geo_value) %>% -#' epi_slide_opt(cases, .f = data.table::frollmean, .window_size = 7) +#' epi_slide_opt(cases, .f = data.table::frollmean, .window_size = 7) %>% +#' ungroup() #' #' # Same as above, but adjust `frollmean` settings for speed, accuracy, and #' # to allow partially-missing windows. @@ -615,7 +617,8 @@ get_before_after_from_window <- function(window_size, align, time_type) { #' cases, #' .f = data.table::frollmean, .window_size = 7, #' algo = "exact", hasNA = TRUE, na.rm = TRUE -#' ) +#' ) %>% +#' ungroup() epi_slide_opt <- function( .x, .col_names, .f, ..., .window_size = NULL, .align = c("right", "center", "left"), @@ -919,20 +922,36 @@ epi_slide_opt <- function( #' #' @export #' @examples -#' # Compute a 7-day trailing average on cases. -#' cases_deaths_subset %>% +#' # Compute a 7-day trailing average of case rates. 
+#' covid_case_death_rates_extended %>% #' group_by(geo_value) %>% -#' epi_slide_mean(cases, .window_size = 7) +#' epi_slide_mean(case_rate, .window_size = 7) %>% +#' ungroup() #' #' # Same as above, but adjust `frollmean` settings for speed, accuracy, and #' # to allow partially-missing windows. -#' cases_deaths_subset %>% +#' covid_case_death_rates_extended %>% #' group_by(geo_value) %>% #' epi_slide_mean( -#' cases, +#' case_rate, #' .window_size = 7, #' na.rm = TRUE, algo = "exact", hasNA = TRUE -#' ) +#' ) %>% +#' ungroup() +#' +#' # Compute a 7-day trailing average of case rates and death rates, with custom +#' # output column names: +#' covid_case_death_rates_extended %>% +#' group_by(geo_value) %>% +#' epi_slide_mean(c(case_rate, death_rate), +#' .window_size = 7, +#' .new_col_names = c("smoothed_case_rate", "smoothed_death_rate") +#' ) %>% +#' ungroup() +#' covid_case_death_rates_extended %>% +#' group_by(geo_value) %>% +#' epi_slide_mean(c(case_rate, death_rate), .window_size = 7, .suffix = "_{.n}{.time_unit_abbr}_avg") %>% +#' ungroup() epi_slide_mean <- function( .x, .col_names, ..., .window_size = NULL, .align = c("right", "center", "left"), @@ -995,8 +1014,22 @@ epi_slide_mean <- function( #' @examples #' # Compute a 7-day trailing sum on cases. #' cases_deaths_subset %>% +#' select(geo_value, time_value, cases) %>% +#' group_by(geo_value) %>% +#' epi_slide_sum(cases, .window_size = 7) %>% +#' ungroup() +#' +#' # Specify output column names and/or naming scheme: +#' cases_deaths_subset %>% +#' select(geo_value, time_value, cases) %>% +#' group_by(geo_value) %>% +#' epi_slide_sum(cases, .window_size = 7, .new_col_names = "case_sum") %>% +#' ungroup() +#' cases_deaths_subset %>% +#' select(geo_value, time_value, cases) %>% #' group_by(geo_value) %>% -#' epi_slide_sum(cases, .window_size = 7) +#' epi_slide_sum(cases, .window_size = 7, .prefix = "sum_") %>% +#' ungroup() epi_slide_sum <- function( .x, .col_names, ..., .window_size = NULL, .align = c("right", "center", "left"), diff --git a/man/epi_slide.Rd b/man/epi_slide.Rd index 1c399d35..511ba8a8 100644 --- a/man/epi_slide.Rd +++ b/man/epi_slide.Rd @@ -172,6 +172,7 @@ cases_deaths_subset \%>\% }, .window_size = 7 ) \%>\% + ungroup() \%>\% dplyr::select(geo_value, time_value, cases, cases_7sd, cases_7dav) # Use the geo_value or the ref_time_value in the slide computation diff --git a/man/epi_slide_opt.Rd b/man/epi_slide_opt.Rd index c4526c0e..c014c2d1 100644 --- a/man/epi_slide_opt.Rd +++ b/man/epi_slide_opt.Rd @@ -164,7 +164,8 @@ for \code{.f} factoring in the input column type(s) for \code{.col_names} # Compute a 7-day trailing average on cases. cases_deaths_subset \%>\% group_by(geo_value) \%>\% - epi_slide_opt(cases, .f = data.table::frollmean, .window_size = 7) + epi_slide_opt(cases, .f = data.table::frollmean, .window_size = 7) \%>\% + ungroup() # Same as above, but adjust `frollmean` settings for speed, accuracy, and # to allow partially-missing windows. @@ -174,25 +175,56 @@ cases_deaths_subset \%>\% cases, .f = data.table::frollmean, .window_size = 7, algo = "exact", hasNA = TRUE, na.rm = TRUE - ) -# Compute a 7-day trailing average on cases. -cases_deaths_subset \%>\% + ) \%>\% + ungroup() +# Compute a 7-day trailing average of case rates. +covid_case_death_rates_extended \%>\% group_by(geo_value) \%>\% - epi_slide_mean(cases, .window_size = 7) + epi_slide_mean(case_rate, .window_size = 7) \%>\% + ungroup() # Same as above, but adjust `frollmean` settings for speed, accuracy, and # to allow partially-missing windows. 
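[Editor's sketch, not part of the patch hunks: a rough illustration of how the naming arguments exercised in the examples above resolve. It assumes daily data such as covid_case_death_rates_extended and the default right alignment; the default column names are assembled from the {.n}, {.time_unit_abbr}, {.align_abbr}, and {.f_abbr} pieces described in the documentation earlier in this series, so the exact strings shown in the comments are expectations, not guaranteed output.]

library(dplyr)
library(epiprocess)
# Default auto-naming: window size + time-unit abbreviation + alignment
# abbreviation + function abbreviation, e.g. something like "case_rate_7dav".
covid_case_death_rates_extended %>%
  group_by(geo_value) %>%
  epi_slide_mean(case_rate, .window_size = 7) %>%
  ungroup() %>%
  names()
# Custom naming via a glue-style suffix, e.g. expected "case_rate_7d_avg":
covid_case_death_rates_extended %>%
  group_by(geo_value) %>%
  epi_slide_mean(case_rate, .window_size = 7, .suffix = "_{.n}{.time_unit_abbr}_avg") %>%
  ungroup() %>%
  names()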
-cases_deaths_subset \%>\% +covid_case_death_rates_extended \%>\% group_by(geo_value) \%>\% epi_slide_mean( - cases, + case_rate, .window_size = 7, na.rm = TRUE, algo = "exact", hasNA = TRUE - ) + ) \%>\% + ungroup() + +# Compute a 7-day trailing average of case rates and death rates, with custom +# output column names: +covid_case_death_rates_extended \%>\% + group_by(geo_value) \%>\% + epi_slide_mean(c(case_rate, death_rate), + .window_size = 7, + .new_col_names = c("smoothed_case_rate", "smoothed_death_rate") + ) \%>\% + ungroup() +covid_case_death_rates_extended \%>\% + group_by(geo_value) \%>\% + epi_slide_mean(c(case_rate, death_rate), .window_size = 7, .suffix = "_{.n}{.time_unit_abbr}_avg") \%>\% + ungroup() # Compute a 7-day trailing sum on cases. cases_deaths_subset \%>\% + select(geo_value, time_value, cases) \%>\% + group_by(geo_value) \%>\% + epi_slide_sum(cases, .window_size = 7) \%>\% + ungroup() + +# Specify output column names and/or naming scheme: +cases_deaths_subset \%>\% + select(geo_value, time_value, cases) \%>\% + group_by(geo_value) \%>\% + epi_slide_sum(cases, .window_size = 7, .new_col_names = "case_sum") \%>\% + ungroup() +cases_deaths_subset \%>\% + select(geo_value, time_value, cases) \%>\% group_by(geo_value) \%>\% - epi_slide_sum(cases, .window_size = 7) + epi_slide_sum(cases, .window_size = 7, .prefix = "sum_") \%>\% + ungroup() } \seealso{ \code{\link{epi_slide}} for the more general slide function From 5ffaf2f6b9285a4b8b570fe9b69243c5ebb623ac Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Thu, 14 Nov 2024 12:51:00 -0800 Subject: [PATCH 14/39] Tweak&fix epi_slide docs regarding packing, nesting --- R/slide.R | 55 +++++++++++++++++++++++++++++++++----------- man/epi_slide.Rd | 55 +++++++++++++++++++++++++++++++++----------- man/epi_slide_opt.Rd | 6 ++--- 3 files changed, 86 insertions(+), 30 deletions(-) diff --git a/R/slide.R b/R/slide.R index 5a6d8264..ba5f3f3a 100644 --- a/R/slide.R +++ b/R/slide.R @@ -70,6 +70,8 @@ #' @export #' @seealso [`epi_slide_opt`] for optimized slide functions #' @examples +#' library(dplyr) +#' #' # Get the 7-day trailing standard deviation of cases and the 7-day trailing mean of cases #' cases_deaths_subset %>% #' epi_slide( @@ -77,45 +79,72 @@ #' cases_7dav = mean(cases, na.rm = TRUE), #' .window_size = 7 #' ) %>% -#' dplyr::select(geo_value, time_value, cases, cases_7sd, cases_7dav) +#' select(geo_value, time_value, cases, cases_7sd, cases_7dav) +#' # Note that epi_slide_mean could be used to more quickly calculate cases_7dav. +#' +#' # In addition to the [`dplyr::mutate`]-like syntax, you can feed in a function or +#' # formula in a way similar to [`dplyr::group_modify`]: +#' my_summarizer <- function(window_data) { +#' window_data %>% +#' summarize( +#' cases_7sd = sd(cases, na.rm = TRUE), +#' cases_7dav = mean(cases, na.rm = TRUE) +#' ) +#' } +#' cases_deaths_subset %>% +#' epi_slide( +#' ~ my_summarizer(.x), +#' .window_size = 7 +#' ) %>% +#' select(geo_value, time_value, cases, cases_7sd, cases_7dav) +#' #' -#' # The same as above, but unpacking using an unnamed data.frame with a formula +#' +#' +#' +#' #### Advanced: #### +#' +#' # The tidyverse supports ["packing"][tidyr::pack] multiple columns into a +#' # single tibble-type column contained within some larger tibble. 
Like dplyr, +#' # we normally don't pack output columns together, but will if you provide a +#' # name for a tibble-type output: #' cases_deaths_subset %>% #' epi_slide( -#' ~ data.frame( +#' slide_packed = tibble( #' cases_7sd = sd(.x$cases, na.rm = TRUE), #' cases_7dav = mean(.x$cases, na.rm = TRUE) #' ), #' .window_size = 7 #' ) %>% -#' dplyr::select(geo_value, time_value, cases, cases_7sd, cases_7dav) -#' -#' # The same as above, but packing using a named data.frame with a tidy evaluation -#' # expression +#' select(geo_value, time_value, cases, slide_packed) #' cases_deaths_subset %>% #' epi_slide( -#' slide_packed = data.frame( +#' ~ tibble( #' cases_7sd = sd(.x$cases, na.rm = TRUE), #' cases_7dav = mean(.x$cases, na.rm = TRUE) #' ), +#' .new_col_name = "slide_packed", #' .window_size = 7 #' ) %>% -#' dplyr::select(geo_value, time_value, cases, slide_packed) +#' select(geo_value, time_value, cases, slide_packed) #' -#' # nested new columns +#' # You can also get ["nested"][tidyr::nest] format by wrapping your results in +#' # a list: #' cases_deaths_subset %>% #' group_by(geo_value) %>% #' epi_slide( #' function(x, g, t) { -#' data.frame( +#' list(tibble( #' cases_7sd = sd(x$cases, na.rm = TRUE), #' cases_7dav = mean(x$cases, na.rm = TRUE) -#' ) +#' )) #' }, #' .window_size = 7 #' ) %>% #' ungroup() %>% -#' dplyr::select(geo_value, time_value, cases, cases_7sd, cases_7dav) +#' select(geo_value, time_value, slide_value) +#' +#' #' #' # Use the geo_value or the ref_time_value in the slide computation #' cases_deaths_subset %>% diff --git a/man/epi_slide.Rd b/man/epi_slide.Rd index 511ba8a8..c497b5d3 100644 --- a/man/epi_slide.Rd +++ b/man/epi_slide.Rd @@ -128,6 +128,8 @@ determined the time window for the current computation. } } \examples{ +library(dplyr) + # Get the 7-day trailing standard deviation of cases and the 7-day trailing mean of cases cases_deaths_subset \%>\% epi_slide( @@ -135,45 +137,72 @@ cases_deaths_subset \%>\% cases_7dav = mean(cases, na.rm = TRUE), .window_size = 7 ) \%>\% - dplyr::select(geo_value, time_value, cases, cases_7sd, cases_7dav) + select(geo_value, time_value, cases, cases_7sd, cases_7dav) +# Note that epi_slide_mean could be used to more quickly calculate cases_7dav. + +# In addition to the [`dplyr::mutate`]-like syntax, you can feed in a function or +# formula in a way similar to [`dplyr::group_modify`]: +my_summarizer <- function(window_data) { + window_data \%>\% + summarize( + cases_7sd = sd(cases, na.rm = TRUE), + cases_7dav = mean(cases, na.rm = TRUE) + ) +} +cases_deaths_subset \%>\% + epi_slide( + ~ my_summarizer(.x), + .window_size = 7 + ) \%>\% + select(geo_value, time_value, cases, cases_7sd, cases_7dav) + -# The same as above, but unpacking using an unnamed data.frame with a formula + + + +#### Advanced: #### + +# The tidyverse supports ["packing"][tidyr::pack] multiple columns into a +# single tibble-type column contained within some larger tibble. 
Like dplyr, +# we normally don't pack output columns together, but will if you provide a +# name for a tibble-type output: cases_deaths_subset \%>\% epi_slide( - ~ data.frame( + slide_packed = tibble( cases_7sd = sd(.x$cases, na.rm = TRUE), cases_7dav = mean(.x$cases, na.rm = TRUE) ), .window_size = 7 ) \%>\% - dplyr::select(geo_value, time_value, cases, cases_7sd, cases_7dav) - -# The same as above, but packing using a named data.frame with a tidy evaluation -# expression + select(geo_value, time_value, cases, slide_packed) cases_deaths_subset \%>\% epi_slide( - slide_packed = data.frame( + ~ tibble( cases_7sd = sd(.x$cases, na.rm = TRUE), cases_7dav = mean(.x$cases, na.rm = TRUE) ), + .new_col_name = "slide_packed", .window_size = 7 ) \%>\% - dplyr::select(geo_value, time_value, cases, slide_packed) + select(geo_value, time_value, cases, slide_packed) -# nested new columns +# You can also get ["nested"][tidyr::nest] format by wrapping your results in +# a list: cases_deaths_subset \%>\% group_by(geo_value) \%>\% epi_slide( function(x, g, t) { - data.frame( + list(tibble( cases_7sd = sd(x$cases, na.rm = TRUE), cases_7dav = mean(x$cases, na.rm = TRUE) - ) + )) }, .window_size = 7 ) \%>\% ungroup() \%>\% - dplyr::select(geo_value, time_value, cases, cases_7sd, cases_7dav) + select(geo_value, time_value, slide_value) + + # Use the geo_value or the ref_time_value in the slide computation cases_deaths_subset \%>\% diff --git a/man/epi_slide_opt.Rd b/man/epi_slide_opt.Rd index c014c2d1..e61e94cd 100644 --- a/man/epi_slide_opt.Rd +++ b/man/epi_slide_opt.Rd @@ -198,10 +198,8 @@ covid_case_death_rates_extended \%>\% # output column names: covid_case_death_rates_extended \%>\% group_by(geo_value) \%>\% - epi_slide_mean(c(case_rate, death_rate), - .window_size = 7, - .new_col_names = c("smoothed_case_rate", "smoothed_death_rate") - ) \%>\% + epi_slide_mean(c(case_rate, death_rate), .window_size = 7, + .new_col_names = c("smoothed_case_rate", "smoothed_death_rate")) \%>\% ungroup() covid_case_death_rates_extended \%>\% group_by(geo_value) \%>\% From e099fd0cc39b276844717804ae67caa4b9593abe Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Thu, 14 Nov 2024 13:28:28 -0800 Subject: [PATCH 15/39] Make epi_slide* all use autogrouping + make autogrouping temporary --- R/slide.R | 59 ++++++++++++++++---------------- man-roxygen/basic-slide-params.R | 4 +-- man/epi_slide.Rd | 4 +-- man/epi_slide_opt.Rd | 10 +++--- tests/testthat/test-epi_slide.R | 38 ++++++++++++++++++++ 5 files changed, 78 insertions(+), 37 deletions(-) diff --git a/R/slide.R b/R/slide.R index ba5f3f3a..d94dd01a 100644 --- a/R/slide.R +++ b/R/slide.R @@ -186,15 +186,16 @@ epi_slide <- function( # Validate arguments assert_class(.x, "epi_df") - if (checkmate::test_class(.x, "grouped_df")) { + .x_orig_groups <- groups(.x) + if (inherits(.x, "grouped_df")) { expected_group_keys <- .x %>% key_colnames(exclude = "time_value") %>% sort() if (!identical(.x %>% group_vars() %>% sort(), expected_group_keys)) { cli_abort( - "epi_slide: `.x` must be either grouped by {expected_group_keys}. (Or you can just ungroup - `.x` and we'll do this grouping automatically.) You may need to aggregate your data first, - see aggregate_epi_df().", + "`.x` must be either grouped by {expected_group_keys} or ungrouped; if the latter, + we'll temporarily group by {expected_group_keys} for this operation. 
You may need + to aggregate your data first, see aggregate_epi_df().", class = "epiprocess__epi_slide__invalid_grouping" ) } @@ -300,7 +301,6 @@ epi_slide <- function( # `epi_slide_one_group`. # - `...` from top of `epi_slide` are forwarded to `.f` here through # group_modify and through the lambda. - .x_groups <- groups(.x) result <- group_map( .x, .f = function(.data_group, .group_key, ...) { @@ -324,7 +324,7 @@ epi_slide <- function( filter(.real) %>% select(-.real) %>% arrange_col_canonical() %>% - group_by(!!!.x_groups) + group_by(!!!.x_orig_groups) # If every group in epi_slide_one_group takes the # length(available_ref_time_values) == 0 branch then we end up here. @@ -691,12 +691,30 @@ epi_slide_opt <- function( ) } + assert_class(.x, "epi_df") + .x_orig_groups <- groups(.x) + if (inherits(.x, "grouped_df")) { + expected_group_keys <- .x %>% + key_colnames(exclude = "time_value") %>% + sort() + if (!identical(.x %>% group_vars() %>% sort(), expected_group_keys)) { + cli_abort( + "`.x` must be either grouped by {expected_group_keys} or ungrouped; if the latter, + we'll temporarily group by {expected_group_keys} for this operation. You may need + to aggregate your data first, see aggregate_epi_df().", + class = "epiprocess__epi_slide__invalid_grouping" + ) + } + } else { + .x <- group_epi_df(.x, exclude = "time_value") + } if (nrow(.x) == 0L) { cli_abort( c( "input data `.x` unexpectedly has 0 rows", "i" = "If this computation is occuring within an `epix_slide` call, - check that `epix_slide` `.versions` argument was set appropriately" + check that `epix_slide` `.versions` argument was set appropriately + so that you don't get any completely-empty snapshots" ), class = "epiprocess__epi_slide_opt__0_row_input", epiprocess__x = .x @@ -857,27 +875,9 @@ epi_slide_opt <- function( arrange(.data$time_value) if (f_from_package == "data.table") { - # If a group contains duplicate time values, `frollmean` will still only - # use the last `k` obs. It isn't looking at dates, it just goes in row - # order. So if the computation is aggregating across multiple obs for the - # same date, `epi_slide_opt` and derivates will produce incorrect results; - # `epi_slide` should be used instead. - if (anyDuplicated(.data_group$time_value) != 0L) { - cli_abort( - c( - "group contains duplicate time values. Using `epi_slide_[opt/mean/sum]` on this - group will result in incorrect results", - "i" = "Please change the grouping structure of the input data so that - each group has non-duplicate time values (e.g. `x %>% group_by(geo_value) - %>% epi_slide_opt(.f = frollmean)`)", - "i" = "Use `epi_slide` to aggregate across groups" - ), - class = "epiprocess__epi_slide_opt__duplicate_time_values", - epiprocess__data_group = .data_group, - epiprocess__group_key = .group_key - ) - } - + # Grouping should ensure that we don't have duplicate time values. + # Completion above should ensure we have at least .window_size rows. 
Check + # that we don't have more than .window_size rows (or fewer somehow): if (nrow(.data_group) != length(c(all_dates, pad_early_dates, pad_late_dates))) { cli_abort( c( @@ -928,7 +928,8 @@ epi_slide_opt <- function( group_modify(slide_one_grp, ..., .keep = FALSE) %>% filter(.data$.real) %>% select(-.real) %>% - arrange_col_canonical() + arrange_col_canonical() %>% + group_by(!!!.x_orig_groups) if (.all_rows) { result[!(result$time_value %in% ref_time_values), result_col_names] <- NA diff --git a/man-roxygen/basic-slide-params.R b/man-roxygen/basic-slide-params.R index 638307d6..8ccd35f9 100644 --- a/man-roxygen/basic-slide-params.R +++ b/man-roxygen/basic-slide-params.R @@ -1,5 +1,5 @@ -#' @param .x An `epi_df` object. If ungrouped, we group by `geo_value` and any -#' columns in `other_keys`. If grouped, we make sure the grouping is by +#' @param .x An `epi_df` object. If ungrouped, we temporarily group by `geo_value` +#' and any columns in `other_keys`. If grouped, we make sure the grouping is by #' `geo_value` and `other_keys`. #' @param .window_size The size of the sliding window. The accepted values #' depend on the type of the `time_value` column in `.x`: diff --git a/man/epi_slide.Rd b/man/epi_slide.Rd index c497b5d3..9f4abd36 100644 --- a/man/epi_slide.Rd +++ b/man/epi_slide.Rd @@ -16,8 +16,8 @@ epi_slide( ) } \arguments{ -\item{.x}{An \code{epi_df} object. If ungrouped, we group by \code{geo_value} and any -columns in \code{other_keys}. If grouped, we make sure the grouping is by +\item{.x}{An \code{epi_df} object. If ungrouped, we temporarily group by \code{geo_value} +and any columns in \code{other_keys}. If grouped, we make sure the grouping is by \code{geo_value} and \code{other_keys}.} \item{.f}{Function, formula, or missing; together with \code{...} specifies the diff --git a/man/epi_slide_opt.Rd b/man/epi_slide_opt.Rd index e61e94cd..13b03a1b 100644 --- a/man/epi_slide_opt.Rd +++ b/man/epi_slide_opt.Rd @@ -47,8 +47,8 @@ epi_slide_sum( ) } \arguments{ -\item{.x}{An \code{epi_df} object. If ungrouped, we group by \code{geo_value} and any -columns in \code{other_keys}. If grouped, we make sure the grouping is by +\item{.x}{An \code{epi_df} object. If ungrouped, we temporarily group by \code{geo_value} +and any columns in \code{other_keys}. 
If grouped, we make sure the grouping is by \code{geo_value} and \code{other_keys}.} \item{.col_names}{<\code{\link[=dplyr_tidy_select]{tidy-select}}> An unquoted column @@ -198,8 +198,10 @@ covid_case_death_rates_extended \%>\% # output column names: covid_case_death_rates_extended \%>\% group_by(geo_value) \%>\% - epi_slide_mean(c(case_rate, death_rate), .window_size = 7, - .new_col_names = c("smoothed_case_rate", "smoothed_death_rate")) \%>\% + epi_slide_mean(c(case_rate, death_rate), + .window_size = 7, + .new_col_names = c("smoothed_case_rate", "smoothed_death_rate") + ) \%>\% ungroup() covid_case_death_rates_extended \%>\% group_by(geo_value) \%>\% diff --git a/tests/testthat/test-epi_slide.R b/tests/testthat/test-epi_slide.R index 2cb04eec..0aa4aca7 100644 --- a/tests/testthat/test-epi_slide.R +++ b/tests/testthat/test-epi_slide.R @@ -899,3 +899,41 @@ test_that("epi_slide_opt output naming features", { class = "epiprocess__epi_slide_opt_new_name_duplicated" ) }) + +test_that("epi_slide* output grouping matches input grouping", { + toy_edf <- as_epi_df(bind_rows(list( + tibble(geo_value = 1, age_group = 1, time_value = as.Date("2020-01-01") + 1:10 - 1, value = 1:10), + tibble(geo_value = 1, age_group = 2, time_value = as.Date("2020-01-01") + 1:10 - 1, value = 20:11), + tibble(geo_value = 2, age_group = 2, time_value = as.Date("2020-01-01") + 1:10 - 1, value = 31:40) + )), other_keys = "age_group", as_of = as.Date("2020-01-01") + 20) + + # Preserving existing grouping: + expect_equal( + toy_edf %>% + group_by(age_group, geo_value) %>% + epi_slide(value_7dsum = sum(value), .window_size = 7) %>% + group_vars(), + c("age_group", "geo_value") + ) + expect_equal( + toy_edf %>% + group_by(age_group, geo_value) %>% + epi_slide_sum(value, .window_size = 7) %>% + group_vars(), + c("age_group", "geo_value") + ) + + # Removing automatic grouping: + expect_equal( + toy_edf %>% + epi_slide(value_7dsum = sum(value), .window_size = 7) %>% + group_vars(), + character(0) + ) + expect_equal( + toy_edf %>% + epi_slide_sum(value, .window_size = 7) %>% + group_vars(), + character(0) + ) +}) From 6d8bcf6c934f97564134217af6f2a417d4455ad8 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Thu, 14 Nov 2024 13:41:47 -0800 Subject: [PATCH 16/39] Update NEWS.md with slide naming & autogrouping changes, bugfix --- NEWS.md | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/NEWS.md b/NEWS.md index 8ab77ee4..436b0c7d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -14,12 +14,10 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat with `covid`. The data set previously named `jhu_confirmed_cumulative_num` has been removed from the package, but a renamed version is has been removed from the package, but a renamed version is still available in `epidatasets`. - -## Bug fixes - -- Removed `.window_size = 1` default from `epi_slide_{mean,sum,opt}`; this - argument is now mandatory, and should nearly always be greater than 1 except - for testing purposes. +- `epi_slide_{sum,mean,opt}` have improved default output column names, and + additional arguments for specifying names: `.prefix`, `.suffix`, + `.new_col_names`. To obtain the old naming behavior, use `.prefix = + "slide_value_"`. ## Improvements @@ -29,6 +27,18 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat - Improved validation of `.window_size` arguments. - Rewrote a lot of the package documentation to be more consistent and informative. 
Simplified and streamlined the vignettes. +- `epi_slide_{sum,mean,opt}` on ungrouped `epi_df`s will now temporarily group + by `geo_value` and any `other_keys` for the slide operation rather than raise + an error about duplicated time values. `epi_slide`'s analogous automatic + grouping has been made temporary in order to match. + +## Bug fixes + +- Removed `.window_size = 1` default from `epi_slide_{mean,sum,opt}`; this + argument is now mandatory, and should nearly always be greater than 1 except + for testing purposes. +- Fixed `epi_slide_{sum,mean,opt}` raising an error on certain tidyselect + expressions. ## Cleanup From 03155dd73f4d4ea085506ba5603a0a46e5bcb7bd Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Thu, 14 Nov 2024 14:00:50 -0800 Subject: [PATCH 17/39] docs(time_delta_to_n_steps): copyediting --- R/utils.R | 3 +-- man/time_delta_to_n_steps.Rd | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/R/utils.R b/R/utils.R index 6625e5c3..c1804a90 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1125,8 +1125,7 @@ validate_slide_window_arg <- function(arg, time_type, lower = 1, allow_inf = TRU #' to time values of time type `time_type`. If the given time type does not #' support infinite values, then it should produce +Inf or -Inf for analogous #' entries of `time_delta`, and match the addition result match the addition -#' result for non-infinite values, and product +Inf / -Inf when match the sign -#' and of `time_delta`. +#' result for non-infinite entries. #' #' @keywords internal time_delta_to_n_steps <- function(time_delta, time_type) { diff --git a/man/time_delta_to_n_steps.Rd b/man/time_delta_to_n_steps.Rd index 0f9325be..f5c6639b 100644 --- a/man/time_delta_to_n_steps.Rd +++ b/man/time_delta_to_n_steps.Rd @@ -22,8 +22,7 @@ infinite values) that produces the same result as \code{time_delta} when added to time values of time type \code{time_type}. If the given time type does not support infinite values, then it should produce +Inf or -Inf for analogous entries of \code{time_delta}, and match the addition result match the addition -result for non-infinite values, and product +Inf / -Inf when match the sign -and of \code{time_delta}. +result for non-infinite entries. } \description{ Convert a time delta to a compatible integerish number of steps between time values From cac02c299ff034f6a56ddaf0a1ed421e32fb6bce Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Thu, 14 Nov 2024 14:05:42 -0800 Subject: [PATCH 18/39] docs(time_delta_to_n_steps): fix inaccurate equivalency statement --- R/utils.R | 13 +++++++------ man/time_delta_to_n_steps.Rd | 15 ++++++++------- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/R/utils.R b/R/utils.R index c1804a90..348cf7fa 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1111,7 +1111,7 @@ validate_slide_window_arg <- function(arg, time_type, lower = 1, allow_inf = TRU } -#' Convert a time delta to a compatible integerish number of steps between time values +#' Convert a time delta to a integerish number of "unit" steps between time values #' #' @param time_delta a vector that can be added to time values of time type #' `time_type` to arrive at other time values of that time type, or @@ -1121,11 +1121,12 @@ validate_slide_window_arg <- function(arg, time_type, lower = 1, allow_inf = TRU #' without sign and length restrictions. 
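[Editor's sketch, not part of the patch hunks: a small illustration of the conversion this helper documents. time_delta_to_n_steps() is an internal, unexported helper, so it is reached here via `:::`; the expected values in the comments mirror the unit tests added elsewhere in this patch series.]

library(epiprocess)
# A 5-day difftime on a daily time type corresponds to 5 unit steps:
epiprocess:::time_delta_to_n_steps(as.difftime(5, units = "days"), "day")
#> 5
# A 2-week difftime on a weekly time type corresponds to 2 unit steps:
epiprocess:::time_delta_to_n_steps(as.difftime(2, units = "weeks"), "week")
#> 2
# For yearmonth and integer time types, numeric deltas already count steps:
epiprocess:::time_delta_to_n_steps(5, "yearmonth")
#> 5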
#' @param time_type as in [`validate_slide_window_arg`] #' @return [bare integerish][rlang::is_integerish] vector (with possible -#' infinite values) that produces the same result as `time_delta` when added -#' to time values of time type `time_type`. If the given time type does not -#' support infinite values, then it should produce +Inf or -Inf for analogous -#' entries of `time_delta`, and match the addition result match the addition -#' result for non-infinite entries. +#' infinite values) that produces the same result as `time_delta` when +#' multiplied by some "unit time step" for that time type and added to time +#' values of time type `time_type`. If the given time type does not support +#' infinite values, then it should produce +Inf or -Inf for analogous entries +#' of `time_delta`, and match the addition result match the addition result +#' for non-infinite entries. #' #' @keywords internal time_delta_to_n_steps <- function(time_delta, time_type) { diff --git a/man/time_delta_to_n_steps.Rd b/man/time_delta_to_n_steps.Rd index f5c6639b..4c0bdcd6 100644 --- a/man/time_delta_to_n_steps.Rd +++ b/man/time_delta_to_n_steps.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/utils.R \name{time_delta_to_n_steps} \alias{time_delta_to_n_steps} -\title{Convert a time delta to a compatible integerish number of steps between time values} +\title{Convert a time delta to a integerish number of "unit" steps between time values} \usage{ time_delta_to_n_steps(time_delta, time_type) } @@ -18,13 +18,14 @@ without sign and length restrictions.} } \value{ \link[rlang:is_integerish]{bare integerish} vector (with possible -infinite values) that produces the same result as \code{time_delta} when added -to time values of time type \code{time_type}. If the given time type does not -support infinite values, then it should produce +Inf or -Inf for analogous -entries of \code{time_delta}, and match the addition result match the addition -result for non-infinite entries. +infinite values) that produces the same result as \code{time_delta} when +multiplied by some "unit time step" for that time type and added to time +values of time type \code{time_type}. If the given time type does not support +infinite values, then it should produce +Inf or -Inf for analogous entries +of \code{time_delta}, and match the addition result match the addition result +for non-infinite entries. } \description{ -Convert a time delta to a compatible integerish number of steps between time values +Convert a time delta to a integerish number of "unit" steps between time values } \keyword{internal} From 9cf52297e1591ef22c6c87e70856d4e650c25be5 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Thu, 14 Nov 2024 14:38:36 -0800 Subject: [PATCH 19/39] Remove invalid link to undocumented helper function --- R/utils.R | 12 ++++++------ man/time_delta_to_n_steps.Rd | 12 ++++++------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/R/utils.R b/R/utils.R index 348cf7fa..cb99ceca 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1119,14 +1119,14 @@ validate_slide_window_arg <- function(arg, time_type, lower = 1, allow_inf = TRU #' in, if supported by the class of `time_delta`, even if `time_type` doesn't #' necessarily support Inf/-Inf entries. Basically a slide window arg but #' without sign and length restrictions. 
-#' @param time_type as in [`validate_slide_window_arg`] +#' @param time_type as in `validate_slide_window_arg` #' @return [bare integerish][rlang::is_integerish] vector (with possible #' infinite values) that produces the same result as `time_delta` when -#' multiplied by some "unit time step" for that time type and added to time -#' values of time type `time_type`. If the given time type does not support -#' infinite values, then it should produce +Inf or -Inf for analogous entries -#' of `time_delta`, and match the addition result match the addition result -#' for non-infinite entries. +#' multiplied by the "natural" "unit time delta" (not yet implemented) for +#' that time type and added to time values of time type `time_type`. If the +#' given time type does not support infinite values, then it should produce +#' +Inf or -Inf for analogous entries of `time_delta`, and match the addition +#' result match the addition result for non-infinite entries. #' #' @keywords internal time_delta_to_n_steps <- function(time_delta, time_type) { diff --git a/man/time_delta_to_n_steps.Rd b/man/time_delta_to_n_steps.Rd index 4c0bdcd6..db8cdaaa 100644 --- a/man/time_delta_to_n_steps.Rd +++ b/man/time_delta_to_n_steps.Rd @@ -14,16 +14,16 @@ in, if supported by the class of \code{time_delta}, even if \code{time_type} doe necessarily support Inf/-Inf entries. Basically a slide window arg but without sign and length restrictions.} -\item{time_type}{as in \code{\link{validate_slide_window_arg}}} +\item{time_type}{as in \code{validate_slide_window_arg}} } \value{ \link[rlang:is_integerish]{bare integerish} vector (with possible infinite values) that produces the same result as \code{time_delta} when -multiplied by some "unit time step" for that time type and added to time -values of time type \code{time_type}. If the given time type does not support -infinite values, then it should produce +Inf or -Inf for analogous entries -of \code{time_delta}, and match the addition result match the addition result -for non-infinite entries. +multiplied by the "natural" "unit time delta" (not yet implemented) for +that time type and added to time values of time type \code{time_type}. If the +given time type does not support infinite values, then it should produce ++Inf or -Inf for analogous entries of \code{time_delta}, and match the addition +result match the addition result for non-infinite entries. } \description{ Convert a time delta to a integerish number of "unit" steps between time values From 632c8d99e9f3fd06419d1dbc02f11594c7386656 Mon Sep 17 00:00:00 2001 From: "Logan C. 
Brooks" Date: Fri, 15 Nov 2024 10:06:55 -0800 Subject: [PATCH 20/39] Address CHECK lints --- R/epiprocess-package.R | 2 +- R/slide.R | 5 ++++- man/epi_slide_opt.Rd | 5 ++++- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/R/epiprocess-package.R b/R/epiprocess-package.R index b640a0b5..68b7b9b5 100644 --- a/R/epiprocess-package.R +++ b/R/epiprocess-package.R @@ -27,5 +27,5 @@ utils::globalVariables(c( "fitted", ".response", "geo_value", "time_value", "value", ".real", "lag", "max_value", "min_value", "median_value", "spread", "rel_spread", "time_to", - "time_near_latest", "n_revisions" + "time_near_latest", "n_revisions", "min_lag", "max_lag" )) diff --git a/R/slide.R b/R/slide.R index d94dd01a..f212473e 100644 --- a/R/slide.R +++ b/R/slide.R @@ -980,7 +980,10 @@ epi_slide_opt <- function( #' ungroup() #' covid_case_death_rates_extended %>% #' group_by(geo_value) %>% -#' epi_slide_mean(c(case_rate, death_rate), .window_size = 7, .suffix = "_{.n}{.time_unit_abbr}_avg") %>% +#' epi_slide_mean(c(case_rate, death_rate), +#' .window_size = 7, +#' .suffix = "_{.n}{.time_unit_abbr}_avg" +#' ) %>% #' ungroup() epi_slide_mean <- function( .x, .col_names, ..., diff --git a/man/epi_slide_opt.Rd b/man/epi_slide_opt.Rd index 13b03a1b..724a928e 100644 --- a/man/epi_slide_opt.Rd +++ b/man/epi_slide_opt.Rd @@ -205,7 +205,10 @@ covid_case_death_rates_extended \%>\% ungroup() covid_case_death_rates_extended \%>\% group_by(geo_value) \%>\% - epi_slide_mean(c(case_rate, death_rate), .window_size = 7, .suffix = "_{.n}{.time_unit_abbr}_avg") \%>\% + epi_slide_mean(c(case_rate, death_rate), + .window_size = 7, + .suffix = "_{.n}{.time_unit_abbr}_avg" + ) \%>\% ungroup() # Compute a 7-day trailing sum on cases. cases_deaths_subset \%>\% From ad15b8f1c25f8f1abea355eed0dd9dd6ab5dd312 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Fri, 15 Nov 2024 10:14:13 -0800 Subject: [PATCH 21/39] Add unit_time_delta to make time_delta_to_n_steps make sense --- NAMESPACE | 1 + R/utils.R | 22 +++++++++++++- man/time_delta_to_n_steps.Rd | 2 +- man/unit_time_delta.Rd | 21 ++++++++++++++ tests/testthat/test-utils.R | 56 ++++++++++++++++++++++++++++++++++++ 5 files changed, 100 insertions(+), 2 deletions(-) create mode 100644 man/unit_time_delta.Rd diff --git a/NAMESPACE b/NAMESPACE index e044739b..7b054fd6 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -98,6 +98,7 @@ export(slice) export(sum_groups_epi_df) export(time_column_names) export(ungroup) +export(unit_time_delta) export(unnest) export(validate_epi_archive) export(version_column_names) diff --git a/R/utils.R b/R/utils.R index cb99ceca..f3eb114d 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1122,7 +1122,7 @@ validate_slide_window_arg <- function(arg, time_type, lower = 1, allow_inf = TRU #' @param time_type as in `validate_slide_window_arg` #' @return [bare integerish][rlang::is_integerish] vector (with possible #' infinite values) that produces the same result as `time_delta` when -#' multiplied by the "natural" "unit time delta" (not yet implemented) for +#' multiplied by the natural [`unit_time_delta`] for #' that time type and added to time values of time type `time_type`. 
If the #' given time type does not support infinite values, then it should produce #' +Inf or -Inf for analogous entries of `time_delta`, and match the addition @@ -1157,6 +1157,26 @@ time_delta_to_n_steps <- function(time_delta, time_type) { } } +#' Object that, added to time_values of time_type, advances by one time step/interval +#' +#' @param time_type string; `epi_df`'s or `epi_archive`'s `time_type` +#' @return an object `u` such that `time_values + u` represents advancing by one +#' time step / moving to the subsequent time interval for any `time_values` +#' object of time type `time_type`, and such that `time_values + k * u` for +#' integerish vector `k` advances by `k` steps (with vectorization, +#' recycling). +#' +#' @export +unit_time_delta <- function(time_type) { + switch(time_type, + day = as.difftime(1, units = "days"), + week = as.difftime(1, units = "weeks"), + yearmonth = 1, + integer = 1L, + cli_abort("Unsupported time_type: {time_type}") + ) +} + # Using these unit abbreviations happens to make our automatic slide output # naming look like taking ISO-8601 duration designations, removing the P, and # lowercasing any characters. Fortnightly or sub-daily time types would need an diff --git a/man/time_delta_to_n_steps.Rd b/man/time_delta_to_n_steps.Rd index db8cdaaa..93715919 100644 --- a/man/time_delta_to_n_steps.Rd +++ b/man/time_delta_to_n_steps.Rd @@ -19,7 +19,7 @@ without sign and length restrictions.} \value{ \link[rlang:is_integerish]{bare integerish} vector (with possible infinite values) that produces the same result as \code{time_delta} when -multiplied by the "natural" "unit time delta" (not yet implemented) for +multiplied by the natural \code{\link{unit_time_delta}} for that time type and added to time values of time type \code{time_type}. If the given time type does not support infinite values, then it should produce +Inf or -Inf for analogous entries of \code{time_delta}, and match the addition diff --git a/man/unit_time_delta.Rd b/man/unit_time_delta.Rd new file mode 100644 index 00000000..3b45840d --- /dev/null +++ b/man/unit_time_delta.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{unit_time_delta} +\alias{unit_time_delta} +\title{Object that, added to time_values of time_type, advances by one time step/interval} +\usage{ +unit_time_delta(time_type) +} +\arguments{ +\item{time_type}{string; \code{epi_df}'s or \code{epi_archive}'s \code{time_type}} +} +\value{ +an object \code{u} such that \code{time_values + u} represents advancing by one +time step / moving to the subsequent time interval for any \code{time_values} +object of time type \code{time_type}, and such that \code{time_values + k * u} for +integerish vector \code{k} advances by \code{k} steps (with vectorization, +recycling). 
+} +\description{ +Object that, added to time_values of time_type, advances by one time step/interval +} diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R index c5e6c5aa..37125d53 100644 --- a/tests/testthat/test-utils.R +++ b/tests/testthat/test-utils.R @@ -362,3 +362,59 @@ test_that("validate_slide_window_arg works", { class = "epiprocess__validate_slide_window_arg" ) }) + +test_that("unit_time_delta works", { + expect_equal( + as.Date("2020-01-01") + 5 * unit_time_delta("day"), + as.Date("2020-01-06") + ) + expect_equal( + as.Date("2020-01-01") + 2 * unit_time_delta("week"), + as.Date("2020-01-15") + ) + expect_equal( + tsibble::make_yearmonth(2000, 1) + 5 * unit_time_delta("yearmonth"), + tsibble::make_yearmonth(2000, 6) + ) + expect_equal( + 1L + 5L * unit_time_delta("integer"), + 6L + ) + # + expect_equal( + as.Date("2020-01-01") + + time_delta_to_n_steps(as.Date("2020-01-06") - as.Date("2020-01-01"), "day") * + unit_time_delta("day"), + as.Date("2020-01-06") + ) + expect_equal( + as.Date("2020-01-01") + + time_delta_to_n_steps(as.integer(as.Date("2020-01-06") - as.Date("2020-01-01")), "day") * + unit_time_delta("day"), + as.Date("2020-01-06") + ) + expect_equal( + as.Date("2020-01-01") + + time_delta_to_n_steps(as.Date("2020-01-15") - as.Date("2020-01-01"), "week") * + unit_time_delta("week"), + as.Date("2020-01-15") + ) + expect_equal( + as.Date("2020-01-01") + + time_delta_to_n_steps(as.difftime(2, units = "weeks"), "week") * + unit_time_delta("week"), + as.Date("2020-01-15") + ) + expect_equal( + tsibble::make_yearmonth(2000, 1) + + time_delta_to_n_steps(5, "yearmonth") * + unit_time_delta("yearmonth"), + tsibble::make_yearmonth(2000, 6) + ) + expect_equal( + 1L + + time_delta_to_n_steps(5, "integer") * + unit_time_delta("integer"), + 6L + ) +}) From 275290321aa648edfa2c3c8bb0c65144ea6eaea5 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Fri, 15 Nov 2024 10:17:56 -0800 Subject: [PATCH 22/39] Don't export unit_time_delta (yet) We may later need it to make default args, in which case we should probably export to be less "magical". --- NAMESPACE | 1 - R/utils.R | 2 +- man/unit_time_delta.Rd | 1 + 3 files changed, 2 insertions(+), 2 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 7b054fd6..e044739b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -98,7 +98,6 @@ export(slice) export(sum_groups_epi_df) export(time_column_names) export(ungroup) -export(unit_time_delta) export(unnest) export(validate_epi_archive) export(version_column_names) diff --git a/R/utils.R b/R/utils.R index f3eb114d..55022677 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1166,7 +1166,7 @@ time_delta_to_n_steps <- function(time_delta, time_type) { #' integerish vector `k` advances by `k` steps (with vectorization, #' recycling). #' -#' @export +#' @keywords internal unit_time_delta <- function(time_type) { switch(time_type, day = as.difftime(1, units = "days"), diff --git a/man/unit_time_delta.Rd b/man/unit_time_delta.Rd index 3b45840d..46b3c48d 100644 --- a/man/unit_time_delta.Rd +++ b/man/unit_time_delta.Rd @@ -19,3 +19,4 @@ recycling). \description{ Object that, added to time_values of time_type, advances by one time step/interval } +\keyword{internal} From ed9524ef8f73680db08af7e023eec165b487a5f1 Mon Sep 17 00:00:00 2001 From: "Logan C. 
Brooks" Date: Fri, 22 Nov 2024 10:30:42 -0800 Subject: [PATCH 23/39] Address CHECKS+workflow: don't document epidatasets re-exports --- R/reexports.R | 87 -------------------------- man/archive_cases_dv_subset.Rd | 84 ------------------------- man/cases_deaths_subset.Rd | 79 ----------------------- man/covid_case_death_rates_extended.Rd | 74 ---------------------- man/covid_incidence_county_subset.Rd | 75 ---------------------- man/covid_incidence_outliers.Rd | 68 -------------------- 6 files changed, 467 deletions(-) delete mode 100644 man/archive_cases_dv_subset.Rd delete mode 100644 man/cases_deaths_subset.Rd delete mode 100644 man/covid_case_death_rates_extended.Rd delete mode 100644 man/covid_incidence_county_subset.Rd delete mode 100644 man/covid_incidence_outliers.Rd diff --git a/R/reexports.R b/R/reexports.R index e091ce12..72241175 100644 --- a/R/reexports.R +++ b/R/reexports.R @@ -79,104 +79,17 @@ ggplot2::autoplot # epidatasets ------------------------------------------------------------------- -#' @inherit epidatasets::cases_deaths_subset description source references title -#' @inheritSection epidatasets::cases_deaths_subset Data dictionary -#' @examples -#' # Since this is a re-exported dataset, it cannot be loaded using -#' # the `data()` function. `data()` looks for a file of the same name -#' # in the `data/` directory, which doesn't exist in this package. -#' # works -#' epiprocess::cases_deaths_subset -#' -#' # works -#' library(epiprocess) -#' cases_deaths_subset -#' -#' # fails -#' \dontrun{ -#' data(cases_deaths_subset, package = "epiprocess") -#' } #' @export delayedAssign("cases_deaths_subset", epidatasets::cases_deaths_subset) -#' @inherit epidatasets::covid_incidence_county_subset description source references title -#' @inheritSection epidatasets::covid_incidence_county_subset Data dictionary -#' @examples -#' # Since this is a re-exported dataset, it cannot be loaded using -#' # the `data()` function. `data()` looks for a file of the same name -#' # in the `data/` directory, which doesn't exist in this package. -#' # works -#' epiprocess::covid_incidence_county_subset -#' -#' # works -#' library(epiprocess) -#' covid_incidence_county_subset -#' -#' # fails -#' \dontrun{ -#' data(covid_incidence_county_subset, package = "epiprocess") -#' } #' @export delayedAssign("covid_incidence_county_subset", epidatasets::covid_incidence_county_subset) -#' @inherit epidatasets::covid_incidence_outliers description source references title -#' @inheritSection epidatasets::covid_incidence_outliers Data dictionary -#' @examples -#' # Since this is a re-exported dataset, it cannot be loaded using -#' # the `data()` function. `data()` looks for a file of the same name -#' # in the `data/` directory, which doesn't exist in this package. -#' # works -#' epiprocess::covid_incidence_outliers -#' -#' # works -#' library(epiprocess) -#' covid_incidence_outliers -#' -#' # fails -#' \dontrun{ -#' data(covid_incidence_outliers, package = "epiprocess") -#' } #' @export delayedAssign("covid_incidence_outliers", epidatasets::covid_incidence_outliers) -#' @inherit epidatasets::archive_cases_dv_subset description source references title -#' @inheritSection epidatasets::archive_cases_dv_subset Data dictionary -#' @examples -#' # Since this is a re-exported dataset, it cannot be loaded using -#' # the `data()` function. `data()` looks for a file of the same name -#' # in the `data/` directory, which doesn't exist in this package. 
-#' # works -#' epiprocess::archive_cases_dv_subset -#' -#' # works -#' library(epiprocess) -#' archive_cases_dv_subset -#' -#' # fails -#' \dontrun{ -#' data(archive_cases_dv_subset, package = "epiprocess") -#' } -#' #' @export delayedAssign("archive_cases_dv_subset", epidatasets::archive_cases_dv_subset) -#' @inherit epidatasets::covid_case_death_rates_extended description source references title -#' @inheritSection epidatasets::covid_case_death_rates_extended Data dictionary -#' @examples -#' # Since this is a re-exported dataset, it cannot be loaded using -#' # the `data()` function. `data()` looks for a file of the same name -#' # in the `data/` directory, which doesn't exist in this package. -#' # works -#' epiprocess::covid_case_death_rates_extended -#' -#' # works -#' library(epiprocess) -#' covid_case_death_rates_extended -#' -#' # fails -#' \dontrun{ -#' data(covid_case_death_rates_extended, package = "epiprocess") -#' } -#' #' @export delayedAssign("covid_case_death_rates_extended", epidatasets::covid_case_death_rates_extended) diff --git a/man/archive_cases_dv_subset.Rd b/man/archive_cases_dv_subset.Rd deleted file mode 100644 index 207bb025..00000000 --- a/man/archive_cases_dv_subset.Rd +++ /dev/null @@ -1,84 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/reexports.R -\docType{data} -\name{archive_cases_dv_subset} -\alias{archive_cases_dv_subset} -\title{Subset of daily COVID-19 doctor visits and cases from 6 states in archive format} -\format{ -An object of class \code{epi_archive} of length 6. -} -\source{ -This object contains a modified part of the \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University} as \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{republished in the COVIDcast Epidata API}. This data set is licensed under the terms of the -\href{https://creativecommons.org/licenses/by/4.0/}{Creative Commons Attribution 4.0 International license} -by Johns Hopkins University on behalf of its Center for Systems Science in Engineering. -Copyright Johns Hopkins University 2020. - -Modifications: -\itemize{ -\item \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{From the COVIDcast Epidata API}: \code{case_rate_7d_av} signal was computed by Delphi from the original JHU-CSSE data by calculating moving averages of the preceding 7 days, so the signal for June 7 is the average of the underlying data for June 1 through 7, inclusive. -\item Furthermore, the data has been limited to a very small number of rows, the -signal names slightly altered, and formatted into an \code{epi_archive}. -} - -This object contains a modified part of the -\href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/doctor-visits.html}{Delphi \code{doctor-visits} indicator}. -This data source is computed by the Delphi -Group from information about outpatient visits, provided to Delphi by -health system partners, and published in the COVIDcast Epidata API. This -data set is licensed under the terms of the -\href{https://creativecommons.org/licenses/by/4.0/}{Creative Commons Attribution 4.0 International license} -by the Delphi group. 
- -Modifications: -\itemize{ -\item \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/doctor-visits.html}{From the COVIDcast Doctor Visits signal}: The signal \code{smoothed_adj_cli} is taken directly from the API without changes. -\item Furthermore, the data has been limited to a very small number of rows, the -signal names slightly altered, and formatted into an \code{epi_archive}. -} -} -\usage{ -archive_cases_dv_subset -} -\description{ -This data source is based on information about outpatient visits, provided -to us by health system partners, and also contains confirmed COVID-19 -cases based on reports made available by the Center for Systems Science -and Engineering at Johns Hopkins University. This example data ranges from -June 1, 2020 to December 1, 2021, issued on dates from June 1, 2020 to December 1, -2021. It is limited to California, Florida, Texas, and New York. - -It is used in the {epiprocess} \code{compactify}, \code{epi_archive}, and -advanced-use (\code{advanced}) vignettes. -} -\section{Data dictionary}{ - - -The data in the \code{epi_archive$DT} attribute has columns: -\describe{ -\item{geo_value}{the geographic value associated with each row of measurements.} -\item{time_value}{the time value associated with each row of measurements.} -\item{version}{the time value specifying the version for each row of measurements. } -\item{percent_cli}{percentage of doctorโ€™s visits with CLI (COVID-like illness) computed from medical insurance claims} -\item{case_rate_7d_av}{7-day average signal of number of new confirmed cases due to COVID-19 per 100,000 population, daily} -} - -} - -\examples{ -# Since this is a re-exported dataset, it cannot be loaded using -# the `data()` function. `data()` looks for a file of the same name -# in the `data/` directory, which doesn't exist in this package. -# works -epiprocess::archive_cases_dv_subset - -# works -library(epiprocess) -archive_cases_dv_subset - -# fails -\dontrun{ -data(archive_cases_dv_subset, package = "epiprocess") -} - -} -\keyword{datasets} diff --git a/man/cases_deaths_subset.Rd b/man/cases_deaths_subset.Rd deleted file mode 100644 index 45e8dd4c..00000000 --- a/man/cases_deaths_subset.Rd +++ /dev/null @@ -1,79 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/reexports.R -\docType{data} -\name{cases_deaths_subset} -\alias{cases_deaths_subset} -\title{Subset of JHU daily state COVID-19 cases and deaths from 6 states} -\format{ -An object of class \code{epi_df} (inherits from \code{tbl_df}, \code{tbl}, \code{data.frame}) with 4026 rows and 6 columns. -} -\source{ -This object contains a modified part of the -\href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University} -as \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{republished in the COVIDcast Epidata API}. -This data set is licensed under the terms of the -\href{https://creativecommons.org/licenses/by/4.0/}{Creative Commons Attribution 4.0 International license} -by the Johns Hopkins University on behalf of its Center for Systems Science -in Engineering. Copyright Johns Hopkins University 2020. 
- -Modifications: -\itemize{ -\item \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{From the COVIDcast Epidata API}: -The case signal is taken directly from the JHU CSSE -\href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 GitHub repository}. -The rate signals were computed by Delphi using Census population data. -The 7-day average signals were computed by Delphi by calculating moving -averages of the preceding 7 days, so the signal for June 7 is the -average of the underlying data for June 1 through 7, inclusive. -\item Furthermore, the data has been limited to a very small number of rows, the -signal names slightly altered, and formatted into an \code{epi_df}. -} -} -\usage{ -cases_deaths_subset -} -\description{ -This data source of confirmed COVID-19 cases and deaths is based on reports -made available by the Center for Systems Science and Engineering at Johns -Hopkins University. This example data is a snapshot as of March 20, 2024, and -ranges from March 1, 2020 to December 31, 2021. It is limited to California, -Florida, Texas, New York, Georgia, and Pennsylvania. - -It is used in the {epiprocess} growth rate and \code{epi_slide} vignettes. -} -\section{Data dictionary}{ - - -The data has columns: -\describe{ -\item{geo_value}{the geographic value associated with each row -of measurements.} -\item{time_value}{the time value associated with each row of measurements.} -\item{case_rate_7d_av}{7-day average signal of number of new -confirmed COVID-19 cases per 100,000 population, daily} -\item{death_rate_7d_av}{7-day average signal of number of new confirmed -deaths due to COVID-19 per 100,000 population, daily} -\item{cases}{Number of new confirmed COVID-19 cases, daily} -\item{cases_7d_av}{7-day average signal of number of new confirmed -COVID-19 cases, daily} -} - -} - -\examples{ -# Since this is a re-exported dataset, it cannot be loaded using -# the `data()` function. `data()` looks for a file of the same name -# in the `data/` directory, which doesn't exist in this package. -# works -epiprocess::cases_deaths_subset - -# works -library(epiprocess) -cases_deaths_subset - -# fails -\dontrun{ -data(cases_deaths_subset, package = "epiprocess") -} -} -\keyword{datasets} diff --git a/man/covid_case_death_rates_extended.Rd b/man/covid_case_death_rates_extended.Rd deleted file mode 100644 index 72482edd..00000000 --- a/man/covid_case_death_rates_extended.Rd +++ /dev/null @@ -1,74 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/reexports.R -\docType{data} -\name{covid_case_death_rates_extended} -\alias{covid_case_death_rates_extended} -\title{JHU daily COVID-19 cases and deaths rates from all states} -\format{ -An object of class \code{epi_df} (inherits from \code{tbl_df}, \code{tbl}, \code{data.frame}) with 37576 rows and 4 columns. -} -\source{ -This object contains a modified part of the -\href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University} -as \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{republished in the COVIDcast Epidata API}. -This data set is licensed under the terms of the -\href{https://creativecommons.org/licenses/by/4.0/}{Creative Commons Attribution 4.0 International license} -by the Johns Hopkins University on behalf of its Center for Systems Science -in Engineering. Copyright Johns Hopkins University 2020. 
- -Modifications: -\itemize{ -\item \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{From the COVIDcast Epidata API}: -These signals are taken directly from the JHU CSSE -\href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 GitHub repository} -without changes. The 7-day average signals are computed by Delphi by -calculating moving averages of the preceding 7 days, so the signal for -June 7 is the average of the underlying data for June 1 through 7, -inclusive. -} -} -\usage{ -covid_case_death_rates_extended -} -\description{ -This data source of confirmed COVID-19 cases and deaths is based on reports -made available by the Center for Systems Science and Engineering at Johns -Hopkins University, as downloaded from the CMU Delphi COVIDcast Epidata -API. This example data is a snapshot as of May 31, 2022, and -ranges from March 1, 2020 to December 31, 2021. It -includes all states. -} -\section{Data dictionary}{ - - -The data has columns: -\describe{ -\item{geo_value}{the geographic value associated with each row -of measurements.} -\item{time_value}{the time value associated with each row of measurements.} -\item{case_rate}{7-day average signal of number of new -confirmed COVID-19 cases per 100,000 population, daily} -\item{death_rate}{7-day average signal of number of new confirmed -deaths due to COVID-19 per 100,000 population, daily} -} - -} - -\examples{ -# Since this is a re-exported dataset, it cannot be loaded using -# the `data()` function. `data()` looks for a file of the same name -# in the `data/` directory, which doesn't exist in this package. -# works -epiprocess::covid_case_death_rates_extended - -# works -library(epiprocess) -covid_case_death_rates_extended - -# fails -\dontrun{ -data(covid_case_death_rates_extended, package = "epiprocess") -} - -} -\keyword{datasets} diff --git a/man/covid_incidence_county_subset.Rd b/man/covid_incidence_county_subset.Rd deleted file mode 100644 index edc881d9..00000000 --- a/man/covid_incidence_county_subset.Rd +++ /dev/null @@ -1,75 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/reexports.R -\docType{data} -\name{covid_incidence_county_subset} -\alias{covid_incidence_county_subset} -\title{Subset of JHU daily COVID-19 cases from counties in Massachusetts and Vermont} -\format{ -An object of class \code{epi_df} (inherits from \code{tbl_df}, \code{tbl}, \code{data.frame}) with 16212 rows and 5 columns. -} -\source{ -This object contains a modified part of the -\href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University} as -\href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{republished in the COVIDcast Epidata API}. -This data set is licensed under the terms of the -\href{https://creativecommons.org/licenses/by/4.0/}{Creative Commons Attribution 4.0 International license} -by the Johns Hopkins University on behalf of its Center for Systems -Science in Engineering. Copyright Johns Hopkins University 2020. - -Modifications: -\itemize{ -\item \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{From the COVIDcast Epidata API}: -These signals are taken directly from the JHU CSSE -\href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 GitHub repository} -without changes. 
The 7-day average signals are computed by Delphi by -as moving averages of the preceding 7 days, so the signal for -June 7 is the average of the underlying data for June 1 through 7, -inclusive. -\item Furthermore, the data has been limited to a very small number of rows, -formatted into an \code{epi_df}, and the signal names slightly altered. -} -} -\usage{ -covid_incidence_county_subset -} -\description{ -This data source of confirmed COVID-19 cases and deaths -is based on reports made available by the Center for -Systems Science and Engineering at Johns Hopkins University. -This example data is a snapshot as of March 20, 2024, and -ranges from March 1, 2020 to December 31, 2021. -It is limited to counties from Massachusetts and Vermont. - -It is used in the {epiprocess} aggregation vignette. -} -\section{Data dictionary}{ - - -The data has columns: -\describe{ -\item{geo_value}{the geographic value associated with each row of measurements.} -\item{time_value}{the time value associated with each row of measurements.} -\item{cases}{Number of new confirmed COVID-19 cases, daily} -\item{county_name}{the name of the county} -\item{state_name}{the full name of the state} -} - -} - -\examples{ -# Since this is a re-exported dataset, it cannot be loaded using -# the `data()` function. `data()` looks for a file of the same name -# in the `data/` directory, which doesn't exist in this package. -# works -epiprocess::covid_incidence_county_subset - -# works -library(epiprocess) -covid_incidence_county_subset - -# fails -\dontrun{ -data(covid_incidence_county_subset, package = "epiprocess") -} -} -\keyword{datasets} diff --git a/man/covid_incidence_outliers.Rd b/man/covid_incidence_outliers.Rd deleted file mode 100644 index 52b49fd3..00000000 --- a/man/covid_incidence_outliers.Rd +++ /dev/null @@ -1,68 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/reexports.R -\docType{data} -\name{covid_incidence_outliers} -\alias{covid_incidence_outliers} -\title{Subset of JHU daily COVID-19 cases from New Jersey and Florida} -\format{ -An object of class \code{epi_df} (inherits from \code{tbl_df}, \code{tbl}, \code{data.frame}) with 730 rows and 3 columns. -} -\source{ -This object contains a modified part of the -\href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University} -as \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{republished in the COVIDcast Epidata API}. -This data set is licensed under the terms of the -\href{https://creativecommons.org/licenses/by/4.0/}{Creative Commons Attribution 4.0 International license} -by the Johns Hopkins University on behalf of its Center for Systems -Science in Engineering. Copyright Johns Hopkins University 2020. - -Modifications: -\itemize{ -\item \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{From the COVIDcast Epidata API}: -These signals are taken directly from the JHU CSSE \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 GitHub repository} without changes. -\item Furthermore, the data has been limited to a very small number of rows, -formatted into an \code{epi_df}, and the signal names slightly altered. -} -} -\usage{ -covid_incidence_outliers -} -\description{ -This data source of confirmed COVID-19 cases is based on reports made -available by the Center for Systems Science and Engineering at Johns -Hopkins University. 
This example data is downloaded from the CMU Delphi -COVIDcast Epidata API. It is a snapshot as of October 28, 2021, and captures the -cases from June 1, 2020 to May 31, 2021. It is limited to New Jersey and -Florida. - -This data set is used in the {epiprocess} vignette on outliers. -} -\section{Data dictionary}{ - - -The data has columns: -\describe{ -\item{geo_value}{the geographic value associated with each row of measurements.} -\item{time_value}{the time value associated with each row of measurements.} -\item{cases}{Number of new confirmed COVID-19 cases, daily} -} - -} - -\examples{ -# Since this is a re-exported dataset, it cannot be loaded using -# the `data()` function. `data()` looks for a file of the same name -# in the `data/` directory, which doesn't exist in this package. -# works -epiprocess::covid_incidence_outliers - -# works -library(epiprocess) -covid_incidence_outliers - -# fails -\dontrun{ -data(covid_incidence_outliers, package = "epiprocess") -} -} -\keyword{datasets} From 5cc0d159d0a20dc0ff2f0388c426625354253713 Mon Sep 17 00:00:00 2001 From: brookslogan Date: Fri, 22 Nov 2024 11:02:25 -0800 Subject: [PATCH 24/39] Fix error class name that referred to different function Co-authored-by: nmdefries <42820733+nmdefries@users.noreply.github.com> --- R/slide.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/slide.R b/R/slide.R index f212473e..e8e66966 100644 --- a/R/slide.R +++ b/R/slide.R @@ -702,7 +702,7 @@ epi_slide_opt <- function( "`.x` must be either grouped by {expected_group_keys} or ungrouped; if the latter, we'll temporarily group by {expected_group_keys} for this operation. You may need to aggregate your data first, see aggregate_epi_df().", - class = "epiprocess__epi_slide__invalid_grouping" + class = "epiprocess__epi_slide_opt__invalid_grouping" ) } } else { From 9d64bdabea1f563aef19b0117d1648be957653cc Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Fri, 22 Nov 2024 10:50:22 -0800 Subject: [PATCH 25/39] Fix partial rename: aggregate_epi_df -> sum_groups_epi_df in messages --- R/slide.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/slide.R b/R/slide.R index e8e66966..d56623cc 100644 --- a/R/slide.R +++ b/R/slide.R @@ -195,7 +195,7 @@ epi_slide <- function( cli_abort( "`.x` must be either grouped by {expected_group_keys} or ungrouped; if the latter, we'll temporarily group by {expected_group_keys} for this operation. You may need - to aggregate your data first, see aggregate_epi_df().", + to aggregate your data first; see sum_groups_epi_df().", class = "epiprocess__epi_slide__invalid_grouping" ) } @@ -701,7 +701,7 @@ epi_slide_opt <- function( cli_abort( "`.x` must be either grouped by {expected_group_keys} or ungrouped; if the latter, we'll temporarily group by {expected_group_keys} for this operation. You may need - to aggregate your data first, see aggregate_epi_df().", + to aggregate your data first; see sum_groups_epi_df().", class = "epiprocess__epi_slide_opt__invalid_grouping" ) } From 5c68195843b608e4a27025ca7b5dc3eaccdb1d69 Mon Sep 17 00:00:00 2001 From: "Logan C. 
Brooks" Date: Fri, 22 Nov 2024 12:01:17 -0800 Subject: [PATCH 26/39] Describe output grouping in basic-slide-params.R --- man-roxygen/basic-slide-params.R | 3 ++- man/epi_slide.Rd | 3 ++- man/epi_slide_opt.Rd | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/man-roxygen/basic-slide-params.R b/man-roxygen/basic-slide-params.R index 8ccd35f9..0e5f28ae 100644 --- a/man-roxygen/basic-slide-params.R +++ b/man-roxygen/basic-slide-params.R @@ -32,4 +32,5 @@ #' a missing value marker (typically NA, but more technically the result of #' `vctrs::vec_cast`-ing `NA` to the type of the slide computation output). #' @return An `epi_df` object with one or more new slide computation columns -#' added. +#' added. It will be ungrouped if `.x` was ungrouped, and have the same groups +#' as `.x` if `.x` was grouped. diff --git a/man/epi_slide.Rd b/man/epi_slide.Rd index 9f4abd36..ce94c67b 100644 --- a/man/epi_slide.Rd +++ b/man/epi_slide.Rd @@ -91,7 +91,8 @@ a missing value marker (typically NA, but more technically the result of } \value{ An \code{epi_df} object with one or more new slide computation columns -added. +added. It will be ungrouped if \code{.x} was ungrouped, and have the same groups +as \code{.x} if \code{.x} was grouped. } \description{ Slides a given function over variables in an \code{epi_df} object. diff --git a/man/epi_slide_opt.Rd b/man/epi_slide_opt.Rd index 724a928e..da65cb00 100644 --- a/man/epi_slide_opt.Rd +++ b/man/epi_slide_opt.Rd @@ -130,7 +130,8 @@ a missing value marker (typically NA, but more technically the result of } \value{ An \code{epi_df} object with one or more new slide computation columns -added. +added. It will be ungrouped if \code{.x} was ungrouped, and have the same groups +as \code{.x} if \code{.x} was grouped. } \description{ \code{epi_slide_opt} allows sliding an n-timestep \link[data.table:froll]{data.table::froll} From d3695270d7105b69c48b3b4e820c57911637b0ff Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Fri, 22 Nov 2024 12:06:27 -0800 Subject: [PATCH 27/39] docs: roughly describe grouping of epix_slide output --- R/methods-epi_archive.R | 8 ++++---- man/epix_slide.Rd | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index 04bb841f..9ad45673 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -688,10 +688,10 @@ epix_detailed_restricted_mutate <- function(.data, ...) { #' requested `.versions`) for rows having a `time_value` of at least `.version #' - before`. Otherwise, the slide computation will be passed only the most #' recent `version` for every unique `time_value`. Default is `FALSE`. -#' @return A tibble whose columns are: the grouping variables, `time_value`, -#' containing the reference time values for the slide computation, and a -#' column named according to the `.new_col_name` argument, containing the slide -#' values. +#' @return A tibble whose columns are: the grouping variables (if any), +#' `time_value`, containing the reference time values for the slide +#' computation, and a column named according to the `.new_col_name` argument, +#' containing the slide values. It will be grouped by the grouping variables. #' #' @details A few key distinctions between the current function and `epi_slide()`: #' 1. 
In `.f` functions for `epix_slide`, one should not assume that the input diff --git a/man/epix_slide.Rd b/man/epix_slide.Rd index 6fe2cfa5..cadb1983 100644 --- a/man/epix_slide.Rd +++ b/man/epix_slide.Rd @@ -104,10 +104,10 @@ requested \code{.versions}) for rows having a \code{time_value} of at least `.ve }} } \value{ -A tibble whose columns are: the grouping variables, \code{time_value}, -containing the reference time values for the slide computation, and a -column named according to the \code{.new_col_name} argument, containing the slide -values. +A tibble whose columns are: the grouping variables (if any), +\code{time_value}, containing the reference time values for the slide +computation, and a column named according to the \code{.new_col_name} argument, +containing the slide values. It will be grouped by the grouping variables. } \description{ Slides a given function over variables in an \code{epi_archive} object. This From 6191b61d5e06bc1609aa5efbdb1c00828ca30c8c Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Fri, 22 Nov 2024 16:02:56 -0800 Subject: [PATCH 28/39] Rework `epi_slide_{opt,mean,sum}` examples as they are 1 help topic --- R/slide.R | 104 +++++++++++++++++-------------------------- man/epi_slide_opt.Rd | 82 +++++++++++++--------------------- 2 files changed, 72 insertions(+), 114 deletions(-) diff --git a/R/slide.R b/R/slide.R index d56623cc..cf19075c 100644 --- a/R/slide.R +++ b/R/slide.R @@ -632,22 +632,53 @@ get_before_after_from_window <- function(window_size, align, time_type) { #' @export #' @seealso [`epi_slide`] for the more general slide function #' @examples -#' # Compute a 7-day trailing average on cases. +#' # Add a column (`cases_7dsum`) containing a 7-day trailing sum on `cases`: #' cases_deaths_subset %>% +#' select(geo_value, time_value, cases) %>% +#' epi_slide_sum(cases, .window_size = 7) +#' +#' # Add a column (`cases_rate_7dav`) containing a 7-day trailing average on `case_rate`: +#' covid_case_death_rates_extended %>% +#' epi_slide_mean(case_rate, .window_size = 7) +#' +#' # Use a less common specialized slide function: +#' cases_deaths_subset %>% +#' epi_slide_opt(cases, slider::slide_min, .window_size = 7) +#' +#' # Specify output column names and/or a naming scheme: +#' cases_deaths_subset %>% +#' select(geo_value, time_value, cases) %>% #' group_by(geo_value) %>% -#' epi_slide_opt(cases, .f = data.table::frollmean, .window_size = 7) %>% +#' epi_slide_sum(cases, .window_size = 7, .new_col_names = "case_sum") %>% #' ungroup() -#' -#' # Same as above, but adjust `frollmean` settings for speed, accuracy, and -#' # to allow partially-missing windows. #' cases_deaths_subset %>% +#' select(geo_value, time_value, cases) %>% #' group_by(geo_value) %>% -#' epi_slide_opt( -#' cases, -#' .f = data.table::frollmean, .window_size = 7, -#' algo = "exact", hasNA = TRUE, na.rm = TRUE -#' ) %>% +#' epi_slide_sum(cases, .window_size = 7, .prefix = "sum_") %>% #' ungroup() +#' +#' # Additional settings can be sent to the {data.table} and {slider} functions +#' # via `...`. This example passes some arguments to `frollmean` settings for +#' # speed, accuracy, and to allow partially-missing windows: +#' covid_case_death_rates_extended %>% +#' epi_slide_mean( +#' case_rate, +#' .window_size = 7, +#' na.rm = TRUE, algo = "exact", hasNA = TRUE +#' ) +#' +#' # If the more specialized possibilities for `.f` don't cover your needs, you +#' # can use `epi_slide_opt` with `.f = data.table::frollapply` to apply a +#' # custom function at the cost of more computation time. 
See also `epi_slide` +#' # if you need something even more general. +#' cases_deaths_subset %>% +#' select(geo_value, time_value, case_rate_7d_av, death_rate_7d_av) %>% +#' epi_slide_opt(c(case_rate_7d_av, death_rate_7d_av), +#' data.table::frollapply, +#' FUN = median, .window_size = 28, +#' .suffix = "_{.n}{.time_unit_abbr}_median" +#' ) %>% +#' print(n = 40) epi_slide_opt <- function( .x, .col_names, .f, ..., .window_size = NULL, .align = c("right", "center", "left"), @@ -951,40 +982,6 @@ epi_slide_opt <- function( #' datatable::frollmean`. #' #' @export -#' @examples -#' # Compute a 7-day trailing average of case rates. -#' covid_case_death_rates_extended %>% -#' group_by(geo_value) %>% -#' epi_slide_mean(case_rate, .window_size = 7) %>% -#' ungroup() -#' -#' # Same as above, but adjust `frollmean` settings for speed, accuracy, and -#' # to allow partially-missing windows. -#' covid_case_death_rates_extended %>% -#' group_by(geo_value) %>% -#' epi_slide_mean( -#' case_rate, -#' .window_size = 7, -#' na.rm = TRUE, algo = "exact", hasNA = TRUE -#' ) %>% -#' ungroup() -#' -#' # Compute a 7-day trailing average of case rates and death rates, with custom -#' # output column names: -#' covid_case_death_rates_extended %>% -#' group_by(geo_value) %>% -#' epi_slide_mean(c(case_rate, death_rate), -#' .window_size = 7, -#' .new_col_names = c("smoothed_case_rate", "smoothed_death_rate") -#' ) %>% -#' ungroup() -#' covid_case_death_rates_extended %>% -#' group_by(geo_value) %>% -#' epi_slide_mean(c(case_rate, death_rate), -#' .window_size = 7, -#' .suffix = "_{.n}{.time_unit_abbr}_avg" -#' ) %>% -#' ungroup() epi_slide_mean <- function( .x, .col_names, ..., .window_size = NULL, .align = c("right", "center", "left"), @@ -1044,25 +1041,6 @@ epi_slide_mean <- function( #' datatable::frollsum`. #' #' @export -#' @examples -#' # Compute a 7-day trailing sum on cases. -#' cases_deaths_subset %>% -#' select(geo_value, time_value, cases) %>% -#' group_by(geo_value) %>% -#' epi_slide_sum(cases, .window_size = 7) %>% -#' ungroup() -#' -#' # Specify output column names and/or naming scheme: -#' cases_deaths_subset %>% -#' select(geo_value, time_value, cases) %>% -#' group_by(geo_value) %>% -#' epi_slide_sum(cases, .window_size = 7, .new_col_names = "case_sum") %>% -#' ungroup() -#' cases_deaths_subset %>% -#' select(geo_value, time_value, cases) %>% -#' group_by(geo_value) %>% -#' epi_slide_sum(cases, .window_size = 7, .prefix = "sum_") %>% -#' ungroup() epi_slide_sum <- function( .x, .col_names, ..., .window_size = NULL, .align = c("right", "center", "left"), diff --git a/man/epi_slide_opt.Rd b/man/epi_slide_opt.Rd index da65cb00..7dc47a16 100644 --- a/man/epi_slide_opt.Rd +++ b/man/epi_slide_opt.Rd @@ -162,63 +162,20 @@ for \code{.f} factoring in the input column type(s) for \code{.col_names} } \examples{ -# Compute a 7-day trailing average on cases. +# Add a column (`cases_7dsum`) containing a 7-day trailing sum on `cases`: cases_deaths_subset \%>\% - group_by(geo_value) \%>\% - epi_slide_opt(cases, .f = data.table::frollmean, .window_size = 7) \%>\% - ungroup() - -# Same as above, but adjust `frollmean` settings for speed, accuracy, and -# to allow partially-missing windows. -cases_deaths_subset \%>\% - group_by(geo_value) \%>\% - epi_slide_opt( - cases, - .f = data.table::frollmean, .window_size = 7, - algo = "exact", hasNA = TRUE, na.rm = TRUE - ) \%>\% - ungroup() -# Compute a 7-day trailing average of case rates. 
-covid_case_death_rates_extended \%>\% - group_by(geo_value) \%>\% - epi_slide_mean(case_rate, .window_size = 7) \%>\% - ungroup() + select(geo_value, time_value, cases) \%>\% + epi_slide_sum(cases, .window_size = 7) -# Same as above, but adjust `frollmean` settings for speed, accuracy, and -# to allow partially-missing windows. +# Add a column (`cases_rate_7dav`) containing a 7-day trailing average on `case_rate`: covid_case_death_rates_extended \%>\% - group_by(geo_value) \%>\% - epi_slide_mean( - case_rate, - .window_size = 7, - na.rm = TRUE, algo = "exact", hasNA = TRUE - ) \%>\% - ungroup() + epi_slide_mean(case_rate, .window_size = 7) -# Compute a 7-day trailing average of case rates and death rates, with custom -# output column names: -covid_case_death_rates_extended \%>\% - group_by(geo_value) \%>\% - epi_slide_mean(c(case_rate, death_rate), - .window_size = 7, - .new_col_names = c("smoothed_case_rate", "smoothed_death_rate") - ) \%>\% - ungroup() -covid_case_death_rates_extended \%>\% - group_by(geo_value) \%>\% - epi_slide_mean(c(case_rate, death_rate), - .window_size = 7, - .suffix = "_{.n}{.time_unit_abbr}_avg" - ) \%>\% - ungroup() -# Compute a 7-day trailing sum on cases. +# Use a less common specialized slide function: cases_deaths_subset \%>\% - select(geo_value, time_value, cases) \%>\% - group_by(geo_value) \%>\% - epi_slide_sum(cases, .window_size = 7) \%>\% - ungroup() + epi_slide_opt(cases, slider::slide_min, .window_size = 7) -# Specify output column names and/or naming scheme: +# Specify output column names and/or a naming scheme: cases_deaths_subset \%>\% select(geo_value, time_value, cases) \%>\% group_by(geo_value) \%>\% @@ -229,6 +186,29 @@ cases_deaths_subset \%>\% group_by(geo_value) \%>\% epi_slide_sum(cases, .window_size = 7, .prefix = "sum_") \%>\% ungroup() + +# Additional settings can be sent to the {data.table} and {slider} functions +# via `...`. This example passes some arguments to `frollmean` settings for +# speed, accuracy, and to allow partially-missing windows: +covid_case_death_rates_extended \%>\% + epi_slide_mean( + case_rate, + .window_size = 7, + na.rm = TRUE, algo = "exact", hasNA = TRUE + ) + +# If the more specialized possibilities for `.f` don't cover your needs, you +# can use `epi_slide_opt` with `.f = data.table::frollapply` to apply a +# custom function at the cost of more computation time. See also `epi_slide` +# if you need something even more general. +cases_deaths_subset \%>\% + select(geo_value, time_value, case_rate_7d_av, death_rate_7d_av) \%>\% + epi_slide_opt(c(case_rate_7d_av, death_rate_7d_av), + data.table::frollapply, + FUN = median, .window_size = 28, + .suffix = "_{.n}{.time_unit_abbr}_median" + ) \%>\% + print(n = 40) } \seealso{ \code{\link{epi_slide}} for the more general slide function From 6ab73d541fe5c2368d109ad0ca3fbb17664c7287 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Fri, 22 Nov 2024 16:11:35 -0800 Subject: [PATCH 29/39] feat(epi_slide_opt): guard against multiple .f matches --- R/slide.R | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/R/slide.R b/R/slide.R index cf19075c..3da3c473 100644 --- a/R/slide.R +++ b/R/slide.R @@ -797,6 +797,12 @@ epi_slide_opt <- function( epiprocess__f = .f ) } + if (nrow(f_info) > 1L) { + cli_abort('epiprocess internal error: looking up `.f` in table of possible + functions yielded multiple matches. 
Please report it using "New + issue" at https://github.com/cmu-delphi/epiprocess/issues, using + reprex::reprex to provide a minimal reproducible example.') + } f_from_package <- f_info$package user_provided_rtvs <- !is.null(.ref_time_values) From c78bcca16d59c9d88efada6a561d43d18fdabc3e Mon Sep 17 00:00:00 2001 From: brookslogan Date: Fri, 22 Nov 2024 16:15:39 -0800 Subject: [PATCH 30/39] Improve slide output packing example comments Co-authored-by: nmdefries <42820733+nmdefries@users.noreply.github.com> --- R/slide.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/slide.R b/R/slide.R index 3da3c473..0c747d17 100644 --- a/R/slide.R +++ b/R/slide.R @@ -106,8 +106,8 @@ #' #' # The tidyverse supports ["packing"][tidyr::pack] multiple columns into a #' # single tibble-type column contained within some larger tibble. Like dplyr, -#' # we normally don't pack output columns together, but will if you provide a -#' # name for a tibble-type output: +#' # we normally don't pack output columns together. However, packing behavior can be turned on +#' # by providing a name for a tibble-type output: #' cases_deaths_subset %>% #' epi_slide( #' slide_packed = tibble( From 53505d3f842328e944739857982a5efa4ca1e05a Mon Sep 17 00:00:00 2001 From: brookslogan Date: Sat, 23 Nov 2024 00:18:20 +0000 Subject: [PATCH 31/39] docs: document (GHA) --- man/epi_slide.Rd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/man/epi_slide.Rd b/man/epi_slide.Rd index ce94c67b..f053909d 100644 --- a/man/epi_slide.Rd +++ b/man/epi_slide.Rd @@ -165,8 +165,8 @@ cases_deaths_subset \%>\% # The tidyverse supports ["packing"][tidyr::pack] multiple columns into a # single tibble-type column contained within some larger tibble. Like dplyr, -# we normally don't pack output columns together, but will if you provide a -# name for a tibble-type output: +# we normally don't pack output columns together. However, packing behavior can be turned on +# by providing a name for a tibble-type output: cases_deaths_subset \%>\% epi_slide( slide_packed = tibble( From 0352d7b31e99045ce63f5cabadb1e836a5fefcc0 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Tue, 26 Nov 2024 03:47:21 -0800 Subject: [PATCH 32/39] Make `as_epi_df` remove grouping --- NAMESPACE | 1 + R/epi_df.R | 17 +++++++++++++++-- man/epi_df.Rd | 24 ++++++++++++++++++------ tests/testthat/test-epi_df.R | 16 ++++++++++++++++ 4 files changed, 50 insertions(+), 8 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index e044739b..e214d8f3 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -11,6 +11,7 @@ S3method(arrange_row_canonical,default) S3method(arrange_row_canonical,epi_df) S3method(as_epi_df,data.frame) S3method(as_epi_df,epi_df) +S3method(as_epi_df,grouped_df) S3method(as_epi_df,tbl_df) S3method(as_epi_df,tbl_ts) S3method(as_tibble,epi_df) diff --git a/R/epi_df.R b/R/epi_df.R index 5cf379e2..83cca073 100644 --- a/R/epi_df.R +++ b/R/epi_df.R @@ -174,7 +174,7 @@ NULL #' @param other_keys If your tibble has additional keys, be sure to specify them #' as a character vector here (typical examples are "age" or sub-geographies). #' @param ... Additional arguments passed to methods. -#' @return An `epi_df` object. +#' @return * Of `new_epi_df()`: an `epi_df` #' #' @export new_epi_df <- function(x = tibble::tibble(geo_value = character(), time_value = as.Date(integer())), @@ -205,6 +205,8 @@ new_epi_df <- function(x = tibble::tibble(geo_value = character(), time_value = #' to be converted #' @param ... 
used for specifying column names, as in [`dplyr::rename`]. For #' example, `geo_value = STATEFP, time_value = end_date`. +#' @return * Of `as_epi_df()`: an (ungrouped) `epi_df` +#' #' @export as_epi_df <- function(x, ...) { UseMethod("as_epi_df") @@ -215,6 +217,7 @@ as_epi_df <- function(x, ...) { #' @method as_epi_df epi_df #' @export as_epi_df.epi_df <- function(x, ...) { + x <- ungroup(x) return(x) } @@ -296,6 +299,14 @@ as_epi_df.tbl_df <- function( new_epi_df(x, geo_type, time_type, as_of, other_keys) } +#' @rdname epi_df +#' @order 1 +#' @method as_epi_df grouped_df +#' @export +as_epi_df.grouped_df <- function(x, ...) { + as_epi_df(ungroup(x), ...) +} + #' @rdname epi_df #' @order 1 #' @method as_epi_df data.frame @@ -319,9 +330,11 @@ as_epi_df.tbl_ts <- function(x, as_of, other_keys = character(), ...) { #' Test for `epi_df` format #' #' @param x An object. -#' @return `TRUE` if the object inherits from `epi_df`. +#' @return * Of `is_epi_df`: `TRUE` if the object inherits from `epi_df`, +#' otherwise `FALSE`. #' #' @rdname epi_df +#' @order 1 #' @export is_epi_df <- function(x) { inherits(x, "epi_df") diff --git a/man/epi_df.Rd b/man/epi_df.Rd index 4c592ab7..a6782718 100644 --- a/man/epi_df.Rd +++ b/man/epi_df.Rd @@ -4,12 +4,13 @@ \alias{as_epi_df} \alias{as_epi_df.epi_df} \alias{as_epi_df.tbl_df} +\alias{as_epi_df.grouped_df} \alias{as_epi_df.data.frame} \alias{as_epi_df.tbl_ts} +\alias{is_epi_df} \alias{new_epi_df} \alias{epi_df} -\alias{is_epi_df} -\title{\code{epi_df} object} +\title{Test for \code{epi_df} format} \usage{ as_epi_df(x, ...) @@ -24,10 +25,14 @@ as_epi_df(x, ...) ... ) +\method{as_epi_df}{grouped_df}(x, ...) + \method{as_epi_df}{data.frame}(x, as_of, other_keys = character(), ...) \method{as_epi_df}{tbl_ts}(x, as_of, other_keys = character(), ...) +is_epi_df(x) + new_epi_df( x = tibble::tibble(geo_value = character(), time_value = as.Date(integer())), geo_type, @@ -36,8 +41,6 @@ new_epi_df( other_keys = character(), ... ) - -is_epi_df(x) } \arguments{ \item{x}{An object.} @@ -65,9 +68,18 @@ then the current day-time will be used.} as a character vector here (typical examples are "age" or sub-geographies).} } \value{ -An \code{epi_df} object. +\itemize{ +\item Of \code{as_epi_df()}: an (ungrouped) \code{epi_df} +} -\code{TRUE} if the object inherits from \code{epi_df}. +\itemize{ +\item Of \code{is_epi_df}: \code{TRUE} if the object inherits from \code{epi_df}, +otherwise \code{FALSE}. +} + +\itemize{ +\item Of \code{new_epi_df()}: an \code{epi_df} +} } \description{ One of the two main data structures for storing time series in \code{epiprocess}. diff --git a/tests/testthat/test-epi_df.R b/tests/testthat/test-epi_df.R index 297d68df..44bb62e2 100644 --- a/tests/testthat/test-epi_df.R +++ b/tests/testthat/test-epi_df.R @@ -75,6 +75,22 @@ test_that("as_epi_df works for nonstandard input", { ) }) +test_that("as_epi_df ungroups", { + expect_false( + tibble::tibble(geo_value = 1, time_value = 1) %>% + dplyr::group_by(geo_value) %>% + as_epi_df(as_of = 2) %>% + dplyr::is_grouped_df() + ) + expect_false( + tibble::tibble(geo_value = 1, time_value = 1) %>% + as_epi_df(as_of = 2) %>% + dplyr::group_by(geo_value) %>% + as_epi_df(as_of = 2) %>% + dplyr::is_grouped_df() + ) +}) + # select fixes tib <- tibble::tibble( x = 1:10, y = 1:10, From 081bd22f69c5924c9c173050a5e6a9a71cd886da Mon Sep 17 00:00:00 2001 From: "Logan C. 
Brooks" Date: Tue, 26 Nov 2024 04:54:03 -0800 Subject: [PATCH 33/39] Bring back slide unique-key checks, make as_epi_df ukey checks faster --- DESCRIPTION | 2 +- NAMESPACE | 5 +++ NEWS.md | 2 ++ R/epi_df.R | 20 ++++++------ R/epiprocess-package.R | 6 ++++ R/slide.R | 16 +++------- R/utils.R | 54 +++++++++++++++++++++++++++++++++ man/check_ukey_unique.Rd | 27 +++++++++++++++++ man/unwrap.Rd | 18 +++++++++++ tests/testthat/_snaps/epi_df.md | 15 +++++++++ tests/testthat/test-epi_df.R | 14 +++++++++ 11 files changed, 155 insertions(+), 24 deletions(-) create mode 100644 man/check_ukey_unique.Rd create mode 100644 man/unwrap.Rd create mode 100644 tests/testthat/_snaps/epi_df.md diff --git a/DESCRIPTION b/DESCRIPTION index fd09aa57..c3f67c08 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: epiprocess Type: Package Title: Tools for basic signal processing in epidemiology -Version: 0.9.6 +Version: 0.9.7 Authors@R: c( person("Jacob", "Bien", role = "ctb"), person("Logan", "Brooks", , "lcbrooks@andrew.cmu.edu", role = c("aut", "cre")), diff --git a/NAMESPACE b/NAMESPACE index e214d8f3..1f5180fb 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -109,6 +109,7 @@ importFrom(checkmate,assert) importFrom(checkmate,assert_character) importFrom(checkmate,assert_class) importFrom(checkmate,assert_data_frame) +importFrom(checkmate,assert_false) importFrom(checkmate,assert_function) importFrom(checkmate,assert_int) importFrom(checkmate,assert_list) @@ -116,6 +117,8 @@ importFrom(checkmate,assert_logical) importFrom(checkmate,assert_numeric) importFrom(checkmate,assert_scalar) importFrom(checkmate,assert_string) +importFrom(checkmate,assert_subset) +importFrom(checkmate,assert_tibble) importFrom(checkmate,checkInt) importFrom(checkmate,check_atomic) importFrom(checkmate,check_data_frame) @@ -165,6 +168,7 @@ importFrom(dplyr,groups) importFrom(dplyr,if_all) importFrom(dplyr,if_any) importFrom(dplyr,if_else) +importFrom(dplyr,is_grouped_df) importFrom(dplyr,lag) importFrom(dplyr,mutate) importFrom(dplyr,near) @@ -236,3 +240,4 @@ importFrom(tsibble,as_tsibble) importFrom(utils,capture.output) importFrom(utils,tail) importFrom(vctrs,vec_data) +importFrom(vctrs,vec_equal) diff --git a/NEWS.md b/NEWS.md index 436b0c7d..ba6826da 100644 --- a/NEWS.md +++ b/NEWS.md @@ -18,6 +18,7 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat additional arguments for specifying names: `.prefix`, `.suffix`, `.new_col_names`. To obtain the old naming behavior, use `.prefix = "slide_value_"`. +- `as_epi_df` now removes any grouping that `x` had applied. ## Improvements @@ -31,6 +32,7 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat by `geo_value` and any `other_keys` for the slide operation rather than raise an error about duplicated time values. `epi_slide`'s analogous automatic grouping has been made temporary in order to match. +- Improved speed of key-uniqueness checks. 
## Bug fixes diff --git a/R/epi_df.R b/R/epi_df.R index 83cca073..6cae22dd 100644 --- a/R/epi_df.R +++ b/R/epi_df.R @@ -279,22 +279,20 @@ as_epi_df.tbl_df <- function( } assert_character(other_keys) + assert_subset(other_keys, names(x)) + # Fix up if given more than just other keys, at least until epipredict#428 + # merged: + other_keys <- other_keys[!other_keys %in% c("geo_value", "time_value")] if (".time_value_counts" %in% other_keys) { cli_abort("as_epi_df: `other_keys` can't include \".time_value_counts\"") } - if (anyDuplicated(x[c("geo_value", "time_value", other_keys)])) { - duplicated_time_values <- x %>% - group_by(across(all_of(c("geo_value", "time_value", other_keys)))) %>% - filter(dplyr::n() > 1) %>% - ungroup() - bad_data <- capture.output(duplicated_time_values) - cli_abort( - "as_epi_df: some groups in the data have duplicated time values. epi_df requires a unique time_value per group.", - body = c("Sample groups:", bad_data) - ) - } + assert(check_ukey_unique(x, c("geo_value", other_keys, "time_value"), c( + ">" = "If this is line list data, convert it to counts/rates first.", + ">" = "If this contains a demographic breakdown, check that you have + specified appropriate `other_keys`" # . from checkmate + ))) new_epi_df(x, geo_type, time_type, as_of, other_keys) } diff --git a/R/epiprocess-package.R b/R/epiprocess-package.R index 68b7b9b5..675d000d 100644 --- a/R/epiprocess-package.R +++ b/R/epiprocess-package.R @@ -5,8 +5,11 @@ #' @import epidatasets #' @importFrom checkmate anyInfinite anyMissing assert assert_character #' @importFrom checkmate assert_class assert_data_frame assert_int assert_list +#' @importFrom checkmate assert_false #' @importFrom checkmate assert_logical assert_numeric assert_scalar checkInt #' @importFrom checkmate assert_string +#' @importFrom checkmate assert_subset +#' @importFrom checkmate assert_tibble #' @importFrom checkmate check_atomic check_data_frame expect_class test_int #' @importFrom checkmate check_names #' @importFrom checkmate test_subset test_set_equal vname @@ -14,11 +17,14 @@ #' @importFrom data.table as.data.table #' @importFrom data.table key #' @importFrom data.table setkeyv +#' @importFrom dplyr arrange +#' @importFrom dplyr is_grouped_df #' @importFrom dplyr select #' @importFrom lifecycle deprecated #' @importFrom rlang %||% #' @importFrom rlang is_bare_integerish #' @importFrom vctrs vec_data +#' @importFrom vctrs vec_equal ## usethis namespace: end NULL diff --git a/R/slide.R b/R/slide.R index 0c747d17..7342e0fd 100644 --- a/R/slide.R +++ b/R/slide.R @@ -259,18 +259,7 @@ epi_slide <- function( assert_logical(.all_rows, len = 1) # Check for duplicated time values within groups - duplicated_time_values <- .x %>% - group_epi_df() %>% - filter(dplyr::n() > 1) %>% - ungroup() - if (nrow(duplicated_time_values) > 0) { - bad_data <- capture.output(duplicated_time_values) - cli_abort( - "as_epi_df: some groups in a resulting dplyr computation have duplicated time values. - epi_df requires a unique time_value per group.", - body = c("Sample groups:", bad_data) - ) - } + assert(check_ukey_unique(ungroup(.x), c(group_vars(.x), "time_value"))) # Begin handling completion. This will create a complete time index between # the smallest and largest time values in the data. 
This is used to ensure @@ -752,6 +741,9 @@ epi_slide_opt <- function( ) } + # Check for duplicated time values within groups + assert(check_ukey_unique(ungroup(.x), c(group_vars(.x), "time_value"))) + # The position of a given column can be differ between input `.x` and # `.data_group` since the grouping step by default drops grouping columns. # To avoid rerunning `eval_select` for every `.data_group`, convert diff --git a/R/utils.R b/R/utils.R index 55022677..e350ade2 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1195,7 +1195,61 @@ time_type_unit_abbr <- function(time_type) { maybe_unit_abbr } +#' Extract singular element of a length-1 unnamed list (validated) +#' +#' Inverse of `list(elt)`. +#' +#' @param x a length-1 list +#' @return x[[1L]], if x actually was a length-1 list; error otherwise +#' +#' @keywords internal unwrap <- function(x) { checkmate::assert_list(x, len = 1L, names = "unnamed") x[[1L]] } + +#' Check that a unique key is indeed unique in a tibble (TRUE/str) +#' +#' A `checkmate`-style check function. +#' +#' @param x a tibble, with no particular row or column order (if you have a +#' guaranteed row order based on the ukey you can probably do something more +#' efficient) +#' @param ukey_names character vector; subset of column names of `x` denoting a +#' unique key. +#' @param end_cli_message optional character vector, a cli message format +#' string/vector; information/advice to tack onto any error messages. +#' @return `TRUE` if no ukey is duplicated (i.e., `x[ukey_names]` has no +#' duplicated rows); string with an error message if there are errors. +#' +#' @keywords internal +check_ukey_unique <- function(x, ukey_names, end_cli_message = character()) { + assert_tibble(x) # to not have to think about `data.table` perf, xface + assert_false(is_grouped_df(x)) # to not have to think about `grouped_df` perf, xface + assert_character(ukey_names) + assert_subset(ukey_names, names(x)) + # + if (nrow(x) <= 1L) { + TRUE + } else { + # Fast check, slow error message. + arranged_ukeys <- arrange(x[ukey_names], across(all_of(ukey_names))) + if (!any(vec_equal(arranged_ukeys[-1L, ], arranged_ukeys[-nrow(arranged_ukeys), ]))) { + TRUE + } else { + bad_data <- x %>% + group_by(across(all_of(ukey_names))) %>% + filter(dplyr::n() > 1) %>% + ungroup() + lines <- c( + cli::format_error(" + There cannot be more than one row with the same combination of + {format_varnames(ukey_names)}. 
Problematic rows: + "), + capture.output(bad_data), + cli::format_message(end_cli_message) + ) + paste(collapse = "\n", lines) + } + } +} diff --git a/man/check_ukey_unique.Rd b/man/check_ukey_unique.Rd new file mode 100644 index 00000000..c6306f07 --- /dev/null +++ b/man/check_ukey_unique.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{check_ukey_unique} +\alias{check_ukey_unique} +\title{Check that a unique key is indeed unique in a tibble (TRUE/str)} +\usage{ +check_ukey_unique(x, ukey_names, end_cli_message = character()) +} +\arguments{ +\item{x}{a tibble, with no particular row or column order (if you have a +guaranteed row order based on the ukey you can probably do something more +efficient)} + +\item{ukey_names}{character vector; subset of column names of \code{x} denoting a +unique key.} + +\item{end_cli_message}{optional character vector, a cli message format +string/vector; information/advice to tack onto any error messages.} +} +\value{ +\code{TRUE} if no ukey is duplicated (i.e., \code{x[ukey_names]} has no +duplicated rows); string with an error message if there are errors. +} +\description{ +A \code{checkmate}-style check function. +} +\keyword{internal} diff --git a/man/unwrap.Rd b/man/unwrap.Rd new file mode 100644 index 00000000..dad0b441 --- /dev/null +++ b/man/unwrap.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{unwrap} +\alias{unwrap} +\title{Extract singular element of a length-1 unnamed list (validated)} +\usage{ +unwrap(x) +} +\arguments{ +\item{x}{a length-1 list} +} +\value{ +x[\link{1L}], if x actually was a length-1 list; error otherwise +} +\description{ +Inverse of \code{list(elt)}. +} +\keyword{internal} diff --git a/tests/testthat/_snaps/epi_df.md b/tests/testthat/_snaps/epi_df.md new file mode 100644 index 00000000..29280bf8 --- /dev/null +++ b/tests/testthat/_snaps/epi_df.md @@ -0,0 +1,15 @@ +# as_epi_df errors on nonunique epikeytime + + Code + as_epi_df(tibble::tibble(geo_value = 1, time_value = 1, value = 1:2), as_of = 5) + Condition + Error: + ! Assertion on 'x' failed: There cannot be more than one row with the same combination of geo_value and time_value. Problematic rows: + # A tibble: 2 x 3 + geo_value time_value value + + 1 1 1 1 + 2 1 1 2 + > If this is line list data, convert it to counts/rates first. + > If this contains a demographic breakdown, check that you have specified appropriate `other_keys`. + diff --git a/tests/testthat/test-epi_df.R b/tests/testthat/test-epi_df.R index 44bb62e2..c3e51aa2 100644 --- a/tests/testthat/test-epi_df.R +++ b/tests/testthat/test-epi_df.R @@ -40,6 +40,20 @@ test_that("as_epi_df errors for non-character other_keys", { expect_silent(as_epi_df(ex_input, other_keys = c("state", "pol"))) }) +test_that("as_epi_df errors on nonunique epikeytime", { + expect_snapshot( + as_epi_df(tibble::tibble( + geo_value = 1, time_value = 1, value = 1:2 + ), as_of = 5), + error = TRUE + ) + expect_no_error( + as_epi_df(tibble::tibble( + geo_value = 1, age_group = 1:2, time_value = 1, value = 1:2 + ), other_keys = "age_group", as_of = 5) + ) +}) + test_that("as_epi_df works for nonstandard input", { tib <- tibble::tibble( x = 1:10, y = 1:10, From 79093a8b97231301881b6fb6d1bc291180784379 Mon Sep 17 00:00:00 2001 From: "Logan C. 
Brooks" Date: Tue, 26 Nov 2024 05:32:53 -0800 Subject: [PATCH 34/39] Fix _pkgdown.yml (data set topics removed; link epidatasets topics) --- _pkgdown.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/_pkgdown.yml b/_pkgdown.yml index 2214df7c..1344705c 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -103,11 +103,11 @@ reference: - title: Example data - contents: - - cases_deaths_subset - - archive_cases_dv_subset - - covid_incidence_county_subset - - covid_incidence_outliers - - covid_case_death_rates_extended + - epidatasets::cases_deaths_subset + - epidatasets::archive_cases_dv_subset + - epidatasets::covid_incidence_county_subset + - epidatasets::covid_incidence_outliers + - epidatasets::covid_case_death_rates_extended - title: internal - contents: From b4bacf865259af914c1b171df89c19bad06a0366 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Tue, 26 Nov 2024 06:06:41 -0800 Subject: [PATCH 35/39] Mollify CHECK regarding epidatasets re-exports --- R/reexports.R | 19 +++++++++++++++++ man/epidatasets_reexports.Rd | 40 ++++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+) create mode 100644 man/epidatasets_reexports.Rd diff --git a/R/reexports.R b/R/reexports.R index 72241175..9a33e94b 100644 --- a/R/reexports.R +++ b/R/reexports.R @@ -79,17 +79,36 @@ ggplot2::autoplot # epidatasets ------------------------------------------------------------------- +#' @rdname epidatasets_reexports +#' +#' @title Selected example data sets from `epidatasets` +#' +#' @description Data sets re-exported from `epidatasets`; please see +#' documentation for each of these objects in `epidatasets`. +#' +#' A brief description of the format of each of the objects above are described +#' in matching order below. +#' +#' @keywords internal #' @export delayedAssign("cases_deaths_subset", epidatasets::cases_deaths_subset) +#' @rdname epidatasets_reexports +#' @keywords internal #' @export delayedAssign("covid_incidence_county_subset", epidatasets::covid_incidence_county_subset) +#' @rdname epidatasets_reexports +#' @keywords internal #' @export delayedAssign("covid_incidence_outliers", epidatasets::covid_incidence_outliers) +#' @rdname epidatasets_reexports +#' @keywords internal #' @export delayedAssign("archive_cases_dv_subset", epidatasets::archive_cases_dv_subset) +#' @rdname epidatasets_reexports +#' @keywords internal #' @export delayedAssign("covid_case_death_rates_extended", epidatasets::covid_case_death_rates_extended) diff --git a/man/epidatasets_reexports.Rd b/man/epidatasets_reexports.Rd new file mode 100644 index 00000000..3dc809e4 --- /dev/null +++ b/man/epidatasets_reexports.Rd @@ -0,0 +1,40 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/reexports.R +\docType{data} +\name{cases_deaths_subset} +\alias{cases_deaths_subset} +\alias{covid_incidence_county_subset} +\alias{covid_incidence_outliers} +\alias{archive_cases_dv_subset} +\alias{covid_case_death_rates_extended} +\title{Selected example data sets from \code{epidatasets}} +\format{ +An object of class \code{epi_df} (inherits from \code{tbl_df}, \code{tbl}, \code{data.frame}) with 4026 rows and 6 columns. + +An object of class \code{epi_df} (inherits from \code{tbl_df}, \code{tbl}, \code{data.frame}) with 16212 rows and 5 columns. + +An object of class \code{epi_df} (inherits from \code{tbl_df}, \code{tbl}, \code{data.frame}) with 730 rows and 3 columns. + +An object of class \code{epi_archive} of length 6. 
+ +An object of class \code{epi_df} (inherits from \code{tbl_df}, \code{tbl}, \code{data.frame}) with 37576 rows and 4 columns. +} +\usage{ +cases_deaths_subset + +covid_incidence_county_subset + +covid_incidence_outliers + +archive_cases_dv_subset + +covid_case_death_rates_extended +} +\description{ +Data sets re-exported from \code{epidatasets}; please see +documentation for each of these objects in \code{epidatasets}. + +A brief description of the format of each of the objects above are described +in matching order below. +} +\keyword{internal} From 7bf4c6a34ab7c6f7dce70a0065136cfdadde96bc Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Mon, 9 Dec 2024 15:29:31 -0800 Subject: [PATCH 36/39] Add missing dplyr import in slide example --- R/slide.R | 2 ++ man/epi_slide_opt.Rd | 2 ++ 2 files changed, 4 insertions(+) diff --git a/R/slide.R b/R/slide.R index 7342e0fd..e8fafa7a 100644 --- a/R/slide.R +++ b/R/slide.R @@ -621,6 +621,8 @@ get_before_after_from_window <- function(window_size, align, time_type) { #' @export #' @seealso [`epi_slide`] for the more general slide function #' @examples +#' library(dplyr) +#' #' # Add a column (`cases_7dsum`) containing a 7-day trailing sum on `cases`: #' cases_deaths_subset %>% #' select(geo_value, time_value, cases) %>% diff --git a/man/epi_slide_opt.Rd b/man/epi_slide_opt.Rd index 7dc47a16..68244410 100644 --- a/man/epi_slide_opt.Rd +++ b/man/epi_slide_opt.Rd @@ -162,6 +162,8 @@ for \code{.f} factoring in the input column type(s) for \code{.col_names} } \examples{ +library(dplyr) + # Add a column (`cases_7dsum`) containing a 7-day trailing sum on `cases`: cases_deaths_subset \%>\% select(geo_value, time_value, cases) \%>\% From 101aad80633c080583af15bf4d5d71f231d98df5 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Mon, 9 Dec 2024 15:43:40 -0800 Subject: [PATCH 37/39] Bump to 0.10.0, update Author info --- DESCRIPTION | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index e1568a9e..e894f099 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,10 +1,10 @@ Package: epiprocess Type: Package Title: Tools for basic signal processing in epidemiology -Version: 0.9.7 +Version: 0.10.0 Authors@R: c( person("Jacob", "Bien", role = "ctb"), - person("Logan", "Brooks", , "lcbrooks@andrew.cmu.edu", role = c("aut", "cre")), + person("Logan", "Brooks", , "lcbrooks+github@andrew.cmu.edu", role = c("aut", "cre")), person("Rafael", "Catoia", role = "ctb"), person("Nat", "DeFries", role = "ctb"), person("Daniel", "McDonald", role = "aut"), @@ -13,8 +13,9 @@ Authors@R: c( person("Chloe", "You", role = "ctb"), person("Quang", "Nguyen", role = "ctb"), person("Evan", "Ray", role = "aut"), - person("Dmitry", "Shemetov", role = "ctb"), + person("Dmitry", "Shemetov", role = "aut"), person("Ryan", "Tibshirani", role = "aut"), + person("David", "Weber", , "davidweb@andrew.cmu.edu", role = "ctb"), person("Lionel", "Henry", role = "ctb", comment = "Author of included rlang fragments"), person("Hadley", "Wickham", role = "ctb", From 271aaaa8974451db20896d104fa21e686ddcebb8 Mon Sep 17 00:00:00 2001 From: "Logan C. 
Brooks" Date: Mon, 9 Dec 2024 22:39:54 -0800 Subject: [PATCH 38/39] Improve unsupported arg check messages in epi_slide_{sum,mean} --- R/slide.R | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/R/slide.R b/R/slide.R index e8fafa7a..761639d4 100644 --- a/R/slide.R +++ b/R/slide.R @@ -1010,8 +1010,9 @@ epi_slide_mean <- function( } if ("new_col_name" %in% provided_args || ".new_col_name" %in% provided_args) { cli::cli_abort( - "epi_slide_mean: the argument `new_col_name` is not supported. If you want to customize - the output column names, use `dplyr::rename` after the slide." + "epi_slide_opt: the argument `new_col_name` is not supported for `epi_slide_opt`. If you want to customize + the output column names, use `.prefix =`, `.suffix =`, or `.new_col_**names** =`.", + class = "epiprocess__epi_slide_opt__new_name_not_supported" ) } if ("names_sep" %in% provided_args || ".names_sep" %in% provided_args) { @@ -1069,8 +1070,9 @@ epi_slide_sum <- function( } if ("new_col_name" %in% provided_args || ".new_col_name" %in% provided_args) { cli::cli_abort( - "epi_slide_sum: the argument `new_col_name` is not supported. If you want to customize - the output column names, use `dplyr::rename` after the slide." + "epi_slide_opt: the argument `new_col_name` is not supported for `epi_slide_opt`. If you want to customize + the output column names, use `.prefix =`, `.suffix =`, or `.new_col_**names** =`.", + class = "epiprocess__epi_slide_opt__new_name_not_supported" ) } if ("names_sep" %in% provided_args || ".names_sep" %in% provided_args) { From 5da97c7bfd15b27195eb0637a6ce44aefb8ada43 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Mon, 9 Dec 2024 22:40:28 -0800 Subject: [PATCH 39/39] Re-document --- man/epiprocess-package.Rd | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/man/epiprocess-package.Rd b/man/epiprocess-package.Rd index b4f3e174..774d5f8a 100644 --- a/man/epiprocess-package.Rd +++ b/man/epiprocess-package.Rd @@ -16,12 +16,13 @@ Useful links: } \author{ -\strong{Maintainer}: Logan Brooks \email{lcbrooks@andrew.cmu.edu} +\strong{Maintainer}: Logan Brooks \email{lcbrooks+github@andrew.cmu.edu} Authors: \itemize{ \item Daniel McDonald \item Evan Ray + \item Dmitry Shemetov \item Ryan Tibshirani } @@ -34,7 +35,7 @@ Other contributors: \item Ken Mawer [contributor] \item Chloe You [contributor] \item Quang Nguyen [contributor] - \item Dmitry Shemetov [contributor] + \item David Weber \email{davidweb@andrew.cmu.edu} [contributor] \item Lionel Henry (Author of included rlang fragments) [contributor] \item Hadley Wickham (Author of included rlang fragments) [contributor] \item Posit (Copyright holder of included rlang fragments) [copyright holder]