diff --git a/DESCRIPTION b/DESCRIPTION index 71d95969..c737e0c6 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: epiprocess Title: Tools for basic signal processing in epidemiology -Version: 0.7.5 +Version: 0.7.6 Authors@R: c( person("Jacob", "Bien", role = "ctb"), person("Logan", "Brooks", email = "lcbrooks@andrew.cmu.edu", role = c("aut", "cre")), diff --git a/NAMESPACE b/NAMESPACE index 03e0e41d..cf59f29d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -46,6 +46,7 @@ export(detect_outlr_stl) export(epi_archive) export(epi_cor) export(epi_slide) +export(epi_slide_mean) export(epix_as_of) export(epix_merge) export(epix_slide) @@ -74,6 +75,7 @@ importFrom(checkmate,assert) importFrom(checkmate,assert_character) importFrom(checkmate,assert_class) importFrom(checkmate,assert_data_frame) +importFrom(checkmate,assert_function) importFrom(checkmate,assert_int) importFrom(checkmate,assert_list) importFrom(checkmate,assert_logical) @@ -93,11 +95,13 @@ importFrom(data.table,address) importFrom(data.table,as.data.table) importFrom(data.table,between) importFrom(data.table,copy) +importFrom(data.table,frollmean) importFrom(data.table,key) importFrom(data.table,rbindlist) importFrom(data.table,set) importFrom(data.table,setDF) importFrom(data.table,setkeyv) +importFrom(dplyr,"%>%") importFrom(dplyr,arrange) importFrom(dplyr,bind_rows) importFrom(dplyr,dplyr_col_modify) @@ -114,17 +118,21 @@ importFrom(dplyr,relocate) importFrom(dplyr,rename) importFrom(dplyr,select) importFrom(dplyr,slice) +importFrom(dplyr,tibble) importFrom(dplyr,ungroup) importFrom(ggplot2,autoplot) +importFrom(lubridate,as.period) importFrom(lubridate,days) importFrom(lubridate,weeks) importFrom(magrittr,"%>%") +importFrom(purrr,map) importFrom(purrr,map_lgl) importFrom(rlang,"!!!") importFrom(rlang,"!!") importFrom(rlang,.data) importFrom(rlang,.env) importFrom(rlang,arg_match) +importFrom(rlang,as_label) importFrom(rlang,caller_arg) importFrom(rlang,caller_env) importFrom(rlang,enquo) @@ -139,6 +147,7 @@ importFrom(rlang,is_missing) importFrom(rlang,is_quosure) importFrom(rlang,missing_arg) importFrom(rlang,new_function) +importFrom(rlang,quo_get_expr) importFrom(rlang,quo_is_missing) importFrom(rlang,sym) importFrom(rlang,syms) diff --git a/NEWS.md b/NEWS.md index 5bf584e6..908302f2 100644 --- a/NEWS.md +++ b/NEWS.md @@ -9,6 +9,8 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat - `epi_slide` computations are now 2-4 times faster after changing how reference time values, made accessible within sliding functions, are calculated (#397). +- Add new `epi_slide_mean` function to allow much (~30x) faster rolling + average computations in some cases (#400). - regenerated the `jhu_csse_daily_subset` dataset with the latest versions of the data from the API - changed approach to versioning, see DEVELOPMENT.md for details diff --git a/R/slide.R b/R/slide.R index bbcaeb02..b0903fab 100644 --- a/R/slide.R +++ b/R/slide.R @@ -32,16 +32,18 @@ #' provided; the other's default will be 0. Any value provided for either #' argument must be a single, non-`NA`, non-negative, #' [integer-compatible][vctrs::vec_cast] number of time steps. Endpoints of -#' the window are inclusive. Common settings: * For trailing/right-aligned -#' windows from `ref_time_value - time_step(k)` to `ref_time_value`: either -#' pass `before=k` by itself, or pass `before=k, after=0`. * For -#' center-aligned windows from `ref_time_value - time_step(k)` to -#' `ref_time_value + time_step(k)`: pass `before=k, after=k`. * For -#' leading/left-aligned windows from `ref_time_value` to `ref_time_value + -#' time_step(k)`: either pass pass `after=k` by itself, or pass `before=0, -#' after=k`. See "Details:" about the definition of a time step, -#' (non)treatment of missing rows within the window, and avoiding warnings -#' about `before`&`after` settings for a certain uncommon use case. +#' the window are inclusive. Common settings: +#' * For trailing/right-aligned windows from `ref_time_value - time_step +#' (k)` to `ref_time_value`: either pass `before=k` by itself, or pass +#' `before=k, after=0`. +#' * For center-aligned windows from `ref_time_value - time_step(k)` to +#' `ref_time_value + time_step(k)`: pass `before=k, after=k`. +#' * For leading/left-aligned windows from `ref_time_value` to +#' `ref_time_value + time_step(k)`: either pass pass `after=k` by itself, +#' or pass `before=0, after=k`. +#' See "Details:" about the definition of a time step,(non)treatment of +#' missing rows within the window, and avoiding warnings about +#' `before`&`after` settings for a certain uncommon use case. #' @param ref_time_values Time values for sliding computations, meaning, each #' element of this vector serves as the reference time point for one sliding #' window. If missing, then this will be set to all unique time values in the @@ -125,34 +127,41 @@ #' @importFrom dplyr bind_rows group_vars filter select #' @importFrom rlang .data .env !! enquo enquos sym env missing_arg #' @export +#' @seealso [`epi_slide_mean`] #' @examples #' # slide a 7-day trailing average formula on cases +#' # This and other simple sliding means are much faster to do using +#' # the `epi_slide_mean` function instead. #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% #' epi_slide(cases_7dav = mean(cases), before = 6) %>% -#' # rmv a nonessential var. to ensure new col is printed -#' dplyr::select(-death_rate_7d_av) +#' # Remove a nonessential var. to ensure new col is printed +#' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% +#' ungroup() #' #' # slide a 7-day leading average #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% #' epi_slide(cases_7dav = mean(cases), after = 6) %>% -#' # rmv a nonessential var. to ensure new col is printed -#' dplyr::select(-death_rate_7d_av) +#' # Remove a nonessential var. to ensure new col is printed +#' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% +#' ungroup() #' #' # slide a 7-day centre-aligned average #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% #' epi_slide(cases_7dav = mean(cases), before = 3, after = 3) %>% -#' # rmv a nonessential var. to ensure new col is printed -#' dplyr::select(-death_rate_7d_av) +#' # Remove a nonessential var. to ensure new col is printed +#' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% +#' ungroup() #' #' # slide a 14-day centre-aligned average #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide(cases_7dav = mean(cases), before = 6, after = 7) %>% -#' # rmv a nonessential var. to ensure new col is printed -#' dplyr::select(-death_rate_7d_av) +#' epi_slide(cases_14dav = mean(cases), before = 6, after = 7) %>% +#' # Remove a nonessential var. to ensure new col is printed +#' dplyr::select(geo_value, time_value, cases, cases_14dav) %>% +#' ungroup() #' #' # nested new columns #' jhu_csse_daily_subset %>% @@ -163,7 +172,8 @@ #' cases_2dma = mad(cases) #' ), #' before = 1, as_list_col = TRUE -#' ) +#' ) %>% +#' ungroup() epi_slide <- function(x, f, ..., before, after, ref_time_values, time_step, new_col_name = "slide_value", as_list_col = FALSE, @@ -373,3 +383,453 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, return(x) } + +#' Optimized slide function for performing rolling averages on an `epi_df` object +#' +#' Slides an n-timestep mean over variables in an `epi_df` object. See the [slide +#' vignette](https://cmu-delphi.github.io/epiprocess/articles/slide.html) for +#' examples. +#' +#' @param x The `epi_df` object under consideration, [grouped][dplyr::group_by] +#' or ungrouped. If ungrouped, all data in `x` will be treated as part of a +#' single data group. +#' @param col_names A single tidyselection or a tidyselection vector of the +#' names of one or more columns for which to calculate the rolling mean. +#' @param ... Additional arguments to pass to `data.table::frollmean`, for +#' example, `na.rm` and `algo`. `data.table::frollmean` is automatically +#' passed the data `x` to operate on, the window size `n`, and the alignment +#' `align`. Providing these args via `...` will cause an error. +#' @param before,after How far `before` and `after` each `ref_time_value` should +#' the sliding window extend? At least one of these two arguments must be +#' provided; the other's default will be 0. Any value provided for either +#' argument must be a single, non-`NA`, non-negative, +#' [integer-compatible][vctrs::vec_cast] number of time steps. Endpoints of +#' the window are inclusive. Common settings: +#' * For trailing/right-aligned windows from `ref_time_value - time_step +#' (k)` to `ref_time_value`: either pass `before=k` by itself, or pass +#' `before=k, after=0`. +#' * For center-aligned windows from `ref_time_value - time_step(k)` to +#' `ref_time_value + time_step(k)`: pass `before=k, after=k`. +#' * For leading/left-aligned windows from `ref_time_value` to +#' `ref_time_value + time_step(k)`: either pass pass `after=k` by itself, +#' or pass `before=0, after=k`. +#' See "Details:" about the definition of a time step,(non)treatment of +#' missing rows within the window, and avoiding warnings about +#' `before`&`after` settings for a certain uncommon use case. +#' @param ref_time_values Time values for sliding computations, meaning, each +#' element of this vector serves as the reference time point for one sliding +#' window. If missing, then this will be set to all unique time values in the +#' underlying data table, by default. +#' @param time_step Optional function used to define the meaning of one time +#' step, which if specified, overrides the default choice based on the +#' `time_value` column. This function must take a non-negative integer and +#' return an object of class `lubridate::period`. For example, we can use +#' `time_step = lubridate::hours` in order to set the time step to be one hour +#' (this would only be meaningful if `time_value` is of class `POSIXct`). +#' @param new_col_names String indicating the name of the new column that will +#' contain the derivative values. Default is "slide_value"; note that setting +#' `new_col_names` equal to an existing column name will overwrite this column. +#' @param as_list_col Not supported. Included to match `epi_slide` interface. +#' @param names_sep String specifying the separator to use in `tidyr::unnest()` +#' when `as_list_col = FALSE`. Default is "_". Using `NULL` drops the prefix +#' from `new_col_names` entirely. +#' @param all_rows If `all_rows = TRUE`, then all rows of `x` will be kept in +#' the output even with `ref_time_values` provided, with some type of missing +#' value marker for the slide computation output column(s) for `time_value`s +#' outside `ref_time_values`; otherwise, there will be one row for each row in +#' `x` that had a `time_value` in `ref_time_values`. Default is `FALSE`. The +#' missing value marker is the result of `vctrs::vec_cast`ing `NA` to the type +#' of the slide computation output. +#' @return An `epi_df` object given by appending one or more new columns to +#' `x`, depending on the `col_names` argument, named according to the +#' `new_col_names` argument. +#' +#' @details To "slide" means to apply a function or formula over a rolling +#' window of time steps for each data group, where the window is entered at a +#' reference time and left and right endpoints are given by the `before` and +#' `after` arguments. The unit (the meaning of one time step) is implicitly +#' defined by the way the `time_value` column treats addition and subtraction; +#' for example, if the time values are coded as `Date` objects, then one time +#' step is one day, since `as.Date("2022-01-01") + 1` equals +#' `as.Date("2022-01-02")`. Alternatively, the time step can be set explicitly +#' using the `time_step` argument (which if specified would override the +#' default choice based on `time_value` column). If there are not enough time +#' steps available to complete the window at any given reference time, then +#' `epi_slide()` still attempts to perform the computation anyway (it does not +#' require a complete window). The issue of what to do with partial +#' computations (those run on incomplete windows) is therefore left up to the +#' user, either through the specified function or formula `f`, or through +#' post-processing. For a centrally-aligned slide of `n` `time_value`s in a +#' sliding window, set `before = (n-1)/2` and `after = (n-1)/2` when the +#' number of `time_value`s in a sliding window is odd and `before = n/2-1` and +#' `after = n/2` when `n` is even. +#' +#' Sometimes, we want to experiment with various trailing or leading window +#' widths and compare the slide outputs. In the (uncommon) case where +#' zero-width windows are considered, manually pass both the `before` and +#' `after` arguments in order to prevent potential warnings. (E.g., `before=k` +#' with `k=0` and `after` missing may produce a warning. To avoid warnings, +#' use `before=k, after=0` instead; otherwise, it looks too much like a +#' leading window was intended, but the `after` argument was forgotten or +#' misspelled.) +#' +#' @importFrom dplyr bind_rows mutate %>% arrange tibble select +#' @importFrom rlang enquo quo_get_expr as_label +#' @importFrom purrr map +#' @importFrom data.table frollmean +#' @importFrom lubridate as.period +#' @importFrom checkmate assert_function +#' @export +#' @seealso [`epi_slide`] +#' @examples +#' # slide a 7-day trailing average formula on cases +#' jhu_csse_daily_subset %>% +#' group_by(geo_value) %>% +#' epi_slide_mean(cases, new_col_names = "cases_7dav", names_sep = NULL, before = 6) %>% +#' # Remove a nonessential var. to ensure new col is printed +#' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% +#' ungroup() +#' +#' # slide a 7-day trailing average formula on cases. Adjust `frollmean` settings for speed +#' # and accuracy, and to allow partially-missing windows. +#' jhu_csse_daily_subset %>% +#' group_by(geo_value) %>% +#' epi_slide_mean(cases, +#' new_col_names = "cases_7dav", names_sep = NULL, before = 6, +#' # `frollmean` options +#' na.rm = TRUE, algo = "exact", hasNA = TRUE +#' ) %>% +#' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% +#' ungroup() +#' +#' # slide a 7-day leading average +#' jhu_csse_daily_subset %>% +#' group_by(geo_value) %>% +#' epi_slide_mean(cases, new_col_names = "cases_7dav", names_sep = NULL, after = 6) %>% +#' # Remove a nonessential var. to ensure new col is printed +#' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% +#' ungroup() +#' +#' # slide a 7-day centre-aligned average +#' jhu_csse_daily_subset %>% +#' group_by(geo_value) %>% +#' epi_slide_mean(cases, new_col_names = "cases_7dav", names_sep = NULL, before = 3, after = 3) %>% +#' # Remove a nonessential var. to ensure new col is printed +#' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% +#' ungroup() +#' +#' # slide a 14-day centre-aligned average +#' jhu_csse_daily_subset %>% +#' group_by(geo_value) %>% +#' epi_slide_mean(cases, new_col_names = "cases_14dav", names_sep = NULL, before = 6, after = 7) %>% +#' # Remove a nonessential var. to ensure new col is printed +#' dplyr::select(geo_value, time_value, cases, cases_14dav) %>% +#' ungroup() +epi_slide_mean <- function(x, col_names, ..., before, after, ref_time_values, + time_step, + new_col_names = "slide_value", as_list_col = NULL, + names_sep = "_", all_rows = FALSE) { + assert_class(x, "epi_df") + + if (nrow(x) == 0L) { + cli_abort( + c( + "input data `x` unexpectedly has 0 rows", + "i" = "If this computation is occuring within an `epix_slide` call, + check that `epix_slide` `ref_time_values` argument was set appropriately" + ), + class = "epiprocess__epi_slide_mean__0_row_input", + epiprocess__x = x + ) + } + + if (!is.null(as_list_col)) { + cli_abort( + "`as_list_col` is not supported for `epi_slide_mean`", + class = "epiproces__epi_slide_mean__list_not_supported" + ) + } + + user_provided_rtvs <- !missing(ref_time_values) + if (!user_provided_rtvs) { + ref_time_values <- unique(x$time_value) + } else { + assert_numeric(ref_time_values, min.len = 1L, null.ok = FALSE, any.missing = FALSE) + if (!test_subset(ref_time_values, unique(x$time_value))) { + cli_abort( + "`ref_time_values` must be a unique subset of the time values in `x`." + ) + } + if (anyDuplicated(ref_time_values) != 0L) { + cli_abort("`ref_time_values` must not contain any duplicates; use `unique` if appropriate.") + } + } + ref_time_values <- sort(ref_time_values) + + # Validate and pre-process `before`, `after`: + if (!missing(before)) { + before <- vctrs::vec_cast(before, integer()) + assert_int(before, lower = 0, null.ok = FALSE, na.ok = FALSE) + } + if (!missing(after)) { + after <- vctrs::vec_cast(after, integer()) + assert_int(after, lower = 0, null.ok = FALSE, na.ok = FALSE) + } + if (missing(before)) { + if (missing(after)) { + cli_abort("Either or both of `before`, `after` must be provided.") + } else if (after == 0L) { + cli_warn("`before` missing, `after==0`; maybe this was intended to be some + non-zero-width trailing window, but since `before` appears to be + missing, it's interpreted as a zero-width window (`before=0, + after=0`).") + } + before <- 0L + } else if (missing(after)) { + if (before == 0L) { + cli_warn("`before==0`, `after` missing; maybe this was intended to be some + non-zero-width leading window, but since `after` appears to be + missing, it's interpreted as a zero-width window (`before=0, + after=0`).") + } + after <- 0L + } + + # Make a complete date sequence between min(x$time_value) and max(x$time_value). + date_seq_list <- full_date_seq(x, before, after, time_step) + all_dates <- date_seq_list$all_dates + pad_early_dates <- date_seq_list$pad_early_dates + pad_late_dates <- date_seq_list$pad_late_dates + + # `frollmean` is 1-indexed, so create a new window width based on our + # `before` and `after` params. + window_size <- before + after + 1L + + col_names_quo <- enquo(col_names) + col_names_chr <- as.character(rlang::quo_get_expr(col_names_quo)) + if (startsWith(rlang::as_label(col_names_quo), "c(")) { + # List or vector of col names. We need to drop the first element since it + # will be either "c" (if built as a vector) or "list" (if built as a + # list). + col_names_chr <- col_names_chr[-1] + } else if (startsWith(rlang::as_label(col_names_quo), "list(")) { + cli_abort( + "`col_names` must be a single tidy column name or a vector + (`c()`) of tidy column names", + class = "epiprocess__epi_slide_mean__col_names_in_list", + epiprocess__col_names = col_names_chr + ) + } + # If single column name, do nothing. + + if (is.null(names_sep)) { + if (length(new_col_names) != length(col_names_chr)) { + cli_abort( + c( + "`new_col_names` must be the same length as `col_names` when + `names_sep` is NULL to avoid duplicate output column names." + ), + class = "epiprocess__epi_slide_mean__col_names_length_mismatch", + epiprocess__new_col_names = new_col_names, + epiprocess__col_names = col_names_chr + ) + } + result_col_names <- new_col_names + } else { + if (length(new_col_names) != 1L && length(new_col_names) != length(col_names_chr)) { + cli_abort( + "`new_col_names` must be either length 1 or the same length as `col_names`.", + class = "epiprocess__epi_slide_mean__col_names_length_mismatch_and_not_one", + epiprocess__new_col_names = new_col_names, + epiprocess__col_names = col_names_chr + ) + } + result_col_names <- paste(new_col_names, col_names_chr, sep = names_sep) + } + + slide_one_grp <- function(.data_group, .group_key, ...) { + missing_times <- all_dates[!(all_dates %in% .data_group$time_value)] + + # `frollmean` requires a full window to compute a result. Add NA values + # to beginning and end of the group so that we get results for the + # first `before` and last `after` elements. + .data_group <- bind_rows( + .data_group, + tibble(time_value = c(missing_times, pad_early_dates, pad_late_dates), .real = FALSE) + ) %>% + arrange(.data$time_value) + + # If a group contains duplicate time values, `frollmean` will still only + # use the last `k` obs. It isn't looking at dates, it just goes in row + # order. So if the computation is aggregating across multiple obs for the + # same date, `epi_slide_mean` will produce incorrect results; `epi_slide` + # should be used instead. + if (anyDuplicated(.data_group$time_value) != 0L) { + cli_abort( + c( + "group contains duplicate time values. Using `epi_slide_mean` on this + group will result in incorrect results", + "i" = "Please change the grouping structure of the input data so that + each group has non-duplicate time values (e.g. `x %>% group_by(geo_value) + %>% epi_slide_mean`)", + "i" = "Use `epi_slide` to aggregate across groups" + ), + class = "epiprocess__epi_slide_mean__duplicate_time_values", + epiprocess__data_group = .data_group, + epiprocess__group_key = .group_key + ) + } + if (nrow(.data_group) != length(c(all_dates, pad_early_dates, pad_late_dates))) { + cli_abort( + c( + "group contains an unexpected number of rows", + "i" = c("Input data may contain `time_values` closer together than the + expected `time_step` size") + ), + class = "epiprocess__epi_slide_mean__unexpected_row_number", + epiprocess__data_group = .data_group, + epiprocess__group_key = .group_key + ) + } + + roll_output <- data.table::frollmean( + x = select(.data_group, {{ col_names }}), n = window_size, align = "right", ... + ) + + if (after >= 1) { + # Right-aligned `frollmean` results' `ref_time_value`s will be `after` + # timesteps ahead of where they should be. Shift results to the left by + # `after` timesteps. + .data_group[, result_col_names] <- purrr::map(roll_output, function(.x) { + c(.x[(after + 1L):length(.x)], rep(NA, after)) + }) + } else { + .data_group[, result_col_names] <- roll_output + } + + return(.data_group) + } + + result <- mutate(x, .real = TRUE) %>% + group_modify(slide_one_grp, ..., .keep = FALSE) + + result <- result[result$.real, ] + result$.real <- NULL + + if (all_rows) { + result[!(result$time_value %in% ref_time_values), result_col_names] <- NA + } else if (user_provided_rtvs) { + result <- result[result$time_value %in% ref_time_values, ] + } + + if (!is_epi_df(result)) { + # `all_rows`handling strip epi_df format and metadata. + # Restore them. + result <- reclass(result, attributes(x)$metadata) + } + + return(result) +} + +#' Make a complete date sequence between min(x$time_value) and max +#' (x$time_value). Produce lists of dates before min(x$time_value) and after +#' max(x$time_value) for padding initial and final windows to size `n`. +#' +#' `before` and `after` inputs here should be raw (numeric) values; +#' `time_step` function should NOT have been applied. `full_date_seq` applies +#' `time_step` as needed. +#' +#' @importFrom checkmate assert_function +#' @noRd +full_date_seq <- function(x, before, after, time_step) { + pad_early_dates <- c() + pad_late_dates <- c() + + # If dates are one of tsibble-provided classes, can step by numeric. `tsibble` + # defines a step of 1 to automatically be the quantum (smallest resolvable + # unit) of the date class. For example, one step = 1 quarter for `yearquarter`. + # + # `tsibble` classes apparently can't be added to in different units, so even + # if `time_step` is provided by the user, use a value-1 unitless step. + if (inherits(x$time_value, c("yearquarter", "yearweek", "yearmonth")) || + is.numeric(x$time_value)) { + all_dates <- seq(min(x$time_value), max(x$time_value), by = 1L) + + if (before != 0) { + pad_early_dates <- all_dates[1L] - before:1 + } + if (after != 0) { + pad_late_dates <- all_dates[length(all_dates)] + 1:after + } + } else if (missing(time_step)) { + # Guess what `by` should be based on the epi_df's `time_type`. + ttype <- attributes(x)$metadata$time_type + by <- switch(ttype, + day = "days", + week = "weeks", + yearweek = "weeks", + yearmonth = "months", + yearquarter = "quarters", + year = "years", + NA # default value for "custom", "day-time" + ) + + if (is.na(by)) { + cli_abort( + c( + "`frollmean` requires a full window to compute a result, but the + `time_type` associated with the epi_df was not mappable to a period + type valid for creating a date sequence.", + "i" = c("The input data's `time_type` was probably `custom` or `day-time`. + These require also passing a `time_step` function.") + ), + class = "epiprocess__epi_slide_mean__unmappable_time_type", + epiprocess__time_type = ttype + ) + } + + # `seq.Date` `by` arg can be any of `c("days", "weeks", "months", "quarters", "years")`. + all_dates <- seq(min(x$time_value), max(x$time_value), by = by) + + if (before != 0) { + # Use `seq.Date` here to avoid having to map `epi_df` `time_type` to + # `time_step` functions. + # + # The first element `seq.Date` returns is always equal to the provided + # `from` date (`from + 0`). The full return value is equivalent to + # `from + 0:n`. In our case, we `from + 1:n`, so drop the first + # element. + # + # Adding "-1" to the `by` arg makes `seq.Date` go backwards in time. + pad_early_dates <- sort(seq(all_dates[1L], by = paste("-1", by), length.out = before + 1)[-1]) + } + if (after != 0) { + pad_late_dates <- seq(all_dates[length(all_dates)], by = by, length.out = after + 1)[-1] + } + } else { + # A custom time step is specified. + assert_function(time_step) + + # Calculate the number of `time_step`s required to go between min and max time + # values. This is roundabout because difftime objects, lubridate::period objects, + # and Dates are hard to convert to the same time scale and add. + t_elapsed_s <- difftime(max(x$time_value), min(x$time_value), units = "secs") + step_size_s <- lubridate::as.period(time_step(1), unit = "secs") + n_steps <- ceiling(as.numeric(t_elapsed_s) / as.numeric(step_size_s)) + + all_dates <- min(x$time_value) + time_step(0:n_steps) + + if (before != 0) { + pad_early_dates <- all_dates[1L] - time_step(before:1) + } + if (after != 0) { + pad_late_dates <- all_dates[length(all_dates)] + time_step(1:after) + } + } + + return(list( + all_dates = all_dates, + pad_early_dates = pad_early_dates, + pad_late_dates = pad_late_dates + )) +} diff --git a/_pkgdown.yml b/_pkgdown.yml index 8ac42458..bc7dd779 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -67,6 +67,7 @@ reference: desc: Functions that act on `epi_df` objects. - contents: - epi_slide + - epi_slide_mean - epi_cor - title: Vector functions desc: Functions that act directly on signal variables. diff --git a/man/epi_slide.Rd b/man/epi_slide.Rd index 668be9ff..d09dfdda 100644 --- a/man/epi_slide.Rd +++ b/man/epi_slide.Rd @@ -50,14 +50,19 @@ the sliding window extend? At least one of these two arguments must be provided; the other's default will be 0. Any value provided for either argument must be a single, non-\code{NA}, non-negative, \link[vctrs:vec_cast]{integer-compatible} number of time steps. Endpoints of -the window are inclusive. Common settings: * For trailing/right-aligned -windows from \code{ref_time_value - time_step(k)} to \code{ref_time_value}: either -pass \code{before=k} by itself, or pass \verb{before=k, after=0}. * For -center-aligned windows from \code{ref_time_value - time_step(k)} to -\code{ref_time_value + time_step(k)}: pass \verb{before=k, after=k}. * For -leading/left-aligned windows from \code{ref_time_value} to \code{ref_time_value + time_step(k)}: either pass pass \code{after=k} by itself, or pass \verb{before=0, after=k}. See "Details:" about the definition of a time step, -(non)treatment of missing rows within the window, and avoiding warnings -about \code{before}&\code{after} settings for a certain uncommon use case.} +the window are inclusive. Common settings: +\itemize{ +\item For trailing/right-aligned windows from \code{ref_time_value - time_step (k)} to \code{ref_time_value}: either pass \code{before=k} by itself, or pass +\verb{before=k, after=0}. +\item For center-aligned windows from \code{ref_time_value - time_step(k)} to +\code{ref_time_value + time_step(k)}: pass \verb{before=k, after=k}. +\item For leading/left-aligned windows from \code{ref_time_value} to +\code{ref_time_value + time_step(k)}: either pass pass \code{after=k} by itself, +or pass \verb{before=0, after=k}. +See "Details:" about the definition of a time step,(non)treatment of +missing rows within the window, and avoiding warnings about +\code{before}&\code{after} settings for a certain uncommon use case. +}} \item{ref_time_values}{Time values for sliding computations, meaning, each element of this vector serves as the reference time point for one sliding @@ -154,32 +159,38 @@ through the \code{new_col_name} argument. } \examples{ # slide a 7-day trailing average formula on cases +# This and other simple sliding means are much faster to do using +# the `epi_slide_mean` function instead. jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% epi_slide(cases_7dav = mean(cases), before = 6) \%>\% - # rmv a nonessential var. to ensure new col is printed - dplyr::select(-death_rate_7d_av) + # Remove a nonessential var. to ensure new col is printed + dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% + ungroup() # slide a 7-day leading average jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% epi_slide(cases_7dav = mean(cases), after = 6) \%>\% - # rmv a nonessential var. to ensure new col is printed - dplyr::select(-death_rate_7d_av) + # Remove a nonessential var. to ensure new col is printed + dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% + ungroup() # slide a 7-day centre-aligned average jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% epi_slide(cases_7dav = mean(cases), before = 3, after = 3) \%>\% - # rmv a nonessential var. to ensure new col is printed - dplyr::select(-death_rate_7d_av) + # Remove a nonessential var. to ensure new col is printed + dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% + ungroup() # slide a 14-day centre-aligned average jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide(cases_7dav = mean(cases), before = 6, after = 7) \%>\% - # rmv a nonessential var. to ensure new col is printed - dplyr::select(-death_rate_7d_av) + epi_slide(cases_14dav = mean(cases), before = 6, after = 7) \%>\% + # Remove a nonessential var. to ensure new col is printed + dplyr::select(geo_value, time_value, cases, cases_14dav) \%>\% + ungroup() # nested new columns jhu_csse_daily_subset \%>\% @@ -190,5 +201,9 @@ jhu_csse_daily_subset \%>\% cases_2dma = mad(cases) ), before = 1, as_list_col = TRUE - ) + ) \%>\% + ungroup() +} +\seealso{ +\code{\link{epi_slide_mean}} } diff --git a/man/epi_slide_mean.Rd b/man/epi_slide_mean.Rd new file mode 100644 index 00000000..fd4b84ab --- /dev/null +++ b/man/epi_slide_mean.Rd @@ -0,0 +1,169 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/slide.R +\name{epi_slide_mean} +\alias{epi_slide_mean} +\title{Optimized slide function for performing rolling averages on an \code{epi_df} object} +\usage{ +epi_slide_mean( + x, + col_names, + ..., + before, + after, + ref_time_values, + time_step, + new_col_names = "slide_value", + as_list_col = NULL, + names_sep = "_", + all_rows = FALSE +) +} +\arguments{ +\item{x}{The \code{epi_df} object under consideration, \link[dplyr:group_by]{grouped} +or ungrouped. If ungrouped, all data in \code{x} will be treated as part of a +single data group.} + +\item{col_names}{A single tidyselection or a tidyselection vector of the +names of one or more columns for which to calculate the rolling mean.} + +\item{...}{Additional arguments to pass to \code{data.table::frollmean}, for +example, \code{na.rm} and \code{algo}. \code{data.table::frollmean} is automatically +passed the data \code{x} to operate on, the window size \code{n}, and the alignment +\code{align}. Providing these args via \code{...} will cause an error.} + +\item{before, after}{How far \code{before} and \code{after} each \code{ref_time_value} should +the sliding window extend? At least one of these two arguments must be +provided; the other's default will be 0. Any value provided for either +argument must be a single, non-\code{NA}, non-negative, +\link[vctrs:vec_cast]{integer-compatible} number of time steps. Endpoints of +the window are inclusive. Common settings: +\itemize{ +\item For trailing/right-aligned windows from \code{ref_time_value - time_step (k)} to \code{ref_time_value}: either pass \code{before=k} by itself, or pass +\verb{before=k, after=0}. +\item For center-aligned windows from \code{ref_time_value - time_step(k)} to +\code{ref_time_value + time_step(k)}: pass \verb{before=k, after=k}. +\item For leading/left-aligned windows from \code{ref_time_value} to +\code{ref_time_value + time_step(k)}: either pass pass \code{after=k} by itself, +or pass \verb{before=0, after=k}. +See "Details:" about the definition of a time step,(non)treatment of +missing rows within the window, and avoiding warnings about +\code{before}&\code{after} settings for a certain uncommon use case. +}} + +\item{ref_time_values}{Time values for sliding computations, meaning, each +element of this vector serves as the reference time point for one sliding +window. If missing, then this will be set to all unique time values in the +underlying data table, by default.} + +\item{time_step}{Optional function used to define the meaning of one time +step, which if specified, overrides the default choice based on the +\code{time_value} column. This function must take a non-negative integer and +return an object of class \code{lubridate::period}. For example, we can use +\code{time_step = lubridate::hours} in order to set the time step to be one hour +(this would only be meaningful if \code{time_value} is of class \code{POSIXct}).} + +\item{new_col_names}{String indicating the name of the new column that will +contain the derivative values. Default is "slide_value"; note that setting +\code{new_col_names} equal to an existing column name will overwrite this column.} + +\item{as_list_col}{Not supported. Included to match \code{epi_slide} interface.} + +\item{names_sep}{String specifying the separator to use in \code{tidyr::unnest()} +when \code{as_list_col = FALSE}. Default is "_". Using \code{NULL} drops the prefix +from \code{new_col_names} entirely.} + +\item{all_rows}{If \code{all_rows = TRUE}, then all rows of \code{x} will be kept in +the output even with \code{ref_time_values} provided, with some type of missing +value marker for the slide computation output column(s) for \code{time_value}s +outside \code{ref_time_values}; otherwise, there will be one row for each row in +\code{x} that had a \code{time_value} in \code{ref_time_values}. Default is \code{FALSE}. The +missing value marker is the result of \code{vctrs::vec_cast}ing \code{NA} to the type +of the slide computation output.} +} +\value{ +An \code{epi_df} object given by appending one or more new columns to +\code{x}, depending on the \code{col_names} argument, named according to the +\code{new_col_names} argument. +} +\description{ +Slides an n-timestep mean over variables in an \code{epi_df} object. See the \href{https://cmu-delphi.github.io/epiprocess/articles/slide.html}{slide vignette} for +examples. +} +\details{ +To "slide" means to apply a function or formula over a rolling +window of time steps for each data group, where the window is entered at a +reference time and left and right endpoints are given by the \code{before} and +\code{after} arguments. The unit (the meaning of one time step) is implicitly +defined by the way the \code{time_value} column treats addition and subtraction; +for example, if the time values are coded as \code{Date} objects, then one time +step is one day, since \code{as.Date("2022-01-01") + 1} equals +\code{as.Date("2022-01-02")}. Alternatively, the time step can be set explicitly +using the \code{time_step} argument (which if specified would override the +default choice based on \code{time_value} column). If there are not enough time +steps available to complete the window at any given reference time, then +\code{epi_slide()} still attempts to perform the computation anyway (it does not +require a complete window). The issue of what to do with partial +computations (those run on incomplete windows) is therefore left up to the +user, either through the specified function or formula \code{f}, or through +post-processing. For a centrally-aligned slide of \code{n} \code{time_value}s in a +sliding window, set \code{before = (n-1)/2} and \code{after = (n-1)/2} when the +number of \code{time_value}s in a sliding window is odd and \code{before = n/2-1} and +\code{after = n/2} when \code{n} is even. + +Sometimes, we want to experiment with various trailing or leading window +widths and compare the slide outputs. In the (uncommon) case where +zero-width windows are considered, manually pass both the \code{before} and +\code{after} arguments in order to prevent potential warnings. (E.g., \code{before=k} +with \code{k=0} and \code{after} missing may produce a warning. To avoid warnings, +use \verb{before=k, after=0} instead; otherwise, it looks too much like a +leading window was intended, but the \code{after} argument was forgotten or +misspelled.) +} +\examples{ +# slide a 7-day trailing average formula on cases +jhu_csse_daily_subset \%>\% + group_by(geo_value) \%>\% + epi_slide_mean(cases, new_col_names = "cases_7dav", names_sep = NULL, before = 6) \%>\% + # Remove a nonessential var. to ensure new col is printed + dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% + ungroup() + +# slide a 7-day trailing average formula on cases. Adjust `frollmean` settings for speed +# and accuracy, and to allow partially-missing windows. +jhu_csse_daily_subset \%>\% + group_by(geo_value) \%>\% + epi_slide_mean(cases, + new_col_names = "cases_7dav", names_sep = NULL, before = 6, + # `frollmean` options + na.rm = TRUE, algo = "exact", hasNA = TRUE + ) \%>\% + dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% + ungroup() + +# slide a 7-day leading average +jhu_csse_daily_subset \%>\% + group_by(geo_value) \%>\% + epi_slide_mean(cases, new_col_names = "cases_7dav", names_sep = NULL, after = 6) \%>\% + # Remove a nonessential var. to ensure new col is printed + dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% + ungroup() + +# slide a 7-day centre-aligned average +jhu_csse_daily_subset \%>\% + group_by(geo_value) \%>\% + epi_slide_mean(cases, new_col_names = "cases_7dav", names_sep = NULL, before = 3, after = 3) \%>\% + # Remove a nonessential var. to ensure new col is printed + dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% + ungroup() + +# slide a 14-day centre-aligned average +jhu_csse_daily_subset \%>\% + group_by(geo_value) \%>\% + epi_slide_mean(cases, new_col_names = "cases_14dav", names_sep = NULL, before = 6, after = 7) \%>\% + # Remove a nonessential var. to ensure new col is printed + dplyr::select(geo_value, time_value, cases, cases_14dav) \%>\% + ungroup() +} +\seealso{ +\code{\link{epi_slide}} +} diff --git a/tests/testthat/test-epi_slide.R b/tests/testthat/test-epi_slide.R index 163bf010..f727337c 100644 --- a/tests/testthat/test-epi_slide.R +++ b/tests/testthat/test-epi_slide.R @@ -28,6 +28,25 @@ toy_edf <- tibble::tribble( tidyr::unchop(c(time_value, value)) %>% as_epi_df(as_of = 100) +# nolint start: line_length_linter. +basic_result_from_size1_sum <- tibble::tribble( + ~geo_value, ~time_value, ~value, ~slide_value, + "a", 1:10, 2L^(1:10), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE), + "b", 1:10, 2L^(11:20), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE), +) %>% + tidyr::unchop(c(time_value, value, slide_value)) %>% + dplyr::arrange(time_value) %>% + as_epi_df(as_of = 100) + +basic_result_from_size1_mean <- tibble::tribble( + ~geo_value, ~time_value, ~value, ~slide_value, + "a", 1:10, 2L^(1:10), data.table::frollmean(2L^(1:10), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE), +) %>% + tidyr::unchop(c(time_value, value, slide_value)) %>% + dplyr::arrange(time_value) %>% + as_epi_df(as_of = 100) +# nolint end: line_length_linter. + ## --- These cases generate errors (or not): --- test_that("`before` and `after` are both vectors of length 1", { expect_error( @@ -38,6 +57,15 @@ test_that("`before` and `after` are both vectors of length 1", { epi_slide(grouped, f, before = 1, after = c(0, 1), ref_time_values = d + 3), "Assertion on 'after' failed: Must have length 1" ) + + expect_error( + epi_slide_mean(grouped, col_names = value, before = c(0, 1), after = 0, ref_time_values = d + 3), + "Assertion on 'before' failed: Must have length 1" + ) + expect_error( + epi_slide_mean(grouped, col_names = value, before = 1, after = c(0, 1), ref_time_values = d + 3), + "Assertion on 'after' failed: Must have length 1" + ) }) test_that("Test errors/warnings for discouraged features", { @@ -53,10 +81,56 @@ test_that("Test errors/warnings for discouraged features", { epi_slide(grouped, f, after = 0L, ref_time_values = d + 1), "`before` missing, `after==0`" ) + + expect_error( + epi_slide_mean(grouped, col_names = value, ref_time_values = d + 1), + "Either or both of `before`, `after` must be provided." + ) + expect_warning( + epi_slide_mean(grouped, col_names = value, before = 0L, ref_time_values = d + 1), + "`before==0`, `after` missing" + ) + expect_warning( + epi_slide_mean(grouped, col_names = value, after = 0L, ref_time_values = d + 1), + "`before` missing, `after==0`" + ) + # Below cases should raise no errors/warnings: - expect_warning(epi_slide(grouped, f, before = 1L, ref_time_values = d + 2), NA) - expect_warning(epi_slide(grouped, f, after = 1L, ref_time_values = d + 2), NA) - expect_warning(epi_slide(grouped, f, before = 0L, after = 0L, ref_time_values = d + 2), NA) + expect_no_warning( + ref1 <- epi_slide(grouped, f, before = 1L, ref_time_values = d + 2) + ) + expect_no_warning( + ref2 <- epi_slide(grouped, f, after = 1L, ref_time_values = d + 2) + ) + expect_no_warning( + ref3 <- epi_slide(grouped, f, + before = 0L, after = 0L, ref_time_values = d + 2 + ) + ) + + expect_no_warning( + opt1 <- epi_slide_mean(grouped, + col_names = value, + before = 1L, ref_time_values = d + 2, na.rm = TRUE + ) + ) + expect_no_warning( + opt2 <- epi_slide_mean(grouped, + col_names = value, + after = 1L, ref_time_values = d + 2, na.rm = TRUE + ) + ) + expect_no_warning( + opt3 <- epi_slide_mean(grouped, + col_names = value, + before = 0L, after = 0L, ref_time_values = d + 2, na.rm = TRUE + ) + ) + + # Results from epi_slide and epi_slide_mean should match + expect_identical(select(ref1, -slide_value_count), opt1) + expect_identical(select(ref2, -slide_value_count), opt2) + expect_identical(select(ref3, -slide_value_count), opt3) }) test_that("Both `before` and `after` must be non-NA, non-negative, integer-compatible", { @@ -88,8 +162,48 @@ test_that("Both `before` and `after` must be non-NA, non-negative, integer-compa epi_slide(grouped, f, before = 1L, after = NA, ref_time_values = d + 2L), "Assertion on 'after' failed: May not be NA" ) + + expect_error( + epi_slide_mean(grouped, col_names = value, before = -1L, ref_time_values = d + 2L), + "Assertion on 'before' failed: Element 1 is not >= 0" + ) + expect_error( + epi_slide_mean(grouped, col_names = value, before = 2L, after = -1L, ref_time_values = d + 2L), + "Assertion on 'after' failed: Element 1 is not >= 0" + ) + expect_error(epi_slide_mean(grouped, col_names = value, before = "a", ref_time_values = d + 2L), + regexp = "before", class = "vctrs_error_incompatible_type" + ) + expect_error(epi_slide_mean(grouped, col_names = value, before = 1L, after = "a", ref_time_values = d + 2L), + regexp = "after", class = "vctrs_error_incompatible_type" + ) + expect_error(epi_slide_mean(grouped, col_names = value, before = 0.5, ref_time_values = d + 2L), + regexp = "before", class = "vctrs_error_incompatible_type" + ) + expect_error(epi_slide_mean(grouped, col_names = value, before = 1L, after = 0.5, ref_time_values = d + 2L), + regexp = "after", class = "vctrs_error_incompatible_type" + ) + expect_error( + epi_slide_mean(grouped, col_names = value, before = NA, after = 1L, ref_time_values = d + 2L), + "Assertion on 'before' failed: May not be NA" + ) + expect_error( + epi_slide_mean(grouped, col_names = value, before = 1L, after = NA, ref_time_values = d + 2L), + "Assertion on 'after' failed: May not be NA" + ) + # Non-integer-class but integer-compatible values are allowed: - expect_error(epi_slide(grouped, f, before = 1, after = 1, ref_time_values = d + 2L), NA) + expect_no_error( + ref <- epi_slide(grouped, f, before = 1, after = 1, ref_time_values = d + 2L) + ) + expect_no_error(opt <- epi_slide_mean( + grouped, + col_names = value, before = 1, after = 1, + ref_time_values = d + 2L, na.rm = TRUE + )) + + # Results from epi_slide and epi_slide_mean should match + expect_identical(select(ref, -slide_value_count), opt) }) test_that("`ref_time_values` + `before` + `after` that result in no slide data, generate the error", { @@ -101,12 +215,23 @@ test_that("`ref_time_values` + `before` + `after` that result in no slide data, epi_slide(grouped, f, before = 2L, ref_time_values = d + 207L), "`ref_time_values` must be a unique subset of the time values in `x`." ) # beyond the last, no data in window + + expect_error( + epi_slide_mean(grouped, col_names = value, before = 2L, ref_time_values = d), + "`ref_time_values` must be a unique subset of the time values in `x`." + ) # before the first, no data in the slide windows + expect_error( + epi_slide_mean(grouped, col_names = value, before = 2L, ref_time_values = d + 207L), + "`ref_time_values` must be a unique subset of the time values in `x`." + ) # beyond the last, no data in window }) test_that( - "`ref_time_values` + `before` + `after` that have some slide data, but - generate the error due to ref. time being out of time range (would also - happen if they were in between `time_value`s)", + c( + "`ref_time_values` + `before` + `after` that have some slide data, but + generate the error due to ref. time being out of time range (would + also happen if they were in between `time_value`s)" + ), { expect_error( epi_slide(grouped, f, before = 0L, after = 2L, ref_time_values = d), @@ -116,19 +241,50 @@ test_that( epi_slide(grouped, f, before = 2L, ref_time_values = d + 201L), "`ref_time_values` must be a unique subset of the time values in `x`." ) # beyond the last, but still with data in window + + expect_error( + epi_slide_mean(grouped, value, before = 0L, after = 2L, ref_time_values = d), + "`ref_time_values` must be a unique subset of the time values in `x`." + ) # before the first, but we'd expect there to be data in the window + expect_error( + epi_slide_mean(grouped, value, before = 2L, ref_time_values = d + 201L), + "`ref_time_values` must be a unique subset of the time values in `x`." + ) # beyond the last, but still with data in window } ) ## --- These cases generate warnings (or not): --- test_that("Warn user against having a blank `before`", { - expect_warning(epi_slide(grouped, f, after = 1L, ref_time_values = d + 1L), NA) - expect_warning(epi_slide(grouped, f, before = 0L, after = 1L, ref_time_values = d + 1L), NA) + expect_no_warning(ref1 <- epi_slide( + grouped, f, + after = 1L, ref_time_values = d + 1L + )) + expect_no_warning(ref2 <- epi_slide( + grouped, f, + before = 0L, after = 1L, ref_time_values = d + 1L + )) + + expect_no_warning(opt1 <- epi_slide_mean( + grouped, value, + after = 1L, ref_time_values = d + 1L, na.rm = TRUE + )) + expect_no_warning(opt2 <- epi_slide_mean( + grouped, value, + before = 0L, after = 1L, + ref_time_values = d + 1L, na.rm = TRUE + )) + + # Results from epi_slide and epi_slide_mean should match + expect_identical(select(ref1, -slide_value_count), opt1) + expect_identical(select(ref2, -slide_value_count), opt2) }) ## --- These cases doesn't generate the error: --- test_that( - "these doesn't produce an error; the error appears only if the ref - time values are out of the range for every group", + c( + "these doesn't produce an error; the error appears only if the ref time + values are out of the range for every group" + ), { expect_identical( epi_slide(grouped, f, before = 2L, ref_time_values = d + 200L) %>% @@ -142,54 +298,104 @@ test_that( dplyr::select("geo_value", "slide_value_value"), dplyr::tibble(geo_value = c("ak", "al"), slide_value_value = c(2, -2)) ) # not out of range for either group + + expect_identical( + epi_slide_mean(grouped, value, before = 2L, ref_time_values = d + 200L, na.rm = TRUE) %>% + ungroup() %>% + dplyr::select("geo_value", "slide_value_value"), + dplyr::tibble(geo_value = "ak", slide_value_value = 199) + ) # out of range for one group + expect_identical( + epi_slide_mean(grouped, value, before = 2L, ref_time_values = d + 3, na.rm = TRUE) %>% + ungroup() %>% + dplyr::select("geo_value", "slide_value_value"), + dplyr::tibble(geo_value = c("ak", "al"), slide_value_value = c(2, -2)) + ) # not out of range for either group } ) test_that("computation output formats x as_list_col", { - # See `toy_edf` definition at top of file. + # See `toy_edf` and `basic_result_from_size1_sum` definitions at top of file. # We'll try 7d sum with a few formats. - # nolint start: line_length_linter. - basic_result_from_size1 <- tibble::tribble( - ~geo_value, ~time_value, ~value, ~slide_value, - "a", 1:10, 2L^(1:10), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE), - "b", 1:10, 2L^(11:20), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE), - ) %>% - tidyr::unchop(c(time_value, value, slide_value)) %>% - dplyr::arrange(time_value) %>% - as_epi_df(as_of = 100) - # nolint end expect_identical( toy_edf %>% epi_slide(before = 6L, ~ sum(.x$value)), - basic_result_from_size1 + basic_result_from_size1_sum ) expect_identical( toy_edf %>% epi_slide(before = 6L, ~ sum(.x$value), as_list_col = TRUE), - basic_result_from_size1 %>% dplyr::mutate(slide_value = as.list(slide_value)) + basic_result_from_size1_sum %>% dplyr::mutate(slide_value = as.list(slide_value)) ) expect_identical( toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value))), - basic_result_from_size1 %>% rename(slide_value_value = slide_value) + basic_result_from_size1_sum %>% rename(slide_value_value = slide_value) ) expect_identical( toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)), as_list_col = TRUE), - basic_result_from_size1 %>% + basic_result_from_size1_sum %>% mutate(slide_value = purrr::map(slide_value, ~ data.frame(value = .x))) ) - # output naming functionality: + + # See `toy_edf` and `basic_result_from_size1_mean` definitions at top of file. + # We'll try 7d avg with a few formats. + # Warning: not exactly the same naming behavior as `epi_slide`. expect_identical( - toy_edf %>% epi_slide( - before = 6L, ~ data.frame(value = sum(.x$value)), - new_col_name = "result" - ), - basic_result_from_size1 %>% rename(result_value = slide_value) + toy_edf %>% + filter( + geo_value == "a" + ) %>% + epi_slide_mean( + value, + before = 6L, na.rm = TRUE + ), + basic_result_from_size1_mean %>% dplyr::mutate( + slide_value_value = slide_value + ) %>% + select(-slide_value) + ) + expect_error( + toy_edf %>% + filter( + geo_value == "a" + ) %>% + epi_slide_mean( + value, + before = 6L, as_list_col = TRUE, na.rm = TRUE + ), + class = "epiproces__epi_slide_mean__list_not_supported" ) + # `epi_slide_mean` doesn't return dataframe columns +}) + +test_that("nested dataframe output names are controllable", { expect_identical( - toy_edf %>% epi_slide( - before = 6L, ~ data.frame(value_sum = sum(.x$value)), - names_sep = NULL - ), - basic_result_from_size1 %>% rename(value_sum = slide_value) + toy_edf %>% + epi_slide( + before = 6L, ~ data.frame(value = sum(.x$value)), + new_col_name = "result" + ), + basic_result_from_size1_sum %>% rename(result_value = slide_value) ) + expect_identical( + toy_edf %>% + epi_slide( + before = 6L, ~ data.frame(value_sum = sum(.x$value)), + names_sep = NULL + ), + basic_result_from_size1_sum %>% rename(value_sum = slide_value) + ) + expect_identical( + toy_edf %>% filter( + geo_value == "a" + ) %>% + epi_slide_mean( + value, + before = 6L, names_sep = NULL, na.rm = TRUE + ), + basic_result_from_size1_mean + ) +}) + +test_that("non-size-1 outputs are recycled", { # trying with non-size-1 computation outputs: # nolint start: line_length_linter. basic_result_from_size2 <- tibble::tribble( @@ -267,6 +473,43 @@ test_that("`ref_time_values` + `all_rows = TRUE` works", { slide_value, NA_integer_ )) ) + + expect_identical( + toy_edf %>% filter( + geo_value == "a" + ) %>% + epi_slide_mean( + value, + before = 6L, names_sep = NULL, na.rm = TRUE + ), + basic_result_from_size1_mean + ) + expect_identical( + toy_edf %>% filter( + geo_value == "a" + ) %>% + epi_slide_mean( + value, + before = 6L, ref_time_values = c(2L, 8L), + names_sep = NULL, na.rm = TRUE + ), + filter(basic_result_from_size1_mean, time_value %in% c(2L, 8L)) + ) + expect_identical( + toy_edf %>% filter( + geo_value == "a" + ) %>% + epi_slide_mean( + value, + before = 6L, ref_time_values = c(2L, 8L), all_rows = TRUE, + names_sep = NULL, na.rm = TRUE + ), + basic_result_from_size1_mean %>% + dplyr::mutate(slide_value = dplyr::if_else(time_value %in% c(2L, 8L), + slide_value, NA_integer_ + )) + ) + # slide computations returning data frames: expect_identical( toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value))), @@ -410,15 +653,26 @@ test_that("basic grouped epi_slide computation produces expected output", { expect_identical(result3, expected_output) }) +test_that("basic grouped epi_slide_mean computation produces expected output", { + expected_output <- dplyr::bind_rows( + dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value = cumsum(11:15) / 1:5), + dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value = cumsum(-(1:5)) / 1:5) + ) %>% + group_by(geo_value) %>% + as_epi_df(as_of = d + 6) + + result1 <- epi_slide_mean(small_x, value, before = 50, names_sep = NULL, na.rm = TRUE) + expect_identical(result1, expected_output) +}) + test_that("ungrouped epi_slide computation completes successfully", { - expect_error( + expect_no_error( small_x %>% ungroup() %>% epi_slide( before = 2, slide_value = sum(.x$value) - ), - regexp = NA + ) ) }) @@ -458,6 +712,27 @@ test_that("basic ungrouped epi_slide computation produces expected output", { expect_identical(result2, expected_output) }) +test_that("basic ungrouped epi_slide_mean computation produces expected output", { + expected_output <- dplyr::bind_rows( + dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value = cumsum(11:15) / 1:5), + ) %>% + as_epi_df(as_of = d + 6) + + result1 <- small_x %>% + ungroup() %>% + filter(geo_value == "ak") %>% + epi_slide_mean(value, before = 50, names_sep = NULL, na.rm = TRUE) + expect_identical(result1, expected_output) + + # Ungrouped with multiple geos + # epi_slide_mean fails when input data groups contain duplicate time_values, + # e.g. aggregating across geos + expect_error( + small_x %>% ungroup() %>% epi_slide_mean(value, before = 6L), + class = "epiprocess__epi_slide_mean__duplicate_time_values" + ) +}) + test_that("epi_slide computation via formula can use ref_time_value", { expected_output <- dplyr::bind_rows( dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value = d + 1:5), @@ -634,7 +909,7 @@ test_that("`epi_slide` can access objects inside of helper functions", { ) }) -test_that("epi_slide basic behavior is correct when groups have non-overlapping date ranges", { +test_that("basic slide behavior is correct when groups have non-overlapping date ranges", { small_x_misaligned_dates <- dplyr::bind_rows( dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15), dplyr::tibble(geo_value = "al", time_value = d + 151:155, value = -(1:5)) @@ -643,14 +918,17 @@ test_that("epi_slide basic behavior is correct when groups have non-overlapping group_by(geo_value) expected_output <- dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value = cumsum(11:15)), - dplyr::tibble(geo_value = "al", time_value = d + 151:155, value = -(1:5), slide_value = cumsum(-(1:5))) + dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value = cumsum(11:15) / 1:5), + dplyr::tibble(geo_value = "al", time_value = d + 151:155, value = -(1:5), slide_value = cumsum(-(1:5)) / 1:5) ) %>% group_by(geo_value) %>% as_epi_df(as_of = d + 6) - result1 <- epi_slide(small_x_misaligned_dates, f = ~ sum(.x$value), before = 50) + result1 <- epi_slide(small_x_misaligned_dates, f = ~ mean(.x$value), before = 50) expect_identical(result1, expected_output) + + result2 <- epi_slide_mean(small_x_misaligned_dates, value, before = 50, names_sep = NULL, na.rm = TRUE) + expect_identical(result2, expected_output) }) @@ -677,3 +955,434 @@ test_that("epi_slide gets correct ref_time_value when groups have non-overlappin expect_identical(result1, expected_output) }) + +test_that("results for different `before`s and `after`s match between epi_slide and epi_slide_mean", { + test_time_type_mean <- function(dates, vals, before = 6L, after = 0L, n, m, n_obs, k, ...) { + # Three states, with 2 variables. a is linear, going up in one state and down in the other + # b is just random. last (m-1):(n-1) dates are missing + epi_data <- epiprocess::as_epi_df(rbind(tibble( + geo_value = "al", + time_value = dates, + a = 1:n_obs, + b = vals + ), tibble( + geo_value = "ca", + time_value = dates, + a = n_obs:1, + b = vals + 10 + ))) %>% + group_by(geo_value) + + # Use the `epi_slide` result as a reference. + result1 <- epi_slide(epi_data, ~ data.frame( + slide_value_a = mean(.x$a, rm.na = TRUE), + slide_value_b = mean(.x$b, rm.na = TRUE) + ), + before = before, after = after, names_sep = NULL, ... + ) + result2 <- epi_slide_mean(epi_data, + col_names = c(a, b), na.rm = TRUE, + before = before, after = after, ... + ) + expect_identical(result1, result2) + } + + set.seed(0) + + # 3 missing dates + n <- 15 # Max date index + m <- 3 # Number of missing dates + n_obs <- n + 1 - m # Number of obs created + k <- c(0:(n - (m + 1)), n) # Date indices + + rand_vals <- rnorm(n_obs) + # Basic time type + days <- as.Date("2022-01-01") + k + + test_time_type_mean(days, rand_vals, before = 6, after = 0, n = n, m = m, n_obs = n_obs, k = k) + test_time_type_mean(days, rand_vals, before = 6, after = 1, n = n, m = m, n_obs = n_obs, k = k) + test_time_type_mean(days, rand_vals, before = 6, after = 6, n = n, m = m, n_obs = n_obs, k = k) + test_time_type_mean(days, rand_vals, before = 1, after = 6, n = n, m = m, n_obs = n_obs, k = k) + test_time_type_mean(days, rand_vals, before = 0, after = 6, n = n, m = m, n_obs = n_obs, k = k) + test_time_type_mean(days, rand_vals, before = 0, after = 1, n = n, m = m, n_obs = n_obs, k = k) + + # Without any missing dates + n <- 15 # Max date index + m <- 0 # Number of missing dates + n_obs <- n + 1 - m # Number of obs created + k <- c(0:(n - (m + 1)), n) # Date indices + + rand_vals <- rnorm(n_obs) + # Basic time type + days <- as.Date("2022-01-01") + k + + test_time_type_mean(days, rand_vals, before = 6, after = 0, n = n, m = m, n_obs = n_obs, k = k) + test_time_type_mean(days, rand_vals, before = 6, after = 1, n = n, m = m, n_obs = n_obs, k = k) + test_time_type_mean(days, rand_vals, before = 6, after = 6, n = n, m = m, n_obs = n_obs, k = k) + test_time_type_mean(days, rand_vals, before = 1, after = 6, n = n, m = m, n_obs = n_obs, k = k) + test_time_type_mean(days, rand_vals, before = 0, after = 6, n = n, m = m, n_obs = n_obs, k = k) + test_time_type_mean(days, rand_vals, before = 0, after = 1, n = n, m = m, n_obs = n_obs, k = k) +}) + +test_that("results for different time_types match between epi_slide and epi_slide_mean", { + n <- 6L # Max date index + m <- 1L # Number of missing dates + n_obs <- n + 1L - m # Number of obs created + k <- c(0L:(n - (m + 1L)), n) # Date indices + + set.seed(0) + rand_vals <- rnorm(n_obs) + + generate_special_date_data <- function(date_seq, ...) { + epiprocess::as_epi_df(rbind(tibble( + geo_value = "al", + time_value = date_seq, + a = seq_along(date_seq), + b = rand_vals + ), tibble( + geo_value = "ca", + time_value = date_seq, + a = rev(seq_along(date_seq)), + b = rand_vals + 10 + ), tibble( + geo_value = "fl", + time_value = date_seq, + a = rev(seq_along(date_seq)), + b = rand_vals * 2 + )), ...) + } + + # Basic time type + days <- as.Date("2022-01-01") + k + + # Require lubridate::period function to be passed as `time_step` + day_times_minute <- lubridate::ydm_h("2022-01-01-15") + lubridate::minutes(k) # needs time_step = lubridate::minutes + day_times_hour <- lubridate::ydm_h("2022-01-01-15") + lubridate::hours(k) # needs time_step = lubridate::hours + weeks <- as.Date("2022-01-01") + 7L * k # needs time_step = lubridate::weeks + + # Don't require a `time_step` fn + yearweeks <- tsibble::yearweek(10L + k) + yearmonths <- tsibble::yearmonth(10L + k) + yearquarters <- tsibble::yearquarter(10L + k) + years <- 2000L + k # does NOT need time_step = lubridate::years because dates are numeric, not a special date format + + # Not supported + custom_dates <- c( + "January 1, 2022", "January 2, 2022", "January 3, 2022", + "January 4, 2022", "January 5, 2022", "January 6, 2022" + ) + not_dates <- c("a", "b", "c", "d", "e", "f") + + ref_epi_data <- generate_special_date_data(days) %>% + group_by(geo_value) + + ref_result <- epi_slide(ref_epi_data, ~ data.frame( + slide_value_a = mean(.x$a, rm.na = TRUE), + slide_value_b = mean(.x$b, rm.na = TRUE) + ), + before = 6L, after = 0L, names_sep = NULL + ) + + test_time_type_mean <- function(dates, before = 6L, after = 0L, ...) { + # Three states, with 2 variables. a is linear, going up in one state and down in the other + # b is just random. date 10 is missing + epi_data <- generate_special_date_data(dates) %>% + group_by(geo_value) + + result1 <- epi_slide(epi_data, ~ data.frame( + slide_value_a = mean(.x$a, rm.na = TRUE), + slide_value_b = mean(.x$b, rm.na = TRUE) + ), + before = before, after = after, names_sep = NULL, ... + ) + result2 <- epi_slide_mean(epi_data, + col_names = c(a, b), na.rm = TRUE, + before = before, after = after, ... + ) + expect_identical(result1, result2) + + # All fields except dates + expect_identical(select(ref_result, -time_value), select(result1, -time_value)) + expect_identical(select(ref_result, -time_value), select(result2, -time_value)) + } + + test_time_type_mean(days) + test_time_type_mean(yearweeks) + test_time_type_mean(yearmonths) + test_time_type_mean(yearquarters) + test_time_type_mean(years) + test_time_type_mean(day_times_minute, time_step = lubridate::minutes) + test_time_type_mean(day_times_hour, time_step = lubridate::hours) + test_time_type_mean(weeks, time_step = lubridate::weeks) + + # `epi_slide_mean` can also handle `weeks` without `time_step` being + # provided, but `epi_slide` can't + epi_data <- generate_special_date_data(weeks) %>% + group_by(geo_value) + result2 <- epi_slide_mean(epi_data, + col_names = c(a, b), na.rm = TRUE, + before = 6L, after = 0L + ) + expect_identical(select(ref_result, -time_value), select(result2, -time_value)) +}) + +test_that("special time_types without time_step fail in epi_slide_mean", { + n_obs <- 6 + k <- 1:n_obs + + day_times_minute <- lubridate::ydm_h("2022-01-01-15") + lubridate::minutes(k) # needs time_step = lubridate::minutes + day_times_hour <- lubridate::ydm_h("2022-01-01-15") + lubridate::hours(k) # needs time_step = lubridate::hours + + # Not supported + custom_dates <- c( + "January 1, 2022", "January 2, 2022", "January 3, 2022", + "January 4, 2022", "January 5, 2022", "January 6, 2022" + ) + not_dates <- c("a", "b", "c", "d", "e", "f") + + test_time_type_mean <- function(dates, before = 6L, after = 0L, ...) { + epi_data <- epiprocess::as_epi_df(tibble( + geo_value = "al", + time_value = dates, + a = 1:n_obs + )) + + expect_error( + epi_slide_mean(epi_data, + col_names = a, + before = before, after = after + ), + class = "epiprocess__epi_slide_mean__unmappable_time_type" + ) + } + + test_time_type_mean(custom_dates) + test_time_type_mean(not_dates) + test_time_type_mean(day_times_minute) + test_time_type_mean(day_times_hour) +}) + +test_that("helper `full_date_seq` returns expected date values", { + n <- 6L # Max date index + m <- 1L # Number of missing dates + n_obs <- n + 1L - m # Number of obs created + k <- c(0L:(n - (m + 1L)), n) # Date indices + + set.seed(0) + rand_vals <- rnorm(n_obs) + + generate_special_date_data <- function(date_seq, ...) { + epiprocess::as_epi_df(rbind(tibble( + geo_value = "al", + time_value = date_seq, + a = seq_along(date_seq), + b = rand_vals + ), tibble( + geo_value = "ca", + time_value = date_seq, + a = rev(seq_along(date_seq)), + b = rand_vals + 10 + ), tibble( + geo_value = "fl", + time_value = date_seq, + a = rev(seq_along(date_seq)), + b = rand_vals * 2 + )), ...) + } + + # Basic time type + days <- as.Date("2022-01-01") + k + + # Require lubridate::period function to be passed as `time_step` + day_times_minute <- lubridate::ydm_h("2022-01-01-15") + lubridate::minutes(k) # needs time_step = lubridate::minutes + day_times_hour <- lubridate::ydm_h("2022-01-01-15") + lubridate::hours(k) # needs time_step = lubridate::hours + weeks <- as.Date("2022-01-01") + 7L * k # needs time_step = lubridate::weeks + + # Don't require a `time_step` fn + yearweeks <- tsibble::yearweek(10L + k) + yearmonths <- tsibble::yearmonth(10L + k) + yearquarters <- tsibble::yearquarter(10L + k) + years <- 2000L + k # does NOT need time_step = lubridate::years because dates are numeric, not a special date format + + before <- 2L + after <- 1L + + expect_identical( + full_date_seq( + generate_special_date_data(days), + before = before, after = after + ), + list( + all_dates = as.Date(c( + "2022-01-01", "2022-01-02", "2022-01-03", "2022-01-04", + "2022-01-05", "2022-01-06", "2022-01-07" + )), + pad_early_dates = as.Date(c("2021-12-30", "2021-12-31")), + pad_late_dates = as.Date(c("2022-01-08")) + ) + ) + expect_identical( + full_date_seq( + generate_special_date_data(yearweeks), + before = before, after = after + ), + list( + all_dates = tsibble::yearweek(10:16), + pad_early_dates = tsibble::yearweek(8:9), + pad_late_dates = tsibble::yearweek(17) + ) + ) + expect_identical( + full_date_seq( + generate_special_date_data(yearmonths), + before = before, after = after + ), + list( + all_dates = tsibble::yearmonth(10:16), + pad_early_dates = tsibble::yearmonth(8:9), + pad_late_dates = tsibble::yearmonth(17) + ) + ) + expect_identical( + full_date_seq( + generate_special_date_data(yearquarters), + before = before, after = after + ), + list( + all_dates = tsibble::yearquarter(10:16), + pad_early_dates = tsibble::yearquarter(8:9), + pad_late_dates = tsibble::yearquarter(17) + ) + ) + expect_identical( + full_date_seq( + generate_special_date_data(years), + before = before, after = after + ), + list( + all_dates = 2000L:2006L, + pad_early_dates = 1998L:1999L, + pad_late_dates = 2007L + ) + ) + expect_identical( + full_date_seq( + generate_special_date_data(day_times_minute), + before = before, after = after, + time_step = lubridate::minutes + ), + list( + all_dates = lubridate::ydm_h("2022-01-01-15") + lubridate::minutes(0:6), + pad_early_dates = lubridate::ydm_h("2022-01-01-15") - lubridate::minutes(2:1), + pad_late_dates = lubridate::ydm_h("2022-01-01-15") + lubridate::minutes(7) + ) + ) + expect_identical( + full_date_seq( + generate_special_date_data(day_times_hour), + before = before, after = after, + time_step = lubridate::hours + ), + list( + all_dates = lubridate::ydm_h("2022-01-01-15") + lubridate::hours(0:6), + pad_early_dates = lubridate::ydm_h("2022-01-01-15") - lubridate::hours(2:1), + pad_late_dates = lubridate::ydm_h("2022-01-01-15") + lubridate::hours(7) + ) + ) + expect_identical( + full_date_seq( + generate_special_date_data(weeks), + before = before, after = after, + time_step = lubridate::weeks + ), + list( + all_dates = as.Date(c( + "2022-01-01", "2022-01-08", "2022-01-15", "2022-01-22", + "2022-01-29", "2022-02-05", "2022-02-12" + )), + pad_early_dates = as.Date(c("2021-12-18", "2021-12-25")), + pad_late_dates = as.Date(c("2022-02-19")) + ) + ) + # Check the middle branch (`if (missing(time_step))`) of `full_date_seq`. + expect_identical( + full_date_seq( + generate_special_date_data(weeks, time_type = "week"), + before = before, after = after + ), + list( + all_dates = as.Date(c( + "2022-01-01", "2022-01-08", "2022-01-15", "2022-01-22", + "2022-01-29", "2022-02-05", "2022-02-12" + )), + pad_early_dates = as.Date(c("2021-12-18", "2021-12-25")), + pad_late_dates = as.Date(c("2022-02-19")) + ) + ) + + # Other before/after values + before <- 5L + after <- 0L + + expect_identical( + full_date_seq( + generate_special_date_data(days), + before = before, after = after + ), + list( + all_dates = as.Date(c( + "2022-01-01", "2022-01-02", "2022-01-03", "2022-01-04", + "2022-01-05", "2022-01-06", "2022-01-07" + )), + pad_early_dates = as.Date(c( + "2021-12-27", "2021-12-28", "2021-12-29", "2021-12-30", + "2021-12-31" + )), + pad_late_dates = NULL + ) + ) + + before <- 0L + after <- 3L + + expect_identical( + full_date_seq( + generate_special_date_data(days), + before = before, after = after + ), + list( + all_dates = as.Date(c( + "2022-01-01", "2022-01-02", "2022-01-03", "2022-01-04", + "2022-01-05", "2022-01-06", "2022-01-07" + )), + pad_early_dates = NULL, + pad_late_dates = as.Date(c( + "2022-01-08", "2022-01-09", "2022-01-10" + )) + ) + ) +}) + +test_that("`epi_slide_mean` errors when passed `time_values` with closer than expected spacing", { + time_df <- tibble( + geo_value = 1, + value = c(0:7, 3.5, 10, 20), + # Adding the value 3.5 creates a time that has fractional seconds, which + # doesn't follow the expected 1-second spacing of the `time_values`. + # This leads to `frollmean` using obs spanning less than the expected + # time frame for some computation windows. + time_value = Sys.time() + value + ) %>% + as_epi_df() + expect_error( + epi_slide_mean(time_df, value, before = 6L, time_step = lubridate::seconds), + class = "epiprocess__epi_slide_mean__unexpected_row_number" + ) +}) + +test_that("`epi_slide_mean` errors when passed `col_names` as list", { + expect_error( + epi_slide_mean(grouped, col_names = list(value), before = 1L, after = 0L, ref_time_values = d + 1), + class = "epiprocess__epi_slide_mean__col_names_in_list" + ) +}) diff --git a/vignettes/slide.Rmd b/vignettes/slide.Rmd index 3238f08b..b0295865 100644 --- a/vignettes/slide.Rmd +++ b/vignettes/slide.Rmd @@ -60,12 +60,34 @@ x <- jhu_csse_daily_subset %>% as_epi_df() ``` +## Optimized rolling mean + +We first demonstrate how to apply a 7-day trailing average to the daily cases +in order to smooth the signal, by passing in the name of the column(s) we +want to average for the first argument of `epi_slide_mean()`. `epi_slide_mean +()` can only be used for averaging. To do this computation per state, we +first call `group_by()`. + +```{r} +x %>% + group_by(geo_value) %>% + epi_slide_mean("cases", before = 6) %>% + ungroup() %>% + head(10) +``` + +The calculation is done using `data.table::frollmean`, whose behavior can be +adjusted by passing relevant arguments via `...`. ## Slide with a formula -We first demonstrate how to apply a 7-day trailing average to the daily cases in -order to smooth the signal, by passing in a formula for the first argument of -`epi_slide()`. To do this computation per state, we first call `group_by()`. +The previous computation can also be performed using `epi_slide()`, which is +more flexible but quite a bit slower than `epi_slide_mean()`. It is +recommended to use `epi_slide_mean()` when possible. + +The same 7-day trailing average of daily cases can be computed by passing in a +formula for the first argument of `epi_slide()`. To do this per state, we +first call `group_by()`. ```{r} x %>%