diff --git a/DESCRIPTION b/DESCRIPTION index 3c409c31..0c871dca 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: epiprocess Title: Tools for basic signal processing in epidemiology -Version: 0.7.10 +Version: 0.7.11 Authors@R: c( person("Jacob", "Bien", role = "ctb"), person("Logan", "Brooks", email = "lcbrooks@andrew.cmu.edu", role = c("aut", "cre")), diff --git a/NEWS.md b/NEWS.md index 1bfdd9c5..57256cd7 100644 --- a/NEWS.md +++ b/NEWS.md @@ -16,6 +16,7 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat - Add new `epi_slide_opt` function to allow much faster rolling computations in some cases, using `data.table` and `slider` optimized rolling functions (#433). +- Add tidyselect interface for `epi_slide_opt` and derivatives (#452). - regenerated the `jhu_csse_daily_subset` dataset with the latest versions of the data from the API - changed approach to versioning, see DEVELOPMENT.md for details diff --git a/R/slide.R b/R/slide.R index 416127ef..27a3135c 100644 --- a/R/slide.R +++ b/R/slide.R @@ -86,8 +86,8 @@ #' @seealso [`epi_slide_opt`] [`epi_slide_mean`] [`epi_slide_sum`] #' @examples #' # slide a 7-day trailing average formula on cases -#' # This and other simple sliding means are much faster to do using -#' # the `epi_slide_mean` function instead. +#' # Simple sliding means and sums are much faster to do using +#' # the `epi_slide_mean` and `epi_slide_sum` functions instead. #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% #' epi_slide(cases_7dav = mean(cases), before = 6) %>% @@ -377,7 +377,8 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' #' @importFrom dplyr bind_rows mutate %>% arrange tibble select #' @importFrom rlang enquo quo_get_expr as_label expr_label caller_arg -#' @importFrom purrr map map_lgl +#' @importFrom tidyselect eval_select +#' @importFrom purrr map map_lgl #' @importFrom data.table frollmean frollsum frollapply #' @importFrom lubridate as.period #' @importFrom checkmate assert_function @@ -390,23 +391,23 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' group_by(geo_value) %>% #' epi_slide_opt( #' cases, -#' f = data.table::frollmean, new_col_name = "cases_7dav", names_sep = NULL, before = 6 +#' f = data.table::frollmean, before = 6 #' ) %>% -#' # Remove a nonessential var. to ensure new col is printed -#' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% +#' # Remove a nonessential var. to ensure new col is printed, and rename new col +#' dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) %>% #' ungroup() #' #' # slide a 7-day trailing average formula on cases. Adjust `frollmean` settings for speed #' # and accuracy, and to allow partially-missing windows. #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide_opt(cases, -#' f = data.table::frollmean, -#' new_col_name = "cases_7dav", names_sep = NULL, before = 6, +#' epi_slide_opt( +#' cases, +#' f = data.table::frollmean, before = 6, #' # `frollmean` options #' na.rm = TRUE, algo = "exact", hasNA = TRUE #' ) %>% -#' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% +#' dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) %>% #' ungroup() #' #' # slide a 7-day leading average @@ -414,10 +415,10 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' group_by(geo_value) %>% #' epi_slide_opt( #' cases, -#' f = slider::slide_mean, new_col_name = "cases_7dav", names_sep = NULL, after = 6 +#' f = slider::slide_mean, after = 6 #' ) %>% #' # Remove a nonessential var. to ensure new col is printed -#' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% +#' dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) %>% #' ungroup() #' #' # slide a 7-day centre-aligned sum. This can also be done with `epi_slide_sum` @@ -425,15 +426,15 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' group_by(geo_value) %>% #' epi_slide_opt( #' cases, -#' f = data.table::frollsum, new_col_name = "cases_7dav", names_sep = NULL, before = 3, after = 3 +#' f = data.table::frollsum, before = 3, after = 3 #' ) %>% #' # Remove a nonessential var. to ensure new col is printed -#' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% +#' dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) %>% #' ungroup() epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values, time_step, - new_col_name = "slide_value", as_list_col = NULL, - names_sep = "_", all_rows = FALSE) { + new_col_name = NULL, as_list_col = NULL, + names_sep = NULL, all_rows = FALSE) { assert_class(x, "epi_df") if (nrow(x) == 0L) { @@ -443,15 +444,27 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values, "i" = "If this computation is occuring within an `epix_slide` call, check that `epix_slide` `ref_time_values` argument was set appropriately" ), - class = "epiprocess__epi_slide_mean__0_row_input", + class = "epiprocess__epi_slide_opt__0_row_input", epiprocess__x = x ) } if (!is.null(as_list_col)) { cli_abort( - "`as_list_col` is not supported for `epi_slide_mean`", - class = "epiproces__epi_slide_mean__list_not_supported" + "`as_list_col` is not supported for `epi_slide_[opt/mean/sum]`", + class = "epiprocess__epi_slide_opt__list_not_supported" + ) + } + if (!is.null(new_col_name)) { + cli_abort( + "`new_col_name` is not supported for `epi_slide_[opt/mean/sum]`", + class = "epiprocess__epi_slide_opt__new_name_not_supported" + ) + } + if (!is.null(names_sep)) { + cli_abort( + "`names_sep` is not supported for `epi_slide_[opt/mean/sum]`", + class = "epiprocess__epi_slide_opt__name_sep_not_supported" ) } @@ -543,48 +556,16 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values, # `before` and `after` params. window_size <- before + after + 1L - col_names_quo <- enquo(col_names) - col_names_chr <- as.character(rlang::quo_get_expr(col_names_quo)) - if (startsWith(rlang::as_label(col_names_quo), "c(")) { - # List or vector of col names. We need to drop the first element since it - # will be either "c" (if built as a vector) or "list" (if built as a - # list). - col_names_chr <- col_names_chr[-1] - } else if (startsWith(rlang::as_label(col_names_quo), "list(")) { - cli_abort( - "`col_names` must be a single tidy column name or a vector - (`c()`) of tidy column names", - class = "epiprocess__epi_slide_mean__col_names_in_list", - epiprocess__col_names = col_names_chr - ) - } - # If single column name, do nothing. - - if (is.null(names_sep)) { - if (length(new_col_name) != length(col_names_chr)) { - cli_abort( - c( - "`new_col_name` must be the same length as `col_names` when - `names_sep` is NULL to avoid duplicate output column names." - ), - class = "epiprocess__epi_slide_mean__col_names_length_mismatch", - epiprocess__new_col_name = new_col_name, - epiprocess__col_names = col_names_chr - ) - } - result_col_names <- new_col_name - } else { - if (length(new_col_name) != 1L && length(new_col_name) != length(col_names_chr)) { - cli_abort( - "`new_col_name` must be either length 1 or the same length as `col_names`.", - class = "epiprocess__epi_slide_mean__col_names_length_mismatch_and_not_one", - epiprocess__new_col_name = new_col_name, - epiprocess__col_names = col_names_chr - ) - } - result_col_names <- paste(new_col_name, col_names_chr, sep = names_sep) - } - + # The position of a given column can be differ between input `x` and + # `.data_group` since the grouping step by default drops grouping columns. + # To avoid rerunning `eval_select` for every `.data_group`, convert + # positions of user-provided `col_names` into string column names. We avoid + # using `names(pos)` directly for robustness and in case we later want to + # allow users to rename fields via tidyselection. + pos <- eval_select(rlang::enquo(col_names), data = x, allow_rename = FALSE) + col_names_chr <- names(x)[pos] + # Always rename results to "slide_value_". + result_col_names <- paste0("slide_value_", col_names_chr) slide_one_grp <- function(.data_group, .group_key, ...) { missing_times <- all_dates[!(all_dates %in% .data_group$time_value)] @@ -600,19 +581,19 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values, # If a group contains duplicate time values, `frollmean` will still only # use the last `k` obs. It isn't looking at dates, it just goes in row # order. So if the computation is aggregating across multiple obs for the - # same date, `epi_slide_mean` will produce incorrect results; `epi_slide` - # should be used instead. + # same date, `epi_slide_opt` and derivates will produce incorrect + # results; `epi_slide` should be used instead. if (anyDuplicated(.data_group$time_value) != 0L) { cli_abort( c( - "group contains duplicate time values. Using `epi_slide_mean` on this + "group contains duplicate time values. Using `epi_slide_[opt/mean/sum]` on this group will result in incorrect results", "i" = "Please change the grouping structure of the input data so that each group has non-duplicate time values (e.g. `x %>% group_by(geo_value) - %>% epi_slide_mean`)", + %>% epi_slide_opt(f = frollmean)`)", "i" = "Use `epi_slide` to aggregate across groups" ), - class = "epiprocess__epi_slide_mean__duplicate_time_values", + class = "epiprocess__epi_slide_opt__duplicate_time_values", epiprocess__data_group = .data_group, epiprocess__group_key = .group_key ) @@ -624,7 +605,7 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values, "i" = c("Input data may contain `time_values` closer together than the expected `time_step` size") ), - class = "epiprocess__epi_slide_mean__unexpected_row_number", + class = "epiprocess__epi_slide_opt__unexpected_row_number", epiprocess__data_group = .data_group, epiprocess__group_key = .group_key ) @@ -669,7 +650,7 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values, } if (!is_epi_df(result)) { - # `all_rows`handling strip epi_df format and metadata. + # `all_rows`handling strips epi_df format and metadata. # Restore them. result <- reclass(result, attributes(x)$metadata) } @@ -700,50 +681,51 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values, #' # slide a 7-day trailing average formula on cases #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide_mean(cases, new_col_name = "cases_7dav", names_sep = NULL, before = 6) %>% +#' epi_slide_mean(cases, before = 6) %>% #' # Remove a nonessential var. to ensure new col is printed -#' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% +#' dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) %>% #' ungroup() #' #' # slide a 7-day trailing average formula on cases. Adjust `frollmean` settings for speed #' # and accuracy, and to allow partially-missing windows. #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide_mean(cases, -#' new_col_name = "cases_7dav", names_sep = NULL, before = 6, +#' epi_slide_mean( +#' cases, +#' before = 6, #' # `frollmean` options #' na.rm = TRUE, algo = "exact", hasNA = TRUE #' ) %>% -#' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% +#' dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) %>% #' ungroup() #' #' # slide a 7-day leading average #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide_mean(cases, new_col_name = "cases_7dav", names_sep = NULL, after = 6) %>% +#' epi_slide_mean(cases, after = 6) %>% #' # Remove a nonessential var. to ensure new col is printed -#' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% +#' dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) %>% #' ungroup() #' #' # slide a 7-day centre-aligned average #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide_mean(cases, new_col_name = "cases_7dav", names_sep = NULL, before = 3, after = 3) %>% +#' epi_slide_mean(cases, before = 3, after = 3) %>% #' # Remove a nonessential var. to ensure new col is printed -#' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% +#' dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) %>% #' ungroup() #' #' # slide a 14-day centre-aligned average #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide_mean(cases, new_col_name = "cases_14dav", names_sep = NULL, before = 6, after = 7) %>% +#' epi_slide_mean(cases, before = 6, after = 7) %>% #' # Remove a nonessential var. to ensure new col is printed -#' dplyr::select(geo_value, time_value, cases, cases_14dav) %>% +#' dplyr::select(geo_value, time_value, cases, cases_14dav = slide_value_cases) %>% #' ungroup() epi_slide_mean <- function(x, col_names, ..., before, after, ref_time_values, time_step, - new_col_name = "slide_value", as_list_col = NULL, - names_sep = "_", all_rows = FALSE) { + new_col_name = NULL, as_list_col = NULL, + names_sep = NULL, all_rows = FALSE) { epi_slide_opt( x = x, col_names = {{ col_names }}, @@ -783,14 +765,14 @@ epi_slide_mean <- function(x, col_names, ..., before, after, ref_time_values, #' # slide a 7-day trailing sum formula on cases #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide_sum(cases, new_col_name = "cases_7dsum", names_sep = NULL, before = 6) %>% +#' epi_slide_sum(cases, before = 6) %>% #' # Remove a nonessential var. to ensure new col is printed -#' dplyr::select(geo_value, time_value, cases, cases_7dsum) %>% +#' dplyr::select(geo_value, time_value, cases, cases_7dsum = slide_value_cases) %>% #' ungroup() epi_slide_sum <- function(x, col_names, ..., before, after, ref_time_values, time_step, - new_col_name = "slide_value", as_list_col = NULL, - names_sep = "_", all_rows = FALSE) { + new_col_name = NULL, as_list_col = NULL, + names_sep = NULL, all_rows = FALSE) { epi_slide_opt( x = x, col_names = {{ col_names }}, @@ -859,7 +841,7 @@ full_date_seq <- function(x, before, after, time_step) { "i" = c("The input data's `time_type` was probably `custom` or `day-time`. These require also passing a `time_step` function.") ), - class = "epiprocess__epi_slide_mean__unmappable_time_type", + class = "epiprocess__full_date_seq__unmappable_time_type", epiprocess__time_type = ttype ) } diff --git a/man-roxygen/opt-slide-params.R b/man-roxygen/opt-slide-params.R index a7d5b04a..d13921b2 100644 --- a/man-roxygen/opt-slide-params.R +++ b/man-roxygen/opt-slide-params.R @@ -1,5 +1,13 @@ -#' @param col_names A single tidyselection or a tidyselection vector of the -#' names of one or more columns for which to calculate the rolling mean. +#' @param col_names <[`tidy-select`][dplyr_tidy_select]> An unquoted column +#' name(e.g., `cases`), multiple column names (e.g., `c(cases, deaths)`), or +#' [other tidy-select expression][tidyselect::language]. Variable names can +#' be used as if they were positions in the data frame, so expressions like +#' `x:y` can be used to select a range of variables. If you have the desired +#' column names stored in a vector `vars`, use `col_names = all_of(vars)`. +#' +#' The tidy-selection renaming interface is not supported, and cannot be used +#' to provide output column names; if you want to customize the output column +#' names, use [`dplyr::rename`] after the slide. #' @param as_list_col Not supported. Included to match `epi_slide` interface. #' @param new_col_name Character vector indicating the name(s) of the new #' column(s) that will contain the derivative values. Default diff --git a/man/epi_slide.Rd b/man/epi_slide.Rd index 0d0dfb55..a1319f99 100644 --- a/man/epi_slide.Rd +++ b/man/epi_slide.Rd @@ -160,8 +160,8 @@ through the \code{new_col_name} argument. } \examples{ # slide a 7-day trailing average formula on cases -# This and other simple sliding means are much faster to do using -# the `epi_slide_mean` function instead. +# Simple sliding means and sums are much faster to do using +# the `epi_slide_mean` and `epi_slide_sum` functions instead. jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% epi_slide(cases_7dav = mean(cases), before = 6) \%>\% diff --git a/man/epi_slide_mean.Rd b/man/epi_slide_mean.Rd index ee3e7838..850a45a1 100644 --- a/man/epi_slide_mean.Rd +++ b/man/epi_slide_mean.Rd @@ -12,9 +12,9 @@ epi_slide_mean( after, ref_time_values, time_step, - new_col_name = "slide_value", + new_col_name = NULL, as_list_col = NULL, - names_sep = "_", + names_sep = NULL, all_rows = FALSE ) } @@ -23,8 +23,16 @@ epi_slide_mean( or ungrouped. If ungrouped, all data in \code{x} will be treated as part of a single data group.} -\item{col_names}{A single tidyselection or a tidyselection vector of the -names of one or more columns for which to calculate the rolling mean.} +\item{col_names}{<\code{\link[=dplyr_tidy_select]{tidy-select}}> An unquoted column +name(e.g., \code{cases}), multiple column names (e.g., \code{c(cases, deaths)}), or +\link[tidyselect:language]{other tidy-select expression}. Variable names can +be used as if they were positions in the data frame, so expressions like +\code{x:y} can be used to select a range of variables. If you have the desired +column names stored in a vector \code{vars}, use \code{col_names = all_of(vars)}. + +The tidy-selection renaming interface is not supported, and cannot be used +to provide output column names; if you want to customize the output column +names, use \code{\link[dplyr:rename]{dplyr::rename}} after the slide.} \item{...}{Additional arguments to pass to \code{data.table::frollmean}, for example, \code{na.rm} and \code{algo}. \code{data.table::frollmean} is automatically @@ -125,45 +133,46 @@ misspelled.) # slide a 7-day trailing average formula on cases jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide_mean(cases, new_col_name = "cases_7dav", names_sep = NULL, before = 6) \%>\% + epi_slide_mean(cases, before = 6) \%>\% # Remove a nonessential var. to ensure new col is printed - dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% + dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) \%>\% ungroup() # slide a 7-day trailing average formula on cases. Adjust `frollmean` settings for speed # and accuracy, and to allow partially-missing windows. jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide_mean(cases, - new_col_name = "cases_7dav", names_sep = NULL, before = 6, + epi_slide_mean( + cases, + before = 6, # `frollmean` options na.rm = TRUE, algo = "exact", hasNA = TRUE ) \%>\% - dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% + dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) \%>\% ungroup() # slide a 7-day leading average jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide_mean(cases, new_col_name = "cases_7dav", names_sep = NULL, after = 6) \%>\% + epi_slide_mean(cases, after = 6) \%>\% # Remove a nonessential var. to ensure new col is printed - dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% + dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) \%>\% ungroup() # slide a 7-day centre-aligned average jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide_mean(cases, new_col_name = "cases_7dav", names_sep = NULL, before = 3, after = 3) \%>\% + epi_slide_mean(cases, before = 3, after = 3) \%>\% # Remove a nonessential var. to ensure new col is printed - dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% + dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) \%>\% ungroup() # slide a 14-day centre-aligned average jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide_mean(cases, new_col_name = "cases_14dav", names_sep = NULL, before = 6, after = 7) \%>\% + epi_slide_mean(cases, before = 6, after = 7) \%>\% # Remove a nonessential var. to ensure new col is printed - dplyr::select(geo_value, time_value, cases, cases_14dav) \%>\% + dplyr::select(geo_value, time_value, cases, cases_14dav = slide_value_cases) \%>\% ungroup() } \seealso{ diff --git a/man/epi_slide_opt.Rd b/man/epi_slide_opt.Rd index 0772b431..4b011c16 100644 --- a/man/epi_slide_opt.Rd +++ b/man/epi_slide_opt.Rd @@ -13,9 +13,9 @@ epi_slide_opt( after, ref_time_values, time_step, - new_col_name = "slide_value", + new_col_name = NULL, as_list_col = NULL, - names_sep = "_", + names_sep = NULL, all_rows = FALSE ) } @@ -24,8 +24,16 @@ epi_slide_opt( or ungrouped. If ungrouped, all data in \code{x} will be treated as part of a single data group.} -\item{col_names}{A single tidyselection or a tidyselection vector of the -names of one or more columns for which to calculate the rolling mean.} +\item{col_names}{<\code{\link[=dplyr_tidy_select]{tidy-select}}> An unquoted column +name(e.g., \code{cases}), multiple column names (e.g., \code{c(cases, deaths)}), or +\link[tidyselect:language]{other tidy-select expression}. Variable names can +be used as if they were positions in the data frame, so expressions like +\code{x:y} can be used to select a range of variables. If you have the desired +column names stored in a vector \code{vars}, use \code{col_names = all_of(vars)}. + +The tidy-selection renaming interface is not supported, and cannot be used +to provide output column names; if you want to customize the output column +names, use \code{\link[dplyr:rename]{dplyr::rename}} after the slide.} \item{f}{Function; together with \code{...} specifies the computation to slide. \code{f} must be one of \code{data.table}'s rolling functions @@ -148,23 +156,23 @@ jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% epi_slide_opt( cases, - f = data.table::frollmean, new_col_name = "cases_7dav", names_sep = NULL, before = 6 + f = data.table::frollmean, before = 6 ) \%>\% - # Remove a nonessential var. to ensure new col is printed - dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% + # Remove a nonessential var. to ensure new col is printed, and rename new col + dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) \%>\% ungroup() # slide a 7-day trailing average formula on cases. Adjust `frollmean` settings for speed # and accuracy, and to allow partially-missing windows. jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide_opt(cases, - f = data.table::frollmean, - new_col_name = "cases_7dav", names_sep = NULL, before = 6, + epi_slide_opt( + cases, + f = data.table::frollmean, before = 6, # `frollmean` options na.rm = TRUE, algo = "exact", hasNA = TRUE ) \%>\% - dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% + dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) \%>\% ungroup() # slide a 7-day leading average @@ -172,10 +180,10 @@ jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% epi_slide_opt( cases, - f = slider::slide_mean, new_col_name = "cases_7dav", names_sep = NULL, after = 6 + f = slider::slide_mean, after = 6 ) \%>\% # Remove a nonessential var. to ensure new col is printed - dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% + dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) \%>\% ungroup() # slide a 7-day centre-aligned sum. This can also be done with `epi_slide_sum` @@ -183,10 +191,10 @@ jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% epi_slide_opt( cases, - f = data.table::frollsum, new_col_name = "cases_7dav", names_sep = NULL, before = 3, after = 3 + f = data.table::frollsum, before = 3, after = 3 ) \%>\% # Remove a nonessential var. to ensure new col is printed - dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% + dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) \%>\% ungroup() } \seealso{ diff --git a/man/epi_slide_sum.Rd b/man/epi_slide_sum.Rd index d5961f27..8c835bdb 100644 --- a/man/epi_slide_sum.Rd +++ b/man/epi_slide_sum.Rd @@ -12,9 +12,9 @@ epi_slide_sum( after, ref_time_values, time_step, - new_col_name = "slide_value", + new_col_name = NULL, as_list_col = NULL, - names_sep = "_", + names_sep = NULL, all_rows = FALSE ) } @@ -23,8 +23,16 @@ epi_slide_sum( or ungrouped. If ungrouped, all data in \code{x} will be treated as part of a single data group.} -\item{col_names}{A single tidyselection or a tidyselection vector of the -names of one or more columns for which to calculate the rolling mean.} +\item{col_names}{<\code{\link[=dplyr_tidy_select]{tidy-select}}> An unquoted column +name(e.g., \code{cases}), multiple column names (e.g., \code{c(cases, deaths)}), or +\link[tidyselect:language]{other tidy-select expression}. Variable names can +be used as if they were positions in the data frame, so expressions like +\code{x:y} can be used to select a range of variables. If you have the desired +column names stored in a vector \code{vars}, use \code{col_names = all_of(vars)}. + +The tidy-selection renaming interface is not supported, and cannot be used +to provide output column names; if you want to customize the output column +names, use \code{\link[dplyr:rename]{dplyr::rename}} after the slide.} \item{...}{Additional arguments to pass to \code{data.table::frollsum}, for example, \code{na.rm} and \code{algo}. \code{data.table::frollsum} is automatically @@ -125,9 +133,9 @@ misspelled.) # slide a 7-day trailing sum formula on cases jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide_sum(cases, new_col_name = "cases_7dsum", names_sep = NULL, before = 6) \%>\% + epi_slide_sum(cases, before = 6) \%>\% # Remove a nonessential var. to ensure new col is printed - dplyr::select(geo_value, time_value, cases, cases_7dsum) \%>\% + dplyr::select(geo_value, time_value, cases, cases_7dsum = slide_value_cases) \%>\% ungroup() } \seealso{ diff --git a/tests/testthat/test-epi_slide.R b/tests/testthat/test-epi_slide.R index 6d66e0c4..8765d50c 100644 --- a/tests/testthat/test-epi_slide.R +++ b/tests/testthat/test-epi_slide.R @@ -29,7 +29,7 @@ toy_edf <- tibble::tribble( as_epi_df(as_of = 100) # nolint start: line_length_linter. -basic_result_from_size1_sum <- tibble::tribble( +basic_sum_result <- tibble::tribble( ~geo_value, ~time_value, ~value, ~slide_value, "a", 1:10, 2L^(1:10), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE), "b", 1:10, 2L^(11:20), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE), @@ -38,7 +38,7 @@ basic_result_from_size1_sum <- tibble::tribble( dplyr::arrange(time_value) %>% as_epi_df(as_of = 100) -basic_result_from_size1_mean <- tibble::tribble( +basic_mean_result <- tibble::tribble( ~geo_value, ~time_value, ~value, ~slide_value, "a", 1:10, 2L^(1:10), data.table::frollmean(2L^(1:10), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE), ) %>% @@ -315,27 +315,29 @@ test_that( ) test_that("computation output formats x as_list_col", { - # See `toy_edf` and `basic_result_from_size1_sum` definitions at top of file. + # See `toy_edf` and `basic_sum_result` definitions at top of file. # We'll try 7d sum with a few formats. expect_identical( toy_edf %>% epi_slide(before = 6L, ~ sum(.x$value)), - basic_result_from_size1_sum + basic_sum_result ) expect_identical( toy_edf %>% epi_slide(before = 6L, ~ sum(.x$value), as_list_col = TRUE), - basic_result_from_size1_sum %>% dplyr::mutate(slide_value = as.list(slide_value)) + basic_sum_result %>% dplyr::mutate(slide_value = as.list(slide_value)) ) expect_identical( toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value))), - basic_result_from_size1_sum %>% rename(slide_value_value = slide_value) + basic_sum_result %>% rename(slide_value_value = slide_value) ) expect_identical( toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)), as_list_col = TRUE), - basic_result_from_size1_sum %>% + basic_sum_result %>% mutate(slide_value = purrr::map(slide_value, ~ data.frame(value = .x))) ) +}) - # See `toy_edf` and `basic_result_from_size1_mean` definitions at top of file. +test_that("epi_slide_mean errors when `as_list_col` non-NULL", { + # See `toy_edf` and `basic_mean_result` definitions at top of file. # We'll try 7d avg with a few formats. # Warning: not exactly the same naming behavior as `epi_slide`. expect_identical( @@ -347,7 +349,7 @@ test_that("computation output formats x as_list_col", { value, before = 6L, na.rm = TRUE ), - basic_result_from_size1_mean %>% dplyr::mutate( + basic_mean_result %>% dplyr::mutate( slide_value_value = slide_value ) %>% select(-slide_value) @@ -361,7 +363,7 @@ test_that("computation output formats x as_list_col", { value, before = 6L, as_list_col = TRUE, na.rm = TRUE ), - class = "epiproces__epi_slide_mean__list_not_supported" + class = "epiprocess__epi_slide_opt__list_not_supported" ) # `epi_slide_mean` doesn't return dataframe columns }) @@ -373,7 +375,7 @@ test_that("nested dataframe output names are controllable", { before = 6L, ~ data.frame(value = sum(.x$value)), new_col_name = "result" ), - basic_result_from_size1_sum %>% rename(result_value = slide_value) + basic_sum_result %>% rename(result_value = slide_value) ) expect_identical( toy_edf %>% @@ -381,17 +383,7 @@ test_that("nested dataframe output names are controllable", { before = 6L, ~ data.frame(value_sum = sum(.x$value)), names_sep = NULL ), - basic_result_from_size1_sum %>% rename(value_sum = slide_value) - ) - expect_identical( - toy_edf %>% filter( - geo_value == "a" - ) %>% - epi_slide_mean( - value, - before = 6L, names_sep = NULL, na.rm = TRUE - ), - basic_result_from_size1_mean + basic_sum_result %>% rename(value_sum = slide_value) ) }) @@ -482,7 +474,8 @@ test_that("`ref_time_values` + `all_rows = TRUE` works", { value, before = 6L, names_sep = NULL, na.rm = TRUE ), - basic_result_from_size1_mean + basic_mean_result %>% + rename(slide_value_value = slide_value) ) expect_identical( toy_edf %>% filter( @@ -493,7 +486,8 @@ test_that("`ref_time_values` + `all_rows = TRUE` works", { before = 6L, ref_time_values = c(2L, 8L), names_sep = NULL, na.rm = TRUE ), - filter(basic_result_from_size1_mean, time_value %in% c(2L, 8L)) + filter(basic_mean_result, time_value %in% c(2L, 8L)) %>% + rename(slide_value_value = slide_value) ) expect_identical( toy_edf %>% filter( @@ -504,10 +498,11 @@ test_that("`ref_time_values` + `all_rows = TRUE` works", { before = 6L, ref_time_values = c(2L, 8L), all_rows = TRUE, names_sep = NULL, na.rm = TRUE ), - basic_result_from_size1_mean %>% - dplyr::mutate(slide_value = dplyr::if_else(time_value %in% c(2L, 8L), + basic_mean_result %>% + dplyr::mutate(slide_value_value = dplyr::if_else(time_value %in% c(2L, 8L), slide_value, NA_integer_ - )) + )) %>% + select(-slide_value) ) # slide computations returning data frames: @@ -662,7 +657,7 @@ test_that("basic grouped epi_slide_mean computation produces expected output", { as_epi_df(as_of = d + 6) result1 <- epi_slide_mean(small_x, value, before = 50, names_sep = NULL, na.rm = TRUE) - expect_identical(result1, expected_output) + expect_identical(result1, expected_output %>% rename(slide_value_value = slide_value)) }) test_that("ungrouped epi_slide computation completes successfully", { @@ -722,14 +717,14 @@ test_that("basic ungrouped epi_slide_mean computation produces expected output", ungroup() %>% filter(geo_value == "ak") %>% epi_slide_mean(value, before = 50, names_sep = NULL, na.rm = TRUE) - expect_identical(result1, expected_output) + expect_identical(result1, expected_output %>% rename(slide_value_value = slide_value)) # Ungrouped with multiple geos # epi_slide_mean fails when input data groups contain duplicate time_values, # e.g. aggregating across geos expect_error( small_x %>% ungroup() %>% epi_slide_mean(value, before = 6L), - class = "epiprocess__epi_slide_mean__duplicate_time_values" + class = "epiprocess__epi_slide_opt__duplicate_time_values" ) }) @@ -928,7 +923,7 @@ test_that("basic slide behavior is correct when groups have non-overlapping date expect_identical(result1, expected_output) result2 <- epi_slide_mean(small_x_misaligned_dates, value, before = 50, names_sep = NULL, na.rm = TRUE) - expect_identical(result2, expected_output) + expect_identical(result2, expected_output %>% rename(slide_value_value = slide_value)) }) @@ -1152,7 +1147,7 @@ test_that("special time_types without time_step fail in epi_slide_mean", { col_names = a, before = before, after = after ), - class = "epiprocess__epi_slide_mean__unmappable_time_type" + class = "epiprocess__full_date_seq__unmappable_time_type" ) } @@ -1376,14 +1371,7 @@ test_that("`epi_slide_mean` errors when passed `time_values` with closer than ex as_epi_df() expect_error( epi_slide_mean(time_df, value, before = 6L, time_step = lubridate::seconds), - class = "epiprocess__epi_slide_mean__unexpected_row_number" - ) -}) - -test_that("`epi_slide_mean` errors when passed `col_names` as list", { - expect_error( - epi_slide_mean(grouped, col_names = list(value), before = 1L, after = 0L, ref_time_values = d + 1), - class = "epiprocess__epi_slide_mean__col_names_in_list" + class = "epiprocess__epi_slide_opt__unexpected_row_number" ) })