From 7d22ef829fc9c0b4d16bd67c6ddf0c0fe74f21d0 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Wed, 31 Jul 2024 13:44:13 -0500 Subject: [PATCH] first draft of epi_slide in step_epi_slide --- NAMESPACE | 3 + R/step_epi_slide.R | 83 +++++++++++++++++++++------- man/epi_slide_wrapper.Rd | 28 ++++++++++ man/step_epi_slide.Rd | 5 +- tests/testthat/test-step_epi_slide.R | 79 ++++++++++++++++---------- 5 files changed, 146 insertions(+), 52 deletions(-) create mode 100644 man/epi_slide_wrapper.Rd diff --git a/NAMESPACE b/NAMESPACE index f7f880501..5d045ec8f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -226,10 +226,12 @@ importFrom(checkmate,assert_scalar) importFrom(cli,cli_abort) importFrom(dplyr,across) importFrom(dplyr,all_of) +importFrom(dplyr,bind_cols) importFrom(dplyr,group_by) importFrom(dplyr,n) importFrom(dplyr,summarise) importFrom(dplyr,ungroup) +importFrom(epiprocess,epi_slide) importFrom(epiprocess,growth_rate) importFrom(generics,augment) importFrom(generics,fit) @@ -269,6 +271,7 @@ importFrom(stats,qnorm) importFrom(stats,quantile) importFrom(stats,residuals) importFrom(tibble,tibble) +importFrom(tidyr,crossing) importFrom(tidyr,drop_na) importFrom(vctrs,as_list_of) importFrom(vctrs,field) diff --git a/R/step_epi_slide.R b/R/step_epi_slide.R index 48f322561..64cd5370f 100644 --- a/R/step_epi_slide.R +++ b/R/step_epi_slide.R @@ -166,30 +166,71 @@ bake.step_epi_slide <- function(object, new_data, ...) { c("In `step_epi_slide()` a name collision occurred. The following variable names already exist:", `*` = "{.var {nms}}" ), - call = caller_env() + call = caller_env(), + class = "epipredict__step__name_collision_error" ) } - - ok <- object$keys - names(col_names) <- newnames - gr <- new_data %>% - dplyr::select(dplyr::all_of(c(ok, object$columns))) %>% - group_by(dplyr::across(dplyr::all_of(ok[-1]))) %>% - dplyr::arrange(time_value) %>% - dplyr::mutate( - dplyr::across( - dplyr::all_of(object$columns), - ~ slider::slide_index_vec( - .x, - .i = time_value, - object$.f, .before = object$before, .after = object$after - ) - ) + if (any(vapply(c(mean, sum), \(x) identical(x, object$.f), logical(1L)))) { + cli_warn( + c("There is an optimized version of both mean and sum. See `step_epi_slide_mean`, `step_epi_slide_sum`, or `step_epi_slide_opt`." + ), + class = "epipredict__step_epi_slide__optimized_version" + ) + } + epi_slide_wrapper( + new_data, + object$before, + object$after, + object$columns, + c(object$.f), + object$f_name, + object$keys[-1], + object$prefix + ) +} +#' wrapper to handle epi_slide particulars +#' @description +#' This should simplify somewhat in the future when we can run `epi_slide` on +#' columns. Surprisingly, lapply is several orders of magnitude faster than +#' using roughly equivalent tidy select style. +#' @param fns vector of functions, even if it's length 1. +#' @param group_keys the keys to group by. likely epi_keys[-1] (to remove time_value) +#' @importFrom tidyr crossing +#' @importFrom dplyr bind_cols group_by ungroup +#' @importFrom epiprocess epi_slide +#' @keywords internal +epi_slide_wrapper <- function(new_data, before, after, columns, fns, fn_names, group_keys, name_prefix) { + cols_fns <- tidyr::crossing(col_name = columns, fn_name = fn_names, fn = fns) + seq_len(nrow(cols_fns)) %>% + lapply( # iterate over the rows of cols_fns + # takes in the row number, outputs the transformed column + function(comp_i) { + # extract values from the row + col_name <- cols_fns[[comp_i, "col_name"]] + fn_name <- cols_fns[[comp_i, "fn_name"]] + fn <- cols_fns[[comp_i, "fn"]][[1L]] + result_name <- paste(name_prefix, fn_name, col_name, sep="_") + result <- new_data %>% + group_by(across(group_keys)) %>% + epi_slide( + before = before, + after = after, + new_col_name = result_name, + f = function(slice, geo_key, ref_time_value) { + fn(slice[[col_name]]) + } + ) %>% + ungroup() + # the first result needs to include all of the original columns + if (comp_i == 1L) { + result + } else { + # everything else just needs that column transformed + result[result_name] + } + } ) %>% - dplyr::rename(dplyr::all_of(col_names)) %>% - dplyr::ungroup() - - dplyr::left_join(new_data, gr, by = ok) + bind_cols() } diff --git a/man/epi_slide_wrapper.Rd b/man/epi_slide_wrapper.Rd new file mode 100644 index 000000000..4f1ffe285 --- /dev/null +++ b/man/epi_slide_wrapper.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/step_epi_slide.R +\name{epi_slide_wrapper} +\alias{epi_slide_wrapper} +\title{wrapper to handle epi_slide particulars} +\usage{ +epi_slide_wrapper( + new_data, + before, + after, + columns, + fns, + fn_names, + group_keys, + name_prefix +) +} +\arguments{ +\item{fns}{vector of functions, even if it's length 1.} + +\item{group_keys}{the keys to group by. likely epi_keys\link{-1} (to remove time_value)} +} +\description{ +This should simplify somewhat in the future when we can run \code{epi_slide} on +columns. Surprisingly, lapply is several orders of magnitude faster than +using roughly equivalent tidy select style. +} +\keyword{internal} diff --git a/man/step_epi_slide.Rd b/man/step_epi_slide.Rd index 3baefbc3a..0e066009d 100644 --- a/man/step_epi_slide.Rd +++ b/man/step_epi_slide.Rd @@ -38,7 +38,8 @@ won't produce an error until you try to actually fit the model) Note that in cases 3 and 4, \code{x} can be any variable name you like (for example \verb{\\(dog) mean(dog, na.rm = TRUE)} will work). But in case 5, the argument must be named \code{.x}. A common, though very difficult to debug -error is using something like \code{function(x) mean}. This will not work.} +error is using something like \code{function(x) mean}. This will not work +because it returns the function mean, rather than \code{mean(x)}} \item{before, after}{non-negative integers. How far \code{before} and \code{after} each \code{time_value} should @@ -67,7 +68,7 @@ they be assigned? \code{lag} is default a predictor while \code{ahead} is an out \item{f_name}{a character string of at most 20 characters that describes the function. This will be combined with \code{prefix} and the columns in \code{...} -to name the result using \verb{\{prefix\}\{f_name\}_\{column\}}. It will be determined +to name the result using \verb{\{prefix\}\{f_name\}_\{column\}}. By default it will be determined automatically using \code{clean_f_name()}.} \item{skip}{A logical. Should the step be skipped when the diff --git a/tests/testthat/test-step_epi_slide.R b/tests/testthat/test-step_epi_slide.R index 0f92be588..215fdb43c 100644 --- a/tests/testthat/test-step_epi_slide.R +++ b/tests/testthat/test-step_epi_slide.R @@ -55,36 +55,54 @@ rolled_after <- edf %>% test_that("epi_slide handles classed before/after", { - baseline <- r %>% - step_epi_slide(value, .f = mean, before = 3L) %>% - prep(edf) %>% - bake(new_data = NULL) + expect_warning( + baseline <- r %>% + step_epi_slide(value, .f = mean, before = 3L) %>% + prep(edf) %>% + bake(new_data = NULL), + regexp = "There is an optimized version" + ) expect_equal(baseline[[4]], rolled_before) - pbefore <- r %>% - step_epi_slide(value, .f = mean, before = lubridate::period("3 days")) %>% - prep(edf) %>% - bake(new_data = NULL) - cbefore <- r %>% - step_epi_slide(value, .f = mean, before = "3 days") %>% - prep(edf) %>% - bake(new_data = NULL) + expect_warning( + pbefore <- r %>% + step_epi_slide(value, .f = mean, before = lubridate::period("3 days")) %>% + prep(edf) %>% + bake(new_data = NULL), + regexp = "There is an optimized version" + ) + expect_warning( + cbefore <- r %>% + step_epi_slide(value, .f = mean, before = "3 days") %>% + prep(edf) %>% + bake(new_data = NULL), + regexp = "There is an optimized version" + ) expect_equal(baseline, pbefore) expect_equal(baseline, cbefore) - baseline <- r %>% - step_epi_slide(value, .f = mean, after = 3L) %>% - prep(edf) %>% - bake(new_data = NULL) + expect_warning( + baseline <- r %>% + step_epi_slide(value, .f = mean, after = 3L) %>% + prep(edf) %>% + bake(new_data = NULL), + regexp = "There is an optimized version" + ) expect_equal(baseline[[4]], rolled_after) - pafter <- r %>% - step_epi_slide(value, .f = mean, after = lubridate::period("3 days")) %>% - prep(edf) %>% - bake(new_data = NULL) - cafter <- r %>% - step_epi_slide(value, .f = mean, after = "3 days") %>% - prep(edf) %>% - bake(new_data = NULL) + expect_warning( + pafter <- r %>% + step_epi_slide(value, .f = mean, after = lubridate::period("3 days")) %>% + prep(edf) %>% + bake(new_data = NULL), + regexp = "There is an optimized version" + ) + expect_warning( + cafter <- r %>% + step_epi_slide(value, .f = mean, after = "3 days") %>% + prep(edf) %>% + bake(new_data = NULL), + regexp = "There is an optimized version" + ) expect_equal(baseline, pafter) expect_equal(baseline, cafter) }) @@ -99,10 +117,13 @@ test_that("epi_slide handles different function specs", { step_epi_slide(value, .f = mean, before = 3L) %>% prep(edf) %>% bake(new_data = NULL) - lfun <- r %>% - step_epi_slide(value, .f = ~ mean(.x, na.rm = TRUE), before = 3L) %>% - prep(edf) %>% - bake(new_data = NULL) + # formula NOT currently supported + expect_error( + lfun <- r %>% + step_epi_slide(value, .f = ~ mean(.x, na.rm = TRUE), before = 3L) %>% + prep(edf) %>% + bake(new_data = NULL) + ) blfun <- r %>% step_epi_slide(value, .f = function(x) mean(x, na.rm = TRUE), before = 3L) %>% prep(edf) %>% @@ -114,7 +135,7 @@ test_that("epi_slide handles different function specs", { expect_equal(cfun[[4]], rolled_before) expect_equal(ffun[[4]], rolled_before) - expect_equal(lfun[[4]], rolled_before) + #expect_equal(lfun[[4]], rolled_before) expect_equal(blfun[[4]], rolled_before) expect_equal(nblfun[[4]], rolled_before) })