diff --git a/tests/testthat/test-epi_slide.R b/tests/testthat/test-epi_slide.R index da88be7d..8ee2ee80 100644 --- a/tests/testthat/test-epi_slide.R +++ b/tests/testthat/test-epi_slide.R @@ -2,7 +2,7 @@ library(cli) library(dplyr) library(purrr) -num_rows_per_group <- 20 +num_rows_per_group <- 30 get_test_date <- function(time_type = "day") { switch(time_type, day = as.Date("2020-01-01"), @@ -19,49 +19,74 @@ get_test_units <- function(time_type = "day") { integer = 1L ) } -get_test_dataset <- function(n, time_type = "day", other_keys = character()) { +# Returns a tibble with two geos on the same time index and one geo with a +# different but overlapping time index. Each geo has a missing value somewhere +# in the middle and a separate reported NA elsewhere. +get_test_dataset <- function(n, time_type = "day", other_keys = FALSE) { checkmate::assert_integerish(n, lower = 1) checkmate::assert_character(time_type) - checkmate::assert_character(other_keys) - checkmate::assert_subset(other_keys, "x") + checkmate::assert_logical(other_keys) # Do this to actually get n rows per group. n_ <- n - 1 + values <- vctrs::vec_assign(0:n_, floor(n * 2 / 3), value = NA_real_) test_date <- get_test_date(time_type) units <- get_test_units(time_type) - # A tibble with two geos on the same time index and one geo with a different - # but overlapping time index. Each geo has a missing value somewhere in the middle. - tibble::tribble( - ~geo_value, ~time_value, ~value, ~x, - "a", test_date + units * 0:n_, (0:n_)**2, rep(c(1, 2), length.out = n), - "b", test_date + units * 0:n_, (10 * n + 0:n_)**2, rep(c(1, 2), length.out = n), - "c", test_date + units * (floor(n / 2) + 0:n_), (100 * n + 0:n_)**2, rep(c(1, 2), length.out = n) + df <- tibble::tribble( + ~geo_value, ~time_value, ~value, + "a", test_date + units * 0:n_, values**2, + "b", test_date + units * 0:n_, (10 * n + values)**2, + "c", test_date + units * (floor(n / 2) + 0:n_), (100 * n + values)**2, ) %>% - tidyr::unnest_longer(c(time_value, value, x)) %>% - slice(-10) %>% - as_epi_df(as_of = test_date + n, other_keys = other_keys) %>% - group_by(geo_value) + tidyr::unnest_longer(c(time_value, value)) %>% + slice(-10) + + if (other_keys) { + df <- bind_rows( + df %>% mutate(x = 1, value = value + 1), + df %>% mutate(x = 2, value = value + 2), + ) %>% + as_epi_df(as_of = test_date + n, other_keys = "x") + } else { + df <- df %>% + as_epi_df(as_of = test_date + n) + } + df %>% + arrange_canonical() %>% + group_epi_df() } test_data <- get_test_dataset(num_rows_per_group, "day") -# TODO: Add a test that uses an 'other_key' grouping column. -# TODO: Add a case where the data contains NA values (not just gaps in time_value). - epi_slide_sum_test <- function( .x, - .window_size = 1, .align = "right", .ref_time_values = NULL, .all_rows = FALSE) { + .window_size = 7, .align = "right", .ref_time_values = NULL, .all_rows = FALSE) { + checkmate::assert_class(.x, "epi_df") + if (!(checkmate::test_integerish(.window_size, lower = 1, upper = Inf) || identical(as.numeric(.window_size), Inf))) { + cli::cli_abort("`.window_size` must be a positive integer or Inf.") + } + checkmate::assert_character(.align) + checkmate::assert_subset(.align, c("right", "center", "left")) + checkmate::assert( + checkmate::checkClass(.ref_time_values, "Date", null.ok = TRUE), + checkmate::checkClass(.ref_time_values, "yearmonth"), + checkmate::checkClass(.ref_time_values, "numeric") + ) + checkmate::assert_logical(.all_rows) + time_type <- attr(.x, "metadata")$time_type window_args <- get_before_after_from_window(.window_size, .align, time_type) date_seq_list <- full_date_seq(.x, window_args$before, window_args$after, time_type) if (is.null(.ref_time_values)) { .ref_time_values <- date_seq_list$all_dates } + group_keys <- setdiff(key_colnames(.x), "time_value") .x %>% mutate(.real = TRUE) %>% - group_by(geo_value) %>% + group_epi_df() %>% complete(time_value = vctrs::vec_c(!!!date_seq_list, .name_spec = rlang::zap())) %>% - arrange(geo_value, time_value) %>% + arrange_canonical() %>% + group_epi_df() %>% mutate( slide_value = slider::slide_index_sum( .data$value, @@ -83,7 +108,8 @@ epi_slide_sum_test <- function( } }) %>% filter(.real) %>% - select(-.real) + select(-.real) %>% + relocate(all_of(key_colnames(.x)), .before = 1) } concatenate_list_params <- function(p) { paste(paste0(names(p), "=", p), collapse = "\n") @@ -125,6 +151,7 @@ expect_equal_handle_null <- function(x, y) { # Core functionality tests across an exhaustive combination of parameters on # non-trivial data sets with three geo_groups, with non-identical time indices, # with missing time values, and with reported NA values. +# # .ref_time_values can be: # - NULL is a special case where we just use all the unique time_values in the # data. @@ -134,32 +161,49 @@ expect_equal_handle_null <- function(x, y) { # either empty or NA (depending if .all_rows is TRUE or not). # - c(8, 9) corresponds to test_date + 8 * units amd test_date + 9 * units. # In this case, groups a and b have values, but c does not. +# +# We filter down to reduce the number of combinations: +# - Since time_types only interact with .ref_time_values, we fix all the other +# parameters to a single common value. +# - We separate out .window_size=Inf, because it is only defined for +# .align="right". +# - We test .align and .all_rows separately, with a fixed .time_Type and +# .other_keys. param_combinations <- bind_rows( tidyr::expand_grid( .time_type = c("day", "week", "yearmonth", "integer"), + .other_keys = c(TRUE), .ref_time_values = list(NULL, c(1, 2), c(8, 9)), - .all_rows = c(FALSE, TRUE), - .align = c("right", "center", "left"), - .window_size = c(1, 7), + .all_rows = c(TRUE), + .align = c("right"), + .window_size = c(7), ), tidyr::expand_grid( .time_type = c("day", "week", "yearmonth", "integer"), + .other_keys = c(TRUE), .ref_time_values = list(NULL, c(1, 2), c(8, 9)), - .all_rows = c(FALSE, TRUE), + .all_rows = c(TRUE), .align = c("right"), .window_size = c(Inf), - ) + ), + tidyr::expand_grid( + .time_type = c("day"), + .other_keys = c(FALSE), + .ref_time_values = list(NULL, c(1, 2), c(8, 9)), + .all_rows = c(FALSE, TRUE), + .align = c("right", "center", "left"), + .window_size = c(7), + ), ) for (p in (param_combinations %>% transpose())) { - test_data <- get_test_dataset(num_rows_per_group, p$.time_type) + test_data <- get_test_dataset(num_rows_per_group, p$.time_type, p$.other_keys) units <- get_test_units(p$.time_type) test_date <- get_test_date(p$.time_type) p$.window_size <- p$.window_size * units if (!is.null(p$.ref_time_values)) { p$.ref_time_values <- test_date + units * p$.ref_time_values } - as_of <- attr(test_data, "metadata")$as_of - slide_args <- p[setdiff(names(p), c(".time_type"))] + slide_args <- p[setdiff(names(p), c(".time_type", ".other_keys"))] test_that( format_inline( @@ -202,8 +246,8 @@ for (p in (param_combinations %>% transpose())) { rowwise() %>% mutate(slide_value = list(slide_value)) %>% ungroup() %>% - as_epi_df(as_of = as_of) %>% - group_by(geo_value) + as_epi_df(as_of = attr(test_data, "metadata")$as_of, other_keys = attr(test_data, "metadata")$other_keys) %>% + group_epi_df() expect_equal( out %>% select(-slide_value), @@ -224,8 +268,8 @@ for (p in (param_combinations %>% transpose())) { rowwise() %>% mutate(slide_value = list(slide_value)) %>% ungroup() %>% - as_epi_df(as_of = as_of) %>% - group_by(geo_value) + as_epi_df(as_of = attr(test_data, "metadata")$as_of, other_keys = attr(test_data, "metadata")$other_keys) %>% + group_epi_df() expect_equal( out %>% select(-slide_value), expected_out %>% select(-slide_value) @@ -241,7 +285,7 @@ for (p in (param_combinations %>% transpose())) { ), { expected_out <- rlang::inject(epi_slide_sum_test(test_data, !!!slide_args)) - expect_equal_mod( + expect_equal( rlang::inject(epi_slide( test_data, , data.frame(slide_value = sum(.x$value)), !!!slide_args @@ -290,7 +334,6 @@ for (p in (param_combinations %>% transpose())) { ) } -# TODO: This. test_that(".window_size as integer works", { expect_equal( epi_slide(test_data, ~ sum(.x$value), .window_size = 7), @@ -320,11 +363,11 @@ for (bad_value in bad_values) { test_that(format_inline("epi_slide should fail when `.ref_time_values` is out of range for all groups "), { bad_values <- c(min(test_data$time_value) - 1, max(test_data$time_value) + 1) expect_error( - epi_slide(test_data, ~ sum(.x), .ref_time_values = bad_values), + epi_slide(test_data, ~ sum(.x), .ref_time_values = bad_values, .window_size = 7), class = "epiprocess__epi_slide_invalid_ref_time_values" ) expect_error( - epi_slide_mean(test_data, .col_names = value, .ref_time_values = bad_values), + epi_slide_mean(test_data, .col_names = value, .ref_time_values = bad_values, .window_size = 7), class = "epiprocess__epi_slide_opt_invalid_ref_time_values" ) }) @@ -332,14 +375,15 @@ test_that(format_inline("epi_slide should fail when `.ref_time_values` is out of test_that("epi_slide alerts if the provided f doesn't take enough args", { f_tib_avg_count <- function(x, g, t) dplyr::tibble(avg = mean(x$value), count = length(x$value)) expect_no_error( - epi_slide(test_data, f_tib_avg_count), + epi_slide(test_data, f_tib_avg_count, .window_size = 7), ) expect_no_warning( - epi_slide(test_data, f_tib_avg_count), + epi_slide(test_data, f_tib_avg_count, .window_size = 7), ) f_x_dots <- function(x, ...) dplyr::tibble(value = mean(x$value), count = length(x$value)) - expect_warning(epi_slide(test_data, f_x_dots), + expect_warning( + epi_slide(test_data, f_x_dots, .window_size = 7), class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots" ) }) @@ -347,19 +391,19 @@ test_that("epi_slide alerts if the provided f doesn't take enough args", { test_that("epi_slide computation via f can use ref_time_value", { expected_out <- test_data %>% mutate(slide_value = time_value) expect_equal( - test_data %>% epi_slide(~.ref_time_value), + epi_slide(test_data, ~.ref_time_value, .window_size = 7), expected_out ) expect_equal( - test_data %>% epi_slide(~.z), + epi_slide(test_data, ~.z, .window_size = 7), expected_out ) expect_equal( - test_data %>% epi_slide(~..3), + epi_slide(test_data, ~..3, .window_size = 7), expected_out ) expect_equal( - test_data %>% epi_slide(.f = function(x, g, t) t), + epi_slide(test_data, .f = function(x, g, t) t, .window_size = 7), expected_out ) }) @@ -367,66 +411,64 @@ test_that("epi_slide computation via f can use ref_time_value", { test_that("epi_slide computation via f can use group", { expected_out <- test_data %>% mutate(slide_value = geo_value) expect_equal( - test_data %>% epi_slide(~ .group_key$geo_value), + epi_slide(test_data, .f = ~ .group_key$geo_value, .window_size = 7), expected_out ) expect_equal( - test_data %>% epi_slide(~ .y$geo_value), + epi_slide(test_data, .f = ~ .y$geo_value, .window_size = 7), expected_out ) expect_equal( - test_data %>% epi_slide(~ ..2$geo_value), + epi_slide(test_data, .f = ~ ..2$geo_value, .window_size = 7), expected_out ) expect_equal( - test_data %>% epi_slide(.f = function(x, g, t) g$geo_value), + epi_slide(test_data, .f = function(x, g, t) g$geo_value, .window_size = 7), expected_out ) }) test_that("epi_slide computation via dots can use ref_time_value", { expect_equal( - test_data %>% epi_slide(slide_value = .ref_time_value), - test_data %>% mutate(slide_value = time_value) + epi_slide(test_data, slide_value = .ref_time_value, .window_size = 7), + mutate(test_data, slide_value = time_value) ) }) test_that("epi_slide computation via dots can use group", { expect_equal( - test_data %>% epi_slide(slide_value = nrow(.group_key)), - test_data %>% mutate(slide_value = 1L) + epi_slide(test_data, slide_value = nrow(.group_key), .window_size = 7), + mutate(test_data, slide_value = 1L) ) expect_equal( - test_data %>% epi_slide(slide_value = .group_key$geo_value), - test_data %>% mutate(slide_value = geo_value) + epi_slide(test_data, slide_value = .group_key$geo_value, .window_size = 7), + mutate(test_data, slide_value = geo_value) ) }) test_that("epi_slide computation should not allow access from .data and .env", { - expect_error(test_data %>% epi_slide(slide_value = .env$.ref_time_value)) - expect_error(test_data %>% epi_slide(slide_value = .data$.ref_time_value)) - expect_error(test_data %>% epi_slide(slide_value = .env$.group_key)) - expect_error(test_data %>% epi_slide(slide_value = .data$.group_key)) + expect_error(epi_slide(test_data, slide_value = .env$.ref_time_value, .window_size = 7)) + expect_error(epi_slide(test_data, slide_value = .data$.ref_time_value, .window_size = 7)) + expect_error(epi_slide(test_data, slide_value = .env$.group_key, .window_size = 7)) + expect_error(epi_slide(test_data, slide_value = .data$.group_key, .window_size = 7)) }) test_that("epi_slide computation via dots outputs the same result using col names and the data var", { - expected_output <- test_data %>% epi_slide(slide_value = max(time_value)) + expected_output <- epi_slide(test_data, slide_value = max(time_value), .window_size = 7) expect_equal( - test_data %>% epi_slide(slide_value = max(.x$time_value)), + epi_slide(test_data, slide_value = max(.x$time_value), .window_size = 7), expected_output ) expect_equal( - test_data %>% epi_slide(slide_value = max(.data$time_value)), + epi_slide(test_data, slide_value = max(.data$time_value), .window_size = 7), expected_output ) }) test_that("`epi_slide` can access objects inside of helper functions", { helper <- function(archive_haystack, time_value_needle) { - archive_haystack %>% epi_slide( - has_needle = time_value_needle %in% time_value, .window_size = Inf - ) + epi_slide(archive_haystack, has_needle = time_value_needle %in% time_value, .window_size = 7) } expect_no_error(helper(test_data, as.Date("2021-01-01"))) }) @@ -439,6 +481,7 @@ test_that("epi_slide can use sequential data masking expressions including NULL" ) %>% as_epi_df(as_of = 12L) + # TODO: Something's borked here. out1 <- edf_a %>% group_by(geo_value) %>% epi_slide( @@ -449,9 +492,9 @@ test_that("epi_slide can use sequential data masking expressions including NULL" m1 = NULL ) %>% ungroup() %>% - tidyr::drop_na() %>% as_epi_df(as_of = 12L) expect_equal(out1$m5, out1$derived_m5) + expect_true(!"m1" %in% names(out1)) out2 <- edf_a %>% group_by(geo_value) %>% @@ -469,34 +512,31 @@ test_that("epi_slide can use sequential data masking expressions including NULL" test_that("epi_slide complains on invalid computation outputs", { expect_error( - test_data %>% epi_slide(~ lm(value ~ time_value, .x)), + epi_slide(test_data, .f = ~ lm(value ~ time_value, .x), .window_size = 7), class = "epiprocess__invalid_slide_comp_value" ) expect_no_error( - test_data %>% epi_slide(~ list(lm(value ~ time_value, .x))), + epi_slide(test_data, .f = ~ list(lm(value ~ time_value, .x)), .window_size = 7), class = "epiprocess__invalid_slide_comp_value" ) expect_error( - test_data %>% epi_slide(model = lm(value ~ time_value, .x)), + epi_slide(test_data, model = lm(value ~ time_value, .x), .window_size = 7), class = "epiprocess__invalid_slide_comp_tidyeval_output" ) expect_no_error( - test_data %>% epi_slide(model = list(lm(value ~ time_value, .x))), + epi_slide(test_data, model = list(lm(value ~ time_value, .x)), .window_size = 7), class = "epiprocess__invalid_slide_comp_tidyeval_output" ) expect_error( - test_data %>% - epi_slide(.window_size = 6, ~ sum(.x$value) + c(0, 0, 0)), + epi_slide(test_data, .f = ~ sum(.x$value) + c(0, 0, 0), .window_size = 7), class = "epiprocess__invalid_slide_comp_value" ) expect_error( - test_data %>% - epi_slide(.window_size = 6, ~ as.list(sum(.x$value) + c(0, 0, 0))), + epi_slide(test_data, .f = ~ as.list(sum(.x$value) + c(0, 0, 0)), .window_size = 7), class = "epiprocess__invalid_slide_comp_value" ) expect_error( - test_data %>% - epi_slide(.window_size = 6, ~ data.frame(slide_value = sum(.x$value) + c(0, 0, 0))), + epi_slide(test_data, .f = ~ data.frame(slide_value = sum(.x$value) + c(0, 0, 0)), .window_size = 7), class = "epiprocess__invalid_slide_comp_value" ) }) @@ -505,7 +545,7 @@ test_that("epi_slide can use {nm} :=", { nm <- "slide_value" expect_identical( # unfortunately, we can't pass this directly as `f` and need an extra comma - test_data %>% epi_slide(, !!nm := sum(value), .window_size = 7), + epi_slide(test_data, , !!nm := sum(value), .window_size = 7), epi_slide_sum_test(test_data, .window_size = 7) ) }) @@ -700,14 +740,13 @@ test_that("`epi_slide_opt` errors when passed non-`data.table`, non-`slider` fun ) }) -multi_columns <- dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = test_date + 1:200, value = 1:200, value2 = -1:-200), - dplyr::tibble(geo_value = "al", time_value = test_date + 1:5, value = -(1:5), value2 = 1:5) -) %>% - as_epi_df() %>% - group_by(geo_value) - test_that("no dplyr warnings from selecting multiple columns", { + multi_columns <- dplyr::bind_rows( + dplyr::tibble(geo_value = "ak", time_value = test_date + 1:200, value = 1:200, value2 = -1:-200), + dplyr::tibble(geo_value = "al", time_value = test_date + 1:5, value = -(1:5), value2 = 1:5) + ) %>% + as_epi_df() %>% + group_by(geo_value) expect_no_warning( multi_slid <- epi_slide_mean(multi_columns, .col_names = c("value", "value2"), .window_size = 7) )