From 92ea3e3036d84c3560c81f4625d8029616e62c22 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Thu, 5 Sep 2024 11:43:05 -0500 Subject: [PATCH] step locf tests passing, grf pkgdown --- NAMESPACE | 3 ++ R/get_test_data.R | 2 +- R/step_adjust_latency.R | 2 +- R/utils-latency.R | 23 +++----------- _pkgdown.yml | 1 + man/get_grouping_columns.Rd | 4 +++ tests/testthat/_snaps/snapshots.md | 26 ++++++++++++++++ tests/testthat/test-pad_to_end.R | 37 ----------------------- tests/testthat/test-snapshots.R | 13 ++++++++ tests/testthat/test-step_adjust_latency.R | 20 ++++++------ tests/testthat/test-utils_latency.R | 2 +- 11 files changed, 64 insertions(+), 69 deletions(-) delete mode 100644 tests/testthat/test-pad_to_end.R diff --git a/NAMESPACE b/NAMESPACE index 4840c2568..0900d3594 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -302,6 +302,7 @@ importFrom(stats,family) importFrom(stats,lm) importFrom(stats,median) importFrom(stats,model.frame) +importFrom(stats,na.omit) importFrom(stats,poly) importFrom(stats,predict) importFrom(stats,qnorm) @@ -315,6 +316,8 @@ importFrom(tidyr,expand_grid) importFrom(tidyr,fill) importFrom(tidyr,unnest) importFrom(tidyselect,all_of) +importFrom(utils,capture.output) +importFrom(utils,head) importFrom(vctrs,as_list_of) importFrom(vctrs,field) importFrom(vctrs,new_rcrd) diff --git a/R/get_test_data.R b/R/get_test_data.R index f1d83aad0..5e74da5a1 100644 --- a/R/get_test_data.R +++ b/R/get_test_data.R @@ -25,8 +25,8 @@ #' step_epi_lag(case_rate, lag = c(0, 7, 14)) #' get_test_data(recipe = rec, x = case_death_rate_subset) #' @importFrom rlang %@% +#' @importFrom stats na.omit #' @export - get_test_data <- function(recipe, x) { if (!is_epi_df(x)) cli::cli_abort("`x` must be an `epi_df`.") diff --git a/R/step_adjust_latency.R b/R/step_adjust_latency.R index cdef74565..1275c3948 100644 --- a/R/step_adjust_latency.R +++ b/R/step_adjust_latency.R @@ -302,7 +302,7 @@ bake.step_adjust_latency <- function(object, new_data, ...) { rel_keys <- setdiff(key_colnames(new_data), "time_value") unnamed_columns <- object$columns %>% unname() new_data <- new_data %>% - pad_to_end(rel_keys, object$forecast_date) %>% + pad_to_end(rel_keys, object$forecast_date, unnamed_columns) %>% # group_by_at(rel_keys) %>% arrange(time_value) %>% as_tibble() %>% diff --git a/R/utils-latency.R b/R/utils-latency.R index b32edf7b5..35db0b484 100644 --- a/R/utils-latency.R +++ b/R/utils-latency.R @@ -33,6 +33,7 @@ construct_shift_tibble <- function(terms_used, recipe, rel_step_type, shift_name #' @keywords internal #' @importFrom dplyr select #' @importFrom tidyr drop_na +#' @importFrom utils capture.output set_forecast_date <- function(new_data, info, epi_keys_checked, latency) { original_columns <- info %>% filter(source == "original") %>% @@ -161,25 +162,6 @@ get_forecast_date_in_layer <- function(this_recipe, workflow_max_time_value, new } -fill_locf <- function(x, forecast_date) { - cannot_be_used <- x %>% - dplyr::filter(forecast_date - time_value <= n_recent) %>% - dplyr::mutate(fillers = forecast_date - time_value > keep) %>% - dplyr::summarise( - dplyr::across( - -tidyselect::any_of(key_colnames(recipe)), - ~ all(is.na(.x[fillers])) & is.na(head(.x[!fillers], 1)) - ), - .groups = "drop" - ) %>% - dplyr::select(-fillers) %>% - dplyr::summarise(dplyr::across( - -tidyselect::any_of(key_colnames(recipe)), ~ any(.x) - )) %>% - unlist() - x <- tidyr::fill(x, !time_value) -} - #' pad every group at the right interval #' @description #' Perform last observation carried forward on a group by group basis. 
It uses @@ -226,6 +208,9 @@ pad_to_end <- function(x, groups, end_date, columns_to_complete = NULL) { } #' return the names of the grouped columns, or `NULL` +#' @param x an epi_df +#' @keywords internal +#' @importFrom utils head get_grouping_columns <- function(x) { group_names <- names(attributes(x)$groups) head(group_names, -1) diff --git a/_pkgdown.yml b/_pkgdown.yml index c6df4c82d..468da62ac 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -76,6 +76,7 @@ reference: contents: - quantile_reg - smooth_quantile_reg + - grf_quantiles - title: Custom panel data forecasting workflows contents: - epi_recipe diff --git a/man/get_grouping_columns.Rd b/man/get_grouping_columns.Rd index 6b653628d..f8b61af42 100644 --- a/man/get_grouping_columns.Rd +++ b/man/get_grouping_columns.Rd @@ -6,6 +6,10 @@ \usage{ get_grouping_columns(x) } +\arguments{ +\item{x}{an epi_df} +} \description{ return the names of the grouped columns, or \code{NULL} } +\keyword{internal} diff --git a/tests/testthat/_snaps/snapshots.md b/tests/testthat/_snaps/snapshots.md index 6c439dbd8..6b337c056 100644 --- a/tests/testthat/_snaps/snapshots.md +++ b/tests/testthat/_snaps/snapshots.md @@ -1057,6 +1057,32 @@ 18998, 18998, 18998, 18998, 18998), class = "Date")), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame")) +--- + + structure(list(geo_value = c("ca", "fl", "ga", "ny", "pa", "tx" + ), .pred = c(0.303244704017743, 0.531332853311082, 0.588827944685979, + 0.988690249216229, 0.794801997001639, 0.306895457225321), .pred_distn = structure(list( + structure(list(values = c(0.136509784083987, 0.469979623951498 + ), quantile_levels = c(0.05, 0.95)), class = c("dist_quantiles", + "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( + values = c(0.364597933377326, 0.698067773244837), quantile_levels = c(0.05, + 0.95)), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", + "vctrs_vctr")), structure(list(values = c(0.422093024752224, + 0.755562864619735), quantile_levels = c(0.05, 0.95)), class = c("dist_quantiles", + "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( + values = c(0.821955329282474, 1.15542516914998), quantile_levels = c(0.05, + 0.95)), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", + "vctrs_vctr")), structure(list(values = c(0.628067077067883, + 0.961536916935394), quantile_levels = c(0.05, 0.95)), class = c("dist_quantiles", + "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( + values = c(0.140160537291566, 0.473630377159077), quantile_levels = c(0.05, + 0.95)), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", + "vctrs_vctr"))), class = c("distribution", "vctrs_vctr", + "list")), forecast_date = structure(c(18997, 18997, 18997, 18997, + 18997, 18997), class = "Date"), target_date = structure(c(18998, + 18998, 18998, 18998, 18998, 18998), class = "Date")), row.names = c(NA, + -6L), class = c("tbl_df", "tbl", "data.frame")) + # arx_forecaster output format snapshots Code diff --git a/tests/testthat/test-pad_to_end.R b/tests/testthat/test-pad_to_end.R deleted file mode 100644 index 0ea6244b0..000000000 --- a/tests/testthat/test-pad_to_end.R +++ /dev/null @@ -1,37 +0,0 @@ -test_that("test set padding works", { - dat <- tibble::tibble( - gr1 = rep(c("a", "b"), times = c(3, 4)), - time_value = c(1:3, 1:4), - value = 1:7 - ) %>% arrange(time_value, gr1) - expect_identical(pad_to_end(dat, "gr1", 3), dat) - expect_equal(nrow(pad_to_end(dat, "gr1", 4)), 8L) - p <- pad_to_end(dat, "gr1", 5) - expect_equal(nrow(p), 10L) - expect_identical(p$gr1, rep(c("a", 
"b"), times = 5)) - expect_identical(p$time_value, rep(1:5, each = 2)) - expect_identical(p$value, as.integer(c(1, 4, 2, 5, 3, 6, NA, 7, NA, NA))) - - dat <- dat %>% arrange(gr1) - dat$gr2 <- c("c", "c", "d", "c", "c", "d", "d") - dat <- dat %>% arrange(time_value) - # don't treat it as a group - p <- pad_to_end(dat, "gr1", 4) - expect_identical(nrow(p), 8L) - expect_identical(p$gr2, c(rep("c", 4), "d", "d", NA, "d")) - - # treat it as a group (needs different time_value) - dat$time_value <- c(1, 1, 2, 2, 1, 1, 2) # double - p <- pad_to_end(dat, c("gr1", "gr2"), 2) - expect_equal(nrow(p), 8L) - expect_identical(p$gr1, rep(c("a", "a", "b", "b"), times = 2)) - expect_identical(p$gr2, rep(c("c", "d"), times = 4)) - expect_identical(p$time_value, rep(c(1, 2), each = 4)) - expect_identical(p$value, as.integer(c(1, 3, 4, 6, 2, NA, 5, 7))) - - # make sure it maintains the epi_df - dat <- dat %>% - dplyr::rename(geo_value = gr1) %>% - as_epi_df() - expect_s3_class(pad_to_end(dat, "geo_value", 2), "epi_df") -}) diff --git a/tests/testthat/test-snapshots.R b/tests/testthat/test-snapshots.R index 7f1d46006..003fc8319 100644 --- a/tests/testthat/test-snapshots.R +++ b/tests/testthat/test-snapshots.R @@ -94,6 +94,19 @@ test_that("arx_forecaster snapshots", { ) # not the same predictions expect_false(all(arx2$predictions == arx3$predictions)) + + + arx4 <- arx_forecaster( + train_data, + "death_rate_7d_av", + c("death_rate_7d_av", "case_rate_7d_av"), + args_list = arx_args_list( + ahead = 1L, + adjust_latency = "locf" + ) + ) + # consistency check + expect_snapshot_tibble(arx3$predictions) }) test_that("arx_forecaster output format snapshots", { diff --git a/tests/testthat/test-step_adjust_latency.R b/tests/testthat/test-step_adjust_latency.R index a557fc146..75656989a 100644 --- a/tests/testthat/test-step_adjust_latency.R +++ b/tests/testthat/test-step_adjust_latency.R @@ -186,14 +186,14 @@ test_that("epi_adjust_latency correctly locfs", { last_dates, tribble( ~name, ~last_date, - "lag_11_death_rate", max_time + 11, - "lag_6_death_rate", max_time + 6, - "lag_5_case_rate", max_time + 5, - "lag_1_case_rate", max_time + 1, - "case_rate", max_time, - "death_rate", max_time, - "lag_0_death_rate", max_time + 0, - "ahead_7_death_rate", max_time - 7, + "lag_11_death_rate", max_time + 16, + "lag_6_death_rate", max_time + 11, + "lag_5_case_rate", max_time + 10, + "lag_1_case_rate", max_time + 6, + "case_rate", max_time + 5, + "death_rate", max_time + 5, + "lag_0_death_rate", max_time + 5, + "ahead_7_death_rate", max_time - 2, ) ) # we expect a 5-fold repetition of the last values found in the original @@ -204,7 +204,7 @@ test_that("epi_adjust_latency correctly locfs", { slice_tail() %>% ungroup() %>% select(case_rate, death_rate) %>% - uncount(5) + tidyr::uncount(5) # pulling just the region between the last day and the prediction day filled_values <- baked_x %>% @@ -450,7 +450,7 @@ test_that("`step_adjust_latency` only uses the columns specified in the `...`", summarise(last_date = max(time_value)) %>% arrange(desc(last_date)) %>% mutate(locf_date = last_date - latency) - # iterate over all columns and make sure the latent time period has the exact same values + # iterate over all columns and make sure the latent time period has the exact same values (so the variance is zero) for (ii in seq(nrow(last_dates))) { baked_var <- baked_x %>% filter(last_dates[[ii, "locf_date"]] <= time_value, time_value <= last_dates[[ii, "last_date"]]) %>% diff --git a/tests/testthat/test-utils_latency.R 
b/tests/testthat/test-utils_latency.R index a23be628b..78f294564 100644 --- a/tests/testthat/test-utils_latency.R +++ b/tests/testthat/test-utils_latency.R @@ -149,7 +149,7 @@ test_that("pad_to_end handles weeks", { ), a = 3, b = .9 ) - ) %>% arrange(time_value, geo_value) + ) %>% arrange(geo_value, time_value) ) }) # todo case where somehow columns of different roles are selected
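
For context, a minimal sketch of the "locf" latency adjustment that the new `arx4` test in this patch exercises. It is written against the `arx_forecaster()` / `arx_args_list()` interface and the `case_death_rate_subset` dataset that already appear in the diff (the `death_rate` / `case_rate` column names come from the same hunks); treat it as an illustrative usage under those assumptions, not part of the committed code, and expect the exact predictions to depend on the installed epipredict version:

    library(epipredict)
    # adjust_latency = "locf" pads each geo_value forward to the forecast date
    # (last observation carried forward) before lags and aheads are computed,
    # mirroring the arx4 call added to tests/testthat/test-snapshots.R
    fcast <- arx_forecaster(
      case_death_rate_subset,
      "death_rate",
      c("death_rate", "case_rate"),
      args_list = arx_args_list(
        ahead = 1L,
        adjust_latency = "locf"
      )
    )
    fcast$predictions

Per the `bake.step_adjust_latency()` hunk above, the forward-filling itself is delegated to `pad_to_end()` (now taking a `columns_to_complete` argument), which is why the dedicated test-pad_to_end.R file could be dropped in favor of the latency-utility tests.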