Merge remote-tracking branch 'upstream/dev' into lcb/fix-guess-period…

…-datetimes
cmu-delphi · Jul 19, 2024 · 8948868 · 8948868
2 parents a5f397f + 69ea5e4
commit 8948868
Show file tree

Hide file tree

Showing 22 changed files with 404 additions and 78 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: epiprocess
 Title: Tools for basic signal processing in epidemiology
-Version: 0.7.13
+Version: 0.7.14
 Authors@R: c(
     person("Jacob", "Bien", role = "ctb"),
     person("Logan", "Brooks", email = "[email protected]", role = c("aut", "cre")),
@@ -30,9 +30,6 @@ Imports:
     cli,
     data.table,
     dplyr (>= 1.0.0),
-    fabletools,
-    feasts,
-    generics,
     genlasso,
     ggplot2,
     lifecycle (>= 1.0.1),

diff --git a/NAMESPACE b/NAMESPACE
@@ -64,6 +64,7 @@ export(epix_merge)
 export(epix_slide)
 export(epix_truncate_versions_after)
 export(filter)
+export(geo_column_names)
 export(group_by)
 export(group_modify)
 export(growth_rate)
@@ -79,9 +80,11 @@ export(next_after)
 export(relocate)
 export(rename)
 export(slice)
+export(time_column_names)
 export(ungroup)
 export(unnest)
 export(validate_epi_archive)
+export(version_column_names)
 importFrom(checkmate,anyInfinite)
 importFrom(checkmate,anyMissing)
 importFrom(checkmate,assert)
@@ -104,6 +107,7 @@ importFrom(checkmate,test_subset)
 importFrom(checkmate,vname)
 importFrom(cli,cat_line)
 importFrom(cli,cli_abort)
+importFrom(cli,cli_inform)
 importFrom(cli,cli_vec)
 importFrom(cli,cli_warn)
 importFrom(cli,format_message)
@@ -190,6 +194,7 @@ importFrom(tibble,as_tibble)
 importFrom(tibble,new_tibble)
 importFrom(tibble,validate_tibble)
 importFrom(tidyr,unnest)
+importFrom(tidyselect,any_of)
 importFrom(tidyselect,eval_select)
 importFrom(tidyselect,starts_with)
 importFrom(tsibble,as_tsibble)

diff --git a/NEWS.md b/NEWS.md
@@ -4,6 +4,11 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat
 
 # epiprocess 0.8
 
+## Breaking changes
+- `detect_outlr_stl(seasonal_period = NULL)` is no longer accepted. Use
+  `detect_outlr_stl(seasonal_period = <value>, seasonal_as_residual = TRUE)`
+  instead.  See `?detect_outlr_stl` for more details.
+
 ## Improvements
 
 - `epi_slide` computations are now 2-4 times faster after changing how
@@ -35,6 +40,11 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat
 - Improved documentation web site landing page's introduction.
 - Fixed documentation referring to old `epi_slide()` interface (#466, thanks
   @XuedaShen!).
+- `as_epi_df` and `as_epi_archive` now support arguments to specify column names
+  e.g.  `as_epi_df(some_tibble, geo_value=state)`. In addition, there is a list
+  of default conversions, see `time_column_names`  for a list of columns that
+  will automatically be recognized and converted to  `time_value` column (there
+  are similar functions for `geo` and `version`).
 - Fixed bug where `epix_slide_ref_time_values_default()` on datetimes would
   output a huge number of `ref_time_values` spaced apart by mere seconds.
 
@@ -45,6 +55,9 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat
 - Added optional `decay_to_tibble` attribute controlling `as_tibble()` behavior
   of `epi_df`s to let `{epipredict}` work more easily with other libraries (#471).
 
+## Cleanup
+- Removed some external package dependencies.
+
 # epiprocess 0.7.0
 
 ## Breaking changes:

diff --git a/R/archive.R b/R/archive.R
@@ -442,6 +442,11 @@ validate_epi_archive <- function(
 
 #' `as_epi_archive` converts a data frame, data table, or tibble into an
 #' `epi_archive` object.
+#' @param ... used for specifying column names, as in [`dplyr::rename`]. For
+#'   example `version = release_date`
+#' @param .versions_end location based versions_end, used to avoid prefix
+#'   `version = issue` from being assigned to `versions_end` instead of being
+#'   used to rename columns.
 #'
 #' @rdname epi_archive
 #'
@@ -454,11 +459,19 @@ as_epi_archive <- function(
     additional_metadata = NULL,
     compactify = NULL,
     clobberable_versions_start = NULL,
-    versions_end = NULL) {
+    .versions_end = NULL, ...,
+    versions_end = .versions_end) {
   assert_data_frame(x)
+  x <- rename(x, ...)
+  x <- guess_column_name(x, "time_value", time_column_names())
+  x <- guess_column_name(x, "geo_value", geo_column_names())
+  x <- guess_column_name(x, "version", version_column_names())
   if (!test_subset(c("geo_value", "time_value", "version"), names(x))) {
     cli_abort(
-      "Columns `geo_value`, `time_value`, and `version` must be present in `x`."
+      "Either columns `geo_value`, `time_value`, and `version`, or related columns
+      (see the internal functions `guess_time_column_name()`,
+      `guess_geo_column_name()` and/or `guess_geo_version_name()` for complete
+      list) must be present in `x`."
     )
   }
   if (anyMissing(x$version)) {

diff --git a/R/epi_df.R b/R/epi_df.R
@@ -95,7 +95,7 @@ NULL
 #'
 #' @export
 new_epi_df <- function(x = tibble::tibble(), geo_type, time_type, as_of,
-                       additional_metadata = list(), ...) {
+                       additional_metadata = list()) {
   assert_data_frame(x)
   assert_list(additional_metadata)
 
@@ -162,6 +162,7 @@ new_epi_df <- function(x = tibble::tibble(), geo_type, time_type, as_of,
 #' guide](https://cmu-delphi.github.io/epiprocess/articles/epiprocess.html) for
 #' examples.
 #'
+#' @param ... Additional arguments passed to methods.
 #' @template epi_df-params
 #'
 #' @export
@@ -249,25 +250,39 @@ as_epi_df.epi_df <- function(x, ...) {
 
 #' @method as_epi_df tbl_df
 #' @describeIn as_epi_df The input tibble `x` must contain the columns
-#'   `geo_value` and `time_value`. All other columns will be preserved as is,
-#'   and treated as measured variables. If `as_of` is missing, then the function
-#'   will try to guess it from an `as_of`, `issue`, or `version` column of `x`
-#'   (if any of these are present), or from as an `as_of` field in its metadata
-#'   (stored in its attributes); if this fails, then the current day-time will
-#'   be used.
+#'   `geo_value` and `time_value`, or column names that uniquely map onto these
+#'   (e.g. `date` or `province`). Alternatively, you can specify the conversion
+#'   explicitly (`time_value = someWeirdColumnName`). All other columns not
+#'   specified as `other_keys` will be preserved as is, and treated as measured
+#'   variables.
+#'
+#'  If `as_of` is missing, then the function will try to guess it from an
+#'   `as_of`, `issue`, or `version` column of `x` (if any of these are present),
+#'   or from as an `as_of` field in its metadata (stored in its attributes); if
+#'   this fails, then the current day-time will be used.
 #' @importFrom rlang .data
+#' @importFrom tidyselect any_of
+#' @importFrom cli cli_inform
 #' @export
 as_epi_df.tbl_df <- function(x, geo_type, time_type, as_of,
-                             additional_metadata = list(), ...) {
+                             additional_metadata = list(),
+                             ...) {
+  # possible standard substitutions for time_value
+  x <- rename(x, ...)
+  x <- guess_column_name(x, "time_value", time_column_names())
+  x <- guess_column_name(x, "geo_value", geo_column_names())
   if (!test_subset(c("geo_value", "time_value"), names(x))) {
     cli_abort(
-      "Columns `geo_value` and `time_value` must be present in `x`."
+      "Either columns `geo_value` and `time_value` or related columns
+       (see the internal functions `guess_time_column_name()` and/or
+       `guess_geo_column_name()` for a complete list)
+       must be present in `x`."
     )
   }
 
   new_epi_df(
     x, geo_type, time_type, as_of,
-    additional_metadata, ...
+    additional_metadata
   )
 }
 

diff --git a/R/outliers.R b/R/outliers.R
@@ -64,9 +64,10 @@
 #'     args = list(list(
 #'       detect_negatives = TRUE,
 #'       detection_multiplier = 2.5,
-#'       seasonal_period = NULL
+#'       seasonal_period = 7,
+#'       seasonal_as_residual = TRUE
 #'     )),
-#'     abbr = "stl_nonseasonal"
+#'     abbr = "stl_reseasonal"
 #'   )
 #' )
 #'
@@ -216,18 +217,28 @@ detect_outlr_rm <- function(x = seq_along(y), y, n = 21,
 #' @param n_trend Number of time steps to use in the rolling window for trend.
 #'   Default is 21.
 #' @param n_seasonal Number of time steps to use in the rolling window for
-#'   seasonality. Default is 21.
+#'   seasonality. Default is 21. Can also be the string "periodic". See
+#'   `s.window` in [`stats::stl`].
 #' @param n_threshold Number of time steps to use in rolling window for the IQR
 #'   outlier thresholds.
-#' @param seasonal_period Integer specifying period of seasonality. For example,
-#'   for daily data, a period 7 means weekly seasonality. The default is `NULL`,
-#'   meaning that no seasonal term will be included in the STL decomposition.
+#' @param seasonal_period Integer specifying period of "seasonality". For
+#'   example, for daily data, a period 7 means weekly seasonality. It must be
+#'   strictly larger than 1. Also impacts the size of the low-pass filter
+#'   window; see `l.window` in [`stats::stl`].
+#' @param seasonal_as_residual Boolean specifying whether the seasonal(/weekly)
+#'   component should be treated as part of the residual component instead of as
+#'   part of the predictions. The default, FALSE, treats them as part of the
+#'   predictions, so large seasonal(/weekly) components will not lead to
+#'   flagging points as outliers. `TRUE` may instead consider the extrema of
+#'   large seasonal variations to be outliers; `n_seasonal` and
+#'   `seasonal_period` will still have an impact on the result, though, by
+#'   impacting the estimation of the trend component.
 #' @template outlier-detection-options
 #' @template detect-outlr-return
 #'
-#' @details The STL decomposition is computed using the `feasts` package. Once
+#' @details The STL decomposition is computed using [`stats::stl()`]. Once
 #'   computed, the outlier detection method is analogous to the rolling median
-#'   method in `detect_outlr_rm()`, except with the fitted values and residuals
+#'   method in [`detect_outlr_rm()`], except with the fitted values and residuals
 #'   from the STL decomposition taking the place of the rolling median and
 #'   residuals to the rolling median, respectively.
 #'
@@ -252,12 +263,34 @@ detect_outlr_stl <- function(x = seq_along(y), y,
                              n_trend = 21,
                              n_seasonal = 21,
                              n_threshold = 21,
-                             seasonal_period = NULL,
+                             seasonal_period,
+                             seasonal_as_residual = FALSE,
                              log_transform = FALSE,
                              detect_negatives = FALSE,
                              detection_multiplier = 2,
                              min_radius = 0,
                              replacement_multiplier = 0) {
+  if (dplyr::n_distinct(x) != length(y)) {
+    cli_abort("`x` contains duplicate values. (If being run on a column in an
+               `epi_df`, did you group by relevant key variables?)")
+  }
+  if (length(y) <= 1L) {
+    cli_abort("`y` has length {length(y)}; that's definitely too little for
+               STL.  (If being run in a `mutate()` or `epi_slide()`, check
+               whether you grouped by too many variables; you should not be
+               grouping by `time_value` in particular.)")
+  }
+  distinct_x_skips <- unique(diff(x))
+  if (diff(range(distinct_x_skips)) > 1e-4 * mean(distinct_x_skips)) {
+    cli_abort("`x` does not appear to have regular spacing; consider filling in
+               gaps with imputed values (STL does not allow NAs).")
+  }
+  if (is.unsorted(x)) { # <- for performance in common (sorted) case
+    o <- order(x)
+    x <- x[o]
+    y <- y[o]
+  }
+
   # Transform if requested
   if (log_transform) {
     # Replace all negative values with 0
@@ -266,32 +299,22 @@ detect_outlr_stl <- function(x = seq_along(y), y,
     y <- log(y + offset)
   }
 
-  # Make a tsibble for fabletools, setup and run STL
-  z_tsibble <- tsibble::tsibble(x = x, y = y, index = x)
-
-  stl_formula <- y ~ trend(window = n_trend) +
-    season(period = seasonal_period, window = n_seasonal)
+  assert_int(seasonal_period, lower = 2L)
+  assert_logical(seasonal_as_residual, len = 1L, any.missing = FALSE)
 
-  stl_components <- z_tsibble %>%
-    fabletools::model(feasts::STL(stl_formula, robust = TRUE)) %>%
-    generics::components() %>%
+  yts <- stats::ts(y, frequency = seasonal_period)
+  stl_comp <- stats::stl(yts,
+    t.window = n_trend, s.window = n_seasonal,
+    robust = TRUE
+  )$time.series %>%
     tibble::as_tibble() %>%
-    dplyr::select(.data$trend:.data$remainder) %>% #
-    dplyr::rename_with(~"seasonal", tidyselect::starts_with("season")) %>%
     dplyr::rename(resid = .data$remainder)
 
   # Allocate the seasonal term from STL to either fitted or resid
-  if (!is.null(seasonal_period)) {
-    stl_components <- stl_components %>%
-      dplyr::mutate(
-        fitted = .data$trend + .data$seasonal
-      )
+  if (!seasonal_as_residual) {
+    stl_comp <- dplyr::mutate(stl_comp, fitted = .data$trend + .data$seasonal)
   } else {
-    stl_components <- stl_components %>%
-      dplyr::mutate(
-        fitted = .data$trend,
-        resid = .data$seasonal + resid
-      )
+    stl_comp <- dplyr::mutate(stl_comp, fitted = .data$trend, resid = .data$seasonal + .data$resid)
   }
 
   # Detect negatives if requested
@@ -306,10 +329,7 @@ detect_outlr_stl <- function(x = seq_along(y), y,
 
   # Calculate lower and upper thresholds and replacement value
   z <- z %>%
-    dplyr::mutate(
-      fitted = stl_components$fitted,
-      resid = stl_components$resid
-    ) %>%
+    dplyr::mutate(fitted = stl_comp$fitted, resid = stl_comp$resid) %>%
     roll_iqr(
       n = n_threshold,
       detection_multiplier = detection_multiplier,
@@ -337,7 +357,12 @@ roll_iqr <- function(z, n, detection_multiplier, min_radius,
     as_type <- as.numeric
   }
 
-  epi_slide(z, roll_iqr = stats::IQR(resid), before = floor((n - 1) / 2), after = ceiling((n - 1) / 2)) %>%
+  z %>%
+    epi_slide(
+      roll_iqr = stats::IQR(resid),
+      before = floor((n - 1) / 2),
+      after = ceiling((n - 1) / 2)
+    ) %>%
     dplyr::mutate(
       lower = pmax(
         min_lower,