Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/dev' into lcb/fix-guess-period…
Browse files Browse the repository at this point in the history
…-datetimes
  • Loading branch information
brookslogan committed Jul 19, 2024
2 parents a5f397f + 69ea5e4 commit 8948868
Show file tree
Hide file tree
Showing 22 changed files with 404 additions and 78 deletions.
5 changes: 1 addition & 4 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Type: Package
Package: epiprocess
Title: Tools for basic signal processing in epidemiology
Version: 0.7.13
Version: 0.7.14
Authors@R: c(
person("Jacob", "Bien", role = "ctb"),
person("Logan", "Brooks", email = "[email protected]", role = c("aut", "cre")),
Expand Down Expand Up @@ -30,9 +30,6 @@ Imports:
cli,
data.table,
dplyr (>= 1.0.0),
fabletools,
feasts,
generics,
genlasso,
ggplot2,
lifecycle (>= 1.0.1),
Expand Down
5 changes: 5 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ export(epix_merge)
export(epix_slide)
export(epix_truncate_versions_after)
export(filter)
export(geo_column_names)
export(group_by)
export(group_modify)
export(growth_rate)
Expand All @@ -79,9 +80,11 @@ export(next_after)
export(relocate)
export(rename)
export(slice)
export(time_column_names)
export(ungroup)
export(unnest)
export(validate_epi_archive)
export(version_column_names)
importFrom(checkmate,anyInfinite)
importFrom(checkmate,anyMissing)
importFrom(checkmate,assert)
Expand All @@ -104,6 +107,7 @@ importFrom(checkmate,test_subset)
importFrom(checkmate,vname)
importFrom(cli,cat_line)
importFrom(cli,cli_abort)
importFrom(cli,cli_inform)
importFrom(cli,cli_vec)
importFrom(cli,cli_warn)
importFrom(cli,format_message)
Expand Down Expand Up @@ -190,6 +194,7 @@ importFrom(tibble,as_tibble)
importFrom(tibble,new_tibble)
importFrom(tibble,validate_tibble)
importFrom(tidyr,unnest)
importFrom(tidyselect,any_of)
importFrom(tidyselect,eval_select)
importFrom(tidyselect,starts_with)
importFrom(tsibble,as_tsibble)
Expand Down
13 changes: 13 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,11 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat

# epiprocess 0.8

## Breaking changes
- `detect_outlr_stl(seasonal_period = NULL)` is no longer accepted. Use
`detect_outlr_stl(seasonal_period = <value>, seasonal_as_residual = TRUE)`
instead. See `?detect_outlr_stl` for more details.

## Improvements

- `epi_slide` computations are now 2-4 times faster after changing how
Expand Down Expand Up @@ -35,6 +40,11 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat
- Improved documentation web site landing page's introduction.
- Fixed documentation referring to old `epi_slide()` interface (#466, thanks
@XuedaShen!).
- `as_epi_df` and `as_epi_archive` now support arguments to specify column names
e.g. `as_epi_df(some_tibble, geo_value=state)`. In addition, there is a list
of default conversions, see `time_column_names` for a list of columns that
will automatically be recognized and converted to `time_value` column (there
are similar functions for `geo` and `version`).
- Fixed bug where `epix_slide_ref_time_values_default()` on datetimes would
output a huge number of `ref_time_values` spaced apart by mere seconds.

Expand All @@ -45,6 +55,9 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat
- Added optional `decay_to_tibble` attribute controlling `as_tibble()` behavior
of `epi_df`s to let `{epipredict}` work more easily with other libraries (#471).

## Cleanup
- Removed some external package dependencies.

# epiprocess 0.7.0

## Breaking changes:
Expand Down
17 changes: 15 additions & 2 deletions R/archive.R
Original file line number Diff line number Diff line change
Expand Up @@ -442,6 +442,11 @@ validate_epi_archive <- function(

#' `as_epi_archive` converts a data frame, data table, or tibble into an
#' `epi_archive` object.
#' @param ... used for specifying column names, as in [`dplyr::rename`]. For
#' example `version = release_date`
#' @param .versions_end location based versions_end, used to avoid prefix
#' `version = issue` from being assigned to `versions_end` instead of being
#' used to rename columns.
#'
#' @rdname epi_archive
#'
Expand All @@ -454,11 +459,19 @@ as_epi_archive <- function(
additional_metadata = NULL,
compactify = NULL,
clobberable_versions_start = NULL,
versions_end = NULL) {
.versions_end = NULL, ...,
versions_end = .versions_end) {
assert_data_frame(x)
x <- rename(x, ...)
x <- guess_column_name(x, "time_value", time_column_names())
x <- guess_column_name(x, "geo_value", geo_column_names())
x <- guess_column_name(x, "version", version_column_names())
if (!test_subset(c("geo_value", "time_value", "version"), names(x))) {
cli_abort(
"Columns `geo_value`, `time_value`, and `version` must be present in `x`."
"Either columns `geo_value`, `time_value`, and `version`, or related columns
(see the internal functions `guess_time_column_name()`,
`guess_geo_column_name()` and/or `guess_geo_version_name()` for complete
list) must be present in `x`."
)
}
if (anyMissing(x$version)) {
Expand Down
35 changes: 25 additions & 10 deletions R/epi_df.R
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ NULL
#'
#' @export
new_epi_df <- function(x = tibble::tibble(), geo_type, time_type, as_of,
additional_metadata = list(), ...) {
additional_metadata = list()) {
assert_data_frame(x)
assert_list(additional_metadata)

Expand Down Expand Up @@ -162,6 +162,7 @@ new_epi_df <- function(x = tibble::tibble(), geo_type, time_type, as_of,
#' guide](https://cmu-delphi.github.io/epiprocess/articles/epiprocess.html) for
#' examples.
#'
#' @param ... Additional arguments passed to methods.
#' @template epi_df-params
#'
#' @export
Expand Down Expand Up @@ -249,25 +250,39 @@ as_epi_df.epi_df <- function(x, ...) {

#' @method as_epi_df tbl_df
#' @describeIn as_epi_df The input tibble `x` must contain the columns
#' `geo_value` and `time_value`. All other columns will be preserved as is,
#' and treated as measured variables. If `as_of` is missing, then the function
#' will try to guess it from an `as_of`, `issue`, or `version` column of `x`
#' (if any of these are present), or from as an `as_of` field in its metadata
#' (stored in its attributes); if this fails, then the current day-time will
#' be used.
#' `geo_value` and `time_value`, or column names that uniquely map onto these
#' (e.g. `date` or `province`). Alternatively, you can specify the conversion
#' explicitly (`time_value = someWeirdColumnName`). All other columns not
#' specified as `other_keys` will be preserved as is, and treated as measured
#' variables.
#'
#' If `as_of` is missing, then the function will try to guess it from an
#' `as_of`, `issue`, or `version` column of `x` (if any of these are present),
#' or from as an `as_of` field in its metadata (stored in its attributes); if
#' this fails, then the current day-time will be used.
#' @importFrom rlang .data
#' @importFrom tidyselect any_of
#' @importFrom cli cli_inform
#' @export
as_epi_df.tbl_df <- function(x, geo_type, time_type, as_of,
additional_metadata = list(), ...) {
additional_metadata = list(),
...) {
# possible standard substitutions for time_value
x <- rename(x, ...)
x <- guess_column_name(x, "time_value", time_column_names())
x <- guess_column_name(x, "geo_value", geo_column_names())
if (!test_subset(c("geo_value", "time_value"), names(x))) {
cli_abort(
"Columns `geo_value` and `time_value` must be present in `x`."
"Either columns `geo_value` and `time_value` or related columns
(see the internal functions `guess_time_column_name()` and/or
`guess_geo_column_name()` for a complete list)
must be present in `x`."
)
}

new_epi_df(
x, geo_type, time_type, as_of,
additional_metadata, ...
additional_metadata
)
}

Expand Down
93 changes: 59 additions & 34 deletions R/outliers.R
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,10 @@
#' args = list(list(
#' detect_negatives = TRUE,
#' detection_multiplier = 2.5,
#' seasonal_period = NULL
#' seasonal_period = 7,
#' seasonal_as_residual = TRUE
#' )),
#' abbr = "stl_nonseasonal"
#' abbr = "stl_reseasonal"
#' )
#' )
#'
Expand Down Expand Up @@ -216,18 +217,28 @@ detect_outlr_rm <- function(x = seq_along(y), y, n = 21,
#' @param n_trend Number of time steps to use in the rolling window for trend.
#' Default is 21.
#' @param n_seasonal Number of time steps to use in the rolling window for
#' seasonality. Default is 21.
#' seasonality. Default is 21. Can also be the string "periodic". See
#' `s.window` in [`stats::stl`].
#' @param n_threshold Number of time steps to use in rolling window for the IQR
#' outlier thresholds.
#' @param seasonal_period Integer specifying period of seasonality. For example,
#' for daily data, a period 7 means weekly seasonality. The default is `NULL`,
#' meaning that no seasonal term will be included in the STL decomposition.
#' @param seasonal_period Integer specifying period of "seasonality". For
#' example, for daily data, a period 7 means weekly seasonality. It must be
#' strictly larger than 1. Also impacts the size of the low-pass filter
#' window; see `l.window` in [`stats::stl`].
#' @param seasonal_as_residual Boolean specifying whether the seasonal(/weekly)
#' component should be treated as part of the residual component instead of as
#' part of the predictions. The default, FALSE, treats them as part of the
#' predictions, so large seasonal(/weekly) components will not lead to
#' flagging points as outliers. `TRUE` may instead consider the extrema of
#' large seasonal variations to be outliers; `n_seasonal` and
#' `seasonal_period` will still have an impact on the result, though, by
#' impacting the estimation of the trend component.
#' @template outlier-detection-options
#' @template detect-outlr-return
#'
#' @details The STL decomposition is computed using the `feasts` package. Once
#' @details The STL decomposition is computed using [`stats::stl()`]. Once
#' computed, the outlier detection method is analogous to the rolling median
#' method in `detect_outlr_rm()`, except with the fitted values and residuals
#' method in [`detect_outlr_rm()`], except with the fitted values and residuals
#' from the STL decomposition taking the place of the rolling median and
#' residuals to the rolling median, respectively.
#'
Expand All @@ -252,12 +263,34 @@ detect_outlr_stl <- function(x = seq_along(y), y,
n_trend = 21,
n_seasonal = 21,
n_threshold = 21,
seasonal_period = NULL,
seasonal_period,
seasonal_as_residual = FALSE,
log_transform = FALSE,
detect_negatives = FALSE,
detection_multiplier = 2,
min_radius = 0,
replacement_multiplier = 0) {
if (dplyr::n_distinct(x) != length(y)) {
cli_abort("`x` contains duplicate values. (If being run on a column in an
`epi_df`, did you group by relevant key variables?)")
}
if (length(y) <= 1L) {
cli_abort("`y` has length {length(y)}; that's definitely too little for
STL. (If being run in a `mutate()` or `epi_slide()`, check
whether you grouped by too many variables; you should not be
grouping by `time_value` in particular.)")
}
distinct_x_skips <- unique(diff(x))
if (diff(range(distinct_x_skips)) > 1e-4 * mean(distinct_x_skips)) {
cli_abort("`x` does not appear to have regular spacing; consider filling in
gaps with imputed values (STL does not allow NAs).")
}
if (is.unsorted(x)) { # <- for performance in common (sorted) case
o <- order(x)
x <- x[o]
y <- y[o]
}

# Transform if requested
if (log_transform) {
# Replace all negative values with 0
Expand All @@ -266,32 +299,22 @@ detect_outlr_stl <- function(x = seq_along(y), y,
y <- log(y + offset)
}

# Make a tsibble for fabletools, setup and run STL
z_tsibble <- tsibble::tsibble(x = x, y = y, index = x)

stl_formula <- y ~ trend(window = n_trend) +
season(period = seasonal_period, window = n_seasonal)
assert_int(seasonal_period, lower = 2L)
assert_logical(seasonal_as_residual, len = 1L, any.missing = FALSE)

stl_components <- z_tsibble %>%
fabletools::model(feasts::STL(stl_formula, robust = TRUE)) %>%
generics::components() %>%
yts <- stats::ts(y, frequency = seasonal_period)
stl_comp <- stats::stl(yts,
t.window = n_trend, s.window = n_seasonal,
robust = TRUE
)$time.series %>%
tibble::as_tibble() %>%
dplyr::select(.data$trend:.data$remainder) %>% #
dplyr::rename_with(~"seasonal", tidyselect::starts_with("season")) %>%
dplyr::rename(resid = .data$remainder)

# Allocate the seasonal term from STL to either fitted or resid
if (!is.null(seasonal_period)) {
stl_components <- stl_components %>%
dplyr::mutate(
fitted = .data$trend + .data$seasonal
)
if (!seasonal_as_residual) {
stl_comp <- dplyr::mutate(stl_comp, fitted = .data$trend + .data$seasonal)
} else {
stl_components <- stl_components %>%
dplyr::mutate(
fitted = .data$trend,
resid = .data$seasonal + resid
)
stl_comp <- dplyr::mutate(stl_comp, fitted = .data$trend, resid = .data$seasonal + .data$resid)
}

# Detect negatives if requested
Expand All @@ -306,10 +329,7 @@ detect_outlr_stl <- function(x = seq_along(y), y,

# Calculate lower and upper thresholds and replacement value
z <- z %>%
dplyr::mutate(
fitted = stl_components$fitted,
resid = stl_components$resid
) %>%
dplyr::mutate(fitted = stl_comp$fitted, resid = stl_comp$resid) %>%
roll_iqr(
n = n_threshold,
detection_multiplier = detection_multiplier,
Expand Down Expand Up @@ -337,7 +357,12 @@ roll_iqr <- function(z, n, detection_multiplier, min_radius,
as_type <- as.numeric
}

epi_slide(z, roll_iqr = stats::IQR(resid), before = floor((n - 1) / 2), after = ceiling((n - 1) / 2)) %>%
z %>%
epi_slide(
roll_iqr = stats::IQR(resid),
before = floor((n - 1) / 2),
after = ceiling((n - 1) / 2)
) %>%
dplyr::mutate(
lower = pmax(
min_lower,
Expand Down
Loading

0 comments on commit 8948868

Please sign in to comment.