diff --git a/.Rbuildignore b/.Rbuildignore index 0582014a6..cb0b7ed2b 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -16,3 +16,5 @@ ^.lintr$ ^DEVELOPMENT.md$ man-roxygen +^.venv$ +^sandbox.R$ \ No newline at end of file diff --git a/.gitignore b/.gitignore index de393a316..8dc001be4 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,4 @@ docs renv/ renv.lock .Rprofile +sandbox.R \ No newline at end of file diff --git a/DESCRIPTION b/DESCRIPTION index 333bf13cd..790b36a54 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -50,7 +50,8 @@ Imports: tidyselect (>= 1.2.0), tsibble, utils, - vctrs + vctrs, + waldo Suggests: covidcast, devtools, diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index be34211b8..0304d9a60 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -731,7 +731,7 @@ epix_detailed_restricted_mutate <- function(.data, ...) { #' library(dplyr) #' #' # Reference time points for which we want to compute slide values: -#' versions <- seq(as.Date("2020-06-01"), +#' versions <- seq(as.Date("2020-06-02"), #' as.Date("2020-06-15"), #' by = "1 day" #' ) @@ -780,7 +780,7 @@ epix_detailed_restricted_mutate <- function(.data, ...) { #' .versions = versions #' ) %>% #' ungroup() %>% -#' arrange(geo_value, time_value) +#' arrange(geo_value, version) #' #' # --- Advanced: --- #' diff --git a/R/methods-epi_df.R b/R/methods-epi_df.R index c8582239e..901b9b32e 100644 --- a/R/methods-epi_df.R +++ b/R/methods-epi_df.R @@ -427,6 +427,10 @@ arrange_col_canonical.epi_df <- function(x, ...) { x %>% dplyr::relocate(dplyr::all_of(cols), .before = 1) } +#' Group an `epi_df` object by default keys +#' @param x an `epi_df` +#' @param exclude character vector of column names to exclude from grouping +#' @return a grouped `epi_df` #' @export group_epi_df <- function(x, exclude = character()) { cols <- key_colnames(x, exclude = exclude) @@ -440,7 +444,7 @@ group_epi_df <- function(x, exclude = character()) { #' the resulting `epi_df` will have `geo_value` set to `"total"`. #' #' @param .x an `epi_df` -#' @param value_col character vector of the columns to aggregate +#' @param sum_cols character vector of the columns to aggregate #' @param group_cols character vector of column names to group by. "time_value" is #' included by default. 
#' @return an `epi_df` object diff --git a/R/outliers.R b/R/outliers.R index 8be492ddf..c2187de0a 100644 --- a/R/outliers.R +++ b/R/outliers.R @@ -161,8 +161,7 @@ detect_outlr <- function(x = seq_along(y), y, #' group_by(geo_value) %>% #' mutate(outlier_info = detect_outlr_rm( #' x = time_value, y = cases -#' )) %>% -#' unnest(outlier_info) +#' )) detect_outlr_rm <- function(x = seq_along(y), y, n = 21, log_transform = FALSE, detect_negatives = FALSE, @@ -189,7 +188,7 @@ detect_outlr_rm <- function(x = seq_along(y), y, n = 21, # Calculate lower and upper thresholds and replacement value z <- z %>% - epi_slide(fitted = median(y), .window_size = n, .align = "center") %>% + epi_slide(fitted = median(y, na.rm = TRUE), .window_size = n, .align = "center") %>% dplyr::mutate(resid = y - fitted) %>% roll_iqr( n = n, @@ -256,9 +255,8 @@ detect_outlr_rm <- function(x = seq_along(y), y, n = 21, #' group_by(geo_value) %>% #' mutate(outlier_info = detect_outlr_stl( #' x = time_value, y = cases, -#' seasonal_period = 7 -#' )) %>% # weekly seasonality for daily data -#' unnest(outlier_info) +#' seasonal_period = 7 # weekly seasonality for daily data +#' )) detect_outlr_stl <- function(x = seq_along(y), y, n_trend = 21, n_seasonal = 21, @@ -359,7 +357,7 @@ roll_iqr <- function(z, n, detection_multiplier, min_radius, z %>% epi_slide( - roll_iqr = stats::IQR(resid), + roll_iqr = stats::IQR(resid, na.rm = TRUE), .window_size = n, .align = "center" ) %>% dplyr::mutate( diff --git a/R/revision_analysis.R b/R/revision_analysis.R index be83d68c6..7be0cd248 100644 --- a/R/revision_analysis.R +++ b/R/revision_analysis.R @@ -81,8 +81,8 @@ revision_summary <- function(epi_arch, should_compactify = TRUE) { arg <- names(eval_select(rlang::expr(c(...)), allow_rename = FALSE, data = epi_arch$DT)) if (length(arg) == 0) { - first_non_key <- !(names(epi_arch$DT) %in% c(key_colnames(epi_arch), "version")) - arg <- names(epi_arch$DT)[first_non_key][1] + # Choose the first column that's not a key or version + arg <- setdiff(names(epi_arch$DT), c(key_colnames(epi_arch), "version"))[[1]] } else if (length(arg) > 1) { cli_abort("Not currently implementing more than one column at a time. Run each separately") } @@ -99,11 +99,9 @@ revision_summary <- function(epi_arch, # # revision_tibble keys <- key_colnames(epi_arch) - names(epi_arch$DT) - revision_behavior <- - epi_arch$DT %>% - select(c(geo_value, time_value, all_of(keys), version, !!arg)) + revision_behavior <- epi_arch$DT %>% + select(all_of(unique(c("geo_value", "time_value", keys, "version", arg)))) if (!is.null(min_waiting_period)) { revision_behavior <- revision_behavior %>% filter(abs(time_value - as.Date(epi_arch$versions_end)) >= min_waiting_period) diff --git a/R/slide.R b/R/slide.R index 50c8acf42..5a7fbd6aa 100644 --- a/R/slide.R +++ b/R/slide.R @@ -728,7 +728,7 @@ epi_slide_opt <- function( # positions of user-provided `col_names` into string column names. We avoid # using `names(pos)` directly for robustness and in case we later want to # allow users to rename fields via tidyselection. 
- if (class(quo_get_expr(enquo(.col_names))) == "character") { + if (inherits(quo_get_expr(enquo(.col_names)), "character")) { pos <- eval_select(dplyr::all_of(.col_names), data = .x, allow_rename = FALSE) } else { pos <- eval_select(enquo(.col_names), data = .x, allow_rename = FALSE) diff --git a/_pkgdown.yml b/_pkgdown.yml index 62f006fec..1bc7f795d 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -48,7 +48,6 @@ articles: - aggregation - outliers - archive - - advanced - compactify repo: diff --git a/man-roxygen/basic-slide-details.R b/man-roxygen/basic-slide-details.R index 64570976f..12bc1f4cb 100644 --- a/man-roxygen/basic-slide-details.R +++ b/man-roxygen/basic-slide-details.R @@ -9,7 +9,7 @@ #' boundary of the dataset) and will attempt to perform the computation #' anyway. The issue of what to do with partial computations (those run on #' incomplete windows) is therefore left up to the user, either through the -#' specified function or formula `f`, or through post-processing. +#' specified function or formula, or through post-processing. #' #' Let's look at some window examples, assuming that the reference time value #' is "tv". With .align = "right" and .window_size = 3, the window will be: diff --git a/man/detect_outlr_rm.Rd b/man/detect_outlr_rm.Rd index 333c4a7b5..b57c44450 100644 --- a/man/detect_outlr_rm.Rd +++ b/man/detect_outlr_rm.Rd @@ -65,6 +65,5 @@ incidence_num_outlier_example \%>\% group_by(geo_value) \%>\% mutate(outlier_info = detect_outlr_rm( x = time_value, y = cases - )) \%>\% - unnest(outlier_info) + )) } diff --git a/man/detect_outlr_stl.Rd b/man/detect_outlr_stl.Rd index 695c2de78..fb69e8da3 100644 --- a/man/detect_outlr_stl.Rd +++ b/man/detect_outlr_stl.Rd @@ -96,7 +96,6 @@ incidence_num_outlier_example \%>\% group_by(geo_value) \%>\% mutate(outlier_info = detect_outlr_stl( x = time_value, y = cases, - seasonal_period = 7 - )) \%>\% # weekly seasonality for daily data - unnest(outlier_info) + seasonal_period = 7 # weekly seasonality for daily data + )) } diff --git a/man/epi_slide.Rd b/man/epi_slide.Rd index 323fdf4d0..1c87a4d2a 100644 --- a/man/epi_slide.Rd +++ b/man/epi_slide.Rd @@ -109,7 +109,7 @@ keep NAs around. boundary of the dataset) and will attempt to perform the computation anyway. The issue of what to do with partial computations (those run on incomplete windows) is therefore left up to the user, either through the -specified function or formula \code{f}, or through post-processing. +specified function or formula, or through post-processing. Let's look at some window examples, assuming that the reference time value is "tv". With .align = "right" and .window_size = 3, the window will be: diff --git a/man/epix_slide.Rd b/man/epix_slide.Rd index 1f3018460..1326cc185 100644 --- a/man/epix_slide.Rd +++ b/man/epix_slide.Rd @@ -170,7 +170,7 @@ necessary (as it its purpose). 
library(dplyr) # Reference time points for which we want to compute slide values: -versions <- seq(as.Date("2020-06-01"), +versions <- seq(as.Date("2020-06-02"), as.Date("2020-06-15"), by = "1 day" ) @@ -219,7 +219,7 @@ archive_cases_dv_subset \%>\% .versions = versions ) \%>\% ungroup() \%>\% - arrange(geo_value, time_value) + arrange(geo_value, version) # --- Advanced: --- diff --git a/vignettes/advanced.Rmd b/vignettes/advanced.Rmd deleted file mode 100644 index 65f9ce05c..000000000 --- a/vignettes/advanced.Rmd +++ /dev/null @@ -1,488 +0,0 @@ ---- -title: Advanced sliding with nonstandard outputs -output: rmarkdown::html_vignette -vignette: > - %\VignetteIndexEntry{Advanced sliding with nonstandard outputs} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -In this vignette, we discuss how to use the sliding functionality in the -`epiprocess` package with less common grouping schemes or with computations that -have advanced output structures. The output of a slide computation should either -be an atomic value/vector, or a data frame. This data frame can have multiple -columns, multiple rows, or both. - -During basic usage (e.g., when all optional arguments are set to their defaults): - -* `epi_slide(edf, , .....)`: - * keeps **all** columns of `edf`, adds computed column(s) - * outputs **one row per row in `edf`** (recycling outputs from - computations appropriately if there are multiple time series bundled - together inside any group(s)) - * maintains the grouping or ungroupedness of `edf` - * is roughly analogous to (the non-sliding) **`dplyr::mutate` followed by - `dplyr::arrange(time_value, .by_group = TRUE)`** - * outputs an **`epi_df`** if the required columns are present, otherwise a - tibble -* `epix_slide(ea, , .....)`: - * keeps **grouping and `time_value`** columns of `ea`, adds computed - column(s) - * outputs **any number of rows** (computations are allowed to output any - number of elements/rows, and no recycling is performed) - * maintains the grouping or ungroupedness of `ea`, unless it was explicitly - grouped by zero variables; this isn't supported by `grouped_df` and it will - automatically turn into an ungrouped tibble - * is roughly analogous to (the non-sliding) **`dplyr::group_modify`** - * outputs a **tibble** - -These differences in basic behavior make some common slide operations require less boilerplate: - -* predictors and targets calculated with `epi_slide` are automatically lined up - with each other and with the signals from which they were calculated; and -* computations for an `epix_slide` can output data frames with any number of - rows, containing models, forecasts, evaluations, etc., and will not be - recycled. - -When using more advanced features, more complex rules apply: - -* Generalization: `epi_slide(edf, ....., .ref_time_values=my_ref_time_values)` - will output one row for every row in `edf` with `time_value` appearing inside - `my_ref_time_values`, and is analogous to a `dplyr::mutate`&`dplyr::arrange` - followed by `dplyr::filter` to those `.ref_time_values`. We call this property - **size stability**, and describe how it is achieved in the following sections. - The default behavior described above is a special case of this general rule - based on a default value of `.ref_time_values`. -* Exception/feature: `epi_slide(edf, ....., .ref_time_values=my_ref_time_values, - .all_rows=TRUE)` will not just output rows for `my_ref_time_values`, but - instead will output one row per row in `edf`. 
-* Clarification: `ea %>% group_by(....., .drop=FALSE) %>% - epix_slide(, .....)` will call the computation on any missing - groups according to `dplyr`'s `.drop=FALSE` rules, resulting in additional - output rows. - -Below we demonstrate some advanced use cases of sliding with different output -structures. We focus on `epi_slide()` for the most part, though some of the -behavior we demonstrate also carries over to `epix_slide()`. - -## Recycling outputs - -When a computation returns a single atomic value, `epi_slide()` will internally -try to recycle the output so that it is size stable (in the sense described -above). We can use this to our advantage, for example, in order to compute a -trailing average marginally over geo values, which we demonstrate below in a -simple synthetic example. - -```{r message = FALSE} -library(epiprocess) -library(dplyr) -set.seed(123) - -edf <- tibble( - geo_value = rep(c("ca", "fl", "pa"), each = 3), - time_value = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"), by = "day"), length.out = length(geo_value)), - x = seq_along(geo_value) + 0.01 * rnorm(length(geo_value)), -) %>% - as_epi_df(as_of = as.Date("2024-03-20")) - -# 2-day trailing average, per geo value -edf %>% - group_by(geo_value) %>% - epi_slide(x_2dav = mean(x), .window_size = 2) %>% - ungroup() - -# 2-day trailing average, marginally -edf %>% - epi_slide(x_2dav = mean(x), .window_size = 2) -``` - -```{r, include = FALSE} -# More checks (not included) -edf %>% - epi_slide(x_2dav = mean(x), .window_size = 2, .ref_time_values = as.Date("2020-06-02")) - -edf %>% - # pretend that observations about time_value t are reported in version t (nowcasts) - mutate(version = time_value) %>% - as_epi_archive() %>% - group_by(geo_value) %>% - epix_slide(x_2dav = mean(x), .before = 1, .versions = as.Date("2020-06-02")) %>% - ungroup() - -edf %>% - # pretend that observations about time_value t are reported in version t (nowcasts) - mutate(version = time_value) %>% - as_epi_archive() %>% - group_by(geo_value) %>% - epix_slide(~ mean(.x$x), .before = 1, .ref_time_values = as.Date("2020-06-02")) %>% - ungroup() -``` - -When the slide computation returns an atomic vector (rather than a single value) -`epi_slide()` checks whether its return length ensures size stability, and if -so, uses it to fill the new column. For example, this next computation gives the -same result as the last one. - -```{r} -edf %>% - epi_slide(y_2dav = rep(mean(x), 3), .window_size = 2) -``` - -However, if the output is an atomic vector (rather than a single value) and it -is *not* size stable, then `epi_slide()` throws an error. For example, below we -are trying to return 2 things for 3 states. - -```{r, error = TRUE} -edf %>% - epi_slide(x_2dav = rep(mean(x), 2), .window_size = 2) -``` - -## Multi-column outputs - -Now we move on to outputs that are data frames with a single row but multiple -columns. Working with this type of output structure has in fact has already been -demonstrated in the [slide -vignette](https://cmu-delphi.github.io/epiprocess/articles/slide.html). - -```{r} -edf2 <- edf %>% - group_by(geo_value) %>% - epi_slide( - a = list(data.frame(x_2dav = mean(x), x_2dma = mad(x))), - .window_size = 2 - ) %>% - ungroup() - -class(edf2$a) -length(edf2$a) -edf2$a[[2]] -``` - -If you do not wrap the data.frame in a list above, the resulting `epi_df` has -multiple new columns containing the slide values. 
The default is to name these -unnested columns by prefixing the name assigned to the list column (here `a`) -onto the column names of the output data frame from the slide computation (here -`x_2dav` and `x_2dma`) separated by "_". - -```{r} -edf %>% - group_by(geo_value) %>% - epi_slide( - a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), - .window_size = 2 - ) %>% - ungroup() -``` - -Furthermore, `epi_slide()` will recycle the single row data frame as needed in -order to make the result size stable, just like the case for atomic values (note -that we are not grouping here by geo_value). - -```{r} -edf %>% - epi_slide( - a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), - .window_size = 2 - ) -``` - -## Multi-row outputs - -For a slide computation that outputs a data frame with more than one row, the -behavior is analogous to a slide computation that outputs an atomic vector. -Meaning, `epi_slide()` will check that the result is size stable, and if so, -will fill the new column(s) in the resulting `epi_df` object appropriately. - -This can be convenient for modeling in the following sense: we can, for example, -fit a sliding, data-versioning-unaware nowcasting or forecasting model by -pooling data from different locations, and then return separate forecasts from -this common model for each location. We use our synthetic example to demonstrate -this idea abstractly but simply by forecasting (actually, nowcasting) `y` from -`x` by fitting a time-windowed linear model that pooling data across all -locations. - -```{r} -edf$y <- 2 * edf$x + 0.05 * rnorm(length(edf$x)) - -edf %>% - epi_slide(function(d, group_key, ref_time_value) { - obj <- lm(y ~ x, data = d) - return( - predict( - obj, - newdata = d %>% group_by(geo_value) %>% filter(time_value == max(time_value)), - interval = "prediction", - level = 0.9 - ) %>% - as.data.frame() %>% - list() - ) - }, .window_size = 2) -``` - -The above example focused on simplicity to show how to work with multi-row -outputs. Note however, the following issues in this example: - -* The `lm` fitting data includes the testing instances, as no training-test split was performed. -* Adding a simple training-test split would not factor in reporting latency properly. -* Data revisions are not taken into account. - -All three of these factors contribute to unrealistic retrospective forecasts and -overly optimistic retrospective performance evaluations. Instead, one should -favor an `epix_slide` for more realistic "pseudoprospective" forecasts. Using -`epix_slide` also makes it easier to express certain types of forecasts; while -in `epi_slide`, forecasts for additional aheads or quantile levels would need to -be expressed as additional columns, or nested inside list columns, `epix_slide` -does not perform size stability checks or recycling, allowing computations to -output any number of rows. - -## Version-aware forecasting, revisited - -We revisit the COVID-19 forecasting example from the [archive -vignette](https://cmu-delphi.github.io/epiprocess/articles/slide.html) in order -to demonstrate the preceding points regarding forecast evaluation in a more -realistic setting. First, we fetch the versioned data and build the archive. 
- -```{r, message = FALSE, warning = FALSE, eval =FALSE} -library(epidatr) -library(data.table) -library(ggplot2) -theme_set(theme_bw()) - -y1 <- pub_covidcast( - source = "doctor-visits", - signals = "smoothed_adj_cli", - geo_type = "state", - time_type = "day", - geo_values = "ca,fl", - time_values = epirange(20200601, 20211201), - issues = epirange(20200601, 20211201) -) - -y2 <- pub_covidcast( - source = "jhu-csse", - signal = "confirmed_7dav_incidence_prop", - geo_type = "state", - time_type = "day", - geo_values = "ca,fl", - time_values = epirange(20200601, 20211201), - issues = epirange(20200601, 20211201) -) - -x <- y1 %>% - select(geo_value, time_value, - version = issue, - percent_cli = value - ) %>% - as_epi_archive(compactify = FALSE) - -# mutating merge operation: -x <- epix_merge( - x, - y2 %>% - select(geo_value, time_value, - version = issue, - case_rate_7d_av = value - ) %>% - as_epi_archive(compactify = FALSE), - sync = "locf", - compactify = FALSE -) -``` - -```{r, message = FALSE, echo =FALSE} -library(data.table) -library(ggplot2) -theme_set(theme_bw()) - -x <- archive_cases_dv_subset$DT %>% - filter(geo_value %in% c("ca", "fl")) %>% - as_epi_archive(compactify = FALSE) -``` - -Next, we extend the ARX function to handle multiple geo values, since in the -present case, we will not be grouping by geo value and each slide computation -will be run on multiple geo values at once. Note that, because `epix_slide()` -only returns the grouping variables, `time_value`, and the slide computations in -the eventual returned tibble, we need to include `geo_value` as a column in the -output data frame from our ARX computation. - -```{r} -library(tidyr) -library(purrr) - -prob_arx_args <- function(lags = c(0, 7, 14), - ahead = 7, - min_train_window = 20, - lower_level = 0.05, - upper_level = 0.95, - symmetrize = TRUE, - intercept = FALSE, - nonneg = TRUE) { - return(list( - lags = lags, - ahead = ahead, - min_train_window = min_train_window, - lower_level = lower_level, - upper_level = upper_level, - symmetrize = symmetrize, - intercept = intercept, - nonneg = nonneg - )) -} - -prob_arx <- function(x, y, geo_value, time_value, args = prob_arx_args()) { - # Return NA if insufficient training data - if (length(y) < args$min_train_window + max(args$lags) + args$ahead) { - return(data.frame( - geo_value = unique(geo_value), # Return geo value! - point = NA, lower = NA, upper = NA - )) - } - - # Set up x, y, lags list - if (!missing(x)) { - x <- data.frame(x, y) - } else { - x <- data.frame(y) - } - if (!is.list(args$lags)) args$lags <- list(args$lags) - args$lags <- rep(args$lags, length.out = ncol(x)) - - # Build features and response for the AR model, and then fit it - dat <- - tibble(i = seq_len(ncol(x)), lag = args$lags) %>% - unnest(lag) %>% - mutate(name = paste0("x", seq_len(nrow(.)))) %>% # nolint: object_usage_linter - # One list element for each lagged feature - pmap(function(i, lag, name) { - tibble( - geo_value = geo_value, - time_value = time_value + lag, # Shift back - !!name := x[, i] - ) - }) %>% - # One list element for the response vector - c(list( - tibble( - geo_value = geo_value, - time_value = time_value - args$ahead, # Shift forward - y = y - ) - )) %>% - # Combine them together into one data frame - reduce(full_join, by = c("geo_value", "time_value")) %>% - arrange(time_value) - if (args$intercept) dat$x0 <- rep(1, nrow(dat)) - obj <- lm(y ~ . 
+ 0, data = select(dat, -geo_value, -time_value)) - - # Use LOCF to fill NAs in the latest feature values (do this by geo value) - setDT(dat) # Convert to a data.table object by reference - cols <- setdiff(names(dat), c("geo_value", "time_value")) - dat[, (cols) := nafill(.SD, type = "locf"), .SDcols = cols, by = "geo_value"] - - # Make predictions - test_time_value <- max(time_value) - point <- predict( - obj, - newdata = dat %>% - dplyr::group_by(geo_value) %>% - dplyr::filter(time_value == test_time_value) - ) - - # Compute bands - r <- residuals(obj) - s <- ifelse(args$symmetrize, -1, NA) # Should the residuals be symmetrized? - q <- quantile(c(r, s * r), probs = c(args$lower, args$upper), na.rm = TRUE) - lower <- point + q[1] - upper <- point + q[2] - - # Clip at zero if we need to, then return - if (args$nonneg) { - point <- pmax(point, 0) - lower <- pmax(lower, 0) - upper <- pmax(upper, 0) - } - return(data.frame( - geo_value = unique(geo_value), # Return geo value! - point = point, lower = lower, upper = upper - )) -} -``` - -We now make forecasts on the archive and compare to forecasts on the latest -data. - -```{r, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 6} -# Latest snapshot of data, and forecast dates -x_latest <- epix_as_of(x, max_version = max(x$DT$version)) -fc_time_values <- seq(as.Date("2020-08-01"), - as.Date("2021-11-30"), - by = "1 month" -) - -# Simple function to produce forecasts k weeks ahead -k_week_ahead <- function(x, ahead = 7, as_of = TRUE) { - if (as_of) { - x %>% - epix_slide( - fc = prob_arx(.data$percent_cli, .data$case_rate_7d_av, .data$geo_value, .data$time_value, - args = prob_arx_args(ahead = ahead) - ), - .before = 219, .versions = fc_time_values - ) %>% - mutate( - target_date = .data$time_value + ahead, as_of = TRUE, - geo_value = .data$fc$geo_value - ) - } else { - x_latest %>% - epi_slide( - fc = prob_arx(.data$percent_cli, .data$case_rate_7d_av, .data$geo_value, .data$time_value, - args = prob_arx_args(ahead = ahead) - ), - .window_size = 220, .ref_time_values = fc_time_values - ) %>% - mutate(target_date = .data$time_value + ahead, as_of = FALSE) - } -} - -# Generate the forecasts, and bind them together -fc <- bind_rows( - k_week_ahead(x, ahead = 7, as_of = TRUE), - k_week_ahead(x, ahead = 14, as_of = TRUE), - k_week_ahead(x, ahead = 21, as_of = TRUE), - k_week_ahead(x, ahead = 28, as_of = TRUE), - k_week_ahead(x, ahead = 7, as_of = FALSE), - k_week_ahead(x, ahead = 14, as_of = FALSE), - k_week_ahead(x, ahead = 21, as_of = FALSE), - k_week_ahead(x, ahead = 28, as_of = FALSE) -) - -# Plot them, on top of latest COVID-19 case rates -ggplot(fc, aes(x = target_date, group = time_value, fill = as_of)) + - geom_ribbon(aes(ymin = fc$lower, ymax = fc$upper), alpha = 0.4) + - geom_line( - data = x_latest, aes(x = time_value, y = case_rate_7d_av), - inherit.aes = FALSE, color = "gray50" - ) + - geom_line(aes(y = fc$point)) + - geom_point(aes(y = fc$point), size = 0.5) + - geom_vline(aes(xintercept = time_value), linetype = 2, alpha = 0.5) + - facet_grid(vars(geo_value), vars(as_of), scales = "free") + - scale_x_date(minor_breaks = "month", date_labels = "%b %y") + - labs(x = "Date", y = "Reported COVID-19 case rates") + - theme(legend.position = "none") -``` - -We can see that these forecasts, which come from training an ARX model jointly -over CA and FL, exhibit generally less variability and wider prediction bands -compared to the ones from the archive vignette, which come from training a -separate ARX model on each state. 
As in the archive vignette, we can see a -difference between version-aware (right column) and -unaware (left column) -forecasting, as well. - -## Attribution -The `case_rate_7d_av` data used in this document is a modified part of the [COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19) as [republished in the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html). This data set is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/) by the Johns Hopkins University on behalf of its Center for Systems Science in Engineering. Copyright Johns Hopkins University 2020. - -The `percent_cli` data is a modified part of the [COVIDcast Epidata API Doctor Visits data](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/doctor-visits.html). This dataset is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/). Copyright Delphi Research Group at Carnegie Mellon University 2020. diff --git a/vignettes/aggregation.Rmd b/vignettes/aggregation.Rmd index 585b5b0a4..9d205f530 100644 --- a/vignettes/aggregation.Rmd +++ b/vignettes/aggregation.Rmd @@ -52,13 +52,12 @@ x <- jhu_csse_county_level_subset ## Converting to `tsibble` format For manipulating and wrangling time series data, the -[`tsibble`](https://tsibble.tidyverts.org/index.html) already provides a whole -bunch of useful tools. A tsibble object (formerly, of class `tbl_ts`) is -basically a tibble (data frame) but with two specially-marked columns: an -**index** column representing the time variable (defining an order from past to -present), and a **key** column identifying a unique observational unit for each -time point. In fact, the key can be made up of any number of columns, not just a -single one. +[`tsibble`](https://tsibble.tidyverts.org/index.html) already provides a host of +useful tools. A tsibble object (formerly, of class `tbl_ts`) is basically a +tibble (data frame) but with two specially-marked columns: an **index** column +representing the time variable (defining an order from past to present), and a +**key** column identifying a unique observational unit for each time point. In +fact, the key can be made up of any number of columns, not just a single one. In an `epi_df` object, the index variable is `time_value`, and the key variable is typically `geo_value` (though this need not always be the case: for example, @@ -113,11 +112,13 @@ Let's first remove certain dates from our data set to create gaps: ```{r} # First make geo value more readable for tables, plots, etc. 
x <- x %>% - mutate(geo_value = paste( - substr(county_name, 1, nchar(county_name) - 7), - name_to_abbr(state_name), - sep = ", " - )) %>% + mutate( + geo_value = paste( + substr(county_name, 1, nchar(county_name) - 7), + name_to_abbr(state_name), + sep = ", " + ) + ) %>% select(geo_value, time_value, cases) xt <- as_tsibble(x) %>% filter(cases >= 3) diff --git a/vignettes/archive.Rmd b/vignettes/archive.Rmd index 074131263..62eea2aa5 100644 --- a/vignettes/archive.Rmd +++ b/vignettes/archive.Rmd @@ -51,6 +51,10 @@ library(data.table) library(dplyr) library(purrr) library(ggplot2) +dv <- archive_cases_dv_subset$DT %>% + select(-case_rate_7d_av) %>% + rename(issue = version, value = percent_cli) %>% + tibble() ``` ## Getting data into `epi_archive` format @@ -72,7 +76,7 @@ format, with `issue` playing the role of `version`. We can now use redundant version updates in `as_epi_archive` using compactify, please refer to the [compactify vignette](articles/compactify.html). -```{r, eval=FALSE} +```{r} x <- dv %>% select(geo_value, time_value, version = issue, percent_cli = value) %>% as_epi_archive(compactify = TRUE) @@ -81,15 +85,6 @@ class(x) print(x) ``` -```{r, echo=FALSE, message=FALSE, warning=FALSE} -x <- archive_cases_dv_subset$DT %>% - select(geo_value, time_value, version, percent_cli) %>% - as_epi_archive(compactify = TRUE) - -class(x) -print(x) -``` - An `epi_archive` is consists of a primary field `DT`, which is a data table (from the `data.table` package) that has the columns `geo_value`, `time_value`, `version` (and possibly additional ones), and other metadata fields, such as @@ -127,17 +122,27 @@ object is instantiated, if they are not explicitly specified in the function call (as it did in the case above). ## Summarizing Revision Behavior -There are many ways to examine the ways that signals change across different revisions. -The simplest that is included directly in epiprocess is `revision_summary()`, which computes simple summary statistics for each key (by default, `(geo_value,time_value)` pairs), such as the lag to the first value (latency). In addition to the per key summary, it also returns an overall summary: + +There are many ways to examine how signals change across different +revisions. The simplest one included directly in epiprocess is +`revision_summary()`, which computes simple summary statistics for each key (by +default, `(geo_value,time_value)` pairs), such as the lag to the first value +(latency). In addition to the per-key summary, it also returns an overall +summary: + ```{r} revision_details <- revision_summary(x, print_inform = TRUE) ``` -So as was mentioned at the top, this is clearly a data set where basically everything has some amount of revisions, only 0.37% have no revision at all, and 0.92 have fewer than 3. -Over 94% change by more than 10%. -On the other hand, most are within plus or minus 20% within 5-9 days, so the revisions converge relatively quickly, even if the revisions continue for longer. +So, as was mentioned at the top, this is clearly a data set where basically +everything has some amount of revision: only 0.37% have no revision at all, and +only 0.92% have fewer than 3. Over 94% change by more than 10%. On the other hand, +most are within plus or minus 20% within 5-9 days, so the revisions converge +relatively quickly, even if the revisions continue for longer.
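+As a quick sketch of how this summary can be tuned (`min_waiting_period` and
+`print_inform` are arguments of `revision_summary()` in this version of the
+package, but check your version's documentation; the 60-day cutoff here is
+illustrative), one could restrict the summary to observations that have had
+time to settle:
+
+```{r, eval = FALSE}
+# Summarize revisions of percent_cli, skipping observations reported within
+# the last 60 days, since their revision sequences may still be incomplete.
+revision_summary(x, percent_cli, min_waiting_period = 60, print_inform = TRUE)
+```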
+ +To do more detailed analysis than is possible with the above printing, we have +`revision_details`: -To do more detailed analysis than is possible with the above printing, we have `revision_details`: ```{r} revision_details %>% group_by(geo_value) %>% @@ -150,13 +155,16 @@ revision_details %>% time_near_latest = mean(time_near_latest) ) ``` -Most of the states have similar stats on most of these features, except for Florida, which takes nearly double the amount of time to get close to the right value, with California not too far behind. + +Most of the states have similar stats on most of these features, except for +Florida, which takes nearly double the amount of time to get close to the right +value, with California not too far behind. ## Producing snapshots in `epi_df` form -A key method of an `epi_archive` class is `epix_as_of()`, which generates a snapshot -of the archive in `epi_df` format. This represents the most up-to-date values of -the signal variables as of a given version. +A key method of an `epi_archive` class is `epix_as_of()`, which generates a +snapshot of the archive in `epi_df` format. This represents the most up-to-date +values of the signal variables as of a given version. ```{r} x_snapshot <- epix_as_of(x, as.Date("2021-06-01")) @@ -180,6 +188,7 @@ latest snapshot `x_latest` that the archive can provide). ```{r, fig.width = 8, fig.height = 7} theme_set(theme_bw()) +x_latest <- epix_as_of(x, x$versions_end) self_max <- max(x$DT$version) versions <- seq(as.Date("2020-06-01"), self_max - 1, by = "1 month") snapshots <- map_dfr(versions, function(v) { @@ -237,7 +246,7 @@ When merging archives, unless the archives have identical data release patterns, download the currently available version data for one of the archives, but not the other). -```{r, message = FALSE, warning = FALSE,eval=FALSE} +```{r, message = FALSE, warning = FALSE, eval=FALSE} y <- pub_covidcast( source = "jhu-csse", signals = "confirmed_7dav_incidence_prop", @@ -337,15 +346,13 @@ Next we slide this forecaster over the working `epi_archive` object, in order to forecast COVID-19 case rates 7 days into the future. ```{r} -fc_time_values <- seq(as.Date("2020-08-01"), - as.Date("2021-11-30"), - by = "1 month" -) +fc_time_values <- seq(as.Date("2020-08-01"), as.Date("2021-11-30"), by = "1 month") z <- x %>% group_by(geo_value) %>% epix_slide( - fc = prob_arx(x = percent_cli, y = case_rate_7d_av), .before = 119, + fc = prob_arx(x = percent_cli, y = case_rate_7d_av, ahead = 7), + .before = 119, .versions = fc_time_values ) %>% ungroup() @@ -353,8 +360,6 @@ z <- x %>% head(z, 10) ``` - - We get back a tibble `z` with the grouping variables (here geo value), the (reference) time values, and a ["packed"][tidyr::pack] data frame column `fc` containing `fc$point`, `fc$lower`, and `fc$upper` that correspond to the point @@ -377,22 +382,22 @@ points in time and forecast horizons. 
The former comes from using x_latest <- epix_as_of(x, x$versions_end) # Simple function to produce forecasts k weeks ahead -k_week_ahead <- function(x, ahead = 7, as_of = TRUE) { +forecast_k_week_ahead <- function(x, ahead = 7, as_of = TRUE) { if (as_of) { x %>% - group_by(.data$geo_value) %>% + group_by(geo_value) %>% epix_slide( fc = prob_arx(.data$percent_cli, .data$case_rate_7d_av, ahead = ahead), .before = 119, .versions = fc_time_values ) %>% - mutate(target_date = .data$time_value + ahead, as_of = TRUE) %>% + mutate(target_date = .data$version + ahead, as_of = TRUE) %>% ungroup() } else { x_latest %>% - group_by(.data$geo_value) %>% + group_by(geo_value) %>% epi_slide( fc = prob_arx(.data$percent_cli, .data$case_rate_7d_av, ahead = ahead), .window_size = 120, - .versions = fc_time_values + .ref_time_values = fc_time_values ) %>% mutate(target_date = .data$time_value + ahead, as_of = FALSE) %>% ungroup() @@ -401,14 +406,14 @@ k_week_ahead <- function(x, ahead = 7, as_of = TRUE) { # Generate the forecasts, and bind them together fc <- bind_rows( - k_week_ahead(x, ahead = 7, as_of = TRUE), - k_week_ahead(x, ahead = 14, as_of = TRUE), - k_week_ahead(x, ahead = 21, as_of = TRUE), - k_week_ahead(x, ahead = 28, as_of = TRUE), - k_week_ahead(x, ahead = 7, as_of = FALSE), - k_week_ahead(x, ahead = 14, as_of = FALSE), - k_week_ahead(x, ahead = 21, as_of = FALSE), - k_week_ahead(x, ahead = 28, as_of = FALSE) + forecast_k_week_ahead(x, ahead = 7, as_of = TRUE), + forecast_k_week_ahead(x, ahead = 14, as_of = TRUE), + forecast_k_week_ahead(x, ahead = 21, as_of = TRUE), + forecast_k_week_ahead(x, ahead = 28, as_of = TRUE), + forecast_k_week_ahead(x, ahead = 7, as_of = FALSE), + forecast_k_week_ahead(x, ahead = 14, as_of = FALSE), + forecast_k_week_ahead(x, ahead = 21, as_of = FALSE), + forecast_k_week_ahead(x, ahead = 28, as_of = FALSE) ) # Plot them, on top of latest COVID-19 case rates @@ -447,9 +452,250 @@ to look for more robust forecasting methodology. The forecasters that appear in the vignettes in the current package are only meant to demo the slide functionality with some of the most basic forecasting methodology possible. +## Sliding version-aware computations with geo-pooling + +First, we fetch the versioned data and build the archive. 
+ +```{r, message = FALSE, warning = FALSE, eval =FALSE} +library(epidatr) +library(data.table) +library(ggplot2) +theme_set(theme_bw()) + +y1 <- pub_covidcast( + source = "doctor-visits", + signals = "smoothed_adj_cli", + geo_type = "state", + time_type = "day", + geo_values = "ca,fl", + time_values = epirange(20200601, 20211201), + issues = epirange(20200601, 20211201) +) + +y2 <- pub_covidcast( + source = "jhu-csse", + signal = "confirmed_7dav_incidence_prop", + geo_type = "state", + time_type = "day", + geo_values = "ca,fl", + time_values = epirange(20200601, 20211201), + issues = epirange(20200601, 20211201) +) + +x <- y1 %>% + select(geo_value, time_value, + version = issue, + percent_cli = value + ) %>% + as_epi_archive(compactify = FALSE) + +# mutating merge operation: +x <- epix_merge( + x, + y2 %>% + select(geo_value, time_value, + version = issue, + case_rate_7d_av = value + ) %>% + as_epi_archive(compactify = FALSE), + sync = "locf", + compactify = FALSE +) +``` + +```{r, message = FALSE, echo =FALSE} +library(data.table) +library(ggplot2) +theme_set(theme_bw()) + +x <- archive_cases_dv_subset$DT %>% + filter(geo_value %in% c("ca", "fl")) %>% + as_epi_archive(compactify = FALSE) +``` + +Next, we extend the ARX function to handle multiple geo values, since in the +present case, we will not be grouping by geo value and each slide computation +will be run on multiple geo values at once. Note that, because `epix_slide()` +only returns the grouping variables, `time_value`, and the slide computations in +the eventual returned tibble, we need to include `geo_value` as a column in the +output data frame from our ARX computation. + +```{r} +library(tidyr) +library(purrr) + +prob_arx_args <- function(lags = c(0, 7, 14), + ahead = 7, + min_train_window = 20, + lower_level = 0.05, + upper_level = 0.95, + symmetrize = TRUE, + intercept = FALSE, + nonneg = TRUE) { + return(list( + lags = lags, + ahead = ahead, + min_train_window = min_train_window, + lower_level = lower_level, + upper_level = upper_level, + symmetrize = symmetrize, + intercept = intercept, + nonneg = nonneg + )) +} + +prob_arx <- function(x, y, geo_value, time_value, args = prob_arx_args()) { + # Return NA if insufficient training data + if (length(y) < args$min_train_window + max(args$lags) + args$ahead) { + return(data.frame( + geo_value = unique(geo_value), # Return geo value! + point = NA, lower = NA, upper = NA + )) + } + + # Set up x, y, lags list + if (!missing(x)) { + x <- data.frame(x, y) + } else { + x <- data.frame(y) + } + if (!is.list(args$lags)) args$lags <- list(args$lags) + args$lags <- rep(args$lags, length.out = ncol(x)) + + # Build features and response for the AR model, and then fit it + dat <- tibble(i = seq_len(ncol(x)), lag = args$lags) %>% + unnest(lag) %>% + mutate(name = paste0("x", seq_len(nrow(.)))) %>% # nolint: object_usage_linter + # One list element for each lagged feature + pmap(function(i, lag, name) { + tibble( + geo_value = geo_value, + time_value = time_value + lag, # Shift back + !!name := x[, i] + ) + }) %>% + # One list element for the response vector + c(list( + tibble( + geo_value = geo_value, + time_value = time_value - args$ahead, # Shift forward + y = y + ) + )) %>% + # Combine them together into one data frame + reduce(full_join, by = c("geo_value", "time_value")) %>% + arrange(time_value) + if (args$intercept) dat$x0 <- rep(1, nrow(dat)) + obj <- lm(y ~ . 
+ 0, data = select(dat, -geo_value, -time_value)) + + # Use LOCF to fill NAs in the latest feature values (do this by geo value) + setDT(dat) # Convert to a data.table object by reference + cols <- setdiff(names(dat), c("geo_value", "time_value")) + dat[, (cols) := nafill(.SD, type = "locf"), .SDcols = cols, by = "geo_value"] + + # Make predictions + test_time_value <- max(time_value) + point <- predict( + obj, + newdata = dat %>% + dplyr::group_by(geo_value) %>% + dplyr::filter(time_value == test_time_value) + ) + + # Compute bands + r <- residuals(obj) + s <- ifelse(args$symmetrize, -1, NA) # Should the residuals be symmetrized? + q <- quantile(c(r, s * r), probs = c(args$lower, args$upper), na.rm = TRUE) + lower <- point + q[1] + upper <- point + q[2] + + # Clip at zero if we need to, then return + if (args$nonneg) { + point <- pmax(point, 0) + lower <- pmax(lower, 0) + upper <- pmax(upper, 0) + } + return(data.frame( + geo_value = unique(geo_value), # Return geo value! + point = point, lower = lower, upper = upper + )) +} +``` + +We now make forecasts on the archive and compare to forecasts on the latest +data. + +```{r, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 6} +# Latest snapshot of data, and forecast dates +x_latest <- epix_as_of(x, version = max(x$DT$version)) +fc_time_values <- seq(as.Date("2020-08-01"), + as.Date("2021-11-30"), + by = "1 month" +) + +# Simple function to produce forecasts k weeks ahead +forecast_k_week_ahead <- function(x, ahead = 7, as_of = TRUE) { + if (as_of) { + x %>% + epix_slide( + fc = prob_arx(.data$percent_cli, .data$case_rate_7d_av, .data$geo_value, .data$time_value, + args = prob_arx_args(ahead = ahead) + ), + .before = 219, .versions = fc_time_values + ) %>% + mutate( + target_date = .data$version + ahead, as_of = TRUE, + geo_value = .data$fc$geo_value + ) + } else { + x_latest %>% + epi_slide( + fc = prob_arx(.data$percent_cli, .data$case_rate_7d_av, .data$geo_value, .data$time_value, + args = prob_arx_args(ahead = ahead) + ), + .window_size = 220, .ref_time_values = fc_time_values + ) %>% + mutate(target_date = .data$time_value + ahead, as_of = FALSE) + } +} + +# Generate the forecasts, and bind them together +fc <- bind_rows( + forecast_k_week_ahead(x, ahead = 7, as_of = TRUE), + forecast_k_week_ahead(x, ahead = 14, as_of = TRUE), + forecast_k_week_ahead(x, ahead = 21, as_of = TRUE), + forecast_k_week_ahead(x, ahead = 28, as_of = TRUE), + forecast_k_week_ahead(x, ahead = 7, as_of = FALSE), + forecast_k_week_ahead(x, ahead = 14, as_of = FALSE), + forecast_k_week_ahead(x, ahead = 21, as_of = FALSE), + forecast_k_week_ahead(x, ahead = 28, as_of = FALSE) +) + +# Plot them, on top of latest COVID-19 case rates +ggplot(fc, aes(x = target_date, group = time_value, fill = as_of)) + + geom_ribbon(aes(ymin = fc$lower, ymax = fc$upper), alpha = 0.4) + + geom_line( + data = x_latest, aes(x = time_value, y = case_rate_7d_av), + inherit.aes = FALSE, color = "gray50" + ) + + geom_line(aes(y = fc$point)) + + geom_point(aes(y = fc$point), size = 0.5) + + geom_vline(aes(xintercept = time_value), linetype = 2, alpha = 0.5) + + facet_grid(vars(geo_value), vars(as_of), scales = "free") + + scale_x_date(minor_breaks = "month", date_labels = "%b %y") + + labs(x = "Date", y = "Reported COVID-19 case rates") + + theme(legend.position = "none") +``` + +We can see that these forecasts, which come from training an ARX model jointly +over CA and FL, exhibit generally less variability and wider prediction bands +compared to the ones from the archive 
vignette's earlier section, which come from training a +separate ARX model on each state. As in that section, we can see a +difference between version-aware (right column) and -unaware (left column) +forecasting, as well. + ## Attribution + This document contains a dataset that is a modified part of the [COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19) as [republished in the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html). This data set is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/) by the Johns Hopkins University on behalf of its Center for Systems Science in Engineering. Copyright Johns Hopkins University 2020. The `percent_cli` data is a modified part of the [COVIDcast Epidata API Doctor Visits data](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/doctor-visits.html). This dataset is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/). Copyright Delphi Research Group at Carnegie Mellon University 2020. - - diff --git a/vignettes/epiprocess.Rmd b/vignettes/epiprocess.Rmd index e6c78aba4..b1840bb2e 100644 --- a/vignettes/epiprocess.Rmd +++ b/vignettes/epiprocess.Rmd @@ -128,9 +128,7 @@ columns required for an `epi_df` object (along with many others). We can use frame into `epi_df` format. ```{r, message = FALSE} -x <- as_epi_df(cases, - as_of = max(cases$issue) -) %>% +x <- as_epi_df(cases, as_of = max(cases$issue)) %>% select(geo_value, time_value, total_cases = value) class(x) @@ -176,9 +174,11 @@ attributes(x)$metadata ``` ## Using additional key columns in `epi_df` + In the following examples we will show how to create an `epi_df` with additional keys. ### Converting a `tsibble` that has county code as an extra key + ```{r} ex1 <- tibble( geo_value = rep(c("ca", "fl", "pa"), each = 3), @@ -200,10 +200,10 @@ The metadata now includes `county_code` as an extra key. attr(ex1, "metadata") ``` - ### Dealing with misspecified column names `epi_df` requires there to be columns `geo_value` and `time_value`, if they do not exist then `as_epi_df()` throws an error. + ```{r, error = TRUE} data.frame( # misnamed geo = rep(c("ca", "fl", "pa"), each = 3), # extra key pol = rep(c("blue", "swing", "swing"), each = 3), # misnamed - reported_date = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"), by = "day"), length.out = length(geo_value)), - value = seq_along(geo_value) + 0.01 * withr::with_rng_version("3.0.0", withr::with_seed(42, length(geo_value))) + reported_date = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"), by = "day"), length.out = 9), + value = 1:9 + 0.01 * withr::with_rng_version("3.0.0", withr::with_seed(42, 9)) ) %>% as_epi_df(as_of = as.Date("2024-03-20")) ``` The columns can be renamed to match `epi_df` format. In the example below, notice there is also an additional key `pol`. + ```{r} ex2 <- tibble( # misnamed geo = rep(c("ca", "fl", "pa"), each = 3), # extra key pol = rep(c("blue", "swing", "swing"), each = 3), # misnamed reported_date = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"), by = "day"), length.out = length(geo_value)), @@ -240,7 +241,6 @@ ex2 <- ex2 %>% attr(ex2, "metadata") ``` - ### Adding additional keys to an `epi_df` object In the above examples, all the keys are added to objects that are not `epi_df` objects. We illustrate how to add keys to an `epi_df` object.
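+A minimal sketch of the idea (hypothetical extra column; this assumes your
+version of `as_epi_df()` accepts an `other_keys` character vector naming the
+extra key columns, so check its documentation): drop back to a tibble, add the
+column, and re-declare the keys.
+
+```{r, eval = FALSE}
+ex3 <- ex1 %>%
+  as_tibble() %>%
+  # `pol` is an illustrative, made-up key column, as in the examples above
+  mutate(pol = rep(c("blue", "swing", "swing"), each = 3)) %>%
+  as_epi_df(as_of = as.Date("2024-03-20"), other_keys = c("county_code", "pol"))
+
+attr(ex3, "metadata")
+```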
diff --git a/vignettes/growth_rate.Rmd b/vignettes/growth_rate.Rmd index abef646fc..acbb53eee 100644 --- a/vignettes/growth_rate.Rmd +++ b/vignettes/growth_rate.Rmd @@ -22,6 +22,7 @@ library(tidyr) ``` The data is fetched with the following query: + ```{r, message = FALSE, eval=F} x <- pub_covidcast( source = "jhu-csse", @@ -38,7 +39,6 @@ x <- pub_covidcast( The data has 1,158 rows and 3 columns. - ```{r, echo=FALSE} data(jhu_csse_daily_subset) x <- jhu_csse_daily_subset %>% diff --git a/vignettes/outliers.Rmd b/vignettes/outliers.Rmd index ea3c30ac7..1a2cfa416 100644 --- a/vignettes/outliers.Rmd +++ b/vignettes/outliers.Rmd @@ -127,11 +127,14 @@ vote across the base methods to determine whether a value is an outlier. ```{r} x <- x %>% group_by(geo_value) %>% - mutate(outlier_info = detect_outlr( - x = time_value, y = cases, - methods = detection_methods, - combiner = "median" - )) %>% + mutate( + outlier_info = detect_outlr( + x = time_value, + y = cases, + methods = detection_methods, + combiner = "median" + ) + ) %>% ungroup() %>% unnest(outlier_info) @@ -240,10 +243,9 @@ ggplot(y, aes(x = time_value)) + More advanced correction functionality will be coming at some point in the future. - ## Attribution + This document contains a dataset that is a modified part of the [COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19) as [republished in the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html). This data set is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/) by the Johns Hopkins University on behalf of its Center for Systems Science in Engineering. Copyright Johns Hopkins University 2020. [From the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html): These signals are taken directly from the JHU CSSE [COVID-19 GitHub repository](https://github.com/CSSEGISandData/COVID-19) without changes. - diff --git a/vignettes/slide.Rmd b/vignettes/slide.Rmd index 7ec6cc9bf..92d8456d3 100644 --- a/vignettes/slide.Rmd +++ b/vignettes/slide.Rmd @@ -11,21 +11,19 @@ A central tool in the `epiprocess` package is `epi_slide()`, which is based on the powerful functionality provided in the [`slider`](https://cran.r-project.org/web/packages/slider) package. In `epiprocess`, to "slide" means to apply a computation---represented as a -function or formula---over a sliding/rolling data window. Suitable -groupings can always be achieved by a preliminary call to `group_by()`. +function or formula---over a sliding/rolling data window. The slide is always +applied within each group; if you do not group the `epi_df` with `group_by()` +beforehand, the grouping defaults to all of the `epi_df`'s group keys. -By default, the meaning of one time step is inferred from the `time_value` -column of the `epi_df` object under consideration, based on the way this column -understands addition and subtraction. For example, if the time values are coded -as `Date` objects, then one time step is one day, since `as.Date("2022-01-01") + -1` equals `as.Date("2022-01-02")`. Alternatively, the time step can be specified -manually in the call to `epi_slide()`; you can read the documentation for more -details.
+By default, the `.window_size` units depend on the `time_type` of the `epi_df`, +which is determined from the type of the `time_value` column of the `epi_df`. +See the "Details" in `epi_slide()` for more. As in getting started guide, we'll fetch daily reported COVID-19 cases from CA, FL, NY, and TX (note: here we're using new, not cumulative cases) using the -[`epidatr`](https://github.com/cmu-delphi/epidatr) package, -and then convert this to `epi_df` format. +[`epidatr`](https://github.com/cmu-delphi/epidatr) package, and then convert +this to `epi_df` format. ```{r, message = FALSE, warning=FALSE} library(epidatr) library(epiprocess) library(dplyr) ``` The data is fetched with the following query: + ```{r, message = FALSE, eval=F} -x <- pub_covidcast( +edf <- pub_covidcast( source = "jhu-csse", signals = "confirmed_incidence_num", geo_type = "state", @@ -52,99 +51,106 @@ The data has 2,684 rows and 3 columns. ```{r, echo=FALSE} data(jhu_csse_daily_subset) -x <- jhu_csse_daily_subset %>% +edf <- jhu_csse_daily_subset %>% select(geo_value, time_value, cases) %>% arrange(geo_value, time_value) %>% as_epi_df() ``` -## Optimized rolling mean +## Optimized rolling means and sums -We first demonstrate how to apply a 7-day trailing average to the daily cases -in order to smooth the signal, by passing in the name of the column(s) we -want to average for the first argument of `epi_slide_mean()`. `epi_slide_mean -()` can only be used for averaging. To do this computation per state, we -first call `group_by()`. +For the two most common sliding operations, we offer two optimized versions: +`epi_slide_mean()` and `epi_slide_sum()`. This example gets the 7-day trailing +average of the daily cases. Note that the name of the column(s) that we want to +average is specified as the first argument of `epi_slide_mean()`. ```{r} -x %>% +edf %>% group_by(geo_value) %>% - epi_slide_mean("cases", .window_size = 7) %>% + epi_slide_mean("cases", .window_size = 7, na.rm = TRUE) %>% ungroup() %>% head(10) ``` -The calculation is done using `data.table::frollmean`, whose behavior can be -adjusted by passing relevant arguments via `...`. +Note that we passed `na.rm = TRUE` through to `data.table::frollmean()` via the +`...` of `epi_slide_mean()`. + +The following computes the 7-day trailing sum of daily cases (passing `na.rm` +through to `data.table::frollsum()` similarly): + +```{r} +edf %>% + group_by(geo_value) %>% + epi_slide_sum("cases", .window_size = 7, na.rm = TRUE) %>% + ungroup() %>% + head(10) +``` -## Slide with a formula +## General sliding with a formula -The previous computation can also be performed using `epi_slide()`, which is -more flexible but quite a bit slower than `epi_slide_mean()`. It is -recommended to use `epi_slide_mean()` when possible. +The previous computations can also be performed using `epi_slide()`, which can +be used for more general sliding computations (but is much slower for the +specific cases of mean and sum). The same 7-day trailing average of daily cases can be computed by passing in a -formula for the first argument of `epi_slide()`. To do this per state, we -first call `group_by()`. +formula for the first argument of `epi_slide()`: ```{r} -x %>% +edf %>% group_by(geo_value) %>% - epi_slide(~ mean(.x$cases), .window_size = 7) %>% + epi_slide(~ mean(.x$cases, na.rm = TRUE), .window_size = 7) %>% ungroup() %>% head(10) ``` -The formula specified has access to all non-grouping columns present in the -original `epi_df` object (and must refer to them with the prefix `.x$`).
As we -can see, the function `epi_slide()` returns an `epi_df` object with a new column -appended that contains the results (from sliding), named `slide_value` as the -default. We can of course change this post hoc, or we can instead specify a new -name up front using the `.new_col_name` argument: +If your formula returns a data.frame, then the columns of the data.frame +will be unpacked into the resulting `epi_df`. For example, the following +computes the 7-day trailing average of daily cases and the 7-day trailing sum of +daily cases: ```{r} -x <- x %>% +edf %>% group_by(geo_value) %>% - epi_slide(~ mean(.x$cases), .window_size = 7, .new_col_name = "cases_7dav") %>% - ungroup() - -head(x, 10) + epi_slide( + ~ data.frame(cases_mean = mean(.x$cases, na.rm = TRUE), cases_sum = sum(.x$cases, na.rm = TRUE)), + .window_size = 7 + ) %>% + ungroup() %>% + head(10) ``` +Note that this formula has access to all non-grouping columns present in the +original `epi_df` object and must refer to them with the prefix `.x$...`. As we +can see, the function `epi_slide()` returns an `epi_df` object with a new column +appended that contains the results (from sliding), named `slide_value` as the +default. + Some other information is available in additional variables: * `.group_key` is a one-row tibble containing the values of the grouping variables for the associated group * `.ref_time_value` is the reference time value the time window was based on -Like in `group_modify()`, there are alternative names for these variables as -well: `.` can be used instead of `.x`, `.y` instead of `.group_key`, and `.z` -instead of `.ref_time_value`. - -## Slide with a function - -We can also pass a function for the first argument in `epi_slide()`. In this -case, the passed function must accept the following arguments: - -In this case, the passed function `.f` must accept the following arguments: a -data frame with the same column names as the original object, minus any grouping -variables, containing the time window data for one group-`.ref_time_value` -combination; followed by a one-row tibble containing the values of the grouping -variables for the associated group; followed by the associated `.ref_time_value`. -It can accept additional arguments; `epi_slide()` will forward any `...` args it -receives to `.f`. - -Recreating the last example of a 7-day trailing average: - ```{r} -x <- x %>% +# Returning geo_value in the formula +edf %>% group_by(geo_value) %>% - epi_slide(function(x, gk, rtv) mean(x$cases), .window_size = 7, .new_col_name = "cases_7dav") %>% - ungroup() + epi_slide(~ .x$geo_value[[1]], .window_size = 7) %>% + ungroup() %>% + head(10) -head(x, 10) +# Returning time_value in the formula +edf %>% + group_by(geo_value) %>% + epi_slide(~ .x$time_value[[1]], .window_size = 7) %>% + ungroup() %>% + head(10) ``` +While the computations above do not look very useful, these can be used as +building blocks for computations that do something different depending on the +geo_value or ref_time_value. + ## Slide the tidy way Perhaps the most convenient way to setup a computation in `epi_slide()` is to @@ -154,15 +160,17 @@ to a computation in which we can access any columns of `.x` by name, just as we would in a call to `dplyr::mutate()`, or any of the `dplyr` verbs. 
-For example:
+The main computation we want, the 7-day trailing average, can then be written
+as follows (saving the result for the plot below):

 ```{r}
-x <- x %>%
+slide_output <- edf %>%
   group_by(geo_value) %>%
-  epi_slide(cases_7dav = mean(cases), .window_size = 7) %>%
-  ungroup()
-
-head(x, 10)
+  epi_slide(cases_7dav = mean(cases, na.rm = TRUE), .window_size = 7) %>%
+  ungroup()
+
+head(slide_output, 10)
 ```

+
+In addition to referring to individual columns by name, you can refer to the
+`epi_df` time window as `.x` (as in the sketch above; `.group_key` and
+`.ref_time_value` are also available). The tidyverse "pronouns" `.data` and
+`.env` can be used as well, if you need to distinguish between the data and the
+environment.

 As a simple sanity check, we visualize the 7-day trailing averages computed on
 top of the original counts:

@@ -171,7 +179,7 @@ top of the original counts:
 library(ggplot2)
 theme_set(theme_bw())

-ggplot(x, aes(x = time_value)) +
+ggplot(slide_output, aes(x = time_value)) +
   geom_col(aes(y = cases, fill = geo_value), alpha = 0.5, show.legend = FALSE) +
   geom_line(aes(y = cases_7dav, col = geo_value), show.legend = FALSE) +
   facet_wrap(~geo_value, scales = "free_y") +
@@ -182,18 +190,40 @@ ggplot(x, aes(x = time_value)) +
 As we can see from the top right panel, it looks like Texas moved to weekly
 reporting of COVID-19 cases in summer of 2021.

-## Running a local forecaster
+## Slide with a function
+
+We can also pass a function to the second argument of `epi_slide()`. In this
+case, the passed function `.f` must have the form `function(x, g, t, ...)`,
+where

-As a more complex example, we create a forecaster based on a local (in time)
-autoregression or AR model. AR models can be fit in numerous ways (using base R
-functions and various packages), but here we define it "by hand" both because it
-provides a more advanced example of sliding a function over an `epi_df` object,
-and because it allows us to be a bit more flexible in defining a *probabilistic*
-forecaster: one that outputs not just a point prediction, but a notion of
-uncertainty around this. In particular, our forecaster will output a point
-prediction along with an 90\% uncertainty band, represented by a predictive
-quantiles at the 5\% and 95\% levels (lower and upper endpoints of the
-uncertainty band).
+- `x` is a data frame with the same column names as the original `epi_df`,
+  minus any grouping variables, containing the time window data for one
+  group-`.ref_time_value` combination
+- `g` is a one-row tibble containing the values of the grouping variables
+  for the associated group
+- `t` is the `.ref_time_value` for the current window
+- `...` contains any additional arguments, which `epi_slide()` forwards to `.f`
+
+Recreating the last example of a 7-day trailing average:
+
+```{r}
+edf %>%
+  group_by(geo_value) %>%
+  epi_slide(function(x, g, t) mean(x$cases, na.rm = TRUE), .window_size = 7) %>%
+  ungroup() %>%
+  head(10)
+```
+
+## Running a simple autoregressive forecaster
+
+As a more complex example, we create a forecaster based on an autoregression or
+AR model. AR models can be fit in numerous ways (using base R functions and
+various packages), but here we define it "by hand" both because it provides a
+more advanced example of sliding a function over an `epi_df` object, and because
+it allows us to be a bit more flexible in defining a *probabilistic* forecaster:
+one that outputs not just a point prediction, but a notion of uncertainty around
+this. In particular, our forecaster will output a point prediction along with a
+90\% uncertainty band, represented by predictive quantiles at the 5\% and 95\%
+levels (the lower and upper endpoints of the uncertainty band).

 The function defined below, `prob_ar()`, is our probabilistic AR forecaster. The
-`lags`argument indicates which lags to use in the model, and `ahead` indicates
+`lags` argument indicates which lags to use in the model, and `ahead` indicates
@@ -210,6 +240,9 @@ prob_ar <- function(y, lags = c(0, 7, 14), ahead = 6, min_train_window = 20,
     return(data.frame(point = NA, lower = NA, upper = NA))
   }

+  # Drop NA values (e.g., at the edges of the data)
+  y <- y[!is.na(y)]
+
   # Build features and response for the AR model
   dat <- do.call(
     data.frame,
@@ -246,29 +279,21 @@ scale of smoothed COVID-19 cases. This is clearly equivalent, up to a constant,
 to modeling weekly sums of COVID-19 cases.

 ```{r}
-fc_time_values <- seq(as.Date("2020-06-01"),
-  as.Date("2021-12-01"),
-  by = "1 months"
-)
-x %>%
+fc_time_values <- seq(as.Date("2020-06-01"), as.Date("2021-12-01"), by = "1 months")
+edf %>%
   group_by(geo_value) %>%
-  epi_slide(
-    fc = prob_ar(cases_7dav), .window_size = 120,
-    .ref_time_values = fc_time_values
-  ) %>%
+  epi_slide(cases_7dav = mean(.data$cases, na.rm = TRUE), .window_size = 7) %>%
+  epi_slide(
+    fc = prob_ar(.data$cases_7dav),
+    .window_size = 120, .ref_time_values = fc_time_values
+  ) %>%
   ungroup() %>%
   head(10)
 ```

 Note that here we have utilized an argument `.ref_time_values` to perform the
 sliding computation (here, compute a forecast) at a specific subset of reference
-time values. We get out a ["packed"][tidyr::pack] data frame column `fc`
-containing `fc$point`, `fc$lower`, and `fc$upper` that correspond to the point
-forecast, and the lower and upper endpoints of the 95\% prediction band,
-respectively. (We could also have used `, prob_ar(cases_7dav)` to get three
-separate columns `point`, `lower`, and `upper`, or used `fc =
-list(prob_ar(cases_7dav))` to make an `fc` column with a ["nested"][tidyr::nest]
-format (list of data frames) instead.)
+time values (the start of every month from mid-2020 to the end of 2021). The
+resulting `epi_df` now contains a packed data frame column `fc`, with fields
+`fc$point`, `fc$lower`, and `fc$upper` corresponding to the point forecast, and
+the lower and upper endpoints of the 90\% prediction band, respectively.

 To finish off, we plot the forecasts at some times (spaced out by a few months)
 over the last year, at multiple horizons: 7, 14, 21, and 28 days ahead. To do
@@ -279,10 +304,13 @@ so that we can call it a few times.
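+# A note on .all_rows = TRUE in the second epi_slide() call below: rows whose
+# time_value is not in .ref_time_values are kept in the output, with the
+# forecast columns filled in as NA, so the forecasts can be plotted on top of
+# the full case series afterward.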
-# Note the use of .all_rows = TRUE (keeps all original rows in the output)
 k_week_ahead <- function(x, ahead = 7) {
   x %>%
-    group_by(.data$geo_value) %>%
+    group_by(geo_value) %>%
+    epi_slide(cases_7dav = mean(.data$cases, na.rm = TRUE), .window_size = 7) %>%
     epi_slide(
       fc = prob_ar(.data$cases_7dav, ahead = ahead),
-      .window_size = 120, .ref_time_values = fc_time_values, .all_rows = TRUE
+      .window_size = 120,
+      .ref_time_values = fc_time_values,
+      .all_rows = TRUE
     ) %>%
     ungroup() %>%
     mutate(target_date = .data$time_value + ahead)
@@ -290,10 +318,10 @@ k_week_ahead <- function(x, ahead = 7) {

 # First generate the forecasts, and bind them together
 z <- bind_rows(
-  k_week_ahead(x, ahead = 7),
-  k_week_ahead(x, ahead = 14),
-  k_week_ahead(x, ahead = 21),
-  k_week_ahead(x, ahead = 28)
+  k_week_ahead(edf, ahead = 7),
+  k_week_ahead(edf, ahead = 14),
+  k_week_ahead(edf, ahead = 21),
+  k_week_ahead(edf, ahead = 28)
 )

 # Now plot them, on top of actual COVID-19 case counts
@@ -341,8 +369,10 @@ example in the
 [archive vignette](https://cmu-delphi.github.io/epiprocess/articles/archive.html).

 ## Attribution
+
+The `percent_cli` data is a modified part of the [COVIDcast Epidata API Doctor Visits data](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/doctor-visits.html). This dataset is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/). Copyright Delphi Research Group at Carnegie Mellon University 2020.
+
 This document contains a dataset that is a modified part of the [COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19) as [republished in the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html). This data set is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/) by the Johns Hopkins University on behalf of its Center for Systems Science in Engineering. Copyright Johns Hopkins University 2020.

 [From the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html):
-	These signals are taken directly from the JHU CSSE [COVID-19 GitHub repository](https://github.com/CSSEGISandData/COVID-19) without changes.
-
+These signals are taken directly from the JHU CSSE [COVID-19 GitHub repository](https://github.com/CSSEGISandData/COVID-19) without changes.