From c4abb04470f9728083c13f6ea4fc3ff88c68ac28 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Wed, 15 Nov 2023 14:40:56 -0800 Subject: [PATCH 1/4] refactor+style: update for epidatr 1.0.0 --- data-raw/archive_cases_dv_subset.R | 31 ++-- data-raw/incidence_num_outlier_example.R | 9 +- data-raw/jhu_csse_county_level_subset.R | 9 +- data-raw/jhu_csse_daily_subset.R | 59 +++---- vignettes/advanced.Rmd | 166 +++++++++--------- vignettes/aggregation.Rmd | 101 ++++++----- vignettes/archive.Rmd | 214 ++++++++++++----------- vignettes/correlation.Rmd | 59 ++++--- vignettes/epiprocess.Rmd | 149 +++++++++------- vignettes/growth_rate.Rmd | 121 +++++++------ vignettes/outliers.Rmd | 138 +++++++++------ vignettes/slide.Rmd | 143 ++++++++------- 12 files changed, 642 insertions(+), 557 deletions(-) diff --git a/data-raw/archive_cases_dv_subset.R b/data-raw/archive_cases_dv_subset.R index d907fd89..5ba7ac4b 100644 --- a/data-raw/archive_cases_dv_subset.R +++ b/data-raw/archive_cases_dv_subset.R @@ -3,41 +3,40 @@ library(epiprocess) library(data.table) library(dplyr) -dv_subset <- covidcast( - data_source = "doctor-visits", +dv_subset <- pub_covidcast( + source = "doctor-visits", signals = "smoothed_adj_cli", - time_type = "day", geo_type = "state", - time_values = epirange(20200601, 20211201), + time_type = "day", geo_values = "ca,fl,ny,tx", + time_values = epirange(20200601, 20211201), issues = epirange(20200601, 20211201) ) %>% - fetch() %>% select(geo_value, time_value, version = issue, percent_cli = value) %>% # We're using compactify=FALSE here and below to avoid some testthat test # failures on tests that were based on a non-compactified version. - as_epi_archive(compactify=FALSE) + as_epi_archive(compactify = FALSE) -case_rate_subset <- covidcast( - data_source = "jhu-csse", +case_rate_subset <- pub_covidcast( + source = "jhu-csse", signals = "confirmed_7dav_incidence_prop", - time_type = "day", geo_type = "state", - time_values = epirange(20200601, 20211201), + time_type = "day", geo_values = "ca,fl,ny,tx", + time_values = epirange(20200601, 20211201), issues = epirange(20200601, 20211201) ) %>% - fetch() %>% select(geo_value, time_value, version = issue, case_rate_7d_av = value) %>% - as_epi_archive(compactify=FALSE) + as_epi_archive(compactify = FALSE) -archive_cases_dv_subset = epix_merge(dv_subset, case_rate_subset, - sync="locf", - compactify=FALSE) +archive_cases_dv_subset <- epix_merge(dv_subset, case_rate_subset, + sync = "locf", + compactify = FALSE +) # If we directly store an epi_archive R6 object as data, it will store its class # implementation there as well. To prevent mismatches between these stored # implementations and the latest class definition, don't store them as R6 # objects; store the DT and construct the R6 object on request. 
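+# (For example, loading code can rebuild the archive on request with something
+# like `as_epi_archive(archive_cases_dv_subset_dt, compactify = FALSE)`; the
+# exact reconstruction call used by the package may differ.)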
-archive_cases_dv_subset_dt = archive_cases_dv_subset$DT
+archive_cases_dv_subset_dt <- archive_cases_dv_subset$DT
 usethis::use_data(archive_cases_dv_subset_dt, overwrite = TRUE, internal = TRUE)
diff --git a/data-raw/incidence_num_outlier_example.R b/data-raw/incidence_num_outlier_example.R
index 0aea397b..a5cb4d89 100644
--- a/data-raw/incidence_num_outlier_example.R
+++ b/data-raw/incidence_num_outlier_example.R
@@ -3,16 +3,15 @@ library(epiprocess)
 library(dplyr)
 library(tidyr)
 
-incidence_num_outlier_example <- covidcast(
-  data_source = "jhu-csse",
+incidence_num_outlier_example <- pub_covidcast(
+  source = "jhu-csse",
   signals = "confirmed_incidence_num",
-  time_type = "day",
   geo_type = "state",
-  time_values = epirange(20200601, 20210531),
+  time_type = "day",
   geo_values = "fl,nj",
+  time_values = epirange(20200601, 20210531),
   as_of = 20211028
 ) %>%
-  fetch() %>%
   select(geo_value, time_value, cases = value) %>%
   as_epi_df()
diff --git a/data-raw/jhu_csse_county_level_subset.R b/data-raw/jhu_csse_county_level_subset.R
index 61328423..faed75e8 100644
--- a/data-raw/jhu_csse_county_level_subset.R
+++ b/data-raw/jhu_csse_county_level_subset.R
@@ -9,15 +9,14 @@ y <- covidcast::county_census %>%
   select(geo_value = FIPS, county_name = CTYNAME, state_name = STNAME)
 
 # Fetch only counties from Massachusetts and Vermont, then append names columns as well
-jhu_csse_county_level_subset <- covidcast(
-  data_source = "jhu-csse",
+jhu_csse_county_level_subset <- pub_covidcast(
+  source = "jhu-csse",
   signals = "confirmed_incidence_num",
-  time_type = "day",
   geo_type = "county",
-  time_values = epirange(20200601, 20211231),
-  geo_values = paste(y$geo_value, collapse = ",")
+  time_type = "day",
+  geo_values = paste(y$geo_value, collapse = ","),
+  time_values = epirange(20200601, 20211231)
 ) %>%
-  fetch() %>%
   select(geo_value, time_value, cases = value) %>%
   full_join(y, by = "geo_value") %>%
   as_epi_df()
diff --git a/data-raw/jhu_csse_daily_subset.R b/data-raw/jhu_csse_daily_subset.R
index ce94ff2e..14ca85c8 100644
--- a/data-raw/jhu_csse_daily_subset.R
+++ b/data-raw/jhu_csse_daily_subset.R
@@ -2,61 +2,60 @@ library(epidatr)
 library(epiprocess)
 library(dplyr)
 
-confirmed_7dav_incidence_prop <- covidcast(
-  data_source = "jhu-csse",
+confirmed_7dav_incidence_prop <- pub_covidcast(
+  source = "jhu-csse",
   signals = "confirmed_7dav_incidence_prop",
-  time_type = "day",
   geo_type = "state",
-  time_values = epirange(20200301, 20211231),
-  geo_values = "ca,fl,ny,tx,ga,pa"
+  time_type = "day",
+  geo_values = "ca,fl,ny,tx,ga,pa",
+  time_values = epirange(20200301, 20211231)
 ) %>%
-  fetch() %>%
   select(geo_value, time_value, case_rate_7d_av = value) %>%
-  arrange(geo_value, time_value) 
+  arrange(geo_value, time_value)
 
-deaths_7dav_incidence_prop <- covidcast(
-  data_source = "jhu-csse",
+deaths_7dav_incidence_prop <- pub_covidcast(
+  source = "jhu-csse",
   signals = "deaths_7dav_incidence_prop",
-  time_type = "day",
   geo_type = "state",
-  time_values = epirange(20200301, 20211231),
-  geo_values = "ca,fl,ny,tx,ga,pa"
+  time_type = "day",
+  geo_values = "ca,fl,ny,tx,ga,pa",
+  time_values = epirange(20200301, 20211231)
 ) %>%
-  fetch() %>%
   select(geo_value, time_value, death_rate_7d_av = value) %>%
-  arrange(geo_value, time_value) 
+  arrange(geo_value, time_value)
 
-confirmed_incidence_num <- covidcast(
-  data_source = "jhu-csse",
+confirmed_incidence_num <- pub_covidcast(
+  source = "jhu-csse",
   signals = "confirmed_incidence_num",
-  time_type = "day",
   geo_type = "state",
-  time_values = epirange(20200301, 20211231),
-  geo_values = "ca,fl,ny,tx,ga,pa"
+  time_type = "day",
+  geo_values = "ca,fl,ny,tx,ga,pa",
+  time_values = epirange(20200301, 20211231)
 ) %>%
-  fetch() %>%
   select(geo_value, time_value, cases = value) %>%
-  arrange(geo_value, time_value)
+  arrange(geo_value, time_value)
 
-confirmed_7dav_incidence_num <- covidcast(
-  data_source = "jhu-csse",
+confirmed_7dav_incidence_num <- pub_covidcast(
+  source = "jhu-csse",
   signals = "confirmed_7dav_incidence_num",
-  time_type = "day",
   geo_type = "state",
-  time_values = epirange(20200301, 20211231),
-  geo_values = "ca,fl,ny,tx,ga,pa"
+  time_type = "day",
+  geo_values = "ca,fl,ny,tx,ga,pa",
+  time_values = epirange(20200301, 20211231)
 ) %>%
-  fetch() %>%
   select(geo_value, time_value, cases_7d_av = value) %>%
   arrange(geo_value, time_value)
 
 jhu_csse_daily_subset <- confirmed_7dav_incidence_prop %>%
-  full_join(deaths_7dav_incidence_prop,
-            by = c("geo_value", "time_value")) %>%
-  full_join(confirmed_incidence_num,
-            by = c("geo_value", "time_value")) %>%
-  full_join(confirmed_7dav_incidence_num,
-            by = c("geo_value", "time_value")) %>%
-  as_epi_df()
+  full_join(deaths_7dav_incidence_prop,
+    by = c("geo_value", "time_value")
+  ) %>%
+  full_join(confirmed_incidence_num,
+    by = c("geo_value", "time_value")
+  ) %>%
+  full_join(confirmed_7dav_incidence_num,
+    by = c("geo_value", "time_value")
+  ) %>%
+  as_epi_df()
 
 usethis::use_data(jhu_csse_daily_subset, overwrite = TRUE)
diff --git a/vignettes/advanced.Rmd b/vignettes/advanced.Rmd
index 02288905..91f8f37f 100644
--- a/vignettes/advanced.Rmd
+++ b/vignettes/advanced.Rmd
@@ -75,11 +75,11 @@ behavior we demonstrate also carries over to `epix_slide()`.
 
 ## Recycling outputs
 
-When a computation returns a single atomic value, `epi_slide()` will internally 
-try to recycle the output so that it is size stable (in the sense described 
+When a computation returns a single atomic value, `epi_slide()` will internally
+try to recycle the output so that it is size stable (in the sense described
 above). We can use this to our advantage, for example, in order to compute a
 trailing average marginally over geo values, which we demonstrate below in a
-simple synthetic example. 
+simple synthetic example.
 
 ```{r message = FALSE}
 library(epiprocess)
 library(dplyr)
@@ -94,32 +94,32 @@ edf <- tibble(
   as_epi_df()
 
 # 2-day trailing average, per geo value
-edf %>% 
+edf %>%
   group_by(geo_value) %>%
   epi_slide(x_2dav = mean(x), before = 1) %>%
   ungroup()
 
-# 2-day trailing average, marginally 
-edf %>% 
+# 2-day trailing average, marginally
+edf %>%
   epi_slide(x_2dav = mean(x), before = 1)
 ```
 
 ```{r, include = FALSE}
 # More checks (not included)
-edf %>% 
+edf %>%
   epi_slide(x_2dav = mean(x), before = 1, ref_time_values = as.Date("2020-06-02"))
 
-edf %>% 
+edf %>%
   # pretend that observations about time_value t are reported in version t (nowcasts)
-  mutate(version = time_value) %>% 
+  mutate(version = time_value) %>%
   as_epi_archive() %>%
   group_by(geo_value) %>%
   epix_slide(x_2dav = mean(x), before = 1, ref_time_values = as.Date("2020-06-02")) %>%
   ungroup()
 
-edf %>% 
+edf %>%
   # pretend that observations about time_value t are reported in version t (nowcasts)
-  mutate(version = time_value) %>% 
+  mutate(version = time_value) %>%
   as_epi_archive() %>%
   group_by(geo_value) %>%
   epix_slide(~ mean(.x$x), before = 1, ref_time_values = as.Date("2020-06-02")) %>%
@@ -127,35 +127,35 @@ edf %>%
 ```
 
 When the slide computation returns an atomic vector (rather than a single value)
-`epi_slide()` checks whether its return length ensures size stability, and if 
+`epi_slide()` checks whether its return length ensures size stability, and if
 so, uses it to fill the new column. For example, this next computation gives the
 same result as the last one.
```{r}
-edf %>% 
+edf %>%
   epi_slide(y_2dav = rep(mean(x), 3), before = 1)
```

However, if the output is an atomic vector (rather than a single value) and it
-is *not* size stable, then `epi_slide()` throws an error. For example, below we 
+is *not* size stable, then `epi_slide()` throws an error. For example, below we
 are trying to return 2 things for 3 states.

```{r, error = TRUE}
-edf %>% 
+edf %>%
   epi_slide(x_2dav = rep(mean(x), 2), before = 1)
```

## Multi-column outputs

-Now we move on to outputs that are data frames with a single row but multiple 
-columns. Working with this type of output structure has in fact has already been 
+Now we move on to outputs that are data frames with a single row but multiple
+columns. Working with this type of output structure has in fact already been
demonstrated in the [slide vignette](https://cmu-delphi.github.io/epiprocess/articles/slide.html). When
-we set `as_list_col = TRUE` in the call to `epi_slide()`, the resulting `epi_df` 
+we set `as_list_col = TRUE` in the call to `epi_slide()`, the resulting `epi_df`
object returned by `epi_slide()` has a list column containing the slide values.

```{r}
-edf2 <- edf %>% 
+edf2 <- edf %>%
   group_by(geo_value) %>%
   epi_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), 
             before = 1, as_list_col = TRUE) %>%
   ungroup()

class(edf2$a)
length(edf2$a)
edf2$a[[2]]
```

-When we use `as_list_col = FALSE` (the default in `epi_slide()`), the function 
-unnests (in the sense of `tidyr::unnest()`) the list column `a`, so that the 
+When we use `as_list_col = FALSE` (the default in `epi_slide()`), the function
+unnests (in the sense of `tidyr::unnest()`) the list column `a`, so that the
 resulting `epi_df` has multiple new columns containing the slide values. The
-default is to name these unnested columns by prefixing the name assigned to the 
+default is to name these unnested columns by prefixing the name assigned to the
list column (here `a`) onto the column names of the output data frame from the
slide computation (here `x_2dav` and `x_2dma`) separated by "_".

```{r}
-edf %>% 
+edf %>%
   group_by(geo_value) %>%
   epi_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), 
             before = 1, as_list_col = FALSE) %>%
   ungroup()
```

We can use `names_sep = NULL` (which gets passed to `tidyr::unnest()`) to drop
the prefix associated with list column name, in naming the unnested columns.

```{r}
-edf %>% 
+edf %>%
   group_by(geo_value) %>%
   epi_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), 
             before = 1, as_list_col = FALSE, names_sep = NULL) %>%
   ungroup()
```

Furthermore, `epi_slide()` will recycle the single row data frame as needed in
order to make the result size stable, just like the case for atomic values.

```{r}
-edf %>% 
+edf %>%
   epi_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), 
             before = 1, as_list_col = FALSE, names_sep = NULL)
```

```{r, include = FALSE}
# More checks (not included)
-edf %>% 
+edf %>%
   epi_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), 
             ref_time_values = as.Date("2020-06-02"),
             before = 1, as_list_col = FALSE, names_sep = NULL)

-edf %>% 
-  mutate(version = time_value) %>% 
+edf %>%
+  mutate(version = time_value) %>%
   as_epi_archive() %>%
   group_by(geo_value) %>%
   epix_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), 
@@ -220,8 +220,8 @@ edf %>%
 
 ## Multi-row outputs
 
-For a slide computation that outputs a data frame with more than one row, the 
-behavior is analogous to a slide computation that outputs an atomic vector. 
+For a slide computation that outputs a data frame with more than one row, the
+behavior is analogous to a slide computation that outputs an atomic vector.
 Meaning, `epi_slide()` will check that the result is size stable, and if so,
 will fill the new column(s) in the resulting `epi_df` object appropriately.
@@ -241,9 +241,9 @@ edf %>%
     obj <- lm(y ~ x, data = d)
     return(
       as.data.frame(
-      predict(obj, newdata = d %>% 
+      predict(obj, newdata = d %>%
         group_by(geo_value) %>%
-        filter(time_value == max(time_value)), 
+        filter(time_value == max(time_value)),
         interval = "prediction", level = 0.9)
     ))
   }, before = 1, new_col_name = "fc", names_sep = NULL)
@@ -279,24 +279,24 @@ library(ggplot2)
 theme_set(theme_bw())
 
-y1 <- covidcast(
-  data_source = "doctor-visits",
+y1 <- pub_covidcast(
+  source = "doctor-visits",
   signals = "smoothed_adj_cli",
-  time_type = "day",
   geo_type = "state",
-  time_value = epirange(20200601, 20211201),
+  time_type = "day",
   geo_values = "ca,fl",
+  time_values = epirange(20200601, 20211201),
   issues = epirange(20200601, 20211201)
-) %>% fetch()
+)
 
-y2 <- covidcast(
-  data_source = "jhu-csse",
-  signal = "confirmed_7dav_incidence_prop",
+y2 <- pub_covidcast(
+  source = "jhu-csse",
+  signals = "confirmed_7dav_incidence_prop",
-  time_type = "day",
   geo_type = "state",
-  time_value = epirange(20200601, 20211201),
+  time_type = "day",
   geo_values = "ca,fl",
+  time_values = epirange(20200601, 20211201),
   issues = epirange(20200601, 20211201)
-) %>% fetch()
+)
 
 x <- y1 %>%
   select(geo_value, time_value,
@@ -328,48 +328,48 @@ x <- archive_cases_dv_subset$DT %>%
 ```
 
-Next, we extend the ARX function to handle multiple geo values, since in the 
+Next, we extend the ARX function to handle multiple geo values, since in the
 present case, we will not be grouping by geo value and each slide computation
-will be run on multiple geo values at once. Note that, because `epix_slide()` 
+will be run on multiple geo values at once. Note that, because `epix_slide()`
 only returns the grouping variables, `time_value`, and the slide computations in
-the eventual returned tibble, we need to include `geo_value` as a column in the 
+the eventual returned tibble, we need to include `geo_value` as a column in the
 output data frame from our ARX computation.
 
 ```{r}
 library(tidyr)
 library(purrr)
 
-prob_arx_args <- function(lags = c(0, 7, 14), 
-                          ahead = 7, 
+prob_arx_args <- function(lags = c(0, 7, 14),
+                          ahead = 7,
                           min_train_window = 20,
-                          lower_level = 0.05, 
-                          upper_level = 0.95, 
-                          symmetrize = TRUE, 
+                          lower_level = 0.05,
+                          upper_level = 0.95,
+                          symmetrize = TRUE,
                           intercept = FALSE,
                           nonneg = TRUE) {
-  return(list(lags = lags, 
-              ahead = ahead, 
+  return(list(lags = lags,
+              ahead = ahead,
               min_train_window = min_train_window,
              lower_level = lower_level,
             upper_level = upper_level,
-              symmetrize = symmetrize, 
+              symmetrize = symmetrize,
             intercept = intercept,
            nonneg = nonneg))
 }
 
-prob_arx <- function(x, y, geo_value, time_value, args = prob_arx_args()) { 
+prob_arx <- function(x, y, geo_value, time_value, args = prob_arx_args()) {
   # Return NA if insufficient training data
   if (length(y) < args$min_train_window + max(args$lags) + args$ahead) {
     return(data.frame(geo_value = unique(geo_value), # Return geo value!
                       point = NA, lower = NA, upper = NA))
   }
-  
+
   # Set up x, y, lags list
   if (!missing(x)) x <- data.frame(x, y)
   else x <- data.frame(y)
   if (!is.list(args$lags)) args$lags <- list(args$lags)
   args$lags = rep(args$lags, length.out = ncol(x))
-  
+
   # Build features and response for the AR model, and then fit it
   dat <-
     tibble(i = 1:ncol(x), lag = args$lags) %>%
@@ -378,43 +378,43 @@ prob_arx <- function(x, y, geo_value, time_value, args = prob_arx_args()) {
       # One list element for each lagged feature
       pmap(function(i, lag, name) {
         tibble(geo_value = geo_value,
-               time_value = time_value + lag, # Shift back 
+               time_value = time_value + lag, # Shift back
                !!name := x[,i])
-      }) %>% 
+      }) %>%
       # One list element for the response vector
       c(list(
         tibble(geo_value = geo_value,
-               time_value = time_value - args$ahead, # Shift forward 
+               time_value = time_value - args$ahead, # Shift forward
                y = y))) %>%
       # Combine them together into one data frame
       reduce(full_join, by = c("geo_value", "time_value")) %>%
       arrange(time_value)
   if (args$intercept) dat$x0 = rep(1, nrow(dat))
   obj <- lm(y ~ . + 0, data = select(dat, -geo_value, -time_value))
-  
+
   # Use LOCF to fill NAs in the latest feature values (do this by geo value)
   setDT(dat) # Convert to a data.table object by reference
   cols <- setdiff(names(dat), c("geo_value", "time_value"))
   dat[, (cols) := nafill(.SD, type = "locf"), .SDcols = cols, by = "geo_value"]
-  
+
   # Make predictions
   test_time_value = max(time_value)
-  point <- predict(obj, newdata = dat %>% 
+  point <- predict(obj, newdata = dat %>%
                      dplyr::group_by(geo_value) %>%
                      dplyr::filter(time_value == test_time_value))
-  
+
   # Compute bands
   r <- residuals(obj)
   s <- ifelse(args$symmetrize, -1, NA) # Should the residuals be symmetrized?
   q <- quantile(c(r, s * r), probs = c(args$lower, args$upper), na.rm = TRUE)
   lower <- point + q[1]
   upper <- point + q[2]
-  
+
   # Clip at zero if we need to, then return
-  if (args$nonneg) { 
-    point = pmax(point, 0) 
-    lower = pmax(lower, 0) 
-    upper = pmax(upper, 0) 
+  if (args$nonneg) {
+    point <- pmax(point, 0)
+    lower <- pmax(lower, 0)
+    upper <- pmax(upper, 0)
   }
   return(data.frame(geo_value = unique(geo_value), # Return geo value!
                     point = point, lower = lower, upper = upper))
@@ -422,31 +422,31 @@ prob_arx <- function(x, y, geo_value, time_value, args = prob_arx_args()) {
 ```
 
 We now make forecasts on the archive and compare to forecasts on the latest
-data. 
+data.
```{r, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 6} # Latest snapshot of data, and forecast dates x_latest <- epix_as_of(x, max_version = max(x$DT$version)) -fc_time_values <- seq(as.Date("2020-08-01"), - as.Date("2021-11-30"), +fc_time_values <- seq(as.Date("2020-08-01"), + as.Date("2021-11-30"), by = "1 month") # Simple function to produce forecasts k weeks ahead k_week_ahead <- function(x, ahead = 7, as_of = TRUE) { if (as_of) { x %>% - epix_slide(fc = prob_arx(percent_cli, case_rate_7d_av, geo_value, time_value, - args = prob_arx_args(ahead = ahead)), + epix_slide(fc = prob_arx(percent_cli, case_rate_7d_av, geo_value, time_value, + args = prob_arx_args(ahead = ahead)), before = 119, ref_time_values = fc_time_values) %>% - mutate(target_date = time_value + ahead, as_of = TRUE, + mutate(target_date = time_value + ahead, as_of = TRUE, geo_value = fc_geo_value) } else { - x_latest %>% - epi_slide(fc = prob_arx(percent_cli, case_rate_7d_av, geo_value, time_value, - args = prob_arx_args(ahead = ahead)), + x_latest %>% + epi_slide(fc = prob_arx(percent_cli, case_rate_7d_av, geo_value, time_value, + args = prob_arx_args(ahead = ahead)), before = 119, ref_time_values = fc_time_values) %>% - mutate(target_date = time_value + ahead, as_of = FALSE) + mutate(target_date = time_value + ahead, as_of = FALSE) } } @@ -460,22 +460,22 @@ fc <- bind_rows(k_week_ahead(x, ahead = 7, as_of = TRUE), k_week_ahead(x, ahead = 21, as_of = FALSE), k_week_ahead(x, ahead = 28, as_of = FALSE)) -# Plot them, on top of latest COVID-19 case rates +# Plot them, on top of latest COVID-19 case rates ggplot(fc, aes(x = target_date, group = time_value, fill = as_of)) + geom_ribbon(aes(ymin = fc_lower, ymax = fc_upper), alpha = 0.4) + - geom_line(data = x_latest, aes(x = time_value, y = case_rate_7d_av), + geom_line(data = x_latest, aes(x = time_value, y = case_rate_7d_av), inherit.aes = FALSE, color = "gray50") + - geom_line(aes(y = fc_point)) + geom_point(aes(y = fc_point), size = 0.5) + + geom_line(aes(y = fc_point)) + geom_point(aes(y = fc_point), size = 0.5) + geom_vline(aes(xintercept = time_value), linetype = 2, alpha = 0.5) + facet_grid(vars(geo_value), vars(as_of), scales = "free") + scale_x_date(minor_breaks = "month", date_labels = "%b %y") + - labs(x = "Date", y = "Reported COVID-19 case rates") + - theme(legend.position = "none") + labs(x = "Date", y = "Reported COVID-19 case rates") + + theme(legend.position = "none") ``` We can see that these forecasts, which come from training an ARX model jointly over CA and FL, exhibit generally less variability and wider prediction bands -compared to the ones from the archive vignette, which come from training a +compared to the ones from the archive vignette, which come from training a separate ARX model on each state. As in the archive vignette, we can see a difference between version-aware (right column) and -unaware (left column) forecasting, as well. diff --git a/vignettes/aggregation.Rmd b/vignettes/aggregation.Rmd index bdf6279e..205ed084 100644 --- a/vignettes/aggregation.Rmd +++ b/vignettes/aggregation.Rmd @@ -9,7 +9,7 @@ vignette: > Aggregation, both time-wise and geo-wise, are common tasks when working with epidemiological data sets. This vignette demonstrates how to carry out these -kinds of tasks with `epi_df` objects. We'll work with county-level reported +kinds of tasks with `epi_df` objects. We'll work with county-level reported COVID-19 cases in MA and VT. 
```{r, message = FALSE, eval = FALSE, warning = FALSE}
library(epidatr)
library(epiprocess)
library(dplyr)
@@ -24,15 +24,14 @@ y <- covidcast::county_census %>%
   select(geo_value = FIPS, county_name = CTYNAME, state_name = STNAME)
 
 # Fetch only counties from Massachusetts and Vermont, then append names columns as well
-x <- covidcast(
-  data_source = "jhu-csse",
+x <- pub_covidcast(
+  source = "jhu-csse",
   signals = "confirmed_incidence_num",
-  time_type = "day",
   geo_type = "county",
-  time_values = epirange(20200601, 20211231),
-  geo_values = paste(y$geo_value, collapse = ",")
+  time_type = "day",
+  geo_values = paste(y$geo_value, collapse = ","),
+  time_values = epirange(20200601, 20211231)
 ) %>%
-  fetch() %>%
   select(geo_value, time_value, cases = value) %>%
   full_join(y, by = "geo_value") %>%
   as_epi_df()
@@ -47,7 +46,7 @@ library(epiprocess)
 library(dplyr)
 
 data(jhu_csse_county_level_subset)
-x <- jhu_csse_county_level_subset 
+x <- jhu_csse_county_level_subset
 ```
 
 ## Converting to `tsibble` format
@@ -59,7 +58,7 @@ basically a tibble (data frame) but with two specially-marked columns: an
 **index** column representing the time variable (defining an order from past to
 present), and a **key** column identifying a unique observational unit for each
 time point. In fact, the key can be made up of any number of columns, not just a
-single one. 
+single one.
 
 In an `epi_df` object, the index variable is `time_value`, and the key variable
 is typically `geo_value` (though this need not always be the case: for example,
@@ -101,7 +100,7 @@ head(as_tsibble(x, key = c("county_name", "state_name")))
 ```
 
 ## Detecting and filling time gaps
- 
+
 One of the major advantages of the `tsibble` package is its ability to handle
 **implicit gaps** in time series data. In other words, it can infer what time
 scale we're interested in (say, daily data), and detect apparent gaps (say, when
@@ -113,13 +112,15 @@ Let's first remove certain dates from our data set to create gaps:
 
 ```{r}
 # First make geo value more readable for tables, plots, etc.
-x <- x %>% 
-  mutate(geo_value = paste(
-    substr(county_name, 1, nchar(county_name) - 7),
-    name_to_abbr(state_name), sep = ", ")) %>%
-  select(geo_value, time_value, cases)
-
-xt <- as_tsibble(x) %>% filter(cases >= 3) 
+x <- x %>%
+  mutate(geo_value = paste(
+    substr(county_name, 1, nchar(county_name) - 7),
+    name_to_abbr(state_name),
+    sep = ", "
+  )) %>%
+  select(geo_value, time_value, cases)
+
+xt <- as_tsibble(x) %>% filter(cases >= 3)
 ```
 
 The functions `has_gaps()`, `scan_gaps()`, `count_gaps()` in the `tsibble`
@@ -131,20 +132,24 @@ head(scan_gaps(xt))
 head(count_gaps(xt))
 ```
 
-We can also visualize the patterns of missingness: 
+We can also visualize the patterns of missingness:
 
 ```{r, message = FALSE, warning = FALSE}
 library(ggplot2)
 theme_set(theme_bw())
 
-ggplot(count_gaps(xt), 
-       aes(x = reorder(geo_value, desc(geo_value)), 
-           color = geo_value)) + 
-  geom_linerange(aes(ymin = .from, ymax = .to)) + 
+ggplot(
+  count_gaps(xt),
+  aes(
+    x = reorder(geo_value, desc(geo_value)),
+    color = geo_value
+  )
+) +
+  geom_linerange(aes(ymin = .from, ymax = .to)) +
   geom_point(aes(y = .from)) +
-  geom_point(aes(y = .to)) + 
-  coord_flip() + 
-  labs(x = "County", y = "Date") + 
+  geom_point(aes(y = .to)) +
+  coord_flip() +
+  labs(x = "County", y = "Date") +
   theme(legend.position = "none")
 ```
 
@@ -168,8 +173,8 @@ went back to June 6, 2020. By setting `.full = TRUE`, we can
zero-fill over the entire span of the observed (censored) data.
```{r} -xt_filled <- fill_gaps(xt, cases = 0, .full = TRUE) - +xt_filled <- fill_gaps(xt, cases = 0, .full = TRUE) + head(xt_filled) ``` @@ -186,45 +191,49 @@ running `epi_slide()` on the zero-filled data brings these trailing averages 2021. ```{r} -xt %>% +xt %>% as_epi_df() %>% group_by(geo_value) %>% epi_slide(cases_7dav = mean(cases), before = 6) %>% ungroup() %>% - filter(geo_value == "Plymouth, MA", - abs(time_value - as.Date("2021-07-01")) <= 3) %>% + filter( + geo_value == "Plymouth, MA", + abs(time_value - as.Date("2021-07-01")) <= 3 + ) %>% print(n = 7) - -xt_filled %>% + +xt_filled %>% as_epi_df() %>% group_by(geo_value) %>% epi_slide(cases_7dav = mean(cases), before = 6) %>% ungroup() %>% - filter(geo_value == "Plymouth, MA", - abs(time_value - as.Date("2021-07-01")) <= 3) %>% + filter( + geo_value == "Plymouth, MA", + abs(time_value - as.Date("2021-07-01")) <= 3 + ) %>% print(n = 7) ``` -## Aggregate to different time scales +## Aggregate to different time scales Continuing on with useful `tsibble` functionality, we can aggregate to different -time scales using `index_by()` from `tsibble`, which modifies the index variable +time scales using `index_by()` from `tsibble`, which modifies the index variable in the given object by applying a suitable time-coarsening transformation (say, -moving from days to weeks, or weeks to months, and so on). The most common use -case would be to follow up with a call to a `dplyr` verb like `summarize()` in +moving from days to weeks, or weeks to months, and so on). The most common use +case would be to follow up with a call to a `dplyr` verb like `summarize()` in order to perform some kind of aggregation of our measured variables over the new index variable. -Below, we use the functions `yearweek()` and `yearmonth()` that are provided in -the `tsibble` package in order to aggregate to weekly and monthly resolutions. -In the former call, we set `week_start = 7` to coincide with the CDC definition +Below, we use the functions `yearweek()` and `yearmonth()` that are provided in +the `tsibble` package in order to aggregate to weekly and monthly resolutions. +In the former call, we set `week_start = 7` to coincide with the CDC definition of an epiweek (epidemiological week). ```{r} # Aggregate to weekly xt_filled_week <- xt_filled %>% index_by(epiweek = ~ yearweek(., week_start = 7)) %>% - group_by(geo_value) %>% + group_by(geo_value) %>% summarize(cases = sum(cases, na.rm = TRUE)) head(xt_filled_week) @@ -232,19 +241,19 @@ head(xt_filled_week) # Aggregate to monthly xt_filled_month <- xt_filled_week %>% index_by(month = ~ yearmonth(.)) %>% - group_by(geo_value) %>% - summarize(cases = sum(cases, na.rm = TRUE)) + group_by(geo_value) %>% + summarize(cases = sum(cases, na.rm = TRUE)) head(xt_filled_month) ``` -## Geographic aggregation +## Geographic aggregation TODO ## Attribution This document contains a dataset that is a modified part of the [COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19) as [republished in the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html). This data set is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/) by the Johns Hopkins University on behalf of its Center for Systems Science in Engineering. Copyright Johns Hopkins University 2020. 
-[From the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html): - These signals are taken directly from the JHU CSSE [COVID-19 GitHub repository](https://github.com/CSSEGISandData/COVID-19) without changes. +[From the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html): + These signals are taken directly from the JHU CSSE [COVID-19 GitHub repository](https://github.com/CSSEGISandData/COVID-19) without changes. diff --git a/vignettes/archive.Rmd b/vignettes/archive.Rmd index f14c8663..b351d684 100644 --- a/vignettes/archive.Rmd +++ b/vignettes/archive.Rmd @@ -33,16 +33,15 @@ library(dplyr) library(purrr) library(ggplot2) -dv <- covidcast( - data_source = "doctor-visits", +dv <- pub_covidcast( + source = "doctor-visits", signals = "smoothed_adj_cli", - time_type = "day", geo_type = "state", - time_values = epirange(20200601, 20211201), + time_type = "day", geo_values = "ca,fl,ny,tx", + time_values = epirange(20200601, 20211201), issues = epirange(20200601, 20211201) -) %>% fetch() - +) ``` ```{r, echo=FALSE, message=FALSE, warning=FALSE} @@ -68,16 +67,16 @@ has (at least) the following columns: the data for January 14, 2022 that were available one day later. As we can see from the above, the data frame returned by -`epidatr::covidcast()` has the columns required for the `epi_archive` +`epidatr::pub_covidcast()` has the columns required for the `epi_archive` format, with `issue` playing the role of `version`. We can now use `as_epi_archive()` to bring it into `epi_archive` format. For removal of -redundant version updates in `as_epi_archive` using compactify, please refer -to the compactify vignette. +redundant version updates in `as_epi_archive` using compactify, please refer to +the compactify vignette. ```{r, eval=FALSE} x <- dv %>% select(geo_value, time_value, version = issue, percent_cli = value) %>% - as_epi_archive(compactify=TRUE) + as_epi_archive(compactify = TRUE) class(x) print(x) @@ -85,8 +84,8 @@ print(x) ```{r, echo=FALSE, message=FALSE, warning=FALSE} x <- archive_cases_dv_subset$DT %>% - select(geo_value, time_value, version , percent_cli) %>% - as_epi_archive(compactify=TRUE) + select(geo_value, time_value, version, percent_cli) %>% + as_epi_archive(compactify = TRUE) class(x) print(x) @@ -107,22 +106,22 @@ for the data table, as well as any other specified in the metadata (described below). There can only be a single row per unique combination of key variables, and therefore the key variables are critical for figuring out how to generate a snapshot of data from the archive, as of a given version (also described below). - + ```{r, error=TRUE} key(x$DT) ``` - + In general, the last version of each observation is carried forward (LOCF) to fill in data between recorded versions. **A word of caution:** R6 objects, unlike most other objects in R, have reference semantics. An important consequence of this is that objects are not copied when modified. - + ```{r} original_value <- x$DT$percent_cli[1] y <- x # This DOES NOT make a copy of x -y$DT$percent_cli[1] = 0 +y$DT$percent_cli[1] <- 0 head(y$DT) -head(x$DT) +head(x$DT) x$DT$percent_cli[1] <- original_value ``` @@ -133,7 +132,7 @@ x$clone()`. You can read more about reference semantics in Hadley Wickham's ## Some details on metadata The following pieces of metadata are included as fields in an `epi_archive` -object: +object: * `geo_type`: the type for the geo values. * `time_type`: the type for the time values. 
@@ -150,7 +149,7 @@ call (as it did in the case above). A key method of an `epi_archive` class is `as_of()`, which generates a snapshot of the archive in `epi_df` format. This represents the most up-to-date values of the signal variables as of a given version. This can be accessed via `x$as_of()` -for an `epi_archive` object `x`, but the package also provides a simple wrapper +for an `epi_archive` object `x`, but the package also provides a simple wrapper function `epix_as_of()` since this is likely a more familiar interface for users not familiar with R6 (or object-oriented programming). @@ -162,7 +161,7 @@ max(x_snapshot$time_value) attributes(x_snapshot)$metadata$as_of ``` -We can see that the max time value in the `epi_df` object `x_snapshot` that was +We can see that the max time value in the `epi_df` object `x_snapshot` that was generated from the archive is May 29, 2021, even though the specified version date was June 1, 2021. From this we can infer that the doctor's visits signal was 2 days latent on June 1. Also, we can see that the metadata in the `epi_df` @@ -171,7 +170,7 @@ object has the version date recorded in the `as_of` field. By default, using the maximum of the `version` column in the underlying data table in an `epi_archive` object itself generates a snapshot of the latest values of signal variables in the entire archive. The `epix_as_of()` function issues a warning in -this case, since updates to the current version may still come in at a later +this case, since updates to the current version may still come in at a later point in time, due to various reasons, such as synchronization issues. ```{r} @@ -180,44 +179,48 @@ x_latest <- epix_as_of(x, max_version = max(x$DT$version)) Below, we pull several snapshots from the archive, spaced one month apart. We overlay the corresponding signal curves as colored lines, with the version dates -marked by dotted vertical lines, and draw the latest curve in black (from the +marked by dotted vertical lines, and draw the latest curve in black (from the latest snapshot `x_latest` that the archive can provide). 
```{r, fig.width = 8, fig.height = 7}
theme_set(theme_bw())

-self_max = max(x$DT$version)
-versions = seq(as.Date("2020-06-01"), self_max - 1, by = "1 month")
-snapshots <- map_dfr(versions, function(v) { 
+self_max <- max(x$DT$version)
+versions <- seq(as.Date("2020-06-01"), self_max - 1, by = "1 month")
+snapshots <- map_dfr(versions, function(v) {
   epix_as_of(x, max_version = v) %>% mutate(version = v)
 }) %>%
   bind_rows(x_latest %>% mutate(version = self_max)) %>%
   mutate(latest = version == self_max)

-ggplot(snapshots %>% filter(!latest),
-       aes(x = time_value, y = percent_cli)) +  
-  geom_line(aes(color = factor(version)), na.rm=TRUE) + 
+ggplot(
+  snapshots %>% filter(!latest),
+  aes(x = time_value, y = percent_cli)
+) +
+  geom_line(aes(color = factor(version)), na.rm = TRUE) +
   geom_vline(aes(color = factor(version), xintercept = version), lty = 2) +
-  facet_wrap(~ geo_value, scales = "free_y", ncol = 1) +
+  facet_wrap(~geo_value, scales = "free_y", ncol = 1) +
   scale_x_date(minor_breaks = "month", date_labels = "%b %y") +
-  labs(x = "Date", y = "% of doctor's visits with CLI") + 
+  labs(x = "Date", y = "% of doctor's visits with CLI") +
   theme(legend.position = "none") +
-  geom_line(data = snapshots %>% filter(latest),
-            aes(x = time_value, y = percent_cli),
-            inherit.aes = FALSE, color = "black", na.rm=TRUE)
+  geom_line(
+    data = snapshots %>% filter(latest),
+    aes(x = time_value, y = percent_cli),
+    inherit.aes = FALSE, color = "black", na.rm = TRUE
+  )
 ```

We can see some interesting and highly nontrivial revision behavior: at some
points in time the provisional data snapshots grossly underestimate the latest
curve (look in particular at Florida close to the end of 2021), and at others
-they overestimate it (both states towards the beginning of 2021), though not 
+they overestimate it (both states towards the beginning of 2021), though not
quite as dramatically. Modeling the revision process, which is often called
*backfill modeling*, is an important statistical problem in and of itself.

-## Merging `epi_archive` objects 
+## Merging `epi_archive` objects

Now we demonstrate how to merge two `epi_archive` objects together, e.g., so
that grabbing data from multiple sources as of a particular version can be
@@ -243,20 +246,19 @@ When merging archives, unless the archives have identical data release patterns,
 the other).
 
 ```{r, message = FALSE, warning = FALSE, eval = FALSE}
-y <- covidcast(
-  data_source = "jhu-csse",
+y <- pub_covidcast(
+  source = "jhu-csse",
   signals = "confirmed_7dav_incidence_prop",
-  time_type = "day",
   geo_type = "state",
-  time_values = epirange(20200601, 20211201),
+  time_type = "day",
   geo_values = "ca,fl,ny,tx",
+  time_values = epirange(20200601, 20211201),
   issues = epirange(20200601, 20211201)
 ) %>%
-  fetch() %>%
   select(geo_value, time_value, version = issue, case_rate_7d_av = value) %>%
-  as_epi_archive(compactify=TRUE)
+  as_epi_archive(compactify = TRUE)
 
-x$merge(y, sync="locf", compactify=FALSE)
+x$merge(y, sync = "locf", compactify = FALSE)
 print(x)
 head(x$DT)
 ```
@@ -273,70 +275,75 @@ documentation for either for more detailed descriptions of what mutation,
 pointer aliasing, and pointer reseating is possible.
 
 ## Sliding version-aware computations
- 
+
 Lastly, we demonstrate another key method of the `epi_archive` class, which is
 the `slide()` method. It works just like `epi_slide()` does for an `epi_df`
 object, but with one key difference: it performs version-aware computations.
That is, for the computation at any given reference time t, it only uses **data
that would have been available as of t**. The wrapper function is called
-`epix_slide()`; again, this is just for convenience/familiarity---and its 
+`epix_slide()`; again, this is just for convenience/familiarity---and its
interface is purposely designed to mirror that of `epi_slide()` for `epi_df`
objects.

-For the demonstration, we'll revisit the forecasting example from the [slide 
+For the demonstration, we'll revisit the forecasting example from the [slide
vignette](https://cmu-delphi.github.io/epiprocess/articles/slide.html), and now
we'll build a forecaster that uses properly-versioned data (that would have been
available in real-time) to forecast future COVID-19 case rates from current and
-past COVID-19 case rates, as well as current and past values of the outpatient 
+past COVID-19 case rates, as well as current and past values of the outpatient
CLI signal from medical claims. We'll extend the `prob_ar()` function from the
slide vignette to accommodate exogenous variables in the autoregressive model,
-which is often referred to as an ARX model. 
+which is often referred to as an ARX model.

```{r}
prob_arx <- function(x, y, lags = c(0, 7, 14), ahead = 7, min_train_window = 20,
-                    lower_level = 0.05, upper_level = 0.95, symmetrize = TRUE, 
-                    intercept = FALSE, nonneg = TRUE) { 
+                     lower_level = 0.05, upper_level = 0.95, symmetrize = TRUE,
+                     intercept = FALSE, nonneg = TRUE) {
   # Return NA if insufficient training data
   if (length(y) < min_train_window + max(lags) + ahead) {
     return(data.frame(point = NA, lower = NA, upper = NA))
   }
-  
+
   # Useful transformations
-  if (!missing(x)) x <- data.frame(x, y)
-  else x <- data.frame(y)
+  if (!missing(x)) {
+    x <- data.frame(x, y)
+  } else {
+    x <- data.frame(y)
+  }
   if (!is.list(lags)) lags <- list(lags)
-  lags = rep(lags, length.out = ncol(x))
+  lags <- rep(lags, length.out = ncol(x))
-  
+
   # Build features and response for the AR model, and then fit it
   dat <- do.call(
-    data.frame, 
+    data.frame,
     unlist( # Below we loop through and build the lagged features
-      purrr::map(1:ncol(x), function(i) { 
-        purrr::map(lags[[i]], function(j) lag(x[,i], n = j))
+      purrr::map(1:ncol(x), function(i) {
+        purrr::map(lags[[i]], function(j) lag(x[, i], n = j))
      }),
-      recursive = FALSE))
-  names(dat) = paste0("x", 1:ncol(dat))
-  if (intercept) dat$x0 = rep(1, nrow(dat))
-  dat$y <- lead(y, n = ahead) 
+      recursive = FALSE
+    )
+  )
+  names(dat) <- paste0("x", 1:ncol(dat))
+  if (intercept) dat$x0 <- rep(1, nrow(dat))
+  dat$y <- lead(y, n = ahead)
   obj <- lm(y ~ . + 0, data = dat)
-  
+
   # Use LOCF to fill NAs in the latest feature values, make a prediction
-  setDT(dat) 
+  setDT(dat)
   setnafill(dat, type = "locf")
   point <- predict(obj, newdata = tail(dat, 1))
-  
-  # Compute a band 
+
+  # Compute a band
   r <- residuals(obj)
   s <- ifelse(symmetrize, -1, NA) # Should the residuals be symmetrized?
   q <- quantile(c(r, s * r), probs = c(lower_level, upper_level), na.rm = TRUE)
   lower <- point + q[1]
   upper <- point + q[2]
-  
+
   # Clip at zero if we need to, then return
-  if (nonneg) { 
-    point = max(point, 0) 
-    lower = max(lower, 0) 
-    upper = max(upper, 0) 
+  if (nonneg) {
+    point <- max(point, 0)
+    lower <- max(lower, 0)
+    upper <- max(upper, 0)
   }
   return(data.frame(point = point, lower = lower, upper = upper))
 }
 ```
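+
+As a quick, purely illustrative sanity check, we can call this forecaster
+directly on a short synthetic series (the toy series and settings below are
+made up for demonstration):
+
+```{r}
+set.seed(1)
+y_toy <- 20 + (1:60) / 3 + rnorm(60, sd = 0.2)
+prob_arx(y = y_toy, lags = c(0, 7), ahead = 7)
+```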
 
Next we slide this forecaster over the working `epi_archive` object, in order to
forecast COVID-19 case rates 7 days into the future.

```{r}
-fc_time_values <- seq(as.Date("2020-08-01"), 
-                      as.Date("2021-11-30"), 
-                      by = "1 month")
+fc_time_values <- seq(as.Date("2020-08-01"),
+  as.Date("2021-11-30"),
+  by = "1 month"
+)
 
 z <- x %>%
   group_by(geo_value) %>%
-  epix_slide(fc = prob_arx(x = percent_cli, y = case_rate_7d_av), before = 119,
-             ref_time_values = fc_time_values) %>%
+  epix_slide(
+    fc = prob_arx(x = percent_cli, y = case_rate_7d_av), before = 119,
+    ref_time_values = fc_time_values
+  ) %>%
   ungroup()
 
 head(z, 10)
 ```

On the whole, `epix_slide()` works similarly to `epi_slide()`, though there
are a few notable differences, even apart from the version-aware aspect. You
can read the documentation for `epix_slide()` for details.

-We finish off by comparing version-aware and -unaware forecasts at various 
+We finish off by comparing version-aware and -unaware forecasts at various
points in time and forecast horizons. The former comes from using
`epix_slide()` with the `epi_archive` object `x`, and the latter from applying
`epi_slide()` to the latest snapshot of the data `x_latest`.

```{r, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 6}
x_latest <- epix_as_of(x, max_version = max(x$DT$version))

# Simple function to produce forecasts k weeks ahead
k_week_ahead <- function(x, ahead = 7, as_of = TRUE) {
  if (as_of) {
    x %>%
      group_by(geo_value) %>%
-      epix_slide(fc = prob_arx(percent_cli, case_rate_7d_av, ahead = ahead), before = 119,
-                 ref_time_values = fc_time_values) %>%
+      epix_slide(
+        fc = prob_arx(percent_cli, case_rate_7d_av, ahead = ahead), before = 119,
+        ref_time_values = fc_time_values
+      ) %>%
       mutate(target_date = time_value + ahead, as_of = TRUE) %>%
       ungroup()
-  }
-  else {
-    x_latest %>% 
+  } else {
+    x_latest %>%
       group_by(geo_value) %>%
-      epi_slide(fc = prob_arx(percent_cli, case_rate_7d_av, ahead = ahead), before = 119,
-                ref_time_values = fc_time_values) %>%
+      epi_slide(
+        fc = prob_arx(percent_cli, case_rate_7d_av, ahead = ahead), before = 119,
+        ref_time_values = fc_time_values
+      ) %>%
       mutate(target_date = time_value + ahead, as_of = FALSE) %>%
       ungroup()
   }
 }
 
 # Generate the forecasts, and bind them together
-fc <- bind_rows(k_week_ahead(x, ahead = 7, as_of = TRUE),
-                k_week_ahead(x, ahead = 14, as_of = TRUE),
-                k_week_ahead(x, ahead = 21, as_of = TRUE),
-                k_week_ahead(x, ahead = 28, as_of = TRUE),
-                k_week_ahead(x, ahead = 7, as_of = FALSE),
-                k_week_ahead(x, ahead = 14, as_of = FALSE),
-                k_week_ahead(x, ahead = 21, as_of = FALSE),
-                k_week_ahead(x, ahead = 28, as_of = FALSE))
-
-# Plot them, on top of latest COVID-19 case rates 
+fc <- bind_rows(
+  k_week_ahead(x, ahead = 7, as_of = TRUE),
+  k_week_ahead(x, ahead = 14, as_of = TRUE),
+  k_week_ahead(x, ahead = 21, as_of = TRUE),
+  k_week_ahead(x, ahead = 28, as_of = TRUE),
+  k_week_ahead(x, ahead = 7, as_of = FALSE),
+  k_week_ahead(x, ahead = 14, as_of = FALSE),
+  k_week_ahead(x, ahead = 21, as_of = FALSE),
+  k_week_ahead(x, ahead = 28, as_of = FALSE)
+)
+
+# Plot them, on top of latest COVID-19 case rates
 ggplot(fc, aes(x = target_date, group = time_value, fill = as_of)) +
   geom_ribbon(aes(ymin = fc_lower, ymax = fc_upper), alpha = 0.4) +
-  geom_line(data = x_latest, aes(x = time_value, y = case_rate_7d_av), 
-            inherit.aes = FALSE, color = "gray50") +
-  geom_line(aes(y = fc_point)) + geom_point(aes(y = fc_point), size = 0.5) + 
+  geom_line(
+    data = x_latest, aes(x = time_value, y = case_rate_7d_av),
+    inherit.aes = FALSE, color = "gray50"
+  ) +
+  geom_line(aes(y = fc_point)) +
+  geom_point(aes(y = fc_point), size = 0.5) +
   geom_vline(aes(xintercept = time_value), linetype = 2, alpha = 0.5) +
   facet_grid(vars(geo_value), vars(as_of), scales = "free") +
   scale_x_date(minor_breaks = "month", date_labels = "%b %y") +
-  labs(x = "Date", y = "Reported COVID-19 case rates") + 
-  theme(legend.position = "none")
+  labs(x = "Date", y = "Reported COVID-19 case rates") +
+  theme(legend.position = "none")
 ```

Each row displays the forecasts for a different location (CA, FL, NY, and TX), and each
diff --git a/vignettes/correlation.Rmd b/vignettes/correlation.Rmd
index 0d357402..34e8c0f0 100644
--- a/vignettes/correlation.Rmd
+++ b/vignettes/correlation.Rmd
@@ -23,26 +23,24 @@ library(dplyr)
 The data is fetched with the following query:
 
 ```{r, message = FALSE}
-x <- covidcast(
-  data_source = "jhu-csse",
+x <- pub_covidcast(
+  source = "jhu-csse",
   signals = "confirmed_7dav_incidence_prop",
-  time_type = "day",
   geo_type = "state",
-  time_values = epirange(20200301, 20211231),
-  geo_values = "*"
+  time_type = "day",
+  geo_values = "*",
+  time_values = epirange(20200301, 20211231)
 ) %>%
-  fetch() %>%
   select(geo_value, time_value, case_rate = value)
 
-y <- covidcast(
-  data_source = "jhu-csse",
+y <- pub_covidcast(
+  source = "jhu-csse",
   signals = "deaths_7dav_incidence_prop",
-  time_type = "day",
   geo_type = "state",
-  time_values = epirange(20200301, 20211231),
-  geo_values = "*"
+  time_type = "day",
+  geo_values = "*",
+  time_values = epirange(20200301, 20211231)
 ) %>%
-  fetch() %>%
   select(geo_value, time_value, death_rate = value)
 
 x <- x %>%
@@ -68,7 +66,7 @@ theme_set(theme_bw())
 
 z1 <- epi_cor(x, case_rate, death_rate, cor_by = "time_value")
 
-ggplot(z1, aes(x = time_value, y = cor)) + 
+ggplot(z1, aes(x = time_value, y = cor)) +
   geom_line() +
   scale_x_date(minor_breaks = "month", date_labels = "%b %y") +
   labs(x = "Date", y = "Correlation")
@@ -95,14 +93,16 @@ correlated with case rates at an offset of -10 days.)
 
 ```{r, message = FALSE, warning = FALSE}
 z2 <- epi_cor(x, case_rate, death_rate, cor_by = time_value, dt1 = -10)
 
-z <- rbind(z1 %>% mutate(lag = 0), 
-           z2 %>% mutate(lag = 10)) %>%
+z <- rbind(
+  z1 %>% mutate(lag = 0),
+  z2 %>% mutate(lag = 10)
+) %>%
   mutate(lag = as.factor(lag))
 
 ggplot(z, aes(x = time_value, y = cor)) +
   geom_line(aes(color = lag)) +
   scale_x_date(minor_breaks = "month", date_labels = "%b %y") +
-  labs(x = "Date", y = "Correlation", col = "Lag") 
+  labs(x = "Date", y = "Correlation", col = "Lag")
 ```
 
 Note that `epi_cor()` takes an argument `shift_by` that determines the grouping
@@ -117,53 +117,56 @@ days from now.
 ## Correlations grouped by state
 
 The second option we have is to group by geo value, obtained by setting `cor_by
-= geo_value`. We'll again look at correlations for both 0- and 10-day lagged 
+= geo_value`. We'll again look at correlations for both 0- and 10-day lagged
 case rates.
 
 ```{r, message = FALSE, warning = FALSE}
 z1 <- epi_cor(x, case_rate, death_rate, cor_by = geo_value)
 z2 <- epi_cor(x, case_rate, death_rate, cor_by = geo_value, dt1 = -10)
 
-z <- rbind(z1 %>% mutate(lag = 0), 
-           z2 %>% mutate(lag = 10)) %>%
+z <- rbind(
+  z1 %>% mutate(lag = 0),
+  z2 %>% mutate(lag = 10)
+) %>%
   mutate(lag = as.factor(lag))
 
 ggplot(z, aes(cor)) +
   geom_density(aes(fill = lag, col = lag), alpha = 0.5) +
-  labs(x = "Correlation", y = "Density", fill = "Lag", col = "Lag") 
+  labs(x = "Correlation", y = "Density", fill = "Lag", col = "Lag")
 ```
 
-We can again see that, generally speaking, lagging the case rates back by 10 
+We can again see that, generally speaking, lagging the case rates back by 10
 days improves the correlations.
 
 ## More systematic lag analysis
 
-Next we perform a more systematic investigation of the correlations over a broad 
-range of lag values. 
+Next we perform a more systematic investigation of the correlations over a broad
+range of lag values.
```{r, message = FALSE, warning = FALSE}
 library(purrr)
-lags = 0:35
+lags <- 0:35
 
 z <- map_dfr(lags, function(lag) {
   epi_cor(x, case_rate, death_rate, cor_by = geo_value, dt1 = -lag) %>%
-    mutate(lag = .env$lag) 
+    mutate(lag = .env$lag)
 })
 
 z %>%
   group_by(lag) %>%
   summarize(mean = mean(cor, na.rm = TRUE)) %>%
-  ggplot(aes(x = lag, y = mean)) + 
-  geom_line() + geom_point() + 
+  ggplot(aes(x = lag, y = mean)) +
+  geom_line() +
+  geom_point() +
   labs(x = "Lag", y = "Mean correlation")
 ```
 
-We can see that some pretty clear curvature here in the mean correlation between 
+We can see some pretty clear curvature here in the mean correlation between
case and death rates (where the correlations come from grouping by geo value) as
a function of lag. The maximum occurs at a lag of somewhere around 17 days.

## Attribution
This document contains a dataset that is a modified part of the [COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19) as [republished in the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html). This data set is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/) by the Johns Hopkins University on behalf of its Center for Systems Science in Engineering. Copyright Johns Hopkins University 2020.

-[From the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html): 
- These signals are taken directly from the JHU CSSE [COVID-19 GitHub repository](https://github.com/CSSEGISandData/COVID-19) without changes. 
+[From the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html):
+ These signals are taken directly from the JHU CSSE [COVID-19 GitHub repository](https://github.com/CSSEGISandData/COVID-19) without changes.
diff --git a/vignettes/epiprocess.Rmd b/vignettes/epiprocess.Rmd
index ebab0ec9..a1b52daa 100644
--- a/vignettes/epiprocess.Rmd
+++ b/vignettes/epiprocess.Rmd
@@ -5,13 +5,13 @@ vignette: >
   %\VignetteIndexEntry{Get started with `epiprocess`}
   %\VignetteEncoding{UTF-8}
   %\VignetteEngine{knitr::rmarkdown}
-editor_options: 
+editor_options:
   chunk_output_type: console
 ---
 
 This package introduces a common data structure for epidemiological data sets
 measured over space and time, and offers associated utilities to perform basic
-signal processing tasks. 
+signal processing tasks.
 
 ## Installing
 
 This package is not on CRAN yet, so it can be installed using the
 [`devtools`](https://devtools.r-lib.org) package:
 
 ```{r, eval = FALSE}
-devtools::install_github("cmu-delphi/epiprocess", ref = "main") 
+devtools::install_github("cmu-delphi/epiprocess", ref = "main")
 ```
 
 Building the vignettes, such as this getting started guide, takes a significant
 amount of time. They are not included in the package by default.
If you want to include vignettes, then use this modified command:

```{r, eval = FALSE}
-devtools::install_github("cmu-delphi/epiprocess", ref = "main", 
-                         build_vignettes = TRUE, dependencies = TRUE)
+devtools::install_github("cmu-delphi/epiprocess",
+  ref = "main",
+  build_vignettes = TRUE, dependencies = TRUE
+)
```

## Getting data into `epi_df` format

-We'll start by showing how to get data into 
+We'll start by showing how to get data into
epi_df format, which is just a tibble with a
bit of special structure, and is the format assumed by all of the functions in the
`epiprocess` package. An `epi_df` object has (at least) the
@@ -60,28 +62,29 @@ library(epiprocess)
 library(dplyr)
 library(withr)
 
-cases <- covidcast(
-  data_source = "jhu-csse",
+cases <- pub_covidcast(
+  source = "jhu-csse",
   signals = "confirmed_cumulative_num",
-  time_type = "day",
   geo_type = "state",
-  time_values = epirange(20200301, 20220131),
-  geo_values = "ca,fl,ny,tx"
-) %>% fetch()
+  time_type = "day",
+  geo_values = "ca,fl,ny,tx",
+  time_values = epirange(20200301, 20220131)
+)
 
 colnames(cases)
 ```
 
-As we can see, a data frame returned by `epidatr::covidcast()` has the 
+As we can see, a data frame returned by `epidatr::pub_covidcast()` has the
 columns required for an `epi_df` object (along with many others). We can use
 `as_epi_df()`, with specification of some relevant metadata, to bring the data
 frame into `epi_df` format.
 
 ```{r, message = FALSE}
-x <- as_epi_df(cases, 
-               geo_type = "state",
-               time_type = "day", 
-               as_of = max(cases$issue)) %>%
+x <- as_epi_df(cases,
+  geo_type = "state",
+  time_type = "day",
+  as_of = max(cases$issue)
+) %>%
   select(geo_value, time_value, total_cases = value)
 
 class(x)
@@ -93,7 +96,7 @@ attributes(x)$metadata
 
 ## Some details on metadata
 
 In general, an `epi_df` object has the following fields in its metadata:
- 
+
 * `geo_type`: the type for the geo values.
 * `time_type`: the type for the time values.
 * `as_of`: the time value at which the given data were available.
@@ -115,10 +118,10 @@ data set. See the [archive vignette](https://cmu-delphi.github.io/epiprocess/articles/archive.html) for more.
 
-If any of the `geo_type`, `time_type`, or `as_of` arguments are missing in a 
+If any of the `geo_type`, `time_type`, or `as_of` arguments are missing in a
 call to `as_epi_df()`, then this function will try to infer them from the passed
 object. Usually, `geo_type` and `time_type` can be inferred from the `geo_value`
-and `time_value` columns, respectively, but inferring the `as_of` field is not 
+and `time_value` columns, respectively, but inferring the `as_of` field is not
as easy. See the documentation for `as_epi_df()` for more details.
```{r} @@ -135,13 +138,16 @@ In the following examples we will show how to create an `epi_df` with additional ```{r} ex1 <- tibble( geo_value = rep(c("ca", "fl", "pa"), each = 3), - county_code = c("06059","06061","06067", - "12111","12113","12117", - "42101","42103","42105"), + county_code = c( + "06059", "06061", "06067", + "12111", "12113", "12117", + "42101", "42103", "42105" + ), time_value = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"), - by = "day"), length.out = length(geo_value)), + by = "day" + ), length.out = length(geo_value)), value = 1:length(geo_value) + 0.01 * withr::with_rng_version("3.0.0", withr::with_seed(42, length(geo_value))) - ) %>% +) %>% as_tsibble(index = time_value, key = c(geo_value, county_code)) ex1 <- as_epi_df(x = ex1, geo_type = "state", time_type = "day", as_of = "2020-06-03") @@ -149,11 +155,11 @@ ex1 <- as_epi_df(x = ex1, geo_type = "state", time_type = "day", as_of = "2020-0 The metadata now includes `county_code` as an extra key. ```{r} -attr(ex1,"metadata") +attr(ex1, "metadata") ``` -### Dealing with misspecified column names +### Dealing with misspecified column names `epi_df` requires there to be columns `geo_value` and `time_value`, if they do not exist then `as_epi_df()` throws an error. ```{r, error = TRUE} @@ -161,56 +167,62 @@ data.frame( state = rep(c("ca", "fl", "pa"), each = 3), # misnamed pol = rep(c("blue", "swing", "swing"), each = 3), # extra key reported_date = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"), - by = "day"), length.out = length(geo_value)), # misnamed + by = "day" + ), length.out = length(geo_value)), # misnamed value = 1:length(geo_value) + 0.01 * withr::with_rng_version("3.0.0", withr::with_seed(42, length(geo_value))) -) %>% as_epi_df() +) %>% as_epi_df() ``` -The columns can be renamed to match `epi_df` format. In the example below, notice there is also an additional key `pol`. +The columns can be renamed to match `epi_df` format. In the example below, notice there is also an additional key `pol`. ```{r} ex2 <- tibble( state = rep(c("ca", "fl", "pa"), each = 3), # misnamed pol = rep(c("blue", "swing", "swing"), each = 3), # extra key reported_date = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"), - by = "day"), length.out = length(state)), # misnamed + by = "day" + ), length.out = length(state)), # misnamed value = 1:length(state) + 0.01 * withr::with_rng_version("3.0.0", withr::with_seed(42, length(state))) ) %>% data.frame() -head(ex2) +head(ex2) -ex2 <- ex2 %>% rename(geo_value = state, time_value = reported_date) %>% - as_epi_df(geo_type = "state", as_of = "2020-06-03", - additional_metadata = list(other_keys = "pol")) +ex2 <- ex2 %>% + rename(geo_value = state, time_value = reported_date) %>% + as_epi_df( + geo_type = "state", as_of = "2020-06-03", + additional_metadata = list(other_keys = "pol") + ) -attr(ex2,"metadata") +attr(ex2, "metadata") ``` ### Adding additional keys to an `epi_df` object -In the above examples, all the keys are added to objects that are not `epi_df` objects. We illustrate how to add keys to an `epi_df` object. +In the above examples, all the keys are added to objects that are not `epi_df` objects. We illustrate how to add keys to an `epi_df` object. We use a toy data set included in `epiprocess` prepared using the `covidcast` library and are filtering to a single state for simplicity. 
```{r} ex3 <- jhu_csse_county_level_subset %>% filter(time_value > "2021-12-01", state_name == "Massachusetts") %>% - slice_tail(n = 6) - -attr(ex3,"metadata") # geo_type is county currently + slice_tail(n = 6) + +attr(ex3, "metadata") # geo_type is county currently ``` -Now we add `state` (MA) and `pol` as new columns to the data and as new keys to the metadata. Reminder that lower case state name abbreviations are what we would expect if this were a `geo_value` column. +Now we add `state` (MA) and `pol` as new columns to the data and as new keys to the metadata. Reminder that lower case state name abbreviations are what we would expect if this were a `geo_value` column. ```{r} -ex3 <- ex3 %>% +ex3 <- ex3 %>% as_tibble() %>% # needed to add the additional metadata mutate( - state = rep(tolower("MA"),6), - pol = rep(c("blue", "swing", "swing"), each = 2)) %>% + state = rep(tolower("MA"), 6), + pol = rep(c("blue", "swing", "swing"), each = 2) + ) %>% as_epi_df(additional_metadata = list(other_keys = c("state", "pol"))) -attr(ex3,"metadata") +attr(ex3, "metadata") ``` Note that the two additional keys we added, `state` and `pol`, are specified as a character vector in the `other_keys` component of the `additional_metadata` list. They must be specified in this manner so that downstream actions on the `epi_df`, like model fitting and prediction, can recognize and use these keys. @@ -229,15 +241,15 @@ like plotting, which is pretty easy to do `ggplot2`. library(ggplot2) theme_set(theme_bw()) -ggplot(x, aes(x = time_value, y = total_cases, color = geo_value)) + +ggplot(x, aes(x = time_value, y = total_cases, color = geo_value)) + geom_line() + scale_x_date(minor_breaks = "month", date_labels = "%b %y") + labs(x = "Date", y = "Cumulative COVID-19 cases", color = "State") ``` -As a last couple examples, we'll look at some more data sets just to show how -we might get them into `epi_df` format. Data on daily new (not cumulative) SARS -cases in Canada in 2003, from the +As a last couple examples, we'll look at some more data sets just to show how +we might get them into `epi_df` format. 
Data on daily new (not cumulative) SARS
+cases in Canada in 2003, from the
[outbreaks](https://github.com/reconverse/outbreaks) package:

```{r}
@@ -249,12 +261,12 @@ x <- outbreaks::sars_canada_2003 %>%
head(x)

library(tidyr)
-x <- x %>%
+x <- x %>%
  pivot_longer(starts_with("cases"), names_to = "type") %>%
  mutate(type = substring(type, 7))

-yrange <- range(x %>% group_by(time_value) %>%
-                  summarize(value = sum(value)) %>% pull(value))
+yrange <- range(x %>% group_by(time_value) %>%
+  summarize(value = sum(value)) %>% pull(value))

ggplot(x, aes(x = time_value, y = value)) +
  geom_col(aes(fill = type)) +
@@ -271,22 +283,29 @@ x <- outbreaks::ebola_sierraleone_2014 %>%
    cases = ifelse(status == "confirmed", 1, 0),
    province = case_when(
      district %in% c("Kailahun", "Kenema", "Kono") ~ "Eastern",
-      district %in% c("Bombali", "Kambia", "Koinadugu",
-                      "Port Loko", "Tonkolili") ~ "Northern",
+      district %in% c(
+        "Bombali", "Kambia", "Koinadugu",
+        "Port Loko", "Tonkolili"
+      ) ~ "Northern",
      district %in% c("Bo", "Bonthe", "Moyamba", "Pujehun") ~ "Southern",
-      district %in% c("Western Rural", "Western Urban") ~ "Western")) %>%
-  select(geo_value = province,
-         time_value = date_of_onset,
-         cases) %>% filter(cases==1) %>%
-  group_by(geo_value, time_value) %>%
+      district %in% c("Western Rural", "Western Urban") ~ "Western"
+    )
+  ) %>%
+  select(
+    geo_value = province,
+    time_value = date_of_onset,
+    cases
+  ) %>%
+  filter(cases == 1) %>%
+  group_by(geo_value, time_value) %>%
  summarise(cases = sum(cases)) %>%
-  as_epi_df(geo_type="province")
+  as_epi_df(geo_type = "province")

-ggplot(x, aes(x = time_value, y = cases)) +
-  geom_col(aes(fill = geo_value), show.legend = FALSE) +
-  facet_wrap(~ geo_value, scales = "free_y") +
+ggplot(x, aes(x = time_value, y = cases)) +
+  geom_col(aes(fill = geo_value), show.legend = FALSE) +
+  facet_wrap(~geo_value, scales = "free_y") +
  scale_x_date(minor_breaks = "month", date_labels = "%b %y") +
-  labs(x = "Date", y = "Confirmed cases of Ebola in Sierra Leone")
+  labs(x = "Date", y = "Confirmed cases of Ebola in Sierra Leone")
```


@@ -294,6 +313,6 @@ ggplot(x, aes(x = time_value, y = cases)) +
## Attribution
This document contains a dataset that is a modified part of the [COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19) as [republished in the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html). This data set is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/) by the Johns Hopkins University on behalf of its Center for Systems Science in Engineering. Copyright Johns Hopkins University 2020.

-[From the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html):
- These signals are taken directly from the JHU CSSE [COVID-19 GitHub repository](https://github.com/CSSEGISandData/COVID-19) without changes.
+[From the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html):
+  These signals are taken directly from the JHU CSSE [COVID-19 GitHub repository](https://github.com/CSSEGISandData/COVID-19) without changes.
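As a quick, hypothetical check on the custom geo type used in the Ebola example above (a sketch only, assuming `x` is the `epi_df` built in that chunk; this check is not part of the vignette):

```{r, eval = FALSE}
# "province" is not a recognized US geo type, so it is worth confirming
# that the explicitly supplied geo_type is what lands in the metadata.
attr(x, "metadata")$geo_type
```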
diff --git a/vignettes/growth_rate.Rmd b/vignettes/growth_rate.Rmd
index 9c185b15..4fb4eda5 100644
--- a/vignettes/growth_rate.Rmd
+++ b/vignettes/growth_rate.Rmd
@@ -9,7 +9,7 @@ vignette: >

A basic way of assessing growth in a signal is to look at its relative change
over two neighboring time windows. The `epiprocess` package provides a function
-`growth_rate()` to compute such relative changes, as well as more sophisticated
+`growth_rate()` to compute such relative changes, as well as more sophisticated
estimates of the growth rate of a signal. We investigate this functionality in
the current vignette, applied to state-level daily reported COVID-19 cases from
GA and PA, smoothed using a 7-day trailing average.
@@ -23,15 +23,14 @@ library(tidyr)

The data is fetched with the following query:

```{r, message = FALSE, eval=F}
-x <- covidcast(
-  data_source = "jhu-csse",
+x <- pub_covidcast(
+  source = "jhu-csse",
  signals = "confirmed_7dav_incidence_num",
-  time_type = "day",
  geo_type = "state",
+  time_type = "day",
+  geo_values = "ga,pa",
  time_values = epirange(20200601, 20211231),
-  geo_values = "ga,pa"
) %>%
-  fetch() %>%
  select(geo_value, time_value, cases = value) %>%
  arrange(geo_value, time_value) %>%
  as_epi_df()
@@ -56,16 +55,16 @@ $t$ is defined as $f'(t)/f(t)$, where $f'(t)$ is the derivative of $f$ at $t$.
To estimate the growth rate of a signal in discrete-time (which can be thought
of as evaluations or discretizations of an underlying function in
continuous-time), we can estimate the derivative and divide by the signal value
-itself (or possibly a smoothed version of the signal value).
+itself (or possibly a smoothed version of the signal value).

The `growth_rate()` function takes a sequence of underlying design points `x`
and corresponding sequence `y` of signal values, and allows us to choose from
the following methods for estimating the growth rate at a given reference point
-`x0`, by setting the `method` argument:
+`x0`, by setting the `method` argument:

* "rel_change": uses $(\bar B/\bar A - 1) / h$, where $\bar B$ is the average of
  `y` over the second half of a sliding window of bandwidth `h` centered at the
-  reference point `x0`, and $\bar A$ the average over the first half. This can
+  reference point `x0`, and $\bar A$ the average over the first half. This can
  be seen as using a first-difference approximation to the derivative.
* "linear_reg": uses the slope from a linear regression of `y` on `x` over a
  sliding window centered at the reference point `x0`, divided by the fitted
@@ -79,7 +78,7 @@ at `x0`.

The default in `growth_rate()` is `x0 = x`, so that it returns an estimate of
-the growth rate at each underlying design point.
+the growth rate at each underlying design point.

## Relative change

@@ -90,8 +89,8 @@ a call to `dplyr::mutate()` to append a new column to our `epi_df` object with
the computed growth rates.

```{r}
-x <- x %>%
-  group_by(geo_value) %>%
+x <- x %>%
+  group_by(geo_value) %>%
  mutate(cases_gr1 = growth_rate(time_value, cases))

head(x, 10)
@@ -99,7 +98,7 @@ head(x, 10)

We can visualize these growth rate estimates by plotting the signal values and
highlighting the periods in time for which the relative change is above 1% (in
-red) and below -1% (in blue), faceting by geo value.
+red) and below -1% (in blue), faceting by geo value.
```{r, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 4}
library(ggplot2)
theme_set(theme_bw())

upper = 0.01
lower = -0.01

-ggplot(x, aes(x = time_value, y = cases)) +
+ggplot(x, aes(x = time_value, y = cases)) +
  geom_tile(data = x %>% filter(cases_gr1 >= upper),
-            aes(x = time_value, y = 0, width = 7, height = Inf),
-            fill = 2, alpha = 0.08) +
+    aes(x = time_value, y = 0, width = 7, height = Inf),
+    fill = 2, alpha = 0.08) +
  geom_tile(data = x %>% filter(cases_gr1 <= lower),
-            aes(x = time_value, y = 0, width = 7, height = Inf),
-            fill = 4, alpha = 0.08) +
-  geom_line() +
+    aes(x = time_value, y = 0, width = 7, height = Inf),
+    fill = 4, alpha = 0.08) +
+  geom_line() +
  facet_wrap(vars(geo_value), scales = "free_y") +
  scale_x_date(minor_breaks = "month", date_labels = "%b %y") +
  labs(x = "Date", y = "Reported COVID-19 cases")
```

As a more direct visualization, we plot the estimated growth rates themselves,
-overlaying the curves for the two states on one plot.
+overlaying the curves for the two states on one plot.

```{r, message = FALSE, warning = FALSE}
-ggplot(x, aes(x = time_value, y = cases_gr1)) +
-  geom_line(aes(col = geo_value)) +
+ggplot(x, aes(x = time_value, y = cases_gr1)) +
+  geom_line(aes(col = geo_value)) +
  geom_hline(yintercept = upper, linetype = 2, col = 2) +
  geom_hline(yintercept = lower, linetype = 2, col = 4) +
  scale_color_manual(values = c(3,6)) +
  scale_x_date(minor_breaks = "month", date_labels = "%b %y") +
-  labs(x = "Date", y = "Growth rate", col = "State")
+  labs(x = "Date", y = "Growth rate", col = "State")
```

We can see that the estimated growth rates from the relative change method are
@@ -150,23 +149,23 @@ again `h = 7`. Compared to "rel_change", it appears to behave similarly
overall, but thankfully avoids some of the troublesome spikes:

```{r, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 7}
-x <- x %>%
-  group_by(geo_value) %>%
+x <- x %>%
+  group_by(geo_value) %>%
  mutate(cases_gr2 = growth_rate(time_value, cases, method = "linear_reg"))

x %>%
  pivot_longer(cols = starts_with("cases_gr"),
-               names_to = "method",
+    names_to = "method",
    values_to = "gr") %>%
  mutate(method = recode(method,
    cases_gr1 = "rel_change",
    cases_gr2 = "linear_reg")) %>%
-  ggplot(aes(x = time_value, y = gr)) +
-  geom_line(aes(col = method)) +
+  ggplot(aes(x = time_value, y = gr)) +
+  geom_line(aes(col = method)) +
  scale_color_manual(values = c(2,4)) +
  facet_wrap(vars(geo_value), scales = "free_y", ncol = 1) +
  scale_x_date(minor_breaks = "month", date_labels = "%b %y") +
-  labs(x = "Date", y = "Growth rate", col = "Method")
+  labs(x = "Date", y = "Growth rate", col = "Method")
```

## Nonparametric estimation

@@ -179,32 +178,32 @@ particular implementations and default settings for these methods:
"trend_filter" is based on a full solution path algorithm provided in the
`genlasso` package, and performs cross-validation by default in order to pick
the level of regularization; read the documentation for `growth_rate()` for more
-details.)
+details.)
```{r, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 7}
-x <- x %>%
-  group_by(geo_value) %>%
+x <- x %>%
+  group_by(geo_value) %>%
  mutate(cases_gr3 = growth_rate(time_value, cases, method = "smooth_spline"),
    cases_gr4 = growth_rate(time_value, cases, method = "trend_filter"))

x %>%
  select(geo_value, time_value, cases_gr3, cases_gr4) %>%
  pivot_longer(cols = starts_with("cases_gr"),
-               names_to = "method",
+    names_to = "method",
    values_to = "gr") %>%
  mutate(method = recode(method,
    cases_gr3 = "smooth_spline",
    cases_gr4 = "trend_filter")) %>%
-  ggplot(aes(x = time_value, y = gr)) +
-  geom_line(aes(col = method)) +
+  ggplot(aes(x = time_value, y = gr)) +
+  geom_line(aes(col = method)) +
  scale_color_manual(values = c(3,6)) +
  facet_wrap(vars(geo_value), scales = "free_y", ncol = 1) +
  scale_x_date(minor_breaks = "month", date_labels = "%b %y") +
-  labs(x = "Date", y = "Growth rate", col = "Method")
+  labs(x = "Date", y = "Growth rate", col = "Method")
```

-In this particular example, the trend filtering estimates of growth rate appear
-to be much more stable than those from the smoothing spline, and also much more
+In this particular example, the trend filtering estimates of growth rate appear
+to be much more stable than those from the smoothing spline, and also much more
stable than the estimates from local relative changes and linear regressions.

The smoothing spline growth rate estimates are based on the default settings in
@@ -220,79 +219,79 @@ the documentation for `growth_rate()` gives the full details.

In general, an alternative view for the growth rate of a function $f$ is given
by defining $g(t) = \log(f(t))$, and then observing that $g'(t) = f'(t)/f(t)$.
Therefore, any method that estimates the derivative can be simply applied to the
-log of the signal of interest, and in this light, each method above
+log of the signal of interest, and in this light, each method above
("rel_change", "linear_reg", "smooth_spline", and "trend_filter") has a log
scale analog, which can be used by setting the argument `log_scale = TRUE` in
the call to `growth_rate()`.
```{r, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 7}
-x <- x %>%
-  group_by(geo_value) %>%
-  mutate(cases_gr5 = growth_rate(time_value, cases, method = "rel_change",
+x <- x %>%
+  group_by(geo_value) %>%
+  mutate(cases_gr5 = growth_rate(time_value, cases, method = "rel_change",
    log_scale = TRUE),
-         cases_gr6 = growth_rate(time_value, cases, method = "linear_reg",
+    cases_gr6 = growth_rate(time_value, cases, method = "linear_reg",
      log_scale = TRUE),
-         cases_gr7 = growth_rate(time_value, cases, method = "smooth_spline",
+    cases_gr7 = growth_rate(time_value, cases, method = "smooth_spline",
      log_scale = TRUE),
-         cases_gr8 = growth_rate(time_value, cases, method = "trend_filter",
+    cases_gr8 = growth_rate(time_value, cases, method = "trend_filter",
      log_scale = TRUE))

x %>%
  select(geo_value, time_value, cases_gr5, cases_gr6) %>%
  pivot_longer(cols = starts_with("cases_gr"),
-               names_to = "method",
+    names_to = "method",
    values_to = "gr") %>%
  mutate(method = recode(method,
    cases_gr5 = "rel_change_log",
    cases_gr6 = "linear_reg_log")) %>%
-  ggplot(aes(x = time_value, y = gr)) +
-  geom_line(aes(col = method)) +
+  ggplot(aes(x = time_value, y = gr)) +
+  geom_line(aes(col = method)) +
  scale_color_manual(values = c(2,4)) +
  facet_wrap(vars(geo_value), scales = "free_y", ncol = 1) +
  scale_x_date(minor_breaks = "month", date_labels = "%b %y") +
-  labs(x = "Date", y = "Growth rate", col = "Method")
+  labs(x = "Date", y = "Growth rate", col = "Method")

x %>%
  select(geo_value, time_value, cases_gr7, cases_gr8) %>%
  pivot_longer(cols = starts_with("cases_gr"),
-               names_to = "method",
+    names_to = "method",
    values_to = "gr") %>%
  mutate(method = recode(method,
    cases_gr7 = "smooth_spline_log",
    cases_gr8 = "trend_filter_log")) %>%
-  ggplot(aes(x = time_value, y = gr)) +
-  geom_line(aes(col = method)) +
+  ggplot(aes(x = time_value, y = gr)) +
+  geom_line(aes(col = method)) +
  scale_color_manual(values = c(3,6)) +
  facet_wrap(vars(geo_value), scales = "free_y", ncol = 1) +
  scale_x_date(minor_breaks = "month", date_labels = "%b %y") +
-  labs(x = "Date", y = "Growth rate", col = "Method")
+  labs(x = "Date", y = "Growth rate", col = "Method")
```

-Comparing the `rel_change_log` curves with their `rel_change` counterparts
-(shown in earlier figures), we see that the former curves appear less volatile
+Comparing the `rel_change_log` curves with their `rel_change` counterparts
+(shown in earlier figures), we see that the former curves appear less volatile
and match the linear regression estimates much more closely.

In particular, when `rel_change` has upward spikes, `rel_change_log` has less
pronounced spikes. Why does this occur? The estimate of $g'(t)$ here can be expressed as $\mathbb
-E[\log(B)-\log(A)]/h = \mathbb E[\log(1+hR)]/h$, where $R = ((B-A)/h) / A$, and
+E[\log(B)-\log(A)]/h = \mathbb E[\log(1+hR)]/h$, where $R = ((B-A)/h) / A$, and
the expectation refers to averaging over the $h$ observations in each window.

Consider the following two relevant inequalities, both due to concavity of the
-logarithm function:
+logarithm function:

$$
\mathbb E[\log(1+hR)]/h \leq \log(1+h\mathbb E[R])/h \leq \mathbb E[R].
$$

-The first inequality is Jensen's; the second inequality is because the tangent
+The first inequality is Jensen's; the second inequality is because the tangent
line of a concave function lies above it. Finally, we observe that $\mathbb
-E[R] \approx ((\bar B-\bar A)/h) / \bar A$, which the `rel_change` estimate.
+E[R] \approx ((\bar B-\bar A)/h) / \bar A$, which is the `rel_change` estimate.
This explains why the `rel_change_log` curve often lies below the `rel_change`
curve.

-
## Attribution
This document contains a dataset that is a modified part of the [COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19) as [republished in the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html). This data set is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/) by the Johns Hopkins University on behalf of its Center for Systems Science in Engineering. Copyright Johns Hopkins University 2020.

-[From the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html):
+[From the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html):
 These signals are taken directly from the JHU CSSE [COVID-19 GitHub repository](https://github.com/CSSEGISandData/COVID-19) without changes.

diff --git a/vignettes/outliers.Rmd b/vignettes/outliers.Rmd
index 8d820531..416a135f 100644
--- a/vignettes/outliers.Rmd
+++ b/vignettes/outliers.Rmd
@@ -20,16 +20,15 @@ library(epiprocess)
library(dplyr)
library(tidyr)

-x <- covidcast(
-  data_source = "jhu-csse",
+x <- pub_covidcast(
+  source = "jhu-csse",
  signals = "confirmed_incidence_num",
-  time_type = "day",
  geo_type = "state",
-  time_values = epirange(20200601, 20210531),
+  time_type = "day",
  geo_values = "fl,nj",
+  time_values = epirange(20200601, 20210531),
  as_of = 20211028
) %>%
-  fetch() %>%
  select(geo_value, time_value, cases = value) %>%
  as_epi_df()
```
@@ -68,13 +67,13 @@ methods on a given signal, and then (optionally) combine the results from those
methods. Here, we'll investigate outlier detection results from the following
methods.

-1. Detection based on a rolling median, using `detect_outlr_rm()`, which
-  computes a rolling median on with a default window size of `n` time points
-  centered at the time point under consideration, and then computes thresholds
-  based on a multiplier times a rolling IQR computed on the residuals.
+1. Detection based on a rolling median, using `detect_outlr_rm()`, which
+   computes a rolling median with a default window size of `n` time points
+   centered at the time point under consideration, and then computes thresholds
+   based on a multiplier times a rolling IQR computed on the residuals.
2. Detection based on a seasonal-trend decomposition using LOESS (STL), using
-  `detect_outlr_stl()`, which is similar to the rolling median method but
-  replaces the rolling median with fitted values from STL.
+   `detect_outlr_stl()`, which is similar to the rolling median method but
+   replaces the rolling median with fitted values from STL.
3. Detection based on an STL decomposition, but without seasonality term, which
amounts to smoothing using LOESS.

The outlier detection methods are specified using a `tibble` that is passed to
`detect_outlr()`, with one row per method, and whose columns specify the outlier
detection function, any input arguments (only nondefault values need to
be supplied), and an abbreviated name for the method used in tracking results.
-Abbreviations "rm" and "stl" can be used for the built-in detection functions
+Abbreviations "rm" and "stl" can be used for the built-in detection functions
`detect_outlr_rm()` and `detect_outlr_stl()`, respectively.
```{r} -detection_methods = bind_rows( - tibble(method = "rm", - args = list(list(detect_negatives = TRUE, - detection_multiplier = 2.5)), - abbr = "rm"), - tibble(method = "stl", - args = list(list(detect_negatives = TRUE, - detection_multiplier = 2.5, - seasonal_period = 7)), - abbr = "stl_seasonal"), - tibble(method = "stl", - args = list(list(detect_negatives = TRUE, - detection_multiplier = 2.5, - seasonal_period = NULL)), - abbr = "stl_nonseasonal")) +detection_methods <- bind_rows( + tibble( + method = "rm", + args = list(list( + detect_negatives = TRUE, + detection_multiplier = 2.5 + )), + abbr = "rm" + ), + tibble( + method = "stl", + args = list(list( + detect_negatives = TRUE, + detection_multiplier = 2.5, + seasonal_period = 7 + )), + abbr = "stl_seasonal" + ), + tibble( + method = "stl", + args = list(list( + detect_negatives = TRUE, + detection_multiplier = 2.5, + seasonal_period = NULL + )), + abbr = "stl_nonseasonal" + ) +) detection_methods ``` @@ -113,10 +125,11 @@ vote across the base methods to determine whether a value is an outlier. ```{r} x <- x %>% group_by(geo_value) %>% - mutate(outlier_info = detect_outlr( + mutate(outlier_info = detect_outlr( x = time_value, y = cases, methods = detection_methods, - combiner = "median")) %>% + combiner = "median" + )) %>% ungroup() %>% unnest(outlier_info) @@ -127,38 +140,49 @@ To visualize the results, we first define a convenience function for plotting. ```{r} # Plot outlier detection bands and/or points identified as outliers -plot_outlr <- function(x, signal, method_abbr, bands = TRUE, points = TRUE, +plot_outlr <- function(x, signal, method_abbr, bands = TRUE, points = TRUE, facet_vars = vars(geo_value), nrow = NULL, ncol = NULL, scales = "fixed") { - # Convert outlier detection results to long format + # Convert outlier detection results to long format signal <- rlang::enquo(signal) x_long <- x %>% pivot_longer( cols = starts_with(method_abbr), names_to = c("method", ".value"), - names_pattern = "(.+)_(.+)") - + names_pattern = "(.+)_(.+)" + ) + # Start of plot with observed data p <- ggplot() + geom_line(data = x, mapping = aes(x = time_value, y = !!signal)) # If requested, add bands - if (bands) - p <- p + geom_ribbon(data = x_long, - aes(x = time_value, ymin = lower, ymax = upper, - color = method), fill = NA) + if (bands) { + p <- p + geom_ribbon( + data = x_long, + aes( + x = time_value, ymin = lower, ymax = upper, + color = method + ), fill = NA + ) + } # If requested, add points if (points) { x_detected <- x_long %>% filter((!!signal < lower) | (!!signal > upper)) - p <- p + geom_point(data = x_detected, - aes(x = time_value, y = !!signal, color = method, - shape = method)) + p <- p + geom_point( + data = x_detected, + aes( + x = time_value, y = !!signal, color = method, + shape = method + ) + ) } # If requested, add faceting - if (!is.null(facet_vars)) + if (!is.null(facet_vars)) { p <- p + facet_wrap(facet_vars, nrow = nrow, ncol = ncol, scales = scales) + } return(p) } @@ -170,29 +194,35 @@ Now we produce plots for each state at a time, faceting by the detection method. 
method_abbr <- c(detection_methods$abbr, "combined") plot_outlr(x %>% filter(geo_value == "fl"), cases, method_abbr, - facet_vars = vars(method), scales = "free_y", ncol = 1) + + facet_vars = vars(method), scales = "free_y", ncol = 1 +) + scale_x_date(minor_breaks = "month", date_labels = "%b %y") + - labs(x = "Date", y = "Reported COVID-19 counts", color = "Method", - shape = "Method") + labs( + x = "Date", y = "Reported COVID-19 counts", color = "Method", + shape = "Method" + ) plot_outlr(x %>% filter(geo_value == "nj"), cases, method_abbr, - facet_vars = vars(method), scales = "free_y", ncol = 1) + + facet_vars = vars(method), scales = "free_y", ncol = 1 +) + scale_x_date(minor_breaks = "month", date_labels = "%b %y") + - labs(x = "Date", y = "Reported COVID-19 counts", color = "Method", - shape = "Method") + labs( + x = "Date", y = "Reported COVID-19 counts", color = "Method", + shape = "Method" + ) ``` ## Outlier correction Finally, in order to correct outliers, we can use the posited replacement values returned by each outlier detection method. Below we use the replacement value -from the combined method, which is defined by the median of replacement values +from the combined method, which is defined by the median of replacement values from the base methods at each time point. ```{r, fig.width = 8, fig.height = 7} -y <- x %>% +y <- x %>% mutate(cases_corrected = combined_replacement) %>% - select(geo_value, time_value, cases, cases_corrected) + select(geo_value, time_value, cases, cases_corrected) y %>% filter(cases != cases_corrected) @@ -205,13 +235,13 @@ ggplot(y, aes(x = time_value)) + labs(x = "Date", y = "Reported COVID-19 counts") ``` -More advanced correction functionality will be coming at some point in the -future. +More advanced correction functionality will be coming at some point in the +future. ## Attribution This document contains a dataset that is a modified part of the [COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19) as [republished in the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html). This data set is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/) by the Johns Hopkins University on behalf of its Center for Systems Science in Engineering. Copyright Johns Hopkins University 2020. -[From the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html): - These signals are taken directly from the JHU CSSE [COVID-19 GitHub repository](https://github.com/CSSEGISandData/COVID-19) without changes. +[From the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html): + These signals are taken directly from the JHU CSSE [COVID-19 GitHub repository](https://github.com/CSSEGISandData/COVID-19) without changes. diff --git a/vignettes/slide.Rmd b/vignettes/slide.Rmd index cb435e54..34d5bd59 100644 --- a/vignettes/slide.Rmd +++ b/vignettes/slide.Rmd @@ -8,7 +8,7 @@ vignette: > --- A central tool in the `epiprocess` package is `epi_slide()`, which is based on -the powerful functionality provided in the +the powerful functionality provided in the [`slider`](https://cran.r-project.org/web/packages/slider) package. In `epiprocess`, to "slide" means to apply a computation---represented as a function or formula---over a sliding/rolling data window. 
Suitable @@ -37,15 +37,14 @@ library(dplyr) The data is fetched with the following query: ```{r, message = FALSE, eval=F} -x <- covidcast( - data_source = "jhu-csse", +x <- pub_covidcast( + source = "jhu-csse", signals = "confirmed_incidence_num", - time_type = "day", geo_type = "state", + time_type = "day", + geo_values = "ca,fl,ny,tx,ga,pa", time_values = epirange(20200301, 20211231), - geo_values = "ca,fl,ny,tx,ga,pa" ) %>% - fetch() %>% select(geo_value, time_value, cases = value) %>% arrange(geo_value, time_value) %>% as_epi_df() @@ -69,8 +68,8 @@ order to smooth the signal, by passing in a formula for the first argument of `epi_slide()`. To do this computation per state, we first call `group_by()`. ```{r} -x %>% - group_by(geo_value) %>% +x %>% + group_by(geo_value) %>% epi_slide(~ mean(.x$cases), before = 6) %>% ungroup() %>% head(10) @@ -84,7 +83,7 @@ default. We can of course change this post hoc, or we can instead specify a new name up front using the `new_col_name` argument: ```{r} -x <- x %>% +x <- x %>% group_by(geo_value) %>% epi_slide(~ mean(.x$cases), before = 6, new_col_name = "cases_7dav") %>% ungroup() @@ -102,7 +101,7 @@ Like in `group_modify()`, there are alternative names for these variables as well: `.` can be used instead of `.x`, `.y` instead of `.group_key`, and `.z` instead of `.ref_time_value`. -## Slide with a function +## Slide with a function We can also pass a function for the first argument in `epi_slide()`. In this case, the passed function must accept the following arguments: @@ -118,8 +117,8 @@ receives to `f`. Recreating the last example of a 7-day trailing average: ```{r} -x <- x %>% - group_by(geo_value) %>% +x <- x %>% + group_by(geo_value) %>% epi_slide(function(x, gk, rtv) mean(x$cases), before = 6, new_col_name = "cases_7dav") %>% ungroup() @@ -135,8 +134,8 @@ to a computation in which we can access any columns of `x` by name, just as we would in a call to `dplyr::mutate()`, or any of the `dplyr` verbs. For example: ```{r} -x <- x %>% - group_by(geo_value) %>% +x <- x %>% + group_by(geo_value) %>% epi_slide(cases_7dav = mean(cases), before = 6) %>% ungroup() @@ -155,13 +154,13 @@ theme_set(theme_bw()) ggplot(x, aes(x = time_value)) + geom_col(aes(y = cases, fill = geo_value), alpha = 0.5, show.legend = FALSE) + geom_line(aes(y = cases_7dav, col = geo_value), show.legend = FALSE) + - facet_wrap(~ geo_value, scales = "free_y") + + facet_wrap(~geo_value, scales = "free_y") + scale_x_date(minor_breaks = "month", date_labels = "%b %y") + labs(x = "Date", y = "Reported COVID-19 cases") ``` -As we can see from the top right panel, it looks like Texas moved to weekly -reporting of COVID-19 cases in summer of 2021. +As we can see from the top right panel, it looks like Texas moved to weekly +reporting of COVID-19 cases in summer of 2021. ## Running a local forecaster @@ -183,57 +182,60 @@ units of the `time_value` column; so, days, in the working `epi_df` being considered in this vignette). 
```{r}
-prob_ar <- function(y, lags = c(0, 7, 14), ahead = 6, min_train_window = 20,
+prob_ar <- function(y, lags = c(0, 7, 14), ahead = 6, min_train_window = 20,
                    lower_level = 0.05, upper_level = 0.95, symmetrize = TRUE,
-                    intercept = FALSE, nonneg = TRUE) {
+                    intercept = FALSE, nonneg = TRUE) {
  # Return NA if insufficient training data
  if (length(y) < min_train_window + max(lags) + ahead) {
    return(data.frame(point = NA, lower = NA, upper = NA))
  }
-
+
  # Build features and response for the AR model
  dat <- do.call(
-    data.frame,
+    data.frame,
    purrr::map(lags, function(j) lag(y, n = j))
  )
-  names(dat) = paste0("x", 1:ncol(dat))
-  if (intercept) dat$x0 = rep(1, nrow(dat))
-  dat$y <- lead(y, n = ahead)
-
+  names(dat) <- paste0("x", 1:ncol(dat))
+  if (intercept) dat$x0 <- rep(1, nrow(dat))
+  dat$y <- lead(y, n = ahead)
+
  # Now fit the AR model and make a prediction
  obj <- lm(y ~ . + 0, data = dat)
  point <- predict(obj, newdata = tail(dat, 1))
-
-  # Compute a band
+
+  # Compute a band
  r <- residuals(obj)
  s <- ifelse(symmetrize, -1, NA) # Should the residuals be symmetrized?
  q <- quantile(c(r, s * r), probs = c(lower_level, upper_level), na.rm = TRUE)
  lower <- point + q[1]
  upper <- point + q[2]
-
+
  # Clip at zero if we need to, then return
-  if (nonneg) {
-    point = max(point, 0)
-    lower = max(lower, 0)
-    upper = max(upper, 0)
+  if (nonneg) {
+    point <- max(point, 0)
+    lower <- max(lower, 0)
+    upper <- max(upper, 0)
  }
  return(data.frame(point = point, lower = lower, upper = upper))
}
```

-We go ahead and slide this AR forecaster over the working `epi_df` of COVID-19
-cases. Note that we actually model the `cases_7dav` column, to operate on the
+We go ahead and slide this AR forecaster over the working `epi_df` of COVID-19
+cases. Note that we actually model the `cases_7dav` column, to operate on the
scale of smoothed COVID-19 cases. This is clearly equivalent, up to a constant,
to modeling weekly sums of COVID-19 cases.

```{r}
-fc_time_values <- seq(as.Date("2020-06-01"),
-                      as.Date("2021-12-01"),
-                      by = "1 months")
+fc_time_values <- seq(as.Date("2020-06-01"),
+  as.Date("2021-12-01"),
+  by = "1 months"
+)
-x %>%
+x %>%
  group_by(geo_value) %>%
-  epi_slide(fc = prob_ar(cases_7dav), before = 119,
-            ref_time_values = fc_time_values) %>%
+  epi_slide(
+    fc = prob_ar(cases_7dav), before = 119,
+    ref_time_values = fc_time_values
+  ) %>%
  ungroup() %>%
  head(10)
```
@@ -243,42 +245,51 @@ sliding computation (here, compute a forecast) at a specific subset of reference
time values. We get out three columns `fc_point`, `fc_lower`, and `fc_upper`
that correspond to the point forecast, and the lower and upper endpoints of the
90\% prediction band, respectively. (If instead we had set `as_list_col = TRUE`
-in the call to `epi_slide()`, then we would have gotten a list column `fc`,
+in the call to `epi_slide()`, then we would have gotten a list column `fc`,
where each element of `fc` is a data frame with named columns `point`, `lower`,
and `upper`.)

To finish off, we plot the forecasts at some times (spaced out by a few months)
-over the last year, at multiple horizons: 7, 14, 21, and 28 days ahead. To do
-so, we encapsulate the process of generating forecasts into a simple function,
+over the last year, at multiple horizons: 7, 14, 21, and 28 days ahead. To do
+so, we encapsulate the process of generating forecasts into a simple function,
so that we can call it a few times.
```{r, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 6} # Note the use of all_rows = TRUE (keeps all original rows in the output) k_week_ahead <- function(x, ahead = 7) { - x %>% + x %>% group_by(geo_value) %>% - epi_slide(fc = prob_ar(cases_7dav, ahead = ahead), before = 119, - ref_time_values = fc_time_values, all_rows = TRUE) %>% + epi_slide( + fc = prob_ar(cases_7dav, ahead = ahead), before = 119, + ref_time_values = fc_time_values, all_rows = TRUE + ) %>% ungroup() %>% - mutate(target_date = time_value + ahead) + mutate(target_date = time_value + ahead) } # First generate the forecasts, and bind them together -z <- bind_rows(k_week_ahead(x, ahead = 7), - k_week_ahead(x, ahead = 14), - k_week_ahead(x, ahead = 21), - k_week_ahead(x, ahead = 28)) - -# Now plot them, on top of actual COVID-19 case counts +z <- bind_rows( + k_week_ahead(x, ahead = 7), + k_week_ahead(x, ahead = 14), + k_week_ahead(x, ahead = 21), + k_week_ahead(x, ahead = 28) +) + +# Now plot them, on top of actual COVID-19 case counts ggplot(z) + - geom_line(aes(x = time_value, y = cases_7dav), color = "gray50") + - geom_ribbon(aes(x = target_date, ymin = fc_lower, ymax = fc_upper, - group = time_value), fill = 6, alpha = 0.4) + - geom_line(aes(x = target_date, y = fc_point, group = time_value)) + - geom_point(aes(x = target_date, y = fc_point, group = time_value), - size = 0.5) + - geom_vline(data = tibble(x = fc_time_values), aes(xintercept = x), - linetype = 2, alpha = 0.5) + + geom_line(aes(x = time_value, y = cases_7dav), color = "gray50") + + geom_ribbon(aes( + x = target_date, ymin = fc_lower, ymax = fc_upper, + group = time_value + ), fill = 6, alpha = 0.4) + + geom_line(aes(x = target_date, y = fc_point, group = time_value)) + + geom_point(aes(x = target_date, y = fc_point, group = time_value), + size = 0.5 + ) + + geom_vline( + data = tibble(x = fc_time_values), aes(xintercept = x), + linetype = 2, alpha = 0.5 + ) + facet_wrap(vars(geo_value), scales = "free_y") + scale_x_date(minor_breaks = "month", date_labels = "%b %y") + labs(x = "Date", y = "Reported COVID-19 cases") @@ -290,7 +301,7 @@ spotty. At various points in time, we can see that its forecasts are volatile too narrow), or both at the same time. This is only meant as a simple demo and not entirely unexpected given the way the AR model is set up. The [`epipredict`](https://cmu-delphi.github.io/epipredict) package, which is a -companion package to `epiprocess`, offers a suite of predictive modeling tools +companion package to `epiprocess`, offers a suite of predictive modeling tools that can improve on some of the shortcomings of the above simple AR model. Second, the AR forecaster here is using finalized data, meaning, it uses the @@ -304,13 +315,13 @@ therein. Fortunately, the `epiprocess` package provides a data structure called `epi_archive` that can be used to store all data revisions, and furthermore, an `epi_archive` object knows how to slide computations in the correct version-aware sense (for the computation at each reference time $t$, it uses -only data that would have been available as of $t$). We will revisit this -example in the [archive +only data that would have been available as of $t$). We will revisit this +example in the [archive vignette](https://cmu-delphi.github.io/epiprocess/articles/archive.html). 
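To make that version-aware idea concrete, here is a hypothetical sketch (not from this vignette): it assumes `x_archive` is an `epi_archive` built from versioned snapshots of the same signal, and it mirrors the `epi_slide()` call above, so that `prob_ar()` sees the data exactly as it would have appeared as of each reference time value.

```{r, eval = FALSE}
# Sketch only: `x_archive` is an assumed epi_archive of the same signal.
# epix_slide() applies prob_ar() to the snapshot as of each ref_time_value,
# rather than to the finalized series used above.
x_archive %>%
  group_by(geo_value) %>%
  epix_slide(
    fc = prob_ar(cases_7dav),
    before = 119,
    ref_time_values = fc_time_values
  )
```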
## Attribution

This document contains a dataset that is a modified part of the [COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19) as [republished in the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html). This data set is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/) by the Johns Hopkins University on behalf of its Center for Systems Science in Engineering. Copyright Johns Hopkins University 2020.

-[From the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html):
- These signals are taken directly from the JHU CSSE [COVID-19 GitHub repository](https://github.com/CSSEGISandData/COVID-19) without changes.
+[From the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html):
+  These signals are taken directly from the JHU CSSE [COVID-19 GitHub repository](https://github.com/CSSEGISandData/COVID-19) without changes.

From c8bc7a4adbe7a26c99f1b3e8d96f52f5ba8f6717 Mon Sep 17 00:00:00 2001
From: Dmitry Shemetov
Date: Wed, 15 Nov 2023 16:30:15 -0800
Subject: [PATCH 2/4] docs(NEWS): fix formatting

---
 NEWS.md | 163 ++++++++++++++++++++++++++++----------------------------
 1 file changed, 81 insertions(+), 82 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index df674425..be8b9b56 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -4,7 +4,7 @@ Note that `epiprocess` uses the [Semantic Versioning
("semver")](https://semver.org/) scheme for all release versions, but any
inter-release development versions will include an additional ".9999" suffix.

-## Breaking changes:
+## Breaking changes

* Changes to `epi_slide` and `epix_slide`:
  * If `f` is a function, it is now required to take at least three arguments.
g, ...) { }` to `f = function(x, g, rt, ...) { }`.

-## New features:
+## New features

* `epi_slide` and `epix_slide` also make the window data, group key and reference
  time value available to slide computations specified as formulas or tidy
@@ -45,7 +45,7 @@ inter-release development versions will include an additional ".9999" suffix.
  * To keep the old behavior, convert the output of `epix_slide()` to `epi_df`
    when desired and set the metadata appropriately.

-## Improvements:
+## Improvements

* `epi_slide` and `epix_slide` now support `as_list_col = TRUE` when the slide
  computations output atomic vectors, and output a list column in "chopped"
@@ -55,7 +55,7 @@ inter-release development versions will include an additional ".9999" suffix.

# epiprocess 0.6.0

-## Breaking changes:
+## Breaking changes

* Changes to both `epi_slide` and `epix_slide`:
  * The `n`, `align`, and `before` arguments have been replaced by new `before`
@@ -102,7 +102,7 @@ inter-release development versions will include an additional ".9999" suffix.
  the old behavior, pass in `clobberable_versions_start =
  max_version_with_row_in(x)`.

-## Potentially-breaking changes:
+## Potentially-breaking changes

* Fixed `[` on grouped `epi_df`s to maintain the grouping if possible when
  dropping the `epi_df` class (e.g., when removing the `time_value` column).
* `epi_slide` and `epix_slide` now raise an error rather than silently filtering out `ref_time_values` that don't meet their expectations. -## New features: +## New features * `epix_slide`, `$slide` have a new parameter `all_versions`. With `all_versions=TRUE`, `epix_slide` will pass a filtered `epi_archive` to each @@ -124,7 +124,7 @@ inter-release development versions will include an additional ".9999" suffix. pseudoprospective forecasts with a revision-aware forecaster using nested `epix_slide` operations. -## Improvements: +## Improvements * Added `dplyr::group_by` and `dplyr::ungroup` S3 methods for `epi_archive` objects, plus corresponding `$group_by` and `$ungroup` R6 methods. The @@ -134,35 +134,35 @@ inter-release development versions will include an additional ".9999" suffix. requirement (part of [#154](https://github.com/cmu-delphi/epiprocess/issues/154)). -## Cleanup: +## Cleanup * Added a `NEWS.md` file to track changes to the package. * Implemented `?dplyr::dplyr_extending` for `epi_df`s ([#223](https://github.com/cmu-delphi/epiprocess/issues/223)). * Fixed various small documentation issues ([#217](https://github.com/cmu-delphi/epiprocess/issues/217)). -# epiprocess 0.5.0: +# epiprocess 0.5.0 -## Potentially-breaking changes: +## Potentially-breaking changes * `epix_slide`, `$slide` now feed `f` an `epi_df` rather than converting to a tibble/`tbl_df` first, allowing use of `epi_df` methods and metadata, and often yielding `epi_df`s out of the slide as a result. To obtain the old behavior, convert to a tibble within `f`. -## Improvements: +## Improvements * Fixed `epix_merge`, `$merge` always raising error on `sync="truncate"`. -## Cleanup: +## Cleanup * Added `Remotes:` entry for `genlasso`, which was removed from CRAN. * Added `as_epi_archive` tests. * Added missing `epix_merge` test for `sync="truncate"`. -# epiprocess 0.4.0: +# epiprocess 0.4.0 -## Potentially-breaking changes: +## Potentially-breaking changes * Fixed `[.epi_df` to not reorder columns, which was incompatible with downstream packages. @@ -177,20 +177,20 @@ inter-release development versions will include an additional ".9999" suffix. * Fixed `[.epi_df` to drop metadata if decaying to a tibble (due to removal of essential columns). -## Improvements: +## Improvements * Added check that `epi_df` `additional_metadata` is list. * Fixed some incorrect `as_epi_df` examples. -## Cleanup: +## Cleanup * Applied rename of upstream package in examples: `delphi.epidata` -> `epidatr`. * Rounded out `[.epi_df` tests. -# epiprocess 0.3.0: +# epiprocess 0.3.0 -## Breaking changes: +## Breaking changes * `as_epi_archive`, `epi_archive$new`: * Compactification (see below) by default may change results if working @@ -223,7 +223,7 @@ inter-release development versions will include an additional ".9999" suffix. reporting latency, `n=7` will *not* yield 7 days of data in a typical daily-reporting surveillance data source, as one might have assumed). -## New features: +## New features * `as_epi_archive`, `epi_archive$new`: * New `compactify` parameter allows removal of rows that are redundant for the @@ -248,20 +248,20 @@ inter-release development versions will include an additional ".9999" suffix. `epi_archive` rather than an outdated R6 implementation from whenever the data object was generated. -# epiprocess 0.2.0: +# epiprocess 0.2.0 -## Breaking changes: +## Breaking changes * Removed default `n=7` argument to `epix_slide`. 
-## Improvements:
+## Improvements

* Ignore `NA`s when printing `time_value` range for an `epi_archive`.
* Fixed misleading column naming in `epix_slide` example.
* Trimmed down `epi_slide` examples.
* Synced out-of-date docs.

-## Cleanup:
+## Cleanup

* Removed dependency of some `epi_archive` tests on an example archive
  object, and made them more understandable by reading without running.
@@ -271,16 +271,16 @@ inter-release development versions will include an additional ".9999" suffix.
* Removed some dead code.
* Made `.{Rbuild,git}ignore` files more comprehensive.

-# epiprocess 0.1.2:
+# epiprocess 0.1.2

-## New features:
+## New features

* New `new_epi_df` function is similar to `as_epi_df`, but (i) recalculates,
  overwrites, and/or drops most metadata of `x` if it has any, (ii) may still
  reorder the columns of `x` even if it's already an `epi_df`, and (iii) treats
  `x` as optional, constructing an empty `epi_df` by default.

-## Improvements:
+## Improvements

* Fixed `geo_type` guessing on alphabetical strings with more than 2 characters
  to yield `"custom"`, not US `"nation"`.
@@ -300,20 +300,20 @@ inter-release development versions will include an additional ".9999" suffix.
* Improved `as_epi_archive` and `epi_archive$new`/`$initialize` documentation,
  including constructing a toy archive.

-## Cleanup:
+## Cleanup

* Added tests for `epi_slide`, `epi_cor`, and internal utility functions.
* Fixed currently-unused internal utility functions `MiddleL`, `MiddleR` to
  yield correct results on odd-length vectors.

-# epiprocess 0.1.1:
+# epiprocess 0.1.1

-## New features:
+## New features

* New example data objects allow one to quickly experiment with `epi_df`s and
  `epi_archives` without relying/waiting on an API to fetch data.

-## Improvements:
+## Improvements

* Improved `epi_slide` error messaging.
* Fixed description of the appropriate parameters for an `f` argument to
@@ -322,61 +322,60 @@ inter-release development versions will include an additional ".9999" suffix.
* Added some examples throughout the package.
* Using example data objects in vignettes also speeds up vignette compilation.

-## Cleanup:
+## Cleanup

* Set up gh-actions CI.
* Added tests for `epi_df`s.

# epiprocess 0.1.0

-## Implemented core functionality, vignettes:
-
-Classes:
-* `epi_df`: specialized `tbl_df` for geotemporal epidemiological time
-  series data, with optional metadata recording other key columns (e.g.,
-  demographic breakdowns) and `as_of` what time/version this data was
-  current/published. Associated functions:
-  * `as_epi_df` converts to an `epi_df`, guessing the `geo_type`,
-    `time_type`, `other_keys`, and `as_of` if not specified.
-  * `as_epi_df.tbl_ts` and `as_tsibble.epi_df` automatically set
-    `other_keys` and `key`&`index`, respectively.
-  * `epi_slide` applies a user-supplied computation to a sliding/rolling
-    time window and user-specified groups, adding the results as new
-    columns, and recycling/broadcasting results to keep the result size
-    stable. Allows computation to be provided as a function, `purrr`-style
-    formula, or tidyeval dots. Uses `slider` underneath for efficiency.
-  * `epi_cor` calculates Pearson, Kendall, or Spearman correlations
-    between two (optionally time-shifted) variables in an `epi_df` within
-    user-specified groups.
-  * Convenience function: `is_epi_df`.
-* `epi_archive`: R6 class for version (patch) data for geotemporal
-  epidemiological time series data sets.
Comes with S3 methods and regular - functions that wrap around this functionality for those unfamiliar with R6 - methods. Associated functions: - * `as_epi_archive`: prepares an `epi_archive` object from a data frame - containing snapshots and/or patch data for every available version of - the data set. - * `as_of`: extracts a snapshot of the data set as of some requested - version, in `epi_df` format. - * `epix_slide`, `$slide`: similar to `epi_slide`, but for - `epi_archive`s; for each requested `ref_time_value` and group, applies - a time window and user-specified computation to a snapshot of the data - as of `ref_time_value`. - * `epix_merge`, `$merge`: like `merge` for `epi_archive`s, - but allowing for the last version of each observation to be carried - forward to fill in gaps in `x` or `y`. - * Convenience function: `is_epi_archive`. - -Additional functions: -* `growth_rate`: estimates growth rate of a time series using one of a few - built-in `method`s based on relative change, linear regression, - smoothing splines, or trend filtering. -* `detect_outlr`: applies one or more outlier detection methods to a given - signal variable, and optionally aggregates the outputs to create a - consensus result. -* `detect_outlr_rm`: outlier detection function based on a - rolling-median-based outlier detection function; one of the methods - included in `detect_outlr`. -* `detect_outlr_stl`: outlier detection function based on a seasonal-trend - decomposition using LOESS (STL); one of the methods included in - `detect_outlr`. +## Implemented core functionality, vignettes + +* Classes: + * `epi_df`: specialized `tbl_df` for geotemporal epidemiological time + series data, with optional metadata recording other key columns (e.g., + demographic breakdowns) and `as_of` what time/version this data was + current/published. Associated functions: + * `as_epi_df` converts to an `epi_df`, guessing the `geo_type`, + `time_type`, `other_keys`, and `as_of` if not specified. + * `as_epi_df.tbl_ts` and `as_tsibble.epi_df` automatically set + `other_keys` and `key`&`index`, respectively. + * `epi_slide` applies a user-supplied computation to a sliding/rolling + time window and user-specified groups, adding the results as new + columns, and recycling/broadcasting results to keep the result size + stable. Allows computation to be provided as a function, `purrr`-style + formula, or tidyeval dots. Uses `slider` underneath for efficiency. + * `epi_cor` calculates Pearson, Kendall, or Spearman correlations + between two (optionally time-shifted) variables in an `epi_df` within + user-specified groups. + * Convenience function: `is_epi_df`. + * `epi_archive`: R6 class for version (patch) data for geotemporal + epidemiological time series data sets. Comes with S3 methods and regular + functions that wrap around this functionality for those unfamiliar with R6 + methods. Associated functions: + * `as_epi_archive`: prepares an `epi_archive` object from a data frame + containing snapshots and/or patch data for every available version of + the data set. + * `as_of`: extracts a snapshot of the data set as of some requested + version, in `epi_df` format. + * `epix_slide`, `$slide`: similar to `epi_slide`, but for + `epi_archive`s; for each requested `ref_time_value` and group, applies + a time window and user-specified computation to a snapshot of the data + as of `ref_time_value`. 
+ * `epix_merge`, `$merge`: like `merge` for `epi_archive`s, + but allowing for the last version of each observation to be carried + forward to fill in gaps in `x` or `y`. + * Convenience function: `is_epi_archive`. +* Additional functions: + * `growth_rate`: estimates growth rate of a time series using one of a few + built-in `method`s based on relative change, linear regression, + smoothing splines, or trend filtering. + * `detect_outlr`: applies one or more outlier detection methods to a given + signal variable, and optionally aggregates the outputs to create a + consensus result. + * `detect_outlr_rm`: outlier detection function based on a + rolling-median-based outlier detection function; one of the methods + included in `detect_outlr`. + * `detect_outlr_stl`: outlier detection function based on a seasonal-trend + decomposition using LOESS (STL); one of the methods included in + `detect_outlr`. From c2d58ca1f99ba0e06015f588790f61030414f545 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 29 Nov 2023 17:06:04 -0800 Subject: [PATCH 3/4] Fix straggling `covidcast` call --- vignettes/advanced.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vignettes/advanced.Rmd b/vignettes/advanced.Rmd index 91f8f37f..812cb711 100644 --- a/vignettes/advanced.Rmd +++ b/vignettes/advanced.Rmd @@ -278,7 +278,7 @@ library(data.table) library(ggplot2) theme_set(theme_bw()) -y1 <- covidcast( +y1 <- pub_covidcast( source = "doctor-visits", signals = "smoothed_adj_cli", geo_type = "state", From cf894c32f71d334ec37b37f8a976745d7064ff14 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 29 Nov 2023 17:08:25 -0800 Subject: [PATCH 4/4] Bump to a .9999 version + add NEWS.md entry --- DESCRIPTION | 2 +- NEWS.md | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index d9a8dea6..dc48eb86 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: epiprocess Title: Tools for basic signal processing in epidemiology -Version: 0.7.0 +Version: 0.7.0.9999 Authors@R: c( person("Jacob", "Bien", role = "ctb"), person("Logan", "Brooks", role = "aut"), diff --git a/NEWS.md b/NEWS.md index be8b9b56..e3a9ea13 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,10 @@ + +# epiprocess 0.7.0.9999 + +## Improvements + +* Updated vignettes for compatibility with epidatr 1.0.0 in PR #377. + # epiprocess 0.7.0 Note that `epiprocess` uses the [Semantic Versioning