diff --git a/DESCRIPTION b/DESCRIPTION
index c580b872c..da9cc7909 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: epipredict
 Title: Basic epidemiology forecasting methods
-Version: 0.1.2
+Version: 0.1.3
 Authors@R: c(
     person("Daniel J.", "McDonald", , "daniel@stat.ubc.ca", role = c("aut", "cre")),
     person("Ryan", "Tibshirani", , "ryantibs@cmu.edu", role = "aut"),
@@ -36,6 +36,7 @@ Imports:
     ggplot2,
     glue,
     hardhat (>= 1.3.0),
+    here,
     lifecycle,
     magrittr,
     recipes (>= 1.0.4),
diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md
index 8bc251e77..b86eb5beb 100644
--- a/DEVELOPMENT.md
+++ b/DEVELOPMENT.md
@@ -1,10 +1,8 @@
 ## Setting up the development environment
 
 ```r
-install.packages(c('devtools', 'pkgdown', 'styler', 'lintr')) # install dev dependencies
-devtools::install_deps(dependencies = TRUE) # install package dependencies
-devtools::document() # generate package meta data and man files
-devtools::build() # build package
+install.packages(c('devtools', 'pkgdown', 'styler', 'lintr', 'pak')) # install dev dependencies
+pak::pkg_install(".") # install package and dependencies
 ```
 
 ## Validating the package
@@ -13,35 +11,69 @@ devtools::build() # build package
 styler::style_pkg() # format code
 lintr::lint_package() # lint code
 
+devtools::check() # run R CMD check, which runs everything below
+devtools::document() # generate package meta data and man files
 devtools::test() # test package
-devtools::check() # check package for errors
+devtools::build_vignettes() # build vignettes only
+devtools::run_examples() # run doc examples
+devtools::check(vignettes = FALSE) # check package without vignettes
 ```
 
 ## Developing the documentation site
 
-The [documentation site](https://cmu-delphi.github.io/epipredict/) is built off of the `main` branch. The `dev` version of the site is available at https://cmu-delphi.github.io/epipredict/dev.
-
-The documentation site can be previewed locally by running in R
-
-```r
-pkgdown::build_site(preview=TRUE)
-```
+Our CI builds two versions of the documentation:
 
-The `main` version is available at `file:////epidatr/epipredict/index.html` and `dev` at `file:////epipredict/docs/dev/index.html`.
+- https://cmu-delphi.github.io/epipredict/ from the `main` branch and
+- https://cmu-delphi.github.io/epipredict/dev from the `dev` branch.
 
-You can also build the docs manually and launch the site with python. From the terminal, this looks like
+We include the script `pkgdown-watch.R`, which will automatically rebuild the
+documentation locally and preview it. It can be used with:
 
-```bash
-R -e 'pkgdown::clean_site()'
-R -e 'devtools::document()'
-R -e 'pkgdown::build_site()'
-python -m http.server -d docs
+```sh
+# Make sure you have servr installed
+R -e 'renv::install("servr")'
+# Will start a local server
+Rscript pkgdown-watch.R
+# You may need to first build the site with
+R -e 'pkgdown::build_site(".", examples = FALSE, devel = TRUE, preview = FALSE)'
 ```
 
 ## Versioning
 
 Please follow the guidelines in the [PR template document](.github/pull_request_template.md).
 
-## Release process
+## Planned CRAN release process
+
+Open a release issue and then copy and follow this checklist in the issue (modified from the checklist generated by `usethis::use_release_issue(version = "1.0.2")`):
+
+- [ ] `git pull` on `dev` branch.
+- [ ] Make sure all changes are committed and pushed.
+- [ ] Check [current CRAN check results](https://cran.rstudio.org/web/checks/check_results_epipredict.html).
+- [ ] `devtools::check(".", manual = TRUE, env_vars = c(NOT_CRAN = "false"))`.
+  - Aim for 10/10, no notes.
+- [ ] If check works well enough, merge to main. Otherwise open a PR to fix up.
+- [ ] [Polish NEWS](https://github.com/cmu-delphi/epipredict/blob/dev/NEWS.md).
+  - Some [guidelines](https://style.tidyverse.org/news.html#news-release).
+- [ ] `git checkout main`
+- [ ] `git pull`
+- [ ] `urlchecker::url_check()`.
+  - This may choke on the MIT license URL, and that's ok.
+- [ ] `devtools::build_readme()`
+- [ ] `devtools::check_win_devel()`
+- [ ] Have maintainer ("cre" in description) check email for problems.
+- [ ] `revdepcheck::revdep_check(num_workers = 4)`.
+  - This may choke; it is very sensitive to the binary versions of packages on a given system. Either bypass it or ask someone else to run it if you're concerned.
+- [ ] Update `cran-comments.md`
+- [ ] Open a PR with any changes into `dev` and run through this list again.
+
+Submit to CRAN:
+
+- [ ] `devtools::submit_cran()`.
+- [ ] Maintainer approves the confirmation email.
+
+Wait for CRAN...
+
+- [ ] If accepted :tada:, move to next steps. If rejected, fix and resubmit.
+- [ ] Open and merge a PR containing any updates made to `main` back to `dev`.
+- [ ] `usethis::use_github_release(publish = FALSE)` (publish off, otherwise it won't push) will create a draft release based on the commit hash in CRAN-SUBMISSION and push a tag to the GitHub repo.
+- [ ] Go to the repo, verify the release notes, and publish when ready.
diff --git a/_pkgdown.yml b/_pkgdown.yml
index 468da62ac..5222999b6 100644
--- a/_pkgdown.yml
+++ b/_pkgdown.yml
@@ -22,20 +22,20 @@ navbar:
   type: light
 
 articles:
-- title: Get started
-  navbar: ~
-  contents:
-  - epipredict
-  - preprocessing-and-models
-  - arx-classifier
-  - update
+  - title: Get started
+    navbar: ~
+    contents:
+      - epipredict
+      - preprocessing-and-models
+      - backtesting
+      - arx-classifier
+      - update
 
-- title: Advanced methods
-  contents:
-  - articles/sliding
-  - articles/smooth-qr
-  - articles/symptom-surveys
-  - panel-data
+  - title: Advanced methods
+    contents:
+      - articles/smooth-qr
+      - articles/symptom-surveys
+      - panel-data
 
 repo:
   url:
@@ -79,15 +79,15 @@ reference:
   - grf_quantiles
   - title: Custom panel data forecasting workflows
     contents:
-    - epi_recipe
-    - epi_workflow
-    - add_epi_recipe
-    - adjust_epi_recipe
-    - Add_model
-    - predict.epi_workflow
-    - fit.epi_workflow
-    - augment.epi_workflow
-    - forecast.epi_workflow
+      - epi_recipe
+      - epi_workflow
+      - add_epi_recipe
+      - adjust_epi_recipe
+      - Add_model
+      - predict.epi_workflow
+      - fit.epi_workflow
+      - augment.epi_workflow
+      - forecast.epi_workflow
 
   - title: Epi recipe preprocessing steps
     contents:
diff --git a/pkgdown-watch.R b/pkgdown-watch.R
new file mode 100644
index 000000000..00aae3228
--- /dev/null
+++ b/pkgdown-watch.R
@@ -0,0 +1,58 @@
+# Run with: Rscript pkgdown-watch.R
+#
+# Adapted from https://gist.github.com/gadenbuie/d22e149e65591b91419e41ea5b2e0621
+# - Removed the docopt CLI interface and various configs/features I didn't need.
+# - Sped up reference building by not running examples.
+#
+# Note that the `pattern` regex is case sensitive, so make sure your Rmd files
+# end in `.Rmd` and not `.rmd`.
+ +rlang::check_installed(c("pkgdown", "servr", "devtools", "here", "cli", "fs")) + +pkg <- pkgdown::as_pkgdown(".") + +servr::httw( + dir = here::here("docs"), + watch = here::here(), + pattern = "[.](Rm?d|y?ml|s[ac]ss|css|js)$", + handler = function(files) { + devtools::load_all() + + files_rel <- fs::path_rel(files, start = getwd()) + cli::cli_inform("{cli::col_yellow('Updated')} {.val {files_rel}}") + + articles <- grep("vignettes.+Rmd$", files, value = TRUE) + + if (length(articles) == 1) { + name <- fs::path_ext_remove(fs::path_rel(articles, fs::path(pkg$src_path, "vignettes"))) + pkgdown::build_article(name, pkg) + } else if (length(articles) > 1) { + pkgdown::build_articles(pkg, preview = FALSE) + } + + refs <- grep("man.+R(m?d)?$", files, value = TRUE) + if (length(refs)) { + # TODO: This does not work for me, so I just run it manually. + # pkgdown::build_reference(pkg, preview = FALSE, examples = FALSE, lazy = FALSE) + } + + pkgdown <- grep("pkgdown", files, value = TRUE) + if (length(pkgdown) && !pkgdown %in% c(articles, refs)) { + pkgdown::init_site(pkg) + } + + pkgdown_index <- grep("index[.]Rmd$", files_rel, value = TRUE) + if (length(pkgdown_index)) { + devtools::build_rmd(pkgdown_index) + pkgdown::build_home(pkg) + } + + readme <- grep("README[.]rmd$", files, value = TRUE, ignore.case = TRUE) + if (length(readme)) { + devtools::build_readme(".") + pkgdown::build_home(pkg) + } + + cli::cli_alert("Site rebuild done!") + } +) diff --git a/vignettes/.gitignore b/vignettes/.gitignore index 324ceaf7e..5bf4e1a97 100644 --- a/vignettes/.gitignore +++ b/vignettes/.gitignore @@ -1,3 +1,4 @@ *.html *_cache/ *.R +!_common.R \ No newline at end of file diff --git a/vignettes/_common.R b/vignettes/_common.R new file mode 100644 index 000000000..3bd7c929d --- /dev/null +++ b/vignettes/_common.R @@ -0,0 +1,23 @@ +knitr::opts_chunk$set( + digits = 3, + comment = "#>", + collapse = TRUE, + cache = TRUE, + dev.args = list(bg = "transparent"), + dpi = 300, + cache.lazy = FALSE, + out.width = "90%", + fig.align = "center", + fig.width = 9, + fig.height = 6 +) +ggplot2::theme_set(ggplot2::theme_bw()) +options( + dplyr.print_min = 6, + dplyr.print_max = 6, + pillar.max_footer_lines = 2, + pillar.min_chars = 15, + stringr.view_n = 6, + pillar.bold = TRUE, + width = 77 +) diff --git a/vignettes/articles/sliding.Rmd b/vignettes/articles/sliding.Rmd deleted file mode 100644 index 908caf307..000000000 --- a/vignettes/articles/sliding.Rmd +++ /dev/null @@ -1,460 +0,0 @@ ---- -title: "Demonstrations of sliding AR and ARX forecasters" ---- - -```{r setup, include = FALSE} -knitr::opts_chunk$set( - collapse = TRUE, - comment = "#>", - warning = FALSE, - message = FALSE, - cache = TRUE -) -``` - -```{r pkgs} -library(epipredict) -library(epidatr) -library(data.table) -library(dplyr) -library(tidyr) -library(ggplot2) -library(magrittr) -library(purrr) -``` - -# Demonstrations of sliding AR and ARX forecasters - -A key function from the epiprocess package is `epix_slide()` (refer to the -following vignette for the basics of the function: ["Work with archive objects -and data -revisions"](https://cmu-delphi.github.io/epiprocess/articles/archive.html)) -which allows performing version-aware computations. That is, the function only -uses data that would have been available as of time t for that reference time. - -In this vignette, we use `epix_slide()` for backtesting our `arx_forecaster` on -historical COVID-19 case data from the US and from Canada. 
We first examine the -results from a version-unaware forecaster, comparing two different fitting -engines and then we contrast this with version-aware forecasting. The former -will proceed by constructing an `epi_archive` that erases its version -information and then use `epix_slide()` to forecast the future. The latter will -keep the versioned data and proceed similarly by using `epix_slide()` to -forecast the future. - -## Comparing different forecasting engines - -### Example using CLI and case data from US states - -First, we download the version history (ie. archive) of the percentage of -doctor's visits with CLI (COVID-like illness) computed from medical insurance -claims and the number of new confirmed COVID-19 cases per 100,000 population -(daily) for all 50 states from the COVIDcast API. - -
- -Load a data archive - -We process as before, with the modification that we use `sync = locf` in -`epix_merge()` so that the last version of each observation can be carried -forward to extrapolate unavailable versions for the less up-to-date input -archive. - -```{r grab-epi-data} -theme_set(theme_bw()) - -y <- readRDS("all_states_covidcast_signals.rds") %>% - purrr::map(~ select(.x, geo_value, time_value, version = issue, value)) - -x <- epix_merge( - y[[1]] %>% rename(percent_cli = value) %>% as_epi_archive(compactify = FALSE), - y[[2]] %>% rename(case_rate = value) %>% as_epi_archive(compactify = FALSE), - sync = "locf", - compactify = TRUE -) -rm(y) -``` - -
- -We then obtaining the latest snapshot of the data and proceed to fake the -version information by setting `version = time_value`. This has the effect of -obtaining data that arrives exactly at the day of the time_value. - -```{r arx-kweek-preliminaries, warning = FALSE} -# Latest snapshot of data, and forecast dates -x_latest <- epix_as_of(x, version = max(x$versions_end)) %>% - mutate(version = time_value) %>% - as_epi_archive() -fc_time_values <- seq( - from = as.Date("2020-08-01"), - to = as.Date("2021-11-01"), - by = "1 month" -) -aheads <- c(7, 14, 21, 28) - -forecast_k_week_ahead <- function(epi_archive, outcome, predictors, ahead = 7, engine) { - epi_archive %>% - epix_slide( - .f = function(x, gk, rtv) { - arx_forecaster( - x, outcome, predictors, engine, - args_list = arx_args_list(ahead = ahead) - )$predictions %>% - mutate(engine_type = engine$engine) %>% - pivot_quantiles_wider(.pred_distn) - }, - .before = 120, - .versions = fc_time_values - ) -} -``` - -```{r make-arx-kweek} -# Generate the forecasts and bind them together -fc <- bind_rows( - map(aheads, ~ forecast_k_week_ahead( - x_latest, - outcome = "case_rate", - predictors = c("case_rate", "percent_cli"), - ahead = .x, - engine = linear_reg() - )), - map(aheads, ~ forecast_k_week_ahead( - x_latest, - outcome = "case_rate", - predictors = c("case_rate", "percent_cli"), - ahead = .x, - engine = rand_forest(mode = "regression") - )) -) -``` - -Here, `arx_forecaster()` does all the heavy lifting. It creates leads of the -target (respecting time stamps and locations) along with lags of the features -(here, the response and doctors visits), estimates a forecasting model using the -specified engine, creates predictions, and non-parametric confidence bands. - -To see how the predictions compare, we plot them on top of the latest case -rates. Note that even though we've fitted the model on all states, we'll just -display the results for two states, California (CA) and Florida (FL), to get a -sense of the model performance while keeping the graphic simple. - -
- -Code for plotting -```{r plot-arx, message = FALSE, warning = FALSE} -fc_cafl <- fc %>% - tibble() %>% - filter(geo_value %in% c("ca", "fl")) -x_latest_cafl <- x_latest$DT %>% - tibble() %>% - filter(geo_value %in% c("ca", "fl")) - -p1 <- ggplot(fc_cafl, aes(target_date, group = forecast_date, fill = engine_type)) + - geom_line( - data = x_latest_cafl, aes(x = time_value, y = case_rate), - inherit.aes = FALSE, color = "gray50" - ) + - geom_ribbon(aes(ymin = `0.05`, ymax = `0.95`), alpha = 0.4) + - geom_line(aes(y = .pred)) + - geom_point(aes(y = .pred), size = 0.5) + - geom_vline(aes(xintercept = forecast_date), linetype = 2, alpha = 0.5) + - facet_grid(vars(geo_value), vars(engine_type), scales = "free") + - scale_x_date(minor_breaks = "month", date_labels = "%b %y") + - scale_fill_brewer(palette = "Set1") + - labs(x = "Date", y = "Reported COVID-19 case rates") + - theme(legend.position = "none") -``` -
- -```{r show-plot1, fig.width = 9, fig.height = 6, echo=FALSE} -p1 -``` - -For the two states of interest, simple linear regression clearly performs better -than random forest in terms of accuracy of the predictions and does not result -in such in overconfident predictions (overly narrow confidence bands). Though, -in general, neither approach produces amazingly accurate forecasts. This could -be because the behaviour is rather different across states and the effects of -other notable factors such as age and public health measures may be important to -account for in such forecasting. Including such factors as well as making -enhancements such as correcting for outliers are some improvements one could -make to this simple model.[^1] - -[^1]: Note that, despite the above caveats, simple models like this tend to -out-perform many far more complicated models in the online Covid forecasting due -to those models high variance predictions. - - -### Example using case data from Canada - -
- -Data and forecasts. Similar to the above. - -By leveraging the flexibility of `epiprocess`, we can apply the same techniques -to data from other sources. Since some collaborators are in British Columbia, -Canada, we'll do essentially the same thing for Canada as we did above. - -The [COVID-19 Canada Open Data Working Group](https://opencovid.ca/) collects -daily time series data on COVID-19 cases, deaths, recoveries, testing and -vaccinations at the health region and province levels. Data are collected from -publicly available sources such as government datasets and news releases. -Unfortunately, there is no simple versioned source, so we have created our own -from the Github commit history. - -First, we load versioned case rates at the provincial level. After converting -these to 7-day averages (due to highly variable provincial reporting -mismatches), we then convert the data to an `epi_archive` object, and extract -the latest version from it. Finally, we run the same forcasting exercise as for -the American data, but here we compare the forecasts produced from using simple -linear regression with those from using boosted regression trees. - -```{r get-can-fc, warning = FALSE} -# source("drafts/canada-case-rates.R) -can <- readRDS(system.file( - "extdata", "can_prov_cases.rds", - package = "epipredict", mustWork = TRUE -)) %>% - group_by(version, geo_value) %>% - arrange(time_value) %>% - mutate(cr_7dav = RcppRoll::roll_meanr(case_rate, n = 7L)) %>% - as_epi_archive(compactify = TRUE) - -can_latest <- epix_as_of(can, version = max(can$DT$version)) %>% - mutate(version = time_value) %>% - as_epi_archive() - -# Generate the forecasts, and bind them together -can_fc <- bind_rows( - map( - aheads, - ~ forecast_k_week_ahead(can_latest, "cr_7dav", "cr_7dav", .x, linear_reg()) - ), - map( - aheads, - ~ forecast_k_week_ahead( - can_latest, "cr_7dav", "cr_7dav", .x, - boost_tree(mode = "regression", trees = 20) - ) - ) -) -``` - -The figures below shows the results for all of the provinces. 
- -```{r plot-can-fc-lr, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 12} -ggplot( - can_fc %>% filter(engine_type == "lm"), - aes(x = target_date, group = forecast_date) -) + - coord_cartesian(xlim = lubridate::ymd(c("2020-12-01", NA))) + - geom_line( - data = can_latest$DT %>% tibble(), aes(x = time_value, y = cr_7dav), - inherit.aes = FALSE, color = "gray50" - ) + - geom_ribbon(aes(ymin = `0.05`, ymax = `0.95`, fill = geo_value), - alpha = 0.4 - ) + - geom_line(aes(y = .pred)) + - geom_point(aes(y = .pred), size = 0.5) + - geom_vline(aes(xintercept = forecast_date), linetype = 2, alpha = 0.5) + - facet_wrap(~geo_value, scales = "free_y", ncol = 3) + - scale_x_date(minor_breaks = "month", date_labels = "%b %y") + - labs( - title = "Using simple linear regression", x = "Date", - y = "Reported COVID-19 case rates" - ) + - theme(legend.position = "none") -``` - -```{r plot-can-fc-boost, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 12} -ggplot( - can_fc %>% filter(engine_type == "xgboost"), - aes(x = target_date, group = forecast_date) -) + - coord_cartesian(xlim = lubridate::ymd(c("2020-12-01", NA))) + - geom_line( - data = can_latest$DT %>% tibble(), aes(x = time_value, y = cr_7dav), - inherit.aes = FALSE, color = "gray50" - ) + - geom_ribbon(aes(ymin = `0.05`, ymax = `0.95`, fill = geo_value), - alpha = 0.4 - ) + - geom_line(aes(y = .pred)) + - geom_point(aes(y = .pred), size = 0.5) + - geom_vline(aes(xintercept = forecast_date), linetype = 2, alpha = 0.5) + - facet_wrap(~geo_value, scales = "free_y", ncol = 3) + - scale_x_date(minor_breaks = "month", date_labels = "%b %y") + - labs( - title = "Using boosted regression trees", x = "Date", - y = "Reported COVID-19 case rates" - ) + - theme(legend.position = "none") -``` - -Both approaches tend to produce quite volatile forecasts (point predictions) -and/or are overly confident (very narrow bands), particularly when boosted -regression trees are used. But as this is meant to be a simple demonstration of -sliding with different engines in `arx_forecaster`, we may devote another -vignette to work on improving the predictive modelling using the suite of tools -available in `{epipredict}`. - -
- -## Version-aware forecasting - -### Example using case data from US states - -We will now employ a forecaster that uses properly-versioned data (that would -have been available in real-time) to forecast the 7 day average of future -COVID-19 case rates from current and past COVID-19 case rates and death rates -for all states. That is, we can make forecasts on the archive, `x`, and compare -those to forecasts on the latest data, `x_latest` using the same general set-up -as above. Note that in this example, we use a geo-pooled approach (using -combined data from all US states and territories) to train our model. - -
- -Download data using `{epidatr}` -```{r load-data, eval=FALSE} -# loading in the data -states <- "*" - -confirmed_incidence_prop <- pub_covidcast( - source = "jhu-csse", - signals = "confirmed_incidence_prop", - time_type = "day", - geo_type = "state", - time_values = epirange(20200301, 20211231), - geo_values = states, - issues = epirange(20000101, 20221231) -) %>% - select(geo_value, time_value, version = issue, case_rate = value) %>% - arrange(geo_value, time_value) %>% - as_epi_archive(compactify = FALSE) - -deaths_incidence_prop <- pub_covidcast( - source = "jhu-csse", - signals = "deaths_incidence_prop", - time_type = "day", - geo_type = "state", - time_values = epirange(20200301, 20211231), - geo_values = states, - issues = epirange(20000101, 20221231) -) %>% - select(geo_value, time_value, version = issue, death_rate = value) %>% - arrange(geo_value, time_value) %>% - as_epi_archive(compactify = FALSE) - - -x <- epix_merge(confirmed_incidence_prop, deaths_incidence_prop, sync = "locf") - -x <- x %>% - epix_slide( - .versions = fc_time_values, - function(x, gk, rtv) { - x %>% - group_by(geo_value) %>% - epi_slide_mean(case_rate, .window_size = 7L) %>% - rename(case_rate_7d_av = slide_value_case_rate) %>% - epi_slide_mean(death_rate, ..window_size = 7L) %>% - rename(death_rate_7d_av = slide_value_death_rate) %>% - ungroup() - } - ) %>% - rename(version = time_value) %>% - rename( - time_value = slide_value_time_value, - geo_value = slide_value_geo_value, - case_rate = slide_value_case_rate, - death_rate = slide_value_death_rate, - case_rate_7d_av = slide_value_case_rate_7d_av, - death_rate_7d_av = slide_value_death_rate_7d_av - ) %>% - as_epi_archive(compactify = TRUE) - -saveRDS(x$DT, file = "case_death_rate_archive.rds") -``` - -```{r load-stored-data} -x <- readRDS("case_death_rate_archive.rds") -x <- as_epi_archive(x) -``` -
- -Here we specify the ARX model. - -```{r make-arx-model} -aheads <- c(7, 14, 21) -fc_time_values <- seq( - from = as.Date("2020-09-01"), - to = as.Date("2021-12-31"), - by = "1 month" -) -forecaster <- function(x) { - map(aheads, function(ahead) { - arx_forecaster( - epi_data = x, - outcome = "death_rate_7d_av", - predictors = c("death_rate_7d_av", "case_rate_7d_av"), - trainer = quantile_reg(), - args_list = arx_args_list(lags = c(0, 7, 14, 21), ahead = ahead) - )$predictions - }) %>% - bind_rows() -} -``` - -We can now use our forecaster function that we've created and use it in the -pipeline for forecasting the predictions. We store the predictions into the -`arx_preds` variable and calculate the most up to date version of the data in the -epi archive and store it as `x_latest`. - -```{r running-arx-forecaster} -arx_preds <- x %>% - epix_slide( - ~ forecaster(.x), - .before = 120, .versions = fc_time_values - ) %>% - mutate(engine_type = quantile_reg()$engine) %>% - mutate(ahead_val = target_date - forecast_date) - -x_latest <- epix_as_of(x, version = max(x$versions_end)) -``` - -Now we plot both the actual and predicted 7 day average of the death rate for -the chosen states - -
- -Code for the plot -```{r plot-arx-asof, message = FALSE, warning = FALSE} -states_to_show <- c("ca", "ny", "mi", "az") -fc_states <- arx_preds %>% - filter(geo_value %in% states_to_show) %>% - pivot_quantiles_wider(.pred_distn) - -x_latest_states <- x_latest %>% filter(geo_value %in% states_to_show) - -p2 <- ggplot(fc_states, aes(target_date, group = forecast_date)) + - geom_ribbon(aes(ymin = `0.05`, ymax = `0.95`, fill = geo_value), alpha = 0.4) + - geom_line( - data = x_latest_states, aes(x = time_value, y = death_rate_7d_av), - inherit.aes = FALSE, color = "gray50" - ) + - geom_line(aes(y = .pred, color = geo_value)) + - geom_point(aes(y = .pred, color = geo_value), size = 0.5) + - geom_vline(aes(xintercept = forecast_date), linetype = 2, alpha = 0.5) + - facet_wrap(~geo_value, scales = "free_y", ncol = 1L) + - scale_x_date(minor_breaks = "month", date_labels = "%b %y") + - scale_fill_brewer(palette = "Set1") + - scale_color_brewer(palette = "Set1") + - labs(x = "Date", y = "7 day average COVID-19 death rates") + - theme(legend.position = "none") -``` -
-
-```{r show-plot2, fig.width = 9, fig.height = 6, echo = FALSE}
-p2
-```
diff --git a/vignettes/backtesting.Rmd b/vignettes/backtesting.Rmd
new file mode 100644
index 000000000..84f0a811c
--- /dev/null
+++ b/vignettes/backtesting.Rmd
@@ -0,0 +1,460 @@
+---
+title: "Accurately backtesting forecasters"
+output: rmarkdown::html_vignette
+vignette: >
+  %\VignetteIndexEntry{Accurately backtesting forecasters}
+  %\VignetteEngine{knitr::rmarkdown}
+  %\VignetteEncoding{UTF-8}
+---
+
+```{r setup, include = FALSE}
+source(here::here("vignettes", "_common.R"))
+```
+
+```{r pkgs}
+library(epipredict)
+library(epidatr)
+library(data.table)
+library(dplyr)
+library(tidyr)
+library(ggplot2)
+library(magrittr)
+library(purrr)
+```
+
+# Accurately backtesting forecasters
+
+Backtesting is a crucial step in the development of forecasting models: we run
+the model on historical data to see how well it performs, which tells us how
+well it generalizes to new data and helps us identify potential issues. In
+epidemiological forecasting, accurate backtesting must account for the fact
+that the data available at the time of the forecast differed from the data we
+hold now. Reported values are continually revised as new data is collected, and
+backtesting against the revised values can make a forecaster look more (or
+less) accurate in retrospect than it would have been in real time.
+
+For this reason, it is important to use version-aware forecasting, where the
+model is trained only on data that would have been available at the time of the
+forecast. This ensures that the model is tested on data that is as close as
+possible to what would have been available in real time.
+
+The `{epiprocess}` package provides `epix_slide()`, a convenient way to perform
+version-aware forecasting: each computation sees only the data as it would have
+been available at the forecast reference time. In
+`vignette("epi_archive", package = "epiprocess")`, we introduced the concept of
+an `epi_archive` and demonstrated how to use `epix_slide()` to forecast the
+future using a simple quantile regression model. In this vignette, we will
+demonstrate how to use `epix_slide()` to backtest an auto-regressive forecaster
+on historical COVID-19 case data from the US and Canada. Instead of building a
+forecaster from scratch as we did in the previous vignette, we will use the
+`arx_forecaster()` function from the `{epipredict}` package.
+
+## Getting case data from US states into an `epi_archive`
+
+First, we download the version history (i.e., archive) of the percentage of
+doctor's visits with CLI (COVID-like illness) computed from medical insurance
+claims and the number of new confirmed COVID-19 cases per 100,000 population
+(daily) for four states from the COVIDcast API (as used in the `epiprocess`
+vignette mentioned above).
+
+```{r grab-epi-data}
+archive_cases_dv_subset
+```
+
+The data can also be fetched from the Delphi Epidata API with the following
+query and merged with `epix_merge()`:
+
+```{r, message = FALSE, warning = FALSE, eval = FALSE}
+library(epidatr)
+
+dv <- pub_covidcast(
+  source = "doctor-visits",
+  signals = "smoothed_adj_cli",
+  geo_type = "state",
+  time_type = "day",
+  geo_values = "ca,fl,ny,tx",
+  time_values = epirange(20200601, 20211201),
+  issues = epirange(20200601, 20211201)
+) %>%
+  rename(version = issue, percent_cli = value) %>%
+  as_epi_archive(compactify = TRUE)
+cases <- pub_covidcast(
+  source = "jhu-csse",
+  signals = "confirmed_7dav_incidence_prop",
+  geo_type = "state",
+  time_type = "day",
+  geo_values = "ca,fl,ny,tx",
+  time_values = epirange(20200601, 20211201),
+  issues = epirange(20200601, 20211201)
+) %>%
+  select(geo_value, time_value, version = issue, case_rate_7d_av = value) %>%
+  as_epi_archive(compactify = TRUE)
+archive_cases_dv_subset <- epix_merge(dv, cases, sync = "locf", compactify = TRUE)
+```
+
+## Backtesting a simple autoregressive forecaster
+
+To start, let's use a simple autoregressive forecaster to predict the percentage
+of doctor's visits with CLI (`percent_cli`) in the future.
+
+The `arx_forecaster()` function wraps the autoregressive forecaster we need and
+comes with sensible defaults:
+
+- we specify the predicted outcome to be the percentage of doctor's visits with
+  CLI (`percent_cli`),
+- the predictors are the 7-day average of the case rate and the percentage of
+  doctor's visits with CLI (`percent_cli`),
+- we use a linear regression model as the engine,
+- the autoregressive features assume lags of 0, 7, and 14 days,
+- we forecast 7 days ahead.
+
+All these default settings and more can be seen by calling `arx_args_list()`:
+
+```{r}
+arx_args_list()
+```
+
+These can be modified as needed by passing your desired arguments to
+`arx_forecaster(args_list = arx_args_list(...))`; for example,
+`arx_args_list(ahead = 14, lags = c(0, 7, 14, 21))` requests a 14-day-ahead
+forecast with an extra 21-day lag. For now we will use the defaults.
+
+__Note__: Unlike in the previous vignette, we will not train and forecast each
+geo location individually. Instead, we will use a __geo-pooled approach__, where
+we train the model on data from all states and territories combined. This is
+because the data is quite similar across states, and pooling the data can help
+improve the accuracy of the forecasts, while also reducing the susceptibility of
+the model to noise. In the interest of computational speed, we only use the
+four-state dataset here, but the full archive can be used in the same way and
+has performed well in the past. Implementation-wise, geo-pooling is achieved by
+simply dropping the `group_by(geo_value)` prior to `epix_slide()`.
+
+Let's run the forecaster on the latest snapshot of the data. Passing the
+archive's `versions_end` as the only entry of `.versions` makes `epix_slide()`
+produce a single forecast, using the data as of the archive's last version.
+
+```{r}
+# The .versions argument selects only the last version in the archive and
+# produces a forecast only on that date.
+forecasts <- archive_cases_dv_subset %>%
+  epix_slide(
+    ~ arx_forecaster(
+      .x,
+      outcome = "percent_cli",
+      predictors = c("case_rate_7d_av", "percent_cli")
+    )$predictions,
+    .versions = archive_cases_dv_subset$versions_end
+  )
+# Join the forecasts with the latest data available at the time of the forecast
+# to compare. Since `percent_cli` data has a few days of lag, we use
+# `tidyr::fill` to fill the missing values with the last observed value.
+forecasts %>%
+  inner_join(
+    archive_cases_dv_subset %>%
+      epix_as_of(archive_cases_dv_subset$versions_end) %>%
+      group_by(geo_value) %>%
+      tidyr::fill(percent_cli),
+    by = c("geo_value", "forecast_date" = "time_value")
+  ) %>%
+  select(geo_value, forecast_date, .pred, .pred_distn, percent_cli)
+```
+
+The resulting `epi_df` now contains two new columns: `.pred` and `.pred_distn`,
+corresponding to the point forecast (median) and the quantile distribution
+containing our requested quantile forecasts (in this case, 0.05 and 0.95),
+respectively. The forecasts look reasonable and in line with the data: the point
+forecast is close to the last observed value (joined in the table above as
+`percent_cli`), and the uncertainty bands are wide enough to capture the
+variability in the data.
+
+Now let's go ahead and slide this forecaster in a version-unaware way and a
+version-aware way. For the version-unaware way, we need to snapshot the latest
+version of the data, and then make a faux archive by setting `version =
+time_value`. This has the effect of simulating a dataset in which every value is
+final on the day it is first reported, i.e. one that is never revised. For the
+version-aware way, we will simply use the true `epi_archive` object.
+
+```{r}
+archive_cases_dv_subset_faux <- archive_cases_dv_subset %>%
+  epix_as_of(archive_cases_dv_subset$versions_end) %>%
+  mutate(version = time_value) %>%
+  as_epi_archive()
+```
+
+To reduce typing, we create the wrapper function `forecast_k_week_ahead()`.
+
+```{r arx-kweek-preliminaries, warning = FALSE}
+# Forecast dates and horizons
+forecast_dates <- seq(from = as.Date("2020-08-01"), to = as.Date("2021-11-01"), by = "1 month")
+aheads <- c(7, 14, 21, 28)
+
+# @param epi_archive The epi_archive object to forecast from
+# @param ahead The number of days ahead to forecast
+# @param outcome The outcome variable to forecast
+# @param predictors The predictors to use in the model
+# @param forecast_dates The dates to forecast on
+# @param process_data A function to process the data before forecasting
+forecast_k_week_ahead <- function(
+    epi_archive,
+    ahead = 7,
+    outcome = NULL,
+    predictors = NULL,
+    forecast_dates = NULL,
+    process_data = identity) {
+  if (is.null(forecast_dates)) {
+    forecast_dates <- epi_archive$versions_end
+  }
+  if (is.null(outcome) || is.null(predictors)) {
+    stop("Please specify the outcome and predictors.")
+  }
+  epi_archive %>%
+    epix_slide(
+      ~ arx_forecaster(
+        process_data(.x), outcome, predictors,
+        args_list = arx_args_list(ahead = ahead)
+      )$predictions %>%
+        pivot_quantiles_wider(.pred_distn),
+      .before = 120,
+      .versions = forecast_dates
+    )
+}
+```
+
+```{r}
+# Generate the forecasts and bind them together
+forecasts <- bind_rows(
+  map(aheads, ~ forecast_k_week_ahead(
+    archive_cases_dv_subset_faux,
+    ahead = .x,
+    outcome = "percent_cli",
+    predictors = c("case_rate_7d_av", "percent_cli"),
+    forecast_dates = forecast_dates
+  ) %>% mutate(version_aware = FALSE)),
+  map(aheads, ~ forecast_k_week_ahead(
+    archive_cases_dv_subset,
+    ahead = .x,
+    outcome = "percent_cli",
+    predictors = c("case_rate_7d_av", "percent_cli"),
+    forecast_dates = forecast_dates
+  ) %>% mutate(version_aware = TRUE))
+)
+```
+
+Here, `arx_forecaster()` does all the heavy lifting. It creates leads of the
+target (respecting time stamps and locations) along with lags of the features
+(here, the response and doctor's visits), estimates a forecasting model using
+the specified engine, creates predictions, and constructs non-parametric
+confidence bands.
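+
+If you need more control than `arx_args_list()` offers, the same kind of
+pipeline can be assembled by hand from epipredict's lower-level recipe/workflow
+interface. The sketch below is illustrative rather than the exact internal
+construction (see `arx_fcast_epi_workflow()` for the workflow that
+`arx_forecaster()` actually builds):
+
+```{r, eval = FALSE}
+edf <- archive_cases_dv_subset %>%
+  epix_as_of(archive_cases_dv_subset$versions_end)
+
+# Preprocessing: lagged predictors and a 7-day-ahead target
+r <- epi_recipe(edf) %>%
+  step_epi_lag(case_rate_7d_av, percent_cli, lag = c(0, 7, 14)) %>%
+  step_epi_ahead(percent_cli, ahead = 7) %>%
+  step_epi_naomit()
+
+# Postprocessing: point predictions plus residual-based quantile bands
+f <- frosting() %>%
+  layer_predict() %>%
+  layer_residual_quantiles() %>%
+  layer_add_forecast_date() %>%
+  layer_add_target_date()
+
+wf <- epi_workflow(r, linear_reg(), f) %>% fit(edf)
+predict(wf, get_test_data(r, edf))
+```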
+
+To see how the predictions compare, we plot them on top of the latest
+`percent_cli` data. Note that even though we've fitted the model on all states,
+we'll just display the results for two states, California (CA) and Florida (FL),
+to get a sense of the model performance while keeping the graphic simple.
+
+<details>
+Code for plotting + +```{r} +geo_choose <- "ca" +forecasts_filtered <- forecasts %>% + filter(geo_value == geo_choose) %>% + mutate(time_value = version) +percent_cli_data <- bind_rows( + # Snapshotted data for the version-aware forecasts + map( + forecast_dates, + ~ archive_cases_dv_subset %>% + epix_as_of(.x) %>% + mutate(version = .x) + ) %>% + bind_rows() %>% + mutate(version_aware = TRUE), + # Latest data for the version-unaware forecasts + archive_cases_dv_subset %>% + epix_as_of(archive_cases_dv_subset$versions_end) %>% + mutate(version_aware = FALSE) +) %>% + filter(geo_value == geo_choose) + +p1 <- ggplot(data = forecasts_filtered, aes(x = target_date, group = time_value)) + + geom_ribbon(aes(ymin = `0.05`, ymax = `0.95`, fill = factor(time_value)), alpha = 0.4) + + geom_line(aes(y = .pred, color = factor(time_value)), linetype = 2L) + + geom_point(aes(y = .pred, color = factor(time_value)), size = 0.75) + + geom_vline(data = percent_cli_data, aes(color = factor(version), xintercept = version), lty = 2) + + geom_line( + data = percent_cli_data, + aes(x = time_value, y = percent_cli, color = factor(version)), + inherit.aes = FALSE, na.rm = TRUE + ) + + facet_grid(version_aware ~ geo_value, scales = "free") + + scale_x_date(minor_breaks = "month", date_labels = "%b %y") + + scale_y_continuous(expand = expansion(c(0, 0.05))) + + labs(x = "Date", y = "smoothed, day of week adjusted covid-like doctors visits") + + theme(legend.position = "none") +``` + +```{r} +geo_choose <- "fl" +forecasts_filtered <- forecasts %>% + filter(geo_value == geo_choose) %>% + mutate(time_value = version) +percent_cli_data <- bind_rows( + # Snapshotted data for the version-aware forecasts + map( + forecast_dates, + ~ archive_cases_dv_subset %>% + epix_as_of(.x) %>% + mutate(version = .x) + ) %>% + bind_rows() %>% + mutate(version_aware = TRUE), + # Latest data for the version-unaware forecasts + archive_cases_dv_subset %>% + epix_as_of(archive_cases_dv_subset$versions_end) %>% + mutate(version_aware = FALSE) +) %>% + filter(geo_value == geo_choose) + +p2 <- ggplot(data = forecasts_filtered, aes(x = target_date, group = time_value)) + + geom_ribbon(aes(ymin = `0.05`, ymax = `0.95`, fill = factor(time_value)), alpha = 0.4) + + geom_line(aes(y = .pred, color = factor(time_value)), linetype = 2L) + + geom_point(aes(y = .pred, color = factor(time_value)), size = 0.75) + + geom_vline(data = percent_cli_data, aes(color = factor(version), xintercept = version), lty = 2) + + geom_line( + data = percent_cli_data, + aes(x = time_value, y = percent_cli, color = factor(version)), + inherit.aes = FALSE, na.rm = TRUE + ) + + facet_grid(version_aware ~ geo_value, scales = "free") + + scale_x_date(minor_breaks = "month", date_labels = "%b %y") + + scale_y_continuous(expand = expansion(c(0, 0.05))) + + labs(x = "Date", y = "smoothed, day of week adjusted covid-like doctors visits") + + theme(legend.position = "none") +``` +
+
+```{r show-plot1, echo=FALSE}
+p1
+p2
+```
+
+For the two states of interest, neither approach produces amazingly accurate
+forecasts. This could be because the behaviour is rather different across states
+and the effects of other notable factors such as age and public health measures
+may be important to account for in such forecasting. Including such factors as
+well as making enhancements such as correcting for outliers are some
+improvements one could make to this simple model.[^1]
+
+[^1]: Note that, despite the above caveats, simple models like this tend to
+out-perform many far more complicated models in online COVID forecasting, due
+to those models' high-variance predictions.
+
+### Example using case data from Canada
+
+<details>
+<summary>Data and forecasts. Similar to the above.</summary>
+
+By leveraging the flexibility of `epiprocess`, we can apply the same techniques
+to data from other sources. Since some collaborators are in British Columbia,
+Canada, we'll do essentially the same thing for Canada as we did above.
+
+The [COVID-19 Canada Open Data Working Group](https://opencovid.ca/) collects
+daily time series data on COVID-19 cases, deaths, recoveries, testing and
+vaccinations at the health region and province levels. Data are collected from
+publicly available sources such as government datasets and news releases.
+Unfortunately, there is no simple versioned source, so we have created our own
+from the GitHub commit history.
+
+First, we load versioned case rates at the provincial level. Because provincial
+reporting is highly variable, we smooth the case rates with a 7-day average
+before forecasting (the `smooth_cases()` helper below, applied via
+`process_data`), and we again build a version-unaware "faux" archive from the
+latest snapshot. Finally, we run the same forecasting exercise as for the
+American data, comparing version-aware and version-unaware forecasts.
+
+```{r get-can-fc, warning = FALSE}
+aheads <- c(7, 14, 21, 28)
+canada_archive <- epidatasets::can_prov_cases
+canada_archive_faux <- epix_as_of(canada_archive, canada_archive$versions_end) %>%
+  mutate(version = time_value) %>%
+  as_epi_archive()
+# This function will add the 7-day average of the case rate to the data
+# before forecasting.
+smooth_cases <- function(epi_df) {
+  epi_df %>%
+    group_by(geo_value) %>%
+    epi_slide_mean("case_rate", .window_size = 7, na.rm = TRUE) %>%
+    rename(cr_7dav = slide_value_case_rate)
+}
+forecast_dates <- seq.Date(
+  from = min(canada_archive$DT$version),
+  to = max(canada_archive$DT$version),
+  by = "1 month"
+)
+
+# Generate the forecasts, and bind them together
+canada_forecasts <- bind_rows(
+  map(
+    aheads,
+    ~ forecast_k_week_ahead(
+      canada_archive_faux,
+      ahead = .x,
+      outcome = "cr_7dav",
+      predictors = "cr_7dav",
+      forecast_dates = forecast_dates,
+      process_data = smooth_cases
+    ) %>% mutate(version_aware = FALSE)
+  ),
+  map(
+    aheads,
+    ~ forecast_k_week_ahead(
+      canada_archive,
+      ahead = .x,
+      outcome = "cr_7dav",
+      predictors = "cr_7dav",
+      forecast_dates = forecast_dates,
+      process_data = smooth_cases
+    ) %>% mutate(version_aware = TRUE)
+  )
+)
+```
+
+The figure below shows the results for a single province.
+
+```{r plot-can-fc-lr, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 12}
+geo_choose <- "Alberta"
+forecasts_filtered <- canada_forecasts %>%
+  filter(geo_value == geo_choose) %>%
+  mutate(time_value = version)
+case_rate_data <- bind_rows(
+  # Snapshotted data for the version-aware forecasts
+  map(
+    forecast_dates,
+    ~ canada_archive %>%
+      epix_as_of(.x) %>%
+      smooth_cases() %>%
+      mutate(case_rate = cr_7dav, version = .x)
+  ) %>%
+    bind_rows() %>%
+    mutate(version_aware = TRUE),
+  # Latest data for the version-unaware forecasts
+  canada_archive %>%
+    epix_as_of(canada_archive$versions_end) %>%
+    smooth_cases() %>%
+    mutate(case_rate = cr_7dav, version_aware = FALSE)
+) %>%
+  filter(geo_value == geo_choose)
+
+ggplot(data = forecasts_filtered, aes(x = target_date, group = time_value)) +
+  geom_ribbon(aes(ymin = `0.05`, ymax = `0.95`, fill = factor(time_value)), alpha = 0.4) +
+  geom_line(aes(y = .pred, color = factor(time_value)), linetype = 2L) +
+  geom_point(aes(y = .pred, color = factor(time_value)), size = 0.75) +
+  geom_vline(data = case_rate_data, aes(color = factor(version), xintercept = version), lty = 2) +
+  geom_line(
+    data = case_rate_data,
+    aes(x = time_value, y = case_rate, color = factor(version)),
+    inherit.aes = FALSE, na.rm = TRUE
+  ) +
+  facet_grid(version_aware ~ geo_value, scales = "free") +
+  scale_x_date(minor_breaks = "month", date_labels = "%b %y") +
+  scale_y_continuous(expand = expansion(c(0, 0.05))) +
+  labs(x = "Date", y = "7-day average COVID-19 case rates") +
+  theme(legend.position = "none")
+```
+
+</details>