cmu-delphi · nmdefries · Nov 9, 2023 · Jul 5, 2023 · Jul 5, 2023 · Jul 5, 2023
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -6,7 +6,7 @@ Authors@R:
     person(c("Daniel", "J."), "McDonald", , "[email protected]", role = c("cre", "aut"))
 Description: This package contains data sets used to compile vignettes and
     other documentation in Delphi R Packages. The goal is to avoid calls
-    to the Delphi Epidata API, and deposit some examples here for easy
+    to the Delphi Epidata API, and to deposit some examples here for easy
     offline use.
 License: MIT + file LICENSE
 Depends:
@@ -15,15 +15,16 @@ Suggests:
     covidcast,
     dplyr,
     epidatr,
-    epipredict,
+    epiprocess,
     here,
+    httr,
+    jsonlite,
     lubridate,
     magrittr,
     purrr,
     readr
 Remotes:
     cmu-delphi/epidatr,
-    cmu-delphi/epipredict,
     cmu-delphi/epiprocess
 Encoding: UTF-8
 LazyData: true

diff --git a/R/epipredict-data.R b/R/epipredict-data.R
@@ -80,3 +80,26 @@
 #' by the Johns Hopkins University on behalf of its Center for Systems Science in Engineering.
 #' Copyright Johns Hopkins University 2020.
 "counts_subset"
+
+#' Canadian COVID-19 case rates
+#'
+#' Data set for all Canadian provinces and territories containing COVID-19
+#' case rates (COVID-19 cases per 100,000 people) derived from COVID-19 case
+#' counts as reported by the
+#' \href{https://opencovid.ca/}{COVID-19 Canada Open Data Working Group (CCODWG)}.
+#' Data is available both through the archived
+#' \href{https://github.com/ccodwg/Covid19Canada}{ccodwg/Covid19Canada GitHub repository}
+#' and the newer
+#' \href{https://github.com/ccodwg/CovidTimelineCanada}{ccodwg/CovidTimelineCanada GitHub repository},
+#' which also reports vaccine-related signals.
+#'
+#' This dataset contains versioned data covering the period from April 2020 to
+#' December 2021 and is used in the [epipredict] slide vignette.
+#'
+#' @source This object contains a modified part of the COVID-19 Canada Open
+#' Data Working Group's
+#' \href{https://github.com/ccodwg/Covid19Canada}{Covid19Canada data repository} (archived).
+#' This data set is licensed under the terms of the
+#' \href{https://creativecommons.org/licenses/by/4.0/}{Creative Commons Attribution 4.0 International license}
+#' by the COVID-19 Canada Open Data Working Group.
+"can_prov_cases"
diff --git a/data-raw/archive_cases_dv_subset.R → data-raw/archive_cases_dv_subset_dt.R b/data-raw/archive_cases_dv_subset.R → data-raw/archive_cases_dv_subset_dt.R
@@ -1,28 +1,30 @@
-dv_subset <- covidcast(
-  data_source = "doctor-visits",
+library(dplyr)
+library(epidatr)
+library(epiprocess)
+
+dv_subset <- pub_covidcast(
+  source = "doctor-visits",
   signals = "smoothed_adj_cli",
   time_type = "day",
   geo_type = "state",
   time_values = epirange(20200601, 20211201),
   geo_values = "ca,fl,ny,tx",
   issues = epirange(20200601, 20211201)
 ) %>%
-  fetch() %>%
   select(geo_value, time_value, version = issue, percent_cli = value) %>%
   # We're using compactify=FALSE here and below to avoid some testthat test
   # failures on tests that were based on a non-compactified version.
   as_epi_archive(compactify = FALSE)
 
-case_rate_subset <- covidcast(
-  data_source = "jhu-csse",
+case_rate_subset <- pub_covidcast(
+  source = "jhu-csse",
   signals = "confirmed_7dav_incidence_prop",
   time_type = "day",
   geo_type = "state",
   time_values = epirange(20200601, 20211201),
   geo_values = "ca,fl,ny,tx",
   issues = epirange(20200601, 20211201)
 ) %>%
-  fetch() %>%
   select(geo_value, time_value, version = issue, case_rate_7d_av = value) %>%
   as_epi_archive(compactify = FALSE)
 

diff --git a/data-raw/can_prov_cases.R b/data-raw/can_prov_cases.R
@@ -0,0 +1,148 @@
+## code to prepare `can_prov_cases` dataset goes here
+
+library(dplyr)
+library(epiprocess)
+library(readr)
+library(purrr)
+library(httr)
+library(jsonlite)
+
+
+# Look for a GitHub API token.
+# Returns an empty string "" if env variable not found.
+gh_token <- Sys.getenv("GITHUB_PAT")
+if (gh_token == "") {
+  # Try again with the secondary name.
+  gh_token <- Sys.getenv("GITHUB_TOKEN")
+}
+if (gh_token == "") {
+  warning("Token is not set or is not able to be fetched from the environment.",
+          " Proceeding without authentication, but the requests may be blocked", 
+          " due to GitHub API rate limits.")
+}
+
+# Construct a header to send with GET requests
+if (gh_token == "") {
+  # Empty header
+  auth_header <- httr::add_headers()
+} else {
+  auth_header <- httr::add_headers(Authorization = paste("Bearer", gh_token))
+}
+
+## Get list of new and modified files to download
+# The `path` field filters commits to only those that modifying the listed dir
+# From https://www.github.com/ccodwg/Covid19Canada
+BASE_URL <- "https://api.github.com/repos/ccodwg/Covid19Canada/commits?sha=%s&per_page=%s&path=timeseries_prov/cases_timeseries_prov.csv&until=%s&page=%s"
+ITEMS_PER_PAGE <- 100
+BRANCH <- "master"
+
+
+
+# We want to fetch all commits made since Mar 13 2022 (version the original
+# dataset was created from).
+#
+# Timestamp should be in ISO 8601 format. See
+# https://docs.github.com/en/rest/reference/commits#list-commits--parameters for
+# details.
+since_date <- strftime("2022-03-13", "%Y-%m-%dT%H:%M:%SZ", tz = "UTC")
+
+page <- 0
+commit_pages <- list()
+
+# Fetch list of commits from API, one page at a time. Each page contains up to
+# 100 commits. If a page contains 100 commits, assume that there are more
+# results and fetch the next page.
+while (page == 0 || nrow(commit_page) == 100) {
+  page <- page + 1
+  # Construct the URL
+  commits_url <- sprintf(BASE_URL, BRANCH, ITEMS_PER_PAGE, since_date, page)
+
+  request <- GET(commits_url, auth_header)
+  # Convert any HTTP errors to R errors automatically.
+  stop_for_status(request)
+
+  # Convert results from nested JSON/list to dataframe. If no results returned,
+  # `commit_page` will be an empty list.
+  commit_page <- content(request, as = "text") %>%
+    fromJSON(simplifyDataFrame = TRUE, flatten = TRUE) %>% 
+    # Trim message down a bit.
+    mutate(message = substr(commit.message, 1, 40)) %>% 
+    select(sha, url = commit.url, message)
+
+  # No more results are being returned.
+  if (identical(commit_page, list())) {
+    break
+  }
+
+  commit_pages[[page]] <- commit_page
+}
+
+# Combine all requested pages of commits into one dataframe
+commit_pages <- bind_rows(commit_pages)
+
+# Missing value `%s` to be filled in with a commit sha or branch name.
+BASE_DATA_URL <- "https://raw.githubusercontent.com/ccodwg/Covid19Canada/%s/timeseries_prov/cases_timeseries_prov.csv"
+
+fc_time_values <- seq(as.Date("2021-02-01"), as.Date("2021-12-01"),
+                      by = "1 month")
+commit_pages <- mutate(
+  commit_pages,
+  data_url = sprintf(BASE_DATA_URL, sha),
+  date = strsplit(message, " ") %>% map_chr(~ substr(.x[3], start=1, stop=10)) %>% as.Date()
+) %>% 
+  # select(data_url, date) %>% 
+  na.omit() %>% 
+  filter(date %in% fc_time_values)
+
+# From https://github.com/mountainMath/BCCovidSnippets/blob/main/data/prov_pop.csv
+ca_pop_url <- "https://raw.githubusercontent.com/mountainMath/BCCovidSnippets/main/data/prov_pop.csv"
+ca_pop <- read_csv(
+  ca_pop_url,
+  col_types = cols(
+    Province = col_character(),
+    shortProvince = col_character(),
+    Population = col_integer()
+  )
+) %>% 
+  rename(province = Province, abbreviation = shortProvince, population = Population)
+abbrev_map <- setNames(ca_pop$province, ca_pop$abbreviation)
+
+# Read in data and convert to `epi_df`s.
+can_prov_cases <- purrr::map2(commit_pages$data_url, commit_pages$date, function(url, date) {
+  raw <- readr::read_csv(
+    url,
+    col_types = cols(
+      province = col_character(),
+      date_report = col_character(),
+      cases = col_double(),
+      cumulative_cases = col_double()
+    )
+  )
+
+  # Raw data uses a mix of full names and abbreviations. Switch to using only full names.
+  raw$province <- case_when(
+    raw$province == "NWT" ~ abbrev_map["NT"],
+    raw$province == "PEI" ~ abbrev_map["PE"],
+    raw$province %in% ca_pop$province ~ raw$province,
+    raw$province %in% ca_pop$abbreviation ~ abbrev_map[raw$province],
+    # Mark everything else as missing. Only applies to "Repatriated" region.
+    TRUE ~ NA
+  )
+
+  result <- raw %>% 
+    mutate(time_value = lubridate::dmy(date_report)) %>%
+    left_join(ca_pop, by="province") %>%
+    filter(!is.na(province), time_value > "2020-04-01") %>%
+    mutate(geo_value = province,
+           case_rate = cases / population * 1e5) %>%
+    select(geo_value, time_value, case_rate) %>%
+    as_epi_df(geo_type = "province", as_of = date)
+
+  return(result)
+})
+names(can_prov_cases) <- commit_pages$date
+can_prov_cases <- can_prov_cases %>% bind_rows(.id = "version") %>%
+  mutate(version = lubridate::ymd(version)) %>% 
+  arrange(version)
+
+usethis::use_data(can_prov_cases, overwrite = TRUE)
diff --git a/data-raw/cancovid.R b/data-raw/cancovid.R
diff --git a/data-raw/cases_deaths_subset.R b/data-raw/cases_deaths_subset.R
@@ -1,48 +1,48 @@
-confirmed_7dav_incidence_prop <- covidcast(
-  data_source = "jhu-csse",
+library(dplyr)
+library(epidatr)
+library(epiprocess)
+
+confirmed_7dav_incidence_prop <- pub_covidcast(
+  source = "jhu-csse",
   signals = "confirmed_7dav_incidence_prop",
   time_type = "day",
   geo_type = "state",
   time_values = epirange(20200301, 20211231),
   geo_values = "ca,fl,ny,tx,ga,pa"
 ) %>%
-  fetch() %>%
   select(geo_value, time_value, case_rate_7d_av = value) %>%
   arrange(geo_value, time_value)
 
-deaths_7dav_incidence_prop <- covidcast(
-  data_source = "jhu-csse",
+deaths_7dav_incidence_prop <- pub_covidcast(
+  source = "jhu-csse",
   signals = "deaths_7dav_incidence_prop",
   time_type = "day",
   geo_type = "state",
   time_values = epirange(20200301, 20211231),
   geo_values = "ca,fl,ny,tx,ga,pa"
 ) %>%
-  fetch() %>%
   select(geo_value, time_value, death_rate_7d_av = value) %>%
   arrange(geo_value, time_value)
 
-confirmed_incidence_num <- covidcast(
-  data_source = "jhu-csse",
+confirmed_incidence_num <- pub_covidcast(
+  source = "jhu-csse",
   signals = "confirmed_incidence_num",
   time_type = "day",
   geo_type = "state",
   time_values = epirange(20200301, 20211231),
   geo_values = "ca,fl,ny,tx,ga,pa"
 ) %>%
-  fetch() %>%
   select(geo_value, time_value, cases = value) %>%
   arrange(geo_value, time_value)
 
-confirmed_7dav_incidence_num <- covidcast(
-  data_source = "jhu-csse",
+confirmed_7dav_incidence_num <- pub_covidcast(
+  source = "jhu-csse",
   signals = "confirmed_7dav_incidence_num",
   time_type = "day",
   geo_type = "state",
   time_values = epirange(20200301, 20211231),
   geo_values = "ca,fl,ny,tx,ga,pa"
 ) %>%
-  fetch() %>%
   select(geo_value, time_value, cases_7d_av = value) %>%
   arrange(geo_value, time_value)
 

diff --git a/data-raw/counts_subset.R b/data-raw/counts_subset.R
@@ -1,23 +1,25 @@
 ## code to prepare jhu_incidence_num dataset goes here
 
-x <- covidcast(
-  data_source = "jhu-csse",
+library(dplyr)
+library(epidatr)
+library(epiprocess)
+
+x <- pub_covidcast(
+  source = "jhu-csse",
   signals = "confirmed_incidence_num",
   time_type = "day",
   geo_type = "state",
   time_values = epirange(20210604, 20211231),
   geo_values = "ca,fl,tx,ny,nj") %>%
-  fetch() %>%
   select(geo_value, time_value, cases = value)
 
-y <- covidcast(
-  data_source = "jhu-csse",
+y <- pub_covidcast(
+  source = "jhu-csse",
   signals = "deaths_incidence_num",
   time_type = "day",
   geo_type = "state",
   time_values = epirange(20210604, 20211231),
   geo_values = "ca,fl,tx,ny,nj") %>%
-  fetch() %>%
   select(geo_value, time_value, deaths = value)
 
 counts_subset <- full_join(x, y, by = c("geo_value", "time_value")) %>%

diff --git a/data-raw/covid_case_death_rates.R b/data-raw/covid_case_death_rates.R
@@ -1,27 +1,29 @@
-x <- covidcast(
-  data_source = "jhu-csse",
+library(dplyr)
+library(epidatr)
+library(epiprocess)
+
+x <- pub_covidcast(
+  source = "jhu-csse",
   signals = "confirmed_7dav_incidence_prop",
   time_type = "day",
   geo_type = "state",
   time_values = epirange(20200301, 20211231),
   geo_values = "*"
 ) %>%
-  fetch() %>%
   select(geo_value, time_value, case_rate = value)
 
-y <- covidcast(
-  data_source = "jhu-csse",
+y <- pub_covidcast(
+  source = "jhu-csse",
   signals = "deaths_7dav_incidence_prop",
   time_type = "day",
   geo_type = "state",
   time_values = epirange(20200301, 20211231),
   geo_values = "*"
 ) %>%
-  fetch() %>%
   select(geo_value, time_value, death_rate = value)
 
-case_death_rate_subset <- x %>%
+covid_case_death_rates <- x %>%
   full_join(y, by = c("geo_value", "time_value")) %>%
   as_epi_df()
 
-usethis::use_data(case_death_rate_subset, overwrite = TRUE)
+usethis::use_data(covid_case_death_rates, overwrite = TRUE)