diff --git a/DESCRIPTION b/DESCRIPTION index 0fdef78..def2039 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -6,7 +6,7 @@ Authors@R: person(c("Daniel", "J."), "McDonald", , "daniel@stat.ubc.ca", role = c("cre", "aut")) Description: This package contains data sets used to compile vignettes and other documentation in Delphi R Packages. The goal is to avoid calls - to the Delphi Epidata API, and deposit some examples here for easy + to the Delphi Epidata API, and to deposit some examples here for easy offline use. License: MIT + file LICENSE Depends: @@ -15,15 +15,16 @@ Suggests: covidcast, dplyr, epidatr, - epipredict, + epiprocess, here, + httr, + jsonlite, lubridate, magrittr, purrr, readr Remotes: cmu-delphi/epidatr, - cmu-delphi/epipredict, cmu-delphi/epiprocess Encoding: UTF-8 LazyData: true diff --git a/R/epipredict-data.R b/R/epipredict-data.R index 9d78952..0663fb1 100644 --- a/R/epipredict-data.R +++ b/R/epipredict-data.R @@ -80,3 +80,26 @@ #' by the Johns Hopkins University on behalf of its Center for Systems Science in Engineering. #' Copyright Johns Hopkins University 2020. "counts_subset" + +#' Canadian COVID-19 case rates +#' +#' Data set for all Canadian provinces and territories containing COVID-19 +#' case rates (COVID-19 cases per 100,000 people) derived from COVID-19 case +#' counts as reported by the +#' \href{https://opencovid.ca/}{COVID-19 Canada Open Data Working Group (CCODWG)}. +#' Data is available both through the archived +#' \href{https://github.com/ccodwg/Covid19Canada}{ccodwg/Covid19Canada GitHub repository} +#' and the newer +#' \href{https://github.com/ccodwg/CovidTimelineCanada}{ccodwg/CovidTimelineCanada GitHub repository}, +#' which also reports vaccine-related signals. +#' +#' This dataset contains versioned data covering the period from April 2020 to +#' December 2021 and is used in the [epipredict] slide vignette. 
+#' +#' @source This object contains a modified part of the COVID-19 Canada Open +#' Data Working Group's +#' \href{https://github.com/ccodwg/Covid19Canada}{Covid19Canada data repository} (archived). +#' This data set is licensed under the terms of the +#' \href{https://creativecommons.org/licenses/by/4.0/}{Creative Commons Attribution 4.0 International license} +#' by the COVID-19 Canada Open Data Working Group. +"can_prov_cases" diff --git a/data-raw/archive_cases_dv_subset.R b/data-raw/archive_cases_dv_subset_dt.R similarity index 88% rename from data-raw/archive_cases_dv_subset.R rename to data-raw/archive_cases_dv_subset_dt.R index 3aefd93..6f505c1 100644 --- a/data-raw/archive_cases_dv_subset.R +++ b/data-raw/archive_cases_dv_subset_dt.R @@ -1,5 +1,9 @@ -dv_subset <- covidcast( - data_source = "doctor-visits", +library(dplyr) +library(epidatr) +library(epiprocess) + +dv_subset <- pub_covidcast( + source = "doctor-visits", signals = "smoothed_adj_cli", time_type = "day", geo_type = "state", @@ -7,14 +11,13 @@ dv_subset <- covidcast( geo_values = "ca,fl,ny,tx", issues = epirange(20200601, 20211201) ) %>% - fetch() %>% select(geo_value, time_value, version = issue, percent_cli = value) %>% # We're using compactify=FALSE here and below to avoid some testthat test # failures on tests that were based on a non-compactified version. 
as_epi_archive(compactify = FALSE) -case_rate_subset <- covidcast( - data_source = "jhu-csse", +case_rate_subset <- pub_covidcast( + source = "jhu-csse", signals = "confirmed_7dav_incidence_prop", time_type = "day", geo_type = "state", @@ -22,7 +25,6 @@ case_rate_subset <- covidcast( geo_values = "ca,fl,ny,tx", issues = epirange(20200601, 20211201) ) %>% - fetch() %>% select(geo_value, time_value, version = issue, case_rate_7d_av = value) %>% as_epi_archive(compactify = FALSE) diff --git a/data-raw/can_prov_cases.R b/data-raw/can_prov_cases.R new file mode 100644 index 0000000..5dce2cf --- /dev/null +++ b/data-raw/can_prov_cases.R @@ -0,0 +1,148 @@ +## code to prepare `can_prov_cases` dataset goes here + +library(dplyr) +library(epiprocess) +library(readr) +library(purrr) +library(httr) +library(jsonlite) + + +# Look for a GitHub API token. +# Returns an empty string "" if env variable not found. +gh_token <- Sys.getenv("GITHUB_PAT") +if (gh_token == "") { + # Try again with the secondary name. + gh_token <- Sys.getenv("GITHUB_TOKEN") +} +if (gh_token == "") { + warning("Token is not set or is not able to be fetched from the environment.", + " Proceeding without authentication, but the requests may be blocked", + " due to GitHub API rate limits.") +} + +# Construct a header to send with GET requests +if (gh_token == "") { + # Empty header + auth_header <- httr::add_headers() +} else { + auth_header <- httr::add_headers(Authorization = paste("Bearer", gh_token)) +} + +## Get list of new and modified files to download +# The `path` field filters commits to only those that modify the listed path +# From https://www.github.com/ccodwg/Covid19Canada +BASE_URL <- "https://api.github.com/repos/ccodwg/Covid19Canada/commits?sha=%s&per_page=%s&path=timeseries_prov/cases_timeseries_prov.csv&until=%s&page=%s" +ITEMS_PER_PAGE <- 100 +BRANCH <- "master" + + + +# We want to fetch all commits made up to Mar 13 2022 (version the original +# dataset was created from).
+# +# Timestamp should be in ISO 8601 format. See +# https://docs.github.com/en/rest/reference/commits#list-commits--parameters for +# details. +since_date <- strftime("2022-03-13", "%Y-%m-%dT%H:%M:%SZ", tz = "UTC") + +page <- 0 +commit_pages <- list() + +# Fetch list of commits from API, one page at a time. Each page contains up to +# 100 commits. If a page contains 100 commits, assume that there are more +# results and fetch the next page. +while (page == 0 || nrow(commit_page) == 100) { + page <- page + 1 + # Construct the URL + commits_url <- sprintf(BASE_URL, BRANCH, ITEMS_PER_PAGE, since_date, page) + + request <- GET(commits_url, auth_header) + # Convert any HTTP errors to R errors automatically. + stop_for_status(request) + + # Convert results from nested JSON/list to dataframe. If no results returned, + # `commit_page` will be an empty list. + commit_page <- content(request, as = "text") %>% + fromJSON(simplifyDataFrame = TRUE, flatten = TRUE) %>% + # Trim message down a bit. + mutate(message = substr(commit.message, 1, 40)) %>% + select(sha, url = commit.url, message) + + # No more results are being returned. + if (identical(commit_page, list())) { + break + } + + commit_pages[[page]] <- commit_page +} + +# Combine all requested pages of commits into one dataframe +commit_pages <- bind_rows(commit_pages) + +# Missing value `%s` to be filled in with a commit sha or branch name. 
+BASE_DATA_URL <- "https://raw.githubusercontent.com/ccodwg/Covid19Canada/%s/timeseries_prov/cases_timeseries_prov.csv" + +fc_time_values <- seq(as.Date("2021-02-01"), as.Date("2021-12-01"), + by = "1 month") +commit_pages <- mutate( + commit_pages, + data_url = sprintf(BASE_DATA_URL, sha), + date = strsplit(message, " ") %>% map_chr(~ substr(.x[3], start=1, stop=10)) %>% as.Date() +) %>% + # select(data_url, date) %>% + na.omit() %>% + filter(date %in% fc_time_values) + +# From https://github.com/mountainMath/BCCovidSnippets/blob/main/data/prov_pop.csv +ca_pop_url <- "https://raw.githubusercontent.com/mountainMath/BCCovidSnippets/main/data/prov_pop.csv" +ca_pop <- read_csv( + ca_pop_url, + col_types = cols( + Province = col_character(), + shortProvince = col_character(), + Population = col_integer() + ) +) %>% + rename(province = Province, abbreviation = shortProvince, population = Population) +abbrev_map <- setNames(ca_pop$province, ca_pop$abbreviation) + +# Read in data and convert to `epi_df`s. +can_prov_cases <- purrr::map2(commit_pages$data_url, commit_pages$date, function(url, date) { + raw <- readr::read_csv( + url, + col_types = cols( + province = col_character(), + date_report = col_character(), + cases = col_double(), + cumulative_cases = col_double() + ) + ) + + # Raw data uses a mix of full names and abbreviations. Switch to using only full names. + raw$province <- case_when( + raw$province == "NWT" ~ abbrev_map["NT"], + raw$province == "PEI" ~ abbrev_map["PE"], + raw$province %in% ca_pop$province ~ raw$province, + raw$province %in% ca_pop$abbreviation ~ abbrev_map[raw$province], + # Mark everything else as missing. Only applies to "Repatriated" region. 
+ TRUE ~ NA + ) + + result <- raw %>% + mutate(time_value = lubridate::dmy(date_report)) %>% + left_join(ca_pop, by="province") %>% + filter(!is.na(province), time_value > "2020-04-01") %>% + mutate(geo_value = province, + case_rate = cases / population * 1e5) %>% + select(geo_value, time_value, case_rate) %>% + as_epi_df(geo_type = "province", as_of = date) + + return(result) +}) +names(can_prov_cases) <- commit_pages$date +can_prov_cases <- can_prov_cases %>% bind_rows(.id = "version") %>% + mutate(version = lubridate::ymd(version)) %>% + arrange(version) + +usethis::use_data(can_prov_cases, overwrite = TRUE) diff --git a/data-raw/cancovid.R b/data-raw/cancovid.R deleted file mode 100644 index 8fe5a8a..0000000 --- a/data-raw/cancovid.R +++ /dev/null @@ -1,25 +0,0 @@ -## code to prepare `cancovid` dataset goes here - -path_to_csvs <- here::here("../../COVID-BC/Covid19Canada/updates.nosync/") -files <- list.files(path_to_csvs) -ca_as_ofs <- as.Date(substr(files, 1, 10)) %>% - intersect(fc_time_values) %>% - as.Date(origin = "1970-01-01") - -can <- purrr::map(ca_as_ofs, ~ { - readr::read_csv(here::here(path_to_csvs, paste0(.x, ".csv"))) %>% - left_join(ca_pop) %>% - mutate(time_value = lubridate::dmy(date_report)) %>% - filter(province %in% ca_pop$province, time_value > "2020-04-01") %>% - mutate(geo_value = province, - case_rate = cases / population * 1e5) %>% - select(geo_value, time_value, case_rate) %>% - as_epi_df(geo_type = "province", as_of = .x) -}) -names(can) <- ca_as_ofs -can <- can %>% bind_rows(.id = "version") %>% - mutate(version = lubridate::ymd(version)) -saveRDS(can, "inst/extdata/can_prov_cases.rds") - - -usethis::use_data(cancovid, overwrite = TRUE) diff --git a/data-raw/cases_deaths_subset.R b/data-raw/cases_deaths_subset.R index 06e2259..184a5c3 100644 --- a/data-raw/cases_deaths_subset.R +++ b/data-raw/cases_deaths_subset.R @@ -1,48 +1,48 @@ -confirmed_7dav_incidence_prop <- covidcast( - data_source = "jhu-csse", +library(dplyr) 
+library(epidatr) +library(epiprocess) + +confirmed_7dav_incidence_prop <- pub_covidcast( + source = "jhu-csse", signals = "confirmed_7dav_incidence_prop", time_type = "day", geo_type = "state", time_values = epirange(20200301, 20211231), geo_values = "ca,fl,ny,tx,ga,pa" ) %>% - fetch() %>% select(geo_value, time_value, case_rate_7d_av = value) %>% arrange(geo_value, time_value) -deaths_7dav_incidence_prop <- covidcast( - data_source = "jhu-csse", +deaths_7dav_incidence_prop <- pub_covidcast( + source = "jhu-csse", signals = "deaths_7dav_incidence_prop", time_type = "day", geo_type = "state", time_values = epirange(20200301, 20211231), geo_values = "ca,fl,ny,tx,ga,pa" ) %>% - fetch() %>% select(geo_value, time_value, death_rate_7d_av = value) %>% arrange(geo_value, time_value) -confirmed_incidence_num <- covidcast( - data_source = "jhu-csse", +confirmed_incidence_num <- pub_covidcast( + source = "jhu-csse", signals = "confirmed_incidence_num", time_type = "day", geo_type = "state", time_values = epirange(20200301, 20211231), geo_values = "ca,fl,ny,tx,ga,pa" ) %>% - fetch() %>% select(geo_value, time_value, cases = value) %>% arrange(geo_value, time_value) -confirmed_7dav_incidence_num <- covidcast( - data_source = "jhu-csse", +confirmed_7dav_incidence_num <- pub_covidcast( + source = "jhu-csse", signals = "confirmed_7dav_incidence_num", time_type = "day", geo_type = "state", time_values = epirange(20200301, 20211231), geo_values = "ca,fl,ny,tx,ga,pa" ) %>% - fetch() %>% select(geo_value, time_value, cases_7d_av = value) %>% arrange(geo_value, time_value) diff --git a/data-raw/counts_subset.R b/data-raw/counts_subset.R index 83d5461..7cf9bc1 100644 --- a/data-raw/counts_subset.R +++ b/data-raw/counts_subset.R @@ -1,23 +1,25 @@ ## code to prepare jhu_incidence_num dataset goes here -x <- covidcast( - data_source = "jhu-csse", +library(dplyr) +library(epidatr) +library(epiprocess) + +x <- pub_covidcast( + source = "jhu-csse", signals = "confirmed_incidence_num", 
time_type = "day", geo_type = "state", time_values = epirange(20210604, 20211231), geo_values = "ca,fl,tx,ny,nj") %>% - fetch() %>% select(geo_value, time_value, cases = value) -y <- covidcast( - data_source = "jhu-csse", +y <- pub_covidcast( + source = "jhu-csse", signals = "deaths_incidence_num", time_type = "day", geo_type = "state", time_values = epirange(20210604, 20211231), geo_values = "ca,fl,tx,ny,nj") %>% - fetch() %>% select(geo_value, time_value, deaths = value) counts_subset <- full_join(x, y, by = c("geo_value", "time_value")) %>% diff --git a/data-raw/covid_case_death_rates.R b/data-raw/covid_case_death_rates.R index 1f9af3d..ceab111 100644 --- a/data-raw/covid_case_death_rates.R +++ b/data-raw/covid_case_death_rates.R @@ -1,27 +1,29 @@ -x <- covidcast( - data_source = "jhu-csse", +library(dplyr) +library(epidatr) +library(epiprocess) + +x <- pub_covidcast( + source = "jhu-csse", signals = "confirmed_7dav_incidence_prop", time_type = "day", geo_type = "state", time_values = epirange(20200301, 20211231), geo_values = "*" ) %>% - fetch() %>% select(geo_value, time_value, case_rate = value) -y <- covidcast( - data_source = "jhu-csse", +y <- pub_covidcast( + source = "jhu-csse", signals = "deaths_7dav_incidence_prop", time_type = "day", geo_type = "state", time_values = epirange(20200301, 20211231), geo_values = "*" ) %>% - fetch() %>% select(geo_value, time_value, death_rate = value) -case_death_rate_subset <- x %>% +covid_case_death_rates <- x %>% full_join(y, by = c("geo_value", "time_value")) %>% as_epi_df() -usethis::use_data(case_death_rate_subset, overwrite = TRUE) +usethis::use_data(covid_case_death_rates, overwrite = TRUE) diff --git a/data-raw/covid_incidence_county_subset.R b/data-raw/covid_incidence_county_subset.R index 35dce13..cc94485 100644 --- a/data-raw/covid_incidence_county_subset.R +++ b/data-raw/covid_incidence_county_subset.R @@ -1,18 +1,22 @@ # Use covidcast::county_census to get the county and state names +library(dplyr) 
+library(covidcast) +library(epidatr) +library(epiprocess) + y <- covidcast::county_census %>% filter(STNAME %in% c("Massachusetts", "Vermont"), STNAME != CTYNAME) %>% select(geo_value = FIPS, county_name = CTYNAME, state_name = STNAME) # Fetch only counties from Massachusetts and Vermont, then append names columns as well -covid_incidence_county_subset <- covidcast( - data_source = "jhu-csse", +covid_incidence_county_subset <- pub_covidcast( + source = "jhu-csse", signals = "confirmed_incidence_num", time_type = "day", geo_type = "county", time_values = epirange(20200601, 20211231), geo_values = paste(y$geo_value, collapse = ",") ) %>% - fetch() %>% select(geo_value, time_value, cases = value) %>% full_join(y, by = "geo_value") %>% as_epi_df() diff --git a/data-raw/covid_incidence_outliers.R b/data-raw/covid_incidence_outliers.R index 3903fe6..a81bf8a 100644 --- a/data-raw/covid_incidence_outliers.R +++ b/data-raw/covid_incidence_outliers.R @@ -1,5 +1,9 @@ -incidence_num_outlier_example <- covidcast( - data_source = "jhu-csse", +library(dplyr) +library(epidatr) +library(epiprocess) + +covid_incidence_outliers <- pub_covidcast( + source = "jhu-csse", signals = "confirmed_incidence_num", time_type = "day", geo_type = "state", @@ -7,7 +11,6 @@ incidence_num_outlier_example <- covidcast( geo_values = "fl,nj", as_of = 20211028 ) %>% - fetch() %>% select(geo_value, time_value, cases = value) %>% as_epi_df() diff --git a/data-raw/ctis_covid_behaviours.R b/data-raw/ctis_covid_behaviours.R index 6ce2e65..e0336de 100644 --- a/data-raw/ctis_covid_behaviours.R +++ b/data-raw/ctis_covid_behaviours.R @@ -1,27 +1,26 @@ ## code to prepare `ctis_covid_behaviours` dataset goes here -behav_ind_mask <- covidcast( - data_source = "fb-survey", +library(dplyr) +library(epidatr) + +behav_ind_mask <- pub_covidcast( + source = "fb-survey", signals = "smoothed_wwearing_mask_7d", time_type = "day", geo_type = "state", time_values = epirange(20210604, 20211231), geo_values = "ca,fl,tx,ny,nj") 
%>% - fetch() %>% select(geo_value, time_value, masking = value) -behav_ind_distancing <- covidcast( - data_source = "fb-survey", +behav_ind_distancing <- pub_covidcast( + source = "fb-survey", signals = "smoothed_wothers_distanced_public", time_type = "day", geo_type = "state", time_values = epirange(20210604, 20211231), geo_values = "ca,fl,tx,ny,nj") %>% - fetch() %>% select(geo_value, time_value, distancing = value) -pop_dat <- state_census %>% select(abbr, pop) - ctis_covid_behaviours <- behav_ind_mask %>% full_join(behav_ind_distancing, by = c("geo_value", "time_value")) diff --git a/data-raw/jhu_confirmed_cumulative_num.R b/data-raw/jhu_confirmed_cumulative_num.R index 16b16bf..2610d69 100644 --- a/data-raw/jhu_confirmed_cumulative_num.R +++ b/data-raw/jhu_confirmed_cumulative_num.R @@ -1,13 +1,13 @@ -## code to prepare `jhu_confirmed_cumulative_num` dataset goes here +library(dplyr) +library(epidatr) -jhu_confirmed_cumulative_num <- covidcast( - data_source = "jhu-csse", +jhu_confirmed_cumulative_num <- pub_covidcast( + source = "jhu-csse", signals = "confirmed_cumulative_num", time_type = "day", geo_type = "state", time_values = epirange(20200301, 20220131), geo_values = "ca,fl,ny,tx" -) %>% fetch() - +) usethis::use_data(jhu_confirmed_cumulative_num, overwrite = TRUE) diff --git a/data-raw/state_census.R b/data-raw/state_census.R index 54f6a8f..c5ecb75 100644 --- a/data-raw/state_census.R +++ b/data-raw/state_census.R @@ -1,8 +1,10 @@ +library(dplyr) +library(covidcast) + state_census <- covidcast::state_census %>% select(STATE, NAME, POPESTIMATE2019, ABBR) %>% rename(abbr = ABBR, name = NAME, pop = POPESTIMATE2019, fips = STATE) %>% mutate(abbr = tolower(abbr)) %>% as_tibble() - usethis::use_data(state_census, overwrite = TRUE) diff --git a/data/can_prov_cases.rda b/data/can_prov_cases.rda new file mode 100644 index 0000000..40aa917 Binary files /dev/null and b/data/can_prov_cases.rda differ diff --git a/man/can_prov_cases.Rd b/man/can_prov_cases.Rd new 
file mode 100644 index 0000000..c3e5cd7 --- /dev/null +++ b/man/can_prov_cases.Rd @@ -0,0 +1,36 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/epipredict-data.R +\docType{data} +\name{can_prov_cases} +\alias{can_prov_cases} +\title{Canadian COVID-19 case rates} +\format{ +An object of class \code{epi_df} (inherits from \code{tbl_df}, \code{tbl}, \code{data.frame}) with 65299 rows and 4 columns. +} +\source{ +This object contains a modified part of the COVID-19 Canada Open +Data Working Group's +\href{https://github.com/ccodwg/Covid19Canada}{Covid19Canada data repository} (archived). +This data set is licensed under the terms of the +\href{https://creativecommons.org/licenses/by/4.0/}{Creative Commons Attribution 4.0 International license} +by the COVID-19 Canada Open Data Working Group. +} +\usage{ +can_prov_cases +} +\description{ +Data set for all Canadian provinces and territories containing COVID-19 +case rates (COVID-19 cases per 100,000 people) derived from COVID-19 case +counts as reported by the +\href{https://opencovid.ca/}{COVID-19 Canada Open Data Working Group (CCODWG)}. +Data is available both through the archived +\href{https://github.com/ccodwg/Covid19Canada}{ccodwg/Covid19Canada GitHub repository} +and the newer +\href{https://github.com/ccodwg/CovidTimelineCanada}{ccodwg/CovidTimelineCanada GitHub repository}, +which also reports vaccine-related signals. +} +\details{ +This dataset contains versioned data covering the period from April 2020 to +December 2021 and is used in the [epipredict] slide vignette. +} +\keyword{datasets}