move epiprocess vignette datasets to epidatasets; move existing from …

…tibble to epidf
cmu-delphi · Sep 26, 2024 · a42a262 · a42a262
1 parent 89de76e
commit a42a262
Show file tree

Hide file tree

Showing 15 changed files with 524 additions and 4 deletions.
diff --git a/R/epipredict-data.R b/R/epipredict-data.R
@@ -73,7 +73,7 @@
 #' ranges from June 4, 2021 to December 31, 2021.
 #' It is limited to California, Florida, Texas, New Jersey, and New York.
 #'
-#' @format A [`tibble::tibble`] (object of class `c("tbl_df", "tbl", "data.frame")`) with 1055 rows and 4 columns.
+#' @format An [`epiprocess::epi_df`] (object of class `c("epi_df", "tbl_df", "tbl", "data.frame")`) with 1055 rows and 4 columns.
 #' @section Data dictionary:
 #' The data has columns:
 #' \describe{
@@ -202,3 +202,113 @@
 #'   drop the level-specific rows.
 #' * No modifications were made to the time range of the data
 "grad_employ_subset"
+
+#' Percent CLI from different surveys, compared to ground truth COVID incidence in a subset of counties
+#'
+#' @description
+#' Data set for more than 400 US counties containing CLI
+#' (COVID-19-like-illness) incidence derived from two surveys, and a reference signal as
+#' reported by JHU CSSE. This example data is a snapshot as of September 21,
+#' 2020, and ranges from April 11, 2020 to September 01, 2020.
+#'
+#' The reference signal is based on reports made available
+#' by the Center for Systems Science and Engineering at Johns Hopkins
+#' University.
+#'
+#' One survey is ... Google
+#'
+#' One survey is ... CTIS in collaboration with Facebook.
+#'
+#' Data is reported for counties that had at least 200 cumulative COVID-19 cases
+#' on May 14, 2020, according to JHU CSSE.
+#'
+#' @format An [`epiprocess::epi_df`] (object of class `c("epi_df", "tbl_df", "tbl", "data.frame")`) with 63840 rows and 5 columns.
+#' @section Data dictionary:
+#' The data has columns:
+#' \describe{
+#'   \item{geo_value}{The 5-digit county FIPS code associated with each
+#'      row of measurements.}
+#'   \item{time_value}{The time value, a date in YYYY-MM-DD format}
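+#'   \item{goog}{Smoothed percent CLI (covid-like-illness) from the Google survey (COVIDcast `google-survey` signal `smoothed_cli`)}
+#'   \item{fb}{Smoothed percent CLI-in-community (covid-like-illness) from the Facebook survey (COVIDcast `fb-survey` signal `smoothed_hh_cmnty_cli`)}
+#'   \item{case}{Seven-day average of new confirmed COVID-19 cases per 100,000 population (COVIDcast `jhu-csse` signal `confirmed_7dav_incidence_prop`)}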
+#' }
+#' @source This object contains modified data adapted from
+#'  ...
+#'
+#' The data is licensed under the terms of the
+#' \href{url}{name}.
+#'
+#' Modifications:
+#' *
+"county_smoothed_cli_comparison"
+
+#' Daily COVID-19 case and death rates from all states in archive format
+#'
+#' @description
+#' Data set containing COVID-19 case and death rates (counts per 100000
+#' population) as reported by the Delphi API, based on reports made available
+#' by the Center for Systems Science and Engineering at Johns Hopkins
+#' University. This example data ranges from March 1, 2020 to November 30,
+#' 2021, issued monthly on the first day of each month from September 1, 2020
+#' to December 1, 2021. It includes all US states, Washington DC, Guam, Puerto
+#' Rico, and the Virgin Islands.
+#'
+#' @format An [`epiprocess::epi_archive`]. The DT attribute contains the data formatted as a [`data.table::data.table`] (object of class `c("data.table", "data.frame")`) with 72086 rows and 7 columns.
+#' @section Data dictionary:
+#' The data in the `epi_archive$DT` attribute has columns:
+#' \describe{
+#'   \item{geo_value}{the geographic value associated with each row of measurements.}
+#'   \item{time_value}{the time value associated with each row of measurements.}
+#'   \item{version}{the time value specifying the version for each row of measurements. }
+#'   \item{case_rate}{Number of new confirmed cases due to COVID-19 per 100,000 population, daily}
+#'   \item{case_rate_7d_av}{7-day average signal of number of new confirmed cases due to COVID-19 per 100,000 population, daily}
+#'   \item{death_rate}{Number of new confirmed deaths due to COVID-19 per 100,000 population, daily}
+#'   \item{death_rate_7d_av}{7-day average signal of number of new confirmed deaths due to COVID-19 per 100,000 population, daily}
+#' }
+#' @source
+#' This object contains a modified part of the \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University} as \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{republished in the COVIDcast Epidata API}. This data set is licensed under the terms of the
+#' \href{https://creativecommons.org/licenses/by/4.0/}{Creative Commons Attribution 4.0 International license}
+#' by Johns Hopkins University on behalf of its Center for Systems Science in Engineering.
+#' Copyright Johns Hopkins University 2020.
+#'
+#' Modifications:
+#' * \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{From the COVIDcast Epidata API}:  The signals `case_rate` and `death_rate` are taken directly from the JHU CSSE GitHub repo without changes, served through the Delphi API.
+#' * The signals `case_rate_7d_av` and `death_rate_7d_av` were computed from the `case_rate` and `death_rate` signals \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{from the COVIDcast Epidata API} by calculating moving averages of the preceding 7 days (with `epiprocess::epi_slide_mean()`), so the signal for June 7 is the average of the underlying data for June 1 through 7, inclusive.
+#' * Furthermore, the data has been limited to a specific time range, the
+#'   signal names slightly altered, and formatted into an `epi_archive`.
+"case_death_rate_archive"
+
+#' Daily COVID-19 doctor visits and cases from all states in archive format
+#' @description
+#' This data source is based on information about outpatient visits, provided
+#' to us by health system partners, and also contains confirmed COVID-19
+#' cases based on reports made available by the Center for Systems Science
+#' and Engineering at Johns Hopkins University. This example data ranges from
+#' June 1, 2020 to December 1, 2021, issued on dates from June 1, 2020 to December 1,
+#' 2021. It includes all US states.
+#'
+#' It is used in the {epipredict} `sliding` article.
+#'
+#' @format An [`epiprocess::epi_archive`]. The DT attribute contains the data formatted as a [`data.table::data.table`] (object of class `c("data.table", "data.frame")`) with 1514489 rows and 5 columns.
+#' @section Data dictionary:
+#' The data in the `epi_archive$DT` attribute has columns:
+#' \describe{
+#'   \item{geo_value}{the geographic value associated with each row of measurements.}
+#'   \item{time_value}{the time value associated with each row of measurements.}
+#'   \item{version}{the time value specifying the version for each row of measurements. }
+#'   \item{percent_cli}{percentage of doctor’s visits with CLI (COVID-like illness) computed from medical insurance claims}
+#'   \item{case_rate_7d_av}{7-day average signal of number of new confirmed cases due to COVID-19 per 100,000 population, daily}
+#' }
+#' @source
+#' This object contains a modified part of the \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University} as \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{republished in the COVIDcast Epidata API}. This data set is licensed under the terms of the
+#' \href{https://creativecommons.org/licenses/by/4.0/}{Creative Commons Attribution 4.0 International license}
+#' by Johns Hopkins University on behalf of its Center for Systems Science in Engineering.
+#' Copyright Johns Hopkins University 2020.
+#'
+#' Modifications:
+#' * \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/doctor-visits.html}{From the COVIDcast Doctor Visits API}: The signal `percent_cli` is taken directly from the API without changes.
+#' * \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{From the COVIDcast Epidata API}: `case_rate_7d_av` signal was computed by Delphi from the original JHU-CSSE data by calculating moving averages of the preceding 7 days, so the signal for June 7 is the average of the underlying data for June 1 through 7, inclusive.
+#' * Furthermore, the data has been limited to a specific time range, the
+#'   signal names slightly altered, and formatted into an `epi_archive`.
+"archive_cases_dv_subset_all_states"
diff --git a/R/sysdata.rda b/R/sysdata.rda
diff --git a/data-raw/archive_cases_dv_subset_all_states_dt.R b/data-raw/archive_cases_dv_subset_all_states_dt.R
@@ -0,0 +1,46 @@
+library(dplyr)
+library(epidatr)
+library(epiprocess)
+
+source(here::here("data-raw/_helper.R"))
+
+# Doctor-visits signal for all states: one row per
+# (geo_value, time_value, issue), with the issue date kept as the archive
+# `version` and the value stored as `percent_cli`.
+dv_subset <- pub_covidcast(
+  source = "doctor-visits",
+  signals = "smoothed_adj_cli",
+  time_type = "day",
+  geo_type = "state",
+  time_values = epirange(20200601, 20211201),
+  geo_values = "*",
+  issues = epirange(20200601, 20211201)
+) %>%
+  select(geo_value, time_value, version = issue, percent_cli = value) %>%
+  # We're using compactify=FALSE here and below to avoid some testthat test
+  # failures on tests that were based on a non-compactified version.
+  as_epi_archive(compactify = FALSE)
+
+# JHU-CSSE 7-day-average case incidence signal for the same time values and
+# issues, stored as `case_rate_7d_av`.
+case_rate_subset <- pub_covidcast(
+  source = "jhu-csse",
+  signals = "confirmed_7dav_incidence_prop",
+  time_type = "day",
+  geo_type = "state",
+  time_values = epirange(20200601, 20211201),
+  geo_values = "*",
+  issues = epirange(20200601, 20211201)
+) %>%
+  select(geo_value, time_value, version = issue, case_rate_7d_av = value) %>%
+  as_epi_archive(compactify = FALSE)
+
+# Use `epiprocess::epix_merge` to avoid having to reimplement `sync`ing
+# behavior. After merging, convert DT component back to tibble.
+archive_cases_dv_subset_all_states_dt <- epix_merge(
+  dv_subset, case_rate_subset,
+  sync = "locf",
+  compactify = TRUE
+)$DT %>%
+  as_tibble()
+
+# We're trying to do:
+#   usethis::use_data(archive_cases_dv_subset_all_states_dt, internal = TRUE, overwrite = TRUE)
+# but `usethis::use_data` can only store multiple objects if they're added in
+# the same call. This workaround is from
+# https://github.com/r-lib/usethis/issues/1512
+save_to_sysdata(archive_cases_dv_subset_all_states_dt, "archive_cases_dv_subset_all_states_dt")
diff --git a/data-raw/case_death_rate_archive_dt.R b/data-raw/case_death_rate_archive_dt.R
@@ -0,0 +1,80 @@
+library(dplyr)
+library(epidatr)
+library(epiprocess)
+
+source(here::here("data-raw/_helper.R"))
+
+# Passed as `geo_values` to both queries below; "*" places no restriction on
+# which state-level geo_values are requested.
+states <- "*"
+# Monthly dates: the first day of each month from 2020-09-01 through
+# 2021-12-01 (the `to` bound of 2021-12-31 only caps the sequence).
+fc_time_values <- seq(
+  from = as.Date("2020-09-01"),
+  to = as.Date("2021-12-31"),
+  by = "1 month"
+)
+
+# Daily state-level case rates from the `jhu-csse` source. Each issue is kept
+# as a separate archive `version`, and the value is stored as `case_rate`.
+# NOTE(review): `issues` starts at 2000-01-01, presumably so that every issue
+# up to 2021-12-31 is included -- confirm.
+confirmed_incidence_prop <- pub_covidcast(
+  source = "jhu-csse",
+  signals = "confirmed_incidence_prop",
+  time_type = "day",
+  geo_type = "state",
+  time_values = epirange(20200301, 20211231),
+  geo_values = states,
+  issues = epirange(20000101, 20211231)
+) %>%
+  select(geo_value, time_value, version = issue, case_rate = value) %>%
+  arrange(geo_value, time_value) %>%
+  as_epi_archive(compactify = FALSE)
+
+# Daily state-level death rates, fetched and shaped exactly like the case
+# rates above; the value is stored as `death_rate`.
+deaths_incidence_prop <- pub_covidcast(
+  source = "jhu-csse",
+  signals = "deaths_incidence_prop",
+  time_type = "day",
+  geo_type = "state",
+  time_values = epirange(20200301, 20211231),
+  geo_values = states,
+  issues = epirange(20000101, 20211231)
+) %>%
+  select(geo_value, time_value, version = issue, death_rate = value) %>%
+  arrange(geo_value, time_value) %>%
+  as_epi_archive(compactify = FALSE)
+
+# Merge the two archives with `epiprocess::epix_merge` (so the `sync`ing
+# behavior does not have to be reimplemented), then slide over the monthly
+# `fc_time_values` to add 7-day averages of the case and death rates, and
+# rebuild an archive whose `version` is the slide's reference time value.
+# NOTE(review): `before = 365000L` is presumably just "large enough to cover
+# all history" -- confirm.
+case_death_rate_archive_dt <- epix_merge(
+  confirmed_incidence_prop, deaths_incidence_prop,
+  sync = "locf"
+) %>%
+  epix_slide(
+    before = 365000L, ref_time_values = fc_time_values,
+    function(x, gk, rtv) {
+      # Per-geo trailing 7-day mean (current day plus the 6 before it) of the
+      # case rate ...
+      with_case_av <- x %>%
+        group_by(geo_value) %>%
+        epi_slide_mean(case_rate, before = 6L) %>%
+        rename(case_rate_7d_av = slide_value_case_rate)
+      # ... and then of the death rate, dropping the grouping afterwards.
+      with_case_av %>%
+        epi_slide_mean(death_rate, before = 6L) %>%
+        ungroup() %>%
+        rename(death_rate_7d_av = slide_value_death_rate)
+    }
+  ) %>%
+  rename(
+    version = time_value,
+    time_value = slide_value_time_value,
+    geo_value = slide_value_geo_value,
+    case_rate = slide_value_case_rate,
+    death_rate = slide_value_death_rate,
+    case_rate_7d_av = slide_value_case_rate_7d_av,
+    death_rate_7d_av = slide_value_death_rate_7d_av
+  ) %>%
+  as_epi_archive(compactify = TRUE)
+
+# Keep only the DT component, stored as a tibble.
+case_death_rate_archive_dt <- as_tibble(case_death_rate_archive_dt$DT)
+
+# What we would like to run is
+#   usethis::use_data(case_death_rate_archive_dt, internal = TRUE, overwrite = TRUE)
+# but `usethis::use_data` can only store several objects when they are all
+# passed in one call. Workaround taken from
+# https://github.com/r-lib/usethis/issues/1512
+save_to_sysdata(case_death_rate_archive_dt, "case_death_rate_archive_dt")
diff --git a/data-raw/county_smoothed_cli_comparison_dt.R b/data-raw/county_smoothed_cli_comparison_dt.R
@@ -0,0 +1,91 @@
+library(dplyr)
+library(epidatr)
+
+source(here::here("data-raw/_helper.R"))
+
+# Snapshot date: every query below is made `as_of` this date.
+d <- "2020-09-21"
+
+# County inclusion rule: at least `case_num` cumulative confirmed cases
+# reported for `geos_date`.
+case_num <- 200
+geos_date <- "2020-05-14"
+
+# Counties satisfying the inclusion rule. The signals fetched further down
+# are restricted to these geos.
+geo_values_initial <- pub_covidcast(
+  source = "jhu-csse",
+  signals = "confirmed_cumulative_num",
+  geo_type = "county",
+  time_type = "day",
+  geo_values = "*",
+  time_values = epirange(geos_date, geos_date),
+  as_of = d
+) %>%
+  filter(value >= case_num) %>%
+  pull(geo_value) %>%
+  unique()
+
+# Time window shared by the three county-level signals: Google and Facebook
+# % CLI-in-community, and JHU confirmed case incidence proportion.
+start_day <- "2020-04-11"
+end_day <- "2020-09-01"
+
+# Google survey signal, value column named `goog`.
+goog_sm_cli <- pub_covidcast(
+  source = "google-survey",
+  signals = "smoothed_cli",
+  geo_type = "county",
+  time_type = "day",
+  geo_values = "*",
+  time_values = epirange(start_day, end_day),
+  as_of = d
+) %>%
+  filter(geo_value %in% geo_values_initial) %>%
+  select(geo_value, time_value, goog = value)
+
+# Facebook survey signal, value column named `fb`.
+fb_survey <- pub_covidcast(
+  source = "fb-survey",
+  signals = "smoothed_hh_cmnty_cli",
+  geo_type = "county",
+  time_type = "day",
+  geo_values = "*",
+  time_values = epirange(start_day, end_day),
+  as_of = d
+) %>%
+  filter(geo_value %in% geo_values_initial) %>%
+  select(geo_value, time_value, fb = value)
+
+# JHU 7-day-average case incidence signal, value column named `case`.
+jhu_7dav_incid <- pub_covidcast(
+  source = "jhu-csse",
+  signals = "confirmed_7dav_incidence_prop",
+  geo_type = "county",
+  time_type = "day",
+  geo_values = "*",
+  time_values = epirange(start_day, end_day),
+  as_of = d
+) %>%
+  filter(geo_value %in% geo_values_initial) %>%
+  select(geo_value, time_value, case = value)
+
+# "Complete" counties are those present in all three signals (each of which
+# was already restricted to `geo_values_initial`).
+geo_values_complete <- Reduce(
+  intersect,
+  list(goog_sm_cli$geo_value, fb_survey$geo_value, jhu_7dav_incid$geo_value)
+)
+
+# Full-join the three signals on (geo_value, time_value), then keep only the
+# complete counties.
+county_smoothed_cli_comparison_dt <- goog_sm_cli %>%
+  full_join(fb_survey, by = c("geo_value", "time_value")) %>%
+  full_join(jhu_7dav_incid, by = c("geo_value", "time_value")) %>%
+  filter(geo_value %in% geo_values_complete) %>%
+  as_tibble()
+
+# What we would like to run is
+#   usethis::use_data(county_smoothed_cli_comparison_dt, internal = TRUE, overwrite = TRUE)
+# but `usethis::use_data` can only store several objects when they are all
+# passed in one call. Workaround taken from
+# https://github.com/r-lib/usethis/issues/1512
+save_to_sysdata(county_smoothed_cli_comparison_dt, "county_smoothed_cli_comparison_dt")
diff --git a/data-raw/ctis_covid_behaviours.R → data-raw/ctis_covid_behaviours_dt.R b/data-raw/ctis_covid_behaviours.R → data-raw/ctis_covid_behaviours_dt.R
@@ -1,6 +1,8 @@
 library(dplyr)
 library(epidatr)
 
+source(here::here("data-raw/_helper.R"))
+
 d <- as.Date("2024-03-20")
 
 behav_ind_mask <- pub_covidcast(
@@ -25,8 +27,13 @@ behav_ind_distancing <- pub_covidcast(
 )  %>%
   select(geo_value, time_value, distancing = value)
 
-ctis_covid_behaviours <- behav_ind_mask %>%
+ctis_covid_behaviours_dt <- behav_ind_mask %>%
   full_join(behav_ind_distancing, by = c("geo_value", "time_value")) %>%
   as_tibble()
 
-usethis::use_data(ctis_covid_behaviours, overwrite = TRUE)
+# We're trying to do:
+#   usethis::use_data(ctis_covid_behaviours_dt, internal = TRUE, overwrite = TRUE)
+# but `usethis::use_data` can only store multiple objects if they're added in
+# the same call. This workaround is from
+# https://github.com/r-lib/usethis/issues/1512
+save_to_sysdata(ctis_covid_behaviours_dt, "ctis_covid_behaviours_dt")
diff --git a/data/archive_cases_dv_subset_all_states.R b/data/archive_cases_dv_subset_all_states.R
@@ -0,0 +1,8 @@
+# Lazily build `archive_cases_dv_subset_all_states` from the internal table:
+# an `epi_archive` when `epiprocess` is available, the plain table otherwise.
+delayedAssign("archive_cases_dv_subset_all_states", local({
+  if (!requireNamespace("epiprocess", quietly = TRUE)) {
+    # Fallback: warn and hand back the stored table unchanged.
+    warning("Since the package `epiprocess` is not installed, this object will be loaded as a tibble (class `tbl_df`)")
+    epidatasets:::archive_cases_dv_subset_all_states_dt
+  } else {
+    epiprocess::as_epi_archive(
+      epidatasets:::archive_cases_dv_subset_all_states_dt,
+      compactify = TRUE
+    )
+  }
+}))
diff --git a/data/case_death_rate_archive.R b/data/case_death_rate_archive.R
@@ -0,0 +1,8 @@
+# Lazily build `case_death_rate_archive` from the internal table: an
+# `epi_archive` when `epiprocess` is available, the plain table otherwise.
+delayedAssign("case_death_rate_archive", local({
+  if (!requireNamespace("epiprocess", quietly = TRUE)) {
+    # Fallback: warn and hand back the stored table unchanged.
+    warning("Since the package `epiprocess` is not installed, this object will be loaded as a tibble (class `tbl_df`)")
+    epidatasets:::case_death_rate_archive_dt
+  } else {
+    epiprocess::as_epi_archive(
+      epidatasets:::case_death_rate_archive_dt,
+      compactify = TRUE
+    )
+  }
+}))
diff --git a/data/county_smoothed_cli_comparison.R b/data/county_smoothed_cli_comparison.R
@@ -0,0 +1,9 @@
+# Lazily build `county_smoothed_cli_comparison` from the internal table: an
+# `epi_df` with `as_of` 2020-09-21 when `epiprocess` is available, the plain
+# table otherwise.
+delayedAssign("county_smoothed_cli_comparison", local({
+  if (!requireNamespace("epiprocess", quietly = TRUE)) {
+    # Fallback: warn and hand back the stored table unchanged.
+    warning("Since the package `epiprocess` is not installed, this object will be loaded as a tibble (class `tbl_df`)")
+    epidatasets:::county_smoothed_cli_comparison_dt
+  } else {
+    epiprocess::as_epi_df(
+      epidatasets:::county_smoothed_cli_comparison_dt,
+      as_of = as.Date("2020-09-21")
+    )
+  }
+}))