CDCgov · beansrowning · Mar 10, 2023 · Mar 9, 2023 · Mar 9, 2023 · Mar 9, 2023
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,10 +1,10 @@
 Package: SaviR
 Type: Package
 Title: ITF Situational Awareness and Visualization
-Version: 0.2.0
+Version: 0.3.0
 Authors@R: c(
     person("Sean", "Browning", email = "[email protected]", role = c("aut", "cre")),
-    person("Kimberly", "Wong", email = "[email protected]", role = "aut"),
+    person("Kimberly", "Lockwood", email = "[email protected]", role = "aut"),
     person("Nartlada", "Chantharojwong", email = "[email protected]", role = "aut"),
     person("James", "Fuller", role = "aut"),
     person("Dante", "Bugli", role = "ctb"),
@@ -39,7 +39,7 @@ Imports:
     bit64
 Encoding: UTF-8
 LazyData: true
-RoxygenNote: 7.2.0
+RoxygenNote: 7.2.1
 Roxygen: list(markdown = TRUE)
 Suggests: 
     rmarkdown,

diff --git a/R/get_combined_table.R b/R/get_combined_table.R
@@ -1,36 +1,61 @@
 #' @title A function to retrieve a dataframe (df) with combined Case/Death/Vaccine data by country
 #'
-#' @param type (character) Specifies whether df should include disaggregated China data ("Both" separates China, Taiwan, Hong Kong, and Macau data) or combined China data ("WHO" combines China, Taiwan, Hong Kong, and Macau data as China)
+#' @param type (character) Specifies what data streams to include for case/death data. See details for further information
+#' whether df should include disaggregated China data ("Both" separates China, Taiwan, Hong Kong, and Macau data) or combined China data ("WHO" combines China, Taiwan, Hong Kong, and Macau data as China)
 #' @param geometry (logical, default: FALSE) Specifies whether df should include the geometry column
 #'
 #' @returns Returns an object of class \code{data.frame} with n rows and 56(57, if \code{geometry = TRUE}) columns
 #'
 #' @seealso [get_covid_df()], [get_vax()], and [calc_add_risk()] for full column data documentation
 #'
+#' @details 
+#' The `type` argument used to take two values: "WHO" and "Both", referring to whether to take WHO data as-is, or to supplement WHO data with disaggregated China data from JHU.
+#' In early Jan 2023, China CDC ceased providing daily COVID-19 updates, so the Mainland China data provided by JHU also stopped. On Mar 10, 2023 JHU closed their dashboard entirely,
+#' so new sources had to be located for HK, Macau, and Taiwan data.
+#' 
+#' For legacy analyses, the old behavior for "Both" is now available as "legacy"
+#' 
+#' The new "Both" type pulls data from HK CHP, Taiwan CDC, and JHU (for Macau data thru Mar 10) in addition to the China data in WHO (which also includes Taiwan, HK, and Macau data).
+#' Because data from HK and Taiwan are duplicated in this way, you should not use data from the "Both" option to compute regional or global trends.
 #' @examples
 #' \dontrun{
 #' # Get the df that combines China with Taiwan, Hong Kong, and Macau data
 #' df_who <- get_combined_table("WHO")
 #' print(df_who)
-#' # Get the df that uses both disagreggated China, Taiwan, Hong Kong, and Macau data (WHO + JHU= "Both")
+#' # Get the df that combines WHO China data (aggregated) with disggregated entries for HK, Taiwan, and Macau (from JHU thru Mar 10, 2023)
 #' df_both <- get_combined_table("Both")
 #' print(df_both)
-#'
-#' # get_combined_table() is identical to the following sequence:
-#' onetable %>%
-#'   select(-geometry) %>% # In the case that geometry = FALSE
-#'   right_join(get_covid_df(), by = "iso2code") %>%
-#'   filter(source == "WHO") %>% # In the case of type = "WHO"
-#'   # filter(!(country == "China" & source == "WHO")) %>% # In the case of type = "Both"
-#'   calc_add_risk() %>%
-#'   left_join(get_vax(), by = c("id", "date"))
+#' # Get the df that uses both disaggregated China, Taiwan, Hong Kong, and Macau data (WHO + JHU = "legacy")
+#' # (JHU sunset on Mar 10, 2023 and China mainland data ceased earlier in the year)
+#' df_both <- get_combined_table("legacy")
+#' print(df_both)
 #' }
+#' @md
 #' @export
 
-get_combined_table <- function(type = c("WHO", "Both"), geometry = FALSE) {
+get_combined_table <- function(type = c("WHO", "Both", "legacy"), geometry = FALSE) {
   type <- match.arg(type)
+
+  case_death_df <- switch(
+    type,
+    WHO = get_covid_df("WHO"),
+    Both = get_covid_df("WHO+Primary"),
+    legacy = get_covid_df("WHO+JHU")
+  )
+
+  if (type == "legacy") {
+    # How "Both" used to work before data stopped flowing in:
+    # - WHO data for everything except for China where we use JHU to replace
+    #   china mainland data, HK, Macau, and Taiwan.
+    # As of 3/10/2023, these data won't be updated by JHU, and China mainland data
+    # haven't been updating since early Jan 2023 in JHU.
+    # But I'll leave in for historical analyses.
+    case_death_df <- filter(
+      case_death_df,
+      !(country == "China" & source == "WHO")
+    )
+  }
 
-  case_death_df <- get_covid_df()
   vax_df <- get_vax()
   meta_df <- onetable
 
@@ -45,17 +70,10 @@ get_combined_table <- function(type = c("WHO", "Both"), geometry = FALSE) {
   out <- meta_df %>%
     right_join(case_death_df, by = "iso2code")
 
-  # If we want Taiwan / HK / Macau, remove china estimates
-  # and keep JHU
-  if (type == "Both") {
-    out <- filter(out, !(country == "China" & source == "WHO"))
-  } else {
-    # If we only want WHO data, remove the JHU rows
-    out <- filter(out, source == "WHO")
-  }
-
   out <- out %>%
     calc_add_risk() %>%
+    # BUG: I'm not sure we want this as a left_join
+    # but I don't want to break everything by switching it to full
     left_join(vax_df, by = c("id", "date")) %>%
     calc_vax_carryforward()
 

diff --git a/R/get_covid_sources.R b/R/get_covid_sources.R
@@ -4,25 +4,62 @@
 #' @description Get and prepare COVID data.
 #'
 #' Pull in current case and death counts from WHO source.
-#' For disaggregated China, Taiwan, Hong Kong, and Macau data we pull from John Hopkins source.
+#' For disaggregated China, Taiwan, Hong Kong, and Macau data we pull from JHU or primary sources.
 #'
+#' @param sources one of "all", "WHO", "WHO+JHU", "WHO+Primary" specifying the data sources to pull from. See details.
+#'
+#' @details
+#' In legacy versions, the default was to pull "all" sources, which included the WHO case/death time-series and JHU data for China Mainland, HK, Macau, and Taiwan.
+#' Due to sun-setting and changes in reporting, we now capture HK and Taiwan data from primary sources ("WHO+Primary"). Note that this also includes JHU data on Macau
+#' which will be reported thru Mar 10, 2023 when JHU closes their dashboard.
 #'
 #' @return Returns a data frame with n rows and 8 columns, including:
 #' \itemize{
 #'   \item{\code{date}}{  date Date of observation}
 #'   \item{\code{iso2code}}{  character ISO 3166-1 alpha-2 country code}
-#'   \item{\code{country}}{  character WHO/JHU english country name}
+#'   \item{\code{country}}{  character WHO english country name}
 #'   \item{\code{new_cases}}{  integer Number of new cases reported on date}
 #'   \item{\code{cumulative_cases}}{  integer Number of cumulative cases to date}
 #'   \item{\code{new_deaths}}{  integer Number of new deaths reported on date}
 #'   \item{\code{cumulative_deaths}}{  integer Number of cumulative deaths to date}
-#'   \item{\code{source}}{  character Data Source (JHU, WHO)}
+#'   \item{\code{source}}{  character Data Source}
 #' }
 #' @import dplyr
 #' @importFrom data.table fread
 #' @export
 
-get_covid_df <- function() {
+get_covid_df <- function(sources = c("all", "WHO", "WHO+JHU", "WHO+Primary")) {
+  sources <- match.arg(sources)
+
+  out <- get_who_data()
+
+  if (sources == "WHO") {
+    return(out)
+  }
+
+  jhu_data <- get_jhu_data()
+  out <- bind_rows(out, jhu_data)
+
+  if (sources == "WHO+JHU") {
+    return(out)
+  }
+
+  hk_data <- get_hk_data()
+  tw_data <- get_taiwan_data()
+
+  out <- bind_rows(out, hk_data, tw_data)
+
+  # Keep only Macau data from JHU if we want primary sources + WHO
+  # else, keep all of it
+  if (sources %in% c("WHO+Primary")) {
+    out <- out |>
+      filter(!(source == "JHU" & country %in% c("Hong Kong", "China", "Taiwan")))
+  }
+
+  return(out)
+}
+
+get_who_data <- function() {
   who_data <- fread(datasource_lk$who_all, stringsAsFactors = FALSE, encoding = "UTF-8") %>%
     rename_all(tolower) %>%
     rename(iso2code = country_code) %>%
@@ -43,6 +80,10 @@ get_covid_df <- function() {
     ) %>%
     select(-who_region)
 
+  return(who_data)
+}
+
+get_jhu_data <- function() {
   jhu_cases <- fread(datasource_lk$jhu_case, stringsAsFactors = FALSE, check.names = FALSE) %>%
     rename_all(tolower) %>%
     filter(`country/region` %in% c("Taiwan*", "China")) %>%
@@ -100,7 +141,110 @@ get_covid_df <- function() {
     ) %>%
     arrange(country, date)
 
-  df <- bind_rows(who_data, jhu_data)
+  return(jhu_data)
+}
+
+get_hk_data <- function() {
+  hk_data_raw <- fread(datasource_lk$hk_case_deaths, stringsAsFactors = FALSE, encoding = "UTF-8", data.table = FALSE, check.names = FALSE) |>
+    as_tibble()
+
+  hk_data_raw[["pcr_and_rat"]] <- rowSums(
+    hk_data_raw[, c("Number of cases tested positive for SARS-CoV-2 virus by nucleic acid tests", "Number of cases tested positive for SARS-CoV-2 virus by rapid antigen tests")],
+    na.rm = TRUE
+  )
+
+  hk_data <- hk_data_raw |>
+    mutate(
+      date = as.Date(`As of date`, "%d/%m/%Y"),
+      iso2code = "HK",
+      country = "Hong Kong",
+      source = "HK CHP",
+      # Number of confirmed cases used to be used
+      # prior to Omicron wave, but was replaced by
+      # the two other vars that stratified by PCR or RAT pos
+      cumulative_cases = case_when(
+        !is.na(`Number of confirmed cases`) ~ as.numeric(`Number of confirmed cases`),
+        pcr_and_rat != 0 ~ pcr_and_rat,
+        TRUE ~ NA_real_
+      )
+    ) |>
+    rename(cumulative_deaths = `Number of death cases`) |>
+    # Cumultive case reporting stopped for some reason
+    # so we need to fill downwards to continue it
+    arrange(date) |>
+    tidyr::fill(cumulative_cases, cumulative_deaths) |>
+    mutate(
+      # Started tracking new deaths via this variable in Jan 2023
+      cumulative_deaths = if_else(
+        !is.na(`Number of death cases related to COVID-19`),
+        as.double(cumulative_deaths + cumsum(tidyr::replace_na(`Number of death cases related to COVID-19`, 0))),
+        as.double(cumulative_deaths)
+      ),
+      # Started tracking new cases via this variable in Jan 2023
+      cumulative_cases = if_else(
+        !is.na(`Number of positive nucleic acid test laboratory detections`),
+        as.double(cumulative_cases + cumsum(tidyr::replace_na(`Number of positive nucleic acid test laboratory detections`, 0))),
+        as.double(cumulative_cases)
+      ),
+      new_cases = cumulative_cases - lag(cumulative_cases, default = 0),
+      new_deaths = cumulative_deaths - lag(cumulative_deaths, default = 0)
+    ) |>
+    select(date, iso2code, country, new_cases, cumulative_cases, new_deaths, cumulative_deaths, source)
+
+  return(hk_data)
+}
+
+get_taiwan_data <- function() {
+  tw_case_raw <- data.table::fread(
+    datasource_lk$taiwan_cases,
+    encoding = "UTF-8",
+    data.table = FALSE,
+    check.names = FALSE
+  )
+
+  tw_death_raw <- data.table::fread(
+    datasource_lk$taiwan_deaths,
+    encoding = "UTF-8",
+    data.table = FALSE,
+    check.names = FALSE
+  )
+
+  tw_cases <- tw_case_raw |>
+    rename(
+      date = `個案研判日`,
+      cases = `確定病例數`
+    ) |>
+    mutate(
+      date = as.Date(date, "%Y/%m/%d")
+    ) |>
+    group_by(date) |>
+    summarise(
+      new_cases = sum(cases, na.rm = TRUE)
+    ) |>
+    ungroup() |>
+    arrange(date) |>
+    mutate(cumulative_cases = cumsum(new_cases))
+
+  tw_deaths <- tw_death_raw |>
+    rename(
+      date = `發病日`,
+      deaths = `死亡病例數`
+    ) |>
+    mutate(date = as.Date(date, "%Y/%m/%d")) |>
+    group_by(date) |>
+    summarise(new_deaths = sum(deaths, na.rm = TRUE)) |>
+    arrange(date) |>
+    mutate(cumulative_deaths = cumsum(new_deaths))
+
+  tw_data <- full_join(
+    tw_cases, tw_deaths,
+    by = "date"
+  ) |>
+    mutate(
+      iso2code = "TW",
+      country = "Taiwan",
+      source = "Taiwan CDC"
+    )
 
-  return(df)
+  return(tw_data)
 }
diff --git a/R/get_testing.R b/R/get_testing.R
@@ -68,7 +68,7 @@ get_owid_testing_long <- function(find_maxgap = 31, flag_test_increase = 5) {
     stop("Check testing dataset in get_testing_long() -- multiple values per country-date")
   }
 
-  full_OWID <- data.table::fread(datasource_lk$owid_all, data.table = F, showProgress = F, verbose = F) %>%
+  full_OWID <- data.table::fread(datasource_lk$owid_all, data.table = F, showProgress = F, verbose = F, encoding = "UTF-8") %>%
     rename(id = iso_code) %>%
     mutate(date = as.Date(date)) %>%
     mutate(id = recode(id, "OWID_KOS" = "XKX")) %>%

diff --git a/R/lookups.R b/R/lookups.R
@@ -139,7 +139,14 @@ manual_iso3_lk <- list(
 #' to be updated as needed.
 datasource_lk <- list(
   # OWID cases and deaths
-  owid_all = "https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv",
+  # SB Note: Beginning Mar 8, 2023 OWID has ceased pulling from JHU
+  # and JHU will cease operations itself on Mar 10, 2023. This will contain legacy data
+  owid_all = "https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data-old.csv",
+  # HK Cases and Deaths
+  hk_case_deaths = "http://www.chp.gov.hk/files/misc/latest_situation_of_reported_cases_covid_19_eng.csv",
+  # Taiwan Cases and Deaths
+  taiwan_cases = "https://data.cdc.gov.tw/en/download?resourceid=a65c7cb5-8a3c-4859-a27a-9019f65dd66e&dataurl=https://od.cdc.gov.tw/eic/Day_Confirmation_Age_County_Gender_19CoV.csv",
+  taiwan_deaths = "https://data.cdc.gov.tw/en/download?resourceid=a12dfeba-0dea-4b3f-b1b0-1bf3524b3ca9&dataurl=https://od.cdc.gov.tw/eic/open_data_death_date_statistics_19CoV_5.csv",
   # OWID Testing dataset
   owid_testing = "https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/testing/covid-testing-all-observations.csv",
   # Testing data and metadata from FIND

diff --git a/man/datasource_lk.Rd b/man/datasource_lk.Rd
diff --git a/man/get_combined_table.Rd b/man/get_combined_table.Rd