docs(aggregation.Rmd): tweak readr & join usage

* Rename state_census -> state_naming.
* Provide col_types specs for all & only columns used; avoid message spam.
* Don't select unused cols; especially avoid the numeric state FIPS.
* Bump dependency on dplyr and update joins; avoid message spam.
cmu-delphi · Oct 4, 2024 · 7463475 · 7463475
1 parent 635ff4d
commit 7463475
Show file tree

Hide file tree

Showing 3 changed files with 24 additions and 11 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -35,7 +35,7 @@ Imports:
     checkmate,
     cli,
     data.table,
-    dplyr (>= 1.0.8),
+    dplyr (>= 1.1.0),
     genlasso,
     ggplot2,
     glue,

diff --git a/data-raw/jhu_csse_county_level_subset.R b/data-raw/jhu_csse_county_level_subset.R
@@ -1,8 +1,15 @@
+library(readr)
 library(epidatr)
 library(epiprocess)
 library(dplyr)
 
-y <- readr::read_csv("https://github.com/cmu-delphi/covidcast/raw/c89e4d295550ba1540d64d2cc991badf63ad04e5/Python-packages/covidcast-py/covidcast/geo_mappings/county_census.csv") %>% # nolint: line_length_linter
+y <- read_csv("https://github.com/cmu-delphi/covidcast/raw/c89e4d295550ba1540d64d2cc991badf63ad04e5/Python-packages/covidcast-py/covidcast/geo_mappings/county_census.csv", # nolint: line_length_linter
+  col_types = cols( # explicit types for the three columns filtered/selected below
+    FIPS = col_character(),
+    STNAME = col_character(),
+    CTYNAME = col_character()
+  )
+) %>%
   filter(STNAME %in% c("Massachusetts", "Vermont"), STNAME != CTYNAME) %>%
   select(geo_value = FIPS, county_name = CTYNAME, state_name = STNAME)
 
@@ -16,7 +23,7 @@ jhu_csse_county_level_subset <- pub_covidcast(
   time_values = epirange(20200601, 20211231),
 ) %>%
   select(geo_value, time_value, cases = value) %>%
-  full_join(y, by = "geo_value") %>%
+  inner_join(y, by = "geo_value", relationship = "many-to-one", unmatched = "error") %>% # NOTE(review): on an inner join unmatched = "error" checks both inputs; confirm every county in y appears in the fetched data
   as_epi_df()
 
 usethis::use_data(jhu_csse_county_level_subset, overwrite = TRUE)
diff --git a/vignettes/aggregation.Rmd b/vignettes/aggregation.Rmd
@@ -13,12 +13,19 @@ kinds of tasks with `epi_df` objects. We'll work with county-level reported
 COVID-19 cases in MA and VT.
 
 ```{r, message = FALSE, eval= FALSE, warning= FALSE}
+library(readr)
 library(epidatr)
 library(epiprocess)
 library(dplyr)
 
 # Get mapping between FIPS codes and county&state names:
-y <- readr::read_csv("https://github.com/cmu-delphi/covidcast/raw/c89e4d295550ba1540d64d2cc991badf63ad04e5/Python-packages/covidcast-py/covidcast/geo_mappings/county_census.csv") %>% # nolint: line_length_linter
+y <- read_csv("https://github.com/cmu-delphi/covidcast/raw/c89e4d295550ba1540d64d2cc991badf63ad04e5/Python-packages/covidcast-py/covidcast/geo_mappings/county_census.csv", # nolint: line_length_linter
+  col_types = cols( # build the spec with cols(), as in data-raw/jhu_csse_county_level_subset.R
+    FIPS = col_character(),
+    CTYNAME = col_character(),
+    STNAME = col_character()
+  )
+) %>%
   filter(STNAME %in% c("Massachusetts", "Vermont"), STNAME != CTYNAME) %>%
   select(geo_value = FIPS, county_name = CTYNAME, state_name = STNAME)
 
@@ -39,6 +46,7 @@ x <- pub_covidcast(
 The data contains 16,212 rows and 5 columns.
 
 ```{r, echo=FALSE, warning=FALSE, message=FALSE}
+library(readr)
 library(epidatr)
 library(epiprocess)
 library(dplyr)
@@ -108,17 +116,15 @@ help avoid bugs in further downstream data processing tasks.
 Let's first remove certain dates from our data set to create gaps:
 
 ```{r}
-state_census <- readr::read_csv("https://github.com/cmu-delphi/covidcast/raw/c89e4d295550ba1540d64d2cc991badf63ad04e5/Python-packages/covidcast-py/covidcast/geo_mappings/state_census.csv") %>% # nolint: line_length_linter
-  select(STATE, NAME, POPESTIMATE2019, ABBR) %>%
-  rename(abbr = ABBR, name = NAME, pop = POPESTIMATE2019, fips = STATE) %>%
-  mutate(abbr = tolower(abbr)) %>%
+state_naming <- read_csv("https://github.com/cmu-delphi/covidcast/raw/c89e4d295550ba1540d64d2cc991badf63ad04e5/Python-packages/covidcast-py/covidcast/geo_mappings/state_census.csv", # nolint: line_length_linter
+  col_types = cols(NAME = col_character(), ABBR = col_character()) # cols() spec for the two columns used in transmute() below
+) %>%
+  transmute(state_name = NAME, abbr = tolower(ABBR)) %>%
   as_tibble()
 
 # First make geo value more readable for tables, plots, etc.
 x <- x %>%
-  inner_join(
-    state_census %>% select(state_name = name, abbr)
-  ) %>%
+  inner_join(state_naming, by = "state_name", relationship = "many-to-one", unmatched = c("error", "drop")) %>% # every row of x must match; state_naming rows for states absent from x are dropped
   mutate(geo_value = paste(substr(county_name, 1, nchar(county_name) - 7), state_name, sep = ", ")) %>%
   select(geo_value, time_value, cases)