From 746347560cf70f852bf370e7bf4055fb7425b62a Mon Sep 17 00:00:00 2001
From: "Logan C. Brooks"
Date: Fri, 4 Oct 2024 08:58:40 -0700
Subject: [PATCH] docs(aggregation.Rmd): tweak readr & join usage

* Rename state_census -> state_naming.
* Provide col_types specs (via `cols()`) for all & only columns used; avoid
  message spam.
* Don't select unused cols; especially avoid the numeric state FIPS.
* Bump dependency on dplyr and update joins; avoid message spam.
---
 DESCRIPTION                             |  2 +-
 data-raw/jhu_csse_county_level_subset.R | 11 +++++++++--
 vignettes/aggregation.Rmd               | 22 ++++++++++++++--------
 3 files changed, 24 insertions(+), 11 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 3da259cf..3b8f9a68 100755
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -35,7 +35,7 @@ Imports:
     checkmate,
     cli,
     data.table,
-    dplyr (>= 1.0.8),
+    dplyr (>= 1.1.0),
     genlasso,
     ggplot2,
     glue,
diff --git a/data-raw/jhu_csse_county_level_subset.R b/data-raw/jhu_csse_county_level_subset.R
index e28ba66b..4d27c20e 100644
--- a/data-raw/jhu_csse_county_level_subset.R
+++ b/data-raw/jhu_csse_county_level_subset.R
@@ -1,8 +1,15 @@
+library(readr)
 library(epidatr)
 library(epiprocess)
 library(dplyr)
 
-y <- readr::read_csv("https://github.com/cmu-delphi/covidcast/raw/c89e4d295550ba1540d64d2cc991badf63ad04e5/Python-packages/covidcast-py/covidcast/geo_mappings/county_census.csv") %>% # nolint: line_length_linter
+y <- read_csv("https://github.com/cmu-delphi/covidcast/raw/c89e4d295550ba1540d64d2cc991badf63ad04e5/Python-packages/covidcast-py/covidcast/geo_mappings/county_census.csv", # nolint: line_length_linter
+  col_types = cols(
+    FIPS = col_character(),
+    STNAME = col_character(),
+    CTYNAME = col_character()
+  )
+) %>%
   filter(STNAME %in% c("Massachusetts", "Vermont"), STNAME != CTYNAME) %>%
   select(geo_value = FIPS, county_name = CTYNAME, state_name = STNAME)
 
@@ -16,7 +23,7 @@ jhu_csse_county_level_subset <- pub_covidcast(
   time_values = epirange(20200601, 20211231),
 ) %>%
   select(geo_value, time_value, cases = value) %>%
-  full_join(y, by = "geo_value") %>%
+  inner_join(y, by = "geo_value", relationship = "many-to-one", unmatched = "error") %>%
   as_epi_df()
 
 usethis::use_data(jhu_csse_county_level_subset, overwrite = TRUE)
diff --git a/vignettes/aggregation.Rmd b/vignettes/aggregation.Rmd
index 1350faac..0a0e755e 100644
--- a/vignettes/aggregation.Rmd
+++ b/vignettes/aggregation.Rmd
@@ -13,12 +13,19 @@ kinds of tasks with `epi_df` objects.
 We'll work with county-level reported COVID-19 cases in MA and VT.
 
 ```{r, message = FALSE, eval= FALSE, warning= FALSE}
+library(readr)
 library(epidatr)
 library(epiprocess)
 library(dplyr)
 
 # Get mapping between FIPS codes and county&state names:
-y <- readr::read_csv("https://github.com/cmu-delphi/covidcast/raw/c89e4d295550ba1540d64d2cc991badf63ad04e5/Python-packages/covidcast-py/covidcast/geo_mappings/county_census.csv") %>% # nolint: line_length_linter
+y <- read_csv("https://github.com/cmu-delphi/covidcast/raw/c89e4d295550ba1540d64d2cc991badf63ad04e5/Python-packages/covidcast-py/covidcast/geo_mappings/county_census.csv", # nolint: line_length_linter
+  col_types = cols(
+    FIPS = col_character(),
+    CTYNAME = col_character(),
+    STNAME = col_character()
+  )
+) %>%
   filter(STNAME %in% c("Massachusetts", "Vermont"), STNAME != CTYNAME) %>%
   select(geo_value = FIPS, county_name = CTYNAME, state_name = STNAME)
 
@@ -39,6 +46,7 @@ x <- pub_covidcast(
 The data contains 16,212 rows and 5 columns.
 
 ```{r, echo=FALSE, warning=FALSE, message=FALSE}
+library(readr)
 library(epidatr)
 library(epiprocess)
 library(dplyr)
@@ -108,17 +116,15 @@ help avoid bugs in further downstream data processing tasks.
 Let's first remove certain dates from our data set to create gaps:
 
 ```{r}
-state_census <- readr::read_csv("https://github.com/cmu-delphi/covidcast/raw/c89e4d295550ba1540d64d2cc991badf63ad04e5/Python-packages/covidcast-py/covidcast/geo_mappings/state_census.csv") %>% # nolint: line_length_linter
-  select(STATE, NAME, POPESTIMATE2019, ABBR) %>%
-  rename(abbr = ABBR, name = NAME, pop = POPESTIMATE2019, fips = STATE) %>%
-  mutate(abbr = tolower(abbr)) %>%
+state_naming <- read_csv("https://github.com/cmu-delphi/covidcast/raw/c89e4d295550ba1540d64d2cc991badf63ad04e5/Python-packages/covidcast-py/covidcast/geo_mappings/state_census.csv", # nolint: line_length_linter
+  col_types = cols(NAME = col_character(), ABBR = col_character())
+) %>%
+  transmute(state_name = NAME, abbr = tolower(ABBR)) %>%
   as_tibble()
 
 # First make geo value more readable for tables, plots, etc.
 x <- x %>%
-  inner_join(
-    state_census %>% select(state_name = name, abbr)
-  ) %>%
+  inner_join(state_naming, by = "state_name", relationship = "many-to-one", unmatched = "error") %>%
   mutate(geo_value = paste(substr(county_name, 1, nchar(county_name) - 7), state_name, sep = ", ")) %>%
   select(geo_value, time_value, cases)