From 4a6f7bfb116148221d46b9f28d979b8bf7138816 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Wed, 5 Jun 2024 23:47:14 -0500 Subject: [PATCH 01/19] basic auto-naming --- NAMESPACE | 1 + R/epi_df.R | 27 ++++++++++++++++++++++++++- man/as_epi_df.Rd | 10 +++++++++- tests/testthat/test-epi_df.R | 16 ++++++++++++++++ 4 files changed, 52 insertions(+), 2 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index b4cafc83..edbaf137 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -100,6 +100,7 @@ importFrom(checkmate,test_subset) importFrom(checkmate,vname) importFrom(cli,cat_line) importFrom(cli,cli_abort) +importFrom(cli,cli_inform) importFrom(cli,cli_vec) importFrom(cli,cli_warn) importFrom(cli,format_message) diff --git a/R/epi_df.R b/R/epi_df.R index f4df1604..6ac1b7db 100644 --- a/R/epi_df.R +++ b/R/epi_df.R @@ -256,9 +256,34 @@ as_epi_df.epi_df <- function(x, ...) { #' (stored in its attributes); if this fails, then the current day-time will #' be used. #' @importFrom rlang .data +#' @importFrom cli cli_inform #' @export as_epi_df.tbl_df <- function(x, geo_type, time_type, as_of, - additional_metadata = list(), ...) { + additional_metadata = list(), + substitutions = NULL, ...) { + # possible standard substitutions + if (!("time_value" %in% names(x))) { + if (("forecast_date" %in% names(x)) && ("target_date" %in% names(x))) { + cli_abort("both `forecast_date` and `target_date` are present without a `time_value` +column, so it is ambiguous which to choose as `time_value`.") + } + name_substitutions <- substitutions %||% c( + time_value = "date", + time_value = "forecast_date", + time_value = "target_date", + time_value = "dates", + time_value = "time_values", + time_value = "forecast_dates", + time_value = "target_dates" + ) + x <- tryCatch(x %>% rename(any_of(name_substitutions)), + error = function(cond) { + cli_abort("There are multiple `time_value` candidate columns. +Either `rename` on yourself or drop some.") + } + ) + cli_inform("inferring `time_value` column.") + } if (!test_subset(c("geo_value", "time_value"), names(x))) { cli_abort( "Columns `geo_value` and `time_value` must be present in `x`." diff --git a/man/as_epi_df.Rd b/man/as_epi_df.Rd index 40c0a1c5..82bd8882 100644 --- a/man/as_epi_df.Rd +++ b/man/as_epi_df.Rd @@ -12,7 +12,15 @@ as_epi_df(x, ...) \method{as_epi_df}{epi_df}(x, ...) -\method{as_epi_df}{tbl_df}(x, geo_type, time_type, as_of, additional_metadata = list(), ...) +\method{as_epi_df}{tbl_df}( + x, + geo_type, + time_type, + as_of, + additional_metadata = list(), + substitutions = NULL, + ... +) \method{as_epi_df}{data.frame}(x, geo_type, time_type, as_of, additional_metadata = list(), ...) diff --git a/tests/testthat/test-epi_df.R b/tests/testthat/test-epi_df.R index 8cfb4408..8ffc9820 100644 --- a/tests/testthat/test-epi_df.R +++ b/tests/testthat/test-epi_df.R @@ -46,6 +46,22 @@ test_that("as_epi_df errors when additional_metadata is not a list", { ) }) +test_that("as_epi_df works for nonstandard input", { + tib <- tibble::tibble( + x = 1:10, y = 1:10, + date = rep(seq(as.Date("2020-01-01"), by = 1, length.out = 5), times = 2), + geo_value = rep(c("ca", "hi"), each = 5) + ) + expect_message(expect_no_error(tib_epi_df <- tib %>% as_epi_df())) + + tib <- tib %>% rename(forecast_date = date) + expect_message(expect_no_error(tib_epi_df <- tib %>% as_epi_df())) + tib %>% rename(any_of(name_substitutions)) + + tib <- tib %>% mutate(target_date = 20 + forecast_date) + expect_error(tib_epi_df <- tib %>% as_epi_df()) +}) + # select fixes tib <- tibble::tibble( From 284daaf93465ea92f58f4631da339a6c866150c6 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 7 Jun 2024 16:04:39 -0500 Subject: [PATCH 02/19] geo_value and version, separate functions, more ex --- NAMESPACE | 1 + R/archive.R | 3 + R/epi_df.R | 29 ++-------- R/utils.R | 106 ++++++++++++++++++++++++++++++++++ man/as_epi_df.Rd | 2 +- man/guess_time_column_name.Rd | 12 ++++ man/upcase_snake_case.Rd | 16 +++++ 7 files changed, 144 insertions(+), 25 deletions(-) create mode 100644 man/guess_time_column_name.Rd create mode 100644 man/upcase_snake_case.Rd diff --git a/NAMESPACE b/NAMESPACE index edbaf137..f0b01e82 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -187,6 +187,7 @@ importFrom(tibble,as_tibble) importFrom(tibble,new_tibble) importFrom(tibble,validate_tibble) importFrom(tidyr,unnest) +importFrom(tidyselect,any_of) importFrom(tidyselect,eval_select) importFrom(tidyselect,starts_with) importFrom(tsibble,as_tsibble) diff --git a/R/archive.R b/R/archive.R index 464d68ef..8488cbfa 100644 --- a/R/archive.R +++ b/R/archive.R @@ -456,6 +456,9 @@ as_epi_archive <- function( clobberable_versions_start = NULL, versions_end = NULL) { assert_data_frame(x) + x <- guess_time_column_name(x) + x <- guess_geo_column_name(x) + x <- guess_version_column_name(x) if (!test_subset(c("geo_value", "time_value", "version"), names(x))) { cli_abort( "Columns `geo_value`, `time_value`, and `version` must be present in `x`." diff --git a/R/epi_df.R b/R/epi_df.R index 6ac1b7db..712c4b0e 100644 --- a/R/epi_df.R +++ b/R/epi_df.R @@ -256,34 +256,15 @@ as_epi_df.epi_df <- function(x, ...) { #' (stored in its attributes); if this fails, then the current day-time will #' be used. #' @importFrom rlang .data +#' @importFrom tidyselect any_of #' @importFrom cli cli_inform #' @export as_epi_df.tbl_df <- function(x, geo_type, time_type, as_of, additional_metadata = list(), - substitutions = NULL, ...) { - # possible standard substitutions - if (!("time_value" %in% names(x))) { - if (("forecast_date" %in% names(x)) && ("target_date" %in% names(x))) { - cli_abort("both `forecast_date` and `target_date` are present without a `time_value` -column, so it is ambiguous which to choose as `time_value`.") - } - name_substitutions <- substitutions %||% c( - time_value = "date", - time_value = "forecast_date", - time_value = "target_date", - time_value = "dates", - time_value = "time_values", - time_value = "forecast_dates", - time_value = "target_dates" - ) - x <- tryCatch(x %>% rename(any_of(name_substitutions)), - error = function(cond) { - cli_abort("There are multiple `time_value` candidate columns. -Either `rename` on yourself or drop some.") - } - ) - cli_inform("inferring `time_value` column.") - } + ...) { + # possible standard substitutions for time_value + x <- guess_time_column_name(x) + x <- guess_geo_column_name(x) if (!test_subset(c("geo_value", "time_value"), names(x))) { cli_abort( "Columns `geo_value` and `time_value` must be present in `x`." diff --git a/R/utils.R b/R/utils.R index b2098e9b..39f97a36 100644 --- a/R/utils.R +++ b/R/utils.R @@ -448,6 +448,112 @@ guess_time_type <- function(time_value) { return("custom") } +#' given a vector of characters, add the same values, but upcased, e.g. +#' "date" -> c("date", "Date") +#' "target_date" -> c("target_date", "Target_Date") +#' @keywords internal +upcase_snake_case <- function(x) { + X <- strsplit(x, "_") %>% + map(function(name) paste0(toupper(substr(name, 1, 1)), substr(name, 2, nchar(name)), collapse = "_")) %>% + unlist() + c(x, X) +} + +#' given an arbitrary +#' @keywords internal +guess_time_column_name <- function(x, substitutions = NULL) { + if (!("time_value" %in% names(x))) { + if (is.null(substitutions)) { + substitutions <- c( + time_value = "date", + time_value = "time", + time_value = "datetime", + time_value = "dateTime", + tmie_value = "date_time", + time_value = "forecast_date", + time_value = "target_date", + time_value = "week", + time_value = "day", + time_value = "epiweek", + time_value = "month", + time_value = "year", + time_value = "yearmon", + time_value = "yearMon", + time_value = "dates", + time_value = "time_values", + time_value = "forecast_dates", + time_value = "target_dates" + ) + substitutions <- upcase_snake_case(substitutions) + } + strsplit(name_substitutions, "_") %>% + map(function(name) paste0(toupper(substr(name, 1, 1)), substr(name, 2, nchar(name)), collapse = "_")) %>% + unlist() + x <- tryCatch(x %>% rename(any_of(name_substitutions)), + error = function(cond) { + cli_abort("There are multiple `time_value` candidate columns. +Either `rename` some yourself or drop some.") + } + ) + cli_inform("inferring `time_value` column.") + } + return(x) +} + + +#' @keywords internal +guess_geo_column_name <- function(x, substitutions = NULL) { + if (!("time_value" %in% names(x))) { + substitutions <- substitutions %||% c( + geo_value = "geo_values", + geo_value = "geo_id", + geo_value = "geos", + geo_value = "location", + geo_value = "jurisdiction", + geo_value = "fips", + geo_value = "zip", + geo_value = "county", + geo_value = "hrr", + geo_value = "msa", + geo_value = "state", + geo_value = "province", + geo_value = "nation", + geo_value = "states", + geo_value = "provinces", + geo_value = "counties" + ) + substitutions <- upcase_snake_case(substitutions) + x <- tryCatch(x %>% rename(any_of(substitutions)), + error = function(cond) { + cli_abort("There are multiple `geo_value` candidate columns. +Either `rename` some yourself or drop some.") + } + ) + cli_inform("inferring `time_value` column.") + } + return(x) +} + +guess_version_column_name <- function(x, substitutions = NULL) { + if (!("version" %in% names(x))) { + if (is.null(substitutions)) { + substitutions <- c( + version = "issue", + version = "release" + ) + substitutions <- upcase_snake_case(substitutions) + } + x <- tryCatch(x %>% rename(any_of(substitutions)), + error = function(cond) { + cli_abort("There are multiple `geo_value` candidate columns. +Either `rename` some yourself or drop some.") + } + ) + cli_inform("inferring `time_value` column.") + } + return(x) +} + ########## diff --git a/man/as_epi_df.Rd b/man/as_epi_df.Rd index 82bd8882..22faa265 100644 --- a/man/as_epi_df.Rd +++ b/man/as_epi_df.Rd @@ -17,8 +17,8 @@ as_epi_df(x, ...) geo_type, time_type, as_of, + other_keys = character(0), additional_metadata = list(), - substitutions = NULL, ... ) diff --git a/man/guess_time_column_name.Rd b/man/guess_time_column_name.Rd new file mode 100644 index 00000000..45a173b6 --- /dev/null +++ b/man/guess_time_column_name.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{guess_time_column_name} +\alias{guess_time_column_name} +\title{given an arbitrary} +\usage{ +guess_time_column_name(x, substitutions = NULL) +} +\description{ +given an arbitrary +} +\keyword{internal} diff --git a/man/upcase_snake_case.Rd b/man/upcase_snake_case.Rd new file mode 100644 index 00000000..398f6a0b --- /dev/null +++ b/man/upcase_snake_case.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{upcase_snake_case} +\alias{upcase_snake_case} +\title{given a vector of characters, add the same values, but upcased, e.g. +"date" -> c("date", "Date") +"target_date" -> c("target_date", "Target_Date")} +\usage{ +upcase_snake_case(x) +} +\description{ +given a vector of characters, add the same values, but upcased, e.g. +"date" -> c("date", "Date") +"target_date" -> c("target_date", "Target_Date") +} +\keyword{internal} From 6b361da54d06a7c317581b6d56c8d061fbef4bae Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 7 Jun 2024 21:06:52 +0000 Subject: [PATCH 03/19] docs: document (GHA) --- man/as_epi_df.Rd | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/man/as_epi_df.Rd b/man/as_epi_df.Rd index 22faa265..40c0a1c5 100644 --- a/man/as_epi_df.Rd +++ b/man/as_epi_df.Rd @@ -12,15 +12,7 @@ as_epi_df(x, ...) \method{as_epi_df}{epi_df}(x, ...) -\method{as_epi_df}{tbl_df}( - x, - geo_type, - time_type, - as_of, - other_keys = character(0), - additional_metadata = list(), - ... -) +\method{as_epi_df}{tbl_df}(x, geo_type, time_type, as_of, additional_metadata = list(), ...) \method{as_epi_df}{data.frame}(x, geo_type, time_type, as_of, additional_metadata = list(), ...) From 3276e9e5d6eacc45773c41ad0db4aaa308ee312d Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 7 Jun 2024 17:31:58 -0500 Subject: [PATCH 04/19] errant renamed variables --- R/utils.R | 4 ++-- tests/testthat/test-epi_df.R | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/R/utils.R b/R/utils.R index 39f97a36..e52c0da9 100644 --- a/R/utils.R +++ b/R/utils.R @@ -486,10 +486,10 @@ guess_time_column_name <- function(x, substitutions = NULL) { ) substitutions <- upcase_snake_case(substitutions) } - strsplit(name_substitutions, "_") %>% + strsplit(substitutions, "_") %>% map(function(name) paste0(toupper(substr(name, 1, 1)), substr(name, 2, nchar(name)), collapse = "_")) %>% unlist() - x <- tryCatch(x %>% rename(any_of(name_substitutions)), + x <- tryCatch(x %>% rename(any_of(substitutions)), error = function(cond) { cli_abort("There are multiple `time_value` candidate columns. Either `rename` some yourself or drop some.") diff --git a/tests/testthat/test-epi_df.R b/tests/testthat/test-epi_df.R index 8ffc9820..c6a304bd 100644 --- a/tests/testthat/test-epi_df.R +++ b/tests/testthat/test-epi_df.R @@ -56,7 +56,6 @@ test_that("as_epi_df works for nonstandard input", { tib <- tib %>% rename(forecast_date = date) expect_message(expect_no_error(tib_epi_df <- tib %>% as_epi_df())) - tib %>% rename(any_of(name_substitutions)) tib <- tib %>% mutate(target_date = 20 + forecast_date) expect_error(tib_epi_df <- tib %>% as_epi_df()) From 00ce1da71e00db14efd78345b436c1b12d9aa69d Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Mon, 24 Jun 2024 19:05:44 -0500 Subject: [PATCH 05/19] More tests, `...` tidyselect, doc as_epi_df, more values --- R/archive.R | 3 ++- R/epi_df.R | 19 ++++++++++++------- R/utils.R | 20 ++++++++++++++------ man/as_epi_df.Rd | 16 ++++++++++------ man/guess_time_column_name.Rd | 4 ++-- tests/testthat/test-epi_df.R | 9 +++++++++ 6 files changed, 49 insertions(+), 22 deletions(-) diff --git a/R/archive.R b/R/archive.R index 8488cbfa..83806eef 100644 --- a/R/archive.R +++ b/R/archive.R @@ -454,8 +454,9 @@ as_epi_archive <- function( additional_metadata = NULL, compactify = NULL, clobberable_versions_start = NULL, - versions_end = NULL) { + versions_end = NULL, ...) { assert_data_frame(x) + x <- rename(x, ...) x <- guess_time_column_name(x) x <- guess_geo_column_name(x) x <- guess_version_column_name(x) diff --git a/R/epi_df.R b/R/epi_df.R index 712c4b0e..c7554b33 100644 --- a/R/epi_df.R +++ b/R/epi_df.R @@ -249,12 +249,16 @@ as_epi_df.epi_df <- function(x, ...) { #' @method as_epi_df tbl_df #' @describeIn as_epi_df The input tibble `x` must contain the columns -#' `geo_value` and `time_value`. All other columns will be preserved as is, -#' and treated as measured variables. If `as_of` is missing, then the function -#' will try to guess it from an `as_of`, `issue`, or `version` column of `x` -#' (if any of these are present), or from as an `as_of` field in its metadata -#' (stored in its attributes); if this fails, then the current day-time will -#' be used. +#' `geo_value` and `time_value`, or column names that uniquely map onto these +#' (e.g. `date` or `province`). Alternatively, you can specify the conversion +#' explicitly (`time_value = someWeirdColumnName`). All other columns not +#' specified as `other_keys` will be preserved as is, and treated as measured +#' variables. +#' +#' If `as_of` is missing, then the function will try to guess it from an +#' `as_of`, `issue`, or `version` column of `x` (if any of these are present), +#' or from as an `as_of` field in its metadata (stored in its attributes); if +#' this fails, then the current day-time will be used. #' @importFrom rlang .data #' @importFrom tidyselect any_of #' @importFrom cli cli_inform @@ -263,11 +267,12 @@ as_epi_df.tbl_df <- function(x, geo_type, time_type, as_of, additional_metadata = list(), ...) { # possible standard substitutions for time_value + x <- rename(x, ...) x <- guess_time_column_name(x) x <- guess_geo_column_name(x) if (!test_subset(c("geo_value", "time_value"), names(x))) { cli_abort( - "Columns `geo_value` and `time_value` must be present in `x`." + "Either columns `geo_value` and `time_value` must be present in `x`, or related columns (see the internal functions `guess_time_column_name()` and/or `guess_geo_column_name()` for a complete list)." ) } diff --git a/R/utils.R b/R/utils.R index e52c0da9..bce158dd 100644 --- a/R/utils.R +++ b/R/utils.R @@ -459,7 +459,7 @@ upcase_snake_case <- function(x) { c(x, X) } -#' given an arbitrary +#' rename potential time_value columns #' @keywords internal guess_time_column_name <- function(x, substitutions = NULL) { if (!("time_value" %in% names(x))) { @@ -473,12 +473,14 @@ guess_time_column_name <- function(x, substitutions = NULL) { time_value = "forecast_date", time_value = "target_date", time_value = "week", - time_value = "day", time_value = "epiweek", time_value = "month", + time_value = "mon", time_value = "year", time_value = "yearmon", + time_value = "yearmonth", time_value = "yearMon", + time_value = "yearMonth", time_value = "dates", time_value = "time_values", time_value = "forecast_dates", @@ -495,7 +497,9 @@ guess_time_column_name <- function(x, substitutions = NULL) { Either `rename` some yourself or drop some.") } ) - cli_inform("inferring `time_value` column.") + if (any(substitutions != "")) { + cli_inform("inferring `time_value` column.") + } } return(x) } @@ -529,7 +533,9 @@ guess_geo_column_name <- function(x, substitutions = NULL) { Either `rename` some yourself or drop some.") } ) - cli_inform("inferring `time_value` column.") + if (any(substitutions != "")) { + cli_inform("inferring `geo_value` column.") + } } return(x) } @@ -545,11 +551,13 @@ guess_version_column_name <- function(x, substitutions = NULL) { } x <- tryCatch(x %>% rename(any_of(substitutions)), error = function(cond) { - cli_abort("There are multiple `geo_value` candidate columns. + cli_abort("There are multiple `version` candidate columns. Either `rename` some yourself or drop some.") } ) - cli_inform("inferring `time_value` column.") + if (any(substitutions != "")) { + cli_inform("inferring `version` column.") + } } return(x) } diff --git a/man/as_epi_df.Rd b/man/as_epi_df.Rd index 40c0a1c5..98cdbb83 100644 --- a/man/as_epi_df.Rd +++ b/man/as_epi_df.Rd @@ -55,12 +55,16 @@ examples. \item \code{as_epi_df(epi_df)}: Simply returns the \code{epi_df} object unchanged. \item \code{as_epi_df(tbl_df)}: The input tibble \code{x} must contain the columns -\code{geo_value} and \code{time_value}. All other columns will be preserved as is, -and treated as measured variables. If \code{as_of} is missing, then the function -will try to guess it from an \code{as_of}, \code{issue}, or \code{version} column of \code{x} -(if any of these are present), or from as an \code{as_of} field in its metadata -(stored in its attributes); if this fails, then the current day-time will -be used. +\code{geo_value} and \code{time_value}, or column names that uniquely map onto these +(e.g. \code{date} or \code{province}). Alternatively, you can specify the conversion +explicitly (\code{time_value = someWeirdColumnName}). All other columns not +specified as \code{other_keys} will be preserved as is, and treated as measured +variables. + +If \code{as_of} is missing, then the function will try to guess it from an +\code{as_of}, \code{issue}, or \code{version} column of \code{x} (if any of these are present), +or from as an \code{as_of} field in its metadata (stored in its attributes); if +this fails, then the current day-time will be used. \item \code{as_epi_df(data.frame)}: Works analogously to \code{as_epi_df.tbl_df()}. diff --git a/man/guess_time_column_name.Rd b/man/guess_time_column_name.Rd index 45a173b6..f09a0e6e 100644 --- a/man/guess_time_column_name.Rd +++ b/man/guess_time_column_name.Rd @@ -2,11 +2,11 @@ % Please edit documentation in R/utils.R \name{guess_time_column_name} \alias{guess_time_column_name} -\title{given an arbitrary} +\title{rename potential time_value columns} \usage{ guess_time_column_name(x, substitutions = NULL) } \description{ -given an arbitrary +rename potential time_value columns } \keyword{internal} diff --git a/tests/testthat/test-epi_df.R b/tests/testthat/test-epi_df.R index c6a304bd..e7f092fd 100644 --- a/tests/testthat/test-epi_df.R +++ b/tests/testthat/test-epi_df.R @@ -53,6 +53,15 @@ test_that("as_epi_df works for nonstandard input", { geo_value = rep(c("ca", "hi"), each = 5) ) expect_message(expect_no_error(tib_epi_df <- tib %>% as_epi_df())) + expect_no_error(tib_epi_df <- tib %>% as_epi_df(time_value = date, geo_value = geo_value)) + expect_error(expect_message( + tib %>% rename(awefa = geo_value) %>% as_epi_df(), + regexp = "inferring `time_value` column." + )) + expect_no_error(expect_message( + tib %>% rename(awefa = geo_value) %>% as_epi_df(geo_value = awefa), + regexp = "inferring `time_value` column." + )) tib <- tib %>% rename(forecast_date = date) expect_message(expect_no_error(tib_epi_df <- tib %>% as_epi_df())) From 287edd7608632bbbf0d3a8cb3fcedd6e6f9d46bc Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Tue, 25 Jun 2024 00:08:36 +0000 Subject: [PATCH 06/19] docs: document (GHA) --- man/epi_archive.Rd | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd index 97ff6af0..d95e6eb5 100644 --- a/man/epi_archive.Rd +++ b/man/epi_archive.Rd @@ -37,7 +37,8 @@ as_epi_archive( additional_metadata = NULL, compactify = NULL, clobberable_versions_start = NULL, - versions_end = NULL + versions_end = NULL, + ... ) } \arguments{ From 75c62c2bacc9bea2b35b67498de85acbe04421d8 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Tue, 25 Jun 2024 10:47:03 -0500 Subject: [PATCH 07/19] remove `forecast_date` as a default --- R/utils.R | 2 -- 1 file changed, 2 deletions(-) diff --git a/R/utils.R b/R/utils.R index bce158dd..d008f1b3 100644 --- a/R/utils.R +++ b/R/utils.R @@ -470,7 +470,6 @@ guess_time_column_name <- function(x, substitutions = NULL) { time_value = "datetime", time_value = "dateTime", tmie_value = "date_time", - time_value = "forecast_date", time_value = "target_date", time_value = "week", time_value = "epiweek", @@ -483,7 +482,6 @@ guess_time_column_name <- function(x, substitutions = NULL) { time_value = "yearMonth", time_value = "dates", time_value = "time_values", - time_value = "forecast_dates", time_value = "target_dates" ) substitutions <- upcase_snake_case(substitutions) From 952e0e24b6b769edaf042bb7116fe464d2b9645d Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Tue, 25 Jun 2024 10:55:02 -0500 Subject: [PATCH 08/19] happier linter --- R/epi_df.R | 3 ++- R/utils.R | 6 +++--- man/upcase_snake_case.Rd | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/R/epi_df.R b/R/epi_df.R index c7554b33..5c84e90f 100644 --- a/R/epi_df.R +++ b/R/epi_df.R @@ -272,7 +272,8 @@ as_epi_df.tbl_df <- function(x, geo_type, time_type, as_of, x <- guess_geo_column_name(x) if (!test_subset(c("geo_value", "time_value"), names(x))) { cli_abort( - "Either columns `geo_value` and `time_value` must be present in `x`, or related columns (see the internal functions `guess_time_column_name()` and/or `guess_geo_column_name()` for a complete list)." + "Either columns `geo_value` and `time_value` must be present in `x`, or related columns (see the internal + functions `guess_time_column_name()` and/or `guess_geo_column_name()` for a complete list)." ) } diff --git a/R/utils.R b/R/utils.R index d008f1b3..c00205a3 100644 --- a/R/utils.R +++ b/R/utils.R @@ -452,11 +452,11 @@ guess_time_type <- function(time_value) { #' "date" -> c("date", "Date") #' "target_date" -> c("target_date", "Target_Date") #' @keywords internal -upcase_snake_case <- function(x) { - X <- strsplit(x, "_") %>% +upcase_snake_case <- function(vec) { + VEC <- strsplit(vec, "_") %>% map(function(name) paste0(toupper(substr(name, 1, 1)), substr(name, 2, nchar(name)), collapse = "_")) %>% unlist() - c(x, X) + c(vec, VEC) } #' rename potential time_value columns diff --git a/man/upcase_snake_case.Rd b/man/upcase_snake_case.Rd index 398f6a0b..31ecb768 100644 --- a/man/upcase_snake_case.Rd +++ b/man/upcase_snake_case.Rd @@ -6,7 +6,7 @@ "date" -> c("date", "Date") "target_date" -> c("target_date", "Target_Date")} \usage{ -upcase_snake_case(x) +upcase_snake_case(vec) } \description{ given a vector of characters, add the same values, but upcased, e.g. From e3feab6255bdf9eee54256823e524409c25537a6 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Tue, 25 Jun 2024 15:34:40 -0500 Subject: [PATCH 09/19] wrong test, better too many columns error --- R/archive.R | 1 + R/utils.R | 10 +++++----- man/epi_archive.Rd | 2 ++ tests/testthat/test-epi_df.R | 4 ++-- 4 files changed, 10 insertions(+), 7 deletions(-) diff --git a/R/archive.R b/R/archive.R index 83806eef..b9c29b63 100644 --- a/R/archive.R +++ b/R/archive.R @@ -442,6 +442,7 @@ validate_epi_archive <- function( #' `as_epi_archive` converts a data frame, data table, or tibble into an #' `epi_archive` object. +#' @param ... used for specifying column names, such as `version = release_date` #' #' @rdname epi_archive #' diff --git a/R/utils.R b/R/utils.R index c00205a3..d2fa3ab4 100644 --- a/R/utils.R +++ b/R/utils.R @@ -453,10 +453,10 @@ guess_time_type <- function(time_value) { #' "target_date" -> c("target_date", "Target_Date") #' @keywords internal upcase_snake_case <- function(vec) { - VEC <- strsplit(vec, "_") %>% + upper_vec <- strsplit(vec, "_") %>% map(function(name) paste0(toupper(substr(name, 1, 1)), substr(name, 2, nchar(name)), collapse = "_")) %>% unlist() - c(vec, VEC) + c(vec, upper_vec) } #' rename potential time_value columns @@ -491,7 +491,7 @@ guess_time_column_name <- function(x, substitutions = NULL) { unlist() x <- tryCatch(x %>% rename(any_of(substitutions)), error = function(cond) { - cli_abort("There are multiple `time_value` candidate columns. + cli_abort("{names(x)[names(x) %in% substitutions]} are both/all valid substitutions. Either `rename` some yourself or drop some.") } ) @@ -527,7 +527,7 @@ guess_geo_column_name <- function(x, substitutions = NULL) { substitutions <- upcase_snake_case(substitutions) x <- tryCatch(x %>% rename(any_of(substitutions)), error = function(cond) { - cli_abort("There are multiple `geo_value` candidate columns. + cli_abort("{names(x)[names(x) %in% substitutions]} are both/all valid substitutions. Either `rename` some yourself or drop some.") } ) @@ -549,7 +549,7 @@ guess_version_column_name <- function(x, substitutions = NULL) { } x <- tryCatch(x %>% rename(any_of(substitutions)), error = function(cond) { - cli_abort("There are multiple `version` candidate columns. + cli_abort("{names(x)[names(x) %in% substitutions]} are both/all valid substitutions. Either `rename` some yourself or drop some.") } ) diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd index d95e6eb5..943d8e11 100644 --- a/man/epi_archive.Rd +++ b/man/epi_archive.Rd @@ -90,6 +90,8 @@ beyond \code{max(x$version)}, but they all contained empty updates. (The default value of \code{clobberable_versions_start} does not fully trust these empty updates, and assumes that any version \verb{>= max(x$version)} could be clobbered.) If \code{nrow(x) == 0}, then this argument is mandatory.} + +\item{...}{used for specifying column names, such as \code{version = release_date}} } \value{ An \code{epi_archive} object. diff --git a/tests/testthat/test-epi_df.R b/tests/testthat/test-epi_df.R index e7f092fd..1ccb0bec 100644 --- a/tests/testthat/test-epi_df.R +++ b/tests/testthat/test-epi_df.R @@ -63,10 +63,10 @@ test_that("as_epi_df works for nonstandard input", { regexp = "inferring `time_value` column." )) - tib <- tib %>% rename(forecast_date = date) + tib <- tib %>% rename(target_date = date) expect_message(expect_no_error(tib_epi_df <- tib %>% as_epi_df())) - tib <- tib %>% mutate(target_date = 20 + forecast_date) + tib <- tib %>% mutate(Time = 20 + target_date) expect_error(tib_epi_df <- tib %>% as_epi_df()) }) From ad70c69ad97d3924bac72f12f666c0492b287786 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Mon, 8 Jul 2024 12:36:14 -0500 Subject: [PATCH 10/19] minor refactor to make col name subs accessible --- NAMESPACE | 3 + R/archive.R | 6 +- R/epi_df.R | 4 +- R/utils.R | 139 ++++++++++++++-------------------- man/geo_column_names.Rd | 12 +++ man/guess_column_name.Rd | 17 +++++ man/guess_time_column_name.Rd | 12 --- man/time_column_names.Rd | 12 +++ man/version_column_names.Rd | 12 +++ tests/testthat/test-archive.R | 8 +- tests/testthat/test-epi_df.R | 4 +- 11 files changed, 122 insertions(+), 107 deletions(-) create mode 100644 man/geo_column_names.Rd create mode 100644 man/guess_column_name.Rd delete mode 100644 man/guess_time_column_name.Rd create mode 100644 man/time_column_names.Rd create mode 100644 man/version_column_names.Rd diff --git a/NAMESPACE b/NAMESPACE index f0b01e82..dcbc151e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -61,6 +61,7 @@ export(epix_merge) export(epix_slide) export(epix_truncate_versions_after) export(filter) +export(geo_column_names) export(group_by) export(group_modify) export(growth_rate) @@ -75,9 +76,11 @@ export(next_after) export(relocate) export(rename) export(slice) +export(time_column_names) export(ungroup) export(unnest) export(validate_epi_archive) +export(version_column_names) importFrom(checkmate,anyInfinite) importFrom(checkmate,anyMissing) importFrom(checkmate,assert) diff --git a/R/archive.R b/R/archive.R index b9c29b63..2adccb5f 100644 --- a/R/archive.R +++ b/R/archive.R @@ -458,9 +458,9 @@ as_epi_archive <- function( versions_end = NULL, ...) { assert_data_frame(x) x <- rename(x, ...) - x <- guess_time_column_name(x) - x <- guess_geo_column_name(x) - x <- guess_version_column_name(x) + x <- guess_column_name(x, "time_value", time_column_names()) + x <- guess_column_name(x, "geo_value", geo_column_names()) + x <- guess_column_name(x, "version", version_column_names()) if (!test_subset(c("geo_value", "time_value", "version"), names(x))) { cli_abort( "Columns `geo_value`, `time_value`, and `version` must be present in `x`." diff --git a/R/epi_df.R b/R/epi_df.R index 5c84e90f..2cb68138 100644 --- a/R/epi_df.R +++ b/R/epi_df.R @@ -268,8 +268,8 @@ as_epi_df.tbl_df <- function(x, geo_type, time_type, as_of, ...) { # possible standard substitutions for time_value x <- rename(x, ...) - x <- guess_time_column_name(x) - x <- guess_geo_column_name(x) + x <- guess_column_name(x, "time_value", time_column_names()) + x <- guess_column_name(x, "geo_value", geo_column_names()) if (!test_subset(c("geo_value", "time_value"), names(x))) { cli_abort( "Either columns `geo_value` and `time_value` must be present in `x`, or related columns (see the internal diff --git a/R/utils.R b/R/utils.R index d2fa3ab4..e65c5fea 100644 --- a/R/utils.R +++ b/R/utils.R @@ -459,102 +459,73 @@ upcase_snake_case <- function(vec) { c(vec, upper_vec) } -#' rename potential time_value columns -#' @keywords internal -guess_time_column_name <- function(x, substitutions = NULL) { - if (!("time_value" %in% names(x))) { - if (is.null(substitutions)) { - substitutions <- c( - time_value = "date", - time_value = "time", - time_value = "datetime", - time_value = "dateTime", - tmie_value = "date_time", - time_value = "target_date", - time_value = "week", - time_value = "epiweek", - time_value = "month", - time_value = "mon", - time_value = "year", - time_value = "yearmon", - time_value = "yearmonth", - time_value = "yearMon", - time_value = "yearMonth", - time_value = "dates", - time_value = "time_values", - time_value = "target_dates" - ) - substitutions <- upcase_snake_case(substitutions) - } - strsplit(substitutions, "_") %>% - map(function(name) paste0(toupper(substr(name, 1, 1)), substr(name, 2, nchar(name)), collapse = "_")) %>% - unlist() - x <- tryCatch(x %>% rename(any_of(substitutions)), - error = function(cond) { - cli_abort("{names(x)[names(x) %in% substitutions]} are both/all valid substitutions. -Either `rename` some yourself or drop some.") - } - ) - if (any(substitutions != "")) { - cli_inform("inferring `time_value` column.") - } - } - return(x) +#' potential time_value columns +#' @description +#' the full list of potential substitutions for the `time_value` column name: +#' `r time_column_names()` +#' @export +time_column_names <- function() { + substitutions <- c( + "time_value", "date", "time", "datetime", "dateTime", "date_time", "target_date", + "week", "epiweek", "month", "mon", "year", "yearmon", "yearmonth", + "yearMon", "yearMonth", "dates", "time_values", "target_dates", "time_Value" + ) + substitutions <- upcase_snake_case(substitutions) + names(substitutions) <- rep("time_value", length(substitutions)) + return(substitutions) +} +# +#' potential geo_value columns +#' @description +#' the full list of potential substitutions for the `geo_value` column name: +#' `r geo_column_names()` +#' @export +geo_column_names <- function() { + substitutions <- c( + "geo_value", "geo_values", "geo_id", "geos", "location", "jurisdiction", "fips", "zip", + "county", "hrr", "msa", "state", "province", "nation", "states", + "provinces", "counties", "geo_Value" + ) + substitutions <- upcase_snake_case(substitutions) + names(substitutions) <- rep("geo_value", length(substitutions)) + return(substitutions) } +#' potential version columns +#' @description +#' the full list of potential substitutions for the `version` column name: +#' `r version_column_names()` +#' @export +version_column_names <- function() { + substitutions <- c( + "version", "issue", "release" + ) + substitutions <- upcase_snake_case(substitutions) + names(substitutions) <- rep("version", length(substitutions)) + return(substitutions) +} +#' rename potential time_value columns +#' +#' @description +#' potentially renames +#' @param x the tibble to potentially rename +#' @param substitions a named vector. the potential substitions, with every name `time_value` #' @keywords internal -guess_geo_column_name <- function(x, substitutions = NULL) { - if (!("time_value" %in% names(x))) { - substitutions <- substitutions %||% c( - geo_value = "geo_values", - geo_value = "geo_id", - geo_value = "geos", - geo_value = "location", - geo_value = "jurisdiction", - geo_value = "fips", - geo_value = "zip", - geo_value = "county", - geo_value = "hrr", - geo_value = "msa", - geo_value = "state", - geo_value = "province", - geo_value = "nation", - geo_value = "states", - geo_value = "provinces", - geo_value = "counties" - ) - substitutions <- upcase_snake_case(substitutions) +guess_column_name <- function(x, column_name, substitutions) { + if (!(column_name %in% names(x))) { x <- tryCatch(x %>% rename(any_of(substitutions)), error = function(cond) { cli_abort("{names(x)[names(x) %in% substitutions]} are both/all valid substitutions. Either `rename` some yourself or drop some.") } ) - if (any(substitutions != "")) { - cli_inform("inferring `geo_value` column.") + # if none of the names are in substitutions, and `column_name` isn't a column, we're missing a relevant column + if (!any(names(x) %in% substitutions)) { + cli_abort("There is no {column_name} column or similar name. See e.g. [`time_column_name()`] for a complete list") } - } - return(x) -} - -guess_version_column_name <- function(x, substitutions = NULL) { - if (!("version" %in% names(x))) { - if (is.null(substitutions)) { - substitutions <- c( - version = "issue", - version = "release" - ) - substitutions <- upcase_snake_case(substitutions) - } - x <- tryCatch(x %>% rename(any_of(substitutions)), - error = function(cond) { - cli_abort("{names(x)[names(x) %in% substitutions]} are both/all valid substitutions. -Either `rename` some yourself or drop some.") - } - ) if (any(substitutions != "")) { - cli_inform("inferring `version` column.") + cli_inform("inferring {column_name} column.") } } return(x) diff --git a/man/geo_column_names.Rd b/man/geo_column_names.Rd new file mode 100644 index 00000000..4b3810dc --- /dev/null +++ b/man/geo_column_names.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{geo_column_names} +\alias{geo_column_names} +\title{potential geo_value columns} +\usage{ +geo_column_names() +} +\description{ +the full list of potential substitutions for the \code{geo_value} column name: +geo_value, geo_values, geo_id, geos, location, jurisdiction, fips, zip, county, hrr, msa, state, province, nation, states, provinces, counties, geo_Value, Geo_Value, Geo_Values, Geo_Id, Geos, Location, Jurisdiction, Fips, Zip, County, Hrr, Msa, State, Province, Nation, States, Provinces, Counties, Geo_Value +} diff --git a/man/guess_column_name.Rd b/man/guess_column_name.Rd new file mode 100644 index 00000000..f03a7b80 --- /dev/null +++ b/man/guess_column_name.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{guess_column_name} +\alias{guess_column_name} +\title{rename potential time_value columns} +\usage{ +guess_column_name(x, column_name, substitutions) +} +\arguments{ +\item{x}{the tibble to potentially rename} + +\item{substitions}{a named vector. the potential substitions, with every name \code{time_value}} +} +\description{ +potentially renames +} +\keyword{internal} diff --git a/man/guess_time_column_name.Rd b/man/guess_time_column_name.Rd deleted file mode 100644 index f09a0e6e..00000000 --- a/man/guess_time_column_name.Rd +++ /dev/null @@ -1,12 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils.R -\name{guess_time_column_name} -\alias{guess_time_column_name} -\title{rename potential time_value columns} -\usage{ -guess_time_column_name(x, substitutions = NULL) -} -\description{ -rename potential time_value columns -} -\keyword{internal} diff --git a/man/time_column_names.Rd b/man/time_column_names.Rd new file mode 100644 index 00000000..2e2db6b5 --- /dev/null +++ b/man/time_column_names.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{time_column_names} +\alias{time_column_names} +\title{potential time_value columns} +\usage{ +time_column_names() +} +\description{ +the full list of potential substitutions for the \code{time_value} column name: +time_value, date, time, datetime, dateTime, date_time, target_date, week, epiweek, month, mon, year, yearmon, yearmonth, yearMon, yearMonth, dates, time_values, target_dates, time_Value, Time_Value, Date, Time, Datetime, DateTime, Date_Time, Target_Date, Week, Epiweek, Month, Mon, Year, Yearmon, Yearmonth, YearMon, YearMonth, Dates, Time_Values, Target_Dates, Time_Value +} diff --git a/man/version_column_names.Rd b/man/version_column_names.Rd new file mode 100644 index 00000000..75ee2315 --- /dev/null +++ b/man/version_column_names.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{version_column_names} +\alias{version_column_names} +\title{potential version columns} +\usage{ +version_column_names() +} +\description{ +the full list of potential substitutions for the \code{version} column name: +version, issue, release, Version, Issue, Release +} diff --git a/tests/testthat/test-archive.R b/tests/testthat/test-archive.R index 1291e3c7..218baf46 100644 --- a/tests/testthat/test-archive.R +++ b/tests/testthat/test-archive.R @@ -8,13 +8,13 @@ dt <- archive_cases_dv_subset$DT test_that("data.frame must contain geo_value, time_value and version columns", { expect_error(as_epi_archive(select(dt, -geo_value), compactify = FALSE), - regexp = "Columns `geo_value`, `time_value`, and `version` must be present in `x`." + regexp = "There is no geo_value column or similar name" ) - expect_error(as_epi_archive(select(dt, -time_value), compactify = FALSE), - regexp = "Columns `geo_value`, `time_value`, and `version` must be present in `x`." + expect_error(expect_message(as_epi_archive(select(dt, -time_value), compactify = FALSE)), + regexp = "There is no time_value column or similar name" ) expect_error(as_epi_archive(select(dt, -version), compactify = FALSE), - regexp = "Columns `geo_value`, `time_value`, and `version` must be present in `x`." + regexp = "There is no version column or similar name" ) }) diff --git a/tests/testthat/test-epi_df.R b/tests/testthat/test-epi_df.R index 1ccb0bec..25a2dc59 100644 --- a/tests/testthat/test-epi_df.R +++ b/tests/testthat/test-epi_df.R @@ -56,11 +56,11 @@ test_that("as_epi_df works for nonstandard input", { expect_no_error(tib_epi_df <- tib %>% as_epi_df(time_value = date, geo_value = geo_value)) expect_error(expect_message( tib %>% rename(awefa = geo_value) %>% as_epi_df(), - regexp = "inferring `time_value` column." + regexp = "inferring " )) expect_no_error(expect_message( tib %>% rename(awefa = geo_value) %>% as_epi_df(geo_value = awefa), - regexp = "inferring `time_value` column." + regexp = "inferring" )) tib <- tib %>% rename(target_date = date) From edbf1acdfb2c862efd2e0e4a043c3b062b9280a9 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Mon, 8 Jul 2024 12:37:24 -0500 Subject: [PATCH 11/19] Nat's suggestions --- R/archive.R | 6 ++++-- R/epi_df.R | 4 ++-- man/epi_archive.Rd | 3 ++- man/new_epi_df.Rd | 3 +-- 4 files changed, 9 insertions(+), 7 deletions(-) diff --git a/R/archive.R b/R/archive.R index 2adccb5f..1fd40d73 100644 --- a/R/archive.R +++ b/R/archive.R @@ -442,7 +442,8 @@ validate_epi_archive <- function( #' `as_epi_archive` converts a data frame, data table, or tibble into an #' `epi_archive` object. -#' @param ... used for specifying column names, such as `version = release_date` +#' @param ... used for specifying column names, as in [`dplyr::rename`]. For +#' example `version = release_date` #' #' @rdname epi_archive #' @@ -463,7 +464,8 @@ as_epi_archive <- function( x <- guess_column_name(x, "version", version_column_names()) if (!test_subset(c("geo_value", "time_value", "version"), names(x))) { cli_abort( - "Columns `geo_value`, `time_value`, and `version` must be present in `x`." + "Either columns `geo_value`, `time_value`, and `version` must be present in `x`, or related columns (see the internal + functions `guess_time_column_name()`, `guess_geo_column_name()` and/or `guess_geo_version_name()` for complete list)." ) } if (anyMissing(x$version)) { diff --git a/R/epi_df.R b/R/epi_df.R index 2cb68138..e3c77e96 100644 --- a/R/epi_df.R +++ b/R/epi_df.R @@ -95,7 +95,7 @@ NULL #' #' @export new_epi_df <- function(x = tibble::tibble(), geo_type, time_type, as_of, - additional_metadata = list(), ...) { + additional_metadata = list()) { assert_data_frame(x) assert_list(additional_metadata) @@ -279,7 +279,7 @@ as_epi_df.tbl_df <- function(x, geo_type, time_type, as_of, new_epi_df( x, geo_type, time_type, as_of, - additional_metadata, ... + additional_metadata ) } diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd index 943d8e11..19ab1d1e 100644 --- a/man/epi_archive.Rd +++ b/man/epi_archive.Rd @@ -91,7 +91,8 @@ value of \code{clobberable_versions_start} does not fully trust these empty updates, and assumes that any version \verb{>= max(x$version)} could be clobbered.) If \code{nrow(x) == 0}, then this argument is mandatory.} -\item{...}{used for specifying column names, such as \code{version = release_date}} +\item{...}{used for specifying column names, as in \code{\link[dplyr:rename]{dplyr::rename}}. For +example \code{version = release_date}} } \value{ An \code{epi_archive} object. diff --git a/man/new_epi_df.Rd b/man/new_epi_df.Rd index 7182c222..934a716e 100644 --- a/man/new_epi_df.Rd +++ b/man/new_epi_df.Rd @@ -9,8 +9,7 @@ new_epi_df( geo_type, time_type, as_of, - additional_metadata = list(), - ... + additional_metadata = list() ) } \arguments{ From d19d1a5e2538ffa77af429062399cf4e3b2c61ef Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Tue, 9 Jul 2024 11:39:55 -0500 Subject: [PATCH 12/19] avoid arg prefix-completion for versions_end --- R/archive.R | 3 ++- tests/testthat/test-archive.R | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/R/archive.R b/R/archive.R index 1fd40d73..6137cc72 100644 --- a/R/archive.R +++ b/R/archive.R @@ -456,7 +456,8 @@ as_epi_archive <- function( additional_metadata = NULL, compactify = NULL, clobberable_versions_start = NULL, - versions_end = NULL, ...) { + .versions_end = NULL, ..., + versions_end = .versions_end) { assert_data_frame(x) x <- rename(x, ...) x <- guess_column_name(x, "time_value", time_column_names()) diff --git a/tests/testthat/test-archive.R b/tests/testthat/test-archive.R index 218baf46..c606b664 100644 --- a/tests/testthat/test-archive.R +++ b/tests/testthat/test-archive.R @@ -18,6 +18,24 @@ test_that("data.frame must contain geo_value, time_value and version columns", { ) }) +test_that("as_epi_archive custom name mapping works correctly", { + # custom name works correctly + suppressWarnings(expect_equal( + as_epi_archive(rename(dt, weirdName = version), version = weirdName), + as_epi_archive(dt) + )) + suppressWarnings(expect_equal( + as_epi_archive(rename(dt, weirdName = geo_value), geo_value = weirdName), + as_epi_archive(dt) + )) + suppressWarnings(expect_equal( + as_epi_archive(rename(dt, weirdName = time_value), time_value = weirdName), + as_epi_archive(dt) + )) + + expect_error(as_epi_archive(rename(dt, weirdName = version), version = weirdName, version = time_value), "Names must be unique") +}) + test_that("other_keys can only contain names of the data.frame columns", { expect_error(as_epi_archive(dt, other_keys = "xyz", compactify = FALSE), regexp = "`other_keys` must be contained in the column names of `x`." From f6904f27beb35ff933423f280d2784fc5208491b Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Tue, 9 Jul 2024 16:49:18 +0000 Subject: [PATCH 13/19] style: styler (GHA) --- tests/testthat/test-archive.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/testthat/test-archive.R b/tests/testthat/test-archive.R index c606b664..1858da86 100644 --- a/tests/testthat/test-archive.R +++ b/tests/testthat/test-archive.R @@ -21,7 +21,7 @@ test_that("data.frame must contain geo_value, time_value and version columns", { test_that("as_epi_archive custom name mapping works correctly", { # custom name works correctly suppressWarnings(expect_equal( - as_epi_archive(rename(dt, weirdName = version), version = weirdName), + as_epi_archive(rename(dt, weirdName = version), version = weirdName), as_epi_archive(dt) )) suppressWarnings(expect_equal( @@ -29,7 +29,7 @@ test_that("as_epi_archive custom name mapping works correctly", { as_epi_archive(dt) )) suppressWarnings(expect_equal( - as_epi_archive(rename(dt, weirdName = time_value), time_value = weirdName), + as_epi_archive(rename(dt, weirdName = time_value), time_value = weirdName), as_epi_archive(dt) )) From 76388e2052def411b22b6d7ff89791dbc6de83fd Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Tue, 9 Jul 2024 16:49:21 +0000 Subject: [PATCH 14/19] docs: document (GHA) --- man/epi_archive.Rd | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd index 19ab1d1e..a813c6de 100644 --- a/man/epi_archive.Rd +++ b/man/epi_archive.Rd @@ -37,8 +37,9 @@ as_epi_archive( additional_metadata = NULL, compactify = NULL, clobberable_versions_start = NULL, - versions_end = NULL, - ... + .versions_end = NULL, + ..., + versions_end = .versions_end ) } \arguments{ From 500a952f4101c52bba297317698a58a60e805946 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 12 Jul 2024 11:39:05 -0500 Subject: [PATCH 15/19] formatting --- R/archive.R | 8 ++++++-- man/epi_archive.Rd | 4 ++++ tests/testthat/test-archive.R | 4 +++- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/R/archive.R b/R/archive.R index 6137cc72..91f5d4a7 100644 --- a/R/archive.R +++ b/R/archive.R @@ -444,6 +444,9 @@ validate_epi_archive <- function( #' `epi_archive` object. #' @param ... used for specifying column names, as in [`dplyr::rename`]. For #' example `version = release_date` +#' @param .versions_end location based versions_end, used to avoid prefix +#' `version = issue` from being assigned to `versions_end` instead of being +#' used to rename columns. #' #' @rdname epi_archive #' @@ -465,8 +468,9 @@ as_epi_archive <- function( x <- guess_column_name(x, "version", version_column_names()) if (!test_subset(c("geo_value", "time_value", "version"), names(x))) { cli_abort( - "Either columns `geo_value`, `time_value`, and `version` must be present in `x`, or related columns (see the internal - functions `guess_time_column_name()`, `guess_geo_column_name()` and/or `guess_geo_version_name()` for complete list)." + "Either columns `geo_value`, `time_value`, and `version` must be present in `x`, +or related columns (see the internal functions `guess_time_column_name()`, +`guess_geo_column_name()` and/or `guess_geo_version_name()` for complete list)." ) } if (anyMissing(x$version)) { diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd index a813c6de..99203052 100644 --- a/man/epi_archive.Rd +++ b/man/epi_archive.Rd @@ -92,6 +92,10 @@ value of \code{clobberable_versions_start} does not fully trust these empty updates, and assumes that any version \verb{>= max(x$version)} could be clobbered.) If \code{nrow(x) == 0}, then this argument is mandatory.} +\item{.versions_end}{location based versions_end, used to avoid prefix +\code{version = issue} from being assigned to \code{versions_end} instead of being +used to rename columns.} + \item{...}{used for specifying column names, as in \code{\link[dplyr:rename]{dplyr::rename}}. For example \code{version = release_date}} } diff --git a/tests/testthat/test-archive.R b/tests/testthat/test-archive.R index 1858da86..54501a31 100644 --- a/tests/testthat/test-archive.R +++ b/tests/testthat/test-archive.R @@ -33,7 +33,9 @@ test_that("as_epi_archive custom name mapping works correctly", { as_epi_archive(dt) )) - expect_error(as_epi_archive(rename(dt, weirdName = version), version = weirdName, version = time_value), "Names must be unique") + expect_error(as_epi_archive(rename(dt, weirdName = version), + version = weirdName, version = time_value + ), "Names must be unique") }) test_that("other_keys can only contain names of the data.frame columns", { From 1b84d010aad36f046d5828cf2044a8f368fbfd62 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 12 Jul 2024 15:33:57 -0500 Subject: [PATCH 16/19] template needed changing --- R/epi_df.R | 1 + R/utils.R | 2 +- man-roxygen/epi_df-params.R | 1 - man/guess_column_name.Rd | 2 +- man/new_epi_df.Rd | 2 -- tests/testthat/test-archive.R | 10 +++++++--- 6 files changed, 10 insertions(+), 8 deletions(-) diff --git a/R/epi_df.R b/R/epi_df.R index e3c77e96..d8b399fb 100644 --- a/R/epi_df.R +++ b/R/epi_df.R @@ -162,6 +162,7 @@ new_epi_df <- function(x = tibble::tibble(), geo_type, time_type, as_of, #' guide](https://cmu-delphi.github.io/epiprocess/articles/epiprocess.html) for #' examples. #' +#' @param ... Additional arguments passed to methods. #' @template epi_df-params #' #' @export diff --git a/R/utils.R b/R/utils.R index e65c5fea..24da3995 100644 --- a/R/utils.R +++ b/R/utils.R @@ -510,7 +510,7 @@ version_column_names <- function() { #' @description #' potentially renames #' @param x the tibble to potentially rename -#' @param substitions a named vector. the potential substitions, with every name `time_value` +#' @param substitutions a named vector. the potential substitions, with every name `time_value` #' @keywords internal guess_column_name <- function(x, column_name, substitutions) { if (!(column_name %in% names(x))) { diff --git a/man-roxygen/epi_df-params.R b/man-roxygen/epi_df-params.R index 54d8c2d2..59c51603 100644 --- a/man-roxygen/epi_df-params.R +++ b/man-roxygen/epi_df-params.R @@ -15,5 +15,4 @@ #' `as_of` fields; named entries from the passed list will be included as #' well. If your tibble has additional keys, be sure to specify them as a #' character vector in the `other_keys` component of `additional_metadata`. -#' @param ... Additional arguments passed to methods. #' @return An `epi_df` object. diff --git a/man/guess_column_name.Rd b/man/guess_column_name.Rd index f03a7b80..d4aa09b7 100644 --- a/man/guess_column_name.Rd +++ b/man/guess_column_name.Rd @@ -9,7 +9,7 @@ guess_column_name(x, column_name, substitutions) \arguments{ \item{x}{the tibble to potentially rename} -\item{substitions}{a named vector. the potential substitions, with every name \code{time_value}} +\item{substitutions}{a named vector. the potential substitions, with every name \code{time_value}} } \description{ potentially renames diff --git a/man/new_epi_df.Rd b/man/new_epi_df.Rd index 934a716e..8010b700 100644 --- a/man/new_epi_df.Rd +++ b/man/new_epi_df.Rd @@ -34,8 +34,6 @@ then the current day-time will be used.} \code{as_of} fields; named entries from the passed list will be included as well. If your tibble has additional keys, be sure to specify them as a character vector in the \code{other_keys} component of \code{additional_metadata}.} - -\item{...}{Additional arguments passed to methods.} } \value{ An \code{epi_df} object. diff --git a/tests/testthat/test-archive.R b/tests/testthat/test-archive.R index 54501a31..679e4dbd 100644 --- a/tests/testthat/test-archive.R +++ b/tests/testthat/test-archive.R @@ -33,9 +33,13 @@ test_that("as_epi_archive custom name mapping works correctly", { as_epi_archive(dt) )) - expect_error(as_epi_archive(rename(dt, weirdName = version), - version = weirdName, version = time_value - ), "Names must be unique") + expect_error( + as_epi_archive( + rename(dt, weirdName = version), + version = weirdName, + version = time_value + ), "Names must be unique" + ) }) test_that("other_keys can only contain names of the data.frame columns", { From c59af46b494d88c587a7c386d779ed714eb0824c Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 12 Jul 2024 16:48:34 -0500 Subject: [PATCH 17/19] pkgdown fix for new functions, apparently --- _pkgdown.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/_pkgdown.yml b/_pkgdown.yml index 4930c9f5..b95a6386 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -63,6 +63,7 @@ reference: desc: Details on `epi_df` format, and basic functionality. - contents: - matches("epi_df") + - matches("column_names") - title: "`epi_*()` functions" desc: Functions that act on `epi_df` objects. - contents: From 7d99e6df8f10cd592f4c72cd37cf404f18984eec Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 19 Jul 2024 15:21:27 -0500 Subject: [PATCH 18/19] recs from Nat, local checks passing --- R/archive.R | 7 ++++--- R/epi_df.R | 6 ++++-- R/utils.R | 36 +++++++++++++++++++++++++---------- tests/testthat/test-archive.R | 32 ++++++++++++++++++------------- tests/testthat/test-epi_df.R | 27 ++++++++++++++++++-------- 5 files changed, 72 insertions(+), 36 deletions(-) diff --git a/R/archive.R b/R/archive.R index 91f5d4a7..780279b0 100644 --- a/R/archive.R +++ b/R/archive.R @@ -468,9 +468,10 @@ as_epi_archive <- function( x <- guess_column_name(x, "version", version_column_names()) if (!test_subset(c("geo_value", "time_value", "version"), names(x))) { cli_abort( - "Either columns `geo_value`, `time_value`, and `version` must be present in `x`, -or related columns (see the internal functions `guess_time_column_name()`, -`guess_geo_column_name()` and/or `guess_geo_version_name()` for complete list)." + "Either columns `geo_value`, `time_value`, and `version`, or related columns + (see the internal functions `guess_time_column_name()`, + `guess_geo_column_name()` and/or `guess_geo_version_name()` for complete + list) must be present in `x`." ) } if (anyMissing(x$version)) { diff --git a/R/epi_df.R b/R/epi_df.R index d8b399fb..707944f6 100644 --- a/R/epi_df.R +++ b/R/epi_df.R @@ -273,8 +273,10 @@ as_epi_df.tbl_df <- function(x, geo_type, time_type, as_of, x <- guess_column_name(x, "geo_value", geo_column_names()) if (!test_subset(c("geo_value", "time_value"), names(x))) { cli_abort( - "Either columns `geo_value` and `time_value` must be present in `x`, or related columns (see the internal - functions `guess_time_column_name()` and/or `guess_geo_column_name()` for a complete list)." + "Either columns `geo_value` and `time_value` or related columns + (see the internal functions `guess_time_column_name()` and/or + `guess_geo_column_name()` for a complete list) + must be present in `x`." ) } diff --git a/R/utils.R b/R/utils.R index 24da3995..9396070d 100644 --- a/R/utils.R +++ b/R/utils.R @@ -512,21 +512,37 @@ version_column_names <- function() { #' @param x the tibble to potentially rename #' @param substitutions a named vector. the potential substitions, with every name `time_value` #' @keywords internal +#' @importFrom cli cli_inform cli_abort +#' @importFrom dplyr rename guess_column_name <- function(x, column_name, substitutions) { if (!(column_name %in% names(x))) { - x <- tryCatch(x %>% rename(any_of(substitutions)), - error = function(cond) { - cli_abort("{names(x)[names(x) %in% substitutions]} are both/all valid substitutions. -Either `rename` some yourself or drop some.") - } - ) # if none of the names are in substitutions, and `column_name` isn't a column, we're missing a relevant column if (!any(names(x) %in% substitutions)) { - cli_abort("There is no {column_name} column or similar name. See e.g. [`time_column_name()`] for a complete list") - } - if (any(substitutions != "")) { - cli_inform("inferring {column_name} column.") + cli_abort( + "There is no {column_name} column or similar name. + See e.g. [`time_column_name()`] for a complete list", + class = "epiprocess__guess_column__multiple_substitution_error" + ) } + + tryCatch( + { + x <- x %>% rename(any_of(substitutions)) + cli_inform( + "inferring {column_name} column.", + class = "epiprocess__guess_column_inferring_inform" + ) + return(x) + }, + error = function(cond) { + cli_abort( + "{intersect(names(x), substitutions)} + are both/all valid substitutions for {column_name}. + Either `rename` some yourself or drop some.", + class = "epiprocess__guess_column__multiple_substitution_error" + ) + } + ) } return(x) } diff --git a/tests/testthat/test-archive.R b/tests/testthat/test-archive.R index 679e4dbd..d437c983 100644 --- a/tests/testthat/test-archive.R +++ b/tests/testthat/test-archive.R @@ -10,7 +10,7 @@ test_that("data.frame must contain geo_value, time_value and version columns", { expect_error(as_epi_archive(select(dt, -geo_value), compactify = FALSE), regexp = "There is no geo_value column or similar name" ) - expect_error(expect_message(as_epi_archive(select(dt, -time_value), compactify = FALSE)), + expect_error(as_epi_archive(select(dt, -time_value), compactify = FALSE), regexp = "There is no time_value column or similar name" ) expect_error(as_epi_archive(select(dt, -version), compactify = FALSE), @@ -20,18 +20,24 @@ test_that("data.frame must contain geo_value, time_value and version columns", { test_that("as_epi_archive custom name mapping works correctly", { # custom name works correctly - suppressWarnings(expect_equal( - as_epi_archive(rename(dt, weirdName = version), version = weirdName), - as_epi_archive(dt) - )) - suppressWarnings(expect_equal( - as_epi_archive(rename(dt, weirdName = geo_value), geo_value = weirdName), - as_epi_archive(dt) - )) - suppressWarnings(expect_equal( - as_epi_archive(rename(dt, weirdName = time_value), time_value = weirdName), - as_epi_archive(dt) - )) + expect_equal( + as_epi_archive(rename(dt, weirdName = version), + version = weirdName, compactify = TRUE + ), + as_epi_archive(dt, compactify = TRUE) + ) + expect_equal( + as_epi_archive(rename(dt, weirdName = geo_value), + geo_value = weirdName, compactify = TRUE + ), + as_epi_archive(dt, compactify = TRUE) + ) + expect_equal( + as_epi_archive(rename(dt, weirdName = time_value), + time_value = weirdName, compactify = TRUE + ), + as_epi_archive(dt, compactify = TRUE) + ) expect_error( as_epi_archive( diff --git a/tests/testthat/test-epi_df.R b/tests/testthat/test-epi_df.R index 25a2dc59..1c5e527f 100644 --- a/tests/testthat/test-epi_df.R +++ b/tests/testthat/test-epi_df.R @@ -52,22 +52,33 @@ test_that("as_epi_df works for nonstandard input", { date = rep(seq(as.Date("2020-01-01"), by = 1, length.out = 5), times = 2), geo_value = rep(c("ca", "hi"), each = 5) ) - expect_message(expect_no_error(tib_epi_df <- tib %>% as_epi_df())) + expect_message(expect_no_error(tib_epi_df <- tib %>% as_epi_df()), + class = "epiprocess__guess_column_inferring_inform" + ) expect_no_error(tib_epi_df <- tib %>% as_epi_df(time_value = date, geo_value = geo_value)) - expect_error(expect_message( - tib %>% rename(awefa = geo_value) %>% as_epi_df(), - regexp = "inferring " - )) + expect_error( + expect_message( + tib %>% + rename(awefa = geo_value) %>% + as_epi_df(), + class = "epiprocess__guess_column_inferring_inform" + ), + class = "epiprocess__guess_column__multiple_substitution_error" + ) expect_no_error(expect_message( tib %>% rename(awefa = geo_value) %>% as_epi_df(geo_value = awefa), - regexp = "inferring" + class = "epiprocess__guess_column_inferring_inform" )) tib <- tib %>% rename(target_date = date) - expect_message(expect_no_error(tib_epi_df <- tib %>% as_epi_df())) + expect_message(expect_no_error(tib_epi_df <- tib %>% as_epi_df()), + class = "epiprocess__guess_column_inferring_inform" + ) tib <- tib %>% mutate(Time = 20 + target_date) - expect_error(tib_epi_df <- tib %>% as_epi_df()) + expect_error(tib_epi_df <- tib %>% as_epi_df(), + class = "epiprocess__guess_column__multiple_substitution_error" + ) }) # select fixes From 243c45e9c949653dc6b89608dd68c1722a5fb1a7 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 19 Jul 2024 15:43:37 -0500 Subject: [PATCH 19/19] desc, news --- DESCRIPTION | 2 +- NEWS.md | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index ac188dc9..21dda4f5 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: epiprocess Title: Tools for basic signal processing in epidemiology -Version: 0.7.12 +Version: 0.7.13 Authors@R: c( person("Jacob", "Bien", role = "ctb"), person("Logan", "Brooks", email = "lcbrooks@andrew.cmu.edu", role = c("aut", "cre")), diff --git a/NEWS.md b/NEWS.md index ed9dea40..7be31506 100644 --- a/NEWS.md +++ b/NEWS.md @@ -40,6 +40,11 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat - Improved documentation web site landing page's introduction. - Fixed documentation referring to old `epi_slide()` interface (#466, thanks @XuedaShen!). +- `as_epi_df` and `as_epi_archive` now support arguments to specify column names + e.g. `as_epi_df(some_tibble, geo_value=state)`. In addition, there is a list + of default conversions, see `time_column_names` for a list of columns that + will automatically be recognized and converted to `time_value` column (there + are similar functions for `geo` and `version`). ## Cleanup - Resolved some linting messages in package checks (#468).