diff --git a/DESCRIPTION b/DESCRIPTION index ac188dc9..21dda4f5 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: epiprocess Title: Tools for basic signal processing in epidemiology -Version: 0.7.12 +Version: 0.7.13 Authors@R: c( person("Jacob", "Bien", role = "ctb"), person("Logan", "Brooks", email = "lcbrooks@andrew.cmu.edu", role = c("aut", "cre")), diff --git a/NAMESPACE b/NAMESPACE index b4cafc83..dcbc151e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -61,6 +61,7 @@ export(epix_merge) export(epix_slide) export(epix_truncate_versions_after) export(filter) +export(geo_column_names) export(group_by) export(group_modify) export(growth_rate) @@ -75,9 +76,11 @@ export(next_after) export(relocate) export(rename) export(slice) +export(time_column_names) export(ungroup) export(unnest) export(validate_epi_archive) +export(version_column_names) importFrom(checkmate,anyInfinite) importFrom(checkmate,anyMissing) importFrom(checkmate,assert) @@ -100,6 +103,7 @@ importFrom(checkmate,test_subset) importFrom(checkmate,vname) importFrom(cli,cat_line) importFrom(cli,cli_abort) +importFrom(cli,cli_inform) importFrom(cli,cli_vec) importFrom(cli,cli_warn) importFrom(cli,format_message) @@ -186,6 +190,7 @@ importFrom(tibble,as_tibble) importFrom(tibble,new_tibble) importFrom(tibble,validate_tibble) importFrom(tidyr,unnest) +importFrom(tidyselect,any_of) importFrom(tidyselect,eval_select) importFrom(tidyselect,starts_with) importFrom(tsibble,as_tsibble) diff --git a/NEWS.md b/NEWS.md index ed9dea40..7be31506 100644 --- a/NEWS.md +++ b/NEWS.md @@ -40,6 +40,11 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat - Improved documentation web site landing page's introduction. - Fixed documentation referring to old `epi_slide()` interface (#466, thanks @XuedaShen!). +- `as_epi_df` and `as_epi_archive` now support arguments to specify column names + e.g. `as_epi_df(some_tibble, geo_value=state)`. In addition, there is a list + of default conversions, see `time_column_names` for a list of columns that + will automatically be recognized and converted to `time_value` column (there + are similar functions for `geo` and `version`). ## Cleanup - Resolved some linting messages in package checks (#468). diff --git a/R/archive.R b/R/archive.R index 464d68ef..780279b0 100644 --- a/R/archive.R +++ b/R/archive.R @@ -442,6 +442,11 @@ validate_epi_archive <- function( #' `as_epi_archive` converts a data frame, data table, or tibble into an #' `epi_archive` object. +#' @param ... used for specifying column names, as in [`dplyr::rename`]. For +#' example `version = release_date` +#' @param .versions_end location based versions_end, used to avoid prefix +#' `version = issue` from being assigned to `versions_end` instead of being +#' used to rename columns. #' #' @rdname epi_archive #' @@ -454,11 +459,19 @@ as_epi_archive <- function( additional_metadata = NULL, compactify = NULL, clobberable_versions_start = NULL, - versions_end = NULL) { + .versions_end = NULL, ..., + versions_end = .versions_end) { assert_data_frame(x) + x <- rename(x, ...) + x <- guess_column_name(x, "time_value", time_column_names()) + x <- guess_column_name(x, "geo_value", geo_column_names()) + x <- guess_column_name(x, "version", version_column_names()) if (!test_subset(c("geo_value", "time_value", "version"), names(x))) { cli_abort( - "Columns `geo_value`, `time_value`, and `version` must be present in `x`." + "Either columns `geo_value`, `time_value`, and `version`, or related columns + (see the internal functions `guess_time_column_name()`, + `guess_geo_column_name()` and/or `guess_geo_version_name()` for complete + list) must be present in `x`." ) } if (anyMissing(x$version)) { diff --git a/R/epi_df.R b/R/epi_df.R index f4df1604..707944f6 100644 --- a/R/epi_df.R +++ b/R/epi_df.R @@ -95,7 +95,7 @@ NULL #' #' @export new_epi_df <- function(x = tibble::tibble(), geo_type, time_type, as_of, - additional_metadata = list(), ...) { + additional_metadata = list()) { assert_data_frame(x) assert_list(additional_metadata) @@ -162,6 +162,7 @@ new_epi_df <- function(x = tibble::tibble(), geo_type, time_type, as_of, #' guide](https://cmu-delphi.github.io/epiprocess/articles/epiprocess.html) for #' examples. #' +#' @param ... Additional arguments passed to methods. #' @template epi_df-params #' #' @export @@ -249,25 +250,39 @@ as_epi_df.epi_df <- function(x, ...) { #' @method as_epi_df tbl_df #' @describeIn as_epi_df The input tibble `x` must contain the columns -#' `geo_value` and `time_value`. All other columns will be preserved as is, -#' and treated as measured variables. If `as_of` is missing, then the function -#' will try to guess it from an `as_of`, `issue`, or `version` column of `x` -#' (if any of these are present), or from as an `as_of` field in its metadata -#' (stored in its attributes); if this fails, then the current day-time will -#' be used. +#' `geo_value` and `time_value`, or column names that uniquely map onto these +#' (e.g. `date` or `province`). Alternatively, you can specify the conversion +#' explicitly (`time_value = someWeirdColumnName`). All other columns not +#' specified as `other_keys` will be preserved as is, and treated as measured +#' variables. +#' +#' If `as_of` is missing, then the function will try to guess it from an +#' `as_of`, `issue`, or `version` column of `x` (if any of these are present), +#' or from as an `as_of` field in its metadata (stored in its attributes); if +#' this fails, then the current day-time will be used. #' @importFrom rlang .data +#' @importFrom tidyselect any_of +#' @importFrom cli cli_inform #' @export as_epi_df.tbl_df <- function(x, geo_type, time_type, as_of, - additional_metadata = list(), ...) { + additional_metadata = list(), + ...) { + # possible standard substitutions for time_value + x <- rename(x, ...) + x <- guess_column_name(x, "time_value", time_column_names()) + x <- guess_column_name(x, "geo_value", geo_column_names()) if (!test_subset(c("geo_value", "time_value"), names(x))) { cli_abort( - "Columns `geo_value` and `time_value` must be present in `x`." + "Either columns `geo_value` and `time_value` or related columns + (see the internal functions `guess_time_column_name()` and/or + `guess_geo_column_name()` for a complete list) + must be present in `x`." ) } new_epi_df( x, geo_type, time_type, as_of, - additional_metadata, ... + additional_metadata ) } diff --git a/R/utils.R b/R/utils.R index b2098e9b..9396070d 100644 --- a/R/utils.R +++ b/R/utils.R @@ -448,6 +448,105 @@ guess_time_type <- function(time_value) { return("custom") } +#' given a vector of characters, add the same values, but upcased, e.g. +#' "date" -> c("date", "Date") +#' "target_date" -> c("target_date", "Target_Date") +#' @keywords internal +upcase_snake_case <- function(vec) { + upper_vec <- strsplit(vec, "_") %>% + map(function(name) paste0(toupper(substr(name, 1, 1)), substr(name, 2, nchar(name)), collapse = "_")) %>% + unlist() + c(vec, upper_vec) +} + +#' potential time_value columns +#' @description +#' the full list of potential substitutions for the `time_value` column name: +#' `r time_column_names()` +#' @export +time_column_names <- function() { + substitutions <- c( + "time_value", "date", "time", "datetime", "dateTime", "date_time", "target_date", + "week", "epiweek", "month", "mon", "year", "yearmon", "yearmonth", + "yearMon", "yearMonth", "dates", "time_values", "target_dates", "time_Value" + ) + substitutions <- upcase_snake_case(substitutions) + names(substitutions) <- rep("time_value", length(substitutions)) + return(substitutions) +} +# +#' potential geo_value columns +#' @description +#' the full list of potential substitutions for the `geo_value` column name: +#' `r geo_column_names()` +#' @export +geo_column_names <- function() { + substitutions <- c( + "geo_value", "geo_values", "geo_id", "geos", "location", "jurisdiction", "fips", "zip", + "county", "hrr", "msa", "state", "province", "nation", "states", + "provinces", "counties", "geo_Value" + ) + substitutions <- upcase_snake_case(substitutions) + names(substitutions) <- rep("geo_value", length(substitutions)) + return(substitutions) +} + +#' potential version columns +#' @description +#' the full list of potential substitutions for the `version` column name: +#' `r version_column_names()` +#' @export +version_column_names <- function() { + substitutions <- c( + "version", "issue", "release" + ) + substitutions <- upcase_snake_case(substitutions) + names(substitutions) <- rep("version", length(substitutions)) + return(substitutions) +} + +#' rename potential time_value columns +#' +#' @description +#' potentially renames +#' @param x the tibble to potentially rename +#' @param substitutions a named vector. the potential substitions, with every name `time_value` +#' @keywords internal +#' @importFrom cli cli_inform cli_abort +#' @importFrom dplyr rename +guess_column_name <- function(x, column_name, substitutions) { + if (!(column_name %in% names(x))) { + # if none of the names are in substitutions, and `column_name` isn't a column, we're missing a relevant column + if (!any(names(x) %in% substitutions)) { + cli_abort( + "There is no {column_name} column or similar name. + See e.g. [`time_column_name()`] for a complete list", + class = "epiprocess__guess_column__multiple_substitution_error" + ) + } + + tryCatch( + { + x <- x %>% rename(any_of(substitutions)) + cli_inform( + "inferring {column_name} column.", + class = "epiprocess__guess_column_inferring_inform" + ) + return(x) + }, + error = function(cond) { + cli_abort( + "{intersect(names(x), substitutions)} + are both/all valid substitutions for {column_name}. + Either `rename` some yourself or drop some.", + class = "epiprocess__guess_column__multiple_substitution_error" + ) + } + ) + } + return(x) +} + ########## diff --git a/_pkgdown.yml b/_pkgdown.yml index 4930c9f5..b95a6386 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -63,6 +63,7 @@ reference: desc: Details on `epi_df` format, and basic functionality. - contents: - matches("epi_df") + - matches("column_names") - title: "`epi_*()` functions" desc: Functions that act on `epi_df` objects. - contents: diff --git a/man-roxygen/epi_df-params.R b/man-roxygen/epi_df-params.R index 54d8c2d2..59c51603 100644 --- a/man-roxygen/epi_df-params.R +++ b/man-roxygen/epi_df-params.R @@ -15,5 +15,4 @@ #' `as_of` fields; named entries from the passed list will be included as #' well. If your tibble has additional keys, be sure to specify them as a #' character vector in the `other_keys` component of `additional_metadata`. -#' @param ... Additional arguments passed to methods. #' @return An `epi_df` object. diff --git a/man/as_epi_df.Rd b/man/as_epi_df.Rd index 40c0a1c5..98cdbb83 100644 --- a/man/as_epi_df.Rd +++ b/man/as_epi_df.Rd @@ -55,12 +55,16 @@ examples. \item \code{as_epi_df(epi_df)}: Simply returns the \code{epi_df} object unchanged. \item \code{as_epi_df(tbl_df)}: The input tibble \code{x} must contain the columns -\code{geo_value} and \code{time_value}. All other columns will be preserved as is, -and treated as measured variables. If \code{as_of} is missing, then the function -will try to guess it from an \code{as_of}, \code{issue}, or \code{version} column of \code{x} -(if any of these are present), or from as an \code{as_of} field in its metadata -(stored in its attributes); if this fails, then the current day-time will -be used. +\code{geo_value} and \code{time_value}, or column names that uniquely map onto these +(e.g. \code{date} or \code{province}). Alternatively, you can specify the conversion +explicitly (\code{time_value = someWeirdColumnName}). All other columns not +specified as \code{other_keys} will be preserved as is, and treated as measured +variables. + +If \code{as_of} is missing, then the function will try to guess it from an +\code{as_of}, \code{issue}, or \code{version} column of \code{x} (if any of these are present), +or from as an \code{as_of} field in its metadata (stored in its attributes); if +this fails, then the current day-time will be used. \item \code{as_epi_df(data.frame)}: Works analogously to \code{as_epi_df.tbl_df()}. diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd index 97ff6af0..99203052 100644 --- a/man/epi_archive.Rd +++ b/man/epi_archive.Rd @@ -37,7 +37,9 @@ as_epi_archive( additional_metadata = NULL, compactify = NULL, clobberable_versions_start = NULL, - versions_end = NULL + .versions_end = NULL, + ..., + versions_end = .versions_end ) } \arguments{ @@ -89,6 +91,13 @@ beyond \code{max(x$version)}, but they all contained empty updates. (The default value of \code{clobberable_versions_start} does not fully trust these empty updates, and assumes that any version \verb{>= max(x$version)} could be clobbered.) If \code{nrow(x) == 0}, then this argument is mandatory.} + +\item{.versions_end}{location based versions_end, used to avoid prefix +\code{version = issue} from being assigned to \code{versions_end} instead of being +used to rename columns.} + +\item{...}{used for specifying column names, as in \code{\link[dplyr:rename]{dplyr::rename}}. For +example \code{version = release_date}} } \value{ An \code{epi_archive} object. diff --git a/man/geo_column_names.Rd b/man/geo_column_names.Rd new file mode 100644 index 00000000..4b3810dc --- /dev/null +++ b/man/geo_column_names.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{geo_column_names} +\alias{geo_column_names} +\title{potential geo_value columns} +\usage{ +geo_column_names() +} +\description{ +the full list of potential substitutions for the \code{geo_value} column name: +geo_value, geo_values, geo_id, geos, location, jurisdiction, fips, zip, county, hrr, msa, state, province, nation, states, provinces, counties, geo_Value, Geo_Value, Geo_Values, Geo_Id, Geos, Location, Jurisdiction, Fips, Zip, County, Hrr, Msa, State, Province, Nation, States, Provinces, Counties, Geo_Value +} diff --git a/man/guess_column_name.Rd b/man/guess_column_name.Rd new file mode 100644 index 00000000..d4aa09b7 --- /dev/null +++ b/man/guess_column_name.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{guess_column_name} +\alias{guess_column_name} +\title{rename potential time_value columns} +\usage{ +guess_column_name(x, column_name, substitutions) +} +\arguments{ +\item{x}{the tibble to potentially rename} + +\item{substitutions}{a named vector. the potential substitions, with every name \code{time_value}} +} +\description{ +potentially renames +} +\keyword{internal} diff --git a/man/new_epi_df.Rd b/man/new_epi_df.Rd index 7182c222..8010b700 100644 --- a/man/new_epi_df.Rd +++ b/man/new_epi_df.Rd @@ -9,8 +9,7 @@ new_epi_df( geo_type, time_type, as_of, - additional_metadata = list(), - ... + additional_metadata = list() ) } \arguments{ @@ -35,8 +34,6 @@ then the current day-time will be used.} \code{as_of} fields; named entries from the passed list will be included as well. If your tibble has additional keys, be sure to specify them as a character vector in the \code{other_keys} component of \code{additional_metadata}.} - -\item{...}{Additional arguments passed to methods.} } \value{ An \code{epi_df} object. diff --git a/man/time_column_names.Rd b/man/time_column_names.Rd new file mode 100644 index 00000000..2e2db6b5 --- /dev/null +++ b/man/time_column_names.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{time_column_names} +\alias{time_column_names} +\title{potential time_value columns} +\usage{ +time_column_names() +} +\description{ +the full list of potential substitutions for the \code{time_value} column name: +time_value, date, time, datetime, dateTime, date_time, target_date, week, epiweek, month, mon, year, yearmon, yearmonth, yearMon, yearMonth, dates, time_values, target_dates, time_Value, Time_Value, Date, Time, Datetime, DateTime, Date_Time, Target_Date, Week, Epiweek, Month, Mon, Year, Yearmon, Yearmonth, YearMon, YearMonth, Dates, Time_Values, Target_Dates, Time_Value +} diff --git a/man/upcase_snake_case.Rd b/man/upcase_snake_case.Rd new file mode 100644 index 00000000..31ecb768 --- /dev/null +++ b/man/upcase_snake_case.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{upcase_snake_case} +\alias{upcase_snake_case} +\title{given a vector of characters, add the same values, but upcased, e.g. +"date" -> c("date", "Date") +"target_date" -> c("target_date", "Target_Date")} +\usage{ +upcase_snake_case(vec) +} +\description{ +given a vector of characters, add the same values, but upcased, e.g. +"date" -> c("date", "Date") +"target_date" -> c("target_date", "Target_Date") +} +\keyword{internal} diff --git a/man/version_column_names.Rd b/man/version_column_names.Rd new file mode 100644 index 00000000..75ee2315 --- /dev/null +++ b/man/version_column_names.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{version_column_names} +\alias{version_column_names} +\title{potential version columns} +\usage{ +version_column_names() +} +\description{ +the full list of potential substitutions for the \code{version} column name: +version, issue, release, Version, Issue, Release +} diff --git a/tests/testthat/test-archive.R b/tests/testthat/test-archive.R index 1291e3c7..d437c983 100644 --- a/tests/testthat/test-archive.R +++ b/tests/testthat/test-archive.R @@ -8,13 +8,43 @@ dt <- archive_cases_dv_subset$DT test_that("data.frame must contain geo_value, time_value and version columns", { expect_error(as_epi_archive(select(dt, -geo_value), compactify = FALSE), - regexp = "Columns `geo_value`, `time_value`, and `version` must be present in `x`." + regexp = "There is no geo_value column or similar name" ) expect_error(as_epi_archive(select(dt, -time_value), compactify = FALSE), - regexp = "Columns `geo_value`, `time_value`, and `version` must be present in `x`." + regexp = "There is no time_value column or similar name" ) expect_error(as_epi_archive(select(dt, -version), compactify = FALSE), - regexp = "Columns `geo_value`, `time_value`, and `version` must be present in `x`." + regexp = "There is no version column or similar name" + ) +}) + +test_that("as_epi_archive custom name mapping works correctly", { + # custom name works correctly + expect_equal( + as_epi_archive(rename(dt, weirdName = version), + version = weirdName, compactify = TRUE + ), + as_epi_archive(dt, compactify = TRUE) + ) + expect_equal( + as_epi_archive(rename(dt, weirdName = geo_value), + geo_value = weirdName, compactify = TRUE + ), + as_epi_archive(dt, compactify = TRUE) + ) + expect_equal( + as_epi_archive(rename(dt, weirdName = time_value), + time_value = weirdName, compactify = TRUE + ), + as_epi_archive(dt, compactify = TRUE) + ) + + expect_error( + as_epi_archive( + rename(dt, weirdName = version), + version = weirdName, + version = time_value + ), "Names must be unique" ) }) diff --git a/tests/testthat/test-epi_df.R b/tests/testthat/test-epi_df.R index 8cfb4408..1c5e527f 100644 --- a/tests/testthat/test-epi_df.R +++ b/tests/testthat/test-epi_df.R @@ -46,6 +46,41 @@ test_that("as_epi_df errors when additional_metadata is not a list", { ) }) +test_that("as_epi_df works for nonstandard input", { + tib <- tibble::tibble( + x = 1:10, y = 1:10, + date = rep(seq(as.Date("2020-01-01"), by = 1, length.out = 5), times = 2), + geo_value = rep(c("ca", "hi"), each = 5) + ) + expect_message(expect_no_error(tib_epi_df <- tib %>% as_epi_df()), + class = "epiprocess__guess_column_inferring_inform" + ) + expect_no_error(tib_epi_df <- tib %>% as_epi_df(time_value = date, geo_value = geo_value)) + expect_error( + expect_message( + tib %>% + rename(awefa = geo_value) %>% + as_epi_df(), + class = "epiprocess__guess_column_inferring_inform" + ), + class = "epiprocess__guess_column__multiple_substitution_error" + ) + expect_no_error(expect_message( + tib %>% rename(awefa = geo_value) %>% as_epi_df(geo_value = awefa), + class = "epiprocess__guess_column_inferring_inform" + )) + + tib <- tib %>% rename(target_date = date) + expect_message(expect_no_error(tib_epi_df <- tib %>% as_epi_df()), + class = "epiprocess__guess_column_inferring_inform" + ) + + tib <- tib %>% mutate(Time = 20 + target_date) + expect_error(tib_epi_df <- tib %>% as_epi_df(), + class = "epiprocess__guess_column__multiple_substitution_error" + ) +}) + # select fixes tib <- tibble::tibble(