Merge pull request #460 from cmu-delphi/autoName

`epi_df` automatic argument
cmu-delphi · Jul 19, 2024 · 69ea5e4 · 69ea5e4
2 parents 8f25ec9 + 243c45e
commit 69ea5e4
Show file tree

Hide file tree

Showing 18 changed files with 309 additions and 28 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: epiprocess
 Title: Tools for basic signal processing in epidemiology
-Version: 0.7.12
+Version: 0.7.13
 Authors@R: c(
     person("Jacob", "Bien", role = "ctb"),
     person("Logan", "Brooks", email = "[email protected]", role = c("aut", "cre")),

diff --git a/NAMESPACE b/NAMESPACE
@@ -61,6 +61,7 @@ export(epix_merge)
 export(epix_slide)
 export(epix_truncate_versions_after)
 export(filter)
+export(geo_column_names)
 export(group_by)
 export(group_modify)
 export(growth_rate)
@@ -75,9 +76,11 @@ export(next_after)
 export(relocate)
 export(rename)
 export(slice)
+export(time_column_names)
 export(ungroup)
 export(unnest)
 export(validate_epi_archive)
+export(version_column_names)
 importFrom(checkmate,anyInfinite)
 importFrom(checkmate,anyMissing)
 importFrom(checkmate,assert)
@@ -100,6 +103,7 @@ importFrom(checkmate,test_subset)
 importFrom(checkmate,vname)
 importFrom(cli,cat_line)
 importFrom(cli,cli_abort)
+importFrom(cli,cli_inform)
 importFrom(cli,cli_vec)
 importFrom(cli,cli_warn)
 importFrom(cli,format_message)
@@ -186,6 +190,7 @@ importFrom(tibble,as_tibble)
 importFrom(tibble,new_tibble)
 importFrom(tibble,validate_tibble)
 importFrom(tidyr,unnest)
+importFrom(tidyselect,any_of)
 importFrom(tidyselect,eval_select)
 importFrom(tidyselect,starts_with)
 importFrom(tsibble,as_tsibble)

diff --git a/NEWS.md b/NEWS.md
@@ -40,6 +40,11 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat
 - Improved documentation web site landing page's introduction.
 - Fixed documentation referring to old `epi_slide()` interface (#466, thanks
   @XuedaShen!).
+- `as_epi_df` and `as_epi_archive` now support arguments to specify column names
+  e.g.  `as_epi_df(some_tibble, geo_value=state)`. In addition, there is a list
+  of default conversions, see `time_column_names`  for a list of columns that
+  will automatically be recognized and converted to  `time_value` column (there
+  are similar functions for `geo` and `version`).
 
 ## Cleanup
 - Resolved some linting messages in package checks (#468).

diff --git a/R/archive.R b/R/archive.R
@@ -442,6 +442,11 @@ validate_epi_archive <- function(
 
 #' `as_epi_archive` converts a data frame, data table, or tibble into an
 #' `epi_archive` object.
+#' @param ... used for specifying column names, as in [`dplyr::rename`]. For
+#'   example `version = release_date`
+#' @param .versions_end location based versions_end, used to avoid prefix
+#'   `version = issue` from being assigned to `versions_end` instead of being
+#'   used to rename columns.
 #'
 #' @rdname epi_archive
 #'
@@ -454,11 +459,19 @@ as_epi_archive <- function(
     additional_metadata = NULL,
     compactify = NULL,
     clobberable_versions_start = NULL,
-    versions_end = NULL) {
+    .versions_end = NULL, ...,
+    versions_end = .versions_end) {
   assert_data_frame(x)
+  x <- rename(x, ...)
+  x <- guess_column_name(x, "time_value", time_column_names())
+  x <- guess_column_name(x, "geo_value", geo_column_names())
+  x <- guess_column_name(x, "version", version_column_names())
   if (!test_subset(c("geo_value", "time_value", "version"), names(x))) {
     cli_abort(
-      "Columns `geo_value`, `time_value`, and `version` must be present in `x`."
+      "Either columns `geo_value`, `time_value`, and `version`, or related columns
+      (see the internal functions `guess_time_column_name()`,
+      `guess_geo_column_name()` and/or `guess_geo_version_name()` for complete
+      list) must be present in `x`."
     )
   }
   if (anyMissing(x$version)) {

diff --git a/R/epi_df.R b/R/epi_df.R
@@ -95,7 +95,7 @@ NULL
 #'
 #' @export
 new_epi_df <- function(x = tibble::tibble(), geo_type, time_type, as_of,
-                       additional_metadata = list(), ...) {
+                       additional_metadata = list()) {
   assert_data_frame(x)
   assert_list(additional_metadata)
 
@@ -162,6 +162,7 @@ new_epi_df <- function(x = tibble::tibble(), geo_type, time_type, as_of,
 #' guide](https://cmu-delphi.github.io/epiprocess/articles/epiprocess.html) for
 #' examples.
 #'
+#' @param ... Additional arguments passed to methods.
 #' @template epi_df-params
 #'
 #' @export
@@ -249,25 +250,39 @@ as_epi_df.epi_df <- function(x, ...) {
 
 #' @method as_epi_df tbl_df
 #' @describeIn as_epi_df The input tibble `x` must contain the columns
-#'   `geo_value` and `time_value`. All other columns will be preserved as is,
-#'   and treated as measured variables. If `as_of` is missing, then the function
-#'   will try to guess it from an `as_of`, `issue`, or `version` column of `x`
-#'   (if any of these are present), or from as an `as_of` field in its metadata
-#'   (stored in its attributes); if this fails, then the current day-time will
-#'   be used.
+#'   `geo_value` and `time_value`, or column names that uniquely map onto these
+#'   (e.g. `date` or `province`). Alternatively, you can specify the conversion
+#'   explicitly (`time_value = someWeirdColumnName`). All other columns not
+#'   specified as `other_keys` will be preserved as is, and treated as measured
+#'   variables.
+#'
+#'  If `as_of` is missing, then the function will try to guess it from an
+#'   `as_of`, `issue`, or `version` column of `x` (if any of these are present),
+#'   or from as an `as_of` field in its metadata (stored in its attributes); if
+#'   this fails, then the current day-time will be used.
 #' @importFrom rlang .data
+#' @importFrom tidyselect any_of
+#' @importFrom cli cli_inform
 #' @export
 as_epi_df.tbl_df <- function(x, geo_type, time_type, as_of,
-                             additional_metadata = list(), ...) {
+                             additional_metadata = list(),
+                             ...) {
+  # possible standard substitutions for time_value
+  x <- rename(x, ...)
+  x <- guess_column_name(x, "time_value", time_column_names())
+  x <- guess_column_name(x, "geo_value", geo_column_names())
   if (!test_subset(c("geo_value", "time_value"), names(x))) {
     cli_abort(
-      "Columns `geo_value` and `time_value` must be present in `x`."
+      "Either columns `geo_value` and `time_value` or related columns
+       (see the internal functions `guess_time_column_name()` and/or
+       `guess_geo_column_name()` for a complete list)
+       must be present in `x`."
     )
   }
 
   new_epi_df(
     x, geo_type, time_type, as_of,
-    additional_metadata, ...
+    additional_metadata
   )
 }
 

diff --git a/R/utils.R b/R/utils.R
@@ -448,6 +448,105 @@ guess_time_type <- function(time_value) {
   return("custom")
 }
 
+#'  given a vector of characters, add the same values, but upcased, e.g.
+#'   "date" -> c("date", "Date")
+#'   "target_date" -> c("target_date", "Target_Date")
+#' @keywords internal
+upcase_snake_case <- function(vec) {
+  upper_vec <- strsplit(vec, "_") %>%
+    map(function(name) paste0(toupper(substr(name, 1, 1)), substr(name, 2, nchar(name)), collapse = "_")) %>%
+    unlist()
+  c(vec, upper_vec)
+}
+
+#' potential time_value columns
+#' @description
+#' the full list of potential substitutions for the `time_value` column name:
+#' `r time_column_names()`
+#' @export
+time_column_names <- function() {
+  substitutions <- c(
+    "time_value", "date", "time", "datetime", "dateTime", "date_time", "target_date",
+    "week", "epiweek", "month", "mon", "year", "yearmon", "yearmonth",
+    "yearMon", "yearMonth", "dates", "time_values", "target_dates", "time_Value"
+  )
+  substitutions <- upcase_snake_case(substitutions)
+  names(substitutions) <- rep("time_value", length(substitutions))
+  return(substitutions)
+}
+#
+#' potential geo_value columns
+#' @description
+#' the full list of potential substitutions for the `geo_value` column name:
+#' `r geo_column_names()`
+#' @export
+geo_column_names <- function() {
+  substitutions <- c(
+    "geo_value", "geo_values", "geo_id", "geos", "location", "jurisdiction", "fips", "zip",
+    "county", "hrr", "msa", "state", "province", "nation", "states",
+    "provinces", "counties", "geo_Value"
+  )
+  substitutions <- upcase_snake_case(substitutions)
+  names(substitutions) <- rep("geo_value", length(substitutions))
+  return(substitutions)
+}
+
+#' potential version columns
+#' @description
+#' the full list of potential substitutions for the `version` column name:
+#' `r version_column_names()`
+#' @export
+version_column_names <- function() {
+  substitutions <- c(
+    "version", "issue", "release"
+  )
+  substitutions <- upcase_snake_case(substitutions)
+  names(substitutions) <- rep("version", length(substitutions))
+  return(substitutions)
+}
+
+#' rename potential time_value columns
+#'
+#' @description
+#' potentially renames
+#' @param x the tibble to potentially rename
+#' @param substitutions a named vector. the potential substitions, with every name `time_value`
+#' @keywords internal
+#' @importFrom cli cli_inform cli_abort
+#' @importFrom dplyr rename
+guess_column_name <- function(x, column_name, substitutions) {
+  if (!(column_name %in% names(x))) {
+    # if none of the names are in substitutions, and `column_name` isn't a column, we're missing a relevant column
+    if (!any(names(x) %in% substitutions)) {
+      cli_abort(
+        "There is no {column_name} column or similar name.
+         See e.g. [`time_column_name()`] for a complete list",
+        class = "epiprocess__guess_column__multiple_substitution_error"
+      )
+    }
+
+    tryCatch(
+      {
+        x <- x %>% rename(any_of(substitutions))
+        cli_inform(
+          "inferring {column_name} column.",
+          class = "epiprocess__guess_column_inferring_inform"
+        )
+        return(x)
+      },
+      error = function(cond) {
+        cli_abort(
+          "{intersect(names(x), substitutions)}
+           are both/all valid substitutions for {column_name}.
+           Either `rename` some yourself or drop some.",
+          class = "epiprocess__guess_column__multiple_substitution_error"
+        )
+      }
+    )
+  }
+  return(x)
+}
+
 ##########
 
 

diff --git a/_pkgdown.yml b/_pkgdown.yml
@@ -63,6 +63,7 @@ reference:
     desc: Details on `epi_df` format, and basic functionality.
   - contents:
       - matches("epi_df")
+      - matches("column_names")
   - title: "`epi_*()` functions"
     desc: Functions that act on `epi_df` objects.
   - contents:

diff --git a/man-roxygen/epi_df-params.R b/man-roxygen/epi_df-params.R
@@ -15,5 +15,4 @@
 #'   `as_of` fields; named entries from the passed list will be included as
 #'   well. If your tibble has additional keys, be sure to specify them as a
 #'   character vector in the `other_keys` component of `additional_metadata`.
-#' @param ... Additional arguments passed to methods.
 #' @return An `epi_df` object.
diff --git a/man/as_epi_df.Rd b/man/as_epi_df.Rd
diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd
diff --git a/man/geo_column_names.Rd b/man/geo_column_names.Rd
diff --git a/man/guess_column_name.Rd b/man/guess_column_name.Rd
diff --git a/man/new_epi_df.Rd b/man/new_epi_df.Rd
diff --git a/man/time_column_names.Rd b/man/time_column_names.Rd