Skip to content

Commit

Permalink
Merge pull request #460 from cmu-delphi/autoName
Browse files Browse the repository at this point in the history
`epi_df` automatic argument
  • Loading branch information
dsweber2 authored Jul 19, 2024
2 parents 8f25ec9 + 243c45e commit 69ea5e4
Show file tree
Hide file tree
Showing 18 changed files with 309 additions and 28 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Type: Package
Package: epiprocess
Title: Tools for basic signal processing in epidemiology
Version: 0.7.12
Version: 0.7.13
Authors@R: c(
person("Jacob", "Bien", role = "ctb"),
person("Logan", "Brooks", email = "[email protected]", role = c("aut", "cre")),
Expand Down
5 changes: 5 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ export(epix_merge)
export(epix_slide)
export(epix_truncate_versions_after)
export(filter)
export(geo_column_names)
export(group_by)
export(group_modify)
export(growth_rate)
Expand All @@ -75,9 +76,11 @@ export(next_after)
export(relocate)
export(rename)
export(slice)
export(time_column_names)
export(ungroup)
export(unnest)
export(validate_epi_archive)
export(version_column_names)
importFrom(checkmate,anyInfinite)
importFrom(checkmate,anyMissing)
importFrom(checkmate,assert)
Expand All @@ -100,6 +103,7 @@ importFrom(checkmate,test_subset)
importFrom(checkmate,vname)
importFrom(cli,cat_line)
importFrom(cli,cli_abort)
importFrom(cli,cli_inform)
importFrom(cli,cli_vec)
importFrom(cli,cli_warn)
importFrom(cli,format_message)
Expand Down Expand Up @@ -186,6 +190,7 @@ importFrom(tibble,as_tibble)
importFrom(tibble,new_tibble)
importFrom(tibble,validate_tibble)
importFrom(tidyr,unnest)
importFrom(tidyselect,any_of)
importFrom(tidyselect,eval_select)
importFrom(tidyselect,starts_with)
importFrom(tsibble,as_tsibble)
Expand Down
5 changes: 5 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,11 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat
- Improved documentation web site landing page's introduction.
- Fixed documentation referring to old `epi_slide()` interface (#466, thanks
@XuedaShen!).
- `as_epi_df` and `as_epi_archive` now support arguments to specify column names
e.g. `as_epi_df(some_tibble, geo_value=state)`. In addition, there is a list
of default conversions, see `time_column_names` for a list of columns that
will automatically be recognized and converted to `time_value` column (there
are similar functions for `geo` and `version`).

## Cleanup
- Resolved some linting messages in package checks (#468).
Expand Down
17 changes: 15 additions & 2 deletions R/archive.R
Original file line number Diff line number Diff line change
Expand Up @@ -442,6 +442,11 @@ validate_epi_archive <- function(

#' `as_epi_archive` converts a data frame, data table, or tibble into an
#' `epi_archive` object.
#' @param ... used for specifying column names, as in [`dplyr::rename`]. For
#' example `version = release_date`
#' @param .versions_end location based versions_end, used to avoid prefix
#' `version = issue` from being assigned to `versions_end` instead of being
#' used to rename columns.
#'
#' @rdname epi_archive
#'
Expand All @@ -454,11 +459,19 @@ as_epi_archive <- function(
additional_metadata = NULL,
compactify = NULL,
clobberable_versions_start = NULL,
versions_end = NULL) {
.versions_end = NULL, ...,
versions_end = .versions_end) {
assert_data_frame(x)
x <- rename(x, ...)
x <- guess_column_name(x, "time_value", time_column_names())
x <- guess_column_name(x, "geo_value", geo_column_names())
x <- guess_column_name(x, "version", version_column_names())
if (!test_subset(c("geo_value", "time_value", "version"), names(x))) {
cli_abort(
"Columns `geo_value`, `time_value`, and `version` must be present in `x`."
"Either columns `geo_value`, `time_value`, and `version`, or related columns
(see the internal functions `guess_time_column_name()`,
`guess_geo_column_name()` and/or `guess_geo_version_name()` for complete
list) must be present in `x`."
)
}
if (anyMissing(x$version)) {
Expand Down
35 changes: 25 additions & 10 deletions R/epi_df.R
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ NULL
#'
#' @export
new_epi_df <- function(x = tibble::tibble(), geo_type, time_type, as_of,
additional_metadata = list(), ...) {
additional_metadata = list()) {
assert_data_frame(x)
assert_list(additional_metadata)

Expand Down Expand Up @@ -162,6 +162,7 @@ new_epi_df <- function(x = tibble::tibble(), geo_type, time_type, as_of,
#' guide](https://cmu-delphi.github.io/epiprocess/articles/epiprocess.html) for
#' examples.
#'
#' @param ... Additional arguments passed to methods.
#' @template epi_df-params
#'
#' @export
Expand Down Expand Up @@ -249,25 +250,39 @@ as_epi_df.epi_df <- function(x, ...) {

#' @method as_epi_df tbl_df
#' @describeIn as_epi_df The input tibble `x` must contain the columns
#' `geo_value` and `time_value`. All other columns will be preserved as is,
#' and treated as measured variables. If `as_of` is missing, then the function
#' will try to guess it from an `as_of`, `issue`, or `version` column of `x`
#' (if any of these are present), or from as an `as_of` field in its metadata
#' (stored in its attributes); if this fails, then the current day-time will
#' be used.
#' `geo_value` and `time_value`, or column names that uniquely map onto these
#' (e.g. `date` or `province`). Alternatively, you can specify the conversion
#' explicitly (`time_value = someWeirdColumnName`). All other columns not
#' specified as `other_keys` will be preserved as is, and treated as measured
#' variables.
#'
#' If `as_of` is missing, then the function will try to guess it from an
#' `as_of`, `issue`, or `version` column of `x` (if any of these are present),
#' or from as an `as_of` field in its metadata (stored in its attributes); if
#' this fails, then the current day-time will be used.
#' @importFrom rlang .data
#' @importFrom tidyselect any_of
#' @importFrom cli cli_inform
#' @export
as_epi_df.tbl_df <- function(x, geo_type, time_type, as_of,
additional_metadata = list(), ...) {
additional_metadata = list(),
...) {
# possible standard substitutions for time_value
x <- rename(x, ...)
x <- guess_column_name(x, "time_value", time_column_names())
x <- guess_column_name(x, "geo_value", geo_column_names())
if (!test_subset(c("geo_value", "time_value"), names(x))) {
cli_abort(
"Columns `geo_value` and `time_value` must be present in `x`."
"Either columns `geo_value` and `time_value` or related columns
(see the internal functions `guess_time_column_name()` and/or
`guess_geo_column_name()` for a complete list)
must be present in `x`."
)
}

new_epi_df(
x, geo_type, time_type, as_of,
additional_metadata, ...
additional_metadata
)
}

Expand Down
99 changes: 99 additions & 0 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -448,6 +448,105 @@ guess_time_type <- function(time_value) {
return("custom")
}

#' given a vector of characters, add the same values, but upcased, e.g.
#' "date" -> c("date", "Date")
#' "target_date" -> c("target_date", "Target_Date")
#' @keywords internal
upcase_snake_case <- function(vec) {
upper_vec <- strsplit(vec, "_") %>%
map(function(name) paste0(toupper(substr(name, 1, 1)), substr(name, 2, nchar(name)), collapse = "_")) %>%
unlist()
c(vec, upper_vec)
}

#' potential time_value columns
#' @description
#' the full list of potential substitutions for the `time_value` column name:
#' `r time_column_names()`
#' @export
time_column_names <- function() {
substitutions <- c(
"time_value", "date", "time", "datetime", "dateTime", "date_time", "target_date",
"week", "epiweek", "month", "mon", "year", "yearmon", "yearmonth",
"yearMon", "yearMonth", "dates", "time_values", "target_dates", "time_Value"
)
substitutions <- upcase_snake_case(substitutions)
names(substitutions) <- rep("time_value", length(substitutions))
return(substitutions)
}
#
#' potential geo_value columns
#' @description
#' the full list of potential substitutions for the `geo_value` column name:
#' `r geo_column_names()`
#' @export
geo_column_names <- function() {
substitutions <- c(
"geo_value", "geo_values", "geo_id", "geos", "location", "jurisdiction", "fips", "zip",
"county", "hrr", "msa", "state", "province", "nation", "states",
"provinces", "counties", "geo_Value"
)
substitutions <- upcase_snake_case(substitutions)
names(substitutions) <- rep("geo_value", length(substitutions))
return(substitutions)
}

#' potential version columns
#' @description
#' the full list of potential substitutions for the `version` column name:
#' `r version_column_names()`
#' @export
version_column_names <- function() {
substitutions <- c(
"version", "issue", "release"
)
substitutions <- upcase_snake_case(substitutions)
names(substitutions) <- rep("version", length(substitutions))
return(substitutions)
}

#' rename potential time_value columns
#'
#' @description
#' potentially renames
#' @param x the tibble to potentially rename
#' @param substitutions a named vector. the potential substitions, with every name `time_value`
#' @keywords internal
#' @importFrom cli cli_inform cli_abort
#' @importFrom dplyr rename
guess_column_name <- function(x, column_name, substitutions) {
if (!(column_name %in% names(x))) {
# if none of the names are in substitutions, and `column_name` isn't a column, we're missing a relevant column
if (!any(names(x) %in% substitutions)) {
cli_abort(
"There is no {column_name} column or similar name.
See e.g. [`time_column_name()`] for a complete list",
class = "epiprocess__guess_column__multiple_substitution_error"
)
}

tryCatch(
{
x <- x %>% rename(any_of(substitutions))
cli_inform(
"inferring {column_name} column.",
class = "epiprocess__guess_column_inferring_inform"
)
return(x)
},
error = function(cond) {
cli_abort(
"{intersect(names(x), substitutions)}
are both/all valid substitutions for {column_name}.
Either `rename` some yourself or drop some.",
class = "epiprocess__guess_column__multiple_substitution_error"
)
}
)
}
return(x)
}

##########


Expand Down
1 change: 1 addition & 0 deletions _pkgdown.yml
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ reference:
desc: Details on `epi_df` format, and basic functionality.
- contents:
- matches("epi_df")
- matches("column_names")
- title: "`epi_*()` functions"
desc: Functions that act on `epi_df` objects.
- contents:
Expand Down
1 change: 0 additions & 1 deletion man-roxygen/epi_df-params.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,4 @@
#' `as_of` fields; named entries from the passed list will be included as
#' well. If your tibble has additional keys, be sure to specify them as a
#' character vector in the `other_keys` component of `additional_metadata`.
#' @param ... Additional arguments passed to methods.
#' @return An `epi_df` object.
16 changes: 10 additions & 6 deletions man/as_epi_df.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

11 changes: 10 additions & 1 deletion man/epi_archive.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 12 additions & 0 deletions man/geo_column_names.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 17 additions & 0 deletions man/guess_column_name.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 1 addition & 4 deletions man/new_epi_df.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 12 additions & 0 deletions man/time_column_names.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 69ea5e4

Please sign in to comment.