Skip to content

Commit

Permalink
Merge pull request #519 from cmu-delphi/ds/epi-slide-group
Browse files Browse the repository at this point in the history
undefined
  • Loading branch information
dshemetov authored Sep 18, 2024
2 parents c167ddf + 409dcac commit 214100d
Show file tree
Hide file tree
Showing 14 changed files with 1,039 additions and 937 deletions.
6 changes: 6 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@ S3method("names<-",epi_df)
S3method(Summary,epi_df)
S3method(arrange_canonical,default)
S3method(arrange_canonical,epi_df)
S3method(arrange_col_canonical,default)
S3method(arrange_col_canonical,epi_df)
S3method(arrange_row_canonical,default)
S3method(arrange_row_canonical,epi_df)
S3method(as_epi_df,data.frame)
S3method(as_epi_df,epi_df)
S3method(as_epi_df,tbl_df)
Expand Down Expand Up @@ -76,6 +80,7 @@ export(filter)
export(full_seq)
export(geo_column_names)
export(group_by)
export(group_epi_df)
export(group_modify)
export(growth_rate)
export(guess_period)
Expand All @@ -91,6 +96,7 @@ export(relocate)
export(rename)
export(revision_summary)
export(slice)
export(sum_groups_epi_df)
export(time_column_names)
export(ungroup)
export(unnest)
Expand Down
20 changes: 18 additions & 2 deletions R/epi_df.R
Original file line number Diff line number Diff line change
Expand Up @@ -245,10 +245,10 @@ as_epi_df.tbl_df <- function(
)
}
if (lifecycle::is_present(geo_type)) {
cli_warn("epi_archive constructor argument `geo_type` is now ignored. Consider removing.")
cli_warn("epi_df constructor argument `geo_type` is now ignored. Consider removing.")
}
if (lifecycle::is_present(time_type)) {
cli_warn("epi_archive constructor argument `time_type` is now ignored. Consider removing.")
cli_warn("epi_df constructor argument `time_type` is now ignored. Consider removing.")
}

# If geo type is missing, then try to guess it
Expand Down Expand Up @@ -277,6 +277,22 @@ as_epi_df.tbl_df <- function(
}

assert_character(other_keys)

if (".time_value_counts" %in% other_keys) {
cli_abort("as_epi_df: `other_keys` can't include \".time_value_counts\"")
}
duplicated_time_values <- x %>%
group_by(across(all_of(c("geo_value", "time_value", other_keys)))) %>%
filter(dplyr::n() > 1) %>%
ungroup()
if (nrow(duplicated_time_values) > 0) {
bad_data <- capture.output(duplicated_time_values)
cli_abort(
"as_epi_df: some groups in the data have duplicated time values. epi_df requires a unique time_value per group.",
body = c("Sample groups:", bad_data)
)
}

new_epi_df(x, geo_type, time_type, as_of, other_keys)
}

Expand Down
5 changes: 3 additions & 2 deletions R/grouped_epi_archive.R
Original file line number Diff line number Diff line change
Expand Up @@ -278,7 +278,7 @@ epix_slide.grouped_epi_archive <- function(
.versions <- sort(.versions)
}

validate_slide_window_arg(.before, .x$private$ungrouped$time_type)
validate_slide_window_arg(.before, .x$private$ungrouped$time_type, lower = 0) # nolint: object_usage_linter

checkmate::assert_string(.new_col_name, null.ok = TRUE)
if (!is.null(.new_col_name)) {
Expand All @@ -292,7 +292,8 @@ epix_slide.grouped_epi_archive <- function(
))
}
if (identical(.new_col_name, "version")) {
cli_abort('`.new_col_name` must not be `"version"`; `epix_slide()` uses that column name to attach the element of `.versions` associated with each slide computation') # nolint: line_length_linter
cli_abort('`.new_col_name` must not be `"version"`; `epix_slide()` uses that column name to attach the element
of `.versions` associated with each slide computation')
}
}

Expand Down
37 changes: 20 additions & 17 deletions R/methods-epi_archive.R
Original file line number Diff line number Diff line change
Expand Up @@ -623,26 +623,29 @@ epix_detailed_restricted_mutate <- function(.data, ...) {
#' @param .f Function, formula, or missing; together with `...` specifies the
#' computation to slide. To "slide" means to apply a computation over a
#' sliding (a.k.a. "rolling") time window for each data group. The window is
#' determined by the `before` parameter described below. One time step is
#' typically one day or one week; see [`epi_slide`] details for more
#' explanation. If a function, `.f` must take an `epi_df` with the same
#' column names as the archive's `DT`, minus the `version` column; followed
#' by a one-row tibble containing the values of the grouping variables for
#' the associated group; followed by a reference time value, usually as a
#' `Date` object; followed by any number of named arguments. If a formula,
#' `.f` can operate directly on columns accessed via `.x$var` or `.$var`, as
#' in `~ mean (.x$var)` to compute a mean of a column `var` for each
#' group-`ref_time_value` combination. The group key can be accessed via
#' `.y` or `.group_key`, and the reference time value can be accessed via
#' `.z` or `.ref_time_value`. If `.f` is missing, then `...` will specify the
#' determined by the `.before` parameter (see details for more). If a
#' function, `.f` must have the form `function(x, g, t, ...)`, where
#'
#' - "x" is an epi_df with the same column names as the archive's `DT`, minus
#' the `version` column
#' - "g" is a one-row tibble containing the values of the grouping variables
#' for the associated group
#' - "t" is the ref_time_value for the current window
#' - "..." are additional arguments
#'
#' If a formula, `.f` can operate directly on columns accessed via `.x$var` or
#' `.$var`, as in `~ mean (.x$var)` to compute a mean of a column `var` for
#' each group-`ref_time_value` combination. The group key can be accessed via
#' `.y` or `.group_key`, and the reference time value can be accessed via `.z`
#' or `.ref_time_value`. If `.f` is missing, then `...` will specify the
#' computation.
#' @param ... Additional arguments to pass to the function or formula specified
#' via `f`. Alternatively, if `.f` is missing, then the `...` is interpreted as
#' a ["data-masking"][rlang::args_data_masking] expression or expressions for
#' tidy evaluation; in addition to referring columns directly by name, the
#' via `f`. Alternatively, if `.f` is missing, then the `...` is interpreted
#' as a ["data-masking"][rlang::args_data_masking] expression or expressions
#' for tidy evaluation; in addition to referring columns directly by name, the
#' expressions have access to `.data` and `.env` pronouns as in `dplyr` verbs,
#' and can also refer to `.x`, `.group_key`, and `.ref_time_value`. See
#' details.
#' and can also refer to `.x` (not the same as the input epi_archive),
#' `.group_key`, and `.ref_time_value`. See details for more.
#' @param .before How many time values before the `.ref_time_value`
#' should each snapshot handed to the function `.f` contain? If provided, it
#' should be a single value that is compatible with the time_type of the
Expand Down
106 changes: 100 additions & 6 deletions R/methods-epi_df.R
Original file line number Diff line number Diff line change
Expand Up @@ -255,9 +255,10 @@ group_modify.epi_df <- function(.data, .f, ..., .keep = FALSE) {

#' Complete epi_df
#'
#' A [tidyr::complete()] analogue for `epi_df` objects. This function fills in
#' missing combinations of `geo_value` and `time_value` with `NA` values. See
#' the examples for usage details.
#' A ‘tidyr::complete()’ analogue for ‘epi_df’ objects. This function
#' can be used, for example, to add rows for missing combinations
#' of ‘geo_value’ and ‘time_value’, filling other columns with `NA`s.
#' See the examples for usage details.
#'
#' @param data an `epi_df`
#' @param ... see [`tidyr::complete`]
Expand Down Expand Up @@ -378,8 +379,101 @@ arrange_canonical.default <- function(x, ...) {
#' @export
arrange_canonical.epi_df <- function(x, ...) {
rlang::check_dots_empty()
keys <- key_colnames(x)
x %>%
dplyr::relocate(dplyr::all_of(keys), .before = 1) %>%
dplyr::arrange(dplyr::across(dplyr::all_of(keys)))
arrange_row_canonical() %>%
arrange_col_canonical()
}

arrange_row_canonical <- function(x, ...) {
UseMethod("arrange_row_canonical")
}

#' @export
arrange_row_canonical.default <- function(x, ...) {
rlang::check_dots_empty()
cli::cli_abort(c(
"`arrange_row_canonical()` is only meaningful for an {.cls epi_df}."
))
return(x)
}

#' @export
arrange_row_canonical.epi_df <- function(x, ...) {
rlang::check_dots_empty()
x %>% dplyr::arrange(dplyr::across(dplyr::all_of(key_colnames(.))))

Check warning on line 403 in R/methods-epi_df.R

View workflow job for this annotation

GitHub Actions / lint

file=R/methods-epi_df.R,line=403,col=65,[object_usage_linter] no visible binding for global variable '.'
}

arrange_col_canonical <- function(x, ...) {
UseMethod("arrange_col_canonical")
}

#' @export
arrange_col_canonical.default <- function(x, ...) {
rlang::check_dots_empty()
cli::cli_abort(c(
"`arrange_col_canonical()` is only meaningful for an {.cls epi_df}."
))
return(x)
}

#' @export
arrange_col_canonical.epi_df <- function(x, ...) {
rlang::check_dots_empty()
x %>% dplyr::relocate(dplyr::all_of(key_colnames(.)), .before = 1)

Check warning on line 422 in R/methods-epi_df.R

View workflow job for this annotation

GitHub Actions / lint

file=R/methods-epi_df.R,line=422,col=52,[object_usage_linter] no visible binding for global variable '.'
}

#' @export
group_epi_df <- function(x) {
x %>% group_by(across(all_of(kill_time_value(key_colnames(.)))))
}

#' Aggregate an `epi_df` object
#'
#' Aggregates an `epi_df` object by the specified group columns, summing the
#' `value` column, and returning an `epi_df`. If aggregating over `geo_value`,
#' the resulting `epi_df` will have `geo_value` set to `"total"`.
#'
#' @param .x an `epi_df`
#' @param value_col character vector of the columns to aggregate
#' @param group_cols character vector of column names to group by. "time_value" is
#' included by default.
#' @return an `epi_df` object
#'
#' @export
sum_groups_epi_df <- function(.x, sum_cols = "value", group_cols = character()) {
assert_class(.x, "epi_df")
assert_character(sum_cols)
assert_character(group_cols)
checkmate::assert_subset(sum_cols, setdiff(names(.x), key_colnames(.x)))
checkmate::assert_subset(group_cols, key_colnames(.x))
if (!"time_value" %in% group_cols) {
group_cols <- c("time_value", group_cols)
}

out <- .x %>%
group_by(across(all_of(group_cols))) %>%
dplyr::summarize(across(all_of(sum_cols), sum), .groups = "drop")

# To preserve epi_df-ness, we need to ensure that the `geo_value` column is
# present.
out <- if (!"geo_value" %in% group_cols) {
out %>%
mutate(geo_value = "total") %>%
relocate(geo_value, .before = 1)
} else {
out
}

# The `geo_type` will be correctly inherited here by the following logic:
# - if `geo_value` is in `group_cols`, then the constructor will see the
# geo_value here and will correctly read the existing values
# - if `geo_value` is not in `group_cols`, then the constructor will see
# the unrecognizeable "total" value and will correctly infer the "custom"
# geo_type.
out %>%
as_epi_df(
as_of = attr(.x, "metadata")$as_of,
other_keys = intersect(attr(.x, "metadata")$other_keys, group_cols)
) %>%
arrange_canonical()
}
Loading

0 comments on commit 214100d

Please sign in to comment.