Skip to content

Commit

Permalink
fix: aggregate is now sum_groups_epi_df and other review changes
Browse files Browse the repository at this point in the history
* duplicated time values check in epi_df constructor improved
* dplyr warning and unnecessary if in sum_groups_epi_df fixed
* args in epi_slide are now validated in order of func signature
* simplify deprecated check
* error if .new_col_name is "geo_value" or "time_value"
* better TODO comment over last part of epi_slide
* comment about yearmonth - Inf weirdness
* change tests few tests
* remove complete_only and auto complete

Co-authored-by: brookslogan <[email protected]>
  • Loading branch information
dshemetov and brookslogan committed Sep 12, 2024
1 parent dd5c769 commit dfd49f5
Show file tree
Hide file tree
Showing 10 changed files with 347 additions and 380 deletions.
2 changes: 1 addition & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,6 @@ S3method(ungroup,epi_df)
S3method(ungroup,grouped_epi_archive)
S3method(unnest,epi_df)
export("%>%")
export(aggregate_epi_df)
export(archive_cases_dv_subset)
export(arrange)
export(arrange_canonical)
Expand Down Expand Up @@ -87,6 +86,7 @@ export(relocate)
export(rename)
export(revision_summary)
export(slice)
export(sum_groups_epi_df)
export(time_column_names)
export(ungroup)
export(unnest)
Expand Down
10 changes: 6 additions & 4 deletions R/epi_df.R
Original file line number Diff line number Diff line change
Expand Up @@ -278,11 +278,13 @@ as_epi_df.tbl_df <- function(

assert_character(other_keys)

# Check one time_value per group
if (".time_value_counts" %in% other_keys) {
cli_abort("as_epi_df: `other_keys` can't include \".time_value_counts\"")
}
duplicated_time_values <- x %>%
group_by(across(all_of(c("geo_value", "time_value", other_keys)))) %>%
dplyr::summarize(n = dplyr::n(), .groups = "drop") %>%
filter(n > 1)
filter(dplyr::n() > 1) %>%
ungroup()
if (nrow(duplicated_time_values) > 0) {
bad_data <- capture.output(duplicated_time_values)
cli_abort(
Expand Down Expand Up @@ -325,5 +327,5 @@ is_epi_df <- function(x) {
}

group_epi_df <- function(x) {
x %>% group_by(group_by(across(all_of(kill_time_value(key_colnames(.))))))
x %>% group_by(across(all_of(kill_time_value(key_colnames(.)))))
}
2 changes: 1 addition & 1 deletion R/grouped_epi_archive.R
Original file line number Diff line number Diff line change
Expand Up @@ -275,7 +275,7 @@ epix_slide.grouped_epi_archive <- function(
checkmate::assert_string(.new_col_name, null.ok = TRUE)
if (identical(.new_col_name, "time_value")) {
cli_abort(
'`new_col_name` must not be `"time_value"`; `epix_slide()` uses that column name
'`.new_col_name` must not be `"time_value"`; `epix_slide()` uses that column name
to attach the `ref_time_value` associated with each slide computation'
)
}
Expand Down
59 changes: 39 additions & 20 deletions R/methods-epi_df.R
Original file line number Diff line number Diff line change
Expand Up @@ -255,9 +255,10 @@ group_modify.epi_df <- function(.data, .f, ..., .keep = FALSE) {

#' Complete epi_df
#'
#' A [tidyr::complete()] analogue for `epi_df` objects. This function fills in
#' missing combinations of `geo_value` and `time_value` with `NA` values. See
#' the examples for usage details.
#' A ‘tidyr::complete()’ analogue for ‘epi_df’ objects. This function
#' can be used, for example, to add rows for missing combinations
#' of ‘geo_value’ and ‘time_value’, filling other columns with `NA`s.
#' See the examples for usage details.
#'
#' @param data an `epi_df`
#' @param ... see [`tidyr::complete`]
Expand Down Expand Up @@ -391,28 +392,46 @@ arrange_canonical.epi_df <- function(x, ...) {
#' the resulting `epi_df` will have `geo_value` set to `"total"`.
#'
#' @param .x an `epi_df`
#' @param value_col character name of the column to aggregate
#' @param group_cols character vector of column names to group by
#' @param value_col character vector of the columns to aggregate
#' @param group_cols character vector of column names to group by. "time_value" is
#' included by default.
#' @return an `epi_df` object
#'
#' @export
aggregate_epi_df <- function(.x, value_col = "value", group_cols = "time_value") {
sum_groups_epi_df <- function(.x, sum_cols = "value", group_cols = character()) {
assert_class(.x, "epi_df")
assert_character(value_col, len = 1)
assert_character(sum_cols)
assert_character(group_cols)
checkmate::assert_subset(value_col, names(.x))
checkmate::assert_subset(group_cols, names(.x))
checkmate::assert_subset(sum_cols, setdiff(names(.x), key_colnames(.x)))
checkmate::assert_subset(group_cols, key_colnames(.x))
if (!"time_value" %in% group_cols) {
group_cols <- c("time_value", group_cols)
}

.x %>%
out <- .x %>%
group_by(across(all_of(group_cols))) %>%
dplyr::summarize(!!(value_col) := sum(!!sym(value_col))) %>%
ungroup() %>%
{
if (!"geo_value" %in% group_cols) {
mutate(., geo_value = "total") %>% relocate(geo_value, .before = 1)
} else {
.
}
} %>%
as_epi_df(as_of = attr(.x, "metadata")$as_of)
dplyr::summarize(across(all_of(sum_cols), sum), .groups = "drop")

# To preserve epi_df-ness, we need to ensure that the `geo_value` column is
# present.
out <- if (!"geo_value" %in% group_cols) {
out %>%
mutate(geo_value = "total") %>%
relocate(geo_value, .before = 1)
} else {
out
}

# The `geo_type` will be correctly inherited here by the following logic:
# - if `geo_value` is in `group_cols`, then the constructor will see the
# geo_value here and will correctly read the existing values
# - if `geo_value` is not in `group_cols`, then the constructor will see
# the unrecognizeable "total" value and will correctly infer the "custom"
# geo_type.
out %>%
as_epi_df(
as_of = attr(.x, "metadata")$as_of,
other_keys = intersect(attr(.x, "metadata")$other_keys, group_cols)
) %>%
arrange_canonical()
}
Loading

0 comments on commit dfd49f5

Please sign in to comment.