Skip to content

Commit

Permalink
Merge pull request #521 from cmu-delphi/lcb/slide-unnest-dedupe-cols
Browse files Browse the repository at this point in the history
  • Loading branch information
brookslogan authored Sep 16, 2024
2 parents ef2639e + 16cf8d7 commit 9ab9731
Show file tree
Hide file tree
Showing 12 changed files with 285 additions and 55 deletions.
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ Collate:
'correlation.R'
'data.R'
'epi_df.R'
'epi_df_forbidden_methods.R'
'epiprocess.R'
'group_by_epi_df_methods.R'
'methods-epi_archive.R'
Expand Down
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

S3method("[",epi_df)
S3method("names<-",epi_df)
S3method(Summary,epi_df)
S3method(arrange_canonical,default)
S3method(arrange_canonical,epi_df)
S3method(as_epi_df,data.frame)
Expand Down Expand Up @@ -36,6 +37,7 @@ S3method(key_colnames,data.frame)
S3method(key_colnames,default)
S3method(key_colnames,epi_archive)
S3method(key_colnames,epi_df)
S3method(mean,epi_df)
S3method(next_after,Date)
S3method(next_after,integer)
S3method(print,epi_archive)
Expand Down Expand Up @@ -148,6 +150,7 @@ importFrom(dplyr,everything)
importFrom(dplyr,filter)
importFrom(dplyr,group_by)
importFrom(dplyr,group_by_drop_default)
importFrom(dplyr,group_map)
importFrom(dplyr,group_modify)
importFrom(dplyr,group_vars)
importFrom(dplyr,groups)
Expand Down
8 changes: 4 additions & 4 deletions R/archive.R
Original file line number Diff line number Diff line change
Expand Up @@ -693,20 +693,20 @@ group_by.epi_archive <- function(.data, ..., .add = FALSE,
grouping_cols <- as.list(detailed_mutate[["archive"]][["DT"]])[detailed_mutate[["request_names"]]]
grouping_col_is_factor <- purrr::map_lgl(grouping_cols, is.factor)
# ^ Use `as.list` to try to avoid any possibility of a deep copy.
if (!any(grouping_col_is_factor)) {
if (length(grouping_cols) != 0L && !any(grouping_col_is_factor)) {
cli_warn(
"`.drop=FALSE` but there are no factor grouping columns;
"`.drop=FALSE` but none of the grouping columns are factors;
did you mean to convert one of the columns to a factor beforehand?",
class = "epiprocess__group_by_epi_archive__drop_FALSE_no_factors"
)
} else if (any(diff(grouping_col_is_factor) == -1L)) {
cli_warn(
"`.drop=FALSE` but there are one or more non-factor grouping columns listed
after a factor grouping column; this may produce groups with `NA`s for these
columns; see https://github.com/tidyverse/dplyr/issues/5369#issuecomment-683762553;
non-factor columns; see https://github.com/tidyverse/dplyr/issues/5369#issuecomment-683762553;
depending on how you want completion to work, you might instead want to convert all
grouping columns to factors beforehand, specify the non-factor grouping columns first,
or use `.drop=TRUE` and add a call to `tidyr::complete`.",
or use `.drop=TRUE` and add a call to `tidyr::complete()`.",
class = "epiprocess__group_by_epi_archive__drop_FALSE_nonfactor_after_factor"
)
}
Expand Down
48 changes: 48 additions & 0 deletions R/epi_df_forbidden_methods.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Methods in this file are used to
# * Disable problematic inherited behavior (e.g., mean on epi_dfs)
# * Provide better error messaging if possible for things that already abort
# when they should (e.g., sum on epi_dfs)


# Disable mean on epi_dfs, to prevent `epi_slide(~ mean(.x), ....)` bad output:

#' @export
mean.epi_df <- function(x, ...) {
cli_abort(c(
"`mean` shouldn't be used on entire `epi_df`s",
"x" = "{rlang::caller_arg(x)} was an `epi_df`",
"i" = "If you encountered this while trying to take a rolling mean
of a column using `epi_slide`, you probably forgot to
specify the column name (e.g., ~ mean(.x$colname)). You may
also prefer to use the specialized `epi_slide_mean` method."
), class = "epiprocess__summarizer_on_entire_epi_df")
}

# Similarly, provide better error messages for some other potentially-common
# slide operations (sum, prod, min, max, all, any, range):

#' @export
Summary.epi_df <- function(..., na.rm = FALSE) {

Check warning on line 25 in R/epi_df_forbidden_methods.R

View workflow job for this annotation

GitHub Actions / lint

file=R/epi_df_forbidden_methods.R,line=25,col=33,[object_name_linter] Variable and function name style should match snake_case or symbols.
# cli uses dot prefixes for special purpose; use alias to avoid confusion during interpolation
generic <- .Generic

Check warning on line 27 in R/epi_df_forbidden_methods.R

View workflow job for this annotation

GitHub Actions / lint

file=R/epi_df_forbidden_methods.R,line=27,col=3,[object_usage_linter] local variable 'generic' assigned but may not be used

Check warning on line 27 in R/epi_df_forbidden_methods.R

View workflow job for this annotation

GitHub Actions / lint

file=R/epi_df_forbidden_methods.R,line=27,col=14,[object_usage_linter] no visible binding for global variable '.Generic'
opt_pointer <- switch(.Generic,

Check warning on line 28 in R/epi_df_forbidden_methods.R

View workflow job for this annotation

GitHub Actions / lint

file=R/epi_df_forbidden_methods.R,line=28,col=3,[object_usage_linter] local variable 'opt_pointer' assigned but may not be used

Check warning on line 28 in R/epi_df_forbidden_methods.R

View workflow job for this annotation

GitHub Actions / lint

file=R/epi_df_forbidden_methods.R,line=28,col=25,[object_usage_linter] no visible binding for global variable '.Generic'
sum = "You may also prefer to use the specialized `epi_slide_sum` method.",
prod = ,
min = ,
max = ,
all = ,
any = "You may also prefer to use the specialized `epi_slide_opt` method.",
range = "",
cli_abort("Unrecognized .Generic: {generic}")
)
cli_abort(c(
"`{generic}` shouldn't be used on entire `epi_df`s",
# We'd like to quote user input in the error message, but `caller_arg(..1)` is
# just "..1" and (eagerness/S4/unnamedness?) issues thwart some alternatives; just
# use something generic:
"x" = "`{generic}`'s first argument was an `epi_df`",
"i" = "If you encountered this while trying to take a rolling {generic}
of a column using `epi_slide`, you probably forgot to
specify the column name (e.g., ~ {generic}(.x$colname)). {opt_pointer}"
), class = "epiprocess__summarizer_on_entire_epi_df")
}
94 changes: 70 additions & 24 deletions R/grouped_epi_archive.R
Original file line number Diff line number Diff line change
Expand Up @@ -341,30 +341,75 @@ epix_slide.grouped_epi_archive <- function(
", class = "epiprocess__invalid_slide_comp_value")
}

.group_key_label <- if (nrow(.group_key) == 0L) {
# Edge case: we'll get here if a requested `.version` had 0 rows and we
# grouped by a nonzero number of columns using the default `.drop = TRUE`
# (or on all non-factor columns with `.drop = FALSE` for some reason,
# probably a user bug). Mimicking `dplyr`, we'll let `.group_key` provided
# to the computation be 0 rows, but then label it using NAs. (In the
# bizarre situation of grouping by a mix of factor and non-factor with
# `.drop = FALSE`, `.group_key` will already have 1 row. For ungrouped
# epix_slides and 0-variable-grouped epix_slides with either `.drop`
# setting, we will have a 1x0 .group_key, although perhaps for the latter
# this should be 0x0.)
vctrs::vec_cast(NA, .group_key)
} else {
.group_key
}

# Construct result first as list, then tibble-ify, to try to avoid some
# redundant work. `group_modify()` provides the group key, we provide the
# ref time value (appropriately recycled) and comp_value (appropriately
# named / unpacked, for quick feedback)
res <- list(version = vctrs::vec_rep(.version, vctrs::vec_size(comp_value)))
# redundant work. However, we will sacrifice some performance here doing
# checks here in the inner loop, in order to provide immediate feedback on
# some formatting errors.
res <- c(
list(), # get list output; a bit faster than `as.list()`-ing `.group_key_label`
.group_key_label,
list(version = .version)
)
res <- vctrs::vec_recycle_common(!!!res, .size = vctrs::vec_size(comp_value))

if (is.null(.new_col_name)) {
if (inherits(comp_value, "data.frame")) {
# unpack into separate columns (without name prefix):
res <- c(res, comp_value)
# Sometimes comp_value can parrot back columns already in `res`; allow
# this, but balk if a column has the same name as one in `res` but a
# different value:
comp_nms <- names(comp_value)
overlaps_label_names <- comp_nms %in% names(res)
for (comp_i in which(overlaps_label_names)) {
if (!identical(comp_value[[comp_i]], res[[comp_nms[[comp_i]]]])) {
lines <- c(
cli::format_error(c(
"conflict detected between slide value computation labels and output:",
"i" = "we are labeling slide computations with the following columns: {syms(names(res))}",
"x" = "a slide computation output included a column {syms(comp_nms[[comp_i]])} that didn't match the label"

Check warning on line 384 in R/grouped_epi_archive.R

View workflow job for this annotation

GitHub Actions / lint

file=R/grouped_epi_archive.R,line=384,col=121,[line_length_linter] Lines should not be more than 120 characters. This line is 123 characters.
)),
capture.output(print(waldo::compare(res[[comp_nms[[comp_i]]]], comp_value[[comp_i]], x_arg = "label", y_arg = "comp output"))),

Check warning on line 386 in R/grouped_epi_archive.R

View workflow job for this annotation

GitHub Actions / lint

file=R/grouped_epi_archive.R,line=386,col=121,[line_length_linter] Lines should not be more than 120 characters. This line is 141 characters.
cli::format_message(c("You likely want to rename or remove this column in your output, or debug why it has a different value."))

Check warning on line 387 in R/grouped_epi_archive.R

View workflow job for this annotation

GitHub Actions / lint

file=R/grouped_epi_archive.R,line=387,col=121,[line_length_linter] Lines should not be more than 120 characters. This line is 142 characters.
)
rlang::abort(paste(collapse = "\n", lines),
class = "epiprocess__epix_slide_label_vs_output_column_conflict"
)
}
}
# Unpack into separate columns (without name prefix). If there are
# columns duplicating label columns, de-dupe and order them as if they
# didn't exist in comp_value.
res <- c(res, comp_value[!overlaps_label_names])
} else {
# apply default name (to vector or packed data.frame-type column):
# Apply default name (to vector or packed data.frame-type column):
res[["slide_value"]] <- comp_value
# TODO check for bizarre conflicting `slide_value` label col name.
# Either here or on entry to `epix_slide` (even if there we don't know
# whether vecs will be output). Or just turn this into a special case of
# the preceding branch and let the checking code there generate a
# complaint.
}
} else {
# vector or packed data.frame-type column (note: .new_col_name of
# "version" is disallowed):
# vector or packed data.frame-type column (note: overlaps with label
# column names should already be forbidden by earlier validation):
res[[.new_col_name]] <- comp_value
}

# Stop on naming conflicts (names() fine here, non-NULL). Not the
# friendliest error messages though.
vctrs::vec_as_names(names(res), repair = "check_unique")

# Fast conversion:
return(validate_tibble(new_tibble(res)))
}
Expand All @@ -380,18 +425,19 @@ epix_slide.grouped_epi_archive <- function(

# Set:
# * `as_of_df`, the data.frame/tibble/epi_df/etc. that we will
# `group_modify` as the `.data` argument. Might or might not
# `group_map` as the `.data` argument. Might or might not
# include version column.
# * `group_modify_fn`, the corresponding `.f` argument
# * `group_map_fn`, the corresponding `.f` argument for `group_map`
# (not our `.f`)
if (!.all_versions) {
as_of_df <- as_of_raw
group_modify_fn <- comp_one_grp
group_map_fn <- comp_one_grp
} else {
as_of_archive <- as_of_raw
# We essentially want to `group_modify` the archive, but
# haven't implemented this method yet. Next best would be
# `group_modify` on its `$DT`, but that has different
# behavior based on whether or not `dtplyr` is loaded.
# behavior based on whether or not `dtplyr` < 1.3.0 is loaded.
# Instead, go through an ordinary data frame, trying to avoid
# copies.
if (address(as_of_archive$DT) == address(.x$private$ungrouped$DT)) {
Expand All @@ -408,10 +454,10 @@ epix_slide.grouped_epi_archive <- function(
data.table::setDF(as_of_df)

# Convert each subgroup chunk to an archive before running the calculation.
group_modify_fn <- function(.data_group, .group_key,
.slide_comp, ...,
.version,
.new_col_name) {
group_map_fn <- function(.data_group, .group_key,
.slide_comp, ...,
.version,
.new_col_name) {
# .data_group is coming from as_of_df as a tibble, but we
# want to feed `comp_one_grp` an `epi_archive` backed by a
# DT; convert and wrap:
Expand All @@ -428,14 +474,14 @@ epix_slide.grouped_epi_archive <- function(
}

return(
dplyr::group_modify(
dplyr::bind_rows(dplyr::group_map( # note: output will be ungrouped
dplyr::group_by(as_of_df, !!!syms(.x$private$vars), .drop = .x$private$drop),
group_modify_fn,
group_map_fn,
.slide_comp = .slide_comp, ...,
.version = .version,
.new_col_name = .new_col_name,
.keep = TRUE
)
))
)
})
# Combine output into a single tibble (allowing for packed columns)
Expand Down
Loading

0 comments on commit 9ab9731

Please sign in to comment.