Merge branch 'dev' into ndefries/use-but-not-reexport-epidatasets

cmu-delphi · Dec 13, 2024 · 5b1b071 · 5b1b071
2 parents 65745f4 + e5ec121
commit 5b1b071
Show file tree

Hide file tree

Showing 26 changed files with 1,127 additions and 242 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,10 +1,10 @@
 Package: epiprocess
 Type: Package
 Title: Tools for basic signal processing in epidemiology
-Version: 0.9.7
+Version: 0.10.1
 Authors@R: c(
     person("Jacob", "Bien", role = "ctb"),
-    person("Logan", "Brooks", , "[email protected]", role = c("aut", "cre")),
+    person("Logan", "Brooks", , "lcbrooks+github@andrew.cmu.edu", role = c("aut", "cre")),
     person("Rafael", "Catoia", role = "ctb"),
     person("Nat", "DeFries", role = "ctb"),
     person("Daniel", "McDonald", role = "aut"),
@@ -13,8 +13,9 @@ Authors@R: c(
     person("Chloe", "You", role = "ctb"),
     person("Quang", "Nguyen", role = "ctb"),
     person("Evan", "Ray", role = "aut"),
-    person("Dmitry", "Shemetov", role = "ctb"),
+    person("Dmitry", "Shemetov", role = "aut"),
     person("Ryan", "Tibshirani", role = "aut"),
+    person("David", "Weber", , "[email protected]", role = "ctb"),
     person("Lionel", "Henry", role = "ctb",
            comment = "Author of included rlang fragments"),
     person("Hadley", "Wickham", role = "ctb",

diff --git a/NAMESPACE b/NAMESPACE
@@ -11,6 +11,7 @@ S3method(arrange_row_canonical,default)
 S3method(arrange_row_canonical,epi_df)
 S3method(as_epi_df,data.frame)
 S3method(as_epi_df,epi_df)
+S3method(as_epi_df,grouped_df)
 S3method(as_epi_df,tbl_df)
 S3method(as_epi_df,tbl_ts)
 S3method(as_tibble,epi_df)
@@ -103,12 +104,16 @@ importFrom(checkmate,assert)
 importFrom(checkmate,assert_character)
 importFrom(checkmate,assert_class)
 importFrom(checkmate,assert_data_frame)
+importFrom(checkmate,assert_false)
 importFrom(checkmate,assert_function)
 importFrom(checkmate,assert_int)
 importFrom(checkmate,assert_list)
 importFrom(checkmate,assert_logical)
 importFrom(checkmate,assert_numeric)
 importFrom(checkmate,assert_scalar)
+importFrom(checkmate,assert_string)
+importFrom(checkmate,assert_subset)
+importFrom(checkmate,assert_tibble)
 importFrom(checkmate,checkInt)
 importFrom(checkmate,check_atomic)
 importFrom(checkmate,check_data_frame)
@@ -158,6 +163,7 @@ importFrom(dplyr,groups)
 importFrom(dplyr,if_all)
 importFrom(dplyr,if_any)
 importFrom(dplyr,if_else)
+importFrom(dplyr,is_grouped_df)
 importFrom(dplyr,lag)
 importFrom(dplyr,mutate)
 importFrom(dplyr,near)
@@ -171,6 +177,7 @@ importFrom(dplyr,summarize)
 importFrom(dplyr,tibble)
 importFrom(dplyr,ungroup)
 importFrom(ggplot2,autoplot)
+importFrom(glue,glue)
 importFrom(lifecycle,deprecated)
 importFrom(lubridate,as.period)
 importFrom(lubridate,days)
@@ -184,7 +191,6 @@ importFrom(rlang,"%||%")
 importFrom(rlang,.data)
 importFrom(rlang,.env)
 importFrom(rlang,arg_match)
-importFrom(rlang,as_label)
 importFrom(rlang,caller_arg)
 importFrom(rlang,caller_env)
 importFrom(rlang,check_dots_empty)
@@ -194,6 +200,7 @@ importFrom(rlang,env)
 importFrom(rlang,expr_label)
 importFrom(rlang,f_env)
 importFrom(rlang,f_rhs)
+importFrom(rlang,is_bare_integerish)
 importFrom(rlang,is_environment)
 importFrom(rlang,is_formula)
 importFrom(rlang,is_function)
@@ -202,7 +209,7 @@ importFrom(rlang,is_quosure)
 importFrom(rlang,list2)
 importFrom(rlang,missing_arg)
 importFrom(rlang,new_function)
-importFrom(rlang,quo_get_expr)
+importFrom(rlang,quo_get_env)
 importFrom(rlang,quo_is_missing)
 importFrom(rlang,sym)
 importFrom(rlang,syms)
@@ -227,3 +234,5 @@ importFrom(tidyselect,starts_with)
 importFrom(tsibble,as_tsibble)
 importFrom(utils,capture.output)
 importFrom(utils,tail)
+importFrom(vctrs,vec_data)
+importFrom(vctrs,vec_equal)
diff --git a/NEWS.md b/NEWS.md
@@ -6,21 +6,38 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat
 
 ## Breaking changes
 
-- Moved example datasets from being hosted in the package to being fetched
+
+## Improvements
+
+
+## Bug fixes
+
+
+## Cleanup
+
+- Moved example datasets from being reexported in the package to being fetched
   from `epidatasets`. The `epidatasets` package is now auto-loaded as a
-  dependency of `epiprocess`. The datasets can still be fetched, after loading
-  the package, with `data()` or the name of the dataset alone, or can be
-  accessed with `epidatasets::`. Datasets with names starting
+  dependency of `epiprocess`. The datasets can still be accessed, after loading
+  the package, with `data()` or the name of the dataset alone, or with
+  `epidatasets::` (#577).
+
+# epiprocess 0.10
+
+## Breaking changes
+
+- Moved example datasets from being hosted in the package to being reexported
+  from the `epidatasets` package. The datasets can no longer be loaded with
+  `data()` but can be accessed with `epiprocess::` or, after loading the
+  package, just the name of the dataset (#520). Those with names starting with
   `jhu` have been renamed to a more uniform scheme and now have names starting
   with `covid`. The data set previously named `jhu_confirmed_cumulative_num` has
   been removed from
-  the package, but a renamed version is still available in `epidatasets` (#520, #577).
-
-## Bug fixes
-
-- Removed `.window_size = 1` default from `epi_slide_{mean,sum,opt}`; this
-  argument is now mandatory, and should nearly always be greater than 1 except
-  for testing purposes.
+  the package, but a renamed version is still available in `epidatasets`.
+- `epi_slide_{sum,mean,opt}` have improved default output column names, and
+  additional arguments for specifying names: `.prefix`, `.suffix`,
+  `.new_col_names`. To obtain the old naming behavior, use `.prefix =
+  "slide_value_"`.
+- `as_epi_df` now removes any grouping that `x` had applied.
 
 ## Improvements
 
@@ -30,6 +47,19 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat
 - Improved validation of `.window_size` arguments.
 - Rewrote a lot of the package documentation to be more consistent and
   informative. Simplified and streamlined the vignettes.
+- `epi_slide_{sum,mean,opt}` on ungrouped `epi_df`s will now temporarily group
+  by `geo_value` and any `other_keys` for the slide operation rather than raise
+  an error about duplicated time values. `epi_slide`'s analogous automatic
+  grouping has been made temporary in order to match.
+- Improved speed of key-uniqueness checks.
+
+## Bug fixes
+
+- Removed `.window_size = 1` default from `epi_slide_{mean,sum,opt}`; this
+  argument is now mandatory, and should nearly always be greater than 1 except
+  for testing purposes.
+- Fixed `epi_slide_{sum,mean,opt}` raising an error on certain tidyselect
+  expressions.
 
 ## Cleanup
 

diff --git a/R/epi_df.R b/R/epi_df.R
@@ -174,7 +174,7 @@ NULL
 #' @param other_keys If your tibble has additional keys, be sure to specify them
 #'   as a character vector here (typical examples are "age" or sub-geographies).
 #' @param ... Additional arguments passed to methods.
-#' @return An `epi_df` object.
+#' @return * Of `new_epi_df()`: an `epi_df`
 #'
 #' @export
 new_epi_df <- function(x = tibble::tibble(geo_value = character(), time_value = as.Date(integer())),
@@ -205,6 +205,8 @@ new_epi_df <- function(x = tibble::tibble(geo_value = character(), time_value =
 #'   to be converted
 #' @param ... used for specifying column names, as in [`dplyr::rename`]. For
 #'   example, `geo_value = STATEFP, time_value = end_date`.
+#' @return * Of `as_epi_df()`: an (ungrouped) `epi_df`
+#'
 #' @export
 as_epi_df <- function(x, ...) {
   UseMethod("as_epi_df")
@@ -215,6 +217,7 @@ as_epi_df <- function(x, ...) {
 #' @method as_epi_df epi_df
 #' @export
 as_epi_df.epi_df <- function(x, ...) {
+  x <- ungroup(x)
   return(x)
 }
 
@@ -232,7 +235,6 @@ as_epi_df.tbl_df <- function(
     as_of,
     other_keys = character(),
     ...) {
-  # possible standard substitutions for time_value
   x <- rename(x, ...)
   x <- guess_column_name(x, "time_value", time_column_names())
   x <- guess_column_name(x, "geo_value", geo_column_names())
@@ -277,26 +279,32 @@ as_epi_df.tbl_df <- function(
   }
 
   assert_character(other_keys)
+  assert_subset(other_keys, names(x))
+  # Fix up if given more than just other keys, at least until epipredict#428
+  # merged:
+  other_keys <- other_keys[!other_keys %in% c("geo_value", "time_value")]
 
   if (".time_value_counts" %in% other_keys) {
     cli_abort("as_epi_df: `other_keys` can't include \".time_value_counts\"")
   }
 
-  duplicated_time_values <- x %>%
-    group_by(across(all_of(c("geo_value", "time_value", other_keys)))) %>%
-    filter(dplyr::n() > 1) %>%
-    ungroup()
-  if (nrow(duplicated_time_values) > 0) {
-    bad_data <- capture.output(duplicated_time_values)
-    cli_abort(
-      "as_epi_df: some groups in the data have duplicated time values. epi_df requires a unique time_value per group.",
-      body = c("Sample groups:", bad_data)
-    )
-  }
+  assert(check_ukey_unique(x, c("geo_value", other_keys, "time_value"), c(
+    ">" = "If this is line list data, convert it to counts/rates first.",
+    ">" = "If this contains a demographic breakdown, check that you have
+           specified appropriate `other_keys`" # . from checkmate
+  )))
 
   new_epi_df(x, geo_type, time_type, as_of, other_keys)
 }
 
+#' @rdname epi_df
+#' @order 1
+#' @method as_epi_df grouped_df
+#' @export
+as_epi_df.grouped_df <- function(x, ...) {
+  as_epi_df(ungroup(x), ...)
+}
+
 #' @rdname epi_df
 #' @order 1
 #' @method as_epi_df data.frame
@@ -320,9 +328,11 @@ as_epi_df.tbl_ts <- function(x, as_of, other_keys = character(), ...) {
 #' Test for `epi_df` format
 #'
 #' @param x An object.
-#' @return `TRUE` if the object inherits from `epi_df`.
+#' @return * Of `is_epi_df()`: `TRUE` if the object inherits from `epi_df`,
+#'           otherwise `FALSE`.
 #'
 #' @rdname epi_df
+#' @order 1
 #' @export
 is_epi_df <- function(x) {
   inherits(x, "epi_df")

diff --git a/R/epiprocess-package.R b/R/epiprocess-package.R
@@ -5,17 +5,26 @@
 #' @import epidatasets
 #' @importFrom checkmate anyInfinite anyMissing assert assert_character
 #' @importFrom checkmate assert_class assert_data_frame assert_int assert_list
+#' @importFrom checkmate assert_false
 #' @importFrom checkmate assert_logical assert_numeric assert_scalar checkInt
+#' @importFrom checkmate assert_string
+#' @importFrom checkmate assert_subset
+#' @importFrom checkmate assert_tibble
 #' @importFrom checkmate check_atomic check_data_frame expect_class test_int
 #' @importFrom checkmate check_names
 #' @importFrom checkmate test_subset test_set_equal vname
 #' @importFrom cli cli_abort cli_warn
 #' @importFrom data.table as.data.table
 #' @importFrom data.table key
 #' @importFrom data.table setkeyv
+#' @importFrom dplyr arrange
+#' @importFrom dplyr is_grouped_df
 #' @importFrom dplyr select
 #' @importFrom lifecycle deprecated
 #' @importFrom rlang %||%
+#' @importFrom rlang is_bare_integerish
+#' @importFrom vctrs vec_data
+#' @importFrom vctrs vec_equal
 ## usethis namespace: end
 NULL
 
@@ -24,5 +33,5 @@ utils::globalVariables(c(
   "fitted", ".response", "geo_value", "time_value",
   "value", ".real", "lag", "max_value", "min_value",
   "median_value", "spread", "rel_spread", "time_to",
-  "time_near_latest", "n_revisions"
+  "time_near_latest", "n_revisions", "min_lag", "max_lag"
 ))
diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R
@@ -688,10 +688,10 @@ epix_detailed_restricted_mutate <- function(.data, ...) {
 #'   requested `.versions`) for rows having a `time_value` of at least `.version
 #'   - before`. Otherwise, the slide computation will be passed only the most
 #'   recent `version` for every unique `time_value`. Default is `FALSE`.
-#' @return A tibble whose columns are: the grouping variables, `time_value`,
-#'   containing the reference time values for the slide computation, and a
-#'   column named according to the `.new_col_name` argument, containing the slide
-#'   values.
+#' @return A tibble whose columns are: the grouping variables (if any),
+#'   `time_value`, containing the reference time values for the slide
+#'   computation, and a column named according to the `.new_col_name` argument,
+#'   containing the slide values. It will be grouped by the grouping variables.
 #'
 #' @details A few key distinctions between the current function and `epi_slide()`:
 #'   1. In `.f` functions for `epix_slide`, one should not assume that the input