cmu-delphi · dsweber2 · Jan 16, 2024 · Jan 9, 2024 · Jan 9, 2024 · Jan 10, 2024
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -76,6 +76,7 @@ Collate:
     'data.R'
     'epi_df.R'
     'epiprocess.R'
+    'group_by_epi_df_methods.R'
     'methods-epi_archive.R'
     'grouped_epi_archive.R'
     'growth_rate.R'

diff --git a/NAMESPACE b/NAMESPACE
@@ -23,6 +23,7 @@ S3method(groups,grouped_epi_archive)
 S3method(next_after,Date)
 S3method(next_after,integer)
 S3method(print,epi_df)
+S3method(select,epi_df)
 S3method(summary,epi_df)
 S3method(ungroup,epi_df)
 S3method(ungroup,grouped_epi_archive)

diff --git a/R/epi_df.R b/R/epi_df.R
@@ -122,6 +122,9 @@ new_epi_df = function(x = tibble::tibble(), geo_type, time_type, as_of,
   if (!is.list(additional_metadata)) {
     Abort("`additional_metadata` must be a list type.")
   }
+  if (is.null(additional_metadata[["other_keys"]])) {
+    additional_metadata[["other_keys"]] <- character(0L)
+  }
 
   # If geo type is missing, then try to guess it
   if (missing(geo_type)) {
@@ -334,4 +337,4 @@ as_epi_df.tbl_ts = function(x, geo_type, time_type, as_of,
 #' @export
 is_epi_df = function(x) {
   inherits(x, "epi_df")
-}
+}
diff --git a/R/group_by_epi_df_methods.R b/R/group_by_epi_df_methods.R
@@ -0,0 +1,17 @@
+# These methods (and maybe some others in methods-epi_df.R) are here to augment
+# `?dplyr_extending` implementations to get the correct behavior on grouped
+# `epi_df`s. It would be nice if there were a way to replace these with a
+# generic core that automatically fixed all misbehaving methods; see
+# brainstorming within Issue #223.
+
+#' @importFrom dplyr select
+#' @export
+select.epi_df <- function(.data, ...) {
+  selected <- NextMethod(.data)
+  might_decay <- reclass(selected, attr(selected, "metadata"))
+  return(dplyr_reconstruct(might_decay, might_decay))
+}
+
+# others to consider:
+# - arrange
+# -
diff --git a/R/methods-epi_df.R b/R/methods-epi_df.R
@@ -179,10 +179,14 @@ dplyr_row_slice.epi_df = function(data, i, ...) {
 
 #' @export
 `names<-.epi_df` = function(x, value) {
-  old_names = names(x)
-  old_other_keys = attributes(x)$metadata$other_keys
-  result = NextMethod()
-  attributes(x)$metadata$other_keys <- value[match(old_other_keys, old_names)]
+  old_names <- names(x)
+  old_metadata <- attr(x, "metadata")
+  old_other_keys <- old_metadata[["other_keys"]]
+  new_other_keys <- value[match(old_other_keys, old_names)]
+  new_metadata <- old_metadata
+  new_metadata[["other_keys"]] <- new_other_keys
+  result <- reclass(NextMethod(), new_metadata)
+  # decay to non-`epi_df` if needed:
   dplyr::dplyr_reconstruct(result, result)
 }
 

diff --git a/tests/testthat/test-epi_df.R b/tests/testthat/test-epi_df.R
@@ -1,23 +1,27 @@
 test_that("new_epi_df works as intended", {
   # Empty tibble
-  wmsg = capture_warnings(a <- new_epi_df())
-  expect_match(wmsg[1],
-                 "Unknown or uninitialised column: `geo_value`.")
-  expect_match(wmsg[2],
-                 "Unknown or uninitialised column: `time_value`.")
+  wmsg <- capture_warnings(a <- new_epi_df())
+  expect_match(
+    wmsg[1],
+    "Unknown or uninitialised column: `geo_value`."
+  )
+  expect_match(
+    wmsg[2],
+    "Unknown or uninitialised column: `time_value`."
+  )
   expect_true(is_epi_df(a))
   expect_identical(attributes(a)$metadata$geo_type, "custom")
   expect_identical(attributes(a)$metadata$time_type, "custom")
   expect_true(lubridate::is.POSIXt(attributes(a)$metadata$as_of))
-  
+
   # Simple non-empty tibble with geo_value and time_value cols
   tib <- tibble::tibble(
     x = 1:10, y = 1:10,
     time_value = rep(seq(as.Date("2020-01-01"), by = 1, length.out = 5), times = 2),
     geo_value = rep(c("ca", "hi"), each = 5)
   )
-  
-  epi_tib = new_epi_df(tib)
+
+  epi_tib <- new_epi_df(tib)
   expect_true(is_epi_df(epi_tib))
   expect_length(epi_tib, 4L)
   expect_identical(attributes(epi_tib)$metadata$geo_type, "state")
@@ -32,10 +36,72 @@ test_that("as_epi_df errors when additional_metadata is not a list", {
     dplyr::slice_tail(n = 6) %>%
     tsibble::as_tsibble() %>%
     dplyr::mutate(
-      state = rep("MA",6),
-      pol = rep(c("blue", "swing", "swing"), each = 2))
-
+      state = rep("MA", 6),
+      pol = rep(c("blue", "swing", "swing"), each = 2)
+    )
+
   expect_error(
-    as_epi_df(ex_input, additional_metadata = c(other_keys = "state", "pol")), 
-    "`additional_metadata` must be a list type.")
-})
+    as_epi_df(ex_input, additional_metadata = c(other_keys = "state", "pol")),
+    "`additional_metadata` must be a list type."
+  )
+})
+
+# select fixes
+
+tib <- tibble::tibble(
+  x = 1:10, y = 1:10,
+  time_value = rep(seq(as.Date("2020-01-01"),
+    by = 1, length.out = 5
+  ), times = 2),
+  geo_value = rep(c("ca", "hi"), each = 5)
+)
+epi_tib <- epiprocess::new_epi_df(tib)
+test_that("grouped epi_df maintains type for select", {
+  grouped_epi <- epi_tib %>% group_by(geo_value)
+  selected_df <- grouped_epi %>% select(-y)
+  expect_true(inherits(selected_df, "epi_df"))
+  # make sure that the attributes are right
+  epi_attr <- attributes(selected_df)
+  expect_identical(epi_attr$names, c("geo_value", "time_value", "x"))
+  expect_identical(epi_attr$row.names, seq(1, 10))
+  expect_identical(epi_attr$groups, attributes(grouped_epi)$groups)
+  expect_identical(epi_attr$metadata, attributes(epi_tib)$metadata)
+  expect_identical(selected_df, epi_tib %>% select(-y) %>% group_by(geo_value))
+})
+
+test_that("grouped epi_df drops type when dropping keys", {
+  grouped_epi <- epi_tib %>% group_by(geo_value)
+  selected_df <- grouped_epi %>% select(geo_value)
+  expect_true(!inherits(selected_df, "epi_df"))
+})
+
+test_that("grouped epi_df handles extra keys correctly", {
+  tib <- tibble::tibble(
+    x = 1:10, y = 1:10,
+    time_value = rep(seq(as.Date("2020-01-01"),
+      by = 1, length.out = 5
+    ), times = 2),
+    geo_value = rep(c("ca", "hi"), each = 5),
+    extra_key = rep(seq(as.Date("2020-01-01"),
+      by = 1, length.out = 5
+    ), times = 2)
+  )
+  epi_tib <- epiprocess::new_epi_df(tib,
+    additional_metadata = list(other_keys = "extra_key")
+  )
+  attributes(epi_tib)
+  grouped_epi <- epi_tib %>% group_by(geo_value)
+  selected_df <- grouped_epi %>% select(-extra_key)
+  expect_true(inherits(selected_df, "epi_df"))
+  # make sure that the attributes are right
+  old_attr <- attributes(epi_tib)
+  epi_attr <- attributes(selected_df)
+  expect_identical(epi_attr$names, c("geo_value", "time_value", "x", "y"))
+  expect_identical(epi_attr$row.names, seq(1, 10))
+  expect_identical(epi_attr$groups, attributes(grouped_epi)$groups)
+  expect_identical(epi_attr$metadata, list(
+    geo_type = "state", time_type = "day",
+    as_of = old_attr$metadata$as_of,
+    other_keys = character(0)
+  ))
+})
diff --git a/tests/testthat/test-methods-epi_df.R b/tests/testthat/test-methods-epi_df.R
@@ -124,3 +124,40 @@ test_that("Metadata and grouping are dropped by `as_tibble`", {
     !any(c("metadata", "groups") %in% names(attributes(grouped_converted)))
   )
 })
+
+test_that("Renaming columns gives appropriate colnames and metadata", {
+  edf <- tibble::tibble(geo_value = 1, time_value = 1, age = 1, value = 1) %>%
+    as_epi_df(additional_metadata = list(other_keys = "age"))
+  # renaming using base R
+  renamed_edf1 <- edf %>%
+    `[`(c("geo_value", "time_value", "age", "value")) %>%
+    `names<-`(c("geo_value", "time_value", "age_group", "value"))
+  expect_identical(names(renamed_edf1), c("geo_value", "time_value", "age_group", "value"))
+  expect_identical(attr(renamed_edf1, "metadata")$other_keys, c("age_group"))
+  # renaming using select
+  renamed_edf2 <- edf %>%
+    as_epi_df(additional_metadata = list(other_keys = "age")) %>%
+    select(geo_value, time_value, age_group = age, value)
+  expect_identical(renamed_edf1, renamed_edf2)
+})
+
+test_that("Renaming columns while grouped gives appropriate colnames and metadata", {
+  gedf <- tibble::tibble(geo_value = 1, time_value = 1, age = 1, value = 1) %>%
+    as_epi_df(additional_metadata = list(other_keys = "age")) %>%
+    group_by(geo_value)
+  # renaming using base R
+  renamed_gedf1 <- gedf %>%
+    `[`(c("geo_value", "time_value", "age", "value")) %>%
+    `names<-`(c("geo_value", "time_value", "age_group", "value"))
+  # tets type preservation
+  expect_true(inherits(renamed_gedf1, "epi_df"))
+  expect_true(inherits(renamed_gedf1, "grouped_df"))
+  # the names are right
+  expect_identical(names(renamed_gedf1), c("geo_value", "time_value", "age_group", "value"))
+  expect_identical(attr(renamed_gedf1, "metadata")$other_keys, c("age_group"))
+  # renaming using select
+  renamed_gedf2 <- gedf %>%
+    as_epi_df(additional_metadata = list(other_keys = "age")) %>%
+    select(geo_value, time_value, age_group = age, value)
+  expect_identical(renamed_gedf1, renamed_gedf2)
+})