refactor+doc: key_colnames and vignettes
* key_colnames order change
* replace kill_time_value with exclude arg in key_colnames
* move duplicate time_values check in epi_slide
dshemetov committed Sep 26, 2024
1 parent dd19428 commit e3dfa32
Showing 32 changed files with 616 additions and 765 deletions.
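For orientation on the `key_colnames()` changes above, here is a minimal, hypothetical sketch (assuming `epiprocess` and `dplyr` are loaded and using a made-up `epi_df`): keys now come back in `(geo_value, other_keys, time_value)` order, and the new `exclude` argument replaces the old `kill_time_value()` helper.

```r
library(epiprocess)
library(dplyr)

# Toy epi_df with one extra key column ("age_group") -- names are illustrative.
edf <- tibble(
  geo_value = rep(c("ca", "ny"), each = 3),
  age_group = "overall",
  time_value = rep(as.Date("2020-06-01") + 0:2, 2),
  cases = 1:6
) %>%
  as_epi_df(other_keys = "age_group", as_of = as.Date("2020-06-10"))

key_colnames(edf)
#> expected (new order): "geo_value" "age_group" "time_value"

# Previously: kill_time_value(key_colnames(edf))
key_colnames(edf, exclude = "time_value")
#> expected: "geo_value" "age_group"
```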
2 changes: 2 additions & 0 deletions .Rbuildignore
@@ -16,3 +16,5 @@
^.lintr$
^DEVELOPMENT.md$
man-roxygen
^.venv$
^sandbox.R$
1 change: 1 addition & 0 deletions .gitignore
@@ -13,3 +13,4 @@ docs
renv/
renv.lock
.Rprofile
sandbox.R
3 changes: 2 additions & 1 deletion DESCRIPTION
@@ -50,7 +50,8 @@ Imports:
tidyselect (>= 1.2.0),
tsibble,
utils,
vctrs
vctrs,
waldo
Suggests:
covidcast,
devtools,
2 changes: 1 addition & 1 deletion R/autoplot.R
@@ -55,7 +55,7 @@ autoplot.epi_df <- function(

key_cols <- key_colnames(object)
non_key_cols <- setdiff(names(object), key_cols)
geo_and_other_keys <- kill_time_value(key_cols)
geo_and_other_keys <- key_colnames(object, exclude = "time_value")

# --- check for numeric variables
allowed <- purrr::map_lgl(object[non_key_cols], is.numeric)
11 changes: 4 additions & 7 deletions R/epi_df.R
@@ -184,18 +184,14 @@ new_epi_df <- function(x = tibble::tibble(geo_value = character(), time_value =
metadata$other_keys <- other_keys

# Reorder columns (geo_value, time_value, ...)
if (sum(dim(x)) != 0) {
cols_to_put_first <- c("geo_value", "time_value", other_keys)
x <- x[, c(
cols_to_put_first,
# All other columns
names(x)[!(names(x) %in% cols_to_put_first)]
)]
if (nrow(x) > 0) {
x <- x %>% relocate(all_of(c("geo_value", other_keys, "time_value")), .before = 1)
}

# Apply epi_df class, attach metadata, and return
class(x) <- c("epi_df", class(x))
attributes(x)$metadata <- metadata

return(x)
}

@@ -281,6 +277,7 @@ as_epi_df.tbl_df <- function(
if (".time_value_counts" %in% other_keys) {
cli_abort("as_epi_df: `other_keys` can't include \".time_value_counts\"")
}

duplicated_time_values <- x %>%
group_by(across(all_of(c("geo_value", "time_value", other_keys)))) %>%
filter(dplyr::n() > 1) %>%
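The reordering block in `new_epi_df()` now leans on `dplyr::relocate()` rather than manual column indexing. A standalone sketch of the same idiom, with illustrative column names:

```r
library(dplyr)

x <- data.frame(cases = 1:2, time_value = 1:2, geo_value = "ca", age = "0-17")
other_keys <- "age"

# Pull the key columns to the front in (geo_value, other_keys, time_value) order.
x <- x %>% relocate(all_of(c("geo_value", other_keys, "time_value")), .before = 1)
names(x)
#> "geo_value" "age" "time_value" "cases"
```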
4 changes: 2 additions & 2 deletions R/grouped_epi_archive.R
@@ -397,8 +397,8 @@ epix_slide.grouped_epi_archive <- function(
)),
capture.output(print(waldo::compare(
res[[comp_nms[[comp_i]]]], comp_value[[comp_i]],
x_arg = rlang::expr_deparse(expr(`$`(label, !!sym(comp_nms[[comp_i]])))),
y_arg = rlang::expr_deparse(expr(`$`(comp_value, !!sym(comp_nms[[comp_i]]))))
x_arg = rlang::expr_deparse(dplyr::expr(`$`(label, !!sym(comp_nms[[comp_i]])))), # nolint: object_usage_linter
y_arg = rlang::expr_deparse(dplyr::expr(`$`(comp_value, !!sym(comp_nms[[comp_i]]))))
))),
cli::format_message(c(
"You likely want to rename or remove this column in your output, or debug why it has a different value."
33 changes: 20 additions & 13 deletions R/key_colnames.R
@@ -2,39 +2,46 @@
#'
#' @param x a data.frame, tibble, or epi_df
#' @param ... additional arguments passed on to methods
#'
#' @return If an `epi_df`, this returns all "keys". Otherwise `NULL`
#' @param other_keys an optional character vector of other keys to include
#' @param exclude an optional character vector of keys to exclude
#' @return If an `epi_df`, this returns all "keys". Otherwise `NULL`.
#' @keywords internal
#' @export
key_colnames <- function(x, ...) {
UseMethod("key_colnames")
}

#' @rdname key_colnames
#' @method key_colnames default
#' @export
key_colnames.default <- function(x, ...) {
character(0L)
}

#' @rdname key_colnames
#' @method key_colnames data.frame
#' @export
key_colnames.data.frame <- function(x, other_keys = character(0L), ...) {
key_colnames.data.frame <- function(x, other_keys = character(0L), exclude = character(0L), ...) {
assert_character(other_keys)
nm <- c("geo_value", "time_value", other_keys)
assert_character(exclude)
nm <- setdiff(c("geo_value", other_keys, "time_value"), exclude)
intersect(nm, colnames(x))
}

#' @rdname key_colnames
#' @method key_colnames epi_df
#' @export
key_colnames.epi_df <- function(x, ...) {
key_colnames.epi_df <- function(x, exclude = character(0L), ...) {
assert_character(exclude)
other_keys <- attr(x, "metadata")$other_keys
c("geo_value", "time_value", other_keys)
setdiff(c("geo_value", other_keys, "time_value"), exclude)
}

#' @rdname key_colnames
#' @method key_colnames epi_archive
#' @export
key_colnames.epi_archive <- function(x, ...) {
key_colnames.epi_archive <- function(x, exclude = character(0L), ...) {
assert_character(exclude)
other_keys <- attr(x, "metadata")$other_keys
c("geo_value", "time_value", other_keys)
}

kill_time_value <- function(v) {
assert_character(v)
v[v != "time_value"]
setdiff(c("geo_value", other_keys, "time_value"), exclude)
}
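A quick sketch of the reworked generic on a plain data frame (the `epi_df` and `epi_archive` methods above behave the same way, except that they read `other_keys` from the object's metadata):

```r
df <- data.frame(geo_value = "ca", time_value = 1, age = "0-17", cases = 3)

key_colnames(df, other_keys = "age")
#> expected: "geo_value" "age" "time_value"

key_colnames(df, other_keys = "age", exclude = "time_value")
#> expected: "geo_value" "age"

key_colnames(1:3) # default method
#> character(0)
```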
4 changes: 2 additions & 2 deletions R/methods-epi_archive.R
@@ -731,7 +731,7 @@ epix_detailed_restricted_mutate <- function(.data, ...) {
#' library(dplyr)
#'
#' # Reference time points for which we want to compute slide values:
#' versions <- seq(as.Date("2020-06-01"),
#' versions <- seq(as.Date("2020-06-02"),
#' as.Date("2020-06-15"),
#' by = "1 day"
#' )
@@ -780,7 +780,7 @@ epix_detailed_restricted_mutate <- function(.data, ...) {
#' .versions = versions
#' ) %>%
#' ungroup() %>%
#' arrange(geo_value, time_value)
#' arrange(geo_value, version)
#'
#' # --- Advanced: ---
#'
29 changes: 18 additions & 11 deletions R/methods-epi_df.R
@@ -41,10 +41,13 @@ as_tibble.epi_df <- function(x, ...) {
#' @export
as_tsibble.epi_df <- function(x, key, ...) {
if (missing(key)) key <- c("geo_value", attributes(x)$metadata$other_keys)
return(as_tsibble(tibble::as_tibble(x),
key = tidyselect::all_of(key), index = "time_value",
...
))
return(
as_tsibble(
tibble::as_tibble(x),
key = tidyselect::all_of(key), index = "time_value",
...
)
)
}

#' Base S3 methods for an `epi_df` object
@@ -150,10 +153,10 @@ dplyr_reconstruct.epi_df <- function(data, template) {
# keep any grouping that has been applied:
res <- NextMethod()

cn <- names(res)
col_names <- names(res)

# Duplicate columns, cli_abort
dup_col_names <- cn[duplicated(cn)]
dup_col_names <- col_names[duplicated(col_names)]
if (length(dup_col_names) != 0) {
cli_abort(c(
"Duplicate column names are not allowed",
@@ -163,7 +166,7 @@ dplyr_reconstruct.epi_df <- function(data, template) {
))
}

not_epi_df <- !("time_value" %in% cn) || !("geo_value" %in% cn)
not_epi_df <- !("time_value" %in% col_names) || !("geo_value" %in% col_names)

if (not_epi_df) {
# If we're calling on an `epi_df` from one of our own functions, we need to
@@ -182,7 +185,7 @@ dplyr_reconstruct.epi_df <- function(data, template) {

# Amend additional metadata if some other_keys cols are dropped in the subset
old_other_keys <- attr(template, "metadata")$other_keys
attr(res, "metadata")$other_keys <- old_other_keys[old_other_keys %in% cn]
attr(res, "metadata")$other_keys <- old_other_keys[old_other_keys %in% col_names]

res
}
@@ -424,9 +427,13 @@ arrange_col_canonical.epi_df <- function(x, ...) {
x %>% dplyr::relocate(dplyr::all_of(cols), .before = 1)
}

#' Group an `epi_df` object by default keys
#' @param x an `epi_df`
#' @param exclude character vector of column names to exclude from grouping
#' @return a grouped `epi_df`
#' @export
group_epi_df <- function(x) {
cols <- kill_time_value(key_colnames(x))
group_epi_df <- function(x, exclude = character()) {
cols <- key_colnames(x, exclude = exclude)
x %>% group_by(across(all_of(cols)))
}

@@ -437,7 +444,7 @@ group_epi_df <- function(x) {
#' the resulting `epi_df` will have `geo_value` set to `"total"`.
#'
#' @param .x an `epi_df`
#' @param value_col character vector of the columns to aggregate
#' @param sum_cols character vector of the columns to aggregate
#' @param group_cols character vector of column names to group by. "time_value" is
#' included by default.
#' @return an `epi_df` object
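`group_epi_df()` gains the same `exclude` argument, so callers such as `epi_slide()` can group by every key except `time_value` without the old `kill_time_value()` step. A sketch, reusing a toy `edf` like the one above:

```r
edf %>%
  group_epi_df(exclude = "time_value") %>%
  group_vars()
#> expected: "geo_value" plus any other_keys, but not "time_value"
```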
12 changes: 5 additions & 7 deletions R/outliers.R
@@ -161,8 +161,7 @@ detect_outlr <- function(x = seq_along(y), y,
#' group_by(geo_value) %>%
#' mutate(outlier_info = detect_outlr_rm(
#' x = time_value, y = cases
#' )) %>%
#' unnest(outlier_info)
#' ))
detect_outlr_rm <- function(x = seq_along(y), y, n = 21,
log_transform = FALSE,
detect_negatives = FALSE,
@@ -189,7 +188,7 @@ detect_outlr_rm <- function(x = seq_along(y), y, n = 21,

# Calculate lower and upper thresholds and replacement value
z <- z %>%
epi_slide(fitted = median(y), .window_size = n, .align = "center") %>%
epi_slide(fitted = median(y, na.rm = TRUE), .window_size = n, .align = "center") %>%
dplyr::mutate(resid = y - fitted) %>%
roll_iqr(
n = n,
@@ -256,9 +255,8 @@ detect_outlr_rm <- function(x = seq_along(y), y, n = 21,
#' group_by(geo_value) %>%
#' mutate(outlier_info = detect_outlr_stl(
#' x = time_value, y = cases,
#' seasonal_period = 7
#' )) %>% # weekly seasonality for daily data
#' unnest(outlier_info)
#' seasonal_period = 7 # weekly seasonality for daily data
#' ))
detect_outlr_stl <- function(x = seq_along(y), y,
n_trend = 21,
n_seasonal = 21,
@@ -359,7 +357,7 @@ roll_iqr <- function(z, n, detection_multiplier, min_radius,

z %>%
epi_slide(
roll_iqr = stats::IQR(resid),
roll_iqr = stats::IQR(resid, na.rm = TRUE),
.window_size = n, .align = "center"
) %>%
dplyr::mutate(
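The `na.rm = TRUE` additions matter because rows introduced by window padding/completion can carry `NA`s, and without it a single missing value makes the whole window's statistic `NA`. In base R terms:

```r
resid <- c(1.2, NA, 0.8, -0.5)

median(resid)               # NA
median(resid, na.rm = TRUE) # 0.8
stats::IQR(resid, na.rm = TRUE)
```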
10 changes: 4 additions & 6 deletions R/revision_analysis.R
@@ -81,8 +81,8 @@ revision_summary <- function(epi_arch,
should_compactify = TRUE) {
arg <- names(eval_select(rlang::expr(c(...)), allow_rename = FALSE, data = epi_arch$DT))
if (length(arg) == 0) {
first_non_key <- !(names(epi_arch$DT) %in% c(key_colnames(epi_arch), "version"))
arg <- names(epi_arch$DT)[first_non_key][1]
# Choose the first column that's not a key or version
arg <- setdiff(names(epi_arch$DT), c(key_colnames(epi_arch), "version"))[[1]]
} else if (length(arg) > 1) {
cli_abort("Not currently implementing more than one column at a time. Run each separately")
}
@@ -99,11 +99,9 @@ revision_summary <- function(epi_arch,
#
# revision_tibble
keys <- key_colnames(epi_arch)
names(epi_arch$DT)

revision_behavior <-
epi_arch$DT %>%
select(c(geo_value, time_value, all_of(keys), version, !!arg))
revision_behavior <- epi_arch$DT %>%
select(all_of(unique(c("geo_value", "time_value", keys, "version", arg))))
if (!is.null(min_waiting_period)) {
revision_behavior <- revision_behavior %>%
filter(abs(time_value - as.Date(epi_arch$versions_end)) >= min_waiting_period)
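The rewritten default-column selection is just a `setdiff()` against the keys plus `version`. A standalone sketch with hypothetical column names:

```r
dt_names <- c("geo_value", "time_value", "version", "percent_cli", "case_rate")
keys <- c("geo_value", "time_value")

# First column that is neither a key nor `version`:
setdiff(dt_names, c(keys, "version"))[[1]]
#> "percent_cli"
```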
43 changes: 30 additions & 13 deletions R/slide.R
@@ -122,8 +122,7 @@ epi_slide <- function(
assert_class(.x, "epi_df")
if (checkmate::test_class(.x, "grouped_df")) {
expected_group_keys <- .x %>%
key_colnames() %>%
kill_time_value() %>%
key_colnames(exclude = "time_value") %>%
sort()
if (!identical(.x %>% group_vars() %>% sort(), expected_group_keys)) {
cli_abort(
@@ -134,12 +133,11 @@ epi_slide <- function(
)
}
} else {
.x <- group_epi_df(.x)
.x <- group_epi_df(.x, exclude = "time_value")
}
if (nrow(.x) == 0L) {
return(.x)
}

# If `.f` is missing, interpret ... as an expression for tidy evaluation
if (missing(.f)) {
used_data_masking <- TRUE
@@ -191,6 +189,20 @@ epi_slide <- function(

assert_logical(.all_rows, len = 1)

# Check for duplicated time values within groups
duplicated_time_values <- .x %>%
group_epi_df() %>%
filter(dplyr::n() > 1) %>%
ungroup()
if (nrow(duplicated_time_values) > 0) {
bad_data <- capture.output(duplicated_time_values)
cli_abort(
"as_epi_df: some groups in a resulting dplyr computation have duplicated time values.
epi_df requires a unique time_value per group.",
body = c("Sample groups:", bad_data)
)
}

# Begin handling completion. This will create a complete time index between
# the smallest and largest time values in the data. This is used to ensure
# that the slide function is called with a complete window of data. Each slide
@@ -241,7 +253,7 @@ epi_slide <- function(
.keep = TRUE
) %>%
bind_rows() %>%
filter(.data$.real) %>%
filter(.real) %>%
select(-.real) %>%
arrange_col_canonical() %>%
group_by(!!!.x_groups)
@@ -275,11 +287,16 @@ epi_slide_one_group <- function(
missing_times <- all_dates[!(all_dates %in% .data_group$time_value)]
.data_group <- bind_rows(
.data_group,
tibble(time_value = c(
missing_times,
.date_seq_list$pad_early_dates,
.date_seq_list$pad_late_dates
), .real = FALSE)
dplyr::bind_cols(
.group_key,
tibble(
time_value = c(
missing_times,
.date_seq_list$pad_early_dates,
.date_seq_list$pad_late_dates
), .real = FALSE
)
)
) %>%
arrange(.data$time_value)

@@ -405,8 +422,8 @@ epi_slide_one_group <- function(
)),
capture.output(print(waldo::compare(
res[[comp_nms[[comp_i]]]], slide_values[[comp_i]],
x_arg = rlang::expr_deparse(expr(`$`(existing, !!sym(comp_nms[[comp_i]])))),
y_arg = rlang::expr_deparse(expr(`$`(comp_value, !!sym(comp_nms[[comp_i]]))))
x_arg = rlang::expr_deparse(dplyr::expr(`$`(existing, !!sym(comp_nms[[comp_i]])))), # nolint: object_usage_linter
y_arg = rlang::expr_deparse(dplyr::expr(`$`(comp_value, !!sym(comp_nms[[comp_i]])))) # nolint: object_usage_linter
))),
cli::format_message(c(
">" = "You likely want to rename or remove this column from your slide
@@ -711,7 +728,7 @@ epi_slide_opt <- function(
# positions of user-provided `col_names` into string column names. We avoid
# using `names(pos)` directly for robustness and in case we later want to
# allow users to rename fields via tidyselection.
if (class(quo_get_expr(enquo(.col_names))) == "character") {
if (inherits(quo_get_expr(enquo(.col_names)), "character")) {
pos <- eval_select(dplyr::all_of(.col_names), data = .x, allow_rename = FALSE)
} else {
pos <- eval_select(enquo(.col_names), data = .x, allow_rename = FALSE)
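The `inherits()` change at the end sidesteps the usual pitfall with `class(x) == "..."`: `class()` can return a multi-element vector, and `==` then vectorizes over it. A small illustration with an unrelated multi-class object:

```r
x <- tibble::tibble(a = 1)

class(x) == "data.frame"
#> FALSE FALSE  TRUE   # one result per class; unsafe inside `if ()`

inherits(x, "data.frame")
#> TRUE                # always a single TRUE/FALSE
```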
2 changes: 2 additions & 0 deletions R/utils.R
@@ -543,6 +543,7 @@ as_slide_computation <- function(.f, ..., .ref_time_value_long_varnames, .ref_ti

#' @rdname as_slide_computation
#' @export
#' @keywords internal
#' @noRd
as_time_slide_computation <- function(.f, ...) {
as_slide_computation(
@@ -554,6 +555,7 @@ as_time_slide_computation <- function(.f, ...) {

#' @rdname as_slide_computation
#' @export
#' @keywords internal
#' @noRd
as_diagonal_slide_computation <- function(.f, ...) {
as_slide_computation(
1 change: 0 additions & 1 deletion _pkgdown.yml
@@ -48,7 +48,6 @@ articles:
- aggregation
- outliers
- archive
- advanced
- compactify

repo:
6 changes: 3 additions & 3 deletions man-roxygen/basic-slide-details.R
@@ -9,7 +9,7 @@
#' boundary of the dataset) and will attempt to perform the computation
#' anyway. The issue of what to do with partial computations (those run on
#' incomplete windows) is therefore left up to the user, either through the
#' specified function or formula `f`, or through post-processing.
#' specified function or formula, or through post-processing.
#'
#' Let's look at some window examples, assuming that the reference time value
#' is "tv". With .align = "right" and .window_size = 3, the window will be:
@@ -60,8 +60,8 @@
#' "pronoun"-like bindings available:
#' * .x, which is like `.x` in [`dplyr::group_modify`]; an ordinary object
#' like an `epi_df` rather than an rlang [pronoun][rlang::as_data_pronoun]
#' like [`.data`]; this allows you to use additional {dplyr}, {tidyr}, and
#' {epiprocess} operations. If you have multiple expressions in `...`, this
#' like [`.data`]; this allows you to use additional `dplyr`, `tidyr`, and
#' `epiprocess` operations. If you have multiple expressions in `...`, this
#' won't let you refer to the output of the earlier expressions, but `.data`
#' will.
#' * .group_key, which is like `.y` in [`dplyr::group_modify`].
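A hypothetical sketch of the `.x` binding described above, assuming a toy daily `edf` like the earlier ones: inside the data-masking expressions passed via `...`, `.x` is the window's `epi_df`, so ordinary dplyr-style operations on it work.

```r
edf %>%
  epi_slide(
    n_in_window = nrow(.x),
    mean_cases  = mean(.x$cases, na.rm = TRUE),
    .window_size = 7
  )
```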
3 changes: 1 addition & 2 deletions man/detect_outlr_rm.Rd
5 changes: 2 additions & 3 deletions man/detect_outlr_stl.Rd
6 changes: 3 additions & 3 deletions man/epi_slide.Rd
4 changes: 2 additions & 2 deletions man/epix_slide.Rd
19 changes: 19 additions & 0 deletions man/group_epi_df.Rd
18 changes: 17 additions & 1 deletion man/key_colnames.Rd
4 changes: 2 additions & 2 deletions man/sum_groups_epi_df.Rd
15 changes: 7 additions & 8 deletions tests/testthat/test-arrange-canonical.R
@@ -8,14 +8,13 @@ test_that("canonical arrangement works", {
expect_error(arrange_canonical(tib))

tib <- tib %>% as_epi_df(other_keys = "demo_grp")
expect_equal(names(tib), c("geo_value", "time_value", "demo_grp", "x"))
expect_equal(names(tib), c("geo_value", "demo_grp", "time_value", "x"))

tib_cols_shuffled <- tib %>% select(geo_value, time_value, x, demo_grp)

tib_sorted <- arrange_canonical(tib_cols_shuffled)
expect_equal(names(tib_sorted), c("geo_value", "time_value", "demo_grp", "x"))
tib_sorted <- tib %>%
arrange_canonical()
expect_equal(names(tib_sorted), c("geo_value", "demo_grp", "time_value", "x"))
expect_equal(tib_sorted$geo_value, rep(c("ca", "ga"), each = 4))
expect_equal(tib_sorted$time_value, c(1, 1, 2, 2, 1, 1, 2, 2))
expect_equal(tib_sorted$demo_grp, rep(letters[1:2], times = 4))
expect_equal(tib_sorted$x, c(8, 6, 7, 5, 4, 2, 3, 1))
expect_equal(tib_sorted$time_value, c(1, 2, 1, 2, 1, 2, 1, 2))
expect_equal(tib_sorted$demo_grp, c("a", "a", "b", "b", "a", "a", "b", "b"))
expect_equal(tib_sorted$x, c(8, 7, 6, 5, 4, 3, 2, 1))
})
10 changes: 5 additions & 5 deletions tests/testthat/test-epi_slide.R
@@ -53,7 +53,7 @@ get_test_dataset <- function(n, time_type = "day", other_keys = FALSE) {
}
df %>%
arrange_canonical() %>%
group_epi_df()
group_epi_df(exclude = "time_value")
}
test_data <- get_test_dataset(num_rows_per_group, "day")

@@ -82,10 +82,10 @@ epi_slide_sum_test <- function(

.x %>%
mutate(.real = TRUE) %>%
group_epi_df() %>%
group_epi_df(exclude = "time_value") %>%
complete(time_value = vctrs::vec_c(!!!date_seq_list, .name_spec = rlang::zap())) %>%
arrange_canonical() %>%
group_epi_df() %>%
group_epi_df(exclude = "time_value") %>%
mutate(
slide_value = slider::slide_index_sum(
.data$value,
@@ -246,7 +246,7 @@ for (p in (param_combinations %>% transpose())) {
mutate(slide_value = list(slide_value)) %>%
ungroup() %>%
as_epi_df(as_of = attr(test_data, "metadata")$as_of, other_keys = attr(test_data, "metadata")$other_keys) %>%
group_epi_df()
group_epi_df(exclude = "time_value")

expect_equal(
out %>% select(-slide_value),
@@ -268,7 +268,7 @@ for (p in (param_combinations %>% transpose())) {
mutate(slide_value = list(slide_value)) %>%
ungroup() %>%
as_epi_df(as_of = attr(test_data, "metadata")$as_of, other_keys = attr(test_data, "metadata")$other_keys) %>%
group_epi_df()
group_epi_df(exclude = "time_value")
expect_equal(
out %>% select(-slide_value),
expected_out %>% select(-slide_value)
11 changes: 5 additions & 6 deletions tests/testthat/test-methods-epi_df.R
@@ -69,21 +69,20 @@ test_that("Subsetting drops & does not drop the epi_df class appropriately", {
expect_equal(ncol(col_subset2), 2L)

# Row and col subset that contains geo_value and time_value - should be epi_df
row_col_subset2 <- toy_epi_df[2:3, 1:3]
row_col_subset2 <- toy_epi_df[2:3, c(1, 4)]
att_row_col_subset2 <- attr(row_col_subset2, "metadata")

expect_true(is_epi_df(row_col_subset2))
expect_equal(nrow(row_col_subset2), 2L)
expect_equal(ncol(row_col_subset2), 3L)
expect_equal(ncol(row_col_subset2), 2L)
expect_identical(att_row_col_subset2$geo_type, att_toy$geo_type)
expect_identical(att_row_col_subset2$time_type, att_toy$time_type)
expect_identical(att_row_col_subset2$as_of, att_toy$as_of)
expect_identical(att_row_col_subset2$other_keys, att_toy$other_keys[1])
})

test_that("When duplicate cols in subset should abort", {
expect_error(toy_epi_df[, c(2, 2:3, 4, 4, 4)],
"Duplicated column names: time_value, indic_var2",
"Duplicated column names: indic_var1, time_value",
fixed = TRUE
)
expect_error(toy_epi_df[1:4, c(1, 2:4, 1)],
@@ -94,7 +93,7 @@ test_that("When duplicate cols in subset should abort", {

test_that("Correct metadata when subset includes some of other_keys", {
# Only include other_var of indic_var1
only_indic_var1 <- toy_epi_df[, c(1:3, 5:6)]
only_indic_var1 <- toy_epi_df[, c(1:2, 4:6)]
att_only_indic_var1 <- attr(only_indic_var1, "metadata")

expect_true(is_epi_df(only_indic_var1))
@@ -106,7 +105,7 @@ test_that("Correct metadata when subset includes some of other_keys", {
expect_identical(att_only_indic_var1$other_keys, att_toy$other_keys[-2])

# Only include other_var of indic_var2
only_indic_var2 <- toy_epi_df[, c(1:2, 4:6)]
only_indic_var2 <- toy_epi_df[, c(1, 3:6)]
att_only_indic_var2 <- attr(only_indic_var2, "metadata")

expect_true(is_epi_df(only_indic_var2))
488 changes: 0 additions & 488 deletions vignettes/advanced.Rmd

This file was deleted.

25 changes: 13 additions & 12 deletions vignettes/aggregation.Rmd
@@ -52,13 +52,12 @@ x <- jhu_csse_county_level_subset
## Converting to `tsibble` format

For manipulating and wrangling time series data, the
[`tsibble`](https://tsibble.tidyverts.org/index.html) already provides a whole
bunch of useful tools. A tsibble object (formerly, of class `tbl_ts`) is
basically a tibble (data frame) but with two specially-marked columns: an
**index** column representing the time variable (defining an order from past to
present), and a **key** column identifying a unique observational unit for each
time point. In fact, the key can be made up of any number of columns, not just a
single one.
[`tsibble`](https://tsibble.tidyverts.org/index.html) already provides a host of
useful tools. A tsibble object (formerly, of class `tbl_ts`) is basically a
tibble (data frame) but with two specially-marked columns: an **index** column
representing the time variable (defining an order from past to present), and a
**key** column identifying a unique observational unit for each time point. In
fact, the key can be made up of any number of columns, not just a single one.

In an `epi_df` object, the index variable is `time_value`, and the key variable
is typically `geo_value` (though this need not always be the case: for example,
@@ -113,11 +112,13 @@ Let's first remove certain dates from our data set to create gaps:
```{r}
# First make geo value more readable for tables, plots, etc.
x <- x %>%
mutate(geo_value = paste(
substr(county_name, 1, nchar(county_name) - 7),
name_to_abbr(state_name),
sep = ", "
)) %>%
mutate(
geo_value = paste(
substr(county_name, 1, nchar(county_name) - 7),
name_to_abbr(state_name),
sep = ", "
)
) %>%
select(geo_value, time_value, cases)
xt <- as_tsibble(x) %>% filter(cases >= 3)
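A compact sketch expanding on the conversion above, assuming the vignette's `x` (an `epi_df`): `as_tsibble()` on an `epi_df` uses `geo_value` (plus any other keys) as the tsibble key and `time_value` as the index.

```r
library(tsibble)

xt <- as_tsibble(x)
key_vars(xt)  # expected: "geo_value"
index_var(xt) # expected: "time_value"
```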
332 changes: 289 additions & 43 deletions vignettes/archive.Rmd

Large diffs are not rendered by default.

14 changes: 7 additions & 7 deletions vignettes/epiprocess.Rmd
@@ -128,9 +128,7 @@ columns required for an `epi_df` object (along with many others). We can use
frame into `epi_df` format.

```{r, message = FALSE}
x <- as_epi_df(cases,
as_of = max(cases$issue)
) %>%
x <- as_epi_df(cases, as_of = max(cases$issue)) %>%
select(geo_value, time_value, total_cases = value)
class(x)
@@ -176,9 +174,11 @@ attributes(x)$metadata
```

## Using additional key columns in `epi_df`

In the following examples we will show how to create an `epi_df` with additional keys.

### Converting a `tsibble` that has county code as an extra key

```{r}
ex1 <- tibble(
geo_value = rep(c("ca", "fl", "pa"), each = 3),
@@ -200,23 +200,24 @@ The metadata now includes `county_code` as an extra key.
attr(ex1, "metadata")
```


### Dealing with misspecified column names

`epi_df` requires there to be columns `geo_value` and `time_value`, if they do not exist then `as_epi_df()` throws an error.

```{r, error = TRUE}
data.frame(
# misnamed
state = rep(c("ca", "fl", "pa"), each = 3),
# extra key
pol = rep(c("blue", "swing", "swing"), each = 3),
# misnamed
reported_date = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"), by = "day"), length.out = length(geo_value)),
value = seq_along(geo_value) + 0.01 * withr::with_rng_version("3.0.0", withr::with_seed(42, length(geo_value)))
reported_date = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"), by = "day"), length.out = 9),
value = 1:9 + 0.01 * withr::with_rng_version("3.0.0", withr::with_seed(42, 9))
) %>% as_epi_df(as_of = as.Date("2024-03-20"))
```

The columns can be renamed to match `epi_df` format. In the example below, notice there is also an additional key `pol`.

```{r}
ex2 <- tibble(
# misnamed
@@ -240,7 +241,6 @@ ex2 <- ex2 %>%
attr(ex2, "metadata")
```


### Adding additional keys to an `epi_df` object

In the above examples, all the keys are added to objects that are not `epi_df` objects. We illustrate how to add keys to an `epi_df` object.
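A compact, illustrative sketch of the rename-then-convert workflow the vignette walks through (column names follow the vignette's example; the values are made up):

```r
library(epiprocess)
library(dplyr)

ex2 <- tibble(
  state = rep(c("ca", "fl", "pa"), each = 3), # misnamed geo_value
  pol = rep(c("blue", "swing", "swing"), each = 3), # extra key
  reported_date = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"), by = "day"), 3), # misnamed time_value
  value = 1:9
) %>%
  rename(geo_value = state, time_value = reported_date) %>%
  as_epi_df(as_of = as.Date("2024-03-20"), other_keys = "pol")

attr(ex2, "metadata")$other_keys
#> expected: "pol"
```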
2 changes: 1 addition & 1 deletion vignettes/growth_rate.Rmd
@@ -22,6 +22,7 @@ library(tidyr)
```

The data is fetched with the following query:

```{r, message = FALSE, eval=F}
x <- pub_covidcast(
source = "jhu-csse",
@@ -38,7 +39,6 @@ x <- pub_covidcast(

The data has 1,158 rows and 3 columns.


```{r, echo=FALSE}
data(jhu_csse_daily_subset)
x <- jhu_csse_daily_subset %>%
16 changes: 9 additions & 7 deletions vignettes/outliers.Rmd
@@ -127,11 +127,14 @@ vote across the base methods to determine whether a value is an outlier.
```{r}
x <- x %>%
group_by(geo_value) %>%
mutate(outlier_info = detect_outlr(
x = time_value, y = cases,
methods = detection_methods,
combiner = "median"
)) %>%
mutate(
outlier_info = detect_outlr(
x = time_value,
y = cases,
methods = detection_methods,
combiner = "median"
)
) %>%
ungroup() %>%
unnest(outlier_info)
@@ -240,10 +243,9 @@ ggplot(y, aes(x = time_value)) +
More advanced correction functionality will be coming at some point in the
future.


## Attribution

This document contains a dataset that is a modified part of the [COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19) as [republished in the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html). This data set is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/) by the Johns Hopkins University on behalf of its Center for Systems Science in Engineering. Copyright Johns Hopkins University 2020.

[From the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html):
These signals are taken directly from the JHU CSSE [COVID-19 GitHub repository](https://github.com/CSSEGISandData/COVID-19) without changes.

246 changes: 138 additions & 108 deletions vignettes/slide.Rmd

Large diffs are not rendered by default.
