From db781635df3cefddfd9f2431f8cbab70c662f0d9 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Mon, 11 Sep 2023 18:10:09 -0700 Subject: [PATCH 001/345] remove .DS_Store --- vignettes/.DS_Store | Bin 8196 -> 0 bytes vignettes/.gitignore | 1 + 2 files changed, 1 insertion(+) delete mode 100644 vignettes/.DS_Store diff --git a/vignettes/.DS_Store b/vignettes/.DS_Store deleted file mode 100644 index dba164981b6904f488144f5147633e3b970d4dcb..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8196 zcmeHMOK#gR5FI)OlK2yQnJxlo-GzbPK+R{-ZE}FLi3}%z8U=1XTkU(50y#}ikrQ;) zz9ET#9FpkHDUhK-K0)H+JU)#rj7&u4@V@y(v`<7k=#29*hD_t@+>kG(@jC*$a3d}QM*6l0@< zSJ~-gYKJ-?2^M1T2X8;*#k^o<&Nz<5KK zsP-GIafKDWQ!ALCG5c4*9A%iNg7YLelN<-J8o}dOiy+n^O=$$}ojijuvqyIZj3;=V7RF1YNAKDLcx!8^zj=I6@icYczW| zo}Fs486IkaL~ZK81{~OOG4{FsKPkWe-+&Sgp$@16Z{>g)9-kc_BD$}w6CGS@SLk2S wx$(T-p`~E3 Date: Mon, 11 Sep 2023 18:24:33 -0700 Subject: [PATCH 002/345] adjust colors --- _pkgdown.yml | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/_pkgdown.yml b/_pkgdown.yml index 403cd174..93d5010c 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -1,14 +1,25 @@ template: bootstrap: 5 + bootswatch: cosmo + bslib: + font_scale: 1.0 + primary: '#C41230' + link-color: '#C41230' + navbar-bg: '#C41230' + navbar-fg: '#f8f8f8' + +navbar: + bg: '#C41230' + fg: '#f8f8f8' url: https://cmu-delphi.github.io/epiprocess/ home: links: + - text: Introduction to Delphi's Tooling Work + href: https://cmu-delphi.github.io/delphi-tooling-book/ - text: Get the epipredict R package href: https://cmu-delphi.github.io/epipredict/ - - text: Get the covidcast R package - href: https://cmu-delphi.github.io/covidcast/covidcastR/ - text: Get the epidatr R package href: https://github.com/cmu-delphi/epidatr From c4abb04470f9728083c13f6ea4fc3ff88c68ac28 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Wed, 15 Nov 2023 14:40:56 -0800 Subject: [PATCH 003/345] refactor+style: update for epidatr 1.0.0 --- data-raw/archive_cases_dv_subset.R | 31 ++-- data-raw/incidence_num_outlier_example.R | 9 +- data-raw/jhu_csse_county_level_subset.R | 9 +- data-raw/jhu_csse_daily_subset.R | 59 +++---- vignettes/advanced.Rmd | 166 +++++++++--------- vignettes/aggregation.Rmd | 101 ++++++----- vignettes/archive.Rmd | 214 ++++++++++++----------- vignettes/correlation.Rmd | 59 ++++--- vignettes/epiprocess.Rmd | 149 +++++++++------- vignettes/growth_rate.Rmd | 121 +++++++------ vignettes/outliers.Rmd | 138 +++++++++------ vignettes/slide.Rmd | 143 ++++++++------- 12 files changed, 642 insertions(+), 557 deletions(-) diff --git a/data-raw/archive_cases_dv_subset.R b/data-raw/archive_cases_dv_subset.R index d907fd89..5ba7ac4b 100644 --- a/data-raw/archive_cases_dv_subset.R +++ b/data-raw/archive_cases_dv_subset.R @@ -3,41 +3,40 @@ library(epiprocess) library(data.table) library(dplyr) -dv_subset <- covidcast( - data_source = "doctor-visits", +dv_subset <- pub_covidcast( + source = "doctor-visits", signals = "smoothed_adj_cli", - time_type = "day", geo_type = "state", - time_values = epirange(20200601, 20211201), + time_type = "day", geo_values = "ca,fl,ny,tx", + time_values = epirange(20200601, 20211201), issues = epirange(20200601, 20211201) ) %>% - fetch() %>% select(geo_value, time_value, version = issue, percent_cli = value) %>% # We're using compactify=FALSE here and below to avoid some testthat test # failures on tests that were based on a non-compactified version. 
- as_epi_archive(compactify=FALSE) + as_epi_archive(compactify = FALSE) -case_rate_subset <- covidcast( - data_source = "jhu-csse", +case_rate_subset <- pub_covidcast( + source = "jhu-csse", signals = "confirmed_7dav_incidence_prop", - time_type = "day", geo_type = "state", - time_values = epirange(20200601, 20211201), + time_type = "day", geo_values = "ca,fl,ny,tx", + time_values = epirange(20200601, 20211201), issues = epirange(20200601, 20211201) ) %>% - fetch() %>% select(geo_value, time_value, version = issue, case_rate_7d_av = value) %>% - as_epi_archive(compactify=FALSE) + as_epi_archive(compactify = FALSE) -archive_cases_dv_subset = epix_merge(dv_subset, case_rate_subset, - sync="locf", - compactify=FALSE) +archive_cases_dv_subset <- epix_merge(dv_subset, case_rate_subset, + sync = "locf", + compactify = FALSE +) # If we directly store an epi_archive R6 object as data, it will store its class # implementation there as well. To prevent mismatches between these stored # implementations and the latest class definition, don't store them as R6 # objects; store the DT and construct the R6 object on request. -archive_cases_dv_subset_dt = archive_cases_dv_subset$DT +archive_cases_dv_subset_dt <- archive_cases_dv_subset$DT usethis::use_data(archive_cases_dv_subset_dt, overwrite = TRUE, internal = TRUE) diff --git a/data-raw/incidence_num_outlier_example.R b/data-raw/incidence_num_outlier_example.R index 0aea397b..a5cb4d89 100644 --- a/data-raw/incidence_num_outlier_example.R +++ b/data-raw/incidence_num_outlier_example.R @@ -3,16 +3,15 @@ library(epiprocess) library(dplyr) library(tidyr) -incidence_num_outlier_example <- covidcast( - data_source = "jhu-csse", +incidence_num_outlier_example <- pub_covidcast( + source = "jhu-csse", signals = "confirmed_incidence_num", - time_type = "day", geo_type = "state", - time_values = epirange(20200601, 20210531), + time_type = "day", geo_values = "fl,nj", + time_values = epirange(20200601, 20210531), as_of = 20211028 ) %>% - fetch() %>% select(geo_value, time_value, cases = value) %>% as_epi_df() diff --git a/data-raw/jhu_csse_county_level_subset.R b/data-raw/jhu_csse_county_level_subset.R index 61328423..faed75e8 100644 --- a/data-raw/jhu_csse_county_level_subset.R +++ b/data-raw/jhu_csse_county_level_subset.R @@ -9,15 +9,14 @@ y <- covidcast::county_census %>% select(geo_value = FIPS, county_name = CTYNAME, state_name = STNAME) # Fetch only counties from Massachusetts and Vermont, then append names columns as well -jhu_csse_county_level_subset <- covidcast( - data_source = "jhu-csse", +jhu_csse_county_level_subset <- pub_covidcast( + source = "jhu-csse", signals = "confirmed_incidence_num", - time_type = "day", geo_type = "county", + time_type = "day", + geo_values = paste(y$geo_value, collapse = ","), time_values = epirange(20200601, 20211231), - geo_values = paste(y$geo_value, collapse = ",") ) %>% - fetch() %>% select(geo_value, time_value, cases = value) %>% full_join(y, by = "geo_value") %>% as_epi_df() diff --git a/data-raw/jhu_csse_daily_subset.R b/data-raw/jhu_csse_daily_subset.R index ce94ff2e..14ca85c8 100644 --- a/data-raw/jhu_csse_daily_subset.R +++ b/data-raw/jhu_csse_daily_subset.R @@ -2,61 +2,60 @@ library(epidatr) library(epiprocess) library(dplyr) -confirmed_7dav_incidence_prop <- covidcast( - data_source = "jhu-csse", +confirmed_7dav_incidence_prop <- pub_covidcast( + source = "jhu-csse", signals = "confirmed_7dav_incidence_prop", - time_type = "day", geo_type = "state", + time_type = "day", + geo_values = "ca,fl,ny,tx,ga,pa", 
time_values = epirange(20200301, 20211231), - geo_values = "ca,fl,ny,tx,ga,pa" ) %>% - fetch() %>% select(geo_value, time_value, case_rate_7d_av = value) %>% - arrange(geo_value, time_value) + arrange(geo_value, time_value) -deaths_7dav_incidence_prop <- covidcast( - data_source = "jhu-csse", +deaths_7dav_incidence_prop <- pub_covidcast( + source = "jhu-csse", signals = "deaths_7dav_incidence_prop", - time_type = "day", geo_type = "state", + time_type = "day", + geo_values = "ca,fl,ny,tx,ga,pa", time_values = epirange(20200301, 20211231), - geo_values = "ca,fl,ny,tx,ga,pa" ) %>% - fetch() %>% select(geo_value, time_value, death_rate_7d_av = value) %>% - arrange(geo_value, time_value) + arrange(geo_value, time_value) -confirmed_incidence_num <- covidcast( - data_source = "jhu-csse", +confirmed_incidence_num <- pub_covidcast( + source = "jhu-csse", signals = "confirmed_incidence_num", - time_type = "day", geo_type = "state", + time_type = "day", + geo_values = "ca,fl,ny,tx,ga,pa", time_values = epirange(20200301, 20211231), - geo_values = "ca,fl,ny,tx,ga,pa" ) %>% - fetch() %>% select(geo_value, time_value, cases = value) %>% - arrange(geo_value, time_value) + arrange(geo_value, time_value) -confirmed_7dav_incidence_num <- covidcast( - data_source = "jhu-csse", +confirmed_7dav_incidence_num <- pub_covidcast( + source = "jhu-csse", signals = "confirmed_7dav_incidence_num", - time_type = "day", geo_type = "state", + time_type = "day", + geo_values = "ca,fl,ny,tx,ga,pa", time_values = epirange(20200301, 20211231), - geo_values = "ca,fl,ny,tx,ga,pa" ) %>% - fetch() %>% select(geo_value, time_value, cases_7d_av = value) %>% arrange(geo_value, time_value) jhu_csse_daily_subset <- confirmed_7dav_incidence_prop %>% - full_join(deaths_7dav_incidence_prop, - by = c("geo_value", "time_value")) %>% - full_join(confirmed_incidence_num, - by = c("geo_value", "time_value")) %>% - full_join(confirmed_7dav_incidence_num, - by = c("geo_value", "time_value")) %>% - as_epi_df() + full_join(deaths_7dav_incidence_prop, + by = c("geo_value", "time_value") + ) %>% + full_join(confirmed_incidence_num, + by = c("geo_value", "time_value") + ) %>% + full_join(confirmed_7dav_incidence_num, + by = c("geo_value", "time_value") + ) %>% + as_epi_df() usethis::use_data(jhu_csse_daily_subset, overwrite = TRUE) diff --git a/vignettes/advanced.Rmd b/vignettes/advanced.Rmd index 02288905..91f8f37f 100644 --- a/vignettes/advanced.Rmd +++ b/vignettes/advanced.Rmd @@ -75,11 +75,11 @@ behavior we demonstrate also carries over to `epix_slide()`. ## Recycling outputs -When a computation returns a single atomic value, `epi_slide()` will internally -try to recycle the output so that it is size stable (in the sense described +When a computation returns a single atomic value, `epi_slide()` will internally +try to recycle the output so that it is size stable (in the sense described above). We can use this to our advantage, for example, in order to compute a trailing average marginally over geo values, which we demonstrate below in a -simple synthetic example. +simple synthetic example. 
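To make the "size stable" idea concrete before the synthetic example: a single scalar returned per window is recycled across all rows sharing that reference time, and a 2-day trailing window at position `i` covers positions `i - 1` and `i`. A standalone base-R sketch of such a trailing mean (not `epi_slide()` internals):

```{r, eval = FALSE}
# 2-day trailing mean over a plain vector
v <- c(1, 2, 3, 4)
sapply(seq_along(v), function(i) mean(v[max(1, i - 1):i]))
# 1.0 1.5 2.5 3.5
```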
```{r message = FALSE} library(epiprocess) @@ -94,32 +94,32 @@ edf <- tibble( as_epi_df() # 2-day trailing average, per geo value -edf %>% +edf %>% group_by(geo_value) %>% epi_slide(x_2dav = mean(x), before = 1) %>% ungroup() -# 2-day trailing average, marginally -edf %>% +# 2-day trailing average, marginally +edf %>% epi_slide(x_2dav = mean(x), before = 1) ``` ```{r, include = FALSE} # More checks (not included) -edf %>% +edf %>% epi_slide(x_2dav = mean(x), before = 1, ref_time_values = as.Date("2020-06-02")) -edf %>% +edf %>% # pretend that observations about time_value t are reported in version t (nowcasts) - mutate(version = time_value) %>% + mutate(version = time_value) %>% as_epi_archive() %>% group_by(geo_value) %>% epix_slide(x_2dav = mean(x), before = 1, ref_time_values = as.Date("2020-06-02")) %>% ungroup() -edf %>% +edf %>% # pretend that observations about time_value t are reported in version t (nowcasts) - mutate(version = time_value) %>% + mutate(version = time_value) %>% as_epi_archive() %>% group_by(geo_value) %>% epix_slide(~ mean(.x$x), before = 1, ref_time_values = as.Date("2020-06-02")) %>% @@ -127,35 +127,35 @@ edf %>% ``` When the slide computation returns an atomic vector (rather than a single value) -`epi_slide()` checks whether its return length ensures size stability, and if +`epi_slide()` checks whether its return length ensures size stability, and if so, uses it to fill the new column. For example, this next computation gives the same result as the last one. ```{r} -edf %>% +edf %>% epi_slide(y_2dav = rep(mean(x), 3), before = 1) ``` However, if the output is an atomic vector (rather than a single value) and it -is *not* size stable, then `epi_slide()` throws an error. For example, below we +is *not* size stable, then `epi_slide()` throws an error. For example, below we are trying to return 2 things for 3 states. ```{r, error = TRUE} -edf %>% +edf %>% epi_slide(x_2dav = rep(mean(x), 2), before = 1) ``` ## Multi-column outputs -Now we move on to outputs that are data frames with a single row but multiple -columns. Working with this type of output structure has in fact has already been +Now we move on to outputs that are data frames with a single row but multiple +columns. Working with this type of output structure has in fact has already been demonstrated in the [slide vignette](https://cmu-delphi.github.io/epiprocess/articles/slide.html). When -we set `as_list_col = TRUE` in the call to `epi_slide()`, the resulting `epi_df` +we set `as_list_col = TRUE` in the call to `epi_slide()`, the resulting `epi_df` object returned by `epi_slide()` has a list column containing the slide values. ```{r} -edf2 <- edf %>% +edf2 <- edf %>% group_by(geo_value) %>% epi_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), before = 1, as_list_col = TRUE) %>% @@ -166,15 +166,15 @@ length(edf2$a) edf2$a[[2]] ``` -When we use `as_list_col = FALSE` (the default in `epi_slide()`), the function -unnests (in the sense of `tidyr::unnest()`) the list column `a`, so that the +When we use `as_list_col = FALSE` (the default in `epi_slide()`), the function +unnests (in the sense of `tidyr::unnest()`) the list column `a`, so that the resulting `epi_df` has multiple new columns containing the slide values. 
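This unnesting is essentially what you would get by calling `tidyr::unnest()` on the list column yourself. A sketch, reusing the `edf2` object constructed above:

```{r, eval = FALSE}
library(tidyr)
edf2 %>% unnest(cols = a, names_sep = "_") # produces columns a_x_2dav and a_x_2dma
```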
The -default is to name these unnested columns by prefixing the name assigned to the +default is to name these unnested columns by prefixing the name assigned to the list column (here `a`) onto the column names of the output data frame from the slide computation (here `x_2dav` and `x_2dma`) separated by "_". ```{r} -edf %>% +edf %>% group_by(geo_value) %>% epi_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), before = 1, as_list_col = FALSE) %>% @@ -185,7 +185,7 @@ We can use `names_sep = NULL` (which gets passed to `tidyr::unnest()`) to drop the prefix associated with list column name, in naming the unnested columns. ```{r} -edf %>% +edf %>% group_by(geo_value) %>% epi_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), before = 1, as_list_col = FALSE, names_sep = NULL) %>% @@ -196,20 +196,20 @@ Furthermore, `epi_slide()` will recycle the single row data frame as needed in order to make the result size stable, just like the case for atomic values. ```{r} -edf %>% +edf %>% epi_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), before = 1, as_list_col = FALSE, names_sep = NULL) ``` ```{r, include = FALSE} # More checks (not included) -edf %>% +edf %>% epi_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), ref_time_values = as.Date("2020-06-02"), before = 1, as_list_col = FALSE, names_sep = NULL) -edf %>% - mutate(version = time_value) %>% +edf %>% + mutate(version = time_value) %>% as_epi_archive() %>% group_by(geo_value) %>% epix_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), @@ -220,8 +220,8 @@ edf %>% ## Multi-row outputs -For a slide computation that outputs a data frame with more than one row, the -behavior is analogous to a slide computation that outputs an atomic vector. +For a slide computation that outputs a data frame with more than one row, the +behavior is analogous to a slide computation that outputs an atomic vector. Meaning, `epi_slide()` will check that the result is size stable, and if so, will fill the new column(s) in the resulting `epi_df` object appropriately. @@ -241,9 +241,9 @@ edf %>% obj <- lm(y ~ x, data = d) return( as.data.frame( - predict(obj, newdata = d %>% + predict(obj, newdata = d %>% group_by(geo_value) %>% - filter(time_value == max(time_value)), + filter(time_value == max(time_value)), interval = "prediction", level = 0.9) )) }, before = 1, new_col_name = "fc", names_sep = NULL) @@ -279,24 +279,24 @@ library(ggplot2) theme_set(theme_bw()) y1 <- covidcast( - data_source = "doctor-visits", + source = "doctor-visits", signals = "smoothed_adj_cli", - time_type = "day", geo_type = "state", - time_value = epirange(20200601, 20211201), + time_type = "day", geo_values = "ca,fl", + time_value = epirange(20200601, 20211201), issues = epirange(20200601, 20211201) -) %>% fetch() +) -y2 <- covidcast( - data_source = "jhu-csse", +y2 <- pub_covidcast( + source = "jhu-csse", signal = "confirmed_7dav_incidence_prop", - time_type = "day", geo_type = "state", - time_value = epirange(20200601, 20211201), + time_type = "day", geo_values = "ca,fl", + time_value = epirange(20200601, 20211201), issues = epirange(20200601, 20211201) -) %>% fetch() +) x <- y1 %>% select(geo_value, time_value, @@ -328,48 +328,48 @@ x <- archive_cases_dv_subset$DT %>% ``` -Next, we extend the ARX function to handle multiple geo values, since in the +Next, we extend the ARX function to handle multiple geo values, since in the present case, we will not be grouping by geo value and each slide computation -will be run on multiple geo values at once. 
Note that, because `epix_slide()` +will be run on multiple geo values at once. Note that, because `epix_slide()` only returns the grouping variables, `time_value`, and the slide computations in -the eventual returned tibble, we need to include `geo_value` as a column in the +the eventual returned tibble, we need to include `geo_value` as a column in the output data frame from our ARX computation. ```{r} library(tidyr) library(purrr) -prob_arx_args <- function(lags = c(0, 7, 14), - ahead = 7, +prob_arx_args <- function(lags = c(0, 7, 14), + ahead = 7, min_train_window = 20, - lower_level = 0.05, - upper_level = 0.95, - symmetrize = TRUE, + lower_level = 0.05, + upper_level = 0.95, + symmetrize = TRUE, intercept = FALSE, nonneg = TRUE) { - return(list(lags = lags, - ahead = ahead, + return(list(lags = lags, + ahead = ahead, min_train_window = min_train_window, lower_level = lower_level, upper_level = upper_level, - symmetrize = symmetrize, + symmetrize = symmetrize, intercept = intercept, nonneg = nonneg)) } -prob_arx <- function(x, y, geo_value, time_value, args = prob_arx_args()) { +prob_arx <- function(x, y, geo_value, time_value, args = prob_arx_args()) { # Return NA if insufficient training data if (length(y) < args$min_train_window + max(args$lags) + args$ahead) { return(data.frame(geo_value = unique(geo_value), # Return geo value! point = NA, lower = NA, upper = NA)) } - + # Set up x, y, lags list if (!missing(x)) x <- data.frame(x, y) else x <- data.frame(y) if (!is.list(args$lags)) args$lags <- list(args$lags) args$lags = rep(args$lags, length.out = ncol(x)) - + # Build features and response for the AR model, and then fit it dat <- tibble(i = 1:ncol(x), lag = args$lags) %>% @@ -378,43 +378,43 @@ prob_arx <- function(x, y, geo_value, time_value, args = prob_arx_args()) { # One list element for each lagged feature pmap(function(i, lag, name) { tibble(geo_value = geo_value, - time_value = time_value + lag, # Shift back + time_value = time_value + lag, # Shift back !!name := x[,i]) - }) %>% + }) %>% # One list element for the response vector c(list( tibble(geo_value = geo_value, - time_value = time_value - args$ahead, # Shift forward + time_value = time_value - args$ahead, # Shift forward y = y))) %>% # Combine them together into one data frame reduce(full_join, by = c("geo_value", "time_value")) %>% arrange(time_value) if (args$intercept) dat$x0 = rep(1, nrow(dat)) obj <- lm(y ~ . + 0, data = select(dat, -geo_value, -time_value)) - + # Use LOCF to fill NAs in the latest feature values (do this by geo value) setDT(dat) # Convert to a data.table object by reference cols <- setdiff(names(dat), c("geo_value", "time_value")) dat[, (cols) := nafill(.SD, type = "locf"), .SDcols = cols, by = "geo_value"] - + # Make predictions test_time_value = max(time_value) - point <- predict(obj, newdata = dat %>% + point <- predict(obj, newdata = dat %>% dplyr::group_by(geo_value) %>% dplyr::filter(time_value == test_time_value)) - + # Compute bands r <- residuals(obj) s <- ifelse(args$symmetrize, -1, NA) # Should the residuals be symmetrized? q <- quantile(c(r, s * r), probs = c(args$lower, args$upper), na.rm = TRUE) lower <- point + q[1] upper <- point + q[2] - + # Clip at zero if we need to, then return - if (args$nonneg) { - point = pmax(point, 0) - lower = pmax(lower, 0) - upper = pmax(upper, 0) + if (args$nonneg) { + point = pmax(point, 0) + lower = pmax(lower, 0) + upper = pmax(upper, 0) } return(data.frame(geo_value = unique(geo_value), # Return geo value! 
point = point, lower = lower, upper = upper)) @@ -422,31 +422,31 @@ prob_arx <- function(x, y, geo_value, time_value, args = prob_arx_args()) { ``` We now make forecasts on the archive and compare to forecasts on the latest -data. +data. ```{r, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 6} # Latest snapshot of data, and forecast dates x_latest <- epix_as_of(x, max_version = max(x$DT$version)) -fc_time_values <- seq(as.Date("2020-08-01"), - as.Date("2021-11-30"), +fc_time_values <- seq(as.Date("2020-08-01"), + as.Date("2021-11-30"), by = "1 month") # Simple function to produce forecasts k weeks ahead k_week_ahead <- function(x, ahead = 7, as_of = TRUE) { if (as_of) { x %>% - epix_slide(fc = prob_arx(percent_cli, case_rate_7d_av, geo_value, time_value, - args = prob_arx_args(ahead = ahead)), + epix_slide(fc = prob_arx(percent_cli, case_rate_7d_av, geo_value, time_value, + args = prob_arx_args(ahead = ahead)), before = 119, ref_time_values = fc_time_values) %>% - mutate(target_date = time_value + ahead, as_of = TRUE, + mutate(target_date = time_value + ahead, as_of = TRUE, geo_value = fc_geo_value) } else { - x_latest %>% - epi_slide(fc = prob_arx(percent_cli, case_rate_7d_av, geo_value, time_value, - args = prob_arx_args(ahead = ahead)), + x_latest %>% + epi_slide(fc = prob_arx(percent_cli, case_rate_7d_av, geo_value, time_value, + args = prob_arx_args(ahead = ahead)), before = 119, ref_time_values = fc_time_values) %>% - mutate(target_date = time_value + ahead, as_of = FALSE) + mutate(target_date = time_value + ahead, as_of = FALSE) } } @@ -460,22 +460,22 @@ fc <- bind_rows(k_week_ahead(x, ahead = 7, as_of = TRUE), k_week_ahead(x, ahead = 21, as_of = FALSE), k_week_ahead(x, ahead = 28, as_of = FALSE)) -# Plot them, on top of latest COVID-19 case rates +# Plot them, on top of latest COVID-19 case rates ggplot(fc, aes(x = target_date, group = time_value, fill = as_of)) + geom_ribbon(aes(ymin = fc_lower, ymax = fc_upper), alpha = 0.4) + - geom_line(data = x_latest, aes(x = time_value, y = case_rate_7d_av), + geom_line(data = x_latest, aes(x = time_value, y = case_rate_7d_av), inherit.aes = FALSE, color = "gray50") + - geom_line(aes(y = fc_point)) + geom_point(aes(y = fc_point), size = 0.5) + + geom_line(aes(y = fc_point)) + geom_point(aes(y = fc_point), size = 0.5) + geom_vline(aes(xintercept = time_value), linetype = 2, alpha = 0.5) + facet_grid(vars(geo_value), vars(as_of), scales = "free") + scale_x_date(minor_breaks = "month", date_labels = "%b %y") + - labs(x = "Date", y = "Reported COVID-19 case rates") + - theme(legend.position = "none") + labs(x = "Date", y = "Reported COVID-19 case rates") + + theme(legend.position = "none") ``` We can see that these forecasts, which come from training an ARX model jointly over CA and FL, exhibit generally less variability and wider prediction bands -compared to the ones from the archive vignette, which come from training a +compared to the ones from the archive vignette, which come from training a separate ARX model on each state. As in the archive vignette, we can see a difference between version-aware (right column) and -unaware (left column) forecasting, as well. diff --git a/vignettes/aggregation.Rmd b/vignettes/aggregation.Rmd index bdf6279e..205ed084 100644 --- a/vignettes/aggregation.Rmd +++ b/vignettes/aggregation.Rmd @@ -9,7 +9,7 @@ vignette: > Aggregation, both time-wise and geo-wise, are common tasks when working with epidemiological data sets. 
This vignette demonstrates how to carry out these -kinds of tasks with `epi_df` objects. We'll work with county-level reported +kinds of tasks with `epi_df` objects. We'll work with county-level reported COVID-19 cases in MA and VT. ```{r, message = FALSE, eval= FALSE, warning= FALSE} @@ -24,15 +24,14 @@ y <- covidcast::county_census %>% select(geo_value = FIPS, county_name = CTYNAME, state_name = STNAME) # Fetch only counties from Massachusetts and Vermont, then append names columns as well -x <- covidcast( - data_source = "jhu-csse", +x <- pub_covidcast( + source = "jhu-csse", signals = "confirmed_incidence_num", - time_type = "day", geo_type = "county", + time_type = "day", + geo_values = paste(y$geo_value, collapse = ","), time_values = epirange(20200601, 20211231), - geo_values = paste(y$geo_value, collapse = ",") ) %>% - fetch() %>% select(geo_value, time_value, cases = value) %>% full_join(y, by = "geo_value") %>% as_epi_df() @@ -47,7 +46,7 @@ library(epiprocess) library(dplyr) data(jhu_csse_county_level_subset) -x <- jhu_csse_county_level_subset +x <- jhu_csse_county_level_subset ``` ## Converting to `tsibble` format @@ -59,7 +58,7 @@ basically a tibble (data frame) but with two specially-marked columns: an **index** column representing the time variable (defining an order from past to present), and a **key** column identifying a unique observational unit for each time point. In fact, the key can be made up of any number of columns, not just a -single one. +single one. In an `epi_df` object, the index variable is `time_value`, and the key variable is typically `geo_value` (though this need not always be the case: for example, @@ -101,7 +100,7 @@ head(as_tsibble(x, key = c("county_name", "state_name"))) ``` ## Detecting and filling time gaps - + One of the major advantages of the `tsibble` package is its ability to handle **implicit gaps** in time series data. In other words, it can infer what time scale we're interested in (say, daily data), and detect apparent gaps (say, when @@ -113,13 +112,15 @@ Let's first remove certain dates from our data set to create gaps: ```{r} # First make geo value more readable for tables, plots, etc. -x <- x %>% - mutate(geo_value = paste( - substr(county_name, 1, nchar(county_name) - 7), - name_to_abbr(state_name), sep = ", ")) %>% - select(geo_value, time_value, cases) - -xt <- as_tsibble(x) %>% filter(cases >= 3) +x <- x %>% + mutate(geo_value = paste( + substr(county_name, 1, nchar(county_name) - 7), + name_to_abbr(state_name), + sep = ", " + )) %>% + select(geo_value, time_value, cases) + +xt <- as_tsibble(x) %>% filter(cases >= 3) ``` The functions `has_gaps()`, `scan_gaps()`, `count_gaps()` in the `tsibble` @@ -131,20 +132,24 @@ head(scan_gaps(xt)) head(count_gaps(xt)) ``` -We can also visualize the patterns of missingness: +We can also visualize the patterns of missingness: ```{r, message = FALSE, warning = FALSE} library(ggplot2) theme_set(theme_bw()) -ggplot(count_gaps(xt), - aes(x = reorder(geo_value, desc(geo_value)), - color = geo_value)) + - geom_linerange(aes(ymin = .from, ymax = .to)) + +ggplot( + count_gaps(xt), + aes( + x = reorder(geo_value, desc(geo_value)), + color = geo_value + ) +) + + geom_linerange(aes(ymin = .from, ymax = .to)) + geom_point(aes(y = .from)) + - geom_point(aes(y = .to)) + - coord_flip() + - labs(x = "County", y = "Date") + + geom_point(aes(y = .to)) + + coord_flip() + + labs(x = "County", y = "Date") + theme(legend.position = "none") ``` @@ -168,8 +173,8 @@ went back to June 6, 2020. 
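For reference, a sketch of what the default (`.full = FALSE`) would do, assuming the `xt` tsibble from above: only the internal gaps of each series get filled, leaving its endpoints untouched.

```{r, eval = FALSE}
# fill internal gaps only; omitting `cases = 0` would insert NAs instead
fill_gaps(xt, cases = 0) %>% head()
```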
By setting `.full = TRUE`, we can at zero-fill over the entire span of the observed (censored) data. ```{r} -xt_filled <- fill_gaps(xt, cases = 0, .full = TRUE) - +xt_filled <- fill_gaps(xt, cases = 0, .full = TRUE) + head(xt_filled) ``` @@ -186,45 +191,49 @@ running `epi_slide()` on the zero-filled data brings these trailing averages 2021. ```{r} -xt %>% +xt %>% as_epi_df() %>% group_by(geo_value) %>% epi_slide(cases_7dav = mean(cases), before = 6) %>% ungroup() %>% - filter(geo_value == "Plymouth, MA", - abs(time_value - as.Date("2021-07-01")) <= 3) %>% + filter( + geo_value == "Plymouth, MA", + abs(time_value - as.Date("2021-07-01")) <= 3 + ) %>% print(n = 7) - -xt_filled %>% + +xt_filled %>% as_epi_df() %>% group_by(geo_value) %>% epi_slide(cases_7dav = mean(cases), before = 6) %>% ungroup() %>% - filter(geo_value == "Plymouth, MA", - abs(time_value - as.Date("2021-07-01")) <= 3) %>% + filter( + geo_value == "Plymouth, MA", + abs(time_value - as.Date("2021-07-01")) <= 3 + ) %>% print(n = 7) ``` -## Aggregate to different time scales +## Aggregate to different time scales Continuing on with useful `tsibble` functionality, we can aggregate to different -time scales using `index_by()` from `tsibble`, which modifies the index variable +time scales using `index_by()` from `tsibble`, which modifies the index variable in the given object by applying a suitable time-coarsening transformation (say, -moving from days to weeks, or weeks to months, and so on). The most common use -case would be to follow up with a call to a `dplyr` verb like `summarize()` in +moving from days to weeks, or weeks to months, and so on). The most common use +case would be to follow up with a call to a `dplyr` verb like `summarize()` in order to perform some kind of aggregation of our measured variables over the new index variable. -Below, we use the functions `yearweek()` and `yearmonth()` that are provided in -the `tsibble` package in order to aggregate to weekly and monthly resolutions. -In the former call, we set `week_start = 7` to coincide with the CDC definition +Below, we use the functions `yearweek()` and `yearmonth()` that are provided in +the `tsibble` package in order to aggregate to weekly and monthly resolutions. +In the former call, we set `week_start = 7` to coincide with the CDC definition of an epiweek (epidemiological week). ```{r} # Aggregate to weekly xt_filled_week <- xt_filled %>% index_by(epiweek = ~ yearweek(., week_start = 7)) %>% - group_by(geo_value) %>% + group_by(geo_value) %>% summarize(cases = sum(cases, na.rm = TRUE)) head(xt_filled_week) @@ -232,19 +241,19 @@ head(xt_filled_week) # Aggregate to monthly xt_filled_month <- xt_filled_week %>% index_by(month = ~ yearmonth(.)) %>% - group_by(geo_value) %>% - summarize(cases = sum(cases, na.rm = TRUE)) + group_by(geo_value) %>% + summarize(cases = sum(cases, na.rm = TRUE)) head(xt_filled_month) ``` -## Geographic aggregation +## Geographic aggregation TODO ## Attribution This document contains a dataset that is a modified part of the [COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19) as [republished in the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html). 
This data set is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/) by the Johns Hopkins University on behalf of its Center for Systems Science in Engineering. Copyright Johns Hopkins University 2020. -[From the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html): - These signals are taken directly from the JHU CSSE [COVID-19 GitHub repository](https://github.com/CSSEGISandData/COVID-19) without changes. +[From the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html): + These signals are taken directly from the JHU CSSE [COVID-19 GitHub repository](https://github.com/CSSEGISandData/COVID-19) without changes. diff --git a/vignettes/archive.Rmd b/vignettes/archive.Rmd index f14c8663..b351d684 100644 --- a/vignettes/archive.Rmd +++ b/vignettes/archive.Rmd @@ -33,16 +33,15 @@ library(dplyr) library(purrr) library(ggplot2) -dv <- covidcast( - data_source = "doctor-visits", +dv <- pub_covidcast( + source = "doctor-visits", signals = "smoothed_adj_cli", - time_type = "day", geo_type = "state", - time_values = epirange(20200601, 20211201), + time_type = "day", geo_values = "ca,fl,ny,tx", + time_values = epirange(20200601, 20211201), issues = epirange(20200601, 20211201) -) %>% fetch() - +) ``` ```{r, echo=FALSE, message=FALSE, warning=FALSE} @@ -68,16 +67,16 @@ has (at least) the following columns: the data for January 14, 2022 that were available one day later. As we can see from the above, the data frame returned by -`epidatr::covidcast()` has the columns required for the `epi_archive` +`epidatr::pub_covidcast()` has the columns required for the `epi_archive` format, with `issue` playing the role of `version`. We can now use `as_epi_archive()` to bring it into `epi_archive` format. For removal of -redundant version updates in `as_epi_archive` using compactify, please refer -to the compactify vignette. +redundant version updates in `as_epi_archive` using compactify, please refer to +the compactify vignette. ```{r, eval=FALSE} x <- dv %>% select(geo_value, time_value, version = issue, percent_cli = value) %>% - as_epi_archive(compactify=TRUE) + as_epi_archive(compactify = TRUE) class(x) print(x) @@ -85,8 +84,8 @@ print(x) ```{r, echo=FALSE, message=FALSE, warning=FALSE} x <- archive_cases_dv_subset$DT %>% - select(geo_value, time_value, version , percent_cli) %>% - as_epi_archive(compactify=TRUE) + select(geo_value, time_value, version, percent_cli) %>% + as_epi_archive(compactify = TRUE) class(x) print(x) @@ -107,22 +106,22 @@ for the data table, as well as any other specified in the metadata (described below). There can only be a single row per unique combination of key variables, and therefore the key variables are critical for figuring out how to generate a snapshot of data from the archive, as of a given version (also described below). - + ```{r, error=TRUE} key(x$DT) ``` - + In general, the last version of each observation is carried forward (LOCF) to fill in data between recorded versions. **A word of caution:** R6 objects, unlike most other objects in R, have reference semantics. An important consequence of this is that objects are not copied when modified. - + ```{r} original_value <- x$DT$percent_cli[1] y <- x # This DOES NOT make a copy of x -y$DT$percent_cli[1] = 0 +y$DT$percent_cli[1] <- 0 head(y$DT) -head(x$DT) +head(x$DT) x$DT$percent_cli[1] <- original_value ``` @@ -133,7 +132,7 @@ x$clone()`. 
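A minimal sketch of the safe pattern (assuming, per the note above, that `$clone()` yields an independent copy of the archive):

```{r, eval = FALSE}
y <- x$clone() # a copy that is unlinked from x
y$DT$percent_cli[1] <- 0 # modifies y only; x is untouched
```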
You can read more about reference semantics in Hadley Wickham's ## Some details on metadata The following pieces of metadata are included as fields in an `epi_archive` -object: +object: * `geo_type`: the type for the geo values. * `time_type`: the type for the time values. @@ -150,7 +149,7 @@ call (as it did in the case above). A key method of an `epi_archive` class is `as_of()`, which generates a snapshot of the archive in `epi_df` format. This represents the most up-to-date values of the signal variables as of a given version. This can be accessed via `x$as_of()` -for an `epi_archive` object `x`, but the package also provides a simple wrapper +for an `epi_archive` object `x`, but the package also provides a simple wrapper function `epix_as_of()` since this is likely a more familiar interface for users not familiar with R6 (or object-oriented programming). @@ -162,7 +161,7 @@ max(x_snapshot$time_value) attributes(x_snapshot)$metadata$as_of ``` -We can see that the max time value in the `epi_df` object `x_snapshot` that was +We can see that the max time value in the `epi_df` object `x_snapshot` that was generated from the archive is May 29, 2021, even though the specified version date was June 1, 2021. From this we can infer that the doctor's visits signal was 2 days latent on June 1. Also, we can see that the metadata in the `epi_df` @@ -171,7 +170,7 @@ object has the version date recorded in the `as_of` field. By default, using the maximum of the `version` column in the underlying data table in an `epi_archive` object itself generates a snapshot of the latest values of signal variables in the entire archive. The `epix_as_of()` function issues a warning in -this case, since updates to the current version may still come in at a later +this case, since updates to the current version may still come in at a later point in time, due to various reasons, such as synchronization issues. ```{r} @@ -180,44 +179,48 @@ x_latest <- epix_as_of(x, max_version = max(x$DT$version)) Below, we pull several snapshots from the archive, spaced one month apart. We overlay the corresponding signal curves as colored lines, with the version dates -marked by dotted vertical lines, and draw the latest curve in black (from the +marked by dotted vertical lines, and draw the latest curve in black (from the latest snapshot `x_latest` that the archive can provide). 
```{r, fig.width = 8, fig.height = 7} theme_set(theme_bw()) -self_max = max(x$DT$version) -versions = seq(as.Date("2020-06-01"), self_max - 1, by = "1 month") -snapshots <- map_dfr(versions, function(v) { +self_max <- max(x$DT$version) +versions <- seq(as.Date("2020-06-01"), self_max - 1, by = "1 month") +snapshots <- map_dfr(versions, function(v) { epix_as_of(x, max_version = v) %>% mutate(version = v) }) %>% bind_rows(x_latest %>% mutate(version = self_max)) %>% mutate(latest = version == self_max) -ggplot(snapshots %>% filter(!latest), - aes(x = time_value, y = percent_cli)) + - geom_line(aes(color = factor(version)), na.rm=TRUE) + +ggplot( + snapshots %>% filter(!latest), + aes(x = time_value, y = percent_cli) +) + + geom_line(aes(color = factor(version)), na.rm = TRUE) + geom_vline(aes(color = factor(version), xintercept = version), lty = 2) + - facet_wrap(~ geo_value, scales = "free_y", ncol = 1) + + facet_wrap(~geo_value, scales = "free_y", ncol = 1) + scale_x_date(minor_breaks = "month", date_labels = "%b %y") + - labs(x = "Date", y = "% of doctor's visits with CLI") + + labs(x = "Date", y = "% of doctor's visits with CLI") + theme(legend.position = "none") + - geom_line(data = snapshots %>% filter(latest), - aes(x = time_value, y = percent_cli), - inherit.aes = FALSE, color = "black", na.rm=TRUE) + geom_line( + data = snapshots %>% filter(latest), + aes(x = time_value, y = percent_cli), + inherit.aes = FALSE, color = "black", na.rm = TRUE + ) ``` We can see some interesting and highly nontrivial revision behavior: at some points in time the provisional data snapshots grossly underestimate the latest curve (look in particular at Florida close to the end of 2021), and at others -they overestimate it (both states towards the beginning of 2021), though not +they overestimate it (both states towards the beginning of 2021), though not quite as dramatically. Modeling the revision process, which is often called *backfill modeling*, is an important statistical problem in it of itself. -## Merging `epi_archive` objects +## Merging `epi_archive` objects Now we demonstrate how to merge two `epi_archive` objects together, e.g., so that grabbing data from multiple sources as of a particular version can be @@ -243,20 +246,19 @@ When merging archives, unless the archives have identical data release patterns, the other). ```{r, message = FALSE, warning = FALSE,eval=FALSE} -y <- covidcast( - data_source = "jhu-csse", +y <- pub_covidcast( + source = "jhu-csse", signals = "confirmed_7dav_incidence_prop", - time_type = "day", geo_type = "state", - time_values = epirange(20200601, 20211201), + time_type = "day", geo_values = "ca,fl,ny,tx", + time_values = epirange(20200601, 20211201), issues = epirange(20200601, 20211201) ) %>% - fetch() %>% select(geo_value, time_value, version = issue, case_rate_7d_av = value) %>% - as_epi_archive(compactify=TRUE) + as_epi_archive(compactify = TRUE) -x$merge(y, sync="locf", compactify=FALSE) +x$merge(y, sync = "locf", compactify = FALSE) print(x) head(x$DT) ``` @@ -273,70 +275,75 @@ documentation for either for more detailed descriptions of what mutation, pointer aliasing, and pointer reseating is possible. ## Sliding version-aware computations - + Lastly, we demonstrate another key method of the `epi_archive` class, which is the `slide()` method. It works just like `epi_slide()` does for an `epi_df` object, but with one key difference: it performs version-aware computations. 
That is, for the computation at any given reference time t, it only uses
**data that would have been available as of t**. The wrapper function is called
`epix_slide()`; again, this is just for convenience/familiarity---and its
interface is purposely designed to mirror that of `epi_slide()` for `epi_df`
objects.

For the demonstration, we'll revisit the forecasting example from the [slide
vignette](https://cmu-delphi.github.io/epiprocess/articles/slide.html), and now
we'll build a forecaster that uses properly-versioned data (that would have been
available in real-time) to forecast future COVID-19 case rates from current and
past COVID-19 case rates, as well as current and past values of the outpatient
CLI signal from medical claims. We'll extend the `prob_ar()` function from the
slide vignette to accommodate exogenous variables in the autoregressive model,
which is often referred to as an ARX model.

```{r}
prob_arx <- function(x, y, lags = c(0, 7, 14), ahead = 7, min_train_window = 20,
                     lower_level = 0.05, upper_level = 0.95, symmetrize = TRUE,
                     intercept = FALSE, nonneg = TRUE) {
  # Return NA if insufficient training data
  if (length(y) < min_train_window + max(lags) + ahead) {
    return(data.frame(point = NA, lower = NA, upper = NA))
  }

  # Useful transformations
  if (!missing(x)) {
    x <- data.frame(x, y)
  } else {
    x <- data.frame(y)
  }
  if (!is.list(lags)) lags <- list(lags)
  lags <- rep(lags, length.out = ncol(x))

  # Build features and response for the AR model, and then fit it
  dat <- do.call(
    data.frame,
    unlist( # Below we loop through and build the lagged features
      purrr::map(1:ncol(x), function(i) {
        purrr::map(lags[[i]], function(j) lag(x[, i], n = j))
      }),
      recursive = FALSE
    )
  )
  names(dat) <- paste0("x", 1:ncol(dat))
  if (intercept) dat$x0 <- rep(1, nrow(dat))
  dat$y <- lead(y, n = ahead)
  obj <- lm(y ~ . + 0, data = dat)

  # Use LOCF to fill NAs in the latest feature values, make a prediction
  setDT(dat)
  setnafill(dat, type = "locf")
  point <- predict(obj, newdata = tail(dat, 1))

  # Compute a band
  r <- residuals(obj)
  s <- ifelse(symmetrize, -1, NA) # Should the residuals be symmetrized?
  q <- quantile(c(r, s * r), probs = c(lower_level, upper_level), na.rm = TRUE)
  lower <- point + q[1]
  upper <- point + q[2]

  # Clip at zero if we need to, then return
  if (nonneg) {
    point <- max(point, 0)
    lower <- max(lower, 0)
    upper <- max(upper, 0)
  }
  return(data.frame(point = point, lower = lower, upper = upper))
}
```

Next we slide this forecaster over the working `epi_archive` object, in order
to forecast COVID-19 case rates 7 days into the future.
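Roughly, each iteration of such a slide amounts to taking a snapshot and then windowing it. A conceptual sketch of the semantics (not the actual implementation):

```{r, eval = FALSE}
# for one reference time t, with before = 119 (a 120-day window):
t <- as.Date("2020-08-01")
x_t <- epix_as_of(x, max_version = t) # only data available as of t
x_t %>% filter(time_value >= t - 119, time_value <= t)
# prob_arx() is then applied to this window, separately per geo_value group
```

The actual call: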
```{r}
fc_time_values <- seq(as.Date("2020-08-01"),
  as.Date("2021-11-30"),
  by = "1 month"
)

z <- x %>%
  group_by(geo_value) %>%
  epix_slide(
    fc = prob_arx(x = percent_cli, y = case_rate_7d_av), before = 119,
    ref_time_values = fc_time_values
  ) %>%
  ungroup()

head(z, 10)
```

On the whole, `epix_slide()` works similarly to `epi_slide()`, though there
are a few notable differences, even apart from the version-aware aspect. You
can read the documentation for `epix_slide()` for details.

We finish off by comparing version-aware and -unaware forecasts at various
points in time and forecast horizons. The former comes from using
`epix_slide()` with the `epi_archive` object `x`, and the latter from applying
`epi_slide()` to the latest snapshot of the data `x_latest`.

```{r, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 6}
# Latest snapshot of data, and forecast dates
x_latest <- epix_as_of(x, max_version = max(x$DT$version))
fc_time_values <- seq(as.Date("2020-08-01"), as.Date("2021-11-30"),
  by = "1 month"
)

# Simple function to produce forecasts k weeks ahead
k_week_ahead <- function(x, ahead = 7, as_of = TRUE) {
  if (as_of) {
    x %>%
      group_by(geo_value) %>%
      epix_slide(
        fc = prob_arx(percent_cli, case_rate_7d_av, ahead = ahead), before = 119,
        ref_time_values = fc_time_values
      ) %>%
      mutate(target_date = time_value + ahead, as_of = TRUE) %>%
      ungroup()
  } else {
    x_latest %>%
      group_by(geo_value) %>%
      epi_slide(
        fc = prob_arx(percent_cli, case_rate_7d_av, ahead = ahead), before = 119,
        ref_time_values = fc_time_values
      ) %>%
      mutate(target_date = time_value + ahead, as_of = FALSE) %>%
      ungroup()
  }
}

# Generate the forecasts, and bind them together
fc <- bind_rows(
  k_week_ahead(x, ahead = 7, as_of = TRUE),
  k_week_ahead(x, ahead = 14, as_of = TRUE),
  k_week_ahead(x, ahead = 21, as_of = TRUE),
  k_week_ahead(x, ahead = 28, as_of = TRUE),
  k_week_ahead(x, ahead = 7, as_of = FALSE),
  k_week_ahead(x, ahead = 14, as_of = FALSE),
  k_week_ahead(x, ahead = 21, as_of = FALSE),
  k_week_ahead(x, ahead = 28, as_of = FALSE)
)

# Plot them, on top of latest COVID-19 case rates
ggplot(fc, aes(x = target_date, group = time_value, fill = as_of)) +
  geom_ribbon(aes(ymin = fc_lower, ymax = fc_upper), alpha = 0.4) +
  geom_line(
    data = x_latest, aes(x = time_value, y = case_rate_7d_av),
    inherit.aes = FALSE, color = "gray50"
  ) +
  geom_line(aes(y = fc_point)) +
  geom_point(aes(y = fc_point), size = 0.5) +
  geom_vline(aes(xintercept = time_value), linetype = 2, alpha = 0.5) +
  facet_grid(vars(geo_value), vars(as_of), scales = "free") +
  scale_x_date(minor_breaks = "month",
date_labels = "%b %y") + - labs(x = "Date", y = "Reported COVID-19 case rates") + - theme(legend.position = "none") + labs(x = "Date", y = "Reported COVID-19 case rates") + + theme(legend.position = "none") ``` Each row displays the forecasts for a different location (CA, FL, NY, and TX), and each diff --git a/vignettes/correlation.Rmd b/vignettes/correlation.Rmd index 0d357402..34e8c0f0 100644 --- a/vignettes/correlation.Rmd +++ b/vignettes/correlation.Rmd @@ -23,26 +23,24 @@ library(dplyr) The data is fetched with the following query: ```{r, message = FALSE} -x <- covidcast( - data_source = "jhu-csse", +x <- pub_covidcast( + source = "jhu-csse", signals = "confirmed_7dav_incidence_prop", - time_type = "day", geo_type = "state", + time_type = "day", + geo_values = "*", time_values = epirange(20200301, 20211231), - geo_values = "*" ) %>% - fetch() %>% select(geo_value, time_value, case_rate = value) -y <- covidcast( - data_source = "jhu-csse", +y <- pub_covidcast( + source = "jhu-csse", signals = "deaths_7dav_incidence_prop", - time_type = "day", geo_type = "state", + time_type = "day", + geo_values = "*", time_values = epirange(20200301, 20211231), - geo_values = "*" ) %>% - fetch() %>% select(geo_value, time_value, death_rate = value) x <- x %>% @@ -68,7 +66,7 @@ theme_set(theme_bw()) z1 <- epi_cor(x, case_rate, death_rate, cor_by = "time_value") -ggplot(z1, aes(x = time_value, y = cor)) + +ggplot(z1, aes(x = time_value, y = cor)) + geom_line() + scale_x_date(minor_breaks = "month", date_labels = "%b %y") + labs(x = "Date", y = "Correlation") @@ -95,14 +93,16 @@ correlated with case rates at an offset of -10 days.) ```{r, message = FALSE, warning = FALSE} z2 <- epi_cor(x, case_rate, death_rate, cor_by = time_value, dt1 = -10) -z <- rbind(z1 %>% mutate(lag = 0), - z2 %>% mutate(lag = 10)) %>% +z <- rbind( + z1 %>% mutate(lag = 0), + z2 %>% mutate(lag = 10) +) %>% mutate(lag = as.factor(lag)) ggplot(z, aes(x = time_value, y = cor)) + geom_line(aes(color = lag)) + scale_x_date(minor_breaks = "month", date_labels = "%b %y") + - labs(x = "Date", y = "Correlation", col = "Lag") + labs(x = "Date", y = "Correlation", col = "Lag") ``` Note that `epi_cor()` takes an argument `shift_by` that determines the grouping @@ -117,53 +117,56 @@ days from now. ## Correlations grouped by state The second option we have is to group by geo value, obtained by setting `cor_by -= geo_value`. We'll again look at correlations for both 0- and 10-day lagged += geo_value`. We'll again look at correlations for both 0- and 10-day lagged case rates. ```{r, message = FALSE, warning = FALSE} z1 <- epi_cor(x, case_rate, death_rate, cor_by = geo_value) z2 <- epi_cor(x, case_rate, death_rate, cor_by = geo_value, dt1 = -10) -z <- rbind(z1 %>% mutate(lag = 0), - z2 %>% mutate(lag = 10)) %>% +z <- rbind( + z1 %>% mutate(lag = 0), + z2 %>% mutate(lag = 10) +) %>% mutate(lag = as.factor(lag)) ggplot(z, aes(cor)) + geom_density(aes(fill = lag, col = lag), alpha = 0.5) + - labs(x = "Correlation", y = "Density", fill = "Lag", col = "Lag") + labs(x = "Correlation", y = "Density", fill = "Lag", col = "Lag") ``` -We can again see that, generally speaking, lagging the case rates back by 10 +We can again see that, generally speaking, lagging the case rates back by 10 days improves the correlations. ## More systematic lag analysis -Next we perform a more systematic investigation of the correlations over a broad -range of lag values. +Next we perform a more systematic investigation of the correlations over a broad +range of lag values. 
```{r, message = FALSE, warning = FALSE}
library(purrr)
lags <- 0:35

z <- map_dfr(lags, function(lag) {
  epi_cor(x, case_rate, death_rate, cor_by = geo_value, dt1 = -lag) %>%
    mutate(lag = .env$lag)
})

z %>%
  group_by(lag) %>%
  summarize(mean = mean(cor, na.rm = TRUE)) %>%
  ggplot(aes(x = lag, y = mean)) +
  geom_line() +
  geom_point() +
  labs(x = "Lag", y = "Mean correlation")
```

We can see some pretty clear curvature in the mean correlation between
case and death rates (where the correlations come from grouping by geo value) as
a function of lag. The maximum occurs at a lag of somewhere around 17 days.

## Attribution
This document contains a dataset that is a modified part of the [COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19) as [republished in the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html). This data set is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/) by the Johns Hopkins University on behalf of its Center for Systems Science in Engineering. Copyright Johns Hopkins University 2020.

[From the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html):
  These signals are taken directly from the JHU CSSE [COVID-19 GitHub repository](https://github.com/CSSEGISandData/COVID-19) without changes.
 diff --git a/vignettes/epiprocess.Rmd b/vignettes/epiprocess.Rmd
index ebab0ec9..a1b52daa 100644
--- a/vignettes/epiprocess.Rmd
+++ b/vignettes/epiprocess.Rmd
@@ -5,13 +5,13 @@ vignette: >
   %\VignetteIndexEntry{Get started with `epiprocess`}
   %\VignetteEncoding{UTF-8}
   %\VignetteEngine{knitr::rmarkdown}
-editor_options: 
+editor_options:
   chunk_output_type: console
 ---

This package introduces a common data structure for epidemiological data sets
measured over space and time, and offers associated utilities to perform basic
signal processing tasks.

## Installing

This package is not on CRAN yet, so it can be installed using the
[`devtools`](https://devtools.r-lib.org) package:

```{r, eval = FALSE}
devtools::install_github("cmu-delphi/epiprocess", ref = "main")
```

Building the vignettes, such as this getting started guide, takes a significant
amount of time. They are not included in the package by default.
If you want to include vignettes, then use this modified command: ```{r, eval = FALSE} -devtools::install_github("cmu-delphi/epiprocess", ref = "main", - build_vignettes = TRUE, dependencies = TRUE) +devtools::install_github("cmu-delphi/epiprocess", + ref = "main", + build_vignettes = TRUE, dependencies = TRUE +) ``` ## Getting data into `epi_df` format -We'll start by showing how to get data into +We'll start by showing how to get data into epi_df format, which is just a tibble with a bit of special structure, and is the format assumed by all of the functions in the `epiprocess` package. An `epi_df` object has (at least) the @@ -60,28 +62,29 @@ library(epiprocess) library(dplyr) library(withr) -cases <- covidcast( - data_source = "jhu-csse", +cases <- pub_covidcast( + source = "jhu-csse", signals = "confirmed_cumulative_num", - time_type = "day", geo_type = "state", + time_type = "day", + geo_values = "ca,fl,ny,tx", time_values = epirange(20200301, 20220131), - geo_values = "ca,fl,ny,tx" -) %>% fetch() +) colnames(cases) ``` -As we can see, a data frame returned by `epidatr::covidcast()` has the +As we can see, a data frame returned by `epidatr::pub_covidcast()` has the columns required for an `epi_df` object (along with many others). We can use `as_epi_df()`, with specification of some relevant metadata, to bring the data frame into `epi_df` format. ```{r, message = FALSE} -x <- as_epi_df(cases, - geo_type = "state", - time_type = "day", - as_of = max(cases$issue)) %>% +x <- as_epi_df(cases, + geo_type = "state", + time_type = "day", + as_of = max(cases$issue) +) %>% select(geo_value, time_value, total_cases = value) class(x) @@ -93,7 +96,7 @@ attributes(x)$metadata ## Some details on metadata In general, an `epi_df` object has the following fields in its metadata: - + * `geo_type`: the type for the geo values. * `time_type`: the type for the time values. * `as_of`: the time value at which the given data were available. @@ -115,10 +118,10 @@ data set. See the [archive vignette](https://cmu-delphi.github.io/epiprocess/articles/archive.html) for more. -If any of the `geo_type`, `time_type`, or `as_of` arguments are missing in a +If any of the `geo_type`, `time_type`, or `as_of` arguments are missing in a call to `as_epi_df()`, then this function will try to infer them from the passed object. Usually, `geo_type` and `time_type` can be inferred from the `geo_value` -and `time_value` columns, respectively, but inferring the `as_of` field is not +and `time_value` columns, respectively, but inferring the `as_of` field is not as easy. See the documentation for `as_epi_df()` more details. 
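For instance, a sketch of relying on inference (reusing the `cases` data frame from above; the exact fallback behavior for `as_of` is described in `?as_epi_df`):

```{r, eval = FALSE}
x_inferred <- cases %>%
  select(geo_value, time_value, total_cases = value) %>%
  as_epi_df() # geo_type and time_type inferred from the columns
attr(x_inferred, "metadata") # inspect what was filled in, especially as_of
```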
```{r} @@ -135,13 +138,16 @@ In the following examples we will show how to create an `epi_df` with additional ```{r} ex1 <- tibble( geo_value = rep(c("ca", "fl", "pa"), each = 3), - county_code = c("06059","06061","06067", - "12111","12113","12117", - "42101","42103","42105"), + county_code = c( + "06059", "06061", "06067", + "12111", "12113", "12117", + "42101", "42103", "42105" + ), time_value = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"), - by = "day"), length.out = length(geo_value)), + by = "day" + ), length.out = length(geo_value)), value = 1:length(geo_value) + 0.01 * withr::with_rng_version("3.0.0", withr::with_seed(42, length(geo_value))) - ) %>% +) %>% as_tsibble(index = time_value, key = c(geo_value, county_code)) ex1 <- as_epi_df(x = ex1, geo_type = "state", time_type = "day", as_of = "2020-06-03") @@ -149,11 +155,11 @@ ex1 <- as_epi_df(x = ex1, geo_type = "state", time_type = "day", as_of = "2020-0 The metadata now includes `county_code` as an extra key. ```{r} -attr(ex1,"metadata") +attr(ex1, "metadata") ``` -### Dealing with misspecified column names +### Dealing with misspecified column names `epi_df` requires there to be columns `geo_value` and `time_value`, if they do not exist then `as_epi_df()` throws an error. ```{r, error = TRUE} @@ -161,56 +167,62 @@ data.frame( state = rep(c("ca", "fl", "pa"), each = 3), # misnamed pol = rep(c("blue", "swing", "swing"), each = 3), # extra key reported_date = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"), - by = "day"), length.out = length(geo_value)), # misnamed + by = "day" + ), length.out = length(geo_value)), # misnamed value = 1:length(geo_value) + 0.01 * withr::with_rng_version("3.0.0", withr::with_seed(42, length(geo_value))) -) %>% as_epi_df() +) %>% as_epi_df() ``` -The columns can be renamed to match `epi_df` format. In the example below, notice there is also an additional key `pol`. +The columns can be renamed to match `epi_df` format. In the example below, notice there is also an additional key `pol`. ```{r} ex2 <- tibble( state = rep(c("ca", "fl", "pa"), each = 3), # misnamed pol = rep(c("blue", "swing", "swing"), each = 3), # extra key reported_date = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"), - by = "day"), length.out = length(state)), # misnamed + by = "day" + ), length.out = length(state)), # misnamed value = 1:length(state) + 0.01 * withr::with_rng_version("3.0.0", withr::with_seed(42, length(state))) ) %>% data.frame() -head(ex2) +head(ex2) -ex2 <- ex2 %>% rename(geo_value = state, time_value = reported_date) %>% - as_epi_df(geo_type = "state", as_of = "2020-06-03", - additional_metadata = list(other_keys = "pol")) +ex2 <- ex2 %>% + rename(geo_value = state, time_value = reported_date) %>% + as_epi_df( + geo_type = "state", as_of = "2020-06-03", + additional_metadata = list(other_keys = "pol") + ) -attr(ex2,"metadata") +attr(ex2, "metadata") ``` ### Adding additional keys to an `epi_df` object -In the above examples, all the keys are added to objects that are not `epi_df` objects. We illustrate how to add keys to an `epi_df` object. +In the above examples, all the keys are added to objects that are not `epi_df` objects. We illustrate how to add keys to an `epi_df` object. We use a toy data set included in `epiprocess` prepared using the `covidcast` library and are filtering to a single state for simplicity. 
```{r} ex3 <- jhu_csse_county_level_subset %>% filter(time_value > "2021-12-01", state_name == "Massachusetts") %>% - slice_tail(n = 6) - -attr(ex3,"metadata") # geo_type is county currently + slice_tail(n = 6) + +attr(ex3, "metadata") # geo_type is county currently ``` -Now we add `state` (MA) and `pol` as new columns to the data and as new keys to the metadata. Reminder that lower case state name abbreviations are what we would expect if this were a `geo_value` column. +Now we add `state` (MA) and `pol` as new columns to the data and as new keys to the metadata. Reminder that lower case state name abbreviations are what we would expect if this were a `geo_value` column. ```{r} -ex3 <- ex3 %>% +ex3 <- ex3 %>% as_tibble() %>% # needed to add the additional metadata mutate( - state = rep(tolower("MA"),6), - pol = rep(c("blue", "swing", "swing"), each = 2)) %>% + state = rep(tolower("MA"), 6), + pol = rep(c("blue", "swing", "swing"), each = 2) + ) %>% as_epi_df(additional_metadata = list(other_keys = c("state", "pol"))) -attr(ex3,"metadata") +attr(ex3, "metadata") ``` Note that the two additional keys we added, `state` and `pol`, are specified as a character vector in the `other_keys` component of the `additional_metadata` list. They must be specified in this manner so that downstream actions on the `epi_df`, like model fitting and prediction, can recognize and use these keys. @@ -229,15 +241,15 @@ like plotting, which is pretty easy to do `ggplot2`. library(ggplot2) theme_set(theme_bw()) -ggplot(x, aes(x = time_value, y = total_cases, color = geo_value)) + +ggplot(x, aes(x = time_value, y = total_cases, color = geo_value)) + geom_line() + scale_x_date(minor_breaks = "month", date_labels = "%b %y") + labs(x = "Date", y = "Cumulative COVID-19 cases", color = "State") ``` -As a last couple examples, we'll look at some more data sets just to show how -we might get them into `epi_df` format. Data on daily new (not cumulative) SARS -cases in Canada in 2003, from the +As a last couple examples, we'll look at some more data sets just to show how +we might get them into `epi_df` format. 
Data on daily new (not cumulative) SARS
+cases in Canada in 2003, from the
 [outbreaks](https://github.com/reconverse/outbreaks) package:
 
 ```{r}
@@ -249,12 +261,12 @@ x <- outbreaks::sars_canada_2003 %>%
 head(x)
 
 library(tidyr)
-x <- x %>% 
+x <- x %>%
   pivot_longer(starts_with("cases"), names_to = "type") %>%
   mutate(type = substring(type, 7))
 
-yrange <- range(x %>% group_by(time_value) %>% 
-  summarize(value = sum(value)) %>% pull(value)) 
+yrange <- range(x %>% group_by(time_value) %>%
+  summarize(value = sum(value)) %>% pull(value))
 
 ggplot(x, aes(x = time_value, y = value)) +
   geom_col(aes(fill = type)) +
@@ -271,22 +283,29 @@ x <- outbreaks::ebola_sierraleone_2014 %>%
     cases = ifelse(status == "confirmed", 1, 0),
     province = case_when(
       district %in% c("Kailahun", "Kenema", "Kono") ~ "Eastern",
-      district %in% c("Bombali", "Kambia", "Koinadugu",
-                      "Port Loko", "Tonkolili") ~ "Northern",
+      district %in% c(
+        "Bombali", "Kambia", "Koinadugu",
+        "Port Loko", "Tonkolili"
+      ) ~ "Northern",
       district %in% c("Bo", "Bonthe", "Moyamba", "Pujehun") ~ "Southern",
-      district %in% c("Western Rural", "Western Urban") ~ "Western")) %>%
-  select(geo_value = province, 
-         time_value = date_of_onset, 
-         cases) %>% filter(cases==1) %>% 
-  group_by(geo_value, time_value) %>% 
+      district %in% c("Western Rural", "Western Urban") ~ "Western"
+    )
+  ) %>%
+  select(
+    geo_value = province,
+    time_value = date_of_onset,
+    cases
+  ) %>%
+  filter(cases == 1) %>%
+  group_by(geo_value, time_value) %>%
   summarise(cases = sum(cases)) %>%
-  as_epi_df(geo_type="province")
+  as_epi_df(geo_type = "province")
 
-ggplot(x, aes(x = time_value, y = cases)) + 
-  geom_col(aes(fill = geo_value), show.legend = FALSE) + 
-  facet_wrap(~ geo_value, scales = "free_y") +
+ggplot(x, aes(x = time_value, y = cases)) +
+  geom_col(aes(fill = geo_value), show.legend = FALSE) +
+  facet_wrap(~geo_value, scales = "free_y") +
   scale_x_date(minor_breaks = "month", date_labels = "%b %y") +
-  labs(x = "Date", y = "Confirmed cases of Ebola in Sierra Leone") 
+  labs(x = "Date", y = "Confirmed cases of Ebola in Sierra Leone")
 ```
 
@@ -294,6 +313,6 @@ ggplot(x, aes(x = time_value, y = cases)) +
 ## Attribution
 
 This document contains a dataset that is a modified part of the [COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19) as [republished in the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html). This data set is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/) by the Johns Hopkins University on behalf of its Center for Systems Science in Engineering. Copyright Johns Hopkins University 2020.
 
-[From the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html): 
- These signals are taken directly from the JHU CSSE [COVID-19 GitHub repository](https://github.com/CSSEGISandData/COVID-19) without changes. 
+[From the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html):
+  These signals are taken directly from the JHU CSSE [COVID-19 GitHub repository](https://github.com/CSSEGISandData/COVID-19) without changes.
diff --git a/vignettes/growth_rate.Rmd b/vignettes/growth_rate.Rmd
index 9c185b15..4fb4eda5 100644
--- a/vignettes/growth_rate.Rmd
+++ b/vignettes/growth_rate.Rmd
@@ -9,7 +9,7 @@ vignette: >
 
 A basic way of assessing growth in a signal is to look at its relative change
 over two neighboring time windows. The `epiprocess` package provides a function
-`growth_rate()` to compute such relative changes, as well as more sophisticated 
+`growth_rate()` to compute such relative changes, as well as more sophisticated
 estimates of the growth rate of a signal. We investigate this functionality in the
 current vignette, applied to state-level daily reported COVID-19 cases from GA
 and PA, smoothed using a 7-day trailing average.
@@ -23,15 +23,14 @@ library(tidyr)
 The data is fetched with the following query:
 
 ```{r, message = FALSE, eval=F}
-x <- covidcast(
-  data_source = "jhu-csse",
+x <- pub_covidcast(
+  source = "jhu-csse",
   signals = "confirmed_7dav_incidence_num",
-  time_type = "day",
   geo_type = "state",
+  time_type = "day",
+  geo_values = "ga,pa",
   time_values = epirange(20200601, 20211231),
-  geo_values = "ga,pa"
 ) %>%
-  fetch() %>%
   select(geo_value, time_value, cases = value) %>%
   arrange(geo_value, time_value) %>%
   as_epi_df()
@@ -56,16 +55,16 @@ $t$ is defined as $f'(t)/f(t)$, where $f'(t)$ is the derivative of $f$ at $t$.
 To estimate the growth rate of a signal in discrete-time (which can be thought
 of as evaluations or discretizations of an underlying function in
 continuous-time), we can estimate the derivative and divide by the signal value
-itself (or possibly a smoothed version of the signal value). 
+itself (or possibly a smoothed version of the signal value).
 
 The `growth_rate()` function takes a sequence of underlying design points `x`
 and corresponding sequence `y` of signal values, and allows us to choose from
 the following methods for estimating the growth rate at a given reference point
-`x0`, by setting the `method` argument: 
+`x0`, by setting the `method` argument:
 
 * "rel_change": uses $(\bar B/\bar A - 1) / h$, where $\bar B$ is the average of
   `y` over the second half of a sliding window of bandwidth `h` centered at the
-  reference point `x0`, and $\bar A$ the average over the first half. This can 
+  reference point `x0`, and $\bar A$ the average over the first half. This can
   be seen as using a first-difference approximation to the derivative.
 * "linear_reg": uses the slope from a linear regression of `y` on `x` over a
   sliding window centered at the reference point `x0`, divided by the fitted
@@ -79,7 +78,7 @@ the following methods for estimating the growth rate at a given reference point
   at `x0`.
 
 The default in `growth_rate()` is `x0 = x`, so that it returns an estimate of
-the growth rate at each underlying design point. 
+the growth rate at each underlying design point.
 
 ## Relative change
 
@@ -90,8 +89,8 @@ a call to `dplyr::mutate()` to append a new column to our `epi_df` object with
 the computed growth rates.
 
 ```{r}
-x <- x %>% 
-  group_by(geo_value) %>% 
+x <- x %>%
+  group_by(geo_value) %>%
   mutate(cases_gr1 = growth_rate(time_value, cases))
 
 head(x, 10)
@@ -99,7 +98,7 @@ head(x, 10)
 
 We can visualize these growth rate estimates by plotting the signal values and
 highlighting the periods in time for which the relative change is above 1% (in
-red) and below -1% (in blue), faceting by geo value. 
+red) and below -1% (in blue), faceting by geo value.
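+
+(Before plotting, a quick sanity check of the estimator itself on made-up
+data -- a hypothetical sketch, separate from the COVID-19 example: for the
+exact exponential $y = e^{0.02t}$, the true growth rate is 0.02 everywhere,
+and "rel_change" should roughly recover this away from the boundary.)
+
+```{r, eval = FALSE}
+t <- 1:100
+growth_rate(t, exp(0.02 * t))[50] # roughly 0.02 in the interior
+```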
 ```{r, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 4}
 library(ggplot2)
 theme_set(theme_bw())
 
 upper = 0.01
 lower = -0.01
 
-ggplot(x, aes(x = time_value, y = cases)) + 
+ggplot(x, aes(x = time_value, y = cases)) +
   geom_tile(data = x %>% filter(cases_gr1 >= upper),
-            aes(x = time_value, y = 0, width = 7, height = Inf),
-            fill = 2, alpha = 0.08) + 
+    aes(x = time_value, y = 0, width = 7, height = Inf),
+    fill = 2, alpha = 0.08) +
   geom_tile(data = x %>% filter(cases_gr1 <= lower),
-            aes(x = time_value, y = 0, width = 7, height = Inf),
-            fill = 4, alpha = 0.08) + 
-  geom_line() + 
+    aes(x = time_value, y = 0, width = 7, height = Inf),
+    fill = 4, alpha = 0.08) +
+  geom_line() +
   facet_wrap(vars(geo_value), scales = "free_y") +
   scale_x_date(minor_breaks = "month", date_labels = "%b %y") +
   labs(x = "Date", y = "Reported COVID-19 cases")
 ```
 
 As a more direct visualization, we plot the estimated growth rates themselves,
-overlaying the curves for the two states on one plot. 
+overlaying the curves for the two states on one plot.
 
 ```{r, message = FALSE, warning = FALSE}
-ggplot(x, aes(x = time_value, y = cases_gr1)) + 
-  geom_line(aes(col = geo_value)) + 
+ggplot(x, aes(x = time_value, y = cases_gr1)) +
+  geom_line(aes(col = geo_value)) +
   geom_hline(yintercept = upper, linetype = 2, col = 2) +
   geom_hline(yintercept = lower, linetype = 2, col = 4) +
   scale_color_manual(values = c(3,6)) +
   scale_x_date(minor_breaks = "month", date_labels = "%b %y") +
-  labs(x = "Date", y = "Growth rate", col = "State") 
+  labs(x = "Date", y = "Growth rate", col = "State")
 ```
 
 We can see that the estimated growth rates from the relative change method are
@@ -150,23 +149,23 @@ again `h = 7`. Compared to "rel_change", it appears to behave similarly
 overall, but thankfully avoids some of the troublesome spikes:
 
 ```{r, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 7}
-x <- x %>% 
-  group_by(geo_value) %>% 
+x <- x %>%
+  group_by(geo_value) %>%
   mutate(cases_gr2 = growth_rate(time_value, cases, method = "linear_reg"))
 
 x %>%
   pivot_longer(cols = starts_with("cases_gr"),
-               names_to = "method", 
+               names_to = "method",
               values_to = "gr") %>%
   mutate(method = recode(method,
                          cases_gr1 = "rel_change",
                          cases_gr2 = "linear_reg")) %>%
-  ggplot(aes(x = time_value, y = gr)) + 
-  geom_line(aes(col = method)) + 
+  ggplot(aes(x = time_value, y = gr)) +
+  geom_line(aes(col = method)) +
   scale_color_manual(values = c(2,4)) +
   facet_wrap(vars(geo_value), scales = "free_y", ncol = 1) +
   scale_x_date(minor_breaks = "month", date_labels = "%b %y") +
-  labs(x = "Date", y = "Growth rate", col = "Method") 
+  labs(x = "Date", y = "Growth rate", col = "Method")
 ```
 
 ## Nonparametric estimation
 
@@ -179,32 +178,32 @@ particular implementations and default settings for these methods:
 "trend_filter" is based on a full solution path algorithm provided in the
 `genlasso` package, and performs cross-validation by default in order to pick
 the level of regularization; read the documentation for `growth_rate()` for more
-details.) 
+details.)
 ```{r, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 7}
-x <- x %>% 
-  group_by(geo_value) %>% 
+x <- x %>%
+  group_by(geo_value) %>%
   mutate(cases_gr3 = growth_rate(time_value, cases, method = "smooth_spline"),
          cases_gr4 = growth_rate(time_value, cases, method = "trend_filter"))
 
 x %>%
   select(geo_value, time_value, cases_gr3, cases_gr4) %>%
   pivot_longer(cols = starts_with("cases_gr"),
-               names_to = "method", 
+               names_to = "method",
                values_to = "gr") %>%
   mutate(method = recode(method,
                          cases_gr3 = "smooth_spline",
                          cases_gr4 = "trend_filter")) %>%
-  ggplot(aes(x = time_value, y = gr)) + 
-  geom_line(aes(col = method)) + 
+  ggplot(aes(x = time_value, y = gr)) +
+  geom_line(aes(col = method)) +
   scale_color_manual(values = c(3,6)) +
   facet_wrap(vars(geo_value), scales = "free_y", ncol = 1) +
   scale_x_date(minor_breaks = "month", date_labels = "%b %y") +
-  labs(x = "Date", y = "Growth rate", col = "Method") 
+  labs(x = "Date", y = "Growth rate", col = "Method")
 ```
 
-In this particular example, the trend filtering estimates of growth rate appear 
-to be much more stable than those from the smoothing spline, and also much more 
+In this particular example, the trend filtering estimates of growth rate appear
+to be much more stable than those from the smoothing spline, and also much more
 stable than the estimates from local relative changes and linear regressions.
 
 The smoothing spline growth rate estimates are based on the default settings in
@@ -220,79 +219,79 @@ the documentation for `growth_rate()` gives the full details.
 
 In general, an alternative view for the growth rate of a function $f$ is given
 by defining $g(t) = \log(f(t))$, and then observing that $g'(t) = f'(t)/f(t)$.
 Therefore, any method that estimates the derivative can be simply applied to the
-log of the signal of interest, and in this light, each method above 
+log of the signal of interest, and in this light, each method above
 ("rel_change", "linear_reg", "smooth_spline", and "trend_filter") has a log
 scale analog, which can be used by setting the argument `log_scale = TRUE` in
 the call to `growth_rate()`.
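+
+(For intuition, the same kind of made-up exponential is exactly linear on the
+log scale, so, for instance, "linear_reg" with `log_scale = TRUE` should
+recover the rate almost exactly -- again a toy sketch, not evaluated here:)
+
+```{r, eval = FALSE}
+t <- 1:100
+growth_rate(t, exp(0.02 * t), method = "linear_reg", log_scale = TRUE)[50]
+```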
 ```{r, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 7}
-x <- x %>% 
-  group_by(geo_value) %>% 
-  mutate(cases_gr5 = growth_rate(time_value, cases, method = "rel_change", 
+x <- x %>%
+  group_by(geo_value) %>%
+  mutate(cases_gr5 = growth_rate(time_value, cases, method = "rel_change",
                                  log_scale = TRUE),
-         cases_gr6 = growth_rate(time_value, cases, method = "linear_reg", 
+         cases_gr6 = growth_rate(time_value, cases, method = "linear_reg",
                                  log_scale = TRUE),
-         cases_gr7 = growth_rate(time_value, cases, method = "smooth_spline", 
+         cases_gr7 = growth_rate(time_value, cases, method = "smooth_spline",
                                  log_scale = TRUE),
-         cases_gr8 = growth_rate(time_value, cases, method = "trend_filter", 
+         cases_gr8 = growth_rate(time_value, cases, method = "trend_filter",
                                  log_scale = TRUE))
 
 x %>%
   select(geo_value, time_value, cases_gr5, cases_gr6) %>%
   pivot_longer(cols = starts_with("cases_gr"),
-               names_to = "method", 
+               names_to = "method",
                values_to = "gr") %>%
   mutate(method = recode(method,
                          cases_gr5 = "rel_change_log",
                          cases_gr6 = "linear_reg_log")) %>%
-  ggplot(aes(x = time_value, y = gr)) + 
-  geom_line(aes(col = method)) + 
+  ggplot(aes(x = time_value, y = gr)) +
+  geom_line(aes(col = method)) +
   scale_color_manual(values = c(2,4)) +
   facet_wrap(vars(geo_value), scales = "free_y", ncol = 1) +
   scale_x_date(minor_breaks = "month", date_labels = "%b %y") +
-  labs(x = "Date", y = "Growth rate", col = "Method") 
+  labs(x = "Date", y = "Growth rate", col = "Method")
 
 x %>%
   select(geo_value, time_value, cases_gr7, cases_gr8) %>%
   pivot_longer(cols = starts_with("cases_gr"),
-               names_to = "method", 
+               names_to = "method",
                values_to = "gr") %>%
   mutate(method = recode(method,
                          cases_gr7 = "smooth_spline_log",
                          cases_gr8 = "trend_filter_log")) %>%
-  ggplot(aes(x = time_value, y = gr)) + 
-  geom_line(aes(col = method)) + 
+  ggplot(aes(x = time_value, y = gr)) +
+  geom_line(aes(col = method)) +
   scale_color_manual(values = c(3,6)) +
   facet_wrap(vars(geo_value), scales = "free_y", ncol = 1) +
   scale_x_date(minor_breaks = "month", date_labels = "%b %y") +
-  labs(x = "Date", y = "Growth rate", col = "Method") 
+  labs(x = "Date", y = "Growth rate", col = "Method")
 ```
 
-Comparing the `rel_change_log` curves with their `rel_change` counterparts 
-(shown in earlier figures), we see that the former curves appear less volatile 
+Comparing the `rel_change_log` curves with their `rel_change` counterparts
+(shown in earlier figures), we see that the former curves appear less volatile
 and match the linear regression estimates much more closely. In particular,
 when `rel_change` has upward spikes, `rel_change_log` has less pronounced
 spikes. Why does this occur? The estimate of $g'(t)$ here can be expressed as $\mathbb
-E[\log(B)-\log(A)]/h = \mathbb E[\log(1+hR)]/h$, where $R = ((B-A)/h) / A$, and 
+E[\log(B)-\log(A)]/h = \mathbb E[\log(1+hR)]/h$, where $R = ((B-A)/h) / A$, and
 the expectation refers to averaging over the $h$ observations in each window.
 
 Consider the following two relevant inequalities, both due to concavity of the
-logarithm function: 
+logarithm function:
 
 $$
 \mathbb E[\log(1+hR)]/h \leq \log(1+h\mathbb E[R])/h \leq \mathbb E[R].
 $$
 
-The first inequality is Jensen's; the second inequality is because the tangent 
+The first inequality is Jensen's; the second inequality is because the tangent
 line of a concave function lies above it. Finally, we observe that $\mathbb
-E[R] \approx ((\bar B-\bar A)/h) / \bar A$, which the `rel_change` estimate. 
+E[R] \approx ((\bar B-\bar A)/h) / \bar A$, which is the `rel_change` estimate.
 This explains why the `rel_change_log` curve often lies below the `rel_change`
 curve.
 
-
 ## Attribution
 
 This document contains a dataset that is a modified part of the [COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19) as [republished in the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html). This data set is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/) by the Johns Hopkins University on behalf of its Center for Systems Science in Engineering. Copyright Johns Hopkins University 2020.
 
-[From the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html): 
+[From the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html):
     These signals are taken directly from the JHU CSSE [COVID-19 GitHub repository](https://github.com/CSSEGISandData/COVID-19) without changes.
 
diff --git a/vignettes/outliers.Rmd b/vignettes/outliers.Rmd
index 8d820531..416a135f 100644
--- a/vignettes/outliers.Rmd
+++ b/vignettes/outliers.Rmd
@@ -20,16 +20,15 @@ library(epiprocess)
 library(dplyr)
 library(tidyr)
 
-x <- covidcast(
-  data_source = "jhu-csse",
+x <- pub_covidcast(
+  source = "jhu-csse",
   signals = "confirmed_incidence_num",
-  time_type = "day",
   geo_type = "state",
-  time_values = epirange(20200601, 20210531),
+  time_type = "day",
   geo_values = "fl,nj",
+  time_values = epirange(20200601, 20210531),
   as_of = 20211028
 ) %>%
-  fetch() %>%
   select(geo_value, time_value, cases = value) %>%
   as_epi_df()
 ```
@@ -68,13 +67,13 @@ methods on a given signal, and then (optionally) combine the results from those
 methods. Here, we'll investigate outlier detection results from the following
 methods.
 
-1. Detection based on a rolling median, using `detect_outlr_rm()`, which 
-   computes a rolling median on with a default window size of `n` time points 
-   centered at the time point under consideration, and then computes thresholds 
-   based on a multiplier times a rolling IQR computed on the residuals. 
+1. Detection based on a rolling median, using `detect_outlr_rm()`, which
+   computes a rolling median with a default window size of `n` time points
+   centered at the time point under consideration, and then computes thresholds
+   based on a multiplier times a rolling IQR computed on the residuals.
 2. Detection based on a seasonal-trend decomposition using LOESS (STL), using
-   `detect_outlr_stl()`, which is similar to the rolling median method but 
-   replaces the rolling median with fitted values from STL. 
+   `detect_outlr_stl()`, which is similar to the rolling median method but
+   replaces the rolling median with fitted values from STL.
 3. Detection based on an STL decomposition, but without a seasonality term, which
    amounts to smoothing using LOESS.
 
 The outlier detection methods are specified using a `tibble` that is passed to
 `detect_outlr()`, with one row per method, and whose columns specify the
 outlier detection function, any input arguments (only nondefault values need to
 be supplied), and an abbreviated name for the method used in tracking results.
-Abbreviations "rm" and "stl" can be used for the built-in detection functions 
+Abbreviations "rm" and "stl" can be used for the built-in detection functions
 `detect_outlr_rm()` and `detect_outlr_stl()`, respectively.
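+
+(Each base method can also be called on its own, outside of `detect_outlr()`.
+As a small sketch -- with default arguments, and not evaluated here -- the
+rolling median detector alone would look like:)
+
+```{r, eval = FALSE}
+x %>%
+  group_by(geo_value) %>%
+  mutate(rm_info = detect_outlr_rm(time_value, cases)) %>%
+  ungroup()
+```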
```{r} -detection_methods = bind_rows( - tibble(method = "rm", - args = list(list(detect_negatives = TRUE, - detection_multiplier = 2.5)), - abbr = "rm"), - tibble(method = "stl", - args = list(list(detect_negatives = TRUE, - detection_multiplier = 2.5, - seasonal_period = 7)), - abbr = "stl_seasonal"), - tibble(method = "stl", - args = list(list(detect_negatives = TRUE, - detection_multiplier = 2.5, - seasonal_period = NULL)), - abbr = "stl_nonseasonal")) +detection_methods <- bind_rows( + tibble( + method = "rm", + args = list(list( + detect_negatives = TRUE, + detection_multiplier = 2.5 + )), + abbr = "rm" + ), + tibble( + method = "stl", + args = list(list( + detect_negatives = TRUE, + detection_multiplier = 2.5, + seasonal_period = 7 + )), + abbr = "stl_seasonal" + ), + tibble( + method = "stl", + args = list(list( + detect_negatives = TRUE, + detection_multiplier = 2.5, + seasonal_period = NULL + )), + abbr = "stl_nonseasonal" + ) +) detection_methods ``` @@ -113,10 +125,11 @@ vote across the base methods to determine whether a value is an outlier. ```{r} x <- x %>% group_by(geo_value) %>% - mutate(outlier_info = detect_outlr( + mutate(outlier_info = detect_outlr( x = time_value, y = cases, methods = detection_methods, - combiner = "median")) %>% + combiner = "median" + )) %>% ungroup() %>% unnest(outlier_info) @@ -127,38 +140,49 @@ To visualize the results, we first define a convenience function for plotting. ```{r} # Plot outlier detection bands and/or points identified as outliers -plot_outlr <- function(x, signal, method_abbr, bands = TRUE, points = TRUE, +plot_outlr <- function(x, signal, method_abbr, bands = TRUE, points = TRUE, facet_vars = vars(geo_value), nrow = NULL, ncol = NULL, scales = "fixed") { - # Convert outlier detection results to long format + # Convert outlier detection results to long format signal <- rlang::enquo(signal) x_long <- x %>% pivot_longer( cols = starts_with(method_abbr), names_to = c("method", ".value"), - names_pattern = "(.+)_(.+)") - + names_pattern = "(.+)_(.+)" + ) + # Start of plot with observed data p <- ggplot() + geom_line(data = x, mapping = aes(x = time_value, y = !!signal)) # If requested, add bands - if (bands) - p <- p + geom_ribbon(data = x_long, - aes(x = time_value, ymin = lower, ymax = upper, - color = method), fill = NA) + if (bands) { + p <- p + geom_ribbon( + data = x_long, + aes( + x = time_value, ymin = lower, ymax = upper, + color = method + ), fill = NA + ) + } # If requested, add points if (points) { x_detected <- x_long %>% filter((!!signal < lower) | (!!signal > upper)) - p <- p + geom_point(data = x_detected, - aes(x = time_value, y = !!signal, color = method, - shape = method)) + p <- p + geom_point( + data = x_detected, + aes( + x = time_value, y = !!signal, color = method, + shape = method + ) + ) } # If requested, add faceting - if (!is.null(facet_vars)) + if (!is.null(facet_vars)) { p <- p + facet_wrap(facet_vars, nrow = nrow, ncol = ncol, scales = scales) + } return(p) } @@ -170,29 +194,35 @@ Now we produce plots for each state at a time, faceting by the detection method. 
method_abbr <- c(detection_methods$abbr, "combined") plot_outlr(x %>% filter(geo_value == "fl"), cases, method_abbr, - facet_vars = vars(method), scales = "free_y", ncol = 1) + + facet_vars = vars(method), scales = "free_y", ncol = 1 +) + scale_x_date(minor_breaks = "month", date_labels = "%b %y") + - labs(x = "Date", y = "Reported COVID-19 counts", color = "Method", - shape = "Method") + labs( + x = "Date", y = "Reported COVID-19 counts", color = "Method", + shape = "Method" + ) plot_outlr(x %>% filter(geo_value == "nj"), cases, method_abbr, - facet_vars = vars(method), scales = "free_y", ncol = 1) + + facet_vars = vars(method), scales = "free_y", ncol = 1 +) + scale_x_date(minor_breaks = "month", date_labels = "%b %y") + - labs(x = "Date", y = "Reported COVID-19 counts", color = "Method", - shape = "Method") + labs( + x = "Date", y = "Reported COVID-19 counts", color = "Method", + shape = "Method" + ) ``` ## Outlier correction Finally, in order to correct outliers, we can use the posited replacement values returned by each outlier detection method. Below we use the replacement value -from the combined method, which is defined by the median of replacement values +from the combined method, which is defined by the median of replacement values from the base methods at each time point. ```{r, fig.width = 8, fig.height = 7} -y <- x %>% +y <- x %>% mutate(cases_corrected = combined_replacement) %>% - select(geo_value, time_value, cases, cases_corrected) + select(geo_value, time_value, cases, cases_corrected) y %>% filter(cases != cases_corrected) @@ -205,13 +235,13 @@ ggplot(y, aes(x = time_value)) + labs(x = "Date", y = "Reported COVID-19 counts") ``` -More advanced correction functionality will be coming at some point in the -future. +More advanced correction functionality will be coming at some point in the +future. ## Attribution This document contains a dataset that is a modified part of the [COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19) as [republished in the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html). This data set is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/) by the Johns Hopkins University on behalf of its Center for Systems Science in Engineering. Copyright Johns Hopkins University 2020. -[From the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html): - These signals are taken directly from the JHU CSSE [COVID-19 GitHub repository](https://github.com/CSSEGISandData/COVID-19) without changes. +[From the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html): + These signals are taken directly from the JHU CSSE [COVID-19 GitHub repository](https://github.com/CSSEGISandData/COVID-19) without changes. diff --git a/vignettes/slide.Rmd b/vignettes/slide.Rmd index cb435e54..34d5bd59 100644 --- a/vignettes/slide.Rmd +++ b/vignettes/slide.Rmd @@ -8,7 +8,7 @@ vignette: > --- A central tool in the `epiprocess` package is `epi_slide()`, which is based on -the powerful functionality provided in the +the powerful functionality provided in the [`slider`](https://cran.r-project.org/web/packages/slider) package. In `epiprocess`, to "slide" means to apply a computation---represented as a function or formula---over a sliding/rolling data window. 
Suitable @@ -37,15 +37,14 @@ library(dplyr) The data is fetched with the following query: ```{r, message = FALSE, eval=F} -x <- covidcast( - data_source = "jhu-csse", +x <- pub_covidcast( + source = "jhu-csse", signals = "confirmed_incidence_num", - time_type = "day", geo_type = "state", + time_type = "day", + geo_values = "ca,fl,ny,tx,ga,pa", time_values = epirange(20200301, 20211231), - geo_values = "ca,fl,ny,tx,ga,pa" ) %>% - fetch() %>% select(geo_value, time_value, cases = value) %>% arrange(geo_value, time_value) %>% as_epi_df() @@ -69,8 +68,8 @@ order to smooth the signal, by passing in a formula for the first argument of `epi_slide()`. To do this computation per state, we first call `group_by()`. ```{r} -x %>% - group_by(geo_value) %>% +x %>% + group_by(geo_value) %>% epi_slide(~ mean(.x$cases), before = 6) %>% ungroup() %>% head(10) @@ -84,7 +83,7 @@ default. We can of course change this post hoc, or we can instead specify a new name up front using the `new_col_name` argument: ```{r} -x <- x %>% +x <- x %>% group_by(geo_value) %>% epi_slide(~ mean(.x$cases), before = 6, new_col_name = "cases_7dav") %>% ungroup() @@ -102,7 +101,7 @@ Like in `group_modify()`, there are alternative names for these variables as well: `.` can be used instead of `.x`, `.y` instead of `.group_key`, and `.z` instead of `.ref_time_value`. -## Slide with a function +## Slide with a function We can also pass a function for the first argument in `epi_slide()`. In this case, the passed function must accept the following arguments: @@ -118,8 +117,8 @@ receives to `f`. Recreating the last example of a 7-day trailing average: ```{r} -x <- x %>% - group_by(geo_value) %>% +x <- x %>% + group_by(geo_value) %>% epi_slide(function(x, gk, rtv) mean(x$cases), before = 6, new_col_name = "cases_7dav") %>% ungroup() @@ -135,8 +134,8 @@ to a computation in which we can access any columns of `x` by name, just as we would in a call to `dplyr::mutate()`, or any of the `dplyr` verbs. For example: ```{r} -x <- x %>% - group_by(geo_value) %>% +x <- x %>% + group_by(geo_value) %>% epi_slide(cases_7dav = mean(cases), before = 6) %>% ungroup() @@ -155,13 +154,13 @@ theme_set(theme_bw()) ggplot(x, aes(x = time_value)) + geom_col(aes(y = cases, fill = geo_value), alpha = 0.5, show.legend = FALSE) + geom_line(aes(y = cases_7dav, col = geo_value), show.legend = FALSE) + - facet_wrap(~ geo_value, scales = "free_y") + + facet_wrap(~geo_value, scales = "free_y") + scale_x_date(minor_breaks = "month", date_labels = "%b %y") + labs(x = "Date", y = "Reported COVID-19 cases") ``` -As we can see from the top right panel, it looks like Texas moved to weekly -reporting of COVID-19 cases in summer of 2021. +As we can see from the top right panel, it looks like Texas moved to weekly +reporting of COVID-19 cases in summer of 2021. ## Running a local forecaster @@ -183,57 +182,60 @@ units of the `time_value` column; so, days, in the working `epi_df` being considered in this vignette). 
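+
+(One ingredient worth isolating first: the prediction band below is formed
+from quantiles of the training residuals, optionally symmetrized. A tiny
+freestanding illustration with toy numbers, unrelated to the data above:)
+
+```{r, eval = FALSE}
+r <- c(-2, -1, 0, 1, 5) # hypothetical residuals
+point <- 10 # hypothetical point forecast
+point + quantile(c(r, -r), probs = c(0.05, 0.95)) # symmetrized 90% band
+```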
 ```{r}
-prob_ar <- function(y, lags = c(0, 7, 14), ahead = 6, min_train_window = 20, 
+prob_ar <- function(y, lags = c(0, 7, 14), ahead = 6, min_train_window = 20,
                     lower_level = 0.05, upper_level = 0.95, symmetrize = TRUE,
-                    intercept = FALSE, nonneg = TRUE) { 
+                    intercept = FALSE, nonneg = TRUE) {
   # Return NA if insufficient training data
   if (length(y) < min_train_window + max(lags) + ahead) {
     return(data.frame(point = NA, lower = NA, upper = NA))
   }
-  
+
   # Build features and response for the AR model
   dat <- do.call(
-    data.frame, 
+    data.frame,
     purrr::map(lags, function(j) lag(y, n = j))
   )
-  names(dat) = paste0("x", 1:ncol(dat))
-  if (intercept) dat$x0 = rep(1, nrow(dat))
-  dat$y <- lead(y, n = ahead) 
-  
+  names(dat) <- paste0("x", 1:ncol(dat))
+  if (intercept) dat$x0 <- rep(1, nrow(dat))
+  dat$y <- lead(y, n = ahead)
+
   # Now fit the AR model and make a prediction
   obj <- lm(y ~ . + 0, data = dat)
   point <- predict(obj, newdata = tail(dat, 1))
-  
-  # Compute a band 
+
+  # Compute a band
   r <- residuals(obj)
   s <- ifelse(symmetrize, -1, NA) # Should the residuals be symmetrized?
   q <- quantile(c(r, s * r), probs = c(lower_level, upper_level), na.rm = TRUE)
   lower <- point + q[1]
   upper <- point + q[2]
-  
+
   # Clip at zero if we need to, then return
-  if (nonneg) { 
-    point = max(point, 0) 
-    lower = max(lower, 0) 
-    upper = max(upper, 0) 
+  if (nonneg) {
+    point <- max(point, 0)
+    lower <- max(lower, 0)
+    upper <- max(upper, 0)
   }
   return(data.frame(point = point, lower = lower, upper = upper))
 }
 ```
 
-We go ahead and slide this AR forecaster over the working `epi_df` of COVID-19 
-cases. Note that we actually model the `cases_7dav` column, to operate on the 
+We go ahead and slide this AR forecaster over the working `epi_df` of COVID-19
+cases. Note that we actually model the `cases_7dav` column, to operate on the
 scale of smoothed COVID-19 cases. This is clearly equivalent, up to a constant,
 to modeling weekly sums of COVID-19 cases.
 
 ```{r}
-fc_time_values <- seq(as.Date("2020-06-01"), 
-                      as.Date("2021-12-01"), 
-                      by = "1 months") 
+fc_time_values <- seq(as.Date("2020-06-01"),
+  as.Date("2021-12-01"),
+  by = "1 months"
+)
-x %>% 
+x %>%
   group_by(geo_value) %>%
-  epi_slide(fc = prob_ar(cases_7dav), before = 119, 
-            ref_time_values = fc_time_values) %>%
+  epi_slide(
+    fc = prob_ar(cases_7dav), before = 119,
+    ref_time_values = fc_time_values
+  ) %>%
   ungroup() %>%
   head(10)
 ```
@@ -243,42 +245,51 @@ sliding computation (here, compute a forecast) at a specific subset of reference
 time values. We get out three columns `fc_point`, `fc_lower`, and `fc_upper`
 that correspond to the point forecast, and the lower and upper endpoints of the
 90\% prediction band, respectively. (If instead we had set `as_list_col = TRUE`
-in the call to `epi_slide()`, then we would have gotten a list column `fc`, 
+in the call to `epi_slide()`, then we would have gotten a list column `fc`,
 where each element of `fc` is a data frame with named columns `point`, `lower`,
 and `upper`.)
 
 To finish off, we plot the forecasts at some times (spaced out by a few months)
-over the last year, at multiple horizons: 7, 14, 21, and 28 days ahead. To do 
-so, we encapsulate the process of generating forecasts into a simple function, 
+over the last year, at multiple horizons: 7, 14, 21, and 28 days ahead. To do
+so, we encapsulate the process of generating forecasts into a simple function,
 so that we can call it a few times.
```{r, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 6} # Note the use of all_rows = TRUE (keeps all original rows in the output) k_week_ahead <- function(x, ahead = 7) { - x %>% + x %>% group_by(geo_value) %>% - epi_slide(fc = prob_ar(cases_7dav, ahead = ahead), before = 119, - ref_time_values = fc_time_values, all_rows = TRUE) %>% + epi_slide( + fc = prob_ar(cases_7dav, ahead = ahead), before = 119, + ref_time_values = fc_time_values, all_rows = TRUE + ) %>% ungroup() %>% - mutate(target_date = time_value + ahead) + mutate(target_date = time_value + ahead) } # First generate the forecasts, and bind them together -z <- bind_rows(k_week_ahead(x, ahead = 7), - k_week_ahead(x, ahead = 14), - k_week_ahead(x, ahead = 21), - k_week_ahead(x, ahead = 28)) - -# Now plot them, on top of actual COVID-19 case counts +z <- bind_rows( + k_week_ahead(x, ahead = 7), + k_week_ahead(x, ahead = 14), + k_week_ahead(x, ahead = 21), + k_week_ahead(x, ahead = 28) +) + +# Now plot them, on top of actual COVID-19 case counts ggplot(z) + - geom_line(aes(x = time_value, y = cases_7dav), color = "gray50") + - geom_ribbon(aes(x = target_date, ymin = fc_lower, ymax = fc_upper, - group = time_value), fill = 6, alpha = 0.4) + - geom_line(aes(x = target_date, y = fc_point, group = time_value)) + - geom_point(aes(x = target_date, y = fc_point, group = time_value), - size = 0.5) + - geom_vline(data = tibble(x = fc_time_values), aes(xintercept = x), - linetype = 2, alpha = 0.5) + + geom_line(aes(x = time_value, y = cases_7dav), color = "gray50") + + geom_ribbon(aes( + x = target_date, ymin = fc_lower, ymax = fc_upper, + group = time_value + ), fill = 6, alpha = 0.4) + + geom_line(aes(x = target_date, y = fc_point, group = time_value)) + + geom_point(aes(x = target_date, y = fc_point, group = time_value), + size = 0.5 + ) + + geom_vline( + data = tibble(x = fc_time_values), aes(xintercept = x), + linetype = 2, alpha = 0.5 + ) + facet_wrap(vars(geo_value), scales = "free_y") + scale_x_date(minor_breaks = "month", date_labels = "%b %y") + labs(x = "Date", y = "Reported COVID-19 cases") @@ -290,7 +301,7 @@ spotty. At various points in time, we can see that its forecasts are volatile too narrow), or both at the same time. This is only meant as a simple demo and not entirely unexpected given the way the AR model is set up. The [`epipredict`](https://cmu-delphi.github.io/epipredict) package, which is a -companion package to `epiprocess`, offers a suite of predictive modeling tools +companion package to `epiprocess`, offers a suite of predictive modeling tools that can improve on some of the shortcomings of the above simple AR model. Second, the AR forecaster here is using finalized data, meaning, it uses the @@ -304,13 +315,13 @@ therein. Fortunately, the `epiprocess` package provides a data structure called `epi_archive` that can be used to store all data revisions, and furthermore, an `epi_archive` object knows how to slide computations in the correct version-aware sense (for the computation at each reference time $t$, it uses -only data that would have been available as of $t$). We will revisit this -example in the [archive +only data that would have been available as of $t$). We will revisit this +example in the [archive vignette](https://cmu-delphi.github.io/epiprocess/articles/archive.html). 
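+
+(As a rough sketch of what that version-aware variant could look like --
+assuming a hypothetical `epi_archive` object `x_archive` holding the same
+signal along with its revision history, which is not constructed in this
+vignette:)
+
+```{r, eval = FALSE}
+x_archive %>%
+  group_by(geo_value) %>%
+  epix_slide(
+    fc = prob_ar(cases_7dav), before = 119,
+    ref_time_values = fc_time_values
+  ) %>%
+  ungroup()
+```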
## Attribution This document contains a dataset that is a modified part of the [COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19) as [republished in the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html). This data set is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/) by the Johns Hopkins University on behalf of its Center for Systems Science in Engineering. Copyright Johns Hopkins University 2020. -[From the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html): - These signals are taken directly from the JHU CSSE [COVID-19 GitHub repository](https://github.com/CSSEGISandData/COVID-19) without changes. +[From the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html): + These signals are taken directly from the JHU CSSE [COVID-19 GitHub repository](https://github.com/CSSEGISandData/COVID-19) without changes. From c8bc7a4adbe7a26c99f1b3e8d96f52f5ba8f6717 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Wed, 15 Nov 2023 16:30:15 -0800 Subject: [PATCH 004/345] docs(NEWS): fix formatting --- NEWS.md | 163 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 81 insertions(+), 82 deletions(-) diff --git a/NEWS.md b/NEWS.md index df674425..be8b9b56 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,7 +4,7 @@ Note that `epiprocess` uses the [Semantic Versioning ("semver")](https://semver.org/) scheme for all release versions, but any inter-release development versions will include an additional ".9999" suffix. -## Breaking changes: +## Breaking changes * Changes to `epi_slide` and `epix_slide`: * If `f` is a function, it is now required to take at least three arguments. @@ -19,7 +19,7 @@ inter-release development versions will include an additional ".9999" suffix. g, ) { }` to `f = function(x, g, rt, ) { }`. -## New features: +## New features * `epi_slide` and `epix_slide` also make the window data, group key and reference time value available to slide computations specified as formulas or tidy @@ -45,7 +45,7 @@ inter-release development versions will include an additional ".9999" suffix. * To keep the old behavior, convert the output of `epix_slide()` to `epi_df` when desired and set the metadata appropriately. -## Improvements: +## Improvements * `epi_slide` and `epix_slide` now support `as_list_col = TRUE` when the slide computations output atomic vectors, and output a list column in "chopped" @@ -55,7 +55,7 @@ inter-release development versions will include an additional ".9999" suffix. # epiprocess 0.6.0 -## Breaking changes: +## Breaking changes * Changes to both `epi_slide` and `epix_slide`: * The `n`, `align`, and `before` arguments have been replaced by new `before` @@ -102,7 +102,7 @@ inter-release development versions will include an additional ".9999" suffix. the old behavior, pass in `clobberable_versions_start = max_version_with_row_in(x)`. -## Potentially-breaking changes: +## Potentially-breaking changes * Fixed `[` on grouped `epi_df`s to maintain the grouping if possible when dropping the `epi_df` class (e.g., when removing the `time_value` column). @@ -116,7 +116,7 @@ inter-release development versions will include an additional ".9999" suffix. 
* `epi_slide` and `epix_slide` now raise an error rather than silently filtering out `ref_time_values` that don't meet their expectations. -## New features: +## New features * `epix_slide`, `$slide` have a new parameter `all_versions`. With `all_versions=TRUE`, `epix_slide` will pass a filtered `epi_archive` to each @@ -124,7 +124,7 @@ inter-release development versions will include an additional ".9999" suffix. pseudoprospective forecasts with a revision-aware forecaster using nested `epix_slide` operations. -## Improvements: +## Improvements * Added `dplyr::group_by` and `dplyr::ungroup` S3 methods for `epi_archive` objects, plus corresponding `$group_by` and `$ungroup` R6 methods. The @@ -134,35 +134,35 @@ inter-release development versions will include an additional ".9999" suffix. requirement (part of [#154](https://github.com/cmu-delphi/epiprocess/issues/154)). -## Cleanup: +## Cleanup * Added a `NEWS.md` file to track changes to the package. * Implemented `?dplyr::dplyr_extending` for `epi_df`s ([#223](https://github.com/cmu-delphi/epiprocess/issues/223)). * Fixed various small documentation issues ([#217](https://github.com/cmu-delphi/epiprocess/issues/217)). -# epiprocess 0.5.0: +# epiprocess 0.5.0 -## Potentially-breaking changes: +## Potentially-breaking changes * `epix_slide`, `$slide` now feed `f` an `epi_df` rather than converting to a tibble/`tbl_df` first, allowing use of `epi_df` methods and metadata, and often yielding `epi_df`s out of the slide as a result. To obtain the old behavior, convert to a tibble within `f`. -## Improvements: +## Improvements * Fixed `epix_merge`, `$merge` always raising error on `sync="truncate"`. -## Cleanup: +## Cleanup * Added `Remotes:` entry for `genlasso`, which was removed from CRAN. * Added `as_epi_archive` tests. * Added missing `epix_merge` test for `sync="truncate"`. -# epiprocess 0.4.0: +# epiprocess 0.4.0 -## Potentially-breaking changes: +## Potentially-breaking changes * Fixed `[.epi_df` to not reorder columns, which was incompatible with downstream packages. @@ -177,20 +177,20 @@ inter-release development versions will include an additional ".9999" suffix. * Fixed `[.epi_df` to drop metadata if decaying to a tibble (due to removal of essential columns). -## Improvements: +## Improvements * Added check that `epi_df` `additional_metadata` is list. * Fixed some incorrect `as_epi_df` examples. -## Cleanup: +## Cleanup * Applied rename of upstream package in examples: `delphi.epidata` -> `epidatr`. * Rounded out `[.epi_df` tests. -# epiprocess 0.3.0: +# epiprocess 0.3.0 -## Breaking changes: +## Breaking changes * `as_epi_archive`, `epi_archive$new`: * Compactification (see below) by default may change results if working @@ -223,7 +223,7 @@ inter-release development versions will include an additional ".9999" suffix. reporting latency, `n=7` will *not* yield 7 days of data in a typical daily-reporting surveillance data source, as one might have assumed). -## New features: +## New features * `as_epi_archive`, `epi_archive$new`: * New `compactify` parameter allows removal of rows that are redundant for the @@ -248,20 +248,20 @@ inter-release development versions will include an additional ".9999" suffix. `epi_archive` rather than an outdated R6 implementation from whenever the data object was generated. -# epiprocess 0.2.0: +# epiprocess 0.2.0 -## Breaking changes: +## Breaking changes * Removed default `n=7` argument to `epix_slide`. 
-## Improvements: +## Improvements * Ignore `NA`s when printing `time_value` range for an `epi_archive`. * Fixed misleading column naming in `epix_slide` example. * Trimmed down `epi_slide` examples. * Synced out-of-date docs. -## Cleanup: +## Cleanup * Removed dependency of some `epi_archive` tests on an example archive. object, and made them more understandable by reading without running. @@ -271,16 +271,16 @@ inter-release development versions will include an additional ".9999" suffix. * Removed some dead code. * Made `.{Rbuild,git}ignore` files more comprehensive. -# epiprocess 0.1.2: +# epiprocess 0.1.2 -## New features: +## New features * New `new_epi_df` function is similar to `as_epi_df`, but (i) recalculates, overwrites, and/or drops most metadata of `x` if it has any, (ii) may still reorder the columns of `x` even if it's already an `epi_df`, and (iii) treats `x` as optional, constructing an empty `epi_df` by default. -## Improvements: +## Improvements * Fixed `geo_type` guessing on alphabetical strings with more than 2 characters to yield `"custom"`, not US `"nation"`. @@ -300,20 +300,20 @@ inter-release development versions will include an additional ".9999" suffix. * Improved `as_epi_archive` and `epi_archive$new`/`$initialize` documentation, including constructing a toy archive. -## Cleanup: +## Cleanup * Added tests for `epi_slide`, `epi_cor`, and internal utility functions. * Fixed currently-unused internal utility functions `MiddleL`, `MiddleR` to yield correct results on odd-length vectors. -# epiprocess 0.1.1: +# epiprocess 0.1.1 -## New features: +## New features * New example data objects allow one to quickly experiment with `epi_df`s and `epi_archives` without relying/waiting on an API to fetch data. -## Improvements: +## Improvements * Improved `epi_slide` error messaging. * Fixed description of the appropriate parameters for an `f` argument to @@ -322,61 +322,60 @@ inter-release development versions will include an additional ".9999" suffix. * Added some examples throughout the package. * Using example data objects in vignettes also speeds up vignette compilation. -## Cleanup: +## Cleanup * Set up gh-actions CI. * Added tests for `epi_df`s. # epiprocess 0.1.0 -## Implemented core functionality, vignettes: - -Classes: -* `epi_df`: specialized `tbl_df` for geotemporal epidemiological time - series data, with optional metadata recording other key columns (e.g., - demographic breakdowns) and `as_of` what time/version this data was - current/published. Associated functions: - * `as_epi_df` converts to an `epi_df`, guessing the `geo_type`, - `time_type`, `other_keys`, and `as_of` if not specified. - * `as_epi_df.tbl_ts` and `as_tsibble.epi_df` automatically set - `other_keys` and `key`&`index`, respectively. - * `epi_slide` applies a user-supplied computation to a sliding/rolling - time window and user-specified groups, adding the results as new - columns, and recycling/broadcasting results to keep the result size - stable. Allows computation to be provided as a function, `purrr`-style - formula, or tidyeval dots. Uses `slider` underneath for efficiency. - * `epi_cor` calculates Pearson, Kendall, or Spearman correlations - between two (optionally time-shifted) variables in an `epi_df` within - user-specified groups. - * Convenience function: `is_epi_df`. -* `epi_archive`: R6 class for version (patch) data for geotemporal - epidemiological time series data sets. 
Comes with S3 methods and regular - functions that wrap around this functionality for those unfamiliar with R6 - methods. Associated functions: - * `as_epi_archive`: prepares an `epi_archive` object from a data frame - containing snapshots and/or patch data for every available version of - the data set. - * `as_of`: extracts a snapshot of the data set as of some requested - version, in `epi_df` format. - * `epix_slide`, `$slide`: similar to `epi_slide`, but for - `epi_archive`s; for each requested `ref_time_value` and group, applies - a time window and user-specified computation to a snapshot of the data - as of `ref_time_value`. - * `epix_merge`, `$merge`: like `merge` for `epi_archive`s, - but allowing for the last version of each observation to be carried - forward to fill in gaps in `x` or `y`. - * Convenience function: `is_epi_archive`. - -Additional functions: -* `growth_rate`: estimates growth rate of a time series using one of a few - built-in `method`s based on relative change, linear regression, - smoothing splines, or trend filtering. -* `detect_outlr`: applies one or more outlier detection methods to a given - signal variable, and optionally aggregates the outputs to create a - consensus result. -* `detect_outlr_rm`: outlier detection function based on a - rolling-median-based outlier detection function; one of the methods - included in `detect_outlr`. -* `detect_outlr_stl`: outlier detection function based on a seasonal-trend - decomposition using LOESS (STL); one of the methods included in - `detect_outlr`. +## Implemented core functionality, vignettes + +* Classes: + * `epi_df`: specialized `tbl_df` for geotemporal epidemiological time + series data, with optional metadata recording other key columns (e.g., + demographic breakdowns) and `as_of` what time/version this data was + current/published. Associated functions: + * `as_epi_df` converts to an `epi_df`, guessing the `geo_type`, + `time_type`, `other_keys`, and `as_of` if not specified. + * `as_epi_df.tbl_ts` and `as_tsibble.epi_df` automatically set + `other_keys` and `key`&`index`, respectively. + * `epi_slide` applies a user-supplied computation to a sliding/rolling + time window and user-specified groups, adding the results as new + columns, and recycling/broadcasting results to keep the result size + stable. Allows computation to be provided as a function, `purrr`-style + formula, or tidyeval dots. Uses `slider` underneath for efficiency. + * `epi_cor` calculates Pearson, Kendall, or Spearman correlations + between two (optionally time-shifted) variables in an `epi_df` within + user-specified groups. + * Convenience function: `is_epi_df`. + * `epi_archive`: R6 class for version (patch) data for geotemporal + epidemiological time series data sets. Comes with S3 methods and regular + functions that wrap around this functionality for those unfamiliar with R6 + methods. Associated functions: + * `as_epi_archive`: prepares an `epi_archive` object from a data frame + containing snapshots and/or patch data for every available version of + the data set. + * `as_of`: extracts a snapshot of the data set as of some requested + version, in `epi_df` format. + * `epix_slide`, `$slide`: similar to `epi_slide`, but for + `epi_archive`s; for each requested `ref_time_value` and group, applies + a time window and user-specified computation to a snapshot of the data + as of `ref_time_value`. 
+ * `epix_merge`, `$merge`: like `merge` for `epi_archive`s, + but allowing for the last version of each observation to be carried + forward to fill in gaps in `x` or `y`. + * Convenience function: `is_epi_archive`. +* Additional functions: + * `growth_rate`: estimates growth rate of a time series using one of a few + built-in `method`s based on relative change, linear regression, + smoothing splines, or trend filtering. + * `detect_outlr`: applies one or more outlier detection methods to a given + signal variable, and optionally aggregates the outputs to create a + consensus result. + * `detect_outlr_rm`: outlier detection function based on a + rolling-median-based outlier detection function; one of the methods + included in `detect_outlr`. + * `detect_outlr_stl`: outlier detection function based on a seasonal-trend + decomposition using LOESS (STL); one of the methods included in + `detect_outlr`. From 9ede25a053c476591d92f70d9c4ea9cd94aa7354 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Mon, 27 Nov 2023 15:38:27 -0800 Subject: [PATCH 005/345] add S3 method to quickly access the keys in an epi_df or epi_archive --- R/epi_keys.R | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 R/epi_keys.R diff --git a/R/epi_keys.R b/R/epi_keys.R new file mode 100644 index 00000000..324b4eff --- /dev/null +++ b/R/epi_keys.R @@ -0,0 +1,42 @@ +#' Grab any keys associated to an epi_df +#' +#' @param x a data.frame, tibble, or epi_df +#' @param ... additional arguments passed on to methods +#' +#' @return If an `epi_df`, this returns all "keys". Otherwise `NULL` +#' @keywords internal +#' @export +epi_keys <- function(x, ...) { + UseMethod("epi_keys") +} + +#' @export +epi_keys.default <- function(x, ...) { + character(0L) +} + +#' @export +epi_keys.data.frame <- function(x, other_keys = character(0L), ...) { + arg_is_chr(other_keys, allow_empty = TRUE) + nm <- c("time_value", "geo_value", other_keys) + intersect(nm, names(x)) +} + +#' @export +epi_keys.epi_df <- function(x, ...) { + c("time_value", "geo_value", attr(x, "metadata")$other_keys) +} + +#' @export +epi_keys.epi_archive <- function(x, ...) { + c("time_value", "geo_value", attr(x, "metadata")$other_keys) +} + +kill_time_value <- function(v) { + arg_is_chr(v) + v[v != "time_value"] +} + +epi_keys_only <- function(x, ...) { + kill_time_value(epi_keys(x, ...)) +} From 96c7500901047fe0598e2e100d3867207bc0481f Mon Sep 17 00:00:00 2001 From: "Daniel J. 
McDonald" Date: Mon, 27 Nov 2023 15:39:04 -0800 Subject: [PATCH 006/345] grab autoplot method from ggplot2 and export --- DESCRIPTION | 5 ++++- R/reexports.R | 7 +++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index d9a8dea6..7895ee26 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -33,6 +33,7 @@ Imports: feasts, generics, genlasso, + ggplot2, lifecycle (>= 1.0.1), lubridate, magrittr, @@ -49,7 +50,6 @@ Imports: Suggests: covidcast, epidatr, - ggplot2, knitr, outbreaks, rmarkdown, @@ -72,9 +72,11 @@ Depends: URL: https://cmu-delphi.github.io/epiprocess/ Collate: 'archive.R' + 'autoplot.R' 'correlation.R' 'data.R' 'epi_df.R' + 'epi_keys.R' 'epiprocess.R' 'methods-epi_archive.R' 'grouped_epi_archive.R' @@ -83,5 +85,6 @@ Collate: 'outliers.R' 'reexports.R' 'slide.R' + 'utils-arg.R' 'utils.R' 'utils_pipe.R' diff --git a/R/reexports.R b/R/reexports.R index 4cc45e29..02f5af53 100644 --- a/R/reexports.R +++ b/R/reexports.R @@ -55,3 +55,10 @@ dplyr::slice #' @importFrom tidyr unnest #' @export tidyr::unnest + + +# ggplot2 ----------------------------------------------------------------- + +#' @importFrom ggplot2 autoplot +#' @export +ggplot2::autoplot From 1638430040896f8366e952732c7a3f586393de5b Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Mon, 27 Nov 2023 15:39:30 -0800 Subject: [PATCH 007/345] simplify enlist() --- R/utils.R | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/R/utils.R b/R/utils.R index 471fb053..c815161a 100644 --- a/R/utils.R +++ b/R/utils.R @@ -454,13 +454,13 @@ quiet = function(x) { # Create an auto-named list enlist = function(...) { - x = list(...) - n = as.character(sys.call())[-1] - if (!is.null(n0 <- names(x))) { - n[n0 != ""] = n0[n0 != ""] - } - names(x) = n - return(x) + # converted to thin wrapper around + rlang::dots_list( + ..., + .homonyms = "error", + .named = TRUE, + .check_assign = TRUE + ) } # Variable assignment from a list. NOT USED. Something is broken, this doesn't From a3cb78bb1e3ac68284fdd2028d0cf7bb137e28de Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Mon, 27 Nov 2023 15:41:28 -0800 Subject: [PATCH 008/345] add a few argument checking functions, see also #380 --- R/utils-arg.R | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 R/utils-arg.R diff --git a/R/utils-arg.R b/R/utils-arg.R new file mode 100644 index 00000000..b5700b6d --- /dev/null +++ b/R/utils-arg.R @@ -0,0 +1,63 @@ +handle_arg_list <- function(..., tests) { + values <- list(...) + names <- eval(substitute(alist(...))) + names <- purrr::map(names, deparse) + + purrr::walk2(names, values, tests) +} + +arg_is_scalar <- function(..., allow_null = FALSE, allow_na = FALSE) { + handle_arg_list( + ..., + tests = function(name, value) { + if (length(value) > 1 | (!allow_null & length(value) == 0)) { + cli::cli_abort("Argument {.val {name}} must be of length 1.") + } + if (!is.null(value)) { + if (is.na(value) & !allow_na) { + cli::cli_abort( + "Argument {.val {name}} must not be a missing value ({.val {NA}})." 
+ ) + } + } + } + ) +} + +arg_is_int <- function(..., allow_null = FALSE) { + arg_is_numeric(..., allow_null = allow_null) + handle_arg_list( + ..., + tests = function(name, value) { + if (!(all(value %% 1 == 0) | (is.null(value) & allow_null))) { + cli::cli_abort("All {.val {name}} must be whole positive number(s).") + } + } + ) +} + +arg_is_chr <- function(..., allow_null = FALSE, allow_na = FALSE, allow_empty = FALSE) { + handle_arg_list( + ..., + tests = function(name, value) { + if (is.null(value) & !allow_null) { + cli::cli_abort("Argument {.val {name}} may not be `NULL`.") + } + if (any(is.na(value)) & !allow_na) { + cli::cli_abort("Argument {.val {name}} must not contain any missing values ({.val {NA}}).") + } + if (!is.null(value) & (length(value) == 0L & !allow_empty)) { + cli::cli_abort("Argument {.val {name}} must have length > 0.") + } + if (!(is.character(value) | is.null(value) | all(is.na(value)))) { + cli::cli_abort("Argument {.val {name}} must be of character type.") + } + } + ) +} + +arg_is_chr_scalar <- function(..., allow_null = FALSE, allow_na = FALSE) { + arg_is_chr(..., allow_null = allow_null, allow_na = allow_na) + arg_is_scalar(..., allow_null = allow_null, allow_na = allow_na) +} + From f29936cf2df067de117f62fd4b2d36d80544f7be Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Mon, 27 Nov 2023 15:41:56 -0800 Subject: [PATCH 009/345] add autoplot() for epi_df's --- R/autoplot.R | 154 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 154 insertions(+) create mode 100644 R/autoplot.R diff --git a/R/autoplot.R b/R/autoplot.R new file mode 100644 index 00000000..8263818d --- /dev/null +++ b/R/autoplot.R @@ -0,0 +1,154 @@ +#' Automatically plot an epi_df +#' +#' @param object An `epi_df` +#' @param ... <[`tidy-select`][dplyr_tidy_select]> One or more unquoted +#' expressions separated by commas. Variable names can be used as if they +#' were positions in the data frame, so expressions like `x:y` can +#' be used to select a range of variables. +#' @param .color_by Which variables should determine the color(s) used to plot +#' lines. Options include: +#' * `all_keys` - the default uses the interaction of any key variables +#' including the `geo_value` +#' * `geo_value` - `geo_value` only +#' * `other_keys` - any available keys that are not `geo_value` +#' * `.response` - the numeric variables (same as the y-axis) +#' * `all` - uses the interaction of all keys and numeric variables +#' * `none` - no coloring aesthetic is applied +#' @param .facet_by Similar to `.color_by` except that the default is to display +#' each numeric variable on a separate facet +#' @param .base_color Lines will be shown with this color. For example, with a +#' single numeric variable and faceting by `geo_value`, all locations would +#' share the same color line. +#' @param .max_facets Cut down of the number of facets displayed. Especially +#' useful for testing when there are many `geo_value`'s or keys. 
+#' +#' @return +#' @export +#' +#' @examples +#' autoplot(jhu_csse_daily_subset, cases, death_rate_7d_av) +#' autoplot(jhu_csse_daily_subset, case_rate_7d_av, .facet_by = "geo_value") +#' autoplot(jhu_csse_daily_subset, case_rate_7d_av, .color_by = "none", +#' .facet_by = "geo_value") +autoplot.epi_df <- function( + object, ..., + .color_by = c("all_keys", "geo_value", "other_keys", ".response", "all", "none"), + .facet_by = c(".response", "other_keys", "all_keys", "geo_value", "all", "none"), + .base_color = "#3A448F", + .max_facets = Inf) { + .color_by <- match.arg(.color_by) + .facet_by <- match.arg(.facet_by) + + arg_is_scalar(.max_facets) + if (is.finite(.max_facets)) arg_is_int(.max_facets) + arg_is_chr_scalar(.base_color) + + ek <- epi_keys(object) + mv <- setdiff(names(object), ek) + ek <- kill_time_value(ek) + + # --- check for numeric variables + allowed <- purrr::map_lgl(object[mv], is.numeric) + if (length(allowed) == 0) { + cli::cli_abort("No numeric variables were available to plot automatically.") + } + vars <- tidyselect::eval_select(rlang::expr(c(...)), object) + if (rlang::is_empty(vars)) { # find them automatically if unspecified + vars <- tidyselect::eval_select(names(allowed)[1], object) + cli::cli_warn( + "Plot variable was unspecified. Automatically selecting {.var {names(allowed)[1]}}." + ) + } else { # if variables were specified, ensure that they are numeric + ok <- names(vars) %in% names(allowed) + if (!any(ok)) { + cli::cli_abort( + "None of the requested variables {.var {names(vars)}} are numeric." + ) + } else if (!all(ok)) { + cli::cli_warn(c( + "Only the requested variables {.var {names(vars)[ok]}} are numeric.", + i = "`autoplot()` cannot display {.var {names(vars)[!ok]}}." + )) + vars <- vars[ok] + } + } + + # --- create a viable df to plot + pos <- tidyselect::eval_select( + rlang::expr(c("time_value", ek, names(vars))), object + ) + if (length(vars) > 1) { + object <- tidyr::pivot_longer( + object[pos], tidyselect::all_of(names(vars)), + values_to = ".response", + names_to = ".response_name" + ) + } else { + object <- dplyr::rename(object[pos], .response := !!names(vars)) + } + all_keys <- rlang::syms(as.list(ek)) + other_keys <- rlang::syms(as.list(setdiff(ek, "geo_value"))) + all_avail <- rlang::syms(as.list(c(ek, ".response_name"))) + + object <- object %>% + dplyr::mutate( + .colours = switch(.color_by, + all_keys = interaction(!!!all_keys, sep = "/"), + geo_value = geo_value, + other_keys = interaction(!!!other_keys, sep = "/"), + all = interaction(!!!all_avail, sep = "/"), + NULL + ), + .facets = switch(.facet_by, + all_keys = interaction(!!!all_keys, sep = "/"), + geo_value = as.factor(geo_value), + other_keys = interaction(!!!other_keys, sep = "/"), + all = interaction(!!!all_avail, sep = "/"), + NULL + ) + ) + + if (.max_facets < Inf && ".facets" %in% names(object)) { + n_facets <- nlevels(object$.facets) + if (n_facets > .max_facets) { + top_n <- levels(as.factor(object$.facets))[seq_len(.max_facets)] + object <- dplyr::filter(object, .facets %in% top_n) %>% + dplyr::mutate(.facets = droplevels(.facets)) + if (".colours" %in% names(object)) { + object <- dplyr::mutate(object, .colours = droplevels(.colours)) + } + } + } + + p <- ggplot2::ggplot(object, ggplot2::aes(x = .data$time_value)) + + ggplot2::theme_bw() + + if (".colours" %in% names(object)) { + p <- p + ggplot2::geom_line( + ggplot2::aes(y = .data$.response, colour = .data$.colours), + key_glyph = "timeseries" + ) + + ggplot2::scale_colour_viridis_d(name = "") + } else if 
(length(vars) > 1 && .color_by == ".response") { + p <- p + + ggplot2::geom_line(ggplot2::aes( + y = .data$.response, colour = .data$.response_name + )) + + ggplot2::scale_colour_viridis_d(name = "") + } else { # none + p <- p + + ggplot2::geom_line(ggplot2::aes(y = .data$.response), color = .base_color) + } + + if (".facets" %in% names(object)) { + p <- p + ggplot2::facet_wrap(~.facets, scales = "free_y") + + ggplot2::ylab(names(vars)) + if (.facet_by == "all") p <- p + ggplot2::ylab("") + } else if ((length(vars) > 1 && .facet_by == ".response")) { + p <- p + ggplot2::facet_wrap(~.response_name, scales = "free_y") + + ggplot2::ylab("") + } else { + p <- p + ggplot2::ylab(names(vars)) + } + p +} From cc58ddc1d935374a26cd18ce899f76dd14151472 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Mon, 27 Nov 2023 15:42:11 -0800 Subject: [PATCH 010/345] document() --- NAMESPACE | 8 +++++++ man/autoplot.epi_df.Rd | 54 ++++++++++++++++++++++++++++++++++++++++++ man/epi_keys.Rd | 20 ++++++++++++++++ man/reexports.Rd | 3 +++ 4 files changed, 85 insertions(+) create mode 100644 man/autoplot.epi_df.Rd create mode 100644 man/epi_keys.Rd diff --git a/NAMESPACE b/NAMESPACE index 4f9b8151..eb88c7a4 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -8,10 +8,15 @@ S3method(as_epi_df,tbl_df) S3method(as_epi_df,tbl_ts) S3method(as_tibble,epi_df) S3method(as_tsibble,epi_df) +S3method(autoplot,epi_df) S3method(dplyr_col_modify,col_modify_recorder_df) S3method(dplyr_col_modify,epi_df) S3method(dplyr_reconstruct,epi_df) S3method(dplyr_row_slice,epi_df) +S3method(epi_keys,data.frame) +S3method(epi_keys,default) +S3method(epi_keys,epi_archive) +S3method(epi_keys,epi_df) S3method(epix_truncate_versions_after,epi_archive) S3method(epix_truncate_versions_after,grouped_epi_archive) S3method(group_by,epi_archive) @@ -33,11 +38,13 @@ export(arrange) export(as_epi_archive) export(as_epi_df) export(as_tsibble) +export(autoplot) export(detect_outlr) export(detect_outlr_rm) export(detect_outlr_stl) export(epi_archive) export(epi_cor) +export(epi_keys) export(epi_slide) export(epix_as_of) export(epix_merge) @@ -85,6 +92,7 @@ importFrom(dplyr,rename) importFrom(dplyr,select) importFrom(dplyr,slice) importFrom(dplyr,ungroup) +importFrom(ggplot2,autoplot) importFrom(lubridate,days) importFrom(lubridate,weeks) importFrom(magrittr,"%>%") diff --git a/man/autoplot.epi_df.Rd b/man/autoplot.epi_df.Rd new file mode 100644 index 00000000..134a83fe --- /dev/null +++ b/man/autoplot.epi_df.Rd @@ -0,0 +1,54 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/autoplot.R +\name{autoplot.epi_df} +\alias{autoplot.epi_df} +\title{Automatically plot an epi_df} +\usage{ +\method{autoplot}{epi_df}( + object, + ..., + .color_by = c("all_keys", "geo_value", "other_keys", ".response", "all", "none"), + .facet_by = c(".response", "other_keys", "all_keys", "geo_value", "all", "none"), + .base_color = "#3A448F", + .max_facets = Inf +) +} +\arguments{ +\item{object}{An \code{epi_df}} + +\item{...}{<\code{\link[=dplyr_tidy_select]{tidy-select}}> One or more unquoted +expressions separated by commas. Variable names can be used as if they +were positions in the data frame, so expressions like \code{x:y} can +be used to select a range of variables.} + +\item{.color_by}{Which variables should determine the color(s) used to plot +lines. 
Options include:
+\itemize{
+\item \code{all_keys} - the default uses the interaction of any key variables
+including the \code{geo_value}
+\item \code{geo_value} - \code{geo_value} only
+\item \code{other_keys} - any available keys that are not \code{geo_value}
+\item \code{.response} - the numeric variables (same as the y-axis)
+\item \code{all} - uses the interaction of all keys and numeric variables
+\item \code{none} - no coloring aesthetic is applied
+}}

+\item{.facet_by}{Similar to \code{.color_by} except that the default is to display
+each numeric variable on a separate facet}

+\item{.base_color}{Lines will be shown with this color. For example, with a
+single numeric variable and faceting by \code{geo_value}, all locations would
+share the same color line.}

+\item{.max_facets}{Cut down the number of facets displayed. Especially
+useful for testing when there are many \code{geo_value}'s or keys.}
+}
+\description{
+Automatically plot an epi_df
+}
+\examples{
+autoplot(jhu_csse_daily_subset, cases, death_rate_7d_av)
+autoplot(jhu_csse_daily_subset, case_rate_7d_av, .facet_by = "geo_value")
+autoplot(jhu_csse_daily_subset, case_rate_7d_av, .color_by = "none",
+  .facet_by = "geo_value")
+}
diff --git a/man/epi_keys.Rd b/man/epi_keys.Rd
new file mode 100644
index 00000000..8026fc14
--- /dev/null
+++ b/man/epi_keys.Rd
@@ -0,0 +1,20 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/epi_keys.R
+\name{epi_keys}
+\alias{epi_keys}
+\title{Grab any keys associated with an epi_df}
+\usage{
+epi_keys(x, ...)
+}
+\arguments{
+\item{x}{a data.frame, tibble, or epi_df}
+
+\item{...}{additional arguments passed on to methods}
+}
+\value{
+If an \code{epi_df}, this returns all "keys". Otherwise \code{NULL}
+}
+\description{
+Grab any keys associated with an epi_df
+}
+\keyword{internal}
diff --git a/man/reexports.Rd b/man/reexports.Rd
index 46e961d9..fdda2925 100644
--- a/man/reexports.Rd
+++ b/man/reexports.Rd
@@ -14,6 +14,7 @@
 \alias{rename}
 \alias{slice}
 \alias{unnest}
+\alias{autoplot}
 \title{Objects exported from other packages}
 \keyword{internal}
 \description{
@@ -23,6 +24,8 @@ below to see their documentation.
 \describe{
   \item{dplyr}{\code{\link[dplyr]{arrange}}, \code{\link[dplyr]{filter}}, \code{\link[dplyr]{group_by}}, \code{\link[dplyr:group_map]{group_modify}}, \code{\link[dplyr]{mutate}}, \code{\link[dplyr]{relocate}}, \code{\link[dplyr]{rename}}, \code{\link[dplyr]{slice}}, \code{\link[dplyr:group_by]{ungroup}}}
 
+  \item{ggplot2}{\code{\link[ggplot2]{autoplot}}}
+
   \item{tidyr}{\code{\link[tidyr]{unnest}}}
 
   \item{tsibble}{\code{\link[tsibble:as-tsibble]{as_tsibble}}}

From d29b79fdf348ef09f480ab575cef4d39857f12af Mon Sep 17 00:00:00 2001
From: "Daniel J. McDonald" 
Date: Mon, 27 Nov 2023 15:59:35 -0800
Subject: [PATCH 011/345] add new functions to pkgdown

---
 _pkgdown.yml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/_pkgdown.yml b/_pkgdown.yml
index bba3ea8d..00110b01 100644
--- a/_pkgdown.yml
+++ b/_pkgdown.yml
@@ -66,10 +66,13 @@ reference:
   - archive_cases_dv_subset
   - incidence_num_outlier_example
   - contains("jhu_csse")
+- title: Basic automatic plotting
+  contents:
+  - autoplot.epi_df
 - title: internal
   contents:
   - epiprocess
   - max_version_with_row_in
   - next_after
   - guess_period
-
+  - epi_keys

From c2d58ca1f99ba0e06015f588790f61030414f545 Mon Sep 17 00:00:00 2001
From: "Logan C. 
Brooks" Date: Wed, 29 Nov 2023 17:06:04 -0800 Subject: [PATCH 012/345] Fix straggling `covidcast` call --- vignettes/advanced.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vignettes/advanced.Rmd b/vignettes/advanced.Rmd index 91f8f37f..812cb711 100644 --- a/vignettes/advanced.Rmd +++ b/vignettes/advanced.Rmd @@ -278,7 +278,7 @@ library(data.table) library(ggplot2) theme_set(theme_bw()) -y1 <- covidcast( +y1 <- pub_covidcast( source = "doctor-visits", signals = "smoothed_adj_cli", geo_type = "state", From cf894c32f71d334ec37b37f8a976745d7064ff14 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 29 Nov 2023 17:08:25 -0800 Subject: [PATCH 013/345] Bump to a .9999 version + add NEWS.md entry --- DESCRIPTION | 2 +- NEWS.md | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index d9a8dea6..dc48eb86 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: epiprocess Title: Tools for basic signal processing in epidemiology -Version: 0.7.0 +Version: 0.7.0.9999 Authors@R: c( person("Jacob", "Bien", role = "ctb"), person("Logan", "Brooks", role = "aut"), diff --git a/NEWS.md b/NEWS.md index be8b9b56..e3a9ea13 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,10 @@ + +# epiprocess 0.7.0.9999 + +## Improvements + +* Updated vignettes for compatibility with epidatr 1.0.0 in PR #377. + # epiprocess 0.7.0 Note that `epiprocess` uses the [Semantic Versioning From db90bdaf378d0295a760b36d96dff8aee27bee05 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 29 Nov 2023 17:27:46 -0800 Subject: [PATCH 014/345] Update pkgdown action&config to build separate dev doc site Also, remove references to defunct/never-existing `master` branch. --- .github/workflows/R-CMD-check.yaml | 4 ++-- .github/workflows/pkgdown.yaml | 4 ++-- _pkgdown.yml | 10 +++++++++- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 67f4bdb7..ecc1c082 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -4,9 +4,9 @@ # Created with usethis + edited to run on PRs to dev, use API key. on: push: - branches: [main, master] + branches: [main, dev] pull_request: - branches: [main, master, dev] + branches: [main, dev] name: R-CMD-check diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index e591f1d9..5d70a744 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -4,9 +4,9 @@ # Created with usethis + edited to run on PRs to dev, use API key. on: push: - branches: [main, master] + branches: [main, dev] pull_request: - branches: [main, master, dev] + branches: [main, dev] release: types: [published] workflow_dispatch: diff --git a/_pkgdown.yml b/_pkgdown.yml index bba3ea8d..de6d927f 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -1,6 +1,14 @@ template: bootstrap: 5 - + +development: + mode: auto + +# Colors from epipredict & epidatr, including Carnegie Red https://www.cmu.edu/brand/brand-guidelines/visual-identity/colors.html +navbar: + bg: '#C41230' + fg: '#f8f8f8' + url: https://cmu-delphi.github.io/epiprocess/ home: From a326680eefa3f517a54b16f0a871be5041929b57 Mon Sep 17 00:00:00 2001 From: "Logan C. 
Brooks" Date: Wed, 29 Nov 2023 17:29:40 -0800 Subject: [PATCH 015/345] Update pkgdown home links --- _pkgdown.yml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/_pkgdown.yml b/_pkgdown.yml index de6d927f..3d80ebb8 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -13,12 +13,14 @@ url: https://cmu-delphi.github.io/epiprocess/ home: links: - - text: Get the epipredict R package - href: https://cmu-delphi.github.io/epipredict/ - - text: Get the covidcast R package - href: https://cmu-delphi.github.io/covidcast/covidcastR/ + - text: View the Delphi tooling book + href: https://cmu-delphi.github.io/delphi-tooling-book/ + - text: Get epidatasets R package + href: https://cmu-delphi.github.io/epidatasets/ - text: Get the epidatr R package href: https://github.com/cmu-delphi/epidatr + - text: Get the epipredict R package + href: https://cmu-delphi.github.io/epipredict/ articles: - title: Using the package From 909c983f53b8091c0a3eea0a2719095112a42f80 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Thu, 30 Nov 2023 13:00:56 -0800 Subject: [PATCH 016/345] Update _pkgdown.yml Co-authored-by: nmdefries <42820733+nmdefries@users.noreply.github.com> --- _pkgdown.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_pkgdown.yml b/_pkgdown.yml index 3d80ebb8..98aa1772 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -15,7 +15,7 @@ home: links: - text: View the Delphi tooling book href: https://cmu-delphi.github.io/delphi-tooling-book/ - - text: Get epidatasets R package + - text: Get the epidatasets R package href: https://cmu-delphi.github.io/epidatasets/ - text: Get the epidatr R package href: https://github.com/cmu-delphi/epidatr From 2f99b20c17862db01a9450ff24b7789ad82b8ffc Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Fri, 1 Dec 2023 16:58:03 -0500 Subject: [PATCH 017/345] use lapply-rbindlisht instead of map_dfr -- the included bind_rows is slow --- NAMESPACE | 1 + R/epi_df.R | 1 + R/grouped_epi_archive.R | 6 +++--- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 4f9b8151..7e812e86 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -66,6 +66,7 @@ importFrom(data.table,as.data.table) importFrom(data.table,between) importFrom(data.table,copy) importFrom(data.table,key) +importFrom(data.table,rbindlist) importFrom(data.table,set) importFrom(data.table,setkeyv) importFrom(dplyr,arrange) diff --git a/R/epi_df.R b/R/epi_df.R index c2b84c83..6faf6837 100644 --- a/R/epi_df.R +++ b/R/epi_df.R @@ -159,6 +159,7 @@ new_epi_df = function(x = tibble::tibble(), geo_type, time_type, as_of, # Reorder columns (geo_value, time_value, ...) if(sum(dim(x)) != 0){ + # TODO: relocate is slow. How to put geo_value and time_value at the beginning? x = dplyr::relocate(x, "geo_value", "time_value") } diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R index d1ddf5bf..4e0203e5 100644 --- a/R/grouped_epi_archive.R +++ b/R/grouped_epi_archive.R @@ -185,7 +185,7 @@ grouped_epi_archive = #' @description Slides a given function over variables in a `grouped_epi_archive` #' object. See the documentation for the wrapper function [`epix_slide()`] for #' details. -#' @importFrom data.table key address +#' @importFrom data.table key address rbindlist #' @importFrom rlang !! !!! enquo quo_is_missing enquos is_quosure sym syms #' env missing_arg slide = function(f, ..., before, ref_time_values, @@ -308,7 +308,7 @@ grouped_epi_archive = } f = as_slide_computation(f, ...) 
- x = purrr::map_dfr(ref_time_values, function(ref_time_value) { + x = lapply(ref_time_values, function(ref_time_value) { # Ungrouped as-of data; `epi_df` if `all_versions` is `FALSE`, # `epi_archive` if `all_versions` is `TRUE`: as_of_raw = private$ungrouped$as_of(ref_time_value, min_time_value = ref_time_value - before, all_versions = all_versions) @@ -365,7 +365,7 @@ grouped_epi_archive = new_col = new_col, .keep = TRUE) ) - }) + }) %>% rbindlist() # Unchop/unnest if we need to if (!as_list_col) { From e00dc125415b6cdc30a4e076a3786839d1bc820b Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Mon, 4 Dec 2023 13:46:11 -0500 Subject: [PATCH 018/345] do faster relocate step --- R/epi_df.R | 7 +++++-- R/grouped_epi_archive.R | 1 + 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/R/epi_df.R b/R/epi_df.R index 6faf6837..544fab1b 100644 --- a/R/epi_df.R +++ b/R/epi_df.R @@ -159,8 +159,11 @@ new_epi_df = function(x = tibble::tibble(), geo_type, time_type, as_of, # Reorder columns (geo_value, time_value, ...) if(sum(dim(x)) != 0){ - # TODO: relocate is slow. How to put geo_value and time_value at the beginning? - x = dplyr::relocate(x, "geo_value", "time_value") + cols_to_put_first <- colnames(x) %in% c("geo_value", "time_value") + x <- x[, c( + which(cols_to_put_first), + which(!cols_to_put_first) + )] } # Apply epi_df class, attach metadata, and return diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R index 4e0203e5..ac9bf996 100644 --- a/R/grouped_epi_archive.R +++ b/R/grouped_epi_archive.R @@ -357,6 +357,7 @@ grouped_epi_archive = } return( + # TODO: looks like across and maybe all_of are slow dplyr::group_by(as_of_df, dplyr::across(tidyselect::all_of(private$vars)), .drop=private$drop) %>% dplyr::group_modify(group_modify_fn, From ff397c0da18403224d11aa458ca1b477df9b32fc Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Mon, 4 Dec 2023 13:46:51 -0500 Subject: [PATCH 019/345] use single-step data.table fn for relocate --- NAMESPACE | 1 + R/epi_df.R | 9 ++++----- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 7e812e86..7ca3befd 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -68,6 +68,7 @@ importFrom(data.table,copy) importFrom(data.table,key) importFrom(data.table,rbindlist) importFrom(data.table,set) +importFrom(data.table,setcolorder) importFrom(data.table,setkeyv) importFrom(dplyr,arrange) importFrom(dplyr,bind_rows) diff --git a/R/epi_df.R b/R/epi_df.R index 544fab1b..42b8f7b7 100644 --- a/R/epi_df.R +++ b/R/epi_df.R @@ -111,6 +111,8 @@ NULL #' @param ... Additional arguments passed to methods. #' @return An `epi_df` object. #' +#' @importFrom data.table setcolorder +#' #' @export new_epi_df = function(x = tibble::tibble(), geo_type, time_type, as_of, additional_metadata = list(), ...) { @@ -159,11 +161,8 @@ new_epi_df = function(x = tibble::tibble(), geo_type, time_type, as_of, # Reorder columns (geo_value, time_value, ...) 
if(sum(dim(x)) != 0){ - cols_to_put_first <- colnames(x) %in% c("geo_value", "time_value") - x <- x[, c( - which(cols_to_put_first), - which(!cols_to_put_first) - )] + # Moves columns in place + setcolorder(x, c("geo_value", "time_value")) } # Apply epi_df class, attach metadata, and return From 99b01e3a9ffcc0493795972cfef6563d725fc5fb Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Mon, 4 Dec 2023 15:39:06 -0500 Subject: [PATCH 020/345] speed up group_by using across and tidyselect --- R/grouped_epi_archive.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R index ac9bf996..5881b0a1 100644 --- a/R/grouped_epi_archive.R +++ b/R/grouped_epi_archive.R @@ -357,8 +357,7 @@ grouped_epi_archive = } return( - # TODO: looks like across and maybe all_of are slow - dplyr::group_by(as_of_df, dplyr::across(tidyselect::all_of(private$vars)), + dplyr::group_by(as_of_df, !!!syms(private$vars), .drop=private$drop) %>% dplyr::group_modify(group_modify_fn, f = f, ..., From d079ceb8251751f0afb21a734341e414b0d94011 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Mon, 4 Dec 2023 16:54:47 -0500 Subject: [PATCH 021/345] df -> tibble conversion faster than gluing when making computation output --- R/grouped_epi_archive.R | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R index 5881b0a1..64d7453b 100644 --- a/R/grouped_epi_archive.R +++ b/R/grouped_epi_archive.R @@ -186,6 +186,7 @@ grouped_epi_archive = #' object. See the documentation for the wrapper function [`epix_slide()`] for #' details. #' @importFrom data.table key address rbindlist +#' @importFrom tibble as_tibble #' @importFrom rlang !! !!! enquo quo_is_missing enquos is_quosure sym syms #' env missing_arg slide = function(f, ..., before, ref_time_values, @@ -280,16 +281,18 @@ grouped_epi_archive = if (! (is.atomic(comp_value) || is.data.frame(comp_value))) { Abort("The slide computation must return an atomic vector or a data frame.") } + # Convert from data.frame to tibble for speed. + # Label every result row with the `ref_time_value` + res <- as_tibble(data.frame(time_value = ref_time_value)) + # Wrap the computation output in a list and unchop/unnest later if # `as_list_col = FALSE`. This approach means that we will get a # list-class col rather than a data.frame-class col when # `as_list_col = TRUE` and the computations outputs are data # frames. - comp_value <- list(comp_value) + res[[new_col]] <- list(comp_value) - # Label every result row with the `ref_time_value`: - return(tibble::tibble(time_value = .env$ref_time_value, - !!new_col := .env$comp_value)) + return(res) } # If `f` is missing, interpret ... as an expression for tidy evaluation From a0763586e0af01ba23755b659923ac39439ce5ce Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 6 Dec 2023 13:20:17 -0500 Subject: [PATCH 022/345] Revert "use single-step data.table fn for relocate" This reverts commit ff397c0da18403224d11aa458ca1b477df9b32fc. 
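The performance commits above share one pattern: build a plain list of per-`ref_time_value` results with `lapply()`, then bind once with `data.table::rbindlist()`, instead of paying for `dplyr::bind_rows()` on every step inside `purrr::map_dfr()`. A minimal, self-contained sketch of the idea (toy computation, not the package code):

```r
library(data.table)
library(tibble)

ref_time_values <- as.Date("2020-06-01") + 0:199
one_result <- function(rtv) tibble(time_value = rtv, value = sum(rnorm(10)))

# Slower: purrr::map_dfr(ref_time_values, one_result) row-binds incrementally.
# Faster: collect into a list, bind once; rbindlist() returns a data.table,
# which the package then converts back via setDF() and as_tibble().
out <- as_tibble(setDF(rbindlist(lapply(ref_time_values, one_result))))
```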
--- NAMESPACE | 1 - R/epi_df.R | 9 +++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 7ca3befd..7e812e86 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -68,7 +68,6 @@ importFrom(data.table,copy) importFrom(data.table,key) importFrom(data.table,rbindlist) importFrom(data.table,set) -importFrom(data.table,setcolorder) importFrom(data.table,setkeyv) importFrom(dplyr,arrange) importFrom(dplyr,bind_rows) diff --git a/R/epi_df.R b/R/epi_df.R index 42b8f7b7..544fab1b 100644 --- a/R/epi_df.R +++ b/R/epi_df.R @@ -111,8 +111,6 @@ NULL #' @param ... Additional arguments passed to methods. #' @return An `epi_df` object. #' -#' @importFrom data.table setcolorder -#' #' @export new_epi_df = function(x = tibble::tibble(), geo_type, time_type, as_of, additional_metadata = list(), ...) { @@ -161,8 +159,11 @@ new_epi_df = function(x = tibble::tibble(), geo_type, time_type, as_of, # Reorder columns (geo_value, time_value, ...) if(sum(dim(x)) != 0){ - # Moves columns in place - setcolorder(x, c("geo_value", "time_value")) + cols_to_put_first <- colnames(x) %in% c("geo_value", "time_value") + x <- x[, c( + which(cols_to_put_first), + which(!cols_to_put_first) + )] } # Apply epi_df class, attach metadata, and return From 278d410519ea35db13adc4751205305955be2ad2 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 6 Dec 2023 13:21:47 -0500 Subject: [PATCH 023/345] make sure computation values come out as tibble, not DT --- NAMESPACE | 1 + R/grouped_epi_archive.R | 8 ++++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 7e812e86..c240a5a7 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -68,6 +68,7 @@ importFrom(data.table,copy) importFrom(data.table,key) importFrom(data.table,rbindlist) importFrom(data.table,set) +importFrom(data.table,setDF) importFrom(data.table,setkeyv) importFrom(dplyr,arrange) importFrom(dplyr,bind_rows) diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R index 64d7453b..108e0755 100644 --- a/R/grouped_epi_archive.R +++ b/R/grouped_epi_archive.R @@ -185,8 +185,9 @@ grouped_epi_archive = #' @description Slides a given function over variables in a `grouped_epi_archive` #' object. See the documentation for the wrapper function [`epix_slide()`] for #' details. -#' @importFrom data.table key address rbindlist +#' @importFrom data.table key address rbindlist setDF #' @importFrom tibble as_tibble +#' @importFrom dplyr group_by groups #' @importFrom rlang !! !!! 
enquo quo_is_missing enquos is_quosure sym syms #' env missing_arg slide = function(f, ..., before, ref_time_values, @@ -368,7 +369,10 @@ grouped_epi_archive = new_col = new_col, .keep = TRUE) ) - }) %>% rbindlist() + }) + x <- rbindlist(x) %>% setDF() %>% as_tibble() %>% + # Reconstruct groups + group_by(!!!groups(x[[1L]])) # Unchop/unnest if we need to if (!as_list_col) { From 077a7a0e714e367f103ce2c3ed5c7eba6ad0cf38 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 6 Dec 2023 14:27:47 -0500 Subject: [PATCH 024/345] carry over .drop setting for groups, too --- R/grouped_epi_archive.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R index 108e0755..a9710a3e 100644 --- a/R/grouped_epi_archive.R +++ b/R/grouped_epi_archive.R @@ -372,7 +372,7 @@ grouped_epi_archive = }) x <- rbindlist(x) %>% setDF() %>% as_tibble() %>% # Reconstruct groups - group_by(!!!groups(x[[1L]])) + group_by(!!!groups(x[[1L]]), .drop = attr(attr(x[[1L]], "groups"), ".drop")) # Unchop/unnest if we need to if (!as_list_col) { From 5886ec14f65982110cb788c4c34c487cebd23b9b Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 6 Dec 2023 15:07:30 -0500 Subject: [PATCH 025/345] make relocate input-col-order agnostic --- R/epi_df.R | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/R/epi_df.R b/R/epi_df.R index 544fab1b..045c4aaf 100644 --- a/R/epi_df.R +++ b/R/epi_df.R @@ -159,11 +159,13 @@ new_epi_df = function(x = tibble::tibble(), geo_type, time_type, as_of, # Reorder columns (geo_value, time_value, ...) if(sum(dim(x)) != 0){ - cols_to_put_first <- colnames(x) %in% c("geo_value", "time_value") + cols_to_put_first <- c("geo_value", "time_value") x <- x[, c( - which(cols_to_put_first), - which(!cols_to_put_first) - )] + cols_to_put_first, + # All other columns + names(x)[!(names(x) %in% cols_to_put_first)] + ) + ] } # Apply epi_df class, attach metadata, and return From 7d9f97eaf7381120444e269c97428def22ece49e Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 6 Dec 2023 18:13:48 -0500 Subject: [PATCH 026/345] use private$ to access group keys and drop value --- R/grouped_epi_archive.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R index a9710a3e..74bedb8b 100644 --- a/R/grouped_epi_archive.R +++ b/R/grouped_epi_archive.R @@ -372,7 +372,7 @@ grouped_epi_archive = }) x <- rbindlist(x) %>% setDF() %>% as_tibble() %>% # Reconstruct groups - group_by(!!!groups(x[[1L]]), .drop = attr(attr(x[[1L]], "groups"), ".drop")) + group_by(!!!syms(private$vars), .drop=private$drop) # Unchop/unnest if we need to if (!as_list_col) { From d368be5fce2dd7603dff8535533b3d550f7d975d Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 6 Dec 2023 20:14:33 -0500 Subject: [PATCH 027/345] convert list result to tibble altogether --- R/grouped_epi_archive.R | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R index 74bedb8b..82d4ce40 100644 --- a/R/grouped_epi_archive.R +++ b/R/grouped_epi_archive.R @@ -282,9 +282,9 @@ grouped_epi_archive = if (! (is.atomic(comp_value) || is.data.frame(comp_value))) { Abort("The slide computation must return an atomic vector or a data frame.") } - # Convert from data.frame to tibble for speed. 
+ # Label every result row with the `ref_time_value` - res <- as_tibble(data.frame(time_value = ref_time_value)) + res <- list(time_value = ref_time_value) # Wrap the computation output in a list and unchop/unnest later if # `as_list_col = FALSE`. This approach means that we will get a @@ -292,8 +292,9 @@ grouped_epi_archive = # `as_list_col = TRUE` and the computations outputs are data # frames. res[[new_col]] <- list(comp_value) - - return(res) + + # Convert the list to a tibble all at once for speed. + return(as_tibble(res)) } # If `f` is missing, interpret ... as an expression for tidy evaluation From d7c1cba4a393d4ef69901cf7360be6528bffc6b2 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Mon, 11 Dec 2023 16:33:22 -0800 Subject: [PATCH 028/345] Document mutation&aliasing behavior of archive as_of, slide --- R/archive.R | 5 ++++- R/methods-epi_archive.R | 18 ++++++++++++++++-- man/epix_as_of.Rd | 8 ++++++++ man/epix_slide.Rd | 7 +++++++ 4 files changed, 35 insertions(+), 3 deletions(-) diff --git a/R/archive.R b/R/archive.R index 5897fc4d..7a7d8d82 100644 --- a/R/archive.R +++ b/R/archive.R @@ -495,7 +495,10 @@ epi_archive = version <= max_version, ] %>% unique(by = c("geo_value", "time_value", other_keys), fromLast = TRUE) %>% - tibble::as_tibble() %>% + tibble::as_tibble() %>% + # (`as_tibble` should de-alias the DT and its columns in any edge + # cases where they are aliased. We don't say we guarantee this + # though.) dplyr::select(-"version") %>% as_epi_df(geo_type = self$geo_type, time_type = self$time_type, diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index c110555c..ae071e6a 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -32,13 +32,18 @@ #' x$as_of(max_version = v) #' ``` #' -#' @export +#' Mutation and aliasing: `epix_as_of` and `$as_of` will not mutate the input +#' archives, but may in some edge cases alias parts of the inputs, so copy the +#' outputs if needed before using mutating operations like `data.table`'s `:=` +#' operator. Currently, the only situation where there is potentially aliasing +#' is of the `DT` in edge cases with `all_versions = TRUE`, but this may change +#' in the future. +#' #' @examples #' # warning message of data latency shown #' epix_as_of(x = archive_cases_dv_subset, #' max_version = max(archive_cases_dv_subset$DT$version)) #' -#' @export #' @examples #' #' range(archive_cases_dv_subset$DT$version) # 2020-06-02 -- 2021-12-01 @@ -60,6 +65,8 @@ #' }, epiprocess__snapshot_as_of_clobberable_version = function(wrn) invokeRestart("muffleWarning")) #' # Since R 4.0, there is a `globalCallingHandlers` function that can be used #' # to globally toggle these warnings. +#' +#' @export epix_as_of = function(x, max_version, min_time_value = -Inf, all_versions = FALSE) { if (!inherits(x, "epi_archive")) Abort("`x` must be of class `epi_archive`.") return(x$as_of(max_version, min_time_value, all_versions = all_versions)) @@ -798,6 +805,13 @@ group_by.epi_archive = function(.data, ..., .add=FALSE, .drop=dplyr::group_by_dr #' x$slide(new_var = comp(old_var), before = 119) #' ``` #' +#' Mutation and aliasing: `epix_slide` and `$slide` will not mutate the input +#' archives, but may in some edge cases alias parts of the inputs, so copy the +#' outputs if needed before using mutating operations like `data.table`'s `:=` +#' operator. Currently, the only situation where there is potentially aliasing +#' is of the `DT` in edge cases with `all_versions = TRUE`, but this may change +#' in the future. 
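A hedged illustration of the advice above: if a snapshot (or anything derived from it) is going to be mutated in place with `data.table`, take a `copy()` first. The derived column and the version date below are made up for the example; `archive_cases_dv_subset` is the package's bundled archive:

```r
library(epiprocess)
library(data.table)

snapshot <- epix_as_of(archive_cases_dv_subset,
  max_version = as.Date("2021-06-01")
)
dt <- copy(as.data.table(snapshot)) # de-alias before any `:=`
dt[, pct_cli_centered := percent_cli - mean(percent_cli, na.rm = TRUE),
  by = geo_value
]
```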
+#' #' @examples #' library(dplyr) #' diff --git a/man/epix_as_of.Rd b/man/epix_as_of.Rd index dcdf167d..51884597 100644 --- a/man/epix_as_of.Rd +++ b/man/epix_as_of.Rd @@ -45,6 +45,13 @@ is equivalent to: \if{html}{\out{
<div class="sourceCode">}}\preformatted{x$as_of(max_version = v)
}\if{html}{\out{</div>
}} + +Mutation and aliasing: \code{epix_as_of} and \verb{$as_of} will not mutate the input +archives, but may in some edge cases alias parts of the inputs, so copy the +outputs if needed before using mutating operations like \code{data.table}'s \verb{:=} +operator. Currently, the only situation where there is potentially aliasing +is of the \code{DT} in edge cases with \code{all_versions = TRUE}, but this may change +in the future. } \examples{ # warning message of data latency shown @@ -71,4 +78,5 @@ withCallingHandlers({ }, epiprocess__snapshot_as_of_clobberable_version = function(wrn) invokeRestart("muffleWarning")) # Since R 4.0, there is a `globalCallingHandlers` function that can be used # to globally toggle these warnings. + } diff --git a/man/epix_slide.Rd b/man/epix_slide.Rd index c0f07d88..aebe5d12 100644 --- a/man/epix_slide.Rd +++ b/man/epix_slide.Rd @@ -179,6 +179,13 @@ is equivalent to: \if{html}{\out{
<div class="sourceCode">}}\preformatted{x$slide(new_var = comp(old_var), before = 119)
}\if{html}{\out{</div>
}} + +Mutation and aliasing: \code{epix_slide} and \verb{$slide} will not mutate the input +archives, but may in some edge cases alias parts of the inputs, so copy the +outputs if needed before using mutating operations like \code{data.table}'s \verb{:=} +operator. Currently, the only situation where there is potentially aliasing +is of the \code{DT} in edge cases with \code{all_versions = TRUE}, but this may change +in the future. } \examples{ library(dplyr) From 63f8441eb1bb5b0965aa7807487cec3b528cee21 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Mon, 11 Dec 2023 16:39:18 -0800 Subject: [PATCH 029/345] Fix some mutation&aliasing discussion --- R/methods-epi_archive.R | 5 ++--- man/epix_slide.Rd | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index ae071e6a..961e599d 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -808,9 +808,8 @@ group_by.epi_archive = function(.data, ..., .add=FALSE, .drop=dplyr::group_by_dr #' Mutation and aliasing: `epix_slide` and `$slide` will not mutate the input #' archives, but may in some edge cases alias parts of the inputs, so copy the #' outputs if needed before using mutating operations like `data.table`'s `:=` -#' operator. Currently, the only situation where there is potentially aliasing -#' is of the `DT` in edge cases with `all_versions = TRUE`, but this may change -#' in the future. +#' operator. Currently, there should not be any aliasing encountered, but this +#' may change in the future. #' #' @examples #' library(dplyr) diff --git a/man/epix_slide.Rd b/man/epix_slide.Rd index aebe5d12..e71a11f5 100644 --- a/man/epix_slide.Rd +++ b/man/epix_slide.Rd @@ -183,9 +183,8 @@ is equivalent to: Mutation and aliasing: \code{epix_slide} and \verb{$slide} will not mutate the input archives, but may in some edge cases alias parts of the inputs, so copy the outputs if needed before using mutating operations like \code{data.table}'s \verb{:=} -operator. Currently, the only situation where there is potentially aliasing -is of the \code{DT} in edge cases with \code{all_versions = TRUE}, but this may change -in the future. +operator. Currently, there should not be any aliasing encountered, but this +may change in the future. } \examples{ library(dplyr) From 5491a9305e3f288334e48c77d5cd6b6f0d1b8fe9 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Mon, 11 Dec 2023 16:41:51 -0800 Subject: [PATCH 030/345] Fix the fix of mutation & aliasing discussion --- R/methods-epi_archive.R | 5 +++-- man/epix_slide.Rd | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index 961e599d..7a0d4d08 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -808,8 +808,9 @@ group_by.epi_archive = function(.data, ..., .add=FALSE, .drop=dplyr::group_by_dr #' Mutation and aliasing: `epix_slide` and `$slide` will not mutate the input #' archives, but may in some edge cases alias parts of the inputs, so copy the #' outputs if needed before using mutating operations like `data.table`'s `:=` -#' operator. Currently, there should not be any aliasing encountered, but this -#' may change in the future. +#' operator. Currently, there should not be any aliasing encountered except for +#' potentially aliasing of columns in edges cases with `all_versions = TRUE`, +#' but this may change in the future. 
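A sketch of the `all_versions = TRUE` case flagged above: each slide computation then receives an `epi_archive`, so its `DT` should be copied before any in-place work. The computation, its argument names, and the lag summary here are illustrative:

```r
library(epiprocess)
library(data.table)
library(dplyr)

archive_cases_dv_subset %>%
  group_by(geo_value) %>%
  epix_slide(
    function(x, group_key, ref_time_value) {
      dt <- copy(x$DT) # `x` is an epi_archive here; copy before `:=`
      dt[, lag_days := as.integer(version - time_value)]
      data.frame(mean_lag = mean(dt$lag_days, na.rm = TRUE))
    },
    before = 30,
    all_versions = TRUE
  )
```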
#' #' @examples #' library(dplyr) diff --git a/man/epix_slide.Rd b/man/epix_slide.Rd index e71a11f5..9cc8340d 100644 --- a/man/epix_slide.Rd +++ b/man/epix_slide.Rd @@ -183,8 +183,9 @@ is equivalent to: Mutation and aliasing: \code{epix_slide} and \verb{$slide} will not mutate the input archives, but may in some edge cases alias parts of the inputs, so copy the outputs if needed before using mutating operations like \code{data.table}'s \verb{:=} -operator. Currently, there should not be any aliasing encountered, but this -may change in the future. +operator. Currently, there should not be any aliasing encountered except for +potentially aliasing of columns in edges cases with \code{all_versions = TRUE}, +but this may change in the future. } \examples{ library(dplyr) From 1acc0ad6891354bbab7fcff1d28aca0b4585b54c Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Mon, 11 Dec 2023 17:02:57 -0800 Subject: [PATCH 031/345] Add even more notes on aliasing&mutation in archive slide --- R/grouped_epi_archive.R | 7 +++++++ R/methods-epi_archive.R | 13 +++++++------ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R index 82d4ce40..ac1a0002 100644 --- a/R/grouped_epi_archive.R +++ b/R/grouped_epi_archive.R @@ -336,6 +336,13 @@ grouped_epi_archive = # copies. if (address(as_of_archive$DT) == address(private$ungrouped$DT)) { # `as_of` aliased its the full `$DT`; copy before mutating: + # + # Note: this step is probably unneeded; we're fine with + # aliasing of the DT or its columns: vanilla operations aren't + # going to mutate them in-place if they are aliases, and we're + # not performing mutation (unlike the situation with + # `fill_through_version` where we do mutate a `DT` and don't + # want aliasing). as_of_archive$DT <- copy(as_of_archive$DT) } dt_key = data.table::key(as_of_archive$DT) diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index 7a0d4d08..367fe759 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -805,12 +805,13 @@ group_by.epi_archive = function(.data, ..., .add=FALSE, .drop=dplyr::group_by_dr #' x$slide(new_var = comp(old_var), before = 119) #' ``` #' -#' Mutation and aliasing: `epix_slide` and `$slide` will not mutate the input -#' archives, but may in some edge cases alias parts of the inputs, so copy the -#' outputs if needed before using mutating operations like `data.table`'s `:=` -#' operator. Currently, there should not be any aliasing encountered except for -#' potentially aliasing of columns in edges cases with `all_versions = TRUE`, -#' but this may change in the future. +#' Mutation and aliasing: `epix_slide` and `$slide` will not perform in-place +#' mutation of the input archives on their own. In some edge cases the inputs it +#' feeds to the slide computations may alias parts of the input archive, so copy +#' the slide computation inputs if needed before using mutating operations like +#' `data.table`'s `:=` operator. Similarly, in some edge cases, the output of +#' the slide operation may alias parts of the input archive, so similarly, make +#' sure to clone and/or copy appropriately before using in-place mutation. 
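The `address()`-based alias check used in the note above can be seen in isolation: two names alias the same `data.table` exactly when `data.table::address()` agrees, and `copy()` severs the link. A standalone sketch:

```r
library(data.table)

dt1 <- data.table(x = 1:3)
dt2 <- dt1                   # plain assignment aliases; no copy yet
address(dt1) == address(dt2) # TRUE: `dt2[, x := ...]` would also change dt1
dt2 <- copy(dt2)             # explicit deep copy
address(dt1) == address(dt2) # FALSE: in-place mutation of dt2 is now safe
dt2[, x := x * 10L]
dt1$x                        # unchanged: 1 2 3
```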
#' #' @examples #' library(dplyr) From 88df2073fe69f7b402a00059486a3b36eec86d28 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Tue, 12 Dec 2023 11:54:22 -0500 Subject: [PATCH 032/345] remove some pipes --- R/grouped_epi_archive.R | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R index ac1a0002..1905882c 100644 --- a/R/grouped_epi_archive.R +++ b/R/grouped_epi_archive.R @@ -369,16 +369,18 @@ grouped_epi_archive = } return( - dplyr::group_by(as_of_df, !!!syms(private$vars), - .drop=private$drop) %>% - dplyr::group_modify(group_modify_fn, - f = f, ..., - ref_time_value = ref_time_value, - new_col = new_col, - .keep = TRUE) + dplyr::group_modify( + dplyr::group_by(as_of_df, !!!syms(private$vars), .drop=private$drop), + group_modify_fn, + f = f, ..., + ref_time_value = ref_time_value, + new_col = new_col, + .keep = TRUE + ) ) }) - x <- rbindlist(x) %>% setDF() %>% as_tibble() %>% + # Combine output into a single tibble + x <- as_tibble(setDF(rbindlist(x))) %>% # Reconstruct groups group_by(!!!syms(private$vars), .drop=private$drop) From 4ac39807ef85162602a76d21a9700a067bc9f1d0 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Tue, 12 Dec 2023 13:39:55 -0500 Subject: [PATCH 033/345] use new_tibble instead of as_tibble in epix_slide --- NAMESPACE | 1 + R/grouped_epi_archive.R | 4 ++-- man/epix_slide.Rd | 13 +++++++------ 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index c240a5a7..4c943469 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -116,6 +116,7 @@ importFrom(rlang,syms) importFrom(stats,cor) importFrom(stats,median) importFrom(tibble,as_tibble) +importFrom(tibble,new_tibble) importFrom(tidyr,unnest) importFrom(tidyselect,eval_select) importFrom(tidyselect,starts_with) diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R index 1905882c..1b715da1 100644 --- a/R/grouped_epi_archive.R +++ b/R/grouped_epi_archive.R @@ -186,7 +186,7 @@ grouped_epi_archive = #' object. See the documentation for the wrapper function [`epix_slide()`] for #' details. #' @importFrom data.table key address rbindlist setDF -#' @importFrom tibble as_tibble +#' @importFrom tibble as_tibble new_tibble #' @importFrom dplyr group_by groups #' @importFrom rlang !! !!! enquo quo_is_missing enquos is_quosure sym syms #' env missing_arg @@ -294,7 +294,7 @@ grouped_epi_archive = res[[new_col]] <- list(comp_value) # Convert the list to a tibble all at once for speed. - return(as_tibble(res)) + return(new_tibble(res)) } # If `f` is missing, interpret ... as an expression for tidy evaluation diff --git a/man/epix_slide.Rd b/man/epix_slide.Rd index 9cc8340d..d94460af 100644 --- a/man/epix_slide.Rd +++ b/man/epix_slide.Rd @@ -180,12 +180,13 @@ is equivalent to: \if{html}{\out{
<div class="sourceCode">}}\preformatted{x$slide(new_var = comp(old_var), before = 119)
}\if{html}{\out{</div>
}} -Mutation and aliasing: \code{epix_slide} and \verb{$slide} will not mutate the input -archives, but may in some edge cases alias parts of the inputs, so copy the -outputs if needed before using mutating operations like \code{data.table}'s \verb{:=} -operator. Currently, there should not be any aliasing encountered except for -potentially aliasing of columns in edges cases with \code{all_versions = TRUE}, -but this may change in the future. +Mutation and aliasing: \code{epix_slide} and \verb{$slide} will not perform in-place +mutation of the input archives on their own. In some edge cases the inputs it +feeds to the slide computations may alias parts of the input archive, so copy +the slide computation inputs if needed before using mutating operations like +\code{data.table}'s \verb{:=} operator. Similarly, in some edge cases, the output of +the slide operation may alias parts of the input archive, so similarly, make +sure to clone and/or copy appropriately before using in-place mutation. } \examples{ library(dplyr) From 1acd8d3d814d7cedc5b95bafcaadd6cb68a41363 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Tue, 12 Dec 2023 13:56:11 -0500 Subject: [PATCH 034/345] more pipe removal --- R/grouped_epi_archive.R | 6 +++--- R/slide.R | 6 ++---- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R index 1b715da1..09054527 100644 --- a/R/grouped_epi_archive.R +++ b/R/grouped_epi_archive.R @@ -380,9 +380,9 @@ grouped_epi_archive = ) }) # Combine output into a single tibble - x <- as_tibble(setDF(rbindlist(x))) %>% - # Reconstruct groups - group_by(!!!syms(private$vars), .drop=private$drop) + x <- as_tibble(setDF(rbindlist(x))) + # Reconstruct groups + x <- group_by(x, !!!syms(private$vars), .drop=private$drop) # Unchop/unnest if we need to if (!as_list_col) { diff --git a/R/slide.R b/R/slide.R index 7467f219..0feb689a 100644 --- a/R/slide.R +++ b/R/slide.R @@ -300,8 +300,7 @@ epi_slide = function(x, f, ..., before, after, ref_time_values, # Count the number of appearances of each reference time value (these # appearances should all be real for now, but if we allow ref time values # outside of .data_group's time values): - counts = .data_group %>% - dplyr::filter(.data$time_value %in% time_values) %>% + counts = dplyr::filter(.data_group, .data$time_value %in% time_values) %>% dplyr::count(.data$time_value) %>% dplyr::pull(n) @@ -375,8 +374,7 @@ epi_slide = function(x, f, ..., before, after, ref_time_values, .x$.real <- NULL f(.x, .group_key, .ref_time_value, ...) 
} - x = x %>% - group_modify(slide_one_grp, + x = group_modify(x, slide_one_grp, f = f_wrapper, ..., starts = starts, stops = stops, From c6863ea911093a9f1e6f818a0b9152912177084a Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 13 Dec 2023 18:01:01 -0500 Subject: [PATCH 035/345] validate new_tibble output for safety --- NAMESPACE | 1 + R/grouped_epi_archive.R | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 4c943469..73db3483 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -117,6 +117,7 @@ importFrom(stats,cor) importFrom(stats,median) importFrom(tibble,as_tibble) importFrom(tibble,new_tibble) +importFrom(tibble,validate_tibble) importFrom(tidyr,unnest) importFrom(tidyselect,eval_select) importFrom(tidyselect,starts_with) diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R index 09054527..b11bf821 100644 --- a/R/grouped_epi_archive.R +++ b/R/grouped_epi_archive.R @@ -186,7 +186,7 @@ grouped_epi_archive = #' object. See the documentation for the wrapper function [`epix_slide()`] for #' details. #' @importFrom data.table key address rbindlist setDF -#' @importFrom tibble as_tibble new_tibble +#' @importFrom tibble as_tibble new_tibble validate_tibble #' @importFrom dplyr group_by groups #' @importFrom rlang !! !!! enquo quo_is_missing enquos is_quosure sym syms #' env missing_arg @@ -294,7 +294,7 @@ grouped_epi_archive = res[[new_col]] <- list(comp_value) # Convert the list to a tibble all at once for speed. - return(new_tibble(res)) + return(validate_tibble(new_tibble(res))) } # If `f` is missing, interpret ... as an expression for tidy evaluation From 6007722d5d8bd43f37c6da3a04507a034361790c Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Wed, 3 Jan 2024 19:13:41 -0800 Subject: [PATCH 036/345] docs: mirror epidatr development doc --- DEVELOPMENT.md | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 DEVELOPMENT.md diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md new file mode 100644 index 00000000..a344b78e --- /dev/null +++ b/DEVELOPMENT.md @@ -0,0 +1,59 @@ +## Setting up the development environment + +```r +install.packages(c('devtools', 'pkgdown', 'styler', 'lintr')) # install dev dependencies +devtools::install_deps(dependencies = TRUE) # install package dependencies +devtools::document() # generate package meta data and man files +devtools::build() # build package +``` + +## Validating the package + +```r +styler::style_pkg() # format code +lintr::lint_package() # lint code + +devtools::test() # test package +devtools::check() # check package for errors +``` + +## Developing the documentation site + +The [documentation site](https://cmu-delphi.github.io/epidatr/) is built off of the `main` branch. The `dev` version of the site is available at https://cmu-delphi.github.io/epidatr/dev. + +The documentation site can be previewed locally by running in R + +```r +pkgdown::build_site(preview=TRUE) +``` + +The `main` version is available at `file:////epidatr/docs/index.html` and `dev` at `file:////epidatr/docs/dev/index.html`. + +You can also build the docs manually and launch the site with python. From the terminal, this looks like +```bash +R -e 'devtools::document()' +python -m http.server -d docs +``` + +For `pkgdown` to correctly generate both public (`main`) and `dev` documentation sites, the package version in `DESCRIPTION` on `dev` must have four components, and be of the format `x.x.x.9000`. 
The package version on `main` must be in the format `x.x.x`. + +The documentation website is updated on push or pull request to the `main` and `dev` branches. + +## Release process + +### Manual + +TBD + +### Automated (currently unavailable) + +The release consists of multiple steps which can be all done via the GitHub website: + +1. Go to [create_release GitHub Action](https://github.com/cmu-delphi/epidatr/actions/workflows/create_release.yml) and click the `Run workflow` button. Enter the next version number or one of the magic keywords (patch, minor, major) and hit the green `Run workflow` button. +2. The action will prepare a new release and will end up with a new [Pull Request](https://github.com/cmu-delphi/epidatr/pulls) +3. Let the code owner review the PR and its changes and let the CI check whether everything builds successfully +4. Once approved and merged, another GitHub action job starts which automatically will + 1. create a git tag + 2. create another [Pull Request](https://github.com/cmu-delphi/epidatr/pulls) to merge the changes back to the `dev` branch + 3. create a [GitHub release](https://github.com/cmu-delphi/epidatr/releases) with automatically derived release notes +5. Release to CRAN From 302a0ef266fc5c87a4fee1b89806837232165994 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Wed, 3 Jan 2024 19:17:04 -0800 Subject: [PATCH 037/345] docs: minor nit --- NEWS.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index f303b073..5b338b47 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,4 @@ - -# epiprocess 0.7.0.9999 +# epiprocess 0.7.0.9000 ## Improvements From 39225f09f5e738fd2eb5c4e719a1665a8220ab8b Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Mon, 8 Jan 2024 16:36:50 -0800 Subject: [PATCH 038/345] patch: get select working with grouping --- DESCRIPTION | 1 + NAMESPACE | 1 + tests/testthat/test-epi_df.R | 22 +++++++++++++++++++--- 3 files changed, 21 insertions(+), 3 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index dc48eb86..339d5681 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -76,6 +76,7 @@ Collate: 'data.R' 'epi_df.R' 'epiprocess.R' + 'group_by_epi_df_methods.R' 'methods-epi_archive.R' 'grouped_epi_archive.R' 'growth_rate.R' diff --git a/NAMESPACE b/NAMESPACE index 73db3483..c59004c8 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -23,6 +23,7 @@ S3method(groups,grouped_epi_archive) S3method(next_after,Date) S3method(next_after,integer) S3method(print,epi_df) +S3method(select,epi_df) S3method(summary,epi_df) S3method(ungroup,epi_df) S3method(ungroup,grouped_epi_archive) diff --git a/tests/testthat/test-epi_df.R b/tests/testthat/test-epi_df.R index 10b0015e..047438bb 100644 --- a/tests/testthat/test-epi_df.R +++ b/tests/testthat/test-epi_df.R @@ -36,6 +36,22 @@ test_that("as_epi_df errors when additional_metadata is not a list", { pol = rep(c("blue", "swing", "swing"), each = 2)) expect_error( - as_epi_df(ex_input, additional_metadata = c(other_keys = "state", "pol")), - "`additional_metadata` must be a list type.") -}) \ No newline at end of file + as_epi_df(ex_input, additional_metadata = c(other_keys = "state", "pol")), + "`additional_metadata` must be a list type." 
+ ) +}) + + +test_that("grouped epi_df maintains type for select", { + tib <- tibble::tibble( + x = 1:10, y = 1:10, + time_value = rep(seq(as.Date("2020-01-01"), by = 1, length.out = 5), times = 2), + geo_value = rep(c("ca", "hi"), each = 5) + ) + + epi_tib <- epiprocess::new_epi_df(tib) + epi_tib + grouped_epi <- epi_tib %>% group_by(geo_value) + selected_df <- grouped_epi %>% select(-y) + expect_true("epi_df" %in% class(selected_df)) +}) From 05aa645469588be96b5c6706722f6e5ab6f2a6f2 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Tue, 9 Jan 2024 10:34:29 -0800 Subject: [PATCH 039/345] need the actual method --- R/group_by_epi_df_methods.R | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 R/group_by_epi_df_methods.R diff --git a/R/group_by_epi_df_methods.R b/R/group_by_epi_df_methods.R new file mode 100644 index 00000000..29e41762 --- /dev/null +++ b/R/group_by_epi_df_methods.R @@ -0,0 +1,10 @@ +#' @importFrom dplyr select +#' @export +select.epi_df <- function(.data, ...) { + selected <- NextMethod(.data) + return(dplyr_reconstruct(selected, .data)) +} + +# others to consider: +# - arrange +# - From ac5f369839190ccdb604a0413a098651bf2352fa Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Tue, 9 Jan 2024 19:04:37 -0800 Subject: [PATCH 040/345] more thorough testing --- tests/testthat/test-epi_df.R | 85 ++++++++++++++++++++++++++++-------- 1 file changed, 68 insertions(+), 17 deletions(-) diff --git a/tests/testthat/test-epi_df.R b/tests/testthat/test-epi_df.R index 047438bb..fbd31f7b 100644 --- a/tests/testthat/test-epi_df.R +++ b/tests/testthat/test-epi_df.R @@ -1,23 +1,27 @@ test_that("new_epi_df works as intended", { # Empty tibble - wmsg = capture_warnings(a <- new_epi_df()) - expect_match(wmsg[1], - "Unknown or uninitialised column: `geo_value`.") - expect_match(wmsg[2], - "Unknown or uninitialised column: `time_value`.") + wmsg <- capture_warnings(a <- new_epi_df()) + expect_match( + wmsg[1], + "Unknown or uninitialised column: `geo_value`." + ) + expect_match( + wmsg[2], + "Unknown or uninitialised column: `time_value`." + ) expect_true(is_epi_df(a)) expect_identical(attributes(a)$metadata$geo_type, "custom") expect_identical(attributes(a)$metadata$time_type, "custom") expect_true(lubridate::is.POSIXt(attributes(a)$metadata$as_of)) - + # Simple non-empty tibble with geo_value and time_value cols tib <- tibble::tibble( x = 1:10, y = 1:10, time_value = rep(seq(as.Date("2020-01-01"), by = 1, length.out = 5), times = 2), geo_value = rep(c("ca", "hi"), each = 5) ) - - epi_tib = new_epi_df(tib) + + epi_tib <- new_epi_df(tib) expect_true(is_epi_df(epi_tib)) expect_length(epi_tib, 4L) expect_identical(attributes(epi_tib)$metadata$geo_type, "state") @@ -32,26 +36,73 @@ test_that("as_epi_df errors when additional_metadata is not a list", { dplyr::slice_tail(n = 6) %>% tsibble::as_tsibble() %>% dplyr::mutate( - state = rep("MA",6), - pol = rep(c("blue", "swing", "swing"), each = 2)) - + state = rep("MA", 6), + pol = rep(c("blue", "swing", "swing"), each = 2) + ) + expect_error( as_epi_df(ex_input, additional_metadata = c(other_keys = "state", "pol")), "`additional_metadata` must be a list type." 
) }) +# select fixes +tib <- tibble::tibble( + x = 1:10, y = 1:10, + time_value = rep(seq(as.Date("2020-01-01"), + by = 1, length.out = 5 + ), times = 2), + geo_value = rep(c("ca", "hi"), each = 5) +) +epi_tib <- epiprocess::new_epi_df(tib) test_that("grouped epi_df maintains type for select", { + grouped_epi <- epi_tib %>% group_by(geo_value) + selected_df <- grouped_epi %>% select(-y) + expect_true("epi_df" %in% class(selected_df)) + # make sure that the attributes are right + epi_attr <- attributes(selected_df) + expect_identical(epi_attr$names, c("geo_value", "time_value", "x")) + expect_identical(epi_attr$row.names, seq(1, 10)) + expect_identical(epi_attr$groups, attributes(grouped_epi)$groups) + expect_identical(epi_attr$metadata, attributes(epi_tib)$metadata) +}) + +test_that("grouped epi_df drops type when dropping keys", { + grouped_epi <- epi_tib %>% group_by(geo_value) + selected_df <- grouped_epi %>% select(geo_value) + expect_true(!("epi_df" %in% class(selected_df))) +}) + +test_that("grouped epi_df handles extra keys correctly", { tib <- tibble::tibble( x = 1:10, y = 1:10, - time_value = rep(seq(as.Date("2020-01-01"), by = 1, length.out = 5), times = 2), - geo_value = rep(c("ca", "hi"), each = 5) + time_value = rep(seq(as.Date("2020-01-01"), + by = 1, length.out = 5 + ), times = 2), + geo_value = rep(c("ca", "hi"), each = 5), + extra_key = rep(seq(as.Date("2020-01-01"), + by = 1, length.out = 5 + ), times = 2) ) - - epi_tib <- epiprocess::new_epi_df(tib) - epi_tib + epi_tib <- epiprocess::new_epi_df(tib, + additional_metadata = list(other_keys = "extra_key") + ) + attributes(epi_tib) grouped_epi <- epi_tib %>% group_by(geo_value) - selected_df <- grouped_epi %>% select(-y) + selected_df <- grouped_epi %>% select(-extra_key) + selected_df expect_true("epi_df" %in% class(selected_df)) + # make sure that the attributes are right + old_attr <- attributes(epi_tib) + epi_attr <- attributes(selected_df) + expect_identical(epi_attr$names, c("geo_value", "time_value", "x", "y")) + expect_identical(epi_attr$row.names, seq(1, 10)) + expect_identical(epi_attr$groups, attributes(grouped_epi)$groups) + expect_identical(epi_attr$metadata, list( + geo_type = "state", time_type = + "day", + as_of = old_attr$metadata$as_of, + other_keys = character(0) + )) }) From b9bac788b3c50555b885cc0d4c3b7fefe62b68e1 Mon Sep 17 00:00:00 2001 From: "Logan C. 
Brooks" Date: Fri, 12 Jan 2024 18:37:16 -0800 Subject: [PATCH 041/345] Fix `names<-.epi_df`, introduce failing test for renaming in `select` --- R/epi_df.R | 5 ++++- R/group_by_epi_df_methods.R | 2 +- R/methods-epi_df.R | 11 +++++++++-- tests/testthat/test-methods-epi_df.R | 14 ++++++++++++++ 4 files changed, 28 insertions(+), 4 deletions(-) diff --git a/R/epi_df.R b/R/epi_df.R index 045c4aaf..53dca62b 100644 --- a/R/epi_df.R +++ b/R/epi_df.R @@ -122,6 +122,9 @@ new_epi_df = function(x = tibble::tibble(), geo_type, time_type, as_of, if (!is.list(additional_metadata)) { Abort("`additional_metadata` must be a list type.") } + if (is.null(additional_metadata[["other_keys"]])) { + additional_metadata[["other_keys"]] <- character(0L) + } # If geo type is missing, then try to guess it if (missing(geo_type)) { @@ -334,4 +337,4 @@ as_epi_df.tbl_ts = function(x, geo_type, time_type, as_of, #' @export is_epi_df = function(x) { inherits(x, "epi_df") -} \ No newline at end of file +} diff --git a/R/group_by_epi_df_methods.R b/R/group_by_epi_df_methods.R index 29e41762..8d02a887 100644 --- a/R/group_by_epi_df_methods.R +++ b/R/group_by_epi_df_methods.R @@ -2,7 +2,7 @@ #' @export select.epi_df <- function(.data, ...) { selected <- NextMethod(.data) - return(dplyr_reconstruct(selected, .data)) + return (dplyr_reconstruct(selected, .data)) } # others to consider: diff --git a/R/methods-epi_df.R b/R/methods-epi_df.R index 6429b867..82acc107 100644 --- a/R/methods-epi_df.R +++ b/R/methods-epi_df.R @@ -180,9 +180,16 @@ dplyr_row_slice.epi_df = function(data, i, ...) { #' @export `names<-.epi_df` = function(x, value) { old_names = names(x) - old_other_keys = attributes(x)$metadata$other_keys + old_other_keys = attr(x, "metadata")[["other_keys"]] result = NextMethod() - attributes(x)$metadata$other_keys <- value[match(old_other_keys, old_names)] + new_other_keys_almost <- value[match(old_other_keys, old_names)] + attr(result, "metadata")[["other_keys"]] <- + # patch until we require `other_keys` to be `chr`; match NULL-ness of input `other_keys`: + # if (length(new_other_keys_almost) == 0L) NULL + # if (is.null(old_other_keys)) NULL + # else new_other_keys_almost + new_other_keys_almost + # decay to non-`epi_df` if needed: dplyr::dplyr_reconstruct(result, result) } diff --git a/tests/testthat/test-methods-epi_df.R b/tests/testthat/test-methods-epi_df.R index 9d03cf93..c9e38fff 100644 --- a/tests/testthat/test-methods-epi_df.R +++ b/tests/testthat/test-methods-epi_df.R @@ -124,3 +124,17 @@ test_that("Metadata and grouping are dropped by `as_tibble`", { !any(c("metadata", "groups") %in% names(attributes(grouped_converted))) ) }) + +test_that("Renaming columns gives appropriate colnames and metadata", { + edf <- tibble::tibble(geo_value = 1, time_value = 1, age = 1, value = 1) %>% + as_epi_df(additional_metadata = list(other_keys = "age")) + renamed_edf1 <- edf %>% + `[`(c("geo_value", "time_value", "age", "value")) %>% + `names<-`(c("geo_value", "time_value", "age_group", "value")) + expect_identical(names(renamed_edf1), c("geo_value", "time_value", "age_group", "value")) + expect_identical(attr(renamed_edf1, "metadata")$other_keys, c("age_group")) + renamed_edf2 <- edf %>% + as_epi_df(additional_metadata = list(other_keys = "age")) %>% + select(geo_value, time_value, age_group = age, value) + expect_identical(renamed_edf1, renamed_edf2) +}) From 42eb793d02332bcf6773860869e3cc91b16dff6f Mon Sep 17 00:00:00 2001 From: "Logan C. 
Brooks" Date: Fri, 12 Jan 2024 18:43:40 -0800 Subject: [PATCH 042/345] Clean up some commented-out code --- R/methods-epi_df.R | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/R/methods-epi_df.R b/R/methods-epi_df.R index 82acc107..5dea964c 100644 --- a/R/methods-epi_df.R +++ b/R/methods-epi_df.R @@ -182,13 +182,8 @@ dplyr_row_slice.epi_df = function(data, i, ...) { old_names = names(x) old_other_keys = attr(x, "metadata")[["other_keys"]] result = NextMethod() - new_other_keys_almost <- value[match(old_other_keys, old_names)] - attr(result, "metadata")[["other_keys"]] <- - # patch until we require `other_keys` to be `chr`; match NULL-ness of input `other_keys`: - # if (length(new_other_keys_almost) == 0L) NULL - # if (is.null(old_other_keys)) NULL - # else new_other_keys_almost - new_other_keys_almost + new_other_keys <- value[match(old_other_keys, old_names)] + attr(result, "metadata")[["other_keys"]] <- new_other_keys # decay to non-`epi_df` if needed: dplyr::dplyr_reconstruct(result, result) } From 510e6d45b626b684265f13fa337402f284d31343 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Fri, 12 Jan 2024 19:14:29 -0800 Subject: [PATCH 043/345] Fix `names<-` metadata for grouped `epi_df`s --- R/methods-epi_df.R | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/R/methods-epi_df.R b/R/methods-epi_df.R index 5dea964c..6e4666e7 100644 --- a/R/methods-epi_df.R +++ b/R/methods-epi_df.R @@ -179,11 +179,13 @@ dplyr_row_slice.epi_df = function(data, i, ...) { #' @export `names<-.epi_df` = function(x, value) { - old_names = names(x) - old_other_keys = attr(x, "metadata")[["other_keys"]] - result = NextMethod() + old_names <- names(x) + old_metadata <- attr(x, "metadata") + old_other_keys <- old_metadata[["other_keys"]] new_other_keys <- value[match(old_other_keys, old_names)] - attr(result, "metadata")[["other_keys"]] <- new_other_keys + new_metadata <- old_metadata + new_metadata[["other_keys"]] <- new_other_keys + result <- reclass(NextMethod(), new_metadata) # decay to non-`epi_df` if needed: dplyr::dplyr_reconstruct(result, result) } From 9cc34370156f5f4d7862c25ecaef5bdf816740cd Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Fri, 12 Jan 2024 19:16:09 -0800 Subject: [PATCH 044/345] Fix grouped `epi_df` `select` when renaming --- R/group_by_epi_df_methods.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/group_by_epi_df_methods.R b/R/group_by_epi_df_methods.R index 8d02a887..acdab378 100644 --- a/R/group_by_epi_df_methods.R +++ b/R/group_by_epi_df_methods.R @@ -2,7 +2,8 @@ #' @export select.epi_df <- function(.data, ...) { selected <- NextMethod(.data) - return (dplyr_reconstruct(selected, .data)) + might_decay <- reclass(selected, attr(selected, "metadata")) + return(dplyr_reconstruct(might_decay, might_decay)) } # others to consider: From 113121fdf5ac4119bf387db63948496bbb2378c7 Mon Sep 17 00:00:00 2001 From: "Logan C. 
Brooks" Date: Fri, 12 Jan 2024 19:32:15 -0800 Subject: [PATCH 045/345] Add grouped epi_df names<- test, spruce up related tests --- tests/testthat/test-epi_df.R | 11 +++++------ tests/testthat/test-methods-epi_df.R | 17 +++++++++++++++++ 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/tests/testthat/test-epi_df.R b/tests/testthat/test-epi_df.R index fbd31f7b..decd6fd7 100644 --- a/tests/testthat/test-epi_df.R +++ b/tests/testthat/test-epi_df.R @@ -59,19 +59,20 @@ epi_tib <- epiprocess::new_epi_df(tib) test_that("grouped epi_df maintains type for select", { grouped_epi <- epi_tib %>% group_by(geo_value) selected_df <- grouped_epi %>% select(-y) - expect_true("epi_df" %in% class(selected_df)) + expect_true(inherits(selected_df, "epi_df")) # make sure that the attributes are right epi_attr <- attributes(selected_df) expect_identical(epi_attr$names, c("geo_value", "time_value", "x")) expect_identical(epi_attr$row.names, seq(1, 10)) expect_identical(epi_attr$groups, attributes(grouped_epi)$groups) expect_identical(epi_attr$metadata, attributes(epi_tib)$metadata) + expect_identical(selected_df, epi_tib %>% select(-y) %>% group_by(geo_value)) }) test_that("grouped epi_df drops type when dropping keys", { grouped_epi <- epi_tib %>% group_by(geo_value) selected_df <- grouped_epi %>% select(geo_value) - expect_true(!("epi_df" %in% class(selected_df))) + expect_true(!inherits(selected_df, "epi_df")) }) test_that("grouped epi_df handles extra keys correctly", { @@ -91,8 +92,7 @@ test_that("grouped epi_df handles extra keys correctly", { attributes(epi_tib) grouped_epi <- epi_tib %>% group_by(geo_value) selected_df <- grouped_epi %>% select(-extra_key) - selected_df - expect_true("epi_df" %in% class(selected_df)) + expect_true(inherits(selected_df, "epi_df")) # make sure that the attributes are right old_attr <- attributes(epi_tib) epi_attr <- attributes(selected_df) @@ -100,8 +100,7 @@ test_that("grouped epi_df handles extra keys correctly", { expect_identical(epi_attr$row.names, seq(1, 10)) expect_identical(epi_attr$groups, attributes(grouped_epi)$groups) expect_identical(epi_attr$metadata, list( - geo_type = "state", time_type = - "day", + geo_type = "state", time_type = "day", as_of = old_attr$metadata$as_of, other_keys = character(0) )) diff --git a/tests/testthat/test-methods-epi_df.R b/tests/testthat/test-methods-epi_df.R index c9e38fff..6be7e89b 100644 --- a/tests/testthat/test-methods-epi_df.R +++ b/tests/testthat/test-methods-epi_df.R @@ -138,3 +138,20 @@ test_that("Renaming columns gives appropriate colnames and metadata", { select(geo_value, time_value, age_group = age, value) expect_identical(renamed_edf1, renamed_edf2) }) + +test_that("Renaming columns while grouped gives appropriate colnames and metadata", { + gedf <- tibble::tibble(geo_value = 1, time_value = 1, age = 1, value = 1) %>% + as_epi_df(additional_metadata = list(other_keys = "age")) %>% + group_by(geo_value) + renamed_gedf1 <- gedf %>% + `[`(c("geo_value", "time_value", "age", "value")) %>% + `names<-`(c("geo_value", "time_value", "age_group", "value")) + expect_true(inherits(renamed_gedf1, "epi_df")) + expect_true(inherits(renamed_gedf1, "grouped_df")) + expect_identical(names(renamed_gedf1), c("geo_value", "time_value", "age_group", "value")) + expect_identical(attr(renamed_gedf1, "metadata")$other_keys, c("age_group")) + renamed_gedf2 <- gedf %>% + as_epi_df(additional_metadata = list(other_keys = "age")) %>% + select(geo_value, time_value, age_group = age, value) + expect_identical(renamed_gedf1, 
renamed_gedf2)
+})

From 0bf702f11cfd9ac13a65b7a1b62384937d38e33e Mon Sep 17 00:00:00 2001
From: "Logan C. Brooks"
Date: Fri, 12 Jan 2024 19:41:32 -0800
Subject: [PATCH 046/345] Comment adding context for `group_by_epi_df_methods.R`

---
 R/group_by_epi_df_methods.R | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/R/group_by_epi_df_methods.R b/R/group_by_epi_df_methods.R
index acdab378..b531178f 100644
--- a/R/group_by_epi_df_methods.R
+++ b/R/group_by_epi_df_methods.R
@@ -1,3 +1,9 @@
+# These methods (and maybe some others in methods-epi_df.R) are here to augment
+# `?dplyr_extending` implementations to get the correct behavior on grouped
+# `epi_df`s. It would be nice if there were a way to replace these with a
+# generic core that automatically fixed all misbehaving methods; see
+# brainstorming within Issue #223.
+
 #' @importFrom dplyr select
 #' @export
 select.epi_df <- function(.data, ...) {

From ff3dfae096a10b26f494a2d6427cafdafe83b14f Mon Sep 17 00:00:00 2001
From: dsweber2
Date: Tue, 16 Jan 2024 12:48:46 -0800
Subject: [PATCH 047/345] doc: minor annotations

---
 tests/testthat/test-methods-epi_df.R | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tests/testthat/test-methods-epi_df.R b/tests/testthat/test-methods-epi_df.R
index 6be7e89b..aeb08ced 100644
--- a/tests/testthat/test-methods-epi_df.R
+++ b/tests/testthat/test-methods-epi_df.R
@@ -128,11 +128,13 @@ test_that("Renaming columns gives appropriate colnames and metadata", {
   edf <- tibble::tibble(geo_value = 1, time_value = 1, age = 1, value = 1) %>%
     as_epi_df(additional_metadata = list(other_keys = "age"))
+  # renaming using base R
   renamed_edf1 <- edf %>%
     `[`(c("geo_value", "time_value", "age", "value")) %>%
     `names<-`(c("geo_value", "time_value", "age_group", "value"))
   expect_identical(names(renamed_edf1), c("geo_value", "time_value", "age_group", "value"))
   expect_identical(attr(renamed_edf1, "metadata")$other_keys, c("age_group"))
+  # renaming using select
   renamed_edf2 <- edf %>%
     as_epi_df(additional_metadata = list(other_keys = "age")) %>%
     select(geo_value, time_value, age_group = age, value)
@@ -143,13 +145,17 @@ test_that("Renaming columns while grouped gives appropriate colnames and metadat
   gedf <- tibble::tibble(geo_value = 1, time_value = 1, age = 1, value = 1) %>%
     as_epi_df(additional_metadata = list(other_keys = "age")) %>%
     group_by(geo_value)
+  # renaming using base R
   renamed_gedf1 <- gedf %>%
     `[`(c("geo_value", "time_value", "age", "value")) %>%
     `names<-`(c("geo_value", "time_value", "age_group", "value"))
+  # test type preservation
   expect_true(inherits(renamed_gedf1, "epi_df"))
   expect_true(inherits(renamed_gedf1, "grouped_df"))
+  # the names are right
   expect_identical(names(renamed_gedf1), c("geo_value", "time_value", "age_group", "value"))
   expect_identical(attr(renamed_gedf1, "metadata")$other_keys, c("age_group"))
+  # renaming using select
   renamed_gedf2 <- gedf %>%
     as_epi_df(additional_metadata = list(other_keys = "age")) %>%
     select(geo_value, time_value, age_group = age, value)

From 58ed6b45f77eb252ee5f721ff0ae8ef93888e4a6 Mon Sep 17 00:00:00 2001
From: Dmitry Shemetov
Date: Thu, 16 Nov 2023 11:00:14 -0800
Subject: [PATCH 048/345] Update R/epi_df.R

Co-authored-by: brookslogan
---
 R/epi_df.R | 190 +++++++++++++++++++++++++++++------------------------
 1 file changed, 104 insertions(+), 86 deletions(-)

diff --git a/R/epi_df.R b/R/epi_df.R
index 53dca62b..91e6c9d9 100644
--- a/R/epi_df.R
+++ b/R/epi_df.R
@@ -87,7 +87,7 @@ NULL
 #' Creates an 
`epi_df` object #' -#' Creates a new `epi_df` object. By default, builds an empty tibble with the +#' Creates a new `epi_df` object. By default, builds an empty tibble with the #' correct metadata for an `epi_df` object (ie. `geo_type`, `time_type`, and `as_of`). #' Refer to the below info. about the arguments for more details. #' @@ -107,18 +107,18 @@ NULL #' `epi_df` object. The metadata will have `geo_type`, `time_type`, and #' `as_of` fields; named entries from the passed list will be included as #' well. If your tibble has additional keys, be sure to specify them as a -#' character vector in the `other_keys` component of `additional_metadata`. +#' character vector in the `other_keys` component of `additional_metadata`. #' @param ... Additional arguments passed to methods. #' @return An `epi_df` object. -#' +#' #' @export -new_epi_df = function(x = tibble::tibble(), geo_type, time_type, as_of, - additional_metadata = list(), ...) { +new_epi_df <- function(x = tibble::tibble(), geo_type, time_type, as_of, + additional_metadata = list(), ...) { # Check that we have a data frame if (!is.data.frame(x)) { Abort("`x` must be a data frame.") } - + if (!is.list(additional_metadata)) { Abort("`additional_metadata` must be a list type.") } @@ -128,52 +128,55 @@ new_epi_df = function(x = tibble::tibble(), geo_type, time_type, as_of, # If geo type is missing, then try to guess it if (missing(geo_type)) { - geo_type = guess_geo_type(x$geo_value) + geo_type <- guess_geo_type(x$geo_value) } - + # If time type is missing, then try to guess it if (missing(time_type)) { - time_type = guess_time_type(x$time_value) + time_type <- guess_time_type(x$time_value) } - + # If as_of is missing, then try to guess it if (missing(as_of)) { # First check the metadata for an as_of field if ("metadata" %in% names(attributes(x)) && - "as_of" %in% names(attributes(x)$metadata)) { - as_of = attributes(x)$metadata$as_of + "as_of" %in% names(attributes(x)$metadata)) { + as_of <- attributes(x)$metadata$as_of } - + # Next check for as_of, issue, or version columns - else if ("as_of" %in% names(x)) as_of = max(x$as_of) - else if ("issue" %in% names(x)) as_of = max(x$issue) - else if ("version" %in% names(x)) as_of = max(x$version) - - # If we got here then we failed - else as_of = Sys.time() # Use the current day-time + else if ("as_of" %in% names(x)) { + as_of <- max(x$as_of) + } else if ("issue" %in% names(x)) { + as_of <- max(x$issue) + } else if ("version" %in% names(x)) { + as_of <- max(x$version) + } # If we got here then we failed + else { + as_of <- Sys.time() + } # Use the current day-time } - + # Define metadata fields - metadata = list() - metadata$geo_type = geo_type - metadata$time_type = time_type - metadata$as_of = as_of - metadata = c(metadata, additional_metadata) - + metadata <- list() + metadata$geo_type <- geo_type + metadata$time_type <- time_type + metadata$as_of <- as_of + metadata <- c(metadata, additional_metadata) + # Reorder columns (geo_value, time_value, ...) - if(sum(dim(x)) != 0){ + if (sum(dim(x)) != 0) { cols_to_put_first <- c("geo_value", "time_value") x <- x[, c( cols_to_put_first, # All other columns names(x)[!(names(x) %in% cols_to_put_first)] - ) - ] + )] } - + # Apply epi_df class, attach metadata, and return - class(x) = c("epi_df", class(x)) - attributes(x)$metadata = metadata + class(x) <- c("epi_df", class(x)) + attributes(x)$metadata <- metadata return(x) } @@ -205,77 +208,85 @@ new_epi_df = function(x = tibble::tibble(), geo_type, time_type, as_of, #' @return An `epi_df` object. 
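# A brief sketch of the `as_of` guessing logic restyled above, assuming
# `epiprocess` and `tibble` are attached: when `as_of` is missing and the
# input has an `issue` column, the maximum issue date becomes the metadata
# `as_of`.
x <- tibble::tibble(
  geo_value = "ca",
  time_value = as.Date("2020-01-01") + 0:1,
  issue = as.Date("2020-01-05") + 0:1,
  value = 1:2
)
attr(as_epi_df(x), "metadata")$as_of # inferred as max(x$issue): 2020-01-06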
#' #' @export -#' @examples +#' @examples #' # Convert a `tsibble` that has county code as an extra key #' # Notice that county code should be a character string to preserve any leading zeroes -#' +#' #' ex1_input <- tibble::tibble( #' geo_value = rep(c("ca", "fl", "pa"), each = 3), -#' county_code = c("06059","06061","06067", -#' "12111","12113","12117", -#' "42101", "42103","42105"), +#' county_code = c( +#' "06059", "06061", "06067", +#' "12111", "12113", "12117", +#' "42101", "42103", "42105" +#' ), #' time_value = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"), -#' by = "day"), length.out = length(geo_value)), +#' by = "day" +#' ), length.out = length(geo_value)), #' value = 1:length(geo_value) + 0.01 * rnorm(length(geo_value)) -#' ) %>% +#' ) %>% #' tsibble::as_tsibble(index = time_value, key = c(geo_value, county_code)) -#' +#' #' # The `other_keys` metadata (`"county_code"` in this case) is automatically #' # inferred from the `tsibble`'s `key`: #' ex1 <- as_epi_df(x = ex1_input, geo_type = "state", time_type = "day", as_of = "2020-06-03") -#' attr(ex1,"metadata")[["other_keys"]] -#' -#' -#' +#' attr(ex1, "metadata")[["other_keys"]] +#' +#' +#' #' # Dealing with misspecified column names: #' # Geographical and temporal information must be provided in columns named #' # `geo_value` and `time_value`; if we start from a data frame with a #' # different format, it must be converted to use `geo_value` and `time_value` #' # before calling `as_epi_df`. -#' +#' #' ex2_input <- tibble::tibble( #' state = rep(c("ca", "fl", "pa"), each = 3), # misnamed #' pol = rep(c("blue", "swing", "swing"), each = 3), # extra key #' reported_date = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"), -#' by = "day"), length.out = length(state)), # misnamed +#' by = "day" +#' ), length.out = length(state)), # misnamed #' value = 1:length(state) + 0.01 * rnorm(length(state)) -#' ) -#' +#' ) +#' #' print(ex2_input) -#' -#' ex2 <- ex2_input %>% dplyr::rename(geo_value = state, time_value = reported_date) %>% -#' as_epi_df(geo_type = "state", as_of = "2020-06-03", -#' additional_metadata = list(other_keys = "pol")) -#' -#' attr(ex2,"metadata") -#' -#' -#' +#' +#' ex2 <- ex2_input %>% +#' dplyr::rename(geo_value = state, time_value = reported_date) %>% +#' as_epi_df( +#' geo_type = "state", as_of = "2020-06-03", +#' additional_metadata = list(other_keys = "pol") +#' ) +#' +#' attr(ex2, "metadata") +#' +#' +#' #' # Adding additional keys to an `epi_df` object -#' +#' #' ex3_input <- jhu_csse_county_level_subset %>% #' dplyr::filter(time_value > "2021-12-01", state_name == "Massachusetts") %>% -#' dplyr::slice_tail(n = 6) -#' -#' ex3 <- ex3_input %>% +#' dplyr::slice_tail(n = 6) +#' +#' ex3 <- ex3_input %>% #' tsibble::as_tsibble() %>% # needed to add the additional metadata #' # add 2 extra keys #' dplyr::mutate( -#' state = rep("MA",6), -#' pol = rep(c("blue", "swing", "swing"), each = 2)) %>% -#' # the 2 extra keys we added have to be specified in the other_keys +#' state = rep("MA", 6), +#' pol = rep(c("blue", "swing", "swing"), each = 2) +#' ) %>% +#' # the 2 extra keys we added have to be specified in the other_keys #' # component of additional_metadata. #' as_epi_df(additional_metadata = list(other_keys = c("state", "pol"))) -#' -#' attr(ex3,"metadata") -as_epi_df = function(x, ...) { +#' +#' attr(ex3, "metadata") +as_epi_df <- function(x, ...) { UseMethod("as_epi_df") } #' @method as_epi_df epi_df #' @describeIn as_epi_df Simply returns the `epi_df` object unchanged. 
#' @export -as_epi_df.epi_df = function(x, ...) { +as_epi_df.epi_df <- function(x, ...) { return(x) } @@ -289,8 +300,8 @@ as_epi_df.epi_df = function(x, ...) { #' be used. #' @importFrom rlang .data #' @export -as_epi_df.tbl_df = function(x, geo_type, time_type, as_of, - additional_metadata = list(), ...) { +as_epi_df.tbl_df <- function(x, geo_type, time_type, as_of, + additional_metadata = list(), ...) { # Check that we have geo_value and time_value columns if (!("geo_value" %in% names(x))) { Abort("`x` must contain a `geo_value` column.") @@ -298,18 +309,22 @@ as_epi_df.tbl_df = function(x, geo_type, time_type, as_of, if (!("time_value" %in% names(x))) { Abort("`x` must contain a `time_value` column.") } - - new_epi_df(x, geo_type, time_type, as_of, - additional_metadata, ...) + + new_epi_df( + x, geo_type, time_type, as_of, + additional_metadata, ... + ) } #' @method as_epi_df data.frame #' @describeIn as_epi_df Works analogously to `as_epi_df.tbl_df()`. #' @export -as_epi_df.data.frame = function(x, geo_type, time_type, as_of, - additional_metadata = list(), ...) { - as_epi_df.tbl_df(tibble::as_tibble(x), geo_type, time_type, as_of, - additional_metadata, ...) +as_epi_df.data.frame <- function(x, geo_type, time_type, as_of, + additional_metadata = list(), ...) { + as_epi_df.tbl_df( + tibble::as_tibble(x), geo_type, time_type, as_of, + additional_metadata, ... + ) } #' @method as_epi_df tbl_ts @@ -318,23 +333,26 @@ as_epi_df.data.frame = function(x, geo_type, time_type, as_of, #' "geo_value") are added to the metadata of the returned object, under the #' `other_keys` field. #' @export -as_epi_df.tbl_ts = function(x, geo_type, time_type, as_of, - additional_metadata = list(), ...) { - tsibble_other_keys = setdiff(tsibble::key_vars(x), "geo_value") +as_epi_df.tbl_ts <- function(x, geo_type, time_type, as_of, + additional_metadata = list(), ...) { + tsibble_other_keys <- setdiff(tsibble::key_vars(x), "geo_value") if (length(tsibble_other_keys) != 0) { - additional_metadata$other_keys = unique( - c(additional_metadata$other_keys, tsibble_other_keys)) + additional_metadata$other_keys <- unique( + c(additional_metadata$other_keys, tsibble_other_keys) + ) } - as_epi_df.tbl_df(tibble::as_tibble(x), geo_type, time_type, as_of, - additional_metadata, ...) + as_epi_df.tbl_df( + tibble::as_tibble(x), geo_type, time_type, as_of, + additional_metadata, ... + ) } #' Test for `epi_df` format #' #' @param x An object. #' @return `TRUE` if the object inherits from `epi_df`. 
-#' +#' #' @export -is_epi_df = function(x) { +is_epi_df <- function(x) { inherits(x, "epi_df") } From c65876078a6f9525952b305eaea2fca003adf907 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Wed, 17 Jan 2024 13:51:27 -0800 Subject: [PATCH 049/345] style: styler --- R/archive.R | 1094 +++++++++-------- R/correlation.R | 109 +- R/data.R | 102 +- R/epiprocess.R | 2 +- R/grouped_epi_archive.R | 500 ++++---- R/growth_rate.R | 199 +-- R/methods-epi_archive.R | 396 +++--- R/methods-epi_df.R | 104 +- R/outliers.R | 263 ++-- R/slide.R | 180 +-- R/utils.R | 356 +++--- tests/testthat/test-archive-version-bounds.R | 146 ++- tests/testthat/test-archive.R | 246 ++-- tests/testthat/test-compactify.R | 66 +- tests/testthat/test-correlation.R | 55 +- tests/testthat/test-data.R | 54 +- tests/testthat/test-deprecations.R | 19 +- tests/testthat/test-epi_slide.R | 482 +++++--- .../testthat/test-epix_fill_through_version.R | 107 +- tests/testthat/test-epix_merge.R | 176 +-- tests/testthat/test-epix_slide.R | 674 +++++----- tests/testthat/test-grouped_epi_archive.R | 136 +- tests/testthat/test-methods-epi_archive.R | 81 +- tests/testthat/test-methods-epi_df.R | 77 +- tests/testthat/test-utils.R | 241 ++-- vignettes/advanced.Rmd | 205 +-- vignettes/compactify.Rmd | 25 +- vignettes/growth_rate.Rmd | 116 +- 28 files changed, 3429 insertions(+), 2782 deletions(-) diff --git a/R/archive.R b/R/archive.R index 7a7d8d82..1908b77c 100644 --- a/R/archive.R +++ b/R/archive.R @@ -4,7 +4,7 @@ # want the special behavior via `.datatable.aware = TRUE` or by importing any # `data.table` package member. Do both to prevent surprises if we decide to use # `data.table::` everywhere and not importing things. -.datatable.aware = TRUE +.datatable.aware <- TRUE #' Validate a version bound arg #' @@ -22,16 +22,20 @@ #' @section Side effects: raises an error if version bound appears invalid #' #' @noRd -validate_version_bound = function(version_bound, x, na_ok, - version_bound_arg = rlang::caller_arg(version_bound), - x_arg = rlang::caller_arg(version_bound)) { +validate_version_bound <- function(version_bound, x, na_ok, + version_bound_arg = rlang::caller_arg(version_bound), + x_arg = rlang::caller_arg(version_bound)) { # We might want some (optional?) validation here to detect internal bugs. if (length(version_bound) != 1L) { # Check for length-1-ness fairly early so we don't have to worry as much # about our `if`s receiving non-length-1 "Boolean"s. 
- Abort(sprintf("`version_bound` must have length 1, but instead was length %d", - length(version_bound)), - class=sprintf("epiprocess__%s_is_not_length_1", version_bound_arg)) + Abort( + sprintf( + "`version_bound` must have length 1, but instead was length %d", + length(version_bound) + ), + class = sprintf("epiprocess__%s_is_not_length_1", version_bound_arg) + ) } else if (is.na(version_bound)) { # Check for NA before class&type, as any-class&type NA should be fine for # our purposes, and some version classes&types might not have their own NA @@ -41,20 +45,20 @@ validate_version_bound = function(version_bound, x, na_ok, return(invisible(NULL)) } else { Abort(sprintf( - '`%s` must not satisfy `is.na` (NAs are not allowed for this kind of version bound)', + "`%s` must not satisfy `is.na` (NAs are not allowed for this kind of version bound)", version_bound_arg - ), class=sprintf("epiprocess__%s_is_na", version_bound_arg)) + ), class = sprintf("epiprocess__%s_is_na", version_bound_arg)) } - } else if (!identical(class(version_bound), class(x[["version"]])) || - !identical(typeof(version_bound), typeof(x[["version"]]))) { + } else if (!identical(class(version_bound), class(x[["version"]])) || + !identical(typeof(version_bound), typeof(x[["version"]]))) { Abort(sprintf( - '`class(%1$s)` must be identical to `class(%2$s)` and `typeof(%1$s)` must be identical to `typeof(%2$s)`', + "`class(%1$s)` must be identical to `class(%2$s)` and `typeof(%1$s)` must be identical to `typeof(%2$s)`", version_bound_arg, # '{x_arg}[["version"]]' except adding parentheses if needed: rlang::expr_deparse(rlang::new_call( quote(`[[`), rlang::pairlist2(rlang::parse_expr(x_arg), "version") )) - ), class=sprintf("epiprocess__%s_has_invalid_class_or_typeof", version_bound_arg)) + ), class = sprintf("epiprocess__%s_has_invalid_class_or_typeof", version_bound_arg)) } else { # Looks like a valid version bound; exit without error. 
return(invisible(NULL)) @@ -71,15 +75,17 @@ validate_version_bound = function(version_bound, x, na_ok, #' an `NA` version value #' #' @export -max_version_with_row_in = function(x) { +max_version_with_row_in <- function(x) { if (nrow(x) == 0L) { Abort(sprintf("`nrow(x)==0L`, representing a data set history with no row up through the latest observed version, but we don't have a sensible guess at what version that is, or whether any of the empty versions might be clobbered in the future; if we use `x` to form an `epi_archive`, then `clobberable_versions_start` and `versions_end` must be manually specified."), - class="epiprocess__max_version_cannot_be_used") + class = "epiprocess__max_version_cannot_be_used" + ) } else { - version_col = purrr::pluck(x, "version") # error not NULL if doesn't exist + version_col <- purrr::pluck(x, "version") # error not NULL if doesn't exist if (anyNA(version_col)) { Abort("version values cannot be NA", - class="epiprocess__version_values_must_not_be_na") + class = "epiprocess__version_values_must_not_be_na" + ) } else { version_bound <- max(version_col) } @@ -92,13 +98,13 @@ max_version_with_row_in = function(x) { #' @return same class, typeof, and length as `x` #' #' @export -next_after = function(x) UseMethod("next_after") +next_after <- function(x) UseMethod("next_after") #' @export -next_after.integer = function(x) x + 1L +next_after.integer <- function(x) x + 1L #' @export -next_after.Date = function(x) x + 1L +next_after.Date <- function(x) x + 1L #' @title `epi_archive` object #' @@ -110,7 +116,7 @@ next_after.Date = function(x) x + 1L #' @details An `epi_archive` is an R6 class which contains a data table `DT`, of #' class `data.table` from the `data.table` package, with (at least) the #' following columns: -#' +#' #' * `geo_value`: the geographic value associated with each row of measurements. #' * `time_value`: the time value associated with each row of measurements. #' * `version`: the time value specifying the version for each row of @@ -122,12 +128,12 @@ next_after.Date = function(x) x + 1L #' The data table `DT` has key variables `geo_value`, `time_value`, `version`, #' as well as any others (these can be specified when instantiating the #' `epi_archive` object via the `other_keys` argument, and/or set by operating -#' on `DT` directly). Refer to the documentation for [as_epi_archive()] for +#' on `DT` directly). Refer to the documentation for [as_epi_archive()] for #' information and examples of relevant parameter names for an `epi_archive` object. #' Note that there can only be a single row per unique combination of #' key variables, and thus the key variables are critical for figuring out how #' to generate a snapshot of data from the archive, as of a given version. -#' +#' #' In general, the last version of each observation is carried forward (LOCF) to #' fill in data between recorded versions, and between the last recorded #' update and the `versions_end`. One consequence is that the `DT` @@ -153,7 +159,7 @@ next_after.Date = function(x) x + 1L #' make a clone using the `$clone` method, then overwrite the clone's `DT` #' field with `data.table::copy(clone$DT)`, and finally perform the #' modifications on the clone. 
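# A minimal sketch of the clone-then-copy pattern described above, assuming
# `archive` is an existing `epi_archive` whose `DT` has a `value` column:
archive2 <- archive$clone()                  # R6 clone; `DT` is still aliased
archive2$DT <- data.table::copy(archive2$DT) # deep-copy the data table first
archive2$DT[, value := value + 1]            # safe: `archive$DT` is unchanged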
-#' +#' #' @section Metadata: #' The following pieces of metadata are included as fields in an `epi_archive` #' object: @@ -191,478 +197,520 @@ next_after.Date = function(x) x + 1L #' @examples #' tib <- tibble::tibble( #' geo_value = rep(c("ca", "hi"), each = 5), -#' time_value = rep(seq(as.Date("2020-01-01"), -#' by = 1, length.out = 5), times = 2), -#' version = rep(seq(as.Date("2020-01-02"), -#' by = 1, length.out = 5), times = 2), +#' time_value = rep(seq(as.Date("2020-01-01"), +#' by = 1, length.out = 5 +#' ), times = 2), +#' version = rep(seq(as.Date("2020-01-02"), +#' by = 1, length.out = 5 +#' ), times = 2), #' value = rnorm(10, mean = 2, sd = 1) #' ) -#' -#' toy_epi_archive <- tib %>% epi_archive$new(geo_type = "state", -#' time_type = "day") -#' toy_epi_archive -epi_archive = +#' +#' toy_epi_archive <- tib %>% epi_archive$new( +#' geo_type = "state", +#' time_type = "day" +#' ) +#' toy_epi_archive +epi_archive <- R6::R6Class( - classname = "epi_archive", - ##### - public = list( - DT = NULL, - geo_type = NULL, - time_type = NULL, - additional_metadata = NULL, - clobberable_versions_start = NULL, - versions_end = NULL, -#' @description Creates a new `epi_archive` object. -#' @param x A data frame, data table, or tibble, with columns `geo_value`, -#' `time_value`, `version`, and then any additional number of columns. -#' @param geo_type Type for the geo values. If missing, then the function will -#' attempt to infer it from the geo values present; if this fails, then it -#' will be set to "custom". -#' @param time_type Type for the time values. If missing, then the function will -#' attempt to infer it from the time values present; if this fails, then it -#' will be set to "custom". -#' @param other_keys Character vector specifying the names of variables in `x` -#' that should be considered key variables (in the language of `data.table`) -#' apart from "geo_value", "time_value", and "version". -#' @param additional_metadata List of additional metadata to attach to the -#' `epi_archive` object. The metadata will have `geo_type` and `time_type` -#' fields; named entries from the passed list or will be included as well. -#' @param compactify Optional; Boolean or `NULL`: should we remove rows that are -#' considered redundant for the purposes of `epi_archive`'s built-in methods -#' such as `as_of`? As these methods use the last version of each observation -#' carried forward (LOCF) to interpolate between the version data provided, -#' rows that don't change these LOCF results can potentially be omitted to -#' save space while maintaining the same behavior (with the help of the -#' `clobberable_versions_start` and `versions_end` fields in some edge cases). -#' `TRUE` will remove these rows, `FALSE` will not, and missing or `NULL` will -#' remove these rows and issue a warning. Generally, this can be set to -#' `TRUE`, but if you directly inspect or edit the fields of the `epi_archive` -#' such as its `DT`, or rely on redundant updates to achieve a certain -#' behavior of the `ref_time_values` default in `epix_slide`, you will have to -#' determine whether `compactify=TRUE` will produce the desired results. If -#' compactification here is removing a large proportion of the rows, this may -#' indicate a potential for space, time, or bandwidth savings upstream the -#' data pipeline, e.g., by avoiding fetching, storing, or processing these -#' rows of `x`. 
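# A small sketch of the LOCF redundancy that `compactify` targets, assuming
# `epiprocess` and `tibble` are attached; the second update repeats the same
# value, so it adds no information for `as_of` queries:
updates <- tibble::tibble(
  geo_value = "ca",
  time_value = as.Date("2020-06-01"),
  version = as.Date(c("2020-06-02", "2020-06-03")),
  value = c(1, 1) # second row is redundant under LOCF
)
as_epi_archive(updates, compactify = TRUE) # keeps only the 2020-06-02 row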
-#' @param clobberable_versions_start Optional; as in [`as_epi_archive`] -#' @param versions_end Optional; as in [`as_epi_archive`] -#' @return An `epi_archive` object. -#' @importFrom data.table as.data.table key setkeyv -#' -#' @details -#' Refer to the documentation for [as_epi_archive()] for more information -#' and examples of parameter names. - initialize = function(x, geo_type, time_type, other_keys, - additional_metadata, compactify, - clobberable_versions_start, versions_end) { - # Check that we have a data frame - if (!is.data.frame(x)) { - Abort("`x` must be a data frame.") - } - - # Check that we have geo_value, time_value, version columns - if (!("geo_value" %in% names(x))) { - Abort("`x` must contain a `geo_value` column.") - } - if (!("time_value" %in% names(x))) { - Abort("`x` must contain a `time_value` column.") - } - if (!("version" %in% names(x))) { - Abort("`x` must contain a `version` column.") - } - if (anyNA(x$version)) { - Abort("`x$version` must not contain `NA`s", - class = "epiprocess__version_values_must_not_be_na") - } - - # If geo type is missing, then try to guess it - if (missing(geo_type)) { - geo_type = guess_geo_type(x$geo_value) - } + classname = "epi_archive", + ##### + public = list( + DT = NULL, + geo_type = NULL, + time_type = NULL, + additional_metadata = NULL, + clobberable_versions_start = NULL, + versions_end = NULL, + #' @description Creates a new `epi_archive` object. + #' @param x A data frame, data table, or tibble, with columns `geo_value`, + #' `time_value`, `version`, and then any additional number of columns. + #' @param geo_type Type for the geo values. If missing, then the function will + #' attempt to infer it from the geo values present; if this fails, then it + #' will be set to "custom". + #' @param time_type Type for the time values. If missing, then the function will + #' attempt to infer it from the time values present; if this fails, then it + #' will be set to "custom". + #' @param other_keys Character vector specifying the names of variables in `x` + #' that should be considered key variables (in the language of `data.table`) + #' apart from "geo_value", "time_value", and "version". + #' @param additional_metadata List of additional metadata to attach to the + #' `epi_archive` object. The metadata will have `geo_type` and `time_type` + #' fields; named entries from the passed list or will be included as well. + #' @param compactify Optional; Boolean or `NULL`: should we remove rows that are + #' considered redundant for the purposes of `epi_archive`'s built-in methods + #' such as `as_of`? As these methods use the last version of each observation + #' carried forward (LOCF) to interpolate between the version data provided, + #' rows that don't change these LOCF results can potentially be omitted to + #' save space while maintaining the same behavior (with the help of the + #' `clobberable_versions_start` and `versions_end` fields in some edge cases). + #' `TRUE` will remove these rows, `FALSE` will not, and missing or `NULL` will + #' remove these rows and issue a warning. Generally, this can be set to + #' `TRUE`, but if you directly inspect or edit the fields of the `epi_archive` + #' such as its `DT`, or rely on redundant updates to achieve a certain + #' behavior of the `ref_time_values` default in `epix_slide`, you will have to + #' determine whether `compactify=TRUE` will produce the desired results. 
If + #' compactification here is removing a large proportion of the rows, this may + #' indicate a potential for space, time, or bandwidth savings upstream the + #' data pipeline, e.g., by avoiding fetching, storing, or processing these + #' rows of `x`. + #' @param clobberable_versions_start Optional; as in [`as_epi_archive`] + #' @param versions_end Optional; as in [`as_epi_archive`] + #' @return An `epi_archive` object. + #' @importFrom data.table as.data.table key setkeyv + #' + #' @details + #' Refer to the documentation for [as_epi_archive()] for more information + #' and examples of parameter names. + initialize = function(x, geo_type, time_type, other_keys, + additional_metadata, compactify, + clobberable_versions_start, versions_end) { + # Check that we have a data frame + if (!is.data.frame(x)) { + Abort("`x` must be a data frame.") + } - # If time type is missing, then try to guess it - if (missing(time_type)) { - time_type = guess_time_type(x$time_value) - } - - # Finish off with small checks on keys variables and metadata - if (missing(other_keys)) other_keys = NULL - if (missing(additional_metadata)) additional_metadata = list() - if (!all(other_keys %in% names(x))) { - Abort("`other_keys` must be contained in the column names of `x`.") - } - if (any(c("geo_value", "time_value", "version") %in% other_keys)) { - Abort("`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\".") - } - if (any(names(additional_metadata) %in% - c("geo_type", "time_type"))) { - Warn("`additional_metadata` names overlap with existing metadata fields \"geo_type\", \"time_type\".") - } + # Check that we have geo_value, time_value, version columns + if (!("geo_value" %in% names(x))) { + Abort("`x` must contain a `geo_value` column.") + } + if (!("time_value" %in% names(x))) { + Abort("`x` must contain a `time_value` column.") + } + if (!("version" %in% names(x))) { + Abort("`x` must contain a `version` column.") + } + if (anyNA(x$version)) { + Abort("`x$version` must not contain `NA`s", + class = "epiprocess__version_values_must_not_be_na" + ) + } - # Conduct checks and apply defaults for `compactify` - if (missing(compactify)) { - compactify = NULL - } else if (!rlang::is_bool(compactify) && - !rlang::is_null(compactify)) { - Abort("compactify must be boolean or null.") - } + # If geo type is missing, then try to guess it + if (missing(geo_type)) { + geo_type <- guess_geo_type(x$geo_value) + } - # Apply defaults and conduct checks for - # `clobberable_versions_start`, `versions_end`: - if (missing(clobberable_versions_start)) { - clobberable_versions_start <- NA - } - if (missing(versions_end)) { - versions_end <- max_version_with_row_in(x) - } - validate_version_bound(clobberable_versions_start, x, na_ok=TRUE) - validate_version_bound(versions_end, x, na_ok=FALSE) - if (nrow(x) > 0L && versions_end < max(x[["version"]])) { - Abort(sprintf("`versions_end` was %s, but `x` contained + # If time type is missing, then try to guess it + if (missing(time_type)) { + time_type <- guess_time_type(x$time_value) + } + + # Finish off with small checks on keys variables and metadata + if (missing(other_keys)) other_keys <- NULL + if (missing(additional_metadata)) additional_metadata <- list() + if (!all(other_keys %in% names(x))) { + Abort("`other_keys` must be contained in the column names of `x`.") + } + if (any(c("geo_value", "time_value", "version") %in% other_keys)) { + Abort("`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\".") + } + if 
(any(names(additional_metadata) %in% + c("geo_type", "time_type"))) { + Warn("`additional_metadata` names overlap with existing metadata fields \"geo_type\", \"time_type\".") + } + + # Conduct checks and apply defaults for `compactify` + if (missing(compactify)) { + compactify <- NULL + } else if (!rlang::is_bool(compactify) && + !rlang::is_null(compactify)) { + Abort("compactify must be boolean or null.") + } + + # Apply defaults and conduct checks for + # `clobberable_versions_start`, `versions_end`: + if (missing(clobberable_versions_start)) { + clobberable_versions_start <- NA + } + if (missing(versions_end)) { + versions_end <- max_version_with_row_in(x) + } + validate_version_bound(clobberable_versions_start, x, na_ok = TRUE) + validate_version_bound(versions_end, x, na_ok = FALSE) + if (nrow(x) > 0L && versions_end < max(x[["version"]])) { + Abort( + sprintf( + "`versions_end` was %s, but `x` contained updates for a later version or versions, up through %s", - versions_end, max(x[["version"]])), - class="epiprocess__versions_end_earlier_than_updates") - } - if (!is.na(clobberable_versions_start) && clobberable_versions_start > versions_end) { - Abort(sprintf("`versions_end` was %s, but a `clobberable_versions_start` + versions_end, max(x[["version"]]) + ), + class = "epiprocess__versions_end_earlier_than_updates" + ) + } + if (!is.na(clobberable_versions_start) && clobberable_versions_start > versions_end) { + Abort( + sprintf( + "`versions_end` was %s, but a `clobberable_versions_start` of %s indicated that there were later observed versions", - versions_end, clobberable_versions_start), - class="epiprocess__versions_end_earlier_than_clobberable_versions_start") - } + versions_end, clobberable_versions_start + ), + class = "epiprocess__versions_end_earlier_than_clobberable_versions_start" + ) + } - # --- End of validation and replacing missing args with defaults --- + # --- End of validation and replacing missing args with defaults --- - # Create the data table; if x was an un-keyed data.table itself, - # then the call to as.data.table() will fail to set keys, so we - # need to check this, then do it manually if needed - key_vars = c("geo_value", "time_value", other_keys, "version") - DT = as.data.table(x, key = key_vars) - if (!identical(key_vars, key(DT))) setkeyv(DT, cols = key_vars) + # Create the data table; if x was an un-keyed data.table itself, + # then the call to as.data.table() will fail to set keys, so we + # need to check this, then do it manually if needed + key_vars <- c("geo_value", "time_value", other_keys, "version") + DT <- as.data.table(x, key = key_vars) + if (!identical(key_vars, key(DT))) setkeyv(DT, cols = key_vars) - maybe_first_duplicate_key_row_index = anyDuplicated(DT, by=key(DT)) - if (maybe_first_duplicate_key_row_index != 0L) { - Abort("`x` must have one row per unique combination of the key variables. If you have additional key variables other than `geo_value`, `time_value`, and `version`, such as an age group column, please specify them in `other_keys`. 
Otherwise, check for duplicate rows and/or conflicting values for the same measurement.", - class = "epiprocess__epi_archive_requires_unique_key") - } - - # Checks to see if a value in a vector is LOCF - is_locf <- function(vec) { - dplyr::if_else(!is.na(vec) & !is.na(dplyr::lag(vec)), - vec == dplyr::lag(vec), - is.na(vec) & is.na(dplyr::lag(vec))) - } - - # LOCF is defined by a row where all values except for the version - # differ from their respective lag values - - # Checks for LOCF's in a data frame - rm_locf <- function(df) { - dplyr::filter(df,if_any(c(everything(),-version),~ !is_locf(.))) - } - - # Keeps LOCF values, such as to be printed - keep_locf <- function(df) { - dplyr::filter(df,if_all(c(everything(),-version),~ is_locf(.))) - } - - # Runs compactify on data frame - if (is.null(compactify) || compactify == TRUE) { - elim = keep_locf(DT) - DT = rm_locf(DT) - } else { - # Create empty data frame for nrow(elim) to be 0 - elim = tibble::tibble() - } - - # Warns about redundant rows - if (is.null(compactify) && nrow(elim) > 0) { - warning_intro <- break_str(paste( - 'Found rows that appear redundant based on', - 'last (version of each) observation carried forward;', - 'these rows have been removed to "compactify" and save space:' - )) - - warning_data <- paste(collapse="\n", capture.output(print(elim, topn=3L, nrows=7L))) - - warning_outro <- break_str(paste( - "Built-in `epi_archive` functionality should be unaffected,", - "but results may change if you work directly with its fields (such as `DT`).", - "See `?as_epi_archive` for details.", - "To silence this warning but keep compactification,", - "you can pass `compactify=TRUE` when constructing the archive." - )) - - warning_message <- paste(sep="\n", warning_intro, warning_data, warning_outro) - - rlang::warn(warning_message, class="epiprocess__compactify_default_removed_rows") - } - - # Instantiate all self variables - self$DT = DT - self$geo_type = geo_type - self$time_type = time_type - self$additional_metadata = additional_metadata - self$clobberable_versions_start = clobberable_versions_start - self$versions_end = versions_end - }, - print = function(class = TRUE, methods = TRUE) { - if (class) cat("An `epi_archive` object, with metadata:\n") - cat(sprintf("* %-9s = %s\n", "geo_type", self$geo_type)) - cat(sprintf("* %-9s = %s\n", "time_type", self$time_type)) - if (!is.null(self$additional_metadata)) { - sapply(self$additional_metadata, function(m) { - cat(sprintf("* %-9s = %s\n", names(m), m)) - }) - } - cat("----------\n") - if (length(self$DT$time_value) == 0 || all(is.na(self$DT$time_value))) { - min_time = max_time = NA - } else { - min_time = Min(self$DT$time_value) - max_time = Max(self$DT$time_value) - } - cat(sprintf("* %-14s = %s\n", "min time value", min_time)) - cat(sprintf("* %-14s = %s\n", "max time value", max_time)) - cat(sprintf("* %-14s = %s\n", "first version with update", - min(self$DT$version))) - cat(sprintf("* %-14s = %s\n", "last version with update", - max(self$DT$version))) - if (is.na(self$clobberable_versions_start)) { - cat("* No clobberable versions\n") - } else { - cat(sprintf("* %-14s = %s\n", "clobberable versions start", - self$clobberable_versions_start)) - } - cat(sprintf("* %-14s = %s\n", "versions end", - self$versions_end)) - cat("----------\n") - cat(sprintf("Data archive (stored in DT field): %i x %i\n", - nrow(self$DT), ncol(self$DT))) - cat(sprintf("Columns in DT: %s\n", paste(ifelse(length( - colnames(self$DT)) <= 4, paste(colnames(self$DT), collapse = ", "), - 
paste(paste(colnames(self$DT)[1:4], collapse = ", "), "and", - length(colnames(self$DT)[5:length(colnames(self$DT))]), "more columns"))))) - if (methods) { - cat("----------\n") - writeLines(wrap_varnames(initial = "Public R6 methods: ", - names(epi_archive$public_methods))) - } - }, - ##### -#' @description Generates a snapshot in `epi_df` format as of a given version. -#' See the documentation for the wrapper function [`epix_as_of()`] for details. -#' @importFrom data.table between key - as_of = function(max_version, min_time_value = -Inf, all_versions = FALSE) { - # Self max version and other keys - other_keys = setdiff(key(self$DT), - c("geo_value", "time_value", "version")) - if (length(other_keys) == 0) other_keys = NULL - - # Check a few things on max_version - if (!identical(class(max_version), class(self$DT$version)) || - !identical(typeof(max_version), typeof(self$DT$version))) { - Abort("`max_version` and `DT$version` must have same `class` and `typeof`.") - } - if (length(max_version) != 1) { - Abort("`max_version` cannot be a vector.") - } - if (is.na(max_version)) { - Abort("`max_version` must not be NA.") - } - if (max_version > self$versions_end) { - Abort("`max_version` must be at most `self$versions_end`.") - } - if (!rlang::is_bool(all_versions)) { - Abort("`all_versions` must be TRUE or FALSE.") - } - if (!is.na(self$clobberable_versions_start) && max_version >= self$clobberable_versions_start) { - Warn('Getting data as of some recent version which could still be overwritten (under routine circumstances) without assigning a new version number (a.k.a. "clobbered"). Thus, the snapshot that we produce here should not be expected to be reproducible later. See `?epi_archive` for more info and `?epix_as_of` on how to muffle.', - class="epiprocess__snapshot_as_of_clobberable_version") - } - - # Filter by version and return - if (all_versions) { - result = epix_truncate_versions_after(self, max_version) - # `self` has already been `clone`d in `epix_truncate_versions_after` - # so we can modify the new archive's DT directly. - result$DT = result$DT[time_value >= min_time_value, ] - return(result) - } + maybe_first_duplicate_key_row_index <- anyDuplicated(DT, by = key(DT)) + if (maybe_first_duplicate_key_row_index != 0L) { + Abort("`x` must have one row per unique combination of the key variables. If you have additional key variables other than `geo_value`, `time_value`, and `version`, such as an age group column, please specify them in `other_keys`. Otherwise, check for duplicate rows and/or conflicting values for the same measurement.", + class = "epiprocess__epi_archive_requires_unique_key" + ) + } - return( - # Make sure to use data.table ways of filtering and selecting - self$DT[time_value >= min_time_value & - version <= max_version, ] %>% - unique(by = c("geo_value", "time_value", other_keys), - fromLast = TRUE) %>% - tibble::as_tibble() %>% - # (`as_tibble` should de-alias the DT and its columns in any edge - # cases where they are aliased. We don't say we guarantee this - # though.) - dplyr::select(-"version") %>% - as_epi_df(geo_type = self$geo_type, - time_type = self$time_type, - as_of = max_version, - additional_metadata = c(self$additional_metadata, - other_keys = other_keys)) - ) - }, - ##### -#' @description Fill in unobserved history using requested scheme by mutating -#' `self` and potentially reseating its fields. 
See -#' [`epix_fill_through_version`] for a full description of the non-R6-method -#' version, which doesn't mutate the input archive but might alias its fields. -#' -#' @param fill_versions_end as in [`epix_fill_through_version`] -#' @param how as in [`epix_fill_through_version`] -#' -#' @importFrom data.table key setkeyv := address copy -#' @importFrom rlang arg_match - fill_through_version = function(fill_versions_end, - how=c("na", "locf")) { - validate_version_bound(fill_versions_end, self$DT, na_ok=FALSE) - how <- arg_match(how) - if (self$versions_end < fill_versions_end) { - new_DT = switch( - how, - "na" = { - # old DT + a version consisting of all NA observations - # immediately after the last currently/actually-observed - # version. Note that this NA-observation version must only be - # added if `self` is outdated. - nonversion_key_cols = setdiff(key(self$DT), "version") - nonkey_cols = setdiff(names(self$DT), key(self$DT)) - next_version_tag = next_after(self$versions_end) - if (next_version_tag > fill_versions_end) { - Abort(sprintf(paste( - "Apparent problem with `next_after` method:", - "archive contained observations through version %s", - "and the next possible version was supposed to be %s,", - "but this appeared to jump from a version < %3$s", - "to one > %3$s, implying at least one version in between." - ), self$versions_end, next_version_tag, fill_versions_end)) - } - nonversion_key_vals_ever_recorded = unique(self$DT, by=nonversion_key_cols) - # In edge cases, the `unique` result can alias the original - # DT; detect and copy if necessary: - if (identical(address(self$DT), address(nonversion_key_vals_ever_recorded))) { - nonversion_key_vals_ever_recorded <- copy(nonversion_key_vals_ever_recorded) - } - next_version_DT = nonversion_key_vals_ever_recorded[ - , version := next_version_tag][ - # this makes the class of these columns logical (`NA` is a - # logical NA; we're relying on the rbind below to convert to - # the proper class&typeof) - , (nonkey_cols) := NA] - # full result DT: - setkeyv(rbind(self$DT, next_version_DT), key(self$DT))[] - }, - "locf" = { - # just the old DT; LOCF is built into other methods: - self$DT - } - ) - new_versions_end = fill_versions_end - # Update `self` all at once with simple, error-free operations + - # return below: - self$DT <- new_DT - self$versions_end <- new_versions_end - } else { - # Already sufficiently up to date; nothing to do. - } - return (invisible(self)) - }, - ##### -#' @description Filter to keep only older versions, mutating the archive by -#' potentially reseating but not mutating some fields. `DT` is likely, but not -#' guaranteed, to be copied. Returns the mutated archive -#' [invisibly][base::invisible]. 
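# A brief sketch of the truncation behavior described above, assuming
# `archive` is an `epi_archive` with `Date` versions: drop every update
# recorded after 2020-06-01 and roll `versions_end` back to that date.
archive$truncate_versions_after(as.Date("2020-06-01"))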
-#' @param x as in [`epix_truncate_versions_after`] -#' @param max_version as in [`epix_truncate_versions_after`] - truncate_versions_after = function(max_version) { - if (length(max_version) != 1) { - Abort("`max_version` cannot be a vector.") - } - if (is.na(max_version)) { - Abort("`max_version` must not be NA.") - } - if (!identical(class(max_version), class(self$DT$version)) || - !identical(typeof(max_version), typeof(self$DT$version))) { - Abort("`max_version` and `DT$version` must have same `class` and `typeof`.") - } - if (max_version > self$versions_end) { - Abort("`max_version` must be at most `self$versions_end`.") - } - self$DT <- self$DT[self$DT$version <= max_version, colnames(self$DT), with=FALSE] - # (^ this filter operation seems to always copy the DT, even if it - # keeps every entry; we don't guarantee this behavior in - # documentation, though, so we could change to alias in this case) - if (!is.na(self$clobberable_versions_start) && - self$clobberable_versions_start > max_version) { - self$clobberable_versions_start <- NA - } - self$versions_end <- max_version - return (invisible(self)) - }, - ##### -#' @description Merges another `epi_archive` with the current one, mutating the -#' current one by reseating its `DT` and several other fields, but avoiding -#' mutation of the old `DT`; returns the current archive -#' [invisibly][base::invisible]. See [`epix_merge`] for a full description -#' of the non-R6-method version, which does not mutate either archive, and -#' does not alias either archive's `DT`. -#' @param y as in [`epix_merge`] -#' @param sync as in [`epix_merge`] -#' @param compactify as in [`epix_merge`] - merge = function(y, sync = c("forbid","na","locf","truncate"), compactify = TRUE) { - result = epix_merge(self, y, - sync = sync, - compactify = compactify) - - if (length(epi_archive$private_fields) != 0L) { - Abort("expected no private fields in epi_archive", - internal=TRUE) - } + # Checks to see if a value in a vector is LOCF + is_locf <- function(vec) { + dplyr::if_else(!is.na(vec) & !is.na(dplyr::lag(vec)), + vec == dplyr::lag(vec), + is.na(vec) & is.na(dplyr::lag(vec)) + ) + } - # Mutate fields all at once, trying to avoid any potential errors: - for (field_name in names(epi_archive$public_fields)) { - self[[field_name]] <- result[[field_name]] - } + # LOCF is defined by a row where all values except for the version + # differ from their respective lag values + + # Checks for LOCF's in a data frame + rm_locf <- function(df) { + dplyr::filter(df, if_any(c(everything(), -version), ~ !is_locf(.))) + } + + # Keeps LOCF values, such as to be printed + keep_locf <- function(df) { + dplyr::filter(df, if_all(c(everything(), -version), ~ is_locf(.))) + } + + # Runs compactify on data frame + if (is.null(compactify) || compactify == TRUE) { + elim <- keep_locf(DT) + DT <- rm_locf(DT) + } else { + # Create empty data frame for nrow(elim) to be 0 + elim <- tibble::tibble() + } - return (invisible(self)) - }, - ##### - group_by = function(..., .add = FALSE, .drop = dplyr::group_by_drop_default(self)) { - group_by.epi_archive(self, ..., .add=.add, .drop=.drop) - }, -#' @description Slides a given function over variables in an `epi_archive` -#' object. See the documentation for the wrapper function [`epix_slide()`] for -#' details. -#' @importFrom data.table key -#' @importFrom rlang !! !!! 
enquo quo_is_missing enquos is_quosure sym syms - slide = function(f, ..., before, ref_time_values, - time_step, new_col_name = "slide_value", - as_list_col = FALSE, names_sep = "_", - all_versions = FALSE) { - # For an "ungrouped" slide, treat all rows as belonging to one big - # group (group by 0 vars), like `dplyr::summarize`, and let the - # resulting `grouped_epi_archive` handle the slide: - self$group_by()$slide( - f, ..., - before = before, ref_time_values = ref_time_values, - time_step = time_step, new_col_name = new_col_name, - as_list_col = as_list_col, names_sep = names_sep, - all_versions = all_versions + # Warns about redundant rows + if (is.null(compactify) && nrow(elim) > 0) { + warning_intro <- break_str(paste( + "Found rows that appear redundant based on", + "last (version of each) observation carried forward;", + 'these rows have been removed to "compactify" and save space:' + )) + + warning_data <- paste(collapse = "\n", capture.output(print(elim, topn = 3L, nrows = 7L))) + + warning_outro <- break_str(paste( + "Built-in `epi_archive` functionality should be unaffected,", + "but results may change if you work directly with its fields (such as `DT`).", + "See `?as_epi_archive` for details.", + "To silence this warning but keep compactification,", + "you can pass `compactify=TRUE` when constructing the archive." + )) + + warning_message <- paste(sep = "\n", warning_intro, warning_data, warning_outro) + + rlang::warn(warning_message, class = "epiprocess__compactify_default_removed_rows") + } + + # Instantiate all self variables + self$DT <- DT + self$geo_type <- geo_type + self$time_type <- time_type + self$additional_metadata <- additional_metadata + self$clobberable_versions_start <- clobberable_versions_start + self$versions_end <- versions_end + }, + print = function(class = TRUE, methods = TRUE) { + if (class) cat("An `epi_archive` object, with metadata:\n") + cat(sprintf("* %-9s = %s\n", "geo_type", self$geo_type)) + cat(sprintf("* %-9s = %s\n", "time_type", self$time_type)) + if (!is.null(self$additional_metadata)) { + sapply(self$additional_metadata, function(m) { + cat(sprintf("* %-9s = %s\n", names(m), m)) + }) + } + cat("----------\n") + if (length(self$DT$time_value) == 0 || all(is.na(self$DT$time_value))) { + min_time <- max_time <- NA + } else { + min_time <- Min(self$DT$time_value) + max_time <- Max(self$DT$time_value) + } + cat(sprintf("* %-14s = %s\n", "min time value", min_time)) + cat(sprintf("* %-14s = %s\n", "max time value", max_time)) + cat(sprintf( + "* %-14s = %s\n", "first version with update", + min(self$DT$version) + )) + cat(sprintf( + "* %-14s = %s\n", "last version with update", + max(self$DT$version) + )) + if (is.na(self$clobberable_versions_start)) { + cat("* No clobberable versions\n") + } else { + cat(sprintf( + "* %-14s = %s\n", "clobberable versions start", + self$clobberable_versions_start + )) + } + cat(sprintf( + "* %-14s = %s\n", "versions end", + self$versions_end + )) + cat("----------\n") + cat(sprintf( + "Data archive (stored in DT field): %i x %i\n", + nrow(self$DT), ncol(self$DT) + )) + cat(sprintf("Columns in DT: %s\n", paste(ifelse(length( + colnames(self$DT) + ) <= 4, paste(colnames(self$DT), collapse = ", "), + paste( + paste(colnames(self$DT)[1:4], collapse = ", "), "and", + length(colnames(self$DT)[5:length(colnames(self$DT))]), "more columns" + ) + )))) + if (methods) { + cat("----------\n") + writeLines(wrap_varnames( + initial = "Public R6 methods: ", + names(epi_archive$public_methods) + )) + } + }, + ##### + #' 
@description Generates a snapshot in `epi_df` format as of a given version. + #' See the documentation for the wrapper function [`epix_as_of()`] for details. + #' @importFrom data.table between key + as_of = function(max_version, min_time_value = -Inf, all_versions = FALSE) { + # Self max version and other keys + other_keys <- setdiff( + key(self$DT), + c("geo_value", "time_value", "version") + ) + if (length(other_keys) == 0) other_keys <- NULL + + # Check a few things on max_version + if (!identical(class(max_version), class(self$DT$version)) || + !identical(typeof(max_version), typeof(self$DT$version))) { + Abort("`max_version` and `DT$version` must have same `class` and `typeof`.") + } + if (length(max_version) != 1) { + Abort("`max_version` cannot be a vector.") + } + if (is.na(max_version)) { + Abort("`max_version` must not be NA.") + } + if (max_version > self$versions_end) { + Abort("`max_version` must be at most `self$versions_end`.") + } + if (!rlang::is_bool(all_versions)) { + Abort("`all_versions` must be TRUE or FALSE.") + } + if (!is.na(self$clobberable_versions_start) && max_version >= self$clobberable_versions_start) { + Warn('Getting data as of some recent version which could still be overwritten (under routine circumstances) without assigning a new version number (a.k.a. "clobbered"). Thus, the snapshot that we produce here should not be expected to be reproducible later. See `?epi_archive` for more info and `?epix_as_of` on how to muffle.', + class = "epiprocess__snapshot_as_of_clobberable_version" + ) + } + + # Filter by version and return + if (all_versions) { + result <- epix_truncate_versions_after(self, max_version) + # `self` has already been `clone`d in `epix_truncate_versions_after` + # so we can modify the new archive's DT directly. + result$DT <- result$DT[time_value >= min_time_value, ] + return(result) + } + + return( + # Make sure to use data.table ways of filtering and selecting + self$DT[time_value >= min_time_value & + version <= max_version, ] %>% + unique( + by = c("geo_value", "time_value", other_keys), + fromLast = TRUE ) %>% - # We want a slide on ungrouped archives to output something - # ungrouped, rather than retaining the trivial (0-variable) - # grouping applied above. So we `ungroup()`. However, the current - # `dplyr` implementation automatically ignores/drops trivial - # groupings, so this is just a no-op for now. - ungroup() - } + tibble::as_tibble() %>% + # (`as_tibble` should de-alias the DT and its columns in any edge + # cases where they are aliased. We don't say we guarantee this + # though.) + dplyr::select(-"version") %>% + as_epi_df( + geo_type = self$geo_type, + time_type = self$time_type, + as_of = max_version, + additional_metadata = c(self$additional_metadata, + other_keys = other_keys + ) + ) ) - ) + }, + ##### + #' @description Fill in unobserved history using requested scheme by mutating + #' `self` and potentially reseating its fields. See + #' [`epix_fill_through_version`] for a full description of the non-R6-method + #' version, which doesn't mutate the input archive but might alias its fields. 
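A quick illustration of the snapshot semantics above, using the package's example archive (the version date is arbitrary):

library(epiprocess)

# `epi_df` snapshot of what was known as of 2021-06-01; its `as_of`
# metadata is set to the requested version:
snap <- epix_as_of(archive_cases_dv_subset, max_version = as.Date("2021-06-01"))
attr(snap, "metadata")$as_of

# With `all_versions = TRUE`, we instead get an `epi_archive` truncated to
# versions at or before the requested one:
mini <- epix_as_of(archive_cases_dv_subset,
  max_version = as.Date("2021-06-01"), all_versions = TRUE
)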
+ #' + #' @param fill_versions_end as in [`epix_fill_through_version`] + #' @param how as in [`epix_fill_through_version`] + #' + #' @importFrom data.table key setkeyv := address copy + #' @importFrom rlang arg_match + fill_through_version = function(fill_versions_end, + how = c("na", "locf")) { + validate_version_bound(fill_versions_end, self$DT, na_ok = FALSE) + how <- arg_match(how) + if (self$versions_end < fill_versions_end) { + new_DT <- switch(how, + "na" = { + # old DT + a version consisting of all NA observations + # immediately after the last currently/actually-observed + # version. Note that this NA-observation version must only be + # added if `self` is outdated. + nonversion_key_cols <- setdiff(key(self$DT), "version") + nonkey_cols <- setdiff(names(self$DT), key(self$DT)) + next_version_tag <- next_after(self$versions_end) + if (next_version_tag > fill_versions_end) { + Abort(sprintf(paste( + "Apparent problem with `next_after` method:", + "archive contained observations through version %s", + "and the next possible version was supposed to be %s,", + "but this appeared to jump from a version < %3$s", + "to one > %3$s, implying at least one version in between." + ), self$versions_end, next_version_tag, fill_versions_end)) + } + nonversion_key_vals_ever_recorded <- unique(self$DT, by = nonversion_key_cols) + # In edge cases, the `unique` result can alias the original + # DT; detect and copy if necessary: + if (identical(address(self$DT), address(nonversion_key_vals_ever_recorded))) { + nonversion_key_vals_ever_recorded <- copy(nonversion_key_vals_ever_recorded) + } + next_version_DT <- nonversion_key_vals_ever_recorded[ + , version := next_version_tag + ][ + # this makes the class of these columns logical (`NA` is a + # logical NA; we're relying on the rbind below to convert to + # the proper class&typeof) + , (nonkey_cols) := NA + ] + # full result DT: + setkeyv(rbind(self$DT, next_version_DT), key(self$DT))[] + }, + "locf" = { + # just the old DT; LOCF is built into other methods: + self$DT + } + ) + new_versions_end <- fill_versions_end + # Update `self` all at once with simple, error-free operations + + # return below: + self$DT <- new_DT + self$versions_end <- new_versions_end + } else { + # Already sufficiently up to date; nothing to do. + } + return(invisible(self)) + }, + ##### + #' @description Filter to keep only older versions, mutating the archive by + #' potentially reseating but not mutating some fields. `DT` is likely, but not + #' guaranteed, to be copied. Returns the mutated archive + #' [invisibly][base::invisible]. 
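The truncation behavior documented here can be sketched with the non-mutating wrapper function (the cutoff version is again arbitrary):

library(epiprocess)

# Drop all updates published after 2021-06-01; `versions_end` becomes that date:
rewound <- epix_truncate_versions_after(
  archive_cases_dv_subset,
  max_version = as.Date("2021-06-01")
)
rewound$versions_end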
+ #' @param x as in [`epix_truncate_versions_after`] + #' @param max_version as in [`epix_truncate_versions_after`] + truncate_versions_after = function(max_version) { + if (length(max_version) != 1) { + Abort("`max_version` cannot be a vector.") + } + if (is.na(max_version)) { + Abort("`max_version` must not be NA.") + } + if (!identical(class(max_version), class(self$DT$version)) || + !identical(typeof(max_version), typeof(self$DT$version))) { + Abort("`max_version` and `DT$version` must have same `class` and `typeof`.") + } + if (max_version > self$versions_end) { + Abort("`max_version` must be at most `self$versions_end`.") + } + self$DT <- self$DT[self$DT$version <= max_version, colnames(self$DT), with = FALSE] + # (^ this filter operation seems to always copy the DT, even if it + # keeps every entry; we don't guarantee this behavior in + # documentation, though, so we could change to alias in this case) + if (!is.na(self$clobberable_versions_start) && + self$clobberable_versions_start > max_version) { + self$clobberable_versions_start <- NA + } + self$versions_end <- max_version + return(invisible(self)) + }, + ##### + #' @description Merges another `epi_archive` with the current one, mutating the + #' current one by reseating its `DT` and several other fields, but avoiding + #' mutation of the old `DT`; returns the current archive + #' [invisibly][base::invisible]. See [`epix_merge`] for a full description + #' of the non-R6-method version, which does not mutate either archive, and + #' does not alias either archive's `DT`. + #' @param y as in [`epix_merge`] + #' @param sync as in [`epix_merge`] + #' @param compactify as in [`epix_merge`] + merge = function(y, sync = c("forbid", "na", "locf", "truncate"), compactify = TRUE) { + result <- epix_merge(self, y, + sync = sync, + compactify = compactify + ) + + if (length(epi_archive$private_fields) != 0L) { + Abort("expected no private fields in epi_archive", + internal = TRUE + ) + } + + # Mutate fields all at once, trying to avoid any potential errors: + for (field_name in names(epi_archive$public_fields)) { + self[[field_name]] <- result[[field_name]] + } + + return(invisible(self)) + }, + ##### + group_by = function(..., .add = FALSE, .drop = dplyr::group_by_drop_default(self)) { + group_by.epi_archive(self, ..., .add = .add, .drop = .drop) + }, + #' @description Slides a given function over variables in an `epi_archive` + #' object. See the documentation for the wrapper function [`epix_slide()`] for + #' details. + #' @importFrom data.table key + #' @importFrom rlang !! !!! enquo quo_is_missing enquos is_quosure sym syms + slide = function(f, ..., before, ref_time_values, + time_step, new_col_name = "slide_value", + as_list_col = FALSE, names_sep = "_", + all_versions = FALSE) { + # For an "ungrouped" slide, treat all rows as belonging to one big + # group (group by 0 vars), like `dplyr::summarize`, and let the + # resulting `grouped_epi_archive` handle the slide: + self$group_by()$slide( + f, ..., + before = before, ref_time_values = ref_time_values, + time_step = time_step, new_col_name = new_col_name, + as_list_col = as_list_col, names_sep = names_sep, + all_versions = all_versions + ) %>% + # We want a slide on ungrouped archives to output something + # ungrouped, rather than retaining the trivial (0-variable) + # grouping applied above. So we `ungroup()`. However, the current + # `dplyr` implementation automatically ignores/drops trivial + # groupings, so this is just a no-op for now. 
+ ungroup() + } + ) + ) #' Convert to `epi_archive` format #' @@ -739,41 +787,55 @@ epi_archive = #' # Simple ex. with necessary keys #' tib <- tibble::tibble( #' geo_value = rep(c("ca", "hi"), each = 5), -#' time_value = rep(seq(as.Date("2020-01-01"), -#' by = 1, length.out = 5), times = 2), -#' version = rep(seq(as.Date("2020-01-02"), -#' by = 1, length.out = 5), times = 2), +#' time_value = rep(seq(as.Date("2020-01-01"), +#' by = 1, length.out = 5 +#' ), times = 2), +#' version = rep(seq(as.Date("2020-01-02"), +#' by = 1, length.out = 5 +#' ), times = 2), #' value = rnorm(10, mean = 2, sd = 1) #' ) -#' -#' toy_epi_archive <- tib %>% as_epi_archive(geo_type = "state", -#' time_type = "day") -#' toy_epi_archive -#' +#' +#' toy_epi_archive <- tib %>% as_epi_archive( +#' geo_type = "state", +#' time_type = "day" +#' ) +#' toy_epi_archive +#' #' # Ex. with an additional key for county -#' df <- data.frame (geo_value = c(replicate(2, "ca"), replicate(2, "fl")), -#' county = c(1, 3, 2, 5), -#' time_value = c("2020-06-01", -#' "2020-06-02", -#' "2020-06-01", -#' "2020-06-02"), -#' version = c("2020-06-02", -#' "2020-06-03", -#' "2020-06-02", -#' "2020-06-03"), -#' cases = c(1, 2, 3, 4), -#' cases_rate = c(0.01, 0.02, 0.01, 0.05)) -#' -#' x <- df %>% as_epi_archive(geo_type = "state", -#' time_type = "day", -#' other_keys = "county") -as_epi_archive = function(x, geo_type, time_type, other_keys, - additional_metadata = list(), - compactify = NULL, - clobberable_versions_start = NA, - versions_end = max_version_with_row_in(x)) { - epi_archive$new(x, geo_type, time_type, other_keys, additional_metadata, - compactify, clobberable_versions_start, versions_end) +#' df <- data.frame( +#' geo_value = c(replicate(2, "ca"), replicate(2, "fl")), +#' county = c(1, 3, 2, 5), +#' time_value = c( +#' "2020-06-01", +#' "2020-06-02", +#' "2020-06-01", +#' "2020-06-02" +#' ), +#' version = c( +#' "2020-06-02", +#' "2020-06-03", +#' "2020-06-02", +#' "2020-06-03" +#' ), +#' cases = c(1, 2, 3, 4), +#' cases_rate = c(0.01, 0.02, 0.01, 0.05) +#' ) +#' +#' x <- df %>% as_epi_archive( +#' geo_type = "state", +#' time_type = "day", +#' other_keys = "county" +#' ) +as_epi_archive <- function(x, geo_type, time_type, other_keys, + additional_metadata = list(), + compactify = NULL, + clobberable_versions_start = NA, + versions_end = max_version_with_row_in(x)) { + epi_archive$new( + x, geo_type, time_type, other_keys, additional_metadata, + compactify, clobberable_versions_start, versions_end + ) } #' Test for `epi_archive` format @@ -782,7 +844,7 @@ as_epi_archive = function(x, geo_type, time_type, other_keys, #' @param grouped_okay Optional; Boolean; should a `grouped_epi_archive` also #' count? Default is `FALSE`. #' @return `TRUE` if the object inherits from `epi_archive`. -#' +#' #' @export #' @examples #' is_epi_archive(jhu_csse_daily_subset) # FALSE (this is an epi_df, not epi_archive) @@ -791,11 +853,11 @@ as_epi_archive = function(x, geo_type, time_type, other_keys, #' # By default, grouped_epi_archives don't count as epi_archives, as they may #' # support a different set of operations from regular `epi_archives`. This #' # behavior can be controlled by `grouped_okay`. 
-#' grouped_archive = archive_cases_dv_subset$group_by(geo_value)
+#' grouped_archive <- archive_cases_dv_subset$group_by(geo_value)
 #' is_epi_archive(grouped_archive) # FALSE
-#' is_epi_archive(grouped_archive, grouped_okay=TRUE) # TRUE
+#' is_epi_archive(grouped_archive, grouped_okay = TRUE) # TRUE
 #'
 #' @seealso [`is_grouped_epi_archive`]
-is_epi_archive = function(x, grouped_okay=FALSE) {
+is_epi_archive <- function(x, grouped_okay = FALSE) {
   inherits(x, "epi_archive") || grouped_okay && inherits(x, "grouped_epi_archive")
 }
diff --git a/R/correlation.R b/R/correlation.R
index 62d024bd..a4a56d1e 100644
--- a/R/correlation.R
+++ b/R/correlation.R
@@ -4,11 +4,11 @@
 #' grouping by geo value, time value, or any other variables. See the
 #' [correlation
 #' vignette](https://cmu-delphi.github.io/epiprocess/articles/correlation.html)
-#' for examples.
+#' for examples.
 #'
 #' @param x The `epi_df` object under consideration.
 #' @param var1,var2 The variables in `x` to correlate.
-#' @param dt1,dt2 Time shifts to consider for the two variables, respectively,
+#' @param dt1,dt2 Time shifts to consider for the two variables, respectively,
 #'   before computing correlations. Negative shifts translate into a lag
 #'   value and positive shifts into a lead value; for example, if `dt = -1`,
 #'   then the new value on June 2 is the original value on June 1; if `dt = 1`,
@@ -34,51 +34,59 @@
 #'   `method` (same as `cor()`).
 #'
 #' @return A tibble with the grouping columns first (`geo_value`, `time_value`,
-#'   or possibly others), and then a column `cor`, which gives the correlation.
-#'
+#'   or possibly others), and then a column `cor`, which gives the correlation.
+#'
 #' @importFrom stats cor
 #' @importFrom rlang .data !! !!! enquo syms
 #' @importFrom tidyselect eval_select
 #' @export
 #' @examples
-#'
+#'
 #' # linear association of case and death rates on any given day
-#' epi_cor(x = jhu_csse_daily_subset,
-#'         var1 = case_rate_7d_av,
-#'         var2 = death_rate_7d_av,
-#'         cor_by = "time_value")
-#'
+#' epi_cor(
+#'   x = jhu_csse_daily_subset,
+#'   var1 = case_rate_7d_av,
+#'   var2 = death_rate_7d_av,
+#'   cor_by = "time_value"
+#' )
+#'
 #' # correlation of death rates and lagged case rates
-#' epi_cor(x = jhu_csse_daily_subset,
-#'         var1 = case_rate_7d_av,
-#'         var2 = death_rate_7d_av,
-#'         cor_by = time_value,
-#'         dt1 = -2)
-#'
-#' # correlation grouped by location
-#' epi_cor(x = jhu_csse_daily_subset,
-#'         var1 = case_rate_7d_av,
-#'         var2 = death_rate_7d_av,
-#'         cor_by = geo_value)
-#'
+#' epi_cor(
+#'   x = jhu_csse_daily_subset,
+#'   var1 = case_rate_7d_av,
+#'   var2 = death_rate_7d_av,
+#'   cor_by = time_value,
+#'   dt1 = -2
+#' )
+#'
+#' # correlation grouped by location
+#' epi_cor(
+#'   x = jhu_csse_daily_subset,
+#'   var1 = case_rate_7d_av,
+#'   var2 = death_rate_7d_av,
+#'   cor_by = geo_value
+#' )
+#'
 #' # correlation grouped by location, incorporating lagged case rates
-#' epi_cor(x = jhu_csse_daily_subset,
-#'         var1 = case_rate_7d_av,
-#'         var2 = death_rate_7d_av,
-#'         cor_by = geo_value,
-#'         dt1 = -2)
-epi_cor = function(x, var1, var2, dt1 = 0, dt2 = 0, shift_by = geo_value,
-                   cor_by = geo_value, use = "na.or.complete",
-                   method = c("pearson", "kendall", "spearman")) {
+#' epi_cor(
+#'   x = jhu_csse_daily_subset,
+#'   var1 = case_rate_7d_av,
+#'   var2 = death_rate_7d_av,
+#'   cor_by = geo_value,
+#'   dt1 = -2
+#' )
+epi_cor <- function(x, var1, var2, dt1 = 0, dt2 = 0, shift_by = geo_value,
+                    cor_by = geo_value, use = "na.or.complete",
+                    method = c("pearson", "kendall", "spearman")) {
   # Check we have an `epi_df` object
  if
(!inherits(x, "epi_df")) Abort("`x` must be of class `epi_df`.")

   # Check that we have variables to do computations on
   if (missing(var1)) Abort("`var1` must be specified.")
   if (missing(var2)) Abort("`var2` must be specified.")
-  var1 = enquo(var1)
-  var2 = enquo(var2)
-
+  var1 <- enquo(var1)
+  var2 <- enquo(var2)
+
   # Defuse grouping variables. This looks a bit more involved since we want to
   # accommodate the option of specifying multiple variables for each grouping.
   # Hence use the power of tidyselect::eval_select(), which can accommodate any
@@ -88,26 +96,33 @@ epi_cor = function(x, var1, var2, dt1 = 0, dt2 = 0, shift_by = geo_value,
   # * cor_by = c(a, b)
   # * cor_by = c("a", "b")
   # and so on, and similarly for shift_by. Note: make sure to follow with !!!
-  cor_by = syms(names(eval_select(enquo(cor_by), x)))
-  shift_by = syms(names(eval_select(enquo(shift_by), x)))
+  cor_by <- syms(names(eval_select(enquo(cor_by), x)))
+  shift_by <- syms(names(eval_select(enquo(shift_by), x)))

   # Which method?
-  method = match.arg(method)
+  method <- match.arg(method)

   # Perform time shifts, then compute appropriate correlations and return
   return(x %>%
-           dplyr::group_by(!!!shift_by) %>%
-           dplyr::arrange(.data$time_value) %>%
-           dplyr::mutate(var1 = shift(!!var1, n = dt1),
-                         var2 = shift(!!var2, n = dt2)) %>%
-           dplyr::ungroup() %>%
-           dplyr::group_by(!!!cor_by) %>%
-           dplyr::summarize(cor = cor(x = .data$var1, y = .data$var2,
-                                      use = use, method = method)))
+    dplyr::group_by(!!!shift_by) %>%
+    dplyr::arrange(.data$time_value) %>%
+    dplyr::mutate(
+      var1 = shift(!!var1, n = dt1),
+      var2 = shift(!!var2, n = dt2)
+    ) %>%
+    dplyr::ungroup() %>%
+    dplyr::group_by(!!!cor_by) %>%
+    dplyr::summarize(cor = cor(
+      x = .data$var1, y = .data$var2,
+      use = use, method = method
+    )))
 }

 # Function to perform time shifts, lag or lead
-shift = function(var, n) {
-  if (n < 0) return(dplyr::lag(var, -n))
-  else return(dplyr::lead(var, n))
+shift <- function(var, n) {
+  if (n < 0) {
+    return(dplyr::lag(var, -n))
+  } else {
+    return(dplyr::lead(var, n))
+  }
 }
diff --git a/R/data.R b/R/data.R
index 248288eb..ead3dfdd 100644
--- a/R/data.R
+++ b/R/data.R
@@ -1,41 +1,41 @@
-#' Subset of JHU daily state cases and deaths
+#' Subset of JHU daily state cases and deaths
 #'
 #' This data source of confirmed COVID-19 cases and deaths
 #' is based on reports made available by the Center for
 #' Systems Science and Engineering at Johns Hopkins University.
-#' This example data ranges from Mar 1, 2020 to Dec 31, 2021, and is limited to
+#' This example data ranges from Mar 1, 2020 to Dec 31, 2021, and is limited to
 #' California, Florida, Texas, New York, Georgia, and Pennsylvania.
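For reference, the sign convention of the internal `shift()` helper above, and hence of `dt1`/`dt2` in `epi_cor()`, on a toy vector (the helper is unexported, so from outside the package it would be reached via `epiprocess:::shift()`):

epiprocess:::shift(c(1, 2, 3), n = -1) # NA 1 2: a lag, via dplyr::lag
epiprocess:::shift(c(1, 2, 3), n = 1) # 2 3 NA: a lead, via dplyr::lead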
#' #' @format A tibble with 4026 rows and 6 variables: #' \describe{ -#' \item{geo_value}{the geographic value associated with each row +#' \item{geo_value}{the geographic value associated with each row #' of measurements.} #' \item{time_value}{the time value associated with each row of measurements.} -#' \item{case_rate_7d_av}{7-day average signal of number of new +#' \item{case_rate_7d_av}{7-day average signal of number of new #' confirmed COVID-19 cases per 100,000 population, daily} -#' \item{death_rate_7d_av}{7-day average signal of number of new confirmed +#' \item{death_rate_7d_av}{7-day average signal of number of new confirmed #' deaths due to COVID-19 per 100,000 population, daily} #' \item{cases}{Number of new confirmed COVID-19 cases, daily} -#' \item{cases_7d_av}{7-day average signal of number of new confirmed +#' \item{cases_7d_av}{7-day average signal of number of new confirmed #' COVID-19 cases, daily} #' } -#' @source This object contains a modified part of the +#' @source This object contains a modified part of the #' \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University} -#' as \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{republished in the COVIDcast Epidata API}. +#' as \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{republished in the COVIDcast Epidata API}. #' This data set is licensed under the terms of the #' \href{https://creativecommons.org/licenses/by/4.0/}{Creative Commons Attribution 4.0 International license} #' by the Johns Hopkins University on behalf of its Center for Systems Science #' in Engineering. Copyright Johns Hopkins University 2020. #' #' Modifications: -#' * \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{From the COVIDcast Epidata API}: -#' These signals are taken directly from the JHU CSSE -#' \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 GitHub repository} -#' without changes. The 7-day average signals are computed by Delphi by -#' calculating moving averages of the preceding 7 days, so the signal for -#' June 7 is the average of the underlying data for June 1 through 7, +#' * \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{From the COVIDcast Epidata API}: +#' These signals are taken directly from the JHU CSSE +#' \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 GitHub repository} +#' without changes. The 7-day average signals are computed by Delphi by +#' calculating moving averages of the preceding 7 days, so the signal for +#' June 7 is the average of the underlying data for June 1 through 7, #' inclusive. -#' * Furthermore, the data has been limited to a very small number of rows, +#' * Furthermore, the data has been limited to a very small number of rows, #' the signal names slightly altered, and formatted into a tibble. "jhu_csse_daily_subset" @@ -46,7 +46,7 @@ #' provided to us by health system partners, and also contains confirmed #' COVID-19 cases based on reports made available by the Center for #' Systems Science and Engineering at Johns Hopkins University. -#' This example data ranges from June 1, 2020 to Dec 1, 2021, and +#' This example data ranges from June 1, 2020 to Dec 1, 2021, and #' is also limited to California, Florida, Texas, and New York. #' #' @format An `epi_archive` data format. 
The data table DT has 129,638 rows and 5 columns: @@ -88,23 +88,23 @@ #' @return Boolean #' #' @noRd -some_package_is_being_unregistered = function(parent_n = 0L) { - calls = sys.calls() +some_package_is_being_unregistered <- function(parent_n = 0L) { + calls <- sys.calls() # `calls` will include the call to this function; strip out this call plus # `parent_n` additional requested calls to make it like we're reasoning about # the desired call. This could prevent potential false positives from # triggering if, in a later version, we decide to loosen the `call_name` # checks below to something that would be `TRUE` for the name of this function # or one of the undesired call ancestors. - calls_to_inspect = utils::head(calls, n = -(parent_n + 1L)) + calls_to_inspect <- utils::head(calls, n = -(parent_n + 1L)) # Note that `utils::head(sys.calls(), n=-1L)` isn't equivalent, due to lazy # argument evaluation. Note that copy-pasting the body of this function # without this `utils::head` operation isn't always equivalent to calling it; # e.g., within the `value` argument of a package-level `delayedAssign`, # `sys.calls()` will return `NULL` is some or all cases, including when its # evaluation has been triggered via `unregister`. - simple_call_names = purrr::map_chr(calls_to_inspect, function(call) { - maybe_simple_call_name = rlang::call_name(call) + simple_call_names <- purrr::map_chr(calls_to_inspect, function(call) { + maybe_simple_call_name <- rlang::call_name(call) if (is.null(maybe_simple_call_name)) NA_character_ else maybe_simple_call_name }) # `pkgload::unregister` is an (the?) exported function that forces @@ -127,11 +127,11 @@ some_package_is_being_unregistered = function(parent_n = 0L) { #' different than when using `delayedAssign` directly. #' #' @noRd -delayed_assign_with_unregister_awareness = function(x, value, - eval.env = rlang::caller_env(), - assign.env = rlang::caller_env()) { - value_quosure = rlang::as_quosure(rlang::enexpr(value), eval.env) - this_env = environment() +delayed_assign_with_unregister_awareness <- function(x, value, + eval.env = rlang::caller_env(), + assign.env = rlang::caller_env()) { + value_quosure <- rlang::as_quosure(rlang::enexpr(value), eval.env) + this_env <- environment() delayedAssign(x, eval.env = this_env, assign.env = assign.env, value = { if (some_package_is_being_unregistered()) { withCallingHandlers( @@ -144,26 +144,30 @@ delayed_assign_with_unregister_awareness = function(x, value, # all.) rlang::eval_bare(rlang::quo_get_expr(value_quosure), rlang::quo_get_env(value_quosure)), error = function(err) { - Abort(paste("An error was raised while attempting to evaluate a promise", - "(prepared with `delayed_assign_with_unregister_awareness`)", - "while an `unregister` or `unregister_namespace` call", - "was being evaluated.", - "This can happen, for example, when `devtools::load_all`", - "reloads a package that contains a buggy promise,", - "because reloading can cause old package-level promises to", - "be forced via `pkgload::unregister` and", - "`pkgload:::unregister_namespace`, due to", - "https://github.com/r-lib/pkgload/pull/157.", - "If this is the current situation, you might be able to", - "be successfully reload the package again after", - "`unloadNamespace`-ing it (but this situation will", - "keep re-occurring every other `devtools::load`", - "and every `devtools:document` until the bug or situation", - "generating the promise's error has been resolved)." 
- ), - class = "epiprocess__promise_evaluation_error_during_unregister", - parent = err) - }) + Abort( + paste( + "An error was raised while attempting to evaluate a promise", + "(prepared with `delayed_assign_with_unregister_awareness`)", + "while an `unregister` or `unregister_namespace` call", + "was being evaluated.", + "This can happen, for example, when `devtools::load_all`", + "reloads a package that contains a buggy promise,", + "because reloading can cause old package-level promises to", + "be forced via `pkgload::unregister` and", + "`pkgload:::unregister_namespace`, due to", + "https://github.com/r-lib/pkgload/pull/157.", + "If this is the current situation, you might be able to", + "be successfully reload the package again after", + "`unloadNamespace`-ing it (but this situation will", + "keep re-occurring every other `devtools::load`", + "and every `devtools:document` until the bug or situation", + "generating the promise's error has been resolved)." + ), + class = "epiprocess__promise_evaluation_error_during_unregister", + parent = err + ) + } + ) } else { rlang::eval_bare(rlang::quo_get_expr(value_quosure), rlang::quo_get_env(value_quosure)) } @@ -189,14 +193,14 @@ delayed_assign_with_unregister_awareness = function(x, value, # binding may have been created with the same name as the package promise, and # this binding will stick around even when the package is reloaded, and will # need to be `rm`-d to easily access the refreshed package promise. -delayed_assign_with_unregister_awareness("archive_cases_dv_subset", as_epi_archive(archive_cases_dv_subset_dt, compactify=FALSE)) +delayed_assign_with_unregister_awareness("archive_cases_dv_subset", as_epi_archive(archive_cases_dv_subset_dt, compactify = FALSE)) #' Subset of JHU daily cases from California and Florida #' #' This data source of confirmed COVID-19 cases #' is based on reports made available by the Center for #' Systems Science and Engineering at Johns Hopkins University. -#' This example data is a snapshot as of Oct 28, 2021 and captures the cases +#' This example data is a snapshot as of Oct 28, 2021 and captures the cases #' from June 1, 2020 to May 31, 2021 #' and is limited to California and Florida. #' @@ -222,7 +226,7 @@ delayed_assign_with_unregister_awareness("archive_cases_dv_subset", as_epi_archi #' This data source of confirmed COVID-19 cases and deaths #' is based on reports made available by the Center for #' Systems Science and Engineering at Johns Hopkins University. -#' This example data ranges from Mar 1, 2020 to Dec 31, 2021, +#' This example data ranges from Mar 1, 2020 to Dec 31, 2021, #' and is limited to Massachusetts and Vermont. #' #' @format A tibble with 16,212 rows and 5 variables: diff --git a/R/epiprocess.R b/R/epiprocess.R index e047de8c..bbdcf4f3 100644 --- a/R/epiprocess.R +++ b/R/epiprocess.R @@ -2,7 +2,7 @@ #' #' This package introduces a common data structure for epidemiological data sets #' measured over space and time, and offers associated utilities to perform -#' basic signal processing tasks. +#' basic signal processing tasks. 
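A minimal sketch of the base-R promise mechanism that the `delayed_assign_with_unregister_awareness()` wrapper above builds on (the binding name and message are illustrative):

delayedAssign("lazy_subset", {
  message("promise forced now")
  1:3
})
# No message yet; the expression runs only on first access:
sum(lazy_subset)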
#' #' @docType package #' @name epiprocess diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R index b11bf821..f083cf93 100644 --- a/R/grouped_epi_archive.R +++ b/R/grouped_epi_archive.R @@ -1,4 +1,3 @@ - #' Get var names from select-only `tidy_select`ing `...` in `.data` #' #' Convenience function for performing a `tidy_select` on dots according to its @@ -11,7 +10,7 @@ #' `names(.data)` denoting the selection #' #' @noRd -eval_pure_select_names_from_dots = function(..., .data) { +eval_pure_select_names_from_dots <- function(..., .data) { # `?tidyselect::eval_select` tells us to use this form when we take in dots. # It seems a bit peculiar, since the expr doesn't pack with it a way to get at # the environment for the dots, but it looks like `eval_select` will assume @@ -20,7 +19,7 @@ eval_pure_select_names_from_dots = function(..., .data) { # # If we were allowing renaming, we'd need to be careful about which names (new # vs. old vs. both) to return here. - names(tidyselect::eval_select(rlang::expr(c(...)), .data, allow_rename=FALSE)) + names(tidyselect::eval_select(rlang::expr(c(...)), .data, allow_rename = FALSE)) } #' Get names of dots without forcing the dots @@ -31,16 +30,16 @@ eval_pure_select_names_from_dots = function(..., .data) { #' dots if we're using NSE. #' #' @noRd -nse_dots_names = function(...) { +nse_dots_names <- function(...) { names(rlang::call_match()) } -nse_dots_names2 = function(...) { +nse_dots_names2 <- function(...) { rlang::names2(rlang::call_match()) } #' @importFrom dplyr group_by_drop_default #' @noRd -grouped_epi_archive = +grouped_epi_archive <- R6::R6Class( classname = "grouped_epi_archive", # (We don't R6-inherit `epi_archive` or S3-multiclass with "epi_archive"; @@ -55,34 +54,39 @@ grouped_epi_archive = initialize = function(ungrouped, vars, drop) { if (inherits(ungrouped, "grouped_epi_archive")) { Abort("`ungrouped` must not already be grouped (neither automatic regrouping nor nested grouping is supported). 
Either use `group_by` with `.add=TRUE`, or `ungroup` first.", - class="epiprocess__grouped_epi_archive__ungrouped_arg_is_already_grouped", - epiprocess__ungrouped_class = class(ungrouped), - epiprocess__ungrouped_groups = groups(ungrouped)) + class = "epiprocess__grouped_epi_archive__ungrouped_arg_is_already_grouped", + epiprocess__ungrouped_class = class(ungrouped), + epiprocess__ungrouped_groups = groups(ungrouped) + ) } if (!inherits(ungrouped, "epi_archive")) { Abort("`ungrouped` must be an epi_archive", - class="epiprocess__grouped_epi_archive__ungrouped_arg_is_not_epi_archive", - epiprocess__ungrouped_class = class(ungrouped)) + class = "epiprocess__grouped_epi_archive__ungrouped_arg_is_not_epi_archive", + epiprocess__ungrouped_class = class(ungrouped) + ) } if (!is.character(vars)) { Abort("`vars` must be a character vector (any tidyselection should have already occurred in a helper method).", - class="epiprocess__grouped_epi_archive__vars_is_not_chr", - epiprocess__vars_class = class(vars), - epiprocess__vars_type = typeof(vars)) + class = "epiprocess__grouped_epi_archive__vars_is_not_chr", + epiprocess__vars_class = class(vars), + epiprocess__vars_type = typeof(vars) + ) } if (!all(vars %in% names(ungrouped$DT))) { Abort("`vars` must be selected from the names of columns of `ungrouped$DT`", - class="epiprocess__grouped_epi_archive__vars_contains_invalid_entries", - epiprocess__vars = vars, - epiprocess__DT_names = names(ungrouped$DT)) + class = "epiprocess__grouped_epi_archive__vars_contains_invalid_entries", + epiprocess__vars = vars, + epiprocess__DT_names = names(ungrouped$DT) + ) } if ("version" %in% vars) { Abort("`version` has a special interpretation and cannot be used by itself as a grouping variable") } if (!rlang::is_bool(drop)) { Abort("`drop` must be a Boolean", - class="epiprocess__grouped_epi_archive__drop_is_not_bool", - epiprocess__drop = drop) + class = "epiprocess__grouped_epi_archive__drop_is_not_bool", + epiprocess__drop = drop + ) } # ----- private$ungrouped <- ungrouped @@ -91,7 +95,7 @@ grouped_epi_archive = }, print = function(class = TRUE, methods = TRUE) { if (class) cat("A `grouped_epi_archive` object:\n") - writeLines(wrap_varnames(private$vars, initial="* Groups: ")) + writeLines(wrap_varnames(private$vars, initial = "* Groups: ")) # If none of the grouping vars is a factor, then $drop doesn't seem # relevant, so try to be less verbose and don't message about it. # @@ -102,7 +106,7 @@ grouped_epi_archive = # something to rely too much on), while map functions currently appear # to avoid column copies. 
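          # A usage sketch of the `initialize` validation above, kept in
          # comment form since this point of the patch is inside the `print`
          # method: grouping by an ordinary column is fine, but `version` by
          # itself is rejected, per the check above:
          #   grp <- archive_cases_dv_subset$group_by(geo_value) # ok
          #   archive_cases_dv_subset$group_by(version) # Abort()s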
if (any(purrr::map_lgl(private$ungrouped$DT, is.factor)[private$vars])) { - cat(strwrap(init="* ", prefix=" ", sprintf( + cat(strwrap(init = "* ", prefix = " ", sprintf( "%s groups formed by factor levels that don't appear in the data", if (private$drop) "Drops" else "Does not drop" ))) @@ -113,14 +117,20 @@ grouped_epi_archive = if (methods) { cat("----------\n") cat("Public `grouped_epi_archive` R6 methods:\n") - grouped_method_names = names(grouped_epi_archive$public_methods) - ungrouped_method_names = names(epi_archive$public_methods) - writeLines(wrap_varnames(initial = "\u2022 Specialized `epi_archive` methods: ", - intersect(grouped_method_names, ungrouped_method_names))) - writeLines(wrap_varnames(initial = "\u2022 Exclusive to `grouped_epi_archive`: ", - setdiff(grouped_method_names, ungrouped_method_names))) - writeLines(wrap_varnames(initial = "\u2022 `ungroup` to use: ", - setdiff(ungrouped_method_names, grouped_method_names))) + grouped_method_names <- names(grouped_epi_archive$public_methods) + ungrouped_method_names <- names(epi_archive$public_methods) + writeLines(wrap_varnames( + initial = "\u2022 Specialized `epi_archive` methods: ", + intersect(grouped_method_names, ungrouped_method_names) + )) + writeLines(wrap_varnames( + initial = "\u2022 Exclusive to `grouped_epi_archive`: ", + setdiff(grouped_method_names, ungrouped_method_names) + )) + writeLines(wrap_varnames( + initial = "\u2022 `ungroup` to use: ", + setdiff(ungrouped_method_names, grouped_method_names) + )) } # Return self invisibly for convenience in `$`-"pipe": invisible(self) @@ -135,14 +145,15 @@ grouped_epi_archive = If you want to "regroup", replacing the existing grouping vars, `ungroup` first and then `group_by`. If you want to add to the existing grouping vars, call `group_by` specifying `.add=TRUE`. ', - class = "epiprocess__grouped_epi_archive_group_by_with_add_FALSE") + class = "epiprocess__grouped_epi_archive_group_by_with_add_FALSE" + ) } else { # `group_by` `...` computations are performed on ungrouped data (see # `?dplyr::group_by`) - detailed_mutate = epix_detailed_restricted_mutate(private$ungrouped, ...) - out_ungrouped = detailed_mutate[["archive"]] - vars_from_dots = detailed_mutate[["request_names"]] - vars = union(private$vars, vars_from_dots) + detailed_mutate <- epix_detailed_restricted_mutate(private$ungrouped, ...) + out_ungrouped <- detailed_mutate[["archive"]] + vars_from_dots <- detailed_mutate[["request_names"]] + vars <- union(private$vars, vars_from_dots) grouped_epi_archive$new(private$ungrouped, vars, .drop) } }, @@ -158,11 +169,11 @@ grouped_epi_archive = # an ungrouped class, as with `grouped_df`s. private$ungrouped } else { - exclude_vars = eval_pure_select_names_from_dots(..., .data=private$ungrouped$DT) + exclude_vars <- eval_pure_select_names_from_dots(..., .data = private$ungrouped$DT) # (requiring a pure selection here is a little stricter than dplyr # implementations, but passing a renaming selection into `ungroup` # seems pretty weird.) - result_vars = private$vars[! private$vars %in% exclude_vars] + result_vars <- private$vars[!private$vars %in% exclude_vars] # `vars` might be length 0 if the user's tidyselection removed all # grouping vars. 
Unlike with tibble, opt here to keep the result as a # grouped_epi_archive, for output class consistency when `...` is @@ -170,36 +181,36 @@ grouped_epi_archive = grouped_epi_archive$new(private$ungrouped, result_vars, private$drop) } }, -#' @description Filter to keep only older versions by mutating the underlying -#' `epi_archive` using `$truncate_versions_after`. Returns the mutated -#' `grouped_epi_archive` [invisibly][base::invisible]. -#' @param x as in [`epix_truncate_versions_after`] -#' @param max_version as in [`epix_truncate_versions_after`] + #' @description Filter to keep only older versions by mutating the underlying + #' `epi_archive` using `$truncate_versions_after`. Returns the mutated + #' `grouped_epi_archive` [invisibly][base::invisible]. + #' @param x as in [`epix_truncate_versions_after`] + #' @param max_version as in [`epix_truncate_versions_after`] truncate_versions_after = function(max_version) { # The grouping is irrelevant for this method; if we were to split into # groups and recombine appropriately, we should get the same result as # just leveraging the ungrouped method, so just do the latter: private$ungrouped$truncate_versions_after(max_version) - return (invisible(self)) + return(invisible(self)) }, -#' @description Slides a given function over variables in a `grouped_epi_archive` -#' object. See the documentation for the wrapper function [`epix_slide()`] for -#' details. -#' @importFrom data.table key address rbindlist setDF -#' @importFrom tibble as_tibble new_tibble validate_tibble -#' @importFrom dplyr group_by groups -#' @importFrom rlang !! !!! enquo quo_is_missing enquos is_quosure sym syms -#' env missing_arg - slide = function(f, ..., before, ref_time_values, - time_step, new_col_name = "slide_value", - as_list_col = FALSE, names_sep = "_", - all_versions = FALSE) { - # Perform some deprecated argument checks without using ` = - # deprecated()` in the function signature, because they are from - # early development versions and much more likely to be clutter than - # informative in the signature. - if ("group_by" %in% nse_dots_names(...)) { - Abort(" + #' @description Slides a given function over variables in a `grouped_epi_archive` + #' object. See the documentation for the wrapper function [`epix_slide()`] for + #' details. + #' @importFrom data.table key address rbindlist setDF + #' @importFrom tibble as_tibble new_tibble validate_tibble + #' @importFrom dplyr group_by groups + #' @importFrom rlang !! !!! enquo quo_is_missing enquos is_quosure sym syms + #' env missing_arg + slide = function(f, ..., before, ref_time_values, + time_step, new_col_name = "slide_value", + as_list_col = FALSE, names_sep = "_", + all_versions = FALSE) { + # Perform some deprecated argument checks without using ` = + # deprecated()` in the function signature, because they are from + # early development versions and much more likely to be clutter than + # informative in the signature. + if ("group_by" %in% nse_dots_names(...)) { + Abort(" The `group_by` argument to `slide` has been removed; please use the `group_by` S3 generic function or `$group_by` R6 method before the slide instead. (If you were instead trying to pass a @@ -208,208 +219,209 @@ grouped_epi_archive = different column name here and rename the resulting column after the slide.) 
", class = "epiprocess__epix_slide_group_by_parameter_deprecated") - } - if ("all_rows" %in% nse_dots_names(...)) { - Abort(" + } + if ("all_rows" %in% nse_dots_names(...)) { + Abort(" The `all_rows` argument has been removed from `epix_slide` (but is still supported in `epi_slide`). Add rows for excluded results with a manual join instead. ", class = "epiprocess__epix_slide_all_rows_parameter_deprecated") - } - - if (missing(ref_time_values)) { - ref_time_values = epix_slide_ref_time_values_default(private$ungrouped) - } else if (length(ref_time_values) == 0L) { - Abort("`ref_time_values` must have at least one element.") - } else if (any(is.na(ref_time_values))) { - Abort("`ref_time_values` must not include `NA`.") - } else if (anyDuplicated(ref_time_values) != 0L) { - Abort("`ref_time_values` must not contain any duplicates; use `unique` if appropriate.") - } else if (any(ref_time_values > private$ungrouped$versions_end)) { - Abort("All `ref_time_values` must be `<=` the `versions_end`.") - } else { - # Sort, for consistency with `epi_slide`, although the current - # implementation doesn't take advantage of it. - ref_time_values = sort(ref_time_values) - } + } - # Validate and pre-process `before`: - if (missing(before)) { - Abort("`before` is required (and must be passed by name); + if (missing(ref_time_values)) { + ref_time_values <- epix_slide_ref_time_values_default(private$ungrouped) + } else if (length(ref_time_values) == 0L) { + Abort("`ref_time_values` must have at least one element.") + } else if (any(is.na(ref_time_values))) { + Abort("`ref_time_values` must not include `NA`.") + } else if (anyDuplicated(ref_time_values) != 0L) { + Abort("`ref_time_values` must not contain any duplicates; use `unique` if appropriate.") + } else if (any(ref_time_values > private$ungrouped$versions_end)) { + Abort("All `ref_time_values` must be `<=` the `versions_end`.") + } else { + # Sort, for consistency with `epi_slide`, although the current + # implementation doesn't take advantage of it. + ref_time_values <- sort(ref_time_values) + } + + # Validate and pre-process `before`: + if (missing(before)) { + Abort("`before` is required (and must be passed by name); if you did not want to apply a sliding window but rather to map `as_of` and `f` across various `ref_time_values`, pass a large `before` value (e.g., if time steps are days, `before=365000`).") - } - before <- vctrs::vec_cast(before, integer()) - if (length(before) != 1L || is.na(before) || before < 0L) { - Abort("`before` must be length-1, non-NA, non-negative.") - } + } + before <- vctrs::vec_cast(before, integer()) + if (length(before) != 1L || is.na(before) || before < 0L) { + Abort("`before` must be length-1, non-NA, non-negative.") + } - # If a custom time step is specified, then redefine units - - if (!missing(time_step)) before <- time_step(before) - - # Symbolize column name - new_col = sym(new_col_name) + # If a custom time step is specified, then redefine units - # Validate rest of parameters: - if (!rlang::is_bool(as_list_col)) { - Abort("`as_list_col` must be TRUE or FALSE.") - } - if (! 
(rlang::is_string(names_sep) || is.null(names_sep)) ) { - Abort("`names_sep` must be a (single) string or NULL.") - } - if (!rlang::is_bool(all_versions)) { - Abort("`all_versions` must be TRUE or FALSE.") - } + if (!missing(time_step)) before <- time_step(before) - # Computation for one group, one time value - comp_one_grp = function(.data_group, .group_key, - f, ..., - ref_time_value, - new_col) { - # Carry out the specified computation - comp_value = f(.data_group, .group_key, ref_time_value, ...) + # Symbolize column name + new_col <- sym(new_col_name) - if (all_versions) { - # Extract data from archive so we can do length checks below. When - # `all_versions = TRUE`, `.data_group` will always be an ungrouped - # archive because of the preceding `as_of` step. - .data_group = .data_group$DT - } + # Validate rest of parameters: + if (!rlang::is_bool(as_list_col)) { + Abort("`as_list_col` must be TRUE or FALSE.") + } + if (!(rlang::is_string(names_sep) || is.null(names_sep))) { + Abort("`names_sep` must be a (single) string or NULL.") + } + if (!rlang::is_bool(all_versions)) { + Abort("`all_versions` must be TRUE or FALSE.") + } - if (! (is.atomic(comp_value) || is.data.frame(comp_value))) { - Abort("The slide computation must return an atomic vector or a data frame.") - } + # Computation for one group, one time value + comp_one_grp <- function(.data_group, .group_key, + f, ..., + ref_time_value, + new_col) { + # Carry out the specified computation + comp_value <- f(.data_group, .group_key, ref_time_value, ...) - # Label every result row with the `ref_time_value` - res <- list(time_value = ref_time_value) + if (all_versions) { + # Extract data from archive so we can do length checks below. When + # `all_versions = TRUE`, `.data_group` will always be an ungrouped + # archive because of the preceding `as_of` step. + .data_group <- .data_group$DT + } - # Wrap the computation output in a list and unchop/unnest later if - # `as_list_col = FALSE`. This approach means that we will get a - # list-class col rather than a data.frame-class col when - # `as_list_col = TRUE` and the computations outputs are data - # frames. - res[[new_col]] <- list(comp_value) + if (!(is.atomic(comp_value) || is.data.frame(comp_value))) { + Abort("The slide computation must return an atomic vector or a data frame.") + } - # Convert the list to a tibble all at once for speed. - return(validate_tibble(new_tibble(res))) - } - - # If `f` is missing, interpret ... as an expression for tidy evaluation - if (missing(f)) { - quos = enquos(...) - if (length(quos) == 0) { - Abort("If `f` is missing then a computation must be specified via `...`.") - } - if (length(quos) > 1) { - Abort("If `f` is missing then only a single computation can be specified via `...`.") - } - - f = quos[[1]] - new_col = sym(names(rlang::quos_auto_name(quos))) - ... = missing_arg() # magic value that passes zero args as dots in calls below - } + # Label every result row with the `ref_time_value` + res <- list(time_value = ref_time_value) - f = as_slide_computation(f, ...) - x = lapply(ref_time_values, function(ref_time_value) { - # Ungrouped as-of data; `epi_df` if `all_versions` is `FALSE`, - # `epi_archive` if `all_versions` is `TRUE`: - as_of_raw = private$ungrouped$as_of(ref_time_value, min_time_value = ref_time_value - before, all_versions = all_versions) + # Wrap the computation output in a list and unchop/unnest later if + # `as_list_col = FALSE`. 
This approach means that we will get a + # list-class col rather than a data.frame-class col when + # `as_list_col = TRUE` and the computations outputs are data + # frames. + res[[new_col]] <- list(comp_value) - # Set: - # * `as_of_df`, the data.frame/tibble/epi_df/etc. that we will - # `group_modify` as the `.data` argument. Might or might not - # include version column. - # * `group_modify_fn`, the corresponding `.f` argument - if (!all_versions) { - as_of_df = as_of_raw - group_modify_fn = comp_one_grp - } else { - as_of_archive = as_of_raw - # We essentially want to `group_modify` the archive, but - # haven't implemented this method yet. Next best would be - # `group_modify` on its `$DT`, but that has different - # behavior based on whether or not `dtplyr` is loaded. - # Instead, go through an ordinary data frame, trying to avoid - # copies. - if (address(as_of_archive$DT) == address(private$ungrouped$DT)) { - # `as_of` aliased its the full `$DT`; copy before mutating: - # - # Note: this step is probably unneeded; we're fine with - # aliasing of the DT or its columns: vanilla operations aren't - # going to mutate them in-place if they are aliases, and we're - # not performing mutation (unlike the situation with - # `fill_through_version` where we do mutate a `DT` and don't - # want aliasing). - as_of_archive$DT <- copy(as_of_archive$DT) - } - dt_key = data.table::key(as_of_archive$DT) - as_of_df = as_of_archive$DT - data.table::setDF(as_of_df) + # Convert the list to a tibble all at once for speed. + return(validate_tibble(new_tibble(res))) + } - # Convert each subgroup chunk to an archive before running the calculation. - group_modify_fn = function(.data_group, .group_key, - f, ..., - ref_time_value, - new_col) { - # .data_group is coming from as_of_df as a tibble, but we - # want to feed `comp_one_grp` an `epi_archive` backed by a - # DT; convert and wrap: - data.table::setattr(.data_group, "sorted", dt_key) - data.table::setDT(.data_group, key=dt_key) - .data_group_archive = as_of_archive$clone() - .data_group_archive$DT = .data_group - comp_one_grp(.data_group_archive, .group_key, f = f, ..., - ref_time_value = ref_time_value, - new_col = new_col - ) - } - } + # If `f` is missing, interpret ... as an expression for tidy evaluation + if (missing(f)) { + quos <- enquos(...) + if (length(quos) == 0) { + Abort("If `f` is missing then a computation must be specified via `...`.") + } + if (length(quos) > 1) { + Abort("If `f` is missing then only a single computation can be specified via `...`.") + } - return( - dplyr::group_modify( - dplyr::group_by(as_of_df, !!!syms(private$vars), .drop=private$drop), - group_modify_fn, - f = f, ..., - ref_time_value = ref_time_value, - new_col = new_col, - .keep = TRUE - ) - ) - }) - # Combine output into a single tibble - x <- as_tibble(setDF(rbindlist(x))) - # Reconstruct groups - x <- group_by(x, !!!syms(private$vars), .drop=private$drop) + f <- quos[[1]] + new_col <- sym(names(rlang::quos_auto_name(quos))) + ... <- missing_arg() # magic value that passes zero args as dots in calls below + } - # Unchop/unnest if we need to - if (!as_list_col) { - x = tidyr::unnest(x, !!new_col, names_sep = names_sep) + f <- as_slide_computation(f, ...) 
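          # A usage sketch of the two calling conventions handled above, kept
          # in comment form since this point of the patch is inside the `slide`
          # method: `f` may be an rlang-style formula, or omitted in favor of a
          # single named tidy-eval expression whose name becomes the output
          # column. Assuming `dplyr` is attached, these two calls are meant to
          # be equivalent:
          #   archive_cases_dv_subset %>%
          #     group_by(geo_value) %>%
          #     epix_slide(f = ~ mean(.x$case_rate_7d_av), before = 6,
          #                new_col_name = "rate_av")
          #   archive_cases_dv_subset %>%
          #     group_by(geo_value) %>%
          #     epix_slide(rate_av = mean(case_rate_7d_av), before = 6)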
+ x <- lapply(ref_time_values, function(ref_time_value) { + # Ungrouped as-of data; `epi_df` if `all_versions` is `FALSE`, + # `epi_archive` if `all_versions` is `TRUE`: + as_of_raw <- private$ungrouped$as_of(ref_time_value, min_time_value = ref_time_value - before, all_versions = all_versions) + + # Set: + # * `as_of_df`, the data.frame/tibble/epi_df/etc. that we will + # `group_modify` as the `.data` argument. Might or might not + # include version column. + # * `group_modify_fn`, the corresponding `.f` argument + if (!all_versions) { + as_of_df <- as_of_raw + group_modify_fn <- comp_one_grp + } else { + as_of_archive <- as_of_raw + # We essentially want to `group_modify` the archive, but + # haven't implemented this method yet. Next best would be + # `group_modify` on its `$DT`, but that has different + # behavior based on whether or not `dtplyr` is loaded. + # Instead, go through an ordinary data frame, trying to avoid + # copies. + if (address(as_of_archive$DT) == address(private$ungrouped$DT)) { + # `as_of` aliased its the full `$DT`; copy before mutating: + # + # Note: this step is probably unneeded; we're fine with + # aliasing of the DT or its columns: vanilla operations aren't + # going to mutate them in-place if they are aliases, and we're + # not performing mutation (unlike the situation with + # `fill_through_version` where we do mutate a `DT` and don't + # want aliasing). + as_of_archive$DT <- copy(as_of_archive$DT) } + dt_key <- data.table::key(as_of_archive$DT) + as_of_df <- as_of_archive$DT + data.table::setDF(as_of_df) - # if (is_epi_df(x)) { - # # The analogue of `epi_df`'s `as_of` metadata for an archive is - # # `$versions_end`, at least in the current absence of - # # separate fields/columns denoting the "archive version" with a - # # different resolution, or from the perspective of a different - # # stage of a data pipeline. The `as_of` that is automatically - # # derived won't always match; override: - # attr(x, "metadata")[["as_of"]] <- private$ungrouped$versions_end - # } + # Convert each subgroup chunk to an archive before running the calculation. + group_modify_fn <- function(.data_group, .group_key, + f, ..., + ref_time_value, + new_col) { + # .data_group is coming from as_of_df as a tibble, but we + # want to feed `comp_one_grp` an `epi_archive` backed by a + # DT; convert and wrap: + data.table::setattr(.data_group, "sorted", dt_key) + data.table::setDT(.data_group, key = dt_key) + .data_group_archive <- as_of_archive$clone() + .data_group_archive$DT <- .data_group + comp_one_grp(.data_group_archive, .group_key, + f = f, ..., + ref_time_value = ref_time_value, + new_col = new_col + ) + } + } - # XXX We need to work out when we want to return an `epi_df` and how - # to get appropriate keys (see #290, #223, #163). We'll probably - # need the commented-out code above if we ever output an `epi_df`. - # However, as a stopgap measure to have some more consistency across - # different ways of calling `epix_slide`, and to prevent `epi_df` - # output with invalid metadata, always output a (grouped or - # ungrouped) tibble. 
- x <- decay_epi_df(x) + return( + dplyr::group_modify( + dplyr::group_by(as_of_df, !!!syms(private$vars), .drop = private$drop), + group_modify_fn, + f = f, ..., + ref_time_value = ref_time_value, + new_col = new_col, + .keep = TRUE + ) + ) + }) + # Combine output into a single tibble + x <- as_tibble(setDF(rbindlist(x))) + # Reconstruct groups + x <- group_by(x, !!!syms(private$vars), .drop = private$drop) - return(x) - } + # Unchop/unnest if we need to + if (!as_list_col) { + x <- tidyr::unnest(x, !!new_col, names_sep = names_sep) + } + + # if (is_epi_df(x)) { + # # The analogue of `epi_df`'s `as_of` metadata for an archive is + # # `$versions_end`, at least in the current absence of + # # separate fields/columns denoting the "archive version" with a + # # different resolution, or from the perspective of a different + # # stage of a data pipeline. The `as_of` that is automatically + # # derived won't always match; override: + # attr(x, "metadata")[["as_of"]] <- private$ungrouped$versions_end + # } + + # XXX We need to work out when we want to return an `epi_df` and how + # to get appropriate keys (see #290, #223, #163). We'll probably + # need the commented-out code above if we ever output an `epi_df`. + # However, as a stopgap measure to have some more consistency across + # different ways of calling `epix_slide`, and to prevent `epi_df` + # output with invalid metadata, always output a (grouped or + # ungrouped) tibble. + x <- decay_epi_df(x) + + return(x) + } ) ) @@ -424,8 +436,8 @@ grouped_epi_archive = #' #' @importFrom dplyr group_by #' @export -group_by.grouped_epi_archive = function(.data, ..., .add=FALSE, .drop=dplyr::group_by_drop_default(.data)) { - .data$group_by(..., .add=.add, .drop=.drop) +group_by.grouped_epi_archive <- function(.data, ..., .add = FALSE, .drop = dplyr::group_by_drop_default(.data)) { + .data$group_by(..., .add = .add, .drop = .drop) } #' @include methods-epi_archive.R @@ -433,7 +445,7 @@ group_by.grouped_epi_archive = function(.data, ..., .add=FALSE, .drop=dplyr::gro #' #' @importFrom dplyr groups #' @export -groups.grouped_epi_archive = function(x) { +groups.grouped_epi_archive <- function(x) { x$groups() } @@ -442,7 +454,7 @@ groups.grouped_epi_archive = function(x) { #' #' @importFrom dplyr ungroup #' @export -ungroup.grouped_epi_archive = function(x, ...) { +ungroup.grouped_epi_archive <- function(x, ...) { x$ungroup(...) } @@ -450,7 +462,7 @@ ungroup.grouped_epi_archive = function(x, ...) { #' @rdname group_by.epi_archive #' #' @export -is_grouped_epi_archive = function(x) { +is_grouped_epi_archive <- function(x) { inherits(x, "grouped_epi_archive") } @@ -458,12 +470,12 @@ is_grouped_epi_archive = function(x) { #' @rdname group_by.epi_archive #' #' @export -group_by_drop_default.grouped_epi_archive = function(.tbl) { +group_by_drop_default.grouped_epi_archive <- function(.tbl) { .tbl$group_by_drop_default() } #' @export -epix_truncate_versions_after.grouped_epi_archive = function(x, max_version) { - return ((x$clone()$truncate_versions_after(max_version))) +epix_truncate_versions_after.grouped_epi_archive <- function(x, max_version) { + return((x$clone()$truncate_versions_after(max_version))) # ^ second set of parens drops invisibility } diff --git a/R/growth_rate.R b/R/growth_rate.R index 17c4ec74..f54d1277 100644 --- a/R/growth_rate.R +++ b/R/growth_rate.R @@ -1,5 +1,5 @@ #' Estimate growth rate -#' +#' #' Estimates the growth rate of a signal at given points along the underlying #' sequence. 
Several methodologies are available; see the [growth rate #' vignette](https://cmu-delphi.github.io/epiprocess/articles/growth_rate.html) @@ -10,7 +10,7 @@ #' `y`). #' @param y Signal values. #' @param x0 Points at which we should estimate the growth rate. Must be a -#' subset of `x` (no extrapolation allowed). Default is `x`. +#' subset of `x` (no extrapolation allowed). Default is `x`. #' @param method Either "rel_change", "linear_reg", "smooth_spline", or #' "trend_filter", indicating the method to use for the growth rate #' calculation. The first two are local methods: they are run in a sliding @@ -28,7 +28,7 @@ #' @param na_rm Should missing values be removed before the computation? Default #' is `FALSE`. #' @param ... Additional arguments to pass to the method used to estimate the -#' derivative. +#' derivative. #' @return Vector of growth rate estimates at the specified points `x0`. #' #' @details The growth rate of a function f defined over a continuously-valued @@ -39,8 +39,8 @@ #' the signal value itself (or possibly a smoothed version of the signal #' value). #' -#' The following methods are available for estimating the growth rate: -#' +#' The following methods are available for estimating the growth rate: +#' #' * "rel_change": uses (B/A - 1) / h, where B is the average of `y` over the #' second half of a sliding window of bandwidth h centered at the reference #' point `x0`, and A the average over the first half. This can be seen as @@ -51,11 +51,11 @@ #' * "smooth_spline": uses the estimated derivative at `x0` from a smoothing #' spline fit to `x` and `y`, via `stats::smooth.spline()`, divided by the #' fitted value of the spline at `x0`. -#' * "trend_filter": uses the estimated derivative at `x0` from polynomial trend +#' * "trend_filter": uses the estimated derivative at `x0` from polynomial trend #' filtering (a discrete spline) fit to `x` and `y`, via #' `genlasso::trendfilter()`, divided by the fitted value of the discrete #' spline at `x0`. -#' +#' #' @section Log Scale: #' An alternative view for the growth rate of a function f in general is given #' by defining g(t) = log(f(t)), and then observing that g'(t) = f'(t) / @@ -74,7 +74,7 @@ #' `Date` objects, `h = 7`, and the reference point is January 7, then the #' sliding window contains all data in between January 1 and 14 (matching the #' behavior of `epi_slide()` with `before = h - 1` and `after = h`). -#' +#' #' @section Additional Arguments: #' For the global methods, "smooth_spline" and "trend_filter", additional #' arguments can be specified via `...` for the underlying estimation @@ -89,7 +89,7 @@ #' * `maxsteps`: maximum number of steps to take in the solution path before #' terminating. Default is 1000. #' * `cv`: should cross-validation be used to choose an effective degrees of -#' freedom for the fit? Default is `TRUE`. +#' freedom for the fit? Default is `TRUE`. #' * `k`: number of folds if cross-validation is to be used. Default is 3. #' * `df`: desired effective degrees of freedom for the trend filtering fit. If #' `cv = FALSE`, then `df` must be a positive integer; if `cv = TRUE`, then @@ -98,153 +98,166 @@ #' rule, respectively. Default is "min" (going along with the default `cv = #' TRUE`). Note that if `cv = FALSE`, then we require `df` to be set by the #' user. 
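#'
#' As a quick sketch (the argument values here are arbitrary, chosen for
#' illustration only, not recommendations): with `cv = FALSE`, an integer
#' `df` is passed through `...` alongside the method choice:
#'
#' ```
#' growth_rate(x = x, y = y, method = "trend_filter", cv = FALSE, df = 10)
#' ```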
-#' +#' #' @export #' @examples #' # COVID cases growth rate by state using default method relative change -#' jhu_csse_daily_subset %>% -#' group_by(geo_value) %>% -#' mutate(cases_gr = growth_rate(x = time_value, y = cases)) -#' +#' jhu_csse_daily_subset %>% +#' group_by(geo_value) %>% +#' mutate(cases_gr = growth_rate(x = time_value, y = cases)) +#' #' # Log scale, degree 4 polynomial and 6-fold cross validation -#' jhu_csse_daily_subset %>% -#' group_by(geo_value) %>% -#' mutate(gr_poly = growth_rate( x = time_value, y = cases, log_scale = TRUE, ord = 4, k = 6)) - -growth_rate = function(x = seq_along(y), y, x0 = x, - method = c("rel_change", "linear_reg", - "smooth_spline", "trend_filter"), - h = 7, log_scale = FALSE, - dup_rm = FALSE, na_rm = FALSE, ...) { +#' jhu_csse_daily_subset %>% +#' group_by(geo_value) %>% +#' mutate(gr_poly = growth_rate(x = time_value, y = cases, log_scale = TRUE, ord = 4, k = 6)) +growth_rate <- function(x = seq_along(y), y, x0 = x, + method = c( + "rel_change", "linear_reg", + "smooth_spline", "trend_filter" + ), + h = 7, log_scale = FALSE, + dup_rm = FALSE, na_rm = FALSE, ...) { # Check x, y, x0 if (length(x) != length(y)) Abort("`x` and `y` must have the same length.") if (!all(x0 %in% x)) Abort("`x0` must be a subset of `x`.") - + # Check the method - method = match.arg(method) - + method <- match.arg(method) + # Arrange in increasing order of x - o = order(x) - x = x[o] - y = y[o] - + o <- order(x) + x <- x[o] + y <- y[o] + # Convert to log(y) if we need to - y = as.numeric(y) - if (log_scale) y = log(y) + y <- as.numeric(y) + if (log_scale) y <- log(y) # Remove duplicates if we need to if (dup_rm) { - o = !duplicated(x) + o <- !duplicated(x) if (any(!o)) { Warn("`x` contains duplicate values. (If being run on a column in an `epi_df`, did you group by relevant key variables?)") } - x = x[o] - y = y[o] + x <- x[o] + y <- y[o] } - - + + # Remove NAs if we need to if (na_rm) { - o = !(is.na(x) & is.na(y)) - x = x[o] - y = y[o] + o <- !(is.na(x) & is.na(y)) + x <- x[o] + y <- y[o] } # Useful indices for later - i0 = x %in% x0 + i0 <- x %in% x0 # Local methods - if (method == "rel_change" || method == "linear_reg") { - g = purrr::map_dbl(x, function(x_ref) { + if (method == "rel_change" || method == "linear_reg") { + g <- purrr::map_dbl(x, function(x_ref) { # Form the local window - ii = (x > x_ref - h) & (x <= x_ref + h) - xx = x[ii] - yy = y[ii] + ii <- (x > x_ref - h) & (x <= x_ref + h) + xx <- x[ii] + yy <- y[ii] # Convert to numerics - x_ref = as.numeric(x_ref) - xx = as.numeric(xx) - + x_ref <- as.numeric(x_ref) + xx <- as.numeric(xx) + # Relative change if (method == "rel_change") { - right = xx > x_ref - left = xx <= x_ref - b = mean(yy[right]) - a = mean(yy[left]) - hh = mean(xx[right]) - mean(xx[left]) - if (log_scale) return((b-a) / hh) - else return((b/a - 1) / hh) + right <- xx > x_ref + left <- xx <= x_ref + b <- mean(yy[right]) + a <- mean(yy[left]) + hh <- mean(xx[right]) - mean(xx[left]) + if (log_scale) { + return((b - a) / hh) + } else { + return((b / a - 1) / hh) + } } # Linear regression else { - xm = xx - mean(xx) - ym = yy - mean(yy) - b = sum(xm * ym) / sum(xm^2) - a = mean(yy - b * xx) - if (log_scale) return(b) - else return(b / (a + b * x_ref)) + xm <- xx - mean(xx) + ym <- yy - mean(yy) + b <- sum(xm * ym) / sum(xm^2) + a <- mean(yy - b * xx) + if (log_scale) { + return(b) + } else { + return(b / (a + b * x_ref)) + } } }) - + return(g[i0]) } - + # Global methods if (method == "smooth_spline" || method == "trend_filter") { # 
Convert to numerics - x = as.numeric(x) - x0 = as.numeric(x0) - + x <- as.numeric(x) + x0 <- as.numeric(x0) + # Collect parameters - params = list(...) + params <- list(...) # Smoothing spline if (method == "smooth_spline") { - params$x = x - params$y = y - obj = do.call(stats::smooth.spline, params) - f0 = stats::predict(obj, x = x0)$y - d0 = stats::predict(obj, x = x0, deriv = 1)$y - if (log_scale) return(d0) - else return(d0 / f0) + params$x <- x + params$y <- y + obj <- do.call(stats::smooth.spline, params) + f0 <- stats::predict(obj, x = x0)$y + d0 <- stats::predict(obj, x = x0, deriv = 1)$y + if (log_scale) { + return(d0) + } else { + return(d0 / f0) + } } # Trend filtering else { - ord = params$ord - maxsteps = params$maxsteps - cv = params$cv - df = params$df - k = params$k + ord <- params$ord + maxsteps <- params$maxsteps + cv <- params$cv + df <- params$df + k <- params$k # Default parameters - if (is.null(ord)) ord = 3 - if (is.null(maxsteps)) maxsteps = 1000 - if (is.null(cv)) cv = TRUE - if (is.null(df)) df = "min" - if (is.null(k)) k = 3 + if (is.null(ord)) ord <- 3 + if (is.null(maxsteps)) maxsteps <- 1000 + if (is.null(cv)) cv <- TRUE + if (is.null(df)) df <- "min" + if (is.null(k)) k <- 3 # Check cv and df combo - if (is.numeric(df)) cv = FALSE + if (is.numeric(df)) cv <- FALSE if (!cv && !(is.numeric(df) && df == round(df))) { Abort("If `cv = FALSE`, then `df` must be an integer.") } # Compute trend filtering path - obj = genlasso::trendfilter(y = y, pos = x, ord = ord, max = maxsteps) + obj <- genlasso::trendfilter(y = y, pos = x, ord = ord, max = maxsteps) # Use CV to find df, if we need to if (cv) { - cv_obj = quiet(genlasso::cv.trendfilter(obj, k = k, mode = "df")) - df = ifelse(df == "min", cv_obj$df.min, cv_obj$df.1se) + cv_obj <- quiet(genlasso::cv.trendfilter(obj, k = k, mode = "df")) + df <- ifelse(df == "min", cv_obj$df.min, cv_obj$df.1se) } # Estimate growth rate and return - f = genlasso::coef.genlasso(obj, df = df)$beta - d = ExtendR(diff(f) / diff(x)) - if (log_scale) return(d[i0]) - else return((d / f)[i0]) + f <- genlasso::coef.genlasso(obj, df = df)$beta + d <- ExtendR(diff(f) / diff(x)) + if (log_scale) { + return(d[i0]) + } else { + return((d / f)[i0]) + } } } } diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index 367fe759..45db2855 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -41,15 +41,19 @@ #' #' @examples #' # warning message of data latency shown -#' epix_as_of(x = archive_cases_dv_subset, -#' max_version = max(archive_cases_dv_subset$DT$version)) -#' +#' epix_as_of( +#' x = archive_cases_dv_subset, +#' max_version = max(archive_cases_dv_subset$DT$version) +#' ) +#' #' @examples #' #' range(archive_cases_dv_subset$DT$version) # 2020-06-02 -- 2021-12-01 #' -#' epix_as_of(x = archive_cases_dv_subset, -#' max_version = as.Date("2020-06-12")) +#' epix_as_of( +#' x = archive_cases_dv_subset, +#' max_version = as.Date("2020-06-12") +#' ) #' #' # When fetching a snapshot as of the latest version with update data in the #' # archive, a warning is issued by default, as this update data might not yet @@ -59,15 +63,20 @@ #' # based on database queries, the latest available update might still be #' # subject to change, but previous versions should be finalized). 
We can #' # muffle such warnings with the following pattern: -#' withCallingHandlers({ -#' epix_as_of(x = archive_cases_dv_subset, -#' max_version = max(archive_cases_dv_subset$DT$version)) -#' }, epiprocess__snapshot_as_of_clobberable_version = function(wrn) invokeRestart("muffleWarning")) +#' withCallingHandlers( +#' { +#' epix_as_of( +#' x = archive_cases_dv_subset, +#' max_version = max(archive_cases_dv_subset$DT$version) +#' ) +#' }, +#' epiprocess__snapshot_as_of_clobberable_version = function(wrn) invokeRestart("muffleWarning") +#' ) #' # Since R 4.0, there is a `globalCallingHandlers` function that can be used #' # to globally toggle these warnings. #' #' @export -epix_as_of = function(x, max_version, min_time_value = -Inf, all_versions = FALSE) { +epix_as_of <- function(x, max_version, min_time_value = -Inf, all_versions = FALSE) { if (!inherits(x, "epi_archive")) Abort("`x` must be of class `epi_archive`.") return(x$as_of(max_version, min_time_value, all_versions = all_versions)) } @@ -102,12 +111,12 @@ epix_as_of = function(x, max_version, min_time_value = -Inf, all_versions = FALS #' (LOCF), by leaving the update `$DT` alone (other `epi_archive` methods are #' based on LOCF). Default is `"na"`. #' @return An `epi_archive` -epix_fill_through_version = function(x, fill_versions_end, - how=c("na", "locf")) { +epix_fill_through_version <- function(x, fill_versions_end, + how = c("na", "locf")) { if (!inherits(x, "epi_archive")) Abort("`x` must be of class `epi_archive`.") # Enclosing parentheses drop the invisibility flag. See description above of # potential mutation and aliasing behavior. - ( x$clone()$fill_through_version(fill_versions_end, how=how) ) + (x$clone()$fill_through_version(fill_versions_end, how = how)) } #' Merge two `epi_archive` objects @@ -155,21 +164,21 @@ epix_fill_through_version = function(x, fill_versions_end, #' @examples #' # create two example epi_archive datasets #' x <- archive_cases_dv_subset$DT %>% -#' dplyr::select(geo_value,time_value,version,case_rate_7d_av) %>% -#' as_epi_archive(compactify=TRUE) +#' dplyr::select(geo_value, time_value, version, case_rate_7d_av) %>% +#' as_epi_archive(compactify = TRUE) #' y <- archive_cases_dv_subset$DT %>% -#' dplyr::select(geo_value,time_value,version,percent_cli) %>% -#' as_epi_archive(compactify=TRUE) +#' dplyr::select(geo_value, time_value, version, percent_cli) %>% +#' as_epi_archive(compactify = TRUE) #' # merge results stored in a third object: -#' xy = epix_merge(x, y) +#' xy <- epix_merge(x, y) #' # vs. 
mutating x to hold the merge result: #' x$merge(y) #' #' @importFrom data.table key set setkeyv #' @export -epix_merge = function(x, y, - sync = c("forbid","na","locf","truncate"), - compactify = TRUE) { +epix_merge <- function(x, y, + sync = c("forbid", "na", "locf", "truncate"), + compactify = TRUE) { if (!inherits(x, "epi_archive")) { Abort("`x` must be of class `epi_archive`.") } @@ -190,15 +199,17 @@ epix_merge = function(x, y, if (length(x$additional_metadata) != 0L) { Warn("x$additional_metadata won't appear in merge result", - class = "epiprocess__epix_merge_ignores_additional_metadata") + class = "epiprocess__epix_merge_ignores_additional_metadata" + ) } if (length(y$additional_metadata) != 0L) { Warn("y$additional_metadata won't appear in merge result", - class = "epiprocess__epix_merge_ignores_additional_metadata") + class = "epiprocess__epix_merge_ignores_additional_metadata" + ) } - result_additional_metadata = list() + result_additional_metadata <- list() - result_clobberable_versions_start = + result_clobberable_versions_start <- if (all(is.na(c(x$clobberable_versions_start, y$clobberable_versions_start)))) { NA # (any type of NA is fine here) } else { @@ -216,21 +227,23 @@ epix_merge = function(x, y, "`x$versions_end` was not identical to `y$versions_end`;", "either ensure that `x` and `y` are equally up to date before merging,", "or specify how to deal with this using `sync`" - ), class="epiprocess__epix_merge_unresolved_sync") + ), class = "epiprocess__epix_merge_unresolved_sync") } else { - new_versions_end = x$versions_end - x_DT = x$DT - y_DT = y$DT + new_versions_end <- x$versions_end + x_DT <- x$DT + y_DT <- y$DT } } else if (sync %in% c("na", "locf")) { - new_versions_end = max(x$versions_end, y$versions_end) - x_DT = epix_fill_through_version(x, new_versions_end, sync)$DT - y_DT = epix_fill_through_version(y, new_versions_end, sync)$DT + new_versions_end <- max(x$versions_end, y$versions_end) + x_DT <- epix_fill_through_version(x, new_versions_end, sync)$DT + y_DT <- epix_fill_through_version(y, new_versions_end, sync)$DT } else if (sync == "truncate") { - new_versions_end = min(x$versions_end, y$versions_end) - x_DT = x$DT[x[["DT"]][["version"]] <= new_versions_end, names(x$DT), with=FALSE] - y_DT = y$DT[y[["DT"]][["version"]] <= new_versions_end, names(y$DT), with=FALSE] - } else Abort("unimplemented") + new_versions_end <- min(x$versions_end, y$versions_end) + x_DT <- x$DT[x[["DT"]][["version"]] <= new_versions_end, names(x$DT), with = FALSE] + y_DT <- y$DT[y[["DT"]][["version"]] <= new_versions_end, names(y$DT), with = FALSE] + } else { + Abort("unimplemented") + } # key(x_DT) should be the same as key(x$DT) and key(y_DT) should be the same # as key(y$DT). Below, we only use {x,y}_DT in the code (making it easier to @@ -241,15 +254,15 @@ epix_merge = function(x, y, # have a bug in the preprocessing, a weird/invalid archive as input, and/or a # data.table version with different semantics (which may break other parts of # our code). - x_DT_key_as_expected = identical(key(x$DT), key(x_DT)) - y_DT_key_as_expected = identical(key(y$DT), key(y_DT)) + x_DT_key_as_expected <- identical(key(x$DT), key(x_DT)) + y_DT_key_as_expected <- identical(key(y$DT), key(y_DT)) if (!x_DT_key_as_expected || !y_DT_key_as_expected) { Warn(" `epiprocess` internal warning (please report): pre-processing for epix_merge unexpectedly resulted in an intermediate data table (or tables) with a different key than the corresponding input archive. 
Manually setting intermediate data table keys to the expected values. - ", internal=TRUE) + ", internal = TRUE) setkeyv(x_DT, key(x$DT)) setkeyv(y_DT, key(y$DT)) } @@ -267,94 +280,106 @@ epix_merge = function(x, y, but y does not), please retry after processing them to share the same key (e.g., by summarizing x to remove the age breakdown, or by applying a static age breakdown to y). - ", class="epiprocess__epix_merge_x_y_must_have_same_key_set") + ", class = "epiprocess__epix_merge_x_y_must_have_same_key_set") } # `by` cols = result (and each input's) `key` cols, and determine # the row set, determined using a full join via `merge` # # non-`by` cols = "value"-ish cols, and are looked up with last # version carried forward via rolling joins - by = key(x_DT) # = some perm of key(y_DT) - if (!all(c("geo_value","time_value","version") %in% key(x_DT))) { + by <- key(x_DT) # = some perm of key(y_DT) + if (!all(c("geo_value", "time_value", "version") %in% key(x_DT))) { Abort('Invalid `by`; `by` is currently set to the common `key` of the two archives, and is expected to contain "geo_value", "time_value", and "version".', - class="epiprocess__epi_archive_must_have_required_key_cols") + class = "epiprocess__epi_archive_must_have_required_key_cols" + ) } if (length(by) < 1L || utils::tail(by, 1L) != "version") { Abort('Invalid `by`; `by` is currently set to the common `key` of the two archives, and is expected to have a "version" as the last key col.', - class="epiprocess__epi_archive_must_have_version_at_end_of_key") + class = "epiprocess__epi_archive_must_have_version_at_end_of_key" + ) } - x_nonby_colnames = setdiff(names(x_DT), by) - y_nonby_colnames = setdiff(names(y_DT), by) + x_nonby_colnames <- setdiff(names(x_DT), by) + y_nonby_colnames <- setdiff(names(y_DT), by) if (length(intersect(x_nonby_colnames, y_nonby_colnames)) != 0L) { Abort(" `x` and `y` DTs have overlapping non-by column names; this is currently not supported; please manually fix up first: any overlapping columns that can are key-like should be incorporated into the key, and other columns should be renamed. - ", class="epiprocess__epix_merge_x_y_must_not_have_overlapping_nonby_colnames") + ", class = "epiprocess__epix_merge_x_y_must_not_have_overlapping_nonby_colnames") } - x_by_vals = x_DT[, by, with=FALSE] + x_by_vals <- x_DT[, by, with = FALSE] if (anyDuplicated(x_by_vals) != 0L) { Abort(" The `by` columns must uniquely determine rows of `x$DT`; the `by` is currently set to the common `key` of the two archives, so this can be resolved by adding key-like columns to `x`'s key (to get a unique key). - ", class="epiprocess__epix_merge_by_cols_must_act_as_unique_key") + ", class = "epiprocess__epix_merge_by_cols_must_act_as_unique_key") } - y_by_vals = y_DT[, by, with=FALSE] + y_by_vals <- y_DT[, by, with = FALSE] if (anyDuplicated(y_by_vals) != 0L) { Abort(" The `by` columns must uniquely determine rows of `y$DT`; the `by` is currently set to the common `key` of the two archives, so this can be resolved by adding key-like columns to `y`'s key (to get a unique key). - ", class="epiprocess__epix_merge_by_cols_must_act_as_unique_key") + ", class = "epiprocess__epix_merge_by_cols_must_act_as_unique_key") } - result_DT = merge(x_by_vals, y_by_vals, by=by, - # We must have `all=TRUE` or we may skip updates - # from x and/or y and corrupt the history - all=TRUE, - # We don't want Cartesian products, but the - # by-is-unique-key check above already ensures - # this. 
(Note that `allow.cartesian=FALSE` doesn't - # actually catch all Cartesian products anyway.) - # Disable superfluous check: - allow.cartesian=TRUE) - set(result_DT,, x_nonby_colnames, - x_DT[result_DT[, by, with=FALSE], x_nonby_colnames, with=FALSE, - # It's good practice to specify `on`, and we must - # explicitly specify `on` if there's a potential key vs. - # by order mismatch (not possible currently for x - # with by = key(x$DT), but possible for y): - on = by, - # last version carried forward: - roll=TRUE, - # requesting non-version key that doesn't exist in the other archive, - # or before its first version, should result in NA - nomatch=NA, - # see note on `allow.cartesian` above; currently have a - # similar story here. - allow.cartesian=TRUE]) - set(result_DT,, y_nonby_colnames, - y_DT[result_DT[, by, with=FALSE], y_nonby_colnames, with=FALSE, - on = by, - roll=TRUE, - nomatch=NA, - allow.cartesian=TRUE]) + result_DT <- merge(x_by_vals, y_by_vals, + by = by, + # We must have `all=TRUE` or we may skip updates + # from x and/or y and corrupt the history + all = TRUE, + # We don't want Cartesian products, but the + # by-is-unique-key check above already ensures + # this. (Note that `allow.cartesian=FALSE` doesn't + # actually catch all Cartesian products anyway.) + # Disable superfluous check: + allow.cartesian = TRUE + ) + set( + result_DT, , x_nonby_colnames, + x_DT[result_DT[, by, with = FALSE], x_nonby_colnames, + with = FALSE, + # It's good practice to specify `on`, and we must + # explicitly specify `on` if there's a potential key vs. + # by order mismatch (not possible currently for x + # with by = key(x$DT), but possible for y): + on = by, + # last version carried forward: + roll = TRUE, + # requesting non-version key that doesn't exist in the other archive, + # or before its first version, should result in NA + nomatch = NA, + # see note on `allow.cartesian` above; currently have a + # similar story here. + allow.cartesian = TRUE + ] + ) + set( + result_DT, , y_nonby_colnames, + y_DT[result_DT[, by, with = FALSE], y_nonby_colnames, + with = FALSE, + on = by, + roll = TRUE, + nomatch = NA, + allow.cartesian = TRUE + ] + ) # The key could be unset in case of a key vs. by order mismatch as # noted above. 
Ensure that we keep it: setkeyv(result_DT, by) - return (as_epi_archive( + return(as_epi_archive( result_DT[], # clear data.table internal invisibility flag if set geo_type = x$geo_type, time_type = x$time_type, - other_keys = setdiff(key(result_DT), c("geo_value","time_value","version")), + other_keys = setdiff(key(result_DT), c("geo_value", "time_value", "version")), additional_metadata = result_additional_metadata, # It'd probably be better to pre-compactify before the merge, and might be # guaranteed not to be necessary to compactify the merge result if the @@ -383,10 +408,11 @@ epix_merge = function(x, y, #' @return a `col_modify_recorder_df` #' #' @noRd -new_col_modify_recorder_df = function(parent_df) { +new_col_modify_recorder_df <- function(parent_df) { if (!inherits(parent_df, "data.frame")) { Abort('`parent_df` must inherit class `"data.frame"`', - internal=TRUE) + internal = TRUE + ) } `class<-`(parent_df, c("col_modify_recorder_df", class(parent_df))) } @@ -398,17 +424,20 @@ new_col_modify_recorder_df = function(parent_df) { #' input to [`dplyr::dplyr_col_modify`] that this class was designed to record #' #' @noRd -destructure_col_modify_recorder_df = function(col_modify_recorder_df) { +destructure_col_modify_recorder_df <- function(col_modify_recorder_df) { if (!inherits(col_modify_recorder_df, "col_modify_recorder_df")) { Abort('`col_modify_recorder_df` must inherit class `"col_modify_recorder_df"`', - internal=TRUE) + internal = TRUE + ) } list( unchanged_parent_df = col_modify_recorder_df %>% `attr<-`("epiprocess::col_modify_recorder_df::cols", NULL) %>% `class<-`(setdiff(class(.), "col_modify_recorder_df")), cols = attr(col_modify_recorder_df, - "epiprocess::col_modify_recorder_df::cols", exact=TRUE) + "epiprocess::col_modify_recorder_df::cols", + exact = TRUE + ) ) } @@ -420,10 +449,11 @@ destructure_col_modify_recorder_df = function(col_modify_recorder_df) { #' @importFrom dplyr dplyr_col_modify #' @export #' @noRd -dplyr_col_modify.col_modify_recorder_df = function(data, cols) { - if (!is.null(attr(data, "epiprocess::col_modify_recorder_df::cols", exact=TRUE))) { +dplyr_col_modify.col_modify_recorder_df <- function(data, cols) { + if (!is.null(attr(data, "epiprocess::col_modify_recorder_df::cols", exact = TRUE))) { Abort("`col_modify_recorder_df` can only record `cols` once", - internal=TRUE) + internal = TRUE + ) } attr(data, "epiprocess::col_modify_recorder_df::cols") <- cols data @@ -452,7 +482,7 @@ dplyr_col_modify.col_modify_recorder_df = function(data, cols) { #' Don't export this without cleaning up language of "mutate" as in side effects #' vs. "mutate" as in `dplyr::mutate`. #' @noRd -epix_detailed_restricted_mutate = function(.data, ...) { +epix_detailed_restricted_mutate <- function(.data, ...) { # We don't want to directly use `dplyr::mutate` on the `$DT`, as: # - this likely copies the entire table # - `mutate` behavior, including the output class, changes depending on @@ -466,12 +496,12 @@ epix_detailed_restricted_mutate = function(.data, ...) { # back to something that will use `dplyr`'s included `mutate` method(s), # then convert this using shallow operations into a `data.table`. # - Use `col_modify_recorder_df` to get the desired details. - in_tbl = tibble::as_tibble(as.list(.data$DT), .name_repair="minimal") - col_modify_cols = + in_tbl <- tibble::as_tibble(as.list(.data$DT), .name_repair = "minimal") + col_modify_cols <- destructure_col_modify_recorder_df( mutate(new_col_modify_recorder_df(in_tbl), ...) 
)[["cols"]] - invalidated_key_col_is = + invalidated_key_col_is <- which(purrr::map_lgl(key(.data$DT), function(key_colname) { key_colname %in% names(col_modify_cols) && !rlang::is_reference(in_tbl[[key_colname]], col_modify_cols[[key_colname]]) @@ -480,7 +510,8 @@ epix_detailed_restricted_mutate = function(.data, ...) { rlang::abort(paste_lines(c( "Key columns must not be replaced or removed.", wrap_varnames(key(.data$DT)[invalidated_key_col_is], - initial="Flagged key cols: ") + initial = "Flagged key cols: " + ) ))) } else { # Have `dplyr` do the `dplyr_col_modify`, keeping the column-level-aliasing @@ -493,12 +524,12 @@ epix_detailed_restricted_mutate = function(.data, ...) { # sorting (including potential extra copies) or sortedness checking, then # `setDT` (rather than `as.data.table`, in order to prevent column copying # to establish ownership according to `data.table`'s memory model). - out_DT = dplyr::dplyr_col_modify(in_tbl, col_modify_cols) %>% + out_DT <- dplyr::dplyr_col_modify(in_tbl, col_modify_cols) %>% data.table::setattr("sorted", data.table::key(.data$DT)) %>% - data.table::setDT(key=key(.data$DT)) - out_archive = .data$clone() + data.table::setDT(key = key(.data$DT)) + out_archive <- .data$clone() out_archive$DT <- out_DT - request_names = names(col_modify_cols) + request_names <- names(col_modify_cols) return(list( archive = out_archive, request_names = request_names @@ -577,7 +608,7 @@ epix_detailed_restricted_mutate = function(.data, ...) { #' #' @examples #' -#' grouped_archive = archive_cases_dv_subset %>% group_by(geo_value) +#' grouped_archive <- archive_cases_dv_subset %>% group_by(geo_value) #' #' # `print` for metadata and method listing: #' grouped_archive %>% print() @@ -586,10 +617,12 @@ epix_detailed_restricted_mutate = function(.data, ...) { #' #' archive_cases_dv_subset %>% #' group_by(geo_value) %>% -#' epix_slide(f = ~ mean(.x$case_rate_7d_av), -#' before = 2, -#' ref_time_values = as.Date("2020-06-11") + 0:2, -#' new_col_name = 'case_rate_3d_av') %>% +#' epix_slide( +#' f = ~ mean(.x$case_rate_7d_av), +#' before = 2, +#' ref_time_values = as.Date("2020-06-11") + 0:2, +#' new_col_name = "case_rate_3d_av" +#' ) %>% #' ungroup() #' #' # ----------------------------------------------------------------- @@ -597,34 +630,42 @@ epix_detailed_restricted_mutate = function(.data, ...) 
{ #' # Advanced: some other features of dplyr grouping are implemented: #' #' library(dplyr) -#' toy_archive = +#' toy_archive <- #' tribble( -#' ~geo_value, ~age_group, ~time_value, ~version, ~value, -#' "us", "adult", "2000-01-01", "2000-01-02", 121, -#' "us", "pediatric", "2000-01-02", "2000-01-03", 5, # (addition) -#' "us", "adult", "2000-01-01", "2000-01-03", 125, # (revision) -#' "us", "adult", "2000-01-02", "2000-01-03", 130 # (addition) +#' ~geo_value, ~age_group, ~time_value, ~version, ~value, +#' "us", "adult", "2000-01-01", "2000-01-02", 121, +#' "us", "pediatric", "2000-01-02", "2000-01-03", 5, # (addition) +#' "us", "adult", "2000-01-01", "2000-01-03", 125, # (revision) +#' "us", "adult", "2000-01-02", "2000-01-03", 130 # (addition) +#' ) %>% +#' mutate( +#' age_group = ordered(age_group, c("pediatric", "adult")), +#' time_value = as.Date(time_value), +#' version = as.Date(version) #' ) %>% -#' mutate(age_group = ordered(age_group, c("pediatric", "adult")), -#' time_value = as.Date(time_value), -#' version = as.Date(version)) %>% #' as_epi_archive(other_keys = "age_group") #' #' # The following are equivalent: #' toy_archive %>% group_by(geo_value, age_group) -#' toy_archive %>% group_by(geo_value) %>% group_by(age_group, .add=TRUE) -#' grouping_cols = c("geo_value", "age_group") +#' toy_archive %>% +#' group_by(geo_value) %>% +#' group_by(age_group, .add = TRUE) +#' grouping_cols <- c("geo_value", "age_group") #' toy_archive %>% group_by(across(all_of(grouping_cols))) #' #' # And these are equivalent: #' toy_archive %>% group_by(geo_value) -#' toy_archive %>% group_by(geo_value, age_group) %>% ungroup(age_group) +#' toy_archive %>% +#' group_by(geo_value, age_group) %>% +#' ungroup(age_group) #' #' # To get the grouping variable names as a `list` of `name`s (a.k.a. symbols): -#' toy_archive %>% group_by(geo_value) %>% groups() +#' toy_archive %>% +#' group_by(geo_value) %>% +#' groups() #' #' toy_archive %>% -#' group_by(geo_value, age_group, .drop=FALSE) %>% +#' group_by(geo_value, age_group, .drop = FALSE) %>% #' epix_slide(f = ~ sum(.x$value), before = 20) %>% #' ungroup() #' @@ -632,27 +673,30 @@ epix_detailed_restricted_mutate = function(.data, ...) { #' @export #' #' @aliases grouped_epi_archive -group_by.epi_archive = function(.data, ..., .add=FALSE, .drop=dplyr::group_by_drop_default(.data)) { +group_by.epi_archive <- function(.data, ..., .add = FALSE, .drop = dplyr::group_by_drop_default(.data)) { # `add` makes no difference; this is an ungrouped `epi_archive`. - detailed_mutate = epix_detailed_restricted_mutate(.data, ...) + detailed_mutate <- epix_detailed_restricted_mutate(.data, ...) if (!rlang::is_bool(.drop)) { Abort("`.drop` must be TRUE or FALSE") } if (!.drop) { - grouping_cols = as.list(detailed_mutate[["archive"]][["DT"]])[detailed_mutate[["request_names"]]] - grouping_col_is_factor = purrr::map_lgl(grouping_cols, is.factor) + grouping_cols <- as.list(detailed_mutate[["archive"]][["DT"]])[detailed_mutate[["request_names"]]] + grouping_col_is_factor <- purrr::map_lgl(grouping_cols, is.factor) # ^ Use `as.list` to try to avoid any possibility of a deep copy. 
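    # Two likely misuses of `.drop = FALSE` are flagged below: it only has
    # an effect when at least one grouping column is a factor, and
    # non-factor grouping columns listed after a factor column can yield
    # groups with `NA`s in those columns.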
if (!any(grouping_col_is_factor)) { Warn("`.drop=FALSE` but there are no factor grouping columns; did you mean to convert one of the columns to a factor beforehand?", - class = "epiprocess__group_by_epi_archive__drop_FALSE_no_factors") + class = "epiprocess__group_by_epi_archive__drop_FALSE_no_factors" + ) } else if (any(diff(grouping_col_is_factor) == -1L)) { Warn("`.drop=FALSE` but there are one or more non-factor grouping columns listed after a factor grouping column; this may produce groups with `NA`s for these columns; see https://github.com/tidyverse/dplyr/issues/5369#issuecomment-683762553; depending on how you want completion to work, you might instead want to convert all grouping columns to factors beforehand, specify the non-factor grouping columns first, or use `.drop=TRUE` and add a call to `tidyr::complete`.", - class = "epiprocess__group_by_epi_archive__drop_FALSE_nonfactor_after_factor") + class = "epiprocess__group_by_epi_archive__drop_FALSE_nonfactor_after_factor" + ) } } grouped_epi_archive$new(detailed_mutate[["archive"]], - detailed_mutate[["request_names"]], - drop = .drop) + detailed_mutate[["request_names"]], + drop = .drop + ) } #' Slide a function over variables in an `epi_archive` or `grouped_epi_archive` @@ -818,17 +862,20 @@ group_by.epi_archive = function(.data, ..., .add=FALSE, .drop=dplyr::group_by_dr #' #' # Reference time points for which we want to compute slide values: #' ref_time_values <- seq(as.Date("2020-06-01"), -#' as.Date("2020-06-15"), -#' by = "1 day") +#' as.Date("2020-06-15"), +#' by = "1 day" +#' ) #' #' # A simple (but not very useful) example (see the archive vignette for a more #' # realistic one): #' archive_cases_dv_subset %>% #' group_by(geo_value) %>% -#' epix_slide(f = ~ mean(.x$case_rate_7d_av), -#' before = 2, -#' ref_time_values = ref_time_values, -#' new_col_name = 'case_rate_7d_av_recent_av') %>% +#' epix_slide( +#' f = ~ mean(.x$case_rate_7d_av), +#' before = 2, +#' ref_time_values = ref_time_values, +#' new_col_name = "case_rate_7d_av_recent_av" +#' ) %>% #' ungroup() #' # We requested time windows that started 2 days before the corresponding time #' # values. The actual number of `time_value`s in each computation depends on @@ -846,23 +893,24 @@ group_by.epi_archive = function(.data, ..., .add=FALSE, .drop=dplyr::group_by_dr #' # Examining characteristics of the data passed to each computation with #' # `all_versions=FALSE`. 
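#' # (With `all_versions = FALSE`, each computation receives an `epi_df`
#' # snapshot of the data as it would have appeared as of the relevant
#' # `ref_time_value`; with `all_versions = TRUE`, as in the advanced
#' # example further below, it receives an `epi_archive` of the version
#' # history instead.)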
#' archive_cases_dv_subset %>% -#' group_by(geo_value) %>% -#' epix_slide( -#' function(x, gk, rtv) { -#' tibble( -#' time_range = if(nrow(x) == 0L) { -#' "0 `time_value`s" -#' } else { -#' sprintf("%s -- %s", min(x$time_value), max(x$time_value)) -#' }, -#' n = nrow(x), -#' class1 = class(x)[[1L]] -#' ) -#' }, -#' before = 5, all_versions = FALSE, -#' ref_time_values = ref_time_values, names_sep=NULL) %>% -#' ungroup() %>% -#' arrange(geo_value, time_value) +#' group_by(geo_value) %>% +#' epix_slide( +#' function(x, gk, rtv) { +#' tibble( +#' time_range = if (nrow(x) == 0L) { +#' "0 `time_value`s" +#' } else { +#' sprintf("%s -- %s", min(x$time_value), max(x$time_value)) +#' }, +#' n = nrow(x), +#' class1 = class(x)[[1L]] +#' ) +#' }, +#' before = 5, all_versions = FALSE, +#' ref_time_values = ref_time_values, names_sep = NULL +#' ) %>% +#' ungroup() %>% +#' arrange(geo_value, time_value) #' #' # --- Advanced: --- #' @@ -884,7 +932,7 @@ group_by.epi_archive = function(.data, ..., .add=FALSE, .drop=dplyr::group_by_dr #' toString(min(x$DT$version)) #' }, #' versions_end = x$versions_end, -#' time_range = if(nrow(x$DT) == 0L) { +#' time_range = if (nrow(x$DT) == 0L) { #' "0 `time_value`s" #' } else { #' sprintf("%s -- %s", min(x$DT$time_value), max(x$DT$time_value)) @@ -894,7 +942,8 @@ group_by.epi_archive = function(.data, ..., .add=FALSE, .drop=dplyr::group_by_dr #' ) #' }, #' before = 5, all_versions = TRUE, -#' ref_time_values = ref_time_values, names_sep=NULL) %>% +#' ref_time_values = ref_time_values, names_sep = NULL +#' ) %>% #' ungroup() %>% #' # Focus on one geo_value so we can better see the columns above: #' filter(geo_value == "ca") %>% @@ -902,30 +951,31 @@ group_by.epi_archive = function(.data, ..., .add=FALSE, .drop=dplyr::group_by_dr #' #' @importFrom rlang enquo !!! 
#' @export -epix_slide = function(x, f, ..., before, ref_time_values, - time_step, new_col_name = "slide_value", - as_list_col = FALSE, names_sep = "_", - all_versions = FALSE) { - if (!is_epi_archive(x, grouped_okay=TRUE)) { +epix_slide <- function(x, f, ..., before, ref_time_values, + time_step, new_col_name = "slide_value", + as_list_col = FALSE, names_sep = "_", + all_versions = FALSE) { + if (!is_epi_archive(x, grouped_okay = TRUE)) { Abort("`x` must be of class `epi_archive` or `grouped_epi_archive`.") } - return(x$slide(f, ..., before = before, - ref_time_values = ref_time_values, - time_step = time_step, - new_col_name = new_col_name, - as_list_col = as_list_col, - names_sep = names_sep, - all_versions = all_versions - )) + return(x$slide(f, ..., + before = before, + ref_time_values = ref_time_values, + time_step = time_step, + new_col_name = new_col_name, + as_list_col = as_list_col, + names_sep = names_sep, + all_versions = all_versions + )) } #' Default value for `ref_time_values` in an `epix_slide` #' #' @noRd -epix_slide_ref_time_values_default = function(ea) { - versions_with_updates = c(ea$DT$version, ea$versions_end) - ref_time_values = tidyr::full_seq(versions_with_updates, guess_period(versions_with_updates)) - return (ref_time_values) +epix_slide_ref_time_values_default <- function(ea) { + versions_with_updates <- c(ea$DT$version, ea$versions_end) + ref_time_values <- tidyr::full_seq(versions_with_updates, guess_period(versions_with_updates)) + return(ref_time_values) } #' Filter an `epi_archive` object to keep only older versions @@ -941,12 +991,12 @@ epix_slide_ref_time_values_default = function(ea) { #' @return An `epi_archive` object #' #' @export -epix_truncate_versions_after = function(x, max_version) { +epix_truncate_versions_after <- function(x, max_version) { UseMethod("epix_truncate_versions_after") } #' @export -epix_truncate_versions_after.epi_archive = function(x, max_version) { - return ((x$clone()$truncate_versions_after(max_version))) +epix_truncate_versions_after.epi_archive <- function(x, max_version) { + return((x$clone()$truncate_versions_after(max_version))) # ^ second set of parens drops invisibility } diff --git a/R/methods-epi_df.R b/R/methods-epi_df.R index 6e4666e7..7e002320 100644 --- a/R/methods-epi_df.R +++ b/R/methods-epi_df.R @@ -8,7 +8,7 @@ #' #' @importFrom tibble as_tibble #' @export -as_tibble.epi_df = function(x, ...) { +as_tibble.epi_df <- function(x, ...) { # Decaying drops the class and metadata. `as_tibble.grouped_df` drops the # grouping and should be called by `NextMethod()` in the current design. # See #223 for discussion of alternatives. @@ -16,22 +16,23 @@ as_tibble.epi_df = function(x, ...) { } #' Convert to tsibble format -#' -#' Converts an `epi_df` object into a tsibble, where the index is taken to be +#' +#' Converts an `epi_df` object into a tsibble, where the index is taken to be #' `time_value`, and the key variables taken to be `geo_value` along with any -#' others in the `other_keys` field of the metadata, or else explicitly set. +#' others in the `other_keys` field of the metadata, or else explicitly set. #' #' @method as_tsibble epi_df #' @param x The `epi_df` object. -#' @param key Optional. Any additional keys (other than `geo_value`) to add to +#' @param key Optional. Any additional keys (other than `geo_value`) to add to #' the `tsibble`. #' @param ... additional arguments passed on to `tsibble::as_tsibble()` #' @export -as_tsibble.epi_df = function(x, key, ...) 
{
-  if (missing(key)) key = c("geo_value", attributes(x)$metadata$other_keys)
+as_tsibble.epi_df <- function(x, key, ...) {
+  if (missing(key)) key <- c("geo_value", attributes(x)$metadata$other_keys)
   return(as_tsibble(tibble::as_tibble(x),
-                    key = tidyselect::all_of(key), index = "time_value",
-                    ...))
+    key = tidyselect::all_of(key), index = "time_value",
+    ...
+  ))
 }
 
 #' Base S3 methods for an `epi_df` object
@@ -43,9 +44,11 @@
 #'
 #' @method print epi_df
 #' @export
-print.epi_df = function(x, ...) {
-  cat("An `epi_df` object,", prettyNum(nrow(x),","), "x",
-      prettyNum(ncol(x),","), "with metadata:\n")
+print.epi_df <- function(x, ...) {
+  cat(
+    "An `epi_df` object,", prettyNum(nrow(x), ","), "x",
+    prettyNum(ncol(x), ","), "with metadata:\n"
+  )
   cat(sprintf("* %-9s = %s\n", "geo_type", attributes(x)$metadata$geo_type))
   cat(sprintf("* %-9s = %s\n", "time_type", attributes(x)$metadata$time_type))
   cat(sprintf("* %-9s = %s\n", "as_of", attributes(x)$metadata$as_of))
@@ -67,7 +70,7 @@
 #' @importFrom rlang .data
 #' @importFrom stats median
 #' @export
-summary.epi_df = function(object, ...) {
+summary.epi_df <- function(object, ...) {
   cat("An `epi_df` object, with metadata:\n")
   cat(sprintf("* %-9s = %s\n", "geo_type", attributes(object)$metadata$geo_type))
   cat(sprintf("* %-9s = %s\n", "time_type", attributes(object)$metadata$time_type))
@@ -75,10 +78,12 @@
   cat("----------\n")
   cat(sprintf("* %-27s = %s\n", "min time value", min(object$time_value)))
   cat(sprintf("* %-27s = %s\n", "max time value", max(object$time_value)))
-  cat(sprintf("* %-27s = %i\n", "average rows per time value",
-              as.integer(object %>% dplyr::group_by(.data$time_value) %>%
-                           dplyr::summarize(num = dplyr::n()) %>%
-                           dplyr::summarize(mean(.data$num)))))
+  cat(sprintf(
+    "* %-27s = %i\n", "average rows per time value",
+    as.integer(object %>% dplyr::group_by(.data$time_value) %>%
+      dplyr::summarize(num = dplyr::n()) %>%
+      dplyr::summarize(mean(.data$num)))
+  ))
 }
 
 #' Drop any `epi_df` metadata and class on a data frame
@@ -93,7 +98,7 @@
 #'   present, dropped
 #'
 #' @noRd
-decay_epi_df = function(x) {
+decay_epi_df <- function(x) {
   attributes(x)$metadata <- NULL
   class(x) <- class(x)[class(x) != "epi_df"]
   x
 }
@@ -117,23 +122,26 @@
 #' @importFrom dplyr dplyr_reconstruct
 #' @export
 #' @noRd
-dplyr_reconstruct.epi_df = function(data, template) {
+dplyr_reconstruct.epi_df <- function(data, template) {
   # Start from a reconstruction for the backing S3 classes; this ensures that we
   # keep any grouping that has been applied:
   res <- NextMethod()
-  
+
   cn <- names(res)
 
   # Abort on duplicated column names
-  dup_col_names = cn[duplicated(cn)]
+  dup_col_names <- cn[duplicated(cn)]
   if (length(dup_col_names) != 0) {
-    Abort(paste0("Column name(s) ",
-                 paste(unique(dup_col_names),
-                       collapse = ", "), " must not be duplicated."))
+    Abort(paste0(
+      "Column name(s) ",
+      paste(unique(dup_col_names),
+        collapse = ", "
+      ), " must not be duplicated."
+    ))
   }
-  
+
   not_epi_df <- !("time_value" %in% cn) || !("geo_value" %in% cn)
-  
+
   if (not_epi_df) {
     # If we're calling on an `epi_df` from one of our own functions, we need to
    # decay to a non-`epi_df` result. If `dplyr` is calling, `x` is a tibble,
    # `res` is not an `epi_df` yet (but might, e.g., be a `grouped_df`), and we
    # should work in both cases.
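    # ("Decay" here means stripping the `epi_df` class and metadata
    # attribute via `decay_epi_df()` above; the underlying tibble or
    # grouped_df result is returned unchanged.)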
return(decay_epi_df(res)) } - + res <- reclass(res, attr(template, "metadata")) # XXX we may want verify the `geo_type` and `time_type` here. If it's # significant overhead, we may also want to keep this less strict version # around and implement some extra S3 methods that use it, when appropriate. - + # Amend additional metadata if some other_keys cols are dropped in the subset - old_other_keys = attr(template, "metadata")$other_keys + old_other_keys <- attr(template, "metadata")$other_keys attr(res, "metadata")$other_keys <- old_other_keys[old_other_keys %in% cn] - + res } #' @export `[.epi_df` <- function(x, i, j, drop = FALSE) { res <- NextMethod() - - if (!is.data.frame(res)) return(res) - + + if (!is.data.frame(res)) { + return(res) + } + dplyr::dplyr_reconstruct(res, x) } #' @importFrom dplyr dplyr_col_modify #' @export -dplyr_col_modify.epi_df = function(data, cols) { +dplyr_col_modify.epi_df <- function(data, cols) { dplyr::dplyr_reconstruct(NextMethod(), data) } #' @importFrom dplyr dplyr_row_slice #' @export -dplyr_row_slice.epi_df = function(data, i, ...) { +dplyr_row_slice.epi_df <- function(data, i, ...) { dplyr::dplyr_reconstruct(NextMethod(), data) } #' @export -`names<-.epi_df` = function(x, value) { +`names<-.epi_df` <- function(x, value) { old_names <- names(x) old_metadata <- attr(x, "metadata") old_other_keys <- old_metadata[["other_keys"]] @@ -193,18 +203,18 @@ dplyr_row_slice.epi_df = function(data, i, ...) { #' @method group_by epi_df #' @rdname print.epi_df #' @export -group_by.epi_df = function(.data, ...) { - metadata = attributes(.data)$metadata - .data = NextMethod() +group_by.epi_df <- function(.data, ...) { + metadata <- attributes(.data)$metadata + .data <- NextMethod() reclass(.data, metadata) } #' @method ungroup epi_df #' @rdname print.epi_df #' @export -ungroup.epi_df = function(x, ...) { - metadata = attributes(x)$metadata - x = NextMethod() +ungroup.epi_df <- function(x, ...) { + metadata <- attributes(x)$metadata + x <- NextMethod() reclass(x, metadata) } @@ -214,7 +224,7 @@ ungroup.epi_df = function(x, ...) { #' @param .f function or formula; see [`dplyr::group_modify`] #' @param .keep Boolean; see [`dplyr::group_modify`] #' @export -group_modify.epi_df = function(.data, .f, ..., .keep = FALSE) { +group_modify.epi_df <- function(.data, .f, ..., .keep = FALSE) { dplyr::dplyr_reconstruct(NextMethod(), .data) } @@ -222,13 +232,13 @@ group_modify.epi_df = function(.data, .f, ..., .keep = FALSE) { #' @rdname print.epi_df #' @param data The `epi_df` object. #' @export -unnest.epi_df = function(data, ...) { +unnest.epi_df <- function(data, ...) 
{ dplyr::dplyr_reconstruct(NextMethod(), data) } # Simple reclass function -reclass = function(x, metadata) { - class(x) = unique(c("epi_df", class(x))) - attributes(x)$metadata = metadata +reclass <- function(x, metadata) { + class(x) <- unique(c("epi_df", class(x))) + attributes(x)$metadata <- metadata return(x) } diff --git a/R/outliers.R b/R/outliers.R index e5fd8765..1eb3ea01 100644 --- a/R/outliers.R +++ b/R/outliers.R @@ -46,38 +46,54 @@ #' @export #' @importFrom dplyr select #' @examples -#' detection_methods = dplyr::bind_rows( -#' dplyr::tibble(method = "rm", -#' args = list(list(detect_negatives = TRUE, -#' detection_multiplier = 2.5)), -#' abbr = "rm"), -#' dplyr::tibble(method = "stl", -#' args = list(list(detect_negatives = TRUE, -#' detection_multiplier = 2.5, -#' seasonal_period = 7)), -#' abbr = "stl_seasonal"), -#' dplyr::tibble(method = "stl", -#' args = list(list(detect_negatives = TRUE, -#' detection_multiplier = 2.5, -#' seasonal_period = NULL)), -#' abbr = "stl_nonseasonal")) +#' detection_methods <- dplyr::bind_rows( +#' dplyr::tibble( +#' method = "rm", +#' args = list(list( +#' detect_negatives = TRUE, +#' detection_multiplier = 2.5 +#' )), +#' abbr = "rm" +#' ), +#' dplyr::tibble( +#' method = "stl", +#' args = list(list( +#' detect_negatives = TRUE, +#' detection_multiplier = 2.5, +#' seasonal_period = 7 +#' )), +#' abbr = "stl_seasonal" +#' ), +#' dplyr::tibble( +#' method = "stl", +#' args = list(list( +#' detect_negatives = TRUE, +#' detection_multiplier = 2.5, +#' seasonal_period = NULL +#' )), +#' abbr = "stl_nonseasonal" +#' ) +#' ) #' -#' x <- incidence_num_outlier_example %>% -#' dplyr::select(geo_value,time_value,cases) %>% -#' as_epi_df() %>% -#' group_by(geo_value) %>% -#' mutate(outlier_info = detect_outlr( -#' x = time_value, y = cases, -#' methods = detection_methods, -#' combiner = "median")) %>% -#' unnest(outlier_info) -detect_outlr = function(x = seq_along(y), y, - methods = tibble::tibble(method = "rm", - args = list(list()), - abbr = "rm"), - combiner = c("median", "mean", "none")) { +#' x <- incidence_num_outlier_example %>% +#' dplyr::select(geo_value, time_value, cases) %>% +#' as_epi_df() %>% +#' group_by(geo_value) %>% +#' mutate(outlier_info = detect_outlr( +#' x = time_value, y = cases, +#' methods = detection_methods, +#' combiner = "median" +#' )) %>% +#' unnest(outlier_info) +detect_outlr <- function(x = seq_along(y), y, + methods = tibble::tibble( + method = "rm", + args = list(list()), + abbr = "rm" + ), + combiner = c("median", "mean", "none")) { # Validate combiner - combiner = match.arg(combiner) + combiner <- match.arg(combiner) # Validate that x contains all distinct values if (any(duplicated(x))) { @@ -85,32 +101,33 @@ detect_outlr = function(x = seq_along(y), y, } # Run all outlier detection methods - results = purrr::pmap_dfc(methods, function(method, args, abbr) { - if (is.character(method)) method = paste0("detect_outlr_", method) + results <- purrr::pmap_dfc(methods, function(method, args, abbr) { + if (is.character(method)) method <- paste0("detect_outlr_", method) # Call the method - results = do.call(method, args = c(list("x" = x, "y" = y), args)) + results <- do.call(method, args = c(list("x" = x, "y" = y), args)) - # Validate the output + # Validate the output if (!is.data.frame(results) || - !all(c("lower", "upper", "replacement") %in% colnames(results))) { + !all(c("lower", "upper", "replacement") %in% colnames(results))) { Abort("Outlier detection method must return a data frame with columns `lower`, `upper`, 
and `replacement`.") } # Update column names with model abbreviation - colnames(results) = paste(abbr, colnames(results), sep = "_") + colnames(results) <- paste(abbr, colnames(results), sep = "_") return(results) }) # Combine information about detected outliers if (combiner != "none") { - if (combiner == "mean") combine_fun = mean - else if (combiner == "median") combine_fun = median + if (combiner == "mean") { + combine_fun <- mean + } else if (combiner == "median") combine_fun <- median for (target in c("lower", "upper", "replacement")) { - results[[paste0("combined_", target)]] = apply( + results[[paste0("combined_", target)]] <- apply( results %>% - dplyr::select(dplyr::ends_with(target)), 1, combine_fun + dplyr::select(dplyr::ends_with(target)), 1, combine_fun ) } } @@ -154,48 +171,54 @@ detect_outlr = function(x = seq_along(y), y, #' @examples #' # Detect outliers based on a rolling median #' incidence_num_outlier_example %>% -#' dplyr::select(geo_value,time_value,cases) %>% +#' dplyr::select(geo_value, time_value, cases) %>% #' as_epi_df() %>% #' group_by(geo_value) %>% -#' mutate(outlier_info = detect_outlr_rm( -#' x = time_value, y = cases)) %>% +#' mutate(outlier_info = detect_outlr_rm( +#' x = time_value, y = cases +#' )) %>% #' unnest(outlier_info) -detect_outlr_rm = function(x = seq_along(y), y, n = 21, - log_transform = FALSE, - detect_negatives = FALSE, - detection_multiplier = 2, - min_radius = 0, - replacement_multiplier = 0) { +detect_outlr_rm <- function(x = seq_along(y), y, n = 21, + log_transform = FALSE, + detect_negatives = FALSE, + detection_multiplier = 2, + min_radius = 0, + replacement_multiplier = 0) { # Transform if requested if (log_transform) { # Replace all negative values with 0 - y = pmax(0, y) - offset = as.integer(any(y == 0)) - y = log(y + offset) + y <- pmax(0, y) + offset <- as.integer(any(y == 0)) + y <- log(y + offset) } # Detect negatives if requested - if (detect_negatives && !log_transform) min_lower = 0 - else min_lower = -Inf + if (detect_negatives && !log_transform) { + min_lower <- 0 + } else { + min_lower <- -Inf + } # Make an epi_df for easy sliding - z = as_epi_df(tibble::tibble(geo_value = 0, time_value = x, y = y)) + z <- as_epi_df(tibble::tibble(geo_value = 0, time_value = x, y = y)) # Calculate lower and upper thresholds and replacement value - z = z %>% - epi_slide(fitted = median(y), before = floor((n-1)/2), after = ceiling((n-1)/2)) %>% + z <- z %>% + epi_slide(fitted = median(y), before = floor((n - 1) / 2), after = ceiling((n - 1) / 2)) %>% dplyr::mutate(resid = y - fitted) %>% - roll_iqr(n = n, - detection_multiplier = detection_multiplier, - min_radius = min_radius, - replacement_multiplier = replacement_multiplier, - min_lower = min_lower) + roll_iqr( + n = n, + detection_multiplier = detection_multiplier, + min_radius = min_radius, + replacement_multiplier = replacement_multiplier, + min_lower = min_lower + ) # Undo log transformation if necessary if (log_transform) { - z$lower = exp(z$lower) - offset - z$upper = exp(z$upper) - offset - z$replacement = exp(z$replacement) - offset + z$lower <- exp(z$lower) - offset + z$upper <- exp(z$upper) - offset + z$replacement <- exp(z$replacement) - offset } return(z) @@ -251,100 +274,116 @@ detect_outlr_rm = function(x = seq_along(y), y, n = 21, #' @examples #' # Detects outliers based on a seasonal-trend decomposition using LOESS #' incidence_num_outlier_example %>% -#' dplyr::select(geo_value,time_value,cases) %>% +#' dplyr::select(geo_value, time_value, cases) %>% #' as_epi_df() %>% 
#' group_by(geo_value) %>% -#' mutate(outlier_info = detect_outlr_stl( +#' mutate(outlier_info = detect_outlr_stl( #' x = time_value, y = cases, -#' seasonal_period = 7 )) %>% # weekly seasonality for daily data +#' seasonal_period = 7 +#' )) %>% # weekly seasonality for daily data #' unnest(outlier_info) -detect_outlr_stl = function(x = seq_along(y), y, - n_trend = 21, - n_seasonal = 21, - n_threshold = 21, - seasonal_period = NULL, - log_transform = FALSE, - detect_negatives = FALSE, - detection_multiplier = 2, - min_radius = 0, - replacement_multiplier = 0) { +detect_outlr_stl <- function(x = seq_along(y), y, + n_trend = 21, + n_seasonal = 21, + n_threshold = 21, + seasonal_period = NULL, + log_transform = FALSE, + detect_negatives = FALSE, + detection_multiplier = 2, + min_radius = 0, + replacement_multiplier = 0) { # Transform if requested if (log_transform) { # Replace all negative values with 0 - y = pmax(0, y) - offset = as.integer(any(y == 0)) - y = log(y + offset) + y <- pmax(0, y) + offset <- as.integer(any(y == 0)) + y <- log(y + offset) } # Make a tsibble for fabletools, setup and run STL - z_tsibble = tsibble::tsibble(x = x, y = y, index = x) + z_tsibble <- tsibble::tsibble(x = x, y = y, index = x) - stl_formula = y ~ trend(window = n_trend) + + stl_formula <- y ~ trend(window = n_trend) + season(period = seasonal_period, window = n_seasonal) - stl_components = z_tsibble %>% + stl_components <- z_tsibble %>% fabletools::model(feasts::STL(stl_formula, robust = TRUE)) %>% generics::components() %>% tibble::as_tibble() %>% dplyr::select(trend:remainder) %>% - dplyr::rename_with(~ "seasonal", tidyselect::starts_with("season")) %>% + dplyr::rename_with(~"seasonal", tidyselect::starts_with("season")) %>% dplyr::rename(resid = remainder) # Allocate the seasonal term from STL to either fitted or resid if (!is.null(seasonal_period)) { - stl_components = stl_components %>% + stl_components <- stl_components %>% dplyr::mutate( - fitted = trend + seasonal) + fitted = trend + seasonal + ) } else { - stl_components = stl_components %>% + stl_components <- stl_components %>% dplyr::mutate( fitted = trend, - resid = seasonal + resid) + resid = seasonal + resid + ) } # Detect negatives if requested - if (detect_negatives && !log_transform) min_lower = 0 - else min_lower = -Inf + if (detect_negatives && !log_transform) { + min_lower <- 0 + } else { + min_lower <- -Inf + } # Make an epi_df for easy sliding - z = as_epi_df(tibble::tibble(geo_value = 0, time_value = x, y = y)) + z <- as_epi_df(tibble::tibble(geo_value = 0, time_value = x, y = y)) # Calculate lower and upper thresholds and replacement value - z = z %>% + z <- z %>% dplyr::mutate( fitted = stl_components$fitted, - resid = stl_components$resid) %>% - roll_iqr(n = n_threshold, - detection_multiplier = detection_multiplier, - min_radius = min_radius, - replacement_multiplier = replacement_multiplier, - min_lower = min_lower) + resid = stl_components$resid + ) %>% + roll_iqr( + n = n_threshold, + detection_multiplier = detection_multiplier, + min_radius = min_radius, + replacement_multiplier = replacement_multiplier, + min_lower = min_lower + ) # Undo log transformation if necessary if (log_transform) { - z$lower = exp(z$lower) - offset - z$upper = exp(z$upper) - offset - z$replacement = exp(z$replacement) - offset + z$lower <- exp(z$lower) - offset + z$upper <- exp(z$upper) - offset + z$replacement <- exp(z$replacement) - offset } return(z) } # Common function for rolling IQR, using fitted and resid variables -roll_iqr = function(z, 
n, detection_multiplier, min_radius,
-                    replacement_multiplier, min_lower) {
-  if (typeof(z$y) == "integer") as_type = as.integer
-  else as_type = as.numeric
+roll_iqr <- function(z, n, detection_multiplier, min_radius,
+                     replacement_multiplier, min_lower) {
+  if (typeof(z$y) == "integer") {
+    as_type <- as.integer
+  } else {
+    as_type <- as.numeric
+  }
 
-  epi_slide(z, roll_iqr = stats::IQR(resid), before = floor((n-1)/2), after = ceiling((n-1)/2)) %>%
+  epi_slide(z, roll_iqr = stats::IQR(resid), before = floor((n - 1) / 2), after = ceiling((n - 1) / 2)) %>%
     dplyr::mutate(
-      lower = pmax(min_lower,
-                   fitted - pmax(min_radius, detection_multiplier * roll_iqr)),
+      lower = pmax(
+        min_lower,
+        fitted - pmax(min_radius, detection_multiplier * roll_iqr)
+      ),
       upper = fitted + pmax(min_radius, detection_multiplier * roll_iqr),
       replacement = dplyr::case_when(
-        (y < lower) ~ as_type(fitted - replacement_multiplier * roll_iqr),
-        (y > upper) ~ as_type(fitted + replacement_multiplier * roll_iqr),
-        TRUE ~ y)) %>%
+        (y < lower) ~ as_type(fitted - replacement_multiplier * roll_iqr),
+        (y > upper) ~ as_type(fitted + replacement_multiplier * roll_iqr),
+        TRUE ~ y
+      )
+    ) %>%
     dplyr::select(lower, upper, replacement) %>%
     tibble::as_tibble()
 }
diff --git a/R/slide.R b/R/slide.R
index 0feb689a..2a10efce 100644
--- a/R/slide.R
+++ b/R/slide.R
@@ -76,7 +76,7 @@
 #'   `NA` marker.
 #' @return An `epi_df` object given by appending a new column to `x`, named
 #'   according to the `new_col_name` argument.
-#'
+#'
 #' @details To "slide" means to apply a function or formula over a rolling
 #'   window of time steps for each data group, where the window is centered at a
 #'   reference time and left and right endpoints are given by the `before` and
@@ -117,21 +117,21 @@
 #'           new_col_name = "cases_7dav")
 #' ```
 #' Thus, to be clear, when the computation is specified via an expression for
-#' tidy evaluation (first example, above), then the name for the new column is
-#' inferred from the given expression and overrides any name passed explicitly
+#' tidy evaluation (first example, above), then the name for the new column is
+#' inferred from the given expression and overrides any name passed explicitly
 #' through the `new_col_name` argument.
-#'
+#'
 #' @importFrom lubridate days weeks
 #' @importFrom dplyr bind_rows group_vars filter select
 #' @importFrom rlang .data .env !! enquo enquos sym env missing_arg
 #' @export
-#' @examples
+#' @examples
 #' # slide a 7-day trailing average formula on cases
 #' jhu_csse_daily_subset %>%
 #'   group_by(geo_value) %>%
-#'   epi_slide(cases_7dav = mean(cases), before = 6) %>%
+#'   epi_slide(cases_7dav = mean(cases), before = 6) %>%
 #'   # rmv a nonessential var. to ensure new col is printed
-#'   dplyr::select(-death_rate_7d_av)
+#'   dplyr::select(-death_rate_7d_av)
 #'
 #' # slide a 7-day leading average
 #' jhu_csse_daily_subset %>%
 #'   group_by(geo_value) %>%
 #'   epi_slide(cases_7dav = mean(cases), after = 6) %>%
 #'   # rmv a nonessential var. to ensure new col is printed
 #'   dplyr::select(-death_rate_7d_av)
 #'
@@ -143,35 +143,38 @@
 #' # slide a 7-day centre-aligned average
 #' jhu_csse_daily_subset %>%
 #'   group_by(geo_value) %>%
-#'   epi_slide(cases_7dav = mean(cases), before = 3, after = 3) %>%
+#'   epi_slide(cases_7dav = mean(cases), before = 3, after = 3) %>%
 #'   # rmv a nonessential var. to ensure new col is printed
-#'   dplyr::select(-death_rate_7d_av)
+#'   dplyr::select(-death_rate_7d_av)
 #'
 #' # slide a 14-day centre-aligned average
 #' jhu_csse_daily_subset %>%
 #'   group_by(geo_value) %>%
-#'   epi_slide(cases_7dav = mean(cases), before = 6, after = 7) %>%
+#'   epi_slide(cases_7dav = mean(cases), before = 6, after = 7) %>%
 #'   # rmv a nonessential var.
to ensure new col is printed -#' dplyr::select(-death_rate_7d_av) +#' dplyr::select(-death_rate_7d_av) #' #' # nested new columns #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide(a = data.frame(cases_2dav = mean(cases), -#' cases_2dma = mad(cases)), -#' before = 1, as_list_col = TRUE) -epi_slide = function(x, f, ..., before, after, ref_time_values, - time_step, - new_col_name = "slide_value", as_list_col = FALSE, - names_sep = "_", all_rows = FALSE) { - +#' epi_slide( +#' a = data.frame( +#' cases_2dav = mean(cases), +#' cases_2dma = mad(cases) +#' ), +#' before = 1, as_list_col = TRUE +#' ) +epi_slide <- function(x, f, ..., before, after, ref_time_values, + time_step, + new_col_name = "slide_value", as_list_col = FALSE, + names_sep = "_", all_rows = FALSE) { # Check we have an `epi_df` object if (!inherits(x, "epi_df")) Abort("`x` must be of class `epi_df`.") - + if (missing(ref_time_values)) { - ref_time_values = unique(x$time_value) + ref_time_values <- unique(x$time_value) } - + # Some of these `ref_time_values` checks and processing steps also apply to # the `ref_time_values` default; for simplicity, just apply all the steps # regardless of whether we are working with a default or user-provided @@ -185,9 +188,9 @@ epi_slide = function(x, f, ..., before, after, ref_time_values, } else if (!all(ref_time_values %in% unique(x$time_value))) { Abort("All `ref_time_values` must appear in `x$time_value`.") } else { - ref_time_values = sort(ref_time_values) + ref_time_values <- sort(ref_time_values) } - + # Validate and pre-process `before`, `after`: if (!missing(before)) { before <- vctrs::vec_cast(before, integer()) @@ -227,20 +230,20 @@ epi_slide = function(x, f, ..., before, after, ref_time_values, after <- time_step(after) } - min_ref_time_values = ref_time_values - before + min_ref_time_values <- ref_time_values - before min_ref_time_values_not_in_x <- min_ref_time_values[!(min_ref_time_values %in% unique(x$time_value))] # Do set up to let us recover `ref_time_value`s later. # A helper column marking real observations. - x$.real = TRUE + x$.real <- TRUE # Create df containing phony data. Df has the same columns and attributes as # `x`, but filled with `NA`s aside from grouping columns. Number of rows is # equal to the number of `min_ref_time_values_not_in_x` we have * the # number of unique levels seen in the grouping columns. - before_time_values_df = data.frame(time_value=min_ref_time_values_not_in_x) + before_time_values_df <- data.frame(time_value = min_ref_time_values_not_in_x) if (length(group_vars(x)) != 0) { - before_time_values_df = dplyr::cross_join( + before_time_values_df <- dplyr::cross_join( # Get unique combinations of grouping columns seen in real data. unique(x[, group_vars(x)]), before_time_values_df @@ -248,69 +251,71 @@ epi_slide = function(x, f, ..., before, after, ref_time_values, } # Automatically fill in all other columns from `x` with `NA`s, and carry # attributes over to new df. 
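  # (Editorial sketch, not part of the patch, assuming daily data that starts
  # on 2020-06-01: with `before = 6`, the reference time 2020-06-01 yields a
  # `min_ref_time_values` entry of 2020-05-26, which is absent from `x`, so a
  # phony all-`NA` row for that date is appended per group; it gives the
  # window machinery below a start point and is dropped again via `.real`.)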
-  before_time_values_df <- bind_rows(x[0,], before_time_values_df)
+  before_time_values_df <- bind_rows(x[0, ], before_time_values_df)
   before_time_values_df$.real <- FALSE
 
   x <- bind_rows(before_time_values_df, x)
 
   # Arrange by increasing time_value
-  x = arrange(x, time_value)
+  x <- arrange(x, time_value)
 
   # Now set up starts and stops for sliding/hopping
-  time_range = range(unique(x$time_value))
-  starts = in_range(ref_time_values - before, time_range)
-  stops = in_range(ref_time_values + after, time_range)
-
-  if( length(starts) == 0 || length(stops) == 0 ) {
+  time_range <- range(unique(x$time_value))
+  starts <- in_range(ref_time_values - before, time_range)
+  stops <- in_range(ref_time_values + after, time_range)
+
+  if (length(starts) == 0 || length(stops) == 0) {
     Abort("The starting and/or stopping times for sliding are out of bounds with respect to the range of times in your data. Check your settings for ref_time_values and align (and before, if specified).")
   }
 
   # Symbolize new column name
-  new_col = sym(new_col_name)
+  new_col <- sym(new_col_name)
 
   # Computation for one group, all time values
-  slide_one_grp = function(.data_group,
-                           f, ...,
-                           starts,
-                           stops,
-                           time_values,
-                           all_rows,
-                           new_col) {
+  slide_one_grp <- function(.data_group,
+                            f, ...,
+                            starts,
+                            stops,
+                            time_values,
+                            all_rows,
+                            new_col) {
     # Figure out which reference time values appear in the data group in the
     # first place (we need to do this because it could differ based on the
     # group, hence the setup/checks for the reference time values based on all
     # the data could still be off)
-    o = time_values %in% .data_group$time_value
-    starts = starts[o]
-    stops = stops[o]
-    time_values = time_values[o]
-
-    # Compute the slide values
-    slide_values_list = slider::hop_index(.x = .data_group,
-                                          .i = .data_group$time_value,
-                                          .f = f, ...,
-                                          .starts = starts,
-                                          .stops = stops)
+    o <- time_values %in% .data_group$time_value
+    starts <- starts[o]
+    stops <- stops[o]
+    time_values <- time_values[o]
+
+    # Compute the slide values
+    slide_values_list <- slider::hop_index(
+      .x = .data_group,
+      .i = .data_group$time_value,
+      .f = f, ...,
+      .starts = starts,
+      .stops = stops
+    )
 
     # Now figure out which rows in the data group are in the reference time
     # values; this will be useful for all sorts of checks that follow
-    o = .data_group$time_value %in% time_values
-    num_ref_rows = sum(o)
-
+    o <- .data_group$time_value %in% time_values
+    num_ref_rows <- sum(o)
+
     # Count the number of appearances of each reference time value (these
     # appearances should all be real for now, but if we allow ref time values
     # outside of .data_group's time values):
-    counts = dplyr::filter(.data_group, .data$time_value %in% time_values) %>%
+    counts <- dplyr::filter(.data_group, .data$time_value %in% time_values) %>%
       dplyr::count(.data$time_value) %>%
       dplyr::pull(n)
 
     if (!all(purrr::map_lgl(slide_values_list, is.atomic)) &&
-        !all(purrr::map_lgl(slide_values_list, is.data.frame))) {
+      !all(purrr::map_lgl(slide_values_list, is.data.frame))) {
       Abort("The slide computations must always return atomic vectors or data frames (and not a mix of these two structures).")
     }
 
     # Unlist if appropriate:
-    slide_values =
+    slide_values <-
       if (as_list_col) {
         slide_values_list
       } else {
@@ -318,16 +323,16 @@ epi_slide = function(x, f, ..., before, after, ref_time_values,
     }
 
     if (all(purrr::map_int(slide_values_list, vctrs::vec_size) == 1L) &&
-        length(slide_values_list) != 0L) {
+      length(slide_values_list) != 0L) {
       # Recycle to make size stable (one slide value per ref time
value). # (Length-0 case also could be handled here, but causes difficulties; # leave it to the next branch, where it also belongs.) - slide_values = vctrs::vec_rep_each(slide_values, times = counts) + slide_values <- vctrs::vec_rep_each(slide_values, times = counts) } else { # Split and flatten if appropriate, perform a (loose) check on number of # rows. if (as_list_col) { - slide_values = purrr::list_flatten(purrr::map( + slide_values <- purrr::list_flatten(purrr::map( slide_values, ~ vctrs::vec_split(.x, seq_len(vctrs::vec_size(.x)))[["val"]] )) } @@ -338,60 +343,61 @@ epi_slide = function(x, f, ..., before, after, ref_time_values, # If all rows, then pad slide values with NAs, else filter down data group if (all_rows) { - orig_values = slide_values - slide_values = vctrs::vec_rep(vctrs::vec_cast(NA, orig_values), nrow(.data_group)) + orig_values <- slide_values + slide_values <- vctrs::vec_rep(vctrs::vec_cast(NA, orig_values), nrow(.data_group)) # ^ using vctrs::vec_init would be shorter but docs don't guarantee it # fills with NA equivalent. - vctrs::vec_slice(slide_values, o) = orig_values + vctrs::vec_slice(slide_values, o) <- orig_values } else { # This implicitly removes phony (`.real` == FALSE) observations. - .data_group = filter(.data_group, o) + .data_group <- filter(.data_group, o) } return(mutate(.data_group, !!new_col := slide_values)) } # If `f` is missing, interpret ... as an expression for tidy evaluation if (missing(f)) { - quos = enquos(...) + quos <- enquos(...) if (length(quos) == 0) { Abort("If `f` is missing then a computation must be specified via `...`.") } if (length(quos) > 1) { Abort("If `f` is missing then only a single computation can be specified via `...`.") } - - f = quos[[1]] - new_col = sym(names(rlang::quos_auto_name(quos))) - ... = missing_arg() # magic value that passes zero args as dots in calls below + + f <- quos[[1]] + new_col <- sym(names(rlang::quos_auto_name(quos))) + ... <- missing_arg() # magic value that passes zero args as dots in calls below } - f = as_slide_computation(f, ...) + f <- as_slide_computation(f, ...) # Create a wrapper that calculates and passes `.ref_time_value` to the # computation. - f_wrapper = function(.x, .group_key, ...) { - .ref_time_value = min(.x$time_value) + before - .x <- .x[.x$.real,] + f_wrapper <- function(.x, .group_key, ...) { + .ref_time_value <- min(.x$time_value) + before + .x <- .x[.x$.real, ] .x$.real <- NULL f(.x, .group_key, .ref_time_value, ...) } - x = group_modify(x, slide_one_grp, - f = f_wrapper, ..., - starts = starts, - stops = stops, - time_values = ref_time_values, - all_rows = all_rows, - new_col = new_col, - .keep = FALSE) - + x <- group_modify(x, slide_one_grp, + f = f_wrapper, ..., + starts = starts, + stops = stops, + time_values = ref_time_values, + all_rows = all_rows, + new_col = new_col, + .keep = FALSE + ) + # Unnest if we need to, and return if (!as_list_col) { - x = unnest(x, !!new_col, names_sep = names_sep) + x <- unnest(x, !!new_col, names_sep = names_sep) } # Remove any remaining phony observations. When `all_rows` is TRUE, phony # observations aren't necessarily removed in `slide_one_grp`. if (all_rows) { - x <- x[x$.real,] + x <- x[x$.real, ] } # Drop helper column `.real`. 
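An illustrative sketch of what this implementation buys the user (editorial, not
part of the patch; the dataset and column names come from the roxygen examples
above, and the two dates are assumed to fall within its range). A trailing
7-day average computed only at selected reference times, with `all_rows = TRUE`
keeping every original row and filling non-reference rows with `NA`:

library(dplyr)
library(epiprocess)

jhu_csse_daily_subset %>%
  group_by(geo_value) %>%
  epi_slide(cases_7dav = mean(cases),
    before = 6,
    ref_time_values = as.Date(c("2020-06-08", "2020-06-15")),
    all_rows = TRUE
  ) %>%
  dplyr::select(geo_value, time_value, cases, cases_7dav)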
diff --git a/R/utils.R b/R/utils.R index 471fb053..9cc707a6 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1,6 +1,6 @@ -break_str = function(str, nchar = 79, init = "") { - str = paste(strwrap(str, nchar, initial = init), collapse = "\n") - str[1] = substring(str, nchar(init)+1) +break_str <- function(str, nchar = 79, init = "") { + str <- paste(strwrap(str, nchar, initial = init), collapse = "\n") + str[1] <- substring(str, nchar(init) + 1) return(str) } @@ -29,9 +29,9 @@ break_str = function(str, nchar = 79, init = "") { #' @return `chr`; to print, use [`base::writeLines`]. #' #' @noRd -wrap_symbolics = function(symbolics, - initial = "", common_prefix = "", none_str = "", - width = getOption("width", 80L)) { +wrap_symbolics <- function(symbolics, + initial = "", common_prefix = "", none_str = "", + width = getOption("width", 80L)) { if (!all(purrr::map_lgl(symbolics, rlang::is_symbolic))) { Abort("`symbolics` must be a list of symbolic objects") } @@ -44,14 +44,16 @@ wrap_symbolics = function(symbolics, if (!rlang::is_string(none_str)) { Abort("`none_str` must be a string") } - prefix = strrep(" ", nchar(initial, type="width")) - full_initial = paste0(common_prefix, initial) - full_prefix = paste0(common_prefix, prefix) - full_initial_width = nchar(full_initial, type="width") - minimum_reasonable_line_width_for_syms = 20L - line_width_for_syms = max(width - full_initial_width, - minimum_reasonable_line_width_for_syms) - unprefixed_lines = + prefix <- strrep(" ", nchar(initial, type = "width")) + full_initial <- paste0(common_prefix, initial) + full_prefix <- paste0(common_prefix, prefix) + full_initial_width <- nchar(full_initial, type = "width") + minimum_reasonable_line_width_for_syms <- 20L + line_width_for_syms <- max( + width - full_initial_width, + minimum_reasonable_line_width_for_syms + ) + unprefixed_lines <- if (length(symbolics) == 0L) { none_str } else { @@ -60,12 +62,14 @@ wrap_symbolics = function(symbolics, # `paste0` already takes care of necessary backquotes. `cat` with # `fill=TRUE` takes care of spacing + line wrapping exclusively # between elements. We need to add commas appropriately. - cat(paste0(symbolics, c(rep(",", times=length(symbolics)-1L), "")), fill=TRUE) + cat(paste0(symbolics, c(rep(",", times = length(symbolics) - 1L), "")), fill = TRUE) }) ) } - lines = paste0(c(full_initial, rep(full_prefix, times=length(unprefixed_lines)-1L)), - unprefixed_lines) + lines <- paste0( + c(full_initial, rep(full_prefix, times = length(unprefixed_lines) - 1L)), + unprefixed_lines + ) lines } @@ -76,15 +80,15 @@ wrap_symbolics = function(symbolics, #' @return `chr`; to print, use [`base::writeLines`]. #' #' @noRd -wrap_varnames = function(nms, - initial = "", common_prefix = "", none_str = "", - width = getOption("width", 80L)) { +wrap_varnames <- function(nms, + initial = "", common_prefix = "", none_str = "", + width = getOption("width", 80L)) { # (Repeating parameter names and default args here for better autocomplete. # Using `...` instead would require less upkeep, but have worse autocomplete.) 
if (!rlang::is_character(nms)) { Abort("`nms` must be a character vector") } - wrap_symbolics(rlang::syms(nms), initial=initial, common_prefix=common_prefix, none_str=none_str, width=width) + wrap_symbolics(rlang::syms(nms), initial = initial, common_prefix = common_prefix, none_str = none_str, width = width) } #' Paste `chr` entries (lines) together with `"\n"` separators, trailing `"\n"` @@ -93,12 +97,12 @@ wrap_varnames = function(nms, #' @return string #' #' @noRd -paste_lines = function(lines) { - paste(paste0(lines,"\n"), collapse="") +paste_lines <- function(lines) { + paste(paste0(lines, "\n"), collapse = "") } -Abort = function(msg, ...) rlang::abort(break_str(msg, init = "Error: "), ...) -Warn = function(msg, ...) rlang::warn(break_str(msg, init = "Warning: "), ...) +Abort <- function(msg, ...) rlang::abort(break_str(msg, init = "Error: "), ...) +Warn <- function(msg, ...) rlang::warn(break_str(msg, init = "Warning: "), ...) #' Assert that a sliding computation function takes enough args #' @@ -115,17 +119,17 @@ Warn = function(msg, ...) rlang::warn(break_str(msg, init = "Warning: "), ...) assert_sufficient_f_args <- function(f, ...) { mandatory_f_args_labels <- c("window data", "group key", "reference time value") n_mandatory_f_args <- length(mandatory_f_args_labels) - args = formals(args(f)) - args_names = names(args) + args <- formals(args(f)) + args_names <- names(args) # Remove named arguments forwarded from `epi[x]_slide`'s `...`: - forwarded_dots_names = names(rlang::call_match(dots_expand = FALSE)[["..."]]) - args_matched_in_dots = + forwarded_dots_names <- names(rlang::call_match(dots_expand = FALSE)[["..."]]) + args_matched_in_dots <- # positional calling args will skip over args matched by named calling args args_names %in% forwarded_dots_names & - # extreme edge case: `epi[x]_slide(, dot = 1, `...` = 2)` - args_names != "..." - remaining_args = args[!args_matched_in_dots] - remaining_args_names = names(remaining_args) + # extreme edge case: `epi[x]_slide(, dot = 1, `...` = 2)` + args_names != "..." + remaining_args <- args[!args_matched_in_dots] + remaining_args_names <- names(remaining_args) # note that this doesn't include unnamed args forwarded through `...`. dots_i <- which(remaining_args_names == "...") # integer(0) if no match n_f_args_before_dots <- dots_i - 1L @@ -134,7 +138,7 @@ assert_sufficient_f_args <- function(f, ...) { mandatory_args_mapped_names <- remaining_args_names[seq_len(n_f_args_before_dots)] if (n_f_args_before_dots < n_mandatory_f_args) { - mandatory_f_args_in_f_dots = + mandatory_f_args_in_f_dots <- tail(mandatory_f_args_labels, n_mandatory_f_args - n_f_args_before_dots) cli::cli_warn( "`f` might not have enough positional arguments before its `...`; in the current `epi[x]_slide` call, the {mandatory_f_args_in_f_dots} will be included in `f`'s `...`; if `f` doesn't expect those arguments, it may produce confusing error messages", @@ -149,13 +153,15 @@ assert_sufficient_f_args <- function(f, ...) { if (rlang::dots_n(...) == 0L) { # common case; try for friendlier error message Abort(sprintf("`f` must take at least %s arguments", n_mandatory_f_args), - class = "epiprocess__assert_sufficient_f_args__f_needs_min_args", - epiprocess__f = f) + class = "epiprocess__assert_sufficient_f_args__f_needs_min_args", + epiprocess__f = f + ) } else { # less common; highlight that they are (accidentally?) 
using dots forwarding Abort(sprintf("`f` must take at least %s arguments plus the %s arguments forwarded through `epi[x]_slide`'s `...`, or a named argument to `epi[x]_slide` was misspelled", n_mandatory_f_args, rlang::dots_n(...)), - class = "epiprocess__assert_sufficient_f_args__f_needs_min_args_plus_forwarded", - epiprocess__f = f) + class = "epiprocess__assert_sufficient_f_args__f_needs_min_args_plus_forwarded", + epiprocess__f = f + ) } } } @@ -163,21 +169,22 @@ assert_sufficient_f_args <- function(f, ...) { # calling args. If `f` has fewer than n_mandatory_f_args before `...`, then we # only need to check those args for defaults. Note that `n_f_args_before_dots` is # length 0 if `f` doesn't accept `...`. - n_remaining_args_for_default_check = min(c(n_f_args_before_dots, n_mandatory_f_args)) - default_check_args = remaining_args[seq_len(n_remaining_args_for_default_check)] - default_check_args_names = names(default_check_args) - has_default_replaced_by_mandatory = map_lgl(default_check_args, ~!is_missing(.x)) + n_remaining_args_for_default_check <- min(c(n_f_args_before_dots, n_mandatory_f_args)) + default_check_args <- remaining_args[seq_len(n_remaining_args_for_default_check)] + default_check_args_names <- names(default_check_args) + has_default_replaced_by_mandatory <- map_lgl(default_check_args, ~ !is_missing(.x)) if (any(has_default_replaced_by_mandatory)) { - default_check_mandatory_args_labels = + default_check_mandatory_args_labels <- mandatory_f_args_labels[seq_len(n_remaining_args_for_default_check)] # ^ excludes any mandatory args absorbed by f's `...`'s: - mandatory_args_replacing_defaults = + mandatory_args_replacing_defaults <- default_check_mandatory_args_labels[has_default_replaced_by_mandatory] - args_with_default_replaced_by_mandatory = + args_with_default_replaced_by_mandatory <- rlang::syms(default_check_args_names[has_default_replaced_by_mandatory]) cli::cli_abort("`epi[x]_slide` would pass the {mandatory_args_replacing_defaults} to `f`'s {args_with_default_replaced_by_mandatory} argument{?s}, which {?has a/have} default value{?s}; we suspect that `f` doesn't expect {?this arg/these args} at all and may produce confusing error messages. Please add additional arguments to `f` or remove defaults as appropriate.", - class = "epiprocess__assert_sufficient_f_args__required_args_contain_defaults", - epiprocess__f = f) + class = "epiprocess__assert_sufficient_f_args__required_args_contain_defaults", + epiprocess__f = f + ) } } @@ -276,24 +283,24 @@ assert_sufficient_f_args <- function(f, ...) { #' #' @noRd as_slide_computation <- function(f, ...) { - arg = caller_arg(f) - call = caller_env() + arg <- caller_arg(f) + call <- caller_env() # A quosure is a type of formula, so be careful with the order and contents # of the conditional logic here. if (is_quosure(f)) { - fn = function(.x, .group_key, .ref_time_value) { + fn <- function(.x, .group_key, .ref_time_value) { # Convert to environment to standardize between tibble and R6 # based inputs. In both cases, we should get a simple # environment with the empty environment as its parent. - data_env = rlang::as_environment(.x) - data_mask = rlang::new_data_mask(bottom = data_env, top = data_env) + data_env <- rlang::as_environment(.x) + data_mask <- rlang::new_data_mask(bottom = data_env, top = data_env) data_mask$.data <- rlang::as_data_pronoun(data_mask) # We'll also install `.x` directly, not as an `rlang_data_pronoun`, so # that we can, e.g., use more dplyr and epiprocess operations. 
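      # (Editorial example, not in the patch: installing `.x` alongside the
      # masked columns lets a tidy-evaluated slide expression mix the two,
      # e.g., something like
      #   epi_slide(edf, frac = sum(cases) / nrow(.x), before = 6)
      # where `cases` is looked up in the data mask and `.x` is the whole
      # window data frame.)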
- data_mask$.x = .x - data_mask$.group_key = .group_key - data_mask$.ref_time_value = .ref_time_value + data_mask$.x <- .x + data_mask$.group_key <- .group_key + data_mask$.ref_time_value <- .ref_time_value rlang::eval_tidy(f, data_mask) } @@ -309,24 +316,27 @@ as_slide_computation <- function(f, ...) { if (is_formula(f)) { if (length(f) > 2) { Abort(sprintf("%s must be a one-sided formula", arg), - class = "epiprocess__as_slide_computation__formula_is_twosided", - epiprocess__f = f, - call = call) + class = "epiprocess__as_slide_computation__formula_is_twosided", + epiprocess__f = f, + call = call + ) } if (rlang::dots_n(...) > 0L) { Abort("No arguments can be passed via `...` when `f` is a formula, or there are unrecognized/misspelled parameter names.", - class = "epiprocess__as_slide_computation__formula_with_dots", - epiprocess__f = f, - epiprocess__enquos_dots = enquos(...)) + class = "epiprocess__as_slide_computation__formula_with_dots", + epiprocess__f = f, + epiprocess__enquos_dots = enquos(...) + ) } env <- f_env(f) if (!is_environment(env)) { Abort("Formula must carry an environment.", - class = "epiprocess__as_slide_computation__formula_has_no_env", - epiprocess__f = f, - epiprocess__f_env = env, - arg = arg, call = call) + class = "epiprocess__as_slide_computation__formula_has_no_env", + epiprocess__f = f, + epiprocess__f_env = env, + arg = arg, call = call + ) } args <- list( @@ -340,103 +350,121 @@ as_slide_computation <- function(f, ...) { return(fn) } - Abort(sprintf("Can't convert an object of class %s to a slide computation", paste(collapse=" ", deparse(class(f)))), - class = "epiprocess__as_slide_computation__cant_convert_catchall", - epiprocess__f = f, - epiprocess__f_class = class(f), - arg = arg, - call = call) + Abort(sprintf("Can't convert an object of class %s to a slide computation", paste(collapse = " ", deparse(class(f)))), + class = "epiprocess__as_slide_computation__cant_convert_catchall", + epiprocess__f = f, + epiprocess__f_class = class(f), + arg = arg, + call = call + ) } ########## -in_range = function(x, rng) pmin(pmax(x, rng[1]), rng[2]) +in_range <- function(x, rng) pmin(pmax(x, rng[1]), rng[2]) ########## -Min = function(x) min(x, na.rm = TRUE) -Max = function(x) max(x, na.rm = TRUE) -Sum = function(x) sum(x, na.rm = TRUE) -Mean = function(x) mean(x, na.rm = TRUE) -Median = function(x) median(x, na.rm = TRUE) +Min <- function(x) min(x, na.rm = TRUE) +Max <- function(x) max(x, na.rm = TRUE) +Sum <- function(x) sum(x, na.rm = TRUE) +Mean <- function(x) mean(x, na.rm = TRUE) +Median <- function(x) median(x, na.rm = TRUE) ########## -Start = function(x) x[1] -End = function(x) x[length(x)] -MiddleL = function(x) x[floor((length(x)+1)/2)] -MiddleR = function(x) x[ceiling((length(x)+1)/2)] -ExtendL = function(x) c(Start(x), x) -ExtendR = function(x) c(x, End(x)) +Start <- function(x) x[1] +End <- function(x) x[length(x)] +MiddleL <- function(x) x[floor((length(x) + 1) / 2)] +MiddleR <- function(x) x[ceiling((length(x) + 1) / 2)] +ExtendL <- function(x) c(Start(x), x) +ExtendR <- function(x) c(x, End(x)) -guess_geo_type = function(geo_value) { +guess_geo_type <- function(geo_value) { if (is.character(geo_value)) { # Convert geo values to lowercase - geo_value = tolower(geo_value) - - # If all geo values are state abbreviations, then use "state" - state_values = c(tolower(datasets::state.abb), - "as", "dc", "gu", "mp", "pr", "vi") - if (all(geo_value %in% state_values)) return("state") - - # Else if all geo values are 2 letters, then use "nation" - else if 
(all(grepl("[a-z]{2}", geo_value)) - & !any(grepl("[a-z]{3}", geo_value))) return("nation") - - # Else if all geo values are 5 numbers, then use "county" - else if (all(grepl("[0-9]{5}", geo_value)) & - !any(grepl("[0-9]{6}", geo_value))) return("county") - } + geo_value <- tolower(geo_value) - else if (is.numeric(geo_value)) { + # If all geo values are state abbreviations, then use "state" + state_values <- c( + tolower(datasets::state.abb), + "as", "dc", "gu", "mp", "pr", "vi" + ) + if (all(geo_value %in% state_values)) { + return("state") + } # Else if all geo values are 2 letters, then use "nation" + else if (all(grepl("[a-z]{2}", geo_value)) & + !any(grepl("[a-z]{3}", geo_value))) { + return("nation") + } # Else if all geo values are 5 numbers, then use "county" + else if (all(grepl("[0-9]{5}", geo_value)) & + !any(grepl("[0-9]{6}", geo_value))) { + return("county") + } + } else if (is.numeric(geo_value)) { # Convert geo values to integers - geo_value = as.integer(geo_value) + geo_value <- as.integer(geo_value) # If the max geo value is at most 10, then use "hhs" - if (max(geo_value) <= 10) return("hhs") - + if (max(geo_value) <= 10) { + return("hhs") + } + # Else if the max geo value is at most 457, then use "hrr" - if (max(geo_value) <= 457) return("hrr") + if (max(geo_value) <= 457) { + return("hrr") + } } # If we got here then we failed return("custom") } -guess_time_type = function(time_value) { +guess_time_type <- function(time_value) { # Convert character time values to Date or POSIXct if (is.character(time_value)) { if (nchar(time_value[1]) <= "10") { - new_time_value = tryCatch({ as.Date(time_value) }, - error = function(e) NULL) - } - else { - new_time_value = tryCatch({ as.POSIXct(time_value) }, - error = function(e) NULL) + new_time_value <- tryCatch( + { + as.Date(time_value) + }, + error = function(e) NULL + ) + } else { + new_time_value <- tryCatch( + { + as.POSIXct(time_value) + }, + error = function(e) NULL + ) } - if (!is.null(new_time_value)) time_value = new_time_value + if (!is.null(new_time_value)) time_value <- new_time_value } - - # Now, if a POSIXct class, then use "day-time" - if (inherits(time_value, "POSIXct")) return("day-time") - # Else, if a Date class, then use "week" or "day" depending on gaps + # Now, if a POSIXct class, then use "day-time" + if (inherits(time_value, "POSIXct")) { + return("day-time") + } # Else, if a Date class, then use "week" or "day" depending on gaps else if (inherits(time_value, "Date")) { return(ifelse(all(diff(sort(time_value)) == 7), "week", "day")) } # Else, check whether it's one of the tsibble classes - else if (inherits(time_value, "yearweek")) return("yearweek") - else if (inherits(time_value, "yearmonth")) return("yearmonth") - else if (inherits(time_value, "yearquarter")) return("yearquarter") + else if (inherits(time_value, "yearweek")) { + return("yearweek") + } else if (inherits(time_value, "yearmonth")) { + return("yearmonth") + } else if (inherits(time_value, "yearquarter")) { + return("yearquarter") + } # Else, if it's an integer that's at least 1582, then use "year" if (is.numeric(time_value) && - all(time_value == as.integer(time_value)) && - all(time_value >= 1582)) { + all(time_value == as.integer(time_value)) && + all(time_value >= 1582)) { return("year") } - + # If we got here then we failed return("custom") } @@ -444,29 +472,29 @@ guess_time_type = function(time_value) { ########## -quiet = function(x) { - sink(tempfile()) - on.exit(sink()) - invisible(force(x)) +quiet <- function(x) { + sink(tempfile()) + 
on.exit(sink())
+  invisible(force(x))
 }
 
 ##########
 
 # Create an auto-named list
-enlist = function(...) {
-  x = list(...)
-  n = as.character(sys.call())[-1]
+enlist <- function(...) {
+  x <- list(...)
+  n <- as.character(sys.call())[-1]
   if (!is.null(n0 <- names(x))) {
-    n[n0 != ""] = n0[n0 != ""]
+    n[n0 != ""] <- n0[n0 != ""]
   }
-  names(x) = n
-  return(x)
+  names(x) <- n
+  return(x)
 }
 
-# Variable assignment from a list. NOT USED. Something is broken, this doesn't
+# Variable assignment from a list. NOT USED. Something is broken, this doesn't
 # seem to work completely as expected: the variables it defines don't propagate
-# down to child environments
-list2var = function(x) {
+# down to child environments
+list2var <- function(x) {
   list2env(x, envir = parent.frame())
 }
 
@@ -485,7 +513,7 @@ list2var = function(x) {
 #'
 #' @examples
 #'
-#' fn = function(x = deprecated()) {
+#' fn <- function(x = deprecated()) {
 #'   deprecated_quo_is_present(rlang::enquo(x))
 #' }
 #'
@@ -497,10 +525,10 @@ list2var = function(x) {
 #' # argument that has already been defused into a quosure, `!!quo`). (This is
 #' # already how NSE arguments that will be enquosed should be forwarded.)
 #'
-#' wrapper1 = function(x=deprecated()) fn({{x}})
-#' wrapper2 = function(x=lifecycle::deprecated()) fn({{x}})
-#' wrapper3 = function(x) fn({{x}})
-#' wrapper4 = function(x) fn(!!rlang::enquo(x))
+#' wrapper1 <- function(x = deprecated()) fn({{ x }})
+#' wrapper2 <- function(x = lifecycle::deprecated()) fn({{ x }})
+#' wrapper3 <- function(x) fn({{ x }})
+#' wrapper4 <- function(x) fn(!!rlang::enquo(x))
 #'
 #' wrapper1() # FALSE
 #' wrapper2() # FALSE
@@ -509,27 +537,28 @@ list2var = function(x) {
 #'
 #' # More advanced: wrapper that receives an already-enquosed arg:
 #'
-#' inner_wrapper = function(quo) fn(!!quo)
-#' outer_wrapper1 = function(x=deprecated()) inner_wrapper(rlang::enquo(x))
+#' inner_wrapper <- function(quo) fn(!!quo)
+#' outer_wrapper1 <- function(x = deprecated()) inner_wrapper(rlang::enquo(x))
 #'
 #' outer_wrapper1() # FALSE
 #'
 #' # Improper argument forwarding from a wrapper function will cause this
 #' # function to produce incorrect results.
-#' bad_wrapper1 = function(x) fn(x)
+#' bad_wrapper1 <- function(x) fn(x)
 #' bad_wrapper1() # TRUE, bad
 #'
 #' @noRd
-deprecated_quo_is_present = function(quo) {
+deprecated_quo_is_present <- function(quo) {
   if (!rlang::is_quosure(quo)) {
     Abort("`quo` must be a quosure; `enquo` the arg first",
-          internal=TRUE)
+      internal = TRUE
+    )
  } else if (rlang::quo_is_missing(quo)) {
    FALSE
  } else {
-    quo_expr = rlang::get_expr(quo)
+    quo_expr <- rlang::get_expr(quo)
    if (identical(quo_expr, rlang::expr(deprecated())) ||
-        identical(quo_expr, rlang::expr(lifecycle::deprecated()))) {
+      identical(quo_expr, rlang::expr(lifecycle::deprecated()))) {
      FALSE
    } else {
      TRUE
@@ -577,7 +606,7 @@
 #'   be an integer. 
 #'
 #' @noRd
-gcd2num = function(a, b, rrtol=1e-6, pqlim=1e6, irtol=1e-6) {
+gcd2num <- function(a, b, rrtol = 1e-6, pqlim = 1e6, irtol = 1e-6) {
   if (!is.numeric(a) || length(a) != 1L) {
     Abort("`a` must satisfy `is.numeric`, have `length` 1.")
   }
@@ -593,21 +622,21 @@
   if (!is.numeric(irtol) || length(irtol) != 1L || irtol < 0) {
     Abort("`irtol` must satisfy `is.numeric`, have `length` 1, and be non-negative.")
   }
-  if (is.na(a) || is.na(b) || a == 0 || b == 0 || abs(a/b) >= pqlim || abs(b/a) >= pqlim) {
+  if (is.na(a) || is.na(b) || a == 0 || b == 0 || abs(a / b) >= pqlim || abs(b / a) >= pqlim) {
     Abort("`a` and/or `b` is either `NA` or exactly zero, or one is so much smaller than the other that it looks like it's supposed to be zero; see `pqlim` setting.")
   }
-  iatol = irtol * max(a,b)
-  a_curr = a
-  b_curr = b
+  iatol <- irtol * max(a, b)
+  a_curr <- a
+  b_curr <- b
   while (TRUE) {
     # `b_curr` is the candidate GCD / iterand; check first if it seems too small:
     if (abs(b_curr) <= iatol) {
-      Abort('No GCD found; remaining potential Gads are all too small relative to one/both of the original inputs; see `irtol` setting.')
+      Abort("No GCD found; remaining potential GCDs are all too small relative to one/both of the original inputs; see `irtol` setting.")
     }
-    remainder = a_curr - round(a_curr / b_curr) * b_curr
+    remainder <- a_curr - round(a_curr / b_curr) * b_curr
     if (abs(remainder / b_curr) <= rrtol) {
       # We consider `a_curr` divisible by `b_curr`; `b_curr` is the GCD or its negation
-      return (abs(b_curr))
+      return(abs(b_curr))
     }
     a_curr <- b_curr
     b_curr <- remainder
@@ -625,7 +654,7 @@
 #'   error.)
 #'
 #' @noRd
-gcd_num = function(dividends, ..., rrtol=1e-6, pqlim=1e6, irtol=1e-6) {
+gcd_num <- function(dividends, ..., rrtol = 1e-6, pqlim = 1e6, irtol = 1e-6) {
   if (!is.numeric(dividends) || length(dividends) == 0L) {
     Abort("`dividends` must satisfy `is.numeric`, and have `length` > 0")
   }
@@ -637,7 +666,7 @@
   # workload. Also take `abs` early on as another form of deduplication and to
   # make the sort simpler. Use `na.last=FALSE` in the sort to preserve presence
   # of `NA`s in order to get a better error message in this case.
-  optimized_dividends = sort(unique(abs(dividends)), na.last=FALSE)
+  optimized_dividends <- sort(unique(abs(dividends)), na.last = FALSE)
   # Note that taking the prime factorizations of a set of integers, and
   # calculating the minimum power for each prime across all these
   # factorizations, yields the prime factorization of the GCD of the set of
@@ -656,8 +685,9 @@
   # gcd2real(gcd_int(X/gcd_real(XUY))*gcd_real(XUY),
   #          gcd_int(Y/gcd_real(XUY))*gcd_real(XUY)) = gcd2real(gcd_real(X),
   # gcd_real(Y)). So "gcd_real" should also be `reduce`-compatible.
-  numeric_gcd = purrr::reduce(optimized_dividends, gcd2num,
-                              rrtol=rrtol, pqlim=pqlim, irtol=irtol)
+  numeric_gcd <- purrr::reduce(optimized_dividends, gcd2num,
+    rrtol = rrtol, pqlim = pqlim, irtol = irtol
+  )
   vctrs::vec_cast(numeric_gcd, dividends)
 }
 
@@ -672,13 +702,13 @@
 #'   in error messages. Defaults to quoting the expression the caller fed into
 #'   the `ref_time_values` argument. 
#' @return `is.numeric`, length 1; attempts to match `typeof(ref_time_values)` -guess_period = function(ref_time_values, ref_time_values_arg = rlang::caller_arg(ref_time_values)) { - sorted_distinct_ref_time_values = sort(unique(ref_time_values)) +guess_period <- function(ref_time_values, ref_time_values_arg = rlang::caller_arg(ref_time_values)) { + sorted_distinct_ref_time_values <- sort(unique(ref_time_values)) if (length(sorted_distinct_ref_time_values) < 2L) { Abort(sprintf("Not enough distinct values in `%s` to guess the period.", ref_time_values_arg)) } - skips = diff(sorted_distinct_ref_time_values) - decayed_skips = + skips <- diff(sorted_distinct_ref_time_values) + decayed_skips <- if (typeof(skips) == "integer") { as.integer(skips) } else { diff --git a/tests/testthat/test-archive-version-bounds.R b/tests/testthat/test-archive-version-bounds.R index 598825f6..4000727a 100644 --- a/tests/testthat/test-archive-version-bounds.R +++ b/tests/testthat/test-archive-version-bounds.R @@ -1,8 +1,9 @@ test_that("`validate_version_bound` allows/catches `NA` as requested", { - my_version_bound = NA - validate_version_bound(my_version_bound, na_ok=TRUE) - expect_error(validate_version_bound(my_version_bound, na_ok=FALSE), - class="epiprocess__my_version_bound_is_na") + my_version_bound <- NA + validate_version_bound(my_version_bound, na_ok = TRUE) + expect_error(validate_version_bound(my_version_bound, na_ok = FALSE), + class = "epiprocess__my_version_bound_is_na" + ) # Note that if the error class name changes, this test may produce some # confusing output along the following lines: # @@ -11,23 +12,26 @@ test_that("`validate_version_bound` allows/catches `NA` as requested", { }) test_that("`validate_version_bound` catches bounds that are the wrong length", { - my_version_bound1a = NULL - expect_error(validate_version_bound(my_version_bound1a, na_ok=TRUE), - class="epiprocess__my_version_bound1a_is_not_length_1") - my_version_bound1b = integer(0L) - expect_error(validate_version_bound(my_version_bound1b, na_ok=TRUE), - class="epiprocess__my_version_bound1b_is_not_length_1") - my_version_bound2 = c(2, 10) - expect_error(validate_version_bound(my_version_bound2, na_ok=TRUE), - class="epiprocess__my_version_bound2_is_not_length_1") + my_version_bound1a <- NULL + expect_error(validate_version_bound(my_version_bound1a, na_ok = TRUE), + class = "epiprocess__my_version_bound1a_is_not_length_1" + ) + my_version_bound1b <- integer(0L) + expect_error(validate_version_bound(my_version_bound1b, na_ok = TRUE), + class = "epiprocess__my_version_bound1b_is_not_length_1" + ) + my_version_bound2 <- c(2, 10) + expect_error(validate_version_bound(my_version_bound2, na_ok = TRUE), + class = "epiprocess__my_version_bound2_is_not_length_1" + ) }) test_that("`validate_version_bound` validate and class checks together allow and catch as intended", { - my_int = 5L - my_dbl = 5 - my_list = list(5L) - my_date = as.Date("2000-01-01") - my_datetime = vctrs::vec_cast(my_date, as.POSIXct(as.Date("1900-01-01"))) + my_int <- 5L + my_dbl <- 5 + my_list <- list(5L) + my_date <- as.Date("2000-01-01") + my_datetime <- vctrs::vec_cast(my_date, as.POSIXct(as.Date("1900-01-01"))) # When first drafted, this validate function was a (validate+)cast function, # which used vctrs::vec_cast inside. 
However, the initial implementation # didn't actually allow casting to occur, and it was easier to change to the @@ -41,30 +45,36 @@ test_that("`validate_version_bound` validate and class checks together allow and expect_identical(vctrs::vec_cast(my_datetime, my_date), my_date) expect_identical(vctrs::vec_cast(my_date, my_datetime), my_datetime) # - x_int = tibble::tibble(version = my_int) - x_dbl = tibble::tibble(version = my_dbl) - x_list = tibble::tibble(version = my_list) - x_date = tibble::tibble(version = my_date) - x_datetime = tibble::tibble(version = my_datetime) + x_int <- tibble::tibble(version = my_int) + x_dbl <- tibble::tibble(version = my_dbl) + x_list <- tibble::tibble(version = my_list) + x_date <- tibble::tibble(version = my_date) + x_datetime <- tibble::tibble(version = my_datetime) # Custom classes matter (test vectors and non-vctrs-specialized lists separately): - my_version_bound1 = `class<-`(24, "c1") - expect_error(validate_version_bound(my_version_bound1, x_int, na_ok=FALSE), - class="epiprocess__my_version_bound1_has_invalid_class_or_typeof") - my_version_bound2 = `class<-`(list(12), c("c2a","c2b","c2c")) - expect_error(validate_version_bound(my_version_bound2, x_list, na_ok=FALSE), - class="epiprocess__my_version_bound2_has_invalid_class_or_typeof") + my_version_bound1 <- `class<-`(24, "c1") + expect_error(validate_version_bound(my_version_bound1, x_int, na_ok = FALSE), + class = "epiprocess__my_version_bound1_has_invalid_class_or_typeof" + ) + my_version_bound2 <- `class<-`(list(12), c("c2a", "c2b", "c2c")) + expect_error(validate_version_bound(my_version_bound2, x_list, na_ok = FALSE), + class = "epiprocess__my_version_bound2_has_invalid_class_or_typeof" + ) # Want no error matching date to date or datetime to datetime, but no interop due to tz issues: - validate_version_bound(my_date, x_date, version_bound_arg="vb") - validate_version_bound(my_datetime, x_datetime, version_bound_arg="vb") - expect_error(validate_version_bound(my_datetime, x_date, na_ok=TRUE, version_bound_arg="vb"), - class="epiprocess__vb_has_invalid_class_or_typeof") - expect_error(validate_version_bound(my_date, x_datetime, na_ok=TRUE, version_bound_arg="vb"), - class="epiprocess__vb_has_invalid_class_or_typeof") + validate_version_bound(my_date, x_date, version_bound_arg = "vb") + validate_version_bound(my_datetime, x_datetime, version_bound_arg = "vb") + expect_error(validate_version_bound(my_datetime, x_date, na_ok = TRUE, version_bound_arg = "vb"), + class = "epiprocess__vb_has_invalid_class_or_typeof" + ) + expect_error(validate_version_bound(my_date, x_datetime, na_ok = TRUE, version_bound_arg = "vb"), + class = "epiprocess__vb_has_invalid_class_or_typeof" + ) # Bad: expect_error(validate_version_bound(3.5, x_int, TRUE, "vb")) expect_error(validate_version_bound(.Machine$integer.max, x_dbl, TRUE, "vb")) - expect_error(validate_version_bound(`class<-`(list(2), "clazz"), - tibble::tibble(version=`class<-`(5L, "clazz")), TRUE, "vb")) + expect_error(validate_version_bound( + `class<-`(list(2), "clazz"), + tibble::tibble(version = `class<-`(5L, "clazz")), TRUE, "vb" + )) # Maybe questionable: expect_error(validate_version_bound(3, x_int, TRUE, "vb")) expect_error(validate_version_bound(3L, x_dbl, TRUE, "vb")) @@ -77,34 +87,48 @@ test_that("`validate_version_bound` validate and class checks together allow and }) test_that("archive version bounds args work as intended", { - measurement_date = as.Date("2000-01-01") - update_tbl = tibble::tibble( + measurement_date <- as.Date("2000-01-01") + 
update_tbl <- tibble::tibble( geo_value = "g1", time_value = measurement_date, version = measurement_date + 1:5, value = 1:5 ) - expect_error(as_epi_archive(update_tbl, - clobberable_versions_start = 1241, - versions_end = measurement_date), - class="epiprocess__clobberable_versions_start_has_invalid_class_or_typeof") - expect_error(as_epi_archive(update_tbl[integer(0L),]), - class="epiprocess__max_version_cannot_be_used") - expect_error(as_epi_archive(update_tbl, - clobberable_versions_start = NA, - versions_end = measurement_date), - class="epiprocess__versions_end_earlier_than_updates") - expect_error(as_epi_archive(update_tbl, - clobberable_versions_start=measurement_date+6L, - versions_end = measurement_date+5L), - class="epiprocess__versions_end_earlier_than_clobberable_versions_start") + expect_error( + as_epi_archive(update_tbl, + clobberable_versions_start = 1241, + versions_end = measurement_date + ), + class = "epiprocess__clobberable_versions_start_has_invalid_class_or_typeof" + ) + expect_error(as_epi_archive(update_tbl[integer(0L), ]), + class = "epiprocess__max_version_cannot_be_used" + ) + expect_error( + as_epi_archive(update_tbl, + clobberable_versions_start = NA, + versions_end = measurement_date + ), + class = "epiprocess__versions_end_earlier_than_updates" + ) + expect_error( + as_epi_archive(update_tbl, + clobberable_versions_start = measurement_date + 6L, + versions_end = measurement_date + 5L + ), + class = "epiprocess__versions_end_earlier_than_clobberable_versions_start" + ) expect_error(as_epi_archive(update_tbl, versions_end = NA), - regexp="versions_end.*must not satisfy.*is.na") - ea_default = as_epi_archive(update_tbl) - ea_default$as_of(measurement_date+4L) - expect_warning(regexp=NA, - ea_default$as_of(measurement_date+5L), - class = "epiprocess__snapshot_as_of_clobberable_version") - expect_error(ea_default$as_of(measurement_date+6L), - regexp = "max_version.*at most.*versions_end") + regexp = "versions_end.*must not satisfy.*is.na" + ) + ea_default <- as_epi_archive(update_tbl) + ea_default$as_of(measurement_date + 4L) + expect_warning( + regexp = NA, + ea_default$as_of(measurement_date + 5L), + class = "epiprocess__snapshot_as_of_clobberable_version" + ) + expect_error(ea_default$as_of(measurement_date + 6L), + regexp = "max_version.*at most.*versions_end" + ) }) diff --git a/tests/testthat/test-archive.R b/tests/testthat/test-archive.R index 790ac65f..73f0e166 100644 --- a/tests/testthat/test-archive.R +++ b/tests/testthat/test-archive.R @@ -1,145 +1,167 @@ library(dplyr) -test_that("first input must be a data.frame",{ - expect_error(as_epi_archive(c(1,2,3),compactify=FALSE), - regexp="`x` must be a data frame.") +test_that("first input must be a data.frame", { + expect_error(as_epi_archive(c(1, 2, 3), compactify = FALSE), + regexp = "`x` must be a data frame." + ) }) dt <- archive_cases_dv_subset$DT -test_that("data.frame must contain geo_value, time_value and version columns",{ - expect_error(as_epi_archive(select(dt,-geo_value), compactify=FALSE), - regexp="`x` must contain a `geo_value` column.") - expect_error(as_epi_archive(select(dt,-time_value), compactify=FALSE), - regexp="`x` must contain a `time_value` column.") - expect_error(as_epi_archive(select(dt,-version), compactify=FALSE), - regexp="`x` must contain a `version` column.") +test_that("data.frame must contain geo_value, time_value and version columns", { + expect_error(as_epi_archive(select(dt, -geo_value), compactify = FALSE), + regexp = "`x` must contain a `geo_value` column." 
+ ) + expect_error(as_epi_archive(select(dt, -time_value), compactify = FALSE), + regexp = "`x` must contain a `time_value` column." + ) + expect_error(as_epi_archive(select(dt, -version), compactify = FALSE), + regexp = "`x` must contain a `version` column." + ) }) -test_that("other_keys can only contain names of the data.frame columns",{ - expect_error(as_epi_archive(dt,other_keys = "xyz", compactify=FALSE), - regexp="`other_keys` must be contained in the column names of `x`.") - expect_error(as_epi_archive(dt,other_keys = "percent_cli", compactify=FALSE),NA) +test_that("other_keys can only contain names of the data.frame columns", { + expect_error(as_epi_archive(dt, other_keys = "xyz", compactify = FALSE), + regexp = "`other_keys` must be contained in the column names of `x`." + ) + expect_error(as_epi_archive(dt, other_keys = "percent_cli", compactify = FALSE), NA) }) -test_that("other_keys cannot contain names geo_value, time_value or version",{ - expect_error(as_epi_archive(dt,other_keys = "geo_value", compactify=FALSE), - regexp="`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\".") - expect_error(as_epi_archive(dt,other_keys = "time_value", compactify=FALSE), - regexp="`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\".") - expect_error(as_epi_archive(dt,other_keys = "version", compactify=FALSE), - regexp="`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\".") +test_that("other_keys cannot contain names geo_value, time_value or version", { + expect_error(as_epi_archive(dt, other_keys = "geo_value", compactify = FALSE), + regexp = "`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\"." + ) + expect_error(as_epi_archive(dt, other_keys = "time_value", compactify = FALSE), + regexp = "`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\"." + ) + expect_error(as_epi_archive(dt, other_keys = "version", compactify = FALSE), + regexp = "`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\"." + ) }) -test_that("Warning thrown when other_metadata contains overlapping names with geo_type or time_type fields",{ - expect_warning(as_epi_archive(dt,additional_metadata = list(geo_type = 1), compactify=FALSE), - regexp="`additional_metadata` names overlap with existing metadata fields\n\"geo_type\", \"time_type\".") - expect_warning(as_epi_archive(dt,additional_metadata = list(time_type = 1), compactify=FALSE), - regexp="`additional_metadata` names overlap with existing metadata fields\n\"geo_type\", \"time_type\".") +test_that("Warning thrown when other_metadata contains overlapping names with geo_type or time_type fields", { + expect_warning(as_epi_archive(dt, additional_metadata = list(geo_type = 1), compactify = FALSE), + regexp = "`additional_metadata` names overlap with existing metadata fields\n\"geo_type\", \"time_type\"." + ) + expect_warning(as_epi_archive(dt, additional_metadata = list(time_type = 1), compactify = FALSE), + regexp = "`additional_metadata` names overlap with existing metadata fields\n\"geo_type\", \"time_type\"." 
+ ) }) -test_that("epi_archives are correctly instantiated with a variety of data types",{ +test_that("epi_archives are correctly instantiated with a variety of data types", { # Data frame - df <- data.frame(geo_value="ca", - time_value=as.Date("2020-01-01"), - version = as.Date("2020-01-01") + 0:19, - value=1:20) - - ea1 <- as_epi_archive(df, compactify=FALSE) - expect_equal(key(ea1$DT),c("geo_value","time_value","version")) - expect_equal(ea1$additional_metadata,list()) - - ea2 <- as_epi_archive(df, other_keys="value", additional_metadata=list(value=df$value), compactify=FALSE) - expect_equal(key(ea2$DT),c("geo_value","time_value","value","version")) - expect_equal(ea2$additional_metadata,list(value=df$value)) - + df <- data.frame( + geo_value = "ca", + time_value = as.Date("2020-01-01"), + version = as.Date("2020-01-01") + 0:19, + value = 1:20 + ) + + ea1 <- as_epi_archive(df, compactify = FALSE) + expect_equal(key(ea1$DT), c("geo_value", "time_value", "version")) + expect_equal(ea1$additional_metadata, list()) + + ea2 <- as_epi_archive(df, other_keys = "value", additional_metadata = list(value = df$value), compactify = FALSE) + expect_equal(key(ea2$DT), c("geo_value", "time_value", "value", "version")) + expect_equal(ea2$additional_metadata, list(value = df$value)) + # Tibble - tib <- tibble::tibble(df, code="x") - - ea3 <- as_epi_archive(tib, compactify=FALSE) - expect_equal(key(ea3$DT),c("geo_value","time_value","version")) - expect_equal(ea3$additional_metadata,list()) - - ea4 <- as_epi_archive(tib, other_keys="code", additional_metadata=list(value=df$value), compactify=FALSE) - expect_equal(key(ea4$DT),c("geo_value","time_value","code","version")) - expect_equal(ea4$additional_metadata,list(value=df$value)) - + tib <- tibble::tibble(df, code = "x") + + ea3 <- as_epi_archive(tib, compactify = FALSE) + expect_equal(key(ea3$DT), c("geo_value", "time_value", "version")) + expect_equal(ea3$additional_metadata, list()) + + ea4 <- as_epi_archive(tib, other_keys = "code", additional_metadata = list(value = df$value), compactify = FALSE) + expect_equal(key(ea4$DT), c("geo_value", "time_value", "code", "version")) + expect_equal(ea4$additional_metadata, list(value = df$value)) + # Keyed data.table - kdt <- data.table::data.table(geo_value="ca", - time_value=as.Date("2020-01-01"), - version = as.Date("2020-01-01") + 0:19, - value = 1:20, - code = "CA", - key = "code") - - ea5 <- as_epi_archive(kdt, compactify=FALSE) + kdt <- data.table::data.table( + geo_value = "ca", + time_value = as.Date("2020-01-01"), + version = as.Date("2020-01-01") + 0:19, + value = 1:20, + code = "CA", + key = "code" + ) + + ea5 <- as_epi_archive(kdt, compactify = FALSE) # Key from data.table isn't absorbed when as_epi_archive is used - expect_equal(key(ea5$DT),c("geo_value","time_value","version")) - expect_equal(ea5$additional_metadata,list()) - - ea6 <- as_epi_archive(kdt,other_keys="value", additional_metadata=list(value=df$value), compactify=FALSE) + expect_equal(key(ea5$DT), c("geo_value", "time_value", "version")) + expect_equal(ea5$additional_metadata, list()) + + ea6 <- as_epi_archive(kdt, other_keys = "value", additional_metadata = list(value = df$value), compactify = FALSE) # Mismatched keys, but the one from as_epi_archive overrides - expect_equal(key(ea6$DT),c("geo_value","time_value","value","version")) - expect_equal(ea6$additional_metadata,list(value=df$value)) - + expect_equal(key(ea6$DT), c("geo_value", "time_value", "value", "version")) + expect_equal(ea6$additional_metadata, list(value = df$value)) 
+ # Unkeyed data.table - udt <- data.table::data.table(geo_value="ca", - time_value=as.Date("2020-01-01"), - version = as.Date("2020-01-01") + 0:19, - value=1:20, - code = "CA") - - ea7 <- as_epi_archive(udt, compactify=FALSE) - expect_equal(key(ea7$DT),c("geo_value","time_value","version")) - expect_equal(ea7$additional_metadata,list()) - - ea8 <- as_epi_archive(udt,other_keys="code", additional_metadata=list(value=df$value), compactify=FALSE) - expect_equal(key(ea8$DT),c("geo_value","time_value","code","version")) - expect_equal(ea8$additional_metadata,list(value=df$value)) - + udt <- data.table::data.table( + geo_value = "ca", + time_value = as.Date("2020-01-01"), + version = as.Date("2020-01-01") + 0:19, + value = 1:20, + code = "CA" + ) + + ea7 <- as_epi_archive(udt, compactify = FALSE) + expect_equal(key(ea7$DT), c("geo_value", "time_value", "version")) + expect_equal(ea7$additional_metadata, list()) + + ea8 <- as_epi_archive(udt, other_keys = "code", additional_metadata = list(value = df$value), compactify = FALSE) + expect_equal(key(ea8$DT), c("geo_value", "time_value", "code", "version")) + expect_equal(ea8$additional_metadata, list(value = df$value)) + # epi_df edf1 <- jhu_csse_daily_subset %>% - select(geo_value,time_value,cases) %>% + select(geo_value, time_value, cases) %>% mutate(version = max(time_value), code = "USA") - - ea9 <- as_epi_archive(edf1, compactify=FALSE) - expect_equal(key(ea9$DT),c("geo_value","time_value","version")) - expect_equal(ea9$additional_metadata,list()) - - ea10 <- as_epi_archive(edf1,other_keys="code", additional_metadata=list(value=df$value), compactify=FALSE) - expect_equal(key(ea10$DT),c("geo_value","time_value","code","version")) - expect_equal(ea10$additional_metadata,list(value=df$value)) - + + ea9 <- as_epi_archive(edf1, compactify = FALSE) + expect_equal(key(ea9$DT), c("geo_value", "time_value", "version")) + expect_equal(ea9$additional_metadata, list()) + + ea10 <- as_epi_archive(edf1, other_keys = "code", additional_metadata = list(value = df$value), compactify = FALSE) + expect_equal(key(ea10$DT), c("geo_value", "time_value", "code", "version")) + expect_equal(ea10$additional_metadata, list(value = df$value)) + # Keyed epi_df - edf2 <- data.frame(geo_value = "al", - time_value = rep(as.Date("2020-01-01") + 0:9,2), - version = c(rep(as.Date("2020-01-25"),10), - rep(as.Date("2020-01-26"),10)), - cases = 1:20, - misc = "USA") %>% + edf2 <- data.frame( + geo_value = "al", + time_value = rep(as.Date("2020-01-01") + 0:9, 2), + version = c( + rep(as.Date("2020-01-25"), 10), + rep(as.Date("2020-01-26"), 10) + ), + cases = 1:20, + misc = "USA" + ) %>% as_epi_df(additional_metadata = list(other_keys = "misc")) - - ea11 <- as_epi_archive(edf2, compactify=FALSE) - expect_equal(key(ea11$DT),c("geo_value","time_value","version")) - expect_equal(ea11$additional_metadata,list()) - - ea12 <- as_epi_archive(edf2,other_keys="misc", additional_metadata=list(value=df$misc), compactify=FALSE) - expect_equal(key(ea12$DT),c("geo_value","time_value","misc","version")) - expect_equal(ea12$additional_metadata,list(value=df$misc)) + + ea11 <- as_epi_archive(edf2, compactify = FALSE) + expect_equal(key(ea11$DT), c("geo_value", "time_value", "version")) + expect_equal(ea11$additional_metadata, list()) + + ea12 <- as_epi_archive(edf2, other_keys = "misc", additional_metadata = list(value = df$misc), compactify = FALSE) + expect_equal(key(ea12$DT), c("geo_value", "time_value", "misc", "version")) + expect_equal(ea12$additional_metadata, list(value = df$misc)) }) 
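# (Editorial sketch, not part of the patch: the minimal construction pattern
# the tests above exercise; the toy values are made up.)
toy <- tibble::tibble(
  geo_value = "ca",
  time_value = as.Date("2020-01-01"),
  version = as.Date("2020-01-01") + 0:2, # three successive revisions
  value = c(10, 12, 11)
)
ea_toy <- as_epi_archive(toy, compactify = FALSE)
data.table::key(ea_toy$DT) # "geo_value" "time_value" "version"
ea_toy$as_of(as.Date("2020-01-02")) # snapshot as of the second revision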
 test_that("`epi_archive` rejects nonunique keys", {
-  toy_update_tbl =
+  toy_update_tbl <-
     tibble::tribble(
-      ~geo_value, ~age_group, ~time_value, ~version, ~value,
-      "us", "adult", "2000-01-01", "2000-01-02", 121,
-      "us", "adult", "2000-01-01", "2000-01-03", 125, # (revision)
-      "us", "adult", "2000-01-02", "2000-01-03", 130,
-      "us", "pediatric", "2000-01-01", "2000-01-02", 5
+      ~geo_value, ~age_group, ~time_value, ~version, ~value,
+      "us", "adult", "2000-01-01", "2000-01-02", 121,
+      "us", "adult", "2000-01-01", "2000-01-03", 125, # (revision)
+      "us", "adult", "2000-01-02", "2000-01-03", 130,
+      "us", "pediatric", "2000-01-01", "2000-01-02", 5
     ) %>%
-    mutate(age_group = ordered(age_group, c("pediatric", "adult")),
-           time_value = as.Date(time_value),
-           version = as.Date(version))
+    mutate(
+      age_group = ordered(age_group, c("pediatric", "adult")),
+      time_value = as.Date(time_value),
+      version = as.Date(version)
+    )
   expect_error(
     as_epi_archive(toy_update_tbl),
     class = "epiprocess__epi_archive_requires_unique_key"
diff --git a/tests/testthat/test-compactify.R b/tests/testthat/test-compactify.R
index f8d956c0..4400c94a 100644
--- a/tests/testthat/test-compactify.R
+++ b/tests/testthat/test-compactify.R
@@ -3,22 +3,22 @@ library(data.table)
 library(dplyr)
 
 dt <- archive_cases_dv_subset$DT
-dt <- filter(dt,geo_value == "ca") %>%
+dt <- filter(dt, geo_value == "ca") %>%
   filter(version <= "2020-06-15") %>%
   select(-case_rate_7d_av)
 
 test_that("Input for compactify must be NULL or a boolean", {
-  expect_error(as_epi_archive(dt,compactify="no"))
+  expect_error(as_epi_archive(dt, compactify = "no"))
 })
 
 dt$percent_cli <- c(1:80)
 dt$case_rate <- c(1:80)
 
-row_replace <- function(dt,row,x,y) {
+row_replace <- function(dt, row, x, y) {
   # (This way of "replacing" elements appears to use copy-on-write even though
   # we are working with a data.table.)
-  dt[row,4] <- x
-  dt[row,5] <- y
+  dt[row, 4] <- x
+  dt[row, 5] <- y
   dt
 }
 
@@ -26,7 +26,7 @@
 # observation carried forward)
 
 # Row 1 should not be eliminated even if NA
-dt <- row_replace(dt,1,NA,NA) # Not LOCF
+dt <- row_replace(dt, 1, NA, NA) # Not LOCF
 
 # NOTE! We are assuming that there are no NA's in geo_value, time_value,
 # and version. 
Even though compactify may erroneously remove the first row @@ -34,71 +34,71 @@ dt <- row_replace(dt,1,NA,NA) # Not LOCF # has problems beyond the scope of this test # Rows 11 and 12 correspond to different time_values -dt <- row_replace(dt,12,11,11) # Not LOCF +dt <- row_replace(dt, 12, 11, 11) # Not LOCF # Rows 20 and 21 only differ in version -dt <- row_replace(dt,21,20,20) # LOCF +dt <- row_replace(dt, 21, 20, 20) # LOCF # Rows 21 and 22 only differ in version -dt <- row_replace(dt,22,20,20) # LOCF +dt <- row_replace(dt, 22, 20, 20) # LOCF # Row 39 comprises the first NA's -dt <- row_replace(dt,39,NA,NA) # Not LOCF +dt <- row_replace(dt, 39, NA, NA) # Not LOCF # Row 40 has two NA's, just like its lag, row 39 -dt <- row_replace(dt,40,NA,NA) # LOCF +dt <- row_replace(dt, 40, NA, NA) # LOCF # Row 62's values already exist in row 15, but row 15 is not a preceding row -dt <- row_replace(dt,62,15,15) # Not LOCF +dt <- row_replace(dt, 62, 15, 15) # Not LOCF # Row 73 only has one value carried over -dt <- row_replace(dt,74,73,74) # Not LOCF +dt <- row_replace(dt, 74, 73, 74) # Not LOCF -dt_true <- as_tibble(as_epi_archive(dt,compactify=TRUE)$DT) -dt_false <- as_tibble(as_epi_archive(dt,compactify=FALSE)$DT) -dt_null <- suppressWarnings(as_tibble(as_epi_archive(dt,compactify=NULL)$DT)) +dt_true <- as_tibble(as_epi_archive(dt, compactify = TRUE)$DT) +dt_false <- as_tibble(as_epi_archive(dt, compactify = FALSE)$DT) +dt_null <- suppressWarnings(as_tibble(as_epi_archive(dt, compactify = NULL)$DT)) test_that("Warning for LOCF with compactify as NULL", { - expect_warning(as_epi_archive(dt,compactify=NULL)) + expect_warning(as_epi_archive(dt, compactify = NULL)) }) test_that("No warning when there is no LOCF", { - expect_warning(as_epi_archive(dt[1:5],compactify=NULL),NA) + expect_warning(as_epi_archive(dt[1:5], compactify = NULL), NA) }) test_that("LOCF values are ignored with compactify=FALSE", { - expect_identical(nrow(dt),nrow(dt_false)) + expect_identical(nrow(dt), nrow(dt_false)) }) test_that("LOCF values are taken out with compactify=TRUE", { - dt_test <- as_tibble(as_epi_archive(dt[-c(21,22,40),],compactify=FALSE)$DT) - - expect_identical(dt_true,dt_null) - expect_identical(dt_null,dt_test) + dt_test <- as_tibble(as_epi_archive(dt[-c(21, 22, 40), ], compactify = FALSE)$DT) + + expect_identical(dt_true, dt_null) + expect_identical(dt_null, dt_test) }) test_that("as_of produces the same results with compactify=TRUE as with compactify=FALSE", { - ea_true <- as_epi_archive(dt,compactify=TRUE) - ea_false <- as_epi_archive(dt,compactify=FALSE) - + ea_true <- as_epi_archive(dt, compactify = TRUE) + ea_false <- as_epi_archive(dt, compactify = FALSE) + # Row 22, an LOCF row corresponding to the latest version, is omitted in # ea_true - latest_version = max(ea_false$DT$version) - as_of_true <- ea_true$as_of(latest_version) + latest_version <- max(ea_false$DT$version) + as_of_true <- ea_true$as_of(latest_version) as_of_false <- ea_false$as_of(latest_version) - - expect_identical(as_of_true,as_of_false) + + expect_identical(as_of_true, as_of_false) }) test_that("compactify does not alter the default clobberable and observed version bounds", { - x = tibble::tibble( + x <- tibble::tibble( geo_value = "geo1", time_value = as.Date("2000-01-01"), version = as.Date("2000-01-01") + 1:5, value = 42L ) - ea_true <- as_epi_archive(x, compactify=TRUE) - ea_false <- as_epi_archive(x, compactify=FALSE) + ea_true <- as_epi_archive(x, compactify = TRUE) + ea_false <- as_epi_archive(x, compactify = FALSE) # We say that 
we base the bounds on the user's `x` arg. We might mess up or # change our minds and base things on the `DT` field (or a temporary `DT` # variable, post-compactify) instead. Check that this test would trigger diff --git a/tests/testthat/test-correlation.R b/tests/testthat/test-correlation.R index b7b22dc6..fe129616 100644 --- a/tests/testthat/test-correlation.R +++ b/tests/testthat/test-correlation.R @@ -1,34 +1,39 @@ library(tibble) -test_that("epi_cor throws an error for a non-epi_df for its first argument",{ - expect_error(epi_cor(1:10,1,1)) - expect_error(epi_cor(data.frame(x=1:10),1,1)) +test_that("epi_cor throws an error for a non-epi_df for its first argument", { + expect_error(epi_cor(1:10, 1, 1)) + expect_error(epi_cor(data.frame(x = 1:10), 1, 1)) }) -test_that("epi_cor requires two var arguments, var1 and var2",{ - expect_error(epi_cor(archive_cases_dv_subset$DT,var2=1)) - expect_error(epi_cor(archive_cases_dv_subset$DT,var1=1)) +test_that("epi_cor requires two var arguments, var1 and var2", { + expect_error(epi_cor(archive_cases_dv_subset$DT, var2 = 1)) + expect_error(epi_cor(archive_cases_dv_subset$DT, var1 = 1)) }) -test_that("epi_cor functions as intended",{ - expect_equal(epi_cor(x = jhu_csse_daily_subset, - var1 = case_rate_7d_av, - var2 = death_rate_7d_av, - cor_by = geo_value, - dt1 = -2)[1], - tibble(geo_value = unique(jhu_csse_daily_subset$geo_value)) +test_that("epi_cor functions as intended", { + expect_equal( + epi_cor( + x = jhu_csse_daily_subset, + var1 = case_rate_7d_av, + var2 = death_rate_7d_av, + cor_by = geo_value, + dt1 = -2 + )[1], + tibble(geo_value = unique(jhu_csse_daily_subset$geo_value)) ) - - edf <- as_epi_df(data.frame(geo_value=rep("asdf",20), - time_value=as.Date("2020-01-01") + 1:20, - pos=1:20, - neg=-(1:20))) - expect_equal(epi_cor(edf, pos, pos)[[2]],1) - expect_equal(epi_cor(edf, pos, neg)[[2]],-1) + + edf <- as_epi_df(data.frame( + geo_value = rep("asdf", 20), + time_value = as.Date("2020-01-01") + 1:20, + pos = 1:20, + neg = -(1:20) + )) + expect_equal(epi_cor(edf, pos, pos)[[2]], 1) + expect_equal(epi_cor(edf, pos, neg)[[2]], -1) }) -test_that("shift works as intended",{ - expect_identical(epiprocess:::shift(1:100,1),dplyr::lead(1:100)) - expect_identical(epiprocess:::shift(1:100,0),1:100) - expect_identical(epiprocess:::shift(1:100,-1),dplyr::lag(1:100)) -}) \ No newline at end of file +test_that("shift works as intended", { + expect_identical(epiprocess:::shift(1:100, 1), dplyr::lead(1:100)) + expect_identical(epiprocess:::shift(1:100, 0), 1:100) + expect_identical(epiprocess:::shift(1:100, -1), dplyr::lag(1:100)) +}) diff --git a/tests/testthat/test-data.R b/tests/testthat/test-data.R index f3d4c9d7..511cc8d7 100644 --- a/tests/testthat/test-data.R +++ b/tests/testthat/test-data.R @@ -5,12 +5,17 @@ test_that("`archive_cases_dv_subset` is formed successfully", { test_that("`delayed_assign_with_unregister_awareness` works as expected on good promises", { # Since we're testing environment stuff, use some "my_" prefixes to try to # prevent naming coincidences from changing behavior. 
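# (A minimal base-R sketch of the promise mechanics these tests rely on,
# assuming nothing about epiprocess internals; `eval_env`/`assign_env` are
# illustrative names. `delayedAssign()` creates a promise in `assign.env`
# whose code runs in `eval.env` the first time the binding is forced, and
# never again; the helper under test should behave the same way on
# well-behaved promises.)
eval_env <- rlang::new_environment(list(x = 40L), rlang::base_env())
assign_env <- rlang::new_environment()
delayedAssign("lazy", x + 2L, eval.env = eval_env, assign.env = assign_env)
assign_env[["lazy"]] # forced here, evaluating `x + 2L` in `eval_env`: 42L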
-  my_eval_env = rlang::new_environment(list(x=40L, n_evals=0L), parent=rlang::base_env())
-  my_assign_env = rlang::new_environment()
-  delayed_assign_with_unregister_awareness("good1", {
-    n_evals <- n_evals + 1L
-    x + 2L
-  }, my_eval_env, my_assign_env)
+  my_eval_env <- rlang::new_environment(list(x = 40L, n_evals = 0L), parent = rlang::base_env())
+  my_assign_env <- rlang::new_environment()
+  delayed_assign_with_unregister_awareness(
+    "good1",
+    {
+      n_evals <- n_evals + 1L
+      x + 2L
+    },
+    my_eval_env,
+    my_assign_env
+  )
   force(my_assign_env[["good1"]])
   force(my_assign_env[["good1"]])
   force(my_assign_env[["good1"]])
@@ -19,12 +24,17 @@ test_that("`delayed_assign_with_unregister_awareness` works as expected on good
 })
 
 test_that("original `delayedAssign` works as expected on good promises", {
-  my_eval_env = rlang::new_environment(list(x=40L, n_evals=0L), parent=rlang::base_env())
-  my_assign_env = rlang::new_environment()
-  delayedAssign("good1", {
-    n_evals <- n_evals + 1L
-    x + 2L
-  }, my_eval_env, my_assign_env)
+  my_eval_env <- rlang::new_environment(list(x = 40L, n_evals = 0L), parent = rlang::base_env())
+  my_assign_env <- rlang::new_environment()
+  delayedAssign(
+    "good1",
+    {
+      n_evals <- n_evals + 1L
+      x + 2L
+    },
+    my_eval_env,
+    my_assign_env
+  )
   force(my_assign_env[["good1"]])
   force(my_assign_env[["good1"]])
   force(my_assign_env[["good1"]])
@@ -33,34 +43,34 @@ test_that("original `delayedAssign` works as expected on good promises", {
 })
 
 test_that("`delayed_assign_with_unregister_awareness` doesn't wrap a buggy promise if not unregistering", {
-  delayed_assign_with_unregister_awareness("x", Abort("msg", class="original_error_class"))
-  expect_error(force(x), class="original_error_class")
+  delayed_assign_with_unregister_awareness("x", Abort("msg", class = "original_error_class"))
+  expect_error(force(x), class = "original_error_class")
 })
 
 test_that("`delayed_assign_with_unregister_awareness` wraps a buggy promise when forced during unregistering", {
-  delayed_assign_with_unregister_awareness("x", Abort("msg", class="original_error_class"))
+  delayed_assign_with_unregister_awareness("x", Abort("msg", class = "original_error_class"))
   # Take advantage of a false positive / hedge against package renaming: make
   # our own `unregister` function to trigger the special error message.
- unregister = function(y) y - expect_error(unregister(force(x)), class="epiprocess__promise_evaluation_error_during_unregister") + unregister <- function(y) y + expect_error(unregister(force(x)), class = "epiprocess__promise_evaluation_error_during_unregister") }) test_that("`delayed_assign_with_unregister_awareness` injection support works", { - my_exprs = rlang::exprs(a = b + c, d = e) + my_exprs <- rlang::exprs(a = b + c, d = e) delayed_assign_with_unregister_awareness( "good2", list(!!!my_exprs), - eval.env=rlang::new_environment(list(b=2L, c=3L, e=4L), rlang::base_env()) + eval.env = rlang::new_environment(list(b = 2L, c = 3L, e = 4L), rlang::base_env()) ) force(good2) - expect_identical(good2, list(a=5L, d=4L)) + expect_identical(good2, list(a = 5L, d = 4L)) }) test_that("`some_package_is_being_unregistered` doesn't fail in response to non-simple calls", { # Prerequisite for current implementation to work (testing here to help debug # in case some R version doesn't obey): expect_false(NA_character_ %in% letters) - f = function() function() some_package_is_being_unregistered() - my_expr = rlang::expr(f()()) + f <- function() function() some_package_is_being_unregistered() + my_expr <- rlang::expr(f()()) # Prerequisite for this to test to actually be testing on non-simple calls: expect_false(rlang::is_call_simple(my_expr)) # Actual test (`FALSE` is correct; `NA` or error is not): diff --git a/tests/testthat/test-deprecations.R b/tests/testthat/test-deprecations.R index 334b4488..5be3824e 100644 --- a/tests/testthat/test-deprecations.R +++ b/tests/testthat/test-deprecations.R @@ -1,48 +1,47 @@ - -test_that("epix_slide group_by= deprecation works",{ +test_that("epix_slide group_by= deprecation works", { expect_error( archive_cases_dv_subset %>% - epix_slide(function(...) {}, before=2L, group_by=c()), + epix_slide(function(...) {}, before = 2L, group_by = c()), class = "epiprocess__epix_slide_group_by_parameter_deprecated" ) expect_error( archive_cases_dv_subset$ - slide(function(...) {}, before=2L, group_by=c()), + slide(function(...) {}, before = 2L, group_by = c()), class = "epiprocess__epix_slide_group_by_parameter_deprecated" ) expect_error( archive_cases_dv_subset %>% group_by(geo_value) %>% - epix_slide(function(...) {}, before=2L, group_by=c()), + epix_slide(function(...) {}, before = 2L, group_by = c()), class = "epiprocess__epix_slide_group_by_parameter_deprecated" ) expect_error( archive_cases_dv_subset$ group_by(geo_value)$ - slide(function(...) {}, before=2L, group_by=c()), + slide(function(...) {}, before = 2L, group_by = c()), class = "epiprocess__epix_slide_group_by_parameter_deprecated" ) # expect_error( archive_cases_dv_subset %>% - epix_slide(function(...) {}, before=2L, all_rows=TRUE), + epix_slide(function(...) {}, before = 2L, all_rows = TRUE), class = "epiprocess__epix_slide_all_rows_parameter_deprecated" ) expect_error( archive_cases_dv_subset$ - slide(function(...) {}, before=2L, all_rows=TRUE), + slide(function(...) {}, before = 2L, all_rows = TRUE), class = "epiprocess__epix_slide_all_rows_parameter_deprecated" ) expect_error( archive_cases_dv_subset %>% group_by(geo_value) %>% - epix_slide(function(...) {}, before=2L, all_rows=TRUE), + epix_slide(function(...) {}, before = 2L, all_rows = TRUE), class = "epiprocess__epix_slide_all_rows_parameter_deprecated" ) expect_error( archive_cases_dv_subset$ group_by(geo_value)$ - slide(function(...) {}, before=2L, all_rows=TRUE), + slide(function(...) 
{}, before = 2L, all_rows = TRUE), class = "epiprocess__epix_slide_all_rows_parameter_deprecated" ) }) diff --git a/tests/testthat/test-epi_slide.R b/tests/testthat/test-epi_slide.R index 2e61e088..e2bbc040 100644 --- a/tests/testthat/test-epi_slide.R +++ b/tests/testthat/test-epi_slide.R @@ -2,116 +2,154 @@ d <- as.Date("2020-01-01") -ungrouped = dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:200, value=1:200), - dplyr::tibble(geo_value = "al", time_value = d + 1:5, value=-(1:5)) +ungrouped <- dplyr::bind_rows( + dplyr::tibble(geo_value = "ak", time_value = d + 1:200, value = 1:200), + dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5)) ) %>% as_epi_df() -grouped = ungrouped %>% +grouped <- ungrouped %>% group_by(geo_value) -small_x = dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value=11:15), - dplyr::tibble(geo_value = "al", time_value = d + 1:5, value=-(1:5)) +small_x <- dplyr::bind_rows( + dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15), + dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5)) ) %>% as_epi_df(as_of = d + 6) %>% group_by(geo_value) -f = function(x, g, t) dplyr::tibble(value=mean(x$value), count=length(x$value)) +f <- function(x, g, t) dplyr::tibble(value = mean(x$value), count = length(x$value)) -toy_edf = tibble::tribble( - ~geo_value, ~time_value, ~value , - "a" , 1:10 , 2L^( 1:10), - "b" , 1:10 , 2L^(11:20), - ) %>% +toy_edf <- tibble::tribble( + ~geo_value, ~time_value, ~value, + "a", 1:10, 2L^(1:10), + "b", 1:10, 2L^(11:20), +) %>% tidyr::unchop(c(time_value, value)) %>% as_epi_df(as_of = 100) ## --- These cases generate errors (or not): --- test_that("`before` and `after` are both vectors of length 1", { - expect_error(epi_slide(grouped, f, before = c(0,1), after = 0, ref_time_values = d+3), - "`before`.*length-1") - expect_error(epi_slide(grouped, f, before = 1, after = c(0,1), ref_time_values = d+3), - "`after`.*length-1") + expect_error( + epi_slide(grouped, f, before = c(0, 1), after = 0, ref_time_values = d + 3), + "`before`.*length-1" + ) + expect_error( + epi_slide(grouped, f, before = 1, after = c(0, 1), ref_time_values = d + 3), + "`after`.*length-1" + ) }) test_that("Test errors/warnings for discouraged features", { - expect_error(epi_slide(grouped, f, ref_time_values = d+1), - "Either or both of `before`, `after` must be provided.") - expect_warning(epi_slide(grouped, f, before = 0L, ref_time_values = d+1), - "`before==0`, `after` missing") - expect_warning(epi_slide(grouped, f, after = 0L, ref_time_values = d+1), - "`before` missing, `after==0`") + expect_error( + epi_slide(grouped, f, ref_time_values = d + 1), + "Either or both of `before`, `after` must be provided." 
+ ) + expect_warning( + epi_slide(grouped, f, before = 0L, ref_time_values = d + 1), + "`before==0`, `after` missing" + ) + expect_warning( + epi_slide(grouped, f, after = 0L, ref_time_values = d + 1), + "`before` missing, `after==0`" + ) # Below cases should raise no errors/warnings: - expect_warning(epi_slide(grouped, f, before = 1L, ref_time_values = d+2),NA) - expect_warning(epi_slide(grouped, f, after = 1L, ref_time_values = d+2),NA) - expect_warning(epi_slide(grouped, f, before = 0L, after = 0L, ref_time_values = d+2),NA) + expect_warning(epi_slide(grouped, f, before = 1L, ref_time_values = d + 2), NA) + expect_warning(epi_slide(grouped, f, after = 1L, ref_time_values = d + 2), NA) + expect_warning(epi_slide(grouped, f, before = 0L, after = 0L, ref_time_values = d + 2), NA) }) -test_that("Both `before` and `after` must be non-NA, non-negative, integer-compatible",{ - expect_error(epi_slide(grouped, f, before = -1L, ref_time_values = d+2L), - "`before`.*non-negative") - expect_error(epi_slide(grouped, f, before = 2L, after = -1L, ref_time_values = d+2L), - "`after`.*non-negative") - expect_error(epi_slide(grouped, f, before = "a", ref_time_values = d+2L), - regexp="before", class="vctrs_error_incompatible_type") - expect_error(epi_slide(grouped, f, before = 1L, after = "a", ref_time_values = d+2L), - regexp="after", class="vctrs_error_incompatible_type") - expect_error(epi_slide(grouped, f, before = 0.5, ref_time_values = d+2L), - regexp="before", class="vctrs_error_incompatible_type") - expect_error(epi_slide(grouped, f, before = 1L, after = 0.5, ref_time_values = d+2L), - regexp="after", class="vctrs_error_incompatible_type") - expect_error(epi_slide(grouped, f, before = NA, after = 1L, ref_time_values = d+2L), - "`before`.*non-NA") - expect_error(epi_slide(grouped, f, before = 1L, after = NA, ref_time_values = d+2L), - "`after`.*non-NA") +test_that("Both `before` and `after` must be non-NA, non-negative, integer-compatible", { + expect_error( + epi_slide(grouped, f, before = -1L, ref_time_values = d + 2L), + "`before`.*non-negative" + ) + expect_error( + epi_slide(grouped, f, before = 2L, after = -1L, ref_time_values = d + 2L), + "`after`.*non-negative" + ) + expect_error(epi_slide(grouped, f, before = "a", ref_time_values = d + 2L), + regexp = "before", class = "vctrs_error_incompatible_type" + ) + expect_error(epi_slide(grouped, f, before = 1L, after = "a", ref_time_values = d + 2L), + regexp = "after", class = "vctrs_error_incompatible_type" + ) + expect_error(epi_slide(grouped, f, before = 0.5, ref_time_values = d + 2L), + regexp = "before", class = "vctrs_error_incompatible_type" + ) + expect_error(epi_slide(grouped, f, before = 1L, after = 0.5, ref_time_values = d + 2L), + regexp = "after", class = "vctrs_error_incompatible_type" + ) + expect_error( + epi_slide(grouped, f, before = NA, after = 1L, ref_time_values = d + 2L), + "`before`.*non-NA" + ) + expect_error( + epi_slide(grouped, f, before = 1L, after = NA, ref_time_values = d + 2L), + "`after`.*non-NA" + ) # Non-integer-class but integer-compatible values are allowed: - expect_error(epi_slide(grouped, f, before = 1, after = 1, ref_time_values = d+2L),NA) + expect_error(epi_slide(grouped, f, before = 1, after = 1, ref_time_values = d + 2L), NA) }) test_that("`ref_time_values` + `before` + `after` that result in no slide data, generate the error", { - expect_error(epi_slide(grouped, f, before=2L, ref_time_values = d), - "All `ref_time_values` must appear in `x\\$time_value`.") # before the first, no data in the slide 
windows
-  expect_error(epi_slide(grouped, f, before=2L, ref_time_values = d+207L),
-               "All `ref_time_values` must appear in `x\\$time_value`.") # beyond the last, no data in window
+  expect_error(
+    epi_slide(grouped, f, before = 2L, ref_time_values = d),
+    "All `ref_time_values` must appear in `x\\$time_value`."
+  ) # before the first, no data in the slide windows
+  expect_error(
+    epi_slide(grouped, f, before = 2L, ref_time_values = d + 207L),
+    "All `ref_time_values` must appear in `x\\$time_value`."
+  ) # beyond the last, no data in window
 })
 
 test_that("`ref_time_values` + `before` + `after` that have some slide data, but generate the error due to ref. time being out of time range (would also happen if they were in between `time_value`s)", {
-  expect_error(epi_slide(grouped, f, before=0L, after=2L, ref_time_values = d),
-               "All `ref_time_values` must appear in `x\\$time_value`.") # before the first, but we'd expect there to be data in the window
-  expect_error(epi_slide(grouped, f, before=2L, ref_time_values = d+201L),
-               "All `ref_time_values` must appear in `x\\$time_value`.") # beyond the last, but still with data in window
+  expect_error(
+    epi_slide(grouped, f, before = 0L, after = 2L, ref_time_values = d),
+    "All `ref_time_values` must appear in `x\\$time_value`."
+  ) # before the first, but we'd expect there to be data in the window
+  expect_error(
+    epi_slide(grouped, f, before = 2L, ref_time_values = d + 201L),
+    "All `ref_time_values` must appear in `x\\$time_value`."
+  ) # beyond the last, but still with data in window
 })
 
 ## --- These cases generate warnings (or not): ---
-test_that("Warn user against having a blank `before`",{
-  expect_warning(epi_slide(grouped, f, after = 1L,
-                           ref_time_values = d+1L), NA)
-  expect_warning(epi_slide(grouped, f, before = 0L, after = 1L,
-                           ref_time_values = d+1L), NA)
+test_that("Warn user against having a blank `before`", {
+  expect_warning(epi_slide(grouped, f,
+    after = 1L,
+    ref_time_values = d + 1L
+  ), NA)
+  expect_warning(epi_slide(grouped, f,
+    before = 0L, after = 1L,
+    ref_time_values = d + 1L
+  ), NA)
 })
 
 ## --- These cases don't generate the error: ---
 test_that("these don't produce an error; the error appears only if the ref time values are out of range for every group", {
-  expect_identical(epi_slide(grouped, f, before=2L, ref_time_values = d+200L) %>%
-                     ungroup() %>%
-                     dplyr::select("geo_value","slide_value_value"),
-                   dplyr::tibble(geo_value = "ak", slide_value_value = 199)) # out of range for one group
-  expect_identical(epi_slide(grouped, f, before=2L, ref_time_values=d+3) %>%
-                     ungroup() %>%
-                     dplyr::select("geo_value","slide_value_value"),
-                   dplyr::tibble(geo_value = c("ak", "al"), slide_value_value = c(2, -2))) # not out of range for either group
+  expect_identical(
+    epi_slide(grouped, f, before = 2L, ref_time_values = d + 200L) %>%
+      ungroup() %>%
+      dplyr::select("geo_value", "slide_value_value"),
+    dplyr::tibble(geo_value = "ak", slide_value_value = 199)
+  ) # out of range for one group
+  expect_identical(
+    epi_slide(grouped, f, before = 2L, ref_time_values = d + 3) %>%
+      ungroup() %>%
+      dplyr::select("geo_value", "slide_value_value"),
+    dplyr::tibble(geo_value = c("ak", "al"), slide_value_value = c(2, -2))
+  ) # not out of range for either group
 })
 
 test_that("computation output formats x as_list_col", {
   # See `toy_edf` definition at top of file.
   # We'll try 7d sum with a few formats.
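# (A sketch of how the expected columns below are constructed, assuming only
# data.table: `frollsum()` with `adaptive = TRUE` takes one window width per
# position, here growing 1..7 and then holding at 7, i.e. a trailing 7-day
# sum whose windows are partial at the start of the series.)
library(data.table)
v <- 2L^(1:10)
frollsum(v, n = c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE)
# element t equals sum(v[max(1, t - 6):t]), e.g. element 10 sums v[4:10]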
- basic_result_from_size1 = tibble::tribble( - ~geo_value, ~time_value, ~value , ~slide_value , - "a" , 1:10 , 2L^( 1:10), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7,rep(7L, 3L)), adaptive=TRUE, na.rm=TRUE), - "b" , 1:10 , 2L^(11:20), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7,rep(7L, 3L)), adaptive=TRUE, na.rm=TRUE), - ) %>% + basic_result_from_size1 <- tibble::tribble( + ~geo_value, ~time_value, ~value, ~slide_value, + "a", 1:10, 2L^(1:10), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE), + "b", 1:10, 2L^(11:20), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE), + ) %>% tidyr::unchop(c(time_value, value, slide_value)) %>% dplyr::arrange(time_value) %>% as_epi_df(as_of = 100) @@ -134,21 +172,25 @@ test_that("computation output formats x as_list_col", { ) # output naming functionality: expect_identical( - toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)), - new_col_name = "result"), + toy_edf %>% epi_slide( + before = 6L, ~ data.frame(value = sum(.x$value)), + new_col_name = "result" + ), basic_result_from_size1 %>% rename(result_value = slide_value) ) expect_identical( - toy_edf %>% epi_slide(before = 6L, ~ data.frame(value_sum = sum(.x$value)), - names_sep = NULL), + toy_edf %>% epi_slide( + before = 6L, ~ data.frame(value_sum = sum(.x$value)), + names_sep = NULL + ), basic_result_from_size1 %>% rename(value_sum = slide_value) ) # trying with non-size-1 computation outputs: - basic_result_from_size2 = tibble::tribble( - ~geo_value, ~time_value, ~value , ~slide_value , - "a" , 1:10 , 2L^( 1:10), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7,rep(7L, 3L)), adaptive=TRUE, na.rm=TRUE), - "b" , 1:10 , 2L^(11:20), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7,rep(7L, 3L)), adaptive=TRUE, na.rm=TRUE) + 1L, - ) %>% + basic_result_from_size2 <- tibble::tribble( + ~geo_value, ~time_value, ~value, ~slide_value, + "a", 1:10, 2L^(1:10), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE), + "b", 1:10, 2L^(11:20), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE) + 1L, + ) %>% tidyr::unchop(c(time_value, value, slide_value)) %>% dplyr::arrange(time_value) %>% as_epi_df(as_of = 100) @@ -172,24 +214,25 @@ test_that("computation output formats x as_list_col", { }) test_that("epi_slide alerts if the provided f doesn't take enough args", { - f_xgt = function(x, g, t) dplyr::tibble(value=mean(x$value), count=length(x$value)) + f_xgt <- function(x, g, t) dplyr::tibble(value = mean(x$value), count = length(x$value)) # If `regexp` is NA, asserts that there should be no errors/messages. - expect_error(epi_slide(grouped, f_xgt, before = 1L, ref_time_values = d+1), regexp = NA) - expect_warning(epi_slide(grouped, f_xgt, before = 1L, ref_time_values = d+1), regexp = NA) + expect_error(epi_slide(grouped, f_xgt, before = 1L, ref_time_values = d + 1), regexp = NA) + expect_warning(epi_slide(grouped, f_xgt, before = 1L, ref_time_values = d + 1), regexp = NA) - f_x_dots = function(x, ...) dplyr::tibble(value=mean(x$value), count=length(x$value)) - expect_warning(epi_slide(grouped, f_x_dots, before = 1L, ref_time_values = d+1), - class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots") + f_x_dots <- function(x, ...) 
dplyr::tibble(value = mean(x$value), count = length(x$value)) + expect_warning(epi_slide(grouped, f_x_dots, before = 1L, ref_time_values = d + 1), + class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots" + ) }) test_that("`ref_time_values` + `all_rows = TRUE` works", { # See `toy_edf` definition at top of file. We'll do variants of a slide # returning the following: - basic_full_result = tibble::tribble( - ~geo_value, ~time_value, ~value , ~slide_value , - "a" , 1:10 , 2L^( 1:10), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7,rep(7L, 3L)), adaptive=TRUE, na.rm=TRUE), - "b" , 1:10 , 2L^(11:20), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7,rep(7L, 3L)), adaptive=TRUE, na.rm=TRUE), - ) %>% + basic_full_result <- tibble::tribble( + ~geo_value, ~time_value, ~value, ~slide_value, + "a", 1:10, 2L^(1:10), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE), + "b", 1:10, 2L^(11:20), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE), + ) %>% tidyr::unchop(c(time_value, value, slide_value)) %>% dplyr::arrange(time_value) %>% as_epi_df(as_of = 100) @@ -199,16 +242,21 @@ test_that("`ref_time_values` + `all_rows = TRUE` works", { basic_full_result ) expect_identical( - toy_edf %>% epi_slide(before = 6L, ~ sum(.x$value), - ref_time_values = c(2L, 8L)), + toy_edf %>% epi_slide( + before = 6L, ~ sum(.x$value), + ref_time_values = c(2L, 8L) + ), basic_full_result %>% dplyr::filter(time_value %in% c(2L, 8L)) ) expect_identical( - toy_edf %>% epi_slide(before = 6L, ~ sum(.x$value), - ref_time_values = c(2L, 8L), all_rows = TRUE), + toy_edf %>% epi_slide( + before = 6L, ~ sum(.x$value), + ref_time_values = c(2L, 8L), all_rows = TRUE + ), basic_full_result %>% dplyr::mutate(slide_value = dplyr::if_else(time_value %in% c(2L, 8L), - slide_value, NA_integer_)) + slide_value, NA_integer_ + )) ) # slide computations returning data frames: expect_identical( @@ -216,64 +264,82 @@ test_that("`ref_time_values` + `all_rows = TRUE` works", { basic_full_result %>% dplyr::rename(slide_value_value = slide_value) ) expect_identical( - toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)), - ref_time_values = c(2L, 8L)), + toy_edf %>% epi_slide( + before = 6L, ~ data.frame(value = sum(.x$value)), + ref_time_values = c(2L, 8L) + ), basic_full_result %>% dplyr::filter(time_value %in% c(2L, 8L)) %>% dplyr::rename(slide_value_value = slide_value) ) expect_identical( - toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)), - ref_time_values = c(2L, 8L), all_rows = TRUE), + toy_edf %>% epi_slide( + before = 6L, ~ data.frame(value = sum(.x$value)), + ref_time_values = c(2L, 8L), all_rows = TRUE + ), basic_full_result %>% dplyr::mutate(slide_value = dplyr::if_else(time_value %in% c(2L, 8L), - slide_value, NA_integer_)) %>% + slide_value, NA_integer_ + )) %>% dplyr::rename(slide_value_value = slide_value) ) # slide computations returning data frames with `as_list_col=TRUE`: expect_identical( - toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)), - as_list_col = TRUE), + toy_edf %>% epi_slide( + before = 6L, ~ data.frame(value = sum(.x$value)), + as_list_col = TRUE + ), basic_full_result %>% dplyr::mutate(slide_value = purrr::map(slide_value, ~ data.frame(value = .x))) ) expect_identical( - toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)), - ref_time_values = c(2L, 8L), - as_list_col = TRUE), + toy_edf %>% epi_slide( + before = 6L, ~ 
data.frame(value = sum(.x$value)), + ref_time_values = c(2L, 8L), + as_list_col = TRUE + ), basic_full_result %>% dplyr::mutate(slide_value = purrr::map(slide_value, ~ data.frame(value = .x))) %>% dplyr::filter(time_value %in% c(2L, 8L)) ) expect_identical( - toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)), - ref_time_values = c(2L, 8L), all_rows = TRUE, - as_list_col = TRUE), + toy_edf %>% epi_slide( + before = 6L, ~ data.frame(value = sum(.x$value)), + ref_time_values = c(2L, 8L), all_rows = TRUE, + as_list_col = TRUE + ), basic_full_result %>% dplyr::mutate(slide_value = purrr::map(slide_value, ~ data.frame(value = .x))) %>% dplyr::mutate(slide_value = dplyr::if_else(time_value %in% c(2L, 8L), - slide_value, list(NULL))) + slide_value, list(NULL) + )) ) # slide computations returning data frames, `as_list_col = TRUE`, `unnest`: expect_identical( - toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)), - as_list_col = TRUE) %>% + toy_edf %>% epi_slide( + before = 6L, ~ data.frame(value = sum(.x$value)), + as_list_col = TRUE + ) %>% unnest(slide_value, names_sep = "_"), basic_full_result %>% dplyr::rename(slide_value_value = slide_value) ) expect_identical( - toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)), - ref_time_values = c(2L, 8L), - as_list_col = TRUE) %>% + toy_edf %>% epi_slide( + before = 6L, ~ data.frame(value = sum(.x$value)), + ref_time_values = c(2L, 8L), + as_list_col = TRUE + ) %>% unnest(slide_value, names_sep = "_"), basic_full_result %>% dplyr::filter(time_value %in% c(2L, 8L)) %>% dplyr::rename(slide_value_value = slide_value) ) expect_identical( - toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)), - ref_time_values = c(2L, 8L), all_rows = TRUE, - as_list_col = TRUE) %>% + toy_edf %>% epi_slide( + before = 6L, ~ data.frame(value = sum(.x$value)), + ref_time_values = c(2L, 8L), all_rows = TRUE, + as_list_col = TRUE + ) %>% unnest(slide_value, names_sep = "_"), basic_full_result %>% # XXX unclear exactly what we want in this case. Current approach is @@ -282,7 +348,7 @@ test_that("`ref_time_values` + `all_rows = TRUE` works", { dplyr::filter(time_value %in% c(2L, 8L)) %>% dplyr::rename(slide_value_value = slide_value) ) - rework_nulls = function(slide_values_list) { + rework_nulls <- function(slide_values_list) { vctrs::vec_assign( slide_values_list, vctrs::vec_detect_missing(slide_values_list), @@ -290,14 +356,17 @@ test_that("`ref_time_values` + `all_rows = TRUE` works", { ) } expect_identical( - toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)), - ref_time_values = c(2L, 8L), all_rows = TRUE, - as_list_col = TRUE) %>% + toy_edf %>% epi_slide( + before = 6L, ~ data.frame(value = sum(.x$value)), + ref_time_values = c(2L, 8L), all_rows = TRUE, + as_list_col = TRUE + ) %>% mutate(slide_value = rework_nulls(slide_value)) %>% unnest(slide_value, names_sep = "_"), basic_full_result %>% dplyr::mutate(slide_value = dplyr::if_else(time_value %in% c(2L, 8L), - slide_value, NA_integer_)) %>% + slide_value, NA_integer_ + )) %>% dplyr::rename(slide_value_value = slide_value) ) }) @@ -314,56 +383,60 @@ test_that("`epi_slide` doesn't decay date output", { test_that("basic grouped epi_slide computation produces expected output", { # Also checks that we correctly remove extra rows and columns (`.real`) used # to recover `ref_time_value`s. 
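# (A base-R sketch of why the expected `slide_value`s below are cumulative
# sums: `before = 50` exceeds the five-day toy series, so every trailing
# window covers the whole history up to the reference time, and a windowed
# sum collapses to `cumsum()`.)
v <- 11:15
win_sum <- sapply(seq_along(v), function(t) sum(v[max(1, t - 50):t]))
stopifnot(identical(win_sum, cumsum(v)))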
- expected_output = dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value=cumsum(11:15)), - dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value=cumsum(-(1:5))) + expected_output <- dplyr::bind_rows( + dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value = cumsum(11:15)), + dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value = cumsum(-(1:5))) ) %>% group_by(geo_value) %>% as_epi_df(as_of = d + 6) # formula - result1 <- epi_slide(small_x, f = ~sum(.x$value), before=50) + result1 <- epi_slide(small_x, f = ~ sum(.x$value), before = 50) expect_identical(result1, expected_output) # function - result2 <- epi_slide(small_x, f = function(x, g, t) sum(x$value), before=50) + result2 <- epi_slide(small_x, f = function(x, g, t) sum(x$value), before = 50) expect_identical(result2, expected_output) # dots - result3 <- epi_slide(small_x, slide_value = sum(value), before=50) + result3 <- epi_slide(small_x, slide_value = sum(value), before = 50) expect_identical(result3, expected_output) }) test_that("ungrouped epi_slide computation completes successfully", { expect_error( small_x %>% - ungroup() %>% - epi_slide(before = 2, - slide_value = sum(.x$value)), - regexp=NA + ungroup() %>% + epi_slide( + before = 2, + slide_value = sum(.x$value) + ), + regexp = NA ) }) test_that("basic ungrouped epi_slide computation produces expected output", { - expected_output = dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value=cumsum(11:15)) + expected_output <- dplyr::bind_rows( + dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value = cumsum(11:15)) ) %>% as_epi_df(as_of = d + 6) result1 <- small_x %>% ungroup() %>% filter(geo_value == "ak") %>% - epi_slide(before = 50, - slide_value = sum(.x$value)) + epi_slide( + before = 50, + slide_value = sum(.x$value) + ) expect_identical(result1, expected_output) # Ungrouped with multiple geos - expected_output = dplyr::bind_rows( + expected_output <- dplyr::bind_rows( dplyr::tibble( - geo_value = "ak", time_value = d + 1:5, value=11:15, slide_value=cumsum(11:15) + cumsum(-(1:5) - )), + geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value = cumsum(11:15) + cumsum(-(1:5)) + ), dplyr::tibble( - geo_value = "al", time_value = d + 1:5, value=-(1:5), slide_value=cumsum(11:15) + cumsum(-(1:5)) + geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value = cumsum(11:15) + cumsum(-(1:5)) ) ) %>% as_epi_df(as_of = d + 6) %>% @@ -371,154 +444,181 @@ test_that("basic ungrouped epi_slide computation produces expected output", { result2 <- small_x %>% ungroup() %>% - epi_slide(before = 50, - slide_value = sum(.x$value)) + epi_slide( + before = 50, + slide_value = sum(.x$value) + ) expect_identical(result2, expected_output) }) test_that("epi_slide computation via formula can use ref_time_value", { - expected_output = dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value=d + 1:5), - dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value=d + 1:5) + expected_output <- dplyr::bind_rows( + dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value = d + 1:5), + dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value = d + 1:5) ) %>% group_by(geo_value) %>% as_epi_df(as_of = d + 6) result1 <- small_x %>% - epi_slide(f = ~ .ref_time_value, - before = 50) 
+ epi_slide( + f = ~.ref_time_value, + before = 50 + ) expect_identical(result1, expected_output) result2 <- small_x %>% - epi_slide(f = ~ .z, - before = 50) + epi_slide( + f = ~.z, + before = 50 + ) expect_identical(result2, expected_output) result3 <- small_x %>% - epi_slide(f = ~ ..3, - before = 50) + epi_slide( + f = ~..3, + before = 50 + ) expect_identical(result3, expected_output) # Ungrouped with multiple geos - expected_output = dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value=d + 1:5), - dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value=d + 1:5) + expected_output <- dplyr::bind_rows( + dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value = d + 1:5), + dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value = d + 1:5) ) %>% as_epi_df(as_of = d + 6) %>% arrange(time_value) result4 <- small_x %>% ungroup() %>% - epi_slide(f = ~ .ref_time_value, - before = 50) + epi_slide( + f = ~.ref_time_value, + before = 50 + ) expect_identical(result4, expected_output) }) test_that("epi_slide computation via function can use ref_time_value", { - expected_output = dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value=d + 1:5), - dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value=d + 1:5) + expected_output <- dplyr::bind_rows( + dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value = d + 1:5), + dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value = d + 1:5) ) %>% group_by(geo_value) %>% as_epi_df(as_of = d + 6) result1 <- small_x %>% - epi_slide(f = function(x, g, t) t, - before = 2) + epi_slide( + f = function(x, g, t) t, + before = 2 + ) expect_identical(result1, expected_output) }) test_that("epi_slide computation via dots can use ref_time_value and group", { # ref_time_value - expected_output = dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value=d + 1:5), - dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value=d + 1:5) + expected_output <- dplyr::bind_rows( + dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value = d + 1:5), + dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value = d + 1:5) ) %>% group_by(geo_value) %>% as_epi_df(as_of = d + 6) result1 <- small_x %>% - epi_slide(before = 50, - slide_value = .ref_time_value) + epi_slide( + before = 50, + slide_value = .ref_time_value + ) expect_identical(result1, expected_output) # `.{x,group_key,ref_time_value}` should be inaccessible from `.data` and # `.env`. 
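# (A sketch of the data-masking distinction behind this check, assuming only
# dplyr; `threshold` and `tbl` are illustrative names. `.data` resolves to
# columns, `.env` to the calling environment, and slide internals such as
# `.ref_time_value` are intentionally reachable through neither pronoun.)
library(dplyr)
threshold <- 10L
tbl <- tibble(threshold = c(5L, 20L))
tbl %>% mutate(from_col = .data$threshold, from_env = .env$threshold)
# from_col is 5, 20 (the column); from_env is 10, 10 (the variable)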
expect_error(small_x %>% - epi_slide(before = 50, - slide_value = .env$.ref_time_value) - ) + epi_slide( + before = 50, + slide_value = .env$.ref_time_value + )) # group_key # Use group_key column - expected_output = dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value="ak"), - dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value="al") + expected_output <- dplyr::bind_rows( + dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value = "ak"), + dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value = "al") ) %>% group_by(geo_value) %>% as_epi_df(as_of = d + 6) result3 <- small_x %>% - epi_slide(before = 2, - slide_value = .group_key$geo_value) + epi_slide( + before = 2, + slide_value = .group_key$geo_value + ) expect_identical(result3, expected_output) # Use entire group_key object - expected_output = dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value=1L), - dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value=1L) + expected_output <- dplyr::bind_rows( + dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value = 1L), + dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value = 1L) ) %>% group_by(geo_value) %>% as_epi_df(as_of = d + 6) result4 <- small_x %>% - epi_slide(before = 2, - slide_value = nrow(.group_key)) + epi_slide( + before = 2, + slide_value = nrow(.group_key) + ) expect_identical(result4, expected_output) # Ungrouped with multiple geos - expected_output = dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value=d + 1:5), - dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value=d + 1:5) + expected_output <- dplyr::bind_rows( + dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value = d + 1:5), + dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value = d + 1:5) ) %>% as_epi_df(as_of = d + 6) %>% arrange(time_value) result5 <- small_x %>% ungroup() %>% - epi_slide(before = 50, - slide_value = .ref_time_value) + epi_slide( + before = 50, + slide_value = .ref_time_value + ) expect_identical(result5, expected_output) }) test_that("epi_slide computation via dots outputs the same result using col names and the data var", { expected_output <- small_x %>% - epi_slide(before = 2, - slide_value = max(time_value)) %>% + epi_slide( + before = 2, + slide_value = max(time_value) + ) %>% as_epi_df(as_of = d + 6) result1 <- small_x %>% - epi_slide(before = 2, - slide_value = max(.x$time_value)) + epi_slide( + before = 2, + slide_value = max(.x$time_value) + ) expect_identical(result1, expected_output) result2 <- small_x %>% - epi_slide(before = 2, - slide_value = max(.data$time_value)) + epi_slide( + before = 2, + slide_value = max(.data$time_value) + ) expect_identical(result2, expected_output) }) test_that("`epi_slide` can access objects inside of helper functions", { - helper = function(archive_haystack, time_value_needle) { + helper <- function(archive_haystack, time_value_needle) { archive_haystack %>% epi_slide(has_needle = time_value_needle %in% time_value, before = 365000L) } expect_error( diff --git a/tests/testthat/test-epix_fill_through_version.R b/tests/testthat/test-epix_fill_through_version.R index 1d78bf49..6b113545 100644 --- a/tests/testthat/test-epix_fill_through_version.R +++ 
b/tests/testthat/test-epix_fill_through_version.R @@ -1,11 +1,12 @@ - test_that("epix_fill_through_version mirrors input when it is sufficiently up to date", { - ea_orig = as_epi_archive(data.table::data.table(geo_value = "g1", time_value = as.Date("2020-01-01"), - version = 1:5, value = 1:5)) - some_earlier_observed_version = 2L - ea_trivial_fill_na1 = epix_fill_through_version(ea_orig, some_earlier_observed_version, "na") - ea_trivial_fill_na2 = epix_fill_through_version(ea_orig, ea_orig$versions_end, "na") - ea_trivial_fill_locf = epix_fill_through_version(ea_orig, some_earlier_observed_version, "locf") + ea_orig <- as_epi_archive(data.table::data.table( + geo_value = "g1", time_value = as.Date("2020-01-01"), + version = 1:5, value = 1:5 + )) + some_earlier_observed_version <- 2L + ea_trivial_fill_na1 <- epix_fill_through_version(ea_orig, some_earlier_observed_version, "na") + ea_trivial_fill_na2 <- epix_fill_through_version(ea_orig, ea_orig$versions_end, "na") + ea_trivial_fill_locf <- epix_fill_through_version(ea_orig, some_earlier_observed_version, "locf") # Below, we want R6 objects to be compared based on contents rather than # addresses. We appear to get this with `expect_identical` in `testthat` # edition 3, which is based on `waldo::compare` rather than `base::identical`; @@ -21,86 +22,102 @@ test_that("epix_fill_through_version mirrors input when it is sufficiently up to }) test_that("epix_fill_through_version can extend observed versions, gives expected `as_of`s", { - ea_orig = as_epi_archive(data.table::data.table( + ea_orig <- as_epi_archive(data.table::data.table( geo_value = "g1", - time_value = as.Date("2020-01-01") + c(rep(0L,5L), 1L), + time_value = as.Date("2020-01-01") + c(rep(0L, 5L), 1L), version = c(1:5, 2L), - value = 1:6)) - first_unobserved_version = 6L - later_unobserved_version = 10L - ea_fill_na = epix_fill_through_version(ea_orig, later_unobserved_version, "na") - ea_fill_locf = epix_fill_through_version(ea_orig, later_unobserved_version, "locf") + value = 1:6 + )) + first_unobserved_version <- 6L + later_unobserved_version <- 10L + ea_fill_na <- epix_fill_through_version(ea_orig, later_unobserved_version, "na") + ea_fill_locf <- epix_fill_through_version(ea_orig, later_unobserved_version, "locf") # We use testthat edition 3 features here, passing `ignore_attr` to # `waldo::compare`. 
Ensure we are using edition 3: testthat::local_edition(3) - withCallingHandlers({ - expect_identical(ea_fill_na$versions_end, later_unobserved_version) - expect_identical(tibble::as_tibble(ea_fill_na$as_of(first_unobserved_version)), - tibble::tibble(geo_value="g1", time_value=as.Date("2020-01-01")+0:1, value=rep(NA_integer_, 2L)), - ignore_attr = TRUE) - expect_identical(ea_fill_locf$versions_end, later_unobserved_version) - expect_identical(ea_fill_locf$as_of(first_unobserved_version), - ea_fill_locf$as_of(ea_orig$versions_end) %>% - {attr(., "metadata")$as_of <- first_unobserved_version; .}) - }, epiprocess__snapshot_as_of_clobberable_version = function(wrn) invokeRestart("muffleWarning")) + withCallingHandlers( + { + expect_identical(ea_fill_na$versions_end, later_unobserved_version) + expect_identical(tibble::as_tibble(ea_fill_na$as_of(first_unobserved_version)), + tibble::tibble(geo_value = "g1", time_value = as.Date("2020-01-01") + 0:1, value = rep(NA_integer_, 2L)), + ignore_attr = TRUE + ) + expect_identical(ea_fill_locf$versions_end, later_unobserved_version) + expect_identical( + ea_fill_locf$as_of(first_unobserved_version), + ea_fill_locf$as_of(ea_orig$versions_end) %>% + { + attr(., "metadata")$as_of <- first_unobserved_version + . + } + ) + }, + epiprocess__snapshot_as_of_clobberable_version = function(wrn) invokeRestart("muffleWarning") + ) }) test_that("epix_fill_through_version does not mutate x", { for (ea_orig in list( # vanilla case - as_epi_archive(data.table::data.table(geo_value = "g1", time_value = as.Date("2020-01-01"), - version = 1:5, value = 1:5)), + as_epi_archive(data.table::data.table( + geo_value = "g1", time_value = as.Date("2020-01-01"), + version = 1:5, value = 1:5 + )), # data.table unique yielding original DT by reference special case (maybe # having only 1 row is the trigger? having no revisions of initial values # doesn't seem sufficient to trigger) - as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=1L, value=10L)) + as_epi_archive(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, value = 10L)) )) { # We want to perform a strict comparison of the contents of `ea_orig` before # and `ea_orig` after. `clone` + `expect_identical` based on waldo would # sort of work, but we might want something stricter. `as.list` + # `identical` plus a check of the DT seems to do the trick. 
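# (A sketch of that snapshot-then-compare pattern, assuming only data.table;
# `DT` here is illustrative. Because data.tables have reference semantics,
# we `copy()` up front, run the operation under test, and then require the
# snapshot and the original to still be `identical`.)
library(data.table)
DT <- data.table(g = "g1", v = 1:5)
DT_snapshot <- copy(DT)
invisible(unique(DT)) # the operation under test must not mutate its input
stopifnot(identical(DT_snapshot, DT))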
- ea_orig_before_as_list = as.list(ea_orig) - ea_orig_DT_before_copy = data.table::copy(ea_orig$DT) - some_unobserved_version = 8L + ea_orig_before_as_list <- as.list(ea_orig) + ea_orig_DT_before_copy <- data.table::copy(ea_orig$DT) + some_unobserved_version <- 8L # - ea_fill_na = epix_fill_through_version(ea_orig, some_unobserved_version, "na") - ea_orig_after_as_list = as.list(ea_orig) + ea_fill_na <- epix_fill_through_version(ea_orig, some_unobserved_version, "na") + ea_orig_after_as_list <- as.list(ea_orig) # use identical, not expect_identical, for the R6-as-list test; latter isn't as strict expect_true(identical(ea_orig_before_as_list, ea_orig_after_as_list)) expect_identical(ea_orig_DT_before_copy, ea_orig$DT) # - ea_fill_locf = epix_fill_through_version(ea_orig, some_unobserved_version, "locf") - ea_orig_after_as_list = as.list(ea_orig) + ea_fill_locf <- epix_fill_through_version(ea_orig, some_unobserved_version, "locf") + ea_orig_after_as_list <- as.list(ea_orig) expect_true(identical(ea_orig_before_as_list, ea_orig_after_as_list)) expect_identical(ea_orig_DT_before_copy, ea_orig$DT) } }) test_that("x$fill_through_version mutates x (if needed)", { - ea = as_epi_archive(data.table::data.table(geo_value = "g1", time_value = as.Date("2020-01-01"), - version = 1:5, value = 1:5)) + ea <- as_epi_archive(data.table::data.table( + geo_value = "g1", time_value = as.Date("2020-01-01"), + version = 1:5, value = 1:5 + )) # We want the contents to change in a substantial way that makes waldo compare # different (if the contents need to change). - ea_before_copies_as_list = lapply(ea, data.table::copy) - some_unobserved_version = 8L + ea_before_copies_as_list <- lapply(ea, data.table::copy) + some_unobserved_version <- 8L ea$fill_through_version(some_unobserved_version, "na") - ea_after_copies_as_list = lapply(ea, data.table::copy) + ea_after_copies_as_list <- lapply(ea, data.table::copy) expect_failure(expect_identical(ea_before_copies_as_list, ea_after_copies_as_list)) }) test_that("{epix_,$}fill_through_version return with expected visibility", { - ea = as_epi_archive(data.table::data.table(geo_value = "g1", time_value = as.Date("2020-01-01"), - version = 1:5, value = 1:5)) + ea <- as_epi_archive(data.table::data.table( + geo_value = "g1", time_value = as.Date("2020-01-01"), + version = 1:5, value = 1:5 + )) expect_true(withVisible(epix_fill_through_version(ea, 10L, "na"))[["visible"]]) expect_false(withVisible(ea$fill_through_version(15L, "na"))[["visible"]]) }) test_that("epix_fill_through_version returns same key & doesn't mutate old DT or its key", { - ea = as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=1L, value=10L)) - old_DT = ea$DT - old_DT_copy = data.table::copy(old_DT) - old_key = data.table::key(ea$DT) + ea <- as_epi_archive(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, value = 10L)) + old_DT <- ea$DT + old_DT_copy <- data.table::copy(old_DT) + old_key <- data.table::key(ea$DT) expect_identical(data.table::key(epix_fill_through_version(ea, 5L, "na")$DT), old_key) expect_identical(data.table::key(epix_fill_through_version(ea, 5L, "locf")$DT), old_key) expect_identical(data.table::key(ea$DT), old_key) diff --git a/tests/testthat/test-epix_merge.R b/tests/testthat/test-epix_merge.R index 265263f0..0ae428e4 100644 --- a/tests/testthat/test-epix_merge.R +++ b/tests/testthat/test-epix_merge.R @@ -1,57 +1,59 @@ - -test_that("epix_merge requires forbids on invalid `y`",{ - ea = archive_cases_dv_subset$clone() - expect_error(epix_merge(ea, 
data.frame(x=1)))
+test_that("epix_merge forbids invalid `y`", {
+  ea <- archive_cases_dv_subset$clone()
+  expect_error(epix_merge(ea, data.frame(x = 1)))
 })
 
 test_that("epix_merge merges and carries forward updates properly", {
-  x = as_epi_archive(
+  x <- as_epi_archive(
     data.table::as.data.table(
-      tibble::tribble(~geo_value, ~time_value, ~version, ~x_value,
-                      # same version set for x and y
-                      "g1", 1L, 1:3, paste0("XA", 1:3),
-                      # versions of x surround those of y + this measurement has
-                      # max update version beyond some others
-                      "g1", 2L, 1:5, paste0("XB", 1:5),
-                      # mirror case
-                      "g1", 3L, 2L, paste0("XC", 2L),
-                      # x has 1 version, y has 0
-                      "g1", 4L, 1L, paste0("XD", 1L),
-                      # non-NA values that should be carried forward
-                      # (version-wise LOCF) in other versions, plus NAs that
-                      # should (similarly) be carried forward as NA (latter
-                      # wouldn't work with an ordinary merge + post-processing
-                      # with `data.table::nafill`)
-                      "g1", 6L, c(1L,3L,5L), paste0("XE", c(1L, NA, 5L))
-      ) %>%
+      tibble::tribble(
+        ~geo_value, ~time_value, ~version, ~x_value,
+        # same version set for x and y
+        "g1", 1L, 1:3, paste0("XA", 1:3),
+        # versions of x surround those of y + this measurement has
+        # max update version beyond some others
+        "g1", 2L, 1:5, paste0("XB", 1:5),
+        # mirror case
+        "g1", 3L, 2L, paste0("XC", 2L),
+        # x has 1 version, y has 0
+        "g1", 4L, 1L, paste0("XD", 1L),
+        # non-NA values that should be carried forward
+        # (version-wise LOCF) in other versions, plus NAs that
+        # should (similarly) be carried forward as NA (latter
+        # wouldn't work with an ordinary merge + post-processing
+        # with `data.table::nafill`)
+        "g1", 6L, c(1L, 3L, 5L), paste0("XE", c(1L, NA, 5L))
+      ) %>%
         tidyr::unchop(c(version, x_value)) %>%
         dplyr::mutate(dplyr::across(c(x_value), ~ dplyr::if_else(grepl("NA", .x), NA_character_, .x)))
     )
   )
-  y = as_epi_archive(
+  y <- as_epi_archive(
     data.table::as.data.table(
-      tibble::tribble(~geo_value, ~time_value, ~version, ~y_value,
-                      "g1", 1L, 1:3, paste0("YA", 1:3),
-                      "g1", 2L, 2L, paste0("YB", 2L),
-                      "g1", 3L, 1:5, paste0("YC", 1:5),
-                      "g1", 5L, 1L, paste0("YD", 1L),
-                      "g1", 6L, 1:5, paste0("YE", 1:5),
-      ) %>%
+      tibble::tribble(
+        ~geo_value, ~time_value, ~version, ~y_value,
+        "g1", 1L, 1:3, paste0("YA", 1:3),
+        "g1", 2L, 2L, paste0("YB", 2L),
+        "g1", 3L, 1:5, paste0("YC", 1:5),
+        "g1", 5L, 1L, paste0("YD", 1L),
+        "g1", 6L, 1:5, paste0("YE", 1:5),
+      ) %>%
        tidyr::unchop(c(version, y_value)) %>%
        dplyr::mutate(dplyr::across(c(y_value), ~ dplyr::if_else(grepl("NA", .x), NA_character_, .x)))
    )
  )
-  xy = epix_merge(x, y)
-  xy_expected = as_epi_archive(
+  xy <- epix_merge(x, y)
+  xy_expected <- as_epi_archive(
     data.table::as.data.table(
-      tibble::tribble(~geo_value, ~time_value, ~version, ~x_value, ~y_value,
-                      "g1", 1L, 1:3, paste0("XA", 1:3), paste0("YA", 1:3),
-                      "g1", 2L, 1:5, paste0("XB", 1:5), paste0("YB", c(NA,2L,2L,2L,2L)),
-                      "g1", 3L, 1:5, paste0("XC", c(NA,2L,2L,2L,2L)), paste0("YC", 1:5),
-                      "g1", 4L, 1L, paste0("XD", 1L), paste0("YD", NA),
-                      "g1", 5L, 1L, paste0("XD", NA), paste0("YD", 1L),
-                      "g1", 6L, 1:5, paste0("XE", c(1L,1L,NA,NA,5L)), paste0("YE", 1:5),
-      ) %>%
+      tibble::tribble(
+        ~geo_value, ~time_value, ~version, ~x_value, ~y_value,
+        "g1", 1L, 1:3, paste0("XA", 1:3), paste0("YA", 1:3),
+        "g1", 2L, 1:5, paste0("XB", 1:5), paste0("YB", c(NA, 2L, 2L, 2L, 2L)),
+        "g1", 3L, 1:5, paste0("XC", c(NA, 2L, 2L, 2L, 2L)), paste0("YC", 1:5),
+        "g1", 4L, 1L, paste0("XD", 1L), paste0("YD", NA),
+        "g1", 5L, 1L, paste0("XD", NA), paste0("YD", 1L),
+        "g1", 6L, 1:5, paste0("XE",
c(1L, 1L, NA, NA, 5L)), paste0("YE", 1:5), + ) %>% tidyr::unchop(c(version, x_value, y_value)) %>% dplyr::mutate(dplyr::across(c(x_value, y_value), ~ dplyr::if_else(grepl("NA", .x), NA_character_, .x))) ) @@ -62,42 +64,44 @@ test_that("epix_merge merges and carries forward updates properly", { expect_identical(xy, xy_expected) }) -test_that('epix_merge forbids and warns on metadata and naming issues', { +test_that("epix_merge forbids and warns on metadata and naming issues", { expect_error( epix_merge( - as_epi_archive(tibble::tibble(geo_value="tx", time_value=1L, version=1L, x_value=1L)), - as_epi_archive(tibble::tibble(geo_value="us", time_value=1L, version=5L, y_value=2L)) + as_epi_archive(tibble::tibble(geo_value = "tx", time_value = 1L, version = 1L, x_value = 1L)), + as_epi_archive(tibble::tibble(geo_value = "us", time_value = 1L, version = 5L, y_value = 2L)) ), regexp = "must have the same.*geo_type" ) expect_error( epix_merge( - as_epi_archive(tibble::tibble(geo_value="pa", time_value=1L, version=1L, x_value=1L)), - as_epi_archive(tibble::tibble(geo_value="pa", time_value=as.Date("2020-01-01"), version=5L, y_value=2L)) + as_epi_archive(tibble::tibble(geo_value = "pa", time_value = 1L, version = 1L, x_value = 1L)), + as_epi_archive(tibble::tibble(geo_value = "pa", time_value = as.Date("2020-01-01"), version = 5L, y_value = 2L)) ), regexp = "must have the same.*time_type" ) expect_error( epix_merge( - as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=1L, value=1L)), - as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=1L, value=2L)) + as_epi_archive(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, value = 1L)), + as_epi_archive(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, value = 2L)) ), regexp = "overlapping.*names" ) expect_warning( epix_merge( - as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=1L, x_value=1L), - additional_metadata=list("updates_fetched"=lubridate::ymd_hms("2022-05-01 16:00:00", tz="UTC"))), - as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=1L, y_value=2L)) + as_epi_archive(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, x_value = 1L), + additional_metadata = list("updates_fetched" = lubridate::ymd_hms("2022-05-01 16:00:00", tz = "UTC")) + ), + as_epi_archive(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, y_value = 2L)) ), regexp = "x\\$additional_metadata", class = "epiprocess__epix_merge_ignores_additional_metadata" ) expect_warning( epix_merge( - as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=1L, x_value=1L)), - as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=1L, y_value=2L), - additional_metadata=list("updates_fetched"=lubridate::ymd_hms("2022-05-01 16:00:00", tz="UTC"))) + as_epi_archive(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, x_value = 1L)), + as_epi_archive(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, y_value = 2L), + additional_metadata = list("updates_fetched" = lubridate::ymd_hms("2022-05-01 16:00:00", tz = "UTC")) + ) ), regexp = "y\\$additional_metadata", class = "epiprocess__epix_merge_ignores_additional_metadata" @@ -107,74 +111,78 @@ test_that('epix_merge forbids and warns on metadata and naming issues', { # use `local` to prevent accidentally using the x, y, xy bindings here # elsewhere, while allowing reuse across a couple tests local({ - x = as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=1L, x_value=1L), - 
clobberable_versions_start=1L, versions_end = 10L) - y = as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=1L, y_value=2L), - clobberable_versions_start=3L, versions_end = 10L) - xy = epix_merge(x,y) - test_that('epix_merge considers partially-clobberable row to be clobberable', { + x <- as_epi_archive(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, x_value = 1L), + clobberable_versions_start = 1L, versions_end = 10L + ) + y <- as_epi_archive(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, y_value = 2L), + clobberable_versions_start = 3L, versions_end = 10L + ) + xy <- epix_merge(x, y) + test_that("epix_merge considers partially-clobberable row to be clobberable", { expect_identical(xy$clobberable_versions_start, 1L) }) - test_that('epix_merge result uses versions_end metadata not max version val', { + test_that("epix_merge result uses versions_end metadata not max version val", { expect_identical(xy$versions_end, 10L) }) }) local({ - x = as_epi_archive( - tibble::tibble(geo_value=1L, time_value=1L, version=1L, x_value=10L), + x <- as_epi_archive( + tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, x_value = 10L), clobberable_versions_start = 1L, versions_end = 3L ) - y = as_epi_archive( - tibble::tibble(geo_value=1L, time_value=1L, version=5L, y_value=20L), + y <- as_epi_archive( + tibble::tibble(geo_value = 1L, time_value = 1L, version = 5L, y_value = 20L), clobberable_versions_start = 1L ) test_that('epix_merge forbids on sync default or "forbid"', { - expect_error(epix_merge(x,y), - class="epiprocess__epix_merge_unresolved_sync") - expect_error(epix_merge(x,y, sync = "forbid"), - class="epiprocess__epix_merge_unresolved_sync") + expect_error(epix_merge(x, y), + class = "epiprocess__epix_merge_unresolved_sync" + ) + expect_error(epix_merge(x, y, sync = "forbid"), + class = "epiprocess__epix_merge_unresolved_sync" + ) }) test_that('epix_merge sync="na" works', { expect_equal( - epix_merge(x,y, sync = "na"), + epix_merge(x, y, sync = "na"), as_epi_archive(tibble::tribble( ~geo_value, ~time_value, ~version, ~x_value, ~y_value, - 1L, 1L, 1L, 10L, NA_integer_, # x updated, y not observed yet + 1L, 1L, 1L, 10L, NA_integer_, # x updated, y not observed yet 1L, 1L, 4L, NA_integer_, NA_integer_, # NA-ing out x, y not observed yet - 1L, 1L, 5L, NA_integer_, 20L, # x still NA, y updated + 1L, 1L, 5L, NA_integer_, 20L, # x still NA, y updated # (we should not have a y vals -> NA update here; version 5 should be # the `versions_end` of the result) - ), clobberable_versions_start=1L) + ), clobberable_versions_start = 1L) ) }) test_that('epix_merge sync="locf" works', { expect_equal( - epix_merge(x,y, sync = "locf"), + epix_merge(x, y, sync = "locf"), as_epi_archive(tibble::tribble( ~geo_value, ~time_value, ~version, ~x_value, ~y_value, - 1L, 1L, 1L, 10L, NA_integer_, # x updated, y not observed yet + 1L, 1L, 1L, 10L, NA_integer_, # x updated, y not observed yet 1L, 1L, 5L, 10L, 20L, # x LOCF'd, y updated - ), clobberable_versions_start=1L) + ), clobberable_versions_start = 1L) ) }) test_that('epix_merge sync="truncate" works', { expect_equal( - epix_merge(x,y, sync = "truncate"), + epix_merge(x, y, sync = "truncate"), as_epi_archive(tibble::tribble( ~geo_value, ~time_value, ~version, ~x_value, ~y_value, 1L, 1L, 1L, 10L, NA_integer_, # x updated, y not observed yet # y's update beyond x's last update has been truncated - ), clobberable_versions_start=1L, versions_end=3L) + ), clobberable_versions_start = 1L, versions_end = 3L) ) }) - 
x_no_conflict = as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=1L, x_value=10L)) - y_no_conflict = as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=1L, y_value=20L)) - xy_no_conflict_expected = as_epi_archive(tibble::tribble( + x_no_conflict <- as_epi_archive(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, x_value = 10L)) + y_no_conflict <- as_epi_archive(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, y_value = 20L)) + xy_no_conflict_expected <- as_epi_archive(tibble::tribble( ~geo_value, ~time_value, ~version, ~x_value, ~y_value, - 1L, 1L, 1L, 10L, 20L, # x updated, y not observed yet - )) + 1L, 1L, 1L, 10L, 20L, # x updated, y not observed yet + )) test_that('epix_merge sync="forbid" on no-conflict works', { expect_equal( epix_merge(x_no_conflict, y_no_conflict, sync = "forbid"), @@ -208,8 +216,8 @@ local({ test_that('epix_merge sync="na" balks if do not know next_after', { expect_error( epix_merge( - as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=as.POSIXct(as.Date("2020-01-01")), x_value=10L)), - as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=as.POSIXct(as.Date("2020-01-02")), y_value=20L)), + as_epi_archive(tibble::tibble(geo_value = 1L, time_value = 1L, version = as.POSIXct(as.Date("2020-01-01")), x_value = 10L)), + as_epi_archive(tibble::tibble(geo_value = 1L, time_value = 1L, version = as.POSIXct(as.Date("2020-01-02")), y_value = 20L)), sync = "na" ), regexp = "no applicable method.*next_after" diff --git a/tests/testthat/test-epix_slide.R b/tests/testthat/test-epix_slide.R index 9e091642..b3fff13d 100644 --- a/tests/testthat/test-epix_slide.R +++ b/tests/testthat/test-epix_slide.R @@ -1,45 +1,55 @@ library(dplyr) test_that("epix_slide only works on an epi_archive", { - expect_error(epix_slide(data.frame(x=1))) + expect_error(epix_slide(data.frame(x = 1))) }) -x <- tibble::tribble(~version, ~time_value, ~binary, - 4, c(1:3), 2^(1:3), - 5, c(1:2,4), 2^(4:6), - 6, c(1:2,4:5), 2^(7:10), - 7, 2:6, 2^(11:15)) %>% - tidyr::unnest(c(time_value,binary)) +x <- tibble::tribble( + ~version, ~time_value, ~binary, + 4, c(1:3), 2^(1:3), + 5, c(1:2, 4), 2^(4:6), + 6, c(1:2, 4:5), 2^(7:10), + 7, 2:6, 2^(11:15) +) %>% + tidyr::unnest(c(time_value, binary)) -xx <- bind_cols(geo_value = rep("x",15), x) %>% +xx <- bind_cols(geo_value = rep("x", 15), x) %>% as_epi_archive() test_that("epix_slide works as intended", { xx1 <- xx %>% group_by(.data$geo_value) %>% - epix_slide(f = ~ sum(.x$binary), - before = 2, - new_col_name = "sum_binary") - - xx2 <- tibble(geo_value = rep("x",4), - time_value = c(4,5,6,7), - sum_binary = c(2^3+2^2, - 2^6+2^3, - 2^10+2^9, - 2^15+2^14)) %>% + epix_slide( + f = ~ sum(.x$binary), + before = 2, + new_col_name = "sum_binary" + ) + + xx2 <- tibble( + geo_value = rep("x", 4), + time_value = c(4, 5, 6, 7), + sum_binary = c( + 2^3 + 2^2, + 2^6 + 2^3, + 2^10 + 2^9, + 2^15 + 2^14 + ) + ) %>% group_by(geo_value) - - expect_identical(xx1,xx2) # * - + + expect_identical(xx1, xx2) # * + xx3 <- ( xx $group_by(dplyr::across(dplyr::all_of("geo_value"))) - $slide(f = ~ sum(.x$binary), - before = 2, - new_col_name = 'sum_binary') + $slide( + f = ~ sum(.x$binary), + before = 2, + new_col_name = "sum_binary" + ) ) - - expect_identical(xx1,xx3) # This and * imply xx2 and xx3 are identical + + expect_identical(xx1, xx3) # This and * imply xx2 and xx3 are identical # function interface xx4 <- xx %>% @@ -47,158 +57,199 @@ test_that("epix_slide works as intended", { epix_slide(f = 
function(x, gk, rtv) { tibble::tibble(sum_binary = sum(x$binary)) }, before = 2, names_sep = NULL) - - expect_identical(xx1,xx4) + + expect_identical(xx1, xx4) # tidyeval interface xx5 <- xx %>% group_by(.data$geo_value) %>% - epix_slide(sum_binary = sum(binary), - before = 2) - - expect_identical(xx1,xx5) + epix_slide( + sum_binary = sum(binary), + before = 2 + ) + + expect_identical(xx1, xx5) }) -test_that("epix_slide works as intended with `as_list_col=TRUE`",{ +test_that("epix_slide works as intended with `as_list_col=TRUE`", { xx_dfrow1 <- xx %>% group_by(.data$geo_value) %>% - epix_slide(f = ~ data.frame(bin_sum = sum(.x$binary)), - before = 2, - as_list_col = TRUE) - + epix_slide( + f = ~ data.frame(bin_sum = sum(.x$binary)), + before = 2, + as_list_col = TRUE + ) + xx_dfrow2 <- tibble( - geo_value = rep("x",4), - time_value = c(4,5,6,7), + geo_value = rep("x", 4), + time_value = c(4, 5, 6, 7), slide_value = - c(2^3+2^2, - 2^6+2^3, - 2^10+2^9, - 2^15+2^14) %>% - purrr::map(~ data.frame(bin_sum = .x)) + c( + 2^3 + 2^2, + 2^6 + 2^3, + 2^10 + 2^9, + 2^15 + 2^14 + ) %>% + purrr::map(~ data.frame(bin_sum = .x)) ) %>% group_by(geo_value) - - expect_identical(xx_dfrow1,xx_dfrow2) # * - + + expect_identical(xx_dfrow1, xx_dfrow2) # * + xx_dfrow3 <- ( xx $group_by(dplyr::across(dplyr::all_of("geo_value"))) - $slide(f = ~ data.frame(bin_sum = sum(.x$binary)), - before = 2, - as_list_col = TRUE) + $slide( + f = ~ data.frame(bin_sum = sum(.x$binary)), + before = 2, + as_list_col = TRUE + ) ) - - expect_identical(xx_dfrow1,xx_dfrow3) # This and * Imply xx_dfrow2 and xx_dfrow3 are identical - + + expect_identical(xx_dfrow1, xx_dfrow3) # This and * Imply xx_dfrow2 and xx_dfrow3 are identical + xx_df1 <- xx %>% group_by(.data$geo_value) %>% - epix_slide(f = ~ data.frame(bin = .x$binary), - before = 2, - as_list_col = TRUE) - + epix_slide( + f = ~ data.frame(bin = .x$binary), + before = 2, + as_list_col = TRUE + ) + xx_df2 <- tibble( - geo_value = rep("x",4), - time_value = c(4,5,6,7), + geo_value = rep("x", 4), + time_value = c(4, 5, 6, 7), slide_value = - list(c(2^3,2^2), - c(2^6,2^3), - c(2^10,2^9), - c(2^15,2^14)) %>% - purrr::map(~ data.frame(bin = rev(.x))) + list( + c(2^3, 2^2), + c(2^6, 2^3), + c(2^10, 2^9), + c(2^15, 2^14) + ) %>% + purrr::map(~ data.frame(bin = rev(.x))) ) %>% group_by(geo_value) - - expect_identical(xx_df1,xx_df2) + + expect_identical(xx_df1, xx_df2) xx_scalar1 <- xx %>% group_by(.data$geo_value) %>% - epix_slide(f = ~ sum(.x$binary), - before = 2, - as_list_col = TRUE) - + epix_slide( + f = ~ sum(.x$binary), + before = 2, + as_list_col = TRUE + ) + xx_scalar2 <- tibble( - geo_value = rep("x",4), - time_value = c(4,5,6,7), + geo_value = rep("x", 4), + time_value = c(4, 5, 6, 7), slide_value = - list(2^3+2^2, - 2^6+2^3, - 2^10+2^9, - 2^15+2^14) + list( + 2^3 + 2^2, + 2^6 + 2^3, + 2^10 + 2^9, + 2^15 + 2^14 + ) ) %>% group_by(geo_value) - - expect_identical(xx_scalar1,xx_scalar2) - + + expect_identical(xx_scalar1, xx_scalar2) + xx_vec1 <- xx %>% group_by(.data$geo_value) %>% - epix_slide(f = ~ .x$binary, - before = 2, - as_list_col = TRUE) - + epix_slide( + f = ~ .x$binary, + before = 2, + as_list_col = TRUE + ) + xx_vec2 <- tibble( - geo_value = rep("x",4), - time_value = c(4,5,6,7), - slide_value = - list(c(2^3,2^2), - c(2^6,2^3), - c(2^10,2^9), - c(2^15,2^14)) %>% - purrr::map(rev) + geo_value = rep("x", 4), + time_value = c(4, 5, 6, 7), + slide_value = + list( + c(2^3, 2^2), + c(2^6, 2^3), + c(2^10, 2^9), + c(2^15, 2^14) + ) %>% + purrr::map(rev) ) %>% group_by(geo_value) 
- - expect_identical(xx_vec1,xx_vec2) + + expect_identical(xx_vec1, xx_vec2) }) test_that("epix_slide `before` validation works", { - expect_error(xx$slide(f = ~ sum(.x$binary)), - "`before` is required") - expect_error(xx$slide(f = ~ sum(.x$binary), before=NA), - "`before`.*NA") - expect_error(xx$slide(f = ~ sum(.x$binary), before=-1), - "`before`.*negative") - expect_error(xx$slide(f = ~ sum(.x$binary), before=1.5), - regexp="before", - class="vctrs_error_incompatible_type") + expect_error( + xx$slide(f = ~ sum(.x$binary)), + "`before` is required" + ) + expect_error( + xx$slide(f = ~ sum(.x$binary), before = NA), + "`before`.*NA" + ) + expect_error( + xx$slide(f = ~ sum(.x$binary), before = -1), + "`before`.*negative" + ) + expect_error(xx$slide(f = ~ sum(.x$binary), before = 1.5), + regexp = "before", + class = "vctrs_error_incompatible_type" + ) # We might want to allow this at some point (issue #219): - expect_error(xx$slide(f = ~ sum(.x$binary), before=Inf), - regexp="before", - class="vctrs_error_incompatible_type") + expect_error(xx$slide(f = ~ sum(.x$binary), before = Inf), + regexp = "before", + class = "vctrs_error_incompatible_type" + ) # (wrapper shouldn't introduce a value:) expect_error(epix_slide(xx, f = ~ sum(.x$binary)), "`before` is required") # These `before` values should be accepted: - expect_error(xx$slide(f = ~ sum(.x$binary), before=0), - NA) - expect_error(xx$slide(f = ~ sum(.x$binary), before=2L), - NA) - expect_error(xx$slide(f = ~ sum(.x$binary), before=365000), - NA) + expect_error( + xx$slide(f = ~ sum(.x$binary), before = 0), + NA + ) + expect_error( + xx$slide(f = ~ sum(.x$binary), before = 2L), + NA + ) + expect_error( + xx$slide(f = ~ sum(.x$binary), before = 365000), + NA + ) }) test_that("quosure passing issue in epix_slide is resolved + other potential issues", { # (First part adapted from @examples) time_values <- seq(as.Date("2020-06-01"), - as.Date("2020-06-02"), - by = "1 day") + as.Date("2020-06-02"), + by = "1 day" + ) # We only have one non-version, non-time key in the example archive. Add # another so that we don't accidentally pass tests due to accidentally # matching the default grouping. 
- ea = as_epi_archive(archive_cases_dv_subset$DT %>% - dplyr::mutate(modulus = seq_len(nrow(.)) %% 5L), - other_keys = "modulus", - compactify = TRUE) - reference_by_modulus = ea %>% + ea <- as_epi_archive( + archive_cases_dv_subset$DT %>% + dplyr::mutate(modulus = seq_len(nrow(.)) %% 5L), + other_keys = "modulus", + compactify = TRUE + ) + reference_by_modulus <- ea %>% group_by(modulus) %>% - epix_slide(f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = time_values, - new_col_name = 'case_rate_3d_av') - reference_by_neither = ea %>% + epix_slide( + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ) + reference_by_neither <- ea %>% group_by() %>% - epix_slide(f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = time_values, - new_col_name = 'case_rate_3d_av') + epix_slide( + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ) # test the passing-something-that-must-be-enquosed behavior: # # (S3 group_by behavior for this case is the `reference_by_modulus`) @@ -207,17 +258,19 @@ test_that("quosure passing issue in epix_slide is resolved + other potential iss f = ~ mean(.x$case_rate_7d_av), before = 2, ref_time_values = time_values, - new_col_name = 'case_rate_3d_av' + new_col_name = "case_rate_3d_av" ), reference_by_modulus ) # test the .data pronoun behavior: expect_identical( - epix_slide(x = ea %>% group_by(.data$modulus), - f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = time_values, - new_col_name = 'case_rate_3d_av'), + epix_slide( + x = ea %>% group_by(.data$modulus), + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ), reference_by_modulus ) expect_identical( @@ -225,17 +278,19 @@ test_that("quosure passing issue in epix_slide is resolved + other potential iss f = ~ mean(.x$case_rate_7d_av), before = 2, ref_time_values = time_values, - new_col_name = 'case_rate_3d_av' + new_col_name = "case_rate_3d_av" ), reference_by_modulus ) # test the passing across-all-of-string-literal behavior: expect_identical( - epix_slide(x = ea %>% group_by(dplyr::across(all_of("modulus"))), - f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = time_values, - new_col_name = 'case_rate_3d_av'), + epix_slide( + x = ea %>% group_by(dplyr::across(all_of("modulus"))), + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ), reference_by_modulus ) expect_identical( @@ -243,18 +298,20 @@ test_that("quosure passing issue in epix_slide is resolved + other potential iss f = ~ mean(.x$case_rate_7d_av), before = 2, ref_time_values = time_values, - new_col_name = 'case_rate_3d_av' + new_col_name = "case_rate_3d_av" ), reference_by_modulus ) # test the passing-across-all-of-string-var behavior: - my_group_by = "modulus" + my_group_by <- "modulus" expect_identical( - epix_slide(x = ea %>% group_by(dplyr::across(tidyselect::all_of(my_group_by))), - f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = time_values, - new_col_name = 'case_rate_3d_av'), + epix_slide( + x = ea %>% group_by(dplyr::across(tidyselect::all_of(my_group_by))), + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ), reference_by_modulus ) expect_identical( @@ -262,17 +319,19 @@ test_that("quosure passing issue in epix_slide is resolved + other potential 
iss f = ~ mean(.x$case_rate_7d_av), before = 2, ref_time_values = time_values, - new_col_name = 'case_rate_3d_av' + new_col_name = "case_rate_3d_av" ), reference_by_modulus ) # test the default behavior (default in this case should just be grouping by neither): expect_identical( - epix_slide(x = ea, - f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = time_values, - new_col_name = 'case_rate_3d_av'), + epix_slide( + x = ea, + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ), reference_by_neither ) expect_identical( @@ -280,20 +339,22 @@ test_that("quosure passing issue in epix_slide is resolved + other potential iss f = ~ mean(.x$case_rate_7d_av), before = 2, ref_time_values = time_values, - new_col_name = 'case_rate_3d_av' + new_col_name = "case_rate_3d_av" ), reference_by_neither ) }) -ea <- tibble::tribble(~version, ~time_value, ~binary, - 2, 1:1, 2^(1:1), - 3, 1:2, 2^(2:1), - 4, 1:3, 2^(3:1), - 5, 1:4, 2^(4:1), - 6, 1:5, 2^(5:1), - 7, 1:6, 2^(6:1)) %>% - tidyr::unnest(c(time_value,binary)) %>% +ea <- tibble::tribble( + ~version, ~time_value, ~binary, + 2, 1:1, 2^(1:1), + 3, 1:2, 2^(2:1), + 4, 1:3, 2^(3:1), + 5, 1:4, 2^(4:1), + 6, 1:5, 2^(5:1), + 7, 1:6, 2^(6:1) +) %>% + tidyr::unnest(c(time_value, binary)) %>% mutate(geo_value = "x") %>% as_epi_archive() @@ -305,67 +366,79 @@ test_that("epix_slide with all_versions option has access to all older versions" testthat::local_edition(3) slide_fn <- function(x, gk, rtv) { - return(tibble(n_versions = length(unique(x$DT$version)), - n_row = nrow(x$DT), - dt_class1 = class(x$DT)[[1L]], - dt_key = list(key(x$DT)))) + return(tibble( + n_versions = length(unique(x$DT$version)), + n_row = nrow(x$DT), + dt_class1 = class(x$DT)[[1L]], + dt_key = list(key(x$DT)) + )) } - ea_orig_mirror = ea$clone(deep=TRUE) + ea_orig_mirror <- ea$clone(deep = TRUE) ea_orig_mirror$DT <- copy(ea_orig_mirror$DT) - result1 <- ea %>% group_by() %>% - epix_slide(f = slide_fn, - before = 10^3, - names_sep = NULL, - all_versions = TRUE) + result1 <- ea %>% + group_by() %>% + epix_slide( + f = slide_fn, + before = 10^3, + names_sep = NULL, + all_versions = TRUE + ) expect_true(inherits(result1, "tbl_df")) result2 <- tibble::tribble( - ~time_value, ~n_versions, ~n_row, ~dt_class1, ~dt_key, - 2, 1L, sum(1:1), "data.table", key(ea$DT), - 3, 2L, sum(1:2), "data.table", key(ea$DT), - 4, 3L, sum(1:3), "data.table", key(ea$DT), - 5, 4L, sum(1:4), "data.table", key(ea$DT), - 6, 5L, sum(1:5), "data.table", key(ea$DT), - 7, 6L, sum(1:6), "data.table", key(ea$DT), - ) + ~time_value, ~n_versions, ~n_row, ~dt_class1, ~dt_key, + 2, 1L, sum(1:1), "data.table", key(ea$DT), + 3, 2L, sum(1:2), "data.table", key(ea$DT), + 4, 3L, sum(1:3), "data.table", key(ea$DT), + 5, 4L, sum(1:4), "data.table", key(ea$DT), + 6, 5L, sum(1:5), "data.table", key(ea$DT), + 7, 6L, sum(1:6), "data.table", key(ea$DT), + ) - expect_identical(result1,result2) # * + expect_identical(result1, result2) # * result3 <- ( ea $group_by() - $slide(f = slide_fn, - before = 10^3, - names_sep = NULL, - all_versions = TRUE) + $slide( + f = slide_fn, + before = 10^3, + names_sep = NULL, + all_versions = TRUE + ) ) - expect_identical(result1,result3) # This and * Imply result2 and result3 are identical + expect_identical(result1, result3) # This and * Imply result2 and result3 are identical # formula interface - result4 <- ea %>% group_by() %>% - epix_slide(f = ~ slide_fn(.x, .y), - before = 10^3, - names_sep = NULL, - all_versions = TRUE) + result4 
<- ea %>% + group_by() %>% + epix_slide( + f = ~ slide_fn(.x, .y), + before = 10^3, + names_sep = NULL, + all_versions = TRUE + ) - expect_identical(result1,result4) # This and * Imply result2 and result4 are identical + expect_identical(result1, result4) # This and * Imply result2 and result4 are identical # tidyeval interface result5 <- ea %>% group_by() %>% - epix_slide(data = slide_fn( - .data$clone(), # hack to convert from pronoun back to archive - stop("slide_fn doesn't use group key, no need to prepare it") - ), - before = 10^3, - names_sep = NULL, - all_versions = TRUE) + epix_slide( + data = slide_fn( + .data$clone(), # hack to convert from pronoun back to archive + stop("slide_fn doesn't use group key, no need to prepare it") + ), + before = 10^3, + names_sep = NULL, + all_versions = TRUE + ) - expect_identical(result1,result5) # This and * Imply result2 and result5 are identical + expect_identical(result1, result5) # This and * Imply result2 and result5 are identical expect_identical(ea, ea_orig_mirror) # We shouldn't have mutated ea }) @@ -376,21 +449,21 @@ test_that("as_of and epix_slide with long enough window are compatible", { # For all_versions = FALSE: - f1 = function(x, gk, rtv) { + f1 <- function(x, gk, rtv) { tibble( diff_mean = mean(diff(x$binary)) ) } - ref_time_value1 = 5 + ref_time_value1 <- 5 expect_identical( - ea$as_of(ref_time_value1) %>% f1() %>% mutate(time_value = ref_time_value1, .before=1L), - ea$slide(f1, before=1000L, ref_time_values=ref_time_value1, names_sep=NULL) + ea$as_of(ref_time_value1) %>% f1() %>% mutate(time_value = ref_time_value1, .before = 1L), + ea$slide(f1, before = 1000L, ref_time_values = ref_time_value1, names_sep = NULL) ) # For all_versions = TRUE: - f2 = function(x, gk, rtv) { + f2 <- function(x, gk, rtv) { x %>% # extract time&version-lag-1 data: epix_slide( @@ -400,81 +473,95 @@ test_that("as_of and epix_slide with long enough window are compatible", { filter(time_value == attr(subx, "metadata")$as_of - 1) %>% rename(real_time_value = time_value, lag1 = binary) )) - }, before = 1, names_sep = NULL + }, + before = 1, names_sep = NULL ) %>% # assess as nowcast: unnest(data) %>% inner_join(x$as_of(x$versions_end), by = setdiff(key(x$DT), c("version"))) %>% summarize(mean_abs_delta = mean(abs(binary - lag1))) } - ref_time_value2 = 5 + ref_time_value2 <- 5 expect_identical( - ea$as_of(ref_time_value2, all_versions=TRUE) %>% f2() %>% mutate(time_value = ref_time_value2, .before=1L), - ea$slide(f2, before=1000L, ref_time_values=ref_time_value2, all_versions=TRUE, names_sep=NULL) + ea$as_of(ref_time_value2, all_versions = TRUE) %>% f2() %>% mutate(time_value = ref_time_value2, .before = 1L), + ea$slide(f2, before = 1000L, ref_time_values = ref_time_value2, all_versions = TRUE, names_sep = NULL) ) # Test the same sort of thing when grouping by geo in an archive with multiple geos. 
- ea_multigeo = ea$clone() - ea_multigeo$DT <- rbind(ea_multigeo$DT, - copy(ea_multigeo$DT)[,geo_value:="y"][,binary:=-binary][]) + ea_multigeo <- ea$clone() + ea_multigeo$DT <- rbind( + ea_multigeo$DT, + copy(ea_multigeo$DT)[, geo_value := "y"][, binary := -binary][] + ) setkeyv(ea_multigeo$DT, key(ea$DT)) expect_identical( ea_multigeo %>% group_by(geo_value) %>% - epix_slide(f2, before=1000L, ref_time_values=ref_time_value2, all_versions=TRUE, names_sep=NULL) %>% + epix_slide(f2, before = 1000L, ref_time_values = ref_time_value2, all_versions = TRUE, names_sep = NULL) %>% filter(geo_value == "x"), ea %>% # using `ea` here is like filtering `ea_multigeo` to `geo_value=="x"` - epix_as_of(ref_time_value2, all_versions=TRUE) %>% + epix_as_of(ref_time_value2, all_versions = TRUE) %>% f2() %>% transmute(geo_value = "x", time_value = ref_time_value2, mean_abs_delta) %>% group_by(geo_value) ) }) -test_that("epix_slide `f` is passed an ungrouped `epi_archive` when `all_versions=TRUE`",{ +test_that("epix_slide `f` is passed an ungrouped `epi_archive` when `all_versions=TRUE`", { slide_fn <- function(x, gk, rtv) { expect_true(is_epi_archive(x)) return(NA) } - ea %>% group_by() %>% - epix_slide(f = slide_fn, - before = 1, - ref_time_values = 5, - new_col_name = "out", - all_versions = TRUE) + ea %>% + group_by() %>% + epix_slide( + f = slide_fn, + before = 1, + ref_time_values = 5, + new_col_name = "out", + all_versions = TRUE + ) }) test_that("epix_slide with all_versions option works as intended", { xx1 <- xx %>% group_by(.data$geo_value) %>% - epix_slide(f = ~ sum(.x$DT$binary), - before = 2, - new_col_name = "sum_binary", - all_versions = TRUE) - - xx2 <- tibble(geo_value = rep("x",4), - time_value = c(4,5,6,7), - sum_binary = c(2^3+2^2, - 2^6+2^3, - 2^10+2^9+2^6, - 2^15+2^14+2^10)) %>% + epix_slide( + f = ~ sum(.x$DT$binary), + before = 2, + new_col_name = "sum_binary", + all_versions = TRUE + ) + + xx2 <- tibble( + geo_value = rep("x", 4), + time_value = c(4, 5, 6, 7), + sum_binary = c( + 2^3 + 2^2, + 2^6 + 2^3, + 2^10 + 2^9 + 2^6, + 2^15 + 2^14 + 2^10 + ) + ) %>% group_by(geo_value) - expect_identical(xx1,xx2) # * + expect_identical(xx1, xx2) # * xx3 <- ( xx $group_by(dplyr::across(dplyr::all_of("geo_value"))) - $slide(f = ~ sum(.x$DT$binary), - before = 2, - new_col_name = 'sum_binary', - all_versions = TRUE) + $slide( + f = ~ sum(.x$DT$binary), + before = 2, + new_col_name = "sum_binary", + all_versions = TRUE + ) ) - expect_identical(xx1,xx3) # This and * Imply xx2 and xx3 are identical + expect_identical(xx1, xx3) # This and * Imply xx2 and xx3 are identical }) # XXX currently, we're using a stopgap measure of having `epix_slide` always @@ -498,7 +585,7 @@ test_that("epix_slide with all_versions option works as intended", { # }) test_that("epix_slide works with 0-row computation outputs", { - epix_slide_empty = function(ea, ...) { + epix_slide_empty <- function(ea, ...) 
{ ea %>% epix_slide(before = 5L, ..., function(x, gk, rtv) { tibble::tibble() @@ -521,13 +608,13 @@ test_that("epix_slide works with 0-row computation outputs", { ) %>% # new_epi_df(geo_type = ea$geo_type, time_type = ea$time_type, # as_of = ea$versions_end) %>% - group_by(geo_value) + group_by(geo_value) ) # with `all_versions=TRUE`, we have something similar but never get an # `epi_df`: expect_identical( ea %>% - epix_slide_empty(all_versions=TRUE), + epix_slide_empty(all_versions = TRUE), tibble::tibble( time_value = ea$DT$version[integer(0)] ) @@ -535,7 +622,7 @@ test_that("epix_slide works with 0-row computation outputs", { expect_identical( ea %>% group_by(geo_value) %>% - epix_slide_empty(all_versions=TRUE), + epix_slide_empty(all_versions = TRUE), tibble::tibble( geo_value = ea$DT$geo_value[integer(0)], time_value = ea$DT$version[integer(0)] @@ -563,87 +650,104 @@ test_that("epix_slide works with 0-row computation outputs", { # }) test_that("epix_slide alerts if the provided f doesn't take enough args", { - f_xgt = function(x, g, t) dplyr::tibble(value=mean(x$binary), count=length(x$binary)) + f_xgt <- function(x, g, t) dplyr::tibble(value = mean(x$binary), count = length(x$binary)) # If `regexp` is NA, asserts that there should be no errors/messages. expect_error(epix_slide(xx, f = f_xgt, before = 2L), regexp = NA) expect_warning(epix_slide(xx, f = f_xgt, before = 2L), regexp = NA) - f_x_dots = function(x, ...) dplyr::tibble(value=mean(x$binary), count=length(x$binary)) + f_x_dots <- function(x, ...) dplyr::tibble(value = mean(x$binary), count = length(x$binary)) expect_warning(epix_slide(xx, f_x_dots, before = 2L), - class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots") + class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots" + ) }) test_that("epix_slide computation via formula can use ref_time_value", { - xx_ref <- tibble(geo_value = rep("x",4), - time_value = c(4,5,6,7), - slide_value = c(4,5,6,7) - ) %>% + xx_ref <- tibble( + geo_value = rep("x", 4), + time_value = c(4, 5, 6, 7), + slide_value = c(4, 5, 6, 7) + ) %>% group_by(geo_value) xx1 <- xx %>% group_by(.data$geo_value) %>% - epix_slide(f = ~ .ref_time_value, - before = 2) + epix_slide( + f = ~.ref_time_value, + before = 2 + ) expect_identical(xx1, xx_ref) xx2 <- xx %>% group_by(.data$geo_value) %>% - epix_slide(f = ~ .z, - before = 2) + epix_slide( + f = ~.z, + before = 2 + ) expect_identical(xx2, xx_ref) xx3 <- xx %>% group_by(.data$geo_value) %>% - epix_slide(f = ~ ..3, - before = 2) + epix_slide( + f = ~..3, + before = 2 + ) expect_identical(xx3, xx_ref) }) test_that("epix_slide computation via function can use ref_time_value", { - xx_ref <- tibble(geo_value = rep("x",4), - time_value = c(4,5,6,7), - slide_value = c(4,5,6,7) - ) %>% + xx_ref <- tibble( + geo_value = rep("x", 4), + time_value = c(4, 5, 6, 7), + slide_value = c(4, 5, 6, 7) + ) %>% group_by(geo_value) xx1 <- xx %>% group_by(.data$geo_value) %>% - epix_slide(f = function(x, g, t) t, - before = 2) + epix_slide( + f = function(x, g, t) t, + before = 2 + ) expect_identical(xx1, xx_ref) }) test_that("epix_slide computation via dots can use ref_time_value and group", { # ref_time_value - xx_ref <- tibble(geo_value = rep("x",4), - time_value = c(4,5,6,7), - slide_value = c(4,5,6,7) - ) %>% + xx_ref <- tibble( + geo_value = rep("x", 4), + time_value = c(4, 5, 6, 7), + slide_value = c(4, 5, 6, 7) + ) %>% group_by(geo_value) xx1 <- xx %>% group_by(.data$geo_value) %>% - epix_slide(before = 2, - slide_value = 
.ref_time_value) + epix_slide( + before = 2, + slide_value = .ref_time_value + ) expect_identical(xx1, xx_ref) # group_key - xx_ref <- tibble(geo_value = rep("x",4), - time_value = c(4,5,6,7), - slide_value = "x" - ) %>% + xx_ref <- tibble( + geo_value = rep("x", 4), + time_value = c(4, 5, 6, 7), + slide_value = "x" + ) %>% group_by(geo_value) # Use group_key column xx3 <- xx %>% group_by(.data$geo_value) %>% - epix_slide(before = 2, - slide_value = .group_key$geo_value) + epix_slide( + before = 2, + slide_value = .group_key$geo_value + ) expect_identical(xx3, xx_ref) @@ -651,8 +755,10 @@ test_that("epix_slide computation via dots can use ref_time_value and group", { expect_error( xx %>% group_by(.data$geo_value) %>% - epix_slide(before = 2, - slide_value = nrow(.group_key)), + epix_slide( + before = 2, + slide_value = nrow(.group_key) + ), NA ) }) @@ -660,20 +766,26 @@ test_that("epix_slide computation via dots can use ref_time_value and group", { test_that("epix_slide computation via dots outputs the same result using col names and the data var", { xx_ref <- xx %>% group_by(.data$geo_value) %>% - epix_slide(before = 2, - sum_binary = sum(time_value)) + epix_slide( + before = 2, + sum_binary = sum(time_value) + ) xx1 <- xx %>% group_by(.data$geo_value) %>% - epix_slide(before = 2, - sum_binary = sum(.x$time_value)) + epix_slide( + before = 2, + sum_binary = sum(.x$time_value) + ) expect_identical(xx1, xx_ref) xx2 <- xx %>% group_by(.data$geo_value) %>% - epix_slide(before = 2, - sum_binary = sum(.data$time_value)) + epix_slide( + before = 2, + sum_binary = sum(.data$time_value) + ) expect_identical(xx2, xx_ref) }) @@ -691,7 +803,7 @@ test_that("`epix_slide` doesn't decay date output", { }) test_that("`epix_slide` can access objects inside of helper functions", { - helper = function(archive_haystack, time_value_needle) { + helper <- function(archive_haystack, time_value_needle) { archive_haystack %>% epix_slide(has_needle = time_value_needle %in% time_value, before = 365000L) } expect_error( diff --git a/tests/testthat/test-grouped_epi_archive.R b/tests/testthat/test-grouped_epi_archive.R index 0423352e..68e7c76d 100644 --- a/tests/testthat/test-grouped_epi_archive.R +++ b/tests/testthat/test-grouped_epi_archive.R @@ -1,17 +1,19 @@ test_that("Grouping, regrouping, and ungrouping archives works as intended", { # From an example: library(dplyr) - toy_archive = + toy_archive <- tribble( - ~geo_value, ~age_group, ~time_value, ~version, ~value, - "us", "adult", "2000-01-01", "2000-01-02", 121, - "us", "pediatric", "2000-01-02", "2000-01-03", 5, # (addition) - "us", "adult", "2000-01-01", "2000-01-03", 125, # (revision) - "us", "adult", "2000-01-02", "2000-01-03", 130 # (addition) + ~geo_value, ~age_group, ~time_value, ~version, ~value, + "us", "adult", "2000-01-01", "2000-01-02", 121, + "us", "pediatric", "2000-01-02", "2000-01-03", 5, # (addition) + "us", "adult", "2000-01-01", "2000-01-03", 125, # (revision) + "us", "adult", "2000-01-02", "2000-01-03", 130 # (addition) + ) %>% + mutate( + age_group = ordered(age_group, c("pediatric", "adult")), + time_value = as.Date(time_value), + version = as.Date(version) ) %>% - mutate(age_group = ordered(age_group, c("pediatric", "adult")), - time_value = as.Date(time_value), - version = as.Date(version)) %>% as_epi_archive(other_keys = "age_group") # Ensure that we're using testthat edition 3's idea of "identical", which is @@ -19,12 +21,12 @@ test_that("Grouping, regrouping, and ungrouping archives works as intended", { testthat::local_edition(3) # 
Test equivalency claims in example: - by_both_keys = toy_archive %>% group_by(geo_value, age_group) + by_both_keys <- toy_archive %>% group_by(geo_value, age_group) expect_identical( by_both_keys, - toy_archive %>% group_by(geo_value) %>% group_by(age_group, .add=TRUE) + toy_archive %>% group_by(geo_value) %>% group_by(age_group, .add = TRUE) ) - grouping_cols = c("geo_value", "age_group") + grouping_cols <- c("geo_value", "age_group") expect_identical( by_both_keys, toy_archive %>% group_by(across(all_of(grouping_cols))) @@ -37,52 +39,66 @@ test_that("Grouping, regrouping, and ungrouping archives works as intended", { # Test `.drop` behavior: expect_error(toy_archive %>% group_by(.drop = "bogus"), - regexp = "\\.drop.*TRUE or FALSE") - expect_warning(toy_archive %>% group_by(.drop=FALSE), - class="epiprocess__group_by_epi_archive__drop_FALSE_no_factors") - expect_warning(toy_archive %>% group_by(geo_value, .drop=FALSE), - class="epiprocess__group_by_epi_archive__drop_FALSE_no_factors") - expect_warning(grouped_factor_then_nonfactor <- - toy_archive %>% group_by(age_group, geo_value, .drop=FALSE), - class="epiprocess__group_by_epi_archive__drop_FALSE_nonfactor_after_factor") - expect_identical(grouped_factor_then_nonfactor %>% - epix_slide(before = 10, s = sum(value)), - tibble::tribble( - ~age_group, ~geo_value, ~time_value, ~s, - "pediatric", NA_character_, "2000-01-02", 0, - "adult", "us", "2000-01-02", 121, - "pediatric", "us", "2000-01-03", 5, - "adult", "us", "2000-01-03", 255) %>% - mutate(age_group = ordered(age_group, c("pediatric", "adult")), - time_value = as.Date(time_value)) %>% - # # See - # # https://github.com/cmu-delphi/epiprocess/pull/290#issuecomment-1489099157 - # # and - # # https://github.com/cmu-delphi/epiprocess/pull/311#issuecomment-1535149256 - # # for why this is commented out, pending some design - # # decisions. 
- # # - # as_epi_df(geo_type = "nation", # bug; want "custom" from NA; issue #242 - # as_of = as.Date("2000-01-03"), - # additional_metadata = list(other_keys = "age_group")) %>% - # # put back in expected order; see issue #166: - # select(age_group, geo_value, time_value, s) %>% - group_by(age_group, geo_value, .drop=FALSE)) - expect_identical(toy_archive %>% - group_by(geo_value, age_group, .drop=FALSE) %>% - epix_slide(before = 10, s = sum(value)), - tibble::tribble( - ~geo_value, ~age_group, ~time_value, ~s, - "us", "pediatric", "2000-01-02", 0, - "us", "adult", "2000-01-02", 121, - "us", "pediatric", "2000-01-03", 5, - "us", "adult", "2000-01-03", 255) %>% - mutate(age_group = ordered(age_group, c("pediatric", "adult")), - time_value = as.Date(time_value)) %>% - # as_epi_df(as_of = as.Date("2000-01-03"), - # additional_metadata = list(other_keys = "age_group")) %>% - # # put back in expected order; see issue #166: - # select(geo_value, age_group, time_value, s) %>% - group_by(geo_value, age_group, .drop=FALSE) - ) + regexp = "\\.drop.*TRUE or FALSE" + ) + expect_warning(toy_archive %>% group_by(.drop = FALSE), + class = "epiprocess__group_by_epi_archive__drop_FALSE_no_factors" + ) + expect_warning(toy_archive %>% group_by(geo_value, .drop = FALSE), + class = "epiprocess__group_by_epi_archive__drop_FALSE_no_factors" + ) + expect_warning( + grouped_factor_then_nonfactor <- + toy_archive %>% group_by(age_group, geo_value, .drop = FALSE), + class = "epiprocess__group_by_epi_archive__drop_FALSE_nonfactor_after_factor" + ) + expect_identical( + grouped_factor_then_nonfactor %>% + epix_slide(before = 10, s = sum(value)), + tibble::tribble( + ~age_group, ~geo_value, ~time_value, ~s, + "pediatric", NA_character_, "2000-01-02", 0, + "adult", "us", "2000-01-02", 121, + "pediatric", "us", "2000-01-03", 5, + "adult", "us", "2000-01-03", 255 + ) %>% + mutate( + age_group = ordered(age_group, c("pediatric", "adult")), + time_value = as.Date(time_value) + ) %>% + # # See + # # https://github.com/cmu-delphi/epiprocess/pull/290#issuecomment-1489099157 + # # and + # # https://github.com/cmu-delphi/epiprocess/pull/311#issuecomment-1535149256 + # # for why this is commented out, pending some design + # # decisions. 
+ # # + # as_epi_df(geo_type = "nation", # bug; want "custom" from NA; issue #242 + # as_of = as.Date("2000-01-03"), + # additional_metadata = list(other_keys = "age_group")) %>% + # # put back in expected order; see issue #166: + # select(age_group, geo_value, time_value, s) %>% + group_by(age_group, geo_value, .drop = FALSE) + ) + expect_identical( + toy_archive %>% + group_by(geo_value, age_group, .drop = FALSE) %>% + epix_slide(before = 10, s = sum(value)), + tibble::tribble( + ~geo_value, ~age_group, ~time_value, ~s, + "us", "pediatric", "2000-01-02", 0, + "us", "adult", "2000-01-02", 121, + "us", "pediatric", "2000-01-03", 5, + "us", "adult", "2000-01-03", 255 + ) %>% + mutate( + age_group = ordered(age_group, c("pediatric", "adult")), + time_value = as.Date(time_value) + ) %>% + # as_epi_df(as_of = as.Date("2000-01-03"), + # additional_metadata = list(other_keys = "age_group")) %>% + # # put back in expected order; see issue #166: + # select(geo_value, age_group, time_value, s) %>% + group_by(geo_value, age_group, .drop = FALSE) + ) }) diff --git a/tests/testthat/test-methods-epi_archive.R b/tests/testthat/test-methods-epi_archive.R index 3b692475..7ab63f19 100644 --- a/tests/testthat/test-methods-epi_archive.R +++ b/tests/testthat/test-methods-epi_archive.R @@ -3,27 +3,29 @@ library(dplyr) ea <- archive_cases_dv_subset$clone() ea2_data <- tibble::tribble( - ~geo_value, ~time_value, ~version, ~cases, - "ca", "2020-06-01", "2020-06-01", 1, - "ca", "2020-06-01", "2020-06-02", 2, - # - "ca", "2020-06-02", "2020-06-02", 0, - "ca", "2020-06-02", "2020-06-03", 1, - "ca", "2020-06-02", "2020-06-04", 2, - # - "ca", "2020-06-03", "2020-06-03", 1, - # - "ca", "2020-06-04", "2020-06-04", 4, - ) %>% - dplyr::mutate(dplyr::across(c(time_value, version), as.Date)) + ~geo_value, ~time_value, ~version, ~cases, + "ca", "2020-06-01", "2020-06-01", 1, + "ca", "2020-06-01", "2020-06-02", 2, + # + "ca", "2020-06-02", "2020-06-02", 0, + "ca", "2020-06-02", "2020-06-03", 1, + "ca", "2020-06-02", "2020-06-04", 2, + # + "ca", "2020-06-03", "2020-06-03", 1, + # + "ca", "2020-06-04", "2020-06-04", 4, +) %>% + dplyr::mutate(dplyr::across(c(time_value, version), as.Date)) # epix_as_of tests -test_that("epix_as_of behaves identically to as_of method",{ - expect_identical(epix_as_of(ea,max_version = min(ea$DT$version)), - ea$as_of(max_version = min(ea$DT$version))) +test_that("epix_as_of behaves identically to as_of method", { + expect_identical( + epix_as_of(ea, max_version = min(ea$DT$version)), + ea$as_of(max_version = min(ea$DT$version)) + ) }) -test_that("Errors are thrown due to bad as_of inputs",{ +test_that("Errors are thrown due to bad as_of inputs", { # max_version cannot be of string class rather than date class expect_error(ea$as_of("2020-01-01")) # max_version cannot be later than latest version @@ -32,25 +34,24 @@ test_that("Errors are thrown due to bad as_of inputs",{ expect_error(ea$as_of(c(as.Date("2020-01-01"), as.Date("2020-01-02")))) }) -test_that("Warning against max_version being clobberable",{ +test_that("Warning against max_version being clobberable", { # none by default expect_warning(regexp = NA, ea$as_of(max_version = max(ea$DT$version))) expect_warning(regexp = NA, ea$as_of(max_version = min(ea$DT$version))) # but with `clobberable_versions_start` non-`NA`, yes - ea_with_clobberable = ea$clone() - ea_with_clobberable$clobberable_versions_start = max(ea_with_clobberable$DT$version) + ea_with_clobberable <- ea$clone() + ea_with_clobberable$clobberable_versions_start <- 
max(ea_with_clobberable$DT$version)
   expect_warning(ea_with_clobberable$as_of(max_version = max(ea$DT$version)))
   expect_warning(regexp = NA, ea_with_clobberable$as_of(max_version = min(ea$DT$version)))
 })
 
-test_that("as_of properly grabs the data and doesn't mutate key",{
-
+test_that("as_of properly grabs the data and doesn't mutate key", {
   d <- as.Date("2020-06-01")
 
-  ea2 = ea2_data %>%
+  ea2 <- ea2_data %>%
     as_epi_archive()
 
-  old_key = data.table::key(ea2$DT)
+  old_key <- data.table::key(ea2$DT)
 
   edf_as_of <- ea2 %>%
     epix_as_of(max_version = as.Date("2020-06-03"))
@@ -58,14 +59,14 @@ test_that("as_of properly grabs the data and doesn't mutate key",{
   edf_expected <- as_epi_df(tibble(
     geo_value = "ca",
     time_value = d + 0:2,
-    cases = c(2,1,1)
+    cases = c(2, 1, 1)
   ), as_of = as.Date("2020-06-03"))
 
-  expect_equal(edf_as_of, edf_expected, ignore_attr=c(".internal.selfref", "sorted"))
+  expect_equal(edf_as_of, edf_expected, ignore_attr = c(".internal.selfref", "sorted"))
   expect_equal(data.table::key(ea2$DT), old_key)
 })
 
-test_that("Errors are thrown due to bad epix_truncate_versions_after inputs",{
+test_that("Errors are thrown due to bad epix_truncate_versions_after inputs", {
   # x must be an archive
   expect_error(epix_truncate_versions_after(data.frame(), as.Date("2020-01-01")))
   # max_version cannot be of string class rather than date class
@@ -79,44 +80,41 @@ test_that("Errors are thrown due to bad epix_truncate_versions_after inputs",{
 })
 
 test_that("epix_truncate_version_after properly grabs the data and doesn't mutate key", {
-
-  ea2 = ea2_data %>%
+  ea2 <- ea2_data %>%
     as_epi_archive()
 
-  old_key = data.table::key(ea2$DT)
+  old_key <- data.table::key(ea2$DT)
 
   ea_as_of <- ea2 %>%
     epix_truncate_versions_after(max_version = as.Date("2020-06-02"))
 
-  ea_expected <- ea2_data[1:3,] %>%
+  ea_expected <- ea2_data[1:3, ] %>%
     as_epi_archive()
 
-  expect_equal(ea_as_of, ea_expected, ignore_attr=c(".internal.selfref", "sorted"))
+  expect_equal(ea_as_of, ea_expected, ignore_attr = c(".internal.selfref", "sorted"))
   expect_equal(data.table::key(ea2$DT), old_key)
 })
 
 test_that("epix_truncate_version_after doesn't filter if max_version at latest version", {
-
-  ea2 = ea2_data %>%
+  ea2 <- ea2_data %>%
     as_epi_archive()
 
   ea_expected <- ea2$clone()
 
   ea_as_of <- ea2 %>%
     epix_truncate_versions_after(max_version = as.Date("2020-06-04"))
-  expect_equal(ea_as_of, ea_expected, ignore_attr=c(".internal.selfref", "sorted"))
+  expect_equal(ea_as_of, ea_expected, ignore_attr = c(".internal.selfref", "sorted"))
 })
 
 test_that("epix_truncate_version_after returns the same grouping type as input epi_archive", {
-
-  ea2 = ea2_data %>%
+  ea2 <- ea2_data %>%
     as_epi_archive()
 
   ea_as_of <- ea2 %>%
     epix_truncate_versions_after(max_version = as.Date("2020-06-04"))
-  expect_true(is_epi_archive(ea_as_of, grouped_okay=FALSE))
+  expect_true(is_epi_archive(ea_as_of, grouped_okay = FALSE))
 
-  ea2_grouped = ea2$group_by(geo_value)
+  ea2_grouped <- ea2$group_by(geo_value)
 
   ea_as_of <- ea2_grouped %>%
     epix_truncate_versions_after(max_version = as.Date("2020-06-04"))
@@ -125,10 +123,9 @@ test_that("epix_truncate_version_after returns the same grouping type as input e
 
 test_that("epix_truncate_version_after returns the same groups as input grouped_epi_archive", {
-
-  ea2 = ea2_data %>%
+  ea2 <- ea2_data %>%
     as_epi_archive()
-  ea2 = ea2$group_by(geo_value)
+  ea2 <- ea2$group_by(geo_value)
 
   ea_expected <- ea2$clone()
 
diff --git a/tests/testthat/test-methods-epi_df.R b/tests/testthat/test-methods-epi_df.R
index aeb08ced..c2a6d956 100644
--- 
a/tests/testthat/test-methods-epi_df.R +++ b/tests/testthat/test-methods-epi_df.R @@ -7,17 +7,19 @@ toy_epi_df <- tibble::tibble( length.out = 5 ), times = 2), geo_value = rep(c("ca", "hi"), each = 5), - indic_var1 = as.factor(rep(1:2, times = 5)), + indic_var1 = as.factor(rep(1:2, times = 5)), indic_var2 = as.factor(rep(letters[1:5], times = 2)) -) %>% as_epi_df(additional_metadata = - list(other_keys = c("indic_var1", "indic_var2"))) +) %>% as_epi_df( + additional_metadata = + list(other_keys = c("indic_var1", "indic_var2")) +) -att_toy = attr(toy_epi_df, "metadata") +att_toy <- attr(toy_epi_df, "metadata") test_that("Head and tail do not drop the epi_df class", { - att_head = attr(head(toy_epi_df), "metadata") - att_tail = attr(tail(toy_epi_df), "metadata") - + att_head <- attr(head(toy_epi_df), "metadata") + att_tail <- attr(tail(toy_epi_df), "metadata") + expect_true(is_epi_df(head(toy_epi_df))) expect_true(is_epi_df(tail(toy_epi_df))) expect_identical(att_head$geo_type, att_toy$geo_type) @@ -32,11 +34,10 @@ test_that("Head and tail do not drop the epi_df class", { test_that("Subsetting drops & does not drop the epi_df class appropriately", { - # Row subset - should be epi_df - row_subset = toy_epi_df[1:2, ] - att_row_subset = attr(row_subset, "metadata") - + row_subset <- toy_epi_df[1:2, ] + att_row_subset <- attr(row_subset, "metadata") + expect_true(is_epi_df(row_subset)) expect_equal(nrow(row_subset), 2L) expect_equal(ncol(row_subset), 6L) @@ -44,34 +45,34 @@ test_that("Subsetting drops & does not drop the epi_df class appropriately", { expect_identical(att_row_subset$time_type, att_toy$time_type) expect_identical(att_row_subset$as_of, att_toy$as_of) expect_identical(att_row_subset$other_keys, att_toy$other_keys) - + # Row and col single value - shouldn't be an epi_df - row_col_subset1 = toy_epi_df[1,2] + row_col_subset1 <- toy_epi_df[1, 2] expect_false(is_epi_df(row_col_subset1)) expect_true(tibble::is_tibble(row_col_subset1)) expect_equal(nrow(row_col_subset1), 1L) expect_equal(ncol(row_col_subset1), 1L) - + # Col subset with no time_value - shouldn't be an epi_df - col_subset1 = toy_epi_df[, c(1,3)] - + col_subset1 <- toy_epi_df[, c(1, 3)] + expect_false(is_epi_df(col_subset1)) expect_true(tibble::is_tibble(col_subset1)) expect_equal(nrow(col_subset1), 10L) expect_equal(ncol(col_subset1), 2L) - + # Col subset with no geo_value - shouldn't be an epi_df - col_subset2 = toy_epi_df[, 2:3] - + col_subset2 <- toy_epi_df[, 2:3] + expect_false(is_epi_df(col_subset2)) expect_true(tibble::is_tibble(col_subset2)) expect_equal(nrow(col_subset2), 10L) expect_equal(ncol(col_subset2), 2L) - + # Row and col subset that contains geo_value and time_value - should be epi_df - row_col_subset2 = toy_epi_df[2:3,1:3] - att_row_col_subset2 = attr(row_col_subset2, "metadata") - + row_col_subset2 <- toy_epi_df[2:3, 1:3] + att_row_col_subset2 <- attr(row_col_subset2, "metadata") + expect_true(is_epi_df(row_col_subset2)) expect_equal(nrow(row_col_subset2), 2L) expect_equal(ncol(row_col_subset2), 3L) @@ -82,17 +83,21 @@ test_that("Subsetting drops & does not drop the epi_df class appropriately", { }) test_that("When duplicate cols in subset should abort", { - expect_error(toy_epi_df[, c(2,2:3,4,4,4)], - "Column name(s) time_value, y must not be duplicated.", fixed = T) - expect_error(toy_epi_df[1:4, c(1,2:4,1)], - "Column name(s) geo_value must not be duplicated.", fixed = T) + expect_error(toy_epi_df[, c(2, 2:3, 4, 4, 4)], + "Column name(s) time_value, y must not be duplicated.", + fixed = T + ) + 
expect_error(toy_epi_df[1:4, c(1, 2:4, 1)], + "Column name(s) geo_value must not be duplicated.", + fixed = T + ) }) test_that("Correct metadata when subset includes some of other_keys", { # Only include other_var of indic_var1 - only_indic_var1 = toy_epi_df[, 1:5] - att_only_indic_var1 = attr(only_indic_var1, "metadata") - + only_indic_var1 <- toy_epi_df[, 1:5] + att_only_indic_var1 <- attr(only_indic_var1, "metadata") + expect_true(is_epi_df(only_indic_var1)) expect_equal(nrow(only_indic_var1), 10L) expect_equal(ncol(only_indic_var1), 5L) @@ -100,11 +105,11 @@ test_that("Correct metadata when subset includes some of other_keys", { expect_identical(att_only_indic_var1$time_type, att_toy$time_type) expect_identical(att_only_indic_var1$as_of, att_toy$as_of) expect_identical(att_only_indic_var1$other_keys, att_toy$other_keys[-2]) - + # Only include other_var of indic_var2 - only_indic_var2 = toy_epi_df[, c(1:4,6)] - att_only_indic_var2 = attr(only_indic_var2, "metadata") - + only_indic_var2 <- toy_epi_df[, c(1:4, 6)] + att_only_indic_var2 <- attr(only_indic_var2, "metadata") + expect_true(is_epi_df(only_indic_var2)) expect_equal(nrow(only_indic_var2), 10L) expect_equal(ncol(only_indic_var2), 5L) @@ -112,12 +117,12 @@ test_that("Correct metadata when subset includes some of other_keys", { expect_identical(att_only_indic_var2$time_type, att_toy$time_type) expect_identical(att_only_indic_var2$as_of, att_toy$as_of) expect_identical(att_only_indic_var2$other_keys, att_toy$other_keys[-1]) - + # Including both original other_keys was already tested above }) test_that("Metadata and grouping are dropped by `as_tibble`", { - grouped_converted = toy_epi_df %>% + grouped_converted <- toy_epi_df %>% group_by(geo_value) %>% as_tibble() expect_true( diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R index 8460a5e8..2319d045 100644 --- a/tests/testthat/test-utils.R +++ b/tests/testthat/test-utils.R @@ -1,117 +1,119 @@ -test_that("break_string works properly",{ - expect_equal(break_str("A dog is here", 6),"A dog\nis\nhere") +test_that("break_string works properly", { + expect_equal(break_str("A dog is here", 6), "A dog\nis\nhere") }) -test_that("Abort and Warn work",{ +test_that("Abort and Warn work", { expect_error(Abort("abort")) expect_warning(Warn("warn")) }) -test_that("in_range works",{ - expect_equal(in_range(1,c(2,4)),2) - expect_equal(in_range(3,c(2,4)),3) - expect_equal(in_range(5,c(2,4)),4) +test_that("in_range works", { + expect_equal(in_range(1, c(2, 4)), 2) + expect_equal(in_range(3, c(2, 4)), 3) + expect_equal(in_range(5, c(2, 4)), 4) }) -test_that("new summarizing functions work",{ - x <- c(3,4,5,9,NA) - expect_equal(Min(x),3) - expect_equal(Max(x),9) - expect_equal(Sum(x),21) - expect_equal(Mean(x),5.25) - expect_equal(Median(x),4.5) +test_that("new summarizing functions work", { + x <- c(3, 4, 5, 9, NA) + expect_equal(Min(x), 3) + expect_equal(Max(x), 9) + expect_equal(Sum(x), 21) + expect_equal(Mean(x), 5.25) + expect_equal(Median(x), 4.5) }) -test_that("Other capital letter functions work",{ - x <- c(1,2,3,4,5) - expect_equal(Start(x),1) - expect_equal(End(x),5) - expect_equal(MiddleL(x),3) - expect_equal(MiddleR(x),3) - expect_equal(MiddleL(x[-5]),2) - expect_equal(MiddleR(x[-5]),3) - expect_equal(ExtendL(x),c(1,1,2,3,4,5)) - expect_equal(ExtendR(x),c(1,2,3,4,5,5)) +test_that("Other capital letter functions work", { + x <- c(1, 2, 3, 4, 5) + expect_equal(Start(x), 1) + expect_equal(End(x), 5) + expect_equal(MiddleL(x), 3) + expect_equal(MiddleR(x), 3) + 
expect_equal(MiddleL(x[-5]), 2) + expect_equal(MiddleR(x[-5]), 3) + expect_equal(ExtendL(x), c(1, 1, 2, 3, 4, 5)) + expect_equal(ExtendR(x), c(1, 2, 3, 4, 5, 5)) }) -test_that("guess_geo_type tests for different types of geo_value's",{ +test_that("guess_geo_type tests for different types of geo_value's", { # California, New York - states <- c("ca","ny") - + states <- c("ca", "ny") + # Canada, USA, United Kingdom - nations <- c("ca","us","uk") - + nations <- c("ca", "us", "uk") + # Note: These are just five-number names that may not necessarily be existent # counties - counties <- c("12345","67890") - + counties <- c("12345", "67890") + # HHS regions hhs <- c(1:3) - + # HRR regions - hrr <- c(100,200) - + hrr <- c(100, 200) + # Long numbers should be custom - long_nums <- c(123456789,111222333) - + long_nums <- c(123456789, 111222333) + # Health regions in British Columbia - bc <- c("Vancouver Coastal","Interior","Fraser", - "Northern","Vancouver Island") - + bc <- c( + "Vancouver Coastal", "Interior", "Fraser", + "Northern", "Vancouver Island" + ) + # Long numbers as strings should also be custom - long_num_strings <- c("123456789","111222333") - - expect_equal(guess_geo_type(states),"state") - expect_equal(guess_geo_type(nations),"nation") - expect_equal(guess_geo_type(counties),"county") - expect_equal(guess_geo_type(hhs),"hhs") - expect_equal(guess_geo_type(hrr),"hrr") - expect_equal(guess_geo_type(long_num_strings),"custom") - expect_equal(guess_geo_type(bc),"custom") - expect_equal(guess_geo_type(long_nums),"custom") + long_num_strings <- c("123456789", "111222333") + + expect_equal(guess_geo_type(states), "state") + expect_equal(guess_geo_type(nations), "nation") + expect_equal(guess_geo_type(counties), "county") + expect_equal(guess_geo_type(hhs), "hhs") + expect_equal(guess_geo_type(hrr), "hrr") + expect_equal(guess_geo_type(long_num_strings), "custom") + expect_equal(guess_geo_type(bc), "custom") + expect_equal(guess_geo_type(long_nums), "custom") }) -test_that("guess_time_type works for different types",{ +test_that("guess_time_type works for different types", { days <- as.Date("2022-01-01") + 0:6 weeks <- as.Date("2022-01-01") + 7 * 0:6 yearweeks <- tsibble::yearweek(10) yearmonths <- tsibble::yearmonth(10) yearquarters <- tsibble::yearquarter(10) - - years <- c(1999,2000) - + + years <- c(1999, 2000) + # YYYY-MM-DD is the accepted format not_ymd1 <- "January 1, 2022" not_ymd2 <- "1 January 2022" not_ymd3 <- "1 Jan 2022" - + not_a_date <- "asdf" - - expect_equal(guess_time_type(days),"day") - expect_equal(guess_time_type(weeks),"week") - - expect_equal(guess_time_type(yearweeks),"yearweek") - expect_equal(guess_time_type(yearmonths),"yearmonth") - expect_equal(guess_time_type(yearquarters),"yearquarter") - - expect_equal(guess_time_type(years),"year") - - expect_equal(guess_time_type(not_ymd1),"custom") - expect_equal(guess_time_type(not_ymd2),"custom") - expect_equal(guess_time_type(not_ymd3),"custom") - expect_equal(guess_time_type(not_a_date),"custom") + + expect_equal(guess_time_type(days), "day") + expect_equal(guess_time_type(weeks), "week") + + expect_equal(guess_time_type(yearweeks), "yearweek") + expect_equal(guess_time_type(yearmonths), "yearmonth") + expect_equal(guess_time_type(yearquarters), "yearquarter") + + expect_equal(guess_time_type(years), "year") + + expect_equal(guess_time_type(not_ymd1), "custom") + expect_equal(guess_time_type(not_ymd2), "custom") + expect_equal(guess_time_type(not_ymd3), "custom") + expect_equal(guess_time_type(not_a_date), "custom") }) 
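The expectations above pin down how `geo_value` and `time_value` columns get typed. As a minimal sketch of those heuristics, assuming the internal `guess_geo_type()` and `guess_time_type()` helpers are in scope (they are unexported, so e.g. under `devtools::load_all()`):

```r
# Mirrors the expectations encoded in the tests above; the commented results
# are the types these heuristics are expected to guess.
guess_geo_type(c("ca", "ny")) # two-letter state abbreviations -> "state"
guess_geo_type(c("ca", "us", "uk")) # country codes like "us"/"uk" -> "nation"
guess_geo_type(c("12345", "67890")) # five-character numeric codes -> "county"
guess_time_type(as.Date("2022-01-01") + 0:6) # daily spacing -> "day"
guess_time_type(as.Date("2022-01-01") + 7 * 0:6) # weekly spacing -> "week"
guess_time_type("January 1, 2022") # not in YYYY-MM-DD form -> "custom"
```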
-test_that("enlist works",{ - my_list <- enlist(x=1,y=2,z=3) - expect_equal(my_list$x,1) - expect_true(inherits(my_list,"list")) +test_that("enlist works", { + my_list <- enlist(x = 1, y = 2, z = 3) + expect_equal(my_list$x, 1) + expect_true(inherits(my_list, "list")) }) test_that("assert_sufficient_f_args alerts if the provided f doesn't take enough args", { - f_xgt = function(x, g, t) dplyr::tibble(value=mean(x$binary), count=length(x$binary)) - f_xgt_dots = function(x, g, t, ...) dplyr::tibble(value=mean(x$binary), count=length(x$binary)) + f_xgt <- function(x, g, t) dplyr::tibble(value = mean(x$binary), count = length(x$binary)) + f_xgt_dots <- function(x, g, t, ...) dplyr::tibble(value = mean(x$binary), count = length(x$binary)) # If `regexp` is NA, asserts that there should be no errors/messages. expect_error(assert_sufficient_f_args(f_xgt), regexp = NA) @@ -119,63 +121,76 @@ test_that("assert_sufficient_f_args alerts if the provided f doesn't take enough expect_error(assert_sufficient_f_args(f_xgt_dots), regexp = NA) expect_warning(assert_sufficient_f_args(f_xgt_dots), regexp = NA) - f_x_dots = function(x, ...) dplyr::tibble(value=mean(x$binary), count=length(x$binary)) - f_dots = function(...) dplyr::tibble(value=c(5), count=c(2)) - f_x = function(x) dplyr::tibble(value=mean(x$binary), count=length(x$binary)) - f = function() dplyr::tibble(value=c(5), count=c(2)) + f_x_dots <- function(x, ...) dplyr::tibble(value = mean(x$binary), count = length(x$binary)) + f_dots <- function(...) dplyr::tibble(value = c(5), count = c(2)) + f_x <- function(x) dplyr::tibble(value = mean(x$binary), count = length(x$binary)) + f <- function() dplyr::tibble(value = c(5), count = c(2)) expect_warning(assert_sufficient_f_args(f_x_dots), regexp = ", the group key and reference time value will be included", - class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots") + class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots" + ) expect_warning(assert_sufficient_f_args(f_dots), regexp = ", the window data, group key, and reference time value will be included", - class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots") + class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots" + ) expect_error(assert_sufficient_f_args(f_x), - class = "epiprocess__assert_sufficient_f_args__f_needs_min_args") + class = "epiprocess__assert_sufficient_f_args__f_needs_min_args" + ) expect_error(assert_sufficient_f_args(f), - class = "epiprocess__assert_sufficient_f_args__f_needs_min_args") + class = "epiprocess__assert_sufficient_f_args__f_needs_min_args" + ) - f_xs_dots = function(x, setting="a", ...) dplyr::tibble(value=mean(x$binary), count=length(x$binary)) - f_xs = function(x, setting="a") dplyr::tibble(value=mean(x$binary), count=length(x$binary)) - expect_warning(assert_sufficient_f_args(f_xs_dots, setting="b"), - class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots") - expect_error(assert_sufficient_f_args(f_xs, setting="b"), - class = "epiprocess__assert_sufficient_f_args__f_needs_min_args_plus_forwarded") + f_xs_dots <- function(x, setting = "a", ...) 
dplyr::tibble(value = mean(x$binary), count = length(x$binary)) + f_xs <- function(x, setting = "a") dplyr::tibble(value = mean(x$binary), count = length(x$binary)) + expect_warning(assert_sufficient_f_args(f_xs_dots, setting = "b"), + class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots" + ) + expect_error(assert_sufficient_f_args(f_xs, setting = "b"), + class = "epiprocess__assert_sufficient_f_args__f_needs_min_args_plus_forwarded" + ) expect_error(assert_sufficient_f_args(f_xgt, "b"), - class = "epiprocess__assert_sufficient_f_args__f_needs_min_args_plus_forwarded") + class = "epiprocess__assert_sufficient_f_args__f_needs_min_args_plus_forwarded" + ) }) test_that("assert_sufficient_f_args alerts if the provided f has defaults for the required args", { - f_xgt = function(x, g=1, t) dplyr::tibble(value=mean(x$binary), count=length(x$binary)) - f_xgt_dots = function(x=1, g, t, ...) dplyr::tibble(value=mean(x$binary), count=length(x$binary)) - f_x_dots = function(x=1, ...) dplyr::tibble(value=mean(x$binary), count=length(x$binary)) + f_xgt <- function(x, g = 1, t) dplyr::tibble(value = mean(x$binary), count = length(x$binary)) + f_xgt_dots <- function(x = 1, g, t, ...) dplyr::tibble(value = mean(x$binary), count = length(x$binary)) + f_x_dots <- function(x = 1, ...) dplyr::tibble(value = mean(x$binary), count = length(x$binary)) expect_error(assert_sufficient_f_args(f_xgt), regexp = "pass the group key to `f`'s g argument,", - class = "epiprocess__assert_sufficient_f_args__required_args_contain_defaults") + class = "epiprocess__assert_sufficient_f_args__required_args_contain_defaults" + ) expect_error(assert_sufficient_f_args(f_xgt_dots), regexp = "pass the window data to `f`'s x argument,", - class = "epiprocess__assert_sufficient_f_args__required_args_contain_defaults") + class = "epiprocess__assert_sufficient_f_args__required_args_contain_defaults" + ) expect_error(suppressWarnings(assert_sufficient_f_args(f_x_dots)), - class = "epiprocess__assert_sufficient_f_args__required_args_contain_defaults") + class = "epiprocess__assert_sufficient_f_args__required_args_contain_defaults" + ) - f_xsgt = function(x, setting="a", g, t) dplyr::tibble(value=mean(x$binary), count=length(x$binary)) - f_xsgt_dots = function(x, setting="a", g, t, ...) dplyr::tibble(value=mean(x$binary), count=length(x$binary)) - f_xs_dots = function(x=1, setting="a", ...) dplyr::tibble(value=mean(x$binary), count=length(x$binary)) + f_xsgt <- function(x, setting = "a", g, t) dplyr::tibble(value = mean(x$binary), count = length(x$binary)) + f_xsgt_dots <- function(x, setting = "a", g, t, ...) dplyr::tibble(value = mean(x$binary), count = length(x$binary)) + f_xs_dots <- function(x = 1, setting = "a", ...) 
dplyr::tibble(value = mean(x$binary), count = length(x$binary)) # forwarding named dots should prevent some complaints: expect_no_error(assert_sufficient_f_args(f_xsgt, setting = "b")) expect_no_error(assert_sufficient_f_args(f_xsgt_dots, setting = "b")) expect_error(suppressWarnings(assert_sufficient_f_args(f_xs_dots, setting = "b")), regexp = "pass the window data to `f`'s x argument", - class = "epiprocess__assert_sufficient_f_args__required_args_contain_defaults") + class = "epiprocess__assert_sufficient_f_args__required_args_contain_defaults" + ) # forwarding unnamed dots should not: expect_error(assert_sufficient_f_args(f_xsgt, "b"), - class = "epiprocess__assert_sufficient_f_args__required_args_contain_defaults") + class = "epiprocess__assert_sufficient_f_args__required_args_contain_defaults" + ) expect_error(assert_sufficient_f_args(f_xsgt_dots, "b"), - class = "epiprocess__assert_sufficient_f_args__required_args_contain_defaults") + class = "epiprocess__assert_sufficient_f_args__required_args_contain_defaults" + ) expect_error( expect_warning( assert_sufficient_f_args(f_xs_dots, "b"), @@ -198,36 +213,42 @@ test_that("assert_sufficient_f_args alerts if the provided f has defaults for th test_that("computation formula-derived functions take all argument types", { # positional expect_identical(as_slide_computation(~ ..2 + ..3)(1, 2, 3), 5) - expect_identical(as_slide_computation(~ ..1)(1, 2, 3), 1) + expect_identical(as_slide_computation(~..1)(1, 2, 3), 1) # Matching rlang, purr, dplyr usage expect_identical(as_slide_computation(~ .x + .z)(1, 2, 3), 4) expect_identical(as_slide_computation(~ .x + .y)(1, 2, 3), 3) # named expect_identical(as_slide_computation(~ . + .ref_time_value)(1, 2, 3), 4) - expect_identical(as_slide_computation(~ .group_key)(1, 2, 3), 2) + expect_identical(as_slide_computation(~.group_key)(1, 2, 3), 2) }) test_that("as_slide_computation passes functions unaltered", { - f <- function(a, b, c) {a * b * c + 5} + f <- function(a, b, c) { + a * b * c + 5 + } expect_identical(as_slide_computation(f), f) }) test_that("as_slide_computation raises errors as expected", { # Formulas must be one-sided expect_error(as_slide_computation(y ~ ..1), - class="epiprocess__as_slide_computation__formula_is_twosided") + class = "epiprocess__as_slide_computation__formula_is_twosided" + ) # Formulas can't be paired with ... 
- expect_error(as_slide_computation(~ ..1, method = "fn"), - class="epiprocess__as_slide_computation__formula_with_dots") + expect_error(as_slide_computation(~..1, method = "fn"), + class = "epiprocess__as_slide_computation__formula_with_dots" + ) # `f_env` must be an environment - formula_without_env <- stats::as.formula(~ ..1) + formula_without_env <- stats::as.formula(~..1) rlang::f_env(formula_without_env) <- 5 expect_error(as_slide_computation(formula_without_env), - class="epiprocess__as_slide_computation__formula_has_no_env") + class = "epiprocess__as_slide_computation__formula_has_no_env" + ) # `f` must be a function, formula, or string expect_error(as_slide_computation(5), - class="epiprocess__as_slide_computation__cant_convert_catchall") + class = "epiprocess__as_slide_computation__cant_convert_catchall" + ) }) diff --git a/vignettes/advanced.Rmd b/vignettes/advanced.Rmd index 812cb711..567975a5 100644 --- a/vignettes/advanced.Rmd +++ b/vignettes/advanced.Rmd @@ -88,7 +88,8 @@ library(dplyr) edf <- tibble( geo_value = rep(c("ca", "fl", "pa"), each = 3), time_value = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"), - by = "day"), length.out = length(geo_value)), + by = "day" + ), length.out = length(geo_value)), x = 1:length(geo_value) + 0.01 * rnorm(length(geo_value)), ) %>% as_epi_df() @@ -157,8 +158,10 @@ object returned by `epi_slide()` has a list column containing the slide values. ```{r} edf2 <- edf %>% group_by(geo_value) %>% - epi_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), - before = 1, as_list_col = TRUE) %>% + epi_slide( + a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), + before = 1, as_list_col = TRUE + ) %>% ungroup() class(edf2$a) @@ -176,8 +179,10 @@ slide computation (here `x_2dav` and `x_2dma`) separated by "_". ```{r} edf %>% group_by(geo_value) %>% - epi_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), - before = 1, as_list_col = FALSE) %>% + epi_slide( + a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), + before = 1, as_list_col = FALSE + ) %>% ungroup() ``` @@ -187,8 +192,10 @@ the prefix associated with list column name, in naming the unnested columns. ```{r} edf %>% group_by(geo_value) %>% - epi_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), - before = 1, as_list_col = FALSE, names_sep = NULL) %>% + epi_slide( + a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), + before = 1, as_list_col = FALSE, names_sep = NULL + ) %>% ungroup() ``` @@ -197,24 +204,30 @@ order to make the result size stable, just like the case for atomic values. 
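As a quick sanity check of that size-stability claim (a minimal sketch reusing the toy `edf` from above; the `out` name is just for illustration):

```{r}
# An ungrouped slide is size stable: exactly one output row per input row.
out <- edf %>%
  epi_slide(
    a = data.frame(x_2dav = mean(x), x_2dma = mad(x)),
    before = 1, as_list_col = FALSE, names_sep = NULL
  )
nrow(out) == nrow(edf) # expected TRUE
```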
```{r} edf %>% - epi_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), - before = 1, as_list_col = FALSE, names_sep = NULL) + epi_slide( + a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), + before = 1, as_list_col = FALSE, names_sep = NULL + ) ``` ```{r, include = FALSE} # More checks (not included) edf %>% - epi_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), - ref_time_values = as.Date("2020-06-02"), - before = 1, as_list_col = FALSE, names_sep = NULL) + epi_slide( + a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), + ref_time_values = as.Date("2020-06-02"), + before = 1, as_list_col = FALSE, names_sep = NULL + ) edf %>% mutate(version = time_value) %>% as_epi_archive() %>% group_by(geo_value) %>% - epix_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), - ref_time_values = as.Date("2020-06-02"), - before = 1, as_list_col = FALSE, names_sep = NULL) %>% + epix_slide( + a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), + ref_time_values = as.Date("2020-06-02"), + before = 1, as_list_col = FALSE, names_sep = NULL + ) %>% ungroup() ``` @@ -241,11 +254,14 @@ edf %>% obj <- lm(y ~ x, data = d) return( as.data.frame( - predict(obj, newdata = d %>% - group_by(geo_value) %>% - filter(time_value == max(time_value)), - interval = "prediction", level = 0.9) - )) + predict(obj, + newdata = d %>% + group_by(geo_value) %>% + filter(time_value == max(time_value)), + interval = "prediction", level = 0.9 + ) + ) + ) }, before = 1, new_col_name = "fc", names_sep = NULL) ``` @@ -303,17 +319,18 @@ x <- y1 %>% version = issue, percent_cli = value ) %>% - as_epi_archive(compactify=FALSE) + as_epi_archive(compactify = FALSE) # mutating merge operation: -x$merge(y2 %>% - select(geo_value, time_value, - version = issue, - case_rate_7d_av = value - ) %>% - as_epi_archive(compactify=FALSE), +x$merge( + y2 %>% + select(geo_value, time_value, + version = issue, + case_rate_7d_av = value + ) %>% + as_epi_archive(compactify = FALSE), sync = "locf", - compactify=FALSE + compactify = FALSE ) ``` @@ -323,9 +340,8 @@ library(ggplot2) theme_set(theme_bw()) x <- archive_cases_dv_subset$DT %>% - filter(geo_value %in% c("ca","fl")) %>% + filter(geo_value %in% c("ca", "fl")) %>% as_epi_archive(compactify = FALSE) - ``` Next, we extend the ARX function to handle multiple geo values, since in the @@ -347,28 +363,35 @@ prob_arx_args <- function(lags = c(0, 7, 14), symmetrize = TRUE, intercept = FALSE, nonneg = TRUE) { - return(list(lags = lags, - ahead = ahead, - min_train_window = min_train_window, - lower_level = lower_level, - upper_level = upper_level, - symmetrize = symmetrize, - intercept = intercept, - nonneg = nonneg)) + return(list( + lags = lags, + ahead = ahead, + min_train_window = min_train_window, + lower_level = lower_level, + upper_level = upper_level, + symmetrize = symmetrize, + intercept = intercept, + nonneg = nonneg + )) } prob_arx <- function(x, y, geo_value, time_value, args = prob_arx_args()) { # Return NA if insufficient training data if (length(y) < args$min_train_window + max(args$lags) + args$ahead) { - return(data.frame(geo_value = unique(geo_value), # Return geo value! - point = NA, lower = NA, upper = NA)) + return(data.frame( + geo_value = unique(geo_value), # Return geo value! 
+ point = NA, lower = NA, upper = NA + )) } # Set up x, y, lags list - if (!missing(x)) x <- data.frame(x, y) - else x <- data.frame(y) + if (!missing(x)) { + x <- data.frame(x, y) + } else { + x <- data.frame(y) + } if (!is.list(args$lags)) args$lags <- list(args$lags) - args$lags = rep(args$lags, length.out = ncol(x)) + args$lags <- rep(args$lags, length.out = ncol(x)) # Build features and response for the AR model, and then fit it dat <- @@ -377,19 +400,24 @@ prob_arx <- function(x, y, geo_value, time_value, args = prob_arx_args()) { mutate(name = paste0("x", 1:nrow(.))) %>% # One list element for each lagged feature pmap(function(i, lag, name) { - tibble(geo_value = geo_value, - time_value = time_value + lag, # Shift back - !!name := x[,i]) + tibble( + geo_value = geo_value, + time_value = time_value + lag, # Shift back + !!name := x[, i] + ) }) %>% # One list element for the response vector c(list( - tibble(geo_value = geo_value, - time_value = time_value - args$ahead, # Shift forward - y = y))) %>% + tibble( + geo_value = geo_value, + time_value = time_value - args$ahead, # Shift forward + y = y + ) + )) %>% # Combine them together into one data frame reduce(full_join, by = c("geo_value", "time_value")) %>% arrange(time_value) - if (args$intercept) dat$x0 = rep(1, nrow(dat)) + if (args$intercept) dat$x0 <- rep(1, nrow(dat)) obj <- lm(y ~ . + 0, data = select(dat, -geo_value, -time_value)) # Use LOCF to fill NAs in the latest feature values (do this by geo value) @@ -398,10 +426,10 @@ prob_arx <- function(x, y, geo_value, time_value, args = prob_arx_args()) { dat[, (cols) := nafill(.SD, type = "locf"), .SDcols = cols, by = "geo_value"] # Make predictions - test_time_value = max(time_value) + test_time_value <- max(time_value) point <- predict(obj, newdata = dat %>% - dplyr::group_by(geo_value) %>% - dplyr::filter(time_value == test_time_value)) + dplyr::group_by(geo_value) %>% + dplyr::filter(time_value == test_time_value)) # Compute bands r <- residuals(obj) @@ -412,12 +440,14 @@ prob_arx <- function(x, y, geo_value, time_value, args = prob_arx_args()) { # Clip at zero if we need to, then return if (args$nonneg) { - point = pmax(point, 0) - lower = pmax(lower, 0) - upper = pmax(upper, 0) + point <- pmax(point, 0) + lower <- pmax(lower, 0) + upper <- pmax(upper, 0) } - return(data.frame(geo_value = unique(geo_value), # Return geo value! - point = point, lower = lower, upper = upper)) + return(data.frame( + geo_value = unique(geo_value), # Return geo value! + point = point, lower = lower, upper = upper + )) } ``` @@ -428,44 +458,57 @@ data. 
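# A quick spot check of `prob_arx()` on one location's latest snapshot
# (a minimal sketch; the choice of "ca", the `snap` name, and `ahead = 7`
# are illustrative only):
#   snap <- epix_as_of(x, max_version = max(x$DT$version)) %>%
#     filter(geo_value == "ca")
#   prob_arx(snap$percent_cli, snap$case_rate_7d_av, snap$geo_value,
#            snap$time_value, args = prob_arx_args(ahead = 7))
# This should return one row per location, with `point`, `lower`, and `upper`.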
# Latest snapshot of data, and forecast dates x_latest <- epix_as_of(x, max_version = max(x$DT$version)) fc_time_values <- seq(as.Date("2020-08-01"), - as.Date("2021-11-30"), - by = "1 month") + as.Date("2021-11-30"), + by = "1 month" +) # Simple function to produce forecasts k weeks ahead k_week_ahead <- function(x, ahead = 7, as_of = TRUE) { if (as_of) { x %>% - epix_slide(fc = prob_arx(percent_cli, case_rate_7d_av, geo_value, time_value, - args = prob_arx_args(ahead = ahead)), - before = 119, ref_time_values = fc_time_values) %>% - mutate(target_date = time_value + ahead, as_of = TRUE, - geo_value = fc_geo_value) - } - else { + epix_slide( + fc = prob_arx(percent_cli, case_rate_7d_av, geo_value, time_value, + args = prob_arx_args(ahead = ahead) + ), + before = 119, ref_time_values = fc_time_values + ) %>% + mutate( + target_date = time_value + ahead, as_of = TRUE, + geo_value = fc_geo_value + ) + } else { x_latest %>% - epi_slide(fc = prob_arx(percent_cli, case_rate_7d_av, geo_value, time_value, - args = prob_arx_args(ahead = ahead)), - before = 119, ref_time_values = fc_time_values) %>% + epi_slide( + fc = prob_arx(percent_cli, case_rate_7d_av, geo_value, time_value, + args = prob_arx_args(ahead = ahead) + ), + before = 119, ref_time_values = fc_time_values + ) %>% mutate(target_date = time_value + ahead, as_of = FALSE) } } # Generate the forecasts, and bind them together -fc <- bind_rows(k_week_ahead(x, ahead = 7, as_of = TRUE), - k_week_ahead(x, ahead = 14, as_of = TRUE), - k_week_ahead(x, ahead = 21, as_of = TRUE), - k_week_ahead(x, ahead = 28, as_of = TRUE), - k_week_ahead(x, ahead = 7, as_of = FALSE), - k_week_ahead(x, ahead = 14, as_of = FALSE), - k_week_ahead(x, ahead = 21, as_of = FALSE), - k_week_ahead(x, ahead = 28, as_of = FALSE)) +fc <- bind_rows( + k_week_ahead(x, ahead = 7, as_of = TRUE), + k_week_ahead(x, ahead = 14, as_of = TRUE), + k_week_ahead(x, ahead = 21, as_of = TRUE), + k_week_ahead(x, ahead = 28, as_of = TRUE), + k_week_ahead(x, ahead = 7, as_of = FALSE), + k_week_ahead(x, ahead = 14, as_of = FALSE), + k_week_ahead(x, ahead = 21, as_of = FALSE), + k_week_ahead(x, ahead = 28, as_of = FALSE) +) # Plot them, on top of latest COVID-19 case rates ggplot(fc, aes(x = target_date, group = time_value, fill = as_of)) + geom_ribbon(aes(ymin = fc_lower, ymax = fc_upper), alpha = 0.4) + - geom_line(data = x_latest, aes(x = time_value, y = case_rate_7d_av), - inherit.aes = FALSE, color = "gray50") + - geom_line(aes(y = fc_point)) + geom_point(aes(y = fc_point), size = 0.5) + + geom_line( + data = x_latest, aes(x = time_value, y = case_rate_7d_av), + inherit.aes = FALSE, color = "gray50" + ) + + geom_line(aes(y = fc_point)) + + geom_point(aes(y = fc_point), size = 0.5) + geom_vline(aes(xintercept = time_value), linetype = 2, alpha = 0.5) + facet_grid(vars(geo_value), vars(as_of), scales = "free") + scale_x_date(minor_breaks = "month", date_labels = "%b %y") + diff --git a/vignettes/compactify.Rmd b/vignettes/compactify.Rmd index 8ad3d1cd..3e97b6b9 100644 --- a/vignettes/compactify.Rmd +++ b/vignettes/compactify.Rmd @@ -35,7 +35,7 @@ library(dplyr) dt <- archive_cases_dv_subset$DT locf_omitted <- as_epi_archive(dt) -locf_included <- as_epi_archive(dt,compactify = FALSE) +locf_included <- as_epi_archive(dt, compactify = FALSE) head(locf_omitted$DT) head(locf_included$DT) @@ -46,10 +46,10 @@ LOCF-redundant values can mar the performance of dataset operations. As the colu `percent_cli` column for comparing performance. 
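To get a rough sense of how much redundancy compactification removes here (a minimal sketch reusing the `locf_omitted` and `locf_included` archives built above):

```{r}
# Row counts with and without dropping LOCF-redundant updates
nrow(locf_omitted$DT)
nrow(locf_included$DT)
```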
```{r} -dt2 <- select(dt,-percent_cli) +dt2 <- select(dt, -percent_cli) -locf_included_2 <- as_epi_archive(dt2,compactify=FALSE) -locf_omitted_2 <- as_epi_archive(dt2,compactify=TRUE) +locf_included_2 <- as_epi_archive(dt2, compactify = FALSE) +locf_omitted_2 <- as_epi_archive(dt2, compactify = TRUE) ``` In this example, a huge proportion of the original version update data were @@ -70,13 +70,13 @@ the LOCF values are omitted. # Performance of filtering iterate_filter <- function(my_ea) { for (i in 1:1000) { - filter(my_ea$DT,version >= as.Date("2020-01-01") + i) + filter(my_ea$DT, version >= as.Date("2020-01-01") + i) } } elapsed_time <- function(fx) c(system.time(fx))[[3]] -speed_test <- function(f,name) { +speed_test <- function(f, name) { data.frame( operation = name, locf = elapsed_time(f(locf_included_2)), @@ -84,8 +84,7 @@ speed_test <- function(f,name) { ) } -speeds <- speed_test(iterate_filter,"filter_1000x") - +speeds <- speed_test(iterate_filter, "filter_1000x") ``` We would also like to measure the speed of `epi_archive` methods. @@ -98,22 +97,22 @@ iterate_as_of <- function(my_ea) { } } -speeds <- rbind(speeds, speed_test(iterate_as_of,"as_of_1000x")) +speeds <- rbind(speeds, speed_test(iterate_as_of, "as_of_1000x")) # Performance of slide slide_median <- function(my_ea) { - my_ea$slide(median = median(case_rate_7d_av), before = 7) + my_ea$slide(median = median(case_rate_7d_av), before = 7) } -speeds <- rbind(speeds, speed_test(slide_median,"slide_median")) +speeds <- rbind(speeds, speed_test(slide_median, "slide_median")) ``` Here is a detailed performance comparison: ```{r} -speeds_tidy <- tidyr::gather(speeds,key="is_locf",value="time_in_s",locf,no_locf) +speeds_tidy <- tidyr::gather(speeds, key = "is_locf", value = "time_in_s", locf, no_locf) library(ggplot2) ggplot(speeds_tidy) + - geom_bar(aes(x=is_locf,y=time_in_s,fill=operation),stat = "identity") + geom_bar(aes(x = is_locf, y = time_in_s, fill = operation), stat = "identity") ``` diff --git a/vignettes/growth_rate.Rmd b/vignettes/growth_rate.Rmd index 4fb4eda5..abef646f 100644 --- a/vignettes/growth_rate.Rmd +++ b/vignettes/growth_rate.Rmd @@ -43,7 +43,7 @@ The data has 1,158 rows and 3 columns. data(jhu_csse_daily_subset) x <- jhu_csse_daily_subset %>% select(geo_value, time_value, cases = cases_7d_av) %>% - filter(geo_value %in% c("pa","ga") & time_value >= "2020-06-01") %>% + filter(geo_value %in% c("pa", "ga") & time_value >= "2020-06-01") %>% arrange(geo_value, time_value) %>% as_epi_df() ``` @@ -104,16 +104,20 @@ red) and below -1% (in blue), faceting by geo value. 
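# (Context for the plot below, as a sketch: the `cases_gr1` column it uses is
# created earlier in the vignette, roughly as
#   x <- x %>%
#     group_by(geo_value) %>%
#     mutate(cases_gr1 = growth_rate(x = time_value, y = cases))
# i.e., with `growth_rate()`'s default "rel_change" method.)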
library(ggplot2) theme_set(theme_bw()) -upper = 0.01 -lower = -0.01 +upper <- 0.01 +lower <- -0.01 ggplot(x, aes(x = time_value, y = cases)) + - geom_tile(data = x %>% filter(cases_gr1 >= upper), - aes(x = time_value, y = 0, width = 7, height = Inf), - fill = 2, alpha = 0.08) + - geom_tile(data = x %>% filter(cases_gr1 <= lower), - aes(x = time_value, y = 0, width = 7, height = Inf), - fill = 4, alpha = 0.08) + + geom_tile( + data = x %>% filter(cases_gr1 >= upper), + aes(x = time_value, y = 0, width = 7, height = Inf), + fill = 2, alpha = 0.08 + ) + + geom_tile( + data = x %>% filter(cases_gr1 <= lower), + aes(x = time_value, y = 0, width = 7, height = Inf), + fill = 4, alpha = 0.08 + ) + geom_line() + facet_wrap(vars(geo_value), scales = "free_y") + scale_x_date(minor_breaks = "month", date_labels = "%b %y") + @@ -128,7 +132,7 @@ ggplot(x, aes(x = time_value, y = cases_gr1)) + geom_line(aes(col = geo_value)) + geom_hline(yintercept = upper, linetype = 2, col = 2) + geom_hline(yintercept = lower, linetype = 2, col = 4) + - scale_color_manual(values = c(3,6)) + + scale_color_manual(values = c(3, 6)) + scale_x_date(minor_breaks = "month", date_labels = "%b %y") + labs(x = "Date", y = "Growth rate", col = "State") ``` @@ -154,15 +158,18 @@ x <- x %>% mutate(cases_gr2 = growth_rate(time_value, cases, method = "linear_reg")) x %>% - pivot_longer(cols = starts_with("cases_gr"), - names_to = "method", - values_to = "gr") %>% + pivot_longer( + cols = starts_with("cases_gr"), + names_to = "method", + values_to = "gr" + ) %>% mutate(method = recode(method, - cases_gr1 = "rel_change", - cases_gr2 = "linear_reg")) %>% + cases_gr1 = "rel_change", + cases_gr2 = "linear_reg" + )) %>% ggplot(aes(x = time_value, y = gr)) + geom_line(aes(col = method)) + - scale_color_manual(values = c(2,4)) + + scale_color_manual(values = c(2, 4)) + facet_wrap(vars(geo_value), scales = "free_y", ncol = 1) + scale_x_date(minor_breaks = "month", date_labels = "%b %y") + labs(x = "Date", y = "Growth rate", col = "Method") @@ -183,20 +190,25 @@ details.) ```{r, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 7} x <- x %>% group_by(geo_value) %>% - mutate(cases_gr3 = growth_rate(time_value, cases, method = "smooth_spline"), - cases_gr4 = growth_rate(time_value, cases, method = "trend_filter")) + mutate( + cases_gr3 = growth_rate(time_value, cases, method = "smooth_spline"), + cases_gr4 = growth_rate(time_value, cases, method = "trend_filter") + ) x %>% select(geo_value, time_value, cases_gr3, cases_gr4) %>% - pivot_longer(cols = starts_with("cases_gr"), - names_to = "method", - values_to = "gr") %>% + pivot_longer( + cols = starts_with("cases_gr"), + names_to = "method", + values_to = "gr" + ) %>% mutate(method = recode(method, - cases_gr3 = "smooth_spline", - cases_gr4 = "trend_filter")) %>% + cases_gr3 = "smooth_spline", + cases_gr4 = "trend_filter" + )) %>% ggplot(aes(x = time_value, y = gr)) + geom_line(aes(col = method)) + - scale_color_manual(values = c(3,6)) + + scale_color_manual(values = c(3, 6)) + facet_wrap(vars(geo_value), scales = "free_y", ncol = 1) + scale_x_date(minor_breaks = "month", date_labels = "%b %y") + labs(x = "Date", y = "Growth rate", col = "Method") @@ -227,41 +239,57 @@ the call to `growth_rate()`. 
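As a one-line reminder of why the log scale is natural here (assuming the vignette's definition of the growth rate of a smooth, positive $f$ as $f'(t)/f(t)$):

$$
\frac{d}{dt} \log f(t) = \frac{f'(t)}{f(t)},
$$

so estimating the derivative of log-transformed cases targets the growth rate directly.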
```{r, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 7} x <- x %>% group_by(geo_value) %>% - mutate(cases_gr5 = growth_rate(time_value, cases, method = "rel_change", - log_scale = TRUE), - cases_gr6 = growth_rate(time_value, cases, method = "linear_reg", - log_scale = TRUE), - cases_gr7 = growth_rate(time_value, cases, method = "smooth_spline", - log_scale = TRUE), - cases_gr8 = growth_rate(time_value, cases, method = "trend_filter", - log_scale = TRUE)) + mutate( + cases_gr5 = growth_rate(time_value, cases, + method = "rel_change", + log_scale = TRUE + ), + cases_gr6 = growth_rate(time_value, cases, + method = "linear_reg", + log_scale = TRUE + ), + cases_gr7 = growth_rate(time_value, cases, + method = "smooth_spline", + log_scale = TRUE + ), + cases_gr8 = growth_rate(time_value, cases, + method = "trend_filter", + log_scale = TRUE + ) + ) x %>% select(geo_value, time_value, cases_gr5, cases_gr6) %>% - pivot_longer(cols = starts_with("cases_gr"), - names_to = "method", - values_to = "gr") %>% + pivot_longer( + cols = starts_with("cases_gr"), + names_to = "method", + values_to = "gr" + ) %>% mutate(method = recode(method, - cases_gr5 = "rel_change_log", - cases_gr6 = "linear_reg_log")) %>% + cases_gr5 = "rel_change_log", + cases_gr6 = "linear_reg_log" + )) %>% ggplot(aes(x = time_value, y = gr)) + geom_line(aes(col = method)) + - scale_color_manual(values = c(2,4)) + + scale_color_manual(values = c(2, 4)) + facet_wrap(vars(geo_value), scales = "free_y", ncol = 1) + scale_x_date(minor_breaks = "month", date_labels = "%b %y") + labs(x = "Date", y = "Growth rate", col = "Method") x %>% select(geo_value, time_value, cases_gr7, cases_gr8) %>% - pivot_longer(cols = starts_with("cases_gr"), - names_to = "method", - values_to = "gr") %>% + pivot_longer( + cols = starts_with("cases_gr"), + names_to = "method", + values_to = "gr" + ) %>% mutate(method = recode(method, - cases_gr7 = "smooth_spline_log", - cases_gr8 = "trend_filter_log")) %>% + cases_gr7 = "smooth_spline_log", + cases_gr8 = "trend_filter_log" + )) %>% ggplot(aes(x = time_value, y = gr)) + geom_line(aes(col = method)) + - scale_color_manual(values = c(3,6)) + + scale_color_manual(values = c(3, 6)) + facet_wrap(vars(geo_value), scales = "free_y", ncol = 1) + scale_x_date(minor_breaks = "month", date_labels = "%b %y") + labs(x = "Date", y = "Growth rate", col = "Method") From c02d2013e71fe78659bea0d29683c68f8ff55ca5 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Wed, 17 Jan 2024 13:52:17 -0800 Subject: [PATCH 050/345] repo: ignore styling commit --- .git-blame-ignore-revs | 1 + 1 file changed, 1 insertion(+) create mode 100644 .git-blame-ignore-revs diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs new file mode 100644 index 00000000..a3d36061 --- /dev/null +++ b/.git-blame-ignore-revs @@ -0,0 +1 @@ +c65876078a6f9525952b305eaea2fca003adf907 \ No newline at end of file From 96c83d664406f749cf357d01061e479bf0346189 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 17 Jan 2024 11:50:14 -0500 Subject: [PATCH 051/345] use closure to fetch min_ref_time_values from `starts` instead of recalculating --- R/slide.R | 59 ++++++++++----------------------- tests/testthat/test-epi_slide.R | 44 ++++++++++++++++++++++++ 2 files changed, 61 insertions(+), 42 deletions(-) diff --git a/R/slide.R b/R/slide.R index 2a10efce..9f2cccad 100644 --- a/R/slide.R +++ b/R/slide.R @@ -230,37 +230,15 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, after <- 
time_step(after) } + # Do set up to let us recover `ref_time_value`s later. min_ref_time_values <- ref_time_values - before min_ref_time_values_not_in_x <- min_ref_time_values[!(min_ref_time_values %in% unique(x$time_value))] - # Do set up to let us recover `ref_time_value`s later. - # A helper column marking real observations. - x$.real <- TRUE - - # Create df containing phony data. Df has the same columns and attributes as - # `x`, but filled with `NA`s aside from grouping columns. Number of rows is - # equal to the number of `min_ref_time_values_not_in_x` we have * the - # number of unique levels seen in the grouping columns. - before_time_values_df <- data.frame(time_value = min_ref_time_values_not_in_x) - if (length(group_vars(x)) != 0) { - before_time_values_df <- dplyr::cross_join( - # Get unique combinations of grouping columns seen in real data. - unique(x[, group_vars(x)]), - before_time_values_df - ) - } - # Automatically fill in all other columns from `x` with `NA`s, and carry - # attributes over to new df. - before_time_values_df <- bind_rows(x[0, ], before_time_values_df) - before_time_values_df$.real <- FALSE - - x <- bind_rows(before_time_values_df, x) - # Arrange by increasing time_value x <- arrange(x, time_value) # Now set up starts and stops for sliding/hopping - time_range <- range(unique(x$time_value)) + time_range <- range(unique(c(x$time_value, min_ref_time_values_not_in_x))) starts <- in_range(ref_time_values - before, time_range) stops <- in_range(ref_time_values + after, time_range) @@ -273,7 +251,7 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, # Computation for one group, all time values slide_one_grp <- function(.data_group, - f, ..., + f_factory, ..., starts, stops, time_values, @@ -288,6 +266,8 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, stops <- stops[o] time_values <- time_values[o] + f <- f_factory(starts) + # Compute the slide values slide_values_list <- slider::hop_index( .x = .data_group, @@ -349,7 +329,6 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, # fills with NA equivalent. vctrs::vec_slice(slide_values, o) <- orig_values } else { - # This implicitly removes phony (`.real` == FALSE) observations. .data_group <- filter(.data_group, o) } return(mutate(.data_group, !!new_col := slide_values)) @@ -372,15 +351,20 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, f <- as_slide_computation(f, ...) # Create a wrapper that calculates and passes `.ref_time_value` to the - # computation. - f_wrapper <- function(.x, .group_key, ...) { - .ref_time_value <- min(.x$time_value) + before - .x <- .x[.x$.real, ] - .x$.real <- NULL - f(.x, .group_key, .ref_time_value, ...) + # computation. `i` is contained in the `f_wrapper_factory` environment such + # that when called within `slide_one_grp` `i` is reset for every group. + f_wrapper_factory <- function(starts) { + # Use `i` to advance through list of start dates. + i <- 1L + f_wrapper <- function(.x, .group_key, ...) { + .ref_time_value <- starts[[i]] + before + i <<- i + 1L + f(.x, .group_key, .ref_time_value, ...) + } + return(f_wrapper) } x <- group_modify(x, slide_one_grp, - f = f_wrapper, ..., + f_factory = f_wrapper_factory, ..., starts = starts, stops = stops, time_values = ref_time_values, @@ -394,14 +378,5 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, x <- unnest(x, !!new_col, names_sep = names_sep) } - # Remove any remaining phony observations. 
When `all_rows` is TRUE, phony - # observations aren't necessarily removed in `slide_one_grp`. - if (all_rows) { - x <- x[x$.real, ] - } - - # Drop helper column `.real`. - x$.real <- NULL - return(x) } diff --git a/tests/testthat/test-epi_slide.R b/tests/testthat/test-epi_slide.R index e2bbc040..cd38dc97 100644 --- a/tests/testthat/test-epi_slide.R +++ b/tests/testthat/test-epi_slide.R @@ -626,3 +626,47 @@ test_that("`epi_slide` can access objects inside of helper functions", { NA ) }) + +test_that("epi_slide basic behavior is correct when groups have non-overlapping date ranges", { + small_x_misaligned_dates <- dplyr::bind_rows( + dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15), + dplyr::tibble(geo_value = "al", time_value = d + 151:155, value = -(1:5)) + ) %>% + as_epi_df(as_of = d + 6) %>% + group_by(geo_value) + + expected_output <- dplyr::bind_rows( + dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value = cumsum(11:15)), + dplyr::tibble(geo_value = "al", time_value = d + 151:155, value = -(1:5), slide_value = cumsum(-(1:5))) + ) %>% + group_by(geo_value) %>% + as_epi_df(as_of = d + 6) + + result1 <- epi_slide(small_x_misaligned_dates, f = ~ sum(.x$value), before = 50) + expect_identical(result1, expected_output) +}) + + +test_that("epi_slide gets correct ref_time_value when groups have non-overlapping date ranges", { + small_x_misaligned_dates <- dplyr::bind_rows( + dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15), + dplyr::tibble(geo_value = "al", time_value = d + 151:155, value = -(1:5)) + ) %>% + as_epi_df(as_of = d + 6) %>% + group_by(geo_value) + + expected_output <- dplyr::bind_rows( + dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value = d + 1:5), + dplyr::tibble(geo_value = "al", time_value = d + 151:155, value = -(1:5), slide_value = d + 151:155) + ) %>% + group_by(geo_value) %>% + as_epi_df(as_of = d + 6) + + result1 <- small_x_misaligned_dates %>% + epi_slide( + before = 50, + slide_value = .ref_time_value + ) + + expect_identical(result1, expected_output) +}) From 9e0d9ed156fcfd10ec2d9aa5f28b4322176255b0 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 17 Jan 2024 13:29:55 -0500 Subject: [PATCH 052/345] calculate all starts + before at once --- R/slide.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/slide.R b/R/slide.R index 9f2cccad..c7493fd3 100644 --- a/R/slide.R +++ b/R/slide.R @@ -356,8 +356,9 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, f_wrapper_factory <- function(starts) { # Use `i` to advance through list of start dates. i <- 1L + starts <- starts + before f_wrapper <- function(.x, .group_key, ...) { - .ref_time_value <- starts[[i]] + before + .ref_time_value <- starts[[i]] i <<- i + 1L f(.x, .group_key, .ref_time_value, ...) 
} From aae9dee3becd26dc59da73ed2c157eb4ba95240f Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 18 Jan 2024 11:27:23 -0500 Subject: [PATCH 053/345] document --- man/as_epi_archive.Rd | 56 ++++++++++++++++++++------------- man/as_epi_df.Rd | 44 +++++++++++++++----------- man/detect_outlr.Rd | 62 +++++++++++++++++++++++-------------- man/detect_outlr_rm.Rd | 7 +++-- man/detect_outlr_stl.Rd | 7 +++-- man/epi_archive.Rd | 18 ++++++----- man/epi_cor.Rd | 48 ++++++++++++++++------------ man/epi_slide.Rd | 22 +++++++------ man/epix_as_of.Rd | 25 ++++++++++----- man/epix_merge.Rd | 10 +++--- man/epix_slide.Rd | 55 +++++++++++++++++--------------- man/group_by.epi_archive.Rd | 48 ++++++++++++++++------------ man/growth_rate.Rd | 12 +++---- man/is_epi_archive.Rd | 4 +-- 14 files changed, 247 insertions(+), 171 deletions(-) diff --git a/man/as_epi_archive.Rd b/man/as_epi_archive.Rd index d63a5faa..a1c60687 100644 --- a/man/as_epi_archive.Rd +++ b/man/as_epi_archive.Rd @@ -99,32 +99,44 @@ would be equivalent to: # Simple ex. with necessary keys tib <- tibble::tibble( geo_value = rep(c("ca", "hi"), each = 5), - time_value = rep(seq(as.Date("2020-01-01"), - by = 1, length.out = 5), times = 2), - version = rep(seq(as.Date("2020-01-02"), - by = 1, length.out = 5), times = 2), + time_value = rep(seq(as.Date("2020-01-01"), + by = 1, length.out = 5 + ), times = 2), + version = rep(seq(as.Date("2020-01-02"), + by = 1, length.out = 5 + ), times = 2), value = rnorm(10, mean = 2, sd = 1) ) -toy_epi_archive <- tib \%>\% as_epi_archive(geo_type = "state", - time_type = "day") -toy_epi_archive +toy_epi_archive <- tib \%>\% as_epi_archive( + geo_type = "state", + time_type = "day" +) +toy_epi_archive # Ex. with an additional key for county -df <- data.frame (geo_value = c(replicate(2, "ca"), replicate(2, "fl")), - county = c(1, 3, 2, 5), - time_value = c("2020-06-01", - "2020-06-02", - "2020-06-01", - "2020-06-02"), - version = c("2020-06-02", - "2020-06-03", - "2020-06-02", - "2020-06-03"), - cases = c(1, 2, 3, 4), - cases_rate = c(0.01, 0.02, 0.01, 0.05)) +df <- data.frame( + geo_value = c(replicate(2, "ca"), replicate(2, "fl")), + county = c(1, 3, 2, 5), + time_value = c( + "2020-06-01", + "2020-06-02", + "2020-06-01", + "2020-06-02" + ), + version = c( + "2020-06-02", + "2020-06-03", + "2020-06-02", + "2020-06-03" + ), + cases = c(1, 2, 3, 4), + cases_rate = c(0.01, 0.02, 0.01, 0.05) +) -x <- df \%>\% as_epi_archive(geo_type = "state", - time_type = "day", - other_keys = "county") +x <- df \%>\% as_epi_archive( + geo_type = "state", + time_type = "day", + other_keys = "county" +) } diff --git a/man/as_epi_df.Rd b/man/as_epi_df.Rd index 851aed7e..40c0a1c5 100644 --- a/man/as_epi_df.Rd +++ b/man/as_epi_df.Rd @@ -76,19 +76,22 @@ the \code{tbl_ts} class is dropped, and any key variables (other than ex1_input <- tibble::tibble( geo_value = rep(c("ca", "fl", "pa"), each = 3), - county_code = c("06059","06061","06067", - "12111","12113","12117", - "42101", "42103","42105"), + county_code = c( + "06059", "06061", "06067", + "12111", "12113", "12117", + "42101", "42103", "42105" + ), time_value = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"), - by = "day"), length.out = length(geo_value)), + by = "day" + ), length.out = length(geo_value)), value = 1:length(geo_value) + 0.01 * rnorm(length(geo_value)) -) \%>\% +) \%>\% tsibble::as_tsibble(index = time_value, key = c(geo_value, county_code)) # The `other_keys` metadata (`"county_code"` in this case) is 
automatically # inferred from the `tsibble`'s `key`: ex1 <- as_epi_df(x = ex1_input, geo_type = "state", time_type = "day", as_of = "2020-06-03") -attr(ex1,"metadata")[["other_keys"]] +attr(ex1, "metadata")[["other_keys"]] @@ -102,17 +105,21 @@ ex2_input <- tibble::tibble( state = rep(c("ca", "fl", "pa"), each = 3), # misnamed pol = rep(c("blue", "swing", "swing"), each = 3), # extra key reported_date = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"), - by = "day"), length.out = length(state)), # misnamed + by = "day" + ), length.out = length(state)), # misnamed value = 1:length(state) + 0.01 * rnorm(length(state)) -) +) print(ex2_input) -ex2 <- ex2_input \%>\% dplyr::rename(geo_value = state, time_value = reported_date) \%>\% - as_epi_df(geo_type = "state", as_of = "2020-06-03", - additional_metadata = list(other_keys = "pol")) +ex2 <- ex2_input \%>\% + dplyr::rename(geo_value = state, time_value = reported_date) \%>\% + as_epi_df( + geo_type = "state", as_of = "2020-06-03", + additional_metadata = list(other_keys = "pol") + ) -attr(ex2,"metadata") +attr(ex2, "metadata") @@ -120,17 +127,18 @@ attr(ex2,"metadata") ex3_input <- jhu_csse_county_level_subset \%>\% dplyr::filter(time_value > "2021-12-01", state_name == "Massachusetts") \%>\% - dplyr::slice_tail(n = 6) + dplyr::slice_tail(n = 6) -ex3 <- ex3_input \%>\% +ex3 <- ex3_input \%>\% tsibble::as_tsibble() \%>\% # needed to add the additional metadata # add 2 extra keys dplyr::mutate( - state = rep("MA",6), - pol = rep(c("blue", "swing", "swing"), each = 2)) \%>\% - # the 2 extra keys we added have to be specified in the other_keys + state = rep("MA", 6), + pol = rep(c("blue", "swing", "swing"), each = 2) + ) \%>\% + # the 2 extra keys we added have to be specified in the other_keys # component of additional_metadata. as_epi_df(additional_metadata = list(other_keys = c("state", "pol"))) -attr(ex3,"metadata") +attr(ex3, "metadata") } diff --git a/man/detect_outlr.Rd b/man/detect_outlr.Rd index 4aa0b79c..3a793ebf 100644 --- a/man/detect_outlr.Rd +++ b/man/detect_outlr.Rd @@ -64,29 +64,43 @@ For convenience, the outlier detection method can be specified (in the STL decomposition. 
} \examples{ - detection_methods = dplyr::bind_rows( - dplyr::tibble(method = "rm", - args = list(list(detect_negatives = TRUE, - detection_multiplier = 2.5)), - abbr = "rm"), - dplyr::tibble(method = "stl", - args = list(list(detect_negatives = TRUE, - detection_multiplier = 2.5, - seasonal_period = 7)), - abbr = "stl_seasonal"), - dplyr::tibble(method = "stl", - args = list(list(detect_negatives = TRUE, - detection_multiplier = 2.5, - seasonal_period = NULL)), - abbr = "stl_nonseasonal")) +detection_methods <- dplyr::bind_rows( + dplyr::tibble( + method = "rm", + args = list(list( + detect_negatives = TRUE, + detection_multiplier = 2.5 + )), + abbr = "rm" + ), + dplyr::tibble( + method = "stl", + args = list(list( + detect_negatives = TRUE, + detection_multiplier = 2.5, + seasonal_period = 7 + )), + abbr = "stl_seasonal" + ), + dplyr::tibble( + method = "stl", + args = list(list( + detect_negatives = TRUE, + detection_multiplier = 2.5, + seasonal_period = NULL + )), + abbr = "stl_nonseasonal" + ) +) - x <- incidence_num_outlier_example \%>\% - dplyr::select(geo_value,time_value,cases) \%>\% - as_epi_df() \%>\% - group_by(geo_value) \%>\% - mutate(outlier_info = detect_outlr( - x = time_value, y = cases, - methods = detection_methods, - combiner = "median")) \%>\% - unnest(outlier_info) +x <- incidence_num_outlier_example \%>\% + dplyr::select(geo_value, time_value, cases) \%>\% + as_epi_df() \%>\% + group_by(geo_value) \%>\% + mutate(outlier_info = detect_outlr( + x = time_value, y = cases, + methods = detection_methods, + combiner = "median" + )) \%>\% + unnest(outlier_info) } diff --git a/man/detect_outlr_rm.Rd b/man/detect_outlr_rm.Rd index 3efae55d..0d011619 100644 --- a/man/detect_outlr_rm.Rd +++ b/man/detect_outlr_rm.Rd @@ -59,10 +59,11 @@ terms of multiples of the rolling interquartile range (IQR). \examples{ # Detect outliers based on a rolling median incidence_num_outlier_example \%>\% - dplyr::select(geo_value,time_value,cases) \%>\% + dplyr::select(geo_value, time_value, cases) \%>\% as_epi_df() \%>\% group_by(geo_value) \%>\% - mutate(outlier_info = detect_outlr_rm( - x = time_value, y = cases)) \%>\% + mutate(outlier_info = detect_outlr_rm( + x = time_value, y = cases + )) \%>\% unnest(outlier_info) } diff --git a/man/detect_outlr_stl.Rd b/man/detect_outlr_stl.Rd index 7e724a4e..34a550d5 100644 --- a/man/detect_outlr_stl.Rd +++ b/man/detect_outlr_stl.Rd @@ -78,11 +78,12 @@ are exactly as in \code{detect_outlr_rm()}. \examples{ # Detects outliers based on a seasonal-trend decomposition using LOESS incidence_num_outlier_example \%>\% - dplyr::select(geo_value,time_value,cases) \%>\% + dplyr::select(geo_value, time_value, cases) \%>\% as_epi_df() \%>\% group_by(geo_value) \%>\% - mutate(outlier_info = detect_outlr_stl( + mutate(outlier_info = detect_outlr_stl( x = time_value, y = cases, - seasonal_period = 7 )) \%>\% # weekly seasonality for daily data + seasonal_period = 7 + )) \%>\% # weekly seasonality for daily data unnest(outlier_info) } diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd index a4a58645..366eafe0 100644 --- a/man/epi_archive.Rd +++ b/man/epi_archive.Rd @@ -100,16 +100,20 @@ are documented in the wrapper function \code{\link[=epix_slide]{epix_slide()}}. 
\examples{ tib <- tibble::tibble( geo_value = rep(c("ca", "hi"), each = 5), - time_value = rep(seq(as.Date("2020-01-01"), - by = 1, length.out = 5), times = 2), - version = rep(seq(as.Date("2020-01-02"), - by = 1, length.out = 5), times = 2), + time_value = rep(seq(as.Date("2020-01-01"), + by = 1, length.out = 5 + ), times = 2), + version = rep(seq(as.Date("2020-01-02"), + by = 1, length.out = 5 + ), times = 2), value = rnorm(10, mean = 2, sd = 1) ) -toy_epi_archive <- tib \%>\% epi_archive$new(geo_type = "state", - time_type = "day") -toy_epi_archive +toy_epi_archive <- tib \%>\% epi_archive$new( + geo_type = "state", + time_type = "day" +) +toy_epi_archive } \section{Methods}{ \subsection{Public methods}{ diff --git a/man/epi_cor.Rd b/man/epi_cor.Rd index 6b2279db..fb56073f 100644 --- a/man/epi_cor.Rd +++ b/man/epi_cor.Rd @@ -58,30 +58,38 @@ grouping by geo value, time value, or any other variables. See the for examples. } \examples{ - + # linear association of case and death rates on any given day -epi_cor(x = jhu_csse_daily_subset, - var1 = case_rate_7d_av, - var2 = death_rate_7d_av, - cor_by = "time_value") +epi_cor( + x = jhu_csse_daily_subset, + var1 = case_rate_7d_av, + var2 = death_rate_7d_av, + cor_by = "time_value" +) # correlation of death rates and lagged case rates -epi_cor(x = jhu_csse_daily_subset, - var1 = case_rate_7d_av, - var2 = death_rate_7d_av, - cor_by = time_value, - dt1 = -2) +epi_cor( + x = jhu_csse_daily_subset, + var1 = case_rate_7d_av, + var2 = death_rate_7d_av, + cor_by = time_value, + dt1 = -2 +) -# correlation grouped by location -epi_cor(x = jhu_csse_daily_subset, - var1 = case_rate_7d_av, - var2 = death_rate_7d_av, - cor_by = geo_value) +# correlation grouped by location +epi_cor( + x = jhu_csse_daily_subset, + var1 = case_rate_7d_av, + var2 = death_rate_7d_av, + cor_by = geo_value +) # correlation grouped by location and incorporates lagged cases rates -epi_cor(x = jhu_csse_daily_subset, - var1 = case_rate_7d_av, - var2 = death_rate_7d_av, - cor_by = geo_value, - dt1 = -2) +epi_cor( + x = jhu_csse_daily_subset, + var1 = case_rate_7d_av, + var2 = death_rate_7d_av, + cor_by = geo_value, + dt1 = -2 +) } diff --git a/man/epi_slide.Rd b/man/epi_slide.Rd index 33c3a7fb..668be9ff 100644 --- a/man/epi_slide.Rd +++ b/man/epi_slide.Rd @@ -156,9 +156,9 @@ through the \code{new_col_name} argument. # slide a 7-day trailing average formula on cases jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide(cases_7dav = mean(cases), before = 6) \%>\% + epi_slide(cases_7dav = mean(cases), before = 6) \%>\% # rmv a nonessential var. to ensure new col is printed - dplyr::select(-death_rate_7d_av) + dplyr::select(-death_rate_7d_av) # slide a 7-day leading average jhu_csse_daily_subset \%>\% @@ -170,21 +170,25 @@ jhu_csse_daily_subset \%>\% # slide a 7-day centre-aligned average jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide(cases_7dav = mean(cases), before = 3, after = 3) \%>\% + epi_slide(cases_7dav = mean(cases), before = 3, after = 3) \%>\% # rmv a nonessential var. to ensure new col is printed - dplyr::select(-death_rate_7d_av) + dplyr::select(-death_rate_7d_av) # slide a 14-day centre-aligned average jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide(cases_7dav = mean(cases), before = 6, after = 7) \%>\% + epi_slide(cases_7dav = mean(cases), before = 6, after = 7) \%>\% # rmv a nonessential var. 
to ensure new col is printed - dplyr::select(-death_rate_7d_av) + dplyr::select(-death_rate_7d_av) # nested new columns jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide(a = data.frame(cases_2dav = mean(cases), - cases_2dma = mad(cases)), - before = 1, as_list_col = TRUE) + epi_slide( + a = data.frame( + cases_2dav = mean(cases), + cases_2dma = mad(cases) + ), + before = 1, as_list_col = TRUE + ) } diff --git a/man/epix_as_of.Rd b/man/epix_as_of.Rd index 51884597..9a0a53ce 100644 --- a/man/epix_as_of.Rd +++ b/man/epix_as_of.Rd @@ -55,14 +55,18 @@ in the future. } \examples{ # warning message of data latency shown -epix_as_of(x = archive_cases_dv_subset, - max_version = max(archive_cases_dv_subset$DT$version)) +epix_as_of( + x = archive_cases_dv_subset, + max_version = max(archive_cases_dv_subset$DT$version) +) range(archive_cases_dv_subset$DT$version) # 2020-06-02 -- 2021-12-01 -epix_as_of(x = archive_cases_dv_subset, - max_version = as.Date("2020-06-12")) +epix_as_of( + x = archive_cases_dv_subset, + max_version = as.Date("2020-06-12") +) # When fetching a snapshot as of the latest version with update data in the # archive, a warning is issued by default, as this update data might not yet @@ -72,10 +76,15 @@ epix_as_of(x = archive_cases_dv_subset, # based on database queries, the latest available update might still be # subject to change, but previous versions should be finalized). We can # muffle such warnings with the following pattern: -withCallingHandlers({ - epix_as_of(x = archive_cases_dv_subset, - max_version = max(archive_cases_dv_subset$DT$version)) -}, epiprocess__snapshot_as_of_clobberable_version = function(wrn) invokeRestart("muffleWarning")) +withCallingHandlers( + { + epix_as_of( + x = archive_cases_dv_subset, + max_version = max(archive_cases_dv_subset$DT$version) + ) + }, + epiprocess__snapshot_as_of_clobberable_version = function(wrn) invokeRestart("muffleWarning") +) # Since R 4.0, there is a `globalCallingHandlers` function that can be used # to globally toggle these warnings. diff --git a/man/epix_merge.Rd b/man/epix_merge.Rd index 09f67fa2..53dea071 100644 --- a/man/epix_merge.Rd +++ b/man/epix_merge.Rd @@ -60,13 +60,13 @@ be clobbered in either input archive. \examples{ # create two example epi_archive datasets x <- archive_cases_dv_subset$DT \%>\% - dplyr::select(geo_value,time_value,version,case_rate_7d_av) \%>\% - as_epi_archive(compactify=TRUE) + dplyr::select(geo_value, time_value, version, case_rate_7d_av) \%>\% + as_epi_archive(compactify = TRUE) y <- archive_cases_dv_subset$DT \%>\% - dplyr::select(geo_value,time_value,version,percent_cli) \%>\% - as_epi_archive(compactify=TRUE) + dplyr::select(geo_value, time_value, version, percent_cli) \%>\% + as_epi_archive(compactify = TRUE) # merge results stored in a third object: -xy = epix_merge(x, y) +xy <- epix_merge(x, y) # vs. 
mutating x to hold the merge result: x$merge(y) diff --git a/man/epix_slide.Rd b/man/epix_slide.Rd index d94460af..3ac55a18 100644 --- a/man/epix_slide.Rd +++ b/man/epix_slide.Rd @@ -193,17 +193,20 @@ library(dplyr) # Reference time points for which we want to compute slide values: ref_time_values <- seq(as.Date("2020-06-01"), - as.Date("2020-06-15"), - by = "1 day") + as.Date("2020-06-15"), + by = "1 day" +) # A simple (but not very useful) example (see the archive vignette for a more # realistic one): archive_cases_dv_subset \%>\% group_by(geo_value) \%>\% - epix_slide(f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = ref_time_values, - new_col_name = 'case_rate_7d_av_recent_av') \%>\% + epix_slide( + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = ref_time_values, + new_col_name = "case_rate_7d_av_recent_av" + ) \%>\% ungroup() # We requested time windows that started 2 days before the corresponding time # values. The actual number of `time_value`s in each computation depends on @@ -221,23 +224,24 @@ archive_cases_dv_subset \%>\% # Examining characteristics of the data passed to each computation with # `all_versions=FALSE`. archive_cases_dv_subset \%>\% - group_by(geo_value) \%>\% - epix_slide( - function(x, gk, rtv) { - tibble( - time_range = if(nrow(x) == 0L) { - "0 `time_value`s" - } else { - sprintf("\%s -- \%s", min(x$time_value), max(x$time_value)) - }, - n = nrow(x), - class1 = class(x)[[1L]] - ) - }, - before = 5, all_versions = FALSE, - ref_time_values = ref_time_values, names_sep=NULL) \%>\% - ungroup() \%>\% - arrange(geo_value, time_value) + group_by(geo_value) \%>\% + epix_slide( + function(x, gk, rtv) { + tibble( + time_range = if (nrow(x) == 0L) { + "0 `time_value`s" + } else { + sprintf("\%s -- \%s", min(x$time_value), max(x$time_value)) + }, + n = nrow(x), + class1 = class(x)[[1L]] + ) + }, + before = 5, all_versions = FALSE, + ref_time_values = ref_time_values, names_sep = NULL + ) \%>\% + ungroup() \%>\% + arrange(geo_value, time_value) # --- Advanced: --- @@ -259,7 +263,7 @@ archive_cases_dv_subset \%>\% toString(min(x$DT$version)) }, versions_end = x$versions_end, - time_range = if(nrow(x$DT) == 0L) { + time_range = if (nrow(x$DT) == 0L) { "0 `time_value`s" } else { sprintf("\%s -- \%s", min(x$DT$time_value), max(x$DT$time_value)) @@ -269,7 +273,8 @@ archive_cases_dv_subset \%>\% ) }, before = 5, all_versions = TRUE, - ref_time_values = ref_time_values, names_sep=NULL) \%>\% + ref_time_values = ref_time_values, names_sep = NULL + ) \%>\% ungroup() \%>\% # Focus on one geo_value so we can better see the columns above: filter(geo_value == "ca") \%>\% diff --git a/man/group_by.epi_archive.Rd b/man/group_by.epi_archive.Rd index aee0a07b..5e867bf3 100644 --- a/man/group_by.epi_archive.Rd +++ b/man/group_by.epi_archive.Rd @@ -93,7 +93,7 @@ to \code{group_by_drop_default.default} (but there is a dedicated method for } \examples{ -grouped_archive = archive_cases_dv_subset \%>\% group_by(geo_value) +grouped_archive <- archive_cases_dv_subset \%>\% group_by(geo_value) # `print` for metadata and method listing: grouped_archive \%>\% print() @@ -102,10 +102,12 @@ grouped_archive \%>\% print() archive_cases_dv_subset \%>\% group_by(geo_value) \%>\% - epix_slide(f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = as.Date("2020-06-11") + 0:2, - new_col_name = 'case_rate_3d_av') \%>\% + epix_slide( + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = as.Date("2020-06-11") + 0:2, + new_col_name = "case_rate_3d_av" + ) 
\%>\% ungroup() # ----------------------------------------------------------------- @@ -113,34 +115,42 @@ archive_cases_dv_subset \%>\% # Advanced: some other features of dplyr grouping are implemented: library(dplyr) -toy_archive = +toy_archive <- tribble( - ~geo_value, ~age_group, ~time_value, ~version, ~value, - "us", "adult", "2000-01-01", "2000-01-02", 121, - "us", "pediatric", "2000-01-02", "2000-01-03", 5, # (addition) - "us", "adult", "2000-01-01", "2000-01-03", 125, # (revision) - "us", "adult", "2000-01-02", "2000-01-03", 130 # (addition) + ~geo_value, ~age_group, ~time_value, ~version, ~value, + "us", "adult", "2000-01-01", "2000-01-02", 121, + "us", "pediatric", "2000-01-02", "2000-01-03", 5, # (addition) + "us", "adult", "2000-01-01", "2000-01-03", 125, # (revision) + "us", "adult", "2000-01-02", "2000-01-03", 130 # (addition) + ) \%>\% + mutate( + age_group = ordered(age_group, c("pediatric", "adult")), + time_value = as.Date(time_value), + version = as.Date(version) ) \%>\% - mutate(age_group = ordered(age_group, c("pediatric", "adult")), - time_value = as.Date(time_value), - version = as.Date(version)) \%>\% as_epi_archive(other_keys = "age_group") # The following are equivalent: toy_archive \%>\% group_by(geo_value, age_group) -toy_archive \%>\% group_by(geo_value) \%>\% group_by(age_group, .add=TRUE) -grouping_cols = c("geo_value", "age_group") +toy_archive \%>\% + group_by(geo_value) \%>\% + group_by(age_group, .add = TRUE) +grouping_cols <- c("geo_value", "age_group") toy_archive \%>\% group_by(across(all_of(grouping_cols))) # And these are equivalent: toy_archive \%>\% group_by(geo_value) -toy_archive \%>\% group_by(geo_value, age_group) \%>\% ungroup(age_group) +toy_archive \%>\% + group_by(geo_value, age_group) \%>\% + ungroup(age_group) # To get the grouping variable names as a `list` of `name`s (a.k.a. symbols): -toy_archive \%>\% group_by(geo_value) \%>\% groups() +toy_archive \%>\% + group_by(geo_value) \%>\% + groups() toy_archive \%>\% - group_by(geo_value, age_group, .drop=FALSE) \%>\% + group_by(geo_value, age_group, .drop = FALSE) \%>\% epix_slide(f = ~ sum(.x$value), before = 20) \%>\% ungroup() diff --git a/man/growth_rate.Rd b/man/growth_rate.Rd index 203d5d7d..7a3f1151 100644 --- a/man/growth_rate.Rd +++ b/man/growth_rate.Rd @@ -136,12 +136,12 @@ user. \examples{ # COVID cases growth rate by state using default method relative change -jhu_csse_daily_subset \%>\% - group_by(geo_value) \%>\% - mutate(cases_gr = growth_rate(x = time_value, y = cases)) +jhu_csse_daily_subset \%>\% + group_by(geo_value) \%>\% + mutate(cases_gr = growth_rate(x = time_value, y = cases)) # Log scale, degree 4 polynomial and 6-fold cross validation -jhu_csse_daily_subset \%>\% - group_by(geo_value) \%>\% - mutate(gr_poly = growth_rate( x = time_value, y = cases, log_scale = TRUE, ord = 4, k = 6)) +jhu_csse_daily_subset \%>\% + group_by(geo_value) \%>\% + mutate(gr_poly = growth_rate(x = time_value, y = cases, log_scale = TRUE, ord = 4, k = 6)) } diff --git a/man/is_epi_archive.Rd b/man/is_epi_archive.Rd index 5b133004..2beb3a8c 100644 --- a/man/is_epi_archive.Rd +++ b/man/is_epi_archive.Rd @@ -25,9 +25,9 @@ is_epi_archive(archive_cases_dv_subset) # TRUE # By default, grouped_epi_archives don't count as epi_archives, as they may # support a different set of operations from regular `epi_archives`. This # behavior can be controlled by `grouped_okay`. 
-grouped_archive = archive_cases_dv_subset$group_by(geo_value) +grouped_archive <- archive_cases_dv_subset$group_by(geo_value) is_epi_archive(grouped_archive) # FALSE -is_epi_archive(grouped_archive, grouped_okay=TRUE) # TRUE +is_epi_archive(grouped_archive, grouped_okay = TRUE) # TRUE } \seealso{ From 82b164ce50300fe1b4b19a791472b7d57a53ebc8 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Thu, 18 Jan 2024 12:01:31 -0800 Subject: [PATCH 054/345] fixing constant check warnings --- R/archive.R | 93 +++++++++++++++++++++++++- man/epi_archive.Rd | 159 ++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 250 insertions(+), 2 deletions(-) diff --git a/R/archive.R b/R/archive.R index 1908b77c..baece5d0 100644 --- a/R/archive.R +++ b/R/archive.R @@ -216,11 +216,23 @@ epi_archive <- classname = "epi_archive", ##### public = list( + #' @field DT (`data.table`)\cr + #' the compressed datatable DT = NULL, + #' @field geo_type (`character()`)\cr + #' the resolution of the geographic label (e.g. state) geo_type = NULL, + #' @field time_type (`character()`)\cr + #' the resolution of the time column (e.g. day) time_type = NULL, + #' @field additional_metadata (`list()`)\cr + #' any extra fields, such as `other_keys` additional_metadata = NULL, + #' @field clobberable_versions_start (`Date()`)\cr + #' the earliest date that new data should overwrite existing data clobberable_versions_start = NULL, + #' @field versions_end (`Date()`)\cr + #' the latest version observed versions_end = NULL, #' @description Creates a new `epi_archive` object. #' @param x A data frame, data table, or tibble, with columns `geo_value`, @@ -679,7 +691,18 @@ epi_archive <- return(invisible(self)) }, - ##### + #' group an epi_archive + #' @description + #' group an epi_archive + #' @param ... variables or computations to group by. Computations are always + #' done on the ungrouped data frame. To perform computations on the grouped + #' data, you need to use a separate [`mutate()`] step before the + #' [`group_by()`] + #' @param .add When `FALSE`, the default, [`group_by()`] will override existing + #' groups. To add to the existing groups, use `.add = TRUE`. + #' @param .drop Drop groups formed by factor levels that don't appear in the + #' data. The default is `TRUE` except when `.data` has been previously grouped + #' with `.drop = FALSE`. See [`group_by_drop_default()`] for details. group_by = function(..., .add = FALSE, .drop = dplyr::group_by_drop_default(self)) { group_by.epi_archive(self, ..., .add = .add, .drop = .drop) }, @@ -688,6 +711,74 @@ epi_archive <- #' details. #' @importFrom data.table key #' @importFrom rlang !! !!! enquo quo_is_missing enquos is_quosure sym syms + #' @param f Function, formula, or missing; together with `...` specifies the + #' computation to slide. To "slide" means to apply a computation over a + #' sliding (a.k.a. "rolling") time window for each data group. The window is + #' determined by the `before` parameter described below. One time step is + #' typically one day or one week; see [`epi_slide`] details for more + #' explanation. If a function, `f` must take an `epi_df` with the same + #' column names as the archive's `DT`, minus the `version` column; followed + #' by a one-row tibble containing the values of the grouping variables for + #' the associated group; followed by a reference time value, usually as a + #' `Date` object; followed by any number of named arguments. 
If a formula, + #' `f` can operate directly on columns accessed via `.x$var` or `.$var`, as + #' in `~ mean (.x$var)` to compute a mean of a column `var` for each + #' group-`ref_time_value` combination. The group key can be accessed via + #' `.y` or `.group_key`, and the reference time value can be accessed via + #' `.z` or `.ref_time_value`. If `f` is missing, then `...` will specify the + #' computation. + #' @param ... Additional arguments to pass to the function or formula specified + #' via `f`. Alternatively, if `f` is missing, then `...` is interpreted as an + #' expression for tidy evaluation; in addition to referring to columns + #' directly by name, the expression has access to `.data` and `.env` pronouns + #' as in `dplyr` verbs, and can also refer to the `.group_key` and + #' `.ref_time_value`. See details of [`epi_slide`]. + #' @param before How far `before` each `ref_time_value` should the sliding + #' window extend? If provided, should be a single, non-NA, + #' [integer-compatible][vctrs::vec_cast] number of time steps. This window + #' endpoint is inclusive. For example, if `before = 7`, and one time step is + #' one day, then to produce a value for a `ref_time_value` of January 8, we + #' apply the given function or formula to data (for each group present) with + #' `time_value`s from January 1 onward, as they were reported on January 8. + #' For typical disease surveillance sources, this will not include any data + #' with a `time_value` of January 8, and, depending on the amount of reporting + #' latency, may not include January 7 or even earlier `time_value`s. (If + #' instead the archive were to hold nowcasts instead of regular surveillance + #' data, then we would indeed expect data for `time_value` January 8. If it + #' were to hold forecasts, then we would expect data for `time_value`s after + #' January 8, and the sliding window would extend as far after each + #' `ref_time_value` as needed to include all such `time_value`s.) + #' @param ref_time_values Reference time values / versions for sliding + #' computations; each element of this vector serves both as the anchor point + #' for the `time_value` window for the computation and the `max_version` + #' `as_of` which we fetch data in this window. If missing, then this will set + #' to a regularly-spaced sequence of values set to cover the range of + #' `version`s in the `DT` plus the `versions_end`; the spacing of values will + #' be guessed (using the GCD of the skips between values). + #' @param time_step Optional function used to define the meaning of one time + #' step, which if specified, overrides the default choice based on the + #' `time_value` column. This function must take a positive integer and return + #' an object of class `lubridate::period`. For example, we can use `time_step + #' = lubridate::hours` in order to set the time step to be one hour (this + #' would only be meaningful if `time_value` is of class `POSIXct`). + #' @param new_col_name String indicating the name of the new column that will + #' contain the derivative values. Default is "slide_value"; note that setting + #' `new_col_name` equal to an existing column name will overwrite this column. + #' @param as_list_col Should the slide results be held in a list column, or be + #' [unchopped][tidyr::unchop]/[unnested][tidyr::unnest]? 
Default is `FALSE`, + #' in which case a list object returned by `f` would be unnested (using + #' [`tidyr::unnest()`]), and, if the slide computations output data frames, + #' the names of the resulting columns are given by prepending `new_col_name` + #' to the names of the list elements. + #' @param names_sep String specifying the separator to use in `tidyr::unnest()` + #' when `as_list_col = FALSE`. Default is "_". Using `NULL` drops the prefix + #' from `new_col_name` entirely. + #' @param all_versions (Not the same as `all_rows` parameter of `epi_slide`.) If + #' `all_versions = TRUE`, then `f` will be passed the version history (all + #' `version <= ref_time_value`) for rows having `time_value` between + #' `ref_time_value - before` and `ref_time_value`. Otherwise, `f` will be + #' passed only the most recent `version` for every unique `time_value`. + #' Default is `FALSE`. slide = function(f, ..., before, ref_time_values, time_step, new_col_name = "slide_value", as_list_col = FALSE, names_sep = "_", diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd index a4a58645..01f8286f 100644 --- a/man/epi_archive.Rd +++ b/man/epi_archive.Rd @@ -111,6 +111,29 @@ toy_epi_archive <- tib \%>\% epi_archive$new(geo_type = "state", time_type = "day") toy_epi_archive } +\section{Public fields}{ +\if{html}{\out{
}}
+\describe{
+\item{\code{DT}}{(\code{data.table})\cr
+the compressed datatable}
+
+\item{\code{geo_type}}{(\code{character()})\cr
+the resolution of the geographic label (e.g. state)}
+
+\item{\code{time_type}}{(\code{character()})\cr
+the resolution of the time column (e.g. day)}
+
+\item{\code{additional_metadata}}{(\code{list()})\cr
+any extra fields, such as \code{other_keys}}
+
+\item{\code{clobberable_versions_start}}{(\code{Date()})\cr
+the earliest date that new data should overwrite existing data}
+
+\item{\code{versions_end}}{(\code{Date()})\cr
+the latest version observed}
+}
+\if{html}{\out{
}}
+}
\section{Methods}{
\subsection{Public methods}{
\itemize{
@@ -192,6 +215,7 @@ rows of \code{x}.}
\subsection{Details}{
Refer to the documentation for \code{\link[=as_epi_archive]{as_epi_archive()}} for
more information and examples of parameter names.
+print archive
}

\subsection{Returns}{
@@ -206,6 +230,15 @@ An \code{epi_archive} object.
\if{html}{\out{
}}\preformatted{epi_archive$print(class = TRUE, methods = TRUE)}\if{html}{\out{
}}
}
+\subsection{Arguments}{
+\if{html}{\out{
}}
+\describe{
+\item{\code{class}}{print the class label}
+
+\item{\code{methods}}{print all available methods of the archive}
+}
+\if{html}{\out{
}} +} } \if{html}{\out{
}}
\if{html}{\out{}}
@@ -217,6 +250,30 @@ See the documentation for the wrapper function \code{\link[=epix_as_of]{epix_as_
\if{html}{\out{
}}\preformatted{epi_archive$as_of(max_version, min_time_value = -Inf, all_versions = FALSE)}\if{html}{\out{
}}
}
+\subsection{Arguments}{
+\if{html}{\out{
}}
+\describe{
+\item{\code{max_version}}{Time value specifying the max version to permit in the
+snapshot. That is, the snapshot will comprise the unique rows of the
+current archive data that represent the most up-to-date signal values, as
+of the specified \code{max_version} (and whose time values are at least
+\code{min_time_value}.)}
+
+\item{\code{min_time_value}}{Time value specifying the min time value to permit in
+the snapshot. Default is \code{-Inf}, which effectively means that there is no
+minimum considered.}
+
+\item{\code{all_versions}}{If \code{all_versions = TRUE}, then the output will be in
+\code{epi_archive} format, and contain rows in the specified \code{time_value} range
+having \code{version <= max_version}. The resulting object will cover a
+potentially narrower \code{version} and \code{time_value} range than \code{x}, depending
+on user-provided arguments. Otherwise, there will be one row in the output
+for the \code{max_version} of each \code{time_value}. Default is \code{FALSE}.}
+
+\item{\code{x}}{An \code{epi_archive} object}
+}
+\if{html}{\out{
}} +} } \if{html}{\out{
}} \if{html}{\out{}} @@ -287,7 +344,8 @@ does not alias either archive's \code{DT}. \item{\code{sync}}{as in \code{\link{epix_merge}}} -\item{\code{compactify}}{as in \code{\link{epix_merge}}} +\item{\code{compactify}}{as in \code{\link{epix_merge}} +group an epi_archive} } \if{html}{\out{}} } @@ -296,6 +354,7 @@ does not alias either archive's \code{DT}. \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-epi_archive-group_by}{}}} \subsection{Method \code{group_by()}}{ +group an epi_archive \subsection{Usage}{ \if{html}{\out{
}}\preformatted{epi_archive$group_by(
  ...,
  .add = FALSE,
  .drop = dplyr::group_by_drop_default(self)
)}\if{html}{\out{
}}
}
+\subsection{Arguments}{
+\if{html}{\out{
}}
+\describe{
+\item{\code{...}}{variables or computations to group by. Computations are always
+done on the ungrouped data frame. To perform computations on the grouped
+data, you need to use a separate \code{\link[=mutate]{mutate()}} step before the
+\code{\link[=group_by]{group_by()}}}
+
+\item{\code{.add}}{When \code{FALSE}, the default, \code{\link[=group_by]{group_by()}} will override existing
+groups. To add to the existing groups, use \code{.add = TRUE}.}
+
+\item{\code{.drop}}{Drop groups formed by factor levels that don't appear in the
+data. The default is \code{TRUE} except when \code{.data} has been previously grouped
+with \code{.drop = FALSE}. See \code{\link[=group_by_drop_default]{group_by_drop_default()}} for details.}
+}
+\if{html}{\out{
}} +} } \if{html}{\out{
}}
\if{html}{\out{}}
@@ -326,6 +402,87 @@ details.
)}\if{html}{\out{}}
}

+\subsection{Arguments}{
+\if{html}{\out{
}} +\describe{ +\item{\code{f}}{Function, formula, or missing; together with \code{...} specifies the +computation to slide. To "slide" means to apply a computation over a +sliding (a.k.a. "rolling") time window for each data group. The window is +determined by the \code{before} parameter described below. One time step is +typically one day or one week; see \code{\link{epi_slide}} details for more +explanation. If a function, \code{f} must take an \code{epi_df} with the same +column names as the archive's \code{DT}, minus the \code{version} column; followed +by a one-row tibble containing the values of the grouping variables for +the associated group; followed by a reference time value, usually as a +\code{Date} object; followed by any number of named arguments. If a formula, +\code{f} can operate directly on columns accessed via \code{.x$var} or \code{.$var}, as +in \code{~ mean (.x$var)} to compute a mean of a column \code{var} for each +group-\code{ref_time_value} combination. The group key can be accessed via +\code{.y} or \code{.group_key}, and the reference time value can be accessed via +\code{.z} or \code{.ref_time_value}. If \code{f} is missing, then \code{...} will specify the +computation.} + +\item{\code{...}}{Additional arguments to pass to the function or formula specified +via \code{f}. Alternatively, if \code{f} is missing, then \code{...} is interpreted as an +expression for tidy evaluation; in addition to referring to columns +directly by name, the expression has access to \code{.data} and \code{.env} pronouns +as in \code{dplyr} verbs, and can also refer to the \code{.group_key} and +\code{.ref_time_value}. See details of \code{\link{epi_slide}}.} + +\item{\code{before}}{How far \code{before} each \code{ref_time_value} should the sliding +window extend? If provided, should be a single, non-NA, +\link[vctrs:vec_cast]{integer-compatible} number of time steps. This window +endpoint is inclusive. For example, if \code{before = 7}, and one time step is +one day, then to produce a value for a \code{ref_time_value} of January 8, we +apply the given function or formula to data (for each group present) with +\code{time_value}s from January 1 onward, as they were reported on January 8. +For typical disease surveillance sources, this will not include any data +with a \code{time_value} of January 8, and, depending on the amount of reporting +latency, may not include January 7 or even earlier \code{time_value}s. (If +instead the archive were to hold nowcasts instead of regular surveillance +data, then we would indeed expect data for \code{time_value} January 8. If it +were to hold forecasts, then we would expect data for \code{time_value}s after +January 8, and the sliding window would extend as far after each +\code{ref_time_value} as needed to include all such \code{time_value}s.)} + +\item{\code{ref_time_values}}{Reference time values / versions for sliding +computations; each element of this vector serves both as the anchor point +for the \code{time_value} window for the computation and the \code{max_version} +\code{as_of} which we fetch data in this window. If missing, then this will set +to a regularly-spaced sequence of values set to cover the range of +\code{version}s in the \code{DT} plus the \code{versions_end}; the spacing of values will +be guessed (using the GCD of the skips between values).} + +\item{\code{time_step}}{Optional function used to define the meaning of one time +step, which if specified, overrides the default choice based on the +\code{time_value} column. 
This function must take a positive integer and return +an object of class \code{lubridate::period}. For example, we can use \code{time_step = lubridate::hours} in order to set the time step to be one hour (this +would only be meaningful if \code{time_value} is of class \code{POSIXct}).} + +\item{\code{new_col_name}}{String indicating the name of the new column that will +contain the derivative values. Default is "slide_value"; note that setting +\code{new_col_name} equal to an existing column name will overwrite this column.} + +\item{\code{as_list_col}}{Should the slide results be held in a list column, or be +\link[tidyr:chop]{unchopped}/\link[tidyr:unnest]{unnested}? Default is \code{FALSE}, +in which case a list object returned by \code{f} would be unnested (using +\code{\link[tidyr:unnest]{tidyr::unnest()}}), and, if the slide computations output data frames, +the names of the resulting columns are given by prepending \code{new_col_name} +to the names of the list elements.} + +\item{\code{names_sep}}{String specifying the separator to use in \code{tidyr::unnest()} +when \code{as_list_col = FALSE}. Default is "_". Using \code{NULL} drops the prefix +from \code{new_col_name} entirely.} + +\item{\code{all_versions}}{(Not the same as \code{all_rows} parameter of \code{epi_slide}.) If +\code{all_versions = TRUE}, then \code{f} will be passed the version history (all +\code{version <= ref_time_value}) for rows having \code{time_value} between +\code{ref_time_value - before} and \code{ref_time_value}. Otherwise, \code{f} will be +passed only the most recent \code{version} for every unique \code{time_value}. +Default is \code{FALSE}.} +} +\if{html}{\out{
}} +} } \if{html}{\out{
}} \if{html}{\out{}} From becd8bd69a836d9a88aa71e907a389e6c8577d10 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Tue, 16 Jan 2024 13:52:55 -0800 Subject: [PATCH 055/345] Iterate on archive R6 docs --- R/archive.R | 34 +++++++++++++++++++++++++++------- man/epi_archive.Rd | 32 +++++++++++++++++--------------- 2 files changed, 44 insertions(+), 22 deletions(-) diff --git a/R/archive.R b/R/archive.R index baece5d0..1736bdc2 100644 --- a/R/archive.R +++ b/R/archive.R @@ -217,21 +217,22 @@ epi_archive <- ##### public = list( #' @field DT (`data.table`)\cr - #' the compressed datatable + #' the (optionally compactified) datatable DT = NULL, - #' @field geo_type (`character()`)\cr + #' @field geo_type (string)\cr #' the resolution of the geographic label (e.g. state) geo_type = NULL, - #' @field time_type (`character()`)\cr + #' @field time_type (string)\cr #' the resolution of the time column (e.g. day) time_type = NULL, - #' @field additional_metadata (`list()`)\cr + #' @field additional_metadata (named list)\cr #' any extra fields, such as `other_keys` additional_metadata = NULL, - #' @field clobberable_versions_start (`Date()`)\cr - #' the earliest date that new data should overwrite existing data + #' @field clobberable_versions_start (length-1 of same type&class as `version` column, or `NA`)\cr + #' the earliest version number that might be rewritten in the future without assigning a new version + #' date/number, or `NA` if this won't happen clobberable_versions_start = NULL, - #' @field versions_end (`Date()`)\cr + #' @field versions_end (length-1 of same type&class as `version` column)\cr #' the latest version observed versions_end = NULL, #' @description Creates a new `epi_archive` object. @@ -438,6 +439,10 @@ epi_archive <- self$clobberable_versions_start <- clobberable_versions_start self$versions_end <- versions_end }, + #' Print information about an archive + #' @param class Boolean; whether to print the class label header + #' @param methods Boolean; whether to print all available methods of + #' the archive print = function(class = TRUE, methods = TRUE) { if (class) cat("An `epi_archive` object, with metadata:\n") cat(sprintf("* %-9s = %s\n", "geo_type", self$geo_type)) @@ -500,6 +505,21 @@ epi_archive <- ##### #' @description Generates a snapshot in `epi_df` format as of a given version. #' See the documentation for the wrapper function [`epix_as_of()`] for details. + #' @param x An `epi_archive` object + #' @param max_version Version specifying the max version to permit in the + #' snapshot. That is, the snapshot will comprise the unique rows of the + #' current archive data that represent the most up-to-date signal values, as + #' of the specified `max_version` (and whose `time_value`s are at least + #' `min_time_value`). + #' @param min_time_value Time value specifying the min `time_value` to permit in + #' the snapshot. Default is `-Inf`, which effectively means that there is no + #' minimum considered. + #' @param all_versions Boolean; If `all_versions = TRUE`, then the output will be in + #' `epi_archive` format, and contain rows in the specified `time_value` range + #' having `version <= max_version`. The resulting object will cover a + #' potentially narrower `version` and `time_value` range than `x`, depending + #' on user-provided arguments. Otherwise, there will be one row in the output + #' for the `max_version` of each `time_value`. Default is `FALSE`. 
#' @importFrom data.table between key as_of = function(max_version, min_time_value = -Inf, all_versions = FALSE) { # Self max version and other keys diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd index 01f8286f..8d1913c3 100644 --- a/man/epi_archive.Rd +++ b/man/epi_archive.Rd @@ -115,21 +115,22 @@ toy_epi_archive \if{html}{\out{
}} \describe{ \item{\code{DT}}{(\code{data.table})\cr -the compressed datatable} +the (optionally compactified) datatable} -\item{\code{geo_type}}{(\code{character()})\cr +\item{\code{geo_type}}{(string)\cr the resolution of the geographic label (e.g. state)} -\item{\code{time_type}}{(\code{character()})\cr +\item{\code{time_type}}{(string)\cr the resolution of the time column (e.g. day)} -\item{\code{additional_metadata}}{(\code{list()})\cr +\item{\code{additional_metadata}}{(named list)\cr any extra fields, such as \code{other_keys}} -\item{\code{clobberable_versions_start}}{(\code{Date()})\cr -the earliest date that new data should overwrite existing data} +\item{\code{clobberable_versions_start}}{(length-1 of same type&class as \code{version} column, or \code{NA})\cr +the earliest version number that might be rewritten in the future without assigning a new version +date/number, or \code{NA} if this won't happen} -\item{\code{versions_end}}{(\code{Date()})\cr +\item{\code{versions_end}}{(length-1 of same type&class as \code{version} column)\cr the latest version observed} } \if{html}{\out{
}} @@ -215,7 +216,7 @@ rows of \code{x}.} \subsection{Details}{ Refer to the documentation for \code{\link[=as_epi_archive]{as_epi_archive()}} for more information and examples of parameter names. -print archive +Print information about an archive } \subsection{Returns}{ @@ -233,9 +234,10 @@ An \code{epi_archive} object. \subsection{Arguments}{ \if{html}{\out{
}} \describe{ -\item{\code{class}}{print the class label} +\item{\code{class}}{Boolean; whether to print the class label header} -\item{\code{methods}}{print all available methods of the archive} +\item{\code{methods}}{Boolean; whether to print all available methods of +the archive} } \if{html}{\out{
}} } @@ -253,17 +255,17 @@ See the documentation for the wrapper function \code{\link[=epix_as_of]{epix_as_ \subsection{Arguments}{ \if{html}{\out{
}} \describe{ -\item{\code{max_version}}{Time value specifying the max version to permit in the +\item{\code{max_version}}{Version specifying the max version to permit in the snapshot. That is, the snapshot will comprise the unique rows of the current archive data that represent the most up-to-date signal values, as -of the specified \code{max_version} (and whose time values are at least -\code{min_time_value}.)} +of the specified \code{max_version} (and whose \code{time_value}s are at least +\code{min_time_value}).} -\item{\code{min_time_value}}{Time value specifying the min time value to permit in +\item{\code{min_time_value}}{Time value specifying the min \code{time_value} to permit in the snapshot. Default is \code{-Inf}, which effectively means that there is no minimum considered.} -\item{\code{all_versions}}{If \code{all_versions = TRUE}, then the output will be in +\item{\code{all_versions}}{Boolean; If \code{all_versions = TRUE}, then the output will be in \code{epi_archive} format, and contain rows in the specified \code{time_value} range having \code{version <= max_version}. The resulting object will cover a potentially narrower \code{version} and \code{time_value} range than \code{x}, depending From a2905763bcd6525359ea5c06bce070369b3c83e3 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Wed, 17 Jan 2024 10:21:53 -0800 Subject: [PATCH 056/345] docs: note source --- R/archive.R | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/R/archive.R b/R/archive.R index 1736bdc2..faaf048b 100644 --- a/R/archive.R +++ b/R/archive.R @@ -504,7 +504,8 @@ epi_archive <- }, ##### #' @description Generates a snapshot in `epi_df` format as of a given version. - #' See the documentation for the wrapper function [`epix_as_of()`] for details. + #' See the documentation for the wrapper function [`epix_as_of()`] for + #' details. The parameter descriptions below are copied from there #' @param x An `epi_archive` object #' @param max_version Version specifying the max version to permit in the #' snapshot. That is, the snapshot will comprise the unique rows of the @@ -728,7 +729,7 @@ epi_archive <- }, #' @description Slides a given function over variables in an `epi_archive` #' object. See the documentation for the wrapper function [`epix_slide()`] for - #' details. + #' details. The parameter descriptions below are copied from there #' @importFrom data.table key #' @importFrom rlang !! !!! enquo quo_is_missing enquos is_quosure sym syms #' @param f Function, formula, or missing; together with `...` specifies the @@ -828,7 +829,7 @@ epi_archive <- #' Converts a data frame, data table, or tibble into an `epi_archive` #' object. See the [archive #' vignette](https://cmu-delphi.github.io/epiprocess/articles/archive.html) for -#' examples. +#' examples. The parameter descriptions below are copied from there #' #' @param x A data frame, data table, or tibble, with columns `geo_value`, #' `time_value`, `version`, and then any additional number of columns. From 0331ea85bd4854caf61e49716d9262489ce8f359 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Wed, 17 Jan 2024 10:39:45 -0800 Subject: [PATCH 057/345] docs don't run themselves --- man/as_epi_archive.Rd | 2 +- man/epi_archive.Rd | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/man/as_epi_archive.Rd b/man/as_epi_archive.Rd index d63a5faa..e3604341 100644 --- a/man/as_epi_archive.Rd +++ b/man/as_epi_archive.Rd @@ -81,7 +81,7 @@ An \code{epi_archive} object. 
\description{ Converts a data frame, data table, or tibble into an \code{epi_archive} object. See the \href{https://cmu-delphi.github.io/epiprocess/articles/archive.html}{archive vignette} for -examples. +examples. The parameter descriptions below are copied from there } \details{ This simply a wrapper around the \code{new()} method of the \code{epi_archive} diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd index 8d1913c3..f328eb44 100644 --- a/man/epi_archive.Rd +++ b/man/epi_archive.Rd @@ -247,7 +247,8 @@ the archive} \if{latex}{\out{\hypertarget{method-epi_archive-as_of}{}}} \subsection{Method \code{as_of()}}{ Generates a snapshot in \code{epi_df} format as of a given version. -See the documentation for the wrapper function \code{\link[=epix_as_of]{epix_as_of()}} for details. +See the documentation for the wrapper function \code{\link[=epix_as_of]{epix_as_of()}} for +details. The parameter descriptions below are copied from there \subsection{Usage}{ \if{html}{\out{
}}\preformatted{epi_archive$as_of(max_version, min_time_value = -Inf, all_versions = FALSE)}\if{html}{\out{
}} } @@ -389,7 +390,7 @@ with \code{.drop = FALSE}. See \code{\link[=group_by_drop_default]{group_by_drop \subsection{Method \code{slide()}}{ Slides a given function over variables in an \code{epi_archive} object. See the documentation for the wrapper function \code{\link[=epix_slide]{epix_slide()}} for -details. +details. The parameter descriptions below are copied from there \subsection{Usage}{ \if{html}{\out{
}}\preformatted{epi_archive$slide( f, From a1ccf71a40c1f9603719e479b96bcd448acded20 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Wed, 17 Jan 2024 11:25:01 -0800 Subject: [PATCH 058/345] update version number when merging PRs --- DESCRIPTION | 2 +- NEWS.md | 8 +++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 339d5681..2566dcb0 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: epiprocess Title: Tools for basic signal processing in epidemiology -Version: 0.7.0.9999 +Version: 0.7.1.9999 Authors@R: c( person("Jacob", "Bien", role = "ctb"), person("Logan", "Brooks", role = "aut"), diff --git a/NEWS.md b/NEWS.md index 5b338b47..daf4ff4c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,10 @@ -# epiprocess 0.7.0.9000 +# epiprocess 0.7.1.9999 + +* Patch `select` for grouped `epi_df`s PR #390 +* switched `other_keys` default from `NULL` to `character(0)` PR #390 +* minor doc updates PR #393 + +# epiprocess 0.7.0.9999 ## Improvements From 6d3ed0f8d3fb33a3825f2fd56aecb07b5f1a959a Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Thu, 18 Jan 2024 13:38:32 -0800 Subject: [PATCH 059/345] Minor NEWS.md tweaks --- NEWS.md | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/NEWS.md b/NEWS.md index daf4ff4c..ef3045c0 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,8 +1,20 @@ # epiprocess 0.7.1.9999 -* Patch `select` for grouped `epi_df`s PR #390 -* switched `other_keys` default from `NULL` to `character(0)` PR #390 -* minor doc updates PR #393 +Note that `epiprocess` uses the [Semantic Versioning +("semver")](https://semver.org/) scheme for all release versions, but any +inter-release development versions will include an additional ".9999" suffix. +Pre-1.0.0 numbering scheme: when making changes to a development version +0.x.y.9999, we will increment y when merging PRs, and will have increment x (and +reset y) on release. + +## Breaking changes + +* Switched `epi_df`'s `other_keys` default from `NULL` to `character(0)`; PR #390 + +## Improvements + +* `select` on grouped `epi_df`s now only drops `epi_df`ness if it makes sense; PR #390 +* Minor documentation updates; PR #393 # epiprocess 0.7.0.9999 @@ -12,10 +24,6 @@ # epiprocess 0.7.0 -Note that `epiprocess` uses the [Semantic Versioning -("semver")](https://semver.org/) scheme for all release versions, but any -inter-release development versions will include an additional ".9999" suffix. - ## Breaking changes * Changes to `epi_slide` and `epix_slide`: From 241918f33e7498091a257d3b7fd2df96df89300b Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Thu, 18 Jan 2024 14:37:29 -0800 Subject: [PATCH 060/345] versioning proposal --- .github/workflows/pkgdown_dev.yaml | 52 +++++++++++++++++++ .../{pkgdown.yaml => pkgdown_main.yaml} | 4 +- .github/workflows/require_semver_bump.yaml | 18 +++++++ DESCRIPTION | 2 +- NEWS.md | 11 ++-- _pkgdown.yml | 2 - 6 files changed, 76 insertions(+), 13 deletions(-) create mode 100644 .github/workflows/pkgdown_dev.yaml rename .github/workflows/{pkgdown.yaml => pkgdown_main.yaml} (96%) create mode 100644 .github/workflows/require_semver_bump.yaml diff --git a/.github/workflows/pkgdown_dev.yaml b/.github/workflows/pkgdown_dev.yaml new file mode 100644 index 00000000..1f33c083 --- /dev/null +++ b/.github/workflows/pkgdown_dev.yaml @@ -0,0 +1,52 @@ +# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples +# Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help +# +# Created with usethis + edited to run on PRs to dev, use API key. +on: + push: + branches: dev + pull_request: + branches: dev + release: + types: [published] + workflow_dispatch: + +name: pkgdown + +jobs: + pkgdown: + # only build docs on the main repository and not forks + if: github.repository_owner == 'cmu-delphi' + runs-on: ubuntu-latest + # Only restrict concurrency for non-PR jobs + concurrency: + group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }} + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + steps: + - uses: actions/checkout@v3 + + - uses: r-lib/actions/setup-pandoc@v2 + + - uses: r-lib/actions/setup-r@v2 + with: + use-public-rspm: true + + - uses: r-lib/actions/setup-r-dependencies@v2 + with: + extra-packages: any::pkgdown, local::. + needs: website + + - name: Build site + env: + DELPHI_EPIDATA_KEY: ${{ secrets.SECRET_EPIPROCESS_GHACTIONS_DELPHI_EPIDATA_KEY }} + run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE, override=list(PKGDOWN_DEV_MODE="devel")) + shell: Rscript {0} + + - name: Deploy to GitHub pages 🚀 + if: github.event_name != 'pull_request' + uses: JamesIves/github-pages-deploy-action@v4.4.1 + with: + clean: false + branch: gh-pages + folder: docs diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown_main.yaml similarity index 96% rename from .github/workflows/pkgdown.yaml rename to .github/workflows/pkgdown_main.yaml index 5d70a744..61d5d9c7 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown_main.yaml @@ -4,9 +4,9 @@ # Created with usethis + edited to run on PRs to dev, use API key. on: push: - branches: [main, dev] + branches: [main] pull_request: - branches: [main, dev] + branches: [main] release: types: [published] workflow_dispatch: diff --git a/.github/workflows/require_semver_bump.yaml b/.github/workflows/require_semver_bump.yaml new file mode 100644 index 00000000..46d4f700 --- /dev/null +++ b/.github/workflows/require_semver_bump.yaml @@ -0,0 +1,18 @@ +name: semver-check + +on: + pull_request: + branches: [main, dev] + +jobs: + verify: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: rayepps/require-semver-bump@v1 + env: + GITHUB_TOKEN: ${{ github.token }} + with: + file: DESCRIPTION + pattern: > + Version:\s(.+?) diff --git a/DESCRIPTION b/DESCRIPTION index 2566dcb0..d9bdb435 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: epiprocess Title: Tools for basic signal processing in epidemiology -Version: 0.7.1.9999 +Version: 0.7.1 Authors@R: c( person("Jacob", "Bien", role = "ctb"), person("Logan", "Brooks", role = "aut"), diff --git a/NEWS.md b/NEWS.md index ef3045c0..1bf9da0f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,11 +1,6 @@ -# epiprocess 0.7.1.9999 +# epiprocess 0.7.1 -Note that `epiprocess` uses the [Semantic Versioning -("semver")](https://semver.org/) scheme for all release versions, but any -inter-release development versions will include an additional ".9999" suffix. -Pre-1.0.0 numbering scheme: when making changes to a development version -0.x.y.9999, we will increment y when merging PRs, and will have increment x (and -reset y) on release. +Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.7.x will indicate PR's. ## Breaking changes @@ -16,7 +11,7 @@ reset y) on release. 
* `select` on grouped `epi_df`s now only drops `epi_df`ness if it makes sense; PR #390 * Minor documentation updates; PR #393 -# epiprocess 0.7.0.9999 +# epiprocess 0.7.0 ## Improvements diff --git a/_pkgdown.yml b/_pkgdown.yml index e6bacebf..4efc4920 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -1,8 +1,6 @@ template: bootstrap: 5 -development: - mode: auto # Colors from epipredict & epidatr, including Carnegie Red https://www.cmu.edu/brand/brand-guidelines/visual-identity/colors.html navbar: From eb9914b1db32509e7a994728434b5d685bee6b10 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Thu, 18 Jan 2024 14:48:24 -0800 Subject: [PATCH 061/345] specify main is the release --- .github/workflows/pkgdown_main.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pkgdown_main.yaml b/.github/workflows/pkgdown_main.yaml index 61d5d9c7..a72010f5 100644 --- a/.github/workflows/pkgdown_main.yaml +++ b/.github/workflows/pkgdown_main.yaml @@ -40,7 +40,7 @@ jobs: - name: Build site env: DELPHI_EPIDATA_KEY: ${{ secrets.SECRET_EPIPROCESS_GHACTIONS_DELPHI_EPIDATA_KEY }} - run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) + run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE, override=list(PKGDOWN_DEV_MODE="release")) shell: Rscript {0} - name: Deploy to GitHub pages 🚀 From 5d3c6d15f2b4ed310161efbe9715844437c30d30 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Thu, 18 Jan 2024 14:52:32 -0800 Subject: [PATCH 062/345] testing semver check --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index d9bdb435..2566dcb0 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: epiprocess Title: Tools for basic signal processing in epidemiology -Version: 0.7.1 +Version: 0.7.1.9999 Authors@R: c( person("Jacob", "Bien", role = "ctb"), person("Logan", "Brooks", role = "aut"), From 7c3b48b49b4992d28c721259ffa89abe809ea783 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Thu, 18 Jan 2024 14:53:24 -0800 Subject: [PATCH 063/345] semver works, actual version --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 2566dcb0..d9bdb435 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: epiprocess Title: Tools for basic signal processing in epidemiology -Version: 0.7.1.9999 +Version: 0.7.1 Authors@R: c( person("Jacob", "Bien", role = "ctb"), person("Logan", "Brooks", role = "aut"), From 5c6f2da1f73017b4691d0a79aed9522042c40643 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Thu, 18 Jan 2024 14:57:55 -0800 Subject: [PATCH 064/345] semver correct regex --- .github/workflows/require_semver_bump.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/require_semver_bump.yaml b/.github/workflows/require_semver_bump.yaml index 46d4f700..e1770e14 100644 --- a/.github/workflows/require_semver_bump.yaml +++ b/.github/workflows/require_semver_bump.yaml @@ -15,4 +15,4 @@ jobs: with: file: DESCRIPTION pattern: > - Version:\s(.+?) 
+ Version:\s(.+?\..+?\..+?.*) From aaab8c248cde10293b243c067f39bb3bdd82d92b Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Thu, 18 Jan 2024 15:01:10 -0800 Subject: [PATCH 065/345] only 3 places for the action --- .github/workflows/require_semver_bump.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/require_semver_bump.yaml b/.github/workflows/require_semver_bump.yaml index e1770e14..bdf20f0d 100644 --- a/.github/workflows/require_semver_bump.yaml +++ b/.github/workflows/require_semver_bump.yaml @@ -15,4 +15,4 @@ jobs: with: file: DESCRIPTION pattern: > - Version:\s(.+?\..+?\..+?.*) + Version:\s(.+?\..+?\..+?) From f1d6bc1c2a728366eb59f990e8cd6e83854bc77f Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Thu, 18 Jan 2024 15:02:25 -0800 Subject: [PATCH 066/345] version bump for PR --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index d9bdb435..15b7757f 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: epiprocess Title: Tools for basic signal processing in epidemiology -Version: 0.7.1 +Version: 0.7.2 Authors@R: c( person("Jacob", "Bien", role = "ctb"), person("Logan", "Brooks", role = "aut"), From 0e1fe51956f54195521e4318de33ec0427b42d59 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Thu, 18 Jan 2024 16:10:49 -0800 Subject: [PATCH 067/345] Remove unnecessary `in_range` in `epi_slide` implementation `slider::hop_index` doesn't require starts & stops to be in `.i`, and we aren't actually doing that anyway. Plus comment to help clarify that we're passing the group key to comps via `...`. --- R/slide.R | 21 ++++++++------------- R/utils.R | 4 ---- tests/testthat/test-utils.R | 6 ------ 3 files changed, 8 insertions(+), 23 deletions(-) diff --git a/R/slide.R b/R/slide.R index c7493fd3..cbc01f91 100644 --- a/R/slide.R +++ b/R/slide.R @@ -230,17 +230,12 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, after <- time_step(after) } - # Do set up to let us recover `ref_time_value`s later. - min_ref_time_values <- ref_time_values - before - min_ref_time_values_not_in_x <- min_ref_time_values[!(min_ref_time_values %in% unique(x$time_value))] - # Arrange by increasing time_value x <- arrange(x, time_value) # Now set up starts and stops for sliding/hopping - time_range <- range(unique(c(x$time_value, min_ref_time_values_not_in_x))) - starts <- in_range(ref_time_values - before, time_range) - stops <- in_range(ref_time_values + after, time_range) + starts <- ref_time_values - before + stops <- ref_time_values + after if (length(starts) == 0 || length(stops) == 0) { Abort("The starting and/or stopping times for sliding are out of bounds with respect to the range of times in your data. Check your settings for ref_time_values and align (and before, if specified).") @@ -251,7 +246,8 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, # Computation for one group, all time values slide_one_grp <- function(.data_group, - f_factory, ..., + f_factory, + ..., # group key + any "real" ... 
args starts, stops, time_values, @@ -260,13 +256,13 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, # Figure out which reference time values appear in the data group in the # first place (we need to do this because it could differ based on the # group, hence the setup/checks for the reference time values based on all - # the data could still be off) + # the data could still be off): o <- time_values %in% .data_group$time_value starts <- starts[o] stops <- stops[o] time_values <- time_values[o] - f <- f_factory(starts) + f <- f_factory(time_values) # Compute the slide values slide_values_list <- slider::hop_index( @@ -353,12 +349,11 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, # Create a wrapper that calculates and passes `.ref_time_value` to the # computation. `i` is contained in the `f_wrapper_factory` environment such # that when called within `slide_one_grp` `i` is reset for every group. - f_wrapper_factory <- function(starts) { + f_wrapper_factory <- function(kept_ref_time_values) { # Use `i` to advance through list of start dates. i <- 1L - starts <- starts + before f_wrapper <- function(.x, .group_key, ...) { - .ref_time_value <- starts[[i]] + .ref_time_value <- kept_ref_time_values[[i]] i <<- i + 1L f(.x, .group_key, .ref_time_value, ...) } diff --git a/R/utils.R b/R/utils.R index 9cc707a6..52a33ad2 100644 --- a/R/utils.R +++ b/R/utils.R @@ -361,10 +361,6 @@ as_slide_computation <- function(f, ...) { ########## -in_range <- function(x, rng) pmin(pmax(x, rng[1]), rng[2]) - -########## - Min <- function(x) min(x, na.rm = TRUE) Max <- function(x) max(x, na.rm = TRUE) Sum <- function(x) sum(x, na.rm = TRUE) diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R index 2319d045..4ad692a0 100644 --- a/tests/testthat/test-utils.R +++ b/tests/testthat/test-utils.R @@ -7,12 +7,6 @@ test_that("Abort and Warn work", { expect_warning(Warn("warn")) }) -test_that("in_range works", { - expect_equal(in_range(1, c(2, 4)), 2) - expect_equal(in_range(3, c(2, 4)), 3) - expect_equal(in_range(5, c(2, 4)), 4) -}) - test_that("new summarizing functions work", { x <- c(3, 4, 5, 9, NA) expect_equal(Min(x), 3) From 51edc5ac5ad7d9fe5ea3253f5b19c472800ef823 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Thu, 18 Jan 2024 16:21:46 -0800 Subject: [PATCH 068/345] Rename `time_values` in `epi_slide` internals to clarify usage Rename `time_values` to `ref_time_values` or `kept_ref_time_values` depending on the context. Does not change the interface of `epi_slide`. --- R/slide.R | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/R/slide.R b/R/slide.R index cbc01f91..9b9ad0fb 100644 --- a/R/slide.R +++ b/R/slide.R @@ -250,19 +250,19 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, ..., # group key + any "real" ... 
args starts, stops, - time_values, + ref_time_values, all_rows, new_col) { # Figure out which reference time values appear in the data group in the # first place (we need to do this because it could differ based on the # group, hence the setup/checks for the reference time values based on all # the data could still be off): - o <- time_values %in% .data_group$time_value + o <- ref_time_values %in% .data_group$time_value starts <- starts[o] stops <- stops[o] - time_values <- time_values[o] + kept_ref_time_values <- ref_time_values[o] - f <- f_factory(time_values) + f <- f_factory(kept_ref_time_values) # Compute the slide values slide_values_list <- slider::hop_index( @@ -275,13 +275,11 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, # Now figure out which rows in the data group are in the reference time # values; this will be useful for all sorts of checks that follow - o <- .data_group$time_value %in% time_values + o <- .data_group$time_value %in% kept_ref_time_values num_ref_rows <- sum(o) - # Count the number of appearances of each reference time value (these - # appearances should all be real for now, but if we allow ref time values - # outside of .data_group's time values): - counts <- dplyr::filter(.data_group, .data$time_value %in% time_values) %>% + # Count the number of appearances of each kept reference time value. + counts <- dplyr::filter(.data_group, .data$time_value %in% kept_ref_time_values) %>% dplyr::count(.data$time_value) %>% dplyr::pull(n) @@ -363,7 +361,7 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, f_factory = f_wrapper_factory, ..., starts = starts, stops = stops, - time_values = ref_time_values, + ref_time_values = ref_time_values, all_rows = all_rows, new_col = new_col, .keep = FALSE From 2156ff7da444a00d40ffb5b39a6ab01f06cd39cc Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Thu, 18 Jan 2024 16:32:03 -0800 Subject: [PATCH 069/345] Try to separate out group key passing from slide ... forwarding --- R/slide.R | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/R/slide.R b/R/slide.R index 9b9ad0fb..aeb22eb2 100644 --- a/R/slide.R +++ b/R/slide.R @@ -246,8 +246,9 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, # Computation for one group, all time values slide_one_grp <- function(.data_group, + .group_key, # see `?group_modify` + ..., # `...` to `epi_slide` forwarded here f_factory, - ..., # group key + any "real" ... args starts, stops, ref_time_values, @@ -268,9 +269,10 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, slide_values_list <- slider::hop_index( .x = .data_group, .i = .data_group$time_value, - .f = f, ..., .starts = starts, - .stops = stops + .stops = stops, + .f = f, + .group_key, ... ) # Now figure out which rows in the data group are in the reference time @@ -358,7 +360,8 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, return(f_wrapper) } x <- group_modify(x, slide_one_grp, - f_factory = f_wrapper_factory, ..., + ..., + f_factory = f_wrapper_factory, starts = starts, stops = stops, ref_time_values = ref_time_values, From 8eabb4a06b0e284d1431560d89be6041492a8ad6 Mon Sep 17 00:00:00 2001 From: "Logan C. 
Brooks" Date: Thu, 18 Jan 2024 16:33:17 -0800 Subject: [PATCH 070/345] Avoid linter warning for `dplyr::pull(n)` --- R/slide.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/slide.R b/R/slide.R index aeb22eb2..cedcd354 100644 --- a/R/slide.R +++ b/R/slide.R @@ -283,7 +283,7 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, # Count the number of appearances of each kept reference time value. counts <- dplyr::filter(.data_group, .data$time_value %in% kept_ref_time_values) %>% dplyr::count(.data$time_value) %>% - dplyr::pull(n) + `[[`("n") if (!all(purrr::map_lgl(slide_values_list, is.atomic)) && !all(purrr::map_lgl(slide_values_list, is.data.frame))) { From a824f6224032b2dbfbdd8990ce008ef76a79836b Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Thu, 18 Jan 2024 16:39:16 -0800 Subject: [PATCH 071/345] Remove redundant validation of `starts` and `stops` We checked them for nonzero length when we filtered `ref_time_values` down to those present in the `x$time_value`, but now we require `all(ref_time_values %in% unique(x$time_value))`. --- R/slide.R | 4 ---- 1 file changed, 4 deletions(-) diff --git a/R/slide.R b/R/slide.R index cedcd354..e2c0bf55 100644 --- a/R/slide.R +++ b/R/slide.R @@ -237,10 +237,6 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, starts <- ref_time_values - before stops <- ref_time_values + after - if (length(starts) == 0 || length(stops) == 0) { - Abort("The starting and/or stopping times for sliding are out of bounds with respect to the range of times in your data. Check your settings for ref_time_values and align (and before, if specified).") - } - # Symbolize new column name new_col <- sym(new_col_name) From c59b4840998a144a2071847a931b08fcb162dc5b Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Fri, 19 Jan 2024 12:09:39 -0500 Subject: [PATCH 072/345] remove `.real` comment in tests --- tests/testthat/test-epi_slide.R | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/testthat/test-epi_slide.R b/tests/testthat/test-epi_slide.R index cd38dc97..8137cf19 100644 --- a/tests/testthat/test-epi_slide.R +++ b/tests/testthat/test-epi_slide.R @@ -381,8 +381,6 @@ test_that("`epi_slide` doesn't decay date output", { }) test_that("basic grouped epi_slide computation produces expected output", { - # Also checks that we correctly remove extra rows and columns (`.real`) used - # to recover `ref_time_value`s. expected_output <- dplyr::bind_rows( dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value = cumsum(11:15)), dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value = cumsum(-(1:5))) From 21204ae250c348eeb44b268f551e2c2e6a586249 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Fri, 19 Jan 2024 12:23:09 -0500 Subject: [PATCH 073/345] changelog --- NEWS.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/NEWS.md b/NEWS.md index 5b338b47..51499b03 100644 --- a/NEWS.md +++ b/NEWS.md @@ -3,6 +3,9 @@ ## Improvements * Updated vignettes for compatibility with epidatr 1.0.0 in PR #377. +* `epi_slide` computations are now 2-4 times faster after changing how + reference time values, made accessible within sliding functions, are + calculated (#397). 
# epiprocess 0.7.0 From ada710e5d23ae4e2f9955afac3cd0368ab0fde87 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 19 Jan 2024 11:35:52 -0800 Subject: [PATCH 074/345] regex that actually matches more than 1 digit --- .github/workflows/require_semver_bump.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/require_semver_bump.yaml b/.github/workflows/require_semver_bump.yaml index bdf20f0d..548ceb1e 100644 --- a/.github/workflows/require_semver_bump.yaml +++ b/.github/workflows/require_semver_bump.yaml @@ -15,4 +15,4 @@ jobs: with: file: DESCRIPTION pattern: > - Version:\s(.+?\..+?\..+?) + Version:\s(\d+\.\d+\.\d+\d*) From becc554277869496bf600bb9e332b1c20cc9cc8d Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Fri, 19 Jan 2024 11:43:15 -0800 Subject: [PATCH 075/345] style: run styler --- R/autoplot.R | 44 +++++++++++++++++++++++--------------------- R/utils-arg.R | 3 +-- R/utils.R | 2 +- 3 files changed, 25 insertions(+), 24 deletions(-) diff --git a/R/autoplot.R b/R/autoplot.R index 8263818d..6c6bf5eb 100644 --- a/R/autoplot.R +++ b/R/autoplot.R @@ -28,8 +28,10 @@ #' @examples #' autoplot(jhu_csse_daily_subset, cases, death_rate_7d_av) #' autoplot(jhu_csse_daily_subset, case_rate_7d_av, .facet_by = "geo_value") -#' autoplot(jhu_csse_daily_subset, case_rate_7d_av, .color_by = "none", -#' .facet_by = "geo_value") +#' autoplot(jhu_csse_daily_subset, case_rate_7d_av, +#' .color_by = "none", +#' .facet_by = "geo_value" +#' ) autoplot.epi_df <- function( object, ..., .color_by = c("all_keys", "geo_value", "other_keys", ".response", "all", "none"), @@ -38,15 +40,15 @@ autoplot.epi_df <- function( .max_facets = Inf) { .color_by <- match.arg(.color_by) .facet_by <- match.arg(.facet_by) - + arg_is_scalar(.max_facets) if (is.finite(.max_facets)) arg_is_int(.max_facets) arg_is_chr_scalar(.base_color) - + ek <- epi_keys(object) mv <- setdiff(names(object), ek) ek <- kill_time_value(ek) - + # --- check for numeric variables allowed <- purrr::map_lgl(object[mv], is.numeric) if (length(allowed) == 0) { @@ -72,7 +74,7 @@ autoplot.epi_df <- function( vars <- vars[ok] } } - + # --- create a viable df to plot pos <- tidyselect::eval_select( rlang::expr(c("time_value", ek, names(vars))), object @@ -89,25 +91,25 @@ autoplot.epi_df <- function( all_keys <- rlang::syms(as.list(ek)) other_keys <- rlang::syms(as.list(setdiff(ek, "geo_value"))) all_avail <- rlang::syms(as.list(c(ek, ".response_name"))) - + object <- object %>% dplyr::mutate( .colours = switch(.color_by, - all_keys = interaction(!!!all_keys, sep = "/"), - geo_value = geo_value, - other_keys = interaction(!!!other_keys, sep = "/"), - all = interaction(!!!all_avail, sep = "/"), - NULL + all_keys = interaction(!!!all_keys, sep = "/"), + geo_value = geo_value, + other_keys = interaction(!!!other_keys, sep = "/"), + all = interaction(!!!all_avail, sep = "/"), + NULL ), .facets = switch(.facet_by, - all_keys = interaction(!!!all_keys, sep = "/"), - geo_value = as.factor(geo_value), - other_keys = interaction(!!!other_keys, sep = "/"), - all = interaction(!!!all_avail, sep = "/"), - NULL + all_keys = interaction(!!!all_keys, sep = "/"), + geo_value = as.factor(geo_value), + other_keys = interaction(!!!other_keys, sep = "/"), + all = interaction(!!!all_avail, sep = "/"), + NULL ) ) - + if (.max_facets < Inf && ".facets" %in% names(object)) { n_facets <- nlevels(object$.facets) if (n_facets > .max_facets) { @@ -119,10 +121,10 @@ autoplot.epi_df <- function( } } } - + p <- ggplot2::ggplot(object, ggplot2::aes(x = 
.data$time_value)) + ggplot2::theme_bw() - + if (".colours" %in% names(object)) { p <- p + ggplot2::geom_line( ggplot2::aes(y = .data$.response, colour = .data$.colours), @@ -139,7 +141,7 @@ autoplot.epi_df <- function( p <- p + ggplot2::geom_line(ggplot2::aes(y = .data$.response), color = .base_color) } - + if (".facets" %in% names(object)) { p <- p + ggplot2::facet_wrap(~.facets, scales = "free_y") + ggplot2::ylab(names(vars)) diff --git a/R/utils-arg.R b/R/utils-arg.R index b5700b6d..dca21646 100644 --- a/R/utils-arg.R +++ b/R/utils-arg.R @@ -2,7 +2,7 @@ handle_arg_list <- function(..., tests) { values <- list(...) names <- eval(substitute(alist(...))) names <- purrr::map(names, deparse) - + purrr::walk2(names, values, tests) } @@ -60,4 +60,3 @@ arg_is_chr_scalar <- function(..., allow_null = FALSE, allow_na = FALSE) { arg_is_chr(..., allow_null = allow_null, allow_na = allow_na) arg_is_scalar(..., allow_null = allow_null, allow_na = allow_na) } - diff --git a/R/utils.R b/R/utils.R index 787da5bd..f0a586d7 100644 --- a/R/utils.R +++ b/R/utils.R @@ -481,7 +481,7 @@ quiet <- function(x) { ########## # Create an auto-named list -enlist = function(...) { +enlist <- function(...) { # converted to thin wrapper around rlang::dots_list( ..., From ede6c9c92440eb2c3f9659a3f112880e8807e482 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Fri, 19 Jan 2024 11:56:01 -0800 Subject: [PATCH 076/345] rename epi_keys to key_colnames --- R/epi_keys.R | 42 ------------------------------------------ R/key_colnames.R | 44 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 42 deletions(-) delete mode 100644 R/epi_keys.R create mode 100644 R/key_colnames.R diff --git a/R/epi_keys.R b/R/epi_keys.R deleted file mode 100644 index 324b4eff..00000000 --- a/R/epi_keys.R +++ /dev/null @@ -1,42 +0,0 @@ -#' Grab any keys associated to an epi_df -#' -#' @param x a data.frame, tibble, or epi_df -#' @param ... additional arguments passed on to methods -#' -#' @return If an `epi_df`, this returns all "keys". Otherwise `NULL` -#' @keywords internal -#' @export -epi_keys <- function(x, ...) { - UseMethod("epi_keys") -} - -#' @export -epi_keys.default <- function(x, ...) { - character(0L) -} - -#' @export -epi_keys.data.frame <- function(x, other_keys = character(0L), ...) { - arg_is_chr(other_keys, allow_empty = TRUE) - nm <- c("time_value", "geo_value", other_keys) - intersect(nm, names(x)) -} - -#' @export -epi_keys.epi_df <- function(x, ...) { - c("time_value", "geo_value", attr(x, "metadata")$other_keys) -} - -#' @export -epi_keys.epi_archive <- function(x, ...) { - c("time_value", "geo_value", attr(x, "metadata")$other_keys) -} - -kill_time_value <- function(v) { - arg_is_chr(v) - v[v != "time_value"] -} - -epi_keys_only <- function(x, ...) { - kill_time_value(epi_keys(x, ...)) -} diff --git a/R/key_colnames.R b/R/key_colnames.R new file mode 100644 index 00000000..91be7ab6 --- /dev/null +++ b/R/key_colnames.R @@ -0,0 +1,44 @@ +#' Grab any keys associated to an epi_df +#' +#' @param x a data.frame, tibble, or epi_df +#' @param ... additional arguments passed on to methods +#' +#' @return If an `epi_df`, this returns all "keys". Otherwise `NULL` +#' @keywords internal +#' @export +key_colnames <- function(x, ...) { + UseMethod("key_colnames") +} + +#' @export +key_colnames.default <- function(x, ...) { + character(0L) +} + +#' @export +key_colnames.data.frame <- function(x, other_keys = character(0L), ...) 
{ + arg_is_chr(other_keys, allow_empty = TRUE) + nm <- c("time_value", "geo_value", other_keys) + intersect(nm, colnames(x)) +} + +#' @export +key_colnames.epi_df <- function(x, ...) { + other_keys <- attr(x, "metadata")$other_keys + c("time_value", "geo_value", other_keys) +} + +#' @export +key_colnames.epi_archive <- function(x, ...) { + other_keys <- attr(x, "metadata")$other_keys + c("time_value", "geo_value", other_keys) +} + +kill_time_value <- function(v) { + arg_is_chr(v) + v[v != "time_value"] +} + +key_colnames_only <- function(x, ...) { + kill_time_value(key_colnames(x, ...)) +} From 187810b82fc6f4890ebdb271cd418456e9e30a66 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Fri, 19 Jan 2024 15:11:38 -0500 Subject: [PATCH 077/345] bump version --- DESCRIPTION | 2 +- NEWS.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 339d5681..2566dcb0 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: epiprocess Title: Tools for basic signal processing in epidemiology -Version: 0.7.0.9999 +Version: 0.7.1.9999 Authors@R: c( person("Jacob", "Bien", role = "ctb"), person("Logan", "Brooks", role = "aut"), diff --git a/NEWS.md b/NEWS.md index 51499b03..7ca3e2b3 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,4 @@ -# epiprocess 0.7.0.9000 +# epiprocess 0.7.1.9000 ## Improvements From abd24d3f98a20c8795d35c8d9918a95f5338d6f8 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Fri, 19 Jan 2024 16:59:42 -0500 Subject: [PATCH 078/345] bump version again --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 2566dcb0..8cac422e 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: epiprocess Title: Tools for basic signal processing in epidemiology -Version: 0.7.1.9999 +Version: 0.7.2.9999 Authors@R: c( person("Jacob", "Bien", role = "ctb"), person("Logan", "Brooks", role = "aut"), From 989ee8c84994e97fc3aa0c4322adecccd8a6148c Mon Sep 17 00:00:00 2001 From: "Daniel J. 
McDonald" Date: Fri, 19 Jan 2024 15:23:05 -0800 Subject: [PATCH 079/345] rename based on review --- DESCRIPTION | 2 +- NAMESPACE | 10 ++--- R/autoplot.R | 18 ++++---- man/as_epi_archive.Rd | 56 +++++++++++++++---------- man/as_epi_df.Rd | 44 ++++++++++++-------- man/autoplot.epi_df.Rd | 9 +++- man/detect_outlr.Rd | 62 +++++++++++++++++----------- man/detect_outlr_rm.Rd | 7 ++-- man/detect_outlr_stl.Rd | 7 ++-- man/epi_archive.Rd | 18 ++++---- man/epi_cor.Rd | 48 ++++++++++++--------- man/epi_slide.Rd | 22 ++++++---- man/epix_as_of.Rd | 25 +++++++---- man/epix_merge.Rd | 10 ++--- man/epix_slide.Rd | 55 +++++++++++++----------- man/group_by.epi_archive.Rd | 48 ++++++++++++--------- man/growth_rate.Rd | 12 +++--- man/is_epi_archive.Rd | 4 +- man/{epi_keys.Rd => key_colnames.Rd} | 8 ++-- 19 files changed, 273 insertions(+), 192 deletions(-) rename man/{epi_keys.Rd => key_colnames.Rd} (76%) diff --git a/DESCRIPTION b/DESCRIPTION index 3ce7b860..211c5212 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -76,12 +76,12 @@ Collate: 'correlation.R' 'data.R' 'epi_df.R' - 'epi_keys.R' 'epiprocess.R' 'group_by_epi_df_methods.R' 'methods-epi_archive.R' 'grouped_epi_archive.R' 'growth_rate.R' + 'key_colnames.R' 'methods-epi_df.R' 'outliers.R' 'reexports.R' diff --git a/NAMESPACE b/NAMESPACE index 9b2134ae..a843813d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -13,10 +13,6 @@ S3method(dplyr_col_modify,col_modify_recorder_df) S3method(dplyr_col_modify,epi_df) S3method(dplyr_reconstruct,epi_df) S3method(dplyr_row_slice,epi_df) -S3method(epi_keys,data.frame) -S3method(epi_keys,default) -S3method(epi_keys,epi_archive) -S3method(epi_keys,epi_df) S3method(epix_truncate_versions_after,epi_archive) S3method(epix_truncate_versions_after,grouped_epi_archive) S3method(group_by,epi_archive) @@ -25,6 +21,10 @@ S3method(group_by,grouped_epi_archive) S3method(group_by_drop_default,grouped_epi_archive) S3method(group_modify,epi_df) S3method(groups,grouped_epi_archive) +S3method(key_colnames,data.frame) +S3method(key_colnames,default) +S3method(key_colnames,epi_archive) +S3method(key_colnames,epi_df) S3method(next_after,Date) S3method(next_after,integer) S3method(print,epi_df) @@ -45,7 +45,6 @@ export(detect_outlr_rm) export(detect_outlr_stl) export(epi_archive) export(epi_cor) -export(epi_keys) export(epi_slide) export(epix_as_of) export(epix_merge) @@ -58,6 +57,7 @@ export(growth_rate) export(is_epi_archive) export(is_epi_df) export(is_grouped_epi_archive) +export(key_colnames) export(max_version_with_row_in) export(mutate) export(new_epi_df) diff --git a/R/autoplot.R b/R/autoplot.R index 6c6bf5eb..bcd0b54d 100644 --- a/R/autoplot.R +++ b/R/autoplot.R @@ -22,7 +22,7 @@ #' @param .max_facets Cut down of the number of facets displayed. Especially #' useful for testing when there are many `geo_value`'s or keys. 
#' -#' @return +#' @return A ggplot object #' @export #' #' @examples @@ -45,12 +45,12 @@ autoplot.epi_df <- function( if (is.finite(.max_facets)) arg_is_int(.max_facets) arg_is_chr_scalar(.base_color) - ek <- epi_keys(object) - mv <- setdiff(names(object), ek) - ek <- kill_time_value(ek) + key_cols <- key_colnames(object) + non_key_cols <- setdiff(names(object), key_cols) + geo_and_other_keys <- kill_time_value(key_cols) # --- check for numeric variables - allowed <- purrr::map_lgl(object[mv], is.numeric) + allowed <- purrr::map_lgl(object[non_key_cols], is.numeric) if (length(allowed) == 0) { cli::cli_abort("No numeric variables were available to plot automatically.") } @@ -77,7 +77,7 @@ autoplot.epi_df <- function( # --- create a viable df to plot pos <- tidyselect::eval_select( - rlang::expr(c("time_value", ek, names(vars))), object + rlang::expr(c("time_value", geo_and_other_keys, names(vars))), object ) if (length(vars) > 1) { object <- tidyr::pivot_longer( @@ -88,9 +88,9 @@ autoplot.epi_df <- function( } else { object <- dplyr::rename(object[pos], .response := !!names(vars)) } - all_keys <- rlang::syms(as.list(ek)) - other_keys <- rlang::syms(as.list(setdiff(ek, "geo_value"))) - all_avail <- rlang::syms(as.list(c(ek, ".response_name"))) + all_keys <- rlang::syms(as.list(geo_and_other_keys)) + other_keys <- rlang::syms(as.list(setdiff(geo_and_other_keys, "geo_value"))) + all_avail <- rlang::syms(as.list(c(geo_and_other_keys, ".response_name"))) object <- object %>% dplyr::mutate( diff --git a/man/as_epi_archive.Rd b/man/as_epi_archive.Rd index e3604341..93b10736 100644 --- a/man/as_epi_archive.Rd +++ b/man/as_epi_archive.Rd @@ -99,32 +99,44 @@ would be equivalent to: # Simple ex. with necessary keys tib <- tibble::tibble( geo_value = rep(c("ca", "hi"), each = 5), - time_value = rep(seq(as.Date("2020-01-01"), - by = 1, length.out = 5), times = 2), - version = rep(seq(as.Date("2020-01-02"), - by = 1, length.out = 5), times = 2), + time_value = rep(seq(as.Date("2020-01-01"), + by = 1, length.out = 5 + ), times = 2), + version = rep(seq(as.Date("2020-01-02"), + by = 1, length.out = 5 + ), times = 2), value = rnorm(10, mean = 2, sd = 1) ) -toy_epi_archive <- tib \%>\% as_epi_archive(geo_type = "state", - time_type = "day") -toy_epi_archive +toy_epi_archive <- tib \%>\% as_epi_archive( + geo_type = "state", + time_type = "day" +) +toy_epi_archive # Ex. 
with an additional key for county -df <- data.frame (geo_value = c(replicate(2, "ca"), replicate(2, "fl")), - county = c(1, 3, 2, 5), - time_value = c("2020-06-01", - "2020-06-02", - "2020-06-01", - "2020-06-02"), - version = c("2020-06-02", - "2020-06-03", - "2020-06-02", - "2020-06-03"), - cases = c(1, 2, 3, 4), - cases_rate = c(0.01, 0.02, 0.01, 0.05)) +df <- data.frame( + geo_value = c(replicate(2, "ca"), replicate(2, "fl")), + county = c(1, 3, 2, 5), + time_value = c( + "2020-06-01", + "2020-06-02", + "2020-06-01", + "2020-06-02" + ), + version = c( + "2020-06-02", + "2020-06-03", + "2020-06-02", + "2020-06-03" + ), + cases = c(1, 2, 3, 4), + cases_rate = c(0.01, 0.02, 0.01, 0.05) +) -x <- df \%>\% as_epi_archive(geo_type = "state", - time_type = "day", - other_keys = "county") +x <- df \%>\% as_epi_archive( + geo_type = "state", + time_type = "day", + other_keys = "county" +) } diff --git a/man/as_epi_df.Rd b/man/as_epi_df.Rd index 851aed7e..40c0a1c5 100644 --- a/man/as_epi_df.Rd +++ b/man/as_epi_df.Rd @@ -76,19 +76,22 @@ the \code{tbl_ts} class is dropped, and any key variables (other than ex1_input <- tibble::tibble( geo_value = rep(c("ca", "fl", "pa"), each = 3), - county_code = c("06059","06061","06067", - "12111","12113","12117", - "42101", "42103","42105"), + county_code = c( + "06059", "06061", "06067", + "12111", "12113", "12117", + "42101", "42103", "42105" + ), time_value = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"), - by = "day"), length.out = length(geo_value)), + by = "day" + ), length.out = length(geo_value)), value = 1:length(geo_value) + 0.01 * rnorm(length(geo_value)) -) \%>\% +) \%>\% tsibble::as_tsibble(index = time_value, key = c(geo_value, county_code)) # The `other_keys` metadata (`"county_code"` in this case) is automatically # inferred from the `tsibble`'s `key`: ex1 <- as_epi_df(x = ex1_input, geo_type = "state", time_type = "day", as_of = "2020-06-03") -attr(ex1,"metadata")[["other_keys"]] +attr(ex1, "metadata")[["other_keys"]] @@ -102,17 +105,21 @@ ex2_input <- tibble::tibble( state = rep(c("ca", "fl", "pa"), each = 3), # misnamed pol = rep(c("blue", "swing", "swing"), each = 3), # extra key reported_date = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"), - by = "day"), length.out = length(state)), # misnamed + by = "day" + ), length.out = length(state)), # misnamed value = 1:length(state) + 0.01 * rnorm(length(state)) -) +) print(ex2_input) -ex2 <- ex2_input \%>\% dplyr::rename(geo_value = state, time_value = reported_date) \%>\% - as_epi_df(geo_type = "state", as_of = "2020-06-03", - additional_metadata = list(other_keys = "pol")) +ex2 <- ex2_input \%>\% + dplyr::rename(geo_value = state, time_value = reported_date) \%>\% + as_epi_df( + geo_type = "state", as_of = "2020-06-03", + additional_metadata = list(other_keys = "pol") + ) -attr(ex2,"metadata") +attr(ex2, "metadata") @@ -120,17 +127,18 @@ attr(ex2,"metadata") ex3_input <- jhu_csse_county_level_subset \%>\% dplyr::filter(time_value > "2021-12-01", state_name == "Massachusetts") \%>\% - dplyr::slice_tail(n = 6) + dplyr::slice_tail(n = 6) -ex3 <- ex3_input \%>\% +ex3 <- ex3_input \%>\% tsibble::as_tsibble() \%>\% # needed to add the additional metadata # add 2 extra keys dplyr::mutate( - state = rep("MA",6), - pol = rep(c("blue", "swing", "swing"), each = 2)) \%>\% - # the 2 extra keys we added have to be specified in the other_keys + state = rep("MA", 6), + pol = rep(c("blue", "swing", "swing"), each = 2) + ) \%>\% + # the 2 extra keys we added have to be specified in the other_keys # 
component of additional_metadata. as_epi_df(additional_metadata = list(other_keys = c("state", "pol"))) -attr(ex3,"metadata") +attr(ex3, "metadata") } diff --git a/man/autoplot.epi_df.Rd b/man/autoplot.epi_df.Rd index 134a83fe..68aef0c4 100644 --- a/man/autoplot.epi_df.Rd +++ b/man/autoplot.epi_df.Rd @@ -43,12 +43,17 @@ share the same color line.} \item{.max_facets}{Cut down of the number of facets displayed. Especially useful for testing when there are many \code{geo_value}'s or keys.} } +\value{ +A ggplot object +} \description{ Automatically plot an epi_df } \examples{ autoplot(jhu_csse_daily_subset, cases, death_rate_7d_av) autoplot(jhu_csse_daily_subset, case_rate_7d_av, .facet_by = "geo_value") -autoplot(jhu_csse_daily_subset, case_rate_7d_av, .color_by = "none", - .facet_by = "geo_value") +autoplot(jhu_csse_daily_subset, case_rate_7d_av, + .color_by = "none", + .facet_by = "geo_value" +) } diff --git a/man/detect_outlr.Rd b/man/detect_outlr.Rd index 4aa0b79c..3a793ebf 100644 --- a/man/detect_outlr.Rd +++ b/man/detect_outlr.Rd @@ -64,29 +64,43 @@ For convenience, the outlier detection method can be specified (in the STL decomposition. } \examples{ - detection_methods = dplyr::bind_rows( - dplyr::tibble(method = "rm", - args = list(list(detect_negatives = TRUE, - detection_multiplier = 2.5)), - abbr = "rm"), - dplyr::tibble(method = "stl", - args = list(list(detect_negatives = TRUE, - detection_multiplier = 2.5, - seasonal_period = 7)), - abbr = "stl_seasonal"), - dplyr::tibble(method = "stl", - args = list(list(detect_negatives = TRUE, - detection_multiplier = 2.5, - seasonal_period = NULL)), - abbr = "stl_nonseasonal")) +detection_methods <- dplyr::bind_rows( + dplyr::tibble( + method = "rm", + args = list(list( + detect_negatives = TRUE, + detection_multiplier = 2.5 + )), + abbr = "rm" + ), + dplyr::tibble( + method = "stl", + args = list(list( + detect_negatives = TRUE, + detection_multiplier = 2.5, + seasonal_period = 7 + )), + abbr = "stl_seasonal" + ), + dplyr::tibble( + method = "stl", + args = list(list( + detect_negatives = TRUE, + detection_multiplier = 2.5, + seasonal_period = NULL + )), + abbr = "stl_nonseasonal" + ) +) - x <- incidence_num_outlier_example \%>\% - dplyr::select(geo_value,time_value,cases) \%>\% - as_epi_df() \%>\% - group_by(geo_value) \%>\% - mutate(outlier_info = detect_outlr( - x = time_value, y = cases, - methods = detection_methods, - combiner = "median")) \%>\% - unnest(outlier_info) +x <- incidence_num_outlier_example \%>\% + dplyr::select(geo_value, time_value, cases) \%>\% + as_epi_df() \%>\% + group_by(geo_value) \%>\% + mutate(outlier_info = detect_outlr( + x = time_value, y = cases, + methods = detection_methods, + combiner = "median" + )) \%>\% + unnest(outlier_info) } diff --git a/man/detect_outlr_rm.Rd b/man/detect_outlr_rm.Rd index 3efae55d..0d011619 100644 --- a/man/detect_outlr_rm.Rd +++ b/man/detect_outlr_rm.Rd @@ -59,10 +59,11 @@ terms of multiples of the rolling interquartile range (IQR). 
\examples{ # Detect outliers based on a rolling median incidence_num_outlier_example \%>\% - dplyr::select(geo_value,time_value,cases) \%>\% + dplyr::select(geo_value, time_value, cases) \%>\% as_epi_df() \%>\% group_by(geo_value) \%>\% - mutate(outlier_info = detect_outlr_rm( - x = time_value, y = cases)) \%>\% + mutate(outlier_info = detect_outlr_rm( + x = time_value, y = cases + )) \%>\% unnest(outlier_info) } diff --git a/man/detect_outlr_stl.Rd b/man/detect_outlr_stl.Rd index 7e724a4e..34a550d5 100644 --- a/man/detect_outlr_stl.Rd +++ b/man/detect_outlr_stl.Rd @@ -78,11 +78,12 @@ are exactly as in \code{detect_outlr_rm()}. \examples{ # Detects outliers based on a seasonal-trend decomposition using LOESS incidence_num_outlier_example \%>\% - dplyr::select(geo_value,time_value,cases) \%>\% + dplyr::select(geo_value, time_value, cases) \%>\% as_epi_df() \%>\% group_by(geo_value) \%>\% - mutate(outlier_info = detect_outlr_stl( + mutate(outlier_info = detect_outlr_stl( x = time_value, y = cases, - seasonal_period = 7 )) \%>\% # weekly seasonality for daily data + seasonal_period = 7 + )) \%>\% # weekly seasonality for daily data unnest(outlier_info) } diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd index f328eb44..6a25b2af 100644 --- a/man/epi_archive.Rd +++ b/man/epi_archive.Rd @@ -100,16 +100,20 @@ are documented in the wrapper function \code{\link[=epix_slide]{epix_slide()}}. \examples{ tib <- tibble::tibble( geo_value = rep(c("ca", "hi"), each = 5), - time_value = rep(seq(as.Date("2020-01-01"), - by = 1, length.out = 5), times = 2), - version = rep(seq(as.Date("2020-01-02"), - by = 1, length.out = 5), times = 2), + time_value = rep(seq(as.Date("2020-01-01"), + by = 1, length.out = 5 + ), times = 2), + version = rep(seq(as.Date("2020-01-02"), + by = 1, length.out = 5 + ), times = 2), value = rnorm(10, mean = 2, sd = 1) ) -toy_epi_archive <- tib \%>\% epi_archive$new(geo_type = "state", - time_type = "day") -toy_epi_archive +toy_epi_archive <- tib \%>\% epi_archive$new( + geo_type = "state", + time_type = "day" +) +toy_epi_archive } \section{Public fields}{ \if{html}{\out{
}} diff --git a/man/epi_cor.Rd b/man/epi_cor.Rd index 6b2279db..fb56073f 100644 --- a/man/epi_cor.Rd +++ b/man/epi_cor.Rd @@ -58,30 +58,38 @@ grouping by geo value, time value, or any other variables. See the for examples. } \examples{ - + # linear association of case and death rates on any given day -epi_cor(x = jhu_csse_daily_subset, - var1 = case_rate_7d_av, - var2 = death_rate_7d_av, - cor_by = "time_value") +epi_cor( + x = jhu_csse_daily_subset, + var1 = case_rate_7d_av, + var2 = death_rate_7d_av, + cor_by = "time_value" +) # correlation of death rates and lagged case rates -epi_cor(x = jhu_csse_daily_subset, - var1 = case_rate_7d_av, - var2 = death_rate_7d_av, - cor_by = time_value, - dt1 = -2) +epi_cor( + x = jhu_csse_daily_subset, + var1 = case_rate_7d_av, + var2 = death_rate_7d_av, + cor_by = time_value, + dt1 = -2 +) -# correlation grouped by location -epi_cor(x = jhu_csse_daily_subset, - var1 = case_rate_7d_av, - var2 = death_rate_7d_av, - cor_by = geo_value) +# correlation grouped by location +epi_cor( + x = jhu_csse_daily_subset, + var1 = case_rate_7d_av, + var2 = death_rate_7d_av, + cor_by = geo_value +) # correlation grouped by location and incorporates lagged cases rates -epi_cor(x = jhu_csse_daily_subset, - var1 = case_rate_7d_av, - var2 = death_rate_7d_av, - cor_by = geo_value, - dt1 = -2) +epi_cor( + x = jhu_csse_daily_subset, + var1 = case_rate_7d_av, + var2 = death_rate_7d_av, + cor_by = geo_value, + dt1 = -2 +) } diff --git a/man/epi_slide.Rd b/man/epi_slide.Rd index 33c3a7fb..668be9ff 100644 --- a/man/epi_slide.Rd +++ b/man/epi_slide.Rd @@ -156,9 +156,9 @@ through the \code{new_col_name} argument. # slide a 7-day trailing average formula on cases jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide(cases_7dav = mean(cases), before = 6) \%>\% + epi_slide(cases_7dav = mean(cases), before = 6) \%>\% # rmv a nonessential var. to ensure new col is printed - dplyr::select(-death_rate_7d_av) + dplyr::select(-death_rate_7d_av) # slide a 7-day leading average jhu_csse_daily_subset \%>\% @@ -170,21 +170,25 @@ jhu_csse_daily_subset \%>\% # slide a 7-day centre-aligned average jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide(cases_7dav = mean(cases), before = 3, after = 3) \%>\% + epi_slide(cases_7dav = mean(cases), before = 3, after = 3) \%>\% # rmv a nonessential var. to ensure new col is printed - dplyr::select(-death_rate_7d_av) + dplyr::select(-death_rate_7d_av) # slide a 14-day centre-aligned average jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide(cases_7dav = mean(cases), before = 6, after = 7) \%>\% + epi_slide(cases_7dav = mean(cases), before = 6, after = 7) \%>\% # rmv a nonessential var. to ensure new col is printed - dplyr::select(-death_rate_7d_av) + dplyr::select(-death_rate_7d_av) # nested new columns jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide(a = data.frame(cases_2dav = mean(cases), - cases_2dma = mad(cases)), - before = 1, as_list_col = TRUE) + epi_slide( + a = data.frame( + cases_2dav = mean(cases), + cases_2dma = mad(cases) + ), + before = 1, as_list_col = TRUE + ) } diff --git a/man/epix_as_of.Rd b/man/epix_as_of.Rd index 51884597..9a0a53ce 100644 --- a/man/epix_as_of.Rd +++ b/man/epix_as_of.Rd @@ -55,14 +55,18 @@ in the future. 
} \examples{ # warning message of data latency shown -epix_as_of(x = archive_cases_dv_subset, - max_version = max(archive_cases_dv_subset$DT$version)) +epix_as_of( + x = archive_cases_dv_subset, + max_version = max(archive_cases_dv_subset$DT$version) +) range(archive_cases_dv_subset$DT$version) # 2020-06-02 -- 2021-12-01 -epix_as_of(x = archive_cases_dv_subset, - max_version = as.Date("2020-06-12")) +epix_as_of( + x = archive_cases_dv_subset, + max_version = as.Date("2020-06-12") +) # When fetching a snapshot as of the latest version with update data in the # archive, a warning is issued by default, as this update data might not yet @@ -72,10 +76,15 @@ epix_as_of(x = archive_cases_dv_subset, # based on database queries, the latest available update might still be # subject to change, but previous versions should be finalized). We can # muffle such warnings with the following pattern: -withCallingHandlers({ - epix_as_of(x = archive_cases_dv_subset, - max_version = max(archive_cases_dv_subset$DT$version)) -}, epiprocess__snapshot_as_of_clobberable_version = function(wrn) invokeRestart("muffleWarning")) +withCallingHandlers( + { + epix_as_of( + x = archive_cases_dv_subset, + max_version = max(archive_cases_dv_subset$DT$version) + ) + }, + epiprocess__snapshot_as_of_clobberable_version = function(wrn) invokeRestart("muffleWarning") +) # Since R 4.0, there is a `globalCallingHandlers` function that can be used # to globally toggle these warnings. diff --git a/man/epix_merge.Rd b/man/epix_merge.Rd index 09f67fa2..53dea071 100644 --- a/man/epix_merge.Rd +++ b/man/epix_merge.Rd @@ -60,13 +60,13 @@ be clobbered in either input archive. \examples{ # create two example epi_archive datasets x <- archive_cases_dv_subset$DT \%>\% - dplyr::select(geo_value,time_value,version,case_rate_7d_av) \%>\% - as_epi_archive(compactify=TRUE) + dplyr::select(geo_value, time_value, version, case_rate_7d_av) \%>\% + as_epi_archive(compactify = TRUE) y <- archive_cases_dv_subset$DT \%>\% - dplyr::select(geo_value,time_value,version,percent_cli) \%>\% - as_epi_archive(compactify=TRUE) + dplyr::select(geo_value, time_value, version, percent_cli) \%>\% + as_epi_archive(compactify = TRUE) # merge results stored in a third object: -xy = epix_merge(x, y) +xy <- epix_merge(x, y) # vs. mutating x to hold the merge result: x$merge(y) diff --git a/man/epix_slide.Rd b/man/epix_slide.Rd index d94460af..3ac55a18 100644 --- a/man/epix_slide.Rd +++ b/man/epix_slide.Rd @@ -193,17 +193,20 @@ library(dplyr) # Reference time points for which we want to compute slide values: ref_time_values <- seq(as.Date("2020-06-01"), - as.Date("2020-06-15"), - by = "1 day") + as.Date("2020-06-15"), + by = "1 day" +) # A simple (but not very useful) example (see the archive vignette for a more # realistic one): archive_cases_dv_subset \%>\% group_by(geo_value) \%>\% - epix_slide(f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = ref_time_values, - new_col_name = 'case_rate_7d_av_recent_av') \%>\% + epix_slide( + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = ref_time_values, + new_col_name = "case_rate_7d_av_recent_av" + ) \%>\% ungroup() # We requested time windows that started 2 days before the corresponding time # values. The actual number of `time_value`s in each computation depends on @@ -221,23 +224,24 @@ archive_cases_dv_subset \%>\% # Examining characteristics of the data passed to each computation with # `all_versions=FALSE`. 
archive_cases_dv_subset \%>\% - group_by(geo_value) \%>\% - epix_slide( - function(x, gk, rtv) { - tibble( - time_range = if(nrow(x) == 0L) { - "0 `time_value`s" - } else { - sprintf("\%s -- \%s", min(x$time_value), max(x$time_value)) - }, - n = nrow(x), - class1 = class(x)[[1L]] - ) - }, - before = 5, all_versions = FALSE, - ref_time_values = ref_time_values, names_sep=NULL) \%>\% - ungroup() \%>\% - arrange(geo_value, time_value) + group_by(geo_value) \%>\% + epix_slide( + function(x, gk, rtv) { + tibble( + time_range = if (nrow(x) == 0L) { + "0 `time_value`s" + } else { + sprintf("\%s -- \%s", min(x$time_value), max(x$time_value)) + }, + n = nrow(x), + class1 = class(x)[[1L]] + ) + }, + before = 5, all_versions = FALSE, + ref_time_values = ref_time_values, names_sep = NULL + ) \%>\% + ungroup() \%>\% + arrange(geo_value, time_value) # --- Advanced: --- @@ -259,7 +263,7 @@ archive_cases_dv_subset \%>\% toString(min(x$DT$version)) }, versions_end = x$versions_end, - time_range = if(nrow(x$DT) == 0L) { + time_range = if (nrow(x$DT) == 0L) { "0 `time_value`s" } else { sprintf("\%s -- \%s", min(x$DT$time_value), max(x$DT$time_value)) @@ -269,7 +273,8 @@ archive_cases_dv_subset \%>\% ) }, before = 5, all_versions = TRUE, - ref_time_values = ref_time_values, names_sep=NULL) \%>\% + ref_time_values = ref_time_values, names_sep = NULL + ) \%>\% ungroup() \%>\% # Focus on one geo_value so we can better see the columns above: filter(geo_value == "ca") \%>\% diff --git a/man/group_by.epi_archive.Rd b/man/group_by.epi_archive.Rd index aee0a07b..5e867bf3 100644 --- a/man/group_by.epi_archive.Rd +++ b/man/group_by.epi_archive.Rd @@ -93,7 +93,7 @@ to \code{group_by_drop_default.default} (but there is a dedicated method for } \examples{ -grouped_archive = archive_cases_dv_subset \%>\% group_by(geo_value) +grouped_archive <- archive_cases_dv_subset \%>\% group_by(geo_value) # `print` for metadata and method listing: grouped_archive \%>\% print() @@ -102,10 +102,12 @@ grouped_archive \%>\% print() archive_cases_dv_subset \%>\% group_by(geo_value) \%>\% - epix_slide(f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = as.Date("2020-06-11") + 0:2, - new_col_name = 'case_rate_3d_av') \%>\% + epix_slide( + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = as.Date("2020-06-11") + 0:2, + new_col_name = "case_rate_3d_av" + ) \%>\% ungroup() # ----------------------------------------------------------------- @@ -113,34 +115,42 @@ archive_cases_dv_subset \%>\% # Advanced: some other features of dplyr grouping are implemented: library(dplyr) -toy_archive = +toy_archive <- tribble( - ~geo_value, ~age_group, ~time_value, ~version, ~value, - "us", "adult", "2000-01-01", "2000-01-02", 121, - "us", "pediatric", "2000-01-02", "2000-01-03", 5, # (addition) - "us", "adult", "2000-01-01", "2000-01-03", 125, # (revision) - "us", "adult", "2000-01-02", "2000-01-03", 130 # (addition) + ~geo_value, ~age_group, ~time_value, ~version, ~value, + "us", "adult", "2000-01-01", "2000-01-02", 121, + "us", "pediatric", "2000-01-02", "2000-01-03", 5, # (addition) + "us", "adult", "2000-01-01", "2000-01-03", 125, # (revision) + "us", "adult", "2000-01-02", "2000-01-03", 130 # (addition) + ) \%>\% + mutate( + age_group = ordered(age_group, c("pediatric", "adult")), + time_value = as.Date(time_value), + version = as.Date(version) ) \%>\% - mutate(age_group = ordered(age_group, c("pediatric", "adult")), - time_value = as.Date(time_value), - version = as.Date(version)) \%>\% as_epi_archive(other_keys = 
"age_group") # The following are equivalent: toy_archive \%>\% group_by(geo_value, age_group) -toy_archive \%>\% group_by(geo_value) \%>\% group_by(age_group, .add=TRUE) -grouping_cols = c("geo_value", "age_group") +toy_archive \%>\% + group_by(geo_value) \%>\% + group_by(age_group, .add = TRUE) +grouping_cols <- c("geo_value", "age_group") toy_archive \%>\% group_by(across(all_of(grouping_cols))) # And these are equivalent: toy_archive \%>\% group_by(geo_value) -toy_archive \%>\% group_by(geo_value, age_group) \%>\% ungroup(age_group) +toy_archive \%>\% + group_by(geo_value, age_group) \%>\% + ungroup(age_group) # To get the grouping variable names as a `list` of `name`s (a.k.a. symbols): -toy_archive \%>\% group_by(geo_value) \%>\% groups() +toy_archive \%>\% + group_by(geo_value) \%>\% + groups() toy_archive \%>\% - group_by(geo_value, age_group, .drop=FALSE) \%>\% + group_by(geo_value, age_group, .drop = FALSE) \%>\% epix_slide(f = ~ sum(.x$value), before = 20) \%>\% ungroup() diff --git a/man/growth_rate.Rd b/man/growth_rate.Rd index 203d5d7d..7a3f1151 100644 --- a/man/growth_rate.Rd +++ b/man/growth_rate.Rd @@ -136,12 +136,12 @@ user. \examples{ # COVID cases growth rate by state using default method relative change -jhu_csse_daily_subset \%>\% - group_by(geo_value) \%>\% - mutate(cases_gr = growth_rate(x = time_value, y = cases)) +jhu_csse_daily_subset \%>\% + group_by(geo_value) \%>\% + mutate(cases_gr = growth_rate(x = time_value, y = cases)) # Log scale, degree 4 polynomial and 6-fold cross validation -jhu_csse_daily_subset \%>\% - group_by(geo_value) \%>\% - mutate(gr_poly = growth_rate( x = time_value, y = cases, log_scale = TRUE, ord = 4, k = 6)) +jhu_csse_daily_subset \%>\% + group_by(geo_value) \%>\% + mutate(gr_poly = growth_rate(x = time_value, y = cases, log_scale = TRUE, ord = 4, k = 6)) } diff --git a/man/is_epi_archive.Rd b/man/is_epi_archive.Rd index 5b133004..2beb3a8c 100644 --- a/man/is_epi_archive.Rd +++ b/man/is_epi_archive.Rd @@ -25,9 +25,9 @@ is_epi_archive(archive_cases_dv_subset) # TRUE # By default, grouped_epi_archives don't count as epi_archives, as they may # support a different set of operations from regular `epi_archives`. This # behavior can be controlled by `grouped_okay`. -grouped_archive = archive_cases_dv_subset$group_by(geo_value) +grouped_archive <- archive_cases_dv_subset$group_by(geo_value) is_epi_archive(grouped_archive) # FALSE -is_epi_archive(grouped_archive, grouped_okay=TRUE) # TRUE +is_epi_archive(grouped_archive, grouped_okay = TRUE) # TRUE } \seealso{ diff --git a/man/epi_keys.Rd b/man/key_colnames.Rd similarity index 76% rename from man/epi_keys.Rd rename to man/key_colnames.Rd index 8026fc14..fbaa3c11 100644 --- a/man/epi_keys.Rd +++ b/man/key_colnames.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/epi_keys.R -\name{epi_keys} -\alias{epi_keys} +% Please edit documentation in R/key_colnames.R +\name{key_colnames} +\alias{key_colnames} \title{Grab any keys associated to an epi_df} \usage{ -epi_keys(x, ...) +key_colnames(x, ...) } \arguments{ \item{x}{a data.frame, tibble, or epi_df} From 3c68f1db0676b48197c38b1806443402074a97f0 Mon Sep 17 00:00:00 2001 From: "Daniel J. 
McDonald" Date: Fri, 19 Jan 2024 15:34:08 -0800 Subject: [PATCH 080/345] pkgdown fix --- _pkgdown.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_pkgdown.yml b/_pkgdown.yml index 1d9b1955..1b11ee6f 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -91,4 +91,4 @@ reference: - max_version_with_row_in - next_after - guess_period - - epi_keys + - key_colnames From 8f7c1c3e6502ca19b78bee26e07775935a2e8a28 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 19 Jan 2024 15:37:52 -0800 Subject: [PATCH 081/345] Nat's suggestion --- .github/workflows/auto_semver_bump.yaml | 23 +++++++++++++++++++ .github/workflows/version_bump_options.json | 25 +++++++++++++++++++++ 2 files changed, 48 insertions(+) create mode 100644 .github/workflows/auto_semver_bump.yaml create mode 100644 .github/workflows/version_bump_options.json diff --git a/.github/workflows/auto_semver_bump.yaml b/.github/workflows/auto_semver_bump.yaml new file mode 100644 index 00000000..796ea20d --- /dev/null +++ b/.github/workflows/auto_semver_bump.yaml @@ -0,0 +1,23 @@ +# this action: https://github.com/marketplace/actions/version-bumper +# also see the options file for configuration +name: Manage versions + +# not actually on all pushes, see the options file +on: [push] + +jobs: + bump: + #if: github.event.pull_request.merged == true + runs-on: ubuntu-latest + + steps: + # Checkout action is required + - uses: actions/checkout@v2 + - uses: actions/setup-node@v1 + with: + node-version: "12" + - name: Bump Versions + uses: michmich112/version-bumper@master + with: + options-file: "./.github/workflows/version_bump_options.json" + github-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/version_bump_options.json b/.github/workflows/version_bump_options.json new file mode 100644 index 00000000..bbdeb936 --- /dev/null +++ b/.github/workflows/version_bump_options.json @@ -0,0 +1,25 @@ +{ + "scheme": "semantic", + "versionFile": "./DESCRIPTION", + "files": [], + "rules": [ + { + "trigger": "commit", + "bump": "build" + }, + { + "trigger": "commit", + "bump": "minor", + "branch": "main", + "tag": true, + "reset": "build" + }, + { + "trigger": "commit", + "bump": "major", + "branch": "release", + "tag": true, + "reset": ["minor", "build"] + } + ] +} From cce5d8cc1408fa05e934d9c60b88c409f9b29fc1 Mon Sep 17 00:00:00 2001 From: version-bumper Date: Fri, 19 Jan 2024 23:38:23 +0000 Subject: [PATCH 082/345] "Updated version 0.7.2 -> 0.7.3." --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 15b7757f..94ac3214 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: epiprocess Title: Tools for basic signal processing in epidemiology -Version: 0.7.2 +Version: 0.7.3 Authors@R: c( person("Jacob", "Bien", role = "ctb"), person("Logan", "Brooks", role = "aut"), From 9413804837bfc23ef008615e74c2acce8311c0ab Mon Sep 17 00:00:00 2001 From: version-bumper Date: Fri, 19 Jan 2024 23:42:51 +0000 Subject: [PATCH 083/345] "Updated version 0.7.3 -> 0.7.4." 
--- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 94ac3214..4f5a61ab 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: epiprocess Title: Tools for basic signal processing in epidemiology -Version: 0.7.3 +Version: 0.7.4 Authors@R: c( person("Jacob", "Bien", role = "ctb"), person("Logan", "Brooks", role = "aut"), From 44b4440635e1b80cba4bca20b881cddcf94ca881 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 19 Jan 2024 15:47:51 -0800 Subject: [PATCH 084/345] only bump build version on commits in dev --- .github/workflows/version_bump_options.json | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/version_bump_options.json b/.github/workflows/version_bump_options.json index bbdeb936..a2f6aed9 100644 --- a/.github/workflows/version_bump_options.json +++ b/.github/workflows/version_bump_options.json @@ -5,6 +5,7 @@ "rules": [ { "trigger": "commit", + "branch": "dev", "bump": "build" }, { From eeefc94130b00e90f9c44c48e707799690d5d854 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 19 Jan 2024 15:48:45 -0800 Subject: [PATCH 085/345] roll back aggressive versioning --- .github/workflows/require_semver_bump.yaml | 18 ------------------ DESCRIPTION | 2 +- 2 files changed, 1 insertion(+), 19 deletions(-) delete mode 100644 .github/workflows/require_semver_bump.yaml diff --git a/.github/workflows/require_semver_bump.yaml b/.github/workflows/require_semver_bump.yaml deleted file mode 100644 index 548ceb1e..00000000 --- a/.github/workflows/require_semver_bump.yaml +++ /dev/null @@ -1,18 +0,0 @@ -name: semver-check - -on: - pull_request: - branches: [main, dev] - -jobs: - verify: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - uses: rayepps/require-semver-bump@v1 - env: - GITHUB_TOKEN: ${{ github.token }} - with: - file: DESCRIPTION - pattern: > - Version:\s(\d+\.\d+\.\d+\d*) diff --git a/DESCRIPTION b/DESCRIPTION index 4f5a61ab..15b7757f 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: epiprocess Title: Tools for basic signal processing in epidemiology -Version: 0.7.4 +Version: 0.7.2 Authors@R: c( person("Jacob", "Bien", role = "ctb"), person("Logan", "Brooks", role = "aut"), From 949bf172b2fb068a4caa6e4a7faeb8598e9beaf2 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 17 Jan 2024 18:48:54 -0500 Subject: [PATCH 086/345] initial version of mean-specific epi_slide update examples --- NAMESPACE | 2 + R/slide.R | 160 ++++++++++++++++++++++++++++++++++++++++++ man/epi_slide_mean.Rd | 147 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 309 insertions(+) create mode 100644 man/epi_slide_mean.Rd diff --git a/NAMESPACE b/NAMESPACE index c59004c8..0b7fe384 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -40,6 +40,7 @@ export(detect_outlr_stl) export(epi_archive) export(epi_cor) export(epi_slide) +export(epi_slide_mean) export(epix_as_of) export(epix_merge) export(epix_slide) @@ -66,6 +67,7 @@ importFrom(data.table,address) importFrom(data.table,as.data.table) importFrom(data.table,between) importFrom(data.table,copy) +importFrom(data.table,frollmean) importFrom(data.table,key) importFrom(data.table,rbindlist) importFrom(data.table,set) diff --git a/R/slide.R b/R/slide.R index e2c0bf55..c77f6c69 100644 --- a/R/slide.R +++ b/R/slide.R @@ -373,3 +373,163 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, return(x) } + +#' Optimized slide function for performing rolling averages on 
an `epi_df` object
+#'
+#' Slides an n-timestep mean over variables in an `epi_df` object. See the [slide
+#' vignette](https://cmu-delphi.github.io/epiprocess/articles/slide.html) for
+#' examples.
+#'
+#' @param x The `epi_df` object under consideration, [grouped][dplyr::group_by]
+#' or ungrouped. If ungrouped, all data in `x` will be treated as part of a
+#' single data group.
+#' @param ... Additional arguments to pass to `data.table::frollmean`. `na.rm`
+#' and `algo` are important to consider.
+#' @param before,after How far `before` and `after` each `ref_time_value` should
+#' the sliding window extend? At least one of these two arguments must be
+#' provided; the other's default will be 0. Any value provided for either
+#' argument must be a single, non-`NA`, non-negative,
+#' [integer-compatible][vctrs::vec_cast] number of time steps. Endpoints of
+#' the window are inclusive. Common settings: * For trailing/right-aligned
+#' windows from `ref_time_value - time_step(k)` to `ref_time_value`: either
+#' pass `before=k` by itself, or pass `before=k, after=0`. * For
+#' center-aligned windows from `ref_time_value - time_step(k)` to
+#' `ref_time_value + time_step(k)`: pass `before=k, after=k`. * For
+#' leading/left-aligned windows from `ref_time_value` to `ref_time_value +
+#' time_step(k)`: either pass `after=k` by itself, or pass `before=0,
+#' after=k`. See "Details:" about the definition of a time step,
+#' (non)treatment of missing rows within the window, and avoiding warnings
+#' about `before`&`after` settings for a certain uncommon use case.
+#' @param ref_time_values Time values for sliding computations, meaning, each
+#' element of this vector serves as the reference time point for one sliding
+#' window. If missing, then this will be set to all unique time values in the
+#' underlying data table, by default.
+#' @param time_step Optional function used to define the meaning of one time
+#' step, which if specified, overrides the default choice based on the
+#' `time_value` column. This function must take a non-negative integer and
+#' return an object of class `lubridate::period`. For example, we can use
+#' `time_step = lubridate::hours` in order to set the time step to be one hour
+#' (this would only be meaningful if `time_value` is of class `POSIXct`).
+#' @param new_col_name String indicating the name of the new column that will
+#' contain the slide values. Default is "slide_value"; note that setting
+#' `new_col_name` equal to an existing column name will overwrite this column.
+#' @param as_list_col Should the slide results be held in a list column, or be
+#' [unchopped][tidyr::unchop]/[unnested][tidyr::unnest]? Default is `FALSE`,
+#' in which case a list object returned by `f` would be unnested (using
+#' [`tidyr::unnest()`]), and, if the slide computations output data frames,
+#' the names of the resulting columns are given by prepending `new_col_name`
+#' to the names of the list elements.
+#' @param names_sep String specifying the separator to use in `tidyr::unnest()`
+#' when `as_list_col = FALSE`. Default is "_". Using `NULL` drops the prefix
+#' from `new_col_name` entirely.
+#' @param all_rows If `all_rows = TRUE`, then all rows of `x` will be kept in
+#' the output even with `ref_time_values` provided, with some type of missing
+#' value marker for the slide computation output column(s) for `time_value`s
+#' outside `ref_time_values`; otherwise, there will be one row for each row in
+#' `x` that had a `time_value` in `ref_time_values`.
Default is `FALSE`. The
+#' missing value marker is the result of `vctrs::vec_cast`ing `NA` to the type
+#' of the slide computation output. If using `as_list_col = TRUE`, note that
+#' the missing marker is a `NULL` entry in the list column; for certain
+#' operations, you might want to replace these `NULL` entries with a different
+#' `NA` marker.
+#' @return An `epi_df` object given by appending a new column to `x`, named
+#' according to the `new_col_name` argument.
+#'
+#' @details To "slide" means to apply a function or formula over a rolling
+#' window of time steps for each data group, where the window is centered at a
+#' reference time and left and right endpoints are given by the `before` and
+#' `after` arguments. The unit (the meaning of one time step) is implicitly
+#' defined by the way the `time_value` column treats addition and subtraction;
+#' for example, if the time values are coded as `Date` objects, then one time
+#' step is one day, since `as.Date("2022-01-01") + 1` equals
+#' `as.Date("2022-01-02")`. Alternatively, the time step can be set explicitly
+#' using the `time_step` argument (which if specified would override the
+#' default choice based on `time_value` column). If there are not enough time
+#' steps available to complete the window at any given reference time, then
+#' `epi_slide_mean()` still attempts to perform the computation anyway (it does not
+#' require a complete window). The issue of what to do with partial
+#' computations (those run on incomplete windows) is therefore left up to the
+#' user, either through the arguments passed along to `data.table::frollmean`,
+#' or through post-processing. For a centrally-aligned slide of `n`
+#' `time_value`s in a sliding window, set `before = (n-1)/2` and
+#' `after = (n-1)/2` when the number of `time_value`s in a sliding window is
+#' odd and `before = n/2-1` and `after = n/2` when `n` is even.
+#'
+#' Sometimes, we want to experiment with various trailing or leading window
+#' widths and compare the slide outputs. In the (uncommon) case where
+#' zero-width windows are considered, manually pass both the `before` and
+#' `after` arguments in order to prevent potential warnings. (E.g., `before=k`
+#' with `k=0` and `after` missing may produce a warning. To avoid warnings,
+#' use `before=k, after=0` instead; otherwise, it looks too much like a
+#' leading window was intended, but the `after` argument was forgotten or
+#' misspelled.)
+#'
+#' @importFrom lubridate days weeks
+#' @importFrom dplyr bind_rows group_vars filter select
+#' @importFrom rlang .data .env !! enquo enquos sym env missing_arg
+#' @importFrom data.table frollmean
+#' @export
+#' @examples
+#' # slide a 7-day trailing average formula on cases
+#' jhu_csse_daily_subset %>%
+#'   group_by(geo_value) %>%
+#'   epi_slide_mean("cases", new_col_name = "cases_7dav", names_sep = NULL, before = 6) %>%
+#'   # rmv a nonessential var. to ensure new col is printed
+#'   dplyr::select(-death_rate_7d_av)
+#'
+#' # slide a 7-day leading average
+#' jhu_csse_daily_subset %>%
+#'   group_by(geo_value) %>%
+#'   epi_slide_mean("cases", new_col_name = "cases_7dav", names_sep = NULL, after = 6) %>%
+#'   # rmv a nonessential var. to ensure new col is printed
+#'   dplyr::select(-death_rate_7d_av)
+#'
+#' # slide a 7-day centre-aligned average
+#' jhu_csse_daily_subset %>%
+#'   group_by(geo_value) %>%
+#'   epi_slide_mean("cases", new_col_name = "cases_7dav", names_sep = NULL, before = 3, after = 3) %>%
+#'   # rmv a nonessential var. to ensure new col is printed
+#'   dplyr::select(-death_rate_7d_av)
+#'
+#' # slide a 14-day centre-aligned average
+#' jhu_csse_daily_subset %>%
+#'   group_by(geo_value) %>%
+#'   epi_slide_mean("cases", new_col_name = "cases_14dav", names_sep = NULL, before = 6, after = 7) %>%
+#'   # rmv a nonessential var. to ensure new col is printed
+#'   dplyr::select(-death_rate_7d_av)
+epi_slide_mean = function(x, col_name, ..., before, after, ref_time_values,
+                          time_step,
+                          new_col_name = "slide_value", as_list_col = FALSE,
+                          names_sep = "_", all_rows = FALSE) {
+  all_dates <- seq(min(x$time_value), max(x$time_value), by = time_step)
+
+  ## TODO: need to deal with `after`
+  m <- before + 1L
+
+  if (is.null(names_sep)) {
+    result_col_name <- new_col_name
+  } else {
+    result_col_name <- paste(new_col_name, col_name, sep = names_sep)
+  }
+
+  result <- mutate(x, .real = TRUE) %>%
+    group_by(geo_value) %>%
+    group_modify(~{
+      pad_early_dates <- all_dates[1L] - (m - 1):1
+
+      # `setdiff` causes date formatting to change. Re-class these as dates.
+      missing_dates <- as.Date(setdiff(all_dates, .x$time_value), origin = "1970-01-01")
+      .x <- bind_rows(
+        .x,
+        tibble(time_value = c(pad_early_dates, missing_dates), .real = FALSE)
+      ) %>%
+        arrange(time_value)
+      .x[, c(result_col_name)] <- data.table::frollmean(.x[, c(col_name)], n = m, ...)
+      .x
+    }
+  )
+  result <- result[result$.real, ]
+  result$.real <- NULL
+  ungroup(result)
+}
+
diff --git a/man/epi_slide_mean.Rd b/man/epi_slide_mean.Rd
new file mode 100644
index 00000000..f96dfd94
--- /dev/null
+++ b/man/epi_slide_mean.Rd
@@ -0,0 +1,147 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/slide.R
+\name{epi_slide_mean}
+\alias{epi_slide_mean}
+\title{Optimized slide function for performing rolling averages on an \code{epi_df} object}
+\usage{
+epi_slide_mean(
+  x,
+  col_name,
+  ...,
+  before,
+  after,
+  ref_time_values,
+  time_step,
+  new_col_name = "slide_value",
+  as_list_col = FALSE,
+  names_sep = "_",
+  all_rows = FALSE
+)
+}
+\arguments{
+\item{x}{The \code{epi_df} object under consideration, \link[dplyr:group_by]{grouped}
+or ungrouped. If ungrouped, all data in \code{x} will be treated as part of a
+single data group.}
+
+\item{...}{Additional arguments to pass to \code{data.table::frollmean}. \code{na.rm}
+and \code{algo} are important to consider.}
+
+\item{before, after}{How far \code{before} and \code{after} each \code{ref_time_value} should
+the sliding window extend? At least one of these two arguments must be
+provided; the other's default will be 0. Any value provided for either
+argument must be a single, non-\code{NA}, non-negative,
+\link[vctrs:vec_cast]{integer-compatible} number of time steps. Endpoints of
+the window are inclusive. Common settings: * For trailing/right-aligned
+windows from \code{ref_time_value - time_step(k)} to \code{ref_time_value}: either
+pass \code{before=k} by itself, or pass \verb{before=k, after=0}. * For
+center-aligned windows from \code{ref_time_value - time_step(k)} to
+\code{ref_time_value + time_step(k)}: pass \verb{before=k, after=k}. * For
+leading/left-aligned windows from \code{ref_time_value} to \code{ref_time_value + time_step(k)}: either pass \code{after=k} by itself, or pass \verb{before=0, after=k}.
See "Details:" about the definition of a time step, +(non)treatment of missing rows within the window, and avoiding warnings +about \code{before}&\code{after} settings for a certain uncommon use case.} + +\item{ref_time_values}{Time values for sliding computations, meaning, each +element of this vector serves as the reference time point for one sliding +window. If missing, then this will be set to all unique time values in the +underlying data table, by default.} + +\item{time_step}{Optional function used to define the meaning of one time +step, which if specified, overrides the default choice based on the +\code{time_value} column. This function must take a non-negative integer and +return an object of class \code{lubridate::period}. For example, we can use +\code{time_step = lubridate::hours} in order to set the time step to be one hour +(this would only be meaningful if \code{time_value} is of class \code{POSIXct}).} + +\item{new_col_name}{String indicating the name of the new column that will +contain the derivative values. Default is "slide_value"; note that setting +\code{new_col_name} equal to an existing column name will overwrite this column.} + +\item{as_list_col}{Should the slide results be held in a list column, or be +\link[tidyr:chop]{unchopped}/\link[tidyr:unnest]{unnested}? Default is \code{FALSE}, +in which case a list object returned by \code{f} would be unnested (using +\code{\link[tidyr:unnest]{tidyr::unnest()}}), and, if the slide computations output data frames, +the names of the resulting columns are given by prepending \code{new_col_name} +to the names of the list elements.} + +\item{names_sep}{String specifying the separator to use in \code{tidyr::unnest()} +when \code{as_list_col = FALSE}. Default is "_". Using \code{NULL} drops the prefix +from \code{new_col_name} entirely.} + +\item{all_rows}{If \code{all_rows = TRUE}, then all rows of \code{x} will be kept in +the output even with \code{ref_time_values} provided, with some type of missing +value marker for the slide computation output column(s) for \code{time_value}s +outside \code{ref_time_values}; otherwise, there will be one row for each row in +\code{x} that had a \code{time_value} in \code{ref_time_values}. Default is \code{FALSE}. The +missing value marker is the result of \code{vctrs::vec_cast}ing \code{NA} to the type +of the slide computation output. If using \code{as_list_col = TRUE}, note that +the missing marker is a \code{NULL} entry in the list column; for certain +operations, you might want to replace these \code{NULL} entries with a different +\code{NA} marker.} +} +\value{ +An \code{epi_df} object given by appending a new column to \code{x}, named +according to the \code{new_col_name} argument. +} +\description{ +Slides a n-timestep mean over variables in an \code{epi_df} object. See the \href{https://cmu-delphi.github.io/epiprocess/articles/slide.html}{slide vignette} for +examples. +} +\details{ +To "slide" means to apply a function or formula over a rolling +window of time steps for each data group, where the window is entered at a +reference time and left and right endpoints are given by the \code{before} and +\code{after} arguments. The unit (the meaning of one time step) is implicitly +defined by the way the \code{time_value} column treats addition and subtraction; +for example, if the time values are coded as \code{Date} objects, then one time +step is one day, since \code{as.Date("2022-01-01") + 1} equals +\code{as.Date("2022-01-02")}. 
Alternatively, the time step can be set explicitly +using the \code{time_step} argument (which if specified would override the +default choice based on \code{time_value} column). If there are not enough time +steps available to complete the window at any given reference time, then +\code{epi_slide()} still attempts to perform the computation anyway (it does not +require a complete window). The issue of what to do with partial +computations (those run on incomplete windows) is therefore left up to the +user, either through the specified function or formula \code{f}, or through +post-processing. For a centrally-aligned slide of \code{n} \code{time_value}s in a +sliding window, set \code{before = (n-1)/2} and \code{after = (n-1)/2} when the +number of \code{time_value}s in a sliding window is odd and \code{before = n/2-1} and +\code{after = n/2} when \code{n} is even. + +Sometimes, we want to experiment with various trailing or leading window +widths and compare the slide outputs. In the (uncommon) case where +zero-width windows are considered, manually pass both the \code{before} and +\code{after} arguments in order to prevent potential warnings. (E.g., \code{before=k} +with \code{k=0} and \code{after} missing may produce a warning. To avoid warnings, +use \verb{before=k, after=0} instead; otherwise, it looks too much like a +leading window was intended, but the \code{after} argument was forgotten or +misspelled.) +} +\examples{ +# slide a 7-day trailing average formula on cases +jhu_csse_daily_subset \%>\% + group_by(geo_value) \%>\% + epi_slide_mean(cases, new_col_name = cases_7dav, names_sep = NULL, before = 6) \%>\% + # rmv a nonessential var. to ensure new col is printed + dplyr::select(-death_rate_7d_av) + +# slide a 7-day leading average +jhu_csse_daily_subset \%>\% + group_by(geo_value) \%>\% + epi_slide_mean(cases, new_col_name = cases_7dav, names_sep = NULL, after = 6) \%>\% + # rmv a nonessential var. to ensure new col is printed + dplyr::select(-death_rate_7d_av) + +# slide a 7-day centre-aligned average +jhu_csse_daily_subset \%>\% + group_by(geo_value) \%>\% + epi_slide_mean(cases, new_col_name = cases_7dav, names_sep = NULL, before = 3, after = 3) \%>\% + # rmv a nonessential var. to ensure new col is printed + dplyr::select(-death_rate_7d_av) + +# slide a 14-day centre-aligned average +jhu_csse_daily_subset \%>\% + group_by(geo_value) \%>\% + epi_slide_mean(cases, new_col_name = cases_7dav, names_sep = NULL, before = 6, after = 7) \%>\% + # rmv a nonessential var. to ensure new col is printed + dplyr::select(-death_rate_7d_av) +} From 4ea9bf795f86018e7664acf4c81db8ccd3d7a50e Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 18 Jan 2024 10:23:09 -0500 Subject: [PATCH 087/345] adding leading/lagging pad dates --- R/slide.R | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/R/slide.R b/R/slide.R index c77f6c69..9a4cd04c 100644 --- a/R/slide.R +++ b/R/slide.R @@ -502,8 +502,11 @@ epi_slide_mean = function(x, col_name, ..., before, after, ref_time_values, new_col_name = "slide_value", as_list_col = FALSE, names_sep = "_", all_rows = FALSE) { all_dates <- seq(min(x$time_value), max(x$time_value), by = time_step) + pad_early_dates <- all_dates[1L] - before:1 + pad_late_dates <- all_dates[1L] + 1:after ## TODO: need to deal with `after` + # `frollmean` is 1-indexed, so adjust our `before` and `after` params. 
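+  # For example, with daily data a trailing 7-day mean has before = 6, so the
+  # window size passed to frollmean() is n = 6 + 1 = 7; as a quick sketch,
+  # data.table::frollmean(1:7, n = 7, align = "right") returns
+  # c(NA, NA, NA, NA, NA, NA, 4), since only the 7th position has a full window.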
m <- before + 1L if (is.null(names_sep)) { @@ -515,13 +518,12 @@ epi_slide_mean = function(x, col_name, ..., before, after, ref_time_values, result <- mutate(x, .real = TRUE) %>% group_by(geo_value) %>% group_modify(~{ - pad_early_dates <- all_dates[1L] - (m - 1):1 # `setdiff` causes date formatting to change. Re-class these as dates. missing_dates <- as.Date(setdiff(all_dates, .x$time_value), origin = "1970-01-01") .x <- bind_rows( .x, - tibble(time_value = c(pad_early_dates, missing_dates), .real = FALSE) + tibble(time_value = c(pad_early_dates, missing_dates, pad_late_dates), .real = FALSE) ) %>% arrange(time_value) .x[, c(result_col_name)] <- data.table::frollmean(.x[, c(col_name)], n = m, ...) From f4cce8f083724600eaf164f08ba51189900a08b3 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Tue, 23 Jan 2024 14:11:28 -0800 Subject: [PATCH 088/345] merge version bump from main back to dev --- .github/workflows/release-helper.yaml | 31 +++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 .github/workflows/release-helper.yaml diff --git a/.github/workflows/release-helper.yaml b/.github/workflows/release-helper.yaml new file mode 100644 index 00000000..c585a1a7 --- /dev/null +++ b/.github/workflows/release-helper.yaml @@ -0,0 +1,31 @@ +name: Release Helper + +on: + push: + branches: + - main + +jobs: + sync_dev: + needs: correct_repository + runs-on: ubuntu-latest + steps: + - name: Check out code + uses: actions/checkout@v2 + with: + ref: dev + ssh-key: ${{ secrets.CMU_DELPHI_DEPLOY_MACHINE_SSH }} + - name: Reset dev branch + run: | + git fetch origin main:main + git reset --hard main + - name: Create pull request into dev + uses: peter-evans/create-pull-request@v3 + with: + branch: bot/sync-main-dev + commit-message: "chore: sync main-dev" + base: dev + title: "chore: sync main->dev" + labels: chore + body: | + Syncing Main->Dev. From fe10b166e27e63cdcb729e47998daa8cbb2e9be4 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 24 Jan 2024 14:18:50 -0800 Subject: [PATCH 089/345] Remove CODEOWNERS to tidy review requests Disable auto review requests to CODEOWNERS by removing the file altogether. --- .github/CODEOWNERS | 1 - 1 file changed, 1 deletion(-) delete mode 100644 .github/CODEOWNERS diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS deleted file mode 100644 index 753bb3c2..00000000 --- a/.github/CODEOWNERS +++ /dev/null @@ -1 +0,0 @@ -* @dajmcdon @brookslogan From 77dea3ed7e20d11aa6d0c888e3bc8bbe71fc1a7e Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 24 Jan 2024 16:36:56 -0800 Subject: [PATCH 090/345] Add pull request template --- .github/pull_request_template.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 .github/pull_request_template.md diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 00000000..bee8bb7d --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,14 @@ +### Checklist + +Please: +- [ ] Request a review from one of the current epiprocess main reviewers: brookslogan, nmdefries. +- [ ] Describe changes made in NEWS.md or under the "Changelog" heading, making + sure breaking changes (backwards-incompatible changes to the documented + interface) are noted. The right location for this may be in flux (see + [here](https://github.com/cmu-delphi/epiprocess/pull/398)). 
+ +### Changelog (if not put in NEWS.md) + +### Magic GitHub syntax to resolve Issues on merge + +- Resolves #{issue number} From 659208b8c02f957f4cb51c11f94e75055de1df25 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 24 Jan 2024 16:39:19 -0800 Subject: [PATCH 091/345] Auto-fill PR template --- .github/pull_request_template.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index bee8bb7d..1cdfe66f 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -1,7 +1,8 @@ ### Checklist Please: -- [ ] Request a review from one of the current epiprocess main reviewers: brookslogan, nmdefries. +- [ ] Request a review from one of the current epiprocess main reviewers: + brookslogan, nmdefries. - [ ] Describe changes made in NEWS.md or under the "Changelog" heading, making sure breaking changes (backwards-incompatible changes to the documented interface) are noted. The right location for this may be in flux (see From 9fc7925a116f4eb43dfeeba8bc03f5cfde1458f1 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 24 Jan 2024 16:42:57 -0800 Subject: [PATCH 092/345] Tweak PR template wording re. "Resolves" syntax --- .github/pull_request_template.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 1cdfe66f..838cc91f 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -10,6 +10,6 @@ Please: ### Changelog (if not put in NEWS.md) -### Magic GitHub syntax to resolve Issues on merge +### Magic GitHub syntax to mark associated Issue(s) as resolved when this is merged into the default branch - Resolves #{issue number} From fdc8af10cbbe36643afa19934f00f5332d76b33a Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 24 Jan 2024 16:44:09 -0800 Subject: [PATCH 093/345] Remind what PR base branch should be --- .github/pull_request_template.md | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 838cc91f..e0a3b97f 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -1,6 +1,7 @@ ### Checklist Please: +- [ ] Make sure this PR is against "dev", not "main". - [ ] Request a review from one of the current epiprocess main reviewers: brookslogan, nmdefries. - [ ] Describe changes made in NEWS.md or under the "Changelog" heading, making From 681b283a21b6a06651b020c9b9744ff746cdf767 Mon Sep 17 00:00:00 2001 From: "Logan C. 
Brooks" Date: Wed, 24 Jan 2024 16:56:01 -0800 Subject: [PATCH 094/345] Add context/notes heading to PR template --- .github/pull_request_template.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index e0a3b97f..8ce3eb55 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -14,3 +14,5 @@ Please: ### Magic GitHub syntax to mark associated Issue(s) as resolved when this is merged into the default branch - Resolves #{issue number} + +### Context / Other notes From f9201624a31ec6685f3fc4f23207e060b534d7e2 Mon Sep 17 00:00:00 2001 From: brookslogan Date: Wed, 24 Jan 2024 16:59:37 -0800 Subject: [PATCH 095/345] Update .github/pull_request_template.md Co-authored-by: Dmitry Shemetov --- .github/pull_request_template.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 8ce3eb55..0a76a8af 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -4,12 +4,12 @@ Please: - [ ] Make sure this PR is against "dev", not "main". - [ ] Request a review from one of the current epiprocess main reviewers: brookslogan, nmdefries. -- [ ] Describe changes made in NEWS.md or under the "Changelog" heading, making +- [ ] Describe changes made in NEWS.md, making sure breaking changes (backwards-incompatible changes to the documented interface) are noted. The right location for this may be in flux (see [here](https://github.com/cmu-delphi/epiprocess/pull/398)). -### Changelog (if not put in NEWS.md) +### Change explanations for reviewer ### Magic GitHub syntax to mark associated Issue(s) as resolved when this is merged into the default branch From 121844f3f302b198bb91d21ca9d27f66562e3fd5 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 24 Jan 2024 16:59:52 -0800 Subject: [PATCH 096/345] Remove redundant heading in PR template --- .github/pull_request_template.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 0a76a8af..9277eab2 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -14,5 +14,3 @@ Please: ### Magic GitHub syntax to mark associated Issue(s) as resolved when this is merged into the default branch - Resolves #{issue number} - -### Context / Other notes From cc4c310d1fdacee60db93008ef5a2c6b5ca43fe2 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 24 Jan 2024 17:00:18 -0800 Subject: [PATCH 097/345] Format PR template --- .github/pull_request_template.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 9277eab2..ffd9afa9 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -4,9 +4,9 @@ Please: - [ ] Make sure this PR is against "dev", not "main". - [ ] Request a review from one of the current epiprocess main reviewers: brookslogan, nmdefries. -- [ ] Describe changes made in NEWS.md, making - sure breaking changes (backwards-incompatible changes to the documented - interface) are noted. The right location for this may be in flux (see +- [ ] Describe changes made in NEWS.md, making sure breaking changes + (backwards-incompatible changes to the documented interface) are noted. + The right location for this may be in flux (see [here](https://github.com/cmu-delphi/epiprocess/pull/398)). 
### Change explanations for reviewer From 521268be7f31dfe333f0d42202770d73560c02e7 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 18 Jan 2024 13:47:02 -0500 Subject: [PATCH 098/345] handle `after`; reformat to `slide_one_grp` format date and name cleanup --- NAMESPACE | 3 ++ R/slide.R | 78 ++++++++++++++++++++++++++++++------------- man/epi_slide_mean.Rd | 9 +++-- 3 files changed, 64 insertions(+), 26 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 0b7fe384..24df62cc 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -73,6 +73,7 @@ importFrom(data.table,rbindlist) importFrom(data.table,set) importFrom(data.table,setDF) importFrom(data.table,setkeyv) +importFrom(dplyr,"%>%") importFrom(dplyr,arrange) importFrom(dplyr,bind_rows) importFrom(dplyr,dplyr_col_modify) @@ -89,10 +90,12 @@ importFrom(dplyr,relocate) importFrom(dplyr,rename) importFrom(dplyr,select) importFrom(dplyr,slice) +importFrom(dplyr,tibble) importFrom(dplyr,ungroup) importFrom(lubridate,days) importFrom(lubridate,weeks) importFrom(magrittr,"%>%") +importFrom(purrr,map) importFrom(purrr,map_lgl) importFrom(rlang,"!!!") importFrom(rlang,"!!") diff --git a/R/slide.R b/R/slide.R index 9a4cd04c..9028da11 100644 --- a/R/slide.R +++ b/R/slide.R @@ -383,8 +383,12 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' @param x The `epi_df` object under consideration, [grouped][dplyr::group_by] #' or ungrouped. If ungrouped, all data in `x` will be treated as part of a #' single data group. -#' @param ... Additional arguments to pass to `data.table::frollmean`. `na.rm` -#' and `algo` are important to consider. +#' @param col_name A character vector of the names of one or more columns for +#' which to calculate the rolling mean. +#' @param ... Additional arguments to pass to `data.table::frollmean`, for +#' example, `na.rm` and `algo`. `data.table::frollmean` is automatically +#' passed the data `x` to operate on, the window size `n`, and the alignment +#' `align`. Providing these args via `...` will cause an error. #' @param before,after How far `before` and `after` each `ref_time_value` should #' the sliding window extend? At least one of these two arguments must be #' provided; the other's default will be 0. Any value provided for either @@ -464,9 +468,8 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' leading window was intended, but the `after` argument was forgotten or #' misspelled.) #' -#' @importFrom lubridate days weeks -#' @importFrom dplyr bind_rows group_vars filter select -#' @importFrom rlang .data .env !! enquo enquos sym env missing_arg +#' @importFrom dplyr bind_rows mutate %>% arrange tibble +#' @importFrom purrr map #' @importFrom data.table frollmean #' @export #' @examples @@ -502,12 +505,19 @@ epi_slide_mean = function(x, col_name, ..., before, after, ref_time_values, new_col_name = "slide_value", as_list_col = FALSE, names_sep = "_", all_rows = FALSE) { all_dates <- seq(min(x$time_value), max(x$time_value), by = time_step) - pad_early_dates <- all_dates[1L] - before:1 - pad_late_dates <- all_dates[1L] + 1:after - ## TODO: need to deal with `after` - # `frollmean` is 1-indexed, so adjust our `before` and `after` params. 
- m <- before + 1L + pad_early_dates <- c() + pad_late_dates <- c() + if (before != 0) { + pad_early_dates <- all_dates[1L] - before:1 + } + if (after != 0) { + pad_late_dates <- all_dates[length(all_dates)] + 1:after + } + + # `frollmean` is 1-indexed, so create a new window width based on our + # `before` and `after` params. + m <- before + after + 1L if (is.null(names_sep)) { result_col_name <- new_col_name @@ -515,23 +525,43 @@ epi_slide_mean = function(x, col_name, ..., before, after, ref_time_values, result_col_name <- paste(new_col_name, col_name, sep = names_sep) } - result <- mutate(x, .real = TRUE) %>% - group_by(geo_value) %>% - group_modify(~{ - - # `setdiff` causes date formatting to change. Re-class these as dates. - missing_dates <- as.Date(setdiff(all_dates, .x$time_value), origin = "1970-01-01") - .x <- bind_rows( - .x, - tibble(time_value = c(pad_early_dates, missing_dates, pad_late_dates), .real = FALSE) - ) %>% - arrange(time_value) - .x[, c(result_col_name)] <- data.table::frollmean(.x[, c(col_name)], n = m, ...) - .x + slide_one_grp <- function(.data_group, .group_key, ...) { + # `setdiff` causes date formatting to change. Re-class these as dates. + missing_dates <- as.Date(setdiff(all_dates, .data_group$time_value), origin = "1970-01-01") + + # `frollmean` requires a full window to compute a result. Add NA values + # to beginning and end of the group so that we get results for the + # first `before` and last `after` elements. + .data_group <- bind_rows( + .data_group, + tibble(time_value = c(missing_dates, pad_early_dates, pad_late_dates), .real = FALSE) + ) %>% + arrange(time_value) + + roll_output <- data.table::frollmean( + x = .data_group[, col_name], n = m, align = "right", ... + ) + + if (after >= 1) { + # Right-aligned `frollmean` results' `ref_time_value`s will be `after` timesteps + # ahead of where they should be. Shift results to the left by `after` timesteps. + .data_group[, result_col_name] <- purrr::map(roll_output, function(.x) { + c(.x[(after + 1L):length(.x)], rep(NA, after)) + } + ) + } else { + .data_group[, result_col_name] <- roll_output } - ) + + return(.data_group) + } + + result <- mutate(x, .real = TRUE) %>% + group_modify(slide_one_grp, ..., .keep = FALSE) + result <- result[result$.real, ] result$.real <- NULL + ungroup(result) } diff --git a/man/epi_slide_mean.Rd b/man/epi_slide_mean.Rd index f96dfd94..686204f5 100644 --- a/man/epi_slide_mean.Rd +++ b/man/epi_slide_mean.Rd @@ -23,8 +23,13 @@ epi_slide_mean( or ungrouped. If ungrouped, all data in \code{x} will be treated as part of a single data group.} -\item{...}{Additional arguments to pass to \code{data.table::frollmean}. \code{na.rm} -and \code{algo} are important to consider.} +\item{col_name}{A character vector of the names of one or more columns for +which to calculate the rolling mean.} + +\item{...}{Additional arguments to pass to \code{data.table::frollmean}, for +example, \code{na.rm} and \code{algo}. \code{data.table::frollmean} is automatically +passed the data \code{x} to operate on, the window size \code{n}, and the alignment +\code{align}. Providing these args via \code{...} will cause an error.} \item{before, after}{How far \code{before} and \code{after} each \code{ref_time_value} should the sliding window extend? 
At least one of these two arguments must be From c133614b67437c92f4555c43248d4bdb6a8f3168 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 18 Jan 2024 14:50:48 -0500 Subject: [PATCH 099/345] param checks --- R/slide.R | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/R/slide.R b/R/slide.R index 9028da11..1527f235 100644 --- a/R/slide.R +++ b/R/slide.R @@ -504,6 +504,69 @@ epi_slide_mean = function(x, col_name, ..., before, after, ref_time_values, time_step, new_col_name = "slide_value", as_list_col = FALSE, names_sep = "_", all_rows = FALSE) { + # Check we have an `epi_df` object + if (!inherits(x, "epi_df")) Abort("`x` must be of class `epi_df`.") + + if (missing(ref_time_values)) { + ref_time_values <- unique(x$time_value) + } + + # Some of these `ref_time_values` checks and processing steps also apply to + # the `ref_time_values` default; for simplicity, just apply all the steps + # regardless of whether we are working with a default or user-provided + # `ref_time_values`: + if (length(ref_time_values) == 0L) { + Abort("`ref_time_values` must have at least one element.") + } else if (any(is.na(ref_time_values))) { + Abort("`ref_time_values` must not include `NA`.") + } else if (anyDuplicated(ref_time_values) != 0L) { + Abort("`ref_time_values` must not contain any duplicates; use `unique` if appropriate.") + } else if (!all(ref_time_values %in% unique(x$time_value))) { + Abort("All `ref_time_values` must appear in `x$time_value`.") + } else { + ref_time_values <- sort(ref_time_values) + } + + # Validate and pre-process `before`, `after`: + if (!missing(before)) { + before <- vctrs::vec_cast(before, integer()) + if (length(before) != 1L || is.na(before) || before < 0L) { + Abort("`before` must be length-1, non-NA, non-negative") + } + } + if (!missing(after)) { + after <- vctrs::vec_cast(after, integer()) + if (length(after) != 1L || is.na(after) || after < 0L) { + Abort("`after` must be length-1, non-NA, non-negative") + } + } + if (missing(before)) { + if (missing(after)) { + Abort("Either or both of `before`, `after` must be provided.") + } else if (after == 0L) { + Warn("`before` missing, `after==0`; maybe this was intended to be some + non-zero-width trailing window, but since `before` appears to be + missing, it's interpreted as a zero-width window (`before=0, + after=0`).") + } + before <- 0L + } else if (missing(after)) { + if (before == 0L) { + Warn("`before==0`, `after` missing; maybe this was intended to be some + non-zero-width leading window, but since `after` appears to be + missing, it's interpreted as a zero-width window (`before=0, + after=0`).") + } + after <- 0L + } + + # If a custom time step is specified, then redefine units + if (!missing(time_step)) { + before <- time_step(before) + after <- time_step(after) + } + + # time_step can be any of `c("days", "weeks", "months", "quarters", "years")` all_dates <- seq(min(x$time_value), max(x$time_value), by = time_step) pad_early_dates <- c() From 73926f3ddb6a9b6b1b9c7f93fd0d52d9b8c76ca8 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Mon, 22 Jan 2024 18:07:22 -0500 Subject: [PATCH 100/345] filter to ref_time_values passed by user before and after computation --- R/slide.R | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/R/slide.R b/R/slide.R index 1527f235..1b5be2ac 100644 --- a/R/slide.R +++ b/R/slide.R @@ -507,7 +507,8 
@@ epi_slide_mean = function(x, col_name, ..., before, after, ref_time_values, # Check we have an `epi_df` object if (!inherits(x, "epi_df")) Abort("`x` must be of class `epi_df`.") - if (missing(ref_time_values)) { + user_provided_rtvs <- !missing(ref_time_values) + if (!user_provided_rtvs) { ref_time_values <- unique(x$time_value) } @@ -561,10 +562,10 @@ epi_slide_mean = function(x, col_name, ..., before, after, ref_time_values, } # If a custom time step is specified, then redefine units - if (!missing(time_step)) { - before <- time_step(before) - after <- time_step(after) - } + # if (!missing(time_step)) { + # before <- time_step(before) + # after <- time_step(after) + # } # time_step can be any of `c("days", "weeks", "months", "quarters", "years")` all_dates <- seq(min(x$time_value), max(x$time_value), by = time_step) @@ -578,6 +579,22 @@ epi_slide_mean = function(x, col_name, ..., before, after, ref_time_values, pad_late_dates <- all_dates[length(all_dates)] + 1:after } + if (user_provided_rtvs) { + # To reduce computational effort, filter down to only data required for + # range within provided ref time values. We don't check if the ref time + # value sequence is complete. Because `data.table::frollmean` requires a + # completed date sequence to correctly calculate the rolling average, + # filtering down to requested ref time values + before and after date + # padding would be complicated and likely not worth the upfront effort + # given the speed of `frollmean` compared to R filtering. + subset_ref_time_values <- seq( + min(ref_time_values) - length(pad_early_dates), + max(ref_time_values) + length(pad_late_dates), + by = time_step + ) + x <- x[x$time_value %in% subset_ref_time_values, ] + } + # `frollmean` is 1-indexed, so create a new window width based on our # `before` and `after` params. m <- before + after + 1L @@ -625,6 +642,10 @@ epi_slide_mean = function(x, col_name, ..., before, after, ref_time_values, result <- result[result$.real, ] result$.real <- NULL + if (user_provided_rtvs) { + result <- result[time_value %in% ref_time_values, ] + } + ungroup(result) } From 42757f504d548256cc980927cf1dbd5101bd9314 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Mon, 22 Jan 2024 18:38:57 -0500 Subject: [PATCH 101/345] replace results with NA if `all_rows`; make sure output is epi_df --- R/slide.R | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/R/slide.R b/R/slide.R index 1b5be2ac..5330a32f 100644 --- a/R/slide.R +++ b/R/slide.R @@ -579,7 +579,7 @@ epi_slide_mean = function(x, col_name, ..., before, after, ref_time_values, pad_late_dates <- all_dates[length(all_dates)] + 1:after } - if (user_provided_rtvs) { + if (user_provided_rtvs && !all_rows) { # To reduce computational effort, filter down to only data required for # range within provided ref time values. We don't check if the ref time # value sequence is complete. Because `data.table::frollmean` requires a @@ -642,8 +642,16 @@ epi_slide_mean = function(x, col_name, ..., before, after, ref_time_values, result <- result[result$.real, ] result$.real <- NULL - if (user_provided_rtvs) { - result <- result[time_value %in% ref_time_values, ] + if (all_rows) { + result[!(result$time_value %in% ref_time_values), result_col_name] <- NA + } else if (user_provided_rtvs) { + result <- result[result$time_value %in% ref_time_values, ] + } + + if (!is_epi_df(result)) { + # `all_rows` and `as_list_col` handling strip epi_df format and metadata. + # Restore them. 
+ result <- bind_rows(x[c(),], result) } ungroup(result) From 71a5ac359881e9b9d08048b0d9fb654ab5c3eec1 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Tue, 23 Jan 2024 13:42:03 -0500 Subject: [PATCH 102/345] don't need to pre-filter for user-provided ref time values --- R/slide.R | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/R/slide.R b/R/slide.R index 5330a32f..78326ed5 100644 --- a/R/slide.R +++ b/R/slide.R @@ -579,22 +579,6 @@ epi_slide_mean = function(x, col_name, ..., before, after, ref_time_values, pad_late_dates <- all_dates[length(all_dates)] + 1:after } - if (user_provided_rtvs && !all_rows) { - # To reduce computational effort, filter down to only data required for - # range within provided ref time values. We don't check if the ref time - # value sequence is complete. Because `data.table::frollmean` requires a - # completed date sequence to correctly calculate the rolling average, - # filtering down to requested ref time values + before and after date - # padding would be complicated and likely not worth the upfront effort - # given the speed of `frollmean` compared to R filtering. - subset_ref_time_values <- seq( - min(ref_time_values) - length(pad_early_dates), - max(ref_time_values) + length(pad_late_dates), - by = time_step - ) - x <- x[x$time_value %in% subset_ref_time_values, ] - } - # `frollmean` is 1-indexed, so create a new window width based on our # `before` and `after` params. m <- before + after + 1L From 1a0741d27eb59db2829767d4f7317c3a3598f789 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Mon, 22 Jan 2024 18:28:04 -0500 Subject: [PATCH 103/345] warn that as_list_col not supported Keeping `as_list_col` for now so args match those of `epi_slide` as closely as possible. --- R/slide.R | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/R/slide.R b/R/slide.R index 78326ed5..48d3002c 100644 --- a/R/slide.R +++ b/R/slide.R @@ -507,6 +507,10 @@ epi_slide_mean = function(x, col_name, ..., before, after, ref_time_values, # Check we have an `epi_df` object if (!inherits(x, "epi_df")) Abort("`x` must be of class `epi_df`.") + if (as_list_col) { + Warn("`as_list_col` is not supported for `epi_slide_mean`. This setting will be ignored") + } + user_provided_rtvs <- !missing(ref_time_values) if (!user_provided_rtvs) { ref_time_values <- unique(x$time_value) From a51e7ee720a8005d281ce96923b43c6db6bd904d Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Mon, 22 Jan 2024 19:05:34 -0500 Subject: [PATCH 104/345] support list col output --- R/slide.R | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/R/slide.R b/R/slide.R index 48d3002c..f770d6e8 100644 --- a/R/slide.R +++ b/R/slide.R @@ -507,10 +507,6 @@ epi_slide_mean = function(x, col_name, ..., before, after, ref_time_values, # Check we have an `epi_df` object if (!inherits(x, "epi_df")) Abort("`x` must be of class `epi_df`.") - if (as_list_col) { - Warn("`as_list_col` is not supported for `epi_slide_mean`. 
This setting will be ignored")
-  }
-
   user_provided_rtvs <- !missing(ref_time_values)
   if (!user_provided_rtvs) {
     ref_time_values <- unique(x$time_value)
@@ -636,6 +632,16 @@ epi_slide_mean = function(x, col_name, ..., before, after, ref_time_values,
     result <- result[result$time_value %in% ref_time_values, ]
   }
 
+  if (as_list_col) {
+    result[, result_col_name] <- purrr::map(result_col_name,
+      function(.x) {
+        tmp <- result[[.x]]
+        tmp[is.na(tmp)] <- list(NULL)
+        as.list(tmp)
+      }
+    )
+  }
+
   if (!is_epi_df(result)) {
     # `all_rows` and `as_list_col` handling strip epi_df format and metadata.
     # Restore them.

From f6464c02c6406a68f4cbffe4043c5cdb288f2303 Mon Sep 17 00:00:00 2001
From: "Daniel J. McDonald"
Date: Thu, 25 Jan 2024 08:43:51 -0800
Subject: [PATCH 105/345] bugfix: add missing arg_is_numeric

---
 R/utils-arg.R | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/R/utils-arg.R b/R/utils-arg.R
index dca21646..b48a3642 100644
--- a/R/utils-arg.R
+++ b/R/utils-arg.R
@@ -24,6 +24,17 @@ arg_is_scalar <- function(..., allow_null = FALSE, allow_na = FALSE) {
   )
 }
 
+arg_is_numeric <- function(..., allow_null = FALSE) {
+  handle_arg_list(
+    ...,
+    tests = function(name, value) {
+      if (!(is.numeric(value) | (is.null(value) & allow_null))) {
+        cli::cli_abort("All {.val {name}} must be numeric.")
+      }
+    }
+  )
+}
+
 arg_is_int <- function(..., allow_null = FALSE) {
   arg_is_numeric(..., allow_null = allow_null)
   handle_arg_list(

From b3d876d85a077fe606700ca3bd9643458bac5d92 Mon Sep 17 00:00:00 2001
From: "Daniel J. McDonald"
Date: Thu, 25 Jan 2024 09:05:37 -0800
Subject: [PATCH 106/345] bugfix: remove ability to plot non-numeric vars

---
 R/autoplot.R | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/R/autoplot.R b/R/autoplot.R
index bcd0b54d..c193e01f 100644
--- a/R/autoplot.R
+++ b/R/autoplot.R
@@ -32,6 +32,12 @@
 #'   .color_by = "none",
 #'   .facet_by = "geo_value"
 #' )
+#' autoplot(jhu_csse_daily_subset, case_rate_7d_av, .color_by = "none",
+#'   .base_color = "red", .facet_by = "geo_value")
+#'
+#' # .base_color specification won't have any effect due to the .color_by default
+#' autoplot(jhu_csse_daily_subset, case_rate_7d_av,
+#'   .base_color = "red", .facet_by = "geo_value")
 autoplot.epi_df <- function(
     object, ...,
     .color_by = c("all_keys", "geo_value", "other_keys", ".response", "all", "none"),
@@ -51,6 +57,7 @@ autoplot.epi_df <- function(
 
   # --- check for numeric variables
   allowed <- purrr::map_lgl(object[non_key_cols], is.numeric)
+  allowed <- allowed[allowed]
   if (length(allowed) == 0) {
     cli::cli_abort("No numeric variables were available to plot automatically.")
   }

From b1a2eef72b6898e08dcff0d33705f65e4d7901ea Mon Sep 17 00:00:00 2001
From: "Daniel J. McDonald"
Date: Thu, 25 Jan 2024 09:09:21 -0800
Subject: [PATCH 107/345] remove unused

---
 R/key_colnames.R | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/R/key_colnames.R b/R/key_colnames.R
index 91be7ab6..158c5a86 100644
--- a/R/key_colnames.R
+++ b/R/key_colnames.R
@@ -38,7 +38,3 @@ kill_time_value <- function(v) {
   arg_is_chr(v)
   v[v != "time_value"]
 }
-
-key_colnames_only <- function(x, ...) {
-  kill_time_value(key_colnames(x, ...))
-}

From d30fc8542f5f7e11a9b19055200e41c42458f424 Mon Sep 17 00:00:00 2001
From: "Daniel J. McDonald"
Date: Thu, 25 Jan 2024 09:17:03 -0800
Subject: [PATCH 108/345] redocument

---
 man/autoplot.epi_df.Rd | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/man/autoplot.epi_df.Rd b/man/autoplot.epi_df.Rd
index 68aef0c4..df30528e 100644
--- a/man/autoplot.epi_df.Rd
+++ b/man/autoplot.epi_df.Rd
@@ -56,4 +56,10 @@ autoplot(jhu_csse_daily_subset, case_rate_7d_av,
   .color_by = "none",
   .facet_by = "geo_value"
 )
+autoplot(jhu_csse_daily_subset, case_rate_7d_av, .color_by = "none",
+  .base_color = "red", .facet_by = "geo_value")
+
+# .base_color specification won't have any effect due to the .color_by default
+autoplot(jhu_csse_daily_subset, case_rate_7d_av,
+  .base_color = "red", .facet_by = "geo_value")
 }

From 4fd4d89e625468db1e81c20c0fcfe628ecc1683f Mon Sep 17 00:00:00 2001
From: dsweber2
Date: Thu, 25 Jan 2024 11:10:09 -0800
Subject: [PATCH 109/345] nat suggestions

---
 .../{workflows => }/version_bump_options.json        |  7 -------
 .github/workflows/{pkgdown_dev.yaml => pkgdown.yaml} |  4 +++-
 .github/workflows/release-helper.yaml                | 16 ++++------------
 3 files changed, 7 insertions(+), 20 deletions(-)
 rename .github/{workflows => }/version_bump_options.json (68%)
 rename .github/workflows/{pkgdown_dev.yaml => pkgdown.yaml} (87%)

diff --git a/.github/workflows/version_bump_options.json b/.github/version_bump_options.json
similarity index 68%
rename from .github/workflows/version_bump_options.json
rename to .github/version_bump_options.json
index a2f6aed9..38e2b63b 100644
--- a/.github/workflows/version_bump_options.json
+++ b/.github/version_bump_options.json
@@ -14,13 +14,6 @@
       "branch": "main",
       "tag": true,
       "reset": "build"
-    },
-    {
-      "trigger": "commit",
-      "bump": "major",
-      "branch": "release",
-      "tag": true,
-      "reset": ["minor", "build"]
     }
   ]
 }
diff --git a/.github/workflows/pkgdown_dev.yaml b/.github/workflows/pkgdown.yaml
similarity index 87%
rename from .github/workflows/pkgdown_dev.yaml
rename to .github/workflows/pkgdown.yaml
index 1f33c083..aa7ddec3 100644
--- a/.github/workflows/pkgdown_dev.yaml
+++ b/.github/workflows/pkgdown.yaml
@@ -40,7 +40,9 @@ jobs:
       - name: Build site
         env:
           DELPHI_EPIDATA_KEY: ${{ secrets.SECRET_EPIPROCESS_GHACTIONS_DELPHI_EPIDATA_KEY }}
-        run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE, override=list(PKGDOWN_DEV_MODE="devel"))
+        run: |
+          mode <- ifelse("${{ GITHUB_BASE_REF }}" == "main", "release", "devel")
+          pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE, override=list(PKGDOWN_DEV_MODE=mode))
         shell: Rscript {0}
 
       - name: Deploy to GitHub pages 🚀
diff --git a/.github/workflows/release-helper.yaml b/.github/workflows/release-helper.yaml
index c585a1a7..cc0bc6fa 100644
--- a/.github/workflows/release-helper.yaml
+++ b/.github/workflows/release-helper.yaml
@@ -11,21 +11,13 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Check out code
-        uses: actions/checkout@v2
+        uses: actions/checkout@v3
        with:
           ref: dev
+          fetch-depth: 0
           ssh-key: ${{ secrets.CMU_DELPHI_DEPLOY_MACHINE_SSH }}
       - name: Reset dev branch
         run: |
           git fetch origin main:main
-          git reset --hard main
-      - name: Create pull request into dev
-        uses: peter-evans/create-pull-request@v3
-        with:
-          branch: bot/sync-main-dev
-          commit-message: "chore: sync main-dev"
-          base: dev
-          title: "chore: sync main->dev"
-          labels: chore
-          body: |
-            Syncing Main->Dev.
+ git merge main + git push From 283fba2c5f4a0a130fd39b1c8eb83b569b6d3dce Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Thu, 25 Jan 2024 11:43:20 -0800 Subject: [PATCH 110/345] fix path, mode source --- .github/workflows/auto_semver_bump.yaml | 2 +- .github/workflows/pkgdown.yaml | 6 ++- .github/workflows/pkgdown_main.yaml | 52 ------------------------- 3 files changed, 6 insertions(+), 54 deletions(-) delete mode 100644 .github/workflows/pkgdown_main.yaml diff --git a/.github/workflows/auto_semver_bump.yaml b/.github/workflows/auto_semver_bump.yaml index 796ea20d..66d32a56 100644 --- a/.github/workflows/auto_semver_bump.yaml +++ b/.github/workflows/auto_semver_bump.yaml @@ -19,5 +19,5 @@ jobs: - name: Bump Versions uses: michmich112/version-bumper@master with: - options-file: "./.github/workflows/version_bump_options.json" + options-file: "./.github/version_bump_options.json" github-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index aa7ddec3..ba2ac596 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -41,7 +41,11 @@ jobs: env: DELPHI_EPIDATA_KEY: ${{ secrets.SECRET_EPIPROCESS_GHACTIONS_DELPHI_EPIDATA_KEY }} run: | - mode <- ifelse("${{ GITHUB_BASE_REF }}" == "main", "release", "devel") + if (startsWith("${{ GITHUB_EVENT_NAME }}", "pull_request")) { + mode <- ifelse("${{ GITHUB_BASE_REF }}" == "main", "release", "devel") + } else { + mode <- ifelse("${{ GITHUB_REF_NAME }}" == "main", "release", "devel") + } pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE, override=list(PKGDOWN_DEV_MODE=mode)) shell: Rscript {0} diff --git a/.github/workflows/pkgdown_main.yaml b/.github/workflows/pkgdown_main.yaml deleted file mode 100644 index a72010f5..00000000 --- a/.github/workflows/pkgdown_main.yaml +++ /dev/null @@ -1,52 +0,0 @@ -# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples -# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help -# -# Created with usethis + edited to run on PRs to dev, use API key. -on: - push: - branches: [main] - pull_request: - branches: [main] - release: - types: [published] - workflow_dispatch: - -name: pkgdown - -jobs: - pkgdown: - # only build docs on the main repository and not forks - if: github.repository_owner == 'cmu-delphi' - runs-on: ubuntu-latest - # Only restrict concurrency for non-PR jobs - concurrency: - group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }} - env: - GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} - steps: - - uses: actions/checkout@v3 - - - uses: r-lib/actions/setup-pandoc@v2 - - - uses: r-lib/actions/setup-r@v2 - with: - use-public-rspm: true - - - uses: r-lib/actions/setup-r-dependencies@v2 - with: - extra-packages: any::pkgdown, local::. 
- needs: website - - - name: Build site - env: - DELPHI_EPIDATA_KEY: ${{ secrets.SECRET_EPIPROCESS_GHACTIONS_DELPHI_EPIDATA_KEY }} - run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE, override=list(PKGDOWN_DEV_MODE="release")) - shell: Rscript {0} - - - name: Deploy to GitHub pages 🚀 - if: github.event_name != 'pull_request' - uses: JamesIves/github-pages-deploy-action@v4.4.1 - with: - clean: false - branch: gh-pages - folder: docs From af454f80f743ba8be81354350d7626d26ce80bf4 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Thu, 25 Jan 2024 11:53:44 -0800 Subject: [PATCH 111/345] merging dev/main pkgdown workflows --- .github/workflows/pkgdown.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index ba2ac596..c3186e19 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -4,9 +4,9 @@ # Created with usethis + edited to run on PRs to dev, use API key. on: push: - branches: dev + branches: [dev, main] pull_request: - branches: dev + branches: [dev, main] release: types: [published] workflow_dispatch: From f7293b2dee38b2e7ecc6b759a4e555ff39a4572d Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Thu, 25 Jan 2024 14:25:49 -0800 Subject: [PATCH 112/345] hacking around branch protection --- .github/workflows/auto_semver_bump.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/auto_semver_bump.yaml b/.github/workflows/auto_semver_bump.yaml index 66d32a56..d4ea288f 100644 --- a/.github/workflows/auto_semver_bump.yaml +++ b/.github/workflows/auto_semver_bump.yaml @@ -13,6 +13,8 @@ jobs: steps: # Checkout action is required - uses: actions/checkout@v2 + with: + ssh-key: ${{ secrets.SEMVER_BUMPER_KEY }} - uses: actions/setup-node@v1 with: node-version: "12" @@ -20,4 +22,5 @@ jobs: uses: michmich112/version-bumper@master with: options-file: "./.github/version_bump_options.json" + ssh-key: ${{ secrets.SEMVER_BUMPER_KEY }} github-token: ${{ secrets.GITHUB_TOKEN }} From 860b6c182da1f4c3ac22444e2dbf62e388d5bd89 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Thu, 25 Jan 2024 14:40:17 -0800 Subject: [PATCH 113/345] unclear where to put the ssh-key exactly --- .github/workflows/auto_semver_bump.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/auto_semver_bump.yaml b/.github/workflows/auto_semver_bump.yaml index d4ea288f..f6314220 100644 --- a/.github/workflows/auto_semver_bump.yaml +++ b/.github/workflows/auto_semver_bump.yaml @@ -22,5 +22,4 @@ jobs: uses: michmich112/version-bumper@master with: options-file: "./.github/version_bump_options.json" - ssh-key: ${{ secrets.SEMVER_BUMPER_KEY }} github-token: ${{ secrets.GITHUB_TOKEN }} From 1cc4f80914126a5b1da849ebfc4afc4d8590df41 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Thu, 25 Jan 2024 15:04:03 -0800 Subject: [PATCH 114/345] version? 
--- .github/workflows/auto_semver_bump.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/auto_semver_bump.yaml b/.github/workflows/auto_semver_bump.yaml index f6314220..12f24f2c 100644 --- a/.github/workflows/auto_semver_bump.yaml +++ b/.github/workflows/auto_semver_bump.yaml @@ -12,7 +12,7 @@ jobs: steps: # Checkout action is required - - uses: actions/checkout@v2 + - uses: actions/checkout@v3.5.2 with: ssh-key: ${{ secrets.SEMVER_BUMPER_KEY }} - uses: actions/setup-node@v1 From ea12144d03a43b8d44d7c6a8d202f287fbab70f6 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Thu, 25 Jan 2024 15:51:28 -0800 Subject: [PATCH 115/345] Update .github/workflows/auto_semver_bump.yaml --- .github/workflows/auto_semver_bump.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/auto_semver_bump.yaml b/.github/workflows/auto_semver_bump.yaml index 12f24f2c..b72dee2d 100644 --- a/.github/workflows/auto_semver_bump.yaml +++ b/.github/workflows/auto_semver_bump.yaml @@ -14,7 +14,7 @@ jobs: # Checkout action is required - uses: actions/checkout@v3.5.2 with: - ssh-key: ${{ secrets.SEMVER_BUMPER_KEY }} + token: ${{ secrets.DMITRY_PAT }} - uses: actions/setup-node@v1 with: node-version: "12" From 9cdc6de07017c788eb589ea4d2d2a85d36ca594c Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Thu, 25 Jan 2024 15:55:47 -0800 Subject: [PATCH 116/345] Update auto_semver_bump.yaml --- .github/workflows/auto_semver_bump.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/auto_semver_bump.yaml b/.github/workflows/auto_semver_bump.yaml index b72dee2d..f61f2fd7 100644 --- a/.github/workflows/auto_semver_bump.yaml +++ b/.github/workflows/auto_semver_bump.yaml @@ -22,4 +22,4 @@ jobs: uses: michmich112/version-bumper@master with: options-file: "./.github/version_bump_options.json" - github-token: ${{ secrets.GITHUB_TOKEN }} + github-token: ${{ secrets.DMITRY_PAT }} From 6db11947b4385cf7da909629a9f97b0d550654c2 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Thu, 25 Jan 2024 17:14:29 -0800 Subject: [PATCH 117/345] ci: fix bumpversion manually --- .bumpversion.cfg | 4 ++++ .github/version_bump_options.json | 19 ----------------- .github/workflows/auto_semver_bump.yaml | 27 ++++++++++++++++++------- 3 files changed, 24 insertions(+), 26 deletions(-) create mode 100644 .bumpversion.cfg delete mode 100644 .github/version_bump_options.json diff --git a/.bumpversion.cfg b/.bumpversion.cfg new file mode 100644 index 00000000..7ad5a922 --- /dev/null +++ b/.bumpversion.cfg @@ -0,0 +1,4 @@ +[bumpversion] +current_version = 0.7.2 + +[bumpversion:file:DESCRIPTION] diff --git a/.github/version_bump_options.json b/.github/version_bump_options.json deleted file mode 100644 index 38e2b63b..00000000 --- a/.github/version_bump_options.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "scheme": "semantic", - "versionFile": "./DESCRIPTION", - "files": [], - "rules": [ - { - "trigger": "commit", - "branch": "dev", - "bump": "build" - }, - { - "trigger": "commit", - "bump": "minor", - "branch": "main", - "tag": true, - "reset": "build" - } - ] -} diff --git a/.github/workflows/auto_semver_bump.yaml b/.github/workflows/auto_semver_bump.yaml index f61f2fd7..e6334168 100644 --- a/.github/workflows/auto_semver_bump.yaml +++ b/.github/workflows/auto_semver_bump.yaml @@ -2,8 +2,10 @@ # also see the options file for configuration name: Manage versions -# not actually on all pushes, see the options file -on: [push] +on: + push: + branches: [dev, 
main] + workflow_dispatch: jobs: bump: @@ -18,8 +20,19 @@ jobs: - uses: actions/setup-node@v1 with: node-version: "12" - - name: Bump Versions - uses: michmich112/version-bumper@master - with: - options-file: "./.github/version_bump_options.json" - github-token: ${{ secrets.DMITRY_PAT }} + - name: Change version number + id: version + run: | + python -m pip install bump2version + if [[ "${{ GITHUB_EVENT_NAME }}" == "pull_request"* ]]; then + branch=$(echo "${{ GITHUB_BASE_REF }}") + else + branch=$(echo "${{ GITHUB_REF_NAME }}") + fi + if [[ $branch == "main" ]]; then + version="minor" + else + version="patch" + fi + bump2version $version --commit --message "Bump version: {current_version} → {new_version}" --allow-dirty + git push From 2c0eb6ce4ecb3b6bcd25b9ea7389e63c9a976038 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Thu, 25 Jan 2024 17:54:56 -0800 Subject: [PATCH 118/345] ci: need Python --- .github/workflows/auto_semver_bump.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/auto_semver_bump.yaml b/.github/workflows/auto_semver_bump.yaml index e6334168..1faaf055 100644 --- a/.github/workflows/auto_semver_bump.yaml +++ b/.github/workflows/auto_semver_bump.yaml @@ -17,9 +17,10 @@ jobs: - uses: actions/checkout@v3.5.2 with: token: ${{ secrets.DMITRY_PAT }} - - uses: actions/setup-node@v1 + - name: Set up Python 3.8 + uses: actions/setup-python@v2 with: - node-version: "12" + python-version: 3.8 - name: Change version number id: version run: | From 50aa65836641bebd57a45e55c51f40e8efc7d100 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Thu, 25 Jan 2024 17:56:45 -0800 Subject: [PATCH 119/345] testing --- .github/workflows/auto_semver_bump.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/auto_semver_bump.yaml b/.github/workflows/auto_semver_bump.yaml index 1faaf055..8f57df2c 100644 --- a/.github/workflows/auto_semver_bump.yaml +++ b/.github/workflows/auto_semver_bump.yaml @@ -4,7 +4,7 @@ name: Manage versions on: push: - branches: [dev, main] + # branches: [dev, main] workflow_dispatch: jobs: From 0a6523f13d7cd1bce31e4f1fe1d07e390fe0fbb7 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Thu, 25 Jan 2024 18:04:21 -0800 Subject: [PATCH 120/345] ci: fix variable refs --- .github/workflows/auto_semver_bump.yaml | 6 +++--- .github/workflows/pkgdown.yaml | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/auto_semver_bump.yaml b/.github/workflows/auto_semver_bump.yaml index 8f57df2c..de10618f 100644 --- a/.github/workflows/auto_semver_bump.yaml +++ b/.github/workflows/auto_semver_bump.yaml @@ -25,10 +25,10 @@ jobs: id: version run: | python -m pip install bump2version - if [[ "${{ GITHUB_EVENT_NAME }}" == "pull_request"* ]]; then - branch=$(echo "${{ GITHUB_BASE_REF }}") + if [[ "${{ github.event_name }}" == "pull_request"* ]]; then + branch=$(echo "${{ github.base_ref }}") else - branch=$(echo "${{ GITHUB_REF_NAME }}") + branch=$(echo "${{ github.ref_name }}") fi if [[ $branch == "main" ]]; then version="minor" diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index c3186e19..9490acc7 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -41,10 +41,10 @@ jobs: env: DELPHI_EPIDATA_KEY: ${{ secrets.SECRET_EPIPROCESS_GHACTIONS_DELPHI_EPIDATA_KEY }} run: | - if (startsWith("${{ GITHUB_EVENT_NAME }}", "pull_request")) { - mode <- ifelse("${{ GITHUB_BASE_REF }}" == "main", "release", "devel") + if 
(startsWith("${{ GITHUB_EVENT_NAME }}", "pull_request")) {
-            mode <- ifelse("${{ GITHUB_BASE_REF }}" == "main", "release", "devel")
+          if (startsWith("${{ github.event_name }}", "pull_request")) {
+            mode <- ifelse("${{ github.base_ref }}" == "main", "release", "devel")
           } else {
-            mode <- ifelse("${{ GITHUB_REF_NAME }}" == "main", "release", "devel")
+            mode <- ifelse("${{ github.ref_name }}" == "main", "release", "devel")
           }
           pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE, override=list(PKGDOWN_DEV_MODE=mode))
         shell: Rscript {0}

From b2b8355feb6e1b013f79130b64e5b5c0bb604985 Mon Sep 17 00:00:00 2001
From: Dmitry Shemetov
Date: Thu, 25 Jan 2024 18:58:42 -0800
Subject: [PATCH 121/345] ci+docs: remove autobump, update template+dev doc
 with bump instructions

---
 .github/pull_request_template.md        | 21 ++++++++-----
 .github/workflows/auto_semver_bump.yaml | 39 -------------------------
 DEVELOPMENT.md                          | 33 ++++++++++-----------
 NEWS.md                                 | 13 +++++----
 4 files changed, 36 insertions(+), 70 deletions(-)
 delete mode 100644 .github/workflows/auto_semver_bump.yaml

diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
index ffd9afa9..3afd83a6 100644
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -1,16 +1,21 @@
 ### Checklist
 
 Please:
-- [ ] Make sure this PR is against "dev", not "main".
-- [ ] Request a review from one of the current epiprocess main reviewers:
-      brookslogan, nmdefries.
-- [ ] Describe changes made in NEWS.md, making sure breaking changes
-      (backwards-incompatible changes to the documented interface) are noted.
-      The right location for this may be in flux (see
-      [here](https://github.com/cmu-delphi/epiprocess/pull/398)).
+
+- [ ] Make sure this PR is against "dev", not "main".
+- [ ] Request a review from one of the current epiprocess main reviewers:
+      brookslogan, nmdefries.
+- [ ] Make sure to bump the version number in `DESCRIPTION` and `NEWS.md`.
+      Always increment the patch version number (the third number), unless you are
+      making a release PR from dev to main, in which case increment the minor
+      version number (the second number).
+- [ ] Describe changes made in NEWS.md, making sure breaking changes
+      (backwards-incompatible changes to the documented interface) are noted.
+      Collect the changes under the next release number (e.g. if you are on
+      0.7.2, then write your changes under the 0.8 heading).
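For reference, the version bump described in the checklist above can also be scripted from R; this is a sketch assuming the `desc` package is available (the checklist itself only asks for a manual edit of `DESCRIPTION` and `NEWS.md`):

# A sketch, assuming the `desc` package; run from the package root.
desc::desc_bump_version("patch") # ordinary PR into dev, e.g. 0.7.2 -> 0.7.3
desc::desc_bump_version("minor") # release PR from dev into main, e.g. 0.7.2 -> 0.8.0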
 
 ### Change explanations for reviewer
 
 ### Magic GitHub syntax to mark associated Issue(s) as resolved when this is merged into the default branch
 
-- Resolves #{issue number}
+- Resolves #{issue number}
diff --git a/.github/workflows/auto_semver_bump.yaml b/.github/workflows/auto_semver_bump.yaml
deleted file mode 100644
index de10618f..00000000
--- a/.github/workflows/auto_semver_bump.yaml
+++ /dev/null
@@ -1,39 +0,0 @@
-# this action: https://github.com/marketplace/actions/version-bumper
-# also see the options file for configuration
-name: Manage versions
-
-on:
-  push:
-    # branches: [dev, main]
-  workflow_dispatch:
-
-jobs:
-  bump:
-    #if: github.event.pull_request.merged == true
-    runs-on: ubuntu-latest
-
-    steps:
-      # Checkout action is required
-      - uses: actions/checkout@v3.5.2
-        with:
-          token: ${{ secrets.DMITRY_PAT }}
-      - name: Set up Python 3.8
-        uses: actions/setup-python@v2
-        with:
-          python-version: 3.8
-      - name: Change version number
-        id: version
-        run: |
-          python -m pip install bump2version
-          if [[ "${{ github.event_name }}" == "pull_request"* ]]; then
-            branch=$(echo "${{ github.base_ref }}")
-          else
-            branch=$(echo "${{ github.ref_name }}")
-          fi
-          if [[ $branch == "main" ]]; then
-            version="minor"
-          else
-            version="patch"
-          fi
-          bump2version $version --commit --message "Bump version: {current_version} → {new_version}" --allow-dirty
-          git push
diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md
index a344b78e..c133099b 100644
--- a/DEVELOPMENT.md
+++ b/DEVELOPMENT.md
@@ -35,25 +35,22 @@ R -e 'devtools::document()'
 python -m http.server -d docs
 ```
 
-For `pkgdown` to correctly generate both public (`main`) and `dev` documentation sites, the package version in `DESCRIPTION` on `dev` must have four components, and be of the format `x.x.x.9000`. The package version on `main` must be in the format `x.x.x`.
-
-The documentation website is updated on push or pull request to the `main` and `dev` branches.
+## Versioning
+
+Please follow the guidelines in the PR template document (reproduced here):
+
+- [ ] Make sure this PR is against "dev", not "main".
+- [ ] Request a review from one of the current epiprocess main reviewers:
+      brookslogan, nmdefries.
+- [ ] Make sure to bump the version number in `DESCRIPTION` and `NEWS.md`.
+      Always increment the patch version number (the third number), unless you are
+      making a release PR from dev to main, in which case increment the minor
+      version number (the second number).
+- [ ] Describe changes made in NEWS.md, making sure breaking changes
+      (backwards-incompatible changes to the documented interface) are noted.
+      Collect the changes under the next release number (e.g. if you are on
+      0.7.2, then write your changes under the 0.8 heading).
 
 ## Release process
 
-### Manual
-
 TBD
-
-### Automated (currently unavailable)
-
-The release consists of multiple steps which can be all done via the GitHub website:
-
-1. Go to [create_release GitHub Action](https://github.com/cmu-delphi/epidatr/actions/workflows/create_release.yml) and click the `Run workflow` button. Enter the next version number or one of the magic keywords (patch, minor, major) and hit the green `Run workflow` button.
-2. The action will prepare a new release and will end up with a new [Pull Request](https://github.com/cmu-delphi/epidatr/pulls)
-3. Let the code owner review the PR and its changes and let the CI check whether everything builds successfully
-4. Once approved and merged, another GitHub action job starts which automatically will
-   1. create a git tag
-   2.
create another [Pull Request](https://github.com/cmu-delphi/epidatr/pulls) to merge the changes back to the `dev` branch - 3. create a [GitHub release](https://github.com/cmu-delphi/epidatr/releases) with automatically derived release notes -5. Release to CRAN diff --git a/NEWS.md b/NEWS.md index 3a4bf156..1d98f7a5 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,15 +1,20 @@ -# epiprocess 0.7.2.9999 +# epiprocess + +Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicate PR's. + +# epiprocess 0.8 ## Improvements * `epi_slide` computations are now 2-4 times faster after changing how reference time values, made accessible within sliding functions, are calculated (#397). +* regenerated the `jhu_csse_daily_subset` dataset with the latest versions of + the data from the API +* changed approach to versioning, see DEVELOPMENT.md for details # epiprocess 0.7.1.9999 -Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.7.x will indicate PR's. - ## Breaking changes * Switched `epi_df`'s `other_keys` default from `NULL` to `character(0)`; PR #390 @@ -25,8 +30,6 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.7.x will indicat * Updated vignettes for compatibility with epidatr 1.0.0 in PR #377. -# epiprocess 0.7.0 - ## Breaking changes * Changes to `epi_slide` and `epix_slide`: From 7fd55800d8cecd95c02f726cd8356f399fbbc0b0 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Thu, 25 Jan 2024 18:35:09 -0800 Subject: [PATCH 122/345] fix+doc: regenerate jhu_csse_daily_subset and update doc --- R/data.R | 18 ++++++++--------- data-raw/jhu_csse_daily_subset.R | 32 +++++++++++++++---------------- data/jhu_csse_daily_subset.rda | Bin 82105 -> 81174 bytes 3 files changed, 25 insertions(+), 25 deletions(-) diff --git a/R/data.R b/R/data.R index ead3dfdd..c528039c 100644 --- a/R/data.R +++ b/R/data.R @@ -28,15 +28,15 @@ #' in Engineering. Copyright Johns Hopkins University 2020. #' #' Modifications: -#' * \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{From the COVIDcast Epidata API}: -#' These signals are taken directly from the JHU CSSE -#' \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 GitHub repository} -#' without changes. The 7-day average signals are computed by Delphi by -#' calculating moving averages of the preceding 7 days, so the signal for -#' June 7 is the average of the underlying data for June 1 through 7, -#' inclusive. -#' * Furthermore, the data has been limited to a very small number of rows, -#' the signal names slightly altered, and formatted into a tibble. +#' * \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{From +#' the COVIDcast Epidata API}: The case signal is taken directly from the JHU +#' CSSE \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 GitHub +#' repository}. The rate signals were computed by Delphi using Census +#' population data. The 7-day average signals were computed by Delphi by +#' calculating moving averages of the preceding 7 days, so the signal for June +#' 7 is the average of the underlying data for June 1 through 7, inclusive. +#' * Furthermore, the data has been limited to a very small number of rows, the +#' signal names slightly altered, and formatted into a tibble. 
"jhu_csse_daily_subset" diff --git a/data-raw/jhu_csse_daily_subset.R b/data-raw/jhu_csse_daily_subset.R index 14ca85c8..affeb193 100644 --- a/data-raw/jhu_csse_daily_subset.R +++ b/data-raw/jhu_csse_daily_subset.R @@ -2,58 +2,58 @@ library(epidatr) library(epiprocess) library(dplyr) -confirmed_7dav_incidence_prop <- pub_covidcast( +confirmed_incidence_num <- pub_covidcast( source = "jhu-csse", - signals = "confirmed_7dav_incidence_prop", + signals = "confirmed_incidence_num", geo_type = "state", time_type = "day", geo_values = "ca,fl,ny,tx,ga,pa", time_values = epirange(20200301, 20211231), ) %>% - select(geo_value, time_value, case_rate_7d_av = value) %>% + select(geo_value, time_value, cases = value) %>% arrange(geo_value, time_value) -deaths_7dav_incidence_prop <- pub_covidcast( +confirmed_7dav_incidence_num <- pub_covidcast( source = "jhu-csse", - signals = "deaths_7dav_incidence_prop", + signals = "confirmed_7dav_incidence_num", geo_type = "state", time_type = "day", geo_values = "ca,fl,ny,tx,ga,pa", time_values = epirange(20200301, 20211231), ) %>% - select(geo_value, time_value, death_rate_7d_av = value) %>% + select(geo_value, time_value, cases_7d_av = value) %>% arrange(geo_value, time_value) -confirmed_incidence_num <- pub_covidcast( +confirmed_7dav_incidence_prop <- pub_covidcast( source = "jhu-csse", - signals = "confirmed_incidence_num", + signals = "confirmed_7dav_incidence_prop", geo_type = "state", time_type = "day", geo_values = "ca,fl,ny,tx,ga,pa", time_values = epirange(20200301, 20211231), ) %>% - select(geo_value, time_value, cases = value) %>% + select(geo_value, time_value, case_rate_7d_av = value) %>% arrange(geo_value, time_value) -confirmed_7dav_incidence_num <- pub_covidcast( +deaths_7dav_incidence_prop <- pub_covidcast( source = "jhu-csse", - signals = "confirmed_7dav_incidence_num", + signals = "deaths_7dav_incidence_prop", geo_type = "state", time_type = "day", geo_values = "ca,fl,ny,tx,ga,pa", time_values = epirange(20200301, 20211231), ) %>% - select(geo_value, time_value, cases_7d_av = value) %>% + select(geo_value, time_value, death_rate_7d_av = value) %>% arrange(geo_value, time_value) -jhu_csse_daily_subset <- confirmed_7dav_incidence_prop %>% - full_join(deaths_7dav_incidence_prop, +jhu_csse_daily_subset <- confirmed_incidence_num %>% + full_join(confirmed_7dav_incidence_num, by = c("geo_value", "time_value") ) %>% - full_join(confirmed_incidence_num, + full_join(confirmed_7dav_incidence_prop, by = c("geo_value", "time_value") ) %>% - full_join(confirmed_7dav_incidence_num, + full_join(deaths_7dav_incidence_prop, by = c("geo_value", "time_value") ) %>% as_epi_df() diff --git a/data/jhu_csse_daily_subset.rda b/data/jhu_csse_daily_subset.rda index 6d073eea35d312cac48856fd3f48ab5aeb21f6e2..12fd5f15c3d70e52af0a139ea4750bba4bdb02db 100644 GIT binary patch literal 81174 zcmdSALy#^^5GDGxZQFg@)@|Fi-M4Mqwr$(CZQHipJ^#GTyqL|*^1TyLh+I_U$;w66 ziAq6Dv%ic28sthEd>~wlfTxt-|0DK3@d5vV`@i7^!1q75ZdstL?*ZssU*8sl0e~KN zZQTF>aGn|2=i;vOd{xEBI*aR~4r^Dmf7jbXHkbBTm6leGNSigy<;9)$Gaa2BH`i8c zTkC4nCiS=37dzLt-Cm({Q0v>Z$=`pw}TcN7l{BRV~dG(^VQm9hKCYT!s0$ zJkikRn*ib#aKAaGMPj};Vj!aD^mZlO*GB#@_Ij{-?r0h$u>E%CNl~-r~YxtxT$oxF;i`tRIhFY{O#NW zyt=mJT?;QOh!KwKG;+aTNwK@&hU0NYWLtgq$#p#yR8(Z!wrEnLioB>HS7uLdwl9yX zh-it4;5~>snncbEg&SlxPp$VgX59IX&x5i;I}eT%FU(k`2q# z!CP|AThZs0o93CV*H+^f7gy`;n2hb1>*kUh>+Qwji5cfkEIej5mlg8Hl2aG>4v`YF zs+Ezn3>Hd3h{$+g9tDfV!;9c8L%kQ8RjK{G7JC! 
zpCIT{rvz+t=#DZ6x)jG0xDVx@qJlR9r~Ct-bKL4m7ILz8Gd3JoI3tXh-?l{8}p*b+&>?D$ge4UGl)Jg%+pnho|E8rOvevx7v)eMd0|Zv!j#>+#yG`T;^vmS zk7R?^C2GFBBgj1@H|+Iv2D-#<<~`nRs}DkSWWtm&6}$OKfneH{clpe<+HGiHwEMVn zp&uESP1+0+0Fsn9J2W5B&1kGjzY7*m;g5uPG+enKvAe8Of+{$wvEdE&s+vp`<7XNA zW}O)i1s9i-ntL9m%H3*FI2g$DERJ!M6~ahc^kZY zQaLP%MsND28gGl=27XTv_no{rrI%*5R?Y;wcjq9~xMF`RS6t0pGWrNQK)acQYf!;( z>VBE%cAu5L9GwEGM0ZZgPTf7pR97u_a0Gm#Vkhpt`BJryKb1SKzx4^ex^_G-44+pf z;<11HtTNl z_xiI`BpJ#TD*r8)bIg05qT|=LA*X1tpl-Z1cp5#?5}{}FUc*Rk+2X3+YcQW&s$FZ%}T+3U5p+8T>Excp6gm?^!3(hgwT`_5PjPNMtjsb16F?eb7zfuo#z6jP9+N9?+?xXKI8;IExYuXUd>3o34 z>1HR#H34$Ywl7}GyamDg6gCr%7uTt8`!_5M9!+MdRqW=Werh%7FHd|CEN<2p!QlZzp*pxW&6TBnT$Y85q@ZoH*e*+Bozn zmvjGCQZ_0)V=Ln6aU6DO=b*s|kbZty$9({*`9n%(E_gr(LZZGoiN_LhyiljI3+_2r zn)Q#-FXs;*69rEs$#)88MlE{5{yl=?9JP{{Z?6NHlRX<<0*(sr%5ejdP*c9S*Y~bd z`D18q%+=1>hQLSCc)eN32xn?Aiy+mg#a@jU-p6#iB@s+$Z}>rw67#(4T)$DjmKEVG-JPUUDf0ny7z|BTj-GFqmz?x8Y{Sl1jYVylR~$WjrZ>d)~e_Db=`T4bOlIM(26-=LsHc+ zkJTTB5{`1jD9h?;IN1D&y#{XR^OX{_8J|qT=M$AvgQ4ft^GfTtSW#bfK394~?V3uN zn^zm-yKbZck8P|G&pSCI;sus`LSj+JSW%($glCDKhvr(ld9Dm%mQ$T%C6PAXTe?nG9ZckZkrSetz&r2 z2@$BAT6*=IN;|~w$!cR zi=N!5#K-=&#ByhiN!O^nn6cBy2Zkoe;5GH(bX! zU)}>#AAPN}vb6IZyS`OF$W(voLERTimRVs!YW2b+uUm?t;M(Bsn@ll$aTwkg4R;+D z=PB6)$2FReB=Zr~SK`GQ+gixNEc2grOy=e^yMsiw&h};Ovz*B5dHp`T*ZAXK-T$hF zc$j$!VOqvV%Y~X4u3Nr`c(`gI)ylZaw=nYui#F$+?liW&O_&=PY!0p&wr0pk!qt@P z(0CT4j*uoSEP6+f5vu=IuM{iFxS{j=WxQ>mX%kT1mDqTc`!u)R7nw3q|Lzv9?R|Rh zd)Y1Y+I;h|_kQ;%bTsoSV+aQvr7y3>*3CZ!d+)ak~IsY!V|k>v!z|EFYj1fjMKGd-D1SZ z#RvvltIj2cM3Y^V-MMU-$;PnOHG< zm&?^mj9c)~9TN%Rd>*=?J$YV?oKBEir}uAGSo(X&O?IK3a(V~qn>UV`KhAMW{pmRL zu(IH08_8!-f52TPo8(YxJr-rK;w^%YNy-s-KWbq;yV=h3Oi%*O!qVP&wT87x>ei`K>rdgOMS2mq0D1AI?V2-lc_gmA%WQ^@GkP2l0Ex; zG~ka5)MDz#!jKPDr&Cjhftwx`@7D2WO9cPn-ozusy}FO&^y&vLjA)PGbUm)2+OmdH z&C<6mW#HZR>2eb5v{fg@yKVe>66_hE^Ch2-lSc_N>jM|m4M|Y*_)ypyuY4Y3PW~8? 
zW%Sxy<7Lsab#4wFXWE#`^19*H)|Qs`z76t3%h82uP88`)XPF3zb4$zkFubzII> z?K*b*K3@5OQgQn+{niyfJH1bV>G5;$(}X>_C%nv!#FoXz=I_stbiFLO;$vs&i;e7@ zHAN5tJ&RsahJAgxQMTENx7GD@!ke6D6u*M!K#aUraJ|+JB%Y`+PEiP5?d%xsrKaAy zc-A~$?h>ycyHN6OPp47;5!dt!P$=iGLSK`0ziFA>npA$R{`7@XT2@Q8OjdZY??&%p z=?EA*MeEt8D}bNWXH@CtjV%VN=VE57U!?eDvNd-JYVc;qsI`2cn>B@76a<<|m79=P zE|k4F+28k!O(*WiWMhA}x=kj;;7w0nB;*rQpS*BGJ0~yM+*#%owSc9*Z|=eQHZrjM1|C)7LLk*Lz2Y$s_JB%vC?%LVZu{8d#f?)=Dq?DVpg%i>B5e=kwK^ z?~yH^zMm?NGf4J88S~!xR;c0P&fitdmM~s?%21Cir&H3hL4&#EIfIjUKekTZ%4Ei6 zc5vJo(#GiUS8PxQ-|kL0Btpn7`VtY*VwFRqzuCEO9F_XLz+bfg8Guisfxk(U&&bow zttX=Q2PU(@8BBVC@7}!=Al-VGsa<-br0h$Xpc(I%=L)Z_fs$rY#a}P!0yOjXOBBa) zA43ZYSy`l2kY%mzQa@ui!?Pmlu~k;{R*wP26%b3 zU;DfZ1e`p*L|^mk?A0B!P$V#;D%k(K0gN0kId8r_-;88K`9AJ)LWTApyX2o}8i%@> zK_d?rSA3ZnmlxAHUatzfJXfS42U{XUsr=7zH+QeXU4L(!NS@cjy-}Zd zsEb5jl8BlDo&ME!MgLM>M}da|iOv#uv%^3?Mh}5!lMhq^*1AU1gCKd_ED^Y#po$QA z)abK1k&s{RB}`G!K>|HU6#Bq-fZ49B&hCQ}vZrsGhztLEwsie^axz7er0WEXzGz4~ zb@AK@&p#1vAWw16ZIUQdmk{|uH<}CYJ?^$l>q!wOAAN|x3H^8*>$&CL7rMoLFfiqGXujOK*$NmDVC`=Y}+j}eJWmTh)%rxCp z1$z!;OJtell+ll8M1m~v2+Rk}O?8e`55Et!nP;wNLf;^cPSFx=Z^bN*uYa$Yz7s<(mg+suFT)XUEZ=qt zrwGDO0uqHUDl4R0m*JRMHJDmahQJAF@xsYwNz+$Lke8E13}v|uP{5jc16h{k%D zQ6|6v#KyQxw56r}g>$zQ;2=|sPT}FS`4tyQx{q7MZNSZ}JjGNccrj9}YMxa1Q9S{e z(9A<}d_^^-UwPqSv#xeTAObwv7)thHfq?;8{A2og(`L?m6tVogk(4ra(>8$~sXk(i!1lJo`$ zlJl?HBvXxw^K_$2BYcXMexg1KU!1K-nY?(mlQ$>_@&fPovid6IjFREW{nOJ7hewch zv_nuYqN%IY>kY@Uq*SnRwVO&+)s!5LBwa>SxRkUP9V%$cV0lHxJ{vLpWELhfiek<{ zPEtynRV8oD&leGw5(X`}p5G)|OVMM>ieX|>Oa?i0ssev8{E9pn@1>W~!VPuExziO8 zNfTehEyU4$o(5TkYGW6rmZxGXyA0>gu1HyXo2hzWV4v(7S`7!J7Y3&K8V*SyiK$`* z71h@Ib1`@cEc^_$ISCF8Pt%YrBZ$Y)t>BtN{8o@g9$Nm~Ikv!u=|6r*U>Ot(jL%S$ zD?th{YlT!+8&)um(mlIxq#SI76k5cQNA6ph%dpD;$t~*)6CMcLk&w$UIpvhZ(CL7w zI=sbZRM?u0#}-EMQlHaY8R-?;K|B?j-PITFHpx0D-o)#pt4ZN_#}{h6IvZ22Q)sMg z;5kU)UQ$|6glriEk6vx0q714pVNjS6v2Z&7xVnKn>_kGWD`9oex}&P(!N1?(Io7NP z$##EmxPqj_xnGxo*PoVd)=I{v^qyIe|7{^j!*@GQz#$UAWJ>!a|K1cy|qu+Z_zix5qIb0R8KLO9et6@_KV}g|JDp4?jO&92oEmfVn{e{DQ7K&JaED{?k;H zL;@sR&+PJM+`Voo!69q=_`-`{?WTS7CWqg=84S_LY}lg_c#Q#toz7t@#DuVd@vj{IKEl(=mcfE>*Pe|NMvVw8NBSx3{}sa zyj{xFuqdDi3}pfrc87SqPTkeqoo~J6AI>6Hv!_^CnC8BAym#91=U+baTzOm_dUtL* z-0an$m7NZ)dP1YuJ(~l(^za0hQZm5@LkBVXiMNmX5o41_do_G>Q#9E)AQqL!8Xk%C zYxf=Z+fncoe@2-H3-6TzFErB`lAV0h*TLUx&&QwL;W@=HN!VWP5i~x`vxA$ z8uILTfGpfVkB6F9|K^3$bEpsn)93T;z=Se6{h|en{~p7qffq(*G!8}I^?KK+LA<-q z^cvdxS4V#RB*gqe>V0!}wQ0R9HE`=SB>*NBJRNB6f!t(YZJUhvvHg}!#iEA$|1l7} zH0i--rItlZfHC=-#!7wb_1H5g>Wx_NKdspZa{SNP|GAqZ{r};HkK=#mG-H zAN+t@gAYai97>iH56_c`8cCj-Pn@ZmqOUP2n%A-i(S5auJ-S#nNxd^jU8p6*}kyzz~%ul;YaMjt9M)0nt+3g}K5%hMQ8@IiG z4BYw#6-j6S6V4EsXnTx?V>j&oW1;iqg(ukyzs7pwavFOhWF{NR13#6HOTnss|2fi@ z%WnTc{RyL|E$$cqLGxsT*^;rQOaY9K0o*1(Dpyf$=W(nAA#qY$Il(-nHv#aXKXQ~- zOwXhY3R7sM{cy9;m({9Yl#NfLGUN--8S&OYJ(F564qMUMk5Om#L)zYiB<}Ic3uR;B z{S^wkzz9O-%hD+Tc)3pmLsy%H{Nn4K+=q}1zU9J+qQD-XD-M#ldp^?-727tP8hB7A zKS*hr;=xX_VAl9?yEw!ZR)xK@L^$4414ny(WUidtgMa#_Rx4Rhef#_3{hJL#aLt!B zJ9ifJy!8|g6#j<5{;DhO^`ROygr>3AdHuZpm-zE+|tOtLe>kqh5!a31TBHtgy-vuBZY zhXK9ZRTK-i-hz>jO1`=&Cp1B`s=KPX`SxTvZlg-#ie3{o);bpY&MwwEGhrM?2L3Yb z)QLE2uk#AZa?O_DEZ7nqbj)w_?MD9x06##$zXM@d0p+~MkTnUus|TKo<@>bS)0u0x z6aA>qwN5C!qQv!g1V+GPPO+nJ=R!cukdG--`UpZJycQ5WSB{{d>z_|5m@Fs}#tq1i z=|}1ad8P2jF7(%Ipt2cliI!+qR{7D+GLPI1kMJ6&yU z&Q1+&_4!QvH!8(T(VS7xA@4Wy$0LMfds#+}acmQ2%bv^FR%>^`4;il<#>0v-f%O2`Z@sjfwnOnU9!sRXVDFko_id>vKdrpyL1~76dKh69XVy4 z@fwX%O0PQM{6>RB>Cs%3{wb8jSL~+S3w)D#jRd*;sVt*!C z2eT!K-7^HamM%M>->mr%Zh>nG6OnJ=u>2fy%B`t%U%Cq5oYl7nYZfk_e!4w8oZmPf zZ?l0G?GOWq*{n72ulMiA!RZIk_0?_FU=>^~Ci0 
zpI)LfRqamI01im4h2k({iAq7xJ$pTF0lsdQsTbd|H^@7zDd5iHH>P*KAHLoJ=kk?k@dThKcE}2If z+s%3;-HXlD#saUZC7?FnSbg+Ozeb1B5HoS{dTG??a}-h^0g~irZ7H1m#f6Ws?c?0= zIhTD_7#GI>6_oK=L7D)S=}^Ajk3$GSwr}1WQb-2hWUb#-Nej_N$O6|Cmz5}$0-|eb z%;9ia3TLD%&NH8GoyxHt&#i2j)Xe8M>h>NxG{m@zoqi#&Ony&sm$Tc4^K&}kAq265 z$GKj-RVrdjd1=Sfc;0)%PldUTDo6QfHul=!_O0xD+`-W+9)pMZWng2Cq1E`v1QK9BPJ z+3Z?+mF;=mPm$VJ#D}xIa>oMihc55gZqk+N@9}*7Kb8C^8`{@W%yF-mm|)`B6*i13 zDD|7w;^36p@5~DL8(VKHo}Kq_$5mZlU)Y_@uIj1X@p{&t4c>L5Sdm{VsBix6u>2jT zsXtV_?nM_aY{MK1LMOV?CFi39vSdg~k<-8I^_EY)>l&-Hi_0wspChfY%h17VceIw) zPk#@GW)-h5%4^-}qioQ)E@b&Qy>__mv^54ee*2=%wqDOUZOV>-`|OaMbiFOS2TMz{ zX;}p8gd8I}Gc~bk+mi1svM0T{gZv2jh}FKPhyAS!&Ysr%gReWk-Xfv!>dPML49VE~ z(c-oSW%C?2TAKespTgr*{M$q1U){G(bWA^F@vM5i7O{cQJyF`EqsyJP(Re0qXN$Hc zMGG2y@+fv?jUSfdTJ_j$k^hePqbI8|NXiI}`u;jvI$80BiNEgi#fn=VU^@I?u*k(; z85^c1pMf*Ob;p^REq^zB^i_57n1u^y_Na#|^9obw+LYzBO5U`|8Q?WJ zJOH?zPf-#V7T$K~Yr8HrM{cQ>nY``&gfyL@VQF|(6pEM-qyh+$ML>j5K%iAcXkO<9 z(S#u*KUd7xD_^av4{tZm+DlH$18VlghlMc}PrDn#=*@kV3d`gH0%a)4WLp5?>jD;Y zGNS#FNyGBSdgcO7=g-}(!a&SnR4%5~LNkfylk#u9mlTm5_G>gJuf$1{_0Iu(iNRA^+WL=|mGI0TTC6 zM1axoD0A1%A71RhK29dVFp$!i0+3Udfx2F`7GVdD7q8&|n*3_n-`D7W$TJ7=q5csP z{p~owW7J?}-!O|XKcTtN#D_mZls*Vd4(CLTV5X%6^3?^i174wvY(Ah)7S#>*0Id*l z1ZYSIPNyeP`%PsIbpvZMOnk_ErYSdA%?6!igwuhEe!EKPn`4r2uD1L&%I(Llb2E2! ztq*3q@<#+^GjX1loWS4NL5cjwO*P=qCh|GPc(@xFhEP(mt^h90fxxNBLed0LoAx9M z{LA|5JCVnTU|9UZ2b&@l1f__I3Mj@6!4WTIB64O>0RbI6Cx|`u1s1KxHf#F= zuqK=`Tk|xp!?!`!!I$4Bss8gurgvb^APp#Iuw-=D3*Jm)2LjzeZs=ke$OQod1PJ-81DkIJ&5UTL#@e2f4-* zE^s$|f3j2@|2y0VX!QID%g`mKh5&uuQP}PEtOvGJV@bKfTg1eXT6Za{Z9RFJRYzk@ zEKg~7`!A|jU!k1H0&Ze8(taJRO8#Pk>&nO9d*iM&BW${^BvRk=4A*=HAGp^gf(%Mz zEugT4gIgQK7tM8{*RIWbY13>B`UFE8V1C};K}0A=vwYuqsVBGrPK12;cLPD^sG#Ea zNkK-pIJYC}UWsm++2V1irM>g!E!OGSo52SuCNgH^ft!f z5HM@d_tWJVOxm%MQV3zLW`6NuLZXrVsiQL`%@W)Pv2h*5QBa@>gjC ztLjPFfe}C7{GsDD#nFIfmE8!&`;1WHKc)qz&V3l?@9i>=lT$01vv;Dz-Bt~aE|gUj z1QzSqC)sb`#T_XTf=ufmBo$0Qpw9WXTG^QVj*m6q^Hq|XxwfV!a|zD)AF8x;#B29T zm+LKEOXbLj2Y?ipSwS9w*!6rK2fee38~5lkbApZB8jKq3Zq|egtq9!iUA(O{DkJi4 z-w22i4J4VUHTGNpx2x6W+?YDJE6^=Kx;W?iWBy*eu-&h(2@<1>UHKtF>1mQv3zA;@ zgifJk?;0rHG)h|Tk27Ebcfo|sQ;Ac)TeJRhxeu9O#zYf;-B_G|bjbCOF(S@EOFkjT zkwyTysM*0xN|nIUe^L*VuKR9>^NoDd!Ssb2Ik#^?*OWyrDf$g@lMXG=~e*A;gg}KZIX5AwJWrfrOV7K8%Xl2#oAf ze_X&`2CzBewC~S1z*R~e@s=~lQ_&QoakxFW7)Y*0Y7JvO!p(P_zW(j9D#YxEFh!Kd zG&IQp5aZ^n|0T3Ej`OWJ#$CJhENpqb81Uy@dMN3DTTc)xSp}`_>4e}H;c|$%;6es`0*5cE1IBm&kH1=o z=AuM)nXtv=Ly$B4MKW69ENabKweUV8H-d%_dPMW^?>4~N zuhfte-$DhQTF4?5w!lo+e@cJ)dM+~KkexTAT@E)N$ia)zAE4K|zQeF;O}>1$pL=>s zPw_6Mzk^=#n^MpV{DM$T9F#WP4Cs)jw@YX56zChOaJev_d8A1}HPKeMajRs=m4 zq?M(c<6foxd*5%Kzn7btj){vD-N^=A7BTz85gZb`#7EJ3px^CZnMdH6K&2aY4i{4@ zweE-l4F*b3=Pz_-zk;EUfA6Um)7HHLva7du=_kzi57#samG&<*!5gK4^(Nz4Kls>z zBnuXOU#FEPlE_|G^ml=A2~OA(L@8y5vq*(EsKnPmofdHmhqAhC_Io{3y%Vf2lU}KQ;BT3SZQlqx+BD#E_s%;h-3<=H$k1+F z(c$rfdvmT}!rsWM3>E-Hk(llGfgpHOo*1cMg~tv7+ep4lFGX6&4kXOG&3&^&6f$;J z9#oO}URSB;juQ&_Lf`C}Tm(3Ln71i%Bm@6u&+`RsJm+8T3fql#-Z2rvQ`+Xm=%jL* zhDI(R`9dqqDdQ6YgIM3WOAteqLRJn?!onF-<$47fB!mAh4s-ECoako9cgD>*on!>V z6Iwp>9sGGhl^Wd}ZYj=N>C1!Y``y6%WS{2sytR@K>0gdUDeh;vO+^lWoLzVOZ-dmS zp@=C@{)tkJ-?IH$Z3HKI+d$8P^bh*4iUtr@3s7C81quj-|9X#1Jy8L6L>B$Bf2MZz zI~ezJT1vra?M0u+$n;bqt}p%|Iw5!#B7?-8Q~2b7<66Yd<$DpGVa(!sJoQnb;ti)5 z8ftgg##_KDh^z(Xd#x1Ik&ncr)a==KC2_K3%Yq|fb;;gv$S|}}Wui48u%cA8qE z2|^!lXTErKoQXg1d0^KvZxX{rl(|Mq|30{z9;y&C%6 zeq>8ruJgPiO9>FKKxgpo!%{pojLgEZlkBSNyy*qqPFZo7+ zD`{I1(Q(etj1m6P>JuuzTv|>=hTZDTZ(xp1LsBzg^};3W2aL|l638ZW;XGkn>?E`~ z`dzYNjgR%Lz%L3vin}_{C!(TME^GNfTXRdAmx8XONHLc-2-GqUay~mC@Izxd;Nh%7 z${Ykz3Ysp~5iB8ZP(Vup2tW_Ldzp@6prMeCpnDq`-1V+Y^BWP3#La)*ZIl@XPpGbA 
zTu#XcR_6kSe|*BYLFHTq0+o4~cKrr63IM%wMm%+L8^cH{B+K=dd+#UPf}HaL=fsgD zfgS?tT|9F{hjv%u0irsdu3_W(C#!k&8+>r~seq%{9{7U1I1FKBSMcP3OoZIwyPin1 z4g$0g6~lh}rz`q{>SQt20NES``Nw644)(;LDR{$%or@?H;K+%Oky1i^UC4pFWC(a4 zQkRiHSD28sSo&V5x1-7-1H0!W@}1@`TZ#W0~+=Nm&?;FsVxVG-+vSgR0C(dU?&u0{K-KK`RxMW zheq>w1f=oQ+)sV^QB3+xCTxEo)fGy(9`Dqlr{03O**;?NTpUG}54~U#m8BrT;^>H6 zT3!V;@DAYI0L6BgWbB8rAC7at$?qL}N}HvJfCF3-Re1lB8p}TJA}VkQ|BkX~7U3MR zSMijB^}jv638wVpFec#BZL>gRMVugEPVI1z>xwb}i3~Xtx>g_IL(qlzY*%^Qw3_Mf z_thOwTNpkY>87YX!WOU~)aH{n(A3)XZZ>x(gJb)~+N;O)@~PWy%oSI@Ir0ZD!^r)A zdxS8V2YX(e#|OLas8sLBR7gg(rZBUWTaoqor&V+m_SIXwcI|#6k_T&efT87Ln}K3q z!I$2q5Z)JmH7Q36o%f}p6H-FfbHz=_=}=EaWdKBUs*>5z&{A&IK^_V22-LqZBAB(_ zo{&SZ(+VMJsuiaw$@8V}y0bY91}NZ$cMaWFCCr=4k0;al>x@%LKm^!*3N9nKxxE|J zK-_*stBCCJx#Qc(!gjw_#UmrNzQDl^X4i%EOC&+JEczKYCEJbwJQ`w1HgO`eFyKF`bBm`#`QOX~#_wI=hHHw+W+n+<3gD?*#C2-+v+-A(Uw$q9$WUE2Viy< z?Gy+~r8LqD97cwIOxVwkMc`rOK#Ce>ph@@I3M`$z-Ajx`QYx)}WaK?TvZRah>iJld zM5+iC4{$7 z$j}P`rzK30ZvywI%B=h!2@@*0KmXq6(p?m(efZ2!KVzJrvqs3K@Cr*v&qQrq!1 zP)kfFZYt$648I62AsGlDtc^{1st#g3wU##iv!<`*-N7m9r$p_uZ+}&Z&yio{c&&3` zh%U#EEvum3ZKm*9Uv7*+UGfCACCwRTN)#kvWI|)JAOjDYurLjvN*c6)62DmiXuH;x zSVcnQzt&_pxMt0@Z$8hce?HMFFR&P4XQA#PRpsO&cXmMl05RhQx_!1?{cBuL{Q?XE z7GRzk2%e5BOxd~0*xLPp$6A|&0PM~~c3E9MEZ`&lcNiO7LO+RfTRi|EOVTf_#DVUd zyp+m{UFqor4-OEX(rD`)8#%2iz>jH)ehpiPCD(J~Z0f92wQabw??SR$j1km4CAaeG z6k)IXEmrt5e=WAauMHHNhCEMgBW8r5CxyQ+2)*AXtr)8(Y0qBp!UHsV^a{Sv=F>#^ zrf5$vn9C`U>Eq^Wrg4CnKaz~lNB|riz)7KgY?QlO-aMoqhkmDdkD;w49t-(e9>X5j zhYUFb@V@?3jC)0Lz$JPS?%SA;WqhF)m2noBmvO#Q-!=juF4@gC({H!ekA%1p8>Lly z&Ou$N@qe)IUd#z}IK&gDDl!{`1vRaY^amhzL=2vAO+{gg_B{Ya zfr!sg3hvR4&;UJCmI{%Yu+s&B-I0q~u&z(>v!Ql=q>@02l1U&%T_lhY>SzcKMEv+? zY%j<@11t834~4`6T93@Pjr>!%b$bf@4AJykMuTCgj42q{faAByPyrkA`1V{lKJ!#% zXMa9Ee?FIR0O)dYkOKNf?&?{%Gt`+yT7ejX$B4iM$e+8x_6*1%l5-aKgLmeKQcHA8AG)}--qTT-W!HA(k3gXA)L(_XVTU|tv!-_Xx5W2L@L zk5eQW7{9!#3yA}I*U$#YLebn%p*RDUfXgfhBqe5zfL*;P4&8AXO9PDM*RojVF8>fc zdr%2A-XF&WhD0@71u#-TEeJzLA^=+nWoxYT7Qx&rH~A2A0$Zl20?hTb1kOG)K3YMy z$e3?FF~0KfgaWoP|I$t~n|dh>+B;B10To@45TM8b9<@}+j5&zbQooPEB`8&QOUcfR1$7o$NRk;X45CQ~j z>@|@rTYA_B%&AVgsllG3d0i(s_Ed;?JGNXEpcd*RCo`paYbLT~v%*1~BwVDzO%FNm z7Rawm>n(dWl2p~ONWd->jVuMXptUWPUWELs@u`4TYF5lWX2+)VjYf`Dh!Z%xfIx>*wghg&mJmRGMIY?1Q4b{7d#WmLP5)9qrSxKN zW|a2_&id=)?{+gBd8G6M3@}9l%y!fT7Ew!oAqr{mfWYuylTmw3qm5?^HDTH0Rj!Nc z@ZHJpz>y1MBfC_wCioa`jw9UatdKGApno8{A1ES=vYBlKqaRg}du?LbL=aCj?)6g? zEiX(9Rg%;7@;U{f5Am1zYCrY+c4OiGZRWoKD(+anBNRG;en$8U?=y_mLfe#lSMY0z zAAU*ZqVLuujb3zHtOk^uY&@jJc-;t|CR)Nlha*_Z3#=TmwPrNjNEqwTg}_*=if}a; zVMi`#CLKVOD}OmR`skBA<@S{C(1h=k(r!U!OkivdrKA#C$;XE79Gx4{LUh^oAR#FL zEkKGZXyWOH1-+CzfN>vCad^IOSHs~(m%u(qT@*r)-S`fNWiHlmUWFr@P$~S5b>G1D zTemIxew7T644fsIe3>hbftnhKaH5WG@qV_fz@uK40&k7<0{Z6N1jWOP zLCJe%yIYp}7@-vKJuR~iWC4_*s8m1R;fj_U%|K^*lh3V=CV2G{NdhkaxpywHaeV3+ zo@C}XJ$LxC0K@a!T$@*R8TWMrv?m8-H-hYd1Y=k?p#Y;fUXX4SNry*YjM!PXe=UW! z*6%pgzq=pE>?7I8GbEG&CD;lz>>hl4Az^O-w`$FTY`3sq-`IAP zC$#|ZG_mHX`Ou_574)-jAUv-;MD8URV5=0tDvu`IO0m!3d9haNaaPeK^?ag=SmcJ= zD4ka}a7E+#h0!2H3a7TuzB5J;>^1%oh1*mt6hNVb)*JaPl^0n>y)@opE?^N`sIzbK zY1sjB{!MO1#UVWbB40_D;&n4zk{-7-)*`i_x&zb0V?@mMrY(GSN;v%i#T)XV6eu#^>Bs#&- zdg79-CCd$6{C56s>36=ypKtq&n0snMDe4k>YQ;C8-b?57iR~=DmZ`daI95+wR+I}m!4mVi?!$yp=3atee)JS z^^N=~U-kV-#MCXL4$EidTJO~F2Tln(e!owVZY2~k);+udV~)3~w+wulGO_2uO0Ao*@L58l%2CFC^b@?5sq4B8hJz)v{JdPJwB*}7f;$sFl| zLR}2FTHMUV*#8uWib6E{i18orw+R5b|7|rod4)WD``*LRyP;PE>hCR_XUYh{bc%z# zuTx1Gpm%wZK2~XLu+mOz25vc<^LKQw|F308Grddv3oSIaEt01&gJP|tq(`ZlsqNhe=z}z;P-r18SQqkT`$u5FZzqX`oJ=}hOdY?R! 
zyOFZrlGpZ(_Twpjq09pan{CA_m~$hTYx)Me66B9y%(-U-P>7o$>l^Br<~-LEr<*CB z=Mw}A65LYVXi6Ib=HqR;|Ixc@vI!zV!guxGr1-V?y%tJ}K#w$S=@a+@fDjN6Y%>0z z3&4nYB8fXR4{xA_+cdtxNtY&^}+vXj1ro+Z?)C}fH^RYX|GIC7hr-q3*tIh1~6IO-@%G!WAUYWre)LBNP6Ss z+4?;SF-48qQt%xsy2*Sojl$*{++_#WYkl3vd}?_C^eUYNLIeMJK%`mdjQPYS1wf*5fK+SLBp3Q zqf(-P>aMpu!wdQC_kVPK-`~HW_xdlveYyDmp!$lbE>z+{>j~lh)~k3o;<*vhM_-6s#U;HB~rZT}3VDQBQ0oE?#XLBnOS#@PtL-5BqKpBRgFRC7nM~~~zk4KUD zH@rTRbL?&*C z{H(<-r6&{VT+!;z?lGp~Th}hu>}+E0H(s@smv;t~jfDMXC#~x|jFJ3GgT_eBs%dm` zl+T(DM2mWEsmrFB1w-Ax+^Qv0RBWl-n3nPW5IOBN?z*-RFGB3qA zZkTpokMH#zp>RE8?F-nzKo-&j2$%RRaP|5`w!_(Y1hzE4O$7->#5-u>mYkeub?2>n z1OxyKE}o&~mu+my1S>wpDNGScMp=dWH)o+XXIp0g^=D-hlEZh}+=Pc)1!uGt;>5Xk z*$!Em)5X+a)AaWOGlhNBoC4fsKoew$OA-KYKdAEkyS~lA)hKpLUf#~@&HbgNXcuA= zXY8j)xRSMW3ZROkgVEEtfprnw_CLSsvtRWPXY92-*U~|B>vGr;Tvfjb49LsVwrMF~ zKpb)^b;?GSI2+^$lYmo`p{ex|4`JM?5(TH4vHpAi^zk!DQf8RZ5mg!SXeR~!x>3!p zvs?B?}GVHfv>+sxBECAk=3%-lEzDPUE&qK!GIe@-zZh@Van%8fp)%TG`a-OP!tY@!Y;aK*Fth z1$Pi*;7%*7p3ck!YyW+YWZ|o95=wMKtqt`Tpim3AEFWJCR;dz^XttVGh%jI|otcfA zX&&@lhIle7w%}{$I)sCnUnq=RMnFrME+iggidmjo)4d`8px1rGJ$bLfz`9&h`FJIc zDE!TfHtY4f|F>xZ{2x|#sL%;0tRcg&Lfi?GDk^`K^D4z~9u#(6MLi}Js}GiD&f`F+ zreFlCOG_|7^|?J!a|_k?Cko5b`S(_c-EFnyDt}3Z#Sa%IfiE;I-cHb!?hu>-^4fR1 z3KzOJJfJ2VhQ7ncRD>wCsTsE;EHSgoGs$4!*c^W!>F3*Cu@#jNgv{mXr61T1>IZ-< z>3&A>biGc^soe24CWNLg1eD*|GvEB>@scCGDH~qCj02-_$XON5k)zEI6MuLo4(&ry zP-R^}N)8;WPcpVwYx$N-k%LoV0YW#j3-^*1Q1Axwc1tM12G;dCn>UX6Kk*3?1;gqk zS1V^pfD$UZrE|2KiNyM;I0?+!(j0a$4Hh;XbO_po?XH{=3T7z8P`{==?zs{HB{3!p z30WwCQoxuDw3Y8*L=k6!RD@t;Tg{v)tCrUJu7MRCa!T0lnhhzvujWJ%A>YZapU)lS zxc(}R{t`%0WC4~xIUTmk;;$5++VI)_+Ywj&Rq2YZXC^laaVZ-H1;Ixs*`9yP0%?4p zB}h_akO1C}TVShjj;H|1F)jox*2qr4o>SwvU>cF(|by(VU4g5KC&uH6>2d<$}1aTb==X~WtdJgm46rg9j zs@J4&Kd13HrUGl!0pKNlVGjoK+=z&PAr>Ifa0L;IP!|TDHZR_(kc>&q$(0ZOUBBOP z(ad&V+c>3KGI0OW?lL}zx(IH6t&3L}*%(2|^{5ixeVy!VHHjn^GRPHnw-DW|$tC*( z4#}>Ce+yzZ$D~myF)=P6&+&CZ@H{Rv4Unb?MyhyTLg}O<7TUBW zGa4^{I@9mXs#;A(dh$}?fx$HC8iUMT!J;M&<3%xy3|78?>L+x#JRo%KTw66s>Y>MGez42P$0NuTq!jQv7G2I>ms}1PvV~A z;@VFa@JS2V1T^t!u!c8r2KIaf*4SSc50mT|SZfLYR63O$52NWI=`-Hgk)}vLMx6@q0tO6tF z3Ayt^oO-WRZNYPU;Sg>4NaY_R@nD~-y#3ZYGW0@nU9C8JVx9%G@7qP$zWVjE&K!Z{ zcgNAd_SQir{rraCo_~3%_`iOy>j@xNm!5&&wEb_EeGz$FxmtThh&4>;L6|}~AliNf zPr0-{+fFdQ-OnV}2~q!QvbDmr{70wh6bD0Ct>AR=ELGeP)Iy^RZaUmSQ49%=89*yM z%8T6}&^p$_v4T-iA#?+!<=0S@Dq^pGE+5U2nhyH2{u@uJ|EV0pjgEPpAXv4J?es*` z&PXbxJd1uAH~ab5Uma@uepnFXe^`q$c(=S4K?rvCFBHI5f-kP>Sh|7o#zbVn*$gxA zLdK292bLmjE-Lt_BaosVFyI7Khf|`6v0bI@_ED+BWV6j3e5mJ0oqmVHNZ2JLqS^q7g%BZv#984QHP4nL%Hr4`-KXz1z-C$7zj9z-22n(gM1fHfee?V+))AUAyknqy=BQ&MA(I*!}ju=SD88jwRz1PeEzA0Uc~#r!#tBY_W=U8s_5>g-zg2Ig(#irR-uP=( z{km27@kJQV=MvQ9pxwwdw09GNM|Cf=Fnhj>jp#`9OZrZVAL!NsUJ3Ns4udcx`l+wj zv5U-(UgO22$0;}%{J!o7XM-kW!vJWBfj#N2Ewi@Wh!+aCY9E9-VLO3db9bE1Rbkq? 
zQ(;vU-_7!xqqtiHhrhfx1d>39=Cdc%7w*Ae43;;K!QE;-LkTJ3hu%rg?zf9p85J1P zuXy{iWI$u{&Ycwu_#2si#q7X&frO$bJgjx~uocam{$eXH1m#p6Y4a#N5T~Ovy+x7N z-9RzYbutHt!XNd&4n{YIYV@E!RRUoK^TkQ!01+eM004kc`%UHM5&u^7&Hh2))?k#j z0Qqx);YuYhXnclE=NLUyia?UQ&lKl)CDlM8AP4}8u%G<9&9Hvd`Uzb|b7fyl96%9W zj_ProkKK1g5D0uHUH}r{*l+z6r@Nu_pV@mK7;wOd% z?7$Q>Q;2GwRi=p4C!g*#$b4YFv!?_pYH<_$24>bs$}Br-JyebKc!dF9*sx$>o8pj| zGt&T93RPESL-uCaN0rgYdd57>KrC{sSAl6Q3IPpJ1wLhIw4xFCH?2{YXA28H$o!fvr#21%^AZjQ@(a`q#Id9zdssPKjj!C?__EjNU zJN^g^v5ZD2#E-h;m_>Dux)O8CfQs=*Tp8uG>-aubUu&v5Y}-(v!9C&VaXmHUJm<1w zMNZ#d!eDnVY_+=8a73WDAJs#3P(SrF#JW;lc5+dzjrp3BMaQ_#_o!KZZd;0ieyEZMpDz!-x$|GYF~x96CRY zyj)wZ*>Ds(!qYK2u0B2P-tgIlLUGShoozM?@{5Wj?yAhxH&s@RmGTO6YF7PqUS7 zE_a?|wE9W;PFVeGC^6GrObj2H=D;$eYC~PD*^P|RLZEDGW^NBw8CroSrs!hd6)7Ly zTlr$>5|q!oFt5A<)U5yQAOSs(kFwewD{Ox1HLz#&Qdh*rkKG|VAJ;w~?XtQ8_d~Vwu>4^AFR*o)X1l_2x%BmD-f_UD(-TOeaIz3Z zSeIZ|@JFie_=^7GgW3bHv_9wv9+CZ|r!(H{HJ;wpQS`^3W91XB9o#nwP!SkA3+_FS zeT*L;0@k_^&51-X3$7j~FnCTa&NjgTkaMjhzQf;dR;%`Z-dRMpEz5sA8XdJP8{aF# zYwnf#s>y52z3glKYn@EPGS2^!p*Nb?S+QzV7PjA43=EI9eY3lR3rSx$<%e0ot<1?- zugJyyTJKzml|{#DbpD7LSX}yrf5|gHgJr^t@}q7rd2W2PE@dA|n=RW<*sb2w}*{(7hIyWhgGbf3DA)Y`tT;d;N3hVv^`yLWa< z(-V(%W-`A#>ZU60e3G2$D9ug^l)MArCXf>t3GoD|1d#-v1kejWF4(uC?+dgqj9KAx z#qtZZD@eG4(F;r}fUlyl3rZ|tu_Db1nkpE$!mW$i7qTj_z0rRH>k9}g7`=gK1)UWX zTVZ>Gs|(04h+45y1;rIGR}oZ&eiybZP_g343ot8~se-SIj4W`zF>i&{6qvPQuU4)C z+>2Z*kgg)m3#={hzp-V-;TA++z^kIL3ddd-99{8Sh1S=LZZ62S0^tjWEO@^WV8wS9 zyj?+H1&I~_SYcm<{|omQ@Gmg3V*G`!7HC;PVFj`l)LgN1#j+NVU2$W@&lc!f(PTyE z7MNK9dBuzuU|a!f1>qJLT%mrV@QXSwIJLs&i@Gi-wj$jNE-iSqqT`FaE>N@r%M1D! za9h!H1>6=~U14fP#TLemvd?U16szVs!*L3vf*j!5tZ`?&lw9leb|)88p7S zosI|bv?lPI88iet$2->PqZ@gtOt!QRDTkQl!nOlyh7+5TBl9y4UN#dM2B1nxA(23N zsjBJgJg~3XnNO?zFXF$3MfnT*$8s(fy17SjcW$irb1BQ+MbptR!QI`<9KAO$2=Uf) z)QTfC5~U9v%{4%E*^VbC7aiTyaCKQYWp29Yn4Y^i?&Zk|!4(|t;U?*MxqG4*0#rZ?oz7h zwbymmOh*zNtI+f=u43lpAz*8>adSFF>F1v5IeSFI#b|;@BB|7hD~BkHM|6s+Il_vn zBBQdXi;>D8f+CE}!!t85z|5JjGR#HR*Ke{z{L>=;u7frE{B%rLpopH&{yR0uB~?O# zA4aDE2f(;|iT{Jo^FqD}97ILQf4QTHHn-b4#=hlcr*pPsKH^M-IGxP}NY0PdjS#A- z>nY(=vrCOTAS+1GGme}k0l7JyO8U8?5C=v1$a70yWYswuFId(5f25 zMLC875b-XDADph<)yW{AHUQA(5OThh0QJH@?Oo2!%X>B9E>B5d;qklz`gR39qR@GX z73grxpl=5JnciE)l3ODe1PpXj$lj#(14b~PgOwuY;9AZB_47wXbXxlAnL#Orxiq)9 ze*_&vb{E{x>Np*ba=dsLxAZq%9+s!0ftlD2eHgifQ+j2bKna69wfYGGe+9{MKnAD+ zfJ6X{b!;6U{+GuJDa^MgbQ&|>b}hVYcgCnvQ?y+Y&K8uvXi+Sd@P+9>W+ZNhR`_+u6QCTkxY{J9W&#W@@jUIBL{P$GF|9XxK1 zGTIjJdU7iqP?G%Ic`|gh*-$H?Wl-}I&1{$u3zE>xx-_(d(Vv`MjD%|wBvK|ksHKEx zc#9o;Xa09X$7dbGL@*ovy!GGs7pgY?l0s#K)Nr)H+&E`}J4w6G>gt27UI(}Vc8Er2 zhF+09mq+vVW)7$)EWuvyTy5V6*;6Y#%<#yG8!g1G#fzgrDhZ3h)r6fGKc3OA0&H6N zSaoXA#=K*U#@|(TH3*D{0KuDUyBBt%R`N=4+8P6=CJR*;{#n&RD2$ z5-q&mEExWK3*w{aWZ*7-?ak+Tv84zcF@*;R9~8+^yVvRe$E{(rRI2ZC(r`WIJcARK zRjd{&wRlAV7JjL%yys3TQ01I#^A|37IM&;oeqzSb*6~4=Z1CRVYr<+5f>(9)CpIY)4%hqrJPl{$nSQqH$_icEdEsSk zyn~s7*~f8$YOdKkUOiOW_C;O~Vg{J4AFQg5FHQq};gLBEpHQp=Y)6m(! zGN*>Ja8pD9khAI(Dj_vJ6q8^3-k50{LbDW?<{SKLocK;OdOE&3ccanVV*^>?Qz7Ql z)b`$M+C`G{dH1#Ye}w6iMPB-?k?6~{kn~(igf@AlZE@RrWPX#<7N4#Ajckrn4`)R4 zJ1BBL^%jo)pP-ys+KJ1f{&$&#PrzeGCON(z1I+cGG8hy6K2j!s1~{Q>*bukfnXiQ@ zM%D>U4PJsxWntpl8&jlzM_be*2*u~+f&DY?;2akx4DH(;_3y0DimMNfkf~1$X=Spn z@}HTWiY{`@mpSU1Rj|3OuC5~N`*%xR64q9FJCfb(;Q^j!ZP!wMatym*Ro@uWS}avpPm^OGy3;Pw$GYwsa6NOBSbSN zW5%9K0UQb8h$Fm&Oo?Q3Jg<2N8%#DwrLO&UK80SX-{Vi%$qTo;If6)tt* zSRfeWexPo?&^qS|04{o)u{W?7<4ABpf_GE-qimIr$3qN$!ZW-QC0@6cwdM%vU-LCB zFbB8gakMsrvVu7Hx8*)@#iZstm}`}xKb1dKS!kS)grGG$$7N=-OK$V66KO=#yeBHx zHGa<;?;oJ+uK(nJC}m={Wj6bK#N1K4c-fOI+CumF4>(xQlzhPqj&`bzf0Kt_vB+GY zEdv<_Ed4^AB%t@x8$M?4&R}6&WrnzAVq5Za(MBwVrP)!xeH)JDGg$D<+fM?;+On|? 
z!(W+sRX%-z%KcF|jgd=XvIb%`rnF}bL3lo%MDw#fO*{|gT! z#i?Shafng}$Sv`rx!lX=b3f+Qb>oAnY;nn8L3}n&y-7ccgz`qC zD;(8-37JZ<#)37snEh{f>PPiZKY>@0dN9vDkP#|=ONjxe)__L9S`+g>al`dG<*v4u zK0-+9zBHbhkmyHrVYM)VFs=(IAJCoOo(&c!N5JJoT%YK_&42>Srn>Zw~= zn$$dc$07UQq%$v73>~`$Y$R#cTr}YD;J*p8yIrCmOBARNQKB3tN(2dx`@t4C*Ec;@ z&{(|An&lh!E9>5TE^@=~I}24e(J#<(#sF0%(O8F+!%56hodM$z{}iO1M7B1FzUy4y zU|yvniT4V4?-!om`)4yzC|MEj)ppW$S&GvdZKcbTMu#BUCO;c_fm1O-DB5@~Hz-3A zeD2G!YR2SgP!?F2U7Ygis7g%Ss;qUHvkbNEW2M`&D8G9%$n1|cvf{otdo)Mzguu%z(02PplmtMc&kQz%I7hhPgIsqn4_f2F9q$}N zL`iFmw%hur?ojUW3}2x~`!YDFm8GhuPDKS;z&I&C9ck#NB5a*AzxS1q8DT7c*`<LS@eoY)pHjI-kSIY_{`=r4+Wb)ZTMr?*O}F2)Sb;huI^+84dd&nG5tGFdOx3 zDO4g*#?^^k>|V=kAFK0(t#eiRyDa67ueqX#YTUHrg#M0p>dI&G!!?^68a@JasF5Ne zs$%+?s8$ooI3rL$p%XqGh3(+~5)YWtwvFhvgn`0{b*1++1VJOy-||5c#eEB60;6Lb z0h0wOhmFSLVo)^z1924C+-^Cs|XG>$QBcd^oM?P-?MD~~iM$)0z0iS)1M z{Z5WZd-OZln17X2<(O|QQ8q(@6n1PltbP!8x_vrrm8 zf#-qd#w(Sr97dqYdy+GxQySlZ6h@!h&ja4mIDA4c>ggjW1{kiSK5R9wZ4w zMq93Ej0L^J(2dm}K>OWy?JpGfrG_Sm?NTX_-UCE_-zss58;zwfa78CR-k2{z0R$?&y&M8)2r-m{L|b%@AO-e+8|?Af;|=Td z`EM|)ThpqItT`T%t`8CjJTW_+ybUP^!~%*+LlOLo#>QZnIh%n2`P;^L%78Tb{!J)< zQmp{B2BL2*K1$quPwodC$Kgiyh1w=Vg@i=XHTO5#fOKfgap~pFesfw;O+EN=QG7T7 zBw>SBopLsV3`iRx+5Rcd9w!2NW{y+ivMY)ZdLASjzoQXi&DMCk``2-yeNwnJpsWXf z9caWK!e{Uwi^0gFO~%kUp{KT?_rC7Z%GnDlS|^UnQ}a<0is23W6P5syUa?0i7usnB zHv2 z7u7W86d24kn@9Kr2!&{*`R*A~pP;ntR(_?>AQFaNEBo&_HV`Oqvr=5gSu^7pu4fsT8RPfK ztM2493Yk6wS03o4SHVy>P52#2fG`2xHxH`_4wLQuA4i($KDRm zCfBfRnBY$960;rjObQ|UJb82)6F(Fbygh#xfRjJ}lscA~d`>bgB9ia{Uf-B=KpV%i zSUr562nTktB#nN(r+=r}^oQJPGMAPtYqHvb%29$&-D)D=U^>heEj>)y?G`pCti-EE zejLl>HR~|%Wd0ppW&E-5pWBKNXvU8;>@V{J#h1Zs-wwYf@mJ~U(d=nFyr{*d%Z}2B z1>JEc>ov#22f);;-_L4*rdpkIX|TtiCN`z^p46`@BN*Oer3a5=y`~Z=HH8YR<~3#v z`giXQ6gL3KK5>J1QBou$AG=N1Bv$WU`+L>w<&kItN+_I4wr}liw_hIJ@Dsa%NPgLY9>$vi2fJ% z28B=2f!6rTH&+u^*Ov`ii~2=m&SFQ}6vO#f$0ssD(w-CSv~SV>-8IXvp8HS(%l4P3 z`VMs*NBa$)8PZftEruSSxSe7!v^MnLraJjhd|lIjF1TwT=F7d)vmft^;{7o5A}zXvo;j*kB1(R56cB@GL<~cGV-L&NV=z1P94l3=uf@6XYwlW$qFQ-1C;rW7f;?iiN47*`k=SJuF{i1JMHRgAm zzJc3L;J3!SxkQS{;VohvV}S&L7YaI?`wY(u>C8*vD=Vx$wZ)!!CA?=p;ixXBG2_dx>evaWstiDJuvC86oKU}01UoP)`DY2dH`dvmW^mpp~C_l_f zCRBlM`SQkU6K7ie=aw|J7q6UbZz)Lf>uu`eS-irtG-o5a@G7tnd3b3&`O5c6UMcrI z0d)}yn#tBtDuSa)c%N?>cd6#;`6RhHTyQGUGY>v^O9l(!k28KfvQ_VecT~`?FOXH$ z;wEvmP@WytpVl(*dH}?aPo5P~<1dchxq2}oUGduZ%!Ivz^ta%Xs>>cetL|!irS@V) zRwiYgb4sjaceL-^(rjYx4w-2mZLcx8jhaAf)d*L4z=gIKR%srTy7NRoJY=CWBegiH z&uMwEDhmLFfe|2xCR~aRMG?TNDyxymYUjd-yuQM0_M_!+rCtrwn3U8(O6crU z+aDHxzqiUmf|}VVb9%|wc1CBe=}O7Ew|aK5l-+hc`!^JYdA=tWhOx?rSa9Q&fxZJj zzHo#F0*4R%#rfZO3}*rR5V(AH0s!wchc|^|Am6esUtLcd1)Iu~tPG0w>tTN1Q){t) zkEi^2YTEqM23!=_$uA_BL{Taq)1Vepc<3q~29ODb$+UHuY63&EW=Es+J(O;wy zzF;RYt6qUPog~_XzqC$&-it)3#B2p@%j|gMlo9F76mKcWVUbVHk}>Q>TPdsHNnT>IM(j2 zcPi9VzL=Uaw?B{iz!!Sckq7blR7iZWE9r;XMEq>)n+&b32jdP0)!RVLv64~dZB&*B ze5h2&A-p4Tz%)>aE~Vh2hh8Stw}Wok6oaY$bs&=8r+sWzCxf|6rTibQ;kIjk4b-Ny zvDq_6?k%3Q!r+L-nO!I_fY-%HgepxoVDMllU1TVFnWV0Ue&4)n?zpaEXSIN{yH(}- zL+FFeT2gd~3bcqiv$U&`KLd6VbC9;-mM1fUsLAHJvWWba%@;!JWfw)MWZz4C5g`4E z76=x`zv6)ecSp!#&bFNH!!l#G18d7M+sweY+lPOPN;)>iwy!Khe>@B&3DZbygJ_SV zkM}|0MOz;HN7(Gn5yqTjHIQ~=Eopi#DV4?ox9o%%N-F)Qoj>Qf!m53e?=uC=60i$$T;pEHD4aNo}82o_nD2xhf8h#I7Bi0j35s+D3|aJA80*(hcCH~&P7{>SdNpD{daQL{x+u()ys|^ z4B#*jMs~fi41HHBX*CIh;;m8-*I!DfOWZ}DO$n+$B+JI~QOaJ?q{as-hibI7rr?8f z0MONv9F>=tA)Gjqs=DI=2`Mjt__gTrw^nX>1YUqBZYbQ5dbQkD z6cw^+3NbtKdWZMpSR6dT1cMH?x^>w7$B6bo7Ctot(;Y@JlO0x%|1h zVlzksjvPb&%UI9XRSGHY>N+Y4RY+5fJA!!hEUEbvnCWd=`qYWu7W-A48f$6*PVifE zFZr8t9ttD>48*aa>cZY9FxQ$VX+@$Z{Qat{NFf(eptwfG&oJb?#aEQ`7(rk!Q>boL 
z1(q3u|5tE!@IgV_@9jbiRuyDtcn%=88Jg%-7TGX-H-LJf@D5{iJ6mM^e!i#&H>8)E z+u#bEF(xPgi%|I*EXzh8&NNf3Jazac4dw!JE4jPu_qz$BBMu051Yz$6j5Gc!^zz0w z!q3V_o?nlv1XA|K>9@3*o6H662d9^;{A#uE>*I1uPdv4IoNj8=Nr5LB<2TXi5MeO? zV4m6@!rOs_J`U@QQV(VPJPD=Eloeu)4SIR4Yc2ofcn1{__!ckIegx=J2@7M-wg_mavB)ot0GjbAUp4%3}cScm(= z1XZ8c`v1F)dSOZdiwj2X9a)qP_DTJ@>b3vBXX{Q;v#9EO@jG7AtwSZF{2fwbR?4er zCTi{optB@Q7ORp^^h1ZaB z>=|d(>YE~Z4`VtG6_OXqfH&`CS@(cVeQXB5i?*yx!Cs zNDC%wYCrHjw^7q%aJQQ#k{csLjvP)KW!B&){Z)Ex-q)0gDsDq2**K;&^ibd&mpAkb zuq4JXl@vw%R^$JKPN+QN)k^jX5rO5d1QRyYW=J3^7!LqFwSH%#k;22LUkia!b5e@+ zP{>5*>75WCPkOAh?RiQ|$y|;!?thA*u@<(j?1^f>AW;Ib-8H15i&-Xbc(Oz; zPe5aBRd;vZ3N4z5Ms8b3$L_y#<$X7Yv3!evE&39m5-16B{#y{+>1Kjqpn$DmH8PW( z$vbPI20;0?zurm}qq$Qa*i9kaf+}sU71-3vqjGk>&gDf}v?R$S1k8vZp_)W-=sE;w zxXU%6t3)Y6L>V=y##B_E?I53h=CFBNW&OX-i$`-L1H35wBlk

9Zq_+dd1od4~t8 z!6wqsZlv3Is9&G(ZhbB)2)_*Cyu{P)UG%D4(>%}73k#oU+umyodY%EPlun7{X}#W~TxRcbUSF^-|!^6vXn>~iYK!rKZfWhhSUMsdB;3g9qHmZzRi?9oGzK*s3*+S(z{kqc1Z*vv_#&D2wFJ z?qwHoMK$f+O)>VAG`gFwaS^~baWUrn#C)Xgw0S;Mi7&sCx&etbIT`sYn@X#kfBsMI zy5RoO3Q_{kn(vWTaH9h#{lVGlJ%3H@))Sh)!}~QPE64U+pHAB*wwyl9*57uFq2(x6 z>M%4Tx@!hC1TV&s+F>C(bFu~D0&zXw-@_0{h%Z6Io`Cci!YaV22P&VvsSk@qSN(87k2H**|`uK{XbXK2so(fWL{v)8T0Azm(LFg&y0UhWb|x8kTVPLM71? z*b(B_aVUJ9c-?yzqJ=`!v5xP0fM6bGy?$E9-8|T878>(;lu(dlIqsp5E<@D#R zh+^eK&-`=lniF~9h_ei8&fNb&ZZxruA#ri7`<0t*Kl11I^3>Vi)3WT+P?W_gqZ1X0 zxf|RsaJmuM>21!Y+5bBrNY9}JPe~X^y<(}@Kw%stz$rCWOs0_bzM@dAWt}taA0k(0hN?H3oj)Q7wYb}vPc=~g>=*zP)Eogvc zi@h%QU!^s;tjy1$Bzi)LLGfMj#~43L#Bz-_`zzl2+2IQAA7qwBZyV+WIMu{&a;?kO;nZB?8R}qaess!ySl9!+xGJ? zbd?BLsW=qdbmD1(#$$ZY#;nc&#Lym32*!3UxF+yQIY>C>P@FNv4lLRuX^J2S{sYwR zxo=|YK6}1$?psMn!6h7XEt;8d=gOJtaU@ElVP`FF^rN%uCCG1@O= zqL!Gz$WT9RW@*N+o;IEQs$xqGmyabAIXk(#{wUu-@#W-u6#84$H-eL6o#b^|ZjgjY ziKuGN316J^_R>L_Zw-6MB0NQ0bb{uIsHQ?suRBhFpnfkESB?8Gtcsa??z=Jkw5=AG zDbp-3zlnd$S3CS25pu;SvqtAOfrAue+W*!S8o+EAhB*9+b(m3lNep5Fq}1ksd-b7V zIy}`@Fe@!h3EFb;P8`;>vJ^bQWQrOhtNN%COK@V_gGm^)<_ji(9TMwpSJ|3w%w6>n zEW&-^rOXQENFDrUx(EwT?&H(~F4S%rNbZ7CcK9F3gFV^bYwt#<-?~9ux#hn0Qis9E z&JAV;lGSDzV*p7M%w;o?n|`C^Gy^$uM^0G+opis}0w7{ym)?XfSPkNjTZJQ zawDtVqN`F4)I$-;3W5Q9syMFg2NWNGaY5pM4sssyxc{D!4_jz*&+l`q)vj1$8JJY< zQxGD6VcxR6yeSwbRou=hn3pg6SUcKO7Z{48lB>5^>in;mSBENx<3RSn%B#lAM#AG` z%%(ou3`NH8mjhna>F!}6MCO>?Uz=*rj2Xzlw9}*yq6SO)y8%UY#gq+XWoTv2-BhEg z*Q*>^8_43-LUteE)IRH#$9%9#nOwhhetD=jvtea!-T1%VP4M|RMf~G;%M^}d&0X!i z@^jgn4m|G`TSJ)O#?HdK`8m}V(RcrAWscb)Y8866P&-v-dK`^=uV4%!1}ky3KUbhmOGr)xwm=$ zh{3L9Jz&Q%m_27J<0WrtZ{Ie=U!mTy{$N4hLl#f+27ak*CeIhd#OX-IgYTO=+M6r; z2UUTZN}x0ANVmNB2R!<^T)X3o~f)E)hg*z~Qn*Hu+d&EH3C zoRwbTSWcH7L~(1DM{!S>^bxSph{IX19`By7v20aJOZOLSoe>I8&EOMm223 zhRE>yT1~zd$KACLUnQ}@W6MzeOk^R|m<-g534h_+$Iy z_-G7I4L?&NYx<{VW0Rgydh{!-!n9Vdic>JeG-fRqPC2(vM#qUV^0{L6ID{_-7j?sm zvjN03{j*kr)Czw3Yq5%iW}P;`*$1hgz~V)a>~ikY}9WN*zzfqK7VkugYUsK$O7AaQIwiLxdV zBwPoo9Fx9_-vNvJbDI+2XWgxHUkXV{fwIZR{L4rLrwQ zorxgo7y|<@c$(uy6DpRfHvSz2%@B4?rH@YNS`9<-R+k1@5^nd~vcXusPI2&cs(Ee# zg1=0S=tx%*9zG)UCQy{ zEm9g#?F4$6PkI$HQ3{Z)xoH<83u{mvrpgV@2`6`Q%4GgKSeqU9#P$u4z?lt&7E*|i z7WWdWPSSSB!s*?F{l$G6XjN`^fNDh}=oZ!OwQ+p+C% zO)Sv4+7}|i-LK=i`H^*Rs>D1t{{5Y6U!z8ng8F1n&6kvFCO^+CYtS><{k z5Fiwl3pE67$!msK%IfYwbNwFoe9R~>VK@Z@y0UIBP2mhnxUxw2!a!AWvnjhEr{dq| zy8z=?M^4${>QCu0D?9cb+!Y-sS>eJOtM9pI+M7^uLC3nF?Yxyk93R*PD^uSsGr@{7 z|GB!uw`~6TP1$Pqs;Y%XWtanu@r7vco=C>`pWLN%@F{-%>P(b(1ty)ZdP!kRIme08 zV&{;wXRv%sWDrJjz%4S5C^FHlM2t_!Dx(0$x!mRTLcJ1 zOnwj$;vDQX+Gp`Ky9b3f*X7Z{ksEQMF#Mh+?jcNSA1wNMfdCg~D7dE@Yo#=N`IR%C zw965mQz(<28?3`nZH_($Rv%*gKRU?oHad)@*cdoQDaJ%)Mk}t}X6`BKoKZ)90~_(W z9fx=ZEzNtdyVjU%rrwk}Q}oKc7mO@lX}Ss?&x>?tZ710f-7s?oSTD`NV3;QshAYf0 z@Z4Ertv<2Ew@~9mVDMtndFhzRD2Bg^VKK_YQ8cBRX3N-RxvN3o7(d9YAp~K;Vu9Jb z&q4*!vRIyb%)>3v!JAA>)(f)pHZlD+>c=&y!R|EKH-Bf&Jx}HnH%|JWj`~@^P#R5_ z`Y>>6bfi8~-06(+*R8ZjsHBfSa`2HFG|pJf`8FZ$H6e_?_T2TzvbZdb-ODzdl8D05NUh_UaqvC;XRTk(!JAcg=`$-i}Fg92bH1CSgb0dyeB(G3PN zi-{bA{wvOIeGaWhVm)cQ%JRtfYtWCx$P#{x!H~$Z{7wwsR1C*-i=-fprmOoN1m-de zZ!>Y=*n}}n9jlI5gD`_AG7E?{=FhoAT&uUGiKFwVM;HhwK9CVKL#s!d44=BWp!7whK;ls z8u%DO$uurH!!@&i8;E2P=62UUTVHpbIP6W`@Up131`Tu0!z`)p8t2)x6c^c8(AMm_ zurp$()I6a0H{L{8g=3Vg0m zcVUf9nqv5^T)JEZg4XL>9=lrQ?#g?zzZ@Kfv8K!WA{zH;It# zO)2<0JZBrv@jZI>O*_Mk(ae-l3@5AWf}8)GmvVZHDGrir%k9sbd-e^P)folSD-L{OE|&YgLJ~ zUdb5N3t&%dA5q{FMFKfxk%9vPAcP9AwkCoru}-6jXYL__zgS&vO zvfyt~qUO@54V?0cUU$Y=yPFj=hOEfOss=+0VV8{nVpu}zhMmijQiDlL&`d6`r^^FT zmxvvyk%VT7Qw%=VTS;iqj`FravrGGCN3;O+uv{G zeb!p#M@ghwj8KP995`H`Q!}C1)jDx$u*x_hL=ORq?V!=1E;M!+#6t0`G9S8X>TZrB 
zw8u948Co$a(>l|!v78Bc6!)){zFp#^Jj)yG<(2}2`#V{Ibr*4Po6u%lD5Sc^zbiQ% z2^FN6v#~fRbLylV&HdT@J2hfNyn+a8j1yC~I@@`3{X`!=yl8bTK^o!%Q)GjKl}yYr^`ji$PkSe3*TB&GA1RR{LHLuvN}A-?_d zuapQDH0E3a>w$b=zPB?GU7XU68$S7)qFp&bdhU@H`pUiFEWS?!O4}~*#L8H<4>soU zJ4&RT$oIV`s}jA)zb?xrO!1fn)|;BoYp&%G4HmO+msjyc;Px(AI2(}R6AApEr$vrO zI++;Geo2RK(hz9nkB+rU@O@~_)Mjs4dg!>?>w*#j%gKJxtb~EDTN4SVmi1n+`L*f04(Q0{Geclaw$v2-@r+9F)hw z^k0v@YPNhf*Z#7>=;$0E^GO$00?$T-!Lbbagny!M#?cCy`JH<(!-Dl7ol#}EZ?%r$ zNBEzhX|TbOjz>$Rpbr+v`x@Z(iAX=j^Ux8*YesnJWO=NoDLC`BU(cTw=PM%t9vDs~ z4PjNzuty~?`>JFID!Cx`%{$7+IgQe0WzB2j8@TOHYnu#!-vfDAwSQj&<_w~(Z#;dd zZ9{+4CtAC|Ao``iaDPKul47*IM-;0?o%>m=s=cjzHiJOrGedWCItiQ$K*gUc7!N(& zGp3BJ187_SGSH1?8nKMSKDkO9SXh-y?e4F^<6y8DV59P2oMKJT&0V>@J0t#SdQg10 z8J=T7yvSH#jr(tR+g!XYY*5=rLyb=-z}hfnkZF4&RL#k8={1?RMWxs{LjN;>v}3l9 zqg1|so)1Bi^hWtU8Uy6MhH+_Q(PKi$hvywpvh!K`l)}c07TfEVWS#%#qHM?y%1PH)5n=&XG5ezX7XoNjM1E}}OP;Z8}U!NgAM#@fYvDf>S zHrkeLo7l+myz{PQiYa6X7g&Zh_V+Kwd%}5*W#p-l)CdmX3}E+%+2exJ)RtPorJQa! z__;TC8hS82v7neTjnO!|ckiW+CQHJTxbOPvKl&9mX*$c9uen!lhRZh9mY-DX3vjb4 zhW@zOp@dJHXco6Gb)?%e&7X03WsqQ*wq?4lI~jhB>_qW2u2WjUsHahbhZJ;@$y^?Z zuO@YY{I5|kNs%56(rTj>!GZ=mEKylqo1T~?fwoLci>2OIpM72+q}Y_t+`6rmX@O@4 zEMUc-%Nh?Wcc(95vn*CsnZvCvjAP~)%TDH)9tpPUwjoulgrYnBkSfpV{F$fl?)ePb zHGdx7L-O7^QZ+X>pUkJ0KLKNLJ5?Kt!tT~Yz--vCwi>`YSv-Fq2ezR_n(MFG%O=KO z6-76WHSK1sY@akK#Od+PLIH<1QIY#GV`~`L#v)}Leypbf!gXA08y}Qq8cqV4Vuj;1 zcVy~E$;bY+mieqz>^B9h4YXuw*b3c^!Q&XLO7i(wJ01Cm|M`q(1<*3|n5oWYPd*>( z1A|UU137^?8uVhr2TP8Xw)Q+jZkey`N*exHvbI3)bW&?ub5IyJnt}I{pZ;4;c=C0O zbA}9q1M%tBKQ_+I2cX;v9DCxikm;P4@fxtoFV5*5r$PU#A|>U^Z)2LY7w>0=ZF9*x zIc1;vxlDwvRux8O*<#bZJtLwXhW-+17FQK zSQKb+Rys{J_X@AC;=WNFy{8iWnrs+2;T z2YM!F)u+Ny1~Q{o8!t2*F;52?#-7eHr)~noHdV9?h~<tPU{cxwn|jC?{gWZ9KmqpyJM7#C;BU2}=E10lUGOSmHtf1`rseQ`)39IG>DNF%yIX~(HS0ul-UYSoedgt&SJ zBv3pW0n9kanBz6@{3%el_$l1`ef9+@8KhnZEmOt6cy?6rO1zr19n1oiaKngM+EWxk zNTO1VhfD>;CxbDAh3Jw#oBkZWW4xdbkZi$hyS$KKwl)2q?=y;pp~Wq)P~IQUs5wAq zHg=dRj8T{(1t~dMOsQD;Aop7;=))@om2vBZFhWb|hlKw_2K82=zH@Xp3XuutuIc` zke2vf*`#wm#JT9Xf{Y~Wu@HA#wkk?%j2?!x90|i0fVDXt^L|zbm_l&c!C6b`- zZ+zSZN4HkHsN6#aoYoQ1EIY^>ne#POEJ{DG!Kago#-D>x+dL|smpwk-b2Mh7i_Bip zk?zgq=$|uHsCx;wmacihTiP2Wu1bO#M1>0V^P;K-=bLA*H(yqc5k)!UH&)&!rRAqn zIH6=5Iw_n9LF@HXIz^G#`i-btJ_@>GDdw^`{^C6-7S8>+mN-q}0rf(6Yu zcAHBd8e;p;iJF*LT{O8m3ySXPZ+jT}t&b&F=2>B6Tgl@qam+y3*0UTQfngMd;CyVV zh=`d*aJhDJA`V3sR{&&$hNZEf?z4(+nc1t@I^S7892QDYg%Cf+l488bDeZKz8^53Q zZ{{43v9z^mmVft#L-B{tb5EIxRIZ%d0zg9A_QT-3MJ;mF*-gAiII%4`CqCQ zDs^B2;K&$e0tXvWf`Aoga~j*_aevrJ*nD$Ho>o5@U3bptB$%c#>DSR_{KUW~FFdaq zIKLSw8*)u12J(Y`J|=|9p)D`AOW5nu9YxLSDaMWOXZjedE$-c6q*=55nhYFUk79Ox zOlz^UB5b_Bj75m1py57d#RagfWqXika$KKbLAo0Kql`UfS8^V6UK{d--e6=>8>rdB zjfi~s@n$xb|EP$x3BE>^GMj3?8|Pja@@;5&oQ@0Q!P_N$wJ|yIZh5{~*wogRmL|8i zIg4DcDg9!hFe|>}L+tk@cr~&H3RaGj~enYJ7#zYfgt_{@mwEH}2VbZD#0A5^>zS%yGbTfZep96f2PH(b7zSglA+n@Q|Sy79_ z|0G;{k`9<(ach;AIy#X90*F~QexB{c`de;(PAVj#YZ2E zbUGBdPS>X6waL0aWUM?ebRO=#kL^|{IP)%qVayu2@I{Pm^&!`mG>Y=$*ltOdvD&N< z;#ehM%N?F}Z8z+bWX@0Lb`{)5Mxfy8Jq~PWl1LcgmFNOvS;R8Pd z*$O<|d41#gXc6*oa;SG|}jT&>HQ`@xfd^U59N z<(U`J+aLp4e}Tk*ilU}3P!j%AOjny$VAdWBxp=jXPziCMxxw9^n&UtqXekMi)M@R=$ z1c{^jBQIOX;vm=V!;T_o88Fkv@R-6)JR!sgNm`yswIFCobFli%3H6r#Wxg)PhI|j7 z0~`5B>4(DWHCe}Ngyj{PDV(L|g}j>z)AdASjjJDry?nSduc;8*`JJF_^9RF5acZ@= zu9*)Ou3ZVGuf6is_JgOO*U}d*c;VOH$m1N@Ohdk|5c~R@Q1k;KEVoL1{C5vL6Oe-!Q^9chiU;W1HGpRNNppE?2cYG(ir@<*_ zpU7>(z%%YbhGEu}bI=$>@1aXFI+@ryr`ZM0quM`VgLItLrorJps}Hq08O--Fbmqik zE$2v#rf~@zUJ8gX7?$j&<0QctT0_GApS12bwJ?A0buLejZ$^L8lOU7ZI=S~o*5o7j zR1ADcv=>%%o0so*cX-gaJ!Y&iB-?4A>-$;5BxpD}O$j=zUmdi9b34{)n1cm*3UjZbicZOomY)OFEd7iM{@>saMN 
zrwtohj1^Y2H}2J1+TuUinvU_d%@=+L%b2iBh@&-;j;E^X`x{7v)6xnA^0)r=pq* z`xdZP5pV_j3!W?BumZUXd@pQYfWA>!MV1$YU7=yc#1-IOL0<*=3luJ3xMKRn!xea1 z0ab;d7lc}|SB1nDh*kl9!oUltD}cFTr zi%u)}up-C{A}r{(LbZ!fEl9Is?~9BrK(S)J3uG(^xkAp zx&b#IkWbhps3mYEFel32wV9#7+Go35n0Yu-t7jJe+u6FCiU)=jZc}%EMXAphDVZ)i zB)#?AjCL^hw{`BvA~x4ocK)BK^NP3keqYlkk;~vY@q1QCvi1k%^nL_;C+xav+^#%l z2d~TC_MhS9dRb|WKP7;?USd$5p40{$vxInuoY)hVMTau1vi_R&GY4B61=5U92064# z)C>L?A@=(ZHft`jx6LNnhe}>W)WElY!M&df!_=(uFf$B;OeefChGqr^WHTlhOfqD0 zDxxM-bC6dqRa6%sqAIE)prWX_MHl{7T&gA*NPO2zcy&qxZPA`Q=W@Caix@hzhbfcA z)bw3|YdFo2aedDxWmqOSNFSo?xFtJg9JVa(`?YjGJMy1%j|v8uaK~}((gXhHU0MzS zp9!&b#kEX!7_{xN4~*|Kf?ngy`y)M09ODS>+bi=GUq#Uy3r}?qexV=d*FL#Pg%Qm> zJ?AW&D=ewV_JVd12zT&`R&2&9hy|M0X-7HMIXI;_W^S4$yX=WfksJ9%H2Fawx zSDTGSctJ^yT&L+I1$I7)W247+LRmXj8W!b9Ap8m z-HE;C1XsStSBT(qz1>^>CbDi_Wn!&cjyivS3_>PeT|V^Kn3Kl)VXk_R2~`%q$RP3# z2l&E}*CnJYfw;m_wI9*{uox*ejW=0K;ZWsq6CDffT8Zyvweh?gFXJR`c%{1fB!MqC z&Vx|xnC@PK=Sy60?hFJ&g6u~c2i0cS8R1xG25- zdkw~DMhOJT$ZOAG+MH{G^Ac^O_g+KoHubPDI?d(Ek8>4ah(O$vw>`72I;~zAKv@{< zKf>6?g7c=$#P#X#0V0ZY;*=^H91E05AZp(pOW$N!1r8iTFi;w`H^eCoCs==`5c(qg zoiyX*j{aPd4vzR7H^&ovQ3pI2l7lu<5MAsI>NKNR%gYP~O-0ph9hU2}Nm`S*83F-P zXmKxe;oD97IkDyGXN4&Fjyjalj8K_9_Xmzlz9v{6 zAMEnd%7{;+33u`Rku1##=5T76<-OcCdoX+jxOKY?2==;+UlfD`cs@(aMGFjuVUIs; z`J7B5eoq?p+G5GC^xsRVX)KmZgrP;TCLdoMX_-+XW?mn+LnYtdyoF#?Wz@4b51A0J z#7;j}PQf^L11j^Ti!-@(Cn%|miy0Y|1J#e$%h@0ZOoNFRp1V#Ll)ZW!xXa5CSn;}G z9IvgU7y4L@<}pWe1Lv3@ZG>77JDyq=SbMum7K`-)vrI(A;hseU1ikt+2@#{PL+or) zPnR_mF=su&`)sFkFpAtodD>h2kk{o`1tWkgut)>}zC?UpJrIC3ST|4Jbcor;;0LwS zUc?*`WE~{k#>>i_->ib^Tn!9KKJ;R%wLr+mPBqPKVE-kUclL;m@g_L$9LC^qc^-6`6KFA3-O+J^A#Q7P^0w_S>8MOk z^Jb5g?P`BWIh-Ff(y7s|%Wks(buwl4zVHMizWm!c z+&b`)k&Z7r*C?1+YM7WuJatmU@#Aze&#*uq{M$WLw&JHc@5b#MNK)M_I}RXQVaK@) z1K2E1FU^~185EZ|lVju_jtu7}{sz&}ZpAU<{4?wN=hX5MP2|gGkB@|(xcZ)1q`B`Q zZBHvu@PUu5la3Qx?8TYokcc>{Slnjv(4wU*%|34w#)@&@VqPlF1r; zac|v+LG-nAXF7S8h|IOT$yJKM4%1H>_Wzii=|A6$Q6VcX3cJDAF~i(Gx9)8ek;Qf3 z!Y$7qyl748(~3E*ZEwi_e?FVYrH!gYNLykhVfz-OOwm>Yr?jR~uTHNy%P{NKbJKkZ zO7M0i&6vlAdNa{t_31(<%^FR!uqSZbhkDgV?V#&@4m}nxF%I@ zKVr_i?@jC0P~m=K;@VCy>|Mjz=R=!_kJ+PBcy%e`Jm$5*mA#E~Tf9-3!ljtlAUw;j zB;lY$=TDIU<{omrFay#Bx+t9mI*Y{T7vC%11*^Q4Mua3Pq>dP2nUiFf8p3Nx2v4-u zs@JxwswUy8?dTDx#JuI19}?}gCLTOs?BeRm>@X`mec6_~!5Z_X+t#@U>65kb*r;tD zJjH+8YX9R$;%gdmN{OGotoI&em5_f9*vhdAw8( zPXUdFI}aJ}hDIn?f7x8RKfiKz@wjT^9BjJ@A#Z#wieU6)rpBze=_5W9Y6=BhXd*Rf zf-qwflm!iDkt^=kHO*Gzrln9mXQwWIwQdKHTblljj0(=n)|}D4y1zm?yB!*}EJ$n! 
zuOiCX&0tP0kJ^uOH+*+t@+>ceTNCJCL{j+>b527}NBm8XbDZD9qEGnfn|YKHA+g)8 z62d>tcpTa`Ub>4M)z7u%j4{j^;K&=6M;cH;0j3OhgdXa_U%0?dXCW-w9GFK9ZlL2N zL;$0z2B82e$R;Rg<~U>!Dj5X=is9A@aNsAtv}JX+P?ZKVVcdqkKv&wTP_nTsnBI^@ zJ#K_g_~1Ix_1hoLBS5A#&Ibc9Ggx~M+sMKXGAuaewsQL74j1$edcr$zs~}wa>#T~I z_>Y|>i9YSVV{gJe%7O03ERRc5Ui1ci-u-%&nwPs+nQLZ>ikdrbuQs9g#fi;5&~8yj z!mzc(-$16p-=uPK9X4N5nGh@b6NLasTw6PmvQ$iT5vs_*@44S0^x>Zm7a`RxxR{qM zvWy;k@*AX}fF>62%7%mpo?DofG~g& zKIx7Dbqx-!F}pz#sZJ2?xaTi&X;}6VbWIFEyeo=($}0nyEw0kF{@c;FAf|m}#&Jt2 zna93B+BZX}bW95Tt12&NBupx1UQo@DNL=rw(F!g6&tfh=<#gH{uc%rzr=$!_-8HsB^Ne zH42ybJ`#2LdB>gBqIyO$X4;5<$ov~`eU#w*DU6Z>Nf<}Nk#0tKoqQFPXyRV`G2`~N z9#XQX1>PTi>Jj%nktsjPztg6A@yVc2A4PR57DJ7CW0=|lKIE>6IKE3JJ*4rQ+oLbE zM`>k79}{d(g_^+UHyeRkd0ipAOWe2=P>A)!XI|#EV}XW=bBh>jQ7dS`%7T|rqx1-k zG<(BpI_Nefvy`322VZ^^B)s0>w_0pqMeP$^9J|q5=3*yPkDTPtc zx6W|^VW>Zx*EMraSa`gi2X^ZdI7D&9+tHT8DG-Pjy<7J@7kmw@V3_~fIc$A|h&eIM zcIoc#bDn2^KC{S~w>f9Cp8>k}roOxXb5(ZbN0I??bD359anFO1e0%9lSKXfMWf15V zFA2vbD1$%&XyNm82~rrf$IbwOuU_r9%W6VQuRa~qhH1}qnu7(?SlF0#&>iFg39ZR)~g`>|d&H)mXej^B1M(Q-q7=jY#( z0lcsy`bi#BibBrk2VFy+K_Rwi0@<%YA-K?Oa1##+EqhAt*Ymv~i;a5N%7O?(P25`Ut%8NwwNbK>LH`nbnyd{1TN40T1UWma{de^qQV`(z!X?Vh1dwMAb(nr{*{nb z8!62#`)>c4;(Ru1GpX%;NCcJnZn%gC9Fo6H*F?%4sMlA_F%~#?yy%a9CF9s``-(90 z5fxs zp|bArdtd^%fIGqj zA!oPY*?wttY#`@>%Nni_+aa->_XNY?BH5IcsM(*vmyO{9&UG9-nZKy<_@+& zJo?Oi&P3uoxtiy|bUq1dkv}L7BB3gDg}WEBj9Vfw*LQPT)Vw;#lQK8R!}*Ec<^ec> z{~JD)QsE9~s}QA}3{5OQ5CgEq{RCh4;G%&jzX%57VbX~ATP9=7{G5k0h*4%AGuJy| zY=B@EDri_&Biod7m=KBb=8sSs4BQp(2+p4FDNYp-L(C!LBzdu?R`L#aN+VON zP(#JMc(!W09c_^@ zPah%a0`mwBZEOjxf^nq%1ac{{8w>%8rh3`A3wAeTkVwTxw~ThkmMF<3OEHNKrr(Lo^Q3OBQQamR)B*B8 ze0NDjCqn3dDtI#$BoImP-4Z}J65Mbn8%GBYVX2K;mX@=jhob2yeKmL{`ZkWk`uMVc zy0nC8XwTEwP{KOsJf70yr*E@!)7m)-5S^9=A~=@tjR)Yn$#xZpRQKG(LcU+Epvn** z{qPPZ{3Go{b9P}9=qA_A96>g(X%qz=xcLWRNRgc|B?o@?qScxq?a-V*7I%u@gVL}O zG3IotQ&5m^TX>_g;43#rAU;Nh8Z=H`8R$T)d_)zI?imG7w7C(Np-btH0Ic^sTV zD<@2mkGE3CaNSudIRRF!p^oubfxvn>9L55vEm8%SNJkt)H0QQz;_G8AOJx}DwCRgA znmU&P9ko4Isze`|1%!3Iv-@YZV!*qc#niso-ZJj&Qka0ohkjd@J@&c|0SW8r9!c6_ z@QE5%fP@@CQq=;3Ep9tWac;2jAu8|oI%si0hT+h0A#ok;_aiqjR<^^G6;tZDS2?1H zM7WJixSiNwFksggmX=@Z9e93+lx7~tVVc*49QOBITP=`2*y+p51M^R|zQFSmNI6n) z?hgc43{U=N719CQHKBFh1aM;-wEI*FAlqG@LB;D#qS?9UPxT^sA!9181&*8yicj_1 z-k7EOyrexwuVdWcN66naUq90c7!9)-#npgt-pcS6kEg6i&xZ7kja|QvR@<~(3%^I_ z)#1ml9m%@xZqgu#U;V`{)S3ZPfiuc3v|ZPz3EKb+;Z;whcXm1Z)FGe$DsXja&W0gg zZKZo*6PXEhIBxyhL+M2Wig6(6K@qhMJG5vC%CIPJTikE{BGH3OgDsmagnX_sZXf$L zXQ=FqOj8f-xZa;rl|Smjs_{Ya#K174;xHtrObABpgca5hj3WYw!bU_l_9$(?#X#I9 zP~ZC|G4^5`gapRuj4lIwJd z_jTiJu2>1o+B8&)W*n&8|^*e!QvZl&8D&oXvqlIFbF#=B)Mz?N){49a&r5J zh(#{w;RK`Oa0(*{3*%r;U|FUFBQ(H*lEyLOZIsyG%phq1GB^yOzd)ET>>#tSgj`5M zLYA{xtafVQ7B!p5*LHX@#nwM{a?M1eggsL&1JKMipA_fz*t-W2(0s?^x7@-T#@J^1 z(~y9Us5vaoXn-$NUQ|U4Ig(pt*Vrk1Nqyk0vra{8I6p)8?@C`Z=zuE^90I zb){`>C8B@beR)j#cePK08HMV{^u~_yFuOWvJ*q9OTSOjsbz-ae~st|I)1)Zq14T3p>!f(2z6 zVs4y%erD5>n9Jm&moH^m7VY3(6ZCPi@=jOPQDVU;bYQKaN)C z@Io)DfiE{_{f;Ov8TXeQ4=(Bli{&5y!??S$TuCym2LzDhBM5``#|6FiT27}0e4qn3 z3XFX57cKuGh3tSh1hNe%MN9pdVD(^9;GP0V`dnedR$jet+B0zt;UrQKnkqp?gBvVH znQ|TG4l<)k3iG;OO{D%g-7AoH_%c5)J zv)=on)Ax&V@0^zG|GE?uDQ{Mt4o?XItf=Y}PwG)aj*|8!@UF;_J8Dll}A`-~$=mr<> zF>aK%hS#khI86AoFsUrY6JS?IKj|oP>_{HR$fP_RtwISvt$%Ex->Z*h0)mT(1VC!N zowO+y3}gK5M-R4&kW{`Pho$c=r_0)xn}zjA9!P3)p#^8|z%o&xNrOr$dwYk!3*9R( zTq!$i&wp1O!UM~@#L}&46jWM6ar|mOhGH>(Fg6xhFdDE64h-Aw`zf?CXGOh$&73@3 zV=w%E8I#4%_`SfOA=U@;sblB%owd}l<_mjYmTPMclz$ApEBSUJ1Kqyy#Cy zOvw31J=_VO$x!7)N8O z+!Vi zrUEvcm}qbIkJKpAo+7;&oe$L3M01ZrfRxh*I%@74nHr)=3~K8HU&*8%mX4?p0BZt( zNRH*F$7X0V#Te3@yy$hBlZ{HE+t&dxU4U#71+EsS6<<1gfQq9#`Hk+sh!3By-qZF3GNO< 
zS&D;nDj9CmbgAt;od3wGef0c2$BCNLPIv^bM_vi0ya3zqbH^$f_#)wC;LzweRQz>u*9k)BY!HRUzEwlzei>hp-eX z6c_NmE*3!{-_TN}PlH7G_u89H&R-l&Mr`2t|}Q4Sw6LLvMt7mZ7O ziM>a*Ax0;zbz1!MI)foh1rviPPG9+(d7^Cc@-{fVcs~!R{WT-3vh%UJ@B;xs+ucUW zYgY&M4qE0IRP=_t0ICS)Eo3YOrk5)ewEB@cjFmyqH_a!PgQe=v-?Ng9p{8gzCGE(< zCbC4xSwnLQ-G5uI>}J*@yN;YNto%tgZGu4@@=wT18|pO|(1j+vVRa}`2kLvjllC#km;PfFIetP;T;t5A=xkPNXbxIqX5AHUK@t;!t=Ws3 zH;$QYz`4q!4v=<>sBy`zm|m{|m*5eD30Uq`QP)jlSe8u!|BLN2~1oy)b_t+p7+4^`OY$@|_@<1Vyikbv2u$bYx!==EA;0)X&5 zZ~ZSbn-N}sPr)%kYR)|^&T6s`B`4Jd4%!q&Hc&fz!Z{fXfjUJ?f9q`jX{K{CDKU0> zJHo8lDj_`?f|qgHN8lkh%a7Ef42L_&u_Rt&(^({%n~4jh`Fd`i|rra?N94qEd;O+u{M!1?HE zzcyaKwx*ZO4nqUA^r{*98}q7loBx%|=Hrz%WF&jj3PgX^_esS#43Y$TADS2e6h|T= zsGy>vD!4^OQ4~cHRnPus+~e`NzZv%*rFs2N^Xg`!^86D`57|1Ph(AB`1%vYR!r@k%j*K1ctjMD)jcEpXGZz0 z&oNf0(EEO?IhBooTsoy*e}M`T5w*PA0kaX6$|VabL-g@zb=eK)i;Ev$Ptku%!`y#G z-S-AP#gRK$bN=?eE^ay5*(S94pPNBw7~Us&jN)VMR`jBW0K4bxx=LK;v39WkAX0W} zIp#_GZ>!?XKCRR|#kT}jCx?b?w8iKRs{*<)r>DqFiQd(W6CnURYz|FMmOANkEhE|rhjhOa%-0|;fAA~=dX;u&KN^ZW*mr@D(DZUGfc6n9FR*feA$D|^U z$tC{xO0S=DP&8@B zge0&6!mc#T;FOU8BVZy@2oRuA5)ecH04a;QFTp9Dz_Bb3nXX7#blCUwE@}MDt`9=9^ z8NtI#FkBbUvW^s_{!6m=Dnj5hPaxhHs`l4&$##gy7rO30R=2KoR+T&l?;V!abM>>z zdG8uOMEAxLWS<8WJ5OtiMSod6$0A=wN)PDeKbn1I9> zpA7OOk*e;bbM9ikxqGL=frHW}l%LAIwjSGHS$bo=a90&~~91ia79(D-R{bNw>(T}vv z9OAqHaIKLziF7M4mz-csWk3{OR`f)q>Ftj|hG?({CkIsB1{X1AdX>U>kgOClGU4w_=PzXYR z$3h^*K?9YytG-;`Ev5J+0D7}rO zyFao${fLkjl!@zpePC{r?J&$3M*KN%u>eGRTl)bQa{`cdJh)UT$B$35ogOo4Qw!zj zM$RMp)04`_Sg~P#Gm@Tg&pE72;e3SeC64%+=CNfdHV^i5jSauhbcyWwRGPgD^$4q~ zk1GEkweuNplU9pIfl19DzNJ`j@`a6S|GT0wyM5VgoROaGsLl6toB1}1W|Jys^y<`s zEq&GQmh@>MVQHA|1I*fwx}n7()=DX;pY#2l?mwDl__fW8#3N*LJKp#2(#UC{2JO!>S z9p`(oxc&FV+i$a9bV-->IZWd|U9Pvg&>zEH82IOnh_Kqu?ec%=Na?hXlRZNC@39YG znCn2<_K4fCVL6L>82f5@$ZL?DsE*KUfI&S$7-d*N#GkIRUdoO-SjNNf014Ro(O(@< z^$?phfbV|L;}DE4IJ7`y@i#`H3Y)Bii9wv3aKt353x~%4K^GxB0fczu9EKP4qPvKe zN*k{9R7jE>%PM$g#`k-FL+-70+&XuR!fhxJhNZ6mhS}t|%yu8lV))e}iIg62?(W4` zFOMf`v63+~OxR!|lt23bbpJa7&8WBR8?$UqEkih)4c5^ejwkM8f9+7G)prg`on(dY z+@1#5<7X6n7utNB2FF*ZOewDg(?Up$NxyMD_KgmLjY3?9&IeE)*O@dKB=gs%`tun+ ztgWGsK4!O_^y?KCgLA`B$WM7p7Z^Fz2;Sq`N=KZD&o3rF;lY|9v*a>(tU)%mvn_2x z1J~}-(^f(CU9n1b z8VEI3Cw(kjHk0a{1#M*Z!|qGkxIEct!v3vWGa(ai@-Ogt_x~T`&>ci5VE?`^RT4xo#<2J<)ZW|LjVRK$)&aTs=~ zF5wvYTLE-X{A~aSSRu=}H7pSLNBG7XZbX1TnH+RQ$=$<<8_r`c%jOLN?ex)J@6*&V zQL@O(@xPap-b&`F6x9kv9+MHsMqx)^CD+(-TU0MG(82J9nol{4c4FP=A&??g^ewjm z7_DZFot|p7N^vk7Ce{49Rq+ z7-~b*>1HVm+PIbA*lJ0ENUbh8!1A=Xv=m*aPV_-yYitJF8YMfyXg<=&dpgm;EWH4l zCIDF(f7x9Ifn?p&$*z7#(E2ab9WVa0eP0>t&1^Y&P5S>V*;t|3iZ#qq#;In`TkI_p zFLXbciq zK>Adm;Ch@qbQ3)XQ=2RLiskhFbkE$L9lbw(P3tMD#w|kq83~Ggw;)Vts;Au$F~Bzy z_O^%3u`*z!fg=GcfHrw5h?(UQ9>BgzG$5ahj*R1;2kUR2hhMRTkq)D0k~gu~$%_9R z30%{hw{K&!s~Le_{8UUS*jPn*&)R7?t2X1|1zG0$KmC0WEXB^m_Z4dMmgp{4vDii+ zwhO+=zE217&Y=&Wvd($v9#+^-Vez#{zTz zK76NWx4Fmdv{F|)hEWw1F0s5Q(vSrO1O%Es-z_DcxnYek6(yXp!eZ+ohE5yE|4 z=l21E6_97_FW~!V%xO4>-0Fk$cyuLebYkqRI-dO*)m71>39Km{DVVg0$15C!k?>{e!6u*q@E2 zCv?6+`v5O<0b%_VFRPtjX~=Hf5#JZ#1=EY(yMK@MXafF3k{0jBG^|kFQrMkvqtZ_H zYvWOi()rkOC}BpipHzu<2K14N0GHPC(=$GbE+64BQSpuBCJBOqYx3VOj~w-C!qKwu zI&kvR5-k~C-V5_@q)NoMU@hg+6q9v8lcj0|mR!Irn~N_CUfSJIgxgfw#UG22cig-N zS4qy6U_LT2e3z#g#@tOsFnxD_&U~+y;zmdH_LKowyLX*yB#M0;xW!W?Wk^E=C@7P@ zKgh^$vmS(SNJ+or4zo0s-CY|aew*K~Xj1J^HhmR7+0rq76 zv5!BO;TaD@h$0h#vI&bG58{|XIB)W~mc`^b^7t|28K{)swB*I^9^p&H7HpFpxu-KBCD$QH2CF>w?ql9R2`QJIp1@$6 z(EiLTz@ZqwtQi1W)a(>G%QG|-pMLNolL$=q4o>=$l4>D2O?5;UfS(wj0wKhkN{HJ| zUkB9%YRn1WLFmNu8FtS-=mX^#JXpSntabUZIVnRo_HRqDwqKt-K+uVwm>@@;F3xj^Xkijhhi~)T zpd!$T9r=YXEaY&QiD+cHTSC>>!O>nAj6EjHqKYP}g{Kz4^?|*F@&4s6;+N(Ks8rQWg 
z*c|q(&6?Ac)>|}iw7C8oF6?1PiC)NV!bJn?sIJ;;l)*NBlOrdwOLitMDU> z7%F%aMSFNIuqR{W!wfI^XgT1oaGRW2QAGe4y_5S3y&$hJfB7rB>U^ReWM1BsaQ59Ax3YUw zO!V<-sH(E{zm3VrUN6Gx)aGFH5w>=!r!O6G#_Q(ZoJ=tI23~GnNu<~53db$I_}~29nn=A zxri0zdE#m-1>Kqm4t)`;w^z2 z<{zeR2Ytugl+eGIdae1ql!o!syP5frt)7-shZ#iow(un^;)XfIawmRAG)CowJ9#_0 zL57KTPqv~yCZM!FE|~Oo9-dt)`}`F__%(11m5rQRFxUw-ttePPe60_&Cw}^`{E@=E z{K%v9fx6mpO~$IJA0$Axc~@Tq_lA2_sd^KpuQ!>U#WVnuFY5IV@)#o1nrBy5#WYRv z?eSR1ITXO$K7B+He8N=J(KEQy{~SB#9Umi;LV=Dtu(T}SRp3etCyESnzHf=R$c!Jr zck~TZo`)OGB?tWejtovQ@CHxr<`K{f#C#c?1o$Aa$aotODys zA_lH-x*@=`?SvMI6}RSN>in=Ld~gg6We+gS28&=^#|W7%EL|6-kW=b`0x2#`;#sdJ zzJnG@IL9Oqia#_;!?62iVn*CZ0tpeWTZAf$7{gDgt{Eq`HHC24p|o>B4?Wf@NgZnEgyxF#2~ z<$Vifv_JRY<7odFDXIa;yl*Y=H4bMk<}k2=d-GowG%th_Q4v3x?YqoMb@5xsNS!vs zgv`df4d->FZxpR13)OW6-t=THJ&^1S%+qnJO8I)I9U&tD=p(-)#gkauu6nvx9JYx~ zo#lbnDof{Qd1060-};-$U5fzzCcfxq&v})LX-krd+9uE|j1VTV8=lso3^l7^E}>VR zW~B-#3pO}LTM{GYo^?+faVPEMrY#Y0G=_+=W|iBD_Pyhv)BAcObLN|(F5UCGXQ%j# z-^iaq@TY_AFaNkrk4 zm4MZ}DYtaxkWV;nPp0)@#A2i5R5@ckOt2!qRt0qU^Hfs3Gu4g#Agl_pW@y@<0U=q3 zNhAf%NiZQ3JNTH$ifu(+$uYJwf#V_V{U$9h=z8(@4Eiz=JZP0Z=IHVupLA<3g2t(9 zMZcel;8`UF?i8*pAAboQ+Uk{Sk5P{PrG}th%-f!IL%7$iU_|@(!0{44FiL6EPDA^~ zPFLjl!w>neNw}rA@-!)Yet`^>43&nDNspLR3F1My&fXW{K20>#fIM81ymGz}D4LWt}qP{a%>-HojIa$=7_RaFM=!0btQq!_yhoi%_kR;65Ai3*OzJOYb z!zt%b0b3dy@CJ3vw%Rii4B?}`Ig%0^G8WftW}*0bcX%QZAp#fTqeTSTF~Ct`Dx!=C z*jry7m78|D9A%J9YZVtgfOz)d-3-u&;bt62m>pMuv`WT#JAoz z9Km6&J@hC`{etl+tfhgp&1g)Dw>|s(MkDXdPx4py!IvnfKy`_2v^3DEIS1PRnrOZbD zuQ!KmRadV6S5ZAanpPtw;d{Trdmn6i;lL}~QX4B%ae7P`0a9iE*h;hW7!3CJ6R_~@ z(m&q-48&efB=|c}1z)+vcoA{|Neih^4pZ8L_S51cf!#DsPS5SjI7??dsAo+X{>&k+ zi#cIp$nk1n_{hqjZNoURQR9?D2YnQ z7>H=DD|C5=7KDhv&Ye#@+rXe_DVJBC@aIES6f#ZY!tG!CHgm>I*k)C|iaVOG+u&*7 zsx&;;kM+{Z?1Km5!DijwkVj6k5d$u{M6Nu^bqjqkXKHZA*BW z-?H$44CX^fF}AfAuuNM2&i4;tH8uTLnnz39CO9dNd19rHOTCg2j}!0+=qF{g*|uL%T#WxU9$r0lL-Me0k?fA{5t@)|`JVBN{QA=_1nLp`< zrOW>w7lJ`nk+TPpbAFUPtIM6&_xmu4K!jTjI5K#+;8nVyyVnOgi^2CaK=@>|`(Kje zuGz6fHg>`AHO-)q#PyCV13^H*d2)JG*8%2SU+};;jB;nt!=vkyy^qKy9XL2O0E|`G z_f3Y@mh2SFM3f=r2XEBW*IOjCG^vw|NF*M%wx9q*)pP+R$_uPh+R;INIg(&*e=_1` zm9Ie=H|f*K@KAOfz<-U4O3n(X2^hBvzUo^WvglDyhu|(A8(LH3UaQyddcP%rmWQDw zCI4?M{>(wiZ-DWkb};gmxVJP1wHWb0=jlt_yn^y(c9sa#V^S}(0W?cpj;_rJ3BUU*G(+<&%CMPHumxWMzlX{YKab%`}ftSNjDW zWa!c%wX&7j*yaBbG*$T*LAj&g7r|coQjfS@GEXd4P%{F}^>!YQnPG3&K&Q_gRWt-t zRNZv22{g`bw6xLPr_!L_W4OaU9pgIBit7~W?Dls=g3x-F zJf5Z}5IWnkyvGWBi3gybqG~blD^)!Hyok~2a|F{*zDK%mTLhA^q~I+@j#^XkwyS$v z{s{^8Pk12t<#aKhULY8Z{7phL!1`DIK!a$SOgzIg2vDBUOQ=oV>C%C0*b2u(ebV+q zjDdcHD-U(fU7GFAF_1F5>ZiT} zjUWw?5^xt`$f@;1{vYFlqzAN5K@v3v)DY>9D+q%~SN!XvhBzJ8f!B7_3NXX;%gef} zZipE~%%6s_0xnW%Qu~m1si}d`1fv#GFnkxXC9CMsLfK(Tq*M$3m3}dt)JZ4~37{(^ z@P1z5e_inXzWt8UDdJD>-HRY{B{TkLbRLOwOsgpxX@Q{lBP@dgNC_U}UESIet&z8< zY$ISfN2iD8L;^3oqb;?!lbIS!n9p$Gtzsteg_Yd^Pl}-g5EblTUV(k_-Kz)gX#5!4 z;>E5O#-i69B#9Jc7t?DsjsNc|DaZJyVsTx)>@7HUCf6x6`BwBc}ZG6W5>9k&7VG;%R z!hI~ykUx&>uYzEpwu*pM&|+pE+VywtxTqz3e?OOzDRq$1XD71N{`O)8Zi|xFiNND+ zVM(A9uY7tI=h@Zb1pez^HNYQFvu1pC0?1Ys8BQV(kRk`5q4b`7SVVLsN6dw&!u}G0 zTV3iN;a-O~i=YN&)}I|gacvneq!jRg4#_wV5ez@h2fay3O+5^1!gwszKjUW?Uy0Dp zT(%C920AJFLh}CcR0Vfo*r1_G*m2#>h(2=vVxMoY0cx2NQLSq@o(Dm7!_~qx@|xcI z^8>1$0biy#p_Vo>wG==^wukR(F&B-VyqIDXJ@`2g6UE9vppOUd)+PXHg-T4b4bEUfr7hvI zaK5_uQ8tx?7X1Bg)7D4b8PzNdcsEF?qu1R@JP7mp{~Wg&->3l^B!?{4WLa@tUeS8g zz!M{9%t8*YJDhVL5ID1-!!~@g!=z_Pk)NHx$2#++Z&pR{?3-)Vr6sqb}L4cBfMM$(0Rt@`Nx6zTv!MZ3VA=z6S$ew-vv_ zYlHG)y;*Io_t3SraQ0%(>iG3!+6s@05n8+y=e3-Vjm}mp)AF*53-M*VbjmU;;l>wV zq2J0d)O9RWO6&{LGaUN(h=ToH!gb!{F$isw-mAy9sT381m=P`Q)lg9O_Q{ae#uLr- z1P9$tia4{4eVn3DIW@R(XNNE7P 
z#$|+eurC>1L>2-6^%>=gRbt;hHf7@bJg2fC4=8sjMu_E!({V`CBC2G|Q#cW02r<*A zu`f!;=_`K(5#4-6Koyjw7+x3V&H*=&$6q%C(HM_8PMZYh>DQ<`X)=w&_^@z>Qd0zR z&Y&3`aK)N*6n5%%8kAP2mc8(ZeyCAF2ezP_k(<27M*)72jbFya8Maw)gSls5o5$0t zB=ELqFfD`ai;X}&QQpuJ$51k;8BuFwiWQakw1mhB+<)wQKcIdrx{2!G?%WDTXB&8! zfO8&9! za`wPj<8+BC`C$CNud_@+8vnW2RWZ*9&KPW?^C*5sm5da1Sil!$?$N+`3GrZgtV$da zq1Iv6c*hkJxwgg$a82LwCBY{|C%PCc@;fOFKdMJxnTO=gBD}7Ss z{S6NbIK{9@YFX#z2{?gNChIdIDF2I@+Q!`~Vc=zUi`3xAxwf%UUx=Aa+Mj#jhwDS> z$IBy;jA{i-EV~j)-Ak|cW71at)DDZWjdX-^cOVBdROPnG_wtNxGX4HU&iz|JBTSF{^N zeJ3UcihwS374vc*{q81jy9d1hDd@wOL184--+d$+TNiVY%j8^9>*3gZCDp@+0h@Xu zA=&{iZfPB$>@;v6jSl+QUCLl`aU83Ty@>gf7Yqjm&em879QJ77$fB=ao0MJ!3~wbB z6wcVXkO_1009#?f{+#pabPrbn8!!N-#V@CSswhFq)w=)QPrq!!lxIgmhJ@KQgI&Qz z=($k}T#g+N25GCWX-d)o2nBd#DY2!*r}MV4Yg6WlU0NLidSllR-d{Q82s`hpASqPi z>1lTF(|`70DAvJK^qE<{1hm0T9!Q53=sIha24Sg>%=&qXct zfS{|UG>r|XY)(@f3y;-dVT4&l{5? z$4$?8bAXr+M0sPLASxatLteCd5)ZsomlV^Qut@K>%=t^+inInj*_TV9U@UwIH2 zetS4Am54wHM~D)Fn;kA^PSgz));IcDs%mSe1I19)hbh0oK>ZX9IS~8$y|pE8(Iq1%gIj(c|evgZ2Qj#c!R?&@ao?TP-%gOS505hclXWP2Lj% z{|EH{lT>f}w)(Q3V)nSYR${d6{k>Ox4^E@6`Qf_OdEI!_J=S%^q+6@S%WIuhIooPE zO)1E94X!uargiCQyKZ$JG(1i6WaWBa=H4WpJxA1PJ$&4+Y!^!^If2i<2~HXzBHQv- zwF8Kro&z9ry}ntko(?5O?p@6mmb>GothQF;MUT*3p4r*9h`@pBb;gOl4yMA=8J)-=}Kv*0-yrz23fioqqRMehU8u z2CrWdtg5E(E&posYP;Kv$MY%0wQv5d-|JC<=dOaSTKC>_$?kAn_5|cAy{ALlRK5P* z&AEIaNwa)bvq!V1r?o=1YlVGFjd5*RYpr}U9q&;OZe(C!7BdJ7tC@8QWeBl#Auc;e zQHhX=*~kDT2~g<2z+e-y28K|^f-lRnFjHpA0ptK^;6v-1MDJio}%qF9vB8^Z2vjAX1klCO{6Sz14c7W;%Ze%b!0Fasaf4TL4L;` zx5p4vT~_tUL=ze-k>^)#h^tt6%&&Z7w+_yc3s?J;FH|*K1ZDzRuqCLUSUB@bq|Yml z#QCwyQz*c*WW++lSmiGyD*#XxR*>2JLi|Ej00fY80Q&?2U;=@iUkFbL;1dY8@;~SU z!N``_7yd&JfFYF<0Z_tvp%nh7#S9ld4mcOEq>hAMnPVB{0;zClr7;CNk2-0GVo0N{7E2`J zaM}O=IN-!)|KGT4|JP>x?>YY0D*nG?SevAzK^62mJhP@Q8cGA@1~;%vRy@l$%u2xN zuTkWGyPW4^EPB}9`@9k6dzep%>+$dV77XvOLmRVVUa}KGwTFBy-WFkpVdWJrroetI z&$N=l)q3zYqll&P!@VLcL0}Dmu9%!^h%aC$W{e>-8dese4MJL$ItOZ`JkJ>aSYF@& z3z}Hr&DkJOaB;>WQ^8XLgW#tma8l*PaB;!@o{1qdRH9x;108kUUV(Mkqf+~x^OInsZ5s*8n0 zK19iL6C_q=ON|L*Ow>@R+GFN~eP8npl#Xrr#2EpjVQbDZTw0^0HXO3sWo-!sA&o z8Y-^lS3w5SMuJX0WJf{7X@KSu`-%FIL_x^*}L8MzY1Onjy$Hjj~NI+7pA6Z3D|xK*4!{ zte;Yb;ASB!l4U)1nj3wocy5uQeBZENICkzIc{Gt=%rDZkYXj{>KrzUp9ZMTag(^+S z?s_f1q=zn2HZjBFm1wzRz&ciFqoR7TlqE7qxn~bSsFEBH_W+8t&B&VT;HsiduEq^(?Ug}(Jcr6&$tK>M>p&jm zE-n#`H50a~ZvEQ>_z(1&n?RF_GJic^?^6b>p3xh$?1cVT0%>qicx4?^!Quk_(rBiR zJD#XShg26zUDf6CJcVm5KVcp1iPkk8(?sj1ocs9K3Kadj}F#|4n>H8p32RCqvb5 zAn07f0{*OF#Cv_uDXypN@ zIw4;d?qhPUgSq|MkF>l;nxT7~d_bMG+*Xg&-QlqIc*D~|CHq(H4mkB8x$SNAHG^NY z46MoZB~>vnHJW8*O@Hh?I;X2^@-6+h%(uDFE0(9N&X@xA8hTVvm=3ZgHwXi^BQb*#;6Ss@8&~G2tHeu0BD*enCDak*jXo+W0fp zuN+9xWX2i0CWmNgMOIk&2DHgLs%(`rg`Qr~2@$F6dc^DrwZDvh-O0nTJP|ZlM5xgV zJ9&^7pV`e^Ph>FAe~y<15x~!h6Ay^(SY^6oYcAm_dxib4TlE?g`U=LgHbjI>lMPLF z*H>|=J(ec~;yf0clWS^eXKDl9wbna4esw&f7L`b^P?6*XC?wgW71^`s(orv>*AFPU zP3YS1Xm0419-9>6_sz4tQ(!QW*xAP>8##?8Yh-F(@pPMT-of&zftuZRE);uhz zU7|G3HC3w0BsMEyLGnePr@Xb4e0E8@(9ET&iF9Ed?}w) zFR#@aYDP`G|I(|pc}>jPNBRBAvPzxAp`fJtRx2s4XeU)moy($(Jm;?_Rb_}lY@1Yo961vmSfR{aucS?#%ZcfHw)x^R)EEjs*=tLoW zLNY93P{BtptK|EFFeGWVwE4iODnBnX^XI_z^azu9>?nxS*Zhi?OC9U_XGogW+NzAv z@D2g?!bWG+n`h=2MOM>4H2mXWc!1};p@_Q~E|Zm&UCUuvh2(iJ_Hq66dvw27Vtm>A zYsR}9XV3P;b56rS`@3&>P+@Ett4Q=#QPyPav}N{<$aA{;qVQgEl2u&ppq%Dkqdh_@ zoQJ?}`P4bTI1JeDDfK?fvd$ZQjjH^q^Y6wVyYX(9ao8T*6uKJ5BHvN^2&A#>UDL5A)Vpu zwL@difzFS2vnuTFNWoq~T5((%Lr6#M+gq#MHi7tg2VTpBq;9R6%ZBSq=rwZ+ADOn7 z>06b<-sG&`?TK1dXZt62nX+&@<-t{kJ*&czZ8K%`1C20>Scbcw$367%_UUQb{!H%W zjMh0eSpq!3tqkqN_kiru?X#?ned|y8m&K}kCR&cRs{mt7r>ol%{XGUNGq>k2+{fKw 
[GIT binary patch data omitted]
zpM6`T4}FfnA~odYKf(3LTiNNq9Vo4RNO@A|=AheOpwhUzHz)7oT!VS&uJNeG_Y2?h zO1J`~TSPihFH?Ib>zX^XSzjNqYYFVasWK|t?IAxi3zJU6ywU3j7|g8mCNEvju0=wq ztb8p3ZIUM;vJRV^`<=$p-E0i;NJ0g{V#UMsuJZoG3xvGz+UO*jFWyxiM>o~;i!(>t zd7=N$41X<-Oj^jBlELVq5N7(-V0Y<-vM#7zD`yBu2n4(iXTg5m z{rFL$`Q`X18e_O(Idl`p)c?37PiCNvH-W#L#y!5R_vke&5oBxsHufP~cDwmfYJQ>sB|PdXDZDM;$4*7iNNYqW^P&Ds@-|;us)n`aH~8Dc4SW(GRT4(TEvs{C`=f?7 z!eFYhaa@(aOXxZW%$y6340FdrmyZMhgY(!Gs?NN-LVF*5A4j={WU}P)ur?Np)MFq; zt#-zfRYRp7k0(rK2NH71r+BKa2B($fbq7(?{dr)s z@odJ4dl03M;GAL?3B9b)Ns>z&!{M$}JU&{P1i>QFLop-0uVdc7uCPVx|8xs*$b zbuOm966C~_h!{aX3zG|OXqjYbMAp@i6{bU$r5)UoGq~w|3wUHI+dXfqyBv_2(A(H~C$bG7hfb zzp1BFgv>GllHXg~8Ew`z)mAp%+cDZ)Tubt%m`Q5LU2n$;X8JU@w7-me#m=`K# zT!5&8J1T;!vIvNxiTq^?loJRpRRjkjB!p;~YFuq53@+bo_~H+j-2QX;bL{<}!g>9E zTB;kuB8e>ktCWVa^5tT!^xY1#TT?J4FI0jSn||^oG%ExBPh2%s0zmi2)l+3$L&=k` zK@Na5g+;En{DcxFfEop-;S}?h^ft8i&5Ns`+$X|?!2iYE MkxmpO2PFutz{*pd6951J From bafab464d5501aca79d79ae713cbf4b3102a820f Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Thu, 25 Jan 2024 18:42:05 -0800 Subject: [PATCH 123/345] doc: add NEWS line --- NEWS.md | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/NEWS.md b/NEWS.md index 1d98f7a5..3d02b981 100644 --- a/NEWS.md +++ b/NEWS.md @@ -12,18 +12,13 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat * regenerated the `jhu_csse_daily_subset` dataset with the latest versions of the data from the API * changed approach to versioning, see DEVELOPMENT.md for details - -# epiprocess 0.7.1.9999 +* `select` on grouped `epi_df`s now only drops `epi_df`ness if it makes sense; PR #390 +* Minor documentation updates; PR #393 ## Breaking changes * Switched `epi_df`'s `other_keys` default from `NULL` to `character(0)`; PR #390 -## Improvements - -* `select` on grouped `epi_df`s now only drops `epi_df`ness if it makes sense; PR #390 -* Minor documentation updates; PR #393 - # epiprocess 0.7.0 ## Improvements From f2927136333e46ee6448463f8903fe5d1f3c930d Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 25 Jan 2024 15:26:06 -0500 Subject: [PATCH 124/345] class warnings and errors --- R/autoplot.R | 12 ++++++++---- R/utils-arg.R | 31 +++++++++++++++++++++++-------- 2 files changed, 31 insertions(+), 12 deletions(-) diff --git a/R/autoplot.R b/R/autoplot.R index c193e01f..ef0878a1 100644 --- a/R/autoplot.R +++ b/R/autoplot.R @@ -59,25 +59,29 @@ autoplot.epi_df <- function( allowed <- purrr::map_lgl(object[non_key_cols], is.numeric) allowed <- allowed[allowed] if (length(allowed) == 0) { - cli::cli_abort("No numeric variables were available to plot automatically.") + cli::cli_abort("No numeric variables were available to plot automatically.", + class = "epiprocess__no_numeric_vars_available") } vars <- tidyselect::eval_select(rlang::expr(c(...)), object) if (rlang::is_empty(vars)) { # find them automatically if unspecified vars <- tidyselect::eval_select(names(allowed)[1], object) cli::cli_warn( - "Plot variable was unspecified. Automatically selecting {.var {names(allowed)[1]}}." + "Plot variable was unspecified. Automatically selecting {.var {names(allowed)[1]}}.", + class = "epiprocess__unspecified_plot_var" ) } else { # if variables were specified, ensure that they are numeric ok <- names(vars) %in% names(allowed) if (!any(ok)) { cli::cli_abort( - "None of the requested variables {.var {names(vars)}} are numeric." 
+ "None of the requested variables {.var {names(vars)}} are numeric.", + class = "epiprocess__all_requested_vars_not_numeric" ) } else if (!all(ok)) { cli::cli_warn(c( "Only the requested variables {.var {names(vars)[ok]}} are numeric.", i = "`autoplot()` cannot display {.var {names(vars)[!ok]}}." - )) + ), + class = "epiprocess__some_requested_vars_not_numeric") vars <- vars[ok] } } diff --git a/R/utils-arg.R b/R/utils-arg.R index b48a3642..dc761e55 100644 --- a/R/utils-arg.R +++ b/R/utils-arg.R @@ -11,12 +11,15 @@ arg_is_scalar <- function(..., allow_null = FALSE, allow_na = FALSE) { ..., tests = function(name, value) { if (length(value) > 1 | (!allow_null & length(value) == 0)) { - cli::cli_abort("Argument {.val {name}} must be of length 1.") + cli::cli_abort("Argument {.val {name}} must be of length 1.", + class = "epiprocess__value_not_length_1" + ) } if (!is.null(value)) { if (is.na(value) & !allow_na) { cli::cli_abort( - "Argument {.val {name}} must not be a missing value ({.val {NA}})." + "Argument {.val {name}} must not be a missing value ({.val {NA}}).", + class = "epiprocess__value_is_na" ) } } @@ -29,7 +32,9 @@ arg_is_numeric <- function(..., allow_null = FALSE) { ..., tests = function(name, value) { if (!(is.numeric(value) | (is.null(value) & allow_null))) { - cli::cli_abort("All {.val {name}} must numeric.") + cli::cli_abort("All {.val {name}} must be numeric.", + class = "epiprocess__value_is_null_or_not_numeric" + ) } } ) @@ -41,7 +46,9 @@ arg_is_int <- function(..., allow_null = FALSE) { ..., tests = function(name, value) { if (!(all(value %% 1 == 0) | (is.null(value) & allow_null))) { - cli::cli_abort("All {.val {name}} must be whole positive number(s).") + cli::cli_abort("All {.val {name}} must be whole positive number(s).", + class = "epiprocess__some_decimal_or_negative_elements" + ) } } ) @@ -52,16 +59,24 @@ arg_is_chr <- function(..., allow_null = FALSE, allow_na = FALSE, allow_empty = ..., tests = function(name, value) { if (is.null(value) & !allow_null) { - cli::cli_abort("Argument {.val {name}} may not be `NULL`.") + cli::cli_abort("Argument {.val {name}} may not be `NULL`.", + class = "epiprocess__value_is_null" + ) } if (any(is.na(value)) & !allow_na) { - cli::cli_abort("Argument {.val {name}} must not contain any missing values ({.val {NA}}).") + cli::cli_abort("Argument {.val {name}} must not contain any missing values ({.val {NA}}).", + class = "epiprocess__some_na_elements" + ) } if (!is.null(value) & (length(value) == 0L & !allow_empty)) { - cli::cli_abort("Argument {.val {name}} must have length > 0.") + cli::cli_abort("Argument {.val {name}} must have length > 0.", + class = "epiprocess__value_length_0" + ) } if (!(is.character(value) | is.null(value) | all(is.na(value)))) { - cli::cli_abort("Argument {.val {name}} must be of character type.") + cli::cli_abort("Argument {.val {name}} must be of character type.", + class = "epiprocess__not_character_type" + ) } } ) From 42da7860f4990dcaee5b4bf1ed7af7663db33bcf Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 25 Jan 2024 16:11:42 -0500 Subject: [PATCH 125/345] test autoplot warnings and errors --- tests/testthat/test-autoplot.R | 87 ++++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) create mode 100644 tests/testthat/test-autoplot.R diff --git a/tests/testthat/test-autoplot.R b/tests/testthat/test-autoplot.R new file mode 100644 index 00000000..ba3f8d53 --- /dev/null +++ b/tests/testthat/test-autoplot.R @@ -0,0 +1,87 @@ +library(dplyr) + +d 
<- as.Date("2020-01-01") + +raw_df_chr <- dplyr::bind_rows( + dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = "a"), + dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = "d") +) +ungrouped_chr <- as_epi_df(raw_df_chr, as_of = d + 6) +grouped_chr <- ungrouped_chr %>% + group_by(geo_value) + +raw_df_num <- dplyr::bind_rows( + dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15), + dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = 1:5) +) +ungrouped_num <- as_epi_df(raw_df_num, as_of = d + 6) +grouped_num <- ungrouped_num %>% + group_by(geo_value) + +test_that("autoplot fails if no non-key columns are numeric", { + expect_error(autoplot(ungrouped_chr), + class = "epiprocess__no_numeric_vars_available" + ) + + # Multiple non-numeric columns + testdf <- mutate(ungrouped_chr, value2 = "d") + expect_error(autoplot(testdf), + class = "epiprocess__no_numeric_vars_available" + ) + + expect_error(autoplot(grouped_chr), + class = "epiprocess__no_numeric_vars_available" + ) + + # A numeric column is available, but is a key not a value. + testdf <- mutate(raw_df_chr, key1 = c(1:5, 5:9)) %>% + as_tsibble(index = time_value, key = c(geo_value, key1)) %>% + as_epi_df(as_of = d + 6) + expect_error(autoplot(testdf), + class = "epiprocess__no_numeric_vars_available" + ) +}) + +test_that("autoplot warns when a variable is not specified, and lists the auto-selected column", { + expect_warning(autoplot(ungrouped_num), + regexp = ".*selecting `value`[.]", + class = "epiprocess__unspecified_plot_var" + ) + + expect_warning(autoplot(grouped_num), + regexp = ".*selecting `value`[.]", + class = "epiprocess__unspecified_plot_var" + ) +}) + +test_that("autoplot errors when all specified columns are not numeric, and lists column names", { + expect_error(autoplot(ungrouped_chr, value), + regexp = ".*value.*", + class = "epiprocess__all_requested_vars_not_numeric" + ) + + testdf <- mutate(ungrouped_chr, value2 = "d") + expect_error(autoplot(testdf, value, value2), + regexp = ".*variables `value` and `value2` are.*", + class = "epiprocess__all_requested_vars_not_numeric" + ) + + expect_error(autoplot(grouped_chr, value), + regexp = ".*variables `value` are.*", + class = "epiprocess__all_requested_vars_not_numeric" + ) +}) + +test_that("autoplot warns when some specified columns are not numeric, and lists column names", { + testdf <- mutate(ungrouped_num, value2 = "d") + expect_warning(autoplot(testdf, value, value2), + regexp = ".*`value` are numeric.*cannot display `value2`.*", + class = "epiprocess__some_requested_vars_not_numeric" + ) + + testdf <- mutate(grouped_num, value2 = "d") + expect_warning(autoplot(testdf, value, value2), + regexp = ".*`value` are numeric.*cannot display `value2`.*", + class = "epiprocess__some_requested_vars_not_numeric" + ) +}) From 7cd1d258b1293e071c5186323d2d2d5eb3f9e146 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 25 Jan 2024 16:28:33 -0500 Subject: [PATCH 126/345] raise 'plot automatically' error only when dots are empty because user didn't pass col names --- R/autoplot.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/autoplot.R b/R/autoplot.R index ef0878a1..d5fe871f 100644 --- a/R/autoplot.R +++ b/R/autoplot.R @@ -58,7 +58,7 @@ autoplot.epi_df <- function( # --- check for numeric variables allowed <- purrr::map_lgl(object[non_key_cols], is.numeric) allowed <- allowed[allowed] - if (length(allowed) == 0) { + if (length(allowed) == 0 && rlang::dots_n(...) 
== 0L) { cli::cli_abort("No numeric variables were available to plot automatically.", class = "epiprocess__no_numeric_vars_available") } From 9409ae7730c6e71505b57d0e170677ab70cd5b07 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 25 Jan 2024 16:41:17 -0500 Subject: [PATCH 127/345] wrap cols list in all_of to suppress warning Using an external vector in selections was deprecated in tidyselect 1.1.0. i Please use `all_of()` or `any_of()` instead. # Was: data %>% select(geo_and_other_keys) # Now: data %>% select(all_of(geo_and_other_keys)) --- R/autoplot.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/autoplot.R b/R/autoplot.R index d5fe871f..34bf55e0 100644 --- a/R/autoplot.R +++ b/R/autoplot.R @@ -88,7 +88,7 @@ autoplot.epi_df <- function( # --- create a viable df to plot pos <- tidyselect::eval_select( - rlang::expr(c("time_value", geo_and_other_keys, names(vars))), object + rlang::expr(c("time_value", tidyselect::all_of(geo_and_other_keys), names(vars))), object ) if (length(vars) > 1) { object <- tidyr::pivot_longer( From 39f3704c968c6db223d4947db1bf0957684be920 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Fri, 26 Jan 2024 11:56:39 -0500 Subject: [PATCH 128/345] check int positive --- R/utils-arg.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/utils-arg.R b/R/utils-arg.R index dc761e55..85331dd6 100644 --- a/R/utils-arg.R +++ b/R/utils-arg.R @@ -45,7 +45,7 @@ arg_is_int <- function(..., allow_null = FALSE) { handle_arg_list( ..., tests = function(name, value) { - if (!(all(value %% 1 == 0) | (is.null(value) & allow_null))) { + if (!( (all(value %% 1 == 0) && all(value > 0)) | (is.null(value) & allow_null))) { cli::cli_abort("All {.val {name}} must be whole positive number(s).", class = "epiprocess__some_decimal_or_negative_elements" ) From 2e301ad568459976f8de63085efc60a64f7dd5bd Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 25 Jan 2024 17:42:29 -0500 Subject: [PATCH 129/345] test utils-arg --- tests/testthat/test-utils-arg.R | 80 +++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 tests/testthat/test-utils-arg.R diff --git a/tests/testthat/test-utils-arg.R b/tests/testthat/test-utils-arg.R new file mode 100644 index 00000000..ab29e061 --- /dev/null +++ b/tests/testthat/test-utils-arg.R @@ -0,0 +1,80 @@ +test_that("arg_is_scalar basic behavior", { + expect_no_error(arg_is_scalar(d = 1, "a", 2, c = c("1"), a = list(2))) + + expect_error(arg_is_scalar(c(3, 5, 5)), + class = "epiprocess__value_not_length_1" + ) + + expect_no_error(arg_is_scalar(NULL, allow_null = TRUE)) + expect_error(arg_is_scalar(NULL), + class = "epiprocess__value_not_length_1" + ) + + expect_no_error(arg_is_scalar(NA, allow_na = TRUE)) + expect_error(arg_is_scalar(NA), + class = "epiprocess__value_is_na" + ) +}) + +test_that("arg_is_numeric basic behavior", { + expect_no_error(arg_is_numeric(c = 1.25, b = 2:5, 1, c(2.22, 2.12))) + + for (val in list(list(1), "a", list(NULL))) { + expect_error(arg_is_numeric(val), + class = "epiprocess__value_is_null_or_not_numeric" + ) + } + + expect_no_error(arg_is_numeric(1, c(1.255, 2.33, 3), NULL, allow_null = TRUE)) + expect_error(arg_is_numeric(1, c(1.255, 2.33, 3), NULL), + class = "epiprocess__value_is_null_or_not_numeric" + ) +}) + +test_that("arg_is_int basic behavior", { + expect_no_error(arg_is_int(c = 1, 1, 3, b = 2:5)) + 
expect_no_error(arg_is_int(NULL, 1, allow_null = TRUE)) + + for (val in list(1.25, -(1:3))) { + expect_error(arg_is_int(val), + class = "epiprocess__some_decimal_or_negative_elements" + ) + } +}) + +test_that("arg_is_chr basic behavior", { + expect_no_error(arg_is_chr(c = c("a", "value"), d = "a", "d")) + + expect_no_error(arg_is_chr(NULL, allow_null = TRUE)) # + for (val in list(NULL)) { + expect_error(arg_is_chr(val), # + class = "epiprocess__value_is_null" + ) + } + + expect_no_error(arg_is_chr(NA, c(NA, NA, NA), c(NA, "a"), allow_na = TRUE)) + for (val in list(NA, c(NA, NA, NA), c(NA, "a"))) { + expect_error(arg_is_chr(val), + class = "epiprocess__some_na_elements" + ) + } + + expect_no_error(arg_is_chr(c("a", "value"), character(0), list(), allow_empty = TRUE)) + for (val in list(character(0), list())) { + expect_error(arg_is_chr(val), + class = "epiprocess__value_length_0" + ) + } + + for (val in list(c(5, 4), list(5, 4), 5)) { + expect_error(arg_is_chr(val), + class = "epiprocess__not_character_type" + ) + } +}) + +test_that("arg_is_chr_scalar basic behavior", { + expect_no_error(arg_is_chr_scalar("a", "b", c = "c")) + expect_no_error(arg_is_chr_scalar(c = "c")) +}) + From 0bb93e2d1af009ca9acb50bb05456ee62772a938 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Fri, 26 Jan 2024 10:46:30 -0800 Subject: [PATCH 130/345] refactor: use checkmate checks --- DESCRIPTION | 2 +- NAMESPACE | 4 +++ R/autoplot.R | 13 ++++---- R/key_colnames.R | 6 ++-- R/utils-arg.R | 73 ------------------------------------------ man/autoplot.epi_df.Rd | 4 +-- 6 files changed, 18 insertions(+), 84 deletions(-) delete mode 100644 R/utils-arg.R diff --git a/DESCRIPTION b/DESCRIPTION index 211c5212..14437481 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -26,6 +26,7 @@ Description: This package introduces a common data structure for epidemiological License: MIT + file LICENSE Copyright: file inst/COPYRIGHTS Imports: + checkmate, cli, data.table, dplyr (>= 1.0.0), @@ -86,6 +87,5 @@ Collate: 'outliers.R' 'reexports.R' 'slide.R' - 'utils-arg.R' 'utils.R' 'utils_pipe.R' diff --git a/NAMESPACE b/NAMESPACE index a843813d..23140464 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -68,6 +68,10 @@ export(slice) export(ungroup) export(unnest) importFrom(R6,R6Class) +importFrom(checkmate,anyInfinite) +importFrom(checkmate,assert) +importFrom(checkmate,assert_character) +importFrom(checkmate,assert_int) importFrom(data.table,":=") importFrom(data.table,address) importFrom(data.table,as.data.table) diff --git a/R/autoplot.R b/R/autoplot.R index c193e01f..73bcaf0b 100644 --- a/R/autoplot.R +++ b/R/autoplot.R @@ -25,6 +25,8 @@ #' @return A ggplot object #' @export #' +#' @importFrom checkmate assert assert_int anyInfinite assert_character +#' #' @examples #' autoplot(jhu_csse_daily_subset, cases, death_rate_7d_av) #' autoplot(jhu_csse_daily_subset, case_rate_7d_av, .facet_by = "geo_value") @@ -32,12 +34,12 @@ #' .color_by = "none", #' .facet_by = "geo_value" #' ) -#' autoplot(jhu_csse_daily_subset, case_rate_7d_av, .color_by = "none", +#' autoplot(jhu_csse_daily_subset, case_rate_7d_av, .color_by = "none", #' .base_color = "red", .facet_by = "geo_value") -#' +#' #' # .base_color specification won't have any effect due .color_by default #' autoplot(jhu_csse_daily_subset, case_rate_7d_av, -#' .base_color = "red", .facet_by = "geo_value") +#' .base_color = "red", .facet_by = "geo_value") autoplot.epi_df <- function( object, ..., .color_by = c("all_keys", "geo_value", "other_keys", ".response", "all", "none"), @@ -47,9 +49,8 @@ 
autoplot.epi_df <- function( .color_by <- match.arg(.color_by) .facet_by <- match.arg(.facet_by) - arg_is_scalar(.max_facets) - if (is.finite(.max_facets)) arg_is_int(.max_facets) - arg_is_chr_scalar(.base_color) + assert(anyInfinite(.max_facets), assert_int(.max_facets), combine = "or") + assert_character(.base_color, len = 1) key_cols <- key_colnames(object) non_key_cols <- setdiff(names(object), key_cols) diff --git a/R/key_colnames.R b/R/key_colnames.R index 158c5a86..0d34f5f4 100644 --- a/R/key_colnames.R +++ b/R/key_colnames.R @@ -15,9 +15,10 @@ key_colnames.default <- function(x, ...) { character(0L) } +#' @importFrom checkmate assert_character #' @export key_colnames.data.frame <- function(x, other_keys = character(0L), ...) { - arg_is_chr(other_keys, allow_empty = TRUE) + assert_character(other_keys) nm <- c("time_value", "geo_value", other_keys) intersect(nm, colnames(x)) } @@ -34,7 +35,8 @@ key_colnames.epi_archive <- function(x, ...) { c("time_value", "geo_value", other_keys) } +#' @importFrom checkmate assert_character kill_time_value <- function(v) { - arg_is_chr(v) + assert_character(v) v[v != "time_value"] } diff --git a/R/utils-arg.R b/R/utils-arg.R deleted file mode 100644 index b48a3642..00000000 --- a/R/utils-arg.R +++ /dev/null @@ -1,73 +0,0 @@ -handle_arg_list <- function(..., tests) { - values <- list(...) - names <- eval(substitute(alist(...))) - names <- purrr::map(names, deparse) - - purrr::walk2(names, values, tests) -} - -arg_is_scalar <- function(..., allow_null = FALSE, allow_na = FALSE) { - handle_arg_list( - ..., - tests = function(name, value) { - if (length(value) > 1 | (!allow_null & length(value) == 0)) { - cli::cli_abort("Argument {.val {name}} must be of length 1.") - } - if (!is.null(value)) { - if (is.na(value) & !allow_na) { - cli::cli_abort( - "Argument {.val {name}} must not be a missing value ({.val {NA}})." 
- ) - } - } - } - ) -} - -arg_is_numeric <- function(..., allow_null = FALSE) { - handle_arg_list( - ..., - tests = function(name, value) { - if (!(is.numeric(value) | (is.null(value) & allow_null))) { - cli::cli_abort("All {.val {name}} must numeric.") - } - } - ) -} - -arg_is_int <- function(..., allow_null = FALSE) { - arg_is_numeric(..., allow_null = allow_null) - handle_arg_list( - ..., - tests = function(name, value) { - if (!(all(value %% 1 == 0) | (is.null(value) & allow_null))) { - cli::cli_abort("All {.val {name}} must be whole positive number(s).") - } - } - ) -} - -arg_is_chr <- function(..., allow_null = FALSE, allow_na = FALSE, allow_empty = FALSE) { - handle_arg_list( - ..., - tests = function(name, value) { - if (is.null(value) & !allow_null) { - cli::cli_abort("Argument {.val {name}} may not be `NULL`.") - } - if (any(is.na(value)) & !allow_na) { - cli::cli_abort("Argument {.val {name}} must not contain any missing values ({.val {NA}}).") - } - if (!is.null(value) & (length(value) == 0L & !allow_empty)) { - cli::cli_abort("Argument {.val {name}} must have length > 0.") - } - if (!(is.character(value) | is.null(value) | all(is.na(value)))) { - cli::cli_abort("Argument {.val {name}} must be of character type.") - } - } - ) -} - -arg_is_chr_scalar <- function(..., allow_null = FALSE, allow_na = FALSE) { - arg_is_chr(..., allow_null = allow_null, allow_na = allow_na) - arg_is_scalar(..., allow_null = allow_null, allow_na = allow_na) -} diff --git a/man/autoplot.epi_df.Rd b/man/autoplot.epi_df.Rd index df30528e..a87bc8ca 100644 --- a/man/autoplot.epi_df.Rd +++ b/man/autoplot.epi_df.Rd @@ -56,10 +56,10 @@ autoplot(jhu_csse_daily_subset, case_rate_7d_av, .color_by = "none", .facet_by = "geo_value" ) -autoplot(jhu_csse_daily_subset, case_rate_7d_av, .color_by = "none", +autoplot(jhu_csse_daily_subset, case_rate_7d_av, .color_by = "none", .base_color = "red", .facet_by = "geo_value") # .base_color specification won't have any effect due .color_by default autoplot(jhu_csse_daily_subset, case_rate_7d_av, - .base_color = "red", .facet_by = "geo_value") + .base_color = "red", .facet_by = "geo_value") } From 8671c0c554fa4e99da96292ad3ef0634e379e612 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Fri, 26 Jan 2024 15:20:52 -0800 Subject: [PATCH 131/345] refactor: move checkmate imports to one place --- R/autoplot.R | 2 -- R/epiprocess.R | 1 + R/key_colnames.R | 2 -- 3 files changed, 1 insertion(+), 4 deletions(-) diff --git a/R/autoplot.R b/R/autoplot.R index 73bcaf0b..a6cf99bc 100644 --- a/R/autoplot.R +++ b/R/autoplot.R @@ -25,8 +25,6 @@ #' @return A ggplot object #' @export #' -#' @importFrom checkmate assert assert_int anyInfinite assert_character -#' #' @examples #' autoplot(jhu_csse_daily_subset, cases, death_rate_7d_av) #' autoplot(jhu_csse_daily_subset, case_rate_7d_av, .facet_by = "geo_value") diff --git a/R/epiprocess.R b/R/epiprocess.R index bbdcf4f3..79f9635d 100644 --- a/R/epiprocess.R +++ b/R/epiprocess.R @@ -4,6 +4,7 @@ #' measured over space and time, and offers associated utilities to perform #' basic signal processing tasks. #' +#' @importFrom checkmate assert assert_character assert_int anyInfinite #' @docType package #' @name epiprocess NULL diff --git a/R/key_colnames.R b/R/key_colnames.R index 0d34f5f4..99d8a9ed 100644 --- a/R/key_colnames.R +++ b/R/key_colnames.R @@ -15,7 +15,6 @@ key_colnames.default <- function(x, ...) 
{ character(0L) } -#' @importFrom checkmate assert_character #' @export key_colnames.data.frame <- function(x, other_keys = character(0L), ...) { assert_character(other_keys) @@ -35,7 +34,6 @@ key_colnames.epi_archive <- function(x, ...) { c("time_value", "geo_value", other_keys) } -#' @importFrom checkmate assert_character kill_time_value <- function(v) { assert_character(v) v[v != "time_value"] From 1d45d65d78e495dfa424f6e16918d97fd52135ae Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Mon, 29 Jan 2024 11:47:00 -0500 Subject: [PATCH 132/345] utils-arg removed in #410, so no longer need to test --- R/utils-arg.R | 88 --------------------------------- tests/testthat/test-utils-arg.R | 80 ------------------------------ 2 files changed, 168 deletions(-) delete mode 100644 R/utils-arg.R delete mode 100644 tests/testthat/test-utils-arg.R diff --git a/R/utils-arg.R b/R/utils-arg.R deleted file mode 100644 index 85331dd6..00000000 --- a/R/utils-arg.R +++ /dev/null @@ -1,88 +0,0 @@ -handle_arg_list <- function(..., tests) { - values <- list(...) - names <- eval(substitute(alist(...))) - names <- purrr::map(names, deparse) - - purrr::walk2(names, values, tests) -} - -arg_is_scalar <- function(..., allow_null = FALSE, allow_na = FALSE) { - handle_arg_list( - ..., - tests = function(name, value) { - if (length(value) > 1 | (!allow_null & length(value) == 0)) { - cli::cli_abort("Argument {.val {name}} must be of length 1.", - class = "epiprocess__value_not_length_1" - ) - } - if (!is.null(value)) { - if (is.na(value) & !allow_na) { - cli::cli_abort( - "Argument {.val {name}} must not be a missing value ({.val {NA}}).", - class = "epiprocess__value_is_na" - ) - } - } - } - ) -} - -arg_is_numeric <- function(..., allow_null = FALSE) { - handle_arg_list( - ..., - tests = function(name, value) { - if (!(is.numeric(value) | (is.null(value) & allow_null))) { - cli::cli_abort("All {.val {name}} must be numeric.", - class = "epiprocess__value_is_null_or_not_numeric" - ) - } - } - ) -} - -arg_is_int <- function(..., allow_null = FALSE) { - arg_is_numeric(..., allow_null = allow_null) - handle_arg_list( - ..., - tests = function(name, value) { - if (!( (all(value %% 1 == 0) && all(value > 0)) | (is.null(value) & allow_null))) { - cli::cli_abort("All {.val {name}} must be whole positive number(s).", - class = "epiprocess__some_decimal_or_negative_elements" - ) - } - } - ) -} - -arg_is_chr <- function(..., allow_null = FALSE, allow_na = FALSE, allow_empty = FALSE) { - handle_arg_list( - ..., - tests = function(name, value) { - if (is.null(value) & !allow_null) { - cli::cli_abort("Argument {.val {name}} may not be `NULL`.", - class = "epiprocess__value_is_null" - ) - } - if (any(is.na(value)) & !allow_na) { - cli::cli_abort("Argument {.val {name}} must not contain any missing values ({.val {NA}}).", - class = "epiprocess__some_na_elements" - ) - } - if (!is.null(value) & (length(value) == 0L & !allow_empty)) { - cli::cli_abort("Argument {.val {name}} must have length > 0.", - class = "epiprocess__value_length_0" - ) - } - if (!(is.character(value) | is.null(value) | all(is.na(value)))) { - cli::cli_abort("Argument {.val {name}} must be of character type.", - class = "epiprocess__not_character_type" - ) - } - } - ) -} - -arg_is_chr_scalar <- function(..., allow_null = FALSE, allow_na = FALSE) { - arg_is_chr(..., allow_null = allow_null, allow_na = allow_na) - arg_is_scalar(..., allow_null = allow_null, allow_na = allow_na) -} diff --git 
a/tests/testthat/test-utils-arg.R b/tests/testthat/test-utils-arg.R deleted file mode 100644 index ab29e061..00000000 --- a/tests/testthat/test-utils-arg.R +++ /dev/null @@ -1,80 +0,0 @@ -test_that("arg_is_scalar basic behavior", { - expect_no_error(arg_is_scalar(d = 1, "a", 2, c = c("1"), a = list(2))) - - expect_error(arg_is_scalar(c(3, 5, 5)), - class = "epiprocess__value_not_length_1" - ) - - expect_no_error(arg_is_scalar(NULL, allow_null = TRUE)) - expect_error(arg_is_scalar(NULL), - class = "epiprocess__value_not_length_1" - ) - - expect_no_error(arg_is_scalar(NA, allow_na = TRUE)) - expect_error(arg_is_scalar(NA), - class = "epiprocess__value_is_na" - ) -}) - -test_that("arg_is_numeric basic behavior", { - expect_no_error(arg_is_numeric(c = 1.25, b = 2:5, 1, c(2.22, 2.12))) - - for (val in list(list(1), "a", list(NULL))) { - expect_error(arg_is_numeric(val), - class = "epiprocess__value_is_null_or_not_numeric" - ) - } - - expect_no_error(arg_is_numeric(1, c(1.255, 2.33, 3), NULL, allow_null = TRUE)) - expect_error(arg_is_numeric(1, c(1.255, 2.33, 3), NULL), - class = "epiprocess__value_is_null_or_not_numeric" - ) -}) - -test_that("arg_is_int basic behavior", { - expect_no_error(arg_is_int(c = 1, 1, 3, b = 2:5)) - expect_no_error(arg_is_int(NULL, 1, allow_null = TRUE)) - - for (val in list(1.25, -(1:3))) { - expect_error(arg_is_int(val), - class = "epiprocess__some_decimal_or_negative_elements" - ) - } -}) - -test_that("arg_is_chr basic behavior", { - expect_no_error(arg_is_chr(c = c("a", "value"), d = "a", "d")) - - expect_no_error(arg_is_chr(NULL, allow_null = TRUE)) # - for (val in list(NULL)) { - expect_error(arg_is_chr(val), # - class = "epiprocess__value_is_null" - ) - } - - expect_no_error(arg_is_chr(NA, c(NA, NA, NA), c(NA, "a"), allow_na = TRUE)) - for (val in list(NA, c(NA, NA, NA), c(NA, "a"))) { - expect_error(arg_is_chr(val), - class = "epiprocess__some_na_elements" - ) - } - - expect_no_error(arg_is_chr(c("a", "value"), character(0), list(), allow_empty = TRUE)) - for (val in list(character(0), list())) { - expect_error(arg_is_chr(val), - class = "epiprocess__value_length_0" - ) - } - - for (val in list(c(5, 4), list(5, 4), 5)) { - expect_error(arg_is_chr(val), - class = "epiprocess__not_character_type" - ) - } -}) - -test_that("arg_is_chr_scalar basic behavior", { - expect_no_error(arg_is_chr_scalar("a", "b", c = "c")) - expect_no_error(arg_is_chr_scalar(c = "c")) -}) - From fc528de5c83edacf37a4cdceecab25457a5a80ca Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Mon, 29 Jan 2024 15:13:38 -0800 Subject: [PATCH 133/345] doc: update installation instructions to reference stable branch --- README.md | 79 +++++++++++++++++++++++++++++++------------------------ 1 file changed, 44 insertions(+), 35 deletions(-) diff --git a/README.md b/README.md index 0c117c61..d0a1c740 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,28 @@ # epiprocess - - - [![R-CMD-check](https://github.com/cmu-delphi/epiprocess/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/cmu-delphi/epiprocess/actions/workflows/R-CMD-check.yaml) - + + + +[![R-CMD-check](https://github.com/cmu-delphi/epiprocess/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/cmu-delphi/epiprocess/actions/workflows/R-CMD-check.yaml) + + This package introduces a common data structure for epidemiological data sets measured over space and time, and offers associated utilities to perform basic signal processing tasks. See the getting started guide and vignettes for examples. 
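+As a quick teaser, here is a minimal sketch of the basic workflow (`df` here is
+a hypothetical data frame with `geo_value`, `time_value`, and `cases` columns,
+used purely for illustration):
+
+```r
+library(epiprocess)
+library(dplyr)
+
+# Convert to an epi_df, then compute a 7-day trailing average per geo value
+df %>%
+  as_epi_df() %>%
+  group_by(geo_value) %>%
+  epi_slide(~ mean(.x$cases), before = 6, new_col_name = "cases_7dav")
+```
+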
+## Installation

+
+To install (use the stable version unless you're making changes to the package):
+
+```r
+# Stable version
+pak::pkg_install("cmu-delphi/epiprocess@main")
+
+# Dev version
+pak::pkg_install("cmu-delphi/epiprocess@dev")
+```
+
 ## `epi_df`: snapshot of a data set
 
 The first main data structure in the `epiprocess` package is called
@@ -20,28 +34,26 @@ contains the most up-to-date values of the signals variables, as of a given
 time.
 
 By convention, functions in the `epiprocess` package that operate on `epi_df`
-objects begin with `epi`. For example:
+objects begin with `epi`. For example:
 
-- `epi_slide()`, for iteratively applying a custom computation to a variable in
-  an `epi_df` object over sliding windows in time;
-
-- `epi_cor()`, for computing lagged correlations between variables in an
-  `epi_df` object, (allowing for grouping by geo value, time value, or any other
-  variables).
+- `epi_slide()`, for iteratively applying a custom computation to a variable in
+  an `epi_df` object over sliding windows in time;
+- `epi_cor()`, for computing lagged correlations between variables in an
+  `epi_df` object (allowing for grouping by geo value, time value, or any other
+  variables).
 
 Functions in the package that operate directly on given variables do not begin
- with `epi`. For example:
-
-- `growth_rate()`, for estimating the growth rate of a given signal at given
-  time values, using various methodologies;
+with `epi`. For example:
 
-- `detect_outlr()`, for detecting outliers in a given signal over time, using
-  either built-in or custom methodologies.
+- `growth_rate()`, for estimating the growth rate of a given signal at given
+  time values, using various methodologies;
+- `detect_outlr()`, for detecting outliers in a given signal over time, using
+  either built-in or custom methodologies.
 
 ## `epi_archive`: full version history of a data set
 
 The second main data structure in the package is called
-[`epi_archive`](reference/epi_archive.html). This is a special class (R6 format)
+[`epi_archive`](reference/epi_archive.html). This is a special class (R6 format)
 wrapped around a data table that stores the archive (version history) of some
 signal variables of interest.
 
@@ -50,20 +62,17 @@ By convention, functions in the `epiprocess` package that operate on
 "archive"). These are just wrapper functions around the public methods for the
 `epi_archive` R6 class. For example:
 
-- `epix_as_of()`, for generating a snapshot in `epi_df` format from the data
-  archive, which represents the most up-to-date values of the signal variables,
-  as of the specified version;
-
-- `epix_fill_through_version()`, for filling in some fake version data following
-  simple rules, for use when downstream methods expect an archive that is more
-  up-to-date (e.g., if it is a forecasting deadline date and one of our data
-  sources cannot be accessed to provide the latest versions of its data)
-
-- `epix_merge()`, for merging two data archives with each other, with support
-  for various approaches to handling when one of the archives is more up-to-date
-  version-wise than the other;
-
-- `epix_slide()`, for sliding a custom computation to a data archive over local
-  windows in time, much like `epi_slide` for an `epi_df` object, but with one
-  key difference: the sliding computation at any given reference time t is
-  performed only on the **data that would have been available as of t**.
+- `epix_as_of()`, for generating a snapshot in `epi_df` format from the data + archive, which represents the most up-to-date values of the signal variables, + as of the specified version; +- `epix_fill_through_version()`, for filling in some fake version data following + simple rules, for use when downstream methods expect an archive that is more + up-to-date (e.g., if it is a forecasting deadline date and one of our data + sources cannot be accessed to provide the latest versions of its data) +- `epix_merge()`, for merging two data archives with each other, with support + for various approaches to handling when one of the archives is more up-to-date + version-wise than the other; +- `epix_slide()`, for sliding a custom computation to a data archive over local + windows in time, much like `epi_slide` for an `epi_df` object, but with one + key difference: the sliding computation at any given reference time t is + performed only on the **data that would have been available as of t**. From 93830c4567c553a043de34496c220bd726884603 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 18 Jan 2024 16:21:54 -0500 Subject: [PATCH 134/345] support different time_step types However, date sequence completion is slow when time_step provided --- R/slide.R | 86 ++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 70 insertions(+), 16 deletions(-) diff --git a/R/slide.R b/R/slide.R index f770d6e8..2f2fe8b4 100644 --- a/R/slide.R +++ b/R/slide.R @@ -561,24 +561,79 @@ epi_slide_mean = function(x, col_name, ..., before, after, ref_time_values, after <- 0L } - # If a custom time step is specified, then redefine units - # if (!missing(time_step)) { - # before <- time_step(before) - # after <- time_step(after) - # } - - # time_step can be any of `c("days", "weeks", "months", "quarters", "years")` - all_dates <- seq(min(x$time_value), max(x$time_value), by = time_step) - pad_early_dates <- c() pad_late_dates <- c() - if (before != 0) { - pad_early_dates <- all_dates[1L] - before:1 - } - if (after != 0) { - pad_late_dates <- all_dates[length(all_dates)] + 1:after + + # If dates are one of tsibble-provided classes, can step by numeric. `tsibble` + # defines a step of 1 to automatically be the quantum (smallest resolvable + # unit) of the date class(so one step = 1 quarter for `yearquarter`). + # + # `tsibble` classes apparently can't be added to in different units, so even + # if `time_step` is provided by the user, use a unit step. + if (inherits(x$time_value, c("yearquarter", "yearweek", "yearmonth")) || + is.numeric(x$time_value)) { + all_dates <- seq(min(x$time_value), max(x$time_value), by = 1L) + + if (before != 0) { + pad_early_dates <- Start(all_dates) - before:1 + } + if (after != 0) { + pad_late_dates <- End(all_dates) + 1:after + } + } else if (missing(time_step)) { + # Guess at what `by` could be based on epi_df `time_type`. + ttype <- attributes(x)$metadata$time_type + by <- dplyr::case_when( + ttype == "day" ~ "days", + ttype == "week" ~ "weeks", + ttype == "yearweek" ~ "weeks", + ttype == "yearmonth" ~ "months", + ttype == "yearquarter" ~ "quarters", + ttype == "year" ~ "years", + TRUE ~ NA # "custom", "day-time" + ) + + if (is.na(by)) { + Abort( + c( + "`frollmean` requires a full window to compute a result, but + `time_type` associated with the epi_df was not mappable to period + type valid for creating a date sequence.", + "i" = c("The input data's `time_type` was probably `custom` or `day-time`. 
These also require passing a `time_step` function.")
+        )
+      )
+    }
+
+    # Time_step can be any of `c("days", "weeks", "months", "quarters", "years")`
+    all_dates <- seq(min(x$time_value), max(x$time_value), by = by)
+
+    if (before != 0) {
+      pad_early_dates <- Start(all_dates) - before:1
+    }
+    if (after != 0) {
+      pad_late_dates <- End(all_dates) + 1:after
+    }
+  } else {
+    # A custom time step is specified
+    all_dates <- c()
+    curr <- min(x$time_value)
+    while (curr <= max(x$time_value)) {
+      all_dates <- append(all_dates, curr)
+      curr <- curr + time_step(1)
+    }
+    # t_elapsed <- max(x$time_value) - min(x$time_value)
+    # all_dates <- min(x$time_value) + time_step(0:t_elapsed)
+
+    if (before != 0) {
+      pad_early_dates <- Start(all_dates) - time_step(before:1)
+    }
+    if (after != 0) {
+      pad_late_dates <- End(all_dates) + time_step(1:after)
+    }
+  }
+
 
   # `frollmean` is 1-indexed, so create a new window width based on our
   # `before` and `after` params.
   m <- before + after + 1L
@@ -590,8 +645,7 @@ epi_slide_mean = function(x, col_name, ..., before, after, ref_time_values,
   }
 
   slide_one_grp <- function(.data_group, .group_key, ...) {
-    # `setdiff` causes date formatting to change. Re-class these as dates.
-    missing_dates <- as.Date(setdiff(all_dates, .data_group$time_value), origin = "1970-01-01")
+    missing_dates <- all_dates[!(all_dates %in% .data_group$time_value)]
 
     # `frollmean` requires a full window to compute a result. Add NA values
     # to beginning and end of the group so that we get results for the

From 6fd21ee461c2c922689da0f9bdfc81fa9343c341 Mon Sep 17 00:00:00 2001
From: Nat DeFries <42820733+nmdefries@users.noreply.github.com>
Date: Tue, 30 Jan 2024 17:51:07 -0500
Subject: [PATCH 135/345] use more precise way to generate all_dates; comment
 cleanup

---
 NAMESPACE |  1 +
 R/slide.R | 40 +++++++++++++++++++++-------------------
 2 files changed, 22 insertions(+), 19 deletions(-)

diff --git a/NAMESPACE b/NAMESPACE
index 24df62cc..77670090 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -92,6 +92,7 @@ importFrom(dplyr,select)
 importFrom(dplyr,slice)
 importFrom(dplyr,tibble)
 importFrom(dplyr,ungroup)
+importFrom(lubridate,as.period)
 importFrom(lubridate,days)
 importFrom(lubridate,weeks)
 importFrom(magrittr,"%>%")
diff --git a/R/slide.R b/R/slide.R
index 2f2fe8b4..09596165 100644
--- a/R/slide.R
+++ b/R/slide.R
@@ -471,6 +471,7 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values,
 #' @importFrom dplyr bind_rows mutate %>% arrange tibble
 #' @importFrom purrr map
 #' @importFrom data.table frollmean
+#' @importFrom lubridate as.period
 #' @export
 #' @examples
 #' # slide a 7-day trailing average formula on cases
@@ -561,7 +567,7 @@ epi_slide_mean = function(x, col_name, ..., before, after, ref_time_values,
 
   # If dates are one of tsibble-provided classes, can step by numeric. `tsibble`
   # defines a step of 1 to automatically be the quantum (smallest resolvable
-  # unit) of the date class(so one step = 1 quarter for `yearquarter`).
+  # unit) of the date class. For example, one step = 1 quarter for `yearquarter`.
   #
   # `tsibble` classes apparently can't be added to in different units, so even
   # if `time_step` is provided by the user, use a unit step.
@@ -581,16 +582,16 @@ epi_slide_mean = function(x, col_name, ..., before, after, ref_time_values,
       pad_late_dates <- End(all_dates) + 1:after
     }
   } else if (missing(time_step)) {
-    # Guess at what `by` could be based on epi_df `time_type`.
+    # Guess what `by` should be based on the epi_df's `time_type`.
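+    # (For example, a "day" `time_type` steps by "days" and a "yearmonth" by
+    # "months"; time types without an entry fall through to the NA default
+    # and trigger the abort below.)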
ttype <- attributes(x)$metadata$time_type - by <- dplyr::case_when( - ttype == "day" ~ "days", - ttype == "week" ~ "weeks", - ttype == "yearweek" ~ "weeks", - ttype == "yearmonth" ~ "months", - ttype == "yearquarter" ~ "quarters", - ttype == "year" ~ "years", - TRUE ~ NA # "custom", "day-time" + by <- switch(ttype, + day = "days", + week = "weeks", + yearweek = "weeks", + yearmonth = "months", + yearquarter = "quarters", + year = "years", + NA # default value for "custom", "day-time" ) if (is.na(by)) { @@ -605,7 +606,7 @@ epi_slide_mean = function(x, col_name, ..., before, after, ref_time_values, ) } - # time_step can be any of `c("days", "weeks", "months", "quarters", "years")` + # Time_step can be any of `c("days", "weeks", "months", "quarters", "years")` all_dates <- seq(min(x$time_value), max(x$time_value), by = by) if (before != 0) { @@ -616,14 +617,15 @@ epi_slide_mean = function(x, col_name, ..., before, after, ref_time_values, } } else { # A custom time step is specified - all_dates <- c() - curr <- min(x$time_value) - while (curr <= max(x$time_value)) { - all_dates <- append(all_dates, curr) - curr <- curr + time_step(1) - } - # t_elapsed <- max(x$time_value) - min(x$time_value) - # all_dates <- min(x$time_value) + time_step(0:t_elapsed) + + # Calculate the number of `time_step`s required to go between min and max time + # values. This is roundabout because difftime objects, lubridate::period objects, + # and Dates are hard to convert to the same time scale and add. + t_elapsed_s <- difftime(max(x$time_value), min(x$time_value), units = "secs") + step_size_s <- lubridate::as.period(time_step(1), unit = "secs") + n_steps <- ceiling(as.numeric(t_elapsed_s) / as.numeric(step_size_s)) + + all_dates <- min(x$time_value) + time_step(0:n_steps) if (before != 0) { pad_early_dates <- Start(all_dates) - time_step(before:1) From 58b8163ee46395a8fd4985caeec3342e2701a04c Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Tue, 30 Jan 2024 18:24:36 -0500 Subject: [PATCH 136/345] fix epi_slide_mean examples Can't use column names like vars --- R/slide.R | 16 ++++++++++++---- man/epi_slide_mean.Rd | 16 ++++++++++++---- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/R/slide.R b/R/slide.R index 09596165..520e0621 100644 --- a/R/slide.R +++ b/R/slide.R @@ -477,28 +477,36 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' # slide a 7-day trailing average formula on cases #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide_mean(cases, new_col_name = cases_7dav, names_sep = NULL, before = 6) %>% +#' epi_slide_mean("cases", new_col_name = "cases_7dav", names_sep = NULL, before = 6) %>% #' # rmv a nonessential var. to ensure new col is printed #' dplyr::select(-death_rate_7d_av) #' +#' # slide a 7-day trailing average formula on cases. Adjust `frollmean` settings for speed +#' # and accuracy, and to allow partially-missing windows. +#' jhu_csse_daily_subset %>% +#' group_by(geo_value) %>% +#' epi_slide_mean("cases", new_col_name = "cases_7dav", names_sep = NULL, before = 6, +#' na.rm = TRUE, algo = "exact", hasNA = TRUE) %>% +#' dplyr::select(-death_rate_7d_av) +#' #' # slide a 7-day leading average #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide_mean(cases, new_col_name = cases_7dav, names_sep = NULL, after = 6) %>% +#' epi_slide_mean("cases", new_col_name = "cases_7dav", names_sep = NULL, after = 6) %>% #' # rmv a nonessential var. 
to ensure new col is printed #' dplyr::select(-death_rate_7d_av) #' #' # slide a 7-day centre-aligned average #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide_mean(cases, new_col_name = cases_7dav, names_sep = NULL, before = 3, after = 3) %>% +#' epi_slide_mean("cases", new_col_name = "cases_7dav", names_sep = NULL, before = 3, after = 3) %>% #' # rmv a nonessential var. to ensure new col is printed #' dplyr::select(-death_rate_7d_av) #' #' # slide a 14-day centre-aligned average #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide_mean(cases, new_col_name = cases_7dav, names_sep = NULL, before = 6, after = 7) %>% +#' epi_slide_mean("cases", new_col_name = "cases_7dav", names_sep = NULL, before = 6, after = 7) %>% #' # rmv a nonessential var. to ensure new col is printed #' dplyr::select(-death_rate_7d_av) epi_slide_mean = function(x, col_name, ..., before, after, ref_time_values, diff --git a/man/epi_slide_mean.Rd b/man/epi_slide_mean.Rd index 686204f5..f51be796 100644 --- a/man/epi_slide_mean.Rd +++ b/man/epi_slide_mean.Rd @@ -125,28 +125,36 @@ misspelled.) # slide a 7-day trailing average formula on cases jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide_mean(cases, new_col_name = cases_7dav, names_sep = NULL, before = 6) \%>\% + epi_slide_mean("cases", new_col_name = "cases_7dav", names_sep = NULL, before = 6) \%>\% # rmv a nonessential var. to ensure new col is printed dplyr::select(-death_rate_7d_av) +# slide a 7-day trailing average formula on cases. Adjust `frollmean` settings for speed +# and accuracy, and to allow partially-missing windows. +jhu_csse_daily_subset \%>\% + group_by(geo_value) \%>\% + epi_slide_mean("cases", new_col_name = "cases_7dav", names_sep = NULL, before = 6, + na.rm = TRUE, algo = "exact", hasNA = TRUE) \%>\% + dplyr::select(-death_rate_7d_av) + # slide a 7-day leading average jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide_mean(cases, new_col_name = cases_7dav, names_sep = NULL, after = 6) \%>\% + epi_slide_mean("cases", new_col_name = "cases_7dav", names_sep = NULL, after = 6) \%>\% # rmv a nonessential var. to ensure new col is printed dplyr::select(-death_rate_7d_av) # slide a 7-day centre-aligned average jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide_mean(cases, new_col_name = cases_7dav, names_sep = NULL, before = 3, after = 3) \%>\% + epi_slide_mean("cases", new_col_name = "cases_7dav", names_sep = NULL, before = 3, after = 3) \%>\% # rmv a nonessential var. to ensure new col is printed dplyr::select(-death_rate_7d_av) # slide a 14-day centre-aligned average jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide_mean(cases, new_col_name = cases_7dav, names_sep = NULL, before = 6, after = 7) \%>\% + epi_slide_mean("cases", new_col_name = "cases_7dav", names_sep = NULL, before = 6, after = 7) \%>\% # rmv a nonessential var. to ensure new col is printed dplyr::select(-death_rate_7d_av) } From 173ea5811747cc0c0ae77bcee5cb213f5fbc296d Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 31 Jan 2024 10:25:02 -0500 Subject: [PATCH 137/345] pkgdown site --- R/slide.R | 2 ++ _pkgdown.yml | 1 + man/epi_slide.Rd | 3 +++ man/epi_slide_mean.Rd | 3 +++ 4 files changed, 9 insertions(+) diff --git a/R/slide.R b/R/slide.R index 520e0621..f107cfe1 100644 --- a/R/slide.R +++ b/R/slide.R @@ -125,6 +125,7 @@ #' @importFrom dplyr bind_rows group_vars filter select #' @importFrom rlang .data .env !! 
enquo enquos sym env missing_arg #' @export +#' @seealso [`epi_slide_mean`] #' @examples #' # slide a 7-day trailing average formula on cases #' jhu_csse_daily_subset %>% @@ -473,6 +474,7 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' @importFrom data.table frollmean #' @importFrom lubridate as.period #' @export +#' @seealso [`epi_slide`] #' @examples #' # slide a 7-day trailing average formula on cases #' jhu_csse_daily_subset %>% diff --git a/_pkgdown.yml b/_pkgdown.yml index e6bacebf..fa40d0d3 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -60,6 +60,7 @@ reference: desc: Functions that act on `epi_df` objects. - contents: - epi_slide + - epi_slide_mean - epi_cor - title: Vector functions desc: Functions that act directly on signal variables. diff --git a/man/epi_slide.Rd b/man/epi_slide.Rd index 668be9ff..05c4d9ad 100644 --- a/man/epi_slide.Rd +++ b/man/epi_slide.Rd @@ -192,3 +192,6 @@ jhu_csse_daily_subset \%>\% before = 1, as_list_col = TRUE ) } +\seealso{ +\code{\link{epi_slide_mean}} +} diff --git a/man/epi_slide_mean.Rd b/man/epi_slide_mean.Rd index f51be796..e0906eb2 100644 --- a/man/epi_slide_mean.Rd +++ b/man/epi_slide_mean.Rd @@ -158,3 +158,6 @@ jhu_csse_daily_subset \%>\% # rmv a nonessential var. to ensure new col is printed dplyr::select(-death_rate_7d_av) } +\seealso{ +\code{\link{epi_slide}} +} From 2e49a954fb8b7007f7929102162c83584ab65998 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 31 Jan 2024 12:36:12 -0500 Subject: [PATCH 138/345] leave epi_slide_mean result grouped --- R/slide.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/slide.R b/R/slide.R index f107cfe1..93e2e05b 100644 --- a/R/slide.R +++ b/R/slide.R @@ -714,6 +714,6 @@ epi_slide_mean = function(x, col_name, ..., before, after, ref_time_values, result <- bind_rows(x[c(),], result) } - ungroup(result) + return(result) } From 2cc3227b53bccbd99eaed7702f23c7f39ee00416 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 31 Jan 2024 12:56:33 -0500 Subject: [PATCH 139/345] error if any group has duplicate time values --- R/slide.R | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/R/slide.R b/R/slide.R index 93e2e05b..18bd5167 100644 --- a/R/slide.R +++ b/R/slide.R @@ -668,13 +668,33 @@ epi_slide_mean = function(x, col_name, ..., before, after, ref_time_values, ) %>% arrange(time_value) + # If a group contains duplicate time values, `frollmean` will still only + # use the last `k` obs. It isn't looking at dates, it just goes in row + # order. So if the computation is aggregating across multiple obs for the + # same date, `epi_slide_mean` will produce incorrect results; `epi_slide` + # should be used instead. + if (anyDuplicated(.data_group$time_value) > 0) { + Abort(c( + "group contains duplicate time values. Using `epi_slide_mean` on this + group will result in incorrect results", + "i" = "Please change the grouping structure of the input data so that + each group has non-duplicate time values", + "i" = "Use `epi_slide` to aggregate across groups" + ), + class = "epiprocess__epi_slide_mean__duplicate_time_values", + epiprocess__data_group = .data_group, + epiprocess__group_key = .group_key + ) + } + roll_output <- data.table::frollmean( x = .data_group[, col_name], n = m, align = "right", ... ) if (after >= 1) { - # Right-aligned `frollmean` results' `ref_time_value`s will be `after` timesteps - # ahead of where they should be. 
Shift results to the left by `after` timesteps. + # Right-aligned `frollmean` results' `ref_time_value`s will be `after` + # timesteps ahead of where they should be. Shift results to the left by + # `after` timesteps. .data_group[, result_col_name] <- purrr::map(roll_output, function(.x) { c(.x[(after + 1L):length(.x)], rep(NA, after)) } From 66722311e058a6664fea4d90eda40104d4687af6 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Mon, 29 Jan 2024 15:59:27 -0800 Subject: [PATCH 140/345] refactor: unclutter epi_archive print, add DT preview --- R/archive.R | 50 +++++++++++++----------------------- man/jhu_csse_daily_subset.Rd | 18 ++++++------- 2 files changed, 27 insertions(+), 41 deletions(-) diff --git a/R/archive.R b/R/archive.R index faaf048b..afd74a64 100644 --- a/R/archive.R +++ b/R/archive.R @@ -445,13 +445,10 @@ epi_archive <- #' the archive print = function(class = TRUE, methods = TRUE) { if (class) cat("An `epi_archive` object, with metadata:\n") - cat(sprintf("* %-9s = %s\n", "geo_type", self$geo_type)) - cat(sprintf("* %-9s = %s\n", "time_type", self$time_type)) - if (!is.null(self$additional_metadata)) { - sapply(self$additional_metadata, function(m) { - cat(sprintf("* %-9s = %s\n", names(m), m)) - }) - } + cat(sprintf("* %-14s = %s\n", "non-standard DT keys", paste( + setdiff(key(self$DT), c("geo_value", "time_value", "version")), + collapse = ", " + ))) cat("----------\n") if (length(self$DT$time_value) == 0 || all(is.na(self$DT$time_value))) { min_time <- max_time <- NA @@ -459,19 +456,16 @@ epi_archive <- min_time <- Min(self$DT$time_value) max_time <- Max(self$DT$time_value) } - cat(sprintf("* %-14s = %s\n", "min time value", min_time)) - cat(sprintf("* %-14s = %s\n", "max time value", max_time)) - cat(sprintf( - "* %-14s = %s\n", "first version with update", - min(self$DT$version) - )) + cat(sprintf("* %-14s = %s\n", "min/max time values", paste( + c(min_time, max_time), + collapse = " / " + ))) cat(sprintf( - "* %-14s = %s\n", "last version with update", - max(self$DT$version) - )) - if (is.na(self$clobberable_versions_start)) { - cat("* No clobberable versions\n") - } else { + "* %-14s = %s\n", "first/last version with update", paste( + c(min(self$DT$version), max(self$DT$version)), + collapse = " / " + ))) + if (!is.na(self$clobberable_versions_start)) { cat(sprintf( "* %-14s = %s\n", "clobberable versions start", self$clobberable_versions_start @@ -481,19 +475,6 @@ epi_archive <- "* %-14s = %s\n", "versions end", self$versions_end )) - cat("----------\n") - cat(sprintf( - "Data archive (stored in DT field): %i x %i\n", - nrow(self$DT), ncol(self$DT) - )) - cat(sprintf("Columns in DT: %s\n", paste(ifelse(length( - colnames(self$DT) - ) <= 4, paste(colnames(self$DT), collapse = ", "), - paste( - paste(colnames(self$DT)[1:4], collapse = ", "), "and", - length(colnames(self$DT)[5:length(colnames(self$DT))]), "more columns" - ) - )))) if (methods) { cat("----------\n") writeLines(wrap_varnames( @@ -501,6 +482,11 @@ epi_archive <- names(epi_archive$public_methods) )) } + cat("----------\n") + cat(sprintf( + "A preview of the table: %s rows x %s columns\n", + nrow(self$DT), ncol(self$DT))) + return(invisible(self$DT %>% print)) }, ##### #' @description Generates a snapshot in `epi_df` format as of a given version. diff --git a/man/jhu_csse_daily_subset.Rd b/man/jhu_csse_daily_subset.Rd index 626bf545..6d4913f0 100644 --- a/man/jhu_csse_daily_subset.Rd +++ b/man/jhu_csse_daily_subset.Rd @@ -30,15 +30,15 @@ in Engineering. Copyright Johns Hopkins University 2020. 
Modifications: \itemize{ -\item \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{From the COVIDcast Epidata API}: -These signals are taken directly from the JHU CSSE -\href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 GitHub repository} -without changes. The 7-day average signals are computed by Delphi by -calculating moving averages of the preceding 7 days, so the signal for -June 7 is the average of the underlying data for June 1 through 7, -inclusive. -\item Furthermore, the data has been limited to a very small number of rows, -the signal names slightly altered, and formatted into a tibble. +\item \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{From +the COVIDcast Epidata API}: The case signal is taken directly from the JHU +CSSE \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 GitHub +repository}. The rate signals were computed by Delphi using Census +population data. The 7-day average signals were computed by Delphi by +calculating moving averages of the preceding 7 days, so the signal for June +7 is the average of the underlying data for June 1 through 7, inclusive. +\item Furthermore, the data has been limited to a very small number of rows, the +signal names slightly altered, and formatted into a tibble. } } \usage{ From c3e68d3d68265c295425628319735ed9eacdb1a1 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Mon, 29 Jan 2024 16:04:37 -0800 Subject: [PATCH 141/345] doc: add tentative news blurb, version number --- DESCRIPTION | 8 ++++---- NEWS.md | 2 ++ 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 15b7757f..46bf204d 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: epiprocess Title: Tools for basic signal processing in epidemiology -Version: 0.7.2 +Version: 0.7.3 Authors@R: c( person("Jacob", "Bien", role = "ctb"), person("Logan", "Brooks", role = "aut"), @@ -56,7 +56,7 @@ Suggests: testthat (>= 3.1.5), waldo (>= 0.3.1), withr -VignetteBuilder: +VignetteBuilder: knitr Remotes: cmu-delphi/epidatr, @@ -67,10 +67,10 @@ Encoding: UTF-8 LazyData: true Roxygen: list(markdown = TRUE) RoxygenNote: 7.2.3 -Depends: +Depends: R (>= 2.10) URL: https://cmu-delphi.github.io/epiprocess/ -Collate: +Collate: 'archive.R' 'correlation.R' 'data.R' diff --git a/NEWS.md b/NEWS.md index 3d02b981..27bf793d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -14,6 +14,8 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat * changed approach to versioning, see DEVELOPMENT.md for details * `select` on grouped `epi_df`s now only drops `epi_df`ness if it makes sense; PR #390 * Minor documentation updates; PR #393 +* Improved `epi_archive` print method. Now an S3 method and shows a snippet of + the underlying `DT` (#341). ## Breaking changes From 1ca789bb61a2935b422aa387712cc3a79b69dd75 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Wed, 31 Jan 2024 11:24:05 -0800 Subject: [PATCH 142/345] doc: update NEWS --- NEWS.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index 27bf793d..e4a404e2 100644 --- a/NEWS.md +++ b/NEWS.md @@ -14,8 +14,8 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat * changed approach to versioning, see DEVELOPMENT.md for details * `select` on grouped `epi_df`s now only drops `epi_df`ness if it makes sense; PR #390 * Minor documentation updates; PR #393 -* Improved `epi_archive` print method. 
Now an S3 method and shows a snippet of - the underlying `DT` (#341). +* Improved `epi_archive` print method. Compactified metadata and shows a snippet + of the underlying `DT` (#341). ## Breaking changes From 13873f2908ba4774ab02e1a82a87afd4549ed622 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Wed, 31 Jan 2024 13:07:41 -0800 Subject: [PATCH 143/345] refactor: use cli for print --- NAMESPACE | 1 + R/archive.R | 60 ++++++++++++++++------------------------------------- 2 files changed, 19 insertions(+), 42 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index c59004c8..9ae3464f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -61,6 +61,7 @@ export(slice) export(ungroup) export(unnest) importFrom(R6,R6Class) +importFrom(cli,cli_inform) importFrom(data.table,":=") importFrom(data.table,address) importFrom(data.table,as.data.table) diff --git a/R/archive.R b/R/archive.R index afd74a64..08540955 100644 --- a/R/archive.R +++ b/R/archive.R @@ -443,49 +443,25 @@ epi_archive <- #' @param class Boolean; whether to print the class label header #' @param methods Boolean; whether to print all available methods of #' the archive + #' @importFrom cli cli_inform print = function(class = TRUE, methods = TRUE) { - if (class) cat("An `epi_archive` object, with metadata:\n") - cat(sprintf("* %-14s = %s\n", "non-standard DT keys", paste( - setdiff(key(self$DT), c("geo_value", "time_value", "version")), - collapse = ", " - ))) - cat("----------\n") - if (length(self$DT$time_value) == 0 || all(is.na(self$DT$time_value))) { - min_time <- max_time <- NA - } else { - min_time <- Min(self$DT$time_value) - max_time <- Max(self$DT$time_value) - } - cat(sprintf("* %-14s = %s\n", "min/max time values", paste( - c(min_time, max_time), - collapse = " / " - ))) - cat(sprintf( - "* %-14s = %s\n", "first/last version with update", paste( - c(min(self$DT$version), max(self$DT$version)), - collapse = " / " - ))) - if (!is.na(self$clobberable_versions_start)) { - cat(sprintf( - "* %-14s = %s\n", "clobberable versions start", - self$clobberable_versions_start - )) - } - cat(sprintf( - "* %-14s = %s\n", "versions end", - self$versions_end - )) - if (methods) { - cat("----------\n") - writeLines(wrap_varnames( - initial = "Public R6 methods: ", - names(epi_archive$public_methods) - )) - } - cat("----------\n") - cat(sprintf( - "A preview of the table: %s rows x %s columns\n", - nrow(self$DT), ncol(self$DT))) + cli_inform( + c( + ">" = if (class) {"An `epi_archive` object, with metadata:"}, + "i" = if (length(setdiff(key(self$DT), c('geo_value', 'time_value', 'version'))) > 0) { + "Non-standard DT keys: {setdiff(key(self$DT), c('geo_value', 'time_value', 'version'))}" + }, + "i" = "Min/max time values: {min(self$DT$time_value)} / {max(self$DT$time_value)}", + "i" = "First/last version with update: {min(self$DT$version)} / {max(self$DT$version)}", + "i" = if (!is.na(self$clobberable_versions_start)) { + "Clobberable versions start: {self$clobberable_versions_start}" + }, + "i" = "Versions end: {self$versions_end}", + "i" = if (methods) {"Public R6 methods: {names(epi_archive$public_methods)}"}, + "i" = "A preview of the table ({nrow(self$DT)} rows x {ncol(self$DT)} columns):" + ) + ) + return(invisible(self$DT %>% print)) }, ##### From a3efaeb1243ac6b25acdd18400aa3ec7fe59ec21 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 31 Jan 2024 22:24:41 -0500 Subject: [PATCH 144/345] tests --- tests/testthat/test-epi_slide.R | 389 +++++++++++++++++++++++++++++--- 1 file changed, 358 
insertions(+), 31 deletions(-) diff --git a/tests/testthat/test-epi_slide.R b/tests/testthat/test-epi_slide.R index 8137cf19..f4eb428d 100644 --- a/tests/testthat/test-epi_slide.R +++ b/tests/testthat/test-epi_slide.R @@ -28,6 +28,23 @@ toy_edf <- tibble::tribble( tidyr::unchop(c(time_value, value)) %>% as_epi_df(as_of = 100) +basic_result_from_size1_sum <- tibble::tribble( + ~geo_value, ~time_value, ~value, ~slide_value, + "a", 1:10, 2L^(1:10), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE), + "b", 1:10, 2L^(11:20), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE), +) %>% + tidyr::unchop(c(time_value, value, slide_value)) %>% + dplyr::arrange(time_value) %>% + as_epi_df(as_of = 100) + +basic_result_from_size1_mean <- tibble::tribble( + ~geo_value, ~time_value, ~value, ~slide_value, + "a", 1:10, 2L^(1:10), data.table::frollmean(2L^(1:10), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE), +) %>% + tidyr::unchop(c(time_value, value, slide_value)) %>% + dplyr::arrange(time_value) %>% + as_epi_df(as_of = 100) + ## --- These cases generate errors (or not): --- test_that("`before` and `after` are both vectors of length 1", { expect_error( @@ -38,6 +55,15 @@ test_that("`before` and `after` are both vectors of length 1", { epi_slide(grouped, f, before = 1, after = c(0, 1), ref_time_values = d + 3), "`after`.*length-1" ) + + expect_error( + epi_slide_mean(grouped, col_name = "value", before = c(0, 1), after = 0, ref_time_values = d + 3), + "`before`.*length-1" + ) + expect_error( + epi_slide_mean(grouped, col_name = "value", before = 1, after = c(0, 1), ref_time_values = d + 3), + "`after`.*length-1" + ) }) test_that("Test errors/warnings for discouraged features", { @@ -53,10 +79,33 @@ test_that("Test errors/warnings for discouraged features", { epi_slide(grouped, f, after = 0L, ref_time_values = d + 1), "`before` missing, `after==0`" ) + + expect_error( + epi_slide_mean(grouped, col_name = "value", ref_time_values = d + 1), + "Either or both of `before`, `after` must be provided." 
+ ) + expect_warning( + epi_slide_mean(grouped, col_name = "value", before = 0L, ref_time_values = d + 1), + "`before==0`, `after` missing" + ) + expect_warning( + epi_slide_mean(grouped, col_name = "value", after = 0L, ref_time_values = d + 1), + "`before` missing, `after==0`" + ) + # Below cases should raise no errors/warnings: - expect_warning(epi_slide(grouped, f, before = 1L, ref_time_values = d + 2), NA) - expect_warning(epi_slide(grouped, f, after = 1L, ref_time_values = d + 2), NA) - expect_warning(epi_slide(grouped, f, before = 0L, after = 0L, ref_time_values = d + 2), NA) + expect_no_warning(ref1 <- epi_slide(grouped, f, before = 1L, ref_time_values = d + 2)) + expect_no_warning(ref2 <- epi_slide(grouped, f, after = 1L, ref_time_values = d + 2)) + expect_no_warning(ref3 <- epi_slide(grouped, f, before = 0L, after = 0L, ref_time_values = d + 2)) + + expect_no_warning(opt1 <- epi_slide_mean(grouped, col_name = "value", before = 1L, ref_time_values = d + 2, na.rm = TRUE)) + expect_no_warning(opt2 <- epi_slide_mean(grouped, col_name = "value", after = 1L, ref_time_values = d + 2, na.rm = TRUE)) + expect_no_warning(opt3 <- epi_slide_mean(grouped, col_name = "value", before = 0L, after = 0L, ref_time_values = d + 2, na.rm = TRUE)) + + # Results from epi_slide and epi_slide_mean should match + expect_identical(select(ref1, -slide_value_count), opt1) + expect_identical(select(ref2, -slide_value_count), opt2) + expect_identical(select(ref3, -slide_value_count), opt3) }) test_that("Both `before` and `after` must be non-NA, non-negative, integer-compatible", { @@ -88,8 +137,42 @@ test_that("Both `before` and `after` must be non-NA, non-negative, integer-compa epi_slide(grouped, f, before = 1L, after = NA, ref_time_values = d + 2L), "`after`.*non-NA" ) + + expect_error( + epi_slide_mean(grouped, col_name = "value", before = -1L, ref_time_values = d + 2L), + "`before`.*non-negative" + ) + expect_error( + epi_slide_mean(grouped, col_name = "value", before = 2L, after = -1L, ref_time_values = d + 2L), + "`after`.*non-negative" + ) + expect_error(epi_slide_mean(grouped, col_name = "value", before = "a", ref_time_values = d + 2L), + regexp = "before", class = "vctrs_error_incompatible_type" + ) + expect_error(epi_slide_mean(grouped, col_name = "value", before = 1L, after = "a", ref_time_values = d + 2L), + regexp = "after", class = "vctrs_error_incompatible_type" + ) + expect_error(epi_slide_mean(grouped, col_name = "value", before = 0.5, ref_time_values = d + 2L), + regexp = "before", class = "vctrs_error_incompatible_type" + ) + expect_error(epi_slide_mean(grouped, col_name = "value", before = 1L, after = 0.5, ref_time_values = d + 2L), + regexp = "after", class = "vctrs_error_incompatible_type" + ) + expect_error( + epi_slide_mean(grouped, col_name = "value", before = NA, after = 1L, ref_time_values = d + 2L), + "`before`.*non-NA" + ) + expect_error( + epi_slide_mean(grouped, col_name = "value", before = 1L, after = NA, ref_time_values = d + 2L), + "`after`.*non-NA" + ) + # Non-integer-class but integer-compatible values are allowed: - expect_error(epi_slide(grouped, f, before = 1, after = 1, ref_time_values = d + 2L), NA) + expect_no_error(ref <- epi_slide(grouped, f, before = 1, after = 1, ref_time_values = d + 2L)) + expect_no_error(opt <- epi_slide_mean(grouped, col_name = "value", before = 1, after = 1, ref_time_values = d + 2L, na.rm = TRUE)) + + # Results from epi_slide and epi_slide_mean should match + expect_identical(select(ref, -slide_value_count), opt) }) 
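+
+# Note: the helper `f` used above returns both a mean (`slide_value_value`)
+# and a count (`slide_value_count`), while `epi_slide_mean` returns only the
+# mean, so comparisons between the two drop `slide_value_count` first.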
test_that("`ref_time_values` + `before` + `after` that result in no slide data, generate the error", { @@ -101,6 +184,15 @@ test_that("`ref_time_values` + `before` + `after` that result in no slide data, epi_slide(grouped, f, before = 2L, ref_time_values = d + 207L), "All `ref_time_values` must appear in `x\\$time_value`." ) # beyond the last, no data in window + + expect_error( + epi_slide_mean(grouped, col_name = "value", before = 2L, ref_time_values = d), + "All `ref_time_values` must appear in `x\\$time_value`." + ) # before the first, no data in the slide windows + expect_error( + epi_slide_mean(grouped, col_name = "value", before = 2L, ref_time_values = d + 207L), + "All `ref_time_values` must appear in `x\\$time_value`." + ) # beyond the last, no data in window }) test_that("`ref_time_values` + `before` + `after` that have some slide data, but generate the error due to ref. time being out of time range (would also happen if they were in between `time_value`s)", { @@ -112,18 +204,40 @@ test_that("`ref_time_values` + `before` + `after` that have some slide data, but epi_slide(grouped, f, before = 2L, ref_time_values = d + 201L), "All `ref_time_values` must appear in `x\\$time_value`." ) # beyond the last, but still with data in window + + expect_error( + epi_slide_mean(grouped, "value", before = 0L, after = 2L, ref_time_values = d), + "All `ref_time_values` must appear in `x\\$time_value`." + ) # before the first, but we'd expect there to be data in the window + expect_error( + epi_slide_mean(grouped, "value", before = 2L, ref_time_values = d + 201L), + "All `ref_time_values` must appear in `x\\$time_value`." + ) # beyond the last, but still with data in window }) ## --- These cases generate warnings (or not): --- test_that("Warn user against having a blank `before`", { - expect_warning(epi_slide(grouped, f, + expect_no_warning(ref1 <- epi_slide(grouped, f, after = 1L, ref_time_values = d + 1L - ), NA) - expect_warning(epi_slide(grouped, f, + )) + expect_no_warning(ref2 <- epi_slide(grouped, f, before = 0L, after = 1L, ref_time_values = d + 1L - ), NA) + )) + + expect_no_warning(opt1 <- epi_slide_mean(grouped, "value", + after = 1L, + ref_time_values = d + 1L, na.rm = TRUE + )) + expect_no_warning(opt2 <- epi_slide_mean(grouped, "value", + before = 0L, after = 1L, + ref_time_values = d + 1L, na.rm = TRUE + )) + + # Results from epi_slide and epi_slide_mean should match + expect_identical(select(ref1, -slide_value_count), opt1) + expect_identical(select(ref2, -slide_value_count), opt2) }) ## --- These cases doesn't generate the error: --- @@ -140,51 +254,94 @@ test_that("these doesn't produce an error; the error appears only if the ref tim dplyr::select("geo_value", "slide_value_value"), dplyr::tibble(geo_value = c("ak", "al"), slide_value_value = c(2, -2)) ) # not out of range for either group + + expect_identical( + epi_slide_mean(grouped, "value", before = 2L, ref_time_values = d + 200L, na.rm = TRUE) %>% + ungroup() %>% + dplyr::select("geo_value", "slide_value_value"), + dplyr::tibble(geo_value = "ak", slide_value_value = 199) + ) # out of range for one group + expect_identical( + epi_slide_mean(grouped, "value", before = 2L, ref_time_values = d + 3, na.rm = TRUE) %>% + ungroup() %>% + dplyr::select("geo_value", "slide_value_value"), + dplyr::tibble(geo_value = c("ak", "al"), slide_value_value = c(2, -2)) + ) # not out of range for either group }) test_that("computation output formats x as_list_col", { - # See `toy_edf` definition at top of file. 
+ # See `toy_edf` and `basic_result_from_size1_sum` definitions at top of file. # We'll try 7d sum with a few formats. - basic_result_from_size1 <- tibble::tribble( - ~geo_value, ~time_value, ~value, ~slide_value, - "a", 1:10, 2L^(1:10), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE), - "b", 1:10, 2L^(11:20), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE), - ) %>% - tidyr::unchop(c(time_value, value, slide_value)) %>% - dplyr::arrange(time_value) %>% - as_epi_df(as_of = 100) expect_identical( toy_edf %>% epi_slide(before = 6L, ~ sum(.x$value)), - basic_result_from_size1 + basic_result_from_size1_sum ) expect_identical( toy_edf %>% epi_slide(before = 6L, ~ sum(.x$value), as_list_col = TRUE), - basic_result_from_size1 %>% dplyr::mutate(slide_value = as.list(slide_value)) + basic_result_from_size1_sum %>% dplyr::mutate(slide_value = as.list(slide_value)) ) expect_identical( toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value))), - basic_result_from_size1 %>% rename(slide_value_value = slide_value) + basic_result_from_size1_sum %>% rename(slide_value_value = slide_value) ) expect_identical( toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)), as_list_col = TRUE), - basic_result_from_size1 %>% + basic_result_from_size1_sum %>% mutate(slide_value = purrr::map(slide_value, ~ data.frame(value = .x))) ) - # output naming functionality: + + # See `toy_edf` and `basic_result_from_size1_mean` definitions at top of file. + # We'll try 7d avg with a few formats. + # Warning: not exactly the same naming behavior as `epi_slide`. + expect_identical( + toy_edf %>% filter( + geo_value == "a" + ) %>% epi_slide_mean( + "value", before = 6L, na.rm = TRUE + ), + basic_result_from_size1_mean %>% dplyr::mutate( + slide_value_value = slide_value + ) %>% select(-slide_value) + ) + expect_identical( + toy_edf %>% filter( + geo_value == "a" + ) %>% epi_slide_mean( + "value", before = 6L, as_list_col = TRUE, na.rm = TRUE + ), + basic_result_from_size1_mean %>% dplyr::mutate( + slide_value_value = as.list(slide_value) + ) %>% select(-slide_value) + ) + # `epi_slide_mean` doesn't return dataframe columns +}) + +test_that("nested dataframe output names are controllable", { expect_identical( toy_edf %>% epi_slide( before = 6L, ~ data.frame(value = sum(.x$value)), new_col_name = "result" ), - basic_result_from_size1 %>% rename(result_value = slide_value) + basic_result_from_size1_sum %>% rename(result_value = slide_value) ) expect_identical( toy_edf %>% epi_slide( before = 6L, ~ data.frame(value_sum = sum(.x$value)), names_sep = NULL ), - basic_result_from_size1 %>% rename(value_sum = slide_value) + basic_result_from_size1_sum %>% rename(value_sum = slide_value) ) + expect_identical( + toy_edf %>% filter( + geo_value == "a" + ) %>% epi_slide_mean( + "value", before = 6L, names_sep = NULL, na.rm = TRUE + ), + basic_result_from_size1_mean + ) +}) + +test_that("non-size-1 outputs are recycled", { # trying with non-size-1 computation outputs: basic_result_from_size2 <- tibble::tribble( ~geo_value, ~time_value, ~value, ~slide_value, @@ -258,6 +415,37 @@ test_that("`ref_time_values` + `all_rows = TRUE` works", { slide_value, NA_integer_ )) ) + + expect_identical( + toy_edf %>% filter( + geo_value == "a" + ) %>% epi_slide_mean( + "value", before = 6L, names_sep = NULL, na.rm = TRUE + ), + basic_result_from_size1_mean + ) + expect_identical( + toy_edf %>% filter( + geo_value == "a" + ) %>% 
epi_slide_mean( + "value", before = 6L, ref_time_values = c(2L, 8L), + names_sep = NULL, na.rm = TRUE + ), + filter(basic_result_from_size1_mean, time_value %in% c(2L, 8L)) + ) + expect_identical( + toy_edf %>% filter( + geo_value == "a" + ) %>% epi_slide_mean( + "value", before = 6L, ref_time_values = c(2L, 8L), all_rows = TRUE, + names_sep = NULL, na.rm = TRUE + ), + basic_result_from_size1_mean %>% + dplyr::mutate(slide_value = dplyr::if_else(time_value %in% c(2L, 8L), + slide_value, NA_integer_ + )) + ) + # slide computations returning data frames: expect_identical( toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value))), @@ -401,15 +589,26 @@ test_that("basic grouped epi_slide computation produces expected output", { expect_identical(result3, expected_output) }) +test_that("basic grouped epi_slide_mean computation produces expected output", { + expected_output <- dplyr::bind_rows( + dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value = cumsum(11:15) / 1:5), + dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value = cumsum(-(1:5)) / 1:5) + ) %>% + group_by(geo_value) %>% + as_epi_df(as_of = d + 6) + + result1 <- epi_slide_mean(small_x, "value", before = 50, names_sep = NULL, na.rm = TRUE) + expect_identical(result1, expected_output) +}) + test_that("ungrouped epi_slide computation completes successfully", { - expect_error( + expect_no_error( small_x %>% ungroup() %>% epi_slide( before = 2, slide_value = sum(.x$value) - ), - regexp = NA + ) ) }) @@ -449,6 +648,27 @@ test_that("basic ungrouped epi_slide computation produces expected output", { expect_identical(result2, expected_output) }) +test_that("basic ungrouped epi_slide_mean computation produces expected output", { + expected_output <- dplyr::bind_rows( + dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value = cumsum(11:15) / 1:5), + ) %>% + as_epi_df(as_of = d + 6) + + result1 <- small_x %>% + ungroup() %>% + filter(geo_value == "ak") %>% + epi_slide_mean("value", before = 50, names_sep = NULL, na.rm = TRUE) + expect_identical(result1, expected_output) + + # Ungrouped with multiple geos + # epi_slide_mean fails when input data groups contain duplicate time_values, + # e.g. 
aggregating across geos + expect_error( + small_x %>% ungroup() %>% epi_slide_mean("value", before = 6L), + class = "epiprocess__epi_slide_mean__duplicate_time_values" + ) +}) + test_that("epi_slide computation via formula can use ref_time_value", { expected_output <- dplyr::bind_rows( dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value = d + 1:5), @@ -625,7 +845,7 @@ test_that("`epi_slide` can access objects inside of helper functions", { ) }) -test_that("epi_slide basic behavior is correct when groups have non-overlapping date ranges", { +test_that("basic slide behavior is correct when groups have non-overlapping date ranges", { small_x_misaligned_dates <- dplyr::bind_rows( dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15), dplyr::tibble(geo_value = "al", time_value = d + 151:155, value = -(1:5)) @@ -634,14 +854,17 @@ test_that("epi_slide basic behavior is correct when groups have non-overlapping group_by(geo_value) expected_output <- dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value = cumsum(11:15)), - dplyr::tibble(geo_value = "al", time_value = d + 151:155, value = -(1:5), slide_value = cumsum(-(1:5))) + dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value = cumsum(11:15) / 1:5), + dplyr::tibble(geo_value = "al", time_value = d + 151:155, value = -(1:5), slide_value = cumsum(-(1:5)) / 1:5) ) %>% group_by(geo_value) %>% as_epi_df(as_of = d + 6) - result1 <- epi_slide(small_x_misaligned_dates, f = ~ sum(.x$value), before = 50) + result1 <- epi_slide(small_x_misaligned_dates, f = ~ mean(.x$value), before = 50) expect_identical(result1, expected_output) + + result2 <- epi_slide_mean(small_x_misaligned_dates, "value", before = 50, names_sep = NULL, na.rm = TRUE) + expect_identical(result2, expected_output) }) @@ -668,3 +891,107 @@ test_that("epi_slide gets correct ref_time_value when groups have non-overlappin expect_identical(result1, expected_output) }) + +test_that("results for different `before`s and `after`s match between epi_slide and epi_slide_mean", { + +}) + +test_that("results for different time_types match between epi_slide and epi_slide_mean", { + n <- 6 # Max date index + m <- 1 # Number of missing dates + n_obs <- n + 1 - m # Number of obs created + k <- c(0:(n-(m + 1)), n) # Date indices + + # Basic time type + days <- as.Date("2022-01-01") + k + + # Require lubridate::period function to be passed as `time_step` + day_times_minute <- lubridate::ydm_h("2022-01-01-15") + lubridate::minutes(k) # needs time_step = lubridate::minutes + day_times_hour <- lubridate::ydm_h("2022-01-01-15") + lubridate::hours(k) # needs time_step = lubridate::hours + weeks <- as.Date("2022-01-01") + 7 * k # needs time_step = lubridate::weeks + + # Don't require a `time_step` fn + yearweeks <- tsibble::yearweek(10 + k) + yearmonths <- tsibble::yearmonth(10 + k) + yearquarters <- tsibble::yearquarter(10 + k) + years <- 2000 + k # does NOT need time_step = lubridate::years because dates are numeric, not a special date format + + # Not supported + custom_dates <- c( + "January 1, 2022", "January 2, 2022", "January 3, 2022", + "January 4, 2022", "January 5, 2022", "January 6, 2022" + ) + not_dates <- c("a", "b", "c", "d", "e", "f") + + test_time_type_mean <- function (dates, before = 6L, after = 0L, ...) { + set.seed(0) + rand_vals <- rnorm(n_obs) + + # Three states, with 2 variables. a is linear, going up in one state and down in the other + # b is just random. 
some dates are missing + epi_data <- epiprocess::as_epi_df(rbind(tibble( + geo_value = "al", + time_value = dates, + a = 1:n_obs, + b = rand_vals + ), tibble( + geo_value = "ca", + time_value = dates, + a = n_obs:1, + b = rand_vals + 10 + ), tibble( + geo_value = "fl", + time_value = dates, + a = n_obs:1, + b = rand_vals * 2 + ))) %>% + group_by(geo_value) + + result1 <- epi_slide(epi_data, ~ data.frame( + slide_value_a = mean(.x$a, na.rm = TRUE), + slide_value_b = mean(.x$b, na.rm = TRUE) + ), + before = before, after = after, names_sep = NULL, ...) + result2 <- epi_slide_mean(epi_data, + col_name = c("a", "b"), na.rm = TRUE, + before = before, after = after, ...) + expect_identical(result2, result2) + + ref_epi_data <- epiprocess::as_epi_df(rbind(tibble( + geo_value = "al", + time_value = days, + a = 1:n_obs, + b = rand_vals + ), tibble( + geo_value = "ca", + time_value = days, + a = n_obs:1, + b = rand_vals + 10 + ), tibble( + geo_value = "fl", + time_value = days, + a = n_obs:1, + b = rand_vals * 2 + ))) %>% + group_by(geo_value) + + ref_result <- epi_slide(ref_epi_data, ~ data.frame( + slide_value_a = mean(.x$a, na.rm = TRUE), + slide_value_b = mean(.x$b, na.rm = TRUE) + ), + before = before, after = after, names_sep = NULL) + + # All fields except dates + expect_identical(select(ref_result, -time_value), select(result1, -time_value)) + expect_identical(select(ref_result, -time_value), select(result2, -time_value)) + } + + test_time_type_mean(days) + test_time_type_mean(yearweeks) + test_time_type_mean(yearmonths) + test_time_type_mean(yearquarters) + test_time_type_mean(years) + test_time_type_mean(day_times_minute, time_step = lubridate::minutes) + test_time_type_mean(day_times_hour, time_step = lubridate::hours) + test_time_type_mean(weeks, time_step = lubridate::weeks) +}) \ No newline at end of file From 03a15779378a814d081efac23216c77fcfa5e0d4 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 1 Feb 2024 09:50:54 -0500 Subject: [PATCH 145/345] test unmappable types --- R/slide.R | 4 +- tests/testthat/test-epi_slide.R | 93 +++++++++++++++++++++++---------- 2 files changed, 68 insertions(+), 29 deletions(-) diff --git a/R/slide.R b/R/slide.R index 18bd5167..4cb2a26c 100644 --- a/R/slide.R +++ b/R/slide.R @@ -612,7 +612,9 @@ epi_slide_mean = function(x, col_name, ..., before, after, ref_time_values, type valid for creating a date sequence.", "i" = c("The input data's `time_type` was probably `custom` or `day-time`. These require also passing a `time_step` function.") - ) + ), + class = "epiprocess__epi_slide_mean__unmappable_time_type", + epiprocess__time_type = ttype, ) } diff --git a/tests/testthat/test-epi_slide.R b/tests/testthat/test-epi_slide.R index f4eb428d..d807dd25 100644 --- a/tests/testthat/test-epi_slide.R +++ b/tests/testthat/test-epi_slide.R @@ -923,10 +923,34 @@ test_that("results for different time_types match between epi_slid ) not_dates <- c("a", "b", "c", "d", "e", "f") - test_time_type_mean <- function (dates, before = 6L, after = 0L, ...)
{ - set.seed(0) - rand_vals <- rnorm(n_obs) + set.seed(0) + rand_vals <- rnorm(n_obs) + + ref_epi_data <- epiprocess::as_epi_df(rbind(tibble( + geo_value = "al", + time_value = days, + a = 1:n_obs, + b = rand_vals + ), tibble( + geo_value = "ca", + time_value = days, + a = n_obs:1, + b = rand_vals + 10 + ), tibble( + geo_value = "fl", + time_value = days, + a = n_obs:1, + b = rand_vals * 2 + ))) %>% + group_by(geo_value) + + ref_result <- epi_slide(ref_epi_data, ~ data.frame( + slide_value_a = mean(.x$a, na.rm = TRUE), + slide_value_b = mean(.x$b, na.rm = TRUE) + ), + before = before, after = after, names_sep = NULL) + test_time_type_mean <- function (dates, before = 6L, after = 0L, ...) { # Three states, with 2 variables. a is linear, going up in one state and down in the other # b is just random. some dates are missing epi_data <- epiprocess::as_epi_df(rbind(tibble( @@ -957,30 +981,6 @@ test_that("results for different time_types match between epi_slid before = before, after = after, ...) expect_identical(result2, result2) - ref_epi_data <- epiprocess::as_epi_df(rbind(tibble( - geo_value = "al", - time_value = days, - a = 1:n_obs, - b = rand_vals - ), tibble( - geo_value = "ca", - time_value = days, - a = n_obs:1, - b = rand_vals + 10 - ), tibble( - geo_value = "fl", - time_value = days, - a = n_obs:1, - b = rand_vals * 2 - ))) %>% - group_by(geo_value) - - ref_result <- epi_slide(ref_epi_data, ~ data.frame( - slide_value_a = mean(.x$a, na.rm = TRUE), - slide_value_b = mean(.x$b, na.rm = TRUE) - ), - before = before, after = after, names_sep = NULL) - # All fields except dates expect_identical(select(ref_result, -time_value), select(result1, -time_value)) expect_identical(select(ref_result, -time_value), select(result2, -time_value)) @@ -994,4 +994,41 @@ test_that("results for different time_types match between epi_slid test_time_type_mean(day_times_minute, time_step = lubridate::minutes) test_time_type_mean(day_times_hour, time_step = lubridate::hours) test_time_type_mean(weeks, time_step = lubridate::weeks) -}) \ No newline at end of file +}) + +test_that("results for different time_types match between epi_slide and epi_slide_mean", { + n_obs <- 6 + k <- 1:n_obs + + day_times_minute <- lubridate::ydm_h("2022-01-01-15") + lubridate::minutes(k) # needs time_step = lubridate::minutes + day_times_hour <- lubridate::ydm_h("2022-01-01-15") + lubridate::hours(k) # needs time_step = lubridate::hours + weeks <- as.Date("2022-01-01") + 7 * k # needs time_step = lubridate::weeks + + # Not supported + custom_dates <- c( + "January 1, 2022", "January 2, 2022", "January 3, 2022", + "January 4, 2022", "January 5, 2022", "January 6, 2022" + ) + not_dates <- c("a", "b", "c", "d", "e", "f") + + test_time_type_mean <- function (dates, before = 6L, after = 0L, ...)
{ + epi_data <- epiprocess::as_epi_df(tibble( + geo_value = "al", + time_value = dates, + a = 1:n_obs + )) + + expect_error(epi_slide_mean(epi_data, + col_name = c("a", "b"), + before = before, after = after, ...), + class = "epiprocess__epi_slide_mean__unmappable_time_type" + ) + + } + + test_time_type_mean(custom_dates) + test_time_type_mean(not_dates) + test_time_type_mean(day_times_minute) + test_time_type_mean(day_times_hour) + test_time_type_mean(weeks) +}) From a24387983c98392d739e8276e673a42b68f8ad00 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 1 Feb 2024 10:11:50 -0500 Subject: [PATCH 146/345] compare different before/after results --- R/slide.R | 2 +- tests/testthat/test-epi_slide.R | 81 ++++++++++++++++++++++++++++++--- 2 files changed, 75 insertions(+), 8 deletions(-) diff --git a/R/slide.R b/R/slide.R index 4cb2a26c..354e7bc1 100644 --- a/R/slide.R +++ b/R/slide.R @@ -614,7 +614,7 @@ epi_slide_mean = function(x, col_name, ..., before, after, ref_time_values, These require also passing a `time_step` function.") ), class = "epiprocess__epi_slide_mean__unmappable_time_type", - epiprocess__time_type = ttype, + epiprocess__time_type = ttype ) } diff --git a/tests/testthat/test-epi_slide.R b/tests/testthat/test-epi_slide.R index d807dd25..444f3ad4 100644 --- a/tests/testthat/test-epi_slide.R +++ b/tests/testthat/test-epi_slide.R @@ -893,7 +893,69 @@ test_that("epi_slide gets correct ref_time_value when groups have non-overlappin }) test_that("results for different `before`s and `after`s match between epi_slide and epi_slide_mean", { + # 3 missing dates + n <- 15 # Max date index + m <- 3 # Number of missing dates + n_obs <- n + 1 - m # Number of obs created + k <- c(0:(n-(m + 1)), n) # Date indices + + # Basic time type + days <- as.Date("2022-01-01") + k + set.seed(0) + rand_vals <- rnorm(n_obs) + + test_time_type_mean <- function (dates, vals, before = 6L, after = 0L, ...) { + # Two states, with 2 variables. a is linear, going up in one state and down in the other + # b is just random. some dates are missing + epi_data <- epiprocess::as_epi_df(rbind(tibble( + geo_value = "al", + time_value = dates, + a = 1:n_obs, + b = vals + ), tibble( + geo_value = "ca", + time_value = dates, + a = n_obs:1, + b = vals + 10 + ))) %>% + group_by(geo_value) + + # Use the `epi_slide` result as a reference. + result1 <- epi_slide(epi_data, ~ data.frame( + slide_value_a = mean(.x$a, na.rm = TRUE), + slide_value_b = mean(.x$b, na.rm = TRUE) + ), + before = before, after = after, names_sep = NULL, ...) + result2 <- epi_slide_mean(epi_data, + col_name = c("a", "b"), na.rm = TRUE, + before = before, after = after, ...)
+ expect_identical(result1, result2) + } + + test_time_type_mean(days, rand_vals, before = 6, after = 0) + test_time_type_mean(days, rand_vals, before = 6, after = 1) + test_time_type_mean(days, rand_vals, before = 6, after = 6) + test_time_type_mean(days, rand_vals, before = 1, after = 6) + test_time_type_mean(days, rand_vals, before = 0, after = 6) + test_time_type_mean(days, rand_vals, before = 0, after = 1) + + # Without any missing dates + n <- 15 # Max date index + m <- 0 # Number of missing dates + n_obs <- n + 1 - m # Number of obs created + k <- c(0:(n-(m + 1)), n) # Date indices + + # Basic time type + days <- as.Date("2022-01-01") + k + rand_vals <- rnorm(n_obs) + + test_time_type_mean(days, rand_vals, before = 6, after = 0) + test_time_type_mean(days, rand_vals, before = 6, after = 1) + test_time_type_mean(days, rand_vals, before = 6, after = 6) + test_time_type_mean(days, rand_vals, before = 1, after = 6) + test_time_type_mean(days, rand_vals, before = 0, after = 6) + test_time_type_mean(days, rand_vals, before = 0, after = 1) }) test_that("results for different time_types match between epi_slide and epi_slide_mean", { @@ -948,7 +1010,7 @@ test_that("results for different time_types match between epi_slid slide_value_a = mean(.x$a, na.rm = TRUE), slide_value_b = mean(.x$b, na.rm = TRUE) ), - before = before, after = after, names_sep = NULL) + before = 6L, after = 0L, names_sep = NULL) test_time_type_mean <- function (dates, before = 6L, after = 0L, ...) { # Three states, with 2 variables. a is linear, going up in one state and down in the other # b is just random. some dates are missing @@ -979,7 +1041,7 @@ test_that("results for different time_types match between epi_slid result2 <- epi_slide_mean(epi_data, col_name = c("a", "b"), na.rm = TRUE, before = before, after = after, ...) - expect_identical(result2, result2) + expect_identical(result1, result2) @@ -996,7 +1058,7 @@ test_that("results for different time_types match between epi_slid test_time_type_mean(weeks, time_step = lubridate::weeks) }) -test_that("results for different time_types match between epi_slide and epi_slide_mean", { +test_that("special time_types without time_step fail in epi_slide_mean", { n_obs <- 6 k <- 1:n_obs @@ -1018,9 +1080,10 @@ test_that("results for different time_types match between epi_slid a = 1:n_obs )) - expect_error(epi_slide_mean(epi_data, - col_name = c("a", "b"), - before = before, after = after, ...), + expect_error( + epi_slide_mean(epi_data, col_name = "a", + before = before, after = after + ), class = "epiprocess__epi_slide_mean__unmappable_time_type" ) @@ -1030,5 +1093,9 @@ test_that("results for different time_types match between epi_slid test_time_type_mean(not_dates) test_time_type_mean(day_times_minute) test_time_type_mean(day_times_hour) - test_time_type_mean(weeks) + # Currently doesn't throw the expected error, and returns an incorrect + # result. This is because the week dates are stored as Dates, so + # guess_time_type thinks this is "day" type, and the default step size is 1 + # day.
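+ # A minimal sketch of the misclassification (illustrative values; this
+ # assumes the internal helper keeps its current name):
+ #   weeks <- as.Date("2022-01-01") + 7 * 0:5
+ #   guess_time_type(weeks) # returns "day", so a 1-day step is assumed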
+ # test_time_type_mean(weeks) }) From 51eb338c0141ae915d5d26c5b74ab2bfa2dec8be Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Thu, 1 Feb 2024 13:01:14 -0800 Subject: [PATCH 147/345] lint: copy .lintr from epidatr --- .lintr | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 .lintr diff --git a/.lintr b/.lintr new file mode 100644 index 00000000..c7c90554 --- /dev/null +++ b/.lintr @@ -0,0 +1,9 @@ +linters: linters_with_defaults( + line_length_linter(120), + cyclocomp_linter = NULL, + object_length_linter(length = 40L) + ) +exclusions: list( + "renv", + "venv" + ) From a53e98154327a6f4e9583f2c6c9d4b03269f8e54 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Tue, 6 Feb 2024 09:38:39 -0800 Subject: [PATCH 148/345] roxygen note --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index c014ad78..7f36ad7c 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -67,7 +67,7 @@ Config/testthat/edition: 3 Encoding: UTF-8 LazyData: true Roxygen: list(markdown = TRUE) -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.1 Depends: R (>= 2.10) URL: https://cmu-delphi.github.io/epiprocess/ From ab96bad5b9ca67ee02aa0e49d630f6795897bf25 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Tue, 6 Feb 2024 09:45:38 -0800 Subject: [PATCH 149/345] redocument --- NAMESPACE | 1 + man/epiprocess.Rd | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/NAMESPACE b/NAMESPACE index 23140464..5a1ddfa0 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -72,6 +72,7 @@ importFrom(checkmate,anyInfinite) importFrom(checkmate,assert) importFrom(checkmate,assert_character) importFrom(checkmate,assert_int) +importFrom(cli,cli_inform) importFrom(data.table,":=") importFrom(data.table,address) importFrom(data.table,as.data.table) diff --git a/man/epiprocess.Rd b/man/epiprocess.Rd index 9d501aeb..7c3ecd8a 100644 --- a/man/epiprocess.Rd +++ b/man/epiprocess.Rd @@ -2,6 +2,7 @@ % Please edit documentation in R/epiprocess.R \docType{package} \name{epiprocess} +\alias{epiprocess-package} \alias{epiprocess} \title{epiprocess: Tools for basic signal processing in epidemiology} \description{ @@ -9,3 +10,36 @@ This package introduces a common data structure for epidemiological data sets measured over space and time, and offers associated utilities to perform basic signal processing tasks. } +\seealso{ +Useful links: +\itemize{ + \item \url{https://cmu-delphi.github.io/epiprocess/} +} + +} +\author{ +\strong{Maintainer}: Ryan Tibshirani \email{ryantibs@cmu.edu} + +Authors: +\itemize{ + \item Logan Brooks + \item Daniel McDonald + \item Evan Ray +} + +Other contributors: +\itemize{ + \item Jacob Bien [contributor] + \item Rafael Catoia [contributor] + \item Nat DeFries [contributor] + \item Rachel Lobay [contributor] + \item Ken Mawer [contributor] + \item Chloe You [contributor] + \item Quang Nguyen [contributor] + \item Dmitry Shemetov [contributor] + \item Lionel Henry (Author of included rlang fragments) [contributor] + \item Hadley Wickham (Author of included rlang fragments) [contributor] + \item Posit (Copyright holder of included rlang fragments) [copyright holder] +} + +} From c84d67859fc07abd830011b997f57b596fff08f3 Mon Sep 17 00:00:00 2001 From: "Daniel J. 
McDonald" Date: Tue, 6 Feb 2024 09:46:49 -0800 Subject: [PATCH 150/345] fix: follow instructions for @docType deprecated error --- R/epiprocess.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/R/epiprocess.R b/R/epiprocess.R index 79f9635d..254ebd01 100644 --- a/R/epiprocess.R +++ b/R/epiprocess.R @@ -5,7 +5,6 @@ #' basic signal processing tasks. #' #' @importFrom checkmate assert assert_character assert_int anyInfinite -#' @docType package #' @name epiprocess -NULL +"_PACKAGE" utils::globalVariables(c(".x", ".group_key", ".ref_time_value")) From c897c69ca46abffccbf96bc4366a06531d716f7d Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Wed, 31 Jan 2024 13:39:08 -0800 Subject: [PATCH 151/345] refactor: error messages * swap rlang for cli in Abort and Warn * remove Abort and Warn plugins * format long error message lines * replace and remove break_str * simplify validate_version_bound with checkmate * use checkmate for validation where possible * update tests --- NAMESPACE | 14 ++ R/archive.R | 210 ++++++++----------- R/correlation.R | 7 +- R/data.R | 2 +- R/epi_df.R | 19 +- R/epiprocess.R | 6 +- R/grouped_epi_archive.R | 86 +++----- R/growth_rate.R | 10 +- R/methods-epi_archive.R | 64 +++--- R/methods-epi_df.R | 4 +- R/outliers.R | 10 +- R/slide.R | 50 ++--- R/utils.R | 92 ++++---- tests/testthat/test-archive-version-bounds.R | 67 ++---- tests/testthat/test-archive.R | 12 +- tests/testthat/test-data.R | 4 +- tests/testthat/test-epi_df.R | 2 +- tests/testthat/test-epi_slide.R | 20 +- tests/testthat/test-epix_slide.R | 4 +- tests/testthat/test-grouped_epi_archive.R | 2 +- tests/testthat/test-utils.R | 9 - 21 files changed, 291 insertions(+), 403 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 5a1ddfa0..ef55f68c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -69,10 +69,24 @@ export(ungroup) export(unnest) importFrom(R6,R6Class) importFrom(checkmate,anyInfinite) +importFrom(checkmate,anyMissing) importFrom(checkmate,assert) importFrom(checkmate,assert_character) +importFrom(checkmate,assert_class) +importFrom(checkmate,assert_data_frame) importFrom(checkmate,assert_int) +importFrom(checkmate,assert_list) +importFrom(checkmate,assert_logical) +importFrom(checkmate,assert_numeric) +importFrom(checkmate,assert_scalar) +importFrom(checkmate,check_atomic) +importFrom(checkmate,check_data_frame) +importFrom(checkmate,test_set_equal) +importFrom(checkmate,test_subset) +importFrom(checkmate,vname) +importFrom(cli,cli_abort) importFrom(cli,cli_inform) +importFrom(cli,cli_warn) importFrom(data.table,":=") importFrom(data.table,address) importFrom(data.table,as.data.table) diff --git a/R/archive.R b/R/archive.R index 08540955..428cce76 100644 --- a/R/archive.R +++ b/R/archive.R @@ -14,7 +14,7 @@ #' @param version_bound the version bound to validate #' @param x a data frame containing a version column with which to check #' compatibility -#' @param na_ok Boolean; is `NULL` an acceptable "bound"? (If so, `NULL` will +#' @param na_ok Boolean; is `NA` an acceptable "bound"? (If so, `NA` will #' have a special context-dependent meaning.) 
#' @param version_bound_arg optional string; what to call the version bound in #' error messages @@ -22,47 +22,31 @@ #' @section Side effects: raises an error if version bound appears invalid #' #' @noRd -validate_version_bound <- function(version_bound, x, na_ok, +validate_version_bound <- function(version_bound, x, na_ok = FALSE, version_bound_arg = rlang::caller_arg(version_bound), x_arg = rlang::caller_arg(version_bound)) { - # We might want some (optional?) validation here to detect internal bugs. - if (length(version_bound) != 1L) { - # Check for length-1-ness fairly early so we don't have to worry as much - # about our `if`s receiving non-length-1 "Boolean"s. - Abort( - sprintf( - "`version_bound` must have length 1, but instead was length %d", - length(version_bound) - ), - class = sprintf("epiprocess__%s_is_not_length_1", version_bound_arg) + if (is.null(version_bound)) { + cli_abort( + "{version_bound_arg} cannot be NULL" ) - } else if (is.na(version_bound)) { - # Check for NA before class&type, as any-class&type NA should be fine for - # our purposes, and some version classes&types might not have their own NA - # value to pass in. - if (na_ok) { - # Looks like a valid version bound; exit without error. - return(invisible(NULL)) - } else { - Abort(sprintf( - "`%s` must not satisfy `is.na` (NAs are not allowed for this kind of version bound)", - version_bound_arg - ), class = sprintf("epiprocess__%s_is_na", version_bound_arg)) - } - } else if (!identical(class(version_bound), class(x[["version"]])) || - !identical(typeof(version_bound), typeof(x[["version"]]))) { - Abort(sprintf( - "`class(%1$s)` must be identical to `class(%2$s)` and `typeof(%1$s)` must be identical to `typeof(%2$s)`", - version_bound_arg, - # '{x_arg}[["version"]]' except adding parentheses if needed: - rlang::expr_deparse(rlang::new_call( - quote(`[[`), rlang::pairlist2(rlang::parse_expr(x_arg), "version") - )) - ), class = sprintf("epiprocess__%s_has_invalid_class_or_typeof", version_bound_arg)) - } else { - # Looks like a valid version bound; exit without error. 
+ } + if (na_ok && is.na(version_bound)) { return(invisible(NULL)) } + if (!test_set_equal(class(version_bound), class(x[["version"]]))) { + cli_abort( + "{version_bound_arg} must have the same classes as x$version, + which is {class(x$version)}", + ) + } + if (!test_set_equal(typeof(version_bound), typeof(x[["version"]]))) { + cli_abort( + "{version_bound_arg} must have the same types as x$version, + which is {typeof(x$version)}", + ) + } + + return(invisible(NULL)) } #' `max(x$version)`, with error if `x` has 0 rows @@ -77,13 +61,18 @@ validate_version_bound <- function(version_bound, x, na_ok, #' @export max_version_with_row_in <- function(x) { if (nrow(x) == 0L) { - Abort(sprintf("`nrow(x)==0L`, representing a data set history with no row up through the latest observed version, but we don't have a sensible guess at what version that is, or whether any of the empty versions might be clobbered in the future; if we use `x` to form an `epi_archive`, then `clobberable_versions_start` and `versions_end` must be manually specified."), + cli_abort( + "`nrow(x)==0L`, representing a data set history with no row up through the + latest observed version, but we don't have a sensible guess at what version + that is, or whether any of the empty versions might be clobbered in the + future; if we use `x` to form an `epi_archive`, then + `clobberable_versions_start` and `versions_end` must be manually specified.", class = "epiprocess__max_version_cannot_be_used" ) } else { version_col <- purrr::pluck(x, "version") # error not NULL if doesn't exist if (anyNA(version_col)) { - Abort("version values cannot be NA", + cli_abort("version values cannot be NA", class = "epiprocess__version_values_must_not_be_na" ) } else { @@ -278,26 +267,15 @@ epi_archive <- initialize = function(x, geo_type, time_type, other_keys, additional_metadata, compactify, clobberable_versions_start, versions_end) { - # Check that we have a data frame - if (!is.data.frame(x)) { - Abort("`x` must be a data frame.") - } - - # Check that we have geo_value, time_value, version columns - if (!("geo_value" %in% names(x))) { - Abort("`x` must contain a `geo_value` column.") - } - if (!("time_value" %in% names(x))) { - Abort("`x` must contain a `time_value` column.") - } - if (!("version" %in% names(x))) { - Abort("`x` must contain a `version` column.") - } - if (anyNA(x$version)) { - Abort("`x$version` must not contain `NA`s", - class = "epiprocess__version_values_must_not_be_na" + assert_data_frame(x) + if (!test_subset(c("geo_value", "time_value", "version"), names(x))) { + cli_abort( + "Columns `geo_value`, `time_value`, and `version` must be present in `x`." 
) } + if (anyMissing(x$version)) { + cli_abort("Column `version` must not contain missing values.") + } # If geo type is missing, then try to guess it if (missing(geo_type)) { @@ -312,24 +290,21 @@ epi_archive <- # Finish off with small checks on keys variables and metadata if (missing(other_keys)) other_keys <- NULL if (missing(additional_metadata)) additional_metadata <- list() - if (!all(other_keys %in% names(x))) { - Abort("`other_keys` must be contained in the column names of `x`.") + if (!test_subset(other_keys, names(x))) { + cli_abort("`other_keys` must be contained in the column names of `x`.") } if (any(c("geo_value", "time_value", "version") %in% other_keys)) { - Abort("`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\".") + cli_abort("`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\".") } - if (any(names(additional_metadata) %in% - c("geo_type", "time_type"))) { - Warn("`additional_metadata` names overlap with existing metadata fields \"geo_type\", \"time_type\".") + if (any(names(additional_metadata) %in% c("geo_type", "time_type"))) { + cli_warn("`additional_metadata` names overlap with existing metadata fields \"geo_type\", \"time_type\".") } # Conduct checks and apply defaults for `compactify` if (missing(compactify)) { compactify <- NULL - } else if (!rlang::is_bool(compactify) && - !rlang::is_null(compactify)) { - Abort("compactify must be boolean or null.") } + assert_logical(compactify, len = 1, null.ok = TRUE) # Apply defaults and conduct checks for # `clobberable_versions_start`, `versions_end`: @@ -342,7 +317,7 @@ epi_archive <- validate_version_bound(clobberable_versions_start, x, na_ok = TRUE) validate_version_bound(versions_end, x, na_ok = FALSE) if (nrow(x) > 0L && versions_end < max(x[["version"]])) { - Abort( + cli_abort( sprintf( "`versions_end` was %s, but `x` contained updates for a later version or versions, up through %s", @@ -352,7 +327,7 @@ epi_archive <- ) } if (!is.na(clobberable_versions_start) && clobberable_versions_start > versions_end) { - Abort( + cli_abort( sprintf( "`versions_end` was %s, but a `clobberable_versions_start` of %s indicated that there were later observed versions", @@ -373,7 +348,11 @@ epi_archive <- maybe_first_duplicate_key_row_index <- anyDuplicated(DT, by = key(DT)) if (maybe_first_duplicate_key_row_index != 0L) { - Abort("`x` must have one row per unique combination of the key variables. If you have additional key variables other than `geo_value`, `time_value`, and `version`, such as an age group column, please specify them in `other_keys`. Otherwise, check for duplicate rows and/or conflicting values for the same measurement.", + cli_abort("`x` must have one row per unique combination of the key variables. If you + have additional key variables other than `geo_value`, `time_value`, and + `version`, such as an age group column, please specify them in `other_keys`. 
+ Otherwise, check for duplicate rows and/or conflicting values for the same + measurement.", class = "epiprocess__epi_archive_requires_unique_key" ) } @@ -410,24 +389,22 @@ epi_archive <- # Warns about redundant rows if (is.null(compactify) && nrow(elim) > 0) { - warning_intro <- break_str(paste( - "Found rows that appear redundant based on", - "last (version of each) observation carried forward;", - 'these rows have been removed to "compactify" and save space:' - )) - + warning_intro <- cli::format_inline( + "Found rows that appear redundant based on + last (version of each) observation carried forward; + these rows have been removed to 'compactify' and save space:", + keep_whitespace = FALSE + ) warning_data <- paste(collapse = "\n", capture.output(print(elim, topn = 3L, nrows = 7L))) - - warning_outro <- break_str(paste( - "Built-in `epi_archive` functionality should be unaffected,", - "but results may change if you work directly with its fields (such as `DT`).", - "See `?as_epi_archive` for details.", - "To silence this warning but keep compactification,", - "you can pass `compactify=TRUE` when constructing the archive." - )) - + warning_outro <- cli::format_inline( + "Built-in `epi_archive` functionality should be unaffected, + but results may change if you work directly with its fields (such as `DT`). + See `?as_epi_archive` for details. + To silence this warning but keep compactification, + you can pass `compactify=TRUE` when constructing the archive.", + keep_whitespace = FALSE + ) warning_message <- paste(sep = "\n", warning_intro, warning_data, warning_outro) - rlang::warn(warning_message, class = "epiprocess__compactify_default_removed_rows") } @@ -447,8 +424,8 @@ epi_archive <- print = function(class = TRUE, methods = TRUE) { cli_inform( c( - ">" = if (class) {"An `epi_archive` object, with metadata:"}, - "i" = if (length(setdiff(key(self$DT), c('geo_value', 'time_value', 'version'))) > 0) { + ">" = if (class) "An `epi_archive` object, with metadata:", + "i" = if (length(setdiff(key(self$DT), c("geo_value", "time_value", "version"))) > 0) { "Non-standard DT keys: {setdiff(key(self$DT), c('geo_value', 'time_value', 'version'))}" }, "i" = "Min/max time values: {min(self$DT$time_value)} / {max(self$DT$time_value)}", @@ -457,12 +434,12 @@ epi_archive <- "Clobberable versions start: {self$clobberable_versions_start}" }, "i" = "Versions end: {self$versions_end}", - "i" = if (methods) {"Public R6 methods: {names(epi_archive$public_methods)}"}, + "i" = if (methods) "Public R6 methods: {names(epi_archive$public_methods)}", "i" = "A preview of the table ({nrow(self$DT)} rows x {ncol(self$DT)} columns):" ) ) - return(invisible(self$DT %>% print)) + return(invisible(self$DT %>% print())) }, ##### #' @description Generates a snapshot in `epi_df` format as of a given version. @@ -493,24 +470,28 @@ epi_archive <- if (length(other_keys) == 0) other_keys <- NULL # Check a few things on max_version - if (!identical(class(max_version), class(self$DT$version)) || - !identical(typeof(max_version), typeof(self$DT$version))) { - Abort("`max_version` and `DT$version` must have same `class` and `typeof`.") - } - if (length(max_version) != 1) { - Abort("`max_version` cannot be a vector.") + if (!test_set_equal(class(max_version), class(self$DT$version))) { + cli_abort( + "`max_version` must have the same classes as `self$DT$version`." 
+ ) } - if (is.na(max_version)) { - Abort("`max_version` must not be NA.") + if (!test_set_equal(typeof(max_version), typeof(self$DT$version))) { + cli_abort( + "`max_version` must have the same types as `self$DT$version`." + ) } + assert_scalar(max_version, na.ok = FALSE) if (max_version > self$versions_end) { - Abort("`max_version` must be at most `self$versions_end`.") - } - if (!rlang::is_bool(all_versions)) { - Abort("`all_versions` must be TRUE or FALSE.") + cli_abort("`max_version` must be at most `self$versions_end`.") } + assert_logical(all_versions, len = 1) if (!is.na(self$clobberable_versions_start) && max_version >= self$clobberable_versions_start) { - Warn('Getting data as of some recent version which could still be overwritten (under routine circumstances) without assigning a new version number (a.k.a. "clobbered"). Thus, the snapshot that we produce here should not be expected to be reproducible later. See `?epi_archive` for more info and `?epix_as_of` on how to muffle.', + cli_warn( + 'Getting data as of some recent version which could still be + overwritten (under routine circumstances) without assigning a new + version number (a.k.a. "clobbered"). Thus, the snapshot that we + produce here should not be expected to be reproducible later. See + `?epi_archive` for more info and `?epix_as_of` on how to muffle.', class = "epiprocess__snapshot_as_of_clobberable_version" ) } @@ -526,8 +507,7 @@ epi_archive <- return( # Make sure to use data.table ways of filtering and selecting - self$DT[time_value >= min_time_value & - version <= max_version, ] %>% + self$DT[time_value >= min_time_value & version <= max_version, ] %>% unique( by = c("geo_value", "time_value", other_keys), fromLast = TRUE @@ -573,7 +553,7 @@ epi_archive <- nonkey_cols <- setdiff(names(self$DT), key(self$DT)) next_version_tag <- next_after(self$versions_end) if (next_version_tag > fill_versions_end) { - Abort(sprintf(paste( + cli_abort(sprintf(paste( "Apparent problem with `next_after` method:", "archive contained observations through version %s", "and the next possible version was supposed to be %s,", @@ -621,25 +601,21 @@ epi_archive <- #' @param x as in [`epix_truncate_versions_after`] #' @param max_version as in [`epix_truncate_versions_after`] truncate_versions_after = function(max_version) { - if (length(max_version) != 1) { - Abort("`max_version` cannot be a vector.") - } - if (is.na(max_version)) { - Abort("`max_version` must not be NA.") + if (!test_set_equal(class(max_version), class(self$DT$version))) { + cli_abort("`max_version` must have the same classes as `self$DT$version`.") } - if (!identical(class(max_version), class(self$DT$version)) || - !identical(typeof(max_version), typeof(self$DT$version))) { - Abort("`max_version` and `DT$version` must have same `class` and `typeof`.") + if (!test_set_equal(typeof(max_version), typeof(self$DT$version))) { + cli_abort("`max_version` must have the same types as `self$DT$version`.") } + assert_scalar(max_version, na.ok = FALSE) if (max_version > self$versions_end) { - Abort("`max_version` must be at most `self$versions_end`.") + cli_abort("`max_version` must be at most `self$versions_end`.") } self$DT <- self$DT[self$DT$version <= max_version, colnames(self$DT), with = FALSE] # (^ this filter operation seems to always copy the DT, even if it # keeps every entry; we don't guarantee this behavior in # documentation, though, so we could change to alias in this case) - if (!is.na(self$clobberable_versions_start) && - self$clobberable_versions_start > 
max_version) { + if (!is.na(self$clobberable_versions_start) && self$clobberable_versions_start > max_version) { self$clobberable_versions_start <- NA } self$versions_end <- max_version @@ -662,7 +638,7 @@ epi_archive <- ) if (length(epi_archive$private_fields) != 0L) { - Abort("expected no private fields in epi_archive", + cli_abort("expected no private fields in epi_archive", internal = TRUE ) } diff --git a/R/correlation.R b/R/correlation.R index a4a56d1e..e4272fdd 100644 --- a/R/correlation.R +++ b/R/correlation.R @@ -78,12 +78,11 @@ epi_cor <- function(x, var1, var2, dt1 = 0, dt2 = 0, shift_by = geo_value, cor_by = geo_value, use = "na.or.complete", method = c("pearson", "kendall", "spearman")) { - # Check we have an `epi_df` object - if (!inherits(x, "epi_df")) Abort("`x` must be of class `epi_df`.") + assert_class(x, "epi_df") # Check that we have variables to do computations on - if (missing(var1)) Abort("`var1` must be specified.") - if (missing(var2)) Abort("`var2` must be specified.") + if (missing(var1)) cli_abort("`var1` must be specified.") + if (missing(var2)) cli_abort("`var2` must be specified.") var1 <- enquo(var1) var2 <- enquo(var2) diff --git a/R/data.R b/R/data.R index c528039c..2a5e5738 100644 --- a/R/data.R +++ b/R/data.R @@ -144,7 +144,7 @@ delayed_assign_with_unregister_awareness <- function(x, value, # all.) rlang::eval_bare(rlang::quo_get_expr(value_quosure), rlang::quo_get_env(value_quosure)), error = function(err) { - Abort( + cli_abort( paste( "An error was raised while attempting to evaluate a promise", "(prepared with `delayed_assign_with_unregister_awareness`)", diff --git a/R/epi_df.R b/R/epi_df.R index 91e6c9d9..1c648ff8 100644 --- a/R/epi_df.R +++ b/R/epi_df.R @@ -114,14 +114,9 @@ NULL #' @export new_epi_df <- function(x = tibble::tibble(), geo_type, time_type, as_of, additional_metadata = list(), ...) { - # Check that we have a data frame - if (!is.data.frame(x)) { - Abort("`x` must be a data frame.") - } + assert_data_frame(x) + assert_list(additional_metadata) - if (!is.list(additional_metadata)) { - Abort("`additional_metadata` must be a list type.") - } if (is.null(additional_metadata[["other_keys"]])) { additional_metadata[["other_keys"]] <- character(0L) } @@ -302,13 +297,9 @@ as_epi_df.epi_df <- function(x, ...) { #' @export as_epi_df.tbl_df <- function(x, geo_type, time_type, as_of, additional_metadata = list(), ...) { - # Check that we have geo_value and time_value columns - if (!("geo_value" %in% names(x))) { - Abort("`x` must contain a `geo_value` column.") - } - if (!("time_value" %in% names(x))) { - Abort("`x` must contain a `time_value` column.") - } + if (!test_subset(c("geo_value", "time_value"), names(x))) cli_abort( + "Columns `geo_value` and `time_value` must be present in `x`." + ) new_epi_df( x, geo_type, time_type, as_of, diff --git a/R/epiprocess.R b/R/epiprocess.R index 254ebd01..05737d58 100644 --- a/R/epiprocess.R +++ b/R/epiprocess.R @@ -4,7 +4,11 @@ #' measured over space and time, and offers associated utilities to perform #' basic signal processing tasks. 
#' -#' @importFrom checkmate assert assert_character assert_int anyInfinite +#' @importFrom checkmate assert assert_scalar assert_data_frame anyMissing +#' assert_logical assert_list assert_character assert_class +#' assert_int assert_numeric check_data_frame vname check_atomic +#' anyInfinite test_subset test_set_equal +#' @importFrom cli cli_abort cli_inform cli_warn #' @name epiprocess "_PACKAGE" utils::globalVariables(c(".x", ".group_key", ".ref_time_value")) diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R index f083cf93..9ddad684 100644 --- a/R/grouped_epi_archive.R +++ b/R/grouped_epi_archive.R @@ -53,41 +53,24 @@ grouped_epi_archive <- public = list( initialize = function(ungrouped, vars, drop) { if (inherits(ungrouped, "grouped_epi_archive")) { - Abort("`ungrouped` must not already be grouped (neither automatic regrouping nor nested grouping is supported). Either use `group_by` with `.add=TRUE`, or `ungroup` first.", + cli_abort("`ungrouped` must not already be grouped (neither automatic regrouping nor nested grouping is supported). Either use `group_by` with `.add=TRUE`, or `ungroup` first.", class = "epiprocess__grouped_epi_archive__ungrouped_arg_is_already_grouped", epiprocess__ungrouped_class = class(ungrouped), epiprocess__ungrouped_groups = groups(ungrouped) ) } - if (!inherits(ungrouped, "epi_archive")) { - Abort("`ungrouped` must be an epi_archive", - class = "epiprocess__grouped_epi_archive__ungrouped_arg_is_not_epi_archive", - epiprocess__ungrouped_class = class(ungrouped) - ) - } - if (!is.character(vars)) { - Abort("`vars` must be a character vector (any tidyselection should have already occurred in a helper method).", - class = "epiprocess__grouped_epi_archive__vars_is_not_chr", - epiprocess__vars_class = class(vars), - epiprocess__vars_type = typeof(vars) - ) - } - if (!all(vars %in% names(ungrouped$DT))) { - Abort("`vars` must be selected from the names of columns of `ungrouped$DT`", - class = "epiprocess__grouped_epi_archive__vars_contains_invalid_entries", - epiprocess__vars = vars, - epiprocess__DT_names = names(ungrouped$DT) + assert_class(ungrouped, "epi_archive") + assert_character(vars) + if (!test_subset(vars, names(ungrouped$DT))) { + cli_abort( + "All grouping variables `vars` must be present in the data.", ) } if ("version" %in% vars) { - Abort("`version` has a special interpretation and cannot be used by itself as a grouping variable") - } - if (!rlang::is_bool(drop)) { - Abort("`drop` must be a Boolean", - class = "epiprocess__grouped_epi_archive__drop_is_not_bool", - epiprocess__drop = drop - ) + cli_abort("`version` has a special interpretation and cannot be used by itself as a grouping variable") } + assert_logical(drop, len = 1) + # ----- private$ungrouped <- ungrouped private$vars <- vars @@ -136,11 +119,9 @@ grouped_epi_archive <- invisible(self) }, group_by = function(..., .add = FALSE, .drop = dplyr::group_by_drop_default(self)) { - if (!rlang::is_bool(.add)) { - Abort("`.add` must be a Boolean") - } + assert_logical(.add, len = 1) if (!.add) { - Abort('`group_by` on a `grouped_epi_archive` with `.add=FALSE` is forbidden + cli_abort('`group_by` on a `grouped_epi_archive` with `.add=FALSE` is forbidden (neither automatic regrouping nor nested grouping is supported). If you want to "regroup", replacing the existing grouping vars, `ungroup` first and then `group_by`. If you want to add to the existing grouping vars, call `group_by` specifying `.add=TRUE`. 
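The mechanical pattern this commit applies across the files below, shown as a schematic sketch (an editor's illustration with invented names such as `check_before()` and `n_missing`; these are not lines from the diff):

library(checkmate)
library(cli)

check_before <- function(before) {
  # checkmate assertions replace the hand-rolled predicate + Abort() pairs:
  assert_int(before, lower = 0L, na.ok = FALSE)
}

explain_missing <- function(n_missing) {
  # cli_abort() glue-interpolates locals, so messages can embed values
  # without sprintf() or paste():
  cli_abort("{n_missing} `ref_time_values` were not found in the archive.")
}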
@@ -210,7 +191,7 @@ grouped_epi_archive <- # early development versions and much more likely to be clutter than # informative in the signature. if ("group_by" %in% nse_dots_names(...)) { - Abort(" + cli_abort(" The `group_by` argument to `slide` has been removed; please use the `group_by` S3 generic function or `$group_by` R6 method before the slide instead. (If you were instead trying to pass a @@ -221,7 +202,7 @@ grouped_epi_archive <- ", class = "epiprocess__epix_slide_group_by_parameter_deprecated") } if ("all_rows" %in% nse_dots_names(...)) { - Abort(" + cli_abort(" The `all_rows` argument has been removed from `epix_slide` (but is still supported in `epi_slide`). Add rows for excluded results with a manual join instead. @@ -230,15 +211,14 @@ grouped_epi_archive <- if (missing(ref_time_values)) { ref_time_values <- epix_slide_ref_time_values_default(private$ungrouped) - } else if (length(ref_time_values) == 0L) { - Abort("`ref_time_values` must have at least one element.") - } else if (any(is.na(ref_time_values))) { - Abort("`ref_time_values` must not include `NA`.") - } else if (anyDuplicated(ref_time_values) != 0L) { - Abort("`ref_time_values` must not contain any duplicates; use `unique` if appropriate.") - } else if (any(ref_time_values > private$ungrouped$versions_end)) { - Abort("All `ref_time_values` must be `<=` the `versions_end`.") } else { + assert_numeric(ref_time_values, min.len = 1L, null.ok = FALSE, any.missing = FALSE) + if (any(ref_time_values > private$ungrouped$versions_end)) { + cli_abort("Some `ref_time_values` are greater than the latest version in the archive.") + } + if (anyDuplicated(ref_time_values) != 0L) { + cli_abort("Some `ref_time_values` are duplicated.") + } # Sort, for consistency with `epi_slide`, although the current # implementation doesn't take advantage of it. 
ref_time_values <- sort(ref_time_values) @@ -246,16 +226,14 @@ grouped_epi_archive <- # Validate and pre-process `before`: if (missing(before)) { - Abort("`before` is required (and must be passed by name); + cli_abort("`before` is required (and must be passed by name); if you did not want to apply a sliding window but rather to map `as_of` and `f` across various `ref_time_values`, pass a large `before` value (e.g., if time steps are days, `before=365000`).") } before <- vctrs::vec_cast(before, integer()) - if (length(before) != 1L || is.na(before) || before < 0L) { - Abort("`before` must be length-1, non-NA, non-negative.") - } + assert_int(before, lower = 0L, null.ok = FALSE, na.ok = FALSE) # If a custom time step is specified, then redefine units @@ -265,15 +243,9 @@ grouped_epi_archive <- new_col <- sym(new_col_name) # Validate rest of parameters: - if (!rlang::is_bool(as_list_col)) { - Abort("`as_list_col` must be TRUE or FALSE.") - } - if (!(rlang::is_string(names_sep) || is.null(names_sep))) { - Abort("`names_sep` must be a (single) string or NULL.") - } - if (!rlang::is_bool(all_versions)) { - Abort("`all_versions` must be TRUE or FALSE.") - } + assert_logical(as_list_col, len = 1L) + assert_logical(all_versions, len = 1L) + assert_character(names_sep, len = 1L, null.ok = TRUE) # Computation for one group, one time value comp_one_grp <- function(.data_group, .group_key, @@ -290,9 +262,7 @@ grouped_epi_archive <- .data_group <- .data_group$DT } - if (!(is.atomic(comp_value) || is.data.frame(comp_value))) { - Abort("The slide computation must return an atomic vector or a data frame.") - } + assert(check_atomic(comp_value, any.missing = TRUE), check_data_frame(comp_value), combine = "or", .var.name = vname(comp_value)) # Label every result row with the `ref_time_value` res <- list(time_value = ref_time_value) @@ -312,10 +282,10 @@ grouped_epi_archive <- if (missing(f)) { quos <- enquos(...) if (length(quos) == 0) { - Abort("If `f` is missing then a computation must be specified via `...`.") + cli_abort("If `f` is missing then a computation must be specified via `...`.") } if (length(quos) > 1) { - Abort("If `f` is missing then only a single computation can be specified via `...`.") + cli_abort("If `f` is missing then only a single computation can be specified via `...`.") } f <- quos[[1]] diff --git a/R/growth_rate.R b/R/growth_rate.R index f54d1277..b584f7e3 100644 --- a/R/growth_rate.R +++ b/R/growth_rate.R @@ -118,10 +118,8 @@ growth_rate <- function(x = seq_along(y), y, x0 = x, h = 7, log_scale = FALSE, dup_rm = FALSE, na_rm = FALSE, ...) { # Check x, y, x0 - if (length(x) != length(y)) Abort("`x` and `y` must have the same length.") - if (!all(x0 %in% x)) Abort("`x0` must be a subset of `x`.") - - # Check the method + if (length(x) != length(y)) cli_abort("`x` and `y` must have the same length.") + if (!all(x0 %in% x)) cli_abort("`x0` must be a subset of `x`.") method <- match.arg(method) # Arrange in increasing order of x @@ -137,7 +135,7 @@ growth_rate <- function(x = seq_along(y), y, x0 = x, if (dup_rm) { o <- !duplicated(x) if (any(!o)) { - Warn("`x` contains duplicate values. (If being run on a column in an `epi_df`, did you group by relevant key variables?)") + cli_warn("`x` contains duplicate values. 
(If being run on a column in an `epi_df`, did you group by relevant key variables?)") } x <- x[o] y <- y[o] @@ -238,7 +236,7 @@ growth_rate <- function(x = seq_along(y), y, x0 = x, # Check cv and df combo if (is.numeric(df)) cv <- FALSE if (!cv && !(is.numeric(df) && df == round(df))) { - Abort("If `cv = FALSE`, then `df` must be an integer.") + cli_abort("If `cv = FALSE`, then `df` must be an integer.") } # Compute trend filtering path diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index 45db2855..43b816bc 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -77,7 +77,7 @@ #' #' @export epix_as_of <- function(x, max_version, min_time_value = -Inf, all_versions = FALSE) { - if (!inherits(x, "epi_archive")) Abort("`x` must be of class `epi_archive`.") + assert_class(x, "epi_archive") return(x$as_of(max_version, min_time_value, all_versions = all_versions)) } @@ -113,7 +113,7 @@ epix_as_of <- function(x, max_version, min_time_value = -Inf, all_versions = FAL #' @return An `epi_archive` epix_fill_through_version <- function(x, fill_versions_end, how = c("na", "locf")) { - if (!inherits(x, "epi_archive")) Abort("`x` must be of class `epi_archive`.") + assert_class(x, "epi_archive") # Enclosing parentheses drop the invisibility flag. See description above of # potential mutation and aliasing behavior. (x$clone()$fill_through_version(fill_versions_end, how = how)) @@ -179,31 +179,25 @@ epix_fill_through_version <- function(x, fill_versions_end, epix_merge <- function(x, y, sync = c("forbid", "na", "locf", "truncate"), compactify = TRUE) { - if (!inherits(x, "epi_archive")) { - Abort("`x` must be of class `epi_archive`.") - } - - if (!inherits(y, "epi_archive")) { - Abort("`y` must be of class `epi_archive`.") - } - + assert_class(x, "epi_archive") + assert_class(y, "epi_archive") sync <- rlang::arg_match(sync) if (!identical(x$geo_type, y$geo_type)) { - Abort("`x` and `y` must have the same `$geo_type`") + cli_abort("`x` and `y` must have the same `$geo_type`") } if (!identical(x$time_type, y$time_type)) { - Abort("`x` and `y` must have the same `$time_type`") + cli_abort("`x` and `y` must have the same `$time_type`") } if (length(x$additional_metadata) != 0L) { - Warn("x$additional_metadata won't appear in merge result", + cli_warn("x$additional_metadata won't appear in merge result", class = "epiprocess__epix_merge_ignores_additional_metadata" ) } if (length(y$additional_metadata) != 0L) { - Warn("y$additional_metadata won't appear in merge result", + cli_warn("y$additional_metadata won't appear in merge result", class = "epiprocess__epix_merge_ignores_additional_metadata" ) } @@ -222,7 +216,7 @@ epix_merge <- function(x, y, # partially-mutated `x` on failure. 
if (sync == "forbid") { if (!identical(x$versions_end, y$versions_end)) { - Abort(paste( + cli_abort(paste( "`x` and `y` were not equally up to date version-wise:", "`x$versions_end` was not identical to `y$versions_end`;", "either ensure that `x` and `y` are equally up to date before merging,", @@ -242,7 +236,7 @@ epix_merge <- function(x, y, x_DT <- x$DT[x[["DT"]][["version"]] <= new_versions_end, names(x$DT), with = FALSE] y_DT <- y$DT[y[["DT"]][["version"]] <= new_versions_end, names(y$DT), with = FALSE] } else { - Abort("unimplemented") + cli_abort("unimplemented") } # key(x_DT) should be the same as key(x$DT) and key(y_DT) should be the same @@ -257,7 +251,7 @@ epix_merge <- function(x, y, x_DT_key_as_expected <- identical(key(x$DT), key(x_DT)) y_DT_key_as_expected <- identical(key(y$DT), key(y_DT)) if (!x_DT_key_as_expected || !y_DT_key_as_expected) { - Warn(" + cli_warn(" `epiprocess` internal warning (please report): pre-processing for epix_merge unexpectedly resulted in an intermediate data table (or tables) with a different key than the corresponding input archive. @@ -272,7 +266,7 @@ epix_merge <- function(x, y, # sensible default treatment of count-type and rate-type value columns would # differ. if (!identical(sort(key(x_DT)), sort(key(y_DT)))) { - Abort(" + cli_abort(" The archives must have the same set of key column names; if the key columns represent the same things, just with different names, please retry after manually renaming to match; if they @@ -289,14 +283,14 @@ epix_merge <- function(x, y, # version carried forward via rolling joins by <- key(x_DT) # = some perm of key(y_DT) if (!all(c("geo_value", "time_value", "version") %in% key(x_DT))) { - Abort('Invalid `by`; `by` is currently set to the common `key` of + cli_abort('Invalid `by`; `by` is currently set to the common `key` of the two archives, and is expected to contain "geo_value", "time_value", and "version".', class = "epiprocess__epi_archive_must_have_required_key_cols" ) } if (length(by) < 1L || utils::tail(by, 1L) != "version") { - Abort('Invalid `by`; `by` is currently set to the common `key` of + cli_abort('Invalid `by`; `by` is currently set to the common `key` of the two archives, and is expected to have a "version" as the last key col.', class = "epiprocess__epi_archive_must_have_version_at_end_of_key" @@ -305,7 +299,7 @@ epix_merge <- function(x, y, x_nonby_colnames <- setdiff(names(x_DT), by) y_nonby_colnames <- setdiff(names(y_DT), by) if (length(intersect(x_nonby_colnames, y_nonby_colnames)) != 0L) { - Abort(" + cli_abort(" `x` and `y` DTs have overlapping non-by column names; this is currently not supported; please manually fix up first: any overlapping columns that can are key-like should be @@ -314,7 +308,7 @@ epix_merge <- function(x, y, } x_by_vals <- x_DT[, by, with = FALSE] if (anyDuplicated(x_by_vals) != 0L) { - Abort(" + cli_abort(" The `by` columns must uniquely determine rows of `x$DT`; the `by` is currently set to the common `key` of the two archives, so this can be resolved by adding key-like columns @@ -323,7 +317,7 @@ epix_merge <- function(x, y, } y_by_vals <- y_DT[, by, with = FALSE] if (anyDuplicated(y_by_vals) != 0L) { - Abort(" + cli_abort(" The `by` columns must uniquely determine rows of `y$DT`; the `by` is currently set to the common `key` of the two archives, so this can be resolved by adding key-like columns @@ -409,11 +403,7 @@ epix_merge <- function(x, y, #' #' @noRd new_col_modify_recorder_df <- function(parent_df) { - if (!inherits(parent_df, "data.frame")) { - 
Abort('`parent_df` must inherit class `"data.frame"`', - internal = TRUE - ) - } + assert_class(parent_df, "data.frame") `class<-`(parent_df, c("col_modify_recorder_df", class(parent_df))) } @@ -425,11 +415,7 @@ new_col_modify_recorder_df <- function(parent_df) { #' #' @noRd destructure_col_modify_recorder_df <- function(col_modify_recorder_df) { - if (!inherits(col_modify_recorder_df, "col_modify_recorder_df")) { - Abort('`col_modify_recorder_df` must inherit class `"col_modify_recorder_df"`', - internal = TRUE - ) - } + assert_class(col_modify_recorder_df, "col_modify_recorder_df") list( unchanged_parent_df = col_modify_recorder_df %>% `attr<-`("epiprocess::col_modify_recorder_df::cols", NULL) %>% @@ -451,7 +437,7 @@ destructure_col_modify_recorder_df <- function(col_modify_recorder_df) { #' @noRd dplyr_col_modify.col_modify_recorder_df <- function(data, cols) { if (!is.null(attr(data, "epiprocess::col_modify_recorder_df::cols", exact = TRUE))) { - Abort("`col_modify_recorder_df` can only record `cols` once", + cli_abort("`col_modify_recorder_df` can only record `cols` once", internal = TRUE ) } @@ -676,19 +662,17 @@ epix_detailed_restricted_mutate <- function(.data, ...) { group_by.epi_archive <- function(.data, ..., .add = FALSE, .drop = dplyr::group_by_drop_default(.data)) { # `add` makes no difference; this is an ungrouped `epi_archive`. detailed_mutate <- epix_detailed_restricted_mutate(.data, ...) - if (!rlang::is_bool(.drop)) { - Abort("`.drop` must be TRUE or FALSE") - } + assert_logical(.drop) if (!.drop) { grouping_cols <- as.list(detailed_mutate[["archive"]][["DT"]])[detailed_mutate[["request_names"]]] grouping_col_is_factor <- purrr::map_lgl(grouping_cols, is.factor) # ^ Use `as.list` to try to avoid any possibility of a deep copy. if (!any(grouping_col_is_factor)) { - Warn("`.drop=FALSE` but there are no factor grouping columns; did you mean to convert one of the columns to a factor beforehand?", + cli_warn("`.drop=FALSE` but there are no factor grouping columns; did you mean to convert one of the columns to a factor beforehand?", class = "epiprocess__group_by_epi_archive__drop_FALSE_no_factors" ) } else if (any(diff(grouping_col_is_factor) == -1L)) { - Warn("`.drop=FALSE` but there are one or more non-factor grouping columns listed after a factor grouping column; this may produce groups with `NA`s for these columns; see https://github.com/tidyverse/dplyr/issues/5369#issuecomment-683762553; depending on how you want completion to work, you might instead want to convert all grouping columns to factors beforehand, specify the non-factor grouping columns first, or use `.drop=TRUE` and add a call to `tidyr::complete`.", + cli_warn("`.drop=FALSE` but there are one or more non-factor grouping columns listed after a factor grouping column; this may produce groups with `NA`s for these columns; see https://github.com/tidyverse/dplyr/issues/5369#issuecomment-683762553; depending on how you want completion to work, you might instead want to convert all grouping columns to factors beforehand, specify the non-factor grouping columns first, or use `.drop=TRUE` and add a call to `tidyr::complete`.", class = "epiprocess__group_by_epi_archive__drop_FALSE_nonfactor_after_factor" ) } @@ -956,7 +940,7 @@ epix_slide <- function(x, f, ..., before, ref_time_values, as_list_col = FALSE, names_sep = "_", all_versions = FALSE) { if (!is_epi_archive(x, grouped_okay = TRUE)) { - Abort("`x` must be of class `epi_archive` or `grouped_epi_archive`.") + cli_abort("`x` must be of class `epi_archive` or 
`grouped_epi_archive`.") } return(x$slide(f, ..., before = before, diff --git a/R/methods-epi_df.R b/R/methods-epi_df.R index 7e002320..3636d966 100644 --- a/R/methods-epi_df.R +++ b/R/methods-epi_df.R @@ -129,10 +129,10 @@ dplyr_reconstruct.epi_df <- function(data, template) { cn <- names(res) - # Duplicate columns, Abort + # Duplicate columns, cli_abort dup_col_names <- cn[duplicated(cn)] if (length(dup_col_names) != 0) { - Abort(paste0( + cli_abort(paste0( "Column name(s) ", paste(unique(dup_col_names), collapse = ", " diff --git a/R/outliers.R b/R/outliers.R index 1eb3ea01..ee59d64b 100644 --- a/R/outliers.R +++ b/R/outliers.R @@ -97,7 +97,7 @@ detect_outlr <- function(x = seq_along(y), y, # Validate that x contains all distinct values if (any(duplicated(x))) { - Abort("`x` cannot contain duplicate values. (If being run on a column in an `epi_df`, did you group by relevant key variables?)") + cli_abort("`x` cannot contain duplicate values. (If being run on a column in an `epi_df`, did you group by relevant key variables?)") } # Run all outlier detection methods @@ -108,10 +108,10 @@ detect_outlr <- function(x = seq_along(y), y, results <- do.call(method, args = c(list("x" = x, "y" = y), args)) # Validate the output - if (!is.data.frame(results) || - !all(c("lower", "upper", "replacement") %in% colnames(results))) { - Abort("Outlier detection method must return a data frame with columns `lower`, `upper`, and `replacement`.") - } + assert_data_frame(results) + if (!test_subset(c("lower", "upper", "replacement"), colnames(results))) cli_abort( + "Columns `lower`, `upper`, and `replacement` must be present in the output of the outlier detection method." + ) # Update column names with model abbreviation colnames(results) <- paste(abbr, colnames(results), sep = "_") diff --git a/R/slide.R b/R/slide.R index e2c0bf55..9adabf9e 100644 --- a/R/slide.R +++ b/R/slide.R @@ -168,47 +168,37 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, time_step, new_col_name = "slide_value", as_list_col = FALSE, names_sep = "_", all_rows = FALSE) { - # Check we have an `epi_df` object - if (!inherits(x, "epi_df")) Abort("`x` must be of class `epi_df`.") + assert_class(x, "epi_df") if (missing(ref_time_values)) { ref_time_values <- unique(x$time_value) - } - - # Some of these `ref_time_values` checks and processing steps also apply to - # the `ref_time_values` default; for simplicity, just apply all the steps - # regardless of whether we are working with a default or user-provided - # `ref_time_values`: - if (length(ref_time_values) == 0L) { - Abort("`ref_time_values` must have at least one element.") - } else if (any(is.na(ref_time_values))) { - Abort("`ref_time_values` must not include `NA`.") - } else if (anyDuplicated(ref_time_values) != 0L) { - Abort("`ref_time_values` must not contain any duplicates; use `unique` if appropriate.") - } else if (!all(ref_time_values %in% unique(x$time_value))) { - Abort("All `ref_time_values` must appear in `x$time_value`.") } else { - ref_time_values <- sort(ref_time_values) + assert_numeric(ref_time_values, min.len = 1L, null.ok = FALSE, any.missing = FALSE) + if (!test_subset(ref_time_values, unique(x$time_value))) { + cli_abort( + "`ref_time_values` must be a unique subset of the time values in `x`." 
+ ) + } + if (anyDuplicated(ref_time_values) != 0L) { + cli_abort("`ref_time_values` must not contain any duplicates; use `unique` if appropriate.") + } } + ref_time_values <- sort(ref_time_values) # Validate and pre-process `before`, `after`: if (!missing(before)) { before <- vctrs::vec_cast(before, integer()) - if (length(before) != 1L || is.na(before) || before < 0L) { - Abort("`before` must be length-1, non-NA, non-negative") - } + assert_int(before, lower = 0, null.ok = FALSE, na.ok = FALSE) } if (!missing(after)) { after <- vctrs::vec_cast(after, integer()) - if (length(after) != 1L || is.na(after) || after < 0L) { - Abort("`after` must be length-1, non-NA, non-negative") - } + assert_int(after, lower = 0, null.ok = FALSE, na.ok = FALSE) } if (missing(before)) { if (missing(after)) { - Abort("Either or both of `before`, `after` must be provided.") + cli_abort("Either or both of `before`, `after` must be provided.") } else if (after == 0L) { - Warn("`before` missing, `after==0`; maybe this was intended to be some + cli_warn("`before` missing, `after==0`; maybe this was intended to be some non-zero-width trailing window, but since `before` appears to be missing, it's interpreted as a zero-width window (`before=0, after=0`).") @@ -216,7 +206,7 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, before <- 0L } else if (missing(after)) { if (before == 0L) { - Warn("`before==0`, `after` missing; maybe this was intended to be some + cli_warn("`before==0`, `after` missing; maybe this was intended to be some non-zero-width leading window, but since `after` appears to be missing, it's interpreted as a zero-width window (`before=0, after=0`).") @@ -283,7 +273,7 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, if (!all(purrr::map_lgl(slide_values_list, is.atomic)) && !all(purrr::map_lgl(slide_values_list, is.data.frame))) { - Abort("The slide computations must return always atomic vectors or data frames (and not a mix of these two structures).") + cli_abort("The slide computations must return always atomic vectors or data frames (and not a mix of these two structures).") } # Unlist if appropriate: @@ -309,7 +299,7 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, )) } if (vctrs::vec_size(slide_values) != num_ref_rows) { - Abort("The slide computations must either (a) output a single element/row each, or (b) one element/row per appearance of the reference time value in the local window.") + cli_abort("The slide computations must either (a) output a single element/row each, or (b) one element/row per appearance of the reference time value in the local window.") } } @@ -330,10 +320,10 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, if (missing(f)) { quos <- enquos(...) 
if (length(quos) == 0) { - Abort("If `f` is missing then a computation must be specified via `...`.") + cli_abort("If `f` is missing then a computation must be specified via `...`.") } if (length(quos) > 1) { - Abort("If `f` is missing then only a single computation can be specified via `...`.") + cli_abort("If `f` is missing then only a single computation can be specified via `...`.") } f <- quos[[1]] diff --git a/R/utils.R b/R/utils.R index a8160159..6bef5e0a 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1,9 +1,3 @@ -break_str <- function(str, nchar = 79, init = "") { - str <- paste(strwrap(str, nchar, initial = init), collapse = "\n") - str[1] <- substring(str, nchar(init) + 1) - return(str) -} - # Note: update `wrap_symbolics` and `wrap_varnames` (parameters, parameter # defaults, bodies) together. @@ -33,17 +27,13 @@ wrap_symbolics <- function(symbolics, initial = "", common_prefix = "", none_str = "", width = getOption("width", 80L)) { if (!all(purrr::map_lgl(symbolics, rlang::is_symbolic))) { - Abort("`symbolics` must be a list of symbolic objects") - } - if (!rlang::is_string(initial)) { - Abort("`initial` must be a string") - } - if (!rlang::is_string(common_prefix)) { - Abort("`common_prefix` must be a string") - } - if (!rlang::is_string(none_str)) { - Abort("`none_str` must be a string") + cli_abort("`symbolics` must be a list of symbolic objects") } + assert_character(initial, len = 1L) + assert_character(common_prefix, len = 1L) + assert_character(none_str, len = 1L) + assert_int(width, lower = 1L) + prefix <- strrep(" ", nchar(initial, type = "width")) full_initial <- paste0(common_prefix, initial) full_prefix <- paste0(common_prefix, prefix) @@ -85,9 +75,7 @@ wrap_varnames <- function(nms, width = getOption("width", 80L)) { # (Repeating parameter names and default args here for better autocomplete. # Using `...` instead would require less upkeep, but have worse autocomplete.) - if (!rlang::is_character(nms)) { - Abort("`nms` must be a character vector") - } + assert_character(nms) wrap_symbolics(rlang::syms(nms), initial = initial, common_prefix = common_prefix, none_str = none_str, width = width) } @@ -101,8 +89,6 @@ paste_lines <- function(lines) { paste(paste0(lines, "\n"), collapse = "") } -Abort <- function(msg, ...) rlang::abort(break_str(msg, init = "Error: "), ...) -Warn <- function(msg, ...) rlang::warn(break_str(msg, init = "Warning: "), ...) #' Assert that a sliding computation function takes enough args #' @@ -140,8 +126,12 @@ assert_sufficient_f_args <- function(f, ...) { if (n_f_args_before_dots < n_mandatory_f_args) { mandatory_f_args_in_f_dots <- tail(mandatory_f_args_labels, n_mandatory_f_args - n_f_args_before_dots) + cli::cli_warn( - "`f` might not have enough positional arguments before its `...`; in the current `epi[x]_slide` call, the {mandatory_f_args_in_f_dots} will be included in `f`'s `...`; if `f` doesn't expect those arguments, it may produce confusing error messages", + "`f` might not have enough positional arguments before its `...`; in + the current `epi[x]_slide` call, the {mandatory_f_args_in_f_dots} will + be included in `f`'s `...`; if `f` doesn't expect those arguments, it + may produce confusing error messages", class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots", epiprocess__f = f, epiprocess__mandatory_f_args_in_f_dots = mandatory_f_args_in_f_dots @@ -152,13 +142,16 @@ assert_sufficient_f_args <- function(f, ...) { # `f` doesn't take enough args. if (rlang::dots_n(...) 
== 0L) { # common case; try for friendlier error message - Abort(sprintf("`f` must take at least %s arguments", n_mandatory_f_args), + cli_abort(sprintf("`f` must take at least %s arguments", n_mandatory_f_args), class = "epiprocess__assert_sufficient_f_args__f_needs_min_args", epiprocess__f = f ) } else { # less common; highlight that they are (accidentally?) using dots forwarding - Abort(sprintf("`f` must take at least %s arguments plus the %s arguments forwarded through `epi[x]_slide`'s `...`, or a named argument to `epi[x]_slide` was misspelled", n_mandatory_f_args, rlang::dots_n(...)), + cli_abort( + "`f` must take at least {n_mandatory_f_args} arguments plus the + {rlang::dots_n(...)} arguments forwarded through `epi[x]_slide`'s + `...`, or a named argument to `epi[x]_slide` was misspelled", class = "epiprocess__assert_sufficient_f_args__f_needs_min_args_plus_forwarded", epiprocess__f = f ) @@ -181,7 +174,13 @@ assert_sufficient_f_args <- function(f, ...) { default_check_mandatory_args_labels[has_default_replaced_by_mandatory] args_with_default_replaced_by_mandatory <- rlang::syms(default_check_args_names[has_default_replaced_by_mandatory]) - cli::cli_abort("`epi[x]_slide` would pass the {mandatory_args_replacing_defaults} to `f`'s {args_with_default_replaced_by_mandatory} argument{?s}, which {?has a/have} default value{?s}; we suspect that `f` doesn't expect {?this arg/these args} at all and may produce confusing error messages. Please add additional arguments to `f` or remove defaults as appropriate.", + cli::cli_abort( + "`epi[x]_slide` would pass the {mandatory_args_replacing_defaults} to + `f`'s {args_with_default_replaced_by_mandatory} argument{?s}, which + {?has a/have} default value{?s}; we suspect that `f` doesn't expect + {?this arg/these args} at all and may produce confusing error messages. + Please add additional arguments to `f` or remove defaults as + appropriate.", class = "epiprocess__assert_sufficient_f_args__required_args_contain_defaults", epiprocess__f = f ) @@ -315,14 +314,16 @@ as_slide_computation <- function(f, ...) { if (is_formula(f)) { if (length(f) > 2) { - Abort(sprintf("%s must be a one-sided formula", arg), + cli_abort(sprintf("%s must be a one-sided formula", arg), class = "epiprocess__as_slide_computation__formula_is_twosided", epiprocess__f = f, call = call ) } if (rlang::dots_n(...) > 0L) { - Abort("No arguments can be passed via `...` when `f` is a formula, or there are unrecognized/misspelled parameter names.", + cli_abort( + "No arguments can be passed via `...` when `f` is a formula, or there + are unrecognized/misspelled parameter names.", class = "epiprocess__as_slide_computation__formula_with_dots", epiprocess__f = f, epiprocess__enquos_dots = enquos(...) @@ -331,7 +332,7 @@ as_slide_computation <- function(f, ...) { env <- f_env(f) if (!is_environment(env)) { - Abort("Formula must carry an environment.", + cli_abort("Formula must carry an environment.", class = "epiprocess__as_slide_computation__formula_has_no_env", epiprocess__f = f, epiprocess__f_env = env, @@ -350,7 +351,8 @@ as_slide_computation <- function(f, ...) 
{ return(fn) } - Abort(sprintf("Can't convert an object of class %s to a slide computation", paste(collapse = " ", deparse(class(f)))), + cli_abort( + sprintf("Can't convert an object of class %s to a slide computation", paste(collapse = " ", deparse(class(f)))), class = "epiprocess__as_slide_computation__cant_convert_catchall", epiprocess__f = f, epiprocess__f_class = class(f), @@ -546,7 +548,7 @@ list2var <- function(x) { #' @noRd deprecated_quo_is_present <- function(quo) { if (!rlang::is_quosure(quo)) { - Abort("`quo` must be a quosure; `enquo` the arg first", + cli_abort("`quo` must be a quosure; `enquo` the arg first", internal = TRUE ) } else if (rlang::quo_is_missing(quo)) { @@ -603,23 +605,13 @@ deprecated_quo_is_present <- function(quo) { #' #' @noRd gcd2num <- function(a, b, rrtol = 1e-6, pqlim = 1e6, irtol = 1e-6) { - if (!is.numeric(a) || length(a) != 1L) { - Abort("`a` must satisfy `is.numeric`, have `length` 1.") - } - if (!is.numeric(b) || length(b) != 1L) { - Abort("`b` must satisfy `is.numeric`, have `length` 1.") - } - if (!is.numeric(rrtol) || length(rrtol) != 1L || rrtol < 0) { - Abort("`rrtol` must satisfy `is.numeric`, have `length` 1, and be non-negative.") - } - if (!is.numeric(pqlim) || length(pqlim) != 1L || pqlim < 0) { - Abort("`pqlim` must satisfy `is.numeric`, have `length` 1, and be non-negative.") - } - if (!is.numeric(irtol) || length(irtol) != 1L || irtol < 0) { - Abort("`irtol` must satisfy `is.numeric`, have `length` 1, and be non-negative.") - } + assert_numeric(a, len = 1L) + assert_numeric(b, len = 1L) + assert_numeric(rrtol, len = 1L, lower = 0) + assert_numeric(pqlim, len = 1L, lower = 0) + assert_numeric(irtol, len = 1L, lower = 0) if (is.na(a) || is.na(b) || a == 0 || b == 0 || abs(a / b) >= pqlim || abs(b / a) >= pqlim) { - Abort("`a` and/or `b` is either `NA` or exactly zero, or one is so much smaller than the other that it looks like it's supposed to be zero; see `pqlim` setting.") + cli_abort("`a` and/or `b` is either `NA` or exactly zero, or one is so much smaller than the other that it looks like it's supposed to be zero; see `pqlim` setting.") } iatol <- irtol * max(a, b) a_curr <- a @@ -627,7 +619,7 @@ gcd2num <- function(a, b, rrtol = 1e-6, pqlim = 1e6, irtol = 1e-6) { while (TRUE) { # `b_curr` is the candidate GCD / iterand; check first if it seems too small: if (abs(b_curr) <= iatol) { - Abort("No GCD found; remaining potential Gads are all too small relative to one/both of the original inputs; see `irtol` setting.") + cli_abort("No GCD found; remaining potential Gads are all too small relative to one/both of the original inputs; see `irtol` setting.") } remainder <- a_curr - round(a_curr / b_curr) * b_curr if (abs(remainder / b_curr) <= rrtol) { @@ -652,10 +644,10 @@ gcd2num <- function(a, b, rrtol = 1e-6, pqlim = 1e6, irtol = 1e-6) { #' @noRd gcd_num <- function(dividends, ..., rrtol = 1e-6, pqlim = 1e6, irtol = 1e-6) { if (!is.numeric(dividends) || length(dividends) == 0L) { - Abort("`dividends` must satisfy `is.numeric`, and have `length` > 0") + cli_abort("`dividends` must satisfy `is.numeric`, and have `length` > 0") } if (rlang::dots_n(...) != 0L) { - Abort("`...` should be empty; all dividends should go in a single `dividends` vector, and all tolerance&limit settings should be passed by name.") + cli_abort("`...` should be empty; all dividends should go in a single `dividends` vector, and all tolerance&limit settings should be passed by name.") } # We expect a bunch of duplicate `dividends` for some applications. 
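# A rough standalone sketch (not the package's implementation) of the
# tolerance-based, Euclid-style iteration that `gcd2num` above performs on
# real-valued inputs: take nearest-integer quotients and stop once the
# relative remainder is negligible. This toy omits the `irtol`/`pqlim`
# safety checks; `gcd_approx` is a hypothetical name.
gcd_approx <- function(a, b, rrtol = 1e-6) {
  while (TRUE) {
    remainder <- a - round(a / b) * b
    if (abs(remainder / b) <= rrtol) {
      return(abs(b))
    }
    a <- b
    b <- remainder
  }
}
gcd_approx(35, 21) # 7
gcd_approx(0.3, 0.2) # ~0.1, despite floating-point representation error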
# De-duplicate to reduce work. Sort by absolute value to attempt to reduce @@ -701,7 +693,7 @@ gcd_num <- function(dividends, ..., rrtol = 1e-6, pqlim = 1e6, irtol = 1e-6) { guess_period <- function(ref_time_values, ref_time_values_arg = rlang::caller_arg(ref_time_values)) { sorted_distinct_ref_time_values <- sort(unique(ref_time_values)) if (length(sorted_distinct_ref_time_values) < 2L) { - Abort(sprintf("Not enough distinct values in `%s` to guess the period.", ref_time_values_arg)) + cli_abort(sprintf("Not enough distinct values in `%s` to guess the period.", ref_time_values_arg)) } skips <- diff(sorted_distinct_ref_time_values) decayed_skips <- diff --git a/tests/testthat/test-archive-version-bounds.R b/tests/testthat/test-archive-version-bounds.R index 4000727a..720b33de 100644 --- a/tests/testthat/test-archive-version-bounds.R +++ b/tests/testthat/test-archive-version-bounds.R @@ -1,29 +1,18 @@ test_that("`validate_version_bound` allows/catches `NA` as requested", { my_version_bound <- NA - validate_version_bound(my_version_bound, na_ok = TRUE) - expect_error(validate_version_bound(my_version_bound, na_ok = FALSE), - class = "epiprocess__my_version_bound_is_na" - ) - # Note that if the error class name changes, this test may produce some - # confusing output along the following lines: - # - # > Error in `$<-.data.frame`(`*tmp*`, "call_text", value = c("testthat::expect_error(...)", : - # > replacement has 5 rows, data has 3 + x <- tibble::tibble(version = 5L) + validate_version_bound(my_version_bound, x, na_ok = TRUE) + expect_error(validate_version_bound(my_version_bound, x, na_ok = FALSE)) }) test_that("`validate_version_bound` catches bounds that are the wrong length", { + x <- tibble::tibble(version = 5L) my_version_bound1a <- NULL - expect_error(validate_version_bound(my_version_bound1a, na_ok = TRUE), - class = "epiprocess__my_version_bound1a_is_not_length_1" - ) + expect_error(validate_version_bound(my_version_bound1a, x, na_ok = TRUE)) my_version_bound1b <- integer(0L) - expect_error(validate_version_bound(my_version_bound1b, na_ok = TRUE), - class = "epiprocess__my_version_bound1b_is_not_length_1" - ) + expect_error(validate_version_bound(my_version_bound1b, x, na_ok = TRUE)) my_version_bound2 <- c(2, 10) - expect_error(validate_version_bound(my_version_bound2, na_ok = TRUE), - class = "epiprocess__my_version_bound2_is_not_length_1" - ) + expect_error(validate_version_bound(my_version_bound2, x, na_ok = TRUE)) }) test_that("`validate_version_bound` validate and class checks together allow and catch as intended", { @@ -52,29 +41,21 @@ test_that("`validate_version_bound` validate and class checks together allow and x_datetime <- tibble::tibble(version = my_datetime) # Custom classes matter (test vectors and non-vctrs-specialized lists separately): my_version_bound1 <- `class<-`(24, "c1") - expect_error(validate_version_bound(my_version_bound1, x_int, na_ok = FALSE), - class = "epiprocess__my_version_bound1_has_invalid_class_or_typeof" - ) + expect_error(validate_version_bound(my_version_bound1, x_int, na_ok = FALSE), regexp = "must have the same classes as") my_version_bound2 <- `class<-`(list(12), c("c2a", "c2b", "c2c")) - expect_error(validate_version_bound(my_version_bound2, x_list, na_ok = FALSE), - class = "epiprocess__my_version_bound2_has_invalid_class_or_typeof" - ) + expect_error(validate_version_bound(my_version_bound2, x_list, na_ok = FALSE), regexp = "must have the same classes") # Want no error matching date to date or datetime to datetime, but no interop due to 
tz issues: validate_version_bound(my_date, x_date, version_bound_arg = "vb") validate_version_bound(my_datetime, x_datetime, version_bound_arg = "vb") - expect_error(validate_version_bound(my_datetime, x_date, na_ok = TRUE, version_bound_arg = "vb"), - class = "epiprocess__vb_has_invalid_class_or_typeof" - ) - expect_error(validate_version_bound(my_date, x_datetime, na_ok = TRUE, version_bound_arg = "vb"), - class = "epiprocess__vb_has_invalid_class_or_typeof" - ) + expect_error(validate_version_bound(my_datetime, x_date, na_ok = TRUE, version_bound_arg = "vb"), regexp = "must have the same classes") + expect_error(validate_version_bound(my_date, x_datetime, na_ok = TRUE, version_bound_arg = "vb"), regexp = "must have the same classes") # Bad: - expect_error(validate_version_bound(3.5, x_int, TRUE, "vb")) - expect_error(validate_version_bound(.Machine$integer.max, x_dbl, TRUE, "vb")) + expect_error(validate_version_bound(3.5, x_int, TRUE, "vb"), regexp = "must have the same classes") + expect_error(validate_version_bound(.Machine$integer.max, x_dbl, TRUE, "vb"), regexp = "must have the same classes") expect_error(validate_version_bound( `class<-`(list(2), "clazz"), tibble::tibble(version = `class<-`(5L, "clazz")), TRUE, "vb" - )) + ), regexp = "must have the same types") # Maybe questionable: expect_error(validate_version_bound(3, x_int, TRUE, "vb")) expect_error(validate_version_bound(3L, x_dbl, TRUE, "vb")) @@ -99,28 +80,27 @@ test_that("archive version bounds args work as intended", { clobberable_versions_start = 1241, versions_end = measurement_date ), - class = "epiprocess__clobberable_versions_start_has_invalid_class_or_typeof" + regexp = "must have the same classes" ) - expect_error(as_epi_archive(update_tbl[integer(0L), ]), - class = "epiprocess__max_version_cannot_be_used" + expect_error( + as_epi_archive(update_tbl[integer(0L), ]), + regexp = "don't have a sensible guess at what version that is" ) expect_error( as_epi_archive(update_tbl, clobberable_versions_start = NA, versions_end = measurement_date ), - class = "epiprocess__versions_end_earlier_than_updates" + regexp = "`x` contained updates for a later version" ) expect_error( as_epi_archive(update_tbl, clobberable_versions_start = measurement_date + 6L, versions_end = measurement_date + 5L ), - class = "epiprocess__versions_end_earlier_than_clobberable_versions_start" - ) - expect_error(as_epi_archive(update_tbl, versions_end = NA), - regexp = "versions_end.*must not satisfy.*is.na" + regexp = "`clobberable_versions_start`.*indicated that there were later observed versions" ) + expect_error(as_epi_archive(update_tbl, versions_end = NA), regexp = "must have the same classes") ea_default <- as_epi_archive(update_tbl) ea_default$as_of(measurement_date + 4L) expect_warning( @@ -128,7 +108,6 @@ test_that("archive version bounds args work as intended", { ea_default$as_of(measurement_date + 5L), class = "epiprocess__snapshot_as_of_clobberable_version" ) - expect_error(ea_default$as_of(measurement_date + 6L), - regexp = "max_version.*at most.*versions_end" - ) + ea_default$as_of(measurement_date + 5L) + expect_error(ea_default$as_of(measurement_date + 6L)) }) diff --git a/tests/testthat/test-archive.R b/tests/testthat/test-archive.R index 73f0e166..2eba383d 100644 --- a/tests/testthat/test-archive.R +++ b/tests/testthat/test-archive.R @@ -2,7 +2,7 @@ library(dplyr) test_that("first input must be a data.frame", { expect_error(as_epi_archive(c(1, 2, 3), compactify = FALSE), - regexp = "`x` must be a data frame." 
+ regexp = "Must be of type 'data.frame'." ) }) @@ -10,13 +10,13 @@ dt <- archive_cases_dv_subset$DT test_that("data.frame must contain geo_value, time_value and version columns", { expect_error(as_epi_archive(select(dt, -geo_value), compactify = FALSE), - regexp = "`x` must contain a `geo_value` column." + regexp = "Columns `geo_value`, `time_value`, and `version` must be present in `x`." ) expect_error(as_epi_archive(select(dt, -time_value), compactify = FALSE), - regexp = "`x` must contain a `time_value` column." + regexp = "Columns `geo_value`, `time_value`, and `version` must be present in `x`." ) expect_error(as_epi_archive(select(dt, -version), compactify = FALSE), - regexp = "`x` must contain a `version` column." + regexp = "Columns `geo_value`, `time_value`, and `version` must be present in `x`." ) }) @@ -41,10 +41,10 @@ test_that("other_keys cannot contain names geo_value, time_value or version", { test_that("Warning thrown when other_metadata contains overlapping names with geo_type or time_type fields", { expect_warning(as_epi_archive(dt, additional_metadata = list(geo_type = 1), compactify = FALSE), - regexp = "`additional_metadata` names overlap with existing metadata fields\n\"geo_type\", \"time_type\"." + regexp = "`additional_metadata` names overlap with existing metadata fields \"geo_type\", \"time_type\"." ) expect_warning(as_epi_archive(dt, additional_metadata = list(time_type = 1), compactify = FALSE), - regexp = "`additional_metadata` names overlap with existing metadata fields\n\"geo_type\", \"time_type\"." + regexp = "`additional_metadata` names overlap with existing metadata fields \"geo_type\", \"time_type\"." ) }) diff --git a/tests/testthat/test-data.R b/tests/testthat/test-data.R index 511cc8d7..bd9002a3 100644 --- a/tests/testthat/test-data.R +++ b/tests/testthat/test-data.R @@ -43,12 +43,12 @@ test_that("original `delayedAssign` works as expected on good promises", { }) test_that("`delayed_assign_with_unregister_awareness` doesn't wrap a buggy promise if not unregistering", { - delayed_assign_with_unregister_awareness("x", Abort("msg", class = "original_error_class")) + delayed_assign_with_unregister_awareness("x", cli_abort("msg", class = "original_error_class")) expect_error(force(x), class = "original_error_class") }) test_that("`delayed_assign_with_unregister_awareness` doesn't wrap a buggy promise if not unregistering", { - delayed_assign_with_unregister_awareness("x", Abort("msg", class = "original_error_class")) + delayed_assign_with_unregister_awareness("x", cli_abort("msg", class = "original_error_class")) # Take advantage of a false positive / hedge against package renaming: make # our own `unregister` function to trigger the special error message. unregister <- function(y) y diff --git a/tests/testthat/test-epi_df.R b/tests/testthat/test-epi_df.R index decd6fd7..38257282 100644 --- a/tests/testthat/test-epi_df.R +++ b/tests/testthat/test-epi_df.R @@ -42,7 +42,7 @@ test_that("as_epi_df errors when additional_metadata is not a list", { expect_error( as_epi_df(ex_input, additional_metadata = c(other_keys = "state", "pol")), - "`additional_metadata` must be a list type." + "Must be of type 'list', not 'character'." 
) }) diff --git a/tests/testthat/test-epi_slide.R b/tests/testthat/test-epi_slide.R index 8137cf19..588ad933 100644 --- a/tests/testthat/test-epi_slide.R +++ b/tests/testthat/test-epi_slide.R @@ -32,11 +32,11 @@ toy_edf <- tibble::tribble( test_that("`before` and `after` are both vectors of length 1", { expect_error( epi_slide(grouped, f, before = c(0, 1), after = 0, ref_time_values = d + 3), - "`before`.*length-1" + "Assertion on 'before' failed: Must have length 1" ) expect_error( epi_slide(grouped, f, before = 1, after = c(0, 1), ref_time_values = d + 3), - "`after`.*length-1" + "Assertion on 'after' failed: Must have length 1" ) }) @@ -62,11 +62,11 @@ test_that("Test errors/warnings for discouraged features", { test_that("Both `before` and `after` must be non-NA, non-negative, integer-compatible", { expect_error( epi_slide(grouped, f, before = -1L, ref_time_values = d + 2L), - "`before`.*non-negative" + "Assertion on 'before' failed: Element 1 is not >= 0" ) expect_error( epi_slide(grouped, f, before = 2L, after = -1L, ref_time_values = d + 2L), - "`after`.*non-negative" + "Assertion on 'after' failed: Element 1 is not >= 0" ) expect_error(epi_slide(grouped, f, before = "a", ref_time_values = d + 2L), regexp = "before", class = "vctrs_error_incompatible_type" @@ -82,11 +82,11 @@ test_that("Both `before` and `after` must be non-NA, non-negative, integer-compa ) expect_error( epi_slide(grouped, f, before = NA, after = 1L, ref_time_values = d + 2L), - "`before`.*non-NA" + "Assertion on 'before' failed: May not be NA" ) expect_error( epi_slide(grouped, f, before = 1L, after = NA, ref_time_values = d + 2L), - "`after`.*non-NA" + "Assertion on 'after' failed: May not be NA" ) # Non-integer-class but integer-compatible values are allowed: expect_error(epi_slide(grouped, f, before = 1, after = 1, ref_time_values = d + 2L), NA) @@ -95,22 +95,22 @@ test_that("Both `before` and `after` must be non-NA, non-negative, integer-compa test_that("`ref_time_values` + `before` + `after` that result in no slide data, generate the error", { expect_error( epi_slide(grouped, f, before = 2L, ref_time_values = d), - "All `ref_time_values` must appear in `x\\$time_value`." + "`ref_time_values` must be a unique subset of the time values in `x`." ) # before the first, no data in the slide windows expect_error( epi_slide(grouped, f, before = 2L, ref_time_values = d + 207L), - "All `ref_time_values` must appear in `x\\$time_value`." + "`ref_time_values` must be a unique subset of the time values in `x`." ) # beyond the last, no data in window }) test_that("`ref_time_values` + `before` + `after` that have some slide data, but generate the error due to ref. time being out of time range (would also happen if they were in between `time_value`s)", { expect_error( epi_slide(grouped, f, before = 0L, after = 2L, ref_time_values = d), - "All `ref_time_values` must appear in `x\\$time_value`." + "`ref_time_values` must be a unique subset of the time values in `x`." ) # before the first, but we'd expect there to be data in the window expect_error( epi_slide(grouped, f, before = 2L, ref_time_values = d + 201L), - "All `ref_time_values` must appear in `x\\$time_value`." + "`ref_time_values` must be a unique subset of the time values in `x`." 
) # beyond the last, but still with data in window }) diff --git a/tests/testthat/test-epix_slide.R b/tests/testthat/test-epix_slide.R index b3fff13d..4af84254 100644 --- a/tests/testthat/test-epix_slide.R +++ b/tests/testthat/test-epix_slide.R @@ -187,11 +187,11 @@ test_that("epix_slide `before` validation works", { ) expect_error( xx$slide(f = ~ sum(.x$binary), before = NA), - "`before`.*NA" + "Assertion on 'before' failed: May not be NA" ) expect_error( xx$slide(f = ~ sum(.x$binary), before = -1), - "`before`.*negative" + "Assertion on 'before' failed: Element 1 is not >= 0" ) expect_error(xx$slide(f = ~ sum(.x$binary), before = 1.5), regexp = "before", diff --git a/tests/testthat/test-grouped_epi_archive.R b/tests/testthat/test-grouped_epi_archive.R index 68e7c76d..9fd15e10 100644 --- a/tests/testthat/test-grouped_epi_archive.R +++ b/tests/testthat/test-grouped_epi_archive.R @@ -39,7 +39,7 @@ test_that("Grouping, regrouping, and ungrouping archives works as intended", { # Test `.drop` behavior: expect_error(toy_archive %>% group_by(.drop = "bogus"), - regexp = "\\.drop.*TRUE or FALSE" + regexp = "Must be of type 'logical', not 'character'" ) expect_warning(toy_archive %>% group_by(.drop = FALSE), class = "epiprocess__group_by_epi_archive__drop_FALSE_no_factors" diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R index 4ad692a0..3454d257 100644 --- a/tests/testthat/test-utils.R +++ b/tests/testthat/test-utils.R @@ -1,12 +1,3 @@ -test_that("break_string works properly", { - expect_equal(break_str("A dog is here", 6), "A dog\nis\nhere") -}) - -test_that("Abort and Warn work", { - expect_error(Abort("abort")) - expect_warning(Warn("warn")) -}) - test_that("new summarizing functions work", { x <- c(3, 4, 5, 9, NA) expect_equal(Min(x), 3) From d46aa2f4ca68f02fba7bf589eda6688d830fa673 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Wed, 31 Jan 2024 18:03:50 -0800 Subject: [PATCH 152/345] doc+build: news and version bump --- DESCRIPTION | 2 +- NEWS.md | 287 ++++++++++++++++++++++++++-------------------------- 2 files changed, 145 insertions(+), 144 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 7f36ad7c..7b21d628 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: epiprocess Title: Tools for basic signal processing in epidemiology -Version: 0.7.3 +Version: 0.7.4 Authors@R: c( person("Jacob", "Bien", role = "ctb"), person("Logan", "Brooks", role = "aut"), diff --git a/NEWS.md b/NEWS.md index e4a404e2..d6aca370 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,146 +6,147 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat ## Improvements -* `epi_slide` computations are now 2-4 times faster after changing how +- `epi_slide` computations are now 2-4 times faster after changing how reference time values, made accessible within sliding functions, are calculated (#397). -* regenerated the `jhu_csse_daily_subset` dataset with the latest versions of +- regenerated the `jhu_csse_daily_subset` dataset with the latest versions of the data from the API -* changed approach to versioning, see DEVELOPMENT.md for details -* `select` on grouped `epi_df`s now only drops `epi_df`ness if it makes sense; PR #390 -* Minor documentation updates; PR #393 -* Improved `epi_archive` print method. 
Compactified metadata and shows a snippet
  of the underlying `DT` (#341).
+- Added `autoplot` method for `epi_df` objects, which creates a ggplot2 plot of
+  the `epi_df` (#382).
+- Refactored internals to use `cli` for warnings/errors and `checkmate` for
+  argument checking (#413).

## Breaking changes

- Switched `epi_df`'s `other_keys` default from `NULL` to `character(0)`; PR #390

# epiprocess 0.7.0

## Improvements

- Updated vignettes for compatibility with epidatr 1.0.0 in PR #377.

## Breaking changes

- Changes to `epi_slide` and `epix_slide`:
  - If `f` is a function, it is now required to take at least three arguments.
    `f` must take an `epi_df` with the same column names as the archive's `DT`,
    minus the `version` column; followed by a one-row tibble containing the
    values of the grouping variables for the associated group; followed by a
    reference time value, usually as a `Date` object. Optionally, it can take
    any number of additional arguments after that, and forward values for those
    arguments through `epi[x]_slide`'s `...` args.
  - To make your existing slide computations work, add a third argument to
    your `f` function to accept this new input: e.g., change `f = function(x, g, <other args>) { <body> }`
    to `f = function(x, g, rt, <other args>) { <body> }`.

## New features

- `epi_slide` and `epix_slide` also make the window data, group key and
  reference time value available to slide computations specified as formulas or
  tidy evaluation expressions, in additional or completely new ways.
  - If `f` is a formula, it can now access the reference time value via `.z` or
    `.ref_time_value`.
  - If `f` is missing, the tidy evaluation expression in `...` can now refer to
    the window data as an `epi_df` or `tibble` with `.x`, the group key with
    `.group_key`, and the reference time value with `.ref_time_value`. The usual
    `.data` and `.env` pronouns also work, but `pick()` and `cur_data()` are not
    supported; work off of `.x` instead.
- `epix_slide` has been made more like `dplyr::group_modify`. It will no longer
  perform element/row recycling for size stability, accepts slide computation
  outputs containing any number of rows, and no longer supports `all_rows`.
  - To keep the old behavior, manually perform row recycling within `f`
    computations, and/or `left_join` a data frame representing the desired
    output structure with the current `epix_slide()` result to obtain the
    desired repetitions and completions expected with `all_rows = TRUE`.
- `epix_slide` will only output grouped or ungrouped tibbles. Previously, it
  would sometimes output `epi_df`s, but not consistently, and not always with
  the metadata desired. Future versions will revisit this design, and consider
  more closely whether/when/how to output an `epi_df`.
  - To keep the old behavior, convert the output of `epix_slide()` to `epi_df`
    when desired and set the metadata appropriately.

## Improvements

- `epi_slide` and `epix_slide` now support `as_list_col = TRUE` when the slide
  computations output atomic vectors, and output a list column in "chopped"
  format (see `tidyr::chop`).
- `epi_slide` now works properly with slide computations that output just a
  `Date` vector, rather than converting `slide_value` to a numeric column.
- Fix `?archive_cases_dv_subset` information regarding modifications of upstream
  data by @brookslogan in (#299).
- Update to use updated `epidatr` (`fetch_tbl` -> `fetch`) by @brookslogan in
  (#319).

# epiprocess 0.6.0

## Breaking changes

- Changes to both `epi_slide` and `epix_slide`:
  - The `n`, `align`, and `before` arguments have been replaced by new `before`
    and `after` arguments. To migrate to the new version, replace these
    arguments in every `epi_slide` and `epix_slide` call. If you were only using
    the `n` argument, then this means replacing `n = <n>` with `before = <n> - 1`.
  - `epi_slide`'s time windows now extend `before` time steps before and
    `after` time steps after the corresponding `ref_time_values`. See
    `?epi_slide` for details on matching old alignments.
  - `epix_slide`'s time windows now extend `before` time steps before the
    corresponding `ref_time_values` all the way through the latest data
    available at the corresponding `ref_time_values`.
  - Slide functions now keep any grouping of `x` in their results, like
    `mutate` and `group_modify`.
  - To obtain the old behavior, `dplyr::ungroup` the slide results immediately.
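For instance, the `n` -> `before`/`after` migration above, sketched on a
hypothetical toy `epi_df` (object and column names here are illustrative, not
from the package):

```r
library(dplyr)
library(epiprocess)

toy <- tibble::tibble(
  geo_value = "ca",
  time_value = as.Date("2020-06-01") + 0:13,
  cases = 1:14
) %>%
  as_epi_df(as_of = as.Date("2020-06-15"))

# Old: epi_slide(toy, ~ mean(.x$cases), n = 7)
# New: a 7-day trailing window is `before = 6` (6 steps back plus the
# reference day itself):
toy %>%
  group_by(geo_value) %>%
  epi_slide(~ mean(.x$cases), before = 6, new_col_name = "cases_7dav")
```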
- Additional `epi_slide` changes:
  - When using `as_list_col = TRUE` together with `ref_time_values` and
    `all_rows=TRUE`, the marker for excluded computations is now a `NULL` entry
    in the list column, rather than a `NA`; if you are using `tidyr::unnest()`
    afterward and want to keep these missing data markers, you will need to
    replace the `NULL` entries with `NA`s. Skipped computations are now more
    uniformly detectable using `vctrs` methods.
- Additional `epix_slide` changes:
  - `epix_slide`'s `group_by` argument has been replaced by `dplyr::group_by` and
    `dplyr::ungroup` S3 methods. The `group_by` method uses "data masking" (also
    referred to as "tidy evaluation") rather than "tidy selection".
    - Old syntax:
      - `x %>% epix_slide(<slide args>, group_by=c(col1, col2))`
      - `x %>% epix_slide(<slide args>, group_by=all_of(colname_vector))`
    - New syntax:
      - `x %>% group_by(col1, col2) %>% epix_slide(<slide args>)`
      - `x %>% group_by(across(all_of(colname_vector))) %>% epix_slide(<slide args>)`
  - `epix_slide` no longer defaults to grouping by non-`time_value`, non-`version`
    key columns, instead considering all data to be in one big group.
    - To obtain the old behavior, precede each `epix_slide` call lacking a
      `group_by` argument with an appropriate `group_by` call.
  - `epix_slide` now guesses `ref_time_values` to be a regularly spaced sequence
    covering all the `DT$version` values and the `version_end`, rather than the
    distinct `DT$time_value`s. To obtain the old behavior, pass in
    `ref_time_values = unique(<archive>$DT$time_value)`.
- `epi_archive`'s `clobberable_versions_start`'s default is now `NA`, so there
  will be no warnings by default about potential nonreproducibility. To obtain
  the old behavior, pass in `clobberable_versions_start = max_version_with_row_in(x)`.

## Potentially-breaking changes

- Fixed `[` on grouped `epi_df`s to maintain the grouping if possible when
  dropping the `epi_df` class (e.g., when removing the `time_value` column).
- Fixed `epi_df` operations to be more consistent about decaying into
  non-`epi_df`s when the result of the operation doesn't make sense as an
  `epi_df` (e.g., when removing the `time_value` column).
- Changed `bind_rows` on grouped `epi_df`s to not drop the `epi_df` class. Like
  with ungrouped `epi_df`s, the metadata of the result is still simply taken
  from the first result, and may be inappropriate
  ([#242](https://github.com/cmu-delphi/epiprocess/issues/242)).
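As a concrete sketch of the `epix_slide` grouping migration described above,
assuming the packaged `archive_cases_dv_subset` example archive (with its
`case_rate_7d_av` column):

```r
library(dplyr)
library(epiprocess)

# Old: epix_slide(archive_cases_dv_subset,
#        ~ mean(.x$case_rate_7d_av, na.rm = TRUE), n = 14,
#        group_by = c(geo_value))
# New: group first with the `dplyr::group_by` S3 method, then slide:
archive_cases_dv_subset %>%
  group_by(geo_value) %>%
  epix_slide(~ mean(.x$case_rate_7d_av, na.rm = TRUE), before = 13)
```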
- `epi_slide` and `epix_slide` now raise an error rather than silently filtering
  out `ref_time_values` that don't meet their expectations.

## New features

- `epix_slide`, `<epi_archive>$slide` have a new parameter `all_versions`. With
  `all_versions=TRUE`, `epix_slide` will pass a filtered `epi_archive` to each
  computation rather than an `epi_df` snapshot. This enables, e.g., performing
  pseudoprospective forecasts with a revision-aware forecaster using nested
  `epix_slide` calls.

## Improvements

- Added `dplyr::group_by` and `dplyr::ungroup` S3 methods for `epi_archive`
  objects, plus corresponding `$group_by` and `$ungroup` R6 methods. The
  `group_by` implementation supports the `.add` and `.drop` arguments, and
  `ungroup` supports partial ungrouping with `...`.
- `as_epi_archive`, `epi_archive$new` now perform checks for the key uniqueness
  requirement (part of
  [#154](https://github.com/cmu-delphi/epiprocess/issues/154)).

## Cleanup

- Added a `NEWS.md` file to track changes to the package.
- Implemented `?dplyr::dplyr_extending` for `epi_df`s
  ([#223](https://github.com/cmu-delphi/epiprocess/issues/223)).
- Fixed various small documentation issues ([#217](https://github.com/cmu-delphi/epiprocess/issues/217)).

# epiprocess 0.5.0

## Potentially-breaking changes

- `epix_slide`, `<epi_archive>$slide` now feed `f` an `epi_df` rather than
  converting to a tibble/`tbl_df` first, allowing use of `epi_df` methods and
  metadata, and often yielding `epi_df`s out of the slide as a result. To
  obtain the old behavior, convert to a tibble within `f`.

## Improvements

- Fixed `epix_merge`, `<epi_archive>$merge` always raising error on `sync="truncate"`.

## Cleanup

- Added `Remotes:` entry for `genlasso`, which was removed from CRAN.
- Added `as_epi_archive` tests.
- Added missing `epix_merge` test for `sync="truncate"`.

# epiprocess 0.4.0

## Potentially-breaking changes

- Fixed `[.epi_df` to not reorder columns, which was incompatible with
  downstream packages.
- Changed `[.epi_df` decay-to-tibble logic to be more coherent with `epi_df`s'
  current tolerance of nonunique keys: stopped decaying to a tibble in some
  cases where a unique key wouldn't have been preserved, since we don't
  enforce a unique key elsewhere.
- Fixed `[.epi_df` to adjust `"other_keys"` metadata when corresponding
  columns are selected out.
- Fixed `[.epi_df` to raise an error if resulting column names would be
  nonunique.
- Fixed `[.epi_df` to drop metadata if decaying to a tibble (due to removal
  of essential columns).

## Improvements

- Added check that `epi_df` `additional_metadata` is list.
- Fixed some incorrect `as_epi_df` examples.

## Cleanup

- Applied rename of upstream package in examples: `delphi.epidata` ->
  `epidatr`.
- Rounded out `[.epi_df` tests.

# epiprocess 0.3.0

## Breaking changes

- `as_epi_archive`, `epi_archive$new`:
  - Compactification (see below) by default may change results if working
    directly with the `epi_archive`'s `DT` field; to disable, pass in
    `compactify=FALSE`.
- `epi_archive`'s wrappers and R6 methods have been updated to follow these
  rules regarding reference semantics:
  - `epix_<method>` will not mutate input `epi_archive`s, but may alias them
    or alias their fields (which should not be a worry if a user sticks to
    these `epix_*` functions and "regular" R functions with
    copy-on-write-like behavior, avoiding mutating functions like `[.data.table`).
  - `x$<method>` may mutate `x`; if it mutates `x`, it will return `x`
    invisibly (where this makes sense), and, for each of its fields, may
    either mutate the object to which it refers or reseat the reference (but
    not both); if `x$<method>` does not mutate `x`, its result may contain
    aliases to `x` or its fields.
- `epix_merge`, `<epi_archive>$merge`:
  - Removed `...`, `locf`, and `nan` parameters.
  - Changed the default behavior, which now corresponds to using
    `by=key(x$DT)` (but demanding that it is the same set of column names as
    `key(y$DT)`), `all=TRUE`, `locf=TRUE`, `nan=NaN` (but with the
    post-filling step fixed to only apply to gaps, and no longer fill over
    `NA`s originating from `x$DT` and `y$DT`).
  - `x` and `y` are no longer allowed to share names of non-`by` columns.
  - `epix_merge` no longer mutates its `x` argument (but `<epi_archive>$merge`
    continues to do so).
  - Removed (undocumented) capability of passing a `data.table` as `y`.
- `epix_slide`:
  - Removed inappropriate/misleading `n=7` default argument (due to
    reporting latency, `n=7` will _not_ yield 7 days of data in a typical
    daily-reporting surveillance data source, as one might have assumed).
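A minimal sketch of the post-0.3.0 `epix_merge` defaults described above, on
two toy archives that share only key columns and are equally up to date:

```r
library(epiprocess)

x <- as_epi_archive(tibble::tibble(
  geo_value = "ca", time_value = as.Date("2020-06-01") + 0:1,
  version = as.Date("2020-06-03"), cases = c(100, 120)
))
y <- as_epi_archive(tibble::tibble(
  geo_value = "ca", time_value = as.Date("2020-06-01") + 0:1,
  version = as.Date("2020-06-03"), pct_cli = c(2.3, 2.5)
))
# Joins on the common (geo_value, time_value, version) key with LOCF filling;
# the default `sync = "forbid"` is fine here because both archives have the
# same `versions_end`.
xy <- epix_merge(x, y)
xy$DT
```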
 ## New features

-* `as_epi_archive`, `epi_archive$new`:
-  * New `compactify` parameter allows removal of rows that are redundant for the
+- `as_epi_archive`, `epi_archive$new`:
+  - New `compactify` parameter allows removal of rows that are redundant for the
     purposes of `epi_archive`'s methods, which use the last version of each
    observation carried forward.
-  * New `clobberable_versions_start` field allows marking a range of versions
+  - New `clobberable_versions_start` field allows marking a range of versions
     that could be "clobbered" (rewritten without assigning new version tags);
    previously, this was hard-coded as `max($DT$version)`.
-  * New `versions_end` field allows marking a range of versions beyond
+  - New `versions_end` field allows marking a range of versions beyond
    `max($DT$version)` that were observed, but contained no changes.
-* `epix_merge`, `$merge`:
-  * New `sync` parameter controls what to do if `x` and `y` aren't equally
+- `epix_merge`, `$merge`:
+  - New `sync` parameter controls what to do if `x` and `y` aren't equally
    up to date (i.e., if `x$versions_end` and `y$versions_end` are
    different).
-* New function `epix_fill_through_version`, method
+- New function `epix_fill_through_version`, method
   `$fill_through_version`: non-mutating & mutating way to ensure that an
   archive contains versions at least through some `fill_versions_end`,
   extrapolating according to `how` if necessary.
-* Example archive data object is now constructed on demand from its
+- Example archive data object is now constructed on demand from its
   underlying data, so it will be based on the user's version of `epi_archive`
  rather than an outdated R6 implementation from whenever the data object was
  generated.
@@ -279,130 +280,130 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat

 ## Breaking changes

-* Removed default `n=7` argument to `epix_slide`.
+- Removed default `n=7` argument to `epix_slide`.

 ## Improvements

-* Ignore `NA`s when printing `time_value` range for an `epi_archive`.
-* Fixed misleading column naming in `epix_slide` example.
-* Trimmed down `epi_slide` examples.
-* Synced out-of-date docs.
+- Ignore `NA`s when printing `time_value` range for an `epi_archive`.
+- Fixed misleading column naming in `epix_slide` example.
+- Trimmed down `epi_slide` examples.
+- Synced out-of-date docs.

 ## Cleanup

-* Removed dependency of some `epi_archive` tests on an example archive
+- Removed dependency of some `epi_archive` tests on an example archive
  object, and made them more understandable by reading without running.
-* Fixed `epi_df` tests relying on an S3 method for `epi_df` implemented
+- Fixed `epi_df` tests relying on an S3 method for `epi_df` implemented
  externally to `epiprocess`.
-* Added tests for `epi_archive` methods and wrapper functions.
-* Removed some dead code.
-* Made `.{Rbuild,git}ignore` files more comprehensive.
+- Added tests for `epi_archive` methods and wrapper functions.
+- Removed some dead code.
+- Made `.{Rbuild,git}ignore` files more comprehensive.

 # epiprocess 0.1.2

 ## New features

-* New `new_epi_df` function is similar to `as_epi_df`, but (i) recalculates,
+- New `new_epi_df` function is similar to `as_epi_df`, but (i) recalculates,
  overwrites, and/or drops most metadata of `x` if it has any, (ii) may still
  reorder the columns of `x` even if it's already an `epi_df`, and (iii)
  treats `x` as optional, constructing an empty `epi_df` by default. 
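A minimal sketch of the `new_epi_df` behavior described above (the
empty-by-default construction comes from the entry itself; the exact argument
names are assumptions):

```r
library(epiprocess)

# With `x` omitted, an empty `epi_df` with default metadata is constructed.
empty <- new_epi_df()
attr(empty, "metadata")

# With a data frame supplied, metadata is recalculated/overwritten rather
# than guessed and validated as in `as_epi_df`.
toy <- new_epi_df(
  tibble::tibble(
    geo_value = "us",
    time_value = as.Date("2020-01-01") + 0:1,
    cases = 1:2
  ),
  geo_type = "nation",
  time_type = "day"
)
```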
 ## Improvements

-* Fixed `geo_type` guessing on alphabetical strings with more than 2
+- Fixed `geo_type` guessing on alphabetical strings with more than 2
   characters to yield `"custom"`, not US `"nation"`.
-* Fixed `time_type` guessing to actually detect `Date`-class `time_value`s
+- Fixed `time_type` guessing to actually detect `Date`-class `time_value`s
   regularly spaced 7 days apart as `"week"`-type as intended.
-* Improved printing of `epi_df`s, `epi_archive`s.
-* Fixed `as_of` to not cut off any (forecast-like) data with `time_value >
-  max_version`.
-* Expanded `epi_df` docs to include conversion from `tsibble`/`tbl_ts` objects,
+- Improved printing of `epi_df`s, `epi_archive`s.
+- Fixed `as_of` to not cut off any (forecast-like) data with `time_value >
+max_version`.
+- Expanded `epi_df` docs to include conversion from `tsibble`/`tbl_ts` objects,
   usage of `other_keys`, and pre-processing objects not following the
   `geo_value`, `time_value` naming scheme.
-* Expanded `epi_slide` examples to show how to use an `f` argument with
+- Expanded `epi_slide` examples to show how to use an `f` argument with
   named parameters.
-* Updated examples to print relevant columns given a common 80-column
+- Updated examples to print relevant columns given a common 80-column
   terminal width.
-* Added growth rate examples.
-* Improved `as_epi_archive` and `epi_archive$new`/`$initialize`
+- Added growth rate examples.
+- Improved `as_epi_archive` and `epi_archive$new`/`$initialize`
   documentation, including constructing a toy archive.

 ## Cleanup

-* Added tests for `epi_slide`, `epi_cor`, and internal utility functions.
-* Fixed currently-unused internal utility functions `MiddleL`, `MiddleR` to
+- Added tests for `epi_slide`, `epi_cor`, and internal utility functions.
+- Fixed currently-unused internal utility functions `MiddleL`, `MiddleR` to
   yield correct results on odd-length vectors.

 # epiprocess 0.1.1

 ## New features

-* New example data objects allow one to quickly experiment with `epi_df`s
+- New example data objects allow one to quickly experiment with `epi_df`s
   and `epi_archive`s without relying/waiting on an API to fetch data.

 ## Improvements

-* Improved `epi_slide` error messaging.
-* Fixed description of the appropriate parameters for an `f` argument to
+- Improved `epi_slide` error messaging.
+- Fixed description of the appropriate parameters for an `f` argument to
   `epi_slide`; previous description would give incorrect behavior if `f` had
   named parameters that did not receive values from `epi_slide`'s `...`.
-* Added some examples throughout the package.
-* Using example data objects in vignettes also speeds up vignette compilation.
+- Added some examples throughout the package.
+- Using example data objects in vignettes also speeds up vignette compilation.

 ## Cleanup

-* Set up gh-actions CI.
-* Added tests for `epi_df`s.
+- Set up gh-actions CI.
+- Added tests for `epi_df`s.

 # epiprocess 0.1.0

 ## Implemented core functionality, vignettes

-* Classes:
-  * `epi_df`: specialized `tbl_df` for geotemporal epidemiological time
+- Classes:
+  - `epi_df`: specialized `tbl_df` for geotemporal epidemiological time
     series data, with optional metadata recording other key columns (e.g.,
     demographic breakdowns) and `as_of` what time/version this data was
     current/published. Associated functions:
-  * `as_epi_df` converts to an `epi_df`, guessing the `geo_type`,
+  - `as_epi_df` converts to an `epi_df`, guessing the `geo_type`,
    `time_type`, `other_keys`, and `as_of` if not specified. 
-  * `as_epi_df.tbl_ts` and `as_tsibble.epi_df` automatically set
+  - `as_epi_df.tbl_ts` and `as_tsibble.epi_df` automatically set
    `other_keys` and `key`&`index`, respectively.
-  * `epi_slide` applies a user-supplied computation to a sliding/rolling
+  - `epi_slide` applies a user-supplied computation to a sliding/rolling
    time window and user-specified groups, adding the results as new
    columns, and recycling/broadcasting results to keep the result size
    stable. Allows computation to be provided as a function, `purrr`-style
    formula, or tidyeval dots. Uses `slider` underneath for efficiency.
-  * `epi_cor` calculates Pearson, Kendall, or Spearman correlations
+  - `epi_cor` calculates Pearson, Kendall, or Spearman correlations
    between two (optionally time-shifted) variables in an `epi_df` within
    user-specified groups.
-  * Convenience function: `is_epi_df`.
-  * `epi_archive`: R6 class for version (patch) data for geotemporal
+  - Convenience function: `is_epi_df`.
+  - `epi_archive`: R6 class for version (patch) data for geotemporal
    epidemiological time series data sets. Comes with S3 methods and regular
    functions that wrap around this functionality for those unfamiliar with R6
    methods. Associated functions:
-  * `as_epi_archive`: prepares an `epi_archive` object from a data frame
+  - `as_epi_archive`: prepares an `epi_archive` object from a data frame
      containing snapshots and/or patch data for every available version of
      the data set.
-  * `as_of`: extracts a snapshot of the data set as of some requested
+  - `as_of`: extracts a snapshot of the data set as of some requested
      version, in `epi_df` format.
-  * `epix_slide`, `$slide`: similar to `epi_slide`, but for
+  - `epix_slide`, `$slide`: similar to `epi_slide`, but for
      `epi_archive`s; for each requested `ref_time_value` and group, applies
      a time window and user-specified computation to a snapshot of the data
      as of `ref_time_value`.
-  * `epix_merge`, `$merge`: like `merge` for `epi_archive`s,
+  - `epix_merge`, `$merge`: like `merge` for `epi_archive`s,
      but allowing for the last version of each observation to be carried
      forward to fill in gaps in `x` or `y`.
-  * Convenience function: `is_epi_archive`.
-* Additional functions:
-  * `growth_rate`: estimates growth rate of a time series using one of a few
+  - Convenience function: `is_epi_archive`.
+- Additional functions:
+  - `growth_rate`: estimates growth rate of a time series using one of a few
    built-in `method`s based on relative change, linear regression, smoothing
    splines, or trend filtering.
-  * `detect_outlr`: applies one or more outlier detection methods to a given
+  - `detect_outlr`: applies one or more outlier detection methods to a given
    signal variable, and optionally aggregates the outputs to create a
    consensus result.
-  * `detect_outlr_rm`: outlier detection function based on a
+  - `detect_outlr_rm`: outlier detection function based on a
    rolling median; one of the methods
    included in `detect_outlr`.
-  * `detect_outlr_stl`: outlier detection function based on a seasonal-trend
+  - `detect_outlr_stl`: outlier detection function based on a seasonal-trend
    decomposition using LOESS (STL); one of the methods included in
    `detect_outlr`.
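To ground the 0.1.0 feature list above, a hedged end-to-end sketch using the
bundled example data (the grouping and column choices are illustrative, not
canonical):

```r
library(epiprocess)
library(dplyr)

# Outlier detection: `detect_outlr(x, y)` returns per-point lower/upper
# bounds and replacement values for the requested method(s).
outliers <- incidence_num_outlier_example %>%
  group_by(geo_value) %>%
  mutate(outlier_info = detect_outlr(x = time_value, y = cases)) %>%
  ungroup()

# Growth rates: estimate the per-day relative change for one state.
ca <- jhu_csse_daily_subset %>% filter(geo_value == "ca")
gr <- growth_rate(x = ca$time_value, y = ca$case_rate_7d_av)
```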
From 007438da035f753ac23210f6dcc88a23174723c7 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 7 Feb 2024 12:26:57 -0500 Subject: [PATCH 153/345] use reclass fn --- R/slide.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/slide.R b/R/slide.R index 354e7bc1..9590ba23 100644 --- a/R/slide.R +++ b/R/slide.R @@ -733,7 +733,7 @@ epi_slide_mean = function(x, col_name, ..., before, after, ref_time_values, if (!is_epi_df(result)) { # `all_rows` and `as_list_col` handling strip epi_df format and metadata. # Restore them. - result <- bind_rows(x[c(),], result) + result <- reclass(result, attributes(x)$metadata) } return(result) From 7d1e1c7d7773dd416c6f0d933c45899032da52bb Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 7 Feb 2024 12:33:42 -0500 Subject: [PATCH 154/345] check col name length --- R/slide.R | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/R/slide.R b/R/slide.R index 9590ba23..7f7b536d 100644 --- a/R/slide.R +++ b/R/slide.R @@ -572,6 +572,15 @@ epi_slide_mean = function(x, col_name, ..., before, after, ref_time_values, after <- 0L } + if (length(new_col_name) != 1L && length(new_col_name) != length(col_name)) { + Abort( + "`new_col_name` must be either length 1 or the same length as `col_name`.", + class = "epiprocess__epi_slide_mean__new_col_name_inappropriate_length", + epiprocess__new_col_name = new_col_name, + epiprocess__col_name = col_name + ) + } + pad_early_dates <- c() pad_late_dates <- c() From a54061ae47a99f789234a8665834900d46e51033 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 7 Feb 2024 12:37:34 -0500 Subject: [PATCH 155/345] check that time_step is a function --- NAMESPACE | 3 ++- R/slide.R | 6 ++++-- man/epiprocess.Rd | 2 +- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 20e4ca66..df65c10e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -72,6 +72,7 @@ importFrom(R6,R6Class) importFrom(checkmate,anyInfinite) importFrom(checkmate,assert) importFrom(checkmate,assert_character) +importFrom(checkmate,assert_function) importFrom(checkmate,assert_int) importFrom(cli,cli_inform) importFrom(data.table,":=") @@ -104,8 +105,8 @@ importFrom(dplyr,select) importFrom(dplyr,slice) importFrom(dplyr,tibble) importFrom(dplyr,ungroup) -importFrom(lubridate,as.period) importFrom(ggplot2,autoplot) +importFrom(lubridate,as.period) importFrom(lubridate,days) importFrom(lubridate,weeks) importFrom(magrittr,"%>%") diff --git a/R/slide.R b/R/slide.R index 7f7b536d..63dc6b75 100644 --- a/R/slide.R +++ b/R/slide.R @@ -473,6 +473,7 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' @importFrom purrr map #' @importFrom data.table frollmean #' @importFrom lubridate as.period +#' @importFrom checkmate assert_function #' @export #' @seealso [`epi_slide`] #' @examples @@ -627,7 +628,7 @@ epi_slide_mean = function(x, col_name, ..., before, after, ref_time_values, ) } - # Time_step can be any of `c("days", "weeks", "months", "quarters", "years")` + # `seq` `by` arg can be any of `c("days", "weeks", "months", "quarters", "years")`. all_dates <- seq(min(x$time_value), max(x$time_value), by = by) if (before != 0) { @@ -637,7 +638,8 @@ epi_slide_mean = function(x, col_name, ..., before, after, ref_time_values, pad_late_dates <- End(all_dates) + 1:after } } else { - # A custom time step is specified + # A custom time step is specified. 
+ assert_function(time_step) # Calculate the number of `time_step`s required to go between min and max time # values. This is roundabout because difftime objects, lubridate::period objects, diff --git a/man/epiprocess.Rd b/man/epiprocess.Rd index 7c3ecd8a..bc7ef263 100644 --- a/man/epiprocess.Rd +++ b/man/epiprocess.Rd @@ -2,8 +2,8 @@ % Please edit documentation in R/epiprocess.R \docType{package} \name{epiprocess} -\alias{epiprocess-package} \alias{epiprocess} +\alias{epiprocess-package} \title{epiprocess: Tools for basic signal processing in epidemiology} \description{ This package introduces a common data structure for epidemiological data sets From b548a73244fa08ed36b27f94ce373cdf665ce45e Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 7 Feb 2024 13:19:04 -0500 Subject: [PATCH 156/345] check col name length based on desired name construction --- R/slide.R | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/R/slide.R b/R/slide.R index 63dc6b75..7f468940 100644 --- a/R/slide.R +++ b/R/slide.R @@ -573,15 +573,6 @@ epi_slide_mean = function(x, col_name, ..., before, after, ref_time_values, after <- 0L } - if (length(new_col_name) != 1L && length(new_col_name) != length(col_name)) { - Abort( - "`new_col_name` must be either length 1 or the same length as `col_name`.", - class = "epiprocess__epi_slide_mean__new_col_name_inappropriate_length", - epiprocess__new_col_name = new_col_name, - epiprocess__col_name = col_name - ) - } - pad_early_dates <- c() pad_late_dates <- c() @@ -658,14 +649,29 @@ epi_slide_mean = function(x, col_name, ..., before, after, ref_time_values, } } - # `frollmean` is 1-indexed, so create a new window width based on our # `before` and `after` params. m <- before + after + 1L - if (is.null(names_sep)) { + if (is.null(names_sep) && !as_list_col) { + if (length(new_col_name) != length(col_name)) { + Abort( + "`new_col_name` must be the same length as `col_name` when `names_sep` is NULL.", + class = "epiprocess__epi_slide_mean__col_name_length_mismatch", + epiprocess__new_col_name = new_col_name, + epiprocess__col_name = col_name + ) + } result_col_name <- new_col_name } else { + if (length(new_col_name) != 1L && length(new_col_name) != length(col_name)) { + Abort( + "`new_col_name` must be either length 1 or the same length as `col_name`.", + class = "epiprocess__epi_slide_mean__col_name_length_mismatch_and_not_one", + epiprocess__new_col_name = new_col_name, + epiprocess__col_name = col_name + ) + } result_col_name <- paste(new_col_name, col_name, sep = names_sep) } From a429145c96c4b692ec442758630a3947ae5e9db3 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 8 Feb 2024 12:21:29 -0500 Subject: [PATCH 157/345] move full date seq generation to func and test --- R/slide.R | 169 ++++++++++++++++------------- tests/testthat/test-epi_slide.R | 185 ++++++++++++++++++++++++-------- 2 files changed, 235 insertions(+), 119 deletions(-) diff --git a/R/slide.R b/R/slide.R index 7f468940..57db329b 100644 --- a/R/slide.R +++ b/R/slide.R @@ -573,81 +573,11 @@ epi_slide_mean = function(x, col_name, ..., before, after, ref_time_values, after <- 0L } - pad_early_dates <- c() - pad_late_dates <- c() - - # If dates are one of tsibble-provided classes, can step by numeric. `tsibble` - # defines a step of 1 to automatically be the quantum (smallest resolvable - # unit) of the date class. For example, one step = 1 quarter for `yearquarter`. 
- # - # `tsibble` classes apparently can't be added to in different units, so even - # if `time_step` is provided by the user, use a unit step. - if (inherits(x$time_value, c("yearquarter", "yearweek", "yearmonth")) || - is.numeric(x$time_value)) { - all_dates <- seq(min(x$time_value), max(x$time_value), by = 1L) - - if (before != 0) { - pad_early_dates <- Start(all_dates) - before:1 - } - if (after != 0) { - pad_late_dates <- End(all_dates) + 1:after - } - } else if (missing(time_step)) { - # Guess what `by` should be based on the epi_df's `time_type`. - ttype <- attributes(x)$metadata$time_type - by <- switch(ttype, - day = "days", - week = "weeks", - yearweek = "weeks", - yearmonth = "months", - yearquarter = "quarters", - year = "years", - NA # default value for "custom", "day-time" - ) - - if (is.na(by)) { - Abort( - c( - "`frollmean` requires a full window to compute a result, but - `time_type` associated with the epi_df was not mappable to period - type valid for creating a date sequence.", - "i" = c("The input data's `time_type` was probably `custom` or `day-time`. - These require also passing a `time_step` function.") - ), - class = "epiprocess__epi_slide_mean__unmappable_time_type", - epiprocess__time_type = ttype - ) - } - - # `seq` `by` arg can be any of `c("days", "weeks", "months", "quarters", "years")`. - all_dates <- seq(min(x$time_value), max(x$time_value), by = by) - - if (before != 0) { - pad_early_dates <- Start(all_dates) - before:1 - } - if (after != 0) { - pad_late_dates <- End(all_dates) + 1:after - } - } else { - # A custom time step is specified. - assert_function(time_step) - - # Calculate the number of `time_step`s required to go between min and max time - # values. This is roundabout because difftime objects, lubridate::period objects, - # and Dates are hard to convert to the same time scale and add. - t_elapsed_s <- difftime(max(x$time_value), min(x$time_value), units = "secs") - step_size_s <- lubridate::as.period(time_step(1), unit = "secs") - n_steps <- ceiling(as.numeric(t_elapsed_s) / as.numeric(step_size_s)) - - all_dates <- min(x$time_value) + time_step(0:n_steps) - - if (before != 0) { - pad_early_dates <- Start(all_dates) - time_step(before:1) - } - if (after != 0) { - pad_late_dates <- End(all_dates) + time_step(1:after) - } - } + # Make a complete date sequence between min(x$time_value) and max(x$time_value). + date_seq_list <- full_date_seq(x, before, after, time_step) + all_dates <- date_seq_list$all_dates + pad_early_dates <- date_seq_list$pad_early_dates + pad_late_dates <- date_seq_list$pad_late_dates # `frollmean` is 1-indexed, so create a new window width based on our # `before` and `after` params. @@ -756,3 +686,92 @@ epi_slide_mean = function(x, col_name, ..., before, after, ref_time_values, return(result) } +#' Make a complete date sequence between min(x$time_value) and max +#' (x$time_value). Produce lists of dates before min(x$time_value) and after +#' max(x$time_value) for padding initial and final windows to size `n`. +#' +#' @importFrom checkmate assert_function +#' @noRd +full_date_seq <- function(x, before, after, time_step) { + pad_early_dates <- c() + pad_late_dates <- c() + + # If dates are one of tsibble-provided classes, can step by numeric. `tsibble` + # defines a step of 1 to automatically be the quantum (smallest resolvable + # unit) of the date class. For example, one step = 1 quarter for `yearquarter`. 
+ # + # `tsibble` classes apparently can't be added to in different units, so even + # if `time_step` is provided by the user, use a unit step. + if (inherits(x$time_value, c("yearquarter", "yearweek", "yearmonth")) || + is.numeric(x$time_value)) { + all_dates <- seq(min(x$time_value), max(x$time_value), by = 1L) + + if (before != 0) { + pad_early_dates <- Start(all_dates) - before:1 + } + if (after != 0) { + pad_late_dates <- End(all_dates) + 1:after + } + } else if (missing(time_step)) { + # Guess what `by` should be based on the epi_df's `time_type`. + ttype <- attributes(x)$metadata$time_type + by <- switch(ttype, + day = "days", + week = "weeks", + yearweek = "weeks", + yearmonth = "months", + yearquarter = "quarters", + year = "years", + NA # default value for "custom", "day-time" + ) + + if (is.na(by)) { + Abort( + c( + "`frollmean` requires a full window to compute a result, but + `time_type` associated with the epi_df was not mappable to period + type valid for creating a date sequence.", + "i" = c("The input data's `time_type` was probably `custom` or `day-time`. + These require also passing a `time_step` function.") + ), + class = "epiprocess__epi_slide_mean__unmappable_time_type", + epiprocess__time_type = ttype + ) + } + + # `seq` `by` arg can be any of `c("days", "weeks", "months", "quarters", "years")`. + all_dates <- seq(min(x$time_value), max(x$time_value), by = by) + + if (before != 0) { + pad_early_dates <- Start(all_dates) - before:1 + } + if (after != 0) { + pad_late_dates <- End(all_dates) + 1:after + } + } else { + # A custom time step is specified. + assert_function(time_step) + + # Calculate the number of `time_step`s required to go between min and max time + # values. This is roundabout because difftime objects, lubridate::period objects, + # and Dates are hard to convert to the same time scale and add. 
+ t_elapsed_s <- difftime(max(x$time_value), min(x$time_value), units = "secs") + step_size_s <- lubridate::as.period(time_step(1), unit = "secs") + n_steps <- ceiling(as.numeric(t_elapsed_s) / as.numeric(step_size_s)) + + all_dates <- min(x$time_value) + time_step(0:n_steps) + + if (before != 0) { + pad_early_dates <- Start(all_dates) - time_step(before:1) + } + if (after != 0) { + pad_late_dates <- End(all_dates) + time_step(1:after) + } + } + + return(list( + all_dates = all_dates, + pad_early_dates = pad_early_dates, + pad_late_dates = pad_late_dates + )) +} \ No newline at end of file diff --git a/tests/testthat/test-epi_slide.R b/tests/testthat/test-epi_slide.R index 444f3ad4..266ef64f 100644 --- a/tests/testthat/test-epi_slide.R +++ b/tests/testthat/test-epi_slide.R @@ -958,11 +958,33 @@ test_that("results for different `before`s and `after`s match between epi_slide test_time_type_mean(days, rand_vals, before = 0, after = 1) }) +set.seed(0) +rand_vals <- rnorm(n_obs) + +generate_special_date_data <- function(date_seq) { + epiprocess::as_epi_df(rbind(tibble( + geo_value = "al", + time_value = date_seq, + a = 1:length(date_seq), + b = rand_vals + ), tibble( + geo_value = "ca", + time_value = date_seq, + a = length(date_seq):1, + b = rand_vals + 10 + ), tibble( + geo_value = "fl", + time_value = date_seq, + a = length(date_seq):1, + b = rand_vals * 2 + ))) +} + test_that("results for different time_types match between epi_slide and epi_slide_mean", { - n <- 6 # Max date index - m <- 1 # Number of missing dates - n_obs <- n + 1 - m # Number of obs created - k <- c(0:(n-(m + 1)), n) # Date indices + n <- 6L # Max date index + m <- 1L # Number of missing dates + n_obs <- n + 1L - m # Number of obs created + k <- c(0L:(n-(m + 1L)), n) # Date indices # Basic time type days <- as.Date("2022-01-01") + k @@ -970,13 +992,13 @@ test_that("results for different time_types match between epi_slide and epi_slid # Require lubridate::period function to be passed as `time_step` day_times_minute <- lubridate::ydm_h("2022-01-01-15") + lubridate::minutes(k) # needs time_step = lubridate::minutes day_times_hour <- lubridate::ydm_h("2022-01-01-15") + lubridate::hours(k) # needs time_step = lubridate::hours - weeks <- as.Date("2022-01-01") + 7 * k # needs time_step = lubridate::weeks + weeks <- as.Date("2022-01-01") + 7L * k # needs time_step = lubridate::weeks # Don't require a `time_step` fn - yearweeks <- tsibble::yearweek(10 + k) - yearmonths <- tsibble::yearmonth(10 + k) - yearquarters <- tsibble::yearquarter(10 + k) - years <- 2000 + k # does NOT need time_step = lubridate::years because dates are numeric, not a special date format + yearweeks <- tsibble::yearweek(10L + k) + yearmonths <- tsibble::yearmonth(10L + k) + yearquarters <- tsibble::yearquarter(10L + k) + years <- 2000L + k # does NOT need time_step = lubridate::years because dates are numeric, not a special date format # Not supported custom_dates <- c( @@ -985,25 +1007,7 @@ test_that("results for different time_types match between epi_slide and epi_slid ) not_dates <- c("a", "b", "c", "d", "e", "f") - set.seed(0) - rand_vals <- rnorm(n_obs) - - ref_epi_data <- epiprocess::as_epi_df(rbind(tibble( - geo_value = "al", - time_value = days, - a = 1:n_obs, - b = rand_vals - ), tibble( - geo_value = "ca", - time_value = days, - a = n_obs:1, - b = rand_vals + 10 - ), tibble( - geo_value = "fl", - time_value = days, - a = n_obs:1, - b = rand_vals * 2 - ))) %>% + ref_epi_data <- generate_special_date_data(days) %>% group_by(geo_value) ref_result <- 
epi_slide(ref_epi_data, ~ data.frame( @@ -1015,22 +1019,7 @@ test_that("results for different time_types match between epi_slide and epi_slid test_time_type_mean <- function (dates, before = 6L, after = 0L, ...) { # Three states, with 2 variables. a is linear, going up in one state and down in the other # b is just random. date 10 is missing - epi_data <- epiprocess::as_epi_df(rbind(tibble( - geo_value = "al", - time_value = dates, - a = 1:n_obs, - b = rand_vals - ), tibble( - geo_value = "ca", - time_value = dates, - a = n_obs:1, - b = rand_vals + 10 - ), tibble( - geo_value = "fl", - time_value = dates, - a = n_obs:1, - b = rand_vals * 2 - ))) %>% + epi_data <- generate_special_date_data(dates) %>% group_by(geo_value) result1 <- epi_slide(epi_data, ~ data.frame( @@ -1099,3 +1088,111 @@ test_that("special time_types without time_step fail in epi_slide_mean", { # day. # test_time_type_mean(weeks) }) + +test_that("helper `full_date_seq` returns expected date values", { + n <- 6L # Max date index + m <- 1L # Number of missing dates + n_obs <- n + 1L - m # Number of obs created + k <- c(0L:(n-(m + 1L)), n) # Date indices + + # Basic time type + days <- as.Date("2022-01-01") + k + + # Require lubridate::period function to be passed as `time_step` + day_times_minute <- lubridate::ydm_h("2022-01-01-15") + lubridate::minutes(k) # needs time_step = lubridate::minutes + day_times_hour <- lubridate::ydm_h("2022-01-01-15") + lubridate::hours(k) # needs time_step = lubridate::hours + weeks <- as.Date("2022-01-01") + 7L * k # needs time_step = lubridate::weeks + + # Don't require a `time_step` fn + yearweeks <- tsibble::yearweek(10L + k) + yearmonths <- tsibble::yearmonth(10L + k) + yearquarters <- tsibble::yearquarter(10L + k) + years <- 2000L + k # does NOT need time_step = lubridate::years because dates are numeric, not a special date format + + before <- 2L + after <- 1L + + expect_identical( + full_date_seq( + generate_special_date_data(days), before = before, after = after + ), + list( + all_dates = as.Date(c("2022-01-01", "2022-01-02", "2022-01-03", "2022-01-04", "2022-01-05", "2022-01-06", "2022-01-07")), + pad_early_dates = as.Date(c("2021-12-30", "2021-12-31")), + pad_late_dates = as.Date(c("2022-01-08")) + ) + ) + expect_identical( + full_date_seq( + generate_special_date_data(yearweeks), before = before, after = after + ), + list( + all_dates = tsibble::yearweek(10:16), + pad_early_dates = tsibble::yearweek(8:9), + pad_late_dates = tsibble::yearweek(17) + ) + ) + expect_identical( + full_date_seq( + generate_special_date_data(yearmonths), before = before, after = after + ), + list( + all_dates = tsibble::yearmonth(10:16), + pad_early_dates = tsibble::yearmonth(8:9), + pad_late_dates = tsibble::yearmonth(17) + ) + ) + expect_identical( + full_date_seq( + generate_special_date_data(yearquarters), before = before, after = after + ), + list( + all_dates = tsibble::yearquarter(10:16), + pad_early_dates = tsibble::yearquarter(8:9), + pad_late_dates = tsibble::yearquarter(17) + ) + ) + expect_identical( + full_date_seq( + generate_special_date_data(years), before = before, after = after + ), + list( + all_dates = 2000L:2006L, + pad_early_dates = 1998L:1999L, + pad_late_dates = 2007L + ) + ) + expect_identical( + full_date_seq( + generate_special_date_data(day_times_minute), before = before, after = after, + time_step = lubridate::minutes + ), + list( + all_dates = lubridate::ydm_h("2022-01-01-15") + lubridate::minutes(0:6), + pad_early_dates = lubridate::ydm_h("2022-01-01-15") - 
lubridate::minutes(2:1),
+      pad_late_dates = lubridate::ydm_h("2022-01-01-15") + lubridate::minutes(7)
+    )
+  )
+  expect_identical(
+    full_date_seq(
+      generate_special_date_data(day_times_hour), before = before, after = after,
+      time_step = lubridate::hours
+    ),
+    list(
+      all_dates = lubridate::ydm_h("2022-01-01-15") + lubridate::hours(0:6),
+      pad_early_dates = lubridate::ydm_h("2022-01-01-15") - lubridate::hours(2:1),
+      pad_late_dates = lubridate::ydm_h("2022-01-01-15") + lubridate::hours(7)
+    )
+  )
+  expect_identical(
+    full_date_seq(
+      generate_special_date_data(weeks), before = before, after = after,
+      time_step = lubridate::weeks
+    ),
+    list(
+      all_dates = as.Date(c("2022-01-01", "2022-01-08", "2022-01-15", "2022-01-22", "2022-01-29", "2022-02-05", "2022-02-12")),
+      pad_early_dates = as.Date(c("2021-12-18", "2021-12-25")),
+      pad_late_dates = as.Date(c("2022-02-19"))
+    )
+  )
+})
\ No newline at end of file

From 64ee99eba0606e69be55998386f950b8e4abade0 Mon Sep 17 00:00:00 2001
From: Nat DeFries <42820733+nmdefries@users.noreply.github.com>
Date: Thu, 8 Feb 2024 13:00:01 -0500
Subject: [PATCH 158/345] trigger week time_type even when date seq is
 incomplete

---
 R/utils.R | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/R/utils.R b/R/utils.R
index 6bef5e0a..2ab52328 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -444,7 +444,13 @@ guess_time_type <- function(time_value) {
     return("day-time")
   }
   # Else, if a Date class, then use "week" or "day" depending on gaps
   else if (inherits(time_value, "Date")) {
-    return(ifelse(all(diff(sort(time_value)) == 7), "week", "day"))
+    # Convert to numeric so we can use the modulo operator.
+    unique_time_gaps <- as.numeric(diff(sort(unique(time_value))))
+    # We need to check the modulus of `unique_time_gaps` in case there are
+    # missing dates. Gaps in a weekly date sequence will cause some diffs to
+    # be larger than 7 days. If we just check if `diffs == 7`, it will fail
+    # unless the weekly date sequence is already complete.
+    return(ifelse(all(unique_time_gaps %% 7 == 0), "week", "day"))
   }

   # Else, check whether it's one of the tsibble classes

From 1e6be8fb4b47b553dabf0c2689dba80b598bfeef Mon Sep 17 00:00:00 2001
From: Nat DeFries <42820733+nmdefries@users.noreply.github.com>
Date: Thu, 8 Feb 2024 16:46:52 -0500
Subject: [PATCH 159/345] bump version, news

---
 DESCRIPTION | 2 +-
 NEWS.md     | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 7b21d628..ca6a367e 100755
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: epiprocess
 Title: Tools for basic signal processing in epidemiology
-Version: 0.7.4
+Version: 0.7.5
 Authors@R: c(
     person("Jacob", "Bien", role = "ctb"),
     person("Logan", "Brooks", role = "aut"),
diff --git a/NEWS.md b/NEWS.md
index d6aca370..5bf584e6 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -16,10 +16,11 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat
 - Minor documentation updates; PR #393
 - Improved `epi_archive` print method. Compactified metadata and shows a snippet
   of the underlying `DT` (#341).
-- Added `autoplot` method for `epi_df` objects, which creates a ggplot2 plot of
+- Added `autoplot` method for `epi_df` objects, which creates a `ggplot2` plot of
   the `epi_df` (#382).
- Refactored internals to use `cli` for warnings/errors and `checkmate` for
  argument checking (#413).
+- Fix logic to auto-assign `epi_df` `time_type` to `week` (#416). 
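The week-detection rule patched above is easy to sanity-check by hand (toy
dates, not taken from the test suite):

```r
# Weekly data with one missing week: the gaps are 7 and 14 days, both
# multiples of 7, so the modulo test still classifies the data as "week",
# where the old strict `diff == 7` check would have fallen back to "day".
weekly <- as.Date("2024-01-07") + 7 * c(0, 1, 3, 4)
as.numeric(diff(sort(unique(weekly)))) # 7 14 7
all(as.numeric(diff(sort(unique(weekly)))) %% 7 == 0) # TRUE
```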
## Breaking changes From 5fa8aa046e371f448b6598fb490a769bf7fe85d2 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 8 Feb 2024 16:52:52 -0500 Subject: [PATCH 160/345] bump version, news --- DESCRIPTION | 2 +- NEWS.md | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 7f36ad7c..dcc6e07d 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: epiprocess Title: Tools for basic signal processing in epidemiology -Version: 0.7.3 +Version: 0.7.6 Authors@R: c( person("Jacob", "Bien", role = "ctb"), person("Logan", "Brooks", role = "aut"), diff --git a/NEWS.md b/NEWS.md index e4a404e2..edb847b8 100644 --- a/NEWS.md +++ b/NEWS.md @@ -9,6 +9,8 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat * `epi_slide` computations are now 2-4 times faster after changing how reference time values, made accessible within sliding functions, are calculated (#397). +- Add new `epi_slide_mean` function to allow much (~30x) faster rolling + average computations in some cases (#400). * regenerated the `jhu_csse_daily_subset` dataset with the latest versions of the data from the API * changed approach to versioning, see DEVELOPMENT.md for details From f3f16e3107dc76ef14d82e02bb2ae31099086c81 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Thu, 8 Feb 2024 14:14:41 -0800 Subject: [PATCH 161/345] add tests, and style --- R/autoplot.R | 24 +++++++++++++++--------- R/epi_df.R | 8 +++++--- R/outliers.R | 8 +++++--- tests/testthat/test-utils.R | 7 +++++++ 4 files changed, 32 insertions(+), 15 deletions(-) diff --git a/R/autoplot.R b/R/autoplot.R index ffd9f038..8686fb24 100644 --- a/R/autoplot.R +++ b/R/autoplot.R @@ -32,12 +32,15 @@ #' .color_by = "none", #' .facet_by = "geo_value" #' ) -#' autoplot(jhu_csse_daily_subset, case_rate_7d_av, .color_by = "none", -#' .base_color = "red", .facet_by = "geo_value") +#' autoplot(jhu_csse_daily_subset, case_rate_7d_av, +#' .color_by = "none", +#' .base_color = "red", .facet_by = "geo_value" +#' ) #' #' # .base_color specification won't have any effect due .color_by default #' autoplot(jhu_csse_daily_subset, case_rate_7d_av, -#' .base_color = "red", .facet_by = "geo_value") +#' .base_color = "red", .facet_by = "geo_value" +#' ) autoplot.epi_df <- function( object, ..., .color_by = c("all_keys", "geo_value", "other_keys", ".response", "all", "none"), @@ -59,7 +62,8 @@ autoplot.epi_df <- function( allowed <- allowed[allowed] if (length(allowed) == 0 && rlang::dots_n(...) == 0L) { cli::cli_abort("No numeric variables were available to plot automatically.", - class = "epiprocess__no_numeric_vars_available") + class = "epiprocess__no_numeric_vars_available" + ) } vars <- tidyselect::eval_select(rlang::expr(c(...)), object) if (rlang::is_empty(vars)) { # find them automatically if unspecified @@ -76,11 +80,13 @@ autoplot.epi_df <- function( class = "epiprocess__all_requested_vars_not_numeric" ) } else if (!all(ok)) { - cli::cli_warn(c( - "Only the requested variables {.var {names(vars)[ok]}} are numeric.", - i = "`autoplot()` cannot display {.var {names(vars)[!ok]}}." - ), - class = "epiprocess__some_requested_vars_not_numeric") + cli::cli_warn( + c( + "Only the requested variables {.var {names(vars)[ok]}} are numeric.", + i = "`autoplot()` cannot display {.var {names(vars)[!ok]}}." 
+ ), + class = "epiprocess__some_requested_vars_not_numeric" + ) vars <- vars[ok] } } diff --git a/R/epi_df.R b/R/epi_df.R index 1c648ff8..0334e1d0 100644 --- a/R/epi_df.R +++ b/R/epi_df.R @@ -297,9 +297,11 @@ as_epi_df.epi_df <- function(x, ...) { #' @export as_epi_df.tbl_df <- function(x, geo_type, time_type, as_of, additional_metadata = list(), ...) { - if (!test_subset(c("geo_value", "time_value"), names(x))) cli_abort( - "Columns `geo_value` and `time_value` must be present in `x`." - ) + if (!test_subset(c("geo_value", "time_value"), names(x))) { + cli_abort( + "Columns `geo_value` and `time_value` must be present in `x`." + ) + } new_epi_df( x, geo_type, time_type, as_of, diff --git a/R/outliers.R b/R/outliers.R index ee59d64b..a8051dbd 100644 --- a/R/outliers.R +++ b/R/outliers.R @@ -109,9 +109,11 @@ detect_outlr <- function(x = seq_along(y), y, # Validate the output assert_data_frame(results) - if (!test_subset(c("lower", "upper", "replacement"), colnames(results))) cli_abort( - "Columns `lower`, `upper`, and `replacement` must be present in the output of the outlier detection method." - ) + if (!test_subset(c("lower", "upper", "replacement"), colnames(results))) { + cli_abort( + "Columns `lower`, `upper`, and `replacement` must be present in the output of the outlier detection method." + ) + } # Update column names with model abbreviation colnames(results) <- paste(abbr, colnames(results), sep = "_") diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R index 3454d257..11d6e864 100644 --- a/tests/testthat/test-utils.R +++ b/tests/testthat/test-utils.R @@ -89,6 +89,13 @@ test_that("guess_time_type works for different types", { expect_equal(guess_time_type(not_ymd3), "custom") expect_equal(guess_time_type(not_a_date), "custom") }) +3 +test_that("guess_time_type works with gaps", { + days_gaps <- as.Date("2022-01-01") + c(0, 1, 3, 4, 8, 8 + 7) + weeks_gaps <- as.Date("2022-01-01") + 7 * c(0, 1, 3, 4, 8, 8 + 7) + expect_equal(guess_time_type(days_gaps), "day") + expect_equal(guess_time_type(weeks_gaps), "week") +}) test_that("enlist works", { my_list <- enlist(x = 1, y = 2, z = 3) From 4bf2a29e2583e7cafc4d6c86cca285c87c3a9f09 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 8 Feb 2024 17:44:43 -0500 Subject: [PATCH 162/345] fix pad date generation when time_step is missing --- R/slide.R | 15 ++++++++++++--- tests/testthat/test-epi_slide.R | 15 +++++++++++++-- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/R/slide.R b/R/slide.R index 57db329b..7bd4489c 100644 --- a/R/slide.R +++ b/R/slide.R @@ -739,14 +739,23 @@ full_date_seq <- function(x, before, after, time_step) { ) } - # `seq` `by` arg can be any of `c("days", "weeks", "months", "quarters", "years")`. + # `seq.Date` `by` arg can be any of `c("days", "weeks", "months", "quarters", "years")`. all_dates <- seq(min(x$time_value), max(x$time_value), by = by) if (before != 0) { - pad_early_dates <- Start(all_dates) - before:1 + # Use `seq.Date` here to avoid having to map `epi_df` `time_type` to + # `time_step` functions. + # + # The first element `seq.Date` returns is always equal to the provided + # `from` date (`from + 0`). The full return value is equivalent to + # `from + 0:n`. In our case, we `from + 1:n`, so drop the first + # element. + # + # Adding "-1" to the `by` arg makes `seq.Date` go backwards in time. 
+ pad_early_dates <- sort(seq(Start(all_dates), by = paste("-1", by), length.out = before + 1)[-1]) } if (after != 0) { - pad_late_dates <- End(all_dates) + 1:after + pad_late_dates <- seq(End(all_dates), by = by, length.out = after + 1)[-1] } } else { # A custom time step is specified. diff --git a/tests/testthat/test-epi_slide.R b/tests/testthat/test-epi_slide.R index 266ef64f..41012d9e 100644 --- a/tests/testthat/test-epi_slide.R +++ b/tests/testthat/test-epi_slide.R @@ -961,7 +961,7 @@ test_that("results for different `before`s and `after`s match between epi_slide set.seed(0) rand_vals <- rnorm(n_obs) -generate_special_date_data <- function(date_seq) { +generate_special_date_data <- function(date_seq, ...) { epiprocess::as_epi_df(rbind(tibble( geo_value = "al", time_value = date_seq, @@ -977,7 +977,7 @@ generate_special_date_data <- function(date_seq) { time_value = date_seq, a = length(date_seq):1, b = rand_vals * 2 - ))) + )), ...) } test_that("results for different time_types match between epi_slide and epi_slide_mean", { @@ -1195,4 +1195,15 @@ test_that("helper `full_date_seq` returns expected date values", { pad_late_dates = as.Date(c("2022-02-19")) ) ) + # Check the middle branch (`if (missing(time_step))`) of `full_date_seq`. + expect_identical( + full_date_seq( + generate_special_date_data(weeks, time_type = "week"), before = before, after = after + ), + list( + all_dates = as.Date(c("2022-01-01", "2022-01-08", "2022-01-15", "2022-01-22", "2022-01-29", "2022-02-05", "2022-02-12")), + pad_early_dates = as.Date(c("2021-12-18", "2021-12-25")), + pad_late_dates = as.Date(c("2022-02-19")) + ) + ) }) \ No newline at end of file From 4e10a2bcd535036a90432ae809cca13e9bd46cf6 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 8 Feb 2024 18:41:14 -0500 Subject: [PATCH 163/345] use cli_* instead of Abort and Warn --- NAMESPACE | 2 +- R/slide.R | 50 ++++++++----------- man/autoplot.epi_df.Rd | 9 ++-- tests/testthat/test-epi_slide.R | 86 +++++++++++++++++++++------------ 4 files changed, 81 insertions(+), 66 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 22bb2182..83693894 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -73,9 +73,9 @@ importFrom(checkmate,anyInfinite) importFrom(checkmate,anyMissing) importFrom(checkmate,assert) importFrom(checkmate,assert_character) -importFrom(checkmate,assert_function) importFrom(checkmate,assert_class) importFrom(checkmate,assert_data_frame) +importFrom(checkmate,assert_function) importFrom(checkmate,assert_int) importFrom(checkmate,assert_list) importFrom(checkmate,assert_logical) diff --git a/R/slide.R b/R/slide.R index 9c557aca..ed95d0c1 100644 --- a/R/slide.R +++ b/R/slide.R @@ -506,48 +506,38 @@ epi_slide_mean = function(x, col_name, ..., before, after, ref_time_values, time_step, new_col_name = "slide_value", as_list_col = FALSE, names_sep = "_", all_rows = FALSE) { - # Check we have an `epi_df` object - if (!inherits(x, "epi_df")) Abort("`x` must be of class `epi_df`.") + assert_class(x, "epi_df") user_provided_rtvs <- !missing(ref_time_values) if (!user_provided_rtvs) { ref_time_values <- unique(x$time_value) - } - - # Some of these `ref_time_values` checks and processing steps also apply to - # the `ref_time_values` default; for simplicity, just apply all the steps - # regardless of whether we are working with a default or user-provided - # `ref_time_values`: - if (length(ref_time_values) == 0L) { - Abort("`ref_time_values` must have at least one element.") - } else if 
(any(is.na(ref_time_values))) { - Abort("`ref_time_values` must not include `NA`.") - } else if (anyDuplicated(ref_time_values) != 0L) { - Abort("`ref_time_values` must not contain any duplicates; use `unique` if appropriate.") - } else if (!all(ref_time_values %in% unique(x$time_value))) { - Abort("All `ref_time_values` must appear in `x$time_value`.") } else { - ref_time_values <- sort(ref_time_values) + assert_numeric(ref_time_values, min.len = 1L, null.ok = FALSE, any.missing = FALSE) + if (!test_subset(ref_time_values, unique(x$time_value))) { + cli_abort( + "`ref_time_values` must be a unique subset of the time values in `x`." + ) + } + if (anyDuplicated(ref_time_values) != 0L) { + cli_abort("`ref_time_values` must not contain any duplicates; use `unique` if appropriate.") + } } + ref_time_values <- sort(ref_time_values) # Validate and pre-process `before`, `after`: if (!missing(before)) { before <- vctrs::vec_cast(before, integer()) - if (length(before) != 1L || is.na(before) || before < 0L) { - Abort("`before` must be length-1, non-NA, non-negative") - } + assert_int(before, lower = 0, null.ok = FALSE, na.ok = FALSE) } if (!missing(after)) { after <- vctrs::vec_cast(after, integer()) - if (length(after) != 1L || is.na(after) || after < 0L) { - Abort("`after` must be length-1, non-NA, non-negative") - } + assert_int(after, lower = 0, null.ok = FALSE, na.ok = FALSE) } if (missing(before)) { if (missing(after)) { - Abort("Either or both of `before`, `after` must be provided.") + cli_abort("Either or both of `before`, `after` must be provided.") } else if (after == 0L) { - Warn("`before` missing, `after==0`; maybe this was intended to be some + cli_warn("`before` missing, `after==0`; maybe this was intended to be some non-zero-width trailing window, but since `before` appears to be missing, it's interpreted as a zero-width window (`before=0, after=0`).") @@ -555,7 +545,7 @@ epi_slide_mean = function(x, col_name, ..., before, after, ref_time_values, before <- 0L } else if (missing(after)) { if (before == 0L) { - Warn("`before==0`, `after` missing; maybe this was intended to be some + cli_warn("`before==0`, `after` missing; maybe this was intended to be some non-zero-width leading window, but since `after` appears to be missing, it's interpreted as a zero-width window (`before=0, after=0`).") @@ -575,7 +565,7 @@ epi_slide_mean = function(x, col_name, ..., before, after, ref_time_values, if (is.null(names_sep) && !as_list_col) { if (length(new_col_name) != length(col_name)) { - Abort( + cli_abort( "`new_col_name` must be the same length as `col_name` when `names_sep` is NULL.", class = "epiprocess__epi_slide_mean__col_name_length_mismatch", epiprocess__new_col_name = new_col_name, @@ -585,7 +575,7 @@ epi_slide_mean = function(x, col_name, ..., before, after, ref_time_values, result_col_name <- new_col_name } else { if (length(new_col_name) != 1L && length(new_col_name) != length(col_name)) { - Abort( + cli_abort( "`new_col_name` must be either length 1 or the same length as `col_name`.", class = "epiprocess__epi_slide_mean__col_name_length_mismatch_and_not_one", epiprocess__new_col_name = new_col_name, @@ -613,7 +603,7 @@ epi_slide_mean = function(x, col_name, ..., before, after, ref_time_values, # same date, `epi_slide_mean` will produce incorrect results; `epi_slide` # should be used instead. if (anyDuplicated(.data_group$time_value) > 0) { - Abort(c( + cli_abort(c( "group contains duplicate time values. 
Using `epi_slide_mean` on this group will result in incorrect results", "i" = "Please change the grouping structure of the input data so that @@ -716,7 +706,7 @@ full_date_seq <- function(x, before, after, time_step) { ) if (is.na(by)) { - Abort( + cli_abort( c( "`frollmean` requires a full window to compute a result, but `time_type` associated with the epi_df was not mappable to period diff --git a/man/autoplot.epi_df.Rd b/man/autoplot.epi_df.Rd index a87bc8ca..c97ea02f 100644 --- a/man/autoplot.epi_df.Rd +++ b/man/autoplot.epi_df.Rd @@ -56,10 +56,13 @@ autoplot(jhu_csse_daily_subset, case_rate_7d_av, .color_by = "none", .facet_by = "geo_value" ) -autoplot(jhu_csse_daily_subset, case_rate_7d_av, .color_by = "none", - .base_color = "red", .facet_by = "geo_value") +autoplot(jhu_csse_daily_subset, case_rate_7d_av, + .color_by = "none", + .base_color = "red", .facet_by = "geo_value" +) # .base_color specification won't have any effect due .color_by default autoplot(jhu_csse_daily_subset, case_rate_7d_av, - .base_color = "red", .facet_by = "geo_value") + .base_color = "red", .facet_by = "geo_value" +) } diff --git a/tests/testthat/test-epi_slide.R b/tests/testthat/test-epi_slide.R index 5429ed0c..ca3c233d 100644 --- a/tests/testthat/test-epi_slide.R +++ b/tests/testthat/test-epi_slide.R @@ -58,11 +58,11 @@ test_that("`before` and `after` are both vectors of length 1", { expect_error( epi_slide_mean(grouped, col_name = "value", before = c(0, 1), after = 0, ref_time_values = d + 3), - "`before`.*length-1" + "Assertion on 'before' failed: Must have length 1" ) expect_error( epi_slide_mean(grouped, col_name = "value", before = 1, after = c(0, 1), ref_time_values = d + 3), - "`after`.*length-1" + "Assertion on 'after' failed: Must have length 1" ) }) @@ -140,11 +140,11 @@ test_that("Both `before` and `after` must be non-NA, non-negative, integer-compa expect_error( epi_slide_mean(grouped, col_name = "value", before = -1L, ref_time_values = d + 2L), - "`before`.*non-negative" + "Assertion on 'before' failed: Element 1 is not >= 0" ) expect_error( epi_slide_mean(grouped, col_name = "value", before = 2L, after = -1L, ref_time_values = d + 2L), - "`after`.*non-negative" + "Assertion on 'after' failed: Element 1 is not >= 0" ) expect_error(epi_slide_mean(grouped, col_name = "value", before = "a", ref_time_values = d + 2L), regexp = "before", class = "vctrs_error_incompatible_type" @@ -160,11 +160,11 @@ test_that("Both `before` and `after` must be non-NA, non-negative, integer-compa ) expect_error( epi_slide_mean(grouped, col_name = "value", before = NA, after = 1L, ref_time_values = d + 2L), - "`before`.*non-NA" + "Assertion on 'before' failed: May not be NA" ) expect_error( epi_slide_mean(grouped, col_name = "value", before = 1L, after = NA, ref_time_values = d + 2L), - "`after`.*non-NA" + "Assertion on 'after' failed: May not be NA" ) # Non-integer-class but integer-compatible values are allowed: @@ -187,11 +187,11 @@ test_that("`ref_time_values` + `before` + `after` that result in no slide data, expect_error( epi_slide_mean(grouped, col_name = "value", before = 2L, ref_time_values = d), - "All `ref_time_values` must appear in `x\\$time_value`." + "`ref_time_values` must be a unique subset of the time values in `x`." ) # before the first, no data in the slide windows expect_error( epi_slide_mean(grouped, col_name = "value", before = 2L, ref_time_values = d + 207L), - "All `ref_time_values` must appear in `x\\$time_value`." + "`ref_time_values` must be a unique subset of the time values in `x`." 
) # beyond the last, no data in window }) @@ -207,11 +207,11 @@ test_that("`ref_time_values` + `before` + `after` that have some slide data, but expect_error( epi_slide_mean(grouped, "value", before = 0L, after = 2L, ref_time_values = d), - "All `ref_time_values` must appear in `x\\$time_value`." + "`ref_time_values` must be a unique subset of the time values in `x`." ) # before the first, but we'd expect there to be data in the window expect_error( epi_slide_mean(grouped, "value", before = 2L, ref_time_values = d + 201L), - "All `ref_time_values` must appear in `x\\$time_value`." + "`ref_time_values` must be a unique subset of the time values in `x`." ) # beyond the last, but still with data in window }) @@ -958,34 +958,34 @@ test_that("results for different `before`s and `after`s match between epi_slide test_time_type_mean(days, rand_vals, before = 0, after = 1) }) -set.seed(0) -rand_vals <- rnorm(n_obs) - -generate_special_date_data <- function(date_seq, ...) { - epiprocess::as_epi_df(rbind(tibble( - geo_value = "al", - time_value = date_seq, - a = 1:length(date_seq), - b = rand_vals - ), tibble( - geo_value = "ca", - time_value = date_seq, - a = length(date_seq):1, - b = rand_vals + 10 - ), tibble( - geo_value = "fl", - time_value = date_seq, - a = length(date_seq):1, - b = rand_vals * 2 - )), ...) -} - test_that("results for different time_types match between epi_slide and epi_slide_mean", { n <- 6L # Max date index m <- 1L # Number of missing dates n_obs <- n + 1L - m # Number of obs created k <- c(0L:(n-(m + 1L)), n) # Date indices + set.seed(0) + rand_vals <- rnorm(n_obs) + + generate_special_date_data <- function(date_seq, ...) { + epiprocess::as_epi_df(rbind(tibble( + geo_value = "al", + time_value = date_seq, + a = 1:length(date_seq), + b = rand_vals + ), tibble( + geo_value = "ca", + time_value = date_seq, + a = length(date_seq):1, + b = rand_vals + 10 + ), tibble( + geo_value = "fl", + time_value = date_seq, + a = length(date_seq):1, + b = rand_vals * 2 + )), ...) + } + # Basic time type days <- as.Date("2022-01-01") + k @@ -1095,6 +1095,28 @@ test_that("helper `full_date_seq` returns expected date values", { n_obs <- n + 1L - m # Number of obs created k <- c(0L:(n-(m + 1L)), n) # Date indices + set.seed(0) + rand_vals <- rnorm(n_obs) + + generate_special_date_data <- function(date_seq, ...) { + epiprocess::as_epi_df(rbind(tibble( + geo_value = "al", + time_value = date_seq, + a = 1:length(date_seq), + b = rand_vals + ), tibble( + geo_value = "ca", + time_value = date_seq, + a = length(date_seq):1, + b = rand_vals + 10 + ), tibble( + geo_value = "fl", + time_value = date_seq, + a = length(date_seq):1, + b = rand_vals * 2 + )), ...) 
+  }
+
   # Basic time type
   days <- as.Date("2022-01-01") + k

From 18f5e61b12f8f0fdf8fa6dc43fc43d76db2a05d9 Mon Sep 17 00:00:00 2001
From: Nat DeFries <42820733+nmdefries@users.noreply.github.com>
Date: Fri, 9 Feb 2024 17:13:57 -0500
Subject: [PATCH 164/345] test behavior of time_type week with no `time_step`

---
 tests/testthat/test-epi_slide.R | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/tests/testthat/test-epi_slide.R b/tests/testthat/test-epi_slide.R
index ca3c233d..35b9ebc1 100644
--- a/tests/testthat/test-epi_slide.R
+++ b/tests/testthat/test-epi_slide.R
@@ -1045,6 +1045,15 @@ test_that("results for different time_types match between epi_slide and epi_slid
   test_time_type_mean(day_times_minute, time_step = lubridate::minutes)
   test_time_type_mean(day_times_hour, time_step = lubridate::hours)
   test_time_type_mean(weeks, time_step = lubridate::weeks)
+
+  # `epi_slide_mean` can also handle `weeks` without `time_step` being
+  # provided, but `epi_slide` can't
+  epi_data <- generate_special_date_data(weeks) %>%
+    group_by(geo_value)
+  result2 <- epi_slide_mean(epi_data,
+    col_name = c("a", "b"), na.rm = TRUE,
+    before = 6L, after = 0L)
+  expect_identical(select(ref_result, -time_value), select(result2, -time_value))
 })

 test_that("special time_types without time_step fail in epi_slide_mean", {
@@ -1053,7 +1062,6 @@ test_that("special time_types without time_step fail in epi_slide_mean", {

   day_times_minute <- lubridate::ydm_h("2022-01-01-15") + lubridate::minutes(k) # needs time_step = lubridate::minutes
   day_times_hour <- lubridate::ydm_h("2022-01-01-15") + lubridate::hours(k) # needs time_step = lubridate::hours
-  weeks <- as.Date("2022-01-01") + 7 * k # needs time_step = lubridate::weeks

   # Not supported
   custom_dates <- c(
@@ -1082,11 +1090,6 @@ test_that("special time_types without time_step fail in epi_slide_mean", {
   test_time_type_mean(not_dates)
   test_time_type_mean(day_times_minute)
   test_time_type_mean(day_times_hour)
-  # Currently doesn't throw the expected error, and returns an incorrect
-  # result. This is because since the weekdates are stored as Dates ->
-  # guess_time_type thinks this is "day" type, and the default step size is 1
-  # day.
-  # test_time_type_mean(weeks)
 })

From 71f8072b4ab5646082263ba92032a37e2ec427b9 Mon Sep 17 00:00:00 2001
From: "Daniel J. 
McDonald" Date: Tue, 20 Feb 2024 11:33:08 -0800 Subject: [PATCH 165/345] fix: check, not assert --- R/autoplot.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/autoplot.R b/R/autoplot.R index 8686fb24..e0a10cbf 100644 --- a/R/autoplot.R +++ b/R/autoplot.R @@ -50,7 +50,7 @@ autoplot.epi_df <- function( .color_by <- match.arg(.color_by) .facet_by <- match.arg(.facet_by) - assert(anyInfinite(.max_facets), assert_int(.max_facets), combine = "or") + assert(anyInfinite(.max_facets), checkInt(.max_facets), combine = "or") assert_character(.base_color, len = 1) key_cols <- key_colnames(object) From 4d99dc51334b391ffea139b87a174ce9b147d02f Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Mon, 4 Mar 2024 15:46:53 -0800 Subject: [PATCH 166/345] repo: add renv .Rprofile to .gitignore --- .gitignore | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 6384e142..de393a31 100644 --- a/.gitignore +++ b/.gitignore @@ -7,4 +7,9 @@ inst/doc docs /doc/ /Meta/ -*.DS_Store \ No newline at end of file +*.DS_Store + +# Delphi custom +renv/ +renv.lock +.Rprofile From 724f6682c55ff2bb0434ab531bbe769c06f0b297 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Mon, 4 Mar 2024 17:18:41 -0800 Subject: [PATCH 167/345] fix: add missing import checkInt --- NAMESPACE | 1 + R/epiprocess.R | 2 +- man/autoplot.epi_df.Rd | 9 ++++++--- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index ef55f68c..03e0e41d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -79,6 +79,7 @@ importFrom(checkmate,assert_list) importFrom(checkmate,assert_logical) importFrom(checkmate,assert_numeric) importFrom(checkmate,assert_scalar) +importFrom(checkmate,checkInt) importFrom(checkmate,check_atomic) importFrom(checkmate,check_data_frame) importFrom(checkmate,test_set_equal) diff --git a/R/epiprocess.R b/R/epiprocess.R index 05737d58..e3918708 100644 --- a/R/epiprocess.R +++ b/R/epiprocess.R @@ -7,7 +7,7 @@ #' @importFrom checkmate assert assert_scalar assert_data_frame anyMissing #' assert_logical assert_list assert_character assert_class #' assert_int assert_numeric check_data_frame vname check_atomic -#' anyInfinite test_subset test_set_equal +#' anyInfinite test_subset test_set_equal checkInt #' @importFrom cli cli_abort cli_inform cli_warn #' @name epiprocess "_PACKAGE" diff --git a/man/autoplot.epi_df.Rd b/man/autoplot.epi_df.Rd index a87bc8ca..c97ea02f 100644 --- a/man/autoplot.epi_df.Rd +++ b/man/autoplot.epi_df.Rd @@ -56,10 +56,13 @@ autoplot(jhu_csse_daily_subset, case_rate_7d_av, .color_by = "none", .facet_by = "geo_value" ) -autoplot(jhu_csse_daily_subset, case_rate_7d_av, .color_by = "none", - .base_color = "red", .facet_by = "geo_value") +autoplot(jhu_csse_daily_subset, case_rate_7d_av, + .color_by = "none", + .base_color = "red", .facet_by = "geo_value" +) # .base_color specification won't have any effect due .color_by default autoplot(jhu_csse_daily_subset, case_rate_7d_av, - .base_color = "red", .facet_by = "geo_value") + .base_color = "red", .facet_by = "geo_value" +) } From 95982a53e3c0efb922ccfa9c791ef68aeb65dc49 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Mon, 4 Mar 2024 17:18:51 -0800 Subject: [PATCH 168/345] repo: we don't use bump2version anymore --- .bumpversion.cfg | 4 ---- 1 file changed, 4 deletions(-) delete mode 100644 .bumpversion.cfg diff --git a/.bumpversion.cfg b/.bumpversion.cfg deleted file mode 100644 index 7ad5a922..00000000 --- a/.bumpversion.cfg +++ /dev/null @@ -1,4 +0,0 @@ -[bumpversion] 
-current_version = 0.7.2
-
-[bumpversion:file:DESCRIPTION]

From 6283e0a2f5202563dc5b598916b6ecda7f9d636b Mon Sep 17 00:00:00 2001
From: Dmitry Shemetov
Date: Mon, 4 Mar 2024 17:19:04 -0800
Subject: [PATCH 169/345] repo: update .Rbuildignore

---
 .Rbuildignore | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.Rbuildignore b/.Rbuildignore
index 8ca62412..a28a0185 100644
--- a/.Rbuildignore
+++ b/.Rbuildignore
@@ -12,3 +12,6 @@
 ^pkgdown$
 ^doc$
 ^Meta$
+^.git-blame-ignore-revs$
+^.lintr$
+^DEVELOPMENT.md$
\ No newline at end of file

From 9885f704cea539857f6a8307c5056680c42adedf Mon Sep 17 00:00:00 2001
From: Dmitry Shemetov
Date: Fri, 16 Feb 2024 10:22:39 -0800
Subject: [PATCH 170/345] doc+ci+repo: fix style and release guide

* update with fixes from epidatr
* update pull request template
* bugfix pkgdown.yaml
* add workflow_dispatch buttons to most actions
* remove release_helper.yaml which we don't use
* update the DEVELOPMENT.md guide with a release checklist
* update pkgdown.yaml to match epidatr
---
 .github/pull_request_template.md      |  25 ++--
 .github/workflows/R-CMD-check.yaml    |   1 +
 .github/workflows/pkgdown.yaml        |  33 +++--
 .github/workflows/release-helper.yaml |  23 ----
 DEVELOPMENT.md                        |  60 +++++++---
 _pkgdown.yml                          | 166 ++++++++++++++------------
 6 files changed, 165 insertions(+), 143 deletions(-)
 delete mode 100644 .github/workflows/release-helper.yaml

diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
index 3afd83a6..eefea863 100644
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -2,20 +2,21 @@

 Please:

-- [ ] Make sure this PR is against "dev", not "main".
-- [ ] Request a review from one of the current epiprocess main reviewers:
-    brookslogan, nmdefries.
-- [ ] Make sure to bump the version number in `DESCRIPTION` and `NEWS.md`.
-    Always increment the patch version number (the third number), unless you are
-    making a release PR from dev to main, in which case increment the minor
-    version number (the second number).
-- [ ] Describe changes made in NEWS.md, making sure breaking changes
-    (backwards-incompatible changes to the documented interface) are noted.
-    Collect the changes under the next release number (e.g. if you are on
-    0.7.2, then write your changes under the 0.8 heading).
+- [ ] Make sure this PR is against "dev", not "main" (unless this is a release
+      PR).
+- [ ] Request a review from one of the current main reviewers:
+      brookslogan, nmdefries.
+- [ ] Make sure to bump the version number in `DESCRIPTION`. Always increment
+      the patch version number (the third number), unless you are making a
+      release PR from dev to main, in which case increment the minor version
+      number (the second number).
+- [ ] Describe changes made in NEWS.md, making sure breaking changes
+      (backwards-incompatible changes to the documented interface) are noted.
+      Collect the changes under the next release number (e.g. if you are on
+      1.7.2, then write your changes under the 1.8 heading).
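For a PR that touches the version number as the checklist above describes, the bump can also be scripted instead of edited by hand. A minimal sketch, assuming the `desc` package is available (it is not named anywhere in this template):

```r
# Hypothetical usage, run from the package root.
# Ordinary PR into dev: bump the patch version (the third number).
desc::desc_bump_version("patch")

# Release PR from dev to main: bump the minor version (the second number) instead.
# desc::desc_bump_version("minor")
```

The matching NEWS.md heading still has to be written by hand either way.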
### Change explanations for reviewer ### Magic GitHub syntax to mark associated Issue(s) as resolved when this is merged into the default branch -- Resolves #{issue number} +- Resolves #{issue number} diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index ecc1c082..4d0a2b03 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -7,6 +7,7 @@ on: branches: [main, dev] pull_request: branches: [main, dev] + workflow_dispatch: name: R-CMD-check diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index 9490acc7..47dd6ed6 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -1,12 +1,13 @@ # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help # -# Created with usethis + edited to run on PRs to dev, use API key. +# Edits from above workflow: also check pkgdown for PRs to `dev` branch, and +# update the documentation web site on pushes to `dev` branch. on: push: - branches: [dev, main] + branches: [main, dev] pull_request: - branches: [dev, main] + branches: [main, dev] release: types: [published] workflow_dispatch: @@ -15,7 +16,6 @@ name: pkgdown jobs: pkgdown: - # only build docs on the main repository and not forks if: github.repository_owner == 'cmu-delphi' runs-on: ubuntu-latest # Only restrict concurrency for non-PR jobs @@ -23,6 +23,7 @@ jobs: group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }} env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + DELPHI_EPIDATA_KEY: ${{ secrets.SECRET_EPIPROCESS_GHACTIONS_DELPHI_EPIDATA_KEY }} steps: - uses: actions/checkout@v3 @@ -34,19 +35,29 @@ jobs: - uses: r-lib/actions/setup-r-dependencies@v2 with: - extra-packages: any::pkgdown, local::. 
+ extra-packages: any::pkgdown, local::., any::cli needs: website - name: Build site - env: - DELPHI_EPIDATA_KEY: ${{ secrets.SECRET_EPIPROCESS_GHACTIONS_DELPHI_EPIDATA_KEY }} run: | - if (startsWith("${{ github.event_name }}", "pull_request")) { - mode <- ifelse("${{ github.base_ref }}" == "main", "release", "devel") + override <- if (startsWith("${{ github.event_name }}", "pull_request")) { + if ("${{ github.base_ref }}" == "main") { + list(development = list(mode = "release", version_label = "light")) + } else { + list(development = list(mode = "devel", version_label = "success")) + } } else { - mode <- ifelse("${{ github.ref_name }}" == "main", "release", "devel") + if ("${{ github.ref_name }}" == "main") { + list(development = list(mode = "release", version_label = "light")) + } else { + list(development = list(mode = "devel", version_label = "success")) + } } - pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE, override=list(PKGDOWN_DEV_MODE=mode)) + pkg <- pkgdown::as_pkgdown(".", override = override) + cli::cli_rule("Cleaning files from old site...") + pkgdown::clean_site(pkg) + pkgdown::build_site(pkg, preview = FALSE, install = FALSE, new_process = FALSE) + pkgdown:::build_github_pages(pkg) shell: Rscript {0} - name: Deploy to GitHub pages 🚀 diff --git a/.github/workflows/release-helper.yaml b/.github/workflows/release-helper.yaml deleted file mode 100644 index cc0bc6fa..00000000 --- a/.github/workflows/release-helper.yaml +++ /dev/null @@ -1,23 +0,0 @@ -name: Release Helper - -on: - push: - branches: - - main - -jobs: - sync_dev: - needs: correct_repository - runs-on: ubuntu-latest - steps: - - name: Check out code - uses: actions/checkout@v3 - with: - ref: dev - fetch-depth: 0 - ssh-key: ${{ secrets.CMU_DELPHI_DEPLOY_MACHINE_SSH }} - - name: Reset dev branch - run: | - git fetch origin main:main - git merge main - git push diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index c133099b..369b51c3 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -19,38 +19,62 @@ devtools::check() # check package for errors ## Developing the documentation site -The [documentation site](https://cmu-delphi.github.io/epidatr/) is built off of the `main` branch. The `dev` version of the site is available at https://cmu-delphi.github.io/epidatr/dev. +Our CI builds two version of the documentation: -The documentation site can be previewed locally by running in R +- https://cmu-delphi.github.io/epiprocess/ from the `main` branch and +- https://cmu-delphi.github.io/epiprocess/dev from the `dev` branch. + +The documentation site can be previewed locally by running in R: ```r +# Should automatically open a browser pkgdown::build_site(preview=TRUE) ``` -The `main` version is available at `file:////epidatr/docs/index.html` and `dev` at `file:////epidatr/docs/dev/index.html`. +If the above does not open a browser, you can try using a Python server from the +command line: -You can also build the docs manually and launch the site with python. From the terminal, this looks like ```bash R -e 'devtools::document()' +R -e 'pkgdown::build_site()' python -m http.server -d docs ``` ## Versioning -Please follow the guidelines in the PR template document (reproduced here): - -- [ ] Make sure this PR is against "dev", not "main". -- [ ] Request a review from one of the current epiprocess main reviewers: - brookslogan, nmdefries. -- [ ] Makes sure to bump the version number in `DESCRIPTION` and `NEWS.md`. 
- Always increment the patch version number (the third number), unless you are - making a release PR from dev to main, in which case increment the minor - version number (the second number). -- [ ] Describe changes made in NEWS.md, making sure breaking changes - (backwards-incompatible changes to the documented interface) are noted. - Collect the changes under the next release number (e.g. if you are on - 0.7.2, then write your changes under the 0.8 heading). +Please follow the guidelines in the [PR template document](.github/pull_request_template.md). ## Release process -TBD +Open a release issue and then copy and follow this checklist in the issue (modified from the checklist generated by `usethis::use_release_issue(version = "1.0.2")`): + +- [ ] `git pull` +- [ ] Check [current CRAN check results](https://cran.rstudio.org/web/checks/check_results_epiprocess.html) +- [ ] `devtools::check(".", manual = TRUE, env_vars = c(NOT_CRAN = "false"))`. + - Aim for 10/10, no notes. +- [ ] If check works well enough, merge to main. Otherwise open a PR to fix up. +- [ ] [Polish NEWS](https://github.com/cmu-delphi/epiprocess/blob/dev/NEWS.md). + - Some [guidelines](https://style.tidyverse.org/news.html#news-release). +- [ ] `git checkout main` +- [ ] `git pull` +- [ ] `urlchecker::url_check()`. + - This may choke on the MIT license url, and that's ok. +- [ ] `devtools::build_readme()` +- [ ] `devtools::check_win_devel()` +- [ ] Check email for problems +- [ ] `revdepcheck::revdep_check(num_workers = 4)`. + - This may choke, it is very sensitive to the binary versions of packages on a given system. Either bypass or ask someone else to run it if you're concerned. +- [ ] Update `cran-comments.md` +- [ ] PR with any changes + +Submit to CRAN: + +- [ ] `devtools::submit_cran()` +- [ ] Approve email + +Wait for CRAN... + +- [ ] Accepted :tada: +- [ ] `dev` +- [ ] `usethis::use_github_release(publish = FALSE)` (publish off, otherwise it won't push). +- [ ] check the release notes and publish the branch on github diff --git a/_pkgdown.yml b/_pkgdown.yml index 08fde0ce..146f71b0 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -1,92 +1,100 @@ -template: - bootstrap: 5 +# Colors should stay consistent across epipredict, epiprocess, and epidatr, +# using Carnegie Red +# https://www.cmu.edu/brand/brand-guidelines/visual-identity/colors.html + +# This is to give a default value to the `mode` parameter in the +# `pkgdown::build_site` function. This is useful when building the site locally, +# as it will default to `devel` mode. In practice, this should all be handled +# dynamically by the CI/CD pipeline. 
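To illustrate how this default interacts with the CI override in the workflow above, here is a sketch of building the site locally in release mode; the `override` argument of `as_pkgdown()` is the same mechanism the workflow uses:

```r
# Override the devel default declared below and preview a release-mode build
pkg <- pkgdown::as_pkgdown(
  ".",
  override = list(development = list(mode = "release", version_label = "light"))
)
pkgdown::build_site(pkg, preview = TRUE)
```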
+development: + mode: devel + version_label: success +template: + bootstrap: 5 + bootswatch: cosmo + bslib: + font_scale: 1.0 + primary: "#C41230" + link-color: "#C41230" -# Colors from epipredict & epidatr, including Carnegie Red https://www.cmu.edu/brand/brand-guidelines/visual-identity/colors.html navbar: - bg: "#C41230" - fg: "#f8f8f8" - bootswatch: cosmo - bslib: - font_scale: 1.0 - primary: "#C41230" - link-color: "#C41230" - navbar-bg: "#C41230" - navbar-fg: "#f8f8f8" + bg: primary + type: dark url: https://cmu-delphi.github.io/epiprocess/ home: - links: - - text: Introduction to Delphi's Tooling Work - href: https://cmu-delphi.github.io/delphi-tooling-book/ - - text: Get the epipredict R package - href: https://cmu-delphi.github.io/epipredict/ - - text: Get the epidatr R package - href: https://github.com/cmu-delphi/epidatr - - text: Get the epidatasets R package - href: https://cmu-delphi.github.io/epidatasets/ + links: + - text: Introduction to Delphi's Tooling Work + href: https://cmu-delphi.github.io/delphi-tooling-book/ + - text: Get the epipredict R package + href: https://cmu-delphi.github.io/epipredict/ + - text: Get the epidatr R package + href: https://github.com/cmu-delphi/epidatr + - text: Get the epidatasets R package + href: https://cmu-delphi.github.io/epidatasets/ articles: - - title: Using the package - desc: Basic usage and examples. - navbar: ~ - contents: - - epiprocess - - slide - - growth_rate - - correlation - - aggregation - - outliers - - archive - - advanced - - compactify + - title: Using the package + desc: Basic usage and examples. + navbar: ~ + contents: + - epiprocess + - slide + - growth_rate + - correlation + - aggregation + - outliers + - archive + - advanced + - compactify repo: - url: - home: https://github.com/cmu-delphi/epiprocess/tree/main/ - source: https://github.com/cmu-delphi/epiprocess/blob/main/ - issue: https://github.com/cmu-delphi/epiprocess/issues - user: https://github.com/ + url: + home: https://github.com/cmu-delphi/epiprocess/tree/main/ + source: https://github.com/cmu-delphi/epiprocess/blob/main/ + issue: https://github.com/cmu-delphi/epiprocess/issues + user: https://github.com/ reference: - - title: "`epi_df` basics" - desc: Details on `epi_df` format, and basic functionality. - - contents: - - matches("epi_df") - - title: "`epi_*()` functions" - desc: Functions that act on `epi_df` objects. - - contents: - - epi_slide - - epi_cor - - title: Vector functions - desc: Functions that act directly on signal variables. - - contents: - - growth_rate - - detect_outlr - - detect_outlr_rm - - detect_outlr_stl - - title: "`epi_archive` basics" - desc: Details on `epi_archive`, and basic functionality. - - contents: - - matches("archive") - - title: "`epix_*()` functions" - desc: Functions that act on an `epi_archive` and/or `grouped_epi_archive` object. - - contents: - - starts_with("epix") - - group_by.epi_archive - - title: Example data - - contents: - - archive_cases_dv_subset - - incidence_num_outlier_example - - contains("jhu_csse") - - title: Basic automatic plotting - - contents: - - autoplot.epi_df - - title: internal - - contents: - - epiprocess - - max_version_with_row_in - - next_after - - guess_period - - key_colnames + - title: "`epi_df` basics" + desc: Details on `epi_df` format, and basic functionality. + - contents: + - matches("epi_df") + - title: "`epi_*()` functions" + desc: Functions that act on `epi_df` objects. 
+ - contents: + - epi_slide + - epi_cor + - title: Vector functions + desc: Functions that act directly on signal variables. + - contents: + - growth_rate + - detect_outlr + - detect_outlr_rm + - detect_outlr_stl + - title: "`epi_archive` basics" + desc: Details on `epi_archive`, and basic functionality. + - contents: + - matches("archive") + - title: "`epix_*()` functions" + desc: Functions that act on an `epi_archive` and/or `grouped_epi_archive` object. + - contents: + - starts_with("epix") + - group_by.epi_archive + - title: Example data + - contents: + - archive_cases_dv_subset + - incidence_num_outlier_example + - contains("jhu_csse") + - title: Basic automatic plotting + - contents: + - autoplot.epi_df + - title: internal + - contents: + - epiprocess + - max_version_with_row_in + - next_after + - guess_period + - key_colnames From 9982a49eff2470dab4aee2c307e8c3f5de9a7527 Mon Sep 17 00:00:00 2001 From: nmdefries <42820733+nmdefries@users.noreply.github.com> Date: Tue, 5 Mar 2024 19:11:42 -0500 Subject: [PATCH 171/345] Clarify doc names Co-authored-by: brookslogan --- R/slide.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/slide.R b/R/slide.R index ed95d0c1..dc379e4c 100644 --- a/R/slide.R +++ b/R/slide.R @@ -367,7 +367,7 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' Optimized slide function for performing rolling averages on an `epi_df` object #' -#' Slides a n-timestep mean over variables in an `epi_df` object. See the [slide +#' Slides an n-timestep mean over variables in an `epi_df` object. See the [slide #' vignette](https://cmu-delphi.github.io/epiprocess/articles/slide.html) for #' examples. #' @@ -499,7 +499,7 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' # slide a 14-day centre-aligned average #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide_mean("cases", new_col_name = "cases_7dav", names_sep = NULL, before = 6, after = 7) %>% +#' epi_slide_mean("cases", new_col_name = "cases_14dav", names_sep = NULL, before = 6, after = 7) %>% #' # rmv a nonessential var. to ensure new col is printed #' dplyr::select(-death_rate_7d_av) epi_slide_mean = function(x, col_name, ..., before, after, ref_time_values, From ef332d3c7aa2704d04e4af3979407402c3ae484a Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Tue, 5 Mar 2024 20:12:14 -0500 Subject: [PATCH 172/345] bullet formatting in docs --- R/slide.R | 44 +++++++++++++++++++++++-------------------- man/epi_slide.Rd | 21 +++++++++++++-------- man/epi_slide_mean.Rd | 25 ++++++++++++++---------- 3 files changed, 52 insertions(+), 38 deletions(-) diff --git a/R/slide.R b/R/slide.R index dc379e4c..1804045c 100644 --- a/R/slide.R +++ b/R/slide.R @@ -32,16 +32,18 @@ #' provided; the other's default will be 0. Any value provided for either #' argument must be a single, non-`NA`, non-negative, #' [integer-compatible][vctrs::vec_cast] number of time steps. Endpoints of -#' the window are inclusive. Common settings: * For trailing/right-aligned -#' windows from `ref_time_value - time_step(k)` to `ref_time_value`: either -#' pass `before=k` by itself, or pass `before=k, after=0`. * For -#' center-aligned windows from `ref_time_value - time_step(k)` to -#' `ref_time_value + time_step(k)`: pass `before=k, after=k`. * For -#' leading/left-aligned windows from `ref_time_value` to `ref_time_value + -#' time_step(k)`: either pass pass `after=k` by itself, or pass `before=0, -#' after=k`. 
See "Details:" about the definition of a time step, -#' (non)treatment of missing rows within the window, and avoiding warnings -#' about `before`&`after` settings for a certain uncommon use case. +#' the window are inclusive. Common settings: +#' * For trailing/right-aligned windows from `ref_time_value - time_step +#' (k)` to `ref_time_value`: either pass `before=k` by itself, or pass +#' `before=k, after=0`. +#' * For center-aligned windows from `ref_time_value - time_step(k)` to +#' `ref_time_value + time_step(k)`: pass `before=k, after=k`. +#' * For leading/left-aligned windows from `ref_time_value` to +#' `ref_time_value + time_step(k)`: either pass pass `after=k` by itself, +#' or pass `before=0, after=k`. +#' See "Details:" about the definition of a time step,(non)treatment of +#' missing rows within the window, and avoiding warnings about +#' `before`&`after` settings for a certain uncommon use case. #' @param ref_time_values Time values for sliding computations, meaning, each #' element of this vector serves as the reference time point for one sliding #' window. If missing, then this will be set to all unique time values in the @@ -385,16 +387,18 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' provided; the other's default will be 0. Any value provided for either #' argument must be a single, non-`NA`, non-negative, #' [integer-compatible][vctrs::vec_cast] number of time steps. Endpoints of -#' the window are inclusive. Common settings: * For trailing/right-aligned -#' windows from `ref_time_value - time_step(k)` to `ref_time_value`: either -#' pass `before=k` by itself, or pass `before=k, after=0`. * For -#' center-aligned windows from `ref_time_value - time_step(k)` to -#' `ref_time_value + time_step(k)`: pass `before=k, after=k`. * For -#' leading/left-aligned windows from `ref_time_value` to `ref_time_value + -#' time_step(k)`: either pass pass `after=k` by itself, or pass `before=0, -#' after=k`. See "Details:" about the definition of a time step, -#' (non)treatment of missing rows within the window, and avoiding warnings -#' about `before`&`after` settings for a certain uncommon use case. +#' the window are inclusive. Common settings: +#' * For trailing/right-aligned windows from `ref_time_value - time_step +#' (k)` to `ref_time_value`: either pass `before=k` by itself, or pass +#' `before=k, after=0`. +#' * For center-aligned windows from `ref_time_value - time_step(k)` to +#' `ref_time_value + time_step(k)`: pass `before=k, after=k`. +#' * For leading/left-aligned windows from `ref_time_value` to +#' `ref_time_value + time_step(k)`: either pass pass `after=k` by itself, +#' or pass `before=0, after=k`. +#' See "Details:" about the definition of a time step,(non)treatment of +#' missing rows within the window, and avoiding warnings about +#' `before`&`after` settings for a certain uncommon use case. #' @param ref_time_values Time values for sliding computations, meaning, each #' element of this vector serves as the reference time point for one sliding #' window. If missing, then this will be set to all unique time values in the diff --git a/man/epi_slide.Rd b/man/epi_slide.Rd index 05c4d9ad..bdb36506 100644 --- a/man/epi_slide.Rd +++ b/man/epi_slide.Rd @@ -50,14 +50,19 @@ the sliding window extend? At least one of these two arguments must be provided; the other's default will be 0. Any value provided for either argument must be a single, non-\code{NA}, non-negative, \link[vctrs:vec_cast]{integer-compatible} number of time steps. 
Endpoints of -the window are inclusive. Common settings: * For trailing/right-aligned -windows from \code{ref_time_value - time_step(k)} to \code{ref_time_value}: either -pass \code{before=k} by itself, or pass \verb{before=k, after=0}. * For -center-aligned windows from \code{ref_time_value - time_step(k)} to -\code{ref_time_value + time_step(k)}: pass \verb{before=k, after=k}. * For -leading/left-aligned windows from \code{ref_time_value} to \code{ref_time_value + time_step(k)}: either pass pass \code{after=k} by itself, or pass \verb{before=0, after=k}. See "Details:" about the definition of a time step, -(non)treatment of missing rows within the window, and avoiding warnings -about \code{before}&\code{after} settings for a certain uncommon use case.} +the window are inclusive. Common settings: +\itemize{ +\item For trailing/right-aligned windows from \code{ref_time_value - time_step (k)} to \code{ref_time_value}: either pass \code{before=k} by itself, or pass +\verb{before=k, after=0}. +\item For center-aligned windows from \code{ref_time_value - time_step(k)} to +\code{ref_time_value + time_step(k)}: pass \verb{before=k, after=k}. +\item For leading/left-aligned windows from \code{ref_time_value} to +\code{ref_time_value + time_step(k)}: either pass pass \code{after=k} by itself, +or pass \verb{before=0, after=k}. +See "Details:" about the definition of a time step,(non)treatment of +missing rows within the window, and avoiding warnings about +\code{before}&\code{after} settings for a certain uncommon use case. +}} \item{ref_time_values}{Time values for sliding computations, meaning, each element of this vector serves as the reference time point for one sliding diff --git a/man/epi_slide_mean.Rd b/man/epi_slide_mean.Rd index e0906eb2..9dfe7767 100644 --- a/man/epi_slide_mean.Rd +++ b/man/epi_slide_mean.Rd @@ -36,14 +36,19 @@ the sliding window extend? At least one of these two arguments must be provided; the other's default will be 0. Any value provided for either argument must be a single, non-\code{NA}, non-negative, \link[vctrs:vec_cast]{integer-compatible} number of time steps. Endpoints of -the window are inclusive. Common settings: * For trailing/right-aligned -windows from \code{ref_time_value - time_step(k)} to \code{ref_time_value}: either -pass \code{before=k} by itself, or pass \verb{before=k, after=0}. * For -center-aligned windows from \code{ref_time_value - time_step(k)} to -\code{ref_time_value + time_step(k)}: pass \verb{before=k, after=k}. * For -leading/left-aligned windows from \code{ref_time_value} to \code{ref_time_value + time_step(k)}: either pass pass \code{after=k} by itself, or pass \verb{before=0, after=k}. See "Details:" about the definition of a time step, -(non)treatment of missing rows within the window, and avoiding warnings -about \code{before}&\code{after} settings for a certain uncommon use case.} +the window are inclusive. Common settings: +\itemize{ +\item For trailing/right-aligned windows from \code{ref_time_value - time_step (k)} to \code{ref_time_value}: either pass \code{before=k} by itself, or pass +\verb{before=k, after=0}. +\item For center-aligned windows from \code{ref_time_value - time_step(k)} to +\code{ref_time_value + time_step(k)}: pass \verb{before=k, after=k}. +\item For leading/left-aligned windows from \code{ref_time_value} to +\code{ref_time_value + time_step(k)}: either pass pass \code{after=k} by itself, +or pass \verb{before=0, after=k}. 
+See "Details:" about the definition of a time step,(non)treatment of +missing rows within the window, and avoiding warnings about +\code{before}&\code{after} settings for a certain uncommon use case. +}} \item{ref_time_values}{Time values for sliding computations, meaning, each element of this vector serves as the reference time point for one sliding @@ -88,7 +93,7 @@ An \code{epi_df} object given by appending a new column to \code{x}, named according to the \code{new_col_name} argument. } \description{ -Slides a n-timestep mean over variables in an \code{epi_df} object. See the \href{https://cmu-delphi.github.io/epiprocess/articles/slide.html}{slide vignette} for +Slides an n-timestep mean over variables in an \code{epi_df} object. See the \href{https://cmu-delphi.github.io/epiprocess/articles/slide.html}{slide vignette} for examples. } \details{ @@ -154,7 +159,7 @@ jhu_csse_daily_subset \%>\% # slide a 14-day centre-aligned average jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide_mean("cases", new_col_name = "cases_7dav", names_sep = NULL, before = 6, after = 7) \%>\% + epi_slide_mean("cases", new_col_name = "cases_14dav", names_sep = NULL, before = 6, after = 7) \%>\% # rmv a nonessential var. to ensure new col is printed dplyr::select(-death_rate_7d_av) } From 6865039be02cf51e71b478d529df5b76b5a848cc Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Tue, 5 Mar 2024 20:29:26 -0500 Subject: [PATCH 173/345] pluralize col_name args --- R/slide.R | 63 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 32 insertions(+), 31 deletions(-) diff --git a/R/slide.R b/R/slide.R index 1804045c..a23821dc 100644 --- a/R/slide.R +++ b/R/slide.R @@ -376,7 +376,7 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' @param x The `epi_df` object under consideration, [grouped][dplyr::group_by] #' or ungrouped. If ungrouped, all data in `x` will be treated as part of a #' single data group. -#' @param col_name A character vector of the names of one or more columns for +#' @param col_names A character vector of the names of one or more columns for #' which to calculate the rolling mean. #' @param ... Additional arguments to pass to `data.table::frollmean`, for #' example, `na.rm` and `algo`. `data.table::frollmean` is automatically @@ -409,18 +409,18 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' return an object of class `lubridate::period`. For example, we can use #' `time_step = lubridate::hours` in order to set the time step to be one hour #' (this would only be meaningful if `time_value` is of class `POSIXct`). -#' @param new_col_name String indicating the name of the new column that will +#' @param new_col_names String indicating the name of the new column that will #' contain the derivative values. Default is "slide_value"; note that setting -#' `new_col_name` equal to an existing column name will overwrite this column. +#' `new_col_names` equal to an existing column name will overwrite this column. #' @param as_list_col Should the slide results be held in a list column, or be #' [unchopped][tidyr::unchop]/[unnested][tidyr::unnest]? 
Default is `FALSE`, #' in which case a list object returned by `f` would be unnested (using #' [`tidyr::unnest()`]), and, if the slide computations output data frames, -#' the names of the resulting columns are given by prepending `new_col_name` +#' the names of the resulting columns are given by prepending `new_col_names` #' to the names of the list elements. #' @param names_sep String specifying the separator to use in `tidyr::unnest()` #' when `as_list_col = FALSE`. Default is "_". Using `NULL` drops the prefix -#' from `new_col_name` entirely. +#' from `new_col_names` entirely. #' @param all_rows If `all_rows = TRUE`, then all rows of `x` will be kept in #' the output even with `ref_time_values` provided, with some type of missing #' value marker for the slide computation output column(s) for `time_value`s @@ -431,8 +431,9 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' the missing marker is a `NULL` entry in the list column; for certain #' operations, you might want to replace these `NULL` entries with a different #' `NA` marker. -#' @return An `epi_df` object given by appending a new column to `x`, named -#' according to the `new_col_name` argument. +#' @return An `epi_df` object given by appending one or more new columns to +#' `x`, depending on the `col_names` argument, named according to the +#' `new_col_names` argument. #' #' @details To "slide" means to apply a function or formula over a rolling #' window of time steps for each data group, where the window is entered at a @@ -474,7 +475,7 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' # slide a 7-day trailing average formula on cases #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide_mean("cases", new_col_name = "cases_7dav", names_sep = NULL, before = 6) %>% +#' epi_slide_mean("cases", new_col_names = "cases_7dav", names_sep = NULL, before = 6) %>% #' # rmv a nonessential var. to ensure new col is printed #' dplyr::select(-death_rate_7d_av) #' @@ -482,33 +483,33 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' # and accuracy, and to allow partially-missing windows. #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide_mean("cases", new_col_name = "cases_7dav", names_sep = NULL, before = 6, +#' epi_slide_mean("cases", new_col_names = "cases_7dav", names_sep = NULL, before = 6, #' na.rm = TRUE, algo = "exact", hasNA = TRUE) %>% #' dplyr::select(-death_rate_7d_av) #' #' # slide a 7-day leading average #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide_mean("cases", new_col_name = "cases_7dav", names_sep = NULL, after = 6) %>% +#' epi_slide_mean("cases", new_col_names = "cases_7dav", names_sep = NULL, after = 6) %>% #' # rmv a nonessential var. to ensure new col is printed #' dplyr::select(-death_rate_7d_av) #' #' # slide a 7-day centre-aligned average #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide_mean("cases", new_col_name = "cases_7dav", names_sep = NULL, before = 3, after = 3) %>% +#' epi_slide_mean("cases", new_col_names = "cases_7dav", names_sep = NULL, before = 3, after = 3) %>% #' # rmv a nonessential var. 
to ensure new col is printed #' dplyr::select(-death_rate_7d_av) #' #' # slide a 14-day centre-aligned average #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide_mean("cases", new_col_name = "cases_14dav", names_sep = NULL, before = 6, after = 7) %>% +#' epi_slide_mean("cases", new_col_names = "cases_14dav", names_sep = NULL, before = 6, after = 7) %>% #' # rmv a nonessential var. to ensure new col is printed #' dplyr::select(-death_rate_7d_av) -epi_slide_mean = function(x, col_name, ..., before, after, ref_time_values, +epi_slide_mean = function(x, col_names, ..., before, after, ref_time_values, time_step, - new_col_name = "slide_value", as_list_col = FALSE, + new_col_names = "slide_value", as_list_col = FALSE, names_sep = "_", all_rows = FALSE) { assert_class(x, "epi_df") @@ -568,25 +569,25 @@ epi_slide_mean = function(x, col_name, ..., before, after, ref_time_values, m <- before + after + 1L if (is.null(names_sep) && !as_list_col) { - if (length(new_col_name) != length(col_name)) { + if (length(new_col_names) != length(col_names)) { cli_abort( - "`new_col_name` must be the same length as `col_name` when `names_sep` is NULL.", - class = "epiprocess__epi_slide_mean__col_name_length_mismatch", - epiprocess__new_col_name = new_col_name, - epiprocess__col_name = col_name + "`new_col_names` must be the same length as `col_names` when `names_sep` is NULL.", + class = "epiprocess__epi_slide_mean__col_names_length_mismatch", + epiprocess__new_col_names = new_col_names, + epiprocess__col_names = col_names ) } - result_col_name <- new_col_name + result_col_names <- new_col_names } else { - if (length(new_col_name) != 1L && length(new_col_name) != length(col_name)) { + if (length(new_col_names) != 1L && length(new_col_names) != length(col_names)) { cli_abort( - "`new_col_name` must be either length 1 or the same length as `col_name`.", - class = "epiprocess__epi_slide_mean__col_name_length_mismatch_and_not_one", - epiprocess__new_col_name = new_col_name, - epiprocess__col_name = col_name + "`new_col_names` must be either length 1 or the same length as `col_names`.", + class = "epiprocess__epi_slide_mean__col_names_length_mismatch_and_not_one", + epiprocess__new_col_names = new_col_names, + epiprocess__col_names = col_names ) } - result_col_name <- paste(new_col_name, col_name, sep = names_sep) + result_col_names <- paste(new_col_names, col_names, sep = names_sep) } slide_one_grp <- function(.data_group, .group_key, ...) { @@ -621,19 +622,19 @@ epi_slide_mean = function(x, col_name, ..., before, after, ref_time_values, } roll_output <- data.table::frollmean( - x = .data_group[, col_name], n = m, align = "right", ... + x = .data_group[, col_names], n = m, align = "right", ... ) if (after >= 1) { # Right-aligned `frollmean` results' `ref_time_value`s will be `after` # timesteps ahead of where they should be. Shift results to the left by # `after` timesteps. 
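To make the realignment concrete, a toy sketch (illustration only, not part of the patch) of the shift the lines below perform:

```r
# With after = 2, right-aligned rolling means c(1, 2, 3, 4, 5) are shifted
# left by `after` steps and padded with NA, so each mean lines up with its
# own ref_time_value:
after <- 2L
x <- c(1, 2, 3, 4, 5)
c(x[(after + 1L):length(x)], rep(NA, after))
#> [1]  3  4  5 NA NA
```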
- .data_group[, result_col_name] <- purrr::map(roll_output, function(.x) { + .data_group[, result_col_names] <- purrr::map(roll_output, function(.x) { c(.x[(after + 1L):length(.x)], rep(NA, after)) } ) } else { - .data_group[, result_col_name] <- roll_output + .data_group[, result_col_names] <- roll_output } return(.data_group) @@ -646,13 +647,13 @@ epi_slide_mean = function(x, col_name, ..., before, after, ref_time_values, result$.real <- NULL if (all_rows) { - result[!(result$time_value %in% ref_time_values), result_col_name] <- NA + result[!(result$time_value %in% ref_time_values), result_col_names] <- NA } else if (user_provided_rtvs) { result <- result[result$time_value %in% ref_time_values, ] } if (as_list_col) { - result[, result_col_name] <- purrr::map(result_col_name, + result[, result_col_names] <- purrr::map(result_col_names, function(.x) { tmp <- result[[.x]] tmp[is.na(tmp)] <- list(NULL) From 12c32f0276c842055bd9a0a47b7ab90920b87071 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Tue, 5 Mar 2024 20:32:04 -0500 Subject: [PATCH 174/345] rename missing dates to times --- R/slide.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/slide.R b/R/slide.R index a23821dc..4a52438a 100644 --- a/R/slide.R +++ b/R/slide.R @@ -591,14 +591,14 @@ epi_slide_mean = function(x, col_names, ..., before, after, ref_time_values, } slide_one_grp <- function(.data_group, .group_key, ...) { - missing_dates <- all_dates[!(all_dates %in% .data_group$time_value)] + missing_times <- all_dates[!(all_dates %in% .data_group$time_value)] # `frollmean` requires a full window to compute a result. Add NA values # to beginning and end of the group so that we get results for the # first `before` and last `after` elements. .data_group <- bind_rows( .data_group, - tibble(time_value = c(missing_dates, pad_early_dates, pad_late_dates), .real = FALSE) + tibble(time_value = c(missing_times, pad_early_dates, pad_late_dates), .real = FALSE) ) %>% arrange(time_value) From fe81d2f905134f4bd42094c0c20f78c37ee2475f Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 6 Mar 2024 13:26:34 -0500 Subject: [PATCH 175/345] rename col_name -> names in tests --- tests/testthat/test-epi_slide.R | 46 ++++++++++++++++----------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/tests/testthat/test-epi_slide.R b/tests/testthat/test-epi_slide.R index 35b9ebc1..68b2103a 100644 --- a/tests/testthat/test-epi_slide.R +++ b/tests/testthat/test-epi_slide.R @@ -57,11 +57,11 @@ test_that("`before` and `after` are both vectors of length 1", { ) expect_error( - epi_slide_mean(grouped, col_name = "value", before = c(0, 1), after = 0, ref_time_values = d + 3), + epi_slide_mean(grouped, col_names = "value", before = c(0, 1), after = 0, ref_time_values = d + 3), "Assertion on 'before' failed: Must have length 1" ) expect_error( - epi_slide_mean(grouped, col_name = "value", before = 1, after = c(0, 1), ref_time_values = d + 3), + epi_slide_mean(grouped, col_names = "value", before = 1, after = c(0, 1), ref_time_values = d + 3), "Assertion on 'after' failed: Must have length 1" ) }) @@ -81,15 +81,15 @@ test_that("Test errors/warnings for discouraged features", { ) expect_error( - epi_slide_mean(grouped, col_name = "value", ref_time_values = d + 1), + epi_slide_mean(grouped, col_names = "value", ref_time_values = d + 1), "Either or both of `before`, `after` must be provided." 
) expect_warning( - epi_slide_mean(grouped, col_name = "value", before = 0L, ref_time_values = d + 1), + epi_slide_mean(grouped, col_names = "value", before = 0L, ref_time_values = d + 1), "`before==0`, `after` missing" ) expect_warning( - epi_slide_mean(grouped, col_name = "value", after = 0L, ref_time_values = d + 1), + epi_slide_mean(grouped, col_names = "value", after = 0L, ref_time_values = d + 1), "`before` missing, `after==0`" ) @@ -98,9 +98,9 @@ test_that("Test errors/warnings for discouraged features", { expect_no_warning(ref2 <- epi_slide(grouped, f, after = 1L, ref_time_values = d + 2)) expect_no_warning(ref3 <- epi_slide(grouped, f, before = 0L, after = 0L, ref_time_values = d + 2)) - expect_no_warning(opt1 <- epi_slide_mean(grouped, col_name = "value", before = 1L, ref_time_values = d + 2, na.rm = TRUE)) - expect_no_warning(opt2 <- epi_slide_mean(grouped, col_name = "value", after = 1L, ref_time_values = d + 2, na.rm = TRUE)) - expect_no_warning(opt3 <- epi_slide_mean(grouped, col_name = "value", before = 0L, after = 0L, ref_time_values = d + 2, na.rm = TRUE)) + expect_no_warning(opt1 <- epi_slide_mean(grouped, col_names = "value", before = 1L, ref_time_values = d + 2, na.rm = TRUE)) + expect_no_warning(opt2 <- epi_slide_mean(grouped, col_names = "value", after = 1L, ref_time_values = d + 2, na.rm = TRUE)) + expect_no_warning(opt3 <- epi_slide_mean(grouped, col_names = "value", before = 0L, after = 0L, ref_time_values = d + 2, na.rm = TRUE)) # Results from epi_slide and epi_slide_mean should match expect_identical(select(ref1, -slide_value_count), opt1) @@ -139,37 +139,37 @@ test_that("Both `before` and `after` must be non-NA, non-negative, integer-compa ) expect_error( - epi_slide_mean(grouped, col_name = "value", before = -1L, ref_time_values = d + 2L), + epi_slide_mean(grouped, col_names = "value", before = -1L, ref_time_values = d + 2L), "Assertion on 'before' failed: Element 1 is not >= 0" ) expect_error( - epi_slide_mean(grouped, col_name = "value", before = 2L, after = -1L, ref_time_values = d + 2L), + epi_slide_mean(grouped, col_names = "value", before = 2L, after = -1L, ref_time_values = d + 2L), "Assertion on 'after' failed: Element 1 is not >= 0" ) - expect_error(epi_slide_mean(grouped, col_name = "value", before = "a", ref_time_values = d + 2L), + expect_error(epi_slide_mean(grouped, col_names = "value", before = "a", ref_time_values = d + 2L), regexp = "before", class = "vctrs_error_incompatible_type" ) - expect_error(epi_slide_mean(grouped, col_name = "value", before = 1L, after = "a", ref_time_values = d + 2L), + expect_error(epi_slide_mean(grouped, col_names = "value", before = 1L, after = "a", ref_time_values = d + 2L), regexp = "after", class = "vctrs_error_incompatible_type" ) - expect_error(epi_slide_mean(grouped, col_name = "value", before = 0.5, ref_time_values = d + 2L), + expect_error(epi_slide_mean(grouped, col_names = "value", before = 0.5, ref_time_values = d + 2L), regexp = "before", class = "vctrs_error_incompatible_type" ) - expect_error(epi_slide_mean(grouped, col_name = "value", before = 1L, after = 0.5, ref_time_values = d + 2L), + expect_error(epi_slide_mean(grouped, col_names = "value", before = 1L, after = 0.5, ref_time_values = d + 2L), regexp = "after", class = "vctrs_error_incompatible_type" ) expect_error( - epi_slide_mean(grouped, col_name = "value", before = NA, after = 1L, ref_time_values = d + 2L), + epi_slide_mean(grouped, col_names = "value", before = NA, after = 1L, ref_time_values = d + 2L), "Assertion on 'before' failed: May not 
be NA" ) expect_error( - epi_slide_mean(grouped, col_name = "value", before = 1L, after = NA, ref_time_values = d + 2L), + epi_slide_mean(grouped, col_names = "value", before = 1L, after = NA, ref_time_values = d + 2L), "Assertion on 'after' failed: May not be NA" ) # Non-integer-class but integer-compatible values are allowed: expect_no_error(ref <- epi_slide(grouped, f, before = 1, after = 1, ref_time_values = d + 2L)) - expect_no_error(opt <- epi_slide_mean(grouped, col_name = "value", before = 1, after = 1, ref_time_values = d + 2L, na.rm = TRUE)) + expect_no_error(opt <- epi_slide_mean(grouped, col_names = "value", before = 1, after = 1, ref_time_values = d + 2L, na.rm = TRUE)) # Results from epi_slide and epi_slide_mean should match expect_identical(select(ref, -slide_value_count), opt) @@ -186,11 +186,11 @@ test_that("`ref_time_values` + `before` + `after` that result in no slide data, ) # beyond the last, no data in window expect_error( - epi_slide_mean(grouped, col_name = "value", before = 2L, ref_time_values = d), + epi_slide_mean(grouped, col_names = "value", before = 2L, ref_time_values = d), "`ref_time_values` must be a unique subset of the time values in `x`." ) # before the first, no data in the slide windows expect_error( - epi_slide_mean(grouped, col_name = "value", before = 2L, ref_time_values = d + 207L), + epi_slide_mean(grouped, col_names = "value", before = 2L, ref_time_values = d + 207L), "`ref_time_values` must be a unique subset of the time values in `x`." ) # beyond the last, no data in window }) @@ -928,7 +928,7 @@ test_that("results for different `before`s and `after`s match between epi_slide ), before = before, after = after, names_sep = NULL, ...) result2 <- epi_slide_mean(epi_data, - col_name = c("a", "b"), na.rm = TRUE, + col_names = c("a", "b"), na.rm = TRUE, before = before, after = after, ...) expect_identical(result1, result2) } @@ -1028,7 +1028,7 @@ test_that("results for different time_types match between epi_slide and epi_slid ), before = before, after = after, names_sep = NULL, ...) result2 <- epi_slide_mean(epi_data, - col_name = c("a", "b"), na.rm = TRUE, + col_names = c("a", "b"), na.rm = TRUE, before = before, after = after, ...) expect_identical(result1, result2) @@ -1051,7 +1051,7 @@ test_that("results for different time_types match between epi_slide and epi_slid epi_data <- generate_special_date_data(weeks) %>% group_by(geo_value) result2 <- epi_slide_mean(epi_data, - col_name = c("a", "b"), na.rm = TRUE, + col_names = c("a", "b"), na.rm = TRUE, before = before, after = after, ...) expect_identical(select(ref_result, -time_value), select(result2, -time_value)) }) @@ -1078,7 +1078,7 @@ test_that("special time_types without time_step fail in epi_slide_mean", { )) expect_error( - epi_slide_mean(epi_data, col_name = "a", + epi_slide_mean(epi_data, col_names = "a", before = before, after = after ), class = "epiprocess__epi_slide_mean__unmappable_time_type" From 862f112244a977ab68585cfb3620fab7ca94d3dd Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 6 Mar 2024 11:20:28 -0500 Subject: [PATCH 176/345] Revert "support list col output" This reverts commit a51e7ee720a8005d281ce96923b43c6db6bd904d. 
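With this revert, list-column output is intentionally out of scope for `epi_slide_mean` (the diff below turns `as_list_col = TRUE` into an error). A sketch of the remaining route for list columns, reusing the nested-columns example from this package's own docs:

```r
library(dplyr)
library(epiprocess)

# List-column output stays available through the general `epi_slide`
jhu_csse_daily_subset %>%
  group_by(geo_value) %>%
  epi_slide(
    a = data.frame(
      cases_2dav = mean(cases),
      cases_2dma = mad(cases)
    ),
    before = 1, as_list_col = TRUE
  ) %>%
  ungroup()
```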
--- R/slide.R | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/R/slide.R b/R/slide.R index 4a52438a..489ab6b7 100644 --- a/R/slide.R +++ b/R/slide.R @@ -513,6 +513,13 @@ epi_slide_mean = function(x, col_names, ..., before, after, ref_time_values, names_sep = "_", all_rows = FALSE) { assert_class(x, "epi_df") + if (as_list_col) { + cli::cli_abort( + "`as_list_col` is not supported for `epi_slide_mean`", + class = "epiproces__epi_slide_mean__list_not_supported" + ) + } + user_provided_rtvs <- !missing(ref_time_values) if (!user_provided_rtvs) { ref_time_values <- unique(x$time_value) @@ -652,16 +659,6 @@ epi_slide_mean = function(x, col_names, ..., before, after, ref_time_values, result <- result[result$time_value %in% ref_time_values, ] } - if (as_list_col) { - result[, result_col_names] <- purrr::map(result_col_names, - function(.x) { - tmp <- result[[.x]] - tmp[is.na(tmp)] <- list(NULL) - as.list(tmp) - } - ) - } - if (!is_epi_df(result)) { # `all_rows` and `as_list_col` handling strip epi_df format and metadata. # Restore them. From 396df5341ccd730917932c56a8f6950c8f142806 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 6 Mar 2024 13:28:38 -0500 Subject: [PATCH 177/345] expect error when as_list_col is used in epi_slide_mean --- tests/testthat/test-epi_slide.R | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/testthat/test-epi_slide.R b/tests/testthat/test-epi_slide.R index 68b2103a..1d15884c 100644 --- a/tests/testthat/test-epi_slide.R +++ b/tests/testthat/test-epi_slide.R @@ -303,15 +303,13 @@ test_that("computation output formats x as_list_col", { slide_value_value = slide_value ) %>% select(-slide_value) ) - expect_identical( + expect_error( toy_edf %>% filter( geo_value == "a" ) %>% epi_slide_mean( "value", before = 6L, as_list_col = TRUE, na.rm = TRUE ), - basic_result_from_size1_mean %>% dplyr::mutate( - slide_value_value = as.list(slide_value) - ) %>% select(-slide_value) + class = "epiproces__epi_slide_mean__list_not_supported" ) # `epi_slide_mean` doesn't return dataframe columns }) From 8370a1b5890662cd26d258aada885db9ad76dedf Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 6 Mar 2024 11:35:58 -0500 Subject: [PATCH 178/345] clean up examples for epi_slide and _mean --- R/slide.R | 54 +++++++++++++++++++++++++---------------- man/epi_slide.Rd | 27 +++++++++++++-------- man/epi_slide_mean.Rd | 56 ++++++++++++++++++++++++------------------- 3 files changed, 83 insertions(+), 54 deletions(-) diff --git a/R/slide.R b/R/slide.R index 489ab6b7..3d114ff4 100644 --- a/R/slide.R +++ b/R/slide.R @@ -130,32 +130,38 @@ #' @seealso [`epi_slide_mean`] #' @examples #' # slide a 7-day trailing average formula on cases +#' # This and other simple sliding means are much faster to do using +#' # the `epi_slide_mean` function instead. #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% #' epi_slide(cases_7dav = mean(cases), before = 6) %>% -#' # rmv a nonessential var. to ensure new col is printed -#' dplyr::select(-death_rate_7d_av) +#' # Remove a nonessential var. to ensure new col is printed +#' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% +#' ungroup() #' #' # slide a 7-day leading average #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% #' epi_slide(cases_7dav = mean(cases), after = 6) %>% -#' # rmv a nonessential var. 
to ensure new col is printed -#' dplyr::select(-death_rate_7d_av) +#' # Remove a nonessential var. to ensure new col is printed +#' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% +#' ungroup() #' #' # slide a 7-day centre-aligned average #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% #' epi_slide(cases_7dav = mean(cases), before = 3, after = 3) %>% -#' # rmv a nonessential var. to ensure new col is printed -#' dplyr::select(-death_rate_7d_av) +#' # Remove a nonessential var. to ensure new col is printed +#' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% +#' ungroup() #' #' # slide a 14-day centre-aligned average #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide(cases_7dav = mean(cases), before = 6, after = 7) %>% -#' # rmv a nonessential var. to ensure new col is printed -#' dplyr::select(-death_rate_7d_av) +#' epi_slide(cases_14dav = mean(cases), before = 6, after = 7) %>% +#' # Remove a nonessential var. to ensure new col is printed +#' dplyr::select(geo_value, time_value, cases, cases_14dav) %>% +#' ungroup() #' #' # nested new columns #' jhu_csse_daily_subset %>% @@ -166,7 +172,8 @@ #' cases_2dma = mad(cases) #' ), #' before = 1, as_list_col = TRUE -#' ) +#' ) %>% +#' ungroup() epi_slide <- function(x, f, ..., before, after, ref_time_values, time_step, new_col_name = "slide_value", as_list_col = FALSE, @@ -476,37 +483,44 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% #' epi_slide_mean("cases", new_col_names = "cases_7dav", names_sep = NULL, before = 6) %>% -#' # rmv a nonessential var. to ensure new col is printed -#' dplyr::select(-death_rate_7d_av) +#' # Remove a nonessential var. to ensure new col is printed +#' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% +#' ungroup() #' #' # slide a 7-day trailing average formula on cases. Adjust `frollmean` settings for speed #' # and accuracy, and to allow partially-missing windows. #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% #' epi_slide_mean("cases", new_col_names = "cases_7dav", names_sep = NULL, before = 6, -#' na.rm = TRUE, algo = "exact", hasNA = TRUE) %>% -#' dplyr::select(-death_rate_7d_av) +#' # `frollmean` options +#' na.rm = TRUE, algo = "exact", hasNA = TRUE +#' ) %>% +#' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% +#' ungroup() #' #' # slide a 7-day leading average #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% #' epi_slide_mean("cases", new_col_names = "cases_7dav", names_sep = NULL, after = 6) %>% -#' # rmv a nonessential var. to ensure new col is printed -#' dplyr::select(-death_rate_7d_av) +#' # Remove a nonessential var. to ensure new col is printed +#' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% +#' ungroup() #' #' # slide a 7-day centre-aligned average #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% #' epi_slide_mean("cases", new_col_names = "cases_7dav", names_sep = NULL, before = 3, after = 3) %>% -#' # rmv a nonessential var. to ensure new col is printed -#' dplyr::select(-death_rate_7d_av) +#' # Remove a nonessential var. to ensure new col is printed +#' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% +#' ungroup() #' #' # slide a 14-day centre-aligned average #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% #' epi_slide_mean("cases", new_col_names = "cases_14dav", names_sep = NULL, before = 6, after = 7) %>% -#' # rmv a nonessential var. 
to ensure new col is printed -#' dplyr::select(-death_rate_7d_av) +#' # Remove a nonessential var. to ensure new col is printed +#' dplyr::select(geo_value, time_value, cases, cases_14dav) %>% +#' ungroup() epi_slide_mean = function(x, col_names, ..., before, after, ref_time_values, time_step, new_col_names = "slide_value", as_list_col = FALSE, diff --git a/man/epi_slide.Rd b/man/epi_slide.Rd index bdb36506..d09dfdda 100644 --- a/man/epi_slide.Rd +++ b/man/epi_slide.Rd @@ -159,32 +159,38 @@ through the \code{new_col_name} argument. } \examples{ # slide a 7-day trailing average formula on cases +# This and other simple sliding means are much faster to do using +# the `epi_slide_mean` function instead. jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% epi_slide(cases_7dav = mean(cases), before = 6) \%>\% - # rmv a nonessential var. to ensure new col is printed - dplyr::select(-death_rate_7d_av) + # Remove a nonessential var. to ensure new col is printed + dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% + ungroup() # slide a 7-day leading average jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% epi_slide(cases_7dav = mean(cases), after = 6) \%>\% - # rmv a nonessential var. to ensure new col is printed - dplyr::select(-death_rate_7d_av) + # Remove a nonessential var. to ensure new col is printed + dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% + ungroup() # slide a 7-day centre-aligned average jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% epi_slide(cases_7dav = mean(cases), before = 3, after = 3) \%>\% - # rmv a nonessential var. to ensure new col is printed - dplyr::select(-death_rate_7d_av) + # Remove a nonessential var. to ensure new col is printed + dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% + ungroup() # slide a 14-day centre-aligned average jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide(cases_7dav = mean(cases), before = 6, after = 7) \%>\% - # rmv a nonessential var. to ensure new col is printed - dplyr::select(-death_rate_7d_av) + epi_slide(cases_14dav = mean(cases), before = 6, after = 7) \%>\% + # Remove a nonessential var. to ensure new col is printed + dplyr::select(geo_value, time_value, cases, cases_14dav) \%>\% + ungroup() # nested new columns jhu_csse_daily_subset \%>\% @@ -195,7 +201,8 @@ jhu_csse_daily_subset \%>\% cases_2dma = mad(cases) ), before = 1, as_list_col = TRUE - ) + ) \%>\% + ungroup() } \seealso{ \code{\link{epi_slide_mean}} diff --git a/man/epi_slide_mean.Rd b/man/epi_slide_mean.Rd index 9dfe7767..457d5b69 100644 --- a/man/epi_slide_mean.Rd +++ b/man/epi_slide_mean.Rd @@ -6,13 +6,13 @@ \usage{ epi_slide_mean( x, - col_name, + col_names, ..., before, after, ref_time_values, time_step, - new_col_name = "slide_value", + new_col_names = "slide_value", as_list_col = FALSE, names_sep = "_", all_rows = FALSE @@ -23,7 +23,7 @@ epi_slide_mean( or ungrouped. If ungrouped, all data in \code{x} will be treated as part of a single data group.} -\item{col_name}{A character vector of the names of one or more columns for +\item{col_names}{A character vector of the names of one or more columns for which to calculate the rolling mean.} \item{...}{Additional arguments to pass to \code{data.table::frollmean}, for @@ -62,20 +62,20 @@ return an object of class \code{lubridate::period}. 
For example, we can use \code{time_step = lubridate::hours} in order to set the time step to be one hour (this would only be meaningful if \code{time_value} is of class \code{POSIXct}).} -\item{new_col_name}{String indicating the name of the new column that will +\item{new_col_names}{String indicating the name of the new column that will contain the derivative values. Default is "slide_value"; note that setting -\code{new_col_name} equal to an existing column name will overwrite this column.} +\code{new_col_names} equal to an existing column name will overwrite this column.} \item{as_list_col}{Should the slide results be held in a list column, or be \link[tidyr:chop]{unchopped}/\link[tidyr:unnest]{unnested}? Default is \code{FALSE}, in which case a list object returned by \code{f} would be unnested (using \code{\link[tidyr:unnest]{tidyr::unnest()}}), and, if the slide computations output data frames, -the names of the resulting columns are given by prepending \code{new_col_name} +the names of the resulting columns are given by prepending \code{new_col_names} to the names of the list elements.} \item{names_sep}{String specifying the separator to use in \code{tidyr::unnest()} when \code{as_list_col = FALSE}. Default is "_". Using \code{NULL} drops the prefix -from \code{new_col_name} entirely.} +from \code{new_col_names} entirely.} \item{all_rows}{If \code{all_rows = TRUE}, then all rows of \code{x} will be kept in the output even with \code{ref_time_values} provided, with some type of missing @@ -89,8 +89,9 @@ operations, you might want to replace these \code{NULL} entries with a different \code{NA} marker.} } \value{ -An \code{epi_df} object given by appending a new column to \code{x}, named -according to the \code{new_col_name} argument. +An \code{epi_df} object given by appending one or more new columns to +\code{x}, depending on the \code{col_names} argument, named according to the +\code{new_col_names} argument. } \description{ Slides an n-timestep mean over variables in an \code{epi_df} object. See the \href{https://cmu-delphi.github.io/epiprocess/articles/slide.html}{slide vignette} for @@ -130,38 +131,45 @@ misspelled.) # slide a 7-day trailing average formula on cases jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide_mean("cases", new_col_name = "cases_7dav", names_sep = NULL, before = 6) \%>\% - # rmv a nonessential var. to ensure new col is printed - dplyr::select(-death_rate_7d_av) + epi_slide_mean("cases", new_col_names = "cases_7dav", names_sep = NULL, before = 6) \%>\% + # Remove a nonessential var. to ensure new col is printed + dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% + ungroup() # slide a 7-day trailing average formula on cases. Adjust `frollmean` settings for speed # and accuracy, and to allow partially-missing windows. jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide_mean("cases", new_col_name = "cases_7dav", names_sep = NULL, before = 6, - na.rm = TRUE, algo = "exact", hasNA = TRUE) \%>\% - dplyr::select(-death_rate_7d_av) + epi_slide_mean("cases", new_col_names = "cases_7dav", names_sep = NULL, before = 6, + # `frollmean` options + na.rm = TRUE, algo = "exact", hasNA = TRUE + ) \%>\% + dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% + ungroup() # slide a 7-day leading average jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide_mean("cases", new_col_name = "cases_7dav", names_sep = NULL, after = 6) \%>\% - # rmv a nonessential var. 
to ensure new col is printed - dplyr::select(-death_rate_7d_av) + epi_slide_mean("cases", new_col_names = "cases_7dav", names_sep = NULL, after = 6) \%>\% + # Remove a nonessential var. to ensure new col is printed + dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% + ungroup() # slide a 7-day centre-aligned average jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide_mean("cases", new_col_name = "cases_7dav", names_sep = NULL, before = 3, after = 3) \%>\% - # rmv a nonessential var. to ensure new col is printed - dplyr::select(-death_rate_7d_av) + epi_slide_mean("cases", new_col_names = "cases_7dav", names_sep = NULL, before = 3, after = 3) \%>\% + # Remove a nonessential var. to ensure new col is printed + dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% + ungroup() # slide a 14-day centre-aligned average jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide_mean("cases", new_col_name = "cases_14dav", names_sep = NULL, before = 6, after = 7) \%>\% - # rmv a nonessential var. to ensure new col is printed - dplyr::select(-death_rate_7d_av) + epi_slide_mean("cases", new_col_names = "cases_14dav", names_sep = NULL, before = 6, after = 7) \%>\% + # Remove a nonessential var. to ensure new col is printed + dplyr::select(geo_value, time_value, cases, cases_14dav) \%>\% + ungroup() } \seealso{ \code{\link{epi_slide}} From 4c9e63232cb4c7c50bc5ee1c691381df6b7da839 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 6 Mar 2024 11:44:34 -0500 Subject: [PATCH 179/345] comment use of non-time_step-transformed before/after in full_date_seq --- R/slide.R | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/R/slide.R b/R/slide.R index 3d114ff4..6f05a336 100644 --- a/R/slide.R +++ b/R/slide.R @@ -686,6 +686,10 @@ epi_slide_mean = function(x, col_names, ..., before, after, ref_time_values, #' (x$time_value). Produce lists of dates before min(x$time_value) and after #' max(x$time_value) for padding initial and final windows to size `n`. #' +#' `before` and `after` inputs here should be raw (numeric) values; +#' `time_step` function should NOT have been applied. `full_date_seq` applies +#' `time_step` as needed. +#' #' @importFrom checkmate assert_function #' @noRd full_date_seq <- function(x, before, after, time_step) { From 5f3af61862a9c54fc22f273f71b85c95c28a0ecd Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 6 Mar 2024 13:12:53 -0500 Subject: [PATCH 180/345] fail if time_values are unevenly spaced --- R/slide.R | 17 ++++++++++++++--- tests/testthat/test-epi_slide.R | 19 ++++++++++++++++++- 2 files changed, 32 insertions(+), 4 deletions(-) diff --git a/R/slide.R b/R/slide.R index 6f05a336..0e45340e 100644 --- a/R/slide.R +++ b/R/slide.R @@ -628,7 +628,7 @@ epi_slide_mean = function(x, col_names, ..., before, after, ref_time_values, # order. So if the computation is aggregating across multiple obs for the # same date, `epi_slide_mean` will produce incorrect results; `epi_slide` # should be used instead. - if (anyDuplicated(.data_group$time_value) > 0) { + if (anyDuplicated(.data_group$time_value) != 0L) { cli_abort(c( "group contains duplicate time values. 
Using `epi_slide_mean` on this group will result in incorrect results", @@ -641,6 +641,17 @@ epi_slide_mean = function(x, col_names, ..., before, after, ref_time_values, epiprocess__group_key = .group_key ) } + if (nrow(.data_group) != length(c(all_dates, pad_early_dates, pad_late_dates))) { + cli_abort(c( + "group contains an unexpected number of rows", + "i" = c("Input data may contain `time_values` closer together than the + expected `time_step` size") + ), + class = "epiprocess__epi_slide_mean__unexpected_row_number", + epiprocess__data_group = .data_group, + epiprocess__group_key = .group_key + ) + } roll_output <- data.table::frollmean( x = .data_group[, col_names], n = m, align = "right", ... @@ -728,8 +739,8 @@ full_date_seq <- function(x, before, after, time_step) { if (is.na(by)) { cli_abort( c( - "`frollmean` requires a full window to compute a result, but - `time_type` associated with the epi_df was not mappable to period + "`frollmean` requires a full window to compute a result, but the + `time_type` associated with the epi_df was not mappable to a period type valid for creating a date sequence.", "i" = c("The input data's `time_type` was probably `custom` or `day-time`. These require also passing a `time_step` function.") diff --git a/tests/testthat/test-epi_slide.R b/tests/testthat/test-epi_slide.R index 1d15884c..38662d5f 100644 --- a/tests/testthat/test-epi_slide.R +++ b/tests/testthat/test-epi_slide.R @@ -1229,4 +1229,21 @@ test_that("helper `full_date_seq` returns expected date values", { pad_late_dates = as.Date(c("2022-02-19")) ) ) -}) \ No newline at end of file +}) + +test_that("`epi_slide_mean` errors when passed `time_values` with closer than expected spacing", { + time_df <- tibble( + geo_value = 1, + value = c(0:7, 3.5, 10, 20), + # Adding the value 3.5 creates a time that has fractional seconds, which + # doesn't follow the expected 1-second spacing of the `time_values`. + # This leads to `frollmean` using obs spanning less than the expected + # time frame for some computation windows. + time_value = Sys.time() + value + ) %>% + as_epi_df() + expect_error( + epi_slide_mean(time_df, "value", before = 6L, time_step = lubridate::seconds), + class = "epiprocess__epi_slide_mean__unexpected_row_number" + ) +}) From 408148578c3b6136c0cc0cc2ec19948c3aafc626 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 6 Mar 2024 13:31:36 -0500 Subject: [PATCH 181/345] test cleanup --- tests/testthat/test-epi_slide.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test-epi_slide.R b/tests/testthat/test-epi_slide.R index 38662d5f..1f505173 100644 --- a/tests/testthat/test-epi_slide.R +++ b/tests/testthat/test-epi_slide.R @@ -1050,7 +1050,7 @@ test_that("results for different time_types match between epi_slide and epi_slid group_by(geo_value) result2 <- epi_slide_mean(epi_data, col_names = c("a", "b"), na.rm = TRUE, - before = before, after = after, ...) 
+ before = 6L, after = 0L) expect_identical(select(ref_result, -time_value), select(result2, -time_value)) }) From 6b325dac6325dd2e90e9aaccab1912bf13628de7 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Wed, 6 Mar 2024 17:17:01 -0800 Subject: [PATCH 182/345] Update .github/workflows/pkgdown.yaml Co-authored-by: brookslogan --- .github/workflows/pkgdown.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index 47dd6ed6..1ee7ef45 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -16,6 +16,7 @@ name: pkgdown jobs: pkgdown: + # only build docs on the main repository and not forks if: github.repository_owner == 'cmu-delphi' runs-on: ubuntu-latest # Only restrict concurrency for non-PR jobs From c16b98bcbc729eb5919afff265fe89f1f3748865 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Wed, 6 Mar 2024 17:25:28 -0800 Subject: [PATCH 183/345] Update DEVELOPMENT.md Co-authored-by: brookslogan --- DEVELOPMENT.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 369b51c3..5c242ecc 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -50,7 +50,7 @@ Open a release issue and then copy and follow this checklist in the issue (modif - [ ] `git pull` - [ ] Check [current CRAN check results](https://cran.rstudio.org/web/checks/check_results_epiprocess.html) -- [ ] `devtools::check(".", manual = TRUE, env_vars = c(NOT_CRAN = "false"))`. +- [ ] `devtools::check(".", manual = TRUE, env_vars = c(NOT_CRAN = "FALSE"))`. - Aim for 10/10, no notes. - [ ] If check works well enough, merge to main. Otherwise open a PR to fix up. - [ ] [Polish NEWS](https://github.com/cmu-delphi/epiprocess/blob/dev/NEWS.md). From 219e04aef7928a1a66bd04aab89ce1e0dab3278e Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Mon, 4 Mar 2024 17:37:44 -0800 Subject: [PATCH 184/345] ci: add document, lint, style like epidatr --- .github/workflows/document.yaml | 52 +++++++++++++++++++++ .github/workflows/lint.yaml | 34 ++++++++++++++ .github/workflows/style.yml | 82 +++++++++++++++++++++++++++++++++ 3 files changed, 168 insertions(+) create mode 100644 .github/workflows/document.yaml create mode 100644 .github/workflows/lint.yaml create mode 100644 .github/workflows/style.yml diff --git a/.github/workflows/document.yaml b/.github/workflows/document.yaml new file mode 100644 index 00000000..17a85104 --- /dev/null +++ b/.github/workflows/document.yaml @@ -0,0 +1,52 @@ +# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples +# Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help +on: + push: + paths: ["R/**", "README.Rmd"] + workflow_dispatch: + +name: Document + +jobs: + document: + runs-on: ubuntu-latest + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + DELPHI_EPIDATA_KEY: ${{ secrets.DELPHI_GITHUB_ACTIONS_EPIDATA_API_KEY }} + steps: + - name: Checkout repo + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Setup R + uses: r-lib/actions/setup-r@v2 + with: + use-public-rspm: true + + - name: Install dependencies + uses: r-lib/actions/setup-r-dependencies@v2 + with: + extra-packages: | + any::devtools + any::roxygen2 + needs: | + devtools + roxygen2 + + - name: Document + run: roxygen2::roxygenise() + shell: Rscript {0} + + - name: Build README.md from README.Rmd + run: Rscript -e 'devtools::build_readme()' + + - name: Commit and push changes + run: | + git config --local user.name "$GITHUB_ACTOR" + git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com" + git add README.md + git add man/\* NAMESPACE DESCRIPTION + git commit -m "docs: document (GHA)" || echo "No changes to commit" + git pull --rebase + git push origin diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml new file mode 100644 index 00000000..bc8d3525 --- /dev/null +++ b/.github/workflows/lint.yaml @@ -0,0 +1,34 @@ +# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples +# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help +on: + push: + branches: [main, dev] + pull_request: + branches: [main, dev] + workflow_dispatch: + +name: Lint + +jobs: + lint: + runs-on: ubuntu-latest + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + DELPHI_EPIDATA_KEY: ${{ secrets.DELPHI_GITHUB_ACTIONS_EPIDATA_API_KEY }} + steps: + - uses: actions/checkout@v3 + + - uses: r-lib/actions/setup-r@v2 + with: + use-public-rspm: true + + - uses: r-lib/actions/setup-r-dependencies@v2 + with: + extra-packages: any::lintr, local::. + needs: lint + + - name: Lint + run: lintr::lint_package() + shell: Rscript {0} + env: + LINTR_ERROR_ON_LINT: true diff --git a/.github/workflows/style.yml b/.github/workflows/style.yml new file mode 100644 index 00000000..acdc0470 --- /dev/null +++ b/.github/workflows/style.yml @@ -0,0 +1,82 @@ +# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples +# Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help +on: + push: + paths: + [ + "**.[rR]", + "**.[qrR]md", + "**.[rR]markdown", + "**.[rR]nw", + "**.[rR]profile", + ] + workflow_dispatch: + +name: Style + +jobs: + style: + runs-on: ubuntu-latest + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + DELPHI_EPIDATA_KEY: ${{ secrets.DELPHI_GITHUB_ACTIONS_EPIDATA_API_KEY }} + steps: + - name: Checkout repo + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Setup R + uses: r-lib/actions/setup-r@v2 + with: + use-public-rspm: true + + - name: Install dependencies + uses: r-lib/actions/setup-r-dependencies@v2 + with: + extra-packages: any::styler, any::roxygen2 + needs: styler + + - name: Enable styler cache + run: styler::cache_activate() + shell: Rscript {0} + + - name: Determine cache location + id: styler-location + run: | + cat( + "location=", + styler::cache_info(format = "tabular")$location, + "\n", + file = Sys.getenv("GITHUB_OUTPUT"), + append = TRUE, + sep = "" + ) + shell: Rscript {0} + + - name: Cache styler + uses: actions/cache@v3 + with: + path: ${{ steps.styler-location.outputs.location }} + key: ${{ runner.os }}-styler-${{ github.sha }} + restore-keys: | + ${{ runner.os }}-styler- + ${{ runner.os }}- + + - name: Style + run: styler::style_pkg() + shell: Rscript {0} + + - name: Commit and push changes + run: | + if FILES_TO_COMMIT=($(git diff-index --name-only ${{ github.sha }} \ + | egrep --ignore-case '\.(R|[qR]md|Rmarkdown|Rnw|Rprofile)$')) + then + git config --local user.name "$GITHUB_ACTOR" + git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com" + git commit ${FILES_TO_COMMIT[*]} -m "style: styler (GHA)" + git pull --rebase + git push origin + else + echo "No changes to commit." + fi From b40f90f974abb0d6e9f4863bffa3644a21141fb0 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Tue, 5 Mar 2024 18:06:56 -0800 Subject: [PATCH 185/345] ci: add doc string with workflow mods --- .github/workflows/R-CMD-check.yaml | 3 ++- .github/workflows/document.yaml | 6 ++++++ .github/workflows/lint.yaml | 5 +++++ .github/workflows/pkgdown.yaml | 6 ++++-- .github/workflows/style.yml | 5 +++++ 5 files changed, 22 insertions(+), 3 deletions(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 4d0a2b03..c863c4f3 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -1,7 +1,8 @@ # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help # -# Created with usethis + edited to run on PRs to dev, use API key. +# Modifications: +# - API key secrets.SECRET_EPIPROCESS_GHACTIONS_DELPHI_EPIDATA_KEY on: push: branches: [main, dev] diff --git a/.github/workflows/document.yaml b/.github/workflows/document.yaml index 17a85104..8eb92921 100644 --- a/.github/workflows/document.yaml +++ b/.github/workflows/document.yaml @@ -1,5 +1,11 @@ # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples # Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help +# +# Modifications: +# - devtools::build_readme() +# - workflow_dispatch added to allow manual triggering of the workflow +# - trigger branches changed +# - API key secrets.DELPHI_GITHUB_ACTIONS_EPIDATA_API_KEY on: push: paths: ["R/**", "README.Rmd"] diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml index bc8d3525..21d030c9 100644 --- a/.github/workflows/lint.yaml +++ b/.github/workflows/lint.yaml @@ -1,5 +1,10 @@ # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help +# +# Modifications: +# * workflow_dispatch added to allow manual triggering of the workflow +# * trigger branches changed +# * API key secrets.DELPHI_GITHUB_ACTIONS_EPIDATA_API_KEY on: push: branches: [main, dev] diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index 1ee7ef45..6a9d577b 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -1,8 +1,10 @@ # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help # -# Edits from above workflow: also check pkgdown for PRs to `dev` branch, and -# update the documentation web site on pushes to `dev` branch. +# Modifications: +# * workflow_dispatch added to allow manual triggering of the workflow +# * trigger branches changed +# * API key secrets.SECRET_EPIPROCESS_GHACTIONS_DELPHI_EPIDATA_KEY on: push: branches: [main, dev] diff --git a/.github/workflows/style.yml b/.github/workflows/style.yml index acdc0470..3206bddc 100644 --- a/.github/workflows/style.yml +++ b/.github/workflows/style.yml @@ -1,5 +1,10 @@ # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help +# +# Modifications: +# * workflow_dispatch added to allow manual triggering of the workflow +# * trigger branches changed +# * API key secrets.DELPHI_GITHUB_ACTIONS_EPIDATA_API_KEY on: push: paths: From cbf4e44af6bc40549b0b3c92f31891b0a6c003ad Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Tue, 5 Mar 2024 18:07:30 -0800 Subject: [PATCH 186/345] ci: fix style indentation --- .github/workflows/style.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/style.yml b/.github/workflows/style.yml index 3206bddc..396200a5 100644 --- a/.github/workflows/style.yml +++ b/.github/workflows/style.yml @@ -15,7 +15,7 @@ on: "**.[rR]nw", "**.[rR]profile", ] - workflow_dispatch: + workflow_dispatch: name: Style From 4a941d29e8e70855e201819cff7c595d396891cb Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Wed, 6 Mar 2024 17:38:44 -0800 Subject: [PATCH 187/345] doc: improve release process wording --- DEVELOPMENT.md | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 5c242ecc..a7f0488c 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -44,13 +44,14 @@ python -m http.server -d docs Please follow the guidelines in the [PR template document](.github/pull_request_template.md). 
-## Release process +## Planned CRAN release process Open a release issue and then copy and follow this checklist in the issue (modified from the checklist generated by `usethis::use_release_issue(version = "1.0.2")`): -- [ ] `git pull` -- [ ] Check [current CRAN check results](https://cran.rstudio.org/web/checks/check_results_epiprocess.html) -- [ ] `devtools::check(".", manual = TRUE, env_vars = c(NOT_CRAN = "FALSE"))`. +- [ ] `git pull` on `dev` branch. +- [ ] Make sure all changes are committed and pushed. +- [ ] Check [current CRAN check results](https://cran.rstudio.org/web/checks/check_results_epiprocess.html). +- [ ] `devtools::check(".", manual = TRUE, env_vars = c(NOT_CRAN = "false"))`. - Aim for 10/10, no notes. - [ ] If check works well enough, merge to main. Otherwise open a PR to fix up. - [ ] [Polish NEWS](https://github.com/cmu-delphi/epiprocess/blob/dev/NEWS.md). @@ -61,20 +62,20 @@ Open a release issue and then copy and follow this checklist in the issue (modif - This may choke on the MIT license url, and that's ok. - [ ] `devtools::build_readme()` - [ ] `devtools::check_win_devel()` -- [ ] Check email for problems +- [ ] Have maintainer ("cre" in description) check email for problems. - [ ] `revdepcheck::revdep_check(num_workers = 4)`. - This may choke, it is very sensitive to the binary versions of packages on a given system. Either bypass or ask someone else to run it if you're concerned. - [ ] Update `cran-comments.md` -- [ ] PR with any changes +- [ ] PR with any changes (and go through the list again) into dev and run through the list again. Submit to CRAN: -- [ ] `devtools::submit_cran()` -- [ ] Approve email +- [ ] `devtools::submit_cran()`. +- [ ] Maintainer approves email. Wait for CRAN... -- [ ] Accepted :tada: -- [ ] `dev` -- [ ] `usethis::use_github_release(publish = FALSE)` (publish off, otherwise it won't push). -- [ ] check the release notes and publish the branch on github +- [ ] If accepted :tada:, move on to next steps. +- [ ] Make sure you're on `dev`? +- [ ] `usethis::use_github_release(publish = FALSE)` (publish off, otherwise it won't push) will create a draft release and tag on the GitHub repo. +- [ ] Go to the repo, verify the release notes, and publish when ready. From aa5236375a73e91d6fd5b5fdbe0f126826c17ab2 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Wed, 6 Mar 2024 17:43:52 -0800 Subject: [PATCH 188/345] doc: add link to DEVELOPMENT.md in pull request template --- .github/pull_request_template.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index eefea863..c8841c35 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -14,6 +14,8 @@ Please: (backwards-incompatible changes to the documented interface) are noted. Collect the changes under the next release number (e.g. if you are on 1.7.2, then write your changes under the 1.8 heading). +- See [DEVELOPMENT.md](DEVELOPMENT.md) for more information on the development + process. 
### Change explanations for reviewer From f89ecdd059e1ec3e091e387159acb9fe355a2a86 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Wed, 6 Mar 2024 19:23:15 -0800 Subject: [PATCH 189/345] ci: simplify pkgdown ref logic --- .github/workflows/pkgdown.yaml | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index 6a9d577b..5091b0ed 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -43,18 +43,15 @@ jobs: - name: Build site run: | - override <- if (startsWith("${{ github.event_name }}", "pull_request")) { - if ("${{ github.base_ref }}" == "main") { - list(development = list(mode = "release", version_label = "light")) - } else { - list(development = list(mode = "devel", version_label = "success")) - } + target_ref <- if (startsWith("${{ github.event_name }}", "pull_request")) { + "${{ github.base_ref }}" } else { - if ("${{ github.ref_name }}" == "main") { - list(development = list(mode = "release", version_label = "light")) - } else { - list(development = list(mode = "devel", version_label = "success")) - } + "${{ github.ref }}" + } + override <- if (target_ref == "main") { + list(development = list(mode = "release", version_label = "light")) + } else { + list(development = list(mode = "devel", version_label = "success")) } pkg <- pkgdown::as_pkgdown(".", override = override) cli::cli_rule("Cleaning files from old site...") From a48d82cbd47cef1901fdf7a8f9073e6c23e12c82 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Thu, 7 Mar 2024 11:28:09 -0800 Subject: [PATCH 190/345] doc: transfer maintainer role from Ryan to Logan --- DESCRIPTION | 4 ++-- man/autoplot.epi_df.Rd | 9 ++++++--- man/epiprocess.Rd | 4 ++-- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index ca6a367e..71d95969 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -4,7 +4,7 @@ Title: Tools for basic signal processing in epidemiology Version: 0.7.5 Authors@R: c( person("Jacob", "Bien", role = "ctb"), - person("Logan", "Brooks", role = "aut"), + person("Logan", "Brooks", email = "lcbrooks@andrew.cmu.edu", role = c("aut", "cre")), person("Rafael", "Catoia", role = "ctb"), person("Nat", "DeFries", role = "ctb"), person("Daniel", "McDonald", role = "aut"), @@ -14,7 +14,7 @@ Authors@R: c( person("Quang", "Nguyen", role = "ctb"), person("Evan", "Ray", role = "aut"), person("Dmitry", "Shemetov", role = "ctb"), - person("Ryan", "Tibshirani", , "ryantibs@cmu.edu", role = c("aut", "cre")), + person("Ryan", "Tibshirani", role = "aut"), person("Lionel", "Henry", role = "ctb", comment = "Author of included rlang fragments"), person("Hadley", "Wickham", role = "ctb", comment = "Author of included rlang fragments"), person("Posit", role = "cph", comment = "Copyright holder of included rlang fragments") diff --git a/man/autoplot.epi_df.Rd b/man/autoplot.epi_df.Rd index a87bc8ca..c97ea02f 100644 --- a/man/autoplot.epi_df.Rd +++ b/man/autoplot.epi_df.Rd @@ -56,10 +56,13 @@ autoplot(jhu_csse_daily_subset, case_rate_7d_av, .color_by = "none", .facet_by = "geo_value" ) -autoplot(jhu_csse_daily_subset, case_rate_7d_av, .color_by = "none", - .base_color = "red", .facet_by = "geo_value") +autoplot(jhu_csse_daily_subset, case_rate_7d_av, + .color_by = "none", + .base_color = "red", .facet_by = "geo_value" +) # .base_color specification won't have any effect due .color_by default autoplot(jhu_csse_daily_subset, case_rate_7d_av, - .base_color = "red", .facet_by = "geo_value") + 
.base_color = "red", .facet_by = "geo_value" +) } diff --git a/man/epiprocess.Rd b/man/epiprocess.Rd index 7c3ecd8a..f6345cbe 100644 --- a/man/epiprocess.Rd +++ b/man/epiprocess.Rd @@ -18,13 +18,13 @@ Useful links: } \author{ -\strong{Maintainer}: Ryan Tibshirani \email{ryantibs@cmu.edu} +\strong{Maintainer}: Logan Brooks \email{lcbrooks@andrew.cmu.edu} Authors: \itemize{ - \item Logan Brooks \item Daniel McDonald \item Evan Ray + \item Ryan Tibshirani } Other contributors: From 9143422eb7a5284d2126b8639b57748fe6a0bb5e Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Thu, 7 Mar 2024 11:53:38 -0800 Subject: [PATCH 191/345] doc: add comments explaining pkgdown action * also simplify the target_ref calculation --- .github/workflows/pkgdown.yaml | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index 5091b0ed..60edf1ad 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -42,16 +42,21 @@ jobs: needs: website - name: Build site + # - target_ref gets the ref from a different variable, depending on the event + # - override allows us to set the pkgdown mode and version_label + # - mode: release is the standard build mode, devel places the site in /dev + # - version_label: 'light' and 'success' are CSS labels for Bootswatch: Cosmo + # https://bootswatch.com/cosmo/ + # - we use pkgdown:::build_github_pages to build the site because of an issue in pkgdown + # https://github.com/r-lib/pkgdown/issues/2257 run: | - target_ref <- if (startsWith("${{ github.event_name }}", "pull_request")) { - "${{ github.base_ref }}" - } else { - "${{ github.ref }}" - } + target_ref <- ${{ github.event_name == 'pull_request' && github.base_ref || github.ref }} override <- if (target_ref == "main") { list(development = list(mode = "release", version_label = "light")) - } else { + } else if (target_ref == "dev") { list(development = list(mode = "devel", version_label = "success")) + } else { + stop("Unexpected target_ref: ", target_ref) } pkg <- pkgdown::as_pkgdown(".", override = override) cli::cli_rule("Cleaning files from old site...") From 1a0a8856ddf0dc24775d9eeed79844f8765f9d4a Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Thu, 7 Mar 2024 11:53:51 -0800 Subject: [PATCH 192/345] doc: clarify DEVELOPMENT.md --- DEVELOPMENT.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index a7f0488c..46fa3125 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -66,7 +66,7 @@ Open a release issue and then copy and follow this checklist in the issue (modif - [ ] `revdepcheck::revdep_check(num_workers = 4)`. - This may choke, it is very sensitive to the binary versions of packages on a given system. Either bypass or ask someone else to run it if you're concerned. - [ ] Update `cran-comments.md` -- [ ] PR with any changes (and go through the list again) into dev and run through the list again. +- [ ] PR with any changes (and go through the list again) into `dev` and run through the list again. Submit to CRAN: @@ -75,7 +75,7 @@ Submit to CRAN: Wait for CRAN... -- [ ] If accepted :tada:, move on to next steps. -- [ ] Make sure you're on `dev`? -- [ ] `usethis::use_github_release(publish = FALSE)` (publish off, otherwise it won't push) will create a draft release and tag on the GitHub repo. +- [ ] If accepted :tada:, move to next steps. If rejected, fix and resubmit. 
+- [ ] Open and merge a PR containing any updates made to `main` back to `dev`. +- [ ] `usethis::use_github_release(publish = FALSE)` (publish off, otherwise it won't push) will create a draft release based on the commit hash in CRAN-SUBMISSION and push a tag to the GitHub repo. - [ ] Go to the repo, verify the release notes, and publish when ready. From 6fd2e1f00d23489bfc0250bbfdb9f5a86cecd76c Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Thu, 7 Mar 2024 13:12:45 -0800 Subject: [PATCH 193/345] ci: fix pkgdown --- .github/workflows/pkgdown.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index 60edf1ad..ca4ee94c 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -50,7 +50,7 @@ jobs: # - we use pkgdown:::build_github_pages to build the site because of an issue in pkgdown # https://github.com/r-lib/pkgdown/issues/2257 run: | - target_ref <- ${{ github.event_name == 'pull_request' && github.base_ref || github.ref }} + target_ref <- "${{ github.event_name == 'pull_request' && github.base_ref || github.ref }}" override <- if (target_ref == "main") { list(development = list(mode = "release", version_label = "light")) } else if (target_ref == "dev") { From b19921abc25b8cc172f389c499d41e586db05aa6 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Thu, 7 Mar 2024 13:25:57 -0800 Subject: [PATCH 194/345] More color-blind friendly green --- _pkgdown.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/_pkgdown.yml b/_pkgdown.yml index 146f71b0..96d67946 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -16,6 +16,7 @@ template: bslib: font_scale: 1.0 primary: "#C41230" + success: "#B4D43C" link-color: "#C41230" navbar: From 68c027b6ae75bd2d733152fa4a450c92caa24f36 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Thu, 7 Mar 2024 13:43:05 -0800 Subject: [PATCH 195/345] repo: add to .Rbuildignore --- .Rbuildignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.Rbuildignore b/.Rbuildignore index 8ca62412..19b9f810 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -12,3 +12,7 @@ ^pkgdown$ ^doc$ ^Meta$ +^.bumpversion.cfg$ +^.git-blame-ignore-revs$ +^.lintr$ +^DEVELOPMENT.md$ \ No newline at end of file From 0c8144e0e9deb79547eb523298d782926a7337a6 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Thu, 7 Mar 2024 13:43:30 -0800 Subject: [PATCH 196/345] lint: increase lintr happiness --- R/archive.R | 13 ++- R/autoplot.R | 10 +- R/correlation.R | 2 +- R/data.R | 105 ++++++++++++------ R/epi_df.R | 16 +-- R/grouped_epi_archive.R | 20 +++- R/growth_rate.R | 17 ++- R/methods-epi_archive.R | 72 ++++++------ R/methods-epi_df.R | 9 +- R/outliers.R | 23 ++-- R/slide.R | 24 ++-- R/utils.R | 66 +++++------ man/archive_cases_dv_subset.Rd | 33 ++++-- man/incidence_num_outlier_example.Rd | 23 ++-- man/jhu_csse_county_level_subset.Rd | 19 +++- man/jhu_csse_daily_subset.Rd | 15 ++- tests/testthat/test-archive-version-bounds.R | 15 ++- tests/testthat/test-data.R | 2 +- tests/testthat/test-epi_df.R | 12 +- tests/testthat/test-epi_slide.R | 73 ++++++------ .../testthat/test-epix_fill_through_version.R | 10 +- tests/testthat/test-epix_merge.R | 14 ++- tests/testthat/test-epix_slide.R | 13 ++- tests/testthat/test-grouped_epi_archive.R | 2 + tests/testthat/test-methods-epi_df.R | 4 +- vignettes/advanced.Rmd | 29 ++--- vignettes/archive.Rmd | 20 ++-- vignettes/compactify.Rmd | 4 +- vignettes/epiprocess.Rmd | 40 ++++--- vignettes/outliers.Rmd | 14 +-- vignettes/slide.Rmd | 8 +- 31 files changed, 
443 insertions(+), 284 deletions(-) diff --git a/R/archive.R b/R/archive.R index 428cce76..ff3bc20c 100644 --- a/R/archive.R +++ b/R/archive.R @@ -1,10 +1,10 @@ # We use special features of data.table's `[`. The data.table package has a # compatibility feature that disables some/all of these features if it thinks we # might expect `data.frame`-compatible behavior instead. We can signal that we -# want the special behavior via `.datatable.aware = TRUE` or by importing any +# want the special behavior via `.datatable_aware = TRUE` or by importing any # `data.table` package member. Do both to prevent surprises if we decide to use # `data.table::` everywhere and not importing things. -.datatable.aware <- TRUE +.datatable_aware <- TRUE #' Validate a version bound arg #' @@ -79,6 +79,7 @@ max_version_with_row_in <- function(x) { version_bound <- max(version_col) } } + version_bound } #' Get the next possible value greater than `x` of the same type @@ -343,7 +344,7 @@ epi_archive <- # then the call to as.data.table() will fail to set keys, so we # need to check this, then do it manually if needed key_vars <- c("geo_value", "time_value", other_keys, "version") - DT <- as.data.table(x, key = key_vars) + DT <- as.data.table(x, key = key_vars) # nolint: object_name_linter if (!identical(key_vars, key(DT))) setkeyv(DT, cols = key_vars) maybe_first_duplicate_key_row_index <- anyDuplicated(DT, by = key(DT)) @@ -381,7 +382,7 @@ epi_archive <- # Runs compactify on data frame if (is.null(compactify) || compactify == TRUE) { elim <- keep_locf(DT) - DT <- rm_locf(DT) + DT <- rm_locf(DT) # nolint: object_name_linter } else { # Create empty data frame for nrow(elim) to be 0 elim <- tibble::tibble() @@ -543,7 +544,7 @@ epi_archive <- validate_version_bound(fill_versions_end, self$DT, na_ok = FALSE) how <- arg_match(how) if (self$versions_end < fill_versions_end) { - new_DT <- switch(how, + new_DT <- switch(how, # nolint: object_name_linter "na" = { # old DT + a version consisting of all NA observations # immediately after the last currently/actually-observed @@ -567,7 +568,7 @@ epi_archive <- if (identical(address(self$DT), address(nonversion_key_vals_ever_recorded))) { nonversion_key_vals_ever_recorded <- copy(nonversion_key_vals_ever_recorded) } - next_version_DT <- nonversion_key_vals_ever_recorded[ + next_version_DT <- nonversion_key_vals_ever_recorded[ # nolint: object_name_linter , version := next_version_tag ][ # this makes the class of these columns logical (`NA` is a diff --git a/R/autoplot.R b/R/autoplot.R index 8686fb24..e9f5cb83 100644 --- a/R/autoplot.R +++ b/R/autoplot.R @@ -112,14 +112,14 @@ autoplot.epi_df <- function( dplyr::mutate( .colours = switch(.color_by, all_keys = interaction(!!!all_keys, sep = "/"), - geo_value = geo_value, + geo_value = .data$geo_value, other_keys = interaction(!!!other_keys, sep = "/"), all = interaction(!!!all_avail, sep = "/"), NULL ), .facets = switch(.facet_by, all_keys = interaction(!!!all_keys, sep = "/"), - geo_value = as.factor(geo_value), + geo_value = as.factor(.data$geo_value), other_keys = interaction(!!!other_keys, sep = "/"), all = interaction(!!!all_avail, sep = "/"), NULL @@ -130,10 +130,10 @@ autoplot.epi_df <- function( n_facets <- nlevels(object$.facets) if (n_facets > .max_facets) { top_n <- levels(as.factor(object$.facets))[seq_len(.max_facets)] - object <- dplyr::filter(object, .facets %in% top_n) %>% - dplyr::mutate(.facets = droplevels(.facets)) + object <- dplyr::filter(object, .data$.facets %in% top_n) %>% + dplyr::mutate(.facets = 
droplevels(.data$.facets)) if (".colours" %in% names(object)) { - object <- dplyr::mutate(object, .colours = droplevels(.colours)) + object <- dplyr::mutate(object, .colours = droplevels(.data$.colours)) } } } diff --git a/R/correlation.R b/R/correlation.R index e4272fdd..5e9694c4 100644 --- a/R/correlation.R +++ b/R/correlation.R @@ -75,7 +75,7 @@ #' cor_by = geo_value, #' dt1 = -2 #' ) -epi_cor <- function(x, var1, var2, dt1 = 0, dt2 = 0, shift_by = geo_value, +epi_cor <- function(x, var1, var2, dt1 = 0, dt2 = 0, shift_by = geo_value, # nolint: object_usage_linter cor_by = geo_value, use = "na.or.complete", method = c("pearson", "kendall", "spearman")) { assert_class(x, "epi_df") diff --git a/R/data.R b/R/data.R index 2a5e5738..26b9f39f 100644 --- a/R/data.R +++ b/R/data.R @@ -20,12 +20,15 @@ #' COVID-19 cases, daily} #' } #' @source This object contains a modified part of the -#' \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University} -#' as \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{republished in the COVIDcast Epidata API}. -#' This data set is licensed under the terms of the -#' \href{https://creativecommons.org/licenses/by/4.0/}{Creative Commons Attribution 4.0 International license} -#' by the Johns Hopkins University on behalf of its Center for Systems Science -#' in Engineering. Copyright Johns Hopkins University 2020. +#' \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 Data Repository +#' by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins +#' University} as +#' \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{republished +#' in the COVIDcast Epidata API}. This data set is licensed under the terms of +#' the \href{https://creativecommons.org/licenses/by/4.0/}{Creative Commons +#' Attribution 4.0 International license} by the Johns Hopkins University on +#' behalf of its Center for Systems Science in Engineering. Copyright Johns +#' Hopkins University 2020. #' #' Modifications: #' * \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{From @@ -54,19 +57,34 @@ #' \item{geo_value}{the geographic value associated with each row of measurements.} #' \item{time_value}{the time value associated with each row of measurements.} #' \item{version}{the time value specifying the version for each row of measurements. } -#' \item{percent_cli}{percentage of doctor’s visits with CLI (COVID-like illness) computed from medical insurance claims} -#' \item{case_rate_7d_av}{7-day average signal of number of new confirmed deaths due to COVID-19 per 100,000 population, daily} +#' \item{percent_cli}{percentage of doctor’s visits with CLI (COVID-like +#' illness) computed from medical insurance claims} +#' \item{case_rate_7d_av}{7-day average signal of number of new confirmed +#' deaths due to COVID-19 per 100,000 population, daily} #' } #' @source -#' This object contains a modified part of the \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University} as \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{republished in the COVIDcast Epidata API}. 
This data set is licensed under the terms of the -#' \href{https://creativecommons.org/licenses/by/4.0/}{Creative Commons Attribution 4.0 International license} -#' by Johns Hopkins University on behalf of its Center for Systems Science in Engineering. -#' Copyright Johns Hopkins University 2020. +#' This object contains a modified part of the +#' \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 Data Repository by +#' the Center for Systems Science and Engineering (CSSE) at Johns Hopkins +#' University} as +#' \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{republished +#' in the COVIDcast Epidata API}. This data set is licensed under the terms of +#' the \href{https://creativecommons.org/licenses/by/4.0/}{Creative Commons +#' Attribution 4.0 International license} by Johns Hopkins University on behalf +#' of its Center for Systems Science in Engineering. Copyright Johns Hopkins +#' University 2020. #' #' Modifications: -#' * \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/doctor-visits.html}{From the COVIDcast Doctor Visits API}: The signal `percent_cli` is taken directly from the API without changes. -#' * \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{From the COVIDcast Epidata API}: `case_rate_7d_av` signal was computed by Delphi from the original JHU-CSSE data by calculating moving averages of the preceding 7 days, so the signal for June 7 is the average of the underlying data for June 1 through 7, inclusive. -#' * Furthermore, the data is a subset of the full dataset, the signal names slightly altered, and formatted into a tibble. +#' * \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/doctor-visits.html}{From +#' the COVIDcast Doctor Visits API}: The signal `percent_cli` is taken +#' directly from the API without changes. +#' * \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{From +#' the COVIDcast Epidata API}: `case_rate_7d_av` signal was computed by Delphi +#' from the original JHU-CSSE data by calculating moving averages of the +#' preceding 7 days, so the signal for June 7 is the average of the underlying +#' data for June 1 through 7, inclusive. +#' * Furthermore, the data is a subset of the full dataset, the signal names +#' slightly altered, and formatted into a tibble. #' #' @export "archive_cases_dv_subset" @@ -128,11 +146,11 @@ some_package_is_being_unregistered <- function(parent_n = 0L) { #' #' @noRd delayed_assign_with_unregister_awareness <- function(x, value, - eval.env = rlang::caller_env(), - assign.env = rlang::caller_env()) { - value_quosure <- rlang::as_quosure(rlang::enexpr(value), eval.env) + eval_env = rlang::caller_env(), + assign_env = rlang::caller_env()) { + value_quosure <- rlang::as_quosure(rlang::enexpr(value), eval_env) this_env <- environment() - delayedAssign(x, eval.env = this_env, assign.env = assign.env, value = { + delayedAssign(x, eval.env = this_env, assign.env = assign_env, value = { if (some_package_is_being_unregistered()) { withCallingHandlers( # `rlang::eval_tidy(value_quosure)` is shorter and would sort of work, @@ -140,7 +158,7 @@ delayed_assign_with_unregister_awareness <- function(x, value, # we'd have with `delayedAssign`; it doesn't seem to actually evaluate # quosure's expr in the quosure's env. Using `rlang::eval_bare` instead # seems to do the trick. 
(We also could have just used a `value_expr` - # and `eval.env` together rather than introducing `value_quosure` at + # and `eval_env` together rather than introducing `value_quosure` at # all.) rlang::eval_bare(rlang::quo_get_expr(value_quosure), rlang::quo_get_env(value_quosure)), error = function(err) { @@ -193,7 +211,10 @@ delayed_assign_with_unregister_awareness <- function(x, value, # binding may have been created with the same name as the package promise, and # this binding will stick around even when the package is reloaded, and will # need to be `rm`-d to easily access the refreshed package promise. -delayed_assign_with_unregister_awareness("archive_cases_dv_subset", as_epi_archive(archive_cases_dv_subset_dt, compactify = FALSE)) +delayed_assign_with_unregister_awareness( + "archive_cases_dv_subset", + as_epi_archive(archive_cases_dv_subset_dt, compactify = FALSE) +) #' Subset of JHU daily cases from California and Florida #' @@ -210,15 +231,24 @@ delayed_assign_with_unregister_awareness("archive_cases_dv_subset", as_epi_archi #' \item{time_value}{the time value associated with each row of measurements.} #' \item{cases}{Number of new confirmed COVID-19 cases, daily} #' } -#' @source This object contains a modified part of the \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University} as \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{republished in the COVIDcast Epidata API}. This data set is licensed under the terms of the -#' \href{https://creativecommons.org/licenses/by/4.0/}{Creative Commons Attribution 4.0 International license} -#' by the Johns Hopkins University on behalf of its Center for Systems Science in Engineering. -#' Copyright Johns Hopkins University 2020. +#' @source This object contains a modified part of the +#' \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 Data Repository by +#' the Center for Systems Science and Engineering (CSSE) at Johns Hopkins +#' University} as +#' \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{republished +#' in the COVIDcast Epidata API}. This data set is licensed under the terms of +#' the \href{https://creativecommons.org/licenses/by/4.0/}{Creative Commons +#' Attribution 4.0 International license} by the Johns Hopkins University on +#' behalf of its Center for Systems Science in Engineering. Copyright Johns +#' Hopkins University 2020. #' #' Modifications: -#' * \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{From the COVIDcast Epidata API}: -#' These signals are taken directly from the JHU CSSE \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 GitHub repository} without changes. -#' * Furthermore, the data has been limited to a very small number of rows, the signal names slightly altered, and formatted into a tibble. +#' * \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{From +#' the COVIDcast Epidata API}: These signals are taken directly from the JHU +#' CSSE \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 GitHub +#' repository} without changes. +#' * Furthermore, the data has been limited to a very small number of rows, the +#' signal names slightly altered, and formatted into a tibble. 
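# (Editor's illustrative sketch, not part of this patch: the dataset
# documented above supports usage along the lines of
#   incidence_num_outlier_example %>%
#     group_by(geo_value) %>%
#     mutate(outlier_info = detect_outlr(x = time_value, y = cases))
# i.e. this package's `detect_outlr()` with its default detection method.)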
"incidence_num_outlier_example" #' Subset of JHU daily cases from counties in Massachusetts and Vermont @@ -237,12 +267,25 @@ delayed_assign_with_unregister_awareness("archive_cases_dv_subset", as_epi_archi #' \item{county_name}{the name of the county} #' \item{state_name}{the full name of the state} #' } -#' @source This object contains a modified part of the \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University} as \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{republished in the COVIDcast Epidata API}. This data set is licensed under the terms of the +#' @source This object contains a modified part of the +#' \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 Data Repository by +#' the Center for Systems Science and Engineering (CSSE) at Johns Hopkins +#' University} as +#' \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{republished +#' in the COVIDcast Epidata API}. This data set is licensed under the terms of +#' the #' \href{https://creativecommons.org/licenses/by/4.0/}{Creative Commons Attribution 4.0 International license} #' by the Johns Hopkins University on behalf of its Center for Systems Science in Engineering. #' Copyright Johns Hopkins University 2020. #' #' Modifications: -#' * \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{From the COVIDcast Epidata API}: These signals are taken directly from the JHU CSSE \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 GitHub repository} without changes. The 7-day average signals are computed by Delphi by calculating moving averages of the preceding 7 days, so the signal for June 7 is the average of the underlying data for June 1 through 7, inclusive. -#' * Furthermore, the data has been limited to a very small number of rows, the signal names slightly altered, and formatted into a tibble. +#' * \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{From +#' the COVIDcast Epidata API}: These signals are taken directly from the JHU +#' CSSE \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 GitHub +#' repository} without changes. The 7-day average signals are computed by +#' Delphi by calculating moving averages of the preceding 7 days, so the +#' signal for June 7 is the average of the underlying data for June 1 through +#' 7, inclusive. +#' * Furthermore, the data has been limited to a very small number of rows, the +#' signal names slightly altered, and formatted into a tibble. 
"jhu_csse_county_level_subset" diff --git a/R/epi_df.R b/R/epi_df.R index 0334e1d0..65acfb94 100644 --- a/R/epi_df.R +++ b/R/epi_df.R @@ -134,20 +134,20 @@ new_epi_df <- function(x = tibble::tibble(), geo_type, time_type, as_of, # If as_of is missing, then try to guess it if (missing(as_of)) { # First check the metadata for an as_of field - if ("metadata" %in% names(attributes(x)) && - "as_of" %in% names(attributes(x)$metadata)) { + if ( + "metadata" %in% names(attributes(x)) && + "as_of" %in% names(attributes(x)$metadata) + ) { as_of <- attributes(x)$metadata$as_of - } - - # Next check for as_of, issue, or version columns - else if ("as_of" %in% names(x)) { + } else if ("as_of" %in% names(x)) { + # Next check for as_of, issue, or version columns as_of <- max(x$as_of) } else if ("issue" %in% names(x)) { as_of <- max(x$issue) } else if ("version" %in% names(x)) { as_of <- max(x$version) - } # If we got here then we failed - else { + } else { + # If we got here then we failed as_of <- Sys.time() } # Use the current day-time } diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R index 9ddad684..02722c91 100644 --- a/R/grouped_epi_archive.R +++ b/R/grouped_epi_archive.R @@ -53,7 +53,10 @@ grouped_epi_archive <- public = list( initialize = function(ungrouped, vars, drop) { if (inherits(ungrouped, "grouped_epi_archive")) { - cli_abort("`ungrouped` must not already be grouped (neither automatic regrouping nor nested grouping is supported). Either use `group_by` with `.add=TRUE`, or `ungroup` first.", + cli_abort( + "`ungrouped` must not already be grouped (neither automatic regrouping + nor nested grouping is supported). + Either use `group_by` with `.add=TRUE`, or `ungroup` first.", class = "epiprocess__grouped_epi_archive__ungrouped_arg_is_already_grouped", epiprocess__ungrouped_class = class(ungrouped), epiprocess__ungrouped_groups = groups(ungrouped) @@ -262,7 +265,12 @@ grouped_epi_archive <- .data_group <- .data_group$DT } - assert(check_atomic(comp_value, any.missing = TRUE), check_data_frame(comp_value), combine = "or", .var.name = vname(comp_value)) + assert( + check_atomic(comp_value, any.missing = TRUE), + check_data_frame(comp_value), + combine = "or", + .var.name = vname(comp_value) + ) # Label every result row with the `ref_time_value` res <- list(time_value = ref_time_value) @@ -297,7 +305,11 @@ grouped_epi_archive <- x <- lapply(ref_time_values, function(ref_time_value) { # Ungrouped as-of data; `epi_df` if `all_versions` is `FALSE`, # `epi_archive` if `all_versions` is `TRUE`: - as_of_raw <- private$ungrouped$as_of(ref_time_value, min_time_value = ref_time_value - before, all_versions = all_versions) + as_of_raw <- private$ungrouped$as_of( + ref_time_value, + min_time_value = ref_time_value - before, + all_versions = all_versions + ) # Set: # * `as_of_df`, the data.frame/tibble/epi_df/etc. that we will @@ -371,6 +383,7 @@ grouped_epi_archive <- x <- tidyr::unnest(x, !!new_col, names_sep = names_sep) } + # nolint start: commented_code_linter. # if (is_epi_df(x)) { # # The analogue of `epi_df`'s `as_of` metadata for an archive is # # `$versions_end`, at least in the current absence of @@ -380,6 +393,7 @@ grouped_epi_archive <- # # derived won't always match; override: # attr(x, "metadata")[["as_of"]] <- private$ungrouped$versions_end # } + # nolint end # XXX We need to work out when we want to return an `epi_df` and how # to get appropriate keys (see #290, #223, #163). 
We'll probably diff --git a/R/growth_rate.R b/R/growth_rate.R index b584f7e3..1d6a0bb1 100644 --- a/R/growth_rate.R +++ b/R/growth_rate.R @@ -135,7 +135,10 @@ growth_rate <- function(x = seq_along(y), y, x0 = x, if (dup_rm) { o <- !duplicated(x) if (any(!o)) { - cli_warn("`x` contains duplicate values. (If being run on a column in an `epi_df`, did you group by relevant key variables?)") + cli_warn( + "`x` contains duplicate values. (If being run on a + column in an `epi_df`, did you group by relevant key variables?)" + ) } x <- x[o] y <- y[o] @@ -176,10 +179,8 @@ growth_rate <- function(x = seq_along(y), y, x0 = x, } else { return((b / a - 1) / hh) } - } - - # Linear regression - else { + } else { + # Linear regression xm <- xx - mean(xx) ym <- yy - mean(yy) b <- sum(xm * ym) / sum(xm^2) @@ -216,10 +217,8 @@ growth_rate <- function(x = seq_along(y), y, x0 = x, } else { return(d0 / f0) } - } - - # Trend filtering - else { + } else { + # Trend filtering ord <- params$ord maxsteps <- params$maxsteps cv <- params$cv diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index 43b816bc..6c438d38 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -224,22 +224,22 @@ epix_merge <- function(x, y, ), class = "epiprocess__epix_merge_unresolved_sync") } else { new_versions_end <- x$versions_end - x_DT <- x$DT - y_DT <- y$DT + x_dt <- x$DT + y_dt <- y$DT } } else if (sync %in% c("na", "locf")) { new_versions_end <- max(x$versions_end, y$versions_end) - x_DT <- epix_fill_through_version(x, new_versions_end, sync)$DT - y_DT <- epix_fill_through_version(y, new_versions_end, sync)$DT + x_dt <- epix_fill_through_version(x, new_versions_end, sync)$DT + y_dt <- epix_fill_through_version(y, new_versions_end, sync)$DT } else if (sync == "truncate") { new_versions_end <- min(x$versions_end, y$versions_end) - x_DT <- x$DT[x[["DT"]][["version"]] <= new_versions_end, names(x$DT), with = FALSE] - y_DT <- y$DT[y[["DT"]][["version"]] <= new_versions_end, names(y$DT), with = FALSE] + x_dt <- x$DT[x[["DT"]][["version"]] <= new_versions_end, names(x$DT), with = FALSE] + y_dt <- y$DT[y[["DT"]][["version"]] <= new_versions_end, names(y$DT), with = FALSE] } else { cli_abort("unimplemented") } - # key(x_DT) should be the same as key(x$DT) and key(y_DT) should be the same + # key(x_dt) should be the same as key(x$DT) and key(y_dt) should be the same # as key(y$DT). Below, we only use {x,y}_DT in the code (making it easier to # split the code into separate functions if we wish), but still refer to # {x,y}$DT in the error messages (further relying on this assumption). @@ -248,24 +248,24 @@ epix_merge <- function(x, y, # have a bug in the preprocessing, a weird/invalid archive as input, and/or a # data.table version with different semantics (which may break other parts of # our code). - x_DT_key_as_expected <- identical(key(x$DT), key(x_DT)) - y_DT_key_as_expected <- identical(key(y$DT), key(y_DT)) - if (!x_DT_key_as_expected || !y_DT_key_as_expected) { + x_dt_key_as_expected <- identical(key(x$DT), key(x_dt)) + y_dt_key_as_expected <- identical(key(y$DT), key(y_dt)) + if (!x_dt_key_as_expected || !y_dt_key_as_expected) { cli_warn(" `epiprocess` internal warning (please report): pre-processing for epix_merge unexpectedly resulted in an intermediate data table (or tables) with a different key than the corresponding input archive. Manually setting intermediate data table keys to the expected values. 
", internal = TRUE) - setkeyv(x_DT, key(x$DT)) - setkeyv(y_DT, key(y$DT)) + setkeyv(x_dt, key(x$DT)) + setkeyv(y_dt, key(y$DT)) } # Without some sort of annotations of what various columns represent, we can't # do something that makes sense when merging archives with mismatched keys. # E.g., even if we assume extra keys represent demographic breakdowns, a # sensible default treatment of count-type and rate-type value columns would # differ. - if (!identical(sort(key(x_DT)), sort(key(y_DT)))) { + if (!identical(sort(key(x_dt)), sort(key(y_dt)))) { cli_abort(" The archives must have the same set of key column names; if the key columns represent the same things, just with different @@ -281,8 +281,8 @@ epix_merge <- function(x, y, # # non-`by` cols = "value"-ish cols, and are looked up with last # version carried forward via rolling joins - by <- key(x_DT) # = some perm of key(y_DT) - if (!all(c("geo_value", "time_value", "version") %in% key(x_DT))) { + by <- key(x_dt) # = some perm of key(y_dt) + if (!all(c("geo_value", "time_value", "version") %in% key(x_dt))) { cli_abort('Invalid `by`; `by` is currently set to the common `key` of the two archives, and is expected to contain "geo_value", "time_value", and "version".', @@ -296,8 +296,8 @@ epix_merge <- function(x, y, class = "epiprocess__epi_archive_must_have_version_at_end_of_key" ) } - x_nonby_colnames <- setdiff(names(x_DT), by) - y_nonby_colnames <- setdiff(names(y_DT), by) + x_nonby_colnames <- setdiff(names(x_dt), by) + y_nonby_colnames <- setdiff(names(y_dt), by) if (length(intersect(x_nonby_colnames, y_nonby_colnames)) != 0L) { cli_abort(" `x` and `y` DTs have overlapping non-by column names; @@ -306,7 +306,7 @@ epix_merge <- function(x, y, incorporated into the key, and other columns should be renamed. ", class = "epiprocess__epix_merge_x_y_must_not_have_overlapping_nonby_colnames") } - x_by_vals <- x_DT[, by, with = FALSE] + x_by_vals <- x_dt[, by, with = FALSE] if (anyDuplicated(x_by_vals) != 0L) { cli_abort(" The `by` columns must uniquely determine rows of `x$DT`; @@ -315,7 +315,7 @@ epix_merge <- function(x, y, to `x`'s key (to get a unique key). ", class = "epiprocess__epix_merge_by_cols_must_act_as_unique_key") } - y_by_vals <- y_DT[, by, with = FALSE] + y_by_vals <- y_dt[, by, with = FALSE] if (anyDuplicated(y_by_vals) != 0L) { cli_abort(" The `by` columns must uniquely determine rows of `y$DT`; @@ -324,7 +324,7 @@ epix_merge <- function(x, y, to `y`'s key (to get a unique key). ", class = "epiprocess__epix_merge_by_cols_must_act_as_unique_key") } - result_DT <- merge(x_by_vals, y_by_vals, + result_dt <- merge(x_by_vals, y_by_vals, by = by, # We must have `all=TRUE` or we may skip updates # from x and/or y and corrupt the history @@ -337,8 +337,8 @@ epix_merge <- function(x, y, allow.cartesian = TRUE ) set( - result_DT, , x_nonby_colnames, - x_DT[result_DT[, by, with = FALSE], x_nonby_colnames, + result_dt, , x_nonby_colnames, + x_dt[result_dt[, by, with = FALSE], x_nonby_colnames, with = FALSE, # It's good practice to specify `on`, and we must # explicitly specify `on` if there's a potential key vs. @@ -356,8 +356,8 @@ epix_merge <- function(x, y, ] ) set( - result_DT, , y_nonby_colnames, - y_DT[result_DT[, by, with = FALSE], y_nonby_colnames, + result_dt, , y_nonby_colnames, + y_dt[result_dt[, by, with = FALSE], y_nonby_colnames, with = FALSE, on = by, roll = TRUE, @@ -367,13 +367,13 @@ epix_merge <- function(x, y, ) # The key could be unset in case of a key vs. by order mismatch as # noted above. 
Ensure that we keep it:
-  setkeyv(result_DT, by)
+  setkeyv(result_dt, by)

   return(as_epi_archive(
-    result_DT[], # clear data.table internal invisibility flag if set
+    result_dt[], # clear data.table internal invisibility flag if set
     geo_type = x$geo_type,
     time_type = x$time_type,
-    other_keys = setdiff(key(result_DT), c("geo_value", "time_value", "version")),
+    other_keys = setdiff(key(result_dt), c("geo_value", "time_value", "version")),
     additional_metadata = result_additional_metadata,
     # It'd probably be better to pre-compactify before the merge, and might be
     # guaranteed not to be necessary to compactify the merge result if the
@@ -419,7 +419,7 @@ destructure_col_modify_recorder_df <- function(col_modify_recorder_df) {
   list(
     unchanged_parent_df = col_modify_recorder_df %>%
       `attr<-`("epiprocess::col_modify_recorder_df::cols", NULL) %>%
-      `class<-`(setdiff(class(.), "col_modify_recorder_df")),
+      `class<-`(setdiff(class(.), "col_modify_recorder_df")), # nolint: object_usage_linter
     cols = attr(col_modify_recorder_df,
       "epiprocess::col_modify_recorder_df::cols",
       exact = TRUE
@@ -510,11 +510,11 @@ epix_detailed_restricted_mutate <- function(.data, ...) {
   # sorting (including potential extra copies) or sortedness checking, then
   # `setDT` (rather than `as.data.table`, in order to prevent column copying
   # to establish ownership according to `data.table`'s memory model).
-  out_DT <- dplyr::dplyr_col_modify(in_tbl, col_modify_cols) %>%
+  out_dt <- dplyr::dplyr_col_modify(in_tbl, col_modify_cols) %>%
     data.table::setattr("sorted", data.table::key(.data$DT)) %>%
     data.table::setDT(key = key(.data$DT))
   out_archive <- .data$clone()
-  out_archive$DT <- out_DT
+  out_archive$DT <- out_dt
   request_names <- names(col_modify_cols)
   return(list(
     archive = out_archive,
@@ -668,11 +668,19 @@ group_by.epi_archive <- function(.data, ..., .add = FALSE, .drop = dplyr::group_
   grouping_col_is_factor <- purrr::map_lgl(grouping_cols, is.factor)
   # ^ Use `as.list` to try to avoid any possibility of a deep copy.
if (!any(grouping_col_is_factor)) { - cli_warn("`.drop=FALSE` but there are no factor grouping columns; did you mean to convert one of the columns to a factor beforehand?", + cli_warn( + "`.drop=FALSE` but there are no factor grouping columns; + did you mean to convert one of the columns to a factor beforehand?", class = "epiprocess__group_by_epi_archive__drop_FALSE_no_factors" ) } else if (any(diff(grouping_col_is_factor) == -1L)) { - cli_warn("`.drop=FALSE` but there are one or more non-factor grouping columns listed after a factor grouping column; this may produce groups with `NA`s for these columns; see https://github.com/tidyverse/dplyr/issues/5369#issuecomment-683762553; depending on how you want completion to work, you might instead want to convert all grouping columns to factors beforehand, specify the non-factor grouping columns first, or use `.drop=TRUE` and add a call to `tidyr::complete`.", + cli_warn( + "`.drop=FALSE` but there are one or more non-factor grouping columns listed + after a factor grouping column; this may produce groups with `NA`s for these columns; + see https://github.com/tidyverse/dplyr/issues/5369#issuecomment-683762553; + depending on how you want completion to work, you might instead want to convert + all grouping columns to factors beforehand, specify the non-factor grouping columns + first, or use `.drop=TRUE` and add a call to `tidyr::complete`.", class = "epiprocess__group_by_epi_archive__drop_FALSE_nonfactor_after_factor" ) } diff --git a/R/methods-epi_df.R b/R/methods-epi_df.R index 3636d966..22ea2928 100644 --- a/R/methods-epi_df.R +++ b/R/methods-epi_df.R @@ -80,9 +80,12 @@ summary.epi_df <- function(object, ...) { cat(sprintf("* %-27s = %s\n", "max time value", max(object$time_value))) cat(sprintf( "* %-27s = %i\n", "average rows per time value", - as.integer(object %>% dplyr::group_by(.data$time_value) %>% - dplyr::summarize(num = dplyr::n()) %>% - dplyr::summarize(mean(.data$num))) + as.integer( + object %>% + dplyr::group_by(.data$time_value) %>% + dplyr::summarize(num = dplyr::n()) %>% + dplyr::summarize(mean(.data$num)) + ) )) } diff --git a/R/outliers.R b/R/outliers.R index a8051dbd..68a656a7 100644 --- a/R/outliers.R +++ b/R/outliers.R @@ -97,7 +97,10 @@ detect_outlr <- function(x = seq_along(y), y, # Validate that x contains all distinct values if (any(duplicated(x))) { - cli_abort("`x` cannot contain duplicate values. (If being run on a column in an `epi_df`, did you group by relevant key variables?)") + cli_abort( + "`x` cannot contain duplicate values. 
(If being run on a
+      column in an `epi_df`, did you group by relevant key variables?)"
+    )
   }

   # Run all outlier detection methods
@@ -124,7 +127,9 @@ detect_outlr <- function(x = seq_along(y), y,
   if (combiner != "none") {
     if (combiner == "mean") {
       combine_fun <- mean
-    } else if (combiner == "median") combine_fun <- median
+    } else if (combiner == "median") {
+      combine_fun <- median
+    }

     for (target in c("lower", "upper", "replacement")) {
       results[[paste0("combined_", target)]] <- apply(
@@ -312,21 +317,21 @@ detect_outlr_stl <- function(x = seq_along(y), y,
     fabletools::model(feasts::STL(stl_formula, robust = TRUE)) %>%
     generics::components() %>%
     tibble::as_tibble() %>%
-    dplyr::select(trend:remainder) %>%
+    dplyr::select(.data$trend:.data$remainder) %>%
     #  dplyr::rename_with(~"seasonal", tidyselect::starts_with("season")) %>%
-    dplyr::rename(resid = remainder)
+    dplyr::rename(resid = .data$remainder)

   # Allocate the seasonal term from STL to either fitted or resid
   if (!is.null(seasonal_period)) {
     stl_components <- stl_components %>%
       dplyr::mutate(
-        fitted = trend + seasonal
+        fitted = .data$trend + .data$seasonal
       )
   } else {
     stl_components <- stl_components %>%
       dplyr::mutate(
-        fitted = trend,
-        resid = seasonal + resid
+        fitted = .data$trend,
+        resid = .data$seasonal + .data$resid
       )
   }

@@ -368,7 +373,7 @@ detect_outlr_stl <- function(x = seq_along(y), y,
 roll_iqr <- function(z, n, detection_multiplier, min_radius,
                      replacement_multiplier, min_lower) {
   if (typeof(z$y) == "integer") {
-    as_type <- as.integer
+    as_type <- as.integer # nolint: object_usage_linter
   } else {
     as_type <- as.numeric
   }
@@ -386,6 +391,6 @@ roll_iqr <- function(z, n, detection_multiplier, min_radius,
       TRUE ~ y
     )
   ) %>%
-    dplyr::select(lower, upper, replacement) %>%
+    dplyr::select(.data$lower, .data$upper, .data$replacement) %>%
     tibble::as_tibble()
 }
diff --git a/R/slide.R b/R/slide.R
index 9adabf9e..253a6457 100644
--- a/R/slide.R
+++ b/R/slide.R
@@ -221,7 +221,7 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values,
   }

   # Arrange by increasing time_value
-  x <- arrange(x, time_value)
+  x <- arrange(x, .data$time_value)

   # Now set up starts and stops for sliding/hopping
   starts <- ref_time_values - before
@@ -271,9 +271,14 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values,
       dplyr::count(.data$time_value) %>%
       `[[`("n")

-    if (!all(purrr::map_lgl(slide_values_list, is.atomic)) &&
-      !all(purrr::map_lgl(slide_values_list, is.data.frame))) {
-      cli_abort("The slide computations must return always atomic vectors or data frames (and not a mix of these two structures).")
+    if (
+      !all(purrr::map_lgl(slide_values_list, is.atomic)) &&
+        !all(purrr::map_lgl(slide_values_list, is.data.frame))
+    ) {
+      cli_abort(
+        "The slide computations must always return either atomic vectors
+        or data frames (and not a mix of these two structures)."
+      )
     }

     # Unlist if appropriate:
@@ -284,8 +289,10 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values,
       vctrs::list_unchop(slide_values_list)
     }

-    if (all(purrr::map_int(slide_values_list, vctrs::vec_size) == 1L) &&
-      length(slide_values_list) != 0L) {
+    if (
+      all(purrr::map_int(slide_values_list, vctrs::vec_size) == 1L) &&
+        length(slide_values_list) != 0L
+    ) {
       # Recycle to make size stable (one slide value per ref time value).
       # (Length-0 case also could be handled here, but causes difficulties;
       # leave it to the next branch, where it also belongs.)
@@ -299,7 +306,10 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, )) } if (vctrs::vec_size(slide_values) != num_ref_rows) { - cli_abort("The slide computations must either (a) output a single element/row each, or (b) one element/row per appearance of the reference time value in the local window.") + cli_abort( + "The slide computations must either (a) output a single element/row each, or + (b) one element/row per appearance of the reference time value in the local window." + ) } } diff --git a/R/utils.R b/R/utils.R index 2ab52328..098b9966 100644 --- a/R/utils.R +++ b/R/utils.R @@ -109,19 +109,19 @@ assert_sufficient_f_args <- function(f, ...) { args_names <- names(args) # Remove named arguments forwarded from `epi[x]_slide`'s `...`: forwarded_dots_names <- names(rlang::call_match(dots_expand = FALSE)[["..."]]) - args_matched_in_dots <- - # positional calling args will skip over args matched by named calling args - args_names %in% forwarded_dots_names & - # extreme edge case: `epi[x]_slide(, dot = 1, `...` = 2)` - args_names != "..." + # positional calling args will skip over args matched by named calling args + # extreme edge case: `epi[x]_slide(, dot = 1, `...` = 2)` + args_matched_in_dots <- args_names %in% forwarded_dots_names & args_names != "..." + remaining_args <- args[!args_matched_in_dots] remaining_args_names <- names(remaining_args) # note that this doesn't include unnamed args forwarded through `...`. dots_i <- which(remaining_args_names == "...") # integer(0) if no match n_f_args_before_dots <- dots_i - 1L - if (length(dots_i) != 0L) { # `f` has a dots "arg" + if (length(dots_i) != 0L) { + # `f` has a dots "arg" # Keep all arg names before `...` - mandatory_args_mapped_names <- remaining_args_names[seq_len(n_f_args_before_dots)] + mandatory_args_mapped_names <- remaining_args_names[seq_len(n_f_args_before_dots)] # nolint: object_usage_linter if (n_f_args_before_dots < n_mandatory_f_args) { mandatory_f_args_in_f_dots <- @@ -170,10 +170,8 @@ assert_sufficient_f_args <- function(f, ...) 
{ default_check_mandatory_args_labels <- mandatory_f_args_labels[seq_len(n_remaining_args_for_default_check)] # ^ excludes any mandatory args absorbed by f's `...`'s: - mandatory_args_replacing_defaults <- - default_check_mandatory_args_labels[has_default_replaced_by_mandatory] - args_with_default_replaced_by_mandatory <- - rlang::syms(default_check_args_names[has_default_replaced_by_mandatory]) + mandatory_args_replacing_defaults <- default_check_mandatory_args_labels[has_default_replaced_by_mandatory] # nolint: object_usage_linter + args_with_default_replaced_by_mandatory <- rlang::syms(default_check_args_names[has_default_replaced_by_mandatory]) # nolint: object_usage_linter cli::cli_abort( "`epi[x]_slide` would pass the {mandatory_args_replacing_defaults} to `f`'s {args_with_default_replaced_by_mandatory} argument{?s}, which @@ -390,13 +388,11 @@ guess_geo_type <- function(geo_value) { ) if (all(geo_value %in% state_values)) { return("state") - } # Else if all geo values are 2 letters, then use "nation" - else if (all(grepl("[a-z]{2}", geo_value)) & - !any(grepl("[a-z]{3}", geo_value))) { + } else if (all(grepl("[a-z]{2}", geo_value)) && !any(grepl("[a-z]{3}", geo_value))) { + # Else if all geo values are 2 letters, then use "nation" return("nation") - } # Else if all geo values are 5 numbers, then use "county" - else if (all(grepl("[0-9]{5}", geo_value)) & - !any(grepl("[0-9]{6}", geo_value))) { + } else if (all(grepl("[0-9]{5}", geo_value)) && !any(grepl("[0-9]{6}", geo_value))) { + # Else if all geo values are 5 numbers, then use "county" return("county") } } else if (is.numeric(geo_value)) { @@ -442,8 +438,8 @@ guess_time_type <- function(time_value) { # Now, if a POSIXct class, then use "day-time" if (inherits(time_value, "POSIXct")) { return("day-time") - } # Else, if a Date class, then use "week" or "day" depending on gaps - else if (inherits(time_value, "Date")) { + } else if (inherits(time_value, "Date")) { + # Else, if a Date class, then use "week" or "day" depending on gaps # Convert to numeric so we can use the modulo operator. unique_time_gaps <- as.numeric(diff(sort(unique(time_value)))) # We need to check the modulus of `unique_time_gaps` in case there are @@ -451,10 +447,8 @@ guess_time_type <- function(time_value) { # be larger than 7 days. If we just check if `diffs == 7`, it will fail # unless the weekly date sequence is already complete. 
    return(ifelse(all(unique_time_gaps %% 7 == 0), "week", "day"))
-  }
-
-  # Else, check whether it's one of the tsibble classes
-  else if (inherits(time_value, "yearweek")) {
+  } else if (inherits(time_value, "yearweek")) {
+    # Else, check whether it's one of the tsibble classes
    return("yearweek")
  } else if (inherits(time_value, "yearmonth")) {
    return("yearmonth")
@@ -463,9 +457,11 @@ guess_time_type <- function(time_value) {
  }

  # Else, if it's an integer that's at least 1582, then use "year"
-  if (is.numeric(time_value) &&
-    all(time_value == as.integer(time_value)) &&
-    all(time_value >= 1582)) {
+  if (
+    is.numeric(time_value) &&
+      all(time_value == as.integer(time_value)) &&
+      all(time_value >= 1582)
+  ) {
    return("year")
  }

@@ -561,8 +557,7 @@ deprecated_quo_is_present <- function(quo) {
    FALSE
  } else {
    quo_expr <- rlang::get_expr(quo)
-    if (identical(quo_expr, rlang::expr(deprecated())) ||
-      identical(quo_expr, rlang::expr(lifecycle::deprecated()))) {
+    if (identical(quo_expr, rlang::expr(deprecated())) || identical(quo_expr, rlang::expr(lifecycle::deprecated()))) { # nolint: object_usage_linter
      FALSE
    } else {
      TRUE
@@ -617,7 +612,10 @@ gcd2num <- function(a, b, rrtol = 1e-6, pqlim = 1e6, irtol = 1e-6) {
  assert_numeric(pqlim, len = 1L, lower = 0)
  assert_numeric(irtol, len = 1L, lower = 0)
  if (is.na(a) || is.na(b) || a == 0 || b == 0 || abs(a / b) >= pqlim || abs(b / a) >= pqlim) {
-    cli_abort("`a` and/or `b` is either `NA` or exactly zero, or one is so much smaller than the other that it looks like it's supposed to be zero; see `pqlim` setting.")
+    cli_abort(
+      "`a` and/or `b` is either `NA` or exactly zero, or one is so much
+      smaller than the other that it looks like it's supposed to be zero; see `pqlim` setting."
+    )
  }
  iatol <- irtol * max(a, b)
  a_curr <- a
@@ -625,7 +623,10 @@ gcd2num <- function(a, b, rrtol = 1e-6, pqlim = 1e6, irtol = 1e-6) {
  b_curr <- b
  while (TRUE) {
    # `b_curr` is the candidate GCD / iterand; check first if it seems too small:
    if (abs(b_curr) <= iatol) {
-      cli_abort("No GCD found; remaining potential Gads are all too small relative to one/both of the original inputs; see `irtol` setting.")
+      cli_abort(
+        "No GCD found; remaining potential GCDs are all too small relative
+        to one/both of the original inputs; see `irtol` setting."
+      )
    }
    remainder <- a_curr - round(a_curr / b_curr) * b_curr
    if (abs(remainder / b_curr) <= rrtol) {
@@ -653,7 +654,10 @@ gcd_num <- function(dividends, ..., rrtol = 1e-6, pqlim = 1e6, irtol = 1e-6) {
    cli_abort("`dividends` must satisfy `is.numeric`, and have `length` > 0")
  }
  if (rlang::dots_n(...) != 0L) {
-    cli_abort("`...` should be empty; all dividends should go in a single `dividends` vector, and all tolerance&limit settings should be passed by name.")
+    cli_abort(
+      "`...` should be empty; all dividends should go in a single `dividends`
+      vector, and all tolerance and limit settings should be passed by name."
+    )
  }
  # We expect a bunch of duplicate `dividends` for some applications.
  # De-duplicate to reduce work. Sort by absolute value to attempt to reduce
diff --git a/man/archive_cases_dv_subset.Rd b/man/archive_cases_dv_subset.Rd
index 4b19e58c..bd6bc876 100644
--- a/man/archive_cases_dv_subset.Rd
+++ b/man/archive_cases_dv_subset.Rd
@@ -10,21 +10,36 @@
An \code{epi_archive} data format.
The data table DT has 129,638 rows and 5 columns:
\describe{
\item{geo_value}{the geographic value associated with each row of measurements.}
\item{time_value}{the time value associated with each row of measurements.}
\item{version}{the time value specifying the version for each row of measurements. }
-\item{percent_cli}{percentage of doctor’s visits with CLI (COVID-like illness) computed from medical insurance claims}
-\item{case_rate_7d_av}{7-day average signal of number of new confirmed deaths due to COVID-19 per 100,000 population, daily}
+\item{percent_cli}{percentage of doctor’s visits with CLI (COVID-like
+illness) computed from medical insurance claims}
+\item{case_rate_7d_av}{7-day average signal of number of new confirmed
+COVID-19 cases per 100,000 population, daily}
}
}
\source{
-This object contains a modified part of the \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University} as \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{republished in the COVIDcast Epidata API}. This data set is licensed under the terms of the
-\href{https://creativecommons.org/licenses/by/4.0/}{Creative Commons Attribution 4.0 International license}
-by Johns Hopkins University on behalf of its Center for Systems Science in Engineering.
-Copyright Johns Hopkins University 2020.
+This object contains a modified part of the
+\href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 Data Repository by
+the Center for Systems Science and Engineering (CSSE) at Johns Hopkins
+University} as
+\href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{republished
+in the COVIDcast Epidata API}. This data set is licensed under the terms of
+the \href{https://creativecommons.org/licenses/by/4.0/}{Creative Commons
+Attribution 4.0 International license} by Johns Hopkins University on behalf
+of its Center for Systems Science and Engineering. Copyright Johns Hopkins
+University 2020.

Modifications:
\itemize{
-\item \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/doctor-visits.html}{From the COVIDcast Doctor Visits API}: The signal \code{percent_cli} is taken directly from the API without changes.
-\item \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{From the COVIDcast Epidata API}: \code{case_rate_7d_av} signal was computed by Delphi from the original JHU-CSSE data by calculating moving averages of the preceding 7 days, so the signal for June 7 is the average of the underlying data for June 1 through 7, inclusive.
-\item Furthermore, the data is a subset of the full dataset, the signal names slightly altered, and formatted into a tibble.
+\item \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/doctor-visits.html}{From
+the COVIDcast Doctor Visits API}: The signal \code{percent_cli} is taken
+directly from the API without changes.
+\item \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{From
+the COVIDcast Epidata API}: \code{case_rate_7d_av} signal was computed by Delphi
+from the original JHU-CSSE data by calculating moving averages of the
+preceding 7 days, so the signal for June 7 is the average of the underlying
+data for June 1 through 7, inclusive.
+\item Furthermore, the data is a subset of the full dataset, the signal names
+slightly altered, and formatted into a tibble.
}
}
\usage{
diff --git a/man/incidence_num_outlier_example.Rd b/man/incidence_num_outlier_example.Rd
index 90275099..a56c5d0c 100644
--- a/man/incidence_num_outlier_example.Rd
+++ b/man/incidence_num_outlier_example.Rd
@@ -13,16 +13,25 @@ A tibble with 730 rows and 3 variables:
}
}
\source{
-This object contains a modified part of the \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University} as \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{republished in the COVIDcast Epidata API}. This data set is licensed under the terms of the
-\href{https://creativecommons.org/licenses/by/4.0/}{Creative Commons Attribution 4.0 International license}
-by the Johns Hopkins University on behalf of its Center for Systems Science in Engineering.
-Copyright Johns Hopkins University 2020.
+This object contains a modified part of the
+\href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 Data Repository by
+the Center for Systems Science and Engineering (CSSE) at Johns Hopkins
+University} as
+\href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{republished
+in the COVIDcast Epidata API}. This data set is licensed under the terms of
+the \href{https://creativecommons.org/licenses/by/4.0/}{Creative Commons
+Attribution 4.0 International license} by the Johns Hopkins University on
+behalf of its Center for Systems Science and Engineering. Copyright Johns
+Hopkins University 2020.

Modifications:
\itemize{
-\item \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{From the COVIDcast Epidata API}:
-These signals are taken directly from the JHU CSSE \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 GitHub repository} without changes.
-\item Furthermore, the data has been limited to a very small number of rows, the signal names slightly altered, and formatted into a tibble.
+\item \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{From
+the COVIDcast Epidata API}: These signals are taken directly from the JHU
+CSSE \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 GitHub
+repository} without changes.
+\item Furthermore, the data has been limited to a very small number of rows, the
+signal names slightly altered, and formatted into a tibble.
}
}
\usage{
diff --git a/man/jhu_csse_county_level_subset.Rd b/man/jhu_csse_county_level_subset.Rd
index dfe8ef8a..a8b20fd1 100644
--- a/man/jhu_csse_county_level_subset.Rd
+++ b/man/jhu_csse_county_level_subset.Rd
@@ -15,15 +15,28 @@ A tibble with 16,212 rows and 5 variables:
}
}
\source{
-This object contains a modified part of the \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University} as \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{republished in the COVIDcast Epidata API}. This data set is licensed under the terms of the
+This object contains a modified part of the
+\href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 Data Repository by
+the Center for Systems Science and Engineering (CSSE) at Johns Hopkins
+University} as
+\href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{republished
+in the COVIDcast Epidata API}.
This data set is licensed under the terms of
+the
\href{https://creativecommons.org/licenses/by/4.0/}{Creative Commons Attribution 4.0 International license}
by the Johns Hopkins University on behalf of its Center for Systems Science in Engineering.
Copyright Johns Hopkins University 2020.

Modifications:
\itemize{
-\item \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{From the COVIDcast Epidata API}: These signals are taken directly from the JHU CSSE \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 GitHub repository} without changes. The 7-day average signals are computed by Delphi by calculating moving averages of the preceding 7 days, so the signal for June 7 is the average of the underlying data for June 1 through 7, inclusive.
-\item Furthermore, the data has been limited to a very small number of rows, the signal names slightly altered, and formatted into a tibble.
+\item \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{From
+the COVIDcast Epidata API}: These signals are taken directly from the JHU
+CSSE \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 GitHub
+repository} without changes. The 7-day average signals are computed by
+Delphi by calculating moving averages of the preceding 7 days, so the
+signal for June 7 is the average of the underlying data for June 1 through
+7, inclusive.
+\item Furthermore, the data has been limited to a very small number of rows, the
+signal names slightly altered, and formatted into a tibble.
}
}
\usage{
diff --git a/man/jhu_csse_daily_subset.Rd b/man/jhu_csse_daily_subset.Rd
index 6d4913f0..ed61ceb6 100644
--- a/man/jhu_csse_daily_subset.Rd
+++ b/man/jhu_csse_daily_subset.Rd
@@ -21,12 +21,15 @@ COVID-19 cases, daily}
}
\source{
This object contains a modified part of the
-\href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University}
-as \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{republished in the COVIDcast Epidata API}.
-This data set is licensed under the terms of the
-\href{https://creativecommons.org/licenses/by/4.0/}{Creative Commons Attribution 4.0 International license}
-by the Johns Hopkins University on behalf of its Center for Systems Science
-in Engineering. Copyright Johns Hopkins University 2020.
+\href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 Data Repository
+by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins
+University} as
+\href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{republished
+in the COVIDcast Epidata API}. This data set is licensed under the terms of
+the \href{https://creativecommons.org/licenses/by/4.0/}{Creative Commons
+Attribution 4.0 International license} by the Johns Hopkins University on
+behalf of its Center for Systems Science and Engineering. Copyright Johns
+Hopkins University 2020.
Modifications: \itemize{ diff --git a/tests/testthat/test-archive-version-bounds.R b/tests/testthat/test-archive-version-bounds.R index 720b33de..47506152 100644 --- a/tests/testthat/test-archive-version-bounds.R +++ b/tests/testthat/test-archive-version-bounds.R @@ -41,14 +41,23 @@ test_that("`validate_version_bound` validate and class checks together allow and x_datetime <- tibble::tibble(version = my_datetime) # Custom classes matter (test vectors and non-vctrs-specialized lists separately): my_version_bound1 <- `class<-`(24, "c1") - expect_error(validate_version_bound(my_version_bound1, x_int, na_ok = FALSE), regexp = "must have the same classes as") + expect_error( + validate_version_bound(my_version_bound1, x_int, na_ok = FALSE), + regexp = "must have the same classes as" + ) my_version_bound2 <- `class<-`(list(12), c("c2a", "c2b", "c2c")) expect_error(validate_version_bound(my_version_bound2, x_list, na_ok = FALSE), regexp = "must have the same classes") # Want no error matching date to date or datetime to datetime, but no interop due to tz issues: validate_version_bound(my_date, x_date, version_bound_arg = "vb") validate_version_bound(my_datetime, x_datetime, version_bound_arg = "vb") - expect_error(validate_version_bound(my_datetime, x_date, na_ok = TRUE, version_bound_arg = "vb"), regexp = "must have the same classes") - expect_error(validate_version_bound(my_date, x_datetime, na_ok = TRUE, version_bound_arg = "vb"), regexp = "must have the same classes") + expect_error( + validate_version_bound(my_datetime, x_date, na_ok = TRUE, version_bound_arg = "vb"), + regexp = "must have the same classes" + ) + expect_error( + validate_version_bound(my_date, x_datetime, na_ok = TRUE, version_bound_arg = "vb"), + regexp = "must have the same classes" + ) # Bad: expect_error(validate_version_bound(3.5, x_int, TRUE, "vb"), regexp = "must have the same classes") expect_error(validate_version_bound(.Machine$integer.max, x_dbl, TRUE, "vb"), regexp = "must have the same classes") diff --git a/tests/testthat/test-data.R b/tests/testthat/test-data.R index bd9002a3..885f0013 100644 --- a/tests/testthat/test-data.R +++ b/tests/testthat/test-data.R @@ -59,7 +59,7 @@ test_that("`delayed_assign_with_unregister_awareness` injection support works", my_exprs <- rlang::exprs(a = b + c, d = e) delayed_assign_with_unregister_awareness( "good2", list(!!!my_exprs), - eval.env = rlang::new_environment(list(b = 2L, c = 3L, e = 4L), rlang::base_env()) + eval_env = rlang::new_environment(list(b = 2L, c = 3L, e = 4L), rlang::base_env()) ) force(good2) expect_identical(good2, list(a = 5L, d = 4L)) diff --git a/tests/testthat/test-epi_df.R b/tests/testthat/test-epi_df.R index 38257282..8cfb4408 100644 --- a/tests/testthat/test-epi_df.R +++ b/tests/testthat/test-epi_df.R @@ -50,9 +50,7 @@ test_that("as_epi_df errors when additional_metadata is not a list", { tib <- tibble::tibble( x = 1:10, y = 1:10, - time_value = rep(seq(as.Date("2020-01-01"), - by = 1, length.out = 5 - ), times = 2), + time_value = rep(seq(as.Date("2020-01-01"), by = 1, length.out = 5), times = 2), geo_value = rep(c("ca", "hi"), each = 5) ) epi_tib <- epiprocess::new_epi_df(tib) @@ -78,13 +76,9 @@ test_that("grouped epi_df drops type when dropping keys", { test_that("grouped epi_df handles extra keys correctly", { tib <- tibble::tibble( x = 1:10, y = 1:10, - time_value = rep(seq(as.Date("2020-01-01"), - by = 1, length.out = 5 - ), times = 2), + time_value = rep(seq(as.Date("2020-01-01"), by = 1, length.out = 5), times = 2), geo_value = rep(c("ca", 
"hi"), each = 5), - extra_key = rep(seq(as.Date("2020-01-01"), - by = 1, length.out = 5 - ), times = 2) + extra_key = rep(seq(as.Date("2020-01-01"), by = 1, length.out = 5), times = 2) ) epi_tib <- epiprocess::new_epi_df(tib, additional_metadata = list(other_keys = "extra_key") diff --git a/tests/testthat/test-epi_slide.R b/tests/testthat/test-epi_slide.R index 588ad933..163bf010 100644 --- a/tests/testthat/test-epi_slide.R +++ b/tests/testthat/test-epi_slide.R @@ -103,48 +103,52 @@ test_that("`ref_time_values` + `before` + `after` that result in no slide data, ) # beyond the last, no data in window }) -test_that("`ref_time_values` + `before` + `after` that have some slide data, but generate the error due to ref. time being out of time range (would also happen if they were in between `time_value`s)", { - expect_error( - epi_slide(grouped, f, before = 0L, after = 2L, ref_time_values = d), - "`ref_time_values` must be a unique subset of the time values in `x`." - ) # before the first, but we'd expect there to be data in the window - expect_error( - epi_slide(grouped, f, before = 2L, ref_time_values = d + 201L), - "`ref_time_values` must be a unique subset of the time values in `x`." - ) # beyond the last, but still with data in window -}) +test_that( + "`ref_time_values` + `before` + `after` that have some slide data, but + generate the error due to ref. time being out of time range (would also + happen if they were in between `time_value`s)", + { + expect_error( + epi_slide(grouped, f, before = 0L, after = 2L, ref_time_values = d), + "`ref_time_values` must be a unique subset of the time values in `x`." + ) # before the first, but we'd expect there to be data in the window + expect_error( + epi_slide(grouped, f, before = 2L, ref_time_values = d + 201L), + "`ref_time_values` must be a unique subset of the time values in `x`." 
+    ) # beyond the last, but still with data in window
+  }
+)

 ## --- These cases generate warnings (or not): ---

 test_that("Warn user against having a blank `before`", {
-  expect_warning(epi_slide(grouped, f,
-    after = 1L,
-    ref_time_values = d + 1L
-  ), NA)
-  expect_warning(epi_slide(grouped, f,
-    before = 0L, after = 1L,
-    ref_time_values = d + 1L
-  ), NA)
+  expect_warning(epi_slide(grouped, f, after = 1L, ref_time_values = d + 1L), NA)
+  expect_warning(epi_slide(grouped, f, before = 0L, after = 1L, ref_time_values = d + 1L), NA)
 })

 ## --- These cases doesn't generate the error: ---

-test_that("these doesn't produce an error; the error appears only if the ref time values are out of the range for every group", {
-  expect_identical(
-    epi_slide(grouped, f, before = 2L, ref_time_values = d + 200L) %>%
-      ungroup() %>%
-      dplyr::select("geo_value", "slide_value_value"),
-    dplyr::tibble(geo_value = "ak", slide_value_value = 199)
-  ) # out of range for one group
-  expect_identical(
-    epi_slide(grouped, f, before = 2L, ref_time_values = d + 3) %>%
-      ungroup() %>%
-      dplyr::select("geo_value", "slide_value_value"),
-    dplyr::tibble(geo_value = c("ak", "al"), slide_value_value = c(2, -2))
-  ) # not out of range for either group
-})
+test_that(
+  "these don't produce an error; the error appears only if the ref
+  time values are out of the range for every group",
+  {
+    expect_identical(
+      epi_slide(grouped, f, before = 2L, ref_time_values = d + 200L) %>%
+        ungroup() %>%
+        dplyr::select("geo_value", "slide_value_value"),
+      dplyr::tibble(geo_value = "ak", slide_value_value = 199)
+    ) # out of range for one group
+    expect_identical(
+      epi_slide(grouped, f, before = 2L, ref_time_values = d + 3) %>%
+        ungroup() %>%
+        dplyr::select("geo_value", "slide_value_value"),
+      dplyr::tibble(geo_value = c("ak", "al"), slide_value_value = c(2, -2))
+    ) # not out of range for either group
+  }
+)

 test_that("computation output formats x as_list_col", {
   # See `toy_edf` definition at top of file.
   # We'll try 7d sum with a few formats.
+  # nolint start: line_length_linter.
   basic_result_from_size1 <- tibble::tribble(
     ~geo_value, ~time_value, ~value, ~slide_value,
     "a", 1:10, 2L^(1:10), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE),
@@ -153,6 +157,7 @@ test_that("computation output formats x as_list_col", {
     tidyr::unchop(c(time_value, value, slide_value)) %>%
     dplyr::arrange(time_value) %>%
     as_epi_df(as_of = 100)
+  # nolint end
   expect_identical(
     toy_edf %>% epi_slide(before = 6L, ~ sum(.x$value)),
     basic_result_from_size1
@@ -186,6 +191,7 @@ test_that("computation output formats x as_list_col", {
     basic_result_from_size1 %>% rename(value_sum = slide_value)
   )
   # trying with non-size-1 computation outputs:
+  # nolint start: line_length_linter.
   basic_result_from_size2 <- tibble::tribble(
     ~geo_value, ~time_value, ~value, ~slide_value,
     "a", 1:10, 2L^(1:10), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE),
@@ -194,6 +200,7 @@ test_that("computation output formats x as_list_col", {
     tidyr::unchop(c(time_value, value, slide_value)) %>%
     dplyr::arrange(time_value) %>%
     as_epi_df(as_of = 100)
+  # nolint end
   expect_identical(
     toy_edf %>% epi_slide(before = 6L, ~ sum(.x$value) + 0:1),
     basic_result_from_size2
@@ -228,6 +235,7 @@ test_that("epi_slide alerts if the provided f doesn't take enough args", {

 test_that("`ref_time_values` + `all_rows = TRUE` works", {
   # See `toy_edf` definition at top of file.
We'll do variants of a slide # returning the following: + # nolint start: line_length_linter. basic_full_result <- tibble::tribble( ~geo_value, ~time_value, ~value, ~slide_value, "a", 1:10, 2L^(1:10), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE), @@ -236,6 +244,7 @@ test_that("`ref_time_values` + `all_rows = TRUE` works", { tidyr::unchop(c(time_value, value, slide_value)) %>% dplyr::arrange(time_value) %>% as_epi_df(as_of = 100) + # nolint end # slide computations returning atomic vecs: expect_identical( toy_edf %>% epi_slide(before = 6L, ~ sum(.x$value)), diff --git a/tests/testthat/test-epix_fill_through_version.R b/tests/testthat/test-epix_fill_through_version.R index 6b113545..9ba847fa 100644 --- a/tests/testthat/test-epix_fill_through_version.R +++ b/tests/testthat/test-epix_fill_through_version.R @@ -74,19 +74,19 @@ test_that("epix_fill_through_version does not mutate x", { # sort of work, but we might want something stricter. `as.list` + # `identical` plus a check of the DT seems to do the trick. ea_orig_before_as_list <- as.list(ea_orig) - ea_orig_DT_before_copy <- data.table::copy(ea_orig$DT) + ea_orig_dt_before_copy <- data.table::copy(ea_orig$DT) some_unobserved_version <- 8L # ea_fill_na <- epix_fill_through_version(ea_orig, some_unobserved_version, "na") ea_orig_after_as_list <- as.list(ea_orig) # use identical, not expect_identical, for the R6-as-list test; latter isn't as strict expect_true(identical(ea_orig_before_as_list, ea_orig_after_as_list)) - expect_identical(ea_orig_DT_before_copy, ea_orig$DT) + expect_identical(ea_orig_dt_before_copy, ea_orig$DT) # ea_fill_locf <- epix_fill_through_version(ea_orig, some_unobserved_version, "locf") ea_orig_after_as_list <- as.list(ea_orig) expect_true(identical(ea_orig_before_as_list, ea_orig_after_as_list)) - expect_identical(ea_orig_DT_before_copy, ea_orig$DT) + expect_identical(ea_orig_dt_before_copy, ea_orig$DT) } }) @@ -115,8 +115,8 @@ test_that("{epix_,$}fill_through_version return with expected visibility", { test_that("epix_fill_through_version returns same key & doesn't mutate old DT or its key", { ea <- as_epi_archive(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, value = 10L)) - old_DT <- ea$DT - old_DT_copy <- data.table::copy(old_DT) + old_dt <- ea$DT + old_dt_copy <- data.table::copy(old_dt) old_key <- data.table::key(ea$DT) expect_identical(data.table::key(epix_fill_through_version(ea, 5L, "na")$DT), old_key) expect_identical(data.table::key(epix_fill_through_version(ea, 5L, "locf")$DT), old_key) diff --git a/tests/testthat/test-epix_merge.R b/tests/testthat/test-epix_merge.R index 0ae428e4..181aee28 100644 --- a/tests/testthat/test-epix_merge.R +++ b/tests/testthat/test-epix_merge.R @@ -216,8 +216,18 @@ local({ test_that('epix_merge sync="na" balks if do not know next_after', { expect_error( epix_merge( - as_epi_archive(tibble::tibble(geo_value = 1L, time_value = 1L, version = as.POSIXct(as.Date("2020-01-01")), x_value = 10L)), - as_epi_archive(tibble::tibble(geo_value = 1L, time_value = 1L, version = as.POSIXct(as.Date("2020-01-02")), y_value = 20L)), + as_epi_archive(tibble::tibble( + geo_value = 1L, + time_value = 1L, + version = as.POSIXct(as.Date("2020-01-01")), + x_value = 10L + )), + as_epi_archive(tibble::tibble( + geo_value = 1L, + time_value = 1L, + version = as.POSIXct(as.Date("2020-01-02")), + y_value = 20L + )), sync = "na" ), regexp = "no applicable method.*next_after" diff --git a/tests/testthat/test-epix_slide.R 
b/tests/testthat/test-epix_slide.R index 4af84254..07f0e5bf 100644 --- a/tests/testthat/test-epix_slide.R +++ b/tests/testthat/test-epix_slide.R @@ -89,8 +89,7 @@ test_that("epix_slide works as intended with `as_list_col=TRUE`", { 2^6 + 2^3, 2^10 + 2^9, 2^15 + 2^14 - ) %>% - purrr::map(~ data.frame(bin_sum = .x)) + ) %>% purrr::map(~ data.frame(bin_sum = .x)) ) %>% group_by(geo_value) @@ -125,8 +124,7 @@ test_that("epix_slide works as intended with `as_list_col=TRUE`", { c(2^6, 2^3), c(2^10, 2^9), c(2^15, 2^14) - ) %>% - purrr::map(~ data.frame(bin = rev(.x))) + ) %>% purrr::map(~ data.frame(bin = rev(.x))) ) %>% group_by(geo_value) @@ -172,8 +170,7 @@ test_that("epix_slide works as intended with `as_list_col=TRUE`", { c(2^6, 2^3), c(2^10, 2^9), c(2^15, 2^14) - ) %>% - purrr::map(rev) + ) %>% purrr::map(rev) ) %>% group_by(geo_value) @@ -564,6 +561,7 @@ test_that("epix_slide with all_versions option works as intended", { expect_identical(xx1, xx3) # This and * Imply xx2 and xx3 are identical }) +# nolint start: commented_code_linter. # XXX currently, we're using a stopgap measure of having `epix_slide` always # output a (grouped/ungrouped) tibble while we think about the class, columns, # and attributes of `epix_slide` output more carefully. We might bring this test @@ -583,6 +581,7 @@ test_that("epix_slide with all_versions option works as intended", { # 10 # ) # }) +# nolint end test_that("epix_slide works with 0-row computation outputs", { epix_slide_empty <- function(ea, ...) { @@ -631,6 +630,7 @@ test_that("epix_slide works with 0-row computation outputs", { ) }) +# nolint start: commented_code_linter. # test_that("epix_slide grouped by geo can produce `epi_df` output", { # # This is a characterization test. Not sure we actually want this behavior; # # https://github.com/cmu-delphi/epiprocess/pull/290#issuecomment-1489099157 @@ -648,6 +648,7 @@ test_that("epix_slide works with 0-row computation outputs", { # new_epi_df(as_of = ea$versions_end) # ) # }) +# nolint end test_that("epix_slide alerts if the provided f doesn't take enough args", { f_xgt <- function(x, g, t) dplyr::tibble(value = mean(x$binary), count = length(x$binary)) diff --git a/tests/testthat/test-grouped_epi_archive.R b/tests/testthat/test-grouped_epi_archive.R index 9fd15e10..45251a89 100644 --- a/tests/testthat/test-grouped_epi_archive.R +++ b/tests/testthat/test-grouped_epi_archive.R @@ -66,6 +66,7 @@ test_that("Grouping, regrouping, and ungrouping archives works as intended", { age_group = ordered(age_group, c("pediatric", "adult")), time_value = as.Date(time_value) ) %>% + # nolint start: commented_code_linter. 
# # See # # https://github.com/cmu-delphi/epiprocess/pull/290#issuecomment-1489099157 # # and @@ -78,6 +79,7 @@ test_that("Grouping, regrouping, and ungrouping archives works as intended", { # additional_metadata = list(other_keys = "age_group")) %>% # # put back in expected order; see issue #166: # select(age_group, geo_value, time_value, s) %>% + # nolint end group_by(age_group, geo_value, .drop = FALSE) ) expect_identical( diff --git a/tests/testthat/test-methods-epi_df.R b/tests/testthat/test-methods-epi_df.R index c2a6d956..cff88dac 100644 --- a/tests/testthat/test-methods-epi_df.R +++ b/tests/testthat/test-methods-epi_df.R @@ -85,11 +85,11 @@ test_that("Subsetting drops & does not drop the epi_df class appropriately", { test_that("When duplicate cols in subset should abort", { expect_error(toy_epi_df[, c(2, 2:3, 4, 4, 4)], "Column name(s) time_value, y must not be duplicated.", - fixed = T + fixed = TRUE ) expect_error(toy_epi_df[1:4, c(1, 2:4, 1)], "Column name(s) geo_value must not be duplicated.", - fixed = T + fixed = TRUE ) }) diff --git a/vignettes/advanced.Rmd b/vignettes/advanced.Rmd index 567975a5..eff00765 100644 --- a/vignettes/advanced.Rmd +++ b/vignettes/advanced.Rmd @@ -87,10 +87,8 @@ library(dplyr) edf <- tibble( geo_value = rep(c("ca", "fl", "pa"), each = 3), - time_value = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"), - by = "day" - ), length.out = length(geo_value)), - x = 1:length(geo_value) + 0.01 * rnorm(length(geo_value)), + time_value = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"), by = "day"), length.out = length(geo_value)), + x = seq_along(geo_value) + 0.01 * rnorm(length(geo_value)), ) %>% as_epi_df() @@ -395,9 +393,9 @@ prob_arx <- function(x, y, geo_value, time_value, args = prob_arx_args()) { # Build features and response for the AR model, and then fit it dat <- - tibble(i = 1:ncol(x), lag = args$lags) %>% + tibble(i = seq_len(ncol(x)), lag = args$lags) %>% unnest(lag) %>% - mutate(name = paste0("x", 1:nrow(.))) %>% + mutate(name = paste0("x", seq_len(nrow(.)))) %>% # One list element for each lagged feature pmap(function(i, lag, name) { tibble( @@ -427,9 +425,12 @@ prob_arx <- function(x, y, geo_value, time_value, args = prob_arx_args()) { # Make predictions test_time_value <- max(time_value) - point <- predict(obj, newdata = dat %>% - dplyr::group_by(geo_value) %>% - dplyr::filter(time_value == test_time_value)) + point <- predict( + obj, + newdata = dat %>% + dplyr::group_by(geo_value) %>% + dplyr::filter(time_value == test_time_value) + ) # Compute bands r <- residuals(obj) @@ -467,24 +468,24 @@ k_week_ahead <- function(x, ahead = 7, as_of = TRUE) { if (as_of) { x %>% epix_slide( - fc = prob_arx(percent_cli, case_rate_7d_av, geo_value, time_value, + fc = prob_arx(.data$percent_cli, .data$case_rate_7d_av, .data$geo_value, .data$time_value, args = prob_arx_args(ahead = ahead) ), before = 119, ref_time_values = fc_time_values ) %>% mutate( - target_date = time_value + ahead, as_of = TRUE, - geo_value = fc_geo_value + target_date = .data$time_value + ahead, as_of = TRUE, + geo_value = .data$fc_geo_value ) } else { x_latest %>% epi_slide( - fc = prob_arx(percent_cli, case_rate_7d_av, geo_value, time_value, + fc = prob_arx(.data$percent_cli, .data$case_rate_7d_av, .data$geo_value, .data$time_value, args = prob_arx_args(ahead = ahead) ), before = 119, ref_time_values = fc_time_values ) %>% - mutate(target_date = time_value + ahead, as_of = FALSE) + mutate(target_date = .data$time_value + ahead, as_of = FALSE) } } diff --git 
a/vignettes/archive.Rmd b/vignettes/archive.Rmd index b351d684..fdb0e3c6 100644 --- a/vignettes/archive.Rmd +++ b/vignettes/archive.Rmd @@ -190,7 +190,9 @@ versions <- seq(as.Date("2020-06-01"), self_max - 1, by = "1 month") snapshots <- map_dfr(versions, function(v) { epix_as_of(x, max_version = v) %>% mutate(version = v) }) %>% - bind_rows(x_latest %>% mutate(version = self_max)) %>% + bind_rows( + x_latest %>% mutate(version = self_max) + ) %>% mutate(latest = version == self_max) ggplot( @@ -316,13 +318,13 @@ prob_arx <- function(x, y, lags = c(0, 7, 14), ahead = 7, min_train_window = 20, dat <- do.call( data.frame, unlist( # Below we loop through and build the lagged features - purrr::map(1:ncol(x), function(i) { + purrr::map(seq_len(ncol(x)), function(i) { purrr::map(lags[[i]], function(j) lag(x[, i], n = j)) }), recursive = FALSE ) ) - names(dat) <- paste0("x", 1:ncol(dat)) + names(dat) <- paste0("x", seq_len(ncol(dat))) if (intercept) dat$x0 <- rep(1, nrow(dat)) dat$y <- lead(y, n = ahead) obj <- lm(y ~ . + 0, data = dat) @@ -393,21 +395,21 @@ x_latest <- epix_as_of(x, max_version = max(x$DT$version)) k_week_ahead <- function(x, ahead = 7, as_of = TRUE) { if (as_of) { x %>% - group_by(geo_value) %>% + group_by(.data$geo_value) %>% epix_slide( - fc = prob_arx(percent_cli, case_rate_7d_av, ahead = ahead), before = 119, + fc = prob_arx(.data$percent_cli, .data$case_rate_7d_av, ahead = ahead), before = 119, ref_time_values = fc_time_values ) %>% - mutate(target_date = time_value + ahead, as_of = TRUE) %>% + mutate(target_date = .data$time_value + ahead, as_of = TRUE) %>% ungroup() } else { x_latest %>% - group_by(geo_value) %>% + group_by(.data$geo_value) %>% epi_slide( - fc = prob_arx(percent_cli, case_rate_7d_av, ahead = ahead), before = 119, + fc = prob_arx(.data$percent_cli, .data$case_rate_7d_av, ahead = ahead), before = 119, ref_time_values = fc_time_values ) %>% - mutate(target_date = time_value + ahead, as_of = FALSE) %>% + mutate(target_date = .data$time_value + ahead, as_of = FALSE) %>% ungroup() } } diff --git a/vignettes/compactify.Rmd b/vignettes/compactify.Rmd index 3e97b6b9..cad065e7 100644 --- a/vignettes/compactify.Rmd +++ b/vignettes/compactify.Rmd @@ -40,7 +40,7 @@ locf_included <- as_epi_archive(dt, compactify = FALSE) head(locf_omitted$DT) head(locf_included$DT) ``` - + LOCF-redundant values can mar the performance of dataset operations. As the column `case_rate_7d_av` has many more LOCF-redundant values than `percent_cli`, we will omit the `percent_cli` column for comparing performance. 
@@ -101,7 +101,7 @@ speeds <- rbind(speeds, speed_test(iterate_as_of, "as_of_1000x"))

 # Performance of slide
 slide_median <- function(my_ea) {
-  my_ea$slide(median = median(case_rate_7d_av), before = 7)
+  my_ea$slide(median = median(.data$case_rate_7d_av), before = 7)
 }
 speeds <- rbind(speeds, speed_test(slide_median, "slide_median"))
diff --git a/vignettes/epiprocess.Rmd b/vignettes/epiprocess.Rmd
index a1b52daa..85b1e1f4 100644
--- a/vignettes/epiprocess.Rmd
+++ b/vignettes/epiprocess.Rmd
@@ -143,10 +143,8 @@ ex1 <- tibble(
     "12111", "12113", "12117", "42101", "42103", "42105"
   ),
-  time_value = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"),
-    by = "day"
-  ), length.out = length(geo_value)),
-  value = 1:length(geo_value) + 0.01 * withr::with_rng_version("3.0.0", withr::with_seed(42, length(geo_value)))
+  time_value = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"), by = "day"), length.out = length(geo_value)),
+  value = seq_along(geo_value) + 0.01 * withr::with_rng_version("3.0.0", withr::with_seed(42, rnorm(length(geo_value))))
 ) %>%
   as_tsibble(index = time_value, key = c(geo_value, county_code))

@@ -164,24 +162,26 @@ attr(ex1, "metadata")

 `epi_df` requires there to be columns `geo_value` and `time_value`, if they do not exist then `as_epi_df()` throws an error.
 ```{r, error = TRUE}
 data.frame(
-  state = rep(c("ca", "fl", "pa"), each = 3), # misnamed
-  pol = rep(c("blue", "swing", "swing"), each = 3), # extra key
-  reported_date = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"),
-    by = "day"
-  ), length.out = length(geo_value)), # misnamed
-  value = 1:length(geo_value) + 0.01 * withr::with_rng_version("3.0.0", withr::with_seed(42, length(geo_value)))
+  # misnamed
+  state = rep(c("ca", "fl", "pa"), each = 3),
+  # extra key
+  pol = rep(c("blue", "swing", "swing"), each = 3),
+  # misnamed
+  reported_date = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"), by = "day"), length.out = length(geo_value)),
+  value = seq_along(geo_value) + 0.01 * withr::with_rng_version("3.0.0", withr::with_seed(42, rnorm(length(geo_value))))
 ) %>%
   as_epi_df()
 ```

 The columns can be renamed to match `epi_df` format. In the example below, notice there is also an additional key `pol`.
```{r}
 ex2 <- tibble(
-  state = rep(c("ca", "fl", "pa"), each = 3), # misnamed
-  pol = rep(c("blue", "swing", "swing"), each = 3), # extra key
-  reported_date = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"),
-    by = "day"
-  ), length.out = length(state)), # misnamed
-  value = 1:length(state) + 0.01 * withr::with_rng_version("3.0.0", withr::with_seed(42, length(state)))
+  # misnamed
+  state = rep(c("ca", "fl", "pa"), each = 3),
+  # extra key
+  pol = rep(c("blue", "swing", "swing"), each = 3),
+  # misnamed
+  reported_date = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"), by = "day"), length.out = length(state)),
+  value = seq_along(state) + 0.01 * withr::with_rng_version("3.0.0", withr::with_seed(42, rnorm(length(state))))
 ) %>%
   data.frame()

 head(ex2)
@@ -265,8 +265,12 @@ x <- x %>%
   pivot_longer(starts_with("cases"), names_to = "type") %>%
   mutate(type = substring(type, 7))

-yrange <- range(x %>% group_by(time_value) %>%
-  summarize(value = sum(value)) %>% pull(value))
+yrange <- range(
+  x %>%
+    group_by(time_value) %>%
+    summarize(value = sum(value)) %>%
+    pull(value)
+)

 ggplot(x, aes(x = time_value, y = value)) +
   geom_col(aes(fill = type)) +
diff --git a/vignettes/outliers.Rmd b/vignettes/outliers.Rmd
index 416a135f..4d9d4da8 100644
--- a/vignettes/outliers.Rmd
+++ b/vignettes/outliers.Rmd
@@ -141,7 +141,7 @@ To visualize the results, we first define a convenience function for plotting.
 ```{r}
 # Plot outlier detection bands and/or points identified as outliers
 plot_outlr <- function(x, signal, method_abbr, bands = TRUE, points = TRUE,
-                       facet_vars = vars(geo_value), nrow = NULL, ncol = NULL,
+                       facet_vars = vars(.data$geo_value), nrow = NULL, ncol = NULL,
                        scales = "fixed") {
   # Convert outlier detection results to long format
   signal <- rlang::enquo(signal)
@@ -154,27 +154,27 @@ plot_outlr <- function(x, signal, method_abbr, bands = TRUE, points = TRUE,

   # Start of plot with observed data
   p <- ggplot() +
-    geom_line(data = x, mapping = aes(x = time_value, y = !!signal))
+    geom_line(data = x, mapping = aes(x = .data$time_value, y = !!signal))

   # If requested, add bands
   if (bands) {
     p <- p + geom_ribbon(
       data = x_long,
       aes(
-        x = time_value, ymin = lower, ymax = upper,
-        color = method
+        x = .data$time_value, ymin = .data$lower, ymax = .data$upper,
+        color = .data$method
       ),
       fill = NA
     )
   }

   # If requested, add points
   if (points) {
-    x_detected <- x_long %>% filter((!!signal < lower) | (!!signal > upper))
+    x_detected <- x_long %>% filter((!!signal < .data$lower) | (!!signal > .data$upper))
     p <- p + geom_point(
       data = x_detected,
       aes(
-        x = time_value, y = !!signal, color = method,
-        shape = method
+        x = .data$time_value, y = !!signal, color = .data$method,
+        shape = .data$method
       )
     )
   }
diff --git a/vignettes/slide.Rmd b/vignettes/slide.Rmd
index 34d5bd59..3238f08b 100644
--- a/vignettes/slide.Rmd
+++ b/vignettes/slide.Rmd
@@ -195,7 +195,7 @@ prob_ar <- function(y, lags = c(0, 7, 14), ahead = 6, min_train_window = 20,
     data.frame,
     purrr::map(lags, function(j) lag(y, n = j))
   )
-  names(dat) <- paste0("x", 1:ncol(dat))
+  names(dat) <- paste0("x", seq_len(ncol(dat)))
   if (intercept) dat$x0 <- rep(1, nrow(dat))
   dat$y <- lead(y, n = ahead)
@@ -258,13 +258,13 @@ so that we can call it a few times.
# Note the use of all_rows = TRUE (keeps all original rows in the output) k_week_ahead <- function(x, ahead = 7) { x %>% - group_by(geo_value) %>% + group_by(.data$geo_value) %>% epi_slide( - fc = prob_ar(cases_7dav, ahead = ahead), before = 119, + fc = prob_ar(.data$cases_7dav, ahead = ahead), before = 119, ref_time_values = fc_time_values, all_rows = TRUE ) %>% ungroup() %>% - mutate(target_date = time_value + ahead) + mutate(target_date = .data$time_value + ahead) } # First generate the forecasts, and bind them together From 9331e6ed37417fd56f94d399037cad6aacc1d418 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Thu, 7 Mar 2024 13:50:59 -0800 Subject: [PATCH 197/345] ci: build README.rmd only if it exists --- .github/workflows/document.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/document.yaml b/.github/workflows/document.yaml index 8eb92921..38ce641f 100644 --- a/.github/workflows/document.yaml +++ b/.github/workflows/document.yaml @@ -45,7 +45,8 @@ jobs: shell: Rscript {0} - name: Build README.md from README.Rmd - run: Rscript -e 'devtools::build_readme()' + # Run only if README.Rmd is found + run: Rscript -e 'if (file.exists("README.Rmd")) devtools::build_readme()' - name: Commit and push changes run: | From eb55b8bae7de3bc2552c860fd0f45c17588ad0ed Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Thu, 7 Mar 2024 13:51:29 -0800 Subject: [PATCH 198/345] ci: build README.rmd only if it exists --- .github/workflows/document.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/document.yaml b/.github/workflows/document.yaml index 38ce641f..bfc3c43d 100644 --- a/.github/workflows/document.yaml +++ b/.github/workflows/document.yaml @@ -45,7 +45,6 @@ jobs: shell: Rscript {0} - name: Build README.md from README.Rmd - # Run only if README.Rmd is found run: Rscript -e 'if (file.exists("README.Rmd")) devtools::build_readme()' - name: Commit and push changes From 69e022852bbcadc916a36893cc8ec678c28659e9 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Thu, 7 Mar 2024 14:17:50 -0800 Subject: [PATCH 199/345] repo: ignore linter commit in blame --- .git-blame-ignore-revs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs index a3d36061..4d4804de 100644 --- a/.git-blame-ignore-revs +++ b/.git-blame-ignore-revs @@ -1 +1,4 @@ -c65876078a6f9525952b305eaea2fca003adf907 \ No newline at end of file +# Move to styler +c65876078a6f9525952b305eaea2fca003adf907 +# Move to lintr +0c8144e0e9deb79547eb523298d782926a7337a6 \ No newline at end of file From 9256dd221e2638fe5e0c45d0bfe28f4a5400cb5e Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Thu, 7 Mar 2024 15:08:41 -0800 Subject: [PATCH 200/345] refactor+lint: remove unused functions, snake_case --- R/growth_rate.R | 2 +- R/methods-epi_archive.R | 2 +- R/utils.R | 18 ++---------------- tests/testthat/test-utils.R | 15 ++------------- 4 files changed, 6 insertions(+), 31 deletions(-) diff --git a/R/growth_rate.R b/R/growth_rate.R index 1d6a0bb1..a60db452 100644 --- a/R/growth_rate.R +++ b/R/growth_rate.R @@ -249,7 +249,7 @@ growth_rate <- function(x = seq_along(y), y, x0 = x, # Estimate growth rate and return f <- genlasso::coef.genlasso(obj, df = df)$beta - d <- ExtendR(diff(f) / diff(x)) + d <- extend_r(diff(f) / diff(x)) if (log_scale) { return(d[i0]) } else { diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index 6c438d38..4bcead66 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R 
@@ -207,7 +207,7 @@ epix_merge <- function(x, y, if (all(is.na(c(x$clobberable_versions_start, y$clobberable_versions_start)))) { NA # (any type of NA is fine here) } else { - Min(c(x$clobberable_versions_start, y$clobberable_versions_start)) + min_na_rm(c(x$clobberable_versions_start, y$clobberable_versions_start)) } # The actual merge below may not succeed 100% of the time, so do this diff --git a/R/utils.R b/R/utils.R index 098b9966..57a7f53a 100644 --- a/R/utils.R +++ b/R/utils.R @@ -359,22 +359,8 @@ as_slide_computation <- function(f, ...) { ) } -########## - -Min <- function(x) min(x, na.rm = TRUE) -Max <- function(x) max(x, na.rm = TRUE) -Sum <- function(x) sum(x, na.rm = TRUE) -Mean <- function(x) mean(x, na.rm = TRUE) -Median <- function(x) median(x, na.rm = TRUE) - -########## - -Start <- function(x) x[1] -End <- function(x) x[length(x)] -MiddleL <- function(x) x[floor((length(x) + 1) / 2)] -MiddleR <- function(x) x[ceiling((length(x) + 1) / 2)] -ExtendL <- function(x) c(Start(x), x) -ExtendR <- function(x) c(x, End(x)) +min_na_rm <- function(x) min(x, na.rm = TRUE) +extend_r <- function(x) c(x, x[length(x)]) guess_geo_type <- function(geo_value) { if (is.character(geo_value)) { diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R index 11d6e864..83cc07f6 100644 --- a/tests/testthat/test-utils.R +++ b/tests/testthat/test-utils.R @@ -1,22 +1,11 @@ test_that("new summarizing functions work", { x <- c(3, 4, 5, 9, NA) - expect_equal(Min(x), 3) - expect_equal(Max(x), 9) - expect_equal(Sum(x), 21) - expect_equal(Mean(x), 5.25) - expect_equal(Median(x), 4.5) + expect_equal(min_na_rm(x), 3) }) test_that("Other capital letter functions work", { x <- c(1, 2, 3, 4, 5) - expect_equal(Start(x), 1) - expect_equal(End(x), 5) - expect_equal(MiddleL(x), 3) - expect_equal(MiddleR(x), 3) - expect_equal(MiddleL(x[-5]), 2) - expect_equal(MiddleR(x[-5]), 3) - expect_equal(ExtendL(x), c(1, 1, 2, 3, 4, 5)) - expect_equal(ExtendR(x), c(1, 2, 3, 4, 5, 5)) + expect_equal(extend_r(x), c(1, 2, 3, 4, 5, 5)) }) test_that("guess_geo_type tests for different types of geo_value's", { From 0f478080b53555a6fbcfa68cc3fb37c04164a197 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Thu, 7 Mar 2024 15:57:18 -0800 Subject: [PATCH 201/345] lint: objects to dplyr perhaps too much --- R/autoplot.R | 2 +- R/slide.R | 2 +- vignettes/advanced.Rmd | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/R/autoplot.R b/R/autoplot.R index 09881209..7443628b 100644 --- a/R/autoplot.R +++ b/R/autoplot.R @@ -102,7 +102,7 @@ autoplot.epi_df <- function( names_to = ".response_name" ) } else { - object <- dplyr::rename(object[pos], .response := !!names(vars)) + object <- dplyr::rename(object[pos], .response := !!names(vars)) # nolint: object_usage_linter } all_keys <- rlang::syms(as.list(geo_and_other_keys)) other_keys <- rlang::syms(as.list(setdiff(geo_and_other_keys, "geo_value"))) diff --git a/R/slide.R b/R/slide.R index 253a6457..bbcaeb02 100644 --- a/R/slide.R +++ b/R/slide.R @@ -338,7 +338,7 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, f <- quos[[1]] new_col <- sym(names(rlang::quos_auto_name(quos))) - ... <- missing_arg() # magic value that passes zero args as dots in calls below + ... <- missing_arg() # magic value that passes zero args as dots in calls below # nolint: object_usage_linter } f <- as_slide_computation(f, ...) 
diff --git a/vignettes/advanced.Rmd b/vignettes/advanced.Rmd index eff00765..21b95c60 100644 --- a/vignettes/advanced.Rmd +++ b/vignettes/advanced.Rmd @@ -395,7 +395,7 @@ prob_arx <- function(x, y, geo_value, time_value, args = prob_arx_args()) { dat <- tibble(i = seq_len(ncol(x)), lag = args$lags) %>% unnest(lag) %>% - mutate(name = paste0("x", seq_len(nrow(.)))) %>% + mutate(name = paste0("x", seq_len(nrow(.)))) %>% # nolint: object_usage_linter # One list element for each lagged feature pmap(function(i, lag, name) { tibble( From ae1f3ad7332277dcbc83affa20f56228452722ee Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Thu, 7 Mar 2024 23:59:45 +0000 Subject: [PATCH 202/345] style: styler (GHA) --- vignettes/advanced.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vignettes/advanced.Rmd b/vignettes/advanced.Rmd index 21b95c60..d4fad3e7 100644 --- a/vignettes/advanced.Rmd +++ b/vignettes/advanced.Rmd @@ -395,7 +395,7 @@ prob_arx <- function(x, y, geo_value, time_value, args = prob_arx_args()) { dat <- tibble(i = seq_len(ncol(x)), lag = args$lags) %>% unnest(lag) %>% - mutate(name = paste0("x", seq_len(nrow(.)))) %>% # nolint: object_usage_linter + mutate(name = paste0("x", seq_len(nrow(.)))) %>% # nolint: object_usage_linter # One list element for each lagged feature pmap(function(i, lag, name) { tibble( From 216f0e9743445a9c8e3167bcff905033ea1c412f Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 8 Mar 2024 09:07:23 -0800 Subject: [PATCH 203/345] light searchbar --- _pkgdown.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_pkgdown.yml b/_pkgdown.yml index 96d67946..8ac42458 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -21,7 +21,7 @@ template: navbar: bg: primary - type: dark + type: light url: https://cmu-delphi.github.io/epiprocess/ From c4a5df0711af92fde3419bf6b8b929c88c791a93 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Fri, 8 Mar 2024 17:43:32 -0800 Subject: [PATCH 204/345] ci: fix pkgdown refs --- .github/workflows/pkgdown.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index ca4ee94c..886055ae 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -51,9 +51,9 @@ jobs: # https://github.com/r-lib/pkgdown/issues/2257 run: | target_ref <- "${{ github.event_name == 'pull_request' && github.base_ref || github.ref }}" - override <- if (target_ref == "main") { + override <- if (target_ref == "main" || target_ref == "refs/heads/main") { list(development = list(mode = "release", version_label = "light")) - } else if (target_ref == "dev") { + } else if (target_ref == "dev" || target_ref == "refs/heads/dev") { list(development = list(mode = "devel", version_label = "success")) } else { stop("Unexpected target_ref: ", target_ref) From ed231a9135a5538a5e8c5c361e14e623924df216 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 13 Mar 2024 18:02:12 -0400 Subject: [PATCH 205/345] add epi_slide_mean example to intro slide vignette --- vignettes/slide.Rmd | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/vignettes/slide.Rmd b/vignettes/slide.Rmd index 34d5bd59..dbd1930a 100644 --- a/vignettes/slide.Rmd +++ b/vignettes/slide.Rmd @@ -60,12 +60,34 @@ x <- jhu_csse_daily_subset %>% as_epi_df() ``` +## Optimized rolling mean + +We first demonstrate how to apply a 7-day trailing average to the daily cases +in order to smooth the signal, by 
passing in the name of the column(s) we
+want to average as the first argument of `epi_slide_mean()`. Note that
+`epi_slide_mean()` can only be used for averaging. To do this computation
+per state, we first call `group_by()`.
+
+```{r}
+x %>%
+  group_by(geo_value) %>%
+  epi_slide_mean("cases", before = 6) %>%
+  ungroup() %>%
+  head(10)
+```
+
+The calculation is done using `data.table::frollmean`, whose behavior can be
+adjusted by passing relevant arguments via `...`.
 
 ## Slide with a formula
 
-We first demonstrate how to apply a 7-day trailing average to the daily cases in
-order to smooth the signal, by passing in a formula for the first argument of
-`epi_slide()`. To do this computation per state, we first call `group_by()`.
+The previous computation can also be performed using `epi_slide()`, which is
+more flexible but quite a bit slower than `epi_slide_mean()`. It is
+recommended to use `epi_slide_mean()` when possible.
+
+The same 7-day trailing average of daily cases can be computed by passing in a
+formula for the first argument of `epi_slide()`. To do this per state, we
+first call `group_by()`.
 
 ```{r}
 x %>%

From fe35331b98b57b4eb4bdda7e446667c6a6921c3a Mon Sep 17 00:00:00 2001
From: Nat DeFries <42820733+nmdefries@users.noreply.github.com>
Date: Wed, 13 Mar 2024 18:44:37 -0400
Subject: [PATCH 206/345] deprecate more as_list_col behavior, clarify naming
 error

---
 R/slide.R             | 25 ++++++++++---------------
 man/epi_slide_mean.Rd | 14 +++-----------
 man/epiprocess.Rd     |  2 +-
 3 files changed, 14 insertions(+), 27 deletions(-)

diff --git a/R/slide.R b/R/slide.R
index 0e45340e..3fd921db 100644
--- a/R/slide.R
+++ b/R/slide.R
@@ -419,12 +419,7 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values,
 #' @param new_col_names String indicating the name of the new column that will
 #'   contain the derivative values. Default is "slide_value"; note that setting
 #'   `new_col_names` equal to an existing column name will overwrite this column.
-#' @param as_list_col Should the slide results be held in a list column, or be
-#'   [unchopped][tidyr::unchop]/[unnested][tidyr::unnest]? Default is `FALSE`,
-#'   in which case a list object returned by `f` would be unnested (using
-#'   [`tidyr::unnest()`]), and, if the slide computations output data frames,
-#'   the names of the resulting columns are given by prepending `new_col_names`
-#'   to the names of the list elements.
+#' @param as_list_col Not supported. Included to match `epi_slide` interface.
 #' @param names_sep String specifying the separator to use in `tidyr::unnest()`
 #'   when `as_list_col = FALSE`. Default is "_". Using `NULL` drops the prefix
 #'   from `new_col_names` entirely.
@@ -434,10 +429,7 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values,
 #'   outside `ref_time_values`; otherwise, there will be one row for each row in
 #'   `x` that had a `time_value` in `ref_time_values`. Default is `FALSE`. The
 #'   missing value marker is the result of `vctrs::vec_cast`ing `NA` to the type
-#'   of the slide computation output. If using `as_list_col = TRUE`, note that
-#'   the missing marker is a `NULL` entry in the list column; for certain
-#'   operations, you might want to replace these `NULL` entries with a different
-#'   `NA` marker.
+#'   of the slide computation output.
 #' @return An `epi_df` object given by appending one or more new columns to
 #'   `x`, depending on the `col_names` argument, named according to the
 #'   `new_col_names` argument.
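
The hunks above retire the `as_list_col` documentation; the hunks below add the corresponding runtime guard. The pattern is a common one for unsupported-but-interface-matching arguments: default the parameter to `NULL`, and treat any non-`NULL` value as an error carrying a classed condition so tests can target it precisely. A condensed sketch under assumed names (`slide_fast` and its condition class are made up for illustration):

```r
library(cli)

slide_fast <- function(x, as_list_col = NULL) {
  if (!is.null(as_list_col)) {
    # A classed condition lets callers and tests match this exact failure,
    # e.g. testthat::expect_error(..., class = "mypkg__slide_fast__list_not_supported").
    cli_abort(
      "`as_list_col` is not supported for `slide_fast`",
      class = "mypkg__slide_fast__list_not_supported"
    )
  }
  x
}
```

Keeping the parameter in the signature, rather than dropping it, preserves positional compatibility with the richer `epi_slide()` interface while rejecting any attempt to actually use it.
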
@@ -523,11 +515,11 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values,
 #'   ungroup()
 epi_slide_mean = function(x, col_names, ..., before, after, ref_time_values,
                           time_step,
-                          new_col_names = "slide_value", as_list_col = FALSE,
+                          new_col_names = "slide_value", as_list_col = NULL,
                           names_sep = "_", all_rows = FALSE) {
   assert_class(x, "epi_df")
 
-  if (as_list_col) {
+  if (!is.null(as_list_col)) {
     cli::cli_abort(
       "`as_list_col` is not supported for `epi_slide_mean`",
       class = "epiproces__epi_slide_mean__list_not_supported"
@@ -589,10 +581,13 @@ epi_slide_mean = function(x, col_names, ..., before, after, ref_time_values,
   # `before` and `after` params.
   m <- before + after + 1L
 
-  if (is.null(names_sep) && !as_list_col) {
+  if (is.null(names_sep)) {
     if (length(new_col_names) != length(col_names)) {
-      cli_abort(
-        "`new_col_names` must be the same length as `col_names` when `names_sep` is NULL.",
+      cli_abort(
+        c(
+          "`new_col_names` must be the same length as `col_names` when
+          `names_sep` is NULL to avoid duplicate output column names."
+        ),
         class = "epiprocess__epi_slide_mean__col_names_length_mismatch",
         epiprocess__new_col_names = new_col_names,
         epiprocess__col_names = col_names
@@ -685,7 +680,7 @@ epi_slide_mean = function(x, col_names, ..., before, after, ref_time_values,
   }
 
   if (!is_epi_df(result)) {
-    # `all_rows` and `as_list_col` handling strip epi_df format and metadata.
+    # `all_rows` handling strips epi_df format and metadata.
     # Restore them.
     result <- reclass(result, attributes(x)$metadata)
   }
diff --git a/man/epi_slide_mean.Rd b/man/epi_slide_mean.Rd
index 457d5b69..8fb0b33c 100644
--- a/man/epi_slide_mean.Rd
+++ b/man/epi_slide_mean.Rd
@@ -13,7 +13,7 @@ epi_slide_mean(
   ref_time_values,
   time_step,
   new_col_names = "slide_value",
-  as_list_col = FALSE,
+  as_list_col = NULL,
   names_sep = "_",
   all_rows = FALSE
 )
@@ -66,12 +66,7 @@ return an object of class \code{lubridate::period}. For example, we can use
 contain the derivative values. Default is "slide_value"; note that setting
 \code{new_col_names} equal to an existing column name will overwrite this column.}
 
-\item{as_list_col}{Should the slide results be held in a list column, or be
-\link[tidyr:chop]{unchopped}/\link[tidyr:unnest]{unnested}? Default is \code{FALSE},
-in which case a list object returned by \code{f} would be unnested (using
-\code{\link[tidyr:unnest]{tidyr::unnest()}}), and, if the slide computations output data frames,
-the names of the resulting columns are given by prepending \code{new_col_names}
-to the names of the list elements.}
+\item{as_list_col}{Not supported. Included to match \code{epi_slide} interface.}
 
 \item{names_sep}{String specifying the separator to use in \code{tidyr::unnest()}
 when \code{as_list_col = FALSE}. Default is "_". Using \code{NULL} drops the prefix
@@ -83,10 +78,7 @@ value marker for the slide computation output column(s) for \code{time_value}s
 outside \code{ref_time_values}; otherwise, there will be one row for each row in
 \code{x} that had a \code{time_value} in \code{ref_time_values}. Default is \code{FALSE}. The
 missing value marker is the result of \code{vctrs::vec_cast}ing \code{NA} to the type
-of the slide computation output. 
If using \code{as_list_col = TRUE}, note that -the missing marker is a \code{NULL} entry in the list column; for certain -operations, you might want to replace these \code{NULL} entries with a different -\code{NA} marker.} +of the slide computation output.} } \value{ An \code{epi_df} object given by appending one or more new columns to diff --git a/man/epiprocess.Rd b/man/epiprocess.Rd index bc7ef263..7c3ecd8a 100644 --- a/man/epiprocess.Rd +++ b/man/epiprocess.Rd @@ -2,8 +2,8 @@ % Please edit documentation in R/epiprocess.R \docType{package} \name{epiprocess} -\alias{epiprocess} \alias{epiprocess-package} +\alias{epiprocess} \title{epiprocess: Tools for basic signal processing in epidemiology} \description{ This package introduces a common data structure for epidemiological data sets From bf98cfd053735167f5d79158674e341319ec364f Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 14 Mar 2024 11:37:52 -0400 Subject: [PATCH 207/345] replace deprecated Start and End --- R/slide.R | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/R/slide.R b/R/slide.R index a0e836ac..7736eb16 100644 --- a/R/slide.R +++ b/R/slide.R @@ -723,10 +723,10 @@ full_date_seq <- function(x, before, after, time_step) { all_dates <- seq(min(x$time_value), max(x$time_value), by = 1L) if (before != 0) { - pad_early_dates <- Start(all_dates) - before:1 + pad_early_dates <- all_dates[1L] - before:1 } if (after != 0) { - pad_late_dates <- End(all_dates) + 1:after + pad_late_dates <- all_dates[length(all_dates)] + 1:after } } else if (missing(time_step)) { # Guess what `by` should be based on the epi_df's `time_type`. @@ -768,10 +768,10 @@ full_date_seq <- function(x, before, after, time_step) { # element. # # Adding "-1" to the `by` arg makes `seq.Date` go backwards in time. - pad_early_dates <- sort(seq(Start(all_dates), by = paste("-1", by), length.out = before + 1)[-1]) + pad_early_dates <- sort(seq(all_dates[1L], by = paste("-1", by), length.out = before + 1)[-1]) } if (after != 0) { - pad_late_dates <- seq(End(all_dates), by = by, length.out = after + 1)[-1] + pad_late_dates <- seq(all_dates[length(all_dates)], by = by, length.out = after + 1)[-1] } } else { # A custom time step is specified. @@ -787,10 +787,10 @@ full_date_seq <- function(x, before, after, time_step) { all_dates <- min(x$time_value) + time_step(0:n_steps) if (before != 0) { - pad_early_dates <- Start(all_dates) - time_step(before:1) + pad_early_dates <- all_dates[1L] - time_step(before:1) } if (after != 0) { - pad_late_dates <- End(all_dates) + time_step(1:after) + pad_late_dates <- all_dates[length(all_dates)] + time_step(1:after) } } From 298be501480308f34e821db7a2ab7119897ed3fb Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 13 Mar 2024 19:36:10 -0400 Subject: [PATCH 208/345] linting style: styler (GHA) more linting style: styler (GHA) --- R/slide.R | 32 +-- tests/testthat/test-epi_slide.R | 388 +++++++++++++++++++------------- 2 files changed, 254 insertions(+), 166 deletions(-) diff --git a/R/slide.R b/R/slide.R index 7736eb16..9d5a4e2a 100644 --- a/R/slide.R +++ b/R/slide.R @@ -493,7 +493,8 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' # and accuracy, and to allow partially-missing windows. 
#' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide_mean("cases", new_col_names = "cases_7dav", names_sep = NULL, before = 6, +#' epi_slide_mean("cases", +#' new_col_names = "cases_7dav", names_sep = NULL, before = 6, #' # `frollmean` options #' na.rm = TRUE, algo = "exact", hasNA = TRUE #' ) %>% @@ -523,10 +524,10 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' # Remove a nonessential var. to ensure new col is printed #' dplyr::select(geo_value, time_value, cases, cases_14dav) %>% #' ungroup() -epi_slide_mean = function(x, col_names, ..., before, after, ref_time_values, - time_step, - new_col_names = "slide_value", as_list_col = NULL, - names_sep = "_", all_rows = FALSE) { +epi_slide_mean <- function(x, col_names, ..., before, after, ref_time_values, + time_step, + new_col_names = "slide_value", as_list_col = NULL, + names_sep = "_", all_rows = FALSE) { assert_class(x, "epi_df") if (!is.null(as_list_col)) { @@ -593,7 +594,7 @@ epi_slide_mean = function(x, col_names, ..., before, after, ref_time_values, if (is.null(names_sep)) { if (length(new_col_names) != length(col_names)) { - cli_abort( + cli_abort( c( "`new_col_names` must be the same length as `col_names` when `names_sep` is NULL to avoid duplicate output column names." @@ -606,7 +607,7 @@ epi_slide_mean = function(x, col_names, ..., before, after, ref_time_values, result_col_names <- new_col_names } else { if (length(new_col_names) != 1L && length(new_col_names) != length(col_names)) { - cli_abort( + cli_abort( "`new_col_names` must be either length 1 or the same length as `col_names`.", class = "epiprocess__epi_slide_mean__col_names_length_mismatch_and_not_one", epiprocess__new_col_names = new_col_names, @@ -626,7 +627,7 @@ epi_slide_mean = function(x, col_names, ..., before, after, ref_time_values, .data_group, tibble(time_value = c(missing_times, pad_early_dates, pad_late_dates), .real = FALSE) ) %>% - arrange(time_value) + arrange(.data$time_value) # If a group contains duplicate time values, `frollmean` will still only # use the last `k` obs. It isn't looking at dates, it just goes in row @@ -634,7 +635,8 @@ epi_slide_mean = function(x, col_names, ..., before, after, ref_time_values, # same date, `epi_slide_mean` will produce incorrect results; `epi_slide` # should be used instead. if (anyDuplicated(.data_group$time_value) != 0L) { - cli_abort(c( + cli_abort( + c( "group contains duplicate time values. Using `epi_slide_mean` on this group will result in incorrect results", "i" = "Please change the grouping structure of the input data so that @@ -647,7 +649,8 @@ epi_slide_mean = function(x, col_names, ..., before, after, ref_time_values, ) } if (nrow(.data_group) != length(c(all_dates, pad_early_dates, pad_late_dates))) { - cli_abort(c( + cli_abort( + c( "group contains an unexpected number of rows", "i" = c("Input data may contain `time_values` closer together than the expected `time_step` size") @@ -667,9 +670,8 @@ epi_slide_mean = function(x, col_names, ..., before, after, ref_time_values, # timesteps ahead of where they should be. Shift results to the left by # `after` timesteps. 
.data_group[, result_col_names] <- purrr::map(roll_output, function(.x) { - c(.x[(after + 1L):length(.x)], rep(NA, after)) - } - ) + c(.x[(after + 1L):length(.x)], rep(NA, after)) + }) } else { .data_group[, result_col_names] <- roll_output } @@ -742,7 +744,7 @@ full_date_seq <- function(x, before, after, time_step) { ) if (is.na(by)) { - cli_abort( + cli_abort( c( "`frollmean` requires a full window to compute a result, but the `time_type` associated with the epi_df was not mappable to a period @@ -799,4 +801,4 @@ full_date_seq <- function(x, before, after, time_step) { pad_early_dates = pad_early_dates, pad_late_dates = pad_late_dates )) -} \ No newline at end of file +} diff --git a/tests/testthat/test-epi_slide.R b/tests/testthat/test-epi_slide.R index 223b5fc6..8b509452 100644 --- a/tests/testthat/test-epi_slide.R +++ b/tests/testthat/test-epi_slide.R @@ -28,6 +28,7 @@ toy_edf <- tibble::tribble( tidyr::unchop(c(time_value, value)) %>% as_epi_df(as_of = 100) +# nolint start: line_length_linter. basic_result_from_size1_sum <- tibble::tribble( ~geo_value, ~time_value, ~value, ~slide_value, "a", 1:10, 2L^(1:10), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE), @@ -44,6 +45,7 @@ basic_result_from_size1_mean <- tibble::tribble( tidyr::unchop(c(time_value, value, slide_value)) %>% dplyr::arrange(time_value) %>% as_epi_df(as_of = 100) +# nolint end: line_length_linter. ## --- These cases generate errors (or not): --- test_that("`before` and `after` are both vectors of length 1", { @@ -94,13 +96,36 @@ test_that("Test errors/warnings for discouraged features", { ) # Below cases should raise no errors/warnings: - expect_no_warning(ref1 <- epi_slide(grouped, f, before = 1L, ref_time_values = d + 2)) - expect_no_warning(ref2 <- epi_slide(grouped, f, after = 1L, ref_time_values = d + 2)) - expect_no_warning(ref3 <- epi_slide(grouped, f, before = 0L, after = 0L, ref_time_values = d + 2)) + expect_no_warning( + ref1 <- epi_slide(grouped, f, before = 1L, ref_time_values = d + 2) + ) + expect_no_warning( + ref2 <- epi_slide(grouped, f, after = 1L, ref_time_values = d + 2) + ) + expect_no_warning( + ref3 <- epi_slide(grouped, f, + before = 0L, after = 0L, ref_time_values = d + 2 + ) + ) - expect_no_warning(opt1 <- epi_slide_mean(grouped, col_names = "value", before = 1L, ref_time_values = d + 2, na.rm = TRUE)) - expect_no_warning(opt2 <- epi_slide_mean(grouped, col_names = "value", after = 1L, ref_time_values = d + 2, na.rm = TRUE)) - expect_no_warning(opt3 <- epi_slide_mean(grouped, col_names = "value", before = 0L, after = 0L, ref_time_values = d + 2, na.rm = TRUE)) + expect_no_warning( + opt1 <- epi_slide_mean(grouped, + col_names = "value", + before = 1L, ref_time_values = d + 2, na.rm = TRUE + ) + ) + expect_no_warning( + opt2 <- epi_slide_mean(grouped, + col_names = "value", + after = 1L, ref_time_values = d + 2, na.rm = TRUE + ) + ) + expect_no_warning( + opt3 <- epi_slide_mean(grouped, + col_names = "value", + before = 0L, after = 0L, ref_time_values = d + 2, na.rm = TRUE + ) + ) # Results from epi_slide and epi_slide_mean should match expect_identical(select(ref1, -slide_value_count), opt1) @@ -168,8 +193,14 @@ test_that("Both `before` and `after` must be non-NA, non-negative, integer-compa ) # Non-integer-class but integer-compatible values are allowed: - expect_no_error(ref <- epi_slide(grouped, f, before = 1, after = 1, ref_time_values = d + 2L)) - expect_no_error(opt <- epi_slide_mean(grouped, col_names = "value", before = 1, after = 1, 
ref_time_values = d + 2L, na.rm = TRUE)) + expect_no_error( + ref <- epi_slide(grouped, f, before = 1, after = 1, ref_time_values = d + 2L) + ) + expect_no_error(opt <- epi_slide_mean( + grouped, + col_names = "value", before = 1, after = 1, + ref_time_values = d + 2L, na.rm = TRUE + )) # Results from epi_slide and epi_slide_mean should match expect_identical(select(ref, -slide_value_count), opt) @@ -195,42 +226,50 @@ test_that("`ref_time_values` + `before` + `after` that result in no slide data, ) # beyond the last, no data in window }) -test_that("`ref_time_values` + `before` + `after` that have some slide data, but generate the error due to ref. time being out of time range (would also happen if they were in between `time_value`s)", { - expect_error( - epi_slide(grouped, f, before = 0L, after = 2L, ref_time_values = d), - "`ref_time_values` must be a unique subset of the time values in `x`." - ) # before the first, but we'd expect there to be data in the window - expect_error( - epi_slide(grouped, f, before = 2L, ref_time_values = d + 201L), - "`ref_time_values` must be a unique subset of the time values in `x`." - ) # beyond the last, but still with data in window +test_that( + c( + "`ref_time_values` + `before` + `after` that have some slide data, but + generate the error due to ref. time being out of time range (would + also happen if they were in between `time_value`s)" + ), + { + expect_error( + epi_slide(grouped, f, before = 0L, after = 2L, ref_time_values = d), + "`ref_time_values` must be a unique subset of the time values in `x`." + ) # before the first, but we'd expect there to be data in the window + expect_error( + epi_slide(grouped, f, before = 2L, ref_time_values = d + 201L), + "`ref_time_values` must be a unique subset of the time values in `x`." + ) # beyond the last, but still with data in window - expect_error( - epi_slide_mean(grouped, "value", before = 0L, after = 2L, ref_time_values = d), - "`ref_time_values` must be a unique subset of the time values in `x`." - ) # before the first, but we'd expect there to be data in the window - expect_error( - epi_slide_mean(grouped, "value", before = 2L, ref_time_values = d + 201L), - "`ref_time_values` must be a unique subset of the time values in `x`." - ) # beyond the last, but still with data in window -}) + expect_error( + epi_slide_mean(grouped, "value", before = 0L, after = 2L, ref_time_values = d), + "`ref_time_values` must be a unique subset of the time values in `x`." + ) # before the first, but we'd expect there to be data in the window + expect_error( + epi_slide_mean(grouped, "value", before = 2L, ref_time_values = d + 201L), + "`ref_time_values` must be a unique subset of the time values in `x`." 
+ ) # beyond the last, but still with data in window + } +) ## --- These cases generate warnings (or not): --- test_that("Warn user against having a blank `before`", { - expect_no_warning(ref1 <- epi_slide(grouped, f, - after = 1L, - ref_time_values = d + 1L + expect_no_warning(ref1 <- epi_slide( + grouped, f, + after = 1L, ref_time_values = d + 1L )) - expect_no_warning(ref2 <- epi_slide(grouped, f, - before = 0L, after = 1L, - ref_time_values = d + 1L + expect_no_warning(ref2 <- epi_slide( + grouped, f, + before = 0L, after = 1L, ref_time_values = d + 1L )) - expect_no_warning(opt1 <- epi_slide_mean(grouped, "value", - after = 1L, - ref_time_values = d + 1L, na.rm = TRUE + expect_no_warning(opt1 <- epi_slide_mean( + grouped, "value", + after = 1L, ref_time_values = d + 1L, na.rm = TRUE )) - expect_no_warning(opt2 <- epi_slide_mean(grouped, "value", + expect_no_warning(opt2 <- epi_slide_mean( + grouped, "value", before = 0L, after = 1L, ref_time_values = d + 1L, na.rm = TRUE )) @@ -241,33 +280,39 @@ test_that("Warn user against having a blank `before`", { }) ## --- These cases doesn't generate the error: --- -test_that("these doesn't produce an error; the error appears only if the ref time values are out of the range for every group", { - expect_identical( - epi_slide(grouped, f, before = 2L, ref_time_values = d + 200L) %>% - ungroup() %>% - dplyr::select("geo_value", "slide_value_value"), - dplyr::tibble(geo_value = "ak", slide_value_value = 199) - ) # out of range for one group - expect_identical( - epi_slide(grouped, f, before = 2L, ref_time_values = d + 3) %>% - ungroup() %>% - dplyr::select("geo_value", "slide_value_value"), - dplyr::tibble(geo_value = c("ak", "al"), slide_value_value = c(2, -2)) - ) # not out of range for either group - - expect_identical( - epi_slide_mean(grouped, "value", before = 2L, ref_time_values = d + 200L, na.rm = TRUE) %>% - ungroup() %>% - dplyr::select("geo_value", "slide_value_value"), - dplyr::tibble(geo_value = "ak", slide_value_value = 199) - ) # out of range for one group - expect_identical( - epi_slide_mean(grouped, "value", before = 2L, ref_time_values = d + 3, na.rm = TRUE) %>% - ungroup() %>% - dplyr::select("geo_value", "slide_value_value"), - dplyr::tibble(geo_value = c("ak", "al"), slide_value_value = c(2, -2)) - ) # not out of range for either group -}) +test_that( + c( + "these doesn't produce an error; the error appears only if the ref time + values are out of the range for every group" + ), + { + expect_identical( + epi_slide(grouped, f, before = 2L, ref_time_values = d + 200L) %>% + ungroup() %>% + dplyr::select("geo_value", "slide_value_value"), + dplyr::tibble(geo_value = "ak", slide_value_value = 199) + ) # out of range for one group + expect_identical( + epi_slide(grouped, f, before = 2L, ref_time_values = d + 3) %>% + ungroup() %>% + dplyr::select("geo_value", "slide_value_value"), + dplyr::tibble(geo_value = c("ak", "al"), slide_value_value = c(2, -2)) + ) # not out of range for either group + + expect_identical( + epi_slide_mean(grouped, "value", before = 2L, ref_time_values = d + 200L, na.rm = TRUE) %>% + ungroup() %>% + dplyr::select("geo_value", "slide_value_value"), + dplyr::tibble(geo_value = "ak", slide_value_value = 199) + ) # out of range for one group + expect_identical( + epi_slide_mean(grouped, "value", before = 2L, ref_time_values = d + 3, na.rm = TRUE) %>% + ungroup() %>% + dplyr::select("geo_value", "slide_value_value"), + dplyr::tibble(geo_value = c("ak", "al"), slide_value_value = c(2, -2)) + ) # not out of range 
for either group + } +) test_that("computation output formats x as_list_col", { # See `toy_edf` and `basic_result_from_size1_sum` definitions at top of file. @@ -294,21 +339,28 @@ test_that("computation output formats x as_list_col", { # We'll try 7d avg with a few formats. # Warning: not exactly the same naming behavior as `epi_slide`. expect_identical( - toy_edf %>% filter( - geo_value == "a" - ) %>% epi_slide_mean( - "value", before = 6L, na.rm = TRUE - ), + toy_edf %>% + filter( + geo_value == "a" + ) %>% + epi_slide_mean( + "value", + before = 6L, na.rm = TRUE + ), basic_result_from_size1_mean %>% dplyr::mutate( slide_value_value = slide_value - ) %>% select(-slide_value) + ) %>% + select(-slide_value) ) expect_error( - toy_edf %>% filter( - geo_value == "a" - ) %>% epi_slide_mean( - "value", before = 6L, as_list_col = TRUE, na.rm = TRUE - ), + toy_edf %>% + filter( + geo_value == "a" + ) %>% + epi_slide_mean( + "value", + before = 6L, as_list_col = TRUE, na.rm = TRUE + ), class = "epiproces__epi_slide_mean__list_not_supported" ) # `epi_slide_mean` doesn't return dataframe columns @@ -316,25 +368,29 @@ test_that("computation output formats x as_list_col", { test_that("nested dataframe output names are controllable", { expect_identical( - toy_edf %>% epi_slide( - before = 6L, ~ data.frame(value = sum(.x$value)), - new_col_name = "result" - ), + toy_edf %>% + epi_slide( + before = 6L, ~ data.frame(value = sum(.x$value)), + new_col_name = "result" + ), basic_result_from_size1_sum %>% rename(result_value = slide_value) ) expect_identical( - toy_edf %>% epi_slide( - before = 6L, ~ data.frame(value_sum = sum(.x$value)), - names_sep = NULL - ), + toy_edf %>% + epi_slide( + before = 6L, ~ data.frame(value_sum = sum(.x$value)), + names_sep = NULL + ), basic_result_from_size1_sum %>% rename(value_sum = slide_value) ) expect_identical( toy_edf %>% filter( geo_value == "a" - ) %>% epi_slide_mean( - "value", before = 6L, names_sep = NULL, na.rm = TRUE - ), + ) %>% + epi_slide_mean( + "value", + before = 6L, names_sep = NULL, na.rm = TRUE + ), basic_result_from_size1_mean ) }) @@ -421,27 +477,33 @@ test_that("`ref_time_values` + `all_rows = TRUE` works", { expect_identical( toy_edf %>% filter( geo_value == "a" - ) %>% epi_slide_mean( - "value", before = 6L, names_sep = NULL, na.rm = TRUE - ), + ) %>% + epi_slide_mean( + "value", + before = 6L, names_sep = NULL, na.rm = TRUE + ), basic_result_from_size1_mean ) expect_identical( toy_edf %>% filter( geo_value == "a" - ) %>% epi_slide_mean( - "value", before = 6L, ref_time_values = c(2L, 8L), - names_sep = NULL, na.rm = TRUE - ), + ) %>% + epi_slide_mean( + "value", + before = 6L, ref_time_values = c(2L, 8L), + names_sep = NULL, na.rm = TRUE + ), filter(basic_result_from_size1_mean, time_value %in% c(2L, 8L)) ) expect_identical( toy_edf %>% filter( geo_value == "a" - ) %>% epi_slide_mean( - "value", before = 6L, ref_time_values = c(2L, 8L), all_rows = TRUE, - names_sep = NULL, na.rm = TRUE - ), + ) %>% + epi_slide_mean( + "value", + before = 6L, ref_time_values = c(2L, 8L), all_rows = TRUE, + names_sep = NULL, na.rm = TRUE + ), basic_result_from_size1_mean %>% dplyr::mutate(slide_value = dplyr::if_else(time_value %in% c(2L, 8L), slide_value, NA_integer_ @@ -899,7 +961,7 @@ test_that("results for different `before`s and `after`s match between epi_slide n <- 15 # Max date index m <- 3 # Number of missing dates n_obs <- n + 1 - m # Number of obs created - k <- c(0:(n-(m + 1)), n) # Date indices + k <- c(0:(n - (m + 1)), n) # Date indices # Basic time type 
days <- as.Date("2022-01-01") + k @@ -907,7 +969,7 @@ test_that("results for different `before`s and `after`s match between epi_slide set.seed(0) rand_vals <- rnorm(n_obs) - test_time_type_mean <- function (dates, vals, before = 6L, after = 0L, ...) { + test_time_type_mean <- function(dates, vals, before = 6L, after = 0L, ...) { # Three states, with 2 variables. a is linear, going up in one state and down in the other # b is just random. date 10 is missing epi_data <- epiprocess::as_epi_df(rbind(tibble( @@ -925,13 +987,15 @@ test_that("results for different `before`s and `after`s match between epi_slide # Use the `epi_slide` result as a reference. result1 <- epi_slide(epi_data, ~ data.frame( - slide_value_a = mean(.x$a, rm.na = TRUE), - slide_value_b = mean(.x$b, rm.na = TRUE) - ), - before = before, after = after, names_sep = NULL, ...) + slide_value_a = mean(.x$a, rm.na = TRUE), + slide_value_b = mean(.x$b, rm.na = TRUE) + ), + before = before, after = after, names_sep = NULL, ... + ) result2 <- epi_slide_mean(epi_data, col_names = c("a", "b"), na.rm = TRUE, - before = before, after = after, ...) + before = before, after = after, ... + ) expect_identical(result1, result2) } @@ -946,7 +1010,7 @@ test_that("results for different `before`s and `after`s match between epi_slide n <- 15 # Max date index m <- 0 # Number of missing dates n_obs <- n + 1 - m # Number of obs created - k <- c(0:(n-(m + 1)), n) # Date indices + k <- c(0:(n - (m + 1)), n) # Date indices # Basic time type days <- as.Date("2022-01-01") + k @@ -964,7 +1028,7 @@ test_that("results for different time_types match between epi_slide and epi_slid n <- 6L # Max date index m <- 1L # Number of missing dates n_obs <- n + 1L - m # Number of obs created - k <- c(0L:(n-(m + 1L)), n) # Date indices + k <- c(0L:(n - (m + 1L)), n) # Date indices set.seed(0) rand_vals <- rnorm(n_obs) @@ -973,17 +1037,17 @@ test_that("results for different time_types match between epi_slide and epi_slid epiprocess::as_epi_df(rbind(tibble( geo_value = "al", time_value = date_seq, - a = 1:length(date_seq), + a = seq_along(date_seq), b = rand_vals ), tibble( geo_value = "ca", time_value = date_seq, - a = length(date_seq):1, + a = rev(seq_along(date_seq)), b = rand_vals + 10 ), tibble( geo_value = "fl", time_value = date_seq, - a = length(date_seq):1, + a = rev(seq_along(date_seq)), b = rand_vals * 2 )), ...) } @@ -1013,25 +1077,28 @@ test_that("results for different time_types match between epi_slide and epi_slid group_by(geo_value) ref_result <- epi_slide(ref_epi_data, ~ data.frame( - slide_value_a = mean(.x$a, rm.na = TRUE), - slide_value_b = mean(.x$b, rm.na = TRUE) - ), - before = 6L, after = 0L, names_sep = NULL) + slide_value_a = mean(.x$a, rm.na = TRUE), + slide_value_b = mean(.x$b, rm.na = TRUE) + ), + before = 6L, after = 0L, names_sep = NULL + ) - test_time_type_mean <- function (dates, before = 6L, after = 0L, ...) { + test_time_type_mean <- function(dates, before = 6L, after = 0L, ...) { # Three states, with 2 variables. a is linear, going up in one state and down in the other # b is just random. date 10 is missing epi_data <- generate_special_date_data(dates) %>% group_by(geo_value) result1 <- epi_slide(epi_data, ~ data.frame( - slide_value_a = mean(.x$a, rm.na = TRUE), - slide_value_b = mean(.x$b, rm.na = TRUE) - ), - before = before, after = after, names_sep = NULL, ...) + slide_value_a = mean(.x$a, rm.na = TRUE), + slide_value_b = mean(.x$b, rm.na = TRUE) + ), + before = before, after = after, names_sep = NULL, ... 
+ ) result2 <- epi_slide_mean(epi_data, col_names = c("a", "b"), na.rm = TRUE, - before = before, after = after, ...) + before = before, after = after, ... + ) expect_identical(result1, result2) # All fields except dates @@ -1054,7 +1121,8 @@ test_that("results for different time_types match between epi_slide and epi_slid group_by(geo_value) result2 <- epi_slide_mean(epi_data, col_names = c("a", "b"), na.rm = TRUE, - before = 6L, after = 0L) + before = 6L, after = 0L + ) expect_identical(select(ref_result, -time_value), select(result2, -time_value)) }) @@ -1072,7 +1140,7 @@ test_that("special time_types without time_step fail in epi_slide_mean", { ) not_dates <- c("a", "b", "c", "d", "e", "f") - test_time_type_mean <- function (dates, before = 6L, after = 0L, ...) { + test_time_type_mean <- function(dates, before = 6L, after = 0L, ...) { epi_data <- epiprocess::as_epi_df(tibble( geo_value = "al", time_value = dates, @@ -1080,12 +1148,12 @@ test_that("special time_types without time_step fail in epi_slide_mean", { )) expect_error( - epi_slide_mean(epi_data, col_names = "a", + epi_slide_mean(epi_data, + col_names = "a", before = before, after = after ), class = "epiprocess__epi_slide_mean__unmappable_time_type" ) - } test_time_type_mean(custom_dates) @@ -1098,7 +1166,7 @@ test_that("helper `full_date_seq` returns expected date values", { n <- 6L # Max date index m <- 1L # Number of missing dates n_obs <- n + 1L - m # Number of obs created - k <- c(0L:(n-(m + 1L)), n) # Date indices + k <- c(0L:(n - (m + 1L)), n) # Date indices set.seed(0) rand_vals <- rnorm(n_obs) @@ -1107,17 +1175,17 @@ test_that("helper `full_date_seq` returns expected date values", { epiprocess::as_epi_df(rbind(tibble( geo_value = "al", time_value = date_seq, - a = 1:length(date_seq), + a = seq_along(date_seq), b = rand_vals ), tibble( geo_value = "ca", time_value = date_seq, - a = length(date_seq):1, + a = rev(seq_along(date_seq)), b = rand_vals + 10 ), tibble( geo_value = "fl", time_value = date_seq, - a = length(date_seq):1, + a = rev(seq_along(date_seq)), b = rand_vals * 2 )), ...) 
} @@ -1141,17 +1209,22 @@ test_that("helper `full_date_seq` returns expected date values", { expect_identical( full_date_seq( - generate_special_date_data(days), before = before, after = after + generate_special_date_data(days), + before = before, after = after ), list( - all_dates = as.Date(c("2022-01-01", "2022-01-02", "2022-01-03", "2022-01-04", "2022-01-05", "2022-01-06", "2022-01-07")), + all_dates = as.Date(c( + "2022-01-01", "2022-01-02", "2022-01-03", "2022-01-04", + "2022-01-05", "2022-01-06", "2022-01-07" + )), pad_early_dates = as.Date(c("2021-12-30", "2021-12-31")), pad_late_dates = as.Date(c("2022-01-08")) ) ) expect_identical( full_date_seq( - generate_special_date_data(yearweeks), before = before, after = after + generate_special_date_data(yearweeks), + before = before, after = after ), list( all_dates = tsibble::yearweek(10:16), @@ -1161,7 +1234,8 @@ test_that("helper `full_date_seq` returns expected date values", { ) expect_identical( full_date_seq( - generate_special_date_data(yearmonths), before = before, after = after + generate_special_date_data(yearmonths), + before = before, after = after ), list( all_dates = tsibble::yearmonth(10:16), @@ -1171,7 +1245,8 @@ test_that("helper `full_date_seq` returns expected date values", { ) expect_identical( full_date_seq( - generate_special_date_data(yearquarters), before = before, after = after + generate_special_date_data(yearquarters), + before = before, after = after ), list( all_dates = tsibble::yearquarter(10:16), @@ -1181,7 +1256,8 @@ test_that("helper `full_date_seq` returns expected date values", { ) expect_identical( full_date_seq( - generate_special_date_data(years), before = before, after = after + generate_special_date_data(years), + before = before, after = after ), list( all_dates = 2000L:2006L, @@ -1191,7 +1267,8 @@ test_that("helper `full_date_seq` returns expected date values", { ) expect_identical( full_date_seq( - generate_special_date_data(day_times_minute), before = before, after = after, + generate_special_date_data(day_times_minute), + before = before, after = after, time_step = lubridate::minutes ), list( @@ -1202,7 +1279,8 @@ test_that("helper `full_date_seq` returns expected date values", { ) expect_identical( full_date_seq( - generate_special_date_data(day_times_hour), before = before, after = after, + generate_special_date_data(day_times_hour), + before = before, after = after, time_step = lubridate::hours ), list( @@ -1213,11 +1291,15 @@ test_that("helper `full_date_seq` returns expected date values", { ) expect_identical( full_date_seq( - generate_special_date_data(weeks), before = before, after = after, + generate_special_date_data(weeks), + before = before, after = after, time_step = lubridate::weeks ), list( - all_dates = as.Date(c("2022-01-01", "2022-01-08", "2022-01-15", "2022-01-22", "2022-01-29", "2022-02-05", "2022-02-12")), + all_dates = as.Date(c( + "2022-01-01", "2022-01-08", "2022-01-15", "2022-01-22", + "2022-01-29", "2022-02-05", "2022-02-12" + )), pad_early_dates = as.Date(c("2021-12-18", "2021-12-25")), pad_late_dates = as.Date(c("2022-02-19")) ) @@ -1225,10 +1307,14 @@ test_that("helper `full_date_seq` returns expected date values", { # Check the middle branch (`if (missing(time_step))`) of `full_date_seq`. 
expect_identical( full_date_seq( - generate_special_date_data(weeks, time_type = "week"), before = before, after = after + generate_special_date_data(weeks, time_type = "week"), + before = before, after = after ), list( - all_dates = as.Date(c("2022-01-01", "2022-01-08", "2022-01-15", "2022-01-22", "2022-01-29", "2022-02-05", "2022-02-12")), + all_dates = as.Date(c( + "2022-01-01", "2022-01-08", "2022-01-15", "2022-01-22", + "2022-01-29", "2022-02-05", "2022-02-12" + )), pad_early_dates = as.Date(c("2021-12-18", "2021-12-25")), pad_late_dates = as.Date(c("2022-02-19")) ) @@ -1237,17 +1323,17 @@ test_that("helper `full_date_seq` returns expected date values", { test_that("`epi_slide_mean` errors when passed `time_values` with closer than expected spacing", { time_df <- tibble( - geo_value = 1, - value = c(0:7, 3.5, 10, 20), - # Adding the value 3.5 creates a time that has fractional seconds, which - # doesn't follow the expected 1-second spacing of the `time_values`. - # This leads to `frollmean` using obs spanning less than the expected - # time frame for some computation windows. - time_value = Sys.time() + value - ) %>% - as_epi_df() - expect_error( - epi_slide_mean(time_df, "value", before = 6L, time_step = lubridate::seconds), - class = "epiprocess__epi_slide_mean__unexpected_row_number" - ) + geo_value = 1, + value = c(0:7, 3.5, 10, 20), + # Adding the value 3.5 creates a time that has fractional seconds, which + # doesn't follow the expected 1-second spacing of the `time_values`. + # This leads to `frollmean` using obs spanning less than the expected + # time frame for some computation windows. + time_value = Sys.time() + value + ) %>% + as_epi_df() + expect_error( + epi_slide_mean(time_df, "value", before = 6L, time_step = lubridate::seconds), + class = "epiprocess__epi_slide_mean__unexpected_row_number" + ) }) From b44748424174f3f6c9f24c7ab976260f4c1c8228 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 20 Mar 2024 18:48:04 -0400 Subject: [PATCH 209/345] rearrange test vars for clarity --- tests/testthat/test-epi_slide.R | 58 ++++++++++++++++----------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/tests/testthat/test-epi_slide.R b/tests/testthat/test-epi_slide.R index 8b509452..ae94402b 100644 --- a/tests/testthat/test-epi_slide.R +++ b/tests/testthat/test-epi_slide.R @@ -957,31 +957,19 @@ test_that("epi_slide gets correct ref_time_value when groups have non-overlappin }) test_that("results for different `before`s and `after`s match between epi_slide and epi_slide_mean", { - # 3 missing dates - n <- 15 # Max date index - m <- 3 # Number of missing dates - n_obs <- n + 1 - m # Number of obs created - k <- c(0:(n - (m + 1)), n) # Date indices - - # Basic time type - days <- as.Date("2022-01-01") + k - - set.seed(0) - rand_vals <- rnorm(n_obs) - - test_time_type_mean <- function(dates, vals, before = 6L, after = 0L, ...) { + test_time_type_mean <- function(dates, vals, before = 6L, after = 0L, n, m, n_obs, k, ...) { # Three states, with 2 variables. a is linear, going up in one state and down in the other - # b is just random. date 10 is missing + # b is just random. 
last (m-1):(n-1) dates are missing epi_data <- epiprocess::as_epi_df(rbind(tibble( geo_value = "al", time_value = dates, a = 1:n_obs, - b = rand_vals + b = vals ), tibble( geo_value = "ca", time_value = dates, a = n_obs:1, - b = rand_vals + 10 + b = vals + 10 ))) %>% group_by(geo_value) @@ -999,12 +987,24 @@ test_that("results for different `before`s and `after`s match between epi_slide expect_identical(result1, result2) } - test_time_type_mean(days, rand_vals, before = 6, after = 0) - test_time_type_mean(days, rand_vals, before = 6, after = 1) - test_time_type_mean(days, rand_vals, before = 6, after = 6) - test_time_type_mean(days, rand_vals, before = 1, after = 6) - test_time_type_mean(days, rand_vals, before = 0, after = 6) - test_time_type_mean(days, rand_vals, before = 0, after = 1) + set.seed(0) + + # 3 missing dates + n <- 15 # Max date index + m <- 3 # Number of missing dates + n_obs <- n + 1 - m # Number of obs created + k <- c(0:(n - (m + 1)), n) # Date indices + + rand_vals <- rnorm(n_obs) + # Basic time type + days <- as.Date("2022-01-01") + k + + test_time_type_mean(days, rand_vals, before = 6, after = 0, n = n, m = m, n_obs = n_obs, k = k) + test_time_type_mean(days, rand_vals, before = 6, after = 1, n = n, m = m, n_obs = n_obs, k = k) + test_time_type_mean(days, rand_vals, before = 6, after = 6, n = n, m = m, n_obs = n_obs, k = k) + test_time_type_mean(days, rand_vals, before = 1, after = 6, n = n, m = m, n_obs = n_obs, k = k) + test_time_type_mean(days, rand_vals, before = 0, after = 6, n = n, m = m, n_obs = n_obs, k = k) + test_time_type_mean(days, rand_vals, before = 0, after = 1, n = n, m = m, n_obs = n_obs, k = k) # Without any missing dates n <- 15 # Max date index @@ -1012,16 +1012,16 @@ test_that("results for different `before`s and `after`s match between epi_slide n_obs <- n + 1 - m # Number of obs created k <- c(0:(n - (m + 1)), n) # Date indices + rand_vals <- rnorm(n_obs) # Basic time type days <- as.Date("2022-01-01") + k - rand_vals <- rnorm(n_obs) - test_time_type_mean(days, rand_vals, before = 6, after = 0) - test_time_type_mean(days, rand_vals, before = 6, after = 1) - test_time_type_mean(days, rand_vals, before = 6, after = 6) - test_time_type_mean(days, rand_vals, before = 1, after = 6) - test_time_type_mean(days, rand_vals, before = 0, after = 6) - test_time_type_mean(days, rand_vals, before = 0, after = 1) + test_time_type_mean(days, rand_vals, before = 6, after = 0, n = n, m = m, n_obs = n_obs, k = k) + test_time_type_mean(days, rand_vals, before = 6, after = 1, n = n, m = m, n_obs = n_obs, k = k) + test_time_type_mean(days, rand_vals, before = 6, after = 6, n = n, m = m, n_obs = n_obs, k = k) + test_time_type_mean(days, rand_vals, before = 1, after = 6, n = n, m = m, n_obs = n_obs, k = k) + test_time_type_mean(days, rand_vals, before = 0, after = 6, n = n, m = m, n_obs = n_obs, k = k) + test_time_type_mean(days, rand_vals, before = 0, after = 1, n = n, m = m, n_obs = n_obs, k = k) }) test_that("results for different time_types match between epi_slide and epi_slide_mean", { From f5d7a8f5ee8c3d8660ca8a00802cf27ef6aaf189 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 20 Mar 2024 18:49:39 -0400 Subject: [PATCH 210/345] add actionable example to duplicate time value error --- R/slide.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/slide.R b/R/slide.R index 9d5a4e2a..ac03fe64 100644 --- a/R/slide.R +++ b/R/slide.R @@ -640,7 +640,8 @@ epi_slide_mean <- function(x, col_names, ..., 
before, after, ref_time_values, "group contains duplicate time values. Using `epi_slide_mean` on this group will result in incorrect results", "i" = "Please change the grouping structure of the input data so that - each group has non-duplicate time values", + each group has non-duplicate time values (e.g. `x %>% group_by(geo_value) + %>% epi_slide_mean`)", "i" = "Use `epi_slide` to aggregate across groups" ), class = "epiprocess__epi_slide_mean__duplicate_time_values", From c04b2adb2f86cd99e61ca971156776e98fad9f7e Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 20 Mar 2024 19:01:42 -0400 Subject: [PATCH 211/345] check other before/after values for full_date_seq helper --- R/slide.R | 2 +- tests/testthat/test-epi_slide.R | 42 +++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 1 deletion(-) diff --git a/R/slide.R b/R/slide.R index ac03fe64..5ae6377e 100644 --- a/R/slide.R +++ b/R/slide.R @@ -722,7 +722,7 @@ full_date_seq <- function(x, before, after, time_step) { # `tsibble` classes apparently can't be added to in different units, so even # if `time_step` is provided by the user, use a unit step. if (inherits(x$time_value, c("yearquarter", "yearweek", "yearmonth")) || - is.numeric(x$time_value)) { + is.numeric(x$time_value)) { all_dates <- seq(min(x$time_value), max(x$time_value), by = 1L) if (before != 0) { diff --git a/tests/testthat/test-epi_slide.R b/tests/testthat/test-epi_slide.R index ae94402b..74b8d8bc 100644 --- a/tests/testthat/test-epi_slide.R +++ b/tests/testthat/test-epi_slide.R @@ -1319,6 +1319,48 @@ test_that("helper `full_date_seq` returns expected date values", { pad_late_dates = as.Date(c("2022-02-19")) ) ) + + # Other before/after values + before <- 5L + after <- 0L + + expect_identical( + full_date_seq( + generate_special_date_data(days), + before = before, after = after + ), + list( + all_dates = as.Date(c( + "2022-01-01", "2022-01-02", "2022-01-03", "2022-01-04", + "2022-01-05", "2022-01-06", "2022-01-07" + )), + pad_early_dates = as.Date(c( + "2021-12-27", "2021-12-28", "2021-12-29", "2021-12-30", + "2021-12-31" + )), + pad_late_dates = NULL + ) + ) + + before <- 0L + after <- 3L + + expect_identical( + full_date_seq( + generate_special_date_data(days), + before = before, after = after + ), + list( + all_dates = as.Date(c( + "2022-01-01", "2022-01-02", "2022-01-03", "2022-01-04", + "2022-01-05", "2022-01-06", "2022-01-07" + )), + pad_early_dates = NULL, + pad_late_dates = as.Date(c( + "2022-01-08", "2022-01-09", "2022-01-10" + )) + ) + ) }) test_that("`epi_slide_mean` errors when passed `time_values` with closer than expected spacing", { From c6ee7f91013dd1438158a8c2cd64567968146f69 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 21 Mar 2024 17:28:43 -0400 Subject: [PATCH 212/345] check 0-row input data in epi_slide_mean --- R/slide.R | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/R/slide.R b/R/slide.R index 5ae6377e..584d3664 100644 --- a/R/slide.R +++ b/R/slide.R @@ -530,8 +530,20 @@ epi_slide_mean <- function(x, col_names, ..., before, after, ref_time_values, names_sep = "_", all_rows = FALSE) { assert_class(x, "epi_df") + if (nrow(x) == 0L) { + cli_abort( + c( + "input data `x` unexpectedly has 0 rows", + "i" = "If this computation is occuring within an `epix_slide` call, + check that `epix_slide` `ref_time_values` argument was set appropriately" + ), + class = "epiprocess__epi_slide_mean__0_row_input", + 
epiprocess__x = x + ) + } + if (!is.null(as_list_col)) { - cli::cli_abort( + cli_abort( "`as_list_col` is not supported for `epi_slide_mean`", class = "epiproces__epi_slide_mean__list_not_supported" ) @@ -720,7 +732,7 @@ full_date_seq <- function(x, before, after, time_step) { # unit) of the date class. For example, one step = 1 quarter for `yearquarter`. # # `tsibble` classes apparently can't be added to in different units, so even - # if `time_step` is provided by the user, use a unit step. + # if `time_step` is provided by the user, use a value-1 unitless step. if (inherits(x$time_value, c("yearquarter", "yearweek", "yearmonth")) || is.numeric(x$time_value)) { all_dates <- seq(min(x$time_value), max(x$time_value), by = 1L) From 56bed8c3ef2b780eb05cd3cd8dc34514eb6374c3 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Sat, 23 Mar 2024 10:27:03 -0400 Subject: [PATCH 213/345] support col_names as tidyselect --- NAMESPACE | 2 + R/slide.R | 46 +++++++++++------ man/epi_slide_mean.Rd | 15 +++--- tests/testthat/test-epi_slide.R | 87 ++++++++++++++++++--------------- 4 files changed, 89 insertions(+), 61 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 8de30ed3..cf59f29d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -132,6 +132,7 @@ importFrom(rlang,"!!") importFrom(rlang,.data) importFrom(rlang,.env) importFrom(rlang,arg_match) +importFrom(rlang,as_label) importFrom(rlang,caller_arg) importFrom(rlang,caller_env) importFrom(rlang,enquo) @@ -146,6 +147,7 @@ importFrom(rlang,is_missing) importFrom(rlang,is_quosure) importFrom(rlang,missing_arg) importFrom(rlang,new_function) +importFrom(rlang,quo_get_expr) importFrom(rlang,quo_is_missing) importFrom(rlang,sym) importFrom(rlang,syms) diff --git a/R/slide.R b/R/slide.R index 584d3664..bfd74905 100644 --- a/R/slide.R +++ b/R/slide.R @@ -393,8 +393,8 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' @param x The `epi_df` object under consideration, [grouped][dplyr::group_by] #' or ungrouped. If ungrouped, all data in `x` will be treated as part of a #' single data group. -#' @param col_names A character vector of the names of one or more columns for -#' which to calculate the rolling mean. +#' @param col_names A single tidyselection or a tidyselection vector of the +#' names of one or more columns for which to calculate the rolling mean. #' @param ... Additional arguments to pass to `data.table::frollmean`, for #' example, `na.rm` and `algo`. `data.table::frollmean` is automatically #' passed the data `x` to operate on, the window size `n`, and the alignment @@ -473,7 +473,8 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' leading window was intended, but the `after` argument was forgotten or #' misspelled.) #' -#' @importFrom dplyr bind_rows mutate %>% arrange tibble +#' @importFrom dplyr bind_rows mutate %>% arrange tibble select +#' @importFrom rlang enquo quo_get_expr as_label #' @importFrom purrr map #' @importFrom data.table frollmean #' @importFrom lubridate as.period @@ -484,7 +485,7 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' # slide a 7-day trailing average formula on cases #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide_mean("cases", new_col_names = "cases_7dav", names_sep = NULL, before = 6) %>% +#' epi_slide_mean(cases, new_col_names = "cases_7dav", names_sep = NULL, before = 6) %>% #' # Remove a nonessential var. 
to ensure new col is printed #' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% #' ungroup() @@ -493,7 +494,7 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' # and accuracy, and to allow partially-missing windows. #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide_mean("cases", +#' epi_slide_mean(cases, #' new_col_names = "cases_7dav", names_sep = NULL, before = 6, #' # `frollmean` options #' na.rm = TRUE, algo = "exact", hasNA = TRUE @@ -504,7 +505,7 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' # slide a 7-day leading average #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide_mean("cases", new_col_names = "cases_7dav", names_sep = NULL, after = 6) %>% +#' epi_slide_mean(cases, new_col_names = "cases_7dav", names_sep = NULL, after = 6) %>% #' # Remove a nonessential var. to ensure new col is printed #' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% #' ungroup() @@ -512,7 +513,7 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' # slide a 7-day centre-aligned average #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide_mean("cases", new_col_names = "cases_7dav", names_sep = NULL, before = 3, after = 3) %>% +#' epi_slide_mean(cases, new_col_names = "cases_7dav", names_sep = NULL, before = 3, after = 3) %>% #' # Remove a nonessential var. to ensure new col is printed #' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% #' ungroup() @@ -520,7 +521,7 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' # slide a 14-day centre-aligned average #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide_mean("cases", new_col_names = "cases_14dav", names_sep = NULL, before = 6, after = 7) %>% +#' epi_slide_mean(cases, new_col_names = "cases_14dav", names_sep = NULL, before = 6, after = 7) %>% #' # Remove a nonessential var. to ensure new col is printed #' dplyr::select(geo_value, time_value, cases, cases_14dav) %>% #' ungroup() @@ -604,8 +605,25 @@ epi_slide_mean <- function(x, col_names, ..., before, after, ref_time_values, # `before` and `after` params. m <- before + after + 1L + col_names_quo <- enquo(col_names) + col_names_chr <- as.character(rlang::quo_get_expr(col_names_quo)) + if (startsWith(rlang::as_label(col_names_quo), "c(")) { + # List or vector of col names. We need to drop the first element since it + # will be either "c" (if built as a vector) or "list" (if built as a + # list). + col_names_chr <- col_names_chr[-1] + } else if (startsWith(rlang::as_label(col_names_quo), "list(")) { + cli_abort( + "`col_names` must be a single tidy column name or a vector + (`c()`) of tidy column names", + class = "epiprocess__epi_slide_mean__col_names_in_list", + epiprocess__col_names = col_names_chr + ) + } + # If single column name, do nothing. 
+ if (is.null(names_sep)) { - if (length(new_col_names) != length(col_names)) { + if (length(new_col_names) != length(col_names_chr)) { cli_abort( c( "`new_col_names` must be the same length as `col_names` when @@ -613,20 +631,20 @@ epi_slide_mean <- function(x, col_names, ..., before, after, ref_time_values, ), class = "epiprocess__epi_slide_mean__col_names_length_mismatch", epiprocess__new_col_names = new_col_names, - epiprocess__col_names = col_names + epiprocess__col_names = col_names_chr ) } result_col_names <- new_col_names } else { - if (length(new_col_names) != 1L && length(new_col_names) != length(col_names)) { + if (length(new_col_names) != 1L && length(new_col_names) != length(col_names_chr)) { cli_abort( "`new_col_names` must be either length 1 or the same length as `col_names`.", class = "epiprocess__epi_slide_mean__col_names_length_mismatch_and_not_one", epiprocess__new_col_names = new_col_names, - epiprocess__col_names = col_names + epiprocess__col_names = col_names_chr ) } - result_col_names <- paste(new_col_names, col_names, sep = names_sep) + result_col_names <- paste(new_col_names, col_names_chr, sep = names_sep) } slide_one_grp <- function(.data_group, .group_key, ...) { @@ -675,7 +693,7 @@ epi_slide_mean <- function(x, col_names, ..., before, after, ref_time_values, } roll_output <- data.table::frollmean( - x = .data_group[, col_names], n = m, align = "right", ... + x = select(.data_group, {{ col_names }}), n = m, align = "right", ... ) if (after >= 1) { diff --git a/man/epi_slide_mean.Rd b/man/epi_slide_mean.Rd index 8fb0b33c..fd4b84ab 100644 --- a/man/epi_slide_mean.Rd +++ b/man/epi_slide_mean.Rd @@ -23,8 +23,8 @@ epi_slide_mean( or ungrouped. If ungrouped, all data in \code{x} will be treated as part of a single data group.} -\item{col_names}{A character vector of the names of one or more columns for -which to calculate the rolling mean.} +\item{col_names}{A single tidyselection or a tidyselection vector of the +names of one or more columns for which to calculate the rolling mean.} \item{...}{Additional arguments to pass to \code{data.table::frollmean}, for example, \code{na.rm} and \code{algo}. \code{data.table::frollmean} is automatically @@ -123,7 +123,7 @@ misspelled.) # slide a 7-day trailing average formula on cases jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide_mean("cases", new_col_names = "cases_7dav", names_sep = NULL, before = 6) \%>\% + epi_slide_mean(cases, new_col_names = "cases_7dav", names_sep = NULL, before = 6) \%>\% # Remove a nonessential var. to ensure new col is printed dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% ungroup() @@ -132,7 +132,8 @@ jhu_csse_daily_subset \%>\% # and accuracy, and to allow partially-missing windows. jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide_mean("cases", new_col_names = "cases_7dav", names_sep = NULL, before = 6, + epi_slide_mean(cases, + new_col_names = "cases_7dav", names_sep = NULL, before = 6, # `frollmean` options na.rm = TRUE, algo = "exact", hasNA = TRUE ) \%>\% @@ -142,7 +143,7 @@ jhu_csse_daily_subset \%>\% # slide a 7-day leading average jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide_mean("cases", new_col_names = "cases_7dav", names_sep = NULL, after = 6) \%>\% + epi_slide_mean(cases, new_col_names = "cases_7dav", names_sep = NULL, after = 6) \%>\% # Remove a nonessential var. 
to ensure new col is printed dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% ungroup() @@ -150,7 +151,7 @@ jhu_csse_daily_subset \%>\% # slide a 7-day centre-aligned average jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide_mean("cases", new_col_names = "cases_7dav", names_sep = NULL, before = 3, after = 3) \%>\% + epi_slide_mean(cases, new_col_names = "cases_7dav", names_sep = NULL, before = 3, after = 3) \%>\% # Remove a nonessential var. to ensure new col is printed dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% ungroup() @@ -158,7 +159,7 @@ jhu_csse_daily_subset \%>\% # slide a 14-day centre-aligned average jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide_mean("cases", new_col_names = "cases_14dav", names_sep = NULL, before = 6, after = 7) \%>\% + epi_slide_mean(cases, new_col_names = "cases_14dav", names_sep = NULL, before = 6, after = 7) \%>\% # Remove a nonessential var. to ensure new col is printed dplyr::select(geo_value, time_value, cases, cases_14dav) \%>\% ungroup() diff --git a/tests/testthat/test-epi_slide.R b/tests/testthat/test-epi_slide.R index 74b8d8bc..f727337c 100644 --- a/tests/testthat/test-epi_slide.R +++ b/tests/testthat/test-epi_slide.R @@ -59,11 +59,11 @@ test_that("`before` and `after` are both vectors of length 1", { ) expect_error( - epi_slide_mean(grouped, col_names = "value", before = c(0, 1), after = 0, ref_time_values = d + 3), + epi_slide_mean(grouped, col_names = value, before = c(0, 1), after = 0, ref_time_values = d + 3), "Assertion on 'before' failed: Must have length 1" ) expect_error( - epi_slide_mean(grouped, col_names = "value", before = 1, after = c(0, 1), ref_time_values = d + 3), + epi_slide_mean(grouped, col_names = value, before = 1, after = c(0, 1), ref_time_values = d + 3), "Assertion on 'after' failed: Must have length 1" ) }) @@ -83,15 +83,15 @@ test_that("Test errors/warnings for discouraged features", { ) expect_error( - epi_slide_mean(grouped, col_names = "value", ref_time_values = d + 1), + epi_slide_mean(grouped, col_names = value, ref_time_values = d + 1), "Either or both of `before`, `after` must be provided." 
) expect_warning( - epi_slide_mean(grouped, col_names = "value", before = 0L, ref_time_values = d + 1), + epi_slide_mean(grouped, col_names = value, before = 0L, ref_time_values = d + 1), "`before==0`, `after` missing" ) expect_warning( - epi_slide_mean(grouped, col_names = "value", after = 0L, ref_time_values = d + 1), + epi_slide_mean(grouped, col_names = value, after = 0L, ref_time_values = d + 1), "`before` missing, `after==0`" ) @@ -110,19 +110,19 @@ test_that("Test errors/warnings for discouraged features", { expect_no_warning( opt1 <- epi_slide_mean(grouped, - col_names = "value", + col_names = value, before = 1L, ref_time_values = d + 2, na.rm = TRUE ) ) expect_no_warning( opt2 <- epi_slide_mean(grouped, - col_names = "value", + col_names = value, after = 1L, ref_time_values = d + 2, na.rm = TRUE ) ) expect_no_warning( opt3 <- epi_slide_mean(grouped, - col_names = "value", + col_names = value, before = 0L, after = 0L, ref_time_values = d + 2, na.rm = TRUE ) ) @@ -164,31 +164,31 @@ test_that("Both `before` and `after` must be non-NA, non-negative, integer-compa ) expect_error( - epi_slide_mean(grouped, col_names = "value", before = -1L, ref_time_values = d + 2L), + epi_slide_mean(grouped, col_names = value, before = -1L, ref_time_values = d + 2L), "Assertion on 'before' failed: Element 1 is not >= 0" ) expect_error( - epi_slide_mean(grouped, col_names = "value", before = 2L, after = -1L, ref_time_values = d + 2L), + epi_slide_mean(grouped, col_names = value, before = 2L, after = -1L, ref_time_values = d + 2L), "Assertion on 'after' failed: Element 1 is not >= 0" ) - expect_error(epi_slide_mean(grouped, col_names = "value", before = "a", ref_time_values = d + 2L), + expect_error(epi_slide_mean(grouped, col_names = value, before = "a", ref_time_values = d + 2L), regexp = "before", class = "vctrs_error_incompatible_type" ) - expect_error(epi_slide_mean(grouped, col_names = "value", before = 1L, after = "a", ref_time_values = d + 2L), + expect_error(epi_slide_mean(grouped, col_names = value, before = 1L, after = "a", ref_time_values = d + 2L), regexp = "after", class = "vctrs_error_incompatible_type" ) - expect_error(epi_slide_mean(grouped, col_names = "value", before = 0.5, ref_time_values = d + 2L), + expect_error(epi_slide_mean(grouped, col_names = value, before = 0.5, ref_time_values = d + 2L), regexp = "before", class = "vctrs_error_incompatible_type" ) - expect_error(epi_slide_mean(grouped, col_names = "value", before = 1L, after = 0.5, ref_time_values = d + 2L), + expect_error(epi_slide_mean(grouped, col_names = value, before = 1L, after = 0.5, ref_time_values = d + 2L), regexp = "after", class = "vctrs_error_incompatible_type" ) expect_error( - epi_slide_mean(grouped, col_names = "value", before = NA, after = 1L, ref_time_values = d + 2L), + epi_slide_mean(grouped, col_names = value, before = NA, after = 1L, ref_time_values = d + 2L), "Assertion on 'before' failed: May not be NA" ) expect_error( - epi_slide_mean(grouped, col_names = "value", before = 1L, after = NA, ref_time_values = d + 2L), + epi_slide_mean(grouped, col_names = value, before = 1L, after = NA, ref_time_values = d + 2L), "Assertion on 'after' failed: May not be NA" ) @@ -198,7 +198,7 @@ test_that("Both `before` and `after` must be non-NA, non-negative, integer-compa ) expect_no_error(opt <- epi_slide_mean( grouped, - col_names = "value", before = 1, after = 1, + col_names = value, before = 1, after = 1, ref_time_values = d + 2L, na.rm = TRUE )) @@ -217,11 +217,11 @@ test_that("`ref_time_values` + `before` + 
`after` that result in no slide data, ) # beyond the last, no data in window expect_error( - epi_slide_mean(grouped, col_names = "value", before = 2L, ref_time_values = d), + epi_slide_mean(grouped, col_names = value, before = 2L, ref_time_values = d), "`ref_time_values` must be a unique subset of the time values in `x`." ) # before the first, no data in the slide windows expect_error( - epi_slide_mean(grouped, col_names = "value", before = 2L, ref_time_values = d + 207L), + epi_slide_mean(grouped, col_names = value, before = 2L, ref_time_values = d + 207L), "`ref_time_values` must be a unique subset of the time values in `x`." ) # beyond the last, no data in window }) @@ -243,11 +243,11 @@ test_that( ) # beyond the last, but still with data in window expect_error( - epi_slide_mean(grouped, "value", before = 0L, after = 2L, ref_time_values = d), + epi_slide_mean(grouped, value, before = 0L, after = 2L, ref_time_values = d), "`ref_time_values` must be a unique subset of the time values in `x`." ) # before the first, but we'd expect there to be data in the window expect_error( - epi_slide_mean(grouped, "value", before = 2L, ref_time_values = d + 201L), + epi_slide_mean(grouped, value, before = 2L, ref_time_values = d + 201L), "`ref_time_values` must be a unique subset of the time values in `x`." ) # beyond the last, but still with data in window } @@ -265,11 +265,11 @@ test_that("Warn user against having a blank `before`", { )) expect_no_warning(opt1 <- epi_slide_mean( - grouped, "value", + grouped, value, after = 1L, ref_time_values = d + 1L, na.rm = TRUE )) expect_no_warning(opt2 <- epi_slide_mean( - grouped, "value", + grouped, value, before = 0L, after = 1L, ref_time_values = d + 1L, na.rm = TRUE )) @@ -300,13 +300,13 @@ test_that( ) # not out of range for either group expect_identical( - epi_slide_mean(grouped, "value", before = 2L, ref_time_values = d + 200L, na.rm = TRUE) %>% + epi_slide_mean(grouped, value, before = 2L, ref_time_values = d + 200L, na.rm = TRUE) %>% ungroup() %>% dplyr::select("geo_value", "slide_value_value"), dplyr::tibble(geo_value = "ak", slide_value_value = 199) ) # out of range for one group expect_identical( - epi_slide_mean(grouped, "value", before = 2L, ref_time_values = d + 3, na.rm = TRUE) %>% + epi_slide_mean(grouped, value, before = 2L, ref_time_values = d + 3, na.rm = TRUE) %>% ungroup() %>% dplyr::select("geo_value", "slide_value_value"), dplyr::tibble(geo_value = c("ak", "al"), slide_value_value = c(2, -2)) @@ -344,7 +344,7 @@ test_that("computation output formats x as_list_col", { geo_value == "a" ) %>% epi_slide_mean( - "value", + value, before = 6L, na.rm = TRUE ), basic_result_from_size1_mean %>% dplyr::mutate( @@ -358,7 +358,7 @@ test_that("computation output formats x as_list_col", { geo_value == "a" ) %>% epi_slide_mean( - "value", + value, before = 6L, as_list_col = TRUE, na.rm = TRUE ), class = "epiproces__epi_slide_mean__list_not_supported" @@ -388,7 +388,7 @@ test_that("nested dataframe output names are controllable", { geo_value == "a" ) %>% epi_slide_mean( - "value", + value, before = 6L, names_sep = NULL, na.rm = TRUE ), basic_result_from_size1_mean @@ -479,7 +479,7 @@ test_that("`ref_time_values` + `all_rows = TRUE` works", { geo_value == "a" ) %>% epi_slide_mean( - "value", + value, before = 6L, names_sep = NULL, na.rm = TRUE ), basic_result_from_size1_mean @@ -489,7 +489,7 @@ test_that("`ref_time_values` + `all_rows = TRUE` works", { geo_value == "a" ) %>% epi_slide_mean( - "value", + value, before = 6L, ref_time_values = c(2L, 8L), 
names_sep = NULL, na.rm = TRUE ), @@ -500,7 +500,7 @@ test_that("`ref_time_values` + `all_rows = TRUE` works", { geo_value == "a" ) %>% epi_slide_mean( - "value", + value, before = 6L, ref_time_values = c(2L, 8L), all_rows = TRUE, names_sep = NULL, na.rm = TRUE ), @@ -661,7 +661,7 @@ test_that("basic grouped epi_slide_mean computation produces expected output", { group_by(geo_value) %>% as_epi_df(as_of = d + 6) - result1 <- epi_slide_mean(small_x, "value", before = 50, names_sep = NULL, na.rm = TRUE) + result1 <- epi_slide_mean(small_x, value, before = 50, names_sep = NULL, na.rm = TRUE) expect_identical(result1, expected_output) }) @@ -721,14 +721,14 @@ test_that("basic ungrouped epi_slide_mean computation produces expected output", result1 <- small_x %>% ungroup() %>% filter(geo_value == "ak") %>% - epi_slide_mean("value", before = 50, names_sep = NULL, na.rm = TRUE) + epi_slide_mean(value, before = 50, names_sep = NULL, na.rm = TRUE) expect_identical(result1, expected_output) # Ungrouped with multiple geos # epi_slide_mean fails when input data groups contain duplicate time_values, # e.g. aggregating across geos expect_error( - small_x %>% ungroup() %>% epi_slide_mean("value", before = 6L), + small_x %>% ungroup() %>% epi_slide_mean(value, before = 6L), class = "epiprocess__epi_slide_mean__duplicate_time_values" ) }) @@ -927,7 +927,7 @@ test_that("basic slide behavior is correct when groups have non-overlapping date result1 <- epi_slide(small_x_misaligned_dates, f = ~ mean(.x$value), before = 50) expect_identical(result1, expected_output) - result2 <- epi_slide_mean(small_x_misaligned_dates, "value", before = 50, names_sep = NULL, na.rm = TRUE) + result2 <- epi_slide_mean(small_x_misaligned_dates, value, before = 50, names_sep = NULL, na.rm = TRUE) expect_identical(result2, expected_output) }) @@ -981,7 +981,7 @@ test_that("results for different `before`s and `after`s match between epi_slide before = before, after = after, names_sep = NULL, ... ) result2 <- epi_slide_mean(epi_data, - col_names = c("a", "b"), na.rm = TRUE, + col_names = c(a, b), na.rm = TRUE, before = before, after = after, ... ) expect_identical(result1, result2) @@ -1096,7 +1096,7 @@ test_that("results for different time_types match between epi_slide and epi_slid before = before, after = after, names_sep = NULL, ... ) result2 <- epi_slide_mean(epi_data, - col_names = c("a", "b"), na.rm = TRUE, + col_names = c(a, b), na.rm = TRUE, before = before, after = after, ... 
) expect_identical(result1, result2) @@ -1120,7 +1120,7 @@ test_that("results for different time_types match between epi_slide and epi_slid epi_data <- generate_special_date_data(weeks) %>% group_by(geo_value) result2 <- epi_slide_mean(epi_data, - col_names = c("a", "b"), na.rm = TRUE, + col_names = c(a, b), na.rm = TRUE, before = 6L, after = 0L ) expect_identical(select(ref_result, -time_value), select(result2, -time_value)) @@ -1149,7 +1149,7 @@ test_that("special time_types without time_step fail in epi_slide_mean", { expect_error( epi_slide_mean(epi_data, - col_names = "a", + col_names = a, before = before, after = after ), class = "epiprocess__epi_slide_mean__unmappable_time_type" @@ -1375,7 +1375,14 @@ test_that("`epi_slide_mean` errors when passed `time_values` with closer than ex ) %>% as_epi_df() expect_error( - epi_slide_mean(time_df, "value", before = 6L, time_step = lubridate::seconds), + epi_slide_mean(time_df, value, before = 6L, time_step = lubridate::seconds), class = "epiprocess__epi_slide_mean__unexpected_row_number" ) }) + +test_that("`epi_slide_mean` errors when passed `col_names` as list", { + expect_error( + epi_slide_mean(grouped, col_names = list(value), before = 1L, after = 0L, ref_time_values = d + 1), + class = "epiprocess__epi_slide_mean__col_names_in_list" + ) +}) From 0125bee67ad3c8d68e7157bc1bf14ba5f557566e Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Tue, 26 Mar 2024 13:02:30 -0400 Subject: [PATCH 214/345] more descriptive frollmean window size name --- R/slide.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/slide.R b/R/slide.R index bfd74905..52b7bc95 100644 --- a/R/slide.R +++ b/R/slide.R @@ -603,7 +603,7 @@ epi_slide_mean <- function(x, col_names, ..., before, after, ref_time_values, # `frollmean` is 1-indexed, so create a new window width based on our # `before` and `after` params. - m <- before + after + 1L + window_size <- before + after + 1L col_names_quo <- enquo(col_names) col_names_chr <- as.character(rlang::quo_get_expr(col_names_quo)) @@ -693,7 +693,7 @@ epi_slide_mean <- function(x, col_names, ..., before, after, ref_time_values, } roll_output <- data.table::frollmean( - x = select(.data_group, {{ col_names }}), n = m, align = "right", ... + x = select(.data_group, {{ col_names }}), n = window_size, align = "right", ... ) if (after >= 1) { From 8ae9fbabee0d0c9873807a10bd5c1396304c0580 Mon Sep 17 00:00:00 2001 From: nmdefries Date: Tue, 26 Mar 2024 17:28:46 +0000 Subject: [PATCH 215/345] style: styler (GHA) --- R/slide.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/slide.R b/R/slide.R index 52b7bc95..b0903fab 100644 --- a/R/slide.R +++ b/R/slide.R @@ -752,7 +752,7 @@ full_date_seq <- function(x, before, after, time_step) { # `tsibble` classes apparently can't be added to in different units, so even # if `time_step` is provided by the user, use a value-1 unitless step. 
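# Editor's aside -- an illustrative sketch, not part of the patch (assumes
# the tsibble package; results follow its documented index arithmetic):
#   tsibble::yearmonth("2020 Jan") + 1L # <yearmonth> 2020 Feb
#   tsibble::yearweek("2020 W01") + 1L  # <yearweek> 2020 W02
# These classes advance one period per unit step, which is why the plain
# `+ 1L`/`by = 1L` below is the portable increment.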
if (inherits(x$time_value, c("yearquarter", "yearweek", "yearmonth")) || - is.numeric(x$time_value)) { + is.numeric(x$time_value)) { all_dates <- seq(min(x$time_value), max(x$time_value), by = 1L) if (before != 0) { From f1fff07fbc121ff3e66e04d4c6dac450e00b8ded Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Mon, 25 Mar 2024 14:31:14 -0400 Subject: [PATCH 216/345] check user function is allowed; add `f` slide function arg --- NAMESPACE | 9 +++++ R/slide.R | 79 +++++++++++++++++++++++++++++++++++-------- man/epi_slide_mean.Rd | 11 ++++++ 3 files changed, 85 insertions(+), 14 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index cf59f29d..cb2d9f68 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -95,7 +95,9 @@ importFrom(data.table,address) importFrom(data.table,as.data.table) importFrom(data.table,between) importFrom(data.table,copy) +importFrom(data.table,frollapply) importFrom(data.table,frollmean) +importFrom(data.table,frollsum) importFrom(data.table,key) importFrom(data.table,rbindlist) importFrom(data.table,set) @@ -151,6 +153,13 @@ importFrom(rlang,quo_get_expr) importFrom(rlang,quo_is_missing) importFrom(rlang,sym) importFrom(rlang,syms) +importFrom(slider,slide_all) +importFrom(slider,slide_any) +importFrom(slider,slide_max) +importFrom(slider,slide_mean) +importFrom(slider,slide_min) +importFrom(slider,slide_prod) +importFrom(slider,slide_sum) importFrom(stats,cor) importFrom(stats,median) importFrom(tibble,as_tibble) diff --git a/R/slide.R b/R/slide.R index b0903fab..d600e752 100644 --- a/R/slide.R +++ b/R/slide.R @@ -395,6 +395,15 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' single data group. #' @param col_names A single tidyselection or a tidyselection vector of the #' names of one or more columns for which to calculate the rolling mean. +#' @param f Function; together with `...` specifies the computation to slide. +#' `f` must be one of `data.table`'s rolling functions +#' (`frollmean`, `frollsum`, `frollapply`. See `?data.table::roll`) or one +#' of `slider`'s specialized sliding functions (`slide_mean`, `slide_sum`, +#' etc. See `?slider::\`summary-slide\``). To "slide" means to apply a +#' computation within a sliding (a.k.a. "rolling") time window for each data +#' group. The window is determined by the `before` and `after` parameters +#' described below. One time step is typically one day or one week; see +#' details for more explanation. #' @param ... Additional arguments to pass to `data.table::frollmean`, for #' example, `na.rm` and `algo`. `data.table::frollmean` is automatically #' passed the data `x` to operate on, the window size `n`, and the alignment @@ -476,9 +485,10 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' @importFrom dplyr bind_rows mutate %>% arrange tibble select #' @importFrom rlang enquo quo_get_expr as_label #' @importFrom purrr map -#' @importFrom data.table frollmean +#' @importFrom data.table frollmean frollsum frollapply #' @importFrom lubridate as.period #' @importFrom checkmate assert_function +#' @importFrom slider slide_sum slide_prod slide_mean slide_min slide_max slide_all slide_any #' @export #' @seealso [`epi_slide`] #' @examples @@ -525,7 +535,7 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' # Remove a nonessential var. 
to ensure new col is printed #' dplyr::select(geo_value, time_value, cases, cases_14dav) %>% #' ungroup() -epi_slide_mean <- function(x, col_names, ..., before, after, ref_time_values, +epi_slide_mean <- function(x, col_names, f, ..., before, after, ref_time_values, time_step, new_col_names = "slide_value", as_list_col = NULL, names_sep = "_", all_rows = FALSE) { @@ -550,6 +560,39 @@ epi_slide_mean <- function(x, col_names, ..., before, after, ref_time_values, ) } + # Check that slide function `f` is one of those short-listed from + # `data.table` and `slider` (or a function that has the exact same + # definition, e.g. if the function has been reexported or defined + # locally). + if (any(lapply( + c(frollmean, frollsum, frollapply), + function(roll_fn) { + isTRUE(all.equal(f, roll_fn)) + } + ))) { + f_from_package <- "data.table" + } else if (any(lapply( + c(slide_sum, slide_prod, slide_mean, slide_min, slide_max, slide_all, slide_any), + function(roll_fn) { + isTRUE(all.equal(f, roll_fn)) + } + ))) { + f_from_package <- "slider" + } else { + # `f` is from somewhere else and not supported + cli_abort( + c( + "slide function `f` is not supported", + "i" = "`f` must be one of `data.table`'s rolling functions (`frollmean`, + `frollsum`, `frollapply`. See `?data.table::roll`) or one of + `slider`'s specialized sliding functions (`slide_mean`, `slide_sum`, + etc. See `?slider::\`summary-slide\`` for more options)." + ), + class = "epiprocess__epi_slide_opt__unsupported_slide_function", + epiprocess__f = f + ) + } + user_provided_rtvs <- !missing(ref_time_values) if (!user_provided_rtvs) { ref_time_values <- unique(x$time_value) @@ -692,19 +735,27 @@ epi_slide_mean <- function(x, col_names, ..., before, after, ref_time_values, ) } - roll_output <- data.table::frollmean( - x = select(.data_group, {{ col_names }}), n = window_size, align = "right", ... - ) + if (f_from_package == "data.table") { + roll_output <- f( + x = select(.data_group, {{ col_names }}), n = window_size, align = "right", ... + ) - if (after >= 1) { - # Right-aligned `frollmean` results' `ref_time_value`s will be `after` - # timesteps ahead of where they should be. Shift results to the left by - # `after` timesteps. - .data_group[, result_col_names] <- purrr::map(roll_output, function(.x) { - c(.x[(after + 1L):length(.x)], rep(NA, after)) - }) - } else { - .data_group[, result_col_names] <- roll_output + if (after >= 1) { + # Right-aligned `frollmean` results' `ref_time_value`s will be `after` + # timesteps ahead of where they should be. Shift results to the left by + # `after` timesteps. + .data_group[, result_col_names] <- purrr::map(roll_output, function(.x) { + c(.x[(after + 1L):length(.x)], rep(NA, after)) + }) + } else { + .data_group[, result_col_names] <- roll_output + } + } else if (f_from_package == "slider") { + for (i in seq_along(col_names_chr)) { + .data_group[, result_col_names[i]] <- f( + x = .data_group[[col_names_chr[i]]], before = before, after = after, ... + ) + } } return(.data_group) diff --git a/man/epi_slide_mean.Rd b/man/epi_slide_mean.Rd index fd4b84ab..23c129d0 100644 --- a/man/epi_slide_mean.Rd +++ b/man/epi_slide_mean.Rd @@ -7,6 +7,7 @@ epi_slide_mean( x, col_names, + f, ..., before, after, @@ -26,6 +27,16 @@ single data group.} \item{col_names}{A single tidyselection or a tidyselection vector of the names of one or more columns for which to calculate the rolling mean.} +\item{f}{Function; together with \code{...} specifies the computation to slide. 
+\code{f} must be one of \code{data.table}'s rolling functions
+(\code{frollmean}, \code{frollsum}, \code{frollapply}. See \code{?data.table::roll}) or one
+of \code{slider}'s specialized sliding functions (\code{slide_mean}, \code{slide_sum},
+etc. See \verb{?slider::`summary-slide`}). To "slide" means to apply a
+computation within a sliding (a.k.a. "rolling") time window for each data
+group. The window is determined by the \code{before} and \code{after} parameters
+described below. One time step is typically one day or one week; see
+details for more explanation.}
+
 \item{...}{Additional arguments to pass to \code{data.table::frollmean}, for
 example, \code{na.rm} and \code{algo}. \code{data.table::frollmean} is automatically
 passed the data \code{x} to operate on, the window size \code{n}, and the alignment

From 24d91e8fe0ea826e4989f73da158bb614ae7f8ac Mon Sep 17 00:00:00 2001
From: Nat DeFries <42820733+nmdefries@users.noreply.github.com>
Date: Mon, 25 Mar 2024 17:33:36 -0400
Subject: [PATCH 217/345] add wrappers for rolling sum and mean; update
 examples

---
 NAMESPACE             |   2 +
 R/slide.R             | 327 ++++++++++++++++++++++++++++++++++++++----
 man/epi_slide.Rd      |   2 +-
 man/epi_slide_mean.Rd |  15 +-
 man/epi_slide_opt.Rd  | 181 +++++++++++++++++++++++
 man/epi_slide_sum.Rd  | 135 +++++++++++++++++
 6 files changed, 625 insertions(+), 37 deletions(-)
 create mode 100644 man/epi_slide_opt.Rd
 create mode 100644 man/epi_slide_sum.Rd

diff --git a/NAMESPACE b/NAMESPACE
index cb2d9f68..46207199 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -47,6 +47,8 @@ export(epi_archive)
 export(epi_cor)
 export(epi_slide)
 export(epi_slide_mean)
+export(epi_slide_opt)
+export(epi_slide_sum)
 export(epix_as_of)
 export(epix_merge)
 export(epix_slide)
diff --git a/R/slide.R b/R/slide.R
index d600e752..811a12a1 100644
--- a/R/slide.R
+++ b/R/slide.R
@@ -127,7 +127,7 @@
 #' @importFrom dplyr bind_rows group_vars filter select
 #' @importFrom rlang .data .env !! enquo enquos sym env missing_arg
 #' @export
-#' @seealso [`epi_slide_mean`]
+#' @seealso [`epi_slide_opt`] [`epi_slide_mean`] [`epi_slide_sum`]
 #' @examples
 #' # slide a 7-day trailing average formula on cases
 #' # This and other simple sliding means are much faster to do using
@@ -384,7 +384,7 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values,
 return(x)
 }

-#' Optimized slide function for performing rolling averages on an `epi_df` object
+#' Optimized slide function for performing common rolling computations on an `epi_df` object
 #'
 #' Slides an n-timestep mean over variables in an `epi_df` object. See the [slide
 #' vignette](https://cmu-delphi.github.io/epiprocess/articles/slide.html) for
@@ -404,10 +404,13 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values,
 #' group. The window is determined by the `before` and `after` parameters
 #' described below. One time step is typically one day or one week; see
 #' details for more explanation.
-#' @param ... Additional arguments to pass to `data.table::frollmean`, for
-#' example, `na.rm` and `algo`. `data.table::frollmean` is automatically
-#' passed the data `x` to operate on, the window size `n`, and the alignment
-#' `align`. Providing these args via `...` will cause an error.
+#' @param ... Additional arguments to pass to the slide computation `f`, for
+#' example, `na.rm` and `algo` if `f` is a `data.table` function. If `f` is
+#' a `data.table` function, it is automatically passed the data `x` to
+#' operate on, the window size `n`, and the alignment `align`.
Providing +#' these args via `...` will cause an error. If `f` is a `slider` function, +#' it is automatically passed the data `x` to operate on, and number of +#' points `before` and `after` to use in the computation. #' @param before,after How far `before` and `after` each `ref_time_value` should #' the sliding window extend? At least one of these two arguments must be #' provided; the other's default will be 0. Any value provided for either @@ -490,12 +493,14 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' @importFrom checkmate assert_function #' @importFrom slider slide_sum slide_prod slide_mean slide_min slide_max slide_all slide_any #' @export -#' @seealso [`epi_slide`] +#' @seealso [`epi_slide`] [`epi_slide_mean`] [`epi_slide_sum`] #' @examples -#' # slide a 7-day trailing average formula on cases +#' # slide a 7-day trailing average formula on cases. This can also be done with `epi_slide_mean` #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide_mean(cases, new_col_names = "cases_7dav", names_sep = NULL, before = 6) %>% +#' epi_slide_opt( +#' cases, f = data.table::frollmean, new_col_names = "cases_7dav", names_sep = NULL, before = 6 +#' ) %>% #' # Remove a nonessential var. to ensure new col is printed #' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% #' ungroup() @@ -504,7 +509,7 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' # and accuracy, and to allow partially-missing windows. #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide_mean(cases, +#' epi_slide_opt(cases, #' new_col_names = "cases_7dav", names_sep = NULL, before = 6, #' # `frollmean` options #' na.rm = TRUE, algo = "exact", hasNA = TRUE @@ -515,27 +520,23 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' # slide a 7-day leading average #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide_mean(cases, new_col_names = "cases_7dav", names_sep = NULL, after = 6) %>% +#' epi_slide_opt( +#' cases, f = slider::slide_mean, new_col_names = "cases_7dav", names_sep = NULL, after = 6 +#' ) %>% #' # Remove a nonessential var. to ensure new col is printed #' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% #' ungroup() #' -#' # slide a 7-day centre-aligned average +#' # slide a 7-day centre-aligned sum. This can also be done with `epi_slide_sum` #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide_mean(cases, new_col_names = "cases_7dav", names_sep = NULL, before = 3, after = 3) %>% +#' epi_slide_opt( +#' cases, f = data.table::frollsum, new_col_names = "cases_7dav", names_sep = NULL, before = 3, after = 3 +#' ) %>% #' # Remove a nonessential var. to ensure new col is printed #' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% #' ungroup() -#' -#' # slide a 14-day centre-aligned average -#' jhu_csse_daily_subset %>% -#' group_by(geo_value) %>% -#' epi_slide_mean(cases, new_col_names = "cases_14dav", names_sep = NULL, before = 6, after = 7) %>% -#' # Remove a nonessential var. 
to ensure new col is printed
-#' dplyr::select(geo_value, time_value, cases, cases_14dav) %>%
-#' ungroup()
-epi_slide_mean <- function(x, col_names, f, ..., before, after, ref_time_values,
+epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values,
                           time_step,
                           new_col_names = "slide_value", as_list_col = NULL,
                           names_sep = "_", all_rows = FALSE) {
@@ -564,14 +565,14 @@ epi_slide_mean <- function(x, col_names, f, ..., before, after, ref_time_values,
   # `data.table` and `slider` (or a function that has the exact same
   # definition, e.g. if the function has been reexported or defined
   # locally).
-  if (any(lapply(
+  if (any(sapply(
     c(frollmean, frollsum, frollapply),
     function(roll_fn) {
       isTRUE(all.equal(f, roll_fn))
     }
   ))) {
     f_from_package <- "data.table"
-  } else if (any(lapply(
+  } else if (any(sapply(
     c(slide_sum, slide_prod, slide_mean, slide_min, slide_max, slide_all, slide_any),
     function(roll_fn) {
       isTRUE(all.equal(f, roll_fn))
@@ -782,6 +783,284 @@ epi_slide_mean <- function(x, col_names, f, ..., before, after, ref_time_values,
   return(result)
 }

+#' Optimized slide function for performing rolling averages on an `epi_df` object
+#'
+#' Slides an n-timestep mean over variables in an `epi_df` object. See the [slide
+#' vignette](https://cmu-delphi.github.io/epiprocess/articles/slide.html) for
+#' examples.
+#'
+#' Wrapper around `epi_slide_opt` with `f = data.table::frollmean`.
+#'
+#' @param x The `epi_df` object under consideration, [grouped][dplyr::group_by]
+#' or ungrouped. If ungrouped, all data in `x` will be treated as part of a
+#' single data group.
+#' @param col_names A single tidyselection or a tidyselection vector of the
+#' names of one or more columns for which to calculate the rolling mean.
+#' @param ... Additional arguments to pass to `data.table::frollmean`, for
+#' example, `na.rm` and `algo`. `data.table::frollmean` is automatically
+#' passed the data `x` to operate on, the window size `n`, and the alignment
+#' `align`. Providing these args via `...` will cause an error.
+#' @param before,after How far `before` and `after` each `ref_time_value` should
+#' the sliding window extend? At least one of these two arguments must be
+#' provided; the other's default will be 0. Any value provided for either
+#' argument must be a single, non-`NA`, non-negative,
+#' [integer-compatible][vctrs::vec_cast] number of time steps. Endpoints of
+#' the window are inclusive. Common settings:
+#' * For trailing/right-aligned windows from `ref_time_value - time_step
+#' (k)` to `ref_time_value`: either pass `before=k` by itself, or pass
+#' `before=k, after=0`.
+#' * For center-aligned windows from `ref_time_value - time_step(k)` to
+#' `ref_time_value + time_step(k)`: pass `before=k, after=k`.
+#' * For leading/left-aligned windows from `ref_time_value` to
+#' `ref_time_value + time_step(k)`: either pass `after=k` by itself,
+#' or pass `before=0, after=k`.
+#' See "Details:" about the definition of a time step, (non)treatment of
+#' missing rows within the window, and avoiding warnings about
+#' `before`&`after` settings for a certain uncommon use case.
+#' @param ref_time_values Time values for sliding computations, meaning, each
+#' element of this vector serves as the reference time point for one sliding
+#' window. If missing, then this will be set to all unique time values in the
+#' underlying data table, by default.
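#' (Editor's sketch, not generated docs: a hypothetical call restricting the
#' slide to two reference dates; the dates are made up for illustration.)
#' # jhu_csse_daily_subset %>%
#' #   group_by(geo_value) %>%
#' #   epi_slide_mean(cases, before = 6,
#' #     ref_time_values = as.Date(c("2020-06-11", "2020-06-12")))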
+#' @param time_step Optional function used to define the meaning of one time
+#' step, which, if specified, overrides the default choice based on the
+#' `time_value` column. This function must take a non-negative integer and
+#' return an object of class `lubridate::period`. For example, we can use
+#' `time_step = lubridate::hours` in order to set the time step to be one hour
+#' (this would only be meaningful if `time_value` is of class `POSIXct`).
+#' @param new_col_names String indicating the name of the new column that will
+#' contain the slide values. Default is "slide_value"; note that setting
+#' `new_col_names` equal to an existing column name will overwrite this column.
+#' @param as_list_col Not supported. Included to match `epi_slide` interface.
+#' @param names_sep String specifying the separator to use in `tidyr::unnest()`
+#' when `as_list_col = FALSE`. Default is "_". Using `NULL` drops the prefix
+#' from `new_col_names` entirely.
+#' @param all_rows If `all_rows = TRUE`, then all rows of `x` will be kept in
+#' the output even with `ref_time_values` provided, with some type of missing
+#' value marker for the slide computation output column(s) for `time_value`s
+#' outside `ref_time_values`; otherwise, there will be one row for each row in
+#' `x` that had a `time_value` in `ref_time_values`. Default is `FALSE`. The
+#' missing value marker is the result of `vctrs::vec_cast`ing `NA` to the type
+#' of the slide computation output.
+#' @return An `epi_df` object given by appending one or more new columns to
+#' `x`, depending on the `col_names` argument, named according to the
+#' `new_col_names` argument.
+#'
+#' @details To "slide" means to apply a function or formula over a rolling
+#' window of time steps for each data group, where the window is anchored at a
+#' reference time and left and right endpoints are given by the `before` and
+#' `after` arguments. The unit (the meaning of one time step) is implicitly
+#' defined by the way the `time_value` column treats addition and subtraction;
+#' for example, if the time values are coded as `Date` objects, then one time
+#' step is one day, since `as.Date("2022-01-01") + 1` equals
+#' `as.Date("2022-01-02")`. Alternatively, the time step can be set explicitly
+#' using the `time_step` argument (which, if specified, would override the
+#' default choice based on the `time_value` column). If there are not enough time
+#' steps available to complete the window at any given reference time, then
+#' `epi_slide()` still attempts to perform the computation anyway (it does not
+#' require a complete window). The issue of what to do with partial
+#' computations (those run on incomplete windows) is therefore left up to the
+#' user, either through the specified function or formula `f`, or through
+#' post-processing. For a centrally-aligned slide of `n` `time_value`s in a
+#' sliding window, set `before = (n-1)/2` and `after = (n-1)/2` when the
+#' number of `time_value`s in a sliding window is odd and `before = n/2-1` and
+#' `after = n/2` when `n` is even.
+#'
+#' Sometimes, we want to experiment with various trailing or leading window
+#' widths and compare the slide outputs. In the (uncommon) case where
+#' zero-width windows are considered, manually pass both the `before` and
+#' `after` arguments in order to prevent potential warnings. (E.g., `before=k`
+#' with `k=0` and `after` missing may produce a warning.
To avoid warnings,
+#' use `before=k, after=0` instead; otherwise, it looks too much like a
+#' leading window was intended, but the `after` argument was forgotten or
+#' misspelled.)
+#'
+#' @export
+#' @seealso [`epi_slide`] [`epi_slide_opt`] [`epi_slide_sum`]
+#' @examples
+#' # slide a 7-day trailing average formula on cases
+#' jhu_csse_daily_subset %>%
+#'   group_by(geo_value) %>%
+#'   epi_slide_mean(cases, new_col_names = "cases_7dav", names_sep = NULL, before = 6) %>%
+#'   # Remove a nonessential var. to ensure new col is printed
+#'   dplyr::select(geo_value, time_value, cases, cases_7dav) %>%
+#'   ungroup()
+#'
+#' # slide a 7-day trailing average formula on cases. Adjust `frollmean` settings for speed
+#' # and accuracy, and to allow partially-missing windows.
+#' jhu_csse_daily_subset %>%
+#'   group_by(geo_value) %>%
+#'   epi_slide_mean(cases,
+#'     new_col_names = "cases_7dav", names_sep = NULL, before = 6,
+#'     # `frollmean` options
+#'     na.rm = TRUE, algo = "exact", hasNA = TRUE
+#'   ) %>%
+#'   dplyr::select(geo_value, time_value, cases, cases_7dav) %>%
+#'   ungroup()
+#'
+#' # slide a 7-day leading average
+#' jhu_csse_daily_subset %>%
+#'   group_by(geo_value) %>%
+#'   epi_slide_mean(cases, new_col_names = "cases_7dav", names_sep = NULL, after = 6) %>%
+#'   # Remove a nonessential var. to ensure new col is printed
+#'   dplyr::select(geo_value, time_value, cases, cases_7dav) %>%
+#'   ungroup()
+#'
+#' # slide a 7-day centre-aligned average
+#' jhu_csse_daily_subset %>%
+#'   group_by(geo_value) %>%
+#'   epi_slide_mean(cases, new_col_names = "cases_7dav", names_sep = NULL, before = 3, after = 3) %>%
+#'   # Remove a nonessential var. to ensure new col is printed
+#'   dplyr::select(geo_value, time_value, cases, cases_7dav) %>%
+#'   ungroup()
+#'
+#' # slide a 14-day centre-aligned average
+#' jhu_csse_daily_subset %>%
+#'   group_by(geo_value) %>%
+#'   epi_slide_mean(cases, new_col_names = "cases_14dav", names_sep = NULL, before = 6, after = 7) %>%
+#'   # Remove a nonessential var. to ensure new col is printed
+#'   dplyr::select(geo_value, time_value, cases, cases_14dav) %>%
+#'   ungroup()
+epi_slide_mean <- function(x, col_names, ..., before, after, ref_time_values,
+                           time_step,
+                           new_col_names = "slide_value", as_list_col = NULL,
+                           names_sep = "_", all_rows = FALSE) {
+  epi_slide_opt(
+    x = x,
+    col_names = {{ col_names }},
+    f = data.table::frollmean,
+    ...,
+    before = before,
+    after = after,
+    ref_time_values = ref_time_values,
+    time_step = time_step,
+    new_col_names = new_col_names,
+    as_list_col = as_list_col,
+    names_sep = names_sep,
+    all_rows = all_rows
+  )
+}
+
+#' Optimized slide function for performing rolling sums on an `epi_df` object
+#'
+#' Slides an n-timestep sum over variables in an `epi_df` object. See the [slide
+#' vignette](https://cmu-delphi.github.io/epiprocess/articles/slide.html) for
+#' examples.
+#'
+#' Wrapper around `epi_slide_opt` with `f = data.table::frollsum`.
+#'
+#' @param x The `epi_df` object under consideration, [grouped][dplyr::group_by]
+#' or ungrouped. If ungrouped, all data in `x` will be treated as part of a
+#' single data group.
+#' @param col_names A single tidyselection or a tidyselection vector of the
+#' names of one or more columns for which to calculate the rolling sum.
+#' @param ... Additional arguments to pass to `data.table::frollsum`, for
+#' example, `na.rm` and `algo`. `data.table::frollsum` is automatically
+#' passed the data `x` to operate on, the window size `n`, and the alignment
+#' `align`.
Providing these args via `...` will cause an error.
+#' @param before,after How far `before` and `after` each `ref_time_value` should
+#' the sliding window extend? At least one of these two arguments must be
+#' provided; the other's default will be 0. Any value provided for either
+#' argument must be a single, non-`NA`, non-negative,
+#' [integer-compatible][vctrs::vec_cast] number of time steps. Endpoints of
+#' the window are inclusive. Common settings:
+#' * For trailing/right-aligned windows from `ref_time_value - time_step
+#' (k)` to `ref_time_value`: either pass `before=k` by itself, or pass
+#' `before=k, after=0`.
+#' * For center-aligned windows from `ref_time_value - time_step(k)` to
+#' `ref_time_value + time_step(k)`: pass `before=k, after=k`.
+#' * For leading/left-aligned windows from `ref_time_value` to
+#' `ref_time_value + time_step(k)`: either pass `after=k` by itself,
+#' or pass `before=0, after=k`.
+#' See "Details:" about the definition of a time step, (non)treatment of
+#' missing rows within the window, and avoiding warnings about
+#' `before`&`after` settings for a certain uncommon use case.
+#' @param ref_time_values Time values for sliding computations, meaning, each
+#' element of this vector serves as the reference time point for one sliding
+#' window. If missing, then this will be set to all unique time values in the
+#' underlying data table, by default.
+#' @param time_step Optional function used to define the meaning of one time
+#' step, which, if specified, overrides the default choice based on the
+#' `time_value` column. This function must take a non-negative integer and
+#' return an object of class `lubridate::period`. For example, we can use
+#' `time_step = lubridate::hours` in order to set the time step to be one hour
+#' (this would only be meaningful if `time_value` is of class `POSIXct`).
+#' @param new_col_names String indicating the name of the new column that will
+#' contain the slide values. Default is "slide_value"; note that setting
+#' `new_col_names` equal to an existing column name will overwrite this column.
+#' @param as_list_col Not supported. Included to match `epi_slide` interface.
+#' @param names_sep String specifying the separator to use in `tidyr::unnest()`
+#' when `as_list_col = FALSE`. Default is "_". Using `NULL` drops the prefix
+#' from `new_col_names` entirely.
+#' @param all_rows If `all_rows = TRUE`, then all rows of `x` will be kept in
+#' the output even with `ref_time_values` provided, with some type of missing
+#' value marker for the slide computation output column(s) for `time_value`s
+#' outside `ref_time_values`; otherwise, there will be one row for each row in
+#' `x` that had a `time_value` in `ref_time_values`. Default is `FALSE`. The
+#' missing value marker is the result of `vctrs::vec_cast`ing `NA` to the type
+#' of the slide computation output.
+#' @return An `epi_df` object given by appending one or more new columns to
+#' `x`, depending on the `col_names` argument, named according to the
+#' `new_col_names` argument.
+#'
+#' @details To "slide" means to apply a function or formula over a rolling
+#' window of time steps for each data group, where the window is anchored at a
+#' reference time and left and right endpoints are given by the `before` and
+#' `after` arguments.
The unit (the meaning of one time step) is implicitly +#' defined by the way the `time_value` column treats addition and subtraction; +#' for example, if the time values are coded as `Date` objects, then one time +#' step is one day, since `as.Date("2022-01-01") + 1` equals +#' `as.Date("2022-01-02")`. Alternatively, the time step can be set explicitly +#' using the `time_step` argument (which if specified would override the +#' default choice based on `time_value` column). If there are not enough time +#' steps available to complete the window at any given reference time, then +#' `epi_slide()` still attempts to perform the computation anyway (it does not +#' require a complete window). The issue of what to do with partial +#' computations (those run on incomplete windows) is therefore left up to the +#' user, either through the specified function or formula `f`, or through +#' post-processing. For a centrally-aligned slide of `n` `time_value`s in a +#' sliding window, set `before = (n-1)/2` and `after = (n-1)/2` when the +#' number of `time_value`s in a sliding window is odd and `before = n/2-1` and +#' `after = n/2` when `n` is even. +#' +#' Sometimes, we want to experiment with various trailing or leading window +#' widths and compare the slide outputs. In the (uncommon) case where +#' zero-width windows are considered, manually pass both the `before` and +#' `after` arguments in order to prevent potential warnings. (E.g., `before=k` +#' with `k=0` and `after` missing may produce a warning. To avoid warnings, +#' use `before=k, after=0` instead; otherwise, it looks too much like a +#' leading window was intended, but the `after` argument was forgotten or +#' misspelled.) +#' +#' @export +#' @seealso [`epi_slide`] [`epi_slide_opt`] [`epi_slide_mean`] +#' @examples +#' # slide a 7-day trailing sum formula on cases +#' jhu_csse_daily_subset %>% +#' group_by(geo_value) %>% +#' epi_slide_sum(cases, new_col_names = "cases_7dsum", names_sep = NULL, before = 6) %>% +#' # Remove a nonessential var. to ensure new col is printed +#' dplyr::select(geo_value, time_value, cases, cases_7dsum) %>% +#' ungroup() +epi_slide_sum <- function(x, col_names, ..., before, after, ref_time_values, + time_step, + new_col_names = "slide_value", as_list_col = NULL, + names_sep = "_", all_rows = FALSE) { + epi_slide_opt( + x = x, + col_names = {{ col_names }}, + f = data.table::frollsum, + ..., + before = before, + after = after, + ref_time_values = ref_time_values, + time_step = time_step, + new_col_names = new_col_names, + as_list_col = as_list_col, + names_sep = names_sep, + all_rows = all_rows + ) +} + #' Make a complete date sequence between min(x$time_value) and max #' (x$time_value). Produce lists of dates before min(x$time_value) and after #' max(x$time_value) for padding initial and final windows to size `n`. 
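# Editor's sketch of `full_date_seq`'s intended contract (hypothetical field
# names and values; the helper's body lives elsewhere in R/slide.R): for
# daily data spanning 2020-06-01..2020-06-03 with before = 1 and after = 1,
# one would expect roughly
#   list(
#     all_dates = seq(as.Date("2020-06-01"), as.Date("2020-06-03"), by = "day"),
#     pad_early_dates = as.Date("2020-05-31"),
#     pad_late_dates = as.Date("2020-06-04")
#   )
# so each group's windows can be padded out to size before + after + 1.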
diff --git a/man/epi_slide.Rd b/man/epi_slide.Rd index d09dfdda..2fe1dce6 100644 --- a/man/epi_slide.Rd +++ b/man/epi_slide.Rd @@ -205,5 +205,5 @@ jhu_csse_daily_subset \%>\% ungroup() } \seealso{ -\code{\link{epi_slide_mean}} +\code{\link{epi_slide_opt}} \code{\link{epi_slide_mean}} \code{\link{epi_slide_sum}} } diff --git a/man/epi_slide_mean.Rd b/man/epi_slide_mean.Rd index 23c129d0..9979c1e3 100644 --- a/man/epi_slide_mean.Rd +++ b/man/epi_slide_mean.Rd @@ -7,7 +7,6 @@ epi_slide_mean( x, col_names, - f, ..., before, after, @@ -27,16 +26,6 @@ single data group.} \item{col_names}{A single tidyselection or a tidyselection vector of the names of one or more columns for which to calculate the rolling mean.} -\item{f}{Function; together with \code{...} specifies the computation to slide. -\code{f} must be one of \code{data.table}'s rolling functions -(\code{frollmean}, \code{frollsum}, \code{frollapply}. See \code{?data.table::roll}) or one -of \code{slider}'s specialized sliding functions (\code{slide_mean}, \code{slide_sum}, -etc. See \verb{?slider::\\}summary-slide\``). To "slide" means to apply a -computation within a sliding (a.k.a. "rolling") time window for each data -group. The window is determined by the \code{before} and `after` parameters -described below. One time step is typically one day or one week; see -details for more explanation.} - \item{...}{Additional arguments to pass to \code{data.table::frollmean}, for example, \code{na.rm} and \code{algo}. \code{data.table::frollmean} is automatically passed the data \code{x} to operate on, the window size \code{n}, and the alignment @@ -101,6 +90,8 @@ Slides an n-timestep mean over variables in an \code{epi_df} object. See the \hr examples. } \details{ +Wrapper around \code{epi_slide_opt} with \code{f = datatable::frollmean}. + To "slide" means to apply a function or formula over a rolling window of time steps for each data group, where the window is entered at a reference time and left and right endpoints are given by the \code{before} and @@ -176,5 +167,5 @@ jhu_csse_daily_subset \%>\% ungroup() } \seealso{ -\code{\link{epi_slide}} +\code{\link{epi_slide}} \code{\link{epi_slide_opt}} \code{\link{epi_slide_sum}} } diff --git a/man/epi_slide_opt.Rd b/man/epi_slide_opt.Rd new file mode 100644 index 00000000..202216ce --- /dev/null +++ b/man/epi_slide_opt.Rd @@ -0,0 +1,181 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/slide.R +\name{epi_slide_opt} +\alias{epi_slide_opt} +\title{Optimized slide function for performing common rolling computations on an \code{epi_df} object} +\usage{ +epi_slide_opt( + x, + col_names, + f, + ..., + before, + after, + ref_time_values, + time_step, + new_col_names = "slide_value", + as_list_col = NULL, + names_sep = "_", + all_rows = FALSE +) +} +\arguments{ +\item{x}{The \code{epi_df} object under consideration, \link[dplyr:group_by]{grouped} +or ungrouped. If ungrouped, all data in \code{x} will be treated as part of a +single data group.} + +\item{col_names}{A single tidyselection or a tidyselection vector of the +names of one or more columns for which to calculate the rolling mean.} + +\item{f}{Function; together with \code{...} specifies the computation to slide. +\code{f} must be one of \code{data.table}'s rolling functions +(\code{frollmean}, \code{frollsum}, \code{frollapply}. See \code{?data.table::roll}) or one +of \code{slider}'s specialized sliding functions (\code{slide_mean}, \code{slide_sum}, +etc. See \verb{?slider::\\}summary-slide\``). 
To "slide" means to apply a +computation within a sliding (a.k.a. "rolling") time window for each data +group. The window is determined by the \code{before} and `after` parameters +described below. One time step is typically one day or one week; see +details for more explanation.} + +\item{...}{Additional arguments to pass to the slide computation \code{f}, for +example, \code{na.rm} and \code{algo} if \code{f} is a \code{data.table} function. If \code{f} is +a \code{data.table} function, it is automatically passed the data \code{x} to +operate on, the window size \code{n}, and the alignment \code{align}. Providing +these args via \code{...} will cause an error. If \code{f} is a \code{slider} function, +it is automatically passed the data \code{x} to operate on, and number of +points \code{before} and \code{after} to use in the computation.} + +\item{before, after}{How far \code{before} and \code{after} each \code{ref_time_value} should +the sliding window extend? At least one of these two arguments must be +provided; the other's default will be 0. Any value provided for either +argument must be a single, non-\code{NA}, non-negative, +\link[vctrs:vec_cast]{integer-compatible} number of time steps. Endpoints of +the window are inclusive. Common settings: +\itemize{ +\item For trailing/right-aligned windows from \code{ref_time_value - time_step (k)} to \code{ref_time_value}: either pass \code{before=k} by itself, or pass +\verb{before=k, after=0}. +\item For center-aligned windows from \code{ref_time_value - time_step(k)} to +\code{ref_time_value + time_step(k)}: pass \verb{before=k, after=k}. +\item For leading/left-aligned windows from \code{ref_time_value} to +\code{ref_time_value + time_step(k)}: either pass pass \code{after=k} by itself, +or pass \verb{before=0, after=k}. +See "Details:" about the definition of a time step,(non)treatment of +missing rows within the window, and avoiding warnings about +\code{before}&\code{after} settings for a certain uncommon use case. +}} + +\item{ref_time_values}{Time values for sliding computations, meaning, each +element of this vector serves as the reference time point for one sliding +window. If missing, then this will be set to all unique time values in the +underlying data table, by default.} + +\item{time_step}{Optional function used to define the meaning of one time +step, which if specified, overrides the default choice based on the +\code{time_value} column. This function must take a non-negative integer and +return an object of class \code{lubridate::period}. For example, we can use +\code{time_step = lubridate::hours} in order to set the time step to be one hour +(this would only be meaningful if \code{time_value} is of class \code{POSIXct}).} + +\item{new_col_names}{String indicating the name of the new column that will +contain the derivative values. Default is "slide_value"; note that setting +\code{new_col_names} equal to an existing column name will overwrite this column.} + +\item{as_list_col}{Not supported. Included to match \code{epi_slide} interface.} + +\item{names_sep}{String specifying the separator to use in \code{tidyr::unnest()} +when \code{as_list_col = FALSE}. Default is "_". 
Using \code{NULL} drops the prefix +from \code{new_col_names} entirely.} + +\item{all_rows}{If \code{all_rows = TRUE}, then all rows of \code{x} will be kept in +the output even with \code{ref_time_values} provided, with some type of missing +value marker for the slide computation output column(s) for \code{time_value}s +outside \code{ref_time_values}; otherwise, there will be one row for each row in +\code{x} that had a \code{time_value} in \code{ref_time_values}. Default is \code{FALSE}. The +missing value marker is the result of \code{vctrs::vec_cast}ing \code{NA} to the type +of the slide computation output.} +} +\value{ +An \code{epi_df} object given by appending one or more new columns to +\code{x}, depending on the \code{col_names} argument, named according to the +\code{new_col_names} argument. +} +\description{ +Slides an n-timestep mean over variables in an \code{epi_df} object. See the \href{https://cmu-delphi.github.io/epiprocess/articles/slide.html}{slide vignette} for +examples. +} +\details{ +To "slide" means to apply a function or formula over a rolling +window of time steps for each data group, where the window is entered at a +reference time and left and right endpoints are given by the \code{before} and +\code{after} arguments. The unit (the meaning of one time step) is implicitly +defined by the way the \code{time_value} column treats addition and subtraction; +for example, if the time values are coded as \code{Date} objects, then one time +step is one day, since \code{as.Date("2022-01-01") + 1} equals +\code{as.Date("2022-01-02")}. Alternatively, the time step can be set explicitly +using the \code{time_step} argument (which if specified would override the +default choice based on \code{time_value} column). If there are not enough time +steps available to complete the window at any given reference time, then +\code{epi_slide()} still attempts to perform the computation anyway (it does not +require a complete window). The issue of what to do with partial +computations (those run on incomplete windows) is therefore left up to the +user, either through the specified function or formula \code{f}, or through +post-processing. For a centrally-aligned slide of \code{n} \code{time_value}s in a +sliding window, set \code{before = (n-1)/2} and \code{after = (n-1)/2} when the +number of \code{time_value}s in a sliding window is odd and \code{before = n/2-1} and +\code{after = n/2} when \code{n} is even. + +Sometimes, we want to experiment with various trailing or leading window +widths and compare the slide outputs. In the (uncommon) case where +zero-width windows are considered, manually pass both the \code{before} and +\code{after} arguments in order to prevent potential warnings. (E.g., \code{before=k} +with \code{k=0} and \code{after} missing may produce a warning. To avoid warnings, +use \verb{before=k, after=0} instead; otherwise, it looks too much like a +leading window was intended, but the \code{after} argument was forgotten or +misspelled.) +} +\examples{ +# slide a 7-day trailing average formula on cases. This can also be done with `epi_slide_mean` +jhu_csse_daily_subset \%>\% + group_by(geo_value) \%>\% + epi_slide_opt( + cases, f = data.table::frollmean, new_col_names = "cases_7dav", names_sep = NULL, before = 6 + ) \%>\% + # Remove a nonessential var. to ensure new col is printed + dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% + ungroup() + +# slide a 7-day trailing average formula on cases. 
Adjust `frollmean` settings for speed +# and accuracy, and to allow partially-missing windows. +jhu_csse_daily_subset \%>\% + group_by(geo_value) \%>\% + epi_slide_opt(cases, + new_col_names = "cases_7dav", names_sep = NULL, before = 6, + # `frollmean` options + na.rm = TRUE, algo = "exact", hasNA = TRUE + ) \%>\% + dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% + ungroup() + +# slide a 7-day leading average +jhu_csse_daily_subset \%>\% + group_by(geo_value) \%>\% + epi_slide_opt( + cases, f = slider::slide_mean, new_col_names = "cases_7dav", names_sep = NULL, after = 6 + ) \%>\% + # Remove a nonessential var. to ensure new col is printed + dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% + ungroup() + +# slide a 7-day centre-aligned sum. This can also be done with `epi_slide_sum` +jhu_csse_daily_subset \%>\% + group_by(geo_value) \%>\% + epi_slide_opt( + cases, f = data.table::frollsum, new_col_names = "cases_7dav", names_sep = NULL, before = 3, after = 3 + ) \%>\% + # Remove a nonessential var. to ensure new col is printed + dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% + ungroup() +} +\seealso{ +\code{\link{epi_slide}} \code{\link{epi_slide_mean}} \code{\link{epi_slide_sum}} +} diff --git a/man/epi_slide_sum.Rd b/man/epi_slide_sum.Rd new file mode 100644 index 00000000..d87e5f29 --- /dev/null +++ b/man/epi_slide_sum.Rd @@ -0,0 +1,135 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/slide.R +\name{epi_slide_sum} +\alias{epi_slide_sum} +\title{Optimized slide function for performing rolling sums on an \code{epi_df} object} +\usage{ +epi_slide_sum( + x, + col_names, + ..., + before, + after, + ref_time_values, + time_step, + new_col_names = "slide_value", + as_list_col = NULL, + names_sep = "_", + all_rows = FALSE +) +} +\arguments{ +\item{x}{The \code{epi_df} object under consideration, \link[dplyr:group_by]{grouped} +or ungrouped. If ungrouped, all data in \code{x} will be treated as part of a +single data group.} + +\item{col_names}{A single tidyselection or a tidyselection vector of the +names of one or more columns for which to calculate the rolling mean.} + +\item{...}{Additional arguments to pass to \code{data.table::frollsum}, for +example, \code{na.rm} and \code{algo}. \code{data.table::frollsum} is automatically +passed the data \code{x} to operate on, the window size \code{n}, and the alignment +\code{align}. Providing these args via \code{...} will cause an error.} + +\item{before, after}{How far \code{before} and \code{after} each \code{ref_time_value} should +the sliding window extend? At least one of these two arguments must be +provided; the other's default will be 0. Any value provided for either +argument must be a single, non-\code{NA}, non-negative, +\link[vctrs:vec_cast]{integer-compatible} number of time steps. Endpoints of +the window are inclusive. Common settings: +\itemize{ +\item For trailing/right-aligned windows from \code{ref_time_value - time_step (k)} to \code{ref_time_value}: either pass \code{before=k} by itself, or pass +\verb{before=k, after=0}. +\item For center-aligned windows from \code{ref_time_value - time_step(k)} to +\code{ref_time_value + time_step(k)}: pass \verb{before=k, after=k}. +\item For leading/left-aligned windows from \code{ref_time_value} to +\code{ref_time_value + time_step(k)}: either pass pass \code{after=k} by itself, +or pass \verb{before=0, after=k}. 
+See "Details:" about the definition of a time step, (non)treatment of
+missing rows within the window, and avoiding warnings about
+\code{before}&\code{after} settings for a certain uncommon use case.
+}}
+
+\item{ref_time_values}{Time values for sliding computations, meaning, each
+element of this vector serves as the reference time point for one sliding
+window. If missing, then this will be set to all unique time values in the
+underlying data table, by default.}
+
+\item{time_step}{Optional function used to define the meaning of one time
+step, which if specified, overrides the default choice based on the
+\code{time_value} column. This function must take a non-negative integer and
+return an object of class \code{lubridate::period}. For example, we can use
+\code{time_step = lubridate::hours} in order to set the time step to be one hour
+(this would only be meaningful if \code{time_value} is of class \code{POSIXct}).}
+
+\item{new_col_names}{String indicating the name of the new column that will
+contain the slide values. Default is "slide_value"; note that setting
+\code{new_col_names} equal to an existing column name will overwrite this column.}
+
+\item{as_list_col}{Not supported. Included to match \code{epi_slide} interface.}
+
+\item{names_sep}{String specifying the separator to use in \code{tidyr::unnest()}
+when \code{as_list_col = FALSE}. Default is "_". Using \code{NULL} drops the prefix
+from \code{new_col_names} entirely.}
+
+\item{all_rows}{If \code{all_rows = TRUE}, then all rows of \code{x} will be kept in
+the output even with \code{ref_time_values} provided, with some type of missing
+value marker for the slide computation output column(s) for \code{time_value}s
+outside \code{ref_time_values}; otherwise, there will be one row for each row in
+\code{x} that had a \code{time_value} in \code{ref_time_values}. Default is \code{FALSE}. The
+missing value marker is the result of \code{vctrs::vec_cast}ing \code{NA} to the type
+of the slide computation output.}
+}
+\value{
+An \code{epi_df} object given by appending one or more new columns to
+\code{x}, depending on the \code{col_names} argument, named according to the
+\code{new_col_names} argument.
+}
+\description{
+Slides an n-timestep sum over variables in an \code{epi_df} object. See the \href{https://cmu-delphi.github.io/epiprocess/articles/slide.html}{slide vignette} for
+examples.
+}
+\details{
+Wrapper around \code{epi_slide_opt} with \code{f = datatable::frollsum}.
+
+To "slide" means to apply a function or formula over a rolling
+window of time steps for each data group, where the window is entered at a
+reference time and left and right endpoints are given by the \code{before} and
+\code{after} arguments. The unit (the meaning of one time step) is implicitly
+defined by the way the \code{time_value} column treats addition and subtraction;
+for example, if the time values are coded as \code{Date} objects, then one time
+step is one day, since \code{as.Date("2022-01-01") + 1} equals
+\code{as.Date("2022-01-02")}. Alternatively, the time step can be set explicitly
+using the \code{time_step} argument (which if specified would override the
+default choice based on \code{time_value} column). If there are not enough time
+steps available to complete the window at any given reference time, then
+\code{epi_slide()} still attempts to perform the computation anyway (it does not
+require a complete window).
The issue of what to do with partial +computations (those run on incomplete windows) is therefore left up to the +user, either through the specified function or formula \code{f}, or through +post-processing. For a centrally-aligned slide of \code{n} \code{time_value}s in a +sliding window, set \code{before = (n-1)/2} and \code{after = (n-1)/2} when the +number of \code{time_value}s in a sliding window is odd and \code{before = n/2-1} and +\code{after = n/2} when \code{n} is even. + +Sometimes, we want to experiment with various trailing or leading window +widths and compare the slide outputs. In the (uncommon) case where +zero-width windows are considered, manually pass both the \code{before} and +\code{after} arguments in order to prevent potential warnings. (E.g., \code{before=k} +with \code{k=0} and \code{after} missing may produce a warning. To avoid warnings, +use \verb{before=k, after=0} instead; otherwise, it looks too much like a +leading window was intended, but the \code{after} argument was forgotten or +misspelled.) +} +\examples{ +# slide a 7-day trailing sum formula on cases +jhu_csse_daily_subset \%>\% + group_by(geo_value) \%>\% + epi_slide_sum(cases, new_col_names = "cases_7dsum", names_sep = NULL, before = 6) \%>\% + # Remove a nonessential var. to ensure new col is printed + dplyr::select(geo_value, time_value, cases, cases_7dsum) \%>\% + ungroup() +} +\seealso{ +\code{\link{epi_slide}} \code{\link{epi_slide_opt}} \code{\link{epi_slide_mean}} +} From 4a1c5663f43dbd2fb17b642ce0cfffdab063fb33 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Mon, 25 Mar 2024 18:18:23 -0400 Subject: [PATCH 218/345] use base select instead of tidyselect for speed --- R/slide.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/slide.R b/R/slide.R index 811a12a1..aa6a2e47 100644 --- a/R/slide.R +++ b/R/slide.R @@ -738,7 +738,7 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values, if (f_from_package == "data.table") { roll_output <- f( - x = select(.data_group, {{ col_names }}), n = window_size, align = "right", ... + x = .data_group[, col_names_chr], n = window_size, align = "right", ... ) if (after >= 1) { From fc4cae4c6c20bc3e9914fa2edca2ac6b30bc44ac Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Tue, 26 Mar 2024 13:54:48 -0400 Subject: [PATCH 219/345] reuse epi_slide_opt documentation --- R/slide.R | 168 +++--------------------------------------- man/epi_slide_mean.Rd | 31 +------- man/epi_slide_opt.Rd | 17 +++-- man/epi_slide_sum.Rd | 31 +------- 4 files changed, 21 insertions(+), 226 deletions(-) diff --git a/R/slide.R b/R/slide.R index aa6a2e47..4944580b 100644 --- a/R/slide.R +++ b/R/slide.R @@ -397,9 +397,9 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' names of one or more columns for which to calculate the rolling mean. #' @param f Function; together with `...` specifies the computation to slide. #' `f` must be one of `data.table`'s rolling functions -#' (`frollmean`, `frollsum`, `frollapply`. See `?data.table::roll`) or one +#' (`frollmean`, `frollsum`, `frollapply`. See [data.table::roll]) or one #' of `slider`'s specialized sliding functions (`slide_mean`, `slide_sum`, -#' etc. See `?slider::\`summary-slide\``). To "slide" means to apply a +#' etc. See [slider::summary-slide]). To "slide" means to apply a #' computation within a sliding (a.k.a. "rolling") time window for each data #' group. 
The window is determined by the `before` and `after` parameters #' described below. One time step is typically one day or one week; see @@ -435,7 +435,7 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' @param time_step Optional function used to define the meaning of one time #' step, which if specified, overrides the default choice based on the #' `time_value` column. This function must take a non-negative integer and -#' return an object of class `lubridate::period`. For example, we can use +#' return an object of class [lubridate::period]. For example, we can use #' `time_step = lubridate::hours` in order to set the time step to be one hour #' (this would only be meaningful if `time_value` is of class `POSIXct`). #' @param new_col_names String indicating the name of the new column that will @@ -791,88 +791,13 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values, #' #' Wrapper around `epi_slide_opt` with `f = datatable::frollmean`. #' -#' @param x The `epi_df` object under consideration, [grouped][dplyr::group_by] -#' or ungrouped. If ungrouped, all data in `x` will be treated as part of a -#' single data group. -#' @param col_names A single tidyselection or a tidyselection vector of the -#' names of one or more columns for which to calculate the rolling mean. #' @param ... Additional arguments to pass to `data.table::frollmean`, for #' example, `na.rm` and `algo`. `data.table::frollmean` is automatically #' passed the data `x` to operate on, the window size `n`, and the alignment #' `align`. Providing these args via `...` will cause an error. -#' @param before,after How far `before` and `after` each `ref_time_value` should -#' the sliding window extend? At least one of these two arguments must be -#' provided; the other's default will be 0. Any value provided for either -#' argument must be a single, non-`NA`, non-negative, -#' [integer-compatible][vctrs::vec_cast] number of time steps. Endpoints of -#' the window are inclusive. Common settings: -#' * For trailing/right-aligned windows from `ref_time_value - time_step -#' (k)` to `ref_time_value`: either pass `before=k` by itself, or pass -#' `before=k, after=0`. -#' * For center-aligned windows from `ref_time_value - time_step(k)` to -#' `ref_time_value + time_step(k)`: pass `before=k, after=k`. -#' * For leading/left-aligned windows from `ref_time_value` to -#' `ref_time_value + time_step(k)`: either pass pass `after=k` by itself, -#' or pass `before=0, after=k`. -#' See "Details:" about the definition of a time step,(non)treatment of -#' missing rows within the window, and avoiding warnings about -#' `before`&`after` settings for a certain uncommon use case. -#' @param ref_time_values Time values for sliding computations, meaning, each -#' element of this vector serves as the reference time point for one sliding -#' window. If missing, then this will be set to all unique time values in the -#' underlying data table, by default. -#' @param time_step Optional function used to define the meaning of one time -#' step, which if specified, overrides the default choice based on the -#' `time_value` column. This function must take a non-negative integer and -#' return an object of class `lubridate::period`. For example, we can use -#' `time_step = lubridate::hours` in order to set the time step to be one hour -#' (this would only be meaningful if `time_value` is of class `POSIXct`). 
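# A minimal sketch of the three `before`/`after` alignments documented
# above, assuming the package's `jhu_csse_daily_subset` example dataset;
# illustrative only, not part of this commit.
library(dplyr)
library(epiprocess)
jhu_csse_daily_subset %>%
  group_by(geo_value) %>%
  # trailing/right-aligned 7-day window: before = 6 (after defaults to 0)
  epi_slide_mean(cases, new_col_names = "trailing_7dav", names_sep = NULL, before = 6) %>%
  # center-aligned 7-day window: before = 3, after = 3
  epi_slide_mean(cases, new_col_names = "centered_7dav", names_sep = NULL, before = 3, after = 3) %>%
  # leading/left-aligned 7-day window: after = 6 (before defaults to 0)
  epi_slide_mean(cases, new_col_names = "leading_7dav", names_sep = NULL, after = 6) %>%
  ungroup()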
-#' @param new_col_names String indicating the name of the new column that will -#' contain the derivative values. Default is "slide_value"; note that setting -#' `new_col_names` equal to an existing column name will overwrite this column. -#' @param as_list_col Not supported. Included to match `epi_slide` interface. -#' @param names_sep String specifying the separator to use in `tidyr::unnest()` -#' when `as_list_col = FALSE`. Default is "_". Using `NULL` drops the prefix -#' from `new_col_names` entirely. -#' @param all_rows If `all_rows = TRUE`, then all rows of `x` will be kept in -#' the output even with `ref_time_values` provided, with some type of missing -#' value marker for the slide computation output column(s) for `time_value`s -#' outside `ref_time_values`; otherwise, there will be one row for each row in -#' `x` that had a `time_value` in `ref_time_values`. Default is `FALSE`. The -#' missing value marker is the result of `vctrs::vec_cast`ing `NA` to the type -#' of the slide computation output. -#' @return An `epi_df` object given by appending one or more new columns to -#' `x`, depending on the `col_names` argument, named according to the -#' `new_col_names` argument. -#' -#' @details To "slide" means to apply a function or formula over a rolling -#' window of time steps for each data group, where the window is entered at a -#' reference time and left and right endpoints are given by the `before` and -#' `after` arguments. The unit (the meaning of one time step) is implicitly -#' defined by the way the `time_value` column treats addition and subtraction; -#' for example, if the time values are coded as `Date` objects, then one time -#' step is one day, since `as.Date("2022-01-01") + 1` equals -#' `as.Date("2022-01-02")`. Alternatively, the time step can be set explicitly -#' using the `time_step` argument (which if specified would override the -#' default choice based on `time_value` column). If there are not enough time -#' steps available to complete the window at any given reference time, then -#' `epi_slide()` still attempts to perform the computation anyway (it does not -#' require a complete window). The issue of what to do with partial -#' computations (those run on incomplete windows) is therefore left up to the -#' user, either through the specified function or formula `f`, or through -#' post-processing. For a centrally-aligned slide of `n` `time_value`s in a -#' sliding window, set `before = (n-1)/2` and `after = (n-1)/2` when the -#' number of `time_value`s in a sliding window is odd and `before = n/2-1` and -#' `after = n/2` when `n` is even. -#' -#' Sometimes, we want to experiment with various trailing or leading window -#' widths and compare the slide outputs. In the (uncommon) case where -#' zero-width windows are considered, manually pass both the `before` and -#' `after` arguments in order to prevent potential warnings. (E.g., `before=k` -#' with `k=0` and `after` missing may produce a warning. To avoid warnings, -#' use `before=k, after=0` instead; otherwise, it looks too much like a -#' leading window was intended, but the `after` argument was forgotten or -#' misspelled.) +#' @inheritParams epi_slide_opt +#' @inherit epi_slide_opt return +#' @inherit epi_slide_opt details #' #' @export #' @seealso [`epi_slide`] [`epi_slide_opt`] [`epi_slide_sum`] @@ -948,88 +873,13 @@ epi_slide_mean <- function(x, col_names, ..., before, after, ref_time_values, #' #' Wrapper around `epi_slide_opt` with `f = datatable::frollsum`. 
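# A minimal sketch of the wrapper relationship stated above, assuming the
# `jhu_csse_daily_subset` example dataset (illustrative only): calling
# `epi_slide_opt` with `f = data.table::frollsum` directly should match
# `epi_slide_sum`, which is what the equivalence tests added later in this
# series verify with `expect_identical`.
library(dplyr)
library(epiprocess)
via_wrapper <- jhu_csse_daily_subset %>%
  group_by(geo_value) %>%
  epi_slide_sum(cases, new_col_names = "cases_7dsum", names_sep = NULL, before = 6)
via_opt <- jhu_csse_daily_subset %>%
  group_by(geo_value) %>%
  epi_slide_opt(cases,
    f = data.table::frollsum,
    new_col_names = "cases_7dsum", names_sep = NULL, before = 6
  )
identical(via_wrapper, via_opt) # expected: TRUE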
#' -#' @param x The `epi_df` object under consideration, [grouped][dplyr::group_by] -#' or ungrouped. If ungrouped, all data in `x` will be treated as part of a -#' single data group. -#' @param col_names A single tidyselection or a tidyselection vector of the -#' names of one or more columns for which to calculate the rolling mean. #' @param ... Additional arguments to pass to `data.table::frollsum`, for #' example, `na.rm` and `algo`. `data.table::frollsum` is automatically #' passed the data `x` to operate on, the window size `n`, and the alignment #' `align`. Providing these args via `...` will cause an error. -#' @param before,after How far `before` and `after` each `ref_time_value` should -#' the sliding window extend? At least one of these two arguments must be -#' provided; the other's default will be 0. Any value provided for either -#' argument must be a single, non-`NA`, non-negative, -#' [integer-compatible][vctrs::vec_cast] number of time steps. Endpoints of -#' the window are inclusive. Common settings: -#' * For trailing/right-aligned windows from `ref_time_value - time_step -#' (k)` to `ref_time_value`: either pass `before=k` by itself, or pass -#' `before=k, after=0`. -#' * For center-aligned windows from `ref_time_value - time_step(k)` to -#' `ref_time_value + time_step(k)`: pass `before=k, after=k`. -#' * For leading/left-aligned windows from `ref_time_value` to -#' `ref_time_value + time_step(k)`: either pass pass `after=k` by itself, -#' or pass `before=0, after=k`. -#' See "Details:" about the definition of a time step,(non)treatment of -#' missing rows within the window, and avoiding warnings about -#' `before`&`after` settings for a certain uncommon use case. -#' @param ref_time_values Time values for sliding computations, meaning, each -#' element of this vector serves as the reference time point for one sliding -#' window. If missing, then this will be set to all unique time values in the -#' underlying data table, by default. -#' @param time_step Optional function used to define the meaning of one time -#' step, which if specified, overrides the default choice based on the -#' `time_value` column. This function must take a non-negative integer and -#' return an object of class `lubridate::period`. For example, we can use -#' `time_step = lubridate::hours` in order to set the time step to be one hour -#' (this would only be meaningful if `time_value` is of class `POSIXct`). -#' @param new_col_names String indicating the name of the new column that will -#' contain the derivative values. Default is "slide_value"; note that setting -#' `new_col_names` equal to an existing column name will overwrite this column. -#' @param as_list_col Not supported. Included to match `epi_slide` interface. -#' @param names_sep String specifying the separator to use in `tidyr::unnest()` -#' when `as_list_col = FALSE`. Default is "_". Using `NULL` drops the prefix -#' from `new_col_names` entirely. -#' @param all_rows If `all_rows = TRUE`, then all rows of `x` will be kept in -#' the output even with `ref_time_values` provided, with some type of missing -#' value marker for the slide computation output column(s) for `time_value`s -#' outside `ref_time_values`; otherwise, there will be one row for each row in -#' `x` that had a `time_value` in `ref_time_values`. Default is `FALSE`. The -#' missing value marker is the result of `vctrs::vec_cast`ing `NA` to the type -#' of the slide computation output. 
-#' @return An `epi_df` object given by appending one or more new columns to -#' `x`, depending on the `col_names` argument, named according to the -#' `new_col_names` argument. -#' -#' @details To "slide" means to apply a function or formula over a rolling -#' window of time steps for each data group, where the window is entered at a -#' reference time and left and right endpoints are given by the `before` and -#' `after` arguments. The unit (the meaning of one time step) is implicitly -#' defined by the way the `time_value` column treats addition and subtraction; -#' for example, if the time values are coded as `Date` objects, then one time -#' step is one day, since `as.Date("2022-01-01") + 1` equals -#' `as.Date("2022-01-02")`. Alternatively, the time step can be set explicitly -#' using the `time_step` argument (which if specified would override the -#' default choice based on `time_value` column). If there are not enough time -#' steps available to complete the window at any given reference time, then -#' `epi_slide()` still attempts to perform the computation anyway (it does not -#' require a complete window). The issue of what to do with partial -#' computations (those run on incomplete windows) is therefore left up to the -#' user, either through the specified function or formula `f`, or through -#' post-processing. For a centrally-aligned slide of `n` `time_value`s in a -#' sliding window, set `before = (n-1)/2` and `after = (n-1)/2` when the -#' number of `time_value`s in a sliding window is odd and `before = n/2-1` and -#' `after = n/2` when `n` is even. -#' -#' Sometimes, we want to experiment with various trailing or leading window -#' widths and compare the slide outputs. In the (uncommon) case where -#' zero-width windows are considered, manually pass both the `before` and -#' `after` arguments in order to prevent potential warnings. (E.g., `before=k` -#' with `k=0` and `after` missing may produce a warning. To avoid warnings, -#' use `before=k, after=0` instead; otherwise, it looks too much like a -#' leading window was intended, but the `after` argument was forgotten or -#' misspelled.) +#' @inheritParams epi_slide_opt +#' @inherit epi_slide_opt return +#' @inherit epi_slide_opt details #' #' @export #' @seealso [`epi_slide`] [`epi_slide_opt`] [`epi_slide_mean`] diff --git a/man/epi_slide_mean.Rd b/man/epi_slide_mean.Rd index 9979c1e3..19b6fcec 100644 --- a/man/epi_slide_mean.Rd +++ b/man/epi_slide_mean.Rd @@ -58,7 +58,7 @@ underlying data table, by default.} \item{time_step}{Optional function used to define the meaning of one time step, which if specified, overrides the default choice based on the \code{time_value} column. This function must take a non-negative integer and -return an object of class \code{lubridate::period}. For example, we can use +return an object of class \link[lubridate:period]{lubridate::period}. For example, we can use \code{time_step = lubridate::hours} in order to set the time step to be one hour (this would only be meaningful if \code{time_value} is of class \code{POSIXct}).} @@ -91,35 +91,6 @@ examples. } \details{ Wrapper around \code{epi_slide_opt} with \code{f = datatable::frollmean}. - -To "slide" means to apply a function or formula over a rolling -window of time steps for each data group, where the window is entered at a -reference time and left and right endpoints are given by the \code{before} and -\code{after} arguments. 
The unit (the meaning of one time step) is implicitly -defined by the way the \code{time_value} column treats addition and subtraction; -for example, if the time values are coded as \code{Date} objects, then one time -step is one day, since \code{as.Date("2022-01-01") + 1} equals -\code{as.Date("2022-01-02")}. Alternatively, the time step can be set explicitly -using the \code{time_step} argument (which if specified would override the -default choice based on \code{time_value} column). If there are not enough time -steps available to complete the window at any given reference time, then -\code{epi_slide()} still attempts to perform the computation anyway (it does not -require a complete window). The issue of what to do with partial -computations (those run on incomplete windows) is therefore left up to the -user, either through the specified function or formula \code{f}, or through -post-processing. For a centrally-aligned slide of \code{n} \code{time_value}s in a -sliding window, set \code{before = (n-1)/2} and \code{after = (n-1)/2} when the -number of \code{time_value}s in a sliding window is odd and \code{before = n/2-1} and -\code{after = n/2} when \code{n} is even. - -Sometimes, we want to experiment with various trailing or leading window -widths and compare the slide outputs. In the (uncommon) case where -zero-width windows are considered, manually pass both the \code{before} and -\code{after} arguments in order to prevent potential warnings. (E.g., \code{before=k} -with \code{k=0} and \code{after} missing may produce a warning. To avoid warnings, -use \verb{before=k, after=0} instead; otherwise, it looks too much like a -leading window was intended, but the \code{after} argument was forgotten or -misspelled.) } \examples{ # slide a 7-day trailing average formula on cases diff --git a/man/epi_slide_opt.Rd b/man/epi_slide_opt.Rd index 202216ce..bc45dbea 100644 --- a/man/epi_slide_opt.Rd +++ b/man/epi_slide_opt.Rd @@ -29,11 +29,11 @@ names of one or more columns for which to calculate the rolling mean.} \item{f}{Function; together with \code{...} specifies the computation to slide. \code{f} must be one of \code{data.table}'s rolling functions -(\code{frollmean}, \code{frollsum}, \code{frollapply}. See \code{?data.table::roll}) or one +(\code{frollmean}, \code{frollsum}, \code{frollapply}. See \link[data.table:froll]{data.table::roll}) or one of \code{slider}'s specialized sliding functions (\code{slide_mean}, \code{slide_sum}, -etc. See \verb{?slider::\\}summary-slide\``). To "slide" means to apply a +etc. See \link[slider:summary-slide]{slider::summary-slide}). To "slide" means to apply a computation within a sliding (a.k.a. "rolling") time window for each data -group. The window is determined by the \code{before} and `after` parameters +group. The window is determined by the \code{before} and \code{after} parameters described below. One time step is typically one day or one week; see details for more explanation.} @@ -72,7 +72,7 @@ underlying data table, by default.} \item{time_step}{Optional function used to define the meaning of one time step, which if specified, overrides the default choice based on the \code{time_value} column. This function must take a non-negative integer and -return an object of class \code{lubridate::period}. For example, we can use +return an object of class \link[lubridate:period]{lubridate::period}. 
For example, we can use \code{time_step = lubridate::hours} in order to set the time step to be one hour (this would only be meaningful if \code{time_value} is of class \code{POSIXct}).} @@ -138,7 +138,8 @@ misspelled.) jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% epi_slide_opt( - cases, f = data.table::frollmean, new_col_names = "cases_7dav", names_sep = NULL, before = 6 + cases, + f = data.table::frollmean, new_col_names = "cases_7dav", names_sep = NULL, before = 6 ) \%>\% # Remove a nonessential var. to ensure new col is printed dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% @@ -160,7 +161,8 @@ jhu_csse_daily_subset \%>\% jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% epi_slide_opt( - cases, f = slider::slide_mean, new_col_names = "cases_7dav", names_sep = NULL, after = 6 + cases, + f = slider::slide_mean, new_col_names = "cases_7dav", names_sep = NULL, after = 6 ) \%>\% # Remove a nonessential var. to ensure new col is printed dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% @@ -170,7 +172,8 @@ jhu_csse_daily_subset \%>\% jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% epi_slide_opt( - cases, f = data.table::frollsum, new_col_names = "cases_7dav", names_sep = NULL, before = 3, after = 3 + cases, + f = data.table::frollsum, new_col_names = "cases_7dav", names_sep = NULL, before = 3, after = 3 ) \%>\% # Remove a nonessential var. to ensure new col is printed dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% diff --git a/man/epi_slide_sum.Rd b/man/epi_slide_sum.Rd index d87e5f29..001bafdf 100644 --- a/man/epi_slide_sum.Rd +++ b/man/epi_slide_sum.Rd @@ -58,7 +58,7 @@ underlying data table, by default.} \item{time_step}{Optional function used to define the meaning of one time step, which if specified, overrides the default choice based on the \code{time_value} column. This function must take a non-negative integer and -return an object of class \code{lubridate::period}. For example, we can use +return an object of class \link[lubridate:period]{lubridate::period}. For example, we can use \code{time_step = lubridate::hours} in order to set the time step to be one hour (this would only be meaningful if \code{time_value} is of class \code{POSIXct}).} @@ -91,35 +91,6 @@ examples. } \details{ Wrapper around \code{epi_slide_opt} with \code{f = datatable::frollsum}. - -To "slide" means to apply a function or formula over a rolling -window of time steps for each data group, where the window is entered at a -reference time and left and right endpoints are given by the \code{before} and -\code{after} arguments. The unit (the meaning of one time step) is implicitly -defined by the way the \code{time_value} column treats addition and subtraction; -for example, if the time values are coded as \code{Date} objects, then one time -step is one day, since \code{as.Date("2022-01-01") + 1} equals -\code{as.Date("2022-01-02")}. Alternatively, the time step can be set explicitly -using the \code{time_step} argument (which if specified would override the -default choice based on \code{time_value} column). If there are not enough time -steps available to complete the window at any given reference time, then -\code{epi_slide()} still attempts to perform the computation anyway (it does not -require a complete window). The issue of what to do with partial -computations (those run on incomplete windows) is therefore left up to the -user, either through the specified function or formula \code{f}, or through -post-processing. 
For a centrally-aligned slide of \code{n} \code{time_value}s in a -sliding window, set \code{before = (n-1)/2} and \code{after = (n-1)/2} when the -number of \code{time_value}s in a sliding window is odd and \code{before = n/2-1} and -\code{after = n/2} when \code{n} is even. - -Sometimes, we want to experiment with various trailing or leading window -widths and compare the slide outputs. In the (uncommon) case where -zero-width windows are considered, manually pass both the \code{before} and -\code{after} arguments in order to prevent potential warnings. (E.g., \code{before=k} -with \code{k=0} and \code{after} missing may produce a warning. To avoid warnings, -use \verb{before=k, after=0} instead; otherwise, it looks too much like a -leading window was intended, but the \code{after} argument was forgotten or -misspelled.) } \examples{ # slide a 7-day trailing sum formula on cases From 1385d5976052bb0854cb52f4388c64d5c4e6d8a1 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Tue, 26 Mar 2024 18:02:15 -0400 Subject: [PATCH 220/345] tests --- tests/testthat/test-epi_slide.R | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tests/testthat/test-epi_slide.R b/tests/testthat/test-epi_slide.R index f727337c..baf3bfc7 100644 --- a/tests/testthat/test-epi_slide.R +++ b/tests/testthat/test-epi_slide.R @@ -1386,3 +1386,29 @@ test_that("`epi_slide_mean` errors when passed `col_names` as list", { class = "epiprocess__epi_slide_mean__col_names_in_list" ) }) + +test_that("epi_slide_mean produces same output as epi_slide_opt", { + result1 <- epi_slide_mean(small_x, value, before = 50, names_sep = NULL, na.rm = TRUE) + result2 <- epi_slide_opt(small_x, value, f = data.table::frollmean, + before = 50, names_sep = NULL, na.rm = TRUE) + expect_identical(result1, result2) + + # # Theoretically should work, but getting "Error: C stack usage 7973092 is + # too close to the limit" (recursion is too deep) + # result3 <- epi_slide_opt(small_x, value, f = slider::slide_mean, + # before = 50, names_sep = NULL, na_rm = TRUE) + # expect_equal(result1, result3) +}) + +test_that("epi_slide_sum produces same output as epi_slide_opt", { + result1 <- epi_slide_sum(small_x, value, before = 50, names_sep = NULL, na.rm = TRUE) + result2 <- epi_slide_opt(small_x, value, f = data.table::frollsum, + before = 50, names_sep = NULL, na.rm = TRUE) + expect_identical(result1, result2) + + # # Theoretically should work, but getting "Error: C stack usage 7973092 is + # too close to the limit" (recursion is too deep) + # result3 <- epi_slide_opt(small_x, value, f = slider::slide_sum, + # before = 50, names_sep = NULL, na_rm = TRUE) + # expect_equal(result1, result3) +}) From cd311ded5b07afd9049fac510548e15fa39cf049 Mon Sep 17 00:00:00 2001 From: nmdefries Date: Tue, 26 Mar 2024 22:16:05 +0000 Subject: [PATCH 221/345] style: styler (GHA) --- R/slide.R | 21 ++++++++++++--------- tests/testthat/test-epi_slide.R | 12 ++++++++---- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/R/slide.R b/R/slide.R index 4944580b..8bf6e1aa 100644 --- a/R/slide.R +++ b/R/slide.R @@ -499,7 +499,8 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% #' epi_slide_opt( -#' cases, f = data.table::frollmean, new_col_names = "cases_7dav", names_sep = NULL, before = 6 +#' cases, +#' f = data.table::frollmean, new_col_names = "cases_7dav", names_sep = NULL, before = 6 #' ) %>% #' # Remove a nonessential var. 
to ensure new col is printed #' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% @@ -521,7 +522,8 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% #' epi_slide_opt( -#' cases, f = slider::slide_mean, new_col_names = "cases_7dav", names_sep = NULL, after = 6 +#' cases, +#' f = slider::slide_mean, new_col_names = "cases_7dav", names_sep = NULL, after = 6 #' ) %>% #' # Remove a nonessential var. to ensure new col is printed #' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% @@ -531,15 +533,16 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% #' epi_slide_opt( -#' cases, f = data.table::frollsum, new_col_names = "cases_7dav", names_sep = NULL, before = 3, after = 3 +#' cases, +#' f = data.table::frollsum, new_col_names = "cases_7dav", names_sep = NULL, before = 3, after = 3 #' ) %>% #' # Remove a nonessential var. to ensure new col is printed #' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% #' ungroup() epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values, - time_step, - new_col_names = "slide_value", as_list_col = NULL, - names_sep = "_", all_rows = FALSE) { + time_step, + new_col_names = "slide_value", as_list_col = NULL, + names_sep = "_", all_rows = FALSE) { assert_class(x, "epi_df") if (nrow(x) == 0L) { @@ -892,9 +895,9 @@ epi_slide_mean <- function(x, col_names, ..., before, after, ref_time_values, #' dplyr::select(geo_value, time_value, cases, cases_7dsum) %>% #' ungroup() epi_slide_sum <- function(x, col_names, ..., before, after, ref_time_values, - time_step, - new_col_names = "slide_value", as_list_col = NULL, - names_sep = "_", all_rows = FALSE) { + time_step, + new_col_names = "slide_value", as_list_col = NULL, + names_sep = "_", all_rows = FALSE) { epi_slide_opt( x = x, col_names = {{ col_names }}, diff --git a/tests/testthat/test-epi_slide.R b/tests/testthat/test-epi_slide.R index baf3bfc7..53dcd426 100644 --- a/tests/testthat/test-epi_slide.R +++ b/tests/testthat/test-epi_slide.R @@ -1389,8 +1389,10 @@ test_that("`epi_slide_mean` errors when passed `col_names` as list", { test_that("epi_slide_mean produces same output as epi_slide_opt", { result1 <- epi_slide_mean(small_x, value, before = 50, names_sep = NULL, na.rm = TRUE) - result2 <- epi_slide_opt(small_x, value, f = data.table::frollmean, - before = 50, names_sep = NULL, na.rm = TRUE) + result2 <- epi_slide_opt(small_x, value, + f = data.table::frollmean, + before = 50, names_sep = NULL, na.rm = TRUE + ) expect_identical(result1, result2) # # Theoretically should work, but getting "Error: C stack usage 7973092 is @@ -1402,8 +1404,10 @@ test_that("epi_slide_mean produces same output as epi_slide_opt", { test_that("epi_slide_sum produces same output as epi_slide_opt", { result1 <- epi_slide_sum(small_x, value, before = 50, names_sep = NULL, na.rm = TRUE) - result2 <- epi_slide_opt(small_x, value, f = data.table::frollsum, - before = 50, names_sep = NULL, na.rm = TRUE) + result2 <- epi_slide_opt(small_x, value, + f = data.table::frollsum, + before = 50, names_sep = NULL, na.rm = TRUE + ) expect_identical(result1, result2) # # Theoretically should work, but getting "Error: C stack usage 7973092 is From 15949f3018437017468b5d7f64641ca4895f1025 Mon Sep 17 00:00:00 2001 From: nmdefries Date: Tue, 26 Mar 2024 22:16:43 +0000 Subject: [PATCH 222/345] docs: document (GHA) --- man/epi_slide_opt.Rd | 9 +++------ 1 file changed, 3 
insertions(+), 6 deletions(-) diff --git a/man/epi_slide_opt.Rd b/man/epi_slide_opt.Rd index bc45dbea..6e945f0d 100644 --- a/man/epi_slide_opt.Rd +++ b/man/epi_slide_opt.Rd @@ -138,8 +138,7 @@ misspelled.) jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% epi_slide_opt( - cases, - f = data.table::frollmean, new_col_names = "cases_7dav", names_sep = NULL, before = 6 + cases, f = data.table::frollmean, new_col_names = "cases_7dav", names_sep = NULL, before = 6 ) \%>\% # Remove a nonessential var. to ensure new col is printed dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% @@ -161,8 +160,7 @@ jhu_csse_daily_subset \%>\% jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% epi_slide_opt( - cases, - f = slider::slide_mean, new_col_names = "cases_7dav", names_sep = NULL, after = 6 + cases, f = slider::slide_mean, new_col_names = "cases_7dav", names_sep = NULL, after = 6 ) \%>\% # Remove a nonessential var. to ensure new col is printed dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% @@ -172,8 +170,7 @@ jhu_csse_daily_subset \%>\% jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% epi_slide_opt( - cases, - f = data.table::frollsum, new_col_names = "cases_7dav", names_sep = NULL, before = 3, after = 3 + cases, f = data.table::frollsum, new_col_names = "cases_7dav", names_sep = NULL, before = 3, after = 3 ) \%>\% # Remove a nonessential var. to ensure new col is printed dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% From c551da172a33976f214da60c442ea4128ebc9998 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Tue, 26 Mar 2024 18:20:22 -0400 Subject: [PATCH 223/345] version, news, pkgdown --- DESCRIPTION | 2 +- NEWS.md | 5 +++++ R/slide.R | 2 +- _pkgdown.yml | 2 ++ 4 files changed, 9 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index c737e0c6..3c6bbb16 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: epiprocess Title: Tools for basic signal processing in epidemiology -Version: 0.7.6 +Version: 0.7.7 Authors@R: c( person("Jacob", "Bien", role = "ctb"), person("Logan", "Brooks", email = "lcbrooks@andrew.cmu.edu", role = c("aut", "cre")), diff --git a/NEWS.md b/NEWS.md index 908302f2..b9b2f60b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -11,6 +11,11 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat calculated (#397). - Add new `epi_slide_mean` function to allow much (~30x) faster rolling average computations in some cases (#400). +- Add new `epi_slide_sum` function to allow much faster rolling sum + computations in some cases (#433). +- Add new `epi_slide_opt` function to allow much faster rolling computations + in some cases, using `data.table` and `slider` optimized rolling functions + (#433). - regenerated the `jhu_csse_daily_subset` dataset with the latest versions of the data from the API - changed approach to versioning, see DEVELOPMENT.md for details diff --git a/R/slide.R b/R/slide.R index 8bf6e1aa..ad8d2cb9 100644 --- a/R/slide.R +++ b/R/slide.R @@ -935,7 +935,7 @@ full_date_seq <- function(x, before, after, time_step) { # `tsibble` classes apparently can't be added to in different units, so even # if `time_step` is provided by the user, use a value-1 unitless step. 
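# An illustrative aside on the comment above (hedged; it relies on
# `tsibble`'s documented arithmetic, not on anything in this patch):
# adding an integer advances these classes by one period, so a plain
# value-1 step walks consecutive months/weeks/quarters, e.g.
#   tsibble::yearmonth("2020 Jan") + 1   # one month later, "2020 Feb"
#   seq(tsibble::yearmonth("2020 Jan"), tsibble::yearmonth("2020 Apr"), by = 1L)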
if (inherits(x$time_value, c("yearquarter", "yearweek", "yearmonth")) || - is.numeric(x$time_value)) { + is.numeric(x$time_value)) { all_dates <- seq(min(x$time_value), max(x$time_value), by = 1L) if (before != 0) { diff --git a/_pkgdown.yml b/_pkgdown.yml index bc7dd779..1daef5a0 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -68,6 +68,8 @@ reference: - contents: - epi_slide - epi_slide_mean + - epi_slide_sum + - epi_slide_opt - epi_cor - title: Vector functions desc: Functions that act directly on signal variables. From c684ee54bf2fc2b52f3a02418a1b360bf5b8bb63 Mon Sep 17 00:00:00 2001 From: nmdefries Date: Tue, 26 Mar 2024 22:22:41 +0000 Subject: [PATCH 224/345] style: styler (GHA) --- R/slide.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/slide.R b/R/slide.R index ad8d2cb9..8bf6e1aa 100644 --- a/R/slide.R +++ b/R/slide.R @@ -935,7 +935,7 @@ full_date_seq <- function(x, before, after, time_step) { # `tsibble` classes apparently can't be added to in different units, so even # if `time_step` is provided by the user, use a value-1 unitless step. if (inherits(x$time_value, c("yearquarter", "yearweek", "yearmonth")) || - is.numeric(x$time_value)) { + is.numeric(x$time_value)) { all_dates <- seq(min(x$time_value), max(x$time_value), by = 1L) if (before != 0) { From 5a240e3b362ed9bdad2e87887f0921cce60bab28 Mon Sep 17 00:00:00 2001 From: nmdefries Date: Tue, 26 Mar 2024 22:22:54 +0000 Subject: [PATCH 225/345] docs: document (GHA) --- man/epi_slide_opt.Rd | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/man/epi_slide_opt.Rd b/man/epi_slide_opt.Rd index 6e945f0d..bc45dbea 100644 --- a/man/epi_slide_opt.Rd +++ b/man/epi_slide_opt.Rd @@ -138,7 +138,8 @@ misspelled.) jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% epi_slide_opt( - cases, f = data.table::frollmean, new_col_names = "cases_7dav", names_sep = NULL, before = 6 + cases, + f = data.table::frollmean, new_col_names = "cases_7dav", names_sep = NULL, before = 6 ) \%>\% # Remove a nonessential var. to ensure new col is printed dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% @@ -160,7 +161,8 @@ jhu_csse_daily_subset \%>\% jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% epi_slide_opt( - cases, f = slider::slide_mean, new_col_names = "cases_7dav", names_sep = NULL, after = 6 + cases, + f = slider::slide_mean, new_col_names = "cases_7dav", names_sep = NULL, after = 6 ) \%>\% # Remove a nonessential var. to ensure new col is printed dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% @@ -170,7 +172,8 @@ jhu_csse_daily_subset \%>\% jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% epi_slide_opt( - cases, f = data.table::frollsum, new_col_names = "cases_7dav", names_sep = NULL, before = 3, after = 3 + cases, + f = data.table::frollsum, new_col_names = "cases_7dav", names_sep = NULL, before = 3, after = 3 ) \%>\% # Remove a nonessential var. 
to ensure new col is printed dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% From 240b517967ecf5ae004c49a3435eb9adb456ffad Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Tue, 26 Mar 2024 18:31:34 -0400 Subject: [PATCH 226/345] opt examples --- R/slide.R | 23 ++++++++++++----------- man/epi_slide_opt.Rd | 23 ++++++++++++----------- 2 files changed, 24 insertions(+), 22 deletions(-) diff --git a/R/slide.R b/R/slide.R index 8bf6e1aa..575f3459 100644 --- a/R/slide.R +++ b/R/slide.R @@ -510,7 +510,7 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' # and accuracy, and to allow partially-missing windows. #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide_opt(cases, +#' epi_slide_opt(cases, f = data.table::frollmean, #' new_col_names = "cases_7dav", names_sep = NULL, before = 6, #' # `frollmean` options #' na.rm = TRUE, algo = "exact", hasNA = TRUE @@ -518,16 +518,17 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% #' ungroup() #' -#' # slide a 7-day leading average -#' jhu_csse_daily_subset %>% -#' group_by(geo_value) %>% -#' epi_slide_opt( -#' cases, -#' f = slider::slide_mean, new_col_names = "cases_7dav", names_sep = NULL, after = 6 -#' ) %>% -#' # Remove a nonessential var. to ensure new col is printed -#' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% -#' ungroup() +#' # # slide a 7-day leading average. Theoretically works, but `slider::slide_mean` is +#' # # having a recursion issue. +#' # jhu_csse_daily_subset %>% +#' # group_by(geo_value) %>% +#' # epi_slide_opt( +#' # cases, +#' # f = slider::slide_mean, new_col_names = "cases_7dav", names_sep = NULL, after = 6 +#' # ) %>% +#' # # Remove a nonessential var. to ensure new col is printed +#' # dplyr::select(geo_value, time_value, cases, cases_7dav) %>% +#' # ungroup() #' #' # slide a 7-day centre-aligned sum. This can also be done with `epi_slide_sum` #' jhu_csse_daily_subset %>% diff --git a/man/epi_slide_opt.Rd b/man/epi_slide_opt.Rd index bc45dbea..e4df0741 100644 --- a/man/epi_slide_opt.Rd +++ b/man/epi_slide_opt.Rd @@ -149,7 +149,7 @@ jhu_csse_daily_subset \%>\% # and accuracy, and to allow partially-missing windows. jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide_opt(cases, + epi_slide_opt(cases, f = data.table::frollmean, new_col_names = "cases_7dav", names_sep = NULL, before = 6, # `frollmean` options na.rm = TRUE, algo = "exact", hasNA = TRUE @@ -157,16 +157,17 @@ jhu_csse_daily_subset \%>\% dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% ungroup() -# slide a 7-day leading average -jhu_csse_daily_subset \%>\% - group_by(geo_value) \%>\% - epi_slide_opt( - cases, - f = slider::slide_mean, new_col_names = "cases_7dav", names_sep = NULL, after = 6 - ) \%>\% - # Remove a nonessential var. to ensure new col is printed - dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% - ungroup() +# # slide a 7-day leading average. Theoretically works, but `slider::slide_mean` is +# # having a recursion issue. +# jhu_csse_daily_subset \%>\% +# group_by(geo_value) \%>\% +# epi_slide_opt( +# cases, +# f = slider::slide_mean, new_col_names = "cases_7dav", names_sep = NULL, after = 6 +# ) \%>\% +# # Remove a nonessential var. to ensure new col is printed +# dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% +# ungroup() # slide a 7-day centre-aligned sum. 
This can also be done with `epi_slide_sum` jhu_csse_daily_subset \%>\% From fd2ed62b3eeda4bc63ae1539fa5dc521a43e8738 Mon Sep 17 00:00:00 2001 From: nmdefries Date: Tue, 26 Mar 2024 22:35:27 +0000 Subject: [PATCH 227/345] style: styler (GHA) --- R/slide.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/slide.R b/R/slide.R index 575f3459..f9c61059 100644 --- a/R/slide.R +++ b/R/slide.R @@ -510,7 +510,8 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' # and accuracy, and to allow partially-missing windows. #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide_opt(cases, f = data.table::frollmean, +#' epi_slide_opt(cases, +#' f = data.table::frollmean, #' new_col_names = "cases_7dav", names_sep = NULL, before = 6, #' # `frollmean` options #' na.rm = TRUE, algo = "exact", hasNA = TRUE From fc75a04ba95428797e7abbffdbc3aae7aa4dd0f2 Mon Sep 17 00:00:00 2001 From: nmdefries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 27 Mar 2024 12:27:23 -0400 Subject: [PATCH 228/345] use identical to check computation function f validity Co-authored-by: David Weber --- R/slide.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/slide.R b/R/slide.R index f9c61059..08b755e5 100644 --- a/R/slide.R +++ b/R/slide.R @@ -573,14 +573,14 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values, if (any(sapply( c(frollmean, frollsum, frollapply), function(roll_fn) { - isTRUE(all.equal(f, roll_fn)) + isTRUE(identical(f, roll_fn)) } ))) { f_from_package <- "data.table" } else if (any(sapply( c(slide_sum, slide_prod, slide_mean, slide_min, slide_max, slide_all, slide_any), function(roll_fn) { - isTRUE(all.equal(f, roll_fn)) + isTRUE(identical(f, roll_fn)) } ))) { f_from_package <- "slider" From 96886f05db4a982be5e1d92ab7ee261ff1cf7d80 Mon Sep 17 00:00:00 2001 From: nmdefries Date: Wed, 27 Mar 2024 16:29:03 +0000 Subject: [PATCH 229/345] docs: document (GHA) --- man/epi_slide_opt.Rd | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/man/epi_slide_opt.Rd b/man/epi_slide_opt.Rd index e4df0741..a2a71482 100644 --- a/man/epi_slide_opt.Rd +++ b/man/epi_slide_opt.Rd @@ -149,7 +149,8 @@ jhu_csse_daily_subset \%>\% # and accuracy, and to allow partially-missing windows. jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide_opt(cases, f = data.table::frollmean, + epi_slide_opt(cases, + f = data.table::frollmean, new_col_names = "cases_7dav", names_sep = NULL, before = 6, # `frollmean` options na.rm = TRUE, algo = "exact", hasNA = TRUE From e673cd932f69a7f44d131021bb85fe2f85efedfe Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 27 Mar 2024 12:33:04 -0400 Subject: [PATCH 230/345] allow slider tests to run now that recursion error is fixed --- R/slide.R | 21 ++++++++++----------- man/epi_slide_opt.Rd | 21 ++++++++++----------- tests/testthat/test-epi_slide.R | 16 ++++++---------- 3 files changed, 26 insertions(+), 32 deletions(-) diff --git a/R/slide.R b/R/slide.R index 08b755e5..33abf242 100644 --- a/R/slide.R +++ b/R/slide.R @@ -519,17 +519,16 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% #' ungroup() #' -#' # # slide a 7-day leading average. Theoretically works, but `slider::slide_mean` is -#' # # having a recursion issue. 
-#' # jhu_csse_daily_subset %>% -#' # group_by(geo_value) %>% -#' # epi_slide_opt( -#' # cases, -#' # f = slider::slide_mean, new_col_names = "cases_7dav", names_sep = NULL, after = 6 -#' # ) %>% -#' # # Remove a nonessential var. to ensure new col is printed -#' # dplyr::select(geo_value, time_value, cases, cases_7dav) %>% -#' # ungroup() +#' # slide a 7-day leading average +#' jhu_csse_daily_subset %>% +#' group_by(geo_value) %>% +#' epi_slide_opt( +#' cases, +#' f = slider::slide_mean, new_col_names = "cases_7dav", names_sep = NULL, after = 6 +#' ) %>% +#' # Remove a nonessential var. to ensure new col is printed +#' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% +#' ungroup() #' #' # slide a 7-day centre-aligned sum. This can also be done with `epi_slide_sum` #' jhu_csse_daily_subset %>% diff --git a/man/epi_slide_opt.Rd b/man/epi_slide_opt.Rd index a2a71482..1c2e690d 100644 --- a/man/epi_slide_opt.Rd +++ b/man/epi_slide_opt.Rd @@ -158,17 +158,16 @@ jhu_csse_daily_subset \%>\% dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% ungroup() -# # slide a 7-day leading average. Theoretically works, but `slider::slide_mean` is -# # having a recursion issue. -# jhu_csse_daily_subset \%>\% -# group_by(geo_value) \%>\% -# epi_slide_opt( -# cases, -# f = slider::slide_mean, new_col_names = "cases_7dav", names_sep = NULL, after = 6 -# ) \%>\% -# # Remove a nonessential var. to ensure new col is printed -# dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% -# ungroup() +# slide a 7-day leading average +jhu_csse_daily_subset \%>\% + group_by(geo_value) \%>\% + epi_slide_opt( + cases, + f = slider::slide_mean, new_col_names = "cases_7dav", names_sep = NULL, after = 6 + ) \%>\% + # Remove a nonessential var. to ensure new col is printed + dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% + ungroup() # slide a 7-day centre-aligned sum. 
This can also be done with `epi_slide_sum` jhu_csse_daily_subset \%>\% diff --git a/tests/testthat/test-epi_slide.R b/tests/testthat/test-epi_slide.R index 53dcd426..4d221b9b 100644 --- a/tests/testthat/test-epi_slide.R +++ b/tests/testthat/test-epi_slide.R @@ -1395,11 +1395,9 @@ test_that("epi_slide_mean produces same output as epi_slide_opt", { ) expect_identical(result1, result2) - # # Theoretically should work, but getting "Error: C stack usage 7973092 is - # too close to the limit" (recursion is too deep) - # result3 <- epi_slide_opt(small_x, value, f = slider::slide_mean, - # before = 50, names_sep = NULL, na_rm = TRUE) - # expect_equal(result1, result3) + result3 <- epi_slide_opt(small_x, value, f = slider::slide_mean, + before = 50, names_sep = NULL, na_rm = TRUE) + expect_equal(result1, result3) }) test_that("epi_slide_sum produces same output as epi_slide_opt", { @@ -1410,9 +1408,7 @@ test_that("epi_slide_sum produces same output as epi_slide_opt", { ) expect_identical(result1, result2) - # # Theoretically should work, but getting "Error: C stack usage 7973092 is - # too close to the limit" (recursion is too deep) - # result3 <- epi_slide_opt(small_x, value, f = slider::slide_sum, - # before = 50, names_sep = NULL, na_rm = TRUE) - # expect_equal(result1, result3) + result3 <- epi_slide_opt(small_x, value, f = slider::slide_sum, + before = 50, names_sep = NULL, na_rm = TRUE) + expect_equal(result1, result3) }) From 06cbe7abf55ae8729db0cabe47053247eeeffc31 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 27 Mar 2024 13:28:22 -0400 Subject: [PATCH 231/345] document why epi_slide_opt and not epi_slide(frollmean) --- R/slide.R | 7 +++++++ man/epi_slide_opt.Rd | 9 ++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/R/slide.R b/R/slide.R index 33abf242..2e07d502 100644 --- a/R/slide.R +++ b/R/slide.R @@ -404,6 +404,13 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' group. The window is determined by the `before` and `after` parameters #' described below. One time step is typically one day or one week; see #' details for more explanation. +#' +#' The optimized `data.table` and `slider` functions can't be directly passed +#' as the computation function in `epi_slide` without careful handling to +#' make sure each computation group is made up of the `n` dates rather than +#' `n` points. `epi_slide_opt` (and wrapper functions `epi_slide_mean` and +#' `epi_slide_sum`) take care of window completion automatically to prevent +#' associated errors. #' @param ... Additional arguments to pass to the slide computation `f`, for #' example, `na.rm` and `algo` if `f` is a `data.table` function. If `f` is #' a `data.table` function, it is automatically passed the data `x` to diff --git a/man/epi_slide_opt.Rd b/man/epi_slide_opt.Rd index 1c2e690d..9c9cf6c4 100644 --- a/man/epi_slide_opt.Rd +++ b/man/epi_slide_opt.Rd @@ -35,7 +35,14 @@ etc. See \link[slider:summary-slide]{slider::summary-slide}). To "slide" means t computation within a sliding (a.k.a. "rolling") time window for each data group. The window is determined by the \code{before} and \code{after} parameters described below. One time step is typically one day or one week; see -details for more explanation.} +details for more explanation. 
+ +The optimized \code{data.table} and \code{slider} functions can't be directly passed +as the computation function in \code{epi_slide} without careful handling to +make sure each computation group is made up of the \code{n} dates rather than +\code{n} points. \code{epi_slide_opt} (and wrapper functions \code{epi_slide_mean} and +\code{epi_slide_sum}) take care of window completion automatically to prevent +associated errors.} \item{...}{Additional arguments to pass to the slide computation \code{f}, for example, \code{na.rm} and \code{algo} if \code{f} is a \code{data.table} function. If \code{f} is From 0683b7cf78a6814bebc798cd106c7fd299224903 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 27 Mar 2024 13:36:52 -0400 Subject: [PATCH 232/345] test passing invalid computation functions --- tests/testthat/test-epi_slide.R | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/tests/testthat/test-epi_slide.R b/tests/testthat/test-epi_slide.R index 4d221b9b..3cfebd91 100644 --- a/tests/testthat/test-epi_slide.R +++ b/tests/testthat/test-epi_slide.R @@ -1412,3 +1412,34 @@ test_that("epi_slide_sum produces same output as epi_slide_opt", { before = 50, names_sep = NULL, na_rm = TRUE) expect_equal(result1, result3) }) + +test_that("`epi_slide_opt` errors when passed non-`data.table`, non-`slider` functions", { + expect_no_error( + epi_slide_opt( + grouped, col_names = value, f = data.table::frollmean, + before = 1L, after = 0L, ref_time_values = d + 1 + ) + ) + expect_no_error( + epi_slide_opt( + grouped, col_names = value, f = slider::slide_min, + before = 1L, after = 0L, ref_time_values = d + 1 + ) + ) + + reexport_frollmean <- data.table::frollmean + expect_no_error( + epi_slide_opt( + grouped, col_names = value, f = reexport_frollmean, + before = 1L, after = 0L, ref_time_values = d + 1 + ) + ) + + expect_error( + epi_slide_opt( + grouped, col_names = value, f = mean, + before = 1L, after = 0L, ref_time_values = d + 1 + ), + class = "epiprocess__epi_slide_opt__unsupported_slide_function" + ) +}) From fd5f3ca2b37a39a93e8335e562b2135395b34f75 Mon Sep 17 00:00:00 2001 From: nmdefries Date: Wed, 27 Mar 2024 17:39:21 +0000 Subject: [PATCH 233/345] style: styler (GHA) --- tests/testthat/test-epi_slide.R | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/tests/testthat/test-epi_slide.R b/tests/testthat/test-epi_slide.R index 3cfebd91..6d66e0c4 100644 --- a/tests/testthat/test-epi_slide.R +++ b/tests/testthat/test-epi_slide.R @@ -1395,8 +1395,10 @@ test_that("epi_slide_mean produces same output as epi_slide_opt", { ) expect_identical(result1, result2) - result3 <- epi_slide_opt(small_x, value, f = slider::slide_mean, - before = 50, names_sep = NULL, na_rm = TRUE) + result3 <- epi_slide_opt(small_x, value, + f = slider::slide_mean, + before = 50, names_sep = NULL, na_rm = TRUE + ) expect_equal(result1, result3) }) @@ -1408,21 +1410,25 @@ test_that("epi_slide_sum produces same output as epi_slide_opt", { ) expect_identical(result1, result2) - result3 <- epi_slide_opt(small_x, value, f = slider::slide_sum, - before = 50, names_sep = NULL, na_rm = TRUE) + result3 <- epi_slide_opt(small_x, value, + f = slider::slide_sum, + before = 50, names_sep = NULL, na_rm = TRUE + ) expect_equal(result1, result3) }) test_that("`epi_slide_opt` errors when passed non-`data.table`, non-`slider` functions", { expect_no_error( epi_slide_opt( - grouped, col_names = value, f = data.table::frollmean, + 
grouped, + col_names = value, f = data.table::frollmean, before = 1L, after = 0L, ref_time_values = d + 1 ) ) expect_no_error( epi_slide_opt( - grouped, col_names = value, f = slider::slide_min, + grouped, + col_names = value, f = slider::slide_min, before = 1L, after = 0L, ref_time_values = d + 1 ) ) @@ -1430,14 +1436,16 @@ test_that("`epi_slide_opt` errors when passed non-`data.table`, non-`slider` fun reexport_frollmean <- data.table::frollmean expect_no_error( epi_slide_opt( - grouped, col_names = value, f = reexport_frollmean, + grouped, + col_names = value, f = reexport_frollmean, before = 1L, after = 0L, ref_time_values = d + 1 ) ) expect_error( epi_slide_opt( - grouped, col_names = value, f = mean, + grouped, + col_names = value, f = mean, before = 1L, after = 0L, ref_time_values = d + 1 ), class = "epiprocess__epi_slide_opt__unsupported_slide_function" From 06deb57e760397ccdf1630fe712509fd0324a83b Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 4 Apr 2024 14:00:49 -0400 Subject: [PATCH 234/345] share bulk of params between slide fns --- R/slide.R | 64 +++++++++---------------------------------- man/epi_slide.Rd | 6 ++-- man/epi_slide_mean.Rd | 14 ++++------ man/epi_slide_opt.Rd | 24 ++++++++++------ man/epi_slide_sum.Rd | 14 ++++------ 5 files changed, 43 insertions(+), 79 deletions(-) diff --git a/R/slide.R b/R/slide.R index 2e07d502..40d32099 100644 --- a/R/slide.R +++ b/R/slide.R @@ -51,7 +51,7 @@ #' @param time_step Optional function used to define the meaning of one time #' step, which if specified, overrides the default choice based on the #' `time_value` column. This function must take a non-negative integer and -#' return an object of class `lubridate::period`. For example, we can use +#' return an object of class [lubridate::period]. For example, we can use #' `time_step = lubridate::hours` in order to set the time step to be one hour #' (this would only be meaningful if `time_value` is of class `POSIXct`). #' @param new_col_name String indicating the name of the new column that will @@ -76,8 +76,8 @@ #' the missing marker is a `NULL` entry in the list column; for certain #' operations, you might want to replace these `NULL` entries with a different #' `NA` marker. -#' @return An `epi_df` object given by appending a new column to `x`, named -#' according to the `new_col_name` argument. +#' @return An `epi_df` object given by appending one or more new columns to +#' `x`, named according to the `new_col_name` argument. #' #' @details To "slide" means to apply a function or formula over a rolling #' window of time steps for each data group, where the window is entered at a @@ -386,13 +386,11 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' Optimized slide function for performing common rolling computations on an `epi_df` object #' -#' Slides an n-timestep mean over variables in an `epi_df` object. See the [slide -#' vignette](https://cmu-delphi.github.io/epiprocess/articles/slide.html) for +#' Slides an n-timestep [data.table::froll] or [slider::summary-slide] function +#' over variables in an `epi_df` object. See the [slide vignette] +#' (https://cmu-delphi.github.io/epiprocess/articles/slide.html) for #' examples. #' -#' @param x The `epi_df` object under consideration, [grouped][dplyr::group_by] -#' or ungrouped. If ungrouped, all data in `x` will be treated as part of a -#' single data group. 
#' @param col_names A single tidyselection or a tidyselection vector of the #' names of one or more columns for which to calculate the rolling mean. #' @param f Function; together with `...` specifies the computation to slide. @@ -418,50 +416,14 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' these args via `...` will cause an error. If `f` is a `slider` function, #' it is automatically passed the data `x` to operate on, and number of #' points `before` and `after` to use in the computation. -#' @param before,after How far `before` and `after` each `ref_time_value` should -#' the sliding window extend? At least one of these two arguments must be -#' provided; the other's default will be 0. Any value provided for either -#' argument must be a single, non-`NA`, non-negative, -#' [integer-compatible][vctrs::vec_cast] number of time steps. Endpoints of -#' the window are inclusive. Common settings: -#' * For trailing/right-aligned windows from `ref_time_value - time_step -#' (k)` to `ref_time_value`: either pass `before=k` by itself, or pass -#' `before=k, after=0`. -#' * For center-aligned windows from `ref_time_value - time_step(k)` to -#' `ref_time_value + time_step(k)`: pass `before=k, after=k`. -#' * For leading/left-aligned windows from `ref_time_value` to -#' `ref_time_value + time_step(k)`: either pass pass `after=k` by itself, -#' or pass `before=0, after=k`. -#' See "Details:" about the definition of a time step,(non)treatment of -#' missing rows within the window, and avoiding warnings about -#' `before`&`after` settings for a certain uncommon use case. -#' @param ref_time_values Time values for sliding computations, meaning, each -#' element of this vector serves as the reference time point for one sliding -#' window. If missing, then this will be set to all unique time values in the -#' underlying data table, by default. -#' @param time_step Optional function used to define the meaning of one time -#' step, which if specified, overrides the default choice based on the -#' `time_value` column. This function must take a non-negative integer and -#' return an object of class [lubridate::period]. For example, we can use -#' `time_step = lubridate::hours` in order to set the time step to be one hour -#' (this would only be meaningful if `time_value` is of class `POSIXct`). -#' @param new_col_names String indicating the name of the new column that will -#' contain the derivative values. Default is "slide_value"; note that setting -#' `new_col_names` equal to an existing column name will overwrite this column. #' @param as_list_col Not supported. Included to match `epi_slide` interface. -#' @param names_sep String specifying the separator to use in `tidyr::unnest()` -#' when `as_list_col = FALSE`. Default is "_". Using `NULL` drops the prefix -#' from `new_col_names` entirely. -#' @param all_rows If `all_rows = TRUE`, then all rows of `x` will be kept in -#' the output even with `ref_time_values` provided, with some type of missing -#' value marker for the slide computation output column(s) for `time_value`s -#' outside `ref_time_values`; otherwise, there will be one row for each row in -#' `x` that had a `time_value` in `ref_time_values`. Default is `FALSE`. The -#' missing value marker is the result of `vctrs::vec_cast`ing `NA` to the type -#' of the slide computation output. -#' @return An `epi_df` object given by appending one or more new columns to -#' `x`, depending on the `col_names` argument, named according to the -#' `new_col_names` argument. 
+#' @param new_col_name Character vector indicating the name(s) of the new +#' column(s) that will contain the derivative values. Default +#' is "slide_value"; note that setting `new_col_name` equal to any existing +#' column names will overwrite those columns. If `names_sep` is `NULL`, +#' `new_col_name` must be the same length as `col_names`. +#' @inheritParams epi_slide +#' @inherit epi_slide return #' #' @details To "slide" means to apply a function or formula over a rolling #' window of time steps for each data group, where the window is entered at a diff --git a/man/epi_slide.Rd b/man/epi_slide.Rd index 2fe1dce6..0745a194 100644 --- a/man/epi_slide.Rd +++ b/man/epi_slide.Rd @@ -72,7 +72,7 @@ underlying data table, by default.} \item{time_step}{Optional function used to define the meaning of one time step, which if specified, overrides the default choice based on the \code{time_value} column. This function must take a non-negative integer and -return an object of class \code{lubridate::period}. For example, we can use +return an object of class \link[lubridate:period]{lubridate::period}. For example, we can use \code{time_step = lubridate::hours} in order to set the time step to be one hour (this would only be meaningful if \code{time_value} is of class \code{POSIXct}).} @@ -103,8 +103,8 @@ operations, you might want to replace these \code{NULL} entries with a different \code{NA} marker.} } \value{ -An \code{epi_df} object given by appending a new column to \code{x}, named -according to the \code{new_col_name} argument. +An \code{epi_df} object given by appending one or more new columns to +\code{x}, named according to the \code{new_col_name} argument. } \description{ Slides a given function over variables in an \code{epi_df} object. See the \href{https://cmu-delphi.github.io/epiprocess/articles/slide.html}{slide vignette} for diff --git a/man/epi_slide_mean.Rd b/man/epi_slide_mean.Rd index 19b6fcec..92975f09 100644 --- a/man/epi_slide_mean.Rd +++ b/man/epi_slide_mean.Rd @@ -62,15 +62,11 @@ return an object of class \link[lubridate:period]{lubridate::period}. For exampl \code{time_step = lubridate::hours} in order to set the time step to be one hour (this would only be meaningful if \code{time_value} is of class \code{POSIXct}).} -\item{new_col_names}{String indicating the name of the new column that will -contain the derivative values. Default is "slide_value"; note that setting -\code{new_col_names} equal to an existing column name will overwrite this column.} - \item{as_list_col}{Not supported. Included to match \code{epi_slide} interface.} \item{names_sep}{String specifying the separator to use in \code{tidyr::unnest()} when \code{as_list_col = FALSE}. Default is "_". Using \code{NULL} drops the prefix -from \code{new_col_names} entirely.} +from \code{new_col_name} entirely.} \item{all_rows}{If \code{all_rows = TRUE}, then all rows of \code{x} will be kept in the output even with \code{ref_time_values} provided, with some type of missing @@ -78,12 +74,14 @@ value marker for the slide computation output column(s) for \code{time_value}s outside \code{ref_time_values}; otherwise, there will be one row for each row in \code{x} that had a \code{time_value} in \code{ref_time_values}. Default is \code{FALSE}. The missing value marker is the result of \code{vctrs::vec_cast}ing \code{NA} to the type -of the slide computation output.} +of the slide computation output. 
If using \code{as_list_col = TRUE}, note that
+the missing marker is a \code{NULL} entry in the list column; for certain
+operations, you might want to replace these \code{NULL} entries with a different
+\code{NA} marker.}
 }
 \value{
 An \code{epi_df} object given by appending one or more new columns to
-\code{x}, depending on the \code{col_names} argument, named according to the
-\code{new_col_names} argument.
+\code{x}, named according to the \code{new_col_name} argument.
 }
 \description{
 Slides an n-timestep mean over variables in an \code{epi_df} object. See the \href{https://cmu-delphi.github.io/epiprocess/articles/slide.html}{slide vignette} for
diff --git a/man/epi_slide_opt.Rd b/man/epi_slide_opt.Rd
index 9c9cf6c4..ed40ecad 100644
--- a/man/epi_slide_opt.Rd
+++ b/man/epi_slide_opt.Rd
@@ -83,15 +83,11 @@ return an object of class \link[lubridate:period]{lubridate::period}. For exampl
 \code{time_step = lubridate::hours} in order to set the time step to be one hour
 (this would only be meaningful if \code{time_value} is of class \code{POSIXct}).}
 
-\item{new_col_names}{String indicating the name of the new column that will
-contain the derivative values. Default is "slide_value"; note that setting
-\code{new_col_names} equal to an existing column name will overwrite this column.}
-
 \item{as_list_col}{Not supported. Included to match \code{epi_slide} interface.}
 
 \item{names_sep}{String specifying the separator to use in \code{tidyr::unnest()}
 when \code{as_list_col = FALSE}. Default is "_". Using \code{NULL} drops the prefix
-from \code{new_col_names} entirely.}
+from \code{new_col_name} entirely.}
 
 \item{all_rows}{If \code{all_rows = TRUE}, then all rows of \code{x} will be kept in
 the output even with \code{ref_time_values} provided, with some type of missing
@@ -99,15 +95,25 @@ value marker for the slide computation output column(s) for \code{time_value}s
 outside \code{ref_time_values}; otherwise, there will be one row for each row in
 \code{x} that had a \code{time_value} in \code{ref_time_values}. Default is \code{FALSE}. The
 missing value marker is the result of \code{vctrs::vec_cast}ing \code{NA} to the type
-of the slide computation output.}
+of the slide computation output. If using \code{as_list_col = TRUE}, note that
+the missing marker is a \code{NULL} entry in the list column; for certain
+operations, you might want to replace these \code{NULL} entries with a different
+\code{NA} marker.}
+
+\item{new_col_name}{Character vector indicating the name(s) of the new
+column(s) that will contain the derivative values. Default
+is "slide_value"; note that setting \code{new_col_name} equal to any existing
+column names will overwrite those columns. If \code{names_sep} is \code{NULL},
+\code{new_col_name} must be the same length as \code{col_names}.}
 }
 \value{
 An \code{epi_df} object given by appending one or more new columns to
-\code{x}, depending on the \code{col_names} argument, named according to the
-\code{new_col_names} argument.
+\code{x}, named according to the \code{new_col_name} argument.
 }
 \description{
-Slides an n-timestep mean over variables in an \code{epi_df} object. See the \href{https://cmu-delphi.github.io/epiprocess/articles/slide.html}{slide vignette} for
+Slides an n-timestep \link[data.table:froll]{data.table::froll} or \link[slider:summary-slide]{slider::summary-slide} function
+over variables in an \code{epi_df} object. See the \href{https://cmu-delphi.github.io/epiprocess/articles/slide.html}{slide vignette} for
 examples.
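
# --- Editorial sketch, not part of the patch series: the length rule stated
# in the renamed `new_col_name` docs. With `names_sep = NULL` the prefix is
# dropped, so one output name per selected column must be supplied. Assumes
# the package's `jhu_csse_daily_subset` data with columns `cases` and
# `cases_7d_av`. ---
library(dplyr)
library(epiprocess)
jhu_csse_daily_subset %>%
  group_by(geo_value) %>%
  epi_slide_opt(
    c(cases, cases_7d_av),
    f = data.table::frollmean,
    new_col_name = c("cases_14dav", "cases_7d_av_14dav"), # same length as col_names
    names_sep = NULL, before = 13
  ) %>%
  ungroup()
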
} \details{ diff --git a/man/epi_slide_sum.Rd b/man/epi_slide_sum.Rd index 001bafdf..920023fe 100644 --- a/man/epi_slide_sum.Rd +++ b/man/epi_slide_sum.Rd @@ -62,15 +62,11 @@ return an object of class \link[lubridate:period]{lubridate::period}. For exampl \code{time_step = lubridate::hours} in order to set the time step to be one hour (this would only be meaningful if \code{time_value} is of class \code{POSIXct}).} -\item{new_col_names}{String indicating the name of the new column that will -contain the derivative values. Default is "slide_value"; note that setting -\code{new_col_names} equal to an existing column name will overwrite this column.} - \item{as_list_col}{Not supported. Included to match \code{epi_slide} interface.} \item{names_sep}{String specifying the separator to use in \code{tidyr::unnest()} when \code{as_list_col = FALSE}. Default is "_". Using \code{NULL} drops the prefix -from \code{new_col_names} entirely.} +from \code{new_col_name} entirely.} \item{all_rows}{If \code{all_rows = TRUE}, then all rows of \code{x} will be kept in the output even with \code{ref_time_values} provided, with some type of missing @@ -78,12 +74,14 @@ value marker for the slide computation output column(s) for \code{time_value}s outside \code{ref_time_values}; otherwise, there will be one row for each row in \code{x} that had a \code{time_value} in \code{ref_time_values}. Default is \code{FALSE}. The missing value marker is the result of \code{vctrs::vec_cast}ing \code{NA} to the type -of the slide computation output.} +of the slide computation output. If using \code{as_list_col = TRUE}, note that +the missing marker is a \code{NULL} entry in the list column; for certain +operations, you might want to replace these \code{NULL} entries with a different +\code{NA} marker.} } \value{ An \code{epi_df} object given by appending one or more new columns to -\code{x}, depending on the \code{col_names} argument, named according to the -\code{new_col_names} argument. +\code{x}, named according to the \code{new_col_name} argument. } \description{ Slides an n-timestep mean over variables in an \code{epi_df} object. See the \href{https://cmu-delphi.github.io/epiprocess/articles/slide.html}{slide vignette} for From 8e1dfb6f45f4919efb3c1ec6ef3f62d77294e689 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 4 Apr 2024 14:04:07 -0400 Subject: [PATCH 235/345] rename new_col_names to new_col_name to match epi_slide --- R/slide.R | 46 +++++++++++++++++++++---------------------- man/epi_slide_mean.Rd | 18 +++++++++++------ man/epi_slide_opt.Rd | 22 ++++++++++----------- man/epi_slide_sum.Rd | 10 ++++++++-- 4 files changed, 54 insertions(+), 42 deletions(-) diff --git a/R/slide.R b/R/slide.R index 40d32099..bb99d785 100644 --- a/R/slide.R +++ b/R/slide.R @@ -469,7 +469,7 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' group_by(geo_value) %>% #' epi_slide_opt( #' cases, -#' f = data.table::frollmean, new_col_names = "cases_7dav", names_sep = NULL, before = 6 +#' f = data.table::frollmean, new_col_name = "cases_7dav", names_sep = NULL, before = 6 #' ) %>% #' # Remove a nonessential var. 
to ensure new col is printed #' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% @@ -481,7 +481,7 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' group_by(geo_value) %>% #' epi_slide_opt(cases, #' f = data.table::frollmean, -#' new_col_names = "cases_7dav", names_sep = NULL, before = 6, +#' new_col_name = "cases_7dav", names_sep = NULL, before = 6, #' # `frollmean` options #' na.rm = TRUE, algo = "exact", hasNA = TRUE #' ) %>% @@ -493,7 +493,7 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' group_by(geo_value) %>% #' epi_slide_opt( #' cases, -#' f = slider::slide_mean, new_col_names = "cases_7dav", names_sep = NULL, after = 6 +#' f = slider::slide_mean, new_col_name = "cases_7dav", names_sep = NULL, after = 6 #' ) %>% #' # Remove a nonessential var. to ensure new col is printed #' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% @@ -504,14 +504,14 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' group_by(geo_value) %>% #' epi_slide_opt( #' cases, -#' f = data.table::frollsum, new_col_names = "cases_7dav", names_sep = NULL, before = 3, after = 3 +#' f = data.table::frollsum, new_col_name = "cases_7dav", names_sep = NULL, before = 3, after = 3 #' ) %>% #' # Remove a nonessential var. to ensure new col is printed #' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% #' ungroup() epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values, time_step, - new_col_names = "slide_value", as_list_col = NULL, + new_col_name = "slide_value", as_list_col = NULL, names_sep = "_", all_rows = FALSE) { assert_class(x, "epi_df") @@ -640,28 +640,28 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values, # If single column name, do nothing. if (is.null(names_sep)) { - if (length(new_col_names) != length(col_names_chr)) { + if (length(new_col_name) != length(col_names_chr)) { cli_abort( c( - "`new_col_names` must be the same length as `col_names` when + "`new_col_name` must be the same length as `col_names` when `names_sep` is NULL to avoid duplicate output column names." ), class = "epiprocess__epi_slide_mean__col_names_length_mismatch", - epiprocess__new_col_names = new_col_names, + epiprocess__new_col_name = new_col_name, epiprocess__col_names = col_names_chr ) } - result_col_names <- new_col_names + result_col_names <- new_col_name } else { - if (length(new_col_names) != 1L && length(new_col_names) != length(col_names_chr)) { + if (length(new_col_name) != 1L && length(new_col_name) != length(col_names_chr)) { cli_abort( - "`new_col_names` must be either length 1 or the same length as `col_names`.", + "`new_col_name` must be either length 1 or the same length as `col_names`.", class = "epiprocess__epi_slide_mean__col_names_length_mismatch_and_not_one", - epiprocess__new_col_names = new_col_names, + epiprocess__new_col_name = new_col_name, epiprocess__col_names = col_names_chr ) } - result_col_names <- paste(new_col_names, col_names_chr, sep = names_sep) + result_col_names <- paste(new_col_name, col_names_chr, sep = names_sep) } slide_one_grp <- function(.data_group, .group_key, ...) 
{ @@ -778,7 +778,7 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values, #' # slide a 7-day trailing average formula on cases #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide_mean(cases, new_col_names = "cases_7dav", names_sep = NULL, before = 6) %>% +#' epi_slide_mean(cases, new_col_name = "cases_7dav", names_sep = NULL, before = 6) %>% #' # Remove a nonessential var. to ensure new col is printed #' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% #' ungroup() @@ -788,7 +788,7 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values, #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% #' epi_slide_mean(cases, -#' new_col_names = "cases_7dav", names_sep = NULL, before = 6, +#' new_col_name = "cases_7dav", names_sep = NULL, before = 6, #' # `frollmean` options #' na.rm = TRUE, algo = "exact", hasNA = TRUE #' ) %>% @@ -798,7 +798,7 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values, #' # slide a 7-day leading average #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide_mean(cases, new_col_names = "cases_7dav", names_sep = NULL, after = 6) %>% +#' epi_slide_mean(cases, new_col_name = "cases_7dav", names_sep = NULL, after = 6) %>% #' # Remove a nonessential var. to ensure new col is printed #' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% #' ungroup() @@ -806,7 +806,7 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values, #' # slide a 7-day centre-aligned average #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide_mean(cases, new_col_names = "cases_7dav", names_sep = NULL, before = 3, after = 3) %>% +#' epi_slide_mean(cases, new_col_name = "cases_7dav", names_sep = NULL, before = 3, after = 3) %>% #' # Remove a nonessential var. to ensure new col is printed #' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% #' ungroup() @@ -814,13 +814,13 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values, #' # slide a 14-day centre-aligned average #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide_mean(cases, new_col_names = "cases_14dav", names_sep = NULL, before = 6, after = 7) %>% +#' epi_slide_mean(cases, new_col_name = "cases_14dav", names_sep = NULL, before = 6, after = 7) %>% #' # Remove a nonessential var. to ensure new col is printed #' dplyr::select(geo_value, time_value, cases, cases_14dav) %>% #' ungroup() epi_slide_mean <- function(x, col_names, ..., before, after, ref_time_values, time_step, - new_col_names = "slide_value", as_list_col = NULL, + new_col_name = "slide_value", as_list_col = NULL, names_sep = "_", all_rows = FALSE) { epi_slide_opt( x = x, @@ -831,7 +831,7 @@ epi_slide_mean <- function(x, col_names, ..., before, after, ref_time_values, after = after, ref_time_values = ref_time_values, time_step = time_step, - new_col_names = new_col_names, + new_col_name = new_col_name, as_list_col = as_list_col, names_sep = names_sep, all_rows = all_rows @@ -860,13 +860,13 @@ epi_slide_mean <- function(x, col_names, ..., before, after, ref_time_values, #' # slide a 7-day trailing sum formula on cases #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide_sum(cases, new_col_names = "cases_7dsum", names_sep = NULL, before = 6) %>% +#' epi_slide_sum(cases, new_col_name = "cases_7dsum", names_sep = NULL, before = 6) %>% #' # Remove a nonessential var. 
to ensure new col is printed #' dplyr::select(geo_value, time_value, cases, cases_7dsum) %>% #' ungroup() epi_slide_sum <- function(x, col_names, ..., before, after, ref_time_values, time_step, - new_col_names = "slide_value", as_list_col = NULL, + new_col_name = "slide_value", as_list_col = NULL, names_sep = "_", all_rows = FALSE) { epi_slide_opt( x = x, @@ -877,7 +877,7 @@ epi_slide_sum <- function(x, col_names, ..., before, after, ref_time_values, after = after, ref_time_values = ref_time_values, time_step = time_step, - new_col_names = new_col_names, + new_col_name = new_col_name, as_list_col = as_list_col, names_sep = names_sep, all_rows = all_rows diff --git a/man/epi_slide_mean.Rd b/man/epi_slide_mean.Rd index 92975f09..5ce5783a 100644 --- a/man/epi_slide_mean.Rd +++ b/man/epi_slide_mean.Rd @@ -12,7 +12,7 @@ epi_slide_mean( after, ref_time_values, time_step, - new_col_names = "slide_value", + new_col_name = "slide_value", as_list_col = NULL, names_sep = "_", all_rows = FALSE @@ -62,6 +62,12 @@ return an object of class \link[lubridate:period]{lubridate::period}. For exampl \code{time_step = lubridate::hours} in order to set the time step to be one hour (this would only be meaningful if \code{time_value} is of class \code{POSIXct}).} +\item{new_col_name}{Character vector indicating the name(s) of the new +column(s) that will contain the derivative values. Default +is "slide_value"; note that setting \code{new_col_name} equal to any existing +column names will overwrite those columns. If \code{names_sep} is \code{NULL}, +\code{new_col_name} must be the same length as \code{col_names}.} + \item{as_list_col}{Not supported. Included to match \code{epi_slide} interface.} \item{names_sep}{String specifying the separator to use in \code{tidyr::unnest()} @@ -94,7 +100,7 @@ Wrapper around \code{epi_slide_opt} with \code{f = datatable::frollmean}. # slide a 7-day trailing average formula on cases jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide_mean(cases, new_col_names = "cases_7dav", names_sep = NULL, before = 6) \%>\% + epi_slide_mean(cases, new_col_name = "cases_7dav", names_sep = NULL, before = 6) \%>\% # Remove a nonessential var. to ensure new col is printed dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% ungroup() @@ -104,7 +110,7 @@ jhu_csse_daily_subset \%>\% jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% epi_slide_mean(cases, - new_col_names = "cases_7dav", names_sep = NULL, before = 6, + new_col_name = "cases_7dav", names_sep = NULL, before = 6, # `frollmean` options na.rm = TRUE, algo = "exact", hasNA = TRUE ) \%>\% @@ -114,7 +120,7 @@ jhu_csse_daily_subset \%>\% # slide a 7-day leading average jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide_mean(cases, new_col_names = "cases_7dav", names_sep = NULL, after = 6) \%>\% + epi_slide_mean(cases, new_col_name = "cases_7dav", names_sep = NULL, after = 6) \%>\% # Remove a nonessential var. to ensure new col is printed dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% ungroup() @@ -122,7 +128,7 @@ jhu_csse_daily_subset \%>\% # slide a 7-day centre-aligned average jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide_mean(cases, new_col_names = "cases_7dav", names_sep = NULL, before = 3, after = 3) \%>\% + epi_slide_mean(cases, new_col_name = "cases_7dav", names_sep = NULL, before = 3, after = 3) \%>\% # Remove a nonessential var. 
to ensure new col is printed dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% ungroup() @@ -130,7 +136,7 @@ jhu_csse_daily_subset \%>\% # slide a 14-day centre-aligned average jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide_mean(cases, new_col_names = "cases_14dav", names_sep = NULL, before = 6, after = 7) \%>\% + epi_slide_mean(cases, new_col_name = "cases_14dav", names_sep = NULL, before = 6, after = 7) \%>\% # Remove a nonessential var. to ensure new col is printed dplyr::select(geo_value, time_value, cases, cases_14dav) \%>\% ungroup() diff --git a/man/epi_slide_opt.Rd b/man/epi_slide_opt.Rd index ed40ecad..75ea8f7a 100644 --- a/man/epi_slide_opt.Rd +++ b/man/epi_slide_opt.Rd @@ -13,7 +13,7 @@ epi_slide_opt( after, ref_time_values, time_step, - new_col_names = "slide_value", + new_col_name = "slide_value", as_list_col = NULL, names_sep = "_", all_rows = FALSE @@ -83,6 +83,12 @@ return an object of class \link[lubridate:period]{lubridate::period}. For exampl \code{time_step = lubridate::hours} in order to set the time step to be one hour (this would only be meaningful if \code{time_value} is of class \code{POSIXct}).} +\item{new_col_name}{Character vector indicating the name(s) of the new +column(s) that will contain the derivative values. Default +is "slide_value"; note that setting \code{new_col_name} equal to any existing +column names will overwrite those columns. If \code{names_sep} is \code{NULL}, +\code{new_col_name} must be the same length as \code{col_names}.} + \item{as_list_col}{Not supported. Included to match \code{epi_slide} interface.} \item{names_sep}{String specifying the separator to use in \code{tidyr::unnest()} @@ -99,12 +105,6 @@ of the slide computation output. If using \code{as_list_col = TRUE}, note that the missing marker is a \code{NULL} entry in the list column; for certain operations, you might want to replace these \code{NULL} entries with a different \code{NA} marker.} - -\item{new_col_name}{Character vector indicating the name(s) of the new -column(s) that will contain the derivative values. Default -is "slide_value"; note that setting \code{new_col_name} equal to any existing -column names will overwrite those columns. If \code{names_sep} is \code{NULL}, -\code{new_col_name} must be the same length as \code{col_names}.} } \value{ An \code{epi_df} object given by appending one or more new columns to @@ -152,7 +152,7 @@ jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% epi_slide_opt( cases, - f = data.table::frollmean, new_col_names = "cases_7dav", names_sep = NULL, before = 6 + f = data.table::frollmean, new_col_name = "cases_7dav", names_sep = NULL, before = 6 ) \%>\% # Remove a nonessential var. to ensure new col is printed dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% @@ -164,7 +164,7 @@ jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% epi_slide_opt(cases, f = data.table::frollmean, - new_col_names = "cases_7dav", names_sep = NULL, before = 6, + new_col_name = "cases_7dav", names_sep = NULL, before = 6, # `frollmean` options na.rm = TRUE, algo = "exact", hasNA = TRUE ) \%>\% @@ -176,7 +176,7 @@ jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% epi_slide_opt( cases, - f = slider::slide_mean, new_col_names = "cases_7dav", names_sep = NULL, after = 6 + f = slider::slide_mean, new_col_name = "cases_7dav", names_sep = NULL, after = 6 ) \%>\% # Remove a nonessential var. 
to ensure new col is printed dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% @@ -187,7 +187,7 @@ jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% epi_slide_opt( cases, - f = data.table::frollsum, new_col_names = "cases_7dav", names_sep = NULL, before = 3, after = 3 + f = data.table::frollsum, new_col_name = "cases_7dav", names_sep = NULL, before = 3, after = 3 ) \%>\% # Remove a nonessential var. to ensure new col is printed dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% diff --git a/man/epi_slide_sum.Rd b/man/epi_slide_sum.Rd index 920023fe..b0f76960 100644 --- a/man/epi_slide_sum.Rd +++ b/man/epi_slide_sum.Rd @@ -12,7 +12,7 @@ epi_slide_sum( after, ref_time_values, time_step, - new_col_names = "slide_value", + new_col_name = "slide_value", as_list_col = NULL, names_sep = "_", all_rows = FALSE @@ -62,6 +62,12 @@ return an object of class \link[lubridate:period]{lubridate::period}. For exampl \code{time_step = lubridate::hours} in order to set the time step to be one hour (this would only be meaningful if \code{time_value} is of class \code{POSIXct}).} +\item{new_col_name}{Character vector indicating the name(s) of the new +column(s) that will contain the derivative values. Default +is "slide_value"; note that setting \code{new_col_name} equal to any existing +column names will overwrite those columns. If \code{names_sep} is \code{NULL}, +\code{new_col_name} must be the same length as \code{col_names}.} + \item{as_list_col}{Not supported. Included to match \code{epi_slide} interface.} \item{names_sep}{String specifying the separator to use in \code{tidyr::unnest()} @@ -94,7 +100,7 @@ Wrapper around \code{epi_slide_opt} with \code{f = datatable::frollsum}. # slide a 7-day trailing sum formula on cases jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide_sum(cases, new_col_names = "cases_7dsum", names_sep = NULL, before = 6) \%>\% + epi_slide_sum(cases, new_col_name = "cases_7dsum", names_sep = NULL, before = 6) \%>\% # Remove a nonessential var. to ensure new col is printed dplyr::select(geo_value, time_value, cases, cases_7dsum) \%>\% ungroup() From d20e351a7d0628e8a9c829432b15fd1ed622ea1f Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 4 Apr 2024 16:56:16 -0400 Subject: [PATCH 236/345] use templates for epi_slide_opt derivatives reorder template and remaining param calls for clarity --- R/slide.R | 103 ++++--------------------------- man-roxygen/basic-slide-params.R | 45 ++++++++++++++ man-roxygen/opt-slide-details.R | 25 ++++++++ man-roxygen/opt-slide-params.R | 8 +++ man/epi_slide.Rd | 2 +- man/epi_slide_mean.Rd | 25 ++++++++ man/epi_slide_opt.Rd | 32 +++++----- man/epi_slide_sum.Rd | 25 ++++++++ 8 files changed, 156 insertions(+), 109 deletions(-) create mode 100644 man-roxygen/basic-slide-params.R create mode 100644 man-roxygen/opt-slide-details.R create mode 100644 man-roxygen/opt-slide-params.R diff --git a/R/slide.R b/R/slide.R index bb99d785..783a6097 100644 --- a/R/slide.R +++ b/R/slide.R @@ -4,9 +4,7 @@ #' vignette](https://cmu-delphi.github.io/epiprocess/articles/slide.html) for #' examples. #' -#' @param x The `epi_df` object under consideration, [grouped][dplyr::group_by] -#' or ungrouped. If ungrouped, all data in `x` will be treated as part of a -#' single data group. +#' @template basic-slide-params #' @param f Function, formula, or missing; together with `...` specifies the #' computation to slide. To "slide" means to apply a computation within a #' sliding (a.k.a. 
"rolling") time window for each data group. The window is @@ -27,33 +25,6 @@ #' directly by name, the expression has access to `.data` and `.env` pronouns #' as in `dplyr` verbs, and can also refer to `.x`, `.group_key`, and #' `.ref_time_value`. See details. -#' @param before,after How far `before` and `after` each `ref_time_value` should -#' the sliding window extend? At least one of these two arguments must be -#' provided; the other's default will be 0. Any value provided for either -#' argument must be a single, non-`NA`, non-negative, -#' [integer-compatible][vctrs::vec_cast] number of time steps. Endpoints of -#' the window are inclusive. Common settings: -#' * For trailing/right-aligned windows from `ref_time_value - time_step -#' (k)` to `ref_time_value`: either pass `before=k` by itself, or pass -#' `before=k, after=0`. -#' * For center-aligned windows from `ref_time_value - time_step(k)` to -#' `ref_time_value + time_step(k)`: pass `before=k, after=k`. -#' * For leading/left-aligned windows from `ref_time_value` to -#' `ref_time_value + time_step(k)`: either pass pass `after=k` by itself, -#' or pass `before=0, after=k`. -#' See "Details:" about the definition of a time step,(non)treatment of -#' missing rows within the window, and avoiding warnings about -#' `before`&`after` settings for a certain uncommon use case. -#' @param ref_time_values Time values for sliding computations, meaning, each -#' element of this vector serves as the reference time point for one sliding -#' window. If missing, then this will be set to all unique time values in the -#' underlying data table, by default. -#' @param time_step Optional function used to define the meaning of one time -#' step, which if specified, overrides the default choice based on the -#' `time_value` column. This function must take a non-negative integer and -#' return an object of class [lubridate::period]. For example, we can use -#' `time_step = lubridate::hours` in order to set the time step to be one hour -#' (this would only be meaningful if `time_value` is of class `POSIXct`). #' @param new_col_name String indicating the name of the new column that will #' contain the derivative values. Default is "slide_value"; note that setting #' `new_col_name` equal to an existing column name will overwrite this column. @@ -63,24 +34,9 @@ #' [`tidyr::unnest()`]), and, if the slide computations output data frames, #' the names of the resulting columns are given by prepending `new_col_name` #' to the names of the list elements. -#' @param names_sep String specifying the separator to use in `tidyr::unnest()` -#' when `as_list_col = FALSE`. Default is "_". Using `NULL` drops the prefix -#' from `new_col_name` entirely. -#' @param all_rows If `all_rows = TRUE`, then all rows of `x` will be kept in -#' the output even with `ref_time_values` provided, with some type of missing -#' value marker for the slide computation output column(s) for `time_value`s -#' outside `ref_time_values`; otherwise, there will be one row for each row in -#' `x` that had a `time_value` in `ref_time_values`. Default is `FALSE`. The -#' missing value marker is the result of `vctrs::vec_cast`ing `NA` to the type -#' of the slide computation output. If using `as_list_col = TRUE`, note that -#' the missing marker is a `NULL` entry in the list column; for certain -#' operations, you might want to replace these `NULL` entries with a different -#' `NA` marker. 
-#' @return An `epi_df` object given by appending one or more new columns to -#' `x`, named according to the `new_col_name` argument. #' #' @details To "slide" means to apply a function or formula over a rolling -#' window of time steps for each data group, where the window is entered at a +#' window of time steps for each data group, where the window is centered at a #' reference time and left and right endpoints are given by the `before` and #' `after` arguments. The unit (the meaning of one time step) is implicitly #' defined by the way the `time_value` column treats addition and subtraction; @@ -391,8 +347,8 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' (https://cmu-delphi.github.io/epiprocess/articles/slide.html) for #' examples. #' -#' @param col_names A single tidyselection or a tidyselection vector of the -#' names of one or more columns for which to calculate the rolling mean. +#' @template basic-slide-params +#' @template opt-slide-params #' @param f Function; together with `...` specifies the computation to slide. #' `f` must be one of `data.table`'s rolling functions #' (`frollmean`, `frollsum`, `frollapply`. See [data.table::roll]) or one @@ -416,43 +372,8 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' these args via `...` will cause an error. If `f` is a `slider` function, #' it is automatically passed the data `x` to operate on, and number of #' points `before` and `after` to use in the computation. -#' @param as_list_col Not supported. Included to match `epi_slide` interface. -#' @param new_col_name Character vector indicating the name(s) of the new -#' column(s) that will contain the derivative values. Default -#' is "slide_value"; note that setting `new_col_name` equal to any existing -#' column names will overwrite those columns. If `names_sep` is `NULL`, -#' `new_col_name` must be the same length as `col_names`. -#' @inheritParams epi_slide -#' @inherit epi_slide return #' -#' @details To "slide" means to apply a function or formula over a rolling -#' window of time steps for each data group, where the window is entered at a -#' reference time and left and right endpoints are given by the `before` and -#' `after` arguments. The unit (the meaning of one time step) is implicitly -#' defined by the way the `time_value` column treats addition and subtraction; -#' for example, if the time values are coded as `Date` objects, then one time -#' step is one day, since `as.Date("2022-01-01") + 1` equals -#' `as.Date("2022-01-02")`. Alternatively, the time step can be set explicitly -#' using the `time_step` argument (which if specified would override the -#' default choice based on `time_value` column). If there are not enough time -#' steps available to complete the window at any given reference time, then -#' `epi_slide()` still attempts to perform the computation anyway (it does not -#' require a complete window). The issue of what to do with partial -#' computations (those run on incomplete windows) is therefore left up to the -#' user, either through the specified function or formula `f`, or through -#' post-processing. For a centrally-aligned slide of `n` `time_value`s in a -#' sliding window, set `before = (n-1)/2` and `after = (n-1)/2` when the -#' number of `time_value`s in a sliding window is odd and `before = n/2-1` and -#' `after = n/2` when `n` is even. -#' -#' Sometimes, we want to experiment with various trailing or leading window -#' widths and compare the slide outputs. 
In the (uncommon) case where
-#' zero-width windows are considered, manually pass both the `before` and
-#' `after` arguments in order to prevent potential warnings. (E.g., `before=k`
-#' with `k=0` and `after` missing may produce a warning. To avoid warnings,
-#' use `before=k, after=0` instead; otherwise, it looks too much like a
-#' leading window was intended, but the `after` argument was forgotten or
-#' misspelled.)
+#' @template opt-slide-details
 #'
 #' @importFrom dplyr bind_rows mutate %>% arrange tibble select
 #' @importFrom rlang enquo quo_get_expr as_label
@@ -764,13 +685,14 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values,
 #'
 #' Wrapper around `epi_slide_opt` with `f = datatable::frollmean`.
 #'
+#' @template basic-slide-params
+#' @template opt-slide-params
 #' @param ... Additional arguments to pass to `data.table::frollmean`, for
 #' example, `na.rm` and `algo`. `data.table::frollmean` is automatically
 #' passed the data `x` to operate on, the window size `n`, and the alignment
 #' `align`. Providing these args via `...` will cause an error.
-#' @inheritParams epi_slide_opt
-#' @inherit epi_slide_opt return
-#' @inherit epi_slide_opt details
+#'
+#' @template opt-slide-details
 #'
 #' @export
 #' @seealso [`epi_slide`] [`epi_slide_opt`] [`epi_slide_sum`]
@@ -860,13 +768,14 @@ epi_slide_mean <- function(x, col_names, ..., before, after, ref_time_values,
 #'
 #' Wrapper around `epi_slide_opt` with `f = datatable::frollsum`.
 #'
+#' @template basic-slide-params
+#' @template opt-slide-params
 #' @param ... Additional arguments to pass to `data.table::frollsum`, for
 #' example, `na.rm` and `algo`. `data.table::frollsum` is automatically
 #' passed the data `x` to operate on, the window size `n`, and the alignment
 #' `align`. Providing these args via `...` will cause an error.
-#' @inheritParams epi_slide_opt
-#' @inherit epi_slide_opt return
-#' @inherit epi_slide_opt details
+#'
+#' @template opt-slide-details
 #'
 #' @export
 #' @seealso [`epi_slide`] [`epi_slide_opt`] [`epi_slide_mean`]
diff --git a/man-roxygen/basic-slide-params.R b/man-roxygen/basic-slide-params.R
new file mode 100644
index 00000000..383c102d
--- /dev/null
+++ b/man-roxygen/basic-slide-params.R
@@ -0,0 +1,45 @@
+#' @param x The `epi_df` object under consideration, [grouped][dplyr::group_by]
+#' or ungrouped. If ungrouped, all data in `x` will be treated as part of a
+#' single data group.
+#' @param before,after How far `before` and `after` each `ref_time_value` should
+#' the sliding window extend? At least one of these two arguments must be
+#' provided; the other's default will be 0. Any value provided for either
+#' argument must be a single, non-`NA`, non-negative,
+#' [integer-compatible][vctrs::vec_cast] number of time steps. Endpoints of
+#' the window are inclusive. Common settings:
+#' * For trailing/right-aligned windows from `ref_time_value - time_step(k)`
+#' to `ref_time_value`: either pass `before=k` by itself, or pass
+#' `before=k, after=0`.
+#' * For center-aligned windows from `ref_time_value - time_step(k)` to
+#' `ref_time_value + time_step(k)`: pass `before=k, after=k`.
+#' * For leading/left-aligned windows from `ref_time_value` to
+#' `ref_time_value + time_step(k)`: either pass `after=k` by itself,
+#' or pass `before=0, after=k`.
+#' See "Details:" about the definition of a time step, (non)treatment of
+#' missing rows within the window, and avoiding warnings about
+#' `before`&`after` settings for a certain uncommon use case. 
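
# --- Editorial sketch, not part of the patch series: the common window
# settings the `before`/`after` docs above describe, using the package's
# `jhu_csse_daily_subset` data. ---
library(dplyr)
library(epiprocess)
# trailing/right-aligned 7-day window: before = 6 (after defaults to 0)
jhu_csse_daily_subset %>%
  group_by(geo_value) %>%
  epi_slide_sum(cases, new_col_name = "cases_7dsum", names_sep = NULL, before = 6)
# center-aligned 7-day window: before = 3, after = 3
jhu_csse_daily_subset %>%
  group_by(geo_value) %>%
  epi_slide_mean(cases, new_col_name = "cases_7dav", names_sep = NULL, before = 3, after = 3)
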
+#' @param ref_time_values Time values for sliding computations, meaning, each
+#' element of this vector serves as the reference time point for one sliding
+#' window. If missing, then this will be set to all unique time values in the
+#' underlying data table, by default.
+#' @param time_step Optional function used to define the meaning of one time
+#' step, which if specified, overrides the default choice based on the
+#' `time_value` column. This function must take a non-negative integer and
+#' return an object of class [lubridate::period]. For example, we can use
+#' `time_step = lubridate::hours` in order to set the time step to be one hour
+#' (this would only be meaningful if `time_value` is of class `POSIXct`).
+#' @param names_sep String specifying the separator to use in `tidyr::unnest()`
+#' when `as_list_col = FALSE`. Default is "_". Using `NULL` drops the prefix
+#' from `new_col_name` entirely.
+#' @param all_rows If `all_rows = TRUE`, then all rows of `x` will be kept in
+#' the output even with `ref_time_values` provided, with some type of missing
+#' value marker for the slide computation output column(s) for `time_value`s
+#' outside `ref_time_values`; otherwise, there will be one row for each row in
+#' `x` that had a `time_value` in `ref_time_values`. Default is `FALSE`. The
+#' missing value marker is the result of `vctrs::vec_cast`ing `NA` to the type
+#' of the slide computation output. If using `as_list_col = TRUE`, note that
+#' the missing marker is a `NULL` entry in the list column; for certain
+#' operations, you might want to replace these `NULL` entries with a different
+#' `NA` marker.
+#' @return An `epi_df` object given by appending one or more new columns to
+#' `x`, named according to the `new_col_name` argument.
diff --git a/man-roxygen/opt-slide-details.R b/man-roxygen/opt-slide-details.R
new file mode 100644
index 00000000..33fb437c
--- /dev/null
+++ b/man-roxygen/opt-slide-details.R
@@ -0,0 +1,25 @@
+#' @details To "slide" means to apply a function over a rolling window of time
+#' steps for each data group, where the window is centered at a reference
+#' time and left and right endpoints are given by the `before` and `after`
+#' arguments. The unit (the meaning of one time step) is implicitly defined
+#' by the way the `time_value` column treats addition and subtraction; for
+#' example, if the time values are coded as `Date` objects, then one time
+#' step is one day, since `as.Date("2022-01-01") + 1` equals
+#' `as.Date("2022-01-02")`. Alternatively, the time step can be set explicitly
+#' using the `time_step` argument (which if specified would override the default
+#' choice based on `time_value` column). If there are not enough time steps
+#' available to complete the window at any given reference time, then
+#' `epi_slide_*()` will fail; it requires a complete window to perform the
+#' computation. For a centrally-aligned slide of `n` `time_value`s in a
+#' sliding window, set `before = (n-1)/2` and `after = (n-1)/2` when the
+#' number of `time_value`s in a sliding window is odd and `before = n/2-1`
+#' and `after = n/2` when `n` is even.
+#'
+#' Sometimes, we want to experiment with various trailing or leading window
+#' widths and compare the slide outputs. In the (uncommon) case where
+#' zero-width windows are considered, manually pass both the `before` and
+#' `after` arguments in order to prevent potential warnings. (E.g., `before=k`
+#' with `k=0` and `after` missing may produce a warning. 
To avoid warnings,
+#' use `before=k, after=0` instead; otherwise, it looks too much like a
+#' leading window was intended, but the `after` argument was forgotten or
+#' misspelled.)
diff --git a/man-roxygen/opt-slide-params.R b/man-roxygen/opt-slide-params.R
new file mode 100644
index 00000000..a7d5b04a
--- /dev/null
+++ b/man-roxygen/opt-slide-params.R
@@ -0,0 +1,8 @@
+#' @param col_names A single tidyselection or a tidyselection vector of the
+#' names of one or more columns for which to perform the rolling computation.
+#' @param as_list_col Not supported. Included to match `epi_slide` interface.
+#' @param new_col_name Character vector indicating the name(s) of the new
+#' column(s) that will contain the slide values. Default
+#' is "slide_value"; note that setting `new_col_name` equal to any existing
+#' column names will overwrite those columns. If `names_sep` is `NULL`,
+#' `new_col_name` must be the same length as `col_names`.
diff --git a/man/epi_slide.Rd b/man/epi_slide.Rd
index 0745a194..dede6d05 100644
--- a/man/epi_slide.Rd
+++ b/man/epi_slide.Rd
@@ -112,7 +112,7 @@ examples.
 }
 \details{
 To "slide" means to apply a function or formula over a rolling
-window of time steps for each data group, where the window is entered at a
+window of time steps for each data group, where the window is centered at a
 reference time and left and right endpoints are given by the \code{before} and
 \code{after} arguments. The unit (the meaning of one time step) is implicitly
 defined by the way the \code{time_value} column treats addition and subtraction;
diff --git a/man/epi_slide_mean.Rd b/man/epi_slide_mean.Rd
index 5ce5783a..ee3e7838 100644
--- a/man/epi_slide_mean.Rd
+++ b/man/epi_slide_mean.Rd
@@ -95,6 +95,31 @@ examples.
 }
 \details{
 Wrapper around \code{epi_slide_opt} with \code{f = datatable::frollmean}.
+
+To "slide" means to apply a function over a rolling window of time
+steps for each data group, where the window is centered at a reference
+time and left and right endpoints are given by the \code{before} and \code{after}
+arguments. The unit (the meaning of one time step) is implicitly defined
+by the way the \code{time_value} column treats addition and subtraction; for
+example, if the time values are coded as \code{Date} objects, then one time
+step is one day, since \code{as.Date("2022-01-01") + 1} equals \code{as.Date("2022-01-02")}. Alternatively, the time step can be set explicitly using
+the \code{time_step} argument (which if specified would override the default
+choice based on \code{time_value} column). If there are not enough time steps
+available to complete the window at any given reference time, then
+\verb{epi_slide_*()} will fail; it requires a complete window to perform the
+computation. For a centrally-aligned slide of \code{n} \code{time_value}s in a
+sliding window, set \code{before = (n-1)/2} and \code{after = (n-1)/2} when the
+number of \code{time_value}s in a sliding window is odd and \code{before = n/2-1}
+and \code{after = n/2} when \code{n} is even.
+
+Sometimes, we want to experiment with various trailing or leading window
+widths and compare the slide outputs. In the (uncommon) case where
+zero-width windows are considered, manually pass both the \code{before} and
+\code{after} arguments in order to prevent potential warnings. (E.g., \code{before=k}
+with \code{k=0} and \code{after} missing may produce a warning. 
To avoid warnings, +use \verb{before=k, after=0} instead; otherwise, it looks too much like a +leading window was intended, but the \code{after} argument was forgotten or +misspelled.) } \examples{ # slide a 7-day trailing average formula on cases diff --git a/man/epi_slide_opt.Rd b/man/epi_slide_opt.Rd index 75ea8f7a..207e4c19 100644 --- a/man/epi_slide_opt.Rd +++ b/man/epi_slide_opt.Rd @@ -117,25 +117,21 @@ over variables in an \code{epi_df} object. See the \link{slide vignette} examples. } \details{ -To "slide" means to apply a function or formula over a rolling -window of time steps for each data group, where the window is entered at a -reference time and left and right endpoints are given by the \code{before} and -\code{after} arguments. The unit (the meaning of one time step) is implicitly -defined by the way the \code{time_value} column treats addition and subtraction; -for example, if the time values are coded as \code{Date} objects, then one time -step is one day, since \code{as.Date("2022-01-01") + 1} equals -\code{as.Date("2022-01-02")}. Alternatively, the time step can be set explicitly -using the \code{time_step} argument (which if specified would override the -default choice based on \code{time_value} column). If there are not enough time -steps available to complete the window at any given reference time, then -\code{epi_slide()} still attempts to perform the computation anyway (it does not -require a complete window). The issue of what to do with partial -computations (those run on incomplete windows) is therefore left up to the -user, either through the specified function or formula \code{f}, or through -post-processing. For a centrally-aligned slide of \code{n} \code{time_value}s in a +To "slide" means to apply a function over a rolling window of time +steps for each data group, where the window is centered at a reference +time and left and right endpoints are given by the \code{before} and \code{after} +arguments. The unit (the meaning of one time step) is implicitly defined +by the way the \code{time_value} column treats addition and subtraction; for +example, if the time values are coded as \code{Date} objects, then one time +step is one day, since \code{as.Date("2022-01-01") + 1} equals \code{as.Date ("2022-01-02")}. Alternatively, the time step can be set explicitly using +the \code{time_step} argument (which if specified would override the default +choice based on \code{time_value} column). If there are not enough time steps +available to complete the window at any given reference time, then +\verb{epi_slide_*()} will fail; it requires a complete window to perform the +computation. For a centrally-aligned slide of \code{n} \code{time_value}s in a sliding window, set \code{before = (n-1)/2} and \code{after = (n-1)/2} when the -number of \code{time_value}s in a sliding window is odd and \code{before = n/2-1} and -\code{after = n/2} when \code{n} is even. +number of \code{time_value}s in a sliding window is odd and \code{before = n/2-1} +and \code{after = n/2} when \code{n} is even. Sometimes, we want to experiment with various trailing or leading window widths and compare the slide outputs. In the (uncommon) case where diff --git a/man/epi_slide_sum.Rd b/man/epi_slide_sum.Rd index b0f76960..c61d0007 100644 --- a/man/epi_slide_sum.Rd +++ b/man/epi_slide_sum.Rd @@ -95,6 +95,31 @@ examples. } \details{ Wrapper around \code{epi_slide_opt} with \code{f = datatable::frollsum}. 
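
# --- Editorial sketch, not part of the patch series: the wrapper relationship
# these details describe, mirroring the equivalence tests added earlier in
# this series. Uses the package's `jhu_csse_daily_subset` data. ---
library(dplyr)
library(epiprocess)
a <- jhu_csse_daily_subset %>%
  group_by(geo_value) %>%
  epi_slide_sum(cases, new_col_name = "cases_7dsum", names_sep = NULL, before = 6)
b <- jhu_csse_daily_subset %>%
  group_by(geo_value) %>%
  epi_slide_opt(cases,
    f = data.table::frollsum,
    new_col_name = "cases_7dsum", names_sep = NULL, before = 6
  )
identical(a, b) # expected TRUE
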
+ +To "slide" means to apply a function over a rolling window of time +steps for each data group, where the window is centered at a reference +time and left and right endpoints are given by the \code{before} and \code{after} +arguments. The unit (the meaning of one time step) is implicitly defined +by the way the \code{time_value} column treats addition and subtraction; for +example, if the time values are coded as \code{Date} objects, then one time +step is one day, since \code{as.Date("2022-01-01") + 1} equals \code{as.Date ("2022-01-02")}. Alternatively, the time step can be set explicitly using +the \code{time_step} argument (which if specified would override the default +choice based on \code{time_value} column). If there are not enough time steps +available to complete the window at any given reference time, then +\verb{epi_slide_*()} will fail; it requires a complete window to perform the +computation. For a centrally-aligned slide of \code{n} \code{time_value}s in a +sliding window, set \code{before = (n-1)/2} and \code{after = (n-1)/2} when the +number of \code{time_value}s in a sliding window is odd and \code{before = n/2-1} +and \code{after = n/2} when \code{n} is even. + +Sometimes, we want to experiment with various trailing or leading window +widths and compare the slide outputs. In the (uncommon) case where +zero-width windows are considered, manually pass both the \code{before} and +\code{after} arguments in order to prevent potential warnings. (E.g., \code{before=k} +with \code{k=0} and \code{after} missing may produce a warning. To avoid warnings, +use \verb{before=k, after=0} instead; otherwise, it looks too much like a +leading window was intended, but the \code{after} argument was forgotten or +misspelled.) } \examples{ # slide a 7-day trailing sum formula on cases From d0bbf4aaecf8c20a0510e4b9ac35467294f3722e Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 4 Apr 2024 16:58:44 -0400 Subject: [PATCH 237/345] sum says it's a sum --- R/slide.R | 2 +- man/epi_slide_sum.Rd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/slide.R b/R/slide.R index 783a6097..728b51b6 100644 --- a/R/slide.R +++ b/R/slide.R @@ -762,7 +762,7 @@ epi_slide_mean <- function(x, col_names, ..., before, after, ref_time_values, #' Optimized slide function for performing rolling sums on an `epi_df` object #' -#' Slides an n-timestep mean over variables in an `epi_df` object. See the [slide +#' Slides an n-timestep sum over variables in an `epi_df` object. See the [slide #' vignette](https://cmu-delphi.github.io/epiprocess/articles/slide.html) for #' examples. #' diff --git a/man/epi_slide_sum.Rd b/man/epi_slide_sum.Rd index c61d0007..d5961f27 100644 --- a/man/epi_slide_sum.Rd +++ b/man/epi_slide_sum.Rd @@ -90,7 +90,7 @@ An \code{epi_df} object given by appending one or more new columns to \code{x}, named according to the \code{new_col_name} argument. } \description{ -Slides an n-timestep mean over variables in an \code{epi_df} object. See the \href{https://cmu-delphi.github.io/epiprocess/articles/slide.html}{slide vignette} for +Slides an n-timestep sum over variables in an \code{epi_df} object. See the \href{https://cmu-delphi.github.io/epiprocess/articles/slide.html}{slide vignette} for examples. 
}
 \details{

From 6d1c7c63decbbe36a62763bc85f80f90919faf2f Mon Sep 17 00:00:00 2001
From: Nat DeFries <42820733+nmdefries@users.noreply.github.com>
Date: Thu, 4 Apr 2024 17:47:52 -0400
Subject: [PATCH 238/345] epidf methods templates

---
 R/methods-epi_df.R       | 16 ++++++++--------
 man-roxygen/x.R          |  1 +
 man/as_tibble.epi_df.Rd  |  2 +-
 man/as_tsibble.epi_df.Rd |  2 +-
 man/print.epi_df.Rd      |  8 +++-----
 5 files changed, 14 insertions(+), 15 deletions(-)
 create mode 100644 man-roxygen/x.R

diff --git a/R/methods-epi_df.R b/R/methods-epi_df.R
index 22ea2928..02eaf798 100644
--- a/R/methods-epi_df.R
+++ b/R/methods-epi_df.R
@@ -3,8 +3,8 @@
 #' Converts an `epi_df` object into a tibble, dropping metadata and any
 #' grouping.
 #'
-#' @param x an `epi_df`
-#' @param ... arguments to forward to `NextMethod()`
+#' @template x
+#' @param ... additional arguments to forward to `NextMethod()`
 #'
 #' @importFrom tibble as_tibble
 #' @export
@@ -22,7 +22,7 @@ as_tibble.epi_df <- function(x, ...) {
 #' others in the `other_keys` field of the metadata, or else explicitly set.
 #'
 #' @method as_tsibble epi_df
-#' @param x The `epi_df` object.
+#' @template x
 #' @param key Optional. Any additional keys (other than `geo_value`) to add to
 #' the `tsibble`.
 #' @param ... additional arguments passed on to `tsibble::as_tsibble()`
@@ -39,8 +39,8 @@ as_tsibble.epi_df <- function(x, key, ...) {
 #'
 #' Print and summary functions for an `epi_df` object.
 #'
-#' @param x The `epi_df` object.
-#' @param ... Additional arguments passed to methods.
+#' @template x
+#' @param ... additional arguments to forward to `NextMethod()`
 #'
 #' @method print epi_df
 #' @export
@@ -61,7 +61,7 @@ print.epi_df <- function(x, ...) {
 #' Prints a variety of summary statistics about the `epi_df` object, such as
 #' the time range included and geographic coverage.
 #'
-#' @param object The `epi_df` object.
+#' @param object an `epi_df`
 #' @param ... Additional arguments, for compatibility with `summary()`.
 #' Currently unused.
 #'
@@ -223,7 +223,7 @@ ungroup.epi_df <- function(x, ...) {
 
 #' @method group_modify epi_df
 #' @rdname print.epi_df
-#' @param .data The `epi_df` object.
+#' @param .data an `epi_df`
 #' @param .f function or formula; see [`dplyr::group_modify`]
 #' @param .keep Boolean; see [`dplyr::group_modify`]
 #' @export
@@ -233,7 +233,7 @@ group_modify.epi_df <- function(.data, .f, ..., .keep = FALSE) {
 
 #' @method unnest epi_df
 #' @rdname print.epi_df
-#' @param data The `epi_df` object.
+#' @param data an `epi_df`
 #' @export
 unnest.epi_df <- function(data, ...) {
   dplyr::dplyr_reconstruct(NextMethod(), data)
 }
diff --git a/man-roxygen/x.R b/man-roxygen/x.R
new file mode 100644
index 00000000..a26f9f25
--- /dev/null
+++ b/man-roxygen/x.R
@@ -0,0 +1 @@
+#' @param x an `epi_df`
diff --git a/man/as_tibble.epi_df.Rd b/man/as_tibble.epi_df.Rd
index c314f47e..5913a5e7 100644
--- a/man/as_tibble.epi_df.Rd
+++ b/man/as_tibble.epi_df.Rd
@@ -9,7 +9,7 @@
 \arguments{
 \item{x}{an \code{epi_df}}
 
-\item{...}{arguments to forward to \code{NextMethod()}}
+\item{...}{additional arguments to forward to \code{NextMethod()}}
 }
 \description{
 Converts an \code{epi_df} object into a tibble, dropping metadata and any
 grouping.
 }
diff --git a/man/as_tsibble.epi_df.Rd b/man/as_tsibble.epi_df.Rd
index 98ca7f74..73200c3b 100644
--- a/man/as_tsibble.epi_df.Rd
+++ b/man/as_tsibble.epi_df.Rd
@@ -7,7 +7,7 @@
 \method{as_tsibble}{epi_df}(x, key, ...)
 }
 \arguments{
-\item{x}{The \code{epi_df} object.}
+\item{x}{an \code{epi_df}}
 
 \item{key}{Optional. 
Any additional keys (other than \code{geo_value}) to add to the \code{tsibble}.} diff --git a/man/print.epi_df.Rd b/man/print.epi_df.Rd index f5749d82..894036c4 100644 --- a/man/print.epi_df.Rd +++ b/man/print.epi_df.Rd @@ -22,20 +22,18 @@ \method{unnest}{epi_df}(data, ...) } \arguments{ -\item{x}{The \code{epi_df} object.} +\item{x}{an \code{epi_df}} \item{...}{Additional arguments, for compatibility with \code{summary()}. Currently unused.} -\item{object}{The \code{epi_df} object.} +\item{object}{an \code{epi_df}} -\item{.data}{The \code{epi_df} object.} +\item{.data}{an \code{epi_df}} \item{.f}{function or formula; see \code{\link[dplyr:group_map]{dplyr::group_modify}}} \item{.keep}{Boolean; see \code{\link[dplyr:group_map]{dplyr::group_modify}}} - -\item{data}{The \code{epi_df} object.} } \description{ Print and summary functions for an \code{epi_df} object. From 7ec2d875f8caa5db662d535bca36466c8d3bac12 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 4 Apr 2024 17:51:32 -0400 Subject: [PATCH 239/345] outliers templates --- R/outliers.R | 57 ++++--------------------- man-roxygen/detect-outlr-return.R | 3 ++ man-roxygen/outlier-detection-options.R | 15 +++++++ man-roxygen/x-y.R | 4 ++ man/detect_outlr.Rd | 6 +-- man/detect_outlr_rm.Rd | 5 ++- man/detect_outlr_stl.Rd | 5 ++- 7 files changed, 39 insertions(+), 56 deletions(-) create mode 100644 man-roxygen/detect-outlr-return.R create mode 100644 man-roxygen/outlier-detection-options.R create mode 100644 man-roxygen/x-y.R diff --git a/R/outliers.R b/R/outliers.R index 68a656a7..ab4f0e8e 100644 --- a/R/outliers.R +++ b/R/outliers.R @@ -6,10 +6,7 @@ #' vignette](https://cmu-delphi.github.io/epiprocess/articles/outliers.html) for #' examples. #' -#' @param x Design points corresponding to the signal values `y`. Default is -#' `seq_along(y)` (that is, equally-spaced points from 1 to the length of -#' `y`). -#' @param y Signal values. +#' @template x-y #' @param methods A tibble specifying the method(s) to use for outlier #' detection, with one row per method, and the following columns: #' * `method`: Either "rm" or "stl", or a custom function for outlier @@ -25,9 +22,7 @@ #' summarized results are calculated. Note that if the number of `methods` #' (number of rows) is odd, then "median" is equivalent to a majority vote for #' purposes of determining whether a given observation is an outlier. -#' @return An tibble with number of rows equal to `length(y)` and columns giving -#' the outlier detection thresholds and replacement values from each detection -#' method. +#' @template detect-outlr-return #' #' @details Each outlier detection method, one per row of the passed `methods` #' tibble, is a function that must take as its first two arguments `x` and @@ -147,32 +142,14 @@ detect_outlr <- function(x = seq_along(y), y, #' Detects outliers based on a distance from the rolling median specified in #' terms of multiples of the rolling interquartile range (IQR). #' -#' @param x Design points corresponding to the signal values `y`. Default is -#' `seq_along(y)` (that is, equally-spaced points from 1 to the length of -#' `y`). -#' @param y Signal values. +#' @template x-y #' @param n Number of time steps to use in the rolling window. Default is 21. #' This value is centrally aligned. When `n` is an odd number, the rolling #' window extends from `(n-1)/2` time steps before each design point to `(n-1)/2` #' time steps after. 
When `n` is even, then the rolling range extends from #' `n/2-1` time steps before to `n/2` time steps after. -#' @param log_transform Should a log transform be applied before running outlier -#' detection? Default is `FALSE`. If `TRUE`, and zeros are present, then the -#' log transform will be padded by 1. -#' @param detect_negatives Should negative values automatically count as -#' outliers? Default is `FALSE`. -#' @param detection_multiplier Value determining how far the outlier detection -#' thresholds are from the rolling median, which are calculated as (rolling -#' median) +/- (detection multiplier) * (rolling IQR). Default is 2. -#' @param min_radius Minimum distance between rolling median and threshold, on -#' transformed scale. Default is 0. -#' @param replacement_multiplier Value determining how far the replacement -#' values are from the rolling median. The replacement is the original value -#' if it is within the detection thresholds, or otherwise it is rounded to the -#' nearest (rolling median) +/- (replacement multiplier) * (rolling IQR). -#' Default is 0. -#' @return A tibble with number of rows equal to `length(y)`, and columns -#' `lower`, `upper`, and `replacement`. +#' @template outlier-detection-options +#' @template detect-outlr-return #' #' @export #' @examples @@ -235,10 +212,7 @@ detect_outlr_rm <- function(x = seq_along(y), y, n = 21, #' #' Detects outliers based on a seasonal-trend decomposition using LOESS (STL). #' -#' @param x Design points corresponding to the signal values `y`. Default is -#' `seq_along(y)` (that is, equally-spaced points from 1 to the length of -#' `y`). -#' @param y Signal values. +#' @template x-y #' @param n_trend Number of time steps to use in the rolling window for trend. #' Default is 21. #' @param n_seasonal Number of time steps to use in the rolling window for @@ -248,23 +222,8 @@ detect_outlr_rm <- function(x = seq_along(y), y, n = 21, #' @param seasonal_period Integer specifying period of seasonality. For example, #' for daily data, a period 7 means weekly seasonality. The default is `NULL`, #' meaning that no seasonal term will be included in the STL decomposition. -#' @param log_transform Should a log transform be applied before running outlier -#' detection? Default is `FALSE`. If `TRUE`, and zeros are present, then the -#' log transform will be padded by 1. -#' @param detect_negatives Should negative values automatically count as -#' outliers? Default is `FALSE`. -#' @param detection_multiplier Value determining how far the outlier detection -#' thresholds are from the rolling median, which are calculated as (rolling -#' median) +/- (detection multiplier) * (rolling IQR). Default is 2. -#' @param min_radius Minimum distance between rolling median and threshold, on -#' transformed scale. Default is 0. -#' @param replacement_multiplier Value determining how far the replacement -#' values are from the rolling median. The replacement is the original value -#' if it is within the detection thresholds, or otherwise it is rounded to the -#' nearest (rolling median) +/- (replacement multiplier) * (rolling IQR). -#' Default is 0. -#' @return A tibble with number of rows equal to `length(y)`, and columns -#' `lower`, `upper`, and `replacement`. +#' @template outlier-detection-options +#' @template detect-outlr-return #' #' @details The STL decomposition is computed using the `feasts` package. 
Once
 #' computed, the outlier detection method is analogous to the rolling median
diff --git a/man-roxygen/detect-outlr-return.R b/man-roxygen/detect-outlr-return.R
new file mode 100644
index 00000000..50222e0e
--- /dev/null
+++ b/man-roxygen/detect-outlr-return.R
@@ -0,0 +1,3 @@
+#' @return A tibble with number of rows equal to `length(y)` and columns
+#' giving the outlier detection thresholds (`lower` and `upper`) and
+#' replacement values from each detection method (`replacement`).
diff --git a/man-roxygen/outlier-detection-options.R b/man-roxygen/outlier-detection-options.R
new file mode 100644
index 00000000..4b4260e5
--- /dev/null
+++ b/man-roxygen/outlier-detection-options.R
@@ -0,0 +1,15 @@
+#' @param log_transform Should a log transform be applied before running outlier
+#' detection? Default is `FALSE`. If `TRUE`, and zeros are present, then the
+#' log transform will be padded by 1.
+#' @param detect_negatives Should negative values automatically count as
+#' outliers? Default is `FALSE`.
+#' @param detection_multiplier Value determining how far the outlier detection
+#' thresholds are from the rolling median, which are calculated as (rolling
+#' median) +/- (detection multiplier) * (rolling IQR). Default is 2.
+#' @param min_radius Minimum distance between rolling median and threshold, on
+#' transformed scale. Default is 0.
+#' @param replacement_multiplier Value determining how far the replacement
+#' values are from the rolling median. The replacement is the original value
+#' if it is within the detection thresholds, or otherwise it is rounded to the
+#' nearest (rolling median) +/- (replacement multiplier) * (rolling IQR).
+#' Default is 0.
diff --git a/man-roxygen/x-y.R b/man-roxygen/x-y.R
new file mode 100644
index 00000000..a4f9d1d7
--- /dev/null
+++ b/man-roxygen/x-y.R
@@ -0,0 +1,4 @@
+#' @param x Design points corresponding to the signal values `y`. Default is
+#' `seq_along(y)` (that is, equally-spaced points from 1 to the length of
+#' `y`).
+#' @param y Signal values.
diff --git a/man/detect_outlr.Rd b/man/detect_outlr.Rd
index 3a793ebf..4263a64b 100644
--- a/man/detect_outlr.Rd
+++ b/man/detect_outlr.Rd
@@ -38,9 +38,9 @@ summarized results are calculated. Note that if the number of \code{methods}
 purposes of determining whether a given observation is an outlier.}
 }
 \value{
-An tibble with number of rows equal to \code{length(y)} and columns giving
-the outlier detection thresholds and replacement values from each detection
-method.
+A tibble with number of rows equal to \code{length(y)} and columns
+giving the outlier detection thresholds (\code{lower} and \code{upper}) and
+replacement values from each detection method (\code{replacement}).
 }
 \description{
 Applies one or more outlier detection methods to a given signal variable, and
diff --git a/man/detect_outlr_rm.Rd b/man/detect_outlr_rm.Rd
index 0d011619..333c4a7b 100644
--- a/man/detect_outlr_rm.Rd
+++ b/man/detect_outlr_rm.Rd
@@ -49,8 +49,9 @@ nearest (rolling median) +/- (replacement multiplier) * (rolling IQR).
 Default is 0.}
 }
 \value{
-A tibble with number of rows equal to \code{length(y)}, and columns
-\code{lower}, \code{upper}, and \code{replacement}.
+A tibble with number of rows equal to \code{length(y)} and columns
+giving the outlier detection thresholds (\code{lower} and \code{upper}) and
+replacement values from each detection method (\code{replacement}).
 }
 \description{
 Detects outliers based on a distance from the rolling median specified in
diff --git a/man/detect_outlr_stl.Rd b/man/detect_outlr_stl.Rd
index 34a550d5..2b518451 100644
--- a/man/detect_outlr_stl.Rd
+++ b/man/detect_outlr_stl.Rd
@@ -59,8 +59,9 @@ nearest (rolling median) +/- (replacement multiplier) * (rolling IQR).
 Default is 0.}
 }
 \value{
-A tibble with number of rows equal to \code{length(y)}, and columns
-\code{lower}, \code{upper}, and \code{replacement}.
+A tibble with number of rows equal to \code{length(y)} and columns
+giving the outlier detection thresholds (\code{lower} and \code{upper}) and
+replacement values from each detection method (\code{replacement}).
 }
 \description{
 Detects outliers based on a seasonal-trend decomposition using LOESS (STL).

From 29ccf38858d11e1132d4b0770874962e923986aa Mon Sep 17 00:00:00 2001
From: Nat DeFries <42820733+nmdefries@users.noreply.github.com>
Date: Thu, 4 Apr 2024 17:52:27 -0400
Subject: [PATCH 240/345] epi_df templates

---
 R/epi_df.R                  | 40 ++-----------------------------------
 man-roxygen/epi_df-params.R | 19 ++++++++++++++++++
 2 files changed, 21 insertions(+), 38 deletions(-)
 create mode 100644 man-roxygen/epi_df-params.R

diff --git a/R/epi_df.R b/R/epi_df.R
index 65acfb94..9ed677cf 100644
--- a/R/epi_df.R
+++ b/R/epi_df.R
@@ -91,25 +91,7 @@ NULL
 #' correct metadata for an `epi_df` object (i.e., `geo_type`, `time_type`, and `as_of`).
 #' Refer to the below info. about the arguments for more details.
 #'
-#' @param x A data.frame, [tibble::tibble], or [tsibble::tsibble] to be converted
-#' @param geo_type Type for the geo values. If missing, then the function will
-#' attempt to infer it from the geo values present; if this fails, then it
-#' will be set to "custom".
-#' @param time_type Type for the time values. If missing, then the function will
-#' attempt to infer it from the time values present; if this fails, then it
-#' will be set to "custom".
-#' @param as_of Time value representing the time at which the given data were
-#' available. For example, if `as_of` is January 31, 2022, then the `epi_df`
-#' object that is created would represent the most up-to-date version of the
-#' data available as of January 31, 2022. If the `as_of` argument is missing,
-#' then the current day-time will be used.
-#' @param additional_metadata List of additional metadata to attach to the
-#' `epi_df` object. The metadata will have `geo_type`, `time_type`, and
-#' `as_of` fields; named entries from the passed list will be included as
-#' well. If your tibble has additional keys, be sure to specify them as a
-#' character vector in the `other_keys` component of `additional_metadata`.
-#' @param ... Additional arguments passed to methods.
-#' @return An `epi_df` object.
+#' @template epi_df-params
 #'
 #' @export
 new_epi_df <- function(x = tibble::tibble(), geo_type, time_type, as_of,
@@ -182,25 +164,7 @@ new_epi_df <- function(x = tibble::tibble(), geo_type, time_type, as_of,
 #' guide](https://cmu-delphi.github.io/epiprocess/articles/epiprocess.html) for
 #' examples.
 #'
-#' @param x A data.frame, [tibble::tibble], or [tsibble::tsibble] to be converted
-#' @param geo_type Type for the geo values. If missing, then the function will
-#' attempt to infer it from the geo values present; if this fails, then it
-#' will be set to "custom".
-#' @param time_type Type for the time values. If missing, then the function will
-#' attempt to infer it from the time values present; if this fails, then it
-#' will be set to "custom".
-#' @param as_of Time value representing the time at which the given data were -#' available. For example, if `as_of` is January 31, 2022, then the `epi_df` -#' object that is created would represent the most up-to-date version of the -#' data available as of January 31, 2022. If the `as_of` argument is missing, -#' then the current day-time will be used. -#' @param additional_metadata List of additional metadata to attach to the -#' `epi_df` object. The metadata will have `geo_type`, `time_type`, and -#' `as_of` fields; named entries from the passed list will be included as -#' well. If your tibble has additional keys, be sure to specify them as a -#' character vector in the `other_keys` component of `additional_metadata`. -#' @param ... Additional arguments passed to methods. -#' @return An `epi_df` object. +#' @template epi_df-params #' #' @export #' @examples diff --git a/man-roxygen/epi_df-params.R b/man-roxygen/epi_df-params.R new file mode 100644 index 00000000..54d8c2d2 --- /dev/null +++ b/man-roxygen/epi_df-params.R @@ -0,0 +1,19 @@ +#' @param x A data.frame, [tibble::tibble], or [tsibble::tsibble] to be converted +#' @param geo_type Type for the geo values. If missing, then the function will +#' attempt to infer it from the geo values present; if this fails, then it +#' will be set to "custom". +#' @param time_type Type for the time values. If missing, then the function will +#' attempt to infer it from the time values present; if this fails, then it +#' will be set to "custom". +#' @param as_of Time value representing the time at which the given data were +#' available. For example, if `as_of` is January 31, 2022, then the `epi_df` +#' object that is created would represent the most up-to-date version of the +#' data available as of January 31, 2022. If the `as_of` argument is missing, +#' then the current day-time will be used. +#' @param additional_metadata List of additional metadata to attach to the +#' `epi_df` object. The metadata will have `geo_type`, `time_type`, and +#' `as_of` fields; named entries from the passed list will be included as +#' well. If your tibble has additional keys, be sure to specify them as a +#' character vector in the `other_keys` component of `additional_metadata`. +#' @param ... Additional arguments passed to methods. +#' @return An `epi_df` object. 
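
A note on the mechanism the template patches above rely on: roxygen2 replaces
an `@template <name>` tag with the contents of `man-roxygen/<name>.R` each
time the `.Rd` files are regenerated, so shared argument docs are written
once. A minimal sketch of a downstream consumer (the wrapper name here is
hypothetical, and the signature is assumed to match `new_epi_df()` as shown
in the hunk above):

#' Build a toy `epi_df` using the shared parameter docs
#'
#' @template epi_df-params
#' @export
toy_epi_df <- function(x = tibble::tibble(), geo_type, time_type, as_of,
                       additional_metadata = list(), ...) {
  # On `devtools::document()`, the @template tag expands into the @param and
  # @return text from man-roxygen/epi_df-params.R; this hypothetical wrapper
  # simply forwards to the real constructor.
  new_epi_df(x, geo_type, time_type, as_of, additional_metadata, ...)
}
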
From 296d657df6a49c822e5d1410d533f03639b5048b Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Tue, 9 Apr 2024 13:32:46 -0500 Subject: [PATCH 241/345] identical doesn't need isTRUE wrapper --- R/slide.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/slide.R b/R/slide.R index 2e07d502..53e63235 100644 --- a/R/slide.R +++ b/R/slide.R @@ -579,14 +579,14 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values, if (any(sapply( c(frollmean, frollsum, frollapply), function(roll_fn) { - isTRUE(identical(f, roll_fn)) + identical(f, roll_fn) } ))) { f_from_package <- "data.table" } else if (any(sapply( c(slide_sum, slide_prod, slide_mean, slide_min, slide_max, slide_all, slide_any), function(roll_fn) { - isTRUE(identical(f, roll_fn)) + identical(f, roll_fn) } ))) { f_from_package <- "slider" From 9e336def0ddc0d3377fe8b8b886944a8cea67fe7 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Tue, 9 Apr 2024 13:41:16 -0500 Subject: [PATCH 242/345] make unsupported-f error message report more detail than just arg name --- NAMESPACE | 2 ++ R/slide.R | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 46207199..a84da14e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -139,9 +139,11 @@ importFrom(rlang,arg_match) importFrom(rlang,as_label) importFrom(rlang,caller_arg) importFrom(rlang,caller_env) +importFrom(rlang,enexpr) importFrom(rlang,enquo) importFrom(rlang,enquos) importFrom(rlang,env) +importFrom(rlang,expr_label) importFrom(rlang,f_env) importFrom(rlang,f_rhs) importFrom(rlang,is_environment) diff --git a/R/slide.R b/R/slide.R index 53e63235..3b36caef 100644 --- a/R/slide.R +++ b/R/slide.R @@ -493,7 +493,7 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' misspelled.) #' #' @importFrom dplyr bind_rows mutate %>% arrange tibble select -#' @importFrom rlang enquo quo_get_expr as_label +#' @importFrom rlang enquo quo_get_expr as_label expr_label enexpr #' @importFrom purrr map #' @importFrom data.table frollmean frollsum frollapply #' @importFrom lubridate as.period @@ -594,7 +594,7 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values, # `f` is from somewhere else and not supported cli_abort( c( - "slide function `f` is not supported", + "problem with {rlang::expr_label(rlang::enexpr(f))}", "i" = "`f` must be one of `data.table`'s rolling functions (`frollmean`, `frollsum`, `frollapply`. 
See `?data.table::roll`) or one of `slider`'s specialized sliding functions (`slide_mean`, `slide_sum`, From a7aa15bc078ded2ba4e12760be704ccace66d66e Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Tue, 9 Apr 2024 15:57:24 -0500 Subject: [PATCH 243/345] use map_lgl to check if user opt-f is valid --- R/slide.R | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/R/slide.R b/R/slide.R index 3b36caef..27cedbc6 100644 --- a/R/slide.R +++ b/R/slide.R @@ -494,7 +494,7 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' #' @importFrom dplyr bind_rows mutate %>% arrange tibble select #' @importFrom rlang enquo quo_get_expr as_label expr_label enexpr -#' @importFrom purrr map +#' @importFrom purrr map map_lgl #' @importFrom data.table frollmean frollsum frollapply #' @importFrom lubridate as.period #' @importFrom checkmate assert_function @@ -576,15 +576,15 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values, # `data.table` and `slider` (or a function that has the exact same # definition, e.g. if the function has been reexported or defined # locally). - if (any(sapply( - c(frollmean, frollsum, frollapply), + if (any(map_lgl( + list(frollmean, frollsum, frollapply), function(roll_fn) { identical(f, roll_fn) } ))) { f_from_package <- "data.table" - } else if (any(sapply( - c(slide_sum, slide_prod, slide_mean, slide_min, slide_max, slide_all, slide_any), + } else if (any(map_lgl( + list(slide_sum, slide_prod, slide_mean, slide_min, slide_max, slide_all, slide_any), function(roll_fn) { identical(f, roll_fn) } From aa9b6f484d5b5477d936f6137de332be671fd9fc Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Tue, 9 Apr 2024 16:10:03 -0500 Subject: [PATCH 244/345] fix slide vignette links --- R/slide.R | 12 ++++++------ man/epi_slide.Rd | 5 +++-- man/epi_slide_opt.Rd | 6 +++--- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/R/slide.R b/R/slide.R index 728b51b6..68843eed 100644 --- a/R/slide.R +++ b/R/slide.R @@ -1,8 +1,8 @@ #' Slide a function over variables in an `epi_df` object #' -#' Slides a given function over variables in an `epi_df` object. See the [slide -#' vignette](https://cmu-delphi.github.io/epiprocess/articles/slide.html) for -#' examples. +#' Slides a given function over variables in an `epi_df` object. See the +#' [slide vignette](https://cmu-delphi.github.io/epiprocess/articles/slide.html) +#' for examples. #' #' @template basic-slide-params #' @param f Function, formula, or missing; together with `...` specifies the @@ -343,9 +343,9 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' Optimized slide function for performing common rolling computations on an `epi_df` object #' #' Slides an n-timestep [data.table::froll] or [slider::summary-slide] function -#' over variables in an `epi_df` object. See the [slide vignette] -#' (https://cmu-delphi.github.io/epiprocess/articles/slide.html) for -#' examples. +#' over variables in an `epi_df` object. See the +#' [slide vignette](https://cmu-delphi.github.io/epiprocess/articles/slide.html) +#' for examples. #' #' @template basic-slide-params #' @template opt-slide-params diff --git a/man/epi_slide.Rd b/man/epi_slide.Rd index dede6d05..0d0dfb55 100644 --- a/man/epi_slide.Rd +++ b/man/epi_slide.Rd @@ -107,8 +107,9 @@ An \code{epi_df} object given by appending one or more new columns to \code{x}, named according to the \code{new_col_name} argument. 
 }
 \description{
-Slides a given function over variables in an \code{epi_df} object. See the \href{https://cmu-delphi.github.io/epiprocess/articles/slide.html}{slide vignette} for
-examples.
+Slides a given function over variables in an \code{epi_df} object. See the
+\href{https://cmu-delphi.github.io/epiprocess/articles/slide.html}{slide vignette}
+for examples.
 }
 \details{
 To "slide" means to apply a function or formula over a rolling
diff --git a/man/epi_slide_opt.Rd b/man/epi_slide_opt.Rd
index 207e4c19..0772b431 100644
--- a/man/epi_slide_opt.Rd
+++ b/man/epi_slide_opt.Rd
@@ -112,9 +112,9 @@ An \code{epi_df} object given by appending one or more new columns to
 }
 \description{
 Slides an n-timestep \link[data.table:froll]{data.table::froll} or \link[slider:summary-slide]{slider::summary-slide} function
-over variables in an \code{epi_df} object. See the \link{slide vignette}
-(https://cmu-delphi.github.io/epiprocess/articles/slide.html) for
-examples.
+over variables in an \code{epi_df} object. See the
+\href{https://cmu-delphi.github.io/epiprocess/articles/slide.html}{slide vignette}
+for examples.
 }
 \details{
 To "slide" means to apply a function over a rolling window of time

From e499737deff521a207d2c0b43ab6a65cf367bd59 Mon Sep 17 00:00:00 2001
From: Nat DeFries <42820733+nmdefries@users.noreply.github.com>
Date: Tue, 9 Apr 2024 16:16:31 -0500
Subject: [PATCH 245/345] suppress indentation linting on a controversial line

---
 R/slide.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/slide.R b/R/slide.R
index 68843eed..7cdc8f38 100644
--- a/R/slide.R
+++ b/R/slide.R
@@ -828,7 +828,7 @@ full_date_seq <- function(x, before, after, time_step) {
   # `tsibble` classes apparently can't be added to in different units, so even
   # if `time_step` is provided by the user, use a value-1 unitless step.
   if (inherits(x$time_value, c("yearquarter", "yearweek", "yearmonth")) ||
-    is.numeric(x$time_value)) {
+    is.numeric(x$time_value)) { # nolint: indentation_linter
     all_dates <- seq(min(x$time_value), max(x$time_value), by = 1L)
 
     if (before != 0) {

From d54dd8754f839c565c65372c3acf1cf03fe690d3 Mon Sep 17 00:00:00 2001
From: Nat DeFries <42820733+nmdefries@users.noreply.github.com>
Date: Tue, 9 Apr 2024 16:16:54 -0500
Subject: [PATCH 246/345] document remaining arg

---
 R/methods-epi_df.R  | 3 ++-
 man/print.epi_df.Rd | 2 ++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/R/methods-epi_df.R b/R/methods-epi_df.R
index 02eaf798..632dc3a3 100644
--- a/R/methods-epi_df.R
+++ b/R/methods-epi_df.R
@@ -204,6 +204,7 @@ dplyr_row_slice.epi_df <- function(data, i, ...) {
 }
 
 #' @method group_by epi_df
+#' @param .data an `epi_df`
 #' @rdname print.epi_df
 #' @export
 group_by.epi_df <- function(.data, ...) {
@@ -233,7 +234,7 @@ group_modify.epi_df <- function(.data, .f, ..., .keep = FALSE) {
 
 #' @method unnest epi_df
 #' @rdname print.epi_df
-#' @param .data an `epi_df`
+#' @param data an `epi_df`
 #' @export
 unnest.epi_df <- function(data, ...) {
   dplyr::dplyr_reconstruct(NextMethod(), data)
diff --git a/man/print.epi_df.Rd b/man/print.epi_df.Rd
index f5749d82..5a232de0 100644
--- a/man/print.epi_df.Rd
+++ b/man/print.epi_df.Rd
@@ -34,6 +34,8 @@ Currently unused.}
 \item{.f}{function or formula; see \code{\link[dplyr:group_map]{dplyr::group_modify}}}
 
 \item{.keep}{Boolean; see \code{\link[dplyr:group_map]{dplyr::group_modify}}}
+
+\item{data}{an \code{epi_df}}
 }
 \description{
 Print and summary functions for an \code{epi_df} object.
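
The function-matching pattern that the `epi_slide_opt` patches above converge
on reads cleanly in isolation. A condensed, standalone sketch (variable names
illustrative, not the package code verbatim): compare the user-supplied `f`
against each supported rolling function with `identical()`, collect the
results with `purrr::map_lgl()`, and reduce with `any()`.

library(purrr)
library(data.table) # frollmean, frollsum, frollapply
library(slider) # slide_mean, slide_sum, and friends

f <- data.table::frollmean
if (any(map_lgl(list(frollmean, frollsum, frollapply), function(g) identical(f, g)))) {
  # `identical()` is TRUE only when `f` is the very same function object
  f_from_package <- "data.table"
} else if (any(map_lgl(list(slide_mean, slide_sum), function(g) identical(f, g)))) {
  f_from_package <- "slider"
} else {
  stop("`f` must be a supported `data.table` or `slider` rolling function")
}
f_from_package
#> [1] "data.table"
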
From fc2590d7401d98598ec58aba3609f64d0ac88f00 Mon Sep 17 00:00:00 2001
From: Nat DeFries <42820733+nmdefries@users.noreply.github.com>
Date: Thu, 25 Apr 2024 15:07:34 -0400
Subject: [PATCH 247/345] improve slide function f error context

---
 NAMESPACE | 1 -
 R/slide.R | 4 ++--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/NAMESPACE b/NAMESPACE
index a84da14e..18bc6fc6 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -139,7 +139,6 @@ importFrom(rlang,arg_match)
 importFrom(rlang,as_label)
 importFrom(rlang,caller_arg)
 importFrom(rlang,caller_env)
-importFrom(rlang,enexpr)
 importFrom(rlang,enquo)
 importFrom(rlang,enquos)
 importFrom(rlang,env)
diff --git a/R/slide.R b/R/slide.R
index 4c5947c2..416127ef 100644
--- a/R/slide.R
+++ b/R/slide.R
@@ -376,7 +376,7 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values,
 #' @template opt-slide-details
 #'
 #' @importFrom dplyr bind_rows mutate %>% arrange tibble select
-#' @importFrom rlang enquo quo_get_expr as_label expr_label enexpr
+#' @importFrom rlang enquo quo_get_expr as_label expr_label caller_arg
 #' @importFrom purrr map map_lgl
 #' @importFrom data.table frollmean frollsum frollapply
 #' @importFrom lubridate as.period
@@ -477,7 +477,7 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values,
       # `f` is from somewhere else and not supported
       cli_abort(
         c(
-          "problem with {rlang::expr_label(rlang::enexpr(f))}",
+          "problem with {rlang::expr_label(rlang::caller_arg(f))}",
           "i" = "`f` must be one of `data.table`'s rolling functions (`frollmean`,
           `frollsum`, `frollapply`. See `?data.table::roll`) or one of
           `slider`'s specialized sliding functions (`slide_mean`, `slide_sum`,

From 248a0e8cb50bd3252aafdd4077159a984738cb0e Mon Sep 17 00:00:00 2001
From: "Logan C. Brooks" <lcbrooks@andrew.cmu.edu>
Date: Fri, 26 Apr 2024 10:27:52 -0700
Subject: [PATCH 248/345] Guess YYYYww numeric as "custom", fix incidental typo

- Previously we guessed YYYYww numeric as "year", almost surely wrong. But we
  can't easily/reliably tell what type of yearweek it is, so just use "custom"
  to avoid various/bad guesses in time_type-reliant code.
- Don't use <= ; seemed nonimpactful in this case, at least in the tested
  locale.
---
 DESCRIPTION                 |  2 +-
 NEWS.md                     |  3 ++-
 R/utils.R                   | 12 +++---------
 tests/testthat/test-utils.R | 10 +++++++++-
 4 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 3c6bbb16..578243a4 100755
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: epiprocess
 Title: Tools for basic signal processing in epidemiology
-Version: 0.7.7
+Version: 0.7.8
 Authors@R: c(
     person("Jacob", "Bien", role = "ctb"),
     person("Logan", "Brooks", email = "lcbrooks@andrew.cmu.edu", role = c("aut", "cre")),
diff --git a/NEWS.md b/NEWS.md
index b9b2f60b..f6e1f1de 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -27,7 +27,8 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat
   the `epi_df` (#382).
 - Refactored internals to use `cli` for warnings/errors and `checkmate` for
   argument checking (#413).
-- Fix logic to auto-assign `ep_df` `time_type` to `week` (#416).
+- Fix logic to auto-assign `epi_df` `time_type` to `week` (#416) and `year`
+  (#441).
## Breaking changes diff --git a/R/utils.R b/R/utils.R index 57a7f53a..cf363434 100644 --- a/R/utils.R +++ b/R/utils.R @@ -403,7 +403,7 @@ guess_geo_type <- function(geo_value) { guess_time_type <- function(time_value) { # Convert character time values to Date or POSIXct if (is.character(time_value)) { - if (nchar(time_value[1]) <= "10") { + if (nchar(time_value[1]) <= 10L) { new_time_value <- tryCatch( { as.Date(time_value) @@ -440,14 +440,8 @@ guess_time_type <- function(time_value) { return("yearmonth") } else if (inherits(time_value, "yearquarter")) { return("yearquarter") - } - - # Else, if it's an integer that's at least 1582, then use "year" - if ( - is.numeric(time_value) && - all(time_value == as.integer(time_value)) && - all(time_value >= 1582) - ) { + } else if (rlang::is_integerish(time_value) && + all(nchar(as.character(time_value)) == 4L)) { # nolint: indentation_linter return("year") } diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R index 83cc07f6..fbb209ac 100644 --- a/tests/testthat/test-utils.R +++ b/tests/testthat/test-utils.R @@ -56,6 +56,10 @@ test_that("guess_time_type works for different types", { yearquarters <- tsibble::yearquarter(10) years <- c(1999, 2000) + ambiguous_yearweeks <- c(199901, 199902) # -> "custom" + + daytimes <- as.POSIXct(c("2022-01-01 05:00:00", "2022-01-01 15:0:00"), tz = "UTC") + daytimes_chr <- as.character(daytimes) # YYYY-MM-DD is the accepted format not_ymd1 <- "January 1, 2022" @@ -72,13 +76,17 @@ test_that("guess_time_type works for different types", { expect_equal(guess_time_type(yearquarters), "yearquarter") expect_equal(guess_time_type(years), "year") + expect_equal(guess_time_type(ambiguous_yearweeks), "custom") + + expect_equal(guess_time_type(daytimes), "day-time") + expect_equal(guess_time_type(daytimes_chr), "day-time") expect_equal(guess_time_type(not_ymd1), "custom") expect_equal(guess_time_type(not_ymd2), "custom") expect_equal(guess_time_type(not_ymd3), "custom") expect_equal(guess_time_type(not_a_date), "custom") }) -3 + test_that("guess_time_type works with gaps", { days_gaps <- as.Date("2022-01-01") + c(0, 1, 3, 4, 8, 8 + 7) weeks_gaps <- as.Date("2022-01-01") + 7 * c(0, 1, 3, 4, 8, 8 + 7) From 01f22621f16ed593faf733175878feea73f9d5f3 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Thu, 21 Mar 2024 15:45:02 -0700 Subject: [PATCH 249/345] wip+doc: add S3 implementation of epi_archive * remove comment #417 * bump version to 0.7.6 and add NEWS line --- DESCRIPTION | 3 + NAMESPACE | 27 + NEWS.md | 1 + R/archive.R | 3 - R/archive_new.R | 1115 +++++++++++++++++ R/data.R | 8 + R/grouped_archive_new.R | 456 +++++++ R/methods-epi_archive.R | 3 +- R/methods-epi_archive_new.R | 826 ++++++++++++ man/as_epi_archive2.Rd | 142 +++ man/as_of.epi_archive2.Rd | 33 + man/epi_archive.Rd | 106 +- man/epix_as_of2.Rd | 95 ++ man/epix_fill_through_version2.Rd | 48 + man/epix_merge2.Rd | 71 ++ man/epix_slide2.Rd | 283 +++++ man/epix_truncate_versions_after.Rd | 10 +- ...ate_versions_after.grouped_epi_archive2.Rd | 11 + man/fill_through_version.epi_archive2.Rd | 21 + man/group_by.epi_archive.Rd | 23 +- man/group_by.epi_archive2.Rd | 147 +++ man/is_epi_archive2.Rd | 35 + man/max_version_with_row_in.Rd | 9 +- man/merge_epi_archive2.Rd | 30 + man/new_epi_archive2.Rd | 69 + man/next_after.Rd | 8 +- man/print.epi_archive2.Rd | 17 + man/slide.epi_archive2.Rd | 101 ++ man/slide.grouped_epi_archive2.Rd | 24 + man/truncate_versions_after.epi_archive2.Rd | 19 + ...ate_versions_after.grouped_epi_archive2.Rd | 18 + 
tests/testthat/test-archive_new.R | 173 +++ tests/testthat/test-compactify.R | 2 +- tests/testthat/test-compactify_new.R | 110 ++ .../test-epix_fill_through_version_new.R | 109 ++ tests/testthat/test-epix_merge_new.R | 226 ++++ tests/testthat/test-epix_slide_new.R | 810 ++++++++++++ tests/testthat/test-grouped_epi_archive_new.R | 104 ++ tests/testthat/test-methods-epi_archive_new.R | 136 ++ vignettes/advanced.Rmd | 27 +- vignettes/archive.Rmd | 24 +- vignettes/compactify.Rmd | 10 +- 42 files changed, 5452 insertions(+), 41 deletions(-) create mode 100644 R/archive_new.R create mode 100644 R/grouped_archive_new.R create mode 100644 R/methods-epi_archive_new.R create mode 100644 man/as_epi_archive2.Rd create mode 100644 man/as_of.epi_archive2.Rd create mode 100644 man/epix_as_of2.Rd create mode 100644 man/epix_fill_through_version2.Rd create mode 100644 man/epix_merge2.Rd create mode 100644 man/epix_slide2.Rd create mode 100644 man/epix_truncate_versions_after.grouped_epi_archive2.Rd create mode 100644 man/fill_through_version.epi_archive2.Rd create mode 100644 man/group_by.epi_archive2.Rd create mode 100644 man/is_epi_archive2.Rd create mode 100644 man/merge_epi_archive2.Rd create mode 100644 man/new_epi_archive2.Rd create mode 100644 man/print.epi_archive2.Rd create mode 100644 man/slide.epi_archive2.Rd create mode 100644 man/slide.grouped_epi_archive2.Rd create mode 100644 man/truncate_versions_after.epi_archive2.Rd create mode 100644 man/truncate_versions_after.grouped_epi_archive2.Rd create mode 100644 tests/testthat/test-archive_new.R create mode 100644 tests/testthat/test-compactify_new.R create mode 100644 tests/testthat/test-epix_fill_through_version_new.R create mode 100644 tests/testthat/test-epix_merge_new.R create mode 100644 tests/testthat/test-epix_slide_new.R create mode 100644 tests/testthat/test-grouped_epi_archive_new.R create mode 100644 tests/testthat/test-methods-epi_archive_new.R diff --git a/DESCRIPTION b/DESCRIPTION index 3c6bbb16..cfdd9f49 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -73,6 +73,7 @@ Depends: URL: https://cmu-delphi.github.io/epiprocess/ Collate: 'archive.R' + 'archive_new.R' 'autoplot.R' 'correlation.R' 'data.R' @@ -80,9 +81,11 @@ Collate: 'epiprocess.R' 'group_by_epi_df_methods.R' 'methods-epi_archive.R' + 'grouped_archive_new.R' 'grouped_epi_archive.R' 'growth_rate.R' 'key_colnames.R' + 'methods-epi_archive_new.R' 'methods-epi_df.R' 'outliers.R' 'reexports.R' diff --git a/NAMESPACE b/NAMESPACE index 18bc6fc6..d5d1cd7b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -6,40 +6,58 @@ S3method(as_epi_df,data.frame) S3method(as_epi_df,epi_df) S3method(as_epi_df,tbl_df) S3method(as_epi_df,tbl_ts) +S3method(as_of,epi_archive2) S3method(as_tibble,epi_df) S3method(as_tsibble,epi_df) S3method(autoplot,epi_df) +S3method(clone,epi_archive2) +S3method(clone,grouped_epi_archive2) S3method(dplyr_col_modify,col_modify_recorder_df) S3method(dplyr_col_modify,epi_df) S3method(dplyr_reconstruct,epi_df) S3method(dplyr_row_slice,epi_df) S3method(epix_truncate_versions_after,epi_archive) +S3method(epix_truncate_versions_after,epi_archive2) S3method(epix_truncate_versions_after,grouped_epi_archive) +S3method(epix_truncate_versions_after,grouped_epi_archive2) S3method(group_by,epi_archive) +S3method(group_by,epi_archive2) S3method(group_by,epi_df) S3method(group_by,grouped_epi_archive) +S3method(group_by,grouped_epi_archive2) S3method(group_by_drop_default,grouped_epi_archive) +S3method(group_by_drop_default,grouped_epi_archive2) S3method(group_modify,epi_df) 
S3method(groups,grouped_epi_archive) +S3method(groups,grouped_epi_archive2) S3method(key_colnames,data.frame) S3method(key_colnames,default) S3method(key_colnames,epi_archive) S3method(key_colnames,epi_df) S3method(next_after,Date) S3method(next_after,integer) +S3method(print,epi_archive2) S3method(print,epi_df) +S3method(print,grouped_epi_archive2) S3method(select,epi_df) +S3method(slide,grouped_epi_archive2) S3method(summary,epi_df) +S3method(truncate_versions_after,grouped_epi_archive2) S3method(ungroup,epi_df) S3method(ungroup,grouped_epi_archive) +S3method(ungroup,grouped_epi_archive2) S3method(unnest,epi_df) export("%>%") export(archive_cases_dv_subset) +export(archive_cases_dv_subset_2) export(arrange) export(as_epi_archive) +export(as_epi_archive2) export(as_epi_df) +export(as_of) export(as_tsibble) export(autoplot) +export(clone) export(detect_outlr) export(detect_outlr_rm) export(detect_outlr_stl) @@ -50,24 +68,33 @@ export(epi_slide_mean) export(epi_slide_opt) export(epi_slide_sum) export(epix_as_of) +export(epix_as_of2) export(epix_merge) +export(epix_merge2) export(epix_slide) +export(epix_slide2) export(epix_truncate_versions_after) +export(fill_through_version) export(filter) export(group_by) export(group_modify) export(growth_rate) export(is_epi_archive) +export(is_epi_archive2) export(is_epi_df) export(is_grouped_epi_archive) +export(is_grouped_epi_archive2) export(key_colnames) export(max_version_with_row_in) export(mutate) +export(new_epi_archive2) export(new_epi_df) export(next_after) export(relocate) export(rename) export(slice) +export(slide) +export(truncate_versions_after) export(ungroup) export(unnest) importFrom(R6,R6Class) diff --git a/NEWS.md b/NEWS.md index b9b2f60b..e2c5b8e4 100644 --- a/NEWS.md +++ b/NEWS.md @@ -32,6 +32,7 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat ## Breaking changes - Switched `epi_df`'s `other_keys` default from `NULL` to `character(0)`; PR #390 +- Refactor `epi_archive` to use S3 instead of R6 for its object model. The calls to some functions will change, but the functionality will remain the same. It will also help us maintain the package better in the future. (#340) # epiprocess 0.7.0 diff --git a/R/archive.R b/R/archive.R index ff3bc20c..a530cc05 100644 --- a/R/archive.R +++ b/R/archive.R @@ -514,9 +514,6 @@ epi_archive <- fromLast = TRUE ) %>% tibble::as_tibble() %>% - # (`as_tibble` should de-alias the DT and its columns in any edge - # cases where they are aliased. We don't say we guarantee this - # though.) dplyr::select(-"version") %>% as_epi_df( geo_type = self$geo_type, diff --git a/R/archive_new.R b/R/archive_new.R new file mode 100644 index 00000000..0b4f3695 --- /dev/null +++ b/R/archive_new.R @@ -0,0 +1,1115 @@ +# We use special features of data.table's `[`. The data.table package has a +# compatibility feature that disables some/all of these features if it thinks we +# might expect `data.frame`-compatible behavior instead. We can signal that we +# want the special behavior via `.datatable.aware = TRUE` or by importing any +# `data.table` package member. Do both to prevent surprises if we decide to use +# `data.table::` everywhere and not importing things. +.datatable.aware <- TRUE + +#' Validate a version bound arg +#' +#' Expected to be used on `clobberable_versions_start`, `versions_end`, +#' and similar arguments. Some additional context-specific checks may be needed. 
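+#'
+#' As a rough sketch (values here are illustrative): a `Date` bound such as
+#' `as.Date("2020-01-05")` passes when `x$version` is a `Date` column, while
+#' a bound whose class or typeof differs from that of `x$version` triggers
+#' the corresponding `cli_abort()` below.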
+#' +#' @param version_bound the version bound to validate +#' @param x a data frame containing a version column with which to check +#' compatibility +#' @param na_ok Boolean; is `NA` an acceptable "bound"? (If so, `NA` will +#' have a special context-dependent meaning.) +#' @param version_bound_arg optional string; what to call the version bound in +#' error messages +#' +#' @section Side effects: raises an error if version bound appears invalid +#' +#' @noRd +validate_version_bound <- function(version_bound, x, na_ok = FALSE, + version_bound_arg = rlang::caller_arg(version_bound), + x_arg = rlang::caller_arg(version_bound)) { + if (is.null(version_bound)) { + cli_abort( + "{version_bound_arg} cannot be NULL" + ) + } + if (na_ok && is.na(version_bound)) { + return(invisible(NULL)) + } + if (!test_set_equal(class(version_bound), class(x[["version"]]))) { + cli_abort( + "{version_bound_arg} must have the same classes as x$version, + which is {class(x$version)}", + ) + } + if (!test_set_equal(typeof(version_bound), typeof(x[["version"]]))) { + cli_abort( + "{version_bound_arg} must have the same types as x$version, + which is {typeof(x$version)}", + ) + } + + return(invisible(NULL)) +} + +#' `max(x$version)`, with error if `x` has 0 rows +#' +#' Exported to make defaults more easily copyable. +#' +#' @param x `x` argument of [`as_epi_archive`] +#' +#' @return `max(x$version)` if it has any rows; raises error if it has 0 rows or +#' an `NA` version value +#' +#' @export +max_version_with_row_in <- function(x) { + if (nrow(x) == 0L) { + cli_abort( + "`nrow(x)==0L`, representing a data set history with no row up through the + latest observed version, but we don't have a sensible guess at what version + that is, or whether any of the empty versions might be clobbered in the + future; if we use `x` to form an `epi_archive`, then + `clobberable_versions_start` and `versions_end` must be manually specified.", + class = "epiprocess__max_version_cannot_be_used" + ) + } else { + version_col <- purrr::pluck(x, "version") # error not NULL if doesn't exist + if (anyNA(version_col)) { + cli_abort("version values cannot be NA", + class = "epiprocess__version_values_must_not_be_na" + ) + } else { + version_bound <- max(version_col) + } + } +} + +#' Get the next possible value greater than `x` of the same type +#' +#' @param x the starting "value"(s) +#' @return same class, typeof, and length as `x` +#' +#' @export +next_after <- function(x) UseMethod("next_after") + +#' @export +next_after.integer <- function(x) x + 1L + +#' @export +next_after.Date <- function(x) x + 1L + + + +#' epi archive +#' @title `epi_archive` object +#' +#' @description An `epi_archive` is an R6 class which contains a data table +#' along with several relevant pieces of metadata. The data table can be seen +#' as the full archive (version history) for some signal variables of +#' interest. +#' +#' @details An `epi_archive` is an R6 class which contains a data table `DT`, of +#' class `data.table` from the `data.table` package, with (at least) the +#' following columns: +#' +#' * `geo_value`: the geographic value associated with each row of measurements. +#' * `time_value`: the time value associated with each row of measurements. +#' * `version`: the time value specifying the version for each row of +#' measurements. 
For example, if in a given row the `version` is January 15, +#' 2022 and `time_value` is January 14, 2022, then this row contains the +#' measurements of the data for January 14, 2022 that were available one day +#' later. +#' +#' The data table `DT` has key variables `geo_value`, `time_value`, `version`, +#' as well as any others (these can be specified when instantiating the +#' `epi_archive` object via the `other_keys` argument, and/or set by operating +#' on `DT` directly). Refer to the documentation for [as_epi_archive()] for +#' information and examples of relevant parameter names for an `epi_archive` object. +#' Note that there can only be a single row per unique combination of +#' key variables, and thus the key variables are critical for figuring out how +#' to generate a snapshot of data from the archive, as of a given version. +#' +#' In general, the last version of each observation is carried forward (LOCF) to +#' fill in data between recorded versions, and between the last recorded +#' update and the `versions_end`. One consequence is that the `DT` +#' doesn't have to contain a full snapshot of every version (although this +#' generally works), but can instead contain only the rows that are new or +#' changed from the previous version (see `compactify`, which does this +#' automatically). Currently, deletions must be represented as revising a row +#' to a special state (e.g., making the entries `NA` or including a special +#' column that flags the data as removed and performing some kind of +#' post-processing), and the archive is unaware of what this state is. Note +#' that `NA`s *can* be introduced by `epi_archive` methods for other reasons, +#' e.g., in [`epix_fill_through_version`] and [`epix_merge`], if requested, to +#' represent potential update data that we do not yet have access to; or in +#' [`epix_merge`] to represent the "value" of an observation before the +#' version in which it was first released, or if no version of that +#' observation appears in the archive data at all. +#' +#' **A word of caution:** R6 objects, unlike most other objects in R, have +#' reference semantics. A primary consequence of this is that objects are not +#' copied when modified. You can read more about this in Hadley Wickham's +#' [Advanced R](https://adv-r.hadley.nz/r6.html#r6-semantics) book. In order +#' to construct a modified archive while keeping the original intact, first +#' make a clone using the `$clone` method, then overwrite the clone's `DT` +#' field with `data.table::copy(clone$DT)`, and finally perform the +#' modifications on the clone. +#' +#' @section Metadata: +#' The following pieces of metadata are included as fields in an `epi_archive` +#' object: +#' +#' * `geo_type`: the type for the geo values. +#' * `time_type`: the type for the time values. +#' * `additional_metadata`: list of additional metadata for the data archive. +#' +#' Unlike an `epi_df` object, metadata for an `epi_archive` object `x` can be +#' accessed (and altered) directly, as in `x$geo_type` or `x$time_type`, +#' etc. Like an `epi_df` object, the `geo_type` and `time_type` fields in the +#' metadata of an `epi_archive` object are not currently used by any +#' downstream functions in the `epiprocess` package, and serve only as useful +#' bits of information to convey about the data set at hand. 
+#' +#' @section Generating Snapshots: +#' An `epi_archive` object can be used to generate a snapshot of the data in +#' `epi_df` format, which represents the most up-to-date values of the signal +#' variables, as of the specified version. This is accomplished by calling the +#' `as_of()` method for an `epi_archive` object `x`. More details on this +#' method are documented in the wrapper function [`epix_as_of()`]. +#' +#' @section Sliding Computations: +#' We can run a sliding computation over an `epi_archive` object, much like +#' `epi_slide()` does for an `epi_df` object. This is accomplished by calling +#' the `slide()` method for an `epi_archive` object, which works similarly to +#' the way `epi_slide()` works for an `epi_df` object, but with one key +#' difference: it is version-aware. That is, for an `epi_archive` object, the +#' sliding computation at any given reference time point t is performed on +#' **data that would have been available as of t**. More details on `slide()` +#' are documented in the wrapper function [`epix_slide()`]. +#' +#' @export +#' @examples +#' tib <- tibble::tibble( +#' geo_value = rep(c("ca", "hi"), each = 5), +#' time_value = rep(seq(as.Date("2020-01-01"), +#' by = 1, length.out = 5 +#' ), times = 2), +#' version = rep(seq(as.Date("2020-01-02"), +#' by = 1, length.out = 5 +#' ), times = 2), +#' value = rnorm(10, mean = 2, sd = 1) +#' ) +#' +#' toy_epi_archive <- tib %>% new_epi_archive2( +#' geo_type = "state", +#' time_type = "day" +#' ) +#' toy_epi_archive +#' @name epi_archive +# TODO: Figure out where to actually put this documentation +NULL + +#' New epi archive +#' @description Creates a new `epi_archive` object. +#' @param x A data.frame, data.table, or tibble, with columns `geo_value`, +#' `time_value`, `version`, and then any additional number of columns. +#' @param geo_type Type for the geo values. If missing, then the function will +#' attempt to infer it from the geo values present; if this fails, then it +#' will be set to "custom". +#' @param time_type Type for the time values. If missing, then the function will +#' attempt to infer it from the time values present; if this fails, then it +#' will be set to "custom". +#' @param other_keys Character vector specifying the names of variables in `x` +#' that should be considered key variables (in the language of `data.table`) +#' apart from "geo_value", "time_value", and "version". +#' @param additional_metadata List of additional metadata to attach to the +#' `epi_archive` object. The metadata will have `geo_type` and `time_type` +#' fields; named entries from the passed list or will be included as well. +#' @param compactify Optional; Boolean or `NULL`: should we remove rows that are +#' considered redundant for the purposes of `epi_archive`'s built-in methods +#' such as `as_of`? As these methods use the last version of each observation +#' carried forward (LOCF) to interpolate between the version data provided, +#' rows that don't change these LOCF results can potentially be omitted to +#' save space while maintaining the same behavior (with the help of the +#' `clobberable_versions_start` and `versions_end` fields in some edge cases). +#' `TRUE` will remove these rows, `FALSE` will not, and missing or `NULL` will +#' remove these rows and issue a warning. 
Generally, this can be set to +#' `TRUE`, but if you directly inspect or edit the fields of the `epi_archive` +#' such as its `DT`, or rely on redundant updates to achieve a certain +#' behavior of the `ref_time_values` default in `epix_slide`, you will have to +#' determine whether `compactify=TRUE` will produce the desired results. If +#' compactification here is removing a large proportion of the rows, this may +#' indicate a potential for space, time, or bandwidth savings upstream the +#' data pipeline, e.g., by avoiding fetching, storing, or processing these +#' rows of `x`. +#' @param clobberable_versions_start Optional; as in [`as_epi_archive`] +#' @param versions_end Optional; as in [`as_epi_archive`] +#' @return An `epi_archive` object. +#' @importFrom data.table as.data.table key setkeyv +#' +#' @details +#' Refer to the documentation for [as_epi_archive()] for more information +#' and examples of parameter names. +#' @export +new_epi_archive2 <- function( + x, + geo_type = NULL, + time_type = NULL, + other_keys = NULL, + additional_metadata = NULL, + compactify = NULL, + clobberable_versions_start = NA, + versions_end = NULL) { + assert_data_frame(x) + if (!test_subset(c("geo_value", "time_value", "version"), names(x))) { + cli_abort( + "Columns `geo_value`, `time_value`, and `version` must be present in `x`." + ) + } + if (anyMissing(x$version)) { + cli_abort("Column `version` must not contain missing values.") + } + + # If geo type is missing, then try to guess it + if (missing(geo_type) || is.null(geo_type)) { + geo_type <- guess_geo_type(x$geo_value) + } + + # If time type is missing, then try to guess it + if (missing(time_type) || is.null(time_type)) { + time_type <- guess_time_type(x$time_value) + } + + # Finish off with small checks on keys variables and metadata + if (missing(other_keys)) other_keys <- NULL + if (missing(additional_metadata) || is.null(additional_metadata)) additional_metadata <- list() + if (!test_subset(other_keys, names(x))) { + cli_abort("`other_keys` must be contained in the column names of `x`.") + } + if (any(c("geo_value", "time_value", "version") %in% other_keys)) { + cli_abort("`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\".") + } + if (any(names(additional_metadata) %in% c("geo_type", "time_type"))) { + cli_warn("`additional_metadata` names overlap with existing metadata fields \"geo_type\", \"time_type\".") + } + + # Conduct checks and apply defaults for `compactify` + if (missing(compactify)) { + compactify <- NULL + } + assert_logical(compactify, len = 1, null.ok = TRUE) + + # Apply defaults and conduct checks for + # `clobberable_versions_start`, `versions_end`: + if (missing(clobberable_versions_start)) { + clobberable_versions_start <- NA + } + if (missing(versions_end) || is.null(versions_end)) { + versions_end <- max_version_with_row_in(x) + } + validate_version_bound(clobberable_versions_start, x, na_ok = TRUE) + validate_version_bound(versions_end, x, na_ok = FALSE) + if (nrow(x) > 0L && versions_end < max(x[["version"]])) { + cli_abort( + sprintf( + "`versions_end` was %s, but `x` contained + updates for a later version or versions, up through %s", + versions_end, max(x[["version"]]) + ), + class = "epiprocess__versions_end_earlier_than_updates" + ) + } + if (!is.na(clobberable_versions_start) && clobberable_versions_start > versions_end) { + cli_abort( + sprintf( + "`versions_end` was %s, but a `clobberable_versions_start` + of %s indicated that there were later observed versions", + versions_end, 
clobberable_versions_start + ), + class = "epiprocess__versions_end_earlier_than_clobberable_versions_start" + ) + } + + # --- End of validation and replacing missing args with defaults --- + + # Create the data table; if x was an un-keyed data.table itself, + # then the call to as.data.table() will fail to set keys, so we + # need to check this, then do it manually if needed + key_vars <- c("geo_value", "time_value", other_keys, "version") + DT <- as.data.table(x, key = key_vars) + if (!identical(key_vars, key(DT))) setkeyv(DT, cols = key_vars) + + maybe_first_duplicate_key_row_index <- anyDuplicated(DT, by = key(DT)) + if (maybe_first_duplicate_key_row_index != 0L) { + cli_abort("`x` must have one row per unique combination of the key variables. If you + have additional key variables other than `geo_value`, `time_value`, and + `version`, such as an age group column, please specify them in `other_keys`. + Otherwise, check for duplicate rows and/or conflicting values for the same + measurement.", + class = "epiprocess__epi_archive_requires_unique_key" + ) + } + + # Checks to see if a value in a vector is LOCF + is_locf <- function(vec) { + dplyr::if_else(!is.na(vec) & !is.na(dplyr::lag(vec)), + vec == dplyr::lag(vec), + is.na(vec) & is.na(dplyr::lag(vec)) + ) + } + + # LOCF is defined by a row where all values except for the version + # differ from their respective lag values + + # Checks for LOCF's in a data frame + rm_locf <- function(df) { + dplyr::filter(df, if_any(c(everything(), -version), ~ !is_locf(.))) + } + + # Keeps LOCF values, such as to be printed + keep_locf <- function(df) { + dplyr::filter(df, if_all(c(everything(), -version), ~ is_locf(.))) + } + + # Runs compactify on data frame + if (is.null(compactify) || compactify == TRUE) { + elim <- keep_locf(DT) + DT <- rm_locf(DT) + } else { + # Create empty data frame for nrow(elim) to be 0 + elim <- tibble::tibble() + } + + # Warns about redundant rows + if (is.null(compactify) && nrow(elim) > 0) { + warning_intro <- cli::format_inline( + "Found rows that appear redundant based on + last (version of each) observation carried forward; + these rows have been removed to 'compactify' and save space:", + keep_whitespace = FALSE + ) + warning_data <- paste(collapse = "\n", capture.output(print(elim, topn = 3L, nrows = 7L))) + warning_outro <- cli::format_inline( + "Built-in `epi_archive` functionality should be unaffected, + but results may change if you work directly with its fields (such as `DT`). + See `?as_epi_archive` for details. 
+ To silence this warning but keep compactification, + you can pass `compactify=TRUE` when constructing the archive.", + keep_whitespace = FALSE + ) + warning_message <- paste(sep = "\n", warning_intro, warning_data, warning_outro) + rlang::warn(warning_message, class = "epiprocess__compactify_default_removed_rows") + } + + structure( + list( + DT = DT, + geo_type = geo_type, + time_type = time_type, + additional_metadata = additional_metadata, + clobberable_versions_start = clobberable_versions_start, + versions_end = versions_end, + private = list() # TODO: to be encapsulated with guard-rails later + ), + class = "epi_archive2" + ) +} + +#' Print information about an `epi_archive` object +#' @param class Boolean; whether to print the class label header +#' @param methods Boolean; whether to print all available methods of +#' the archive +#' @importFrom cli cli_inform +#' @export +print.epi_archive2 <- function(epi_archive, class = TRUE, methods = TRUE) { + cli_inform( + c( + ">" = if (class) "An `epi_archive` object, with metadata:", + "i" = if (length(setdiff(key(epi_archive$DT), c("geo_value", "time_value", "version"))) > 0) { + "Non-standard DT keys: {setdiff(key(epi_archive$DT), c('geo_value', 'time_value', 'version'))}" + }, + "i" = "Min/max time values: {min(epi_archive$DT$time_value)} / {max(epi_archive$DT$time_value)}", + "i" = "First/last version with update: {min(epi_archive$DT$version)} / {max(epi_archive$DT$version)}", + "i" = if (!is.na(epi_archive$clobberable_versions_start)) { + "Clobberable versions start: {epi_archive$clobberable_versions_start}" + }, + "i" = "Versions end: {epi_archive$versions_end}", + "i" = if (methods) "Public R6 methods: {names(epi_archive$public_methods)}", + "i" = "A preview of the table ({nrow(epi_archive$DT)} rows x {ncol(epi_archive$DT)} columns):" + ) + ) + + return(invisible(epi_archive$DT %>% print())) +} + + +#' @export +as_of <- function(x, ...) { + UseMethod("as_of") +} + + +#' As of epi_archive +#' @description Generates a snapshot in `epi_df` format as of a given version. +#' See the documentation for the wrapper function [`epix_as_of()`] for +#' details. The parameter descriptions below are copied from there +#' @param epi_archive An `epi_archive` object +#' @param max_version Version specifying the max version to permit in the +#' snapshot. That is, the snapshot will comprise the unique rows of the +#' current archive data that represent the most up-to-date signal values, as +#' of the specified `max_version` (and whose `time_value`s are at least +#' `min_time_value`). +#' @param min_time_value Time value specifying the min `time_value` to permit in +#' the snapshot. Default is `-Inf`, which effectively means that there is no +#' minimum considered. +#' @param all_versions Boolean; If `all_versions = TRUE`, then the output will be in +#' `epi_archive` format, and contain rows in the specified `time_value` range +#' having `version <= max_version`. The resulting object will cover a +#' potentially narrower `version` and `time_value` range than `x`, depending +#' on user-provided arguments. Otherwise, there will be one row in the output +#' for the `max_version` of each `time_value`. Default is `FALSE`. 
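+#'
+#' @examples
+#' # A minimal sketch, assuming the `toy_epi_archive` object constructed in
+#' # the class-level examples above; its versions are `Date`s, so the bound
+#' # must be a `Date` as well.
+#' \dontrun{
+#' as_of(toy_epi_archive, max_version = as.Date("2020-01-04"))
+#' }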
+#' @importFrom data.table between key
+#' @export
+as_of.epi_archive2 <- function(epi_archive, max_version, min_time_value = -Inf, all_versions = FALSE) {
+  other_keys <- setdiff(
+    key(epi_archive$DT),
+    c("geo_value", "time_value", "version")
+  )
+  if (length(other_keys) == 0) other_keys <- NULL
+
+  # Check a few things on max_version
+  if (!test_set_equal(class(max_version), class(epi_archive$DT$version))) {
+    cli_abort(
+      "`max_version` must have the same classes as `epi_archive$DT$version`."
+    )
+  }
+  if (!test_set_equal(typeof(max_version), typeof(epi_archive$DT$version))) {
+    cli_abort(
+      "`max_version` must have the same types as `epi_archive$DT$version`."
+    )
+  }
+  assert_scalar(max_version, na.ok = FALSE)
+  if (max_version > epi_archive$versions_end) {
+    cli_abort("`max_version` must be at most `epi_archive$versions_end`.")
+  }
+  assert_logical(all_versions, len = 1)
+  if (!is.na(epi_archive$clobberable_versions_start) && max_version >= epi_archive$clobberable_versions_start) {
+    cli_warn(
+      'Getting data as of some recent version which could still be
+      overwritten (under routine circumstances) without assigning a new
+      version number (a.k.a. "clobbered"). Thus, the snapshot that we
+      produce here should not be expected to be reproducible later. See
+      `?epi_archive` for more info and `?epix_as_of` on how to muffle.',
+      class = "epiprocess__snapshot_as_of_clobberable_version"
+    )
+  }
+
+  # Filter by version and return
+  if (all_versions) {
+    # epi_archive is copied into result, so we can modify result directly
+    result <- epix_truncate_versions_after(epi_archive, max_version)
+    result$DT <- result$DT[time_value >= min_time_value, ]
+    return(result)
+  }
+
+  # Make sure to use data.table ways of filtering and selecting
+  as_of_epi_df <- epi_archive$DT[time_value >= min_time_value & version <= max_version, ] %>%
+    unique(
+      by = c("geo_value", "time_value", other_keys),
+      fromLast = TRUE
+    ) %>%
+    tibble::as_tibble() %>%
+    dplyr::select(-"version") %>%
+    as_epi_df(
+      geo_type = epi_archive$geo_type,
+      time_type = epi_archive$time_type,
+      as_of = max_version,
+      additional_metadata = c(epi_archive$additional_metadata,
+        other_keys = other_keys
+      )
+    )
+
+  return(as_of_epi_df)
+}
+
+
+#' @export
+fill_through_version <- function(x, ...) {
+  UseMethod("fill_through_version")
+}
+
+
+#' Fill through version
+#' @description Fill in unobserved history using requested scheme by mutating
+#' the given object and potentially reseating its fields. See
+#' [`epix_fill_through_version`], which doesn't mutate the input archive but
+#' might alias its fields.
+#'
+#' @param epi_archive an `epi_archive` object
+#' @param fill_versions_end as in [`epix_fill_through_version`]
+#' @param how as in [`epix_fill_through_version`]
+#'
+#' @importFrom data.table key setkeyv := address copy
+#' @importFrom rlang arg_match
+fill_through_version.epi_archive2 <- function(
+    epi_archive,
+    fill_versions_end,
+    how = c("na", "locf")) {
+  validate_version_bound(fill_versions_end, epi_archive$DT, na_ok = FALSE)
+  how <- arg_match(how)
+  if (epi_archive$versions_end < fill_versions_end) {
+    new_DT <- switch(how,
+      "na" = {
+        # old DT + a version consisting of all NA observations
+        # immediately after the last currently/actually-observed
+        # version. Note that this NA-observation version must only be
+        # added if `epi_archive` is outdated.
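+        # Illustrative sketch (added comment, not from the original patch):
+        # e.g., if the archive's `versions_end` is 2020-06-02 and we fill
+        # through 2020-06-05 with `how = "na"`, a single batch of all-NA
+        # update rows is inserted at version 2020-06-03 (the `next_after` of
+        # 2020-06-02), and `versions_end` becomes 2020-06-05.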
+        nonversion_key_cols <- setdiff(key(epi_archive$DT), "version")
+        nonkey_cols <- setdiff(names(epi_archive$DT), key(epi_archive$DT))
+        next_version_tag <- next_after(epi_archive$versions_end)
+        if (next_version_tag > fill_versions_end) {
+          cli_abort(sprintf(paste(
+            "Apparent problem with `next_after` method:",
+            "archive contained observations through version %s",
+            "and the next possible version was supposed to be %s,",
+            "but this appeared to jump from a version < %3$s",
+            "to one > %3$s, implying at least one version in between."
+          ), epi_archive$versions_end, next_version_tag, fill_versions_end))
+        }
+        nonversion_key_vals_ever_recorded <- unique(epi_archive$DT, by = nonversion_key_cols)
+        # In edge cases, the `unique` result can alias the original
+        # DT; detect and copy if necessary:
+        if (identical(address(epi_archive$DT), address(nonversion_key_vals_ever_recorded))) {
+          nonversion_key_vals_ever_recorded <- copy(nonversion_key_vals_ever_recorded)
+        }
+        next_version_DT <- nonversion_key_vals_ever_recorded[
+          , version := next_version_tag
+        ][
+          # this makes the class of these columns logical (`NA` is a
+          # logical NA; we're relying on the rbind below to convert to
+          # the proper class&typeof)
+          , (nonkey_cols) := NA
+        ]
+        # full result DT:
+        setkeyv(rbind(epi_archive$DT, next_version_DT), key(epi_archive$DT))[]
+      },
+      "locf" = {
+        # just the old DT; LOCF is built into other methods:
+        epi_archive$DT
+      }
+    )
+    new_versions_end <- fill_versions_end
+    # Update `epi_archive` all at once with simple, error-free operations +
+    # return below:
+    epi_archive$DT <- new_DT
+    epi_archive$versions_end <- new_versions_end
+  } else {
+    # Already sufficiently up to date; nothing to do.
+  }
+  return(invisible(epi_archive))
+}
+
+
+#' @export
+truncate_versions_after <- function(x, ...) {
+  UseMethod("truncate_versions_after")
+}
+
+
+#' Truncate versions after
+#' @description Filter to keep only older versions, mutating the archive by
+#' potentially reseating but not mutating some fields. `DT` is likely, but not
+#' guaranteed, to be copied. Returns the mutated archive
+#' [invisibly][base::invisible].
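+#' @examples
+#' # Illustrative sketch only (added here, not part of the original patch),
+#' # assuming the bundled `archive_cases_dv_subset_2` dataset:
+#' smaller <- truncate_versions_after(
+#'   clone(archive_cases_dv_subset_2),
+#'   max_version = as.Date("2021-06-01")
+#' )
+#' smaller$versions_end # 2021-06-01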
+#' @param epi_archive as in [`epix_truncate_versions_after`]
+#' @param max_version as in [`epix_truncate_versions_after`]
+truncate_versions_after.epi_archive2 <- function(
+    epi_archive,
+    max_version) {
+  if (!test_set_equal(class(max_version), class(epi_archive$DT$version))) {
+    cli_abort("`max_version` must have the same classes as `epi_archive$DT$version`.")
+  }
+  if (!test_set_equal(typeof(max_version), typeof(epi_archive$DT$version))) {
+    cli_abort("`max_version` must have the same types as `epi_archive$DT$version`.")
+  }
+  assert_scalar(max_version, na.ok = FALSE)
+  if (max_version > epi_archive$versions_end) {
+    cli_abort("`max_version` must be at most `epi_archive$versions_end`.")
+  }
+  epi_archive$DT <- epi_archive$DT[epi_archive$DT$version <= max_version, colnames(epi_archive$DT), with = FALSE]
+  # (^ this filter operation seems to always copy the DT, even if it
+  # keeps every entry; we don't guarantee this behavior in
+  # documentation, though, so we could change to alias in this case)
+  if (!is.na(epi_archive$clobberable_versions_start) && epi_archive$clobberable_versions_start > max_version) {
+    epi_archive$clobberable_versions_start <- NA
+  }
+  epi_archive$versions_end <- max_version
+  return(invisible(epi_archive))
+}
+
+
+#' Merge epi archive
+#' @description Merges another `epi_archive` with the current one, mutating the
+#' current one by reseating its `DT` and several other fields, but avoiding
+#' mutation of the old `DT`; returns the current archive
+#' [invisibly][base::invisible]. See [`epix_merge`] for a full description
+#' of the non-R6-method version, which does not mutate either archive, and
+#' does not alias either archive's `DT`.
+#' @param x as in [`epix_merge`]
+#' @param y as in [`epix_merge`]
+#' @param sync as in [`epix_merge`]
+#' @param compactify as in [`epix_merge`]
+merge_epi_archive2 <- function(
+    x,
+    y,
+    sync = c("forbid", "na", "locf", "truncate"),
+    compactify = TRUE) {
+  result <- epix_merge(x, y,
+    sync = sync,
+    compactify = compactify
+  )
+
+  # TODO: Use encapsulating methods instead.
+  if (length(x$private_fields) != 0L) {
+    cli_abort("expected no private fields in x",
+      internal = TRUE
+    )
+  }
+
+  # Mutate fields all at once, trying to avoid any potential errors:
+  for (field_name in names(x$public_fields)) {
+    x[[field_name]] <- result[[field_name]]
+  }
+
+  return(invisible(x))
+}
+
+
+#' `group_by` and related methods for `epi_archive`, `grouped_epi_archive`
+#'
+#' @param .data An `epi_archive` or `grouped_epi_archive`
+#' @param ... Similar to [`dplyr::group_by`] (see "Details:" for edge cases);
+#' * For `group_by`: unquoted variable name(s) or other
+#' ["data masking"][dplyr::dplyr_data_masking] expression(s). It's possible to
+#' use [`dplyr::mutate`]-like syntax here to calculate new columns on which to
+#' perform grouping, but note that, if you are regrouping an already-grouped
+#' `.data` object, the calculations will be carried out ignoring such grouping
+#' (same as [in dplyr][dplyr::group_by]).
+#' * For `ungroup`: either
+#'   * empty, in order to remove the grouping and output an `epi_archive`; or
+#'   * variable name(s) or other ["tidy-select"][dplyr::dplyr_tidy_select]
+#'   expression(s), in order to remove the matching variables from the list of
+#'   grouping variables, and output another `grouped_epi_archive`.
+#' @param .add Boolean. If `FALSE`, the default, the output will be grouped by
+#' the variable selection from `...` only; if `TRUE`, the output will be
+#' grouped by the current grouping variables plus the variable selection from
+#' `...`.
+#' @param .drop As described in [`dplyr::group_by`]; determines treatment of
+#' factor columns.
+#' @param x For `groups` or `ungroup`: a `grouped_epi_archive`; for
+#' `is_grouped_epi_archive`: any object
+#' @param .tbl (For `group_by_drop_default`:) an `epi_archive` or
+#' `grouped_epi_archive` (`epi_archive` dispatches to the S3 default method;
+#' `grouped_epi_archive` dispatches its own S3 method)
+#'
+#' @details
+#'
+#' To match `dplyr`, `group_by` allows "data masking" (also referred to as
+#' "tidy evaluation") expressions `...`, not just column names, in a way similar
+#' to `mutate`. Note that replacing or removing key columns with these
+#' expressions is disabled.
+#'
+#' `archive %>% group_by()` and other expressions that group or regroup by zero
+#' columns (indicating that all rows should be treated as part of one large
+#' group) will output a `grouped_epi_archive`, in order to enable the use of
+#' `grouped_epi_archive` methods on the result. This is in slight contrast to
+#' the same operations on tibbles and grouped tibbles, which will *not* output a
+#' `grouped_df` in these circumstances.
+#'
+#' Using `group_by` with `.add=FALSE` to override the existing grouping is
+#' disabled; instead, `ungroup` first then `group_by`.
+#'
+#' Mutation and aliasing: `group_by` tries to use a shallow copy of the `DT`,
+#' introducing column-level aliasing between its input and its result. This
+#' doesn't follow the general model for most `data.table` operations, which
+#' seems to be that, given a nonaliased (i.e., unique) pointer to a
+#' `data.table` object, its pointers to its columns should also be nonaliased.
+#' If you mutate any of the columns of either the input or result, first ensure
+#' that it is fine if columns of the other are also mutated, but do not rely on
+#' such behavior to occur. Additionally, never perform mutation on the key
+#' columns at all (except for strictly increasing transformations), as this will
+#' invalidate sortedness assumptions about the rows.
+#'
+#' `group_by_drop_default` on (ungrouped) `epi_archive`s is expected to dispatch
+#' to `group_by_drop_default.default` (but there is a dedicated method for
+#' `grouped_epi_archive`s).
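+#'
+#' A minimal illustration of the zero-column grouping behavior described above
+#' (a sketch added here, not part of the original patch), assuming the bundled
+#' `archive_cases_dv_subset_2` dataset:
+#' ```
+#' archive_cases_dv_subset_2 %>%
+#'   group_by() %>%
+#'   is_grouped_epi_archive2() # TRUE
+#' ```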
+#'
+#' @examples
+#'
+#' grouped_archive <- archive_cases_dv_subset_2 %>% group_by(geo_value)
+#'
+#' # `print` for metadata and method listing:
+#' grouped_archive %>% print()
+#'
+#' # The primary use for grouping is to perform a grouped `epix_slide`:
+#'
+#' archive_cases_dv_subset_2 %>%
+#'   group_by(geo_value) %>%
+#'   epix_slide2(
+#'     f = ~ mean(.x$case_rate_7d_av),
+#'     before = 2,
+#'     ref_time_values = as.Date("2020-06-11") + 0:2,
+#'     new_col_name = "case_rate_3d_av"
+#'   ) %>%
+#'   ungroup()
+#'
+#' # -----------------------------------------------------------------
+#'
+#' # Advanced: some other features of dplyr grouping are implemented:
+#'
+#' library(dplyr)
+#' toy_archive <-
+#'   tribble(
+#'     ~geo_value, ~age_group, ~time_value, ~version, ~value,
+#'     "us", "adult", "2000-01-01", "2000-01-02", 121,
+#'     "us", "pediatric", "2000-01-02", "2000-01-03", 5, # (addition)
+#'     "us", "adult", "2000-01-01", "2000-01-03", 125, # (revision)
+#'     "us", "adult", "2000-01-02", "2000-01-03", 130 # (addition)
+#'   ) %>%
+#'   mutate(
+#'     age_group = ordered(age_group, c("pediatric", "adult")),
+#'     time_value = as.Date(time_value),
+#'     version = as.Date(version)
+#'   ) %>%
+#'   as_epi_archive2(other_keys = "age_group")
+#'
+#' # The following are equivalent:
+#' toy_archive %>% group_by(geo_value, age_group)
+#' toy_archive %>%
+#'   group_by(geo_value) %>%
+#'   group_by(age_group, .add = TRUE)
+#' grouping_cols <- c("geo_value", "age_group")
+#' toy_archive %>% group_by(across(all_of(grouping_cols)))
+#'
+#' # And these are equivalent:
+#' toy_archive %>% group_by(geo_value)
+#' toy_archive %>%
+#'   group_by(geo_value, age_group) %>%
+#'   ungroup(age_group)
+#'
+#' # To get the grouping variable names as a `list` of `name`s (a.k.a. symbols):
+#' toy_archive %>%
+#'   group_by(geo_value) %>%
+#'   groups()
+#'
+#' toy_archive %>%
+#'   group_by(geo_value, age_group, .drop = FALSE) %>%
+#'   epix_slide2(f = ~ sum(.x$value), before = 20) %>%
+#'   ungroup()
+#'
+#' @importFrom dplyr group_by
+#' @export
+#'
+#' @aliases grouped_epi_archive
+group_by.epi_archive2 <- function(epi_archive, ..., .add = FALSE, .drop = dplyr::group_by_drop_default(epi_archive)) {
+  # `.add` makes no difference; this is an ungrouped `epi_archive`.
+  detailed_mutate <- epix_detailed_restricted_mutate2(epi_archive, ...)
+  assert_logical(.drop)
+  if (!.drop) {
+    grouping_cols <- as.list(detailed_mutate[["archive"]][["DT"]])[detailed_mutate[["request_names"]]]
+    grouping_col_is_factor <- purrr::map_lgl(grouping_cols, is.factor)
+    # ^ Use `as.list` to try to avoid any possibility of a deep copy.
+    if (!any(grouping_col_is_factor)) {
+      cli_warn(
+        "`.drop=FALSE` but there are no factor grouping columns;
+        did you mean to convert one of the columns to a factor beforehand?",
+        class = "epiprocess__group_by_epi_archive__drop_FALSE_no_factors"
+      )
+    } else if (any(diff(grouping_col_is_factor) == -1L)) {
+      cli_warn(
+        "`.drop=FALSE` but there are one or more non-factor grouping columns listed
+        after a factor grouping column; this may produce groups with `NA`s for these
+        columns; see https://github.com/tidyverse/dplyr/issues/5369#issuecomment-683762553;
+        depending on how you want completion to work, you might instead want to convert all
+        grouping columns to factors beforehand, specify the non-factor grouping columns first,
+        or use `.drop=TRUE` and add a call to `tidyr::complete`.",
+        class = "epiprocess__group_by_epi_archive__drop_FALSE_nonfactor_after_factor"
+      )
+    }
+  }
+  new_grouped_epi_archive(detailed_mutate[["archive"]],
+    detailed_mutate[["request_names"]],
+    drop = .drop
+  )
+}
+
+
+#' @export
+slide <- function(.data, ...) {
+  UseMethod("slide")
+}
+
+
+#' Slide over epi archive
+#' @description Slides a given function over variables in an `epi_archive`
+#' object. See the documentation for the wrapper function [`epix_slide()`] for
+#' details. The parameter descriptions below are copied from there
+#' @importFrom data.table key
+#' @importFrom rlang !! !!! enquo quo_is_missing enquos is_quosure sym syms
+#' @param f Function, formula, or missing; together with `...` specifies the
+#' computation to slide. To "slide" means to apply a computation over a
+#' sliding (a.k.a. "rolling") time window for each data group. The window is
+#' determined by the `before` parameter described below. One time step is
+#' typically one day or one week; see [`epi_slide`] details for more
+#' explanation. If a function, `f` must take an `epi_df` with the same
+#' column names as the archive's `DT`, minus the `version` column; followed
+#' by a one-row tibble containing the values of the grouping variables for
+#' the associated group; followed by a reference time value, usually as a
+#' `Date` object; followed by any number of named arguments. If a formula,
+#' `f` can operate directly on columns accessed via `.x$var` or `.$var`, as
+#' in `~ mean(.x$var)` to compute a mean of a column `var` for each
+#' group-`ref_time_value` combination. The group key can be accessed via
+#' `.y` or `.group_key`, and the reference time value can be accessed via
+#' `.z` or `.ref_time_value`. If `f` is missing, then `...` will specify the
+#' computation.
+#' @param ... Additional arguments to pass to the function or formula specified
+#' via `f`. Alternatively, if `f` is missing, then `...` is interpreted as an
+#' expression for tidy evaluation; in addition to referring to columns
+#' directly by name, the expression has access to `.data` and `.env` pronouns
+#' as in `dplyr` verbs, and can also refer to the `.group_key` and
+#' `.ref_time_value`. See details of [`epi_slide`].
+#' @param before How far `before` each `ref_time_value` should the sliding
+#' window extend? If provided, should be a single, non-NA,
+#' [integer-compatible][vctrs::vec_cast] number of time steps. This window
+#' endpoint is inclusive. For example, if `before = 7`, and one time step is
+#' one day, then to produce a value for a `ref_time_value` of January 8, we
+#' apply the given function or formula to data (for each group present) with
+#' `time_value`s from January 1 onward, as they were reported on January 8.
+#' For typical disease surveillance sources, this will not include any data
+#' with a `time_value` of January 8, and, depending on the amount of reporting
+#' latency, may not include January 7 or even earlier `time_value`s. (If
+#' instead the archive were to hold nowcasts instead of regular surveillance
+#' data, then we would indeed expect data for `time_value` January 8. If it
+#' were to hold forecasts, then we would expect data for `time_value`s after
+#' January 8, and the sliding window would extend as far after each
+#' `ref_time_value` as needed to include all such `time_value`s.)
+#' @param ref_time_values Reference time values / versions for sliding
+#' computations; each element of this vector serves both as the anchor point
+#' for the `time_value` window for the computation and the `max_version`
+#' `as_of` which we fetch data in this window. If missing, then this will be
+#' set to a regularly-spaced sequence of values covering the range of
+#' `version`s in the `DT` plus the `versions_end`; the spacing of values will
+#' be guessed (using the GCD of the skips between values).
+#' @param time_step Optional function used to define the meaning of one time
+#' step, which if specified, overrides the default choice based on the
+#' `time_value` column. This function must take a positive integer and return
+#' an object of class `lubridate::period`. For example, we can use `time_step
+#' = lubridate::hours` in order to set the time step to be one hour (this
+#' would only be meaningful if `time_value` is of class `POSIXct`).
+#' @param new_col_name String indicating the name of the new column that will
+#' contain the slide values. Default is "slide_value"; note that setting
+#' `new_col_name` equal to an existing column name will overwrite this column.
+#' @param as_list_col Should the slide results be held in a list column, or be
+#' [unchopped][tidyr::unchop]/[unnested][tidyr::unnest]? Default is `FALSE`,
+#' in which case a list object returned by `f` would be unnested (using
+#' [`tidyr::unnest()`]), and, if the slide computations output data frames,
+#' the names of the resulting columns are given by prepending `new_col_name`
+#' to the names of the list elements.
+#' @param names_sep String specifying the separator to use in `tidyr::unnest()`
+#' when `as_list_col = FALSE`. Default is "_". Using `NULL` drops the prefix
+#' from `new_col_name` entirely.
+#' @param all_versions (Not the same as `all_rows` parameter of `epi_slide`.) If
+#' `all_versions = TRUE`, then `f` will be passed the version history (all
+#' `version <= ref_time_value`) for rows having `time_value` between
+#' `ref_time_value - before` and `ref_time_value`. Otherwise, `f` will be
+#' passed only the most recent `version` for every unique `time_value`.
+#' Default is `FALSE`.
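+#' @examples
+#' # Illustrative sketch only (added here, not part of the original patch);
+#' # mirrors the grouped example in `group_by.epi_archive2`, assuming the
+#' # bundled `archive_cases_dv_subset_2` dataset:
+#' archive_cases_dv_subset_2 %>%
+#'   slide(
+#'     f = ~ mean(.x$case_rate_7d_av),
+#'     before = 2,
+#'     ref_time_values = as.Date("2020-06-11") + 0:2,
+#'     new_col_name = "case_rate_3d_av"
+#'   )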
+slide.epi_archive2 <- function(epi_archive, f, ..., before, ref_time_values,
+                               time_step, new_col_name = "slide_value",
+                               as_list_col = FALSE, names_sep = "_",
+                               all_versions = FALSE) {
+  # For an "ungrouped" slide, treat all rows as belonging to one big
+  # group (group by 0 vars), like `dplyr::summarize`, and let the
+  # resulting `grouped_epi_archive` handle the slide:
+  slide(
+    group_by(epi_archive),
+    f,
+    ...,
+    before = before, ref_time_values = ref_time_values,
+    time_step = time_step, new_col_name = new_col_name,
+    as_list_col = as_list_col, names_sep = names_sep,
+    all_versions = all_versions
+  ) %>%
+    # We want a slide on ungrouped archives to output something
+    # ungrouped, rather than retaining the trivial (0-variable)
+    # grouping applied above. So we `ungroup()`. However, the current
+    # `dplyr` implementation automatically ignores/drops trivial
+    # groupings, so this is just a no-op for now.
+    ungroup()
+}
+
+
+#' Convert to `epi_archive` format
+#'
+#' Converts a data frame, data table, or tibble into an `epi_archive`
+#' object. See the [archive
+#' vignette](https://cmu-delphi.github.io/epiprocess/articles/archive.html) for
+#' examples. The parameter descriptions below are copied from there
+#'
+#' @param x A data frame, data table, or tibble, with columns `geo_value`,
+#' `time_value`, `version`, and then any additional number of columns.
+#' @param geo_type Type for the geo values. If missing, then the function will
+#' attempt to infer it from the geo values present; if this fails, then it
+#' will be set to "custom".
+#' @param time_type Type for the time values. If missing, then the function will
+#' attempt to infer it from the time values present; if this fails, then it
+#' will be set to "custom".
+#' @param other_keys Character vector specifying the names of variables in `x`
+#' that should be considered key variables (in the language of `data.table`)
+#' apart from "geo_value", "time_value", and "version".
+#' @param additional_metadata List of additional metadata to attach to the
+#' `epi_archive` object. The metadata will have `geo_type` and `time_type`
+#' fields; named entries from the passed list will be included as well.
+#' @param compactify Optional; Boolean or `NULL`: should we remove rows that are
+#' considered redundant for the purposes of `epi_archive`'s built-in methods
+#' such as `as_of`? As these methods use the last version of each observation
+#' carried forward (LOCF) to interpolate between the version data provided,
+#' rows that don't change these LOCF results can potentially be omitted to
+#' save space. `TRUE` will remove these rows, `FALSE` will not, and missing or
+#' `NULL` will remove these rows and issue a warning. Generally, this can be
+#' set to `TRUE`, but if you directly inspect or edit the fields of the
+#' `epi_archive` such as its `DT`, you will have to determine whether
+#' `compactify=TRUE` will produce the desired results. If compactification
+#' here is removing a large proportion of the rows, this may indicate a
+#' potential for space, time, or bandwidth savings upstream in the data
+#' pipeline, e.g., when fetching, storing, or preparing the input data `x`
+#' @param clobberable_versions_start Optional; `length`-1; either a value of the
+#' same `class` and `typeof` as `x$version`, or an `NA` of any `class` and
+#' `typeof`: specifically, either (a) the earliest version that could be
+#' subject to "clobbering" (being overwritten with different update data, but
+#' using the *same* version tag as the old update data), or (b) `NA`, to
+#' indicate that no versions are clobberable. There are a variety of reasons
+#' why versions could be clobberable under routine circumstances, such as (a)
+#' today's version of one/all of the columns being published after initially
+#' being filled with `NA` or LOCF, (b) a buggy version of today's data being
+#' published but then fixed and republished later in the day, or (c) data
+#' pipeline delays (e.g., publisher uploading, periodic scraping, database
+#' syncing, periodic fetching, etc.) that make events (a) or (b) reflected
+#' later in the day (or even on a different day) than expected; potential
+#' causes vary between different data pipelines. The default value is `NA`,
+#' which doesn't consider any versions to be clobberable. Another setting that
+#' may be appropriate for some pipelines is `max_version_with_row_in(x)`.
+#' @param versions_end Optional; length-1, same `class` and `typeof` as
+#' `x$version`: what is the last version we have observed? The default is
+#' `max_version_with_row_in(x)`, but values greater than this could also be
+#' valid, and would indicate that we observed additional versions of the data
+#' beyond `max(x$version)`, but they all contained empty updates. (The default
+#' value of `clobberable_versions_start` does not fully trust these empty
+#' updates, and assumes that any version `>= max(x$version)` could be
+#' clobbered.) If `nrow(x) == 0`, then this argument is mandatory.
+#' @return An `epi_archive` object.
+#'
+#' @details This is simply a wrapper around [`new_epi_archive2()`], so for
+#' example:
+#' ```
+#' x <- as_epi_archive2(df, geo_type = "state", time_type = "day")
+#' ```
+#' would be equivalent to:
+#' ```
+#' x <- new_epi_archive2(df, geo_type = "state", time_type = "day")
+#' ```
+#'
+#' @export
+#' @examples
+#' # Simple ex. with necessary keys
+#' tib <- tibble::tibble(
+#'   geo_value = rep(c("ca", "hi"), each = 5),
+#'   time_value = rep(seq(as.Date("2020-01-01"),
+#'     by = 1, length.out = 5
+#'   ), times = 2),
+#'   version = rep(seq(as.Date("2020-01-02"),
+#'     by = 1, length.out = 5
+#'   ), times = 2),
+#'   value = rnorm(10, mean = 2, sd = 1)
+#' )
+#'
+#' toy_epi_archive <- tib %>% as_epi_archive2(
+#'   geo_type = "state",
+#'   time_type = "day"
+#' )
+#' toy_epi_archive
+#'
+#' # Ex. with an additional key for county
+#' df <- data.frame(
+#'   geo_value = c(replicate(2, "ca"), replicate(2, "fl")),
+#'   county = c(1, 3, 2, 5),
+#'   time_value = c(
+#'     "2020-06-01",
+#'     "2020-06-02",
+#'     "2020-06-01",
+#'     "2020-06-02"
+#'   ),
+#'   version = c(
+#'     "2020-06-02",
+#'     "2020-06-03",
+#'     "2020-06-02",
+#'     "2020-06-03"
+#'   ),
+#'   cases = c(1, 2, 3, 4),
+#'   cases_rate = c(0.01, 0.02, 0.01, 0.05)
+#' )
+#'
+#' x <- df %>% as_epi_archive2(
+#'   geo_type = "state",
+#'   time_type = "day",
+#'   other_keys = "county"
+#' )
+as_epi_archive2 <- function(x, geo_type, time_type, other_keys,
+                            additional_metadata = list(),
+                            compactify = NULL,
+                            clobberable_versions_start = NA,
+                            versions_end = max_version_with_row_in(x)) {
+  new_epi_archive2(
+    x, geo_type, time_type, other_keys, additional_metadata,
+    compactify, clobberable_versions_start, versions_end
+  )
+}
+
+#' Test for `epi_archive` format
+#'
+#' @param x An object.
+#' @param grouped_okay Optional; Boolean; should a `grouped_epi_archive` also
+#' count? Default is `FALSE`.
+#' @return `TRUE` if the object inherits from `epi_archive`.
+#'
+#' @export
+#' @examples
+#' is_epi_archive2(jhu_csse_daily_subset) # FALSE (this is an epi_df, not epi_archive)
+#' is_epi_archive2(archive_cases_dv_subset_2) # TRUE
+#'
+#' # By default, grouped_epi_archives don't count as epi_archives, as they may
+#' # support a different set of operations from regular `epi_archives`. This
+#' # behavior can be controlled by `grouped_okay`.
+#' grouped_archive <- archive_cases_dv_subset_2 %>% group_by(geo_value)
+#' is_epi_archive2(grouped_archive) # FALSE
+#' is_epi_archive2(grouped_archive, grouped_okay = TRUE) # TRUE
+#'
+#' @seealso [`is_grouped_epi_archive`]
+is_epi_archive2 <- function(x, grouped_okay = FALSE) {
+  inherits(x, "epi_archive2") || grouped_okay && inherits(x, "grouped_epi_archive2")
+}
+
+
+#' @export
+clone <- function(x, ...) {
+  UseMethod("clone")
+}
+
+
+#' @export
+clone.epi_archive2 <- function(epi_archive, deep = FALSE) {
+  # TODO: Finish. Until field encapsulation is in place, defensively copy the
+  # reference-semantics `DT` for both shallow and deep clones, so that
+  # mutating the clone's `DT` cannot affect the original archive.
+  epi_archive$DT <- copy(epi_archive$DT)
+  return(epi_archive)
+}
diff --git a/R/data.R b/R/data.R
index 26b9f39f..37ccc522 100644
--- a/R/data.R
+++ b/R/data.R
@@ -289,3 +289,11 @@ delayed_assign_with_unregister_awareness(
 #' * Furthermore, the data has been limited to a very small number of rows, the
 #' signal names slightly altered, and formatted into a tibble.
 "jhu_csse_county_level_subset"
+
+#' @export
+"archive_cases_dv_subset_2"
+
+delayed_assign_with_unregister_awareness(
+  "archive_cases_dv_subset_2",
+  as_epi_archive2(archive_cases_dv_subset_dt, compactify = FALSE)
+)
diff --git a/R/grouped_archive_new.R b/R/grouped_archive_new.R
new file mode 100644
index 00000000..c0e6c35e
--- /dev/null
+++ b/R/grouped_archive_new.R
@@ -0,0 +1,456 @@
+#'
+#' Convenience function for performing a `tidy_select` on dots according to its
+#' docs, and taking the names (rather than the integer indices).
+#'
+#' @param ... tidyselect-syntax selection description
+#' @param .data named vector / data frame; context for the description / the
+#' object to which the selections apply
+#' @return character vector containing names of entries/columns of
+#' `names(.data)` denoting the selection
+#'
+#' @noRd
+eval_pure_select_names_from_dots <- function(..., .data) {
+  # `?tidyselect::eval_select` tells us to use this form when we take in dots.
+  # It seems a bit peculiar, since the expr doesn't pack with it a way to get at
+  # the environment for the dots, but it looks like `eval_select` will assume
+  # the caller env (our `environment()`) when given an expr, and thus have
+  # access to the dots.
+  #
+  # If we were allowing renaming, we'd need to be careful about which names (new
+  # vs. old vs. both) to return here.
+  names(tidyselect::eval_select(rlang::expr(c(...)), .data, allow_rename = FALSE))
+}
+
+#' Get names of dots without forcing the dots
+#'
+#' For use in functions that use nonstandard evaluation (NSE) on the dots; we
+#' can't use the pattern `names(list(...))` in this case because it will attempt
+#' to force/(standard-)evaluate the dots, and we want to avoid attempted forcing of the
+#' dots if we're using NSE.
+#'
+#' @noRd
+nse_dots_names <- function(...) {
+  names(rlang::call_match())
+}
+nse_dots_names2 <- function(...) {
+  rlang::names2(rlang::call_match())
+}
+
+#' @importFrom dplyr group_by_drop_default
+#' @noRd
+new_grouped_epi_archive <- function(ungrouped, vars, drop) {
+  if (inherits(ungrouped, "grouped_epi_archive")) {
+    cli_abort(
+      "`ungrouped` must not already be grouped (neither automatic regrouping
+      nor nested grouping is supported). Either use `group_by` with `.add=TRUE`,
+      or `ungroup` first.",
+      class = "epiprocess__grouped_epi_archive__ungrouped_arg_is_already_grouped",
+      epiprocess__ungrouped_class = class(ungrouped),
+      epiprocess__ungrouped_groups = groups(ungrouped)
+    )
+  }
+  assert_class(ungrouped, "epi_archive2")
+  assert_character(vars)
+  if (!test_subset(vars, names(ungrouped$DT))) {
+    cli_abort(
+      "All grouping variables `vars` must be present in the data.",
+    )
+  }
+  if ("version" %in% vars) {
+    cli_abort("`version` has a special interpretation and cannot be used by itself as a grouping variable")
+  }
+  assert_logical(drop, len = 1)
+
+  # -----
+  private <- list()
+  private$ungrouped <- ungrouped
+  private$vars <- vars
+  private$drop <- drop
+
+  return(structure(
+    list(
+      private = private
+    ),
+    class = c("grouped_epi_archive2", "epi_archive2")
+  ))
+}
+
+#' @export
+print.grouped_epi_archive2 <- function(grouped_epi_archive, class = TRUE) {
+  if (class) cat("A `grouped_epi_archive` object:\n")
+  writeLines(wrap_varnames(grouped_epi_archive$private$vars, initial = "* Groups: "))
+  # If none of the grouping vars is a factor, then $drop doesn't seem
+  # relevant, so try to be less verbose and don't message about it.
+  #
+  # Below map-then-extract may look weird, but the more natural
+  # extract-then-map appears to trigger copies of the extracted columns
+  # since we are working with a `data.table` (unless we go through
+  # `as.list`, but its current column-aliasing behavior is probably not
+  # something to rely too much on), while map functions currently appear
+  # to avoid column copies.
+  if (any(purrr::map_lgl(grouped_epi_archive$private$ungrouped$DT, is.factor)[grouped_epi_archive$private$vars])) {
+    cat(strwrap(init = "* ", prefix = "  ", sprintf(
+      "%s groups formed by factor levels that don't appear in the data",
+      if (grouped_epi_archive$private$drop) "Drops" else "Does not drop"
+    )))
+    cat("\n")
+  }
+  cat("It wraps an ungrouped `epi_archive`, with metadata:\n")
+  print(grouped_epi_archive$private$ungrouped, class = FALSE)
+  # Return self invisibly for convenience in `$`-"pipe":
+  invisible(grouped_epi_archive)
+}
+
+#' @include methods-epi_archive.R
+#' @rdname group_by.epi_archive
+#'
+#' @importFrom dplyr group_by
+#' @export
+group_by.grouped_epi_archive2 <- function(
+    grouped_epi_archive,
+    ...,
+    .add = FALSE,
+    .drop = dplyr::group_by_drop_default(grouped_epi_archive)) {
+  assert_logical(.add, len = 1)
+  if (!.add) {
+    cli_abort('`group_by` on a `grouped_epi_archive` with `.add=FALSE` is forbidden
+      (neither automatic regrouping nor nested grouping is supported).
+      If you want to "regroup", replacing the existing grouping vars, `ungroup` first and then `group_by`.
+      If you want to add to the existing grouping vars, call `group_by` specifying `.add=TRUE`.
+      ',
+      class = "epiprocess__grouped_epi_archive_group_by_with_add_FALSE"
+    )
+  } else {
+    # `group_by` `...` computations are performed on ungrouped data (see
+    # `?dplyr::group_by`)
+    detailed_mutate <- epix_detailed_restricted_mutate2(grouped_epi_archive$private$ungrouped, ...)
+    out_ungrouped <- detailed_mutate[["archive"]]
+    vars_from_dots <- detailed_mutate[["request_names"]]
+    vars <- union(grouped_epi_archive$private$vars, vars_from_dots)
+    new_grouped_epi_archive(grouped_epi_archive$private$ungrouped, vars, .drop)
+  }
+}
+
+#' @include methods-epi_archive.R
+#' @rdname group_by.epi_archive
+#'
+#' @export
+group_by_drop_default.grouped_epi_archive2 <- function(grouped_epi_archive) {
+  grouped_epi_archive$private$drop
+}
+
+#' @include methods-epi_archive.R
+#' @rdname group_by.epi_archive
+#'
+#' @importFrom dplyr groups
+#' @export
+groups.grouped_epi_archive2 <- function(grouped_epi_archive) {
+  rlang::syms(grouped_epi_archive$private$vars)
+}
+
+#' @include methods-epi_archive.R
+#' @rdname group_by.epi_archive
+#'
+#' @importFrom dplyr ungroup
+#' @export
+ungroup.grouped_epi_archive2 <- function(grouped_epi_archive, ...) {
+  if (rlang::dots_n(...) == 0L) {
+    # No dots = special behavior: remove all grouping vars and convert to
+    # an ungrouped class, as with `grouped_df`s.
+    grouped_epi_archive$private$ungrouped
+  } else {
+    exclude_vars <- eval_pure_select_names_from_dots(..., .data = grouped_epi_archive$private$ungrouped$DT)
+    # (requiring a pure selection here is a little stricter than dplyr
+    # implementations, but passing a renaming selection into `ungroup`
+    # seems pretty weird.)
+    result_vars <- grouped_epi_archive$private$vars[!grouped_epi_archive$private$vars %in% exclude_vars]
+    # `vars` might be length 0 if the user's tidyselection removed all
+    # grouping vars. Unlike with tibble, opt here to keep the result as a
+    # grouped_epi_archive, for output class consistency when `...` is
+    # provided.
+    new_grouped_epi_archive(grouped_epi_archive$private$ungrouped, result_vars, grouped_epi_archive$private$drop)
+  }
+}
+
+#' Truncate versions after a given version, grouped
+#' @description Filter to keep only older versions by mutating the underlying
+#' `epi_archive` using `$truncate_versions_after`. Returns the mutated
+#' `grouped_epi_archive` [invisibly][base::invisible].
+#' @param x as in [`epix_truncate_versions_after`]
+#' @param max_version as in [`epix_truncate_versions_after`]
+#' @export
+truncate_versions_after.grouped_epi_archive2 <- function(grouped_epi_archive, max_version) {
+  # The grouping is irrelevant for this method; if we were to split into
+  # groups and recombine appropriately, we should get the same result as
+  # just leveraging the ungrouped method, so just do the latter:
+  truncate_versions_after(grouped_epi_archive$private$ungrouped, max_version)
+  return(invisible(grouped_epi_archive))
+}
+
+#' Truncate versions after a given version, grouped
+#' @export
+epix_truncate_versions_after.grouped_epi_archive2 <- function(grouped_epi_archive, max_version) {
+  cloned_group_epi_archive <- clone(grouped_epi_archive, deep = TRUE)
+  return((truncate_versions_after(cloned_group_epi_archive, max_version)))
+  # ^ second set of parens drops invisibility
+}
+
+
+#' Slide over grouped epi archive
+#' @description Slides a given function over variables in a `grouped_epi_archive`
+#' object. See the documentation for the wrapper function [`epix_slide()`] for
+#' details.
+#' @importFrom data.table key address rbindlist setDF
+#' @importFrom tibble as_tibble new_tibble validate_tibble
+#' @importFrom dplyr group_by groups
+#' @importFrom rlang !! !!! enquo quo_is_missing enquos is_quosure sym syms
+#' env missing_arg
+#' @export
+slide.grouped_epi_archive2 <- function(grouped_epi_archive, f, ..., before, ref_time_values,
+                                       time_step, new_col_name = "slide_value",
+                                       as_list_col = FALSE, names_sep = "_",
+                                       all_versions = FALSE) {
+  # Perform some deprecated argument checks without using ` =
+  # deprecated()` in the function signature, because they are from
+  # early development versions and much more likely to be clutter than
+  # informative in the signature.
+  if ("group_by" %in% nse_dots_names(...)) {
+    cli_abort("
+      The `group_by` argument to `slide` has been removed; please use
+      the `group_by` S3 generic function or `$group_by` R6 method
+      before the slide instead. (If you were instead trying to pass a
+      `group_by` argument to `f` or create a column named `group_by`,
+      this check is a false positive, but you will still need to use a
+      different column name here and rename the resulting column after
+      the slide.)
+    ", class = "epiprocess__epix_slide_group_by_parameter_deprecated")
+  }
+  if ("all_rows" %in% nse_dots_names(...)) {
+    cli_abort("
+      The `all_rows` argument has been removed from `epix_slide` (but
+      is still supported in `epi_slide`). Add rows for excluded
+      results with a manual join instead.
+    ", class = "epiprocess__epix_slide_all_rows_parameter_deprecated")
+  }
+
+  if (missing(ref_time_values)) {
+    ref_time_values <- epix_slide_ref_time_values_default(grouped_epi_archive$private$ungrouped)
+  } else {
+    assert_numeric(ref_time_values, min.len = 1L, null.ok = FALSE, any.missing = FALSE)
+    if (any(ref_time_values > grouped_epi_archive$private$ungrouped$versions_end)) {
+      cli_abort("Some `ref_time_values` are greater than the latest version in the archive.")
+    }
+    if (anyDuplicated(ref_time_values) != 0L) {
+      cli_abort("Some `ref_time_values` are duplicated.")
+    }
+    # Sort, for consistency with `epi_slide`, although the current
+    # implementation doesn't take advantage of it.
+    ref_time_values <- sort(ref_time_values)
+  }
+
+  # Validate and pre-process `before`:
+  if (missing(before)) {
+    cli_abort("`before` is required (and must be passed by name);
+      if you did not want to apply a sliding window but rather
+      to map `as_of` and `f` across various `ref_time_values`,
+      pass a large `before` value (e.g., if time steps are days,
+      `before=365000`).")
+  }
+  before <- vctrs::vec_cast(before, integer())
+  assert_int(before, lower = 0L, null.ok = FALSE, na.ok = FALSE)
+
+  # If a custom time step is specified, then redefine units
+
+  if (!missing(time_step)) before <- time_step(before)
+
+  # Symbolize column name
+  new_col <- sym(new_col_name)
+
+  # Validate rest of parameters:
+  assert_logical(as_list_col, len = 1L)
+  assert_logical(all_versions, len = 1L)
+  assert_character(names_sep, len = 1L, null.ok = TRUE)
+
+  # Computation for one group, one time value
+  comp_one_grp <- function(.data_group, .group_key,
+                           f, ...,
+                           ref_time_value,
+                           new_col) {
+    # Carry out the specified computation
+    comp_value <- f(.data_group, .group_key, ref_time_value, ...)
+
+    if (all_versions) {
+      # Extract data from archive so we can do length checks below. When
+      # `all_versions = TRUE`, `.data_group` will always be an ungrouped
+      # archive because of the preceding `as_of` step.
+      .data_group <- .data_group$DT
+    }
+
+    assert(
+      check_atomic(comp_value, any.missing = TRUE),
+      check_data_frame(comp_value),
+      combine = "or",
+      .var.name = vname(comp_value)
+    )
+
+    # Label every result row with the `ref_time_value`
+    res <- list(time_value = ref_time_value)
+
+    # Wrap the computation output in a list and unchop/unnest later if
+    # `as_list_col = FALSE`. This approach means that we will get a
+    # list-class col rather than a data.frame-class col when
+    # `as_list_col = TRUE` and the computations output data
+    # frames.
+    res[[new_col]] <- list(comp_value)
+
+    # Convert the list to a tibble all at once for speed.
+    return(validate_tibble(new_tibble(res)))
+  }
+
+  # If `f` is missing, interpret ... as an expression for tidy evaluation
+  if (missing(f)) {
+    quos <- enquos(...)
+    if (length(quos) == 0) {
+      cli_abort("If `f` is missing then a computation must be specified via `...`.")
+    }
+    if (length(quos) > 1) {
+      cli_abort("If `f` is missing then only a single computation can be specified via `...`.")
+    }
+
+    f <- quos[[1]]
+    new_col <- sym(names(rlang::quos_auto_name(quos)))
+    ... <- missing_arg() # magic value that passes zero args as dots in calls below
+  }
+
+  f <- as_slide_computation(f, ...)
+  x <- lapply(ref_time_values, function(ref_time_value) {
+    # Ungrouped as-of data; `epi_df` if `all_versions` is `FALSE`,
+    # `epi_archive` if `all_versions` is `TRUE`:
+    as_of_raw <- as_of(grouped_epi_archive$private$ungrouped,
+      ref_time_value,
+      min_time_value = ref_time_value - before,
+      all_versions = all_versions
+    )
+
+    # Set:
+    # * `as_of_df`, the data.frame/tibble/epi_df/etc. that we will
+    #   `group_modify` as the `.data` argument. Might or might not
+    #   include version column.
+    # * `group_modify_fn`, the corresponding `.f` argument
+    if (!all_versions) {
+      as_of_df <- as_of_raw
+      group_modify_fn <- comp_one_grp
+    } else {
+      as_of_archive <- as_of_raw
+      # We essentially want to `group_modify` the archive, but
+      # haven't implemented this method yet. Next best would be
+      # `group_modify` on its `$DT`, but that has different
+      # behavior based on whether or not `dtplyr` is loaded.
+      # Instead, go through an ordinary data frame, trying to avoid
+      # copies.
+      if (address(as_of_archive$DT) == address(grouped_epi_archive$private$ungrouped$DT)) {
+        # `as_of` aliased the full `$DT`; copy before mutating:
+        #
+        # Note: this step is probably unneeded; we're fine with
+        # aliasing of the DT or its columns: vanilla operations aren't
+        # going to mutate them in-place if they are aliases, and we're
+        # not performing mutation (unlike the situation with
+        # `fill_through_version` where we do mutate a `DT` and don't
+        # want aliasing).
+        as_of_archive$DT <- copy(as_of_archive$DT)
+      }
+      dt_key <- data.table::key(as_of_archive$DT)
+      as_of_df <- as_of_archive$DT
+      data.table::setDF(as_of_df)
+
+      # Convert each subgroup chunk to an archive before running the calculation.
+      group_modify_fn <- function(.data_group, .group_key,
+                                  f, ...,
+                                  ref_time_value,
+                                  new_col) {
+        # .data_group is coming from as_of_df as a tibble, but we
+        # want to feed `comp_one_grp` an `epi_archive` backed by a
+        # DT; convert and wrap:
+        data.table::setattr(.data_group, "sorted", dt_key)
+        data.table::setDT(.data_group, key = dt_key)
+        .data_group_archive <- clone(as_of_archive)
+        .data_group_archive$DT <- .data_group
+        comp_one_grp(.data_group_archive, .group_key,
+          f = f, ...,
+          ref_time_value = ref_time_value,
+          new_col = new_col
+        )
+      }
+    }
+
+    return(
+      dplyr::group_modify(
+        dplyr::group_by(as_of_df, !!!syms(grouped_epi_archive$private$vars), .drop = grouped_epi_archive$private$drop),
+        group_modify_fn,
+        f = f, ...,
+        ref_time_value = ref_time_value,
+        new_col = new_col,
+        .keep = TRUE
+      )
+    )
+  })
+  # Combine output into a single tibble
+  x <- as_tibble(setDF(rbindlist(x)))
+  # Reconstruct groups
+  x <- group_by(x, !!!syms(grouped_epi_archive$private$vars), .drop = grouped_epi_archive$private$drop)
+
+  # Unchop/unnest if we need to
+  if (!as_list_col) {
+    x <- tidyr::unnest(x, !!new_col, names_sep = names_sep)
+  }
+
+  # if (is_epi_df(x)) {
+  #   # The analogue of `epi_df`'s `as_of` metadata for an archive is
+  #   # `$versions_end`, at least in the current absence of
+  #   # separate fields/columns denoting the "archive version" with a
+  #   # different resolution, or from the perspective of a different
+  #   # stage of a data pipeline. The `as_of` that is automatically
+  #   # derived won't always match; override:
+  #   attr(x, "metadata")[["as_of"]] <- private$ungrouped$versions_end
+  # }
+
+  # XXX We need to work out when we want to return an `epi_df` and how
+  # to get appropriate keys (see #290, #223, #163). We'll probably
+  # need the commented-out code above if we ever output an `epi_df`.
+  # However, as a stopgap measure to have some more consistency across
+  # different ways of calling `epix_slide`, and to prevent `epi_df`
+  # output with invalid metadata, always output a (grouped or
+  # ungrouped) tibble.
+  x <- decay_epi_df(x)
+
+  return(x)
+}
+
+
+# At time of writing, roxygen parses content in collation order, impacting the
+# presentation of .Rd files that document multiple functions (see
+# https://github.com/r-lib/roxygen2/pull/324). Use @include tags (determining
+# `Collate:`) and ordering of functions within each file in order to get the
+# desired ordering.
+
+
+
+#' @include methods-epi_archive.R
+#' @rdname group_by.epi_archive
+#'
+#' @export
+is_grouped_epi_archive2 <- function(x) {
+  inherits(x, "grouped_epi_archive2")
+}
+
+
+#' @export
+clone.grouped_epi_archive2 <- function(x, deep = FALSE) {
+  # TODO: Finish.
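+  # Note (added comment, an interpretation, not from the original patch): a
+  # deep clone also copies the wrapped ungrouped archive (and hence its
+  # reference-semantics `DT`), while a shallow clone aliases it; either way,
+  # a fresh `grouped_epi_archive` wrapper is constructed below.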
+  if (deep) {
+    ungrouped <- clone(x$private$ungrouped, deep = TRUE)
+  } else {
+    ungrouped <- x$private$ungrouped
+  }
+  new_grouped_epi_archive(ungrouped, x$private$vars, x$private$drop)
+}
diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R
index 4bcead66..213cf1b1 100644
--- a/R/methods-epi_archive.R
+++ b/R/methods-epi_archive.R
@@ -470,9 +470,8 @@ dplyr_col_modify.col_modify_recorder_df <- function(data, cols) {
 #' @noRd
 epix_detailed_restricted_mutate <- function(.data, ...) {
   # We don't want to directly use `dplyr::mutate` on the `$DT`, as:
-  #   - this likely copies the entire table
   #   - `mutate` behavior, including the output class, changes depending on
-  #     whether `dtplyr` is loaded and would require post-processing
+  #     whether `dtplyr` < 1.3.0 is loaded and would require post-processing
   #   - behavior with `dtplyr` isn't fully compatible
   #   - it doesn't give the desired details, and `rlang::exprs_auto_name` does not
   #     appropriately handle the `= NULL` and `= ` tidyeval cases
diff --git a/R/methods-epi_archive_new.R b/R/methods-epi_archive_new.R
new file mode 100644
index 00000000..3ce39afc
--- /dev/null
+++ b/R/methods-epi_archive_new.R
@@ -0,0 +1,826 @@
+#' Generate a snapshot from an `epi_archive` object
+#'
+#' Generates a snapshot in `epi_df` format from an `epi_archive` object, as of a
+#' given version. See the [archive
+#' vignette](https://cmu-delphi.github.io/epiprocess/articles/archive.html) for
+#' examples.
+#'
+#' @param x An `epi_archive` object
+#' @param max_version Time value specifying the max version to permit in the
+#' snapshot. That is, the snapshot will comprise the unique rows of the
+#' current archive data that represent the most up-to-date signal values, as
+#' of the specified `max_version` (and whose time values are at least
+#' `min_time_value`.)
+#' @param min_time_value Time value specifying the min time value to permit in
+#' the snapshot. Default is `-Inf`, which effectively means that there is no
+#' minimum considered.
+#' @param all_versions If `all_versions = TRUE`, then the output will be in
+#' `epi_archive` format, and contain rows in the specified `time_value` range
+#' having `version <= max_version`. The resulting object will cover a
+#' potentially narrower `version` and `time_value` range than `x`, depending
+#' on user-provided arguments. Otherwise, there will be one row in the output
+#' for the `max_version` of each `time_value`. Default is `FALSE`.
+#' @return An `epi_df` object.
+#'
+#' @details This is simply a wrapper around the `as_of()` method of the
+#' `epi_archive` class, so if `x` is an `epi_archive` object, then:
+#' ```
+#' epix_as_of(x, max_version = v)
+#' ```
+#' is equivalent to:
+#' ```
+#' x$as_of(max_version = v)
+#' ```
+#'
+#' Mutation and aliasing: `epix_as_of` and `$as_of` will not mutate the input
+#' archives, but may in some edge cases alias parts of the inputs, so copy the
+#' outputs if needed before using mutating operations like `data.table`'s `:=`
+#' operator. Currently, the only situation where there is potential aliasing
+#' is of the `DT` in edge cases with `all_versions = TRUE`, but this may change
+#' in the future.
+#'
+#' @examples
+#' # warning message of data latency shown
+#' epix_as_of2(
+#'   archive_cases_dv_subset_2,
+#'   max_version = max(archive_cases_dv_subset_2$DT$version)
+#' )
+#'
+#' range(archive_cases_dv_subset_2$DT$version) # 2020-06-02 -- 2021-12-01
+#'
+#' epix_as_of2(
+#'   archive_cases_dv_subset_2,
+#'   max_version = as.Date("2020-06-12")
+#' )
+#'
+#' # When fetching a snapshot as of the latest version with update data in the
+#' # archive, a warning is issued by default, as this update data might not yet
+#' # be finalized (for example, if data versions are labeled with dates, these
+#' # versions might be overwritten throughout the corresponding days with
+#' # additional data or "hotfixes" of erroneous data; when we build an archive
+#' # based on database queries, the latest available update might still be
+#' # subject to change, but previous versions should be finalized). We can
+#' # muffle such warnings with the following pattern:
+#' withCallingHandlers(
+#'   {
+#'     epix_as_of2(
+#'       archive_cases_dv_subset_2,
+#'       max_version = max(archive_cases_dv_subset_2$DT$version)
+#'     )
+#'   },
+#'   epiprocess__snapshot_as_of_clobberable_version = function(wrn) invokeRestart("muffleWarning")
+#' )
+#' # Since R 4.0, there is a `globalCallingHandlers` function that can be used
+#' # to globally toggle these warnings.
+#'
+#' @export
+epix_as_of2 <- function(epi_archive, max_version, min_time_value = -Inf, all_versions = FALSE) {
+  assert_class(epi_archive, "epi_archive2")
+  return(as_of(epi_archive, max_version, min_time_value, all_versions = all_versions))
+}
+
+#' `epi_archive` with unobserved history filled in (won't mutate, might alias)
+#'
+#' Sometimes, due to upstream data pipeline issues, we have to work with a
+#' version history that isn't completely up to date, but with functions that
+#' expect archives that are completely up to date, or equally as up-to-date as
+#' another archive. This function provides one way to approach such mismatches:
+#' pretend that we've "observed" additional versions, filling in these versions
+#' with NAs or extrapolated values.
+#'
+#' `epix_fill_through_version` will not mutate its `x` argument, but its result
+#' might alias fields of `x` (e.g., mutating the result's `DT` might mutate
+#' `x$DT`). The R6 method variant, `x$fill_through_version`, will mutate `x` to
+#' give the result, but might reseat its fields (e.g., references to the old
+#' `x$DT` might not be updated by this function or subsequent operations on
+#' `x`), and returns the updated `x` [invisibly][base::invisible].
+#'
+#' @param x An `epi_archive`
+#' @param fill_versions_end Length-1, same class&type as `x$version`: the
+#' version through which to fill in missing version history; this will be the
+#' result's `$versions_end` unless it already had a later
+#' `$versions_end`.
+#' @param how Optional; `"na"` or `"locf"`: `"na"` will fill in any missing
+#' required version history with `NA`s, by inserting (if necessary) an update
+#' immediately after the current `$versions_end` that revises all
+#' existing measurements to be `NA` (this is only supported for `version`
+#' classes with a `next_after` implementation); `"locf"` will fill in missing
+#' version history with the last version of each observation carried forward
+#' (LOCF), by leaving the update `$DT` alone (other `epi_archive` methods are
+#' based on LOCF). Default is `"na"`.
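+#' @examples
+#' # Illustrative sketch only (added here, not part of the original patch),
+#' # assuming the bundled `archive_cases_dv_subset_2` dataset; fill a week of
+#' # unobserved versions with NAs:
+#' filled <- epix_fill_through_version2(
+#'   archive_cases_dv_subset_2,
+#'   fill_versions_end = archive_cases_dv_subset_2$versions_end + 7,
+#'   how = "na"
+#' )
+#' filled$versions_end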
+#' @return An `epi_archive`
+epix_fill_through_version2 <- function(epi_archive, fill_versions_end,
+                                       how = c("na", "locf")) {
+  assert_class(epi_archive, "epi_archive2")
+  cloned_epi_archive <- clone(epi_archive)
+  # Enclosing parentheses drop the invisibility flag. See description above of
+  # potential mutation and aliasing behavior.
+  (fill_through_version(cloned_epi_archive, fill_versions_end, how = how))
+}
+
+#' Merge two `epi_archive` objects
+#'
+#' Merges two `epi_archive`s that share a common `geo_value`, `time_value`, and
+#' set of key columns. When they also share a common `versions_end`,
+#' using `$as_of` on the result should be the same as using `$as_of` on `x` and
+#' `y` individually, then performing a full join of the `DT`s on the non-version
+#' key columns (potentially consolidating multiple warnings about clobberable
+#' versions). If the `versions_end` values differ, the
+#' `sync` parameter controls what is done.
+#'
+#' This function, [`epix_merge`], does not mutate its inputs and will not alias
+#' either archive's `DT`, but may alias other fields; `x$merge` will overwrite
+#' `x` with the result of the merge, reseating its `DT` and several other fields
+#' (making them point to different objects), but avoiding mutation of the
+#' contents of the old `DT` (only relevant if you have another reference to the
+#' old `DT` in another object).
+#'
+#' @param x,y Two `epi_archive` objects to join together.
+#' @param sync Optional; `"forbid"`, `"na"`, `"locf"`, or `"truncate"`; in the
+#' case that `x$versions_end` doesn't match `y$versions_end`, what do we do?:
+#' `"forbid"`: emit an error; `"na"`: use `max(x$versions_end, y$versions_end)`
+#' as the result's `versions_end`, but ensure that, if we request a snapshot
+#' as of a version after `min(x$versions_end, y$versions_end)`, the
+#' observation columns from the less up-to-date archive will be all NAs (i.e.,
+#' imagine there was an update immediately after its `versions_end` which
+#' revised all observations to be `NA`); `"locf"`: use `max(x$versions_end,
+#' y$versions_end)` as the result's `versions_end`, allowing the last version
+#' of each observation to be carried forward to extrapolate unavailable
+#' versions for the less up-to-date input archive (i.e., imagining that the
+#' less up-to-date archive's data set remained unchanged between its actual
+#' `versions_end` and the other archive's `versions_end`); or `"truncate"`:
+#' use `min(x$versions_end, y$versions_end)` as the result's `versions_end`,
+#' and discard any rows containing update rows for later versions.
+#' @param compactify Optional; `TRUE`, `FALSE`, or `NULL`; should the result be
+#' compactified? See [`as_epi_archive`] for an explanation of what this means.
+#' Default here is `TRUE`.
+#' @return the resulting `epi_archive`
+#'
+#' @details In all cases, `additional_metadata` will be an empty list, and
+#' `clobberable_versions_start` will be set to the earliest version that could
+#' be clobbered in either input archive.
+#'
+#' @examples
+#' # create two example epi_archive datasets
+#' x <- archive_cases_dv_subset_2$DT %>%
+#'   dplyr::select(geo_value, time_value, version, case_rate_7d_av) %>%
+#'   as_epi_archive2(compactify = TRUE)
+#' y <- archive_cases_dv_subset_2$DT %>%
+#'   dplyr::select(geo_value, time_value, version, percent_cli) %>%
+#'   as_epi_archive2(compactify = TRUE)
+#' # merge results stored in a third object:
+#' xy <- epix_merge2(x, y)
+#'
+#' @importFrom data.table key set setkeyv
+#' @export
+epix_merge2 <- function(x, y,
+                        sync = c("forbid", "na", "locf", "truncate"),
+                        compactify = TRUE) {
+  assert_class(x, "epi_archive2")
+  assert_class(y, "epi_archive2")
+  sync <- rlang::arg_match(sync)
+
+  if (!identical(x$geo_type, y$geo_type)) {
+    cli_abort("`x` and `y` must have the same `$geo_type`")
+  }
+
+  if (!identical(x$time_type, y$time_type)) {
+    cli_abort("`x` and `y` must have the same `$time_type`")
+  }
+
+  if (length(x$additional_metadata) != 0L) {
+    cli_warn("x$additional_metadata won't appear in merge result",
+      class = "epiprocess__epix_merge_ignores_additional_metadata"
+    )
+  }
+  if (length(y$additional_metadata) != 0L) {
+    cli_warn("y$additional_metadata won't appear in merge result",
+      class = "epiprocess__epix_merge_ignores_additional_metadata"
+    )
+  }
+  result_additional_metadata <- list()
+
+  result_clobberable_versions_start <-
+    if (all(is.na(c(x$clobberable_versions_start, y$clobberable_versions_start)))) {
+      NA # (any type of NA is fine here)
+    } else {
+      min_na_rm(c(x$clobberable_versions_start, y$clobberable_versions_start))
+    }
+
+  # The actual merge below may not succeed 100% of the time, so do this
+  # preprocessing using non-mutating (but potentially aliasing) functions. This
+  # approach potentially uses more memory, but won't leave behind a
+  # partially-mutated `x` on failure.
+  if (sync == "forbid") {
+    if (!identical(x$versions_end, y$versions_end)) {
+      cli_abort(paste(
+        "`x` and `y` were not equally up to date version-wise:",
+        "`x$versions_end` was not identical to `y$versions_end`;",
+        "either ensure that `x` and `y` are equally up to date before merging,",
+        "or specify how to deal with this using `sync`"
+      ), class = "epiprocess__epix_merge_unresolved_sync")
+    } else {
+      new_versions_end <- x$versions_end
+      x_DT <- x$DT
+      y_DT <- y$DT
+    }
+  } else if (sync %in% c("na", "locf")) {
+    new_versions_end <- max(x$versions_end, y$versions_end)
+    x_DT <- epix_fill_through_version2(x, new_versions_end, sync)$DT
+    y_DT <- epix_fill_through_version2(y, new_versions_end, sync)$DT
+  } else if (sync == "truncate") {
+    new_versions_end <- min(x$versions_end, y$versions_end)
+    x_DT <- x$DT[x[["DT"]][["version"]] <= new_versions_end, names(x$DT), with = FALSE]
+    y_DT <- y$DT[y[["DT"]][["version"]] <= new_versions_end, names(y$DT), with = FALSE]
+  } else {
+    cli_abort("unimplemented")
+  }
+
+  # key(x_DT) should be the same as key(x$DT) and key(y_DT) should be the same
+  # as key(y$DT). Below, we only use {x,y}_DT in the code (making it easier to
+  # split the code into separate functions if we wish), but still refer to
+  # {x,y}$DT in the error messages (further relying on this assumption).
+  #
+  # Check & ensure that the above assumption holds; if it didn't already hold,
+  # we likely have a bug in the preprocessing, a weird/invalid archive as
+  # input, and/or a data.table version with different semantics (which may
+  # break other parts of our code).
+  x_DT_key_as_expected <- identical(key(x$DT), key(x_DT))
+  y_DT_key_as_expected <- identical(key(y$DT), key(y_DT))
+  if (!x_DT_key_as_expected || !y_DT_key_as_expected) {
+    cli_warn("
+      `epiprocess` internal warning (please report): pre-processing for
+      epix_merge unexpectedly resulted in an intermediate data table (or
+      tables) with a different key than the corresponding input archive.
+      Manually setting intermediate data table keys to the expected values.
+    ", internal = TRUE)
+    setkeyv(x_DT, key(x$DT))
+    setkeyv(y_DT, key(y$DT))
+  }
+  # Without some sort of annotations of what various columns represent, we can't
+  # do something that makes sense when merging archives with mismatched keys.
+  # E.g., even if we assume extra keys represent demographic breakdowns, a
+  # sensible default treatment of count-type and rate-type value columns would
+  # differ.
+  if (!identical(sort(key(x_DT)), sort(key(y_DT)))) {
+    cli_abort("
+      The archives must have the same set of key column names; if the
+      key columns represent the same things, just with different
+      names, please retry after manually renaming to match; if they
+      represent different things (e.g., x has an age breakdown
+      but y does not), please retry after processing them to share
+      the same key (e.g., by summarizing x to remove the age breakdown,
+      or by applying a static age breakdown to y).
+    ", class = "epiprocess__epix_merge_x_y_must_have_same_key_set")
+  }
+  # `by` cols = result (and each input's) `key` cols; these determine the
+  # result's row set, which is computed using a full join via `merge`
+  #
+  # non-`by` cols = "value"-ish cols, and are looked up with last
+  # version carried forward via rolling joins
+  by <- key(x_DT) # = some perm of key(y_DT)
+  if (!all(c("geo_value", "time_value", "version") %in% key(x_DT))) {
+    cli_abort('Invalid `by`; `by` is currently set to the common `key` of
+          the two archives, and is expected to contain
+          "geo_value", "time_value", and "version".',
+      class = "epiprocess__epi_archive_must_have_required_key_cols"
+    )
+  }
+  if (length(by) < 1L || utils::tail(by, 1L) != "version") {
+    cli_abort('Invalid `by`; `by` is currently set to the common `key` of
+          the two archives, and is expected to have a "version" as
+          the last key col.',
+      class = "epiprocess__epi_archive_must_have_version_at_end_of_key"
+    )
+  }
+  x_nonby_colnames <- setdiff(names(x_DT), by)
+  y_nonby_colnames <- setdiff(names(y_DT), by)
+  if (length(intersect(x_nonby_colnames, y_nonby_colnames)) != 0L) {
+    cli_abort("
+      `x` and `y` DTs have overlapping non-by column names;
+      this is currently not supported; please manually fix up first:
+      any overlapping columns that are key-like should be
+      incorporated into the key, and other columns should be renamed.
+    ", class = "epiprocess__epix_merge_x_y_must_not_have_overlapping_nonby_colnames")
+  }
+  x_by_vals <- x_DT[, by, with = FALSE]
+  if (anyDuplicated(x_by_vals) != 0L) {
+    cli_abort("
+      The `by` columns must uniquely determine rows of `x$DT`;
+      the `by` is currently set to the common `key` of the two
+      archives, so this can be resolved by adding key-like columns
+      to `x`'s key (to get a unique key).
+    ", class = "epiprocess__epix_merge_by_cols_must_act_as_unique_key")
+  }
+  y_by_vals <- y_DT[, by, with = FALSE]
+  if (anyDuplicated(y_by_vals) != 0L) {
+    cli_abort("
+      The `by` columns must uniquely determine rows of `y$DT`;
+      the `by` is currently set to the common `key` of the two
+      archives, so this can be resolved by adding key-like columns
+      to `y`'s key (to get a unique key).
+ ", class = "epiprocess__epix_merge_by_cols_must_act_as_unique_key") + } + result_DT <- merge(x_by_vals, y_by_vals, + by = by, + # We must have `all=TRUE` or we may skip updates + # from x and/or y and corrupt the history + all = TRUE, + # We don't want Cartesian products, but the + # by-is-unique-key check above already ensures + # this. (Note that `allow.cartesian=FALSE` doesn't + # actually catch all Cartesian products anyway.) + # Disable superfluous check: + allow.cartesian = TRUE + ) + set( + result_DT, , x_nonby_colnames, + x_DT[result_DT[, by, with = FALSE], x_nonby_colnames, + with = FALSE, + # It's good practice to specify `on`, and we must + # explicitly specify `on` if there's a potential key vs. + # by order mismatch (not possible currently for x + # with by = key(x$DT), but possible for y): + on = by, + # last version carried forward: + roll = TRUE, + # requesting non-version key that doesn't exist in the other archive, + # or before its first version, should result in NA + nomatch = NA, + # see note on `allow.cartesian` above; currently have a + # similar story here. + allow.cartesian = TRUE + ] + ) + set( + result_DT, , y_nonby_colnames, + y_DT[result_DT[, by, with = FALSE], y_nonby_colnames, + with = FALSE, + on = by, + roll = TRUE, + nomatch = NA, + allow.cartesian = TRUE + ] + ) + # The key could be unset in case of a key vs. by order mismatch as + # noted above. Ensure that we keep it: + setkeyv(result_DT, by) + + return(as_epi_archive2( + result_DT[], # clear data.table internal invisibility flag if set + geo_type = x$geo_type, + time_type = x$time_type, + other_keys = setdiff(key(result_DT), c("geo_value", "time_value", "version")), + additional_metadata = result_additional_metadata, + # It'd probably be better to pre-compactify before the merge, and might be + # guaranteed not to be necessary to compactify the merge result if the + # inputs are already compactified, but at time of writing we don't have + # compactify in its own method or field, and it seems like it should be + # pretty fast anyway. + compactify = compactify, + clobberable_versions_start = result_clobberable_versions_start, + versions_end = new_versions_end + )) +} + +# Helpers for `group_by`: + +#' Make non-testing mock to get [`dplyr::dplyr_col_modify`] input +#' +#' A workaround for `dplyr:::mutate_cols` not being exported and directly +#' applying test mock libraries likely being impossible (due to mocking another +#' package's S3 generic or method). +#' +#' Use solely with a single call to the [`dplyr::mutate`] function and then +#' `destructure_col_modify_recorder_df`; other applicable operations from +#' [dplyr::dplyr_extending] have not been implemented. 
+#'
+#' @param parent_df the "parent class" data frame to wrap
+#' @return a `col_modify_recorder_df`
+#'
+#' @noRd
+new_col_modify_recorder_df <- function(parent_df) {
+  assert_class(parent_df, "data.frame")
+  `class<-`(parent_df, c("col_modify_recorder_df", class(parent_df)))
+}
+
+#' Extract unchanged parent-class data frame from a `new_col_modify_recorder_df`
+#'
+#' @param col_modify_recorder_df an instance of a `col_modify_recorder_df`
+#' @return named list with elements `unchanged_parent_df`, `cols`; `cols` is the
+#'   input to [`dplyr::dplyr_col_modify`] that this class was designed to record
+#'
+#' @noRd
+destructure_col_modify_recorder_df <- function(col_modify_recorder_df) {
+  assert_class(col_modify_recorder_df, "col_modify_recorder_df")
+  list(
+    unchanged_parent_df = col_modify_recorder_df %>%
+      `attr<-`("epiprocess::col_modify_recorder_df::cols", NULL) %>%
+      `class<-`(setdiff(class(.), "col_modify_recorder_df")),
+    cols = attr(col_modify_recorder_df,
+      "epiprocess::col_modify_recorder_df::cols",
+      exact = TRUE
+    )
+  )
+}
+
+#' `dplyr_col_modify` method that simply records the `cols` argument
+#'
+#' Must export S3 methods in R >= 4.0, even if they're only designed to be
+#' package internals, and must import any corresponding upstream S3 generic
+#' functions:
+#' @importFrom dplyr dplyr_col_modify
+#' @export
+#' @noRd
+dplyr_col_modify.col_modify_recorder_df <- function(data, cols) {
+  if (!is.null(attr(data, "epiprocess::col_modify_recorder_df::cols", exact = TRUE))) {
+    cli_abort("`col_modify_recorder_df` can only record `cols` once",
+      internal = TRUE
+    )
+  }
+  attr(data, "epiprocess::col_modify_recorder_df::cols") <- cols
+  data
+}
+
+#' A more detailed but restricted `mutate` for use in `group_by.epi_archive`
+#'
+#' More detailed: provides the names of the "requested" columns in addition to
+#' the output expected from a regular `mutate` method.
+#'
+#' Restricted: doesn't allow replacing or removing key cols, where a sort is
+#' potentially required at best and what the output key should be is unclear at
+#' worst. (The originally expected restriction was that the `mutate` parameters
+#' not present in `group_by` would not be recognized, but the current
+#' implementation just lets `mutate` handle these anyway, even though they're
+#' not part of the regular `group_by` parameters; these arguments would have to
+#' be passed by names with dot prefixes, so just hope that the user means to use
+#' them here if provided.)
+#'
+#' This can introduce column-level aliasing in `data.table`s, which isn't really
+#' intended in the `data.table` user model but we can make it part of our user
+#' model (see
+#' https://stackoverflow.com/questions/45925482/make-a-shallow-copy-in-data-table
+#' and links).
+#'
+#' Don't export this without cleaning up language of "mutate" as in side effects
+#' vs. "mutate" as in `dplyr::mutate`.
+#' @noRd
+epix_detailed_restricted_mutate2 <- function(.data, ...) {
+  # We don't want to directly use `dplyr::mutate` on the `$DT`, as:
+  # - `mutate` behavior, including the output class, changes depending on
+  #   whether `dtplyr` < 1.3.0 is loaded and would require post-processing
+  # - behavior with `dtplyr` isn't fully compatible
+  # - it doesn't give the desired details, and `rlang::exprs_auto_name` does not
+  #   appropriately handle the `= NULL` and `= ` tidyeval cases
+  # Instead:
+  # - Use `as.list` to get a shallow copy (undocumented, but apparently
+  #   intended, behavior), then `as_tibble` (also shallow, given a list) to get
+  #   back to something that will use `dplyr`'s included `mutate` method(s),
+  #   then convert this using shallow operations into a `data.table`.
+  # - Use `col_modify_recorder_df` to get the desired details.
+  in_tbl <- tibble::as_tibble(as.list(.data$DT), .name_repair = "minimal")
+  col_modify_cols <-
+    destructure_col_modify_recorder_df(
+      mutate(new_col_modify_recorder_df(in_tbl), ...)
+    )[["cols"]]
+  invalidated_key_col_is <-
+    which(purrr::map_lgl(key(.data$DT), function(key_colname) {
+      key_colname %in% names(col_modify_cols) &&
+        !rlang::is_reference(in_tbl[[key_colname]], col_modify_cols[[key_colname]])
+    }))
+  if (length(invalidated_key_col_is) != 0L) {
+    rlang::abort(paste_lines(c(
+      "Key columns must not be replaced or removed.",
+      wrap_varnames(key(.data$DT)[invalidated_key_col_is],
+        initial = "Flagged key cols: "
+      )
+    )))
+  } else {
+    # Have `dplyr` do the `dplyr_col_modify`, keeping the column-level-aliasing
+    # and must-copy-on-write-if-refcount-more-than-1 model, obtaining a tibble,
+    # then convert it into a `data.table`. The key should still be valid
+    # (assuming that the user did not explicitly alter `key(.data$DT)` or the
+    # columns by reference somehow within `...` tidyeval-style computations, or
+    # trigger refcount-1 alterations due to still having >1 refcounts on the
+    # columns), set the "sorted" attribute accordingly to prevent attempted
+    # sorting (including potential extra copies) or sortedness checking, then
+    # `setDT` (rather than `as.data.table`, in order to prevent column copying
+    # to establish ownership according to `data.table`'s memory model).
+    out_DT <- dplyr::dplyr_col_modify(in_tbl, col_modify_cols) %>%
+      data.table::setattr("sorted", data.table::key(.data$DT)) %>%
+      data.table::setDT(key = key(.data$DT))
+    out_archive <- clone(.data)
+    out_archive$DT <- out_DT
+    request_names <- names(col_modify_cols)
+    return(list(
+      archive = out_archive,
+      request_names = request_names
+    ))
+    # (We might also consider special-casing when `mutate` hands back something
+    # equivalent (in some sense) to the input (probably only encountered when
+    # we're dealing with `group_by`), and using just `$DT`, not a shallow copy,
+    # in the result, primarily in order to hedge against `as.list` or `setDT`
+    # changing their behavior and generating deep copies somehow. This could
+    # also prevent storage, and perhaps also generation, of shallow copies, but
+    # this seems unlikely to be a major gain unless it helps enable some
+    # in-place modifications of refcount-1 columns (although detecting this case
+    # seems to be common across `group_by` implementations; maybe there is
+    # something there).)
+  }
+}
+
+
+#' Slide a function over variables in an `epi_archive` or `grouped_epi_archive`
+#'
+#' Slides a given function over variables in an `epi_archive` object. This
+#' behaves similarly to `epi_slide()`, with the key exception that it is
+#' version-aware: the sliding computation at any given reference time t is
+#' performed on **data that would have been available as of t**. See the
+#' [archive
+#' vignette](https://cmu-delphi.github.io/epiprocess/articles/archive.html) for
+#' examples.
+#'
+#' @param x An [`epi_archive`] or [`grouped_epi_archive`] object. If ungrouped,
+#'   all data in `x` will be treated as part of a single data group.
+#' @param f Function, formula, or missing; together with `...` specifies the
+#'   computation to slide. To "slide" means to apply a computation over a
+#'   sliding (a.k.a. "rolling") time window for each data group. The window is
+#'   determined by the `before` parameter described below. One time step is
+#'   typically one day or one week; see [`epi_slide`] details for more
+#'   explanation. If a function, `f` must take an `epi_df` with the same
+#'   column names as the archive's `DT`, minus the `version` column; followed
+#'   by a one-row tibble containing the values of the grouping variables for
+#'   the associated group; followed by a reference time value, usually as a
+#'   `Date` object; followed by any number of named arguments. If a formula,
+#'   `f` can operate directly on columns accessed via `.x$var` or `.$var`, as
+#'   in `~ mean(.x$var)` to compute a mean of a column `var` for each
+#'   group-`ref_time_value` combination. The group key can be accessed via
+#'   `.y` or `.group_key`, and the reference time value can be accessed via
+#'   `.z` or `.ref_time_value`. If `f` is missing, then `...` will specify the
+#'   computation.
+#' @param ... Additional arguments to pass to the function or formula specified
+#'   via `f`. Alternatively, if `f` is missing, then `...` is interpreted as an
+#'   expression for tidy evaluation; in addition to referring to columns
+#'   directly by name, the expression has access to `.data` and `.env` pronouns
+#'   as in `dplyr` verbs, and can also refer to the `.group_key` and
+#'   `.ref_time_value`. See details of [`epi_slide`].
+#' @param before How far `before` each `ref_time_value` should the sliding
+#'   window extend? If provided, should be a single, non-NA,
+#'   [integer-compatible][vctrs::vec_cast] number of time steps. This window
+#'   endpoint is inclusive. For example, if `before = 7`, and one time step is
+#'   one day, then to produce a value for a `ref_time_value` of January 8, we
+#'   apply the given function or formula to data (for each group present) with
+#'   `time_value`s from January 1 onward, as they were reported on January 8.
+#'   For typical disease surveillance sources, this will not include any data
+#'   with a `time_value` of January 8, and, depending on the amount of reporting
+#'   latency, may not include January 7 or even earlier `time_value`s. (If
+#'   instead the archive were to hold nowcasts instead of regular surveillance
+#'   data, then we would indeed expect data for `time_value` January 8. If it
+#'   were to hold forecasts, then we would expect data for `time_value`s after
+#'   January 8, and the sliding window would extend as far after each
+#'   `ref_time_value` as needed to include all such `time_value`s.)
+#' @param ref_time_values Reference time values / versions for sliding
+#'   computations; each element of this vector serves both as the anchor point
+#'   for the `time_value` window for the computation and the `max_version`
+#'   `as_of` which we fetch data in this window. If missing, then this will be set
+#'   to a regularly-spaced sequence of values covering the range of
+#'   `version`s in the `DT` plus the `versions_end`; the spacing of values will
+#'   be guessed (using the GCD of the skips between values).
+#' @param time_step Optional function used to define the meaning of one time
+#'   step, which if specified, overrides the default choice based on the
+#'   `time_value` column. This function must take a positive integer and return
+#'   an object of class `lubridate::period`. For example, we can use `time_step
+#'   = lubridate::hours` in order to set the time step to be one hour (this
+#'   would only be meaningful if `time_value` is of class `POSIXct`).
+#' @param new_col_name String indicating the name of the new column that will
+#'   contain the derivative values. Default is "slide_value"; note that setting
+#'   `new_col_name` equal to an existing column name will overwrite this column.
+#' @param as_list_col Should the slide results be held in a list column, or be
+#'   [unchopped][tidyr::unchop]/[unnested][tidyr::unnest]? Default is `FALSE`,
+#'   in which case a list object returned by `f` would be unnested (using
+#'   [`tidyr::unnest()`]), and, if the slide computations output data frames,
+#'   the names of the resulting columns are given by prepending `new_col_name`
+#'   to the names of the list elements.
+#' @param names_sep String specifying the separator to use in `tidyr::unnest()`
+#'   when `as_list_col = FALSE`. Default is "_". Using `NULL` drops the prefix
+#'   from `new_col_name` entirely.
+#' @param all_versions (Not the same as `all_rows` parameter of `epi_slide`.) If
+#'   `all_versions = TRUE`, then `f` will be passed the version history (all
+#'   `version <= ref_time_value`) for rows having `time_value` between
+#'   `ref_time_value - before` and `ref_time_value`. Otherwise, `f` will be
+#'   passed only the most recent `version` for every unique `time_value`.
+#'   Default is `FALSE`.
+#' @return A tibble whose columns are: the grouping variables, `time_value`,
+#'   containing the reference time values for the slide computation, and a
+#'   column named according to the `new_col_name` argument, containing the slide
+#'   values.
+#'
+#' @details A few key distinctions between the current function and `epi_slide()`:
+#' 1. In `f` functions for `epix_slide`, one should not assume that the input
+#' data contains any rows with `time_value` matching the computation's
+#' `ref_time_value` (accessible via `attributes()$metadata$as_of`); for
+#' typical epidemiological surveillance data, observations pertaining to a
+#' particular time period (`time_value`) are first reported `as_of` some
+#' instant after that time period has ended.
+#' 2. `epix_slide()` doesn't accept an `after` argument; its windows extend
+#' from `before` time steps before a given `ref_time_value` through the last
+#' `time_value` available as of version `ref_time_value` (typically, this
+#' won't include `ref_time_value` itself, as observations about a particular
+#' time interval (e.g., day) are only published after that time interval
+#' ends); `epi_slide` windows extend from `before` time steps before a
+#' `ref_time_value` through `after` time steps after `ref_time_value`.
+#' 3. The input class and columns are similar but different: `epix_slide`
+#' (with the default `all_versions=FALSE`) keeps all columns and the
+#' `epi_df`-ness of the first argument to each computation; `epi_slide` only
+#' provides the grouping variables in the second input, and will convert the
+#' first input into a regular tibble if the grouping variables include the
+#' essential `geo_value` column. (With `all_versions=TRUE`, `epix_slide` will
+#' provide an `epi_archive` rather than an `epi_df` to each
+#' computation.)
+#' 4. The output class and columns are similar but different: `epix_slide()`
+#' returns a tibble containing only the grouping variables, `time_value`, and
+#' the new column(s) from the slide computations, whereas `epi_slide()`
+#' returns an `epi_df` with all original variables plus the new columns from
+#' the slide computations. (Both will mirror the grouping or ungroupedness of
+#' their input, with one exception: `epi_archive`s can have trivial
+#' (zero-variable) groupings, but these will be dropped in `epix_slide`
+#' results as they are not supported by tibbles.)
+#' 5. There are no size stability checks or element/row recycling to maintain
+#' size stability in `epix_slide`, unlike in `epi_slide`. (`epix_slide` is
+#' roughly analogous to [`dplyr::group_modify`], while `epi_slide` is roughly
+#' analogous to `dplyr::mutate` followed by `dplyr::arrange`.) This is detailed
+#' in the "advanced" vignette.
+#' 6. `all_rows` is not supported in `epix_slide`; since the slide
+#' computations are allowed more flexibility in their outputs than in
+#' `epi_slide`, we can't guess a good representation for missing computations
+#' for excluded group-`ref_time_value` pairs.
+#' 7. The `ref_time_values` default for `epix_slide` is based on making an
+#' evenly-spaced sequence out of the `version`s in the `DT` plus the
+#' `versions_end`, rather than the `time_value`s.
+#'
+#' Apart from the above distinctions, the interfaces between `epix_slide()` and
+#' `epi_slide()` are the same.
+#'
+#' Furthermore, the current function can be considerably slower than
+#' `epi_slide()`, for two reasons: (1) it must repeatedly fetch
+#' properly-versioned snapshots from the data archive (via its `as_of()`
+#' method), and (2) it performs a "manual" sliding of sorts, and does not
+#' benefit from the highly efficient `slider` package. For this reason, it
+#' should never be used in place of `epi_slide()`, and should only be used when
+#' version-aware sliding is necessary (as is its purpose).
+#'
+#' Finally, this is simply a wrapper around the `slide()` method of the
+#' `epi_archive` and `grouped_epi_archive` classes, so if `x` is an
+#' object of either of these classes, then:
+#' ```
+#' epix_slide(x, new_var = comp(old_var), before = 119)
+#' ```
+#' is equivalent to:
+#' ```
+#' x$slide(new_var = comp(old_var), before = 119)
+#' ```
+#'
+#' Mutation and aliasing: `epix_slide` and `$slide` will not perform in-place
+#' mutation of the input archives on their own. In some edge cases the inputs
+#' they feed to the slide computations may alias parts of the input archive, so
+#' copy the slide computation inputs if needed before using mutating operations
+#' like `data.table`'s `:=` operator. Similarly, in some edge cases, the output
+#' of the slide operation may alias parts of the input archive, so similarly,
+#' make sure to clone and/or copy appropriately before using in-place mutation.
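+#'
+#' A sketch of such defensive copying inside a slide computation (assuming a
+#' column `case_rate_7d_av`, as in the examples below):
+#' ```
+#' f <- function(x, gk, rtv) {
+#'   dt <- data.table::as.data.table(x) # as.data.table() copies, so `:=` is safe
+#'   dt[, case_rate_doubled := case_rate_7d_av * 2]
+#'   mean(dt$case_rate_doubled, na.rm = TRUE)
+#' }
+#' ```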
+#'
+#' @examples
+#' library(dplyr)
+#'
+#' # Reference time points for which we want to compute slide values:
+#' ref_time_values <- seq(as.Date("2020-06-01"),
+#'   as.Date("2020-06-15"),
+#'   by = "1 day"
+#' )
+#'
+#' # A simple (but not very useful) example (see the archive vignette for a more
+#' # realistic one):
+#' archive_cases_dv_subset_2 %>%
+#'   group_by(geo_value) %>%
+#'   epix_slide2(
+#'     f = ~ mean(.x$case_rate_7d_av),
+#'     before = 2,
+#'     ref_time_values = ref_time_values,
+#'     new_col_name = "case_rate_7d_av_recent_av"
+#'   ) %>%
+#'   ungroup()
+#' # We requested time windows that started 2 days before the corresponding time
+#' # values. The actual number of `time_value`s in each computation depends on
+#' # the reporting latency of the signal and `time_value` range covered by the
+#' # archive (2020-06-01 -- 2021-11-30 in this example). In this case, we have
+#' # * 0 `time_value`s, for ref time 2020-06-01 --> the result is automatically
+#' #   discarded
+#' # * 1 `time_value`, for ref time 2020-06-02
+#' # * 2 `time_value`s, for the rest of the results
+#' # * never the 3 `time_value`s we would get from `epi_slide`, since, because
+#' #   of data latency, we'll never have an observation
+#' #   `time_value == ref_time_value` as of `ref_time_value`.
+#' # The example below shows this type of behavior in more detail.
+#'
+#' # Examining characteristics of the data passed to each computation with
+#' # `all_versions=FALSE`.
+#' archive_cases_dv_subset_2 %>%
+#'   group_by(geo_value) %>%
+#'   epix_slide2(
+#'     function(x, gk, rtv) {
+#'       tibble(
+#'         time_range = if (nrow(x) == 0L) {
+#'           "0 `time_value`s"
+#'         } else {
+#'           sprintf("%s -- %s", min(x$time_value), max(x$time_value))
+#'         },
+#'         n = nrow(x),
+#'         class1 = class(x)[[1L]]
+#'       )
+#'     },
+#'     before = 5, all_versions = FALSE,
+#'     ref_time_values = ref_time_values, names_sep = NULL
+#'   ) %>%
+#'   ungroup() %>%
+#'   arrange(geo_value, time_value)
+#'
+#' # --- Advanced: ---
+#'
+#' # `epix_slide` with `all_versions=FALSE` (the default) applies a
+#' # version-unaware computation to several versions of the data. We can also
+#' # use `all_versions=TRUE` to apply a version-*aware* computation to several
+#' # versions of the data, again looking at characteristics of the data passed
+#' # to each computation. In this case, each computation should expect an
+#' # `epi_archive` containing the relevant version data:
+#'
+#' archive_cases_dv_subset_2 %>%
+#'   group_by(geo_value) %>%
+#'   epix_slide2(
+#'     function(x, gk, rtv) {
+#'       tibble(
+#'         versions_start = if (nrow(x$DT) == 0L) {
+#'           "NA (0 rows)"
+#'         } else {
+#'           toString(min(x$DT$version))
+#'         },
+#'         versions_end = x$versions_end,
+#'         time_range = if (nrow(x$DT) == 0L) {
+#'           "0 `time_value`s"
+#'         } else {
+#'           sprintf("%s -- %s", min(x$DT$time_value), max(x$DT$time_value))
+#'         },
+#'         n = nrow(x$DT),
+#'         class1 = class(x)[[1L]]
+#'       )
+#'     },
+#'     before = 5, all_versions = TRUE,
+#'     ref_time_values = ref_time_values, names_sep = NULL
+#'   ) %>%
+#'   ungroup() %>%
+#'   # Focus on one geo_value so we can better see the columns above:
+#'   filter(geo_value == "ca") %>%
+#'   select(-geo_value)
+#'
+#' @importFrom rlang enquo !!!
+#' @export
+epix_slide2 <- function(x, f, ..., before, ref_time_values,
+                        time_step, new_col_name = "slide_value",
+                        as_list_col = FALSE, names_sep = "_",
+                        all_versions = FALSE) {
+  if (!is_epi_archive2(x, grouped_okay = TRUE)) {
+    cli_abort("`x` must be of class `epi_archive` or `grouped_epi_archive`.")
+  }
+  return(slide(x, f, ...,
+    before = before,
+    ref_time_values = ref_time_values,
+    time_step = time_step,
+    new_col_name = new_col_name,
+    as_list_col = as_list_col,
+    names_sep = names_sep,
+    all_versions = all_versions
+  ))
+}
+
+
+#' Filter an `epi_archive` object to keep only older versions
+#'
+#' Generates a filtered `epi_archive` from an `epi_archive` object, keeping
+#' only rows with `version` falling on or before a specified date.
+#'
+#' @param x An `epi_archive` object
+#' @param max_version Time value specifying the max version to permit in the
+#'   filtered archive. That is, the output archive will comprise rows of the
+#'   current archive data having `version` less than or equal to the
+#'   specified `max_version`
+#' @return An `epi_archive` object
+#'
+#' @export
+epix_truncate_versions_after <- function(x, max_version) {
+  UseMethod("epix_truncate_versions_after")
+}
+
+#' @export
+epix_truncate_versions_after.epi_archive2 <- function(x, max_version) {
+  cloned_epi_archive <- clone(x)
+  return((truncate_versions_after(cloned_epi_archive, max_version)))
+  # ^ second set of parens drops invisibility
+}
diff --git a/man/as_epi_archive2.Rd b/man/as_epi_archive2.Rd
new file mode 100644
index 00000000..bc3f5185
--- /dev/null
+++ b/man/as_epi_archive2.Rd
@@ -0,0 +1,142 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/archive_new.R
+\name{as_epi_archive2}
+\alias{as_epi_archive2}
+\title{Convert to \code{epi_archive} format}
+\usage{
+as_epi_archive2(
+  x,
+  geo_type,
+  time_type,
+  other_keys,
+  additional_metadata = list(),
+  compactify = NULL,
+  clobberable_versions_start = NA,
+  versions_end = max_version_with_row_in(x)
+)
+}
+\arguments{
+\item{x}{A data frame, data table, or tibble, with columns \code{geo_value},
+\code{time_value}, \code{version}, and then any additional number of columns.}
+
+\item{geo_type}{Type for the geo values. If missing, then the function will
+attempt to infer it from the geo values present; if this fails, then it
+will be set to "custom".}
+
+\item{time_type}{Type for the time values. If missing, then the function will
+attempt to infer it from the time values present; if this fails, then it
+will be set to "custom".}
+
+\item{other_keys}{Character vector specifying the names of variables in \code{x}
+that should be considered key variables (in the language of \code{data.table})
+apart from "geo_value", "time_value", and "version".}
+
+\item{additional_metadata}{List of additional metadata to attach to the
+\code{epi_archive} object. The metadata will have \code{geo_type} and \code{time_type}
+fields; named entries from the passed list will be included as well.}
+
+\item{compactify}{Optional; Boolean or \code{NULL}: should we remove rows that are
+considered redundant for the purposes of \code{epi_archive}'s built-in methods
+such as \code{as_of}? As these methods use the last version of each observation
+carried forward (LOCF) to interpolate between the version data provided,
+rows that don't change these LOCF results can potentially be omitted to
+save space. \code{TRUE} will remove these rows, \code{FALSE} will not, and missing or
+\code{NULL} will remove these rows and issue a warning. Generally, this can be
+set to \code{TRUE}, but if you directly inspect or edit the fields of the
+\code{epi_archive} such as its \code{DT}, you will have to determine whether
+\code{compactify=TRUE} will produce the desired results. If compactification
+here is removing a large proportion of the rows, this may indicate a
+potential for space, time, or bandwidth savings upstream in the data pipeline,
+e.g., when fetching, storing, or preparing the input data \code{x}}

+\item{clobberable_versions_start}{Optional; \code{length}-1; either a value of the
+same \code{class} and \code{typeof} as \code{x$version}, or an \code{NA} of any \code{class} and
+\code{typeof}: specifically, either (a) the earliest version that could be
+subject to "clobbering" (being overwritten with different update data, but
+using the \emph{same} version tag as the old update data), or (b) \code{NA}, to
+indicate that no versions are clobberable. There are a variety of reasons
+why versions could be clobberable under routine circumstances, such as (a)
+today's version of one/all of the columns being published after initially
+being filled with \code{NA} or LOCF, (b) a buggy version of today's data being
+published but then fixed and republished later in the day, or (c) data
+pipeline delays (e.g., publisher uploading, periodic scraping, database
+syncing, periodic fetching, etc.) that make events (a) or (b) reflected
+later in the day (or even on a different day) than expected; potential
+causes vary between different data pipelines. The default value is \code{NA},
+which doesn't consider any versions to be clobberable. Another setting that
+may be appropriate for some pipelines is \code{max_version_with_row_in(x)}.}
+
+\item{versions_end}{Optional; length-1, same \code{class} and \code{typeof} as
+\code{x$version}: what is the last version we have observed? The default is
+\code{max_version_with_row_in(x)}, but values greater than this could also be
+valid, and would indicate that we observed additional versions of the data
+beyond \code{max(x$version)}, but they all contained empty updates. (The default
+value of \code{clobberable_versions_start} does not fully trust these empty
+updates, and assumes that any version \verb{>= max(x$version)} could be
+clobbered.) If \code{nrow(x) == 0}, then this argument is mandatory.}
+}
+\value{
+An \code{epi_archive} object.
+}
+\description{
+Converts a data frame, data table, or tibble into an \code{epi_archive}
+object. See the \href{https://cmu-delphi.github.io/epiprocess/articles/archive.html}{archive vignette} for
+examples. The parameter descriptions below are copied from there.
+}
+\details{
+This is simply a wrapper around the \code{new()} method of the \code{epi_archive}
+class, so for example:
+
+\if{html}{\out{<div class="sourceCode">}}\preformatted{x <- as_epi_archive(df, geo_type = "state", time_type = "day")
+}\if{html}{\out{</div>}}
+
+would be equivalent to:
+
+\if{html}{\out{<div class="sourceCode">}}\preformatted{x <- epi_archive$new(df, geo_type = "state", time_type = "day")
+}\if{html}{\out{</div>}}
+}
+\examples{
+# Simple ex. with necessary keys
+tib <- tibble::tibble(
+  geo_value = rep(c("ca", "hi"), each = 5),
+  time_value = rep(seq(as.Date("2020-01-01"),
+    by = 1, length.out = 5
+  ), times = 2),
+  version = rep(seq(as.Date("2020-01-02"),
+    by = 1, length.out = 5
+  ), times = 2),
+  value = rnorm(10, mean = 2, sd = 1)
+)
+
+toy_epi_archive <- tib \%>\% as_epi_archive2(
+  geo_type = "state",
+  time_type = "day"
+)
+toy_epi_archive
+
+# Ex. with an additional key for county
+df <- data.frame(
+  geo_value = c(replicate(2, "ca"), replicate(2, "fl")),
+  county = c(1, 3, 2, 5),
+  time_value = c(
+    "2020-06-01",
+    "2020-06-02",
+    "2020-06-01",
+    "2020-06-02"
+  ),
+  version = c(
+    "2020-06-02",
+    "2020-06-03",
+    "2020-06-02",
+    "2020-06-03"
+  ),
+  cases = c(1, 2, 3, 4),
+  cases_rate = c(0.01, 0.02, 0.01, 0.05)
+)
+
+x <- df \%>\% as_epi_archive2(
+  geo_type = "state",
+  time_type = "day",
+  other_keys = "county"
+)
+}
diff --git a/man/as_of.epi_archive2.Rd b/man/as_of.epi_archive2.Rd
new file mode 100644
index 00000000..21a4cfc1
--- /dev/null
+++ b/man/as_of.epi_archive2.Rd
@@ -0,0 +1,33 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/archive_new.R
+\name{as_of.epi_archive2}
+\alias{as_of.epi_archive2}
+\title{As of epi_archive}
+\usage{
+\method{as_of}{epi_archive2}(epi_archive, max_version, min_time_value = -Inf, all_versions = FALSE)
+}
+\arguments{
+\item{epi_archive}{An \code{epi_archive} object}
+
+\item{max_version}{Version specifying the max version to permit in the
+snapshot. That is, the snapshot will comprise the unique rows of the
+current archive data that represent the most up-to-date signal values, as
+of the specified \code{max_version} (and whose \code{time_value}s are at least
+\code{min_time_value}).}
+
+\item{min_time_value}{Time value specifying the min \code{time_value} to permit in
+the snapshot. Default is \code{-Inf}, which effectively means that there is no
+minimum considered.}
+
+\item{all_versions}{Boolean; If \code{all_versions = TRUE}, then the output will be in
+\code{epi_archive} format, and contain rows in the specified \code{time_value} range
+having \code{version <= max_version}. The resulting object will cover a
+potentially narrower \code{version} and \code{time_value} range than \code{x}, depending
+on user-provided arguments. Otherwise, there will be one row in the output
+for the \code{max_version} of each \code{time_value}. Default is \code{FALSE}.}
+}
+\description{
+Generates a snapshot in \code{epi_df} format as of a given version.
+See the documentation for the wrapper function \code{\link[=epix_as_of]{epix_as_of()}} for
+details. The parameter descriptions below are copied from there.
+}
diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd
index 6a25b2af..86e21b89 100644
--- a/man/epi_archive.Rd
+++ b/man/epi_archive.Rd
@@ -1,9 +1,14 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/archive.R
+% Please edit documentation in R/archive.R, R/archive_new.R
 \name{epi_archive}
 \alias{epi_archive}
 \title{\code{epi_archive} object}
 \description{
+An \code{epi_archive} is an R6 class which contains a data table
+along with several relevant pieces of metadata. The data table can be seen
+as the full archive (version history) for some signal variables of
+interest.
+
 An \code{epi_archive} is an R6 class which contains a data table
 along with several relevant pieces of metadata. The data table can be seen
 as the full archive (version history) for some signal variables of
@@ -49,6 +54,56 @@ represent potential update data that we do not yet have access to; or in
 version in which it was first released, or if no version of that
 observation appears in the archive data at all.
 
+\strong{A word of caution:} R6 objects, unlike most other objects in R, have
+reference semantics. A primary consequence of this is that objects are not
+copied when modified. You can read more about this in Hadley Wickham's
+\href{https://adv-r.hadley.nz/r6.html#r6-semantics}{Advanced R} book. In order
+to construct a modified archive while keeping the original intact, first
+make a clone using the \verb{$clone} method, then overwrite the clone's \code{DT}
+field with \code{data.table::copy(clone$DT)}, and finally perform the
+modifications on the clone.
+
+epi archive
+
+An \code{epi_archive} is an R6 class which contains a data table \code{DT}, of
+class \code{data.table} from the \code{data.table} package, with (at least) the
+following columns:
+\itemize{
+\item \code{geo_value}: the geographic value associated with each row of measurements.
+\item \code{time_value}: the time value associated with each row of measurements.
+\item \code{version}: the time value specifying the version for each row of
+measurements. For example, if in a given row the \code{version} is January 15,
+2022 and \code{time_value} is January 14, 2022, then this row contains the
+measurements of the data for January 14, 2022 that were available one day
+later.
+}
+
+The data table \code{DT} has key variables \code{geo_value}, \code{time_value}, \code{version},
+as well as any others (these can be specified when instantiating the
+\code{epi_archive} object via the \code{other_keys} argument, and/or set by operating
+on \code{DT} directly). Refer to the documentation for \code{\link[=as_epi_archive]{as_epi_archive()}} for
+information and examples of relevant parameter names for an \code{epi_archive} object.
+Note that there can only be a single row per unique combination of
+key variables, and thus the key variables are critical for figuring out how
+to generate a snapshot of data from the archive, as of a given version.
+
+In general, the last version of each observation is carried forward (LOCF) to
+fill in data between recorded versions, and between the last recorded
+update and the \code{versions_end}. One consequence is that the \code{DT}
+doesn't have to contain a full snapshot of every version (although this
+generally works), but can instead contain only the rows that are new or
+changed from the previous version (see \code{compactify}, which does this
+automatically). Currently, deletions must be represented as revising a row
+to a special state (e.g., making the entries \code{NA} or including a special
+column that flags the data as removed and performing some kind of
+post-processing), and the archive is unaware of what this state is. Note
+that \code{NA}s \emph{can} be introduced by \code{epi_archive} methods for other reasons,
+e.g., in \code{\link{epix_fill_through_version}} and \code{\link{epix_merge}}, if requested, to
+represent potential update data that we do not yet have access to; or in
+\code{\link{epix_merge}} to represent the "value" of an observation before the
+version in which it was first released, or if no version of that
+observation appears in the archive data at all.
+
 \strong{A word of caution:} R6 objects, unlike most other objects in R, have
 reference semantics. A primary consequence of this is that objects are not
 copied when modified. You can read more about this in Hadley Wickham's
 \href{https://adv-r.hadley.nz/r6.html#r6-semantics}{Advanced R} book. In order
 to construct a modified archive while keeping the original intact, first
 make a clone using the \verb{$clone} method, then overwrite the clone's \code{DT}
 field with \code{data.table::copy(clone$DT)}, and finally perform the
 modifications on the clone.
 }
 \section{Metadata}{
+The following pieces of metadata are included as fields in an \code{epi_archive}
+object:
+\itemize{
+\item \code{geo_type}: the type for the geo values.
+\item \code{time_type}: the type for the time values.
+\item \code{additional_metadata}: list of additional metadata for the data archive.
+}
+
+Unlike an \code{epi_df} object, metadata for an \code{epi_archive} object \code{x} can be
+accessed (and altered) directly, as in \code{x$geo_type} or \code{x$time_type},
+etc. Like an \code{epi_df} object, the \code{geo_type} and \code{time_type} fields in the
+metadata of an \code{epi_archive} object are not currently used by any
+downstream functions in the \code{epiprocess} package, and serve only as useful
+bits of information to convey about the data set at hand.
+
+
 The following pieces of metadata are included as fields in an \code{epi_archive}
 object:
 \itemize{
 \item \code{geo_type}: the type for the geo values.
@@ -78,6 +149,13 @@ bits of information to convey about the data set at hand.
 
 \section{Generating Snapshots}{
+An \code{epi_archive} object can be used to generate a snapshot of the data in
+\code{epi_df} format, which represents the most up-to-date values of the signal
+variables, as of the specified version. This is accomplished by calling the
+\code{as_of()} method for an \code{epi_archive} object \code{x}. More details on this
+method are documented in the wrapper function \code{\link[=epix_as_of]{epix_as_of()}}.
+
+
 An \code{epi_archive} object can be used to generate a snapshot of the data in
 \code{epi_df} format, which represents the most up-to-date values of the signal
 variables, as of the specified version. This is accomplished by calling the
@@ -87,6 +165,16 @@ method are documented in the wrapper function \code{\link[=epix_as_of]{epix_as_o
 
 \section{Sliding Computations}{
+We can run a sliding computation over an \code{epi_archive} object, much like
+\code{epi_slide()} does for an \code{epi_df} object. This is accomplished by calling
+the \code{slide()} method for an \code{epi_archive} object, which works similarly to
+the way \code{epi_slide()} works for an \code{epi_df} object, but with one key
+difference: it is version-aware. That is, for an \code{epi_archive} object, the
+sliding computation at any given reference time point t is performed on
+\strong{data that would have been available as of t}. More details on \code{slide()}
+are documented in the wrapper function \code{\link[=epix_slide]{epix_slide()}}.
+
+
 We can run a sliding computation over an \code{epi_archive} object, much like
 \code{epi_slide()} does for an \code{epi_df} object. This is accomplished by calling
 the \code{slide()} method for an \code{epi_archive} object, which works similarly to
@@ -114,6 +202,22 @@ toy_epi_archive <- tib \%>\% epi_archive$new(
   time_type = "day"
 )
 toy_epi_archive
+tib <- tibble::tibble(
+  geo_value = rep(c("ca", "hi"), each = 5),
+  time_value = rep(seq(as.Date("2020-01-01"),
+    by = 1, length.out = 5
+  ), times = 2),
+  version = rep(seq(as.Date("2020-01-02"),
+    by = 1, length.out = 5
+  ), times = 2),
+  value = rnorm(10, mean = 2, sd = 1)
+)
+
+toy_epi_archive <- tib \%>\% new_epi_archive2(
+  geo_type = "state",
+  time_type = "day"
+)
+toy_epi_archive
 }
 \section{Public fields}{
 \if{html}{\out{<div class="r6-fields">}}
diff --git a/man/epix_as_of2.Rd b/man/epix_as_of2.Rd
new file mode 100644
index 00000000..ac69e9a9
--- /dev/null
+++ b/man/epix_as_of2.Rd
@@ -0,0 +1,95 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/methods-epi_archive_new.R
+\name{epix_as_of2}
+\alias{epix_as_of2}
+\title{Generate a snapshot from an \code{epi_archive} object}
+\usage{
+epix_as_of2(
+  epi_archive,
+  max_version,
+  min_time_value = -Inf,
+  all_versions = FALSE
+)
+}
+\arguments{
+\item{max_version}{Time value specifying the max version to permit in the
+snapshot. That is, the snapshot will comprise the unique rows of the
+current archive data that represent the most up-to-date signal values, as
+of the specified \code{max_version} (and whose time values are at least
+\code{min_time_value}).}
+
+\item{min_time_value}{Time value specifying the min time value to permit in
+the snapshot. Default is \code{-Inf}, which effectively means that there is no
+minimum considered.}
+
+\item{all_versions}{If \code{all_versions = TRUE}, then the output will be in
+\code{epi_archive} format, and contain rows in the specified \code{time_value} range
+having \code{version <= max_version}. The resulting object will cover a
+potentially narrower \code{version} and \code{time_value} range than \code{x}, depending
+on user-provided arguments. Otherwise, there will be one row in the output
+for the \code{max_version} of each \code{time_value}. Default is \code{FALSE}.}
+
+\item{x}{An \code{epi_archive} object}
+}
+\value{
+An \code{epi_df} object.
+}
+\description{
+Generates a snapshot in \code{epi_df} format from an \code{epi_archive} object, as of a
+given version. See the \href{https://cmu-delphi.github.io/epiprocess/articles/archive.html}{archive vignette} for
+examples.
+}
+\details{
+This is simply a wrapper around the \code{as_of()} method of the
+\code{epi_archive} class, so if \code{x} is an \code{epi_archive} object, then:
+
+\if{html}{\out{<div class="sourceCode">}}\preformatted{epix_as_of(x, max_version = v)
+}\if{html}{\out{</div>}}
+
+is equivalent to:
+
+\if{html}{\out{<div class="sourceCode">}}\preformatted{x$as_of(max_version = v)
+}\if{html}{\out{</div>}}
+
+Mutation and aliasing: \code{epix_as_of} and \verb{$as_of} will not mutate the input
+archives, but may in some edge cases alias parts of the inputs, so copy the
+outputs if needed before using mutating operations like \code{data.table}'s \verb{:=}
+operator. Currently, the only situation where there is potentially aliasing
+is of the \code{DT} in edge cases with \code{all_versions = TRUE}, but this may change
+in the future.
+}
+\examples{
+# warning message of data latency shown
+epix_as_of2(
+  archive_cases_dv_subset_2,
+  max_version = max(archive_cases_dv_subset_2$DT$version)
+)
+
+range(archive_cases_dv_subset_2$DT$version) # 2020-06-02 -- 2021-12-01
+
+epix_as_of2(
+  archive_cases_dv_subset_2,
+  max_version = as.Date("2020-06-12")
+)
+
+# When fetching a snapshot as of the latest version with update data in the
+# archive, a warning is issued by default, as this update data might not yet
+# be finalized (for example, if data versions are labeled with dates, these
+# versions might be overwritten throughout the corresponding days with
+# additional data or "hotfixes" of erroneous data; when we build an archive
+# based on database queries, the latest available update might still be
+# subject to change, but previous versions should be finalized). We can
+# muffle such warnings with the following pattern:
+withCallingHandlers(
+  {
+    epix_as_of2(
+      archive_cases_dv_subset_2,
+      max_version = max(archive_cases_dv_subset_2$DT$version)
+    )
+  },
+  epiprocess__snapshot_as_of_clobberable_version = function(wrn) invokeRestart("muffleWarning")
+)
+# Since R 4.0, there is a `globalCallingHandlers` function that can be used
+# to globally toggle these warnings.
+
+}
diff --git a/man/epix_fill_through_version2.Rd b/man/epix_fill_through_version2.Rd
new file mode 100644
index 00000000..7389388a
--- /dev/null
+++ b/man/epix_fill_through_version2.Rd
@@ -0,0 +1,48 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/methods-epi_archive_new.R
+\name{epix_fill_through_version2}
+\alias{epix_fill_through_version2}
+\title{\code{epi_archive} with unobserved history filled in (won't mutate, might alias)}
+\usage{
+epix_fill_through_version2(
+  epi_archive,
+  fill_versions_end,
+  how = c("na", "locf")
+)
+}
+\arguments{
+\item{fill_versions_end}{Length-1, same class&type as \code{x$version}: the
+version through which to fill in missing version history; this will be the
+result's \verb{$versions_end} unless it already had a later
+\verb{$versions_end}.}
+
+\item{how}{Optional; \code{"na"} or \code{"locf"}: \code{"na"} will fill in any missing
+required version history with \code{NA}s, by inserting (if necessary) an update
+immediately after the current \verb{$versions_end} that revises all
+existing measurements to be \code{NA} (this is only supported for \code{version}
+classes with a \code{next_after} implementation); \code{"locf"} will fill in missing
+version history with the last version of each observation carried forward
+(LOCF), by leaving the update \verb{$DT} alone (other \code{epi_archive} methods are
+based on LOCF). Default is \code{"na"}.}
+
+\item{x}{An \code{epi_archive}}
+}
+\value{
+An \code{epi_archive}
+}
+\description{
+Sometimes, due to upstream data pipeline issues, we have to work with a
+version history that isn't completely up to date, but with functions that
+expect archives that are completely up to date, or equally as up-to-date as
+another archive. This function provides one way to approach such mismatches:
+pretend that we've "observed" additional versions, filling in these versions
+with NAs or extrapolated values.
+}
+\details{
+\code{epix_fill_through_version} will not mutate its \code{x} argument, but its result
+might alias fields of \code{x} (e.g., mutating the result's \code{DT} might mutate
+\code{x$DT}). The R6 method variant, \code{x$fill_through_version}, will mutate \code{x} to
+give the result, but might reseat its fields (e.g., references to the old
+\code{x$DT} might not be updated by this function or subsequent operations on
+\code{x}), and returns the updated \code{x} \link[base:invisible]{invisibly}.
+}
diff --git a/man/epix_merge2.Rd b/man/epix_merge2.Rd
new file mode 100644
index 00000000..11d0aff5
--- /dev/null
+++ b/man/epix_merge2.Rd
@@ -0,0 +1,71 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/methods-epi_archive_new.R
+\name{epix_merge2}
+\alias{epix_merge2}
+\title{Merge two \code{epi_archive} objects}
+\usage{
+epix_merge2(
+  x,
+  y,
+  sync = c("forbid", "na", "locf", "truncate"),
+  compactify = TRUE
+)
+}
+\arguments{
+\item{x, y}{Two \code{epi_archive} objects to join together.}
+
+\item{sync}{Optional; \code{"forbid"}, \code{"na"}, \code{"locf"}, or \code{"truncate"}; in the
+case that \code{x$versions_end} doesn't match \code{y$versions_end}, what do we do?:
+\code{"forbid"}: emit an error; \code{"na"}: use \code{max(x$versions_end, y$versions_end)}
+as the result's \code{versions_end}, but ensure that, if we request a snapshot
+as of a version after \code{min(x$versions_end, y$versions_end)}, the
+observation columns from the less up-to-date archive will be all NAs (i.e.,
+imagine there was an update immediately after its \code{versions_end} which
+revised all observations to be \code{NA}); \code{"locf"}: use \code{max(x$versions_end, y$versions_end)} as the result's \code{versions_end}, allowing the last version
+of each observation to be carried forward to extrapolate unavailable
+versions for the less up-to-date input archive (i.e., imagining that the
+less up-to-date archive's data set remained unchanged between its actual
+\code{versions_end} and the other archive's \code{versions_end}); or \code{"truncate"}:
+use \code{min(x$versions_end, y$versions_end)} as the result's \code{versions_end},
+and discard any update rows for later versions.}
+
+\item{compactify}{Optional; \code{TRUE}, \code{FALSE}, or \code{NULL}; should the result be
+compactified? See \code{\link{as_epi_archive}} for an explanation of what this means.
+Default here is \code{TRUE}.}
+}
+\value{
+The resulting \code{epi_archive}
+}
+\description{
+Merges two \code{epi_archive}s that share a common \code{geo_value}, \code{time_value}, and
+set of key columns. When they also share a common \code{versions_end},
+using \verb{$as_of} on the result should be the same as using \verb{$as_of} on \code{x} and
+\code{y} individually, then performing a full join of the \code{DT}s on the non-version
+key columns (potentially consolidating multiple warnings about clobberable
+versions). If the \code{versions_end} values differ, the
+\code{sync} parameter controls what is done.
+}
+\details{
+This function, \code{\link{epix_merge}}, does not mutate its inputs and will not alias
+either archive's \code{DT}, but may alias other fields; \code{x$merge} will overwrite
+\code{x} with the result of the merge, reseating its \code{DT} and several other fields
+(making them point to different objects), but avoiding mutation of the
+contents of the old \code{DT} (only relevant if you have another reference to the
+old \code{DT} in another object).
+
+In all cases, \code{additional_metadata} will be an empty list, and
+\code{clobberable_versions_start} will be set to the earliest version that could
+be clobbered in either input archive.
+}
+\examples{
+# create two example epi_archive datasets
+x <- archive_cases_dv_subset_2$DT \%>\%
+  dplyr::select(geo_value, time_value, version, case_rate_7d_av) \%>\%
+  as_epi_archive2(compactify = TRUE)
+y <- archive_cases_dv_subset_2$DT \%>\%
+  dplyr::select(geo_value, time_value, version, percent_cli) \%>\%
+  as_epi_archive2(compactify = TRUE)
+# merge results stored in a third object:
+xy <- epix_merge2(x, y)
+
+}
diff --git a/man/epix_slide2.Rd b/man/epix_slide2.Rd
new file mode 100644
index 00000000..8d822bc0
--- /dev/null
+++ b/man/epix_slide2.Rd
@@ -0,0 +1,283 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/methods-epi_archive_new.R
+\name{epix_slide2}
+\alias{epix_slide2}
+\title{Slide a function over variables in an \code{epi_archive} or \code{grouped_epi_archive}}
+\usage{
+epix_slide2(
+  x,
+  f,
+  ...,
+  before,
+  ref_time_values,
+  time_step,
+  new_col_name = "slide_value",
+  as_list_col = FALSE,
+  names_sep = "_",
+  all_versions = FALSE
+)
+}
+\arguments{
+\item{x}{An \code{\link{epi_archive}} or \code{\link{grouped_epi_archive}} object. If ungrouped,
+all data in \code{x} will be treated as part of a single data group.}
+
+\item{f}{Function, formula, or missing; together with \code{...} specifies the
+computation to slide. To "slide" means to apply a computation over a
+sliding (a.k.a. "rolling") time window for each data group. The window is
+determined by the \code{before} parameter described below. One time step is
+typically one day or one week; see \code{\link{epi_slide}} details for more
+explanation. If a function, \code{f} must take an \code{epi_df} with the same
+column names as the archive's \code{DT}, minus the \code{version} column; followed
+by a one-row tibble containing the values of the grouping variables for
+the associated group; followed by a reference time value, usually as a
+\code{Date} object; followed by any number of named arguments. If a formula,
+\code{f} can operate directly on columns accessed via \code{.x$var} or \code{.$var}, as
+in \code{~ mean(.x$var)} to compute a mean of a column \code{var} for each
+group-\code{ref_time_value} combination. The group key can be accessed via
+\code{.y} or \code{.group_key}, and the reference time value can be accessed via
+\code{.z} or \code{.ref_time_value}. If \code{f} is missing, then \code{...} will specify the
+computation.}
+
+\item{...}{Additional arguments to pass to the function or formula specified
+via \code{f}. Alternatively, if \code{f} is missing, then \code{...} is interpreted as an
+expression for tidy evaluation; in addition to referring to columns
+directly by name, the expression has access to \code{.data} and \code{.env} pronouns
+as in \code{dplyr} verbs, and can also refer to the \code{.group_key} and
+\code{.ref_time_value}. See details of \code{\link{epi_slide}}.}
+
+\item{before}{How far \code{before} each \code{ref_time_value} should the sliding
+window extend? If provided, should be a single, non-NA,
+\link[vctrs:vec_cast]{integer-compatible} number of time steps. This window
+endpoint is inclusive. For example, if \code{before = 7}, and one time step is
+one day, then to produce a value for a \code{ref_time_value} of January 8, we
+apply the given function or formula to data (for each group present) with
+\code{time_value}s from January 1 onward, as they were reported on January 8.
+For typical disease surveillance sources, this will not include any data
+with a \code{time_value} of January 8, and, depending on the amount of reporting
+latency, may not include January 7 or even earlier \code{time_value}s. (If
+instead the archive were to hold nowcasts instead of regular surveillance
+data, then we would indeed expect data for \code{time_value} January 8. If it
+were to hold forecasts, then we would expect data for \code{time_value}s after
+January 8, and the sliding window would extend as far after each
+\code{ref_time_value} as needed to include all such \code{time_value}s.)}
+
+\item{ref_time_values}{Reference time values / versions for sliding
+computations; each element of this vector serves both as the anchor point
+for the \code{time_value} window for the computation and the \code{max_version}
+\code{as_of} which we fetch data in this window. If missing, then this will be set
+to a regularly-spaced sequence of values covering the range of
+\code{version}s in the \code{DT} plus the \code{versions_end}; the spacing of values will
+be guessed (using the GCD of the skips between values).}
+
+\item{time_step}{Optional function used to define the meaning of one time
+step, which if specified, overrides the default choice based on the
+\code{time_value} column. This function must take a positive integer and return
+an object of class \code{lubridate::period}. For example, we can use \code{time_step = lubridate::hours} in order to set the time step to be one hour (this
+would only be meaningful if \code{time_value} is of class \code{POSIXct}).}
+
+\item{new_col_name}{String indicating the name of the new column that will
+contain the derivative values. Default is "slide_value"; note that setting
+\code{new_col_name} equal to an existing column name will overwrite this column.}
+
+\item{as_list_col}{Should the slide results be held in a list column, or be
+\link[tidyr:chop]{unchopped}/\link[tidyr:unnest]{unnested}? Default is \code{FALSE},
+in which case a list object returned by \code{f} would be unnested (using
+\code{\link[tidyr:unnest]{tidyr::unnest()}}), and, if the slide computations output data frames,
+the names of the resulting columns are given by prepending \code{new_col_name}
+to the names of the list elements.}
+
+\item{names_sep}{String specifying the separator to use in \code{tidyr::unnest()}
+when \code{as_list_col = FALSE}. Default is "_". Using \code{NULL} drops the prefix
+from \code{new_col_name} entirely.}
+
+\item{all_versions}{(Not the same as \code{all_rows} parameter of \code{epi_slide}.) If
+\code{all_versions = TRUE}, then \code{f} will be passed the version history (all
+\code{version <= ref_time_value}) for rows having \code{time_value} between
+\code{ref_time_value - before} and \code{ref_time_value}. Otherwise, \code{f} will be
+passed only the most recent \code{version} for every unique \code{time_value}.
+Default is \code{FALSE}.}
+}
+\value{
+A tibble whose columns are: the grouping variables, \code{time_value},
+containing the reference time values for the slide computation, and a
+column named according to the \code{new_col_name} argument, containing the slide
+values.
+}
+\description{
+Slides a given function over variables in an \code{epi_archive} object. This
+behaves similarly to \code{epi_slide()}, with the key exception that it is
+version-aware: the sliding computation at any given reference time t is
+performed on \strong{data that would have been available as of t}. See the
+\href{https://cmu-delphi.github.io/epiprocess/articles/archive.html}{archive vignette} for
+examples.
+}
+\details{
+A few key distinctions between the current function and \code{epi_slide()}:
+\enumerate{
+\item In \code{f} functions for \code{epix_slide}, one should not assume that the input
+data contains any rows with \code{time_value} matching the computation's
+\code{ref_time_value} (accessible via \verb{attributes()$metadata$as_of}); for
+typical epidemiological surveillance data, observations pertaining to a
+particular time period (\code{time_value}) are first reported \code{as_of} some
+instant after that time period has ended.
+\item \code{epix_slide()} doesn't accept an \code{after} argument; its windows extend
+from \code{before} time steps before a given \code{ref_time_value} through the last
+\code{time_value} available as of version \code{ref_time_value} (typically, this
+won't include \code{ref_time_value} itself, as observations about a particular
+time interval (e.g., day) are only published after that time interval
+ends); \code{epi_slide} windows extend from \code{before} time steps before a
+\code{ref_time_value} through \code{after} time steps after \code{ref_time_value}.
+\item The input class and columns are similar but different: \code{epix_slide}
+(with the default \code{all_versions=FALSE}) keeps all columns and the
+\code{epi_df}-ness of the first argument to each computation; \code{epi_slide} only
+provides the grouping variables in the second input, and will convert the
+first input into a regular tibble if the grouping variables include the
+essential \code{geo_value} column. (With \code{all_versions=TRUE}, \code{epix_slide} will
+provide an \code{epi_archive} rather than an \code{epi_df} to each
+computation.)
+\item The output class and columns are similar but different: \code{epix_slide()}
+returns a tibble containing only the grouping variables, \code{time_value}, and
+the new column(s) from the slide computations, whereas \code{epi_slide()}
+returns an \code{epi_df} with all original variables plus the new columns from
+the slide computations. (Both will mirror the grouping or ungroupedness of
+their input, with one exception: \code{epi_archive}s can have trivial
+(zero-variable) groupings, but these will be dropped in \code{epix_slide}
+results as they are not supported by tibbles.)
+\item There are no size stability checks or element/row recycling to maintain
+size stability in \code{epix_slide}, unlike in \code{epi_slide}. (\code{epix_slide} is
+roughly analogous to \code{\link[dplyr:group_map]{dplyr::group_modify}}, while \code{epi_slide} is roughly
+analogous to \code{dplyr::mutate} followed by \code{dplyr::arrange}.) This is detailed
+in the "advanced" vignette.
+\item \code{all_rows} is not supported in \code{epix_slide}; since the slide
+computations are allowed more flexibility in their outputs than in
+\code{epi_slide}, we can't guess a good representation for missing computations
+for excluded group-\code{ref_time_value} pairs.
+\item The \code{ref_time_values} default for \code{epix_slide} is based on making an
+evenly-spaced sequence out of the \code{version}s in the \code{DT} plus the
+\code{versions_end}, rather than the \code{time_value}s.
+}
+
+Apart from the above distinctions, the interfaces between \code{epix_slide()} and
+\code{epi_slide()} are the same.
+
+Furthermore, the current function can be considerably slower than
+\code{epi_slide()}, for two reasons: (1) it must repeatedly fetch
+properly-versioned snapshots from the data archive (via its \code{as_of()}
+method), and (2) it performs a "manual" sliding of sorts, and does not
+benefit from the highly efficient \code{slider} package. For this reason, it
+should never be used in place of \code{epi_slide()}, and only used when
+version-aware sliding is necessary (as is its purpose).
+
+Finally, this is simply a wrapper around the \code{slide()} method of the
+\code{epi_archive} and \code{grouped_epi_archive} classes, so if \code{x} is an
+object of either of these classes, then:
+
+\if{html}{\out{<div class="sourceCode">
}}\preformatted{epix_slide(x, new_var = comp(old_var), before = 119) +}\if{html}{\out{
}} + +is equivalent to: + +\if{html}{\out{
}}\preformatted{x$slide(new_var = comp(old_var), before = 119) +}\if{html}{\out{
}}

Mutation and aliasing: \code{epix_slide} and \verb{$slide} will not perform in-place
mutation of the input archives on their own. In some edge cases the inputs they
feed to the slide computations may alias parts of the input archive, so copy
the slide computation inputs if needed before using mutating operations like
\code{data.table}'s \verb{:=} operator. Similarly, in some edge cases, the output of
the slide operation may alias parts of the input archive, so make
sure to clone and/or copy appropriately before using in-place mutation.
}
\examples{
library(dplyr)

# Reference time points for which we want to compute slide values:
ref_time_values <- seq(as.Date("2020-06-01"),
  as.Date("2020-06-15"),
  by = "1 day"
)

# A simple (but not very useful) example (see the archive vignette for a more
# realistic one):
archive_cases_dv_subset_2 \%>\%
  group_by(geo_value) \%>\%
  epix_slide2(
    f = ~ mean(.x$case_rate_7d_av),
    before = 2,
    ref_time_values = ref_time_values,
    new_col_name = "case_rate_7d_av_recent_av"
  ) \%>\%
  ungroup()
# We requested time windows that started 2 days before the corresponding time
# values. The actual number of `time_value`s in each computation depends on
# the reporting latency of the signal and `time_value` range covered by the
# archive (2020-06-01 -- 2021-11-30 in this example). In this case, we have
# * 0 `time_value`s, for ref time 2020-06-01 --> the result is automatically
#   discarded
# * 1 `time_value`, for ref time 2020-06-02
# * 2 `time_value`s, for the rest of the results
# * never the 3 `time_value`s we would get from `epi_slide`, since, because
#   of data latency, we'll never have an observation
#   `time_value == ref_time_value` as of `ref_time_value`.
# The example below shows this type of behavior in more detail.

# Examining characteristics of the data passed to each computation with
# `all_versions=FALSE`.
archive_cases_dv_subset_2 \%>\%
  group_by(geo_value) \%>\%
  epix_slide2(
    function(x, gk, rtv) {
      tibble(
        time_range = if (nrow(x) == 0L) {
          "0 `time_value`s"
        } else {
          sprintf("\%s -- \%s", min(x$time_value), max(x$time_value))
        },
        n = nrow(x),
        class1 = class(x)[[1L]]
      )
    },
    before = 5, all_versions = FALSE,
    ref_time_values = ref_time_values, names_sep = NULL
  ) \%>\%
  ungroup() \%>\%
  arrange(geo_value, time_value)

# --- Advanced: ---

# `epix_slide` with `all_versions=FALSE` (the default) applies a
# version-unaware computation to several versions of the data. We can also
# use `all_versions=TRUE` to apply a version-*aware* computation to several
# versions of the data, again looking at characteristics of the data passed
# to each computation.
In this case, each computation should expect an +# `epi_archive` containing the relevant version data: + +archive_cases_dv_subset_2 \%>\% + group_by(geo_value) \%>\% + epix_slide2( + function(x, gk, rtv) { + tibble( + versions_start = if (nrow(x$DT) == 0L) { + "NA (0 rows)" + } else { + toString(min(x$DT$version)) + }, + versions_end = x$versions_end, + time_range = if (nrow(x$DT) == 0L) { + "0 `time_value`s" + } else { + sprintf("\%s -- \%s", min(x$DT$time_value), max(x$DT$time_value)) + }, + n = nrow(x$DT), + class1 = class(x)[[1L]] + ) + }, + before = 5, all_versions = TRUE, + ref_time_values = ref_time_values, names_sep = NULL + ) \%>\% + ungroup() \%>\% + # Focus on one geo_value so we can better see the columns above: + filter(geo_value == "ca") \%>\% + select(-geo_value) + +} diff --git a/man/epix_truncate_versions_after.Rd b/man/epix_truncate_versions_after.Rd index 8f741418..f30be07f 100644 --- a/man/epix_truncate_versions_after.Rd +++ b/man/epix_truncate_versions_after.Rd @@ -1,9 +1,12 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/methods-epi_archive.R +% Please edit documentation in R/methods-epi_archive.R, +% R/methods-epi_archive_new.R \name{epix_truncate_versions_after} \alias{epix_truncate_versions_after} \title{Filter an \code{epi_archive} object to keep only older versions} \usage{ +epix_truncate_versions_after(x, max_version) + epix_truncate_versions_after(x, max_version) } \arguments{ @@ -15,9 +18,14 @@ current archive data having \code{version} less than or equal to the specified \code{max_version}} } \value{ +An \code{epi_archive} object + An \code{epi_archive} object } \description{ +Generates a filtered \code{epi_archive} from an \code{epi_archive} object, keeping +only rows with \code{version} falling on or before a specified date. + Generates a filtered \code{epi_archive} from an \code{epi_archive} object, keeping only rows with \code{version} falling on or before a specified date. 
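+
+For instance, a minimal sketch (using the \code{archive_cases_dv_subset_2}
+dataset that appears in other examples in this package; the exact rows kept
+depend on that archive's contents):
+
+\preformatted{# Roll the archive back to what was known as of mid-June 2020; updates
+# published after `max_version` are discarded:
+epix_truncate_versions_after(
+  archive_cases_dv_subset_2,
+  max_version = as.Date("2020-06-15")
+)
+}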
} diff --git a/man/epix_truncate_versions_after.grouped_epi_archive2.Rd b/man/epix_truncate_versions_after.grouped_epi_archive2.Rd new file mode 100644 index 00000000..5fba48fb --- /dev/null +++ b/man/epix_truncate_versions_after.grouped_epi_archive2.Rd @@ -0,0 +1,11 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/grouped_archive_new.R +\name{epix_truncate_versions_after.grouped_epi_archive2} +\alias{epix_truncate_versions_after.grouped_epi_archive2} +\title{Truncate versions after a given version, grouped} +\usage{ +\method{epix_truncate_versions_after}{grouped_epi_archive2}(grouped_epi_archive, max_version) +} +\description{ +Truncate versions after a given version, grouped +} diff --git a/man/fill_through_version.epi_archive2.Rd b/man/fill_through_version.epi_archive2.Rd new file mode 100644 index 00000000..48afb864 --- /dev/null +++ b/man/fill_through_version.epi_archive2.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/archive_new.R +\name{fill_through_version.epi_archive2} +\alias{fill_through_version.epi_archive2} +\title{Fill through version} +\usage{ +\method{fill_through_version}{epi_archive2}(epi_archive, fill_versions_end, how = c("na", "locf")) +} +\arguments{ +\item{epi_archive}{an \code{epi_archive} object} + +\item{fill_versions_end}{as in \code{\link{epix_fill_through_version}}} + +\item{how}{as in \code{\link{epix_fill_through_version}}} +} +\description{ +Fill in unobserved history using requested scheme by mutating +the given object and potentially reseating its fields. See +\code{\link{epix_fill_through_version}}, which doesn't mutate the input archive but +might alias its fields. +} diff --git a/man/group_by.epi_archive.Rd b/man/group_by.epi_archive.Rd index 5e867bf3..f157e834 100644 --- a/man/group_by.epi_archive.Rd +++ b/man/group_by.epi_archive.Rd @@ -1,8 +1,14 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/methods-epi_archive.R, R/grouped_epi_archive.R +% Please edit documentation in R/methods-epi_archive.R, R/grouped_archive_new.R, +% R/grouped_epi_archive.R \name{group_by.epi_archive} \alias{group_by.epi_archive} \alias{grouped_epi_archive} +\alias{group_by.grouped_epi_archive2} +\alias{group_by_drop_default.grouped_epi_archive2} +\alias{groups.grouped_epi_archive2} +\alias{ungroup.grouped_epi_archive2} +\alias{is_grouped_epi_archive2} \alias{group_by.grouped_epi_archive} \alias{groups.grouped_epi_archive} \alias{ungroup.grouped_epi_archive} @@ -12,6 +18,21 @@ \usage{ \method{group_by}{epi_archive}(.data, ..., .add = FALSE, .drop = dplyr::group_by_drop_default(.data)) +\method{group_by}{grouped_epi_archive2}( + grouped_epi_archive, + ..., + .add = FALSE, + .drop = dplyr::group_by_drop_default(grouped_epi_archive) +) + +\method{group_by_drop_default}{grouped_epi_archive2}(grouped_epi_archive) + +\method{groups}{grouped_epi_archive2}(grouped_epi_archive) + +\method{ungroup}{grouped_epi_archive2}(grouped_epi_archive, ...) 
+ +is_grouped_epi_archive2(x) + \method{group_by}{grouped_epi_archive}(.data, ..., .add = FALSE, .drop = dplyr::group_by_drop_default(.data)) \method{groups}{grouped_epi_archive}(x) diff --git a/man/group_by.epi_archive2.Rd b/man/group_by.epi_archive2.Rd new file mode 100644 index 00000000..fa9040c3 --- /dev/null +++ b/man/group_by.epi_archive2.Rd @@ -0,0 +1,147 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/archive_new.R +\name{group_by.epi_archive2} +\alias{group_by.epi_archive2} +\alias{grouped_epi_archive} +\title{\code{group_by} and related methods for \code{epi_archive}, \code{grouped_epi_archive}} +\usage{ +\method{group_by}{epi_archive2}( + epi_archive, + ..., + .add = FALSE, + .drop = dplyr::group_by_drop_default(epi_archive) +) +} +\arguments{ +\item{...}{Similar to \code{\link[dplyr:group_by]{dplyr::group_by}} (see "Details:" for edge cases); +\itemize{ +\item For \code{group_by}: unquoted variable name(s) or other +\link[dplyr:dplyr_data_masking]{"data masking"} expression(s). It's possible to +use \code{\link[dplyr:mutate]{dplyr::mutate}}-like syntax here to calculate new columns on which to +perform grouping, but note that, if you are regrouping an already-grouped +\code{.data} object, the calculations will be carried out ignoring such grouping +(same as \link[dplyr:group_by]{in dplyr}). +\item For \code{ungroup}: either +\itemize{ +\item empty, in order to remove the grouping and output an \code{epi_archive}; or +\item variable name(s) or other \link[dplyr:dplyr_tidy_select]{"tidy-select"} +expression(s), in order to remove the matching variables from the list of +grouping variables, and output another \code{grouped_epi_archive}. +} +}} + +\item{.add}{Boolean. If \code{FALSE}, the default, the output will be grouped by +the variable selection from \code{...} only; if \code{TRUE}, the output will be +grouped by the current grouping variables plus the variable selection from +\code{...}.} + +\item{.drop}{As described in \code{\link[dplyr:group_by]{dplyr::group_by}}; determines treatment of +factor columns.} + +\item{.data}{An \code{epi_archive} or \code{grouped_epi_archive}} + +\item{x}{For \code{groups} or \code{ungroup}: a \code{grouped_epi_archive}; for +\code{is_grouped_epi_archive}: any object} + +\item{.tbl}{(For \code{group_by_drop_default}:) an \code{epi_archive} or +\code{grouped_epi_archive} (\code{epi_archive} dispatches to the S3 default method; +\code{grouped_epi_archive} dispatches its own S3 method)} +} +\description{ +\code{group_by} and related methods for \code{epi_archive}, \code{grouped_epi_archive} +} +\details{ +To match \code{dplyr}, \code{group_by} allows "data masking" (also referred to as +"tidy evaluation") expressions \code{...}, not just column names, in a way similar +to \code{mutate}. Note that replacing or removing key columns with these +expressions is disabled. + +\code{archive \%>\% group_by()} and other expressions that group or regroup by zero +columns (indicating that all rows should be treated as part of one large +group) will output a \code{grouped_epi_archive}, in order to enable the use of +\code{grouped_epi_archive} methods on the result. This is in slight contrast to +the same operations on tibbles and grouped tibbles, which will \emph{not} output a +\code{grouped_df} in these circumstances. + +Using \code{group_by} with \code{.add=FALSE} to override the existing grouping is +disabled; instead, \code{ungroup} first then \code{group_by}. 
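+
+For instance, a minimal sketch of that regrouping pattern (assuming an
+archive with an \code{age_group} column, such as the \code{toy_archive} built in the
+examples below):
+
+\preformatted{toy_archive \%>\%
+  group_by(geo_value) \%>\%
+  ungroup() \%>\%        # drop the existing grouping first
+  group_by(age_group)  # ...then regroup by the new variable(s)
+}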
+
+Mutation and aliasing: \code{group_by} tries to use a shallow copy of the \code{DT},
+introducing column-level aliasing between its input and its result. This
+doesn't follow the general model for most \code{data.table} operations, which
+seems to be that, given a nonaliased (i.e., unique) pointer to a
+\code{data.table} object, its pointers to its columns should also be nonaliased.
+If you mutate any of the columns of either the input or result, first ensure
+that it is fine if columns of the other are also mutated, but do not rely on
+such behavior to occur. Additionally, never perform mutation on the key
+columns at all (except for strictly increasing transformations), as this will
+invalidate sortedness assumptions about the rows.
+
+\code{group_by_drop_default} on (ungrouped) \code{epi_archive}s is expected to dispatch
+to \code{group_by_drop_default.default} (but there is a dedicated method for
+\code{grouped_epi_archive}s).
+}
+\examples{
+
+grouped_archive <- archive_cases_dv_subset_2 \%>\% group_by(geo_value)
+
+# `print` for metadata and method listing:
+grouped_archive \%>\% print()
+
+# The primary use for grouping is to perform a grouped `epix_slide`:
+
+archive_cases_dv_subset_2 \%>\%
+  group_by(geo_value) \%>\%
+  epix_slide2(
+    f = ~ mean(.x$case_rate_7d_av),
+    before = 2,
+    ref_time_values = as.Date("2020-06-11") + 0:2,
+    new_col_name = "case_rate_3d_av"
+  ) \%>\%
+  ungroup()
+
+# -----------------------------------------------------------------
+
+# Advanced: some other features of dplyr grouping are implemented:
+
+library(dplyr)
+toy_archive <-
+  tribble(
+    ~geo_value, ~age_group, ~time_value, ~version, ~value,
+    "us", "adult", "2000-01-01", "2000-01-02", 121,
+    "us", "pediatric", "2000-01-02", "2000-01-03", 5, # (addition)
+    "us", "adult", "2000-01-01", "2000-01-03", 125, # (revision)
+    "us", "adult", "2000-01-02", "2000-01-03", 130 # (addition)
+  ) \%>\%
+  mutate(
+    age_group = ordered(age_group, c("pediatric", "adult")),
+    time_value = as.Date(time_value),
+    version = as.Date(version)
+  ) \%>\%
+  as_epi_archive2(other_keys = "age_group")
+
+# The following are equivalent:
+toy_archive \%>\% group_by(geo_value, age_group)
+toy_archive \%>\%
+  group_by(geo_value) \%>\%
+  group_by(age_group, .add = TRUE)
+grouping_cols <- c("geo_value", "age_group")
+toy_archive \%>\% group_by(across(all_of(grouping_cols)))
+
+# And these are equivalent:
+toy_archive \%>\% group_by(geo_value)
+toy_archive \%>\%
+  group_by(geo_value, age_group) \%>\%
+  ungroup(age_group)
+
+# To get the grouping variable names as a `list` of `name`s (a.k.a. symbols):
+toy_archive \%>\%
+  group_by(geo_value) \%>\%
+  groups()
+
+toy_archive \%>\%
+  group_by(geo_value, age_group, .drop = FALSE) \%>\%
+  epix_slide2(f = ~ sum(.x$value), before = 20) \%>\%
+  ungroup()
+
+}
diff --git a/man/is_epi_archive2.Rd b/man/is_epi_archive2.Rd
new file mode 100644
index 00000000..df258d3e
--- /dev/null
+++ b/man/is_epi_archive2.Rd
@@ -0,0 +1,35 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/archive_new.R
+\name{is_epi_archive2}
+\alias{is_epi_archive2}
+\title{Test for \code{epi_archive} format}
+\usage{
+is_epi_archive2(x, grouped_okay = FALSE)
+}
+\arguments{
+\item{x}{An object.}
+
+\item{grouped_okay}{Optional; Boolean; should a \code{grouped_epi_archive} also
+count? Default is \code{FALSE}.}
+}
+\value{
+\code{TRUE} if the object inherits from \code{epi_archive}.
+}
+\description{
+Test for \code{epi_archive} format
+}
+\examples{
+is_epi_archive2(jhu_csse_daily_subset) # FALSE (this is an epi_df, not epi_archive)
+is_epi_archive2(archive_cases_dv_subset_2) # TRUE
+
+# By default, grouped_epi_archives don't count as epi_archives, as they may
+# support a different set of operations from regular `epi_archives`. This
+# behavior can be controlled by `grouped_okay`.
+grouped_archive <- archive_cases_dv_subset_2 \%>\% group_by(geo_value)
+is_epi_archive2(grouped_archive) # FALSE
+is_epi_archive2(grouped_archive, grouped_okay = TRUE) # TRUE
+
+}
+\seealso{
+\code{\link{is_grouped_epi_archive}}
+}
diff --git a/man/max_version_with_row_in.Rd b/man/max_version_with_row_in.Rd
index cca554fa..6f0d35b3 100644
--- a/man/max_version_with_row_in.Rd
+++ b/man/max_version_with_row_in.Rd
@@ -1,18 +1,25 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/archive.R
+% Please edit documentation in R/archive.R, R/archive_new.R
 \name{max_version_with_row_in}
 \alias{max_version_with_row_in}
 \title{\code{max(x$version)}, with error if \code{x} has 0 rows}
 \usage{
+max_version_with_row_in(x)
+
 max_version_with_row_in(x)
 }
 \arguments{
 \item{x}{\code{x} argument of \code{\link{as_epi_archive}}}
 }
 \value{
+\code{max(x$version)} if it has any rows; raises error if it has 0 rows or
+an \code{NA} version value
+
 \code{max(x$version)} if it has any rows; raises error if it has 0 rows or
 an \code{NA} version value
 }
 \description{
+Exported to make defaults more easily copyable.
+
 Exported to make defaults more easily copyable.
 }
diff --git a/man/merge_epi_archive2.Rd b/man/merge_epi_archive2.Rd
new file mode 100644
index 00000000..dd1e671e
--- /dev/null
+++ b/man/merge_epi_archive2.Rd
@@ -0,0 +1,30 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/archive_new.R
+\name{merge_epi_archive2}
+\alias{merge_epi_archive2}
+\title{Merge epi archive}
+\usage{
+merge_epi_archive2(
+  x,
+  y,
+  sync = c("forbid", "na", "locf", "truncate"),
+  compactify = TRUE
+)
+}
+\arguments{
+\item{x}{as in \code{\link{epix_merge}}}
+
+\item{y}{as in \code{\link{epix_merge}}}
+
+\item{sync}{as in \code{\link{epix_merge}}}
+
+\item{compactify}{as in \code{\link{epix_merge}}}
+}
+\description{
+Merges another \code{epi_archive} with the current one, mutating the
+current one by reseating its \code{DT} and several other fields, but avoiding
+mutation of the old \code{DT}; returns the current archive
+\link[base:invisible]{invisibly}. See \code{\link{epix_merge}} for a full description
+of the non-R6-method version, which does not mutate either archive, and
+does not alias either archive's \code{DT}.
+}
diff --git a/man/new_epi_archive2.Rd b/man/new_epi_archive2.Rd
new file mode 100644
index 00000000..52141190
--- /dev/null
+++ b/man/new_epi_archive2.Rd
@@ -0,0 +1,69 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/archive_new.R
+\name{new_epi_archive2}
+\alias{new_epi_archive2}
+\title{New epi archive}
+\usage{
+new_epi_archive2(
+  x,
+  geo_type = NULL,
+  time_type = NULL,
+  other_keys = NULL,
+  additional_metadata = NULL,
+  compactify = NULL,
+  clobberable_versions_start = NA,
+  versions_end = NULL
+)
+}
+\arguments{
+\item{x}{A data.frame, data.table, or tibble, with columns \code{geo_value},
+\code{time_value}, \code{version}, and then any additional number of columns.}
+
+\item{geo_type}{Type for the geo values.
If missing, then the function will
+attempt to infer it from the geo values present; if this fails, then it
+will be set to "custom".}
+
+\item{time_type}{Type for the time values. If missing, then the function will
+attempt to infer it from the time values present; if this fails, then it
+will be set to "custom".}
+
+\item{other_keys}{Character vector specifying the names of variables in \code{x}
+that should be considered key variables (in the language of \code{data.table})
+apart from "geo_value", "time_value", and "version".}
+
+\item{additional_metadata}{List of additional metadata to attach to the
+\code{epi_archive} object. The metadata will have \code{geo_type} and \code{time_type}
+fields; named entries from the passed list will be included as well.}
+
+\item{compactify}{Optional; Boolean or \code{NULL}: should we remove rows that are
+considered redundant for the purposes of \code{epi_archive}'s built-in methods
+such as \code{as_of}? As these methods use the last version of each observation
+carried forward (LOCF) to interpolate between the version data provided,
+rows that don't change these LOCF results can potentially be omitted to
+save space while maintaining the same behavior (with the help of the
+\code{clobberable_versions_start} and \code{versions_end} fields in some edge cases).
+\code{TRUE} will remove these rows, \code{FALSE} will not, and missing or \code{NULL} will
+remove these rows and issue a warning. Generally, this can be set to
+\code{TRUE}, but if you directly inspect or edit the fields of the \code{epi_archive}
+such as its \code{DT}, or rely on redundant updates to achieve a certain
+behavior of the \code{ref_time_values} default in \code{epix_slide}, you will have to
+determine whether \code{compactify=TRUE} will produce the desired results. If
+compactification here is removing a large proportion of the rows, this may
+indicate a potential for space, time, or bandwidth savings upstream in the
+data pipeline, e.g., by avoiding fetching, storing, or processing these
+rows of \code{x}.}
+
+\item{clobberable_versions_start}{Optional; as in \code{\link{as_epi_archive}}}
+
+\item{versions_end}{Optional; as in \code{\link{as_epi_archive}}}
+}
+\value{
+An \code{epi_archive} object.
+}
+\description{
+Creates a new \code{epi_archive} object.
+}
+\details{
+Refer to the documentation for \code{\link[=as_epi_archive]{as_epi_archive()}} for more information
+and examples of parameter names.
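+
+A minimal construction sketch (the \code{update_tbl} name is only illustrative;
+column names follow the requirements on \code{x} above, and \code{geo_type} and
+\code{time_type} are left to be inferred):
+
+\preformatted{update_tbl <- data.frame(
+  geo_value = "ca",
+  time_value = as.Date("2020-01-01") + 0:2,
+  version = as.Date("2020-01-02") + 0:2,
+  value = 1:3
+)
+new_epi_archive2(update_tbl, compactify = TRUE)
+}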
+} diff --git a/man/next_after.Rd b/man/next_after.Rd index 5170e8d9..82fd3ebb 100644 --- a/man/next_after.Rd +++ b/man/next_after.Rd @@ -1,17 +1,23 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/archive.R +% Please edit documentation in R/archive.R, R/archive_new.R \name{next_after} \alias{next_after} \title{Get the next possible value greater than \code{x} of the same type} \usage{ +next_after(x) + next_after(x) } \arguments{ \item{x}{the starting "value"(s)} } \value{ +same class, typeof, and length as \code{x} + same class, typeof, and length as \code{x} } \description{ +Get the next possible value greater than \code{x} of the same type + Get the next possible value greater than \code{x} of the same type } diff --git a/man/print.epi_archive2.Rd b/man/print.epi_archive2.Rd new file mode 100644 index 00000000..0105c47e --- /dev/null +++ b/man/print.epi_archive2.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/archive_new.R +\name{print.epi_archive2} +\alias{print.epi_archive2} +\title{Print information about an \code{epi_archive} object} +\usage{ +\method{print}{epi_archive2}(epi_archive, class = TRUE, methods = TRUE) +} +\arguments{ +\item{class}{Boolean; whether to print the class label header} + +\item{methods}{Boolean; whether to print all available methods of +the archive} +} +\description{ +Print information about an \code{epi_archive} object +} diff --git a/man/slide.epi_archive2.Rd b/man/slide.epi_archive2.Rd new file mode 100644 index 00000000..54db5636 --- /dev/null +++ b/man/slide.epi_archive2.Rd @@ -0,0 +1,101 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/archive_new.R +\name{slide.epi_archive2} +\alias{slide.epi_archive2} +\title{Slide over epi archive} +\usage{ +\method{slide}{epi_archive2}( + epi_archive, + f, + ..., + before, + ref_time_values, + time_step, + new_col_name = "slide_value", + as_list_col = FALSE, + names_sep = "_", + all_versions = FALSE +) +} +\arguments{ +\item{f}{Function, formula, or missing; together with \code{...} specifies the +computation to slide. To "slide" means to apply a computation over a +sliding (a.k.a. "rolling") time window for each data group. The window is +determined by the \code{before} parameter described below. One time step is +typically one day or one week; see \code{\link{epi_slide}} details for more +explanation. If a function, \code{f} must take an \code{epi_df} with the same +column names as the archive's \code{DT}, minus the \code{version} column; followed +by a one-row tibble containing the values of the grouping variables for +the associated group; followed by a reference time value, usually as a +\code{Date} object; followed by any number of named arguments. If a formula, +\code{f} can operate directly on columns accessed via \code{.x$var} or \code{.$var}, as +in \code{~ mean (.x$var)} to compute a mean of a column \code{var} for each +group-\code{ref_time_value} combination. The group key can be accessed via +\code{.y} or \code{.group_key}, and the reference time value can be accessed via +\code{.z} or \code{.ref_time_value}. If \code{f} is missing, then \code{...} will specify the +computation.} + +\item{...}{Additional arguments to pass to the function or formula specified +via \code{f}. 
Alternatively, if \code{f} is missing, then \code{...} is interpreted as an
+expression for tidy evaluation; in addition to referring to columns
+directly by name, the expression has access to \code{.data} and \code{.env} pronouns
+as in \code{dplyr} verbs, and can also refer to the \code{.group_key} and
+\code{.ref_time_value}. See details of \code{\link{epi_slide}}.}
+
+\item{before}{How far \code{before} each \code{ref_time_value} should the sliding
+window extend? If provided, should be a single, non-NA,
+\link[vctrs:vec_cast]{integer-compatible} number of time steps. This window
+endpoint is inclusive. For example, if \code{before = 7}, and one time step is
+one day, then to produce a value for a \code{ref_time_value} of January 8, we
+apply the given function or formula to data (for each group present) with
+\code{time_value}s from January 1 onward, as they were reported on January 8.
+For typical disease surveillance sources, this will not include any data
+with a \code{time_value} of January 8, and, depending on the amount of reporting
+latency, may not include January 7 or even earlier \code{time_value}s. (If
+instead the archive were to hold nowcasts instead of regular surveillance
+data, then we would indeed expect data for \code{time_value} January 8. If it
+were to hold forecasts, then we would expect data for \code{time_value}s after
+January 8, and the sliding window would extend as far after each
+\code{ref_time_value} as needed to include all such \code{time_value}s.)}
+
+\item{ref_time_values}{Reference time values / versions for sliding
+computations; each element of this vector serves both as the anchor point
+for the \code{time_value} window for the computation and the \code{max_version}
+\code{as_of} which we fetch data in this window. If missing, then this will be
+set to a regularly-spaced sequence of values covering the range of
+\code{version}s in the \code{DT} plus the \code{versions_end}; the spacing of values will
+be guessed (using the GCD of the skips between values).}
+
+\item{time_step}{Optional function used to define the meaning of one time
+step, which if specified, overrides the default choice based on the
+\code{time_value} column. This function must take a positive integer and return
+an object of class \code{lubridate::period}. For example, we can use \code{time_step = lubridate::hours} in order to set the time step to be one hour (this
+would only be meaningful if \code{time_value} is of class \code{POSIXct}).}
+
+\item{new_col_name}{String indicating the name of the new column that will
+contain the slide values. Default is "slide_value"; note that setting
+\code{new_col_name} equal to an existing column name will overwrite this column.}
+
+\item{as_list_col}{Should the slide results be held in a list column, or be
+\link[tidyr:chop]{unchopped}/\link[tidyr:unnest]{unnested}? Default is \code{FALSE},
+in which case a list object returned by \code{f} would be unnested (using
+\code{\link[tidyr:unnest]{tidyr::unnest()}}), and, if the slide computations output data frames,
+the names of the resulting columns are given by prepending \code{new_col_name}
+to the names of the list elements.}
+
+\item{names_sep}{String specifying the separator to use in \code{tidyr::unnest()}
+when \code{as_list_col = FALSE}. Default is "_". Using \code{NULL} drops the prefix
+from \code{new_col_name} entirely.}
+
+\item{all_versions}{(Not the same as \code{all_rows} parameter of \code{epi_slide}.)
If
+\code{all_versions = TRUE}, then \code{f} will be passed the version history (all
+\code{version <= ref_time_value}) for rows having \code{time_value} between
+\code{ref_time_value - before} and \code{ref_time_value}. Otherwise, \code{f} will be
+passed only the most recent \code{version} for every unique \code{time_value}.
+Default is \code{FALSE}.}
+}
+\description{
+Slides a given function over variables in an \code{epi_archive}
+object. See the documentation for the wrapper function \code{\link[=epix_slide]{epix_slide()}} for
+details. The parameter descriptions below are copied from there.
+}
diff --git a/man/slide.grouped_epi_archive2.Rd b/man/slide.grouped_epi_archive2.Rd
new file mode 100644
index 00000000..b5aac24c
--- /dev/null
+++ b/man/slide.grouped_epi_archive2.Rd
@@ -0,0 +1,24 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/grouped_archive_new.R
+\name{slide.grouped_epi_archive2}
+\alias{slide.grouped_epi_archive2}
+\title{Slide over grouped epi archive}
+\usage{
+\method{slide}{grouped_epi_archive2}(
+  grouped_epi_archive,
+  f,
+  ...,
+  before,
+  ref_time_values,
+  time_step,
+  new_col_name = "slide_value",
+  as_list_col = FALSE,
+  names_sep = "_",
+  all_versions = FALSE
+)
+}
+\description{
+Slides a given function over variables in a \code{grouped_epi_archive}
+object. See the documentation for the wrapper function \code{\link[=epix_slide]{epix_slide()}} for
+details.
+}
diff --git a/man/truncate_versions_after.epi_archive2.Rd b/man/truncate_versions_after.epi_archive2.Rd
new file mode 100644
index 00000000..08ae40d4
--- /dev/null
+++ b/man/truncate_versions_after.epi_archive2.Rd
@@ -0,0 +1,19 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/archive_new.R
+\name{truncate_versions_after.epi_archive2}
+\alias{truncate_versions_after.epi_archive2}
+\title{Truncate versions after}
+\usage{
+\method{truncate_versions_after}{epi_archive2}(epi_archive, max_version)
+}
+\arguments{
+\item{epi_archive}{as in \code{\link{epix_truncate_versions_after}}}
+
+\item{max_version}{as in \code{\link{epix_truncate_versions_after}}}
+}
+\description{
+Filter to keep only older versions, mutating the archive by
+potentially reseating but not mutating some fields. \code{DT} is likely, but not
+guaranteed, to be copied. Returns the mutated archive
+\link[base:invisible]{invisibly}.
+}
diff --git a/man/truncate_versions_after.grouped_epi_archive2.Rd b/man/truncate_versions_after.grouped_epi_archive2.Rd
new file mode 100644
index 00000000..7c25950f
--- /dev/null
+++ b/man/truncate_versions_after.grouped_epi_archive2.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/grouped_archive_new.R
+\name{truncate_versions_after.grouped_epi_archive2}
+\alias{truncate_versions_after.grouped_epi_archive2}
+\title{Truncate versions after a given version, grouped}
+\usage{
+\method{truncate_versions_after}{grouped_epi_archive2}(grouped_epi_archive, max_version)
+}
+\arguments{
+\item{max_version}{as in \code{\link{epix_truncate_versions_after}}}
+
+\item{x}{as in \code{\link{epix_truncate_versions_after}}}
+}
+\description{
+Filter to keep only older versions by mutating the underlying
+\code{epi_archive} using \verb{$truncate_versions_after}. Returns the mutated
+\code{grouped_epi_archive} \link[base:invisible]{invisibly}.
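+
+A minimal sketch of the intended call pattern (assuming a grouped archive
+built from \code{archive_cases_dv_subset_2}; the \code{grouped_archive} name is only
+illustrative):
+
+\preformatted{grouped_archive <- archive_cases_dv_subset_2 \%>\% group_by(geo_value)
+# Drop all updates recorded after this version from the underlying archive:
+truncate_versions_after(grouped_archive, max_version = as.Date("2020-06-15"))
+}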
+} diff --git a/tests/testthat/test-archive_new.R b/tests/testthat/test-archive_new.R new file mode 100644 index 00000000..98f708d7 --- /dev/null +++ b/tests/testthat/test-archive_new.R @@ -0,0 +1,173 @@ +library(dplyr) + +test_that("first input must be a data.frame", { + expect_error(as_epi_archive2(c(1, 2, 3), compactify = FALSE), + regexp = "Must be of type 'data.frame'." + ) +}) + +dt <- archive_cases_dv_subset_2$DT + +test_that("data.frame must contain geo_value, time_value and version columns", { + expect_error(as_epi_archive2(select(dt, -geo_value), compactify = FALSE), + regexp = "Columns `geo_value`, `time_value`, and `version` must be present in `x`." + ) + expect_error(as_epi_archive2(select(dt, -time_value), compactify = FALSE), + regexp = "Columns `geo_value`, `time_value`, and `version` must be present in `x`." + ) + expect_error(as_epi_archive2(select(dt, -version), compactify = FALSE), + regexp = "Columns `geo_value`, `time_value`, and `version` must be present in `x`." + ) +}) + +test_that("other_keys can only contain names of the data.frame columns", { + expect_error(as_epi_archive2(dt, other_keys = "xyz", compactify = FALSE), + regexp = "`other_keys` must be contained in the column names of `x`." + ) + expect_error(as_epi_archive2(dt, other_keys = "percent_cli", compactify = FALSE), NA) +}) + +test_that("other_keys cannot contain names geo_value, time_value or version", { + expect_error(as_epi_archive2(dt, other_keys = "geo_value", compactify = FALSE), + regexp = "`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\"." + ) + expect_error(as_epi_archive2(dt, other_keys = "time_value", compactify = FALSE), + regexp = "`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\"." + ) + expect_error(as_epi_archive2(dt, other_keys = "version", compactify = FALSE), + regexp = "`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\"." + ) +}) + +test_that("Warning thrown when other_metadata contains overlapping names with geo_type or time_type fields", { + expect_warning(as_epi_archive2(dt, additional_metadata = list(geo_type = 1), compactify = FALSE), + regexp = "`additional_metadata` names overlap with existing metadata fields \"geo_type\", \"time_type\"." + ) + expect_warning(as_epi_archive2(dt, additional_metadata = list(time_type = 1), compactify = FALSE), + regexp = "`additional_metadata` names overlap with existing metadata fields \"geo_type\", \"time_type\"." 
+ ) +}) + +test_that("epi_archives are correctly instantiated with a variety of data types", { + # Data frame + df <- data.frame( + geo_value = "ca", + time_value = as.Date("2020-01-01"), + version = as.Date("2020-01-01") + 0:19, + value = 1:20 + ) + + ea1 <- as_epi_archive2(df, compactify = FALSE) + expect_equal(key(ea1$DT), c("geo_value", "time_value", "version")) + expect_equal(ea1$additional_metadata, list()) + + ea2 <- as_epi_archive2(df, other_keys = "value", additional_metadata = list(value = df$value), compactify = FALSE) + expect_equal(key(ea2$DT), c("geo_value", "time_value", "value", "version")) + expect_equal(ea2$additional_metadata, list(value = df$value)) + + # Tibble + tib <- tibble::tibble(df, code = "x") + + ea3 <- as_epi_archive2(tib, compactify = FALSE) + expect_equal(key(ea3$DT), c("geo_value", "time_value", "version")) + expect_equal(ea3$additional_metadata, list()) + + ea4 <- as_epi_archive2(tib, other_keys = "code", additional_metadata = list(value = df$value), compactify = FALSE) + expect_equal(key(ea4$DT), c("geo_value", "time_value", "code", "version")) + expect_equal(ea4$additional_metadata, list(value = df$value)) + + # Keyed data.table + kdt <- data.table::data.table( + geo_value = "ca", + time_value = as.Date("2020-01-01"), + version = as.Date("2020-01-01") + 0:19, + value = 1:20, + code = "CA", + key = "code" + ) + + ea5 <- as_epi_archive2(kdt, compactify = FALSE) + # Key from data.table isn't absorbed when as_epi_archive2 is used + expect_equal(key(ea5$DT), c("geo_value", "time_value", "version")) + expect_equal(ea5$additional_metadata, list()) + + ea6 <- as_epi_archive2(kdt, other_keys = "value", additional_metadata = list(value = df$value), compactify = FALSE) + # Mismatched keys, but the one from as_epi_archive2 overrides + expect_equal(key(ea6$DT), c("geo_value", "time_value", "value", "version")) + expect_equal(ea6$additional_metadata, list(value = df$value)) + + # Unkeyed data.table + udt <- data.table::data.table( + geo_value = "ca", + time_value = as.Date("2020-01-01"), + version = as.Date("2020-01-01") + 0:19, + value = 1:20, + code = "CA" + ) + + ea7 <- as_epi_archive2(udt, compactify = FALSE) + expect_equal(key(ea7$DT), c("geo_value", "time_value", "version")) + expect_equal(ea7$additional_metadata, list()) + + ea8 <- as_epi_archive2(udt, other_keys = "code", additional_metadata = list(value = df$value), compactify = FALSE) + expect_equal(key(ea8$DT), c("geo_value", "time_value", "code", "version")) + expect_equal(ea8$additional_metadata, list(value = df$value)) + + # epi_df + edf1 <- jhu_csse_daily_subset %>% + select(geo_value, time_value, cases) %>% + mutate(version = max(time_value), code = "USA") + + ea9 <- as_epi_archive2(edf1, compactify = FALSE) + expect_equal(key(ea9$DT), c("geo_value", "time_value", "version")) + expect_equal(ea9$additional_metadata, list()) + + ea10 <- as_epi_archive2(edf1, other_keys = "code", additional_metadata = list(value = df$value), compactify = FALSE) + expect_equal(key(ea10$DT), c("geo_value", "time_value", "code", "version")) + expect_equal(ea10$additional_metadata, list(value = df$value)) + + # Keyed epi_df + edf2 <- data.frame( + geo_value = "al", + time_value = rep(as.Date("2020-01-01") + 0:9, 2), + version = c( + rep(as.Date("2020-01-25"), 10), + rep(as.Date("2020-01-26"), 10) + ), + cases = 1:20, + misc = "USA" + ) %>% + as_epi_df(additional_metadata = list(other_keys = "misc")) + + ea11 <- as_epi_archive2(edf2, compactify = FALSE) + expect_equal(key(ea11$DT), c("geo_value", "time_value", "version")) + 
expect_equal(ea11$additional_metadata, list()) + + ea12 <- as_epi_archive2(edf2, other_keys = "misc", additional_metadata = list(value = df$misc), compactify = FALSE) + expect_equal(key(ea12$DT), c("geo_value", "time_value", "misc", "version")) + expect_equal(ea12$additional_metadata, list(value = df$misc)) +}) + +test_that("`epi_archive` rejects nonunique keys", { + toy_update_tbl <- + tibble::tribble( + ~geo_value, ~age_group, ~time_value, ~version, ~value, + "us", "adult", "2000-01-01", "2000-01-02", 121, + "us", "adult", "2000-01-01", "2000-01-03", 125, # (revision) + "us", "adult", "2000-01-02", "2000-01-03", 130, + "us", "pediatric", "2000-01-01", "2000-01-02", 5 + ) %>% + mutate( + age_group = ordered(age_group, c("pediatric", "adult")), + time_value = as.Date(time_value), + version = as.Date(version) + ) + expect_error( + as_epi_archive2(toy_update_tbl), + class = "epiprocess__epi_archive_requires_unique_key" + ) + expect_error( + regexp = NA, + as_epi_archive2(toy_update_tbl, other_keys = "age_group"), + ) +}) diff --git a/tests/testthat/test-compactify.R b/tests/testthat/test-compactify.R index 4400c94a..58e97884 100644 --- a/tests/testthat/test-compactify.R +++ b/tests/testthat/test-compactify.R @@ -2,7 +2,7 @@ library(epiprocess) library(data.table) library(dplyr) -dt <- archive_cases_dv_subset$DT +dt <- archive_cases_dv_subset_2$DT dt <- filter(dt, geo_value == "ca") %>% filter(version <= "2020-06-15") %>% select(-case_rate_7d_av) diff --git a/tests/testthat/test-compactify_new.R b/tests/testthat/test-compactify_new.R new file mode 100644 index 00000000..cd53913d --- /dev/null +++ b/tests/testthat/test-compactify_new.R @@ -0,0 +1,110 @@ +library(epiprocess) +library(data.table) +library(dplyr) + +dt <- archive_cases_dv_subset_2$DT +dt <- filter(dt, geo_value == "ca") %>% + filter(version <= "2020-06-15") %>% + select(-case_rate_7d_av) + +test_that("Input for compactify must be NULL or a boolean", { + expect_error(as_epi_archive2(dt, compactify = "no")) +}) + +dt$percent_cli <- c(1:80) +dt$case_rate <- c(1:80) + +row_replace <- function(dt, row, x, y) { + # (This way of "replacing" elements appears to use copy-on-write even though + # we are working with a data.table.) + dt[row, 4] <- x + dt[row, 5] <- y + dt +} + +# Note that compactify is working on version-wise LOCF (last version of each +# observation carried forward) + +# Rows 1 should not be eliminated even if NA +dt <- row_replace(dt, 1, NA, NA) # Not LOCF + +# NOTE! We are assuming that there are no NA's in geo_value, time_value, +# and version. 
Even though compactify may erroneously remove the first row +# if it has all NA's, we are not testing this behaviour for now as this dataset +# has problems beyond the scope of this test + +# Rows 11 and 12 correspond to different time_values +dt <- row_replace(dt, 12, 11, 11) # Not LOCF + +# Rows 20 and 21 only differ in version +dt <- row_replace(dt, 21, 20, 20) # LOCF + +# Rows 21 and 22 only differ in version +dt <- row_replace(dt, 22, 20, 20) # LOCF + +# Row 39 comprises the first NA's +dt <- row_replace(dt, 39, NA, NA) # Not LOCF + +# Row 40 has two NA's, just like its lag, row 39 +dt <- row_replace(dt, 40, NA, NA) # LOCF + +# Row 62's values already exist in row 15, but row 15 is not a preceding row +dt <- row_replace(dt, 62, 15, 15) # Not LOCF + +# Row 73 only has one value carried over +dt <- row_replace(dt, 74, 73, 74) # Not LOCF + +dt_true <- as_tibble(as_epi_archive2(dt, compactify = TRUE)$DT) +dt_false <- as_tibble(as_epi_archive2(dt, compactify = FALSE)$DT) +dt_null <- suppressWarnings(as_tibble(as_epi_archive2(dt, compactify = NULL)$DT)) + +test_that("Warning for LOCF with compactify as NULL", { + expect_warning(as_epi_archive2(dt, compactify = NULL)) +}) + +test_that("No warning when there is no LOCF", { + expect_warning(as_epi_archive2(dt[1:5], compactify = NULL), NA) +}) + +test_that("LOCF values are ignored with compactify=FALSE", { + expect_identical(nrow(dt), nrow(dt_false)) +}) + +test_that("LOCF values are taken out with compactify=TRUE", { + dt_test <- as_tibble(as_epi_archive2(dt[-c(21, 22, 40), ], compactify = FALSE)$DT) + + expect_identical(dt_true, dt_null) + expect_identical(dt_null, dt_test) +}) + +test_that("as_of produces the same results with compactify=TRUE as with compactify=FALSE", { + ea_true <- as_epi_archive2(dt, compactify = TRUE) + ea_false <- as_epi_archive2(dt, compactify = FALSE) + + # Row 22, an LOCF row corresponding to the latest version, is omitted in + # ea_true + latest_version <- max(ea_false$DT$version) + as_of_true <- as_of(ea_true, latest_version) + as_of_false <- as_of(ea_false, latest_version) + + expect_identical(as_of_true, as_of_false) +}) + +test_that("compactify does not alter the default clobberable and observed version bounds", { + x <- tibble::tibble( + geo_value = "geo1", + time_value = as.Date("2000-01-01"), + version = as.Date("2000-01-01") + 1:5, + value = 42L + ) + ea_true <- as_epi_archive2(x, compactify = TRUE) + ea_false <- as_epi_archive2(x, compactify = FALSE) + # We say that we base the bounds on the user's `x` arg. We might mess up or + # change our minds and base things on the `DT` field (or a temporary `DT` + # variable, post-compactify) instead. 
Check that this test would trigger + # in that case: + expect_true(max(ea_true$DT$version) != max(ea_false$DT$version)) + # The actual test: + expect_identical(ea_true$clobberable_versions_start, ea_false$clobberable_versions_start) + expect_identical(ea_true$versions_end, ea_false$versions_end) +}) diff --git a/tests/testthat/test-epix_fill_through_version_new.R b/tests/testthat/test-epix_fill_through_version_new.R new file mode 100644 index 00000000..2b76a851 --- /dev/null +++ b/tests/testthat/test-epix_fill_through_version_new.R @@ -0,0 +1,109 @@ +test_that("epix_fill_through_version2 mirrors input when it is sufficiently up to date", { + ea_orig <- as_epi_archive2(data.table::data.table( + geo_value = "g1", time_value = as.Date("2020-01-01"), + version = 1:5, value = 1:5 + )) + some_earlier_observed_version <- 2L + ea_trivial_fill_na1 <- epix_fill_through_version2(ea_orig, some_earlier_observed_version, "na") + ea_trivial_fill_na2 <- epix_fill_through_version2(ea_orig, ea_orig$versions_end, "na") + ea_trivial_fill_locf <- epix_fill_through_version2(ea_orig, some_earlier_observed_version, "locf") + # Below, we want R6 objects to be compared based on contents rather than + # addresses. We appear to get this with `expect_identical` in `testthat` + # edition 3, which is based on `waldo::compare` rather than `base::identical`; + # `waldo::compare` in waldo >=0.3.1 appears (as of 0.4.0) to compare R6 + # objects by contents rather than address (in a way that is tested but maybe + # not guaranteed via user docs). Use `testthat::local_edition` to ensure we + # use testthat edition 3 here (use `testthat::` to prevent ambiguity with + # `readr`). + testthat::local_edition(3) + expect_identical(ea_orig, ea_trivial_fill_na1) + expect_identical(ea_orig, ea_trivial_fill_na2) + expect_identical(ea_orig, ea_trivial_fill_locf) +}) + +test_that("epix_fill_through_version2 can extend observed versions, gives expected `as_of`s", { + ea_orig <- as_epi_archive2(data.table::data.table( + geo_value = "g1", + time_value = as.Date("2020-01-01") + c(rep(0L, 5L), 1L), + version = c(1:5, 2L), + value = 1:6 + )) + first_unobserved_version <- 6L + later_unobserved_version <- 10L + ea_fill_na <- epix_fill_through_version2(ea_orig, later_unobserved_version, "na") + ea_fill_locf <- epix_fill_through_version2(ea_orig, later_unobserved_version, "locf") + + # We use testthat edition 3 features here, passing `ignore_attr` to + # `waldo::compare`. Ensure we are using edition 3: + testthat::local_edition(3) + withCallingHandlers( + { + expect_identical(ea_fill_na$versions_end, later_unobserved_version) + expect_identical(tibble::as_tibble(as_of(ea_fill_na, first_unobserved_version)), + tibble::tibble(geo_value = "g1", time_value = as.Date("2020-01-01") + 0:1, value = rep(NA_integer_, 2L)), + ignore_attr = TRUE + ) + expect_identical(ea_fill_locf$versions_end, later_unobserved_version) + expect_identical( + as_of(ea_fill_locf, first_unobserved_version), + as_of(ea_fill_locf, ea_orig$versions_end) %>% + { + attr(., "metadata")$as_of <- first_unobserved_version + . + } + ) + }, + epiprocess__snapshot_as_of_clobberable_version = function(wrn) invokeRestart("muffleWarning") + ) +}) + +test_that("epix_fill_through_version2 does not mutate x", { + for (ea_orig in list( + # vanilla case + as_epi_archive2(data.table::data.table( + geo_value = "g1", time_value = as.Date("2020-01-01"), + version = 1:5, value = 1:5 + )), + # data.table unique yielding original DT by reference special case (maybe + # having only 1 row is the trigger? 
having no revisions of initial values
+    # doesn't seem sufficient to trigger)
+    as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, value = 10L))
+  )) {
+    # We want to perform a strict comparison of the contents of `ea_orig` before
+    # and `ea_orig` after. `clone` + `expect_identical` based on waldo would
+    # sort of work, but we might want something stricter. `as.list` +
+    # `identical` plus a check of the DT seems to do the trick.
+    ea_orig_before_as_list <- as.list(ea_orig)
+    ea_orig_DT_before_copy <- data.table::copy(ea_orig$DT)
+    some_unobserved_version <- 8L
+    #
+    ea_fill_na <- epix_fill_through_version2(ea_orig, some_unobserved_version, "na")
+    ea_orig_after_as_list <- as.list(ea_orig)
+    # use identical, not expect_identical, for the R6-as-list test; latter isn't as strict
+    expect_true(identical(ea_orig_before_as_list, ea_orig_after_as_list))
+    expect_identical(ea_orig_DT_before_copy, ea_orig$DT)
+    #
+    ea_fill_locf <- epix_fill_through_version2(ea_orig, some_unobserved_version, "locf")
+    ea_orig_after_as_list <- as.list(ea_orig)
+    expect_true(identical(ea_orig_before_as_list, ea_orig_after_as_list))
+    expect_identical(ea_orig_DT_before_copy, ea_orig$DT)
+  }
+})
+
+test_that("epix_fill_through_version returns with expected visibility", {
+  ea <- as_epi_archive(data.table::data.table(
+    geo_value = "g1", time_value = as.Date("2020-01-01"),
+    version = 1:5, value = 1:5
+  ))
+  expect_true(withVisible(epix_fill_through_version(ea, 10L, "na"))[["visible"]])
+})
+
+test_that("epix_fill_through_version2 returns same key & doesn't mutate old DT or its key", {
+  ea <- as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, value = 10L))
+  old_DT <- ea$DT
+  old_DT_copy <- data.table::copy(old_DT)
+  old_key <- data.table::key(ea$DT)
+  expect_identical(data.table::key(epix_fill_through_version2(ea, 5L, "na")$DT), old_key)
+  expect_identical(data.table::key(epix_fill_through_version2(ea, 5L, "locf")$DT), old_key)
+  expect_identical(data.table::key(ea$DT), old_key)
+})
diff --git a/tests/testthat/test-epix_merge_new.R b/tests/testthat/test-epix_merge_new.R
new file mode 100644
index 00000000..10041dbb
--- /dev/null
+++ b/tests/testthat/test-epix_merge_new.R
@@ -0,0 +1,226 @@
+test_that("epix_merge forbids invalid `y`", {
+  ea <- archive_cases_dv_subset_2 %>%
+    clone()
+  expect_error(epix_merge2(ea, data.frame(x = 1)))
+})
+
+test_that("epix_merge merges and carries forward updates properly", {
+  x <- as_epi_archive2(
+    data.table::as.data.table(
+      tibble::tribble(
+        ~geo_value, ~time_value, ~version, ~x_value,
+        # same version set for x and y
+        "g1", 1L, 1:3, paste0("XA", 1:3),
+        # versions of x surround those of y + this measurement has
+        # max update version beyond some others
+        "g1", 2L, 1:5, paste0("XB", 1:5),
+        # mirror case
+        "g1", 3L, 2L, paste0("XC", 2L),
+        # x has 1 version, y has 0
+        "g1", 4L, 1L, paste0("XD", 1L),
+        # non-NA values that should be carried forward
+        # (version-wise LOCF) in other versions, plus NAs that
+        # should (similarly) be carried forward as NA (latter
+        # wouldn't work with an ordinary merge + post-processing
+        # with `data.table::nafill`)
+        "g1", 6L, c(1L, 3L, 5L), paste0("XE", c(1L, NA, 5L))
+      ) %>%
+        tidyr::unchop(c(version, x_value)) %>%
+        dplyr::mutate(dplyr::across(c(x_value), ~ dplyr::if_else(grepl("NA", .x), NA_character_, .x)))
+    )
+  )
+  y <- as_epi_archive2(
+    data.table::as.data.table(
+      tibble::tribble(
+        ~geo_value, ~time_value, ~version, ~y_value,
+        "g1", 1L, 1:3, paste0("YA", 1:3),
+        "g1",
2L, 2L, paste0("YB", 2L), + "g1", 3L, 1:5, paste0("YC", 1:5), + "g1", 5L, 1L, paste0("YD", 1L), + "g1", 6L, 1:5, paste0("YE", 1:5), + ) %>% + tidyr::unchop(c(version, y_value)) %>% + dplyr::mutate(dplyr::across(c(y_value), ~ dplyr::if_else(grepl("NA", .x), NA_character_, .x))) + ) + ) + xy <- epix_merge2(x, y) + xy_expected <- as_epi_archive2( + data.table::as.data.table( + tibble::tribble( + ~geo_value, ~time_value, ~version, ~x_value, ~y_value, + "g1", 1L, 1:3, paste0("XA", 1:3), paste0("YA", 1:3), + "g1", 2L, 1:5, paste0("XB", 1:5), paste0("YB", c(NA, 2L, 2L, 2L, 2L)), + "g1", 3L, 1:5, paste0("XC", c(NA, 2L, 2L, 2L, 2L)), paste0("YC", 1:5), + "g1", 4L, 1L, paste0("XD", 1L), paste0("YD", NA), + "g1", 5L, 1L, paste0("XD", NA), paste0("YD", 1L), + "g1", 6L, 1:5, paste0("XE", c(1L, 1L, NA, NA, 5L)), paste0("YE", 1:5), + ) %>% + tidyr::unchop(c(version, x_value, y_value)) %>% + dplyr::mutate(dplyr::across(c(x_value, y_value), ~ dplyr::if_else(grepl("NA", .x), NA_character_, .x))) + ) + ) + # We rely on testthat edition 3 expect_identical using waldo, not identical. See + # test-epix_fill_through_version.R comments for details. + testthat::local_edition(3) + expect_identical(xy, xy_expected) +}) + +test_that("epix_merge forbids and warns on metadata and naming issues", { + expect_error( + epix_merge2( + as_epi_archive2(tibble::tibble(geo_value = "tx", time_value = 1L, version = 1L, x_value = 1L)), + as_epi_archive2(tibble::tibble(geo_value = "us", time_value = 1L, version = 5L, y_value = 2L)) + ), + regexp = "must have the same.*geo_type" + ) + expect_error( + epix_merge2( + as_epi_archive2(tibble::tibble(geo_value = "pa", time_value = 1L, version = 1L, x_value = 1L)), + as_epi_archive2(tibble::tibble(geo_value = "pa", time_value = as.Date("2020-01-01"), version = 5L, y_value = 2L)) + ), + regexp = "must have the same.*time_type" + ) + expect_error( + epix_merge2( + as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, value = 1L)), + as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, value = 2L)) + ), + regexp = "overlapping.*names" + ) + expect_warning( + epix_merge2( + as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, x_value = 1L), + additional_metadata = list("updates_fetched" = lubridate::ymd_hms("2022-05-01 16:00:00", tz = "UTC")) + ), + as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, y_value = 2L)) + ), + regexp = "x\\$additional_metadata", + class = "epiprocess__epix_merge_ignores_additional_metadata" + ) + expect_warning( + epix_merge2( + as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, x_value = 1L)), + as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, y_value = 2L), + additional_metadata = list("updates_fetched" = lubridate::ymd_hms("2022-05-01 16:00:00", tz = "UTC")) + ) + ), + regexp = "y\\$additional_metadata", + class = "epiprocess__epix_merge_ignores_additional_metadata" + ) +}) + +# use `local` to prevent accidentally using the x, y, xy bindings here +# elsewhere, while allowing reuse across a couple tests +local({ + x <- as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, x_value = 1L), + clobberable_versions_start = 1L, versions_end = 10L + ) + y <- as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, y_value = 2L), + clobberable_versions_start = 3L, versions_end = 10L + ) + xy <- epix_merge2(x, y) + test_that("epix_merge considers partially-clobberable row to 
be clobberable", { + expect_identical(xy$clobberable_versions_start, 1L) + }) + test_that("epix_merge result uses versions_end metadata not max version val", { + expect_identical(xy$versions_end, 10L) + }) +}) + +local({ + x <- as_epi_archive2( + tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, x_value = 10L), + clobberable_versions_start = 1L, + versions_end = 3L + ) + y <- as_epi_archive2( + tibble::tibble(geo_value = 1L, time_value = 1L, version = 5L, y_value = 20L), + clobberable_versions_start = 1L + ) + test_that('epix_merge forbids on sync default or "forbid"', { + expect_error(epix_merge2(x, y), + class = "epiprocess__epix_merge_unresolved_sync" + ) + expect_error(epix_merge2(x, y, sync = "forbid"), + class = "epiprocess__epix_merge_unresolved_sync" + ) + }) + test_that('epix_merge sync="na" works', { + expect_equal( + epix_merge2(x, y, sync = "na"), + as_epi_archive2(tibble::tribble( + ~geo_value, ~time_value, ~version, ~x_value, ~y_value, + 1L, 1L, 1L, 10L, NA_integer_, # x updated, y not observed yet + 1L, 1L, 4L, NA_integer_, NA_integer_, # NA-ing out x, y not observed yet + 1L, 1L, 5L, NA_integer_, 20L, # x still NA, y updated + # (we should not have a y vals -> NA update here; version 5 should be + # the `versions_end` of the result) + ), clobberable_versions_start = 1L) + ) + }) + test_that('epix_merge sync="locf" works', { + expect_equal( + epix_merge2(x, y, sync = "locf"), + as_epi_archive2(tibble::tribble( + ~geo_value, ~time_value, ~version, ~x_value, ~y_value, + 1L, 1L, 1L, 10L, NA_integer_, # x updated, y not observed yet + 1L, 1L, 5L, 10L, 20L, # x LOCF'd, y updated + ), clobberable_versions_start = 1L) + ) + }) + test_that('epix_merge sync="truncate" works', { + expect_equal( + epix_merge2(x, y, sync = "truncate"), + as_epi_archive2(tibble::tribble( + ~geo_value, ~time_value, ~version, ~x_value, ~y_value, + 1L, 1L, 1L, 10L, NA_integer_, # x updated, y not observed yet + # y's update beyond x's last update has been truncated + ), clobberable_versions_start = 1L, versions_end = 3L) + ) + }) + x_no_conflict <- as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, x_value = 10L)) + y_no_conflict <- as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, y_value = 20L)) + xy_no_conflict_expected <- as_epi_archive2(tibble::tribble( + ~geo_value, ~time_value, ~version, ~x_value, ~y_value, + 1L, 1L, 1L, 10L, 20L, # x updated, y not observed yet + )) + test_that('epix_merge sync="forbid" on no-conflict works', { + expect_equal( + epix_merge2(x_no_conflict, y_no_conflict, sync = "forbid"), + xy_no_conflict_expected + ) + }) + test_that('epix_merge sync="na" on no-conflict works', { + # This test is the main reason for these no-conflict tests. We want to make + # sure that we don't add an unnecessary NA-ing-out version beyond a common + # versions_end. 
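+  # (both no-conflict inputs below share the same `versions_end`, so there is
+  # no version gap for `sync` to fill)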
+ expect_equal( + epix_merge2(x_no_conflict, y_no_conflict, sync = "na"), + xy_no_conflict_expected + ) + }) + test_that('epix_merge sync="locf" on no-conflict works', { + expect_equal( + epix_merge2(x_no_conflict, y_no_conflict, sync = "locf"), + xy_no_conflict_expected + ) + }) + test_that('epix_merge sync="truncate" on no-conflict works', { + expect_equal( + epix_merge2(x_no_conflict, y_no_conflict, sync = "truncate"), + xy_no_conflict_expected + ) + }) +}) + + +test_that('epix_merge sync="na" balks if do not know next_after', { + expect_error( + epix_merge2( + as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = as.POSIXct(as.Date("2020-01-01")), x_value = 10L)), + as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = as.POSIXct(as.Date("2020-01-02")), y_value = 20L)), + sync = "na" + ), + regexp = "no applicable method.*next_after" + ) +}) diff --git a/tests/testthat/test-epix_slide_new.R b/tests/testthat/test-epix_slide_new.R new file mode 100644 index 00000000..49ef5e41 --- /dev/null +++ b/tests/testthat/test-epix_slide_new.R @@ -0,0 +1,810 @@ +library(dplyr) + +test_that("epix_slide2 only works on an epi_archive", { + expect_error(epix_slide2(data.frame(x = 1))) +}) + +x <- tibble::tribble( + ~version, ~time_value, ~binary, + 4, c(1:3), 2^(1:3), + 5, c(1:2, 4), 2^(4:6), + 6, c(1:2, 4:5), 2^(7:10), + 7, 2:6, 2^(11:15) +) %>% + tidyr::unnest(c(time_value, binary)) + +xx <- bind_cols(geo_value = rep("x", 15), x) %>% + as_epi_archive2() + +test_that("epix_slide2 works as intended", { + xx1 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + f = ~ sum(.x$binary), + before = 2, + new_col_name = "sum_binary" + ) + + xx2 <- tibble( + geo_value = rep("x", 4), + time_value = c(4, 5, 6, 7), + sum_binary = c( + 2^3 + 2^2, + 2^6 + 2^3, + 2^10 + 2^9, + 2^15 + 2^14 + ) + ) %>% + group_by(geo_value) + + expect_identical(xx1, xx2) # * + + xx3 <- xx %>% + group_by( + dplyr::across(dplyr::all_of("geo_value")) + ) %>% + slide( + f = ~ sum(.x$binary), + before = 2, + new_col_name = "sum_binary" + ) + + expect_identical(xx1, xx3) # This and * imply xx2 and xx3 are identical + + # function interface + xx4 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2(f = function(x, gk, rtv) { + tibble::tibble(sum_binary = sum(x$binary)) + }, before = 2, names_sep = NULL) + + expect_identical(xx1, xx4) + + # tidyeval interface + xx5 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + sum_binary = sum(binary), + before = 2 + ) + + expect_identical(xx1, xx5) +}) + +test_that("epix_slide2 works as intended with `as_list_col=TRUE`", { + xx_dfrow1 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + f = ~ data.frame(bin_sum = sum(.x$binary)), + before = 2, + as_list_col = TRUE + ) + + xx_dfrow2 <- tibble( + geo_value = rep("x", 4), + time_value = c(4, 5, 6, 7), + slide_value = + c( + 2^3 + 2^2, + 2^6 + 2^3, + 2^10 + 2^9, + 2^15 + 2^14 + ) %>% + purrr::map(~ data.frame(bin_sum = .x)) + ) %>% + group_by(geo_value) + + expect_identical(xx_dfrow1, xx_dfrow2) # * + + xx_dfrow3 <- xx %>% + group_by(dplyr::across(dplyr::all_of("geo_value"))) %>% + slide( + f = ~ data.frame(bin_sum = sum(.x$binary)), + before = 2, + as_list_col = TRUE + ) + + expect_identical(xx_dfrow1, xx_dfrow3) # This and * Imply xx_dfrow2 and xx_dfrow3 are identical + + xx_df1 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + f = ~ data.frame(bin = .x$binary), + before = 2, + as_list_col = TRUE + ) + + xx_df2 <- tibble( + geo_value = rep("x", 4), + time_value = c(4, 5, 6, 7), + 
slide_value = + list( + c(2^3, 2^2), + c(2^6, 2^3), + c(2^10, 2^9), + c(2^15, 2^14) + ) %>% + purrr::map(~ data.frame(bin = rev(.x))) + ) %>% + group_by(geo_value) + + expect_identical(xx_df1, xx_df2) + + xx_scalar1 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + f = ~ sum(.x$binary), + before = 2, + as_list_col = TRUE + ) + + xx_scalar2 <- tibble( + geo_value = rep("x", 4), + time_value = c(4, 5, 6, 7), + slide_value = + list( + 2^3 + 2^2, + 2^6 + 2^3, + 2^10 + 2^9, + 2^15 + 2^14 + ) + ) %>% + group_by(geo_value) + + expect_identical(xx_scalar1, xx_scalar2) + + xx_vec1 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + f = ~ .x$binary, + before = 2, + as_list_col = TRUE + ) + + xx_vec2 <- tibble( + geo_value = rep("x", 4), + time_value = c(4, 5, 6, 7), + slide_value = + list( + c(2^3, 2^2), + c(2^6, 2^3), + c(2^10, 2^9), + c(2^15, 2^14) + ) %>% + purrr::map(rev) + ) %>% + group_by(geo_value) + + expect_identical(xx_vec1, xx_vec2) +}) + +test_that("epix_slide2 `before` validation works", { + expect_error( + slide(xx, f = ~ sum(.x$binary)), + "`before` is required" + ) + expect_error( + slide(xx, f = ~ sum(.x$binary), before = NA), + "Assertion on 'before' failed: May not be NA" + ) + expect_error( + slide(xx, f = ~ sum(.x$binary), before = -1), + "Assertion on 'before' failed: Element 1 is not >= 0" + ) + expect_error(slide(xx, f = ~ sum(.x$binary), before = 1.5), + regexp = "before", + class = "vctrs_error_incompatible_type" + ) + # We might want to allow this at some point (issue #219): + expect_error(slide(xx, f = ~ sum(.x$binary), before = Inf), + regexp = "before", + class = "vctrs_error_incompatible_type" + ) + # (wrapper shouldn't introduce a value:) + expect_error(epix_slide2(xx, f = ~ sum(.x$binary)), "`before` is required") + # These `before` values should be accepted: + expect_error( + slide(xx, f = ~ sum(.x$binary), before = 0), + NA + ) + expect_error( + slide(xx, f = ~ sum(.x$binary), before = 2L), + NA + ) + expect_error( + slide(xx, f = ~ sum(.x$binary), before = 365000), + NA + ) +}) + +test_that("quosure passing issue in epix_slide2 is resolved + other potential issues", { + # (First part adapted from @examples) + time_values <- seq(as.Date("2020-06-01"), + as.Date("2020-06-02"), + by = "1 day" + ) + # We only have one non-version, non-time key in the example archive. Add + # another so that we don't accidentally pass tests due to accidentally + # matching the default grouping. 
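+  # (`modulus` below cycles through 1:4 and 0, so it cuts across geo and time
+  # and would expose any accidental fall-back to the default grouping)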
+ ea <- as_epi_archive2( + archive_cases_dv_subset$DT %>% + dplyr::mutate(modulus = seq_len(nrow(.)) %% 5L), + other_keys = "modulus", + compactify = TRUE + ) + reference_by_modulus <- ea %>% + group_by(modulus) %>% + epix_slide2( + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ) + reference_by_neither <- ea %>% + group_by() %>% + epix_slide2( + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ) + # test the passing-something-that-must-be-enquosed behavior: + # + # (S3 group_by behavior for this case is the `reference_by_modulus`) + expect_identical( + ea %>% group_by(modulus) %>% slide( + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ), + reference_by_modulus + ) + # test the .data pronoun behavior: + expect_identical( + epix_slide2( + x = ea %>% group_by(.data$modulus), + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ), + reference_by_modulus + ) + expect_identical( + ea %>% group_by(.data$modulus) %>% slide( + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ), + reference_by_modulus + ) + # test the passing across-all-of-string-literal behavior: + expect_identical( + epix_slide2( + x = ea %>% group_by(dplyr::across(all_of("modulus"))), + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ), + reference_by_modulus + ) + expect_identical( + ea %>% group_by(across(all_of("modulus"))) %>% slide( + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ), + reference_by_modulus + ) + # test the passing-across-all-of-string-var behavior: + my_group_by <- "modulus" + expect_identical( + epix_slide2( + x = ea %>% group_by(dplyr::across(tidyselect::all_of(my_group_by))), + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ), + reference_by_modulus + ) + expect_identical( + ea %>% group_by(dplyr::across(tidyselect::all_of(my_group_by))) %>% slide( + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ), + reference_by_modulus + ) + # test the default behavior (default in this case should just be grouping by neither): + expect_identical( + epix_slide2( + x = ea, + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ), + reference_by_neither + ) + expect_identical( + ea %>% slide( + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ), + reference_by_neither + ) +}) + +ea <- tibble::tribble( + ~version, ~time_value, ~binary, + 2, 1:1, 2^(1:1), + 3, 1:2, 2^(2:1), + 4, 1:3, 2^(3:1), + 5, 1:4, 2^(4:1), + 6, 1:5, 2^(5:1), + 7, 1:6, 2^(6:1) +) %>% + tidyr::unnest(c(time_value, binary)) %>% + mutate(geo_value = "x") %>% + as_epi_archive2() + +test_that("epix_slide2 with all_versions option has access to all older versions", { + library(data.table) + # Make sure we're using testthat edition 3, where `expect_identical` doesn't + # actually mean `base::identical` but something more content-based using + # `waldo` package: + testthat::local_edition(3) + + slide_fn <- function(x, gk, rtv) { + 
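+    # (with `all_versions = TRUE` below, `x` arrives as an ungrouped archive
+    # rather than an `epi_df` snapshot, so its raw update table is available as
+    # `x$DT`; `gk` is the group key and `rtv` the ref_time_value)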
return(tibble( + n_versions = length(unique(x$DT$version)), + n_row = nrow(x$DT), + dt_class1 = class(x$DT)[[1L]], + dt_key = list(key(x$DT)) + )) + } + + ea_orig_mirror <- ea %>% clone(deep = TRUE) + ea_orig_mirror$DT <- copy(ea_orig_mirror$DT) + + result1 <- ea %>% + group_by() %>% + epix_slide2( + f = slide_fn, + before = 10^3, + names_sep = NULL, + all_versions = TRUE + ) + + expect_true(inherits(result1, "tbl_df")) + + result2 <- tibble::tribble( + ~time_value, ~n_versions, ~n_row, ~dt_class1, ~dt_key, + 2, 1L, sum(1:1), "data.table", key(ea$DT), + 3, 2L, sum(1:2), "data.table", key(ea$DT), + 4, 3L, sum(1:3), "data.table", key(ea$DT), + 5, 4L, sum(1:4), "data.table", key(ea$DT), + 6, 5L, sum(1:5), "data.table", key(ea$DT), + 7, 6L, sum(1:6), "data.table", key(ea$DT), + ) + + expect_identical(result1, result2) # * + + result3 <- ea %>% + group_by() %>% + slide( + f = slide_fn, + before = 10^3, + names_sep = NULL, + all_versions = TRUE + ) + + expect_identical(result1, result3) # This and * Imply result2 and result3 are identical + + # formula interface + result4 <- ea %>% + group_by() %>% + epix_slide2( + f = ~ slide_fn(.x, .y), + before = 10^3, + names_sep = NULL, + all_versions = TRUE + ) + + expect_identical(result1, result4) # This and * Imply result2 and result4 are identical + + # tidyeval interface + result5 <- ea %>% + group_by() %>% + epix_slide2( + data = slide_fn( + .x, + stop("slide_fn doesn't use group key, no need to prepare it") + ), + before = 10^3, + names_sep = NULL, + all_versions = TRUE + ) + + expect_identical(result1, result5) # This and * Imply result2 and result5 are identical + expect_identical(ea, ea_orig_mirror) # We shouldn't have mutated ea +}) + +test_that("as_of and epix_slide2 with long enough window are compatible", { + library(data.table) + testthat::local_edition(3) + + # For all_versions = FALSE: + + f1 <- function(x, gk, rtv) { + tibble( + diff_mean = mean(diff(x$binary)) + ) + } + ref_time_value1 <- 5 + + expect_identical( + ea %>% as_of(ref_time_value1) %>% f1() %>% mutate(time_value = ref_time_value1, .before = 1L), + ea %>% slide(f1, before = 1000L, ref_time_values = ref_time_value1, names_sep = NULL) + ) + + # For all_versions = TRUE: + + f2 <- function(x, gk, rtv) { + x %>% + # extract time&version-lag-1 data: + epix_slide2( + function(subx, subgk, rtv) { + tibble(data = list( + subx %>% + filter(time_value == attr(subx, "metadata")$as_of - 1) %>% + rename(real_time_value = time_value, lag1 = binary) + )) + }, + before = 1, names_sep = NULL + ) %>% + # assess as nowcast: + unnest(data) %>% + inner_join(x %>% as_of(x$versions_end), by = setdiff(key(x$DT), c("version"))) %>% + summarize(mean_abs_delta = mean(abs(binary - lag1))) + } + ref_time_value2 <- 5 + + expect_identical( + ea %>% as_of(ref_time_value2, all_versions = TRUE) %>% f2() %>% mutate(time_value = ref_time_value2, .before = 1L), + ea %>% slide(f2, before = 1000L, ref_time_values = ref_time_value2, all_versions = TRUE, names_sep = NULL) + ) + + # Test the same sort of thing when grouping by geo in an archive with multiple geos. 
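+  # (the "y" geo below is a sign-flipped copy of "x", so any cross-geo leakage
+  # in the grouped slide would corrupt the `mean_abs_delta` comparison)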
+ ea_multigeo <- ea %>% clone() + ea_multigeo$DT <- rbind( + ea_multigeo$DT, + copy(ea_multigeo$DT)[, geo_value := "y"][, binary := -binary][] + ) + setkeyv(ea_multigeo$DT, key(ea$DT)) + + expect_identical( + ea_multigeo %>% + group_by(geo_value) %>% + epix_slide2(f2, before = 1000L, ref_time_values = ref_time_value2, all_versions = TRUE, names_sep = NULL) %>% + filter(geo_value == "x"), + ea %>% # using `ea` here is like filtering `ea_multigeo` to `geo_value=="x"` + epix_as_of2(ref_time_value2, all_versions = TRUE) %>% + f2() %>% + transmute(geo_value = "x", time_value = ref_time_value2, mean_abs_delta) %>% + group_by(geo_value) + ) +}) + +test_that("epix_slide2 `f` is passed an ungrouped `epi_archive` when `all_versions=TRUE`", { + slide_fn <- function(x, gk, rtv) { + expect_true(is_epi_archive2(x)) + return(NA) + } + + ea %>% + group_by() %>% + epix_slide2( + f = slide_fn, + before = 1, + ref_time_values = 5, + new_col_name = "out", + all_versions = TRUE + ) +}) + +test_that("epix_slide2 with all_versions option works as intended", { + xx1 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + f = ~ sum(.x$DT$binary), + before = 2, + new_col_name = "sum_binary", + all_versions = TRUE + ) + + xx2 <- tibble( + geo_value = rep("x", 4), + time_value = c(4, 5, 6, 7), + sum_binary = c( + 2^3 + 2^2, + 2^6 + 2^3, + 2^10 + 2^9 + 2^6, + 2^15 + 2^14 + 2^10 + ) + ) %>% + group_by(geo_value) + + expect_identical(xx1, xx2) # * + + xx3 <- xx %>% + group_by(dplyr::across(dplyr::all_of("geo_value"))) %>% + slide( + f = ~ sum(.x$DT$binary), + before = 2, + new_col_name = "sum_binary", + all_versions = TRUE + ) + + expect_identical(xx1, xx3) # This and * Imply xx2 and xx3 are identical +}) + +# XXX currently, we're using a stopgap measure of having `epix_slide2` always +# output a (grouped/ungrouped) tibble while we think about the class, columns, +# and attributes of `epix_slide2` output more carefully. We might bring this test +# back depending on the decisions there: +# +# test_that("`epix_slide2` uses `versions_end` as a resulting `epi_df`'s `as_of`", { +# ea_updated_stale = ea$clone() +# ea_updated_stale$versions_end <- ea_updated_stale$versions_end + 3 # (dbl) +# # +# expect_identical( +# ea_updated_stale %>% +# group_by(geo_value) %>% +# epix_slide2(~ slice_head(.x, n = 1L), before = 10L) %>% +# ungroup() %>% +# attr("metadata") %>% +# .$as_of, +# 10 +# ) +# }) + +test_that("epix_slide2 works with 0-row computation outputs", { + epix_slide_empty <- function(ea, ...) 
{ + ea %>% + epix_slide2(before = 5L, ..., function(x, gk, rtv) { + tibble::tibble() + }) + } + expect_identical( + ea %>% + epix_slide_empty(), + tibble::tibble( + time_value = ea$DT$version[integer(0)] + ) + ) + expect_identical( + ea %>% + group_by(geo_value) %>% + epix_slide_empty(), + tibble::tibble( + geo_value = ea$DT$geo_value[integer(0)], + time_value = ea$DT$version[integer(0)] + ) %>% + # new_epi_df(geo_type = ea$geo_type, time_type = ea$time_type, + # as_of = ea$versions_end) %>% + group_by(geo_value) + ) + # with `all_versions=TRUE`, we have something similar but never get an + # `epi_df`: + expect_identical( + ea %>% + epix_slide_empty(all_versions = TRUE), + tibble::tibble( + time_value = ea$DT$version[integer(0)] + ) + ) + expect_identical( + ea %>% + group_by(geo_value) %>% + epix_slide_empty(all_versions = TRUE), + tibble::tibble( + geo_value = ea$DT$geo_value[integer(0)], + time_value = ea$DT$version[integer(0)] + ) %>% + group_by(geo_value) + ) +}) + +# test_that("epix_slide grouped by geo can produce `epi_df` output", { +# # This is a characterization test. Not sure we actually want this behavior; +# # https://github.com/cmu-delphi/epiprocess/pull/290#issuecomment-1489099157 +# expect_identical( +# ea %>% +# group_by(geo_value) %>% +# epix_slide(before = 5L, function(x,g) { +# tibble::tibble(value = 42) +# }, names_sep = NULL), +# tibble::tibble( +# geo_value = "x", +# time_value = epix_slide_ref_time_values_default(ea), +# value = 42 +# ) %>% +# new_epi_df(as_of = ea$versions_end) +# ) +# }) + +test_that("epix_slide alerts if the provided f doesn't take enough args", { + f_xgt <- function(x, g, t) dplyr::tibble(value = mean(x$binary), count = length(x$binary)) + # If `regexp` is NA, asserts that there should be no errors/messages. + expect_error(epix_slide2(xx, f = f_xgt, before = 2L), regexp = NA) + expect_warning(epix_slide2(xx, f = f_xgt, before = 2L), regexp = NA) + + f_x_dots <- function(x, ...) 
dplyr::tibble(value = mean(x$binary), count = length(x$binary)) + expect_warning(epix_slide2(xx, f_x_dots, before = 2L), + class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots" + ) +}) + +test_that("epix_slide2 computation via formula can use ref_time_value", { + xx_ref <- tibble( + geo_value = rep("x", 4), + time_value = c(4, 5, 6, 7), + slide_value = c(4, 5, 6, 7) + ) %>% + group_by(geo_value) + + xx1 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + f = ~.ref_time_value, + before = 2 + ) + + expect_identical(xx1, xx_ref) + + xx2 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + f = ~.z, + before = 2 + ) + + expect_identical(xx2, xx_ref) + + xx3 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + f = ~..3, + before = 2 + ) + + expect_identical(xx3, xx_ref) +}) + +test_that("epix_slide2 computation via function can use ref_time_value", { + xx_ref <- tibble( + geo_value = rep("x", 4), + time_value = c(4, 5, 6, 7), + slide_value = c(4, 5, 6, 7) + ) %>% + group_by(geo_value) + + xx1 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + f = function(x, g, t) t, + before = 2 + ) + + expect_identical(xx1, xx_ref) +}) + +test_that("epix_slide2 computation via dots can use ref_time_value and group", { + # ref_time_value + xx_ref <- tibble( + geo_value = rep("x", 4), + time_value = c(4, 5, 6, 7), + slide_value = c(4, 5, 6, 7) + ) %>% + group_by(geo_value) + + xx1 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + before = 2, + slide_value = .ref_time_value + ) + + expect_identical(xx1, xx_ref) + + # group_key + xx_ref <- tibble( + geo_value = rep("x", 4), + time_value = c(4, 5, 6, 7), + slide_value = "x" + ) %>% + group_by(geo_value) + + # Use group_key column + xx3 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + before = 2, + slide_value = .group_key$geo_value + ) + + expect_identical(xx3, xx_ref) + + # Use entire group_key object + expect_error( + xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + before = 2, + slide_value = nrow(.group_key) + ), + NA + ) +}) + +test_that("epix_slide2 computation via dots outputs the same result using col names and the data var", { + xx_ref <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + before = 2, + sum_binary = sum(time_value) + ) + + xx1 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + before = 2, + sum_binary = sum(.x$time_value) + ) + + expect_identical(xx1, xx_ref) + + xx2 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + before = 2, + sum_binary = sum(.data$time_value) + ) + + expect_identical(xx2, xx_ref) +}) + +test_that("`epix_slide2` doesn't decay date output", { + expect_true( + xx$DT %>% + as_tibble() %>% + mutate(across(c(time_value, version), ~ as.Date("2000-01-01") + .x - 1L)) %>% + as_epi_archive2() %>% + epix_slide2(before = 5L, ~ attr(.x, "metadata")$as_of) %>% + `[[`("slide_value") %>% + inherits("Date") + ) +}) + +test_that("`epix_slide2` can access objects inside of helper functions", { + helper <- function(archive_haystack, time_value_needle) { + archive_haystack %>% epix_slide2(has_needle = time_value_needle %in% time_value, before = 365000L) + } + expect_error( + helper(archive_cases_dv_subset_2, as.Date("2021-01-01")), + NA + ) + expect_error( + helper(xx, 3L), + NA + ) +}) diff --git a/tests/testthat/test-grouped_epi_archive_new.R b/tests/testthat/test-grouped_epi_archive_new.R new file mode 100644 index 00000000..8f0133b9 --- /dev/null +++ b/tests/testthat/test-grouped_epi_archive_new.R @@ -0,0 +1,104 @@ 
+test_that("Grouping, regrouping, and ungrouping archives works as intended", { + # From an example: + library(dplyr) + toy_archive <- + tribble( + ~geo_value, ~age_group, ~time_value, ~version, ~value, + "us", "adult", "2000-01-01", "2000-01-02", 121, + "us", "pediatric", "2000-01-02", "2000-01-03", 5, # (addition) + "us", "adult", "2000-01-01", "2000-01-03", 125, # (revision) + "us", "adult", "2000-01-02", "2000-01-03", 130 # (addition) + ) %>% + mutate( + age_group = ordered(age_group, c("pediatric", "adult")), + time_value = as.Date(time_value), + version = as.Date(version) + ) %>% + as_epi_archive2(other_keys = "age_group") + + # Ensure that we're using testthat edition 3's idea of "identical", which is + # not as strict as `identical`: + testthat::local_edition(3) + + # Test equivalency claims in example: + by_both_keys <- toy_archive %>% group_by(geo_value, age_group) + expect_identical( + by_both_keys, + toy_archive %>% group_by(geo_value) %>% group_by(age_group, .add = TRUE) + ) + grouping_cols <- c("geo_value", "age_group") + expect_identical( + by_both_keys, + toy_archive %>% group_by(across(all_of(grouping_cols))) + ) + + expect_identical( + toy_archive %>% group_by(geo_value), + toy_archive %>% group_by(geo_value, age_group) %>% ungroup(age_group) + ) + + # Test `.drop` behavior: + expect_error(toy_archive %>% group_by(.drop = "bogus"), + regexp = "Must be of type 'logical', not 'character'" + ) + expect_warning(toy_archive %>% group_by(.drop = FALSE), + class = "epiprocess__group_by_epi_archive__drop_FALSE_no_factors" + ) + expect_warning(toy_archive %>% group_by(geo_value, .drop = FALSE), + class = "epiprocess__group_by_epi_archive__drop_FALSE_no_factors" + ) + expect_warning( + grouped_factor_then_nonfactor <- + toy_archive %>% group_by(age_group, geo_value, .drop = FALSE), + class = "epiprocess__group_by_epi_archive__drop_FALSE_nonfactor_after_factor" + ) + expect_identical( + grouped_factor_then_nonfactor %>% + epix_slide2(before = 10, s = sum(value)), + tibble::tribble( + ~age_group, ~geo_value, ~time_value, ~s, + "pediatric", NA_character_, "2000-01-02", 0, + "adult", "us", "2000-01-02", 121, + "pediatric", "us", "2000-01-03", 5, + "adult", "us", "2000-01-03", 255 + ) %>% + mutate( + age_group = ordered(age_group, c("pediatric", "adult")), + time_value = as.Date(time_value) + ) %>% + # # See + # # https://github.com/cmu-delphi/epiprocess/pull/290#issuecomment-1489099157 + # # and + # # https://github.com/cmu-delphi/epiprocess/pull/311#issuecomment-1535149256 + # # for why this is commented out, pending some design + # # decisions. 
+ # # + # as_epi_df(geo_type = "nation", # bug; want "custom" from NA; issue #242 + # as_of = as.Date("2000-01-03"), + # additional_metadata = list(other_keys = "age_group")) %>% + # # put back in expected order; see issue #166: + # select(age_group, geo_value, time_value, s) %>% + group_by(age_group, geo_value, .drop = FALSE) + ) + expect_identical( + toy_archive %>% + group_by(geo_value, age_group, .drop = FALSE) %>% + epix_slide2(before = 10, s = sum(value)), + tibble::tribble( + ~geo_value, ~age_group, ~time_value, ~s, + "us", "pediatric", "2000-01-02", 0, + "us", "adult", "2000-01-02", 121, + "us", "pediatric", "2000-01-03", 5, + "us", "adult", "2000-01-03", 255 + ) %>% + mutate( + age_group = ordered(age_group, c("pediatric", "adult")), + time_value = as.Date(time_value) + ) %>% + # as_epi_df(as_of = as.Date("2000-01-03"), + # additional_metadata = list(other_keys = "age_group")) %>% + # # put back in expected order; see issue #166: + # select(geo_value, age_group, time_value, s) %>% + group_by(geo_value, age_group, .drop = FALSE) + ) +}) diff --git a/tests/testthat/test-methods-epi_archive_new.R b/tests/testthat/test-methods-epi_archive_new.R new file mode 100644 index 00000000..eb2c14be --- /dev/null +++ b/tests/testthat/test-methods-epi_archive_new.R @@ -0,0 +1,136 @@ +library(dplyr) + +ea <- archive_cases_dv_subset_2 %>% + clone() + +ea2_data <- tibble::tribble( + ~geo_value, ~time_value, ~version, ~cases, + "ca", "2020-06-01", "2020-06-01", 1, + "ca", "2020-06-01", "2020-06-02", 2, + # + "ca", "2020-06-02", "2020-06-02", 0, + "ca", "2020-06-02", "2020-06-03", 1, + "ca", "2020-06-02", "2020-06-04", 2, + # + "ca", "2020-06-03", "2020-06-03", 1, + # + "ca", "2020-06-04", "2020-06-04", 4, +) %>% + dplyr::mutate(dplyr::across(c(time_value, version), as.Date)) + +# epix_as_of tests +test_that("epix_as_of behaves identically to as_of method", { + expect_identical( + epix_as_of2(ea, max_version = min(ea$DT$version)), + ea %>% as_of(max_version = min(ea$DT$version)) + ) +}) + +test_that("Errors are thrown due to bad as_of inputs", { + # max_version cannot be of string class rather than date class + expect_error(ea %>% as_of("2020-01-01")) + # max_version cannot be later than latest version + expect_error(ea %>% as_of(as.Date("2025-01-01"))) + # max_version cannot be a vector + expect_error(ea %>% as_of(c(as.Date("2020-01-01"), as.Date("2020-01-02")))) +}) + +test_that("Warning against max_version being clobberable", { + # none by default + expect_warning(regexp = NA, ea %>% as_of(max_version = max(ea$DT$version))) + expect_warning(regexp = NA, ea %>% as_of(max_version = min(ea$DT$version))) + # but with `clobberable_versions_start` non-`NA`, yes + ea_with_clobberable <- ea %>% clone() + ea_with_clobberable$clobberable_versions_start <- max(ea_with_clobberable$DT$version) + expect_warning(ea_with_clobberable %>% as_of(max_version = max(ea$DT$version))) + expect_warning(regexp = NA, ea_with_clobberable %>% as_of(max_version = min(ea$DT$version))) +}) + +test_that("as_of properly grabs the data and doesn't mutate key", { + d <- as.Date("2020-06-01") + + ea2 <- ea2_data %>% + as_epi_archive2() + + old_key <- data.table::key(ea2$DT) + + edf_as_of <- ea2 %>% + epix_as_of2(max_version = as.Date("2020-06-03")) + + edf_expected <- as_epi_df(tibble( + geo_value = "ca", + time_value = d + 0:2, + cases = c(2, 1, 1) + ), as_of = as.Date("2020-06-03")) + + expect_equal(edf_as_of, edf_expected, ignore_attr = c(".internal.selfref", "sorted")) + expect_equal(data.table::key(ea2$DT), old_key) +}) + 
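+# A minimal sketch of the `as_of` semantics the tests above exercise: a snapshot
+# as of version v should match filtering the raw updates to `version <= v` and
+# keeping only the last version of each observation. (A sketch added for
+# illustration; assumes the `ea2_data` fixture above.)
+test_that("epix_as_of2 agrees with a manual last-version-per-observation filter", {
+  v <- as.Date("2020-06-03")
+  manual <- ea2_data %>%
+    filter(version <= v) %>%
+    group_by(geo_value, time_value) %>%
+    slice_max(version) %>%
+    ungroup() %>%
+    select(geo_value, time_value, cases)
+  via_archive <- ea2_data %>%
+    as_epi_archive2() %>%
+    epix_as_of2(max_version = v) %>%
+    as_tibble() %>%
+    select(geo_value, time_value, cases)
+  expect_equal(via_archive, manual, ignore_attr = TRUE)
+})
+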
+test_that("Errors are thrown due to bad epix_truncate_versions_after inputs", {
+  # x must be an archive
+  expect_error(epix_truncate_versions_after(data.frame(), as.Date("2020-01-01")))
+  # max_version cannot be of string class rather than date class
+  expect_error(epix_truncate_versions_after(ea, "2020-01-01"))
+  # max_version cannot be a vector
+  expect_error(epix_truncate_versions_after(ea, c(as.Date("2020-01-01"), as.Date("2020-01-02"))))
+  # max_version cannot be missing
+  expect_error(epix_truncate_versions_after(ea, as.Date(NA)))
+  # max_version cannot be after latest version in archive
+  expect_error(epix_truncate_versions_after(ea, as.Date("2025-01-01")))
+})
+
+test_that("epix_truncate_versions_after properly grabs the data and doesn't mutate key", {
+  ea2 <- ea2_data %>%
+    as_epi_archive2()
+
+  old_key <- data.table::key(ea2$DT)
+
+  ea_as_of <- ea2 %>%
+    epix_truncate_versions_after(max_version = as.Date("2020-06-02"))
+
+  ea_expected <- ea2_data[1:3, ] %>%
+    as_epi_archive2()
+
+  expect_equal(ea_as_of, ea_expected, ignore_attr = c(".internal.selfref", "sorted"))
+  expect_equal(data.table::key(ea2$DT), old_key)
+})
+
+test_that("epix_truncate_versions_after doesn't filter if max_version is at latest version", {
+  ea2 <- ea2_data %>%
+    as_epi_archive2()
+
+  ea_expected <- ea2 %>% clone()
+
+  ea_as_of <- ea2 %>%
+    epix_truncate_versions_after(max_version = as.Date("2020-06-04"))
+  expect_equal(ea_as_of, ea_expected, ignore_attr = c(".internal.selfref", "sorted"))
+})
+
+test_that("epix_truncate_versions_after returns the same grouping type as input epi_archive", {
+  ea2 <- ea2_data %>%
+    as_epi_archive2()
+
+  ea_as_of <- ea2 %>%
+    epix_truncate_versions_after(max_version = as.Date("2020-06-04"))
+  expect_true(is_epi_archive2(ea_as_of, grouped_okay = FALSE))
+
+  ea2_grouped <- ea2 %>% group_by(geo_value)
+
+  ea_as_of <- ea2_grouped %>%
+    epix_truncate_versions_after(max_version = as.Date("2020-06-04"))
+  expect_true(is_grouped_epi_archive2(ea_as_of))
+})
+
+
+test_that("epix_truncate_versions_after returns the same groups as input grouped_epi_archive", {
+  ea2 <- ea2_data %>%
+    as_epi_archive2()
+  ea2 <- ea2 %>% group_by(geo_value)
+
+  ea_expected <- ea2 %>% clone()
+
+  ea_as_of <- ea2 %>%
+    epix_truncate_versions_after(max_version = as.Date("2020-06-04"))
+  expect_equal(ea_as_of %>% groups(), ea_expected %>% groups())
+})
diff --git a/vignettes/advanced.Rmd b/vignettes/advanced.Rmd
index d4fad3e7..c010c1f3 100644
--- a/vignettes/advanced.Rmd
+++ b/vignettes/advanced.Rmd
@@ -111,17 +111,17 @@ edf %>%
 edf %>%
   # pretend that observations about time_value t are reported in version t (nowcasts)
   mutate(version = time_value) %>%
-  as_epi_archive() %>%
+  as_epi_archive2() %>%
   group_by(geo_value) %>%
-  epix_slide(x_2dav = mean(x), before = 1, ref_time_values = as.Date("2020-06-02")) %>%
+  epix_slide2(x_2dav = mean(x), before = 1, ref_time_values = as.Date("2020-06-02")) %>%
   ungroup()
 
 edf %>%
   # pretend that observations about time_value t are reported in version t (nowcasts)
   mutate(version = time_value) %>%
-  as_epi_archive() %>%
+  as_epi_archive2() %>%
   group_by(geo_value) %>%
-  epix_slide(~ mean(.x$x), before = 1, ref_time_values = as.Date("2020-06-02")) %>%
+  epix_slide2(~ mean(.x$x), before = 1, ref_time_values = as.Date("2020-06-02")) %>%
   ungroup()
 ```
@@ -219,9 +219,9 @@ edf %>%
 
 edf %>%
   mutate(version = time_value) %>%
-  as_epi_archive() %>%
+  as_epi_archive2() %>%
   group_by(geo_value) %>%
-  epix_slide(
+  epix_slide2(
     a = data.frame(x_2dav = mean(x), x_2dma = mad(x)),
     ref_time_values =
as.Date("2020-06-02"), before = 1, as_list_col = FALSE, names_sep = NULL @@ -317,16 +317,17 @@ x <- y1 %>% version = issue, percent_cli = value ) %>% - as_epi_archive(compactify = FALSE) + as_epi_archive2(compactify = FALSE) # mutating merge operation: -x$merge( +x <- epix_merge2( + x, y2 %>% select(geo_value, time_value, version = issue, case_rate_7d_av = value ) %>% - as_epi_archive(compactify = FALSE), + as_epi_archive2(compactify = FALSE), sync = "locf", compactify = FALSE ) @@ -337,9 +338,9 @@ library(data.table) library(ggplot2) theme_set(theme_bw()) -x <- archive_cases_dv_subset$DT %>% +x <- archive_cases_dv_subset_2$DT %>% filter(geo_value %in% c("ca", "fl")) %>% - as_epi_archive(compactify = FALSE) + as_epi_archive2(compactify = FALSE) ``` Next, we extend the ARX function to handle multiple geo values, since in the @@ -457,7 +458,7 @@ data. ```{r, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 6} # Latest snapshot of data, and forecast dates -x_latest <- epix_as_of(x, max_version = max(x$DT$version)) +x_latest <- epix_as_of2(x, max_version = max(x$DT$version)) fc_time_values <- seq(as.Date("2020-08-01"), as.Date("2021-11-30"), by = "1 month" @@ -467,7 +468,7 @@ fc_time_values <- seq(as.Date("2020-08-01"), k_week_ahead <- function(x, ahead = 7, as_of = TRUE) { if (as_of) { x %>% - epix_slide( + epix_slide2( fc = prob_arx(.data$percent_cli, .data$case_rate_7d_av, .data$geo_value, .data$time_value, args = prob_arx_args(ahead = ahead) ), diff --git a/vignettes/archive.Rmd b/vignettes/archive.Rmd index fdb0e3c6..0b57d639 100644 --- a/vignettes/archive.Rmd +++ b/vignettes/archive.Rmd @@ -76,16 +76,16 @@ the compactify vignette. ```{r, eval=FALSE} x <- dv %>% select(geo_value, time_value, version = issue, percent_cli = value) %>% - as_epi_archive(compactify = TRUE) + as_epi_archive2(compactify = TRUE) class(x) print(x) ``` ```{r, echo=FALSE, message=FALSE, warning=FALSE} -x <- archive_cases_dv_subset$DT %>% +x <- archive_cases_dv_subset_2$DT %>% select(geo_value, time_value, version, percent_cli) %>% - as_epi_archive(compactify = TRUE) + as_epi_archive2(compactify = TRUE) class(x) print(x) @@ -154,7 +154,7 @@ function `epix_as_of()` since this is likely a more familiar interface for users not familiar with R6 (or object-oriented programming). ```{r} -x_snapshot <- epix_as_of(x, max_version = as.Date("2021-06-01")) +x_snapshot <- epix_as_of2(x, max_version = as.Date("2021-06-01")) class(x_snapshot) head(x_snapshot) max(x_snapshot$time_value) @@ -174,7 +174,7 @@ this case, since updates to the current version may still come in at a later point in time, due to various reasons, such as synchronization issues. ```{r} -x_latest <- epix_as_of(x, max_version = max(x$DT$version)) +x_latest <- epix_as_of2(x, max_version = max(x$DT$version)) ``` Below, we pull several snapshots from the archive, spaced one month apart. 
We @@ -188,7 +188,7 @@ theme_set(theme_bw()) self_max <- max(x$DT$version) versions <- seq(as.Date("2020-06-01"), self_max - 1, by = "1 month") snapshots <- map_dfr(versions, function(v) { - epix_as_of(x, max_version = v) %>% mutate(version = v) + epix_as_of2(x, max_version = v) %>% mutate(version = v) }) %>% bind_rows( x_latest %>% mutate(version = self_max) @@ -258,15 +258,15 @@ y <- pub_covidcast( issues = epirange(20200601, 20211201) ) %>% select(geo_value, time_value, version = issue, case_rate_7d_av = value) %>% - as_epi_archive(compactify = TRUE) + as_epi_archive2(compactify = TRUE) -x$merge(y, sync = "locf", compactify = FALSE) +x <- epix_merge2(x, y, sync = "locf", compactify = TRUE) print(x) head(x$DT) ``` ```{r, echo=FALSE, message=FALSE, warning=FALSE} -x <- archive_cases_dv_subset +x <- archive_cases_dv_subset_2 print(x) head(x$DT) ``` @@ -362,7 +362,7 @@ fc_time_values <- seq(as.Date("2020-08-01"), z <- x %>% group_by(geo_value) %>% - epix_slide( + epix_slide2( fc = prob_arx(x = percent_cli, y = case_rate_7d_av), before = 119, ref_time_values = fc_time_values ) %>% @@ -389,14 +389,14 @@ points in time and forecast horizons. The former comes from using `epi_slide()` to the latest snapshot of the data `x_latest`. ```{r, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 6} -x_latest <- epix_as_of(x, max_version = max(x$DT$version)) +x_latest <- epix_as_of2(x, max_version = max(x$DT$version)) # Simple function to produce forecasts k weeks ahead k_week_ahead <- function(x, ahead = 7, as_of = TRUE) { if (as_of) { x %>% group_by(.data$geo_value) %>% - epix_slide( + epix_slide2( fc = prob_arx(.data$percent_cli, .data$case_rate_7d_av, ahead = ahead), before = 119, ref_time_values = fc_time_values ) %>% diff --git a/vignettes/compactify.Rmd b/vignettes/compactify.Rmd index cad065e7..0b68c73b 100644 --- a/vignettes/compactify.Rmd +++ b/vignettes/compactify.Rmd @@ -32,10 +32,10 @@ from the second from the third value included. library(epiprocess) library(dplyr) -dt <- archive_cases_dv_subset$DT +dt <- archive_cases_dv_subset_2$DT -locf_omitted <- as_epi_archive(dt) -locf_included <- as_epi_archive(dt, compactify = FALSE) +locf_omitted <- as_epi_archive2(dt) +locf_included <- as_epi_archive2(dt, compactify = FALSE) head(locf_omitted$DT) head(locf_included$DT) @@ -48,8 +48,8 @@ LOCF-redundant values can mar the performance of dataset operations. 
As the colu ```{r} dt2 <- select(dt, -percent_cli) -locf_included_2 <- as_epi_archive(dt2, compactify = FALSE) -locf_omitted_2 <- as_epi_archive(dt2, compactify = TRUE) +locf_included_2 <- as_epi_archive2(dt2, compactify = FALSE) +locf_omitted_2 <- as_epi_archive2(dt2, compactify = TRUE) ``` In this example, a huge proportion of the original version update data were From 74aa8311731c2283c587cbd0934c26bbb26db445 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Tue, 19 Mar 2024 18:09:32 -0700 Subject: [PATCH 250/345] feat: replace epi_archive with S3 implementation --- DESCRIPTION | 6 +- NAMESPACE | 40 +- NEWS.md | 6 +- R/archive.R | 1083 +++++++--------- R/archive_new.R | 1115 ----------------- R/data.R | 12 +- R/group_by_epi_df_methods.R | 4 - R/grouped_archive_new.R | 456 ------- R/grouped_epi_archive.R | 756 ++++++----- R/growth_rate.R | 4 +- R/methods-epi_archive.R | 584 ++++----- R/methods-epi_archive_new.R | 826 ------------ R/utils.R | 2 - _pkgdown.yml | 3 + man/as_epi_archive.Rd | 142 --- man/as_epi_archive2.Rd | 142 --- man/as_of.epi_archive2.Rd | 33 - man/clone.Rd | 17 + man/compactify.Rd | 28 + man/epi_archive.Rd | 645 ++-------- man/epix_as_of.Rd | 26 +- man/epix_as_of2.Rd | 95 -- man/epix_fill_through_version.Rd | 10 +- man/epix_fill_through_version2.Rd | 48 - man/epix_merge.Rd | 23 +- man/epix_merge2.Rd | 71 -- man/epix_slide.Rd | 64 +- man/epix_slide2.Rd | 283 ----- man/epix_truncate_versions_after.Rd | 21 +- ...ate_versions_after.grouped_epi_archive2.Rd | 11 - man/fill_through_version.epi_archive2.Rd | 21 - man/group_by.epi_archive.Rd | 46 +- man/group_by.epi_archive2.Rd | 147 --- man/is_epi_archive.Rd | 2 +- man/is_epi_archive2.Rd | 35 - man/max_version_with_row_in.Rd | 9 +- man/merge_epi_archive2.Rd | 30 - man/new_epi_archive2.Rd | 69 - man/next_after.Rd | 8 +- ...t.epi_archive2.Rd => print.epi_archive.Rd} | 12 +- man/slide.epi_archive2.Rd | 101 -- man/slide.grouped_epi_archive2.Rd | 24 - man/truncate_versions_after.epi_archive2.Rd | 19 - ...ate_versions_after.grouped_epi_archive2.Rd | 18 - tests/testthat/test-archive-version-bounds.R | 10 +- tests/testthat/test-archive_new.R | 173 --- tests/testthat/test-compactify.R | 6 +- tests/testthat/test-compactify_new.R | 110 -- tests/testthat/test-deprecations.R | 20 +- .../testthat/test-epix_fill_through_version.R | 56 +- .../test-epix_fill_through_version_new.R | 109 -- tests/testthat/test-epix_merge.R | 7 +- tests/testthat/test-epix_merge_new.R | 226 ---- tests/testthat/test-epix_slide.R | 150 ++- tests/testthat/test-epix_slide_new.R | 810 ------------ tests/testthat/test-grouped_epi_archive.R | 4 - tests/testthat/test-grouped_epi_archive_new.R | 104 -- tests/testthat/test-methods-epi_archive.R | 41 +- tests/testthat/test-methods-epi_archive_new.R | 136 -- tests/testthat/test-utils.R | 10 - vignettes/advanced.Rmd | 42 +- vignettes/aggregation.Rmd | 6 +- vignettes/archive.Rmd | 105 +- vignettes/compactify.Rmd | 15 +- vignettes/epiprocess.Rmd | 13 +- 65 files changed, 1528 insertions(+), 7722 deletions(-) delete mode 100644 R/archive_new.R delete mode 100644 R/grouped_archive_new.R delete mode 100644 R/methods-epi_archive_new.R delete mode 100644 man/as_epi_archive.Rd delete mode 100644 man/as_epi_archive2.Rd delete mode 100644 man/as_of.epi_archive2.Rd create mode 100644 man/clone.Rd create mode 100644 man/compactify.Rd delete mode 100644 man/epix_as_of2.Rd delete mode 100644 man/epix_fill_through_version2.Rd delete mode 100644 man/epix_merge2.Rd delete mode 100644 man/epix_slide2.Rd delete mode 100644 
man/epix_truncate_versions_after.grouped_epi_archive2.Rd delete mode 100644 man/fill_through_version.epi_archive2.Rd delete mode 100644 man/group_by.epi_archive2.Rd delete mode 100644 man/is_epi_archive2.Rd delete mode 100644 man/merge_epi_archive2.Rd delete mode 100644 man/new_epi_archive2.Rd rename man/{print.epi_archive2.Rd => print.epi_archive.Rd} (56%) delete mode 100644 man/slide.epi_archive2.Rd delete mode 100644 man/slide.grouped_epi_archive2.Rd delete mode 100644 man/truncate_versions_after.epi_archive2.Rd delete mode 100644 man/truncate_versions_after.grouped_epi_archive2.Rd delete mode 100644 tests/testthat/test-archive_new.R delete mode 100644 tests/testthat/test-compactify_new.R delete mode 100644 tests/testthat/test-epix_fill_through_version_new.R delete mode 100644 tests/testthat/test-epix_merge_new.R delete mode 100644 tests/testthat/test-epix_slide_new.R delete mode 100644 tests/testthat/test-grouped_epi_archive_new.R delete mode 100644 tests/testthat/test-methods-epi_archive_new.R diff --git a/DESCRIPTION b/DESCRIPTION index cfdd9f49..2b53474c 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -39,7 +39,6 @@ Imports: lubridate, magrittr, purrr, - R6, rlang, slider, tibble, @@ -50,7 +49,9 @@ Imports: vctrs Suggests: covidcast, + devtools, epidatr, + here, knitr, outbreaks, rmarkdown, @@ -73,7 +74,6 @@ Depends: URL: https://cmu-delphi.github.io/epiprocess/ Collate: 'archive.R' - 'archive_new.R' 'autoplot.R' 'correlation.R' 'data.R' @@ -81,11 +81,9 @@ Collate: 'epiprocess.R' 'group_by_epi_df_methods.R' 'methods-epi_archive.R' - 'grouped_archive_new.R' 'grouped_epi_archive.R' 'growth_rate.R' 'key_colnames.R' - 'methods-epi_archive_new.R' 'methods-epi_df.R' 'outliers.R' 'reexports.R' diff --git a/NAMESPACE b/NAMESPACE index d5d1cd7b..cc25c7d7 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -6,98 +6,78 @@ S3method(as_epi_df,data.frame) S3method(as_epi_df,epi_df) S3method(as_epi_df,tbl_df) S3method(as_epi_df,tbl_ts) -S3method(as_of,epi_archive2) S3method(as_tibble,epi_df) S3method(as_tsibble,epi_df) S3method(autoplot,epi_df) -S3method(clone,epi_archive2) -S3method(clone,grouped_epi_archive2) +S3method(clone,epi_archive) +S3method(clone,grouped_epi_archive) S3method(dplyr_col_modify,col_modify_recorder_df) S3method(dplyr_col_modify,epi_df) S3method(dplyr_reconstruct,epi_df) S3method(dplyr_row_slice,epi_df) +S3method(epix_slide,epi_archive) +S3method(epix_slide,grouped_epi_archive) S3method(epix_truncate_versions_after,epi_archive) -S3method(epix_truncate_versions_after,epi_archive2) S3method(epix_truncate_versions_after,grouped_epi_archive) -S3method(epix_truncate_versions_after,grouped_epi_archive2) S3method(group_by,epi_archive) -S3method(group_by,epi_archive2) S3method(group_by,epi_df) S3method(group_by,grouped_epi_archive) -S3method(group_by,grouped_epi_archive2) S3method(group_by_drop_default,grouped_epi_archive) -S3method(group_by_drop_default,grouped_epi_archive2) S3method(group_modify,epi_df) S3method(groups,grouped_epi_archive) -S3method(groups,grouped_epi_archive2) S3method(key_colnames,data.frame) S3method(key_colnames,default) S3method(key_colnames,epi_archive) S3method(key_colnames,epi_df) S3method(next_after,Date) S3method(next_after,integer) -S3method(print,epi_archive2) +S3method(print,epi_archive) S3method(print,epi_df) -S3method(print,grouped_epi_archive2) +S3method(print,grouped_epi_archive) S3method(select,epi_df) -S3method(slide,grouped_epi_archive2) S3method(summary,epi_df) -S3method(truncate_versions_after,grouped_epi_archive2) S3method(ungroup,epi_df) 
S3method(ungroup,grouped_epi_archive)
-S3method(ungroup,grouped_epi_archive2)
S3method(unnest,epi_df)
export("%>%")
export(archive_cases_dv_subset)
-export(archive_cases_dv_subset_2)
export(arrange)
export(as_epi_archive)
-export(as_epi_archive2)
export(as_epi_df)
-export(as_of)
export(as_tsibble)
export(autoplot)
export(clone)
export(detect_outlr)
export(detect_outlr_rm)
export(detect_outlr_stl)
-export(epi_archive)
export(epi_cor)
export(epi_slide)
export(epi_slide_mean)
export(epi_slide_opt)
export(epi_slide_sum)
export(epix_as_of)
-export(epix_as_of2)
+export(epix_fill_through_version)
export(epix_merge)
-export(epix_merge2)
export(epix_slide)
-export(epix_slide2)
export(epix_truncate_versions_after)
-export(fill_through_version)
export(filter)
export(group_by)
export(group_modify)
export(growth_rate)
export(is_epi_archive)
-export(is_epi_archive2)
export(is_epi_df)
export(is_grouped_epi_archive)
-export(is_grouped_epi_archive2)
export(key_colnames)
export(max_version_with_row_in)
export(mutate)
-export(new_epi_archive2)
+export(new_epi_archive)
export(new_epi_df)
export(next_after)
export(relocate)
export(rename)
export(slice)
-export(slide)
-export(truncate_versions_after)
export(ungroup)
export(unnest)
-importFrom(R6,R6Class)
importFrom(checkmate,anyInfinite)
importFrom(checkmate,anyMissing)
importFrom(checkmate,assert)
@@ -138,12 +118,15 @@ importFrom(dplyr,bind_rows)
importFrom(dplyr,dplyr_col_modify)
importFrom(dplyr,dplyr_reconstruct)
importFrom(dplyr,dplyr_row_slice)
+importFrom(dplyr,everything)
importFrom(dplyr,filter)
importFrom(dplyr,group_by)
importFrom(dplyr,group_by_drop_default)
importFrom(dplyr,group_modify)
importFrom(dplyr,group_vars)
importFrom(dplyr,groups)
+importFrom(dplyr,if_all)
+importFrom(dplyr,if_any)
importFrom(dplyr,mutate)
importFrom(dplyr,relocate)
importFrom(dplyr,rename)
@@ -166,6 +149,7 @@ importFrom(rlang,arg_match)
importFrom(rlang,as_label)
importFrom(rlang,caller_arg)
importFrom(rlang,caller_env)
+importFrom(rlang,check_dots_empty)
importFrom(rlang,enquo)
importFrom(rlang,enquos)
importFrom(rlang,env)
diff --git a/NEWS.md b/NEWS.md
index e2c5b8e4..4d52ded5 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -32,7 +32,11 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat
 
 ## Breaking changes
 
 - Switched `epi_df`'s `other_keys` default from `NULL` to `character(0)`; PR #390
-- Refactor `epi_archive` to use S3 instead of R6 for its object model. The calls to some functions will change, but the functionality will remain the same. It will also help us maintain the package better in the future. (#340)
+- Refactor `epi_archive` to use S3 instead of R6 for its object model. The
+  functionality stays the same, but it breaks the member-function interface.
+  For migration, convert `epi_archive$merge` to `epi_archive %>% epix_merge`
+  (similarly for `slide`, `fill_through_version`, `truncate_versions_after`,
+  and `as_of`) (#340).
 
 # epiprocess 0.7.0
 
diff --git a/R/archive.R b/R/archive.R
index a530cc05..f871d239 100644
--- a/R/archive.R
+++ b/R/archive.R
@@ -6,6 +6,7 @@
 # `data.table::` everywhere and not importing things.
 .datatable_aware <- TRUE
 
+
 #' Validate a version bound arg
 #'
 #' Expected to be used on `clobberable_versions_start`, `versions_end`,
@@ -49,6 +50,7 @@ validate_version_bound <- function(version_bound, x, na_ok = FALSE,
   return(invisible(NULL))
 }
 
+
 #' `max(x$version)`, with error if `x` has 0 rows
 #'
 #' Exported to make defaults more easily copyable.
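As a concrete reading of the migration note in the NEWS.md hunk above, here is a minimal before-and-after sketch (the archives `x` and `y` and the version date are illustrative, not taken from this patch):

```r
# Old R6 interface, removed by this patch: methods lived on the object, and
# `$merge()` modified `x` in place.
x$merge(y, sync = "locf")
snapshot <- x$as_of(max_version = as.Date("2021-06-01"))

# New S3 interface: the `epix_*` functions return their result rather than
# mutating their input, so the merged archive must be reassigned.
x <- epix_merge(x, y, sync = "locf")
snapshot <- epix_as_of(x, max_version = as.Date("2021-06-01"))
```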
@@ -82,6 +84,7 @@ max_version_with_row_in <- function(x) { version_bound } + #' Get the next possible value greater than `x` of the same type #' #' @param x the starting "value"(s) @@ -90,22 +93,53 @@ max_version_with_row_in <- function(x) { #' @export next_after <- function(x) UseMethod("next_after") + #' @export next_after.integer <- function(x) x + 1L + #' @export next_after.Date <- function(x) x + 1L + +#' Compactify +#' +#' This section describes the internals of how compactification works in an +#' `epi_archive()`. Compactification can potentially improve code speed or +#' memory usage, depending on your data. +#' +#' In general, the last version of each observation is carried forward (LOCF) to +#' fill in data between recorded versions, and between the last recorded +#' update and the `versions_end`. One consequence is that the `DT` doesn't +#' have to contain a full snapshot of every version (although this generally +#' works), but can instead contain only the rows that are new or changed from +#' the previous version (see `compactify`, which does this automatically). +#' Currently, deletions must be represented as revising a row to a special +#' state (e.g., making the entries `NA` or including a special column that +#' flags the data as removed and performing some kind of post-processing), and +#' the archive is unaware of what this state is. Note that `NA`s *can* be +#' introduced by `epi_archive` methods for other reasons, e.g., in +#' [`epix_fill_through_version`] and [`epix_merge`], if requested, to +#' represent potential update data that we do not yet have access to; or in +#' [`epix_merge`] to represent the "value" of an observation before the +#' version in which it was first released, or if no version of that +#' observation appears in the archive data at all. +#' +#' @name compactify +NULL + + +#' Epi Archive +#' #' @title `epi_archive` object #' -#' @description An `epi_archive` is an R6 class which contains a data table +#' @description An `epi_archive` is an S3 class which contains a data table #' along with several relevant pieces of metadata. The data table can be seen #' as the full archive (version history) for some signal variables of #' interest. #' -#' @details An `epi_archive` is an R6 class which contains a data table `DT`, of -#' class `data.table` from the `data.table` package, with (at least) the -#' following columns: +#' @details An `epi_archive` contains a data table `DT`, of class `data.table` +#' from the `data.table` package, with (at least) the following columns: #' #' * `geo_value`: the geographic value associated with each row of measurements. #' * `time_value`: the time value associated with each row of measurements. @@ -118,38 +152,12 @@ next_after.Date <- function(x) x + 1L #' The data table `DT` has key variables `geo_value`, `time_value`, `version`, #' as well as any others (these can be specified when instantiating the #' `epi_archive` object via the `other_keys` argument, and/or set by operating -#' on `DT` directly). Refer to the documentation for [as_epi_archive()] for -#' information and examples of relevant parameter names for an `epi_archive` object. -#' Note that there can only be a single row per unique combination of +#' on `DT` directly). Refer to the documentation for `as_epi_archive()` for +#' information and examples of relevant parameter names for an `epi_archive` +#' object. 
Note that there can only be a single row per unique combination of #' key variables, and thus the key variables are critical for figuring out how #' to generate a snapshot of data from the archive, as of a given version. #' -#' In general, the last version of each observation is carried forward (LOCF) to -#' fill in data between recorded versions, and between the last recorded -#' update and the `versions_end`. One consequence is that the `DT` -#' doesn't have to contain a full snapshot of every version (although this -#' generally works), but can instead contain only the rows that are new or -#' changed from the previous version (see `compactify`, which does this -#' automatically). Currently, deletions must be represented as revising a row -#' to a special state (e.g., making the entries `NA` or including a special -#' column that flags the data as removed and performing some kind of -#' post-processing), and the archive is unaware of what this state is. Note -#' that `NA`s *can* be introduced by `epi_archive` methods for other reasons, -#' e.g., in [`epix_fill_through_version`] and [`epix_merge`], if requested, to -#' represent potential update data that we do not yet have access to; or in -#' [`epix_merge`] to represent the "value" of an observation before the -#' version in which it was first released, or if no version of that -#' observation appears in the archive data at all. -#' -#' **A word of caution:** R6 objects, unlike most other objects in R, have -#' reference semantics. A primary consequence of this is that objects are not -#' copied when modified. You can read more about this in Hadley Wickham's -#' [Advanced R](https://adv-r.hadley.nz/r6.html#r6-semantics) book. In order -#' to construct a modified archive while keeping the original intact, first -#' make a clone using the `$clone` method, then overwrite the clone's `DT` -#' field with `data.table::copy(clone$DT)`, and finally perform the -#' modifications on the clone. -#' #' @section Metadata: #' The following pieces of metadata are included as fields in an `epi_archive` #' object: @@ -168,9 +176,8 @@ next_after.Date <- function(x) x + 1L #' @section Generating Snapshots: #' An `epi_archive` object can be used to generate a snapshot of the data in #' `epi_df` format, which represents the most up-to-date values of the signal -#' variables, as of the specified version. This is accomplished by calling the -#' `as_of()` method for an `epi_archive` object `x`. More details on this -#' method are documented in the wrapper function [`epix_as_of()`]. +#' variables, as of the specified version. This is accomplished by calling +#' `epix_as_of()`. #' #' @section Sliding Computations: #' We can run a sliding computation over an `epi_archive` object, much like @@ -179,595 +186,9 @@ next_after.Date <- function(x) x + 1L #' the way `epi_slide()` works for an `epi_df` object, but with one key #' difference: it is version-aware. That is, for an `epi_archive` object, the #' sliding computation at any given reference time point t is performed on -#' **data that would have been available as of t**. More details on `slide()` -#' are documented in the wrapper function [`epix_slide()`]. 
-#' -#' @importFrom R6 R6Class -#' @export -#' @examples -#' tib <- tibble::tibble( -#' geo_value = rep(c("ca", "hi"), each = 5), -#' time_value = rep(seq(as.Date("2020-01-01"), -#' by = 1, length.out = 5 -#' ), times = 2), -#' version = rep(seq(as.Date("2020-01-02"), -#' by = 1, length.out = 5 -#' ), times = 2), -#' value = rnorm(10, mean = 2, sd = 1) -#' ) +#' **data that would have been available as of t**. #' -#' toy_epi_archive <- tib %>% epi_archive$new( -#' geo_type = "state", -#' time_type = "day" -#' ) -#' toy_epi_archive -epi_archive <- - R6::R6Class( - classname = "epi_archive", - ##### - public = list( - #' @field DT (`data.table`)\cr - #' the (optionally compactified) datatable - DT = NULL, - #' @field geo_type (string)\cr - #' the resolution of the geographic label (e.g. state) - geo_type = NULL, - #' @field time_type (string)\cr - #' the resolution of the time column (e.g. day) - time_type = NULL, - #' @field additional_metadata (named list)\cr - #' any extra fields, such as `other_keys` - additional_metadata = NULL, - #' @field clobberable_versions_start (length-1 of same type&class as `version` column, or `NA`)\cr - #' the earliest version number that might be rewritten in the future without assigning a new version - #' date/number, or `NA` if this won't happen - clobberable_versions_start = NULL, - #' @field versions_end (length-1 of same type&class as `version` column)\cr - #' the latest version observed - versions_end = NULL, - #' @description Creates a new `epi_archive` object. - #' @param x A data frame, data table, or tibble, with columns `geo_value`, - #' `time_value`, `version`, and then any additional number of columns. - #' @param geo_type Type for the geo values. If missing, then the function will - #' attempt to infer it from the geo values present; if this fails, then it - #' will be set to "custom". - #' @param time_type Type for the time values. If missing, then the function will - #' attempt to infer it from the time values present; if this fails, then it - #' will be set to "custom". - #' @param other_keys Character vector specifying the names of variables in `x` - #' that should be considered key variables (in the language of `data.table`) - #' apart from "geo_value", "time_value", and "version". - #' @param additional_metadata List of additional metadata to attach to the - #' `epi_archive` object. The metadata will have `geo_type` and `time_type` - #' fields; named entries from the passed list or will be included as well. - #' @param compactify Optional; Boolean or `NULL`: should we remove rows that are - #' considered redundant for the purposes of `epi_archive`'s built-in methods - #' such as `as_of`? As these methods use the last version of each observation - #' carried forward (LOCF) to interpolate between the version data provided, - #' rows that don't change these LOCF results can potentially be omitted to - #' save space while maintaining the same behavior (with the help of the - #' `clobberable_versions_start` and `versions_end` fields in some edge cases). - #' `TRUE` will remove these rows, `FALSE` will not, and missing or `NULL` will - #' remove these rows and issue a warning. Generally, this can be set to - #' `TRUE`, but if you directly inspect or edit the fields of the `epi_archive` - #' such as its `DT`, or rely on redundant updates to achieve a certain - #' behavior of the `ref_time_values` default in `epix_slide`, you will have to - #' determine whether `compactify=TRUE` will produce the desired results. 
If - #' compactification here is removing a large proportion of the rows, this may - #' indicate a potential for space, time, or bandwidth savings upstream the - #' data pipeline, e.g., by avoiding fetching, storing, or processing these - #' rows of `x`. - #' @param clobberable_versions_start Optional; as in [`as_epi_archive`] - #' @param versions_end Optional; as in [`as_epi_archive`] - #' @return An `epi_archive` object. - #' @importFrom data.table as.data.table key setkeyv - #' - #' @details - #' Refer to the documentation for [as_epi_archive()] for more information - #' and examples of parameter names. - initialize = function(x, geo_type, time_type, other_keys, - additional_metadata, compactify, - clobberable_versions_start, versions_end) { - assert_data_frame(x) - if (!test_subset(c("geo_value", "time_value", "version"), names(x))) { - cli_abort( - "Columns `geo_value`, `time_value`, and `version` must be present in `x`." - ) - } - if (anyMissing(x$version)) { - cli_abort("Column `version` must not contain missing values.") - } - - # If geo type is missing, then try to guess it - if (missing(geo_type)) { - geo_type <- guess_geo_type(x$geo_value) - } - - # If time type is missing, then try to guess it - if (missing(time_type)) { - time_type <- guess_time_type(x$time_value) - } - - # Finish off with small checks on keys variables and metadata - if (missing(other_keys)) other_keys <- NULL - if (missing(additional_metadata)) additional_metadata <- list() - if (!test_subset(other_keys, names(x))) { - cli_abort("`other_keys` must be contained in the column names of `x`.") - } - if (any(c("geo_value", "time_value", "version") %in% other_keys)) { - cli_abort("`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\".") - } - if (any(names(additional_metadata) %in% c("geo_type", "time_type"))) { - cli_warn("`additional_metadata` names overlap with existing metadata fields \"geo_type\", \"time_type\".") - } - - # Conduct checks and apply defaults for `compactify` - if (missing(compactify)) { - compactify <- NULL - } - assert_logical(compactify, len = 1, null.ok = TRUE) - - # Apply defaults and conduct checks for - # `clobberable_versions_start`, `versions_end`: - if (missing(clobberable_versions_start)) { - clobberable_versions_start <- NA - } - if (missing(versions_end)) { - versions_end <- max_version_with_row_in(x) - } - validate_version_bound(clobberable_versions_start, x, na_ok = TRUE) - validate_version_bound(versions_end, x, na_ok = FALSE) - if (nrow(x) > 0L && versions_end < max(x[["version"]])) { - cli_abort( - sprintf( - "`versions_end` was %s, but `x` contained - updates for a later version or versions, up through %s", - versions_end, max(x[["version"]]) - ), - class = "epiprocess__versions_end_earlier_than_updates" - ) - } - if (!is.na(clobberable_versions_start) && clobberable_versions_start > versions_end) { - cli_abort( - sprintf( - "`versions_end` was %s, but a `clobberable_versions_start` - of %s indicated that there were later observed versions", - versions_end, clobberable_versions_start - ), - class = "epiprocess__versions_end_earlier_than_clobberable_versions_start" - ) - } - - # --- End of validation and replacing missing args with defaults --- - - # Create the data table; if x was an un-keyed data.table itself, - # then the call to as.data.table() will fail to set keys, so we - # need to check this, then do it manually if needed - key_vars <- c("geo_value", "time_value", other_keys, "version") - DT <- as.data.table(x, key = key_vars) # nolint: 
object_name_linter - if (!identical(key_vars, key(DT))) setkeyv(DT, cols = key_vars) - - maybe_first_duplicate_key_row_index <- anyDuplicated(DT, by = key(DT)) - if (maybe_first_duplicate_key_row_index != 0L) { - cli_abort("`x` must have one row per unique combination of the key variables. If you - have additional key variables other than `geo_value`, `time_value`, and - `version`, such as an age group column, please specify them in `other_keys`. - Otherwise, check for duplicate rows and/or conflicting values for the same - measurement.", - class = "epiprocess__epi_archive_requires_unique_key" - ) - } - - # Checks to see if a value in a vector is LOCF - is_locf <- function(vec) { - dplyr::if_else(!is.na(vec) & !is.na(dplyr::lag(vec)), - vec == dplyr::lag(vec), - is.na(vec) & is.na(dplyr::lag(vec)) - ) - } - - # LOCF is defined by a row where all values except for the version - # differ from their respective lag values - - # Checks for LOCF's in a data frame - rm_locf <- function(df) { - dplyr::filter(df, if_any(c(everything(), -version), ~ !is_locf(.))) - } - - # Keeps LOCF values, such as to be printed - keep_locf <- function(df) { - dplyr::filter(df, if_all(c(everything(), -version), ~ is_locf(.))) - } - - # Runs compactify on data frame - if (is.null(compactify) || compactify == TRUE) { - elim <- keep_locf(DT) - DT <- rm_locf(DT) # nolint: object_name_linter - } else { - # Create empty data frame for nrow(elim) to be 0 - elim <- tibble::tibble() - } - - # Warns about redundant rows - if (is.null(compactify) && nrow(elim) > 0) { - warning_intro <- cli::format_inline( - "Found rows that appear redundant based on - last (version of each) observation carried forward; - these rows have been removed to 'compactify' and save space:", - keep_whitespace = FALSE - ) - warning_data <- paste(collapse = "\n", capture.output(print(elim, topn = 3L, nrows = 7L))) - warning_outro <- cli::format_inline( - "Built-in `epi_archive` functionality should be unaffected, - but results may change if you work directly with its fields (such as `DT`). - See `?as_epi_archive` for details. 
- To silence this warning but keep compactification, - you can pass `compactify=TRUE` when constructing the archive.", - keep_whitespace = FALSE - ) - warning_message <- paste(sep = "\n", warning_intro, warning_data, warning_outro) - rlang::warn(warning_message, class = "epiprocess__compactify_default_removed_rows") - } - - # Instantiate all self variables - self$DT <- DT - self$geo_type <- geo_type - self$time_type <- time_type - self$additional_metadata <- additional_metadata - self$clobberable_versions_start <- clobberable_versions_start - self$versions_end <- versions_end - }, - #' Print information about an archive - #' @param class Boolean; whether to print the class label header - #' @param methods Boolean; whether to print all available methods of - #' the archive - #' @importFrom cli cli_inform - print = function(class = TRUE, methods = TRUE) { - cli_inform( - c( - ">" = if (class) "An `epi_archive` object, with metadata:", - "i" = if (length(setdiff(key(self$DT), c("geo_value", "time_value", "version"))) > 0) { - "Non-standard DT keys: {setdiff(key(self$DT), c('geo_value', 'time_value', 'version'))}" - }, - "i" = "Min/max time values: {min(self$DT$time_value)} / {max(self$DT$time_value)}", - "i" = "First/last version with update: {min(self$DT$version)} / {max(self$DT$version)}", - "i" = if (!is.na(self$clobberable_versions_start)) { - "Clobberable versions start: {self$clobberable_versions_start}" - }, - "i" = "Versions end: {self$versions_end}", - "i" = if (methods) "Public R6 methods: {names(epi_archive$public_methods)}", - "i" = "A preview of the table ({nrow(self$DT)} rows x {ncol(self$DT)} columns):" - ) - ) - - return(invisible(self$DT %>% print())) - }, - ##### - #' @description Generates a snapshot in `epi_df` format as of a given version. - #' See the documentation for the wrapper function [`epix_as_of()`] for - #' details. The parameter descriptions below are copied from there - #' @param x An `epi_archive` object - #' @param max_version Version specifying the max version to permit in the - #' snapshot. That is, the snapshot will comprise the unique rows of the - #' current archive data that represent the most up-to-date signal values, as - #' of the specified `max_version` (and whose `time_value`s are at least - #' `min_time_value`). - #' @param min_time_value Time value specifying the min `time_value` to permit in - #' the snapshot. Default is `-Inf`, which effectively means that there is no - #' minimum considered. - #' @param all_versions Boolean; If `all_versions = TRUE`, then the output will be in - #' `epi_archive` format, and contain rows in the specified `time_value` range - #' having `version <= max_version`. The resulting object will cover a - #' potentially narrower `version` and `time_value` range than `x`, depending - #' on user-provided arguments. Otherwise, there will be one row in the output - #' for the `max_version` of each `time_value`. Default is `FALSE`. - #' @importFrom data.table between key - as_of = function(max_version, min_time_value = -Inf, all_versions = FALSE) { - # Self max version and other keys - other_keys <- setdiff( - key(self$DT), - c("geo_value", "time_value", "version") - ) - if (length(other_keys) == 0) other_keys <- NULL - - # Check a few things on max_version - if (!test_set_equal(class(max_version), class(self$DT$version))) { - cli_abort( - "`max_version` must have the same classes as `self$DT$version`." 
- ) - } - if (!test_set_equal(typeof(max_version), typeof(self$DT$version))) { - cli_abort( - "`max_version` must have the same types as `self$DT$version`." - ) - } - assert_scalar(max_version, na.ok = FALSE) - if (max_version > self$versions_end) { - cli_abort("`max_version` must be at most `self$versions_end`.") - } - assert_logical(all_versions, len = 1) - if (!is.na(self$clobberable_versions_start) && max_version >= self$clobberable_versions_start) { - cli_warn( - 'Getting data as of some recent version which could still be - overwritten (under routine circumstances) without assigning a new - version number (a.k.a. "clobbered"). Thus, the snapshot that we - produce here should not be expected to be reproducible later. See - `?epi_archive` for more info and `?epix_as_of` on how to muffle.', - class = "epiprocess__snapshot_as_of_clobberable_version" - ) - } - - # Filter by version and return - if (all_versions) { - result <- epix_truncate_versions_after(self, max_version) - # `self` has already been `clone`d in `epix_truncate_versions_after` - # so we can modify the new archive's DT directly. - result$DT <- result$DT[time_value >= min_time_value, ] - return(result) - } - - return( - # Make sure to use data.table ways of filtering and selecting - self$DT[time_value >= min_time_value & version <= max_version, ] %>% - unique( - by = c("geo_value", "time_value", other_keys), - fromLast = TRUE - ) %>% - tibble::as_tibble() %>% - dplyr::select(-"version") %>% - as_epi_df( - geo_type = self$geo_type, - time_type = self$time_type, - as_of = max_version, - additional_metadata = c(self$additional_metadata, - other_keys = other_keys - ) - ) - ) - }, - ##### - #' @description Fill in unobserved history using requested scheme by mutating - #' `self` and potentially reseating its fields. See - #' [`epix_fill_through_version`] for a full description of the non-R6-method - #' version, which doesn't mutate the input archive but might alias its fields. - #' - #' @param fill_versions_end as in [`epix_fill_through_version`] - #' @param how as in [`epix_fill_through_version`] - #' - #' @importFrom data.table key setkeyv := address copy - #' @importFrom rlang arg_match - fill_through_version = function(fill_versions_end, - how = c("na", "locf")) { - validate_version_bound(fill_versions_end, self$DT, na_ok = FALSE) - how <- arg_match(how) - if (self$versions_end < fill_versions_end) { - new_DT <- switch(how, # nolint: object_name_linter - "na" = { - # old DT + a version consisting of all NA observations - # immediately after the last currently/actually-observed - # version. Note that this NA-observation version must only be - # added if `self` is outdated. - nonversion_key_cols <- setdiff(key(self$DT), "version") - nonkey_cols <- setdiff(names(self$DT), key(self$DT)) - next_version_tag <- next_after(self$versions_end) - if (next_version_tag > fill_versions_end) { - cli_abort(sprintf(paste( - "Apparent problem with `next_after` method:", - "archive contained observations through version %s", - "and the next possible version was supposed to be %s,", - "but this appeared to jump from a version < %3$s", - "to one > %3$s, implying at least one version in between." 
- ), self$versions_end, next_version_tag, fill_versions_end)) - } - nonversion_key_vals_ever_recorded <- unique(self$DT, by = nonversion_key_cols) - # In edge cases, the `unique` result can alias the original - # DT; detect and copy if necessary: - if (identical(address(self$DT), address(nonversion_key_vals_ever_recorded))) { - nonversion_key_vals_ever_recorded <- copy(nonversion_key_vals_ever_recorded) - } - next_version_DT <- nonversion_key_vals_ever_recorded[ # nolint: object_name_linter - , version := next_version_tag - ][ - # this makes the class of these columns logical (`NA` is a - # logical NA; we're relying on the rbind below to convert to - # the proper class&typeof) - , (nonkey_cols) := NA - ] - # full result DT: - setkeyv(rbind(self$DT, next_version_DT), key(self$DT))[] - }, - "locf" = { - # just the old DT; LOCF is built into other methods: - self$DT - } - ) - new_versions_end <- fill_versions_end - # Update `self` all at once with simple, error-free operations + - # return below: - self$DT <- new_DT - self$versions_end <- new_versions_end - } else { - # Already sufficiently up to date; nothing to do. - } - return(invisible(self)) - }, - ##### - #' @description Filter to keep only older versions, mutating the archive by - #' potentially reseating but not mutating some fields. `DT` is likely, but not - #' guaranteed, to be copied. Returns the mutated archive - #' [invisibly][base::invisible]. - #' @param x as in [`epix_truncate_versions_after`] - #' @param max_version as in [`epix_truncate_versions_after`] - truncate_versions_after = function(max_version) { - if (!test_set_equal(class(max_version), class(self$DT$version))) { - cli_abort("`max_version` must have the same classes as `self$DT$version`.") - } - if (!test_set_equal(typeof(max_version), typeof(self$DT$version))) { - cli_abort("`max_version` must have the same types as `self$DT$version`.") - } - assert_scalar(max_version, na.ok = FALSE) - if (max_version > self$versions_end) { - cli_abort("`max_version` must be at most `self$versions_end`.") - } - self$DT <- self$DT[self$DT$version <= max_version, colnames(self$DT), with = FALSE] - # (^ this filter operation seems to always copy the DT, even if it - # keeps every entry; we don't guarantee this behavior in - # documentation, though, so we could change to alias in this case) - if (!is.na(self$clobberable_versions_start) && self$clobberable_versions_start > max_version) { - self$clobberable_versions_start <- NA - } - self$versions_end <- max_version - return(invisible(self)) - }, - ##### - #' @description Merges another `epi_archive` with the current one, mutating the - #' current one by reseating its `DT` and several other fields, but avoiding - #' mutation of the old `DT`; returns the current archive - #' [invisibly][base::invisible]. See [`epix_merge`] for a full description - #' of the non-R6-method version, which does not mutate either archive, and - #' does not alias either archive's `DT`. 
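A sketch of the non-mutating wrapper named above, assuming two toy archives
that share key columns but track different signals; `sync = "locf"` reconciles
their differing `versions_end` values, as documented for [`epix_merge`]:

x <- as_epi_archive(tibble::tibble(
  geo_value = "ca", time_value = as.Date("2020-01-01"),
  version = as.Date("2020-01-02"), cases = 5
))
y <- as_epi_archive(tibble::tibble(
  geo_value = "ca", time_value = as.Date("2020-01-01"),
  version = as.Date("2020-01-03"), deaths = 1
))

# x's versions_end (2020-01-02) lags y's (2020-01-03); "locf" extends x forward:
xy <- epix_merge(x, y, sync = "locf")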
- #' @param y as in [`epix_merge`] - #' @param sync as in [`epix_merge`] - #' @param compactify as in [`epix_merge`] - merge = function(y, sync = c("forbid", "na", "locf", "truncate"), compactify = TRUE) { - result <- epix_merge(self, y, - sync = sync, - compactify = compactify - ) - - if (length(epi_archive$private_fields) != 0L) { - cli_abort("expected no private fields in epi_archive", - internal = TRUE - ) - } - - # Mutate fields all at once, trying to avoid any potential errors: - for (field_name in names(epi_archive$public_fields)) { - self[[field_name]] <- result[[field_name]] - } - - return(invisible(self)) - }, - #' group an epi_archive - #' @description - #' group an epi_archive - #' @param ... variables or computations to group by. Computations are always - #' done on the ungrouped data frame. To perform computations on the grouped - #' data, you need to use a separate [`mutate()`] step before the - #' [`group_by()`] - #' @param .add When `FALSE`, the default, [`group_by()`] will override existing - #' groups. To add to the existing groups, use `.add = TRUE`. - #' @param .drop Drop groups formed by factor levels that don't appear in the - #' data. The default is `TRUE` except when `.data` has been previously grouped - #' with `.drop = FALSE`. See [`group_by_drop_default()`] for details. - group_by = function(..., .add = FALSE, .drop = dplyr::group_by_drop_default(self)) { - group_by.epi_archive(self, ..., .add = .add, .drop = .drop) - }, - #' @description Slides a given function over variables in an `epi_archive` - #' object. See the documentation for the wrapper function [`epix_slide()`] for - #' details. The parameter descriptions below are copied from there - #' @importFrom data.table key - #' @importFrom rlang !! !!! enquo quo_is_missing enquos is_quosure sym syms - #' @param f Function, formula, or missing; together with `...` specifies the - #' computation to slide. To "slide" means to apply a computation over a - #' sliding (a.k.a. "rolling") time window for each data group. The window is - #' determined by the `before` parameter described below. One time step is - #' typically one day or one week; see [`epi_slide`] details for more - #' explanation. If a function, `f` must take an `epi_df` with the same - #' column names as the archive's `DT`, minus the `version` column; followed - #' by a one-row tibble containing the values of the grouping variables for - #' the associated group; followed by a reference time value, usually as a - #' `Date` object; followed by any number of named arguments. If a formula, - #' `f` can operate directly on columns accessed via `.x$var` or `.$var`, as - #' in `~ mean (.x$var)` to compute a mean of a column `var` for each - #' group-`ref_time_value` combination. The group key can be accessed via - #' `.y` or `.group_key`, and the reference time value can be accessed via - #' `.z` or `.ref_time_value`. If `f` is missing, then `...` will specify the - #' computation. - #' @param ... Additional arguments to pass to the function or formula specified - #' via `f`. Alternatively, if `f` is missing, then `...` is interpreted as an - #' expression for tidy evaluation; in addition to referring to columns - #' directly by name, the expression has access to `.data` and `.env` pronouns - #' as in `dplyr` verbs, and can also refer to the `.group_key` and - #' `.ref_time_value`. See details of [`epi_slide`]. - #' @param before How far `before` each `ref_time_value` should the sliding - #' window extend? 
If provided, should be a single, non-NA, - #' [integer-compatible][vctrs::vec_cast] number of time steps. This window - #' endpoint is inclusive. For example, if `before = 7`, and one time step is - #' one day, then to produce a value for a `ref_time_value` of January 8, we - #' apply the given function or formula to data (for each group present) with - #' `time_value`s from January 1 onward, as they were reported on January 8. - #' For typical disease surveillance sources, this will not include any data - #' with a `time_value` of January 8, and, depending on the amount of reporting - #' latency, may not include January 7 or even earlier `time_value`s. (If - #' instead the archive were to hold nowcasts instead of regular surveillance - #' data, then we would indeed expect data for `time_value` January 8. If it - #' were to hold forecasts, then we would expect data for `time_value`s after - #' January 8, and the sliding window would extend as far after each - #' `ref_time_value` as needed to include all such `time_value`s.) - #' @param ref_time_values Reference time values / versions for sliding - #' computations; each element of this vector serves both as the anchor point - #' for the `time_value` window for the computation and the `max_version` - #' `as_of` which we fetch data in this window. If missing, then this will set - #' to a regularly-spaced sequence of values set to cover the range of - #' `version`s in the `DT` plus the `versions_end`; the spacing of values will - #' be guessed (using the GCD of the skips between values). - #' @param time_step Optional function used to define the meaning of one time - #' step, which if specified, overrides the default choice based on the - #' `time_value` column. This function must take a positive integer and return - #' an object of class `lubridate::period`. For example, we can use `time_step - #' = lubridate::hours` in order to set the time step to be one hour (this - #' would only be meaningful if `time_value` is of class `POSIXct`). - #' @param new_col_name String indicating the name of the new column that will - #' contain the derivative values. Default is "slide_value"; note that setting - #' `new_col_name` equal to an existing column name will overwrite this column. - #' @param as_list_col Should the slide results be held in a list column, or be - #' [unchopped][tidyr::unchop]/[unnested][tidyr::unnest]? Default is `FALSE`, - #' in which case a list object returned by `f` would be unnested (using - #' [`tidyr::unnest()`]), and, if the slide computations output data frames, - #' the names of the resulting columns are given by prepending `new_col_name` - #' to the names of the list elements. - #' @param names_sep String specifying the separator to use in `tidyr::unnest()` - #' when `as_list_col = FALSE`. Default is "_". Using `NULL` drops the prefix - #' from `new_col_name` entirely. - #' @param all_versions (Not the same as `all_rows` parameter of `epi_slide`.) If - #' `all_versions = TRUE`, then `f` will be passed the version history (all - #' `version <= ref_time_value`) for rows having `time_value` between - #' `ref_time_value - before` and `ref_time_value`. Otherwise, `f` will be - #' passed only the most recent `version` for every unique `time_value`. - #' Default is `FALSE`. 
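A sketch of the interface these parameters describe, patterned on the
`archive_cases_dv_subset` example used elsewhere in this file; the window
length and output column name here are illustrative choices:

library(dplyr)

archive_cases_dv_subset %>%
  group_by(geo_value) %>%
  epix_slide(
    f = ~ mean(.x$case_rate_7d_av),
    before = 6,
    ref_time_values = as.Date("2020-06-11") + 0:2,
    new_col_name = "case_rate_7d_av_weekly_mean"
  ) %>%
  ungroup()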
- slide = function(f, ..., before, ref_time_values, - time_step, new_col_name = "slide_value", - as_list_col = FALSE, names_sep = "_", - all_versions = FALSE) { - # For an "ungrouped" slide, treat all rows as belonging to one big - # group (group by 0 vars), like `dplyr::summarize`, and let the - # resulting `grouped_epi_archive` handle the slide: - self$group_by()$slide( - f, ..., - before = before, ref_time_values = ref_time_values, - time_step = time_step, new_col_name = new_col_name, - as_list_col = as_list_col, names_sep = names_sep, - all_versions = all_versions - ) %>% - # We want a slide on ungrouped archives to output something - # ungrouped, rather than retaining the trivial (0-variable) - # grouping applied above. So we `ungroup()`. However, the current - # `dplyr` implementation automatically ignores/drops trivial - # groupings, so this is just a no-op for now. - ungroup() - } - ) - ) - -#' Convert to `epi_archive` format -#' -#' Converts a data frame, data table, or tibble into an `epi_archive` -#' object. See the [archive -#' vignette](https://cmu-delphi.github.io/epiprocess/articles/archive.html) for -#' examples. The parameter descriptions below are copied from there -#' -#' @param x A data frame, data table, or tibble, with columns `geo_value`, +#' @param x A data.frame, data.table, or tibble, with columns `geo_value`, #' `time_value`, `version`, and then any additional number of columns. #' @param geo_type Type for the geo values. If missing, then the function will #' attempt to infer it from the geo values present; if this fails, then it @@ -781,19 +202,9 @@ epi_archive <- #' @param additional_metadata List of additional metadata to attach to the #' `epi_archive` object. The metadata will have `geo_type` and `time_type` #' fields; named entries from the passed list or will be included as well. -#' @param compactify Optional; Boolean or `NULL`: should we remove rows that are -#' considered redundant for the purposes of `epi_archive`'s built-in methods -#' such as `as_of`? As these methods use the last version of each observation -#' carried forward (LOCF) to interpolate between the version data provided, -#' rows that don't change these LOCF results can potentially be omitted to -#' save space. `TRUE` will remove these rows, `FALSE` will not, and missing or -#' `NULL` will remove these rows and issue a warning. Generally, this can be -#' set to `TRUE`, but if you directly inspect or edit the fields of the -#' `epi_archive` such as its `DT`, you will have to determine whether -#' `compactify=TRUE` will produce the desired results. If compactification -#' here is removing a large proportion of the rows, this may indicate a -#' potential for space, time, or bandwidth savings upstream the data pipeline, -#' e.g., when fetching, storing, or preparing the input data `x` +#' @param compactify Optional; Boolean or `NULL`. `TRUE` will remove some +#' redundant rows, `FALSE` will not, and missing or `NULL` will remove +#' redundant rows, but issue a warning. See more information at `compactify`. #' @param clobberable_versions_start Optional; `length`-1; either a value of the #' same `class` and `typeof` as `x$version`, or an `NA` of any `class` and #' `typeof`: specifically, either (a) the earliest version that could be @@ -820,17 +231,12 @@ epi_archive <- #' clobbered.) If `nrow(x) == 0`, then this argument is mandatory. #' @return An `epi_archive` object. 
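A sketch of supplying the two version bounds documented above, reusing the toy
`tib` from the earlier snapshot example; the chosen bound values are
illustrative assumptions:

arch <- as_epi_archive(
  tib,
  # the latest observed version may still be rewritten upstream without a new
  # version number being assigned:
  clobberable_versions_start = max(tib$version),
  # we have observed all updates through this version:
  versions_end = max(tib$version)
)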
#' -#' @details This simply a wrapper around the `new()` method of the `epi_archive` -#' class, so for example: -#' ``` -#' x <- as_epi_archive(df, geo_type = "state", time_type = "day") -#' ``` -#' would be equivalent to: -#' ``` -#' x <- epi_archive$new(df, geo_type = "state", time_type = "day") -#' ``` +#' @importFrom data.table as.data.table key setkeyv +#' @importFrom dplyr if_any if_all everything #' +#' @name epi_archive #' @export +#' #' @examples #' # Simple ex. with necessary keys #' tib <- tibble::tibble( @@ -875,17 +281,377 @@ epi_archive <- #' time_type = "day", #' other_keys = "county" #' ) -as_epi_archive <- function(x, geo_type, time_type, other_keys, +#' +new_epi_archive <- function( + x, + geo_type = NULL, + time_type = NULL, + other_keys = NULL, + additional_metadata = NULL, + compactify = NULL, + clobberable_versions_start = NA, + versions_end = NULL) { + assert_data_frame(x) + if (!test_subset(c("geo_value", "time_value", "version"), names(x))) { + cli_abort( + "Columns `geo_value`, `time_value`, and `version` must be present in `x`." + ) + } + if (anyMissing(x$version)) { + cli_abort("Column `version` must not contain missing values.") + } + + # If geo type is missing, then try to guess it + if (is.null(geo_type)) { + geo_type <- guess_geo_type(x$geo_value) + } + + # If time type is missing, then try to guess it + if (missing(time_type) || is.null(time_type)) { + time_type <- guess_time_type(x$time_value) + } + + # Finish off with small checks on keys variables and metadata + if (missing(other_keys)) other_keys <- NULL + if (missing(additional_metadata) || is.null(additional_metadata)) additional_metadata <- list() + if (!test_subset(other_keys, names(x))) { + cli_abort("`other_keys` must be contained in the column names of `x`.") + } + if (any(c("geo_value", "time_value", "version") %in% other_keys)) { + cli_abort("`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\".") + } + if (any(names(additional_metadata) %in% c("geo_type", "time_type"))) { + cli_warn("`additional_metadata` names overlap with existing metadata fields \"geo_type\", \"time_type\".") + } + + # Conduct checks and apply defaults for `compactify` + if (missing(compactify)) { + compactify <- NULL + } + assert_logical(compactify, len = 1, null.ok = TRUE) + + # Apply defaults and conduct checks for + # `clobberable_versions_start`, `versions_end`: + if (missing(clobberable_versions_start)) { + clobberable_versions_start <- NA + } + if (missing(versions_end) || is.null(versions_end)) { + versions_end <- max_version_with_row_in(x) + } + validate_version_bound(clobberable_versions_start, x, na_ok = TRUE) + validate_version_bound(versions_end, x, na_ok = FALSE) + if (nrow(x) > 0L && versions_end < max(x[["version"]])) { + cli_abort( + sprintf( + "`versions_end` was %s, but `x` contained + updates for a later version or versions, up through %s", + versions_end, max(x[["version"]]) + ), + class = "epiprocess__versions_end_earlier_than_updates" + ) + } + if (!is.na(clobberable_versions_start) && clobberable_versions_start > versions_end) { + cli_abort( + sprintf( + "`versions_end` was %s, but a `clobberable_versions_start` + of %s indicated that there were later observed versions", + versions_end, clobberable_versions_start + ), + class = "epiprocess__versions_end_earlier_than_clobberable_versions_start" + ) + } + + # --- End of validation and replacing missing args with defaults --- + + # Create the data table; if x was an un-keyed data.table itself, + # then the call to as.data.table() 
will fail to set keys, so we + # need to check this, then do it manually if needed + key_vars <- c("geo_value", "time_value", other_keys, "version") + DT <- as.data.table(x, key = key_vars) # nolint: object_name_linter + if (!identical(key_vars, key(DT))) setkeyv(DT, cols = key_vars) + + maybe_first_duplicate_key_row_index <- anyDuplicated(DT, by = key(DT)) + if (maybe_first_duplicate_key_row_index != 0L) { + cli_abort("`x` must have one row per unique combination of the key variables. If you + have additional key variables other than `geo_value`, `time_value`, and + `version`, such as an age group column, please specify them in `other_keys`. + Otherwise, check for duplicate rows and/or conflicting values for the same + measurement.", + class = "epiprocess__epi_archive_requires_unique_key" + ) + } + + # Checks to see if a value in a vector is LOCF + is_locf <- function(vec) { # nolint: object_usage_linter + dplyr::if_else(!is.na(vec) & !is.na(dplyr::lag(vec)), + vec == dplyr::lag(vec), + is.na(vec) & is.na(dplyr::lag(vec)) + ) + } + + # LOCF is defined by a row where all values except for the version + # differ from their respective lag values + + # Checks for LOCF's in a data frame + rm_locf <- function(df) { + dplyr::filter(df, if_any(c(everything(), -version), ~ !is_locf(.))) # nolint: object_usage_linter + } + + # Keeps LOCF values, such as to be printed + keep_locf <- function(df) { + dplyr::filter(df, if_all(c(everything(), -version), ~ is_locf(.))) # nolint: object_usage_linter + } + + # Runs compactify on data frame + if (is.null(compactify) || compactify == TRUE) { + elim <- keep_locf(DT) + DT <- rm_locf(DT) # nolint: object_name_linter + } else { + # Create empty data frame for nrow(elim) to be 0 + elim <- tibble::tibble() + } + + # Warns about redundant rows + if (is.null(compactify) && nrow(elim) > 0) { + warning_intro <- cli::format_inline( + "Found rows that appear redundant based on + last (version of each) observation carried forward; + these rows have been removed to 'compactify' and save space:", + keep_whitespace = FALSE + ) + warning_data <- paste(collapse = "\n", capture.output(print(elim, topn = 3L, nrows = 7L))) + warning_outro <- cli::format_inline( + "Built-in `epi_archive` functionality should be unaffected, + but results may change if you work directly with its fields (such as `DT`). + See `?as_epi_archive` for details. + To silence this warning but keep compactification, + you can pass `compactify=TRUE` when constructing the archive.", + keep_whitespace = FALSE + ) + warning_message <- paste(sep = "\n", warning_intro, warning_data, warning_outro) + rlang::warn(warning_message, class = "epiprocess__compactify_default_removed_rows") + } + + structure( + list( + DT = DT, + geo_type = geo_type, + time_type = time_type, + additional_metadata = additional_metadata, + clobberable_versions_start = clobberable_versions_start, + versions_end = versions_end + ), + class = "epi_archive" + ) +} + + +#' `as_epi_archive` converts a data frame, data table, or tibble into an +#' `epi_archive` object. 
+#' +#' @rdname epi_archive +#' +#' @export +as_epi_archive <- function(x, geo_type = NULL, time_type = NULL, other_keys = NULL, additional_metadata = list(), compactify = NULL, clobberable_versions_start = NA, versions_end = max_version_with_row_in(x)) { - epi_archive$new( + new_epi_archive( x, geo_type, time_type, other_keys, additional_metadata, compactify, clobberable_versions_start, versions_end ) } + +#' Print information about an `epi_archive` object +#' +#' @param x An `epi_archive` object. +#' @param ... Should be empty, there to satisfy the S3 generic. +#' @param class Boolean; whether to print the class label header +#' @param methods Boolean; whether to print all available methods of +#' the archive +#' +#' @importFrom cli cli_inform +#' @importFrom rlang check_dots_empty +#' @export +print.epi_archive <- function(x, ..., class = TRUE, methods = TRUE) { + if (rlang::dots_n(...) > 0) { + cli_abort(c( + "Error in print.epi_archive()", + "i" = "Too many arguments passed to `print.epi_archive()`." + )) + } + + cli_inform( + c( + ">" = if (class) "An `epi_archive` object, with metadata:", + "i" = if (length(setdiff(key(x$DT), c("geo_value", "time_value", "version"))) > 0) { + "Non-standard DT keys: {setdiff(key(x$DT), c('geo_value', 'time_value', 'version'))}" + }, + "i" = "Min/max time values: {min(x$DT$time_value)} / {max(x$DT$time_value)}", + "i" = "First/last version with update: {min(x$DT$version)} / {max(x$DT$version)}", + "i" = if (!is.na(x$clobberable_versions_start)) { + "Clobberable versions start: {x$clobberable_versions_start}" + }, + "i" = "Versions end: {x$versions_end}", + "i" = "A preview of the table ({nrow(x$DT)} rows x {ncol(x$DT)} columns):" + ) + ) + + print(x$DT[]) + return(invisible(x)) +} + + +#' `group_by` and related methods for `epi_archive`, `grouped_epi_archive` +#' +#' @param .data An `epi_archive` or `grouped_epi_archive` +#' @param ... Similar to [`dplyr::group_by`] (see "Details:" for edge cases); +#' * For `group_by`: unquoted variable name(s) or other +#' ["data masking"][dplyr::dplyr_data_masking] expression(s). It's possible to +#' use [`dplyr::mutate`]-like syntax here to calculate new columns on which to +#' perform grouping, but note that, if you are regrouping an already-grouped +#' `.data` object, the calculations will be carried out ignoring such grouping +#' (same as [in dplyr][dplyr::group_by]). +#' * For `ungroup`: either +#' * empty, in order to remove the grouping and output an `epi_archive`; or +#' * variable name(s) or other ["tidy-select"][dplyr::dplyr_tidy_select] +#' expression(s), in order to remove the matching variables from the list of +#' grouping variables, and output another `grouped_epi_archive`. +#' @param .add Boolean. If `FALSE`, the default, the output will be grouped by +#' the variable selection from `...` only; if `TRUE`, the output will be +#' grouped by the current grouping variables plus the variable selection from +#' `...`. +#' @param .drop As described in [`dplyr::group_by`]; determines treatment of +#' factor columns. 
+#' @param x For `groups` or `ungroup`: a `grouped_epi_archive`; for +#' `is_grouped_epi_archive`: any object +#' @param .tbl (For `group_by_drop_default`:) an `epi_archive` or +#' `grouped_epi_archive` (`epi_archive` dispatches to the S3 default method; +#' `grouped_epi_archive` dispatches its own S3 method) +#' +#' @details +#' +#' To match `dplyr`, `group_by` allows "data masking" (also referred to as +#' "tidy evaluation") expressions `...`, not just column names, in a way similar +#' to `mutate`. Note that replacing or removing key columns with these +#' expressions is disabled. +#' +#' `archive %>% group_by()` and other expressions that group or regroup by zero +#' columns (indicating that all rows should be treated as part of one large +#' group) will output a `grouped_epi_archive`, in order to enable the use of +#' `grouped_epi_archive` methods on the result. This is in slight contrast to +#' the same operations on tibbles and grouped tibbles, which will *not* output a +#' `grouped_df` in these circumstances. +#' +#' Using `group_by` with `.add=FALSE` to override the existing grouping is +#' disabled; instead, `ungroup` first then `group_by`. +#' +#' `group_by_drop_default` on (ungrouped) `epi_archive`s is expected to dispatch +#' to `group_by_drop_default.default` (but there is a dedicated method for +#' `grouped_epi_archive`s). +#' +#' @examples +#' +#' grouped_archive <- archive_cases_dv_subset %>% group_by(geo_value) +#' +#' # `print` for metadata and method listing: +#' grouped_archive %>% print() +#' +#' # The primary use for grouping is to perform a grouped `epix_slide`: +#' +#' archive_cases_dv_subset %>% +#' group_by(geo_value) %>% +#' epix_slide( +#' f = ~ mean(.x$case_rate_7d_av), +#' before = 2, +#' ref_time_values = as.Date("2020-06-11") + 0:2, +#' new_col_name = "case_rate_3d_av" +#' ) %>% +#' ungroup() +#' +#' # ----------------------------------------------------------------- +#' +#' # Advanced: some other features of dplyr grouping are implemented: +#' +#' library(dplyr) +#' toy_archive <- +#' tribble( +#' ~geo_value, ~age_group, ~time_value, ~version, ~value, +#' "us", "adult", "2000-01-01", "2000-01-02", 121, +#' "us", "pediatric", "2000-01-02", "2000-01-03", 5, # (addition) +#' "us", "adult", "2000-01-01", "2000-01-03", 125, # (revision) +#' "us", "adult", "2000-01-02", "2000-01-03", 130 # (addition) +#' ) %>% +#' mutate( +#' age_group = ordered(age_group, c("pediatric", "adult")), +#' time_value = as.Date(time_value), +#' version = as.Date(version) +#' ) %>% +#' as_epi_archive(other_keys = "age_group") +#' +#' # The following are equivalent: +#' toy_archive %>% group_by(geo_value, age_group) +#' toy_archive %>% +#' group_by(geo_value) %>% +#' group_by(age_group, .add = TRUE) +#' grouping_cols <- c("geo_value", "age_group") +#' toy_archive %>% group_by(across(all_of(grouping_cols))) +#' +#' # And these are equivalent: +#' toy_archive %>% group_by(geo_value) +#' toy_archive %>% +#' group_by(geo_value, age_group) %>% +#' ungroup(age_group) +#' +#' # To get the grouping variable names as a `list` of `name`s (a.k.a. 
symbols): +#' toy_archive %>% +#' group_by(geo_value) %>% +#' groups() +#' +#' toy_archive %>% +#' group_by(geo_value, age_group, .drop = FALSE) %>% +#' epix_slide(f = ~ sum(.x$value), before = 20) %>% +#' ungroup() +#' +#' @importFrom dplyr group_by +#' @export +#' +#' @aliases grouped_epi_archive +group_by.epi_archive <- function(.data, ..., .add = FALSE, .drop = dplyr::group_by_drop_default(.data)) { + # `add` makes no difference; this is an ungrouped `epi_archive`. + detailed_mutate <- epix_detailed_restricted_mutate(.data, ...) + assert_logical(.drop) + if (!.drop) { + grouping_cols <- as.list(detailed_mutate[["archive"]][["DT"]])[detailed_mutate[["request_names"]]] + grouping_col_is_factor <- purrr::map_lgl(grouping_cols, is.factor) + # ^ Use `as.list` to try to avoid any possibility of a deep copy. + if (!any(grouping_col_is_factor)) { + cli_warn( + "`.drop=FALSE` but there are no factor grouping columns; + did you mean to convert one of the columns to a factor beforehand?", + class = "epiprocess__group_by_epi_archive__drop_FALSE_no_factors" + ) + } else if (any(diff(grouping_col_is_factor) == -1L)) { + cli_warn( + "`.drop=FALSE` but there are one or more non-factor grouping columns listed + after a factor grouping column; this may produce groups with `NA`s for these + columns; see https://github.com/tidyverse/dplyr/issues/5369#issuecomment-683762553; + depending on how you want completion to work, you might instead want to convert all + grouping columns to factors beforehand, specify the non-factor grouping columns first, + or use `.drop=TRUE` and add a call to `tidyr::complete`.", + class = "epiprocess__group_by_epi_archive__drop_FALSE_nonfactor_after_factor" + ) + } + } + new_grouped_epi_archive(detailed_mutate[["archive"]], + detailed_mutate[["request_names"]], + drop = .drop + ) +} + + #' Test for `epi_archive` format #' #' @param x An object. @@ -901,7 +667,7 @@ as_epi_archive <- function(x, geo_type, time_type, other_keys, #' # By default, grouped_epi_archives don't count as epi_archives, as they may #' # support a different set of operations from regular `epi_archives`. This #' # behavior can be controlled by `grouped_okay`. -#' grouped_archive <- archive_cases_dv_subset$group_by(geo_value) +#' grouped_archive <- archive_cases_dv_subset %>% group_by(geo_value) #' is_epi_archive(grouped_archive) # FALSE #' is_epi_archive(grouped_archive, grouped_okay = TRUE) # TRUE #' @@ -909,3 +675,22 @@ as_epi_archive <- function(x, geo_type, time_type, other_keys, is_epi_archive <- function(x, grouped_okay = FALSE) { inherits(x, "epi_archive") || grouped_okay && inherits(x, "grouped_epi_archive") } + + +#' Clone an `epi_archive` object. +#' +#' @param x An `epi_archive` object. +#' +#' @importFrom data.table copy +#' @export +clone <- function(x) { + UseMethod("clone") +} + + +#' @rdname clone +#' @export +clone.epi_archive <- function(x) { + x$DT <- data.table::copy(x$DT) + return(x) +} diff --git a/R/archive_new.R b/R/archive_new.R deleted file mode 100644 index 0b4f3695..00000000 --- a/R/archive_new.R +++ /dev/null @@ -1,1115 +0,0 @@ -# We use special features of data.table's `[`. The data.table package has a -# compatibility feature that disables some/all of these features if it thinks we -# might expect `data.frame`-compatible behavior instead. We can signal that we -# want the special behavior via `.datatable.aware = TRUE` or by importing any -# `data.table` package member. Do both to prevent surprises if we decide to use -# `data.table::` everywhere and not importing things. 
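# An illustrative (hypothetical) use of the special `[` semantics this flag
# enables; `.()` and `by =` are data.table features that plain data.frames
# lack, e.g.:
#
#   DT[version <= max_version,
#      .(value = data.table::last(value)),
#      by = .(geo_value, time_value)]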
-.datatable.aware <- TRUE - -#' Validate a version bound arg -#' -#' Expected to be used on `clobberable_versions_start`, `versions_end`, -#' and similar arguments. Some additional context-specific checks may be needed. -#' -#' @param version_bound the version bound to validate -#' @param x a data frame containing a version column with which to check -#' compatibility -#' @param na_ok Boolean; is `NA` an acceptable "bound"? (If so, `NA` will -#' have a special context-dependent meaning.) -#' @param version_bound_arg optional string; what to call the version bound in -#' error messages -#' -#' @section Side effects: raises an error if version bound appears invalid -#' -#' @noRd -validate_version_bound <- function(version_bound, x, na_ok = FALSE, - version_bound_arg = rlang::caller_arg(version_bound), - x_arg = rlang::caller_arg(version_bound)) { - if (is.null(version_bound)) { - cli_abort( - "{version_bound_arg} cannot be NULL" - ) - } - if (na_ok && is.na(version_bound)) { - return(invisible(NULL)) - } - if (!test_set_equal(class(version_bound), class(x[["version"]]))) { - cli_abort( - "{version_bound_arg} must have the same classes as x$version, - which is {class(x$version)}", - ) - } - if (!test_set_equal(typeof(version_bound), typeof(x[["version"]]))) { - cli_abort( - "{version_bound_arg} must have the same types as x$version, - which is {typeof(x$version)}", - ) - } - - return(invisible(NULL)) -} - -#' `max(x$version)`, with error if `x` has 0 rows -#' -#' Exported to make defaults more easily copyable. -#' -#' @param x `x` argument of [`as_epi_archive`] -#' -#' @return `max(x$version)` if it has any rows; raises error if it has 0 rows or -#' an `NA` version value -#' -#' @export -max_version_with_row_in <- function(x) { - if (nrow(x) == 0L) { - cli_abort( - "`nrow(x)==0L`, representing a data set history with no row up through the - latest observed version, but we don't have a sensible guess at what version - that is, or whether any of the empty versions might be clobbered in the - future; if we use `x` to form an `epi_archive`, then - `clobberable_versions_start` and `versions_end` must be manually specified.", - class = "epiprocess__max_version_cannot_be_used" - ) - } else { - version_col <- purrr::pluck(x, "version") # error not NULL if doesn't exist - if (anyNA(version_col)) { - cli_abort("version values cannot be NA", - class = "epiprocess__version_values_must_not_be_na" - ) - } else { - version_bound <- max(version_col) - } - } -} - -#' Get the next possible value greater than `x` of the same type -#' -#' @param x the starting "value"(s) -#' @return same class, typeof, and length as `x` -#' -#' @export -next_after <- function(x) UseMethod("next_after") - -#' @export -next_after.integer <- function(x) x + 1L - -#' @export -next_after.Date <- function(x) x + 1L - - - -#' epi archive -#' @title `epi_archive` object -#' -#' @description An `epi_archive` is an R6 class which contains a data table -#' along with several relevant pieces of metadata. The data table can be seen -#' as the full archive (version history) for some signal variables of -#' interest. -#' -#' @details An `epi_archive` is an R6 class which contains a data table `DT`, of -#' class `data.table` from the `data.table` package, with (at least) the -#' following columns: -#' -#' * `geo_value`: the geographic value associated with each row of measurements. -#' * `time_value`: the time value associated with each row of measurements. -#' * `version`: the time value specifying the version for each row of -#' measurements. 
For example, if in a given row the `version` is January 15, -#' 2022 and `time_value` is January 14, 2022, then this row contains the -#' measurements of the data for January 14, 2022 that were available one day -#' later. -#' -#' The data table `DT` has key variables `geo_value`, `time_value`, `version`, -#' as well as any others (these can be specified when instantiating the -#' `epi_archive` object via the `other_keys` argument, and/or set by operating -#' on `DT` directly). Refer to the documentation for [as_epi_archive()] for -#' information and examples of relevant parameter names for an `epi_archive` object. -#' Note that there can only be a single row per unique combination of -#' key variables, and thus the key variables are critical for figuring out how -#' to generate a snapshot of data from the archive, as of a given version. -#' -#' In general, the last version of each observation is carried forward (LOCF) to -#' fill in data between recorded versions, and between the last recorded -#' update and the `versions_end`. One consequence is that the `DT` -#' doesn't have to contain a full snapshot of every version (although this -#' generally works), but can instead contain only the rows that are new or -#' changed from the previous version (see `compactify`, which does this -#' automatically). Currently, deletions must be represented as revising a row -#' to a special state (e.g., making the entries `NA` or including a special -#' column that flags the data as removed and performing some kind of -#' post-processing), and the archive is unaware of what this state is. Note -#' that `NA`s *can* be introduced by `epi_archive` methods for other reasons, -#' e.g., in [`epix_fill_through_version`] and [`epix_merge`], if requested, to -#' represent potential update data that we do not yet have access to; or in -#' [`epix_merge`] to represent the "value" of an observation before the -#' version in which it was first released, or if no version of that -#' observation appears in the archive data at all. -#' -#' **A word of caution:** R6 objects, unlike most other objects in R, have -#' reference semantics. A primary consequence of this is that objects are not -#' copied when modified. You can read more about this in Hadley Wickham's -#' [Advanced R](https://adv-r.hadley.nz/r6.html#r6-semantics) book. In order -#' to construct a modified archive while keeping the original intact, first -#' make a clone using the `$clone` method, then overwrite the clone's `DT` -#' field with `data.table::copy(clone$DT)`, and finally perform the -#' modifications on the clone. -#' -#' @section Metadata: -#' The following pieces of metadata are included as fields in an `epi_archive` -#' object: -#' -#' * `geo_type`: the type for the geo values. -#' * `time_type`: the type for the time values. -#' * `additional_metadata`: list of additional metadata for the data archive. -#' -#' Unlike an `epi_df` object, metadata for an `epi_archive` object `x` can be -#' accessed (and altered) directly, as in `x$geo_type` or `x$time_type`, -#' etc. Like an `epi_df` object, the `geo_type` and `time_type` fields in the -#' metadata of an `epi_archive` object are not currently used by any -#' downstream functions in the `epiprocess` package, and serve only as useful -#' bits of information to convey about the data set at hand. 
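For instance, assuming `arch` is any archive (such as the toy one built
earlier), its metadata fields read as plain elements:

arch$geo_type            # e.g. "state"
arch$time_type           # e.g. "day"
arch$additional_metadata # named list; empty unless supplied at construction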
-#' -#' @section Generating Snapshots: -#' An `epi_archive` object can be used to generate a snapshot of the data in -#' `epi_df` format, which represents the most up-to-date values of the signal -#' variables, as of the specified version. This is accomplished by calling the -#' `as_of()` method for an `epi_archive` object `x`. More details on this -#' method are documented in the wrapper function [`epix_as_of()`]. -#' -#' @section Sliding Computations: -#' We can run a sliding computation over an `epi_archive` object, much like -#' `epi_slide()` does for an `epi_df` object. This is accomplished by calling -#' the `slide()` method for an `epi_archive` object, which works similarly to -#' the way `epi_slide()` works for an `epi_df` object, but with one key -#' difference: it is version-aware. That is, for an `epi_archive` object, the -#' sliding computation at any given reference time point t is performed on -#' **data that would have been available as of t**. More details on `slide()` -#' are documented in the wrapper function [`epix_slide()`]. -#' -#' @export -#' @examples -#' tib <- tibble::tibble( -#' geo_value = rep(c("ca", "hi"), each = 5), -#' time_value = rep(seq(as.Date("2020-01-01"), -#' by = 1, length.out = 5 -#' ), times = 2), -#' version = rep(seq(as.Date("2020-01-02"), -#' by = 1, length.out = 5 -#' ), times = 2), -#' value = rnorm(10, mean = 2, sd = 1) -#' ) -#' -#' toy_epi_archive <- tib %>% new_epi_archive2( -#' geo_type = "state", -#' time_type = "day" -#' ) -#' toy_epi_archive -#' @name epi_archive -# TODO: Figure out where to actually put this documentation -NULL - -#' New epi archive -#' @description Creates a new `epi_archive` object. -#' @param x A data.frame, data.table, or tibble, with columns `geo_value`, -#' `time_value`, `version`, and then any additional number of columns. -#' @param geo_type Type for the geo values. If missing, then the function will -#' attempt to infer it from the geo values present; if this fails, then it -#' will be set to "custom". -#' @param time_type Type for the time values. If missing, then the function will -#' attempt to infer it from the time values present; if this fails, then it -#' will be set to "custom". -#' @param other_keys Character vector specifying the names of variables in `x` -#' that should be considered key variables (in the language of `data.table`) -#' apart from "geo_value", "time_value", and "version". -#' @param additional_metadata List of additional metadata to attach to the -#' `epi_archive` object. The metadata will have `geo_type` and `time_type` -#' fields; named entries from the passed list or will be included as well. -#' @param compactify Optional; Boolean or `NULL`: should we remove rows that are -#' considered redundant for the purposes of `epi_archive`'s built-in methods -#' such as `as_of`? As these methods use the last version of each observation -#' carried forward (LOCF) to interpolate between the version data provided, -#' rows that don't change these LOCF results can potentially be omitted to -#' save space while maintaining the same behavior (with the help of the -#' `clobberable_versions_start` and `versions_end` fields in some edge cases). -#' `TRUE` will remove these rows, `FALSE` will not, and missing or `NULL` will -#' remove these rows and issue a warning. 
Generally, this can be set to -#' `TRUE`, but if you directly inspect or edit the fields of the `epi_archive` -#' such as its `DT`, or rely on redundant updates to achieve a certain -#' behavior of the `ref_time_values` default in `epix_slide`, you will have to -#' determine whether `compactify=TRUE` will produce the desired results. If -#' compactification here is removing a large proportion of the rows, this may -#' indicate a potential for space, time, or bandwidth savings upstream the -#' data pipeline, e.g., by avoiding fetching, storing, or processing these -#' rows of `x`. -#' @param clobberable_versions_start Optional; as in [`as_epi_archive`] -#' @param versions_end Optional; as in [`as_epi_archive`] -#' @return An `epi_archive` object. -#' @importFrom data.table as.data.table key setkeyv -#' -#' @details -#' Refer to the documentation for [as_epi_archive()] for more information -#' and examples of parameter names. -#' @export -new_epi_archive2 <- function( - x, - geo_type = NULL, - time_type = NULL, - other_keys = NULL, - additional_metadata = NULL, - compactify = NULL, - clobberable_versions_start = NA, - versions_end = NULL) { - assert_data_frame(x) - if (!test_subset(c("geo_value", "time_value", "version"), names(x))) { - cli_abort( - "Columns `geo_value`, `time_value`, and `version` must be present in `x`." - ) - } - if (anyMissing(x$version)) { - cli_abort("Column `version` must not contain missing values.") - } - - # If geo type is missing, then try to guess it - if (missing(geo_type) || is.null(geo_type)) { - geo_type <- guess_geo_type(x$geo_value) - } - - # If time type is missing, then try to guess it - if (missing(time_type) || is.null(time_type)) { - time_type <- guess_time_type(x$time_value) - } - - # Finish off with small checks on keys variables and metadata - if (missing(other_keys)) other_keys <- NULL - if (missing(additional_metadata) || is.null(additional_metadata)) additional_metadata <- list() - if (!test_subset(other_keys, names(x))) { - cli_abort("`other_keys` must be contained in the column names of `x`.") - } - if (any(c("geo_value", "time_value", "version") %in% other_keys)) { - cli_abort("`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\".") - } - if (any(names(additional_metadata) %in% c("geo_type", "time_type"))) { - cli_warn("`additional_metadata` names overlap with existing metadata fields \"geo_type\", \"time_type\".") - } - - # Conduct checks and apply defaults for `compactify` - if (missing(compactify)) { - compactify <- NULL - } - assert_logical(compactify, len = 1, null.ok = TRUE) - - # Apply defaults and conduct checks for - # `clobberable_versions_start`, `versions_end`: - if (missing(clobberable_versions_start)) { - clobberable_versions_start <- NA - } - if (missing(versions_end) || is.null(versions_end)) { - versions_end <- max_version_with_row_in(x) - } - validate_version_bound(clobberable_versions_start, x, na_ok = TRUE) - validate_version_bound(versions_end, x, na_ok = FALSE) - if (nrow(x) > 0L && versions_end < max(x[["version"]])) { - cli_abort( - sprintf( - "`versions_end` was %s, but `x` contained - updates for a later version or versions, up through %s", - versions_end, max(x[["version"]]) - ), - class = "epiprocess__versions_end_earlier_than_updates" - ) - } - if (!is.na(clobberable_versions_start) && clobberable_versions_start > versions_end) { - cli_abort( - sprintf( - "`versions_end` was %s, but a `clobberable_versions_start` - of %s indicated that there were later observed versions", - versions_end, 
clobberable_versions_start - ), - class = "epiprocess__versions_end_earlier_than_clobberable_versions_start" - ) - } - - # --- End of validation and replacing missing args with defaults --- - - # Create the data table; if x was an un-keyed data.table itself, - # then the call to as.data.table() will fail to set keys, so we - # need to check this, then do it manually if needed - key_vars <- c("geo_value", "time_value", other_keys, "version") - DT <- as.data.table(x, key = key_vars) - if (!identical(key_vars, key(DT))) setkeyv(DT, cols = key_vars) - - maybe_first_duplicate_key_row_index <- anyDuplicated(DT, by = key(DT)) - if (maybe_first_duplicate_key_row_index != 0L) { - cli_abort("`x` must have one row per unique combination of the key variables. If you - have additional key variables other than `geo_value`, `time_value`, and - `version`, such as an age group column, please specify them in `other_keys`. - Otherwise, check for duplicate rows and/or conflicting values for the same - measurement.", - class = "epiprocess__epi_archive_requires_unique_key" - ) - } - - # Checks to see if a value in a vector is LOCF - is_locf <- function(vec) { - dplyr::if_else(!is.na(vec) & !is.na(dplyr::lag(vec)), - vec == dplyr::lag(vec), - is.na(vec) & is.na(dplyr::lag(vec)) - ) - } - - # LOCF is defined by a row where all values except for the version - # differ from their respective lag values - - # Checks for LOCF's in a data frame - rm_locf <- function(df) { - dplyr::filter(df, if_any(c(everything(), -version), ~ !is_locf(.))) - } - - # Keeps LOCF values, such as to be printed - keep_locf <- function(df) { - dplyr::filter(df, if_all(c(everything(), -version), ~ is_locf(.))) - } - - # Runs compactify on data frame - if (is.null(compactify) || compactify == TRUE) { - elim <- keep_locf(DT) - DT <- rm_locf(DT) - } else { - # Create empty data frame for nrow(elim) to be 0 - elim <- tibble::tibble() - } - - # Warns about redundant rows - if (is.null(compactify) && nrow(elim) > 0) { - warning_intro <- cli::format_inline( - "Found rows that appear redundant based on - last (version of each) observation carried forward; - these rows have been removed to 'compactify' and save space:", - keep_whitespace = FALSE - ) - warning_data <- paste(collapse = "\n", capture.output(print(elim, topn = 3L, nrows = 7L))) - warning_outro <- cli::format_inline( - "Built-in `epi_archive` functionality should be unaffected, - but results may change if you work directly with its fields (such as `DT`). - See `?as_epi_archive` for details. 
- To silence this warning but keep compactification, - you can pass `compactify=TRUE` when constructing the archive.", - keep_whitespace = FALSE - ) - warning_message <- paste(sep = "\n", warning_intro, warning_data, warning_outro) - rlang::warn(warning_message, class = "epiprocess__compactify_default_removed_rows") - } - - structure( - list( - DT = DT, - geo_type = geo_type, - time_type = time_type, - additional_metadata = additional_metadata, - clobberable_versions_start = clobberable_versions_start, - versions_end = versions_end, - private = list() # TODO: to be encapsulated with guard-rails later - ), - class = "epi_archive2" - ) -} - -#' Print information about an `epi_archive` object -#' @param class Boolean; whether to print the class label header -#' @param methods Boolean; whether to print all available methods of -#' the archive -#' @importFrom cli cli_inform -#' @export -print.epi_archive2 <- function(epi_archive, class = TRUE, methods = TRUE) { - cli_inform( - c( - ">" = if (class) "An `epi_archive` object, with metadata:", - "i" = if (length(setdiff(key(epi_archive$DT), c("geo_value", "time_value", "version"))) > 0) { - "Non-standard DT keys: {setdiff(key(epi_archive$DT), c('geo_value', 'time_value', 'version'))}" - }, - "i" = "Min/max time values: {min(epi_archive$DT$time_value)} / {max(epi_archive$DT$time_value)}", - "i" = "First/last version with update: {min(epi_archive$DT$version)} / {max(epi_archive$DT$version)}", - "i" = if (!is.na(epi_archive$clobberable_versions_start)) { - "Clobberable versions start: {epi_archive$clobberable_versions_start}" - }, - "i" = "Versions end: {epi_archive$versions_end}", - "i" = if (methods) "Public R6 methods: {names(epi_archive$public_methods)}", - "i" = "A preview of the table ({nrow(epi_archive$DT)} rows x {ncol(epi_archive$DT)} columns):" - ) - ) - - return(invisible(epi_archive$DT %>% print())) -} - - -#' @export -as_of <- function(x, ...) { - UseMethod("as_of") -} - - -#' As of epi_archive -#' @description Generates a snapshot in `epi_df` format as of a given version. -#' See the documentation for the wrapper function [`epix_as_of()`] for -#' details. The parameter descriptions below are copied from there -#' @param epi_archive An `epi_archive` object -#' @param max_version Version specifying the max version to permit in the -#' snapshot. That is, the snapshot will comprise the unique rows of the -#' current archive data that represent the most up-to-date signal values, as -#' of the specified `max_version` (and whose `time_value`s are at least -#' `min_time_value`). -#' @param min_time_value Time value specifying the min `time_value` to permit in -#' the snapshot. Default is `-Inf`, which effectively means that there is no -#' minimum considered. -#' @param all_versions Boolean; If `all_versions = TRUE`, then the output will be in -#' `epi_archive` format, and contain rows in the specified `time_value` range -#' having `version <= max_version`. The resulting object will cover a -#' potentially narrower `version` and `time_value` range than `x`, depending -#' on user-provided arguments. Otherwise, there will be one row in the output -#' for the `max_version` of each `time_value`. Default is `FALSE`. 
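# A minimal sketch of the snapshot semantics described above, on a toy
# archive (the `toy_versions` name and values are illustrative only):
toy_versions <- tibble::tibble(
  geo_value = "ca",
  time_value = as.Date("2020-01-01") + c(0, 0, 1),
  version = as.Date("2020-01-02") + c(0, 1, 1),
  value = c(1, 2, 3)
) %>% as_epi_archive2()
as_of(toy_versions, max_version = as.Date("2020-01-02"))
# -> one row: 2020-01-01 with value 1, as first reported
as_of(toy_versions, max_version = as.Date("2020-01-03"))
# -> 2020-01-01 revised to 2, plus the newly reported 2020-01-02 row (value 3)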
-#' @importFrom data.table between key -#' @export -as_of.epi_archive2 <- function(epi_archive, max_version, min_time_value = -Inf, all_versions = FALSE) { - other_keys <- setdiff( - key(epi_archive$DT), - c("geo_value", "time_value", "version") - ) - if (length(other_keys) == 0) other_keys <- NULL - - # Check a few things on max_version - if (!test_set_equal(class(max_version), class(epi_archive$DT$version))) { - cli_abort( - "`max_version` must have the same classes as `epi_archive$DT$version`." - ) - } - if (!test_set_equal(typeof(max_version), typeof(epi_archive$DT$version))) { - cli_abort( - "`max_version` must have the same types as `epi_archive$DT$version`." - ) - } - assert_scalar(max_version, na.ok = FALSE) - if (max_version > epi_archive$versions_end) { - cli_abort("`max_version` must be at most `epi_archive$versions_end`.") - } - assert_logical(all_versions, len = 1) - if (!is.na(epi_archive$clobberable_versions_start) && max_version >= epi_archive$clobberable_versions_start) { - cli_warn( - 'Getting data as of some recent version which could still be - overwritten (under routine circumstances) without assigning a new - version number (a.k.a. "clobbered"). Thus, the snapshot that we - produce here should not be expected to be reproducible later. See - `?epi_archive` for more info and `?epix_as_of` on how to muffle.', - class = "epiprocess__snapshot_as_of_clobberable_version" - ) - } - - # Filter by version and return - if (all_versions) { - # epi_archive is copied into result, so we can modify result directly - result <- epix_truncate_versions_after(epi_archive, max_version) - result$DT <- result$DT[time_value >= min_time_value, ] - return(result) - } - - # Make sure to use data.table ways of filtering and selecting - as_of_epi_df <- epi_archive$DT[time_value >= min_time_value & version <= max_version, ] %>% - unique( - by = c("geo_value", "time_value", other_keys), - fromLast = TRUE - ) %>% - tibble::as_tibble() %>% - dplyr::select(-"version") %>% - as_epi_df( - geo_type = epi_archive$geo_type, - time_type = epi_archive$time_type, - as_of = max_version, - additional_metadata = c(epi_archive$additional_metadata, - other_keys = other_keys - ) - ) - - return(as_of_epi_df) -} - - -#' @export -fill_through_version <- function(x, ...) { - UseMethod("fill_through_version") -} - - -#' Fill through version -#' @description Fill in unobserved history using requested scheme by mutating -#' the given object and potentially reseating its fields. See -#' [`epix_fill_through_version`], which doesn't mutate the input archive but -#' might alias its fields. -#' -#' @param epi_archive an `epi_archive` object -#' @param fill_versions_end as in [`epix_fill_through_version`] -#' @param how as in [`epix_fill_through_version`] -#' -#' @importFrom data.table key setkeyv := address copy -#' @importFrom rlang arg_match -fill_through_version.epi_archive2 <- function( - epi_archive, - fill_versions_end, - how = c("na", "locf")) { - validate_version_bound(fill_versions_end, epi_archive$DT, na_ok = FALSE) - how <- arg_match(how) - if (epi_archive$versions_end < fill_versions_end) { - new_DT <- switch(how, - "na" = { - # old DT + a version consisting of all NA observations - # immediately after the last currently/actually-observed - # version. Note that this NA-observation version must only be - # added if `epi_archive` is outdated. 
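# (In outline, the "na" branch below: find the first version strictly after
# `versions_end`, take one row per non-version key combination ever recorded,
# stamp those rows with that next version and all-NA measurement columns,
# and rbind them onto the old DT, restoring the key.)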
- nonversion_key_cols <- setdiff(key(epi_archive$DT), "version") - nonkey_cols <- setdiff(names(epi_archive$DT), key(epi_archive$DT)) - next_version_tag <- next_after(epi_archive$versions_end) - if (next_version_tag > fill_versions_end) { - cli_abort(sprintf(paste( - "Apparent problem with `next_after` method:", - "archive contained observations through version %s", - "and the next possible version was supposed to be %s,", - "but this appeared to jump from a version < %3$s", - "to one > %3$s, implying at least one version in between." - ), epi_archive$versions_end, next_version_tag, fill_versions_end)) - } - nonversion_key_vals_ever_recorded <- unique(epi_archive$DT, by = nonversion_key_cols) - # In edge cases, the `unique` result can alias the original - # DT; detect and copy if necessary: - if (identical(address(epi_archive$DT), address(nonversion_key_vals_ever_recorded))) { - nonversion_key_vals_ever_recorded <- copy(nonversion_key_vals_ever_recorded) - } - next_version_DT <- nonversion_key_vals_ever_recorded[ - , version := next_version_tag - ][ - # this makes the class of these columns logical (`NA` is a - # logical NA; we're relying on the rbind below to convert to - # the proper class&typeof) - , (nonkey_cols) := NA - ] - # full result DT: - setkeyv(rbind(epi_archive$DT, next_version_DT), key(epi_archive$DT))[] - }, - "locf" = { - # just the old DT; LOCF is built into other methods: - epi_archive$DT - } - ) - new_versions_end <- fill_versions_end - # Update `epi_archive` all at once with simple, error-free operations + - # return below: - epi_archive$DT <- new_DT - epi_archive$versions_end <- new_versions_end - } else { - # Already sufficiently up to date; nothing to do. - } - return(invisible(epi_archive)) -} - - -#' @export -truncate_versions_after <- function(x, ...) { - UseMethod("truncate_versions_after") -} - - -#' Truncate versions after -#' @description Filter to keep only older versions, mutating the archive by -#' potentially reseating but not mutating some fields. `DT` is likely, but not -#' guaranteed, to be copied. Returns the mutated archive -#' [invisibly][base::invisible]. 
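# A small sketch of the two version-bound operations above, assuming the
# `toy_versions` archive from the earlier `as_of` sketch:
truncated <- epix_truncate_versions_after(toy_versions, as.Date("2020-01-02"))
truncated$versions_end
# -> 2020-01-02; the 2020-01-03 updates are dropped
filled <- epix_fill_through_version(toy_versions, as.Date("2020-01-05"), how = "locf")
filled$versions_end
# -> 2020-01-05; no rows are added, since `as_of` already carries the last
# observed version of each value forward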
-#' @param epi_archive as in [`epix_truncate_versions_after`]
-#' @param max_version as in [`epix_truncate_versions_after`]
-truncate_versions_after.epi_archive2 <- function(
- epi_archive,
- max_version) {
- if (!test_set_equal(class(max_version), class(epi_archive$DT$version))) {
- cli_abort("`max_version` must have the same classes as `epi_archive$DT$version`.")
- }
- if (!test_set_equal(typeof(max_version), typeof(epi_archive$DT$version))) {
- cli_abort("`max_version` must have the same types as `epi_archive$DT$version`.")
- }
- assert_scalar(max_version, na.ok = FALSE)
- if (max_version > epi_archive$versions_end) {
- cli_abort("`max_version` must be at most `epi_archive$versions_end`.")
- }
- epi_archive$DT <- epi_archive$DT[epi_archive$DT$version <= max_version, colnames(epi_archive$DT), with = FALSE]
- # (^ this filter operation seems to always copy the DT, even if it
- # keeps every entry; we don't guarantee this behavior in
- # documentation, though, so we could change to alias in this case)
- if (!is.na(epi_archive$clobberable_versions_start) && epi_archive$clobberable_versions_start > max_version) {
- epi_archive$clobberable_versions_start <- NA
- }
- epi_archive$versions_end <- max_version
- return(invisible(epi_archive))
-}
-
-
-#' Merge epi archive
-#' @description Merges another `epi_archive` with the current one, mutating the
-#' current one by reseating its `DT` and several other fields, but avoiding
-#' mutation of the old `DT`; returns the current archive
-#' [invisibly][base::invisible]. See [`epix_merge`] for a full description
-#' of the non-R6-method version, which does not mutate either archive, and
-#' does not alias either archive's `DT`.
-#' @param x as in [`epix_merge`]
-#' @param y as in [`epix_merge`]
-#' @param sync as in [`epix_merge`]
-#' @param compactify as in [`epix_merge`]
-merge_epi_archive2 <- function(
- x,
- y,
- sync = c("forbid", "na", "locf", "truncate"),
- compactify = TRUE) {
- result <- epix_merge(x, y,
- sync = sync,
- compactify = compactify
- )
-
- # TODO: Use encapsulating methods instead.
- if (length(x$private_fields) != 0L) {
- cli_abort("expected no private fields in x",
- internal = TRUE
- )
- }
-
- # Mutate fields all at once, trying to avoid any potential errors:
- for (field_name in names(x$public_fields)) {
- x[[field_name]] <- result[[field_name]]
- }
-
- return(invisible(x))
-}
-
-
-#' `group_by` and related methods for `epi_archive`, `grouped_epi_archive`
-#'
-#' @param .data An `epi_archive` or `grouped_epi_archive`
-#' @param ... Similar to [`dplyr::group_by`] (see "Details:" for edge cases);
-#' * For `group_by`: unquoted variable name(s) or other
-#' ["data masking"][dplyr::dplyr_data_masking] expression(s). It's possible to
-#' use [`dplyr::mutate`]-like syntax here to calculate new columns on which to
-#' perform grouping, but note that, if you are regrouping an already-grouped
-#' `.data` object, the calculations will be carried out ignoring such grouping
-#' (same as [in dplyr][dplyr::group_by]).
-#' * For `ungroup`: either
-#' * empty, in order to remove the grouping and output an `epi_archive`; or
-#' * variable name(s) or other ["tidy-select"][dplyr::dplyr_tidy_select]
-#' expression(s), in order to remove the matching variables from the list of
-#' grouping variables, and output another `grouped_epi_archive`.
-#' @param .add Boolean.
If `FALSE`, the default, the output will be grouped by
-#' the variable selection from `...` only; if `TRUE`, the output will be
-#' grouped by the current grouping variables plus the variable selection from
-#' `...`.
-#' @param .drop As described in [`dplyr::group_by`]; determines treatment of
-#' factor columns.
-#' @param x For `groups` or `ungroup`: a `grouped_epi_archive`; for
-#' `is_grouped_epi_archive`: any object
-#' @param .tbl (For `group_by_drop_default`:) an `epi_archive` or
-#' `grouped_epi_archive` (`epi_archive` dispatches to the S3 default method;
-#' `grouped_epi_archive` dispatches its own S3 method)
-#'
-#' @details
-#'
-#' To match `dplyr`, `group_by` allows "data masking" (also referred to as
-#' "tidy evaluation") expressions `...`, not just column names, in a way similar
-#' to `mutate`. Note that replacing or removing key columns with these
-#' expressions is disabled.
-#'
-#' `archive %>% group_by()` and other expressions that group or regroup by zero
-#' columns (indicating that all rows should be treated as part of one large
-#' group) will output a `grouped_epi_archive`, in order to enable the use of
-#' `grouped_epi_archive` methods on the result. This is in slight contrast to
-#' the same operations on tibbles and grouped tibbles, which will *not* output a
-#' `grouped_df` in these circumstances.
-#'
-#' Using `group_by` with `.add=FALSE` to override the existing grouping is
-#' disabled; instead, `ungroup` first and then `group_by`.
-#'
-#' Mutation and aliasing: `group_by` tries to use a shallow copy of the `DT`,
-#' introducing column-level aliasing between its input and its result. This
-#' doesn't follow the general model for most `data.table` operations, which
-#' seems to be that, given a nonaliased (i.e., unique) pointer to a
-#' `data.table` object, its pointers to its columns should also be nonaliased.
-#' If you mutate any of the columns of either the input or result, first ensure
-#' that it is fine if columns of the other are also mutated, but do not rely on
-#' such behavior to occur. Additionally, never perform mutation on the key
-#' columns at all (except for strictly increasing transformations), as this will
-#' invalidate sortedness assumptions about the rows.
-#'
-#' `group_by_drop_default` on (ungrouped) `epi_archive`s is expected to dispatch
-#' to `group_by_drop_default.default` (but there is a dedicated method for
-#' `grouped_epi_archive`s).
-#' -#' @examples -#' -#' grouped_archive <- archive_cases_dv_subset_2 %>% group_by(geo_value) -#' -#' # `print` for metadata and method listing: -#' grouped_archive %>% print() -#' -#' # The primary use for grouping is to perform a grouped `epix_slide`: -#' -#' archive_cases_dv_subset_2 %>% -#' group_by(geo_value) %>% -#' epix_slide2( -#' f = ~ mean(.x$case_rate_7d_av), -#' before = 2, -#' ref_time_values = as.Date("2020-06-11") + 0:2, -#' new_col_name = "case_rate_3d_av" -#' ) %>% -#' ungroup() -#' -#' # ----------------------------------------------------------------- -#' -#' # Advanced: some other features of dplyr grouping are implemented: -#' -#' library(dplyr) -#' toy_archive <- -#' tribble( -#' ~geo_value, ~age_group, ~time_value, ~version, ~value, -#' "us", "adult", "2000-01-01", "2000-01-02", 121, -#' "us", "pediatric", "2000-01-02", "2000-01-03", 5, # (addition) -#' "us", "adult", "2000-01-01", "2000-01-03", 125, # (revision) -#' "us", "adult", "2000-01-02", "2000-01-03", 130 # (addition) -#' ) %>% -#' mutate( -#' age_group = ordered(age_group, c("pediatric", "adult")), -#' time_value = as.Date(time_value), -#' version = as.Date(version) -#' ) %>% -#' as_epi_archive2(other_keys = "age_group") -#' -#' # The following are equivalent: -#' toy_archive %>% group_by(geo_value, age_group) -#' toy_archive %>% -#' group_by(geo_value) %>% -#' group_by(age_group, .add = TRUE) -#' grouping_cols <- c("geo_value", "age_group") -#' toy_archive %>% group_by(across(all_of(grouping_cols))) -#' -#' # And these are equivalent: -#' toy_archive %>% group_by(geo_value) -#' toy_archive %>% -#' group_by(geo_value, age_group) %>% -#' ungroup(age_group) -#' -#' # To get the grouping variable names as a `list` of `name`s (a.k.a. symbols): -#' toy_archive %>% -#' group_by(geo_value) %>% -#' groups() -#' -#' toy_archive %>% -#' group_by(geo_value, age_group, .drop = FALSE) %>% -#' epix_slide2(f = ~ sum(.x$value), before = 20) %>% -#' ungroup() -#' -#' @importFrom dplyr group_by -#' @export -#' -#' @aliases grouped_epi_archive -group_by.epi_archive2 <- function(epi_archive, ..., .add = FALSE, .drop = dplyr::group_by_drop_default(epi_archive)) { - # `add` makes no difference; this is an ungrouped `epi_archive`. - detailed_mutate <- epix_detailed_restricted_mutate2(epi_archive, ...) - assert_logical(.drop) - if (!.drop) { - grouping_cols <- as.list(detailed_mutate[["archive"]][["DT"]])[detailed_mutate[["request_names"]]] - grouping_col_is_factor <- purrr::map_lgl(grouping_cols, is.factor) - # ^ Use `as.list` to try to avoid any possibility of a deep copy. 
- if (!any(grouping_col_is_factor)) {
- cli_warn(
- "`.drop=FALSE` but there are no factor grouping columns;
- did you mean to convert one of the columns to a factor beforehand?",
- class = "epiprocess__group_by_epi_archive__drop_FALSE_no_factors"
- )
- } else if (any(diff(grouping_col_is_factor) == -1L)) {
- cli_warn(
- "`.drop=FALSE` but there are one or more non-factor grouping columns listed
- after a factor grouping column; this may produce groups with `NA`s for these
- columns; see https://github.com/tidyverse/dplyr/issues/5369#issuecomment-683762553;
- depending on how you want completion to work, you might instead want to convert all
- grouping columns to factors beforehand, specify the non-factor grouping columns first,
- or use `.drop=TRUE` and add a call to `tidyr::complete`.",
- class = "epiprocess__group_by_epi_archive__drop_FALSE_nonfactor_after_factor"
- )
- }
- }
- new_grouped_epi_archive(detailed_mutate[["archive"]],
- detailed_mutate[["request_names"]],
- drop = .drop
- )
-}
-
-
-#' @export
-slide <- function(.data, ...) {
- UseMethod("slide")
-}
-
-
-#' Slide over epi archive
-#' @description Slides a given function over variables in an `epi_archive`
-#' object. See the documentation for the wrapper function [`epix_slide()`] for
-#' details. The parameter descriptions below are copied from there.
-#' @importFrom data.table key
-#' @importFrom rlang !! !!! enquo quo_is_missing enquos is_quosure sym syms
-#' @param f Function, formula, or missing; together with `...` specifies the
-#' computation to slide. To "slide" means to apply a computation over a
-#' sliding (a.k.a. "rolling") time window for each data group. The window is
-#' determined by the `before` parameter described below. One time step is
-#' typically one day or one week; see [`epi_slide`] details for more
-#' explanation. If a function, `f` must take an `epi_df` with the same
-#' column names as the archive's `DT`, minus the `version` column; followed
-#' by a one-row tibble containing the values of the grouping variables for
-#' the associated group; followed by a reference time value, usually as a
-#' `Date` object; followed by any number of named arguments. If a formula,
-#' `f` can operate directly on columns accessed via `.x$var` or `.$var`, as
-#' in `~ mean(.x$var)` to compute a mean of a column `var` for each
-#' group-`ref_time_value` combination. The group key can be accessed via
-#' `.y` or `.group_key`, and the reference time value can be accessed via
-#' `.z` or `.ref_time_value`. If `f` is missing, then `...` will specify the
-#' computation.
-#' @param ... Additional arguments to pass to the function or formula specified
-#' via `f`. Alternatively, if `f` is missing, then `...` is interpreted as an
-#' expression for tidy evaluation; in addition to referring to columns
-#' directly by name, the expression has access to `.data` and `.env` pronouns
-#' as in `dplyr` verbs, and can also refer to the `.group_key` and
-#' `.ref_time_value`. See details of [`epi_slide`].
-#' @param before How far `before` each `ref_time_value` should the sliding
-#' window extend? If provided, should be a single, non-NA,
-#' [integer-compatible][vctrs::vec_cast] number of time steps. This window
-#' endpoint is inclusive. For example, if `before = 7`, and one time step is
-#' one day, then to produce a value for a `ref_time_value` of January 8, we
-#' apply the given function or formula to data (for each group present) with
-#' `time_value`s from January 1 onward, as they were reported on January 8.
-#' For typical disease surveillance sources, this will not include any data
-#' with a `time_value` of January 8, and, depending on the amount of reporting
-#' latency, may not include January 7 or even earlier `time_value`s. (If
-#' instead the archive were to hold nowcasts rather than regular surveillance
-#' data, then we would indeed expect data for `time_value` January 8. If it
-#' were to hold forecasts, then we would expect data for `time_value`s after
-#' January 8, and the sliding window would extend as far after each
-#' `ref_time_value` as needed to include all such `time_value`s.)
-#' @param ref_time_values Reference time values / versions for sliding
-#' computations; each element of this vector serves both as the anchor point
-#' for the `time_value` window for the computation and the `max_version`
-#' `as_of` which we fetch data in this window. If missing, then this will be
-#' set to a regularly-spaced sequence of values covering the range of
-#' `version`s in the `DT` plus the `versions_end`; the spacing of values will
-#' be guessed (using the GCD of the skips between values).
-#' @param time_step Optional function used to define the meaning of one time
-#' step, which if specified, overrides the default choice based on the
-#' `time_value` column. This function must take a positive integer and return
-#' an object of class `lubridate::period`. For example, we can use `time_step
-#' = lubridate::hours` in order to set the time step to be one hour (this
-#' would only be meaningful if `time_value` is of class `POSIXct`).
-#' @param new_col_name String indicating the name of the new column that will
-#' contain the slide values. Default is "slide_value"; note that setting
-#' `new_col_name` equal to an existing column name will overwrite this column.
-#' @param as_list_col Should the slide results be held in a list column, or be
-#' [unchopped][tidyr::unchop]/[unnested][tidyr::unnest]? Default is `FALSE`,
-#' in which case a list object returned by `f` would be unnested (using
-#' [`tidyr::unnest()`]), and, if the slide computations output data frames,
-#' the names of the resulting columns are given by prepending `new_col_name`
-#' to the names of the list elements.
-#' @param names_sep String specifying the separator to use in `tidyr::unnest()`
-#' when `as_list_col = FALSE`. Default is "_". Using `NULL` drops the prefix
-#' from `new_col_name` entirely.
-#' @param all_versions (Not the same as `all_rows` parameter of `epi_slide`.) If
-#' `all_versions = TRUE`, then `f` will be passed the version history (all
-#' `version <= ref_time_value`) for rows having `time_value` between
-#' `ref_time_value - before` and `ref_time_value`. Otherwise, `f` will be
-#' passed only the most recent `version` for every unique `time_value`.
-#' Default is `FALSE`.
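# A short sketch of these parameters in action, assuming the
# `archive_cases_dv_subset_2` example data from this branch:
archive_cases_dv_subset_2 %>%
  group_by(geo_value) %>%
  slide(
    f = ~ mean(.x$case_rate_7d_av),
    before = 2,
    ref_time_values = as.Date("2020-06-11") + 0:2,
    new_col_name = "case_rate_3d_av"
  ) %>%
  ungroup()
# -> one row per geo_value x ref_time_value; each `case_rate_3d_av` is
# computed from the 3-day window as it was reported on that version date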
-slide.epi_archive2 <- function(epi_archive, f, ..., before, ref_time_values,
- time_step, new_col_name = "slide_value",
- as_list_col = FALSE, names_sep = "_",
- all_versions = FALSE) {
- # For an "ungrouped" slide, treat all rows as belonging to one big
- # group (group by 0 vars), like `dplyr::summarize`, and let the
- # resulting `grouped_epi_archive` handle the slide:
- slide(
- group_by(epi_archive),
- f,
- ...,
- before = before, ref_time_values = ref_time_values,
- time_step = time_step, new_col_name = new_col_name,
- as_list_col = as_list_col, names_sep = names_sep,
- all_versions = all_versions
- ) %>%
- # We want a slide on ungrouped archives to output something
- # ungrouped, rather than retaining the trivial (0-variable)
- # grouping applied above. So we `ungroup()`. However, the current
- # `dplyr` implementation automatically ignores/drops trivial
- # groupings, so this is just a no-op for now.
- ungroup()
-}
-
-
-#' Convert to `epi_archive` format
-#'
-#' Converts a data frame, data table, or tibble into an `epi_archive`
-#' object. See the [archive
-#' vignette](https://cmu-delphi.github.io/epiprocess/articles/archive.html) for
-#' examples. The parameter descriptions below are copied from there.
-#'
-#' @param x A data frame, data table, or tibble, with columns `geo_value`,
-#' `time_value`, `version`, and then any number of additional columns.
-#' @param geo_type Type for the geo values. If missing, then the function will
-#' attempt to infer it from the geo values present; if this fails, then it
-#' will be set to "custom".
-#' @param time_type Type for the time values. If missing, then the function will
-#' attempt to infer it from the time values present; if this fails, then it
-#' will be set to "custom".
-#' @param other_keys Character vector specifying the names of variables in `x`
-#' that should be considered key variables (in the language of `data.table`)
-#' apart from "geo_value", "time_value", and "version".
-#' @param additional_metadata List of additional metadata to attach to the
-#' `epi_archive` object. The metadata will have `geo_type` and `time_type`
-#' fields; named entries from the passed list will be included as well.
-#' @param compactify Optional; Boolean or `NULL`: should we remove rows that are
-#' considered redundant for the purposes of `epi_archive`'s built-in methods
-#' such as `as_of`? As these methods use the last version of each observation
-#' carried forward (LOCF) to interpolate between the version data provided,
-#' rows that don't change these LOCF results can potentially be omitted to
-#' save space. `TRUE` will remove these rows, `FALSE` will not, and missing or
-#' `NULL` will remove these rows and issue a warning. Generally, this can be
-#' set to `TRUE`, but if you directly inspect or edit the fields of the
-#' `epi_archive` such as its `DT`, you will have to determine whether
-#' `compactify=TRUE` will produce the desired results.
If compactification
-#' here is removing a large proportion of the rows, this may indicate a
-#' potential for space, time, or bandwidth savings upstream in the data
-#' pipeline, e.g., when fetching, storing, or preparing the input data `x`.
-#' @param clobberable_versions_start Optional; `length`-1; either a value of the
-#' same `class` and `typeof` as `x$version`, or an `NA` of any `class` and
-#' `typeof`: specifically, either (a) the earliest version that could be
-#' subject to "clobbering" (being overwritten with different update data, but
-#' using the *same* version tag as the old update data), or (b) `NA`, to
-#' indicate that no versions are clobberable. There are a variety of reasons
-#' why versions could be clobberable under routine circumstances, such as (a)
-#' today's version of one/all of the columns being published after initially
-#' being filled with `NA` or LOCF, (b) a buggy version of today's data being
-#' published but then fixed and republished later in the day, or (c) data
-#' pipeline delays (e.g., publisher uploading, periodic scraping, database
-#' syncing, periodic fetching, etc.) that cause events (a) or (b) to be
-#' reflected later in the day (or even on a different day) than expected;
-#' potential causes vary between different data pipelines. The default value
-#' is `NA`, which doesn't consider any versions to be clobberable. Another
-#' setting that may be appropriate for some pipelines is
-#' `max_version_with_row_in(x)`.
-#' @param versions_end Optional; length-1, same `class` and `typeof` as
-#' `x$version`: what is the last version we have observed? The default is
-#' `max_version_with_row_in(x)`, but values greater than this could also be
-#' valid, and would indicate that we observed additional versions of the data
-#' beyond `max(x$version)`, but they all contained empty updates. (The default
-#' value of `clobberable_versions_start` does not fully trust these empty
-#' updates, and assumes that any version `>= max(x$version)` could be
-#' clobbered.) If `nrow(x) == 0`, then this argument is mandatory.
-#' @return An `epi_archive` object.
-#'
-#' @details This is simply a wrapper around [`new_epi_archive2()`], so for
-#' example:
-#' ```
-#' x <- as_epi_archive2(df, geo_type = "state", time_type = "day")
-#' ```
-#' would be equivalent to:
-#' ```
-#' x <- new_epi_archive2(df, geo_type = "state", time_type = "day")
-#' ```
-#'
-#' @export
-#' @examples
-#' # Simple ex. with necessary keys
-#' tib <- tibble::tibble(
-#' geo_value = rep(c("ca", "hi"), each = 5),
-#' time_value = rep(seq(as.Date("2020-01-01"),
-#' by = 1, length.out = 5
-#' ), times = 2),
-#' version = rep(seq(as.Date("2020-01-02"),
-#' by = 1, length.out = 5
-#' ), times = 2),
-#' value = rnorm(10, mean = 2, sd = 1)
-#' )
-#'
-#' toy_epi_archive <- tib %>% as_epi_archive2(
-#' geo_type = "state",
-#' time_type = "day"
-#' )
-#' toy_epi_archive
-#'
-#' # Ex.
with an additional key for county -#' df <- data.frame( -#' geo_value = c(replicate(2, "ca"), replicate(2, "fl")), -#' county = c(1, 3, 2, 5), -#' time_value = c( -#' "2020-06-01", -#' "2020-06-02", -#' "2020-06-01", -#' "2020-06-02" -#' ), -#' version = c( -#' "2020-06-02", -#' "2020-06-03", -#' "2020-06-02", -#' "2020-06-03" -#' ), -#' cases = c(1, 2, 3, 4), -#' cases_rate = c(0.01, 0.02, 0.01, 0.05) -#' ) -#' -#' x <- df %>% as_epi_archive2( -#' geo_type = "state", -#' time_type = "day", -#' other_keys = "county" -#' ) -as_epi_archive2 <- function(x, geo_type, time_type, other_keys, - additional_metadata = list(), - compactify = NULL, - clobberable_versions_start = NA, - versions_end = max_version_with_row_in(x)) { - new_epi_archive2( - x, geo_type, time_type, other_keys, additional_metadata, - compactify, clobberable_versions_start, versions_end - ) -} - -#' Test for `epi_archive` format -#' -#' @param x An object. -#' @param grouped_okay Optional; Boolean; should a `grouped_epi_archive` also -#' count? Default is `FALSE`. -#' @return `TRUE` if the object inherits from `epi_archive`. -#' -#' @export -#' @examples -#' is_epi_archive2(jhu_csse_daily_subset) # FALSE (this is an epi_df, not epi_archive) -#' is_epi_archive2(archive_cases_dv_subset_2) # TRUE -#' -#' # By default, grouped_epi_archives don't count as epi_archives, as they may -#' # support a different set of operations from regular `epi_archives`. This -#' # behavior can be controlled by `grouped_okay`. -#' grouped_archive <- archive_cases_dv_subset_2 %>% group_by(geo_value) -#' is_epi_archive2(grouped_archive) # FALSE -#' is_epi_archive2(grouped_archive, grouped_okay = TRUE) # TRUE -#' -#' @seealso [`is_grouped_epi_archive`] -is_epi_archive2 <- function(x, grouped_okay = FALSE) { - inherits(x, "epi_archive2") || grouped_okay && inherits(x, "grouped_epi_archive2") -} - - -#' @export -clone <- function(x, ...) { - UseMethod("clone") -} - - -#' @export -clone.epi_archive2 <- function(epi_archive, deep = FALSE) { - # TODO: Finish. - if (deep) { - epi_archive$DT <- copy(epi_archive$DT) - } else { - epi_archive$DT <- copy(epi_archive$DT) - } - return(epi_archive) -} diff --git a/R/data.R b/R/data.R index 37ccc522..cbaaa901 100644 --- a/R/data.R +++ b/R/data.R @@ -195,11 +195,11 @@ delayed_assign_with_unregister_awareness <- function(x, value, # Like normal data objects, set `archive_cases_dv_subset` up as a promise, so it # doesn't take unnecessary space before it's evaluated. This also avoids a need # for @include tags. However, this pattern will use unnecessary space after this -# promise is evaluated, because `as_epi_archive` clones `archive_cases_dv_subset_dt` +# promise is evaluated, because `as_epi_archive` copies `archive_cases_dv_subset_dt` # and `archive_cases_dv_subset_dt` will stick around along with `archive_cases_dv_subset` # after they have been evaluated. We may want to add an option to avoid cloning # in `as_epi_archive` and make use of it here. But we may also want to change -# this into an active binding that clones every time, unless we can hide the +# this into an active binding that copies every time, unless we can hide the # `DT` field from the user (make it non-`public` in general) or make it # read-only (in this specific case), so that the user cannot modify the `DT` # here and potentially mess up examples that they refer to later on. 
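# The promise pattern discussed above boils down to base R's `delayedAssign`;
# roughly (eliding the unregister-awareness handling of the real helper):
delayedAssign(
  "archive_cases_dv_subset",
  as_epi_archive(archive_cases_dv_subset_dt, compactify = FALSE)
)
# No copy of `archive_cases_dv_subset_dt` is made until the first time
# `archive_cases_dv_subset` is actually evaluated.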
@@ -289,11 +289,3 @@ delayed_assign_with_unregister_awareness( #' * Furthermore, the data has been limited to a very small number of rows, the #' signal names slightly altered, and formatted into a tibble. "jhu_csse_county_level_subset" - -#' @export -"archive_cases_dv_subset_2" - -delayed_assign_with_unregister_awareness( - "archive_cases_dv_subset_2", - as_epi_archive2(archive_cases_dv_subset_dt, compactify = FALSE) -) diff --git a/R/group_by_epi_df_methods.R b/R/group_by_epi_df_methods.R index b531178f..949cc914 100644 --- a/R/group_by_epi_df_methods.R +++ b/R/group_by_epi_df_methods.R @@ -11,7 +11,3 @@ select.epi_df <- function(.data, ...) { might_decay <- reclass(selected, attr(selected, "metadata")) return(dplyr_reconstruct(might_decay, might_decay)) } - -# others to consider: -# - arrange -# - diff --git a/R/grouped_archive_new.R b/R/grouped_archive_new.R deleted file mode 100644 index c0e6c35e..00000000 --- a/R/grouped_archive_new.R +++ /dev/null @@ -1,456 +0,0 @@ -#' -#' Convenience function for performing a `tidy_select` on dots according to its -#' docs, and taking the names (rather than the integer indices). -#' -#' @param ... tidyselect-syntax selection description -#' @param .data named vector / data frame; context for the description / the -#' object to which the selections apply -#' @return character vector containing names of entries/columns of -#' `names(.data)` denoting the selection -#' -#' @noRd -eval_pure_select_names_from_dots <- function(..., .data) { - # `?tidyselect::eval_select` tells us to use this form when we take in dots. - # It seems a bit peculiar, since the expr doesn't pack with it a way to get at - # the environment for the dots, but it looks like `eval_select` will assume - # the caller env (our `environment()`) when given an expr, and thus have - # access to the dots. - # - # If we were allowing renaming, we'd need to be careful about which names (new - # vs. old vs. both) to return here. - names(tidyselect::eval_select(rlang::expr(c(...)), .data, allow_rename = FALSE)) -} - -#' Get names of dots without forcing the dots -#' -#' For use in functions that use nonstandard evaluation (NSE) on the dots; we -#' can't use the pattern `names(list(...))` in this case because it will attempt -#' to force/(standard-)evaluate the dots, and we want to avoid attempted forcing of the -#' dots if we're using NSE. -#' -#' @noRd -nse_dots_names <- function(...) { - names(rlang::call_match()) -} -nse_dots_names2 <- function(...) { - rlang::names2(rlang::call_match()) -} - -#' @importFrom dplyr group_by_drop_default -#' @noRd -new_grouped_epi_archive <- function(ungrouped, vars, drop) { - if (inherits(ungrouped, "grouped_epi_archive")) { - cli_abort( - "`ungrouped` must not already be grouped (neither automatic regrouping - nor nested grouping is supported). 
Either use `group_by` with `.add=TRUE`, - or `ungroup` first.", - class = "epiprocess__grouped_epi_archive__ungrouped_arg_is_already_grouped", - epiprocess__ungrouped_class = class(ungrouped), - epiprocess__ungrouped_groups = groups(ungrouped) - ) - } - assert_class(ungrouped, "epi_archive2") - assert_character(vars) - if (!test_subset(vars, names(ungrouped$DT))) { - cli_abort( - "All grouping variables `vars` must be present in the data.", - ) - } - if ("version" %in% vars) { - cli_abort("`version` has a special interpretation and cannot be used by itself as a grouping variable") - } - assert_logical(drop, len = 1) - - # ----- - private <- list() - private$ungrouped <- ungrouped - private$vars <- vars - private$drop <- drop - - return(structure( - list( - private = private - ), - class = c("grouped_epi_archive2", "epi_archive2") - )) -} - -#' @export -print.grouped_epi_archive2 <- function(grouped_epi_archive, class = TRUE) { - if (class) cat("A `grouped_epi_archive` object:\n") - writeLines(wrap_varnames(grouped_epi_archive$private$vars, initial = "* Groups: ")) - # If none of the grouping vars is a factor, then $drop doesn't seem - # relevant, so try to be less verbose and don't message about it. - # - # Below map-then-extract may look weird, but the more natural - # extract-then-map appears to trigger copies of the extracted columns - # since we are working with a `data.table` (unless we go through - # `as.list`, but its current column-aliasing behavior is probably not - # something to rely too much on), while map functions currently appear - # to avoid column copies. - if (any(purrr::map_lgl(grouped_epi_archive$private$ungrouped$DT, is.factor)[grouped_epi_archive$private$vars])) { - cat(strwrap(init = "* ", prefix = " ", sprintf( - "%s groups formed by factor levels that don't appear in the data", - if (grouped_epi_archive$private$drop) "Drops" else "Does not drop" - ))) - cat("\n") - } - cat("It wraps an ungrouped `epi_archive`, with metadata:\n") - print(grouped_epi_archive$private$ungrouped, class = FALSE) - # Return self invisibly for convenience in `$`-"pipe": - invisible(grouped_epi_archive) -} - -#' @include methods-epi_archive.R -#' @rdname group_by.epi_archive -#' -#' @importFrom dplyr group_by -#' @export -group_by.grouped_epi_archive2 <- function( - grouped_epi_archive, - ..., - .add = FALSE, - .drop = dplyr::group_by_drop_default(grouped_epi_archive)) { - assert_logical(.add, len = 1) - if (!.add) { - cli_abort('`group_by` on a `grouped_epi_archive` with `.add=FALSE` is forbidden - (neither automatic regrouping nor nested grouping is supported). - If you want to "regroup", replacing the existing grouping vars, `ungroup` first and then `group_by`. - If you want to add to the existing grouping vars, call `group_by` specifying `.add=TRUE`. - ', - class = "epiprocess__grouped_epi_archive_group_by_with_add_FALSE" - ) - } else { - # `group_by` `...` computations are performed on ungrouped data (see - # `?dplyr::group_by`) - detailed_mutate <- epix_detailed_restricted_mutate2(grouped_epi_archive$private$ungrouped, ...) 
- out_ungrouped <- detailed_mutate[["archive"]] - vars_from_dots <- detailed_mutate[["request_names"]] - vars <- union(grouped_epi_archive$private$vars, vars_from_dots) - new_grouped_epi_archive(grouped_epi_archive$private$ungrouped, vars, .drop) - } -} - -#' @include methods-epi_archive.R -#' @rdname group_by.epi_archive -#' -#' @export -group_by_drop_default.grouped_epi_archive2 <- function(grouped_epi_archive) { - grouped_epi_archive$private$drop -} - -#' @include methods-epi_archive.R -#' @rdname group_by.epi_archive -#' -#' @importFrom dplyr groups -#' @export -groups.grouped_epi_archive2 <- function(grouped_epi_archive) { - rlang::syms(grouped_epi_archive$private$vars) -} - -#' @include methods-epi_archive.R -#' @rdname group_by.epi_archive -#' -#' @importFrom dplyr ungroup -#' @export -ungroup.grouped_epi_archive2 <- function(grouped_epi_archive, ...) { - if (rlang::dots_n(...) == 0L) { - # No dots = special behavior: remove all grouping vars and convert to - # an ungrouped class, as with `grouped_df`s. - grouped_epi_archive$private$ungrouped - } else { - exclude_vars <- eval_pure_select_names_from_dots(..., .data = grouped_epi_archive$private$ungrouped$DT) - # (requiring a pure selection here is a little stricter than dplyr - # implementations, but passing a renaming selection into `ungroup` - # seems pretty weird.) - result_vars <- grouped_epi_archive$private$vars[!grouped_epi_archive$private$vars %in% exclude_vars] - # `vars` might be length 0 if the user's tidyselection removed all - # grouping vars. Unlike with tibble, opt here to keep the result as a - # grouped_epi_archive, for output class consistency when `...` is - # provided. - new_grouped_epi_archive(grouped_epi_archive$private$ungrouped, result_vars, grouped_epi_archive$private$drop) - } -} - -#' Truncate versions after a given version, grouped -#' @description Filter to keep only older versions by mutating the underlying -#' `epi_archive` using `$truncate_versions_after`. Returns the mutated -#' `grouped_epi_archive` [invisibly][base::invisible]. -#' @param x as in [`epix_truncate_versions_after`] -#' @param max_version as in [`epix_truncate_versions_after`] -#' @export -truncate_versions_after.grouped_epi_archive2 <- function(grouped_epi_archive, max_version) { - # The grouping is irrelevant for this method; if we were to split into - # groups and recombine appropriately, we should get the same result as - # just leveraging the ungrouped method, so just do the latter: - truncate_versions_after(grouped_epi_archive$private$ungrouped, max_version) - return(invisible(grouped_epi_archive)) -} - -#' Truncate versions after a given version, grouped -#' @export -epix_truncate_versions_after.grouped_epi_archive2 <- function(grouped_epi_archive, max_version) { - cloned_group_epi_archive <- clone(grouped_epi_archive, deep = TRUE) - return((truncate_versions_after(cloned_group_epi_archive, max_version))) - # ^ second set of parens drops invisibility -} - - -#' Slide over grouped epi archive -#' @description Slides a given function over variables in a `grouped_epi_archive` -#' object. See the documentation for the wrapper function [`epix_slide()`] for -#' details. -#' @importFrom data.table key address rbindlist setDF -#' @importFrom tibble as_tibble new_tibble validate_tibble -#' @importFrom dplyr group_by groups -#' @importFrom rlang !! !!! 
enquo quo_is_missing enquos is_quosure sym syms -#' env missing_arg -#' @export -slide.grouped_epi_archive2 <- function(grouped_epi_archive, f, ..., before, ref_time_values, - time_step, new_col_name = "slide_value", - as_list_col = FALSE, names_sep = "_", - all_versions = FALSE) { - # Perform some deprecated argument checks without using ` = - # deprecated()` in the function signature, because they are from - # early development versions and much more likely to be clutter than - # informative in the signature. - if ("group_by" %in% nse_dots_names(...)) { - cli_abort(" - The `group_by` argument to `slide` has been removed; please use - the `group_by` S3 generic function or `$group_by` R6 method - before the slide instead. (If you were instead trying to pass a - `group_by` argument to `f` or create a column named `group_by`, - this check is a false positive, but you will still need to use a - different column name here and rename the resulting column after - the slide.) - ", class = "epiprocess__epix_slide_group_by_parameter_deprecated") - } - if ("all_rows" %in% nse_dots_names(...)) { - cli_abort(" - The `all_rows` argument has been removed from `epix_slide` (but - is still supported in `epi_slide`). Add rows for excluded - results with a manual join instead. - ", class = "epiprocess__epix_slide_all_rows_parameter_deprecated") - } - - if (missing(ref_time_values)) { - ref_time_values <- epix_slide_ref_time_values_default(grouped_epi_archive$private$ungrouped) - } else { - assert_numeric(ref_time_values, min.len = 1L, null.ok = FALSE, any.missing = FALSE) - if (any(ref_time_values > grouped_epi_archive$private$ungrouped$versions_end)) { - cli_abort("Some `ref_time_values` are greater than the latest version in the archive.") - } - if (anyDuplicated(ref_time_values) != 0L) { - cli_abort("Some `ref_time_values` are duplicated.") - } - # Sort, for consistency with `epi_slide`, although the current - # implementation doesn't take advantage of it. - ref_time_values <- sort(ref_time_values) - } - - # Validate and pre-process `before`: - if (missing(before)) { - cli_abort("`before` is required (and must be passed by name); - if you did not want to apply a sliding window but rather - to map `as_of` and `f` across various `ref_time_values`, - pass a large `before` value (e.g., if time steps are days, - `before=365000`).") - } - before <- vctrs::vec_cast(before, integer()) - assert_int(before, lower = 0L, null.ok = FALSE, na.ok = FALSE) - - # If a custom time step is specified, then redefine units - - if (!missing(time_step)) before <- time_step(before) - - # Symbolize column name - new_col <- sym(new_col_name) - - # Validate rest of parameters: - assert_logical(as_list_col, len = 1L) - assert_logical(all_versions, len = 1L) - assert_character(names_sep, len = 1L, null.ok = TRUE) - - # Computation for one group, one time value - comp_one_grp <- function(.data_group, .group_key, - f, ..., - ref_time_value, - new_col) { - # Carry out the specified computation - comp_value <- f(.data_group, .group_key, ref_time_value, ...) - - if (all_versions) { - # Extract data from archive so we can do length checks below. When - # `all_versions = TRUE`, `.data_group` will always be an ungrouped - # archive because of the preceding `as_of` step. 
- .data_group <- .data_group$DT
- }
-
- assert(
- check_atomic(comp_value, any.missing = TRUE),
- check_data_frame(comp_value),
- combine = "or",
- .var.name = vname(comp_value)
- )
-
- # Label every result row with the `ref_time_value`
- res <- list(time_value = ref_time_value)
-
- # Wrap the computation output in a list and unchop/unnest later if
- # `as_list_col = FALSE`. This approach means that we will get a
- # list-class col rather than a data.frame-class col when
- # `as_list_col = TRUE` and the computation outputs are data
- # frames.
- res[[new_col]] <- list(comp_value)
-
- # Convert the list to a tibble all at once for speed.
- return(validate_tibble(new_tibble(res)))
- }
-
- # If `f` is missing, interpret ... as an expression for tidy evaluation
- if (missing(f)) {
- quos <- enquos(...)
- if (length(quos) == 0) {
- cli_abort("If `f` is missing then a computation must be specified via `...`.")
- }
- if (length(quos) > 1) {
- cli_abort("If `f` is missing then only a single computation can be specified via `...`.")
- }
-
- f <- quos[[1]]
- new_col <- sym(names(rlang::quos_auto_name(quos)))
- ... <- missing_arg() # magic value that passes zero args as dots in calls below
- }
-
- f <- as_slide_computation(f, ...)
- x <- lapply(ref_time_values, function(ref_time_value) {
- # Ungrouped as-of data; `epi_df` if `all_versions` is `FALSE`,
- # `epi_archive` if `all_versions` is `TRUE`:
- as_of_raw <- as_of(grouped_epi_archive$private$ungrouped,
- ref_time_value,
- min_time_value = ref_time_value - before,
- all_versions = all_versions
- )
-
- # Set:
- # * `as_of_df`, the data.frame/tibble/epi_df/etc. that we will
- # `group_modify` as the `.data` argument. Might or might not
- # include version column.
- # * `group_modify_fn`, the corresponding `.f` argument
- if (!all_versions) {
- as_of_df <- as_of_raw
- group_modify_fn <- comp_one_grp
- } else {
- as_of_archive <- as_of_raw
- # We essentially want to `group_modify` the archive, but
- # haven't implemented this method yet. Next best would be
- # `group_modify` on its `$DT`, but that has different
- # behavior based on whether or not `dtplyr` is loaded.
- # Instead, go through an ordinary data frame, trying to avoid
- # copies.
- if (address(as_of_archive$DT) == address(grouped_epi_archive$private$ungrouped$DT)) {
- # `as_of` aliased the full `$DT`; copy before mutating:
- #
- # Note: this step is probably unneeded; we're fine with
- # aliasing of the DT or its columns: vanilla operations aren't
- # going to mutate them in-place if they are aliases, and we're
- # not performing mutation (unlike the situation with
- # `fill_through_version` where we do mutate a `DT` and don't
- # want aliasing).
- as_of_archive$DT <- copy(as_of_archive$DT)
- }
- dt_key <- data.table::key(as_of_archive$DT)
- as_of_df <- as_of_archive$DT
- data.table::setDF(as_of_df)
-
- # Convert each subgroup chunk to an archive before running the calculation.
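# The conversion inside `group_modify_fn` below relies on a zero-copy
# data.table idiom; a standalone sketch (column names illustrative):
library(data.table)
chunk <- data.frame(geo_value = c("ca", "ca"), time_value = 1:2, value = 3:4)
setattr(chunk, "sorted", c("geo_value", "time_value")) # mark rows as presorted
setDT(chunk, key = c("geo_value", "time_value")) # converts in place, no copy
key(chunk)
# -> "geo_value" "time_value"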
- group_modify_fn <- function(.data_group, .group_key, - f, ..., - ref_time_value, - new_col) { - # .data_group is coming from as_of_df as a tibble, but we - # want to feed `comp_one_grp` an `epi_archive` backed by a - # DT; convert and wrap: - data.table::setattr(.data_group, "sorted", dt_key) - data.table::setDT(.data_group, key = dt_key) - .data_group_archive <- clone(as_of_archive) - .data_group_archive$DT <- .data_group - comp_one_grp(.data_group_archive, .group_key, - f = f, ..., - ref_time_value = ref_time_value, - new_col = new_col - ) - } - } - - return( - dplyr::group_modify( - dplyr::group_by(as_of_df, !!!syms(grouped_epi_archive$private$vars), .drop = grouped_epi_archive$private$drop), - group_modify_fn, - f = f, ..., - ref_time_value = ref_time_value, - new_col = new_col, - .keep = TRUE - ) - ) - }) - # Combine output into a single tibble - x <- as_tibble(setDF(rbindlist(x))) - # Reconstruct groups - x <- group_by(x, !!!syms(grouped_epi_archive$private$vars), .drop = grouped_epi_archive$private$drop) - - # Unchop/unnest if we need to - if (!as_list_col) { - x <- tidyr::unnest(x, !!new_col, names_sep = names_sep) - } - - # if (is_epi_df(x)) { - # # The analogue of `epi_df`'s `as_of` metadata for an archive is - # # `$versions_end`, at least in the current absence of - # # separate fields/columns denoting the "archive version" with a - # # different resolution, or from the perspective of a different - # # stage of a data pipeline. The `as_of` that is automatically - # # derived won't always match; override: - # attr(x, "metadata")[["as_of"]] <- private$ungrouped$versions_end - # } - - # XXX We need to work out when we want to return an `epi_df` and how - # to get appropriate keys (see #290, #223, #163). We'll probably - # need the commented-out code above if we ever output an `epi_df`. - # However, as a stopgap measure to have some more consistency across - # different ways of calling `epix_slide`, and to prevent `epi_df` - # output with invalid metadata, always output a (grouped or - # ungrouped) tibble. - x <- decay_epi_df(x) - - return(x) -} - - -# At time of writing, roxygen parses content in collation order, impacting the -# presentation of .Rd files that document multiple functions (see -# https://github.com/r-lib/roxygen2/pull/324). Use @include tags (determining -# `Collate:`) and ordering of functions within each file in order to get the -# desired ordering. - - - -#' @include methods-epi_archive.R -#' @rdname group_by.epi_archive -#' -#' @export -is_grouped_epi_archive2 <- function(x) { - inherits(x, "grouped_epi_archive2") -} - - -#' @export -clone.grouped_epi_archive2 <- function(x, deep = FALSE) { - # TODO: Finish. - if (deep) { - ungrouped <- clone(x$private$ungrouped, deep = TRUE) - } else { - ungrouped <- x$private$ungrouped - } - new_grouped_epi_archive(ungrouped, x$private$vars, x$private$drop) -} diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R index 02722c91..140ff9d3 100644 --- a/R/grouped_epi_archive.R +++ b/R/grouped_epi_archive.R @@ -1,3 +1,9 @@ +# At time of writing, roxygen parses content in collation order, impacting the +# presentation of .Rd files that document multiple functions (see +# https://github.com/r-lib/roxygen2/pull/324). We use @include tags (determining +# `Collate:`) below to get the desired ordering. 
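# How the tidyselect-based helper documented below behaves, as a standalone
# sketch (the `dat` example data is illustrative):
dat <- data.frame(geo_value = "ca", time_value = 1, value = 2)
sel <- tidyselect::eval_select(
  rlang::expr(c(geo_value, time_value)),
  dat,
  allow_rename = FALSE
)
names(sel)
# -> "geo_value" "time_value"; a renaming selection such as c(g = geo_value)
# would error because of `allow_rename = FALSE`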
+ + #' Get var names from select-only `tidy_select`ing `...` in `.data` #' #' Convenience function for performing a `tidy_select` on dots according to its @@ -22,6 +28,7 @@ eval_pure_select_names_from_dots <- function(..., .data) { names(tidyselect::eval_select(rlang::expr(c(...)), .data, allow_rename = FALSE)) } + #' Get names of dots without forcing the dots #' #' For use in functions that use nonstandard evaluation (NSE) on the dots; we @@ -37,411 +44,388 @@ nse_dots_names2 <- function(...) { rlang::names2(rlang::call_match()) } + #' @importFrom dplyr group_by_drop_default #' @noRd -grouped_epi_archive <- - R6::R6Class( - classname = "grouped_epi_archive", - # (We don't R6-inherit `epi_archive` or S3-multiclass with "epi_archive"; - # any "inheritance" of functionality must be done via wrapper functions that - # are checked/tested for sensible operation.) - private = list( - ungrouped = NULL, - vars = NULL, - drop = NULL - ), - public = list( - initialize = function(ungrouped, vars, drop) { - if (inherits(ungrouped, "grouped_epi_archive")) { - cli_abort( - "`ungrouped` must not already be grouped (neither automatic regrouping - nor nested grouping is supported). - Either use `group_by` with `.add=TRUE`, or `ungroup` first.", - class = "epiprocess__grouped_epi_archive__ungrouped_arg_is_already_grouped", - epiprocess__ungrouped_class = class(ungrouped), - epiprocess__ungrouped_groups = groups(ungrouped) - ) - } - assert_class(ungrouped, "epi_archive") - assert_character(vars) - if (!test_subset(vars, names(ungrouped$DT))) { - cli_abort( - "All grouping variables `vars` must be present in the data.", - ) - } - if ("version" %in% vars) { - cli_abort("`version` has a special interpretation and cannot be used by itself as a grouping variable") - } - assert_logical(drop, len = 1) - - # ----- - private$ungrouped <- ungrouped - private$vars <- vars - private$drop <- drop - }, - print = function(class = TRUE, methods = TRUE) { - if (class) cat("A `grouped_epi_archive` object:\n") - writeLines(wrap_varnames(private$vars, initial = "* Groups: ")) - # If none of the grouping vars is a factor, then $drop doesn't seem - # relevant, so try to be less verbose and don't message about it. - # - # Below map-then-extract may look weird, but the more natural - # extract-then-map appears to trigger copies of the extracted columns - # since we are working with a `data.table` (unless we go through - # `as.list`, but its current column-aliasing behavior is probably not - # something to rely too much on), while map functions currently appear - # to avoid column copies. 
- if (any(purrr::map_lgl(private$ungrouped$DT, is.factor)[private$vars])) { - cat(strwrap(init = "* ", prefix = " ", sprintf( - "%s groups formed by factor levels that don't appear in the data", - if (private$drop) "Drops" else "Does not drop" - ))) - cat("\n") - } - cat("It wraps an ungrouped `epi_archive`, with metadata:\n") - private$ungrouped$print(class = FALSE, methods = FALSE) - if (methods) { - cat("----------\n") - cat("Public `grouped_epi_archive` R6 methods:\n") - grouped_method_names <- names(grouped_epi_archive$public_methods) - ungrouped_method_names <- names(epi_archive$public_methods) - writeLines(wrap_varnames( - initial = "\u2022 Specialized `epi_archive` methods: ", - intersect(grouped_method_names, ungrouped_method_names) - )) - writeLines(wrap_varnames( - initial = "\u2022 Exclusive to `grouped_epi_archive`: ", - setdiff(grouped_method_names, ungrouped_method_names) - )) - writeLines(wrap_varnames( - initial = "\u2022 `ungroup` to use: ", - setdiff(ungrouped_method_names, grouped_method_names) - )) - } - # Return self invisibly for convenience in `$`-"pipe": - invisible(self) - }, - group_by = function(..., .add = FALSE, .drop = dplyr::group_by_drop_default(self)) { - assert_logical(.add, len = 1) - if (!.add) { - cli_abort('`group_by` on a `grouped_epi_archive` with `.add=FALSE` is forbidden - (neither automatic regrouping nor nested grouping is supported). - If you want to "regroup", replacing the existing grouping vars, `ungroup` first and then `group_by`. - If you want to add to the existing grouping vars, call `group_by` specifying `.add=TRUE`. - ', - class = "epiprocess__grouped_epi_archive_group_by_with_add_FALSE" - ) - } else { - # `group_by` `...` computations are performed on ungrouped data (see - # `?dplyr::group_by`) - detailed_mutate <- epix_detailed_restricted_mutate(private$ungrouped, ...) - out_ungrouped <- detailed_mutate[["archive"]] - vars_from_dots <- detailed_mutate[["request_names"]] - vars <- union(private$vars, vars_from_dots) - grouped_epi_archive$new(private$ungrouped, vars, .drop) - } - }, - group_by_drop_default = function() { - private$drop - }, - groups = function() { - rlang::syms(private$vars) - }, - ungroup = function(...) { - if (rlang::dots_n(...) == 0L) { - # No dots = special behavior: remove all grouping vars and convert to - # an ungrouped class, as with `grouped_df`s. - private$ungrouped - } else { - exclude_vars <- eval_pure_select_names_from_dots(..., .data = private$ungrouped$DT) - # (requiring a pure selection here is a little stricter than dplyr - # implementations, but passing a renaming selection into `ungroup` - # seems pretty weird.) - result_vars <- private$vars[!private$vars %in% exclude_vars] - # `vars` might be length 0 if the user's tidyselection removed all - # grouping vars. Unlike with tibble, opt here to keep the result as a - # grouped_epi_archive, for output class consistency when `...` is - # provided. - grouped_epi_archive$new(private$ungrouped, result_vars, private$drop) - } - }, - #' @description Filter to keep only older versions by mutating the underlying - #' `epi_archive` using `$truncate_versions_after`. Returns the mutated - #' `grouped_epi_archive` [invisibly][base::invisible]. 
- #' @param x as in [`epix_truncate_versions_after`] - #' @param max_version as in [`epix_truncate_versions_after`] - truncate_versions_after = function(max_version) { - # The grouping is irrelevant for this method; if we were to split into - # groups and recombine appropriately, we should get the same result as - # just leveraging the ungrouped method, so just do the latter: - private$ungrouped$truncate_versions_after(max_version) - return(invisible(self)) - }, - #' @description Slides a given function over variables in a `grouped_epi_archive` - #' object. See the documentation for the wrapper function [`epix_slide()`] for - #' details. - #' @importFrom data.table key address rbindlist setDF - #' @importFrom tibble as_tibble new_tibble validate_tibble - #' @importFrom dplyr group_by groups - #' @importFrom rlang !! !!! enquo quo_is_missing enquos is_quosure sym syms - #' env missing_arg - slide = function(f, ..., before, ref_time_values, - time_step, new_col_name = "slide_value", - as_list_col = FALSE, names_sep = "_", - all_versions = FALSE) { - # Perform some deprecated argument checks without using ` = - # deprecated()` in the function signature, because they are from - # early development versions and much more likely to be clutter than - # informative in the signature. - if ("group_by" %in% nse_dots_names(...)) { - cli_abort(" - The `group_by` argument to `slide` has been removed; please use - the `group_by` S3 generic function or `$group_by` R6 method - before the slide instead. (If you were instead trying to pass a - `group_by` argument to `f` or create a column named `group_by`, - this check is a false positive, but you will still need to use a - different column name here and rename the resulting column after - the slide.) - ", class = "epiprocess__epix_slide_group_by_parameter_deprecated") - } - if ("all_rows" %in% nse_dots_names(...)) { - cli_abort(" - The `all_rows` argument has been removed from `epix_slide` (but - is still supported in `epi_slide`). Add rows for excluded - results with a manual join instead. - ", class = "epiprocess__epix_slide_all_rows_parameter_deprecated") - } - - if (missing(ref_time_values)) { - ref_time_values <- epix_slide_ref_time_values_default(private$ungrouped) - } else { - assert_numeric(ref_time_values, min.len = 1L, null.ok = FALSE, any.missing = FALSE) - if (any(ref_time_values > private$ungrouped$versions_end)) { - cli_abort("Some `ref_time_values` are greater than the latest version in the archive.") - } - if (anyDuplicated(ref_time_values) != 0L) { - cli_abort("Some `ref_time_values` are duplicated.") - } - # Sort, for consistency with `epi_slide`, although the current - # implementation doesn't take advantage of it. 
- ref_time_values <- sort(ref_time_values) - } - - # Validate and pre-process `before`: - if (missing(before)) { - cli_abort("`before` is required (and must be passed by name); - if you did not want to apply a sliding window but rather - to map `as_of` and `f` across various `ref_time_values`, - pass a large `before` value (e.g., if time steps are days, - `before=365000`).") - } - before <- vctrs::vec_cast(before, integer()) - assert_int(before, lower = 0L, null.ok = FALSE, na.ok = FALSE) - - # If a custom time step is specified, then redefine units - - if (!missing(time_step)) before <- time_step(before) - - # Symbolize column name - new_col <- sym(new_col_name) - - # Validate rest of parameters: - assert_logical(as_list_col, len = 1L) - assert_logical(all_versions, len = 1L) - assert_character(names_sep, len = 1L, null.ok = TRUE) - - # Computation for one group, one time value - comp_one_grp <- function(.data_group, .group_key, - f, ..., - ref_time_value, - new_col) { - # Carry out the specified computation - comp_value <- f(.data_group, .group_key, ref_time_value, ...) - - if (all_versions) { - # Extract data from archive so we can do length checks below. When - # `all_versions = TRUE`, `.data_group` will always be an ungrouped - # archive because of the preceding `as_of` step. - .data_group <- .data_group$DT - } - - assert( - check_atomic(comp_value, any.missing = TRUE), - check_data_frame(comp_value), - combine = "or", - .var.name = vname(comp_value) - ) - - # Label every result row with the `ref_time_value` - res <- list(time_value = ref_time_value) - - # Wrap the computation output in a list and unchop/unnest later if - # `as_list_col = FALSE`. This approach means that we will get a - # list-class col rather than a data.frame-class col when - # `as_list_col = TRUE` and the computations outputs are data - # frames. - res[[new_col]] <- list(comp_value) - - # Convert the list to a tibble all at once for speed. - return(validate_tibble(new_tibble(res))) - } - - # If `f` is missing, interpret ... as an expression for tidy evaluation - if (missing(f)) { - quos <- enquos(...) - if (length(quos) == 0) { - cli_abort("If `f` is missing then a computation must be specified via `...`.") - } - if (length(quos) > 1) { - cli_abort("If `f` is missing then only a single computation can be specified via `...`.") - } - - f <- quos[[1]] - new_col <- sym(names(rlang::quos_auto_name(quos))) - ... <- missing_arg() # magic value that passes zero args as dots in calls below - } - - f <- as_slide_computation(f, ...) - x <- lapply(ref_time_values, function(ref_time_value) { - # Ungrouped as-of data; `epi_df` if `all_versions` is `FALSE`, - # `epi_archive` if `all_versions` is `TRUE`: - as_of_raw <- private$ungrouped$as_of( - ref_time_value, - min_time_value = ref_time_value - before, - all_versions = all_versions - ) - - # Set: - # * `as_of_df`, the data.frame/tibble/epi_df/etc. that we will - # `group_modify` as the `.data` argument. Might or might not - # include version column. - # * `group_modify_fn`, the corresponding `.f` argument - if (!all_versions) { - as_of_df <- as_of_raw - group_modify_fn <- comp_one_grp - } else { - as_of_archive <- as_of_raw - # We essentially want to `group_modify` the archive, but - # haven't implemented this method yet. Next best would be - # `group_modify` on its `$DT`, but that has different - # behavior based on whether or not `dtplyr` is loaded. - # Instead, go through an ordinary data frame, trying to avoid - # copies. 
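The zero-copy round trip described in the comment above can be sketched in isolation. A minimal illustration (not part of the patch; assumes only `data.table` is attached):

library(data.table)

dt <- data.table(geo_value = "ca", time_value = 1:3, value = c(10, 11, 12), key = "geo_value")
dt_key <- key(dt) # remember the key; `setDF` drops it
setDF(dt) # flip the class to plain data.frame in place; no data is copied
# ... data.frame-only work (e.g., `dplyr::group_modify`) happens here ...
setattr(dt, "sorted", dt_key) # reassert sortedness without re-sorting
setDT(dt, key = dt_key) # flip back to a keyed data.table, still without a copy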
- if (address(as_of_archive$DT) == address(private$ungrouped$DT)) { - # `as_of` aliased its the full `$DT`; copy before mutating: - # - # Note: this step is probably unneeded; we're fine with - # aliasing of the DT or its columns: vanilla operations aren't - # going to mutate them in-place if they are aliases, and we're - # not performing mutation (unlike the situation with - # `fill_through_version` where we do mutate a `DT` and don't - # want aliasing). - as_of_archive$DT <- copy(as_of_archive$DT) - } - dt_key <- data.table::key(as_of_archive$DT) - as_of_df <- as_of_archive$DT - data.table::setDF(as_of_df) - - # Convert each subgroup chunk to an archive before running the calculation. - group_modify_fn <- function(.data_group, .group_key, - f, ..., - ref_time_value, - new_col) { - # .data_group is coming from as_of_df as a tibble, but we - # want to feed `comp_one_grp` an `epi_archive` backed by a - # DT; convert and wrap: - data.table::setattr(.data_group, "sorted", dt_key) - data.table::setDT(.data_group, key = dt_key) - .data_group_archive <- as_of_archive$clone() - .data_group_archive$DT <- .data_group - comp_one_grp(.data_group_archive, .group_key, - f = f, ..., - ref_time_value = ref_time_value, - new_col = new_col - ) - } - } - - return( - dplyr::group_modify( - dplyr::group_by(as_of_df, !!!syms(private$vars), .drop = private$drop), - group_modify_fn, - f = f, ..., - ref_time_value = ref_time_value, - new_col = new_col, - .keep = TRUE - ) - ) - }) - # Combine output into a single tibble - x <- as_tibble(setDF(rbindlist(x))) - # Reconstruct groups - x <- group_by(x, !!!syms(private$vars), .drop = private$drop) - - # Unchop/unnest if we need to - if (!as_list_col) { - x <- tidyr::unnest(x, !!new_col, names_sep = names_sep) - } - - # nolint start: commented_code_linter. - # if (is_epi_df(x)) { - # # The analogue of `epi_df`'s `as_of` metadata for an archive is - # # `$versions_end`, at least in the current absence of - # # separate fields/columns denoting the "archive version" with a - # # different resolution, or from the perspective of a different - # # stage of a data pipeline. The `as_of` that is automatically - # # derived won't always match; override: - # attr(x, "metadata")[["as_of"]] <- private$ungrouped$versions_end - # } - # nolint end - - # XXX We need to work out when we want to return an `epi_df` and how - # to get appropriate keys (see #290, #223, #163). We'll probably - # need the commented-out code above if we ever output an `epi_df`. - # However, as a stopgap measure to have some more consistency across - # different ways of calling `epix_slide`, and to prevent `epi_df` - # output with invalid metadata, always output a (grouped or - # ungrouped) tibble. - x <- decay_epi_df(x) - - return(x) - } +new_grouped_epi_archive <- function(x, vars, drop) { + if (inherits(x, "grouped_epi_archive")) { + cli_abort( + "`ungrouped` must not already be grouped (neither automatic regrouping + nor nested grouping is supported). 
Either use `group_by` with `.add=TRUE`, + or `ungroup` first.", + class = "epiprocess__grouped_epi_archive__ungrouped_arg_is_already_grouped", + epiprocess__ungrouped_class = class(x), + epiprocess__ungrouped_groups = groups(x) + ) + } + assert_class(x, "epi_archive") + assert_character(vars) + if (!test_subset(vars, names(x$DT))) { + cli_abort( + "All grouping variables `vars` must be present in the data.", ) - ) + } + if ("version" %in% vars) { + cli_abort("`version` has a special interpretation and cannot be used by itself as a grouping variable") + } + assert_logical(drop, len = 1) + + # ----- + private <- list() + private$ungrouped <- x + private$vars <- vars + private$drop <- drop + + return(structure( + list( + private = private + ), + class = c("grouped_epi_archive", "epi_archive") + )) +} + + +#' @export +print.grouped_epi_archive <- function(x, ..., class = TRUE) { + if (rlang::dots_n(...) > 0) { + cli_abort(c( + "Error in print.grouped_epi_archive()", + "i" = "Too many arguments passed to `print.grouped_epi_archive()`." + )) + } + + if (class) cat("A `grouped_epi_archive` object:\n") + writeLines(wrap_varnames(x$private$vars, initial = "* Groups: ")) + # If none of the grouping vars is a factor, then $drop doesn't seem + # relevant, so try to be less verbose and don't message about it. + # + # Below map-then-extract may look weird, but the more natural + # extract-then-map appears to trigger copies of the extracted columns + # since we are working with a `data.table` (unless we go through + # `as.list`, but its current column-aliasing behavior is probably not + # something to rely too much on), while map functions currently appear + # to avoid column copies. + if (any(purrr::map_lgl(x$private$ungrouped$DT, is.factor)[x$private$vars])) { + cat(strwrap(initial = "* ", prefix = " ", sprintf( + "%s groups formed by factor levels that don't appear in the data", + if (x$private$drop) "Drops" else "Does not drop" + ))) + cat("\n") + } + cat("It wraps an ungrouped `epi_archive`, with metadata:\n") + print(x$private$ungrouped, class = FALSE) + # Return self invisibly for convenience in `$`-"pipe": + invisible(x) +} -# At time of writing, roxygen parses content in collation order, impacting the -# presentation of .Rd files that document multiple functions (see -# https://github.com/r-lib/roxygen2/pull/324). Use @include tags (determining -# `Collate:`) and ordering of functions within each file in order to get the -# desired ordering. #' @include methods-epi_archive.R #' @rdname group_by.epi_archive #' #' @importFrom dplyr group_by #' @export -group_by.grouped_epi_archive <- function(.data, ..., .add = FALSE, .drop = dplyr::group_by_drop_default(.data)) { - .data$group_by(..., .add = .add, .drop = .drop) +group_by.grouped_epi_archive <- function( + .data, + ..., + .add = FALSE, + .drop = dplyr::group_by_drop_default(.data)) { + assert_logical(.add, len = 1) + if (!.add) { + cli_abort('`group_by` on a `grouped_epi_archive` with `.add=FALSE` is forbidden + (neither automatic regrouping nor nested grouping is supported). + If you want to "regroup", replacing the existing grouping vars, `ungroup` first and then `group_by`. + If you want to add to the existing grouping vars, call `group_by` specifying `.add=TRUE`. + ', + class = "epiprocess__grouped_epi_archive_group_by_with_add_FALSE" + ) + } else { + # `group_by` `...` computations are performed on ungrouped data (see + # `?dplyr::group_by`) + detailed_mutate <- epix_detailed_restricted_mutate(.data$private$ungrouped, ...) 
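  # A hedged aside on the note above, assuming the package's
  # `archive_cases_dv_subset` example data: the mutate-like computation in
  # `...` is evaluated against the ungrouped archive's columns, so an
  # existing grouping neither masks nor alters it:
  #   archive_cases_dv_subset %>%
  #     group_by(geo_value) %>%
  #     group_by(cases_high = case_rate_7d_av > 20, .add = TRUE)
  #   # groups by `geo_value` plus the newly computed `cases_high` column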
+ out_ungrouped <- detailed_mutate[["archive"]] + vars_from_dots <- detailed_mutate[["request_names"]] + vars <- union(.data$private$vars, vars_from_dots) + new_grouped_epi_archive(out_ungrouped, vars, .drop) + } } + +#' @include methods-epi_archive.R +#' @rdname group_by.epi_archive +#' +#' @param .tbl A `grouped_epi_archive` object. +#' +#' @export +group_by_drop_default.grouped_epi_archive <- function(.tbl) { + x <- .tbl + x$private$drop +} + + #' @include methods-epi_archive.R #' @rdname group_by.epi_archive #' #' @importFrom dplyr groups #' @export groups.grouped_epi_archive <- function(x) { - x$groups() + rlang::syms(x$private$vars) } + #' @include methods-epi_archive.R #' @rdname group_by.epi_archive #' #' @importFrom dplyr ungroup #' @export ungroup.grouped_epi_archive <- function(x, ...) { - x$ungroup(...) + if (rlang::dots_n(...) == 0L) { + # No dots = special behavior: remove all grouping vars and convert to + # an ungrouped class, as with `grouped_df`s. + x$private$ungrouped + } else { + exclude_vars <- eval_pure_select_names_from_dots(..., .data = x$private$ungrouped$DT) + # (requiring a pure selection here is a little stricter than dplyr + # implementations, but passing a renaming selection into `ungroup` + # seems pretty weird.) + result_vars <- x$private$vars[!x$private$vars %in% exclude_vars] + # `vars` might be length 0 if the user's tidyselection removed all + # grouping vars. Unlike with tibble, opt here to keep the result as a + # grouped_epi_archive, for output class consistency when `...` is + # provided. + new_grouped_epi_archive(x$private$ungrouped, result_vars, x$private$drop) + } } + +#' @rdname epix_slide +#' +#' @importFrom data.table key address rbindlist setDF copy +#' @importFrom tibble as_tibble new_tibble validate_tibble +#' @importFrom dplyr group_by groups +#' @importFrom rlang !! !!! enquo quo_is_missing enquos is_quosure sym syms +#' env missing_arg +#' @export +epix_slide.grouped_epi_archive <- function(x, f, ..., before, ref_time_values, + time_step, new_col_name = "slide_value", + as_list_col = FALSE, names_sep = "_", + all_versions = FALSE) { + # Perform some deprecated argument checks without using ` = + # deprecated()` in the function signature, because they are from + # early development versions and much more likely to be clutter than + # informative in the signature. + if ("group_by" %in% nse_dots_names(...)) { + cli_abort(" + The `group_by` argument to `slide` has been removed; please use + the `group_by` S3 generic function or `$group_by` R6 method + before the slide instead. (If you were instead trying to pass a + `group_by` argument to `f` or create a column named `group_by`, + this check is a false positive, but you will still need to use a + different column name here and rename the resulting column after + the slide.) + ", class = "epiprocess__epix_slide_group_by_parameter_deprecated") + } + if ("all_rows" %in% nse_dots_names(...)) { + cli_abort(" + The `all_rows` argument has been removed from `epix_slide` (but + is still supported in `epi_slide`). Add rows for excluded + results with a manual join instead. 
+    ", class = "epiprocess__epix_slide_all_rows_parameter_deprecated")
+  }
+
+  if (missing(ref_time_values)) {
+    ref_time_values <- epix_slide_ref_time_values_default(x$private$ungrouped)
+  } else {
+    assert_numeric(ref_time_values, min.len = 1L, null.ok = FALSE, any.missing = FALSE)
+    if (any(ref_time_values > x$private$ungrouped$versions_end)) {
+      cli_abort("Some `ref_time_values` are greater than the latest version in the archive.")
+    }
+    if (anyDuplicated(ref_time_values) != 0L) {
+      cli_abort("Some `ref_time_values` are duplicated.")
+    }
+    # Sort, for consistency with `epi_slide`, although the current
+    # implementation doesn't take advantage of it.
+    ref_time_values <- sort(ref_time_values)
+  }
+
+  # Validate and pre-process `before`:
+  if (missing(before)) {
+    cli_abort("`before` is required (and must be passed by name);
+      if you did not want to apply a sliding window but rather
+      to map `epix_as_of` and `f` across various `ref_time_values`,
+      pass a large `before` value (e.g., if time steps are days,
+      `before=365000`).")
+  }
+  before <- vctrs::vec_cast(before, integer())
+  assert_int(before, lower = 0L, null.ok = FALSE, na.ok = FALSE)
+
+  # If a custom time step is specified, then redefine units
+
+  if (!missing(time_step)) before <- time_step(before)
+
+  # Symbolize column name
+  new_col <- sym(new_col_name)
+
+  # Validate rest of parameters:
+  assert_logical(as_list_col, len = 1L)
+  assert_logical(all_versions, len = 1L)
+  assert_character(names_sep, len = 1L, null.ok = TRUE)
+
+  # Computation for one group, one time value
+  comp_one_grp <- function(.data_group, .group_key,
+                           f, ...,
+                           ref_time_value,
+                           new_col) {
+    # Carry out the specified computation
+    comp_value <- f(.data_group, .group_key, ref_time_value, ...)
+
+    if (all_versions) {
+      # Extract data from archive so we can do length checks below. When
+      # `all_versions = TRUE`, `.data_group` will always be an ungrouped
+      # archive because of the preceding `epix_as_of` step.
+      .data_group <- .data_group$DT
+    }
+
+    assert(
+      check_atomic(comp_value, any.missing = TRUE),
+      check_data_frame(comp_value),
+      combine = "or",
+      .var.name = vname(comp_value)
+    )
+
+    # Label every result row with the `ref_time_value`
+    res <- list(time_value = ref_time_value)
+
+    # Wrap the computation output in a list and unchop/unnest later if
+    # `as_list_col = FALSE`. This approach means that we will get a
+    # list-class col rather than a data.frame-class col when
+    # `as_list_col = TRUE` and the computations' outputs are data
+    # frames.
+    res[[new_col]] <- list(comp_value)
+
+    # Convert the list to a tibble all at once for speed.
+    return(validate_tibble(new_tibble(res)))
+  }
+
+  # If `f` is missing, interpret ... as an expression for tidy evaluation
+  if (missing(f)) {
+    quos <- enquos(...)
+    if (length(quos) == 0) {
+      cli_abort("If `f` is missing then a computation must be specified via `...`.")
+    }
+    if (length(quos) > 1) {
+      cli_abort("If `f` is missing then only a single computation can be specified via `...`.")
+    }
+
+    f <- quos[[1]]
+    new_col <- sym(names(rlang::quos_auto_name(quos)))
+    ... <- missing_arg() # nolint: object_usage_linter. magic value that passes zero args as dots in calls below
+  }
+
+  f <- as_slide_computation(f, ...)
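  # Both calling conventions normalized above, as a hedged sketch against the
  # package's `archive_cases_dv_subset` example data:
  #   # 1. an explicit `f` (function or formula):
  #   archive_cases_dv_subset %>%
  #     group_by(geo_value) %>%
  #     epix_slide(f = ~ mean(.x$case_rate_7d_av), before = 2)
  #   # 2. `f` missing; a single named tidy-evaluation computation in `...`:
  #   archive_cases_dv_subset %>%
  #     group_by(geo_value) %>%
  #     epix_slide(case_rate_3d_av = mean(case_rate_7d_av), before = 2)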
+  out <- lapply(ref_time_values, function(ref_time_value) {
+    # Ungrouped as-of data; `epi_df` if `all_versions` is `FALSE`,
+    # `epi_archive` if `all_versions` is `TRUE`:
+    as_of_raw <- x$private$ungrouped %>% epix_as_of(
+      ref_time_value,
+      min_time_value = ref_time_value - before,
+      all_versions = all_versions
+    )
+
+    # Set:
+    # * `as_of_df`, the data.frame/tibble/epi_df/etc. that we will
+    #   `group_modify` as the `.data` argument. Might or might not
+    #   include version column.
+    # * `group_modify_fn`, the corresponding `.f` argument
+    if (!all_versions) {
+      as_of_df <- as_of_raw
+      group_modify_fn <- comp_one_grp
+    } else {
+      as_of_archive <- as_of_raw
+      # We essentially want to `group_modify` the archive, but
+      # haven't implemented this method yet. Next best would be
+      # `group_modify` on its `$DT`, but that has different
+      # behavior based on whether or not `dtplyr` is loaded.
+      # Instead, go through an ordinary data frame, trying to avoid
+      # copies.
+      if (address(as_of_archive$DT) == address(x$private$ungrouped$DT)) {
+        # `epix_as_of` aliased the full `$DT`; copy before mutating:
+        #
+        # Note: this step is probably unneeded; we're fine with
+        # aliasing of the DT or its columns: vanilla operations aren't
+        # going to mutate them in-place if they are aliases, and we're
+        # not performing mutation.
+        as_of_archive$DT <- data.table::copy(as_of_archive$DT)
+      }
+      dt_key <- data.table::key(as_of_archive$DT)
+      as_of_df <- as_of_archive$DT
+      data.table::setDF(as_of_df)
+
+      # Convert each subgroup chunk to an archive before running the calculation.
+      group_modify_fn <- function(.data_group, .group_key,
+                                  f, ...,
+                                  ref_time_value,
+                                  new_col) {
+        # .data_group is coming from as_of_df as a tibble, but we
+        # want to feed `comp_one_grp` an `epi_archive` backed by a
+        # DT; convert and wrap:
+        data.table::setattr(.data_group, "sorted", dt_key)
+        data.table::setDT(.data_group, key = dt_key)
+        .data_group_archive <- as_of_archive %>% clone()
+        .data_group_archive$DT <- .data_group
+        comp_one_grp(.data_group_archive, .group_key,
+          f = f, ...,
+          ref_time_value = ref_time_value,
+          new_col = new_col
+        )
+      }
+    }
+
+    return(
+      dplyr::group_modify(
+        dplyr::group_by(as_of_df, !!!syms(x$private$vars), .drop = x$private$drop),
+        group_modify_fn,
+        f = f, ...,
+        ref_time_value = ref_time_value,
+        new_col = new_col,
+        .keep = TRUE
+      )
+    )
+  })
+  # Combine output into a single tibble
+  out <- as_tibble(setDF(rbindlist(out)))
+  # Reconstruct groups
+  out <- group_by(out, !!!syms(x$private$vars), .drop = x$private$drop)
+
+  # Unchop/unnest if we need to
+  if (!as_list_col) {
+    out <- tidyr::unnest(out, !!new_col, names_sep = names_sep)
+  }
+
+  # nolint start: commented_code_linter.
+  # if (is_epi_df(x)) {
+  #   # The analogue of `epi_df`'s `as_of` metadata for an archive is
+  #   # `$versions_end`, at least in the current absence of
+  #   # separate fields/columns denoting the "archive version" with a
+  #   # different resolution, or from the perspective of a different
+  #   # stage of a data pipeline. The `as_of` that is automatically
+  #   # derived won't always match; override:
+  #   attr(x, "metadata")[["as_of"]] <- private$ungrouped$versions_end
+  # }
+  # nolint end
+
+  # XXX We need to work out when we want to return an `epi_df` and how
+  # to get appropriate keys (see #290, #223, #163). We'll probably
+  # need the commented-out code above if we ever output an `epi_df`.
+ # However, as a stopgap measure to have some more consistency across + # different ways of calling `epix_slide`, and to prevent `epi_df` + # output with invalid metadata, always output a (grouped or + # ungrouped) tibble. + out <- decay_epi_df(out) + + return(out) +} + + #' @include methods-epi_archive.R #' @rdname group_by.epi_archive #' @@ -450,16 +434,20 @@ is_grouped_epi_archive <- function(x) { inherits(x, "grouped_epi_archive") } -#' @include methods-epi_archive.R -#' @rdname group_by.epi_archive -#' + #' @export -group_by_drop_default.grouped_epi_archive <- function(.tbl) { - .tbl$group_by_drop_default() +clone.grouped_epi_archive <- function(x, ...) { + ungrouped <- x$private$ungrouped %>% clone() + new_grouped_epi_archive(ungrouped, x$private$vars, x$private$drop) } + +#' @rdname epix_truncate_versions_after #' @export epix_truncate_versions_after.grouped_epi_archive <- function(x, max_version) { - return((x$clone()$truncate_versions_after(max_version))) - # ^ second set of parens drops invisibility + # The grouping is irrelevant for this method; if we were to split into + # groups and recombine appropriately, we should get the same result as + # just leveraging the ungrouped method, so just do the latter: + x$private$ungrouped <- epix_truncate_versions_after(x$private$ungrouped, max_version) + x } diff --git a/R/growth_rate.R b/R/growth_rate.R index a60db452..f2b326a1 100644 --- a/R/growth_rate.R +++ b/R/growth_rate.R @@ -249,7 +249,9 @@ growth_rate <- function(x = seq_along(y), y, x0 = x, # Estimate growth rate and return f <- genlasso::coef.genlasso(obj, df = df)$beta - d <- extend_r(diff(f) / diff(x)) + d <- diff(f) / diff(x) + # Extend by one element + d <- c(d, d[length(d)]) if (log_scale) { return(d[i0]) } else { diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index 213cf1b1..f6846488 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -22,36 +22,17 @@ #' for the `max_version` of each `time_value`. Default is `FALSE`. #' @return An `epi_df` object. #' -#' @details This is simply a wrapper around the `as_of()` method of the -#' `epi_archive` class, so if `x` is an `epi_archive` object, then: -#' ``` -#' epix_as_of(x, max_version = v) -#' ``` -#' is equivalent to: -#' ``` -#' x$as_of(max_version = v) -#' ``` -#' -#' Mutation and aliasing: `epix_as_of` and `$as_of` will not mutate the input -#' archives, but may in some edge cases alias parts of the inputs, so copy the -#' outputs if needed before using mutating operations like `data.table`'s `:=` -#' operator. Currently, the only situation where there is potentially aliasing -#' is of the `DT` in edge cases with `all_versions = TRUE`, but this may change -#' in the future. -#' #' @examples #' # warning message of data latency shown #' epix_as_of( -#' x = archive_cases_dv_subset, +#' archive_cases_dv_subset, #' max_version = max(archive_cases_dv_subset$DT$version) #' ) #' -#' @examples -#' #' range(archive_cases_dv_subset$DT$version) # 2020-06-02 -- 2021-12-01 #' #' epix_as_of( -#' x = archive_cases_dv_subset, +#' archive_cases_dv_subset, #' max_version = as.Date("2020-06-12") #' ) #' @@ -66,7 +47,7 @@ #' withCallingHandlers( #' { #' epix_as_of( -#' x = archive_cases_dv_subset, +#' archive_cases_dv_subset, #' max_version = max(archive_cases_dv_subset$DT$version) #' ) #' }, @@ -75,14 +56,77 @@ #' # Since R 4.0, there is a `globalCallingHandlers` function that can be used #' # to globally toggle these warnings. 
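The example above muffles the clobberable-version warning per call; the `globalCallingHandlers` variant mentioned in the final comment can instead be registered once per session. A hedged sketch (R >= 4.0), keyed on the warning's condition class:

globalCallingHandlers(
  epiprocess__snapshot_as_of_clobberable_version = function(wrn) {
    invokeRestart("muffleWarning")
  }
)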
#' +#' @importFrom data.table between key #' @export epix_as_of <- function(x, max_version, min_time_value = -Inf, all_versions = FALSE) { assert_class(x, "epi_archive") - return(x$as_of(max_version, min_time_value, all_versions = all_versions)) + + other_keys <- setdiff( + key(x$DT), + c("geo_value", "time_value", "version") + ) + if (length(other_keys) == 0) other_keys <- NULL + + # Check a few things on max_version + if (!test_set_equal(class(max_version), class(x$DT$version))) { + cli_abort( + "`max_version` must have the same classes as `epi_archive$DT$version`." + ) + } + if (!test_set_equal(typeof(max_version), typeof(x$DT$version))) { + cli_abort( + "`max_version` must have the same types as `epi_archive$DT$version`." + ) + } + assert_scalar(max_version, na.ok = FALSE) + if (max_version > x$versions_end) { + cli_abort("`max_version` must be at most `epi_archive$versions_end`.") + } + assert_logical(all_versions, len = 1) + if (!is.na(x$clobberable_versions_start) && max_version >= x$clobberable_versions_start) { + cli_warn( + 'Getting data as of some recent version which could still be + overwritten (under routine circumstances) without assigning a new + version number (a.k.a. "clobbered"). Thus, the snapshot that we + produce here should not be expected to be reproducible later. See + `?epi_archive` for more info and `?epix_as_of` on how to muffle.', + class = "epiprocess__snapshot_as_of_clobberable_version" + ) + } + + # Filter by version and return + if (all_versions) { + # epi_archive is copied into result, so we can modify result directly + result <- epix_truncate_versions_after(x, max_version) + result$DT <- result$DT[time_value >= min_time_value, ] # nolint: object_usage_linter + return(result) + } + + # Make sure to use data.table ways of filtering and selecting + as_of_epi_df <- x$DT[time_value >= min_time_value & version <= max_version, ] %>% # nolint: object_usage_linter + unique( + by = c("geo_value", "time_value", other_keys), + fromLast = TRUE + ) %>% + tibble::as_tibble() %>% + dplyr::select(-"version") %>% + as_epi_df( + geo_type = x$geo_type, + time_type = x$time_type, + as_of = max_version, + additional_metadata = c( + x$additional_metadata, + list(other_keys = other_keys) + ) + ) + + return(as_of_epi_df) } -#' `epi_archive` with unobserved history filled in (won't mutate, might alias) + +#' Fill `epi_archive` unobserved history #' +#' @description #' Sometimes, due to upstream data pipeline issues, we have to work with a #' version history that isn't completely up to date, but with functions that #' expect archives that are completely up to date, or equally as up-to-date as @@ -90,13 +134,6 @@ epix_as_of <- function(x, max_version, min_time_value = -Inf, all_versions = FAL #' pretend that we've "observed" additional versions, filling in these versions #' with NAs or extrapolated values. #' -#' '`epix_fill_through_version` will not mutate its `x` argument, but its result -#' might alias fields of `x` (e.g., mutating the result's `DT` might mutate -#' `x$DT`). The R6 method variant, `x$fill_through_version`, will mutate `x` to -#' give the result, but might reseat its fields (e.g., references to the old -#' `x$DT` might not be updated by this function or subsequent operations on -#' `x`), and returns the updated `x` [invisibly][base::invisible]. 
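The snapshot logic in `epix_as_of` above hinges on `data.table`'s `unique(..., fromLast = TRUE)`: after filtering to `version <= max_version`, keeping the last row per observation key selects the most recent revision. A minimal, self-contained illustration with toy data (not from the package):

library(data.table)

dt <- data.table(
  geo_value = "ca",
  time_value = as.Date("2020-06-01"),
  version = as.Date(c("2020-06-02", "2020-06-05")),
  value = c(10, 12),
  key = c("geo_value", "time_value", "version")
)
snapshot <- unique(
  dt[version <= as.Date("2020-06-10"), ],
  by = c("geo_value", "time_value"),
  fromLast = TRUE
)
snapshot$value # 12: the later revision wins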
-#' #' @param x An `epi_archive` #' @param fill_versions_end Length-1, same class&type as `x$version`: the #' version through which to fill in missing version history; this will be the @@ -110,31 +147,79 @@ epix_as_of <- function(x, max_version, min_time_value = -Inf, all_versions = FAL #' version history with the last version of each observation carried forward #' (LOCF), by leaving the update `$DT` alone (other `epi_archive` methods are #' based on LOCF). Default is `"na"`. +#' +#' @importFrom data.table copy ":=" +#' @importFrom rlang arg_match #' @return An `epi_archive` +#' @export epix_fill_through_version <- function(x, fill_versions_end, how = c("na", "locf")) { assert_class(x, "epi_archive") - # Enclosing parentheses drop the invisibility flag. See description above of - # potential mutation and aliasing behavior. - (x$clone()$fill_through_version(fill_versions_end, how = how)) + + validate_version_bound(fill_versions_end, x$DT, na_ok = FALSE) + how <- arg_match(how) + if (x$versions_end < fill_versions_end) { + new_dt <- switch(how, + "na" = { + # old DT + a version consisting of all NA observations + # immediately after the last currently/actually-observed + # version. Note that this NA-observation version must only be + # added if `epi_archive` is outdated. + nonversion_key_cols <- setdiff(key(x$DT), "version") + nonkey_cols <- setdiff(names(x$DT), key(x$DT)) + next_version_tag <- next_after(x$versions_end) + if (next_version_tag > fill_versions_end) { + cli_abort(sprintf(paste( + "Apparent problem with `next_after` method:", + "archive contained observations through version %s", + "and the next possible version was supposed to be %s,", + "but this appeared to jump from a version < %3$s", + "to one > %3$s, implying at least one version in between." + ), x$versions_end, next_version_tag, fill_versions_end)) + } + nonversion_key_vals_ever_recorded <- unique(x$DT, by = nonversion_key_cols) + # In edge cases, the `unique` result can alias the original + # DT; detect and copy if necessary: + if (identical(address(x$DT), address(nonversion_key_vals_ever_recorded))) { + nonversion_key_vals_ever_recorded <- data.table::copy(nonversion_key_vals_ever_recorded) + } + next_version_dt <- nonversion_key_vals_ever_recorded[ + , version := next_version_tag + ][ + # this makes the class of these columns logical (`NA` is a + # logical NA; we're relying on the rbind below to convert to + # the proper class&typeof) + , (nonkey_cols) := NA + ] + # full result DT: + setkeyv(rbind(x$DT, next_version_dt), key(x$DT))[] + }, + "locf" = { + # just the old DT; LOCF is built into other methods: + x$DT + } + ) + new_versions_end <- fill_versions_end + # Update `epi_archive` all at once with simple, error-free operations + + # return below: + x$DT <- new_dt + x$versions_end <- new_versions_end + } else { + # Already sufficiently up to date; nothing to do. + } + return(x) } + #' Merge two `epi_archive` objects #' #' Merges two `epi_archive`s that share a common `geo_value`, `time_value`, and -#' set of key columns. When they also share a common `versions_end`, -#' using `$as_of` on the result should be the same as using `$as_of` on `x` and -#' `y` individually, then performing a full join of the `DT`s on the non-version -#' key columns (potentially consolidating multiple warnings about clobberable -#' versions). If the `versions_end` values differ, the -#' `sync` parameter controls what is done. 
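Concretely, the `how = "na"` branch implemented above behaves as if one extra version, consisting entirely of `NA` measurements, were published immediately after `versions_end`. A hedged sketch with a toy one-row archive (assumes `epiprocess`, `tibble`, and the `%>%` pipe are attached):

x <- tibble::tibble(
  geo_value = "us", time_value = as.Date("2020-01-01"),
  version = as.Date("2020-01-02"), value = 1
) %>% as_epi_archive()
y <- epix_fill_through_version(x, as.Date("2020-01-05"), how = "na")
y$versions_end # 2020-01-05
epix_as_of(y, max_version = as.Date("2020-01-02"))$value # 1
epix_as_of(y, max_version = as.Date("2020-01-05"))$value # NA: revised away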
-#' -#' This function, [`epix_merge`], does not mutate its inputs and will not alias -#' either archive's `DT`, but may alias other fields; `x$merge` will overwrite -#' `x` with the result of the merge, reseating its `DT` and several other fields -#' (making them point to different objects), but avoiding mutation of the -#' contents of the old `DT` (only relevant if you have another reference to the -#' old `DT` in another object). +#' set of key columns. When they also share a common `versions_end`, using +#' `epix_as_of` on the result should be the same as using `epix_as_of` on `x` +#' and `y` individually, then performing a full join of the `DT`s on the +#' non-version key columns (potentially consolidating multiple warnings about +#' clobberable versions). If the `versions_end` values differ, the `sync` +#' parameter controls what is done. #' #' @param x,y Two `epi_archive` objects to join together. #' @param sync Optional; `"forbid"`, `"na"`, `"locf"`, or `"truncate"`; in the @@ -153,7 +238,7 @@ epix_fill_through_version <- function(x, fill_versions_end, #' use `min(x$versions_end, y$versions_end)` as the result's `versions_end`, #' and discard any rows containing update rows for later versions. #' @param compactify Optional; `TRUE`, `FALSE`, or `NULL`; should the result be -#' compactified? See [`as_epi_archive`] for an explanation of what this means. +#' compactified? See `as_epi_archive()` for an explanation of what this means. #' Default here is `TRUE`. #' @return the resulting `epi_archive` #' @@ -171,8 +256,6 @@ epix_fill_through_version <- function(x, fill_versions_end, #' as_epi_archive(compactify = TRUE) #' # merge results stored in a third object: #' xy <- epix_merge(x, y) -#' # vs. mutating x to hold the merge result: -#' x$merge(y) #' #' @importFrom data.table key set setkeyv #' @export @@ -207,13 +290,9 @@ epix_merge <- function(x, y, if (all(is.na(c(x$clobberable_versions_start, y$clobberable_versions_start)))) { NA # (any type of NA is fine here) } else { - min_na_rm(c(x$clobberable_versions_start, y$clobberable_versions_start)) + min(c(x$clobberable_versions_start, y$clobberable_versions_start), na.rm = TRUE) } - # The actual merge below may not succeed 100% of the time, so do this - # preprocessing using non-mutating (but potentially aliasing) functions. This - # approach potentially uses more memory, but won't leave behind a - # partially-mutated `x` on failure. if (sync == "forbid") { if (!identical(x$versions_end, y$versions_end)) { cli_abort(paste( @@ -386,64 +465,6 @@ epix_merge <- function(x, y, )) } -# Helpers for `group_by`: - -#' Make non-testing mock to get [`dplyr::dplyr_col_modify`] input -#' -#' A workaround for `dplyr:::mutate_cols` not being exported and directly -#' applying test mock libraries likely being impossible (due to mocking another -#' package's S3 generic or method). -#' -#' Use solely with a single call to the [`dplyr::mutate`] function and then -#' `destructure_col_modify_recorder_df`; other applicable operations from -#' [dplyr::dplyr_extending] have not been implemented. 
-#' -#' @param parent_df the "parent class" data frame to wrap -#' @return a `col_modify_recorder_df` -#' -#' @noRd -new_col_modify_recorder_df <- function(parent_df) { - assert_class(parent_df, "data.frame") - `class<-`(parent_df, c("col_modify_recorder_df", class(parent_df))) -} - -#' Extract unchanged parent-class data frame from a `new_col_modify_recorder_df` -#' -#' @param col_modify_recorder_df an instance of a `col_modify_recorder_df` -#' @return named list with elements `unchanged_parent_df`, `cols`; `cols` is the -#' input to [`dplyr::dplyr_col_modify`] that this class was designed to record -#' -#' @noRd -destructure_col_modify_recorder_df <- function(col_modify_recorder_df) { - assert_class(col_modify_recorder_df, "col_modify_recorder_df") - list( - unchanged_parent_df = col_modify_recorder_df %>% - `attr<-`("epiprocess::col_modify_recorder_df::cols", NULL) %>% - `class<-`(setdiff(class(.data), "col_modify_recorder_df")), - cols = attr(col_modify_recorder_df, - "epiprocess::col_modify_recorder_df::cols", - exact = TRUE - ) - ) -} - -#' `dplyr_col_modify` method that simply records the `cols` argument -#' -#' Must export S3 methods in R >= 4.0, even if they're only designed to be -#' package internals, and must import any corresponding upstream S3 generic -#' functions: -#' @importFrom dplyr dplyr_col_modify -#' @export -#' @noRd -dplyr_col_modify.col_modify_recorder_df <- function(data, cols) { - if (!is.null(attr(data, "epiprocess::col_modify_recorder_df::cols", exact = TRUE))) { - cli_abort("`col_modify_recorder_df` can only record `cols` once", - internal = TRUE - ) - } - attr(data, "epiprocess::col_modify_recorder_df::cols") <- cols - data -} #' A more detailed but restricted `mutate` for use in `group_by.epi_archive` #' @@ -512,7 +533,7 @@ epix_detailed_restricted_mutate <- function(.data, ...) { out_dt <- dplyr::dplyr_col_modify(in_tbl, col_modify_cols) %>% data.table::setattr("sorted", data.table::key(.data$DT)) %>% data.table::setDT(key = key(.data$DT)) - out_archive <- .data$clone() + out_archive <- .data %>% clone() out_archive$DT <- out_dt request_names <- names(col_modify_cols) return(list( @@ -532,163 +553,6 @@ epix_detailed_restricted_mutate <- function(.data, ...) { } } -#' `group_by` and related methods for `epi_archive`, `grouped_epi_archive` -#' -#' @param .data An `epi_archive` or `grouped_epi_archive` -#' @param ... Similar to [`dplyr::group_by`] (see "Details:" for edge cases); -#' * For `group_by`: unquoted variable name(s) or other -#' ["data masking"][dplyr::dplyr_data_masking] expression(s). It's possible to -#' use [`dplyr::mutate`]-like syntax here to calculate new columns on which to -#' perform grouping, but note that, if you are regrouping an already-grouped -#' `.data` object, the calculations will be carried out ignoring such grouping -#' (same as [in dplyr][dplyr::group_by]). -#' * For `ungroup`: either -#' * empty, in order to remove the grouping and output an `epi_archive`; or -#' * variable name(s) or other ["tidy-select"][dplyr::dplyr_tidy_select] -#' expression(s), in order to remove the matching variables from the list of -#' grouping variables, and output another `grouped_epi_archive`. -#' @param .add Boolean. If `FALSE`, the default, the output will be grouped by -#' the variable selection from `...` only; if `TRUE`, the output will be -#' grouped by the current grouping variables plus the variable selection from -#' `...`. -#' @param .drop As described in [`dplyr::group_by`]; determines treatment of -#' factor columns. 
-#' @param x For `groups` or `ungroup`: a `grouped_epi_archive`; for -#' `is_grouped_epi_archive`: any object -#' @param .tbl (For `group_by_drop_default`:) an `epi_archive` or -#' `grouped_epi_archive` (`epi_archive` dispatches to the S3 default method; -#' `grouped_epi_archive` dispatches its own S3 method) -#' -#' @details -#' -#' To match `dplyr`, `group_by` allows "data masking" (also referred to as -#' "tidy evaluation") expressions `...`, not just column names, in a way similar -#' to `mutate`. Note that replacing or removing key columns with these -#' expressions is disabled. -#' -#' `archive %>% group_by()` and other expressions that group or regroup by zero -#' columns (indicating that all rows should be treated as part of one large -#' group) will output a `grouped_epi_archive`, in order to enable the use of -#' `grouped_epi_archive` methods on the result. This is in slight contrast to -#' the same operations on tibbles and grouped tibbles, which will *not* output a -#' `grouped_df` in these circumstances. -#' -#' Using `group_by` with `.add=FALSE` to override the existing grouping is -#' disabled; instead, `ungroup` first then `group_by`. -#' -#' Mutation and aliasing: `group_by` tries to use a shallow copy of the `DT`, -#' introducing column-level aliasing between its input and its result. This -#' doesn't follow the general model for most `data.table` operations, which -#' seems to be that, given an nonaliased (i.e., unique) pointer to a -#' `data.table` object, its pointers to its columns should also be nonaliased. -#' If you mutate any of the columns of either the input or result, first ensure -#' that it is fine if columns of the other are also mutated, but do not rely on -#' such behavior to occur. Additionally, never perform mutation on the key -#' columns at all (except for strictly increasing transformations), as this will -#' invalidate sortedness assumptions about the rows. -#' -#' `group_by_drop_default` on (ungrouped) `epi_archive`s is expected to dispatch -#' to `group_by_drop_default.default` (but there is a dedicated method for -#' `grouped_epi_archive`s). 
-#' -#' @examples -#' -#' grouped_archive <- archive_cases_dv_subset %>% group_by(geo_value) -#' -#' # `print` for metadata and method listing: -#' grouped_archive %>% print() -#' -#' # The primary use for grouping is to perform a grouped `epix_slide`: -#' -#' archive_cases_dv_subset %>% -#' group_by(geo_value) %>% -#' epix_slide( -#' f = ~ mean(.x$case_rate_7d_av), -#' before = 2, -#' ref_time_values = as.Date("2020-06-11") + 0:2, -#' new_col_name = "case_rate_3d_av" -#' ) %>% -#' ungroup() -#' -#' # ----------------------------------------------------------------- -#' -#' # Advanced: some other features of dplyr grouping are implemented: -#' -#' library(dplyr) -#' toy_archive <- -#' tribble( -#' ~geo_value, ~age_group, ~time_value, ~version, ~value, -#' "us", "adult", "2000-01-01", "2000-01-02", 121, -#' "us", "pediatric", "2000-01-02", "2000-01-03", 5, # (addition) -#' "us", "adult", "2000-01-01", "2000-01-03", 125, # (revision) -#' "us", "adult", "2000-01-02", "2000-01-03", 130 # (addition) -#' ) %>% -#' mutate( -#' age_group = ordered(age_group, c("pediatric", "adult")), -#' time_value = as.Date(time_value), -#' version = as.Date(version) -#' ) %>% -#' as_epi_archive(other_keys = "age_group") -#' -#' # The following are equivalent: -#' toy_archive %>% group_by(geo_value, age_group) -#' toy_archive %>% -#' group_by(geo_value) %>% -#' group_by(age_group, .add = TRUE) -#' grouping_cols <- c("geo_value", "age_group") -#' toy_archive %>% group_by(across(all_of(grouping_cols))) -#' -#' # And these are equivalent: -#' toy_archive %>% group_by(geo_value) -#' toy_archive %>% -#' group_by(geo_value, age_group) %>% -#' ungroup(age_group) -#' -#' # To get the grouping variable names as a `list` of `name`s (a.k.a. symbols): -#' toy_archive %>% -#' group_by(geo_value) %>% -#' groups() -#' -#' toy_archive %>% -#' group_by(geo_value, age_group, .drop = FALSE) %>% -#' epix_slide(f = ~ sum(.x$value), before = 20) %>% -#' ungroup() -#' -#' @importFrom dplyr group_by -#' @export -#' -#' @aliases grouped_epi_archive -group_by.epi_archive <- function(.data, ..., .add = FALSE, .drop = dplyr::group_by_drop_default(.data)) { - # `add` makes no difference; this is an ungrouped `epi_archive`. - detailed_mutate <- epix_detailed_restricted_mutate(.data, ...) - assert_logical(.drop) - if (!.drop) { - grouping_cols <- as.list(detailed_mutate[["archive"]][["DT"]])[detailed_mutate[["request_names"]]] - grouping_col_is_factor <- purrr::map_lgl(grouping_cols, is.factor) - # ^ Use `as.list` to try to avoid any possibility of a deep copy. 
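For reference, `.drop = FALSE` on an archive only makes sense with factor grouping columns, which is what the warnings below guard against. A hedged sketch with a toy archive, modeled on the `toy_archive` example above:

toy <- tibble::tibble(
  geo_value = "us",
  age_group = factor("adult", levels = c("pediatric", "adult")),
  time_value = as.Date("2000-01-01"),
  version = as.Date("2000-01-02"),
  value = 1
) %>% as_epi_archive(other_keys = "age_group")
toy %>% group_by(age_group, .drop = FALSE) # retains the empty "pediatric" group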
-    if (!any(grouping_col_is_factor)) {
-      cli_warn(
-        "`.drop=FALSE` but there are no factor grouping columns;
-        did you mean to convert one of the columns to a factor beforehand?",
-        class = "epiprocess__group_by_epi_archive__drop_FALSE_no_factors"
-      )
-    } else if (any(diff(grouping_col_is_factor) == -1L)) {
-      cli_warn(
-        "`.drop=FALSE` but there are one or more non-factor grouping columns listed
-        after a factor grouping column; this may produce groups with `NA`s for these columns;
-        see https://github.com/tidyverse/dplyr/issues/5369#issuecomment-683762553;
-        depending on how you want completion to work, you might instead want to convert
-        all grouping columns to factors beforehand, specify the non-factor grouping columns
-        first, or use `.drop=TRUE` and add a call to `tidyr::complete`.",
-        class = "epiprocess__group_by_epi_archive__drop_FALSE_nonfactor_after_factor"
-      )
-    }
-  }
-  grouped_epi_archive$new(detailed_mutate[["archive"]],
-    detailed_mutate[["request_names"]],
-    drop = .drop
-  )
-}
 
 #' Slide a function over variables in an `epi_archive` or `grouped_epi_archive`
 #'
@@ -742,8 +606,8 @@ group_by.epi_archive <- function(.data, ..., .add = FALSE, .drop = dplyr::group_
 #' @param ref_time_values Reference time values / versions for sliding
 #'   computations; each element of this vector serves both as the anchor point
 #'   for the `time_value` window for the computation and the `max_version`
-#'   `as_of` which we fetch data in this window. If missing, then this will set
-#'   to a regularly-spaced sequence of values set to cover the range of
+#'   `epix_as_of` which we fetch data in this window. If missing, then this will
+#'   be set to a regularly-spaced sequence of values set to cover the range of
 #'   `version`s in the `DT` plus the `versions_end`; the spacing of values will
 #'   be guessed (using the GCD of the skips between values).
 #' @param time_step Optional function used to define the meaning of one time
@@ -823,30 +687,11 @@ group_by.epi_archive <- function(.data, ..., .add = FALSE, .drop = dplyr::group_
 #'
 #' Furthermore, the current function can be considerably slower than
 #' `epi_slide()`, for two reasons: (1) it must repeatedly fetch
-#' properly-versioned snapshots from the data archive (via its `as_of()`
-#' method), and (2) it performs a "manual" sliding of sorts, and does not
-#' benefit from the highly efficient `slider` package. For this reason, it
-#' should never be used in place of `epi_slide()`, and only used when
-#' version-aware sliding is necessary (as it its purpose).
-#'
-#' Finally, this is simply a wrapper around the `slide()` method of the
-#' `epi_archive` and `grouped_epi_archive` classes, so if `x` is an
-#' object of either of these classes, then:
-#' ```
-#' epix_slide(x, new_var = comp(old_var), before = 119)
-#' ```
-#' is equivalent to:
-#' ```
-#' x$slide(new_var = comp(old_var), before = 119)
-#' ```
-#'
-#' Mutation and aliasing: `epix_slide` and `$slide` will not perform in-place
-#' mutation of the input archives on their own. In some edge cases the inputs it
-#' feeds to the slide computations may alias parts of the input archive, so copy
-#' the slide computation inputs if needed before using mutating operations like
-#' `data.table`'s `:=` operator. Similarly, in some edge cases, the output of
-#' the slide operation may alias parts of the input archive, so similarly, make
-#' sure to clone and/or copy appropriately before using in-place mutation.
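The performance distinction drawn in this details section can be made concrete. A hedged sketch (using the package's `archive_cases_dv_subset` example data) contrasting the fast, version-unaware slide on a finalized snapshot with the version-aware slide that re-queries the archive at each reference time:

# version-unaware: slide over a single finalized snapshot (slider-backed, fast)
archive_cases_dv_subset %>%
  epix_as_of(max_version = as.Date("2021-12-01")) %>%
  group_by(geo_value) %>%
  epi_slide(case_rate_3d_av = mean(case_rate_7d_av), before = 2)

# version-aware: re-fetch an as-of snapshot for every `ref_time_value` (slower)
archive_cases_dv_subset %>%
  group_by(geo_value) %>%
  epix_slide(case_rate_3d_av = mean(case_rate_7d_av),
    before = 2, ref_time_values = as.Date("2020-06-11") + 0:2
  )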
+#' properly-versioned snapshots from the data archive (via `epix_as_of()`),
+#' and (2) it performs a "manual" sliding of sorts, and does not benefit from
+#' the highly efficient `slider` package. For this reason, it should never be
+#' used in place of `epi_slide()`, and only used when version-aware sliding is
+#' necessary (as is its purpose).
 #'
 #' @examples
 #' library(dplyr)
@@ -940,26 +785,52 @@ group_by.epi_archive <- function(.data, ..., .add = FALSE, .drop = dplyr::group_
 #'   filter(geo_value == "ca") %>%
 #'   select(-geo_value)
 #'
-#' @importFrom rlang enquo !!!
 #' @export
-epix_slide <- function(x, f, ..., before, ref_time_values,
-                       time_step, new_col_name = "slide_value",
-                       as_list_col = FALSE, names_sep = "_",
-                       all_versions = FALSE) {
+epix_slide <- function(
+    x,
+    f,
+    ...,
+    before,
+    ref_time_values,
+    time_step,
+    new_col_name = "slide_value",
+    as_list_col = FALSE,
+    names_sep = "_",
+    all_versions = FALSE) {
   if (!is_epi_archive(x, grouped_okay = TRUE)) {
     cli_abort("`x` must be of class `epi_archive` or `grouped_epi_archive`.")
   }
-  return(x$slide(f, ...,
-    before = before,
-    ref_time_values = ref_time_values,
-    time_step = time_step,
-    new_col_name = new_col_name,
-    as_list_col = as_list_col,
-    names_sep = names_sep,
+  UseMethod("epix_slide")
+}
+
+
+#' @rdname epix_slide
+#' @export
+epix_slide.epi_archive <- function(x, f, ..., before, ref_time_values,
                                    time_step, new_col_name = "slide_value",
                                    as_list_col = FALSE, names_sep = "_",
                                    all_versions = FALSE) {
+  # For an "ungrouped" slide, treat all rows as belonging to one big
+  # group (group by 0 vars), like `dplyr::summarize`, and let the
+  # resulting `grouped_epi_archive` handle the slide:
+  epix_slide(
+    group_by(x),
+    f,
+    ...,
+    before = before, ref_time_values = ref_time_values,
+    time_step = time_step, new_col_name = new_col_name,
+    as_list_col = as_list_col, names_sep = names_sep,
     all_versions = all_versions
-  ))
+  ) %>%
+    # We want a slide on ungrouped archives to output something
+    # ungrouped, rather than retaining the trivial (0-variable)
+    # grouping applied above. So we `ungroup()`. However, the current
+    # `dplyr` implementation automatically ignores/drops trivial
+    # groupings, so this is just a no-op for now.
+    ungroup()
 }
 
+
 #' Default value for `ref_time_values` in an `epix_slide`
 #'
 #' @noRd
@@ -969,16 +840,14 @@ epix_slide_ref_time_values_default <- function(ea) {
   return(ref_time_values)
 }
 
+
 #' Filter an `epi_archive` object to keep only older versions
 #'
 #' Generates a filtered `epi_archive` from an `epi_archive` object, keeping
 #' only rows with `version` falling on or before a specified date.
 #'
-#' @param x An `epi_archive` object
-#' @param max_version Time value specifying the max version to permit in the
-#'   filtered archive. That is, the output archive will comprise rows of the
-#'   current archive data having `version` less than or equal to the
-#'   specified `max_version`
+#' @param x An `epi_archive` object.
+#' @param max_version The latest version to include in the archive.
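A short usage sketch for the function documented here (hedged; assumes the package's `archive_cases_dv_subset` example data):

x <- epix_truncate_versions_after(archive_cases_dv_subset, as.Date("2020-06-12"))
x$versions_end # 2020-06-12
max(x$DT$version) <= as.Date("2020-06-12") # TRUE: later revisions are dropped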
#' @return An `epi_archive` object #' #' @export @@ -986,8 +855,89 @@ epix_truncate_versions_after <- function(x, max_version) { UseMethod("epix_truncate_versions_after") } + +#' @rdname epix_truncate_versions_after #' @export epix_truncate_versions_after.epi_archive <- function(x, max_version) { - return((x$clone()$truncate_versions_after(max_version))) - # ^ second set of parens drops invisibility + if (!test_set_equal(class(max_version), class(x$DT$version))) { + cli_abort("`max_version` must have the same classes as `epi_archive$DT$version`.") + } + if (!test_set_equal(typeof(max_version), typeof(x$DT$version))) { + cli_abort("`max_version` must have the same types as `epi_archive$DT$version`.") + } + assert_scalar(max_version, na.ok = FALSE) + if (max_version > x$versions_end) { + cli_abort("`max_version` must be at most `epi_archive$versions_end`.") + } + x$DT <- x$DT[x$DT$version <= max_version, colnames(x$DT), with = FALSE] + # (^ this filter operation seems to always copy the DT, even if it + # keeps every entry; we don't guarantee this behavior in + # documentation, though, so we could change to alias in this case) + if (!is.na(x$clobberable_versions_start) && x$clobberable_versions_start > max_version) { + x$clobberable_versions_start <- NA + } + x$versions_end <- max_version + return(x) +} + + +# Helpers for `group_by`: + +#' Make non-testing mock to get [`dplyr::dplyr_col_modify`] input +#' +#' A workaround for `dplyr:::mutate_cols` not being exported and directly +#' applying test mock libraries likely being impossible (due to mocking another +#' package's S3 generic or method). +#' +#' Use solely with a single call to the [`dplyr::mutate`] function and then +#' `destructure_col_modify_recorder_df`; other applicable operations from +#' [dplyr::dplyr_extending] have not been implemented. 
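The intended single-shot use of this recorder is roughly as follows (a hedged sketch of an internal pattern, not exported API):

recorder <- new_col_modify_recorder_df(tibble::tibble(a = 1:3))
recorder <- dplyr::mutate(recorder, b = a + 1L)
parts <- destructure_col_modify_recorder_df(recorder)
parts$cols # the `cols` list `dplyr_col_modify` received; here, list(b = 2:4)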
+#' +#' @param parent_df the "parent class" data frame to wrap +#' @return a `col_modify_recorder_df` +#' +#' @noRd +new_col_modify_recorder_df <- function(parent_df) { + assert_class(parent_df, "data.frame") + `class<-`(parent_df, c("col_modify_recorder_df", class(parent_df))) +} + + +#' Extract unchanged parent-class data frame from a `new_col_modify_recorder_df` +#' +#' @param col_modify_recorder_df an instance of a `col_modify_recorder_df` +#' @return named list with elements `unchanged_parent_df`, `cols`; `cols` is the +#' input to [`dplyr::dplyr_col_modify`] that this class was designed to record +#' +#' @noRd +destructure_col_modify_recorder_df <- function(col_modify_recorder_df) { + assert_class(col_modify_recorder_df, "col_modify_recorder_df") + list( + unchanged_parent_df = col_modify_recorder_df %>% + `attr<-`("epiprocess::col_modify_recorder_df::cols", NULL) %>% + `class<-`(setdiff(class(.data), "col_modify_recorder_df")), + cols = attr(col_modify_recorder_df, + "epiprocess::col_modify_recorder_df::cols", + exact = TRUE + ) + ) +} + + +#' `dplyr_col_modify` method that simply records the `cols` argument +#' +#' Must export S3 methods in R >= 4.0, even if they're only designed to be +#' package internals, and must import any corresponding upstream S3 generic +#' functions: +#' @importFrom dplyr dplyr_col_modify +#' @export +#' @noRd +dplyr_col_modify.col_modify_recorder_df <- function(data, cols) { + if (!is.null(attr(data, "epiprocess::col_modify_recorder_df::cols", exact = TRUE))) { + cli_abort("`col_modify_recorder_df` can only record `cols` once", + internal = TRUE + ) + } + attr(data, "epiprocess::col_modify_recorder_df::cols") <- cols + data } diff --git a/R/methods-epi_archive_new.R b/R/methods-epi_archive_new.R deleted file mode 100644 index 3ce39afc..00000000 --- a/R/methods-epi_archive_new.R +++ /dev/null @@ -1,826 +0,0 @@ -#' Generate a snapshot from an `epi_archive` object -#' -#' Generates a snapshot in `epi_df` format from an `epi_archive` object, as of a -#' given version. See the [archive -#' vignette](https://cmu-delphi.github.io/epiprocess/articles/archive.html) for -#' examples. -#' -#' @param x An `epi_archive` object -#' @param max_version Time value specifying the max version to permit in the -#' snapshot. That is, the snapshot will comprise the unique rows of the -#' current archive data that represent the most up-to-date signal values, as -#' of the specified `max_version` (and whose time values are at least -#' `min_time_value`.) -#' @param min_time_value Time value specifying the min time value to permit in -#' the snapshot. Default is `-Inf`, which effectively means that there is no -#' minimum considered. -#' @param all_versions If `all_versions = TRUE`, then the output will be in -#' `epi_archive` format, and contain rows in the specified `time_value` range -#' having `version <= max_version`. The resulting object will cover a -#' potentially narrower `version` and `time_value` range than `x`, depending -#' on user-provided arguments. Otherwise, there will be one row in the output -#' for the `max_version` of each `time_value`. Default is `FALSE`. -#' @return An `epi_df` object. 
-#' -#' @details This is simply a wrapper around the `as_of()` method of the -#' `epi_archive` class, so if `x` is an `epi_archive` object, then: -#' ``` -#' epix_as_of(x, max_version = v) -#' ``` -#' is equivalent to: -#' ``` -#' x$as_of(max_version = v) -#' ``` -#' -#' Mutation and aliasing: `epix_as_of` and `$as_of` will not mutate the input -#' archives, but may in some edge cases alias parts of the inputs, so copy the -#' outputs if needed before using mutating operations like `data.table`'s `:=` -#' operator. Currently, the only situation where there is potentially aliasing -#' is of the `DT` in edge cases with `all_versions = TRUE`, but this may change -#' in the future. -#' -#' @examples -#' # warning message of data latency shown -#' epix_as_of2( -#' archive_cases_dv_subset_2, -#' max_version = max(archive_cases_dv_subset_2$DT$version) -#' ) -#' -#' range(archive_cases_dv_subset_2$DT$version) # 2020-06-02 -- 2021-12-01 -#' -#' epix_as_of2( -#' archive_cases_dv_subset_2, -#' max_version = as.Date("2020-06-12") -#' ) -#' -#' # When fetching a snapshot as of the latest version with update data in the -#' # archive, a warning is issued by default, as this update data might not yet -#' # be finalized (for example, if data versions are labeled with dates, these -#' # versions might be overwritten throughout the corresponding days with -#' # additional data or "hotfixes" of erroroneous data; when we build an archive -#' # based on database queries, the latest available update might still be -#' # subject to change, but previous versions should be finalized). We can -#' # muffle such warnings with the following pattern: -#' withCallingHandlers( -#' { -#' epix_as_of2( -#' archive_cases_dv_subset_2, -#' max_version = max(archive_cases_dv_subset_2$DT$version) -#' ) -#' }, -#' epiprocess__snapshot_as_of_clobberable_version = function(wrn) invokeRestart("muffleWarning") -#' ) -#' # Since R 4.0, there is a `globalCallingHandlers` function that can be used -#' # to globally toggle these warnings. -#' -#' @export -epix_as_of2 <- function(epi_archive, max_version, min_time_value = -Inf, all_versions = FALSE) { - assert_class(epi_archive, "epi_archive2") - return(as_of(epi_archive, max_version, min_time_value, all_versions = all_versions)) -} - -#' `epi_archive` with unobserved history filled in (won't mutate, might alias) -#' -#' Sometimes, due to upstream data pipeline issues, we have to work with a -#' version history that isn't completely up to date, but with functions that -#' expect archives that are completely up to date, or equally as up-to-date as -#' another archive. This function provides one way to approach such mismatches: -#' pretend that we've "observed" additional versions, filling in these versions -#' with NAs or extrapolated values. -#' -#' '`epix_fill_through_version` will not mutate its `x` argument, but its result -#' might alias fields of `x` (e.g., mutating the result's `DT` might mutate -#' `x$DT`). The R6 method variant, `x$fill_through_version`, will mutate `x` to -#' give the result, but might reseat its fields (e.g., references to the old -#' `x$DT` might not be updated by this function or subsequent operations on -#' `x`), and returns the updated `x` [invisibly][base::invisible]. -#' -#' @param x An `epi_archive` -#' @param fill_versions_end Length-1, same class&type as `x$version`: the -#' version through which to fill in missing version history; this will be the -#' result's `$versions_end` unless it already had a later -#' `$versions_end`. 
-#' @param how Optional; `"na"` or `"locf"`: `"na"` will fill in any missing
-#'   required version history with `NA`s, by inserting (if necessary) an update
-#'   immediately after the current `$versions_end` that revises all
-#'   existing measurements to be `NA` (this is only supported for `version`
-#'   classes with a `next_after` implementation); `"locf"` will fill in missing
-#'   version history with the last version of each observation carried forward
-#'   (LOCF), by leaving the update `$DT` alone (other `epi_archive` methods are
-#'   based on LOCF). Default is `"na"`.
-#' @return An `epi_archive`
-epix_fill_through_version2 <- function(epi_archive, fill_versions_end,
-                                       how = c("na", "locf")) {
-  assert_class(epi_archive, "epi_archive2")
-  cloned_epi_archive <- clone(epi_archive)
-  # Enclosing parentheses drop the invisibility flag. See description above of
-  # potential mutation and aliasing behavior.
-  (fill_through_version(cloned_epi_archive, fill_versions_end, how = how))
-}
-
-#' Merge two `epi_archive` objects
-#'
-#' Merges two `epi_archive`s that share a common `geo_value`, `time_value`, and
-#' set of key columns. When they also share a common `versions_end`,
-#' using `$as_of` on the result should be the same as using `$as_of` on `x` and
-#' `y` individually, then performing a full join of the `DT`s on the non-version
-#' key columns (potentially consolidating multiple warnings about clobberable
-#' versions). If the `versions_end` values differ, the
-#' `sync` parameter controls what is done.
-#'
-#' This function, [`epix_merge`], does not mutate its inputs and will not alias
-#' either archive's `DT`, but may alias other fields; `x$merge` will overwrite
-#' `x` with the result of the merge, reseating its `DT` and several other fields
-#' (making them point to different objects), but avoiding mutation of the
-#' contents of the old `DT` (only relevant if you have another reference to the
-#' old `DT` in another object).
-#'
-#' @param x,y Two `epi_archive` objects to join together.
-#' @param sync Optional; `"forbid"`, `"na"`, `"locf"`, or `"truncate"`; in the
-#'   case that `x$versions_end` doesn't match `y$versions_end`, what do we do?:
-#'   `"forbid"`: emit an error; `"na"`: use `max(x$versions_end, y$versions_end)`
-#'   as the result's `versions_end`, but ensure that, if we request a snapshot
-#'   as of a version after `min(x$versions_end, y$versions_end)`, the
-#'   observation columns from the less up-to-date archive will be all NAs (i.e.,
-#'   imagine there was an update immediately after its `versions_end` which
-#'   revised all observations to be `NA`); `"locf"`: use `max(x$versions_end,
-#'   y$versions_end)` as the result's `versions_end`, allowing the last version
-#'   of each observation to be carried forward to extrapolate unavailable
-#'   versions for the less up-to-date input archive (i.e., imagining that the
-#'   less up-to-date archive's data set remained unchanged between its actual
-#'   `versions_end` and the other archive's `versions_end`); or `"truncate"`:
-#'   use `min(x$versions_end, y$versions_end)` as the result's `versions_end`,
-#'   and discard any update rows for later versions.
-#' @param compactify Optional; `TRUE`, `FALSE`, or `NULL`; should the result be
-#'   compactified? See [`as_epi_archive`] for an explanation of what this means.
-#'   Default here is `TRUE`.
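As a concrete illustration of the `how` options documented above for the fill-through operation, consider a one-row toy archive (a sketch; the data and the `cases` column are assumptions for illustration, and the non-`2` wrapper name is the one this patch retains):

```r
library(epiprocess)

x <- as_epi_archive(tibble::tibble(
  geo_value = "ca",
  time_value = as.Date("2020-06-01"),
  version = as.Date("2020-06-02"),
  cases = 100
))
x$versions_end # 2020-06-02

# "locf": the update data is left alone; only `versions_end` advances
epix_fill_through_version(x, as.Date("2020-06-05"), how = "locf")$versions_end

# "na": additionally inserts an update just after the old `versions_end`
# (here, version 2020-06-03) revising `cases` to NA
epix_fill_through_version(x, as.Date("2020-06-05"), how = "na")$DT
```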
-#' @return the resulting `epi_archive` -#' -#' @details In all cases, `additional_metadata` will be an empty list, and -#' `clobberable_versions_start` will be set to the earliest version that could -#' be clobbered in either input archive. -#' -#' @examples -#' # create two example epi_archive datasets -#' x <- archive_cases_dv_subset_2$DT %>% -#' dplyr::select(geo_value, time_value, version, case_rate_7d_av) %>% -#' as_epi_archive2(compactify = TRUE) -#' y <- archive_cases_dv_subset_2$DT %>% -#' dplyr::select(geo_value, time_value, version, percent_cli) %>% -#' as_epi_archive2(compactify = TRUE) -#' # merge results stored in a third object: -#' xy <- epix_merge2(x, y) -#' -#' @importFrom data.table key set setkeyv -#' @export -epix_merge2 <- function(x, y, - sync = c("forbid", "na", "locf", "truncate"), - compactify = TRUE) { - assert_class(x, "epi_archive2") - assert_class(y, "epi_archive2") - sync <- rlang::arg_match(sync) - - if (!identical(x$geo_type, y$geo_type)) { - cli_abort("`x` and `y` must have the same `$geo_type`") - } - - if (!identical(x$time_type, y$time_type)) { - cli_abort("`x` and `y` must have the same `$time_type`") - } - - if (length(x$additional_metadata) != 0L) { - cli_warn("x$additional_metadata won't appear in merge result", - class = "epiprocess__epix_merge_ignores_additional_metadata" - ) - } - if (length(y$additional_metadata) != 0L) { - cli_warn("y$additional_metadata won't appear in merge result", - class = "epiprocess__epix_merge_ignores_additional_metadata" - ) - } - result_additional_metadata <- list() - - result_clobberable_versions_start <- - if (all(is.na(c(x$clobberable_versions_start, y$clobberable_versions_start)))) { - NA # (any type of NA is fine here) - } else { - min_na_rm(c(x$clobberable_versions_start, y$clobberable_versions_start)) - } - - # The actual merge below may not succeed 100% of the time, so do this - # preprocessing using non-mutating (but potentially aliasing) functions. This - # approach potentially uses more memory, but won't leave behind a - # partially-mutated `x` on failure. - if (sync == "forbid") { - if (!identical(x$versions_end, y$versions_end)) { - cli_abort(paste( - "`x` and `y` were not equally up to date version-wise:", - "`x$versions_end` was not identical to `y$versions_end`;", - "either ensure that `x` and `y` are equally up to date before merging,", - "or specify how to deal with this using `sync`" - ), class = "epiprocess__epix_merge_unresolved_sync") - } else { - new_versions_end <- x$versions_end - x_DT <- x$DT - y_DT <- y$DT - } - } else if (sync %in% c("na", "locf")) { - new_versions_end <- max(x$versions_end, y$versions_end) - x_DT <- epix_fill_through_version2(x, new_versions_end, sync)$DT - y_DT <- epix_fill_through_version2(y, new_versions_end, sync)$DT - } else if (sync == "truncate") { - new_versions_end <- min(x$versions_end, y$versions_end) - x_DT <- x$DT[x[["DT"]][["version"]] <= new_versions_end, names(x$DT), with = FALSE] - y_DT <- y$DT[y[["DT"]][["version"]] <= new_versions_end, names(y$DT), with = FALSE] - } else { - cli_abort("unimplemented") - } - - # key(x_DT) should be the same as key(x$DT) and key(y_DT) should be the same - # as key(y$DT). Below, we only use {x,y}_DT in the code (making it easier to - # split the code into separate functions if we wish), but still refer to - # {x,y}$DT in the error messages (further relying on this assumption). 
-  #
-  # Check & ensure that the above assumption holds; if it didn't already hold,
-  # we likely have a bug in the preprocessing, a weird/invalid archive as input,
-  # and/or a data.table version with different semantics (which may break other
-  # parts of our code).
-  x_DT_key_as_expected <- identical(key(x$DT), key(x_DT))
-  y_DT_key_as_expected <- identical(key(y$DT), key(y_DT))
-  if (!x_DT_key_as_expected || !y_DT_key_as_expected) {
-    cli_warn("
-      `epiprocess` internal warning (please report): pre-processing for
-      epix_merge unexpectedly resulted in an intermediate data table (or
-      tables) with a different key than the corresponding input archive.
-      Manually setting intermediate data table keys to the expected values.
-    ", internal = TRUE)
-    setkeyv(x_DT, key(x$DT))
-    setkeyv(y_DT, key(y$DT))
-  }
-  # Without some sort of annotations of what various columns represent, we can't
-  # do something that makes sense when merging archives with mismatched keys.
-  # E.g., even if we assume extra keys represent demographic breakdowns, a
-  # sensible default treatment of count-type and rate-type value columns would
-  # differ.
-  if (!identical(sort(key(x_DT)), sort(key(y_DT)))) {
-    cli_abort("
-      The archives must have the same set of key column names; if the
-      key columns represent the same things, just with different
-      names, please retry after manually renaming to match; if they
-      represent different things (e.g., x has an age breakdown
-      but y does not), please retry after processing them to share
-      the same key (e.g., by summarizing x to remove the age breakdown,
-      or by applying a static age breakdown to y).
-    ", class = "epiprocess__epix_merge_x_y_must_have_same_key_set")
-  }
-  # `by` cols = result (and each input's) `key` cols, and determine
-  # the row set, using a full join via `merge`
-  #
-  # non-`by` cols = "value"-ish cols, and are looked up with last
-  # version carried forward via rolling joins
-  by <- key(x_DT) # = some perm of key(y_DT)
-  if (!all(c("geo_value", "time_value", "version") %in% key(x_DT))) {
-    cli_abort('Invalid `by`; `by` is currently set to the common `key` of
-       the two archives, and is expected to contain
-       "geo_value", "time_value", and "version".',
-      class = "epiprocess__epi_archive_must_have_required_key_cols"
-    )
-  }
-  if (length(by) < 1L || utils::tail(by, 1L) != "version") {
-    cli_abort('Invalid `by`; `by` is currently set to the common `key` of
-       the two archives, and is expected to have a "version" as
-       the last key col.',
-      class = "epiprocess__epi_archive_must_have_version_at_end_of_key"
-    )
-  }
-  x_nonby_colnames <- setdiff(names(x_DT), by)
-  y_nonby_colnames <- setdiff(names(y_DT), by)
-  if (length(intersect(x_nonby_colnames, y_nonby_colnames)) != 0L) {
-    cli_abort("
-      `x` and `y` DTs have overlapping non-by column names;
-      this is currently not supported; please manually fix up first:
-      any overlapping columns that are key-like should be
-      incorporated into the key, and other columns should be renamed.
-    ", class = "epiprocess__epix_merge_x_y_must_not_have_overlapping_nonby_colnames")
-  }
-  x_by_vals <- x_DT[, by, with = FALSE]
-  if (anyDuplicated(x_by_vals) != 0L) {
-    cli_abort("
-      The `by` columns must uniquely determine rows of `x$DT`;
-      the `by` is currently set to the common `key` of the two
-      archives, so this can be resolved by adding key-like columns
-      to `x`'s key (to get a unique key).
- ", class = "epiprocess__epix_merge_by_cols_must_act_as_unique_key") - } - y_by_vals <- y_DT[, by, with = FALSE] - if (anyDuplicated(y_by_vals) != 0L) { - cli_abort(" - The `by` columns must uniquely determine rows of `y$DT`; - the `by` is currently set to the common `key` of the two - archives, so this can be resolved by adding key-like columns - to `y`'s key (to get a unique key). - ", class = "epiprocess__epix_merge_by_cols_must_act_as_unique_key") - } - result_DT <- merge(x_by_vals, y_by_vals, - by = by, - # We must have `all=TRUE` or we may skip updates - # from x and/or y and corrupt the history - all = TRUE, - # We don't want Cartesian products, but the - # by-is-unique-key check above already ensures - # this. (Note that `allow.cartesian=FALSE` doesn't - # actually catch all Cartesian products anyway.) - # Disable superfluous check: - allow.cartesian = TRUE - ) - set( - result_DT, , x_nonby_colnames, - x_DT[result_DT[, by, with = FALSE], x_nonby_colnames, - with = FALSE, - # It's good practice to specify `on`, and we must - # explicitly specify `on` if there's a potential key vs. - # by order mismatch (not possible currently for x - # with by = key(x$DT), but possible for y): - on = by, - # last version carried forward: - roll = TRUE, - # requesting non-version key that doesn't exist in the other archive, - # or before its first version, should result in NA - nomatch = NA, - # see note on `allow.cartesian` above; currently have a - # similar story here. - allow.cartesian = TRUE - ] - ) - set( - result_DT, , y_nonby_colnames, - y_DT[result_DT[, by, with = FALSE], y_nonby_colnames, - with = FALSE, - on = by, - roll = TRUE, - nomatch = NA, - allow.cartesian = TRUE - ] - ) - # The key could be unset in case of a key vs. by order mismatch as - # noted above. Ensure that we keep it: - setkeyv(result_DT, by) - - return(as_epi_archive2( - result_DT[], # clear data.table internal invisibility flag if set - geo_type = x$geo_type, - time_type = x$time_type, - other_keys = setdiff(key(result_DT), c("geo_value", "time_value", "version")), - additional_metadata = result_additional_metadata, - # It'd probably be better to pre-compactify before the merge, and might be - # guaranteed not to be necessary to compactify the merge result if the - # inputs are already compactified, but at time of writing we don't have - # compactify in its own method or field, and it seems like it should be - # pretty fast anyway. - compactify = compactify, - clobberable_versions_start = result_clobberable_versions_start, - versions_end = new_versions_end - )) -} - -# Helpers for `group_by`: - -#' Make non-testing mock to get [`dplyr::dplyr_col_modify`] input -#' -#' A workaround for `dplyr:::mutate_cols` not being exported and directly -#' applying test mock libraries likely being impossible (due to mocking another -#' package's S3 generic or method). -#' -#' Use solely with a single call to the [`dplyr::mutate`] function and then -#' `destructure_col_modify_recorder_df`; other applicable operations from -#' [dplyr::dplyr_extending] have not been implemented. 
-#'
-#' @param parent_df the "parent class" data frame to wrap
-#' @return a `col_modify_recorder_df`
-#'
-#' @noRd
-new_col_modify_recorder_df <- function(parent_df) {
-  assert_class(parent_df, "data.frame")
-  `class<-`(parent_df, c("col_modify_recorder_df", class(parent_df)))
-}
-
-#' Extract unchanged parent-class data frame from a `new_col_modify_recorder_df`
-#'
-#' @param col_modify_recorder_df an instance of a `col_modify_recorder_df`
-#' @return named list with elements `unchanged_parent_df`, `cols`; `cols` is the
-#'   input to [`dplyr::dplyr_col_modify`] that this class was designed to record
-#'
-#' @noRd
-destructure_col_modify_recorder_df <- function(col_modify_recorder_df) {
-  assert_class(col_modify_recorder_df, "col_modify_recorder_df")
-  list(
-    unchanged_parent_df = col_modify_recorder_df %>%
-      `attr<-`("epiprocess::col_modify_recorder_df::cols", NULL) %>%
-      `class<-`(setdiff(class(.), "col_modify_recorder_df")),
-    cols = attr(col_modify_recorder_df,
-      "epiprocess::col_modify_recorder_df::cols",
-      exact = TRUE
-    )
-  )
-}
-
-#' `dplyr_col_modify` method that simply records the `cols` argument
-#'
-#' Must export S3 methods in R >= 4.0, even if they're only designed to be
-#' package internals, and must import any corresponding upstream S3 generic
-#' functions:
-#' @importFrom dplyr dplyr_col_modify
-#' @export
-#' @noRd
-dplyr_col_modify.col_modify_recorder_df <- function(data, cols) {
-  if (!is.null(attr(data, "epiprocess::col_modify_recorder_df::cols", exact = TRUE))) {
-    cli_abort("`col_modify_recorder_df` can only record `cols` once",
-      internal = TRUE
-    )
-  }
-  attr(data, "epiprocess::col_modify_recorder_df::cols") <- cols
-  data
-}
-
-#' A more detailed but restricted `mutate` for use in `group_by.epi_archive`
-#'
-#' More detailed: provides the names of the "requested" columns in addition to
-#' the output expected from a regular `mutate` method.
-#'
-#' Restricted: doesn't allow replacing or removing key cols, where a sort is
-#' potentially required at best and what the output key should be is unclear at
-#' worst. (The originally expected restriction was that the `mutate` parameters
-#' not present in `group_by` would not be recognized, but the current
-#' implementation just lets `mutate` handle these anyway, even if they're
-#' not part of the regular `group_by` parameters; these arguments would have to
-#' be passed by name with dot prefixes, so just hope that the user means to use
-#' them here if provided.)
-#'
-#' This can introduce column-level aliasing in `data.table`s, which isn't really
-#' intended in the `data.table` user model but we can make it part of our user
-#' model (see
-#' https://stackoverflow.com/questions/45925482/make-a-shallow-copy-in-data-table
-#' and links).
-#'
-#' Don't export this without cleaning up language of "mutate" as in side effects
-#' vs. "mutate" as in `dplyr::mutate`.
-#' @noRd
-epix_detailed_restricted_mutate2 <- function(.data, ...)
{ - # We don't want to directly use `dplyr::mutate` on the `$DT`, as: - # - `mutate` behavior, including the output class, changes depending on - # whether `dtplyr` < 1.3.0 is loaded and would require post-processing - # - behavior with `dtplyr` isn't fully compatible - # - it doesn't give the desired details, and `rlang::exprs_auto_name` does not - # appropriately handle the `= NULL` and `= ` tidyeval cases - # Instead: - # - Use `as.list` to get a shallow copy (undocumented, but apparently - # intended, behavior), then `as_tibble` (also shallow, given a list) to get - # back to something that will use `dplyr`'s included `mutate` method(s), - # then convert this using shallow operations into a `data.table`. - # - Use `col_modify_recorder_df` to get the desired details. - in_tbl <- tibble::as_tibble(as.list(.data$DT), .name_repair = "minimal") - col_modify_cols <- - destructure_col_modify_recorder_df( - mutate(new_col_modify_recorder_df(in_tbl), ...) - )[["cols"]] - invalidated_key_col_is <- - which(purrr::map_lgl(key(.data$DT), function(key_colname) { - key_colname %in% names(col_modify_cols) && - !rlang::is_reference(in_tbl[[key_colname]], col_modify_cols[[key_colname]]) - })) - if (length(invalidated_key_col_is) != 0L) { - rlang::abort(paste_lines(c( - "Key columns must not be replaced or removed.", - wrap_varnames(key(.data$DT)[invalidated_key_col_is], - initial = "Flagged key cols: " - ) - ))) - } else { - # Have `dplyr` do the `dplyr_col_modify`, keeping the column-level-aliasing - # and must-copy-on-write-if-refcount-more-than-1 model, obtaining a tibble, - # then convert it into a `data.table`. The key should still be valid - # (assuming that the user did not explicitly alter `key(.data$DT)` or the - # columns by reference somehow within `...` tidyeval-style computations, or - # trigger refcount-1 alterations due to still having >1 refcounts on the - # columns), set the "sorted" attribute accordingly to prevent attempted - # sorting (including potential extra copies) or sortedness checking, then - # `setDT` (rather than `as.data.table`, in order to prevent column copying - # to establish ownership according to `data.table`'s memory model). - out_DT <- dplyr::dplyr_col_modify(in_tbl, col_modify_cols) %>% - data.table::setattr("sorted", data.table::key(.data$DT)) %>% - data.table::setDT(key = key(.data$DT)) - out_archive <- clone(.data) - out_archive$DT <- out_DT - request_names <- names(col_modify_cols) - return(list( - archive = out_archive, - request_names = request_names - )) - # (We might also consider special-casing when `mutate` hands back something - # equivalent (in some sense) to the input (probably only encountered when - # we're dealing with `group_by`), and using just `$DT`, not a shallow copy, - # in the result, primarily in order to hedge against `as.list` or `setDT` - # changing their behavior and generating deep copies somehow. This could - # also prevent storage, and perhaps also generation, of shallow copies, but - # this seems unlikely to be a major gain unless it helps enable some - # in-place modifications of refcount-1 columns (although detecting this case - # seems to be common across `group_by` implementations; maybe there is - # something there).) - } -} - - -#' Slide a function over variables in an `epi_archive` or `grouped_epi_archive` -#' -#' Slides a given function over variables in an `epi_archive` object. 
This -#' behaves similarly to `epi_slide()`, with the key exception that it is -#' version-aware: the sliding computation at any given reference time t is -#' performed on **data that would have been available as of t**. See the -#' [archive -#' vignette](https://cmu-delphi.github.io/epiprocess/articles/archive.html) for -#' examples. -#' -#' @param x An [`epi_archive`] or [`grouped_epi_archive`] object. If ungrouped, -#' all data in `x` will be treated as part of a single data group. -#' @param f Function, formula, or missing; together with `...` specifies the -#' computation to slide. To "slide" means to apply a computation over a -#' sliding (a.k.a. "rolling") time window for each data group. The window is -#' determined by the `before` parameter described below. One time step is -#' typically one day or one week; see [`epi_slide`] details for more -#' explanation. If a function, `f` must take an `epi_df` with the same -#' column names as the archive's `DT`, minus the `version` column; followed -#' by a one-row tibble containing the values of the grouping variables for -#' the associated group; followed by a reference time value, usually as a -#' `Date` object; followed by any number of named arguments. If a formula, -#' `f` can operate directly on columns accessed via `.x$var` or `.$var`, as -#' in `~ mean (.x$var)` to compute a mean of a column `var` for each -#' group-`ref_time_value` combination. The group key can be accessed via -#' `.y` or `.group_key`, and the reference time value can be accessed via -#' `.z` or `.ref_time_value`. If `f` is missing, then `...` will specify the -#' computation. -#' @param ... Additional arguments to pass to the function or formula specified -#' via `f`. Alternatively, if `f` is missing, then `...` is interpreted as an -#' expression for tidy evaluation; in addition to referring to columns -#' directly by name, the expression has access to `.data` and `.env` pronouns -#' as in `dplyr` verbs, and can also refer to the `.group_key` and -#' `.ref_time_value`. See details of [`epi_slide`]. -#' @param before How far `before` each `ref_time_value` should the sliding -#' window extend? If provided, should be a single, non-NA, -#' [integer-compatible][vctrs::vec_cast] number of time steps. This window -#' endpoint is inclusive. For example, if `before = 7`, and one time step is -#' one day, then to produce a value for a `ref_time_value` of January 8, we -#' apply the given function or formula to data (for each group present) with -#' `time_value`s from January 1 onward, as they were reported on January 8. -#' For typical disease surveillance sources, this will not include any data -#' with a `time_value` of January 8, and, depending on the amount of reporting -#' latency, may not include January 7 or even earlier `time_value`s. (If -#' instead the archive were to hold nowcasts instead of regular surveillance -#' data, then we would indeed expect data for `time_value` January 8. If it -#' were to hold forecasts, then we would expect data for `time_value`s after -#' January 8, and the sliding window would extend as far after each -#' `ref_time_value` as needed to include all such `time_value`s.) -#' @param ref_time_values Reference time values / versions for sliding -#' computations; each element of this vector serves both as the anchor point -#' for the `time_value` window for the computation and the `max_version` -#' `as_of` which we fetch data in this window. 
If missing, then this will be set
-#'   to a regularly-spaced sequence of values covering the range of
-#'   `version`s in the `DT` plus the `versions_end`; the spacing of values will
-#'   be guessed (using the GCD of the skips between values).
-#' @param time_step Optional function used to define the meaning of one time
-#'   step, which if specified, overrides the default choice based on the
-#'   `time_value` column. This function must take a positive integer and return
-#'   an object of class `lubridate::period`. For example, we can use `time_step
-#'   = lubridate::hours` in order to set the time step to be one hour (this
-#'   would only be meaningful if `time_value` is of class `POSIXct`).
-#' @param new_col_name String indicating the name of the new column that will
-#'   contain the slide values. Default is "slide_value"; note that setting
-#'   `new_col_name` equal to an existing column name will overwrite this column.
-#' @param as_list_col Should the slide results be held in a list column, or be
-#'   [unchopped][tidyr::unchop]/[unnested][tidyr::unnest]? Default is `FALSE`,
-#'   in which case a list object returned by `f` would be unnested (using
-#'   [`tidyr::unnest()`]), and, if the slide computations output data frames,
-#'   the names of the resulting columns are given by prepending `new_col_name`
-#'   to the names of the list elements.
-#' @param names_sep String specifying the separator to use in `tidyr::unnest()`
-#'   when `as_list_col = FALSE`. Default is "_". Using `NULL` drops the prefix
-#'   from `new_col_name` entirely.
-#' @param all_versions (Not the same as `all_rows` parameter of `epi_slide`.) If
-#'   `all_versions = TRUE`, then `f` will be passed the version history (all
-#'   `version <= ref_time_value`) for rows having `time_value` between
-#'   `ref_time_value - before` and `ref_time_value`. Otherwise, `f` will be
-#'   passed only the most recent `version` for every unique `time_value`.
-#'   Default is `FALSE`.
-#' @return A tibble whose columns are: the grouping variables, `time_value`,
-#'   containing the reference time values for the slide computation, and a
-#'   column named according to the `new_col_name` argument, containing the slide
-#'   values.
-#'
-#' @details A few key distinctions between the current function and `epi_slide()`:
-#' 1. In `f` functions for `epix_slide`, one should not assume that the input
-#'    data contains any rows with `time_value` matching the computation's
-#'    `ref_time_value` (accessible via `attributes()$metadata$as_of`); for
-#'    typical epidemiological surveillance data, observations pertaining to a
-#'    particular time period (`time_value`) are first reported `as_of` some
-#'    instant after that time period has ended.
-#' 2. `epix_slide()` doesn't accept an `after` argument; its windows extend
-#'    from `before` time steps before a given `ref_time_value` through the last
-#'    `time_value` available as of version `ref_time_value` (typically, this
-#'    won't include `ref_time_value` itself, as observations about a particular
-#'    time interval (e.g., day) are only published after that time interval
-#'    ends); `epi_slide` windows extend from `before` time steps before a
-#'    `ref_time_value` through `after` time steps after `ref_time_value`.
-#' 3. 
The input class and columns are similar but different: `epix_slide`
-#'    (with the default `all_versions=FALSE`) keeps all columns and the
-#'    `epi_df`-ness of the first argument to each computation; `epi_slide` only
-#'    provides the grouping variables in the second input, and will convert the
-#'    first input into a regular tibble if the grouping variables include the
-#'    essential `geo_value` column. (With `all_versions=TRUE`, `epix_slide`
-#'    will provide an `epi_archive` rather than an `epi_df` to each
-#'    computation.)
-#' 4. The output class and columns are similar but different: `epix_slide()`
-#'    returns a tibble containing only the grouping variables, `time_value`, and
-#'    the new column(s) from the slide computations, whereas `epi_slide()`
-#'    returns an `epi_df` with all original variables plus the new columns from
-#'    the slide computations. (Both will mirror the grouping or ungroupedness of
-#'    their input, with one exception: `epi_archive`s can have trivial
-#'    (zero-variable) groupings, but these will be dropped in `epix_slide`
-#'    results as they are not supported by tibbles.)
-#' 5. There are no size stability checks or element/row recycling to maintain
-#'    size stability in `epix_slide`, unlike in `epi_slide`. (`epix_slide` is
-#'    roughly analogous to [`dplyr::group_modify`], while `epi_slide` is roughly
-#'    analogous to `dplyr::mutate` followed by `dplyr::arrange`.) This is detailed
-#'    in the "advanced" vignette.
-#' 6. `all_rows` is not supported in `epix_slide`; since the slide
-#'    computations are allowed more flexibility in their outputs than in
-#'    `epi_slide`, we can't guess a good representation for missing computations
-#'    for excluded group-`ref_time_value` pairs.
-#' 7. The `ref_time_values` default for `epix_slide` is based on making an
-#'    evenly-spaced sequence out of the `version`s in the `DT` plus the
-#'    `versions_end`, rather than the `time_value`s.
-#'
-#' Apart from the above distinctions, the interfaces between `epix_slide()` and
-#' `epi_slide()` are the same.
-#'
-#' Furthermore, the current function can be considerably slower than
-#' `epi_slide()`, for two reasons: (1) it must repeatedly fetch
-#' properly-versioned snapshots from the data archive (via its `as_of()`
-#' method), and (2) it performs a "manual" sliding of sorts, and does not
-#' benefit from the highly efficient `slider` package. For this reason, it
-#' should never be used in place of `epi_slide()`, and only used when
-#' version-aware sliding is necessary (as is its purpose).
-#'
-#' Finally, this is simply a wrapper around the `slide()` method of the
-#' `epi_archive` and `grouped_epi_archive` classes, so if `x` is an
-#' object of either of these classes, then:
-#' ```
-#' epix_slide(x, new_var = comp(old_var), before = 119)
-#' ```
-#' is equivalent to:
-#' ```
-#' x$slide(new_var = comp(old_var), before = 119)
-#' ```
-#'
-#' Mutation and aliasing: `epix_slide` and `$slide` will not perform in-place
-#' mutation of the input archives on their own. In some edge cases the inputs it
-#' feeds to the slide computations may alias parts of the input archive, so copy
-#' the slide computation inputs if needed before using mutating operations like
-#' `data.table`'s `:=` operator. Similarly, in some edge cases, the output of
-#' the slide operation may alias parts of the input archive, so similarly, make
-#' sure to clone and/or copy appropriately before using in-place mutation.
-#' -#' @examples -#' library(dplyr) -#' -#' # Reference time points for which we want to compute slide values: -#' ref_time_values <- seq(as.Date("2020-06-01"), -#' as.Date("2020-06-15"), -#' by = "1 day" -#' ) -#' -#' # A simple (but not very useful) example (see the archive vignette for a more -#' # realistic one): -#' archive_cases_dv_subset_2 %>% -#' group_by(geo_value) %>% -#' epix_slide2( -#' f = ~ mean(.x$case_rate_7d_av), -#' before = 2, -#' ref_time_values = ref_time_values, -#' new_col_name = "case_rate_7d_av_recent_av" -#' ) %>% -#' ungroup() -#' # We requested time windows that started 2 days before the corresponding time -#' # values. The actual number of `time_value`s in each computation depends on -#' # the reporting latency of the signal and `time_value` range covered by the -#' # archive (2020-06-01 -- 2021-11-30 in this example). In this case, we have -#' # * 0 `time_value`s, for ref time 2020-06-01 --> the result is automatically -#' # discarded -#' # * 1 `time_value`, for ref time 2020-06-02 -#' # * 2 `time_value`s, for the rest of the results -#' # * never the 3 `time_value`s we would get from `epi_slide`, since, because -#' # of data latency, we'll never have an observation -#' # `time_value == ref_time_value` as of `ref_time_value`. -#' # The example below shows this type of behavior in more detail. -#' -#' # Examining characteristics of the data passed to each computation with -#' # `all_versions=FALSE`. -#' archive_cases_dv_subset_2 %>% -#' group_by(geo_value) %>% -#' epix_slide2( -#' function(x, gk, rtv) { -#' tibble( -#' time_range = if (nrow(x) == 0L) { -#' "0 `time_value`s" -#' } else { -#' sprintf("%s -- %s", min(x$time_value), max(x$time_value)) -#' }, -#' n = nrow(x), -#' class1 = class(x)[[1L]] -#' ) -#' }, -#' before = 5, all_versions = FALSE, -#' ref_time_values = ref_time_values, names_sep = NULL -#' ) %>% -#' ungroup() %>% -#' arrange(geo_value, time_value) -#' -#' # --- Advanced: --- -#' -#' # `epix_slide` with `all_versions=FALSE` (the default) applies a -#' # version-unaware computation to several versions of the data. We can also -#' # use `all_versions=TRUE` to apply a version-*aware* computation to several -#' # versions of the data, again looking at characteristics of the data passed -#' # to each computation. In this case, each computation should expect an -#' # `epi_archive` containing the relevant version data: -#' -#' archive_cases_dv_subset_2 %>% -#' group_by(geo_value) %>% -#' epix_slide2( -#' function(x, gk, rtv) { -#' tibble( -#' versions_start = if (nrow(x$DT) == 0L) { -#' "NA (0 rows)" -#' } else { -#' toString(min(x$DT$version)) -#' }, -#' versions_end = x$versions_end, -#' time_range = if (nrow(x$DT) == 0L) { -#' "0 `time_value`s" -#' } else { -#' sprintf("%s -- %s", min(x$DT$time_value), max(x$DT$time_value)) -#' }, -#' n = nrow(x$DT), -#' class1 = class(x)[[1L]] -#' ) -#' }, -#' before = 5, all_versions = TRUE, -#' ref_time_values = ref_time_values, names_sep = NULL -#' ) %>% -#' ungroup() %>% -#' # Focus on one geo_value so we can better see the columns above: -#' filter(geo_value == "ca") %>% -#' select(-geo_value) -#' -#' @importFrom rlang enquo !!! 
-#' @export -epix_slide2 <- function(x, f, ..., before, ref_time_values, - time_step, new_col_name = "slide_value", - as_list_col = FALSE, names_sep = "_", - all_versions = FALSE) { - if (!is_epi_archive2(x, grouped_okay = TRUE)) { - cli_abort("`x` must be of class `epi_archive` or `grouped_epi_archive`.") - } - return(slide(x, f, ..., - before = before, - ref_time_values = ref_time_values, - time_step = time_step, - new_col_name = new_col_name, - as_list_col = as_list_col, - names_sep = names_sep, - all_versions = all_versions - )) -} - - -#' Filter an `epi_archive` object to keep only older versions -#' -#' Generates a filtered `epi_archive` from an `epi_archive` object, keeping -#' only rows with `version` falling on or before a specified date. -#' -#' @param x An `epi_archive` object -#' @param max_version Time value specifying the max version to permit in the -#' filtered archive. That is, the output archive will comprise rows of the -#' current archive data having `version` less than or equal to the -#' specified `max_version` -#' @return An `epi_archive` object -#' -#' @export -epix_truncate_versions_after <- function(x, max_version) { - UseMethod("epix_truncate_versions_after") -} - -#' @export -epix_truncate_versions_after.epi_archive2 <- function(x, max_version) { - cloned_epi_archive <- clone(x) - return((truncate_versions_after(x, max_version))) - # ^ second set of parens drops invisibility -} diff --git a/R/utils.R b/R/utils.R index 57a7f53a..ea7afc2f 100644 --- a/R/utils.R +++ b/R/utils.R @@ -359,8 +359,6 @@ as_slide_computation <- function(f, ...) { ) } -min_na_rm <- function(x) min(x, na.rm = TRUE) -extend_r <- function(x) c(x, x[length(x)]) guess_geo_type <- function(geo_value) { if (is.character(geo_value)) { diff --git a/_pkgdown.yml b/_pkgdown.yml index 1daef5a0..4930c9f5 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -95,6 +95,9 @@ reference: - title: Basic automatic plotting - contents: - autoplot.epi_df + - title: Advanced internals + - contents: + - compactify - title: internal - contents: - epiprocess diff --git a/man/as_epi_archive.Rd b/man/as_epi_archive.Rd deleted file mode 100644 index 93b10736..00000000 --- a/man/as_epi_archive.Rd +++ /dev/null @@ -1,142 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/archive.R -\name{as_epi_archive} -\alias{as_epi_archive} -\title{Convert to \code{epi_archive} format} -\usage{ -as_epi_archive( - x, - geo_type, - time_type, - other_keys, - additional_metadata = list(), - compactify = NULL, - clobberable_versions_start = NA, - versions_end = max_version_with_row_in(x) -) -} -\arguments{ -\item{x}{A data frame, data table, or tibble, with columns \code{geo_value}, -\code{time_value}, \code{version}, and then any additional number of columns.} - -\item{geo_type}{Type for the geo values. If missing, then the function will -attempt to infer it from the geo values present; if this fails, then it -will be set to "custom".} - -\item{time_type}{Type for the time values. If missing, then the function will -attempt to infer it from the time values present; if this fails, then it -will be set to "custom".} - -\item{other_keys}{Character vector specifying the names of variables in \code{x} -that should be considered key variables (in the language of \code{data.table}) -apart from "geo_value", "time_value", and "version".} - -\item{additional_metadata}{List of additional metadata to attach to the -\code{epi_archive} object. 
The metadata will have \code{geo_type} and \code{time_type} -fields; named entries from the passed list or will be included as well.} - -\item{compactify}{Optional; Boolean or \code{NULL}: should we remove rows that are -considered redundant for the purposes of \code{epi_archive}'s built-in methods -such as \code{as_of}? As these methods use the last version of each observation -carried forward (LOCF) to interpolate between the version data provided, -rows that don't change these LOCF results can potentially be omitted to -save space. \code{TRUE} will remove these rows, \code{FALSE} will not, and missing or -\code{NULL} will remove these rows and issue a warning. Generally, this can be -set to \code{TRUE}, but if you directly inspect or edit the fields of the -\code{epi_archive} such as its \code{DT}, you will have to determine whether -\code{compactify=TRUE} will produce the desired results. If compactification -here is removing a large proportion of the rows, this may indicate a -potential for space, time, or bandwidth savings upstream the data pipeline, -e.g., when fetching, storing, or preparing the input data \code{x}} - -\item{clobberable_versions_start}{Optional; \code{length}-1; either a value of the -same \code{class} and \code{typeof} as \code{x$version}, or an \code{NA} of any \code{class} and -\code{typeof}: specifically, either (a) the earliest version that could be -subject to "clobbering" (being overwritten with different update data, but -using the \emph{same} version tag as the old update data), or (b) \code{NA}, to -indicate that no versions are clobberable. There are a variety of reasons -why versions could be clobberable under routine circumstances, such as (a) -today's version of one/all of the columns being published after initially -being filled with \code{NA} or LOCF, (b) a buggy version of today's data being -published but then fixed and republished later in the day, or (c) data -pipeline delays (e.g., publisher uploading, periodic scraping, database -syncing, periodic fetching, etc.) that make events (a) or (b) reflected -later in the day (or even on a different day) than expected; potential -causes vary between different data pipelines. The default value is \code{NA}, -which doesn't consider any versions to be clobberable. Another setting that -may be appropriate for some pipelines is \code{max_version_with_row_in(x)}.} - -\item{versions_end}{Optional; length-1, same \code{class} and \code{typeof} as -\code{x$version}: what is the last version we have observed? The default is -\code{max_version_with_row_in(x)}, but values greater than this could also be -valid, and would indicate that we observed additional versions of the data -beyond \code{max(x$version)}, but they all contained empty updates. (The default -value of \code{clobberable_versions_start} does not fully trust these empty -updates, and assumes that any version \verb{>= max(x$version)} could be -clobbered.) If \code{nrow(x) == 0}, then this argument is mandatory.} -} -\value{ -An \code{epi_archive} object. -} -\description{ -Converts a data frame, data table, or tibble into an \code{epi_archive} -object. See the \href{https://cmu-delphi.github.io/epiprocess/articles/archive.html}{archive vignette} for -examples. The parameter descriptions below are copied from there -} -\details{ -This simply a wrapper around the \code{new()} method of the \code{epi_archive} -class, so for example: - -\if{html}{\out{
}}\preformatted{x <- as_epi_archive(df, geo_type = "state", time_type = "day") -}\if{html}{\out{
}} - -would be equivalent to: - -\if{html}{\out{
}}\preformatted{x <- epi_archive$new(df, geo_type = "state", time_type = "day") -}\if{html}{\out{
}} -} -\examples{ -# Simple ex. with necessary keys -tib <- tibble::tibble( - geo_value = rep(c("ca", "hi"), each = 5), - time_value = rep(seq(as.Date("2020-01-01"), - by = 1, length.out = 5 - ), times = 2), - version = rep(seq(as.Date("2020-01-02"), - by = 1, length.out = 5 - ), times = 2), - value = rnorm(10, mean = 2, sd = 1) -) - -toy_epi_archive <- tib \%>\% as_epi_archive( - geo_type = "state", - time_type = "day" -) -toy_epi_archive - -# Ex. with an additional key for county -df <- data.frame( - geo_value = c(replicate(2, "ca"), replicate(2, "fl")), - county = c(1, 3, 2, 5), - time_value = c( - "2020-06-01", - "2020-06-02", - "2020-06-01", - "2020-06-02" - ), - version = c( - "2020-06-02", - "2020-06-03", - "2020-06-02", - "2020-06-03" - ), - cases = c(1, 2, 3, 4), - cases_rate = c(0.01, 0.02, 0.01, 0.05) -) - -x <- df \%>\% as_epi_archive( - geo_type = "state", - time_type = "day", - other_keys = "county" -) -} diff --git a/man/as_epi_archive2.Rd b/man/as_epi_archive2.Rd deleted file mode 100644 index bc3f5185..00000000 --- a/man/as_epi_archive2.Rd +++ /dev/null @@ -1,142 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/archive_new.R -\name{as_epi_archive2} -\alias{as_epi_archive2} -\title{Convert to \code{epi_archive} format} -\usage{ -as_epi_archive2( - x, - geo_type, - time_type, - other_keys, - additional_metadata = list(), - compactify = NULL, - clobberable_versions_start = NA, - versions_end = max_version_with_row_in(x) -) -} -\arguments{ -\item{x}{A data frame, data table, or tibble, with columns \code{geo_value}, -\code{time_value}, \code{version}, and then any additional number of columns.} - -\item{geo_type}{Type for the geo values. If missing, then the function will -attempt to infer it from the geo values present; if this fails, then it -will be set to "custom".} - -\item{time_type}{Type for the time values. If missing, then the function will -attempt to infer it from the time values present; if this fails, then it -will be set to "custom".} - -\item{other_keys}{Character vector specifying the names of variables in \code{x} -that should be considered key variables (in the language of \code{data.table}) -apart from "geo_value", "time_value", and "version".} - -\item{additional_metadata}{List of additional metadata to attach to the -\code{epi_archive} object. The metadata will have \code{geo_type} and \code{time_type} -fields; named entries from the passed list or will be included as well.} - -\item{compactify}{Optional; Boolean or \code{NULL}: should we remove rows that are -considered redundant for the purposes of \code{epi_archive}'s built-in methods -such as \code{as_of}? As these methods use the last version of each observation -carried forward (LOCF) to interpolate between the version data provided, -rows that don't change these LOCF results can potentially be omitted to -save space. \code{TRUE} will remove these rows, \code{FALSE} will not, and missing or -\code{NULL} will remove these rows and issue a warning. Generally, this can be -set to \code{TRUE}, but if you directly inspect or edit the fields of the -\code{epi_archive} such as its \code{DT}, you will have to determine whether -\code{compactify=TRUE} will produce the desired results. 
If compactification -here is removing a large proportion of the rows, this may indicate a -potential for space, time, or bandwidth savings upstream the data pipeline, -e.g., when fetching, storing, or preparing the input data \code{x}} - -\item{clobberable_versions_start}{Optional; \code{length}-1; either a value of the -same \code{class} and \code{typeof} as \code{x$version}, or an \code{NA} of any \code{class} and -\code{typeof}: specifically, either (a) the earliest version that could be -subject to "clobbering" (being overwritten with different update data, but -using the \emph{same} version tag as the old update data), or (b) \code{NA}, to -indicate that no versions are clobberable. There are a variety of reasons -why versions could be clobberable under routine circumstances, such as (a) -today's version of one/all of the columns being published after initially -being filled with \code{NA} or LOCF, (b) a buggy version of today's data being -published but then fixed and republished later in the day, or (c) data -pipeline delays (e.g., publisher uploading, periodic scraping, database -syncing, periodic fetching, etc.) that make events (a) or (b) reflected -later in the day (or even on a different day) than expected; potential -causes vary between different data pipelines. The default value is \code{NA}, -which doesn't consider any versions to be clobberable. Another setting that -may be appropriate for some pipelines is \code{max_version_with_row_in(x)}.} - -\item{versions_end}{Optional; length-1, same \code{class} and \code{typeof} as -\code{x$version}: what is the last version we have observed? The default is -\code{max_version_with_row_in(x)}, but values greater than this could also be -valid, and would indicate that we observed additional versions of the data -beyond \code{max(x$version)}, but they all contained empty updates. (The default -value of \code{clobberable_versions_start} does not fully trust these empty -updates, and assumes that any version \verb{>= max(x$version)} could be -clobbered.) If \code{nrow(x) == 0}, then this argument is mandatory.} -} -\value{ -An \code{epi_archive} object. -} -\description{ -Converts a data frame, data table, or tibble into an \code{epi_archive} -object. See the \href{https://cmu-delphi.github.io/epiprocess/articles/archive.html}{archive vignette} for -examples. The parameter descriptions below are copied from there -} -\details{ -This simply a wrapper around the \code{new()} method of the \code{epi_archive} -class, so for example: - -\if{html}{\out{
}}\preformatted{x <- as_epi_archive(df, geo_type = "state", time_type = "day") -}\if{html}{\out{
}} - -would be equivalent to: - -\if{html}{\out{
}}\preformatted{x <- epi_archive$new(df, geo_type = "state", time_type = "day") -}\if{html}{\out{
}} -} -\examples{ -# Simple ex. with necessary keys -tib <- tibble::tibble( - geo_value = rep(c("ca", "hi"), each = 5), - time_value = rep(seq(as.Date("2020-01-01"), - by = 1, length.out = 5 - ), times = 2), - version = rep(seq(as.Date("2020-01-02"), - by = 1, length.out = 5 - ), times = 2), - value = rnorm(10, mean = 2, sd = 1) -) - -toy_epi_archive <- tib \%>\% as_epi_archive2( - geo_type = "state", - time_type = "day" -) -toy_epi_archive - -# Ex. with an additional key for county -df <- data.frame( - geo_value = c(replicate(2, "ca"), replicate(2, "fl")), - county = c(1, 3, 2, 5), - time_value = c( - "2020-06-01", - "2020-06-02", - "2020-06-01", - "2020-06-02" - ), - version = c( - "2020-06-02", - "2020-06-03", - "2020-06-02", - "2020-06-03" - ), - cases = c(1, 2, 3, 4), - cases_rate = c(0.01, 0.02, 0.01, 0.05) -) - -x <- df \%>\% as_epi_archive2( - geo_type = "state", - time_type = "day", - other_keys = "county" -) -} diff --git a/man/as_of.epi_archive2.Rd b/man/as_of.epi_archive2.Rd deleted file mode 100644 index 21a4cfc1..00000000 --- a/man/as_of.epi_archive2.Rd +++ /dev/null @@ -1,33 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/archive_new.R -\name{as_of.epi_archive2} -\alias{as_of.epi_archive2} -\title{As of epi_archive} -\usage{ -\method{as_of}{epi_archive2}(epi_archive, max_version, min_time_value = -Inf, all_versions = FALSE) -} -\arguments{ -\item{epi_archive}{An \code{epi_archive} object} - -\item{max_version}{Version specifying the max version to permit in the -snapshot. That is, the snapshot will comprise the unique rows of the -current archive data that represent the most up-to-date signal values, as -of the specified \code{max_version} (and whose \code{time_value}s are at least -\code{min_time_value}).} - -\item{min_time_value}{Time value specifying the min \code{time_value} to permit in -the snapshot. Default is \code{-Inf}, which effectively means that there is no -minimum considered.} - -\item{all_versions}{Boolean; If \code{all_versions = TRUE}, then the output will be in -\code{epi_archive} format, and contain rows in the specified \code{time_value} range -having \code{version <= max_version}. The resulting object will cover a -potentially narrower \code{version} and \code{time_value} range than \code{x}, depending -on user-provided arguments. Otherwise, there will be one row in the output -for the \code{max_version} of each \code{time_value}. Default is \code{FALSE}.} -} -\description{ -Generates a snapshot in \code{epi_df} format as of a given version. -See the documentation for the wrapper function \code{\link[=epix_as_of]{epix_as_of()}} for -details. The parameter descriptions below are copied from there -} diff --git a/man/clone.Rd b/man/clone.Rd new file mode 100644 index 00000000..a5597e3b --- /dev/null +++ b/man/clone.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/archive.R +\name{clone} +\alias{clone} +\alias{clone.epi_archive} +\title{Clone an \code{epi_archive} object.} +\usage{ +clone(x) + +\method{clone}{epi_archive}(x) +} +\arguments{ +\item{x}{An \code{epi_archive} object.} +} +\description{ +Clone an \code{epi_archive} object. 
+} diff --git a/man/compactify.Rd b/man/compactify.Rd new file mode 100644 index 00000000..2f210315 --- /dev/null +++ b/man/compactify.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/archive.R +\name{compactify} +\alias{compactify} +\title{Compactify} +\description{ +This section describes the internals of how compactification works in an +\code{epi_archive()}. Compactification can potentially improve code speed or +memory usage, depending on your data. +} +\details{ +In general, the last version of each observation is carried forward (LOCF) to +fill in data between recorded versions, and between the last recorded +update and the \code{versions_end}. One consequence is that the \code{DT} doesn't +have to contain a full snapshot of every version (although this generally +works), but can instead contain only the rows that are new or changed from +the previous version (see \code{compactify}, which does this automatically). +Currently, deletions must be represented as revising a row to a special +state (e.g., making the entries \code{NA} or including a special column that +flags the data as removed and performing some kind of post-processing), and +the archive is unaware of what this state is. Note that \code{NA}s \emph{can} be +introduced by \code{epi_archive} methods for other reasons, e.g., in +\code{\link{epix_fill_through_version}} and \code{\link{epix_merge}}, if requested, to +represent potential update data that we do not yet have access to; or in +\code{\link{epix_merge}} to represent the "value" of an observation before the +version in which it was first released, or if no version of that +observation appears in the archive data at all. +} diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd index 86e21b89..b7dd649e 100644 --- a/man/epi_archive.Rd +++ b/man/epi_archive.Rd @@ -1,73 +1,97 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/archive.R, R/archive_new.R +% Please edit documentation in R/archive.R \name{epi_archive} \alias{epi_archive} +\alias{new_epi_archive} +\alias{as_epi_archive} \title{\code{epi_archive} object} -\description{ -An \code{epi_archive} is an R6 class which contains a data table -along with several relevant pieces of metadata. The data table can be seen -as the full archive (version history) for some signal variables of -interest. +\usage{ +new_epi_archive( + x, + geo_type = NULL, + time_type = NULL, + other_keys = NULL, + additional_metadata = NULL, + compactify = NULL, + clobberable_versions_start = NA, + versions_end = NULL +) -An \code{epi_archive} is an R6 class which contains a data table -along with several relevant pieces of metadata. The data table can be seen -as the full archive (version history) for some signal variables of -interest. -} -\details{ -An \code{epi_archive} is an R6 class which contains a data table \code{DT}, of -class \code{data.table} from the \code{data.table} package, with (at least) the -following columns: -\itemize{ -\item \code{geo_value}: the geographic value associated with each row of measurements. -\item \code{time_value}: the time value associated with each row of measurements. -\item \code{version}: the time value specifying the version for each row of -measurements. For example, if in a given row the \code{version} is January 15, -2022 and \code{time_value} is January 14, 2022, then this row contains the -measurements of the data for January 14, 2022 that were available one day -later. 
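The compactification behavior documented in `man/compactify.Rd` above can be sketched with a three-version toy archive in which the middle update repeats the previous value and is therefore LOCF-redundant (assumed data for illustration):

```r
library(epiprocess)

tib <- tibble::tibble(
  geo_value = "ca",
  time_value = as.Date("2020-06-01"),
  version = as.Date(c("2020-06-02", "2020-06-03", "2020-06-04")),
  value = c(1, 1, 2) # the 2020-06-03 update repeats the prior value
)

nrow(as_epi_archive(tib, compactify = TRUE)$DT) # 2: redundant row dropped
nrow(as_epi_archive(tib, compactify = FALSE)$DT) # 3: all updates kept
# Snapshots agree either way, since as_of() uses LOCF between versions:
epix_as_of(as_epi_archive(tib, compactify = TRUE), as.Date("2020-06-03"))
```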
+as_epi_archive( + x, + geo_type = NULL, + time_type = NULL, + other_keys = NULL, + additional_metadata = list(), + compactify = NULL, + clobberable_versions_start = NA, + versions_end = max_version_with_row_in(x) +) } +\arguments{ +\item{x}{A data.frame, data.table, or tibble, with columns \code{geo_value}, +\code{time_value}, \code{version}, and then any additional number of columns.} -The data table \code{DT} has key variables \code{geo_value}, \code{time_value}, \code{version}, -as well as any others (these can be specified when instantiating the -\code{epi_archive} object via the \code{other_keys} argument, and/or set by operating -on \code{DT} directly). Refer to the documentation for \code{\link[=as_epi_archive]{as_epi_archive()}} for -information and examples of relevant parameter names for an \code{epi_archive} object. -Note that there can only be a single row per unique combination of -key variables, and thus the key variables are critical for figuring out how -to generate a snapshot of data from the archive, as of a given version. +\item{geo_type}{Type for the geo values. If missing, then the function will +attempt to infer it from the geo values present; if this fails, then it +will be set to "custom".} -In general, the last version of each observation is carried forward (LOCF) to -fill in data between recorded versions, and between the last recorded -update and the \code{versions_end}. One consequence is that the \code{DT} -doesn't have to contain a full snapshot of every version (although this -generally works), but can instead contain only the rows that are new or -changed from the previous version (see \code{compactify}, which does this -automatically). Currently, deletions must be represented as revising a row -to a special state (e.g., making the entries \code{NA} or including a special -column that flags the data as removed and performing some kind of -post-processing), and the archive is unaware of what this state is. Note -that \code{NA}s \emph{can} be introduced by \code{epi_archive} methods for other reasons, -e.g., in \code{\link{epix_fill_through_version}} and \code{\link{epix_merge}}, if requested, to -represent potential update data that we do not yet have access to; or in -\code{\link{epix_merge}} to represent the "value" of an observation before the -version in which it was first released, or if no version of that -observation appears in the archive data at all. +\item{time_type}{Type for the time values. If missing, then the function will +attempt to infer it from the time values present; if this fails, then it +will be set to "custom".} + +\item{other_keys}{Character vector specifying the names of variables in \code{x} +that should be considered key variables (in the language of \code{data.table}) +apart from "geo_value", "time_value", and "version".} -\strong{A word of caution:} R6 objects, unlike most other objects in R, have -reference semantics. A primary consequence of this is that objects are not -copied when modified. You can read more about this in Hadley Wickham's -\href{https://adv-r.hadley.nz/r6.html#r6-semantics}{Advanced R} book. In order -to construct a modified archive while keeping the original intact, first -make a clone using the \verb{$clone} method, then overwrite the clone's \code{DT} -field with \code{data.table::copy(clone$DT)}, and finally perform the -modifications on the clone. +\item{additional_metadata}{List of additional metadata to attach to the +\code{epi_archive} object. 
The metadata will have \code{geo_type} and \code{time_type}
+fields; named entries from the passed list will be included as well.}
+
+\item{compactify}{Optional; Boolean or \code{NULL}. \code{TRUE} will remove some
+redundant rows, \code{FALSE} will not, and missing or \code{NULL} will remove
+redundant rows, but issue a warning. See more information at \code{compactify}.}
+
+\item{clobberable_versions_start}{Optional; \code{length}-1; either a value of the
+same \code{class} and \code{typeof} as \code{x$version}, or an \code{NA} of any \code{class} and
+\code{typeof}: specifically, either (a) the earliest version that could be
+subject to "clobbering" (being overwritten with different update data, but
+using the \emph{same} version tag as the old update data), or (b) \code{NA}, to
+indicate that no versions are clobberable. There are a variety of reasons
+why versions could be clobberable under routine circumstances, such as (a)
+today's version of one/all of the columns being published after initially
+being filled with \code{NA} or LOCF, (b) a buggy version of today's data being
+published but then fixed and republished later in the day, or (c) data
+pipeline delays (e.g., publisher uploading, periodic scraping, database
+syncing, periodic fetching, etc.) that make events (a) or (b) reflected
+later in the day (or even on a different day) than expected; potential
+causes vary between different data pipelines. The default value is \code{NA},
+which doesn't consider any versions to be clobberable. Another setting that
+may be appropriate for some pipelines is \code{max_version_with_row_in(x)}.}
+
+\item{versions_end}{Optional; length-1, same \code{class} and \code{typeof} as
+\code{x$version}: what is the last version we have observed? The default is
+\code{max_version_with_row_in(x)}, but values greater than this could also be
+valid, and would indicate that we observed additional versions of the data
+beyond \code{max(x$version)}, but they all contained empty updates. (The default
+value of \code{clobberable_versions_start} does not fully trust these empty
+updates, and assumes that any version \verb{>= max(x$version)} could be
+clobbered.) If \code{nrow(x) == 0}, then this argument is mandatory.}
+}
+\value{
+An \code{epi_archive} object.
+}
+\description{
+An \code{epi_archive} is an S3 class which contains a data table
+along with several relevant pieces of metadata. The data table can be seen
+as the full archive (version history) for some signal variables of
+interest.
+}
+\details{
+Epi Archive
+
+An \code{epi_archive} contains a data table \code{DT}, of class \code{data.table}
+from the \code{data.table} package, with (at least) the following columns:
 \itemize{
 \item \code{geo_value}: the geographic value associated with each row of measurements.
 \item \code{time_value}: the time value associated with each row of measurements.
@@ -81,56 +105,14 @@ later.
 
 The data table \code{DT} has key variables \code{geo_value}, \code{time_value}, \code{version},
 as well as any others (these can be specified when instantiating the
 \code{epi_archive} object via the \code{other_keys} argument, and/or set by operating
Refer to the documentation for \code{\link[=as_epi_archive]{as_epi_archive()}} for -information and examples of relevant parameter names for an \code{epi_archive} object. -Note that there can only be a single row per unique combination of +on \code{DT} directly). Refer to the documentation for \code{as_epi_archive()} for +information and examples of relevant parameter names for an \code{epi_archive} +object. Note that there can only be a single row per unique combination of key variables, and thus the key variables are critical for figuring out how to generate a snapshot of data from the archive, as of a given version. - -In general, the last version of each observation is carried forward (LOCF) to -fill in data between recorded versions, and between the last recorded -update and the \code{versions_end}. One consequence is that the \code{DT} -doesn't have to contain a full snapshot of every version (although this -generally works), but can instead contain only the rows that are new or -changed from the previous version (see \code{compactify}, which does this -automatically). Currently, deletions must be represented as revising a row -to a special state (e.g., making the entries \code{NA} or including a special -column that flags the data as removed and performing some kind of -post-processing), and the archive is unaware of what this state is. Note -that \code{NA}s \emph{can} be introduced by \code{epi_archive} methods for other reasons, -e.g., in \code{\link{epix_fill_through_version}} and \code{\link{epix_merge}}, if requested, to -represent potential update data that we do not yet have access to; or in -\code{\link{epix_merge}} to represent the "value" of an observation before the -version in which it was first released, or if no version of that -observation appears in the archive data at all. - -\strong{A word of caution:} R6 objects, unlike most other objects in R, have -reference semantics. A primary consequence of this is that objects are not -copied when modified. You can read more about this in Hadley Wickham's -\href{https://adv-r.hadley.nz/r6.html#r6-semantics}{Advanced R} book. In order -to construct a modified archive while keeping the original intact, first -make a clone using the \verb{$clone} method, then overwrite the clone's \code{DT} -field with \code{data.table::copy(clone$DT)}, and finally perform the -modifications on the clone. } \section{Metadata}{ -The following pieces of metadata are included as fields in an \code{epi_archive} -object: -\itemize{ -\item \code{geo_type}: the type for the geo values. -\item \code{time_type}: the type for the time values. -\item \code{additional_metadata}: list of additional metadata for the data archive. -} - -Unlike an \code{epi_df} object, metadata for an \code{epi_archive} object \code{x} can be -accessed (and altered) directly, as in \code{x$geo_type} or \code{x$time_type}, -etc. Like an \code{epi_df} object, the \code{geo_type} and \code{time_type} fields in the -metadata of an \code{epi_archive} object are not currently used by any -downstream functions in the \code{epiprocess} package, and serve only as useful -bits of information to convey about the data set at hand. - - The following pieces of metadata are included as fields in an \code{epi_archive} object: \itemize{ @@ -151,16 +133,8 @@ bits of information to convey about the data set at hand. 
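For instance, these metadata fields can be read directly off the bundled
\code{archive_cases_dv_subset} archive; a minimal sketch, where the commented
values are what we would expect for this state-level daily archive rather
than outputs guaranteed by the package:

\preformatted{library(epiprocess)

# Metadata fields are accessed directly as fields of the archive:
archive_cases_dv_subset$geo_type            # e.g., "state"
archive_cases_dv_subset$time_type           # e.g., "day"
archive_cases_dv_subset$additional_metadata # list of any extra fields

# The key variables of the underlying data table:
data.table::key(archive_cases_dv_subset$DT)
# "geo_value" "time_value" "version"
}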
An \code{epi_archive} object can be used to generate a snapshot of the data in \code{epi_df} format, which represents the most up-to-date values of the signal -variables, as of the specified version. This is accomplished by calling the -\code{as_of()} method for an \code{epi_archive} object \code{x}. More details on this -method are documented in the wrapper function \code{\link[=epix_as_of]{epix_as_of()}}. - - -An \code{epi_archive} object can be used to generate a snapshot of the data in -\code{epi_df} format, which represents the most up-to-date values of the signal -variables, as of the specified version. This is accomplished by calling the -\code{as_of()} method for an \code{epi_archive} object \code{x}. More details on this -method are documented in the wrapper function \code{\link[=epix_as_of]{epix_as_of()}}. +variables, as of the specified version. This is accomplished by calling +\code{epix_as_of()}. } \section{Sliding Computations}{ @@ -171,21 +145,11 @@ the \code{slide()} method for an \code{epi_archive} object, which works similarl the way \code{epi_slide()} works for an \code{epi_df} object, but with one key difference: it is version-aware. That is, for an \code{epi_archive} object, the sliding computation at any given reference time point t is performed on -\strong{data that would have been available as of t}. More details on \code{slide()} -are documented in the wrapper function \code{\link[=epix_slide]{epix_slide()}}. - - -We can run a sliding computation over an \code{epi_archive} object, much like -\code{epi_slide()} does for an \code{epi_df} object. This is accomplished by calling -the \code{slide()} method for an \code{epi_archive} object, which works similarly to -the way \code{epi_slide()} works for an \code{epi_df} object, but with one key -difference: it is version-aware. That is, for an \code{epi_archive} object, the -sliding computation at any given reference time point t is performed on -\strong{data that would have been available as of t}. More details on \code{slide()} -are documented in the wrapper function \code{\link[=epix_slide]{epix_slide()}}. +\strong{data that would have been available as of t}. } \examples{ +# Simple ex. with necessary keys tib <- tibble::tibble( geo_value = rep(c("ca", "hi"), each = 5), time_value = rep(seq(as.Date("2020-01-01"), @@ -197,419 +161,36 @@ tib <- tibble::tibble( value = rnorm(10, mean = 2, sd = 1) ) -toy_epi_archive <- tib \%>\% epi_archive$new( +toy_epi_archive <- tib \%>\% as_epi_archive( geo_type = "state", time_type = "day" ) toy_epi_archive -tib <- tibble::tibble( - geo_value = rep(c("ca", "hi"), each = 5), - time_value = rep(seq(as.Date("2020-01-01"), - by = 1, length.out = 5 - ), times = 2), - version = rep(seq(as.Date("2020-01-02"), - by = 1, length.out = 5 - ), times = 2), - value = rnorm(10, mean = 2, sd = 1) + +# Ex. with an additional key for county +df <- data.frame( + geo_value = c(replicate(2, "ca"), replicate(2, "fl")), + county = c(1, 3, 2, 5), + time_value = c( + "2020-06-01", + "2020-06-02", + "2020-06-01", + "2020-06-02" + ), + version = c( + "2020-06-02", + "2020-06-03", + "2020-06-02", + "2020-06-03" + ), + cases = c(1, 2, 3, 4), + cases_rate = c(0.01, 0.02, 0.01, 0.05) ) -toy_epi_archive <- tib \%>\% new_epi_archive2( +x <- df \%>\% as_epi_archive( geo_type = "state", - time_type = "day" + time_type = "day", + other_keys = "county" ) -toy_epi_archive -} -\section{Public fields}{ -\if{html}{\out{
}} -\describe{ -\item{\code{DT}}{(\code{data.table})\cr -the (optionally compactified) datatable} - -\item{\code{geo_type}}{(string)\cr -the resolution of the geographic label (e.g. state)} - -\item{\code{time_type}}{(string)\cr -the resolution of the time column (e.g. day)} - -\item{\code{additional_metadata}}{(named list)\cr -any extra fields, such as \code{other_keys}} -\item{\code{clobberable_versions_start}}{(length-1 of same type&class as \code{version} column, or \code{NA})\cr -the earliest version number that might be rewritten in the future without assigning a new version -date/number, or \code{NA} if this won't happen} - -\item{\code{versions_end}}{(length-1 of same type&class as \code{version} column)\cr -the latest version observed} -} -\if{html}{\out{
}} -} -\section{Methods}{ -\subsection{Public methods}{ -\itemize{ -\item \href{#method-epi_archive-new}{\code{epi_archive$new()}} -\item \href{#method-epi_archive-print}{\code{epi_archive$print()}} -\item \href{#method-epi_archive-as_of}{\code{epi_archive$as_of()}} -\item \href{#method-epi_archive-fill_through_version}{\code{epi_archive$fill_through_version()}} -\item \href{#method-epi_archive-truncate_versions_after}{\code{epi_archive$truncate_versions_after()}} -\item \href{#method-epi_archive-merge}{\code{epi_archive$merge()}} -\item \href{#method-epi_archive-group_by}{\code{epi_archive$group_by()}} -\item \href{#method-epi_archive-slide}{\code{epi_archive$slide()}} -\item \href{#method-epi_archive-clone}{\code{epi_archive$clone()}} -} -} -\if{html}{\out{
}} -\if{html}{\out{
}} -\if{latex}{\out{\hypertarget{method-epi_archive-new}{}}} -\subsection{Method \code{new()}}{ -Creates a new \code{epi_archive} object. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{epi_archive$new( - x, - geo_type, - time_type, - other_keys, - additional_metadata, - compactify, - clobberable_versions_start, - versions_end -)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{x}}{A data frame, data table, or tibble, with columns \code{geo_value}, -\code{time_value}, \code{version}, and then any additional number of columns.} - -\item{\code{geo_type}}{Type for the geo values. If missing, then the function will -attempt to infer it from the geo values present; if this fails, then it -will be set to "custom".} - -\item{\code{time_type}}{Type for the time values. If missing, then the function will -attempt to infer it from the time values present; if this fails, then it -will be set to "custom".} - -\item{\code{other_keys}}{Character vector specifying the names of variables in \code{x} -that should be considered key variables (in the language of \code{data.table}) -apart from "geo_value", "time_value", and "version".} - -\item{\code{additional_metadata}}{List of additional metadata to attach to the -\code{epi_archive} object. The metadata will have \code{geo_type} and \code{time_type} -fields; named entries from the passed list or will be included as well.} - -\item{\code{compactify}}{Optional; Boolean or \code{NULL}: should we remove rows that are -considered redundant for the purposes of \code{epi_archive}'s built-in methods -such as \code{as_of}? As these methods use the last version of each observation -carried forward (LOCF) to interpolate between the version data provided, -rows that don't change these LOCF results can potentially be omitted to -save space while maintaining the same behavior (with the help of the -\code{clobberable_versions_start} and \code{versions_end} fields in some edge cases). -\code{TRUE} will remove these rows, \code{FALSE} will not, and missing or \code{NULL} will -remove these rows and issue a warning. Generally, this can be set to -\code{TRUE}, but if you directly inspect or edit the fields of the \code{epi_archive} -such as its \code{DT}, or rely on redundant updates to achieve a certain -behavior of the \code{ref_time_values} default in \code{epix_slide}, you will have to -determine whether \code{compactify=TRUE} will produce the desired results. If -compactification here is removing a large proportion of the rows, this may -indicate a potential for space, time, or bandwidth savings upstream the -data pipeline, e.g., by avoiding fetching, storing, or processing these -rows of \code{x}.} - -\item{\code{clobberable_versions_start}}{Optional; as in \code{\link{as_epi_archive}}} - -\item{\code{versions_end}}{Optional; as in \code{\link{as_epi_archive}}} -} -\if{html}{\out{
}} -} -\subsection{Details}{ -Refer to the documentation for \code{\link[=as_epi_archive]{as_epi_archive()}} for more information -and examples of parameter names. -Print information about an archive -} - -\subsection{Returns}{ -An \code{epi_archive} object. -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-epi_archive-print}{}}} -\subsection{Method \code{print()}}{ -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{epi_archive$print(class = TRUE, methods = TRUE)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{class}}{Boolean; whether to print the class label header} - -\item{\code{methods}}{Boolean; whether to print all available methods of -the archive} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-epi_archive-as_of}{}}} -\subsection{Method \code{as_of()}}{ -Generates a snapshot in \code{epi_df} format as of a given version. -See the documentation for the wrapper function \code{\link[=epix_as_of]{epix_as_of()}} for -details. The parameter descriptions below are copied from there -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{epi_archive$as_of(max_version, min_time_value = -Inf, all_versions = FALSE)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{max_version}}{Version specifying the max version to permit in the -snapshot. That is, the snapshot will comprise the unique rows of the -current archive data that represent the most up-to-date signal values, as -of the specified \code{max_version} (and whose \code{time_value}s are at least -\code{min_time_value}).} - -\item{\code{min_time_value}}{Time value specifying the min \code{time_value} to permit in -the snapshot. Default is \code{-Inf}, which effectively means that there is no -minimum considered.} - -\item{\code{all_versions}}{Boolean; If \code{all_versions = TRUE}, then the output will be in -\code{epi_archive} format, and contain rows in the specified \code{time_value} range -having \code{version <= max_version}. The resulting object will cover a -potentially narrower \code{version} and \code{time_value} range than \code{x}, depending -on user-provided arguments. Otherwise, there will be one row in the output -for the \code{max_version} of each \code{time_value}. Default is \code{FALSE}.} - -\item{\code{x}}{An \code{epi_archive} object} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-epi_archive-fill_through_version}{}}} -\subsection{Method \code{fill_through_version()}}{ -Fill in unobserved history using requested scheme by mutating -\code{self} and potentially reseating its fields. See -\code{\link{epix_fill_through_version}} for a full description of the non-R6-method -version, which doesn't mutate the input archive but might alias its fields. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{epi_archive$fill_through_version(fill_versions_end, how = c("na", "locf"))}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{fill_versions_end}}{as in \code{\link{epix_fill_through_version}}} - -\item{\code{how}}{as in \code{\link{epix_fill_through_version}}} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-epi_archive-truncate_versions_after}{}}} -\subsection{Method \code{truncate_versions_after()}}{ -Filter to keep only older versions, mutating the archive by -potentially reseating but not mutating some fields. \code{DT} is likely, but not -guaranteed, to be copied. Returns the mutated archive -\link[base:invisible]{invisibly}. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{epi_archive$truncate_versions_after(max_version)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{max_version}}{as in \code{\link{epix_truncate_versions_after}}} - -\item{\code{x}}{as in \code{\link{epix_truncate_versions_after}}} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-epi_archive-merge}{}}} -\subsection{Method \code{merge()}}{ -Merges another \code{epi_archive} with the current one, mutating the -current one by reseating its \code{DT} and several other fields, but avoiding -mutation of the old \code{DT}; returns the current archive -\link[base:invisible]{invisibly}. See \code{\link{epix_merge}} for a full description -of the non-R6-method version, which does not mutate either archive, and -does not alias either archive's \code{DT}. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{epi_archive$merge( - y, - sync = c("forbid", "na", "locf", "truncate"), - compactify = TRUE -)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{y}}{as in \code{\link{epix_merge}}} - -\item{\code{sync}}{as in \code{\link{epix_merge}}} - -\item{\code{compactify}}{as in \code{\link{epix_merge}} -group an epi_archive} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-epi_archive-group_by}{}}} -\subsection{Method \code{group_by()}}{ -group an epi_archive -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{epi_archive$group_by( - ..., - .add = FALSE, - .drop = dplyr::group_by_drop_default(self) -)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{...}}{variables or computations to group by. Computations are always -done on the ungrouped data frame. To perform computations on the grouped -data, you need to use a separate \code{\link[=mutate]{mutate()}} step before the -\code{\link[=group_by]{group_by()}}} - -\item{\code{.add}}{When \code{FALSE}, the default, \code{\link[=group_by]{group_by()}} will override existing -groups. To add to the existing groups, use \code{.add = TRUE}.} - -\item{\code{.drop}}{Drop groups formed by factor levels that don't appear in the -data. The default is \code{TRUE} except when \code{.data} has been previously grouped -with \code{.drop = FALSE}. See \code{\link[=group_by_drop_default]{group_by_drop_default()}} for details.} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-epi_archive-slide}{}}} -\subsection{Method \code{slide()}}{ -Slides a given function over variables in an \code{epi_archive} -object. See the documentation for the wrapper function \code{\link[=epix_slide]{epix_slide()}} for -details. The parameter descriptions below are copied from there -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{epi_archive$slide( - f, - ..., - before, - ref_time_values, - time_step, - new_col_name = "slide_value", - as_list_col = FALSE, - names_sep = "_", - all_versions = FALSE -)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{f}}{Function, formula, or missing; together with \code{...} specifies the -computation to slide. To "slide" means to apply a computation over a -sliding (a.k.a. "rolling") time window for each data group. The window is -determined by the \code{before} parameter described below. One time step is -typically one day or one week; see \code{\link{epi_slide}} details for more -explanation. If a function, \code{f} must take an \code{epi_df} with the same -column names as the archive's \code{DT}, minus the \code{version} column; followed -by a one-row tibble containing the values of the grouping variables for -the associated group; followed by a reference time value, usually as a -\code{Date} object; followed by any number of named arguments. If a formula, -\code{f} can operate directly on columns accessed via \code{.x$var} or \code{.$var}, as -in \code{~ mean (.x$var)} to compute a mean of a column \code{var} for each -group-\code{ref_time_value} combination. The group key can be accessed via -\code{.y} or \code{.group_key}, and the reference time value can be accessed via -\code{.z} or \code{.ref_time_value}. If \code{f} is missing, then \code{...} will specify the -computation.} - -\item{\code{...}}{Additional arguments to pass to the function or formula specified -via \code{f}. Alternatively, if \code{f} is missing, then \code{...} is interpreted as an -expression for tidy evaluation; in addition to referring to columns -directly by name, the expression has access to \code{.data} and \code{.env} pronouns -as in \code{dplyr} verbs, and can also refer to the \code{.group_key} and -\code{.ref_time_value}. See details of \code{\link{epi_slide}}.} - -\item{\code{before}}{How far \code{before} each \code{ref_time_value} should the sliding -window extend? If provided, should be a single, non-NA, -\link[vctrs:vec_cast]{integer-compatible} number of time steps. This window -endpoint is inclusive. For example, if \code{before = 7}, and one time step is -one day, then to produce a value for a \code{ref_time_value} of January 8, we -apply the given function or formula to data (for each group present) with -\code{time_value}s from January 1 onward, as they were reported on January 8. -For typical disease surveillance sources, this will not include any data -with a \code{time_value} of January 8, and, depending on the amount of reporting -latency, may not include January 7 or even earlier \code{time_value}s. (If -instead the archive were to hold nowcasts instead of regular surveillance -data, then we would indeed expect data for \code{time_value} January 8. If it -were to hold forecasts, then we would expect data for \code{time_value}s after -January 8, and the sliding window would extend as far after each -\code{ref_time_value} as needed to include all such \code{time_value}s.)} - -\item{\code{ref_time_values}}{Reference time values / versions for sliding -computations; each element of this vector serves both as the anchor point -for the \code{time_value} window for the computation and the \code{max_version} -\code{as_of} which we fetch data in this window. If missing, then this will set -to a regularly-spaced sequence of values set to cover the range of -\code{version}s in the \code{DT} plus the \code{versions_end}; the spacing of values will -be guessed (using the GCD of the skips between values).} - -\item{\code{time_step}}{Optional function used to define the meaning of one time -step, which if specified, overrides the default choice based on the -\code{time_value} column. 
This function must take a positive integer and return -an object of class \code{lubridate::period}. For example, we can use \code{time_step = lubridate::hours} in order to set the time step to be one hour (this -would only be meaningful if \code{time_value} is of class \code{POSIXct}).} - -\item{\code{new_col_name}}{String indicating the name of the new column that will -contain the derivative values. Default is "slide_value"; note that setting -\code{new_col_name} equal to an existing column name will overwrite this column.} - -\item{\code{as_list_col}}{Should the slide results be held in a list column, or be -\link[tidyr:chop]{unchopped}/\link[tidyr:unnest]{unnested}? Default is \code{FALSE}, -in which case a list object returned by \code{f} would be unnested (using -\code{\link[tidyr:unnest]{tidyr::unnest()}}), and, if the slide computations output data frames, -the names of the resulting columns are given by prepending \code{new_col_name} -to the names of the list elements.} - -\item{\code{names_sep}}{String specifying the separator to use in \code{tidyr::unnest()} -when \code{as_list_col = FALSE}. Default is "_". Using \code{NULL} drops the prefix -from \code{new_col_name} entirely.} - -\item{\code{all_versions}}{(Not the same as \code{all_rows} parameter of \code{epi_slide}.) If -\code{all_versions = TRUE}, then \code{f} will be passed the version history (all -\code{version <= ref_time_value}) for rows having \code{time_value} between -\code{ref_time_value - before} and \code{ref_time_value}. Otherwise, \code{f} will be -passed only the most recent \code{version} for every unique \code{time_value}. -Default is \code{FALSE}.} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-epi_archive-clone}{}}} -\subsection{Method \code{clone()}}{ -The objects of this class are cloneable with this method. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{epi_archive$clone(deep = FALSE)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{deep}}{Whether to make a deep clone.} -} -\if{html}{\out{
}} -} -} } diff --git a/man/epix_as_of.Rd b/man/epix_as_of.Rd index 9a0a53ce..dc359a7b 100644 --- a/man/epix_as_of.Rd +++ b/man/epix_as_of.Rd @@ -34,37 +34,17 @@ Generates a snapshot in \code{epi_df} format from an \code{epi_archive} object, given version. See the \href{https://cmu-delphi.github.io/epiprocess/articles/archive.html}{archive vignette} for examples. } -\details{ -This is simply a wrapper around the \code{as_of()} method of the -\code{epi_archive} class, so if \code{x} is an \code{epi_archive} object, then: - -\if{html}{\out{
}}\preformatted{epix_as_of(x, max_version = v) -}\if{html}{\out{
}} - -is equivalent to: - -\if{html}{\out{
}}\preformatted{x$as_of(max_version = v) -}\if{html}{\out{
}} - -Mutation and aliasing: \code{epix_as_of} and \verb{$as_of} will not mutate the input -archives, but may in some edge cases alias parts of the inputs, so copy the -outputs if needed before using mutating operations like \code{data.table}'s \verb{:=} -operator. Currently, the only situation where there is potentially aliasing -is of the \code{DT} in edge cases with \code{all_versions = TRUE}, but this may change -in the future. -} \examples{ # warning message of data latency shown epix_as_of( - x = archive_cases_dv_subset, + archive_cases_dv_subset, max_version = max(archive_cases_dv_subset$DT$version) ) - range(archive_cases_dv_subset$DT$version) # 2020-06-02 -- 2021-12-01 epix_as_of( - x = archive_cases_dv_subset, + archive_cases_dv_subset, max_version = as.Date("2020-06-12") ) @@ -79,7 +59,7 @@ epix_as_of( withCallingHandlers( { epix_as_of( - x = archive_cases_dv_subset, + archive_cases_dv_subset, max_version = max(archive_cases_dv_subset$DT$version) ) }, diff --git a/man/epix_as_of2.Rd b/man/epix_as_of2.Rd deleted file mode 100644 index ac69e9a9..00000000 --- a/man/epix_as_of2.Rd +++ /dev/null @@ -1,95 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/methods-epi_archive_new.R -\name{epix_as_of2} -\alias{epix_as_of2} -\title{Generate a snapshot from an \code{epi_archive} object} -\usage{ -epix_as_of2( - epi_archive, - max_version, - min_time_value = -Inf, - all_versions = FALSE -) -} -\arguments{ -\item{max_version}{Time value specifying the max version to permit in the -snapshot. That is, the snapshot will comprise the unique rows of the -current archive data that represent the most up-to-date signal values, as -of the specified \code{max_version} (and whose time values are at least -\code{min_time_value}.)} - -\item{min_time_value}{Time value specifying the min time value to permit in -the snapshot. Default is \code{-Inf}, which effectively means that there is no -minimum considered.} - -\item{all_versions}{If \code{all_versions = TRUE}, then the output will be in -\code{epi_archive} format, and contain rows in the specified \code{time_value} range -having \code{version <= max_version}. The resulting object will cover a -potentially narrower \code{version} and \code{time_value} range than \code{x}, depending -on user-provided arguments. Otherwise, there will be one row in the output -for the \code{max_version} of each \code{time_value}. Default is \code{FALSE}.} - -\item{x}{An \code{epi_archive} object} -} -\value{ -An \code{epi_df} object. -} -\description{ -Generates a snapshot in \code{epi_df} format from an \code{epi_archive} object, as of a -given version. See the \href{https://cmu-delphi.github.io/epiprocess/articles/archive.html}{archive vignette} for -examples. -} -\details{ -This is simply a wrapper around the \code{as_of()} method of the -\code{epi_archive} class, so if \code{x} is an \code{epi_archive} object, then: - -\if{html}{\out{
}}\preformatted{epix_as_of(x, max_version = v) -}\if{html}{\out{
}} - -is equivalent to: - -\if{html}{\out{
}}\preformatted{x$as_of(max_version = v) -}\if{html}{\out{
}} - -Mutation and aliasing: \code{epix_as_of} and \verb{$as_of} will not mutate the input -archives, but may in some edge cases alias parts of the inputs, so copy the -outputs if needed before using mutating operations like \code{data.table}'s \verb{:=} -operator. Currently, the only situation where there is potentially aliasing -is of the \code{DT} in edge cases with \code{all_versions = TRUE}, but this may change -in the future. -} -\examples{ -# warning message of data latency shown -epix_as_of2( - archive_cases_dv_subset_2, - max_version = max(archive_cases_dv_subset_2$DT$version) -) - -range(archive_cases_dv_subset_2$DT$version) # 2020-06-02 -- 2021-12-01 - -epix_as_of2( - archive_cases_dv_subset_2, - max_version = as.Date("2020-06-12") -) - -# When fetching a snapshot as of the latest version with update data in the -# archive, a warning is issued by default, as this update data might not yet -# be finalized (for example, if data versions are labeled with dates, these -# versions might be overwritten throughout the corresponding days with -# additional data or "hotfixes" of erroroneous data; when we build an archive -# based on database queries, the latest available update might still be -# subject to change, but previous versions should be finalized). We can -# muffle such warnings with the following pattern: -withCallingHandlers( - { - epix_as_of2( - archive_cases_dv_subset_2, - max_version = max(archive_cases_dv_subset_2$DT$version) - ) - }, - epiprocess__snapshot_as_of_clobberable_version = function(wrn) invokeRestart("muffleWarning") -) -# Since R 4.0, there is a `globalCallingHandlers` function that can be used -# to globally toggle these warnings. - -} diff --git a/man/epix_fill_through_version.Rd b/man/epix_fill_through_version.Rd index d5d2c278..a6f9c360 100644 --- a/man/epix_fill_through_version.Rd +++ b/man/epix_fill_through_version.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/methods-epi_archive.R \name{epix_fill_through_version} \alias{epix_fill_through_version} -\title{\code{epi_archive} with unobserved history filled in (won't mutate, might alias)} +\title{Fill \code{epi_archive} unobserved history} \usage{ epix_fill_through_version(x, fill_versions_end, how = c("na", "locf")) } @@ -34,11 +34,3 @@ another archive. This function provides one way to approach such mismatches: pretend that we've "observed" additional versions, filling in these versions with NAs or extrapolated values. } -\details{ -'\code{epix_fill_through_version} will not mutate its \code{x} argument, but its result -might alias fields of \code{x} (e.g., mutating the result's \code{DT} might mutate -\code{x$DT}). The R6 method variant, \code{x$fill_through_version}, will mutate \code{x} to -give the result, but might reseat its fields (e.g., references to the old -\code{x$DT} might not be updated by this function or subsequent operations on -\code{x}), and returns the updated \code{x} \link[base:invisible]{invisibly}. 
-} diff --git a/man/epix_fill_through_version2.Rd b/man/epix_fill_through_version2.Rd deleted file mode 100644 index 7389388a..00000000 --- a/man/epix_fill_through_version2.Rd +++ /dev/null @@ -1,48 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/methods-epi_archive_new.R -\name{epix_fill_through_version2} -\alias{epix_fill_through_version2} -\title{\code{epi_archive} with unobserved history filled in (won't mutate, might alias)} -\usage{ -epix_fill_through_version2( - epi_archive, - fill_versions_end, - how = c("na", "locf") -) -} -\arguments{ -\item{fill_versions_end}{Length-1, same class&type as \code{x$version}: the -version through which to fill in missing version history; this will be the -result's \verb{$versions_end} unless it already had a later -\verb{$versions_end}.} - -\item{how}{Optional; \code{"na"} or \code{"locf"}: \code{"na"} will fill in any missing -required version history with \code{NA}s, by inserting (if necessary) an update -immediately after the current \verb{$versions_end} that revises all -existing measurements to be \code{NA} (this is only supported for \code{version} -classes with a \code{next_after} implementation); \code{"locf"} will fill in missing -version history with the last version of each observation carried forward -(LOCF), by leaving the update \verb{$DT} alone (other \code{epi_archive} methods are -based on LOCF). Default is \code{"na"}.} - -\item{x}{An \code{epi_archive}} -} -\value{ -An \code{epi_archive} -} -\description{ -Sometimes, due to upstream data pipeline issues, we have to work with a -version history that isn't completely up to date, but with functions that -expect archives that are completely up to date, or equally as up-to-date as -another archive. This function provides one way to approach such mismatches: -pretend that we've "observed" additional versions, filling in these versions -with NAs or extrapolated values. -} -\details{ -'\code{epix_fill_through_version} will not mutate its \code{x} argument, but its result -might alias fields of \code{x} (e.g., mutating the result's \code{DT} might mutate -\code{x$DT}). The R6 method variant, \code{x$fill_through_version}, will mutate \code{x} to -give the result, but might reseat its fields (e.g., references to the old -\code{x$DT} might not be updated by this function or subsequent operations on -\code{x}), and returns the updated \code{x} \link[base:invisible]{invisibly}. -} diff --git a/man/epix_merge.Rd b/man/epix_merge.Rd index 53dea071..ea0d2444 100644 --- a/man/epix_merge.Rd +++ b/man/epix_merge.Rd @@ -30,7 +30,7 @@ use \code{min(x$versions_end, y$versions_end)} as the result's \code{versions_en and discard any rows containing update rows for later versions.} \item{compactify}{Optional; \code{TRUE}, \code{FALSE}, or \code{NULL}; should the result be -compactified? See \code{\link{as_epi_archive}} for an explanation of what this means. +compactified? See \code{as_epi_archive()} for an explanation of what this means. Default here is \code{TRUE}.} } \value{ @@ -38,21 +38,14 @@ the resulting \code{epi_archive} } \description{ Merges two \code{epi_archive}s that share a common \code{geo_value}, \code{time_value}, and -set of key columns. 
When they also share a common \code{versions_end}, -using \verb{$as_of} on the result should be the same as using \verb{$as_of} on \code{x} and -\code{y} individually, then performing a full join of the \code{DT}s on the non-version -key columns (potentially consolidating multiple warnings about clobberable -versions). If the \code{versions_end} values differ, the -\code{sync} parameter controls what is done. +set of key columns. When they also share a common \code{versions_end}, using +\code{epix_as_of} on the result should be the same as using \code{epix_as_of} on \code{x} +and \code{y} individually, then performing a full join of the \code{DT}s on the +non-version key columns (potentially consolidating multiple warnings about +clobberable versions). If the \code{versions_end} values differ, the \code{sync} +parameter controls what is done. } \details{ -This function, \code{\link{epix_merge}}, does not mutate its inputs and will not alias -either archive's \code{DT}, but may alias other fields; \code{x$merge} will overwrite -\code{x} with the result of the merge, reseating its \code{DT} and several other fields -(making them point to different objects), but avoiding mutation of the -contents of the old \code{DT} (only relevant if you have another reference to the -old \code{DT} in another object). - In all cases, \code{additional_metadata} will be an empty list, and \code{clobberable_versions_start} will be set to the earliest version that could be clobbered in either input archive. @@ -67,7 +60,5 @@ y <- archive_cases_dv_subset$DT \%>\% as_epi_archive(compactify = TRUE) # merge results stored in a third object: xy <- epix_merge(x, y) -# vs. mutating x to hold the merge result: -x$merge(y) } diff --git a/man/epix_merge2.Rd b/man/epix_merge2.Rd deleted file mode 100644 index 11d0aff5..00000000 --- a/man/epix_merge2.Rd +++ /dev/null @@ -1,71 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/methods-epi_archive_new.R -\name{epix_merge2} -\alias{epix_merge2} -\title{Merge two \code{epi_archive} objects} -\usage{ -epix_merge2( - x, - y, - sync = c("forbid", "na", "locf", "truncate"), - compactify = TRUE -) -} -\arguments{ -\item{x, y}{Two \code{epi_archive} objects to join together.} - -\item{sync}{Optional; \code{"forbid"}, \code{"na"}, \code{"locf"}, or \code{"truncate"}; in the -case that \code{x$versions_end} doesn't match \code{y$versions_end}, what do we do?: -\code{"forbid"}: emit an error; "na": use \code{max(x$versions_end, y$versions_end)} -as the result's \code{versions_end}, but ensure that, if we request a snapshot -as of a version after \code{min(x$versions_end, y$versions_end)}, the -observation columns from the less up-to-date archive will be all NAs (i.e., -imagine there was an update immediately after its \code{versions_end} which -revised all observations to be \code{NA}); \code{"locf"}: use \code{max(x$versions_end, y$versions_end)} as the result's \code{versions_end}, allowing the last version -of each observation to be carried forward to extrapolate unavailable -versions for the less up-to-date input archive (i.e., imagining that in the -less up-to-date archive's data set remained unchanged between its actual -\code{versions_end} and the other archive's \code{versions_end}); or \code{"truncate"}: -use \code{min(x$versions_end, y$versions_end)} as the result's \code{versions_end}, -and discard any rows containing update rows for later versions.} - -\item{compactify}{Optional; \code{TRUE}, \code{FALSE}, or \code{NULL}; should the 
result be -compactified? See \code{\link{as_epi_archive}} for an explanation of what this means. -Default here is \code{TRUE}.} -} -\value{ -the resulting \code{epi_archive} -} -\description{ -Merges two \code{epi_archive}s that share a common \code{geo_value}, \code{time_value}, and -set of key columns. When they also share a common \code{versions_end}, -using \verb{$as_of} on the result should be the same as using \verb{$as_of} on \code{x} and -\code{y} individually, then performing a full join of the \code{DT}s on the non-version -key columns (potentially consolidating multiple warnings about clobberable -versions). If the \code{versions_end} values differ, the -\code{sync} parameter controls what is done. -} -\details{ -This function, \code{\link{epix_merge}}, does not mutate its inputs and will not alias -either archive's \code{DT}, but may alias other fields; \code{x$merge} will overwrite -\code{x} with the result of the merge, reseating its \code{DT} and several other fields -(making them point to different objects), but avoiding mutation of the -contents of the old \code{DT} (only relevant if you have another reference to the -old \code{DT} in another object). - -In all cases, \code{additional_metadata} will be an empty list, and -\code{clobberable_versions_start} will be set to the earliest version that could -be clobbered in either input archive. -} -\examples{ -# create two example epi_archive datasets -x <- archive_cases_dv_subset_2$DT \%>\% - dplyr::select(geo_value, time_value, version, case_rate_7d_av) \%>\% - as_epi_archive2(compactify = TRUE) -y <- archive_cases_dv_subset_2$DT \%>\% - dplyr::select(geo_value, time_value, version, percent_cli) \%>\% - as_epi_archive2(compactify = TRUE) -# merge results stored in a third object: -xy <- epix_merge2(x, y) - -} diff --git a/man/epix_slide.Rd b/man/epix_slide.Rd index 3ac55a18..c8f09594 100644 --- a/man/epix_slide.Rd +++ b/man/epix_slide.Rd @@ -1,7 +1,9 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/methods-epi_archive.R +% Please edit documentation in R/methods-epi_archive.R, R/grouped_epi_archive.R \name{epix_slide} \alias{epix_slide} +\alias{epix_slide.epi_archive} +\alias{epix_slide.grouped_epi_archive} \title{Slide a function over variables in an \code{epi_archive} or \code{grouped_epi_archive}} \usage{ epix_slide( @@ -16,6 +18,32 @@ epix_slide( names_sep = "_", all_versions = FALSE ) + +\method{epix_slide}{epi_archive}( + x, + f, + ..., + before, + ref_time_values, + time_step, + new_col_name = "slide_value", + as_list_col = FALSE, + names_sep = "_", + all_versions = FALSE +) + +\method{epix_slide}{grouped_epi_archive}( + x, + f, + ..., + before, + ref_time_values, + time_step, + new_col_name = "slide_value", + as_list_col = FALSE, + names_sep = "_", + all_versions = FALSE +) } \arguments{ \item{x}{An \code{\link{epi_archive}} or \code{\link{grouped_epi_archive}} object. If ungrouped, @@ -64,8 +92,8 @@ January 8, and the sliding window would extend as far after each \item{ref_time_values}{Reference time values / versions for sliding computations; each element of this vector serves both as the anchor point for the \code{time_value} window for the computation and the \code{max_version} -\code{as_of} which we fetch data in this window. If missing, then this will set -to a regularly-spaced sequence of values set to cover the range of +\code{epix_as_of} which we fetch data in this window. 
If missing, then this will
+be set to a regularly-spaced sequence of values chosen to cover the range of
\code{version}s in the \code{DT} plus the \code{versions_end}; the spacing of values will
be guessed (using the GCD of the skips between values).}
@@ -162,31 +190,11 @@ Apart from the above distinctions, the interfaces between \code{epix_slide()} an

Furthermore, the current function can be considerably slower than
\code{epi_slide()}, for two reasons: (1) it must repeatedly fetch
-properly-versioned snapshots from the data archive (via its \code{as_of()}
-method), and (2) it performs a "manual" sliding of sorts, and does not
-benefit from the highly efficient \code{slider} package. For this reason, it
-should never be used in place of \code{epi_slide()}, and only used when
-version-aware sliding is necessary (as it its purpose).
-
-Finally, this is simply a wrapper around the \code{slide()} method of the
-\code{epi_archive} and \code{grouped_epi_archive} classes, so if \code{x} is an
-object of either of these classes, then:
-
-\if{html}{\out{
}}\preformatted{epix_slide(x, new_var = comp(old_var), before = 119) -}\if{html}{\out{
}} - -is equivalent to: - -\if{html}{\out{
}}\preformatted{x$slide(new_var = comp(old_var), before = 119) -}\if{html}{\out{
}}
-
-Mutation and aliasing: \code{epix_slide} and \verb{$slide} will not perform in-place
-mutation of the input archives on their own. In some edge cases the inputs it
-feeds to the slide computations may alias parts of the input archive, so copy
-the slide computation inputs if needed before using mutating operations like
-\code{data.table}'s \verb{:=} operator. Similarly, in some edge cases, the output of
-the slide operation may alias parts of the input archive, so similarly, make
-sure to clone and/or copy appropriately before using in-place mutation.
+properly-versioned snapshots from the data archive (via \code{epix_as_of()}),
+and (2) it performs a "manual" sliding of sorts, and does not benefit from
+the highly efficient \code{slider} package. For this reason, it should never be
+used in place of \code{epi_slide()}, and only used when version-aware sliding is
+necessary (as is its purpose).
}
\examples{
library(dplyr)
diff --git a/man/epix_slide2.Rd b/man/epix_slide2.Rd
deleted file mode 100644
index 8d822bc0..00000000
--- a/man/epix_slide2.Rd
+++ /dev/null
@@ -1,283 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/methods-epi_archive_new.R
-\name{epix_slide2}
-\alias{epix_slide2}
-\title{Slide a function over variables in an \code{epi_archive} or \code{grouped_epi_archive}}
-\usage{
-epix_slide2(
-  x,
-  f,
-  ...,
-  before,
-  ref_time_values,
-  time_step,
-  new_col_name = "slide_value",
-  as_list_col = FALSE,
-  names_sep = "_",
-  all_versions = FALSE
-)
-}
-\arguments{
-\item{x}{An \code{\link{epi_archive}} or \code{\link{grouped_epi_archive}} object. If ungrouped,
-all data in \code{x} will be treated as part of a single data group.}
-
-\item{f}{Function, formula, or missing; together with \code{...} specifies the
-computation to slide. To "slide" means to apply a computation over a
-sliding (a.k.a. "rolling") time window for each data group. The window is
-determined by the \code{before} parameter described below. One time step is
-typically one day or one week; see \code{\link{epi_slide}} details for more
-explanation. If a function, \code{f} must take an \code{epi_df} with the same
-column names as the archive's \code{DT}, minus the \code{version} column; followed
-by a one-row tibble containing the values of the grouping variables for
-the associated group; followed by a reference time value, usually as a
-\code{Date} object; followed by any number of named arguments. If a formula,
-\code{f} can operate directly on columns accessed via \code{.x$var} or \code{.$var}, as
-in \code{~ mean (.x$var)} to compute a mean of a column \code{var} for each
-group-\code{ref_time_value} combination. The group key can be accessed via
-\code{.y} or \code{.group_key}, and the reference time value can be accessed via
-\code{.z} or \code{.ref_time_value}. If \code{f} is missing, then \code{...} will specify the
-computation.}
-
-\item{...}{Additional arguments to pass to the function or formula specified
-via \code{f}. Alternatively, if \code{f} is missing, then \code{...} is interpreted as an
-expression for tidy evaluation; in addition to referring to columns
-directly by name, the expression has access to \code{.data} and \code{.env} pronouns
-as in \code{dplyr} verbs, and can also refer to the \code{.group_key} and
-\code{.ref_time_value}. See details of \code{\link{epi_slide}}.}
-
-\item{before}{How far \code{before} each \code{ref_time_value} should the sliding
-window extend? 
If provided, should be a single, non-NA, -\link[vctrs:vec_cast]{integer-compatible} number of time steps. This window -endpoint is inclusive. For example, if \code{before = 7}, and one time step is -one day, then to produce a value for a \code{ref_time_value} of January 8, we -apply the given function or formula to data (for each group present) with -\code{time_value}s from January 1 onward, as they were reported on January 8. -For typical disease surveillance sources, this will not include any data -with a \code{time_value} of January 8, and, depending on the amount of reporting -latency, may not include January 7 or even earlier \code{time_value}s. (If -instead the archive were to hold nowcasts instead of regular surveillance -data, then we would indeed expect data for \code{time_value} January 8. If it -were to hold forecasts, then we would expect data for \code{time_value}s after -January 8, and the sliding window would extend as far after each -\code{ref_time_value} as needed to include all such \code{time_value}s.)} - -\item{ref_time_values}{Reference time values / versions for sliding -computations; each element of this vector serves both as the anchor point -for the \code{time_value} window for the computation and the \code{max_version} -\code{as_of} which we fetch data in this window. If missing, then this will set -to a regularly-spaced sequence of values set to cover the range of -\code{version}s in the \code{DT} plus the \code{versions_end}; the spacing of values will -be guessed (using the GCD of the skips between values).} - -\item{time_step}{Optional function used to define the meaning of one time -step, which if specified, overrides the default choice based on the -\code{time_value} column. This function must take a positive integer and return -an object of class \code{lubridate::period}. For example, we can use \code{time_step = lubridate::hours} in order to set the time step to be one hour (this -would only be meaningful if \code{time_value} is of class \code{POSIXct}).} - -\item{new_col_name}{String indicating the name of the new column that will -contain the derivative values. Default is "slide_value"; note that setting -\code{new_col_name} equal to an existing column name will overwrite this column.} - -\item{as_list_col}{Should the slide results be held in a list column, or be -\link[tidyr:chop]{unchopped}/\link[tidyr:unnest]{unnested}? Default is \code{FALSE}, -in which case a list object returned by \code{f} would be unnested (using -\code{\link[tidyr:unnest]{tidyr::unnest()}}), and, if the slide computations output data frames, -the names of the resulting columns are given by prepending \code{new_col_name} -to the names of the list elements.} - -\item{names_sep}{String specifying the separator to use in \code{tidyr::unnest()} -when \code{as_list_col = FALSE}. Default is "_". Using \code{NULL} drops the prefix -from \code{new_col_name} entirely.} - -\item{all_versions}{(Not the same as \code{all_rows} parameter of \code{epi_slide}.) If -\code{all_versions = TRUE}, then \code{f} will be passed the version history (all -\code{version <= ref_time_value}) for rows having \code{time_value} between -\code{ref_time_value - before} and \code{ref_time_value}. Otherwise, \code{f} will be -passed only the most recent \code{version} for every unique \code{time_value}. 
-Default is \code{FALSE}.} -} -\value{ -A tibble whose columns are: the grouping variables, \code{time_value}, -containing the reference time values for the slide computation, and a -column named according to the \code{new_col_name} argument, containing the slide -values. -} -\description{ -Slides a given function over variables in an \code{epi_archive} object. This -behaves similarly to \code{epi_slide()}, with the key exception that it is -version-aware: the sliding computation at any given reference time t is -performed on \strong{data that would have been available as of t}. See the -\href{https://cmu-delphi.github.io/epiprocess/articles/archive.html}{archive vignette} for -examples. -} -\details{ -A few key distinctions between the current function and \code{epi_slide()}: -\enumerate{ -\item In \code{f} functions for \code{epix_slide}, one should not assume that the input -data to contain any rows with \code{time_value} matching the computation's -\code{ref_time_value} (accessible via \verb{attributes()$metadata$as_of}); for -typical epidemiological surveillance data, observations pertaining to a -particular time period (\code{time_value}) are first reported \code{as_of} some -instant after that time period has ended. -\item \code{epix_slide()} doesn't accept an \code{after} argument; its windows extend -from \code{before} time steps before a given \code{ref_time_value} through the last -\code{time_value} available as of version \code{ref_time_value} (typically, this -won't include \code{ref_time_value} itself, as observations about a particular -time interval (e.g., day) are only published after that time interval -ends); \code{epi_slide} windows extend from \code{before} time steps before a -\code{ref_time_value} through \code{after} time steps after \code{ref_time_value}. -\item The input class and columns are similar but different: \code{epix_slide} -(with the default \code{all_versions=FALSE}) keeps all columns and the -\code{epi_df}-ness of the first argument to each computation; \code{epi_slide} only -provides the grouping variables in the second input, and will convert the -first input into a regular tibble if the grouping variables include the -essential \code{geo_value} column. (With \code{all_versions=TRUE}, \code{epix_slide} will -will provide an \code{epi_archive} rather than an \code{epi-df} to each -computation.) -\item The output class and columns are similar but different: \code{epix_slide()} -returns a tibble containing only the grouping variables, \code{time_value}, and -the new column(s) from the slide computations, whereas \code{epi_slide()} -returns an \code{epi_df} with all original variables plus the new columns from -the slide computations. (Both will mirror the grouping or ungroupedness of -their input, with one exception: \code{epi_archive}s can have trivial -(zero-variable) groupings, but these will be dropped in \code{epix_slide} -results as they are not supported by tibbles.) -\item There are no size stability checks or element/row recycling to maintain -size stability in \code{epix_slide}, unlike in \code{epi_slide}. (\code{epix_slide} is -roughly analogous to \code{\link[dplyr:group_map]{dplyr::group_modify}}, while \code{epi_slide} is roughly -analogous to \code{dplyr::mutate} followed by \code{dplyr::arrange}) This is detailed -in the "advanced" vignette. 
-\item \code{all_rows} is not supported in \code{epix_slide}; since the slide -computations are allowed more flexibility in their outputs than in -\code{epi_slide}, we can't guess a good representation for missing computations -for excluded group-\code{ref_time_value} pairs. -\item The \code{ref_time_values} default for \code{epix_slide} is based on making an -evenly-spaced sequence out of the \code{version}s in the \code{DT} plus the -\code{versions_end}, rather than the \code{time_value}s. -} - -Apart from the above distinctions, the interfaces between \code{epix_slide()} and -\code{epi_slide()} are the same. - -Furthermore, the current function can be considerably slower than -\code{epi_slide()}, for two reasons: (1) it must repeatedly fetch -properly-versioned snapshots from the data archive (via its \code{as_of()} -method), and (2) it performs a "manual" sliding of sorts, and does not -benefit from the highly efficient \code{slider} package. For this reason, it -should never be used in place of \code{epi_slide()}, and only used when -version-aware sliding is necessary (as it its purpose). - -Finally, this is simply a wrapper around the \code{slide()} method of the -\code{epi_archive} and \code{grouped_epi_archive} classes, so if \code{x} is an -object of either of these classes, then: - -\if{html}{\out{
}}\preformatted{epix_slide(x, new_var = comp(old_var), before = 119) -}\if{html}{\out{
}} - -is equivalent to: - -\if{html}{\out{
}}\preformatted{x$slide(new_var = comp(old_var), before = 119) -}\if{html}{\out{
}} - -Mutation and aliasing: \code{epix_slide} and \verb{$slide} will not perform in-place -mutation of the input archives on their own. In some edge cases the inputs it -feeds to the slide computations may alias parts of the input archive, so copy -the slide computation inputs if needed before using mutating operations like -\code{data.table}'s \verb{:=} operator. Similarly, in some edge cases, the output of -the slide operation may alias parts of the input archive, so similarly, make -sure to clone and/or copy appropriately before using in-place mutation. -} -\examples{ -library(dplyr) - -# Reference time points for which we want to compute slide values: -ref_time_values <- seq(as.Date("2020-06-01"), - as.Date("2020-06-15"), - by = "1 day" -) - -# A simple (but not very useful) example (see the archive vignette for a more -# realistic one): -archive_cases_dv_subset_2 \%>\% - group_by(geo_value) \%>\% - epix_slide2( - f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = ref_time_values, - new_col_name = "case_rate_7d_av_recent_av" - ) \%>\% - ungroup() -# We requested time windows that started 2 days before the corresponding time -# values. The actual number of `time_value`s in each computation depends on -# the reporting latency of the signal and `time_value` range covered by the -# archive (2020-06-01 -- 2021-11-30 in this example). In this case, we have -# * 0 `time_value`s, for ref time 2020-06-01 --> the result is automatically -# discarded -# * 1 `time_value`, for ref time 2020-06-02 -# * 2 `time_value`s, for the rest of the results -# * never the 3 `time_value`s we would get from `epi_slide`, since, because -# of data latency, we'll never have an observation -# `time_value == ref_time_value` as of `ref_time_value`. -# The example below shows this type of behavior in more detail. - -# Examining characteristics of the data passed to each computation with -# `all_versions=FALSE`. -archive_cases_dv_subset_2 \%>\% - group_by(geo_value) \%>\% - epix_slide2( - function(x, gk, rtv) { - tibble( - time_range = if (nrow(x) == 0L) { - "0 `time_value`s" - } else { - sprintf("\%s -- \%s", min(x$time_value), max(x$time_value)) - }, - n = nrow(x), - class1 = class(x)[[1L]] - ) - }, - before = 5, all_versions = FALSE, - ref_time_values = ref_time_values, names_sep = NULL - ) \%>\% - ungroup() \%>\% - arrange(geo_value, time_value) - -# --- Advanced: --- - -# `epix_slide` with `all_versions=FALSE` (the default) applies a -# version-unaware computation to several versions of the data. We can also -# use `all_versions=TRUE` to apply a version-*aware* computation to several -# versions of the data, again looking at characteristics of the data passed -# to each computation. 
In this case, each computation should expect an -# `epi_archive` containing the relevant version data: - -archive_cases_dv_subset_2 \%>\% - group_by(geo_value) \%>\% - epix_slide2( - function(x, gk, rtv) { - tibble( - versions_start = if (nrow(x$DT) == 0L) { - "NA (0 rows)" - } else { - toString(min(x$DT$version)) - }, - versions_end = x$versions_end, - time_range = if (nrow(x$DT) == 0L) { - "0 `time_value`s" - } else { - sprintf("\%s -- \%s", min(x$DT$time_value), max(x$DT$time_value)) - }, - n = nrow(x$DT), - class1 = class(x)[[1L]] - ) - }, - before = 5, all_versions = TRUE, - ref_time_values = ref_time_values, names_sep = NULL - ) \%>\% - ungroup() \%>\% - # Focus on one geo_value so we can better see the columns above: - filter(geo_value == "ca") \%>\% - select(-geo_value) - -} diff --git a/man/epix_truncate_versions_after.Rd b/man/epix_truncate_versions_after.Rd index f30be07f..c12cf9bb 100644 --- a/man/epix_truncate_versions_after.Rd +++ b/man/epix_truncate_versions_after.Rd @@ -1,31 +1,26 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/methods-epi_archive.R, -% R/methods-epi_archive_new.R +% Please edit documentation in R/methods-epi_archive.R, R/grouped_epi_archive.R \name{epix_truncate_versions_after} \alias{epix_truncate_versions_after} +\alias{epix_truncate_versions_after.epi_archive} +\alias{epix_truncate_versions_after.grouped_epi_archive} \title{Filter an \code{epi_archive} object to keep only older versions} \usage{ epix_truncate_versions_after(x, max_version) -epix_truncate_versions_after(x, max_version) +\method{epix_truncate_versions_after}{epi_archive}(x, max_version) + +\method{epix_truncate_versions_after}{grouped_epi_archive}(x, max_version) } \arguments{ -\item{x}{An \code{epi_archive} object} +\item{x}{An \code{epi_archive} object.} -\item{max_version}{Time value specifying the max version to permit in the -filtered archive. That is, the output archive will comprise rows of the -current archive data having \code{version} less than or equal to the -specified \code{max_version}} +\item{max_version}{The latest version to include in the archive.} } \value{ -An \code{epi_archive} object - An \code{epi_archive} object } \description{ -Generates a filtered \code{epi_archive} from an \code{epi_archive} object, keeping -only rows with \code{version} falling on or before a specified date. - Generates a filtered \code{epi_archive} from an \code{epi_archive} object, keeping only rows with \code{version} falling on or before a specified date. 
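For example, a minimal sketch using the bundled \code{archive_cases_dv_subset}
archive (the cutoff date below is arbitrary, chosen only to fall inside this
archive's 2020-06-02 -- 2021-12-01 version range):

\preformatted{library(epiprocess)

# Keep only the update history recorded on or before 2020-06-12:
x <- epix_truncate_versions_after(
  archive_cases_dv_subset,
  max_version = as.Date("2020-06-12")
)

# All remaining rows satisfy version <= max_version:
max(x$DT$version)
}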
} diff --git a/man/epix_truncate_versions_after.grouped_epi_archive2.Rd b/man/epix_truncate_versions_after.grouped_epi_archive2.Rd deleted file mode 100644 index 5fba48fb..00000000 --- a/man/epix_truncate_versions_after.grouped_epi_archive2.Rd +++ /dev/null @@ -1,11 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/grouped_archive_new.R -\name{epix_truncate_versions_after.grouped_epi_archive2} -\alias{epix_truncate_versions_after.grouped_epi_archive2} -\title{Truncate versions after a given version, grouped} -\usage{ -\method{epix_truncate_versions_after}{grouped_epi_archive2}(grouped_epi_archive, max_version) -} -\description{ -Truncate versions after a given version, grouped -} diff --git a/man/fill_through_version.epi_archive2.Rd b/man/fill_through_version.epi_archive2.Rd deleted file mode 100644 index 48afb864..00000000 --- a/man/fill_through_version.epi_archive2.Rd +++ /dev/null @@ -1,21 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/archive_new.R -\name{fill_through_version.epi_archive2} -\alias{fill_through_version.epi_archive2} -\title{Fill through version} -\usage{ -\method{fill_through_version}{epi_archive2}(epi_archive, fill_versions_end, how = c("na", "locf")) -} -\arguments{ -\item{epi_archive}{an \code{epi_archive} object} - -\item{fill_versions_end}{as in \code{\link{epix_fill_through_version}}} - -\item{how}{as in \code{\link{epix_fill_through_version}}} -} -\description{ -Fill in unobserved history using requested scheme by mutating -the given object and potentially reseating its fields. See -\code{\link{epix_fill_through_version}}, which doesn't mutate the input archive but -might alias its fields. -} diff --git a/man/group_by.epi_archive.Rd b/man/group_by.epi_archive.Rd index f157e834..782d5f3f 100644 --- a/man/group_by.epi_archive.Rd +++ b/man/group_by.epi_archive.Rd @@ -1,47 +1,26 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/methods-epi_archive.R, R/grouped_archive_new.R, -% R/grouped_epi_archive.R +% Please edit documentation in R/archive.R, R/grouped_epi_archive.R \name{group_by.epi_archive} \alias{group_by.epi_archive} \alias{grouped_epi_archive} -\alias{group_by.grouped_epi_archive2} -\alias{group_by_drop_default.grouped_epi_archive2} -\alias{groups.grouped_epi_archive2} -\alias{ungroup.grouped_epi_archive2} -\alias{is_grouped_epi_archive2} \alias{group_by.grouped_epi_archive} +\alias{group_by_drop_default.grouped_epi_archive} \alias{groups.grouped_epi_archive} \alias{ungroup.grouped_epi_archive} \alias{is_grouped_epi_archive} -\alias{group_by_drop_default.grouped_epi_archive} \title{\code{group_by} and related methods for \code{epi_archive}, \code{grouped_epi_archive}} \usage{ \method{group_by}{epi_archive}(.data, ..., .add = FALSE, .drop = dplyr::group_by_drop_default(.data)) -\method{group_by}{grouped_epi_archive2}( - grouped_epi_archive, - ..., - .add = FALSE, - .drop = dplyr::group_by_drop_default(grouped_epi_archive) -) - -\method{group_by_drop_default}{grouped_epi_archive2}(grouped_epi_archive) - -\method{groups}{grouped_epi_archive2}(grouped_epi_archive) - -\method{ungroup}{grouped_epi_archive2}(grouped_epi_archive, ...) - -is_grouped_epi_archive2(x) - \method{group_by}{grouped_epi_archive}(.data, ..., .add = FALSE, .drop = dplyr::group_by_drop_default(.data)) +\method{group_by_drop_default}{grouped_epi_archive}(.tbl) + \method{groups}{grouped_epi_archive}(x) \method{ungroup}{grouped_epi_archive}(x, ...) 
 is_grouped_epi_archive(x)
-
-\method{group_by_drop_default}{grouped_epi_archive}(.tbl)
 }
 \arguments{
 \item{.data}{An \code{epi_archive} or \code{grouped_epi_archive}}
@@ -71,12 +50,10 @@ grouped by the current grouping variables plus the variable selection from
 \item{.drop}{As described in \code{\link[dplyr:group_by]{dplyr::group_by}}; determines treatment of
 factor columns.}
 
+\item{.tbl}{A \code{grouped_epi_archive} object.}
+
 \item{x}{For \code{groups} or \code{ungroup}: a \code{grouped_epi_archive}; for
 \code{is_grouped_epi_archive}: any object}
-
-\item{.tbl}{(For \code{group_by_drop_default}:) an \code{epi_archive} or
-\code{grouped_epi_archive} (\code{epi_archive} dispatches to the S3 default method;
-\code{grouped_epi_archive} dispatches its own S3 method)}
 }
 \description{
 \code{group_by} and related methods for \code{epi_archive}, \code{grouped_epi_archive}
@@ -97,17 +74,6 @@ the same operations on tibbles and grouped tibbles, which will \emph{not} output
 Using \code{group_by} with \code{.add=FALSE} to override the existing grouping is
 disabled; instead, \code{ungroup} first then \code{group_by}.
 
-Mutation and aliasing: \code{group_by} tries to use a shallow copy of the \code{DT},
-introducing column-level aliasing between its input and its result. This
-doesn't follow the general model for most \code{data.table} operations, which
-seems to be that, given a nonaliased (i.e., unique) pointer to a
-\code{data.table} object, its pointers to its columns should also be nonaliased.
-If you mutate any of the columns of either the input or result, first ensure
-that it is fine if columns of the other are also mutated, but do not rely on
-such behavior to occur. Additionally, never perform mutation on the key
-columns at all (except for strictly increasing transformations), as this will
-invalidate sortedness assumptions about the rows.
-
 \code{group_by_drop_default} on (ungrouped) \code{epi_archive}s is expected to dispatch
 to \code{group_by_drop_default.default} (but there is a dedicated method for
 \code{grouped_epi_archive}s).
diff --git a/man/group_by.epi_archive2.Rd b/man/group_by.epi_archive2.Rd
deleted file mode 100644
index fa9040c3..00000000
--- a/man/group_by.epi_archive2.Rd
+++ /dev/null
@@ -1,147 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/archive_new.R
-\name{group_by.epi_archive2}
-\alias{group_by.epi_archive2}
-\alias{grouped_epi_archive}
-\title{\code{group_by} and related methods for \code{epi_archive}, \code{grouped_epi_archive}}
-\usage{
-\method{group_by}{epi_archive2}(
-  epi_archive,
-  ...,
-  .add = FALSE,
-  .drop = dplyr::group_by_drop_default(epi_archive)
-)
-}
-\arguments{
-\item{...}{Similar to \code{\link[dplyr:group_by]{dplyr::group_by}} (see "Details:" for edge cases);
-\itemize{
-\item For \code{group_by}: unquoted variable name(s) or other
-\link[dplyr:dplyr_data_masking]{"data masking"} expression(s). It's possible to
-use \code{\link[dplyr:mutate]{dplyr::mutate}}-like syntax here to calculate new columns on which to
-perform grouping, but note that, if you are regrouping an already-grouped
-\code{.data} object, the calculations will be carried out ignoring such grouping
-(same as \link[dplyr:group_by]{in dplyr}).
-\item For \code{ungroup}: either
-\itemize{
-\item empty, in order to remove the grouping and output an \code{epi_archive}; or
-\item variable name(s) or other \link[dplyr:dplyr_tidy_select]{"tidy-select"}
-expression(s), in order to remove the matching variables from the list of
-grouping variables, and output another \code{grouped_epi_archive}.
-}
-}}
-
-\item{.add}{Boolean. If \code{FALSE}, the default, the output will be grouped by
-the variable selection from \code{...} only; if \code{TRUE}, the output will be
-grouped by the current grouping variables plus the variable selection from
-\code{...}.}
-
-\item{.drop}{As described in \code{\link[dplyr:group_by]{dplyr::group_by}}; determines treatment of
-factor columns.}
-
-\item{.data}{An \code{epi_archive} or \code{grouped_epi_archive}}
-
-\item{x}{For \code{groups} or \code{ungroup}: a \code{grouped_epi_archive}; for
-\code{is_grouped_epi_archive}: any object}
-
-\item{.tbl}{(For \code{group_by_drop_default}:) an \code{epi_archive} or
-\code{grouped_epi_archive} (\code{epi_archive} dispatches to the S3 default method;
-\code{grouped_epi_archive} dispatches its own S3 method)}
-}
-\description{
-\code{group_by} and related methods for \code{epi_archive}, \code{grouped_epi_archive}
-}
-\details{
-To match \code{dplyr}, \code{group_by} allows "data masking" (also referred to as
-"tidy evaluation") expressions \code{...}, not just column names, in a way similar
-to \code{mutate}. Note that replacing or removing key columns with these
-expressions is disabled.
-
-\code{archive \%>\% group_by()} and other expressions that group or regroup by zero
-columns (indicating that all rows should be treated as part of one large
-group) will output a \code{grouped_epi_archive}, in order to enable the use of
-\code{grouped_epi_archive} methods on the result. This is in slight contrast to
-the same operations on tibbles and grouped tibbles, which will \emph{not} output a
-\code{grouped_df} in these circumstances.
-
-Using \code{group_by} with \code{.add=FALSE} to override the existing grouping is
-disabled; instead, \code{ungroup} first then \code{group_by}.
-
-Mutation and aliasing: \code{group_by} tries to use a shallow copy of the \code{DT},
-introducing column-level aliasing between its input and its result. This
-doesn't follow the general model for most \code{data.table} operations, which
-seems to be that, given a nonaliased (i.e., unique) pointer to a
-\code{data.table} object, its pointers to its columns should also be nonaliased.
-If you mutate any of the columns of either the input or result, first ensure
-that it is fine if columns of the other are also mutated, but do not rely on
-such behavior to occur. Additionally, never perform mutation on the key
-columns at all (except for strictly increasing transformations), as this will
-invalidate sortedness assumptions about the rows.
-
-\code{group_by_drop_default} on (ungrouped) \code{epi_archive}s is expected to dispatch
-to \code{group_by_drop_default.default} (but there is a dedicated method for
-\code{grouped_epi_archive}s).
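A short sketch of the regrouping rule above (illustrative; `archive_cases_dv_subset` is the bundled dataset). Because overriding an existing grouping with `.add = FALSE` is disabled, changing the grouping takes an explicit `ungroup` first:

    library(dplyr)
    grouped <- archive_cases_dv_subset %>% group_by(geo_value)
    # `grouped %>% group_by(<something else>)` would error rather than
    # silently replace the grouping; drop the old grouping first, then
    # regroup (here, trivially, by the same variable):
    regrouped <- grouped %>%
      ungroup() %>%
      group_by(geo_value)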
-} -\examples{ - -grouped_archive <- archive_cases_dv_subset_2 \%>\% group_by(geo_value) - -# `print` for metadata and method listing: -grouped_archive \%>\% print() - -# The primary use for grouping is to perform a grouped `epix_slide`: - -archive_cases_dv_subset_2 \%>\% - group_by(geo_value) \%>\% - epix_slide2( - f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = as.Date("2020-06-11") + 0:2, - new_col_name = "case_rate_3d_av" - ) \%>\% - ungroup() - -# ----------------------------------------------------------------- - -# Advanced: some other features of dplyr grouping are implemented: - -library(dplyr) -toy_archive <- - tribble( - ~geo_value, ~age_group, ~time_value, ~version, ~value, - "us", "adult", "2000-01-01", "2000-01-02", 121, - "us", "pediatric", "2000-01-02", "2000-01-03", 5, # (addition) - "us", "adult", "2000-01-01", "2000-01-03", 125, # (revision) - "us", "adult", "2000-01-02", "2000-01-03", 130 # (addition) - ) \%>\% - mutate( - age_group = ordered(age_group, c("pediatric", "adult")), - time_value = as.Date(time_value), - version = as.Date(version) - ) \%>\% - as_epi_archive2(other_keys = "age_group") - -# The following are equivalent: -toy_archive \%>\% group_by(geo_value, age_group) -toy_archive \%>\% - group_by(geo_value) \%>\% - group_by(age_group, .add = TRUE) -grouping_cols <- c("geo_value", "age_group") -toy_archive \%>\% group_by(across(all_of(grouping_cols))) - -# And these are equivalent: -toy_archive \%>\% group_by(geo_value) -toy_archive \%>\% - group_by(geo_value, age_group) \%>\% - ungroup(age_group) - -# To get the grouping variable names as a `list` of `name`s (a.k.a. symbols): -toy_archive \%>\% - group_by(geo_value) \%>\% - groups() - -toy_archive \%>\% - group_by(geo_value, age_group, .drop = FALSE) \%>\% - epix_slide2(f = ~ sum(.x$value), before = 20) \%>\% - ungroup() - -} diff --git a/man/is_epi_archive.Rd b/man/is_epi_archive.Rd index 2beb3a8c..06669709 100644 --- a/man/is_epi_archive.Rd +++ b/man/is_epi_archive.Rd @@ -25,7 +25,7 @@ is_epi_archive(archive_cases_dv_subset) # TRUE # By default, grouped_epi_archives don't count as epi_archives, as they may # support a different set of operations from regular `epi_archives`. This # behavior can be controlled by `grouped_okay`. -grouped_archive <- archive_cases_dv_subset$group_by(geo_value) +grouped_archive <- archive_cases_dv_subset \%>\% group_by(geo_value) is_epi_archive(grouped_archive) # FALSE is_epi_archive(grouped_archive, grouped_okay = TRUE) # TRUE diff --git a/man/is_epi_archive2.Rd b/man/is_epi_archive2.Rd deleted file mode 100644 index df258d3e..00000000 --- a/man/is_epi_archive2.Rd +++ /dev/null @@ -1,35 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/archive_new.R -\name{is_epi_archive2} -\alias{is_epi_archive2} -\title{Test for \code{epi_archive} format} -\usage{ -is_epi_archive2(x, grouped_okay = FALSE) -} -\arguments{ -\item{x}{An object.} - -\item{grouped_okay}{Optional; Boolean; should a \code{grouped_epi_archive} also -count? Default is \code{FALSE}.} -} -\value{ -\code{TRUE} if the object inherits from \code{epi_archive}. -} -\description{ -Test for \code{epi_archive} format -} -\examples{ -is_epi_archive2(jhu_csse_daily_subset) # FALSE (this is an epi_df, not epi_archive) -is_epi_archive2(archive_cases_dv_subset_2) # TRUE - -# By default, grouped_epi_archives don't count as epi_archives, as they may -# support a different set of operations from regular `epi_archives`. This -# behavior can be controlled by `grouped_okay`. 
-grouped_archive <- archive_cases_dv_subset_2 \%>\% group_by(geo_value)
-is_epi_archive2(grouped_archive) # FALSE
-is_epi_archive2(grouped_archive, grouped_okay = TRUE) # TRUE
-
-}
-\seealso{
-\code{\link{is_grouped_epi_archive}}
-}
diff --git a/man/max_version_with_row_in.Rd b/man/max_version_with_row_in.Rd
index 6f0d35b3..cca554fa 100644
--- a/man/max_version_with_row_in.Rd
+++ b/man/max_version_with_row_in.Rd
@@ -1,25 +1,18 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/archive.R, R/archive_new.R
+% Please edit documentation in R/archive.R
 \name{max_version_with_row_in}
 \alias{max_version_with_row_in}
 \title{\code{max(x$version)}, with error if \code{x} has 0 rows}
 \usage{
-max_version_with_row_in(x)
-
 max_version_with_row_in(x)
 }
 \arguments{
 \item{x}{\code{x} argument of \code{\link{as_epi_archive}}}
 }
 \value{
-\code{max(x$version)} if it has any rows; raises error if it has 0 rows or
-an \code{NA} version value
-
 \code{max(x$version)} if it has any rows; raises error if it has 0 rows or
 an \code{NA} version value
 }
 \description{
-Exported to make defaults more easily copyable.
-
 Exported to make defaults more easily copyable.
 }
diff --git a/man/merge_epi_archive2.Rd b/man/merge_epi_archive2.Rd
deleted file mode 100644
index dd1e671e..00000000
--- a/man/merge_epi_archive2.Rd
+++ /dev/null
@@ -1,30 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/archive_new.R
-\name{merge_epi_archive2}
-\alias{merge_epi_archive2}
-\title{Merge epi archive}
-\usage{
-merge_epi_archive2(
-  x,
-  y,
-  sync = c("forbid", "na", "locf", "truncate"),
-  compactify = TRUE
-)
-}
-\arguments{
-\item{x}{as in \code{\link{epix_merge}}}
-
-\item{y}{as in \code{\link{epix_merge}}}
-
-\item{sync}{as in \code{\link{epix_merge}}}
-
-\item{compactify}{as in \code{\link{epix_merge}}}
-}
-\description{
-Merges another \code{epi_archive} with the current one, mutating the
-current one by reseating its \code{DT} and several other fields, but avoiding
-mutation of the old \code{DT}; returns the current archive
-\link[base:invisible]{invisibly}. See \code{\link{epix_merge}} for a full description
-of the non-R6-method version, which does not mutate either archive, and
-does not alias either archive's \code{DT}.
-}
diff --git a/man/new_epi_archive2.Rd b/man/new_epi_archive2.Rd
deleted file mode 100644
index 52141190..00000000
--- a/man/new_epi_archive2.Rd
+++ /dev/null
@@ -1,69 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/archive_new.R
-\name{new_epi_archive2}
-\alias{new_epi_archive2}
-\title{New epi archive}
-\usage{
-new_epi_archive2(
-  x,
-  geo_type = NULL,
-  time_type = NULL,
-  other_keys = NULL,
-  additional_metadata = NULL,
-  compactify = NULL,
-  clobberable_versions_start = NA,
-  versions_end = NULL
-)
-}
-\arguments{
-\item{x}{A data.frame, data.table, or tibble, with columns \code{geo_value},
-\code{time_value}, \code{version}, and then any additional number of columns.}
-
-\item{geo_type}{Type for the geo values. If missing, then the function will
-attempt to infer it from the geo values present; if this fails, then it
-will be set to "custom".}
-
-\item{time_type}{Type for the time values. If missing, then the function will
-attempt to infer it from the time values present; if this fails, then it
-will be set to "custom".}
-
-\item{other_keys}{Character vector specifying the names of variables in \code{x}
-that should be considered key variables (in the language of \code{data.table})
-apart from "geo_value", "time_value", and "version".}
-
-\item{additional_metadata}{List of additional metadata to attach to the
-\code{epi_archive} object. The metadata will have \code{geo_type} and \code{time_type}
-fields; named entries from the passed list will be included as well.}
-
-\item{compactify}{Optional; Boolean or \code{NULL}: should we remove rows that are
-considered redundant for the purposes of \code{epi_archive}'s built-in methods
-such as \code{as_of}? As these methods use the last version of each observation
-carried forward (LOCF) to interpolate between the version data provided,
-rows that don't change these LOCF results can potentially be omitted to
-save space while maintaining the same behavior (with the help of the
-\code{clobberable_versions_start} and \code{versions_end} fields in some edge cases).
-\code{TRUE} will remove these rows, \code{FALSE} will not, and missing or \code{NULL} will
-remove these rows and issue a warning. Generally, this can be set to
-\code{TRUE}, but if you directly inspect or edit the fields of the \code{epi_archive}
-such as its \code{DT}, or rely on redundant updates to achieve a certain
-behavior of the \code{ref_time_values} default in \code{epix_slide}, you will have to
-determine whether \code{compactify=TRUE} will produce the desired results. If
-compactification here is removing a large proportion of the rows, this may
-indicate a potential for space, time, or bandwidth savings upstream in the
-data pipeline, e.g., by avoiding fetching, storing, or processing these
-rows of \code{x}.}
-
-\item{clobberable_versions_start}{Optional; as in \code{\link{as_epi_archive}}}
-
-\item{versions_end}{Optional; as in \code{\link{as_epi_archive}}}
-}
-\value{
-An \code{epi_archive} object.
-}
-\description{
-Creates a new \code{epi_archive} object.
-}
-\details{
-Refer to the documentation for \code{\link[=as_epi_archive]{as_epi_archive()}} for more information
-and examples of parameter names.
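The LOCF-redundancy rule in the `compactify` entry above is easiest to see on a toy archive (a sketch of the documented semantics; the values are made up):

    toy <- tibble::tibble(
      geo_value = "ca",
      time_value = as.Date("2020-06-01"),
      version = as.Date("2020-06-01") + 0:2,
      value = c(1, 1, 2) # the second update only repeats the first one's value
    )
    nrow(as_epi_archive(toy, compactify = TRUE)$DT)  # 2: the LOCF row is dropped
    nrow(as_epi_archive(toy, compactify = FALSE)$DT) # 3: every update is kept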
-} diff --git a/man/next_after.Rd b/man/next_after.Rd index 82fd3ebb..5170e8d9 100644 --- a/man/next_after.Rd +++ b/man/next_after.Rd @@ -1,23 +1,17 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/archive.R, R/archive_new.R +% Please edit documentation in R/archive.R \name{next_after} \alias{next_after} \title{Get the next possible value greater than \code{x} of the same type} \usage{ -next_after(x) - next_after(x) } \arguments{ \item{x}{the starting "value"(s)} } \value{ -same class, typeof, and length as \code{x} - same class, typeof, and length as \code{x} } \description{ -Get the next possible value greater than \code{x} of the same type - Get the next possible value greater than \code{x} of the same type } diff --git a/man/print.epi_archive2.Rd b/man/print.epi_archive.Rd similarity index 56% rename from man/print.epi_archive2.Rd rename to man/print.epi_archive.Rd index 0105c47e..6f823ccd 100644 --- a/man/print.epi_archive2.Rd +++ b/man/print.epi_archive.Rd @@ -1,12 +1,16 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/archive_new.R -\name{print.epi_archive2} -\alias{print.epi_archive2} +% Please edit documentation in R/archive.R +\name{print.epi_archive} +\alias{print.epi_archive} \title{Print information about an \code{epi_archive} object} \usage{ -\method{print}{epi_archive2}(epi_archive, class = TRUE, methods = TRUE) +\method{print}{epi_archive}(x, ..., class = TRUE, methods = TRUE) } \arguments{ +\item{x}{An \code{epi_archive} object.} + +\item{...}{Should be empty, there to satisfy the S3 generic.} + \item{class}{Boolean; whether to print the class label header} \item{methods}{Boolean; whether to print all available methods of diff --git a/man/slide.epi_archive2.Rd b/man/slide.epi_archive2.Rd deleted file mode 100644 index 54db5636..00000000 --- a/man/slide.epi_archive2.Rd +++ /dev/null @@ -1,101 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/archive_new.R -\name{slide.epi_archive2} -\alias{slide.epi_archive2} -\title{Slide over epi archive} -\usage{ -\method{slide}{epi_archive2}( - epi_archive, - f, - ..., - before, - ref_time_values, - time_step, - new_col_name = "slide_value", - as_list_col = FALSE, - names_sep = "_", - all_versions = FALSE -) -} -\arguments{ -\item{f}{Function, formula, or missing; together with \code{...} specifies the -computation to slide. To "slide" means to apply a computation over a -sliding (a.k.a. "rolling") time window for each data group. The window is -determined by the \code{before} parameter described below. One time step is -typically one day or one week; see \code{\link{epi_slide}} details for more -explanation. If a function, \code{f} must take an \code{epi_df} with the same -column names as the archive's \code{DT}, minus the \code{version} column; followed -by a one-row tibble containing the values of the grouping variables for -the associated group; followed by a reference time value, usually as a -\code{Date} object; followed by any number of named arguments. If a formula, -\code{f} can operate directly on columns accessed via \code{.x$var} or \code{.$var}, as -in \code{~ mean (.x$var)} to compute a mean of a column \code{var} for each -group-\code{ref_time_value} combination. The group key can be accessed via -\code{.y} or \code{.group_key}, and the reference time value can be accessed via -\code{.z} or \code{.ref_time_value}. 
If \code{f} is missing, then \code{...} will specify the -computation.} - -\item{...}{Additional arguments to pass to the function or formula specified -via \code{f}. Alternatively, if \code{f} is missing, then \code{...} is interpreted as an -expression for tidy evaluation; in addition to referring to columns -directly by name, the expression has access to \code{.data} and \code{.env} pronouns -as in \code{dplyr} verbs, and can also refer to the \code{.group_key} and -\code{.ref_time_value}. See details of \code{\link{epi_slide}}.} - -\item{before}{How far \code{before} each \code{ref_time_value} should the sliding -window extend? If provided, should be a single, non-NA, -\link[vctrs:vec_cast]{integer-compatible} number of time steps. This window -endpoint is inclusive. For example, if \code{before = 7}, and one time step is -one day, then to produce a value for a \code{ref_time_value} of January 8, we -apply the given function or formula to data (for each group present) with -\code{time_value}s from January 1 onward, as they were reported on January 8. -For typical disease surveillance sources, this will not include any data -with a \code{time_value} of January 8, and, depending on the amount of reporting -latency, may not include January 7 or even earlier \code{time_value}s. (If -instead the archive were to hold nowcasts instead of regular surveillance -data, then we would indeed expect data for \code{time_value} January 8. If it -were to hold forecasts, then we would expect data for \code{time_value}s after -January 8, and the sliding window would extend as far after each -\code{ref_time_value} as needed to include all such \code{time_value}s.)} - -\item{ref_time_values}{Reference time values / versions for sliding -computations; each element of this vector serves both as the anchor point -for the \code{time_value} window for the computation and the \code{max_version} -\code{as_of} which we fetch data in this window. If missing, then this will set -to a regularly-spaced sequence of values set to cover the range of -\code{version}s in the \code{DT} plus the \code{versions_end}; the spacing of values will -be guessed (using the GCD of the skips between values).} - -\item{time_step}{Optional function used to define the meaning of one time -step, which if specified, overrides the default choice based on the -\code{time_value} column. This function must take a positive integer and return -an object of class \code{lubridate::period}. For example, we can use \code{time_step = lubridate::hours} in order to set the time step to be one hour (this -would only be meaningful if \code{time_value} is of class \code{POSIXct}).} - -\item{new_col_name}{String indicating the name of the new column that will -contain the derivative values. Default is "slide_value"; note that setting -\code{new_col_name} equal to an existing column name will overwrite this column.} - -\item{as_list_col}{Should the slide results be held in a list column, or be -\link[tidyr:chop]{unchopped}/\link[tidyr:unnest]{unnested}? Default is \code{FALSE}, -in which case a list object returned by \code{f} would be unnested (using -\code{\link[tidyr:unnest]{tidyr::unnest()}}), and, if the slide computations output data frames, -the names of the resulting columns are given by prepending \code{new_col_name} -to the names of the list elements.} - -\item{names_sep}{String specifying the separator to use in \code{tidyr::unnest()} -when \code{as_list_col = FALSE}. Default is "_". 
Using \code{NULL} drops the prefix -from \code{new_col_name} entirely.} - -\item{all_versions}{(Not the same as \code{all_rows} parameter of \code{epi_slide}.) If -\code{all_versions = TRUE}, then \code{f} will be passed the version history (all -\code{version <= ref_time_value}) for rows having \code{time_value} between -\code{ref_time_value - before} and \code{ref_time_value}. Otherwise, \code{f} will be -passed only the most recent \code{version} for every unique \code{time_value}. -Default is \code{FALSE}.} -} -\description{ -Slides a given function over variables in an \code{epi_archive} -object. See the documentation for the wrapper function \code{\link[=epix_slide]{epix_slide()}} for -details. The parameter descriptions below are copied from there -} diff --git a/man/slide.grouped_epi_archive2.Rd b/man/slide.grouped_epi_archive2.Rd deleted file mode 100644 index b5aac24c..00000000 --- a/man/slide.grouped_epi_archive2.Rd +++ /dev/null @@ -1,24 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/grouped_archive_new.R -\name{slide.grouped_epi_archive2} -\alias{slide.grouped_epi_archive2} -\title{Slide over grouped epi archive} -\usage{ -\method{slide}{grouped_epi_archive2}( - grouped_epi_archive, - f, - ..., - before, - ref_time_values, - time_step, - new_col_name = "slide_value", - as_list_col = FALSE, - names_sep = "_", - all_versions = FALSE -) -} -\description{ -Slides a given function over variables in a \code{grouped_epi_archive} -object. See the documentation for the wrapper function \code{\link[=epix_slide]{epix_slide()}} for -details. -} diff --git a/man/truncate_versions_after.epi_archive2.Rd b/man/truncate_versions_after.epi_archive2.Rd deleted file mode 100644 index 08ae40d4..00000000 --- a/man/truncate_versions_after.epi_archive2.Rd +++ /dev/null @@ -1,19 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/archive_new.R -\name{truncate_versions_after.epi_archive2} -\alias{truncate_versions_after.epi_archive2} -\title{Truncate versions after} -\usage{ -\method{truncate_versions_after}{epi_archive2}(epi_archive, max_version) -} -\arguments{ -\item{epi_archive}{as in \code{\link{epix_truncate_versions_after}}} - -\item{max_version}{as in \code{\link{epix_truncate_versions_after}}} -} -\description{ -Filter to keep only older versions, mutating the archive by -potentially reseating but not mutating some fields. \code{DT} is likely, but not -guaranteed, to be copied. Returns the mutated archive -\link[base:invisible]{invisibly}. -} diff --git a/man/truncate_versions_after.grouped_epi_archive2.Rd b/man/truncate_versions_after.grouped_epi_archive2.Rd deleted file mode 100644 index 7c25950f..00000000 --- a/man/truncate_versions_after.grouped_epi_archive2.Rd +++ /dev/null @@ -1,18 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/grouped_archive_new.R -\name{truncate_versions_after.grouped_epi_archive2} -\alias{truncate_versions_after.grouped_epi_archive2} -\title{Truncate versions after a given version, grouped} -\usage{ -\method{truncate_versions_after}{grouped_epi_archive2}(grouped_epi_archive, max_version) -} -\arguments{ -\item{max_version}{as in \code{\link{epix_truncate_versions_after}}} - -\item{x}{as in \code{\link{epix_truncate_versions_after}}} -} -\description{ -Filter to keep only older versions by mutating the underlying -\code{epi_archive} using \verb{$truncate_versions_after}. Returns the mutated -\code{grouped_epi_archive} \link[base:invisible]{invisibly}. 
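A compact sketch of the `before` window arithmetic described above, using the `epix_slide` wrapper these docs point to and the bundled `archive_cases_dv_subset` dataset (counting rows rather than computing anything real):

    library(dplyr)
    archive_cases_dv_subset %>%
      group_by(geo_value) %>%
      epix_slide(
        f = ~ nrow(.x), # how many `time_value`s the window actually held
        before = 2,
        ref_time_values = as.Date("2020-06-10"),
        new_col_name = "n_in_window"
      ) %>%
      ungroup()
    # `before = 2` requests `time_value`s 2020-06-08 through 2020-06-10 as they
    # were reported as of 2020-06-10; reporting latency typically leaves fewer
    # than 3 rows in the window.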
-} diff --git a/tests/testthat/test-archive-version-bounds.R b/tests/testthat/test-archive-version-bounds.R index 47506152..d78167d7 100644 --- a/tests/testthat/test-archive-version-bounds.R +++ b/tests/testthat/test-archive-version-bounds.R @@ -111,12 +111,12 @@ test_that("archive version bounds args work as intended", { ) expect_error(as_epi_archive(update_tbl, versions_end = NA), regexp = "must have the same classes") ea_default <- as_epi_archive(update_tbl) - ea_default$as_of(measurement_date + 4L) + ea_default %>% epix_as_of(measurement_date + 4L) expect_warning( regexp = NA, - ea_default$as_of(measurement_date + 5L), - class = "epiprocess__snapshot_as_of_clobberable_version" + ea_default %>% epix_as_of(measurement_date + 5L), + class = "epiprocess__snapshot_epix_as_of_clobberable_version" ) - ea_default$as_of(measurement_date + 5L) - expect_error(ea_default$as_of(measurement_date + 6L)) + ea_default %>% epix_as_of(measurement_date + 5L) + expect_error(ea_default %>% epix_as_of(measurement_date + 6L)) }) diff --git a/tests/testthat/test-archive_new.R b/tests/testthat/test-archive_new.R deleted file mode 100644 index 98f708d7..00000000 --- a/tests/testthat/test-archive_new.R +++ /dev/null @@ -1,173 +0,0 @@ -library(dplyr) - -test_that("first input must be a data.frame", { - expect_error(as_epi_archive2(c(1, 2, 3), compactify = FALSE), - regexp = "Must be of type 'data.frame'." - ) -}) - -dt <- archive_cases_dv_subset_2$DT - -test_that("data.frame must contain geo_value, time_value and version columns", { - expect_error(as_epi_archive2(select(dt, -geo_value), compactify = FALSE), - regexp = "Columns `geo_value`, `time_value`, and `version` must be present in `x`." - ) - expect_error(as_epi_archive2(select(dt, -time_value), compactify = FALSE), - regexp = "Columns `geo_value`, `time_value`, and `version` must be present in `x`." - ) - expect_error(as_epi_archive2(select(dt, -version), compactify = FALSE), - regexp = "Columns `geo_value`, `time_value`, and `version` must be present in `x`." - ) -}) - -test_that("other_keys can only contain names of the data.frame columns", { - expect_error(as_epi_archive2(dt, other_keys = "xyz", compactify = FALSE), - regexp = "`other_keys` must be contained in the column names of `x`." - ) - expect_error(as_epi_archive2(dt, other_keys = "percent_cli", compactify = FALSE), NA) -}) - -test_that("other_keys cannot contain names geo_value, time_value or version", { - expect_error(as_epi_archive2(dt, other_keys = "geo_value", compactify = FALSE), - regexp = "`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\"." - ) - expect_error(as_epi_archive2(dt, other_keys = "time_value", compactify = FALSE), - regexp = "`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\"." - ) - expect_error(as_epi_archive2(dt, other_keys = "version", compactify = FALSE), - regexp = "`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\"." - ) -}) - -test_that("Warning thrown when other_metadata contains overlapping names with geo_type or time_type fields", { - expect_warning(as_epi_archive2(dt, additional_metadata = list(geo_type = 1), compactify = FALSE), - regexp = "`additional_metadata` names overlap with existing metadata fields \"geo_type\", \"time_type\"." - ) - expect_warning(as_epi_archive2(dt, additional_metadata = list(time_type = 1), compactify = FALSE), - regexp = "`additional_metadata` names overlap with existing metadata fields \"geo_type\", \"time_type\"." 
- ) -}) - -test_that("epi_archives are correctly instantiated with a variety of data types", { - # Data frame - df <- data.frame( - geo_value = "ca", - time_value = as.Date("2020-01-01"), - version = as.Date("2020-01-01") + 0:19, - value = 1:20 - ) - - ea1 <- as_epi_archive2(df, compactify = FALSE) - expect_equal(key(ea1$DT), c("geo_value", "time_value", "version")) - expect_equal(ea1$additional_metadata, list()) - - ea2 <- as_epi_archive2(df, other_keys = "value", additional_metadata = list(value = df$value), compactify = FALSE) - expect_equal(key(ea2$DT), c("geo_value", "time_value", "value", "version")) - expect_equal(ea2$additional_metadata, list(value = df$value)) - - # Tibble - tib <- tibble::tibble(df, code = "x") - - ea3 <- as_epi_archive2(tib, compactify = FALSE) - expect_equal(key(ea3$DT), c("geo_value", "time_value", "version")) - expect_equal(ea3$additional_metadata, list()) - - ea4 <- as_epi_archive2(tib, other_keys = "code", additional_metadata = list(value = df$value), compactify = FALSE) - expect_equal(key(ea4$DT), c("geo_value", "time_value", "code", "version")) - expect_equal(ea4$additional_metadata, list(value = df$value)) - - # Keyed data.table - kdt <- data.table::data.table( - geo_value = "ca", - time_value = as.Date("2020-01-01"), - version = as.Date("2020-01-01") + 0:19, - value = 1:20, - code = "CA", - key = "code" - ) - - ea5 <- as_epi_archive2(kdt, compactify = FALSE) - # Key from data.table isn't absorbed when as_epi_archive2 is used - expect_equal(key(ea5$DT), c("geo_value", "time_value", "version")) - expect_equal(ea5$additional_metadata, list()) - - ea6 <- as_epi_archive2(kdt, other_keys = "value", additional_metadata = list(value = df$value), compactify = FALSE) - # Mismatched keys, but the one from as_epi_archive2 overrides - expect_equal(key(ea6$DT), c("geo_value", "time_value", "value", "version")) - expect_equal(ea6$additional_metadata, list(value = df$value)) - - # Unkeyed data.table - udt <- data.table::data.table( - geo_value = "ca", - time_value = as.Date("2020-01-01"), - version = as.Date("2020-01-01") + 0:19, - value = 1:20, - code = "CA" - ) - - ea7 <- as_epi_archive2(udt, compactify = FALSE) - expect_equal(key(ea7$DT), c("geo_value", "time_value", "version")) - expect_equal(ea7$additional_metadata, list()) - - ea8 <- as_epi_archive2(udt, other_keys = "code", additional_metadata = list(value = df$value), compactify = FALSE) - expect_equal(key(ea8$DT), c("geo_value", "time_value", "code", "version")) - expect_equal(ea8$additional_metadata, list(value = df$value)) - - # epi_df - edf1 <- jhu_csse_daily_subset %>% - select(geo_value, time_value, cases) %>% - mutate(version = max(time_value), code = "USA") - - ea9 <- as_epi_archive2(edf1, compactify = FALSE) - expect_equal(key(ea9$DT), c("geo_value", "time_value", "version")) - expect_equal(ea9$additional_metadata, list()) - - ea10 <- as_epi_archive2(edf1, other_keys = "code", additional_metadata = list(value = df$value), compactify = FALSE) - expect_equal(key(ea10$DT), c("geo_value", "time_value", "code", "version")) - expect_equal(ea10$additional_metadata, list(value = df$value)) - - # Keyed epi_df - edf2 <- data.frame( - geo_value = "al", - time_value = rep(as.Date("2020-01-01") + 0:9, 2), - version = c( - rep(as.Date("2020-01-25"), 10), - rep(as.Date("2020-01-26"), 10) - ), - cases = 1:20, - misc = "USA" - ) %>% - as_epi_df(additional_metadata = list(other_keys = "misc")) - - ea11 <- as_epi_archive2(edf2, compactify = FALSE) - expect_equal(key(ea11$DT), c("geo_value", "time_value", "version")) - 
expect_equal(ea11$additional_metadata, list()) - - ea12 <- as_epi_archive2(edf2, other_keys = "misc", additional_metadata = list(value = df$misc), compactify = FALSE) - expect_equal(key(ea12$DT), c("geo_value", "time_value", "misc", "version")) - expect_equal(ea12$additional_metadata, list(value = df$misc)) -}) - -test_that("`epi_archive` rejects nonunique keys", { - toy_update_tbl <- - tibble::tribble( - ~geo_value, ~age_group, ~time_value, ~version, ~value, - "us", "adult", "2000-01-01", "2000-01-02", 121, - "us", "adult", "2000-01-01", "2000-01-03", 125, # (revision) - "us", "adult", "2000-01-02", "2000-01-03", 130, - "us", "pediatric", "2000-01-01", "2000-01-02", 5 - ) %>% - mutate( - age_group = ordered(age_group, c("pediatric", "adult")), - time_value = as.Date(time_value), - version = as.Date(version) - ) - expect_error( - as_epi_archive2(toy_update_tbl), - class = "epiprocess__epi_archive_requires_unique_key" - ) - expect_error( - regexp = NA, - as_epi_archive2(toy_update_tbl, other_keys = "age_group"), - ) -}) diff --git a/tests/testthat/test-compactify.R b/tests/testthat/test-compactify.R index 58e97884..263d67b7 100644 --- a/tests/testthat/test-compactify.R +++ b/tests/testthat/test-compactify.R @@ -2,7 +2,7 @@ library(epiprocess) library(data.table) library(dplyr) -dt <- archive_cases_dv_subset_2$DT +dt <- archive_cases_dv_subset$DT dt <- filter(dt, geo_value == "ca") %>% filter(version <= "2020-06-15") %>% select(-case_rate_7d_av) @@ -84,8 +84,8 @@ test_that("as_of produces the same results with compactify=TRUE as with compacti # Row 22, an LOCF row corresponding to the latest version, is omitted in # ea_true latest_version <- max(ea_false$DT$version) - as_of_true <- ea_true$as_of(latest_version) - as_of_false <- ea_false$as_of(latest_version) + as_of_true <- epix_as_of(ea_true, latest_version) + as_of_false <- epix_as_of(ea_false, latest_version) expect_identical(as_of_true, as_of_false) }) diff --git a/tests/testthat/test-compactify_new.R b/tests/testthat/test-compactify_new.R deleted file mode 100644 index cd53913d..00000000 --- a/tests/testthat/test-compactify_new.R +++ /dev/null @@ -1,110 +0,0 @@ -library(epiprocess) -library(data.table) -library(dplyr) - -dt <- archive_cases_dv_subset_2$DT -dt <- filter(dt, geo_value == "ca") %>% - filter(version <= "2020-06-15") %>% - select(-case_rate_7d_av) - -test_that("Input for compactify must be NULL or a boolean", { - expect_error(as_epi_archive2(dt, compactify = "no")) -}) - -dt$percent_cli <- c(1:80) -dt$case_rate <- c(1:80) - -row_replace <- function(dt, row, x, y) { - # (This way of "replacing" elements appears to use copy-on-write even though - # we are working with a data.table.) - dt[row, 4] <- x - dt[row, 5] <- y - dt -} - -# Note that compactify is working on version-wise LOCF (last version of each -# observation carried forward) - -# Rows 1 should not be eliminated even if NA -dt <- row_replace(dt, 1, NA, NA) # Not LOCF - -# NOTE! We are assuming that there are no NA's in geo_value, time_value, -# and version. 
Even though compactify may erroneously remove the first row -# if it has all NA's, we are not testing this behaviour for now as this dataset -# has problems beyond the scope of this test - -# Rows 11 and 12 correspond to different time_values -dt <- row_replace(dt, 12, 11, 11) # Not LOCF - -# Rows 20 and 21 only differ in version -dt <- row_replace(dt, 21, 20, 20) # LOCF - -# Rows 21 and 22 only differ in version -dt <- row_replace(dt, 22, 20, 20) # LOCF - -# Row 39 comprises the first NA's -dt <- row_replace(dt, 39, NA, NA) # Not LOCF - -# Row 40 has two NA's, just like its lag, row 39 -dt <- row_replace(dt, 40, NA, NA) # LOCF - -# Row 62's values already exist in row 15, but row 15 is not a preceding row -dt <- row_replace(dt, 62, 15, 15) # Not LOCF - -# Row 73 only has one value carried over -dt <- row_replace(dt, 74, 73, 74) # Not LOCF - -dt_true <- as_tibble(as_epi_archive2(dt, compactify = TRUE)$DT) -dt_false <- as_tibble(as_epi_archive2(dt, compactify = FALSE)$DT) -dt_null <- suppressWarnings(as_tibble(as_epi_archive2(dt, compactify = NULL)$DT)) - -test_that("Warning for LOCF with compactify as NULL", { - expect_warning(as_epi_archive2(dt, compactify = NULL)) -}) - -test_that("No warning when there is no LOCF", { - expect_warning(as_epi_archive2(dt[1:5], compactify = NULL), NA) -}) - -test_that("LOCF values are ignored with compactify=FALSE", { - expect_identical(nrow(dt), nrow(dt_false)) -}) - -test_that("LOCF values are taken out with compactify=TRUE", { - dt_test <- as_tibble(as_epi_archive2(dt[-c(21, 22, 40), ], compactify = FALSE)$DT) - - expect_identical(dt_true, dt_null) - expect_identical(dt_null, dt_test) -}) - -test_that("as_of produces the same results with compactify=TRUE as with compactify=FALSE", { - ea_true <- as_epi_archive2(dt, compactify = TRUE) - ea_false <- as_epi_archive2(dt, compactify = FALSE) - - # Row 22, an LOCF row corresponding to the latest version, is omitted in - # ea_true - latest_version <- max(ea_false$DT$version) - as_of_true <- as_of(ea_true, latest_version) - as_of_false <- as_of(ea_false, latest_version) - - expect_identical(as_of_true, as_of_false) -}) - -test_that("compactify does not alter the default clobberable and observed version bounds", { - x <- tibble::tibble( - geo_value = "geo1", - time_value = as.Date("2000-01-01"), - version = as.Date("2000-01-01") + 1:5, - value = 42L - ) - ea_true <- as_epi_archive2(x, compactify = TRUE) - ea_false <- as_epi_archive2(x, compactify = FALSE) - # We say that we base the bounds on the user's `x` arg. We might mess up or - # change our minds and base things on the `DT` field (or a temporary `DT` - # variable, post-compactify) instead. Check that this test would trigger - # in that case: - expect_true(max(ea_true$DT$version) != max(ea_false$DT$version)) - # The actual test: - expect_identical(ea_true$clobberable_versions_start, ea_false$clobberable_versions_start) - expect_identical(ea_true$versions_end, ea_false$versions_end) -}) diff --git a/tests/testthat/test-deprecations.R b/tests/testthat/test-deprecations.R index 5be3824e..7d29149b 100644 --- a/tests/testthat/test-deprecations.R +++ b/tests/testthat/test-deprecations.R @@ -5,8 +5,8 @@ test_that("epix_slide group_by= deprecation works", { class = "epiprocess__epix_slide_group_by_parameter_deprecated" ) expect_error( - archive_cases_dv_subset$ - slide(function(...) {}, before = 2L, group_by = c()), + archive_cases_dv_subset %>% + epix_slide(function(...) 
{}, before = 2L, group_by = c()), class = "epiprocess__epix_slide_group_by_parameter_deprecated" ) expect_error( @@ -16,9 +16,9 @@ test_that("epix_slide group_by= deprecation works", { class = "epiprocess__epix_slide_group_by_parameter_deprecated" ) expect_error( - archive_cases_dv_subset$ - group_by(geo_value)$ - slide(function(...) {}, before = 2L, group_by = c()), + archive_cases_dv_subset %>% + group_by(geo_value) %>% + epix_slide(function(...) {}, before = 2L, group_by = c()), class = "epiprocess__epix_slide_group_by_parameter_deprecated" ) # @@ -28,8 +28,8 @@ test_that("epix_slide group_by= deprecation works", { class = "epiprocess__epix_slide_all_rows_parameter_deprecated" ) expect_error( - archive_cases_dv_subset$ - slide(function(...) {}, before = 2L, all_rows = TRUE), + archive_cases_dv_subset %>% + epix_slide(function(...) {}, before = 2L, all_rows = TRUE), class = "epiprocess__epix_slide_all_rows_parameter_deprecated" ) expect_error( @@ -39,9 +39,9 @@ test_that("epix_slide group_by= deprecation works", { class = "epiprocess__epix_slide_all_rows_parameter_deprecated" ) expect_error( - archive_cases_dv_subset$ - group_by(geo_value)$ - slide(function(...) {}, before = 2L, all_rows = TRUE), + archive_cases_dv_subset %>% + group_by(geo_value) %>% + epix_slide(function(...) {}, before = 2L, all_rows = TRUE), class = "epiprocess__epix_slide_all_rows_parameter_deprecated" ) }) diff --git a/tests/testthat/test-epix_fill_through_version.R b/tests/testthat/test-epix_fill_through_version.R index 9ba847fa..89bb4804 100644 --- a/tests/testthat/test-epix_fill_through_version.R +++ b/tests/testthat/test-epix_fill_through_version.R @@ -7,15 +7,7 @@ test_that("epix_fill_through_version mirrors input when it is sufficiently up to ea_trivial_fill_na1 <- epix_fill_through_version(ea_orig, some_earlier_observed_version, "na") ea_trivial_fill_na2 <- epix_fill_through_version(ea_orig, ea_orig$versions_end, "na") ea_trivial_fill_locf <- epix_fill_through_version(ea_orig, some_earlier_observed_version, "locf") - # Below, we want R6 objects to be compared based on contents rather than - # addresses. We appear to get this with `expect_identical` in `testthat` - # edition 3, which is based on `waldo::compare` rather than `base::identical`; - # `waldo::compare` in waldo >=0.3.1 appears (as of 0.4.0) to compare R6 - # objects by contents rather than address (in a way that is tested but maybe - # not guaranteed via user docs). Use `testthat::local_edition` to ensure we - # use testthat edition 3 here (use `testthat::` to prevent ambiguity with - # `readr`). - testthat::local_edition(3) + expect_identical(ea_orig, ea_trivial_fill_na1) expect_identical(ea_orig, ea_trivial_fill_na2) expect_identical(ea_orig, ea_trivial_fill_locf) @@ -33,20 +25,17 @@ test_that("epix_fill_through_version can extend observed versions, gives expecte ea_fill_na <- epix_fill_through_version(ea_orig, later_unobserved_version, "na") ea_fill_locf <- epix_fill_through_version(ea_orig, later_unobserved_version, "locf") - # We use testthat edition 3 features here, passing `ignore_attr` to - # `waldo::compare`. 
Ensure we are using edition 3: - testthat::local_edition(3) withCallingHandlers( { expect_identical(ea_fill_na$versions_end, later_unobserved_version) - expect_identical(tibble::as_tibble(ea_fill_na$as_of(first_unobserved_version)), + expect_identical(tibble::as_tibble(epix_as_of(ea_fill_na, first_unobserved_version)), tibble::tibble(geo_value = "g1", time_value = as.Date("2020-01-01") + 0:1, value = rep(NA_integer_, 2L)), ignore_attr = TRUE ) expect_identical(ea_fill_locf$versions_end, later_unobserved_version) expect_identical( - ea_fill_locf$as_of(first_unobserved_version), - ea_fill_locf$as_of(ea_orig$versions_end) %>% + epix_as_of(ea_fill_locf, first_unobserved_version), + epix_as_of(ea_fill_locf, ea_orig$versions_end) %>% { attr(., "metadata")$as_of <- first_unobserved_version . @@ -69,54 +58,31 @@ test_that("epix_fill_through_version does not mutate x", { # doesn't seem sufficient to trigger) as_epi_archive(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, value = 10L)) )) { - # We want to perform a strict comparison of the contents of `ea_orig` before - # and `ea_orig` after. `clone` + `expect_identical` based on waldo would - # sort of work, but we might want something stricter. `as.list` + - # `identical` plus a check of the DT seems to do the trick. - ea_orig_before_as_list <- as.list(ea_orig) + ea_orig_before <- clone(ea_orig) ea_orig_dt_before_copy <- data.table::copy(ea_orig$DT) some_unobserved_version <- 8L - # + ea_fill_na <- epix_fill_through_version(ea_orig, some_unobserved_version, "na") - ea_orig_after_as_list <- as.list(ea_orig) - # use identical, not expect_identical, for the R6-as-list test; latter isn't as strict - expect_true(identical(ea_orig_before_as_list, ea_orig_after_as_list)) + expect_identical(ea_orig_before, ea_orig) expect_identical(ea_orig_dt_before_copy, ea_orig$DT) - # + ea_fill_locf <- epix_fill_through_version(ea_orig, some_unobserved_version, "locf") - ea_orig_after_as_list <- as.list(ea_orig) - expect_true(identical(ea_orig_before_as_list, ea_orig_after_as_list)) + expect_identical(ea_orig_before, ea_orig) expect_identical(ea_orig_dt_before_copy, ea_orig$DT) } }) -test_that("x$fill_through_version mutates x (if needed)", { - ea <- as_epi_archive(data.table::data.table( - geo_value = "g1", time_value = as.Date("2020-01-01"), - version = 1:5, value = 1:5 - )) - # We want the contents to change in a substantial way that makes waldo compare - # different (if the contents need to change). 
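# A sketch of the two fill schemes exercised by the surrounding tests (toy
# archive with integer versions; illustrative, not part of the test file):
ea_sketch <- as_epi_archive(data.table::data.table(
  geo_value = "g1", time_value = as.Date("2020-01-01"),
  version = 1:5, value = 1:5
))
# "na": versions 6 through 8 exist but report NA for every observation.
ea_na <- epix_fill_through_version(ea_sketch, 8L, "na")
# "locf": versions 6 through 8 exist and simply repeat the version-5 data.
ea_locf <- epix_fill_through_version(ea_sketch, 8L, "locf")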
- ea_before_copies_as_list <- lapply(ea, data.table::copy) - some_unobserved_version <- 8L - ea$fill_through_version(some_unobserved_version, "na") - ea_after_copies_as_list <- lapply(ea, data.table::copy) - expect_failure(expect_identical(ea_before_copies_as_list, ea_after_copies_as_list)) -}) - -test_that("{epix_,$}fill_through_version return with expected visibility", { +test_that("epix_fill_through_version return with expected visibility", { ea <- as_epi_archive(data.table::data.table( geo_value = "g1", time_value = as.Date("2020-01-01"), version = 1:5, value = 1:5 )) expect_true(withVisible(epix_fill_through_version(ea, 10L, "na"))[["visible"]]) - expect_false(withVisible(ea$fill_through_version(15L, "na"))[["visible"]]) }) test_that("epix_fill_through_version returns same key & doesn't mutate old DT or its key", { ea <- as_epi_archive(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, value = 10L)) - old_dt <- ea$DT - old_dt_copy <- data.table::copy(old_dt) + old_dt_copy <- data.table::copy(ea$DT) old_key <- data.table::key(ea$DT) expect_identical(data.table::key(epix_fill_through_version(ea, 5L, "na")$DT), old_key) expect_identical(data.table::key(epix_fill_through_version(ea, 5L, "locf")$DT), old_key) diff --git a/tests/testthat/test-epix_fill_through_version_new.R b/tests/testthat/test-epix_fill_through_version_new.R deleted file mode 100644 index 2b76a851..00000000 --- a/tests/testthat/test-epix_fill_through_version_new.R +++ /dev/null @@ -1,109 +0,0 @@ -test_that("epix_fill_through_version2 mirrors input when it is sufficiently up to date", { - ea_orig <- as_epi_archive2(data.table::data.table( - geo_value = "g1", time_value = as.Date("2020-01-01"), - version = 1:5, value = 1:5 - )) - some_earlier_observed_version <- 2L - ea_trivial_fill_na1 <- epix_fill_through_version2(ea_orig, some_earlier_observed_version, "na") - ea_trivial_fill_na2 <- epix_fill_through_version2(ea_orig, ea_orig$versions_end, "na") - ea_trivial_fill_locf <- epix_fill_through_version2(ea_orig, some_earlier_observed_version, "locf") - # Below, we want R6 objects to be compared based on contents rather than - # addresses. We appear to get this with `expect_identical` in `testthat` - # edition 3, which is based on `waldo::compare` rather than `base::identical`; - # `waldo::compare` in waldo >=0.3.1 appears (as of 0.4.0) to compare R6 - # objects by contents rather than address (in a way that is tested but maybe - # not guaranteed via user docs). Use `testthat::local_edition` to ensure we - # use testthat edition 3 here (use `testthat::` to prevent ambiguity with - # `readr`). - testthat::local_edition(3) - expect_identical(ea_orig, ea_trivial_fill_na1) - expect_identical(ea_orig, ea_trivial_fill_na2) - expect_identical(ea_orig, ea_trivial_fill_locf) -}) - -test_that("epix_fill_through_version2 can extend observed versions, gives expected `as_of`s", { - ea_orig <- as_epi_archive2(data.table::data.table( - geo_value = "g1", - time_value = as.Date("2020-01-01") + c(rep(0L, 5L), 1L), - version = c(1:5, 2L), - value = 1:6 - )) - first_unobserved_version <- 6L - later_unobserved_version <- 10L - ea_fill_na <- epix_fill_through_version2(ea_orig, later_unobserved_version, "na") - ea_fill_locf <- epix_fill_through_version2(ea_orig, later_unobserved_version, "locf") - - # We use testthat edition 3 features here, passing `ignore_attr` to - # `waldo::compare`. 
Ensure we are using edition 3: - testthat::local_edition(3) - withCallingHandlers( - { - expect_identical(ea_fill_na$versions_end, later_unobserved_version) - expect_identical(tibble::as_tibble(as_of(ea_fill_na, first_unobserved_version)), - tibble::tibble(geo_value = "g1", time_value = as.Date("2020-01-01") + 0:1, value = rep(NA_integer_, 2L)), - ignore_attr = TRUE - ) - expect_identical(ea_fill_locf$versions_end, later_unobserved_version) - expect_identical( - as_of(ea_fill_locf, first_unobserved_version), - as_of(ea_fill_locf, ea_orig$versions_end) %>% - { - attr(., "metadata")$as_of <- first_unobserved_version - . - } - ) - }, - epiprocess__snapshot_as_of_clobberable_version = function(wrn) invokeRestart("muffleWarning") - ) -}) - -test_that("epix_fill_through_version2 does not mutate x", { - for (ea_orig in list( - # vanilla case - as_epi_archive2(data.table::data.table( - geo_value = "g1", time_value = as.Date("2020-01-01"), - version = 1:5, value = 1:5 - )), - # data.table unique yielding original DT by reference special case (maybe - # having only 1 row is the trigger? having no revisions of initial values - # doesn't seem sufficient to trigger) - as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, value = 10L)) - )) { - # We want to perform a strict comparison of the contents of `ea_orig` before - # and `ea_orig` after. `clone` + `expect_identical` based on waldo would - # sort of work, but we might want something stricter. `as.list` + - # `identical` plus a check of the DT seems to do the trick. - ea_orig_before_as_list <- as.list(ea_orig) - ea_orig_DT_before_copy <- data.table::copy(ea_orig$DT) - some_unobserved_version <- 8L - # - ea_fill_na <- epix_fill_through_version2(ea_orig, some_unobserved_version, "na") - ea_orig_after_as_list <- as.list(ea_orig) - # use identical, not expect_identical, for the R6-as-list test; latter isn't as strict - expect_true(identical(ea_orig_before_as_list, ea_orig_after_as_list)) - expect_identical(ea_orig_DT_before_copy, ea_orig$DT) - # - ea_fill_locf <- epix_fill_through_version2(ea_orig, some_unobserved_version, "locf") - ea_orig_after_as_list <- as.list(ea_orig) - expect_true(identical(ea_orig_before_as_list, ea_orig_after_as_list)) - expect_identical(ea_orig_DT_before_copy, ea_orig$DT) - } -}) - -test_that("epix_fill_through_version return with expected visibility", { - ea <- as_epi_archive(data.table::data.table( - geo_value = "g1", time_value = as.Date("2020-01-01"), - version = 1:5, value = 1:5 - )) - expect_true(withVisible(epix_fill_through_version(ea, 10L, "na"))[["visible"]]) -}) - -test_that("epix_fill_through_version2 returns same key & doesn't mutate old DT or its key", { - ea <- as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, value = 10L)) - old_DT <- ea$DT - old_DT_copy <- data.table::copy(old_DT) - old_key <- data.table::key(ea$DT) - expect_identical(data.table::key(epix_fill_through_version2(ea, 5L, "na")$DT), old_key) - expect_identical(data.table::key(epix_fill_through_version2(ea, 5L, "locf")$DT), old_key) - expect_identical(data.table::key(ea$DT), old_key) -}) diff --git a/tests/testthat/test-epix_merge.R b/tests/testthat/test-epix_merge.R index 181aee28..9bcc7d67 100644 --- a/tests/testthat/test-epix_merge.R +++ b/tests/testthat/test-epix_merge.R @@ -1,5 +1,6 @@ test_that("epix_merge requires forbids on invalid `y`", { - ea <- archive_cases_dv_subset$clone() + ea <- archive_cases_dv_subset %>% + clone() expect_error(epix_merge(ea, data.frame(x = 1))) }) @@ -58,9 +59,7 
@@ test_that("epix_merge merges and carries forward updates properly", { dplyr::mutate(dplyr::across(c(x_value, y_value), ~ dplyr::if_else(grepl("NA", .x), NA_character_, .x))) ) ) - # We rely on testthat edition 3 expect_identical using waldo, not identical. See - # test-epix_fill_through_version.R comments for details. - testthat::local_edition(3) + expect_identical(xy, xy_expected) }) diff --git a/tests/testthat/test-epix_merge_new.R b/tests/testthat/test-epix_merge_new.R deleted file mode 100644 index 10041dbb..00000000 --- a/tests/testthat/test-epix_merge_new.R +++ /dev/null @@ -1,226 +0,0 @@ -test_that("epix_merge requires forbids on invalid `y`", { - ea <- archive_cases_dv_subset_2 %>% - clone() - expect_error(epix_merge2(ea, data.frame(x = 1))) -}) - -test_that("epix_merge merges and carries forward updates properly", { - x <- as_epi_archive2( - data.table::as.data.table( - tibble::tribble( - ~geo_value, ~time_value, ~version, ~x_value, - # same version set for x and y - "g1", 1L, 1:3, paste0("XA", 1:3), - # versions of x surround those of y + this measurement has - # max update version beyond some others - "g1", 2L, 1:5, paste0("XB", 1:5), - # mirror case - "g1", 3L, 2L, paste0("XC", 2L), - # x has 1 version, y has 0 - "g1", 4L, 1L, paste0("XD", 1L), - # non-NA values that should be carried forward - # (version-wise LOCF) in other versions, plus NAs that - # should (similarly) be carried forward as NA (latter - # wouldn't work with an ordinary merge + post-processing - # with `data.table::nafill`) - "g1", 6L, c(1L, 3L, 5L), paste0("XE", c(1L, NA, 5L)) - ) %>% - tidyr::unchop(c(version, x_value)) %>% - dplyr::mutate(dplyr::across(c(x_value), ~ dplyr::if_else(grepl("NA", .x), NA_character_, .x))) - ) - ) - y <- as_epi_archive2( - data.table::as.data.table( - tibble::tribble( - ~geo_value, ~time_value, ~version, ~y_value, - "g1", 1L, 1:3, paste0("YA", 1:3), - "g1", 2L, 2L, paste0("YB", 2L), - "g1", 3L, 1:5, paste0("YC", 1:5), - "g1", 5L, 1L, paste0("YD", 1L), - "g1", 6L, 1:5, paste0("YE", 1:5), - ) %>% - tidyr::unchop(c(version, y_value)) %>% - dplyr::mutate(dplyr::across(c(y_value), ~ dplyr::if_else(grepl("NA", .x), NA_character_, .x))) - ) - ) - xy <- epix_merge2(x, y) - xy_expected <- as_epi_archive2( - data.table::as.data.table( - tibble::tribble( - ~geo_value, ~time_value, ~version, ~x_value, ~y_value, - "g1", 1L, 1:3, paste0("XA", 1:3), paste0("YA", 1:3), - "g1", 2L, 1:5, paste0("XB", 1:5), paste0("YB", c(NA, 2L, 2L, 2L, 2L)), - "g1", 3L, 1:5, paste0("XC", c(NA, 2L, 2L, 2L, 2L)), paste0("YC", 1:5), - "g1", 4L, 1L, paste0("XD", 1L), paste0("YD", NA), - "g1", 5L, 1L, paste0("XD", NA), paste0("YD", 1L), - "g1", 6L, 1:5, paste0("XE", c(1L, 1L, NA, NA, 5L)), paste0("YE", 1:5), - ) %>% - tidyr::unchop(c(version, x_value, y_value)) %>% - dplyr::mutate(dplyr::across(c(x_value, y_value), ~ dplyr::if_else(grepl("NA", .x), NA_character_, .x))) - ) - ) - # We rely on testthat edition 3 expect_identical using waldo, not identical. See - # test-epix_fill_through_version.R comments for details. 
- testthat::local_edition(3) - expect_identical(xy, xy_expected) -}) - -test_that("epix_merge forbids and warns on metadata and naming issues", { - expect_error( - epix_merge2( - as_epi_archive2(tibble::tibble(geo_value = "tx", time_value = 1L, version = 1L, x_value = 1L)), - as_epi_archive2(tibble::tibble(geo_value = "us", time_value = 1L, version = 5L, y_value = 2L)) - ), - regexp = "must have the same.*geo_type" - ) - expect_error( - epix_merge2( - as_epi_archive2(tibble::tibble(geo_value = "pa", time_value = 1L, version = 1L, x_value = 1L)), - as_epi_archive2(tibble::tibble(geo_value = "pa", time_value = as.Date("2020-01-01"), version = 5L, y_value = 2L)) - ), - regexp = "must have the same.*time_type" - ) - expect_error( - epix_merge2( - as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, value = 1L)), - as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, value = 2L)) - ), - regexp = "overlapping.*names" - ) - expect_warning( - epix_merge2( - as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, x_value = 1L), - additional_metadata = list("updates_fetched" = lubridate::ymd_hms("2022-05-01 16:00:00", tz = "UTC")) - ), - as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, y_value = 2L)) - ), - regexp = "x\\$additional_metadata", - class = "epiprocess__epix_merge_ignores_additional_metadata" - ) - expect_warning( - epix_merge2( - as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, x_value = 1L)), - as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, y_value = 2L), - additional_metadata = list("updates_fetched" = lubridate::ymd_hms("2022-05-01 16:00:00", tz = "UTC")) - ) - ), - regexp = "y\\$additional_metadata", - class = "epiprocess__epix_merge_ignores_additional_metadata" - ) -}) - -# use `local` to prevent accidentally using the x, y, xy bindings here -# elsewhere, while allowing reuse across a couple tests -local({ - x <- as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, x_value = 1L), - clobberable_versions_start = 1L, versions_end = 10L - ) - y <- as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, y_value = 2L), - clobberable_versions_start = 3L, versions_end = 10L - ) - xy <- epix_merge2(x, y) - test_that("epix_merge considers partially-clobberable row to be clobberable", { - expect_identical(xy$clobberable_versions_start, 1L) - }) - test_that("epix_merge result uses versions_end metadata not max version val", { - expect_identical(xy$versions_end, 10L) - }) -}) - -local({ - x <- as_epi_archive2( - tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, x_value = 10L), - clobberable_versions_start = 1L, - versions_end = 3L - ) - y <- as_epi_archive2( - tibble::tibble(geo_value = 1L, time_value = 1L, version = 5L, y_value = 20L), - clobberable_versions_start = 1L - ) - test_that('epix_merge forbids on sync default or "forbid"', { - expect_error(epix_merge2(x, y), - class = "epiprocess__epix_merge_unresolved_sync" - ) - expect_error(epix_merge2(x, y, sync = "forbid"), - class = "epiprocess__epix_merge_unresolved_sync" - ) - }) - test_that('epix_merge sync="na" works', { - expect_equal( - epix_merge2(x, y, sync = "na"), - as_epi_archive2(tibble::tribble( - ~geo_value, ~time_value, ~version, ~x_value, ~y_value, - 1L, 1L, 1L, 10L, NA_integer_, # x updated, y not observed yet - 1L, 1L, 4L, NA_integer_, NA_integer_, # NA-ing out x, y not observed yet - 1L, 1L, 5L, NA_integer_, 
20L, # x still NA, y updated - # (we should not have a y vals -> NA update here; version 5 should be - # the `versions_end` of the result) - ), clobberable_versions_start = 1L) - ) - }) - test_that('epix_merge sync="locf" works', { - expect_equal( - epix_merge2(x, y, sync = "locf"), - as_epi_archive2(tibble::tribble( - ~geo_value, ~time_value, ~version, ~x_value, ~y_value, - 1L, 1L, 1L, 10L, NA_integer_, # x updated, y not observed yet - 1L, 1L, 5L, 10L, 20L, # x LOCF'd, y updated - ), clobberable_versions_start = 1L) - ) - }) - test_that('epix_merge sync="truncate" works', { - expect_equal( - epix_merge2(x, y, sync = "truncate"), - as_epi_archive2(tibble::tribble( - ~geo_value, ~time_value, ~version, ~x_value, ~y_value, - 1L, 1L, 1L, 10L, NA_integer_, # x updated, y not observed yet - # y's update beyond x's last update has been truncated - ), clobberable_versions_start = 1L, versions_end = 3L) - ) - }) - x_no_conflict <- as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, x_value = 10L)) - y_no_conflict <- as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, y_value = 20L)) - xy_no_conflict_expected <- as_epi_archive2(tibble::tribble( - ~geo_value, ~time_value, ~version, ~x_value, ~y_value, - 1L, 1L, 1L, 10L, 20L, # x updated, y not observed yet - )) - test_that('epix_merge sync="forbid" on no-conflict works', { - expect_equal( - epix_merge2(x_no_conflict, y_no_conflict, sync = "forbid"), - xy_no_conflict_expected - ) - }) - test_that('epix_merge sync="na" on no-conflict works', { - # This test is the main reason for these no-conflict tests. We want to make - # sure that we don't add an unnecessary NA-ing-out version beyond a common - # versions_end. - expect_equal( - epix_merge2(x_no_conflict, y_no_conflict, sync = "na"), - xy_no_conflict_expected - ) - }) - test_that('epix_merge sync="locf" on no-conflict works', { - expect_equal( - epix_merge2(x_no_conflict, y_no_conflict, sync = "locf"), - xy_no_conflict_expected - ) - }) - test_that('epix_merge sync="truncate" on no-conflict works', { - expect_equal( - epix_merge2(x_no_conflict, y_no_conflict, sync = "truncate"), - xy_no_conflict_expected - ) - }) -}) - - -test_that('epix_merge sync="na" balks if do not know next_after', { - expect_error( - epix_merge2( - as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = as.POSIXct(as.Date("2020-01-01")), x_value = 10L)), - as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = as.POSIXct(as.Date("2020-01-02")), y_value = 20L)), - sync = "na" - ), - regexp = "no applicable method.*next_after" - ) -}) diff --git a/tests/testthat/test-epix_slide.R b/tests/testthat/test-epix_slide.R index 07f0e5bf..b7a3e946 100644 --- a/tests/testthat/test-epix_slide.R +++ b/tests/testthat/test-epix_slide.R @@ -39,15 +39,13 @@ test_that("epix_slide works as intended", { expect_identical(xx1, xx2) # * - xx3 <- ( - xx - $group_by(dplyr::across(dplyr::all_of("geo_value"))) - $slide( + xx3 <- xx %>% + group_by(dplyr::across(dplyr::all_of("geo_value"))) %>% + epix_slide( f = ~ sum(.x$binary), before = 2, new_col_name = "sum_binary" ) - ) expect_identical(xx1, xx3) # This and * imply xx2 and xx3 are identical @@ -95,15 +93,13 @@ test_that("epix_slide works as intended with `as_list_col=TRUE`", { expect_identical(xx_dfrow1, xx_dfrow2) # * - xx_dfrow3 <- ( - xx - $group_by(dplyr::across(dplyr::all_of("geo_value"))) - $slide( + xx_dfrow3 <- xx %>% + group_by(dplyr::across(dplyr::all_of("geo_value"))) %>% + epix_slide( f = ~ 
data.frame(bin_sum = sum(.x$binary)), before = 2, as_list_col = TRUE ) - ) expect_identical(xx_dfrow1, xx_dfrow3) # This and * Imply xx_dfrow2 and xx_dfrow3 are identical @@ -179,39 +175,40 @@ test_that("epix_slide works as intended with `as_list_col=TRUE`", { test_that("epix_slide `before` validation works", { expect_error( - xx$slide(f = ~ sum(.x$binary)), + xx %>% epix_slide(f = ~ sum(.x$binary)), "`before` is required" ) expect_error( - xx$slide(f = ~ sum(.x$binary), before = NA), + xx %>% epix_slide(f = ~ sum(.x$binary), before = NA), "Assertion on 'before' failed: May not be NA" ) expect_error( - xx$slide(f = ~ sum(.x$binary), before = -1), + xx %>% epix_slide(f = ~ sum(.x$binary), before = -1), "Assertion on 'before' failed: Element 1 is not >= 0" ) - expect_error(xx$slide(f = ~ sum(.x$binary), before = 1.5), + expect_error( + xx %>% epix_slide(f = ~ sum(.x$binary), before = 1.5), regexp = "before", class = "vctrs_error_incompatible_type" ) # We might want to allow this at some point (issue #219): - expect_error(xx$slide(f = ~ sum(.x$binary), before = Inf), + expect_error( + xx %>% epix_slide(f = ~ sum(.x$binary), before = Inf), regexp = "before", class = "vctrs_error_incompatible_type" ) - # (wrapper shouldn't introduce a value:) - expect_error(epix_slide(xx, f = ~ sum(.x$binary)), "`before` is required") + expect_error(xx %>% epix_slide(f = ~ sum(.x$binary)), "`before` is required") # These `before` values should be accepted: expect_error( - xx$slide(f = ~ sum(.x$binary), before = 0), + xx %>% epix_slide(f = ~ sum(.x$binary), before = 0), NA ) expect_error( - xx$slide(f = ~ sum(.x$binary), before = 2L), + xx %>% epix_slide(f = ~ sum(.x$binary), before = 2L), NA ) expect_error( - xx$slide(f = ~ sum(.x$binary), before = 365000), + xx %>% epix_slide(f = ~ sum(.x$binary), before = 365000), NA ) }) @@ -251,12 +248,14 @@ test_that("quosure passing issue in epix_slide is resolved + other potential iss # # (S3 group_by behavior for this case is the `reference_by_modulus`) expect_identical( - ea$group_by(modulus)$slide( - f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = time_values, - new_col_name = "case_rate_3d_av" - ), + ea %>% + group_by(modulus) %>% + epix_slide( + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ), reference_by_modulus ) # test the .data pronoun behavior: @@ -271,12 +270,14 @@ test_that("quosure passing issue in epix_slide is resolved + other potential iss reference_by_modulus ) expect_identical( - ea$group_by(.data$modulus)$slide( - f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = time_values, - new_col_name = "case_rate_3d_av" - ), + ea %>% + group_by(.data$modulus) %>% + epix_slide( + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ), reference_by_modulus ) # test the passing across-all-of-string-literal behavior: @@ -291,12 +292,14 @@ test_that("quosure passing issue in epix_slide is resolved + other potential iss reference_by_modulus ) expect_identical( - ea$group_by(across(all_of("modulus")))$slide( - f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = time_values, - new_col_name = "case_rate_3d_av" - ), + ea %>% + group_by(across(all_of("modulus"))) %>% + epix_slide( + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ), reference_by_modulus ) # test the passing-across-all-of-string-var behavior: @@ -312,12 +315,14 
@@ test_that("quosure passing issue in epix_slide is resolved + other potential iss reference_by_modulus ) expect_identical( - ea$group_by(dplyr::across(tidyselect::all_of(my_group_by)))$slide( - f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = time_values, - new_col_name = "case_rate_3d_av" - ), + ea %>% + group_by(dplyr::across(tidyselect::all_of(my_group_by))) %>% + epix_slide( + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ), reference_by_modulus ) # test the default behavior (default in this case should just be grouping by neither): @@ -332,7 +337,7 @@ test_that("quosure passing issue in epix_slide is resolved + other potential iss reference_by_neither ) expect_identical( - ea$slide( + ea %>% epix_slide( f = ~ mean(.x$case_rate_7d_av), before = 2, ref_time_values = time_values, @@ -356,12 +361,6 @@ ea <- tibble::tribble( as_epi_archive() test_that("epix_slide with all_versions option has access to all older versions", { - library(data.table) - # Make sure we're using testthat edition 3, where `expect_identical` doesn't - # actually mean `base::identical` but something more content-based using - # `waldo` package: - testthat::local_edition(3) - slide_fn <- function(x, gk, rtv) { return(tibble( n_versions = length(unique(x$DT$version)), @@ -371,8 +370,8 @@ test_that("epix_slide with all_versions option has access to all older versions" )) } - ea_orig_mirror <- ea$clone(deep = TRUE) - ea_orig_mirror$DT <- copy(ea_orig_mirror$DT) + ea_orig_mirror <- ea %>% clone() + ea_orig_mirror$DT <- data.table::copy(ea_orig_mirror$DT) result1 <- ea %>% group_by() %>% @@ -397,16 +396,14 @@ test_that("epix_slide with all_versions option has access to all older versions" expect_identical(result1, result2) # * - result3 <- ( - ea - $group_by() - $slide( + result3 <- ea %>% + group_by() %>% + epix_slide( f = slide_fn, before = 10^3, names_sep = NULL, all_versions = TRUE ) - ) expect_identical(result1, result3) # This and * Imply result2 and result3 are identical @@ -427,7 +424,7 @@ test_that("epix_slide with all_versions option has access to all older versions" group_by() %>% epix_slide( data = slide_fn( - .data$clone(), # hack to convert from pronoun back to archive + .x, stop("slide_fn doesn't use group key, no need to prepare it") ), before = 10^3, @@ -436,14 +433,10 @@ test_that("epix_slide with all_versions option has access to all older versions" ) expect_identical(result1, result5) # This and * Imply result2 and result5 are identical - expect_identical(ea, ea_orig_mirror) # We shouldn't have mutated ea }) -test_that("as_of and epix_slide with long enough window are compatible", { - library(data.table) - testthat::local_edition(3) - +test_that("epix_as_of and epix_slide with long enough window are compatible", { # For all_versions = FALSE: f1 <- function(x, gk, rtv) { @@ -454,8 +447,8 @@ test_that("as_of and epix_slide with long enough window are compatible", { ref_time_value1 <- 5 expect_identical( - ea$as_of(ref_time_value1) %>% f1() %>% mutate(time_value = ref_time_value1, .before = 1L), - ea$slide(f1, before = 1000L, ref_time_values = ref_time_value1, names_sep = NULL) + ea %>% epix_as_of(ref_time_value1) %>% f1() %>% mutate(time_value = ref_time_value1, .before = 1L), + ea %>% epix_slide(f1, before = 1000L, ref_time_values = ref_time_value1, names_sep = NULL) ) # For all_versions = TRUE: @@ -475,18 +468,24 @@ test_that("as_of and epix_slide with long enough window are compatible", { ) %>% # assess as nowcast: 
unnest(data) %>% - inner_join(x$as_of(x$versions_end), by = setdiff(key(x$DT), c("version"))) %>% + inner_join( + x %>% epix_as_of(x$versions_end), + by = setdiff(key(x$DT), c("version")) + ) %>% summarize(mean_abs_delta = mean(abs(binary - lag1))) } ref_time_value2 <- 5 expect_identical( - ea$as_of(ref_time_value2, all_versions = TRUE) %>% f2() %>% mutate(time_value = ref_time_value2, .before = 1L), - ea$slide(f2, before = 1000L, ref_time_values = ref_time_value2, all_versions = TRUE, names_sep = NULL) + ea %>% + epix_as_of(ref_time_value2, all_versions = TRUE) %>% + f2() %>% + mutate(time_value = ref_time_value2, .before = 1L), + ea %>% epix_slide(f2, before = 1000L, ref_time_values = ref_time_value2, all_versions = TRUE, names_sep = NULL) ) # Test the same sort of thing when grouping by geo in an archive with multiple geos. - ea_multigeo <- ea$clone() + ea_multigeo <- ea %>% clone() ea_multigeo$DT <- rbind( ea_multigeo$DT, copy(ea_multigeo$DT)[, geo_value := "y"][, binary := -binary][] @@ -547,16 +546,14 @@ test_that("epix_slide with all_versions option works as intended", { expect_identical(xx1, xx2) # * - xx3 <- ( - xx - $group_by(dplyr::across(dplyr::all_of("geo_value"))) - $slide( + xx3 <- xx %>% + group_by(dplyr::across(dplyr::all_of("geo_value"))) %>% + epix_slide( f = ~ sum(.x$DT$binary), before = 2, new_col_name = "sum_binary", all_versions = TRUE ) - ) expect_identical(xx1, xx3) # This and * Imply xx2 and xx3 are identical }) @@ -568,7 +565,7 @@ test_that("epix_slide with all_versions option works as intended", { # back depending on the decisions there: # # test_that("`epix_slide` uses `versions_end` as a resulting `epi_df`'s `as_of`", { -# ea_updated_stale = ea$clone() +# ea_updated_stale = ea %>% clone() # ea_updated_stale$versions_end <- ea_updated_stale$versions_end + 3 # (dbl) # # # expect_identical( @@ -811,7 +808,6 @@ test_that("`epix_slide` can access objects inside of helper functions", { helper(archive_cases_dv_subset, as.Date("2021-01-01")), NA ) - expect_error( helper(xx, 3L), NA diff --git a/tests/testthat/test-epix_slide_new.R b/tests/testthat/test-epix_slide_new.R deleted file mode 100644 index 49ef5e41..00000000 --- a/tests/testthat/test-epix_slide_new.R +++ /dev/null @@ -1,810 +0,0 @@ -library(dplyr) - -test_that("epix_slide2 only works on an epi_archive", { - expect_error(epix_slide2(data.frame(x = 1))) -}) - -x <- tibble::tribble( - ~version, ~time_value, ~binary, - 4, c(1:3), 2^(1:3), - 5, c(1:2, 4), 2^(4:6), - 6, c(1:2, 4:5), 2^(7:10), - 7, 2:6, 2^(11:15) -) %>% - tidyr::unnest(c(time_value, binary)) - -xx <- bind_cols(geo_value = rep("x", 15), x) %>% - as_epi_archive2() - -test_that("epix_slide2 works as intended", { - xx1 <- xx %>% - group_by(.data$geo_value) %>% - epix_slide2( - f = ~ sum(.x$binary), - before = 2, - new_col_name = "sum_binary" - ) - - xx2 <- tibble( - geo_value = rep("x", 4), - time_value = c(4, 5, 6, 7), - sum_binary = c( - 2^3 + 2^2, - 2^6 + 2^3, - 2^10 + 2^9, - 2^15 + 2^14 - ) - ) %>% - group_by(geo_value) - - expect_identical(xx1, xx2) # * - - xx3 <- xx %>% - group_by( - dplyr::across(dplyr::all_of("geo_value")) - ) %>% - slide( - f = ~ sum(.x$binary), - before = 2, - new_col_name = "sum_binary" - ) - - expect_identical(xx1, xx3) # This and * imply xx2 and xx3 are identical - - # function interface - xx4 <- xx %>% - group_by(.data$geo_value) %>% - epix_slide2(f = function(x, gk, rtv) { - tibble::tibble(sum_binary = sum(x$binary)) - }, before = 2, names_sep = NULL) - - expect_identical(xx1, xx4) - - # tidyeval interface - xx5 <- xx 
%>% - group_by(.data$geo_value) %>% - epix_slide2( - sum_binary = sum(binary), - before = 2 - ) - - expect_identical(xx1, xx5) -}) - -test_that("epix_slide2 works as intended with `as_list_col=TRUE`", { - xx_dfrow1 <- xx %>% - group_by(.data$geo_value) %>% - epix_slide2( - f = ~ data.frame(bin_sum = sum(.x$binary)), - before = 2, - as_list_col = TRUE - ) - - xx_dfrow2 <- tibble( - geo_value = rep("x", 4), - time_value = c(4, 5, 6, 7), - slide_value = - c( - 2^3 + 2^2, - 2^6 + 2^3, - 2^10 + 2^9, - 2^15 + 2^14 - ) %>% - purrr::map(~ data.frame(bin_sum = .x)) - ) %>% - group_by(geo_value) - - expect_identical(xx_dfrow1, xx_dfrow2) # * - - xx_dfrow3 <- xx %>% - group_by(dplyr::across(dplyr::all_of("geo_value"))) %>% - slide( - f = ~ data.frame(bin_sum = sum(.x$binary)), - before = 2, - as_list_col = TRUE - ) - - expect_identical(xx_dfrow1, xx_dfrow3) # This and * Imply xx_dfrow2 and xx_dfrow3 are identical - - xx_df1 <- xx %>% - group_by(.data$geo_value) %>% - epix_slide2( - f = ~ data.frame(bin = .x$binary), - before = 2, - as_list_col = TRUE - ) - - xx_df2 <- tibble( - geo_value = rep("x", 4), - time_value = c(4, 5, 6, 7), - slide_value = - list( - c(2^3, 2^2), - c(2^6, 2^3), - c(2^10, 2^9), - c(2^15, 2^14) - ) %>% - purrr::map(~ data.frame(bin = rev(.x))) - ) %>% - group_by(geo_value) - - expect_identical(xx_df1, xx_df2) - - xx_scalar1 <- xx %>% - group_by(.data$geo_value) %>% - epix_slide2( - f = ~ sum(.x$binary), - before = 2, - as_list_col = TRUE - ) - - xx_scalar2 <- tibble( - geo_value = rep("x", 4), - time_value = c(4, 5, 6, 7), - slide_value = - list( - 2^3 + 2^2, - 2^6 + 2^3, - 2^10 + 2^9, - 2^15 + 2^14 - ) - ) %>% - group_by(geo_value) - - expect_identical(xx_scalar1, xx_scalar2) - - xx_vec1 <- xx %>% - group_by(.data$geo_value) %>% - epix_slide2( - f = ~ .x$binary, - before = 2, - as_list_col = TRUE - ) - - xx_vec2 <- tibble( - geo_value = rep("x", 4), - time_value = c(4, 5, 6, 7), - slide_value = - list( - c(2^3, 2^2), - c(2^6, 2^3), - c(2^10, 2^9), - c(2^15, 2^14) - ) %>% - purrr::map(rev) - ) %>% - group_by(geo_value) - - expect_identical(xx_vec1, xx_vec2) -}) - -test_that("epix_slide2 `before` validation works", { - expect_error( - slide(xx, f = ~ sum(.x$binary)), - "`before` is required" - ) - expect_error( - slide(xx, f = ~ sum(.x$binary), before = NA), - "Assertion on 'before' failed: May not be NA" - ) - expect_error( - slide(xx, f = ~ sum(.x$binary), before = -1), - "Assertion on 'before' failed: Element 1 is not >= 0" - ) - expect_error(slide(xx, f = ~ sum(.x$binary), before = 1.5), - regexp = "before", - class = "vctrs_error_incompatible_type" - ) - # We might want to allow this at some point (issue #219): - expect_error(slide(xx, f = ~ sum(.x$binary), before = Inf), - regexp = "before", - class = "vctrs_error_incompatible_type" - ) - # (wrapper shouldn't introduce a value:) - expect_error(epix_slide2(xx, f = ~ sum(.x$binary)), "`before` is required") - # These `before` values should be accepted: - expect_error( - slide(xx, f = ~ sum(.x$binary), before = 0), - NA - ) - expect_error( - slide(xx, f = ~ sum(.x$binary), before = 2L), - NA - ) - expect_error( - slide(xx, f = ~ sum(.x$binary), before = 365000), - NA - ) -}) - -test_that("quosure passing issue in epix_slide2 is resolved + other potential issues", { - # (First part adapted from @examples) - time_values <- seq(as.Date("2020-06-01"), - as.Date("2020-06-02"), - by = "1 day" - ) - # We only have one non-version, non-time key in the example archive. 
Add - # another so that we don't accidentally pass tests due to accidentally - # matching the default grouping. - ea <- as_epi_archive2( - archive_cases_dv_subset$DT %>% - dplyr::mutate(modulus = seq_len(nrow(.)) %% 5L), - other_keys = "modulus", - compactify = TRUE - ) - reference_by_modulus <- ea %>% - group_by(modulus) %>% - epix_slide2( - f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = time_values, - new_col_name = "case_rate_3d_av" - ) - reference_by_neither <- ea %>% - group_by() %>% - epix_slide2( - f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = time_values, - new_col_name = "case_rate_3d_av" - ) - # test the passing-something-that-must-be-enquosed behavior: - # - # (S3 group_by behavior for this case is the `reference_by_modulus`) - expect_identical( - ea %>% group_by(modulus) %>% slide( - f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = time_values, - new_col_name = "case_rate_3d_av" - ), - reference_by_modulus - ) - # test the .data pronoun behavior: - expect_identical( - epix_slide2( - x = ea %>% group_by(.data$modulus), - f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = time_values, - new_col_name = "case_rate_3d_av" - ), - reference_by_modulus - ) - expect_identical( - ea %>% group_by(.data$modulus) %>% slide( - f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = time_values, - new_col_name = "case_rate_3d_av" - ), - reference_by_modulus - ) - # test the passing across-all-of-string-literal behavior: - expect_identical( - epix_slide2( - x = ea %>% group_by(dplyr::across(all_of("modulus"))), - f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = time_values, - new_col_name = "case_rate_3d_av" - ), - reference_by_modulus - ) - expect_identical( - ea %>% group_by(across(all_of("modulus"))) %>% slide( - f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = time_values, - new_col_name = "case_rate_3d_av" - ), - reference_by_modulus - ) - # test the passing-across-all-of-string-var behavior: - my_group_by <- "modulus" - expect_identical( - epix_slide2( - x = ea %>% group_by(dplyr::across(tidyselect::all_of(my_group_by))), - f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = time_values, - new_col_name = "case_rate_3d_av" - ), - reference_by_modulus - ) - expect_identical( - ea %>% group_by(dplyr::across(tidyselect::all_of(my_group_by))) %>% slide( - f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = time_values, - new_col_name = "case_rate_3d_av" - ), - reference_by_modulus - ) - # test the default behavior (default in this case should just be grouping by neither): - expect_identical( - epix_slide2( - x = ea, - f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = time_values, - new_col_name = "case_rate_3d_av" - ), - reference_by_neither - ) - expect_identical( - ea %>% slide( - f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = time_values, - new_col_name = "case_rate_3d_av" - ), - reference_by_neither - ) -}) - -ea <- tibble::tribble( - ~version, ~time_value, ~binary, - 2, 1:1, 2^(1:1), - 3, 1:2, 2^(2:1), - 4, 1:3, 2^(3:1), - 5, 1:4, 2^(4:1), - 6, 1:5, 2^(5:1), - 7, 1:6, 2^(6:1) -) %>% - tidyr::unnest(c(time_value, binary)) %>% - mutate(geo_value = "x") %>% - as_epi_archive2() - -test_that("epix_slide2 with all_versions option has access to all older versions", { - library(data.table) - # Make sure we're using testthat edition 3, where `expect_identical` doesn't - # actually mean `base::identical` but something more 
content-based using - # `waldo` package: - testthat::local_edition(3) - - slide_fn <- function(x, gk, rtv) { - return(tibble( - n_versions = length(unique(x$DT$version)), - n_row = nrow(x$DT), - dt_class1 = class(x$DT)[[1L]], - dt_key = list(key(x$DT)) - )) - } - - ea_orig_mirror <- ea %>% clone(deep = TRUE) - ea_orig_mirror$DT <- copy(ea_orig_mirror$DT) - - result1 <- ea %>% - group_by() %>% - epix_slide2( - f = slide_fn, - before = 10^3, - names_sep = NULL, - all_versions = TRUE - ) - - expect_true(inherits(result1, "tbl_df")) - - result2 <- tibble::tribble( - ~time_value, ~n_versions, ~n_row, ~dt_class1, ~dt_key, - 2, 1L, sum(1:1), "data.table", key(ea$DT), - 3, 2L, sum(1:2), "data.table", key(ea$DT), - 4, 3L, sum(1:3), "data.table", key(ea$DT), - 5, 4L, sum(1:4), "data.table", key(ea$DT), - 6, 5L, sum(1:5), "data.table", key(ea$DT), - 7, 6L, sum(1:6), "data.table", key(ea$DT), - ) - - expect_identical(result1, result2) # * - - result3 <- ea %>% - group_by() %>% - slide( - f = slide_fn, - before = 10^3, - names_sep = NULL, - all_versions = TRUE - ) - - expect_identical(result1, result3) # This and * Imply result2 and result3 are identical - - # formula interface - result4 <- ea %>% - group_by() %>% - epix_slide2( - f = ~ slide_fn(.x, .y), - before = 10^3, - names_sep = NULL, - all_versions = TRUE - ) - - expect_identical(result1, result4) # This and * Imply result2 and result4 are identical - - # tidyeval interface - result5 <- ea %>% - group_by() %>% - epix_slide2( - data = slide_fn( - .x, - stop("slide_fn doesn't use group key, no need to prepare it") - ), - before = 10^3, - names_sep = NULL, - all_versions = TRUE - ) - - expect_identical(result1, result5) # This and * Imply result2 and result5 are identical - expect_identical(ea, ea_orig_mirror) # We shouldn't have mutated ea -}) - -test_that("as_of and epix_slide2 with long enough window are compatible", { - library(data.table) - testthat::local_edition(3) - - # For all_versions = FALSE: - - f1 <- function(x, gk, rtv) { - tibble( - diff_mean = mean(diff(x$binary)) - ) - } - ref_time_value1 <- 5 - - expect_identical( - ea %>% as_of(ref_time_value1) %>% f1() %>% mutate(time_value = ref_time_value1, .before = 1L), - ea %>% slide(f1, before = 1000L, ref_time_values = ref_time_value1, names_sep = NULL) - ) - - # For all_versions = TRUE: - - f2 <- function(x, gk, rtv) { - x %>% - # extract time&version-lag-1 data: - epix_slide2( - function(subx, subgk, rtv) { - tibble(data = list( - subx %>% - filter(time_value == attr(subx, "metadata")$as_of - 1) %>% - rename(real_time_value = time_value, lag1 = binary) - )) - }, - before = 1, names_sep = NULL - ) %>% - # assess as nowcast: - unnest(data) %>% - inner_join(x %>% as_of(x$versions_end), by = setdiff(key(x$DT), c("version"))) %>% - summarize(mean_abs_delta = mean(abs(binary - lag1))) - } - ref_time_value2 <- 5 - - expect_identical( - ea %>% as_of(ref_time_value2, all_versions = TRUE) %>% f2() %>% mutate(time_value = ref_time_value2, .before = 1L), - ea %>% slide(f2, before = 1000L, ref_time_values = ref_time_value2, all_versions = TRUE, names_sep = NULL) - ) - - # Test the same sort of thing when grouping by geo in an archive with multiple geos. 
- ea_multigeo <- ea %>% clone() - ea_multigeo$DT <- rbind( - ea_multigeo$DT, - copy(ea_multigeo$DT)[, geo_value := "y"][, binary := -binary][] - ) - setkeyv(ea_multigeo$DT, key(ea$DT)) - - expect_identical( - ea_multigeo %>% - group_by(geo_value) %>% - epix_slide2(f2, before = 1000L, ref_time_values = ref_time_value2, all_versions = TRUE, names_sep = NULL) %>% - filter(geo_value == "x"), - ea %>% # using `ea` here is like filtering `ea_multigeo` to `geo_value=="x"` - epix_as_of2(ref_time_value2, all_versions = TRUE) %>% - f2() %>% - transmute(geo_value = "x", time_value = ref_time_value2, mean_abs_delta) %>% - group_by(geo_value) - ) -}) - -test_that("epix_slide2 `f` is passed an ungrouped `epi_archive` when `all_versions=TRUE`", { - slide_fn <- function(x, gk, rtv) { - expect_true(is_epi_archive2(x)) - return(NA) - } - - ea %>% - group_by() %>% - epix_slide2( - f = slide_fn, - before = 1, - ref_time_values = 5, - new_col_name = "out", - all_versions = TRUE - ) -}) - -test_that("epix_slide2 with all_versions option works as intended", { - xx1 <- xx %>% - group_by(.data$geo_value) %>% - epix_slide2( - f = ~ sum(.x$DT$binary), - before = 2, - new_col_name = "sum_binary", - all_versions = TRUE - ) - - xx2 <- tibble( - geo_value = rep("x", 4), - time_value = c(4, 5, 6, 7), - sum_binary = c( - 2^3 + 2^2, - 2^6 + 2^3, - 2^10 + 2^9 + 2^6, - 2^15 + 2^14 + 2^10 - ) - ) %>% - group_by(geo_value) - - expect_identical(xx1, xx2) # * - - xx3 <- xx %>% - group_by(dplyr::across(dplyr::all_of("geo_value"))) %>% - slide( - f = ~ sum(.x$DT$binary), - before = 2, - new_col_name = "sum_binary", - all_versions = TRUE - ) - - expect_identical(xx1, xx3) # This and * Imply xx2 and xx3 are identical -}) - -# XXX currently, we're using a stopgap measure of having `epix_slide2` always -# output a (grouped/ungrouped) tibble while we think about the class, columns, -# and attributes of `epix_slide2` output more carefully. We might bring this test -# back depending on the decisions there: -# -# test_that("`epix_slide2` uses `versions_end` as a resulting `epi_df`'s `as_of`", { -# ea_updated_stale = ea$clone() -# ea_updated_stale$versions_end <- ea_updated_stale$versions_end + 3 # (dbl) -# # -# expect_identical( -# ea_updated_stale %>% -# group_by(geo_value) %>% -# epix_slide2(~ slice_head(.x, n = 1L), before = 10L) %>% -# ungroup() %>% -# attr("metadata") %>% -# .$as_of, -# 10 -# ) -# }) - -test_that("epix_slide2 works with 0-row computation outputs", { - epix_slide_empty <- function(ea, ...) 
{ - ea %>% - epix_slide2(before = 5L, ..., function(x, gk, rtv) { - tibble::tibble() - }) - } - expect_identical( - ea %>% - epix_slide_empty(), - tibble::tibble( - time_value = ea$DT$version[integer(0)] - ) - ) - expect_identical( - ea %>% - group_by(geo_value) %>% - epix_slide_empty(), - tibble::tibble( - geo_value = ea$DT$geo_value[integer(0)], - time_value = ea$DT$version[integer(0)] - ) %>% - # new_epi_df(geo_type = ea$geo_type, time_type = ea$time_type, - # as_of = ea$versions_end) %>% - group_by(geo_value) - ) - # with `all_versions=TRUE`, we have something similar but never get an - # `epi_df`: - expect_identical( - ea %>% - epix_slide_empty(all_versions = TRUE), - tibble::tibble( - time_value = ea$DT$version[integer(0)] - ) - ) - expect_identical( - ea %>% - group_by(geo_value) %>% - epix_slide_empty(all_versions = TRUE), - tibble::tibble( - geo_value = ea$DT$geo_value[integer(0)], - time_value = ea$DT$version[integer(0)] - ) %>% - group_by(geo_value) - ) -}) - -# test_that("epix_slide grouped by geo can produce `epi_df` output", { -# # This is a characterization test. Not sure we actually want this behavior; -# # https://github.com/cmu-delphi/epiprocess/pull/290#issuecomment-1489099157 -# expect_identical( -# ea %>% -# group_by(geo_value) %>% -# epix_slide(before = 5L, function(x,g) { -# tibble::tibble(value = 42) -# }, names_sep = NULL), -# tibble::tibble( -# geo_value = "x", -# time_value = epix_slide_ref_time_values_default(ea), -# value = 42 -# ) %>% -# new_epi_df(as_of = ea$versions_end) -# ) -# }) - -test_that("epix_slide alerts if the provided f doesn't take enough args", { - f_xgt <- function(x, g, t) dplyr::tibble(value = mean(x$binary), count = length(x$binary)) - # If `regexp` is NA, asserts that there should be no errors/messages. - expect_error(epix_slide2(xx, f = f_xgt, before = 2L), regexp = NA) - expect_warning(epix_slide2(xx, f = f_xgt, before = 2L), regexp = NA) - - f_x_dots <- function(x, ...) 
dplyr::tibble(value = mean(x$binary), count = length(x$binary)) - expect_warning(epix_slide2(xx, f_x_dots, before = 2L), - class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots" - ) -}) - -test_that("epix_slide2 computation via formula can use ref_time_value", { - xx_ref <- tibble( - geo_value = rep("x", 4), - time_value = c(4, 5, 6, 7), - slide_value = c(4, 5, 6, 7) - ) %>% - group_by(geo_value) - - xx1 <- xx %>% - group_by(.data$geo_value) %>% - epix_slide2( - f = ~.ref_time_value, - before = 2 - ) - - expect_identical(xx1, xx_ref) - - xx2 <- xx %>% - group_by(.data$geo_value) %>% - epix_slide2( - f = ~.z, - before = 2 - ) - - expect_identical(xx2, xx_ref) - - xx3 <- xx %>% - group_by(.data$geo_value) %>% - epix_slide2( - f = ~..3, - before = 2 - ) - - expect_identical(xx3, xx_ref) -}) - -test_that("epix_slide2 computation via function can use ref_time_value", { - xx_ref <- tibble( - geo_value = rep("x", 4), - time_value = c(4, 5, 6, 7), - slide_value = c(4, 5, 6, 7) - ) %>% - group_by(geo_value) - - xx1 <- xx %>% - group_by(.data$geo_value) %>% - epix_slide2( - f = function(x, g, t) t, - before = 2 - ) - - expect_identical(xx1, xx_ref) -}) - -test_that("epix_slide2 computation via dots can use ref_time_value and group", { - # ref_time_value - xx_ref <- tibble( - geo_value = rep("x", 4), - time_value = c(4, 5, 6, 7), - slide_value = c(4, 5, 6, 7) - ) %>% - group_by(geo_value) - - xx1 <- xx %>% - group_by(.data$geo_value) %>% - epix_slide2( - before = 2, - slide_value = .ref_time_value - ) - - expect_identical(xx1, xx_ref) - - # group_key - xx_ref <- tibble( - geo_value = rep("x", 4), - time_value = c(4, 5, 6, 7), - slide_value = "x" - ) %>% - group_by(geo_value) - - # Use group_key column - xx3 <- xx %>% - group_by(.data$geo_value) %>% - epix_slide2( - before = 2, - slide_value = .group_key$geo_value - ) - - expect_identical(xx3, xx_ref) - - # Use entire group_key object - expect_error( - xx %>% - group_by(.data$geo_value) %>% - epix_slide2( - before = 2, - slide_value = nrow(.group_key) - ), - NA - ) -}) - -test_that("epix_slide2 computation via dots outputs the same result using col names and the data var", { - xx_ref <- xx %>% - group_by(.data$geo_value) %>% - epix_slide2( - before = 2, - sum_binary = sum(time_value) - ) - - xx1 <- xx %>% - group_by(.data$geo_value) %>% - epix_slide2( - before = 2, - sum_binary = sum(.x$time_value) - ) - - expect_identical(xx1, xx_ref) - - xx2 <- xx %>% - group_by(.data$geo_value) %>% - epix_slide2( - before = 2, - sum_binary = sum(.data$time_value) - ) - - expect_identical(xx2, xx_ref) -}) - -test_that("`epix_slide2` doesn't decay date output", { - expect_true( - xx$DT %>% - as_tibble() %>% - mutate(across(c(time_value, version), ~ as.Date("2000-01-01") + .x - 1L)) %>% - as_epi_archive2() %>% - epix_slide2(before = 5L, ~ attr(.x, "metadata")$as_of) %>% - `[[`("slide_value") %>% - inherits("Date") - ) -}) - -test_that("`epix_slide2` can access objects inside of helper functions", { - helper <- function(archive_haystack, time_value_needle) { - archive_haystack %>% epix_slide2(has_needle = time_value_needle %in% time_value, before = 365000L) - } - expect_error( - helper(archive_cases_dv_subset_2, as.Date("2021-01-01")), - NA - ) - expect_error( - helper(xx, 3L), - NA - ) -}) diff --git a/tests/testthat/test-grouped_epi_archive.R b/tests/testthat/test-grouped_epi_archive.R index 45251a89..413741aa 100644 --- a/tests/testthat/test-grouped_epi_archive.R +++ b/tests/testthat/test-grouped_epi_archive.R @@ -16,10 +16,6 @@ 
test_that("Grouping, regrouping, and ungrouping archives works as intended", { ) %>% as_epi_archive(other_keys = "age_group") - # Ensure that we're using testthat edition 3's idea of "identical", which is - # not as strict as `identical`: - testthat::local_edition(3) - # Test equivalency claims in example: by_both_keys <- toy_archive %>% group_by(geo_value, age_group) expect_identical( diff --git a/tests/testthat/test-grouped_epi_archive_new.R b/tests/testthat/test-grouped_epi_archive_new.R deleted file mode 100644 index 8f0133b9..00000000 --- a/tests/testthat/test-grouped_epi_archive_new.R +++ /dev/null @@ -1,104 +0,0 @@ -test_that("Grouping, regrouping, and ungrouping archives works as intended", { - # From an example: - library(dplyr) - toy_archive <- - tribble( - ~geo_value, ~age_group, ~time_value, ~version, ~value, - "us", "adult", "2000-01-01", "2000-01-02", 121, - "us", "pediatric", "2000-01-02", "2000-01-03", 5, # (addition) - "us", "adult", "2000-01-01", "2000-01-03", 125, # (revision) - "us", "adult", "2000-01-02", "2000-01-03", 130 # (addition) - ) %>% - mutate( - age_group = ordered(age_group, c("pediatric", "adult")), - time_value = as.Date(time_value), - version = as.Date(version) - ) %>% - as_epi_archive2(other_keys = "age_group") - - # Ensure that we're using testthat edition 3's idea of "identical", which is - # not as strict as `identical`: - testthat::local_edition(3) - - # Test equivalency claims in example: - by_both_keys <- toy_archive %>% group_by(geo_value, age_group) - expect_identical( - by_both_keys, - toy_archive %>% group_by(geo_value) %>% group_by(age_group, .add = TRUE) - ) - grouping_cols <- c("geo_value", "age_group") - expect_identical( - by_both_keys, - toy_archive %>% group_by(across(all_of(grouping_cols))) - ) - - expect_identical( - toy_archive %>% group_by(geo_value), - toy_archive %>% group_by(geo_value, age_group) %>% ungroup(age_group) - ) - - # Test `.drop` behavior: - expect_error(toy_archive %>% group_by(.drop = "bogus"), - regexp = "Must be of type 'logical', not 'character'" - ) - expect_warning(toy_archive %>% group_by(.drop = FALSE), - class = "epiprocess__group_by_epi_archive__drop_FALSE_no_factors" - ) - expect_warning(toy_archive %>% group_by(geo_value, .drop = FALSE), - class = "epiprocess__group_by_epi_archive__drop_FALSE_no_factors" - ) - expect_warning( - grouped_factor_then_nonfactor <- - toy_archive %>% group_by(age_group, geo_value, .drop = FALSE), - class = "epiprocess__group_by_epi_archive__drop_FALSE_nonfactor_after_factor" - ) - expect_identical( - grouped_factor_then_nonfactor %>% - epix_slide2(before = 10, s = sum(value)), - tibble::tribble( - ~age_group, ~geo_value, ~time_value, ~s, - "pediatric", NA_character_, "2000-01-02", 0, - "adult", "us", "2000-01-02", 121, - "pediatric", "us", "2000-01-03", 5, - "adult", "us", "2000-01-03", 255 - ) %>% - mutate( - age_group = ordered(age_group, c("pediatric", "adult")), - time_value = as.Date(time_value) - ) %>% - # # See - # # https://github.com/cmu-delphi/epiprocess/pull/290#issuecomment-1489099157 - # # and - # # https://github.com/cmu-delphi/epiprocess/pull/311#issuecomment-1535149256 - # # for why this is commented out, pending some design - # # decisions. 
- # # - # as_epi_df(geo_type = "nation", # bug; want "custom" from NA; issue #242 - # as_of = as.Date("2000-01-03"), - # additional_metadata = list(other_keys = "age_group")) %>% - # # put back in expected order; see issue #166: - # select(age_group, geo_value, time_value, s) %>% - group_by(age_group, geo_value, .drop = FALSE) - ) - expect_identical( - toy_archive %>% - group_by(geo_value, age_group, .drop = FALSE) %>% - epix_slide2(before = 10, s = sum(value)), - tibble::tribble( - ~geo_value, ~age_group, ~time_value, ~s, - "us", "pediatric", "2000-01-02", 0, - "us", "adult", "2000-01-02", 121, - "us", "pediatric", "2000-01-03", 5, - "us", "adult", "2000-01-03", 255 - ) %>% - mutate( - age_group = ordered(age_group, c("pediatric", "adult")), - time_value = as.Date(time_value) - ) %>% - # as_epi_df(as_of = as.Date("2000-01-03"), - # additional_metadata = list(other_keys = "age_group")) %>% - # # put back in expected order; see issue #166: - # select(geo_value, age_group, time_value, s) %>% - group_by(geo_value, age_group, .drop = FALSE) - ) -}) diff --git a/tests/testthat/test-methods-epi_archive.R b/tests/testthat/test-methods-epi_archive.R index 7ab63f19..5be5330f 100644 --- a/tests/testthat/test-methods-epi_archive.R +++ b/tests/testthat/test-methods-epi_archive.R @@ -1,6 +1,7 @@ library(dplyr) -ea <- archive_cases_dv_subset$clone() +ea <- archive_cases_dv_subset %>% + clone() ea2_data <- tibble::tribble( ~geo_value, ~time_value, ~version, ~cases, @@ -17,35 +18,27 @@ ea2_data <- tibble::tribble( ) %>% dplyr::mutate(dplyr::across(c(time_value, version), as.Date)) -# epix_as_of tests -test_that("epix_as_of behaves identically to as_of method", { - expect_identical( - epix_as_of(ea, max_version = min(ea$DT$version)), - ea$as_of(max_version = min(ea$DT$version)) - ) -}) - -test_that("Errors are thrown due to bad as_of inputs", { +test_that("Errors are thrown due to bad epix_as_of inputs", { # max_version cannot be of string class rather than date class - expect_error(ea$as_of("2020-01-01")) + expect_error(ea %>% epix_as_of("2020-01-01")) # max_version cannot be later than latest version - expect_error(ea$as_of(as.Date("2025-01-01"))) + expect_error(ea %>% epix_as_of(as.Date("2025-01-01"))) # max_version cannot be a vector - expect_error(ea$as_of(c(as.Date("2020-01-01"), as.Date("2020-01-02")))) + expect_error(ea %>% epix_as_of(c(as.Date("2020-01-01"), as.Date("2020-01-02")))) }) test_that("Warning against max_version being clobberable", { # none by default - expect_warning(regexp = NA, ea$as_of(max_version = max(ea$DT$version))) - expect_warning(regexp = NA, ea$as_of(max_version = min(ea$DT$version))) + expect_warning(regexp = NA, ea %>% epix_as_of(max_version = max(ea$DT$version))) + expect_warning(regexp = NA, ea %>% epix_as_of(max_version = min(ea$DT$version))) # but with `clobberable_versions_start` non-`NA`, yes - ea_with_clobberable <- ea$clone() + ea_with_clobberable <- ea %>% clone() ea_with_clobberable$clobberable_versions_start <- max(ea_with_clobberable$DT$version) - expect_warning(ea_with_clobberable$as_of(max_version = max(ea$DT$version))) - expect_warning(regexp = NA, ea_with_clobberable$as_of(max_version = min(ea$DT$version))) + expect_warning(ea_with_clobberable %>% epix_as_of(max_version = max(ea$DT$version))) + expect_warning(regexp = NA, ea_with_clobberable %>% epix_as_of(max_version = min(ea$DT$version))) }) -test_that("as_of properly grabs the data and doesn't mutate key", { +test_that("epix_as_of properly grabs the data and doesn't mutate key", { d <- 
as.Date("2020-06-01") ea2 <- ea2_data %>% @@ -99,7 +92,7 @@ test_that("epix_truncate_version_after doesn't filter if max_verion at latest ve ea2 <- ea2_data %>% as_epi_archive() - ea_expected <- ea2$clone() + ea_expected <- ea2 %>% clone() ea_as_of <- ea2 %>% epix_truncate_versions_after(max_version = as.Date("2020-06-04")) @@ -114,7 +107,7 @@ test_that("epix_truncate_version_after returns the same grouping type as input e epix_truncate_versions_after(max_version = as.Date("2020-06-04")) expect_true(is_epi_archive(ea_as_of, grouped_okay = FALSE)) - ea2_grouped <- ea2$group_by(geo_value) + ea2_grouped <- ea2 %>% group_by(geo_value) ea_as_of <- ea2_grouped %>% epix_truncate_versions_after(max_version = as.Date("2020-06-04")) @@ -125,11 +118,11 @@ test_that("epix_truncate_version_after returns the same grouping type as input e test_that("epix_truncate_version_after returns the same groups as input grouped_epi_archive", { ea2 <- ea2_data %>% as_epi_archive() - ea2 <- ea2$group_by(geo_value) + ea2 <- ea2 %>% group_by(geo_value) - ea_expected <- ea2$clone() + ea_expected <- ea2 %>% clone() ea_as_of <- ea2 %>% epix_truncate_versions_after(max_version = as.Date("2020-06-04")) - expect_equal(ea_as_of$groups(), ea_expected$groups()) + expect_equal(ea_as_of %>% groups(), ea_expected %>% groups()) }) diff --git a/tests/testthat/test-methods-epi_archive_new.R b/tests/testthat/test-methods-epi_archive_new.R deleted file mode 100644 index eb2c14be..00000000 --- a/tests/testthat/test-methods-epi_archive_new.R +++ /dev/null @@ -1,136 +0,0 @@ -library(dplyr) - -ea <- archive_cases_dv_subset_2 %>% - clone() - -ea2_data <- tibble::tribble( - ~geo_value, ~time_value, ~version, ~cases, - "ca", "2020-06-01", "2020-06-01", 1, - "ca", "2020-06-01", "2020-06-02", 2, - # - "ca", "2020-06-02", "2020-06-02", 0, - "ca", "2020-06-02", "2020-06-03", 1, - "ca", "2020-06-02", "2020-06-04", 2, - # - "ca", "2020-06-03", "2020-06-03", 1, - # - "ca", "2020-06-04", "2020-06-04", 4, -) %>% - dplyr::mutate(dplyr::across(c(time_value, version), as.Date)) - -# epix_as_of tests -test_that("epix_as_of behaves identically to as_of method", { - expect_identical( - epix_as_of2(ea, max_version = min(ea$DT$version)), - ea %>% as_of(max_version = min(ea$DT$version)) - ) -}) - -test_that("Errors are thrown due to bad as_of inputs", { - # max_version cannot be of string class rather than date class - expect_error(ea %>% as_of("2020-01-01")) - # max_version cannot be later than latest version - expect_error(ea %>% as_of(as.Date("2025-01-01"))) - # max_version cannot be a vector - expect_error(ea %>% as_of(c(as.Date("2020-01-01"), as.Date("2020-01-02")))) -}) - -test_that("Warning against max_version being clobberable", { - # none by default - expect_warning(regexp = NA, ea %>% as_of(max_version = max(ea$DT$version))) - expect_warning(regexp = NA, ea %>% as_of(max_version = min(ea$DT$version))) - # but with `clobberable_versions_start` non-`NA`, yes - ea_with_clobberable <- ea %>% clone() - ea_with_clobberable$clobberable_versions_start <- max(ea_with_clobberable$DT$version) - expect_warning(ea_with_clobberable %>% as_of(max_version = max(ea$DT$version))) - expect_warning(regexp = NA, ea_with_clobberable %>% as_of(max_version = min(ea$DT$version))) -}) - -test_that("as_of properly grabs the data and doesn't mutate key", { - d <- as.Date("2020-06-01") - - ea2 <- ea2_data %>% - as_epi_archive2() - - old_key <- data.table::key(ea2$DT) - - edf_as_of <- ea2 %>% - epix_as_of2(max_version = as.Date("2020-06-03")) - - edf_expected <- as_epi_df(tibble( 
- geo_value = "ca", - time_value = d + 0:2, - cases = c(2, 1, 1) - ), as_of = as.Date("2020-06-03")) - - expect_equal(edf_as_of, edf_expected, ignore_attr = c(".internal.selfref", "sorted")) - expect_equal(data.table::key(ea2$DT), old_key) -}) - -test_that("Errors are thrown due to bad epix_truncate_versions_after inputs", { - # x must be an archive - expect_error(epix_truncate_versions_after(data.frame(), as.Date("2020-01-01"))) - # max_version cannot be of string class rather than date class - expect_error(epix_truncate_versions_after(ea, "2020-01-01")) - # max_version cannot be a vector - expect_error(epix_truncate_versions_after(ea, c(as.Date("2020-01-01"), as.Date("2020-01-02")))) - # max_version cannot be missing - expect_error(epix_truncate_versions_after(ea, as.Date(NA))) - # max_version cannot be after latest version in archive - expect_error(epix_truncate_versions_after(ea, as.Date("2025-01-01"))) -}) - -test_that("epix_truncate_version_after properly grabs the data and doesn't mutate key", { - ea2 <- ea2_data %>% - as_epi_archive2() - - old_key <- data.table::key(ea2$DT) - - ea_as_of <- ea2 %>% - epix_truncate_versions_after(max_version = as.Date("2020-06-02")) - - ea_expected <- ea2_data[1:3, ] %>% - as_epi_archive2() - - expect_equal(ea_as_of, ea_expected, ignore_attr = c(".internal.selfref", "sorted")) - expect_equal(data.table::key(ea2$DT), old_key) -}) - -test_that("epix_truncate_version_after doesn't filter if max_verion at latest version", { - ea2 <- ea2_data %>% - as_epi_archive2() - - ea_expected <- ea2 %>% clone() - - ea_as_of <- ea2 %>% - epix_truncate_versions_after(max_version = as.Date("2020-06-04")) - expect_equal(ea_as_of, ea_expected, ignore_attr = c(".internal.selfref", "sorted")) -}) - -test_that("epix_truncate_version_after returns the same grouping type as input epi_archive", { - ea2 <- ea2_data %>% - as_epi_archive2() - - ea_as_of <- ea2 %>% - epix_truncate_versions_after(max_version = as.Date("2020-06-04")) - expect_true(is_epi_archive2(ea_as_of, grouped_okay = FALSE)) - - ea2_grouped <- ea2 %>% group_by(geo_value) - - ea_as_of <- ea2_grouped %>% - epix_truncate_versions_after(max_version = as.Date("2020-06-04")) - expect_true(is_grouped_epi_archive2(ea_as_of)) -}) - - -test_that("epix_truncate_version_after returns the same groups as input grouped_epi_archive", { - ea2 <- ea2_data %>% - as_epi_archive2() - ea2 <- ea2 %>% group_by(geo_value) - - ea_expected <- ea2 %>% clone() - - ea_as_of <- ea2 %>% - epix_truncate_versions_after(max_version = as.Date("2020-06-04")) - expect_equal(ea_as_of %>% groups(), ea_expected %>% groups()) -}) diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R index 83cc07f6..dbe15450 100644 --- a/tests/testthat/test-utils.R +++ b/tests/testthat/test-utils.R @@ -1,13 +1,3 @@ -test_that("new summarizing functions work", { - x <- c(3, 4, 5, 9, NA) - expect_equal(min_na_rm(x), 3) -}) - -test_that("Other capital letter functions work", { - x <- c(1, 2, 3, 4, 5) - expect_equal(extend_r(x), c(1, 2, 3, 4, 5, 5)) -}) - test_that("guess_geo_type tests for different types of geo_value's", { # California, New York states <- c("ca", "ny") diff --git a/vignettes/advanced.Rmd b/vignettes/advanced.Rmd index c010c1f3..1ea13c5f 100644 --- a/vignettes/advanced.Rmd +++ b/vignettes/advanced.Rmd @@ -9,14 +9,13 @@ vignette: > In this vignette, we discuss how to use the sliding functionality in the `epiprocess` package with less common grouping schemes or with computations that -have advanced output structures. 
-The output of a slide computation should either be an atomic value/vector, or a
-data frame. This data frame can have multiple columns, multiple rows, or both.
+have advanced output structures. The output of a slide computation should either
+be an atomic value/vector, or a data frame. This data frame can have multiple
+columns, multiple rows, or both.
 
 During basic usage (e.g., when all optional arguments are set to their defaults):
 
 * `epi_slide(edf, <computation>, .....)`:
-
   * keeps **all** columns of `edf`, adds computed column(s)
   * outputs **one row per row in `edf`** (recycling outputs from
     computations appropriately if there are multiple time series bundled
@@ -26,9 +25,7 @@ During basic usage (e.g., when all optional arguments are set to their defaults)
     `dplyr::arrange(time_value, .by_group = TRUE)`**
   * outputs an **`epi_df`** if the required columns are present, otherwise a
     tibble
-
 * `epix_slide(ea, <computation>, .....)`:
-
   * keeps **grouping and `time_value`** columns of `ea`, adds computed
     column(s)
   * outputs **any number of rows** (computations are allowed to output any
@@ -40,6 +37,7 @@ During basic usage (e.g., when all optional arguments are set to their defaults)
   * outputs a **tibble**
 
 These differences in basic behavior make some common slide operations require
 less boilerplate:
+
 * predictors and targets calculated with `epi_slide` are automatically lined up
   with each other and with the signals from which they were calculated; and
 * computations for an `epix_slide` can output data frames with any number of
@@ -84,13 +82,14 @@ simple synthetic example.
 
 ```{r message = FALSE}
 library(epiprocess)
 library(dplyr)
+set.seed(123)
 
 edf <- tibble(
   geo_value = rep(c("ca", "fl", "pa"), each = 3),
   time_value = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"), by = "day"), length.out = length(geo_value)),
   x = seq_along(geo_value) + 0.01 * rnorm(length(geo_value)),
 ) %>%
-  as_epi_df()
+  as_epi_df(as_of = as.Date("2024-03-20"))
 
 # 2-day trailing average, per geo value
 edf %>%
@@ -111,17 +110,17 @@ edf %>%
 edf %>%
   # pretend that observations about time_value t are reported in version t (nowcasts)
   mutate(version = time_value) %>%
-  as_epi_archive2() %>%
+  as_epi_archive() %>%
   group_by(geo_value) %>%
-  epix_slide2(x_2dav = mean(x), before = 1, ref_time_values = as.Date("2020-06-02")) %>%
+  epix_slide(x_2dav = mean(x), before = 1, ref_time_values = as.Date("2020-06-02")) %>%
   ungroup()
 
 edf %>%
   # pretend that observations about time_value t are reported in version t (nowcasts)
   mutate(version = time_value) %>%
-  as_epi_archive2() %>%
+  as_epi_archive() %>%
   group_by(geo_value) %>%
-  epix_slide2(~ mean(.x$x), before = 1, ref_time_values = as.Date("2020-06-02")) %>%
+  epix_slide(~ mean(.x$x), before = 1, ref_time_values = as.Date("2020-06-02")) %>%
   ungroup()
 ```
 
@@ -219,9 +218,9 @@ edf %>%
 edf %>%
   mutate(version = time_value) %>%
-  as_epi_archive2() %>%
+  as_epi_archive() %>%
   group_by(geo_value) %>%
-  epix_slide2(
+  epix_slide(
     a = data.frame(x_2dav = mean(x), x_2dma = mad(x)),
     ref_time_values = as.Date("2020-06-02"), before = 1,
     as_list_col = FALSE, names_sep = NULL
@@ -317,17 +316,17 @@ x <- y1 %>%
     version = issue,
     percent_cli = value
   ) %>%
-  as_epi_archive2(compactify = FALSE)
 
+  as_epi_archive(compactify = FALSE)
 
 # mutating merge operation:
-x <- epix_merge2(
+x <- epix_merge(
   x,
   y2 %>%
     select(geo_value, time_value,
       version = issue,
       case_rate_7d_av = value
    ) %>%
-    as_epi_archive2(compactify = FALSE),
+    as_epi_archive(compactify = FALSE),
   sync = "locf",
   compactify = FALSE
 )
 ```
 
 ```{r, message = FALSE, warning = FALSE}
 library(data.table)
library(ggplot2) theme_set(theme_bw()) -x <- archive_cases_dv_subset_2$DT %>% +x <- archive_cases_dv_subset$DT %>% filter(geo_value %in% c("ca", "fl")) %>% - as_epi_archive2(compactify = FALSE) + as_epi_archive(compactify = FALSE) ``` Next, we extend the ARX function to handle multiple geo values, since in the @@ -458,7 +457,7 @@ data. ```{r, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 6} # Latest snapshot of data, and forecast dates -x_latest <- epix_as_of2(x, max_version = max(x$DT$version)) +x_latest <- epix_as_of(x, max_version = max(x$DT$version)) fc_time_values <- seq(as.Date("2020-08-01"), as.Date("2021-11-30"), by = "1 month" @@ -468,7 +467,7 @@ fc_time_values <- seq(as.Date("2020-08-01"), k_week_ahead <- function(x, ahead = 7, as_of = TRUE) { if (as_of) { x %>% - epix_slide2( + epix_slide( fc = prob_arx(.data$percent_cli, .data$case_rate_7d_av, .data$geo_value, .data$time_value, args = prob_arx_args(ahead = ahead) ), @@ -525,10 +524,7 @@ separate ARX model on each state. As in the archive vignette, we can see a difference between version-aware (right column) and -unaware (left column) forecasting, as well. - ## Attribution The `case_rate_7d_av` data used in this document is a modified part of the [COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19) as [republished in the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html). This data set is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/) by the Johns Hopkins University on behalf of its Center for Systems Science in Engineering. Copyright Johns Hopkins University 2020. The `percent_cli` data is a modified part of the [COVIDcast Epidata API Doctor Visits data](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/doctor-visits.html). This dataset is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/). Copyright Delphi Research Group at Carnegie Mellon University 2020. - - diff --git a/vignettes/aggregation.Rmd b/vignettes/aggregation.Rmd index 205ed084..dca595ff 100644 --- a/vignettes/aggregation.Rmd +++ b/vignettes/aggregation.Rmd @@ -34,7 +34,7 @@ x <- pub_covidcast( ) %>% select(geo_value, time_value, cases = value) %>% full_join(y, by = "geo_value") %>% - as_epi_df() + as_epi_df(as_of = as.Date("2024-03-20")) ``` The data contains 16,212 rows and 5 columns. @@ -192,7 +192,7 @@ running `epi_slide()` on the zero-filled data brings these trailing averages ```{r} xt %>% - as_epi_df() %>% + as_epi_df(as_of = as.Date("2024-03-20")) %>% group_by(geo_value) %>% epi_slide(cases_7dav = mean(cases), before = 6) %>% ungroup() %>% @@ -203,7 +203,7 @@ xt %>% print(n = 7) xt_filled %>% - as_epi_df() %>% + as_epi_df(as_of = as.Date("2024-03-20")) %>% group_by(geo_value) %>% epi_slide(cases_7dav = mean(cases), before = 6) %>% ungroup() %>% diff --git a/vignettes/archive.Rmd b/vignettes/archive.Rmd index 0b57d639..6193981a 100644 --- a/vignettes/archive.Rmd +++ b/vignettes/archive.Rmd @@ -7,16 +7,16 @@ vignette: > %\VignetteEncoding{UTF-8} --- -In addition to the `epi_df` data structure, which we have been working with all -along in these vignettes, the `epiprocess` package has a companion structure -called `epi_archive`. 
In comparison to an `epi_df` object, which can be seen as
-storing a single snapshot of a data set with the most up-to-date signal values
-as of some given time, an `epi_archive` object stores the full version history
-of a data set. Many signals of interest for epidemiological tracking are subject
-to revision (some more than others), and paying attention to data revisions can
-be important for all sorts of downstream data analysis and modeling tasks.
-
-This vignette walks through working with `epi_archive` objects and demonstrates
+In addition to the `epi_df` data structure, the `epiprocess` package has a
+companion structure called `epi_archive`. In comparison to an `epi_df` object,
+which can be seen as storing a single snapshot of a data set with the most
+up-to-date signal values as of some given time, an `epi_archive` object stores
+the full version history of a data set. Many signals of interest for
+epidemiological tracking are subject to revision (some more than others) and
+paying attention to data revisions can be important for all sorts of downstream
+data analysis and modeling tasks.
+
+This vignette walks through working with `epi_archive()` objects and demonstrates
 some of their key functionality. We'll work with a signal on the percentage of
 doctor's visits with CLI (COVID-like illness) computed from medical insurance
 claims, available through the [COVIDcast
@@ -55,9 +55,8 @@ library(ggplot2)
 
 ## Getting data into `epi_archive` format
 
-An epi_archive object
-can be constructed from a data frame, data table, or tibble, provided that it
-has (at least) the following columns:
+An `epi_archive()` object can be constructed from a data frame, data table, or
+tibble, provided that it has (at least) the following columns:
 
 * `geo_value`: the geographic value associated with each row of measurements.
 * `time_value`: the time value associated with each row of measurements.
@@ -71,30 +70,30 @@ As we can see from the above, the data frame returned by
 format, with `issue` playing the role of `version`. We can now use
 `as_epi_archive()` to bring it into `epi_archive` format. For removal of
 redundant version updates in `as_epi_archive` using compactify, please refer to
-the compactify vignette.
+the [compactify vignette](articles/compactify.html).
 
 ```{r, eval=FALSE}
 x <- dv %>%
   select(geo_value, time_value, version = issue, percent_cli = value) %>%
-  as_epi_archive2(compactify = TRUE)
+  as_epi_archive(compactify = TRUE)
 
 class(x)
 print(x)
 ```
 
 ```{r, echo=FALSE, message=FALSE, warning=FALSE}
-x <- archive_cases_dv_subset_2$DT %>%
+x <- archive_cases_dv_subset$DT %>%
   select(geo_value, time_value, version, percent_cli) %>%
-  as_epi_archive2(compactify = TRUE)
+  as_epi_archive(compactify = TRUE)
 
 class(x)
 print(x)
 ```
 
-An `epi_archive` is special kind of class called an R6 class. Its primary field
-is a data table `DT`, which is of class `data.table` (from the `data.table`
-package), and has columns `geo_value`, `time_value`, `version`, as well as any
-number of additional columns.
+An `epi_archive` consists of a primary field `DT`, which is a data table
+(from the `data.table` package) that has the columns `geo_value`, `time_value`,
+`version` (and possibly additional ones), and other metadata fields, such as
+`geo_type` and `time_type`.
 
 ```{r}
 class(x$DT)
@@ -112,9 +111,7 @@ key(x$DT)
 ```
 
 In general, the last version of each observation is carried forward (LOCF) to
-fill in data between recorded versions. 
**A word of caution:** R6 objects,
-unlike most other objects in R, have reference semantics. An important
-consequence of this is that objects are not copied when modified.
+fill in data between recorded versions.
 
 ```{r}
 original_value <- x$DT$percent_cli[1]
@@ -125,10 +122,6 @@ head(x$DT)
 x$DT$percent_cli[1] <- original_value
 ```
 
-To make a copy, we can use the `clone()` method for an R6 class, as in `y <-
-x$clone()`. You can read more about reference semantics in Hadley Wickham's
-[Advanced R](https://adv-r.hadley.nz/r6.html#r6-semantics) book.
-
 ## Some details on metadata
 
 The following pieces of metadata are included as fields in an `epi_archive`
@@ -146,15 +139,12 @@ call (as it did in the case above).
 
 ## Producing snapshots in `epi_df` form
 
-A key method of an `epi_archive` class is `as_of()`, which generates a snapshot
+A key method of an `epi_archive` class is `epix_as_of()`, which generates a snapshot
 of the archive in `epi_df` format. This represents the most up-to-date values of
-the signal variables as of a given version. This can be accessed via `x$as_of()`
-for an `epi_archive` object `x`, but the package also provides a simple wrapper
-function `epix_as_of()` since this is likely a more familiar interface for users
-not familiar with R6 (or object-oriented programming).
+the signal variables as of a given version.
 
 ```{r}
-x_snapshot <- epix_as_of2(x, max_version = as.Date("2021-06-01"))
+x_snapshot <- epix_as_of(x, max_version = as.Date("2021-06-01"))
 class(x_snapshot)
 head(x_snapshot)
 max(x_snapshot$time_value)
@@ -174,7 +164,7 @@ this case, since updates to the current version may still come in at a later
 point in time, due to various reasons, such as synchronization issues.
 
 ```{r}
-x_latest <- epix_as_of2(x, max_version = max(x$DT$version))
+x_latest <- epix_as_of(x, max_version = max(x$DT$version))
 ```
 
 Below, we pull several snapshots from the archive, spaced one month apart. We
@@ -188,7 +178,7 @@ theme_set(theme_bw())
 self_max <- max(x$DT$version)
 versions <- seq(as.Date("2020-06-01"), self_max - 1, by = "1 month")
 snapshots <- map_dfr(versions, function(v) {
-  epix_as_of2(x, max_version = v) %>% mutate(version = v)
+  epix_as_of(x, max_version = v) %>% mutate(version = v)
 }) %>%
   bind_rows(
     x_latest %>% mutate(version = self_max)
@@ -219,19 +209,14 @@ they overestimate it (both states towards the beginning of 2021), though not
 quite as dramatically. Modeling the revision process, which is often called
 *backfill modeling*, is an important statistical problem in and of itself.
 
-
-
 ## Merging `epi_archive` objects
 
 Now we demonstrate how to merge two `epi_archive` objects together, e.g., so
 that grabbing data from multiple sources as of a particular version can be
-performed with a single `as_of` call. The `epi_archive` class provides a method
-`merge()` precisely for this purpose. The wrapper function is called
-`epix_merge()`; this wrapper avoids mutating its inputs, while `x$merge` will
-mutate `x`. Below we merge the working `epi_archive` of versioned percentage CLI
-from outpatient visits to another one of versioned COVID-19 case reporting data,
-which we fetch the from the [COVIDcast
+performed with a single `epix_as_of` call. The function `epix_merge()` is made
+for this purpose. 
Below we merge the working `epi_archive` of versioned
+percentage CLI from outpatient visits to another one of versioned COVID-19 case
+reporting data, which we fetch from the [COVIDcast
 API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast.html/), on the
 rate scale (counts per 100,000 people in the population).
 
@@ -258,34 +243,26 @@ y <- pub_covidcast(
   issues = epirange(20200601, 20211201)
 ) %>%
   select(geo_value, time_value, version = issue, case_rate_7d_av = value) %>%
-  as_epi_archive2(compactify = TRUE)
+  as_epi_archive(compactify = TRUE)
 
-x <- epix_merge2(x, y, sync = "locf", compactify = TRUE)
+x <- epix_merge(x, y, sync = "locf", compactify = TRUE)
 print(x)
 head(x$DT)
 ```
 
 ```{r, echo=FALSE, message=FALSE, warning=FALSE}
-x <- archive_cases_dv_subset_2
+x <- archive_cases_dv_subset
 print(x)
 head(x$DT)
 ```
 
-Importantly, see that `x$merge` mutated `x` to hold the result of the merge. We
-could also have used `xy = epix_merge(x,y)` to avoid mutating `x`. See the
-documentation for either for more detailed descriptions of what mutation,
-pointer aliasing, and pointer reseating is possible.
-
 ## Sliding version-aware computations
 
-Lastly, we demonstrate another key method of the `epi_archive` class, which is
-the `slide()` method. It works just like `epi_slide()` does for an `epi_df`
-object, but with one key difference: it performs version-aware computations.
-That is, for the computation at any given reference time t, it only uses **data
-that would have been available as of t**. The wrapper function is called
-`epix_slide()`; again, this is just for convenience/familiarity---and its
-interface is purposely designed mirror that of `epi_slide()` for `epi_df`
-objects.
+Lastly, we demonstrate another key method for archives, which is
+`epix_slide()`. It works just like `epi_slide()` does for an `epi_df` object,
+but with one key difference: it performs version-aware computations. That is,
+for the computation at any given reference time t, it only uses **data that
+would have been available as of t**.
 
 For the demonstration, we'll revisit the forecasting example from the [slide
 vignette](https://cmu-delphi.github.io/epiprocess/articles/slide.html), and now
@@ -362,7 +339,7 @@ fc_time_values <- seq(as.Date("2020-08-01"),
 
 z <- x %>%
   group_by(geo_value) %>%
-  epix_slide2(
+  epix_slide(
     fc = prob_arx(x = percent_cli, y = case_rate_7d_av),
     before = 119, ref_time_values = fc_time_values
   ) %>%
@@ -389,14 +366,14 @@ points in time and forecast horizons. The former comes from using `epi_slide()`
 to the latest snapshot of the data `x_latest`.
 
 ```{r, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 6}
-x_latest <- epix_as_of2(x, max_version = max(x$DT$version))
+x_latest <- epix_as_of(x, max_version = max(x$DT$version))
 
 # Simple function to produce forecasts k weeks ahead
 k_week_ahead <- function(x, ahead = 7, as_of = TRUE) {
   if (as_of) {
     x %>%
       group_by(.data$geo_value) %>%
-      epix_slide2(
+      epix_slide(
         fc = prob_arx(.data$percent_cli, .data$case_rate_7d_av, ahead = ahead),
         before = 119, ref_time_values = fc_time_values
       ) %>%
diff --git a/vignettes/compactify.Rmd b/vignettes/compactify.Rmd
index 0b68c73b..8579be6a 100644
--- a/vignettes/compactify.Rmd
+++ b/vignettes/compactify.Rmd
@@ -32,10 +32,10 @@ from the second from the third value included. 
library(epiprocess)
 library(dplyr)
 
-dt <- archive_cases_dv_subset_2$DT
+dt <- archive_cases_dv_subset$DT
 
-locf_omitted <- as_epi_archive2(dt)
-locf_included <- as_epi_archive2(dt, compactify = FALSE)
+locf_omitted <- as_epi_archive(dt)
+locf_included <- as_epi_archive(dt, compactify = FALSE)
 
 head(locf_omitted$DT)
 head(locf_included$DT)
@@ -48,8 +48,8 @@ LOCF-redundant values can mar the performance of dataset operations. As the colu
 
 ```{r}
 dt2 <- select(dt, -percent_cli)
 
-locf_included_2 <- as_epi_archive2(dt2, compactify = FALSE)
-locf_omitted_2 <- as_epi_archive2(dt2, compactify = TRUE)
+locf_included_2 <- as_epi_archive(dt2, compactify = FALSE)
+locf_omitted_2 <- as_epi_archive(dt2, compactify = TRUE)
 ```
 
 In this example, a huge proportion of the original version update data were
@@ -93,7 +93,7 @@ We would also like to measure the speed of `epi_archive` methods.
 # Performance of as_of iterated 1000 times
 iterate_as_of <- function(my_ea) {
   for (i in 1:1000) {
-    my_ea$as_of(min(my_ea$DT$time_value) + i - 1000)
+    my_ea %>% epix_as_of(min(my_ea$DT$time_value) + i - 1000)
   }
 }
 
@@ -101,11 +101,12 @@ speeds <- rbind(speeds, speed_test(iterate_as_of, "as_of_1000x"))
 
 # Performance of slide
 slide_median <- function(my_ea) {
-  my_ea$slide(median = median(.data$case_rate_7d_av), before = 7)
+  my_ea %>% epix_slide(median = median(.data$case_rate_7d_av), before = 7)
 }
 
 speeds <- rbind(speeds, speed_test(slide_median, "slide_median"))
 ```
+
 Here is a detailed performance comparison:
 
 ```{r}
diff --git a/vignettes/epiprocess.Rmd b/vignettes/epiprocess.Rmd
index 85b1e1f4..12020d89 100644
--- a/vignettes/epiprocess.Rmd
+++ b/vignettes/epiprocess.Rmd
@@ -125,7 +125,7 @@ and `time_value` columns, respectively, but inferring the `as_of` field is not
 as easy. See the documentation for `as_epi_df()` for more details.
 
 ```{r}
-x <- as_epi_df(cases) %>%
+x <- as_epi_df(cases, as_of = as.Date("2024-03-20")) %>%
   select(geo_value, time_value, total_cases = value)
 
 attributes(x)$metadata
@@ -169,7 +169,7 @@ data.frame(
   # misnamed
   reported_date = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"), by = "day"), length.out = length(geo_value)),
   value = seq_along(geo_value) + 0.01 * withr::with_rng_version("3.0.0", withr::with_seed(42, length(geo_value)))
-) %>% as_epi_df()
+) %>% as_epi_df(as_of = as.Date("2024-03-20"))
 ```
 
 The columns can be renamed to match `epi_df` format. In the example below, notice there is also an additional key `pol`. 
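As a minimal sketch of that renaming step (illustrative only, not part of the patch; the starting column names `state` and `reported_date` are assumptions for the example):

```r
library(dplyr)
library(epiprocess)

data.frame(
  state = rep(c("ca", "fl", "pa"), each = 3),
  reported_date = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"), by = "day"), 3),
  value = 1:9
) %>%
  # rename to the `geo_value`/`time_value` columns that `as_epi_df()` expects
  rename(geo_value = state, time_value = reported_date) %>%
  as_epi_df(as_of = as.Date("2024-03-20"))
```
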
@@ -220,7 +220,7 @@ ex3 <- ex3 %>% state = rep(tolower("MA"), 6), pol = rep(c("blue", "swing", "swing"), each = 2) ) %>% - as_epi_df(additional_metadata = list(other_keys = c("state", "pol"))) + as_epi_df(additional_metadata = list(other_keys = c("state", "pol")), as_of = as.Date("2024-03-20")) attr(ex3, "metadata") ``` @@ -256,7 +256,7 @@ cases in Canada in 2003, from the x <- outbreaks::sars_canada_2003 %>% mutate(geo_value = "ca") %>% select(geo_value, time_value = date, starts_with("cases")) %>% - as_epi_df(geo_type = "nation") + as_epi_df(geo_type = "nation", as_of = as.Date("2024-03-20")) head(x) @@ -303,7 +303,7 @@ x <- outbreaks::ebola_sierraleone_2014 %>% filter(cases == 1) %>% group_by(geo_value, time_value) %>% summarise(cases = sum(cases)) %>% - as_epi_df(geo_type = "province") + as_epi_df(geo_type = "province", as_of = as.Date("2024-03-20")) ggplot(x, aes(x = time_value, y = cases)) + geom_col(aes(fill = geo_value), show.legend = FALSE) + @@ -312,11 +312,8 @@ ggplot(x, aes(x = time_value, y = cases)) + labs(x = "Date", y = "Confirmed cases of Ebola in Sierra Leone") ``` - - ## Attribution This document contains a dataset that is a modified part of the [COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19) as [republished in the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html). This data set is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/) by the Johns Hopkins University on behalf of its Center for Systems Science in Engineering. Copyright Johns Hopkins University 2020. [From the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html): These signals are taken directly from the JHU CSSE [COVID-19 GitHub repository](https://github.com/CSSEGISandData/COVID-19) without changes. - From 183d0f1230f643ee1c355ee24b4c27e3de65a798 Mon Sep 17 00:00:00 2001 From: "Logan C. 
Brooks" Date: Mon, 22 Apr 2024 15:11:34 -0700 Subject: [PATCH 251/345] `new_archive` validation tightening, streamlining, type stability - Forbid `NA` `compactify` - Remove `missing` checks when `is.null` suffices - Remove redundant default code - Make local `other_keys` have more consistent typing across branches --- R/archive.R | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/R/archive.R b/R/archive.R index f871d239..7d9a9539 100644 --- a/R/archive.R +++ b/R/archive.R @@ -307,13 +307,13 @@ new_epi_archive <- function( } # If time type is missing, then try to guess it - if (missing(time_type) || is.null(time_type)) { + if (is.null(time_type)) { time_type <- guess_time_type(x$time_value) } # Finish off with small checks on keys variables and metadata - if (missing(other_keys)) other_keys <- NULL - if (missing(additional_metadata) || is.null(additional_metadata)) additional_metadata <- list() + if (is.null(other_keys)) other_keys <- character(0L) + if (is.null(additional_metadata)) additional_metadata <- list() if (!test_subset(other_keys, names(x))) { cli_abort("`other_keys` must be contained in the column names of `x`.") } @@ -325,17 +325,11 @@ new_epi_archive <- function( } # Conduct checks and apply defaults for `compactify` - if (missing(compactify)) { - compactify <- NULL - } - assert_logical(compactify, len = 1, null.ok = TRUE) + assert_logical(compactify, len = 1, any.missing = FALSE, null.ok = TRUE) # Apply defaults and conduct checks for # `clobberable_versions_start`, `versions_end`: - if (missing(clobberable_versions_start)) { - clobberable_versions_start <- NA - } - if (missing(versions_end) || is.null(versions_end)) { + if (is.null(versions_end)) { versions_end <- max_version_with_row_in(x) } validate_version_bound(clobberable_versions_start, x, na_ok = TRUE) From 7bc4735d7df9e26ca11d76561d30e52e7950d392 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Mon, 22 Apr 2024 15:55:47 -0700 Subject: [PATCH 252/345] Fix version_bound_arg validation issues - Validate length. - Tweak message regarding type since typeof is length 1. - Actually raise error if NA when NA not allowed. - Make tests check the source of the error, since not being specific + R configuration masked some of these issues. 
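
The classed conditions added here follow the usual rlang pattern: attaching a
condition class lets tests and callers match on the source of an error rather
than on its message text. A minimal sketch of that pattern (illustrative only,
not part of this patch; the class name mirrors one introduced below):

```r
library(rlang)

check_version_bound_length <- function(x) {
  if (length(x) != 1L) {
    # classed condition: catchable without matching message wording
    abort("version bound must have length 1",
      class = "epiprocess__version_bound_wrong_length"
    )
  }
  invisible(NULL)
}

# Callers and tests can now target the class rather than the message:
tryCatch(
  check_version_bound_length(c(2, 10)),
  epiprocess__version_bound_wrong_length = function(cnd) "caught wrong-length bound"
)
```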
--- R/archive.R | 37 ++++++++++++++------ tests/testthat/test-archive-version-bounds.R | 23 +++++++----- 2 files changed, 41 insertions(+), 19 deletions(-) diff --git a/R/archive.R b/R/archive.R index 7d9a9539..55de1132 100644 --- a/R/archive.R +++ b/R/archive.R @@ -28,23 +28,38 @@ validate_version_bound <- function(version_bound, x, na_ok = FALSE, x_arg = rlang::caller_arg(version_bound)) { if (is.null(version_bound)) { cli_abort( - "{version_bound_arg} cannot be NULL" + "{version_bound_arg} cannot be NULL", + class = "epiprocess__version_bound_null" ) } - if (na_ok && is.na(version_bound)) { - return(invisible(NULL)) - } - if (!test_set_equal(class(version_bound), class(x[["version"]]))) { + if (length(version_bound) != 1L) { cli_abort( - "{version_bound_arg} must have the same classes as x$version, - which is {class(x$version)}", + "{version_bound_arg} must have length of 1", + class = "epiprocess__version_bound_wrong_length" ) } - if (!test_set_equal(typeof(version_bound), typeof(x[["version"]]))) { - cli_abort( - "{version_bound_arg} must have the same types as x$version, + if (is.na(version_bound)) { + if (!na_ok) { + cli_abort( + "{version_bound_arg} cannot be NA", + class = "epiprocess__version_bound_na_with_na_not_okay" + ) + } + } else { + if (!test_set_equal(class(version_bound), class(x[["version"]]))) { + cli_abort( + "{version_bound_arg} must have the same classes as x$version, + which is {class(x$version)}", + class = "epiprocess__version_bound_mismatched_class" + ) + } + if (!identical(typeof(version_bound), typeof(x[["version"]]))) { + cli_abort( + "{version_bound_arg} must have the same type as x$version, which is {typeof(x$version)}", - ) + class = "epiprocess__version_bound_mismatched_typeof" + ) + } } return(invisible(NULL)) diff --git a/tests/testthat/test-archive-version-bounds.R b/tests/testthat/test-archive-version-bounds.R index d78167d7..c052b47b 100644 --- a/tests/testthat/test-archive-version-bounds.R +++ b/tests/testthat/test-archive-version-bounds.R @@ -2,17 +2,21 @@ test_that("`validate_version_bound` allows/catches `NA` as requested", { my_version_bound <- NA x <- tibble::tibble(version = 5L) validate_version_bound(my_version_bound, x, na_ok = TRUE) - expect_error(validate_version_bound(my_version_bound, x, na_ok = FALSE)) + expect_error(validate_version_bound(my_version_bound, x, na_ok = FALSE), + class = "epiprocess__version_bound_na_with_na_not_okay") }) test_that("`validate_version_bound` catches bounds that are the wrong length", { x <- tibble::tibble(version = 5L) my_version_bound1a <- NULL - expect_error(validate_version_bound(my_version_bound1a, x, na_ok = TRUE)) + expect_error(validate_version_bound(my_version_bound1a, x, na_ok = TRUE), + class = "epiprocess__version_bound_null") my_version_bound1b <- integer(0L) - expect_error(validate_version_bound(my_version_bound1b, x, na_ok = TRUE)) + expect_error(validate_version_bound(my_version_bound1b, x, na_ok = TRUE), + class = "epiprocess__version_bound_wrong_length") my_version_bound2 <- c(2, 10) - expect_error(validate_version_bound(my_version_bound2, x, na_ok = TRUE)) + expect_error(validate_version_bound(my_version_bound2, x, na_ok = TRUE), + class = "epiprocess__version_bound_wrong_length") }) test_that("`validate_version_bound` validate and class checks together allow and catch as intended", { @@ -52,11 +56,13 @@ test_that("`validate_version_bound` validate and class checks together allow and validate_version_bound(my_datetime, x_datetime, version_bound_arg = "vb") expect_error( 
validate_version_bound(my_datetime, x_date, na_ok = TRUE, version_bound_arg = "vb"), - regexp = "must have the same classes" + regexp = "must have the same classes", + class = "epiprocess__version_bound_mismatched_class" ) expect_error( validate_version_bound(my_date, x_datetime, na_ok = TRUE, version_bound_arg = "vb"), - regexp = "must have the same classes" + regexp = "must have the same classes", + class = "epiprocess__version_bound_mismatched_class" ) # Bad: expect_error(validate_version_bound(3.5, x_int, TRUE, "vb"), regexp = "must have the same classes") @@ -64,7 +70,7 @@ test_that("`validate_version_bound` validate and class checks together allow and expect_error(validate_version_bound( `class<-`(list(2), "clazz"), tibble::tibble(version = `class<-`(5L, "clazz")), TRUE, "vb" - ), regexp = "must have the same types") + ), regexp = "must have the same type", class = "epiprocess__version_bound_mismatched_typeof") # Maybe questionable: expect_error(validate_version_bound(3, x_int, TRUE, "vb")) expect_error(validate_version_bound(3L, x_dbl, TRUE, "vb")) @@ -109,7 +115,8 @@ test_that("archive version bounds args work as intended", { ), regexp = "`clobberable_versions_start`.*indicated that there were later observed versions" ) - expect_error(as_epi_archive(update_tbl, versions_end = NA), regexp = "must have the same classes") + expect_error(as_epi_archive(update_tbl, versions_end = NA), + class = "epiprocess__version_bound_na_with_na_not_okay") ea_default <- as_epi_archive(update_tbl) ea_default %>% epix_as_of(measurement_date + 4L) expect_warning( From 5392641e42b945dd24366f78bee61737180a5539 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Mon, 22 Apr 2024 16:11:06 -0700 Subject: [PATCH 253/345] Don't use cli_ with dynamic format strings See https://rlang.r-lib.org/reference/topic-condition-formatting.html#transitioning-from-abort-to-cli-abort- --- R/archive.R | 14 ++++---------- R/methods-epi_archive.R | 14 +++++++------- R/methods-epi_df.R | 6 ++---- R/utils.R | 9 +++++---- tests/testthat/test-archive-version-bounds.R | 15 ++++++++++----- 5 files changed, 28 insertions(+), 30 deletions(-) diff --git a/R/archive.R b/R/archive.R index 55de1132..5e5ef436 100644 --- a/R/archive.R +++ b/R/archive.R @@ -351,21 +351,15 @@ new_epi_archive <- function( validate_version_bound(versions_end, x, na_ok = FALSE) if (nrow(x) > 0L && versions_end < max(x[["version"]])) { cli_abort( - sprintf( - "`versions_end` was %s, but `x` contained - updates for a later version or versions, up through %s", - versions_end, max(x[["version"]]) - ), + "`versions_end` was {versions_end}, but `x` contained + updates for a later version or versions, up through {max(x$version)}", class = "epiprocess__versions_end_earlier_than_updates" ) } if (!is.na(clobberable_versions_start) && clobberable_versions_start > versions_end) { cli_abort( - sprintf( - "`versions_end` was %s, but a `clobberable_versions_start` - of %s indicated that there were later observed versions", - versions_end, clobberable_versions_start - ), + "`versions_end` was {versions_end}, but a `clobberable_versions_start` + of {clobberable_versions_start} indicated that there were later observed versions", class = "epiprocess__versions_end_earlier_than_clobberable_versions_start" ) } diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index f6846488..3c351f37 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -169,13 +169,13 @@ epix_fill_through_version <- function(x, fill_versions_end, nonkey_cols <- setdiff(names(x$DT), 
key(x$DT)) next_version_tag <- next_after(x$versions_end) if (next_version_tag > fill_versions_end) { - cli_abort(sprintf(paste( - "Apparent problem with `next_after` method:", - "archive contained observations through version %s", - "and the next possible version was supposed to be %s,", - "but this appeared to jump from a version < %3$s", - "to one > %3$s, implying at least one version in between." - ), x$versions_end, next_version_tag, fill_versions_end)) + cli_abort(paste( + "Apparent problem with {.code next_after} method:", + "archive contained observations through version {x$versions_end}", + "and the next possible version was supposed to be {next_version_tag},", + "but this appeared to jump from a version < {fill_versions_end}", + "to one > {fill_versions_end}, implying at least one version in between." + )) } nonversion_key_vals_ever_recorded <- unique(x$DT, by = nonversion_key_cols) # In edge cases, the `unique` result can alias the original diff --git a/R/methods-epi_df.R b/R/methods-epi_df.R index 632dc3a3..97fc4576 100644 --- a/R/methods-epi_df.R +++ b/R/methods-epi_df.R @@ -136,10 +136,8 @@ dplyr_reconstruct.epi_df <- function(data, template) { dup_col_names <- cn[duplicated(cn)] if (length(dup_col_names) != 0) { cli_abort(paste0( - "Column name(s) ", - paste(unique(dup_col_names), - collapse = ", " - ), " must not be duplicated." + "Column name(s) {unique(dup_col_names)}", + "must not be duplicated." )) } diff --git a/R/utils.R b/R/utils.R index ea7afc2f..0233f775 100644 --- a/R/utils.R +++ b/R/utils.R @@ -142,7 +142,7 @@ assert_sufficient_f_args <- function(f, ...) { # `f` doesn't take enough args. if (rlang::dots_n(...) == 0L) { # common case; try for friendlier error message - cli_abort(sprintf("`f` must take at least %s arguments", n_mandatory_f_args), + cli_abort("`f` must take at least {n_mandatory_f_args} arguments", class = "epiprocess__assert_sufficient_f_args__f_needs_min_args", epiprocess__f = f ) @@ -312,7 +312,7 @@ as_slide_computation <- function(f, ...) { if (is_formula(f)) { if (length(f) > 2) { - cli_abort(sprintf("%s must be a one-sided formula", arg), + cli_abort("{.code {arg}} must be a one-sided formula", class = "epiprocess__as_slide_computation__formula_is_twosided", epiprocess__f = f, call = call @@ -350,7 +350,8 @@ as_slide_computation <- function(f, ...) 
{ } cli_abort( - sprintf("Can't convert an object of class %s to a slide computation", paste(collapse = " ", deparse(class(f)))), + "Can't convert an object of class {paste(collapse = ' ', deparse(class(f)))} + to a slide computation", class = "epiprocess__as_slide_computation__cant_convert_catchall", epiprocess__f = f, epiprocess__f_class = class(f), @@ -687,7 +688,7 @@ gcd_num <- function(dividends, ..., rrtol = 1e-6, pqlim = 1e6, irtol = 1e-6) { guess_period <- function(ref_time_values, ref_time_values_arg = rlang::caller_arg(ref_time_values)) { sorted_distinct_ref_time_values <- sort(unique(ref_time_values)) if (length(sorted_distinct_ref_time_values) < 2L) { - cli_abort(sprintf("Not enough distinct values in `%s` to guess the period.", ref_time_values_arg)) + cli_abort("Not enough distinct values in {.code {ref_time_values_arg}} to guess the period.", ref_time_values_arg) } skips <- diff(sorted_distinct_ref_time_values) decayed_skips <- diff --git a/tests/testthat/test-archive-version-bounds.R b/tests/testthat/test-archive-version-bounds.R index c052b47b..a8f12a3f 100644 --- a/tests/testthat/test-archive-version-bounds.R +++ b/tests/testthat/test-archive-version-bounds.R @@ -3,20 +3,24 @@ test_that("`validate_version_bound` allows/catches `NA` as requested", { x <- tibble::tibble(version = 5L) validate_version_bound(my_version_bound, x, na_ok = TRUE) expect_error(validate_version_bound(my_version_bound, x, na_ok = FALSE), - class = "epiprocess__version_bound_na_with_na_not_okay") + class = "epiprocess__version_bound_na_with_na_not_okay" + ) }) test_that("`validate_version_bound` catches bounds that are the wrong length", { x <- tibble::tibble(version = 5L) my_version_bound1a <- NULL expect_error(validate_version_bound(my_version_bound1a, x, na_ok = TRUE), - class = "epiprocess__version_bound_null") + class = "epiprocess__version_bound_null" + ) my_version_bound1b <- integer(0L) expect_error(validate_version_bound(my_version_bound1b, x, na_ok = TRUE), - class = "epiprocess__version_bound_wrong_length") + class = "epiprocess__version_bound_wrong_length" + ) my_version_bound2 <- c(2, 10) expect_error(validate_version_bound(my_version_bound2, x, na_ok = TRUE), - class = "epiprocess__version_bound_wrong_length") + class = "epiprocess__version_bound_wrong_length" + ) }) test_that("`validate_version_bound` validate and class checks together allow and catch as intended", { @@ -116,7 +120,8 @@ test_that("archive version bounds args work as intended", { regexp = "`clobberable_versions_start`.*indicated that there were later observed versions" ) expect_error(as_epi_archive(update_tbl, versions_end = NA), - class = "epiprocess__version_bound_na_with_na_not_okay") + class = "epiprocess__version_bound_na_with_na_not_okay" + ) ea_default <- as_epi_archive(update_tbl) ea_default %>% epix_as_of(measurement_date + 4L) expect_warning( From 9491797f51651142787cfaef219586ba5d0f022f Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Tue, 23 Apr 2024 10:29:51 -0700 Subject: [PATCH 254/345] Note reassignment in R6 migration for mutating functions, + details --- NEWS.md | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/NEWS.md b/NEWS.md index 4d52ded5..1ee3384f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -32,11 +32,15 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat ## Breaking changes - Switched `epi_df`'s `other_keys` default from `NULL` to `character(0)`; PR #390 -- Refactor `epi_archive` to use S3 instead of R6 for its object model. 
The +- Refactored `epi_archive` to use S3 instead of R6 for its object model. The functionality stay the same, but it will break the member function interface. - For migration, convert `epi_archive$merge` to `epi_archive %>% epix_merge` - (similar for `slide`, `fill_through_version`, `truncate_after_version`, and - `as_of`) (#340). + For migration, you can usually just convert `epi_archive$merge(...)` to + `epi_archive <- epi_archive %>% epix_merge(...)` (and the same for + `fill_through_version` and `truncate_after_version`) and + `epi_archive$slide(...)` to `epi_archive %>% epix_slide(...)` (and the same + for `as_of`, `group_by`, `slide`, etc.) (#340). In some limited situations, + such as if you have a helper function that calls `epi_archive$merge` etc. on + one of its arguments, then you may need to more carefully refactor them. # epiprocess 0.7.0 From 27b798dbfcc427a7534f6cd679d949b9b449eee5 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 24 Apr 2024 10:46:34 -0700 Subject: [PATCH 255/345] Adjust class, typeof, colnames checks & messaging, gcd messaging - S3 class vectors are ordered, so use `identical` - Improve class vector formatting - Tweak other `class` and `typeof` message text - Improve duplicate colnames message - Improve vector interpolation formatting - Fix typo in GCD error messaging --- NAMESPACE | 1 + R/archive.R | 10 +++++----- R/methods-epi_archive.R | 16 ++++++++-------- R/methods-epi_df.R | 9 ++++++--- R/utils.R | 2 +- tests/testthat/test-archive-version-bounds.R | 16 ++++++++-------- tests/testthat/test-methods-epi_df.R | 4 ++-- 7 files changed, 31 insertions(+), 27 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index cc25c7d7..2174b78b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -98,6 +98,7 @@ importFrom(checkmate,test_subset) importFrom(checkmate,vname) importFrom(cli,cli_abort) importFrom(cli,cli_inform) +importFrom(cli,cli_vec) importFrom(cli,cli_warn) importFrom(data.table,":=") importFrom(data.table,address) diff --git a/R/archive.R b/R/archive.R index 5e5ef436..325cdf48 100644 --- a/R/archive.R +++ b/R/archive.R @@ -46,17 +46,17 @@ validate_version_bound <- function(version_bound, x, na_ok = FALSE, ) } } else { - if (!test_set_equal(class(version_bound), class(x[["version"]]))) { + if (!identical(class(version_bound), class(x[["version"]]))) { cli_abort( - "{version_bound_arg} must have the same classes as x$version, - which is {class(x$version)}", + "{version_bound_arg} must have the same `class` vector as x$version, + which has a `class` of {paste(collapse = ' ', deparse(class(x$version)))}", class = "epiprocess__version_bound_mismatched_class" ) } if (!identical(typeof(version_bound), typeof(x[["version"]]))) { cli_abort( - "{version_bound_arg} must have the same type as x$version, - which is {typeof(x$version)}", + "{version_bound_arg} must have the same `typeof` as x$version, + which has a `typeof` of {typeof(x$version)}", class = "epiprocess__version_bound_mismatched_typeof" ) } diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index 3c351f37..0d527244 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -68,14 +68,14 @@ epix_as_of <- function(x, max_version, min_time_value = -Inf, all_versions = FAL if (length(other_keys) == 0) other_keys <- NULL # Check a few things on max_version - if (!test_set_equal(class(max_version), class(x$DT$version))) { + if (!identical(class(max_version), class(x$DT$version))) { cli_abort( - "`max_version` must have the same classes as `epi_archive$DT$version`." 
+ "`max_version` must have the same `class` vector as `epi_archive$DT$version`." ) } - if (!test_set_equal(typeof(max_version), typeof(x$DT$version))) { + if (!identical(typeof(max_version), typeof(x$DT$version))) { cli_abort( - "`max_version` must have the same types as `epi_archive$DT$version`." + "`max_version` must have the same `typeof` as `epi_archive$DT$version`." ) } assert_scalar(max_version, na.ok = FALSE) @@ -859,11 +859,11 @@ epix_truncate_versions_after <- function(x, max_version) { #' @rdname epix_truncate_versions_after #' @export epix_truncate_versions_after.epi_archive <- function(x, max_version) { - if (!test_set_equal(class(max_version), class(x$DT$version))) { - cli_abort("`max_version` must have the same classes as `epi_archive$DT$version`.") + if (!identical(class(max_version), class(x$DT$version))) { + cli_abort("`max_version` must have the same `class` as `epi_archive$DT$version`.") } - if (!test_set_equal(typeof(max_version), typeof(x$DT$version))) { - cli_abort("`max_version` must have the same types as `epi_archive$DT$version`.") + if (!identical(typeof(max_version), typeof(x$DT$version))) { + cli_abort("`max_version` must have the same `typeof` as `epi_archive$DT$version`.") } assert_scalar(max_version, na.ok = FALSE) if (max_version > x$versions_end) { diff --git a/R/methods-epi_df.R b/R/methods-epi_df.R index 97fc4576..526a1171 100644 --- a/R/methods-epi_df.R +++ b/R/methods-epi_df.R @@ -123,6 +123,7 @@ decay_epi_df <- function(x) { #' @param template `epi_df` template to use to restore #' @return `epi_df` or degrade into `tbl_df` #' @importFrom dplyr dplyr_reconstruct +#' @importFrom cli cli_vec #' @export #' @noRd dplyr_reconstruct.epi_df <- function(data, template) { @@ -135,9 +136,11 @@ dplyr_reconstruct.epi_df <- function(data, template) { # Duplicate columns, cli_abort dup_col_names <- cn[duplicated(cn)] if (length(dup_col_names) != 0) { - cli_abort(paste0( - "Column name(s) {unique(dup_col_names)}", - "must not be duplicated." + cli_abort(c( + "Duplicate column names are not allowed", + "i" = "Duplicated column name{?s}: + {cli_vec(unique(dup_col_names), + style = list('vec-sep2' = ', ', 'vec-last' = ', '))}" )) } diff --git a/R/utils.R b/R/utils.R index 0233f775..5662ab4f 100644 --- a/R/utils.R +++ b/R/utils.R @@ -609,7 +609,7 @@ gcd2num <- function(a, b, rrtol = 1e-6, pqlim = 1e6, irtol = 1e-6) { # `b_curr` is the candidate GCD / iterand; check first if it seems too small: if (abs(b_curr) <= iatol) { cli_abort( - "No GCD found; remaining potential Gads are all too small relative + "No GCD found; remaining potential GCDs are all too small relative to one/both of the original inputs; see `irtol` setting." 
) } diff --git a/tests/testthat/test-archive-version-bounds.R b/tests/testthat/test-archive-version-bounds.R index a8f12a3f..d36fcab1 100644 --- a/tests/testthat/test-archive-version-bounds.R +++ b/tests/testthat/test-archive-version-bounds.R @@ -51,30 +51,30 @@ test_that("`validate_version_bound` validate and class checks together allow and my_version_bound1 <- `class<-`(24, "c1") expect_error( validate_version_bound(my_version_bound1, x_int, na_ok = FALSE), - regexp = "must have the same classes as" + regexp = "must have the same `class` vector as" ) my_version_bound2 <- `class<-`(list(12), c("c2a", "c2b", "c2c")) - expect_error(validate_version_bound(my_version_bound2, x_list, na_ok = FALSE), regexp = "must have the same classes") + expect_error(validate_version_bound(my_version_bound2, x_list, na_ok = FALSE), regexp = "must have the same `class`") # Want no error matching date to date or datetime to datetime, but no interop due to tz issues: validate_version_bound(my_date, x_date, version_bound_arg = "vb") validate_version_bound(my_datetime, x_datetime, version_bound_arg = "vb") expect_error( validate_version_bound(my_datetime, x_date, na_ok = TRUE, version_bound_arg = "vb"), - regexp = "must have the same classes", + regexp = "must have the same `class`", class = "epiprocess__version_bound_mismatched_class" ) expect_error( validate_version_bound(my_date, x_datetime, na_ok = TRUE, version_bound_arg = "vb"), - regexp = "must have the same classes", + regexp = "must have the same `class`", class = "epiprocess__version_bound_mismatched_class" ) # Bad: - expect_error(validate_version_bound(3.5, x_int, TRUE, "vb"), regexp = "must have the same classes") - expect_error(validate_version_bound(.Machine$integer.max, x_dbl, TRUE, "vb"), regexp = "must have the same classes") + expect_error(validate_version_bound(3.5, x_int, TRUE, "vb"), regexp = "must have the same `class`") + expect_error(validate_version_bound(.Machine$integer.max, x_dbl, TRUE, "vb"), regexp = "must have the same `class`") expect_error(validate_version_bound( `class<-`(list(2), "clazz"), tibble::tibble(version = `class<-`(5L, "clazz")), TRUE, "vb" - ), regexp = "must have the same type", class = "epiprocess__version_bound_mismatched_typeof") + ), regexp = "must have the same `typeof`", class = "epiprocess__version_bound_mismatched_typeof") # Maybe questionable: expect_error(validate_version_bound(3, x_int, TRUE, "vb")) expect_error(validate_version_bound(3L, x_dbl, TRUE, "vb")) @@ -99,7 +99,7 @@ test_that("archive version bounds args work as intended", { clobberable_versions_start = 1241, versions_end = measurement_date ), - regexp = "must have the same classes" + regexp = "must have the same `class`" ) expect_error( as_epi_archive(update_tbl[integer(0L), ]), diff --git a/tests/testthat/test-methods-epi_df.R b/tests/testthat/test-methods-epi_df.R index cff88dac..b071d3ec 100644 --- a/tests/testthat/test-methods-epi_df.R +++ b/tests/testthat/test-methods-epi_df.R @@ -84,11 +84,11 @@ test_that("Subsetting drops & does not drop the epi_df class appropriately", { test_that("When duplicate cols in subset should abort", { expect_error(toy_epi_df[, c(2, 2:3, 4, 4, 4)], - "Column name(s) time_value, y must not be duplicated.", + "Duplicated column names: time_value, y", fixed = TRUE ) expect_error(toy_epi_df[1:4, c(1, 2:4, 1)], - "Column name(s) geo_value must not be duplicated.", + "Duplicated column name: geo_value", fixed = TRUE ) }) From cd7f83cc8dd30d7685017ce6a358bffb80df1543 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: 
Wed, 24 Apr 2024 15:06:16 -0700 Subject: [PATCH 256/345] lint: use rlang %||% idiom --- NAMESPACE | 1 + R/archive.R | 19 +++++-------------- R/data.R | 2 +- R/epi_df.R | 4 +--- R/epiprocess.R | 1 + R/growth_rate.R | 10 +++++----- 6 files changed, 14 insertions(+), 23 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 2174b78b..d0c67474 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -144,6 +144,7 @@ importFrom(purrr,map) importFrom(purrr,map_lgl) importFrom(rlang,"!!!") importFrom(rlang,"!!") +importFrom(rlang,"%||%") importFrom(rlang,.data) importFrom(rlang,.env) importFrom(rlang,arg_match) diff --git a/R/archive.R b/R/archive.R index 325cdf48..ffd8b1f0 100644 --- a/R/archive.R +++ b/R/archive.R @@ -316,19 +316,12 @@ new_epi_archive <- function( cli_abort("Column `version` must not contain missing values.") } - # If geo type is missing, then try to guess it - if (is.null(geo_type)) { - geo_type <- guess_geo_type(x$geo_value) - } - - # If time type is missing, then try to guess it - if (is.null(time_type)) { - time_type <- guess_time_type(x$time_value) - } + geo_type <- geo_type %||% guess_geo_type(x$geo_value) + time_type <- time_type %||% guess_time_type(x$time_value) + other_keys <- other_keys %||% character(0L) + additional_metadata <- additional_metadata %||% list() # Finish off with small checks on keys variables and metadata - if (is.null(other_keys)) other_keys <- character(0L) - if (is.null(additional_metadata)) additional_metadata <- list() if (!test_subset(other_keys, names(x))) { cli_abort("`other_keys` must be contained in the column names of `x`.") } @@ -344,9 +337,7 @@ new_epi_archive <- function( # Apply defaults and conduct checks for # `clobberable_versions_start`, `versions_end`: - if (is.null(versions_end)) { - versions_end <- max_version_with_row_in(x) - } + versions_end <- versions_end %||% max_version_with_row_in(x) validate_version_bound(clobberable_versions_start, x, na_ok = TRUE) validate_version_bound(versions_end, x, na_ok = FALSE) if (nrow(x) > 0L && versions_end < max(x[["version"]])) { diff --git a/R/data.R b/R/data.R index cbaaa901..ec677547 100644 --- a/R/data.R +++ b/R/data.R @@ -123,7 +123,7 @@ some_package_is_being_unregistered <- function(parent_n = 0L) { # evaluation has been triggered via `unregister`. simple_call_names <- purrr::map_chr(calls_to_inspect, function(call) { maybe_simple_call_name <- rlang::call_name(call) - if (is.null(maybe_simple_call_name)) NA_character_ else maybe_simple_call_name + maybe_simple_call_name %||% NA_character_ }) # `pkgload::unregister` is an (the?) 
exported function that forces # package-level promises, while `pkgload:::unregister_namespace` is the diff --git a/R/epi_df.R b/R/epi_df.R index 9ed677cf..f4df1604 100644 --- a/R/epi_df.R +++ b/R/epi_df.R @@ -99,9 +99,7 @@ new_epi_df <- function(x = tibble::tibble(), geo_type, time_type, as_of, assert_data_frame(x) assert_list(additional_metadata) - if (is.null(additional_metadata[["other_keys"]])) { - additional_metadata[["other_keys"]] <- character(0L) - } + additional_metadata[["other_keys"]] <- additional_metadata[["other_keys"]] %||% character(0L) # If geo type is missing, then try to guess it if (missing(geo_type)) { diff --git a/R/epiprocess.R b/R/epiprocess.R index e3918708..8981c630 100644 --- a/R/epiprocess.R +++ b/R/epiprocess.R @@ -9,6 +9,7 @@ #' assert_int assert_numeric check_data_frame vname check_atomic #' anyInfinite test_subset test_set_equal checkInt #' @importFrom cli cli_abort cli_inform cli_warn +#' @importFrom rlang %||% #' @name epiprocess "_PACKAGE" utils::globalVariables(c(".x", ".group_key", ".ref_time_value")) diff --git a/R/growth_rate.R b/R/growth_rate.R index f2b326a1..4537375d 100644 --- a/R/growth_rate.R +++ b/R/growth_rate.R @@ -226,11 +226,11 @@ growth_rate <- function(x = seq_along(y), y, x0 = x, k <- params$k # Default parameters - if (is.null(ord)) ord <- 3 - if (is.null(maxsteps)) maxsteps <- 1000 - if (is.null(cv)) cv <- TRUE - if (is.null(df)) df <- "min" - if (is.null(k)) k <- 3 + ord <- ord %||% 3 + maxsteps <- maxsteps %||% 1000 + cv <- cv %||% TRUE + df <- df %||% "min" + k <- k %||% 3 # Check cv and df combo if (is.numeric(df)) cv <- FALSE From 9641bdec7578c05e8886ecd6b6bb92d4ed658d49 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Fri, 26 Apr 2024 10:01:51 -0700 Subject: [PATCH 257/345] Use an actual existence-checking [[ instead of pluck --- NAMESPACE | 1 + R/archive.R | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/NAMESPACE b/NAMESPACE index d0c67474..c444e7fb 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -93,6 +93,7 @@ importFrom(checkmate,assert_scalar) importFrom(checkmate,checkInt) importFrom(checkmate,check_atomic) importFrom(checkmate,check_data_frame) +importFrom(checkmate,check_names) importFrom(checkmate,test_set_equal) importFrom(checkmate,test_subset) importFrom(checkmate,vname) diff --git a/R/archive.R b/R/archive.R index ffd8b1f0..a39c39a0 100644 --- a/R/archive.R +++ b/R/archive.R @@ -75,6 +75,8 @@ validate_version_bound <- function(version_bound, x, na_ok = FALSE, #' @return `max(x$version)` if it has any rows; raises error if it has 0 rows or #' an `NA` version value #' +#' @importFrom checkmate check_names +#' #' @export max_version_with_row_in <- function(x) { if (nrow(x) == 0L) { @@ -87,7 +89,8 @@ max_version_with_row_in <- function(x) { class = "epiprocess__max_version_cannot_be_used" ) } else { - version_col <- purrr::pluck(x, "version") # error not NULL if doesn't exist + check_names(names(x), must.include = "version") + version_col <- x[["version"]] if (anyNA(version_col)) { cli_abort("version values cannot be NA", class = "epiprocess__version_values_must_not_be_na" From 7bf29d809687728753cb96947176369ca74c1d23 Mon Sep 17 00:00:00 2001 From: "Logan C. 
Brooks" Date: Fri, 26 Apr 2024 16:52:42 -0700 Subject: [PATCH 258/345] Improve print.epi_archive on empty archives --- R/archive.R | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/R/archive.R b/R/archive.R index a39c39a0..5124420a 100644 --- a/R/archive.R +++ b/R/archive.R @@ -486,8 +486,12 @@ print.epi_archive <- function(x, ..., class = TRUE, methods = TRUE) { "i" = if (length(setdiff(key(x$DT), c("geo_value", "time_value", "version"))) > 0) { "Non-standard DT keys: {setdiff(key(x$DT), c('geo_value', 'time_value', 'version'))}" }, - "i" = "Min/max time values: {min(x$DT$time_value)} / {max(x$DT$time_value)}", - "i" = "First/last version with update: {min(x$DT$version)} / {max(x$DT$version)}", + "i" = if (nrow(x$DT) != 0L) { + "Min/max time values: {min(x$DT$time_value)} / {max(x$DT$time_value)}" + }, + "i" = if (nrow(x$DT) != 0L) { + "First/last version with update: {min(x$DT$version)} / {max(x$DT$version)}" + }, "i" = if (!is.na(x$clobberable_versions_start)) { "Clobberable versions start: {x$clobberable_versions_start}" }, From e3902db0ec40859c6dd5d4b7fe2ee9ceaeac7db0 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Fri, 26 Apr 2024 17:24:33 -0700 Subject: [PATCH 259/345] Improve print.epi_archive in Rmds, capture.output, logs Print to stdout and without using messages for all the output. Prevents Rmds from splitting print output into multiple chunks. Allows `capture.output` by default to capture all expected output, and the same for logging utilities expecting regular output to come from stdout. --- NAMESPACE | 3 ++- R/archive.R | 8 ++++---- R/epiprocess.R | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index c444e7fb..907fc451 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -97,10 +97,11 @@ importFrom(checkmate,check_names) importFrom(checkmate,test_set_equal) importFrom(checkmate,test_subset) importFrom(checkmate,vname) +importFrom(cli,cat_line) importFrom(cli,cli_abort) -importFrom(cli,cli_inform) importFrom(cli,cli_vec) importFrom(cli,cli_warn) +importFrom(cli,format_message) importFrom(data.table,":=") importFrom(data.table,address) importFrom(data.table,as.data.table) diff --git a/R/archive.R b/R/archive.R index 5124420a..95e7eee9 100644 --- a/R/archive.R +++ b/R/archive.R @@ -469,7 +469,7 @@ as_epi_archive <- function(x, geo_type = NULL, time_type = NULL, other_keys = NU #' @param methods Boolean; whether to print all available methods of #' the archive #' -#' @importFrom cli cli_inform +#' @importFrom cli cat_line format_message #' @importFrom rlang check_dots_empty #' @export print.epi_archive <- function(x, ..., class = TRUE, methods = TRUE) { @@ -480,7 +480,7 @@ print.epi_archive <- function(x, ..., class = TRUE, methods = TRUE) { )) } - cli_inform( + cat_line(format_message( c( ">" = if (class) "An `epi_archive` object, with metadata:", "i" = if (length(setdiff(key(x$DT), c("geo_value", "time_value", "version"))) > 0) { @@ -498,9 +498,9 @@ print.epi_archive <- function(x, ..., class = TRUE, methods = TRUE) { "i" = "Versions end: {x$versions_end}", "i" = "A preview of the table ({nrow(x$DT)} rows x {ncol(x$DT)} columns):" ) - ) - + )) print(x$DT[]) + return(invisible(x)) } diff --git a/R/epiprocess.R b/R/epiprocess.R index 8981c630..5ef80739 100644 --- a/R/epiprocess.R +++ b/R/epiprocess.R @@ -8,7 +8,7 @@ #' assert_logical assert_list assert_character assert_class #' assert_int assert_numeric check_data_frame vname check_atomic #' anyInfinite test_subset test_set_equal checkInt -#' @importFrom cli cli_abort 
cli_inform cli_warn +#' @importFrom cli cli_abort cli_warn #' @importFrom rlang %||% #' @name epiprocess "_PACKAGE" From 6f37e3eda846cf0d4d6719e0bbd14dc453d6d5bd Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Mon, 29 Apr 2024 08:29:41 -0700 Subject: [PATCH 260/345] Eliminate single-use, unneeded local var --- R/archive.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/R/archive.R b/R/archive.R index 95e7eee9..61ab5182 100644 --- a/R/archive.R +++ b/R/archive.R @@ -367,8 +367,7 @@ new_epi_archive <- function( DT <- as.data.table(x, key = key_vars) # nolint: object_name_linter if (!identical(key_vars, key(DT))) setkeyv(DT, cols = key_vars) - maybe_first_duplicate_key_row_index <- anyDuplicated(DT, by = key(DT)) - if (maybe_first_duplicate_key_row_index != 0L) { + if (anyDuplicated(DT, by = key(DT)) != 0L) { cli_abort("`x` must have one row per unique combination of the key variables. If you have additional key variables other than `geo_value`, `time_value`, and `version`, such as an age group column, please specify them in `other_keys`. From 861cdd3d90f01b59eca7ad5c9bec8e8fdd08d6d8 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Mon, 29 Apr 2024 09:20:45 -0700 Subject: [PATCH 261/345] Remove outdated doc comment This applied for a different default `clobberable_versions_start`. --- R/methods-epi_archive.R | 1 - man/epix_as_of.Rd | 1 - 2 files changed, 2 deletions(-) diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index 0d527244..bfec55f8 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -23,7 +23,6 @@ #' @return An `epi_df` object. #' #' @examples -#' # warning message of data latency shown #' epix_as_of( #' archive_cases_dv_subset, #' max_version = max(archive_cases_dv_subset$DT$version) diff --git a/man/epix_as_of.Rd b/man/epix_as_of.Rd index dc359a7b..1833aad3 100644 --- a/man/epix_as_of.Rd +++ b/man/epix_as_of.Rd @@ -35,7 +35,6 @@ given version. See the \href{https://cmu-delphi.github.io/epiprocess/articles/ar examples. } \examples{ -# warning message of data latency shown epix_as_of( archive_cases_dv_subset, max_version = max(archive_cases_dv_subset$DT$version) From a92aa580cc54445bcdf7bd55e64db9d9655ed8e6 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Mon, 29 Apr 2024 09:44:45 -0700 Subject: [PATCH 262/345] More updates due to `clobberable_versions_start` default, bad name - Update `epix_as_of` docs further based on `clobberable_versions_start` now defaulting to `NA`. - Don't include `max_version =` in example `epix_as_of` calls as it seems atypical and a strange name if extracting a snapshot rather than an archive. 
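
To make the reproducibility concern concrete, here is a toy illustration (not
part of the patch; it assumes only `as_epi_archive()` and `epix_as_of()` as
used in the diffs below) of how the value reported for a given `time_value`
depends on which version you request:

```r
library(dplyr)
library(epiprocess)

toy <- tibble(
  geo_value = "ca",
  time_value = as.Date("2020-06-01"),
  version = as.Date("2020-06-02") + 0:1, # a revision arrives one day later
  value = c(1, 2)
) %>%
  as_epi_archive(compactify = FALSE)

# A snapshot requested before vs. after the revision gives different values:
epix_as_of(toy, as.Date("2020-06-02"))$value # 1
epix_as_of(toy, as.Date("2020-06-03"))$value # 2
```

Requesting data "as of today" therefore depends on which versions have already
arrived; `clobberable_versions_start` and `versions_end` let an archive be
explicit about that.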
---
 R/methods-epi_archive.R | 45 +++++++++++++++++++++--------------------
 man/epix_as_of.Rd       | 45 +++++++++++++++++++++--------------------
 2 files changed, 46 insertions(+), 44 deletions(-)

diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R
index bfec55f8..b6bf3837 100644
--- a/R/methods-epi_archive.R
+++ b/R/methods-epi_archive.R
@@ -30,30 +30,31 @@
 #'
 #' range(archive_cases_dv_subset$DT$version) # 2020-06-02 -- 2021-12-01
 #'
-#' epix_as_of(
-#'   archive_cases_dv_subset,
-#'   max_version = as.Date("2020-06-12")
-#' )
+#' epix_as_of(archive_cases_dv_subset, as.Date("2020-06-12"))
 #'
-#' # When fetching a snapshot as of the latest version with update data in the
-#' # archive, a warning is issued by default, as this update data might not yet
-#' # be finalized (for example, if data versions are labeled with dates, these
-#' # versions might be overwritten throughout the corresponding days with
-#' # additional data or "hotfixes" of erroneous data; when we build an archive
-#' # based on database queries, the latest available update might still be
-#' # subject to change, but previous versions should be finalized). We can
-#' # muffle such warnings with the following pattern:
-#' withCallingHandlers(
-#'   {
-#'     epix_as_of(
-#'       archive_cases_dv_subset,
-#'       max_version = max(archive_cases_dv_subset$DT$version)
-#'     )
-#'   },
-#'   epiprocess__snapshot_as_of_clobberable_version = function(wrn) invokeRestart("muffleWarning")
+#' # --- Advanced: ---
+#'
+#' # When requesting recent versions of a data set, there can be some
+#' # reproducibility issues. For example, requesting data as of the current date
+#' # may return different values based on whether today's data is available yet
+#' # or not. Other factors include the time it takes between data becoming
+#' # available and when you download the data, and whether the data provider
+#' # will overwrite ("clobber") version data rather than just publishing new
+#' # versions. You can include information about these factors by setting the
+#' # `clobberable_versions_start` and `versions_end` of an `epi_archive`, in
+#' # which case you will get warnings about potential reproducibility issues:
+#'
+#' archive_cases_dv_subset2 <- as_epi_archive(
+#'   archive_cases_dv_subset$DT,
+#'   # Suppose last version with an update could potentially be rewritten
+#'   # (a.k.a. "hotfixed", "clobbered", etc.):
+#'   clobberable_versions_start = max(archive_cases_dv_subset$DT$version),
+#'   # Suppose today is the following day, and there are no updates out yet:
+#'   versions_end = max(archive_cases_dv_subset$DT$version) + 1L,
+#'   compactify = TRUE
 #' )
-#' # Since R 4.0, there is a `globalCallingHandlers` function that can be used
-#' # to globally toggle these warnings. 
+#'
+#' epix_as_of(archive_cases_dv_subset2, max(archive_cases_dv_subset$DT$version))
 #'
 #' @importFrom data.table between key
 #' @export
diff --git a/man/epix_as_of.Rd b/man/epix_as_of.Rd
index 1833aad3..42b121fa 100644
--- a/man/epix_as_of.Rd
+++ b/man/epix_as_of.Rd
@@ -42,29 +42,30 @@ epix_as_of(
 
 range(archive_cases_dv_subset$DT$version) # 2020-06-02 -- 2021-12-01
 
-epix_as_of(
-  archive_cases_dv_subset,
-  max_version = as.Date("2020-06-12")
-)
+epix_as_of(archive_cases_dv_subset, as.Date("2020-06-12"))
 
-# When fetching a snapshot as of the latest version with update data in the
-# archive, a warning is issued by default, as this update data might not yet
-# be finalized (for example, if data versions are labeled with dates, these
-# versions might be overwritten throughout the corresponding days with
-# additional data or "hotfixes" of erroneous data; when we build an archive
-# based on database queries, the latest available update might still be
-# subject to change, but previous versions should be finalized). We can
-# muffle such warnings with the following pattern:
-withCallingHandlers(
-  {
-    epix_as_of(
-      archive_cases_dv_subset,
-      max_version = max(archive_cases_dv_subset$DT$version)
-    )
-  },
-  epiprocess__snapshot_as_of_clobberable_version = function(wrn) invokeRestart("muffleWarning")
+# --- Advanced: ---
+
+# When requesting recent versions of a data set, there can be some
+# reproducibility issues. For example, requesting data as of the current date
+# may return different values based on whether today's data is available yet
+# or not. Other factors include the time it takes between data becoming
+# available and when you download the data, and whether the data provider
+# will overwrite ("clobber") version data rather than just publishing new
+# versions. You can include information about these factors by setting the
+# `clobberable_versions_start` and `versions_end` of an `epi_archive`, in
+# which case you will get warnings about potential reproducibility issues:
+
+archive_cases_dv_subset2 <- as_epi_archive(
+  archive_cases_dv_subset$DT,
+  # Suppose last version with an update could potentially be rewritten
+  # (a.k.a. "hotfixed", "clobbered", etc.):
+  clobberable_versions_start = max(archive_cases_dv_subset$DT$version),
+  # Suppose today is the following day, and there are no updates out yet:
+  versions_end = max(archive_cases_dv_subset$DT$version) + 1L,
+  compactify = TRUE
 )
-# Since R 4.0, there is a `globalCallingHandlers` function that can be used
-# to globally toggle these warnings.
+
+epix_as_of(archive_cases_dv_subset2, max(archive_cases_dv_subset$DT$version))
 }

From 1cead3036da700af0887a37b0bd19f162aada632 Mon Sep 17 00:00:00 2001
From: "Logan C. Brooks"
Date: Mon, 29 Apr 2024 10:23:18 -0700
Subject: [PATCH 263/345] fix: grouped_epi_archives are not epi_archives

We don't want to try to use an `epi_archive` method implementation on a
`grouped_epi_archive`, or have `is_epi_archive` succeed on them even with
`grouped_okay = FALSE`, to prevent attempted extraction of nonexistent fields. 
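
The intended effect of the class change, sketched as a hypothetical session
(assumes the `archive_cases_dv_subset` example data used throughout this
series; outputs are expectations, not captured results):

```r
library(dplyr)
library(epiprocess)

grouped <- archive_cases_dv_subset %>% group_by(geo_value)

# Before this commit, the class vector was
# c("grouped_epi_archive", "epi_archive"), so dispatch could fall back to
# `epi_archive` methods and try to extract fields a grouped archive lacks.
class(grouped)
inherits(grouped, "epi_archive") # expected FALSE after this change
```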
--- R/grouped_epi_archive.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R index 140ff9d3..8473ab35 100644 --- a/R/grouped_epi_archive.R +++ b/R/grouped_epi_archive.R @@ -80,7 +80,7 @@ new_grouped_epi_archive <- function(x, vars, drop) { list( private = private ), - class = c("grouped_epi_archive", "epi_archive") + class = "grouped_epi_archive" )) } From 38c3322d1166da557f9fd54b9f0c86736aefab09 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Mon, 29 Apr 2024 10:51:18 -0700 Subject: [PATCH 264/345] Clean up `clone()` usage - Use new `%>% clone()` when we want a deep copy - Use aliasing instead of shallow copies, since with S3 lists we should not have the threat of mutation of the shallow list structure --- R/grouped_epi_archive.R | 6 +++--- R/methods-epi_archive.R | 2 +- tests/testthat/test-epix_fill_through_version.R | 3 --- tests/testthat/test-epix_merge.R | 3 +-- tests/testthat/test-epix_slide.R | 5 ++--- tests/testthat/test-methods-epi_archive.R | 9 ++++----- 6 files changed, 11 insertions(+), 17 deletions(-) diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R index 8473ab35..7688e0f3 100644 --- a/R/grouped_epi_archive.R +++ b/R/grouped_epi_archive.R @@ -370,7 +370,7 @@ epix_slide.grouped_epi_archive <- function(x, f, ..., before, ref_time_values, # DT; convert and wrap: data.table::setattr(.data_group, "sorted", dt_key) data.table::setDT(.data_group, key = dt_key) - .data_group_archive <- as_of_archive %>% clone() + .data_group_archive <- as_of_archive .data_group_archive$DT <- .data_group comp_one_grp(.data_group_archive, .group_key, f = f, ..., @@ -437,8 +437,8 @@ is_grouped_epi_archive <- function(x) { #' @export clone.grouped_epi_archive <- function(x, ...) { - ungrouped <- x$private$ungrouped %>% clone() - new_grouped_epi_archive(ungrouped, x$private$vars, x$private$drop) + x$private$ungrouped <- x$private$ungrouped %>% clone() + x } diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index b6bf3837..5ae75b11 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -533,7 +533,7 @@ epix_detailed_restricted_mutate <- function(.data, ...) 
{ out_dt <- dplyr::dplyr_col_modify(in_tbl, col_modify_cols) %>% data.table::setattr("sorted", data.table::key(.data$DT)) %>% data.table::setDT(key = key(.data$DT)) - out_archive <- .data %>% clone() + out_archive <- .data out_archive$DT <- out_dt request_names <- names(col_modify_cols) return(list( diff --git a/tests/testthat/test-epix_fill_through_version.R b/tests/testthat/test-epix_fill_through_version.R index 89bb4804..b87b26ed 100644 --- a/tests/testthat/test-epix_fill_through_version.R +++ b/tests/testthat/test-epix_fill_through_version.R @@ -59,16 +59,13 @@ test_that("epix_fill_through_version does not mutate x", { as_epi_archive(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, value = 10L)) )) { ea_orig_before <- clone(ea_orig) - ea_orig_dt_before_copy <- data.table::copy(ea_orig$DT) some_unobserved_version <- 8L ea_fill_na <- epix_fill_through_version(ea_orig, some_unobserved_version, "na") expect_identical(ea_orig_before, ea_orig) - expect_identical(ea_orig_dt_before_copy, ea_orig$DT) ea_fill_locf <- epix_fill_through_version(ea_orig, some_unobserved_version, "locf") expect_identical(ea_orig_before, ea_orig) - expect_identical(ea_orig_dt_before_copy, ea_orig$DT) } }) diff --git a/tests/testthat/test-epix_merge.R b/tests/testthat/test-epix_merge.R index 9bcc7d67..c29301b8 100644 --- a/tests/testthat/test-epix_merge.R +++ b/tests/testthat/test-epix_merge.R @@ -1,6 +1,5 @@ test_that("epix_merge requires forbids on invalid `y`", { - ea <- archive_cases_dv_subset %>% - clone() + ea <- archive_cases_dv_subset expect_error(epix_merge(ea, data.frame(x = 1))) }) diff --git a/tests/testthat/test-epix_slide.R b/tests/testthat/test-epix_slide.R index b7a3e946..5c20abc2 100644 --- a/tests/testthat/test-epix_slide.R +++ b/tests/testthat/test-epix_slide.R @@ -371,7 +371,6 @@ test_that("epix_slide with all_versions option has access to all older versions" } ea_orig_mirror <- ea %>% clone() - ea_orig_mirror$DT <- data.table::copy(ea_orig_mirror$DT) result1 <- ea %>% group_by() %>% @@ -485,7 +484,7 @@ test_that("epix_as_of and epix_slide with long enough window are compatible", { ) # Test the same sort of thing when grouping by geo in an archive with multiple geos. 
- ea_multigeo <- ea %>% clone() + ea_multigeo <- ea ea_multigeo$DT <- rbind( ea_multigeo$DT, copy(ea_multigeo$DT)[, geo_value := "y"][, binary := -binary][] @@ -565,7 +564,7 @@ test_that("epix_slide with all_versions option works as intended", { # back depending on the decisions there: # # test_that("`epix_slide` uses `versions_end` as a resulting `epi_df`'s `as_of`", { -# ea_updated_stale = ea %>% clone() +# ea_updated_stale = ea # ea_updated_stale$versions_end <- ea_updated_stale$versions_end + 3 # (dbl) # # # expect_identical( diff --git a/tests/testthat/test-methods-epi_archive.R b/tests/testthat/test-methods-epi_archive.R index 5be5330f..a5ba48fa 100644 --- a/tests/testthat/test-methods-epi_archive.R +++ b/tests/testthat/test-methods-epi_archive.R @@ -1,7 +1,6 @@ library(dplyr) -ea <- archive_cases_dv_subset %>% - clone() +ea <- archive_cases_dv_subset ea2_data <- tibble::tribble( ~geo_value, ~time_value, ~version, ~cases, @@ -32,7 +31,7 @@ test_that("Warning against max_version being clobberable", { expect_warning(regexp = NA, ea %>% epix_as_of(max_version = max(ea$DT$version))) expect_warning(regexp = NA, ea %>% epix_as_of(max_version = min(ea$DT$version))) # but with `clobberable_versions_start` non-`NA`, yes - ea_with_clobberable <- ea %>% clone() + ea_with_clobberable <- ea ea_with_clobberable$clobberable_versions_start <- max(ea_with_clobberable$DT$version) expect_warning(ea_with_clobberable %>% epix_as_of(max_version = max(ea$DT$version))) expect_warning(regexp = NA, ea_with_clobberable %>% epix_as_of(max_version = min(ea$DT$version))) @@ -92,7 +91,7 @@ test_that("epix_truncate_version_after doesn't filter if max_verion at latest ve ea2 <- ea2_data %>% as_epi_archive() - ea_expected <- ea2 %>% clone() + ea_expected <- ea2 ea_as_of <- ea2 %>% epix_truncate_versions_after(max_version = as.Date("2020-06-04")) @@ -120,7 +119,7 @@ test_that("epix_truncate_version_after returns the same groups as input grouped_ as_epi_archive() ea2 <- ea2 %>% group_by(geo_value) - ea_expected <- ea2 %>% clone() + ea_expected <- ea2 ea_as_of <- ea2 %>% epix_truncate_versions_after(max_version = as.Date("2020-06-04")) From 5ea168e88d966db096acd671054099763d5a7559 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Mon, 29 Apr 2024 11:05:19 -0700 Subject: [PATCH 265/345] Remove remaining reference to R6 method --- R/grouped_epi_archive.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R index 7688e0f3..97f5a4c9 100644 --- a/R/grouped_epi_archive.R +++ b/R/grouped_epi_archive.R @@ -216,7 +216,7 @@ epix_slide.grouped_epi_archive <- function(x, f, ..., before, ref_time_values, if ("group_by" %in% nse_dots_names(...)) { cli_abort(" The `group_by` argument to `slide` has been removed; please use - the `group_by` S3 generic function or `$group_by` R6 method + the `group_by()` S3 generic function before the slide instead. 
(If you were instead trying to pass a `group_by` argument to `f` or create a column named `group_by`, this check is a false positive, but you will still need to use a From e61e11a1dec9290ed9f4af5ecac386af31feb90b Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Mon, 29 Apr 2024 17:39:18 -0700 Subject: [PATCH 266/345] refactor: changes from review * remove is_epi_archive and delete in epix_slide * simplify group_by_drop_default * prune library calls in tests * remove here and waldo from Suggests * pull most validation work from new_epi_archive into validate_epi_archive * call validate_epi_archive in as_epi_archive --- DESCRIPTION | 2 - NAMESPACE | 3 +- R/archive.R | 161 +++++++++++----------- R/epiprocess.R | 2 +- R/grouped_epi_archive.R | 3 +- R/methods-epi_archive.R | 3 - man/epi_archive.Rd | 20 ++- man/is_epi_archive.Rd | 35 ----- tests/testthat/test-archive.R | 2 - tests/testthat/test-autoplot.R | 3 - tests/testthat/test-compactify.R | 4 - tests/testthat/test-correlation.R | 2 - tests/testthat/test-data.R | 2 +- tests/testthat/test-epix_slide.R | 2 +- tests/testthat/test-methods-epi_archive.R | 5 +- vignettes/archive.Rmd | 9 -- 16 files changed, 102 insertions(+), 156 deletions(-) delete mode 100644 man/is_epi_archive.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 2b53474c..a7a7aa93 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -51,12 +51,10 @@ Suggests: covidcast, devtools, epidatr, - here, knitr, outbreaks, rmarkdown, testthat (>= 3.1.5), - waldo (>= 0.3.1), withr VignetteBuilder: knitr diff --git a/NAMESPACE b/NAMESPACE index 907fc451..1362b15c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -64,7 +64,6 @@ export(filter) export(group_by) export(group_modify) export(growth_rate) -export(is_epi_archive) export(is_epi_df) export(is_grouped_epi_archive) export(key_colnames) @@ -78,6 +77,7 @@ export(rename) export(slice) export(ungroup) export(unnest) +export(validate_epi_archive) importFrom(checkmate,anyInfinite) importFrom(checkmate,anyMissing) importFrom(checkmate,assert) @@ -94,6 +94,7 @@ importFrom(checkmate,checkInt) importFrom(checkmate,check_atomic) importFrom(checkmate,check_data_frame) importFrom(checkmate,check_names) +importFrom(checkmate,expect_class) importFrom(checkmate,test_set_equal) importFrom(checkmate,test_subset) importFrom(checkmate,vname) diff --git a/R/archive.R b/R/archive.R index 61ab5182..14918678 100644 --- a/R/archive.R +++ b/R/archive.R @@ -307,59 +307,8 @@ new_epi_archive <- function( other_keys = NULL, additional_metadata = NULL, compactify = NULL, - clobberable_versions_start = NA, + clobberable_versions_start = NULL, versions_end = NULL) { - assert_data_frame(x) - if (!test_subset(c("geo_value", "time_value", "version"), names(x))) { - cli_abort( - "Columns `geo_value`, `time_value`, and `version` must be present in `x`." 
- ) - } - if (anyMissing(x$version)) { - cli_abort("Column `version` must not contain missing values.") - } - - geo_type <- geo_type %||% guess_geo_type(x$geo_value) - time_type <- time_type %||% guess_time_type(x$time_value) - other_keys <- other_keys %||% character(0L) - additional_metadata <- additional_metadata %||% list() - - # Finish off with small checks on keys variables and metadata - if (!test_subset(other_keys, names(x))) { - cli_abort("`other_keys` must be contained in the column names of `x`.") - } - if (any(c("geo_value", "time_value", "version") %in% other_keys)) { - cli_abort("`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\".") - } - if (any(names(additional_metadata) %in% c("geo_type", "time_type"))) { - cli_warn("`additional_metadata` names overlap with existing metadata fields \"geo_type\", \"time_type\".") - } - - # Conduct checks and apply defaults for `compactify` - assert_logical(compactify, len = 1, any.missing = FALSE, null.ok = TRUE) - - # Apply defaults and conduct checks for - # `clobberable_versions_start`, `versions_end`: - versions_end <- versions_end %||% max_version_with_row_in(x) - validate_version_bound(clobberable_versions_start, x, na_ok = TRUE) - validate_version_bound(versions_end, x, na_ok = FALSE) - if (nrow(x) > 0L && versions_end < max(x[["version"]])) { - cli_abort( - "`versions_end` was {versions_end}, but `x` contained - updates for a later version or versions, up through {max(x$version)}", - class = "epiprocess__versions_end_earlier_than_updates" - ) - } - if (!is.na(clobberable_versions_start) && clobberable_versions_start > versions_end) { - cli_abort( - "`versions_end` was {versions_end}, but a `clobberable_versions_start` - of {clobberable_versions_start} indicated that there were later observed versions", - class = "epiprocess__versions_end_earlier_than_clobberable_versions_start" - ) - } - - # --- End of validation and replacing missing args with defaults --- - # Create the data table; if x was an un-keyed data.table itself, # then the call to as.data.table() will fail to set keys, so we # need to check this, then do it manually if needed @@ -441,6 +390,54 @@ new_epi_archive <- function( ) } +#' `validate_epi_archive` ensures correctness of arguments fed to `as_epi_archive`. 
+#' +#' @rdname epi_archive +#' +#' @export +validate_epi_archive <- function( + x, + geo_type = NULL, + time_type = NULL, + other_keys = NULL, + additional_metadata = NULL, + compactify = NULL, + clobberable_versions_start = NULL, + versions_end = NULL) { + # Finish off with small checks on keys variables and metadata + if (!test_subset(other_keys, names(x))) { + cli_abort("`other_keys` must be contained in the column names of `x`.") + } + if (any(c("geo_value", "time_value", "version") %in% other_keys)) { + cli_abort("`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\".") + } + if (any(names(additional_metadata) %in% c("geo_type", "time_type"))) { + cli_warn("`additional_metadata` names overlap with existing metadata fields \"geo_type\", \"time_type\".") + } + + # Conduct checks and apply defaults for `compactify` + assert_logical(compactify, len = 1, any.missing = FALSE, null.ok = TRUE) + + # Apply defaults and conduct checks for + # `clobberable_versions_start`, `versions_end`: + validate_version_bound(clobberable_versions_start, x, na_ok = TRUE) + validate_version_bound(versions_end, x, na_ok = FALSE) + if (nrow(x) > 0L && versions_end < max(x[["version"]])) { + cli_abort( + "`versions_end` was {versions_end}, but `x` contained + updates for a later version or versions, up through {max(x$version)}", + class = "epiprocess__versions_end_earlier_than_updates" + ) + } + if (!is.na(clobberable_versions_start) && clobberable_versions_start > versions_end) { + cli_abort( + "`versions_end` was {versions_end}, but a `clobberable_versions_start` + of {clobberable_versions_start} indicated that there were later observed versions", + class = "epiprocess__versions_end_earlier_than_clobberable_versions_start" + ) + } +} + #' `as_epi_archive` converts a data frame, data table, or tibble into an #' `epi_archive` object. @@ -448,11 +445,36 @@ new_epi_archive <- function( #' @rdname epi_archive #' #' @export -as_epi_archive <- function(x, geo_type = NULL, time_type = NULL, other_keys = NULL, - additional_metadata = list(), - compactify = NULL, - clobberable_versions_start = NA, - versions_end = max_version_with_row_in(x)) { +as_epi_archive <- function( + x, + geo_type = NULL, + time_type = NULL, + other_keys = NULL, + additional_metadata = NULL, + compactify = NULL, + clobberable_versions_start = NULL, + versions_end = NULL) { + assert_data_frame(x) + if (!test_subset(c("geo_value", "time_value", "version"), names(x))) { + cli_abort( + "Columns `geo_value`, `time_value`, and `version` must be present in `x`." + ) + } + if (anyMissing(x$version)) { + cli_abort("Column `version` must not contain missing values.") + } + + geo_type <- geo_type %||% guess_geo_type(x$geo_value) + time_type <- time_type %||% guess_time_type(x$time_value) + other_keys <- other_keys %||% character(0L) + additional_metadata <- additional_metadata %||% list() + clobberable_versions_start <- clobberable_versions_start %||% NA + versions_end <- versions_end %||% max_version_with_row_in(x) + + validate_epi_archive( + x, geo_type, time_type, other_keys, additional_metadata, + compactify, clobberable_versions_start, versions_end + ) new_epi_archive( x, geo_type, time_type, other_keys, additional_metadata, compactify, clobberable_versions_start, versions_end @@ -652,31 +674,6 @@ group_by.epi_archive <- function(.data, ..., .add = FALSE, .drop = dplyr::group_ } -#' Test for `epi_archive` format -#' -#' @param x An object. -#' @param grouped_okay Optional; Boolean; should a `grouped_epi_archive` also -#' count? 
Default is `FALSE`. -#' @return `TRUE` if the object inherits from `epi_archive`. -#' -#' @export -#' @examples -#' is_epi_archive(jhu_csse_daily_subset) # FALSE (this is an epi_df, not epi_archive) -#' is_epi_archive(archive_cases_dv_subset) # TRUE -#' -#' # By default, grouped_epi_archives don't count as epi_archives, as they may -#' # support a different set of operations from regular `epi_archives`. This -#' # behavior can be controlled by `grouped_okay`. -#' grouped_archive <- archive_cases_dv_subset %>% group_by(geo_value) -#' is_epi_archive(grouped_archive) # FALSE -#' is_epi_archive(grouped_archive, grouped_okay = TRUE) # TRUE -#' -#' @seealso [`is_grouped_epi_archive`] -is_epi_archive <- function(x, grouped_okay = FALSE) { - inherits(x, "epi_archive") || grouped_okay && inherits(x, "grouped_epi_archive") -} - - #' Clone an `epi_archive` object. #' #' @param x An `epi_archive` object. diff --git a/R/epiprocess.R b/R/epiprocess.R index 5ef80739..dd7df87a 100644 --- a/R/epiprocess.R +++ b/R/epiprocess.R @@ -7,7 +7,7 @@ #' @importFrom checkmate assert assert_scalar assert_data_frame anyMissing #' assert_logical assert_list assert_character assert_class #' assert_int assert_numeric check_data_frame vname check_atomic -#' anyInfinite test_subset test_set_equal checkInt +#' anyInfinite test_subset test_set_equal checkInt expect_class #' @importFrom cli cli_abort cli_warn #' @importFrom rlang %||% #' @name epiprocess diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R index 97f5a4c9..55a0176c 100644 --- a/R/grouped_epi_archive.R +++ b/R/grouped_epi_archive.R @@ -157,8 +157,7 @@ group_by.grouped_epi_archive <- function( #' #' @export group_by_drop_default.grouped_epi_archive <- function(.tbl) { - x <- .tbl - x$private$drop + .tbl$private$drop } diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index 5ae75b11..891cc064 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -797,9 +797,6 @@ epix_slide <- function( as_list_col = FALSE, names_sep = "_", all_versions = FALSE) { - if (!is_epi_archive(x, grouped_okay = TRUE)) { - cli_abort("`x` must be of class `epi_archive` or `grouped_epi_archive`.") - } UseMethod("epix_slide") } diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd index b7dd649e..97ff6af0 100644 --- a/man/epi_archive.Rd +++ b/man/epi_archive.Rd @@ -3,6 +3,7 @@ \name{epi_archive} \alias{epi_archive} \alias{new_epi_archive} +\alias{validate_epi_archive} \alias{as_epi_archive} \title{\code{epi_archive} object} \usage{ @@ -13,7 +14,18 @@ new_epi_archive( other_keys = NULL, additional_metadata = NULL, compactify = NULL, - clobberable_versions_start = NA, + clobberable_versions_start = NULL, + versions_end = NULL +) + +validate_epi_archive( + x, + geo_type = NULL, + time_type = NULL, + other_keys = NULL, + additional_metadata = NULL, + compactify = NULL, + clobberable_versions_start = NULL, versions_end = NULL ) @@ -22,10 +34,10 @@ as_epi_archive( geo_type = NULL, time_type = NULL, other_keys = NULL, - additional_metadata = list(), + additional_metadata = NULL, compactify = NULL, - clobberable_versions_start = NA, - versions_end = max_version_with_row_in(x) + clobberable_versions_start = NULL, + versions_end = NULL ) } \arguments{ diff --git a/man/is_epi_archive.Rd b/man/is_epi_archive.Rd deleted file mode 100644 index 06669709..00000000 --- a/man/is_epi_archive.Rd +++ /dev/null @@ -1,35 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/archive.R -\name{is_epi_archive} -\alias{is_epi_archive} -\title{Test 
for \code{epi_archive} format} -\usage{ -is_epi_archive(x, grouped_okay = FALSE) -} -\arguments{ -\item{x}{An object.} - -\item{grouped_okay}{Optional; Boolean; should a \code{grouped_epi_archive} also -count? Default is \code{FALSE}.} -} -\value{ -\code{TRUE} if the object inherits from \code{epi_archive}. -} -\description{ -Test for \code{epi_archive} format -} -\examples{ -is_epi_archive(jhu_csse_daily_subset) # FALSE (this is an epi_df, not epi_archive) -is_epi_archive(archive_cases_dv_subset) # TRUE - -# By default, grouped_epi_archives don't count as epi_archives, as they may -# support a different set of operations from regular `epi_archives`. This -# behavior can be controlled by `grouped_okay`. -grouped_archive <- archive_cases_dv_subset \%>\% group_by(geo_value) -is_epi_archive(grouped_archive) # FALSE -is_epi_archive(grouped_archive, grouped_okay = TRUE) # TRUE - -} -\seealso{ -\code{\link{is_grouped_epi_archive}} -} diff --git a/tests/testthat/test-archive.R b/tests/testthat/test-archive.R index 2eba383d..1291e3c7 100644 --- a/tests/testthat/test-archive.R +++ b/tests/testthat/test-archive.R @@ -1,5 +1,3 @@ -library(dplyr) - test_that("first input must be a data.frame", { expect_error(as_epi_archive(c(1, 2, 3), compactify = FALSE), regexp = "Must be of type 'data.frame'." diff --git a/tests/testthat/test-autoplot.R b/tests/testthat/test-autoplot.R index ba3f8d53..0e4654eb 100644 --- a/tests/testthat/test-autoplot.R +++ b/tests/testthat/test-autoplot.R @@ -1,7 +1,4 @@ -library(dplyr) - d <- as.Date("2020-01-01") - raw_df_chr <- dplyr::bind_rows( dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = "a"), dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = "d") diff --git a/tests/testthat/test-compactify.R b/tests/testthat/test-compactify.R index 263d67b7..042a69ea 100644 --- a/tests/testthat/test-compactify.R +++ b/tests/testthat/test-compactify.R @@ -1,7 +1,3 @@ -library(epiprocess) -library(data.table) -library(dplyr) - dt <- archive_cases_dv_subset$DT dt <- filter(dt, geo_value == "ca") %>% filter(version <= "2020-06-15") %>% diff --git a/tests/testthat/test-correlation.R b/tests/testthat/test-correlation.R index fe129616..98507434 100644 --- a/tests/testthat/test-correlation.R +++ b/tests/testthat/test-correlation.R @@ -1,5 +1,3 @@ -library(tibble) - test_that("epi_cor throws an error for a non-epi_df for its first argument", { expect_error(epi_cor(1:10, 1, 1)) expect_error(epi_cor(data.frame(x = 1:10), 1, 1)) diff --git a/tests/testthat/test-data.R b/tests/testthat/test-data.R index 885f0013..88ecc8c7 100644 --- a/tests/testthat/test-data.R +++ b/tests/testthat/test-data.R @@ -1,5 +1,5 @@ test_that("`archive_cases_dv_subset` is formed successfully", { - expect_true(is_epi_archive(archive_cases_dv_subset)) + expect_class(archive_cases_dv_subset, "epi_archive") }) test_that("`delayed_assign_with_unregister_awareness` works as expected on good promises", { diff --git a/tests/testthat/test-epix_slide.R b/tests/testthat/test-epix_slide.R index 5c20abc2..a5b72cbf 100644 --- a/tests/testthat/test-epix_slide.R +++ b/tests/testthat/test-epix_slide.R @@ -506,7 +506,7 @@ test_that("epix_as_of and epix_slide with long enough window are compatible", { test_that("epix_slide `f` is passed an ungrouped `epi_archive` when `all_versions=TRUE`", { slide_fn <- function(x, gk, rtv) { - expect_true(is_epi_archive(x)) + expect_class(x, "epi_archive") return(NA) } diff --git a/tests/testthat/test-methods-epi_archive.R b/tests/testthat/test-methods-epi_archive.R index 
a5ba48fa..6686400b 100644 --- a/tests/testthat/test-methods-epi_archive.R +++ b/tests/testthat/test-methods-epi_archive.R @@ -1,7 +1,4 @@ -library(dplyr) - ea <- archive_cases_dv_subset - ea2_data <- tibble::tribble( ~geo_value, ~time_value, ~version, ~cases, "ca", "2020-06-01", "2020-06-01", 1, @@ -104,7 +101,7 @@ test_that("epix_truncate_version_after returns the same grouping type as input e ea_as_of <- ea2 %>% epix_truncate_versions_after(max_version = as.Date("2020-06-04")) - expect_true(is_epi_archive(ea_as_of, grouped_okay = FALSE)) + expect_class(ea_as_of, "epi_archive") ea2_grouped <- ea2 %>% group_by(geo_value) diff --git a/vignettes/archive.Rmd b/vignettes/archive.Rmd index 6193981a..a34429d9 100644 --- a/vignettes/archive.Rmd +++ b/vignettes/archive.Rmd @@ -113,15 +113,6 @@ key(x$DT) In general, the last version of each observation is carried forward (LOCF) to fill in data between recorded versions. -```{r} -original_value <- x$DT$percent_cli[1] -y <- x # This DOES NOT make a copy of x -y$DT$percent_cli[1] <- 0 -head(y$DT) -head(x$DT) -x$DT$percent_cli[1] <- original_value -``` - ## Some details on metadata The following pieces of metadata are included as fields in an `epi_archive` From d4d7fad363dbd3979e5946a06a8f4a21452b3d04 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Tue, 28 May 2024 15:40:34 -0700 Subject: [PATCH 267/345] Avoid `filter(cases==1)`, better describe & `complete` Ebola example Avoid `filter(cases==1)`: this should either * involve `filter(status == "confirmed")` instead, or * involve no filter, and let `sum` take care of things. We should generally expect the latter approach to be RAM-friendlier (one new column vs. new filtered vectors for every column), so implement the latter approach. Reformat the code a bit to try to make it read better. Clarify the description of the example; make clear we're deriving from line list data. `complete` the data to make the set of `time_value`s evenly spaced and the same for every `geo_value`; even though it doesn't impact the plots, we're demonstrating how to get things into `epi_df` format, and many functions we'd use on `epi_df`s will expect things in this format. --- DESCRIPTION | 2 +- NEWS.md | 6 +++++- vignettes/epiprocess.Rmd | 45 ++++++++++++++++++++++------------------ 3 files changed, 31 insertions(+), 22 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 1e9566b1..cc7cba12 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: epiprocess Title: Tools for basic signal processing in epidemiology -Version: 0.7.8 +Version: 0.7.9 Authors@R: c( person("Jacob", "Bien", role = "ctb"), person("Logan", "Brooks", email = "lcbrooks@andrew.cmu.edu", role = c("aut", "cre")), diff --git a/NEWS.md b/NEWS.md index 8178328c..a1591d8d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -29,8 +29,12 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat argument checking (#413). - Fix logic to auto-assign `epi_df` `time_type` to `week` (#416) and `year` (#441). +- Clarified "Get started" example of getting Ebola line list data into `epi_df` + format. -## Breaking changes +# epiprocess 0.7.0 + +## Breaking changes: - Switched `epi_df`'s `other_keys` default from `NULL` to `character(0)`; PR #390 - Refactored `epi_archive` to use S3 instead of R6 for its object model. 
The diff --git a/vignettes/epiprocess.Rmd b/vignettes/epiprocess.Rmd index 12020d89..a8a6abd0 100644 --- a/vignettes/epiprocess.Rmd +++ b/vignettes/epiprocess.Rmd @@ -60,6 +60,7 @@ API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast.html). library(epidatr) library(epiprocess) library(dplyr) +library(tidyr) library(withr) cases <- pub_covidcast( @@ -279,30 +280,34 @@ ggplot(x, aes(x = time_value, y = value)) + labs(x = "Date", y = "SARS cases in Canada", fill = "Type") ``` -Data on new cases of Ebola in Sierra Leone in 2014, from the same package: +Get confirmed cases of Ebola in Sierra Leone from 2014 to 2015 by province and +date of onset, prepared from line list data from the same package: -```{r, message = FALSE, fig.width = 9, fig.height = 6} +```{r, fig.width = 9, fig.height = 6} x <- outbreaks::ebola_sierraleone_2014 %>% - mutate( - cases = ifelse(status == "confirmed", 1, 0), - province = case_when( - district %in% c("Kailahun", "Kenema", "Kono") ~ "Eastern", - district %in% c( - "Bombali", "Kambia", "Koinadugu", - "Port Loko", "Tonkolili" - ) ~ "Northern", - district %in% c("Bo", "Bonthe", "Moyamba", "Pujehun") ~ "Sourthern", - district %in% c("Western Rural", "Western Urban") ~ "Western" - ) + select(district, date_of_onset, status) %>% + mutate(province = case_when( + district %in% c("Kailahun", "Kenema", "Kono") ~ + "Eastern", + district %in% c( + "Bombali", "Kambia", "Koinadugu", "Port Loko", + "Tonkolili" + ) ~ + "Northern", + district %in% c("Bo", "Bonthe", "Moyamba", "Pujehun") ~ + "Southern", + district %in% c("Western Rural", "Western Urban") ~ + "Western" + )) %>% + group_by(geo_value = province, time_value = date_of_onset) %>% + summarise( + cases = sum(if_else(status == "confirmed", 1, 0)), + .groups = "drop" ) %>% - select( - geo_value = province, - time_value = date_of_onset, - cases + complete(geo_value, + time_value = full_seq(time_value, period = 1), + fill = list(cases = 0) ) %>% - filter(cases == 1) %>% - group_by(geo_value, time_value) %>% - summarise(cases = sum(cases)) %>% as_epi_df(geo_type = "province", as_of = as.Date("2024-03-20")) ggplot(x, aes(x = time_value, y = cases)) + From 9417d33243b60e9f29776ab606afd3d70fb4d9b3 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 21 Jun 2023 09:57:40 -0700 Subject: [PATCH 268/345] Use sum-of-bools for confirmed case count from line list example --- vignettes/epiprocess.Rmd | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/vignettes/epiprocess.Rmd b/vignettes/epiprocess.Rmd index a8a6abd0..91daa455 100644 --- a/vignettes/epiprocess.Rmd +++ b/vignettes/epiprocess.Rmd @@ -300,10 +300,7 @@ x <- outbreaks::ebola_sierraleone_2014 %>% "Western" )) %>% group_by(geo_value = province, time_value = date_of_onset) %>% - summarise( - cases = sum(if_else(status == "confirmed", 1, 0)), - .groups = "drop" - ) %>% + summarise(cases = sum(status == "confirmed"), .groups = "drop") %>% complete(geo_value, time_value = full_seq(time_value, period = 1), fill = list(cases = 0) ) %>% From d3827a341b0bfde0265ea1dfb377caf5e76449e7 Mon Sep 17 00:00:00 2001 From: "Logan C.
Brooks" Date: Wed, 29 May 2024 10:38:59 -0700 Subject: [PATCH 269/345] Address Guidehouse feedback on landing page --- DESCRIPTION | 2 +- NEWS.md | 1 + vignettes/epiprocess.Rmd | 50 ++++++++++++++++++++++++++++++++++++---- 3 files changed, 48 insertions(+), 5 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index cc7cba12..3c409c31 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: epiprocess Title: Tools for basic signal processing in epidemiology -Version: 0.7.9 +Version: 0.7.10 Authors@R: c( person("Jacob", "Bien", role = "ctb"), person("Logan", "Brooks", email = "lcbrooks@andrew.cmu.edu", role = c("aut", "cre")), diff --git a/NEWS.md b/NEWS.md index a1591d8d..1bfdd9c5 100644 --- a/NEWS.md +++ b/NEWS.md @@ -31,6 +31,7 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat (#441). - Clarified "Get started" example of getting Ebola line list data into `epi_df` format. +- Improved documentation web site landing page's introduction. # epiprocess 0.7.0 diff --git a/vignettes/epiprocess.Rmd b/vignettes/epiprocess.Rmd index 91daa455..2d67e28a 100644 --- a/vignettes/epiprocess.Rmd +++ b/vignettes/epiprocess.Rmd @@ -9,14 +9,56 @@ editor_options: chunk_output_type: console --- -This package introduces a common data structure for epidemiological data sets -measured over space and time, and offers associated utilities to perform basic -signal processing tasks. +The [`{epiprocess}`](https://cmu-delphi.github.io/epiprocess/) package provides + +- `epi_df`, a class for working with epidemiological time series data; +- `epi_archive`, a class for working with the version history of such time series data; +- sample data in these formats; +- functions for common data transformations (e.g., 7-day averages); +- functions for exploratory data analysis and situational awareness (e.g., + outlier detection and growth rate estimation); and +- functions for version-faithful "pseudoprospective" backtesting of models, and + other version history analysis. + +It is part of a broader suite of packages including +[`{epidatr}`](https://cmu-delphi.github.io/epidatr/), +[`{epidatasets}`](https://cmu-delphi.github.io/epidatasets/), +[`{rtestim}`](https://dajmcdon.github.io/rtestim/), and +[`{epipredict}`](https://cmu-delphi.github.io/epipredict/), for accessing, +analyzing, and forecasting epidemiological time series data. We have expanded +documentation and demonstrations for some of these packages available in an +online "book" format [here](https://cmu-delphi.github.io/delphi-tooling-book/). + +## Motivation + +[`{epiprocess}`](https://cmu-delphi.github.io/epiprocess/) and +[`{epipredict}`](https://cmu-delphi.github.io/epipredict/) are designed to lower +the barrier to entry and implementation cost for epidemiological time series +analysis and forecasting. Epidemiologists and forecasting groups repeatedly and +separately have had to rush to implement this type of functionality in a much +more ad hoc manner; we are trying to save such effort in the future by providing +well-documented, tested, and general packages that can be called for many common +tasks instead. + +[`{epiprocess}`](https://github.com/cmu-delphi/epiprocess/) also provides tools +to help avoid a particularly common pitfall in analysis and forecasting: +ignoring reporting latency and revisions to a data set. 
This can, for example, +lead to one retrospectively analyzing a surveillance signal or forecasting model +and concluding that it is much more accurate than it actually was in real time, +or producing always-decreasing forecasts on data sets where initial surveillance +estimates are systematically revised upward. Storing and working with version +history can help avoid these issues. + +## Intended audience + +We expect users to be proficient in R, and familiar with the +[`{dplyr}`](https://dplyr.tidyverse.org/) and +[`{tidyr}`](https://tidyr.tidyverse.org/) packages. ## Installing This package is not on CRAN yet, so it can be installed using the -[`devtools`](https://devtools.r-lib.org) package: +[`{devtools}`](https://devtools.r-lib.org) package: ```{r, eval = FALSE} devtools::install_github("cmu-delphi/epiprocess", ref = "main") From c1cfe09201c3859b7a17e957cabdb6d141daddeb Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 29 May 2024 17:10:22 -0400 Subject: [PATCH 270/345] remove renaming options from epi_slide_opt. remove col_names checks --- R/slide.R | 65 +++++++++++++++---------------------------------------- 1 file changed, 18 insertions(+), 47 deletions(-) diff --git a/R/slide.R b/R/slide.R index 7cdc8f38..a7316f72 100644 --- a/R/slide.R +++ b/R/slide.R @@ -432,8 +432,8 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' ungroup() epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values, time_step, - new_col_name = "slide_value", as_list_col = NULL, - names_sep = "_", all_rows = FALSE) { + new_col_name = NULL, as_list_col = NULL, + names_sep = NULL, all_rows = FALSE) { assert_class(x, "epi_df") if (nrow(x) == 0L) { @@ -454,6 +454,18 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values, class = "epiproces__epi_slide_mean__list_not_supported" ) } + if (!is.null(new_col_name)) { + cli_abort( + "`new_col_name` is not supported for `epi_slide_mean`", + class = "epiproces__epi_slide_mean__new_name_not_supported" + ) + } + if (!is.null(names_sep)) { + cli_abort( + "`names_sep` is not supported for `epi_slide_mean`", + class = "epiproces__epi_slide_mean__name_sep_not_supported" + ) + } # Check that slide function `f` is one of those short-listed from # `data.table` and `slider` (or a function that has the exact same @@ -543,48 +555,7 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values, # `before` and `after` params. window_size <- before + after + 1L - col_names_quo <- enquo(col_names) - col_names_chr <- as.character(rlang::quo_get_expr(col_names_quo)) - if (startsWith(rlang::as_label(col_names_quo), "c(")) { - # List or vector of col names. We need to drop the first element since it - # will be either "c" (if built as a vector) or "list" (if built as a - # list). - col_names_chr <- col_names_chr[-1] - } else if (startsWith(rlang::as_label(col_names_quo), "list(")) { - cli_abort( - "`col_names` must be a single tidy column name or a vector - (`c()`) of tidy column names", - class = "epiprocess__epi_slide_mean__col_names_in_list", - epiprocess__col_names = col_names_chr - ) - } - # If single column name, do nothing. - - if (is.null(names_sep)) { - if (length(new_col_name) != length(col_names_chr)) { - cli_abort( - c( - "`new_col_name` must be the same length as `col_names` when - `names_sep` is NULL to avoid duplicate output column names."
- ), - class = "epiprocess__epi_slide_mean__col_names_length_mismatch", - epiprocess__new_col_name = new_col_name, - epiprocess__col_names = col_names_chr - ) - } - result_col_names <- new_col_name - } else { - if (length(new_col_name) != 1L && length(new_col_name) != length(col_names_chr)) { - cli_abort( - "`new_col_name` must be either length 1 or the same length as `col_names`.", - class = "epiprocess__epi_slide_mean__col_names_length_mismatch_and_not_one", - epiprocess__new_col_name = new_col_name, - epiprocess__col_names = col_names_chr - ) - } - result_col_names <- paste(new_col_name, col_names_chr, sep = names_sep) - } - + result_col_names <- ... slide_one_grp <- function(.data_group, .group_key, ...) { missing_times <- all_dates[!(all_dates %in% .data_group$time_value)] @@ -632,7 +603,7 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values, if (f_from_package == "data.table") { roll_output <- f( - x = .data_group[, col_names_chr], n = window_size, align = "right", ... + x = .data_group[, col_names], n = window_size, align = "right", ... ) if (after >= 1) { @@ -646,9 +617,9 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values, .data_group[, result_col_names] <- roll_output } } else if (f_from_package == "slider") { - for (i in seq_along(col_names_chr)) { + for (i in seq_along(col_names)) { .data_group[, result_col_names[i]] <- f( - x = .data_group[[col_names_chr[i]]], before = before, after = after, ... + x = .data_group[[col_names[i]]], before = before, after = after, ... ) } } From a39a53dfc2f3d54697fbe36f68f547eea66d7606 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 29 May 2024 17:21:05 -0400 Subject: [PATCH 271/345] basic tidyselect --- R/slide.R | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/R/slide.R b/R/slide.R index a7316f72..052e3457 100644 --- a/R/slide.R +++ b/R/slide.R @@ -377,6 +377,7 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' #' @importFrom dplyr bind_rows mutate %>% arrange tibble select #' @importFrom rlang enquo quo_get_expr as_label +#' @importFrom tidyselect eval_select #' @importFrom purrr map #' @importFrom data.table frollmean frollsum frollapply #' @importFrom lubridate as.period @@ -555,7 +556,8 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values, # `before` and `after` params. window_size <- before + after + 1L - result_col_names <- ... + pos <- eval_select(rlang::enquo(col_names), data = x) + result_col_names <- names(pos) slide_one_grp <- function(.data_group, .group_key, ...) { missing_times <- all_dates[!(all_dates %in% .data_group$time_value)] @@ -603,7 +605,8 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values, if (f_from_package == "data.table") { roll_output <- f( - x = .data_group[, col_names], n = window_size, align = "right", ... + x = rlang::set_names(.data_group[, pos], result_col_names), + n = window_size, align = "right", ... 
) if (after >= 1) { @@ -613,13 +616,11 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values, .data_group[, result_col_names] <- purrr::map(roll_output, function(.x) { c(.x[(after + 1L):length(.x)], rep(NA, after)) }) - } else { - .data_group[, result_col_names] <- roll_output } } else if (f_from_package == "slider") { - for (i in seq_along(col_names)) { + for (i in seq_along(pos)) { .data_group[, result_col_names[i]] <- f( - x = .data_group[[col_names[i]]], before = before, after = after, ... + x = .data_group[[pos[i]]], before = before, after = after, ... ) } } @@ -640,7 +641,7 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values, } if (!is_epi_df(result)) { - # `all_rows`handling strip epi_df format and metadata. + # `all_rows`handling strips epi_df format and metadata. # Restore them. result <- reclass(result, attributes(x)$metadata) } From 0636d259f41cf637b523caa1591a26259bca9510 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 29 May 2024 17:31:52 -0400 Subject: [PATCH 272/345] tidyselect can't be used to provide column names --- R/slide.R | 8 +++++--- man-roxygen/opt-slide-params.R | 8 ++++++-- man/epi_slide_mean.Rd | 8 ++++++-- man/epi_slide_opt.Rd | 12 ++++++++---- man/epi_slide_sum.Rd | 8 ++++++-- 5 files changed, 31 insertions(+), 13 deletions(-) diff --git a/R/slide.R b/R/slide.R index 052e3457..158fc690 100644 --- a/R/slide.R +++ b/R/slide.R @@ -557,7 +557,8 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values, window_size <- before + after + 1L pos <- eval_select(rlang::enquo(col_names), data = x) - result_col_names <- names(pos) + # Always rename results to "slide_value_". + result_col_names <- paste0("slide_value_", names(x[, pos])) slide_one_grp <- function(.data_group, .group_key, ...) { missing_times <- all_dates[!(all_dates %in% .data_group$time_value)] @@ -605,8 +606,7 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values, if (f_from_package == "data.table") { roll_output <- f( - x = rlang::set_names(.data_group[, pos], result_col_names), - n = window_size, align = "right", ... + x = .data_group[, pos], n = window_size, align = "right", ... ) if (after >= 1) { @@ -616,6 +616,8 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values, .data_group[, result_col_names] <- purrr::map(roll_output, function(.x) { c(.x[(after + 1L):length(.x)], rep(NA, after)) }) + } else { + .data_group[, result_col_names] <- roll_output } } else if (f_from_package == "slider") { for (i in seq_along(pos)) { diff --git a/man-roxygen/opt-slide-params.R b/man-roxygen/opt-slide-params.R index a7d5b04a..6c1ad99d 100644 --- a/man-roxygen/opt-slide-params.R +++ b/man-roxygen/opt-slide-params.R @@ -1,5 +1,9 @@ -#' @param col_names A single tidyselection or a tidyselection vector of the -#' names of one or more columns for which to calculate the rolling mean. +#' @param col_names A <[`tidy-select`][dplyr_tidy_select]> of the names of one +#' or more columns for which to calculate a rolling computation. One or more +#' unquoted expressions separated by commas. Variable names can be used as +#' if they were positions in the data frame, so expressions like `x:y` can +#' be used to select a range of variables. The tidyselection cannot be used +#' to provide output column names. #' @param as_list_col Not supported. Included to match `epi_slide` interface. 
#' @param new_col_name Character vector indicating the name(s) of the new #' column(s) that will contain the derivative values. Default diff --git a/man/epi_slide_mean.Rd b/man/epi_slide_mean.Rd index ee3e7838..adb294bc 100644 --- a/man/epi_slide_mean.Rd +++ b/man/epi_slide_mean.Rd @@ -23,8 +23,12 @@ epi_slide_mean( or ungrouped. If ungrouped, all data in \code{x} will be treated as part of a single data group.} -\item{col_names}{A single tidyselection or a tidyselection vector of the -names of one or more columns for which to calculate the rolling mean.} +\item{col_names}{A <\code{\link[=dplyr_tidy_select]{tidy-select}}> of the names of one +or more columns for which to calculate a rolling computation. One or more +unquoted expressions separated by commas. Variable names can be used as +if they were positions in the data frame, so expressions like \code{x:y} can +be used to select a range of variables. The tidyselection cannot be used +to provide output column names.} \item{...}{Additional arguments to pass to \code{data.table::frollmean}, for example, \code{na.rm} and \code{algo}. \code{data.table::frollmean} is automatically diff --git a/man/epi_slide_opt.Rd b/man/epi_slide_opt.Rd index 0772b431..dda2adde 100644 --- a/man/epi_slide_opt.Rd +++ b/man/epi_slide_opt.Rd @@ -13,9 +13,9 @@ epi_slide_opt( after, ref_time_values, time_step, - new_col_name = "slide_value", + new_col_name = NULL, as_list_col = NULL, - names_sep = "_", + names_sep = NULL, all_rows = FALSE ) } @@ -24,8 +24,12 @@ epi_slide_opt( or ungrouped. If ungrouped, all data in \code{x} will be treated as part of a single data group.} -\item{col_names}{A single tidyselection or a tidyselection vector of the -names of one or more columns for which to calculate the rolling mean.} +\item{col_names}{A <\code{\link[=dplyr_tidy_select]{tidy-select}}> of the names of one +or more columns for which to calculate a rolling computation. One or more +unquoted expressions separated by commas. Variable names can be used as +if they were positions in the data frame, so expressions like \code{x:y} can +be used to select a range of variables. The tidyselection cannot be used +to provide output column names.} \item{f}{Function; together with \code{...} specifies the computation to slide. \code{f} must be one of \code{data.table}'s rolling functions diff --git a/man/epi_slide_sum.Rd b/man/epi_slide_sum.Rd index d5961f27..a56c4c72 100644 --- a/man/epi_slide_sum.Rd +++ b/man/epi_slide_sum.Rd @@ -23,8 +23,12 @@ epi_slide_sum( or ungrouped. If ungrouped, all data in \code{x} will be treated as part of a single data group.} -\item{col_names}{A single tidyselection or a tidyselection vector of the -names of one or more columns for which to calculate the rolling mean.} +\item{col_names}{A <\code{\link[=dplyr_tidy_select]{tidy-select}}> of the names of one +or more columns for which to calculate a rolling computation. One or more +unquoted expressions separated by commas. Variable names can be used as +if they were positions in the data frame, so expressions like \code{x:y} can +be used to select a range of variables. The tidyselection cannot be used +to provide output column names.} \item{...}{Additional arguments to pass to \code{data.table::frollsum}, for example, \code{na.rm} and \code{algo}. 
\code{data.table::frollsum} is automatically From e227ccf68d6a9b850bd6ba891148583af70bb66c Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 30 May 2024 10:56:47 -0400 Subject: [PATCH 273/345] need to run eval_select after grouping to get correct column positions --- R/slide.R | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/R/slide.R b/R/slide.R index 41980969..4192f5fc 100644 --- a/R/slide.R +++ b/R/slide.R @@ -556,9 +556,7 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values, # `before` and `after` params. window_size <- before + after + 1L - pos <- eval_select(rlang::enquo(col_names), data = x) # Always rename results to "slide_value_". - result_col_names <- paste0("slide_value_", names(x[, pos])) slide_one_grp <- function(.data_group, .group_key, ...) { missing_times <- all_dates[!(all_dates %in% .data_group$time_value)] @@ -604,6 +602,19 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values, ) } + # Although this value is the same for every `.data_group`, it needs to be + # evaluated inside `slide_one_grp`. This is because input `x` and + # `.data_group` can have a different number of columns (due to the + # grouping step), i.e. the position that `eval_select` returns for a + # given column can be different. + # + # It is possible that rerunning this is slow. We could alternately + # initialize `pos` and `result_col_names` variables to `NULL` one level + # up, and superassign `<<-` the values here the first time we run + # `slide_one_grp` (relative resource use TBD). + pos <- eval_select(rlang::enquo(col_names), data = .data_group) + result_col_names <- paste0("slide_value_", names(x[, pos])) + if (f_from_package == "data.table") { roll_output <- f( x = .data_group[, pos], n = window_size, align = "right", ... From 0f0df7d247ee4bf39822f9e3d5b7f0e587a47896 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 30 May 2024 11:05:08 -0400 Subject: [PATCH 274/345] note use of colnames as character vector --- R/slide.R | 2 +- man-roxygen/opt-slide-params.R | 13 +++++++------ man/epi_slide_mean.Rd | 13 +++++++------ man/epi_slide_opt.Rd | 13 +++++++------ man/epi_slide_sum.Rd | 13 +++++++------ 5 files changed, 29 insertions(+), 25 deletions(-) diff --git a/R/slide.R b/R/slide.R index 4192f5fc..3b938adf 100644 --- a/R/slide.R +++ b/R/slide.R @@ -556,7 +556,6 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values, # `before` and `after` params. window_size <- before + after + 1L - # Always rename results to "slide_value_". slide_one_grp <- function(.data_group, .group_key, ...) { missing_times <- all_dates[!(all_dates %in% .data_group$time_value)] @@ -613,6 +612,7 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values, ) } pos <- eval_select(rlang::enquo(col_names), data = .data_group) + # Always rename results to "slide_value_".
result_col_names <- paste0("slide_value_", names(x[, pos])) if (f_from_package == "data.table") { diff --git a/man-roxygen/opt-slide-params.R b/man-roxygen/opt-slide-params.R index 6c1ad99d..2fb51315 100644 --- a/man-roxygen/opt-slide-params.R +++ b/man-roxygen/opt-slide-params.R @@ -1,9 +1,10 @@ -#' @param col_names A <[`tidy-select`][dplyr_tidy_select]> of the names of one -#' or more columns for which to calculate a rolling computation. One or more -#' unquoted expressions separated by commas. Variable names can be used as -#' if they were positions in the data frame, so expressions like `x:y` can -#' be used to select a range of variables. The tidyselection cannot be used -#' to provide output column names. +#' @param col_names A character vector OR a +#' <[`tidy-select`][dplyr_tidy_select]> of the names of one or more columns +#' for which to calculate a rolling computation. If a tidy-selection, one +#' or more unquoted expressions separated by commas. Variable names can be +#' used as if they were positions in the data frame, so expressions like +#' `x:y` can be used to select a range of variables. The tidy-selection +#' cannot be used to provide output column names. #' @param as_list_col Not supported. Included to match `epi_slide` interface. #' @param new_col_name Character vector indicating the name(s) of the new #' column(s) that will contain the derivative values. Default diff --git a/man/epi_slide_mean.Rd b/man/epi_slide_mean.Rd index adb294bc..e4d35e46 100644 --- a/man/epi_slide_mean.Rd +++ b/man/epi_slide_mean.Rd @@ -23,12 +23,13 @@ epi_slide_mean( or ungrouped. If ungrouped, all data in \code{x} will be treated as part of a single data group.} -\item{col_names}{A <\code{\link[=dplyr_tidy_select]{tidy-select}}> of the names of one -or more columns for which to calculate a rolling computation. One or more -unquoted expressions separated by commas. Variable names can be used as -if they were positions in the data frame, so expressions like \code{x:y} can -be used to select a range of variables. The tidyselection cannot be used -to provide output column names.} +\item{col_names}{A character vector OR a +<\code{\link[=dplyr_tidy_select]{tidy-select}}> of the names of one or more columns +for which to calculate a rolling computation. If a tidy-selection, one +or more unquoted expressions separated by commas. Variable names can be +used as if they were positions in the data frame, so expressions like +\code{x:y} can be used to select a range of variables. The tidy-selection +cannot be used to provide output column names.} \item{...}{Additional arguments to pass to \code{data.table::frollmean}, for example, \code{na.rm} and \code{algo}. \code{data.table::frollmean} is automatically diff --git a/man/epi_slide_opt.Rd b/man/epi_slide_opt.Rd index dda2adde..4a8b6e68 100644 --- a/man/epi_slide_opt.Rd +++ b/man/epi_slide_opt.Rd @@ -24,12 +24,13 @@ epi_slide_opt( or ungrouped. If ungrouped, all data in \code{x} will be treated as part of a single data group.} -\item{col_names}{A <\code{\link[=dplyr_tidy_select]{tidy-select}}> of the names of one -or more columns for which to calculate a rolling computation. One or more -unquoted expressions separated by commas. Variable names can be used as -if they were positions in the data frame, so expressions like \code{x:y} can -be used to select a range of variables. 
The tidyselection cannot be used -to provide output column names.} +\item{col_names}{A character vector OR a +<\code{\link[=dplyr_tidy_select]{tidy-select}}> of the names of one or more columns +for which to calculate a rolling computation. If a tidy-selection, one +or more unquoted expressions separated by commas. Variable names can be +used as if they were positions in the data frame, so expressions like +\code{x:y} can be used to select a range of variables. The tidy-selection +cannot be used to provide output column names.} \item{f}{Function; together with \code{...} specifies the computation to slide. \code{f} must be one of \code{data.table}'s rolling functions diff --git a/man/epi_slide_sum.Rd b/man/epi_slide_sum.Rd index a56c4c72..91998efc 100644 --- a/man/epi_slide_sum.Rd +++ b/man/epi_slide_sum.Rd @@ -23,12 +23,13 @@ epi_slide_sum( or ungrouped. If ungrouped, all data in \code{x} will be treated as part of a single data group.} -\item{col_names}{A <\code{\link[=dplyr_tidy_select]{tidy-select}}> of the names of one -or more columns for which to calculate a rolling computation. One or more -unquoted expressions separated by commas. Variable names can be used as -if they were positions in the data frame, so expressions like \code{x:y} can -be used to select a range of variables. The tidyselection cannot be used -to provide output column names.} +\item{col_names}{A character vector OR a +<\code{\link[=dplyr_tidy_select]{tidy-select}}> of the names of one or more columns +for which to calculate a rolling computation. If a tidy-selection, one +or more unquoted expressions separated by commas. Variable names can be +used as if they were positions in the data frame, so expressions like +\code{x:y} can be used to select a range of variables. The tidy-selection +cannot be used to provide output column names.} \item{...}{Additional arguments to pass to \code{data.table::frollsum}, for example, \code{na.rm} and \code{algo}. 
\code{data.table::frollsum} is automatically From 7de0187c71f31fe63a753611d876302d4852486c Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 30 May 2024 11:19:13 -0400 Subject: [PATCH 275/345] update epi_slide_mean/sum defaults --- R/slide.R | 8 ++++---- man/epi_slide_mean.Rd | 4 ++-- man/epi_slide_sum.Rd | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/R/slide.R b/R/slide.R index 3b938adf..537bdc8b 100644 --- a/R/slide.R +++ b/R/slide.R @@ -727,8 +727,8 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values, #' ungroup() epi_slide_mean <- function(x, col_names, ..., before, after, ref_time_values, time_step, - new_col_name = "slide_value", as_list_col = NULL, - names_sep = "_", all_rows = FALSE) { + new_col_name = NULL, as_list_col = NULL, + names_sep = NULL, all_rows = FALSE) { epi_slide_opt( x = x, col_names = {{ col_names }}, @@ -774,8 +774,8 @@ epi_slide_mean <- function(x, col_names, ..., before, after, ref_time_values, #' ungroup() epi_slide_sum <- function(x, col_names, ..., before, after, ref_time_values, time_step, - new_col_name = "slide_value", as_list_col = NULL, - names_sep = "_", all_rows = FALSE) { + new_col_name = NULL, as_list_col = NULL, + names_sep = NULL, all_rows = FALSE) { epi_slide_opt( x = x, col_names = {{ col_names }}, diff --git a/man/epi_slide_mean.Rd b/man/epi_slide_mean.Rd index e4d35e46..9937c986 100644 --- a/man/epi_slide_mean.Rd +++ b/man/epi_slide_mean.Rd @@ -12,9 +12,9 @@ epi_slide_mean( after, ref_time_values, time_step, - new_col_name = "slide_value", + new_col_name = NULL, as_list_col = NULL, - names_sep = "_", + names_sep = NULL, all_rows = FALSE ) } diff --git a/man/epi_slide_sum.Rd b/man/epi_slide_sum.Rd index 91998efc..a65fb815 100644 --- a/man/epi_slide_sum.Rd +++ b/man/epi_slide_sum.Rd @@ -12,9 +12,9 @@ epi_slide_sum( after, ref_time_values, time_step, - new_col_name = "slide_value", + new_col_name = NULL, as_list_col = NULL, - names_sep = "_", + names_sep = NULL, all_rows = FALSE ) } From 1db0a013d73f120ca78cafb09f94fa83d0ac3499 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Mon, 3 Jun 2024 15:26:44 -0400 Subject: [PATCH 276/345] error messages refer to _opt --- R/slide.R | 32 ++++++++++++++++---------------- man/epi_slide.Rd | 4 ++-- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/R/slide.R b/R/slide.R index 537bdc8b..e2d70163 100644 --- a/R/slide.R +++ b/R/slide.R @@ -86,8 +86,8 @@ #' @seealso [`epi_slide_opt`] [`epi_slide_mean`] [`epi_slide_sum`] #' @examples #' # slide a 7-day trailing average formula on cases -#' # This and other simple sliding means are much faster to do using -#' # the `epi_slide_mean` function instead. +#' # Simple sliding means and sums are much faster to do using +#' # the `epi_slide_mean` and `epi_slide_sum` functions instead. 
#' jhu_csse_daily_subset %>%
 #' group_by(geo_value) %>%
 #' epi_slide(cases_7dav = mean(cases), before = 6) %>%
@@ -444,27 +444,27 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values,
         "i" = "If this computation is occuring within an `epix_slide` call,
         check that `epix_slide` `ref_time_values` argument was set appropriately"
       ),
-      class = "epiprocess__epi_slide_mean__0_row_input",
+      class = "epiprocess__epi_slide_opt__0_row_input",
       epiprocess__x = x
     )
   }
 
   if (!is.null(as_list_col)) {
     cli_abort(
-      "`as_list_col` is not supported for `epi_slide_mean`",
-      class = "epiproces__epi_slide_mean__list_not_supported"
+      "`as_list_col` is not supported for `epi_slide_[opt/mean/sum]`",
+      class = "epiproces__epi_slide_opt__list_not_supported"
     )
   }
   if (!is.null(new_col_name)) {
     cli_abort(
-      "`new_col_name` is not supported for `epi_slide_mean`",
-      class = "epiproces__epi_slide_mean__new_name_not_supported"
+      "`new_col_name` is not supported for `epi_slide_[opt/mean/sum]`",
+      class = "epiproces__epi_slide_opt__new_name_not_supported"
     )
   }
   if (!is.null(names_sep)) {
     cli_abort(
-      "`names_sep` is not supported for `epi_slide_mean`",
-      class = "epiproces__epi_slide_mean__name_sep_not_supported"
+      "`names_sep` is not supported for `epi_slide_[opt/mean/sum]`",
+      class = "epiproces__epi_slide_opt__name_sep_not_supported"
     )
   }
 
@@ -571,19 +571,19 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values,
   # If a group contains duplicate time values, `frollmean` will still only
   # use the last `k` obs. It isn't looking at dates, it just goes in row
   # order. So if the computation is aggregating across multiple obs for the
-  # same date, `epi_slide_mean` will produce incorrect results; `epi_slide`
-  # should be used instead.
+  # same date, `epi_slide_opt` and derivatives will produce incorrect
+  # results; `epi_slide` should be used instead.
   if (anyDuplicated(.data_group$time_value) != 0L) {
     cli_abort(
       c(
-        "group contains duplicate time values. Using `epi_slide_[opt/mean/sum]` on this
+        "group contains duplicate time values. Using `epi_slide_[opt/mean/sum]` on this
         group will result in incorrect results",
         "i" = "Please change the grouping structure of the input data so that
        each group has non-duplicate time values (e.g. `x %>% group_by(geo_value)
-        %>% epi_slide_mean`)",
+        %>% epi_slide_opt(f = frollmean)`)",
         "i" = "Use `epi_slide` to aggregate across groups"
       ),
-      class = "epiprocess__epi_slide_mean__duplicate_time_values",
+      class = "epiprocess__epi_slide_opt__duplicate_time_values",
       epiprocess__data_group = .data_group,
       epiprocess__group_key = .group_key
     )
@@ -595,7 +595,7 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values,
         "i" = c("Input data may contain `time_values` closer together than the
        expected `time_step` size")
       ),
-      class = "epiprocess__epi_slide_mean__unexpected_row_number",
+      class = "epiprocess__epi_slide_opt__unexpected_row_number",
       epiprocess__data_group = .data_group,
       epiprocess__group_key = .group_key
     )
@@ -844,7 +844,7 @@ full_date_seq <- function(x, before, after, time_step) {
         "i" = c("The input data's `time_type` was probably `custom` or `day-time`.
These require also passing a `time_step` function.")
       ),
-      class = "epiprocess__epi_slide_mean__unmappable_time_type",
+      class = "epiprocess__full_date_seq__unmappable_time_type",
       epiprocess__time_type = ttype
     )
   }
diff --git a/man/epi_slide.Rd b/man/epi_slide.Rd
index 0d0dfb55..a1319f99 100644
--- a/man/epi_slide.Rd
+++ b/man/epi_slide.Rd
@@ -160,8 +160,8 @@ through the \code{new_col_name} argument.
 }
 \examples{
 # slide a 7-day trailing average formula on cases
-# This and other simple sliding means are much faster to do using
-# the `epi_slide_mean` function instead.
+# Simple sliding means and sums are much faster to do using
+# the `epi_slide_mean` and `epi_slide_sum` functions instead.
 jhu_csse_daily_subset \%>\%
   group_by(geo_value) \%>\%
   epi_slide(cases_7dav = mean(cases), before = 6) \%>\%

From d6ff281de91b31039fdb9260f50ab0c70fb4bc61 Mon Sep 17 00:00:00 2001
From: Nat DeFries <42820733+nmdefries@users.noreply.github.com>
Date: Mon, 3 Jun 2024 15:53:21 -0400
Subject: [PATCH 277/345] run `eval_select` once and immediately convert `pos`
 into explicit column names to avoid running the same computation for each
 `.data_group`

---
 R/slide.R | 30 +++++++++++++-----------------
 1 file changed, 13 insertions(+), 17 deletions(-)

diff --git a/R/slide.R b/R/slide.R
index e2d70163..28e67dbe 100644
--- a/R/slide.R
+++ b/R/slide.R
@@ -556,6 +556,16 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values,
   # `before` and `after` params.
   window_size <- before + after + 1L
 
+  # The position of a given column can differ between input `x` and
+  # `.data_group` since the grouping step by default drops grouping columns.
+  # To avoid rerunning `eval_select` for every `.data_group`, convert
+  # positions of user-provided `col_names` into string column names. We avoid
+  # using `names(pos)` directly for robustness and in case we later want to
+  # allow users to rename fields via tidyselection.
+  pos <- eval_select(rlang::enquo(col_names), data = x, allow_rename = FALSE)
+  col_names_chr <- names(x)[pos]
+  # Always rename results to "slide_value_".
+  result_col_names <- paste0("slide_value_", col_names_chr)
   slide_one_grp <- function(.data_group, .group_key, ...) {
     missing_times <- all_dates[!(all_dates %in% .data_group$time_value)]
@@ -601,23 +611,9 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values,
       )
     }
 
-    # Although this value is the same for every `.data_group`, it needs to be
-    # evaluated inside `slide_one_grp`. This is because input `x` and
-    # `.data_group` can have a different number of columns (due to the
-    # grouping step), i.e. the position that `eval_select` returns for a
-    # given column can be different.
-    #
-    # It is possible that rerunning this is slow We could alternately
-    # initialize `pos` and `result_col_names` variables to `NULL` one level
-    # up, and superassign `<<-` the values here the first time we run
-    # `slide_one_grp` (relative resources use TBD).
-    pos <- eval_select(rlang::enquo(col_names), data = .data_group)
-    # Always rename results to "slide_value_".
-    result_col_names <- paste0("slide_value_", names(x[, pos]))
-
     if (f_from_package == "data.table") {
       roll_output <- f(
-        x = .data_group[, pos], n = window_size, align = "right", ...
+        x = .data_group[, col_names_chr], n = window_size, align = "right", ...
) if (after >= 1) { @@ -631,9 +627,9 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values, .data_group[, result_col_names] <- roll_output } } else if (f_from_package == "slider") { - for (i in seq_along(pos)) { + for (i in seq_along(col_names_chr)) { .data_group[, result_col_names[i]] <- f( - x = .data_group[[pos[i]]], before = before, after = after, ... + x = .data_group[[col_names_chr[i]]], before = before, after = after, ... ) } } From e89d79aa30161e2b4b73ccd5b27f6bd9b22dd117 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Mon, 3 Jun 2024 16:23:10 -0400 Subject: [PATCH 278/345] update test error classes to match _mean -> _opt --- R/slide.R | 6 +++--- tests/testthat/test-epi_slide.R | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/R/slide.R b/R/slide.R index 28e67dbe..33ce0e7e 100644 --- a/R/slide.R +++ b/R/slide.R @@ -452,19 +452,19 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values, if (!is.null(as_list_col)) { cli_abort( "`as_list_col` is not supported for `epi_slide_[opt/mean/sum]`", - class = "epiproces__epi_slide_opt__list_not_supported" + class = "epiprocess__epi_slide_opt__list_not_supported" ) } if (!is.null(new_col_name)) { cli_abort( "`new_col_name` is not supported for `epi_slide_[opt/mean/sum]`", - class = "epiproces__epi_slide_opt__new_name_not_supported" + class = "epiprocess__epi_slide_opt__new_name_not_supported" ) } if (!is.null(names_sep)) { cli_abort( "`names_sep` is not supported for `epi_slide_[opt/mean/sum]`", - class = "epiproces__epi_slide_opt__name_sep_not_supported" + class = "epiprocess__epi_slide_opt__name_sep_not_supported" ) } diff --git a/tests/testthat/test-epi_slide.R b/tests/testthat/test-epi_slide.R index 6d66e0c4..6d2f8d23 100644 --- a/tests/testthat/test-epi_slide.R +++ b/tests/testthat/test-epi_slide.R @@ -361,7 +361,7 @@ test_that("computation output formats x as_list_col", { value, before = 6L, as_list_col = TRUE, na.rm = TRUE ), - class = "epiproces__epi_slide_mean__list_not_supported" + class = "epiprocess__epi_slide_opt__list_not_supported" ) # `epi_slide_mean` doesn't return dataframe columns }) @@ -729,7 +729,7 @@ test_that("basic ungrouped epi_slide_mean computation produces expected output", # e.g. 
aggregating across geos expect_error( small_x %>% ungroup() %>% epi_slide_mean(value, before = 6L), - class = "epiprocess__epi_slide_mean__duplicate_time_values" + class = "epiprocess__epi_slide_opt__duplicate_time_values" ) }) @@ -1152,7 +1152,7 @@ test_that("special time_types without time_step fail in epi_slide_mean", { col_names = a, before = before, after = after ), - class = "epiprocess__epi_slide_mean__unmappable_time_type" + class = "epiprocess__epi_slide_opt__unmappable_time_type" ) } @@ -1376,14 +1376,14 @@ test_that("`epi_slide_mean` errors when passed `time_values` with closer than ex as_epi_df() expect_error( epi_slide_mean(time_df, value, before = 6L, time_step = lubridate::seconds), - class = "epiprocess__epi_slide_mean__unexpected_row_number" + class = "epiprocess__epi_slide_opt__unexpected_row_number" ) }) test_that("`epi_slide_mean` errors when passed `col_names` as list", { expect_error( epi_slide_mean(grouped, col_names = list(value), before = 1L, after = 0L, ref_time_values = d + 1), - class = "epiprocess__epi_slide_mean__col_names_in_list" + class = "epiprocess__epi_slide_opt__col_names_in_list" ) }) From c91e4e671817fd6bc6c18574935283a1a18fef27 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Mon, 3 Jun 2024 17:15:43 -0400 Subject: [PATCH 279/345] update tests --- tests/testthat/test-epi_slide.R | 36 ++++++++++----------------------- 1 file changed, 11 insertions(+), 25 deletions(-) diff --git a/tests/testthat/test-epi_slide.R b/tests/testthat/test-epi_slide.R index 6d2f8d23..6561ab78 100644 --- a/tests/testthat/test-epi_slide.R +++ b/tests/testthat/test-epi_slide.R @@ -383,16 +383,6 @@ test_that("nested dataframe output names are controllable", { ), basic_result_from_size1_sum %>% rename(value_sum = slide_value) ) - expect_identical( - toy_edf %>% filter( - geo_value == "a" - ) %>% - epi_slide_mean( - value, - before = 6L, names_sep = NULL, na.rm = TRUE - ), - basic_result_from_size1_mean - ) }) test_that("non-size-1 outputs are recycled", { @@ -482,7 +472,8 @@ test_that("`ref_time_values` + `all_rows = TRUE` works", { value, before = 6L, names_sep = NULL, na.rm = TRUE ), - basic_result_from_size1_mean + basic_result_from_size1_mean %>% + rename(slide_value_value = slide_value) ) expect_identical( toy_edf %>% filter( @@ -493,7 +484,8 @@ test_that("`ref_time_values` + `all_rows = TRUE` works", { before = 6L, ref_time_values = c(2L, 8L), names_sep = NULL, na.rm = TRUE ), - filter(basic_result_from_size1_mean, time_value %in% c(2L, 8L)) + filter(basic_result_from_size1_mean, time_value %in% c(2L, 8L)) %>% + rename(slide_value_value = slide_value) ) expect_identical( toy_edf %>% filter( @@ -505,9 +497,10 @@ test_that("`ref_time_values` + `all_rows = TRUE` works", { names_sep = NULL, na.rm = TRUE ), basic_result_from_size1_mean %>% - dplyr::mutate(slide_value = dplyr::if_else(time_value %in% c(2L, 8L), + dplyr::mutate(slide_value_value = dplyr::if_else(time_value %in% c(2L, 8L), slide_value, NA_integer_ - )) + )) %>% + select(-slide_value) ) # slide computations returning data frames: @@ -662,7 +655,7 @@ test_that("basic grouped epi_slide_mean computation produces expected output", { as_epi_df(as_of = d + 6) result1 <- epi_slide_mean(small_x, value, before = 50, names_sep = NULL, na.rm = TRUE) - expect_identical(result1, expected_output) + expect_identical(result1, expected_output %>% rename(slide_value_value = slide_value)) }) test_that("ungrouped epi_slide computation completes successfully", { @@ -722,7 +715,7 @@ 
test_that("basic ungrouped epi_slide_mean computation produces expected output", ungroup() %>% filter(geo_value == "ak") %>% epi_slide_mean(value, before = 50, names_sep = NULL, na.rm = TRUE) - expect_identical(result1, expected_output) + expect_identical(result1, expected_output %>% rename(slide_value_value = slide_value)) # Ungrouped with multiple geos # epi_slide_mean fails when input data groups contain duplicate time_values, @@ -928,7 +921,7 @@ test_that("basic slide behavior is correct when groups have non-overlapping date expect_identical(result1, expected_output) result2 <- epi_slide_mean(small_x_misaligned_dates, value, before = 50, names_sep = NULL, na.rm = TRUE) - expect_identical(result2, expected_output) + expect_identical(result2, expected_output %>% rename(slide_value_value = slide_value)) }) @@ -1152,7 +1145,7 @@ test_that("special time_types without time_step fail in epi_slide_mean", { col_names = a, before = before, after = after ), - class = "epiprocess__epi_slide_opt__unmappable_time_type" + class = "epiprocess__full_date_seq__unmappable_time_type" ) } @@ -1380,13 +1373,6 @@ test_that("`epi_slide_mean` errors when passed `time_values` with closer than ex ) }) -test_that("`epi_slide_mean` errors when passed `col_names` as list", { - expect_error( - epi_slide_mean(grouped, col_names = list(value), before = 1L, after = 0L, ref_time_values = d + 1), - class = "epiprocess__epi_slide_opt__col_names_in_list" - ) -}) - test_that("epi_slide_mean produces same output as epi_slide_opt", { result1 <- epi_slide_mean(small_x, value, before = 50, names_sep = NULL, na.rm = TRUE) result2 <- epi_slide_opt(small_x, value, From 7ed59b6194d5a5c0b665c971518b31385a78d8ea Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Mon, 3 Jun 2024 17:28:19 -0400 Subject: [PATCH 280/345] update examples --- R/slide.R | 49 ++++++++++++++++++++----------------------- man/epi_slide_mean.Rd | 22 +++++++++---------- man/epi_slide_opt.Rd | 23 +++++++++----------- man/epi_slide_sum.Rd | 4 ++-- 4 files changed, 46 insertions(+), 52 deletions(-) diff --git a/R/slide.R b/R/slide.R index 33ce0e7e..7d24c7a7 100644 --- a/R/slide.R +++ b/R/slide.R @@ -391,45 +391,42 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' group_by(geo_value) %>% #' epi_slide_opt( #' cases, -#' f = data.table::frollmean, new_col_name = "cases_7dav", names_sep = NULL, before = 6 +#' f = data.table::frollmean, before = 6 #' ) %>% -#' # Remove a nonessential var. to ensure new col is printed -#' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% +#' # Remove a nonessential var. to ensure new col is printed, and rename new col +#' dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) %>% #' ungroup() #' #' # slide a 7-day trailing average formula on cases. Adjust `frollmean` settings for speed #' # and accuracy, and to allow partially-missing windows. 
#' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide_opt(cases, -#' f = data.table::frollmean, -#' new_col_name = "cases_7dav", names_sep = NULL, before = 6, +#' epi_slide_opt( +#' cases, f = data.table::frollmean, before = 6, #' # `frollmean` options #' na.rm = TRUE, algo = "exact", hasNA = TRUE #' ) %>% -#' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% +#' dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) %>% #' ungroup() #' #' # slide a 7-day leading average #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% #' epi_slide_opt( -#' cases, -#' f = slider::slide_mean, new_col_name = "cases_7dav", names_sep = NULL, after = 6 +#' cases, f = slider::slide_mean, after = 6 #' ) %>% #' # Remove a nonessential var. to ensure new col is printed -#' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% +#' dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) %>% #' ungroup() #' #' # slide a 7-day centre-aligned sum. This can also be done with `epi_slide_sum` #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% #' epi_slide_opt( -#' cases, -#' f = data.table::frollsum, new_col_name = "cases_7dav", names_sep = NULL, before = 3, after = 3 +#' cases, f = data.table::frollsum, before = 3, after = 3 #' ) %>% #' # Remove a nonessential var. to ensure new col is printed -#' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% +#' dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) %>% #' ungroup() epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values, time_step, @@ -681,45 +678,45 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values, #' # slide a 7-day trailing average formula on cases #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide_mean(cases, new_col_name = "cases_7dav", names_sep = NULL, before = 6) %>% +#' epi_slide_mean(cases, before = 6) %>% #' # Remove a nonessential var. to ensure new col is printed -#' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% +#' dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) %>% #' ungroup() #' #' # slide a 7-day trailing average formula on cases. Adjust `frollmean` settings for speed #' # and accuracy, and to allow partially-missing windows. #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide_mean(cases, -#' new_col_name = "cases_7dav", names_sep = NULL, before = 6, +#' epi_slide_mean( +#' cases, before = 6, #' # `frollmean` options #' na.rm = TRUE, algo = "exact", hasNA = TRUE #' ) %>% -#' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% +#' dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) %>% #' ungroup() #' #' # slide a 7-day leading average #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide_mean(cases, new_col_name = "cases_7dav", names_sep = NULL, after = 6) %>% +#' epi_slide_mean(cases, after = 6) %>% #' # Remove a nonessential var. to ensure new col is printed -#' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% +#' dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) %>% #' ungroup() #' #' # slide a 7-day centre-aligned average #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide_mean(cases, new_col_name = "cases_7dav", names_sep = NULL, before = 3, after = 3) %>% +#' epi_slide_mean(cases, before = 3, after = 3) %>% #' # Remove a nonessential var. 
to ensure new col is printed -#' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% +#' dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) %>% #' ungroup() #' #' # slide a 14-day centre-aligned average #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide_mean(cases, new_col_name = "cases_14dav", names_sep = NULL, before = 6, after = 7) %>% +#' epi_slide_mean(cases, before = 6, after = 7) %>% #' # Remove a nonessential var. to ensure new col is printed -#' dplyr::select(geo_value, time_value, cases, cases_14dav) %>% +#' dplyr::select(geo_value, time_value, cases, cases_14dav = slide_value_cases) %>% #' ungroup() epi_slide_mean <- function(x, col_names, ..., before, after, ref_time_values, time_step, @@ -764,9 +761,9 @@ epi_slide_mean <- function(x, col_names, ..., before, after, ref_time_values, #' # slide a 7-day trailing sum formula on cases #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide_sum(cases, new_col_name = "cases_7dsum", names_sep = NULL, before = 6) %>% +#' epi_slide_sum(cases, before = 6) %>% #' # Remove a nonessential var. to ensure new col is printed -#' dplyr::select(geo_value, time_value, cases, cases_7dsum) %>% +#' dplyr::select(geo_value, time_value, cases, cases_7dsum = slide_value_cases) %>% #' ungroup() epi_slide_sum <- function(x, col_names, ..., before, after, ref_time_values, time_step, diff --git a/man/epi_slide_mean.Rd b/man/epi_slide_mean.Rd index 9937c986..9f967cce 100644 --- a/man/epi_slide_mean.Rd +++ b/man/epi_slide_mean.Rd @@ -130,45 +130,45 @@ misspelled.) # slide a 7-day trailing average formula on cases jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide_mean(cases, new_col_name = "cases_7dav", names_sep = NULL, before = 6) \%>\% + epi_slide_mean(cases, before = 6) \%>\% # Remove a nonessential var. to ensure new col is printed - dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% + dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) \%>\% ungroup() # slide a 7-day trailing average formula on cases. Adjust `frollmean` settings for speed # and accuracy, and to allow partially-missing windows. jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide_mean(cases, - new_col_name = "cases_7dav", names_sep = NULL, before = 6, + epi_slide_mean( + cases, before = 6, # `frollmean` options na.rm = TRUE, algo = "exact", hasNA = TRUE ) \%>\% - dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% + dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) \%>\% ungroup() # slide a 7-day leading average jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide_mean(cases, new_col_name = "cases_7dav", names_sep = NULL, after = 6) \%>\% + epi_slide_mean(cases, after = 6) \%>\% # Remove a nonessential var. to ensure new col is printed - dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% + dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) \%>\% ungroup() # slide a 7-day centre-aligned average jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide_mean(cases, new_col_name = "cases_7dav", names_sep = NULL, before = 3, after = 3) \%>\% + epi_slide_mean(cases, before = 3, after = 3) \%>\% # Remove a nonessential var. 
to ensure new col is printed - dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% + dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) \%>\% ungroup() # slide a 14-day centre-aligned average jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide_mean(cases, new_col_name = "cases_14dav", names_sep = NULL, before = 6, after = 7) \%>\% + epi_slide_mean(cases, before = 6, after = 7) \%>\% # Remove a nonessential var. to ensure new col is printed - dplyr::select(geo_value, time_value, cases, cases_14dav) \%>\% + dplyr::select(geo_value, time_value, cases, cases_14dav = slide_value_cases) \%>\% ungroup() } \seealso{ diff --git a/man/epi_slide_opt.Rd b/man/epi_slide_opt.Rd index 4a8b6e68..6d48deb9 100644 --- a/man/epi_slide_opt.Rd +++ b/man/epi_slide_opt.Rd @@ -153,45 +153,42 @@ jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% epi_slide_opt( cases, - f = data.table::frollmean, new_col_name = "cases_7dav", names_sep = NULL, before = 6 + f = data.table::frollmean, before = 6 ) \%>\% - # Remove a nonessential var. to ensure new col is printed - dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% + # Remove a nonessential var. to ensure new col is printed, and rename new col + dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) \%>\% ungroup() # slide a 7-day trailing average formula on cases. Adjust `frollmean` settings for speed # and accuracy, and to allow partially-missing windows. jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide_opt(cases, - f = data.table::frollmean, - new_col_name = "cases_7dav", names_sep = NULL, before = 6, + epi_slide_opt( + cases, f = data.table::frollmean, before = 6, # `frollmean` options na.rm = TRUE, algo = "exact", hasNA = TRUE ) \%>\% - dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% + dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) \%>\% ungroup() # slide a 7-day leading average jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% epi_slide_opt( - cases, - f = slider::slide_mean, new_col_name = "cases_7dav", names_sep = NULL, after = 6 + cases, f = slider::slide_mean, after = 6 ) \%>\% # Remove a nonessential var. to ensure new col is printed - dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% + dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) \%>\% ungroup() # slide a 7-day centre-aligned sum. This can also be done with `epi_slide_sum` jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% epi_slide_opt( - cases, - f = data.table::frollsum, new_col_name = "cases_7dav", names_sep = NULL, before = 3, after = 3 + cases, f = data.table::frollsum, before = 3, after = 3 ) \%>\% # Remove a nonessential var. to ensure new col is printed - dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% + dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) \%>\% ungroup() } \seealso{ diff --git a/man/epi_slide_sum.Rd b/man/epi_slide_sum.Rd index a65fb815..076f4959 100644 --- a/man/epi_slide_sum.Rd +++ b/man/epi_slide_sum.Rd @@ -130,9 +130,9 @@ misspelled.) # slide a 7-day trailing sum formula on cases jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide_sum(cases, new_col_name = "cases_7dsum", names_sep = NULL, before = 6) \%>\% + epi_slide_sum(cases, before = 6) \%>\% # Remove a nonessential var. 
to ensure new col is printed - dplyr::select(geo_value, time_value, cases, cases_7dsum) \%>\% + dplyr::select(geo_value, time_value, cases, cases_7dsum = slide_value_cases) \%>\% ungroup() } \seealso{ From a1effcdb26834376b4cdc2896b16cdec929102a7 Mon Sep 17 00:00:00 2001 From: nmdefries Date: Mon, 3 Jun 2024 21:29:51 +0000 Subject: [PATCH 281/345] style: styler (GHA) --- R/slide.R | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/R/slide.R b/R/slide.R index 7d24c7a7..27a3135c 100644 --- a/R/slide.R +++ b/R/slide.R @@ -402,7 +402,8 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% #' epi_slide_opt( -#' cases, f = data.table::frollmean, before = 6, +#' cases, +#' f = data.table::frollmean, before = 6, #' # `frollmean` options #' na.rm = TRUE, algo = "exact", hasNA = TRUE #' ) %>% @@ -413,7 +414,8 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% #' epi_slide_opt( -#' cases, f = slider::slide_mean, after = 6 +#' cases, +#' f = slider::slide_mean, after = 6 #' ) %>% #' # Remove a nonessential var. to ensure new col is printed #' dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) %>% @@ -423,7 +425,8 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% #' epi_slide_opt( -#' cases, f = data.table::frollsum, before = 3, after = 3 +#' cases, +#' f = data.table::frollsum, before = 3, after = 3 #' ) %>% #' # Remove a nonessential var. to ensure new col is printed #' dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) %>% @@ -688,7 +691,8 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values, #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% #' epi_slide_mean( -#' cases, before = 6, +#' cases, +#' before = 6, #' # `frollmean` options #' na.rm = TRUE, algo = "exact", hasNA = TRUE #' ) %>% From 63fd31d09b3946e6812fe8365760df0c187ffcfb Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Mon, 3 Jun 2024 17:37:58 -0400 Subject: [PATCH 282/345] news and version --- DESCRIPTION | 2 +- NEWS.md | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index cc7cba12..0c871dca 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: epiprocess Title: Tools for basic signal processing in epidemiology -Version: 0.7.9 +Version: 0.7.11 Authors@R: c( person("Jacob", "Bien", role = "ctb"), person("Logan", "Brooks", email = "lcbrooks@andrew.cmu.edu", role = c("aut", "cre")), diff --git a/NEWS.md b/NEWS.md index a1591d8d..48cefc65 100644 --- a/NEWS.md +++ b/NEWS.md @@ -16,6 +16,7 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat - Add new `epi_slide_opt` function to allow much faster rolling computations in some cases, using `data.table` and `slider` optimized rolling functions (#433). +- Add tidyselect interfact for `epi_slide_opt` and derivatives (#452). - regenerated the `jhu_csse_daily_subset` dataset with the latest versions of the data from the API - changed approach to versioning, see DEVELOPMENT.md for details From 5c8322dcced748e5372f74dbc0f59b2498ef4d06 Mon Sep 17 00:00:00 2001 From: "Logan C. 
Brooks" Date: Tue, 4 Jun 2024 11:47:54 -0700 Subject: [PATCH 283/345] docs (epiprocess.Rmd): attempt short intro intro + ref dplyr "verbs" --- vignettes/epiprocess.Rmd | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/vignettes/epiprocess.Rmd b/vignettes/epiprocess.Rmd index 2d67e28a..f278f827 100644 --- a/vignettes/epiprocess.Rmd +++ b/vignettes/epiprocess.Rmd @@ -9,16 +9,21 @@ editor_options: chunk_output_type: console --- -The [`{epiprocess}`](https://cmu-delphi.github.io/epiprocess/) package provides +The [`{epiprocess}`](https://cmu-delphi.github.io/epiprocess/) package works +with epidemiological time series and version data to provide situational +awareness, processing and transformations in preparation for modeling, and +version-faithful model backtesting. It contains: - `epi_df`, a class for working with epidemiological time series data; - `epi_archive`, a class for working with the version history of such time series data; - sample data in these formats; -- functions for common data transformations (e.g., 7-day averages); +- [`{dplyr}`](https://dplyr.tidyverse.org/)-esque "verbs" for common data + transformations (e.g., 7-day averages); - functions for exploratory data analysis and situational awareness (e.g., outlier detection and growth rate estimation); and -- functions for version-faithful "pseudoprospective" backtesting of models, and - other version history analysis. +- [`{dplyr}`](https://dplyr.tidyverse.org/)-esque "verbs" for version-faithful + "pseudoprospective" backtesting of models, and other version history analysis + and transformations. It is part of a broader suite of packages including [`{epidatr}`](https://cmu-delphi.github.io/epidatr/), From e64f3c313da6d3c32ae348794e6c94d8df533caf Mon Sep 17 00:00:00 2001 From: brookslogan Date: Tue, 4 Jun 2024 13:19:16 -0700 Subject: [PATCH 284/345] docs(epiprocess.Rmd): order package links by importance/relevance Co-authored-by: nmdefries <42820733+nmdefries@users.noreply.github.com> --- vignettes/epiprocess.Rmd | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vignettes/epiprocess.Rmd b/vignettes/epiprocess.Rmd index f278f827..c0cb0011 100644 --- a/vignettes/epiprocess.Rmd +++ b/vignettes/epiprocess.Rmd @@ -25,11 +25,11 @@ version-faithful model backtesting. It contains: "pseudoprospective" backtesting of models, and other version history analysis and transformations. -It is part of a broader suite of packages including +It is part of a broader suite of packages that includes +[`{epipredict}`](https://cmu-delphi.github.io/epipredict/), [`{epidatr}`](https://cmu-delphi.github.io/epidatr/), -[`{epidatasets}`](https://cmu-delphi.github.io/epidatasets/), [`{rtestim}`](https://dajmcdon.github.io/rtestim/), and -[`{epipredict}`](https://cmu-delphi.github.io/epipredict/), for accessing, +[`{epidatasets}`](https://cmu-delphi.github.io/epidatasets/), for accessing, analyzing, and forecasting epidemiological time series data. We have expanded documentation and demonstrations for some of these packages available in an online "book" format [here](https://cmu-delphi.github.io/delphi-tooling-book/). 
From 6c1524d82289eab71af1f1b9dfe90d4dee6cb6bb Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 6 Jun 2024 15:35:58 -0400 Subject: [PATCH 285/345] test renaming --- NEWS.md | 2 +- tests/testthat/test-epi_slide.R | 30 ++++++++++++++++-------------- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/NEWS.md b/NEWS.md index 2c295232..57256cd7 100644 --- a/NEWS.md +++ b/NEWS.md @@ -16,7 +16,7 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat - Add new `epi_slide_opt` function to allow much faster rolling computations in some cases, using `data.table` and `slider` optimized rolling functions (#433). -- Add tidyselect interfact for `epi_slide_opt` and derivatives (#452). +- Add tidyselect interface for `epi_slide_opt` and derivatives (#452). - regenerated the `jhu_csse_daily_subset` dataset with the latest versions of the data from the API - changed approach to versioning, see DEVELOPMENT.md for details diff --git a/tests/testthat/test-epi_slide.R b/tests/testthat/test-epi_slide.R index 6561ab78..90851bb5 100644 --- a/tests/testthat/test-epi_slide.R +++ b/tests/testthat/test-epi_slide.R @@ -29,7 +29,7 @@ toy_edf <- tibble::tribble( as_epi_df(as_of = 100) # nolint start: line_length_linter. -basic_result_from_size1_sum <- tibble::tribble( +basic_sum_result <- tibble::tribble( ~geo_value, ~time_value, ~value, ~slide_value, "a", 1:10, 2L^(1:10), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE), "b", 1:10, 2L^(11:20), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE), @@ -38,7 +38,7 @@ basic_result_from_size1_sum <- tibble::tribble( dplyr::arrange(time_value) %>% as_epi_df(as_of = 100) -basic_result_from_size1_mean <- tibble::tribble( +basic_mean_result <- tibble::tribble( ~geo_value, ~time_value, ~value, ~slide_value, "a", 1:10, 2L^(1:10), data.table::frollmean(2L^(1:10), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE), ) %>% @@ -315,27 +315,29 @@ test_that( ) test_that("computation output formats x as_list_col", { - # See `toy_edf` and `basic_result_from_size1_sum` definitions at top of file. + # See `toy_edf` and `basic_sum_result` definitions at top of file. # We'll try 7d sum with a few formats. expect_identical( toy_edf %>% epi_slide(before = 6L, ~ sum(.x$value)), - basic_result_from_size1_sum + basic_sum_result ) expect_identical( toy_edf %>% epi_slide(before = 6L, ~ sum(.x$value), as_list_col = TRUE), - basic_result_from_size1_sum %>% dplyr::mutate(slide_value = as.list(slide_value)) + basic_sum_result %>% dplyr::mutate(slide_value = as.list(slide_value)) ) expect_identical( toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value))), - basic_result_from_size1_sum %>% rename(slide_value_value = slide_value) + basic_sum_result %>% rename(slide_value_value = slide_value) ) expect_identical( toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)), as_list_col = TRUE), - basic_result_from_size1_sum %>% + basic_sum_result %>% mutate(slide_value = purrr::map(slide_value, ~ data.frame(value = .x))) ) +} - # See `toy_edf` and `basic_result_from_size1_mean` definitions at top of file. +test_that("epi_slide_mean errors when `as_list_col` non-NULL", { + # See `toy_edf` and `basic_mean_result` definitions at top of file. # We'll try 7d avg with a few formats. # Warning: not exactly the same naming behavior as `epi_slide`. 
expect_identical( @@ -347,7 +349,7 @@ test_that("computation output formats x as_list_col", { value, before = 6L, na.rm = TRUE ), - basic_result_from_size1_mean %>% dplyr::mutate( + basic_mean_result %>% dplyr::mutate( slide_value_value = slide_value ) %>% select(-slide_value) @@ -373,7 +375,7 @@ test_that("nested dataframe output names are controllable", { before = 6L, ~ data.frame(value = sum(.x$value)), new_col_name = "result" ), - basic_result_from_size1_sum %>% rename(result_value = slide_value) + basic_sum_result %>% rename(result_value = slide_value) ) expect_identical( toy_edf %>% @@ -381,7 +383,7 @@ test_that("nested dataframe output names are controllable", { before = 6L, ~ data.frame(value_sum = sum(.x$value)), names_sep = NULL ), - basic_result_from_size1_sum %>% rename(value_sum = slide_value) + basic_sum_result %>% rename(value_sum = slide_value) ) }) @@ -472,7 +474,7 @@ test_that("`ref_time_values` + `all_rows = TRUE` works", { value, before = 6L, names_sep = NULL, na.rm = TRUE ), - basic_result_from_size1_mean %>% + basic_mean_result %>% rename(slide_value_value = slide_value) ) expect_identical( @@ -484,7 +486,7 @@ test_that("`ref_time_values` + `all_rows = TRUE` works", { before = 6L, ref_time_values = c(2L, 8L), names_sep = NULL, na.rm = TRUE ), - filter(basic_result_from_size1_mean, time_value %in% c(2L, 8L)) %>% + filter(basic_mean_result, time_value %in% c(2L, 8L)) %>% rename(slide_value_value = slide_value) ) expect_identical( @@ -496,7 +498,7 @@ test_that("`ref_time_values` + `all_rows = TRUE` works", { before = 6L, ref_time_values = c(2L, 8L), all_rows = TRUE, names_sep = NULL, na.rm = TRUE ), - basic_result_from_size1_mean %>% + basic_mean_result %>% dplyr::mutate(slide_value_value = dplyr::if_else(time_value %in% c(2L, 8L), slide_value, NA_integer_ )) %>% From 1afd7653913ac62e0590d1abe16c7e58e7bb69a5 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 6 Jun 2024 16:09:45 -0400 Subject: [PATCH 286/345] missing ) --- tests/testthat/test-epi_slide.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test-epi_slide.R b/tests/testthat/test-epi_slide.R index 90851bb5..8765d50c 100644 --- a/tests/testthat/test-epi_slide.R +++ b/tests/testthat/test-epi_slide.R @@ -334,7 +334,7 @@ test_that("computation output formats x as_list_col", { basic_sum_result %>% mutate(slide_value = purrr::map(slide_value, ~ data.frame(value = .x))) ) -} +}) test_that("epi_slide_mean errors when `as_list_col` non-NULL", { # See `toy_edf` and `basic_mean_result` definitions at top of file. From de936f55ba705f350c92aaca83be6a4e03baa792 Mon Sep 17 00:00:00 2001 From: nmdefries <42820733+nmdefries@users.noreply.github.com> Date: Fri, 7 Jun 2024 14:42:37 -0400 Subject: [PATCH 287/345] tidyselect options description Co-authored-by: brookslogan --- man-roxygen/opt-slide-params.R | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/man-roxygen/opt-slide-params.R b/man-roxygen/opt-slide-params.R index 2fb51315..34ff0b2a 100644 --- a/man-roxygen/opt-slide-params.R +++ b/man-roxygen/opt-slide-params.R @@ -1,10 +1,13 @@ -#' @param col_names A character vector OR a -#' <[`tidy-select`][dplyr_tidy_select]> of the names of one or more columns -#' for which to calculate a rolling computation. If a tidy-selection, one -#' or more unquoted expressions separated by commas. 
Variable names can be
-#' used as if they were positions in the data frame, so expressions like
-#' `x:y` can be used to select a range of variables. The tidy-selection
-#' cannot be used to provide output column names.
+#' @param col_names <[`tidy-select`][dplyr_tidy_select]> An unquoted column name
+#' (e.g., `cases`), multiple column names (e.g., `c(cases, deaths)`), or other
+#' tidy-select expression. Variable names can be used as if they were
+#' positions in the data frame, so expressions like `x:y` can be used to
+#' select a range of variables. If you have the desired column names stored in
+#' a vector `vars`, use `col_names = all_of(vars)`.
+#'
+#' The tidy-selection renaming interface is not supported, and cannot be used
+#' to provide output column names; if you want to customize the output column
+#' names, use [`dplyr::rename`] after the slide.
 #' @param as_list_col Not supported. Included to match `epi_slide` interface.
 #' @param new_col_name Character vector indicating the name(s) of the new
 #' column(s) that will contain the derivative values. Default

From 64658c13b7514dc2f3c6c21d9b92b3a14c6dead4 Mon Sep 17 00:00:00 2001
From: Nat DeFries <42820733+nmdefries@users.noreply.github.com>
Date: Fri, 7 Jun 2024 14:55:15 -0400
Subject: [PATCH 288/345] build docs and link tidyselect page

---
 man-roxygen/opt-slide-params.R | 12 ++++++------
 man/epi_slide_mean.Rd          | 20 ++++++++++++--------
 man/epi_slide_opt.Rd           | 26 ++++++++++++++----------
 man/epi_slide_sum.Rd           | 17 ++++++++++-------
 4 files changed, 44 insertions(+), 31 deletions(-)

diff --git a/man-roxygen/opt-slide-params.R b/man-roxygen/opt-slide-params.R
index 34ff0b2a..d13921b2 100644
--- a/man-roxygen/opt-slide-params.R
+++ b/man-roxygen/opt-slide-params.R
@@ -1,9 +1,9 @@
-#' @param col_names <[`tidy-select`][dplyr_tidy_select]> An unquoted column name
-#' (e.g., `cases`), multiple column names (e.g., `c(cases, deaths)`), or other
-#' tidy-select expression. Variable names can be used as if they were
-#' positions in the data frame, so expressions like `x:y` can be used to
-#' select a range of variables. If you have the desired column names stored in
-#' a vector `vars`, use `col_names = all_of(vars)`.
+#' @param col_names <[`tidy-select`][dplyr_tidy_select]> An unquoted column
+#' name (e.g., `cases`), multiple column names (e.g., `c(cases, deaths)`), or
+#' [other tidy-select expression][tidyselect::language]. Variable names can
+#' be used as if they were positions in the data frame, so expressions like
+#' `x:y` can be used to select a range of variables. If you have the desired
+#' column names stored in a vector `vars`, use `col_names = all_of(vars)`.
 #'
 #' The tidy-selection renaming interface is not supported, and cannot be used
 #' to provide output column names; if you want to customize the output column
diff --git a/man/epi_slide_mean.Rd b/man/epi_slide_mean.Rd
index 9f967cce..850a45a1 100644
--- a/man/epi_slide_mean.Rd
+++ b/man/epi_slide_mean.Rd
@@ -23,13 +23,16 @@ epi_slide_mean(
 or ungrouped. If ungrouped, all data in \code{x} will be treated as part of a
 single data group.}
 
-\item{col_names}{A character vector OR a
-<\code{\link[=dplyr_tidy_select]{tidy-select}}> of the names of one or more columns
-for which to calculate a rolling computation. If a tidy-selection, one
-or more unquoted expressions separated by commas. Variable names can be
-used as if they were positions in the data frame, so expressions like
-\code{x:y} can be used to select a range of variables. 
The tidy-selection
-cannot be used to provide output column names.}
+\item{col_names}{<\code{\link[=dplyr_tidy_select]{tidy-select}}> An unquoted column
+name (e.g., \code{cases}), multiple column names (e.g., \code{c(cases, deaths)}), or
+\link[tidyselect:language]{other tidy-select expression}. Variable names can
+be used as if they were positions in the data frame, so expressions like
+\code{x:y} can be used to select a range of variables. If you have the desired
+column names stored in a vector \code{vars}, use \code{col_names = all_of(vars)}.
+
+The tidy-selection renaming interface is not supported, and cannot be used
+to provide output column names; if you want to customize the output column
+names, use \code{\link[dplyr:rename]{dplyr::rename}} after the slide.}
 
 \item{...}{Additional arguments to pass to \code{data.table::frollmean}, for
 example, \code{na.rm} and \code{algo}. \code{data.table::frollmean} is automatically
@@ -140,7 +143,8 @@ jhu_csse_daily_subset \%>\%
 jhu_csse_daily_subset \%>\%
   group_by(geo_value) \%>\%
   epi_slide_mean(
-    cases, before = 6,
+    cases,
+    before = 6,
     # `frollmean` options
     na.rm = TRUE, algo = "exact", hasNA = TRUE
   ) \%>\%
diff --git a/man/epi_slide_opt.Rd b/man/epi_slide_opt.Rd
index 6d48deb9..4b011c16 100644
--- a/man/epi_slide_opt.Rd
+++ b/man/epi_slide_opt.Rd
@@ -24,13 +24,16 @@ epi_slide_opt(
 or ungrouped. If ungrouped, all data in \code{x} will be treated as part of a
 single data group.}
 
-\item{col_names}{A character vector OR a
-<\code{\link[=dplyr_tidy_select]{tidy-select}}> of the names of one or more columns
-for which to calculate a rolling computation. If a tidy-selection, one
-or more unquoted expressions separated by commas. Variable names can be
-used as if they were positions in the data frame, so expressions like
-\code{x:y} can be used to select a range of variables. The tidy-selection
-cannot be used to provide output column names.}
+\item{col_names}{<\code{\link[=dplyr_tidy_select]{tidy-select}}> An unquoted column
+name (e.g., \code{cases}), multiple column names (e.g., \code{c(cases, deaths)}), or
+\link[tidyselect:language]{other tidy-select expression}. Variable names can
+be used as if they were positions in the data frame, so expressions like
+\code{x:y} can be used to select a range of variables. If you have the desired
+column names stored in a vector \code{vars}, use \code{col_names = all_of(vars)}.
+
+The tidy-selection renaming interface is not supported, and cannot be used
+to provide output column names; if you want to customize the output column
+names, use \code{\link[dplyr:rename]{dplyr::rename}} after the slide.}
 
 \item{f}{Function; together with \code{...} specifies the computation to slide.
 \code{f} must be one of \code{data.table}'s rolling functions
@@ -164,7 +167,8 @@ jhu_csse_daily_subset \%>\%
 jhu_csse_daily_subset \%>\%
   group_by(geo_value) \%>\%
   epi_slide_opt(
-    cases, f = data.table::frollmean, before = 6,
+    cases,
+    f = data.table::frollmean, before = 6,
     # `frollmean` options
     na.rm = TRUE, algo = "exact", hasNA = TRUE
   ) \%>\%
@@ -175,7 +179,8 @@ jhu_csse_daily_subset \%>\%
 jhu_csse_daily_subset \%>\%
   group_by(geo_value) \%>\%
   epi_slide_opt(
-    cases, f = slider::slide_mean, after = 6
+    cases,
+    f = slider::slide_mean, after = 6
   ) \%>\%
   # Remove a nonessential var. 
to ensure new col is printed
   dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) \%>\%
@@ -185,7 +190,8 @@ jhu_csse_daily_subset \%>\%
 jhu_csse_daily_subset \%>\%
   group_by(geo_value) \%>\%
   epi_slide_opt(
-    cases, f = data.table::frollsum, before = 3, after = 3
+    cases,
+    f = data.table::frollsum, before = 3, after = 3
   ) \%>\%
   # Remove a nonessential var. to ensure new col is printed
   dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) \%>\%
   ungroup()
 }
 \seealso{
diff --git a/man/epi_slide_sum.Rd b/man/epi_slide_sum.Rd
index 076f4959..8c835bdb 100644
--- a/man/epi_slide_sum.Rd
+++ b/man/epi_slide_sum.Rd
@@ -23,13 +23,16 @@ epi_slide_sum(
 or ungrouped. If ungrouped, all data in \code{x} will be treated as part of a
 single data group.}
 
-\item{col_names}{A character vector OR a
-<\code{\link[=dplyr_tidy_select]{tidy-select}}> of the names of one or more columns
-for which to calculate a rolling computation. If a tidy-selection, one
-or more unquoted expressions separated by commas. Variable names can be
-used as if they were positions in the data frame, so expressions like
-\code{x:y} can be used to select a range of variables. The tidy-selection
-cannot be used to provide output column names.}
+\item{col_names}{<\code{\link[=dplyr_tidy_select]{tidy-select}}> An unquoted column
+name (e.g., \code{cases}), multiple column names (e.g., \code{c(cases, deaths)}), or
+\link[tidyselect:language]{other tidy-select expression}. Variable names can
+be used as if they were positions in the data frame, so expressions like
+\code{x:y} can be used to select a range of variables. If you have the desired
+column names stored in a vector \code{vars}, use \code{col_names = all_of(vars)}.
+
+The tidy-selection renaming interface is not supported, and cannot be used
+to provide output column names; if you want to customize the output column
+names, use \code{\link[dplyr:rename]{dplyr::rename}} after the slide.}
 
 \item{...}{Additional arguments to pass to \code{data.table::frollsum}, for
 example, \code{na.rm} and \code{algo}. \code{data.table::frollsum} is automatically

From bf5c6e81d117209408364016fc02ffd9eaf0de90 Mon Sep 17 00:00:00 2001
From: XuedaShen <110584221+XuedaShen@users.noreply.github.com>
Date: Tue, 18 Jun 2024 15:06:40 -0700
Subject: [PATCH 289/345] `slide.Rmd` seems to contain legacy info on
 `epi_slide`

---
 vignettes/slide.Rmd | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/vignettes/slide.Rmd b/vignettes/slide.Rmd
index b0295865..92590fb1 100644
--- a/vignettes/slide.Rmd
+++ b/vignettes/slide.Rmd
@@ -21,8 +21,7 @@ as `Date` objects, then one time step is one day, since
 `as.Date("2022-01-01") + 1` equals `as.Date("2022-01-02")`. Alternatively, the time
 step can be specified manually in the call to `epi_slide()`; you can read the
 documentation for more details. Furthermore, the alignment of the running window used in `epi_slide()`
-can be "right", "center", or "left"; the default is "right", and is what we use
-in this vignette.
+is specified by `before` and `after`.
As in getting started guide, we'll fetch daily reported COVID-19 cases from CA, FL, NY, and TX (note: here we're using new, not cumulative cases) using the From 270958007ec34a9bc8781def3748d756bc536ab0 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 20 Jun 2024 14:52:03 -0400 Subject: [PATCH 290/345] assign to dots using "assign" --- R/grouped_epi_archive.R | 5 ++++- R/slide.R | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R index 55a0176c..c6326751 100644 --- a/R/grouped_epi_archive.R +++ b/R/grouped_epi_archive.R @@ -317,7 +317,10 @@ epix_slide.grouped_epi_archive <- function(x, f, ..., before, ref_time_values, f <- quos[[1]] new_col <- sym(names(rlang::quos_auto_name(quos))) - ... <- missing_arg() # nolint: object_usage_linter. magic value that passes zero args as dots in calls below + # Magic value that passes zero args as dots in calls below. Equivalent to + # `... <- missing_arg()`, but use `assign` to avoid warning about + # improper use of dots. + assign("...", missing_arg()) } f <- as_slide_computation(f, ...) diff --git a/R/slide.R b/R/slide.R index 27a3135c..9d26174a 100644 --- a/R/slide.R +++ b/R/slide.R @@ -304,7 +304,10 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, f <- quos[[1]] new_col <- sym(names(rlang::quos_auto_name(quos))) - ... <- missing_arg() # magic value that passes zero args as dots in calls below # nolint: object_usage_linter + # Magic value that passes zero args as dots in calls below. Equivalent to + # `... <- missing_arg()`, but use `assign` to avoid warning about + # improper use of dots. + assign("...", missing_arg()) } f <- as_slide_computation(f, ...) From b7f36e079b6a39228ba634db9eceeb0f90ac81ac Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Thu, 20 Jun 2024 13:39:37 -0700 Subject: [PATCH 291/345] feat: add a epi_df attribute that avoids the decay. See #467 --- R/methods-epi_df.R | 1 + tests/testthat/test-as_tibble-decay.R | 6 ++++++ 2 files changed, 7 insertions(+) create mode 100644 tests/testthat/test-as_tibble-decay.R diff --git a/R/methods-epi_df.R b/R/methods-epi_df.R index 526a1171..3f0626fe 100644 --- a/R/methods-epi_df.R +++ b/R/methods-epi_df.R @@ -12,6 +12,7 @@ as_tibble.epi_df <- function(x, ...) { # Decaying drops the class and metadata. `as_tibble.grouped_df` drops the # grouping and should be called by `NextMethod()` in the current design. # See #223 for discussion of alternatives. + if (attr(x, "no_decay_to_tibble") %||% FALSE) return(x) decay_epi_df(NextMethod()) } diff --git a/tests/testthat/test-as_tibble-decay.R b/tests/testthat/test-as_tibble-decay.R new file mode 100644 index 00000000..804ee358 --- /dev/null +++ b/tests/testthat/test-as_tibble-decay.R @@ -0,0 +1,6 @@ +test_that("as_tibble checks an attr to avoid decay to tibble", { + edf <- jhu_csse_daily_subset + expect_s3_class(as_tibble(edf), c("tbl_df", "tbl", "data.frame")) + attr(edf, "no_decay_to_tibble") <- TRUE + expect_s3_class(as_tibble(edf), c("epi_df", "tbl_df", "tbl", "data.frame")) +}) From 740d0ea4325753da5968e8d14989e32465436fb2 Mon Sep 17 00:00:00 2001 From: "Daniel J. 
McDonald" Date: Thu, 20 Jun 2024 13:55:24 -0700 Subject: [PATCH 292/345] bump version number --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 0c871dca..f35681f6 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: epiprocess Title: Tools for basic signal processing in epidemiology -Version: 0.7.11 +Version: 0.7.12 Authors@R: c( person("Jacob", "Bien", role = "ctb"), person("Logan", "Brooks", email = "lcbrooks@andrew.cmu.edu", role = c("aut", "cre")), From 00b8e04ae8d092b234c1e1c8585a103fedfc5c83 Mon Sep 17 00:00:00 2001 From: dajmcdon Date: Thu, 20 Jun 2024 20:59:32 +0000 Subject: [PATCH 293/345] style: styler (GHA) --- R/methods-epi_df.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/R/methods-epi_df.R b/R/methods-epi_df.R index 3f0626fe..fa881b82 100644 --- a/R/methods-epi_df.R +++ b/R/methods-epi_df.R @@ -12,7 +12,9 @@ as_tibble.epi_df <- function(x, ...) { # Decaying drops the class and metadata. `as_tibble.grouped_df` drops the # grouping and should be called by `NextMethod()` in the current design. # See #223 for discussion of alternatives. - if (attr(x, "no_decay_to_tibble") %||% FALSE) return(x) + if (attr(x, "no_decay_to_tibble") %||% FALSE) { + return(x) + } decay_epi_df(NextMethod()) } From 15c00bfd28643ed5f1dfc1749fe45571bc813ec5 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Thu, 20 Jun 2024 16:06:52 -0700 Subject: [PATCH 294/345] working on the grouped versions --- R/methods-epi_df.R | 2 +- tests/testthat/test-as_tibble-decay.R | 11 +++++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/R/methods-epi_df.R b/R/methods-epi_df.R index 3f0626fe..5c5311c4 100644 --- a/R/methods-epi_df.R +++ b/R/methods-epi_df.R @@ -12,7 +12,7 @@ as_tibble.epi_df <- function(x, ...) { # Decaying drops the class and metadata. `as_tibble.grouped_df` drops the # grouping and should be called by `NextMethod()` in the current design. # See #223 for discussion of alternatives. 
- if (attr(x, "no_decay_to_tibble") %||% FALSE) return(x) + if (attr(x, "no_decay_to_tibble") %||% FALSE) return(ungroup(x)) decay_epi_df(NextMethod()) } diff --git a/tests/testthat/test-as_tibble-decay.R b/tests/testthat/test-as_tibble-decay.R index 804ee358..867319c5 100644 --- a/tests/testthat/test-as_tibble-decay.R +++ b/tests/testthat/test-as_tibble-decay.R @@ -1,6 +1,13 @@ test_that("as_tibble checks an attr to avoid decay to tibble", { edf <- jhu_csse_daily_subset - expect_s3_class(as_tibble(edf), c("tbl_df", "tbl", "data.frame")) + expect_identical(class(as_tibble(edf)), c("tbl_df", "tbl", "data.frame")) attr(edf, "no_decay_to_tibble") <- TRUE - expect_s3_class(as_tibble(edf), c("epi_df", "tbl_df", "tbl", "data.frame")) + expect_identical(class(as_tibble(edf)), c("epi_df", "tbl_df", "tbl", "data.frame")) +}) + +test_that("as_tibble ungroups if needed", { + edf <- jhu_csse_daily_subset %>% group_by(geo_value) + expect_identical(class(as_tibble(edf)), c("tbl_df", "tbl", "data.frame")) + attr(edf, "no_decay_to_tibble") <- TRUE + expect_identical(class(as_tibble(edf)), c("epi_df", "tbl_df", "tbl", "data.frame")) }) From cee786ec23acd5569c296468e832ec80b6690258 Mon Sep 17 00:00:00 2001 From: dajmcdon Date: Thu, 20 Jun 2024 23:09:56 +0000 Subject: [PATCH 295/345] style: styler (GHA) --- R/methods-epi_df.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/R/methods-epi_df.R b/R/methods-epi_df.R index 5c5311c4..a59cc9f5 100644 --- a/R/methods-epi_df.R +++ b/R/methods-epi_df.R @@ -12,7 +12,9 @@ as_tibble.epi_df <- function(x, ...) { # Decaying drops the class and metadata. `as_tibble.grouped_df` drops the # grouping and should be called by `NextMethod()` in the current design. # See #223 for discussion of alternatives. - if (attr(x, "no_decay_to_tibble") %||% FALSE) return(ungroup(x)) + if (attr(x, "no_decay_to_tibble") %||% FALSE) { + return(ungroup(x)) + } decay_epi_df(NextMethod()) } From 9a3d9503e73fa7359206e609668eee8131b422fd Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Thu, 20 Jun 2024 17:10:35 -0700 Subject: [PATCH 296/345] pass local tests --- R/methods-epi_df.R | 4 +++- tests/testthat/test-as_tibble-decay.R | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/R/methods-epi_df.R b/R/methods-epi_df.R index a59cc9f5..9d8725da 100644 --- a/R/methods-epi_df.R +++ b/R/methods-epi_df.R @@ -13,7 +13,9 @@ as_tibble.epi_df <- function(x, ...) { # grouping and should be called by `NextMethod()` in the current design. # See #223 for discussion of alternatives. 
if (attr(x, "no_decay_to_tibble") %||% FALSE) { - return(ungroup(x)) + metadata <- attr(x, "metadata") + x <- NextMethod() + return(reclass(x, metadata)) } decay_epi_df(NextMethod()) } diff --git a/tests/testthat/test-as_tibble-decay.R b/tests/testthat/test-as_tibble-decay.R index 867319c5..b5e04c24 100644 --- a/tests/testthat/test-as_tibble-decay.R +++ b/tests/testthat/test-as_tibble-decay.R @@ -7,7 +7,9 @@ test_that("as_tibble checks an attr to avoid decay to tibble", { test_that("as_tibble ungroups if needed", { edf <- jhu_csse_daily_subset %>% group_by(geo_value) - expect_identical(class(as_tibble(edf)), c("tbl_df", "tbl", "data.frame")) + # removes the grouped_df class + expect_identical(class(as_tibble(edf)), c("tbl_df", "tbl", "data.frame")) attr(edf, "no_decay_to_tibble") <- TRUE + # removes grouped_df but not `epi_df` expect_identical(class(as_tibble(edf)), c("epi_df", "tbl_df", "tbl", "data.frame")) }) From a63a465bec8746837e24908adc0d7c32382967bf Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Thu, 20 Jun 2024 17:21:59 -0700 Subject: [PATCH 297/345] done --- R/methods-epi_df.R | 9 ++++----- tests/testthat/test-as_tibble-decay.R | 10 +++++++--- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/R/methods-epi_df.R b/R/methods-epi_df.R index 9d8725da..aa7d61d6 100644 --- a/R/methods-epi_df.R +++ b/R/methods-epi_df.R @@ -12,12 +12,11 @@ as_tibble.epi_df <- function(x, ...) { # Decaying drops the class and metadata. `as_tibble.grouped_df` drops the # grouping and should be called by `NextMethod()` in the current design. # See #223 for discussion of alternatives. - if (attr(x, "no_decay_to_tibble") %||% FALSE) { - metadata <- attr(x, "metadata") - x <- NextMethod() - return(reclass(x, metadata)) + if (attr(x, "decay_to_tibble") %||% TRUE) { + return(decay_epi_df(NextMethod())) } - decay_epi_df(NextMethod()) + metadata <- attr(x, "metadata") + reclass(NextMethod(), metadata) } #' Convert to tsibble format diff --git a/tests/testthat/test-as_tibble-decay.R b/tests/testthat/test-as_tibble-decay.R index b5e04c24..d2248a6d 100644 --- a/tests/testthat/test-as_tibble-decay.R +++ b/tests/testthat/test-as_tibble-decay.R @@ -1,15 +1,19 @@ test_that("as_tibble checks an attr to avoid decay to tibble", { edf <- jhu_csse_daily_subset expect_identical(class(as_tibble(edf)), c("tbl_df", "tbl", "data.frame")) - attr(edf, "no_decay_to_tibble") <- TRUE + attr(edf, "decay_to_tibble") <- TRUE + expect_identical(class(as_tibble(edf)), c("tbl_df", "tbl", "data.frame")) + attr(edf, "decay_to_tibble") <- FALSE expect_identical(class(as_tibble(edf)), c("epi_df", "tbl_df", "tbl", "data.frame")) }) test_that("as_tibble ungroups if needed", { edf <- jhu_csse_daily_subset %>% group_by(geo_value) # removes the grouped_df class - expect_identical(class(as_tibble(edf)), c("tbl_df", "tbl", "data.frame")) - attr(edf, "no_decay_to_tibble") <- TRUE + expect_identical(class(as_tibble(edf)), c("tbl_df", "tbl", "data.frame")) + attr(edf, "decay_to_tibble") <- TRUE + expect_identical(class(as_tibble(edf)), c("tbl_df", "tbl", "data.frame")) + attr(edf, "decay_to_tibble") <- FALSE # removes grouped_df but not `epi_df` expect_identical(class(as_tibble(edf)), c("epi_df", "tbl_df", "tbl", "data.frame")) }) From dee77ed821ad8f99c417272f29caabca6dd07580 Mon Sep 17 00:00:00 2001 From: "Logan C. 
Brooks" Date: Fri, 21 Jun 2024 04:23:11 -0700 Subject: [PATCH 298/345] Update NEWS.md with #468 (merged) and #466 (pending) --- NEWS.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/NEWS.md b/NEWS.md index 57256cd7..46c774b7 100644 --- a/NEWS.md +++ b/NEWS.md @@ -33,6 +33,11 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat - Clarified "Get started" example of getting Ebola line list data into `epi_df` format. - Improved documentation web site landing page's introduction. +- Fixed documentation referring to old `epi_slide()` interface (#466, thanks + @XuedaShen!). + +## Cleanup +- Resolved some linting messages in package checks (#468). # epiprocess 0.7.0 From d1d47c5fedc9e3a87eb586502f8df1df4d657389 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Fri, 21 Jun 2024 04:51:54 -0700 Subject: [PATCH 299/345] Add roxygen, NEWS.md for #471 --- NEWS.md | 4 ++++ R/methods-epi_df.R | 4 ++++ man/as_tibble.epi_df.Rd | 5 +++++ 3 files changed, 13 insertions(+) diff --git a/NEWS.md b/NEWS.md index 57256cd7..98ec0fe8 100644 --- a/NEWS.md +++ b/NEWS.md @@ -34,6 +34,10 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat format. - Improved documentation web site landing page's introduction. +## Cleanup +- Added optional `decay_to_tibble` attribute controlling `as_tibble()` behavior + of `epi_df`s to let `epipredict` work more easily with other libraries (#471). + # epiprocess 0.7.0 ## Breaking changes: diff --git a/R/methods-epi_df.R b/R/methods-epi_df.R index aa7d61d6..a8cfe509 100644 --- a/R/methods-epi_df.R +++ b/R/methods-epi_df.R @@ -3,6 +3,10 @@ #' Converts an `epi_df` object into a tibble, dropping metadata and any #' grouping. #' +#' Advanced: if you are working with a third-party package that uses +#' `as_tibble()` on `epi_df`s but you actually want them to remain `epi_df`s, +#' use `attr(your_epi_df, "decay_to_tibble") <- FALSE` beforehand. +#' #' @template x #' @param ... additional arguments to forward to `NextMethod()` #' diff --git a/man/as_tibble.epi_df.Rd b/man/as_tibble.epi_df.Rd index 5913a5e7..174768e5 100644 --- a/man/as_tibble.epi_df.Rd +++ b/man/as_tibble.epi_df.Rd @@ -15,3 +15,8 @@ Converts an \code{epi_df} object into a tibble, dropping metadata and any grouping. } +\details{ +Advanced: if you are working with a third-party package that uses +\code{as_tibble()} on \code{epi_df}s but you actually want them to remain \code{epi_df}s, +use \code{attr(your_epi_df, "decay_to_tibble") <- FALSE} beforehand. +} From b1c40d453a506881a5b6d7f0de0610e1b90d1ce4 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Fri, 21 Jun 2024 04:57:59 -0700 Subject: [PATCH 300/345] Braced package name convention in NEWS.md --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 98ec0fe8..b40e5432 100644 --- a/NEWS.md +++ b/NEWS.md @@ -36,7 +36,7 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat ## Cleanup - Added optional `decay_to_tibble` attribute controlling `as_tibble()` behavior - of `epi_df`s to let `epipredict` work more easily with other libraries (#471). + of `epi_df`s to let `{epipredict}` work more easily with other libraries (#471). # epiprocess 0.7.0 From 1cf296e3f1443343abe87c1cf7d49441b28bfba3 Mon Sep 17 00:00:00 2001 From: "Logan C. 
Brooks" Date: Fri, 21 Jun 2024 05:11:02 -0700 Subject: [PATCH 301/345] feat(print.epi_df): display decay_to_tibble attr if non-NULL --- R/methods-epi_df.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/R/methods-epi_df.R b/R/methods-epi_df.R index a8cfe509..cc532021 100644 --- a/R/methods-epi_df.R +++ b/R/methods-epi_df.R @@ -60,6 +60,8 @@ print.epi_df <- function(x, ...) { cat(sprintf("* %-9s = %s\n", "geo_type", attributes(x)$metadata$geo_type)) cat(sprintf("* %-9s = %s\n", "time_type", attributes(x)$metadata$time_type)) cat(sprintf("* %-9s = %s\n", "as_of", attributes(x)$metadata$as_of)) + # Conditional output (silent if attribute is NULL): + cat(sprintf("* %-9s = %s\n", "decay_to_tibble", attr(x, "decay_to_tibble"))) cat("\n") NextMethod() } From 67a10b567b4848a52842a888cfe3e4314a694a48 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Fri, 21 Jun 2024 12:13:43 -0700 Subject: [PATCH 302/345] fix(epi_slide): use expect_equal for slide tests --- tests/testthat/test-epi_slide.R | 130 ++++++++++++++++---------------- 1 file changed, 65 insertions(+), 65 deletions(-) diff --git a/tests/testthat/test-epi_slide.R b/tests/testthat/test-epi_slide.R index 8765d50c..9aa67603 100644 --- a/tests/testthat/test-epi_slide.R +++ b/tests/testthat/test-epi_slide.R @@ -128,9 +128,9 @@ test_that("Test errors/warnings for discouraged features", { ) # Results from epi_slide and epi_slide_mean should match - expect_identical(select(ref1, -slide_value_count), opt1) - expect_identical(select(ref2, -slide_value_count), opt2) - expect_identical(select(ref3, -slide_value_count), opt3) + expect_equal(select(ref1, -slide_value_count), opt1) + expect_equal(select(ref2, -slide_value_count), opt2) + expect_equal(select(ref3, -slide_value_count), opt3) }) test_that("Both `before` and `after` must be non-NA, non-negative, integer-compatible", { @@ -203,7 +203,7 @@ test_that("Both `before` and `after` must be non-NA, non-negative, integer-compa )) # Results from epi_slide and epi_slide_mean should match - expect_identical(select(ref, -slide_value_count), opt) + expect_equal(select(ref, -slide_value_count), opt) }) test_that("`ref_time_values` + `before` + `after` that result in no slide data, generate the error", { @@ -275,8 +275,8 @@ test_that("Warn user against having a blank `before`", { )) # Results from epi_slide and epi_slide_mean should match - expect_identical(select(ref1, -slide_value_count), opt1) - expect_identical(select(ref2, -slide_value_count), opt2) + expect_equal(select(ref1, -slide_value_count), opt1) + expect_equal(select(ref2, -slide_value_count), opt2) }) ## --- These cases doesn't generate the error: --- @@ -286,26 +286,26 @@ test_that( values are out of the range for every group" ), { - expect_identical( + expect_equal( epi_slide(grouped, f, before = 2L, ref_time_values = d + 200L) %>% ungroup() %>% dplyr::select("geo_value", "slide_value_value"), dplyr::tibble(geo_value = "ak", slide_value_value = 199) ) # out of range for one group - expect_identical( + expect_equal( epi_slide(grouped, f, before = 2L, ref_time_values = d + 3) %>% ungroup() %>% dplyr::select("geo_value", "slide_value_value"), dplyr::tibble(geo_value = c("ak", "al"), slide_value_value = c(2, -2)) ) # not out of range for either group - expect_identical( + expect_equal( epi_slide_mean(grouped, value, before = 2L, ref_time_values = d + 200L, na.rm = TRUE) %>% ungroup() %>% dplyr::select("geo_value", "slide_value_value"), dplyr::tibble(geo_value = "ak", slide_value_value = 199) ) # out of range for one group - expect_identical( 
+ expect_equal( epi_slide_mean(grouped, value, before = 2L, ref_time_values = d + 3, na.rm = TRUE) %>% ungroup() %>% dplyr::select("geo_value", "slide_value_value"), @@ -317,19 +317,19 @@ test_that( test_that("computation output formats x as_list_col", { # See `toy_edf` and `basic_sum_result` definitions at top of file. # We'll try 7d sum with a few formats. - expect_identical( + expect_equal( toy_edf %>% epi_slide(before = 6L, ~ sum(.x$value)), basic_sum_result ) - expect_identical( + expect_equal( toy_edf %>% epi_slide(before = 6L, ~ sum(.x$value), as_list_col = TRUE), basic_sum_result %>% dplyr::mutate(slide_value = as.list(slide_value)) ) - expect_identical( + expect_equal( toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value))), basic_sum_result %>% rename(slide_value_value = slide_value) ) - expect_identical( + expect_equal( toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)), as_list_col = TRUE), basic_sum_result %>% mutate(slide_value = purrr::map(slide_value, ~ data.frame(value = .x))) @@ -340,7 +340,7 @@ test_that("epi_slide_mean errors when `as_list_col` non-NULL", { # See `toy_edf` and `basic_mean_result` definitions at top of file. # We'll try 7d avg with a few formats. # Warning: not exactly the same naming behavior as `epi_slide`. - expect_identical( + expect_equal( toy_edf %>% filter( geo_value == "a" @@ -369,7 +369,7 @@ test_that("epi_slide_mean errors when `as_list_col` non-NULL", { }) test_that("nested dataframe output names are controllable", { - expect_identical( + expect_equal( toy_edf %>% epi_slide( before = 6L, ~ data.frame(value = sum(.x$value)), @@ -377,7 +377,7 @@ test_that("nested dataframe output names are controllable", { ), basic_sum_result %>% rename(result_value = slide_value) ) - expect_identical( + expect_equal( toy_edf %>% epi_slide( before = 6L, ~ data.frame(value_sum = sum(.x$value)), @@ -399,19 +399,19 @@ test_that("non-size-1 outputs are recycled", { dplyr::arrange(time_value) %>% as_epi_df(as_of = 100) # nolint end - expect_identical( + expect_equal( toy_edf %>% epi_slide(before = 6L, ~ sum(.x$value) + 0:1), basic_result_from_size2 ) - expect_identical( + expect_equal( toy_edf %>% epi_slide(before = 6L, ~ sum(.x$value) + 0:1, as_list_col = TRUE), basic_result_from_size2 %>% dplyr::mutate(slide_value = as.list(slide_value)) ) - expect_identical( + expect_equal( toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value) + 0:1)), basic_result_from_size2 %>% rename(slide_value_value = slide_value) ) - expect_identical( + expect_equal( toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value) + 0:1), as_list_col = TRUE), basic_result_from_size2 %>% mutate(slide_value = purrr::map(slide_value, ~ data.frame(value = .x))) @@ -444,18 +444,18 @@ test_that("`ref_time_values` + `all_rows = TRUE` works", { as_epi_df(as_of = 100) # nolint end # slide computations returning atomic vecs: - expect_identical( + expect_equal( toy_edf %>% epi_slide(before = 6L, ~ sum(.x$value)), basic_full_result ) - expect_identical( + expect_equal( toy_edf %>% epi_slide( before = 6L, ~ sum(.x$value), ref_time_values = c(2L, 8L) ), basic_full_result %>% dplyr::filter(time_value %in% c(2L, 8L)) ) - expect_identical( + expect_equal( toy_edf %>% epi_slide( before = 6L, ~ sum(.x$value), ref_time_values = c(2L, 8L), all_rows = TRUE @@ -466,7 +466,7 @@ test_that("`ref_time_values` + `all_rows = TRUE` works", { )) ) - expect_identical( + expect_equal( toy_edf %>% filter( geo_value == "a" ) %>% @@ -477,7 +477,7 @@ 
test_that("`ref_time_values` + `all_rows = TRUE` works", { basic_mean_result %>% rename(slide_value_value = slide_value) ) - expect_identical( + expect_equal( toy_edf %>% filter( geo_value == "a" ) %>% @@ -489,7 +489,7 @@ test_that("`ref_time_values` + `all_rows = TRUE` works", { filter(basic_mean_result, time_value %in% c(2L, 8L)) %>% rename(slide_value_value = slide_value) ) - expect_identical( + expect_equal( toy_edf %>% filter( geo_value == "a" ) %>% @@ -506,11 +506,11 @@ test_that("`ref_time_values` + `all_rows = TRUE` works", { ) # slide computations returning data frames: - expect_identical( + expect_equal( toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value))), basic_full_result %>% dplyr::rename(slide_value_value = slide_value) ) - expect_identical( + expect_equal( toy_edf %>% epi_slide( before = 6L, ~ data.frame(value = sum(.x$value)), ref_time_values = c(2L, 8L) @@ -519,7 +519,7 @@ test_that("`ref_time_values` + `all_rows = TRUE` works", { dplyr::filter(time_value %in% c(2L, 8L)) %>% dplyr::rename(slide_value_value = slide_value) ) - expect_identical( + expect_equal( toy_edf %>% epi_slide( before = 6L, ~ data.frame(value = sum(.x$value)), ref_time_values = c(2L, 8L), all_rows = TRUE @@ -531,7 +531,7 @@ test_that("`ref_time_values` + `all_rows = TRUE` works", { dplyr::rename(slide_value_value = slide_value) ) # slide computations returning data frames with `as_list_col=TRUE`: - expect_identical( + expect_equal( toy_edf %>% epi_slide( before = 6L, ~ data.frame(value = sum(.x$value)), as_list_col = TRUE @@ -539,7 +539,7 @@ test_that("`ref_time_values` + `all_rows = TRUE` works", { basic_full_result %>% dplyr::mutate(slide_value = purrr::map(slide_value, ~ data.frame(value = .x))) ) - expect_identical( + expect_equal( toy_edf %>% epi_slide( before = 6L, ~ data.frame(value = sum(.x$value)), ref_time_values = c(2L, 8L), @@ -549,7 +549,7 @@ test_that("`ref_time_values` + `all_rows = TRUE` works", { dplyr::mutate(slide_value = purrr::map(slide_value, ~ data.frame(value = .x))) %>% dplyr::filter(time_value %in% c(2L, 8L)) ) - expect_identical( + expect_equal( toy_edf %>% epi_slide( before = 6L, ~ data.frame(value = sum(.x$value)), ref_time_values = c(2L, 8L), all_rows = TRUE, @@ -562,7 +562,7 @@ test_that("`ref_time_values` + `all_rows = TRUE` works", { )) ) # slide computations returning data frames, `as_list_col = TRUE`, `unnest`: - expect_identical( + expect_equal( toy_edf %>% epi_slide( before = 6L, ~ data.frame(value = sum(.x$value)), as_list_col = TRUE @@ -570,7 +570,7 @@ test_that("`ref_time_values` + `all_rows = TRUE` works", { unnest(slide_value, names_sep = "_"), basic_full_result %>% dplyr::rename(slide_value_value = slide_value) ) - expect_identical( + expect_equal( toy_edf %>% epi_slide( before = 6L, ~ data.frame(value = sum(.x$value)), ref_time_values = c(2L, 8L), @@ -581,7 +581,7 @@ test_that("`ref_time_values` + `all_rows = TRUE` works", { dplyr::filter(time_value %in% c(2L, 8L)) %>% dplyr::rename(slide_value_value = slide_value) ) - expect_identical( + expect_equal( toy_edf %>% epi_slide( before = 6L, ~ data.frame(value = sum(.x$value)), ref_time_values = c(2L, 8L), all_rows = TRUE, @@ -602,7 +602,7 @@ test_that("`ref_time_values` + `all_rows = TRUE` works", { list(vctrs::vec_cast(NA, vctrs::vec_ptype_common(!!!slide_values_list))) ) } - expect_identical( + expect_equal( toy_edf %>% epi_slide( before = 6L, ~ data.frame(value = sum(.x$value)), ref_time_values = c(2L, 8L), all_rows = TRUE, @@ -637,15 +637,15 @@ test_that("basic grouped epi_slide 
computation produces expected output", { # formula result1 <- epi_slide(small_x, f = ~ sum(.x$value), before = 50) - expect_identical(result1, expected_output) + expect_equal(result1, expected_output) # function result2 <- epi_slide(small_x, f = function(x, g, t) sum(x$value), before = 50) - expect_identical(result2, expected_output) + expect_equal(result2, expected_output) # dots result3 <- epi_slide(small_x, slide_value = sum(value), before = 50) - expect_identical(result3, expected_output) + expect_equal(result3, expected_output) }) test_that("basic grouped epi_slide_mean computation produces expected output", { @@ -657,7 +657,7 @@ test_that("basic grouped epi_slide_mean computation produces expected output", { as_epi_df(as_of = d + 6) result1 <- epi_slide_mean(small_x, value, before = 50, names_sep = NULL, na.rm = TRUE) - expect_identical(result1, expected_output %>% rename(slide_value_value = slide_value)) + expect_equal(result1, expected_output %>% rename(slide_value_value = slide_value)) }) test_that("ungrouped epi_slide computation completes successfully", { @@ -684,7 +684,7 @@ test_that("basic ungrouped epi_slide computation produces expected output", { before = 50, slide_value = sum(.x$value) ) - expect_identical(result1, expected_output) + expect_equal(result1, expected_output) # Ungrouped with multiple geos expected_output <- dplyr::bind_rows( @@ -704,7 +704,7 @@ test_that("basic ungrouped epi_slide computation produces expected output", { before = 50, slide_value = sum(.x$value) ) - expect_identical(result2, expected_output) + expect_equal(result2, expected_output) }) test_that("basic ungrouped epi_slide_mean computation produces expected output", { @@ -717,7 +717,7 @@ test_that("basic ungrouped epi_slide_mean computation produces expected output", ungroup() %>% filter(geo_value == "ak") %>% epi_slide_mean(value, before = 50, names_sep = NULL, na.rm = TRUE) - expect_identical(result1, expected_output %>% rename(slide_value_value = slide_value)) + expect_equal(result1, expected_output %>% rename(slide_value_value = slide_value)) # Ungrouped with multiple geos # epi_slide_mean fails when input data groups contain duplicate time_values, @@ -742,7 +742,7 @@ test_that("epi_slide computation via formula can use ref_time_value", { before = 50 ) - expect_identical(result1, expected_output) + expect_equal(result1, expected_output) result2 <- small_x %>% epi_slide( @@ -750,7 +750,7 @@ test_that("epi_slide computation via formula can use ref_time_value", { before = 50 ) - expect_identical(result2, expected_output) + expect_equal(result2, expected_output) result3 <- small_x %>% epi_slide( @@ -758,7 +758,7 @@ test_that("epi_slide computation via formula can use ref_time_value", { before = 50 ) - expect_identical(result3, expected_output) + expect_equal(result3, expected_output) # Ungrouped with multiple geos expected_output <- dplyr::bind_rows( @@ -774,7 +774,7 @@ test_that("epi_slide computation via formula can use ref_time_value", { f = ~.ref_time_value, before = 50 ) - expect_identical(result4, expected_output) + expect_equal(result4, expected_output) }) test_that("epi_slide computation via function can use ref_time_value", { @@ -791,7 +791,7 @@ test_that("epi_slide computation via function can use ref_time_value", { before = 2 ) - expect_identical(result1, expected_output) + expect_equal(result1, expected_output) }) test_that("epi_slide computation via dots can use ref_time_value and group", { @@ -809,7 +809,7 @@ test_that("epi_slide computation via dots can use ref_time_value and 
group", { slide_value = .ref_time_value ) - expect_identical(result1, expected_output) + expect_equal(result1, expected_output) # `.{x,group_key,ref_time_value}` should be inaccessible from `.data` and # `.env`. @@ -834,7 +834,7 @@ test_that("epi_slide computation via dots can use ref_time_value and group", { slide_value = .group_key$geo_value ) - expect_identical(result3, expected_output) + expect_equal(result3, expected_output) # Use entire group_key object expected_output <- dplyr::bind_rows( @@ -850,7 +850,7 @@ test_that("epi_slide computation via dots can use ref_time_value and group", { slide_value = nrow(.group_key) ) - expect_identical(result4, expected_output) + expect_equal(result4, expected_output) # Ungrouped with multiple geos expected_output <- dplyr::bind_rows( @@ -866,7 +866,7 @@ test_that("epi_slide computation via dots can use ref_time_value and group", { before = 50, slide_value = .ref_time_value ) - expect_identical(result5, expected_output) + expect_equal(result5, expected_output) }) test_that("epi_slide computation via dots outputs the same result using col names and the data var", { @@ -883,7 +883,7 @@ test_that("epi_slide computation via dots outputs the same result using col name slide_value = max(.x$time_value) ) - expect_identical(result1, expected_output) + expect_equal(result1, expected_output) result2 <- small_x %>% epi_slide( @@ -891,7 +891,7 @@ test_that("epi_slide computation via dots outputs the same result using col name slide_value = max(.data$time_value) ) - expect_identical(result2, expected_output) + expect_equal(result2, expected_output) }) test_that("`epi_slide` can access objects inside of helper functions", { @@ -920,10 +920,10 @@ test_that("basic slide behavior is correct when groups have non-overlapping date as_epi_df(as_of = d + 6) result1 <- epi_slide(small_x_misaligned_dates, f = ~ mean(.x$value), before = 50) - expect_identical(result1, expected_output) + expect_equal(result1, expected_output) result2 <- epi_slide_mean(small_x_misaligned_dates, value, before = 50, names_sep = NULL, na.rm = TRUE) - expect_identical(result2, expected_output %>% rename(slide_value_value = slide_value)) + expect_equal(result2, expected_output %>% rename(slide_value_value = slide_value)) }) @@ -948,7 +948,7 @@ test_that("epi_slide gets correct ref_time_value when groups have non-overlappin slide_value = .ref_time_value ) - expect_identical(result1, expected_output) + expect_equal(result1, expected_output) }) test_that("results for different `before`s and `after`s match between epi_slide and epi_slide_mean", { @@ -979,7 +979,7 @@ test_that("results for different `before`s and `after`s match between epi_slide col_names = c(a, b), na.rm = TRUE, before = before, after = after, ... ) - expect_identical(result1, result2) + expect_equal(result1, result2) } set.seed(0) @@ -1094,11 +1094,11 @@ test_that("results for different time_types match between epi_slide and epi_slid col_names = c(a, b), na.rm = TRUE, before = before, after = after, ... 
) - expect_identical(result1, result2) + expect_equal(result1, result2) # All fields except dates - expect_identical(select(ref_result, -time_value), select(result1, -time_value)) - expect_identical(select(ref_result, -time_value), select(result2, -time_value)) + expect_equal(select(ref_result, -time_value), select(result1, -time_value)) + expect_equal(select(ref_result, -time_value), select(result2, -time_value)) } test_time_type_mean(days) @@ -1118,7 +1118,7 @@ test_that("results for different time_types match between epi_slide and epi_slid col_names = c(a, b), na.rm = TRUE, before = 6L, after = 0L ) - expect_identical(select(ref_result, -time_value), select(result2, -time_value)) + expect_equal(select(ref_result, -time_value), select(result2, -time_value)) }) test_that("special time_types without time_step fail in epi_slide_mean", { @@ -1381,7 +1381,7 @@ test_that("epi_slide_mean produces same output as epi_slide_opt", { f = data.table::frollmean, before = 50, names_sep = NULL, na.rm = TRUE ) - expect_identical(result1, result2) + expect_equal(result1, result2) result3 <- epi_slide_opt(small_x, value, f = slider::slide_mean, @@ -1396,7 +1396,7 @@ test_that("epi_slide_sum produces same output as epi_slide_opt", { f = data.table::frollsum, before = 50, names_sep = NULL, na.rm = TRUE ) - expect_identical(result1, result2) + expect_equal(result1, result2) result3 <- epi_slide_opt(small_x, value, f = slider::slide_sum, From cc606affcf128df2a57ede8fc481c2d3ab6aa371 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Tue, 9 Jul 2024 13:43:42 -0500 Subject: [PATCH 303/345] test grouping only before tsibble 1.1.5 --- tests/testthat/test-as_tibble-decay.R | 1 + tests/testthat/test-methods-epi_df.R | 12 +++++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/tests/testthat/test-as_tibble-decay.R b/tests/testthat/test-as_tibble-decay.R index d2248a6d..5a38f9b9 100644 --- a/tests/testthat/test-as_tibble-decay.R +++ b/tests/testthat/test-as_tibble-decay.R @@ -8,6 +8,7 @@ test_that("as_tibble checks an attr to avoid decay to tibble", { }) test_that("as_tibble ungroups if needed", { + skip_if(packageVersion("tsibble") > "1.1.4") edf <- jhu_csse_daily_subset %>% group_by(geo_value) # removes the grouped_df class expect_identical(class(as_tibble(edf)), c("tbl_df", "tbl", "data.frame")) diff --git a/tests/testthat/test-methods-epi_df.R b/tests/testthat/test-methods-epi_df.R index b071d3ec..14924c22 100644 --- a/tests/testthat/test-methods-epi_df.R +++ b/tests/testthat/test-methods-epi_df.R @@ -121,7 +121,17 @@ test_that("Correct metadata when subset includes some of other_keys", { # Including both original other_keys was already tested above }) -test_that("Metadata and grouping are dropped by `as_tibble`", { +test_that("Metadata is dropped by `as_tibble`", { + grouped_converted <- toy_epi_df %>% + group_by(geo_value) %>% + as_tibble() + expect_true( + !any(c("metadata") %in% names(attributes(grouped_converted))) + ) +}) + +test_that("Grouping are dropped by `as_tibble`", { + skip_if(packageVersion("tsibble") > "1.1.4") grouped_converted <- toy_epi_df %>% group_by(geo_value) %>% as_tibble() From 25c9bbcdbc500d5c95ce605bb747ee0c7f439822 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Tue, 9 Jul 2024 14:06:19 -0500 Subject: [PATCH 304/345] documenting this change --- tests/testthat/test-as_tibble-decay.R | 1 + tests/testthat/test-methods-epi_df.R | 1 + 2 files changed, 2 insertions(+) diff --git a/tests/testthat/test-as_tibble-decay.R b/tests/testthat/test-as_tibble-decay.R index 
5a38f9b9..488ace63 100644 --- a/tests/testthat/test-as_tibble-decay.R +++ b/tests/testthat/test-as_tibble-decay.R @@ -8,6 +8,7 @@ test_that("as_tibble checks an attr to avoid decay to tibble", { }) test_that("as_tibble ungroups if needed", { + # tsibble is doing some method piracy, and overwriting as_tibble.grouped_df as of 1.1.5 skip_if(packageVersion("tsibble") > "1.1.4") edf <- jhu_csse_daily_subset %>% group_by(geo_value) # removes the grouped_df class diff --git a/tests/testthat/test-methods-epi_df.R b/tests/testthat/test-methods-epi_df.R index 14924c22..5ba66ed2 100644 --- a/tests/testthat/test-methods-epi_df.R +++ b/tests/testthat/test-methods-epi_df.R @@ -131,6 +131,7 @@ test_that("Metadata is dropped by `as_tibble`", { }) test_that("Grouping are dropped by `as_tibble`", { + # tsibble is doing some method piracy, and overwriting as_tibble.grouped_df as of 1.1.5 skip_if(packageVersion("tsibble") > "1.1.4") grouped_converted <- toy_epi_df %>% group_by(geo_value) %>% From 5a06e0d1f5d2302980d97c90f886d59194146d47 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 20 Jun 2024 16:18:33 -0400 Subject: [PATCH 305/345] ignore man-roxygen dir --- .Rbuildignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.Rbuildignore b/.Rbuildignore index a28a0185..0582014a 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -14,4 +14,5 @@ ^Meta$ ^.git-blame-ignore-revs$ ^.lintr$ -^DEVELOPMENT.md$ \ No newline at end of file +^DEVELOPMENT.md$ +man-roxygen From 16f56508a7a1aa99198f19dabea9aca506a7a619 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 20 Jun 2024 16:42:24 -0400 Subject: [PATCH 306/345] specify import packages --- NAMESPACE | 1 + R/archive.R | 1 + R/utils.R | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/NAMESPACE b/NAMESPACE index 1362b15c..f3ef030a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -188,4 +188,5 @@ importFrom(tidyr,unnest) importFrom(tidyselect,eval_select) importFrom(tidyselect,starts_with) importFrom(tsibble,as_tsibble) +importFrom(utils,capture.output) importFrom(utils,tail) diff --git a/R/archive.R b/R/archive.R index 14918678..464d68ef 100644 --- a/R/archive.R +++ b/R/archive.R @@ -251,6 +251,7 @@ NULL #' #' @importFrom data.table as.data.table key setkeyv #' @importFrom dplyr if_any if_all everything +#' @importFrom utils capture.output #' #' @name epi_archive #' @export diff --git a/R/utils.R b/R/utils.R index a7f7649f..fc228e10 100644 --- a/R/utils.R +++ b/R/utils.R @@ -536,7 +536,7 @@ deprecated_quo_is_present <- function(quo) { FALSE } else { quo_expr <- rlang::get_expr(quo) - if (identical(quo_expr, rlang::expr(deprecated())) || identical(quo_expr, rlang::expr(lifecycle::deprecated()))) { # nolint: object_usage_linter + if (identical(quo_expr, rlang::expr(lifecycle::deprecated())) || identical(quo_expr, rlang::expr(lifecycle::deprecated()))) { # nolint: object_usage_linter FALSE } else { TRUE From fece100e799e8ec14e4f254b47b025eb195b250a Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 20 Jun 2024 16:49:18 -0400 Subject: [PATCH 307/345] add more global vars to track dplyr col names --- R/epiprocess.R | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/R/epiprocess.R b/R/epiprocess.R index dd7df87a..c949528e 100644 --- a/R/epiprocess.R +++ b/R/epiprocess.R @@ -12,4 +12,7 @@ #' @importFrom rlang %||% #' @name epiprocess "_PACKAGE" -utils::globalVariables(c(".x", 
".group_key", ".ref_time_value")) +utils::globalVariables(c( + ".x", ".group_key", ".ref_time_value", "resid", + "fitted", ".response", "geo_value", "time_value" +)) From 2e875c64a8b5c83513486c5711880b639917af5b Mon Sep 17 00:00:00 2001 From: nmdefries Date: Thu, 20 Jun 2024 20:51:24 +0000 Subject: [PATCH 308/345] style: styler (GHA) --- R/epiprocess.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/epiprocess.R b/R/epiprocess.R index c949528e..40c3ce8a 100644 --- a/R/epiprocess.R +++ b/R/epiprocess.R @@ -13,6 +13,6 @@ #' @name epiprocess "_PACKAGE" utils::globalVariables(c( - ".x", ".group_key", ".ref_time_value", "resid", - "fitted", ".response", "geo_value", "time_value" + ".x", ".group_key", ".ref_time_value", "resid", + "fitted", ".response", "geo_value", "time_value" )) From 6edc3829d27346ef1099d6e57f11864d47e8ffa0 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Mon, 8 Jul 2024 15:29:34 -0400 Subject: [PATCH 309/345] import deprecated --- NAMESPACE | 1 + R/utils.R | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/NAMESPACE b/NAMESPACE index f3ef030a..b4cafc83 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -139,6 +139,7 @@ importFrom(dplyr,slice) importFrom(dplyr,tibble) importFrom(dplyr,ungroup) importFrom(ggplot2,autoplot) +importFrom(lifecycle,deprecated) importFrom(lubridate,as.period) importFrom(lubridate,days) importFrom(lubridate,weeks) diff --git a/R/utils.R b/R/utils.R index fc228e10..b2098e9b 100644 --- a/R/utils.R +++ b/R/utils.R @@ -526,6 +526,8 @@ list2var <- function(x) { #' bad_wrapper1 <- function(x) fn(x) #' bad_wrapper1() # TRUE, bad #' +#' @importFrom lifecycle deprecated +#' #' @noRd deprecated_quo_is_present <- function(quo) { if (!rlang::is_quosure(quo)) { @@ -536,7 +538,7 @@ deprecated_quo_is_present <- function(quo) { FALSE } else { quo_expr <- rlang::get_expr(quo) - if (identical(quo_expr, rlang::expr(lifecycle::deprecated())) || identical(quo_expr, rlang::expr(lifecycle::deprecated()))) { # nolint: object_usage_linter + if (identical(quo_expr, rlang::expr(deprecated())) || identical(quo_expr, rlang::expr(lifecycle::deprecated()))) { # nolint: object_usage_linter FALSE } else { TRUE From ebfba0d67b3397a9284d6219938e85fd930a3b4e Mon Sep 17 00:00:00 2001 From: nmdefries Date: Mon, 8 Jul 2024 19:31:25 +0000 Subject: [PATCH 310/345] docs: document (GHA) --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index f35681f6..c9a3f589 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -66,7 +66,7 @@ Config/testthat/edition: 3 Encoding: UTF-8 LazyData: true Roxygen: list(markdown = TRUE) -RoxygenNote: 7.3.1 +RoxygenNote: 7.3.2 Depends: R (>= 2.10) URL: https://cmu-delphi.github.io/epiprocess/ From 1dea4a91939247601da6d26fc84729886269c48a Mon Sep 17 00:00:00 2001 From: "Daniel J. 
McDonald" Date: Thu, 13 Jun 2024 14:34:56 -0700 Subject: [PATCH 311/345] remove fabletools/feasts imports --- DESCRIPTION | 3 --- R/outliers.R | 48 ++++++++++++++++++++--------------------- man/detect_outlr_stl.Rd | 3 ++- 3 files changed, 26 insertions(+), 28 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index c9a3f589..ac188dc9 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -30,9 +30,6 @@ Imports: cli, data.table, dplyr (>= 1.0.0), - fabletools, - feasts, - generics, genlasso, ggplot2, lifecycle (>= 1.0.1), diff --git a/R/outliers.R b/R/outliers.R index ab4f0e8e..a8ca2e97 100644 --- a/R/outliers.R +++ b/R/outliers.R @@ -222,6 +222,7 @@ detect_outlr_rm <- function(x = seq_along(y), y, n = 21, #' @param seasonal_period Integer specifying period of seasonality. For example, #' for daily data, a period 7 means weekly seasonality. The default is `NULL`, #' meaning that no seasonal term will be included in the STL decomposition. +#' If specified, it must be strictly larger than 1. #' @template outlier-detection-options #' @template detect-outlr-return #' @@ -258,6 +259,9 @@ detect_outlr_stl <- function(x = seq_along(y), y, detection_multiplier = 2, min_radius = 0, replacement_multiplier = 0) { + if (dplyr::n_distinct(x) != length(y)) { + cli_abort("`x` contains duplicate values. (If being run on a column in an `epi_df`, did you group by relevant key variables?)") + } # Transform if requested if (log_transform) { # Replace all negative values with 0 @@ -266,32 +270,26 @@ detect_outlr_stl <- function(x = seq_along(y), y, y <- log(y + offset) } - # Make a tsibble for fabletools, setup and run STL - z_tsibble <- tsibble::tsibble(x = x, y = y, index = x) - - stl_formula <- y ~ trend(window = n_trend) + - season(period = seasonal_period, window = n_seasonal) + if (is.null(seasonal_period)) { + freq <- 7L + } else { + if (seasonal_period == 1L) cli_abort("`seasonal_period` must be `NULL` or > 1.") + freq <- seasonal_period + } - stl_components <- z_tsibble %>% - fabletools::model(feasts::STL(stl_formula, robust = TRUE)) %>% - generics::components() %>% + yts <- stats::ts(y, frequency = freq) + stl_comp <- stats::stl(yts, + t.window = n_trend, s.window = n_seasonal, + robust = TRUE + )$time.series %>% tibble::as_tibble() %>% - dplyr::select(.data$trend:.data$remainder) %>% # - dplyr::rename_with(~"seasonal", tidyselect::starts_with("season")) %>% dplyr::rename(resid = .data$remainder) # Allocate the seasonal term from STL to either fitted or resid if (!is.null(seasonal_period)) { - stl_components <- stl_components %>% - dplyr::mutate( - fitted = .data$trend + .data$seasonal - ) + stl_comp <- dplyr::mutate(stl_comp, fitted = .data$trend + .data$seasonal) } else { - stl_components <- stl_components %>% - dplyr::mutate( - fitted = .data$trend, - resid = .data$seasonal + resid - ) + stl_comp <- dplyr::mutate(stl_comp, fitted = .data$trend, resid = .data$seasonal + .data$resid) } # Detect negatives if requested @@ -306,10 +304,7 @@ detect_outlr_stl <- function(x = seq_along(y), y, # Calculate lower and upper thresholds and replacement value z <- z %>% - dplyr::mutate( - fitted = stl_components$fitted, - resid = stl_components$resid - ) %>% + dplyr::mutate(fitted = stl_comp$fitted, resid = stl_comp$resid) %>% roll_iqr( n = n_threshold, detection_multiplier = detection_multiplier, @@ -337,7 +332,12 @@ roll_iqr <- function(z, n, detection_multiplier, min_radius, as_type <- as.numeric } - epi_slide(z, roll_iqr = stats::IQR(resid), before = floor((n - 1) / 2), after = ceiling((n - 1) / 2)) %>% + z %>% + 
epi_slide( + roll_iqr = stats::IQR(resid), + before = floor((n - 1) / 2), + after = ceiling((n - 1) / 2) + ) %>% dplyr::mutate( lower = pmax( min_lower, diff --git a/man/detect_outlr_stl.Rd b/man/detect_outlr_stl.Rd index 2b518451..9bb6b971 100644 --- a/man/detect_outlr_stl.Rd +++ b/man/detect_outlr_stl.Rd @@ -36,7 +36,8 @@ outlier thresholds.} \item{seasonal_period}{Integer specifying period of seasonality. For example, for daily data, a period 7 means weekly seasonality. The default is \code{NULL}, -meaning that no seasonal term will be included in the STL decomposition.} +meaning that no seasonal term will be included in the STL decomposition. +If specified, it must be strictly larger than 1.} \item{log_transform}{Should a log transform be applied before running outlier detection? Default is \code{FALSE}. If \code{TRUE}, and zeros are present, then the From 195c787de92f113823576d3c716cb0b39cd68eee Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Mon, 17 Jun 2024 15:42:09 -0700 Subject: [PATCH 312/345] Separate `seasonal_period` into 2 args, correct docs, more validation --- In `detect_outlr_stl()`: --- It appears there's simple way to turn off the seasonal component of STL; simply considering the seasonal component as part of the residual will do different things based on frequency/seasonal/low-pass args to `stats::stl()`. So, instead of `seasonal_period = NULL` doing the seasonal + residual thing for some fixed magic values of the other args, instead split it into `seasonal_period` and `seasonal_as_residual` to allow the user more control. Update docs and vignettes accordingly. (Plus remove reference to no-longer-used `feasts`.) More validation: - More validation on `seasonal_*` args. - Try to detect gaps in `x` values and complain (we can't simply fix with `complete()`; NAs aren't allowed). - Order `x` and `y` according to `x`. --- R/outliers.R | 53 ++++++++++++++++++++++++++++------------- man/detect_outlr.Rd | 5 ++-- man/detect_outlr_stl.Rd | 24 +++++++++++++------ vignettes/outliers.Rmd | 10 ++++---- 4 files changed, 62 insertions(+), 30 deletions(-) diff --git a/R/outliers.R b/R/outliers.R index a8ca2e97..aa34f31f 100644 --- a/R/outliers.R +++ b/R/outliers.R @@ -64,9 +64,10 @@ #' args = list(list( #' detect_negatives = TRUE, #' detection_multiplier = 2.5, -#' seasonal_period = NULL +#' seasonal_period = 7, +#' seasonal_as_residual = TRUE #' )), -#' abbr = "stl_nonseasonal" +#' abbr = "stl_reseasonal" #' ) #' ) #' @@ -216,19 +217,28 @@ detect_outlr_rm <- function(x = seq_along(y), y, n = 21, #' @param n_trend Number of time steps to use in the rolling window for trend. #' Default is 21. #' @param n_seasonal Number of time steps to use in the rolling window for -#' seasonality. Default is 21. +#' seasonality. Default is 21. Can also be the string "periodic". See +#' `s.window` in [`stats::stl`]. #' @param n_threshold Number of time steps to use in rolling window for the IQR #' outlier thresholds. -#' @param seasonal_period Integer specifying period of seasonality. For example, -#' for daily data, a period 7 means weekly seasonality. The default is `NULL`, -#' meaning that no seasonal term will be included in the STL decomposition. -#' If specified, it must be strictly larger than 1. +#' @param seasonal_period Integer specifying period of "seasonality". For +#' example, for daily data, a period 7 means weekly seasonality. It must be +#' strictly larger than 1. Also impacts the size of the low-pass filter +#' window; see `l.window` in [`stats::stl`]. 
+#' @param seasonal_as_residual Boolean specifying whether the seasonal(/weekly) +#' component should be treated as part of the residual component instead of as +#' part of the predictions. The default, FALSE, treats them as part of the +#' predictions, so large seasonal(/weekly) components will not lead to +#' flagging points as outliers. `TRUE` may instead consider the extrema of +#' large seasonal variations to be outliers; `n_seasonal` and +#' `seasonal_period` will still have an impact on the result, though, by +#' impacting the estimation of the trend component. #' @template outlier-detection-options #' @template detect-outlr-return #' -#' @details The STL decomposition is computed using the `feasts` package. Once +#' @details The STL decomposition is computed using the [`stats::stl()`]. Once #' computed, the outlier detection method is analogous to the rolling median -#' method in `detect_outlr_rm()`, except with the fitted values and residuals +#' method in [`detect_outlr_rm()`], except with the fitted values and residuals #' from the STL decomposition taking the place of the rolling median and #' residuals to the rolling median, respectively. #' @@ -262,6 +272,19 @@ detect_outlr_stl <- function(x = seq_along(y), y, if (dplyr::n_distinct(x) != length(y)) { cli_abort("`x` contains duplicate values. (If being run on a column in an `epi_df`, did you group by relevant key variables?)") } + if (length(y) <= 1L) { + cli_abort("`y` has length {length(y)}; that's definitely too little for STL. (If being run in a `mutate()` or `epi_slide()`, check whether you grouped by too many variables; you should not be grouping by `time_value` in particular.)") + } + distinct_x_skips <- unique(diff(x)) + if (diff(range(distinct_x_skips)) > 1e-4 * mean(distinct_x_skips)) { + cli_abort("`x` does not appear to have regular spacing; consider filling in gaps with imputed values (STL does not allow NAs).") + } + if (is.unsorted(x)) { # <- for performance in common (sorted) case + o <- order(x) + x <- x[o] + y <- y[o] + } + # Transform if requested if (log_transform) { # Replace all negative values with 0 @@ -270,14 +293,10 @@ detect_outlr_stl <- function(x = seq_along(y), y, y <- log(y + offset) } - if (is.null(seasonal_period)) { - freq <- 7L - } else { - if (seasonal_period == 1L) cli_abort("`seasonal_period` must be `NULL` or > 1.") - freq <- seasonal_period - } + assert_int(seasonal_period, len = 1L, lower = 2L) + assert_logical(seasonal_as_residual, len = 1L, any.missing = FALSE) - yts <- stats::ts(y, frequency = freq) + yts <- stats::ts(y, frequency = seasonal_period) stl_comp <- stats::stl(yts, t.window = n_trend, s.window = n_seasonal, robust = TRUE @@ -286,7 +305,7 @@ detect_outlr_stl <- function(x = seq_along(y), y, dplyr::rename(resid = .data$remainder) # Allocate the seasonal term from STL to either fitted or resid - if (!is.null(seasonal_period)) { + if (!seasonal_as_residual) { stl_comp <- dplyr::mutate(stl_comp, fitted = .data$trend + .data$seasonal) } else { stl_comp <- dplyr::mutate(stl_comp, fitted = .data$trend, resid = .data$seasonal + .data$resid) diff --git a/man/detect_outlr.Rd b/man/detect_outlr.Rd index 4263a64b..3ac08585 100644 --- a/man/detect_outlr.Rd +++ b/man/detect_outlr.Rd @@ -87,9 +87,10 @@ detection_methods <- dplyr::bind_rows( args = list(list( detect_negatives = TRUE, detection_multiplier = 2.5, - seasonal_period = NULL + seasonal_period = 7, + seasonal_as_residual = TRUE )), - abbr = "stl_nonseasonal" + abbr = "stl_reseasonal" ) ) diff --git a/man/detect_outlr_stl.Rd 
b/man/detect_outlr_stl.Rd index 9bb6b971..bd178d28 100644 --- a/man/detect_outlr_stl.Rd +++ b/man/detect_outlr_stl.Rd @@ -29,15 +29,16 @@ detect_outlr_stl( Default is 21.} \item{n_seasonal}{Number of time steps to use in the rolling window for -seasonality. Default is 21.} +seasonality. Default is 21. Can also be the string "periodic". See +\code{s.window} in \code{\link[stats:stl]{stats::stl}}.} \item{n_threshold}{Number of time steps to use in rolling window for the IQR outlier thresholds.} -\item{seasonal_period}{Integer specifying period of seasonality. For example, -for daily data, a period 7 means weekly seasonality. The default is \code{NULL}, -meaning that no seasonal term will be included in the STL decomposition. -If specified, it must be strictly larger than 1.} +\item{seasonal_period}{Integer specifying period of "seasonality". For +example, for daily data, a period 7 means weekly seasonality. It must be +strictly larger than 1. Also impacts the size of the low-pass filter +window; see \code{l.window} in \code{\link[stats:stl]{stats::stl}}.} \item{log_transform}{Should a log transform be applied before running outlier detection? Default is \code{FALSE}. If \code{TRUE}, and zeros are present, then the @@ -58,6 +59,15 @@ values are from the rolling median. The replacement is the original value if it is within the detection thresholds, or otherwise it is rounded to the nearest (rolling median) +/- (replacement multiplier) * (rolling IQR). Default is 0.} + +\item{seasonal_as_residual}{Boolean specifying whether the seasonal(/weekly) +component should be treated as part of the residual component instead of as +part of the predictions. The default, FALSE, treats them as part of the +predictions, so large seasonal(/weekly) components will not lead to +flagging points as outliers. \code{TRUE} may instead consider the extrema of +large seasonal variations to be outliers; \code{n_seasonal} and +\code{seasonal_period} will still have an impact on the result, though, by +impacting the estimation of the trend component.} } \value{ An tibble with number of rows equal to \code{length(y)} and columns @@ -68,9 +78,9 @@ replacement values from each detection method (\code{replacement}). Detects outliers based on a seasonal-trend decomposition using LOESS (STL). } \details{ -The STL decomposition is computed using the \code{feasts} package. Once +The STL decomposition is computed using the \code{\link[stats:stl]{stats::stl()}}. Once computed, the outlier detection method is analogous to the rolling median -method in \code{detect_outlr_rm()}, except with the fitted values and residuals +method in \code{\link[=detect_outlr_rm]{detect_outlr_rm()}}, except with the fitted values and residuals from the STL decomposition taking the place of the rolling median and residuals to the rolling median, respectively. diff --git a/vignettes/outliers.Rmd b/vignettes/outliers.Rmd index 4d9d4da8..ea3c30ac 100644 --- a/vignettes/outliers.Rmd +++ b/vignettes/outliers.Rmd @@ -74,8 +74,9 @@ methods. 2. Detection based on a seasonal-trend decomposition using LOESS (STL), using `detect_outlr_stl()`, which is similar to the rolling median method but replaces the rolling median with fitted values from STL. -3. Detection based on an STL decomposition, but without seasonality term, which - amounts to smoothing using LOESS. +3. Detection based on an STL decomposition, but subtracting out the seasonality + term from its predictions, which may result in the extrema of large seasonal + variations being considered as outliers. 
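To make the third option above concrete, here is a rough, self-contained sketch (toy data and made-up variable names; not the package's exact internals) of how a single STL fit can serve both modes — keeping the seasonal component in the fitted values, or folding it into the residual as the new `seasonal_as_residual` switch does:

```r
set.seed(1)
y <- 10 + 3 * sin(2 * pi * seq_len(140) / 7) + rnorm(140) # weekly pattern
y[70] <- y[70] + 25 # one injected outlier

stl_fit <- stats::stl(stats::ts(y, frequency = 7),
  t.window = 21, s.window = 21, robust = TRUE
)
comp <- tibble::as_tibble(stl_fit$time.series) # seasonal, trend, remainder

# seasonal_as_residual = FALSE: seasonal swings are part of the fit
fitted_a <- comp$trend + comp$seasonal
resid_a <- comp$remainder
# seasonal_as_residual = TRUE: seasonal swings must survive outlier checks
fitted_b <- comp$trend
resid_b <- comp$seasonal + comp$remainder
# Either way, thresholds then come from a rolling IQR of the residuals.
```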
The outlier detection methods are specified using a `tibble` that is passed to `detect_outlr()`, with one row per method, and whose columms specify the @@ -108,9 +109,10 @@ detection_methods <- bind_rows( args = list(list( detect_negatives = TRUE, detection_multiplier = 2.5, - seasonal_period = NULL + seasonal_period = 7, + seasonal_as_residual = TRUE )), - abbr = "stl_nonseasonal" + abbr = "stl_reseasonal" ) ) From c51f116336db5cc8ee22feeaf18f7c0c3fe03940 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Mon, 17 Jun 2024 15:58:35 -0700 Subject: [PATCH 313/345] Add NEWS.md entry, bump package version --- NEWS.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/NEWS.md b/NEWS.md index 382cbf19..ed9dea40 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,6 +4,11 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat # epiprocess 0.8 +## Breaking changes +- `detect_outlr_stl(seasonal_period = NULL)` is no longer accepted. Use + `detect_outlr_stl(seasonal_period = , seasonal_as_residual = TRUE)` + instead. See `?detect_outlr_stl` for more details. + ## Improvements - `epi_slide` computations are now 2-4 times faster after changing how @@ -43,6 +48,9 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat - Added optional `decay_to_tibble` attribute controlling `as_tibble()` behavior of `epi_df`s to let `{epipredict}` work more easily with other libraries (#471). +## Cleanup +- Removed some external package dependencies. + # epiprocess 0.7.0 ## Breaking changes: From 017f18139644ea31398e6aedb6bbfe54bde0875b Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Tue, 18 Jun 2024 09:41:22 -0700 Subject: [PATCH 314/345] fix (outliers.R): len = 1L is already baked into assert_int --- R/outliers.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/outliers.R b/R/outliers.R index aa34f31f..337d9359 100644 --- a/R/outliers.R +++ b/R/outliers.R @@ -293,7 +293,7 @@ detect_outlr_stl <- function(x = seq_along(y), y, y <- log(y + offset) } - assert_int(seasonal_period, len = 1L, lower = 2L) + assert_int(seasonal_period, lower = 2L) assert_logical(seasonal_as_residual, len = 1L, any.missing = FALSE) yts <- stats::ts(y, frequency = seasonal_period) From 5fd98ce02f28a1e1a354c326a0ee393555a273b0 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Tue, 18 Jun 2024 13:30:58 -0700 Subject: [PATCH 315/345] fix (detect_outlr_stl): make intended fn signature changes --- R/outliers.R | 3 ++- man/detect_outlr_stl.Rd | 21 +++++++++++---------- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/R/outliers.R b/R/outliers.R index 337d9359..bb4ab56e 100644 --- a/R/outliers.R +++ b/R/outliers.R @@ -263,7 +263,8 @@ detect_outlr_stl <- function(x = seq_along(y), y, n_trend = 21, n_seasonal = 21, n_threshold = 21, - seasonal_period = NULL, + seasonal_period, + seasonal_as_residual = FALSE, log_transform = FALSE, detect_negatives = FALSE, detection_multiplier = 2, diff --git a/man/detect_outlr_stl.Rd b/man/detect_outlr_stl.Rd index bd178d28..ba732dab 100644 --- a/man/detect_outlr_stl.Rd +++ b/man/detect_outlr_stl.Rd @@ -10,7 +10,8 @@ detect_outlr_stl( n_trend = 21, n_seasonal = 21, n_threshold = 21, - seasonal_period = NULL, + seasonal_period, + seasonal_as_residual = FALSE, log_transform = FALSE, detect_negatives = FALSE, detection_multiplier = 2, @@ -40,6 +41,15 @@ example, for daily data, a period 7 means weekly seasonality. It must be strictly larger than 1. 
Also impacts the size of the low-pass filter window; see \code{l.window} in \code{\link[stats:stl]{stats::stl}}.} +\item{seasonal_as_residual}{Boolean specifying whether the seasonal(/weekly) +component should be treated as part of the residual component instead of as +part of the predictions. The default, FALSE, treats them as part of the +predictions, so large seasonal(/weekly) components will not lead to +flagging points as outliers. \code{TRUE} may instead consider the extrema of +large seasonal variations to be outliers; \code{n_seasonal} and +\code{seasonal_period} will still have an impact on the result, though, by +impacting the estimation of the trend component.} + \item{log_transform}{Should a log transform be applied before running outlier detection? Default is \code{FALSE}. If \code{TRUE}, and zeros are present, then the log transform will be padded by 1.} @@ -59,15 +69,6 @@ values are from the rolling median. The replacement is the original value if it is within the detection thresholds, or otherwise it is rounded to the nearest (rolling median) +/- (replacement multiplier) * (rolling IQR). Default is 0.} - -\item{seasonal_as_residual}{Boolean specifying whether the seasonal(/weekly) -component should be treated as part of the residual component instead of as -part of the predictions. The default, FALSE, treats them as part of the -predictions, so large seasonal(/weekly) components will not lead to -flagging points as outliers. \code{TRUE} may instead consider the extrema of -large seasonal variations to be outliers; \code{n_seasonal} and -\code{seasonal_period} will still have an impact on the result, though, by -impacting the estimation of the trend component.} } \value{ An tibble with number of rows equal to \code{length(y)} and columns From 537056c4c492f67939f3862f5284417bacc86bae Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Thu, 20 Jun 2024 16:14:42 -0700 Subject: [PATCH 316/345] Address line length lints --- R/outliers.R | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/R/outliers.R b/R/outliers.R index bb4ab56e..3cc120a0 100644 --- a/R/outliers.R +++ b/R/outliers.R @@ -271,10 +271,14 @@ detect_outlr_stl <- function(x = seq_along(y), y, min_radius = 0, replacement_multiplier = 0) { if (dplyr::n_distinct(x) != length(y)) { - cli_abort("`x` contains duplicate values. (If being run on a column in an `epi_df`, did you group by relevant key variables?)") + cli_abort("`x` contains duplicate values. (If being run on a column in an + `epi_df`, did you group by relevant key variables?)") } if (length(y) <= 1L) { - cli_abort("`y` has length {length(y)}; that's definitely too little for STL. (If being run in a `mutate()` or `epi_slide()`, check whether you grouped by too many variables; you should not be grouping by `time_value` in particular.)") + cli_abort("`y` has length {length(y)}; that's definitely too little for + STL. 
(If being run in a `mutate()` or `epi_slide()`, check + whether you grouped by too many variables; you should not be + grouping by `time_value` in particular.)") } distinct_x_skips <- unique(diff(x)) if (diff(range(distinct_x_skips)) > 1e-4 * mean(distinct_x_skips)) { From d55515ff50532e83649c03e4fd03ecf798386857 Mon Sep 17 00:00:00 2001 From: brookslogan Date: Fri, 21 Jun 2024 12:36:51 -0700 Subject: [PATCH 317/345] Update R/outliers.R Co-authored-by: nmdefries <42820733+nmdefries@users.noreply.github.com> --- R/outliers.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/outliers.R b/R/outliers.R index 3cc120a0..1d074a3a 100644 --- a/R/outliers.R +++ b/R/outliers.R @@ -236,7 +236,7 @@ detect_outlr_rm <- function(x = seq_along(y), y, n = 21, #' @template outlier-detection-options #' @template detect-outlr-return #' -#' @details The STL decomposition is computed using the [`stats::stl()`]. Once +#' @details The STL decomposition is computed using [`stats::stl()`]. Once #' computed, the outlier detection method is analogous to the rolling median #' method in [`detect_outlr_rm()`], except with the fitted values and residuals #' from the STL decomposition taking the place of the rolling median and From b0e207c283a1845fe1f9687f9b2df501a84acf4f Mon Sep 17 00:00:00 2001 From: brookslogan Date: Fri, 21 Jun 2024 19:38:30 +0000 Subject: [PATCH 318/345] docs: document (GHA) --- man/detect_outlr_stl.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/detect_outlr_stl.Rd b/man/detect_outlr_stl.Rd index ba732dab..695c2de7 100644 --- a/man/detect_outlr_stl.Rd +++ b/man/detect_outlr_stl.Rd @@ -79,7 +79,7 @@ replacement values from each detection method (\code{replacement}). Detects outliers based on a seasonal-trend decomposition using LOESS (STL). } \details{ -The STL decomposition is computed using the \code{\link[stats:stl]{stats::stl()}}. Once +The STL decomposition is computed using \code{\link[stats:stl]{stats::stl()}}. Once computed, the outlier detection method is analogous to the rolling median method in \code{\link[=detect_outlr_rm]{detect_outlr_rm()}}, except with the fitted values and residuals from the STL decomposition taking the place of the rolling median and From 9524643059c9422e7ceed7b20c258a46170a585c Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 17 Jul 2024 14:21:35 -0700 Subject: [PATCH 319/345] Address line length lint --- R/outliers.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/outliers.R b/R/outliers.R index 1d074a3a..3d0ff5e5 100644 --- a/R/outliers.R +++ b/R/outliers.R @@ -282,7 +282,8 @@ detect_outlr_stl <- function(x = seq_along(y), y, } distinct_x_skips <- unique(diff(x)) if (diff(range(distinct_x_skips)) > 1e-4 * mean(distinct_x_skips)) { - cli_abort("`x` does not appear to have regular spacing; consider filling in gaps with imputed values (STL does not allow NAs).") + cli_abort("`x` does not appear to have regular spacing; consider filling in + gaps with imputed values (STL does not allow NAs).") } if (is.unsorted(x)) { # <- for performance in common (sorted) case o <- order(x) From 00804257f3344c318a6556f3af2c5de6827ed3a3 Mon Sep 17 00:00:00 2001 From: "Logan C. 
Brooks" Date: Wed, 17 Jul 2024 23:07:21 -0700 Subject: [PATCH 320/345] Fix guess_period on datetimes, make it more precise + generic - Don't discard units and effectively replace them with seconds - Don't allow any tolerance in judging a remainder to be zero, since when we use it to generate the default `ref_time_values` that means we could miss reproducing some of the actual input time values. - Make it into an S3 generic so it can be extended for more time classes. --- DESCRIPTION | 2 +- NAMESPACE | 4 +++ NEWS.md | 2 ++ R/utils.R | 70 +++++++++++++++++++++++++------------ man/guess_period.Rd | 28 ++++++++------- tests/testthat/test-utils.R | 50 ++++++++++++++++++++++++++ 6 files changed, 119 insertions(+), 37 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index c9a3f589..543e9860 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: epiprocess Title: Tools for basic signal processing in epidemiology -Version: 0.7.12 +Version: 0.7.13 Authors@R: c( person("Jacob", "Bien", role = "ctb"), person("Logan", "Brooks", email = "lcbrooks@andrew.cmu.edu", role = c("aut", "cre")), diff --git a/NAMESPACE b/NAMESPACE index b4cafc83..b42c024e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -25,6 +25,9 @@ S3method(group_by,grouped_epi_archive) S3method(group_by_drop_default,grouped_epi_archive) S3method(group_modify,epi_df) S3method(groups,grouped_epi_archive) +S3method(guess_period,Date) +S3method(guess_period,POSIXt) +S3method(guess_period,default) S3method(key_colnames,data.frame) S3method(key_colnames,default) S3method(key_colnames,epi_archive) @@ -64,6 +67,7 @@ export(filter) export(group_by) export(group_modify) export(growth_rate) +export(guess_period) export(is_epi_df) export(is_grouped_epi_archive) export(key_colnames) diff --git a/NEWS.md b/NEWS.md index 382cbf19..434dba1b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -35,6 +35,8 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat - Improved documentation web site landing page's introduction. - Fixed documentation referring to old `epi_slide()` interface (#466, thanks @XuedaShen!). +- Fixed bug where `epix_slide_ref_time_values_default()` on datetimes would + output a huge number of `ref_time_values` spaced apart by mere seconds. ## Cleanup - Resolved some linting messages in package checks (#468). diff --git a/R/utils.R b/R/utils.R index b2098e9b..f899957e 100644 --- a/R/utils.R +++ b/R/utils.R @@ -670,28 +670,52 @@ gcd_num <- function(dividends, ..., rrtol = 1e-6, pqlim = 1e6, irtol = 1e-6) { vctrs::vec_cast(numeric_gcd, dividends) } -#' Use max valid period as guess for `period` of `ref_time_values` -#' -#' @param ref_time_values Vector containing time-interval-like or time-like -#' data, with at least two distinct values, [`diff`]-able (e.g., a -#' `time_value` or `version` column), and should have a sensible result from -#' adding `is.numeric` versions of its `diff` result (via `as.integer` if its -#' `typeof` is `"integer"`, otherwise via `as.numeric`). -#' @param ref_time_values_arg Optional, string; name to give `ref_time_values` -#' in error messages. Defaults to quoting the expression the caller fed into -#' the `ref_time_values` argument. 
-#' @return `is.numeric`, length 1; attempts to match `typeof(ref_time_values)` -guess_period <- function(ref_time_values, ref_time_values_arg = rlang::caller_arg(ref_time_values)) { - sorted_distinct_ref_time_values <- sort(unique(ref_time_values)) - if (length(sorted_distinct_ref_time_values) < 2L) { - cli_abort("Not enough distinct values in {.code {ref_time_values_arg}} to guess the period.", ref_time_values_arg) +#' Use max valid period as guess for `period` of `time_values` +#' +#' `r lifecycle::badge("experimental")` +#' +#' @param time_values Vector containing time-interval-like or time-point-like +#' data, with at least two distinct values. +#' @param time_values_arg Optional, string; name to give `time_values` in error +#' messages. Defaults to quoting the expression the caller fed into the +#' `time_values` argument. +#' @return length-1 vector; `r lifecycle::badge("experimental")` class will +#' either be the same class as [`base::diff()`] on such time values, an +#' integer, or a double, such that all `time_values` can be exactly obtained +#' by adding `k * result` for an integer k, and such that there is no smaller +#' `result` that can achieve this. +#' @export +guess_period <- function(time_values, time_values_arg = rlang::caller_arg(time_values), ...) { + UseMethod("guess_period") +} + +#' @export +guess_period.default <- function(time_values, time_values_arg = rlang::caller_arg(time_values), ...) { + rlang::check_dots_empty() + sorted_distinct_time_values <- sort(unique(time_values)) + if (length(sorted_distinct_time_values) < 2L) { + cli_abort("Not enough distinct values in {.code {time_values_arg}} to guess the period.", + class = "epiprocess__guess_period__not_enough_times", + time_values = time_values + ) } - skips <- diff(sorted_distinct_ref_time_values) - decayed_skips <- - if (typeof(skips) == "integer") { - as.integer(skips) - } else { - as.numeric(skips) - } - gcd_num(decayed_skips) + skips <- diff(sorted_distinct_time_values) + # Certain diff results have special classes or attributes; use vctrs to try to + # appropriately destructure for gcd_num, then restore to their original class + # & attributes. + skips_data <- vctrs::vec_data(skips) + period_data <- gcd_num(skips_data, rrtol = 0) + vctrs::vec_restore(period_data, skips) +} + +# `full_seq()` doesn't like difftimes, so convert to the natural units of some time types: + +#' @export +guess_period.Date <- function(time_values, time_values_arg = rlang::caller_arg(time_values), ...) { + as.numeric(NextMethod(), units = "days") +} + +#' @export +guess_period.POSIXt <- function(time_values, time_values_arg = rlang::caller_arg(time_values), ...) { + as.numeric(NextMethod(), units = "secs") } diff --git a/man/guess_period.Rd b/man/guess_period.Rd index e03a1373..7d53eba2 100644 --- a/man/guess_period.Rd +++ b/man/guess_period.Rd @@ -2,27 +2,29 @@ % Please edit documentation in R/utils.R \name{guess_period} \alias{guess_period} -\title{Use max valid period as guess for \code{period} of \code{ref_time_values}} +\title{Use max valid period as guess for \code{period} of \code{time_values}} \usage{ guess_period( - ref_time_values, - ref_time_values_arg = rlang::caller_arg(ref_time_values) + time_values, + time_values_arg = rlang::caller_arg(time_values), + ... 
) } \arguments{ -\item{ref_time_values}{Vector containing time-interval-like or time-like -data, with at least two distinct values, \code{\link{diff}}-able (e.g., a -\code{time_value} or \code{version} column), and should have a sensible result from -adding \code{is.numeric} versions of its \code{diff} result (via \code{as.integer} if its -\code{typeof} is \code{"integer"}, otherwise via \code{as.numeric}).} +\item{time_values}{Vector containing time-interval-like or time-point-like +data, with at least two distinct values.} -\item{ref_time_values_arg}{Optional, string; name to give \code{ref_time_values} -in error messages. Defaults to quoting the expression the caller fed into -the \code{ref_time_values} argument.} +\item{time_values_arg}{Optional, string; name to give \code{time_values} in error +messages. Defaults to quoting the expression the caller fed into the +\code{time_values} argument.} } \value{ -\code{is.numeric}, length 1; attempts to match \code{typeof(ref_time_values)} +length-1 vector; \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} class will +either be the same class as \code{\link[base:diff]{base::diff()}} on such time values, an +integer, or a double, such that all \code{time_values} can be exactly obtained +by adding \code{k * result} for an integer k, and such that there is no smaller +\code{result} that can achieve this. } \description{ -Use max valid period as guess for \code{period} of \code{ref_time_values} +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} } diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R index 85959d94..b69277f2 100644 --- a/tests/testthat/test-utils.R +++ b/tests/testthat/test-utils.R @@ -231,3 +231,53 @@ test_that("as_slide_computation raises errors as expected", { class = "epiprocess__as_slide_computation__cant_convert_catchall" ) }) + +test_that("guess_period works", { + # Error cases: + expect_error(guess_period(numeric(0L)), class = "epiprocess__guess_period__not_enough_times") + expect_error(guess_period(c(1)), class = "epiprocess__guess_period__not_enough_times") + # Different numeric classes and cases: + expect_identical(guess_period(c(1, 8)), 7) + expect_identical(guess_period(c(1, 8, 15)), 7) + expect_identical(guess_period(c(1L, 8L, 15L)), 7L) + expect_identical(guess_period(c(0, 7, 14, 15)), 1) + # We currently allow the guessed frequency to not appear in the diffs, but this + # might not be a good idea as it likely indicates an issue with the data.
If + # we drop this behavior we could also drop the gcd algorithm by just checking + # the validity of the smallest diff: + expect_identical(guess_period(c(0, 2, 5)), 1) + expect_identical(guess_period(c(0, 4, 10)), 2) + # On Dates: + daily_dates <- seq(as.Date("2020-01-01"), as.Date("2020-01-15"), by = "day") + weekly_dates <- seq(as.Date("2020-01-01"), as.Date("2020-01-15"), by = "week") + expect_identical( + daily_dates[[1L]] + guess_period(daily_dates) * (seq_along(daily_dates) - 1L), + daily_dates + ) + expect_identical( + weekly_dates[[1L]] + guess_period(weekly_dates) * (seq_along(weekly_dates) - 1L), + weekly_dates + ) + # On POSIXcts: + daily_posixcts <- as.POSIXct(daily_dates, tz = "ET") + 3600 + weekly_posixcts <- as.POSIXct(weekly_dates, tz = "ET") + 3600 + expect_identical( + daily_posixcts[[1L]] + guess_period(daily_posixcts) * (seq_along(daily_posixcts) - 1L), + daily_posixcts + ) + expect_identical( + weekly_posixcts[[1L]] + guess_period(weekly_posixcts) * (seq_along(weekly_posixcts) - 1L), + weekly_posixcts + ) + # On POSIXlts: + daily_posixlts <- as.POSIXlt(daily_dates, tz = "ET") + 3600 + weekly_posixlts <- as.POSIXlt(weekly_dates, tz = "ET") + 3600 + expect_identical( + daily_posixlts[[1L]] + guess_period(daily_posixlts) * (seq_along(daily_posixlts) - 1L), + daily_posixlts + ) + expect_identical( + weekly_posixlts[[1L]] + guess_period(weekly_posixlts) * (seq_along(weekly_posixlts) - 1L), + weekly_posixlts + ) +}) From 4a6f7bfb116148221d46b9f28d979b8bf7138816 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Wed, 5 Jun 2024 23:47:14 -0500 Subject: [PATCH 321/345] basic auto-naming --- NAMESPACE | 1 + R/epi_df.R | 27 ++++++++++++++++++++++++++- man/as_epi_df.Rd | 10 +++++++++- tests/testthat/test-epi_df.R | 16 ++++++++++++++++ 4 files changed, 52 insertions(+), 2 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index b4cafc83..edbaf137 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -100,6 +100,7 @@ importFrom(checkmate,test_subset) importFrom(checkmate,vname) importFrom(cli,cat_line) importFrom(cli,cli_abort) +importFrom(cli,cli_inform) importFrom(cli,cli_vec) importFrom(cli,cli_warn) importFrom(cli,format_message) diff --git a/R/epi_df.R b/R/epi_df.R index f4df1604..6ac1b7db 100644 --- a/R/epi_df.R +++ b/R/epi_df.R @@ -256,9 +256,34 @@ as_epi_df.epi_df <- function(x, ...) { #' (stored in its attributes); if this fails, then the current day-time will #' be used. #' @importFrom rlang .data +#' @importFrom cli cli_inform #' @export as_epi_df.tbl_df <- function(x, geo_type, time_type, as_of, - additional_metadata = list(), ...) { + additional_metadata = list(), + substitutions = NULL, ...) { + # possible standard substitutions + if (!("time_value" %in% names(x))) { + if (("forecast_date" %in% names(x)) && ("target_date" %in% names(x))) { + cli_abort("both `forecast_date` and `target_date` are present without a `time_value` +column, so it is ambiguous which to choose as `time_value`.") + } + name_substitutions <- substitutions %||% c( + time_value = "date", + time_value = "forecast_date", + time_value = "target_date", + time_value = "dates", + time_value = "time_values", + time_value = "forecast_dates", + time_value = "target_dates" + ) + x <- tryCatch(x %>% rename(any_of(name_substitutions)), + error = function(cond) { + cli_abort("There are multiple `time_value` candidate columns. 
+Either `rename` on yourself or drop some.") + } + ) + cli_inform("inferring `time_value` column.") + } if (!test_subset(c("geo_value", "time_value"), names(x))) { cli_abort( "Columns `geo_value` and `time_value` must be present in `x`." diff --git a/man/as_epi_df.Rd b/man/as_epi_df.Rd index 40c0a1c5..82bd8882 100644 --- a/man/as_epi_df.Rd +++ b/man/as_epi_df.Rd @@ -12,7 +12,15 @@ as_epi_df(x, ...) \method{as_epi_df}{epi_df}(x, ...) -\method{as_epi_df}{tbl_df}(x, geo_type, time_type, as_of, additional_metadata = list(), ...) +\method{as_epi_df}{tbl_df}( + x, + geo_type, + time_type, + as_of, + additional_metadata = list(), + substitutions = NULL, + ... +) \method{as_epi_df}{data.frame}(x, geo_type, time_type, as_of, additional_metadata = list(), ...) diff --git a/tests/testthat/test-epi_df.R b/tests/testthat/test-epi_df.R index 8cfb4408..8ffc9820 100644 --- a/tests/testthat/test-epi_df.R +++ b/tests/testthat/test-epi_df.R @@ -46,6 +46,22 @@ test_that("as_epi_df errors when additional_metadata is not a list", { ) }) +test_that("as_epi_df works for nonstandard input", { + tib <- tibble::tibble( + x = 1:10, y = 1:10, + date = rep(seq(as.Date("2020-01-01"), by = 1, length.out = 5), times = 2), + geo_value = rep(c("ca", "hi"), each = 5) + ) + expect_message(expect_no_error(tib_epi_df <- tib %>% as_epi_df())) + + tib <- tib %>% rename(forecast_date = date) + expect_message(expect_no_error(tib_epi_df <- tib %>% as_epi_df())) + tib %>% rename(any_of(name_substitutions)) + + tib <- tib %>% mutate(target_date = 20 + forecast_date) + expect_error(tib_epi_df <- tib %>% as_epi_df()) +}) + # select fixes tib <- tibble::tibble( From 284daaf93465ea92f58f4631da339a6c866150c6 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 7 Jun 2024 16:04:39 -0500 Subject: [PATCH 322/345] geo_value and version, separate functions, more ex --- NAMESPACE | 1 + R/archive.R | 3 + R/epi_df.R | 29 ++-------- R/utils.R | 106 ++++++++++++++++++++++++++++++++++ man/as_epi_df.Rd | 2 +- man/guess_time_column_name.Rd | 12 ++++ man/upcase_snake_case.Rd | 16 +++++ 7 files changed, 144 insertions(+), 25 deletions(-) create mode 100644 man/guess_time_column_name.Rd create mode 100644 man/upcase_snake_case.Rd diff --git a/NAMESPACE b/NAMESPACE index edbaf137..f0b01e82 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -187,6 +187,7 @@ importFrom(tibble,as_tibble) importFrom(tibble,new_tibble) importFrom(tibble,validate_tibble) importFrom(tidyr,unnest) +importFrom(tidyselect,any_of) importFrom(tidyselect,eval_select) importFrom(tidyselect,starts_with) importFrom(tsibble,as_tsibble) diff --git a/R/archive.R b/R/archive.R index 464d68ef..8488cbfa 100644 --- a/R/archive.R +++ b/R/archive.R @@ -456,6 +456,9 @@ as_epi_archive <- function( clobberable_versions_start = NULL, versions_end = NULL) { assert_data_frame(x) + x <- guess_time_column_name(x) + x <- guess_geo_column_name(x) + x <- guess_version_column_name(x) if (!test_subset(c("geo_value", "time_value", "version"), names(x))) { cli_abort( "Columns `geo_value`, `time_value`, and `version` must be present in `x`." diff --git a/R/epi_df.R b/R/epi_df.R index 6ac1b7db..712c4b0e 100644 --- a/R/epi_df.R +++ b/R/epi_df.R @@ -256,34 +256,15 @@ as_epi_df.epi_df <- function(x, ...) { #' (stored in its attributes); if this fails, then the current day-time will #' be used. #' @importFrom rlang .data +#' @importFrom tidyselect any_of #' @importFrom cli cli_inform #' @export as_epi_df.tbl_df <- function(x, geo_type, time_type, as_of, additional_metadata = list(), - substitutions = NULL, ...) 
{ - # possible standard substitutions - if (!("time_value" %in% names(x))) { - if (("forecast_date" %in% names(x)) && ("target_date" %in% names(x))) { - cli_abort("both `forecast_date` and `target_date` are present without a `time_value` -column, so it is ambiguous which to choose as `time_value`.") - } - name_substitutions <- substitutions %||% c( - time_value = "date", - time_value = "forecast_date", - time_value = "target_date", - time_value = "dates", - time_value = "time_values", - time_value = "forecast_dates", - time_value = "target_dates" - ) - x <- tryCatch(x %>% rename(any_of(name_substitutions)), - error = function(cond) { - cli_abort("There are multiple `time_value` candidate columns. -Either `rename` on yourself or drop some.") - } - ) - cli_inform("inferring `time_value` column.") - } + ...) { + # possible standard substitutions for time_value + x <- guess_time_column_name(x) + x <- guess_geo_column_name(x) if (!test_subset(c("geo_value", "time_value"), names(x))) { cli_abort( "Columns `geo_value` and `time_value` must be present in `x`." diff --git a/R/utils.R b/R/utils.R index b2098e9b..39f97a36 100644 --- a/R/utils.R +++ b/R/utils.R @@ -448,6 +448,112 @@ guess_time_type <- function(time_value) { return("custom") } +#' given a vector of characters, add the same values, but upcased, e.g. +#' "date" -> c("date", "Date") +#' "target_date" -> c("target_date", "Target_Date") +#' @keywords internal +upcase_snake_case <- function(x) { + X <- strsplit(x, "_") %>% + map(function(name) paste0(toupper(substr(name, 1, 1)), substr(name, 2, nchar(name)), collapse = "_")) %>% + unlist() + c(x, X) +} + +#' given an arbitrary +#' @keywords internal +guess_time_column_name <- function(x, substitutions = NULL) { + if (!("time_value" %in% names(x))) { + if (is.null(substitutions)) { + substitutions <- c( + time_value = "date", + time_value = "time", + time_value = "datetime", + time_value = "dateTime", + tmie_value = "date_time", + time_value = "forecast_date", + time_value = "target_date", + time_value = "week", + time_value = "day", + time_value = "epiweek", + time_value = "month", + time_value = "year", + time_value = "yearmon", + time_value = "yearMon", + time_value = "dates", + time_value = "time_values", + time_value = "forecast_dates", + time_value = "target_dates" + ) + substitutions <- upcase_snake_case(substitutions) + } + strsplit(name_substitutions, "_") %>% + map(function(name) paste0(toupper(substr(name, 1, 1)), substr(name, 2, nchar(name)), collapse = "_")) %>% + unlist() + x <- tryCatch(x %>% rename(any_of(name_substitutions)), + error = function(cond) { + cli_abort("There are multiple `time_value` candidate columns. +Either `rename` some yourself or drop some.") + } + ) + cli_inform("inferring `time_value` column.") + } + return(x) +} + + +#' @keywords internal +guess_geo_column_name <- function(x, substitutions = NULL) { + if (!("time_value" %in% names(x))) { + substitutions <- substitutions %||% c( + geo_value = "geo_values", + geo_value = "geo_id", + geo_value = "geos", + geo_value = "location", + geo_value = "jurisdiction", + geo_value = "fips", + geo_value = "zip", + geo_value = "county", + geo_value = "hrr", + geo_value = "msa", + geo_value = "state", + geo_value = "province", + geo_value = "nation", + geo_value = "states", + geo_value = "provinces", + geo_value = "counties" + ) + substitutions <- upcase_snake_case(substitutions) + x <- tryCatch(x %>% rename(any_of(substitutions)), + error = function(cond) { + cli_abort("There are multiple `geo_value` candidate columns. 
+Either `rename` some yourself or drop some.") + } + ) + cli_inform("inferring `time_value` column.") + } + return(x) +} + +guess_version_column_name <- function(x, substitutions = NULL) { + if (!("version" %in% names(x))) { + if (is.null(substitutions)) { + substitutions <- c( + version = "issue", + version = "release" + ) + substitutions <- upcase_snake_case(substitutions) + } + x <- tryCatch(x %>% rename(any_of(substitutions)), + error = function(cond) { + cli_abort("There are multiple `geo_value` candidate columns. +Either `rename` some yourself or drop some.") + } + ) + cli_inform("inferring `time_value` column.") + } + return(x) +} + ########## diff --git a/man/as_epi_df.Rd b/man/as_epi_df.Rd index 82bd8882..22faa265 100644 --- a/man/as_epi_df.Rd +++ b/man/as_epi_df.Rd @@ -17,8 +17,8 @@ as_epi_df(x, ...) geo_type, time_type, as_of, + other_keys = character(0), additional_metadata = list(), - substitutions = NULL, ... ) diff --git a/man/guess_time_column_name.Rd b/man/guess_time_column_name.Rd new file mode 100644 index 00000000..45a173b6 --- /dev/null +++ b/man/guess_time_column_name.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{guess_time_column_name} +\alias{guess_time_column_name} +\title{given an arbitrary} +\usage{ +guess_time_column_name(x, substitutions = NULL) +} +\description{ +given an arbitrary +} +\keyword{internal} diff --git a/man/upcase_snake_case.Rd b/man/upcase_snake_case.Rd new file mode 100644 index 00000000..398f6a0b --- /dev/null +++ b/man/upcase_snake_case.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{upcase_snake_case} +\alias{upcase_snake_case} +\title{given a vector of characters, add the same values, but upcased, e.g. +"date" -> c("date", "Date") +"target_date" -> c("target_date", "Target_Date")} +\usage{ +upcase_snake_case(x) +} +\description{ +given a vector of characters, add the same values, but upcased, e.g. +"date" -> c("date", "Date") +"target_date" -> c("target_date", "Target_Date") +} +\keyword{internal} From 6b361da54d06a7c317581b6d56c8d061fbef4bae Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 7 Jun 2024 21:06:52 +0000 Subject: [PATCH 323/345] docs: document (GHA) --- man/as_epi_df.Rd | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/man/as_epi_df.Rd b/man/as_epi_df.Rd index 22faa265..40c0a1c5 100644 --- a/man/as_epi_df.Rd +++ b/man/as_epi_df.Rd @@ -12,15 +12,7 @@ as_epi_df(x, ...) \method{as_epi_df}{epi_df}(x, ...) -\method{as_epi_df}{tbl_df}( - x, - geo_type, - time_type, - as_of, - other_keys = character(0), - additional_metadata = list(), - ... -) +\method{as_epi_df}{tbl_df}(x, geo_type, time_type, as_of, additional_metadata = list(), ...) \method{as_epi_df}{data.frame}(x, geo_type, time_type, as_of, additional_metadata = list(), ...) 
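For reference, a usage sketch of the auto-naming these patches add (data made up; behavior as of these commits, before the later refactor):

library(dplyr)
library(epiprocess)

tib <- tibble(
  x = 1:10,
  date = rep(seq(as.Date("2020-01-01"), by = 1, length.out = 5), times = 2),
  geo_value = rep(c("ca", "hi"), each = 5)
)

# `date` is on the substitution list, so it is renamed to `time_value`,
# with a message noting the inference:
edf <- tib %>% as_epi_df()

# If several candidate columns are present at once (say both `date` and
# `week`), the conversion aborts rather than guessing between them.
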
From 3276e9e5d6eacc45773c41ad0db4aaa308ee312d Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 7 Jun 2024 17:31:58 -0500 Subject: [PATCH 324/345] errant renamed variables --- R/utils.R | 4 ++-- tests/testthat/test-epi_df.R | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/R/utils.R b/R/utils.R index 39f97a36..e52c0da9 100644 --- a/R/utils.R +++ b/R/utils.R @@ -486,10 +486,10 @@ guess_time_column_name <- function(x, substitutions = NULL) { ) substitutions <- upcase_snake_case(substitutions) } - strsplit(name_substitutions, "_") %>% + strsplit(substitutions, "_") %>% map(function(name) paste0(toupper(substr(name, 1, 1)), substr(name, 2, nchar(name)), collapse = "_")) %>% unlist() - x <- tryCatch(x %>% rename(any_of(name_substitutions)), + x <- tryCatch(x %>% rename(any_of(substitutions)), error = function(cond) { cli_abort("There are multiple `time_value` candidate columns. Either `rename` some yourself or drop some.") diff --git a/tests/testthat/test-epi_df.R b/tests/testthat/test-epi_df.R index 8ffc9820..c6a304bd 100644 --- a/tests/testthat/test-epi_df.R +++ b/tests/testthat/test-epi_df.R @@ -56,7 +56,6 @@ test_that("as_epi_df works for nonstandard input", { tib <- tib %>% rename(forecast_date = date) expect_message(expect_no_error(tib_epi_df <- tib %>% as_epi_df())) - tib %>% rename(any_of(name_substitutions)) tib <- tib %>% mutate(target_date = 20 + forecast_date) expect_error(tib_epi_df <- tib %>% as_epi_df()) From 00ce1da71e00db14efd78345b436c1b12d9aa69d Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Mon, 24 Jun 2024 19:05:44 -0500 Subject: [PATCH 325/345] More tests, `...` tidyselect, doc as_epi_df, more values --- R/archive.R | 3 ++- R/epi_df.R | 19 ++++++++++++------- R/utils.R | 20 ++++++++++++++------ man/as_epi_df.Rd | 16 ++++++++++------ man/guess_time_column_name.Rd | 4 ++-- tests/testthat/test-epi_df.R | 9 +++++++++ 6 files changed, 49 insertions(+), 22 deletions(-) diff --git a/R/archive.R b/R/archive.R index 8488cbfa..83806eef 100644 --- a/R/archive.R +++ b/R/archive.R @@ -454,8 +454,9 @@ as_epi_archive <- function( additional_metadata = NULL, compactify = NULL, clobberable_versions_start = NULL, - versions_end = NULL) { + versions_end = NULL, ...) { assert_data_frame(x) + x <- rename(x, ...) x <- guess_time_column_name(x) x <- guess_geo_column_name(x) x <- guess_version_column_name(x) diff --git a/R/epi_df.R b/R/epi_df.R index 712c4b0e..c7554b33 100644 --- a/R/epi_df.R +++ b/R/epi_df.R @@ -249,12 +249,16 @@ as_epi_df.epi_df <- function(x, ...) { #' @method as_epi_df tbl_df #' @describeIn as_epi_df The input tibble `x` must contain the columns -#' `geo_value` and `time_value`. All other columns will be preserved as is, -#' and treated as measured variables. If `as_of` is missing, then the function -#' will try to guess it from an `as_of`, `issue`, or `version` column of `x` -#' (if any of these are present), or from as an `as_of` field in its metadata -#' (stored in its attributes); if this fails, then the current day-time will -#' be used. +#' `geo_value` and `time_value`, or column names that uniquely map onto these +#' (e.g. `date` or `province`). Alternatively, you can specify the conversion +#' explicitly (`time_value = someWeirdColumnName`). All other columns not +#' specified as `other_keys` will be preserved as is, and treated as measured +#' variables. 
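A sketch of that explicit spelling (column names invented for illustration):

library(dplyr)
library(epiprocess)

# A rename spec passed through `...` is forwarded to dplyr::rename() before
# any guessing happens, so arbitrary column names can be mapped directly:
tib2 <- tibble(
  cases = 1:5,
  someWeirdColumnName = seq(as.Date("2020-01-01"), by = 1, length.out = 5),
  jurisdiction = "ca"
)
edf2 <- tib2 %>% as_epi_df(time_value = someWeirdColumnName, geo_value = jurisdiction)
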
+#' +#' If `as_of` is missing, then the function will try to guess it from an +#' `as_of`, `issue`, or `version` column of `x` (if any of these are present), +#' or from as an `as_of` field in its metadata (stored in its attributes); if +#' this fails, then the current day-time will be used. #' @importFrom rlang .data #' @importFrom tidyselect any_of #' @importFrom cli cli_inform @@ -263,11 +267,12 @@ as_epi_df.tbl_df <- function(x, geo_type, time_type, as_of, additional_metadata = list(), ...) { # possible standard substitutions for time_value + x <- rename(x, ...) x <- guess_time_column_name(x) x <- guess_geo_column_name(x) if (!test_subset(c("geo_value", "time_value"), names(x))) { cli_abort( - "Columns `geo_value` and `time_value` must be present in `x`." + "Either columns `geo_value` and `time_value` must be present in `x`, or related columns (see the internal functions `guess_time_column_name()` and/or `guess_geo_column_name()` for a complete list)." ) } diff --git a/R/utils.R b/R/utils.R index e52c0da9..bce158dd 100644 --- a/R/utils.R +++ b/R/utils.R @@ -459,7 +459,7 @@ upcase_snake_case <- function(x) { c(x, X) } -#' given an arbitrary +#' rename potential time_value columns #' @keywords internal guess_time_column_name <- function(x, substitutions = NULL) { if (!("time_value" %in% names(x))) { @@ -473,12 +473,14 @@ guess_time_column_name <- function(x, substitutions = NULL) { time_value = "forecast_date", time_value = "target_date", time_value = "week", - time_value = "day", time_value = "epiweek", time_value = "month", + time_value = "mon", time_value = "year", time_value = "yearmon", + time_value = "yearmonth", time_value = "yearMon", + time_value = "yearMonth", time_value = "dates", time_value = "time_values", time_value = "forecast_dates", @@ -495,7 +497,9 @@ guess_time_column_name <- function(x, substitutions = NULL) { Either `rename` some yourself or drop some.") } ) - cli_inform("inferring `time_value` column.") + if (any(substitutions != "")) { + cli_inform("inferring `time_value` column.") + } } return(x) } @@ -529,7 +533,9 @@ guess_geo_column_name <- function(x, substitutions = NULL) { Either `rename` some yourself or drop some.") } ) - cli_inform("inferring `time_value` column.") + if (any(substitutions != "")) { + cli_inform("inferring `geo_value` column.") + } } return(x) } @@ -545,11 +551,13 @@ guess_version_column_name <- function(x, substitutions = NULL) { } x <- tryCatch(x %>% rename(any_of(substitutions)), error = function(cond) { - cli_abort("There are multiple `geo_value` candidate columns. + cli_abort("There are multiple `version` candidate columns. Either `rename` some yourself or drop some.") } ) - cli_inform("inferring `time_value` column.") + if (any(substitutions != "")) { + cli_inform("inferring `version` column.") + } } return(x) } diff --git a/man/as_epi_df.Rd b/man/as_epi_df.Rd index 40c0a1c5..98cdbb83 100644 --- a/man/as_epi_df.Rd +++ b/man/as_epi_df.Rd @@ -55,12 +55,16 @@ examples. \item \code{as_epi_df(epi_df)}: Simply returns the \code{epi_df} object unchanged. \item \code{as_epi_df(tbl_df)}: The input tibble \code{x} must contain the columns -\code{geo_value} and \code{time_value}. All other columns will be preserved as is, -and treated as measured variables. 
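The same spelling carries over to archives, since `as_epi_archive()` gains the same `...` rename pass in this patch (sketch; `raw_versioned_dt` and `release_date` are made-up names):

# `version = release_date` renames before guessing; bare `issue`/`release`
# columns would also be picked up by the substitution list:
archive <- raw_versioned_dt %>% as_epi_archive(version = release_date)
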
If \code{as_of} is missing, then the function -will try to guess it from an \code{as_of}, \code{issue}, or \code{version} column of \code{x} -(if any of these are present), or from as an \code{as_of} field in its metadata -(stored in its attributes); if this fails, then the current day-time will -be used. +\code{geo_value} and \code{time_value}, or column names that uniquely map onto these +(e.g. \code{date} or \code{province}). Alternatively, you can specify the conversion +explicitly (\code{time_value = someWeirdColumnName}). All other columns not +specified as \code{other_keys} will be preserved as is, and treated as measured +variables. + +If \code{as_of} is missing, then the function will try to guess it from an +\code{as_of}, \code{issue}, or \code{version} column of \code{x} (if any of these are present), +or from as an \code{as_of} field in its metadata (stored in its attributes); if +this fails, then the current day-time will be used. \item \code{as_epi_df(data.frame)}: Works analogously to \code{as_epi_df.tbl_df()}. diff --git a/man/guess_time_column_name.Rd b/man/guess_time_column_name.Rd index 45a173b6..f09a0e6e 100644 --- a/man/guess_time_column_name.Rd +++ b/man/guess_time_column_name.Rd @@ -2,11 +2,11 @@ % Please edit documentation in R/utils.R \name{guess_time_column_name} \alias{guess_time_column_name} -\title{given an arbitrary} +\title{rename potential time_value columns} \usage{ guess_time_column_name(x, substitutions = NULL) } \description{ -given an arbitrary +rename potential time_value columns } \keyword{internal} diff --git a/tests/testthat/test-epi_df.R b/tests/testthat/test-epi_df.R index c6a304bd..e7f092fd 100644 --- a/tests/testthat/test-epi_df.R +++ b/tests/testthat/test-epi_df.R @@ -53,6 +53,15 @@ test_that("as_epi_df works for nonstandard input", { geo_value = rep(c("ca", "hi"), each = 5) ) expect_message(expect_no_error(tib_epi_df <- tib %>% as_epi_df())) + expect_no_error(tib_epi_df <- tib %>% as_epi_df(time_value = date, geo_value = geo_value)) + expect_error(expect_message( + tib %>% rename(awefa = geo_value) %>% as_epi_df(), + regexp = "inferring `time_value` column." + )) + expect_no_error(expect_message( + tib %>% rename(awefa = geo_value) %>% as_epi_df(geo_value = awefa), + regexp = "inferring `time_value` column." + )) tib <- tib %>% rename(forecast_date = date) expect_message(expect_no_error(tib_epi_df <- tib %>% as_epi_df())) From 287edd7608632bbbf0d3a8cb3fcedd6e6f9d46bc Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Tue, 25 Jun 2024 00:08:36 +0000 Subject: [PATCH 326/345] docs: document (GHA) --- man/epi_archive.Rd | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd index 97ff6af0..d95e6eb5 100644 --- a/man/epi_archive.Rd +++ b/man/epi_archive.Rd @@ -37,7 +37,8 @@ as_epi_archive( additional_metadata = NULL, compactify = NULL, clobberable_versions_start = NULL, - versions_end = NULL + versions_end = NULL, + ... 
) } \arguments{ From 75c62c2bacc9bea2b35b67498de85acbe04421d8 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Tue, 25 Jun 2024 10:47:03 -0500 Subject: [PATCH 327/345] remove `forecast_date` as a default --- R/utils.R | 2 -- 1 file changed, 2 deletions(-) diff --git a/R/utils.R b/R/utils.R index bce158dd..d008f1b3 100644 --- a/R/utils.R +++ b/R/utils.R @@ -470,7 +470,6 @@ guess_time_column_name <- function(x, substitutions = NULL) { time_value = "datetime", time_value = "dateTime", tmie_value = "date_time", - time_value = "forecast_date", time_value = "target_date", time_value = "week", time_value = "epiweek", @@ -483,7 +482,6 @@ guess_time_column_name <- function(x, substitutions = NULL) { time_value = "yearMonth", time_value = "dates", time_value = "time_values", - time_value = "forecast_dates", time_value = "target_dates" ) substitutions <- upcase_snake_case(substitutions) From 952e0e24b6b769edaf042bb7116fe464d2b9645d Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Tue, 25 Jun 2024 10:55:02 -0500 Subject: [PATCH 328/345] happier linter --- R/epi_df.R | 3 ++- R/utils.R | 6 +++--- man/upcase_snake_case.Rd | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/R/epi_df.R b/R/epi_df.R index c7554b33..5c84e90f 100644 --- a/R/epi_df.R +++ b/R/epi_df.R @@ -272,7 +272,8 @@ as_epi_df.tbl_df <- function(x, geo_type, time_type, as_of, x <- guess_geo_column_name(x) if (!test_subset(c("geo_value", "time_value"), names(x))) { cli_abort( - "Either columns `geo_value` and `time_value` must be present in `x`, or related columns (see the internal functions `guess_time_column_name()` and/or `guess_geo_column_name()` for a complete list)." + "Either columns `geo_value` and `time_value` must be present in `x`, or related columns (see the internal + functions `guess_time_column_name()` and/or `guess_geo_column_name()` for a complete list)." ) } diff --git a/R/utils.R b/R/utils.R index d008f1b3..c00205a3 100644 --- a/R/utils.R +++ b/R/utils.R @@ -452,11 +452,11 @@ guess_time_type <- function(time_value) { #' "date" -> c("date", "Date") #' "target_date" -> c("target_date", "Target_Date") #' @keywords internal -upcase_snake_case <- function(x) { - X <- strsplit(x, "_") %>% +upcase_snake_case <- function(vec) { + VEC <- strsplit(vec, "_") %>% map(function(name) paste0(toupper(substr(name, 1, 1)), substr(name, 2, nchar(name)), collapse = "_")) %>% unlist() - c(x, X) + c(vec, VEC) } #' rename potential time_value columns diff --git a/man/upcase_snake_case.Rd b/man/upcase_snake_case.Rd index 398f6a0b..31ecb768 100644 --- a/man/upcase_snake_case.Rd +++ b/man/upcase_snake_case.Rd @@ -6,7 +6,7 @@ "date" -> c("date", "Date") "target_date" -> c("target_date", "Target_Date")} \usage{ -upcase_snake_case(x) +upcase_snake_case(vec) } \description{ given a vector of characters, add the same values, but upcased, e.g. From e3feab6255bdf9eee54256823e524409c25537a6 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Tue, 25 Jun 2024 15:34:40 -0500 Subject: [PATCH 329/345] wrong test, better too many columns error --- R/archive.R | 1 + R/utils.R | 10 +++++----- man/epi_archive.Rd | 2 ++ tests/testthat/test-epi_df.R | 4 ++-- 4 files changed, 10 insertions(+), 7 deletions(-) diff --git a/R/archive.R b/R/archive.R index 83806eef..b9c29b63 100644 --- a/R/archive.R +++ b/R/archive.R @@ -442,6 +442,7 @@ validate_epi_archive <- function( #' `as_epi_archive` converts a data frame, data table, or tibble into an #' `epi_archive` object. +#' @param ... 
used for specifying column names, such as `version = release_date` #' #' @rdname epi_archive #' diff --git a/R/utils.R b/R/utils.R index c00205a3..d2fa3ab4 100644 --- a/R/utils.R +++ b/R/utils.R @@ -453,10 +453,10 @@ guess_time_type <- function(time_value) { #' "target_date" -> c("target_date", "Target_Date") #' @keywords internal upcase_snake_case <- function(vec) { - VEC <- strsplit(vec, "_") %>% + upper_vec <- strsplit(vec, "_") %>% map(function(name) paste0(toupper(substr(name, 1, 1)), substr(name, 2, nchar(name)), collapse = "_")) %>% unlist() - c(vec, VEC) + c(vec, upper_vec) } #' rename potential time_value columns @@ -491,7 +491,7 @@ guess_time_column_name <- function(x, substitutions = NULL) { unlist() x <- tryCatch(x %>% rename(any_of(substitutions)), error = function(cond) { - cli_abort("There are multiple `time_value` candidate columns. + cli_abort("{names(x)[names(x) %in% substitutions]} are both/all valid substitutions. Either `rename` some yourself or drop some.") } ) @@ -527,7 +527,7 @@ guess_geo_column_name <- function(x, substitutions = NULL) { substitutions <- upcase_snake_case(substitutions) x <- tryCatch(x %>% rename(any_of(substitutions)), error = function(cond) { - cli_abort("There are multiple `geo_value` candidate columns. + cli_abort("{names(x)[names(x) %in% substitutions]} are both/all valid substitutions. Either `rename` some yourself or drop some.") } ) @@ -549,7 +549,7 @@ guess_version_column_name <- function(x, substitutions = NULL) { } x <- tryCatch(x %>% rename(any_of(substitutions)), error = function(cond) { - cli_abort("There are multiple `version` candidate columns. + cli_abort("{names(x)[names(x) %in% substitutions]} are both/all valid substitutions. Either `rename` some yourself or drop some.") } ) diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd index d95e6eb5..943d8e11 100644 --- a/man/epi_archive.Rd +++ b/man/epi_archive.Rd @@ -90,6 +90,8 @@ beyond \code{max(x$version)}, but they all contained empty updates. (The default value of \code{clobberable_versions_start} does not fully trust these empty updates, and assumes that any version \verb{>= max(x$version)} could be clobbered.) If \code{nrow(x) == 0}, then this argument is mandatory.} + +\item{...}{used for specifying column names, such as \code{version = release_date}} } \value{ An \code{epi_archive} object. diff --git a/tests/testthat/test-epi_df.R b/tests/testthat/test-epi_df.R index e7f092fd..1ccb0bec 100644 --- a/tests/testthat/test-epi_df.R +++ b/tests/testthat/test-epi_df.R @@ -63,10 +63,10 @@ test_that("as_epi_df works for nonstandard input", { regexp = "inferring `time_value` column." 
)) - tib <- tib %>% rename(forecast_date = date) + tib <- tib %>% rename(target_date = date) expect_message(expect_no_error(tib_epi_df <- tib %>% as_epi_df())) - tib <- tib %>% mutate(target_date = 20 + forecast_date) + tib <- tib %>% mutate(Time = 20 + target_date) expect_error(tib_epi_df <- tib %>% as_epi_df()) }) From ad70c69ad97d3924bac72f12f666c0492b287786 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Mon, 8 Jul 2024 12:36:14 -0500 Subject: [PATCH 330/345] minor refactor to make col name subs accessible --- NAMESPACE | 3 + R/archive.R | 6 +- R/epi_df.R | 4 +- R/utils.R | 139 ++++++++++++++-------------------- man/geo_column_names.Rd | 12 +++ man/guess_column_name.Rd | 17 +++++ man/guess_time_column_name.Rd | 12 --- man/time_column_names.Rd | 12 +++ man/version_column_names.Rd | 12 +++ tests/testthat/test-archive.R | 8 +- tests/testthat/test-epi_df.R | 4 +- 11 files changed, 122 insertions(+), 107 deletions(-) create mode 100644 man/geo_column_names.Rd create mode 100644 man/guess_column_name.Rd delete mode 100644 man/guess_time_column_name.Rd create mode 100644 man/time_column_names.Rd create mode 100644 man/version_column_names.Rd diff --git a/NAMESPACE b/NAMESPACE index f0b01e82..dcbc151e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -61,6 +61,7 @@ export(epix_merge) export(epix_slide) export(epix_truncate_versions_after) export(filter) +export(geo_column_names) export(group_by) export(group_modify) export(growth_rate) @@ -75,9 +76,11 @@ export(next_after) export(relocate) export(rename) export(slice) +export(time_column_names) export(ungroup) export(unnest) export(validate_epi_archive) +export(version_column_names) importFrom(checkmate,anyInfinite) importFrom(checkmate,anyMissing) importFrom(checkmate,assert) diff --git a/R/archive.R b/R/archive.R index b9c29b63..2adccb5f 100644 --- a/R/archive.R +++ b/R/archive.R @@ -458,9 +458,9 @@ as_epi_archive <- function( versions_end = NULL, ...) { assert_data_frame(x) x <- rename(x, ...) - x <- guess_time_column_name(x) - x <- guess_geo_column_name(x) - x <- guess_version_column_name(x) + x <- guess_column_name(x, "time_value", time_column_names()) + x <- guess_column_name(x, "geo_value", geo_column_names()) + x <- guess_column_name(x, "version", version_column_names()) if (!test_subset(c("geo_value", "time_value", "version"), names(x))) { cli_abort( "Columns `geo_value`, `time_value`, and `version` must be present in `x`." diff --git a/R/epi_df.R b/R/epi_df.R index 5c84e90f..2cb68138 100644 --- a/R/epi_df.R +++ b/R/epi_df.R @@ -268,8 +268,8 @@ as_epi_df.tbl_df <- function(x, geo_type, time_type, as_of, ...) { # possible standard substitutions for time_value x <- rename(x, ...) 
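Underneath, each `*_column_names()` helper is just a named character vector handed to `dplyr::rename(any_of(...))`; a standalone sketch of that mechanism (toy data):

library(dplyr)

subs <- c(time_value = "date", time_value = "week") # every name is "time_value"
df1 <- tibble(date = as.Date("2020-01-01") + 0:2, y = 1:3)
df1 %>% rename(any_of(subs)) # `date` becomes `time_value`

# With two candidates present, rename() itself errors ("Names must be
# unique"), which guess_column_name() catches and rewraps in a clearer message:
df2 <- tibble(date = as.Date("2020-01-01") + 0:2, week = 1:3)
try(df2 %>% rename(any_of(subs)))
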
- x <- guess_time_column_name(x) - x <- guess_geo_column_name(x) + x <- guess_column_name(x, "time_value", time_column_names()) + x <- guess_column_name(x, "geo_value", geo_column_names()) if (!test_subset(c("geo_value", "time_value"), names(x))) { cli_abort( "Either columns `geo_value` and `time_value` must be present in `x`, or related columns (see the internal diff --git a/R/utils.R b/R/utils.R index d2fa3ab4..e65c5fea 100644 --- a/R/utils.R +++ b/R/utils.R @@ -459,102 +459,73 @@ upcase_snake_case <- function(vec) { c(vec, upper_vec) } -#' rename potential time_value columns -#' @keywords internal -guess_time_column_name <- function(x, substitutions = NULL) { - if (!("time_value" %in% names(x))) { - if (is.null(substitutions)) { - substitutions <- c( - time_value = "date", - time_value = "time", - time_value = "datetime", - time_value = "dateTime", - tmie_value = "date_time", - time_value = "target_date", - time_value = "week", - time_value = "epiweek", - time_value = "month", - time_value = "mon", - time_value = "year", - time_value = "yearmon", - time_value = "yearmonth", - time_value = "yearMon", - time_value = "yearMonth", - time_value = "dates", - time_value = "time_values", - time_value = "target_dates" - ) - substitutions <- upcase_snake_case(substitutions) - } - strsplit(substitutions, "_") %>% - map(function(name) paste0(toupper(substr(name, 1, 1)), substr(name, 2, nchar(name)), collapse = "_")) %>% - unlist() - x <- tryCatch(x %>% rename(any_of(substitutions)), - error = function(cond) { - cli_abort("{names(x)[names(x) %in% substitutions]} are both/all valid substitutions. -Either `rename` some yourself or drop some.") - } - ) - if (any(substitutions != "")) { - cli_inform("inferring `time_value` column.") - } - } - return(x) +#' potential time_value columns +#' @description +#' the full list of potential substitutions for the `time_value` column name: +#' `r time_column_names()` +#' @export +time_column_names <- function() { + substitutions <- c( + "time_value", "date", "time", "datetime", "dateTime", "date_time", "target_date", + "week", "epiweek", "month", "mon", "year", "yearmon", "yearmonth", + "yearMon", "yearMonth", "dates", "time_values", "target_dates", "time_Value" + ) + substitutions <- upcase_snake_case(substitutions) + names(substitutions) <- rep("time_value", length(substitutions)) + return(substitutions) +} +# +#' potential geo_value columns +#' @description +#' the full list of potential substitutions for the `geo_value` column name: +#' `r geo_column_names()` +#' @export +geo_column_names <- function() { + substitutions <- c( + "geo_value", "geo_values", "geo_id", "geos", "location", "jurisdiction", "fips", "zip", + "county", "hrr", "msa", "state", "province", "nation", "states", + "provinces", "counties", "geo_Value" + ) + substitutions <- upcase_snake_case(substitutions) + names(substitutions) <- rep("geo_value", length(substitutions)) + return(substitutions) } +#' potential version columns +#' @description +#' the full list of potential substitutions for the `version` column name: +#' `r version_column_names()` +#' @export +version_column_names <- function() { + substitutions <- c( + "version", "issue", "release" + ) + substitutions <- upcase_snake_case(substitutions) + names(substitutions) <- rep("version", length(substitutions)) + return(substitutions) +} +#' rename potential time_value columns +#' +#' @description +#' potentially renames +#' @param x the tibble to potentially rename +#' @param substitions a named vector. 
the potential substitions, with every name `time_value` #' @keywords internal -guess_geo_column_name <- function(x, substitutions = NULL) { - if (!("time_value" %in% names(x))) { - substitutions <- substitutions %||% c( - geo_value = "geo_values", - geo_value = "geo_id", - geo_value = "geos", - geo_value = "location", - geo_value = "jurisdiction", - geo_value = "fips", - geo_value = "zip", - geo_value = "county", - geo_value = "hrr", - geo_value = "msa", - geo_value = "state", - geo_value = "province", - geo_value = "nation", - geo_value = "states", - geo_value = "provinces", - geo_value = "counties" - ) - substitutions <- upcase_snake_case(substitutions) +guess_column_name <- function(x, column_name, substitutions) { + if (!(column_name %in% names(x))) { x <- tryCatch(x %>% rename(any_of(substitutions)), error = function(cond) { cli_abort("{names(x)[names(x) %in% substitutions]} are both/all valid substitutions. Either `rename` some yourself or drop some.") } ) - if (any(substitutions != "")) { - cli_inform("inferring `geo_value` column.") + # if none of the names are in substitutions, and `column_name` isn't a column, we're missing a relevant column + if (!any(names(x) %in% substitutions)) { + cli_abort("There is no {column_name} column or similar name. See e.g. [`time_column_name()`] for a complete list") } - } - return(x) -} - -guess_version_column_name <- function(x, substitutions = NULL) { - if (!("version" %in% names(x))) { - if (is.null(substitutions)) { - substitutions <- c( - version = "issue", - version = "release" - ) - substitutions <- upcase_snake_case(substitutions) - } - x <- tryCatch(x %>% rename(any_of(substitutions)), - error = function(cond) { - cli_abort("{names(x)[names(x) %in% substitutions]} are both/all valid substitutions. -Either `rename` some yourself or drop some.") - } - ) if (any(substitutions != "")) { - cli_inform("inferring `version` column.") + cli_inform("inferring {column_name} column.") } } return(x) diff --git a/man/geo_column_names.Rd b/man/geo_column_names.Rd new file mode 100644 index 00000000..4b3810dc --- /dev/null +++ b/man/geo_column_names.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{geo_column_names} +\alias{geo_column_names} +\title{potential geo_value columns} +\usage{ +geo_column_names() +} +\description{ +the full list of potential substitutions for the \code{geo_value} column name: +geo_value, geo_values, geo_id, geos, location, jurisdiction, fips, zip, county, hrr, msa, state, province, nation, states, provinces, counties, geo_Value, Geo_Value, Geo_Values, Geo_Id, Geos, Location, Jurisdiction, Fips, Zip, County, Hrr, Msa, State, Province, Nation, States, Provinces, Counties, Geo_Value +} diff --git a/man/guess_column_name.Rd b/man/guess_column_name.Rd new file mode 100644 index 00000000..f03a7b80 --- /dev/null +++ b/man/guess_column_name.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{guess_column_name} +\alias{guess_column_name} +\title{rename potential time_value columns} +\usage{ +guess_column_name(x, column_name, substitutions) +} +\arguments{ +\item{x}{the tibble to potentially rename} + +\item{substitions}{a named vector. 
the potential substitions, with every name \code{time_value}} +} +\description{ +potentially renames +} +\keyword{internal} diff --git a/man/guess_time_column_name.Rd b/man/guess_time_column_name.Rd deleted file mode 100644 index f09a0e6e..00000000 --- a/man/guess_time_column_name.Rd +++ /dev/null @@ -1,12 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils.R -\name{guess_time_column_name} -\alias{guess_time_column_name} -\title{rename potential time_value columns} -\usage{ -guess_time_column_name(x, substitutions = NULL) -} -\description{ -rename potential time_value columns -} -\keyword{internal} diff --git a/man/time_column_names.Rd b/man/time_column_names.Rd new file mode 100644 index 00000000..2e2db6b5 --- /dev/null +++ b/man/time_column_names.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{time_column_names} +\alias{time_column_names} +\title{potential time_value columns} +\usage{ +time_column_names() +} +\description{ +the full list of potential substitutions for the \code{time_value} column name: +time_value, date, time, datetime, dateTime, date_time, target_date, week, epiweek, month, mon, year, yearmon, yearmonth, yearMon, yearMonth, dates, time_values, target_dates, time_Value, Time_Value, Date, Time, Datetime, DateTime, Date_Time, Target_Date, Week, Epiweek, Month, Mon, Year, Yearmon, Yearmonth, YearMon, YearMonth, Dates, Time_Values, Target_Dates, Time_Value +} diff --git a/man/version_column_names.Rd b/man/version_column_names.Rd new file mode 100644 index 00000000..75ee2315 --- /dev/null +++ b/man/version_column_names.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{version_column_names} +\alias{version_column_names} +\title{potential version columns} +\usage{ +version_column_names() +} +\description{ +the full list of potential substitutions for the \code{version} column name: +version, issue, release, Version, Issue, Release +} diff --git a/tests/testthat/test-archive.R b/tests/testthat/test-archive.R index 1291e3c7..218baf46 100644 --- a/tests/testthat/test-archive.R +++ b/tests/testthat/test-archive.R @@ -8,13 +8,13 @@ dt <- archive_cases_dv_subset$DT test_that("data.frame must contain geo_value, time_value and version columns", { expect_error(as_epi_archive(select(dt, -geo_value), compactify = FALSE), - regexp = "Columns `geo_value`, `time_value`, and `version` must be present in `x`." + regexp = "There is no geo_value column or similar name" ) - expect_error(as_epi_archive(select(dt, -time_value), compactify = FALSE), - regexp = "Columns `geo_value`, `time_value`, and `version` must be present in `x`." + expect_error(expect_message(as_epi_archive(select(dt, -time_value), compactify = FALSE)), + regexp = "There is no time_value column or similar name" ) expect_error(as_epi_archive(select(dt, -version), compactify = FALSE), - regexp = "Columns `geo_value`, `time_value`, and `version` must be present in `x`." 
+ regexp = "There is no version column or similar name" ) }) diff --git a/tests/testthat/test-epi_df.R b/tests/testthat/test-epi_df.R index 1ccb0bec..25a2dc59 100644 --- a/tests/testthat/test-epi_df.R +++ b/tests/testthat/test-epi_df.R @@ -56,11 +56,11 @@ test_that("as_epi_df works for nonstandard input", { expect_no_error(tib_epi_df <- tib %>% as_epi_df(time_value = date, geo_value = geo_value)) expect_error(expect_message( tib %>% rename(awefa = geo_value) %>% as_epi_df(), - regexp = "inferring `time_value` column." + regexp = "inferring " )) expect_no_error(expect_message( tib %>% rename(awefa = geo_value) %>% as_epi_df(geo_value = awefa), - regexp = "inferring `time_value` column." + regexp = "inferring" )) tib <- tib %>% rename(target_date = date) From edbf1acdfb2c862efd2e0e4a043c3b062b9280a9 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Mon, 8 Jul 2024 12:37:24 -0500 Subject: [PATCH 331/345] Nat's suggestions --- R/archive.R | 6 ++++-- R/epi_df.R | 4 ++-- man/epi_archive.Rd | 3 ++- man/new_epi_df.Rd | 3 +-- 4 files changed, 9 insertions(+), 7 deletions(-) diff --git a/R/archive.R b/R/archive.R index 2adccb5f..1fd40d73 100644 --- a/R/archive.R +++ b/R/archive.R @@ -442,7 +442,8 @@ validate_epi_archive <- function( #' `as_epi_archive` converts a data frame, data table, or tibble into an #' `epi_archive` object. -#' @param ... used for specifying column names, such as `version = release_date` +#' @param ... used for specifying column names, as in [`dplyr::rename`]. For +#' example `version = release_date` #' #' @rdname epi_archive #' @@ -463,7 +464,8 @@ as_epi_archive <- function( x <- guess_column_name(x, "version", version_column_names()) if (!test_subset(c("geo_value", "time_value", "version"), names(x))) { cli_abort( - "Columns `geo_value`, `time_value`, and `version` must be present in `x`." + "Either columns `geo_value`, `time_value`, and `version` must be present in `x`, or related columns (see the internal + functions `guess_time_column_name()`, `guess_geo_column_name()` and/or `guess_geo_version_name()` for complete list)." ) } if (anyMissing(x$version)) { diff --git a/R/epi_df.R b/R/epi_df.R index 2cb68138..e3c77e96 100644 --- a/R/epi_df.R +++ b/R/epi_df.R @@ -95,7 +95,7 @@ NULL #' #' @export new_epi_df <- function(x = tibble::tibble(), geo_type, time_type, as_of, - additional_metadata = list(), ...) { + additional_metadata = list()) { assert_data_frame(x) assert_list(additional_metadata) @@ -279,7 +279,7 @@ as_epi_df.tbl_df <- function(x, geo_type, time_type, as_of, new_epi_df( x, geo_type, time_type, as_of, - additional_metadata, ... + additional_metadata ) } diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd index 943d8e11..19ab1d1e 100644 --- a/man/epi_archive.Rd +++ b/man/epi_archive.Rd @@ -91,7 +91,8 @@ value of \code{clobberable_versions_start} does not fully trust these empty updates, and assumes that any version \verb{>= max(x$version)} could be clobbered.) If \code{nrow(x) == 0}, then this argument is mandatory.} -\item{...}{used for specifying column names, such as \code{version = release_date}} +\item{...}{used for specifying column names, as in \code{\link[dplyr:rename]{dplyr::rename}}. For +example \code{version = release_date}} } \value{ An \code{epi_archive} object. diff --git a/man/new_epi_df.Rd b/man/new_epi_df.Rd index 7182c222..934a716e 100644 --- a/man/new_epi_df.Rd +++ b/man/new_epi_df.Rd @@ -9,8 +9,7 @@ new_epi_df( geo_type, time_type, as_of, - additional_metadata = list(), - ... 
+ additional_metadata = list() ) } \arguments{ From d19d1a5e2538ffa77af429062399cf4e3b2c61ef Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Tue, 9 Jul 2024 11:39:55 -0500 Subject: [PATCH 332/345] avoid arg prefix-completion for versions_end --- R/archive.R | 3 ++- tests/testthat/test-archive.R | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/R/archive.R b/R/archive.R index 1fd40d73..6137cc72 100644 --- a/R/archive.R +++ b/R/archive.R @@ -456,7 +456,8 @@ as_epi_archive <- function( additional_metadata = NULL, compactify = NULL, clobberable_versions_start = NULL, - versions_end = NULL, ...) { + .versions_end = NULL, ..., + versions_end = .versions_end) { assert_data_frame(x) x <- rename(x, ...) x <- guess_column_name(x, "time_value", time_column_names()) diff --git a/tests/testthat/test-archive.R b/tests/testthat/test-archive.R index 218baf46..c606b664 100644 --- a/tests/testthat/test-archive.R +++ b/tests/testthat/test-archive.R @@ -18,6 +18,24 @@ test_that("data.frame must contain geo_value, time_value and version columns", { ) }) +test_that("as_epi_archive custom name mapping works correctly", { + # custom name works correctly + suppressWarnings(expect_equal( + as_epi_archive(rename(dt, weirdName = version), version = weirdName), + as_epi_archive(dt) + )) + suppressWarnings(expect_equal( + as_epi_archive(rename(dt, weirdName = geo_value), geo_value = weirdName), + as_epi_archive(dt) + )) + suppressWarnings(expect_equal( + as_epi_archive(rename(dt, weirdName = time_value), time_value = weirdName), + as_epi_archive(dt) + )) + + expect_error(as_epi_archive(rename(dt, weirdName = version), version = weirdName, version = time_value), "Names must be unique") +}) + test_that("other_keys can only contain names of the data.frame columns", { expect_error(as_epi_archive(dt, other_keys = "xyz", compactify = FALSE), regexp = "`other_keys` must be contained in the column names of `x`." 
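The dodge rests on R's argument-matching rules: names supplied before `...` in the formals can be partially matched, while names after `...` must match exactly. A toy illustration (not the package API):

f <- function(versions_end = NULL, ...) versions_end
f(version = "2021-12-01") # "2021-12-01" -- `version` prefix-matches `versions_end`!

g <- function(.versions_end = NULL, ..., versions_end = .versions_end) {
  list(versions_end = versions_end, dots = names(list(...)))
}
g(version = "2021-12-01") # `version` now lands in `...`, where it can act as a rename spec
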
From f6904f27beb35ff933423f280d2784fc5208491b Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Tue, 9 Jul 2024 16:49:18 +0000 Subject: [PATCH 333/345] style: styler (GHA) --- tests/testthat/test-archive.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/testthat/test-archive.R b/tests/testthat/test-archive.R index c606b664..1858da86 100644 --- a/tests/testthat/test-archive.R +++ b/tests/testthat/test-archive.R @@ -21,7 +21,7 @@ test_that("data.frame must contain geo_value, time_value and version columns", { test_that("as_epi_archive custom name mapping works correctly", { # custom name works correctly suppressWarnings(expect_equal( - as_epi_archive(rename(dt, weirdName = version), version = weirdName), + as_epi_archive(rename(dt, weirdName = version), version = weirdName), as_epi_archive(dt) )) suppressWarnings(expect_equal( @@ -29,7 +29,7 @@ test_that("as_epi_archive custom name mapping works correctly", { as_epi_archive(dt) )) suppressWarnings(expect_equal( - as_epi_archive(rename(dt, weirdName = time_value), time_value = weirdName), + as_epi_archive(rename(dt, weirdName = time_value), time_value = weirdName), as_epi_archive(dt) )) From 76388e2052def411b22b6d7ff89791dbc6de83fd Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Tue, 9 Jul 2024 16:49:21 +0000 Subject: [PATCH 334/345] docs: document (GHA) --- man/epi_archive.Rd | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd index 19ab1d1e..a813c6de 100644 --- a/man/epi_archive.Rd +++ b/man/epi_archive.Rd @@ -37,8 +37,9 @@ as_epi_archive( additional_metadata = NULL, compactify = NULL, clobberable_versions_start = NULL, - versions_end = NULL, - ... + .versions_end = NULL, + ..., + versions_end = .versions_end ) } \arguments{ From 500a952f4101c52bba297317698a58a60e805946 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 12 Jul 2024 11:39:05 -0500 Subject: [PATCH 335/345] formatting --- R/archive.R | 8 ++++++-- man/epi_archive.Rd | 4 ++++ tests/testthat/test-archive.R | 4 +++- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/R/archive.R b/R/archive.R index 6137cc72..91f5d4a7 100644 --- a/R/archive.R +++ b/R/archive.R @@ -444,6 +444,9 @@ validate_epi_archive <- function( #' `epi_archive` object. #' @param ... used for specifying column names, as in [`dplyr::rename`]. For #' example `version = release_date` +#' @param .versions_end location based versions_end, used to avoid prefix +#' `version = issue` from being assigned to `versions_end` instead of being +#' used to rename columns. #' #' @rdname epi_archive #' @@ -465,8 +468,9 @@ as_epi_archive <- function( x <- guess_column_name(x, "version", version_column_names()) if (!test_subset(c("geo_value", "time_value", "version"), names(x))) { cli_abort( - "Either columns `geo_value`, `time_value`, and `version` must be present in `x`, or related columns (see the internal - functions `guess_time_column_name()`, `guess_geo_column_name()` and/or `guess_geo_version_name()` for complete list)." + "Either columns `geo_value`, `time_value`, and `version` must be present in `x`, +or related columns (see the internal functions `guess_time_column_name()`, +`guess_geo_column_name()` and/or `guess_geo_version_name()` for complete list)." 
) } if (anyMissing(x$version)) { diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd index a813c6de..99203052 100644 --- a/man/epi_archive.Rd +++ b/man/epi_archive.Rd @@ -92,6 +92,10 @@ value of \code{clobberable_versions_start} does not fully trust these empty updates, and assumes that any version \verb{>= max(x$version)} could be clobbered.) If \code{nrow(x) == 0}, then this argument is mandatory.} +\item{.versions_end}{location based versions_end, used to avoid prefix +\code{version = issue} from being assigned to \code{versions_end} instead of being +used to rename columns.} + \item{...}{used for specifying column names, as in \code{\link[dplyr:rename]{dplyr::rename}}. For example \code{version = release_date}} } diff --git a/tests/testthat/test-archive.R b/tests/testthat/test-archive.R index 1858da86..54501a31 100644 --- a/tests/testthat/test-archive.R +++ b/tests/testthat/test-archive.R @@ -33,7 +33,9 @@ test_that("as_epi_archive custom name mapping works correctly", { as_epi_archive(dt) )) - expect_error(as_epi_archive(rename(dt, weirdName = version), version = weirdName, version = time_value), "Names must be unique") + expect_error(as_epi_archive(rename(dt, weirdName = version), + version = weirdName, version = time_value + ), "Names must be unique") }) test_that("other_keys can only contain names of the data.frame columns", { From 1b84d010aad36f046d5828cf2044a8f368fbfd62 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 12 Jul 2024 15:33:57 -0500 Subject: [PATCH 336/345] template needed changing --- R/epi_df.R | 1 + R/utils.R | 2 +- man-roxygen/epi_df-params.R | 1 - man/guess_column_name.Rd | 2 +- man/new_epi_df.Rd | 2 -- tests/testthat/test-archive.R | 10 +++++++--- 6 files changed, 10 insertions(+), 8 deletions(-) diff --git a/R/epi_df.R b/R/epi_df.R index e3c77e96..d8b399fb 100644 --- a/R/epi_df.R +++ b/R/epi_df.R @@ -162,6 +162,7 @@ new_epi_df <- function(x = tibble::tibble(), geo_type, time_type, as_of, #' guide](https://cmu-delphi.github.io/epiprocess/articles/epiprocess.html) for #' examples. #' +#' @param ... Additional arguments passed to methods. #' @template epi_df-params #' #' @export diff --git a/R/utils.R b/R/utils.R index e65c5fea..24da3995 100644 --- a/R/utils.R +++ b/R/utils.R @@ -510,7 +510,7 @@ version_column_names <- function() { #' @description #' potentially renames #' @param x the tibble to potentially rename -#' @param substitions a named vector. the potential substitions, with every name `time_value` +#' @param substitutions a named vector. the potential substitions, with every name `time_value` #' @keywords internal guess_column_name <- function(x, column_name, substitutions) { if (!(column_name %in% names(x))) { diff --git a/man-roxygen/epi_df-params.R b/man-roxygen/epi_df-params.R index 54d8c2d2..59c51603 100644 --- a/man-roxygen/epi_df-params.R +++ b/man-roxygen/epi_df-params.R @@ -15,5 +15,4 @@ #' `as_of` fields; named entries from the passed list will be included as #' well. If your tibble has additional keys, be sure to specify them as a #' character vector in the `other_keys` component of `additional_metadata`. -#' @param ... Additional arguments passed to methods. #' @return An `epi_df` object. diff --git a/man/guess_column_name.Rd b/man/guess_column_name.Rd index f03a7b80..d4aa09b7 100644 --- a/man/guess_column_name.Rd +++ b/man/guess_column_name.Rd @@ -9,7 +9,7 @@ guess_column_name(x, column_name, substitutions) \arguments{ \item{x}{the tibble to potentially rename} -\item{substitions}{a named vector. 
the potential substitions, with every name \code{time_value}} +\item{substitutions}{a named vector. the potential substitions, with every name \code{time_value}} } \description{ potentially renames diff --git a/man/new_epi_df.Rd b/man/new_epi_df.Rd index 934a716e..8010b700 100644 --- a/man/new_epi_df.Rd +++ b/man/new_epi_df.Rd @@ -34,8 +34,6 @@ then the current day-time will be used.} \code{as_of} fields; named entries from the passed list will be included as well. If your tibble has additional keys, be sure to specify them as a character vector in the \code{other_keys} component of \code{additional_metadata}.} - -\item{...}{Additional arguments passed to methods.} } \value{ An \code{epi_df} object. diff --git a/tests/testthat/test-archive.R b/tests/testthat/test-archive.R index 54501a31..679e4dbd 100644 --- a/tests/testthat/test-archive.R +++ b/tests/testthat/test-archive.R @@ -33,9 +33,13 @@ test_that("as_epi_archive custom name mapping works correctly", { as_epi_archive(dt) )) - expect_error(as_epi_archive(rename(dt, weirdName = version), - version = weirdName, version = time_value - ), "Names must be unique") + expect_error( + as_epi_archive( + rename(dt, weirdName = version), + version = weirdName, + version = time_value + ), "Names must be unique" + ) }) test_that("other_keys can only contain names of the data.frame columns", { From c59af46b494d88c587a7c386d779ed714eb0824c Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 12 Jul 2024 16:48:34 -0500 Subject: [PATCH 337/345] pkgdown fix for new functions, apparently --- _pkgdown.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/_pkgdown.yml b/_pkgdown.yml index 4930c9f5..b95a6386 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -63,6 +63,7 @@ reference: desc: Details on `epi_df` format, and basic functionality. - contents: - matches("epi_df") + - matches("column_names") - title: "`epi_*()` functions" desc: Functions that act on `epi_df` objects. - contents: From 20884154b12522cf0f9b860bc557c5e14bca626a Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Fri, 19 Jul 2024 14:29:12 -0400 Subject: [PATCH 338/345] document ... --- R/utils.R | 2 ++ man/guess_period.Rd | 2 ++ 2 files changed, 4 insertions(+) diff --git a/R/utils.R b/R/utils.R index f899957e..4154395d 100644 --- a/R/utils.R +++ b/R/utils.R @@ -679,11 +679,13 @@ gcd_num <- function(dividends, ..., rrtol = 1e-6, pqlim = 1e6, irtol = 1e-6) { #' @param time_values_arg Optional, string; name to give `time_values` in error #' messages. Defaults to quoting the expression the caller fed into the #' `time_values` argument. +#' @param ... Should be empty, there to satisfy the S3 generic. #' @return length-1 vector; `r lifecycle::badge("experimental")` class will #' either be the same class as [`base::diff()`] on such time values, an #' integer, or a double, such that all `time_values` can be exactly obtained #' by adding `k * result` for an integer k, and such that there is no smaller #' `result` that can achieve this. +#' #' @export guess_period <- function(time_values, time_values_arg = rlang::caller_arg(time_values), ...) { UseMethod("guess_period") diff --git a/man/guess_period.Rd b/man/guess_period.Rd index 7d53eba2..0be9fdf2 100644 --- a/man/guess_period.Rd +++ b/man/guess_period.Rd @@ -17,6 +17,8 @@ data, with at least two distinct values.} \item{time_values_arg}{Optional, string; name to give \code{time_values} in error messages. 
Defaults to quoting the expression the caller fed into the \code{time_values} argument.} + +\item{...}{Should be empty, there to satisfy the S3 generic.} } \value{ length-1 vector; \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} class will From 7d99e6df8f10cd592f4c72cd37cf404f18984eec Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 19 Jul 2024 15:21:27 -0500 Subject: [PATCH 339/345] recs from Nat, local checks passing --- R/archive.R | 7 ++++--- R/epi_df.R | 6 ++++-- R/utils.R | 36 +++++++++++++++++++++++++---------- tests/testthat/test-archive.R | 32 ++++++++++++++++++------------- tests/testthat/test-epi_df.R | 27 ++++++++++++++++++-------- 5 files changed, 72 insertions(+), 36 deletions(-) diff --git a/R/archive.R b/R/archive.R index 91f5d4a7..780279b0 100644 --- a/R/archive.R +++ b/R/archive.R @@ -468,9 +468,10 @@ as_epi_archive <- function( x <- guess_column_name(x, "version", version_column_names()) if (!test_subset(c("geo_value", "time_value", "version"), names(x))) { cli_abort( - "Either columns `geo_value`, `time_value`, and `version` must be present in `x`, -or related columns (see the internal functions `guess_time_column_name()`, -`guess_geo_column_name()` and/or `guess_geo_version_name()` for complete list)." + "Either columns `geo_value`, `time_value`, and `version`, or related columns + (see the internal functions `guess_time_column_name()`, + `guess_geo_column_name()` and/or `guess_geo_version_name()` for complete + list) must be present in `x`." ) } if (anyMissing(x$version)) { diff --git a/R/epi_df.R b/R/epi_df.R index d8b399fb..707944f6 100644 --- a/R/epi_df.R +++ b/R/epi_df.R @@ -273,8 +273,10 @@ as_epi_df.tbl_df <- function(x, geo_type, time_type, as_of, x <- guess_column_name(x, "geo_value", geo_column_names()) if (!test_subset(c("geo_value", "time_value"), names(x))) { cli_abort( - "Either columns `geo_value` and `time_value` must be present in `x`, or related columns (see the internal - functions `guess_time_column_name()` and/or `guess_geo_column_name()` for a complete list)." + "Either columns `geo_value` and `time_value` or related columns + (see the internal functions `guess_time_column_name()` and/or + `guess_geo_column_name()` for a complete list) + must be present in `x`." ) } diff --git a/R/utils.R b/R/utils.R index 24da3995..9396070d 100644 --- a/R/utils.R +++ b/R/utils.R @@ -512,21 +512,37 @@ version_column_names <- function() { #' @param x the tibble to potentially rename #' @param substitutions a named vector. the potential substitions, with every name `time_value` #' @keywords internal +#' @importFrom cli cli_inform cli_abort +#' @importFrom dplyr rename guess_column_name <- function(x, column_name, substitutions) { if (!(column_name %in% names(x))) { - x <- tryCatch(x %>% rename(any_of(substitutions)), - error = function(cond) { - cli_abort("{names(x)[names(x) %in% substitutions]} are both/all valid substitutions. -Either `rename` some yourself or drop some.") - } - ) # if none of the names are in substitutions, and `column_name` isn't a column, we're missing a relevant column if (!any(names(x) %in% substitutions)) { - cli_abort("There is no {column_name} column or similar name. See e.g. [`time_column_name()`] for a complete list") - } - if (any(substitutions != "")) { - cli_inform("inferring {column_name} column.") + cli_abort( + "There is no {column_name} column or similar name. + See e.g. 
[`time_column_name()`] for a complete list", + class = "epiprocess__guess_column__multiple_substitution_error" + ) } + + tryCatch( + { + x <- x %>% rename(any_of(substitutions)) + cli_inform( + "inferring {column_name} column.", + class = "epiprocess__guess_column_inferring_inform" + ) + return(x) + }, + error = function(cond) { + cli_abort( + "{intersect(names(x), substitutions)} + are both/all valid substitutions for {column_name}. + Either `rename` some yourself or drop some.", + class = "epiprocess__guess_column__multiple_substitution_error" + ) + } + ) } return(x) } diff --git a/tests/testthat/test-archive.R b/tests/testthat/test-archive.R index 679e4dbd..d437c983 100644 --- a/tests/testthat/test-archive.R +++ b/tests/testthat/test-archive.R @@ -10,7 +10,7 @@ test_that("data.frame must contain geo_value, time_value and version columns", { expect_error(as_epi_archive(select(dt, -geo_value), compactify = FALSE), regexp = "There is no geo_value column or similar name" ) - expect_error(expect_message(as_epi_archive(select(dt, -time_value), compactify = FALSE)), + expect_error(as_epi_archive(select(dt, -time_value), compactify = FALSE), regexp = "There is no time_value column or similar name" ) expect_error(as_epi_archive(select(dt, -version), compactify = FALSE), @@ -20,18 +20,24 @@ test_that("data.frame must contain geo_value, time_value and version columns", { test_that("as_epi_archive custom name mapping works correctly", { # custom name works correctly - suppressWarnings(expect_equal( - as_epi_archive(rename(dt, weirdName = version), version = weirdName), - as_epi_archive(dt) - )) - suppressWarnings(expect_equal( - as_epi_archive(rename(dt, weirdName = geo_value), geo_value = weirdName), - as_epi_archive(dt) - )) - suppressWarnings(expect_equal( - as_epi_archive(rename(dt, weirdName = time_value), time_value = weirdName), - as_epi_archive(dt) - )) + expect_equal( + as_epi_archive(rename(dt, weirdName = version), + version = weirdName, compactify = TRUE + ), + as_epi_archive(dt, compactify = TRUE) + ) + expect_equal( + as_epi_archive(rename(dt, weirdName = geo_value), + geo_value = weirdName, compactify = TRUE + ), + as_epi_archive(dt, compactify = TRUE) + ) + expect_equal( + as_epi_archive(rename(dt, weirdName = time_value), + time_value = weirdName, compactify = TRUE + ), + as_epi_archive(dt, compactify = TRUE) + ) expect_error( as_epi_archive( diff --git a/tests/testthat/test-epi_df.R b/tests/testthat/test-epi_df.R index 25a2dc59..1c5e527f 100644 --- a/tests/testthat/test-epi_df.R +++ b/tests/testthat/test-epi_df.R @@ -52,22 +52,33 @@ test_that("as_epi_df works for nonstandard input", { date = rep(seq(as.Date("2020-01-01"), by = 1, length.out = 5), times = 2), geo_value = rep(c("ca", "hi"), each = 5) ) - expect_message(expect_no_error(tib_epi_df <- tib %>% as_epi_df())) + expect_message(expect_no_error(tib_epi_df <- tib %>% as_epi_df()), + class = "epiprocess__guess_column_inferring_inform" + ) expect_no_error(tib_epi_df <- tib %>% as_epi_df(time_value = date, geo_value = geo_value)) - expect_error(expect_message( - tib %>% rename(awefa = geo_value) %>% as_epi_df(), - regexp = "inferring " - )) + expect_error( + expect_message( + tib %>% + rename(awefa = geo_value) %>% + as_epi_df(), + class = "epiprocess__guess_column_inferring_inform" + ), + class = "epiprocess__guess_column__multiple_substitution_error" + ) expect_no_error(expect_message( tib %>% rename(awefa = geo_value) %>% as_epi_df(geo_value = awefa), - regexp = "inferring" + class = 
"epiprocess__guess_column_inferring_inform" )) tib <- tib %>% rename(target_date = date) - expect_message(expect_no_error(tib_epi_df <- tib %>% as_epi_df())) + expect_message(expect_no_error(tib_epi_df <- tib %>% as_epi_df()), + class = "epiprocess__guess_column_inferring_inform" + ) tib <- tib %>% mutate(Time = 20 + target_date) - expect_error(tib_epi_df <- tib %>% as_epi_df()) + expect_error(tib_epi_df <- tib %>% as_epi_df(), + class = "epiprocess__guess_column__multiple_substitution_error" + ) }) # select fixes From 243c45e9c949653dc6b89608dd68c1722a5fb1a7 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 19 Jul 2024 15:43:37 -0500 Subject: [PATCH 340/345] desc, news --- DESCRIPTION | 2 +- NEWS.md | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index ac188dc9..21dda4f5 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: epiprocess Title: Tools for basic signal processing in epidemiology -Version: 0.7.12 +Version: 0.7.13 Authors@R: c( person("Jacob", "Bien", role = "ctb"), person("Logan", "Brooks", email = "lcbrooks@andrew.cmu.edu", role = c("aut", "cre")), diff --git a/NEWS.md b/NEWS.md index ed9dea40..7be31506 100644 --- a/NEWS.md +++ b/NEWS.md @@ -40,6 +40,11 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat - Improved documentation web site landing page's introduction. - Fixed documentation referring to old `epi_slide()` interface (#466, thanks @XuedaShen!). +- `as_epi_df` and `as_epi_archive` now support arguments to specify column names + e.g. `as_epi_df(some_tibble, geo_value=state)`. In addition, there is a list + of default conversions, see `time_column_names` for a list of columns that + will automatically be recognized and converted to `time_value` column (there + are similar functions for `geo` and `version`). ## Cleanup - Resolved some linting messages in package checks (#468). From a5f397f228feac85002c2eeeecc9f21845a280d2 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Fri, 19 Jul 2024 16:01:14 -0700 Subject: [PATCH 341/345] Refactor musing in tests into Issue + link (#485) --- tests/testthat/test-utils.R | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R index b69277f2..3067ba8a 100644 --- a/tests/testthat/test-utils.R +++ b/tests/testthat/test-utils.R @@ -241,10 +241,9 @@ test_that("guess_period works", { expect_identical(guess_period(c(1, 8, 15)), 7) expect_identical(guess_period(c(1L, 8L, 15L)), 7L) expect_identical(guess_period(c(0, 7, 14, 15)), 1) - # We currently allow the guessed frequency to no appear in the diffs, but this - # might not be a good idea as it likely indicates an issue with the data. If - # we drop this behavior we could also drop the gcd algorithm by just checking - # the validity of the smallest diff: + # We currently allow the guessed frequency to not appear in the diffs, but + # this might not be a good idea as it likely indicates an issue with the data + # (#485). 
expect_identical(guess_period(c(0, 2, 5)), 1) expect_identical(guess_period(c(0, 4, 10)), 2) # On Dates: From f79821e554a9952e5ea8562e18646966ba577197 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Tue, 16 Jul 2024 12:41:59 -0700 Subject: [PATCH 342/345] ci: update checkout action --- .github/workflows/R-CMD-check.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index c863c4f3..5f76fee6 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -19,7 +19,7 @@ jobs: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} R_KEEP_PKG_SOURCE: yes steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: r-lib/actions/setup-r@v2 with: From 7a50d9d43c4cf8feaf9f8fdf445de94de7317a7a Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Thu, 20 Jun 2024 18:47:12 -0700 Subject: [PATCH 343/345] refactor(time_types): refactor time types * guess time_type of "day", "week", "yearmonth", "integer" from time column and warn if not compatible * restrict `before` and `after` to types compatible with time column * deprecate geo_type and time_type constructor arguments, infer only * improve documentation on geo and time types * enforce time_value and version being same type in epi_archive * update vignettes * move arg validation from new_epi_df to as_epi_df.tbl_df to match epi_archive a --- NAMESPACE | 1 + NEWS.md | 19 +- R/archive.R | 114 ++- R/epi_df.R | 250 +++--- R/epiprocess.R | 1 + R/grouped_epi_archive.R | 31 +- R/methods-epi_archive.R | 40 +- R/methods-epi_df.R | 18 +- R/slide.R | 240 ++---- R/utils.R | 102 ++- man-roxygen/basic-slide-details.R | 34 + man-roxygen/basic-slide-params.R | 41 +- man-roxygen/epi_df-params.R | 12 +- man-roxygen/opt-slide-details.R | 31 +- man/as_epi_df.Rd | 148 ---- man/epi_archive.Rd | 88 +- man/epi_df.Rd | 178 +++- man/epi_slide.Rd | 79 +- man/epi_slide_mean.Rd | 75 +- man/epi_slide_opt.Rd | 75 +- man/epi_slide_sum.Rd | 75 +- man/epix_as_of.Rd | 2 +- man/epix_slide.Rd | 21 +- man/new_epi_df.Rd | 45 - tests/testthat/test-archive-version-bounds.R | 2 +- tests/testthat/test-archive.R | 63 +- tests/testthat/test-autoplot.R | 16 +- tests/testthat/test-compactify.R | 7 +- tests/testthat/test-correlation.R | 2 +- tests/testthat/test-epi_df.R | 22 +- tests/testthat/test-epi_slide.R | 788 +++++++----------- .../testthat/test-epix_fill_through_version.R | 45 +- tests/testthat/test-epix_merge.R | 130 ++- tests/testthat/test-epix_slide.R | 200 ++--- tests/testthat/test-methods-epi_df.R | 4 +- tests/testthat/test-utils.R | 56 +- vignettes/aggregation.Rmd | 33 - vignettes/archive.Rmd | 5 +- vignettes/epiprocess.Rmd | 30 +- 39 files changed, 1326 insertions(+), 1797 deletions(-) create mode 100644 man-roxygen/basic-slide-details.R delete mode 100644 man/as_epi_df.Rd delete mode 100644 man/new_epi_df.Rd diff --git a/NAMESPACE b/NAMESPACE index e571858f..f8610226 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -102,6 +102,7 @@ importFrom(checkmate,check_atomic) importFrom(checkmate,check_data_frame) importFrom(checkmate,check_names) importFrom(checkmate,expect_class) +importFrom(checkmate,test_int) importFrom(checkmate,test_set_equal) importFrom(checkmate,test_subset) importFrom(checkmate,vname) diff --git a/NEWS.md b/NEWS.md index 3d7ea718..e186b8fe 100644 --- a/NEWS.md +++ b/NEWS.md @@ -5,9 +5,10 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat # epiprocess 0.8 ## Breaking changes + - `detect_outlr_stl(seasonal_period = NULL)` 
is no longer accepted. Use
  `detect_outlr_stl(seasonal_period = , seasonal_as_residual = TRUE)`
  instead. See `?detect_outlr_stl` for more details.
 
 ## Improvements
 
@@ -49,15 +50,23 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat
   output a huge number of `ref_time_values` spaced apart by mere seconds.
 
 ## Cleanup
-- Resolved some linting messages in package checks (#468).
 
-## Cleanup
+- Resolved some linting messages in package checks (#468).
 - Added optional `decay_to_tibble` attribute controlling `as_tibble()` behavior
   of `epi_df`s to let `{epipredict}` work more easily with other libraries (#471).
-
-## Cleanup
 - Removed some external package dependencies.
 
+## Breaking Changes
+
+- `epi_df`s are now more strict about what types they allow in the time column.
+  Namely, we are explicit about only supporting `Date` at the daily and weekly
+  cadence and generic integer types (for yearly cadence).
+- `epi_slide` `before` and `after` arguments now require the user to
+  specify time units in certain cases. The `time_step` argument has been
+  removed.
+- `epix_slide` `before` argument now defaults to `Inf`, and requires the user to
+  specify units in some cases. The `time_step` argument has been removed.
+
 # epiprocess 0.7.0
 
 ## Breaking changes:
 
diff --git a/R/archive.R b/R/archive.R
index 780279b0..052f2776 100644
--- a/R/archive.R
+++ b/R/archive.R
@@ -170,11 +170,8 @@ NULL
 #' The data table `DT` has key variables `geo_value`, `time_value`, `version`,
 #' as well as any others (these can be specified when instantiating the
 #' `epi_archive` object via the `other_keys` argument, and/or set by operating
-#' on `DT` directly). Refer to the documentation for `as_epi_archive()` for
-#' information and examples of relevant parameter names for an `epi_archive`
-#' object. Note that there can only be a single row per unique combination of
-#' key variables, and thus the key variables are critical for figuring out how
-#' to generate a snapshot of data from the archive, as of a given version.
+#' on `DT` directly). Note that there can only be a single row per unique
+#' combination of key variables.
 #'
 #' @section Metadata:
 #' The following pieces of metadata are included as fields in an `epi_archive`
@@ -184,18 +181,15 @@ NULL
 #' * `time_type`: the type for the time values.
 #' * `additional_metadata`: list of additional metadata for the data archive.
 #'
-#' Unlike an `epi_df` object, metadata for an `epi_archive` object `x` can be
-#' accessed (and altered) directly, as in `x$geo_type` or `x$time_type`,
-#' etc. Like an `epi_df` object, the `geo_type` and `time_type` fields in the
-#' metadata of an `epi_archive` object are not currently used by any
-#' downstream functions in the `epiprocess` package, and serve only as useful
-#' bits of information to convey about the data set at hand.
+#' While this metadata is not protected, it is generally recommended to treat it
+#' as read-only, and to use the `epi_archive` methods to interact with the data
+#' archive. Unexpected behavior may result from modifying the metadata
+#' directly.
 #'
 #' @section Generating Snapshots:
 #' An `epi_archive` object can be used to generate a snapshot of the data in
-#' `epi_df` format, which represents the most up-to-date values of the signal
-#' variables, as of the specified version. This is accomplished by calling
-#' `epix_as_of()`. 
+#' `epi_df` format, which represents the most up-to-date time series values up +#' to a point in time. This is accomplished by calling `epix_as_of()`. #' #' @section Sliding Computations: #' We can run a sliding computation over an `epi_archive` object, much like @@ -208,19 +202,18 @@ NULL #' #' @param x A data.frame, data.table, or tibble, with columns `geo_value`, #' `time_value`, `version`, and then any additional number of columns. -#' @param geo_type Type for the geo values. If missing, then the function will -#' attempt to infer it from the geo values present; if this fails, then it -#' will be set to "custom". -#' @param time_type Type for the time values. If missing, then the function will -#' attempt to infer it from the time values present; if this fails, then it -#' will be set to "custom". +#' @param geo_type DEPRECATED Has no effect. Geo value type is inferred from the +#' location column and set to "custom" if not recognized. +#' @param time_type DEPRECATED Has no effect. Time value type inferred from the time +#' column and set to "custom" if not recognized. Unpredictable behavior may result +#' if the time type is not recognized. #' @param other_keys Character vector specifying the names of variables in `x` #' that should be considered key variables (in the language of `data.table`) #' apart from "geo_value", "time_value", and "version". #' @param additional_metadata List of additional metadata to attach to the -#' `epi_archive` object. The metadata will have `geo_type` and `time_type` -#' fields; named entries from the passed list or will be included as well. -#' @param compactify Optional; Boolean or `NULL`. `TRUE` will remove some +#' `epi_archive` object. The metadata will have the `geo_type` field; named +#' entries from the passed list or will be included as well. +#' @param compactify Optional; Boolean. `TRUE` will remove some #' redundant rows, `FALSE` will not, and missing or `NULL` will remove #' redundant rows, but issue a warning. See more information at `compactify`. #' @param clobberable_versions_start Optional; `length`-1; either a value of the @@ -269,10 +262,7 @@ NULL #' value = rnorm(10, mean = 2, sd = 1) #' ) #' -#' toy_epi_archive <- tib %>% as_epi_archive( -#' geo_type = "state", -#' time_type = "day" -#' ) +#' toy_epi_archive <- tib %>% as_epi_archive() #' toy_epi_archive #' #' # Ex. 
with an additional key for county @@ -295,21 +285,17 @@ NULL #' cases_rate = c(0.01, 0.02, 0.01, 0.05) #' ) #' -#' x <- df %>% as_epi_archive( -#' geo_type = "state", -#' time_type = "day", -#' other_keys = "county" -#' ) +#' x <- df %>% as_epi_archive(other_keys = "county") #' new_epi_archive <- function( x, - geo_type = NULL, - time_type = NULL, - other_keys = NULL, - additional_metadata = NULL, - compactify = NULL, - clobberable_versions_start = NULL, - versions_end = NULL) { + geo_type, + time_type, + other_keys, + additional_metadata, + compactify, + clobberable_versions_start, + versions_end) { # Create the data table; if x was an un-keyed data.table itself, # then the call to as.data.table() will fail to set keys, so we # need to check this, then do it manually if needed @@ -398,13 +384,11 @@ new_epi_archive <- function( #' @export validate_epi_archive <- function( x, - geo_type = NULL, - time_type = NULL, - other_keys = NULL, - additional_metadata = NULL, - compactify = NULL, - clobberable_versions_start = NULL, - versions_end = NULL) { + other_keys, + additional_metadata, + compactify, + clobberable_versions_start, + versions_end) { # Finish off with small checks on keys variables and metadata if (!test_subset(other_keys, names(x))) { cli_abort("`other_keys` must be contained in the column names of `x`.") @@ -413,12 +397,20 @@ validate_epi_archive <- function( cli_abort("`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\".") } if (any(names(additional_metadata) %in% c("geo_type", "time_type"))) { - cli_warn("`additional_metadata` names overlap with existing metadata fields \"geo_type\", \"time_type\".") + cli_warn("`additional_metadata` names overlap with existing metadata fields \"geo_type\" or \"time_type\".") } # Conduct checks and apply defaults for `compactify` assert_logical(compactify, len = 1, any.missing = FALSE, null.ok = TRUE) + # Make sure `time_value` and `version` have the same time type + if (!identical(class(x[["time_value"]]), class(x[["version"]]))) { + cli_abort( + "`time_value` and `version` must have the same class.", + class = "epiprocess__time_value_version_mismatch" + ) + } + # Apply defaults and conduct checks for # `clobberable_versions_start`, `versions_end`: validate_version_bound(clobberable_versions_start, x, na_ok = TRUE) @@ -453,13 +445,13 @@ validate_epi_archive <- function( #' @export as_epi_archive <- function( x, - geo_type = NULL, - time_type = NULL, - other_keys = NULL, - additional_metadata = NULL, + geo_type = deprecated(), + time_type = deprecated(), + other_keys = character(0L), + additional_metadata = list(), compactify = NULL, - clobberable_versions_start = NULL, - .versions_end = NULL, ..., + clobberable_versions_start = NA, + .versions_end = max_version_with_row_in(x), ..., versions_end = .versions_end) { assert_data_frame(x) x <- rename(x, ...) @@ -477,16 +469,18 @@ as_epi_archive <- function( if (anyMissing(x$version)) { cli_abort("Column `version` must not contain missing values.") } + if (lifecycle::is_present(geo_type)) { + cli_warn("epi_archive constructor argument `geo_type` is now ignored. Consider removing.") + } + if (lifecycle::is_present(time_type)) { + cli_warn("epi_archive constructor argument `time_type` is now ignored. 
Consider removing.") + } - geo_type <- geo_type %||% guess_geo_type(x$geo_value) - time_type <- time_type %||% guess_time_type(x$time_value) - other_keys <- other_keys %||% character(0L) - additional_metadata <- additional_metadata %||% list() - clobberable_versions_start <- clobberable_versions_start %||% NA - versions_end <- versions_end %||% max_version_with_row_in(x) + geo_type <- guess_geo_type(x$geo_value) + time_type <- guess_time_type(x$time_value) validate_epi_archive( - x, geo_type, time_type, other_keys, additional_metadata, + x, other_keys, additional_metadata, compactify, clobberable_versions_start, versions_end ) new_epi_archive( diff --git a/R/epi_df.R b/R/epi_df.R index 707944f6..37f26b87 100644 --- a/R/epi_df.R +++ b/R/epi_df.R @@ -1,30 +1,39 @@ -#' @title `epi_df` object +#' `epi_df` object #' -#' @description An `epi_df` is a tibble with certain minimal column structure -#' and metadata. It can be seen as a snapshot of a data set that contains the -#' most up-to-date values of some signal variables of interest, as of a given -#' time. +#' An `epi_df` is a tibble with certain minimal column structure and metadata. +#' It can be seen as a snapshot of a data set that contains the most +#' up-to-date values of some signal variables of interest, as of a given time. #' #' @details An `epi_df` is a tibble with (at least) the following columns: #' -#' * `geo_value`: the geographic value associated with each row of measurements. -#' * `time_value`: the time value associated with each row of measurements. +#' - `geo_value`: A character vector representing the geographical unit of +#' observation. This could be a country code, a state name, a county code, +#' etc. +#' - `time_value`: A date or integer vector representing the time of observation. #' #' Other columns can be considered as measured variables, which we also refer to #' as signal variables. An `epi_df` object also has metadata with (at least) #' the following fields: #' #' * `geo_type`: the type for the geo values. -#' * `time_type`: the type for the time values. #' * `as_of`: the time value at which the given data were available. #' +#' Most users should use `as_epi_df`. The input tibble `x` to the constructor +#' must contain the columns `geo_value` and `time_value`. All other columns +#' will be preserved as is, and treated as measured variables. If `as_of` is +#' missing, then the function will try to guess it from an `as_of`, `issue`, +#' or `version` column of `x` (if any of these are present), or from as an +#' `as_of` field in its metadata (stored in its attributes); if this fails, +#' then the current day-time will be used. The `new_epi_df` constructor +#' assumes its arguments have already been validated, so it should mainly be +#' used by advanced users. +#' #' Metadata for an `epi_df` object `x` can be accessed (and altered) via -#' `attributes(x)$metadata`. The first two fields in the above list, -#' `geo_type` and `time_type`, can usually be inferred from the `geo_value` -#' and `time_value` columns, respectively. They are not currently used by any -#' downstream functions in the `epiprocess` package, and serve only as useful -#' bits of information to convey about the data set at hand. More information -#' on their coding is given below. +#' `attributes(x)$metadata`. The first field in the above list, `geo_type`, +#' can usually be inferred from the `geo_value` columns. 
They are not +#' currently used by any downstream functions in the `epiprocess` package, +#' and serve only as useful bits of information to convey about the data set +#' at hand. More information on their coding is given below. #' #' The last field in the above list, `as_of`, is one of the most unique aspects #' of an `epi_df` object. In brief, we can think of an `epi_df` object as a @@ -61,109 +70,19 @@ #' @section Time Types: #' The following time types are recognized in an `epi_df`. #' -#' * `"day-time"`: each observation corresponds to a time on a given day -#' (measured to the second); coded as a `POSIXct` object, as in -#' `as.POSIXct("2022-01-31 18:45:40")`. #' * `"day"`: each observation corresponds to a day; coded as a `Date` object, #' as in `as.Date("2022-01-31")`. #' * `"week"`: each observation corresponds to a week; the alignment can be #' arbitrary (as to whether a week starts on a Monday, Tuesday); coded as a #' `Date` object, representing the start date of week. -#' * `"yearweek"`: each observation corresponds to a week; the alignment can be -#' arbitrary; coded as a `tsibble::yearweek` object, where the alignment is -#' stored in the `week_start` field of its attributes. #' * `"yearmonth"`: each observation corresponds to a month; coded as a #' `tsibble::yearmonth` object. -#' * `"yearquarter"`: each observation corresponds to a quarter; coded as a -#' `tsibble::yearquarter` object. -#' * `"year"`: each observation corresponds to a year; coded as an integer -#' greater than or equal to 1582. +#' * `"integer"`: a generic integer index (e.g. years or something else). #' #' An unrecognizable time type is labeled "custom". #' -#' @name epi_df -NULL - - -#' Creates an `epi_df` object -#' -#' Creates a new `epi_df` object. By default, builds an empty tibble with the -#' correct metadata for an `epi_df` object (ie. `geo_type`, `time_type`, and `as_of`). -#' Refer to the below info. about the arguments for more details. -#' -#' @template epi_df-params -#' -#' @export -new_epi_df <- function(x = tibble::tibble(), geo_type, time_type, as_of, - additional_metadata = list()) { - assert_data_frame(x) - assert_list(additional_metadata) - - additional_metadata[["other_keys"]] <- additional_metadata[["other_keys"]] %||% character(0L) - - # If geo type is missing, then try to guess it - if (missing(geo_type)) { - geo_type <- guess_geo_type(x$geo_value) - } - - # If time type is missing, then try to guess it - if (missing(time_type)) { - time_type <- guess_time_type(x$time_value) - } - - # If as_of is missing, then try to guess it - if (missing(as_of)) { - # First check the metadata for an as_of field - if ( - "metadata" %in% names(attributes(x)) && - "as_of" %in% names(attributes(x)$metadata) - ) { - as_of <- attributes(x)$metadata$as_of - } else if ("as_of" %in% names(x)) { - # Next check for as_of, issue, or version columns - as_of <- max(x$as_of) - } else if ("issue" %in% names(x)) { - as_of <- max(x$issue) - } else if ("version" %in% names(x)) { - as_of <- max(x$version) - } else { - # If we got here then we failed - as_of <- Sys.time() - } # Use the current day-time - } - - # Define metadata fields - metadata <- list() - metadata$geo_type <- geo_type - metadata$time_type <- time_type - metadata$as_of <- as_of - metadata <- c(metadata, additional_metadata) - - # Reorder columns (geo_value, time_value, ...) 
- if (sum(dim(x)) != 0) { - cols_to_put_first <- c("geo_value", "time_value") - x <- x[, c( - cols_to_put_first, - # All other columns - names(x)[!(names(x) %in% cols_to_put_first)] - )] - } - - # Apply epi_df class, attach metadata, and return - class(x) <- c("epi_df", class(x)) - attributes(x)$metadata <- metadata - return(x) -} - -#' Convert to `epi_df` format -#' -#' Converts a data frame or tibble into an `epi_df` object. See the [getting -#' started -#' guide](https://cmu-delphi.github.io/epiprocess/articles/epiprocess.html) for -#' examples. -#' -#' @param ... Additional arguments passed to methods. #' @template epi_df-params +#' @rdname epi_df #' #' @export #' @examples @@ -186,11 +105,10 @@ new_epi_df <- function(x = tibble::tibble(), geo_type, time_type, as_of, #' #' # The `other_keys` metadata (`"county_code"` in this case) is automatically #' # inferred from the `tsibble`'s `key`: -#' ex1 <- as_epi_df(x = ex1_input, geo_type = "state", time_type = "day", as_of = "2020-06-03") +#' ex1 <- as_epi_df(x = ex1_input, as_of = "2020-06-03") #' attr(ex1, "metadata")[["other_keys"]] #' #' -#' #' # Dealing with misspecified column names: #' # Geographical and temporal information must be provided in columns named #' # `geo_value` and `time_value`; if we start from a data frame with a @@ -211,14 +129,13 @@ new_epi_df <- function(x = tibble::tibble(), geo_type, time_type, as_of, #' ex2 <- ex2_input %>% #' dplyr::rename(geo_value = state, time_value = reported_date) %>% #' as_epi_df( -#' geo_type = "state", as_of = "2020-06-03", +#' as_of = "2020-06-03", #' additional_metadata = list(other_keys = "pol") #' ) #' #' attr(ex2, "metadata") #' #' -#' #' # Adding additional keys to an `epi_df` object #' #' ex3_input <- jhu_csse_county_level_subset %>% @@ -237,36 +154,57 @@ new_epi_df <- function(x = tibble::tibble(), geo_type, time_type, as_of, #' as_epi_df(additional_metadata = list(other_keys = c("state", "pol"))) #' #' attr(ex3, "metadata") +new_epi_df <- function(x = tibble::tibble(), geo_type, time_type, as_of, + additional_metadata = list()) { + # Define metadata fields + metadata <- list() + metadata$geo_type <- geo_type + metadata$time_type <- time_type + metadata$as_of <- as_of + metadata <- c(metadata, additional_metadata) + + # Reorder columns (geo_value, time_value, ...) + if (sum(dim(x)) != 0) { + cols_to_put_first <- c("geo_value", "time_value") + x <- x[, c( + cols_to_put_first, + # All other columns + names(x)[!(names(x) %in% cols_to_put_first)] + )] + } + + # Apply epi_df class, attach metadata, and return + class(x) <- c("epi_df", class(x)) + attributes(x)$metadata <- metadata + return(x) +} + +#' @rdname epi_df +#' @export as_epi_df <- function(x, ...) { UseMethod("as_epi_df") } #' @method as_epi_df epi_df -#' @describeIn as_epi_df Simply returns the `epi_df` object unchanged. +#' @rdname epi_df #' @export as_epi_df.epi_df <- function(x, ...) { return(x) } #' @method as_epi_df tbl_df -#' @describeIn as_epi_df The input tibble `x` must contain the columns -#' `geo_value` and `time_value`, or column names that uniquely map onto these -#' (e.g. `date` or `province`). Alternatively, you can specify the conversion -#' explicitly (`time_value = someWeirdColumnName`). All other columns not -#' specified as `other_keys` will be preserved as is, and treated as measured -#' variables. 
-#'
-#' If `as_of` is missing, then the function will try to guess it from an
-#' `as_of`, `issue`, or `version` column of `x` (if any of these are present),
-#' or from as an `as_of` field in its metadata (stored in its attributes); if
-#' this fails, then the current day-time will be used.
+#' @rdname epi_df
 #' @importFrom rlang .data
 #' @importFrom tidyselect any_of
 #' @importFrom cli cli_inform
 #' @export
-as_epi_df.tbl_df <- function(x, geo_type, time_type, as_of,
-                             additional_metadata = list(),
-                             ...) {
+as_epi_df.tbl_df <- function(
+    x,
+    geo_type = deprecated(),
+    time_type = deprecated(),
+    as_of,
+    additional_metadata = list(),
+    ...) {
   # possible standard substitutions for time_value
   x <- rename(x, ...)
   x <- guess_column_name(x, "time_value", time_column_names())
@@ -280,41 +218,61 @@ as_epi_df.tbl_df <- function(x, geo_type, time_type, as_of,
     )
   }
 
-  new_epi_df(
-    x, geo_type, time_type, as_of,
-    additional_metadata
-  )
+  if (lifecycle::is_present(geo_type)) {
+    cli_warn("as_epi_df constructor argument `geo_type` is now ignored. Consider removing.")
+  }
+  if (lifecycle::is_present(time_type)) {
+    cli_warn("as_epi_df constructor argument `time_type` is now ignored. Consider removing.")
+  }
+
+  # Infer the geo and time types from the data
+  geo_type <- guess_geo_type(x$geo_value)
+  time_type <- guess_time_type(x$time_value)
+
+  # If as_of is missing, then try to guess it
+  if (missing(as_of)) {
+    # First check the metadata for an as_of field
+    if (
+      "metadata" %in% names(attributes(x)) &&
+        "as_of" %in% names(attributes(x)$metadata)
+    ) {
+      as_of <- attributes(x)$metadata$as_of
+    } else if ("as_of" %in% names(x)) {
+      # Next check for as_of, issue, or version columns
+      as_of <- max(x$as_of)
+    } else if ("issue" %in% names(x)) {
+      as_of <- max(x$issue)
+    } else if ("version" %in% names(x)) {
+      as_of <- max(x$version)
+    } else {
+      # If we got here then we failed
+      as_of <- Sys.time()
+    } # Use the current day-time
+  }
+
+  assert_list(additional_metadata)
+  additional_metadata[["other_keys"]] <- additional_metadata[["other_keys"]] %||% character(0L)
+  new_epi_df(x, geo_type, time_type, as_of, additional_metadata)
 }
 
 #' @method as_epi_df data.frame
-#' @describeIn as_epi_df Works analogously to `as_epi_df.tbl_df()`.
+#' @rdname epi_df
 #' @export
-as_epi_df.data.frame <- function(x, geo_type, time_type, as_of,
-                                 additional_metadata = list(), ...) {
-  as_epi_df.tbl_df(
-    tibble::as_tibble(x), geo_type, time_type, as_of,
-    additional_metadata, ...
-  )
+as_epi_df.data.frame <- function(x, as_of, additional_metadata = list(), ...) {
+  as_epi_df.tbl_df(x = tibble::as_tibble(x), as_of = as_of, additional_metadata = additional_metadata, ...)
 }
 
 #' @method as_epi_df tbl_ts
-#' @describeIn as_epi_df Works analogously to `as_epi_df.tbl_df()`, except that
-#'   the `tbl_ts` class is dropped, and any key variables (other than
-#'   "geo_value") are added to the metadata of the returned object, under the
-#'   `other_keys` field.
+#' @rdname epi_df
 #' @export
-as_epi_df.tbl_ts <- function(x, geo_type, time_type, as_of,
-                             additional_metadata = list(), ...) {
+as_epi_df.tbl_ts <- function(x, as_of, additional_metadata = list(), ...) {
   tsibble_other_keys <- setdiff(tsibble::key_vars(x), "geo_value")
   if (length(tsibble_other_keys) != 0) {
    additional_metadata$other_keys <- unique(
      c(additional_metadata$other_keys, tsibble_other_keys)
    )
   }
-  as_epi_df.tbl_df(
-    tibble::as_tibble(x), geo_type, time_type, as_of,
-    additional_metadata, ... 
- ) + as_epi_df.tbl_df(x = tibble::as_tibble(x), as_of = as_of, additional_metadata = additional_metadata, ...) } #' Test for `epi_df` format diff --git a/R/epiprocess.R b/R/epiprocess.R index 40c3ce8a..ba072a2d 100644 --- a/R/epiprocess.R +++ b/R/epiprocess.R @@ -8,6 +8,7 @@ #' assert_logical assert_list assert_character assert_class #' assert_int assert_numeric check_data_frame vname check_atomic #' anyInfinite test_subset test_set_equal checkInt expect_class +#' test_int #' @importFrom cli cli_abort cli_warn #' @importFrom rlang %||% #' @name epiprocess diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R index c6326751..d0418eea 100644 --- a/R/grouped_epi_archive.R +++ b/R/grouped_epi_archive.R @@ -204,10 +204,16 @@ ungroup.grouped_epi_archive <- function(x, ...) { #' @importFrom rlang !! !!! enquo quo_is_missing enquos is_quosure sym syms #' env missing_arg #' @export -epix_slide.grouped_epi_archive <- function(x, f, ..., before, ref_time_values, - time_step, new_col_name = "slide_value", - as_list_col = FALSE, names_sep = "_", - all_versions = FALSE) { +epix_slide.grouped_epi_archive <- function( + x, + f, + ..., + before = Inf, + ref_time_values = NULL, + new_col_name = "slide_value", + as_list_col = FALSE, + names_sep = "_", + all_versions = FALSE) { # Perform some deprecated argument checks without using ` = # deprecated()` in the function signature, because they are from # early development versions and much more likely to be clutter than @@ -231,7 +237,7 @@ epix_slide.grouped_epi_archive <- function(x, f, ..., before, ref_time_values, ", class = "epiprocess__epix_slide_all_rows_parameter_deprecated") } - if (missing(ref_time_values)) { + if (is.null(ref_time_values)) { ref_time_values <- epix_slide_ref_time_values_default(x$private$ungrouped) } else { assert_numeric(ref_time_values, min.len = 1L, null.ok = FALSE, any.missing = FALSE) @@ -246,20 +252,7 @@ epix_slide.grouped_epi_archive <- function(x, f, ..., before, ref_time_values, ref_time_values <- sort(ref_time_values) } - # Validate and pre-process `before`: - if (missing(before)) { - cli_abort("`before` is required (and must be passed by name); - if you did not want to apply a sliding window but rather - to map `epix_as_of` and `f` across various `ref_time_values`, - pass a large `before` value (e.g., if time steps are days, - `before=365000`).") - } - before <- vctrs::vec_cast(before, integer()) - assert_int(before, lower = 0L, null.ok = FALSE, na.ok = FALSE) - - # If a custom time step is specified, then redefine units - - if (!missing(time_step)) before <- time_step(before) + validate_slide_window_arg(before, x$private$ungrouped$time_type) # Symbolize column name new_col <- sym(new_col_name) diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index 891cc064..8363fa2e 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -50,7 +50,7 @@ #' # (a.k.a. 
"hotfixed", "clobbered", etc.): #' clobberable_versions_start = max(archive_cases_dv_subset$DT$version), #' # Suppose today is the following day, and there are no updates out yet: -#' versions_end <- max(archive_cases_dv_subset$DT$version) + 1L, +#' versions_end = max(archive_cases_dv_subset$DT$version) + 1L, #' compactify = TRUE #' ) #' @@ -111,8 +111,6 @@ epix_as_of <- function(x, max_version, min_time_value = -Inf, all_versions = FAL tibble::as_tibble() %>% dplyr::select(-"version") %>% as_epi_df( - geo_type = x$geo_type, - time_type = x$time_type, as_of = max_version, additional_metadata = c( x$additional_metadata, @@ -271,7 +269,7 @@ epix_merge <- function(x, y, } if (!identical(x$time_type, y$time_type)) { - cli_abort("`x` and `y` must have the same `$time_type`") + cli_abort("`x` and `y` must share data type on their `time_value` column.") } if (length(x$additional_metadata) != 0L) { @@ -307,11 +305,11 @@ epix_merge <- function(x, y, y_dt <- y$DT } } else if (sync %in% c("na", "locf")) { - new_versions_end <- max(x$versions_end, y$versions_end) + new_versions_end <- max(c(x$versions_end, y$versions_end)) x_dt <- epix_fill_through_version(x, new_versions_end, sync)$DT y_dt <- epix_fill_through_version(y, new_versions_end, sync)$DT } else if (sync == "truncate") { - new_versions_end <- min(x$versions_end, y$versions_end) + new_versions_end <- min(c(x$versions_end, y$versions_end)) x_dt <- x$DT[x[["DT"]][["version"]] <= new_versions_end, names(x$DT), with = FALSE] y_dt <- y$DT[y[["DT"]][["version"]] <= new_versions_end, names(y$DT), with = FALSE] } else { @@ -450,8 +448,6 @@ epix_merge <- function(x, y, return(as_epi_archive( result_dt[], # clear data.table internal invisibility flag if set - geo_type = x$geo_type, - time_type = x$time_type, other_keys = setdiff(key(result_dt), c("geo_value", "time_value", "version")), additional_metadata = result_additional_metadata, # It'd probably be better to pre-compactify before the merge, and might be @@ -610,12 +606,6 @@ epix_detailed_restricted_mutate <- function(.data, ...) { #' set to a regularly-spaced sequence of values set to cover the range of #' `version`s in the `DT` plus the `versions_end`; the spacing of values will #' be guessed (using the GCD of the skips between values). -#' @param time_step Optional function used to define the meaning of one time -#' step, which if specified, overrides the default choice based on the -#' `time_value` column. This function must take a positive integer and return -#' an object of class `lubridate::period`. For example, we can use `time_step -#' = lubridate::hours` in order to set the time step to be one hour (this -#' would only be meaningful if `time_value` is of class `POSIXct`). #' @param new_col_name String indicating the name of the new column that will #' contain the derivative values. Default is "slide_value"; note that setting #' `new_col_name` equal to an existing column name will overwrite this column. 
@@ -790,9 +780,8 @@ epix_slide <- function( x, f, ..., - before, - ref_time_values, - time_step, + before = Inf, + ref_time_values = NULL, new_col_name = "slide_value", as_list_col = FALSE, names_sep = "_", @@ -803,10 +792,16 @@ epix_slide <- function( #' @rdname epix_slide #' @export -epix_slide.epi_archive <- function(x, f, ..., before, ref_time_values, - time_step, new_col_name = "slide_value", - as_list_col = FALSE, names_sep = "_", - all_versions = FALSE) { +epix_slide.epi_archive <- function( + x, + f, + ..., + before = Inf, + ref_time_values = NULL, + new_col_name = "slide_value", + as_list_col = FALSE, + names_sep = "_", + all_versions = FALSE) { # For an "ungrouped" slide, treat all rows as belonging to one big # group (group by 0 vars), like `dplyr::summarize`, and let the # resulting `grouped_epi_archive` handle the slide: @@ -814,8 +809,7 @@ epix_slide.epi_archive <- function(x, f, ..., before, ref_time_values, group_by(x), f, ..., - before = before, ref_time_values = ref_time_values, - time_step = time_step, new_col_name = new_col_name, + before = before, ref_time_values = ref_time_values, new_col_name = new_col_name, as_list_col = as_list_col, names_sep = names_sep, all_versions = all_versions ) %>% diff --git a/R/methods-epi_df.R b/R/methods-epi_df.R index cc532021..daccabd8 100644 --- a/R/methods-epi_df.R +++ b/R/methods-epi_df.R @@ -83,7 +83,6 @@ print.epi_df <- function(x, ...) { summary.epi_df <- function(object, ...) { cat("An `epi_df` x, with metadata:\n") cat(sprintf("* %-9s = %s\n", "geo_type", attributes(object)$metadata$geo_type)) - cat(sprintf("* %-9s = %s\n", "time_type", attributes(object)$metadata$time_type)) cat(sprintf("* %-9s = %s\n", "as_of", attributes(object)$metadata$as_of)) cat("----------\n") cat(sprintf("* %-27s = %s\n", "min time value", min(object$time_value))) @@ -118,15 +117,14 @@ decay_epi_df <- function(x) { } # Implementing `dplyr_extending`: we have a few metadata attributes to consider: -# `as_of` is an attribute doesn't depend on the rows or columns, `geo_type` and -# `time_type` are scalar attributes dependent on columns, and `other_keys` acts -# like an attribute vectorized over columns; `dplyr_extending` advice at time of -# writing says to implement `dplyr_reconstruct`, 1d `[`, `dplyr_col_modify`, and -# `names<-`, but not `dplyr_row_slice`; however, we'll also implement -# `dplyr_row_slice` anyway to prevent a `arrange` on grouped `epi_df`s from -# dropping the `epi_df` class. We'll implement `[` to allow either 1d or 2d. -# We'll also implement some other methods where we want to (try to) maintain an -# `epi_df`. +# `as_of` is an attribute doesn't depend on the rows or columns, `geo_type` is a +# scalar attribute dependent on columns, and `other_keys` acts like an attribute +# vectorized over columns; `dplyr_extending` advice at time of writing says to +# implement `dplyr_reconstruct`, 1d `[`, `dplyr_col_modify`, and `names<-`, but +# not `dplyr_row_slice`; however, we'll also implement `dplyr_row_slice` anyway +# to prevent a `arrange` on grouped `epi_df`s from dropping the `epi_df` class. +# We'll implement `[` to allow either 1d or 2d. We'll also implement some other +# methods where we want to (try to) maintain an `epi_df`. 
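
One observable consequence of also implementing `dplyr_row_slice`, sketched here against the bundled `jhu_csse_daily_subset` dataset (an assumption for illustration; any `epi_df` would do):

```r
# Sorting a grouped epi_df should keep the epi_df class rather than
# decaying to a plain grouped tibble.
library(dplyr)
library(epiprocess)
out <- jhu_csse_daily_subset %>%
  group_by(geo_value) %>%
  arrange(time_value, .by_group = TRUE)
inherits(out, "epi_df") # expected: TRUE
```
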
#' @param data tibble or `epi_df` (`dplyr` feeds in former, but we may #' directly feed in latter from our other methods) diff --git a/R/slide.R b/R/slide.R index 9d26174a..be8d895b 100644 --- a/R/slide.R +++ b/R/slide.R @@ -35,49 +35,7 @@ #' the names of the resulting columns are given by prepending `new_col_name` #' to the names of the list elements. #' -#' @details To "slide" means to apply a function or formula over a rolling -#' window of time steps for each data group, where the window is centered at a -#' reference time and left and right endpoints are given by the `before` and -#' `after` arguments. The unit (the meaning of one time step) is implicitly -#' defined by the way the `time_value` column treats addition and subtraction; -#' for example, if the time values are coded as `Date` objects, then one time -#' step is one day, since `as.Date("2022-01-01") + 1` equals -#' `as.Date("2022-01-02")`. Alternatively, the time step can be set explicitly -#' using the `time_step` argument (which if specified would override the -#' default choice based on `time_value` column). If there are not enough time -#' steps available to complete the window at any given reference time, then -#' `epi_slide()` still attempts to perform the computation anyway (it does not -#' require a complete window). The issue of what to do with partial -#' computations (those run on incomplete windows) is therefore left up to the -#' user, either through the specified function or formula `f`, or through -#' post-processing. For a centrally-aligned slide of `n` `time_value`s in a -#' sliding window, set `before = (n-1)/2` and `after = (n-1)/2` when the -#' number of `time_value`s in a sliding window is odd and `before = n/2-1` and -#' `after = n/2` when `n` is even. -#' -#' Sometimes, we want to experiment with various trailing or leading window -#' widths and compare the slide outputs. In the (uncommon) case where -#' zero-width windows are considered, manually pass both the `before` and -#' `after` arguments in order to prevent potential warnings. (E.g., `before=k` -#' with `k=0` and `after` missing may produce a warning. To avoid warnings, -#' use `before=k, after=0` instead; otherwise, it looks too much like a -#' leading window was intended, but the `after` argument was forgotten or -#' misspelled.) -#' -#' If `f` is missing, then an expression for tidy evaluation can be specified, -#' for example, as in: -#' ``` -#' epi_slide(x, cases_7dav = mean(cases), before = 6) -#' ``` -#' which would be equivalent to: -#' ``` -#' epi_slide(x, function(x, g) mean(x$cases), before = 6, -#' new_col_name = "cases_7dav") -#' ``` -#' Thus, to be clear, when the computation is specified via an expression for -#' tidy evaluation (first example, above), then the name for the new column is -#' inferred from the given expression and overrides any name passed explicitly -#' through the `new_col_name` argument. 
+#' @template basic-slide-details #' #' @importFrom lubridate days weeks #' @importFrom dplyr bind_rows group_vars filter select @@ -130,13 +88,16 @@ #' before = 1, as_list_col = TRUE #' ) %>% #' ungroup() -epi_slide <- function(x, f, ..., before, after, ref_time_values, - time_step, +epi_slide <- function(x, f, ..., before = NULL, after = NULL, ref_time_values = NULL, new_col_name = "slide_value", as_list_col = FALSE, names_sep = "_", all_rows = FALSE) { assert_class(x, "epi_df") - if (missing(ref_time_values)) { + if (nrow(x) == 0L) { + return(x) + } + + if (is.null(ref_time_values)) { ref_time_values <- unique(x$time_value) } else { assert_numeric(ref_time_values, min.len = 1L, null.ok = FALSE, any.missing = FALSE) @@ -151,40 +112,22 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, } ref_time_values <- sort(ref_time_values) - # Validate and pre-process `before`, `after`: - if (!missing(before)) { - before <- vctrs::vec_cast(before, integer()) - assert_int(before, lower = 0, null.ok = FALSE, na.ok = FALSE) - } - if (!missing(after)) { - after <- vctrs::vec_cast(after, integer()) - assert_int(after, lower = 0, null.ok = FALSE, na.ok = FALSE) - } - if (missing(before)) { - if (missing(after)) { - cli_abort("Either or both of `before`, `after` must be provided.") - } else if (after == 0L) { - cli_warn("`before` missing, `after==0`; maybe this was intended to be some - non-zero-width trailing window, but since `before` appears to be - missing, it's interpreted as a zero-width window (`before=0, - after=0`).") - } - before <- 0L - } else if (missing(after)) { - if (before == 0L) { - cli_warn("`before==0`, `after` missing; maybe this was intended to be some - non-zero-width leading window, but since `after` appears to be - missing, it's interpreted as a zero-width window (`before=0, - after=0`).") + if (is.null(before) && !is.null(after)) { + if (inherits(after, "difftime")) { + before <- as.difftime(0, units = units(after)) + } else { + before <- 0 } - after <- 0L } - - # If a custom time step is specified, then redefine units - if (!missing(time_step)) { - before <- time_step(before) - after <- time_step(after) + if (is.null(after) && !is.null(before)) { + if (inherits(before, "difftime")) { + after <- as.difftime(0, units = units(before)) + } else { + after <- 0 + } } + validate_slide_window_arg(before, attr(x, "metadata")$time_type) + validate_slide_window_arg(after, attr(x, "metadata")$time_type) # Arrange by increasing time_value x <- arrange(x, .data$time_value) @@ -434,8 +377,7 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' # Remove a nonessential var. 
to ensure new col is printed #' dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) %>% #' ungroup() -epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values, - time_step, +epi_slide_opt <- function(x, col_names, f, ..., before = NULL, after = NULL, ref_time_values = NULL, new_col_name = NULL, as_list_col = NULL, names_sep = NULL, all_rows = FALSE) { assert_class(x, "epi_df") @@ -504,7 +446,7 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values, ) } - user_provided_rtvs <- !missing(ref_time_values) + user_provided_rtvs <- !is.null(ref_time_values) if (!user_provided_rtvs) { ref_time_values <- unique(x$time_value) } else { @@ -520,37 +462,25 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values, } ref_time_values <- sort(ref_time_values) - # Validate and pre-process `before`, `after`: - if (!missing(before)) { - before <- vctrs::vec_cast(before, integer()) - assert_int(before, lower = 0, null.ok = FALSE, na.ok = FALSE) - } - if (!missing(after)) { - after <- vctrs::vec_cast(after, integer()) - assert_int(after, lower = 0, null.ok = FALSE, na.ok = FALSE) - } - if (missing(before)) { - if (missing(after)) { - cli_abort("Either or both of `before`, `after` must be provided.") - } else if (after == 0L) { - cli_warn("`before` missing, `after==0`; maybe this was intended to be some - non-zero-width trailing window, but since `before` appears to be - missing, it's interpreted as a zero-width window (`before=0, - after=0`).") + if (is.null(before) && !is.null(after)) { + if (inherits(after, "difftime")) { + before <- as.difftime(0, units = units(after)) + } else { + before <- 0 } - before <- 0L - } else if (missing(after)) { - if (before == 0L) { - cli_warn("`before==0`, `after` missing; maybe this was intended to be some - non-zero-width leading window, but since `after` appears to be - missing, it's interpreted as a zero-width window (`before=0, - after=0`).") + } + if (is.null(after) && !is.null(before)) { + if (inherits(before, "difftime")) { + after <- as.difftime(0, units = units(before)) + } else { + after <- 0 } - after <- 0L } + validate_slide_window_arg(before, attr(x, "metadata")$time_type) + validate_slide_window_arg(after, attr(x, "metadata")$time_type) # Make a complete date sequence between min(x$time_value) and max(x$time_value). - date_seq_list <- full_date_seq(x, before, after, time_step) + date_seq_list <- full_date_seq(x, before, after, attr(x, "metadata")$time_type) all_dates <- date_seq_list$all_dates pad_early_dates <- date_seq_list$pad_early_dates pad_late_dates <- date_seq_list$pad_late_dates @@ -632,7 +562,10 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values, } else if (f_from_package == "slider") { for (i in seq_along(col_names_chr)) { .data_group[, result_col_names[i]] <- f( - x = .data_group[[col_names_chr[i]]], before = before, after = after, ... + x = .data_group[[col_names_chr[i]]], + before = as.numeric(before), + after = as.numeric(after), + ... ) } } @@ -725,8 +658,7 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values, #' # Remove a nonessential var. 
to ensure new col is printed
 #'   dplyr::select(geo_value, time_value, cases, cases_14dav = slide_value_cases) %>%
 #'   ungroup()
-epi_slide_mean <- function(x, col_names, ..., before, after, ref_time_values,
-                           time_step,
+epi_slide_mean <- function(x, col_names, ..., before = NULL, after = NULL, ref_time_values = NULL,
                            new_col_name = NULL, as_list_col = NULL,
                            names_sep = NULL, all_rows = FALSE) {
   epi_slide_opt(
@@ -737,7 +669,6 @@ epi_slide_mean <- function(x, col_names, ..., before, after, ref_time_values,
     before = before,
     after = after,
     ref_time_values = ref_time_values,
-    time_step = time_step,
     new_col_name = new_col_name,
     as_list_col = as_list_col,
     names_sep = names_sep,
@@ -772,8 +703,7 @@ epi_slide_mean <- function(x, col_names, ..., before, after, ref_time_values,
 #'   # Remove a nonessential var. to ensure new col is printed
 #'   dplyr::select(geo_value, time_value, cases, cases_7dsum = slide_value_cases) %>%
 #'   ungroup()
-epi_slide_sum <- function(x, col_names, ..., before, after, ref_time_values,
-                          time_step,
+epi_slide_sum <- function(x, col_names, ..., before = NULL, after = NULL, ref_time_values = NULL,
                           new_col_name = NULL, as_list_col = NULL,
                           names_sep = NULL, all_rows = FALSE) {
   epi_slide_opt(
@@ -784,7 +714,6 @@ epi_slide_sum <- function(x, col_names, ..., before, after, ref_time_values,
     before = before,
     after = after,
     ref_time_values = ref_time_values,
-    time_step = time_step,
     new_col_name = new_col_name,
     as_list_col = as_list_col,
     names_sep = names_sep,
@@ -796,24 +725,25 @@ epi_slide_sum <- function(x, col_names, ..., before, after, ref_time_values,
 #' (x$time_value). Produce lists of dates before min(x$time_value) and after
 #' max(x$time_value) for padding initial and final windows to size `n`.
 #'
-#' `before` and `after` inputs here should be raw (numeric) values;
-#' `time_step` function should NOT have been applied. `full_date_seq` applies
-#' `time_step` as needed.
+#' `before` and `after` args are assumed to have been validated by the calling
+#' function (using `validate_slide_window_arg`).
 #'
 #' @importFrom checkmate assert_function
 #' @noRd
-full_date_seq <- function(x, before, after, time_step) {
+full_date_seq <- function(x, before, after, time_type) {
+  if (!time_type %in% c("day", "week", "yearmonth", "integer")) {
+    cli_abort(
+      "time_type must be one of 'day', 'week', 'yearmonth', or 'integer'."
+    )
+  }
+
   pad_early_dates <- c()
   pad_late_dates <- c()
 
-  # If dates are one of tsibble-provided classes, can step by numeric. `tsibble`
-  # defines a step of 1 to automatically be the quantum (smallest resolvable
-  # unit) of the date class. For example, one step = 1 quarter for `yearquarter`.
-  #
-  # `tsibble` classes apparently can't be added to in different units, so even
-  # if `time_step` is provided by the user, use a value-1 unitless step.
-  if (inherits(x$time_value, c("yearquarter", "yearweek", "yearmonth")) ||
-    is.numeric(x$time_value)) { # nolint: indentation_linter
+  # `tsibble` time types have their own behavior, where adding 1 corresponds to
+  # incrementing by a quantum (smallest resolvable unit) of the date class. For
+  # example, one step = 1 quarter for `yearquarter`.
+  if (time_type %in% c("yearmonth", "integer")) {
     all_dates <- seq(min(x$time_value), max(x$time_value), by = 1L)
 
     if (before != 0) {
@@ -822,70 +752,24 @@ full_date_seq <- function(x, before, after, time_step) {
     if (after != 0) {
      pad_late_dates <- all_dates[length(all_dates)] + 1:after
     }
-  } else if (missing(time_step)) {
-    # Guess what `by` should be based on the epi_df's `time_type`. 
-  ttype <- attributes(x)$metadata$time_type
-  by <- switch(ttype,
+  } else {
+    by <- switch(time_type,
       day = "days",
       week = "weeks",
-      yearweek = "weeks",
-      yearmonth = "months",
-      yearquarter = "quarters",
-      year = "years",
-      NA # default value for "custom", "day-time"
     )
-    if (is.na(by)) {
-      cli_abort(
-        c(
-          "`frollmean` requires a full window to compute a result, but the
-          `time_type` associated with the epi_df was not mappable to a period
-          type valid for creating a date sequence.",
-          "i" = c("The input data's `time_type` was probably `custom` or `day-time`.
           These require also passing a `time_step` function.")
-        ),
-        class = "epiprocess__full_date_seq__unmappable_time_type",
-        epiprocess__time_type = ttype
-      )
-    }
-
-    # `seq.Date` `by` arg can be any of `c("days", "weeks", "months", "quarters", "years")`.
     all_dates <- seq(min(x$time_value), max(x$time_value), by = by)
 
     if (before != 0) {
-      # Use `seq.Date` here to avoid having to map `epi_df` `time_type` to
-      # `time_step` functions.
-      #
-      # The first element `seq.Date` returns is always equal to the provided
-      # `from` date (`from + 0`). The full return value is equivalent to
-      # `from + 0:n`. In our case, we `from + 1:n`, so drop the first
-      # element.
-      #
-      # Adding "-1" to the `by` arg makes `seq.Date` go backwards in time.
+      # The behavior is analogous to the branch with tsibble types above. For
+      # more detail, note that the function `seq.Date(from, ..., length.out =
+      # n + 1)` returns `from + 0:n`. Since we want `from + 1:n`, we drop the
+      # first element. Adding "-1" to the `by` arg makes `seq.Date` go backwards
+      # in time.
       pad_early_dates <- sort(seq(all_dates[1L], by = paste("-1", by), length.out = before + 1)[-1])
     }
     if (after != 0) {
       pad_late_dates <- seq(all_dates[length(all_dates)], by = by, length.out = after + 1)[-1]
     }
-  } else {
-    # A custom time step is specified.
-    assert_function(time_step)
-
-    # Calculate the number of `time_step`s required to go between min and max time
-    # values. This is roundabout because difftime objects, lubridate::period objects,
-    # and Dates are hard to convert to the same time scale and add.
-    t_elapsed_s <- difftime(max(x$time_value), min(x$time_value), units = "secs")
-    step_size_s <- lubridate::as.period(time_step(1), unit = "secs")
-    n_steps <- ceiling(as.numeric(t_elapsed_s) / as.numeric(step_size_s))
-
-    all_dates <- min(x$time_value) + time_step(0:n_steps)
-
-    if (before != 0) {
-      pad_early_dates <- all_dates[1L] - time_step(before:1)
-    }
-    if (after != 0) {
-      pad_late_dates <- all_dates[length(all_dates)] + time_step(1:after)
-    }
   }
 
   return(list(
diff --git a/R/utils.R b/R/utils.R
index 46a5cdc5..8c1c622f 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -395,56 +395,38 @@ guess_geo_type <- function(geo_value) {
   }
 }
 
-  # If we got here then we failed
   return("custom")
 }
 
-guess_time_type <- function(time_value) {
-  # Convert character time values to Date or POSIXct
-  if (is.character(time_value)) {
-    if (nchar(time_value[1]) <= 10L) {
-      new_time_value <- tryCatch(
-        {
-          as.Date(time_value)
-        },
-        error = function(e) NULL
-      )
-    } else {
-      new_time_value <- tryCatch(
-        {
-          as.POSIXct(time_value)
-        },
-        error = function(e) NULL
-      )
-    }
-    if (!is.null(new_time_value)) time_value <- new_time_value
-  }
-  # Now, if a POSIXct class, then use "day-time"
-  if (inherits(time_value, "POSIXct")) {
-    return("day-time")
-  } else if (inherits(time_value, "Date")) {
-    # Else, if a Date class, then use "week" or "day" depending on gaps
-    # Convert to numeric so we can use the modulo operator. 
+guess_time_type <- function(time_value, time_value_arg = rlang::caller_arg(time_value)) { + if (inherits(time_value, "Date")) { unique_time_gaps <- as.numeric(diff(sort(unique(time_value)))) - # We need to check the modulus of `unique_time_gaps` in case there are - # missing dates. Gaps in a weekly date sequence will cause some diffs to - # be larger than 7 days. If we just check if `diffs == 7`, it will fail - # unless the weekly date sequence is already complete. - return(ifelse(all(unique_time_gaps %% 7 == 0), "week", "day")) - } else if (inherits(time_value, "yearweek")) { - # Else, check whether it's one of the tsibble classes - return("yearweek") + # Gaps in a weekly date sequence will cause some diffs to be larger than 7 + # days, so check modulo 7 equality, rather than equality with 7. + if (all(unique_time_gaps %% 7 == 0)) { + return("week") + } + if (all(unique_time_gaps >= 28)) { + cli_abort( + "Found a monthly or longer cadence in the time column `{time_value_arg}`. + Consider using tsibble::yearmonth for monthly data and 'YYYY' integers for year data." + ) + } + return("day") } else if (inherits(time_value, "yearmonth")) { return("yearmonth") - } else if (inherits(time_value, "yearquarter")) { - return("yearquarter") - } else if (rlang::is_integerish(time_value) && - all(nchar(as.character(time_value)) == 4L)) { # nolint: indentation_linter - return("year") + } else if (rlang::is_integerish(time_value)) { + return("integer") } - # If we got here then we failed + cli_warn( + "Unsupported time type in column `{time_value_arg}`, with class {.code {class(time_value)}}. + Time-related functionality may have unexpected behavior. + ", + class = "epiprocess__guess_time_type__unknown_time_type", + epiprocess__time_value = time_value + ) return("custom") } @@ -820,3 +802,41 @@ guess_period.Date <- function(time_values, time_values_arg = rlang::caller_arg(t guess_period.POSIXt <- function(time_values, time_values_arg = rlang::caller_arg(time_values), ...) { as.numeric(NextMethod(), units = "secs") } + + +validate_slide_window_arg <- function(arg, time_type, arg_name = rlang::caller_arg(arg)) { + if (is.null(arg)) { + cli_abort("`{arg_name}` is a required argument.") + } + + if (!checkmate::test_scalar(arg)) { + cli_abort("Expected `{arg_name}` to be a scalar value.") + } + + if (time_type == "custom") { + cli_abort("Unsure how to interpret slide units with a custom time type. 
Consider converting your time
+      column to a Date, yearmonth, or integer type.")
+  }
+
+  if (!identical(arg, Inf)) {
+    if (time_type == "day") {
+      if (!test_int(arg, lower = 0L) && !(inherits(arg, "difftime") && units(arg) == "days")) {
+        cli_abort("Expected `{arg_name}` to be a difftime with units in days or a non-negative integer.")
+      }
+    } else if (time_type == "week") {
+      if (!(inherits(arg, "difftime") && units(arg) == "weeks")) {
+        cli_abort("Expected `{arg_name}` to be a difftime with units in weeks.")
+      }
+    } else if (time_type == "yearmonth") {
+      if (!test_int(arg, lower = 0L) || inherits(arg, "difftime")) {
+        cli_abort("Expected `{arg_name}` to be a non-negative integer.")
+      }
+    } else if (time_type == "integer") {
+      if (!test_int(arg, lower = 0L) || inherits(arg, "difftime")) {
+        cli_abort("Expected `{arg_name}` to be a non-negative integer.")
+      }
+    } else {
+      cli_abort("Expected `{arg_name}` to be Inf, an appropriate difftime, or a non-negative integer.")
+    }
+  }
+}
diff --git a/man-roxygen/basic-slide-details.R b/man-roxygen/basic-slide-details.R
new file mode 100644
index 00000000..f8f6792d
--- /dev/null
+++ b/man-roxygen/basic-slide-details.R
@@ -0,0 +1,34 @@
+#' @details To "slide" means to apply a function or formula over a rolling
+#'   window of time steps for each data group, where the window is centered at a
+#'   reference time and left and right endpoints are given by the `before` and
+#'   `after` arguments.
+#'
+#' If there are not enough time steps available to complete the window at any
+#'   given reference time, then `epi_slide()` still attempts to perform the
+#'   computation anyway (it does not require a complete window). The issue of
+#'   what to do with partial computations (those run on incomplete windows) is
+#'   therefore left up to the user, either through the specified function or
+#'   formula `f`, or through post-processing. For a centrally-aligned slide of
+#'   `n` `time_value`s in a sliding window, set `before = (n-1)/2` and `after =
+#'   (n-1)/2` when the number of `time_value`s in a sliding window is odd and
+#'   `before = n/2-1` and `after = n/2` when `n` is even.
+#'
+#' Sometimes, we want to experiment with various trailing or leading window
+#'   widths and compare the slide outputs. In the (uncommon) case where
+#'   zero-width windows are considered, manually pass both the `before` and
+#'   `after` arguments.
+#'
+#' If `f` is missing, then an expression for tidy evaluation can be specified,
+#'   for example, as in:
+#' ```
+#' epi_slide(x, cases_7dav = mean(cases), before = 6)
+#' ```
+#' which would be equivalent to:
+#' ```
+#' epi_slide(x, function(x, g) mean(x$cases), before = 6,
+#'   new_col_name = "cases_7dav")
+#' ```
+#' Thus, to be clear, when the computation is specified via an expression for
+#'   tidy evaluation (first example, above), then the name for the new column is
+#'   inferred from the given expression and overrides any name passed explicitly
+#'   through the `new_col_name` argument.
diff --git a/man-roxygen/basic-slide-params.R b/man-roxygen/basic-slide-params.R
index 383c102d..7e169af6 100644
--- a/man-roxygen/basic-slide-params.R
+++ b/man-roxygen/basic-slide-params.R
@@ -3,31 +3,32 @@
 #'   single data group.
 #' @param before,after How far `before` and `after` each `ref_time_value` should
 #'   the sliding window extend? At least one of these two arguments must be
-#'   provided; the other's default will be 0. 
Any value provided for either
-#'   argument must be a single, non-`NA`, non-negative,
-#'   [integer-compatible][vctrs::vec_cast] number of time steps. Endpoints of
-#'   the window are inclusive. Common settings:
-#'   * For trailing/right-aligned windows from `ref_time_value - time_step
-#'     (k)` to `ref_time_value`: either pass `before=k` by itself, or pass
-#'     `before=k, after=0`.
-#'   * For center-aligned windows from `ref_time_value - time_step(k)` to
-#'     `ref_time_value + time_step(k)`: pass `before=k, after=k`.
-#'   * For leading/left-aligned windows from `ref_time_value` to
-#'     `ref_time_value + time_step(k)`: either pass pass `after=k` by itself,
+#'   provided; the other's default will be 0. The accepted values for these
+#'   depend on the type of the `time_value` column:
+#'
+#'   - if it is a Date and the cadence is daily, then they can be integers
+#'     (which will be interpreted in units of days) or difftimes with units
+#'     "days"
+#'   - if it is a Date and the cadence is weekly, then they must be difftimes
+#'     with units "weeks"
+#'   - if it is an integer, then they must be integers
+#'
+#'   Endpoints of the window are inclusive. Common settings:
+#'
+#'   - For trailing/right-aligned windows from `ref_time_value - k` to
+#'     `ref_time_value`: either pass `before=k` by itself, or pass `before=k,
+#'     after=0`.
+#'   - For center-aligned windows from `ref_time_value - k` to
+#'     `ref_time_value + k`: pass `before=k, after=k`.
+#'   - For leading/left-aligned windows from `ref_time_value` to
+#'     `ref_time_value + k`: either pass `after=k` by itself,
 #'     or pass `before=0, after=k`.
-#'   See "Details:" about the definition of a time step,(non)treatment of
-#'   missing rows within the window, and avoiding warnings about
-#'   `before`&`after` settings for a certain uncommon use case.
+#'
+#'   See "Details:" on how missing rows are handled within the window.
 #' @param ref_time_values Time values for sliding computations, meaning, each
 #'   element of this vector serves as the reference time point for one sliding
 #'   window. If missing, then this will be set to all unique time values in the
 #'   underlying data table, by default.
-#' @param time_step Optional function used to define the meaning of one time
-#'   step, which if specified, overrides the default choice based on the
-#'   `time_value` column. This function must take a non-negative integer and
-#'   return an object of class [lubridate::period]. For example, we can use
-#'   `time_step = lubridate::hours` in order to set the time step to be one hour
-#'   (this would only be meaningful if `time_value` is of class `POSIXct`).
 #' @param names_sep String specifying the separator to use in `tidyr::unnest()`
 #'   when `as_list_col = FALSE`. Default is "_". Using `NULL` drops the prefix
 #'   from `new_col_name` entirely.
diff --git a/man-roxygen/epi_df-params.R b/man-roxygen/epi_df-params.R
index 59c51603..bedcb7d4 100644
--- a/man-roxygen/epi_df-params.R
+++ b/man-roxygen/epi_df-params.R
@@ -1,10 +1,9 @@
 #' @param x A data.frame, [tibble::tibble], or [tsibble::tsibble] to be converted
-#' @param geo_type Type for the geo values. If missing, then the function will
-#'   attempt to infer it from the geo values present; if this fails, then it
-#'   will be set to "custom".
-#' @param time_type Type for the time values. If missing, then the function will
-#'   attempt to infer it from the time values present; if this fails, then it
-#'   will be set to "custom".
+#' @param geo_type DEPRECATED Has no effect. 
Geo value type is inferred from the
+#'   location column and set to "custom" if not recognized.
+#' @param time_type DEPRECATED Has no effect. Time value type is inferred from the time
+#'   column and set to "custom" if not recognized. Unpredictable behavior may result
+#'   if the time type is not recognized.
 #' @param as_of Time value representing the time at which the given data were
 #'   available. For example, if `as_of` is January 31, 2022, then the `epi_df`
 #'   object that is created would represent the most up-to-date version of the
@@ -15,4 +14,5 @@
 #'   `as_of` fields; named entries from the passed list will be included as
 #'   well. If your tibble has additional keys, be sure to specify them as a
 #'   character vector in the `other_keys` component of `additional_metadata`.
+#' @param ... Additional arguments passed to methods.
 #' @return An `epi_df` object.
diff --git a/man-roxygen/opt-slide-details.R b/man-roxygen/opt-slide-details.R
index 33fb437c..5e8876d2 100644
--- a/man-roxygen/opt-slide-details.R
+++ b/man-roxygen/opt-slide-details.R
@@ -1,25 +1,16 @@
 #' @details To "slide" means to apply a function over a rolling window of time
-#'   steps for each data group, where the window is centered at a reference
-#'   time and left and right endpoints are given by the `before` and `after`
-#'   arguments. The unit (the meaning of one time step) is implicitly defined
-#'   by the way the `time_value` column treats addition and subtraction; for
-#'   example, if the time values are coded as `Date` objects, then one time
-#'   step is one day, since `as.Date("2022-01-01") + 1` equals `as.Date
-#'   ("2022-01-02")`. Alternatively, the time step can be set explicitly using
-#'   the `time_step` argument (which if specified would override the default
-#'   choice based on `time_value` column). If there are not enough time steps
-#'   available to complete the window at any given reference time, then
-#'   `epi_slide_*()` will fail; it requires a complete window to perform the
-#'   computation. For a centrally-aligned slide of `n` `time_value`s in a
-#'   sliding window, set `before = (n-1)/2` and `after = (n-1)/2` when the
-#'   number of `time_value`s in a sliding window is odd and `before = n/2-1`
-#'   and `after = n/2` when `n` is even.
+#'   steps for each data group, where the window is centered at a reference time
+#'   and left and right endpoints are given by the `before` and `after`
+#'   arguments.
+#'
+#'   If there are not enough time steps available to complete the window at any
+#'   given reference time, then `epi_slide_*()` will fail; it requires a
+#'   complete window to perform the computation. For a centrally-aligned slide
+#'   of `n` `time_value`s in a sliding window, set `before = (n-1)/2` and `after
+#'   = (n-1)/2` when the number of `time_value`s in a sliding window is odd and
+#'   `before = n/2-1` and `after = n/2` when `n` is even.
 #'
 #' Sometimes, we want to experiment with various trailing or leading window
 #'   widths and compare the slide outputs. In the (uncommon) case where
 #'   zero-width windows are considered, manually pass both the `before` and
-#'   `after` arguments in order to prevent potential warnings. (E.g., `before=k`
-#'   with `k=0` and `after` missing may produce a warning. To avoid warnings,
-#'   use `before=k, after=0` instead; otherwise, it looks too much like a
-#'   leading window was intended, but the `after` argument was forgotten or
-#'   misspelled.)
+#'   `after` arguments. 
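To make the rewritten `before`/`after` conventions above concrete, here is a minimal usage sketch against the post-patch API (not part of the patch itself; the toy series, dates, and values are invented for illustration, and tidyselect column references for `epi_slide_mean()`/`epi_slide_sum()` are assumed as in the examples elsewhere in this changeset):

library(dplyr)
library(epiprocess)

# Hypothetical daily series; values are made up.
daily <- tibble::tibble(
  geo_value = "ca",
  time_value = seq(as.Date("2020-06-01"), by = "day", length.out = 30),
  cases = rpois(30, 10)
) %>%
  as_epi_df(as_of = as.Date("2020-07-01"))

# Daily cadence: a bare non-negative integer is interpreted in days ...
daily %>%
  group_by(geo_value) %>%
  epi_slide_mean(cases, before = 6) # trailing 7-day mean

# ... and a difftime with units "days" should be equivalent.
daily %>%
  group_by(geo_value) %>%
  epi_slide_mean(cases, before = as.difftime(6, units = "days"))

# Weekly cadence (which the rewritten guess_time_type() should infer from the
# uniform 7-day gaps): the window must be a difftime with units "weeks".
weekly <- tibble::tibble(
  geo_value = "ca",
  time_value = seq(as.Date("2020-06-01"), by = "week", length.out = 8),
  cases = rpois(8, 50)
) %>%
  as_epi_df(as_of = as.Date("2020-08-01"))

weekly %>%
  group_by(geo_value) %>%
  epi_slide_sum(cases, before = as.difftime(2, units = "weeks")) # trailing 3-week sum

Passing, say, a plain `before = 2` on the weekly series should instead trip the "difftime with units in weeks" error in `validate_slide_window_arg()` above.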
diff --git a/man/as_epi_df.Rd b/man/as_epi_df.Rd deleted file mode 100644 index 98cdbb83..00000000 --- a/man/as_epi_df.Rd +++ /dev/null @@ -1,148 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/epi_df.R -\name{as_epi_df} -\alias{as_epi_df} -\alias{as_epi_df.epi_df} -\alias{as_epi_df.tbl_df} -\alias{as_epi_df.data.frame} -\alias{as_epi_df.tbl_ts} -\title{Convert to \code{epi_df} format} -\usage{ -as_epi_df(x, ...) - -\method{as_epi_df}{epi_df}(x, ...) - -\method{as_epi_df}{tbl_df}(x, geo_type, time_type, as_of, additional_metadata = list(), ...) - -\method{as_epi_df}{data.frame}(x, geo_type, time_type, as_of, additional_metadata = list(), ...) - -\method{as_epi_df}{tbl_ts}(x, geo_type, time_type, as_of, additional_metadata = list(), ...) -} -\arguments{ -\item{x}{A data.frame, \link[tibble:tibble]{tibble::tibble}, or \link[tsibble:tsibble]{tsibble::tsibble} to be converted} - -\item{...}{Additional arguments passed to methods.} - -\item{geo_type}{Type for the geo values. If missing, then the function will -attempt to infer it from the geo values present; if this fails, then it -will be set to "custom".} - -\item{time_type}{Type for the time values. If missing, then the function will -attempt to infer it from the time values present; if this fails, then it -will be set to "custom".} - -\item{as_of}{Time value representing the time at which the given data were -available. For example, if \code{as_of} is January 31, 2022, then the \code{epi_df} -object that is created would represent the most up-to-date version of the -data available as of January 31, 2022. If the \code{as_of} argument is missing, -then the current day-time will be used.} - -\item{additional_metadata}{List of additional metadata to attach to the -\code{epi_df} object. The metadata will have \code{geo_type}, \code{time_type}, and -\code{as_of} fields; named entries from the passed list will be included as -well. If your tibble has additional keys, be sure to specify them as a -character vector in the \code{other_keys} component of \code{additional_metadata}.} -} -\value{ -An \code{epi_df} object. -} -\description{ -Converts a data frame or tibble into an \code{epi_df} object. See the \href{https://cmu-delphi.github.io/epiprocess/articles/epiprocess.html}{getting started guide} for -examples. -} -\section{Methods (by class)}{ -\itemize{ -\item \code{as_epi_df(epi_df)}: Simply returns the \code{epi_df} object unchanged. - -\item \code{as_epi_df(tbl_df)}: The input tibble \code{x} must contain the columns -\code{geo_value} and \code{time_value}, or column names that uniquely map onto these -(e.g. \code{date} or \code{province}). Alternatively, you can specify the conversion -explicitly (\code{time_value = someWeirdColumnName}). All other columns not -specified as \code{other_keys} will be preserved as is, and treated as measured -variables. - -If \code{as_of} is missing, then the function will try to guess it from an -\code{as_of}, \code{issue}, or \code{version} column of \code{x} (if any of these are present), -or from as an \code{as_of} field in its metadata (stored in its attributes); if -this fails, then the current day-time will be used. - -\item \code{as_epi_df(data.frame)}: Works analogously to \code{as_epi_df.tbl_df()}. - -\item \code{as_epi_df(tbl_ts)}: Works analogously to \code{as_epi_df.tbl_df()}, except that -the \code{tbl_ts} class is dropped, and any key variables (other than -"geo_value") are added to the metadata of the returned object, under the -\code{other_keys} field. 
- -}} -\examples{ -# Convert a `tsibble` that has county code as an extra key -# Notice that county code should be a character string to preserve any leading zeroes - -ex1_input <- tibble::tibble( - geo_value = rep(c("ca", "fl", "pa"), each = 3), - county_code = c( - "06059", "06061", "06067", - "12111", "12113", "12117", - "42101", "42103", "42105" - ), - time_value = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"), - by = "day" - ), length.out = length(geo_value)), - value = 1:length(geo_value) + 0.01 * rnorm(length(geo_value)) -) \%>\% - tsibble::as_tsibble(index = time_value, key = c(geo_value, county_code)) - -# The `other_keys` metadata (`"county_code"` in this case) is automatically -# inferred from the `tsibble`'s `key`: -ex1 <- as_epi_df(x = ex1_input, geo_type = "state", time_type = "day", as_of = "2020-06-03") -attr(ex1, "metadata")[["other_keys"]] - - - -# Dealing with misspecified column names: -# Geographical and temporal information must be provided in columns named -# `geo_value` and `time_value`; if we start from a data frame with a -# different format, it must be converted to use `geo_value` and `time_value` -# before calling `as_epi_df`. - -ex2_input <- tibble::tibble( - state = rep(c("ca", "fl", "pa"), each = 3), # misnamed - pol = rep(c("blue", "swing", "swing"), each = 3), # extra key - reported_date = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"), - by = "day" - ), length.out = length(state)), # misnamed - value = 1:length(state) + 0.01 * rnorm(length(state)) -) - -print(ex2_input) - -ex2 <- ex2_input \%>\% - dplyr::rename(geo_value = state, time_value = reported_date) \%>\% - as_epi_df( - geo_type = "state", as_of = "2020-06-03", - additional_metadata = list(other_keys = "pol") - ) - -attr(ex2, "metadata") - - - -# Adding additional keys to an `epi_df` object - -ex3_input <- jhu_csse_county_level_subset \%>\% - dplyr::filter(time_value > "2021-12-01", state_name == "Massachusetts") \%>\% - dplyr::slice_tail(n = 6) - -ex3 <- ex3_input \%>\% - tsibble::as_tsibble() \%>\% # needed to add the additional metadata - # add 2 extra keys - dplyr::mutate( - state = rep("MA", 6), - pol = rep(c("blue", "swing", "swing"), each = 2) - ) \%>\% - # the 2 extra keys we added have to be specified in the other_keys - # component of additional_metadata. 
- as_epi_df(additional_metadata = list(other_keys = c("state", "pol"))) - -attr(ex3, "metadata") -} diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd index 99203052..74591693 100644 --- a/man/epi_archive.Rd +++ b/man/epi_archive.Rd @@ -9,35 +9,33 @@ \usage{ new_epi_archive( x, - geo_type = NULL, - time_type = NULL, - other_keys = NULL, - additional_metadata = NULL, - compactify = NULL, - clobberable_versions_start = NULL, - versions_end = NULL + geo_type, + time_type, + other_keys, + additional_metadata, + compactify, + clobberable_versions_start, + versions_end ) validate_epi_archive( x, - geo_type = NULL, - time_type = NULL, - other_keys = NULL, - additional_metadata = NULL, - compactify = NULL, - clobberable_versions_start = NULL, - versions_end = NULL + other_keys, + additional_metadata, + compactify, + clobberable_versions_start, + versions_end ) as_epi_archive( x, - geo_type = NULL, - time_type = NULL, - other_keys = NULL, - additional_metadata = NULL, + geo_type = deprecated(), + time_type = deprecated(), + other_keys = character(0L), + additional_metadata = list(), compactify = NULL, - clobberable_versions_start = NULL, - .versions_end = NULL, + clobberable_versions_start = NA, + .versions_end = max_version_with_row_in(x), ..., versions_end = .versions_end ) @@ -46,23 +44,22 @@ as_epi_archive( \item{x}{A data.frame, data.table, or tibble, with columns \code{geo_value}, \code{time_value}, \code{version}, and then any additional number of columns.} -\item{geo_type}{Type for the geo values. If missing, then the function will -attempt to infer it from the geo values present; if this fails, then it -will be set to "custom".} +\item{geo_type}{DEPRECATED Has no effect. Geo value type is inferred from the +location column and set to "custom" if not recognized.} -\item{time_type}{Type for the time values. If missing, then the function will -attempt to infer it from the time values present; if this fails, then it -will be set to "custom".} +\item{time_type}{DEPRECATED Has no effect. Time value type inferred from the time +column and set to "custom" if not recognized. Unpredictable behavior may result +if the time type is not recognized.} \item{other_keys}{Character vector specifying the names of variables in \code{x} that should be considered key variables (in the language of \code{data.table}) apart from "geo_value", "time_value", and "version".} \item{additional_metadata}{List of additional metadata to attach to the -\code{epi_archive} object. The metadata will have \code{geo_type} and \code{time_type} -fields; named entries from the passed list or will be included as well.} +\code{epi_archive} object. The metadata will have the \code{geo_type} field; named +entries from the passed list or will be included as well.} -\item{compactify}{Optional; Boolean or \code{NULL}. \code{TRUE} will remove some +\item{compactify}{Optional; Boolean. \code{TRUE} will remove some redundant rows, \code{FALSE} will not, and missing or \code{NULL} will remove redundant rows, but issue a warning. See more information at \code{compactify}.} @@ -126,11 +123,8 @@ later. The data table \code{DT} has key variables \code{geo_value}, \code{time_value}, \code{version}, as well as any others (these can be specified when instantiating the \code{epi_archive} object via the \code{other_keys} argument, and/or set by operating -on \code{DT} directly). Refer to the documentation for \code{as_epi_archive()} for -information and examples of relevant parameter names for an \code{epi_archive} -object. 
Note that there can only be a single row per unique combination of -key variables, and thus the key variables are critical for figuring out how -to generate a snapshot of data from the archive, as of a given version. +on \code{DT} directly). Note that there can only be a single row per unique +combination of key variables. } \section{Metadata}{ @@ -142,20 +136,17 @@ object: \item \code{additional_metadata}: list of additional metadata for the data archive. } -Unlike an \code{epi_df} object, metadata for an \code{epi_archive} object \code{x} can be -accessed (and altered) directly, as in \code{x$geo_type} or \code{x$time_type}, -etc. Like an \code{epi_df} object, the \code{geo_type} and \code{time_type} fields in the -metadata of an \code{epi_archive} object are not currently used by any -downstream functions in the \code{epiprocess} package, and serve only as useful -bits of information to convey about the data set at hand. +While this metadata is not protected, it is generally recommended to treat it +as read-only, and to use the \code{epi_archive} methods to interact with the data +archive. Unexpected behavior may result from modifying the metadata +directly. } \section{Generating Snapshots}{ An \code{epi_archive} object can be used to generate a snapshot of the data in -\code{epi_df} format, which represents the most up-to-date values of the signal -variables, as of the specified version. This is accomplished by calling -\code{epix_as_of()}. +\code{epi_df} format, which represents the most up-to-date time series values up +to a point in time. This is accomplished by calling \code{epix_as_of()}. } \section{Sliding Computations}{ @@ -182,10 +173,7 @@ tib <- tibble::tibble( value = rnorm(10, mean = 2, sd = 1) ) -toy_epi_archive <- tib \%>\% as_epi_archive( - geo_type = "state", - time_type = "day" -) +toy_epi_archive <- tib \%>\% as_epi_archive() toy_epi_archive # Ex. with an additional key for county @@ -208,10 +196,6 @@ df <- data.frame( cases_rate = c(0.01, 0.02, 0.01, 0.05) ) -x <- df \%>\% as_epi_archive( - geo_type = "state", - time_type = "day", - other_keys = "county" -) +x <- df \%>\% as_epi_archive(other_keys = "county") } diff --git a/man/epi_df.Rd b/man/epi_df.Rd index 4e5af146..dbb4a917 100644 --- a/man/epi_df.Rd +++ b/man/epi_df.Rd @@ -1,19 +1,78 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/epi_df.R -\name{epi_df} -\alias{epi_df} +\name{new_epi_df} +\alias{new_epi_df} +\alias{as_epi_df} +\alias{as_epi_df.epi_df} +\alias{as_epi_df.tbl_df} +\alias{as_epi_df.data.frame} +\alias{as_epi_df.tbl_ts} \title{\code{epi_df} object} +\usage{ +new_epi_df( + x = tibble::tibble(), + geo_type, + time_type, + as_of, + additional_metadata = list() +) + +as_epi_df(x, ...) + +\method{as_epi_df}{epi_df}(x, ...) + +\method{as_epi_df}{tbl_df}( + x, + geo_type = deprecated(), + time_type = deprecated(), + as_of, + additional_metadata = list(), + ... +) + +\method{as_epi_df}{data.frame}(x, as_of, additional_metadata = list(), ...) + +\method{as_epi_df}{tbl_ts}(x, as_of, additional_metadata = list(), ...) +} +\arguments{ +\item{x}{A data.frame, \link[tibble:tibble]{tibble::tibble}, or \link[tsibble:tsibble]{tsibble::tsibble} to be converted} + +\item{geo_type}{DEPRECATED Has no effect. Geo value type is inferred from the +location column and set to "custom" if not recognized.} + +\item{time_type}{DEPRECATED Has no effect. Time value type inferred from the time +column and set to "custom" if not recognized. 
Unpredictable behavior may result +if the time type is not recognized.} + +\item{as_of}{Time value representing the time at which the given data were +available. For example, if \code{as_of} is January 31, 2022, then the \code{epi_df} +object that is created would represent the most up-to-date version of the +data available as of January 31, 2022. If the \code{as_of} argument is missing, +then the current day-time will be used.} + +\item{additional_metadata}{List of additional metadata to attach to the +\code{epi_df} object. The metadata will have \code{geo_type}, \code{time_type}, and +\code{as_of} fields; named entries from the passed list will be included as +well. If your tibble has additional keys, be sure to specify them as a +character vector in the \code{other_keys} component of \code{additional_metadata}.} + +\item{...}{Additional arguments passed to methods.} +} +\value{ +An \code{epi_df} object. +} \description{ -An \code{epi_df} is a tibble with certain minimal column structure -and metadata. It can be seen as a snapshot of a data set that contains the -most up-to-date values of some signal variables of interest, as of a given -time. +An \code{epi_df} is a tibble with certain minimal column structure and metadata. +It can be seen as a snapshot of a data set that contains the most +up-to-date values of some signal variables of interest, as of a given time. } \details{ An \code{epi_df} is a tibble with (at least) the following columns: \itemize{ -\item \code{geo_value}: the geographic value associated with each row of measurements. -\item \code{time_value}: the time value associated with each row of measurements. +\item \code{geo_value}: A character vector representing the geographical unit of +observation. This could be a country code, a state name, a county code, +etc. +\item \code{time_value}: A date or integer vector representing the time of observation. } Other columns can be considered as measured variables, which we also refer to @@ -21,17 +80,25 @@ as signal variables. An \code{epi_df} object also has metadata with (at least) the following fields: \itemize{ \item \code{geo_type}: the type for the geo values. -\item \code{time_type}: the type for the time values. \item \code{as_of}: the time value at which the given data were available. } +Most users should use \code{as_epi_df}. The input tibble \code{x} to the constructor +must contain the columns \code{geo_value} and \code{time_value}. All other columns +will be preserved as is, and treated as measured variables. If \code{as_of} is +missing, then the function will try to guess it from an \code{as_of}, \code{issue}, +or \code{version} column of \code{x} (if any of these are present), or from as an +\code{as_of} field in its metadata (stored in its attributes); if this fails, +then the current day-time will be used. The \code{new_epi_df} constructor +assumes its arguments have already been validated, so it should mainly be +used by advanced users. + Metadata for an \code{epi_df} object \code{x} can be accessed (and altered) via -\code{attributes(x)$metadata}. The first two fields in the above list, -\code{geo_type} and \code{time_type}, can usually be inferred from the \code{geo_value} -and \code{time_value} columns, respectively. They are not currently used by any -downstream functions in the \code{epiprocess} package, and serve only as useful -bits of information to convey about the data set at hand. More information -on their coding is given below. +\code{attributes(x)$metadata}. 
The first field in the above list, \code{geo_type}, +can usually be inferred from the \code{geo_value} columns. They are not +currently used by any downstream functions in the \code{epiprocess} package, +and serve only as useful bits of information to convey about the data set +at hand. More information on their coding is given below. The last field in the above list, \code{as_of}, is one of the most unique aspects of an \code{epi_df} object. In brief, we can think of an \code{epi_df} object as a @@ -72,25 +139,86 @@ An unrecognizable geo type is labeled "custom". The following time types are recognized in an \code{epi_df}. \itemize{ -\item \code{"day-time"}: each observation corresponds to a time on a given day -(measured to the second); coded as a \code{POSIXct} object, as in -\code{as.POSIXct("2022-01-31 18:45:40")}. \item \code{"day"}: each observation corresponds to a day; coded as a \code{Date} object, as in \code{as.Date("2022-01-31")}. \item \code{"week"}: each observation corresponds to a week; the alignment can be arbitrary (as to whether a week starts on a Monday, Tuesday); coded as a \code{Date} object, representing the start date of week. -\item \code{"yearweek"}: each observation corresponds to a week; the alignment can be -arbitrary; coded as a \code{tsibble::yearweek} object, where the alignment is -stored in the \code{week_start} field of its attributes. \item \code{"yearmonth"}: each observation corresponds to a month; coded as a \code{tsibble::yearmonth} object. -\item \code{"yearquarter"}: each observation corresponds to a quarter; coded as a -\code{tsibble::yearquarter} object. -\item \code{"year"}: each observation corresponds to a year; coded as an integer -greater than or equal to 1582. +\item \code{"integer"}: a generic integer index (e.g. years or something else). } An unrecognizable time type is labeled "custom". } +\examples{ +# Convert a `tsibble` that has county code as an extra key +# Notice that county code should be a character string to preserve any leading zeroes + +ex1_input <- tibble::tibble( + geo_value = rep(c("ca", "fl", "pa"), each = 3), + county_code = c( + "06059", "06061", "06067", + "12111", "12113", "12117", + "42101", "42103", "42105" + ), + time_value = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"), + by = "day" + ), length.out = length(geo_value)), + value = 1:length(geo_value) + 0.01 * rnorm(length(geo_value)) +) \%>\% + tsibble::as_tsibble(index = time_value, key = c(geo_value, county_code)) + +# The `other_keys` metadata (`"county_code"` in this case) is automatically +# inferred from the `tsibble`'s `key`: +ex1 <- as_epi_df(x = ex1_input, as_of = "2020-06-03") +attr(ex1, "metadata")[["other_keys"]] + + +# Dealing with misspecified column names: +# Geographical and temporal information must be provided in columns named +# `geo_value` and `time_value`; if we start from a data frame with a +# different format, it must be converted to use `geo_value` and `time_value` +# before calling `as_epi_df`. 
+ +ex2_input <- tibble::tibble( + state = rep(c("ca", "fl", "pa"), each = 3), # misnamed + pol = rep(c("blue", "swing", "swing"), each = 3), # extra key + reported_date = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"), + by = "day" + ), length.out = length(state)), # misnamed + value = 1:length(state) + 0.01 * rnorm(length(state)) +) + +print(ex2_input) + +ex2 <- ex2_input \%>\% + dplyr::rename(geo_value = state, time_value = reported_date) \%>\% + as_epi_df( + as_of = "2020-06-03", + additional_metadata = list(other_keys = "pol") + ) + +attr(ex2, "metadata") + + +# Adding additional keys to an `epi_df` object + +ex3_input <- jhu_csse_county_level_subset \%>\% + dplyr::filter(time_value > "2021-12-01", state_name == "Massachusetts") \%>\% + dplyr::slice_tail(n = 6) + +ex3 <- ex3_input \%>\% + tsibble::as_tsibble() \%>\% # needed to add the additional metadata + # add 2 extra keys + dplyr::mutate( + state = rep("MA", 6), + pol = rep(c("blue", "swing", "swing"), each = 2) + ) \%>\% + # the 2 extra keys we added have to be specified in the other_keys + # component of additional_metadata. + as_epi_df(additional_metadata = list(other_keys = c("state", "pol"))) + +attr(ex3, "metadata") +} diff --git a/man/epi_slide.Rd b/man/epi_slide.Rd index a1319f99..5f4db7b4 100644 --- a/man/epi_slide.Rd +++ b/man/epi_slide.Rd @@ -8,10 +8,9 @@ epi_slide( x, f, ..., - before, - after, - ref_time_values, - time_step, + before = NULL, + after = NULL, + ref_time_values = NULL, new_col_name = "slide_value", as_list_col = FALSE, names_sep = "_", @@ -47,35 +46,35 @@ as in \code{dplyr} verbs, and can also refer to \code{.x}, \code{.group_key}, an \item{before, after}{How far \code{before} and \code{after} each \code{ref_time_value} should the sliding window extend? At least one of these two arguments must be -provided; the other's default will be 0. Any value provided for either -argument must be a single, non-\code{NA}, non-negative, -\link[vctrs:vec_cast]{integer-compatible} number of time steps. Endpoints of -the window are inclusive. Common settings: +provided; the other's default will be 0. The accepted values for these +depend on the type of the \code{time_value} column: \itemize{ -\item For trailing/right-aligned windows from \code{ref_time_value - time_step (k)} to \code{ref_time_value}: either pass \code{before=k} by itself, or pass -\verb{before=k, after=0}. -\item For center-aligned windows from \code{ref_time_value - time_step(k)} to -\code{ref_time_value + time_step(k)}: pass \verb{before=k, after=k}. +\item if it is a Date and the cadence is daily, then they can be integers +(which will be interpreted in units of days) or difftimes with units +"days" +\item if it is a Date and the cadence is weekly, then they must be difftimes +with units "weeks" +\item if it is an integer, then they must be integers +} + +Endpoints of the window are inclusive. Common settings: +\itemize{ +\item For trailing/right-aligned windows from \code{ref_time_value - k} to +\code{ref_time_value}: either pass \code{before=k} by itself, or pass \verb{before=k, after=0}. +\item For center-aligned windows from \code{ref_time_value - k} to +\code{ref_time_value + k}: pass \verb{before=k, after=k}. \item For leading/left-aligned windows from \code{ref_time_value} to -\code{ref_time_value + time_step(k)}: either pass pass \code{after=k} by itself, +\code{ref_time_value + k}: either pass pass \code{after=k} by itself, or pass \verb{before=0, after=k}. 
-See "Details:" about the definition of a time step,(non)treatment of -missing rows within the window, and avoiding warnings about -\code{before}&\code{after} settings for a certain uncommon use case. -}} +} + +See "Details:" on how missing rows are handled within the window.} \item{ref_time_values}{Time values for sliding computations, meaning, each element of this vector serves as the reference time point for one sliding window. If missing, then this will be set to all unique time values in the underlying data table, by default.} -\item{time_step}{Optional function used to define the meaning of one time -step, which if specified, overrides the default choice based on the -\code{time_value} column. This function must take a non-negative integer and -return an object of class \link[lubridate:period]{lubridate::period}. For example, we can use -\code{time_step = lubridate::hours} in order to set the time step to be one hour -(this would only be meaningful if \code{time_value} is of class \code{POSIXct}).} - \item{new_col_name}{String indicating the name of the new column that will contain the derivative values. Default is "slide_value"; note that setting \code{new_col_name} equal to an existing column name will overwrite this column.} @@ -115,31 +114,21 @@ for examples. To "slide" means to apply a function or formula over a rolling window of time steps for each data group, where the window is centered at a reference time and left and right endpoints are given by the \code{before} and -\code{after} arguments. The unit (the meaning of one time step) is implicitly -defined by the way the \code{time_value} column treats addition and subtraction; -for example, if the time values are coded as \code{Date} objects, then one time -step is one day, since \code{as.Date("2022-01-01") + 1} equals -\code{as.Date("2022-01-02")}. Alternatively, the time step can be set explicitly -using the \code{time_step} argument (which if specified would override the -default choice based on \code{time_value} column). If there are not enough time -steps available to complete the window at any given reference time, then -\code{epi_slide()} still attempts to perform the computation anyway (it does not -require a complete window). The issue of what to do with partial -computations (those run on incomplete windows) is therefore left up to the -user, either through the specified function or formula \code{f}, or through -post-processing. For a centrally-aligned slide of \code{n} \code{time_value}s in a -sliding window, set \code{before = (n-1)/2} and \code{after = (n-1)/2} when the -number of \code{time_value}s in a sliding window is odd and \code{before = n/2-1} and -\code{after = n/2} when \code{n} is even. +\code{after} arguments. + +If there are not enough time steps available to complete the window at any +given reference time, then \code{epi_slide()} still attempts to perform the +computation anyway (it does not require a complete window). The issue of +what to do with partial computations (those run on incomplete windows) is +therefore left up to the user, either through the specified function or +formula \code{f}, or through post-processing. For a centrally-aligned slide of +\code{n} \code{time_value}s in a sliding window, set \code{before = (n-1)/2} and \code{after = (n-1)/2} when the number of \code{time_value}s in a sliding window is odd and +\code{before = n/2-1} and \code{after = n/2} when \code{n} is even. 
Sometimes, we want to experiment with various trailing or leading window widths and compare the slide outputs. In the (uncommon) case where zero-width windows are considered, manually pass both the \code{before} and -\code{after} arguments in order to prevent potential warnings. (E.g., \code{before=k} -with \code{k=0} and \code{after} missing may produce a warning. To avoid warnings, -use \verb{before=k, after=0} instead; otherwise, it looks too much like a -leading window was intended, but the \code{after} argument was forgotten or -misspelled.) +\code{after} arguments. If \code{f} is missing, then an expression for tidy evaluation can be specified, for example, as in: diff --git a/man/epi_slide_mean.Rd b/man/epi_slide_mean.Rd index 850a45a1..aeb56729 100644 --- a/man/epi_slide_mean.Rd +++ b/man/epi_slide_mean.Rd @@ -8,10 +8,9 @@ epi_slide_mean( x, col_names, ..., - before, - after, - ref_time_values, - time_step, + before = NULL, + after = NULL, + ref_time_values = NULL, new_col_name = NULL, as_list_col = NULL, names_sep = NULL, @@ -41,35 +40,35 @@ passed the data \code{x} to operate on, the window size \code{n}, and the alignm \item{before, after}{How far \code{before} and \code{after} each \code{ref_time_value} should the sliding window extend? At least one of these two arguments must be -provided; the other's default will be 0. Any value provided for either -argument must be a single, non-\code{NA}, non-negative, -\link[vctrs:vec_cast]{integer-compatible} number of time steps. Endpoints of -the window are inclusive. Common settings: +provided; the other's default will be 0. The accepted values for these +depend on the type of the \code{time_value} column: \itemize{ -\item For trailing/right-aligned windows from \code{ref_time_value - time_step (k)} to \code{ref_time_value}: either pass \code{before=k} by itself, or pass -\verb{before=k, after=0}. -\item For center-aligned windows from \code{ref_time_value - time_step(k)} to -\code{ref_time_value + time_step(k)}: pass \verb{before=k, after=k}. +\item if it is a Date and the cadence is daily, then they can be integers +(which will be interpreted in units of days) or difftimes with units +"days" +\item if it is a Date and the cadence is weekly, then they must be difftimes +with units "weeks" +\item if it is an integer, then they must be integers +} + +Endpoints of the window are inclusive. Common settings: +\itemize{ +\item For trailing/right-aligned windows from \code{ref_time_value - k} to +\code{ref_time_value}: either pass \code{before=k} by itself, or pass \verb{before=k, after=0}. +\item For center-aligned windows from \code{ref_time_value - k} to +\code{ref_time_value + k}: pass \verb{before=k, after=k}. \item For leading/left-aligned windows from \code{ref_time_value} to -\code{ref_time_value + time_step(k)}: either pass pass \code{after=k} by itself, +\code{ref_time_value + k}: either pass pass \code{after=k} by itself, or pass \verb{before=0, after=k}. -See "Details:" about the definition of a time step,(non)treatment of -missing rows within the window, and avoiding warnings about -\code{before}&\code{after} settings for a certain uncommon use case. -}} +} + +See "Details:" on how missing rows are handled within the window.} \item{ref_time_values}{Time values for sliding computations, meaning, each element of this vector serves as the reference time point for one sliding window. 
If missing, then this will be set to all unique time values in the underlying data table, by default.} -\item{time_step}{Optional function used to define the meaning of one time -step, which if specified, overrides the default choice based on the -\code{time_value} column. This function must take a non-negative integer and -return an object of class \link[lubridate:period]{lubridate::period}. For example, we can use -\code{time_step = lubridate::hours} in order to set the time step to be one hour -(this would only be meaningful if \code{time_value} is of class \code{POSIXct}).} - \item{new_col_name}{Character vector indicating the name(s) of the new column(s) that will contain the derivative values. Default is "slide_value"; note that setting \code{new_col_name} equal to any existing @@ -105,29 +104,19 @@ examples. Wrapper around \code{epi_slide_opt} with \code{f = datatable::frollmean}. To "slide" means to apply a function over a rolling window of time -steps for each data group, where the window is centered at a reference -time and left and right endpoints are given by the \code{before} and \code{after} -arguments. The unit (the meaning of one time step) is implicitly defined -by the way the \code{time_value} column treats addition and subtraction; for -example, if the time values are coded as \code{Date} objects, then one time -step is one day, since \code{as.Date("2022-01-01") + 1} equals \code{as.Date ("2022-01-02")}. Alternatively, the time step can be set explicitly using -the \code{time_step} argument (which if specified would override the default -choice based on \code{time_value} column). If there are not enough time steps -available to complete the window at any given reference time, then -\verb{epi_slide_*()} will fail; it requires a complete window to perform the -computation. For a centrally-aligned slide of \code{n} \code{time_value}s in a -sliding window, set \code{before = (n-1)/2} and \code{after = (n-1)/2} when the -number of \code{time_value}s in a sliding window is odd and \code{before = n/2-1} -and \code{after = n/2} when \code{n} is even. +steps for each data group, where the window is centered at a reference time +and left and right endpoints are given by the \code{before} and \code{after} +arguments. +If there are not enough time steps available to complete the window at any +given reference time, then \verb{epi_slide_*()} will fail; it requires a +complete window to perform the computation. For a centrally-aligned slide +of \code{n} \code{time_value}s in a sliding window, set \code{before = (n-1)/2} and \code{after = (n-1)/2} when the number of \code{time_value}s in a sliding window is odd and +\code{before = n/2-1} and \code{after = n/2} when \code{n} is even. Sometimes, we want to experiment with various trailing or leading window widths and compare the slide outputs. In the (uncommon) case where zero-width windows are considered, manually pass both the \code{before} and -\code{after} arguments in order to prevent potential warnings. (E.g., \code{before=k} -with \code{k=0} and \code{after} missing may produce a warning. To avoid warnings, -use \verb{before=k, after=0} instead; otherwise, it looks too much like a -leading window was intended, but the \code{after} argument was forgotten or -misspelled.) +\code{after} arguments. 
} \examples{ # slide a 7-day trailing average formula on cases diff --git a/man/epi_slide_opt.Rd b/man/epi_slide_opt.Rd index 4b011c16..629134d5 100644 --- a/man/epi_slide_opt.Rd +++ b/man/epi_slide_opt.Rd @@ -9,10 +9,9 @@ epi_slide_opt( col_names, f, ..., - before, - after, - ref_time_values, - time_step, + before = NULL, + after = NULL, + ref_time_values = NULL, new_col_name = NULL, as_list_col = NULL, names_sep = NULL, @@ -62,35 +61,35 @@ points \code{before} and \code{after} to use in the computation.} \item{before, after}{How far \code{before} and \code{after} each \code{ref_time_value} should the sliding window extend? At least one of these two arguments must be -provided; the other's default will be 0. Any value provided for either -argument must be a single, non-\code{NA}, non-negative, -\link[vctrs:vec_cast]{integer-compatible} number of time steps. Endpoints of -the window are inclusive. Common settings: +provided; the other's default will be 0. The accepted values for these +depend on the type of the \code{time_value} column: \itemize{ -\item For trailing/right-aligned windows from \code{ref_time_value - time_step (k)} to \code{ref_time_value}: either pass \code{before=k} by itself, or pass -\verb{before=k, after=0}. -\item For center-aligned windows from \code{ref_time_value - time_step(k)} to -\code{ref_time_value + time_step(k)}: pass \verb{before=k, after=k}. +\item if it is a Date and the cadence is daily, then they can be integers +(which will be interpreted in units of days) or difftimes with units +"days" +\item if it is a Date and the cadence is weekly, then they must be difftimes +with units "weeks" +\item if it is an integer, then they must be integers +} + +Endpoints of the window are inclusive. Common settings: +\itemize{ +\item For trailing/right-aligned windows from \code{ref_time_value - k} to +\code{ref_time_value}: either pass \code{before=k} by itself, or pass \verb{before=k, after=0}. +\item For center-aligned windows from \code{ref_time_value - k} to +\code{ref_time_value + k}: pass \verb{before=k, after=k}. \item For leading/left-aligned windows from \code{ref_time_value} to -\code{ref_time_value + time_step(k)}: either pass pass \code{after=k} by itself, +\code{ref_time_value + k}: either pass pass \code{after=k} by itself, or pass \verb{before=0, after=k}. -See "Details:" about the definition of a time step,(non)treatment of -missing rows within the window, and avoiding warnings about -\code{before}&\code{after} settings for a certain uncommon use case. -}} +} + +See "Details:" on how missing rows are handled within the window.} \item{ref_time_values}{Time values for sliding computations, meaning, each element of this vector serves as the reference time point for one sliding window. If missing, then this will be set to all unique time values in the underlying data table, by default.} -\item{time_step}{Optional function used to define the meaning of one time -step, which if specified, overrides the default choice based on the -\code{time_value} column. This function must take a non-negative integer and -return an object of class \link[lubridate:period]{lubridate::period}. For example, we can use -\code{time_step = lubridate::hours} in order to set the time step to be one hour -(this would only be meaningful if \code{time_value} is of class \code{POSIXct}).} - \item{new_col_name}{Character vector indicating the name(s) of the new column(s) that will contain the derivative values. 
Default is "slide_value"; note that setting \code{new_col_name} equal to any existing @@ -126,29 +125,19 @@ for examples. } \details{ To "slide" means to apply a function over a rolling window of time -steps for each data group, where the window is centered at a reference -time and left and right endpoints are given by the \code{before} and \code{after} -arguments. The unit (the meaning of one time step) is implicitly defined -by the way the \code{time_value} column treats addition and subtraction; for -example, if the time values are coded as \code{Date} objects, then one time -step is one day, since \code{as.Date("2022-01-01") + 1} equals \code{as.Date ("2022-01-02")}. Alternatively, the time step can be set explicitly using -the \code{time_step} argument (which if specified would override the default -choice based on \code{time_value} column). If there are not enough time steps -available to complete the window at any given reference time, then -\verb{epi_slide_*()} will fail; it requires a complete window to perform the -computation. For a centrally-aligned slide of \code{n} \code{time_value}s in a -sliding window, set \code{before = (n-1)/2} and \code{after = (n-1)/2} when the -number of \code{time_value}s in a sliding window is odd and \code{before = n/2-1} -and \code{after = n/2} when \code{n} is even. +steps for each data group, where the window is centered at a reference time +and left and right endpoints are given by the \code{before} and \code{after} +arguments. +If there are not enough time steps available to complete the window at any +given reference time, then \verb{epi_slide_*()} will fail; it requires a +complete window to perform the computation. For a centrally-aligned slide +of \code{n} \code{time_value}s in a sliding window, set \code{before = (n-1)/2} and \code{after = (n-1)/2} when the number of \code{time_value}s in a sliding window is odd and +\code{before = n/2-1} and \code{after = n/2} when \code{n} is even. Sometimes, we want to experiment with various trailing or leading window widths and compare the slide outputs. In the (uncommon) case where zero-width windows are considered, manually pass both the \code{before} and -\code{after} arguments in order to prevent potential warnings. (E.g., \code{before=k} -with \code{k=0} and \code{after} missing may produce a warning. To avoid warnings, -use \verb{before=k, after=0} instead; otherwise, it looks too much like a -leading window was intended, but the \code{after} argument was forgotten or -misspelled.) +\code{after} arguments. } \examples{ # slide a 7-day trailing average formula on cases. This can also be done with `epi_slide_mean` diff --git a/man/epi_slide_sum.Rd b/man/epi_slide_sum.Rd index 8c835bdb..7bf92e23 100644 --- a/man/epi_slide_sum.Rd +++ b/man/epi_slide_sum.Rd @@ -8,10 +8,9 @@ epi_slide_sum( x, col_names, ..., - before, - after, - ref_time_values, - time_step, + before = NULL, + after = NULL, + ref_time_values = NULL, new_col_name = NULL, as_list_col = NULL, names_sep = NULL, @@ -41,35 +40,35 @@ passed the data \code{x} to operate on, the window size \code{n}, and the alignm \item{before, after}{How far \code{before} and \code{after} each \code{ref_time_value} should the sliding window extend? At least one of these two arguments must be -provided; the other's default will be 0. Any value provided for either -argument must be a single, non-\code{NA}, non-negative, -\link[vctrs:vec_cast]{integer-compatible} number of time steps. Endpoints of -the window are inclusive. 
Common settings:
+provided; the other's default will be 0. The accepted values for these
+depend on the type of the \code{time_value} column:
 \itemize{
-\item For trailing/right-aligned windows from \code{ref_time_value - time_step
-(k)} to \code{ref_time_value}: either pass \code{before=k} by itself, or pass
-\verb{before=k, after=0}.
-\item For center-aligned windows from \code{ref_time_value - time_step(k)} to
-\code{ref_time_value + time_step(k)}: pass \verb{before=k, after=k}.
+\item if it is a Date and the cadence is daily, then they can be integers
+(which will be interpreted in units of days) or difftimes with units
+"days"
+\item if it is a Date and the cadence is weekly, then they must be difftimes
+with units "weeks"
+\item if it is an integer, then they must be integers
+}
+
+Endpoints of the window are inclusive. Common settings:
+\itemize{
+\item For trailing/right-aligned windows from \code{ref_time_value - k} to
+\code{ref_time_value}: either pass \code{before=k} by itself, or pass \verb{before=k, after=0}.
+\item For center-aligned windows from \code{ref_time_value - k} to
+\code{ref_time_value + k}: pass \verb{before=k, after=k}.
 \item For leading/left-aligned windows from \code{ref_time_value} to
-\code{ref_time_value + time_step(k)}: either pass pass \code{after=k} by itself,
+\code{ref_time_value + k}: either pass \code{after=k} by itself,
 or pass \verb{before=0, after=k}.
-See "Details:" about the definition of a time step,(non)treatment of
-missing rows within the window, and avoiding warnings about
-\code{before}&\code{after} settings for a certain uncommon use case.
-}}
+}
+
+See "Details:" on how missing rows are handled within the window.}
 \item{ref_time_values}{Time values for sliding computations, meaning, each
 element of this vector serves as the reference time point for one sliding
 window. If missing, then this will be set to all unique time values in the
 underlying data table, by default.}
-\item{time_step}{Optional function used to define the meaning of one time
-step, which if specified, overrides the default choice based on the
-\code{time_value} column. This function must take a non-negative integer and
-return an object of class \link[lubridate:period]{lubridate::period}. For example, we can use
-\code{time_step = lubridate::hours} in order to set the time step to be one hour
-(this would only be meaningful if \code{time_value} is of class \code{POSIXct}).}
-
 \item{new_col_name}{Character vector indicating the name(s) of the new
 column(s) that will contain the derivative values.
 Default is "slide_value"; note that setting \code{new_col_name} equal to any existing
@@ -105,29 +104,19 @@ examples.
 Wrapper around \code{epi_slide_opt} with \code{f = data.table::frollsum}.
 
 To "slide" means to apply a function over a rolling window of time
-steps for each data group, where the window is centered at a reference
-time and left and right endpoints are given by the \code{before} and \code{after}
-arguments. The unit (the meaning of one time step) is implicitly defined
-by the way the \code{time_value} column treats addition and subtraction; for
-example, if the time values are coded as \code{Date} objects, then one time
-step is one day, since \code{as.Date("2022-01-01") + 1} equals \code{as.Date
-("2022-01-02")}. Alternatively, the time step can be set explicitly using
-the \code{time_step} argument (which if specified would override the default
-choice based on \code{time_value} column).
If there are not enough time steps -available to complete the window at any given reference time, then -\verb{epi_slide_*()} will fail; it requires a complete window to perform the -computation. For a centrally-aligned slide of \code{n} \code{time_value}s in a -sliding window, set \code{before = (n-1)/2} and \code{after = (n-1)/2} when the -number of \code{time_value}s in a sliding window is odd and \code{before = n/2-1} -and \code{after = n/2} when \code{n} is even. +steps for each data group, where the window is centered at a reference time +and left and right endpoints are given by the \code{before} and \code{after} +arguments. +If there are not enough time steps available to complete the window at any +given reference time, then \verb{epi_slide_*()} will fail; it requires a +complete window to perform the computation. For a centrally-aligned slide +of \code{n} \code{time_value}s in a sliding window, set \code{before = (n-1)/2} and \code{after = (n-1)/2} when the number of \code{time_value}s in a sliding window is odd and +\code{before = n/2-1} and \code{after = n/2} when \code{n} is even. Sometimes, we want to experiment with various trailing or leading window widths and compare the slide outputs. In the (uncommon) case where zero-width windows are considered, manually pass both the \code{before} and -\code{after} arguments in order to prevent potential warnings. (E.g., \code{before=k} -with \code{k=0} and \code{after} missing may produce a warning. To avoid warnings, -use \verb{before=k, after=0} instead; otherwise, it looks too much like a -leading window was intended, but the \code{after} argument was forgotten or -misspelled.) +\code{after} arguments. } \examples{ # slide a 7-day trailing sum formula on cases diff --git a/man/epix_as_of.Rd b/man/epix_as_of.Rd index 42b121fa..4ab23882 100644 --- a/man/epix_as_of.Rd +++ b/man/epix_as_of.Rd @@ -62,7 +62,7 @@ archive_cases_dv_subset2 <- as_epi_archive( # (a.k.a. "hotfixed", "clobbered", etc.): clobberable_versions_start = max(archive_cases_dv_subset$DT$version), # Suppose today is the following day, and there are no updates out yet: - versions_end <- max(archive_cases_dv_subset$DT$version) + 1L, + versions_end = max(archive_cases_dv_subset$DT$version) + 1L, compactify = TRUE ) diff --git a/man/epix_slide.Rd b/man/epix_slide.Rd index c8f09594..2789cb01 100644 --- a/man/epix_slide.Rd +++ b/man/epix_slide.Rd @@ -10,9 +10,8 @@ epix_slide( x, f, ..., - before, - ref_time_values, - time_step, + before = Inf, + ref_time_values = NULL, new_col_name = "slide_value", as_list_col = FALSE, names_sep = "_", @@ -23,9 +22,8 @@ epix_slide( x, f, ..., - before, - ref_time_values, - time_step, + before = Inf, + ref_time_values = NULL, new_col_name = "slide_value", as_list_col = FALSE, names_sep = "_", @@ -36,9 +34,8 @@ epix_slide( x, f, ..., - before, - ref_time_values, - time_step, + before = Inf, + ref_time_values = NULL, new_col_name = "slide_value", as_list_col = FALSE, names_sep = "_", @@ -97,12 +94,6 @@ set to a regularly-spaced sequence of values set to cover the range of \code{version}s in the \code{DT} plus the \code{versions_end}; the spacing of values will be guessed (using the GCD of the skips between values).} -\item{time_step}{Optional function used to define the meaning of one time -step, which if specified, overrides the default choice based on the -\code{time_value} column. This function must take a positive integer and return -an object of class \code{lubridate::period}. 
For example, we can use \code{time_step = lubridate::hours} in order to set the time step to be one hour (this -would only be meaningful if \code{time_value} is of class \code{POSIXct}).} - \item{new_col_name}{String indicating the name of the new column that will contain the derivative values. Default is "slide_value"; note that setting \code{new_col_name} equal to an existing column name will overwrite this column.} diff --git a/man/new_epi_df.Rd b/man/new_epi_df.Rd deleted file mode 100644 index 8010b700..00000000 --- a/man/new_epi_df.Rd +++ /dev/null @@ -1,45 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/epi_df.R -\name{new_epi_df} -\alias{new_epi_df} -\title{Creates an \code{epi_df} object} -\usage{ -new_epi_df( - x = tibble::tibble(), - geo_type, - time_type, - as_of, - additional_metadata = list() -) -} -\arguments{ -\item{x}{A data.frame, \link[tibble:tibble]{tibble::tibble}, or \link[tsibble:tsibble]{tsibble::tsibble} to be converted} - -\item{geo_type}{Type for the geo values. If missing, then the function will -attempt to infer it from the geo values present; if this fails, then it -will be set to "custom".} - -\item{time_type}{Type for the time values. If missing, then the function will -attempt to infer it from the time values present; if this fails, then it -will be set to "custom".} - -\item{as_of}{Time value representing the time at which the given data were -available. For example, if \code{as_of} is January 31, 2022, then the \code{epi_df} -object that is created would represent the most up-to-date version of the -data available as of January 31, 2022. If the \code{as_of} argument is missing, -then the current day-time will be used.} - -\item{additional_metadata}{List of additional metadata to attach to the -\code{epi_df} object. The metadata will have \code{geo_type}, \code{time_type}, and -\code{as_of} fields; named entries from the passed list will be included as -well. If your tibble has additional keys, be sure to specify them as a -character vector in the \code{other_keys} component of \code{additional_metadata}.} -} -\value{ -An \code{epi_df} object. -} -\description{ -Creates a new \code{epi_df} object. By default, builds an empty tibble with the -correct metadata for an \code{epi_df} object (ie. \code{geo_type}, \code{time_type}, and \code{as_of}). -Refer to the below info. about the arguments for more details. 
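For orientation, a minimal sketch (not part of this patch) of the slide interface the man-page changes above document: `time_step` is removed, and for daily `Date` data the `before`/`after` window bounds may be plain integers (interpreted in days) or difftimes. The toy data and `days_dt` helper below simply mirror the conventions of the updated tests later in this patch.

library(dplyr)
library(epiprocess)

test_date <- as.Date("2020-01-01")
days_dt <- as.difftime(1, units = "days") # one day, as in the updated tests

edf <- tibble::tibble(
  geo_value = "ak",
  time_value = test_date + 1:10,
  value = 1:10
) %>%
  as_epi_df(as_of = test_date + 11)

# 7-day trailing mean; for a daily Date cadence, `before` may be an integer...
edf %>% epi_slide_mean(value, before = 6L, na.rm = TRUE)
# ...or an equivalent difftime in days:
edf %>% epi_slide_mean(value, before = 6 * days_dt, na.rm = TRUE)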
-} diff --git a/tests/testthat/test-archive-version-bounds.R b/tests/testthat/test-archive-version-bounds.R index d36fcab1..d12c4060 100644 --- a/tests/testthat/test-archive-version-bounds.R +++ b/tests/testthat/test-archive-version-bounds.R @@ -89,7 +89,7 @@ test_that("`validate_version_bound` validate and class checks together allow and test_that("archive version bounds args work as intended", { measurement_date <- as.Date("2000-01-01") update_tbl <- tibble::tibble( - geo_value = "g1", + geo_value = "ak", time_value = measurement_date, version = measurement_date + 1:5, value = 1:5 diff --git a/tests/testthat/test-archive.R b/tests/testthat/test-archive.R index d437c983..ac5aee8d 100644 --- a/tests/testthat/test-archive.R +++ b/tests/testthat/test-archive.R @@ -67,21 +67,22 @@ test_that("other_keys cannot contain names geo_value, time_value or version", { ) }) -test_that("Warning thrown when other_metadata contains overlapping names with geo_type or time_type fields", { +test_that("Warning thrown when other_metadata contains overlapping names with geo_type field", { expect_warning(as_epi_archive(dt, additional_metadata = list(geo_type = 1), compactify = FALSE), - regexp = "`additional_metadata` names overlap with existing metadata fields \"geo_type\", \"time_type\"." + regexp = "`additional_metadata` names overlap with existing metadata fields" ) expect_warning(as_epi_archive(dt, additional_metadata = list(time_type = 1), compactify = FALSE), - regexp = "`additional_metadata` names overlap with existing metadata fields \"geo_type\", \"time_type\"." + regexp = "`additional_metadata` names overlap with existing metadata fields" ) }) test_that("epi_archives are correctly instantiated with a variety of data types", { + d <- as.Date("2020-01-01") # Data frame df <- data.frame( geo_value = "ca", - time_value = as.Date("2020-01-01"), - version = as.Date("2020-01-01") + 0:19, + time_value = d, + version = d + 0:19, value = 1:20 ) @@ -107,8 +108,8 @@ test_that("epi_archives are correctly instantiated with a variety of data types" # Keyed data.table kdt <- data.table::data.table( geo_value = "ca", - time_value = as.Date("2020-01-01"), - version = as.Date("2020-01-01") + 0:19, + time_value = d, + version = d + 0:19, value = 1:20, code = "CA", key = "code" @@ -127,8 +128,8 @@ test_that("epi_archives are correctly instantiated with a variety of data types" # Unkeyed data.table udt <- data.table::data.table( geo_value = "ca", - time_value = as.Date("2020-01-01"), - version = as.Date("2020-01-01") + 0:19, + time_value = d, + version = d + 0:19, value = 1:20, code = "CA" ) @@ -157,7 +158,7 @@ test_that("epi_archives are correctly instantiated with a variety of data types" # Keyed epi_df edf2 <- data.frame( geo_value = "al", - time_value = rep(as.Date("2020-01-01") + 0:9, 2), + time_value = rep(d + 0:9, 2), version = c( rep(as.Date("2020-01-25"), 10), rep(as.Date("2020-01-26"), 10) @@ -177,14 +178,13 @@ test_that("epi_archives are correctly instantiated with a variety of data types" }) test_that("`epi_archive` rejects nonunique keys", { - toy_update_tbl <- - tibble::tribble( - ~geo_value, ~age_group, ~time_value, ~version, ~value, - "us", "adult", "2000-01-01", "2000-01-02", 121, - "us", "adult", "2000-01-01", "2000-01-03", 125, # (revision) - "us", "adult", "2000-01-02", "2000-01-03", 130, - "us", "pediatric", "2000-01-01", "2000-01-02", 5 - ) %>% + toy_update_tbl <- tibble::tribble( + ~geo_value, ~age_group, ~time_value, ~version, ~value, + "us", "adult", "2000-01-01", "2000-01-02", 121, + "us", 
"adult", "2000-01-01", "2000-01-03", 125, # (revision) + "us", "adult", "2000-01-02", "2000-01-03", 130, + "us", "pediatric", "2000-01-01", "2000-01-02", 5 + ) %>% mutate( age_group = ordered(age_group, c("pediatric", "adult")), time_value = as.Date(time_value), @@ -199,3 +199,30 @@ test_that("`epi_archive` rejects nonunique keys", { as_epi_archive(toy_update_tbl, other_keys = "age_group"), ) }) + +test_that("`epi_archive` rejects dataframes where time_value and version columns don't share type", { + tbl1 <- tibble::tribble( + ~geo_value, ~age_group, ~time_value, ~version, ~value, + "us", "adult", as.Date("2000-01-01"), as.Date("2000-01-02"), 121, + ) %>% + mutate( + age_group = ordered(age_group, c("pediatric", "adult")), + ) + expect_no_error(as_epi_archive(tbl1)) + tbl2 <- tibble::tribble( + ~geo_value, ~age_group, ~time_value, ~version, ~value, + "us", "adult", as.Date("2000-01-01"), 2022, 121, + ) %>% + mutate( + age_group = ordered(age_group, c("pediatric", "adult")), + ) + expect_error(as_epi_archive(tbl2), class = "epiprocess__time_value_version_mismatch") + tbl3 <- tibble::tribble( + ~geo_value, ~age_group, ~time_value, ~version, ~value, + "us", "adult", as.Date("2000-01-01"), as.POSIXct("2000-01-01"), 121, + ) %>% + mutate( + age_group = ordered(age_group, c("pediatric", "adult")), + ) + expect_error(as_epi_archive(tbl3), class = "epiprocess__time_value_version_mismatch") +}) diff --git a/tests/testthat/test-autoplot.R b/tests/testthat/test-autoplot.R index 0e4654eb..3b7d9c1f 100644 --- a/tests/testthat/test-autoplot.R +++ b/tests/testthat/test-autoplot.R @@ -1,17 +1,17 @@ -d <- as.Date("2020-01-01") +test_date <- as.Date("2020-01-01") raw_df_chr <- dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = "a"), - dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = "d") + dplyr::tibble(geo_value = "ak", time_value = test_date + 1:5, value = "a"), + dplyr::tibble(geo_value = "al", time_value = test_date + 1:5, value = "d") ) -ungrouped_chr <- as_epi_df(raw_df_chr, as_of = d + 6) +ungrouped_chr <- as_epi_df(raw_df_chr, as_of = test_date + 6) grouped_chr <- ungrouped_chr %>% group_by(geo_value) raw_df_num <- dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15), - dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = 1:5) + dplyr::tibble(geo_value = "ak", time_value = test_date + 1:5, value = 11:15), + dplyr::tibble(geo_value = "al", time_value = test_date + 1:5, value = 1:5) ) -ungrouped_num <- as_epi_df(raw_df_num, as_of = d + 6) +ungrouped_num <- as_epi_df(raw_df_num, as_of = test_date + 6) grouped_num <- ungrouped_num %>% group_by(geo_value) @@ -33,7 +33,7 @@ test_that("autoplot fails if no non-key columns are numeric", { # A numeric column is available, but is a key not a value. 
testdf <- mutate(raw_df_chr, key1 = c(1:5, 5:9)) %>% as_tsibble(index = time_value, key = c(geo_value, key1)) %>% - as_epi_df(as_of = d + 6) + as_epi_df(as_of = test_date + 6) expect_error(autoplot(testdf), class = "epiprocess__no_numeric_vars_available" ) diff --git a/tests/testthat/test-compactify.R b/tests/testthat/test-compactify.R index 042a69ea..d05fe0b3 100644 --- a/tests/testthat/test-compactify.R +++ b/tests/testthat/test-compactify.R @@ -87,10 +87,11 @@ test_that("as_of produces the same results with compactify=TRUE as with compacti }) test_that("compactify does not alter the default clobberable and observed version bounds", { + d <- as.Date("2000-01-01") x <- tibble::tibble( - geo_value = "geo1", - time_value = as.Date("2000-01-01"), - version = as.Date("2000-01-01") + 1:5, + geo_value = "ak", + time_value = d, + version = d + 1:5, value = 42L ) ea_true <- as_epi_archive(x, compactify = TRUE) diff --git a/tests/testthat/test-correlation.R b/tests/testthat/test-correlation.R index 98507434..886d94c4 100644 --- a/tests/testthat/test-correlation.R +++ b/tests/testthat/test-correlation.R @@ -21,7 +21,7 @@ test_that("epi_cor functions as intended", { ) edf <- as_epi_df(data.frame( - geo_value = rep("asdf", 20), + geo_value = rep("ak", 20), time_value = as.Date("2020-01-01") + 1:20, pos = 1:20, neg = -(1:20) diff --git a/tests/testthat/test-epi_df.R b/tests/testthat/test-epi_df.R index 1c5e527f..a49855aa 100644 --- a/tests/testthat/test-epi_df.R +++ b/tests/testthat/test-epi_df.R @@ -1,14 +1,8 @@ test_that("new_epi_df works as intended", { - # Empty tibble - wmsg <- capture_warnings(a <- new_epi_df()) - expect_match( - wmsg[1], - "Unknown or uninitialised column: `geo_value`." - ) - expect_match( - wmsg[2], - "Unknown or uninitialised column: `time_value`." 
- ) + # Empty call fails + expect_error(new_epi_df(), "argument \"geo_type\" is missing") + # Empty tibble works, but requires metadata + a <- new_epi_df(tibble(), geo_type = "custom", time_type = "custom", as_of = as.POSIXct("2020-01-01")) expect_true(is_epi_df(a)) expect_identical(attributes(a)$metadata$geo_type, "custom") expect_identical(attributes(a)$metadata$time_type, "custom") @@ -21,7 +15,7 @@ test_that("new_epi_df works as intended", { geo_value = rep(c("ca", "hi"), each = 5) ) - epi_tib <- new_epi_df(tib) + epi_tib <- new_epi_df(tib, geo_type = "state", time_type = "day", as_of = as.POSIXct("2020-01-01")) expect_true(is_epi_df(epi_tib)) expect_length(epi_tib, 4L) expect_identical(attributes(epi_tib)$metadata$geo_type, "state") @@ -82,13 +76,12 @@ test_that("as_epi_df works for nonstandard input", { }) # select fixes - tib <- tibble::tibble( x = 1:10, y = 1:10, time_value = rep(seq(as.Date("2020-01-01"), by = 1, length.out = 5), times = 2), geo_value = rep(c("ca", "hi"), each = 5) ) -epi_tib <- epiprocess::new_epi_df(tib) +epi_tib <- epiprocess::as_epi_df(tib) test_that("grouped epi_df maintains type for select", { grouped_epi <- epi_tib %>% group_by(geo_value) selected_df <- grouped_epi %>% select(-y) @@ -115,10 +108,9 @@ test_that("grouped epi_df handles extra keys correctly", { geo_value = rep(c("ca", "hi"), each = 5), extra_key = rep(seq(as.Date("2020-01-01"), by = 1, length.out = 5), times = 2) ) - epi_tib <- epiprocess::new_epi_df(tib, + epi_tib <- epiprocess::as_epi_df(tib, additional_metadata = list(other_keys = "extra_key") ) - attributes(epi_tib) grouped_epi <- epi_tib %>% group_by(geo_value) selected_df <- grouped_epi %>% select(-extra_key) expect_true(inherits(selected_df, "epi_df")) diff --git a/tests/testthat/test-epi_slide.R b/tests/testthat/test-epi_slide.R index 9aa67603..f369fe15 100644 --- a/tests/testthat/test-epi_slide.R +++ b/tests/testthat/test-epi_slide.R @@ -1,205 +1,180 @@ ## Create an epi. df and a function to test epi_slide with -d <- as.Date("2020-01-01") +test_date <- as.Date("2020-01-01") +days_dt <- as.difftime(1, units = "days") +weeks_dt <- as.difftime(1, units = "weeks") ungrouped <- dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:200, value = 1:200), - dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5)) + dplyr::tibble(geo_value = "ak", time_value = test_date + 1:200, value = 1:200), + dplyr::tibble(geo_value = "al", time_value = test_date + 1:5, value = -(1:5)) ) %>% as_epi_df() grouped <- ungrouped %>% group_by(geo_value) small_x <- dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15), - dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5)) + dplyr::tibble(geo_value = "ak", time_value = test_date + 1:5, value = 11:15), + dplyr::tibble(geo_value = "al", time_value = test_date + 1:5, value = -(1:5)) ) %>% - as_epi_df(as_of = d + 6) %>% + as_epi_df(as_of = test_date + 6) %>% group_by(geo_value) - f <- function(x, g, t) dplyr::tibble(value = mean(x$value), count = length(x$value)) toy_edf <- tibble::tribble( ~geo_value, ~time_value, ~value, - "a", 1:10, 2L^(1:10), - "b", 1:10, 2L^(11:20), + "a", test_date + 1:10, 2L^(1:10), + "b", test_date + 1:10, 2L^(11:20), ) %>% tidyr::unchop(c(time_value, value)) %>% - as_epi_df(as_of = 100) + as_epi_df(as_of = test_date + 100) # nolint start: line_length_linter. 
basic_sum_result <- tibble::tribble( ~geo_value, ~time_value, ~value, ~slide_value, - "a", 1:10, 2L^(1:10), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE), - "b", 1:10, 2L^(11:20), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE), + "a", test_date + 1:10, 2L^(1:10), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE), + "b", test_date + 1:10, 2L^(11:20), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE), ) %>% tidyr::unchop(c(time_value, value, slide_value)) %>% dplyr::arrange(time_value) %>% - as_epi_df(as_of = 100) + as_epi_df(as_of = test_date + 100) basic_mean_result <- tibble::tribble( ~geo_value, ~time_value, ~value, ~slide_value, - "a", 1:10, 2L^(1:10), data.table::frollmean(2L^(1:10), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE), + "a", test_date + 1:10, 2L^(1:10), data.table::frollmean(2L^(1:10), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE), ) %>% tidyr::unchop(c(time_value, value, slide_value)) %>% dplyr::arrange(time_value) %>% - as_epi_df(as_of = 100) + as_epi_df(as_of = test_date + 100) # nolint end: line_length_linter. ## --- These cases generate errors (or not): --- test_that("`before` and `after` are both vectors of length 1", { expect_error( - epi_slide(grouped, f, before = c(0, 1), after = 0, ref_time_values = d + 3), - "Assertion on 'before' failed: Must have length 1" + epi_slide(grouped, f, before = c(0, 1), after = 0, ref_time_values = test_date + 3), + "Expected `before` to be a scalar value." ) expect_error( - epi_slide(grouped, f, before = 1, after = c(0, 1), ref_time_values = d + 3), - "Assertion on 'after' failed: Must have length 1" + epi_slide(grouped, f, before = 1, after = c(0, 1), ref_time_values = test_date + 3), + "Expected `after` to be a scalar value." ) - expect_error( - epi_slide_mean(grouped, col_names = value, before = c(0, 1), after = 0, ref_time_values = d + 3), - "Assertion on 'before' failed: Must have length 1" + epi_slide_mean(grouped, col_names = value, before = c(0, 1), after = 0, ref_time_values = test_date + 3), + "Expected `before` to be a scalar value." ) expect_error( - epi_slide_mean(grouped, col_names = value, before = 1, after = c(0, 1), ref_time_values = d + 3), - "Assertion on 'after' failed: Must have length 1" + epi_slide_mean(grouped, col_names = value, before = 1, after = c(0, 1), ref_time_values = test_date + 3), + "Expected `after` to be a scalar value." ) }) test_that("Test errors/warnings for discouraged features", { expect_error( - epi_slide(grouped, f, ref_time_values = d + 1), - "Either or both of `before`, `after` must be provided." - ) - expect_warning( - epi_slide(grouped, f, before = 0L, ref_time_values = d + 1), - "`before==0`, `after` missing" - ) - expect_warning( - epi_slide(grouped, f, after = 0L, ref_time_values = d + 1), - "`before` missing, `after==0`" + epi_slide(grouped, f, ref_time_values = test_date + 1), + "`before` is a required argument." ) expect_error( - epi_slide_mean(grouped, col_names = value, ref_time_values = d + 1), - "Either or both of `before`, `after` must be provided." 
- ) - expect_warning( - epi_slide_mean(grouped, col_names = value, before = 0L, ref_time_values = d + 1), - "`before==0`, `after` missing" - ) - expect_warning( - epi_slide_mean(grouped, col_names = value, after = 0L, ref_time_values = d + 1), - "`before` missing, `after==0`" + epi_slide_mean(grouped, col_names = value, ref_time_values = test_date + 1), + "`before` is a required argument." ) - # Below cases should raise no errors/warnings: expect_no_warning( - ref1 <- epi_slide(grouped, f, before = 1L, ref_time_values = d + 2) + ref1 <- epi_slide(grouped, f, before = days_dt, ref_time_values = test_date + 2) ) expect_no_warning( - ref2 <- epi_slide(grouped, f, after = 1L, ref_time_values = d + 2) - ) - expect_no_warning( - ref3 <- epi_slide(grouped, f, - before = 0L, after = 0L, ref_time_values = d + 2 - ) + ref2 <- epi_slide(grouped, f, after = days_dt, ref_time_values = test_date + 2) ) expect_no_warning( opt1 <- epi_slide_mean(grouped, col_names = value, - before = 1L, ref_time_values = d + 2, na.rm = TRUE + before = days_dt, ref_time_values = test_date + 2, na.rm = TRUE ) ) expect_no_warning( opt2 <- epi_slide_mean(grouped, col_names = value, - after = 1L, ref_time_values = d + 2, na.rm = TRUE - ) - ) - expect_no_warning( - opt3 <- epi_slide_mean(grouped, - col_names = value, - before = 0L, after = 0L, ref_time_values = d + 2, na.rm = TRUE + after = days_dt, ref_time_values = test_date + 2, na.rm = TRUE ) ) # Results from epi_slide and epi_slide_mean should match expect_equal(select(ref1, -slide_value_count), opt1) expect_equal(select(ref2, -slide_value_count), opt2) - expect_equal(select(ref3, -slide_value_count), opt3) }) test_that("Both `before` and `after` must be non-NA, non-negative, integer-compatible", { expect_error( - epi_slide(grouped, f, before = -1L, ref_time_values = d + 2L), - "Assertion on 'before' failed: Element 1 is not >= 0" + epi_slide(grouped, f, before = -1L, ref_time_values = test_date + 2L), + "Expected `before` to be a difftime with units in days or a non-negative integer." ) expect_error( - epi_slide(grouped, f, before = 2L, after = -1L, ref_time_values = d + 2L), - "Assertion on 'after' failed: Element 1 is not >= 0" + epi_slide(grouped, f, after = -1L, ref_time_values = test_date + 2L), + "Expected `after` to be a difftime with units in days or a non-negative integer." ) - expect_error(epi_slide(grouped, f, before = "a", ref_time_values = d + 2L), - regexp = "before", class = "vctrs_error_incompatible_type" + expect_error(epi_slide(grouped, f, before = "a", after = days_dt, ref_time_values = test_date + 2L), + regexp = "Expected `before` to be a difftime with units in days or a non-negative integer." ) - expect_error(epi_slide(grouped, f, before = 1L, after = "a", ref_time_values = d + 2L), - regexp = "after", class = "vctrs_error_incompatible_type" + expect_error(epi_slide(grouped, f, before = days_dt, after = "a", ref_time_values = test_date + 2L), + regexp = "Expected `after` to be a difftime with units in days or a non-negative integer." ) - expect_error(epi_slide(grouped, f, before = 0.5, ref_time_values = d + 2L), - regexp = "before", class = "vctrs_error_incompatible_type" + expect_error(epi_slide(grouped, f, before = 0.5, after = days_dt, ref_time_values = test_date + 2L), + regexp = "Expected `before` to be a difftime with units in days or a non-negative integer." 
) - expect_error(epi_slide(grouped, f, before = 1L, after = 0.5, ref_time_values = d + 2L), - regexp = "after", class = "vctrs_error_incompatible_type" + expect_error(epi_slide(grouped, f, before = days_dt, after = 0.5, ref_time_values = test_date + 2L), + regexp = "Expected `after` to be a difftime with units in days or a non-negative integer." ) expect_error( - epi_slide(grouped, f, before = NA, after = 1L, ref_time_values = d + 2L), - "Assertion on 'before' failed: May not be NA" + epi_slide(grouped, f, before = NA, after = 1L, ref_time_values = test_date + 2L), + "Expected `before` to be a scalar value." ) expect_error( - epi_slide(grouped, f, before = 1L, after = NA, ref_time_values = d + 2L), - "Assertion on 'after' failed: May not be NA" + epi_slide(grouped, f, before = days_dt, after = NA, ref_time_values = test_date + 2L), + "Expected `after` to be a scalar value." ) expect_error( - epi_slide_mean(grouped, col_names = value, before = -1L, ref_time_values = d + 2L), - "Assertion on 'before' failed: Element 1 is not >= 0" + epi_slide_mean(grouped, col_names = value, before = -1L, ref_time_values = test_date + 2L), + "Expected `before` to be a difftime with units in days or a non-negative integer." ) expect_error( - epi_slide_mean(grouped, col_names = value, before = 2L, after = -1L, ref_time_values = d + 2L), - "Assertion on 'after' failed: Element 1 is not >= 0" + epi_slide_mean(grouped, col_names = value, after = -1L, ref_time_values = test_date + 2L), + "Expected `after` to be a difftime with units in days or a non-negative integer." ) - expect_error(epi_slide_mean(grouped, col_names = value, before = "a", ref_time_values = d + 2L), - regexp = "before", class = "vctrs_error_incompatible_type" + expect_error( + epi_slide_mean(grouped, col_names = value, before = "a", ref_time_values = test_date + 2L), + regexp = "Expected `before` to be a difftime with units in days or a non-negative integer." ) - expect_error(epi_slide_mean(grouped, col_names = value, before = 1L, after = "a", ref_time_values = d + 2L), - regexp = "after", class = "vctrs_error_incompatible_type" + expect_error( + epi_slide_mean(grouped, col_names = value, after = "a", ref_time_values = test_date + 2L), + regexp = "Expected `after` to be a difftime with units in days or a non-negative integer." ) - expect_error(epi_slide_mean(grouped, col_names = value, before = 0.5, ref_time_values = d + 2L), - regexp = "before", class = "vctrs_error_incompatible_type" + expect_error( + epi_slide_mean(grouped, col_names = value, before = 0.5, ref_time_values = test_date + 2L), + regexp = "Expected `before` to be a difftime with units in days or a non-negative integer." ) - expect_error(epi_slide_mean(grouped, col_names = value, before = 1L, after = 0.5, ref_time_values = d + 2L), - regexp = "after", class = "vctrs_error_incompatible_type" + expect_error( + epi_slide_mean(grouped, col_names = value, after = 0.5, ref_time_values = test_date + 2L), + regexp = "Expected `after` to be a difftime with units in days or a non-negative integer." ) expect_error( - epi_slide_mean(grouped, col_names = value, before = NA, after = 1L, ref_time_values = d + 2L), - "Assertion on 'before' failed: May not be NA" + epi_slide_mean(grouped, col_names = value, before = NA, after = days_dt, ref_time_values = test_date + 2L), + "Expected `before` to be a scalar value." 
) expect_error( - epi_slide_mean(grouped, col_names = value, before = 1L, after = NA, ref_time_values = d + 2L), - "Assertion on 'after' failed: May not be NA" + epi_slide_mean(grouped, col_names = value, before = days_dt, after = NA, ref_time_values = test_date + 2L), + "Expected `after` to be a scalar value." ) # Non-integer-class but integer-compatible values are allowed: expect_no_error( - ref <- epi_slide(grouped, f, before = 1, after = 1, ref_time_values = d + 2L) + ref <- epi_slide(grouped, f, before = days_dt, after = days_dt, ref_time_values = test_date + 2L) ) expect_no_error(opt <- epi_slide_mean( grouped, - col_names = value, before = 1, after = 1, - ref_time_values = d + 2L, na.rm = TRUE + col_names = value, before = days_dt, after = days_dt, + ref_time_values = test_date + 2L, na.rm = TRUE )) # Results from epi_slide and epi_slide_mean should match @@ -208,20 +183,25 @@ test_that("Both `before` and `after` must be non-NA, non-negative, integer-compa test_that("`ref_time_values` + `before` + `after` that result in no slide data, generate the error", { expect_error( - epi_slide(grouped, f, before = 2L, ref_time_values = d), + epi_slide(grouped, f, before = 2 * days_dt, ref_time_values = test_date), "`ref_time_values` must be a unique subset of the time values in `x`." ) # before the first, no data in the slide windows expect_error( - epi_slide(grouped, f, before = 2L, ref_time_values = d + 207L), + epi_slide(grouped, f, before = 2 * days_dt, ref_time_values = test_date + 207L), "`ref_time_values` must be a unique subset of the time values in `x`." ) # beyond the last, no data in window expect_error( - epi_slide_mean(grouped, col_names = value, before = 2L, ref_time_values = d), + epi_slide_mean(grouped, col_names = value, before = 2 * days_dt, ref_time_values = test_date), "`ref_time_values` must be a unique subset of the time values in `x`." ) # before the first, no data in the slide windows expect_error( - epi_slide_mean(grouped, col_names = value, before = 2L, ref_time_values = d + 207L), + epi_slide_mean( + grouped, + col_names = value, + before = 2 * days_dt, + ref_time_values = test_date + 207L + ), "`ref_time_values` must be a unique subset of the time values in `x`." ) # beyond the last, no data in window }) @@ -234,51 +214,25 @@ test_that( ), { expect_error( - epi_slide(grouped, f, before = 0L, after = 2L, ref_time_values = d), + epi_slide(grouped, f, after = 2 * days_dt, ref_time_values = test_date), "`ref_time_values` must be a unique subset of the time values in `x`." ) # before the first, but we'd expect there to be data in the window expect_error( - epi_slide(grouped, f, before = 2L, ref_time_values = d + 201L), + epi_slide(grouped, f, before = 2 * days_dt, ref_time_values = test_date + 201L), "`ref_time_values` must be a unique subset of the time values in `x`." ) # beyond the last, but still with data in window expect_error( - epi_slide_mean(grouped, value, before = 0L, after = 2L, ref_time_values = d), + epi_slide_mean(grouped, value, after = 2 * days_dt, ref_time_values = test_date), "`ref_time_values` must be a unique subset of the time values in `x`." ) # before the first, but we'd expect there to be data in the window expect_error( - epi_slide_mean(grouped, value, before = 2L, ref_time_values = d + 201L), + epi_slide_mean(grouped, value, before = 2 * days_dt, ref_time_values = test_date + 201L), "`ref_time_values` must be a unique subset of the time values in `x`." 
) # beyond the last, but still with data in window } ) -## --- These cases generate warnings (or not): --- -test_that("Warn user against having a blank `before`", { - expect_no_warning(ref1 <- epi_slide( - grouped, f, - after = 1L, ref_time_values = d + 1L - )) - expect_no_warning(ref2 <- epi_slide( - grouped, f, - before = 0L, after = 1L, ref_time_values = d + 1L - )) - - expect_no_warning(opt1 <- epi_slide_mean( - grouped, value, - after = 1L, ref_time_values = d + 1L, na.rm = TRUE - )) - expect_no_warning(opt2 <- epi_slide_mean( - grouped, value, - before = 0L, after = 1L, - ref_time_values = d + 1L, na.rm = TRUE - )) - - # Results from epi_slide and epi_slide_mean should match - expect_equal(select(ref1, -slide_value_count), opt1) - expect_equal(select(ref2, -slide_value_count), opt2) -}) - ## --- These cases doesn't generate the error: --- test_that( c( @@ -287,26 +241,32 @@ test_that( ), { expect_equal( - epi_slide(grouped, f, before = 2L, ref_time_values = d + 200L) %>% + epi_slide(grouped, f, before = 2 * days_dt, ref_time_values = test_date + 200L) %>% ungroup() %>% dplyr::select("geo_value", "slide_value_value"), dplyr::tibble(geo_value = "ak", slide_value_value = 199) ) # out of range for one group expect_equal( - epi_slide(grouped, f, before = 2L, ref_time_values = d + 3) %>% + epi_slide(grouped, f, before = 2 * days_dt, ref_time_values = test_date + 3) %>% ungroup() %>% dplyr::select("geo_value", "slide_value_value"), dplyr::tibble(geo_value = c("ak", "al"), slide_value_value = c(2, -2)) ) # not out of range for either group expect_equal( - epi_slide_mean(grouped, value, before = 2L, ref_time_values = d + 200L, na.rm = TRUE) %>% + epi_slide_mean( + grouped, value, + before = 2 * days_dt, ref_time_values = test_date + 200L, na.rm = TRUE + ) %>% ungroup() %>% dplyr::select("geo_value", "slide_value_value"), dplyr::tibble(geo_value = "ak", slide_value_value = 199) ) # out of range for one group expect_equal( - epi_slide_mean(grouped, value, before = 2L, ref_time_values = d + 3, na.rm = TRUE) %>% + epi_slide_mean( + grouped, value, + before = 2 * days_dt, ref_time_values = test_date + 3, na.rm = TRUE + ) %>% ungroup() %>% dplyr::select("geo_value", "slide_value_value"), dplyr::tibble(geo_value = c("ak", "al"), slide_value_value = c(2, -2)) @@ -318,19 +278,23 @@ test_that("computation output formats x as_list_col", { # See `toy_edf` and `basic_sum_result` definitions at top of file. # We'll try 7d sum with a few formats. 
expect_equal( - toy_edf %>% epi_slide(before = 6L, ~ sum(.x$value)), + toy_edf %>% + epi_slide(before = 6 * days_dt, ~ sum(.x$value)), basic_sum_result ) expect_equal( - toy_edf %>% epi_slide(before = 6L, ~ sum(.x$value), as_list_col = TRUE), + toy_edf %>% + epi_slide(before = 6 * days_dt, ~ sum(.x$value), as_list_col = TRUE), basic_sum_result %>% dplyr::mutate(slide_value = as.list(slide_value)) ) expect_equal( - toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value))), + toy_edf %>% + epi_slide(before = 6 * days_dt, ~ data.frame(value = sum(.x$value))), basic_sum_result %>% rename(slide_value_value = slide_value) ) expect_equal( - toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)), as_list_col = TRUE), + toy_edf %>% + epi_slide(before = 6 * days_dt, ~ data.frame(value = sum(.x$value)), as_list_col = TRUE), basic_sum_result %>% mutate(slide_value = purrr::map(slide_value, ~ data.frame(value = .x))) ) @@ -347,7 +311,7 @@ test_that("epi_slide_mean errors when `as_list_col` non-NULL", { ) %>% epi_slide_mean( value, - before = 6L, na.rm = TRUE + before = 6 * days_dt, na.rm = TRUE ), basic_mean_result %>% dplyr::mutate( slide_value_value = slide_value @@ -361,7 +325,7 @@ test_that("epi_slide_mean errors when `as_list_col` non-NULL", { ) %>% epi_slide_mean( value, - before = 6L, as_list_col = TRUE, na.rm = TRUE + before = 6 * days_dt, as_list_col = TRUE, na.rm = TRUE ), class = "epiprocess__epi_slide_opt__list_not_supported" ) @@ -372,7 +336,7 @@ test_that("nested dataframe output names are controllable", { expect_equal( toy_edf %>% epi_slide( - before = 6L, ~ data.frame(value = sum(.x$value)), + before = 6 * days_dt, ~ data.frame(value = sum(.x$value)), new_col_name = "result" ), basic_sum_result %>% rename(result_value = slide_value) @@ -380,7 +344,7 @@ test_that("nested dataframe output names are controllable", { expect_equal( toy_edf %>% epi_slide( - before = 6L, ~ data.frame(value_sum = sum(.x$value)), + before = 6 * days_dt, ~ data.frame(value_sum = sum(.x$value)), names_sep = NULL ), basic_sum_result %>% rename(value_sum = slide_value) @@ -392,27 +356,29 @@ test_that("non-size-1 outputs are recycled", { # nolint start: line_length_linter. 
basic_result_from_size2 <- tibble::tribble( ~geo_value, ~time_value, ~value, ~slide_value, - "a", 1:10, 2L^(1:10), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE), - "b", 1:10, 2L^(11:20), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE) + 1L, + "a", test_date + 1:10, 2L^(1:10), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE), + "b", test_date + 1:10, 2L^(11:20), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE) + 1L, ) %>% tidyr::unchop(c(time_value, value, slide_value)) %>% dplyr::arrange(time_value) %>% - as_epi_df(as_of = 100) + as_epi_df(as_of = test_date + 100) # nolint end expect_equal( - toy_edf %>% epi_slide(before = 6L, ~ sum(.x$value) + 0:1), + toy_edf %>% epi_slide(before = 6 * days_dt, ~ sum(.x$value) + 0:1), basic_result_from_size2 ) expect_equal( - toy_edf %>% epi_slide(before = 6L, ~ sum(.x$value) + 0:1, as_list_col = TRUE), + toy_edf %>% epi_slide(before = 6 * days_dt, ~ sum(.x$value) + 0:1, as_list_col = TRUE), basic_result_from_size2 %>% dplyr::mutate(slide_value = as.list(slide_value)) ) expect_equal( - toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value) + 0:1)), + toy_edf %>% epi_slide(before = 6 * days_dt, ~ data.frame(value = sum(.x$value) + 0:1)), basic_result_from_size2 %>% rename(slide_value_value = slide_value) ) expect_equal( - toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value) + 0:1), as_list_col = TRUE), + toy_edf %>% epi_slide( + before = 6 * days_dt, ~ data.frame(value = sum(.x$value) + 0:1), as_list_col = TRUE + ), basic_result_from_size2 %>% mutate(slide_value = purrr::map(slide_value, ~ data.frame(value = .x))) ) @@ -421,11 +387,17 @@ test_that("non-size-1 outputs are recycled", { test_that("epi_slide alerts if the provided f doesn't take enough args", { f_xgt <- function(x, g, t) dplyr::tibble(value = mean(x$value), count = length(x$value)) # If `regexp` is NA, asserts that there should be no errors/messages. - expect_error(epi_slide(grouped, f_xgt, before = 1L, ref_time_values = d + 1), regexp = NA) - expect_warning(epi_slide(grouped, f_xgt, before = 1L, ref_time_values = d + 1), regexp = NA) + expect_error( + epi_slide(grouped, f_xgt, before = days_dt, ref_time_values = test_date + 1), + regexp = NA + ) + expect_warning( + epi_slide(grouped, f_xgt, before = days_dt, ref_time_values = test_date + 1), + regexp = NA + ) f_x_dots <- function(x, ...) dplyr::tibble(value = mean(x$value), count = length(x$value)) - expect_warning(epi_slide(grouped, f_x_dots, before = 1L, ref_time_values = d + 1), + expect_warning(epi_slide(grouped, f_x_dots, before = days_dt, ref_time_values = test_date + 1), class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots" ) }) @@ -436,32 +408,32 @@ test_that("`ref_time_values` + `all_rows = TRUE` works", { # nolint start: line_length_linter. 
basic_full_result <- tibble::tribble( ~geo_value, ~time_value, ~value, ~slide_value, - "a", 1:10, 2L^(1:10), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE), - "b", 1:10, 2L^(11:20), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE), + "a", test_date + 1:10, 2L^(1:10), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE), + "b", test_date + 1:10, 2L^(11:20), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7, rep(7L, 3L)), adaptive = TRUE, na.rm = TRUE), ) %>% tidyr::unchop(c(time_value, value, slide_value)) %>% dplyr::arrange(time_value) %>% - as_epi_df(as_of = 100) + as_epi_df(as_of = test_date + 100) # nolint end # slide computations returning atomic vecs: expect_equal( - toy_edf %>% epi_slide(before = 6L, ~ sum(.x$value)), + toy_edf %>% epi_slide(before = 6 * days_dt, ~ sum(.x$value)), basic_full_result ) expect_equal( toy_edf %>% epi_slide( - before = 6L, ~ sum(.x$value), - ref_time_values = c(2L, 8L) + before = 6 * days_dt, ~ sum(.x$value), + ref_time_values = test_date + c(2L, 8L) ), - basic_full_result %>% dplyr::filter(time_value %in% c(2L, 8L)) + basic_full_result %>% dplyr::filter(time_value %in% (test_date + c(2L, 8L))) ) expect_equal( toy_edf %>% epi_slide( - before = 6L, ~ sum(.x$value), - ref_time_values = c(2L, 8L), all_rows = TRUE + before = 6 * days_dt, ~ sum(.x$value), + ref_time_values = test_date + c(2L, 8L), all_rows = TRUE ), basic_full_result %>% - dplyr::mutate(slide_value = dplyr::if_else(time_value %in% c(2L, 8L), + dplyr::mutate(slide_value = dplyr::if_else(time_value %in% (test_date + c(2L, 8L)), slide_value, NA_integer_ )) ) @@ -472,7 +444,7 @@ test_that("`ref_time_values` + `all_rows = TRUE` works", { ) %>% epi_slide_mean( value, - before = 6L, names_sep = NULL, na.rm = TRUE + before = 6 * days_dt, names_sep = NULL, na.rm = TRUE ), basic_mean_result %>% rename(slide_value_value = slide_value) @@ -483,10 +455,10 @@ test_that("`ref_time_values` + `all_rows = TRUE` works", { ) %>% epi_slide_mean( value, - before = 6L, ref_time_values = c(2L, 8L), + before = 6 * days_dt, ref_time_values = test_date + c(2L, 8L), names_sep = NULL, na.rm = TRUE ), - filter(basic_mean_result, time_value %in% c(2L, 8L)) %>% + filter(basic_mean_result, time_value %in% (test_date + c(2L, 8L))) %>% rename(slide_value_value = slide_value) ) expect_equal( @@ -495,11 +467,11 @@ test_that("`ref_time_values` + `all_rows = TRUE` works", { ) %>% epi_slide_mean( value, - before = 6L, ref_time_values = c(2L, 8L), all_rows = TRUE, + before = 6 * days_dt, ref_time_values = test_date + c(2L, 8L), all_rows = TRUE, names_sep = NULL, na.rm = TRUE ), basic_mean_result %>% - dplyr::mutate(slide_value_value = dplyr::if_else(time_value %in% c(2L, 8L), + dplyr::mutate(slide_value_value = dplyr::if_else(time_value %in% (test_date + c(2L, 8L)), slide_value, NA_integer_ )) %>% select(-slide_value) @@ -507,25 +479,25 @@ test_that("`ref_time_values` + `all_rows = TRUE` works", { # slide computations returning data frames: expect_equal( - toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value))), + toy_edf %>% epi_slide(before = 6 * days_dt, ~ data.frame(value = sum(.x$value))), basic_full_result %>% dplyr::rename(slide_value_value = slide_value) ) expect_equal( toy_edf %>% epi_slide( - before = 6L, ~ data.frame(value = sum(.x$value)), - ref_time_values = c(2L, 8L) + before = 6 * days_dt, ~ data.frame(value = sum(.x$value)), + ref_time_values = test_date + c(2L, 
8L) ), basic_full_result %>% - dplyr::filter(time_value %in% c(2L, 8L)) %>% + dplyr::filter(time_value %in% (test_date + c(2L, 8L))) %>% dplyr::rename(slide_value_value = slide_value) ) expect_equal( toy_edf %>% epi_slide( - before = 6L, ~ data.frame(value = sum(.x$value)), - ref_time_values = c(2L, 8L), all_rows = TRUE + before = 6 * days_dt, ~ data.frame(value = sum(.x$value)), + ref_time_values = test_date + c(2L, 8L), all_rows = TRUE ), basic_full_result %>% - dplyr::mutate(slide_value = dplyr::if_else(time_value %in% c(2L, 8L), + dplyr::mutate(slide_value = dplyr::if_else(time_value %in% (test_date + c(2L, 8L)), slide_value, NA_integer_ )) %>% dplyr::rename(slide_value_value = slide_value) @@ -533,7 +505,7 @@ test_that("`ref_time_values` + `all_rows = TRUE` works", { # slide computations returning data frames with `as_list_col=TRUE`: expect_equal( toy_edf %>% epi_slide( - before = 6L, ~ data.frame(value = sum(.x$value)), + before = 6 * days_dt, ~ data.frame(value = sum(.x$value)), as_list_col = TRUE ), basic_full_result %>% @@ -541,30 +513,30 @@ test_that("`ref_time_values` + `all_rows = TRUE` works", { ) expect_equal( toy_edf %>% epi_slide( - before = 6L, ~ data.frame(value = sum(.x$value)), - ref_time_values = c(2L, 8L), + before = 6 * days_dt, ~ data.frame(value = sum(.x$value)), + ref_time_values = test_date + c(2L, 8L), as_list_col = TRUE ), basic_full_result %>% dplyr::mutate(slide_value = purrr::map(slide_value, ~ data.frame(value = .x))) %>% - dplyr::filter(time_value %in% c(2L, 8L)) + dplyr::filter(time_value %in% (test_date + c(2L, 8L))) ) expect_equal( toy_edf %>% epi_slide( - before = 6L, ~ data.frame(value = sum(.x$value)), - ref_time_values = c(2L, 8L), all_rows = TRUE, + before = 6 * days_dt, ~ data.frame(value = sum(.x$value)), + ref_time_values = test_date + c(2L, 8L), all_rows = TRUE, as_list_col = TRUE ), basic_full_result %>% dplyr::mutate(slide_value = purrr::map(slide_value, ~ data.frame(value = .x))) %>% - dplyr::mutate(slide_value = dplyr::if_else(time_value %in% c(2L, 8L), + dplyr::mutate(slide_value = dplyr::if_else(time_value %in% (test_date + c(2L, 8L)), slide_value, list(NULL) )) ) # slide computations returning data frames, `as_list_col = TRUE`, `unnest`: expect_equal( toy_edf %>% epi_slide( - before = 6L, ~ data.frame(value = sum(.x$value)), + before = 6 * days_dt, ~ data.frame(value = sum(.x$value)), as_list_col = TRUE ) %>% unnest(slide_value, names_sep = "_"), @@ -572,19 +544,19 @@ test_that("`ref_time_values` + `all_rows = TRUE` works", { ) expect_equal( toy_edf %>% epi_slide( - before = 6L, ~ data.frame(value = sum(.x$value)), - ref_time_values = c(2L, 8L), + before = 6 * days_dt, ~ data.frame(value = sum(.x$value)), + ref_time_values = test_date + c(2L, 8L), as_list_col = TRUE ) %>% unnest(slide_value, names_sep = "_"), basic_full_result %>% - dplyr::filter(time_value %in% c(2L, 8L)) %>% + dplyr::filter(time_value %in% (test_date + c(2L, 8L))) %>% dplyr::rename(slide_value_value = slide_value) ) expect_equal( toy_edf %>% epi_slide( - before = 6L, ~ data.frame(value = sum(.x$value)), - ref_time_values = c(2L, 8L), all_rows = TRUE, + before = 6 * days_dt, ~ data.frame(value = sum(.x$value)), + ref_time_values = test_date + c(2L, 8L), all_rows = TRUE, as_list_col = TRUE ) %>% unnest(slide_value, names_sep = "_"), @@ -592,7 +564,7 @@ test_that("`ref_time_values` + `all_rows = TRUE` works", { # XXX unclear exactly what we want in this case. 
Current approach is # compatible with `vctrs::vec_detect_missing` but breaks `tidyr::unnest` # compatibility - dplyr::filter(time_value %in% c(2L, 8L)) %>% + dplyr::filter(time_value %in% (test_date + c(2L, 8L))) %>% dplyr::rename(slide_value_value = slide_value) ) rework_nulls <- function(slide_values_list) { @@ -604,14 +576,14 @@ test_that("`ref_time_values` + `all_rows = TRUE` works", { } expect_equal( toy_edf %>% epi_slide( - before = 6L, ~ data.frame(value = sum(.x$value)), - ref_time_values = c(2L, 8L), all_rows = TRUE, + before = 6 * days_dt, ~ data.frame(value = sum(.x$value)), + ref_time_values = test_date + c(2L, 8L), all_rows = TRUE, as_list_col = TRUE ) %>% mutate(slide_value = rework_nulls(slide_value)) %>% unnest(slide_value, names_sep = "_"), basic_full_result %>% - dplyr::mutate(slide_value = dplyr::if_else(time_value %in% c(2L, 8L), + dplyr::mutate(slide_value = dplyr::if_else(time_value %in% (test_date + c(2L, 8L)), slide_value, NA_integer_ )) %>% dplyr::rename(slide_value_value = slide_value) @@ -621,7 +593,7 @@ test_that("`ref_time_values` + `all_rows = TRUE` works", { test_that("`epi_slide` doesn't decay date output", { expect_true( ungrouped %>% - epi_slide(before = 5L, ~ as.Date("2020-01-01")) %>% + epi_slide(before = 5 * days_dt, ~ as.Date("2020-01-01")) %>% `[[`("slide_value") %>% inherits("Date") ) @@ -629,34 +601,34 @@ test_that("`epi_slide` doesn't decay date output", { test_that("basic grouped epi_slide computation produces expected output", { expected_output <- dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value = cumsum(11:15)), - dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value = cumsum(-(1:5))) + dplyr::tibble(geo_value = "ak", time_value = test_date + 1:5, value = 11:15, slide_value = cumsum(11:15)), + dplyr::tibble(geo_value = "al", time_value = test_date + 1:5, value = -(1:5), slide_value = cumsum(-(1:5))) ) %>% group_by(geo_value) %>% - as_epi_df(as_of = d + 6) + as_epi_df(as_of = test_date + 6) # formula - result1 <- epi_slide(small_x, f = ~ sum(.x$value), before = 50) + result1 <- epi_slide(small_x, f = ~ sum(.x$value), before = 50 * days_dt) expect_equal(result1, expected_output) # function - result2 <- epi_slide(small_x, f = function(x, g, t) sum(x$value), before = 50) + result2 <- epi_slide(small_x, f = function(x, g, t) sum(x$value), before = 50 * days_dt) expect_equal(result2, expected_output) # dots - result3 <- epi_slide(small_x, slide_value = sum(value), before = 50) + result3 <- epi_slide(small_x, slide_value = sum(value), before = 50 * days_dt) expect_equal(result3, expected_output) }) test_that("basic grouped epi_slide_mean computation produces expected output", { expected_output <- dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value = cumsum(11:15) / 1:5), - dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value = cumsum(-(1:5)) / 1:5) + dplyr::tibble(geo_value = "ak", time_value = test_date + 1:5, value = 11:15, slide_value = cumsum(11:15) / 1:5), + dplyr::tibble(geo_value = "al", time_value = test_date + 1:5, value = -(1:5), slide_value = cumsum(-(1:5)) / 1:5) ) %>% group_by(geo_value) %>% - as_epi_df(as_of = d + 6) + as_epi_df(as_of = test_date + 6) - result1 <- epi_slide_mean(small_x, value, before = 50, names_sep = NULL, na.rm = TRUE) + result1 <- epi_slide_mean(small_x, value, before = 50 * days_dt, names_sep = NULL, na.rm = TRUE) expect_equal(result1, expected_output %>% 
rename(slide_value_value = slide_value)) }) @@ -665,7 +637,7 @@ test_that("ungrouped epi_slide computation completes successfully", { small_x %>% ungroup() %>% epi_slide( - before = 2, + before = 2 * days_dt, slide_value = sum(.x$value) ) ) @@ -673,15 +645,15 @@ test_that("ungrouped epi_slide computation completes successfully", { test_that("basic ungrouped epi_slide computation produces expected output", { expected_output <- dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value = cumsum(11:15)) + dplyr::tibble(geo_value = "ak", time_value = test_date + 1:5, value = 11:15, slide_value = cumsum(11:15)) ) %>% - as_epi_df(as_of = d + 6) + as_epi_df(as_of = test_date + 6) result1 <- small_x %>% ungroup() %>% filter(geo_value == "ak") %>% epi_slide( - before = 50, + before = 50 * days_dt, slide_value = sum(.x$value) ) expect_equal(result1, expected_output) @@ -689,19 +661,19 @@ test_that("basic ungrouped epi_slide computation produces expected output", { # Ungrouped with multiple geos expected_output <- dplyr::bind_rows( dplyr::tibble( - geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value = cumsum(11:15) + cumsum(-(1:5)) + geo_value = "ak", time_value = test_date + 1:5, value = 11:15, slide_value = cumsum(11:15) + cumsum(-(1:5)) ), dplyr::tibble( - geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value = cumsum(11:15) + cumsum(-(1:5)) + geo_value = "al", time_value = test_date + 1:5, value = -(1:5), slide_value = cumsum(11:15) + cumsum(-(1:5)) ) ) %>% - as_epi_df(as_of = d + 6) %>% + as_epi_df(as_of = test_date + 6) %>% arrange(time_value) result2 <- small_x %>% ungroup() %>% epi_slide( - before = 50, + before = 50 * days_dt, slide_value = sum(.x$value) ) expect_equal(result2, expected_output) @@ -709,37 +681,37 @@ test_that("basic ungrouped epi_slide computation produces expected output", { test_that("basic ungrouped epi_slide_mean computation produces expected output", { expected_output <- dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value = cumsum(11:15) / 1:5), + dplyr::tibble(geo_value = "ak", time_value = test_date + 1:5, value = 11:15, slide_value = cumsum(11:15) / 1:5), ) %>% - as_epi_df(as_of = d + 6) + as_epi_df(as_of = test_date + 6) result1 <- small_x %>% ungroup() %>% filter(geo_value == "ak") %>% - epi_slide_mean(value, before = 50, names_sep = NULL, na.rm = TRUE) + epi_slide_mean(value, before = 50 * days_dt, names_sep = NULL, na.rm = TRUE) expect_equal(result1, expected_output %>% rename(slide_value_value = slide_value)) # Ungrouped with multiple geos # epi_slide_mean fails when input data groups contain duplicate time_values, # e.g. 
aggregating across geos expect_error( - small_x %>% ungroup() %>% epi_slide_mean(value, before = 6L), + small_x %>% ungroup() %>% epi_slide_mean(value, before = 6 * days_dt), class = "epiprocess__epi_slide_opt__duplicate_time_values" ) }) test_that("epi_slide computation via formula can use ref_time_value", { expected_output <- dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value = d + 1:5), - dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value = d + 1:5) + dplyr::tibble(geo_value = "ak", time_value = test_date + 1:5, value = 11:15, slide_value = test_date + 1:5), + dplyr::tibble(geo_value = "al", time_value = test_date + 1:5, value = -(1:5), slide_value = test_date + 1:5) ) %>% group_by(geo_value) %>% - as_epi_df(as_of = d + 6) + as_epi_df(as_of = test_date + 6) result1 <- small_x %>% epi_slide( f = ~.ref_time_value, - before = 50 + before = 50 * days_dt ) expect_equal(result1, expected_output) @@ -747,7 +719,7 @@ test_that("epi_slide computation via formula can use ref_time_value", { result2 <- small_x %>% epi_slide( f = ~.z, - before = 50 + before = 50 * days_dt ) expect_equal(result2, expected_output) @@ -755,40 +727,40 @@ test_that("epi_slide computation via formula can use ref_time_value", { result3 <- small_x %>% epi_slide( f = ~..3, - before = 50 + before = 50 * days_dt ) expect_equal(result3, expected_output) # Ungrouped with multiple geos expected_output <- dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value = d + 1:5), - dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value = d + 1:5) + dplyr::tibble(geo_value = "ak", time_value = test_date + 1:5, value = 11:15, slide_value = test_date + 1:5), + dplyr::tibble(geo_value = "al", time_value = test_date + 1:5, value = -(1:5), slide_value = test_date + 1:5) ) %>% - as_epi_df(as_of = d + 6) %>% + as_epi_df(as_of = test_date + 6) %>% arrange(time_value) result4 <- small_x %>% ungroup() %>% epi_slide( f = ~.ref_time_value, - before = 50 + before = 50 * days_dt ) expect_equal(result4, expected_output) }) test_that("epi_slide computation via function can use ref_time_value", { expected_output <- dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value = d + 1:5), - dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value = d + 1:5) + dplyr::tibble(geo_value = "ak", time_value = test_date + 1:5, value = 11:15, slide_value = test_date + 1:5), + dplyr::tibble(geo_value = "al", time_value = test_date + 1:5, value = -(1:5), slide_value = test_date + 1:5) ) %>% group_by(geo_value) %>% - as_epi_df(as_of = d + 6) + as_epi_df(as_of = test_date + 6) result1 <- small_x %>% epi_slide( f = function(x, g, t) t, - before = 2 + before = 2 * days_dt ) expect_equal(result1, expected_output) @@ -797,15 +769,15 @@ test_that("epi_slide computation via function can use ref_time_value", { test_that("epi_slide computation via dots can use ref_time_value and group", { # ref_time_value expected_output <- dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value = d + 1:5), - dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value = d + 1:5) + dplyr::tibble(geo_value = "ak", time_value = test_date + 1:5, value = 11:15, slide_value = test_date + 1:5), + dplyr::tibble(geo_value = "al", time_value = test_date + 1:5, value = -(1:5), slide_value = test_date + 1:5) ) %>% 
group_by(geo_value) %>% - as_epi_df(as_of = d + 6) + as_epi_df(as_of = test_date + 6) result1 <- small_x %>% epi_slide( - before = 50, + before = 50 * days_dt, slide_value = .ref_time_value ) @@ -815,22 +787,22 @@ test_that("epi_slide computation via dots can use ref_time_value and group", { # `.env`. expect_error(small_x %>% epi_slide( - before = 50, + before = 50 * days_dt, slide_value = .env$.ref_time_value )) # group_key # Use group_key column expected_output <- dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value = "ak"), - dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value = "al") + dplyr::tibble(geo_value = "ak", time_value = test_date + 1:5, value = 11:15, slide_value = "ak"), + dplyr::tibble(geo_value = "al", time_value = test_date + 1:5, value = -(1:5), slide_value = "al") ) %>% group_by(geo_value) %>% - as_epi_df(as_of = d + 6) + as_epi_df(as_of = test_date + 6) result3 <- small_x %>% epi_slide( - before = 2, + before = 2 * days_dt, slide_value = .group_key$geo_value ) @@ -838,15 +810,15 @@ test_that("epi_slide computation via dots can use ref_time_value and group", { # Use entire group_key object expected_output <- dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value = 1L), - dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value = 1L) + dplyr::tibble(geo_value = "ak", time_value = test_date + 1:5, value = 11:15, slide_value = 1L), + dplyr::tibble(geo_value = "al", time_value = test_date + 1:5, value = -(1:5), slide_value = 1L) ) %>% group_by(geo_value) %>% - as_epi_df(as_of = d + 6) + as_epi_df(as_of = test_date + 6) result4 <- small_x %>% epi_slide( - before = 2, + before = 2 * days_dt, slide_value = nrow(.group_key) ) @@ -854,16 +826,16 @@ test_that("epi_slide computation via dots can use ref_time_value and group", { # Ungrouped with multiple geos expected_output <- dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value = d + 1:5), - dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value = d + 1:5) + dplyr::tibble(geo_value = "ak", time_value = test_date + 1:5, value = 11:15, slide_value = test_date + 1:5), + dplyr::tibble(geo_value = "al", time_value = test_date + 1:5, value = -(1:5), slide_value = test_date + 1:5) ) %>% - as_epi_df(as_of = d + 6) %>% + as_epi_df(as_of = test_date + 6) %>% arrange(time_value) result5 <- small_x %>% ungroup() %>% epi_slide( - before = 50, + before = 50 * days_dt, slide_value = .ref_time_value ) expect_equal(result5, expected_output) @@ -872,14 +844,14 @@ test_that("epi_slide computation via dots can use ref_time_value and group", { test_that("epi_slide computation via dots outputs the same result using col names and the data var", { expected_output <- small_x %>% epi_slide( - before = 2, + before = 2 * days_dt, slide_value = max(time_value) ) %>% - as_epi_df(as_of = d + 6) + as_epi_df(as_of = test_date + 6) result1 <- small_x %>% epi_slide( - before = 2, + before = 2 * days_dt, slide_value = max(.x$time_value) ) @@ -887,7 +859,7 @@ test_that("epi_slide computation via dots outputs the same result using col name result2 <- small_x %>% epi_slide( - before = 2, + before = 2 * days_dt, slide_value = max(.data$time_value) ) @@ -896,7 +868,9 @@ test_that("epi_slide computation via dots outputs the same result using col name test_that("`epi_slide` can access objects inside of helper functions", { helper <- 
function(archive_haystack, time_value_needle) { - archive_haystack %>% epi_slide(has_needle = time_value_needle %in% time_value, before = 365000L) + archive_haystack %>% epi_slide( + has_needle = time_value_needle %in% time_value, before = 365000L * days_dt + ) } expect_error( helper(small_x, as.Date("2021-01-01")), @@ -906,45 +880,50 @@ test_that("`epi_slide` can access objects inside of helper functions", { test_that("basic slide behavior is correct when groups have non-overlapping date ranges", { small_x_misaligned_dates <- dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15), - dplyr::tibble(geo_value = "al", time_value = d + 151:155, value = -(1:5)) + dplyr::tibble(geo_value = "ak", time_value = test_date + 1:5, value = 11:15), + dplyr::tibble(geo_value = "al", time_value = test_date + 151:155, value = -(1:5)) ) %>% - as_epi_df(as_of = d + 6) %>% + as_epi_df(as_of = test_date + 6) %>% group_by(geo_value) expected_output <- dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value = cumsum(11:15) / 1:5), - dplyr::tibble(geo_value = "al", time_value = d + 151:155, value = -(1:5), slide_value = cumsum(-(1:5)) / 1:5) + dplyr::tibble(geo_value = "ak", time_value = test_date + 1:5, value = 11:15, slide_value = cumsum(11:15) / 1:5), + dplyr::tibble( + geo_value = "al", time_value = test_date + 151:155, value = -(1:5), slide_value = cumsum(-(1:5)) / 1:5 + ) ) %>% group_by(geo_value) %>% - as_epi_df(as_of = d + 6) + as_epi_df(as_of = test_date + 6) - result1 <- epi_slide(small_x_misaligned_dates, f = ~ mean(.x$value), before = 50) + result1 <- epi_slide(small_x_misaligned_dates, f = ~ mean(.x$value), before = 50 * days_dt) expect_equal(result1, expected_output) - result2 <- epi_slide_mean(small_x_misaligned_dates, value, before = 50, names_sep = NULL, na.rm = TRUE) + result2 <- epi_slide_mean( + small_x_misaligned_dates, value, + before = 50 * days_dt, names_sep = NULL, na.rm = TRUE + ) expect_equal(result2, expected_output %>% rename(slide_value_value = slide_value)) }) test_that("epi_slide gets correct ref_time_value when groups have non-overlapping date ranges", { small_x_misaligned_dates <- dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15), - dplyr::tibble(geo_value = "al", time_value = d + 151:155, value = -(1:5)) + dplyr::tibble(geo_value = "ak", time_value = test_date + 1:5, value = 11:15), + dplyr::tibble(geo_value = "al", time_value = test_date + 151:155, value = -(1:5)) ) %>% - as_epi_df(as_of = d + 6) %>% + as_epi_df(as_of = test_date + 6) %>% group_by(geo_value) expected_output <- dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value = d + 1:5), - dplyr::tibble(geo_value = "al", time_value = d + 151:155, value = -(1:5), slide_value = d + 151:155) + dplyr::tibble(geo_value = "ak", time_value = test_date + 1:5, value = 11:15, slide_value = test_date + 1:5), + dplyr::tibble(geo_value = "al", time_value = test_date + 151:155, value = -(1:5), slide_value = test_date + 151:155) ) %>% group_by(geo_value) %>% - as_epi_df(as_of = d + 6) + as_epi_df(as_of = test_date + 6) result1 <- small_x_misaligned_dates %>% epi_slide( - before = 50, + before = 50 * days_dt, slide_value = .ref_time_value ) @@ -952,7 +931,7 @@ test_that("epi_slide gets correct ref_time_value when groups have non-overlappin }) test_that("results for different `before`s and `after`s match between epi_slide and epi_slide_mean", { - test_time_type_mean <- 
function(dates, vals, before = 6L, after = 0L, n, m, n_obs, k, ...) { + test_time_type_mean <- function(dates, vals, before = 6 * days_dt, after = 0 * days_dt, n, m, n_obs, k, ...) { # Three states, with 2 variables. a is linear, going up in one state and down in the other # b is just random. last (m-1):(n-1) dates are missing epi_data <- epiprocess::as_epi_df(rbind(tibble( @@ -975,10 +954,7 @@ test_that("results for different `before`s and `after`s match between epi_slide ), before = before, after = after, names_sep = NULL, ... ) - result2 <- epi_slide_mean(epi_data, - col_names = c(a, b), na.rm = TRUE, - before = before, after = after, ... - ) + result2 <- epi_slide_mean(epi_data, col_names = c(a, b), na.rm = TRUE, before = before, after = after, ...) expect_equal(result1, result2) } @@ -994,12 +970,12 @@ test_that("results for different `before`s and `after`s match between epi_slide # Basic time type days <- as.Date("2022-01-01") + k - test_time_type_mean(days, rand_vals, before = 6, after = 0, n = n, m = m, n_obs = n_obs, k = k) - test_time_type_mean(days, rand_vals, before = 6, after = 1, n = n, m = m, n_obs = n_obs, k = k) - test_time_type_mean(days, rand_vals, before = 6, after = 6, n = n, m = m, n_obs = n_obs, k = k) - test_time_type_mean(days, rand_vals, before = 1, after = 6, n = n, m = m, n_obs = n_obs, k = k) - test_time_type_mean(days, rand_vals, before = 0, after = 6, n = n, m = m, n_obs = n_obs, k = k) - test_time_type_mean(days, rand_vals, before = 0, after = 1, n = n, m = m, n_obs = n_obs, k = k) + test_time_type_mean(days, rand_vals, before = 6 * days_dt, n = n, m = m, n_obs = n_obs, k = k) + test_time_type_mean(days, rand_vals, before = 6 * days_dt, after = 1 * days_dt, n = n, m = m, n_obs = n_obs, k = k) + test_time_type_mean(days, rand_vals, before = 6 * days_dt, after = 6 * days_dt, n = n, m = m, n_obs = n_obs, k = k) + test_time_type_mean(days, rand_vals, before = 1 * days_dt, after = 6 * days_dt, n = n, m = m, n_obs = n_obs, k = k) + test_time_type_mean(days, rand_vals, after = 6 * days_dt, n = n, m = m, n_obs = n_obs, k = k) + test_time_type_mean(days, rand_vals, after = 1 * days_dt, n = n, m = m, n_obs = n_obs, k = k) # Without any missing dates n <- 15 # Max date index @@ -1011,12 +987,12 @@ test_that("results for different `before`s and `after`s match between epi_slide # Basic time type days <- as.Date("2022-01-01") + k - test_time_type_mean(days, rand_vals, before = 6, after = 0, n = n, m = m, n_obs = n_obs, k = k) - test_time_type_mean(days, rand_vals, before = 6, after = 1, n = n, m = m, n_obs = n_obs, k = k) - test_time_type_mean(days, rand_vals, before = 6, after = 6, n = n, m = m, n_obs = n_obs, k = k) - test_time_type_mean(days, rand_vals, before = 1, after = 6, n = n, m = m, n_obs = n_obs, k = k) - test_time_type_mean(days, rand_vals, before = 0, after = 6, n = n, m = m, n_obs = n_obs, k = k) - test_time_type_mean(days, rand_vals, before = 0, after = 1, n = n, m = m, n_obs = n_obs, k = k) + test_time_type_mean(days, rand_vals, before = 6 * days_dt, n = n, m = m, n_obs = n_obs, k = k) + test_time_type_mean(days, rand_vals, before = 6 * days_dt, after = 1 * days_dt, n = n, m = m, n_obs = n_obs, k = k) + test_time_type_mean(days, rand_vals, before = 6 * days_dt, after = 6 * days_dt, n = n, m = m, n_obs = n_obs, k = k) + test_time_type_mean(days, rand_vals, before = 1 * days_dt, after = 6 * days_dt, n = n, m = m, n_obs = n_obs, k = k) + test_time_type_mean(days, rand_vals, after = 6 * days_dt, n = n, m = m, n_obs = n_obs, k = k) + test_time_type_mean(days, 
rand_vals, after = 1 * days_dt, n = n, m = m, n_obs = n_obs, k = k) }) test_that("results for different time_types match between epi_slide and epi_slide_mean", { @@ -1047,26 +1023,11 @@ test_that("results for different time_types match between epi_slide and epi_slid )), ...) } - # Basic time type + # Basic time type, require before and after in difftimes days <- as.Date("2022-01-01") + k - - # Require lubridate::period function to be passed as `time_step` - day_times_minute <- lubridate::ydm_h("2022-01-01-15") + lubridate::minutes(k) # needs time_step = lubridate::minutes - day_times_hour <- lubridate::ydm_h("2022-01-01-15") + lubridate::hours(k) # needs time_step = lubridate::hours - weeks <- as.Date("2022-01-01") + 7L * k # needs time_step = lubridate::weeks - - # Don't require a `time_step` fn - yearweeks <- tsibble::yearweek(10L + k) + weeks <- as.Date("2022-01-01") + 7L * k yearmonths <- tsibble::yearmonth(10L + k) - yearquarters <- tsibble::yearquarter(10L + k) - years <- 2000L + k # does NOT need time_step = lubridate::years because dates are numeric, not a special date format - - # Not supported - custom_dates <- c( - "January 1, 2022", "January 2, 2022", "January 3, 2022", - "January 4, 2022", "January 5, 2022", "January 6, 2022" - ) - not_dates <- c("a", "b", "c", "d", "e", "f") + integers <- 2000L + k ref_epi_data <- generate_special_date_data(days) %>% group_by(geo_value) @@ -1075,10 +1036,10 @@ test_that("results for different time_types match between epi_slide and epi_slid slide_value_a = mean(.x$a, rm.na = TRUE), slide_value_b = mean(.x$b, rm.na = TRUE) ), - before = 6L, after = 0L, names_sep = NULL + before = 6 * days_dt, names_sep = NULL ) - test_time_type_mean <- function(dates, before = 6L, after = 0L, ...) { + test_time_type_mean <- function(dates, before) { # Three states, with 2 variables. a is linear, going up in one state and down in the other # b is just random. date 10 is missing epi_data <- generate_special_date_data(dates) %>% @@ -1088,11 +1049,10 @@ test_that("results for different time_types match between epi_slide and epi_slid slide_value_a = mean(.x$a, rm.na = TRUE), slide_value_b = mean(.x$b, rm.na = TRUE) ), - before = before, after = after, names_sep = NULL, ... + before = before, names_sep = NULL ) result2 <- epi_slide_mean(epi_data, - col_names = c(a, b), na.rm = TRUE, - before = before, after = after, ... 
+ col_names = c(a, b), na.rm = TRUE, before = before ) expect_equal(result1, result2) @@ -1101,14 +1061,10 @@ test_that("results for different time_types match between epi_slide and epi_slid expect_equal(select(ref_result, -time_value), select(result2, -time_value)) } - test_time_type_mean(days) - test_time_type_mean(yearweeks) - test_time_type_mean(yearmonths) - test_time_type_mean(yearquarters) - test_time_type_mean(years) - test_time_type_mean(day_times_minute, time_step = lubridate::minutes) - test_time_type_mean(day_times_hour, time_step = lubridate::hours) - test_time_type_mean(weeks, time_step = lubridate::weeks) + test_time_type_mean(days, before = 6 * days_dt) + test_time_type_mean(weeks, before = 6 * weeks_dt) + test_time_type_mean(yearmonths, before = 6) + test_time_type_mean(integers, before = 6) # `epi_slide_mean` can also handle `weeks` without `time_step` being # provided, but `epi_slide` can't @@ -1116,47 +1072,11 @@ test_that("results for different time_types match between epi_slide and epi_slid group_by(geo_value) result2 <- epi_slide_mean(epi_data, col_names = c(a, b), na.rm = TRUE, - before = 6L, after = 0L + before = 6 * weeks_dt ) expect_equal(select(ref_result, -time_value), select(result2, -time_value)) }) -test_that("special time_types without time_step fail in epi_slide_mean", { - n_obs <- 6 - k <- 1:n_obs - - day_times_minute <- lubridate::ydm_h("2022-01-01-15") + lubridate::minutes(k) # needs time_step = lubridate::minutes - day_times_hour <- lubridate::ydm_h("2022-01-01-15") + lubridate::hours(k) # needs time_step = lubridate::hours - - # Not supported - custom_dates <- c( - "January 1, 2022", "January 2, 2022", "January 3, 2022", - "January 4, 2022", "January 5, 2022", "January 6, 2022" - ) - not_dates <- c("a", "b", "c", "d", "e", "f") - - test_time_type_mean <- function(dates, before = 6L, after = 0L, ...) { - epi_data <- epiprocess::as_epi_df(tibble( - geo_value = "al", - time_value = dates, - a = 1:n_obs - )) - - expect_error( - epi_slide_mean(epi_data, - col_names = a, - before = before, after = after - ), - class = "epiprocess__full_date_seq__unmappable_time_type" - ) - } - - test_time_type_mean(custom_dates) - test_time_type_mean(not_dates) - test_time_type_mean(day_times_minute) - test_time_type_mean(day_times_hour) -}) - test_that("helper `full_date_seq` returns expected date values", { n <- 6L # Max date index m <- 1L # Number of missing dates @@ -1185,19 +1105,11 @@ test_that("helper `full_date_seq` returns expected date values", { )), ...) 
} - # Basic time type + # Basic time type, require before and after in difftimes days <- as.Date("2022-01-01") + k - - # Require lubridate::period function to be passed as `time_step` - day_times_minute <- lubridate::ydm_h("2022-01-01-15") + lubridate::minutes(k) # needs time_step = lubridate::minutes - day_times_hour <- lubridate::ydm_h("2022-01-01-15") + lubridate::hours(k) # needs time_step = lubridate::hours - weeks <- as.Date("2022-01-01") + 7L * k # needs time_step = lubridate::weeks - - # Don't require a `time_step` fn - yearweeks <- tsibble::yearweek(10L + k) + weeks <- as.Date("2022-01-01") + 7L * k yearmonths <- tsibble::yearmonth(10L + k) - yearquarters <- tsibble::yearquarter(10L + k) - years <- 2000L + k # does NOT need time_step = lubridate::years because dates are numeric, not a special date format + integers <- 2000L + k before <- 2L after <- 1L @@ -1205,7 +1117,7 @@ test_that("helper `full_date_seq` returns expected date values", { expect_identical( full_date_seq( generate_special_date_data(days), - before = before, after = after + before = before * days_dt, after = after * days_dt, time_type = "day" ), list( all_dates = as.Date(c( @@ -1217,21 +1129,18 @@ test_that("helper `full_date_seq` returns expected date values", { ) ) expect_identical( - full_date_seq( - generate_special_date_data(yearweeks), - before = before, after = after - ), + full_date_seq(generate_special_date_data(weeks), before = before, after = after, time_type = "week"), list( - all_dates = tsibble::yearweek(10:16), - pad_early_dates = tsibble::yearweek(8:9), - pad_late_dates = tsibble::yearweek(17) + all_dates = as.Date(c( + "2022-01-01", "2022-01-08", "2022-01-15", "2022-01-22", + "2022-01-29", "2022-02-05", "2022-02-12" + )), + pad_early_dates = as.Date(c("2021-12-18", "2021-12-25")), + pad_late_dates = as.Date(c("2022-02-19")) ) ) expect_identical( - full_date_seq( - generate_special_date_data(yearmonths), - before = before, after = after - ), + full_date_seq(generate_special_date_data(yearmonths), before = before, after = after, time_type = "yearmonth"), list( all_dates = tsibble::yearmonth(10:16), pad_early_dates = tsibble::yearmonth(8:9), @@ -1239,81 +1148,13 @@ test_that("helper `full_date_seq` returns expected date values", { ) ) expect_identical( - full_date_seq( - generate_special_date_data(yearquarters), - before = before, after = after - ), - list( - all_dates = tsibble::yearquarter(10:16), - pad_early_dates = tsibble::yearquarter(8:9), - pad_late_dates = tsibble::yearquarter(17) - ) - ) - expect_identical( - full_date_seq( - generate_special_date_data(years), - before = before, after = after - ), + full_date_seq(generate_special_date_data(integers), before = before, after = after, time_type = "integer"), list( all_dates = 2000L:2006L, pad_early_dates = 1998L:1999L, pad_late_dates = 2007L ) ) - expect_identical( - full_date_seq( - generate_special_date_data(day_times_minute), - before = before, after = after, - time_step = lubridate::minutes - ), - list( - all_dates = lubridate::ydm_h("2022-01-01-15") + lubridate::minutes(0:6), - pad_early_dates = lubridate::ydm_h("2022-01-01-15") - lubridate::minutes(2:1), - pad_late_dates = lubridate::ydm_h("2022-01-01-15") + lubridate::minutes(7) - ) - ) - expect_identical( - full_date_seq( - generate_special_date_data(day_times_hour), - before = before, after = after, - time_step = lubridate::hours - ), - list( - all_dates = lubridate::ydm_h("2022-01-01-15") + lubridate::hours(0:6), - pad_early_dates = lubridate::ydm_h("2022-01-01-15") - 
lubridate::hours(2:1), - pad_late_dates = lubridate::ydm_h("2022-01-01-15") + lubridate::hours(7) - ) - ) - expect_identical( - full_date_seq( - generate_special_date_data(weeks), - before = before, after = after, - time_step = lubridate::weeks - ), - list( - all_dates = as.Date(c( - "2022-01-01", "2022-01-08", "2022-01-15", "2022-01-22", - "2022-01-29", "2022-02-05", "2022-02-12" - )), - pad_early_dates = as.Date(c("2021-12-18", "2021-12-25")), - pad_late_dates = as.Date(c("2022-02-19")) - ) - ) - # Check the middle branch (`if (missing(time_step))`) of `full_date_seq`. - expect_identical( - full_date_seq( - generate_special_date_data(weeks, time_type = "week"), - before = before, after = after - ), - list( - all_dates = as.Date(c( - "2022-01-01", "2022-01-08", "2022-01-15", "2022-01-22", - "2022-01-29", "2022-02-05", "2022-02-12" - )), - pad_early_dates = as.Date(c("2021-12-18", "2021-12-25")), - pad_late_dates = as.Date(c("2022-02-19")) - ) - ) # Other before/after values before <- 5L @@ -1322,7 +1163,7 @@ test_that("helper `full_date_seq` returns expected date values", { expect_identical( full_date_seq( generate_special_date_data(days), - before = before, after = after + before = before * days_dt, after = after * days_dt, time_type = "day" ), list( all_dates = as.Date(c( @@ -1343,7 +1184,7 @@ test_that("helper `full_date_seq` returns expected date values", { expect_identical( full_date_seq( generate_special_date_data(days), - before = before, after = after + before = before * days_dt, after = after * days_dt, time_type = "day" ), list( all_dates = as.Date(c( @@ -1358,49 +1199,44 @@ test_that("helper `full_date_seq` returns expected date values", { ) }) -test_that("`epi_slide_mean` errors when passed `time_values` with closer than expected spacing", { - time_df <- tibble( - geo_value = 1, - value = c(0:7, 3.5, 10, 20), - # Adding the value 3.5 creates a time that has fractional seconds, which - # doesn't follow the expected 1-second spacing of the `time_values`. - # This leads to `frollmean` using obs spanning less than the expected - # time frame for some computation windows. 
- time_value = Sys.time() + value - ) %>% - as_epi_df() - expect_error( - epi_slide_mean(time_df, value, before = 6L, time_step = lubridate::seconds), - class = "epiprocess__epi_slide_opt__unexpected_row_number" - ) -}) - test_that("epi_slide_mean produces same output as epi_slide_opt", { - result1 <- epi_slide_mean(small_x, value, before = 50, names_sep = NULL, na.rm = TRUE) - result2 <- epi_slide_opt(small_x, value, + result1 <- epi_slide_mean( + small_x, + value, + before = 50 * days_dt, + names_sep = NULL, + na.rm = TRUE + ) + result2 <- epi_slide_opt( + small_x, + value, f = data.table::frollmean, - before = 50, names_sep = NULL, na.rm = TRUE + before = 50 * days_dt, + names_sep = NULL, + na.rm = TRUE ) expect_equal(result1, result2) - - result3 <- epi_slide_opt(small_x, value, + result3 <- epi_slide_opt( + small_x, + value, f = slider::slide_mean, - before = 50, names_sep = NULL, na_rm = TRUE + before = 50 * days_dt, + names_sep = NULL, + na_rm = TRUE ) expect_equal(result1, result3) }) test_that("epi_slide_sum produces same output as epi_slide_opt", { - result1 <- epi_slide_sum(small_x, value, before = 50, names_sep = NULL, na.rm = TRUE) + result1 <- epi_slide_sum(small_x, value, before = 50 * days_dt, names_sep = NULL, na.rm = TRUE) result2 <- epi_slide_opt(small_x, value, f = data.table::frollsum, - before = 50, names_sep = NULL, na.rm = TRUE + before = 50 * days_dt, names_sep = NULL, na.rm = TRUE ) expect_equal(result1, result2) - result3 <- epi_slide_opt(small_x, value, f = slider::slide_sum, - before = 50, names_sep = NULL, na_rm = TRUE + before = 50 * days_dt, names_sep = NULL, na_rm = TRUE ) expect_equal(result1, result3) }) @@ -1410,31 +1246,29 @@ test_that("`epi_slide_opt` errors when passed non-`data.table`, non-`slider` fun epi_slide_opt( grouped, col_names = value, f = data.table::frollmean, - before = 1L, after = 0L, ref_time_values = d + 1 + before = days_dt, ref_time_values = test_date + 1 ) ) expect_no_error( epi_slide_opt( grouped, col_names = value, f = slider::slide_min, - before = 1L, after = 0L, ref_time_values = d + 1 + before = days_dt, ref_time_values = test_date + 1 ) ) - reexport_frollmean <- data.table::frollmean expect_no_error( epi_slide_opt( grouped, col_names = value, f = reexport_frollmean, - before = 1L, after = 0L, ref_time_values = d + 1 + before = days_dt, ref_time_values = test_date + 1 ) ) - expect_error( epi_slide_opt( grouped, col_names = value, f = mean, - before = 1L, after = 0L, ref_time_values = d + 1 + before = days_dt, ref_time_values = test_date + 1 ), class = "epiprocess__epi_slide_opt__unsupported_slide_function" ) diff --git a/tests/testthat/test-epix_fill_through_version.R b/tests/testthat/test-epix_fill_through_version.R index b87b26ed..7da6c6be 100644 --- a/tests/testthat/test-epix_fill_through_version.R +++ b/tests/testthat/test-epix_fill_through_version.R @@ -1,9 +1,11 @@ +test_date <- as.Date("2020-01-01") + test_that("epix_fill_through_version mirrors input when it is sufficiently up to date", { ea_orig <- as_epi_archive(data.table::data.table( - geo_value = "g1", time_value = as.Date("2020-01-01"), - version = 1:5, value = 1:5 + geo_value = "ak", time_value = test_date, + version = test_date + 1:5, value = 1:5 )) - some_earlier_observed_version <- 2L + some_earlier_observed_version <- test_date + 2L ea_trivial_fill_na1 <- epix_fill_through_version(ea_orig, some_earlier_observed_version, "na") ea_trivial_fill_na2 <- epix_fill_through_version(ea_orig, ea_orig$versions_end, "na") ea_trivial_fill_locf <- 
epix_fill_through_version(ea_orig, some_earlier_observed_version, "locf") @@ -15,13 +17,13 @@ test_that("epix_fill_through_version mirrors input when it is sufficiently up to test_that("epix_fill_through_version can extend observed versions, gives expected `as_of`s", { ea_orig <- as_epi_archive(data.table::data.table( - geo_value = "g1", - time_value = as.Date("2020-01-01") + c(rep(0L, 5L), 1L), - version = c(1:5, 2L), + geo_value = "ak", + time_value = test_date + c(rep(0L, 5L), 1L), + version = test_date + c(1:5, 2L), value = 1:6 )) - first_unobserved_version <- 6L - later_unobserved_version <- 10L + first_unobserved_version <- test_date + 6L + later_unobserved_version <- test_date + 10L ea_fill_na <- epix_fill_through_version(ea_orig, later_unobserved_version, "na") ea_fill_locf <- epix_fill_through_version(ea_orig, later_unobserved_version, "locf") @@ -29,7 +31,7 @@ test_that("epix_fill_through_version can extend observed versions, gives expecte { expect_identical(ea_fill_na$versions_end, later_unobserved_version) expect_identical(tibble::as_tibble(epix_as_of(ea_fill_na, first_unobserved_version)), - tibble::tibble(geo_value = "g1", time_value = as.Date("2020-01-01") + 0:1, value = rep(NA_integer_, 2L)), + tibble::tibble(geo_value = "ak", time_value = test_date + 0:1, value = rep(NA_integer_, 2L)), ignore_attr = TRUE ) expect_identical(ea_fill_locf$versions_end, later_unobserved_version) @@ -50,38 +52,39 @@ test_that("epix_fill_through_version does not mutate x", { for (ea_orig in list( # vanilla case as_epi_archive(data.table::data.table( - geo_value = "g1", time_value = as.Date("2020-01-01"), - version = 1:5, value = 1:5 + geo_value = "ak", time_value = test_date, + version = test_date + 1:5, value = 1:5 )), # data.table unique yielding original DT by reference special case (maybe # having only 1 row is the trigger? 
having no revisions of initial values # doesn't seem sufficient to trigger) - as_epi_archive(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, value = 10L)) + as_epi_archive(tibble::tibble(geo_value = "ak", time_value = test_date + 1, version = test_date + 1, value = 10L)) )) { ea_orig_before <- clone(ea_orig) - some_unobserved_version <- 8L - ea_fill_na <- epix_fill_through_version(ea_orig, some_unobserved_version, "na") + ea_fill_na <- epix_fill_through_version(ea_orig, test_date + 8, "na") expect_identical(ea_orig_before, ea_orig) - ea_fill_locf <- epix_fill_through_version(ea_orig, some_unobserved_version, "locf") + ea_fill_locf <- epix_fill_through_version(ea_orig, test_date + 8, "locf") expect_identical(ea_orig_before, ea_orig) } }) test_that("epix_fill_through_version return with expected visibility", { ea <- as_epi_archive(data.table::data.table( - geo_value = "g1", time_value = as.Date("2020-01-01"), - version = 1:5, value = 1:5 + geo_value = "ak", time_value = test_date, + version = test_date + 1:5, value = 1:5 )) - expect_true(withVisible(epix_fill_through_version(ea, 10L, "na"))[["visible"]]) + expect_true(withVisible(epix_fill_through_version(ea, test_date + 10L, "na"))[["visible"]]) }) test_that("epix_fill_through_version returns same key & doesn't mutate old DT or its key", { - ea <- as_epi_archive(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, value = 10L)) + ea <- as_epi_archive( + tibble::tibble(geo_value = "ak", time_value = test_date + 1, version = test_date + 1, value = 10L) + ) old_dt_copy <- data.table::copy(ea$DT) old_key <- data.table::key(ea$DT) - expect_identical(data.table::key(epix_fill_through_version(ea, 5L, "na")$DT), old_key) - expect_identical(data.table::key(epix_fill_through_version(ea, 5L, "locf")$DT), old_key) + expect_identical(data.table::key(epix_fill_through_version(ea, test_date + 5L, "na")$DT), old_key) + expect_identical(data.table::key(epix_fill_through_version(ea, test_date + 5L, "locf")$DT), old_key) expect_identical(data.table::key(ea$DT), old_key) }) diff --git a/tests/testthat/test-epix_merge.R b/tests/testthat/test-epix_merge.R index c29301b8..d336622f 100644 --- a/tests/testthat/test-epix_merge.R +++ b/tests/testthat/test-epix_merge.R @@ -1,3 +1,5 @@ +test_date <- as.Date("2020-01-01") + test_that("epix_merge requires forbids on invalid `y`", { ea <- archive_cases_dv_subset expect_error(epix_merge(ea, data.frame(x = 1))) @@ -9,20 +11,20 @@ test_that("epix_merge merges and carries forward updates properly", { tibble::tribble( ~geo_value, ~time_value, ~version, ~x_value, # same version set for x and y - "g1", 1L, 1:3, paste0("XA", 1:3), + "ak", test_date + 1, test_date + 1:3, paste0("XA", 1:3), # versions of x surround those of y + this measurement has # max update version beyond some others - "g1", 2L, 1:5, paste0("XB", 1:5), + "ak", test_date + 2, test_date + 1:5, paste0("XB", 1:5), # mirror case - "g1", 3L, 2L, paste0("XC", 2L), + "ak", test_date + 3, test_date + 2L, paste0("XC", 2L), # x has 1 version, y has 0 - "g1", 4L, 1L, paste0("XD", 1L), + "ak", test_date + 4, test_date + 1L, paste0("XD", 1L), # non-NA values that should be carried forward # (version-wise LOCF) in other versions, plus NAs that # should (similarly) be carried forward as NA (latter # wouldn't work with an ordinary merge + post-processing # with `data.table::nafill`) - "g1", 6L, c(1L, 3L, 5L), paste0("XE", c(1L, NA, 5L)) + "ak", test_date + 6, test_date + c(1L, 3L, 5L), paste0("XE", c(1L, NA, 5L)) ) %>% tidyr::unchop(c(version, x_value)) 
%>% dplyr::mutate(dplyr::across(c(x_value), ~ dplyr::if_else(grepl("NA", .x), NA_character_, .x))) @@ -32,11 +34,11 @@ test_that("epix_merge merges and carries forward updates properly", { data.table::as.data.table( tibble::tribble( ~geo_value, ~time_value, ~version, ~y_value, - "g1", 1L, 1:3, paste0("YA", 1:3), - "g1", 2L, 2L, paste0("YB", 2L), - "g1", 3L, 1:5, paste0("YC", 1:5), - "g1", 5L, 1L, paste0("YD", 1L), - "g1", 6L, 1:5, paste0("YE", 1:5), + "ak", test_date + 1, test_date + 1:3, paste0("YA", 1:3), + "ak", test_date + 2, test_date + 2L, paste0("YB", 2L), + "ak", test_date + 3, test_date + 1:5, paste0("YC", 1:5), + "ak", test_date + 5, test_date + 1L, paste0("YD", 1L), + "ak", test_date + 6, test_date + 1:5, paste0("YE", 1:5), ) %>% tidyr::unchop(c(version, y_value)) %>% dplyr::mutate(dplyr::across(c(y_value), ~ dplyr::if_else(grepl("NA", .x), NA_character_, .x))) @@ -47,12 +49,12 @@ test_that("epix_merge merges and carries forward updates properly", { data.table::as.data.table( tibble::tribble( ~geo_value, ~time_value, ~version, ~x_value, ~y_value, - "g1", 1L, 1:3, paste0("XA", 1:3), paste0("YA", 1:3), - "g1", 2L, 1:5, paste0("XB", 1:5), paste0("YB", c(NA, 2L, 2L, 2L, 2L)), - "g1", 3L, 1:5, paste0("XC", c(NA, 2L, 2L, 2L, 2L)), paste0("YC", 1:5), - "g1", 4L, 1L, paste0("XD", 1L), paste0("YD", NA), - "g1", 5L, 1L, paste0("XD", NA), paste0("YD", 1L), - "g1", 6L, 1:5, paste0("XE", c(1L, 1L, NA, NA, 5L)), paste0("YE", 1:5), + "ak", test_date + 1, test_date + 1:3, paste0("XA", 1:3), paste0("YA", 1:3), + "ak", test_date + 2, test_date + 1:5, paste0("XB", 1:5), paste0("YB", c(NA, 2L, 2L, 2L, 2L)), + "ak", test_date + 3, test_date + 1:5, paste0("XC", c(NA, 2L, 2L, 2L, 2L)), paste0("YC", 1:5), + "ak", test_date + 4, test_date + 1L, paste0("XD", 1L), paste0("YD", NA), + "ak", test_date + 5, test_date + 1L, paste0("XD", NA), paste0("YD", 1L), + "ak", test_date + 6, test_date + 1:5, paste0("XE", c(1L, 1L, NA, NA, 5L)), paste0("YE", 1:5), ) %>% tidyr::unchop(c(version, x_value, y_value)) %>% dplyr::mutate(dplyr::across(c(x_value, y_value), ~ dplyr::if_else(grepl("NA", .x), NA_character_, .x))) @@ -65,39 +67,41 @@ test_that("epix_merge merges and carries forward updates properly", { test_that("epix_merge forbids and warns on metadata and naming issues", { expect_error( epix_merge( - as_epi_archive(tibble::tibble(geo_value = "tx", time_value = 1L, version = 1L, x_value = 1L)), - as_epi_archive(tibble::tibble(geo_value = "us", time_value = 1L, version = 5L, y_value = 2L)) + as_epi_archive(tibble::tibble(geo_value = "tx", time_value = test_date, version = test_date + 1L, x_value = 1L)), + as_epi_archive(tibble::tibble(geo_value = "us", time_value = test_date, version = test_date + 5L, y_value = 2L)) ), regexp = "must have the same.*geo_type" ) expect_error( epix_merge( - as_epi_archive(tibble::tibble(geo_value = "pa", time_value = 1L, version = 1L, x_value = 1L)), - as_epi_archive(tibble::tibble(geo_value = "pa", time_value = as.Date("2020-01-01"), version = 5L, y_value = 2L)) + as_epi_archive(tibble::tibble(geo_value = "pa", time_value = test_date, version = test_date + 1L, x_value = 1L)), + as_epi_archive( + tibble::tibble(geo_value = "pa", time_value = 1L, version = 2L, y_value = 2L) + ) ), - regexp = "must have the same.*time_type" + regexp = "must share data type on their `time_value` column." 
) expect_error( epix_merge( - as_epi_archive(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, value = 1L)), - as_epi_archive(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, value = 2L)) + as_epi_archive(tibble::tibble(geo_value = "ak", time_value = test_date, version = test_date + 1L, value = 1L)), + as_epi_archive(tibble::tibble(geo_value = "ak", time_value = test_date, version = test_date + 1L, value = 2L)) ), regexp = "overlapping.*names" ) expect_warning( epix_merge( - as_epi_archive(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, x_value = 1L), + as_epi_archive(tibble::tibble(geo_value = "ak", time_value = test_date, version = test_date + 1L, x_value = 1L), additional_metadata = list("updates_fetched" = lubridate::ymd_hms("2022-05-01 16:00:00", tz = "UTC")) ), - as_epi_archive(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, y_value = 2L)) + as_epi_archive(tibble::tibble(geo_value = "ak", time_value = test_date, version = test_date + 1L, y_value = 2L)) ), regexp = "x\\$additional_metadata", class = "epiprocess__epix_merge_ignores_additional_metadata" ) expect_warning( epix_merge( - as_epi_archive(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, x_value = 1L)), - as_epi_archive(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, y_value = 2L), + as_epi_archive(tibble::tibble(geo_value = "ak", time_value = test_date, version = test_date + 1L, x_value = 1L)), + as_epi_archive(tibble::tibble(geo_value = "ak", time_value = test_date, version = test_date + 1L, y_value = 2L), additional_metadata = list("updates_fetched" = lubridate::ymd_hms("2022-05-01 16:00:00", tz = "UTC")) ) ), @@ -109,30 +113,30 @@ test_that("epix_merge forbids and warns on metadata and naming issues", { # use `local` to prevent accidentally using the x, y, xy bindings here # elsewhere, while allowing reuse across a couple tests local({ - x <- as_epi_archive(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, x_value = 1L), - clobberable_versions_start = 1L, versions_end = 10L + x <- as_epi_archive(tibble::tibble(geo_value = "ak", time_value = test_date, version = test_date + 1L, x_value = 1L), + clobberable_versions_start = test_date + 1L, versions_end = test_date + 10L ) - y <- as_epi_archive(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, y_value = 2L), - clobberable_versions_start = 3L, versions_end = 10L + y <- as_epi_archive(tibble::tibble(geo_value = "ak", time_value = test_date, version = test_date + 1L, y_value = 2L), + clobberable_versions_start = test_date + 3L, versions_end = test_date + 10L ) xy <- epix_merge(x, y) test_that("epix_merge considers partially-clobberable row to be clobberable", { - expect_identical(xy$clobberable_versions_start, 1L) + expect_identical(xy$clobberable_versions_start, test_date + 1L) }) test_that("epix_merge result uses versions_end metadata not max version val", { - expect_identical(xy$versions_end, 10L) + expect_identical(xy$versions_end, test_date + 10L) }) }) local({ x <- as_epi_archive( - tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, x_value = 10L), - clobberable_versions_start = 1L, - versions_end = 3L + tibble::tibble(geo_value = "ak", time_value = test_date, version = test_date + 1L, x_value = 10L), + clobberable_versions_start = test_date + 1L, + versions_end = test_date + 3L ) y <- as_epi_archive( - tibble::tibble(geo_value = 1L, time_value = 1L, version = 5L, y_value = 20L), - clobberable_versions_start = 1L + tibble::tibble(geo_value = "ak", time_value = 
test_date, version = test_date + 5L, y_value = 20L), + clobberable_versions_start = test_date + 1L ) test_that('epix_merge forbids on sync default or "forbid"', { expect_error(epix_merge(x, y), @@ -147,12 +151,12 @@ local({ epix_merge(x, y, sync = "na"), as_epi_archive(tibble::tribble( ~geo_value, ~time_value, ~version, ~x_value, ~y_value, - 1L, 1L, 1L, 10L, NA_integer_, # x updated, y not observed yet - 1L, 1L, 4L, NA_integer_, NA_integer_, # NA-ing out x, y not observed yet - 1L, 1L, 5L, NA_integer_, 20L, # x still NA, y updated + "ak", test_date, test_date + 1L, 10L, NA_integer_, # x updated, y not observed yet + "ak", test_date, test_date + 4L, NA_integer_, NA_integer_, # NA-ing out x, y not observed yet + "ak", test_date, test_date + 5L, NA_integer_, 20L, # x still NA, y updated # (we should not have a y vals -> NA update here; version 5 should be # the `versions_end` of the result) - ), clobberable_versions_start = 1L) + ), clobberable_versions_start = test_date + 1L) ) }) test_that('epix_merge sync="locf" works', { @@ -160,9 +164,9 @@ local({ epix_merge(x, y, sync = "locf"), as_epi_archive(tibble::tribble( ~geo_value, ~time_value, ~version, ~x_value, ~y_value, - 1L, 1L, 1L, 10L, NA_integer_, # x updated, y not observed yet - 1L, 1L, 5L, 10L, 20L, # x LOCF'd, y updated - ), clobberable_versions_start = 1L) + "ak", test_date, test_date + 1L, 10L, NA_integer_, # x updated, y not observed yet + "ak", test_date, test_date + 5L, 10L, 20L, # x LOCF'd, y updated + ), clobberable_versions_start = test_date + 1L) ) }) test_that('epix_merge sync="truncate" works', { @@ -170,16 +174,20 @@ local({ epix_merge(x, y, sync = "truncate"), as_epi_archive(tibble::tribble( ~geo_value, ~time_value, ~version, ~x_value, ~y_value, - 1L, 1L, 1L, 10L, NA_integer_, # x updated, y not observed yet + "ak", test_date, test_date + 1L, 10L, NA_integer_, # x updated, y not observed yet # y's update beyond x's last update has been truncated - ), clobberable_versions_start = 1L, versions_end = 3L) + ), clobberable_versions_start = test_date + 1L, versions_end = test_date + 3L) ) }) - x_no_conflict <- as_epi_archive(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, x_value = 10L)) - y_no_conflict <- as_epi_archive(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, y_value = 20L)) + x_no_conflict <- as_epi_archive( + tibble::tibble(geo_value = "ak", time_value = test_date, version = test_date + 1L, x_value = 10L) + ) + y_no_conflict <- as_epi_archive( + tibble::tibble(geo_value = "ak", time_value = test_date, version = test_date + 1L, y_value = 20L) + ) xy_no_conflict_expected <- as_epi_archive(tibble::tribble( ~geo_value, ~time_value, ~version, ~x_value, ~y_value, - 1L, 1L, 1L, 10L, 20L, # x updated, y not observed yet + "ak", test_date, test_date + 1L, 10L, 20L, # x updated, y not observed yet )) test_that('epix_merge sync="forbid" on no-conflict works', { expect_equal( @@ -209,25 +217,3 @@ local({ ) }) }) - - -test_that('epix_merge sync="na" balks if do not know next_after', { - expect_error( - epix_merge( - as_epi_archive(tibble::tibble( - geo_value = 1L, - time_value = 1L, - version = as.POSIXct(as.Date("2020-01-01")), - x_value = 10L - )), - as_epi_archive(tibble::tibble( - geo_value = 1L, - time_value = 1L, - version = as.POSIXct(as.Date("2020-01-02")), - y_value = 20L - )), - sync = "na" - ), - regexp = "no applicable method.*next_after" - ) -}) diff --git a/tests/testthat/test-epix_slide.R b/tests/testthat/test-epix_slide.R index a5b72cbf..cb7b3bdc 100644 --- 
a/tests/testthat/test-epix_slide.R +++ b/tests/testthat/test-epix_slide.R @@ -1,19 +1,21 @@ library(dplyr) +test_date <- as.Date("2020-01-01") + test_that("epix_slide only works on an epi_archive", { expect_error(epix_slide(data.frame(x = 1))) }) x <- tibble::tribble( ~version, ~time_value, ~binary, - 4, c(1:3), 2^(1:3), - 5, c(1:2, 4), 2^(4:6), - 6, c(1:2, 4:5), 2^(7:10), - 7, 2:6, 2^(11:15) + test_date + 4, test_date + c(1:3), 2^(1:3), + test_date + 5, test_date + c(1:2, 4), 2^(4:6), + test_date + 6, test_date + c(1:2, 4:5), 2^(7:10), + test_date + 7, test_date + 2:6, 2^(11:15) ) %>% tidyr::unnest(c(time_value, binary)) -xx <- bind_cols(geo_value = rep("x", 15), x) %>% +xx <- bind_cols(geo_value = rep("ak", 15), x) %>% as_epi_archive() test_that("epix_slide works as intended", { @@ -26,8 +28,8 @@ test_that("epix_slide works as intended", { ) xx2 <- tibble( - geo_value = rep("x", 4), - time_value = c(4, 5, 6, 7), + geo_value = rep("ak", 4), + time_value = test_date + c(4, 5, 6, 7), sum_binary = c( 2^3 + 2^2, 2^6 + 2^3, @@ -77,10 +79,9 @@ test_that("epix_slide works as intended with `as_list_col=TRUE`", { before = 2, as_list_col = TRUE ) - xx_dfrow2 <- tibble( - geo_value = rep("x", 4), - time_value = c(4, 5, 6, 7), + geo_value = rep("ak", 4), + time_value = test_date + c(4, 5, 6, 7), slide_value = c( 2^3 + 2^2, @@ -90,7 +91,6 @@ test_that("epix_slide works as intended with `as_list_col=TRUE`", { ) %>% purrr::map(~ data.frame(bin_sum = .x)) ) %>% group_by(geo_value) - expect_identical(xx_dfrow1, xx_dfrow2) # * xx_dfrow3 <- xx %>% @@ -100,7 +100,6 @@ test_that("epix_slide works as intended with `as_list_col=TRUE`", { before = 2, as_list_col = TRUE ) - expect_identical(xx_dfrow1, xx_dfrow3) # This and * Imply xx_dfrow2 and xx_dfrow3 are identical xx_df1 <- xx %>% @@ -110,10 +109,9 @@ test_that("epix_slide works as intended with `as_list_col=TRUE`", { before = 2, as_list_col = TRUE ) - xx_df2 <- tibble( - geo_value = rep("x", 4), - time_value = c(4, 5, 6, 7), + geo_value = rep("ak", 4), + time_value = test_date + c(4, 5, 6, 7), slide_value = list( c(2^3, 2^2), @@ -123,7 +121,6 @@ test_that("epix_slide works as intended with `as_list_col=TRUE`", { ) %>% purrr::map(~ data.frame(bin = rev(.x))) ) %>% group_by(geo_value) - expect_identical(xx_df1, xx_df2) xx_scalar1 <- xx %>% @@ -133,10 +130,9 @@ test_that("epix_slide works as intended with `as_list_col=TRUE`", { before = 2, as_list_col = TRUE ) - xx_scalar2 <- tibble( - geo_value = rep("x", 4), - time_value = c(4, 5, 6, 7), + geo_value = rep("ak", 4), + time_value = test_date + c(4, 5, 6, 7), slide_value = list( 2^3 + 2^2, @@ -146,7 +142,6 @@ test_that("epix_slide works as intended with `as_list_col=TRUE`", { ) ) %>% group_by(geo_value) - expect_identical(xx_scalar1, xx_scalar2) xx_vec1 <- xx %>% @@ -156,10 +151,9 @@ test_that("epix_slide works as intended with `as_list_col=TRUE`", { before = 2, as_list_col = TRUE ) - xx_vec2 <- tibble( - geo_value = rep("x", 4), - time_value = c(4, 5, 6, 7), + geo_value = rep("ak", 4), + time_value = test_date + c(4, 5, 6, 7), slide_value = list( c(2^3, 2^2), @@ -169,48 +163,26 @@ test_that("epix_slide works as intended with `as_list_col=TRUE`", { ) %>% purrr::map(rev) ) %>% group_by(geo_value) - expect_identical(xx_vec1, xx_vec2) }) test_that("epix_slide `before` validation works", { - expect_error( - xx %>% epix_slide(f = ~ sum(.x$binary)), - "`before` is required" - ) expect_error( xx %>% epix_slide(f = ~ sum(.x$binary), before = NA), - "Assertion on 'before' failed: May not be NA" + "Expected `before` to be 
a scalar value." ) expect_error( xx %>% epix_slide(f = ~ sum(.x$binary), before = -1), - "Assertion on 'before' failed: Element 1 is not >= 0" + "Expected `before` to be a difftime with units in days or a non-negative integer." ) expect_error( xx %>% epix_slide(f = ~ sum(.x$binary), before = 1.5), - regexp = "before", - class = "vctrs_error_incompatible_type" + "Expected `before` to be a difftime with units in days or a non-negative integer." ) - # We might want to allow this at some point (issue #219): - expect_error( - xx %>% epix_slide(f = ~ sum(.x$binary), before = Inf), - regexp = "before", - class = "vctrs_error_incompatible_type" - ) - expect_error(xx %>% epix_slide(f = ~ sum(.x$binary)), "`before` is required") # These `before` values should be accepted: - expect_error( - xx %>% epix_slide(f = ~ sum(.x$binary), before = 0), - NA - ) - expect_error( - xx %>% epix_slide(f = ~ sum(.x$binary), before = 2L), - NA - ) - expect_error( - xx %>% epix_slide(f = ~ sum(.x$binary), before = 365000), - NA - ) + expect_no_error(xx %>% epix_slide(f = ~ sum(.x$binary), before = 0)) + expect_no_error(xx %>% epix_slide(f = ~ sum(.x$binary), before = 2)) + expect_no_error(xx %>% epix_slide(f = ~ sum(.x$binary), before = as.difftime(365000, units = "days"))) }) test_that("quosure passing issue in epix_slide is resolved + other potential issues", { @@ -349,15 +321,15 @@ test_that("quosure passing issue in epix_slide is resolved + other potential iss ea <- tibble::tribble( ~version, ~time_value, ~binary, - 2, 1:1, 2^(1:1), - 3, 1:2, 2^(2:1), - 4, 1:3, 2^(3:1), - 5, 1:4, 2^(4:1), - 6, 1:5, 2^(5:1), - 7, 1:6, 2^(6:1) + test_date + 2, test_date + 1:1, 2^(1:1), + test_date + 3, test_date + 1:2, 2^(2:1), + test_date + 4, test_date + 1:3, 2^(3:1), + test_date + 5, test_date + 1:4, 2^(4:1), + test_date + 6, test_date + 1:5, 2^(5:1), + test_date + 7, test_date + 1:6, 2^(6:1) ) %>% tidyr::unnest(c(time_value, binary)) %>% - mutate(geo_value = "x") %>% + mutate(geo_value = "ak") %>% as_epi_archive() test_that("epix_slide with all_versions option has access to all older versions", { @@ -385,12 +357,12 @@ test_that("epix_slide with all_versions option has access to all older versions" result2 <- tibble::tribble( ~time_value, ~n_versions, ~n_row, ~dt_class1, ~dt_key, - 2, 1L, sum(1:1), "data.table", key(ea$DT), - 3, 2L, sum(1:2), "data.table", key(ea$DT), - 4, 3L, sum(1:3), "data.table", key(ea$DT), - 5, 4L, sum(1:4), "data.table", key(ea$DT), - 6, 5L, sum(1:5), "data.table", key(ea$DT), - 7, 6L, sum(1:6), "data.table", key(ea$DT), + test_date + 2, 1L, sum(1:1), "data.table", key(ea$DT), + test_date + 3, 2L, sum(1:2), "data.table", key(ea$DT), + test_date + 4, 3L, sum(1:3), "data.table", key(ea$DT), + test_date + 5, 4L, sum(1:4), "data.table", key(ea$DT), + test_date + 6, 5L, sum(1:5), "data.table", key(ea$DT), + test_date + 7, 6L, sum(1:6), "data.table", key(ea$DT), ) expect_identical(result1, result2) # * @@ -437,21 +409,24 @@ test_that("epix_slide with all_versions option has access to all older versions" test_that("epix_as_of and epix_slide with long enough window are compatible", { # For all_versions = FALSE: - f1 <- function(x, gk, rtv) { tibble( diff_mean = mean(diff(x$binary)) ) } - ref_time_value1 <- 5 + ref_time_value1 <- test_date expect_identical( ea %>% epix_as_of(ref_time_value1) %>% f1() %>% mutate(time_value = ref_time_value1, .before = 1L), - ea %>% epix_slide(f1, before = 1000L, ref_time_values = ref_time_value1, names_sep = NULL) + ea %>% epix_slide( + f1, + before = 1000, + ref_time_values = 
ref_time_value1,
+      names_sep = NULL
+    )
   )
 
   # For all_versions = TRUE:
-
   f2 <- function(x, gk, rtv) {
     x %>%
       # extract time&version-lag-1 data:
@@ -473,33 +448,45 @@ test_that("epix_as_of and epix_slide with long enough window are compatible", {
     ) %>%
       summarize(mean_abs_delta = mean(abs(binary - lag1)))
   }
-  ref_time_value2 <- 5
+  ref_time_value2 <- test_date + 5
 
   expect_identical(
     ea %>%
       epix_as_of(ref_time_value2, all_versions = TRUE) %>%
       f2() %>%
       mutate(time_value = ref_time_value2, .before = 1L),
-    ea %>% epix_slide(f2, before = 1000L, ref_time_values = ref_time_value2, all_versions = TRUE, names_sep = NULL)
+    ea %>% epix_slide(
+      f2,
+      before = 1000,
+      ref_time_values = ref_time_value2,
+      all_versions = TRUE,
+      names_sep = NULL
+    )
   )
 
   # Test the same sort of thing when grouping by geo in an archive with multiple geos.
   ea_multigeo <- ea
   ea_multigeo$DT <- rbind(
     ea_multigeo$DT,
-    copy(ea_multigeo$DT)[, geo_value := "y"][, binary := -binary][]
+    copy(ea_multigeo$DT)[, geo_value := "al"][, binary := -binary][]
   )
   setkeyv(ea_multigeo$DT, key(ea$DT))
 
   expect_identical(
     ea_multigeo %>%
       group_by(geo_value) %>%
-      epix_slide(f2, before = 1000L, ref_time_values = ref_time_value2, all_versions = TRUE, names_sep = NULL) %>%
-      filter(geo_value == "x"),
+      epix_slide(
+        f2,
+        before = 1000,
+        ref_time_values = ref_time_value2,
+        all_versions = TRUE,
+        names_sep = NULL
+      ) %>%
+      filter(geo_value == "ak"),
     ea %>% # using `ea` here is like filtering `ea_multigeo` to `geo_value=="ak"`
       epix_as_of(ref_time_value2, all_versions = TRUE) %>%
       f2() %>%
-      transmute(geo_value = "x", time_value = ref_time_value2, mean_abs_delta) %>%
+      transmute(geo_value = "ak", time_value = ref_time_value2, mean_abs_delta) %>%
       group_by(geo_value)
   )
 })
@@ -515,7 +502,7 @@ test_that("epix_slide `f` is passed an ungrouped `epi_archive` when `all_version
     epix_slide(
       f = slide_fn,
       before = 1,
-      ref_time_values = 5,
+      ref_time_values = test_date + 5,
       new_col_name = "out",
       all_versions = TRUE
     )
@@ -532,8 +519,8 @@ test_that("epix_slide with all_versions option works as intended", {
   )
 
   xx2 <- tibble(
-    geo_value = rep("x", 4),
-    time_value = c(4, 5, 6, 7),
+    geo_value = rep("ak", 4),
+    time_value = test_date + c(4, 5, 6, 7),
     sum_binary = c(
       2^3 + 2^2,
       2^6 + 2^3,
@@ -582,7 +569,7 @@ test_that("epix_slide works with 0-row computation outputs", {
   epix_slide_empty <- function(ea, ...)
{ ea %>% - epix_slide(before = 5L, ..., function(x, gk, rtv) { + epix_slide(before = 5, ..., function(x, gk, rtv) { tibble::tibble() }) } @@ -601,8 +588,6 @@ test_that("epix_slide works with 0-row computation outputs", { geo_value = ea$DT$geo_value[integer(0)], time_value = ea$DT$version[integer(0)] ) %>% - # new_epi_df(geo_type = ea$geo_type, time_type = ea$time_type, - # as_of = ea$versions_end) %>% group_by(geo_value) ) # with `all_versions=TRUE`, we have something similar but never get an @@ -637,11 +622,11 @@ test_that("epix_slide works with 0-row computation outputs", { # tibble::tibble(value = 42) # }, names_sep = NULL), # tibble::tibble( -# geo_value = "x", +# geo_value = "ak", # time_value = epix_slide_ref_time_values_default(ea), # value = 42 # ) %>% -# new_epi_df(as_of = ea$versions_end) +# as_epi_df(as_of = ea$versions_end) # ) # }) # nolint end @@ -649,20 +634,20 @@ test_that("epix_slide works with 0-row computation outputs", { test_that("epix_slide alerts if the provided f doesn't take enough args", { f_xgt <- function(x, g, t) dplyr::tibble(value = mean(x$binary), count = length(x$binary)) # If `regexp` is NA, asserts that there should be no errors/messages. - expect_error(epix_slide(xx, f = f_xgt, before = 2L), regexp = NA) - expect_warning(epix_slide(xx, f = f_xgt, before = 2L), regexp = NA) + expect_error(epix_slide(xx, f = f_xgt, before = 2), regexp = NA) + expect_warning(epix_slide(xx, f = f_xgt, before = 2), regexp = NA) f_x_dots <- function(x, ...) dplyr::tibble(value = mean(x$binary), count = length(x$binary)) - expect_warning(epix_slide(xx, f_x_dots, before = 2L), + expect_warning(epix_slide(xx, f_x_dots, before = 2), class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots" ) }) test_that("epix_slide computation via formula can use ref_time_value", { xx_ref <- tibble( - geo_value = rep("x", 4), - time_value = c(4, 5, 6, 7), - slide_value = c(4, 5, 6, 7) + geo_value = rep("ak", 4), + time_value = test_date + c(4, 5, 6, 7), + slide_value = test_date + c(4, 5, 6, 7) ) %>% group_by(geo_value) @@ -696,9 +681,9 @@ test_that("epix_slide computation via formula can use ref_time_value", { test_that("epix_slide computation via function can use ref_time_value", { xx_ref <- tibble( - geo_value = rep("x", 4), - time_value = c(4, 5, 6, 7), - slide_value = c(4, 5, 6, 7) + geo_value = rep("ak", 4), + time_value = test_date + c(4, 5, 6, 7), + slide_value = test_date + c(4, 5, 6, 7) ) %>% group_by(geo_value) @@ -715,9 +700,9 @@ test_that("epix_slide computation via function can use ref_time_value", { test_that("epix_slide computation via dots can use ref_time_value and group", { # ref_time_value xx_ref <- tibble( - geo_value = rep("x", 4), - time_value = c(4, 5, 6, 7), - slide_value = c(4, 5, 6, 7) + geo_value = rep("ak", 4), + time_value = test_date + c(4, 5, 6, 7), + slide_value = test_date + c(4, 5, 6, 7) ) %>% group_by(geo_value) @@ -732,9 +717,9 @@ test_that("epix_slide computation via dots can use ref_time_value and group", { # group_key xx_ref <- tibble( - geo_value = rep("x", 4), - time_value = c(4, 5, 6, 7), - slide_value = "x" + geo_value = rep("ak", 4), + time_value = test_date + c(4, 5, 6, 7), + slide_value = "ak" ) %>% group_by(geo_value) @@ -765,14 +750,14 @@ test_that("epix_slide computation via dots outputs the same result using col nam group_by(.data$geo_value) %>% epix_slide( before = 2, - sum_binary = sum(time_value) + sum_binary = sum(binary) ) xx1 <- xx %>% group_by(.data$geo_value) %>% epix_slide( before = 2, - sum_binary = 
sum(.x$time_value) + sum_binary = sum(.x$binary) ) expect_identical(xx1, xx_ref) @@ -781,7 +766,7 @@ test_that("epix_slide computation via dots outputs the same result using col nam group_by(.data$geo_value) %>% epix_slide( before = 2, - sum_binary = sum(.data$time_value) + sum_binary = sum(.data$binary) ) expect_identical(xx2, xx_ref) @@ -791,9 +776,8 @@ test_that("`epix_slide` doesn't decay date output", { expect_true( xx$DT %>% as_tibble() %>% - mutate(across(c(time_value, version), ~ as.Date("2000-01-01") + .x - 1L)) %>% as_epi_archive() %>% - epix_slide(before = 5L, ~ attr(.x, "metadata")$as_of) %>% + epix_slide(before = 5, ~ attr(.x, "metadata")$as_of) %>% `[[`("slide_value") %>% inherits("Date") ) @@ -801,14 +785,8 @@ test_that("`epix_slide` doesn't decay date output", { test_that("`epix_slide` can access objects inside of helper functions", { helper <- function(archive_haystack, time_value_needle) { - archive_haystack %>% epix_slide(has_needle = time_value_needle %in% time_value, before = 365000L) + archive_haystack %>% epix_slide(has_needle = time_value_needle %in% time_value, before = Inf) } - expect_error( - helper(archive_cases_dv_subset, as.Date("2021-01-01")), - NA - ) - expect_error( - helper(xx, 3L), - NA - ) + expect_no_error(helper(archive_cases_dv_subset, as.Date("2021-01-01"))) + expect_no_error(helper(xx, 3L)) }) diff --git a/tests/testthat/test-methods-epi_df.R b/tests/testthat/test-methods-epi_df.R index 5ba66ed2..27e9097c 100644 --- a/tests/testthat/test-methods-epi_df.R +++ b/tests/testthat/test-methods-epi_df.R @@ -142,7 +142,7 @@ test_that("Grouping are dropped by `as_tibble`", { }) test_that("Renaming columns gives appropriate colnames and metadata", { - edf <- tibble::tibble(geo_value = 1, time_value = 1, age = 1, value = 1) %>% + edf <- tibble::tibble(geo_value = "ak", time_value = as.Date("2020-01-01"), age = 1, value = 1) %>% as_epi_df(additional_metadata = list(other_keys = "age")) # renaming using base R renamed_edf1 <- edf %>% @@ -158,7 +158,7 @@ test_that("Renaming columns gives appropriate colnames and metadata", { }) test_that("Renaming columns while grouped gives appropriate colnames and metadata", { - gedf <- tibble::tibble(geo_value = 1, time_value = 1, age = 1, value = 1) %>% + gedf <- tibble::tibble(geo_value = "ak", time_value = as.Date("2020-01-01"), age = 1, value = 1) %>% as_epi_df(additional_metadata = list(other_keys = "age")) %>% group_by(geo_value) # renaming using base R diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R index 3067ba8a..e220af16 100644 --- a/tests/testthat/test-utils.R +++ b/tests/testthat/test-utils.R @@ -40,48 +40,30 @@ test_that("guess_geo_type tests for different types of geo_value's", { test_that("guess_time_type works for different types", { days <- as.Date("2022-01-01") + 0:6 weeks <- as.Date("2022-01-01") + 7 * 0:6 - - yearweeks <- tsibble::yearweek(10) yearmonths <- tsibble::yearmonth(10) - yearquarters <- tsibble::yearquarter(10) - - years <- c(1999, 2000) - ambiguous_yearweeks <- c(199901, 199902) # -> "custom" - - daytimes <- as.POSIXct(c("2022-01-01 05:00:00", "2022-01-01 15:0:00"), tz = "UTC") - daytimes_chr <- as.character(daytimes) + integers <- c(1999, 2000) # YYYY-MM-DD is the accepted format not_ymd1 <- "January 1, 2022" not_ymd2 <- "1 January 2022" not_ymd3 <- "1 Jan 2022" - not_a_date <- "asdf" expect_equal(guess_time_type(days), "day") expect_equal(guess_time_type(weeks), "week") - - expect_equal(guess_time_type(yearweeks), "yearweek") expect_equal(guess_time_type(yearmonths), 
"yearmonth") - expect_equal(guess_time_type(yearquarters), "yearquarter") - - expect_equal(guess_time_type(years), "year") - expect_equal(guess_time_type(ambiguous_yearweeks), "custom") - - expect_equal(guess_time_type(daytimes), "day-time") - expect_equal(guess_time_type(daytimes_chr), "day-time") + expect_equal(guess_time_type(integers), "integer") - expect_equal(guess_time_type(not_ymd1), "custom") - expect_equal(guess_time_type(not_ymd2), "custom") - expect_equal(guess_time_type(not_ymd3), "custom") - expect_equal(guess_time_type(not_a_date), "custom") + expect_warning(guess_time_type(not_ymd1), "Unsupported time type in column") + expect_warning(guess_time_type(not_ymd2), "Unsupported time type in column") + expect_warning(guess_time_type(not_ymd3), "Unsupported time type in column") + expect_warning(guess_time_type(not_a_date), "Unsupported time type in column") }) test_that("guess_time_type works with gaps", { - days_gaps <- as.Date("2022-01-01") + c(0, 1, 3, 4, 8, 8 + 7) - weeks_gaps <- as.Date("2022-01-01") + 7 * c(0, 1, 3, 4, 8, 8 + 7) - expect_equal(guess_time_type(days_gaps), "day") - expect_equal(guess_time_type(weeks_gaps), "week") + gaps <- c(1:6, 8, 9, 11, 8 + 7) + expect_equal(guess_time_type(as.Date("2022-01-01") + gaps), "day") + expect_equal(guess_time_type(as.Date("2022-01-01") + 7 * gaps), "week") }) test_that("enlist works", { @@ -280,3 +262,23 @@ test_that("guess_period works", { weekly_posixlts ) }) + + +test_that("validate_slide_window_arg works", { + for (time_type in c("day", "week", "integer", "yearmonth")) { + expect_no_error(validate_slide_window_arg(Inf, time_type)) + } + expect_no_error(validate_slide_window_arg(as.difftime(1, units = "days"), "day")) + expect_no_error(validate_slide_window_arg(1, "day")) + expect_no_error(validate_slide_window_arg(as.difftime(1, units = "weeks"), "day")) + + expect_no_error(validate_slide_window_arg(as.difftime(1, units = "weeks"), "week")) + expect_error(validate_slide_window_arg(1, "week")) + + expect_no_error(validate_slide_window_arg(1, "integer")) + expect_error(validate_slide_window_arg(as.difftime(1, units = "days"), "integer")) + expect_error(validate_slide_window_arg(as.difftime(1, units = "weeks"), "integer")) + + expect_no_error(validate_slide_window_arg(1, "yearmonth")) + expect_error(validate_slide_window_arg(as.difftime(1, units = "weeks"), "yearmonth")) +}) diff --git a/vignettes/aggregation.Rmd b/vignettes/aggregation.Rmd index dca595ff..ec5f36af 100644 --- a/vignettes/aggregation.Rmd +++ b/vignettes/aggregation.Rmd @@ -214,39 +214,6 @@ xt_filled %>% print(n = 7) ``` -## Aggregate to different time scales - -Continuing on with useful `tsibble` functionality, we can aggregate to different -time scales using `index_by()` from `tsibble`, which modifies the index variable -in the given object by applying a suitable time-coarsening transformation (say, -moving from days to weeks, or weeks to months, and so on). The most common use -case would be to follow up with a call to a `dplyr` verb like `summarize()` in -order to perform some kind of aggregation of our measured variables over the new -index variable. - -Below, we use the functions `yearweek()` and `yearmonth()` that are provided in -the `tsibble` package in order to aggregate to weekly and monthly resolutions. -In the former call, we set `week_start = 7` to coincide with the CDC definition -of an epiweek (epidemiological week). 
-
-```{r}
-# Aggregate to weekly
-xt_filled_week <- xt_filled %>%
-  index_by(epiweek = ~ yearweek(., week_start = 7)) %>%
-  group_by(geo_value) %>%
-  summarize(cases = sum(cases, na.rm = TRUE))
-
-head(xt_filled_week)
-
-# Aggregate to monthly
-xt_filled_month <- xt_filled_week %>%
-  index_by(month = ~ yearmonth(.)) %>%
-  group_by(geo_value) %>%
-  summarize(cases = sum(cases, na.rm = TRUE))
-
-head(xt_filled_month)
-```
-
 ## Geographic aggregation
 
 TODO
diff --git a/vignettes/archive.Rmd b/vignettes/archive.Rmd
index a34429d9..686f558f 100644
--- a/vignettes/archive.Rmd
+++ b/vignettes/archive.Rmd
@@ -93,7 +93,7 @@ print(x)
 An `epi_archive` consists of a primary field `DT`, which is a data table
 (from the `data.table` package) that has the columns `geo_value`, `time_value`,
 `version` (and possibly additional ones), and other metadata fields, such as
-`geo_type` and `time_type`.
+`geo_type`.
 
 ```{r}
 class(x$DT)
@@ -119,11 +119,10 @@
 The following pieces of metadata are included as fields in an `epi_archive`
 object:
 
 * `geo_type`: the type for the geo values.
-* `time_type`: the type for the time values.
 * `additional_metadata`: list of additional metadata for the data archive.
 
 Metadata for an `epi_archive` object `x` can be accessed (and altered) directly,
 as in `x$geo_type`, etc. Just like `as_epi_df()`, the function
 `as_epi_archive()` attempts to guess metadata fields when an `epi_archive`
 object is instantiated, if they are not explicitly specified in the function
 call (as it did in the case above).
diff --git a/vignettes/epiprocess.Rmd b/vignettes/epiprocess.Rmd
index c0cb0011..24a98505 100644
--- a/vignettes/epiprocess.Rmd
+++ b/vignettes/epiprocess.Rmd
@@ -129,8 +129,6 @@ frame into `epi_df` format.
 
 ```{r, message = FALSE}
 x <- as_epi_df(cases,
-  geo_type = "state",
-  time_type = "day",
   as_of = max(cases$issue)
 ) %>%
   select(geo_value, time_value, total_cases = value)
@@ -146,15 +144,13 @@ attributes(x)$metadata
 
 In general, an `epi_df` object has the following fields in its metadata:
 
 * `geo_type`: the type for the geo values.
-* `time_type`: the type for the time values.
 * `as_of`: the time value at which the given data were available.
 
 Metadata for an `epi_df` object `x` can be accessed (and altered) via
-`attributes(x)$metadata`. The first two fields here, `geo_type` and `time_type`,
-are not currently used by any downstream functions in the `epiprocess` package,
-and serve only as useful bits of information to convey about the data set at
-hand. The last field here, `as_of`, is one of the most unique aspects of an
-`epi_df` object.
+`attributes(x)$metadata`. The field `geo_type` is not currently used by any
+downstream functions in the `epiprocess` package, and serves only as a useful
+bit of information to convey about the data set at hand. The last field here,
+`as_of`, is one of the most distinctive aspects of an `epi_df` object.
 
 In brief, we can think of an `epi_df` object as a single snapshot of a data set
 that contains the most up-to-date values of some signals of interest, as of the
@@ -166,11 +162,11 @@ data set. See the
 [archive
 vignette](https://cmu-delphi.github.io/epiprocess/articles/archive.html) for
 more.
 
-If any of the `geo_type`, `time_type`, or `as_of` arguments are missing in a
-call to `as_epi_df()`, then this function will try to infer them from the passed
-object. Usually, `geo_type` and `time_type` can be inferred from the `geo_value`
-and `time_value` columns, respectively, but inferring the `as_of` field is not
-as easy. See the documentation for `as_epi_df()` more details.
+If the `geo_type` or `as_of` arguments are missing in a call to `as_epi_df()`,
+then this function will try to infer them from the passed object. Usually,
+`geo_type` can be inferred from the `geo_value` column, but inferring the
+`as_of` field is not as easy. See the documentation for `as_epi_df()` for
+more details.
 
 ```{r}
 x <- as_epi_df(cases, as_of = as.Date("2024-03-20")) %>%
   select(geo_value, time_value, total_cases = value)
 
 attributes(x)$metadata
 ```
 
@@ -196,7 +192,7 @@ ex1 <- tibble(
 ) %>%
   as_tsibble(index = time_value, key = c(geo_value, county_code))
 
-ex1 <- as_epi_df(x = ex1, geo_type = "state", time_type = "day", as_of = "2020-06-03")
+ex1 <- as_epi_df(x = ex1, as_of = "2020-06-03")
 ```
 
 The metadata now includes `county_code` as an extra key.
@@ -237,7 +233,7 @@ head(ex2)
 ex2 <- ex2 %>%
   rename(geo_value = state, time_value = reported_date) %>%
   as_epi_df(
-    geo_type = "state", as_of = "2020-06-03",
+    as_of = "2020-06-03",
     additional_metadata = list(other_keys = "pol")
   )
 
@@ -304,7 +300,7 @@ cases in Canada in 2003, from the
 x <- outbreaks::sars_canada_2003 %>%
   mutate(geo_value = "ca") %>%
   select(geo_value, time_value = date, starts_with("cases")) %>%
-  as_epi_df(geo_type = "nation", as_of = as.Date("2024-03-20"))
+  as_epi_df(as_of = as.Date("2024-03-20"))
 
 head(x)
 
@@ -352,7 +348,7 @@ x <- outbreaks::ebola_sierraleone_2014 %>%
     time_value = full_seq(time_value, period = 1),
     fill = list(cases = 0)
   ) %>%
-  as_epi_df(geo_type = "province", as_of = as.Date("2024-03-20"))
+  as_epi_df(as_of = as.Date("2024-03-20"))
 
 ggplot(x, aes(x = time_value, y = cases)) +
   geom_col(aes(fill = geo_value), show.legend = FALSE) +

From fe98b7bcd84aa01436b0f32a1586c4a99fbf6689 Mon Sep 17 00:00:00 2001
From: Dmitry Shemetov
Date: Fri, 19 Jul 2024 17:45:59 -0700
Subject: [PATCH 344/345] repo: bump version

---
 DESCRIPTION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index b4806bd5..569b2fd0 100755
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: epiprocess
 Title: Tools for basic signal processing in epidemiology
-Version: 0.7.14
+Version: 0.7.15
 Authors@R: c(
     person("Jacob", "Bien", role = "ctb"),
     person("Logan", "Brooks", email = "lcbrooks@andrew.cmu.edu", role = c("aut", "cre")),

From d2354f05fa306a5165d04c7c303873a4b076eca4 Mon Sep 17 00:00:00 2001
From: Dmitry Shemetov
Date: Tue, 23 Jul 2024 13:39:31 -0700
Subject: [PATCH 345/345] doc: bump to 0.8

---
 DESCRIPTION | 2 +-
 NEWS.md     | 8 +++++---
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 569b2fd0..f03a92ee 100755
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: epiprocess
 Title: Tools for basic signal processing in epidemiology
-Version: 0.7.15
+Version: 0.8.0
 Authors@R: c(
     person("Jacob", "Bien", role = "ctb"),
     person("Logan", "Brooks", email = "lcbrooks@andrew.cmu.edu", role = c("aut", "cre")),

diff --git a/NEWS.md b/NEWS.md
index e186b8fe..e1c6a3ee 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -2,6 +2,8 @@
 
 Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicate PR's.
 
+# epiprocess 0.9
+
 # epiprocess 0.8
 
 ## Breaking changes
@@ -42,9 +44,9 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat
 
 - Fixed documentation referring to old `epi_slide()` interface (#466, thanks
   @XuedaShen!).
 - `as_epi_df` and `as_epi_archive` now support arguments to specify column names
-  e.g. `as_epi_df(some_tibble, geo_value=state)`. In addition, there is a list
-  of default conversions, see `time_column_names` for a list of columns that
-  will automatically be recognized and converted to `time_value` column (there
+  e.g. `as_epi_df(some_tibble, geo_value=state)`. In addition, there is a list
+  of default conversions; see `time_column_names` for a list of columns that
+  will automatically be recognized and converted to the `time_value` column (there
   are similar functions for `geo` and `version`).
 - Fixed bug where `epix_slide_ref_time_values_default()` on datetimes would
   output a huge number of `ref_time_values` spaced apart by mere seconds.
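
Editor's note: as a closing illustration of the column-name mapping described in the NEWS entry above, here is a minimal sketch. The input tibble and its column names (`state`, `reported_date`) are hypothetical, and the explicit `time_value` mapping assumes it works the same way as the `geo_value = state` example quoted in the NEWS entry.

```r
library(dplyr)
library(epiprocess)

# Hypothetical raw data whose columns don't yet use epiprocess's names.
some_tibble <- tibble(
  state = c("ca", "ca", "fl"),
  reported_date = as.Date("2020-06-01") + c(0, 1, 0),
  value = c(1, 2, 3)
)

# Map `state` to `geo_value` (and, by assumption, `reported_date` to
# `time_value`) at construction time, per the 0.8.0 NEWS entry above.
edf <- some_tibble %>%
  as_epi_df(
    geo_value = state,
    time_value = reported_date,
    as_of = as.Date("2020-06-03")
  )
```

If the raw time column happened to be one of those listed by `time_column_names()`, the explicit `time_value` mapping could presumably be dropped and the column would be recognized automatically.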