diff --git a/NEWS.md b/NEWS.md index 7ca3e2b3..2d9e6015 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,17 +1,36 @@ -# epiprocess 0.7.1.9000 +# epiprocess 0.7.2.9999 ## Improvements -* Updated vignettes for compatibility with epidatr 1.0.0 in PR #377. * `epi_slide` computations are now 2-4 times faster after changing how reference time values, made accessible within sliding functions, are calculated (#397). -# epiprocess 0.7.0 +# epiprocess 0.7.1.9999 Note that `epiprocess` uses the [Semantic Versioning ("semver")](https://semver.org/) scheme for all release versions, but any inter-release development versions will include an additional ".9999" suffix. +Pre-1.0.0 numbering scheme: when making changes to a development version +0.x.y.9999, we will increment y when merging PRs, and will increment x (and +reset y) on release. + +## Breaking changes + +* Switched `epi_df`'s `other_keys` default from `NULL` to `character(0)`; PR #390 + +## Improvements + +* `select` on grouped `epi_df`s now only drops `epi_df`ness if it makes sense; PR #390 +* Minor documentation updates; PR #393 + +# epiprocess 0.7.0.9999 + +## Improvements + +* Updated vignettes for compatibility with epidatr 1.0.0 in PR #377. + +# epiprocess 0.7.0 ## Breaking changes diff --git a/R/archive.R b/R/archive.R index 1908b77c..faaf048b 100644 --- a/R/archive.R +++ b/R/archive.R @@ -216,11 +216,24 @@ epi_archive <- classname = "epi_archive", ##### public = list( + #' @field DT (`data.table`)\cr + #' the (optionally compactified) datatable DT = NULL, + #' @field geo_type (string)\cr + #' the resolution of the geographic label (e.g. state) geo_type = NULL, + #' @field time_type (string)\cr + #' the resolution of the time column (e.g. 
day) time_type = NULL, + #' @field additional_metadata (named list)\cr + #' any extra fields, such as `other_keys` additional_metadata = NULL, + #' @field clobberable_versions_start (length-1 of same type&class as `version` column, or `NA`)\cr + #' the earliest version number that might be rewritten in the future without assigning a new version + #' date/number, or `NA` if this won't happen clobberable_versions_start = NULL, + #' @field versions_end (length-1 of same type&class as `version` column)\cr + #' the latest version observed versions_end = NULL, #' @description Creates a new `epi_archive` object. #' @param x A data frame, data table, or tibble, with columns `geo_value`, @@ -426,6 +439,10 @@ epi_archive <- self$clobberable_versions_start <- clobberable_versions_start self$versions_end <- versions_end }, + #' Print information about an archive + #' @param class Boolean; whether to print the class label header + #' @param methods Boolean; whether to print all available methods of + #' the archive print = function(class = TRUE, methods = TRUE) { if (class) cat("An `epi_archive` object, with metadata:\n") cat(sprintf("* %-9s = %s\n", "geo_type", self$geo_type)) @@ -487,7 +504,23 @@ epi_archive <- }, ##### #' @description Generates a snapshot in `epi_df` format as of a given version. - #' See the documentation for the wrapper function [`epix_as_of()`] for details. + #' See the documentation for the wrapper function [`epix_as_of()`] for + #' details. The parameter descriptions below are copied from there + #' @param x An `epi_archive` object + #' @param max_version Version specifying the max version to permit in the + #' snapshot. That is, the snapshot will comprise the unique rows of the + #' current archive data that represent the most up-to-date signal values, as + #' of the specified `max_version` (and whose `time_value`s are at least + #' `min_time_value`). + #' @param min_time_value Time value specifying the min `time_value` to permit in + #' the snapshot. 
Default is `-Inf`, which effectively means that there is no + #' minimum considered. + #' @param all_versions Boolean; If `all_versions = TRUE`, then the output will be in + #' `epi_archive` format, and contain rows in the specified `time_value` range + #' having `version <= max_version`. The resulting object will cover a + #' potentially narrower `version` and `time_value` range than `x`, depending + #' on user-provided arguments. Otherwise, there will be one row in the output + #' for the `max_version` of each `time_value`. Default is `FALSE`. #' @importFrom data.table between key as_of = function(max_version, min_time_value = -Inf, all_versions = FALSE) { # Self max version and other keys @@ -679,15 +712,94 @@ epi_archive <- return(invisible(self)) }, - ##### + #' group an epi_archive + #' @description + #' group an epi_archive + #' @param ... variables or computations to group by. Computations are always + #' done on the ungrouped data frame. To perform computations on the grouped + #' data, you need to use a separate [`mutate()`] step before the + #' [`group_by()`] + #' @param .add When `FALSE`, the default, [`group_by()`] will override existing + #' groups. To add to the existing groups, use `.add = TRUE`. + #' @param .drop Drop groups formed by factor levels that don't appear in the + #' data. The default is `TRUE` except when `.data` has been previously grouped + #' with `.drop = FALSE`. See [`group_by_drop_default()`] for details. group_by = function(..., .add = FALSE, .drop = dplyr::group_by_drop_default(self)) { group_by.epi_archive(self, ..., .add = .add, .drop = .drop) }, #' @description Slides a given function over variables in an `epi_archive` #' object. See the documentation for the wrapper function [`epix_slide()`] for - #' details. + #' details. The parameter descriptions below are copied from there #' @importFrom data.table key #' @importFrom rlang !! !!! 
enquo quo_is_missing enquos is_quosure sym syms + #' @param f Function, formula, or missing; together with `...` specifies the + #' computation to slide. To "slide" means to apply a computation over a + #' sliding (a.k.a. "rolling") time window for each data group. The window is + #' determined by the `before` parameter described below. One time step is + #' typically one day or one week; see [`epi_slide`] details for more + #' explanation. If a function, `f` must take an `epi_df` with the same + #' column names as the archive's `DT`, minus the `version` column; followed + #' by a one-row tibble containing the values of the grouping variables for + #' the associated group; followed by a reference time value, usually as a + #' `Date` object; followed by any number of named arguments. If a formula, + #' `f` can operate directly on columns accessed via `.x$var` or `.$var`, as + #' in `~ mean (.x$var)` to compute a mean of a column `var` for each + #' group-`ref_time_value` combination. The group key can be accessed via + #' `.y` or `.group_key`, and the reference time value can be accessed via + #' `.z` or `.ref_time_value`. If `f` is missing, then `...` will specify the + #' computation. + #' @param ... Additional arguments to pass to the function or formula specified + #' via `f`. Alternatively, if `f` is missing, then `...` is interpreted as an + #' expression for tidy evaluation; in addition to referring to columns + #' directly by name, the expression has access to `.data` and `.env` pronouns + #' as in `dplyr` verbs, and can also refer to the `.group_key` and + #' `.ref_time_value`. See details of [`epi_slide`]. + #' @param before How far `before` each `ref_time_value` should the sliding + #' window extend? If provided, should be a single, non-NA, + #' [integer-compatible][vctrs::vec_cast] number of time steps. This window + #' endpoint is inclusive. 
For example, if `before = 7`, and one time step is + #' one day, then to produce a value for a `ref_time_value` of January 8, we + #' apply the given function or formula to data (for each group present) with + #' `time_value`s from January 1 onward, as they were reported on January 8. + #' For typical disease surveillance sources, this will not include any data + #' with a `time_value` of January 8, and, depending on the amount of reporting + #' latency, may not include January 7 or even earlier `time_value`s. (If + #' instead the archive were to hold nowcasts instead of regular surveillance + #' data, then we would indeed expect data for `time_value` January 8. If it + #' were to hold forecasts, then we would expect data for `time_value`s after + #' January 8, and the sliding window would extend as far after each + #' `ref_time_value` as needed to include all such `time_value`s.) + #' @param ref_time_values Reference time values / versions for sliding + #' computations; each element of this vector serves both as the anchor point + #' for the `time_value` window for the computation and the `max_version` + #' `as_of` which we fetch data in this window. If missing, then this will be + #' set to a regularly-spaced sequence of values set to cover the range of + #' `version`s in the `DT` plus the `versions_end`; the spacing of values will + #' be guessed (using the GCD of the skips between values). + #' @param time_step Optional function used to define the meaning of one time + #' step, which if specified, overrides the default choice based on the + #' `time_value` column. This function must take a positive integer and return + #' an object of class `lubridate::period`. For example, we can use `time_step + #' = lubridate::hours` in order to set the time step to be one hour (this + #' would only be meaningful if `time_value` is of class `POSIXct`). + #' @param new_col_name String indicating the name of the new column that will + #' contain the derivative values. 
Default is "slide_value"; note that setting + #' `new_col_name` equal to an existing column name will overwrite this column. + #' @param as_list_col Should the slide results be held in a list column, or be + #' [unchopped][tidyr::unchop]/[unnested][tidyr::unnest]? Default is `FALSE`, + #' in which case a list object returned by `f` would be unnested (using + #' [`tidyr::unnest()`]), and, if the slide computations output data frames, + #' the names of the resulting columns are given by prepending `new_col_name` + #' to the names of the list elements. + #' @param names_sep String specifying the separator to use in `tidyr::unnest()` + #' when `as_list_col = FALSE`. Default is "_". Using `NULL` drops the prefix + #' from `new_col_name` entirely. + #' @param all_versions (Not the same as `all_rows` parameter of `epi_slide`.) If + #' `all_versions = TRUE`, then `f` will be passed the version history (all + #' `version <= ref_time_value`) for rows having `time_value` between + #' `ref_time_value - before` and `ref_time_value`. Otherwise, `f` will be + #' passed only the most recent `version` for every unique `time_value`. + #' Default is `FALSE`. slide = function(f, ..., before, ref_time_values, time_step, new_col_name = "slide_value", as_list_col = FALSE, names_sep = "_", @@ -717,7 +829,7 @@ epi_archive <- #' Converts a data frame, data table, or tibble into an `epi_archive` #' object. See the [archive #' vignette](https://cmu-delphi.github.io/epiprocess/articles/archive.html) for -#' examples. +#' examples. The parameter descriptions below are copied from there #' #' @param x A data frame, data table, or tibble, with columns `geo_value`, #' `time_value`, `version`, and then any additional number of columns. diff --git a/man/as_epi_archive.Rd b/man/as_epi_archive.Rd index a1c60687..93b10736 100644 --- a/man/as_epi_archive.Rd +++ b/man/as_epi_archive.Rd @@ -81,7 +81,7 @@ An \code{epi_archive} object. 
\description{ Converts a data frame, data table, or tibble into an \code{epi_archive} object. See the \href{https://cmu-delphi.github.io/epiprocess/articles/archive.html}{archive vignette} for -examples. +examples. The parameter descriptions below are copied from there } \details{ This simply a wrapper around the \code{new()} method of the \code{epi_archive} diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd index 366eafe0..6a25b2af 100644 --- a/man/epi_archive.Rd +++ b/man/epi_archive.Rd @@ -115,6 +115,30 @@ toy_epi_archive <- tib \%>\% epi_archive$new( ) toy_epi_archive } +\section{Public fields}{ +\if{html}{\out{
}} +\describe{ +\item{\code{DT}}{(\code{data.table})\cr +the (optionally compactified) datatable} + +\item{\code{geo_type}}{(string)\cr +the resolution of the geographic label (e.g. state)} + +\item{\code{time_type}}{(string)\cr +the resolution of the time column (e.g. day)} + +\item{\code{additional_metadata}}{(named list)\cr +any extra fields, such as \code{other_keys}} + +\item{\code{clobberable_versions_start}}{(length-1 of same type&class as \code{version} column, or \code{NA})\cr +the earliest version number that might be rewritten in the future without assigning a new version +date/number, or \code{NA} if this won't happen} + +\item{\code{versions_end}}{(length-1 of same type&class as \code{version} column)\cr +the latest version observed} +} +\if{html}{\out{
}} +} \section{Methods}{ \subsection{Public methods}{ \itemize{ @@ -196,6 +220,7 @@ rows of \code{x}.} \subsection{Details}{ Refer to the documentation for \code{\link[=as_epi_archive]{as_epi_archive()}} for more information and examples of parameter names. +Print information about an archive } \subsection{Returns}{ @@ -210,17 +235,52 @@ An \code{epi_archive} object. \if{html}{\out{
}}\preformatted{epi_archive$print(class = TRUE, methods = TRUE)}\if{html}{\out{
}} } +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{class}}{Boolean; whether to print the class label header} + +\item{\code{methods}}{Boolean; whether to print all available methods of +the archive} +} +\if{html}{\out{
}} +} } \if{html}{\out{
}} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-epi_archive-as_of}{}}} \subsection{Method \code{as_of()}}{ Generates a snapshot in \code{epi_df} format as of a given version. -See the documentation for the wrapper function \code{\link[=epix_as_of]{epix_as_of()}} for details. +See the documentation for the wrapper function \code{\link[=epix_as_of]{epix_as_of()}} for +details. The parameter descriptions below are copied from there \subsection{Usage}{ \if{html}{\out{
}}\preformatted{epi_archive$as_of(max_version, min_time_value = -Inf, all_versions = FALSE)}\if{html}{\out{
}} } +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{max_version}}{Version specifying the max version to permit in the +snapshot. That is, the snapshot will comprise the unique rows of the +current archive data that represent the most up-to-date signal values, as +of the specified \code{max_version} (and whose \code{time_value}s are at least +\code{min_time_value}).} + +\item{\code{min_time_value}}{Time value specifying the min \code{time_value} to permit in +the snapshot. Default is \code{-Inf}, which effectively means that there is no +minimum considered.} + +\item{\code{all_versions}}{Boolean; If \code{all_versions = TRUE}, then the output will be in +\code{epi_archive} format, and contain rows in the specified \code{time_value} range +having \code{version <= max_version}. The resulting object will cover a +potentially narrower \code{version} and \code{time_value} range than \code{x}, depending +on user-provided arguments. Otherwise, there will be one row in the output +for the \code{max_version} of each \code{time_value}. Default is \code{FALSE}.} + +\item{\code{x}}{An \code{epi_archive} object} +} +\if{html}{\out{
}} +} } \if{html}{\out{
}} \if{html}{\out{}} @@ -291,7 +351,8 @@ does not alias either archive's \code{DT}. \item{\code{sync}}{as in \code{\link{epix_merge}}} -\item{\code{compactify}}{as in \code{\link{epix_merge}}} +\item{\code{compactify}}{as in \code{\link{epix_merge}} +group an epi_archive} } \if{html}{\out{}} } @@ -300,6 +361,7 @@ does not alias either archive's \code{DT}. \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-epi_archive-group_by}{}}} \subsection{Method \code{group_by()}}{ +group an epi_archive \subsection{Usage}{ \if{html}{\out{
}}\preformatted{epi_archive$group_by( ..., @@ -308,6 +370,23 @@ does not alias either archive's \code{DT}. )}\if{html}{\out{
}} } +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{...}}{variables or computations to group by. Computations are always +done on the ungrouped data frame. To perform computations on the grouped +data, you need to use a separate \code{\link[=mutate]{mutate()}} step before the +\code{\link[=group_by]{group_by()}}} + +\item{\code{.add}}{When \code{FALSE}, the default, \code{\link[=group_by]{group_by()}} will override existing +groups. To add to the existing groups, use \code{.add = TRUE}.} + +\item{\code{.drop}}{Drop groups formed by factor levels that don't appear in the +data. The default is \code{TRUE} except when \code{.data} has been previously grouped +with \code{.drop = FALSE}. See \code{\link[=group_by_drop_default]{group_by_drop_default()}} for details.} +} +\if{html}{\out{
}} +} } \if{html}{\out{
}} \if{html}{\out{}} @@ -315,7 +394,7 @@ does not alias either archive's \code{DT}. \subsection{Method \code{slide()}}{ Slides a given function over variables in an \code{epi_archive} object. See the documentation for the wrapper function \code{\link[=epix_slide]{epix_slide()}} for -details. +details. The parameter descriptions below are copied from there \subsection{Usage}{ \if{html}{\out{
}}\preformatted{epi_archive$slide( f, @@ -330,6 +409,87 @@ details. )}\if{html}{\out{
}} } +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{f}}{Function, formula, or missing; together with \code{...} specifies the +computation to slide. To "slide" means to apply a computation over a +sliding (a.k.a. "rolling") time window for each data group. The window is +determined by the \code{before} parameter described below. One time step is +typically one day or one week; see \code{\link{epi_slide}} details for more +explanation. If a function, \code{f} must take an \code{epi_df} with the same +column names as the archive's \code{DT}, minus the \code{version} column; followed +by a one-row tibble containing the values of the grouping variables for +the associated group; followed by a reference time value, usually as a +\code{Date} object; followed by any number of named arguments. If a formula, +\code{f} can operate directly on columns accessed via \code{.x$var} or \code{.$var}, as +in \code{~ mean (.x$var)} to compute a mean of a column \code{var} for each +group-\code{ref_time_value} combination. The group key can be accessed via +\code{.y} or \code{.group_key}, and the reference time value can be accessed via +\code{.z} or \code{.ref_time_value}. If \code{f} is missing, then \code{...} will specify the +computation.} + +\item{\code{...}}{Additional arguments to pass to the function or formula specified +via \code{f}. Alternatively, if \code{f} is missing, then \code{...} is interpreted as an +expression for tidy evaluation; in addition to referring to columns +directly by name, the expression has access to \code{.data} and \code{.env} pronouns +as in \code{dplyr} verbs, and can also refer to the \code{.group_key} and +\code{.ref_time_value}. See details of \code{\link{epi_slide}}.} + +\item{\code{before}}{How far \code{before} each \code{ref_time_value} should the sliding +window extend? If provided, should be a single, non-NA, +\link[vctrs:vec_cast]{integer-compatible} number of time steps. This window +endpoint is inclusive. 
For example, if \code{before = 7}, and one time step is +one day, then to produce a value for a \code{ref_time_value} of January 8, we +apply the given function or formula to data (for each group present) with +\code{time_value}s from January 1 onward, as they were reported on January 8. +For typical disease surveillance sources, this will not include any data +with a \code{time_value} of January 8, and, depending on the amount of reporting +latency, may not include January 7 or even earlier \code{time_value}s. (If +instead the archive were to hold nowcasts instead of regular surveillance +data, then we would indeed expect data for \code{time_value} January 8. If it +were to hold forecasts, then we would expect data for \code{time_value}s after +January 8, and the sliding window would extend as far after each +\code{ref_time_value} as needed to include all such \code{time_value}s.)} + +\item{\code{ref_time_values}}{Reference time values / versions for sliding +computations; each element of this vector serves both as the anchor point +for the \code{time_value} window for the computation and the \code{max_version} +\code{as_of} which we fetch data in this window. If missing, then this will be +set to a regularly-spaced sequence of values set to cover the range of +\code{version}s in the \code{DT} plus the \code{versions_end}; the spacing of values will +be guessed (using the GCD of the skips between values).} + +\item{\code{time_step}}{Optional function used to define the meaning of one time +step, which if specified, overrides the default choice based on the +\code{time_value} column. This function must take a positive integer and return +an object of class \code{lubridate::period}. 
For example, we can use \code{time_step = lubridate::hours} in order to set the time step to be one hour (this +would only be meaningful if \code{time_value} is of class \code{POSIXct}).} + +\item{\code{new_col_name}}{String indicating the name of the new column that will +contain the derivative values. Default is "slide_value"; note that setting +\code{new_col_name} equal to an existing column name will overwrite this column.} + +\item{\code{as_list_col}}{Should the slide results be held in a list column, or be +\link[tidyr:chop]{unchopped}/\link[tidyr:unnest]{unnested}? Default is \code{FALSE}, +in which case a list object returned by \code{f} would be unnested (using +\code{\link[tidyr:unnest]{tidyr::unnest()}}), and, if the slide computations output data frames, +the names of the resulting columns are given by prepending \code{new_col_name} +to the names of the list elements.} + +\item{\code{names_sep}}{String specifying the separator to use in \code{tidyr::unnest()} +when \code{as_list_col = FALSE}. Default is "_". Using \code{NULL} drops the prefix +from \code{new_col_name} entirely.} + +\item{\code{all_versions}}{(Not the same as \code{all_rows} parameter of \code{epi_slide}.) If +\code{all_versions = TRUE}, then \code{f} will be passed the version history (all +\code{version <= ref_time_value}) for rows having \code{time_value} between +\code{ref_time_value - before} and \code{ref_time_value}. Otherwise, \code{f} will be +passed only the most recent \code{version} for every unique \code{time_value}. +Default is \code{FALSE}.} +} +\if{html}{\out{
}} +} } \if{html}{\out{
}} \if{html}{\out{}}