From 49b1eb81ca47c7e90128b3ad37ce324e1c635a79 Mon Sep 17 00:00:00 2001 From: evalparse Date: Sat, 30 Nov 2019 17:32:18 +1100 Subject: [PATCH 1/3] adding support for tidyfast --- DESCRIPTION | 3 +- NAMESPACE | 16 +- R/chunk_mapper.r | 89 ++++++++++ R/dplyr_verbs.r | 163 +++++------------- R/sample_frac.R | 2 +- R/tidyfast-verbs.r | 49 ++++++ ...dplyr_mapper.Rd => create_chunk_mapper.Rd} | 18 +- man/tidyfast_verbs.Rd | 51 ++++++ 8 files changed, 256 insertions(+), 135 deletions(-) create mode 100644 R/chunk_mapper.r create mode 100644 R/tidyfast-verbs.r rename man/{create_dplyr_mapper.Rd => create_chunk_mapper.Rd} (66%) create mode 100644 man/tidyfast_verbs.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 8aa60a23..d011853f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -51,7 +51,8 @@ Suggests: speedglm, broom, learnr, - ggplot2 + ggplot2, + tidyfast (>= 0.1.8) LinkingTo: Rcpp RoxygenNote: 7.0.1 diff --git a/NAMESPACE b/NAMESPACE index 9f88d417..c11973c6 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -12,6 +12,7 @@ S3method(compute,disk.frame) S3method(delayed,disk.frame) S3method(distinct,disk.frame) S3method(do,disk.frame) +S3method(dt_separate,disk.frame) S3method(filter,disk.frame) S3method(full_join,disk.frame) S3method(get_chunk,disk.frame) @@ -62,6 +63,12 @@ export(as.disk.frame) export(ceremony_text) export(chunk_arrange) export(chunk_distinct) +export(chunk_dt_count.disk.frame) +export(chunk_dt_fill) +export(chunk_dt_hoist) +export(chunk_dt_nest) +export(chunk_dt_uncount.disk.frame) +export(chunk_dt_unnest) export(chunk_group_by) export(chunk_lapply) export(chunk_summarise) @@ -75,7 +82,7 @@ export(collect_list) export(colnames) export(copy_df_to) export(count.disk.frame) -export(create_dplyr_mapper) +export(create_chunk_mapper) export(csv_to_disk.frame) export(delayed) export(delete) @@ -232,6 +239,13 @@ importFrom(rlang,eval_tidy) importFrom(rlang,quo) importFrom(stats,runif) importFrom(stringr,fixed) +importFrom(tidyfast,dt_count) +importFrom(tidyfast,dt_fill) 
+importFrom(tidyfast,dt_hoist) +importFrom(tidyfast,dt_nest) +importFrom(tidyfast,dt_separate) +importFrom(tidyfast,dt_uncount) +importFrom(tidyfast,dt_unnest) importFrom(utils,capture.output) importFrom(utils,head) importFrom(utils,memory.limit) diff --git a/R/chunk_mapper.r b/R/chunk_mapper.r new file mode 100644 index 00000000..b51fd8e7 --- /dev/null +++ b/R/chunk_mapper.r @@ -0,0 +1,89 @@ +#' Create function that applies to each chunk if disk.frame +#' +#' A function to make it easier to create functions like \code{filter} +#' +#' @examples +#' +#' filter = create_chunk_mapper(dplyr::filter) +#' +#' #' example: creating a function that keeps only the first and last n row +#' first_and_last <- function(chunk, n, ...) { +#' nr = nrow(chunk) +#' print(nr-n+1:nr) +#' chunk[c(1:n, (nr-n+1):nr), ] +#' } +#' +#' #' create the function for use with disk.frame +#' first_and_last_df = create_chunk_mapper(first_and_last) +#' +#' mtcars.df = as.disk.frame(mtcars) +#' +#' #' the operation is lazy +#' lazy_mtcars.df = mtcars.df %>% +#' first_and_last_df(2) +#' +#' #' bring into R +#' collect(lazy_mtcars.df) +#' +#' #' clean up +#' delete(mtcars.df) +#' +#' @param fn The dplyr function to create a mapper for +#' @param warning_msg The warning message to display when invoking the mapper +#' @param as.data.frame force the input chunk of a data.frame; needed for dtplyr +#' @importFrom rlang enquos quo +#' @export +create_chunk_mapper <- function(chunk_fn, warning_msg = NULL, as.data.frame = TRUE) { + return_func <- function(.data, ...) { + if (!is.null(warning_msg)) { + warning(warning_msg) + } + + + quo_dotdotdot = rlang::enquos(...) 
+ + # this is designed to capture any global stuff + vars_and_pkgs = future::getGlobalsAndPackages(quo_dotdotdot) + data_for_eval_tidy = force(vars_and_pkgs$globals) + + res = map(.data, ~{ + + this_env = environment() + + if(length(data_for_eval_tidy) > 0) { + for(i in 1:length(data_for_eval_tidy)) { + assign(names(data_for_eval_tidy)[i], data_for_eval_tidy[[i]], pos = this_env) + } + } + + lapply(quo_dotdotdot, function(x) { + attr(x, ".Environment") = this_env + }) + + if(as.data.frame) { + if("grouped_df" %in% class(.x)) { + code = rlang::quo(chunk_fn(.x, !!!quo_dotdotdot)) + } else { + code = rlang::quo(chunk_fn(as.data.frame(.x), !!!quo_dotdotdot)) + } + } else { + code = rlang::quo(chunk_fn(.x, !!!quo_dotdotdot)) + } + + # ZJ: we need both approaches. TRUST ME + # TODO better NSE at some point need dist + tryCatch({ + return(rlang::eval_tidy(code)) + }, error = function(e) { + as_label_code = rlang::as_label(code) + if(as_label_code == "chunk_fn(...)") { + stop(glue::glue("disk.frame has detected a syntax error in \n\n`{code}`\n\n. If you believe your syntax is correct, raise an issue at https://github.com/xiaodaigh/disk.frame with a MWE")) + } else { + # likely to be dealing with data.tables + return(eval(parse(text=as_label_code), envir = this_env)) + } + }) + }, lazy = TRUE) + } + return_func +} \ No newline at end of file diff --git a/R/dplyr_verbs.r b/R/dplyr_verbs.r index 577bcfe5..ca3d2a52 100644 --- a/R/dplyr_verbs.r +++ b/R/dplyr_verbs.r @@ -1,86 +1,3 @@ -#' Create dplyr function for disk.frame -#' -#' A function to make it easier to create functions like \code{filter} -#' -#' @examples -#' -#' filter = create_dplyr_mapper(dplyr::filter) -#' -#' #' example: creating a function that keeps only the first and last n row -#' first_and_last <- function(chunk, n, ...) 
{ -#' nr = nrow(chunk) -#' print(nr-n+1:nr) -#' chunk[c(1:n, (nr-n+1):nr), ] -#' } -#' -#' #' create the function for use with disk.frame -#' first_and_last_df = create_dplyr_mapper(first_and_last) -#' -#' mtcars.df = as.disk.frame(mtcars) -#' -#' #' the operation is lazy -#' lazy_mtcars.df = mtcars.df %>% -#' first_and_last_df(2) -#' -#' #' bring into R -#' collect(lazy_mtcars.df) -#' -#' #' clean up -#' delete(mtcars.df) -#' -#' @param dplyr_fn The dplyr function to create a mapper for -#' @param warning_msg The warning message to display when invoking the mapper -#' @param as.data.frame force the input chunk of a data.frame; needed for dtplyr -#' @importFrom rlang enquos quo -#' @export -create_dplyr_mapper <- function(dplyr_fn, warning_msg = NULL, as.data.frame = TRUE) { - return_func <- function(.data, ...) { - if (!is.null(warning_msg)) { - warning(warning_msg) - } - - quo_dotdotdot = rlang::enquos(...) - - # this is designed to capture any global stuff - vars_and_pkgs = future::getGlobalsAndPackages(quo_dotdotdot) - data_for_eval_tidy = force(vars_and_pkgs$globals) - - res = map(.data, ~{ - this_env = environment() - - if(length(data_for_eval_tidy) > 0) { - for(i in 1:length(data_for_eval_tidy)) { - assign(names(data_for_eval_tidy)[i], data_for_eval_tidy[[i]], pos = this_env) - } - } - - lapply(quo_dotdotdot, function(x) { - attr(x, ".Environment") = this_env - }) - - if(as.data.frame) { - if("grouped_df" %in% class(.x)) { - code = rlang::quo(dplyr_fn(.x, !!!quo_dotdotdot)) - } else { - code = rlang::quo(dplyr_fn(as.data.frame(.x), !!!quo_dotdotdot)) - } - } else { - code = rlang::quo(dplyr_fn(.x, !!!quo_dotdotdot)) - } - - # ZJ: we need both approaches. 
TRUST ME - # TODO better NSE at some point - #tryCatch({ - rlang::eval_tidy(code) - #}, error = function(e) { - # if the previous failed - # eval(parse(text=rlang::as_label(code)), envir = this_env) - #}) - }, lazy = TRUE) - } - return_func -} - #' The dplyr verbs implemented for disk.frame #' @description #' Please see the dplyr document for their usage. Please note that `group_by` @@ -119,30 +36,30 @@ select.disk.frame <- function(.data, ...) { #' @export #' @rdname dplyr_verbs -rename.disk.frame <- create_dplyr_mapper(dplyr::rename) +rename.disk.frame <- create_chunk_mapper(dplyr::rename) #' @export #' @rdname dplyr_verbs -filter.disk.frame <- create_dplyr_mapper(dplyr::filter) +filter.disk.frame <- create_chunk_mapper(dplyr::filter) #' @export #' @rdname dplyr_verbs #' @importFrom dplyr filter_all -filter_all.disk.frame <- create_dplyr_mapper(dplyr::filter_all) +filter_all.disk.frame <- create_chunk_mapper(dplyr::filter_all) #' @export #' @rdname dplyr_verbs #' @importFrom dplyr filter_if -filter_if.disk.frame <- create_dplyr_mapper(dplyr::filter_if) +filter_if.disk.frame <- create_chunk_mapper(dplyr::filter_if) #' @export #' @rdname dplyr_verbs #' @importFrom dplyr filter_at -filter_at.disk.frame <- create_dplyr_mapper(dplyr::filter_at) +filter_at.disk.frame <- create_chunk_mapper(dplyr::filter_at) #' @export @@ -150,37 +67,37 @@ filter_at.disk.frame <- create_dplyr_mapper(dplyr::filter_at) #' @importFrom future getGlobalsAndPackages #' @importFrom rlang eval_tidy quo enquos #' @importFrom dplyr mutate -mutate.disk.frame <- create_dplyr_mapper(dplyr::mutate) +mutate.disk.frame <- create_chunk_mapper(dplyr::mutate) #' @export #' @importFrom dplyr transmute #' @rdname dplyr_verbs -transmute.disk.frame <- create_dplyr_mapper(dplyr::transmute) +transmute.disk.frame <- create_chunk_mapper(dplyr::transmute) #' @export #' @importFrom dplyr arrange #' @rdname dplyr_verbs -arrange.disk.frame =create_dplyr_mapper(dplyr::arrange, warning_msg="`arrange.disk.frame` is now 
deprecated. Please use `chunk_arrange` instead. This is in preparation for a more powerful `arrange` that sorts the whole disk.frame") +arrange.disk.frame =create_chunk_mapper(dplyr::arrange, warning_msg="`arrange.disk.frame` is now deprecated. Please use `chunk_arrange` instead. This is in preparation for a more powerful `arrange` that sorts the whole disk.frame") #' @export #' @importFrom dplyr arrange #' @rdname dplyr_verbs -chunk_arrange <- create_dplyr_mapper(dplyr::arrange) +chunk_arrange <- create_chunk_mapper(dplyr::arrange) #' @export #' @importFrom dplyr tally #' @rdname dplyr_verbs -tally.disk.frame <- create_dplyr_mapper(dplyr::tally) +tally.disk.frame <- create_chunk_mapper(dplyr::tally) #' @export #' @importFrom dplyr count #' @rdname dplyr_verbs -count.disk.frame <- create_dplyr_mapper(dplyr::count) +count.disk.frame <- create_chunk_mapper(dplyr::count) # TODO family is not required is group-by # TODO alot of these .disk.frame functions are not generic @@ -189,31 +106,31 @@ count.disk.frame <- create_dplyr_mapper(dplyr::count) #' @export #' @importFrom dplyr add_count #' @rdname dplyr_verbs -add_count.disk.frame <- create_dplyr_mapper(dplyr::add_count) +add_count.disk.frame <- create_chunk_mapper(dplyr::add_count) #' @export #' @importFrom dplyr add_tally #' @rdname dplyr_verbs -add_tally.disk.frame <- create_dplyr_mapper(dplyr::add_tally) +add_tally.disk.frame <- create_chunk_mapper(dplyr::add_tally) #' @export #' @importFrom dplyr summarize #' @rdname dplyr_verbs -chunk_summarize <- create_dplyr_mapper(dplyr::summarize) +chunk_summarize <- create_chunk_mapper(dplyr::summarize) #' @export #' @importFrom dplyr summarise #' @rdname dplyr_verbs -chunk_summarise <- create_dplyr_mapper(dplyr::summarise) +chunk_summarise <- create_chunk_mapper(dplyr::summarise) #' @export #' @importFrom dplyr summarize #' @rdname dplyr_verbs -summarize.disk.frame <- create_dplyr_mapper(dplyr::summarize, warning_msg="`summarize.disk.frame` is now deprecated. 
Please use `chunk_summarize` instead. This is in preparation for a more powerful `group_by` framework") +summarize.disk.frame <- create_chunk_mapper(dplyr::summarize, warning_msg="`summarize.disk.frame` is now deprecated. Please use `chunk_summarize` instead. This is in preparation for a more powerful `group_by` framework") #function(...) { #stop("`summarize.disk.frame` has been removed. Please use `chunk_summarize` instead. This is in preparation for a more powerful `group_by` framework") #} @@ -222,7 +139,7 @@ summarize.disk.frame <- create_dplyr_mapper(dplyr::summarize, warning_msg="`summ #' @export #' @importFrom dplyr summarize #' @rdname dplyr_verbs -summarise.disk.frame <- create_dplyr_mapper(dplyr::summarise, warning_msg="`summarise.disk.frame` is now deprecated. Please use `chunk_summarise` instead. This is in preparation for a more powerful `group_by` framework") +summarise.disk.frame <- create_chunk_mapper(dplyr::summarise, warning_msg="`summarise.disk.frame` is now deprecated. Please use `chunk_summarise` instead. This is in preparation for a more powerful `group_by` framework") #function(...) { # stop("`summarise.disk.frame` has been removed. Please use `chunk_summarise` instead. 
This is in preparation for a more powerful `group_by` framework") # } @@ -231,115 +148,115 @@ summarise.disk.frame <- create_dplyr_mapper(dplyr::summarise, warning_msg="`summ #' @export #' @rdname dplyr_verbs #' @importFrom dplyr do -do.disk.frame <- create_dplyr_mapper(dplyr::do) +do.disk.frame <- create_chunk_mapper(dplyr::do) #' @export #' @rdname dplyr_verbs #' @importFrom dplyr group_by_all -group_by_all.disk.frame <- create_dplyr_mapper(dplyr::group_by_all) +group_by_all.disk.frame <- create_chunk_mapper(dplyr::group_by_all) #' @export #' @rdname dplyr_verbs #' @importFrom dplyr group_by_at -group_by_at.disk.frame <- create_dplyr_mapper(dplyr::group_by_at) +group_by_at.disk.frame <- create_chunk_mapper(dplyr::group_by_at) #' @export #' @rdname dplyr_verbs #' @importFrom dplyr group_by_if -group_by_if.disk.frame <- create_dplyr_mapper(dplyr::group_by_if) +group_by_if.disk.frame <- create_chunk_mapper(dplyr::group_by_if) #' @export #' @rdname dplyr_verbs #' @importFrom dplyr mutate_all -mutate_all.disk.frame <- create_dplyr_mapper(dplyr::mutate_all) +mutate_all.disk.frame <- create_chunk_mapper(dplyr::mutate_all) #' @export #' @rdname dplyr_verbs #' @importFrom dplyr mutate_at -mutate_at.disk.frame <- create_dplyr_mapper(dplyr::mutate_at) +mutate_at.disk.frame <- create_chunk_mapper(dplyr::mutate_at) #' @export #' @rdname dplyr_verbs #' @importFrom dplyr mutate_if -mutate_if.disk.frame <- create_dplyr_mapper(dplyr::mutate_if) +mutate_if.disk.frame <- create_chunk_mapper(dplyr::mutate_if) #' @export #' @rdname dplyr_verbs #' @importFrom dplyr rename_all -rename_all.disk.frame <- create_dplyr_mapper(dplyr::rename_all) +rename_all.disk.frame <- create_chunk_mapper(dplyr::rename_all) #' @export #' @rdname dplyr_verbs #' @importFrom dplyr rename_at -rename_at.disk.frame <- create_dplyr_mapper(dplyr::rename_at) +rename_at.disk.frame <- create_chunk_mapper(dplyr::rename_at) #' @export #' @rdname dplyr_verbs #' @importFrom dplyr rename_if -rename_if.disk.frame <- 
create_dplyr_mapper(dplyr::rename_if) +rename_if.disk.frame <- create_chunk_mapper(dplyr::rename_if) #' @export #' @rdname dplyr_verbs #' @importFrom dplyr select_all -select_all.disk.frame <- create_dplyr_mapper(dplyr::select_all) +select_all.disk.frame <- create_chunk_mapper(dplyr::select_all) #' @export #' @rdname dplyr_verbs #' @importFrom dplyr select_at -select_at.disk.frame <- create_dplyr_mapper(dplyr::select_at) +select_at.disk.frame <- create_chunk_mapper(dplyr::select_at) #' @export #' @rdname dplyr_verbs #' @importFrom dplyr select_if -select_if.disk.frame <- create_dplyr_mapper(dplyr::select_if) +select_if.disk.frame <- create_chunk_mapper(dplyr::select_if) #' @export #' @rdname dplyr_verbs #' @importFrom dplyr summarise_all -chunk_summarise_all <- create_dplyr_mapper(dplyr::summarise_all) +chunk_summarise_all <- create_chunk_mapper(dplyr::summarise_all) #' @export #' @rdname dplyr_verbs #' @importFrom dplyr summarise_at -chunk_summarise_at <- create_dplyr_mapper(dplyr::summarise_at) +chunk_summarise_at <- create_chunk_mapper(dplyr::summarise_at) #' @export #' @rdname dplyr_verbs #' @importFrom dplyr summarize -chunk_summarize <- create_dplyr_mapper(dplyr::summarize) +chunk_summarize <- create_chunk_mapper(dplyr::summarize) #' @export #' @rdname dplyr_verbs #' @importFrom dplyr summarize_all -chunk_summarize_all <- create_dplyr_mapper(dplyr::summarize_all) +chunk_summarize_all <- create_chunk_mapper(dplyr::summarize_all) #' @export #' @rdname dplyr_verbs #' @importFrom dplyr summarize_at -chunk_summarize_at <- create_dplyr_mapper(dplyr::summarize_at) +chunk_summarize_at <- create_chunk_mapper(dplyr::summarize_at) #' @export #' @rdname dplyr_verbs #' @importFrom dplyr summarize_if -chunk_summarize_if <- create_dplyr_mapper(dplyr::summarize_if) +chunk_summarize_if <- create_chunk_mapper(dplyr::summarize_if) #' @export @@ -353,7 +270,7 @@ distinct.disk.frame <- function(...) 
{ #' @export #' @rdname dplyr_verbs #' @importFrom dplyr distinct -chunk_distinct <- create_dplyr_mapper(dplyr::distinct, warning_msg = "the `distinct` function applies distinct chunk-wise") +chunk_distinct <- create_chunk_mapper(dplyr::distinct, warning_msg = "the `distinct` function applies distinct chunk-wise") #' The shard keys of the disk.frame #' @return character @@ -402,8 +319,8 @@ groups.disk.frame <- function(x){ # eval(parse(text=rlang::as_label(code)), envir = this_env) # }, lazy = TRUE) # } -#group_by.disk.frame <- create_dplyr_mapper(dplyr::group_by, warning_msg = "The group_by operation is applied WITHIN each chunk, hence the results may not be as expected. To address this issue, you can rechunk(df, shardby = your_group_keys) which can be computationally expensive. Otherwise, you may use a second stage summary to obtain the desired result.") -group_by.disk.frame <- create_dplyr_mapper(dplyr::group_by, warning_msg="`group_by.disk.frame` is now deprecated. Please use `chunk_group_by` instead. This is in preparation for a more powerful `group_by` framework") +#group_by.disk.frame <- create_chunk_mapper(dplyr::group_by, warning_msg = "The group_by operation is applied WITHIN each chunk, hence the results may not be as expected. To address this issue, you can rechunk(df, shardby = your_group_keys) which can be computationally expensive. Otherwise, you may use a second stage summary to obtain the desired result.") +group_by.disk.frame <- create_chunk_mapper(dplyr::group_by, warning_msg="`group_by.disk.frame` is now deprecated. Please use `chunk_group_by` instead. This is in preparation for a more powerful `group_by` framework") #function(...) { #stop("`arrange.disk.frame` has been removed. Please use `chunk_arrange` instead. 
This is preparation for a more powerful `group_by` framework") #} @@ -411,7 +328,7 @@ group_by.disk.frame <- create_dplyr_mapper(dplyr::group_by, warning_msg="`group_ #' @export #' @rdname group_by -chunk_group_by <- create_dplyr_mapper(dplyr::group_by) +chunk_group_by <- create_chunk_mapper(dplyr::group_by) #' @export #' @rdname dplyr_verbs diff --git a/R/sample_frac.R b/R/sample_frac.R index 0191b948..476cf98e 100644 --- a/R/sample_frac.R +++ b/R/sample_frac.R @@ -20,7 +20,7 @@ sample_frac.disk.frame <- function(tbl, size=1, replace=FALSE, weight=NULL, .env stop(warning_msg) } - fn = disk.frame::create_dplyr_mapper(dplyr::sample_frac) + fn = disk.frame::create_chunk_mapper(dplyr::sample_frac) fn(tbl, size = size, replace = replace, ...) } diff --git a/R/tidyfast-verbs.r b/R/tidyfast-verbs.r new file mode 100644 index 00000000..da969f6e --- /dev/null +++ b/R/tidyfast-verbs.r @@ -0,0 +1,49 @@ +#' The tidy verbs implemented for disk.frame +#' @description +#' Please see the tidyfast document for their usage +#' @export +#' @importFrom tidyfast dt_count dt_uncount dt_hoist dt_nest dt_unnest dt_fill dt_separate +#' @param ... 
Same as the tidyfast functions +#' @param .data a disk.frame +#' @rdname tidyfast_verbs +#' @family tidyfast verbs +#' @examples +#' library(tidyfast) +#' library(data.table) +#' +#' #' create a disk.frame +#' disk.frame_to_split <- as.disk.frame(data.table( +#' x = paste(letters, LETTERS, sep = ".") +#' )) +#' +#' disk.frame_to_split %>% +#' dt_separate(x, into = c("lower", "upper")) %>% +#' collect +#' +#' #' clean up +#' delete(disk.frame_to_split) +chunk_dt_count.disk.frame <- create_chunk_mapper(tidyfast::dt_count, as.data.frame = FALSE) + +#' @rdname tidyfast_verbs +#' @export +chunk_dt_uncount.disk.frame <- create_chunk_mapper(tidyfast::dt_uncount, as.data.frame = FALSE) + +#' @rdname tidyfast_verbs +#' @export +chunk_dt_unnest = create_chunk_mapper(tidyfast::dt_unnest, as.data.frame = FALSE) + +#' @rdname tidyfast_verbs +#' @export +chunk_dt_nest = create_chunk_mapper(tidyfast::dt_nest, as.data.frame = FALSE) + +#' @rdname tidyfast_verbs +#' @export +chunk_dt_hoist = create_chunk_mapper(tidyfast::dt_hoist, as.data.frame = FALSE) + +#' @rdname tidyfast_verbs +#' @export +chunk_dt_fill = create_chunk_mapper(tidyfast::dt_fill, as.data.frame = FALSE) + +#' @rdname tidyfast_verbs +#' @export +dt_separate.disk.frame = create_chunk_mapper(tidyfast::dt_separate, as.data.frame = FALSE) diff --git a/man/create_dplyr_mapper.Rd b/man/create_chunk_mapper.Rd similarity index 66% rename from man/create_dplyr_mapper.Rd rename to man/create_chunk_mapper.Rd index 64256972..dfe508f6 100644 --- a/man/create_dplyr_mapper.Rd +++ b/man/create_chunk_mapper.Rd @@ -1,24 +1,24 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/dplyr_verbs.r -\name{create_dplyr_mapper} -\alias{create_dplyr_mapper} -\title{Create dplyr function for disk.frame} +% Please edit documentation in R/chunk_mapper.r +\name{create_chunk_mapper} +\alias{create_chunk_mapper} +\title{Create function that applies to each chunk if disk.frame} \usage{ -create_dplyr_mapper(dplyr_fn, 
warning_msg = NULL, as.data.frame = TRUE) +create_chunk_mapper(chunk_fn, warning_msg = NULL, as.data.frame = TRUE) } \arguments{ -\item{dplyr_fn}{The dplyr function to create a mapper for} - \item{warning_msg}{The warning message to display when invoking the mapper} \item{as.data.frame}{force the input chunk of a data.frame; needed for dtplyr} + +\item{fn}{The dplyr function to create a mapper for} } \description{ A function to make it easier to create functions like \code{filter} } \examples{ -filter = create_dplyr_mapper(dplyr::filter) +filter = create_chunk_mapper(dplyr::filter) #' example: creating a function that keeps only the first and last n row first_and_last <- function(chunk, n, ...) { @@ -28,7 +28,7 @@ first_and_last <- function(chunk, n, ...) { } #' create the function for use with disk.frame -first_and_last_df = create_dplyr_mapper(first_and_last) +first_and_last_df = create_chunk_mapper(first_and_last) mtcars.df = as.disk.frame(mtcars) diff --git a/man/tidyfast_verbs.Rd b/man/tidyfast_verbs.Rd new file mode 100644 index 00000000..15d3b522 --- /dev/null +++ b/man/tidyfast_verbs.Rd @@ -0,0 +1,51 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/tidyfast-verbs.r +\name{chunk_dt_count.disk.frame} +\alias{chunk_dt_count.disk.frame} +\alias{chunk_dt_uncount.disk.frame} +\alias{chunk_dt_unnest} +\alias{chunk_dt_nest} +\alias{chunk_dt_hoist} +\alias{chunk_dt_fill} +\alias{dt_separate.disk.frame} +\title{The tidy verbs implemented for disk.frame} +\usage{ +chunk_dt_count.disk.frame(.data, ...) + +chunk_dt_uncount.disk.frame(.data, ...) + +chunk_dt_unnest(.data, ...) + +chunk_dt_nest(.data, ...) + +chunk_dt_hoist(.data, ...) + +chunk_dt_fill(.data, ...) + +\method{dt_separate}{disk.frame}(.data, ...) 
+} +\arguments{ +\item{.data}{a disk.frame} + +\item{...}{Same as the tidyfast functions} +} +\description{ +Please see the tidyfast document for their usage +} +\examples{ +library(tidyfast) +library(data.table) + +#' create a disk.frame +disk.frame_to_split <- as.disk.frame(data.table( + x = paste(letters, LETTERS, sep = ".") +)) + +disk.frame_to_split \%>\% + dt_separate(x, into = c("lower", "upper")) \%>\% + collect + +#' clean up +delete(disk.frame_to_split) +} +\concept{tidyfast verbs} From 3a32b21b6384b58a91bfe8f98eabf23773ecd817 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Mon, 16 Mar 2020 23:10:34 +1100 Subject: [PATCH 2/3] update readme with youtube stream link --- README.Rmd | 4 ++++ README.md | 66 +++++++++++++++++++++++++++++++++++++++++------------- 2 files changed, 54 insertions(+), 16 deletions(-) diff --git a/README.Rmd b/README.Rmd index 0c1d02ce..bcd3e1a0 100644 --- a/README.Rmd +++ b/README.Rmd @@ -327,3 +327,7 @@ Do you wish to give back the open-source community in non-financial ways? 
Here a [![](http://cranlogs.r-pkg.org/badges/grand-total/disk.frame)](https://cran.r-project.org/package=disk.frame) [![Travis build status](https://travis-ci.org/xiaodaigh/disk.frame.svg?branch=master)](https://travis-ci.org/xiaodaigh/disk.frame) [![AppVeyor build status](https://ci.appveyor.com/api/projects/status/github/xiaodaigh/disk.frame?branch=master&svg=true)](https://ci.appveyor.com/project/xiaodaigh/disk.frame) + +## Live Stream of `{disk.frame}` development + +* https://www.youtube.com/playlist?list=PL3DVdT3kym4fIU5CO-pxKtWhdjMVn4XGe diff --git a/README.md b/README.md index 95e17ce0..a43cfe2a 100644 --- a/README.md +++ b/README.md @@ -217,12 +217,15 @@ flights.df %>% filter(year == 2013) %>% mutate(origin_dest = paste0(origin, dest)) %>% head(2) -#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum -#> 1 2013 1 1 517 515 2 830 819 11 UA 1545 N14228 -#> 2 2013 1 1 533 529 4 850 830 20 UA 1714 N24211 -#> origin dest air_time distance hour minute time_hour origin_dest -#> 1 EWR IAH 227 1400 5 15 2013-01-01 05:00:00 EWRIAH -#> 2 LGA IAH 227 1416 5 29 2013-01-01 05:00:00 LGAIAH +#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time +#> 1 2013 1 1 517 515 2 830 819 +#> 2 2013 1 1 533 529 4 850 830 +#> arr_delay carrier flight tailnum origin dest air_time distance hour minute +#> 1 11 UA 1545 N14228 EWR IAH 227 1400 5 15 +#> 2 20 UA 1714 N24211 LGA IAH 227 1416 5 29 +#> time_hour origin_dest +#> 1 2013-01-01 05:00:00 EWRIAH +#> 2 2013-01-01 05:00:00 LGAIAH ``` ### Group-by @@ -279,7 +282,6 @@ obtained using estimated methods. ``` r library(data.table) -#> data.table 1.12.8 using 6 threads (see ?getDTthreads). 
Latest news: r-datatable.com #> #> Attaching package: 'data.table' #> The following object is masked from 'package:purrr': @@ -296,6 +298,30 @@ grp_by_stage1 = .(sum_dist = sum(distance)), .(qtr = ifelse(month <= 3, "Q1", "Q2")) ] +#> Warning in serialize(data, node$con): 'package:stats' may not be available when +#> loading +#> Warning in serialize(data, node$con): 'package:data.table' may not be available +#> when loading +#> Warning in serialize(data, node$con): 'package:stats' may not be available when +#> loading +#> Warning in serialize(data, node$con): 'package:data.table' may not be available +#> when loading +#> Warning in serialize(data, node$con): 'package:stats' may not be available when +#> loading +#> Warning in serialize(data, node$con): 'package:data.table' may not be available +#> when loading +#> Warning in serialize(data, node$con): 'package:stats' may not be available when +#> loading +#> Warning in serialize(data, node$con): 'package:data.table' may not be available +#> when loading +#> Warning in serialize(data, node$con): 'package:stats' may not be available when +#> loading +#> Warning in serialize(data, node$con): 'package:data.table' may not be available +#> when loading +#> Warning in serialize(data, node$con): 'package:stats' may not be available when +#> loading +#> Warning in serialize(data, node$con): 'package:data.table' may not be available +#> when loading grp_by_stage1 #> qtr sum_dist @@ -326,7 +352,7 @@ To find out where the disk.frame is stored on disk: ``` r # where is the disk.frame stored attr(flights.df, "path") -#> [1] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\Rtmpa6R05d\\file1b086cec36c7.df" +#> [1] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpOeAro4\\file17a0150634fd.df" ``` A number of data.frame functions are implemented for disk.frame @@ -334,19 +360,23 @@ A number of data.frame functions are implemented for disk.frame ``` r # get first few rows head(flights.df, 1) -#> year month day dep_time sched_dep_time 
dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum -#> 1: 2013 1 1 517 515 2 830 819 11 UA 1545 N14228 -#> origin dest air_time distance hour minute time_hour -#> 1: EWR IAH 227 1400 5 15 2013-01-01 05:00:00 +#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time +#> 1: 2013 1 1 517 515 2 830 819 +#> arr_delay carrier flight tailnum origin dest air_time distance hour minute +#> 1: 11 UA 1545 N14228 EWR IAH 227 1400 5 15 +#> time_hour +#> 1: 2013-01-01 05:00:00 ``` ``` r # get last few rows tail(flights.df, 1) -#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum -#> 1: 2013 9 30 NA 840 NA NA 1020 NA MQ 3531 N839MQ -#> origin dest air_time distance hour minute time_hour -#> 1: LGA RDU NA 431 8 40 2013-09-30 08:00:00 +#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time +#> 1: 2013 9 30 NA 840 NA NA 1020 +#> arr_delay carrier flight tailnum origin dest air_time distance hour minute +#> 1: NA MQ 3531 N839MQ LGA RDU NA 431 8 40 +#> time_hour +#> 1: 2013-09-30 08:00:00 ``` ``` r @@ -455,3 +485,7 @@ ways? 
Here are some ways you can contribute status](https://travis-ci.org/xiaodaigh/disk.frame.svg?branch=master)](https://travis-ci.org/xiaodaigh/disk.frame) [![AppVeyor build status](https://ci.appveyor.com/api/projects/status/github/xiaodaigh/disk.frame?branch=master&svg=true)](https://ci.appveyor.com/project/xiaodaigh/disk.frame) + +## Live Stream of `{disk.frame}` development + + - https://www.youtube.com/playlist?list=PL3DVdT3kym4fIU5CO-pxKtWhdjMVn4XGe From ff2c0f0fbe0b67f06b8f1b61e90e1131af72d401 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Sat, 21 Mar 2020 20:03:24 +1100 Subject: [PATCH 3/3] mid development --- R/chunk_mapper.r | 4 ++-- R/tidyfast-verbs.r | 14 ++++++++++++-- man/chunk_group_by.Rd | 3 --- man/create_chunk_mapper.Rd | 4 ++-- man/create_dplyr_mapper.Rd | 11 +++++++++++ man/tidyfast_verbs.Rd | 2 +- 6 files changed, 28 insertions(+), 10 deletions(-) create mode 100644 man/create_dplyr_mapper.Rd diff --git a/R/chunk_mapper.r b/R/chunk_mapper.r index a6d80c61..eea3a288 100644 --- a/R/chunk_mapper.r +++ b/R/chunk_mapper.r @@ -39,7 +39,7 @@ create_chunk_mapper <- function(chunk_fn, warning_msg = NULL, as.data.frame = TR warning(warning_msg) } - + browser() quo_dotdotdot = rlang::enquos(...) # this is designed to capture any global stuff @@ -85,4 +85,4 @@ create_chunk_mapper <- function(chunk_fn, warning_msg = NULL, as.data.frame = TR }, lazy = TRUE) } return_func -} \ No newline at end of file +} diff --git a/R/tidyfast-verbs.r b/R/tidyfast-verbs.r index da969f6e..b4883238 100644 --- a/R/tidyfast-verbs.r +++ b/R/tidyfast-verbs.r @@ -22,11 +22,21 @@ #' #' #' clean up #' delete(disk.frame_to_split) -chunk_dt_count.disk.frame <- create_chunk_mapper(tidyfast::dt_count, as.data.frame = FALSE) +chunk_dt_count <- create_chunk_mapper(tidyfast::dt_count, as.data.frame = FALSE) + +#' dt_count working on whole disk.frame +dt_count.disk.frame <- function(dt_, ..., na.rm = FALSE, wt = NULL) { + stop("ZJ: I was up to here, and I need better understanding of NSE. Why? 
+ ifelse(is.null(wt), NULL, wt) is not going to work if wt is a column name") + + dt_ %>% + chunk_dt_count(..., na.rm = force(na.rm), wt = ifelse(is.null(wt), NULL, wt)) %>% + collect +} #' @rdname tidyfast_verbs #' @export -chunk_dt_uncount.disk.frame <- create_chunk_mapper(tidyfast::dt_uncount, as.data.frame = FALSE) +chunk_dt_uncount <- create_chunk_mapper(tidyfast::dt_uncount, as.data.frame = FALSE) #' @rdname tidyfast_verbs #' @export diff --git a/man/chunk_group_by.Rd b/man/chunk_group_by.Rd index f06836c2..9a54e6f3 100644 --- a/man/chunk_group_by.Rd +++ b/man/chunk_group_by.Rd @@ -2,15 +2,12 @@ % Please edit documentation in R/dplyr_verbs.r \name{chunk_summarize} \alias{chunk_summarize} -\alias{chunk_summarise} \alias{chunk_group_by} \alias{chunk_ungroup} \title{Group by within each disk.frame} \usage{ chunk_summarize(.data, ...) -chunk_summarise(.data, ...) - chunk_group_by(.data, ...) chunk_ungroup(.data, ...) diff --git a/man/create_chunk_mapper.Rd b/man/create_chunk_mapper.Rd index dfe508f6..0702093d 100644 --- a/man/create_chunk_mapper.Rd +++ b/man/create_chunk_mapper.Rd @@ -7,11 +7,11 @@ create_chunk_mapper(chunk_fn, warning_msg = NULL, as.data.frame = TRUE) } \arguments{ +\item{chunk_fn}{The dplyr function to create a mapper for} + \item{warning_msg}{The warning message to display when invoking the mapper} \item{as.data.frame}{force the input chunk of a data.frame; needed for dtplyr} - -\item{fn}{The dplyr function to create a mapper for} } \description{ A function to make it easier to create functions like \code{filter} diff --git a/man/create_dplyr_mapper.Rd b/man/create_dplyr_mapper.Rd new file mode 100644 index 00000000..a486be28 --- /dev/null +++ b/man/create_dplyr_mapper.Rd @@ -0,0 +1,11 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dplyr_verbs.r +\name{create_dplyr_mapper} +\alias{create_dplyr_mapper} +\title{Kept for backwards-compatibility to be removed in 0.3} +\usage{ +create_dplyr_mapper() +} 
+\description{ +Kept for backwards-compatibility to be removed in 0.3 +} diff --git a/man/tidyfast_verbs.Rd b/man/tidyfast_verbs.Rd index 15d3b522..977cbcf3 100644 --- a/man/tidyfast_verbs.Rd +++ b/man/tidyfast_verbs.Rd @@ -22,7 +22,7 @@ chunk_dt_hoist(.data, ...) chunk_dt_fill(.data, ...) -\method{dt_separate}{disk.frame}(.data, ...) +dt_separate.disk.frame(.data, ...) } \arguments{ \item{.data}{a disk.frame}