diff --git a/.Rbuildignore b/.Rbuildignore
index 5c5d5b95..5d380324 100644
--- a/.Rbuildignore
+++ b/.Rbuildignore
@@ -1,3 +1,5 @@
+^renv$
+^renv\.lock$
 ^.*\.Rproj$
 ^\.github$
 ^manuscript$
@@ -65,4 +67,6 @@ vignettes.Rnw.template
 ^codecov\.yml$
 new-nse-dev.r
 test-poorman.R
-*.parquet
\ No newline at end of file
+.parquet$
+maditr-devs.r
+^CRAN-SUBMISSION$
diff --git a/CRAN-RELEASE b/CRAN-RELEASE
deleted file mode 100644
index 48d603de..00000000
--- a/CRAN-RELEASE
+++ /dev/null
@@ -1,2 +0,0 @@
-This package was submitted to CRAN on 2021-03-12.
-Once it is accepted, delete this file and tag the release (commit 34bafaa).
diff --git a/DESCRIPTION b/DESCRIPTION
index 0f848ee8..1ce9bbee 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,8 +1,8 @@
 Type: Package
 Package: disk.frame
 Title: Larger-than-RAM Disk-Based Data Manipulation Framework
-Version: 0.5.0
-Date: 2021-05-09
+Version: 0.6.0
+Date: 2022-01-31
 Authors@R: c(
     person("Dai", "ZJ", email = "zhuojia.dai@gmail.com", role = c("aut", "cre")),
     person("Jacky", "Poon", role = c("ctb"))
@@ -17,27 +17,24 @@ License: MIT + file LICENSE
 Imports:
     Rcpp (>= 0.12.13),
     glue (>= 1.3.1),
-    rlang (>= 0.4.0),
     future.apply (>= 1.3.0),
     fs (>= 1.3.1),
     jsonlite (>= 1.6),
     pryr (>= 0.1.4),
     stringr (>= 1.4.0),
     fst (>= 0.8.0),
-    globals (>= 0.12.4),
     future (>= 1.14.0),
     data.table (>= 1.12.2),
     crayon (>= 1.3.4),
     bigreadr (>= 0.2.0),
-    furrr (>= 0.2.2),
     bit64,
-    benchmarkme
+    benchmarkme,
+    purrr (>= 0.3.2),
+    rlang
 Depends:
     R (>= 3.4),
-    dplyr (>= 1.0.0),
-    purrr (>= 0.3.2)
+    dplyr (>= 1.0.0)
 Suggests:
-    testthat (>= 2.1.0),
     nycflights13,
     magrittr,
     shiny,
@@ -49,10 +46,11 @@ Suggests:
     speedglm,
     broom,
     ggplot2,
-    covr
+    rmarkdown
 LinkingTo: 
     Rcpp
-RoxygenNote: 7.1.1
+RoxygenNote: 7.1.2
+VignetteBuilder: rmarkdown
 Encoding: UTF-8
 URL: https://diskframe.com
 BugReports: https://github.com/xiaodaigh/disk.frame/issues
diff --git a/NAMESPACE b/NAMESPACE
index da78d075..2def4912 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -17,7 +17,6 @@ S3method(colnames,disk.frame)
 S3method(compute,disk.frame)
 S3method(delayed,disk.frame)
 S3method(distinct,disk.frame)
-S3method(do,disk.frame)
 S3method(filter,disk.frame)
 S3method(full_join,disk.frame)
 S3method(get_chunk,disk.frame)
@@ -25,23 +24,10 @@ S3method(glimpse,disk.frame)
 S3method(group_by,disk.frame)
 S3method(group_vars,disk.frame)
 S3method(groups,disk.frame)
-S3method(hard_arrange,data.frame)
-S3method(hard_arrange,disk.frame)
-S3method(hard_group_by,data.frame)
-S3method(hard_group_by,disk.frame)
 S3method(head,disk.frame)
-S3method(imap,default)
-S3method(imap_dfr,default)
-S3method(imap_dfr,disk.frame)
 S3method(inner_join,disk.frame)
 S3method(lazy,disk.frame)
 S3method(left_join,disk.frame)
-S3method(map,default)
-S3method(map,disk.frame)
-S3method(map2,default)
-S3method(map2,disk.frame)
-S3method(map_dfr,default)
-S3method(map_dfr,disk.frame)
 S3method(merge,disk.frame)
 S3method(mutate,disk.frame)
 S3method(names,disk.frame)
@@ -67,22 +53,22 @@ S3method(transmute,disk.frame)
 export(IQR_df.chunk_agg.disk.frame)
 export(IQR_df.collected_agg.disk.frame)
 export(add_chunk)
-export(add_tally.disk.frame)
 export(all_df.chunk_agg.disk.frame)
 export(all_df.collected_agg.disk.frame)
 export(any_df.chunk_agg.disk.frame)
 export(any_df.collected_agg.disk.frame)
 export(as.disk.frame)
+export(bind_rows.disk.frame)
 export(ceremony_text)
 export(chunk_arrange)
 export(chunk_distinct)
 export(chunk_group_by)
-export(chunk_lapply)
 export(chunk_summarise)
 export(chunk_summarize)
 export(chunk_ungroup)
 export(cimap)
 export(cimap_dfr)
+export(clapply)
 export(cmap)
 export(cmap2)
 export(cmap_dfr)
@@ -102,18 +88,12 @@ export(foverlaps.disk.frame)
 export(gen_datatable_synthetic)
 export(get_chunk)
 export(get_chunk_ids)
-export(hard_arrange)
-export(hard_group_by)
-export(imap)
-export(imap_dfr)
 export(insert_ceremony)
 export(is_disk.frame)
 export(lazy)
 export(length_df.chunk_agg.disk.frame)
 export(length_df.collected_agg.disk.frame)
 export(make_glm_streaming_fn)
-export(map)
-export(map2)
 export(map_by_chunk_id)
 export(max_df.chunk_agg.disk.frame)
 export(max_df.collected_agg.disk.frame)
@@ -148,7 +128,6 @@ export(shardkey_equal)
 export(show_boilerplate)
 export(show_ceremony)
 export(srckeep)
-export(srckeepchunks)
 export(sum_df.chunk_agg.disk.frame)
 export(sum_df.collected_agg.disk.frame)
 export(var_df.chunk_agg.disk.frame)
@@ -172,10 +151,8 @@ importFrom(data.table,foverlaps)
 importFrom(data.table,fread)
 importFrom(data.table,rbindlist)
 importFrom(data.table,setDT)
-importFrom(data.table,setkey)
 importFrom(data.table,setkeyv)
 importFrom(data.table,timetaken)
-importFrom(dplyr,add_tally)
 importFrom(dplyr,anti_join)
 importFrom(dplyr,arrange)
 importFrom(dplyr,bind_rows)
@@ -218,7 +195,6 @@ importFrom(future,nbrOfWorkers)
 importFrom(future,plan)
 importFrom(future,sequential)
 importFrom(future.apply,future_lapply)
-importFrom(globals,findGlobals)
 importFrom(glue,glue)
 importFrom(jsonlite,fromJSON)
 importFrom(jsonlite,toJSON)
@@ -230,9 +206,7 @@ importFrom(purrr,map2)
 importFrom(purrr,map_chr)
 importFrom(purrr,map_dfr)
 importFrom(purrr,map_lgl)
-importFrom(rlang,enquos)
-importFrom(rlang,eval_tidy)
-importFrom(rlang,quo)
+importFrom(rlang,enexpr)
 importFrom(stats,median)
 importFrom(stats,quantile)
 importFrom(stats,runif)
@@ -240,9 +214,6 @@ importFrom(stringr,fixed)
 importFrom(utils,capture.output)
 importFrom(utils,head)
 importFrom(utils,memory.limit)
-importFrom(utils,methods)
-importFrom(utils,setTxtProgressBar)
 importFrom(utils,tail)
-importFrom(utils,txtProgressBar)
 importFrom(utils,unzip)
 useDynLib(disk.frame)
diff --git a/NEWS.md b/NEWS.md
index 088222ce..e075c6da 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,8 @@
+# disk.frame 0.6
+* Much better NSE support in disk.frame!
+* removed `hard_arrange` and `hard_group_by`
+* various API updates
+
 # disk.frame 0.5
 * removed `add_count` method
 
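
# Illustrative sketch (not part of the patch): the kind of NSE usage the 0.6
# rewrite targets — globals referenced inside a dplyr verb are now captured by
# walking the environment chain. `multiplier` is a hypothetical global.
library(disk.frame)
library(dplyr)

cars.df = as.disk.frame(cars)
multiplier = 2  # a global that the new chunk mapper is designed to capture

cars.df %>%
  mutate(dist2 = dist * multiplier) %>%
  collect()
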
diff --git a/R/add_chunk.r b/R/add_chunk.r
index 215feb85..08fee3d2 100644
--- a/R/add_chunk.r
+++ b/R/add_chunk.r
@@ -116,9 +116,10 @@ add_chunk <- function(df, chunk, chunk_id = NULL, full.names = FALSE, ...) {
   data.table::setDT(check_vars)
   
   if(nrow(check_vars[is.na(new_chunk)]) > 0) {
+    vars_strings = paste0(check_vars[is.na(new_chunk), colnames], collapse=',\n ')
     warning(
-      glue::glue(
-        "these variables are in the disk.frame but not in the new chunk: \n {paste0(check_vars[is.na(new_chunk), colnames], collapse=',\n ')}"))
+      sprintf(
+        "these variables are in the disk.frame but not in the new chunk: \n %s", vars_strings))
   }
   if(nrow(check_vars[is.na(existing_df)]) > 0){
     warning(glue::glue("these variables are in the new chunk but not in the existing disk.frame: {paste0(check_vars[is.na(existing_df), colnames], collapse=', ')}"))
diff --git a/R/anti_join.r b/R/anti_join.r
index dc473a6e..2108bdf6 100644
--- a/R/anti_join.r
+++ b/R/anti_join.r
@@ -3,9 +3,10 @@
 #' @param merge_by_chunk_id the merge is performed by chunk id
 #' @param overwrite overwrite output directory
 #' @param .progress Show progress or not. Defaults to FALSE
+#' @param suffix see dplyr::XXX_join
+#' @param keep see dplyr::XXX_join
 #' @param ... same as dplyr's joins
 #' @rdname join
-#' @importFrom rlang quo enquos
 #' @importFrom dplyr anti_join left_join full_join semi_join inner_join
 #' @return disk.frame or data.frame/data.table
 #' @export
@@ -29,11 +30,11 @@ anti_join.disk.frame <- function(x, y, by=NULL, copy=FALSE, ..., outdir = tempfi
   overwrite_check(outdir, overwrite)
   
   if("data.frame" %in% class(y)) {
-    quo_dotdotdot = enquos(...)
-    cmap_dfr.disk.frame(x, ~{
-      code = quo(anti_join(.x, y, by = by, copy = copy, !!!quo_dotdotdot))
-      rlang::eval_tidy(code)
+    tmp = cmap.disk.frame(x, ~{
+      anti_join(.x, y, by = by, copy = copy, ...)
     }, .progress = .progress)
+    
+    return(tmp)
   } else if("disk.frame" %in% class(y)) {
     if(is.null(merge_by_chunk_id)) {
       stop("both x and y are disk.frames. You need to specify merge_by_chunk_id = TRUE or FALSE explicitly")
@@ -47,12 +48,12 @@ anti_join.disk.frame <- function(x, y, by=NULL, copy=FALSE, ..., outdir = tempfi
     
     if (merge_by_chunk_id == FALSE) {
       warning("merge_by_chunk_id = FALSE. This will take significantly longer and the preparations needed are performed eagerly which may lead to poor performance. Consider making y a data.frame or set merge_by_chunk_id = TRUE for better performance.")
-      x = hard_group_by(x, by, nchunks = max(ncy,ncx), overwrite = TRUE)
-      y = hard_group_by(y, by, nchunks = max(ncy,ncx), overwrite = TRUE)
+      ncxy = max(ncy,ncx)
+      x = rechunk(x, shardby=by, nchunks = ncxy, outdir=tempfile(fileext = ".jdf"), overwrite = FALSE)
+      y = rechunk(y, shardby=by, nchunks = ncxy, outdir=tempfile(fileext = ".jdf"), overwrite = FALSE)
       return(anti_join.disk.frame(x, y, by, copy = copy, outdir = outdir, merge_by_chunk_id = TRUE, overwrite = overwrite))
     } else if ((identical(shardkey(x)$shardkey, "") & identical(shardkey(y)$shardkey, "")) | identical(shardkey(x), shardkey(y))) {
       res = cmap2.disk.frame(x, y, ~{
-      #res = cmap2(x, y, ~{
         if(is.null(.y)) {
           return(.x)
         } else if (is.null(.x)) {
diff --git a/R/as.disk.frame.r b/R/as.disk.frame.r
index f55c2a78..2e208e03 100644
--- a/R/as.disk.frame.r
+++ b/R/as.disk.frame.r
@@ -25,7 +25,6 @@
 #' delete(cars_new_location.df)
 #' delete(cars_chunks.df)
 as.disk.frame <- function(df, outdir = tempfile(fileext = ".df"), nchunks = recommend_nchunks(df), overwrite = FALSE, shardby = NULL, compress = 50,...) {
-  stopifnot("data.frame" %in% class(df))
   overwrite_check(outdir, overwrite)
   
   data.table::setDT(df)
diff --git a/R/bind_rows.r b/R/bind_rows.r
new file mode 100644
index 00000000..59b6cf4b
--- /dev/null
+++ b/R/bind_rows.r
@@ -0,0 +1,6 @@
+#' Bind rows
+#' @param ... disk.frame to be row bound
+#' @export
+bind_rows.disk.frame <- function(...) {
+  rbindlist.disk.frame(list(...))
+}
\ No newline at end of file
diff --git a/R/chunk_mapper.r b/R/chunk_mapper.r
index 2063252a..1b0d3ff7 100644
--- a/R/chunk_mapper.r
+++ b/R/chunk_mapper.r
@@ -31,59 +31,49 @@
 #' @param chunk_fn The dplyr function to create a mapper for
 #' @param warning_msg The warning message to display when invoking the mapper
 #' @param as.data.frame force the input chunk of a data.frame; needed for dtplyr
-#' @importFrom rlang enquos quo
 #' @export
-create_chunk_mapper <- function(chunk_fn, warning_msg = NULL, as.data.frame = TRUE) {
-  return_func <- function(.data, ...) {
-    if (!is.null(warning_msg)) {
+create_chunk_mapper <- function(chunk_fn, warning_msg = NULL, as.data.frame = FALSE) {
+  if(as.data.frame) {
+    warning("`as.data.frame` is deprecated in create_chunk_mapper")
+  }
+  
+  return(function(.data, ...) {
+    if(!is.null(warning_msg)) {
       warning(warning_msg)
     }
     
+    # you need to use list otherwise the names will be gone
+    code = substitute(chunk_fn(.disk.frame.chunk, ...))
-    quo_dotdotdot = rlang::enquos(...)
+    if (paste0(deparse(code), collapse="") == "chunk_fn(NULL)") {
+      globals_and_pkgs = future::getGlobalsAndPackages(expression(chunk_fn()))
+    } else {
+      globals_and_pkgs = future::getGlobalsAndPackages(code)
+    }
     
-    # this is designed to capture any global stuff
-    vars_and_pkgs = future::getGlobalsAndPackages(quo_dotdotdot)
-    data_for_eval_tidy = force(vars_and_pkgs$globals)
-    res = cmap(.data, ~{
-      
-      this_env = environment()
-      
-      if(length(data_for_eval_tidy) > 0) {
-        for(i in 1:length(data_for_eval_tidy)) {
-          assign(names(data_for_eval_tidy)[i], data_for_eval_tidy[[i]], pos = this_env)
-        }
-      }
-      
-      lapply(quo_dotdotdot, function(x) {
-        attr(x, ".Environment") = this_env
-      })
-      
-      if(as.data.frame) {
-        if("grouped_df" %in% class(.x)) {
-          code = rlang::quo(chunk_fn(.x, !!!quo_dotdotdot))
-        } else {
-          code = rlang::quo(chunk_fn(as.data.frame(.x), !!!quo_dotdotdot))
-        }
-      } else {
-        code = rlang::quo(chunk_fn(.x, !!!quo_dotdotdot))
+    global_vars = globals_and_pkgs$globals
+    
+    env = parent.frame()
+    
+    done = identical(env, emptyenv()) || identical(env, globalenv())
+    
+    # keep adding global variables by moving up the environment chain
+    while(!done) {
+      tmp_globals_and_pkgs = future::getGlobalsAndPackages(code, envir = env)
+      new_global_vars = tmp_globals_and_pkgs$globals
+      for (name in setdiff(names(new_global_vars), names(global_vars))) {
+        global_vars[[name]] <- new_global_vars[[name]]
       }
-      # ZJ: we need both approaches. TRUST ME
-      # TODO better NSE at some point need dist
-      tryCatch({
-        return(rlang::eval_tidy(code))
-      }, error = function(e) {
-        as_label_code = rlang::as_label(code)
-        if(as_label_code == "chunk_fn(...)") {
-          stop(glue::glue("disk.frame has detected a syntax error in \n\n`{code}`\n\n. If you believe your syntax is correct, raise an issue at https://github.com/xiaodaigh/disk.frame with a MWE"))
-        } else {
-          # likely to be dealing with data.tables
-          return(eval(parse(text=as_label_code), envir = this_env))
-        }
-      })
-    }, lazy = TRUE)
-  }
-  return_func
-}
\ No newline at end of file
+      done = identical(env, emptyenv()) || identical(env, globalenv())
+      env = parent.env(env)
+    }
+    
+    globals_and_pkgs$globals = global_vars
+    
+    attr(.data, "recordings") = c(attr(.data, "recordings"), list(globals_and_pkgs))
+    
+    .data
+  })
+}
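
# Illustrative sketch (not part of the patch): how verbs generated by the
# rewritten create_chunk_mapper() behave. Each call only *records* the code
# and its captured globals on the "recordings" attribute; nothing touches the
# chunks until they are materialised, e.g. by collect().
library(disk.frame)
library(dplyr)

cars.df = as.disk.frame(cars)

# filter.disk.frame is defined in this patch as
# create_chunk_mapper(dplyr::filter), so this only appends a recording
lazy.df = filter(cars.df, speed > 10)

collect(lazy.df)  # the recordings are replayed against every chunk here
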
diff --git a/R/clapply.r b/R/clapply.r
new file mode 100644
index 00000000..e69de29b
diff --git a/R/cmap.r b/R/cmap.r
index ca18a26c..a9e578ce 100644
--- a/R/cmap.r
+++ b/R/cmap.r
@@ -2,20 +2,15 @@
 #' @param .x a disk.frame
 #' @param .f a function to apply to each of the chunks
 #' @param outdir the output directory
-#' @param keep the columns to keep from the input
-#' @param chunks The number of chunks to output
 #' @param lazy if TRUE then do this lazily
-#' @param compress 0-100 fst compression ratio
-#' @param overwrite if TRUE removes any existing chunks in the data
 #' @param use.names for cmap_dfr's call to data.table::rbindlist. See data.table::rbindlist
 #' @param fill for cmap_dfr's call to data.table::rbindlist. See data.table::rbindlist
 #' @param idcol for cmap_dfr's call to data.table::rbindlist. See data.table::rbindlist
-#' @param vars_and_pkgs variables and packages to send to a background session. This is typically automatically detected
-#' @param .progress A logical, for whether or not to print a progress bar for multiprocess, multisession, and multicore plans. From {furrr}
-#' @param ... for compatibility with `purrr::map`
-#' @import fst
-#' @importFrom purrr as_mapper map
-#' @importFrom future.apply future_lapply
+#' @param .id ignored
+#' @param keep The columns to keep at source
+#' @param compress The compression setting. 0-100
+#' @param overwrite Whether to overwrite any files in the output directory
+#' @param ... Passed to `collect` and `write_disk.frame`
 #' @export
 #' @examples
 #' cars.df = as.disk.frame(cars)
@@ -56,78 +51,11 @@ cmap <- function(.x, .f, ...) {
 cmap.disk.frame <- function(
   .x,
   .f,
-  ...,
-  outdir = NULL,
-  keep = NULL,
-  chunks = nchunks(.x),
-  compress = 50,
-  lazy = TRUE,
-  overwrite = FALSE,
-  vars_and_pkgs = future::getGlobalsAndPackages(.f, envir = parent.frame()), .progress = TRUE) {
+  ...) {
   .f = purrr::as_mapper(.f)
-  if(lazy) {
-    attr(.x, "lazyfn") =
-      c(
-        attr(.x, "lazyfn"),
-        list(
-          list(
-            func = .f,
-            vars_and_pkgs = vars_and_pkgs,
-            dotdotdot = list(...)
-          )
-        )
-      )
-    return(.x)
-  }
-  
-  if(!is.null(outdir)) {
-    overwrite_check(outdir, overwrite)
-  }
-  
-  stopifnot(is_ready(.x))
-  
-  keep1 = attr(.x,"keep", exact=TRUE)
-  
-  if(is.null(keep)) {
-    keep = keep1
-  }
-  
-  path <- attr(.x, "path")
-  files <- list.files(path, full.names = TRUE)
-  files_shortname <- list.files(path)
-  
-  keep_future = keep
-  
-  cid = get_chunk_ids(.x, full.names = TRUE)
-  
-  dotdotdot = list(...)
-  res = future.apply::future_lapply(1:length(files), function(ii, ...) {
-    #res = lapply(1:length(files), function(ii) {
-    ds = disk.frame::get_chunk(.x, cid[ii], keep=keep_future, full.names = TRUE)
-    
-    res = .f(ds, ...)
-    # res = do.call(.f, c(ds, dotdotdot))
-    
-    if(!is.null(outdir)) {
-      if(nrow(res) == 0) {
-        warning(glue::glue("The output chunk has 0 row, therefore chunk {ii} NOT written"))
-      } else {
-        fst::write_fst(res, file.path(outdir, files_shortname[ii]), compress)
-      }
-      return(ii)
-    } else {
-      return(res)
-    }
-  }, ...,
-  future.seed=TRUE # to get rid of the error TODO investigate making this better
-  )
-  
-  if(!is.null(outdir)) {
-    return(disk.frame(outdir))
-  } else {
-    return(res)
-  }
+  result = create_chunk_mapper(.f)(.x, ...)
+  return(result)
 }
 
 #' @export
@@ -143,8 +71,8 @@ cmap_dfr.disk.frame <- function(.x, .f, ..., .id = NULL, use.names = fill, fill
     warning(".id is not NULL, but the parameter is not used with cmap_dfr.disk.frame")
   }
   
-  # TODO warn the user if outdir is cmap_dfr
-  data.table::rbindlist(cmap.disk.frame(.x, .f, ..., outdir = NULL, lazy = FALSE), use.names = use.names, fill = fill, idcol = idcol)
+  list_df = collect_list(cmap.disk.frame(.x, .f, ...))
+  data.table::rbindlist(list_df, use.names = use.names, fill = fill, idcol = idcol)
 }
 
 
@@ -158,13 +86,13 @@ cimap <- function(.x, .f, ...) {
 #' second is the chunk ID
 #' @export
 #' @rdname cmap
-cimap.disk.frame <- function(.x, .f, outdir = NULL, keep = NULL, chunks = nchunks(.x), compress = 50, lazy = TRUE, overwrite = FALSE, ...) {
-  .f = purrr::as_mapper(.f)
+cimap.disk.frame <- function(.x, .f, outdir = NULL, keep = NULL, lazy = TRUE, overwrite = FALSE, compress=50, ...) {
+  .f = purrr_as_mapper(.f)
   
   # TODO support lazy for cimap
   if(lazy) {
     stop("cimap.disk.frame: lazy = TRUE is not supported at this stage")
-    attr(.x, "lazyfn") = c(attr(.x, "lazyfn"), .f)
+    attr(.x, "recordings") = c(attr(.x, "recordings"), .f)
     return(.x)
   }
@@ -246,7 +174,6 @@ delayed.disk.frame <- function(.x, .f, ...) {
 
 #' @export
 #' @rdname cmap
-chunk_lapply <- function (...) {
-  warning("chunk_lapply is deprecated in favour of cmap.disk.frame")
+clapply <- function (...) {
   cmap.disk.frame(...)
 }
diff --git a/R/collect.r b/R/collect.r
index a403098f..cf5add15 100644
--- a/R/collect.r
+++ b/R/collect.r
@@ -23,15 +23,14 @@
 #' delete(cars.df)
 #' @export
 #' @rdname collect
-collect.disk.frame <- function(x, ..., parallel = !is.null(attr(x,"lazyfn"))) {
+collect.disk.frame <- function(x, ..., parallel = !is.null(attr(x,"recordings"))) {
   cids = get_chunk_ids(x, full.names = TRUE, strip_extension = FALSE)
-  #cids = as.integer(get_chunk_ids(x))
   
   if(nchunks(x) > 0) {
    if(parallel) {
-      future.apply::future_lapply(cids, function(.x) {
-        get_chunk(x, .x, full.names = TRUE)
-      }, future.seed = TRUE) %>%
-        rbindlist()
+      tmp = future.apply::future_lapply(cids, function(.x) {
+        get_chunk(x, .x, full.names = TRUE)
+      }, future.seed = TRUE)
+      return(rbindlist(tmp))
     } else {
       purrr::map_dfr(cids, ~get_chunk(x, .x, full.names = TRUE))
     }
@@ -52,24 +51,26 @@ collect.disk.frame <- function(x, ..., parallel = !is.null(attr(x,"lazyfn"))) {
 #'
 #' # clean up
 #' delete(cars.df)
-collect_list <- function(x, simplify = FALSE, parallel = !is.null(attr(x,"lazyfn"))) {
+collect_list <- function(x, simplify = FALSE, parallel = !is.null(attr(x,"recordings")), ...) {
+  # get the chunk ids
   cids = get_chunk_ids(x, full.names = TRUE, strip_extension = FALSE)
-  
-  if(nchunks(x) > 0) {
-    res <- NULL
+  if(length(cids) > 0) {
+    list_of_results = NULL
     if (parallel) {
-      #res = furrr::future_map(1:nchunks(x), ~get_chunk(x, .x))
-      res = future.apply::future_lapply(cids, function(.x) {
+      list_of_results = future.apply::future_lapply(cids, function(.x) {
         get_chunk(x, .x, full.names = TRUE)
       }, future.seed=TRUE)
     } else {
-      res = purrr::map(cids, ~get_chunk(x, .x, full.names = TRUE))
+      list_of_results = lapply(cids, function(cid) {
+        get_chunk(x, cid, full.names = TRUE)
+      })
     }
+    
     if (simplify) {
-      return(simplify2array(res))
+      return(simplify2array(list_of_results))
     } else {
-      return(res)
+      return(list_of_results)
     }
   } else {
     list()
   }
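
# Illustrative sketch (not part of the patch): collect_list() materialises one
# element per chunk, which is what the new cmap_dfr.disk.frame() builds on.
library(disk.frame)

cars.df = as.disk.frame(cars, nchunks = 2)

# one data.table per chunk; fetched in parallel when recordings are attached
chunks = collect_list(cmap(cars.df, ~.x[1, ]))
length(chunks)  # 2
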
diff --git a/R/collect.summarized_disk.frame.r b/R/collect.summarized_disk.frame.r
index b9ba042e..edd686a4 100644
--- a/R/collect.summarized_disk.frame.r
+++ b/R/collect.summarized_disk.frame.r
@@ -23,8 +23,128 @@
 #' delete(cars.df)
 #' @export
 #' @rdname collect
-collect.summarized_disk.frame <- function(x, ..., parallel = !is.null(attr(x,"lazyfn"))) {
-  code_to_run = glue::glue("x %>% {attr(x, 'summarize_code') %>% as.character}")
-  class(x) <- "disk.frame"
-  eval(parse(text = code_to_run))
-}
+collect.summarized_disk.frame <-
+  function(x, ..., parallel = !is.null(attr(x, "recordings"))) {
+    dotdotdot <- attr(x, 'summarize_code')
+    
+    # make a copy
+    dotdotdot_chunk_agg <- dotdotdot
+    dotdotdot_collected_agg <- dotdotdot
+    
+    i = 1
+    for (a_call in dotdotdot) {
+      # obtain the function call name
+      func_call_str = paste0(deparse(a_call[[1]]), collapse = "")
+      
+      # parse(...) returns an expression, but I just want the sole symbol which
+      # can be extracted with [[1]]
+      func_call_chunk_agg = parse(text = paste0(func_call_str, "_df.chunk_agg.disk.frame"))[[1]]
+      # replace the function call with the chunk_agg_function
+      dotdotdot_chunk_agg[[i]][[1]] = func_call_chunk_agg
+      
+      func_call_collected_agg = paste0(func_call_str, "_df.collected_agg.disk.frame")
+      # replace the function call with the chunk_agg_function
+      dotdotdot_collected_agg[[i]] = parse(text = sprintf(
+        "%s(%s)",
+        func_call_collected_agg,
+        paste0(".disk.frame.tmp", i)
+      ))[[1]]
+      i = i + 1
+      # TODO extract global variables from here and store them in the global
+    }
+    
+    group_by_vars = attr(x, "group_by_cols")
+    
+    # figure out how many group by arguments there are
+    n_grp_args = length(group_by_vars)
+    
+    # generate a function call with as many arguments
+    x_as.disk.frame = x
+    class(x_as.disk.frame) = "disk.frame"
+    first_stage_code = eval(parse(
+      text = sprintf(
+        "quote(chunk_group_by(x_as.disk.frame, %s))",
+        paste0(rep_len("NULL", n_grp_args), collapse = ", ")
+      )
+    ))
+    
+    if (n_grp_args >= 1) {
+      for (i in 1:n_grp_args) {
+        first_stage_code[[i + 2]] = group_by_vars[[i]]
+      }
+    }
+    
+    # TODO add appropriate environment
+    tmp_df = eval(first_stage_code)
+    
+    n_summ_args = length(dotdotdot_chunk_agg)
+    
+    chunk_summ_code =
+      eval(parse(text = sprintf(
+        "quote(chunk_summarise(tmp_df, %s))",
+        paste0("NULL", 1:n_summ_args, collapse = ", ")
+      )))
+    
+    
+    chunk_summ_code_str = chunk_summ_code %>%
+      deparse %>%
+      paste0(collapse = "")
+    
+    for (i in 1:n_summ_args) {
+      lhs = sprintf(".disk.frame.tmp%d", i)
+      rhs = paste0(deparse(dotdotdot_chunk_agg[[i]]), collapse = "")
+      
+      tmp_code = paste0("NULL", i)
+      chunk_summ_code_str = gsub(
+        pattern = tmp_code,
+        sprintf("%s=list(%s)", lhs, rhs),
+        chunk_summ_code_str,
+        fixed = TRUE
+      )
+    }
+    
+    tmp2 = collect(eval(parse(text = chunk_summ_code_str)))
+    
+    second_stage_code = eval(parse(text = sprintf(
+      "quote(group_by(tmp2, %s))", paste0(rep_len("NULL", n_grp_args), collapse = ", ")
+    )))
+    
+    if (n_grp_args >= 1) {
+      for (i in 1:n_grp_args) {
+        second_stage_code[[i + 2]] = group_by_vars[[i]]
+      }
+    }
+    
+    tmp3 = eval(second_stage_code)
+    
+    n_summ2_args = length(dotdotdot_collected_agg)
+    # final stage of summary
+    chunk_summ2_code =
+      eval(parse(text = sprintf(
+        "quote(summarise(tmp3, %s))",
+        paste0(rep_len("NULL", n_summ2_args), collapse = ", ")
+      )))
+    
+    names_chunk_summ_code = names(dotdotdot_chunk_agg)
+    for (i in 1:n_summ_args) {
+      chunk_summ2_code[[i + 2]] = dotdotdot_collected_agg[[i]]
+    }
+    
+    tmp4 = eval(chunk_summ2_code)
+    
+    names_tmp4 = names(tmp4)
+    
+    orig_names = sapply(dotdotdot, function(code) {
+      code %>%
+        deparse %>%
+        paste0(collapse = "")
+    })
+    
+    
+    names(tmp4)[(n_grp_args + 1):length(names_tmp4)] = ifelse(names_chunk_summ_code == "",
+                                                              orig_names,
+                                                              names_chunk_summ_code)
+    
+    return(tmp4)
+  }
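
# Illustrative sketch (not part of the patch): the two-stage aggregation that
# collect.summarized_disk.frame() assembles. A one-pass summary is rewritten
# into a per-chunk aggregation plus a collected aggregation, conceptually:
library(disk.frame)
library(dplyr)

cars.df = as.disk.frame(cars, nchunks = 2)

# what the user writes:
cars.df %>%
  summarise(mean_dist = mean(dist)) %>%
  collect()

# roughly what runs: mean_df.chunk_agg.disk.frame() inside each chunk, then
# mean_df.collected_agg.disk.frame() over the per-chunk results.
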
diff --git a/R/compute.r b/R/compute.r
index 99d31037..f2a95a4e 100644
--- a/R/compute.r
+++ b/R/compute.r
@@ -1,11 +1,10 @@
-#' Compute without writing
+#' Force computations. The results are stored in a folder.
 #' @description
 #' Perform the computation; same as calling cmap without .f and lazy = FALSE
 #' @param x a disk.frame
 #' @param outdir the output directory
-#' @param overwrite whether to overwrite or not
-#' @param name Not used. Kept for compatibility with dplyr
-#' @param ... Not used. Kept for dplyr compatibility
+#' @param name If not NULL then used as outdir prefix.
+#' @param ... Passed to `write_disk.frame`
 #' @export
 #' @importFrom dplyr compute
 #' @examples
@@ -17,7 +16,11 @@
 #' # clean up
 #' delete(cars.df)
 #' delete(cars.df3)
-compute.disk.frame <- function(x, name, outdir = tempfile("tmp_df_", fileext=".df"), overwrite = TRUE, ...) {
-  overwrite_check(outdir, overwrite)
-  write_disk.frame(x, outdir = outdir, overwrite = TRUE)
+compute.disk.frame <- function(x, name = NULL, outdir = tempfile("tmp_df_", fileext=".df"), ...) {
+  if (!is.null(name)) {
+    warning("in `compute.disk.frame()`, `name` is not NULL; using `name` as the file name prefix for the temporary `outdir`")
+    outdir = tempfile(name, fileext=".df")
+  }
+  
+  write_disk.frame(x, outdir = outdir, ...)
 }
diff --git a/R/csv2disk.frame.r b/R/csv2disk.frame.r
index d63ee0e2..ac929b77 100644
--- a/R/csv2disk.frame.r
+++ b/R/csv2disk.frame.r
@@ -19,8 +19,7 @@
 #'   the highest compression ratio.
 #' @param overwrite Whether to overwrite the existing directory
 #' @param header Whether the files have header. Defaults to TRUE
-#' @param .progress A logical, for whether or not to print a progress bar for
-#'   multiprocess, multisession, and multicore plans. From {furrr}
+#' @param .progress A logical, for whether or not to show progress
 #' @param backend The CSV reader backend to choose: "data.table" or "readr".
 #'   disk.frame does not have its own CSV reader. It uses either
 #'   data.table::fread or readr::read_delimited. It is worth noting that
@@ -66,20 +65,20 @@ csv_to_disk.frame <- function(infile, outdir = tempfile(fileext = ".df"), inmapf
   
   overwrite_check(outdir, overwrite)
   
-  # we need multiple backend because data.table has poor support for when the file is larger than RAM
+  # we need multiple backends because data.table has poor support for when the file is larger than RAM
   # https://github.com/Rdatatable/data.table/issues/3526
   # TODO detect these cases
   
   # user has requested chunk-wise reading but wants me to do it
   #if(is.null(in_chunk_size)) {
-    
+  
   #} else if(is.character(in_chunk_size) && in_chunk_size == "guess") {
-    
-    #library(bigreadr)
-    # system.time(wc_l <- R.utils::countLines(infile))
-    # system.time(infos_split <- split_file(infile, every_nlines = 1e7))
-    # file_parts <- get_split_files(infos_split)
+  
+  #library(bigreadr)
+  # system.time(wc_l <- R.utils::countLines(infile))
+  # system.time(infos_split <- split_file(infile, every_nlines = 1e7))
+  # file_parts <- get_split_files(infos_split)
   #} else if(is.numeric(in_chunk_size)) {
@@ -307,12 +306,10 @@ csv_to_disk.frame_data.table_backend <- function(infile, outdir = tempfile(filee
       message("")
     }
     
-    outdf_tmp = furrr::future_imap(infile, ~{
-      dotdotdotorigarg1 = c(dotdotdotorigarg, list(outdir = file.path(tempdir(), .y), infile=.x))
-      
-      pryr::do_call(csv_to_disk.frame_data.table_backend, dotdotdotorigarg1)
-    },
-    .progress = .progress)
+    outdf_tmp = future.apply::future_lapply(1:length(infile), function(i) {
+      dotdotdotorigarg1 = c(dotdotdotorigarg, list(outdir = file.path(tempdir(), i), infile=infile[i]))
+      do.call(csv_to_disk.frame_data.table_backend, dotdotdotorigarg1)
+    })
     
     if(.progress) {
       message(paste("-- Converting CSVs to disk.frame -- Stage 1 or 2 took:", data.table::timetaken(pt)))
@@ -330,7 +327,6 @@ csv_to_disk.frame_data.table_backend <- function(infile, outdir = tempfile(filee
     outdf = rbindlist.disk.frame(outdf_tmp, outdir = outdir, by_chunk_id = TRUE, compress = compress, overwrite = overwrite, .progress = .progress)
     
     if(.progress) {
-      message(paste("Stage 2 of 2 took:", data.table::timetaken(pt2)))
       message(" 
 ----------------------------------------------------- ")
       message(paste("Stage 1 & 2 in total took:", data.table::timetaken(pt)))
diff --git a/R/data.table.r b/R/data.table.r
index 41226504..0d12184f 100644
--- a/R/data.table.r
+++ b/R/data.table.r
@@ -9,7 +9,6 @@
 #' @import fst
 #' @importFrom future.apply future_lapply
 #' @importFrom data.table rbindlist
-#' @importFrom globals findGlobals
 #' @export
 #' @examples
 #' cars.df = as.disk.frame(cars)
@@ -19,28 +18,58 @@
 #' # clean up
 #' delete(cars.df)
 `[.disk.frame` <- function(df, ..., keep = NULL, rbind = TRUE, use.names = TRUE, fill = FALSE, idcol = NULL) {
+  message("data.table syntax for disk.frame may be moved to a separate package in the future")
+  
   keep_for_future = keep
-  dotdotdot = substitute(...()) #this is an alist
+  code = substitute(chunk[...])
   
   # sometimes the arguments could be empty
   # in a recent version of globals that would cause a fail
   # to avoid the fail remove them from the test
-  dotdotdot_for_find_global = dotdotdot[!sapply(sapply(dotdotdot, as.character), function(x) all(unlist(x) == ""))]
+  #dotdotdot_for_find_global = dotdotdot[!sapply(sapply(dotdotdot, as.character), function(x) all(unlist(x) == ""))]
   
-  ag = globals::findGlobals(dotdotdot_for_find_global)
+  #ag = globals::findGlobals(dotdotdot_for_find_global)
   #ag = setdiff(ag, "") # "" can cause issues with future # this line no longer needed
   
-  res = future.apply::future_lapply(get_chunk_ids(df, strip_extension = FALSE), function(chunk_id) {
-    #lapply(get_chunk_ids(df, strip_extension = FALSE), function(chunk_id) {
-    chunk = get_chunk(df, chunk_id, keep = keep_for_future)
+  
+  # you need to use list otherwise the names will be gone
+  if (paste0(deparse(code), collapse="") == "chunk_fn(NULL)") {
+    globals_and_pkgs = future::getGlobalsAndPackages(expression(chunk_fn()))
+  } else {
+    globals_and_pkgs = future::getGlobalsAndPackages(code)
+  }
+  
+  global_vars = globals_and_pkgs$globals
+  
+  env = parent.frame()
+  
+  done = identical(env, emptyenv()) || identical(env, globalenv())
+  
+  # keep adding global variables by moving up the environment chain
+  while(!done) {
+    tmp_globals_and_pkgs = future::getGlobalsAndPackages(code, envir = env)
+    new_global_vars = tmp_globals_and_pkgs$globals
+    for (name in setdiff(names(new_global_vars), names(global_vars))) {
+      global_vars[[name]] <- new_global_vars[[name]]
+    }
+    
+    done = identical(env, emptyenv()) || identical(env, globalenv())
+    env = parent.env(env)
+  }
+  
+  globals_and_pkgs$globals = global_vars
+  
+  res = future.apply::future_lapply(get_chunk_ids(df, full.names = TRUE), function(chunk_id) {
+    #res = lapply(get_chunk_ids(df, full.names = TRUE), function(chunk_id) {
+    chunk = get_chunk(df, chunk_id, full.names=TRUE, keep = keep_for_future)
     data.table::setDT(chunk)
-    expr <- quote(chunk)
-    expr <- c(expr, dotdotdot)
-    res <- do.call(`[`, expr)
+    res = eval(code, envir=globals_and_pkgs$globals)
     res
-  }, future.globals = c("df", "keep_for_future", "dotdotdot", ag), future.packages = c("data.table","disk.frame"),
-  future.seed=TRUE
+  }
+  , future.packages = c("data.table", globals_and_pkgs$packages),
+  future.seed=TRUE
   )
   
   if(rbind & all(sapply(res, function(x) "data.frame" %in% class(x)))) {
@@ -53,14 +82,6 @@
 }
 
 # Solutions from https://stackoverflow.com/questions/57122960/how-to-use-non-standard-evaluation-nse-to-evaluate-arguments-on-data-table?answertab=active#tab-top
-# `[.dd` <- function(x, ...) {
-#   code <- rlang::enexprs(...)
-#   lapply(x, function(dt) {
-#     ex <- rlang::expr(dt[!!!code])
-#     rlang::eval_tidy(ex)
-#   })
-# }
-# 
 # 
 # `[.dd` <- function(x,...) {
 #   a <- substitute(...()) #this is an alist
diff --git a/R/disk.frame.r b/R/disk.frame.r
index c949f43b..09894efc 100755
--- a/R/disk.frame.r
+++ b/R/disk.frame.r
@@ -213,7 +213,7 @@ is.dir.disk.frame <- function(df, check.consistency = TRUE) {
 head.disk.frame <- function(x, n = 6L, ...) {
   stopifnot(is_ready(x))
   path1 <- attr(x,"path")
-  cmds <- attr(x, "lazyfn")
+  cmds <- attr(x, "recordings")
   if(fs::dir_exists(path1)) {
     path2 <- list.files(path1,full.names = TRUE)[1]
     head(play(fst::read_fst(path2, from = 1, to = n, as.data.table = TRUE), cmds), n = n, ...)
@@ -229,7 +229,7 @@ head.disk.frame <- function(x, n = 6L, ...) {
 tail.disk.frame <- function(x, n = 6L, ...) {
   stopifnot(is_ready(x))
   path1 <- attr(x,"path")
-  cmds <- attr(x, "lazyfn")
+  cmds <- attr(x, "recordings")
   if(dir.exists(path1)) {
     path2 <- list.files(path1,full.names = TRUE)
     path2 <- path2[length(path2)]
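
# Illustrative sketch (not part of the patch): the data.table syntax that
# `[.disk.frame` supports, including a global variable that the environment
# walk above is designed to capture. `speed_cutoff` is hypothetical.
library(disk.frame)

cars.df = as.disk.frame(cars)

speed_cutoff = 15  # a global picked up by the getGlobalsAndPackages() walk

# runs chunk[speed > speed_cutoff, .(mean_dist = mean(dist))] on every chunk,
# then rbinds the per-chunk results
cars.df[speed > speed_cutoff, .(mean_dist = mean(dist))]
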
diff --git a/R/dplyr_verbs.r b/R/dplyr_verbs.r
index 75be5774..285c678f 100644
--- a/R/dplyr_verbs.r
+++ b/R/dplyr_verbs.r
@@ -23,13 +23,8 @@
 #'
 #' # clean up cars.df
 #' delete(cars.df)
-select.disk.frame <- function(.data, ...) {
-  quo_dotdotdot = rlang::enquos(...)
-  cmap(.data, ~{
-    code = rlang::quo(dplyr::select(.x, !!!quo_dotdotdot))
-    rlang::eval_tidy(code)
-  }, lazy = TRUE)
-}
+select.disk.frame <- create_chunk_mapper(dplyr::select)
+
 
 #' @export
 #' @rdname dplyr_verbs
@@ -43,7 +38,6 @@ filter.disk.frame <- create_chunk_mapper(dplyr::filter)
 #' @export
 #' @rdname dplyr_verbs
 #' @importFrom future getGlobalsAndPackages
-#' @importFrom rlang eval_tidy quo enquos
 #' @importFrom dplyr mutate
 mutate.disk.frame <- create_chunk_mapper(dplyr::mutate)
@@ -65,21 +59,21 @@ arrange.disk.frame =create_chunk_mapper(dplyr::arrange, warning_msg="`arrange.di
 #' @rdname dplyr_verbs
 chunk_arrange <- create_chunk_mapper(dplyr::arrange)
 
-
 # TODO family is not required is group-by
 # TODO alot of these .disk.frame functions are not generic
+# TODO make this work like in dplyr
 
 #' #' @export
 #' #' @importFrom dplyr add_count
 #' #' @rdname dplyr_verbs
 #' add_count.disk.frame <- create_chunk_mapper(dplyr::add_count)
 
-#' @export
-#' @importFrom dplyr add_tally
-#' @rdname dplyr_verbs
-add_tally.disk.frame <- create_chunk_mapper(dplyr::add_tally)
+#' #' @export
+#' #' @importFrom dplyr add_tally
+#' #' @rdname dplyr_verbs
+#' add_tally.disk.frame <- create_chunk_mapper(dplyr::add_tally)
 
 
 #' @export
@@ -94,10 +88,10 @@ chunk_summarize <- create_chunk_mapper(dplyr::summarize)
 chunk_summarise <- create_chunk_mapper(dplyr::summarise)
 
 
-#' @export
-#' @rdname dplyr_verbs
-#' @importFrom dplyr do
-do.disk.frame <- create_chunk_mapper(dplyr::do)
+#' #' @export
+#' #' @rdname dplyr_verbs
+#' #' @importFrom dplyr do
+#' do.disk.frame <- create_chunk_mapper(dplyr::do)
 
 
 #' @export
@@ -148,37 +142,3 @@ chunk_ungroup = create_chunk_mapper(dplyr::ungroup)
 glimpse.disk.frame <- function(.data, ...) {
   glimpse(head(.data, ...), ...)
 }
-
-# Internal methods
-# @param .data the data
-# @param cmd the function to record
-record <- function(.data, cmd){
-  attr(.data,"lazyfn") <- c(attr(.data,"lazyfn"), list(cmd))
-  .data
-}
-
-# Internal methods
-# @param .data the disk.frame
-# @param cmds the list of function to play back
-play <- function(.data, cmds=NULL) {
-  for (cmd in cmds){
-    if (typeof(cmd) == "closure") {
-      .data <- cmd(.data)
-    } else {
-      # create a temporary environment
-      an_env = new.env(parent = environment())
-      
-      ng = names(cmd$vars_and_pkgs$globals)
-      
-      if(length(ng) > 0) {
-        for(i in 1:length(cmd$vars_and_pkgs$globals)) {
-          g = cmd$vars_and_pkgs$globals[[i]]
-          assign(ng[i], g, pos = an_env)
-        }
-      }
-      
-      .data <- do.call(cmd$func, c(list(.data),cmd$dotdotdot), envir = an_env)
-    }
-  }
-  .data
-}
diff --git a/R/foverlaps.disk.frame.r b/R/foverlaps.disk.frame.r
index 2ae7c706..657b8c65 100644
--- a/R/foverlaps.disk.frame.r
+++ b/R/foverlaps.disk.frame.r
@@ -84,7 +84,7 @@ foverlaps.disk.frame <- function(
     dotdotdot$x = data1
     dotdotdot$y = data2
     
-    data3 = pryr::do_call(foverlaps, dotdotdot)
+    data3 = do.call(foverlaps, dotdotdot)
     rm(data1); rm(data2); gc()
     outdir
     fst::write_fst(data3, glue::glue("{outdir}/{chunk_id}"), compress = compress)
diff --git a/R/full_join.r b/R/full_join.r
index 54dc2ba9..50ba772b 100644
--- a/R/full_join.r
+++ b/R/full_join.r
@@ -15,8 +15,8 @@ full_join.disk.frame <- function(x, y, by=NULL, copy=FALSE, ..., outdir = tempfi
   if("data.frame" %in% class(y)) {
     # full join cannot be support for y in data.frame
     ncx = nchunks(x)
-    dy = shard(y, shardby = by, nchunks = ncx, overwrite = TRUE)
-    dx = hard_group_by(x, by = by, overwrite = TRUE)
+    dy = shard(y, shardby = by, nchunks = ncx, overwrite = FALSE)
+    dx = rechunk(x, shardby = by, outdir=tempfile(fileext = ".jdf"), overwrite = FALSE)
     return(full_join.disk.frame(dx, dy, by, copy=copy, outdir=outdir, merge_by_chunk_id = TRUE))
   } else if("disk.frame" %in% class(y)) {
     if(is.null(merge_by_chunk_id)) {
@@ -30,8 +30,8 @@ full_join.disk.frame <- function(x, y, by=NULL, copy=FALSE, ..., outdir = tempfi
     ncy = nchunks(y)
     if (merge_by_chunk_id == FALSE) {
       warning("merge_by_chunk_id = FALSE. This will take significantly longer and the preparations needed are performed eagerly which may lead to poor performance. Consider making y a data.frame or set merge_by_chunk_id = TRUE for better performance.")
-      x = hard_group_by(x, by, nchunks = max(ncy,ncx), overwrite = TRUE)
-      y = hard_group_by(y, by, nchunks = max(ncy,ncx), overwrite = TRUE)
+      x = rechunk(x, by, nchunks = max(ncy,ncx), outdir=tempfile(fileext = ".jdf"), overwrite = FALSE)
+      y = rechunk(y, by, nchunks = max(ncy,ncx), outdir=tempfile(fileext = ".jdf"), overwrite = FALSE)
       return(full_join.disk.frame(x, y, by, copy = copy, outdir = outdir, merge_by_chunk_id = TRUE, overwrite = overwrite, .progress = .progress))
     } else if ((identical(shardkey(x)$shardkey, "") & identical(shardkey(y)$shardkey, "")) | identical(shardkey(x), shardkey(y))) {
       res = cmap2(x, y, ~{
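
# Illustrative sketch (not part of the patch): the join pattern the changes
# above optimise. When both sides are disk.frames sharded on the join key,
# the join can run chunk-by-chunk with merge_by_chunk_id = TRUE.
library(disk.frame)
library(dplyr)

a.df = shard(data.frame(id = 1:100, x = runif(100)), shardby = "id", nchunks = 4)
b.df = shard(data.frame(id = 1:100, y = runif(100)), shardby = "id", nchunks = 4)

# both sides share the same shardkey, so chunks are joined pairwise
ab.df = full_join(a.df, b.df, by = "id", merge_by_chunk_id = TRUE)
collect(ab.df)
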
diff --git a/R/get_chunk.r b/R/get_chunk.r
index 1e3cfcbd..01c06e61 100644
--- a/R/get_chunk.r
+++ b/R/get_chunk.r
@@ -26,38 +26,18 @@ get_chunk <- function(...) {
 #' @export
 get_chunk.disk.frame <- function(df, n, keep = NULL, full.names = FALSE, ...) {
   stopifnot("disk.frame" %in% class(df))
-  keep_chunks = attr(df, "keep_chunks", exact=TRUE)
-  
-  # print(names(attr(df, "lazyfn")[[1]]$vars_and_pkgs$globals))
-  # stop("ok")
-  
-  # TODO relax this
-  # if(!is.null(keep_chunks)) {
-  #   # browser()
-  #   # n_int = as.integer(n)
-  #   # 
-  #   # if(is.na(n_int)) {
-  #   #   if(as.character(n) %in% get_chunk_ids(df)[keep_chunks]) {
-  #   #     return(NULL)
-  #   #   } else if(normalizePath(as.character(n)) %in% sapply(get_chunk_ids(df, full.names = TRUE)[keep_chunks],normalizePath)) {
-  #   #     return(NULL)
-  #   #   }
-  #   # } else {
-  #   #   if(!n %in% keep_chunk) {
-  #   #     return(NULL)
-  #   #   }
-  #   # }
-  # }
+  # keep_chunks = attr(df, "keep_chunks", exact=TRUE)
   
   path = attr(df,"path", exact=TRUE)
   
   # all the variables to keep in the attr from a previous srckeep
-  keep1 = attr(df,"keep", exact=TRUE)
+  keep1 = attr(df, "keep", exact=TRUE)
   
-  cmds = attr(df,"lazyfn", exact=TRUE)
+  recordings = attr(df, "recordings", exact=TRUE)
   filename = ""
   
   if (typeof(keep) == "closure") {
+    # sometimes purrr::keep is picked up
     keep = keep1
   } else if(!is.null(keep1) & !is.null(keep)) {
     if (length(setdiff(keep, keep1)) > 0) {
@@ -89,7 +69,8 @@ get_chunk.disk.frame <- function(df, n, keep = NULL, full.names = FALSE, ...) {
     }
   }
   
-  # if the file you are looking for don't exist
+  
+  # if the file you are looking for doesn't exist
   if (!fs::file_exists(filename)) {
     warning(glue("The chunk {filename} does not exist; returning an empty data.table"))
     notbl <- data.table()
@@ -97,17 +78,19 @@ get_chunk.disk.frame <- function(df, n, keep = NULL, full.names = FALSE, ...) {
     return(notbl)
   }
   
-  if (is.null(cmds)) {
-    if(typeof(keep)!="closure") {
-      fst::read_fst(filename, columns = keep, as.data.table = TRUE,...)
-    } else {
+  
+  if (is.null(recordings)) {
+    if(typeof(keep)=="closure") {
       fst::read_fst(filename, as.data.table = TRUE,...)
+    } else {
+      fst::read_fst(filename, columns = keep, as.data.table = TRUE,...)
     }
   } else {
     if(typeof(keep)!="closure") {
-      play(fst::read_fst(filename, columns = keep, as.data.table = TRUE,...), cmds)
+      play(fst::read_fst(filename, columns = keep, as.data.table = TRUE,...), recordings)
     } else {
-      play(fst::read_fst(filename, as.data.table = TRUE,...), cmds)
+      play(fst::read_fst(filename, as.data.table = TRUE,...), recordings)
     }
   }
 }
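
# Illustrative sketch (not part of the patch): srckeep() stores the "keep"
# attribute that get_chunk.disk.frame() reads back, so only the named columns
# are loaded from the fst files on disk.
library(disk.frame)
library(dplyr)

cars.df = as.disk.frame(cars)

cars.df %>%
  srckeep("dist") %>%    # only the dist column is read from each chunk
  collect()

get_chunk(cars.df, 1)     # reads one chunk eagerly
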
diff --git a/R/hard_arrange.r b/R/hard_arrange.r
index f5ca5de5..ca8cee7e 100644
--- a/R/hard_arrange.r
+++ b/R/hard_arrange.r
@@ -1,67 +1,67 @@
-#' Perform a hard arrange
-#' @description
-#' A hard_arrange is a sort by that also reorganizes the chunks to ensure that
-#' every unique grouping of `by`` is in the same chunk. Or in other words, every
-#' row that share the same `by` value will end up in the same chunk.
-#' @param df a disk.frame
-#' @param ... grouping variables
-#' @param outdir the output directory
-#' @param nchunks The number of chunks in the output. Defaults = nchunks.disk.frame(df)
-#' @param overwrite overwrite the out put directory
-#' @param add same as dplyr::arrange
-#' @param .drop same as dplyr::arrange
-#' @export
-#' @examples
-#' iris.df = as.disk.frame(iris, nchunks = 2)
+#' #' Perform a hard arrange
+#' #' @description
+#' #' A hard_arrange is a sort by that also reorganizes the chunks to ensure that
+#' #' every unique grouping of `by`` is in the same chunk. Or in other words, every
+#' #' row that share the same `by` value will end up in the same chunk.
+#' #' @param df a disk.frame
+#' #' @param ... grouping variables
+#' #' @param outdir the output directory
+#' #' @param nchunks The number of chunks in the output. Defaults = nchunks.disk.frame(df)
+#' #' @param overwrite overwrite the out put directory
+#' #' @param add same as dplyr::arrange
+#' #' @param .drop same as dplyr::arrange
+#' #' @export
+#' #' @examples
+#' #' iris.df = as.disk.frame(iris, nchunks = 2)
+#' #'
+#' #' # arrange iris.df by specifies and ensure rows with the same specifies are in the same chunk
+#' #' iris_hard.df = hard_arrange(iris.df, Species)
+#' #'
+#' #' get_chunk(iris_hard.df, 1)
+#' #' get_chunk(iris_hard.df, 2)
+#' #'
+#' #' # clean up cars.df
+#' #' delete(iris.df)
+#' #' delete(iris_hard.df)
+#' hard_arrange <- function(df, ..., add = FALSE, .drop = FALSE) {
+#'   UseMethod("hard_arrange")
+#' }
 #'
-#' # arrange iris.df by specifies and ensure rows with the same specifies are in the same chunk
-#' iris_hard.df = hard_arrange(iris.df, Species)
+#' #' @rdname hard_arrange
+#' #' @export
+#' #' @importFrom dplyr arrange
+#' hard_arrange.data.frame <- function(df, ...) {
+#'   dplyr::arrange(df, ...)
+#' }
 #'
-#' get_chunk(iris_hard.df, 1)
-#' get_chunk(iris_hard.df, 2)
+#' #' @rdname hard_arrange
+#' #' @importFrom purrr map
+#' #' @export
+#' hard_arrange.disk.frame <- function(df, ..., outdir=tempfile("tmp_disk_frame_hard_arrange"), nchunks = disk.frame::nchunks(df), overwrite = TRUE) {
+#'   overwrite_check(outdir, overwrite)
+#'
+#'   # Refer also to Dplyr arrange: https://github.com/tidyverse/dplyr/blob/master/src/arrange.cpp
+#'   q <- enquos(...)
+#'   is_sym <- sapply(q, rlang::quo_is_symbol)
+#'   arrange_codes <- sapply(q, rlang::as_label)
+#'
+#'   # Check if desc...
+#'   is_desc <- substr(arrange_codes, 1, 5) == "desc("
+#'
+#'   # If expr is a symbol from the data, just use it.
+#'   # Otherwise need to evaluate ...
+#'   # (TODO - currently only support variables and desc in the data)
+#'   # Peels off "desc" from the original
+#'   vars <- sub(")", "", sub("desc(", "", arrange_codes, fixed=TRUE), fixed=TRUE)
 #'
-#' # clean up cars.df
-#' delete(iris.df)
-#' delete(iris_hard.df)
-hard_arrange <- function(df, ..., add = FALSE, .drop = FALSE) {
-  UseMethod("hard_arrange")
-}
-
-#' @rdname hard_arrange
-#' @export
-#' @importFrom dplyr arrange
-hard_arrange.data.frame <- function(df, ...) {
-  dplyr::arrange(df, ...)
-}
-
-#' @rdname hard_arrange
-#' @importFrom purrr map
-#' @export
-hard_arrange.disk.frame <- function(df, ..., outdir=tempfile("tmp_disk_frame_hard_arrange"), nchunks = disk.frame::nchunks(df), overwrite = TRUE) {
-  overwrite_check(outdir, overwrite)
-
-  # Refer also to Dplyr arrange: https://github.com/tidyverse/dplyr/blob/master/src/arrange.cpp
-  q <- enquos(...)
-  is_sym <- sapply(q, rlang::quo_is_symbol)
-  arrange_codes <- sapply(q, rlang::as_label)
-
-  # Check if desc...
-  is_desc <- substr(arrange_codes, 1, 5) == "desc("
-
-  # If expr is a symbol from the data, just use it.
-  # Otherwise need to evaluate ...
-  # (TODO - currently only support variables and desc in the data)
-  # Peels off "desc" from the original
-  vars <- sub(")", "", sub("desc(", "", arrange_codes, fixed=TRUE), fixed=TRUE)
-
-  desc_vars <- vars[is_desc]
-
-  if(!all(vars %in% colnames(df))){
-    stop(paste0("Expressions currently not supported. Columns not found in colnames:", vars[!vars %in% colnames(df)]))
-  }
-
-  # Hard group by in a partially sorted way at the chunk level and then arrange within chunks
-  df %>%
-    disk.frame::hard_group_by(vars, outdir=outdir, nchunks=nchunks, overwrite=overwrite, shardby_function="sort", desc_vars=desc_vars) %>%
-    chunk_arrange(...)
-}
\ No newline at end of file
+#' desc_vars <- vars[is_desc]
+#'
+#' if(!all(vars %in% colnames(df))){
+#'   stop(paste0("Expressions currently not supported. Columns not found in colnames:", vars[!vars %in% colnames(df)]))
+#' }
+#'
+#' # Hard group by in a partially sorted way at the chunk level and then arrange within chunks
+#' df %>%
+#'   disk.frame::hard_group_by(vars, outdir=outdir, nchunks=nchunks, overwrite=overwrite, shardby_function="sort", desc_vars=desc_vars) %>%
+#'   chunk_arrange(...)
+#' }
\ No newline at end of file
diff --git a/R/hard_group_by.r b/R/hard_group_by.r
index f0397cde..48228564 100644
--- a/R/hard_group_by.r
+++ b/R/hard_group_by.r
@@ -1,217 +1,217 @@
-#' Show a progress bar of the action being performed
-#' @importFrom utils txtProgressBar setTxtProgressBar
-#' @param df a disk.frame
-#' @noRd
-progressbar <- function(df) {
-  if(attr(df,"performing", exact=TRUE) == "hard_group_by") {
-    # create progress bar
-    
-    shardby = "acct_id"
-    #list.files(
-    fparent = attr(df,"parent", exact=TRUE)
-    
-    #tmp = file.path(fparent,".performing","inchunks")
-    tmp = "tmphardgroupby2"
-    
-    l = length(list.files(fparent))
-    pt_begin_split = proc.time()
-    doprog <- function(pt_from, sleep = 1) {
-      #tkpb = winProgressBar(title = sprintf("Hard Group By Stage 1(/2) - %s", shardby), label = "Checking completeness",
-      #                      min = 0, max = l*1.5, initial = 0, width = 500)
-      pb <- txtProgressBar(min = 0, max = l*1.5, style = 3)
-      
-      on.exit(close(pb))
-      # on.exit(close(tkpb))
-      while(length(list.files(file.path(tmp,l))) < l) {
-        wl = length(list.files(file.path(tmp,1:l)))/l
-        tt <- proc.time()[3] - pt_from[3]
-        #list.files(
-        avg_speed = tt/wl
-        pred_speed = avg_speed*(l-wl) + avg_speed*l/2
-        elapsed = round(tt/60,1)
-        
-        #setWinProgressBar(tkpb, wl,
-        #                  title = sprintf("Hard Group By Stage 1(/2) - %s", shardby),
-        #                  label = sprintf("%.0f out of %d; avg speed %.2f mins; elapsed %.1f mins; another %.1f mins", wl,l, round(avg_speed/60,2), elapsed, round(pred_speed/60,2)))
-        setTxtProgressBar(pb, length(list.files(file.path(tmp,l))),
-                          title = sprintf("Group By - %s", shardby))
-        Sys.sleep(sleep)
-      }
-    }
-    doprog(pt_begin_split, 1)
-    
-    pt_begin_collate = proc.time()
-    doprog2 <- function(pt_from, sleep = 1) {
-      # tkpb = winProgressBar(title = sprintf("Hard Group By - %s -- Stage 2 (of 2) collating", shardby), label = "Checking completeness",
-      #                       min = 0, max = l*1.5, initial = 0, width = 600)
-      pb <- txtProgressBar(min = 0, max = l*1.5, style = 3)
-      
-      on.exit(close(pb))
-      # on.exit(close(tkpb))
-      while(length(list.files("large_sorted")) < l) {
-        wl = length(list.files("large_sorted"))
-        tt <- proc.time()[3] - pt_from[3]
-        #list.files(
-        avg_speed = tt/wl
-        pred_speed = avg_speed*(l-wl)
-        elapsed = round(tt/60,1)
-        
-        # setWinProgressBar(tkpb, l + wl/2,
-        #                   title = sprintf("Hard Group By - %s -- Stage 2 (of 2) collating -- %.0f out of %d chunks processed;", shardby, wl, l),
-        #                   label = sprintf("avg %.2f min/chunk; %.1f mins elapsed; %.1f mins remaining;", round(avg_speed/60,2), elapsed, round(pred_speed/60,2)))
-        setTxtProgressBar(pb, length(list.files("large_sorted")),
-                          title = sprintf("Hard Group By - %s", shardby))
-        Sys.sleep(sleep)
-      }
-    }
-    doprog2(pt_begin_collate, 1)
-  }
-}
-
-#' Perform a hard group
-#' @description
-#' A hard_group_by is a group by that also reorganizes the chunks to ensure that
-#' every unique grouping of `by`` is in the same chunk. Or in other words, every
-#' row that share the same `by` value will end up in the same chunk.
-#' @param df a disk.frame
-#' @param ... grouping variables
-#' @param outdir the output directory
-#' @param nchunks The number of chunks in the output. Defaults = nchunks.disk.frame(df)
-#' @param overwrite overwrite the out put directory
-#' @param .add same as dplyr::group_by
-#' @param .drop same as dplyr::group_by
-#' @param shardby_function splitting of chunks: "hash" for hash function or "sort" for semi-sorted chunks
-#' @param sort_splits for the "sort" shardby function, a dataframe with the split values.
-#' @param desc_vars for the "sort" shardby function, the variables to sort descending.
-#' @param sort_split_sample_size for the "sort" shardby function, if sort_splits is null, the number of rows to sample per chunk for random splits.
-#' @export
-#' @examples
-#' iris.df = as.disk.frame(iris, nchunks = 2)
+#' #' Show a progress bar of the action being performed
+#' #' @importFrom utils txtProgressBar setTxtProgressBar
+#' #' @param df a disk.frame
+#' #' @noRd
+#' progressbar <- function(df) {
+#'   if(attr(df,"performing", exact=TRUE) == "hard_group_by") {
+#'     # create progress bar
+#'
+#'     shardby = "acct_id"
+#'     #list.files(
+#'     fparent = attr(df,"parent", exact=TRUE)
+#'
+#'     #tmp = file.path(fparent,".performing","inchunks")
+#'     tmp = "tmphardgroupby2"
+#'
+#'     l = length(list.files(fparent))
+#'     pt_begin_split = proc.time()
+#'     doprog <- function(pt_from, sleep = 1) {
+#'       #tkpb = winProgressBar(title = sprintf("Hard Group By Stage 1(/2) - %s", shardby), label = "Checking completeness",
+#'       #                      min = 0, max = l*1.5, initial = 0, width = 500)
+#'       pb <- txtProgressBar(min = 0, max = l*1.5, style = 3)
+#'
+#'       on.exit(close(pb))
+#'       # on.exit(close(tkpb))
+#'       while(length(list.files(file.path(tmp,l))) < l) {
+#'         wl = length(list.files(file.path(tmp,1:l)))/l
+#'         tt <- proc.time()[3] - pt_from[3]
+#'         #list.files(
+#'         avg_speed = tt/wl
+#'         pred_speed = avg_speed*(l-wl) + avg_speed*l/2
+#'         elapsed = round(tt/60,1)
+#'
+#'         #setWinProgressBar(tkpb, wl,
+#'         #                  title = sprintf("Hard Group By Stage 1(/2) - %s", shardby),
+#'         #                  label = sprintf("%.0f out of %d; avg speed %.2f mins; elapsed %.1f mins; another %.1f mins", wl,l, round(avg_speed/60,2), elapsed, round(pred_speed/60,2)))
+#'         setTxtProgressBar(pb, length(list.files(file.path(tmp,l))),
+#'                           title = sprintf("Group By - %s", shardby))
+#'         Sys.sleep(sleep)
+#'       }
+#'     }
+#'     doprog(pt_begin_split, 1)
+#'
+#'     pt_begin_collate = proc.time()
+#'     doprog2 <- function(pt_from, sleep = 1) {
+#'       # tkpb = winProgressBar(title = sprintf("Hard Group By - %s -- Stage 2 (of 2) collating", shardby), label = "Checking completeness",
+#'       #                       min = 0, max = l*1.5, initial = 0, width = 600)
+#'       pb <- txtProgressBar(min = 0, max = l*1.5, style = 3)
+#'
+#'       on.exit(close(pb))
+#'       # on.exit(close(tkpb))
+#'       while(length(list.files("large_sorted")) < l) {
+#'         wl = length(list.files("large_sorted"))
+#'         tt <- proc.time()[3] - pt_from[3]
+#'         #list.files(
+#'         avg_speed = tt/wl
+#'         pred_speed = avg_speed*(l-wl)
+#'         elapsed = round(tt/60,1)
+#'
+#'         # setWinProgressBar(tkpb, l + wl/2,
+#'         #                   title = sprintf("Hard Group By - %s -- Stage 2 (of 2) collating -- %.0f out of %d chunks processed;", shardby, wl, l),
+#'         #                   label = sprintf("avg %.2f min/chunk; %.1f mins elapsed; %.1f mins remaining;", round(avg_speed/60,2), elapsed, round(pred_speed/60,2)))
+#'         setTxtProgressBar(pb, length(list.files("large_sorted")),
+#'                           title = sprintf("Hard Group By - %s", shardby))
+#'         Sys.sleep(sleep)
+#'       }
+#'     }
+#'     doprog2(pt_begin_collate, 1)
+#'   }
+#' }
 #'
-#' # group_by iris.df by specifies and ensure rows with the same specifies are in the same chunk
-#' iris_hard.df = hard_group_by(iris.df, Species)
+#' #' Perform a hard group
+#' #' @description
+#' #' A hard_group_by is a group by that also reorganizes the chunks to ensure that
+#' #' every unique grouping of `by`` is in the same chunk. Or in other words, every
+#' #' row that share the same `by` value will end up in the same chunk.
+#' #' @param df a disk.frame
+#' #' @param ... grouping variables
+#' #' @param outdir the output directory
+#' #' @param nchunks The number of chunks in the output. Defaults = nchunks.disk.frame(df)
+#' #' @param overwrite overwrite the out put directory
+#' #' @param .add same as dplyr::group_by
+#' #' @param .drop same as dplyr::group_by
+#' #' @param shardby_function splitting of chunks: "hash" for hash function or "sort" for semi-sorted chunks
+#' #' @param sort_splits for the "sort" shardby function, a dataframe with the split values.
+#' #' @param desc_vars for the "sort" shardby function, the variables to sort descending.
+#' #' @param sort_split_sample_size for the "sort" shardby function, if sort_splits is null, the number of rows to sample per chunk for random splits.
+#' #' @export
+#' #' @examples
+#' #' iris.df = as.disk.frame(iris, nchunks = 2)
+#' #'
+#' #' # group_by iris.df by specifies and ensure rows with the same specifies are in the same chunk
+#' #' iris_hard.df = hard_group_by(iris.df, Species)
+#' #'
+#' #' get_chunk(iris_hard.df, 1)
+#' #' get_chunk(iris_hard.df, 2)
+#' #'
+#' #' # clean up cars.df
+#' #' delete(iris.df)
+#' #' delete(iris_hard.df)
+#' hard_group_by <- function(df, ..., .add = FALSE, .drop = FALSE) {
+#'   UseMethod("hard_group_by")
+#' }
 #'
-#' get_chunk(iris_hard.df, 1)
-#' get_chunk(iris_hard.df, 2)
+#' #' @rdname hard_group_by
+#' #' @export
+#' #' @importFrom dplyr group_by
+#' hard_group_by.data.frame <- function(df, ..., .add = FALSE, .drop = FALSE) {
+#'   dplyr::group_by(df, ..., .add = FALSE, .drop = FALSE)
+#' }
 #'
-#' # clean up cars.df
-#' delete(iris.df)
-#' delete(iris_hard.df)
-hard_group_by <- function(df, ..., .add = FALSE, .drop = FALSE) {
-  UseMethod("hard_group_by")
-}
-
-#' @rdname hard_group_by
-#' @export
-#' @importFrom dplyr group_by
-hard_group_by.data.frame <- function(df, ..., .add = FALSE, .drop = FALSE) {
-  dplyr::group_by(df, ..., .add = FALSE, .drop = FALSE)
-}
-
-#' @rdname hard_group_by
-#' @importFrom purrr map
-#' @export
-hard_group_by.disk.frame <- function(
-  df,
-  ...,
-  outdir=tempfile("tmp_disk_frame_hard_group_by"),
-  nchunks = disk.frame::nchunks(df),
-  overwrite = TRUE,
-  shardby_function="hash",
-  sort_splits=NULL,
-  desc_vars=NULL,
-  sort_split_sample_size=100
-  ) {
-  overwrite_check(outdir, overwrite)
-  
-  ff = list.files(attr(df, "path"))
-  stopifnot(shardby_function %in% c("hash", "sort"))
-  
-  if (shardby_function == "sort" && is.null(sort_splits)){
-    # Sample enough per chunk to generate reasonable splits
-    sample_size_per_chunk = ceiling(nchunks / disk.frame::nchunks(df)) * sort_split_sample_size
-    
-    # Sample and sort
-    sort_splits_sample <- cmap(df, dplyr::sample_n, size=sample_size_per_chunk, replace=TRUE) %>%
-      select(...) %>%
-      collect()
-    
-    # NSE
-    tryCatch({
-      sort_splits_sample <- sort_splits_sample %>%
-        arrange(!!!syms(...))
-    }, error = function(e) {
-      sort_splits_sample <- sort_splits_sample %>%
-        arrange(...)
-    })
-    
-    # If 100 chunks, this return get 99 splits based on percentiles.
-    ntiles <- round((1:(nchunks-1)) * (nrow(sort_splits_sample) / (nchunks)))
-    
-    # Get splits. May lead to less than nchunks if duplicates are selected.
-    sort_splits <- sort_splits_sample %>%
-      dplyr::slice(ntiles) %>%
-      distinct()
-  }
-  
-  # test if the unlist it will error
-  
-  tryCatch({
-    # This will return the variable names
-    
-    # TODO use better ways to do NSE
-    # the below will fail if indeed ... can not be list-ed
-    # there should be a better way to do this
-    by <- unlist(list(...))
-    
-    # shard and create temporary diskframes
-    tmp_df = cmap(df, function(df1) {
-      tmpdir = tempfile()
-      shard(df1, shardby = by, nchunks = nchunks, outdir = tmpdir, overwrite = TRUE, shardby_function=shardby_function, sort_splits=sort_splits, desc_vars=desc_vars)
-    }, lazy = FALSE)
-    
-    
-    # now rbindlist
-    res = rbindlist.disk.frame(tmp_df, outdir=outdir, overwrite = overwrite)
-    
-    # clean up the tmp dir
-    purrr::walk(tmp_df, ~{
-      fs::dir_delete(attr(.x, "path", exact=TRUE))
-    })
-    
-    
-    res1 <- NULL
-    if(typeof(by) == "character") {
-      eval(parse(text = glue::glue('res1 = chunk_group_by(res, {paste(by,collapse=",")})')))
-    } else if(length(by) == 1) {
-      res1 = res %>% dplyr::group_by({{by}})
-    } else {
-      eval(parse(text = glue::glue('res1 = chunk_group_by(res, {paste(by,collapse=",")})')))
-    }
-    
-    res1
-  }, error = function(e) {
-    # message(e)
-    # This will return the variable names
-    by = rlang::enquos(...) %>%
-      substr(2, nchar(.))
-    
-    # shard and create temporary diskframes
-    tmp_df = cmap(df, function(df1) {
-      tmpdir = tempfile()
-      shard(df1, shardby = by, nchunks = nchunks, outdir = tmpdir, overwrite = TRUE, shardby_function=shardby_function, sort_splits=sort_splits, desc_vars=desc_vars)
-    }, lazy = FALSE)
-    
-    # now rbindlist
-    res = rbindlist.disk.frame(tmp_df, outdir=outdir, overwrite = overwrite)
-    
-    # clean up the tmp dir
-    purrr::walk(tmp_df, ~{
-      fs::dir_delete(attr(.x, "path", exact=TRUE))
-    })
-    
-    res1 = res %>% chunk_group_by(!!!syms(by))
-    
-    res1
-  })
-}
+#' #' @rdname hard_group_by
+#' #' @importFrom purrr map
+#' #' @export
+#' hard_group_by.disk.frame <- function(
+#'   df,
+#'   ...,
+#'   outdir=tempfile("tmp_disk_frame_hard_group_by"),
+#'   nchunks = disk.frame::nchunks(df),
+#'   overwrite = TRUE,
+#'   shardby_function="hash",
+#'   sort_splits=NULL,
+#'   desc_vars=NULL,
+#'   sort_split_sample_size=100
+#'   ) {
+#'   overwrite_check(outdir, overwrite)
+#'
+#'   ff = list.files(attr(df, "path"))
+#'   stopifnot(shardby_function %in% c("hash", "sort"))
+#'
+#'   if (shardby_function == "sort" && is.null(sort_splits)){
+#'     # Sample enough per chunk to generate reasonable splits
+#'     sample_size_per_chunk = ceiling(nchunks / disk.frame::nchunks(df)) * sort_split_sample_size
+#'
+#'     # Sample and sort
+#'     sort_splits_sample <- cmap(df, dplyr::sample_n, size=sample_size_per_chunk, replace=TRUE) %>%
+#'       select(...) %>%
+#'       collect()
+#'
+#'     # NSE
+#'     tryCatch({
+#'       sort_splits_sample <- sort_splits_sample %>%
+#'         arrange(!!!syms(...))
+#'     }, error = function(e) {
+#'       sort_splits_sample <- sort_splits_sample %>%
+#'         arrange(...)
+#'     })
+#'
+#'     # If 100 chunks, this return get 99 splits based on percentiles.
+#'     ntiles <- round((1:(nchunks-1)) * (nrow(sort_splits_sample) / (nchunks)))
+#'
+#'     # Get splits. May lead to less than nchunks if duplicates are selected.
+#'     sort_splits <- sort_splits_sample %>%
+#'       dplyr::slice(ntiles) %>%
+#'       distinct()
+#'   }
+#'
+#'   # test if the unlist it will error
+#'
+#'   tryCatch({
+#'     # This will return the variable names
+#'
+#'     # TODO use better ways to do NSE
+#'     # the below will fail if indeed ... can not be list-ed
+#'     # there should be a better way to do this
+#'     by <- unlist(list(...))
+#'
+#'     # shard and create temporary diskframes
+#'     tmp_df = cmap(df, function(df1) {
+#'       tmpdir = tempfile()
+#'       shard(df1, shardby = by, nchunks = nchunks, outdir = tmpdir, overwrite = TRUE, shardby_function=shardby_function, sort_splits=sort_splits, desc_vars=desc_vars)
+#'     }, lazy = FALSE)
+#'
+#'
+#'     # now rbindlist
+#'     res = rbindlist.disk.frame(tmp_df, outdir=outdir, overwrite = overwrite)
+#'
+#'     # clean up the tmp dir
+#'     purrr::walk(tmp_df, ~{
+#'       fs::dir_delete(attr(.x, "path", exact=TRUE))
+#'     })
+#'
+#'
+#'     res1 <- NULL
+#'     if(typeof(by) == "character") {
+#'       eval(parse(text = glue::glue('res1 = chunk_group_by(res, {paste(by,collapse=",")})')))
+#'     } else if(length(by) == 1) {
+#'       res1 = res %>% dplyr::group_by({{by}})
+#'     } else {
+#'       eval(parse(text = glue::glue('res1 = chunk_group_by(res, {paste(by,collapse=",")})')))
+#'     }
+#'
+#'     res1
+#'   }, error = function(e) {
+#'     # message(e)
+#'     # This will return the variable names
+#'     by = rlang::enquos(...) %>%
+#'       substr(2, nchar(.))
+#'
+#'     # shard and create temporary diskframes
+#'     tmp_df = cmap(df, function(df1) {
+#'       tmpdir = tempfile()
+#'       shard(df1, shardby = by, nchunks = nchunks, outdir = tmpdir, overwrite = TRUE, shardby_function=shardby_function, sort_splits=sort_splits, desc_vars=desc_vars)
+#'     }, lazy = FALSE)
+#'
+#'     # now rbindlist
+#'     res = rbindlist.disk.frame(tmp_df, outdir=outdir, overwrite = overwrite)
+#'
+#'     # clean up the tmp dir
+#'     purrr::walk(tmp_df, ~{
+#'       fs::dir_delete(attr(.x, "path", exact=TRUE))
+#'     })
+#'
+#'     res1 = res %>% chunk_group_by(!!!syms(by))
+#'
+#'     res1
+#'   })
+#' }
diff --git a/R/inner_join.r b/R/inner_join.r
index 545136f9..5699e71b 100644
--- a/R/inner_join.r
+++ b/R/inner_join.r
@@ -8,7 +8,7 @@
 #' # clean up cars.df
 #' delete(cars.df)
 #' delete(join.df)
-inner_join.disk.frame <- function(x, y, by=NULL, copy=FALSE, ..., outdir = tempfile("tmp_disk_frame_inner_join"), merge_by_chunk_id = NULL, overwrite = TRUE, .progress = FALSE) {
+inner_join.disk.frame <- function(x, y, by=NULL, copy=FALSE, suffix=c(".x", ".y"), ..., keep=FALSE, outdir = tempfile("tmp_disk_frame_inner_join"), merge_by_chunk_id = NULL, overwrite = TRUE, .progress = FALSE) {
   stopifnot("disk.frame" %in% class(x))
   
   overwrite_check(outdir, overwrite)
@@ -24,10 +24,8 @@ inner_join.disk.frame <- function(x, y, by=NULL, copy=FALSE, ..., outdir = tempf
   }
   
   if("data.frame" %in% class(y)) {
-    quo_dotdotdot = enquos(...)
     res = cmap_dfr(x, ~{
-      code = quo(inner_join(.x, y, by = by, copy = copy, !!!quo_dotdotdot))
-      rlang::eval_tidy(code)
+      inner_join(.x, y, by = by, copy = copy, suffix=suffix, ..., keep=keep)
     }, .progress = .progress)
     return(res)
   } else if("disk.frame" %in% class(y)) {
@@ -41,22 +39,18 @@ inner_join.disk.frame <- function(x, y, by=NULL, copy=FALSE, ..., outdir = tempf
     ncx = nchunks(x)
     ncy = nchunks(y)
     if (merge_by_chunk_id == FALSE) {
-      x = hard_group_by(x, by, nchunks = max(ncy,ncx), overwrite = TRUE)
-      y = hard_group_by(y, by, nchunks = max(ncy,ncx), overwrite = TRUE)
-      return(inner_join.disk.frame(x, y, by, outdir = outdir, merge_by_chunk_id = TRUE, overwrite = overwrite))
+      x = rechunk(x, shardby=by, nchunks = max(ncy,ncx), outdir = tempfile(fileext = ".df"), overwrite = FALSE)
+      y = rechunk(y, shardby=by, nchunks = max(ncy,ncx), outdir = tempfile(fileext = ".df"), overwrite = FALSE)
+      return(inner_join.disk.frame(x, y, by, copy=copy, suffix = suffix, ..., keep=keep, outdir = outdir, merge_by_chunk_id = TRUE, overwrite = overwrite))
     } else if ((identical(shardkey(x)$shardkey, "") & identical(shardkey(y)$shardkey, "")) | identical(shardkey(x), shardkey(y))) {
-      dotdotdot <- list(...)
-      
       res = cmap2.disk.frame(x, y, ~{
         if(is.null(.y)) {
           return(data.table())
         } else if (is.null(.x)) {
           return(data.table())
         }
-        #inner_join(.x, .y, by = by, copy = copy, ..., overwrite = overwrite)
-        lij = purrr::lift(dplyr::inner_join)
-        lij(c(list(x = .x, y = .y, by = by, copy = copy), dotdotdot))
-      }, outdir = outdir, .progress = .progress)
+        inner_join(.x, .y, by = by, copy = copy, suffix = suffix, ..., keep=keep)
+      }, outdir = outdir, .progress = .progress, overwrite = overwrite)
       return(res)
     } else {
       # TODO if the shardkey are the same and only the shardchunks are different then just shard again on one of them is fine
diff --git a/R/left_join.r b/R/left_join.r
index 4c1b6e3b..e3b810b8 100644
--- a/R/left_join.r
+++ b/R/left_join.r
@@ -1,3 +1,5 @@
+left_join_y_is_data.frame = create_chunk_mapper(dplyr::left_join)
+
 #' Performs join/merge for disk.frames
 #' @rdname join
 #' @export
@@ -9,21 +11,14 @@
 #' # clean up cars.df
 #' delete(cars.df)
 #' delete(join.df)
-left_join.disk.frame <- function(x, y, by=NULL, copy=FALSE, ..., outdir = tempfile("tmp_disk_frame_left_join"), merge_by_chunk_id = FALSE, overwrite = TRUE, .progress = FALSE) {
+left_join.disk.frame = function(x, y, by=NULL, copy=FALSE, suffix=c(".x", ".y"), ..., keep=FALSE, outdir = tempfile("tmp_disk_frame_left_join"), merge_by_chunk_id = FALSE, overwrite = TRUE, .progress = FALSE) {
   stopifnot("disk.frame" %in% class(x))
-  overwrite_check(outdir, overwrite)
-  
-  if("data.frame" %in% class(y)) {
-    # note that x is named .data in the lazy evaluation
-    quo_dotdotdot = enquos(...)
-    cmap_dfr(x, ~{
-      code = quo(left_join(.x, y, by = by, copy = copy, !!!quo_dotdotdot))
-      rlang::eval_tidy(code)
-    }, .progress = .progress)
-  } else if("disk.frame" %in% class(y)) {
+  
+  if ("data.frame" %in% class(y)) {
+    left_join_y_is_data.frame(x, y, by=by, copy=copy, suffix=suffix, ..., keep=keep)
+  } else {
     if(is.null(merge_by_chunk_id)) {
-      stop("both x and y are disk.frames. You need to specify merge_by_chunk_id = TRUE or FALSE explicitly")
+      stop("Both `x` and `y` are disk.frames. 
You need to specify `merge_by_chunk_id = TRUE` or `FALSE` explicitly")
   }
   if(is.null(by)) {
     by <- intersect(names(x), names(y))
@@ -32,12 +27,11 @@ left_join.disk.frame <- function(x, y, by=NULL, copy=FALSE, ..., outdir = tempfi
   ncx = nchunks(x)
   ncy = nchunks(y)
   if (merge_by_chunk_id == FALSE) {
-    warning("merge_by_chunk_id = FALSE. This will take significantly longer and the preparations needed are performed eagerly which may lead to poor performance. Consider making y a data.frame or set merge_by_chunk_id = TRUE for better performance.")
-    x = hard_group_by(x, by, nchunks = max(ncy,ncx), overwrite = TRUE)
-    y = hard_group_by(y, by, nchunks = max(ncy,ncx), overwrite = TRUE)
+    warning("`merge_by_chunk_id = FALSE`. This will take significantly longer and the preparations needed are performed eagerly which may lead to poor performance. Consider making `y` a data.frame or set merge_by_chunk_id = TRUE for better performance.")
+    x = rechunk(x, nchunks = max(ncy, ncx), shardby = by, outdir=tempfile(), overwrite = FALSE)
+    y = rechunk(y, nchunks = max(ncy, ncx), shardby = by, outdir=tempfile(), overwrite = FALSE)
     return(left_join.disk.frame(x, y, by, copy = copy, outdir = outdir, merge_by_chunk_id = TRUE, overwrite = overwrite, .progress = .progress))
   } else if(merge_by_chunk_id == TRUE) {
-    #} else if ((identical(shardkey(x)$shardkey, "") & identical(shardkey(y)$shardkey, "")) | identical(shardkey(x), shardkey(y))) {
     dotdotdot = list(...)
     res = cmap2.disk.frame(x, y, ~{
       if(is.null(.y)) {
@@ -45,9 +39,9 @@ left_join.disk.frame <- function(x, y, by=NULL, copy=FALSE, ..., outdir = tempfi
       } else if (is.null(.x)) {
         return(data.table())
       }
-      llj = purrr::lift(dplyr::left_join)
-      #left_join(.x, .y, by = by, copy = copy, ...)
-      llj(c(list(x=.x, y =.y, by = by, copy = copy), dotdotdot))
+      left_join(.x, .y, by = by, copy = copy, suffix=suffix, ..., keep=keep)
+      #llj = purrr::lift(dplyr::left_join)
+      #llj(c(list(x=.x, y =.y, by = by, copy = copy), dotdotdot))
     }, outdir = outdir)
     return(res)
   } else {
@@ -56,3 +50,4 @@ left_join.disk.frame <- function(x, y, by=NULL, copy=FALSE, ..., outdir = tempfi
   }
 }
 }
+
\ No newline at end of file
diff --git a/R/map-deprecated.r b/R/map-deprecated.r
deleted file mode 100644
index bac52e3d..00000000
--- a/R/map-deprecated.r
+++ /dev/null
@@ -1,85 +0,0 @@
-#' @export
-#' @rdname cmap
-map <- function(.x, .f, ...) {
-  UseMethod("map")
-}
-
-#' @export
-#' @rdname cmap
-map.disk.frame <- function(...) {
-  warning("map(df, ...) where df is a disk.frame has been deprecated. Please use cmap(df,...) instead")
-  cmap.disk.frame(...)
-}
-
-#' @export
-#' @rdname cmap
-map.default <- function(.x, .f, ...) {
-  purrr::map(.x, .f, ...)
-}
-
-
-#' @export
-#' @rdname cmap
-imap_dfr <- function(.x, .f, ..., .id = NULL) {
-  UseMethod("imap_dfr")
-}
-
-#' @export
-#' @rdname cmap
-imap_dfr.disk.frame <- function(...) {
-  warning("imap_dfr(df, ...) where df is disk.frame is deprecated. Please use cimap_dfr(df, ...) instead")
-  cimap_dfr.disk.frame(...)
-}
-
-#' @export
-#' @rdname cmap
-imap_dfr.default <- function(.x, .f, ..., .id = NULL) {
-  purrr::imap_dfr(.x, .f, ..., .id = .id)
-}
-
-#' @export
-#' @rdname cmap
-#' @examples
-#' cars.df = as.disk.frame(cars)
-#'
-#' # .x is the chunk and .y is the ID as an integer
-#'
-#' # lazy = TRUE support is not available at the moment
-#' cimap(cars.df, ~.x[, id := .y], lazy = FALSE)
-#'
-#' cimap_dfr(cars.df, ~.x[, id := .y])
-#'
-#' # clean up cars.df
-#' delete(cars.df)
-imap <- function(.x, .f, ...) 
{ - UseMethod("imap") -} - -imap.disk.frame <- function(...) { - warning("imap(df,..) where df is disk.frame is deprecated. Use cimap(df, ...) instead") - cimap.disk.frame(...) -} - -#' @export -#' @rdname cmap -imap.default <- function(.x, .f, ...) { - purrr::imap(.x, .f, ...) -} - -#' @rdname cmap -#' @param .id not used -#' @export -map_dfr.disk.frame <- function(...) { - warning("map_dfr(df, ...) where df is disk.frame is deprecated. Please use cmap_dfr instead") - cmap_dfr.disk.frame(...) -} - -map_dfr <- function(.x, .f, ..., .id = NULL) { - UseMethod("map_dfr") -} - -#' @export -#' @rdname cmap -map_dfr.default <- function(.x, .f, ..., .id = NULL) { - purrr::map_dfr(.x, .f, ..., .id = .id) -} \ No newline at end of file diff --git a/R/map2.r b/R/map2.r index a2882b2f..22633241 100644 --- a/R/map2.r +++ b/R/map2.r @@ -26,22 +26,6 @@ cmap2 <- function(.x, .y, .f, ...){ UseMethod("cmap2") } -#' @export -#' @rdname cmap2 -map2 <- function(.x, .y, .f, ...){ - UseMethod("map2") -} - -#' @export -map2.default <- function(.x, .y, .f, ...) { - purrr::map2(.x,.y,.f,...) -} - -#' @export -map2.disk.frame <- function(...) { - warning("map2.disk.frame(df, df1, ..) where df is disk.frame is deprecated. Use cmap(df, df1, ...) instead") - cmap2.disk.frame(...) -} #' @export #' @importFrom pryr do_call @@ -51,10 +35,8 @@ cmap2.disk.frame <- function(.x, .y, .f, ..., outdir = tempfile(fileext = ".df") stop(sprintf("running %s : the .x argument must be a disk.frame", code)) } - .f = purrr::as_mapper(.f) - if("disk.frame" %in% class(.y)) { fs::dir_create(outdir) @@ -65,8 +47,6 @@ cmap2.disk.frame <- function(.x, .y, .f, ..., outdir = tempfile(fileext = ".df") yc[,yid:=get_chunk_ids(.y, full.names = TRUE)] xyc = merge(xc, yc, by="cid", all = TRUE, allow.cartesian = TRUE) - - ddd = list(...) # apply the functions future.apply::future_mapply(function(xid, yid, outid) { @@ -77,11 +57,10 @@ cmap2.disk.frame <- function(.x, .y, .f, ..., outdir = tempfile(fileext = ".df") if(base::nrow(xych) > 0) { fst::write_fst(xych, file.path(outdir, paste0(outid,".fst"))) } else { - warning(glue::glue("one of the chunks, {xid}, is empty")) + warning(sprintf("one of the chunks, %s, is empty", xid)) } NULL - } - ,xyc$xid, xyc$yid, xyc$cid # together with mapply + }, xyc$xid, xyc$yid, xyc$cid # together with mapply , future.seed=NULL ) @@ -91,13 +70,12 @@ cmap2.disk.frame <- function(.x, .y, .f, ..., outdir = tempfile(fileext = ".df") warning("in cmap2(.x,.y,...) the .y is not a disk.frame, so returning a list instead of a disk.frame") f_for_passing = force(.f) - ddd = list(...) tmp_disk.frame = force(.x) - res = furrr::future_map2(get_chunk_ids(tmp_disk.frame, full.names = TRUE), .y, function(xs, ys) { - ddd = c(list(get_chunk(tmp_disk.frame, xs, full.names = TRUE), ys), ddd) - - pryr::do_call(f_for_passing, ddd) - }) + res = future.apply::future_mapply(function(xs, ys, ...) { + ddd = c(list(get_chunk(tmp_disk.frame, xs, full.names = TRUE), ys), ...) + do.call(f_for_passing, ddd) + }, get_chunk_ids(tmp_disk.frame, full.names = TRUE), .y, ..., SIMPLIFY=FALSE, future.seed = TRUE) + return(res) } diff --git a/R/map_by_chunk_id.r b/R/map_by_chunk_id.r index f446848a..32ee0d16 100644 --- a/R/map_by_chunk_id.r +++ b/R/map_by_chunk_id.r @@ -2,5 +2,5 @@ #' @export map_by_chunk_id <- function(.x, .y, .f, ..., outdir) { warning("map_by_chunk_id is deprecated. 
Use map2 instead")
-  map2.disk.frame(.x, .y, .f, ..., outdir = outdir)
+  cmap2.disk.frame(.x, .y, .f, ..., outdir = outdir)
 }
\ No newline at end of file
diff --git a/R/names.r b/R/names.r
index 10954ad4..05d047da 100644
--- a/R/names.r
+++ b/R/names.r
@@ -25,8 +25,9 @@ names.disk.frame <- function(x, ...) {
 #' @export
 colnames.disk.frame <- function(x, ...) {
   res = attr(x, "path", exact=TRUE) %>%
-    fs::dir_ls(type="file")
-  if(is.null(attr(x, "lazyfn"))) {
+    list.files(full.names = TRUE)
+
+  if(is.null(attr(x, "recordings"))) {
     if(length(res) == 0) {
       return(vector("character"))
     }
@@ -42,4 +43,4 @@ colnames.disk.frame <- function(x, ...) {
 #' @export
 colnames.default <- function(x, ...) {
   base::colnames(x, ...)
-}
\ No newline at end of file
+}
diff --git a/R/one-stage-verbs.R b/R/one-stage-verbs.R
index 210be4f0..27f62f2f 100644
--- a/R/one-stage-verbs.R
+++ b/R/one-stage-verbs.R
@@ -209,28 +209,20 @@ IQR_df.collected_agg.disk.frame <- function(listx, ...) {
 #' @rdname group_by
 #' @export
 summarise.grouped_disk.frame <- function(.data, ...) {
+  # get all components of the summarise
+  dotdotdot = rlang::enexprs(...)
+  # convert any quosure to labels
+  for (i in seq_along(dotdotdot)) {
+    if("quosure" %in% class(dotdotdot[[i]])) {
+      dotdotdot[[i]] <- rlang::sym(rlang::as_label(dotdotdot[[i]]))
+    }
+  }
 
-  ca_code = generate_summ_code(...)
+  class(.data) <- c("summarized_disk.frame", "disk.frame")
+  attr(.data, "summarize_code") = dotdotdot
 
-  if(is.null(names(ca_code))) {
-    return(eval(parse(text = glue::glue(".data %>% {rlang::as_label(ca_code)}"))))
-  } else if("chunk_summ_code" %in% names(ca_code)) {
-    chunk_summ_code = ca_code$chunk_summ_code
-    agg_summ_code = ca_code$agg_summ_code
-    
-    # get the by variables
-    group_by_cols = purrr::map_chr(attr(.data, "group_by_cols", exact=TRUE), ~{deparse(.x)})
-    
-    # generate full code
-    code_to_run = glue::glue("chunk_group_by({paste0(group_by_cols, collapse=',')}) %>% chunk_summarize({chunk_summ_code}) %>% collect %>% group_by({paste0(group_by_cols, collapse=',')}) %>% summarize({agg_summ_code})")
-    
-    class(.data) <- c("summarized_disk.frame", "disk.frame")
-    attr(.data, "summarize_code") = code_to_run
-    return(.data)
-  } else {
-    stop("something's wrong mate")
-  }
+  return(.data)
 }
 
 #' @export
@@ -245,137 +237,53 @@ summarize.grouped_disk.frame = summarise.grouped_disk.frame
 #' reorganizes the chunks by the shard key.
 #' @seealso hard_group_by
 #' @param .data a disk.frame
-#' @param add from dplyr
+#' @param .add from dplyr
 #' @param .drop from dplyr
 #' @param ... same as the dplyr::group_by
 #' @importFrom dplyr group_by_drop_default
+#' @importFrom rlang enexpr
 #' @export
 #' @rdname group_by
 # learning from https://docs.dask.org/en/latest/dataframe-groupby.html
-group_by.disk.frame <- function(.data, ..., add = FALSE, .drop = dplyr::group_by_drop_default(.data)) {
+group_by.disk.frame <- function(.data, ..., .add = FALSE, .drop = stop("disk.frame does not support `.drop` in `group_by` at this stage")) {
+  
   class(.data) <- c("grouped_disk.frame", "disk.frame")
-  attr(.data, "group_by_cols") = substitute(list(...))[-1]
+  
+  # using rlang is a necessary evil here as I need to deal with !!! that is supported by group_by etc
+  group_by_cols = rlang::enexprs(...)
+ + # convert any quosure to labels + for (i in seq_along(group_by_cols)) { + if("quosure" %in% class(group_by_cols[[i]])) { + group_by_cols[[i]] <- rlang::sym(rlang::as_label(group_by_cols[[i]])) + } + } + + + attr(.data, "group_by_cols") = group_by_cols + .data } + #' @export #' @importFrom dplyr summarize #' @rdname group_by summarize.disk.frame <- function(.data, ...) { - ca_code = generate_summ_code(...) + # get all components of the summarise + dotdotdot = rlang::enexprs(...) - if(is.null(names(ca_code))) { - return(eval(parse(text = glue::glue(".data %>% {rlang::as_label(ca_code)}")))) - } else if("chunk_summ_code" %in% names(ca_code)) { - chunk_summ_code = ca_code$chunk_summ_code - agg_summ_code = ca_code$agg_summ_code - - # generate full code - code_to_run = glue::glue("chunk_summarize({chunk_summ_code}) %>% collect %>% summarize({agg_summ_code})") - - class(.data) <- c("summarized_disk.frame", "disk.frame") - attr(.data, "summarize_code") = code_to_run - return(.data ) - } else { - stop("something's wrong") - } -} - -#' Helper function to generate summarisation code -#' @importFrom data.table setDT setkey -#' @importFrom utils methods -#' @noRd -generate_summ_code <- function(...) { - # expand the code - code_to_expand = glue::glue("quo(summarise({rlang::as_label(substitute(...))}))") - - summ_code_quosure = eval(parse(text = code_to_expand)) - #print(summ_code_quosure) + # convert any quosure to labels + for (i in seq_along(dotdotdot)) { + if("quosure" %in% class(dotdotdot[[i]])) { + dotdotdot[[i]] <- rlang::sym(rlang::as_label(dotdotdot[[i]])) + } + } - # ZJ: - # try the traditional route which can't deal with !!!, so if this fails then try the !!! route - tryCatch({ - code = substitute(list(...))[-1] - # print("hehe") - # print(code) - expr_id = 0 - temp_varn = 0 - - list_of_chunk_agg_fns <- as.character(utils::methods(class = "chunk_agg.disk.frame")) - list_of_collected_agg_fns <- as.character(utils::methods(class = "collected_agg.disk.frame")) - # browser() - # generate the chunk_summarize_code - summarize_code = purrr::map_dfr(code, ~{ - # print("raw code") - # print(.x) - expr_id <<- expr_id + 1 - # parse the function into table form for easy interrogration - # The keep.source = TRUE options seems necessary to keep it working in Rscript mode - gpd = getParseData(parse(text = deparse(.x), keep.source = TRUE), includeText = TRUE); - # print("raw table") - # print(deparse(.x)) - # print(gpd) - grp_funcs = gpd %>% filter(token == "SYMBOL_FUNCTION_CALL") %>% select(text) %>% pull - grp_funcs = grp_funcs %>% paste0("_df") - - # search in the space to find functions name `fn`.chunk_agg.disk.frame - # only allow one such functions for now TODO improve it - num_of_chunk_functions = sum(sapply(unique(grp_funcs), function(x) exists(paste0(x, ".chunk_agg.disk.frame")))) - num_of_collected_functions= sum(sapply(unique(grp_funcs), function(x) exists(paste0(x, ".collected_agg.disk.frame")))) - - # the number chunk and aggregation functions must match - stopifnot(num_of_chunk_functions == num_of_collected_functions) - - # keep only grp_functions - grp_funcs= grp_funcs[sapply(grp_funcs, function(x) exists(paste0(x, ".chunk_agg.disk.frame")))] - - if(num_of_chunk_functions == 0) { - stop(sprintf("There must be at least one summarization function in %s", deparse(.x))) - } else if (num_of_chunk_functions > 1) { - stop(sprintf("Two or more summarisation functions are detected in \n\n```\n%s\n```\n\nThese are currently not supported by {disk.frame} at the moment \n * Nestling (like mean(sum(x) 
+ y)) or \n * combinations (like sum(x) + mean(x))\n\nIf you want this implemented, please leave a comment or upvote at: https://github.com/xiaodaigh/disk.frame/issues/228 \n\n", deparse(.x))) - } - - # check to see if the mean is only two from parent 0, otherwise it would a statement in the form of 1 + mean(x) - # which isn't supported - data.table::setDT(gpd) - data.table::setkey(gpd, parent) - if (gpd[id == gpd[id == gpd[(paste0(text,"_df") == grp_funcs) & (token == "SYMBOL_FUNCTION_CALL"), parent], parent], parent] != 0) { - stop(sprintf("Combining summarization with other operations \n\n```\n%s\n```\n\nThese are currently not supported by {disk.frame} at the moment \n * combinations (like sum(x) + 1)\n* combinations (like list(sum(x)))\n\nIf you want this implemented, please leave a comment or upvote at: https://github.com/xiaodaigh/disk.frame/issues/228 \n\n", deparse(.x))) - } - - temp_varn <<- temp_varn + 1 - grp_funcs_wo_df = sapply(grp_funcs, function(grp_func) substr(grp_func, 1, nchar(grp_func)-3)) - - tmpcode = deparse(evalparseglue("substitute({deparse(.x)}, list({grp_funcs_wo_df} = quote({grp_funcs}.chunk_agg.disk.frame)))")) %>% paste0(collapse = " ") - - chunk_code = data.frame(assign_to = as.character(glue::glue("tmp{temp_varn}")), expr = tmpcode, stringsAsFactors = FALSE) - - chunk_code$orig_code = deparse(.x) - chunk_code$expr_id = expr_id - chunk_code$grp_fn = grp_funcs - chunk_code$name = ifelse(is.null(names(code[expr_id])), "", names(code[expr_id])) - - # create the aggregation code - chunk_code$agg_expr = as.character(glue::glue("{grp_funcs}.collected_agg.disk.frame({paste0(chunk_code$assign_to, collapse=', ')})")) - - #print(sapply(chunk_code, typeof)) - chunk_code - }) - - chunk_summ_code = paste0(summarize_code$assign_to, "=list(", summarize_code$expr, ")") %>% paste0(collapse = ", ") - - agg_code_df = summarize_code %>% - select(expr_id, name, agg_expr, orig_code) %>% - unique %>% - transmute(agg_code = paste0(ifelse(name == "", paste0("`", orig_code, "` = "), paste0(name, "=")), agg_expr)) - - agg_summ_code = paste0(agg_code_df$agg_code, collapse = ",") - - return(list(chunk_summ_code = chunk_summ_code, agg_summ_code = agg_summ_code)) - }, error = function(e) { - return(summ_code_quosure) - }) + class(.data) <- c("summarized_disk.frame", "disk.frame") + attr(.data, "summarize_code") = dotdotdot + return(.data) } @@ -383,8 +291,3 @@ generate_summ_code <- function(...) { #' @importFrom dplyr summarize #' @rdname group_by summarise.disk.frame <- summarize.disk.frame - - - - - diff --git a/R/play.r b/R/play.r new file mode 100644 index 00000000..b5ee9fe7 --- /dev/null +++ b/R/play.r @@ -0,0 +1,15 @@ +#' Play the recorded lazy operations +#' @param dataframe A data.frame +#' @param recordings A recording the expression, globals and packages using create_chunk_mapper +play <- function(dataframe, recordings) { + for(recording in recordings) { + tmp_env = list2env(recording$globals) + + # replace .disk.frame.chunk with dataframe in the function + code = eval(bquote(substitute(.(recording$expr), list(.disk.frame.chunk=quote(dataframe))))) + + # execute the delayed function + dataframe = eval(code, envir = tmp_env) + } + dataframe +} \ No newline at end of file diff --git a/R/print.disk.frame.r b/R/print.disk.frame.r index fe739059..5ff81516 100644 --- a/R/print.disk.frame.r +++ b/R/print.disk.frame.r @@ -7,13 +7,22 @@ #' @importFrom glue glue # TODO add chunk print.disk.frame <- function(x, ...) 
{ - a = paste(sep = "\n" - ,glue::glue("path: \"{attr(x,'path', exact=TRUE)}\"") - ,glue::glue("nchunks: {disk.frame::nchunks(x)}") - ,glue::glue("nrow (at source): {disk.frame::nrow(x)}") - ,glue::glue("ncol (at source): {disk.frame::ncol(x)}") - ,glue::glue("nrow (post operations): ???") - ,glue::glue("ncol (post operations): ???\n") - ) + if (is.null(attr(x, "recordings"))) { + a = paste(sep = "\n" + ,glue::glue("path: \"{attr(x,'path', exact=TRUE)}\"") + ,glue::glue("nchunks: {disk.frame::nchunks(x)}") + ,glue::glue("nrow (at source): {disk.frame::nrow(x)}") + ,glue::glue("ncol (at source): {disk.frame::ncol(x)}") + ) + } else { + a = paste(sep = "\n" + ,glue::glue("path: \"{attr(x,'path', exact=TRUE)}\"") + ,glue::glue("nchunks: {disk.frame::nchunks(x)}") + ,glue::glue("nrow (at source): {disk.frame::nrow(x)}") + ,glue::glue("ncol (at source): {disk.frame::ncol(x)}") + ,glue::glue("nrow (post operations): ???") + ,glue::glue("ncol (post operations): ???\n") + ) + } message(a) } diff --git a/R/rbindlist.disk.frame.r b/R/rbindlist.disk.frame.r index 7bbdd19f..87850ad1 100644 --- a/R/rbindlist.disk.frame.r +++ b/R/rbindlist.disk.frame.r @@ -5,7 +5,7 @@ #' @param parallel if TRUE then bind multiple disk.frame simultaneously, Defaults to TRUE #' @param compress 0-100, 100 being the highest compression rate. #' @param overwrite overwrite the output directory -#' @param .progress A logical, for whether or not to print a progress bar for multiprocess, multisession, and multicore plans. From {furrr} +#' @param .progress A logical, for whether or not to show progress. #' @import fs #' @importFrom data.table data.table setDT #' @importFrom future.apply future_lapply diff --git a/R/rechunk.r b/R/rechunk.r index 142b0576..9b79ba80 100644 --- a/R/rechunk.r +++ b/R/rechunk.r @@ -4,9 +4,6 @@ #' @param shardby the shardkeys #' @param outdir the output directory #' @param overwrite overwrite the output directory -#' @param shardby_function splitting of chunks: "hash" for hash function or "sort" for semi-sorted chunks -#' @param sort_splits for the "sort" shardby function, a dataframe with the split values. -#' @param desc_vars for the "sort" shardby function, the variables to sort descending. 
#' @export
#' @examples
#' # create a disk.frame with 2 chunks in tempdir()
@@ -22,9 +19,8 @@
 #' # clean up cars.df
 #' delete(cars.df)
 #' delete(cars2.df)
-rechunk <- function(df, nchunks, outdir = attr(df, "path", exact=TRUE), shardby = NULL, overwrite = TRUE, shardby_function="hash", sort_splits=NULL, desc_vars=NULL) {
-
-  # we need to force the chunks to be computed first as it's common to make nchunks a multiple of chunks(df)
+rechunk <- function(df, nchunks = disk.frame::nchunks(df), outdir = attr(df, "path", exact=TRUE), shardby = NULL, overwrite = TRUE) {
+  # we need to force the chunks to be computed first as it's common to make nchunks a multiple of chunks(df)
   # but if we do it too late then the folder could be empty
   force(nchunks)
 
@@ -52,9 +48,9 @@ rechunk <- function(df, nchunks, outdir = attr(df, "path", exact=TRUE), shardby
   short_files = dir(outdir)
 
   # move all files to the back up folder
-  purrr::map(full_files, ~{
-    fs::file_move(.x, back_up_tmp_dir)
-  })
+  for(file in full_files) {
+    fs::file_move(file, back_up_tmp_dir)
+  }
 
   if(fs::dir_exists(file.path(outdir, ".metadata"))) {
     fs::dir_delete(file.path(outdir, ".metadata"))
@@ -76,9 +72,15 @@ rechunk <- function(df, nchunks, outdir = attr(df, "path", exact=TRUE), shardby
     shardby = existing_shardkey[[1]]
   }
 
   if(user_had_set_shard_by) {
-    return(hard_group_by(df, shardby, nchunks = nchunks, outdir = outdir, overwrite = TRUE, shardby_function=shardby_function, sort_splits=sort_splits, desc_vars=desc_vars))
+    tmp = cmap(df, ~{
+      shard(.x, shardby, nchunks=nchunks, overwrite=FALSE)
+    }) %>% collect_list
+
+    return(
+      rbindlist.disk.frame(tmp)
+    )
   } else if (identical(shardby, "") | is.null(shardby)) {
     # if no existing shardby
     nr = nrow(df)
@@ -138,9 +139,9 @@ rechunk <- function(df, nchunks, outdir = attr(df, "path", exact=TRUE), shardby
   tmp_fdlr = tempfile("rechunk_shard")
   fs::dir_create(tmp_fdlr)
 
-  oks = furrr::future_map(which(lp == 1), function(i) {
-    file_chunk = file.path(attr(df, "path", exact=TRUE), i %>% paste0(".fst"))
-    fs::file_move(file_chunk, file.path(tmp_fdlr, possibles_new_chunk_id[[i]] %>% paste0(".fst")))
+  oks = future.apply::future_lapply(which(lp == 1), function(i) {
+    file_chunk = file.path(attr(df, "path", exact=TRUE), paste0(i, ".fst"))
+    fs::file_move(file_chunk, file.path(tmp_fdlr, paste0(possibles_new_chunk_id[[i]], ".fst")))
     disk.frame(tmp_fdlr)
   })
 
diff --git a/R/recommend_nchunks.r b/R/recommend_nchunks.r
index e5be6ac0..2b987600 100644
--- a/R/recommend_nchunks.r
+++ b/R/recommend_nchunks.r
@@ -85,8 +85,6 @@ df_ram_size <- function() {
       message(system("wmic MemoryChip get Capacity", intern=TRUE))
       message("")
       message("")
-      #message("The option disk.frame.ram_size is not set. 
-      #message("To set the ram_size, do options(disk.frame_ram_size = your_ram_size_in_gigabytes)")
       ram_size = 16
     }
   }
diff --git a/R/semi_join.r b/R/semi_join.r
index fb46350f..f462efac 100644
--- a/R/semi_join.r
+++ b/R/semi_join.r
@@ -17,11 +17,11 @@ semi_join.disk.frame <- function(x, y, by=NULL, copy=FALSE, ..., outdir = tempfi
   overwrite_check(outdir, overwrite)
 
   if("data.frame" %in% class(y)) {
-    quo_dotdotdot = enquos(...)
-    cmap_dfr(x, ~{
-      code = quo(semi_join(.x, y, by = by, copy = copy, !!!quo_dotdotdot))
-      rlang::eval_tidy(code)
+    tmp = cmap_dfr(x, ~{
+      semi_join(.x, y, by = by, copy = copy, ...)
     }, .progress = .progress)
+
+    return(tmp)
   } else if("disk.frame" %in% class(y)) {
     if(is.null(merge_by_chunk_id)) {
       stop("both x and y are disk.frames. 
You need to specify merge_by_chunk_id = TRUE or FALSE explicitly") @@ -34,8 +34,8 @@ semi_join.disk.frame <- function(x, y, by=NULL, copy=FALSE, ..., outdir = tempfi ncy = nchunks(y) if (merge_by_chunk_id == FALSE) { warning("merge_by_chunk_id = FALSE. This will take significantly longer and the preparations needed are performed eagerly which may lead to poor performance. Consider making y a data.frame or set merge_by_chunk_id = TRUE for better performance.") - x = hard_group_by(x, by, nchunks = max(ncy,ncx), overwrite = TRUE) - y = hard_group_by(y, by, nchunks = max(ncy,ncx), overwrite = TRUE) + x = rechunk(x, by, nchunks = max(ncy,ncx), outdir=tempfile(fileext = ".jdf"), overwrite = FALSE) + y = rechunk(y, by, nchunks = max(ncy,ncx), outdir=tempfile(fileext = ".jdf"), overwrite = FALSE) return(semi_join.disk.frame(x, y, by, copy = copy, outdir = outdir, merge_by_chunk_id = TRUE, overwrite = overwrite, .progress = .progress)) } else if ((identical(shardkey(x)$shardkey, "") & identical(shardkey(y)$shardkey, "")) | identical(shardkey(x), shardkey(y))) { res = cmap2.disk.frame(x, y, ~{ diff --git a/R/shard.r b/R/shard.r index e3b9285c..42a94946 100644 --- a/R/shard.r +++ b/R/shard.r @@ -4,9 +4,6 @@ #' @param nchunks The number of chunks #' @param outdir The output directory of the disk.frame #' @param overwrite If TRUE then the chunks are overwritten -#' @param shardby_function splitting of chunks: "hash" for hash function or "sort" for semi-sorted chunks -#' @param sort_splits If shardby_function is "sort", the split values for sharding -#' @param desc_vars for the "sort" shardby function, the variables to sort descending. #' @param ... not used #' @importFrom data.table setDT #' @importFrom glue glue @@ -18,32 +15,33 @@ #' #' # clean up cars.df #' delete(iris.df) -shard <- function(df, shardby, outdir = tempfile(fileext = ".df"), ..., nchunks = recommend_nchunks(df), overwrite = FALSE, shardby_function="hash", sort_splits=NULL, desc_vars=NULL) { +shard <- function(df, shardby, outdir = tempfile(fileext = ".df"), ..., nchunks = recommend_nchunks(df), overwrite = FALSE) { force(nchunks) overwrite_check(outdir, overwrite) - stopifnot(shardby_function %in% c("hash", "sort")) + # stopifnot(shardby_function %in% c("hash", "sort")) if("data.frame" %in% class(df)) { data.table::setDT(df) - if(shardby_function == "hash"){ - message("Hashing...") - if(length(shardby) == 1) { - code = glue::glue("df[,.out.disk.frame.id := hashstr2i(as.character({shardby}), nchunks)]") - } else { - shardby_list = glue::glue("paste0({paste0(sort(shardby),collapse=',')})") - code = glue::glue("df[,.out.disk.frame.id := hashstr2i({shardby_list}, nchunks)]") - } - } else if(shardby_function == "sort"){ - if(nchunks == 1){ - message("Only one chunk: set .out.disk.frame.id = 0") - code = glue::glue("df[,.out.disk.frame.id := 0]") - } else { - shard_by_rule <- sortablestr2i(sort_splits, desc_vars) - # message(shard_by_rule) - setDT(df) - code = glue::glue("df[,.out.disk.frame.id := {shard_by_rule}]") - } + # if(shardby_function == "hash"){ + # message("Hashing...") + if(length(shardby) == 1) { + # TODO rewrite + code = glue::glue("df[,.out.disk.frame.id := hashstr2i(as.character({shardby}), nchunks)]") + } else { + shardby_list = glue::glue("paste0({paste0(sort(shardby),collapse=',')})") + code = glue::glue("df[,.out.disk.frame.id := hashstr2i({shardby_list}, nchunks)]") } + # } else if(shardby_function == "sort"){ + # if(nchunks == 1){ + # message("Only one chunk: set .out.disk.frame.id = 0") + # code = 
glue::glue("df[,.out.disk.frame.id := 0]") + # } else { + # shard_by_rule <- sortablestr2i(sort_splits, desc_vars) + # # message(shard_by_rule) + # setDT(df) + # code = glue::glue("df[,.out.disk.frame.id := {shard_by_rule}]") + # } + # } tryCatch( eval(parse(text=code)), @@ -54,11 +52,11 @@ shard <- function(df, shardby, outdir = tempfile(fileext = ".df"), ..., nchunks stopifnot(".out.disk.frame.id" %in% names(df)) - res = write_disk.frame(df, outdir = outdir, nchunks = nchunks, overwrite = TRUE, shardby = shardby, shardchunks = nchunks, shardby_function=shardby_function, sort_splits=sort_splits, desc_vars=desc_vars) + res = write_disk.frame(df, outdir = outdir, nchunks = nchunks, overwrite = TRUE, shardby = shardby, shardchunks = nchunks) return(res) } else if ("disk.frame" %in% class(df)){ nchunks_rechunk = nchunks - return(rechunk(df, shardby = shardby, nchunks = nchunks_rechunk, outdir = outdir, overwrite = TRUE, shardby_function=shardby_function, sort_splits=sort_splits, desc_vars=desc_vars)) + return(rechunk(df, shardby = shardby, nchunks = nchunks_rechunk, outdir = outdir, overwrite = TRUE)) } } diff --git a/R/srckeep.disk.frame.r b/R/srckeep.disk.frame.r index 54aef73a..1cfc1b2a 100644 --- a/R/srckeep.disk.frame.r +++ b/R/srckeep.disk.frame.r @@ -21,11 +21,11 @@ srckeep <- function(diskf, selections, ...) { #' @param chunks The chunks to load #' @rdname srckeep #' @export -srckeepchunks <- function(diskf, chunks, ...) { - stopifnot("disk.frame" %in% class(diskf)) - # TODO relax this - stopifnot(is.integer(chunks)) - - attr(df,"keep_chunks") = chunks - diskf -} +# srckeepchunks <- function(diskf, chunks, ...) { +# stopifnot("disk.frame" %in% class(diskf)) +# # TODO relax this +# stopifnot(is.integer(chunks)) +# +# attr(df,"keep_chunks") = chunks +# diskf +# } diff --git a/R/util.r b/R/util.r index 9fabce9e..8b760f0f 100644 --- a/R/util.r +++ b/R/util.r @@ -24,4 +24,24 @@ gen_datatable_synthetic <- function(N=2e8, K=100) { v3 = sample(round(runif(100,max=100),4), N, TRUE), # numeric e.g. 23.5749 date1 = sample(seq(as.Date('1970-01-01'), as.Date('2019-01-01'), by = "day"), N, TRUE) # date ) -} \ No newline at end of file +} + +#' Used to convert a function to purrr syntax if needed +#' @param .f a normal function or purrr syntax function i.e. `~{ ...code...}` +#' @importFrom purrr as_mapper +purrr_as_mapper <- function(.f) { + if(typeof(.f) == "language") { + if(requireNamespace("purrr")) { + .f = purrr::as_mapper(.f) + } else { + code = paste0(deparse(substitute(.f)), collapse = "") + stop( + sprintf( + "in cmap(.x, %s), it appears you are using {purrr} syntax but do not have {purrr} installed. Try `install.packages('purrr')`", + code + ) + ) + } + } + return(.f) +} diff --git a/R/write_disk.frame.r b/R/write_disk.frame.r index ceab7c15..cdfd0c36 100644 --- a/R/write_disk.frame.r +++ b/R/write_disk.frame.r @@ -2,7 +2,7 @@ #' @description #' Write a data.frame/disk.frame to a disk.frame location. 
If df is a data.frame #' then using the as.disk.frame function is recommended for most cases -#' @param df a disk.frame +#' @param diskf a disk.frame #' @param outdir output directory for the disk.frame #' @param nchunks number of chunks #' @param overwrite overwrite output directory @@ -26,12 +26,12 @@ #' delete(cars.df) #' delete(cars2.df) write_disk.frame <- function( - df, + diskf, outdir = tempfile(fileext = ".df"), nchunks = ifelse( - "disk.frame"%in% class(df), - nchunks.disk.frame(df), - recommend_nchunks(df)), + "disk.frame"%in% class(diskf), + nchunks.disk.frame(diskf), + recommend_nchunks(diskf)), overwrite = FALSE, shardby=NULL, compress = 50, shardby_function="hash", sort_splits=NULL, desc_vars=NULL, ...) { @@ -40,16 +40,30 @@ write_disk.frame <- function( if(is.null(outdir)) { - stop("outdir must not be NULL") + stop("write_disk.frame error: outdir must not be NULL") } - if(is_disk.frame(df)) { + if(is_disk.frame(diskf)) { if(is.null(shardby)) { - cmap.disk.frame(df, ~.x, outdir = outdir, lazy = FALSE, ..., compress = compress, overwrite = TRUE) + path = attr(diskf, "path") + files_shortname <- list.files(path) + cids = get_chunk_ids(diskf, full.names = T, strip_extension = F) + + future.apply::future_lapply(1:length(cids), function(ii, ...) { + chunk = get_chunk(diskf, cids[ii], full.names = TRUE) + if(nrow(chunk) == 0) { + warning(sprintf("The output chunk has 0 row, therefore chunk %d NOT written", ii)) + } else { + out_chunk_name = file.path(outdir, files_shortname[ii]) + fst::write_fst(chunk, out_chunk_name, compress) + return(files_shortname) + } + return(NULL) + }, ..., future.seed = TRUE) + return(disk.frame(outdir)) } else { # TODO really inefficient - #df2 = cmap.disk.frame(df, ~.x, outdir = outdir, lazy = FALSE, ..., compress = compress, overwrite = TRUE) - shard(df, + shard(diskf, outdir = outdir, nchunks = nchunks, overwrite = TRUE, @@ -60,9 +74,9 @@ write_disk.frame <- function( ... ) } - } else if ("data.frame" %in% class(df)) { - if(".out.disk.frame.id" %in% names(df)) { - df[,{ + } else if ("data.frame" %in% class(diskf)) { + if(".out.disk.frame.id" %in% names(diskf)) { + diskf[,{ if (base::nrow(.SD) > 0) { list_columns = purrr::map_lgl(.SD, is.list) if(any(list_columns)){ @@ -70,17 +84,17 @@ write_disk.frame <- function( } else { fst::write_fst(.SD, file.path(outdir, paste0(.BY, ".fst")), compress = compress) NULL - } } + } NULL }, .out.disk.frame.id] res = disk.frame(outdir) add_meta(res, shardkey = shardby, shardchunks = nchunks, compress = compress) } else { - as.disk.frame(df, outdir = outdir, nchunks = nchunks, overwrite = TRUE, shardby = shardby, compress = compress, ...) + as.disk.frame(diskf, outdir = outdir, nchunks = nchunks, overwrite = TRUE, shardby = shardby, compress = compress, ...) } } else { - stop("write_disk.frame error: df must be a disk.frame or data.frame") + stop("write_disk.frame error: diskf must be a disk.frame or data.frame") } } diff --git a/R/zip_to_disk.frame.r b/R/zip_to_disk.frame.r index 5418756e..5aacdd5b 100644 --- a/R/zip_to_disk.frame.r +++ b/R/zip_to_disk.frame.r @@ -5,7 +5,7 @@ #' @param ... passed to fread #' @param validation.check should the function perform a check at the end to check for validity of output. 
It can detect issues with conversion #' @param overwrite overwrite output directory -#' @import fst fs +#' @import fst #' @importFrom glue glue #' @importFrom future.apply future_lapply #' @importFrom utils unzip @@ -33,18 +33,18 @@ zip_to_disk.frame = function(zipfile, outdir, ..., validation.check = FALSE, overwrite = TRUE) { files = unzip(zipfile, list=TRUE) - fs::dir_create(outdir) + if(!dir.exists(outdir)) { + dir.create(outdir) + } tmpdir = tempfile(pattern = "tmp_zip2csv") - + dotdotdots = list(...) - - dfs = future.apply::future_lapply(files$Name, function(fn) { - #dfs = lapply(files$Name, function(fn) { + dfs = future.apply::future_lapply(files$Name, function(fn, ...) { outdfpath = file.path(outdir, fn) overwrite_check(outdfpath, TRUE) unzip(zipfile, files = fn, exdir = tmpdir) - + # lift the domain of csv_to_disk.frame so it accepts a list cl = purrr::lift(csv_to_disk.frame) @@ -55,15 +55,14 @@ zip_to_disk.frame = function(zipfile, outdir, ..., validation.check = FALSE, ove #csv_to_disk.frame(, outdfpath, overwrite = overwrite, ...) cl(ok) }, future.seed=TRUE) - dfs } -#' `validate_zip_to_disk.frame` is used to validate and auto-correct read and convert every single file within the zip file to df format +#' `validate_zip_to_disk.frame` is used to validate and auto-correct read and convert every single file within the zip file to disk.frame format #' @importFrom glue glue #' @importFrom utils unzip #' @importFrom data.table timetaken fread -#' @import fst +#' @importFrom fst read_fst #' @rdname zip_to_disk.frame #' @noRd validate_zip_to_disk.frame = function(zipfile, outdir) { @@ -86,14 +85,14 @@ validate_zip_to_disk.frame = function(zipfile, outdir) { # read it and if it errors then the file might be corrupted, so # read it again and write again pt = proc.time() - read_fst(out_fst_file, as.data.table = TRUE) + fst::read_fst(out_fst_file, as.data.table = TRUE) message(paste0("checking(read): ", timetaken(pt))); pt = proc.time() }, error = function(e) { message(e) pt = proc.time() unzip(zipfile, files = fn, exdir = tmpdir) message(paste0("unzip: ", timetaken(pt))); pt = proc.time() - write_fst(fread(file.path(tmpdir, fn)), out_fst_file,100) + fst::write_fst(data.table::fread(file.path(tmpdir, fn)), out_fst_file,100) message(paste0("read: ", timetaken(pt))) unlink(file.path(tmpdir, fn)) gc() @@ -106,7 +105,7 @@ validate_zip_to_disk.frame = function(zipfile, outdir) { pt = proc.time() unzip(zipfile, files = fn, exdir = tmpdir) message(paste0("unzip: ", timetaken(pt))); pt = proc.time() - write_fst(fread(file.path(tmpdir, fn)), out_fst_file,100) + fst::write_fst(data.table::fread(file.path(tmpdir, fn)), out_fst_file,100) message(paste0("read: ", timetaken(pt))) unlink(file.path(tmpdir, fn)) gc() diff --git a/README.md b/README.md index 8b8b1af9..2ca03010 100644 --- a/README.md +++ b/README.md @@ -190,7 +190,6 @@ library(nycflights13) # this will setup disk.frame's parallel backend with number of workers equal to the number of CPU cores (hyper-threaded cores are counted as one not two) setup_disk.frame() -#> The number of workers available for disk.frame is 6 # this allows large datasets to be transferred between sessions options(future.globals.maxSize = Inf) @@ -211,12 +210,15 @@ flights.df %>% filter(year == 2013) %>% mutate(origin_dest = paste0(origin, dest)) %>% head(2) -#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier -#> 1 2013 1 1 517 515 2 830 819 11 UA -#> 2 2013 1 1 533 529 4 850 830 20 UA -#> flight tailnum origin dest air_time 
distance hour minute time_hour origin_dest -#> 1 1545 N14228 EWR IAH 227 1400 5 15 2013-01-01 05:00:00 EWRIAH -#> 2 1714 N24211 LGA IAH 227 1416 5 29 2013-01-01 05:00:00 LGAIAH +#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time +#> 1: 2013 1 1 517 515 2 830 819 +#> 2: 2013 1 1 533 529 4 850 830 +#> arr_delay carrier flight tailnum origin dest air_time distance hour minute +#> 1: 11 UA 1545 N14228 EWR IAH 227 1400 5 15 +#> 2: 20 UA 1714 N24211 LGA IAH 227 1416 5 29 +#> time_hour origin_dest +#> 1: 2013-01-01 05:00:00 EWRIAH +#> 2: 2013-01-01 05:00:00 LGAIAH ``` ### Group-by @@ -313,7 +315,7 @@ To find out where the disk.frame is stored on disk: ``` r # where is the disk.frame stored attr(flights.df, "path") -#> [1] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpIlXNzn\\file568813b835a7.df" +#> [1] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpQH7obF\\file42d452c32907.df" ``` A number of data.frame functions are implemented for disk.frame @@ -321,19 +323,23 @@ A number of data.frame functions are implemented for disk.frame ``` r # get first few rows head(flights.df, 1) -#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier -#> 1: 2013 1 1 517 515 2 830 819 11 UA -#> flight tailnum origin dest air_time distance hour minute time_hour -#> 1: 1545 N14228 EWR IAH 227 1400 5 15 2013-01-01 05:00:00 +#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time +#> 1: 2013 1 1 517 515 2 830 819 +#> arr_delay carrier flight tailnum origin dest air_time distance hour minute +#> 1: 11 UA 1545 N14228 EWR IAH 227 1400 5 15 +#> time_hour +#> 1: 2013-01-01 05:00:00 ``` ``` r # get last few rows tail(flights.df, 1) -#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier -#> 1: 2013 9 30 NA 840 NA NA 1020 NA MQ -#> flight tailnum origin dest air_time distance hour minute time_hour -#> 1: 3531 N839MQ LGA RDU NA 431 8 40 2013-09-30 08:00:00 +#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time +#> 1: 2013 9 30 NA 840 NA NA 1020 +#> arr_delay carrier flight tailnum origin dest air_time distance hour minute +#> 1: NA MQ 3531 N839MQ LGA RDU NA 431 8 40 +#> time_hour +#> 1: 2013-09-30 08:00:00 ``` ``` r diff --git a/book/02-intro-disk-frame.Rmd b/book/02-intro-disk-frame.Rmd index 79428c3d..8d50bc8b 100644 --- a/book/02-intro-disk-frame.Rmd +++ b/book/02-intro-disk-frame.Rmd @@ -168,7 +168,7 @@ mutate(flights.df, speed = distance / air_time * 60) %>% collect %>% head(2) ### Examples of NOT fully supported `dplyr` verbs -The `chunk_arrange` function arranges (sorts) each chunk but not the whole dataset. So use with caution. Similarly `chunk_summarise` creates summary variables within each chunk and hence also needs to be used with caution. In the Group By section, we demonstrate how to use `summarise` in the `disk.frame` context correctly with `hard_group_by`s. +The `chunk_arrange` function arranges (sorts) each chunk but not the whole dataset. So use with caution. Similarly `chunk_summarise` creates summary variables within each chunk and hence also needs to be used with caution. 
```{r}
# this only sorts within each chunk
chunk_arrange(flights.df, dplyr::desc(dep_delay)) %>% collect %>% head(2)
@@ -227,7 +227,7 @@ The `by` variables that were used to shard the dataset are called the `shardkey`
 
 ```{r}
 flights.df %>%
-  group_by(carrier) %>% # notice that hard_group_by needs to be set
+  group_by(carrier) %>% 
   summarize(count = n(), mean_dep_delay = mean(dep_delay, na.rm=T)) %>%  # mean follows normal R rules
   collect %>% 
   arrange(carrier)
@@ -256,11 +256,11 @@ left_join
 inner_join
 semi_join
 inner_join
-full_join # requires hard_group_by on both left and right
+full_join # requires rechunk on both left and right
 ```
 
-In all cases, the left dataset (`x`) must be a `disk.frame`, and the right dataset (`y`) can be either a `disk.frame` or a `data.frame`. If the right dataset is a `disk.frame` and the `shardkey`s are different between the two `disk.frame`s then two expensive `hard` `group_by` operations are performed *eagerly*, one on the left `disk.frame` and one on the right `disk.frame` to perform the joins correctly.
+In all cases, the left dataset (`x`) must be a `disk.frame`, and the right dataset (`y`) can be either a `disk.frame` or a `data.frame`. If the right dataset is a `disk.frame` and the `shardkey`s are different between the two `disk.frame`s then two expensive `rechunk` operations are performed *eagerly*, one on the left `disk.frame` and one on the right `disk.frame` to perform the joins correctly.
 
-However, if the right dataset is a `data.frame` then `hard_group_by`s are only performed in the case of `full_join`.
+However, if the right dataset is a `data.frame` then `rechunk`s are only performed in the case of `full_join`.
 
 Note `disk.frame` does not support `right_join` the user should use `left_join` instead.
 
@@ -287,33 +287,7 @@ flights.df %>%
 `{disk.frame}` supports all `data.frame` operations, unlike Spark which can only perform those operations that Spark has implemented. Hence windowing functions like `min_rank` and `rank` are supported out of the box.
 
-For the following example, we will use the `hard_group_by` which performs a group-by and also reorganises the chunks so that all records with the same `year`, `month`, and `day` end up in the same chunk. This is typically not advised, as `hard_group_by` can be slow for large datasets.
-
-```{r}
-# Find the most and least delayed flight each day
-bestworst <- flights.df %>%
-   srckeep(c("year","month","day", "dep_delay")) %>%
-   hard_group_by(c("year", "month", "day")) %>%
-   filter(dep_delay == min(dep_delay, na.rm = T) || dep_delay == max(dep_delay, na.rm = T)) %>%
-   collect
-   
-bestworst %>% head
-```
-
-another example
-
-```{r}
-ranked <- flights.df %>%
-  srckeep(c("year","month","day", "dep_delay")) %>%
-  hard_group_by(c("year", "month", "day")) %>%
-  filter(min_rank(desc(dep_delay)) <= 2 & dep_delay > 0) %>%
-  collect
-
-ranked %>% head
-```
-
-one more example
-
+For example:
 ```{r}
 # Rank each flight within a daily
 ranked <- flights.df %>%
@@ -329,22 +303,22 @@ ranked %>% head
 
 ## Arbitrary by-chunk processing
 
-One can apply arbitrary transformations to each chunk of the `disk.frame` by using the `delayed` function which evaluates lazily or the `map.disk.frame(lazy = F)` function which evaluates eagerly. For example to return the number of rows in each chunk
+One can apply arbitrary transformations to each chunk of the `disk.frame` by using the `delayed` function which evaluates lazily or the `cmap.disk.frame(lazy = F)` function which evaluates eagerly. 
For example to return the number of rows in each chunk ```{r} flights.df1 <- delayed(flights.df, ~nrow(.x)) collect_list(flights.df1) %>% head # returns number of rows for each data.frame in a list ``` -and to do the same with `map.disk.frame` +and to do the same with `cmap.disk.frame` ```{r} -map(flights.df, ~nrow(.x), lazy = F) %>% head +cmap(flights.df, ~nrow(.x), lazy = F) %>% head ``` -The `map` function can also output the results to another disk.frame folder, e.g. +The `cmap` function can also output the results to another disk.frame folder, e.g. ```{r} # return the first 10 rows of each chunk -flights.df2 <- map(flights.df, ~.x[1:10,], lazy = F, outdir = file.path(tempdir(), "tmp2"), overwrite = T) +flights.df2 <- cmap(flights.df, ~.x[1:10,], lazy = F, outdir = file.path(tempdir(), "tmp2"), overwrite = T) flights.df2 %>% head ``` @@ -369,7 +343,7 @@ write_disk.frame(flights.df, outdir="out") this will output a disk.frame to the folder "out" ```{r cleanup, include=FALSE} -fs::dir_delete(file.path(tempdir(), "tmp_flights.df")) -fs::dir_delete(file.path(tempdir(), "tmp2")) -fs::file_delete(file.path(tempdir(), "tmp_flights.csv")) +# fs::dir_delete(file.path(tempdir(), "tmp_flights.df")) +# fs::dir_delete(file.path(tempdir(), "tmp2")) +# fs::file_delete(file.path(tempdir(), "tmp_flights.csv")) ``` diff --git a/docs/404.html b/docs/404.html index 78100fa4..9ccac74e 100644 --- a/docs/404.html +++ b/docs/404.html @@ -32,7 +32,7 @@ disk.frame - 0.5.0 + 0.6.0 diff --git a/docs/LICENSE-text.html b/docs/LICENSE-text.html index abba5cc8..5757cc4b 100644 --- a/docs/LICENSE-text.html +++ b/docs/LICENSE-text.html @@ -17,7 +17,7 @@ disk.frame - 0.5.0 + 0.6.0 diff --git a/docs/articles/01-intro.html b/docs/articles/01-intro.html index 584809c3..af353427 100644 --- a/docs/articles/01-intro.html +++ b/docs/articles/01-intro.html @@ -33,7 +33,7 @@ disk.frame - 0.5.0 + 0.6.0 diff --git a/docs/articles/02-intro-disk-frame.html b/docs/articles/02-intro-disk-frame.html index fec5669f..5526ed10 100644 --- a/docs/articles/02-intro-disk-frame.html +++ b/docs/articles/02-intro-disk-frame.html @@ -33,7 +33,7 @@ disk.frame - 0.5.0 + 0.6.0 @@ -211,12 +211,13 @@

Creating a disk.frame from
 csv_path, outdir = df_path, in_chunk_size = 100000)
-#> Warning: UNRELIABLE VALUE: Future ('<none>') unexpectedly generated random
-#> numbers without specifying argument 'seed'. There is a risk that those random
-#> numbers are not statistically sound and the overall results might be invalid.
-#> To fix this, specify 'seed=TRUE'. This ensures that proper, parallel-safe random
-#> numbers are produced via the L'Ecuyer-CMRG method. To disable this check, use
-#> 'seed=NULL', or set option 'future.rng.onMisuse' to "ignore".
+#> Warning: UNRELIABLE VALUE: One of the 'future.apply' iterations
+#> ('future_lapply-1') unexpectedly generated random numbers without declaring so.
+#> There is a risk that those random numbers are not statistically sound and the
+#> overall results might be invalid. To fix this, specify 'future.seed=TRUE'. This
+#> ensures that proper, parallel-safe random numbers are produced via the L'Ecuyer-
+#> CMRG method. To disable this check, use 'future.seed = NULL', or set option
+#> 'future.rng.onMisuse' to "ignore".
 flights.df
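A minimal sketch of the conversion call above, reusing the same csv_path and df_path names (both are hypothetical paths); in_chunk_size caps how many CSV rows are read per pass:

``` r
library(disk.frame)
setup_disk.frame()

csv_path = "flights.csv"                      # hypothetical input file
df_path = file.path(tempdir(), "flights.df")  # folder that will hold the chunks

# convert the CSV to a disk.frame, reading 100,000 rows at a time
flights.df = csv_to_disk.frame(csv_path, outdir = df_path, in_chunk_size = 100000)
```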

disk.frame also has a function zip_to_disk.frame that can convert every CSV in a zip file to disk.frames.
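A sketch of that zip workflow, assuming a hypothetical flights.zip whose entries are CSV files; zip_to_disk.frame converts each entry under outdir and returns the resulting disk.frames as a list:

``` r
zipped_dfs = zip_to_disk.frame("flights.zip", outdir = file.path(tempdir(), "flights_zip"))
```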

@@ -265,7 +266,7 @@

Simple dplyr verbs

Examples of NOT fully supported dplyr verbs

-

The chunk_arrange function arranges (sorts) each chunk but not the whole dataset. So use with caution. Similarly chunk_summarise creates summary variables within each chunk and hence also needs to be used with caution. In the Group By section, we demonstrate how to use summarise in the disk.frame context correctly with hard_group_bys.

+

The chunk_arrange function arranges (sorts) each chunk but not the whole dataset. So use with caution. Similarly chunk_summarise creates summary variables within each chunk and hence also needs to be used with caution.

 # this only sorts within each chunk
 chunk_arrange(flights.df, dplyr::desc(dep_delay)) %>% collect %>% head(2)
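To make the chunk_summarise caveat concrete, here is a sketch (the column choice is illustrative) contrasting a per-chunk summary with a whole-data one-stage summary:

``` r
# chunk_summarise computes one mean per chunk, so up to nchunks(flights.df) rows come back
flights.df %>%
  chunk_summarise(mean_delay = mean(dep_delay, na.rm = TRUE)) %>%
  collect

# the one-stage summarise returns a single global mean
flights.df %>%
  summarise(mean_delay = mean(dep_delay, na.rm = TRUE)) %>%
  collect
```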
@@ -342,7 +343,7 @@ 

Group-by
disk.frame implements the group_by operation with some caveats. In the disk.frame framework, only a set of functions are supported in summarize. However, more custom group-by functions can be defined by the user.

 flights.df %>%
-  group_by(carrier) %>% # notice that hard_group_by needs to be set
+  group_by(carrier) %>% 
   summarize(count = n(), mean_dep_delay = mean(dep_delay, na.rm=T)) %>%  # mean follows normal R rules
   collect %>% 
   arrange(carrier)
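The supported summarization functions follow the fn_df.chunk_agg.disk.frame / fn_df.collected_agg.disk.frame pairing used in R/one-stage-verbs.R (IQR_df is one such pair), so a custom function can be sketched the same way; mymean below is hypothetical and its signatures are assumed from that convention:

``` r
# per-chunk piece: the partial sums needed for a global mean
mymean_df.chunk_agg.disk.frame <- function(x, na.rm = FALSE) {
  list(sumx = sum(x, na.rm = na.rm), nx = sum(!is.na(x)))
}

# collected piece: listx holds one per-chunk result; combine them into the final value
mymean_df.collected_agg.disk.frame <- function(listx, ...) {
  sum(sapply(listx, function(l) l$sumx)) / sum(sapply(listx, function(l) l$nx))
}

# assumed usage, mirroring the carrier example above
flights.df %>%
  group_by(carrier) %>%
  summarize(avg_delay = mymean(dep_delay, na.rm = TRUE)) %>%
  collect
```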
@@ -406,9 +407,9 @@ 

Joins
 inner_join
 semi_join
 inner_join
-full_join # requires hard_group_by on both left and right

-

In all cases, the left dataset (x) must be a disk.frame, and the right dataset (y) can be either a disk.frame or a data.frame. If the right dataset is a disk.frame and the shardkeys are different between the two disk.frames then two expensive hard group_by operations are performed eagerly, one on the left disk.frame and one on the right disk.frame to perform the joins correctly.

-

However, if the right dataset is a data.frame then hard_group_bys are only performed in the case of full_join.

+full_join # requires rechunk on both left and right

+

In all cases, the left dataset (x) must be a disk.frame, and the right dataset (y) can be either a disk.frame or a data.frame. If the right dataset is a disk.frame and the shardkeys are different between the two disk.frames then two expensive rechunk operations are performed eagerly, one on the left disk.frame and one on the right disk.frame to perform the joins correctly.

+

However, if the right dataset is a data.frame then rechunks are only performed in the case of full_join.

Note disk.frame does not support right_join; the user should use left_join instead.

The below joins are performed lazily because airlines.dt is a data.table not a disk.frame:
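For instance, a sketch of one such lazy join (airlines ships with the nycflights13 package used throughout this vignette):

``` r
airlines.dt = data.table::as.data.table(nycflights13::airlines)

# y is a data.table, so the join is mapped over the chunks and stays lazy until collect
flights.df %>%
  left_join(airlines.dt, by = "carrier") %>%
  collect %>%
  head(2)
```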

@@ -471,41 +472,8 @@ 

Joins

Window functions and arbitrary functions

disk.frame supports all data.frame operations, unlike Spark which can only perform those operations that Spark has implemented. Hence windowing functions like min_rank and rank are supported out of the box.

-

For the following example, we will use the hard_group_by which performs a group-by and also reorganises the chunks so that all records with the same year, month, and day end up in the same chunk. This is typically not advised, as hard_group_by can be slow for large datasets.

+

For example:

-# Find the most and least delayed flight each day
-bestworst <- flights.df %>%
-   srckeep(c("year","month","day", "dep_delay")) %>%
-   hard_group_by(c("year", "month", "day")) %>%
-   filter(dep_delay == min(dep_delay, na.rm = T) || dep_delay == max(dep_delay, na.rm = T)) %>%
-   collect
-   
-bestworst %>% head
-#>    year month day dep_delay
-#> 1: 2013     2  21       301
-#> 2: 2013     2  21        -9
-#> 3: 2013     2  21        -1
-#> 4: 2013     2  21         2
-#> 5: 2013     2  21        -4
-#> 6: 2013     2  21        10
-

another example

-
-ranked <- flights.df %>%
-  srckeep(c("year","month","day", "dep_delay")) %>%
-  hard_group_by(c("year", "month", "day")) %>%
-  filter(min_rank(desc(dep_delay)) <= 2 & dep_delay > 0) %>%
-  collect
-
-ranked %>% head
-#>    year month day dep_delay
-#> 1: 2013     1   9      1301
-#> 2: 2013     1   9       253
-#> 3: 2013     1  10      1126
-#> 4: 2013     1  10       385
-#> 5: 2013     1  17       259
-#> 6: 2013     1  17       255
-

one more example

-
 # Rank each flight within a daily
 ranked <- flights.df %>%
   srckeep(c("year","month","day", "dep_delay")) %>%
@@ -526,8 +494,8 @@ 

Window functions and arbitrary

Arbitrary by-chunk processing

-

One can apply arbitrary transformations to each chunk of the disk.frame by using the delayed function which evaluates lazily or the map.disk.frame(lazy = F) function which evaluates eagerly. For example to return the number of rows in each chunk

-
+

One can apply arbitrary transformations to each chunk of the disk.frame by using the delayed function which evaluates lazily or the cmap.disk.frame(lazy = F) function which evaluates eagerly. For example to return the number of rows in each chunk

+
 flights.df1 <- delayed(flights.df, ~nrow(.x))
 collect_list(flights.df1) %>% head # returns number of rows for each data.frame in a list
 #> [[1]]
@@ -547,35 +515,14 @@ 

#>
#> [[6]]
#> [1] 56121

-

and to do the same with map.disk.frame

-
-map(flights.df, ~nrow(.x), lazy = F) %>% head
-#> Warning in map.disk.frame(flights.df, ~nrow(.x), lazy = F): map(df, ...) where
-#> df is a disk.frame has been deprecated. Please use cmap(df,...) instead
-#> [[1]]
-#> [1] 56131
-#> 
-#> [[2]]
-#> [1] 56131
-#> 
-#> [[3]]
-#> [1] 56131
-#> 
-#> [[4]]
-#> [1] 56131
-#> 
-#> [[5]]
-#> [1] 56131
-#> 
-#> [[6]]
-#> [1] 56121
-

The map function can also output the results to another disk.frame folder, e.g.

-
+

and to do the same with cmap.disk.frame

+
+cmap(flights.df, ~nrow(.x), lazy = F) %>% head
+#> [1] 6
+

The cmap function can also output the results to another disk.frame folder, e.g.

+
 # return the first 10 rows of each chunk
-flights.df2 <- map(flights.df, ~.x[1:10,], lazy = F, outdir = file.path(tempdir(), "tmp2"), overwrite = T)
-#> Warning in map.disk.frame(flights.df, ~.x[1:10, ], lazy = F, outdir =
-#> file.path(tempdir(), : map(df, ...) where df is a disk.frame has been
-#> deprecated. Please use cmap(df,...) instead
+flights.df2 <- cmap(flights.df, ~.x[1:10,], lazy = F, outdir = file.path(tempdir(), "tmp2"), overwrite = T)
 
 flights.df2 %>% head
 #>    year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
@@ -605,35 +552,35 @@ 

Sampling

In the disk.frame framework, sampling a proportion of rows within each chunk can be performed using sample_frac.

-
+
 flights.df %>% sample_frac(0.01) %>% collect %>% head
 #>    year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
-#> 1: 2013     5  10      554            600        -6      645            659
-#> 2: 2013     8  28      752            800        -8     1015           1022
-#> 3: 2013     1   3      955            958        -3     1120           1137
-#> 4: 2013     8  22     2157           2100        57        7           2323
-#> 5: 2013     5  14     1853           1900        -7     2003           2048
-#> 6: 2013     8  25     1550           1540        10     1747           1747
+#> 1: 2013     1  10     1614           1605         9     1926           1944
+#> 2: 2013     5  13     1136           1136         0     1225           1234
+#> 3: 2013     5  16     1428           1420         8     1535           1538
+#> 4: 2013    12  29     1536           1520        16     1817           1750
+#> 5: 2013    12  21      849            820        29     1315           1345
+#> 6: 2013    12  25     1624           1559        25     1846           1825
 #>    arr_delay carrier flight tailnum origin dest air_time distance hour minute
-#> 1:       -14      US   2161  N747UW    LGA  DCA       38      214    6      0
-#> 2:        -7      UA    561  N513UA    LGA  DEN      218     1620    8      0
-#> 3:       -17      UA    258  N831UA    LGA  ORD      124      733    9     58
-#> 4:        44      DL   1247  N914DE    LGA  ATL      102      762   21      0
-#> 5:       -45      EV   5038  N741EV    LGA  BHM      111      866   19      0
-#> 6:         0      9E   3648  N8940E    JFK  CMH       69      483   15     40
+#> 1:       -18      DL   1508  N952DL    JFK  RSW      163     1074   16      5
+#> 2:        -9      EV   3830  N13955    EWR  PVD       35      160   11     36
+#> 3:        -3      EV   4284  N11536    EWR  ROC       45      246   14     20
+#> 4:        27      MQ   3553  N520MQ    LGA  XNA      181     1147   15     20
+#> 5:       -30      DL    454  N682DA    JFK  STT      188     1623    8     20
+#> 6:        21      EV   5567  N870AS    LGA  CAE       99      617   15     59
 #>              time_hour
-#> 1: 2013-05-10 10:00:00
-#> 2: 2013-08-28 12:00:00
-#> 3: 2013-01-03 14:00:00
-#> 4: 2013-08-23 01:00:00
-#> 5: 2013-05-14 23:00:00
-#> 6: 2013-08-25 19:00:00
+#> 1: 2013-01-10 21:00:00
+#> 2: 2013-05-13 15:00:00
+#> 3: 2013-05-16 18:00:00
+#> 4: 2013-12-29 20:00:00
+#> 5: 2013-12-21 13:00:00
+#> 6: 2013-12-25 20:00:00

Writing Data

One can output a disk.frame by using the write_disk.frame function. E.g.

-
+
 write_disk.frame(flights.df, outdir="out")

This will output a disk.frame to the folder “out”.
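The saved folder can then be reattached, even in a later session, with the disk.frame() constructor (a minimal sketch; the folder name "out" comes from the example above):

# reattach the disk.frame written by write_disk.frame
flights_saved.df = disk.frame("out")
nrow(flights_saved.df)  # same row count as flights.df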

diff --git a/docs/articles/03-concepts.html b/docs/articles/03-concepts.html index 418a1757..5797c66c 100644 --- a/docs/articles/03-concepts.html +++ b/docs/articles/03-concepts.html @@ -33,7 +33,7 @@ disk.frame - 0.5.0 + 0.6.0
@@ -147,7 +147,6 @@

Workers and parallelism#> The following objects are masked from 'package:base': #> #> intersect, setdiff, setequal, union -#> Loading required package: purrr #> #> ## Message from disk.frame: #> We have 1 workers to use with disk.frame. @@ -168,9 +167,6 @@

Workers and parallelism#> #> #> Attaching package: 'disk.frame' -#> The following objects are masked from 'package:purrr': -#> -#> imap, imap_dfr, map, map2 #> The following objects are masked from 'package:base': #> #> colnames, ncol, nrow diff --git a/docs/articles/04-ingesting-data.html b/docs/articles/04-ingesting-data.html index eb86d829..d9e14783 100644 --- a/docs/articles/04-ingesting-data.html +++ b/docs/articles/04-ingesting-data.html @@ -33,7 +33,7 @@ disk.frame - 0.5.0 + 0.6.0

diff --git a/docs/articles/05-data-table-syntax.html b/docs/articles/05-data-table-syntax.html index c59cc037..7027aebf 100644 --- a/docs/articles/05-data-table-syntax.html +++ b/docs/articles/05-data-table-syntax.html @@ -33,7 +33,7 @@ disk.frame - 0.5.0 + 0.6.0
@@ -149,9 +149,6 @@

library(data.table) #> #> Attaching package: 'data.table' -#> The following object is masked from 'package:purrr': -#> -#> transpose #> The following objects are masked from 'package:dplyr': #> #> between, first, last @@ -167,6 +164,7 @@

#> [17] "hour" "minute" "time_hour" flights.df[,.N, .(year, month), keep = c("year", "month")] +#> data.table syntax for disk.frame may be moved to a separate package in the future #> year month N #> 1: 2013 1 27004 #> 2: 2013 10 28889 @@ -199,6 +197,7 @@

External variables are captured

flights.df[,some_fn(y)]
+#> data.table syntax for disk.frame may be moved to a separate package in the future
#> [1] 42 42 42 42 42 42

In the above example, neither some_fn nor y is defined in the background workers’ environments, yet disk.frame still manages to evaluate flights.df[,some_fn(y)]: both are captured from the calling session and shipped to the workers.
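A self-contained sketch of this behaviour (the definitions of some_fn and y here are assumptions for illustration; the docs only show the call and its output):

some_fn = function(x) 42   # defined only in the interactive session
y = 1                      # likewise absent from the workers

# disk.frame detects that some_fn and y are needed, captures them from the
# calling session, and evaluates the expression on each of the 6 chunks
flights.df[, some_fn(y)]
#> [1] 42 42 42 42 42 42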

diff --git a/docs/articles/06-vs-dask-juliadb.html b/docs/articles/06-vs-dask-juliadb.html index 053e7c5e..fe2bcd28 100644 --- a/docs/articles/06-vs-dask-juliadb.html +++ b/docs/articles/06-vs-dask-juliadb.html @@ -33,7 +33,7 @@ disk.frame - 0.5.0 + 0.6.0
@@ -189,21 +189,22 @@
system.time(setup_disk.frame()) # ~4s
#> The number of workers available for disk.frame is 6
#>    user  system elapsed
-#>    0.18    0.03    2.22
+#>    0.20    0.03    2.32

We note that some time is needed for disk.frame to start up all the workers. Next, we try to convert the largest CSV file to disk.frame format; the file to be converted is about 2.2GB in size.

 time_to_convert_disk.frame = system.time(df1 <- csv_to_disk.frame("c:/data/Performance_2004Q3.txt", header = FALSE))[3]
 
 time_to_convert_disk.frame
 #> elapsed 
-#>   28.77
+#> 28.3

Now that we have converted it, we want to do a count by the first column. To achieve this we use a “two-stage” aggregation strategy. Note the use of keep = "V1" to bring only the column V1 into RAM. This avoids reading the other, unnecessary columns and should speed up the analysis significantly.

 time_to_agg_disk.frame = system.time(summ <- df1[,.N, V1, keep = "V1"][, .(N = sum(N)), V1])
+#> data.table syntax for disk.frame may be moved to a separate package in the future
 
 time_to_agg_disk.frame
 #>    user  system elapsed 
-#>    0.13    0.03    7.89
+#> 0.13 0.03 8.48

We can inspect the result as well.

 summ
@@ -227,7 +228,7 @@ 
disk.framesummarise(N = n()) %>% collect) #> user system elapsed -#> 1.89 0.14 5.30
+#> 1.53 0.17 10.25

However, the dplyr syntax tends to be slightly slower than the data.table syntax. This may improve in future releases, as much of the overhead is due to inefficient use of NSE.
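For reference, the dplyr version of the same two-stage count looks roughly like this (a sketch reconstructed from the timing snippet above; srckeep("V1") plays the role of keep = "V1"):

summ_dplyr = df1 %>%
  srckeep("V1") %>%          # only read column V1 from disk
  group_by(V1) %>%
  summarise(N = n()) %>%     # per-chunk counts are combined on collect
  collect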

diff --git a/docs/articles/07-glm.html b/docs/articles/07-glm.html index 9a04cd64..2b340f3c 100644 --- a/docs/articles/07-glm.html +++ b/docs/articles/07-glm.html @@ -33,7 +33,7 @@ disk.frame - 0.5.0 + 0.6.0
diff --git a/docs/articles/08-more-epic.html b/docs/articles/08-more-epic.html index cfff702c..f4170f94 100644 --- a/docs/articles/08-more-epic.html +++ b/docs/articles/08-more-epic.html @@ -33,7 +33,7 @@ disk.frame - 0.5.0 + 0.6.0 diff --git a/docs/articles/09-convenience-features.html b/docs/articles/09-convenience-features.html index 81ab3af8..ef7bf2ed 100644 --- a/docs/articles/09-convenience-features.html +++ b/docs/articles/09-convenience-features.html @@ -33,7 +33,7 @@ disk.frame - 0.5.0 + 0.6.0 diff --git a/docs/articles/10-group-by.html b/docs/articles/10-group-by.html index c2799355..f18f3b51 100644 --- a/docs/articles/10-group-by.html +++ b/docs/articles/10-group-by.html @@ -33,7 +33,7 @@ disk.frame - 0.5.0 + 0.6.0 @@ -298,7 +298,7 @@

Group-by notes
 suppressMessages(library(disk.frame))
 flights.df %>%
-  hard_group_by(carrier) %>% # notice that hard_group_by needs to be set
+  hard_group_by(carrier) %>% # notice that hard_group_by needs to be set
   chunk_summarize(count = n(), mean_dep_delay = mean(dep_delay, na.rm=T)) %>%  # mean follows normal R rules
   collect %>% 
   arrange(carrier)
@@ -359,7 +359,7 @@

Hard group-bysrckeep(c("month", "dep_delay")) %>% filter(month <= 6) %>% mutate(qtr = ifelse(month <= 3, "Q1", "Q2")) %>% - hard_group_by(qtr) %>% # hard group_by is MUCH SLOWER but avoid a 2nd stage aggregation + hard_group_by(qtr) %>% # hard group_by is MUCH SLOWER but avoid a 2nd stage aggregation chunk_summarise(avg_delay = mean(dep_delay, na.rm = TRUE)) %>% collect cat("group-by took: ", data.table::timetaken(pt), "\n") diff --git a/docs/articles/11-custom-group-by.html b/docs/articles/11-custom-group-by.html index 356dfa28..695f467f 100644 --- a/docs/articles/11-custom-group-by.html +++ b/docs/articles/11-custom-group-by.html @@ -33,7 +33,7 @@ disk.frame - 0.5.0 + 0.6.0 diff --git a/docs/articles/88-trouble-shooting.html b/docs/articles/88-trouble-shooting.html index ab2a72a9..e1ba2f47 100644 --- a/docs/articles/88-trouble-shooting.html +++ b/docs/articles/88-trouble-shooting.html @@ -33,7 +33,7 @@ disk.frame - 0.5.0 + 0.6.0 diff --git a/docs/articles/index.html b/docs/articles/index.html index f767f6f8..3e4cba20 100644 --- a/docs/articles/index.html +++ b/docs/articles/index.html @@ -17,7 +17,7 @@ disk.frame - 0.5.0 + 0.6.0 diff --git a/docs/authors.html b/docs/authors.html index 342af2fa..97226196 100644 --- a/docs/authors.html +++ b/docs/authors.html @@ -17,7 +17,7 @@ disk.frame - 0.5.0 + 0.6.0 @@ -110,13 +110,13 @@

Citation

ZJ D (2022). disk.frame: Larger-than-RAM Disk-Based Data Manipulation Framework. -R package version 0.5.0, https://diskframe.com. +R package version 0.6.0, https://diskframe.com.

@Manual{,
   title = {disk.frame: Larger-than-RAM Disk-Based Data Manipulation Framework},
   author = {Dai ZJ},
   year = {2022},
-  note = {R package version 0.5.0},
+  note = {R package version 0.6.0},
   url = {https://diskframe.com},
 }
diff --git a/docs/index.html b/docs/index.html index 89c4a22e..7ea2fc4e 100644 --- a/docs/index.html +++ b/docs/index.html @@ -37,7 +37,7 @@ disk.frame - 0.5.0 + 0.6.0 @@ -228,7 +228,6 @@

Quick-start

# this will setup disk.frame's parallel backend with number of workers equal to the number of CPU cores (hyper-threaded cores are counted as one not two)
setup_disk.frame()
-#> The number of workers available for disk.frame is 6

# this allows large datasets to be transferred between sessions
options(future.globals.maxSize = Inf)
@@ -248,12 +247,15 @@

dplyr verbs

filter(year == 2013) %>%
  mutate(origin_dest = paste0(origin, dest)) %>%
  head(2)
-#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier
-#> 1 2013 1 1 517 515 2 830 819 11 UA
-#> 2 2013 1 1 533 529 4 850 830 20 UA
-#> flight tailnum origin dest air_time distance hour minute time_hour origin_dest
-#> 1 1545 N14228 EWR IAH 227 1400 5 15 2013-01-01 05:00:00 EWRIAH
-#> 2 1714 N24211 LGA IAH 227 1416 5 29 2013-01-01 05:00:00 LGAIAH
+#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
+#> 1: 2013 1 1 517 515 2 830 819
+#> 2: 2013 1 1 533 529 4 850 830
+#> arr_delay carrier flight tailnum origin dest air_time distance hour minute
+#> 1: 11 UA 1545 N14228 EWR IAH 227 1400 5 15
+#> 2: 20 UA 1714 N24211 LGA IAH 227 1416 5 29
+#> time_hour origin_dest
+#> 1: 2013-01-01 05:00:00 EWRIAH
+#> 2: 2013-01-01 05:00:00 LGAIAH

Group-by @@ -403,22 +405,26 @@

Basic info
 # where is the disk.frame stored
 attr(flights.df, "path")
-#> [1] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpIlXNzn\\file568813b835a7.df"

+#> [1] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpQH7obF\\file42d452c32907.df"

A number of data.frame functions are implemented for disk.frame

 # get first few rows
 head(flights.df, 1)
-#>    year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier
-#> 1: 2013     1   1      517            515         2      830            819        11      UA
-#>    flight tailnum origin dest air_time distance hour minute           time_hour
-#> 1:   1545  N14228    EWR  IAH      227     1400    5     15 2013-01-01 05:00:00
+#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
+#> 1: 2013 1 1 517 515 2 830 819
+#> arr_delay carrier flight tailnum origin dest air_time distance hour minute
+#> 1: 11 UA 1545 N14228 EWR IAH 227 1400 5 15
+#> time_hour
+#> 1: 2013-01-01 05:00:00
 # get last few rows
 tail(flights.df, 1)
-#>    year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier
-#> 1: 2013     9  30       NA            840        NA       NA           1020        NA      MQ
-#>    flight tailnum origin dest air_time distance hour minute           time_hour
-#> 1:   3531  N839MQ    LGA  RDU       NA      431    8     40 2013-09-30 08:00:00
+#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
+#> 1: 2013 9 30 NA 840 NA NA 1020
+#> arr_delay carrier flight tailnum origin dest air_time distance hour minute
+#> 1: NA MQ 3531 N839MQ LGA RDU NA 431 8 40
+#> time_hour
+#> 1: 2013-09-30 08:00:00
 # number of rows
 nrow(flights.df)
diff --git a/docs/news/index.html b/docs/news/index.html
index 0fb5baf2..48d1c26d 100644
--- a/docs/news/index.html
+++ b/docs/news/index.html
@@ -17,7 +17,7 @@
       
       
         disk.frame
-        0.5.0
+        0.6.0
       
     
@@ -90,6 +90,13 @@

Changelog

Source: NEWS.md

+disk.frame 0.6
+  • Much better NSE support in disk.frame!
+  • removed hard_arrange and hard_group_by
+  • various API updates

disk.frame 0.5
  • removed add_count method

diff --git a/docs/pkgdown.yml b/docs/pkgdown.yml index e9e64112..d6970ada 100644 --- a/docs/pkgdown.yml +++ b/docs/pkgdown.yml @@ -14,5 +14,5 @@ articles: 10-group-by: 10-group-by.html 11-custom-group-by: 11-custom-group-by.html 88-trouble-shooting: 88-trouble-shooting.html -last_built: 2022-01-24T10:36Z +last_built: 2022-01-30T13:34Z diff --git a/docs/reference/add_chunk.html b/docs/reference/add_chunk.html index 82b43bf9..33227b07 100644 --- a/docs/reference/add_chunk.html +++ b/docs/reference/add_chunk.html @@ -18,7 +18,7 @@ disk.frame - 0.5.0 + 0.6.0
@@ -135,19 +135,15 @@

Examples

# add a chunk to diskf add_chunk(diskf, cars) -#> path: "C:\Users\RTX2080\AppData\Local\Temp\Rtmp2rQjw5/tmp_add_chunk" +#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpyknGIm/tmp_add_chunk" #> nchunks: 1 #> nrow (at source): 50 #> ncol (at source): 2 -#> nrow (post operations): ??? -#> ncol (post operations): ??? add_chunk(diskf, cars) -#> path: "C:\Users\RTX2080\AppData\Local\Temp\Rtmp2rQjw5/tmp_add_chunk" +#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpyknGIm/tmp_add_chunk" #> nchunks: 2 #> nrow (at source): 100 #> ncol (at source): 2 -#> nrow (post operations): ??? -#> ncol (post operations): ??? nchunks(diskf) # 2 #> [1] 2 @@ -158,19 +154,15 @@

Examples

# you wish to add multiple chunks in parallel
add_chunk(df2, data.frame(chunk=1), 1)
-#> path: "C:\Users\RTX2080\AppData\Local\Temp\Rtmp2rQjw5/tmp_add_chunk2"
+#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpyknGIm/tmp_add_chunk2"
#> nchunks: 1
#> nrow (at source): 1
#> ncol (at source): 1
-#> nrow (post operations): ???
-#> ncol (post operations): ???

add_chunk(df2, data.frame(chunk=2), 3)
-#> path: "C:\Users\RTX2080\AppData\Local\Temp\Rtmp2rQjw5/tmp_add_chunk2"
+#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpyknGIm/tmp_add_chunk2"
#> nchunks: 2
#> nrow (at source): 2
#> ncol (at source): 1
-#> nrow (post operations): ???
-#> ncol (post operations): ???

nchunks(df2) # 2
#> [1] 2

diff --git a/docs/reference/as.data.frame.disk.frame.html b/docs/reference/as.data.frame.disk.frame.html index abfa6194..aea1586c 100644 --- a/docs/reference/as.data.frame.disk.frame.html +++ b/docs/reference/as.data.frame.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.5.0 + 0.6.0 diff --git a/docs/reference/as.data.table.disk.frame.html b/docs/reference/as.data.table.disk.frame.html index 7e9ac1e2..1e5146c2 100644 --- a/docs/reference/as.data.table.disk.frame.html +++ b/docs/reference/as.data.table.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.5.0 + 0.6.0 @@ -115,9 +115,6 @@

Examples

library(data.table)
 #> 
 #> Attaching package: 'data.table'
-#> The following object is masked from 'package:purrr':
-#> 
-#>     transpose
 #> The following objects are masked from 'package:dplyr':
 #> 
 #>     between, first, last
diff --git a/docs/reference/as.disk.frame.html b/docs/reference/as.disk.frame.html
index 10189939..fa22a8ea 100644
--- a/docs/reference/as.disk.frame.html
+++ b/docs/reference/as.disk.frame.html
@@ -17,7 +17,7 @@
       
       
         disk.frame
-        0.5.0
+        0.6.0
       
     
diff --git a/docs/reference/bind_rows.disk.frame.html b/docs/reference/bind_rows.disk.frame.html new file mode 100644 index 00000000..3a336451 --- /dev/null +++ b/docs/reference/bind_rows.disk.frame.html @@ -0,0 +1,131 @@
+Bind rows — bind_rows.disk.frame • disk.frame

+bind_rows.disk.frame(...)

+Arguments

+... — disk.frame to be row bound
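A usage sketch for the newly exported function (the two disk.frames are assumed to share the same columns, and the return value is assumed to be a disk.frame):

cars1.df = as.disk.frame(cars)
cars2.df = as.disk.frame(cars)
# row-bind the two disk.frames, analogous to dplyr::bind_rows on data.frames
both.df = bind_rows.disk.frame(cars1.df, cars2.df)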
diff --git a/docs/reference/chunk_group_by.html b/docs/reference/chunk_group_by.html index 3b4f8168..f7bbff69 100644 --- a/docs/reference/chunk_group_by.html +++ b/docs/reference/chunk_group_by.html @@ -1,5 +1,19 @@
-Group by within each disk.frame — chunk_summarize • disk.frame
+#' @export
+#' @importFrom dplyr add_count
+#' @rdname dplyr_verbs
+add_count.disk.frame <- create_chunk_mapper(dplyr::add_count)
+#' @export
+#' @importFrom dplyr add_tally
+#' @rdname dplyr_verbs
+add_tally.disk.frame <- create_chunk_mapper(dplyr::add_tally) — chunk_summarize • disk.frame
-Compute without writing — compute.disk.frame • disk.frame
+Force computations. The results are stored in a folder. — compute.disk.frame • disk.frame
@@ -17,7 +17,7 @@ disk.frame - 0.5.0 + 0.6.0 @@ -86,7 +86,7 @@
@@ -97,13 +97,7 @@

Compute without writing

# S3 method for disk.frame
-compute(
-  x,
-  name,
-  outdir = tempfile("tmp_df_", fileext = ".df"),
-  overwrite = TRUE,
-  ...
-)
+compute(x, name = NULL, outdir = tempfile("tmp_df_", fileext = ".df"), ...)
@@ -111,13 +105,11 @@

Arguments

x

a disk.frame

name
-

Not used. Kept for compatibility with dplyr

+

If not NULL then used as outdir prefix.

outdir

the output directory

-
overwrite
-

whether to overwrite or not

...
-

Not used. Kept for dplyr compatibility

+

Passed to `write_disk.frame`

diff --git a/docs/reference/create_chunk_mapper.html b/docs/reference/create_chunk_mapper.html index 7c28d311..1269b5dd 100644 --- a/docs/reference/create_chunk_mapper.html +++ b/docs/reference/create_chunk_mapper.html @@ -17,7 +17,7 @@ disk.frame - 0.5.0 + 0.6.0
@@ -96,7 +96,7 @@

Create a function that applies to each chunk of a disk.frame

-
create_chunk_mapper(chunk_fn, warning_msg = NULL, as.data.frame = TRUE)
+
create_chunk_mapper(chunk_fn, warning_msg = NULL, as.data.frame = FALSE)
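For illustration, this is how the package defines chunk-mapped dplyr verbs with it (the add_tally line appears verbatim in the leaked Rd title shown above):

# the returned function applies dplyr::add_tally chunk-by-chunk, recorded lazily
add_tally.disk.frame <- create_chunk_mapper(dplyr::add_tally)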
diff --git a/docs/reference/csv_to_disk.frame.html b/docs/reference/csv_to_disk.frame.html index 17b0d387..f91d899a 100644 --- a/docs/reference/csv_to_disk.frame.html +++ b/docs/reference/csv_to_disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.5.0 + 0.6.0
@@ -142,8 +142,7 @@

Arguments

header

Whether the files have header. Defaults to TRUE

.progress
-

A logical, for whether or not to print a progress bar for -multiprocess, multisession, and multicore plans. From furrr

+

A logical, for whether or not to show progress

backend

The CSV reader backend to choose: "data.table" or "readr". disk.frame does not have its own CSV reader. It uses either diff --git a/docs/reference/delete.html b/docs/reference/delete.html index 2a238505..9cf26191 100644 --- a/docs/reference/delete.html +++ b/docs/reference/delete.html @@ -17,7 +17,7 @@ disk.frame - 0.5.0 + 0.6.0 diff --git a/docs/reference/df_ram_size.html b/docs/reference/df_ram_size.html index 57bfaa8d..c986617f 100644 --- a/docs/reference/df_ram_size.html +++ b/docs/reference/df_ram_size.html @@ -17,7 +17,7 @@ disk.frame - 0.5.0 + 0.6.0 diff --git a/docs/reference/dfglm.html b/docs/reference/dfglm.html index 1197ac98..c9cc7c73 100644 --- a/docs/reference/dfglm.html +++ b/docs/reference/dfglm.html @@ -18,7 +18,7 @@ disk.frame - 0.5.0 + 0.6.0 diff --git a/docs/reference/disk.frame.html b/docs/reference/disk.frame.html index 0d2320c2..fd1f9556 100644 --- a/docs/reference/disk.frame.html +++ b/docs/reference/disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.5.0 + 0.6.0 @@ -111,12 +111,10 @@

Arguments

Examples

path = file.path(tempdir(),"cars")
 as.disk.frame(cars, outdir=path, overwrite = TRUE, nchunks = 2)
-#> path: "C:\Users\RTX2080\AppData\Local\Temp\Rtmp2rQjw5/cars"
+#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpyknGIm/cars"
 #> nchunks: 2
 #> nrow (at source): 50
 #> ncol (at source): 2
-#> nrow (post operations): ???
-#> ncol (post operations): ???
 df = disk.frame(path)
 head(df)
 #>    speed dist
diff --git a/docs/reference/dplyr_verbs.html b/docs/reference/dplyr_verbs.html
index 4cbb0612..143e9c7e 100644
--- a/docs/reference/dplyr_verbs.html
+++ b/docs/reference/dplyr_verbs.html
@@ -18,7 +18,7 @@
       
       
         disk.frame
-        0.5.0
+        0.6.0
       
     
@@ -118,11 +118,6 @@

The dplyr verbs implemented for disk.frame

chunk_arrange(.data, ...) -add_tally.disk.frame(.data, ...) - -# S3 method for disk.frame -do(.data, ...) - # S3 method for disk.frame distinct(...) diff --git a/docs/reference/evalparseglue.html b/docs/reference/evalparseglue.html index 4172714e..d88bc80a 100644 --- a/docs/reference/evalparseglue.html +++ b/docs/reference/evalparseglue.html @@ -17,7 +17,7 @@ disk.frame - 0.5.0 + 0.6.0 diff --git a/docs/reference/foverlaps.disk.frame.html b/docs/reference/foverlaps.disk.frame.html index 19607792..6fbc7457 100644 --- a/docs/reference/foverlaps.disk.frame.html +++ b/docs/reference/foverlaps.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.5.0 + 0.6.0 diff --git a/docs/reference/gen_datatable_synthetic.html b/docs/reference/gen_datatable_synthetic.html index 608041f6..8855f4c1 100644 --- a/docs/reference/gen_datatable_synthetic.html +++ b/docs/reference/gen_datatable_synthetic.html @@ -17,7 +17,7 @@ disk.frame - 0.5.0 + 0.6.0 diff --git a/docs/reference/get_chunk.html b/docs/reference/get_chunk.html index 66df5c10..fffdd1c7 100644 --- a/docs/reference/get_chunk.html +++ b/docs/reference/get_chunk.html @@ -17,7 +17,7 @@ disk.frame - 0.5.0 + 0.6.0 diff --git a/docs/reference/get_chunk_ids.html b/docs/reference/get_chunk_ids.html index d54c8f15..fc6d82da 100644 --- a/docs/reference/get_chunk_ids.html +++ b/docs/reference/get_chunk_ids.html @@ -17,7 +17,7 @@ disk.frame - 0.5.0 + 0.6.0 @@ -121,12 +121,12 @@

Examples

# return the file name chunk IDs get_chunk_ids(cars.df, full.names = TRUE) -#> [1] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\Rtmp2rQjw5\\file56f46178545a.df/1.fst" -#> [2] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\Rtmp2rQjw5\\file56f46178545a.df/2.fst" -#> [3] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\Rtmp2rQjw5\\file56f46178545a.df/3.fst" -#> [4] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\Rtmp2rQjw5\\file56f46178545a.df/4.fst" -#> [5] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\Rtmp2rQjw5\\file56f46178545a.df/5.fst" -#> [6] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\Rtmp2rQjw5\\file56f46178545a.df/6.fst" +#> [1] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpyknGIm\\file471836dbe43.df/1.fst" +#> [2] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpyknGIm\\file471836dbe43.df/2.fst" +#> [3] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpyknGIm\\file471836dbe43.df/3.fst" +#> [4] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpyknGIm\\file471836dbe43.df/4.fst" +#> [5] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpyknGIm\\file471836dbe43.df/5.fst" +#> [6] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpyknGIm\\file471836dbe43.df/6.fst" # return the file name chunk IDs with file extension get_chunk_ids(cars.df, strip_extension = FALSE) diff --git a/docs/reference/group_by.html b/docs/reference/group_by.html index a7d39c8e..7149e006 100644 --- a/docs/reference/group_by.html +++ b/docs/reference/group_by.html @@ -20,7 +20,7 @@ disk.frame - 0.5.0 + 0.6.0 @@ -109,7 +109,12 @@

A function to parse the summarize function

summarize(.data, ...)

# S3 method for disk.frame
-group_by(.data, ..., add = FALSE, .drop = dplyr::group_by_drop_default(.data))
+group_by(
+  .data,
+  ...,
+  .add = FALSE,
+  .drop = stop("disk.frame does not support `.drop` in `group_by` at this stage")
+)

# S3 method for disk.frame
summarize(.data, ...)
@@ -124,7 +129,7 @@

Arguments

a disk.frame

...

same as the dplyr::group_by

-
add
+
.add

from dplyr

.drop

from dplyr

diff --git a/docs/reference/groups.disk.frame.html b/docs/reference/groups.disk.frame.html index 02be8acf..89dc5d99 100644 --- a/docs/reference/groups.disk.frame.html +++ b/docs/reference/groups.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.5.0 + 0.6.0 diff --git a/docs/reference/head_tail.html b/docs/reference/head_tail.html index 6312a2b2..01e04939 100644 --- a/docs/reference/head_tail.html +++ b/docs/reference/head_tail.html @@ -17,7 +17,7 @@ disk.frame - 0.5.0 + 0.6.0 diff --git a/docs/reference/index.html b/docs/reference/index.html index 482f042c..7447de77 100644 --- a/docs/reference/index.html +++ b/docs/reference/index.html @@ -17,7 +17,7 @@ disk.frame - 0.5.0 + 0.6.0 @@ -109,16 +109,27 @@

All functions as.disk.frame()

Make a data.frame into a disk.frame

+ +

bind_rows.disk.frame()

+ +

Bind rows

chunk_summarize() chunk_summarise() chunk_group_by() chunk_ungroup()

-

Group by within each disk.frame

+

#' @export +#' @importFrom dplyr add_count +#' @rdname dplyr_verbs +add_count.disk.frame <- create_chunk_mapper(dplyr::add_count) +#' @export +#' @importFrom dplyr add_tally +#' @rdname dplyr_verbs +add_tally.disk.frame <- create_chunk_mapper(dplyr::add_tally)

-

cmap() cmap_dfr() cimap() cimap_dfr() lazy() delayed() chunk_lapply() map() imap_dfr() imap() map_dfr(<disk.frame>) map_dfr(<default>)

+

cmap() cmap_dfr() cimap() cimap_dfr() lazy() delayed() clapply()

Apply the same function to all chunks

-

cmap2() map2() map_by_chunk_id()

+

cmap2() map_by_chunk_id()

`cmap2` a function to two disk.frames

@@ -132,7 +143,7 @@

All functions

compute(<disk.frame>)

-

Compute without writing

+

Force computations. The results are stored in a folder.

create_chunk_mapper()

@@ -158,7 +169,7 @@

All functions

Create a disk.frame from a folder

-

select(<disk.frame>) rename(<disk.frame>) filter(<disk.frame>) mutate(<disk.frame>) transmute(<disk.frame>) arrange(<disk.frame>) chunk_arrange() add_tally.disk.frame() do(<disk.frame>) distinct(<disk.frame>) chunk_distinct() glimpse(<disk.frame>)

+

select(<disk.frame>) rename(<disk.frame>) filter(<disk.frame>) mutate(<disk.frame>) transmute(<disk.frame>) arrange(<disk.frame>) chunk_arrange() distinct(<disk.frame>) chunk_distinct() glimpse(<disk.frame>)

The dplyr verbs implemented for disk.frame

@@ -189,14 +200,6 @@

All functions summarise(<grouped_disk.frame>) summarize(<grouped_disk.frame>) group_by(<disk.frame>) summarize(<disk.frame>) summarise(<disk.frame>)

A function to parse the summarize function

- -

hard_arrange()

- -

Perform a hard arrange

- -

hard_group_by()

- -

Perform a hard group

head(<disk.frame>) tail(<disk.frame>)

@@ -237,6 +240,10 @@

All functions overwrite_check()

Check if the outdir exists or not

+ +

play()

+ +

Play the recorded lazy operations

print(<disk.frame>)

@@ -245,6 +252,10 @@

All functions pull(<disk.frame>)

Pull a column from table similar to `dplyr::pull`.

+ +

purrr_as_mapper()

+ +

Used to convert a function to purrr syntax if needed

rbindlist.disk.frame()

@@ -286,7 +297,7 @@

All functions

Show the code to setup disk.frame

-

srckeep() srckeepchunks()

+

srckeep()

Keep only the variables from the input listed in selections

diff --git a/docs/reference/is_disk.frame.html b/docs/reference/is_disk.frame.html index c56ab38f..d32d0036 100644 --- a/docs/reference/is_disk.frame.html +++ b/docs/reference/is_disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.5.0 + 0.6.0 diff --git a/docs/reference/join.html b/docs/reference/join.html index 6bdc7ff7..87b1e852 100644 --- a/docs/reference/join.html +++ b/docs/reference/join.html @@ -17,7 +17,7 @@ disk.frame - 0.5.0 + 0.6.0 @@ -128,7 +128,9 @@

Performs join/merge for disk.frames

y, by = NULL, copy = FALSE, + suffix = c(".x", ".y"), ..., + keep = FALSE, outdir = tempfile("tmp_disk_frame_inner_join"), merge_by_chunk_id = NULL, overwrite = TRUE, @@ -141,7 +143,9 @@

Performs join/merge for disk.frames

y, by = NULL, copy = FALSE, + suffix = c(".x", ".y"), ..., + keep = FALSE, outdir = tempfile("tmp_disk_frame_left_join"), merge_by_chunk_id = FALSE, overwrite = TRUE, @@ -182,6 +186,10 @@

Arguments

overwrite output directory

.progress

Show progress or not. Defaults to FALSE

+
suffix
+

see dplyr::XXX_join

+
keep
+

see dplyr::XXX_join

Value

@@ -195,12 +203,7 @@

Examples

anti_joined.df = anti_join(df.df, df2.df) #> Warning: merge_by_chunk_id = FALSE. This will take significantly longer and the preparations needed are performed eagerly which may lead to poor performance. Consider making y a data.frame or set merge_by_chunk_id = TRUE for better performance. -#> Hashing... -#> Hashing... -#> Hashing... #> Appending disk.frames: -#> Hashing... -#> Hashing... #> Appending disk.frames: anti_joined.df %>% collect @@ -208,9 +211,6 @@

Examples

#> 1: 3 6 anti_joined.data.frame = anti_join(df.df, data.frame(x = 1:2, z = 10:11)) -#> Joining, by = "x" -#> Joining, by = "x" -#> Joining, by = "x" # clean up delete(df.df) @@ -233,20 +233,8 @@

Examples

cars.df = as.disk.frame(cars) join.df = left_join(cars.df, cars.df) -#> Warning: merge_by_chunk_id = FALSE. This will take significantly longer and the preparations needed are performed eagerly which may lead to poor performance. Consider making y a data.frame or set merge_by_chunk_id = TRUE for better performance. -#> Hashing... -#> Hashing... -#> Hashing... -#> Hashing... -#> Hashing... -#> Hashing... +#> Warning: `merge_by_chunk_id = FALSE`. This will take significantly longer and the preparations needed are performed eagerly which may lead to poor performance. Consider making `y` a data.frame or set merge_by_chunk_id = TRUE for better performance. #> Appending disk.frames: -#> Hashing... -#> Hashing... -#> Hashing... -#> Hashing... -#> Hashing... -#> Hashing... #> Appending disk.frames: # clean up cars.df @@ -256,19 +244,7 @@

Examples

join.df = semi_join(cars.df, cars.df) #> Warning: merge_by_chunk_id = FALSE. This will take significantly longer and the preparations needed are performed eagerly which may lead to poor performance. Consider making y a data.frame or set merge_by_chunk_id = TRUE for better performance. -#> Hashing... -#> Hashing... -#> Hashing... -#> Hashing... -#> Hashing... -#> Hashing... #> Appending disk.frames: -#> Hashing... -#> Hashing... -#> Hashing... -#> Hashing... -#> Hashing... -#> Hashing... #> Appending disk.frames: # clean up cars.df diff --git a/docs/reference/make_glm_streaming_fn.html b/docs/reference/make_glm_streaming_fn.html index 91b68bc6..63de7ae6 100644 --- a/docs/reference/make_glm_streaming_fn.html +++ b/docs/reference/make_glm_streaming_fn.html @@ -17,7 +17,7 @@ disk.frame - 0.5.0 + 0.6.0
diff --git a/docs/reference/merge.disk.frame.html b/docs/reference/merge.disk.frame.html index 26cafba5..f93a7f53 100644 --- a/docs/reference/merge.disk.frame.html +++ b/docs/reference/merge.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.5.0 + 0.6.0 diff --git a/docs/reference/move_to.html b/docs/reference/move_to.html index 9037467e..d4211c0b 100644 --- a/docs/reference/move_to.html +++ b/docs/reference/move_to.html @@ -17,7 +17,7 @@ disk.frame - 0.5.0 + 0.6.0 diff --git a/docs/reference/nchunks.html b/docs/reference/nchunks.html index 988619c1..9321954a 100644 --- a/docs/reference/nchunks.html +++ b/docs/reference/nchunks.html @@ -17,7 +17,7 @@ disk.frame - 0.5.0 + 0.6.0 diff --git a/docs/reference/ncol_nrow.html b/docs/reference/ncol_nrow.html index a8a9d09a..5b5d239a 100644 --- a/docs/reference/ncol_nrow.html +++ b/docs/reference/ncol_nrow.html @@ -17,7 +17,7 @@ disk.frame - 0.5.0 + 0.6.0 diff --git a/docs/reference/one-stage-group-by-verbs.html b/docs/reference/one-stage-group-by-verbs.html index 22b7a93c..ac3c5abf 100644 --- a/docs/reference/one-stage-group-by-verbs.html +++ b/docs/reference/one-stage-group-by-verbs.html @@ -19,7 +19,7 @@ disk.frame - 0.5.0 + 0.6.0 diff --git a/docs/reference/overwrite_check.html b/docs/reference/overwrite_check.html index f4d07be2..a33eaeb4 100644 --- a/docs/reference/overwrite_check.html +++ b/docs/reference/overwrite_check.html @@ -17,7 +17,7 @@ disk.frame - 0.5.0 + 0.6.0 diff --git a/docs/reference/play.html b/docs/reference/play.html new file mode 100644 index 00000000..286b1855 --- /dev/null +++ b/docs/reference/play.html @@ -0,0 +1,133 @@ + +Play the recorded lazy operations — play • disk.frame + + +
+play(dataframe, recordings)

+Arguments

+dataframe — A data.frame
+recordings — A recording of the expression, globals and packages, made using create_chunk_mapper
diff --git a/docs/reference/print.disk.frame.html b/docs/reference/print.disk.frame.html index a0ef8db4..76b587e8 100644 --- a/docs/reference/print.disk.frame.html +++ b/docs/reference/print.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.5.0 + 0.6.0 diff --git a/docs/reference/pull.disk.frame.html b/docs/reference/pull.disk.frame.html index ff438239..172cb910 100644 --- a/docs/reference/pull.disk.frame.html +++ b/docs/reference/pull.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.5.0 + 0.6.0 diff --git a/docs/reference/purrr_as_mapper.html b/docs/reference/purrr_as_mapper.html new file mode 100644 index 00000000..4e334c11 --- /dev/null +++ b/docs/reference/purrr_as_mapper.html @@ -0,0 +1,131 @@
+Used to convert a function to purrr syntax if needed — purrr_as_mapper • disk.frame
+purrr_as_mapper(.f)

+Arguments

+.f — a normal function or a purrr-syntax function, i.e. `~ ...code...`
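A behavioural sketch (an assumption based on the description above; the exact return value is internal to the package):

# assumed to behave like purrr::as_mapper: a formula becomes a callable function
f = purrr_as_mapper(~ .x + 1)
f(1)  # expected to give 2 under this assumption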
diff --git a/docs/reference/rbindlist.disk.frame.html b/docs/reference/rbindlist.disk.frame.html index 081dadec..b50ff95f 100644 --- a/docs/reference/rbindlist.disk.frame.html +++ b/docs/reference/rbindlist.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.5.0 + 0.6.0 @@ -122,7 +122,7 @@

Arguments

overwrite

overwrite the output directory

.progress
-

A logical, for whether or not to print a progress bar for multiprocess, multisession, and multicore plans. From furrr

+

A logical, for whether or not to show progress.

diff --git a/docs/reference/rechunk.html b/docs/reference/rechunk.html index 1ad617e3..b5f2b78c 100644 --- a/docs/reference/rechunk.html +++ b/docs/reference/rechunk.html @@ -17,7 +17,7 @@ disk.frame - 0.5.0 + 0.6.0
@@ -98,13 +98,10 @@

Increase or decrease the number of chunks in the disk.frame

rechunk(
   df,
-  nchunks,
+  nchunks = disk.frame::nchunks(df),
   outdir = attr(df, "path", exact = TRUE),
   shardby = NULL,
-  overwrite = TRUE,
-  shardby_function = "hash",
-  sort_splits = NULL,
-  desc_vars = NULL
+  overwrite = TRUE
 )
@@ -120,12 +117,6 @@

Arguments

the shardkeys

overwrite

overwrite the output directory

-
shardby_function
-

splitting of chunks: "hash" for hash function or "sort" for semi-sorted chunks

-
sort_splits
-

for the "sort" shardby function, a dataframe with the split values.

-
desc_vars
-

for the "sort" shardby function, the variables to sort descending.

@@ -135,20 +126,15 @@

Examples

# re-chunking cars.df to 3 chunks, done "in-place" to the same folder as cars.df rechunk(cars.df, 3) -#> files have been backed up to temporary dir C:\Users\RTX2080\AppData\Local\Temp\Rtmp2rQjw5\back_up_tmp_dir56f4356b56cb. You can recover there files until you restart your R session -#> path: "C:\Users\RTX2080\AppData\Local\Temp\Rtmp2rQjw5\file56f4c8a34c9.df" +#> files have been backed up to temporary dir C:\Users\RTX2080\AppData\Local\Temp\RtmpyknGIm\back_up_tmp_dir471815f02658. You can recover there files until you restart your R session +#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpyknGIm\file471862cf5821.df" #> nchunks: 3 #> nrow (at source): 50 #> ncol (at source): 2 -#> nrow (post operations): ??? -#> ncol (post operations): ??? new_path = tempfile(fileext = ".df") # re-chunking cars.df to 4 chunks, shard by speed, and done "out-of-place" to a new directory cars2.df = rechunk(cars.df, 4, outdir=new_path, shardby = "speed") -#> Hashing... -#> Hashing... -#> Hashing... #> Appending disk.frames: # clean up cars.df diff --git a/docs/reference/recommend_nchunks.html b/docs/reference/recommend_nchunks.html index e549d81c..dd1c227b 100644 --- a/docs/reference/recommend_nchunks.html +++ b/docs/reference/recommend_nchunks.html @@ -18,7 +18,7 @@ disk.frame - 0.5.0 + 0.6.0
diff --git a/docs/reference/remove_chunk.html b/docs/reference/remove_chunk.html index 974ae319..232f00e1 100644 --- a/docs/reference/remove_chunk.html +++ b/docs/reference/remove_chunk.html @@ -17,7 +17,7 @@ disk.frame - 0.5.0 + 0.6.0 @@ -116,34 +116,28 @@

Examples

# removes 3rd chunk remove_chunk(cars.df, 3) -#> path: "C:\Users\RTX2080\AppData\Local\Temp\Rtmp2rQjw5\file56f45e4b14bd.df" +#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpyknGIm\file47184b42308d.df" #> nchunks: 3 #> nrow (at source): 37 #> ncol (at source): 2 -#> nrow (post operations): ??? -#> ncol (post operations): ??? nchunks(cars.df) # 3 #> [1] 3 # removes 4th chunk remove_chunk(cars.df, "4.fst") -#> path: "C:\Users\RTX2080\AppData\Local\Temp\Rtmp2rQjw5\file56f45e4b14bd.df" +#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpyknGIm\file47184b42308d.df" #> nchunks: 2 #> nrow (at source): 26 #> ncol (at source): 2 -#> nrow (post operations): ??? -#> ncol (post operations): ??? nchunks(cars.df) # 3 #> [1] 2 # removes 2nd chunk remove_chunk(cars.df, file.path(attr(cars.df, "path", exact=TRUE), "2.fst"), full.names = TRUE) -#> path: "C:\Users\RTX2080\AppData\Local\Temp\Rtmp2rQjw5\file56f45e4b14bd.df" +#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpyknGIm\file47184b42308d.df" #> nchunks: 1 #> nrow (at source): 13 #> ncol (at source): 2 -#> nrow (post operations): ??? -#> ncol (post operations): ??? nchunks(cars.df) # 1 #> [1] 1 diff --git a/docs/reference/sample.html b/docs/reference/sample.html index 719d4244..056cef09 100644 --- a/docs/reference/sample.html +++ b/docs/reference/sample.html @@ -17,7 +17,7 @@ disk.frame - 0.5.0 + 0.6.0 @@ -127,28 +127,28 @@

Examples

collect(sample_frac(cars.df, 0.5)) #> speed dist -#> 1: 7 22 -#> 2: 7 4 -#> 3: 9 10 -#> 4: 10 26 -#> 5: 13 34 -#> 6: 11 17 -#> 7: 12 20 -#> 8: 11 28 -#> 9: 14 26 -#> 10: 14 36 -#> 11: 15 26 -#> 12: 14 80 -#> 13: 18 84 -#> 14: 18 56 -#> 15: 18 76 -#> 16: 17 50 +#> 1: 7 4 +#> 2: 4 2 +#> 3: 10 18 +#> 4: 10 34 +#> 5: 11 17 +#> 6: 13 34 +#> 7: 13 26 +#> 8: 12 28 +#> 9: 15 26 +#> 10: 13 46 +#> 11: 16 32 +#> 12: 15 20 +#> 13: 18 42 +#> 14: 17 50 +#> 15: 19 36 +#> 16: 18 76 #> 17: 20 56 -#> 18: 19 46 -#> 19: 19 68 -#> 20: 20 32 -#> 21: 25 85 -#> 22: 24 70 +#> 18: 20 64 +#> 19: 23 54 +#> 20: 20 52 +#> 21: 24 93 +#> 22: 25 85 #> speed dist # clean up cars.df diff --git a/docs/reference/setup_disk.frame.html b/docs/reference/setup_disk.frame.html index 4b1ee1c7..6d502bac 100644 --- a/docs/reference/setup_disk.frame.html +++ b/docs/reference/setup_disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.5.0 + 0.6.0 diff --git a/docs/reference/shard.html b/docs/reference/shard.html index eca41f84..7b598974 100644 --- a/docs/reference/shard.html +++ b/docs/reference/shard.html @@ -18,7 +18,7 @@ disk.frame - 0.5.0 + 0.6.0 @@ -104,10 +104,7 @@

Shard a data.frame/data.table or disk.frame into chunk and saves it into a d outdir = tempfile(fileext = ".df"), ..., nchunks = recommend_nchunks(df), - overwrite = FALSE, - shardby_function = "hash", - sort_splits = NULL, - desc_vars = NULL + overwrite = FALSE ) distribute(...) @@ -127,12 +124,6 @@

Arguments

The number of chunks

overwrite

If TRUE then the chunks are overwritten

-
shardby_function
-

splitting of chunks: "hash" for hash function or "sort" for semi-sorted chunks

-
sort_splits
-

If shardby_function is "sort", the split values for sharding

-
desc_vars
-

for the "sort" shardby function, the variables to sort descending.

@@ -140,7 +131,6 @@

Examples


 # shard the iris data.frame by Species so that rows with the same Species are in the same chunk
 iris.df = shard(iris, "Species")
-#> Hashing...
 
 # clean up iris.df
 delete(iris.df)
diff --git a/docs/reference/shardkey.html b/docs/reference/shardkey.html
index eeca73d7..2fce89e2 100644
--- a/docs/reference/shardkey.html
+++ b/docs/reference/shardkey.html
@@ -17,7 +17,7 @@
       
       
         disk.frame
-        0.5.0
+        0.6.0
       
     
diff --git a/docs/reference/shardkey_equal.html b/docs/reference/shardkey_equal.html index e51669c5..dbc9bcce 100644 --- a/docs/reference/shardkey_equal.html +++ b/docs/reference/shardkey_equal.html @@ -17,7 +17,7 @@ disk.frame - 0.5.0 + 0.6.0
diff --git a/docs/reference/show_ceremony.html b/docs/reference/show_ceremony.html index a4e1badc..d1d69c51 100644 --- a/docs/reference/show_ceremony.html +++ b/docs/reference/show_ceremony.html @@ -17,7 +17,7 @@ disk.frame - 0.5.0 + 0.6.0 diff --git a/docs/reference/srckeep.html b/docs/reference/srckeep.html index 934352ba..8bfaee95 100644 --- a/docs/reference/srckeep.html +++ b/docs/reference/srckeep.html @@ -17,7 +17,7 @@ disk.frame - 0.5.0 + 0.6.0 @@ -96,9 +96,7 @@

Keep only the variables from the input listed in selections

-
srckeep(diskf, selections, ...)
-
-srckeepchunks(diskf, chunks, ...)
+
srckeep(diskf, selections, ...)
@@ -109,8 +107,6 @@

Arguments

The list of variables to keep from the input source

...

not yet used

-
chunks
-

The chunks to load

diff --git a/docs/reference/sub-.disk.frame.html b/docs/reference/sub-.disk.frame.html index b3ecb1b5..3c758a26 100644 --- a/docs/reference/sub-.disk.frame.html +++ b/docs/reference/sub-.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.5.0 + 0.6.0
@@ -131,7 +131,39 @@

Examples

cars.df = as.disk.frame(cars)
 speed_limit = 50
 cars.df[speed < speed_limit ,.N, cut(dist, pretty(dist))]
-#> Error in .checkTypos(e, names_x): Object 'speed_limit' not found amongst speed, dist
+#> data.table syntax for disk.frame may be moved to a separate package in the future
+#>           cut N
+#>  1:     (0,5] 2
+#>  2:    (5,10] 2
+#>  3:   (20,25] 1
+#>  4:   (15,20] 2
+#>  5:   (25,30] 1
+#>  6:   (30,35] 1
+#>  7:   (15,20] 2
+#>  8:   (25,30] 3
+#>  9:   (10,15] 1
+#> 10:   (20,25] 1
+#> 11:   (30,35] 2
+#> 12:   (40,50] 1
+#> 13:   (20,30] 2
+#> 14:   (30,40] 2
+#> 15:   (50,60] 2
+#> 16:   (70,80] 1
+#> 17:      <NA> 1
+#> 18:   (30,40] 4
+#> 19:   (40,50] 2
+#> 20:   (50,60] 1
+#> 21:   (70,80] 1
+#> 22:   (80,90] 1
+#> 23:   (40,50] 2
+#> 24:   (60,70] 3
+#> 25:   (30,40] 1
+#> 26:   (50,60] 3
+#> 27:      <NA> 1
+#> 28:  (90,100] 2
+#> 29: (110,120] 1
+#> 30:   (80,90] 1
+#>           cut N
 
 # clean up
 delete(cars.df)
diff --git a/docs/reference/tbl_vars.disk.frame.html b/docs/reference/tbl_vars.disk.frame.html
index d61fa7a6..1df5aeaf 100644
--- a/docs/reference/tbl_vars.disk.frame.html
+++ b/docs/reference/tbl_vars.disk.frame.html
@@ -18,7 +18,7 @@
       
       
         disk.frame
-        0.5.0
+        0.6.0
       
     
diff --git a/docs/reference/write_disk.frame.html b/docs/reference/write_disk.frame.html index 5f3f9fdb..6eda846a 100644 --- a/docs/reference/write_disk.frame.html +++ b/docs/reference/write_disk.frame.html @@ -18,7 +18,7 @@ disk.frame - 0.5.0 + 0.6.0 @@ -99,10 +99,10 @@

Write disk.frame to disk

write_disk.frame(
-  df,
+  diskf,
   outdir = tempfile(fileext = ".df"),
-  nchunks = ifelse("disk.frame" %in% class(df), nchunks.disk.frame(df),
-    recommend_nchunks(df)),
+  nchunks = ifelse("disk.frame" %in% class(diskf), nchunks.disk.frame(diskf),
+    recommend_nchunks(diskf)),
   overwrite = FALSE,
   shardby = NULL,
   compress = 50,
@@ -117,7 +117,7 @@ 

Write disk.frame to disk

Arguments

-
df
+
diskf

a disk.frame

outdir

output directory for the disk.frame

diff --git a/docs/reference/zip_to_disk.frame.html b/docs/reference/zip_to_disk.frame.html index 9188f8ac..fe4335d3 100644 --- a/docs/reference/zip_to_disk.frame.html +++ b/docs/reference/zip_to_disk.frame.html @@ -20,7 +20,7 @@ disk.frame - 0.5.0 + 0.6.0
@@ -144,7 +144,7 @@

Examples

# read every file and convert it to a disk.frame zip.df = zip_to_disk.frame(zipfile, tempfile(fileext = ".df")) -#> Error in unzip(zipfile, list = TRUE): zip file 'C:\Users\RTX2080\AppData\Local\Temp\Rtmp2rQjw5\file56f44b886b42.zip' cannot be opened +#> Error in unzip(zipfile, list = TRUE): zip file 'C:\Users\RTX2080\AppData\Local\Temp\RtmpyknGIm\file471855d8254c.zip' cannot be opened # there is only one csv file so it return a list of one disk.frame zip.df[[1]] diff --git a/docs/sitemap.xml b/docs/sitemap.xml index 4aa93c2a..5df92161 100644 --- a/docs/sitemap.xml +++ b/docs/sitemap.xml @@ -120,6 +120,9 @@ /reference/as.disk.frame.html + + /reference/bind_rows.disk.frame.html + /reference/bloomfilter.html @@ -237,12 +240,18 @@ /reference/overwrite_check.html + + /reference/play.html + /reference/print.disk.frame.html /reference/pull.disk.frame.html + + /reference/purrr_as_mapper.html + /reference/rbindlist.disk.frame.html diff --git a/man/bind_rows.disk.frame.Rd b/man/bind_rows.disk.frame.Rd new file mode 100644 index 00000000..d69a64b0 --- /dev/null +++ b/man/bind_rows.disk.frame.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/bind_rows.r +\name{bind_rows.disk.frame} +\alias{bind_rows.disk.frame} +\title{Bind rows} +\usage{ +bind_rows.disk.frame(...) +} +\arguments{ +\item{...}{disk.frame to be row bound} +} +\description{ +Bind rows +} diff --git a/man/chunk_group_by.Rd b/man/chunk_group_by.Rd index f06836c2..fac8791c 100644 --- a/man/chunk_group_by.Rd +++ b/man/chunk_group_by.Rd @@ -5,7 +5,14 @@ \alias{chunk_summarise} \alias{chunk_group_by} \alias{chunk_ungroup} -\title{Group by within each disk.frame} +\title{#' @export +#' @importFrom dplyr add_count +#' @rdname dplyr_verbs +add_count.disk.frame <- create_chunk_mapper(dplyr::add_count) +#' @export +#' @importFrom dplyr add_tally +#' @rdname dplyr_verbs +add_tally.disk.frame <- create_chunk_mapper(dplyr::add_tally)} \usage{ chunk_summarize(.data, ...) diff --git a/man/cmap.Rd b/man/cmap.Rd index fc06d9cd..1f08ba63 100644 --- a/man/cmap.Rd +++ b/man/cmap.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/cmap.r, R/map-deprecated.r +% Please edit documentation in R/cmap.r \name{cmap} \alias{cmap} \alias{cmap.disk.frame} @@ -12,34 +12,12 @@ \alias{lazy} \alias{lazy.disk.frame} \alias{delayed} -\alias{chunk_lapply} -\alias{map} -\alias{map.disk.frame} -\alias{map.default} -\alias{imap_dfr} -\alias{imap_dfr.disk.frame} -\alias{imap_dfr.default} -\alias{imap} -\alias{imap.default} -\alias{map_dfr.disk.frame} -\alias{map_dfr.default} +\alias{clapply} \title{Apply the same function to all chunks} \usage{ cmap(.x, .f, ...) -\method{cmap}{disk.frame}( - .x, - .f, - ..., - outdir = NULL, - keep = NULL, - chunks = nchunks(.x), - compress = 50, - lazy = TRUE, - overwrite = FALSE, - vars_and_pkgs = future::getGlobalsAndPackages(.f, envir = parent.frame()), - .progress = TRUE -) +\method{cmap}{disk.frame}(.x, .f, ...) cmap_dfr(.x, .f, ..., .id = NULL) @@ -52,10 +30,9 @@ cimap(.x, .f, ...) .f, outdir = NULL, keep = NULL, - chunks = nchunks(.x), - compress = 50, lazy = TRUE, overwrite = FALSE, + compress = 50, ... ) @@ -77,58 +54,32 @@ lazy(.x, .f, ...) delayed(.x, .f, ...) -chunk_lapply(...) - -map(.x, .f, ...) - -\method{map}{disk.frame}(...) - -\method{map}{default}(.x, .f, ...) - -imap_dfr(.x, .f, ..., .id = NULL) - -\method{imap_dfr}{disk.frame}(...) - -\method{imap_dfr}{default}(.x, .f, ..., .id = NULL) - -imap(.x, .f, ...) 
- -\method{imap}{default}(.x, .f, ...) - -\method{map_dfr}{disk.frame}(...) - -\method{map_dfr}{default}(.x, .f, ..., .id = NULL) +clapply(...) } \arguments{ \item{.x}{a disk.frame} \item{.f}{a function to apply to each of the chunks} -\item{...}{for compatibility with `purrr::map`} +\item{...}{Passed to `collect` and `write_disk.frame`} -\item{outdir}{the output directory} - -\item{keep}{the columns to keep from the input} - -\item{chunks}{The number of chunks to output} +\item{.id}{ignored} -\item{compress}{0-100 fst compression ratio} - -\item{lazy}{if TRUE then do this lazily} +\item{use.names}{for cmap_dfr's call to data.table::rbindlist. See data.table::rbindlist} -\item{overwrite}{if TRUE removes any existing chunks in the data} +\item{fill}{for cmap_dfr's call to data.table::rbindlist. See data.table::rbindlist} -\item{vars_and_pkgs}{variables and packages to send to a background session. This is typically automatically detected} +\item{idcol}{for cmap_dfr's call to data.table::rbindlist. See data.table::rbindlist} -\item{.progress}{A logical, for whether or not to print a progress bar for multiprocess, multisession, and multicore plans. From {furrr}} +\item{outdir}{the output directory} -\item{.id}{not used} +\item{keep}{The columns to keep at source} -\item{use.names}{for cmap_dfr's call to data.table::rbindlist. See data.table::rbindlist} +\item{lazy}{if TRUE then do this lazily} -\item{fill}{for cmap_dfr's call to data.table::rbindlist. See data.table::rbindlist} +\item{overwrite}{Whether to overwrite any files in the output directory} -\item{idcol}{for cmap_dfr's call to data.table::rbindlist. See data.table::rbindlist} +\item{compress}{The compression setting. 0-100} } \description{ Apply the same function to all chunks @@ -166,17 +117,6 @@ cmap_dfr(cars.df, ~.x[1,]) collect(lazy(cars.df, ~.x[1,])) collect(delayed(cars.df, ~.x[1,])) -# clean up cars.df -delete(cars.df) -cars.df = as.disk.frame(cars) - -# .x is the chunk and .y is the ID as an integer - -# lazy = TRUE support is not available at the moment -cimap(cars.df, ~.x[, id := .y], lazy = FALSE) - -cimap_dfr(cars.df, ~.x[, id := .y]) - # clean up cars.df delete(cars.df) } diff --git a/man/cmap2.Rd b/man/cmap2.Rd index 721dd80e..2a03f241 100644 --- a/man/cmap2.Rd +++ b/man/cmap2.Rd @@ -2,14 +2,11 @@ % Please edit documentation in R/map2.r, R/map_by_chunk_id.r \name{cmap2} \alias{cmap2} -\alias{map2} \alias{map_by_chunk_id} \title{`cmap2` a function to two disk.frames} \usage{ cmap2(.x, .y, .f, ...) -map2(.x, .y, .f, ...) - map_by_chunk_id(.x, .y, .f, ..., outdir) } \arguments{ diff --git a/man/collect.Rd b/man/collect.Rd index 520157ea..0acee69c 100644 --- a/man/collect.Rd +++ b/man/collect.Rd @@ -6,11 +6,16 @@ \alias{collect.summarized_disk.frame} \title{Bring the disk.frame into R} \usage{ -\method{collect}{disk.frame}(x, ..., parallel = !is.null(attr(x, "lazyfn"))) +\method{collect}{disk.frame}(x, ..., parallel = !is.null(attr(x, "recordings"))) -collect_list(x, simplify = FALSE, parallel = !is.null(attr(x, "lazyfn"))) +collect_list( + x, + simplify = FALSE, + parallel = !is.null(attr(x, "recordings")), + ... 
+) -\method{collect}{summarized_disk.frame}(x, ..., parallel = !is.null(attr(x, "lazyfn"))) +\method{collect}{summarized_disk.frame}(x, ..., parallel = !is.null(attr(x, "recordings"))) } \arguments{ \item{x}{a disk.frame} diff --git a/man/compute.disk.frame.Rd b/man/compute.disk.frame.Rd index 57f1c6e2..e4de9e2d 100644 --- a/man/compute.disk.frame.Rd +++ b/man/compute.disk.frame.Rd @@ -2,26 +2,18 @@ % Please edit documentation in R/compute.r \name{compute.disk.frame} \alias{compute.disk.frame} -\title{Compute without writing} +\title{Force computations. The results are stored in a folder.} \usage{ -\method{compute}{disk.frame}( - x, - name, - outdir = tempfile("tmp_df_", fileext = ".df"), - overwrite = TRUE, - ... -) +\method{compute}{disk.frame}(x, name = NULL, outdir = tempfile("tmp_df_", fileext = ".df"), ...) } \arguments{ \item{x}{a disk.frame} -\item{name}{Not used. Kept for compatibility with dplyr} +\item{name}{If not NULL then used as outdir prefix.} \item{outdir}{the output directory} -\item{overwrite}{whether to overwrite or not} - -\item{...}{Not used. Kept for dplyr compatibility} +\item{...}{Passed to `write_disk.frame`} } \description{ Perform the computation; same as calling cmap without .f and lazy = FALSE diff --git a/man/create_chunk_mapper.Rd b/man/create_chunk_mapper.Rd index 0702093d..7be02cea 100644 --- a/man/create_chunk_mapper.Rd +++ b/man/create_chunk_mapper.Rd @@ -4,7 +4,7 @@ \alias{create_chunk_mapper} \title{Create function that applies to each chunk if disk.frame} \usage{ -create_chunk_mapper(chunk_fn, warning_msg = NULL, as.data.frame = TRUE) +create_chunk_mapper(chunk_fn, warning_msg = NULL, as.data.frame = FALSE) } \arguments{ \item{chunk_fn}{The dplyr function to create a mapper for} diff --git a/man/csv_to_disk.frame.Rd b/man/csv_to_disk.frame.Rd index 8c59d1de..b82ac585 100644 --- a/man/csv_to_disk.frame.Rd +++ b/man/csv_to_disk.frame.Rd @@ -47,8 +47,7 @@ the highest compression ratio.} \item{header}{Whether the files have header. Defaults to TRUE} -\item{.progress}{A logical, for whether or not to print a progress bar for -multiprocess, multisession, and multicore plans. From {furrr}} +\item{.progress}{A logical, for whether or not to show progress} \item{backend}{The CSV reader backend to choose: "data.table" or "readr". disk.frame does not have its own CSV reader. It uses either diff --git a/man/dplyr_verbs.Rd b/man/dplyr_verbs.Rd index 340e6958..cfebebb4 100644 --- a/man/dplyr_verbs.Rd +++ b/man/dplyr_verbs.Rd @@ -8,8 +8,6 @@ \alias{transmute.disk.frame} \alias{arrange.disk.frame} \alias{chunk_arrange} -\alias{add_tally.disk.frame} -\alias{do.disk.frame} \alias{distinct.disk.frame} \alias{chunk_distinct} \alias{glimpse.disk.frame} @@ -29,10 +27,6 @@ chunk_arrange(.data, ...) -add_tally.disk.frame(.data, ...) - -\method{do}{disk.frame}(.data, ...) - \method{distinct}{disk.frame}(...) chunk_distinct(.data, ...) diff --git a/man/group_by.Rd b/man/group_by.Rd index 1bb0d4a2..56bd42f7 100644 --- a/man/group_by.Rd +++ b/man/group_by.Rd @@ -12,7 +12,12 @@ \method{summarize}{grouped_disk.frame}(.data, ...) -\method{group_by}{disk.frame}(.data, ..., add = FALSE, .drop = dplyr::group_by_drop_default(.data)) +\method{group_by}{disk.frame}( + .data, + ..., + .add = FALSE, + .drop = stop("disk.frame does not support `.drop` in `group_by` at this stage") +) \method{summarize}{disk.frame}(.data, ...) 
@@ -23,7 +28,7 @@ \item{...}{same as the dplyr::group_by} -\item{add}{from dplyr} +\item{.add}{from dplyr} \item{.drop}{from dplyr} } diff --git a/man/hard_arrange.Rd b/man/hard_arrange.Rd deleted file mode 100644 index a866f91d..00000000 --- a/man/hard_arrange.Rd +++ /dev/null @@ -1,53 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/hard_arrange.r -\name{hard_arrange} -\alias{hard_arrange} -\alias{hard_arrange.data.frame} -\alias{hard_arrange.disk.frame} -\title{Perform a hard arrange} -\usage{ -hard_arrange(df, ..., add = FALSE, .drop = FALSE) - -\method{hard_arrange}{data.frame}(df, ...) - -\method{hard_arrange}{disk.frame}( - df, - ..., - outdir = tempfile("tmp_disk_frame_hard_arrange"), - nchunks = disk.frame::nchunks(df), - overwrite = TRUE -) -} -\arguments{ -\item{df}{a disk.frame} - -\item{...}{grouping variables} - -\item{add}{same as dplyr::arrange} - -\item{.drop}{same as dplyr::arrange} - -\item{outdir}{the output directory} - -\item{nchunks}{The number of chunks in the output. Defaults = nchunks.disk.frame(df)} - -\item{overwrite}{overwrite the out put directory} -} -\description{ -A hard_arrange is a sort by that also reorganizes the chunks to ensure that -every unique grouping of `by`` is in the same chunk. Or in other words, every -row that share the same `by` value will end up in the same chunk. -} -\examples{ -iris.df = as.disk.frame(iris, nchunks = 2) - -# arrange iris.df by specifies and ensure rows with the same specifies are in the same chunk -iris_hard.df = hard_arrange(iris.df, Species) - -get_chunk(iris_hard.df, 1) -get_chunk(iris_hard.df, 2) - -# clean up cars.df -delete(iris.df) -delete(iris_hard.df) -} diff --git a/man/hard_group_by.Rd b/man/hard_group_by.Rd deleted file mode 100644 index 3d2af379..00000000 --- a/man/hard_group_by.Rd +++ /dev/null @@ -1,65 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/hard_group_by.r -\name{hard_group_by} -\alias{hard_group_by} -\alias{hard_group_by.data.frame} -\alias{hard_group_by.disk.frame} -\title{Perform a hard group} -\usage{ -hard_group_by(df, ..., .add = FALSE, .drop = FALSE) - -\method{hard_group_by}{data.frame}(df, ..., .add = FALSE, .drop = FALSE) - -\method{hard_group_by}{disk.frame}( - df, - ..., - outdir = tempfile("tmp_disk_frame_hard_group_by"), - nchunks = disk.frame::nchunks(df), - overwrite = TRUE, - shardby_function = "hash", - sort_splits = NULL, - desc_vars = NULL, - sort_split_sample_size = 100 -) -} -\arguments{ -\item{df}{a disk.frame} - -\item{...}{grouping variables} - -\item{.add}{same as dplyr::group_by} - -\item{.drop}{same as dplyr::group_by} - -\item{outdir}{the output directory} - -\item{nchunks}{The number of chunks in the output. Defaults = nchunks.disk.frame(df)} - -\item{overwrite}{overwrite the out put directory} - -\item{shardby_function}{splitting of chunks: "hash" for hash function or "sort" for semi-sorted chunks} - -\item{sort_splits}{for the "sort" shardby function, a dataframe with the split values.} - -\item{desc_vars}{for the "sort" shardby function, the variables to sort descending.} - -\item{sort_split_sample_size}{for the "sort" shardby function, if sort_splits is null, the number of rows to sample per chunk for random splits.} -} -\description{ -A hard_group_by is a group by that also reorganizes the chunks to ensure that -every unique grouping of `by`` is in the same chunk. Or in other words, every -row that share the same `by` value will end up in the same chunk. 
diff --git a/man/hard_arrange.Rd b/man/hard_arrange.Rd
deleted file mode 100644
index a866f91d..00000000
--- a/man/hard_arrange.Rd
+++ /dev/null
@@ -1,53 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/hard_arrange.r
-\name{hard_arrange}
-\alias{hard_arrange}
-\alias{hard_arrange.data.frame}
-\alias{hard_arrange.disk.frame}
-\title{Perform a hard arrange}
-\usage{
-hard_arrange(df, ..., add = FALSE, .drop = FALSE)
-
-\method{hard_arrange}{data.frame}(df, ...)
-
-\method{hard_arrange}{disk.frame}(
-  df,
-  ...,
-  outdir = tempfile("tmp_disk_frame_hard_arrange"),
-  nchunks = disk.frame::nchunks(df),
-  overwrite = TRUE
-)
-}
-\arguments{
-\item{df}{a disk.frame}
-
-\item{...}{grouping variables}
-
-\item{add}{same as dplyr::arrange}
-
-\item{.drop}{same as dplyr::arrange}
-
-\item{outdir}{the output directory}
-
-\item{nchunks}{The number of chunks in the output. Defaults = nchunks.disk.frame(df)}
-
-\item{overwrite}{overwrite the out put directory}
-}
-\description{
-A hard_arrange is a sort by that also reorganizes the chunks to ensure that
-every unique grouping of `by`` is in the same chunk. Or in other words, every
-row that share the same `by` value will end up in the same chunk.
-}
-\examples{
-iris.df = as.disk.frame(iris, nchunks = 2)
-
-# arrange iris.df by specifies and ensure rows with the same specifies are in the same chunk
-iris_hard.df = hard_arrange(iris.df, Species)
-
-get_chunk(iris_hard.df, 1)
-get_chunk(iris_hard.df, 2)
-
-# clean up cars.df
-delete(iris.df)
-delete(iris_hard.df)
-}
diff --git a/man/hard_group_by.Rd b/man/hard_group_by.Rd
deleted file mode 100644
index 3d2af379..00000000
--- a/man/hard_group_by.Rd
+++ /dev/null
@@ -1,65 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/hard_group_by.r
-\name{hard_group_by}
-\alias{hard_group_by}
-\alias{hard_group_by.data.frame}
-\alias{hard_group_by.disk.frame}
-\title{Perform a hard group}
-\usage{
-hard_group_by(df, ..., .add = FALSE, .drop = FALSE)
-
-\method{hard_group_by}{data.frame}(df, ..., .add = FALSE, .drop = FALSE)
-
-\method{hard_group_by}{disk.frame}(
-  df,
-  ...,
-  outdir = tempfile("tmp_disk_frame_hard_group_by"),
-  nchunks = disk.frame::nchunks(df),
-  overwrite = TRUE,
-  shardby_function = "hash",
-  sort_splits = NULL,
-  desc_vars = NULL,
-  sort_split_sample_size = 100
-)
-}
-\arguments{
-\item{df}{a disk.frame}
-
-\item{...}{grouping variables}
-
-\item{.add}{same as dplyr::group_by}
-
-\item{.drop}{same as dplyr::group_by}
-
-\item{outdir}{the output directory}
-
-\item{nchunks}{The number of chunks in the output. Defaults = nchunks.disk.frame(df)}
-
-\item{overwrite}{overwrite the out put directory}
-
-\item{shardby_function}{splitting of chunks: "hash" for hash function or "sort" for semi-sorted chunks}
-
-\item{sort_splits}{for the "sort" shardby function, a dataframe with the split values.}
-
-\item{desc_vars}{for the "sort" shardby function, the variables to sort descending.}
-
-\item{sort_split_sample_size}{for the "sort" shardby function, if sort_splits is null, the number of rows to sample per chunk for random splits.}
-}
-\description{
-A hard_group_by is a group by that also reorganizes the chunks to ensure that
-every unique grouping of `by`` is in the same chunk. Or in other words, every
-row that share the same `by` value will end up in the same chunk.
-}
-\examples{
-iris.df = as.disk.frame(iris, nchunks = 2)
-
-# group_by iris.df by specifies and ensure rows with the same specifies are in the same chunk
-iris_hard.df = hard_group_by(iris.df, Species)
-
-get_chunk(iris_hard.df, 1)
-get_chunk(iris_hard.df, 2)
-
-# clean up cars.df
-delete(iris.df)
-delete(iris_hard.df)
-}
diff --git a/man/join.Rd b/man/join.Rd
index 74d6d0dd..218bd626 100644
--- a/man/join.Rd
+++ b/man/join.Rd
@@ -38,7 +38,9 @@
  y,
  by = NULL,
  copy = FALSE,
+  suffix = c(".x", ".y"),
  ...,
+  keep = FALSE,
  outdir = tempfile("tmp_disk_frame_inner_join"),
  merge_by_chunk_id = NULL,
  overwrite = TRUE,
@@ -50,7 +52,9 @@
  y,
  by = NULL,
  copy = FALSE,
+  suffix = c(".x", ".y"),
  ...,
+  keep = FALSE,
  outdir = tempfile("tmp_disk_frame_left_join"),
  merge_by_chunk_id = FALSE,
  overwrite = TRUE,
@@ -87,6 +91,10 @@
\item{overwrite}{overwrite output directory}

\item{.progress}{Show progress or not. Defaults to FALSE}
+
+\item{suffix}{see dplyr::XXX_join}
+
+\item{keep}{see dplyr::XXX_join}
}
\value{
disk.frame or data.frame/data.table
}
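The join.Rd changes above add dplyr-compatible `suffix` and `keep` arguments to the disk.frame join methods. A small sketch of `suffix` in use, assuming joining a disk.frame against an in-memory lookup table follows disk.frame's existing chunk-wise join semantics; the toy data is illustrative:

```r
library(disk.frame)
library(dplyr)
setup_disk.frame(workers = 2)

x.df = as.disk.frame(data.frame(id = 1:6, val = runif(6)), nchunks = 2)
lkp  = data.frame(id = 1:6, val = letters[1:6])

# `suffix` disambiguates the clashing `val` columns, as in dplyr
x.df %>%
  inner_join(lkp, by = "id", suffix = c(".num", ".chr")) %>%
  collect()

delete(x.df)
```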
diff --git a/man/play.Rd b/man/play.Rd
new file mode 100644
index 00000000..d252ead2
--- /dev/null
+++ b/man/play.Rd
@@ -0,0 +1,16 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/play.r
+\name{play}
+\alias{play}
+\title{Play the recorded lazy operations}
+\usage{
+play(dataframe, recordings)
+}
+\arguments{
+\item{dataframe}{A data.frame}
+
+\item{recordings}{A recording of the expression, globals and packages made using create_chunk_mapper}
+}
+\description{
+Play the recorded lazy operations
+}
diff --git a/man/purrr_as_mapper.Rd b/man/purrr_as_mapper.Rd
new file mode 100644
index 00000000..a34dd80b
--- /dev/null
+++ b/man/purrr_as_mapper.Rd
@@ -0,0 +1,14 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/util.r
+\name{purrr_as_mapper}
+\alias{purrr_as_mapper}
+\title{Used to convert a function to purrr syntax if needed}
+\usage{
+purrr_as_mapper(.f)
+}
+\arguments{
+\item{.f}{a normal function or purrr syntax function i.e. `~{ ...code...}`}
+}
+\description{
+Used to convert a function to purrr syntax if needed
+}
diff --git a/man/rbindlist.disk.frame.Rd b/man/rbindlist.disk.frame.Rd
index 9a334037..0549ebd5 100644
--- a/man/rbindlist.disk.frame.Rd
+++ b/man/rbindlist.disk.frame.Rd
@@ -27,7 +27,7 @@ rbindlist.disk.frame(
\item{overwrite}{overwrite the output directory}

-\item{.progress}{A logical, for whether or not to print a progress bar for multiprocess, multisession, and multicore plans. From {furrr}}
+\item{.progress}{A logical, for whether or not to show progress.}
}
\description{
rbindlist disk.frames together
}
diff --git a/man/rechunk.Rd b/man/rechunk.Rd
index ce74ac58..086c53b0 100644
--- a/man/rechunk.Rd
+++ b/man/rechunk.Rd
@@ -6,13 +6,10 @@
\usage{
rechunk(
  df,
-  nchunks,
+  nchunks = disk.frame::nchunks(df),
  outdir = attr(df, "path", exact = TRUE),
  shardby = NULL,
-  overwrite = TRUE,
-  shardby_function = "hash",
-  sort_splits = NULL,
-  desc_vars = NULL
+  overwrite = TRUE
)
}
\arguments{
@@ -25,12 +22,6 @@ rechunk(
\item{shardby}{the shardkeys}

\item{overwrite}{overwrite the output directory}
-
-\item{shardby_function}{splitting of chunks: "hash" for hash function or "sort" for semi-sorted chunks}
-
-\item{sort_splits}{for the "sort" shardby function, a dataframe with the split values.}
-
-\item{desc_vars}{for the "sort" shardby function, the variables to sort descending.}
}
\description{
Increase or decrease the number of chunks in the disk.frame
}
diff --git a/man/shard.Rd b/man/shard.Rd
index 09b8ef5c..a5250d70 100644
--- a/man/shard.Rd
+++ b/man/shard.Rd
@@ -11,10 +11,7 @@ shard(
  outdir = tempfile(fileext = ".df"),
  ...,
  nchunks = recommend_nchunks(df),
-  overwrite = FALSE,
-  shardby_function = "hash",
-  sort_splits = NULL,
-  desc_vars = NULL
+  overwrite = FALSE
)

distribute(...)
@@ -31,12 +28,6 @@ distribute(...)
\item{nchunks}{The number of chunks}

\item{overwrite}{If TRUE then the chunks are overwritten}
-
-\item{shardby_function}{splitting of chunks: "hash" for hash function or "sort" for semi-sorted chunks}
-
-\item{sort_splits}{If shardby_function is "sort", the split values for sharding}
-
-\item{desc_vars}{for the "sort" shardby function, the variables to sort descending.}
}
\description{
Shard a data.frame/data.table or disk.frame into chunk and saves it into a disk.frame
}
diff --git a/man/srckeep.Rd b/man/srckeep.Rd
index c377c1c1..e4c4accc 100644
--- a/man/srckeep.Rd
+++ b/man/srckeep.Rd
@@ -2,12 +2,9 @@
% Please edit documentation in R/srckeep.disk.frame.r
\name{srckeep}
\alias{srckeep}
-\alias{srckeepchunks}
\title{Keep only the variables from the input listed in selections}
\usage{
srckeep(diskf, selections, ...)
-
-srckeepchunks(diskf, chunks, ...)
}
\arguments{
\item{diskf}{a disk.frame}
@@ -15,8 +12,6 @@ srckeepchunks(diskf, chunks, ...)
\item{selections}{The list of variables to keep from the input source}

\item{...}{not yet used}
-
-\item{chunks}{The chunks to load}
}
\description{
Keep only the variables from the input listed in selections
}
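With `srckeepchunks` removed, `srckeep` remains the way to restrict which columns are read off disk before any chunk is loaded. A minimal sketch; the dataset and column are illustrative:

```r
library(disk.frame)
library(dplyr)

cars.df = as.disk.frame(cars, nchunks = 2)

# only the `dist` column is read from the fst-backed chunks;
# `speed` is never loaded into memory
cars.df %>%
  srckeep("dist") %>%
  collect() %>%
  summary()

delete(cars.df)
```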
diff --git a/man/write_disk.frame.Rd b/man/write_disk.frame.Rd
index 88977489..60c2bf09 100644
--- a/man/write_disk.frame.Rd
+++ b/man/write_disk.frame.Rd
@@ -6,10 +6,10 @@
\title{Write disk.frame to disk}
\usage{
write_disk.frame(
-  df,
+  diskf,
  outdir = tempfile(fileext = ".df"),
-  nchunks = ifelse("disk.frame" \%in\% class(df), nchunks.disk.frame(df),
-    recommend_nchunks(df)),
+  nchunks = ifelse("disk.frame" \%in\% class(diskf), nchunks.disk.frame(diskf),
+    recommend_nchunks(diskf)),
  overwrite = FALSE,
  shardby = NULL,
  compress = 50,
@@ -22,7 +22,7 @@ write_disk.frame(

output_disk.frame(...)
}
\arguments{
-\item{df}{a disk.frame}
+\item{diskf}{a disk.frame}

\item{outdir}{output directory for the disk.frame}
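The write_disk.frame.Rd hunk above renames the first argument from `df` to `diskf`. A short sketch of writing a disk.frame out to a chosen folder under the new signature; paths and chunk counts are illustrative:

```r
library(disk.frame)

cars.df = as.disk.frame(cars, nchunks = 2)

# the first argument is now `diskf`; re-shard into 4 chunks on write
out.df = write_disk.frame(
  cars.df,
  outdir = tempfile(fileext = ".df"),
  nchunks = 4,
  overwrite = TRUE
)
nchunks(out.df)

delete(out.df)
delete(cars.df)
```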
diff --git a/misc/NAMESPACE_20190205 b/misc/NAMESPACE_20190205
deleted file mode 100644
index 72ead0da..00000000
--- a/misc/NAMESPACE_20190205
+++ /dev/null
@@ -1,99 +0,0 @@
-# Generated by roxygen2: do not edit by hand
-
-S3method("[",disk.frame)
-S3method(anti_join,disk.frame)
-S3method(arrange_,disk.frame)
-S3method(as.data.frame,disk.frame)
-S3method(as.data.table,disk.frame)
-S3method(collect,disk.frame)
-S3method(compute,disk.frame)
-S3method(delayed,disk.frame)
-S3method(do_,disk.frame)
-S3method(filter_,disk.frame)
-S3method(full_join,disk.frame)
-S3method(get_chunk,disk.frame)
-S3method(glimpse,disk.frame)
-S3method(group_by,disk.frame)
-S3method(group_by_,disk.frame)
-S3method(groups,disk.frame)
-S3method(hard_group_by,disk.frame)
-S3method(head,disk.frame)
-S3method(inner_join,disk.frame)
-S3method(left_join,disk.frame)
-S3method(merge,disk.frame)
-S3method(mutate_,disk.frame)
-S3method(names,disk.frame)
-S3method(nchunk,disk.frame)
-S3method(nchunks,disk.frame)
-S3method(ncol,default)
-S3method(ncol,disk.frame)
-S3method(nrow,default)
-S3method(nrow,disk.frame)
-S3method(print,disk.frame)
-S3method(print,xgdf_scorecard)
-S3method(rename_,disk.frame)
-S3method(sample_frac,disk.frame)
-S3method(select_,disk.frame)
-S3method(semi_join,disk.frame)
-S3method(summarise_,disk.frame)
-S3method(tail,disk.frame)
-S3method(tbl_vars,disk.frame)
-S3method(transmute_,disk.frame)
-export(add_chunk)
-export(add_meta)
-export(as.disk.frame)
-export(auc)
-export(chunk_lapply)
-export(collect_list)
-export(colnames)
-export(csv_to_disk.frame)
-export(delayed)
-export(disk.frame)
-export(distribute)
-export(evalparseglue)
-export(foverlaps.disk.frame)
-export(get_chunk)
-export(hard_group_by)
-export(is_disk.frame)
-export(lazy)
-export(map.disk.frame)
-export(map_by_chunk_id)
-export(nchunk)
-export(nchunks)
-export(ncol)
-export(nrow)
-export(overwrite_check)
-export(rbindlist.disk.frame)
-export(rechunk)
-export(recommend_nchunks)
-export(remove_chunk)
-export(shard)
-export(shardkey)
-export(srckeep)
-export(write_disk.frame)
-export(zip_to_disk.frame)
-import(base)
-import(dplyr)
-import(dtplyr)
-import(fs)
-import(fst)
-import(furrr)
-import(purrr)
-import(stringr)
-importFrom(Rcpp,evalCpp)
-importFrom(data.table,as.data.table)
-importFrom(data.table,data.table)
-importFrom(data.table,foverlaps)
-importFrom(data.table,rbindlist)
-importFrom(data.table,setDT)
-importFrom(furrr,future_map_dfr)
-importFrom(future.apply,future_lapply)
-importFrom(glue,glue)
-importFrom(jsonlite,fromJSON)
-importFrom(jsonlite,toJSON)
-importFrom(pryr,object_size)
-importFrom(purrr,map_dfr)
-importFrom(xgboost,xgb.DMatrix)
-importFrom(xgboost,xgb.save)
-importFrom(xgboost,xgboost)
-useDynLib(disk.frame)
diff --git a/misc/NAMESPACE_ok b/misc/NAMESPACE_ok
deleted file mode 100644
index 18b7d99c..00000000
--- a/misc/NAMESPACE_ok
+++ /dev/null
@@ -1,61 +0,0 @@
-# Generated by roxygen2: do not edit by hand
-useDynLib(disk.frame, .registration=TRUE)
-importFrom(Rcpp,evalCpp)
-S3method("[",disk.frame)
-S3method(anti_join,disk.frame)
-S3method(as.data.frame,disk.frame)
-S3method(as.data.table,disk.frame)
-S3method(collect,disk.frame)
-S3method(do_,disk.frame)
-S3method(filter_,disk.frame)
-S3method(get_chunk,disk.frame)
-S3method(group_by_,disk.frame)
-S3method(groups,disk.frame)
-S3method(hard_group_by,disk.frame)
-S3method(head,disk.frame)
-S3method(inner_join,disk.frame)
-S3method(keep,disk.frame)
-S3method(left_join,disk.frame)
-S3method(map,disk.frame)
-S3method(merge,disk.frame)
-S3method(mutate_,disk.frame)
-S3method(names,disk.frame)
-S3method(nchunk,disk.frame)
-S3method(nchunks,disk.frame)
-S3method(nrow,default)
-S3method(nrow,disk.frame)
-S3method(print,disk.frame)
-S3method(rename_,disk.frame)
-S3method(select_,disk.frame)
-S3method(semi_join,disk.frame)
-S3method(summarise_,disk.frame)
-S3method(tail,disk.frame)
-S3method(tbl_vars,disk.frame)
-S3method(transmute_,disk.frame)
-export(chunk_lapply)
-export(collect)
-export(colnames.disk.frame)
-export(delayed)
-export(disk.frame)
-export(distribute)
-export(foverlaps.disk.frame)
-export(get_chunk)
-export(hard_group_by)
-export(keep)
-export(lazy)
-export(map)
-export(nchunk)
-export(nchunks)
-export(nrow)
-export(progressbar)
-export(rbindlist.disk.frame)
-export(shard)
-import(data.table)
-import(dplyr)
-import(dtplyr)
-import(fs)
-import(fst)
-import(future)
-import(future.apply)
-import(glue)
-import(purrr)
diff --git a/misc/disk.frame-report.html b/misc/disk.frame-report.html
index 12c97232..5f8a1d71 100644
--- a/misc/disk.frame-report.html
+++ b/misc/disk.frame-report.html
[diff body omitted: this is the auto-generated covr HTML coverage report, which came through garbled. The recoverable changes: the coverage figure in the report header moves from "disk.frame coverage - 51.96%" to "disk.frame coverage - 52.99%", the per-file coverage table is regenerated, and per-line execution counts are refreshed (e.g. 517x -> 499x, 377x -> 361x, 654x -> 628x, 104x -> 103x).]