diff --git a/.Rbuildignore b/.Rbuildignore
index 5c5d5b95..5d380324 100644
--- a/.Rbuildignore
+++ b/.Rbuildignore
@@ -1,3 +1,5 @@
+^renv$
+^renv\.lock$
 ^.*\.Rproj$
 ^\.github$
 ^manuscript$
@@ -65,4 +67,6 @@ vignettes.Rnw.template
 ^codecov\.yml$
 new-nse-dev.r
 test-poorman.R
-*.parquet
\ No newline at end of file
+.parquet$
+maditr-devs.r
+^CRAN-SUBMISSION$
diff --git a/CRAN-RELEASE b/CRAN-RELEASE
deleted file mode 100644
index 48d603de..00000000
--- a/CRAN-RELEASE
+++ /dev/null
@@ -1,2 +0,0 @@
-This package was submitted to CRAN on 2021-03-12.
-Once it is accepted, delete this file and tag the release (commit 34bafaa).
diff --git a/DESCRIPTION b/DESCRIPTION
index 0f848ee8..1ce9bbee 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,8 +1,8 @@
 Type: Package
 Package: disk.frame
 Title: Larger-than-RAM Disk-Based Data Manipulation Framework
-Version: 0.5.0
-Date: 2021-05-09
+Version: 0.6.0
+Date: 2022-01-31
 Authors@R: c(
     person("Dai", "ZJ", email = "zhuojia.dai@gmail.com", role = c("aut", "cre")),
     person("Jacky", "Poon", role = c("ctb"))
@@ -17,27 +17,24 @@ License: MIT + file LICENSE
 Imports:
     Rcpp (>= 0.12.13),
     glue (>= 1.3.1),
-    rlang (>= 0.4.0),
     future.apply (>= 1.3.0),
     fs (>= 1.3.1),
     jsonlite (>= 1.6),
     pryr (>= 0.1.4),
     stringr (>= 1.4.0),
    fst (>= 0.8.0),
-    globals (>= 0.12.4),
     future (>= 1.14.0),
     data.table (>= 1.12.2),
     crayon (>= 1.3.4),
     bigreadr (>= 0.2.0),
-    furrr (>= 0.2.2),
     bit64,
-    benchmarkme
+    benchmarkme,
+    purrr (>= 0.3.2),
+    rlang
 Depends:
     R (>= 3.4),
-    dplyr (>= 1.0.0),
-    purrr (>= 0.3.2)
+    dplyr (>= 1.0.0)
 Suggests:
-    testthat (>= 2.1.0),
     nycflights13,
     magrittr,
     shiny,
@@ -49,10 +46,11 @@ Suggests:
     speedglm,
     broom,
     ggplot2,
-    covr
+    rmarkdown
 LinkingTo:
     Rcpp
-RoxygenNote: 7.1.1
+RoxygenNote: 7.1.2
+VignetteBuilder: rmarkdown
 Encoding: UTF-8
 URL: https://diskframe.com
 BugReports: https://github.com/xiaodaigh/disk.frame/issues
diff --git a/NAMESPACE b/NAMESPACE
index da78d075..2def4912 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -17,7 +17,6 @@ S3method(colnames,disk.frame)
 S3method(compute,disk.frame)
 S3method(delayed,disk.frame)
 S3method(distinct,disk.frame)
-S3method(do,disk.frame)
 S3method(filter,disk.frame)
 S3method(full_join,disk.frame)
 S3method(get_chunk,disk.frame)
@@ -25,23 +24,10 @@ S3method(glimpse,disk.frame)
 S3method(group_by,disk.frame)
 S3method(group_vars,disk.frame)
 S3method(groups,disk.frame)
-S3method(hard_arrange,data.frame)
-S3method(hard_arrange,disk.frame)
-S3method(hard_group_by,data.frame)
-S3method(hard_group_by,disk.frame)
 S3method(head,disk.frame)
-S3method(imap,default)
-S3method(imap_dfr,default)
-S3method(imap_dfr,disk.frame)
 S3method(inner_join,disk.frame)
 S3method(lazy,disk.frame)
 S3method(left_join,disk.frame)
-S3method(map,default)
-S3method(map,disk.frame)
-S3method(map2,default)
-S3method(map2,disk.frame)
-S3method(map_dfr,default)
-S3method(map_dfr,disk.frame)
 S3method(merge,disk.frame)
 S3method(mutate,disk.frame)
 S3method(names,disk.frame)
@@ -67,22 +53,22 @@ S3method(transmute,disk.frame)
 export(IQR_df.chunk_agg.disk.frame)
 export(IQR_df.collected_agg.disk.frame)
 export(add_chunk)
-export(add_tally.disk.frame)
 export(all_df.chunk_agg.disk.frame)
 export(all_df.collected_agg.disk.frame)
 export(any_df.chunk_agg.disk.frame)
 export(any_df.collected_agg.disk.frame)
 export(as.disk.frame)
+export(bind_rows.disk.frame)
 export(ceremony_text)
 export(chunk_arrange)
 export(chunk_distinct)
 export(chunk_group_by)
-export(chunk_lapply)
 export(chunk_summarise)
 export(chunk_summarize)
 export(chunk_ungroup)
 export(cimap)
 export(cimap_dfr)
+export(clapply)
 export(cmap)
 export(cmap2)
 export(cmap_dfr)
@@ -102,18 +88,12 @@ export(foverlaps.disk.frame)
 export(gen_datatable_synthetic)
 export(get_chunk)
 export(get_chunk_ids)
-export(hard_arrange)
-export(hard_group_by)
-export(imap)
-export(imap_dfr)
 export(insert_ceremony)
 export(is_disk.frame)
 export(lazy)
 export(length_df.chunk_agg.disk.frame)
 export(length_df.collected_agg.disk.frame)
 export(make_glm_streaming_fn)
-export(map)
-export(map2)
 export(map_by_chunk_id)
 export(max_df.chunk_agg.disk.frame)
 export(max_df.collected_agg.disk.frame)
@@ -148,7 +128,6 @@ export(shardkey_equal)
 export(show_boilerplate)
 export(show_ceremony)
 export(srckeep)
-export(srckeepchunks)
 export(sum_df.chunk_agg.disk.frame)
 export(sum_df.collected_agg.disk.frame)
 export(var_df.chunk_agg.disk.frame)
@@ -172,10 +151,8 @@ importFrom(data.table,foverlaps)
 importFrom(data.table,fread)
 importFrom(data.table,rbindlist)
 importFrom(data.table,setDT)
-importFrom(data.table,setkey)
 importFrom(data.table,setkeyv)
 importFrom(data.table,timetaken)
-importFrom(dplyr,add_tally)
 importFrom(dplyr,anti_join)
 importFrom(dplyr,arrange)
 importFrom(dplyr,bind_rows)
@@ -218,7 +195,6 @@ importFrom(future,nbrOfWorkers)
 importFrom(future,plan)
 importFrom(future,sequential)
 importFrom(future.apply,future_lapply)
-importFrom(globals,findGlobals)
 importFrom(glue,glue)
 importFrom(jsonlite,fromJSON)
 importFrom(jsonlite,toJSON)
@@ -230,9 +206,7 @@ importFrom(purrr,map2)
 importFrom(purrr,map_chr)
 importFrom(purrr,map_dfr)
 importFrom(purrr,map_lgl)
-importFrom(rlang,enquos)
-importFrom(rlang,eval_tidy)
-importFrom(rlang,quo)
+importFrom(rlang,enexpr)
 importFrom(stats,median)
 importFrom(stats,quantile)
 importFrom(stats,runif)
@@ -240,9 +214,6 @@ importFrom(stringr,fixed)
 importFrom(utils,capture.output)
 importFrom(utils,head)
 importFrom(utils,memory.limit)
-importFrom(utils,methods)
-importFrom(utils,setTxtProgressBar)
 importFrom(utils,tail)
-importFrom(utils,txtProgressBar)
 importFrom(utils,unzip)
 useDynLib(disk.frame)
diff --git a/NEWS.md b/NEWS.md
index 088222ce..e075c6da 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,8 @@
+# disk.frame 0.6
+* Much better NSE support in disk.frame!
+* removed `hard_arrange` and `hard_group_by`
+* various API updates
+
 # disk.frame 0.5
 * removed `add_count` method
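For context on the first NEWS bullet: the 0.6 NSE rework (the `create_chunk_mapper()` recording mechanism further down this diff) captures globals from the calling environment chain. A minimal sketch of the behaviour being claimed -- the variable name `multiplier` and the use of `cars` are illustrative only, not from this changeset:

```r
library(disk.frame)
library(dplyr)

cars.df = as.disk.frame(cars)

# a local variable referenced inside a verb; under 0.6 it is picked up
# by walking up the environment chain, no future.globals hints needed
multiplier = 2
cars.df %>%
  mutate(dist2 = dist * multiplier) %>%
  collect()

delete(cars.df)  # clean up, as in the package examples
```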
diff --git a/R/add_chunk.r b/R/add_chunk.r
index 215feb85..08fee3d2 100644
--- a/R/add_chunk.r
+++ b/R/add_chunk.r
@@ -116,9 +116,10 @@ add_chunk <- function(df, chunk, chunk_id = NULL, full.names = FALSE, ...) {
   data.table::setDT(check_vars)
 
   if(nrow(check_vars[is.na(new_chunk)]) > 0) {
+    vars_strings = paste0(check_vars[is.na(new_chunk), colnames], collapse=',\n ')
     warning(
-      glue::glue(
-        "these variables are in the disk.frame but not in the new chunk: \n {paste0(check_vars[is.na(new_chunk), colnames], collapse=',\n ')}"))
+      sprintf(
+        "these variables are in the disk.frame but not in the new chunk: \n %s", vars_strings))
   }
   if(nrow(check_vars[is.na(existing_df)]) > 0){
     warning(glue::glue("these variables are in the new chunk but not in the existing disk.frame: {paste0(check_vars[is.na(existing_df), colnames], collapse=', ')}"))
diff --git a/R/anti_join.r b/R/anti_join.r
index dc473a6e..2108bdf6 100644
--- a/R/anti_join.r
+++ b/R/anti_join.r
@@ -3,9 +3,10 @@
 #' @param merge_by_chunk_id the merge is performed by chunk id
 #' @param overwrite overwrite output directory
 #' @param .progress Show progress or not. Defaults to FALSE
+#' @param suffix see dplyr::XXX_join
+#' @param keep see dplyr::XXX_join
 #' @param ... same as dplyr's joins
 #' @rdname join
-#' @importFrom rlang quo enquos
 #' @importFrom dplyr anti_join left_join full_join semi_join inner_join
 #' @return disk.frame or data.frame/data.table
 #' @export
@@ -29,11 +30,11 @@ anti_join.disk.frame <- function(x, y, by=NULL, copy=FALSE, ..., outdir = tempfi
   overwrite_check(outdir, overwrite)
 
   if("data.frame" %in% class(y)) {
-    quo_dotdotdot = enquos(...)
-    cmap_dfr.disk.frame(x, ~{
-      code = quo(anti_join(.x, y, by = by, copy = copy, !!!quo_dotdotdot))
-      rlang::eval_tidy(code)
+    tmp = cmap.disk.frame(x, ~{
+      anti_join(.x, y, by = by, copy = copy, ...)
     }, .progress = .progress)
+
+    return(tmp)
   } else if("disk.frame" %in% class(y)) {
     if(is.null(merge_by_chunk_id)) {
       stop("both x and y are disk.frames. You need to specify merge_by_chunk_id = TRUE or FALSE explicitly")
@@ -47,12 +48,12 @@ anti_join.disk.frame <- function(x, y, by=NULL, copy=FALSE, ..., outdir = tempfi
 
     if (merge_by_chunk_id == FALSE) {
       warning("merge_by_chunk_id = FALSE. This will take significantly longer and the preparations needed are performed eagerly which may lead to poor performance. Consider making y a data.frame or set merge_by_chunk_id = TRUE for better performance.")
-      x = hard_group_by(x, by, nchunks = max(ncy,ncx), overwrite = TRUE)
-      y = hard_group_by(y, by, nchunks = max(ncy,ncx), overwrite = TRUE)
+      ncxy = max(ncy,ncx)
+      x = rechunk(x, shardby=by, nchunks = ncxy, outdir=tempfile(fileext = ".jdf"), overwrite = FALSE)
+      y = rechunk(y, shardby=by, nchunks =ncxy, outdir=tempfile(fileext = ".jdf"), overwrite = FALSE)
       return(anti_join.disk.frame(x, y, by, copy = copy, outdir = outdir, merge_by_chunk_id = TRUE, overwrite = overwrite))
     } else if ((identical(shardkey(x)$shardkey, "") & identical(shardkey(y)$shardkey, "")) | identical(shardkey(x), shardkey(y))) {
       res = cmap2.disk.frame(x, y, ~{
-        #res = cmap2(x, y, ~{
         if(is.null(.y)) {
           return(.x)
         } else if (is.null(.x)) {
diff --git a/R/as.disk.frame.r b/R/as.disk.frame.r
index f55c2a78..2e208e03 100644
--- a/R/as.disk.frame.r
+++ b/R/as.disk.frame.r
@@ -25,7 +25,6 @@
 #' delete(cars_new_location.df)
 #' delete(cars_chunks.df)
 as.disk.frame <- function(df, outdir = tempfile(fileext = ".df"), nchunks = recommend_nchunks(df), overwrite = FALSE, shardby = NULL, compress = 50,...) {
-  stopifnot("data.frame" %in% class(df))
   overwrite_check(outdir, overwrite)
 
   data.table::setDT(df)
diff --git a/R/bind_rows.r b/R/bind_rows.r
new file mode 100644
index 00000000..59b6cf4b
--- /dev/null
+++ b/R/bind_rows.r
@@ -0,0 +1,6 @@
+#' Bind rows
+#' @param ... disk.frame to be row bound
+#' @export
+bind_rows.disk.frame <- function(...) {
+  rbindlist.disk.frame(list(...))
+}
\ No newline at end of file
diff --git a/R/chunk_mapper.r b/R/chunk_mapper.r
index 2063252a..1b0d3ff7 100644
--- a/R/chunk_mapper.r
+++ b/R/chunk_mapper.r
@@ -31,59 +31,49 @@
 #' @param chunk_fn The dplyr function to create a mapper for
 #' @param warning_msg The warning message to display when invoking the mapper
 #' @param as.data.frame force the input chunk of a data.frame; needed for dtplyr
-#' @importFrom rlang enquos quo
 #' @export
-create_chunk_mapper <- function(chunk_fn, warning_msg = NULL, as.data.frame = TRUE) {
-  return_func <- function(.data, ...) {
-    if (!is.null(warning_msg)) {
+create_chunk_mapper <- function(chunk_fn, warning_msg = NULL, as.data.frame = FALSE) {
+  if(as.data.frame) {
+    warning("`as.data.frame` is deprecated in create_chunk_mapper")
+  }
+
+  return(function(.data, ...) {
+    if(!is.null(warning_msg)) {
       warning(warning_msg)
     }
+    # you need to use list otherwise the names will be gone
+    code = substitute(chunk_fn(.disk.frame.chunk, ...))
 
-    quo_dotdotdot = rlang::enquos(...)
+    if (paste0(deparse(code), collapse="") == "chunk_fn(NULL)") {
+      globals_and_pkgs = future::getGlobalsAndPackages(expression(chunk_fn()))
+    } else {
+      globals_and_pkgs = future::getGlobalsAndPackages(code)
+    }
 
-    # this is designed to capture any global stuff
-    vars_and_pkgs = future::getGlobalsAndPackages(quo_dotdotdot)
-    data_for_eval_tidy = force(vars_and_pkgs$globals)
-    res = cmap(.data, ~{
-
-      this_env = environment()
-
-      if(length(data_for_eval_tidy) > 0) {
-        for(i in 1:length(data_for_eval_tidy)) {
-          assign(names(data_for_eval_tidy)[i], data_for_eval_tidy[[i]], pos = this_env)
-        }
-      }
-
-      lapply(quo_dotdotdot, function(x) {
-        attr(x, ".Environment") = this_env
-      })
-
-      if(as.data.frame) {
-        if("grouped_df" %in% class(.x)) {
-          code = rlang::quo(chunk_fn(.x, !!!quo_dotdotdot))
-        } else {
-          code = rlang::quo(chunk_fn(as.data.frame(.x), !!!quo_dotdotdot))
-        }
-      } else {
-        code = rlang::quo(chunk_fn(.x, !!!quo_dotdotdot))
+    global_vars = globals_and_pkgs$globals
+
+    env = parent.frame()
+
+    done = identical(env, emptyenv()) || identical(env, globalenv())
+
+    # keep adding global variables by moving up the environment chain
+    while(!done) {
+      tmp_globals_and_pkgs = future::getGlobalsAndPackages(code, envir = env)
+      new_global_vars = tmp_globals_and_pkgs$globals
+      for (name in setdiff(names(new_global_vars), names(global_vars))) {
+        global_vars[[name]] <- new_global_vars[[name]]
       }
-      # ZJ: we need both approaches. TRUST ME
-      # TODO better NSE at some point need dist
-      tryCatch({
-        return(rlang::eval_tidy(code))
-      }, error = function(e) {
-        as_label_code = rlang::as_label(code)
-        if(as_label_code == "chunk_fn(...)") {
-          stop(glue::glue("disk.frame has detected a syntax error in \n\n`{code}`\n\n. If you believe your syntax is correct, raise an issue at https://github.com/xiaodaigh/disk.frame with a MWE"))
-        } else {
-          # likely to be dealing with data.tables
          return(eval(parse(text=as_label_code), envir = this_env))
-        }
-      })
-    }, lazy = TRUE)
-  }
-  return_func
-}
\ No newline at end of file
+      done = identical(env, emptyenv()) || identical(env, globalenv())
+      env = parent.env(env)
+    }
+
+    globals_and_pkgs$globals = global_vars
+
+    attr(.data, "recordings") = c(attr(.data, "recordings"), list(globals_and_pkgs))
+
+    .data
+  })
+}
diff --git a/R/clapply.r b/R/clapply.r
new file mode 100644
index 00000000..e69de29b
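The new `create_chunk_mapper()` above evaluates nothing eagerly: it records the substituted call plus its captured globals on the `recordings` attribute, and playback happens at `collect()`/`get_chunk()` time. A sketch of defining a chunk-wise verb with it, assuming the playback machinery (defined outside this diff) replays the recording against each chunk; `chunk_head` is an illustrative name:

```r
library(disk.frame)
library(magrittr)

# mirrors how R/dplyr_verbs.r (later in this diff) defines verbs, e.g.
# filter.disk.frame <- create_chunk_mapper(dplyr::filter)
chunk_head = create_chunk_mapper(utils::head)

cars.df = as.disk.frame(cars, nchunks = 2)
chunk_head(cars.df, n = 2) %>% collect()  # first 2 rows of each chunk

delete(cars.df)
```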
diff --git a/R/cmap.r b/R/cmap.r
index ca18a26c..a9e578ce 100644
--- a/R/cmap.r
+++ b/R/cmap.r
@@ -2,20 +2,15 @@
 #' @param .x a disk.frame
 #' @param .f a function to apply to each of the chunks
 #' @param outdir the output directory
-#' @param keep the columns to keep from the input
-#' @param chunks The number of chunks to output
 #' @param lazy if TRUE then do this lazily
-#' @param compress 0-100 fst compression ratio
-#' @param overwrite if TRUE removes any existing chunks in the data
 #' @param use.names for cmap_dfr's call to data.table::rbindlist. See data.table::rbindlist
 #' @param fill for cmap_dfr's call to data.table::rbindlist. See data.table::rbindlist
 #' @param idcol for cmap_dfr's call to data.table::rbindlist. See data.table::rbindlist
-#' @param vars_and_pkgs variables and packages to send to a background session. This is typically automatically detected
-#' @param .progress A logical, for whether or not to print a progress bar for multiprocess, multisession, and multicore plans. From {furrr}
-#' @param ... for compatibility with `purrr::map`
-#' @import fst
-#' @importFrom purrr as_mapper map
-#' @importFrom future.apply future_lapply
+#' @param .id ignored
+#' @param keep The columns to keep at source
+#' @param compress The compression setting. 0-100
+#' @param overwrite Whether to overwrite any files in the output directory
+#' @param ... Passed to `collect` and `write_disk.frame`
 #' @export
 #' @examples
 #' cars.df = as.disk.frame(cars)
@@ -56,78 +51,11 @@ cmap <- function(.x, .f, ...) {
 cmap.disk.frame <- function(
   .x,
   .f,
-  ...,
-  outdir = NULL,
-  keep = NULL,
-  chunks = nchunks(.x),
-  compress = 50,
-  lazy = TRUE,
-  overwrite = FALSE,
-  vars_and_pkgs = future::getGlobalsAndPackages(.f, envir = parent.frame()),
-  .progress = TRUE) {
+  ...) {
   .f = purrr::as_mapper(.f)
 
-  if(lazy) {
-    attr(.x, "lazyfn") =
-      c(
-        attr(.x, "lazyfn"),
-        list(
-          list(
-            func = .f,
-            vars_and_pkgs = vars_and_pkgs,
-            dotdotdot = list(...)
-          )
-        )
-      )
-    return(.x)
-  }
-
-  if(!is.null(outdir)) {
-    overwrite_check(outdir, overwrite)
-  }
-
-  stopifnot(is_ready(.x))
-
-  keep1 = attr(.x,"keep", exact=TRUE)
-
-  if(is.null(keep)) {
-    keep = keep1
-  }
-
-  path <- attr(.x, "path")
-  files <- list.files(path, full.names = TRUE)
-  files_shortname <- list.files(path)
-
-  keep_future = keep
-
-  cid = get_chunk_ids(.x, full.names = TRUE)
-
-  dotdotdot = list(...)
-  res = future.apply::future_lapply(1:length(files), function(ii, ...) {
-    #res = lapply(1:length(files), function(ii) {
-    ds = disk.frame::get_chunk(.x, cid[ii], keep=keep_future, full.names = TRUE)
-
-    res = .f(ds, ...)
-    # res = do.call(.f, c(ds, dotdotdot))
-
-    if(!is.null(outdir)) {
-      if(nrow(res) == 0) {
-        warning(glue::glue("The output chunk has 0 row, therefore chunk {ii} NOT written"))
-      } else {
-        fst::write_fst(res, file.path(outdir, files_shortname[ii]), compress)
-      }
-      return(ii)
-    } else {
-      return(res)
-    }
-  }, ...,
-  future.seed=TRUE # to get rid of the error TODO investigate making this better
-  )
-
-  if(!is.null(outdir)) {
-    return(disk.frame(outdir))
-  } else {
-    return(res)
-  }
+  result = create_chunk_mapper(.f)(.x, ...)
+  return(result)
 }
 
 #' @export
@@ -143,8 +71,8 @@ cmap_dfr.disk.frame <- function(.x, .f, ..., .id = NULL, use.names = fill, fill
     warning(".id is not NULL, but the parameter is not used with cmap_dfr.disk.frame")
   }
 
-  # TODO warn the user if outdir is cmap_dfr
-  data.table::rbindlist(cmap.disk.frame(.x, .f, ..., outdir = NULL, lazy = FALSE), use.names = use.names, fill = fill, idcol = idcol)
+  list_df = collect_list(cmap.disk.frame(.x, .f, ...))
+  data.table::rbindlist(list_df, use.names = use.names, fill = fill, idcol = idcol)
 }
 
 
@@ -158,13 +86,13 @@ cimap <- function(.x, .f, ...) {
 #'   second is the chunk ID
 #' @export
 #' @rdname cmap
-cimap.disk.frame <- function(.x, .f, outdir = NULL, keep = NULL, chunks = nchunks(.x), compress = 50, lazy = TRUE, overwrite = FALSE, ...) {
-  .f = purrr::as_mapper(.f)
+cimap.disk.frame <- function(.x, .f, outdir = NULL, keep = NULL, lazy = TRUE, overwrite = FALSE, compress=50, ...) {
+  .f = purrr_as_mapper(.f)
 
   # TODO support lazy for cimap
   if(lazy) {
     stop("cimap.disk.frame: lazy = TRUE is not supported at this stage")
-    attr(.x, "lazyfn") = c(attr(.x, "lazyfn"), .f)
+    attr(.x, "recordings") = c(attr(.x, "recordings"), .f)
     return(.x)
   }
 
@@ -246,7 +174,6 @@ delayed.disk.frame <- function(.x, .f, ...) {
 
 #' @export
 #' @rdname cmap
-chunk_lapply <- function (...) {
-  warning("chunk_lapply is deprecated in favour of cmap.disk.frame")
+clapply <- function (...) {
   cmap.disk.frame(...)
 }
diff --git a/R/collect.r b/R/collect.r
index a403098f..cf5add15 100644
--- a/R/collect.r
+++ b/R/collect.r
@@ -23,15 +23,14 @@
 #' delete(cars.df)
 #' @export
 #' @rdname collect
-collect.disk.frame <- function(x, ..., parallel = !is.null(attr(x,"lazyfn"))) {
+collect.disk.frame <- function(x, ..., parallel = !is.null(attr(x,"recordings"))) {
   cids = get_chunk_ids(x, full.names = TRUE, strip_extension = FALSE)
-  #cids = as.integer(get_chunk_ids(x))
 
   if(nchunks(x) > 0) {
    if(parallel) {
-      future.apply::future_lapply(cids, function(.x) {
-        get_chunk(x, .x, full.names = TRUE)
-      }, future.seed = TRUE) %>%
-        rbindlist()
+      tmp = future.apply::future_lapply(cids, function(.x) {
+        get_chunk(x, .x, full.names = TRUE)
+      }, future.seed = TRUE)
+      return(rbindlist(tmp))
     } else {
       purrr::map_dfr(cids, ~get_chunk(x, .x, full.names = TRUE))
     }
@@ -52,24 +51,26 @@ collect.disk.frame <- function(x, ..., parallel = !is.null(attr(x,"lazyfn"))) {
 #'
 #' # clean up
 #' delete(cars.df)
-collect_list <- function(x, simplify = FALSE, parallel = !is.null(attr(x,"lazyfn"))) {
+collect_list <- function(x, simplify = FALSE, parallel = !is.null(attr(x,"recordings")), ...) {
+  # get the chunk ids
   cids = get_chunk_ids(x, full.names = TRUE, strip_extension = FALSE)
-
-  if(nchunks(x) > 0) {
-    res <- NULL
+
+  if(length(cids) > 0) {
+    list_of_results = NULL
     if (parallel) {
-      #res = furrr::future_map(1:nchunks(x), ~get_chunk(x, .x))
-      res = future.apply::future_lapply(cids, function(.x) {
+      list_of_results = future.apply::future_lapply(cids, function(.x) {
         get_chunk(x, .x, full.names = TRUE)
       }, future.seed=TRUE)
     } else {
-      res = purrr::map(cids, ~get_chunk(x, .x, full.names = TRUE))
+      list_of_results = lapply(cids, function(cid) {
+        get_chunk(x, cid, full.names = TRUE)
+      })
     }
+
     if (simplify) {
-      return(simplify2array(res))
+      return(simplify2array(list_of_results))
     } else {
-      return(res)
+      return(list_of_results)
     }
   } else {
     list()
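`cmap()` is now itself defined via `create_chunk_mapper()`, so it returns a lazy disk.frame; results are materialised with `collect()`, `collect_list()` or `cmap_dfr()` (which, per the change above, is now just `collect_list()` + `rbindlist()`). Restating the package's own roxygen examples:

```r
cars.df = as.disk.frame(cars, nchunks = 2)

cmap(cars.df, ~.x[1, ]) %>% collect()   # record lazily, then row-bind
collect_list(cmap(cars.df, ~.x[1, ]))   # one result per chunk
cmap_dfr(cars.df, ~.x[1, ])             # equivalent row-bound form

delete(cars.df)
```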
diff --git a/R/collect.summarized_disk.frame.r b/R/collect.summarized_disk.frame.r
index b9ba042e..edd686a4 100644
--- a/R/collect.summarized_disk.frame.r
+++ b/R/collect.summarized_disk.frame.r
@@ -23,8 +23,128 @@
 #' delete(cars.df)
 #' @export
 #' @rdname collect
-collect.summarized_disk.frame <- function(x, ..., parallel = !is.null(attr(x,"lazyfn"))) {
-  code_to_run = glue::glue("x %>% {attr(x, 'summarize_code') %>% as.character}")
-  class(x) <- "disk.frame"
-  eval(parse(text = code_to_run))
-}
+collect.summarized_disk.frame <-
+  function(x, ..., parallel = !is.null(attr(x, "recordings"))) {
+    dotdotdot <- attr(x, 'summarize_code')
+
+    # make a copy
+    dotdotdot_chunk_agg <- dotdotdot
+    dotdotdot_collected_agg <- dotdotdot
+
+    i = 1
+    for (a_call in dotdotdot) {
+      # obtain the function call name
+      func_call_str = paste0(deparse(a_call[[1]]), collapse = "")
+
+      # parse(...) returns an expression, but I just want the sole symbol which
+      # can be extracted with [[1]]
+      func_call_chunk_agg = parse(text = paste0(func_call_str, "_df.chunk_agg.disk.frame"))[[1]]
+      # replace the function call with the chunk_agg_function
+      dotdotdot_chunk_agg[[i]][[1]] = func_call_chunk_agg
+
+      func_call_collected_agg = paste0(func_call_str, "_df.collected_agg.disk.frame")
+      # replace the function call with the chunk_agg_function
+      dotdotdot_collected_agg[[i]] = parse(text = sprintf(
+        "%s(%s)",
+        func_call_collected_agg,
+        paste0(".disk.frame.tmp", i)
+      ))[[1]]
+      i = i + 1
+      # TODO extract global variables from here and store them in the global
+    }
+
+    group_by_vars = attr(x, "group_by_cols")
+
+    # figure out how many group by arguments there are
+    n_grp_args = length(group_by_vars)
+
+    # generate a function call with as many arguments
+    x_as.disk.frame = x
+    class(x_as.disk.frame) = "disk.frame"
+    first_stage_code = eval(parse(
+      text = sprintf(
+        "quote(chunk_group_by(x_as.disk.frame, %s))",
+        paste0(rep_len("NULL", n_grp_args), collapse = ", ")
+      )
+    ))
+
+    if (n_grp_args >= 1) {
+      for (i in 1:n_grp_args) {
+        first_stage_code[[i + 2]] = group_by_vars[[i]]
+      }
+    }
+
+    # TODO add appropriate environment
+    tmp_df = eval(first_stage_code)
+
+    n_summ_args = length(dotdotdot_chunk_agg)
+
+    chunk_summ_code =
+      eval(parse(text = sprintf(
+        "quote(chunk_summarise(tmp_df, %s))",
+        paste0("NULL", 1:n_summ_args, collapse = ", ")
+      )))
+
+
+    chunk_summ_code_str = chunk_summ_code %>%
+      deparse %>%
+      paste0(collapse = "")
+
+    for (i in 1:n_summ_args) {
+      lhs = sprintf(".disk.frame.tmp%d", i)
+      rhs = paste0(deparse(dotdotdot_chunk_agg[[i]]), collapse = "")
+
+      tmp_code = paste0("NULL", i)
+      chunk_summ_code_str = gsub(
+        pattern = tmp_code,
+        sprintf("%s=list(%s)", lhs, rhs),
+        chunk_summ_code_str,
+        fixed = TRUE
+      )
+    }
+
+    tmp2 = collect(eval(parse(text = chunk_summ_code_str)))
+
+    second_stage_code = eval(parse(text = sprintf(
+      "quote(group_by(tmp2, %s))", paste0(rep_len("NULL", n_grp_args), collapse = ", ")
+    )))
+
+    if (n_grp_args >= 1) {
+      for (i in 1:n_grp_args) {
+        second_stage_code[[i + 2]] = group_by_vars[[i]]
+      }
+    }
+
+    tmp3 = eval(second_stage_code)
+
+    n_summ2_args = length(dotdotdot_collected_agg)
+    # final stage of summary
+    chunk_summ2_code =
+      eval(parse(text = sprintf(
+        "quote(summarise(tmp3, %s))",
+        paste0(rep_len("NULL", n_summ2_args), collapse = ", ")
+      )))
+
+    names_chunk_summ_code = names(dotdotdot_chunk_agg)
+    for (i in 1:n_summ_args) {
+      chunk_summ2_code[[i + 2]] = dotdotdot_collected_agg[[i]]
+    }
+
+    tmp4 = eval(chunk_summ2_code)
+
+    names_tmp4 = names(tmp4)
+
+    orig_names = sapply(dotdotdot, function(code) {
+      code %>%
+        deparse %>%
+        paste0(collapse = "")
+    })
+
+
+    names(tmp4)[(n_grp_args + 1):length(names_tmp4)] = ifelse(names_chunk_summ_code == "",
+                                                              orig_names,
+                                                              names_chunk_summ_code)
+
+    return(tmp4)
+  }
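The rewritten `collect.summarized_disk.frame()` implements disk.frame's two-stage aggregation: each summary call `f(x)` is rewritten into `f_df.chunk_agg.disk.frame()` run per chunk, plus `f_df.collected_agg.disk.frame()` run over the collected per-chunk results (these are the function pairs exported in the NAMESPACE above). What the user writes stays one-stage:

```r
cars.df = as.disk.frame(cars)

# mean() is internally split into mean_df.chunk_agg.disk.frame() and
# mean_df.collected_agg.disk.frame() by the code above
cars.df %>%
  group_by(speed) %>%
  summarise(mean_dist = mean(dist)) %>%
  collect()

delete(cars.df)
```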
diff --git a/R/compute.r b/R/compute.r
index 99d31037..f2a95a4e 100644
--- a/R/compute.r
+++ b/R/compute.r
@@ -1,11 +1,10 @@
-#' Compute without writing
+#' Force computations. The results are stored in a folder.
 #' @description
 #' Perform the computation; same as calling cmap without .f and lazy = FALSE
 #' @param x a disk.frame
 #' @param outdir the output directory
-#' @param overwrite whether to overwrite or not
-#' @param name Not used. Kept for compatibility with dplyr
-#' @param ... Not used. Kept for dplyr compatibility
+#' @param name If not NULL then used as outdir prefix.
+#' @param ... Passed to `write_disk.frame`
 #' @export
 #' @importFrom dplyr compute
 #' @examples
@@ -17,7 +16,11 @@
 #' # clean up
 #' delete(cars.df)
 #' delete(cars.df3)
-compute.disk.frame <- function(x, name, outdir = tempfile("tmp_df_", fileext=".df"), overwrite = TRUE, ...) {
-  overwrite_check(outdir, overwrite)
-  write_disk.frame(x, outdir = outdir, overwrite = TRUE)
+compute.disk.frame <- function(x, name = NULL, outdir = tempfile("tmp_df_", fileext=".df"), ...) {
+  if (!is.null(name)) {
+    warning("in `compute.disk.frame()`, `name` is not NULL; using `name` as the file name prefix of the temporary `outdir`")
+    outdir = tempfile(name, fileext=".df")
+  }
+
+  write_disk.frame(x, outdir = outdir, ...)
 }
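Under the new `compute.disk.frame()` signature, `name` only seeds the prefix of a temporary output folder; everything else flows through `write_disk.frame()`. A sketch, following the roxygen example names:

```r
cars.df = as.disk.frame(cars)
lazy.df = cars.df %>% mutate(dist2 = dist * 2)

# runs the recorded steps and writes the result to a new disk.frame folder
cars.df3 = compute(lazy.df)

delete(cars.df)
delete(cars.df3)
```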
diff --git a/R/csv2disk.frame.r b/R/csv2disk.frame.r
index d63ee0e2..ac929b77 100644
--- a/R/csv2disk.frame.r
+++ b/R/csv2disk.frame.r
@@ -19,8 +19,7 @@
 #'   the highest compression ratio.
 #' @param overwrite Whether to overwrite the existing directory
 #' @param header Whether the files have header. Defaults to TRUE
-#' @param .progress A logical, for whether or not to print a progress bar for
-#'   multiprocess, multisession, and multicore plans. From {furrr}
+#' @param .progress A logical, for whether or not to show progress
 #' @param backend The CSV reader backend to choose: "data.table" or "readr".
 #'   disk.frame does not have its own CSV reader. It uses either
 #'   data.table::fread or readr::read_delimited. It is worth noting that
@@ -66,20 +65,20 @@ csv_to_disk.frame <- function(infile, outdir = tempfile(fileext = ".df"), inmapf
   overwrite_check(outdir, overwrite)
 
-  # we need multiple backend because data.table has poor support for when the file is larger than RAM
+  # we need multiple backends because data.table has poor support for when the file is larger than RAM
   # https://github.com/Rdatatable/data.table/issues/3526
   # TODO detect these cases
 
   # user has requested chunk-wise reading but wants me to do it
   #if(is.null(in_chunk_size)) {
-    
+
   #} else if(is.character(in_chunk_size) && in_chunk_size == "guess") {
-    
-    #library(bigreadr)
-    # system.time(wc_l <- R.utils::countLines(infile))
-    # system.time(infos_split <- split_file(infile, every_nlines = 1e7))
-    # file_parts <- get_split_files(infos_split)
+
+  #library(bigreadr)
+  # system.time(wc_l <- R.utils::countLines(infile))
+  # system.time(infos_split <- split_file(infile, every_nlines = 1e7))
+  # file_parts <- get_split_files(infos_split)
   #} else if(is.numeric(in_chunk_size)) {
@@ -307,12 +306,10 @@ csv_to_disk.frame_data.table_backend <- function(infile, outdir = tempfile(filee
       message("")
     }
 
-    outdf_tmp = furrr::future_imap(infile, ~{
-      dotdotdotorigarg1 = c(dotdotdotorigarg, list(outdir = file.path(tempdir(), .y), infile=.x))
-
-      pryr::do_call(csv_to_disk.frame_data.table_backend, dotdotdotorigarg1)
-    },
-    .progress = .progress)
+    outdf_tmp = future.apply::future_lapply(1:length(infile), function(i) {
+      dotdotdotorigarg1 = c(dotdotdotorigarg, list(outdir = file.path(tempdir(), i), infile=infile[i]))
+      do.call(csv_to_disk.frame_data.table_backend, dotdotdotorigarg1)
+    })
 
     if(.progress) {
      message(paste("-- Converting CSVs to disk.frame -- Stage 1 or 2 took:", data.table::timetaken(pt)))
@@ -330,7 +327,6 @@ csv_to_disk.frame_data.table_backend <- function(infile, outdir = tempfile(filee
     outdf = rbindlist.disk.frame(outdf_tmp, outdir = outdir, by_chunk_id = TRUE, compress = compress, overwrite = overwrite, .progress = .progress)
 
     if(.progress) {
-      message(paste("Stage 2 of 2 took:", data.table::timetaken(pt2)))
       message(" ----------------------------------------------------- ")
       message(paste("Stage 1 & 2 in total took:", data.table::timetaken(pt)))
diff --git a/R/data.table.r b/R/data.table.r
index 41226504..0d12184f 100644
--- a/R/data.table.r
+++ b/R/data.table.r
@@ -9,7 +9,6 @@
 #' @import fst
 #' @importFrom future.apply future_lapply
 #' @importFrom data.table rbindlist
-#' @importFrom globals findGlobals
 #' @export
 #' @examples
 #' cars.df = as.disk.frame(cars)
@@ -19,28 +18,58 @@
 #' # clean up
 #' delete(cars.df)
 `[.disk.frame` <- function(df, ..., keep = NULL, rbind = TRUE, use.names = TRUE, fill = FALSE, idcol = NULL) {
+  message("data.table syntax for disk.frame may be moved to a separate package in the future")
+
   keep_for_future = keep
-  dotdotdot = substitute(...()) #this is an alist
+  code = substitute(chunk[...])
 
   # sometimes the arguments could be empty
   # in a recent version of globals that would cause a fail
   # to avoid the fail remove them from the test
-  dotdotdot_for_find_global = dotdotdot[!sapply(sapply(dotdotdot, as.character), function(x) all(unlist(x) == ""))]
+  #dotdotdot_for_find_global = dotdotdot[!sapply(sapply(dotdotdot, as.character), function(x) all(unlist(x) == ""))]
 
-  ag = globals::findGlobals(dotdotdot_for_find_global)
+  #ag = globals::findGlobals(dotdotdot_for_find_global)
   #ag = setdiff(ag, "") # "" can cause issues with future # this line no longer needed
 
-  res = future.apply::future_lapply(get_chunk_ids(df, strip_extension = FALSE), function(chunk_id) {
-    #lapply(get_chunk_ids(df, strip_extension = FALSE), function(chunk_id) {
-    chunk = get_chunk(df, chunk_id, keep = keep_for_future)
+
+  # you need to use list otherwise the names will be gone
+  if (paste0(deparse(code), collapse="") == "chunk_fn(NULL)") {
+    globals_and_pkgs = future::getGlobalsAndPackages(expression(chunk_fn()))
+  } else {
+    globals_and_pkgs = future::getGlobalsAndPackages(code)
+  }
+
+
+  global_vars = globals_and_pkgs$globals
+
+  env = parent.frame()
+
+  done = identical(env, emptyenv()) || identical(env, globalenv())
+
+  # keep adding global variables by moving up the environment chain
+  while(!done) {
+    tmp_globals_and_pkgs = future::getGlobalsAndPackages(code, envir = env)
+    new_global_vars = tmp_globals_and_pkgs$globals
+    for (name in setdiff(names(new_global_vars), names(global_vars))) {
+      global_vars[[name]] <- new_global_vars[[name]]
+    }
+
+    done = identical(env, emptyenv()) || identical(env, globalenv())
+    env = parent.env(env)
+  }
+
+  globals_and_pkgs$globals = global_vars
+
+  res = future.apply::future_lapply(get_chunk_ids(df, full.names = TRUE), function(chunk_id) {
+    #res = lapply(get_chunk_ids(df, full.names = TRUE), function(chunk_id) {
+    chunk = get_chunk(df, chunk_id, full.names=TRUE, keep = keep_for_future)
     data.table::setDT(chunk)
-    expr <- quote(chunk)
-    expr <- c(expr, dotdotdot)
-    res <- do.call(`[`, expr)
+    res = eval(code, envir=globals_and_pkgs$globals)
     res
-  }, future.globals = c("df", "keep_for_future", "dotdotdot", ag), future.packages = c("data.table","disk.frame"),
-  future.seed=TRUE
+  }
+  , future.packages = c("data.table", globals_and_pkgs$packages),
+  future.seed=TRUE
   )
 
   if(rbind & all(sapply(res, function(x) "data.frame" %in% class(x)))) {
@@ -53,14 +82,6 @@
 }
 
 # Solutions from https://stackoverflow.com/questions/57122960/how-to-use-non-standard-evaluation-nse-to-evaluate-arguments-on-data-table?answertab=active#tab-top
-# `[.dd` <- function(x, ...) {
-#   code <- rlang::enexprs(...)
-#   lapply(x, function(dt) {
-#     ex <- rlang::expr(dt[!!!code])
-#     rlang::eval_tidy(ex)
-#   })
-# }
-#
 #
 # `[.dd` <- function(x,...) {
 #   a <- substitute(...()) #this is an alist
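The reworked `[.disk.frame` records the `chunk[...]` call once, ships it (with the captured globals and packages) to the workers, and evaluates it against every chunk. Note that results are per chunk, so aggregates are chunk-local, not global:

```r
cars.df = as.disk.frame(cars, nchunks = 2)

cars.df[speed > 20]                   # filtered chunks, row-bound (rbind = TRUE)
cars.df[, .(mean_dist = mean(dist))]  # one mean per chunk, not a global mean

delete(cars.df)
```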
}
diff --git a/R/disk.frame.r b/R/disk.frame.r
index c949f43b..09894efc 100755
--- a/R/disk.frame.r
+++ b/R/disk.frame.r
@@ -213,7 +213,7 @@ is.dir.disk.frame <- function(df, check.consistency = TRUE) {
 head.disk.frame <- function(x, n = 6L, ...) {
   stopifnot(is_ready(x))
   path1 <- attr(x,"path")
-  cmds <- attr(x, "lazyfn")
+  cmds <- attr(x, "recordings")
   if(fs::dir_exists(path1)) {
     path2 <- list.files(path1,full.names = TRUE)[1]
     head(play(fst::read_fst(path2, from = 1, to = n, as.data.table = TRUE), cmds), n = n, ...)
@@ -229,7 +229,7 @@ head.disk.frame <- function(x, n = 6L, ...) {
 tail.disk.frame <- function(x, n = 6L, ...) {
   stopifnot(is_ready(x))
   path1 <- attr(x,"path")
-  cmds <- attr(x, "lazyfn")
+  cmds <- attr(x, "recordings")
   if(dir.exists(path1)) {
     path2 <- list.files(path1,full.names = TRUE)
     path2 <- path2[length(path2)]
diff --git a/R/dplyr_verbs.r b/R/dplyr_verbs.r
index 75be5774..285c678f 100644
--- a/R/dplyr_verbs.r
+++ b/R/dplyr_verbs.r
@@ -23,13 +23,8 @@
 #'
 #' # clean up cars.df
 #' delete(cars.df)
-select.disk.frame <- function(.data, ...) {
-  quo_dotdotdot = rlang::enquos(...)
-  cmap(.data, ~{
-    code = rlang::quo(dplyr::select(.x, !!!quo_dotdotdot))
-    rlang::eval_tidy(code)
-  }, lazy = TRUE)
-}
+select.disk.frame <- create_chunk_mapper(dplyr::select)
+
 
 #' @export
 #' @rdname dplyr_verbs
@@ -43,7 +38,6 @@ filter.disk.frame <- create_chunk_mapper(dplyr::filter)
 #' @export
 #' @rdname dplyr_verbs
 #' @importFrom future getGlobalsAndPackages
-#' @importFrom rlang eval_tidy quo enquos
 #' @importFrom dplyr mutate
 mutate.disk.frame <- create_chunk_mapper(dplyr::mutate)
@@ -65,21 +59,21 @@ arrange.disk.frame =create_chunk_mapper(dplyr::arrange, warning_msg="`arrange.di
 #' @rdname dplyr_verbs
 chunk_arrange <- create_chunk_mapper(dplyr::arrange)
 
-
 # TODO family is not required is group-by
 # TODO alot of these .disk.frame functions are not generic
+# TODO make this work like in dplyr
 
 #' #' @export
 #' #' @importFrom dplyr add_count
 #' #' @rdname dplyr_verbs
 #' add_count.disk.frame <- create_chunk_mapper(dplyr::add_count)
 
-#' @export
-#' @importFrom dplyr add_tally
-#' @rdname dplyr_verbs
-add_tally.disk.frame <- create_chunk_mapper(dplyr::add_tally)
+#' #' @export
+#' #' @importFrom dplyr add_tally
+#' #' @rdname dplyr_verbs
+#' add_tally.disk.frame <- create_chunk_mapper(dplyr::add_tally)
 
 #' @export
@@ -94,10 +88,10 @@ chunk_summarize <- create_chunk_mapper(dplyr::summarize)
 chunk_summarise <- create_chunk_mapper(dplyr::summarise)
 
-#' @export
-#' @rdname dplyr_verbs
-#' @importFrom dplyr do
-do.disk.frame <- create_chunk_mapper(dplyr::do)
+#' #' @export
+#' #' @rdname dplyr_verbs
+#' #' @importFrom dplyr do
+#' do.disk.frame <- create_chunk_mapper(dplyr::do)
 
 #' @export
@@ -148,37 +142,3 @@ chunk_ungroup = create_chunk_mapper(dplyr::ungroup)
 glimpse.disk.frame <- function(.data, ...) {
   glimpse(head(.data, ...), ...)
 }
-
-# Internal methods
-# @param .data the data
-# @param cmd the function to record
-record <- function(.data, cmd){
-  attr(.data,"lazyfn") <- c(attr(.data,"lazyfn"), list(cmd))
-  .data
-}
-
-# Internal methods
-# @param .data the disk.frame
-# @param cmds the list of function to play back
-play <- function(.data, cmds=NULL) {
-  for (cmd in cmds){
-    if (typeof(cmd) == "closure") {
-      .data <- cmd(.data)
-    } else {
-      # create a temporary environment
-      an_env = new.env(parent = environment())
-
-      ng = names(cmd$vars_and_pkgs$globals)
-
-      if(length(ng) > 0) {
-        for(i in 1:length(cmd$vars_and_pkgs$globals)) {
-          g = cmd$vars_and_pkgs$globals[[i]]
-          assign(ng[i], g, pos = an_env)
-        }
-      }
-
-      .data <- do.call(cmd$func, c(list(.data),cmd$dotdotdot), envir = an_env)
-    }
-  }
-  .data
-}
diff --git a/R/foverlaps.disk.frame.r b/R/foverlaps.disk.frame.r
index 2ae7c706..657b8c65 100644
--- a/R/foverlaps.disk.frame.r
+++ b/R/foverlaps.disk.frame.r
@@ -84,7 +84,7 @@ foverlaps.disk.frame <- function(
     dotdotdot$x = data1
     dotdotdot$y = data2
 
-    data3 = pryr::do_call(foverlaps, dotdotdot)
+    data3 = do.call(foverlaps, dotdotdot)
     rm(data1); rm(data2); gc()
     outdir
     fst::write_fst(data3, glue::glue("{outdir}/{chunk_id}"), compress = compress)
diff --git a/R/full_join.r b/R/full_join.r
index 54dc2ba9..50ba772b 100644
--- a/R/full_join.r
+++ b/R/full_join.r
@@ -15,8 +15,8 @@ full_join.disk.frame <- function(x, y, by=NULL, copy=FALSE, ..., outdir = tempfi
   if("data.frame" %in% class(y)) {
     # full join cannot be support for y in data.frame
     ncx = nchunks(x)
-    dy = shard(y, shardby = by, nchunks = ncx, overwrite = TRUE)
-    dx = hard_group_by(x, by = by, overwrite = TRUE)
+    dy = shard(y, shardby = by, nchunks = ncx, overwrite = FALSE)
+    dx = rechunk(x, shardby = by, outdir=tempfile(fileext = ".jdf"), overwrite = FALSE)
     return(full_join.disk.frame(dx, dy, by, copy=copy, outdir=outdir, merge_by_chunk_id = TRUE))
   } else if("disk.frame" %in% class(y)) {
     if(is.null(merge_by_chunk_id)) {
@@ -30,8 +30,8 @@ full_join.disk.frame <- function(x, y, by=NULL, copy=FALSE, ..., outdir = tempfi
     ncy = nchunks(y)
     if (merge_by_chunk_id == FALSE) {
       warning("merge_by_chunk_id = FALSE. This will take significantly longer and the preparations needed are performed eagerly which may lead to poor performance. Consider making y a data.frame or set merge_by_chunk_id = TRUE for better performance.")
-      x = hard_group_by(x, by, nchunks = max(ncy,ncx), overwrite = TRUE)
-      y = hard_group_by(y, by, nchunks = max(ncy,ncx), overwrite = TRUE)
+      x = rechunk(x, by, nchunks = max(ncy,ncx), outdir=tempfile(fileext = ".jdf"), overwrite = FALSE)
+      y = rechunk(y, by, nchunks = max(ncy,ncx), outdir=tempfile(fileext = ".jdf"), overwrite = FALSE)
       return(full_join.disk.frame(x, y, by, copy = copy, outdir = outdir, merge_by_chunk_id = TRUE, overwrite = overwrite, .progress = .progress))
     } else if ((identical(shardkey(x)$shardkey, "") & identical(shardkey(y)$shardkey, "")) | identical(shardkey(x), shardkey(y))) {
       res = cmap2(x, y, ~{
diff --git a/R/get_chunk.r b/R/get_chunk.r
index 1e3cfcbd..01c06e61 100644
--- a/R/get_chunk.r
+++ b/R/get_chunk.r
@@ -26,38 +26,18 @@ get_chunk <- function(...) {
 #' @export
 get_chunk.disk.frame <- function(df, n, keep = NULL, full.names = FALSE, ...) {
   stopifnot("disk.frame" %in% class(df))
-  keep_chunks = attr(df, "keep_chunks", exact=TRUE)
-
-  # print(names(attr(df, "lazyfn")[[1]]$vars_and_pkgs$globals))
-  # stop("ok")
-
-  # TODO relax this
-  # if(!is.null(keep_chunks)) {
-  #   # browser()
-  #   # n_int = as.integer(n)
-  #   #
-  #   # if(is.na(n_int)) {
-  #   #   if(as.character(n) %in% get_chunk_ids(df)[keep_chunks]) {
-  #   #     return(NULL)
-  #   #   } else if(normalizePath(as.character(n)) %in% sapply(get_chunk_ids(df, full.names = TRUE)[keep_chunks],normalizePath)) {
-  #   #     return(NULL)
-  #   #   }
-  #   # } else {
-  #   #   if(!n %in% keep_chunk) {
-  #   #     return(NULL)
-  #   #   }
-  #   # }
-  # }
+  # keep_chunks = attr(df, "keep_chunks", exact=TRUE)
 
   path = attr(df,"path", exact=TRUE)
 
   # all the variables to keep in the attr from a previous srckeep
-  keep1 = attr(df,"keep", exact=TRUE)
+  keep1 = attr(df, "keep", exact=TRUE)
 
-  cmds = attr(df,"lazyfn", exact=TRUE)
+  recordings = attr(df, "recordings", exact=TRUE)
   filename = ""
 
   if (typeof(keep) == "closure") {
+    # sometimes purrr::keep is picked up
     keep = keep1
   } else if(!is.null(keep1) & !is.null(keep)) {
     if (length(setdiff(keep, keep1)) > 0) {
@@ -89,7 +69,8 @@ get_chunk.disk.frame <- function(df, n, keep = NULL, full.names = FALSE, ...) {
     }
   }
 
-  # if the file you are looking for don't exist
+
+  # if the file you are looking for doesn't exist
   if (!fs::file_exists(filename)) {
     warning(glue("The chunk {filename} does not exist; returning an empty data.table"))
     notbl <- data.table()
@@ -97,17 +78,19 @@ get_chunk.disk.frame <- function(df, n, keep = NULL, full.names = FALSE, ...) {
     return(notbl)
   }
 
-  if (is.null(cmds)) {
-    if(typeof(keep)!="closure") {
-      fst::read_fst(filename, columns = keep, as.data.table = TRUE,...)
-    } else {
+
+  if (is.null(recordings)) {
+    if(typeof(keep)=="closure") {
       fst::read_fst(filename, as.data.table = TRUE,...)
+    } else {
+      fst::read_fst(filename, columns = keep, as.data.table = TRUE,...)
     }
   } else {
     if(typeof(keep)!="closure") {
-      play(fst::read_fst(filename, columns = keep, as.data.table = TRUE,...), cmds)
+      play(fst::read_fst(filename, columns = keep, as.data.table = TRUE,...), recordings)
     } else {
-      play(fst::read_fst(filename, as.data.table = TRUE,...), cmds)
+      play(fst::read_fst(filename, as.data.table = TRUE,...), recordings)
     }
   }
 }
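`get_chunk()` reads a single chunk, optionally column-pruned via `keep`, and replays any recordings on it. A quick sketch:

```r
cars.df = as.disk.frame(cars, nchunks = 2)

get_chunk(cars.df, 1)                  # first chunk as a data.table
get_chunk(cars.df, 1, keep = "speed")  # only read the speed column from disk

delete(cars.df)
```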
Defaults = nchunks.disk.frame(df) +#' #' @param overwrite overwrite the out put directory +#' #' @param add same as dplyr::arrange +#' #' @param .drop same as dplyr::arrange +#' #' @export +#' #' @examples +#' #' iris.df = as.disk.frame(iris, nchunks = 2) +#' #' +#' #' # arrange iris.df by specifies and ensure rows with the same specifies are in the same chunk +#' #' iris_hard.df = hard_arrange(iris.df, Species) +#' #' +#' #' get_chunk(iris_hard.df, 1) +#' #' get_chunk(iris_hard.df, 2) +#' #' +#' #' # clean up cars.df +#' #' delete(iris.df) +#' #' delete(iris_hard.df) +#' hard_arrange <- function(df, ..., add = FALSE, .drop = FALSE) { +#' UseMethod("hard_arrange") +#' } #' -#' # arrange iris.df by specifies and ensure rows with the same specifies are in the same chunk -#' iris_hard.df = hard_arrange(iris.df, Species) +#' #' @rdname hard_arrange +#' #' @export +#' #' @importFrom dplyr arrange +#' hard_arrange.data.frame <- function(df, ...) { +#' dplyr::arrange(df, ...) +#' } #' -#' get_chunk(iris_hard.df, 1) -#' get_chunk(iris_hard.df, 2) +#' #' @rdname hard_arrange +#' #' @importFrom purrr map +#' #' @export +#' hard_arrange.disk.frame <- function(df, ..., outdir=tempfile("tmp_disk_frame_hard_arrange"), nchunks = disk.frame::nchunks(df), overwrite = TRUE) { +#' overwrite_check(outdir, overwrite) +#' +#' # Refer also to Dplyr arrange: https://github.com/tidyverse/dplyr/blob/master/src/arrange.cpp +#' q <- enquos(...) +#' is_sym <- sapply(q, rlang::quo_is_symbol) +#' arrange_codes <- sapply(q, rlang::as_label) +#' +#' # Check if desc... +#' is_desc <- substr(arrange_codes, 1, 5) == "desc(" +#' +#' # If expr is a symbol from the data, just use it. +#' # Otherwise need to evaluate ... +#' # (TODO - currently only support variables and desc in the data) +#' # Peels off "desc" from the original +#' vars <- sub(")", "", sub("desc(", "", arrange_codes, fixed=TRUE), fixed=TRUE) #' -#' # clean up cars.df -#' delete(iris.df) -#' delete(iris_hard.df) -hard_arrange <- function(df, ..., add = FALSE, .drop = FALSE) { - UseMethod("hard_arrange") -} - -#' @rdname hard_arrange -#' @export -#' @importFrom dplyr arrange -hard_arrange.data.frame <- function(df, ...) { - dplyr::arrange(df, ...) -} - -#' @rdname hard_arrange -#' @importFrom purrr map -#' @export -hard_arrange.disk.frame <- function(df, ..., outdir=tempfile("tmp_disk_frame_hard_arrange"), nchunks = disk.frame::nchunks(df), overwrite = TRUE) { - overwrite_check(outdir, overwrite) - - # Refer also to Dplyr arrange: https://github.com/tidyverse/dplyr/blob/master/src/arrange.cpp - q <- enquos(...) - is_sym <- sapply(q, rlang::quo_is_symbol) - arrange_codes <- sapply(q, rlang::as_label) - - # Check if desc... - is_desc <- substr(arrange_codes, 1, 5) == "desc(" - - # If expr is a symbol from the data, just use it. - # Otherwise need to evaluate ... - # (TODO - currently only support variables and desc in the data) - # Peels off "desc" from the original - vars <- sub(")", "", sub("desc(", "", arrange_codes, fixed=TRUE), fixed=TRUE) - - desc_vars <- vars[is_desc] - - if(!all(vars %in% colnames(df))){ - stop(paste0("Expressions currently not supported. Columns not found in colnames:", vars[!vars %in% colnames(df)])) - } - - # Hard group by in a partially sorted way at the chunk level and then arrange within chunks - df %>% - disk.frame::hard_group_by(vars, outdir=outdir, nchunks=nchunks, overwrite=overwrite, shardby_function="sort", desc_vars=desc_vars) %>% - chunk_arrange(...) 
-} \ No newline at end of file +#' desc_vars <- vars[is_desc] +#' +#' if(!all(vars %in% colnames(df))){ +#' stop(paste0("Expressions currently not supported. Columns not found in colnames:", vars[!vars %in% colnames(df)])) +#' } +#' +#' # Hard group by in a partially sorted way at the chunk level and then arrange within chunks +#' df %>% +#' disk.frame::hard_group_by(vars, outdir=outdir, nchunks=nchunks, overwrite=overwrite, shardby_function="sort", desc_vars=desc_vars) %>% +#' chunk_arrange(...) +#' } \ No newline at end of file diff --git a/R/hard_group_by.r b/R/hard_group_by.r index f0397cde..48228564 100644 --- a/R/hard_group_by.r +++ b/R/hard_group_by.r @@ -1,217 +1,217 @@ -#' Show a progress bar of the action being performed -#' @importFrom utils txtProgressBar setTxtProgressBar -#' @param df a disk.frame -#' @noRd -progressbar <- function(df) { - if(attr(df,"performing", exact=TRUE) == "hard_group_by") { - # create progress bar - - shardby = "acct_id" - #list.files( - fparent = attr(df,"parent", exact=TRUE) - - #tmp = file.path(fparent,".performing","inchunks") - tmp = "tmphardgroupby2" - - l = length(list.files(fparent)) - pt_begin_split = proc.time() - doprog <- function(pt_from, sleep = 1) { - #tkpb = winProgressBar(title = sprintf("Hard Group By Stage 1(/2) - %s", shardby), label = "Checking completeness", - # min = 0, max = l*1.5, initial = 0, width = 500) - pb <- txtProgressBar(min = 0, max = l*1.5, style = 3) - - on.exit(close(pb)) - # on.exit(close(tkpb)) - while(length(list.files(file.path(tmp,l))) < l) { - wl = length(list.files(file.path(tmp,1:l)))/l - tt <- proc.time()[3] - pt_from[3] - #list.files( - avg_speed = tt/wl - pred_speed = avg_speed*(l-wl) + avg_speed*l/2 - elapsed = round(tt/60,1) - - #setWinProgressBar(tkpb, wl, - # title = sprintf("Hard Group By Stage 1(/2) - %s", shardby), - # label = sprintf("%.0f out of %d; avg speed %.2f mins; elapsed %.1f mins; another %.1f mins", wl,l, round(avg_speed/60,2), elapsed, round(pred_speed/60,2))) - setTxtProgressBar(pb, length(list.files(file.path(tmp,l))), - title = sprintf("Group By - %s", shardby)) - Sys.sleep(sleep) - } - } - doprog(pt_begin_split, 1) - - pt_begin_collate = proc.time() - doprog2 <- function(pt_from, sleep = 1) { - # tkpb = winProgressBar(title = sprintf("Hard Group By - %s -- Stage 2 (of 2) collating", shardby), label = "Checking completeness", - # min = 0, max = l*1.5, initial = 0, width = 600) - pb <- txtProgressBar(min = 0, max = l*1.5, style = 3) - - on.exit(close(pb)) - # on.exit(close(tkpb)) - while(length(list.files("large_sorted")) < l) { - wl = length(list.files("large_sorted")) - tt <- proc.time()[3] - pt_from[3] - #list.files( - avg_speed = tt/wl - pred_speed = avg_speed*(l-wl) - elapsed = round(tt/60,1) - - # setWinProgressBar(tkpb, l + wl/2, - # title = sprintf("Hard Group By - %s -- Stage 2 (of 2) collating -- %.0f out of %d chunks processed;", shardby, wl, l), - # label = sprintf("avg %.2f min/chunk; %.1f mins elapsed; %.1f mins remaining;", round(avg_speed/60,2), elapsed, round(pred_speed/60,2))) - setTxtProgressBar(pb, length(list.files("large_sorted")), - title = sprintf("Hard Group By - %s", shardby)) - Sys.sleep(sleep) - } - } - doprog2(pt_begin_collate, 1) - } -} - -#' Perform a hard group -#' @description -#' A hard_group_by is a group by that also reorganizes the chunks to ensure that -#' every unique grouping of `by`` is in the same chunk. Or in other words, every -#' row that share the same `by` value will end up in the same chunk. -#' @param df a disk.frame -#' @param ... 
grouping variables -#' @param outdir the output directory -#' @param nchunks The number of chunks in the output. Defaults = nchunks.disk.frame(df) -#' @param overwrite overwrite the out put directory -#' @param .add same as dplyr::group_by -#' @param .drop same as dplyr::group_by -#' @param shardby_function splitting of chunks: "hash" for hash function or "sort" for semi-sorted chunks -#' @param sort_splits for the "sort" shardby function, a dataframe with the split values. -#' @param desc_vars for the "sort" shardby function, the variables to sort descending. -#' @param sort_split_sample_size for the "sort" shardby function, if sort_splits is null, the number of rows to sample per chunk for random splits. -#' @export -#' @examples -#' iris.df = as.disk.frame(iris, nchunks = 2) +#' #' Show a progress bar of the action being performed +#' #' @importFrom utils txtProgressBar setTxtProgressBar +#' #' @param df a disk.frame +#' #' @noRd +#' progressbar <- function(df) { +#' if(attr(df,"performing", exact=TRUE) == "hard_group_by") { +#' # create progress bar +#' +#' shardby = "acct_id" +#' #list.files( +#' fparent = attr(df,"parent", exact=TRUE) +#' +#' #tmp = file.path(fparent,".performing","inchunks") +#' tmp = "tmphardgroupby2" +#' +#' l = length(list.files(fparent)) +#' pt_begin_split = proc.time() +#' doprog <- function(pt_from, sleep = 1) { +#' #tkpb = winProgressBar(title = sprintf("Hard Group By Stage 1(/2) - %s", shardby), label = "Checking completeness", +#' # min = 0, max = l*1.5, initial = 0, width = 500) +#' pb <- txtProgressBar(min = 0, max = l*1.5, style = 3) +#' +#' on.exit(close(pb)) +#' # on.exit(close(tkpb)) +#' while(length(list.files(file.path(tmp,l))) < l) { +#' wl = length(list.files(file.path(tmp,1:l)))/l +#' tt <- proc.time()[3] - pt_from[3] +#' #list.files( +#' avg_speed = tt/wl +#' pred_speed = avg_speed*(l-wl) + avg_speed*l/2 +#' elapsed = round(tt/60,1) +#' +#' #setWinProgressBar(tkpb, wl, +#' # title = sprintf("Hard Group By Stage 1(/2) - %s", shardby), +#' # label = sprintf("%.0f out of %d; avg speed %.2f mins; elapsed %.1f mins; another %.1f mins", wl,l, round(avg_speed/60,2), elapsed, round(pred_speed/60,2))) +#' setTxtProgressBar(pb, length(list.files(file.path(tmp,l))), +#' title = sprintf("Group By - %s", shardby)) +#' Sys.sleep(sleep) +#' } +#' } +#' doprog(pt_begin_split, 1) +#' +#' pt_begin_collate = proc.time() +#' doprog2 <- function(pt_from, sleep = 1) { +#' # tkpb = winProgressBar(title = sprintf("Hard Group By - %s -- Stage 2 (of 2) collating", shardby), label = "Checking completeness", +#' # min = 0, max = l*1.5, initial = 0, width = 600) +#' pb <- txtProgressBar(min = 0, max = l*1.5, style = 3) +#' +#' on.exit(close(pb)) +#' # on.exit(close(tkpb)) +#' while(length(list.files("large_sorted")) < l) { +#' wl = length(list.files("large_sorted")) +#' tt <- proc.time()[3] - pt_from[3] +#' #list.files( +#' avg_speed = tt/wl +#' pred_speed = avg_speed*(l-wl) +#' elapsed = round(tt/60,1) +#' +#' # setWinProgressBar(tkpb, l + wl/2, +#' # title = sprintf("Hard Group By - %s -- Stage 2 (of 2) collating -- %.0f out of %d chunks processed;", shardby, wl, l), +#' # label = sprintf("avg %.2f min/chunk; %.1f mins elapsed; %.1f mins remaining;", round(avg_speed/60,2), elapsed, round(pred_speed/60,2))) +#' setTxtProgressBar(pb, length(list.files("large_sorted")), +#' title = sprintf("Hard Group By - %s", shardby)) +#' Sys.sleep(sleep) +#' } +#' } +#' doprog2(pt_begin_collate, 1) +#' } +#' } #' -#' # group_by iris.df by specifies and ensure rows with the same specifies 
are in the same chunk -#' iris_hard.df = hard_group_by(iris.df, Species) +#' #' Perform a hard group +#' #' @description +#' #' A hard_group_by is a group by that also reorganizes the chunks to ensure that +#' #' every unique grouping of `by`` is in the same chunk. Or in other words, every +#' #' row that share the same `by` value will end up in the same chunk. +#' #' @param df a disk.frame +#' #' @param ... grouping variables +#' #' @param outdir the output directory +#' #' @param nchunks The number of chunks in the output. Defaults = nchunks.disk.frame(df) +#' #' @param overwrite overwrite the out put directory +#' #' @param .add same as dplyr::group_by +#' #' @param .drop same as dplyr::group_by +#' #' @param shardby_function splitting of chunks: "hash" for hash function or "sort" for semi-sorted chunks +#' #' @param sort_splits for the "sort" shardby function, a dataframe with the split values. +#' #' @param desc_vars for the "sort" shardby function, the variables to sort descending. +#' #' @param sort_split_sample_size for the "sort" shardby function, if sort_splits is null, the number of rows to sample per chunk for random splits. +#' #' @export +#' #' @examples +#' #' iris.df = as.disk.frame(iris, nchunks = 2) +#' #' +#' #' # group_by iris.df by specifies and ensure rows with the same specifies are in the same chunk +#' #' iris_hard.df = hard_group_by(iris.df, Species) +#' #' +#' #' get_chunk(iris_hard.df, 1) +#' #' get_chunk(iris_hard.df, 2) +#' #' +#' #' # clean up cars.df +#' #' delete(iris.df) +#' #' delete(iris_hard.df) +#' hard_group_by <- function(df, ..., .add = FALSE, .drop = FALSE) { +#' UseMethod("hard_group_by") +#' } #' -#' get_chunk(iris_hard.df, 1) -#' get_chunk(iris_hard.df, 2) +#' #' @rdname hard_group_by +#' #' @export +#' #' @importFrom dplyr group_by +#' hard_group_by.data.frame <- function(df, ..., .add = FALSE, .drop = FALSE) { +#' dplyr::group_by(df, ..., .add = FALSE, .drop = FALSE) +#' } #' -#' # clean up cars.df -#' delete(iris.df) -#' delete(iris_hard.df) -hard_group_by <- function(df, ..., .add = FALSE, .drop = FALSE) { - UseMethod("hard_group_by") -} - -#' @rdname hard_group_by -#' @export -#' @importFrom dplyr group_by -hard_group_by.data.frame <- function(df, ..., .add = FALSE, .drop = FALSE) { - dplyr::group_by(df, ..., .add = FALSE, .drop = FALSE) -} - -#' @rdname hard_group_by -#' @importFrom purrr map -#' @export -hard_group_by.disk.frame <- function( - df, - ..., - outdir=tempfile("tmp_disk_frame_hard_group_by"), - nchunks = disk.frame::nchunks(df), - overwrite = TRUE, - shardby_function="hash", - sort_splits=NULL, - desc_vars=NULL, - sort_split_sample_size=100 - ) { - overwrite_check(outdir, overwrite) - - ff = list.files(attr(df, "path")) - stopifnot(shardby_function %in% c("hash", "sort")) - - if (shardby_function == "sort" && is.null(sort_splits)){ - # Sample enough per chunk to generate reasonable splits - sample_size_per_chunk = ceiling(nchunks / disk.frame::nchunks(df)) * sort_split_sample_size - - # Sample and sort - sort_splits_sample <- cmap(df, dplyr::sample_n, size=sample_size_per_chunk, replace=TRUE) %>% - select(...) %>% - collect() - - # NSE - tryCatch({ - sort_splits_sample <- sort_splits_sample %>% - arrange(!!!syms(...)) - }, error = function(e) { - sort_splits_sample <- sort_splits_sample %>% - arrange(...) - }) - - # If 100 chunks, this return get 99 splits based on percentiles. - ntiles <- round((1:(nchunks-1)) * (nrow(sort_splits_sample) / (nchunks))) - - # Get splits. May lead to less than nchunks if duplicates are selected. 
-    sort_splits <- sort_splits_sample %>%
-      dplyr::slice(ntiles) %>%
-      distinct()
-  }
-
-  # test if the unlist it will error
-
-  tryCatch({
-    # This will return the variable names
-
-    # TODO use better ways to do NSE
-    # the below will fail if indeed ... can not be list-ed
-    # there should be a better way to do this
-    by <- unlist(list(...))
-
-    # shard and create temporary diskframes
-    tmp_df = cmap(df, function(df1) {
-      tmpdir = tempfile()
-      shard(df1, shardby = by, nchunks = nchunks, outdir = tmpdir, overwrite = TRUE, shardby_function=shardby_function, sort_splits=sort_splits, desc_vars=desc_vars)
-    }, lazy = FALSE)
-
-
-    # now rbindlist
-    res = rbindlist.disk.frame(tmp_df, outdir=outdir, overwrite = overwrite)
-
-    # clean up the tmp dir
-    purrr::walk(tmp_df, ~{
-      fs::dir_delete(attr(.x, "path", exact=TRUE))
-    })
-
-
-    res1 <- NULL
-    if(typeof(by) == "character") {
-      eval(parse(text = glue::glue('res1 = chunk_group_by(res, {paste(by,collapse=",")})')))
-    } else if(length(by) == 1) {
-      res1 = res %>% dplyr::group_by({{by}})
-    } else {
-      eval(parse(text = glue::glue('res1 = chunk_group_by(res, {paste(by,collapse=",")})')))
-    }
-
-    res1
-  }, error = function(e) {
-    # message(e)
-    # This will return the variable names
-    by = rlang::enquos(...) %>%
-      substr(2, nchar(.))
-
-    # shard and create temporary diskframes
-    tmp_df = cmap(df, function(df1) {
-      tmpdir = tempfile()
-      shard(df1, shardby = by, nchunks = nchunks, outdir = tmpdir, overwrite = TRUE, shardby_function=shardby_function, sort_splits=sort_splits, desc_vars=desc_vars)
-    }, lazy = FALSE)
-
-    # now rbindlist
-    res = rbindlist.disk.frame(tmp_df, outdir=outdir, overwrite = overwrite)
-
-    # clean up the tmp dir
-    purrr::walk(tmp_df, ~{
-      fs::dir_delete(attr(.x, "path", exact=TRUE))
-    })
-
-    res1 = res %>% chunk_group_by(!!!syms(by))
-
-    res1
-  })
-}
+#' #' @rdname hard_group_by
+#' #' @importFrom purrr map
+#' #' @export
+#' hard_group_by.disk.frame <- function(
+#'   df,
+#'   ...,
+#'   outdir=tempfile("tmp_disk_frame_hard_group_by"),
+#'   nchunks = disk.frame::nchunks(df),
+#'   overwrite = TRUE,
+#'   shardby_function="hash",
+#'   sort_splits=NULL,
+#'   desc_vars=NULL,
+#'   sort_split_sample_size=100
+#'   ) {
+#'   overwrite_check(outdir, overwrite)
+#'
+#'   ff = list.files(attr(df, "path"))
+#'   stopifnot(shardby_function %in% c("hash", "sort"))
+#'
+#'   if (shardby_function == "sort" && is.null(sort_splits)){
+#'     # Sample enough per chunk to generate reasonable splits
+#'     sample_size_per_chunk = ceiling(nchunks / disk.frame::nchunks(df)) * sort_split_sample_size
+#'
+#'     # Sample and sort
+#'     sort_splits_sample <- cmap(df, dplyr::sample_n, size=sample_size_per_chunk, replace=TRUE) %>%
+#'       select(...) %>%
+#'       collect()
+#'
+#'     # NSE
+#'     tryCatch({
+#'       sort_splits_sample <- sort_splits_sample %>%
+#'         arrange(!!!syms(...))
+#'     }, error = function(e) {
+#'       sort_splits_sample <- sort_splits_sample %>%
+#'         arrange(...)
+#'     })
+#'
+#'     # If 100 chunks, this returns 99 splits based on percentiles.
+#'     ntiles <- round((1:(nchunks-1)) * (nrow(sort_splits_sample) / (nchunks)))
+#'
+#'     # Get splits. May lead to less than nchunks if duplicates are selected.
+#'     sort_splits <- sort_splits_sample %>%
+#'       dplyr::slice(ntiles) %>%
+#'       distinct()
+#'   }
+#'
+#'   # test whether the dots can be unlist()-ed; if not, fall back to the NSE path below
+#'
+#'   tryCatch({
+#'     # This will return the variable names
+#'
+#'     # TODO use better ways to do NSE
+#'     # the below will fail if indeed ... cannot be list()-ed
+#'     # there should be a better way to do this
+#'     by <- unlist(list(...))
+#'
+#'     # shard and create temporary diskframes
+#'     tmp_df = cmap(df, function(df1) {
+#'       tmpdir = tempfile()
+#'       shard(df1, shardby = by, nchunks = nchunks, outdir = tmpdir, overwrite = TRUE, shardby_function=shardby_function, sort_splits=sort_splits, desc_vars=desc_vars)
+#'     }, lazy = FALSE)
+#'
+#'
+#'     # now rbindlist
+#'     res = rbindlist.disk.frame(tmp_df, outdir=outdir, overwrite = overwrite)
+#'
+#'     # clean up the tmp dir
+#'     purrr::walk(tmp_df, ~{
+#'       fs::dir_delete(attr(.x, "path", exact=TRUE))
+#'     })
+#'
+#'
+#'     res1 <- NULL
+#'     if(typeof(by) == "character") {
+#'       eval(parse(text = glue::glue('res1 = chunk_group_by(res, {paste(by,collapse=",")})')))
+#'     } else if(length(by) == 1) {
+#'       res1 = res %>% dplyr::group_by({{by}})
+#'     } else {
+#'       eval(parse(text = glue::glue('res1 = chunk_group_by(res, {paste(by,collapse=",")})')))
+#'     }
+#'
+#'     res1
+#'   }, error = function(e) {
+#'     # message(e)
+#'     # This will return the variable names
+#'     by = rlang::enquos(...) %>%
+#'       substr(2, nchar(.))
+#'
+#'     # shard and create temporary diskframes
+#'     tmp_df = cmap(df, function(df1) {
+#'       tmpdir = tempfile()
+#'       shard(df1, shardby = by, nchunks = nchunks, outdir = tmpdir, overwrite = TRUE, shardby_function=shardby_function, sort_splits=sort_splits, desc_vars=desc_vars)
+#'     }, lazy = FALSE)
+#'
+#'     # now rbindlist
+#'     res = rbindlist.disk.frame(tmp_df, outdir=outdir, overwrite = overwrite)
+#'
+#'     # clean up the tmp dir
+#'     purrr::walk(tmp_df, ~{
+#'       fs::dir_delete(attr(.x, "path", exact=TRUE))
+#'     })
+#'
+#'     res1 = res %>% chunk_group_by(!!!syms(by))
+#'
+#'     res1
+#'   })
+#' }
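With `hard_group_by()` retired, code that relied on its "rows sharing a `by` value land in the same chunk" behaviour can approximate it with `rechunk()` plus `chunk_group_by()`. A minimal sketch, assuming the `rechunk()` signature introduced later in this diff; object names are illustrative only:

``` r
library(disk.frame)
library(dplyr)
setup_disk.frame(workers = 2)

iris.df = as.disk.frame(iris, nchunks = 2)

# co-locate rows with the same Species in one chunk, then group within chunks
iris_hard.df = rechunk(iris.df, nchunks = 2,
                       outdir = tempfile(fileext = ".df"),
                       shardby = "Species") %>%
  chunk_group_by(Species)

# clean up
delete(iris.df)
delete(iris_hard.df)
```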
diff --git a/R/inner_join.r b/R/inner_join.r
index 545136f9..5699e71b 100644
--- a/R/inner_join.r
+++ b/R/inner_join.r
@@ -8,7 +8,7 @@
 #' # clean up cars.df
 #' delete(cars.df)
 #' delete(join.df)
-inner_join.disk.frame <- function(x, y, by=NULL, copy=FALSE, ..., outdir = tempfile("tmp_disk_frame_inner_join"), merge_by_chunk_id = NULL, overwrite = TRUE, .progress = FALSE) {
+inner_join.disk.frame <- function(x, y, by=NULL, copy=FALSE, suffix=c(".x", ".y"), ..., keep=FALSE, outdir = tempfile("tmp_disk_frame_inner_join"), merge_by_chunk_id = NULL, overwrite = TRUE, .progress = FALSE) {
   stopifnot("disk.frame" %in% class(x))
 
   overwrite_check(outdir, overwrite)
@@ -24,10 +24,8 @@
   }
 
   if("data.frame" %in% class(y)) {
-    quo_dotdotdot = enquos(...)
     res = cmap_dfr(x, ~{
-      code = quo(inner_join(.x, y, by = by, copy = copy, !!!quo_dotdotdot))
-      rlang::eval_tidy(code)
+      inner_join(.x, y, by = by, copy = copy, suffix=suffix, ..., keep=keep)
     }, .progress = .progress)
     return(res)
   } else if("disk.frame" %in% class(y)) {
@@ -41,22 +39,18 @@
     ncx = nchunks(x)
     ncy = nchunks(y)
     if (merge_by_chunk_id == FALSE) {
-      x = hard_group_by(x, by, nchunks = max(ncy,ncx), overwrite = TRUE)
-      y = hard_group_by(y, by, nchunks = max(ncy,ncx), overwrite = TRUE)
-      return(inner_join.disk.frame(x, y, by, outdir = outdir, merge_by_chunk_id = TRUE, overwrite = overwrite))
+      x = rechunk(x, shardby=by, nchunks = max(ncy,ncx), outdir = tempfile(fileext = ".df"), overwrite = FALSE)
+      y = rechunk(y, shardby=by, nchunks = max(ncy,ncx), outdir = tempfile(fileext = ".df"), overwrite = FALSE)
+      return(inner_join.disk.frame(x, y, by, copy=copy, suffix = suffix, ..., keep=keep, outdir = outdir, merge_by_chunk_id = TRUE, overwrite = overwrite))
     } else if ((identical(shardkey(x)$shardkey, "") & identical(shardkey(y)$shardkey, "")) | identical(shardkey(x), shardkey(y))) {
-      dotdotdot <- list(...)
-
       res = cmap2.disk.frame(x, y, ~{
         if(is.null(.y)) {
           return(data.table())
         } else if (is.null(.x)) {
           return(data.table())
         }
-        #inner_join(.x, .y, by = by, copy = copy, ..., overwrite = overwrite)
-        lij = purrr::lift(dplyr::inner_join)
-        lij(c(list(x = .x, y = .y, by = by, copy = copy), dotdotdot))
-      }, outdir = outdir, .progress = .progress)
+        inner_join(.x, .y, by = by, copy = copy, suffix = suffix, ..., keep=keep)
+      }, outdir = outdir, .progress = .progress, overwrite = overwrite)
       return(res)
     } else {
       # TODO if the shardkey are the same and only the shardchunks are different then just shard again on one of them is fine
diff --git a/R/left_join.r b/R/left_join.r
index 4c1b6e3b..e3b810b8 100644
--- a/R/left_join.r
+++ b/R/left_join.r
@@ -1,3 +1,5 @@
+left_join_y_is_data.frame = create_chunk_mapper(dplyr::left_join)
+
 #' Performs join/merge for disk.frames
 #' @rdname join
 #' @export
@@ -9,21 +11,14 @@
 #' # clean up cars.df
 #' delete(cars.df)
 #' delete(join.df)
-left_join.disk.frame <- function(x, y, by=NULL, copy=FALSE, ..., outdir = tempfile("tmp_disk_frame_left_join"), merge_by_chunk_id = FALSE, overwrite = TRUE, .progress = FALSE) {
+left_join.disk.frame = function(x, y, by=NULL, copy=FALSE, suffix=c(".x", ".y"), ..., keep=FALSE, outdir = tempfile("tmp_disk_frame_left_join"), merge_by_chunk_id = FALSE, overwrite = TRUE, .progress = FALSE) {
   stopifnot("disk.frame" %in% class(x))
-  overwrite_check(outdir, overwrite)
-
-  if("data.frame" %in% class(y)) {
-    # note that x is named .data in the lazy evaluation
-    quo_dotdotdot = enquos(...)
-    cmap_dfr(x, ~{
-      code = quo(left_join(.x, y, by = by, copy = copy, !!!quo_dotdotdot))
-      rlang::eval_tidy(code)
-    }, .progress = .progress)
-  } else if("disk.frame" %in% class(y)) {
+  if ("data.frame" %in% class(y)) {
+    left_join_y_is_data.frame(x, y, by=by, copy=copy, suffix=suffix, ..., keep=keep)
+  } else {
    if(is.null(merge_by_chunk_id)) {
-      stop("both x and y are disk.frames. You need to specify merge_by_chunk_id = TRUE or FALSE explicitly")
+      stop("Both `x` and `y` are disk.frames. You need to specify `merge_by_chunk_id = TRUE` or `FALSE` explicitly")
     }
     if(is.null(by)) {
       by <- intersect(names(x), names(y))
@@ -32,12 +27,11 @@
     ncx = nchunks(x)
     ncy = nchunks(y)
     if (merge_by_chunk_id == FALSE) {
-      warning("merge_by_chunk_id = FALSE. This will take significantly longer and the preparations needed are performed eagerly which may lead to poor performance. Consider making y a data.frame or set merge_by_chunk_id = TRUE for better performance.")
-      x = hard_group_by(x, by, nchunks = max(ncy,ncx), overwrite = TRUE)
-      y = hard_group_by(y, by, nchunks = max(ncy,ncx), overwrite = TRUE)
+      warning("`merge_by_chunk_id = FALSE`. This will take significantly longer and the preparations needed are performed eagerly which may lead to poor performance. Consider making `y` a data.frame or set merge_by_chunk_id = TRUE for better performance.")
+      x = rechunk(x, nchunks = max(ncy, ncx), shardby = by, outdir=tempfile(), overwrite = FALSE)
+      y = rechunk(y, nchunks = max(ncy, ncx), shardby = by, outdir=tempfile(), overwrite = FALSE)
       return(left_join.disk.frame(x, y, by, copy = copy, outdir = outdir, merge_by_chunk_id = TRUE, overwrite = overwrite, .progress = .progress))
     } else if(merge_by_chunk_id == TRUE) {
-      #} else if ((identical(shardkey(x)$shardkey, "") & identical(shardkey(y)$shardkey, "")) | identical(shardkey(x), shardkey(y))) {
       dotdotdot = list(...)
       res = cmap2.disk.frame(x, y, ~{
         if(is.null(.y)) {
@@ -45,9 +39,9 @@
         } else if (is.null(.x)) {
           return(data.table())
         }
-        llj = purrr::lift(dplyr::left_join)
-        #left_join(.x, .y, by = by, copy = copy, ...)
-        llj(c(list(x=.x, y =.y, by = by, copy = copy), dotdotdot))
+        left_join(.x, .y, by = by, copy = copy, suffix=suffix, ..., keep=keep)
+        #llj = purrr::lift(dplyr::left_join)
+        #llj(c(list(x=.x, y =.y, by = by, copy = copy), dotdotdot))
       }, outdir = outdir)
       return(res)
     } else {
@@ -56,3 +50,4 @@
   }
 }
+
\ No newline at end of file
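The widened join signatures above forward dplyr's `suffix` and `keep` arguments chunk by chunk. A small usage sketch — the lookup table `speed_labels` is made up for illustration, and when `y` is a plain data.frame the join stays lazy and per-chunk:

``` r
library(disk.frame)
library(dplyr)

cars.df = as.disk.frame(cars)
speed_labels = data.frame(speed = 4:25, label = "observed")

# suffix/keep are passed straight through to dplyr::left_join on each chunk
res = left_join(cars.df, speed_labels,
                by = "speed", suffix = c(".x", ".y"), keep = FALSE) %>%
  collect()

delete(cars.df)
```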
diff --git a/R/map-deprecated.r b/R/map-deprecated.r
deleted file mode 100644
index bac52e3d..00000000
--- a/R/map-deprecated.r
+++ /dev/null
@@ -1,85 +0,0 @@
-#' @export
-#' @rdname cmap
-map <- function(.x, .f, ...) {
-  UseMethod("map")
-}
-
-#' @export
-#' @rdname cmap
-map.disk.frame <- function(...) {
-  warning("map(df, ...) where df is a disk.frame has been deprecated. Please use cmap(df,...) instead")
-  cmap.disk.frame(...)
-}
-
-#' @export
-#' @rdname cmap
-map.default <- function(.x, .f, ...) {
-  purrr::map(.x, .f, ...)
-}
-
-
-#' @export
-#' @rdname cmap
-imap_dfr <- function(.x, .f, ..., .id = NULL) {
-  UseMethod("imap_dfr")
-}
-
-#' @export
-#' @rdname cmap
-imap_dfr.disk.frame <- function(...) {
-  warning("imap_dfr(df, ...) where df is disk.frame is deprecated. Please use cimap_dfr(df, ...) instead")
-  cimap_dfr.disk.frame(...)
-}
-
-#' @export
-#' @rdname cmap
-imap_dfr.default <- function(.x, .f, ..., .id = NULL) {
-  purrr::imap_dfr(.x, .f, ..., .id = .id)
-}
-
-#' @export
-#' @rdname cmap
-#' @examples
-#' cars.df = as.disk.frame(cars)
-#'
-#' # .x is the chunk and .y is the ID as an integer
-#'
-#' # lazy = TRUE support is not available at the moment
-#' cimap(cars.df, ~.x[, id := .y], lazy = FALSE)
-#'
-#' cimap_dfr(cars.df, ~.x[, id := .y])
-#'
-#' # clean up cars.df
-#' delete(cars.df)
-imap <- function(.x, .f, ...) {
-  UseMethod("imap")
-}
-
-imap.disk.frame <- function(...) {
-  warning("imap(df,..) where df is disk.frame is deprecated. Use cimap(df, ...) instead")
-  cimap.disk.frame(...)
-}
-
-#' @export
-#' @rdname cmap
-imap.default <- function(.x, .f, ...) {
-  purrr::imap(.x, .f, ...)
-}
-
-#' @rdname cmap
-#' @param .id not used
-#' @export
-map_dfr.disk.frame <- function(...) {
-  warning("map_dfr(df, ...) where df is disk.frame is deprecated. Please use cmap_dfr instead")
-  cmap_dfr.disk.frame(...)
-}
-
-map_dfr <- function(.x, .f, ..., .id = NULL) {
-  UseMethod("map_dfr")
-}
-
-#' @export
-#' @rdname cmap
-map_dfr.default <- function(.x, .f, ..., .id = NULL) {
-  purrr::map_dfr(.x, .f, ..., .id = .id)
-}
\ No newline at end of file
diff --git a/R/map2.r b/R/map2.r
index a2882b2f..22633241 100644
--- a/R/map2.r
+++ b/R/map2.r
@@ -26,22 +26,6 @@
 cmap2 <- function(.x, .y, .f, ...){
   UseMethod("cmap2")
 }
 
-#' @export
-#' @rdname cmap2
-map2 <- function(.x, .y, .f, ...){
-  UseMethod("map2")
-}
-
-#' @export
-map2.default <- function(.x, .y, .f, ...) {
-  purrr::map2(.x,.y,.f,...)
-}
-
-#' @export
-map2.disk.frame <- function(...) {
-  warning("map2.disk.frame(df, df1, ..) where df is disk.frame is deprecated. Use cmap(df, df1, ...) instead")
-  cmap2.disk.frame(...)
-}
 
 #' @export
 #' @importFrom pryr do_call
@@ -51,10 +35,8 @@ cmap2.disk.frame <- function(.x, .y, .f, ..., outdir = tempfile(fileext = ".df")
     stop(sprintf("running %s : the .x argument must be a disk.frame", code))
   }
 
-  .f = purrr::as_mapper(.f)
-
   if("disk.frame" %in% class(.y)) {
     fs::dir_create(outdir)
 
@@ -65,8 +47,6 @@
     yc[,yid:=get_chunk_ids(.y, full.names = TRUE)]
 
     xyc = merge(xc, yc, by="cid", all = TRUE, allow.cartesian = TRUE)
-
-    ddd = list(...)
 
     # apply the functions
     future.apply::future_mapply(function(xid, yid, outid) {
@@ -77,11 +57,10 @@
       if(base::nrow(xych) > 0) {
         fst::write_fst(xych, file.path(outdir, paste0(outid,".fst")))
       } else {
-        warning(glue::glue("one of the chunks, {xid}, is empty"))
+        warning(sprintf("one of the chunks, %s, is empty", xid))
       }
       NULL
-    }
-    ,xyc$xid, xyc$yid, xyc$cid # together with mapply
+    }, xyc$xid, xyc$yid, xyc$cid # together with mapply
     , future.seed=NULL
     )
 
@@ -91,13 +70,12 @@
     warning("in cmap2(.x,.y,...) the .y is not a disk.frame, so returning a list instead of a disk.frame")
     f_for_passing = force(.f)
-    ddd = list(...)
     tmp_disk.frame = force(.x)
-    res = furrr::future_map2(get_chunk_ids(tmp_disk.frame, full.names = TRUE), .y, function(xs, ys) {
-      ddd = c(list(get_chunk(tmp_disk.frame, xs, full.names = TRUE), ys), ddd)
-
-      pryr::do_call(f_for_passing, ddd)
-    })
+    res = future.apply::future_mapply(function(xs, ys, ...) {
+      ddd = c(list(get_chunk(tmp_disk.frame, xs, full.names = TRUE), ys), ...)
+      do.call(f_for_passing, ddd)
+    }, get_chunk_ids(tmp_disk.frame, full.names = TRUE), .y, ..., SIMPLIFY=FALSE, future.seed = TRUE)
+
    return(res)
  }
}
diff --git a/R/map_by_chunk_id.r b/R/map_by_chunk_id.r
index f446848a..32ee0d16 100644
--- a/R/map_by_chunk_id.r
+++ b/R/map_by_chunk_id.r
@@ -2,5 +2,5 @@
 #' @export
 map_by_chunk_id <- function(.x, .y, .f, ..., outdir) {
-  warning("map_by_chunk_id is deprecated. Use map2 instead")
-  map2.disk.frame(.x, .y, .f, ..., outdir = outdir)
+  warning("map_by_chunk_id is deprecated. Use cmap2 instead")
+  cmap2.disk.frame(.x, .y, .f, ..., outdir = outdir)
 }
\ No newline at end of file
diff --git a/R/names.r b/R/names.r
index 10954ad4..05d047da 100644
--- a/R/names.r
+++ b/R/names.r
@@ -25,8 +25,9 @@ names.disk.frame <- function(x, ...) {
 #' @export
 colnames.disk.frame <- function(x, ...) {
   res = attr(x, "path", exact=TRUE) %>%
-    fs::dir_ls(type="file")
-  if(is.null(attr(x, "lazyfn"))) {
+    list.files(full.names = TRUE)
+
+  if(is.null(attr(x, "recordings"))) {
     if(length(res) == 0) {
       return(vector("character"))
     }
@@ -42,4 +43,4 @@
 #' @export
 colnames.default <- function(x, ...) {
   base::colnames(x, ...)
-}
\ No newline at end of file
+}
diff --git a/R/one-stage-verbs.R b/R/one-stage-verbs.R
index 210be4f0..27f62f2f 100644
--- a/R/one-stage-verbs.R
+++ b/R/one-stage-verbs.R
@@ -209,28 +209,20 @@ IQR_df.collected_agg.disk.frame <- function(listx, ...) {
 #' @rdname group_by
 #' @export
 summarise.grouped_disk.frame <- function(.data, ...) {
+  # get all components of the summarise
+  dotdotdot = rlang::enexprs(...)
+  # convert any quosure to labels
+  for (i in seq_along(dotdotdot)) {
+    if("quosure" %in% class(dotdotdot[[i]])) {
+      dotdotdot[[i]] <- rlang::sym(rlang::as_label(dotdotdot[[i]]))
+    }
+  }
-  ca_code = generate_summ_code(...)
+  class(.data) <- c("summarized_disk.frame", "disk.frame")
+  attr(.data, "summarize_code") = dotdotdot
-  if(is.null(names(ca_code))) {
-    return(eval(parse(text = glue::glue(".data %>% {rlang::as_label(ca_code)}"))))
-  } else if("chunk_summ_code" %in% names(ca_code)) {
-    chunk_summ_code = ca_code$chunk_summ_code
-    agg_summ_code = ca_code$agg_summ_code
-
-    # get the by variables
-    group_by_cols = purrr::map_chr(attr(.data, "group_by_cols", exact=TRUE), ~{deparse(.x)})
-
-    # generate full code
-    code_to_run = glue::glue("chunk_group_by({paste0(group_by_cols, collapse=',')}) %>% chunk_summarize({chunk_summ_code}) %>% collect %>% group_by({paste0(group_by_cols, collapse=',')}) %>% summarize({agg_summ_code})")
-
-    class(.data) <- c("summarized_disk.frame", "disk.frame")
-    attr(.data, "summarize_code") = code_to_run
-    return(.data)
-  } else {
-    stop("something's wrong mate")
-  }
+  return(.data)
 }
 
 #' @export
@@ -245,137 +237,53 @@ summarize.grouped_disk.frame = summarise.grouped_disk.frame
 #' reorganizes the chunks by the shard key.
 #' @seealso hard_group_by
 #' @param .data a disk.frame
-#' @param add from dplyr
+#' @param .add from dplyr
 #' @param .drop from dplyr
 #' @param ... same as the dplyr::group_by
 #' @importFrom dplyr group_by_drop_default
+#' @importFrom rlang enexpr
 #' @export
 #' @rdname group_by
 # learning from https://docs.dask.org/en/latest/dataframe-groupby.html
-group_by.disk.frame <- function(.data, ..., add = FALSE, .drop = dplyr::group_by_drop_default(.data)) {
+group_by.disk.frame <- function(.data, ..., .add = FALSE, .drop = stop("disk.frame does not support `.drop` in `group_by` at this stage")) {
+
   class(.data) <- c("grouped_disk.frame", "disk.frame")
-  attr(.data, "group_by_cols") = substitute(list(...))[-1]
+
+  # using rlang is a necessary evil here, as I need to deal with the !!! syntax that group_by etc. support
+  group_by_cols = rlang::enexprs(...)
+ + # convert any quosure to labels + for (i in seq_along(group_by_cols)) { + if("quosure" %in% class(group_by_cols[[i]])) { + group_by_cols[[i]] <- rlang::sym(rlang::as_label(group_by_cols[[i]])) + } + } + + + attr(.data, "group_by_cols") = group_by_cols + .data } + #' @export #' @importFrom dplyr summarize #' @rdname group_by summarize.disk.frame <- function(.data, ...) { - ca_code = generate_summ_code(...) + # get all components of the summarise + dotdotdot = rlang::enexprs(...) - if(is.null(names(ca_code))) { - return(eval(parse(text = glue::glue(".data %>% {rlang::as_label(ca_code)}")))) - } else if("chunk_summ_code" %in% names(ca_code)) { - chunk_summ_code = ca_code$chunk_summ_code - agg_summ_code = ca_code$agg_summ_code - - # generate full code - code_to_run = glue::glue("chunk_summarize({chunk_summ_code}) %>% collect %>% summarize({agg_summ_code})") - - class(.data) <- c("summarized_disk.frame", "disk.frame") - attr(.data, "summarize_code") = code_to_run - return(.data ) - } else { - stop("something's wrong") - } -} - -#' Helper function to generate summarisation code -#' @importFrom data.table setDT setkey -#' @importFrom utils methods -#' @noRd -generate_summ_code <- function(...) { - # expand the code - code_to_expand = glue::glue("quo(summarise({rlang::as_label(substitute(...))}))") - - summ_code_quosure = eval(parse(text = code_to_expand)) - #print(summ_code_quosure) + # convert any quosure to labels + for (i in seq_along(dotdotdot)) { + if("quosure" %in% class(dotdotdot[[i]])) { + dotdotdot[[i]] <- rlang::sym(rlang::as_label(dotdotdot[[i]])) + } + } - # ZJ: - # try the traditional route which can't deal with !!!, so if this fails then try the !!! route - tryCatch({ - code = substitute(list(...))[-1] - # print("hehe") - # print(code) - expr_id = 0 - temp_varn = 0 - - list_of_chunk_agg_fns <- as.character(utils::methods(class = "chunk_agg.disk.frame")) - list_of_collected_agg_fns <- as.character(utils::methods(class = "collected_agg.disk.frame")) - # browser() - # generate the chunk_summarize_code - summarize_code = purrr::map_dfr(code, ~{ - # print("raw code") - # print(.x) - expr_id <<- expr_id + 1 - # parse the function into table form for easy interrogration - # The keep.source = TRUE options seems necessary to keep it working in Rscript mode - gpd = getParseData(parse(text = deparse(.x), keep.source = TRUE), includeText = TRUE); - # print("raw table") - # print(deparse(.x)) - # print(gpd) - grp_funcs = gpd %>% filter(token == "SYMBOL_FUNCTION_CALL") %>% select(text) %>% pull - grp_funcs = grp_funcs %>% paste0("_df") - - # search in the space to find functions name `fn`.chunk_agg.disk.frame - # only allow one such functions for now TODO improve it - num_of_chunk_functions = sum(sapply(unique(grp_funcs), function(x) exists(paste0(x, ".chunk_agg.disk.frame")))) - num_of_collected_functions= sum(sapply(unique(grp_funcs), function(x) exists(paste0(x, ".collected_agg.disk.frame")))) - - # the number chunk and aggregation functions must match - stopifnot(num_of_chunk_functions == num_of_collected_functions) - - # keep only grp_functions - grp_funcs= grp_funcs[sapply(grp_funcs, function(x) exists(paste0(x, ".chunk_agg.disk.frame")))] - - if(num_of_chunk_functions == 0) { - stop(sprintf("There must be at least one summarization function in %s", deparse(.x))) - } else if (num_of_chunk_functions > 1) { - stop(sprintf("Two or more summarisation functions are detected in \n\n```\n%s\n```\n\nThese are currently not supported by {disk.frame} at the moment \n * Nestling (like mean(sum(x) 
+ y)) or \n * combinations (like sum(x) + mean(x))\n\nIf you want this implemented, please leave a comment or upvote at: https://github.com/xiaodaigh/disk.frame/issues/228 \n\n", deparse(.x)))
-      }
-
-      # check to see if the mean is only two from parent 0, otherwise it would a statement in the form of 1 + mean(x)
-      # which isn't supported
-      data.table::setDT(gpd)
-      data.table::setkey(gpd, parent)
-      if (gpd[id == gpd[id == gpd[(paste0(text,"_df") == grp_funcs) & (token == "SYMBOL_FUNCTION_CALL"), parent], parent], parent] != 0) {
-        stop(sprintf("Combining summarization with other operations \n\n```\n%s\n```\n\nThese are currently not supported by {disk.frame} at the moment \n * combinations (like sum(x) + 1)\n* combinations (like list(sum(x)))\n\nIf you want this implemented, please leave a comment or upvote at: https://github.com/xiaodaigh/disk.frame/issues/228 \n\n", deparse(.x)))
-      }
-
-      temp_varn <<- temp_varn + 1
-      grp_funcs_wo_df = sapply(grp_funcs, function(grp_func) substr(grp_func, 1, nchar(grp_func)-3))
-
-      tmpcode = deparse(evalparseglue("substitute({deparse(.x)}, list({grp_funcs_wo_df} = quote({grp_funcs}.chunk_agg.disk.frame)))")) %>% paste0(collapse = " ")
-
-      chunk_code = data.frame(assign_to = as.character(glue::glue("tmp{temp_varn}")), expr = tmpcode, stringsAsFactors = FALSE)
-
-      chunk_code$orig_code = deparse(.x)
-      chunk_code$expr_id = expr_id
-      chunk_code$grp_fn = grp_funcs
-      chunk_code$name = ifelse(is.null(names(code[expr_id])), "", names(code[expr_id]))
-
-      # create the aggregation code
-      chunk_code$agg_expr = as.character(glue::glue("{grp_funcs}.collected_agg.disk.frame({paste0(chunk_code$assign_to, collapse=', ')})"))
-
-      #print(sapply(chunk_code, typeof))
-      chunk_code
-    })
-
-    chunk_summ_code = paste0(summarize_code$assign_to, "=list(", summarize_code$expr, ")") %>% paste0(collapse = ", ")
-
-    agg_code_df = summarize_code %>%
-      select(expr_id, name, agg_expr, orig_code) %>%
-      unique %>%
-      transmute(agg_code = paste0(ifelse(name == "", paste0("`", orig_code, "` = "), paste0(name, "=")), agg_expr))
-
-    agg_summ_code = paste0(agg_code_df$agg_code, collapse = ",")
-
-    return(list(chunk_summ_code = chunk_summ_code, agg_summ_code = agg_summ_code))
-  }, error = function(e) {
-    return(summ_code_quosure)
-  })
+  class(.data) <- c("summarized_disk.frame", "disk.frame")
+  attr(.data, "summarize_code") = dotdotdot
+  return(.data)
 }
@@ -383,8 +291,3 @@
 #' @importFrom dplyr summarize
 #' @rdname group_by
 summarise.disk.frame <- summarize.disk.frame
-
-
-
-
-
diff --git a/R/play.r b/R/play.r
new file mode 100644
index 00000000..b5ee9fe7
--- /dev/null
+++ b/R/play.r
@@ -0,0 +1,15 @@
+#' Play the recorded lazy operations
+#' @param dataframe A data.frame
+#' @param recordings A recording of the expression, globals and packages, made by create_chunk_mapper
+play <- function(dataframe, recordings) {
+  for(recording in recordings) {
+    tmp_env = list2env(recording$globals)
+
+    # replace .disk.frame.chunk with dataframe in the function
+    code = eval(bquote(substitute(.(recording$expr), list(.disk.frame.chunk=quote(dataframe)))))
+
+    # execute the delayed function
+    dataframe = eval(code, envir = tmp_env)
+  }
+  dataframe
+}
\ No newline at end of file
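The substitution trick `play()` relies on can be seen in isolation. A sketch with a hand-built recording — the `recording` object here is hypothetical, shaped to match what `play()` reads: an `expr` slot using the `.disk.frame.chunk` placeholder plus a `globals` list:

``` r
recording = list(
  expr    = quote(subset(.disk.frame.chunk, speed > cutoff)),
  globals = list(cutoff = 10)
)

# swap the placeholder for a concrete data.frame symbol, as play() does
code = eval(bquote(substitute(.(recording$expr),
                              list(.disk.frame.chunk = quote(cars)))))
code
#> subset(cars, speed > cutoff)

# evaluate with the recorded globals in scope
eval(code, envir = list2env(recording$globals, envir = new.env(parent = globalenv())))
```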
diff --git a/R/print.disk.frame.r b/R/print.disk.frame.r
index fe739059..5ff81516 100644
--- a/R/print.disk.frame.r
+++ b/R/print.disk.frame.r
@@ -7,13 +7,22 @@
 #' @importFrom glue glue
 # TODO add chunk
 print.disk.frame <- function(x, ...) {
-  a = paste(sep = "\n"
-            ,glue::glue("path: \"{attr(x,'path', exact=TRUE)}\"")
-            ,glue::glue("nchunks: {disk.frame::nchunks(x)}")
-            ,glue::glue("nrow (at source): {disk.frame::nrow(x)}")
-            ,glue::glue("ncol (at source): {disk.frame::ncol(x)}")
-            ,glue::glue("nrow (post operations): ???")
-            ,glue::glue("ncol (post operations): ???\n")
-  )
+  if (is.null(attr(x, "recordings"))) {
+    a = paste(sep = "\n"
+              ,glue::glue("path: \"{attr(x,'path', exact=TRUE)}\"")
+              ,glue::glue("nchunks: {disk.frame::nchunks(x)}")
+              ,glue::glue("nrow (at source): {disk.frame::nrow(x)}")
+              ,glue::glue("ncol (at source): {disk.frame::ncol(x)}")
+    )
+  } else {
+    a = paste(sep = "\n"
+              ,glue::glue("path: \"{attr(x,'path', exact=TRUE)}\"")
+              ,glue::glue("nchunks: {disk.frame::nchunks(x)}")
+              ,glue::glue("nrow (at source): {disk.frame::nrow(x)}")
+              ,glue::glue("ncol (at source): {disk.frame::ncol(x)}")
+              ,glue::glue("nrow (post operations): ???")
+              ,glue::glue("ncol (post operations): ???\n")
+    )
+  }
   message(a)
 }
diff --git a/R/rbindlist.disk.frame.r b/R/rbindlist.disk.frame.r
index 7bbdd19f..87850ad1 100644
--- a/R/rbindlist.disk.frame.r
+++ b/R/rbindlist.disk.frame.r
@@ -5,7 +5,7 @@
 #' @param parallel if TRUE then bind multiple disk.frame simultaneously, Defaults to TRUE
 #' @param compress 0-100, 100 being the highest compression rate.
 #' @param overwrite overwrite the output directory
-#' @param .progress A logical, for whether or not to print a progress bar for multiprocess, multisession, and multicore plans. From {furrr}
+#' @param .progress A logical, for whether or not to show progress.
 #' @import fs
 #' @importFrom data.table data.table setDT
 #' @importFrom future.apply future_lapply
diff --git a/R/rechunk.r b/R/rechunk.r
index 142b0576..9b79ba80 100644
--- a/R/rechunk.r
+++ b/R/rechunk.r
@@ -4,9 +4,6 @@
 #' @param shardby the shardkeys
 #' @param outdir the output directory
 #' @param overwrite overwrite the output directory
-#' @param shardby_function splitting of chunks: "hash" for hash function or "sort" for semi-sorted chunks
-#' @param sort_splits for the "sort" shardby function, a dataframe with the split values.
-#' @param desc_vars for the "sort" shardby function, the variables to sort descending.
#' @export #' @examples #' # create a disk.frame with 2 chunks in tempdir() @@ -22,9 +19,8 @@ #' # clean up cars.df #' delete(cars.df) #' delete(cars2.df) -rechunk <- function(df, nchunks, outdir = attr(df, "path", exact=TRUE), shardby = NULL, overwrite = TRUE, shardby_function="hash", sort_splits=NULL, desc_vars=NULL) { - - # we need to force the chunks to be computed first as it's common to make nchunks a multiple of chunks(df) +rechunk <- function(df, nchunks = disk.frame::nchunks(df), outdir = attr(df, "path", exact=TRUE), shardby = NULL, overwrite = TRUE) { + # we need to force the chunks to be computed first as it's common to make nchunks a multiple of chunks(df) # but if we do it too late then the folder could be empty force(nchunks) @@ -52,9 +48,9 @@ rechunk <- function(df, nchunks, outdir = attr(df, "path", exact=TRUE), shardby short_files = dir(outdir) # move all files to the back up folder - purrr::map(full_files, ~{ - fs::file_move(.x, back_up_tmp_dir) - }) + for(file in full_files) { + fs::file_move(file, back_up_tmp_dir) + } if(fs::dir_exists(file.path(outdir, ".metadata"))) { fs::dir_delete(file.path(outdir, ".metadata")) @@ -76,9 +72,14 @@ rechunk <- function(df, nchunks, outdir = attr(df, "path", exact=TRUE), shardby shardby = existing_shardkey[[1]] } - if(user_had_set_shard_by) { - return(hard_group_by(df, shardby, nchunks = nchunks, outdir = outdir, overwrite = TRUE, shardby_function=shardby_function, sort_splits=sort_splits, desc_vars=desc_vars)) + tmp = cmap(df, ~{ + shard(.x, shardby, nchunks=nchunks, overwrite=FALSE) + }) %>% collect_list + + return( + rbindlist.disk.frame(tmp) + ) } else if (identical(shardby, "") | is.null(shardby)) { # if no existing shardby nr = nrow(df) @@ -138,9 +139,9 @@ rechunk <- function(df, nchunks, outdir = attr(df, "path", exact=TRUE), shardby tmp_fdlr = tempfile("rechunk_shard") fs::dir_create(tmp_fdlr) - oks = furrr::future_map(which(lp == 1), function(i) { - file_chunk = file.path(attr(df, "path", exact=TRUE), i %>% paste0(".fst")) - fs::file_move(file_chunk, file.path(tmp_fdlr, possibles_new_chunk_id[[i]] %>% paste0(".fst"))) + oks = future.apply::future_lapply(which(lp == 1), function(i) { + file_chunk = file.path(attr(df, "path", exact=TRUE), paste0(i, ".fst")) + fs::file_move(file_chunk, file.path(tmp_fdlr, paste0(possibles_new_chunk_id[[i]], ".fst"))) disk.frame(tmp_fdlr) }) diff --git a/R/recommend_nchunks.r b/R/recommend_nchunks.r index e5be6ac0..2b987600 100644 --- a/R/recommend_nchunks.r +++ b/R/recommend_nchunks.r @@ -85,8 +85,6 @@ df_ram_size <- function() { message(system("wmic MemoryChip get Capacity", intern=TRUE)) message("") message("") - #message("The option disk.frame.ram_size is not set. - #message("To set the ram_size, do options(disk.frame_ram_size = your_ram_size_in_gigabytes)") ram_size = 16 } } diff --git a/R/semi_join.r b/R/semi_join.r index fb46350f..f462efac 100644 --- a/R/semi_join.r +++ b/R/semi_join.r @@ -17,11 +17,11 @@ semi_join.disk.frame <- function(x, y, by=NULL, copy=FALSE, ..., outdir = tempfi overwrite_check(outdir, overwrite) if("data.frame" %in% class(y)) { - quo_dotdotdot = enquos(...) - cmap_dfr(x, ~{ - code = quo(semi_join(.x, y, by = by, copy = copy, !!!quo_dotdotdot)) - rlang::eval_tidy(code) + tmp = cmap_dfr(x, ~{ + semi_join(.x, y, by = by, copy = copy, ...) }, .progress = .progress) + + return(tmp) } else if("disk.frame" %in% class(y)) { if(is.null(merge_by_chunk_id)) { stop("both x and y are disk.frames. 
You need to specify merge_by_chunk_id = TRUE or FALSE explicitly") @@ -34,8 +34,8 @@ semi_join.disk.frame <- function(x, y, by=NULL, copy=FALSE, ..., outdir = tempfi ncy = nchunks(y) if (merge_by_chunk_id == FALSE) { warning("merge_by_chunk_id = FALSE. This will take significantly longer and the preparations needed are performed eagerly which may lead to poor performance. Consider making y a data.frame or set merge_by_chunk_id = TRUE for better performance.") - x = hard_group_by(x, by, nchunks = max(ncy,ncx), overwrite = TRUE) - y = hard_group_by(y, by, nchunks = max(ncy,ncx), overwrite = TRUE) + x = rechunk(x, by, nchunks = max(ncy,ncx), outdir=tempfile(fileext = ".jdf"), overwrite = FALSE) + y = rechunk(y, by, nchunks = max(ncy,ncx), outdir=tempfile(fileext = ".jdf"), overwrite = FALSE) return(semi_join.disk.frame(x, y, by, copy = copy, outdir = outdir, merge_by_chunk_id = TRUE, overwrite = overwrite, .progress = .progress)) } else if ((identical(shardkey(x)$shardkey, "") & identical(shardkey(y)$shardkey, "")) | identical(shardkey(x), shardkey(y))) { res = cmap2.disk.frame(x, y, ~{ diff --git a/R/shard.r b/R/shard.r index e3b9285c..42a94946 100644 --- a/R/shard.r +++ b/R/shard.r @@ -4,9 +4,6 @@ #' @param nchunks The number of chunks #' @param outdir The output directory of the disk.frame #' @param overwrite If TRUE then the chunks are overwritten -#' @param shardby_function splitting of chunks: "hash" for hash function or "sort" for semi-sorted chunks -#' @param sort_splits If shardby_function is "sort", the split values for sharding -#' @param desc_vars for the "sort" shardby function, the variables to sort descending. #' @param ... not used #' @importFrom data.table setDT #' @importFrom glue glue @@ -18,32 +15,33 @@ #' #' # clean up cars.df #' delete(iris.df) -shard <- function(df, shardby, outdir = tempfile(fileext = ".df"), ..., nchunks = recommend_nchunks(df), overwrite = FALSE, shardby_function="hash", sort_splits=NULL, desc_vars=NULL) { +shard <- function(df, shardby, outdir = tempfile(fileext = ".df"), ..., nchunks = recommend_nchunks(df), overwrite = FALSE) { force(nchunks) overwrite_check(outdir, overwrite) - stopifnot(shardby_function %in% c("hash", "sort")) + # stopifnot(shardby_function %in% c("hash", "sort")) if("data.frame" %in% class(df)) { data.table::setDT(df) - if(shardby_function == "hash"){ - message("Hashing...") - if(length(shardby) == 1) { - code = glue::glue("df[,.out.disk.frame.id := hashstr2i(as.character({shardby}), nchunks)]") - } else { - shardby_list = glue::glue("paste0({paste0(sort(shardby),collapse=',')})") - code = glue::glue("df[,.out.disk.frame.id := hashstr2i({shardby_list}, nchunks)]") - } - } else if(shardby_function == "sort"){ - if(nchunks == 1){ - message("Only one chunk: set .out.disk.frame.id = 0") - code = glue::glue("df[,.out.disk.frame.id := 0]") - } else { - shard_by_rule <- sortablestr2i(sort_splits, desc_vars) - # message(shard_by_rule) - setDT(df) - code = glue::glue("df[,.out.disk.frame.id := {shard_by_rule}]") - } + # if(shardby_function == "hash"){ + # message("Hashing...") + if(length(shardby) == 1) { + # TODO rewrite + code = glue::glue("df[,.out.disk.frame.id := hashstr2i(as.character({shardby}), nchunks)]") + } else { + shardby_list = glue::glue("paste0({paste0(sort(shardby),collapse=',')})") + code = glue::glue("df[,.out.disk.frame.id := hashstr2i({shardby_list}, nchunks)]") } + # } else if(shardby_function == "sort"){ + # if(nchunks == 1){ + # message("Only one chunk: set .out.disk.frame.id = 0") + # code = 
glue::glue("df[,.out.disk.frame.id := 0]") + # } else { + # shard_by_rule <- sortablestr2i(sort_splits, desc_vars) + # # message(shard_by_rule) + # setDT(df) + # code = glue::glue("df[,.out.disk.frame.id := {shard_by_rule}]") + # } + # } tryCatch( eval(parse(text=code)), @@ -54,11 +52,11 @@ shard <- function(df, shardby, outdir = tempfile(fileext = ".df"), ..., nchunks stopifnot(".out.disk.frame.id" %in% names(df)) - res = write_disk.frame(df, outdir = outdir, nchunks = nchunks, overwrite = TRUE, shardby = shardby, shardchunks = nchunks, shardby_function=shardby_function, sort_splits=sort_splits, desc_vars=desc_vars) + res = write_disk.frame(df, outdir = outdir, nchunks = nchunks, overwrite = TRUE, shardby = shardby, shardchunks = nchunks) return(res) } else if ("disk.frame" %in% class(df)){ nchunks_rechunk = nchunks - return(rechunk(df, shardby = shardby, nchunks = nchunks_rechunk, outdir = outdir, overwrite = TRUE, shardby_function=shardby_function, sort_splits=sort_splits, desc_vars=desc_vars)) + return(rechunk(df, shardby = shardby, nchunks = nchunks_rechunk, outdir = outdir, overwrite = TRUE)) } } diff --git a/R/srckeep.disk.frame.r b/R/srckeep.disk.frame.r index 54aef73a..1cfc1b2a 100644 --- a/R/srckeep.disk.frame.r +++ b/R/srckeep.disk.frame.r @@ -21,11 +21,11 @@ srckeep <- function(diskf, selections, ...) { #' @param chunks The chunks to load #' @rdname srckeep #' @export -srckeepchunks <- function(diskf, chunks, ...) { - stopifnot("disk.frame" %in% class(diskf)) - # TODO relax this - stopifnot(is.integer(chunks)) - - attr(df,"keep_chunks") = chunks - diskf -} +# srckeepchunks <- function(diskf, chunks, ...) { +# stopifnot("disk.frame" %in% class(diskf)) +# # TODO relax this +# stopifnot(is.integer(chunks)) +# +# attr(df,"keep_chunks") = chunks +# diskf +# } diff --git a/R/util.r b/R/util.r index 9fabce9e..8b760f0f 100644 --- a/R/util.r +++ b/R/util.r @@ -24,4 +24,24 @@ gen_datatable_synthetic <- function(N=2e8, K=100) { v3 = sample(round(runif(100,max=100),4), N, TRUE), # numeric e.g. 23.5749 date1 = sample(seq(as.Date('1970-01-01'), as.Date('2019-01-01'), by = "day"), N, TRUE) # date ) -} \ No newline at end of file +} + +#' Used to convert a function to purrr syntax if needed +#' @param .f a normal function or purrr syntax function i.e. `~{ ...code...}` +#' @importFrom purrr as_mapper +purrr_as_mapper <- function(.f) { + if(typeof(.f) == "language") { + if(requireNamespace("purrr")) { + .f = purrr::as_mapper(.f) + } else { + code = paste0(deparse(substitute(.f)), collapse = "") + stop( + sprintf( + "in cmap(.x, %s), it appears you are using {purrr} syntax but do not have {purrr} installed. Try `install.packages('purrr')`", + code + ) + ) + } + } + return(.f) +} diff --git a/R/write_disk.frame.r b/R/write_disk.frame.r index ceab7c15..cdfd0c36 100644 --- a/R/write_disk.frame.r +++ b/R/write_disk.frame.r @@ -2,7 +2,7 @@ #' @description #' Write a data.frame/disk.frame to a disk.frame location. 
If df is a data.frame
#' then using the as.disk.frame function is recommended for most cases
-#' @param df a disk.frame
+#' @param diskf a disk.frame
 #' @param outdir output directory for the disk.frame
 #' @param nchunks number of chunks
 #' @param overwrite overwrite output directory
@@ -26,12 +26,12 @@
 #' delete(cars.df)
 #' delete(cars2.df)
 write_disk.frame <- function(
-  df,
+  diskf,
   outdir = tempfile(fileext = ".df"),
   nchunks = ifelse(
-    "disk.frame"%in% class(df),
-    nchunks.disk.frame(df),
-    recommend_nchunks(df)),
+    "disk.frame"%in% class(diskf),
+    nchunks.disk.frame(diskf),
+    recommend_nchunks(diskf)),
   overwrite = FALSE,
   shardby=NULL, compress = 50,
   shardby_function="hash", sort_splits=NULL, desc_vars=NULL,
   ...) {
@@ -40,16 +40,30 @@
 
   if(is.null(outdir)) {
-    stop("outdir must not be NULL")
+    stop("write_disk.frame error: outdir must not be NULL")
   }
 
-  if(is_disk.frame(df)) {
+  if(is_disk.frame(diskf)) {
     if(is.null(shardby)) {
-      cmap.disk.frame(df, ~.x, outdir = outdir, lazy = FALSE, ..., compress = compress, overwrite = TRUE)
+      path = attr(diskf, "path")
+      files_shortname <- list.files(path)
+      cids = get_chunk_ids(diskf, full.names = T, strip_extension = F)
+
+      future.apply::future_lapply(1:length(cids), function(ii, ...) {
+        chunk = get_chunk(diskf, cids[ii], full.names = TRUE)
+        if(nrow(chunk) == 0) {
+          warning(sprintf("The output chunk has 0 rows, therefore chunk %d NOT written", ii))
+        } else {
+          out_chunk_name = file.path(outdir, files_shortname[ii])
+          fst::write_fst(chunk, out_chunk_name, compress)
+          return(files_shortname)
+        }
+        return(NULL)
+      }, ..., future.seed = TRUE)
+
+      return(disk.frame(outdir))
     } else {
       # TODO really inefficient
-      #df2 = cmap.disk.frame(df, ~.x, outdir = outdir, lazy = FALSE, ..., compress = compress, overwrite = TRUE)
-      shard(df,
+      shard(diskf,
         outdir = outdir,
         nchunks = nchunks,
        overwrite = TRUE,
@@ -60,9 +74,9 @@
        ...
      )
    }
-  } else if ("data.frame" %in% class(df)) {
-    if(".out.disk.frame.id" %in% names(df)) {
-      df[,{
+  } else if ("data.frame" %in% class(diskf)) {
+    if(".out.disk.frame.id" %in% names(diskf)) {
+      diskf[,{
         if (base::nrow(.SD) > 0) {
           list_columns = purrr::map_lgl(.SD, is.list)
           if(any(list_columns)){
           } else {
             fst::write_fst(.SD, file.path(outdir, paste0(.BY, ".fst")), compress = compress)
             NULL
-          }
         }
+          }
         NULL
       }, .out.disk.frame.id]
       res = disk.frame(outdir)
       add_meta(res, shardkey = shardby, shardchunks = nchunks, compress = compress)
     } else {
-      as.disk.frame(df, outdir = outdir, nchunks = nchunks, overwrite = TRUE, shardby = shardby, compress = compress, ...)
+      as.disk.frame(diskf, outdir = outdir, nchunks = nchunks, overwrite = TRUE, shardby = shardby, compress = compress, ...)
     }
   } else {
-    stop("write_disk.frame error: df must be a disk.frame or data.frame")
+    stop("write_disk.frame error: diskf must be a disk.frame or data.frame")
   }
 }
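With the first argument renamed to `diskf` and the chunk copying now handled by `future.apply`, a round trip looks like the sketch below. Object names are illustrative; `overwrite = TRUE` is passed because `write_disk.frame` defaults to `FALSE`:

``` r
library(disk.frame)

cars.df = as.disk.frame(cars, nchunks = 2)

# copy the chunk files to a new location; empty chunks are skipped with a warning
cars2.df = write_disk.frame(cars.df, outdir = tempfile(fileext = ".df"), overwrite = TRUE)
nchunks(cars2.df)

delete(cars.df)
delete(cars2.df)
```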
diff --git a/R/zip_to_disk.frame.r b/R/zip_to_disk.frame.r
index 5418756e..5aacdd5b 100644
--- a/R/zip_to_disk.frame.r
+++ b/R/zip_to_disk.frame.r
@@ -5,7 +5,7 @@
 #' @param ... passed to fread
 #' @param validation.check should the function perform a check at the end to check for validity of output. It can detect issues with conversion
 #' @param overwrite overwrite output directory
-#' @import fst fs
+#' @import fst
 #' @importFrom glue glue
 #' @importFrom future.apply future_lapply
 #' @importFrom utils unzip
@@ -33,18 +33,18 @@
 zip_to_disk.frame = function(zipfile, outdir, ..., validation.check = FALSE, overwrite = TRUE) {
   files = unzip(zipfile, list=TRUE)
 
-  fs::dir_create(outdir)
+  if(!dir.exists(outdir)) {
+    dir.create(outdir)
+  }
 
   tmpdir = tempfile(pattern = "tmp_zip2csv")
-
   dotdotdots = list(...)
-
-  dfs = future.apply::future_lapply(files$Name, function(fn) {
-    #dfs = lapply(files$Name, function(fn) {
+  dfs = future.apply::future_lapply(files$Name, function(fn, ...) {
     outdfpath = file.path(outdir, fn)
     overwrite_check(outdfpath, TRUE)
     unzip(zipfile, files = fn, exdir = tmpdir)
-
+
     # lift the domain of csv_to_disk.frame so it accepts a list
     cl = purrr::lift(csv_to_disk.frame)
 
@@ -55,15 +55,14 @@
     #csv_to_disk.frame(, outdfpath, overwrite = overwrite, ...)
     cl(ok)
   }, future.seed=TRUE)
-
   dfs
 }
 
-#' `validate_zip_to_disk.frame` is used to validate and auto-correct read and convert every single file within the zip file to df format
+#' `validate_zip_to_disk.frame` is used to validate, and auto-correct by re-reading and re-converting, every single file within the zip file in disk.frame format
 #' @importFrom glue glue
 #' @importFrom utils unzip
 #' @importFrom data.table timetaken fread
-#' @import fst
+#' @importFrom fst read_fst
 #' @rdname zip_to_disk.frame
 #' @noRd
 validate_zip_to_disk.frame = function(zipfile, outdir) {
@@ -86,14 +85,14 @@
       # read it and if it errors then the file might be corrupted, so
       # read it again and write again
       pt = proc.time()
-      read_fst(out_fst_file, as.data.table = TRUE)
+      fst::read_fst(out_fst_file, as.data.table = TRUE)
       message(paste0("checking(read): ", timetaken(pt))); pt = proc.time()
     }, error = function(e) {
       message(e)
       pt = proc.time()
       unzip(zipfile, files = fn, exdir = tmpdir)
       message(paste0("unzip: ", timetaken(pt))); pt = proc.time()
-      write_fst(fread(file.path(tmpdir, fn)), out_fst_file,100)
+      fst::write_fst(data.table::fread(file.path(tmpdir, fn)), out_fst_file,100)
       message(paste0("read: ", timetaken(pt)))
       unlink(file.path(tmpdir, fn))
       gc()
@@ -106,7 +105,7 @@
       pt = proc.time()
       unzip(zipfile, files = fn, exdir = tmpdir)
       message(paste0("unzip: ", timetaken(pt))); pt = proc.time()
-      write_fst(fread(file.path(tmpdir, fn)), out_fst_file,100)
+      fst::write_fst(data.table::fread(file.path(tmpdir, fn)), out_fst_file,100)
       message(paste0("read: ", timetaken(pt)))
       unlink(file.path(tmpdir, fn))
       gc()
diff --git a/README.md b/README.md
index 8b8b1af9..2ca03010 100644
--- a/README.md
+++ b/README.md
@@ -190,7 +190,6 @@
 library(nycflights13)
 
 # this will setup disk.frame's parallel backend with number of workers equal to the number of CPU cores (hyper-threaded cores are counted as one not two)
 setup_disk.frame()
-#> The number of workers available for disk.frame is 6
 # this allows large datasets to be transferred between sessions
 options(future.globals.maxSize = Inf)
 
@@ -211,12 +210,15 @@
 flights.df %>%
   filter(year == 2013) %>%
   mutate(origin_dest = paste0(origin, dest)) %>%
   head(2)
-#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier
-#> 1 2013 1 1 517 515 2 830 819 11 UA
-#> 2 2013 1 1 533 529 4 850 830 20 UA
-#> flight tailnum origin dest air_time
distance hour minute time_hour origin_dest -#> 1 1545 N14228 EWR IAH 227 1400 5 15 2013-01-01 05:00:00 EWRIAH -#> 2 1714 N24211 LGA IAH 227 1416 5 29 2013-01-01 05:00:00 LGAIAH +#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time +#> 1: 2013 1 1 517 515 2 830 819 +#> 2: 2013 1 1 533 529 4 850 830 +#> arr_delay carrier flight tailnum origin dest air_time distance hour minute +#> 1: 11 UA 1545 N14228 EWR IAH 227 1400 5 15 +#> 2: 20 UA 1714 N24211 LGA IAH 227 1416 5 29 +#> time_hour origin_dest +#> 1: 2013-01-01 05:00:00 EWRIAH +#> 2: 2013-01-01 05:00:00 LGAIAH ``` ### Group-by @@ -313,7 +315,7 @@ To find out where the disk.frame is stored on disk: ``` r # where is the disk.frame stored attr(flights.df, "path") -#> [1] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpIlXNzn\\file568813b835a7.df" +#> [1] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpQH7obF\\file42d452c32907.df" ``` A number of data.frame functions are implemented for disk.frame @@ -321,19 +323,23 @@ A number of data.frame functions are implemented for disk.frame ``` r # get first few rows head(flights.df, 1) -#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier -#> 1: 2013 1 1 517 515 2 830 819 11 UA -#> flight tailnum origin dest air_time distance hour minute time_hour -#> 1: 1545 N14228 EWR IAH 227 1400 5 15 2013-01-01 05:00:00 +#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time +#> 1: 2013 1 1 517 515 2 830 819 +#> arr_delay carrier flight tailnum origin dest air_time distance hour minute +#> 1: 11 UA 1545 N14228 EWR IAH 227 1400 5 15 +#> time_hour +#> 1: 2013-01-01 05:00:00 ``` ``` r # get last few rows tail(flights.df, 1) -#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier -#> 1: 2013 9 30 NA 840 NA NA 1020 NA MQ -#> flight tailnum origin dest air_time distance hour minute time_hour -#> 1: 3531 N839MQ LGA RDU NA 431 8 40 2013-09-30 08:00:00 +#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time +#> 1: 2013 9 30 NA 840 NA NA 1020 +#> arr_delay carrier flight tailnum origin dest air_time distance hour minute +#> 1: NA MQ 3531 N839MQ LGA RDU NA 431 8 40 +#> time_hour +#> 1: 2013-09-30 08:00:00 ``` ``` r diff --git a/book/02-intro-disk-frame.Rmd b/book/02-intro-disk-frame.Rmd index 79428c3d..8d50bc8b 100644 --- a/book/02-intro-disk-frame.Rmd +++ b/book/02-intro-disk-frame.Rmd @@ -168,7 +168,7 @@ mutate(flights.df, speed = distance / air_time * 60) %>% collect %>% head(2) ### Examples of NOT fully supported `dplyr` verbs -The `chunk_arrange` function arranges (sorts) each chunk but not the whole dataset. So use with caution. Similarly `chunk_summarise` creates summary variables within each chunk and hence also needs to be used with caution. In the Group By section, we demonstrate how to use `summarise` in the `disk.frame` context correctly with `hard_group_by`s. +The `chunk_arrange` function arranges (sorts) each chunk but not the whole dataset. So use with caution. Similarly `chunk_summarise` creates summary variables within each chunk and hence also needs to be used with caution. 
 ```{r}
 # this only sorts within each chunk
 chunk_arrange(flights.df, dplyr::desc(dep_delay)) %>% collect %>% head(2)
@@ -227,7 +227,7 @@
 The `by` variables that were used to shard the dataset are called the `shardkey`
 
 ```{r}
 flights.df %>%
-  group_by(carrier) %>% # notice that hard_group_by needs to be set
+  group_by(carrier) %>%
   summarize(count = n(), mean_dep_delay = mean(dep_delay, na.rm=T)) %>%  # mean follows normal R rules
   collect %>%
   arrange(carrier)
@@ -256,11 +256,11 @@
 left_join
 inner_join
 semi_join
 inner_join
-full_join # requires hard_group_by on both left and right
+full_join # requires rechunk on both left and right
 ```
 
-In all cases, the left dataset (`x`) must be a `disk.frame`, and the right dataset (`y`) can be either a `disk.frame` or a `data.frame`. If the right dataset is a `disk.frame` and the `shardkey`s are different between the two `disk.frame`s then two expensive `hard` `group_by` operations are performed *eagerly*, one on the left `disk.frame` and one on the right `disk.frame` to perform the joins correctly.
+In all cases, the left dataset (`x`) must be a `disk.frame`, and the right dataset (`y`) can be either a `disk.frame` or a `data.frame`. If the right dataset is a `disk.frame` and the `shardkey`s are different between the two `disk.frame`s, then two expensive `rechunk` operations are performed *eagerly*, one on the left `disk.frame` and one on the right `disk.frame`, to perform the joins correctly.
 
-However, if the right dataset is a `data.frame` then `hard_group_by`s are only performed in the case of `full_join`.
+However, if the right dataset is a `data.frame`, then `rechunk`s are only performed in the case of `full_join`.
 
 Note `disk.frame` does not support `right_join` the user should use `left_join` instead.
 
@@ -287,33 +287,7 @@ flights.df %>%
 `{disk.frame}` supports all `data.frame` operations, unlike Spark which can only perform those operations that Spark has implemented. Hence windowing functions like `min_rank` and `rank` are supported out of the box.
 
-For the following example, we will use the `hard_group_by` which performs a group-by and also reorganises the chunks so that all records with the same `year`, `month`, and `day` end up in the same chunk. This is typically not advised, as `hard_group_by` can be slow for large datasets.
-
-```{r}
-# Find the most and least delayed flight each day
-bestworst <- flights.df %>%
-  srckeep(c("year","month","day", "dep_delay")) %>%
-  hard_group_by(c("year", "month", "day")) %>%
-  filter(dep_delay == min(dep_delay, na.rm = T) || dep_delay == max(dep_delay, na.rm = T)) %>%
-  collect
-
-bestworst %>% head
-```
-
-another example
-
-```{r}
-ranked <- flights.df %>%
-  srckeep(c("year","month","day", "dep_delay")) %>%
-  hard_group_by(c("year", "month", "day")) %>%
-  filter(min_rank(desc(dep_delay)) <= 2 & dep_delay > 0) %>%
-  collect
-
-ranked %>% head
-```
-
-one more example
-
+For example:
 ```{r}
 # Rank each flight within a daily
 ranked <- flights.df %>%
@@ -329,22 +303,22 @@ ranked %>% head
 ## Arbitrary by-chunk processing
 
-One can apply arbitrary transformations to each chunk of the `disk.frame` by using the `delayed` function which evaluates lazily or the `map.disk.frame(lazy = F)` function which evaluates eagerly. For example to return the number of rows in each chunk
+One can apply arbitrary transformations to each chunk of the `disk.frame` by using the `delayed` function which evaluates lazily or the `cmap.disk.frame(lazy = F)` function which evaluates eagerly.
For example, to return the number of rows in each chunk:
 
 ```{r}
 flights.df1 <- delayed(flights.df, ~nrow(.x))
 collect_list(flights.df1) %>% head # returns number of rows for each data.frame in a list
 ```
-and to do the same with `map.disk.frame`
+and to do the same with `cmap.disk.frame`
 
 ```{r}
-map(flights.df, ~nrow(.x), lazy = F) %>% head
+cmap(flights.df, ~nrow(.x), lazy = F) %>% head
 ```
-The `map` function can also output the results to another disk.frame folder, e.g.
+The `cmap` function can also output the results to another disk.frame folder, e.g.
 
 ```{r}
 # return the first 10 rows of each chunk
-flights.df2 <- map(flights.df, ~.x[1:10,], lazy = F, outdir = file.path(tempdir(), "tmp2"), overwrite = T)
+flights.df2 <- cmap(flights.df, ~.x[1:10,], lazy = F, outdir = file.path(tempdir(), "tmp2"), overwrite = T)
 flights.df2 %>% head
 ```
@@ -369,7 +343,7 @@
 write_disk.frame(flights.df, outdir="out")
 ```
 
 this will output a disk.frame to the folder "out"
 
 ```{r cleanup, include=FALSE}
-fs::dir_delete(file.path(tempdir(), "tmp_flights.df"))
-fs::dir_delete(file.path(tempdir(), "tmp2"))
-fs::file_delete(file.path(tempdir(), "tmp_flights.csv"))
+# fs::dir_delete(file.path(tempdir(), "tmp_flights.df"))
+# fs::dir_delete(file.path(tempdir(), "tmp2"))
+# fs::file_delete(file.path(tempdir(), "tmp_flights.csv"))
 ```
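The vignette's group-by section notes that only a whitelisted set of aggregation functions works inside a one-stage `summarize`. New aggregators can be supplied by pairing a per-chunk function with a collected-stage function, following the `fn_df.chunk_agg.disk.frame` / `fn_df.collected_agg.disk.frame` naming convention visible in `one-stage-verbs.R` above. A sketch — the `sum_sq` aggregator is hypothetical:

``` r
# per-chunk partial aggregate: sum of squares within one chunk
sum_sq_df.chunk_agg.disk.frame <- function(x, ...) {
  sum(x^2, ...)
}

# combine the per-chunk partials after collect()
sum_sq_df.collected_agg.disk.frame <- function(listx, ...) {
  sum(unlist(listx))
}

# usage sketch, assuming flights.df as created in the vignette:
# flights.df %>%
#   group_by(carrier) %>%
#   summarize(ss = sum_sq(dep_delay)) %>%
#   collect
```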
diff --git a/docs/LICENSE-text.html b/docs/LICENSE-text.html index abba5cc8..5757cc4b 100644 --- a/docs/LICENSE-text.html +++ b/docs/LICENSE-text.html @@ -17,7 +17,7 @@ diff --git a/docs/articles/01-intro.html b/docs/articles/01-intro.html index 584809c3..af353427 100644 --- a/docs/articles/01-intro.html +++ b/docs/articles/01-intro.html @@ -33,7 +33,7 @@ diff --git a/docs/articles/02-intro-disk-frame.html b/docs/articles/02-intro-disk-frame.html index fec5669f..5526ed10 100644 --- a/docs/articles/02-intro-disk-frame.html +++ b/docs/articles/02-intro-disk-frame.html @@ -33,7 +33,7 @@ @@ -211,12 +211,13 @@disk.frame
from
csv_path,
outdir = df_path,
in_chunk_size = 100000)
-#> Warning: UNRELIABLE VALUE: Future ('<none>') unexpectedly generated random
-#> numbers without specifying argument 'seed'. There is a risk that those random
-#> numbers are not statistically sound and the overall results might be invalid.
-#> To fix this, specify 'seed=TRUE'. This ensures that proper, parallel-safe random
-#> numbers are produced via the L'Ecuyer-CMRG method. To disable this check, use
-#> 'seed=NULL', or set option 'future.rng.onMisuse' to "ignore".
+#> Warning: UNRELIABLE VALUE: One of the 'future.apply' iterations
+#> ('future_lapply-1') unexpectedly generated random numbers without declaring so.
+#> There is a risk that those random numbers are not statistically sound and the
+#> overall results might be invalid. To fix this, specify 'future.seed=TRUE'. This
+#> ensures that proper, parallel-safe random numbers are produced via the L'Ecuyer-
+#> CMRG method. To disable this check, use 'future.seed = NULL', or set option
+#> 'future.rng.onMisuse' to "ignore".
flights.df
disk.frame
also has a function zip_to_disk.frame
that can convert every CSV in a zip file to disk.frame
s.
dplyr
verbs
dplyr
verbs
The chunk_arrange
function arranges (sorts) each chunk but not the whole dataset. So use with caution. Similarly chunk_summarise
creates summary variables within each chunk and hence also needs to be used with caution. In the Group By section, we demonstrate how to use summarise
in the disk.frame
context correctly with hard_group_by
s.
The chunk_arrange
function arranges (sorts) each chunk but not the whole dataset. So use with caution. Similarly chunk_summarise
creates summary variables within each chunk and hence also needs to be used with caution.
# this only sorts within each chunk
chunk_arrange(flights.df, dplyr::desc(dep_delay)) %>% collect %>% head(2)
@@ -342,7 +343,7 @@ Group-bydisk.frame implements the group_by
operation some caveats. In the disk.frame framework, only a set functions are supported in summarize
. However, the user can create more custom group-by
functions can be defined.
flights.df %>%
- group_by(carrier) %>% # notice that hard_group_by needs to be set
+ group_by(carrier) %>%
summarize(count = n(), mean_dep_delay = mean(dep_delay, na.rm=T)) %>% # mean follows normal R rules
collect %>%
arrange(carrier)
@@ -406,9 +407,9 @@ Joins
inner_join
semi_join
inner_join
-full_join # requires hard_group_by on both left and right
-
In all cases, the left dataset (x
) must be a disk.frame
, and the right dataset (y
) can be either a disk.frame
or a data.frame
. If the right dataset is a disk.frame
and the shardkey
s are different between the two disk.frame
s then two expensive hard
group_by
operations are performed eagerly, one on the left disk.frame
and one on the right disk.frame
to perform the joins correctly.
-However, if the right dataset is a data.frame
then hard_group_by
s are only performed in the case of full_join
.
+full_join # requires rechunk on both left and right
In all cases, the left dataset (x
) must be a disk.frame
, and the right dataset (y
) can be either a disk.frame
or a data.frame
. If the right dataset is a disk.frame
and the shardkey
s are different between the two disk.frame
s then two expensive hard
rechunk
operations are performed eagerly, one on the left disk.frame
and one on the right disk.frame
to perform the joins correctly.
However, if the right dataset is a data.frame
then rechunk``s are only performed in the case of
full_join`.
Note disk.frame
does not support right_join
the user should use left_join
instead.
The below joins are performed lazily because airlines.dt
is a data.table
not a disk.frame
:
@@ -471,41 +472,8 @@Joins
Window functions and arbitrary functions
disk.frame supports all
-data.frame
operations, unlike Spark which can only perform those operations that Spark has implemented. Hence windowing functions likemin_rank
andrank
are supported out of the box.For the following example, we will use the
+hard_group_by
which performs a group-by and also reorganises the chunks so that all records with the sameyear
,month
, andday
end up in the same chunk. This is typically not advised, ashard_group_by
can be slow for large datasets.For example
--# Find the most and least delayed flight each day -bestworst <- flights.df %>% - srckeep(c("year","month","day", "dep_delay")) %>% - hard_group_by(c("year", "month", "day")) %>% - filter(dep_delay == min(dep_delay, na.rm = T) || dep_delay == max(dep_delay, na.rm = T)) %>% - collect - -bestworst %>% head -#> year month day dep_delay -#> 1: 2013 2 21 301 -#> 2: 2013 2 21 -9 -#> 3: 2013 2 21 -1 -#> 4: 2013 2 21 2 -#> 5: 2013 2 21 -4 -#> 6: 2013 2 21 10
another example
---ranked <- flights.df %>% - srckeep(c("year","month","day", "dep_delay")) %>% - hard_group_by(c("year", "month", "day")) %>% - filter(min_rank(desc(dep_delay)) <= 2 & dep_delay > 0) %>% - collect - -ranked %>% head -#> year month day dep_delay -#> 1: 2013 1 9 1301 -#> 2: 2013 1 9 253 -#> 3: 2013 1 10 1126 -#> 4: 2013 1 10 385 -#> 5: 2013 1 17 259 -#> 6: 2013 1 17 255
one more example
-diff --git a/docs/articles/06-vs-dask-juliadb.html b/docs/articles/06-vs-dask-juliadb.html index 053e7c5e..fe2bcd28 100644 --- a/docs/articles/06-vs-dask-juliadb.html +++ b/docs/articles/06-vs-dask-juliadb.html @@ -33,7 +33,7 @@# Rank each flight within a daily ranked <- flights.df %>% srckeep(c("year","month","day", "dep_delay")) %>% @@ -526,8 +494,8 @@
Window functions and arbitrary
Arbitrary by-chunk processing
-One can apply arbitrary transformations to each chunk of the
-disk.frame
by using thedelayed
function which evaluates lazily or themap.disk.frame(lazy = F)
function which evaluates eagerly. For example to return the number of rows in each chunk@@ -149,9 +149,6 @@+One can apply arbitrary transformations to each chunk of the
+disk.frame
by using thedelayed
function which evaluates lazily or thecmap.disk.frame(lazy = F)
function which evaluates eagerly. For example to return the number of rows in each chunk-flights.df1 <- delayed(flights.df, ~nrow(.x)) collect_list(flights.df1) %>% head # returns number of rows for each data.frame in a list #> [[1]] @@ -547,35 +515,14 @@
Arbitrary by-chunk processing#> #> [[6]] #> [1] 56121
-and to do the same with map.disk.frame
-map(flights.df, ~nrow(.x), lazy = F) %>% head
-#> Warning in map.disk.frame(flights.df, ~nrow(.x), lazy = F): map(df, ...) where
-#> df is a disk.frame has been deprecated. Please use cmap(df,...) instead
-#> [[1]]
-#> [1] 56131
-#>
-#> [[2]]
-#> [1] 56131
-#>
-#> [[3]]
-#> [1] 56131
-#>
-#> [[4]]
-#> [1] 56131
-#>
-#> [[5]]
-#> [1] 56131
-#>
-#> [[6]]
-#> [1] 56121
-The map function can also output the results to another disk.frame folder, e.g.
diff --git a/docs/articles/05-data-table-syntax.html b/docs/articles/05-data-table-syntax.html
index c59cc037..7027aebf 100644
--- a/docs/articles/05-data-table-syntax.html
+++ b/docs/articles/05-data-table-syntax.html
@@ -33,7 +33,7 @@
+and to do the same with cmap.disk.frame
+The cmap function can also output the results to another disk.frame folder, e.g.
 # return the first 10 rows of each chunk
-flights.df2 <- map(flights.df, ~.x[1:10,], lazy = F, outdir = file.path(tempdir(), "tmp2"), overwrite = T)
-#> Warning in map.disk.frame(flights.df, ~.x[1:10, ], lazy = F, outdir =
-#> file.path(tempdir(), : map(df, ...) where df is a disk.frame has been
-#> deprecated. Please use cmap(df,...) instead
+flights.df2 <- cmap(flights.df, ~.x[1:10,], lazy = F, outdir = file.path(tempdir(), "tmp2"), overwrite = T)
 flights.df2 %>% head
 #>    year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
@@ -605,35 +552,35 @@ Arbitrary by-chunk processing
 Sampling
 In the disk.frame framework, sampling a proportion of rows within each chunk can be performed using sample_frac.
 flights.df %>% sample_frac(0.01) %>% collect %>% head
[re-rendered random-sample output elided: sample_frac draws a fresh sample on every build, so the six example rows and their time_hour values simply differ between the 0.5.0 and 0.6.0 renders]
@@ -147,7 +147,6 @@ Writing Data
 One can output a disk.frame by using the write_disk.frame function. E.g.
diff --git a/docs/articles/03-concepts.html b/docs/articles/03-concepts.html
index 418a1757..5797c66c 100644
--- a/docs/articles/03-concepts.html
+++ b/docs/articles/03-concepts.html
@@ -33,7 +33,7 @@
 write_disk.frame(flights.df, outdir="out")
 this will output a disk.frame to the folder "out"
 Workers and parallelism
 #> The following objects are masked from 'package:base':
 #>
 #>     intersect, setdiff, setequal, union
-#> Loading required package: purrr
 #>
 #> ## Message from disk.frame:
 #> We have 1 workers to use with disk.frame.
@@ -168,9 +167,6 @@ Workers and parallelism
 #>
 #> Attaching package: 'disk.frame'
-#> The following objects are masked from 'package:purrr':
-#>
-#>     imap, imap_dfr, map, map2
 #> The following objects are masked from 'package:base':
 #>
 #>     colnames, ncol, nrow
diff --git a/docs/articles/04-ingesting-data.html b/docs/articles/04-ingesting-data.html
index eb86d829..d9e14783 100644
--- a/docs/articles/04-ingesting-data.html
+++ b/docs/articles/04-ingesting-data.html
@@ -33,7 +33,7 @@
 library(data.table)
 #>
 #> Attaching package: 'data.table'
-#> The following object is masked from 'package:purrr':
-#>
-#>     transpose
 #> The following objects are masked from 'package:dplyr':
 #>
 #>     between, first, last
@@ -167,6 +164,7 @@
 #> [17] "hour"     "minute"   "time_hour"
 flights.df[,.N, .(year, month), keep = c("year", "month")]
+#> data.table syntax for disk.frame may be moved to a separate package in the future
 #>    year month     N
 #> 1: 2013     1 27004
 #> 2: 2013    10 28889
@@ -199,6 +197,7 @@ External variables are captured
 flights.df[,some_fn(y)]
+#> data.table syntax for disk.frame may be moved to a separate package in the future
 #> [1] 42 42 42 42 42 42
In the above example, neither some_fn nor y is defined in the background workers' environments, yet disk.frame still manages to evaluate flights.df[,some_fn(y)]: the globals referenced in the expression are detected and shipped to the workers automatically.
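A self-contained sketch of that capture mechanism (the definitions of some_fn and y are not shown in the article, so the ones below are assumptions chosen to reproduce the printed output):

library(disk.frame)
setup_disk.frame(workers = 2)

flights.df <- as.disk.frame(nycflights13::flights, nchunks = 6)

some_fn <- function(x) 42  # exists only in the main session
y <- 1                     # likewise

# disk.frame detects `some_fn` and `y` in the j-expression and ships them
# to the workers, so each of the 6 chunks evaluates to 42
flights.df[, some_fn(y)]
#> [1] 42 42 42 42 42 42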
We note that there is some time needed for disk.frame to start up all the workers. Next we try to convert the largest CSV file to disk.frame format. The file to be converted is about 2.2GB in size.
time_to_convert_disk.frame = system.time(df1 <- csv_to_disk.frame("c:/data/Performance_2004Q3.txt", header = FALSE))[3]
time_to_convert_disk.frame
#> elapsed
-#> 28.77
Now that we have converted it, we want to do a count by the first column. To achieve this we use a "two-stage" aggregation strategy. Note that we use keep = "V1"
to bring only the column V1
into RAM. This avoids reading the other, unnecessary columns and should speed up the analysis significantly.
time_to_agg_disk.frame = system.time(summ <- df1[,.N, V1, keep = "V1"][, .(N = sum(N)), V1])
+#> data.table syntax for disk.frame may be moved to a separate package in the future
time_to_agg_disk.frame
#> user system elapsed
-#> 0.13 0.03 7.89
We can inspect the result as well.
summ
@@ -227,7 +228,7 @@ disk.frame
 summarise(N = n()) %>%
collect)
#> user system elapsed
-#> 1.89 0.14 5.30
However, the dplyr syntax tends to be slightly slower than the data.table syntax. This may improve in future releases, as much of the overhead is due to inefficient use of NSE.
suppressMessages(library(disk.frame))
flights.df %>%
  hard_group_by(carrier) %>% # notice that hard_group_by needs to be set
chunk_summarize(count = n(), mean_dep_delay = mean(dep_delay, na.rm=T)) %>% # mean follows normal R rules
collect %>%
arrange(carrier)
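Since hard_group_by is removed in 0.6, the same aggregation can be written with plain dplyr verbs; a sketch (group_by + summarise run as a chunk-then-combine aggregation, so no prior re-sharding is needed for n and mean):

flights.df %>%
  group_by(carrier) %>%
  summarise(count = n(), mean_dep_delay = mean(dep_delay, na.rm = TRUE)) %>%
  collect %>%
  arrange(carrier)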
@@ -359,7 +359,7 @@
 ZJ D (2022). disk.frame: Larger-than-RAM Disk-Based Data Manipulation Framework.
-R package version 0.5.0, https://diskframe.com.
+R package version 0.6.0, https://diskframe.com.
 @Manual{,
   title = {disk.frame: Larger-than-RAM Disk-Based Data Manipulation Framework},
   author = {Dai ZJ},
   year = {2022},
-  note = {R package version 0.5.0},
+  note = {R package version 0.6.0},
   url = {https://diskframe.com},
 }
diff --git a/docs/index.html b/docs/index.html
index 89c4a22e..7ea2fc4e 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -37,7 +37,7 @@
@@ -228,7 +228,6 @@
# where is the disk.frame stored
attr(flights.df, "path")
-#> [1] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpIlXNzn\\file568813b835a7.df"
A number of data.frame functions are implemented for disk.frame
# get first few rows
head(flights.df, 1)
-#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier
-#> 1: 2013 1 1 517 515 2 830 819 11 UA
-#> flight tailnum origin dest air_time distance hour minute time_hour
-#> 1: 1545 N14228 EWR IAH 227 1400 5 15 2013-01-01 05:00:00
# get last few rows
tail(flights.df, 1)
-#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier
-#> 1: 2013 9 30 NA 840 NA NA 1020 NA MQ
-#> flight tailnum origin dest air_time distance hour minute time_hour
-#> 1: 3531 N839MQ LGA RDU NA 431 8 40 2013-09-30 08:00:00
# number of rows
nrow(flights.df)
diff --git a/docs/news/index.html b/docs/news/index.html
index 0fb5baf2..48d1c26d 100644
--- a/docs/news/index.html
+++ b/docs/news/index.html
@@ -17,7 +17,7 @@
NEWS.md
+removed hard_arrange and hard_group_by
 removed add_count method
 library(data.table)
#>
#> Attaching package: 'data.table'
-#> The following object is masked from 'package:purrr':
-#>
-#> transpose
#> The following objects are masked from 'package:dplyr':
#>
#> between, first, last
diff --git a/docs/reference/as.disk.frame.html b/docs/reference/as.disk.frame.html
index 10189939..fa22a8ea 100644
--- a/docs/reference/as.disk.frame.html
+++ b/docs/reference/as.disk.frame.html
@@ -17,7 +17,7 @@
Bind rows
+bind_rows.disk.frame(...)
disk.frame to be row bound
R/compute.r
compute.disk.frame.Rd
a disk.frame
Not used. Kept for compatibility with dplyr
If not NULL then used as outdir prefix.
the output directory
whether to overwrite or not
Not used. Kept for dplyr compatibility
Passed to `write_disk.frame`
-create_chunk_mapper(chunk_fn, warning_msg = NULL, as.data.frame = TRUE)
+create_chunk_mapper(chunk_fn, warning_msg = NULL, as.data.frame = FALSE)
Whether the files have header. Defaults to TRUE
A logical, for whether or not to print a progress bar for -multiprocess, multisession, and multicore plans. From furrr
A logical, for whether or not to show progress
The CSV reader backend to choose: "data.table" or "readr". disk.frame does not have its own CSV reader. It uses either diff --git a/docs/reference/delete.html b/docs/reference/delete.html index 2a238505..9cf26191 100644 --- a/docs/reference/delete.html +++ b/docs/reference/delete.html @@ -17,7 +17,7 @@
diff --git a/docs/reference/df_ram_size.html b/docs/reference/df_ram_size.html index 57bfaa8d..c986617f 100644 --- a/docs/reference/df_ram_size.html +++ b/docs/reference/df_ram_size.html @@ -17,7 +17,7 @@ diff --git a/docs/reference/dfglm.html b/docs/reference/dfglm.html index 1197ac98..c9cc7c73 100644 --- a/docs/reference/dfglm.html +++ b/docs/reference/dfglm.html @@ -18,7 +18,7 @@ diff --git a/docs/reference/disk.frame.html b/docs/reference/disk.frame.html index 0d2320c2..fd1f9556 100644 --- a/docs/reference/disk.frame.html +++ b/docs/reference/disk.frame.html @@ -17,7 +17,7 @@ @@ -111,12 +111,10 @@path = file.path(tempdir(),"cars")
as.disk.frame(cars, outdir=path, overwrite = TRUE, nchunks = 2)
-#> path: "C:\Users\RTX2080\AppData\Local\Temp\Rtmp2rQjw5/cars"
+#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpyknGIm/cars"
#> nchunks: 2
#> nrow (at source): 50
#> ncol (at source): 2
-#> nrow (post operations): ???
-#> ncol (post operations): ???
df = disk.frame(path)
head(df)
#> speed dist
diff --git a/docs/reference/dplyr_verbs.html b/docs/reference/dplyr_verbs.html
index 4cbb0612..143e9c7e 100644
--- a/docs/reference/dplyr_verbs.html
+++ b/docs/reference/dplyr_verbs.html
@@ -18,7 +18,7 @@
a disk.frame
same as the dplyr::group_by
from dplyr
from dplyr
as.disk.frame()
Make a data.frame into a disk.frame
Bind rows
chunk_summarize()
chunk_summarise()
chunk_group_by()
chunk_ungroup()
Group by within each disk.frame
 #' @export
+#' @importFrom dplyr add_count
+#' @rdname dplyr_verbs
+add_count.disk.frame <- create_chunk_mapper(dplyr::add_count)
+#' @export
+#' @importFrom dplyr add_tally
+#' @rdname dplyr_verbs
+add_tally.disk.frame <- create_chunk_mapper(dplyr::add_tally)
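For readers unfamiliar with the pattern above: create_chunk_mapper turns an ordinary data.frame function into a lazy, chunk-wise disk.frame verb. A minimal sketch (the verb name filter2 is made up for illustration):

library(disk.frame)
library(dplyr)

filter2 <- create_chunk_mapper(dplyr::filter)  # record the call, replay it per chunk

cars.df <- as.disk.frame(cars)
cars.df %>% filter2(speed > 20) %>% collect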
-cmap() cmap_dfr() cimap() cimap_dfr() lazy() delayed() chunk_lapply() map() imap_dfr() imap() map_dfr(<disk.frame>) map_dfr(<default>)
+cmap() cmap_dfr() cimap() cimap_dfr() lazy() delayed() clapply()
 Apply the same function to all chunks
`cmap2` a function to two disk.frames
Compute without writing
Force computations. The results are stored in a folder.
Create a disk.frame from a folder
-select(<disk.frame>) rename(<disk.frame>) filter(<disk.frame>) mutate(<disk.frame>) transmute(<disk.frame>) arrange(<disk.frame>) chunk_arrange() add_tally.disk.frame() do(<disk.frame>) distinct(<disk.frame>) chunk_distinct() glimpse(<disk.frame>)
+select(<disk.frame>) rename(<disk.frame>) filter(<disk.frame>) mutate(<disk.frame>) transmute(<disk.frame>) arrange(<disk.frame>) chunk_arrange() distinct(<disk.frame>) chunk_distinct() glimpse(<disk.frame>)
 The dplyr verbs implemented for disk.frame
summarise(<grouped_disk.frame>)
summarize(<grouped_disk.frame>)
group_by(<disk.frame>)
summarize(<disk.frame>)
summarise(<disk.frame>)
A function to parse the summarize function
-Perform a hard arrange
-Perform a hard group
overwrite_check()
Check if the outdir exists or not
Play the recorded lazy operations
pull(<disk.frame>)
Pull a column from table similar to `dplyr::pull`.
Used to convert a function to purrr syntax if needed
Show the code to setup disk.frame
Keep only the variables from the input listed in selections
overwrite output directory
Show progress or not. Defaults to FALSE
see dplyr::XXX_join
see dplyr::XXX_join
Play the recorded lazy operations
+play(dataframe, recordings)
A data.frame
A recording of the expression, globals and packages, as captured by create_chunk_mapper
Used to convert a function to purrr syntax if needed
+purrr_as_mapper(.f)
a normal function or purrr syntax function i.e. `~ ...code...`
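The two accepted forms are interchangeable; a sketch (cars.df assumed as in the other examples):

cars.df <- as.disk.frame(cars)

# purrr-style formula: .x is the chunk
cmap(cars.df, ~ nrow(.x), lazy = FALSE)

# equivalent plain function
cmap(cars.df, function(chunk) nrow(chunk), lazy = FALSE)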
overwrite the output directory
A logical, for whether or not to print a progress bar for multiprocess, multisession, and multicore plans. From furrr
A logical, for whether or not to show progress.
the shardkeys
overwrite the output directory
splitting of chunks: "hash" for hash function or "sort" for semi-sorted chunks
for the "sort" shardby function, a dataframe with the split values.
for the "sort" shardby function, the variables to sort descending.
The number of chunks
If TRUE then the chunks are overwritten
splitting of chunks: "hash" for hash function or "sort" for semi-sorted chunks
If shardby_function is "sort", the split values for sharding
for the "sort" shardby function, the variables to sort descending.
# shard the iris data.frame by Species so that rows with the same Species are in the same chunk
iris.df = shard(iris, "Species")
-#> Hashing...
# clean up iris.df
delete(iris.df)
diff --git a/docs/reference/shardkey.html b/docs/reference/shardkey.html
index eeca73d7..2fce89e2 100644
--- a/docs/reference/shardkey.html
+++ b/docs/reference/shardkey.html
@@ -17,7 +17,7 @@
srckeep(diskf, selections, ...)
-
-srckeepchunks(diskf, chunks, ...)
srckeep(diskf, selections, ...)
The list of variables to keep from the input source
not yet used
The chunks to load
cars.df = as.disk.frame(cars)
speed_limit = 50
cars.df[speed < speed_limit ,.N, cut(dist, pretty(dist))]
-#> Error in .checkTypos(e, names_x): Object 'speed_limit' not found amongst speed, dist
+#> data.table syntax for disk.frame may be moved to a separate package in the future
+#> cut N
+#> 1: (0,5] 2
+#> 2: (5,10] 2
+#> 3: (20,25] 1
+#> 4: (15,20] 2
+#> 5: (25,30] 1
+#> 6: (30,35] 1
+#> 7: (15,20] 2
+#> 8: (25,30] 3
+#> 9: (10,15] 1
+#> 10: (20,25] 1
+#> 11: (30,35] 2
+#> 12: (40,50] 1
+#> 13: (20,30] 2
+#> 14: (30,40] 2
+#> 15: (50,60] 2
+#> 16: (70,80] 1
+#> 17: <NA> 1
+#> 18: (30,40] 4
+#> 19: (40,50] 2
+#> 20: (50,60] 1
+#> 21: (70,80] 1
+#> 22: (80,90] 1
+#> 23: (40,50] 2
+#> 24: (60,70] 3
+#> 25: (30,40] 1
+#> 26: (50,60] 3
+#> 27: <NA> 1
+#> 28: (90,100] 2
+#> 29: (110,120] 1
+#> 30: (80,90] 1
+#> cut N
# clean up
delete(cars.df)
diff --git a/docs/reference/tbl_vars.disk.frame.html b/docs/reference/tbl_vars.disk.frame.html
index d61fa7a6..1df5aeaf 100644
--- a/docs/reference/tbl_vars.disk.frame.html
+++ b/docs/reference/tbl_vars.disk.frame.html
@@ -18,7 +18,7 @@
write_disk.frame(
- df,
+ diskf,
outdir = tempfile(fileext = ".df"),
- nchunks = ifelse("disk.frame" %in% class(df), nchunks.disk.frame(df),
- recommend_nchunks(df)),
+ nchunks = ifelse("disk.frame" %in% class(diskf), nchunks.disk.frame(diskf),
+ recommend_nchunks(diskf)),
overwrite = FALSE,
shardby = NULL,
compress = 50,
@@ -117,7 +117,7 @@ Write disk.frame to disk
Arguments
- - df
+ - diskf
a disk.frame
- outdir
output directory for the disk.frame
diff --git a/docs/reference/zip_to_disk.frame.html b/docs/reference/zip_to_disk.frame.html
index 9188f8ac..fe4335d3 100644
--- a/docs/reference/zip_to_disk.frame.html
+++ b/docs/reference/zip_to_disk.frame.html
@@ -20,7 +20,7 @@
@@ -144,7 +144,7 @@ Examples
# read every file and convert it to a disk.frame
zip.df = zip_to_disk.frame(zipfile, tempfile(fileext = ".df"))
-#> Error in unzip(zipfile, list = TRUE): zip file 'C:\Users\RTX2080\AppData\Local\Temp\Rtmp2rQjw5\file56f44b886b42.zip' cannot be opened
+#> Error in unzip(zipfile, list = TRUE): zip file 'C:\Users\RTX2080\AppData\Local\Temp\RtmpyknGIm\file471855d8254c.zip' cannot be opened
# there is only one csv file so it return a list of one disk.frame
zip.df[[1]]
diff --git a/docs/sitemap.xml b/docs/sitemap.xml
index 4aa93c2a..5df92161 100644
--- a/docs/sitemap.xml
+++ b/docs/sitemap.xml
@@ -120,6 +120,9 @@
/reference/as.disk.frame.html
+
+ /reference/bind_rows.disk.frame.html
+
/reference/bloomfilter.html
@@ -237,12 +240,18 @@
/reference/overwrite_check.html
+
+ /reference/play.html
+
/reference/print.disk.frame.html
/reference/pull.disk.frame.html
+
+ /reference/purrr_as_mapper.html
+
/reference/rbindlist.disk.frame.html
diff --git a/man/bind_rows.disk.frame.Rd b/man/bind_rows.disk.frame.Rd
new file mode 100644
index 00000000..d69a64b0
--- /dev/null
+++ b/man/bind_rows.disk.frame.Rd
@@ -0,0 +1,14 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/bind_rows.r
+\name{bind_rows.disk.frame}
+\alias{bind_rows.disk.frame}
+\title{Bind rows}
+\usage{
+bind_rows.disk.frame(...)
+}
+\arguments{
+\item{...}{disk.frame to be row bound}
+}
+\description{
+Bind rows
+}
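A usage sketch for the new helper (the .Rd above ships no example; this assumes bind_rows.disk.frame delegates to rbindlist.disk.frame, so the result is itself a disk.frame):

library(disk.frame)

df1 <- as.disk.frame(cars)
df2 <- as.disk.frame(cars)

big.df <- bind_rows.disk.frame(df1, df2)  # row-bind into a new disk.frame
nrow(big.df)
#> [1] 100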
diff --git a/man/chunk_group_by.Rd b/man/chunk_group_by.Rd
index f06836c2..fac8791c 100644
--- a/man/chunk_group_by.Rd
+++ b/man/chunk_group_by.Rd
@@ -5,7 +5,14 @@
\alias{chunk_summarise}
\alias{chunk_group_by}
\alias{chunk_ungroup}
-\title{Group by within each disk.frame}
+\title{#' @export
+#' @importFrom dplyr add_count
+#' @rdname dplyr_verbs
+add_count.disk.frame <- create_chunk_mapper(dplyr::add_count)
+#' @export
+#' @importFrom dplyr add_tally
+#' @rdname dplyr_verbs
+add_tally.disk.frame <- create_chunk_mapper(dplyr::add_tally)}
\usage{
chunk_summarize(.data, ...)
diff --git a/man/cmap.Rd b/man/cmap.Rd
index fc06d9cd..1f08ba63 100644
--- a/man/cmap.Rd
+++ b/man/cmap.Rd
@@ -1,5 +1,5 @@
% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/cmap.r, R/map-deprecated.r
+% Please edit documentation in R/cmap.r
\name{cmap}
\alias{cmap}
\alias{cmap.disk.frame}
@@ -12,34 +12,12 @@
\alias{lazy}
\alias{lazy.disk.frame}
\alias{delayed}
-\alias{chunk_lapply}
-\alias{map}
-\alias{map.disk.frame}
-\alias{map.default}
-\alias{imap_dfr}
-\alias{imap_dfr.disk.frame}
-\alias{imap_dfr.default}
-\alias{imap}
-\alias{imap.default}
-\alias{map_dfr.disk.frame}
-\alias{map_dfr.default}
+\alias{clapply}
\title{Apply the same function to all chunks}
\usage{
cmap(.x, .f, ...)
-\method{cmap}{disk.frame}(
- .x,
- .f,
- ...,
- outdir = NULL,
- keep = NULL,
- chunks = nchunks(.x),
- compress = 50,
- lazy = TRUE,
- overwrite = FALSE,
- vars_and_pkgs = future::getGlobalsAndPackages(.f, envir = parent.frame()),
- .progress = TRUE
-)
+\method{cmap}{disk.frame}(.x, .f, ...)
cmap_dfr(.x, .f, ..., .id = NULL)
@@ -52,10 +30,9 @@ cimap(.x, .f, ...)
.f,
outdir = NULL,
keep = NULL,
- chunks = nchunks(.x),
- compress = 50,
lazy = TRUE,
overwrite = FALSE,
+ compress = 50,
...
)
@@ -77,58 +54,32 @@ lazy(.x, .f, ...)
delayed(.x, .f, ...)
-chunk_lapply(...)
-
-map(.x, .f, ...)
-
-\method{map}{disk.frame}(...)
-
-\method{map}{default}(.x, .f, ...)
-
-imap_dfr(.x, .f, ..., .id = NULL)
-
-\method{imap_dfr}{disk.frame}(...)
-
-\method{imap_dfr}{default}(.x, .f, ..., .id = NULL)
-
-imap(.x, .f, ...)
-
-\method{imap}{default}(.x, .f, ...)
-
-\method{map_dfr}{disk.frame}(...)
-
-\method{map_dfr}{default}(.x, .f, ..., .id = NULL)
+clapply(...)
}
\arguments{
\item{.x}{a disk.frame}
\item{.f}{a function to apply to each of the chunks}
-\item{...}{for compatibility with `purrr::map`}
+\item{...}{Passed to `collect` and `write_disk.frame`}
-\item{outdir}{the output directory}
-
-\item{keep}{the columns to keep from the input}
-
-\item{chunks}{The number of chunks to output}
+\item{.id}{ignored}
-\item{compress}{0-100 fst compression ratio}
-
-\item{lazy}{if TRUE then do this lazily}
+\item{use.names}{for cmap_dfr's call to data.table::rbindlist. See data.table::rbindlist}
-\item{overwrite}{if TRUE removes any existing chunks in the data}
+\item{fill}{for cmap_dfr's call to data.table::rbindlist. See data.table::rbindlist}
-\item{vars_and_pkgs}{variables and packages to send to a background session. This is typically automatically detected}
+\item{idcol}{for cmap_dfr's call to data.table::rbindlist. See data.table::rbindlist}
-\item{.progress}{A logical, for whether or not to print a progress bar for multiprocess, multisession, and multicore plans. From {furrr}}
+\item{outdir}{the output directory}
-\item{.id}{not used}
+\item{keep}{The columns to keep at source}
-\item{use.names}{for cmap_dfr's call to data.table::rbindlist. See data.table::rbindlist}
+\item{lazy}{if TRUE then do this lazily}
-\item{fill}{for cmap_dfr's call to data.table::rbindlist. See data.table::rbindlist}
+\item{overwrite}{Whether to overwrite any files in the output directory}
-\item{idcol}{for cmap_dfr's call to data.table::rbindlist. See data.table::rbindlist}
+\item{compress}{The compression setting. 0-100}
}
\description{
Apply the same function to all chunks
@@ -166,17 +117,6 @@ cmap_dfr(cars.df, ~.x[1,])
collect(lazy(cars.df, ~.x[1,]))
collect(delayed(cars.df, ~.x[1,]))
-# clean up cars.df
-delete(cars.df)
-cars.df = as.disk.frame(cars)
-
-# .x is the chunk and .y is the ID as an integer
-
-# lazy = TRUE support is not available at the moment
-cimap(cars.df, ~.x[, id := .y], lazy = FALSE)
-
-cimap_dfr(cars.df, ~.x[, id := .y])
-
# clean up cars.df
delete(cars.df)
}
diff --git a/man/cmap2.Rd b/man/cmap2.Rd
index 721dd80e..2a03f241 100644
--- a/man/cmap2.Rd
+++ b/man/cmap2.Rd
@@ -2,14 +2,11 @@
% Please edit documentation in R/map2.r, R/map_by_chunk_id.r
\name{cmap2}
\alias{cmap2}
-\alias{map2}
\alias{map_by_chunk_id}
\title{`cmap2` a function to two disk.frames}
\usage{
cmap2(.x, .y, .f, ...)
-map2(.x, .y, .f, ...)
-
map_by_chunk_id(.x, .y, .f, ..., outdir)
}
\arguments{
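A sketch of cmap2, which pairs the chunks of two disk.frames positionally (assuming both inputs have the same number of chunks and that, like the removed map2, the result is written eagerly to a temporary disk.frame):

a.df <- as.disk.frame(cars, nchunks = 2)
b.df <- as.disk.frame(cars, nchunks = 2)

# chunk i of a.df is paired with chunk i of b.df
summed <- cmap2(a.df, b.df, ~ data.frame(total_dist = .x$dist + .y$dist))
collect(summed)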
diff --git a/man/collect.Rd b/man/collect.Rd
index 520157ea..0acee69c 100644
--- a/man/collect.Rd
+++ b/man/collect.Rd
@@ -6,11 +6,16 @@
\alias{collect.summarized_disk.frame}
\title{Bring the disk.frame into R}
\usage{
-\method{collect}{disk.frame}(x, ..., parallel = !is.null(attr(x, "lazyfn")))
+\method{collect}{disk.frame}(x, ..., parallel = !is.null(attr(x, "recordings")))
-collect_list(x, simplify = FALSE, parallel = !is.null(attr(x, "lazyfn")))
+collect_list(
+ x,
+ simplify = FALSE,
+ parallel = !is.null(attr(x, "recordings")),
+ ...
+)
-\method{collect}{summarized_disk.frame}(x, ..., parallel = !is.null(attr(x, "lazyfn")))
+\method{collect}{summarized_disk.frame}(x, ..., parallel = !is.null(attr(x, "recordings")))
}
\arguments{
\item{x}{a disk.frame}
diff --git a/man/compute.disk.frame.Rd b/man/compute.disk.frame.Rd
index 57f1c6e2..e4de9e2d 100644
--- a/man/compute.disk.frame.Rd
+++ b/man/compute.disk.frame.Rd
@@ -2,26 +2,18 @@
% Please edit documentation in R/compute.r
\name{compute.disk.frame}
\alias{compute.disk.frame}
-\title{Compute without writing}
+\title{Force computations. The results are stored in a folder.}
\usage{
-\method{compute}{disk.frame}(
- x,
- name,
- outdir = tempfile("tmp_df_", fileext = ".df"),
- overwrite = TRUE,
- ...
-)
+\method{compute}{disk.frame}(x, name = NULL, outdir = tempfile("tmp_df_", fileext = ".df"), ...)
}
\arguments{
\item{x}{a disk.frame}
-\item{name}{Not used. Kept for compatibility with dplyr}
+\item{name}{If not NULL then used as outdir prefix.}
\item{outdir}{the output directory}
-\item{overwrite}{whether to overwrite or not}
-
-\item{...}{Not used. Kept for dplyr compatibility}
+\item{...}{Passed to `write_disk.frame`}
}
\description{
Perform the computation; same as calling cmap without .f and lazy = FALSE
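A sketch of the new behaviour (lazy steps are recorded by cmap and only materialised by compute):

cars.df <- as.disk.frame(cars)

lazy.df <- cmap(cars.df, ~ .x[.x$speed > 10, ])  # recorded, not yet run
done.df <- compute(lazy.df)                      # played back and written to a folder
nrow(done.df)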
diff --git a/man/create_chunk_mapper.Rd b/man/create_chunk_mapper.Rd
index 0702093d..7be02cea 100644
--- a/man/create_chunk_mapper.Rd
+++ b/man/create_chunk_mapper.Rd
@@ -4,7 +4,7 @@
\alias{create_chunk_mapper}
\title{Create function that applies to each chunk if disk.frame}
\usage{
-create_chunk_mapper(chunk_fn, warning_msg = NULL, as.data.frame = TRUE)
+create_chunk_mapper(chunk_fn, warning_msg = NULL, as.data.frame = FALSE)
}
\arguments{
\item{chunk_fn}{The dplyr function to create a mapper for}
diff --git a/man/csv_to_disk.frame.Rd b/man/csv_to_disk.frame.Rd
index 8c59d1de..b82ac585 100644
--- a/man/csv_to_disk.frame.Rd
+++ b/man/csv_to_disk.frame.Rd
@@ -47,8 +47,7 @@ the highest compression ratio.}
\item{header}{Whether the files have header. Defaults to TRUE}
-\item{.progress}{A logical, for whether or not to print a progress bar for
-multiprocess, multisession, and multicore plans. From {furrr}}
+\item{.progress}{A logical, for whether or not to show progress}
\item{backend}{The CSV reader backend to choose: "data.table" or "readr".
disk.frame does not have its own CSV reader. It uses either
diff --git a/man/dplyr_verbs.Rd b/man/dplyr_verbs.Rd
index 340e6958..cfebebb4 100644
--- a/man/dplyr_verbs.Rd
+++ b/man/dplyr_verbs.Rd
@@ -8,8 +8,6 @@
\alias{transmute.disk.frame}
\alias{arrange.disk.frame}
\alias{chunk_arrange}
-\alias{add_tally.disk.frame}
-\alias{do.disk.frame}
\alias{distinct.disk.frame}
\alias{chunk_distinct}
\alias{glimpse.disk.frame}
@@ -29,10 +27,6 @@
chunk_arrange(.data, ...)
-add_tally.disk.frame(.data, ...)
-
-\method{do}{disk.frame}(.data, ...)
-
\method{distinct}{disk.frame}(...)
chunk_distinct(.data, ...)
diff --git a/man/group_by.Rd b/man/group_by.Rd
index 1bb0d4a2..56bd42f7 100644
--- a/man/group_by.Rd
+++ b/man/group_by.Rd
@@ -12,7 +12,12 @@
\method{summarize}{grouped_disk.frame}(.data, ...)
-\method{group_by}{disk.frame}(.data, ..., add = FALSE, .drop = dplyr::group_by_drop_default(.data))
+\method{group_by}{disk.frame}(
+ .data,
+ ...,
+ .add = FALSE,
+ .drop = stop("disk.frame does not support `.drop` in `group_by` at this stage")
+)
\method{summarize}{disk.frame}(.data, ...)
@@ -23,7 +28,7 @@
\item{...}{same as the dplyr::group_by}
-\item{add}{from dplyr}
+\item{.add}{from dplyr}
\item{.drop}{from dplyr}
}
diff --git a/man/hard_arrange.Rd b/man/hard_arrange.Rd
deleted file mode 100644
index a866f91d..00000000
--- a/man/hard_arrange.Rd
+++ /dev/null
@@ -1,53 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/hard_arrange.r
-\name{hard_arrange}
-\alias{hard_arrange}
-\alias{hard_arrange.data.frame}
-\alias{hard_arrange.disk.frame}
-\title{Perform a hard arrange}
-\usage{
-hard_arrange(df, ..., add = FALSE, .drop = FALSE)
-
-\method{hard_arrange}{data.frame}(df, ...)
-
-\method{hard_arrange}{disk.frame}(
- df,
- ...,
- outdir = tempfile("tmp_disk_frame_hard_arrange"),
- nchunks = disk.frame::nchunks(df),
- overwrite = TRUE
-)
-}
-\arguments{
-\item{df}{a disk.frame}
-
-\item{...}{grouping variables}
-
-\item{add}{same as dplyr::arrange}
-
-\item{.drop}{same as dplyr::arrange}
-
-\item{outdir}{the output directory}
-
-\item{nchunks}{The number of chunks in the output. Defaults = nchunks.disk.frame(df)}
-
-\item{overwrite}{overwrite the out put directory}
-}
-\description{
-A hard_arrange is a sort by that also reorganizes the chunks to ensure that
-every unique grouping of `by`` is in the same chunk. Or in other words, every
-row that share the same `by` value will end up in the same chunk.
-}
-\examples{
-iris.df = as.disk.frame(iris, nchunks = 2)
-
-# arrange iris.df by specifies and ensure rows with the same specifies are in the same chunk
-iris_hard.df = hard_arrange(iris.df, Species)
-
-get_chunk(iris_hard.df, 1)
-get_chunk(iris_hard.df, 2)
-
-# clean up cars.df
-delete(iris.df)
-delete(iris_hard.df)
-}
diff --git a/man/hard_group_by.Rd b/man/hard_group_by.Rd
deleted file mode 100644
index 3d2af379..00000000
--- a/man/hard_group_by.Rd
+++ /dev/null
@@ -1,65 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/hard_group_by.r
-\name{hard_group_by}
-\alias{hard_group_by}
-\alias{hard_group_by.data.frame}
-\alias{hard_group_by.disk.frame}
-\title{Perform a hard group}
-\usage{
-hard_group_by(df, ..., .add = FALSE, .drop = FALSE)
-
-\method{hard_group_by}{data.frame}(df, ..., .add = FALSE, .drop = FALSE)
-
-\method{hard_group_by}{disk.frame}(
- df,
- ...,
- outdir = tempfile("tmp_disk_frame_hard_group_by"),
- nchunks = disk.frame::nchunks(df),
- overwrite = TRUE,
- shardby_function = "hash",
- sort_splits = NULL,
- desc_vars = NULL,
- sort_split_sample_size = 100
-)
-}
-\arguments{
-\item{df}{a disk.frame}
-
-\item{...}{grouping variables}
-
-\item{.add}{same as dplyr::group_by}
-
-\item{.drop}{same as dplyr::group_by}
-
-\item{outdir}{the output directory}
-
-\item{nchunks}{The number of chunks in the output. Defaults = nchunks.disk.frame(df)}
-
-\item{overwrite}{overwrite the out put directory}
-
-\item{shardby_function}{splitting of chunks: "hash" for hash function or "sort" for semi-sorted chunks}
-
-\item{sort_splits}{for the "sort" shardby function, a dataframe with the split values.}
-
-\item{desc_vars}{for the "sort" shardby function, the variables to sort descending.}
-
-\item{sort_split_sample_size}{for the "sort" shardby function, if sort_splits is null, the number of rows to sample per chunk for random splits.}
-}
-\description{
-A hard_group_by is a group by that also reorganizes the chunks to ensure that
-every unique grouping of `by`` is in the same chunk. Or in other words, every
-row that share the same `by` value will end up in the same chunk.
-}
-\examples{
-iris.df = as.disk.frame(iris, nchunks = 2)
-
-# group_by iris.df by specifies and ensure rows with the same specifies are in the same chunk
-iris_hard.df = hard_group_by(iris.df, Species)
-
-get_chunk(iris_hard.df, 1)
-get_chunk(iris_hard.df, 2)
-
-# clean up cars.df
-delete(iris.df)
-delete(iris_hard.df)
-}
diff --git a/man/join.Rd b/man/join.Rd
index 74d6d0dd..218bd626 100644
--- a/man/join.Rd
+++ b/man/join.Rd
@@ -38,7 +38,9 @@
y,
by = NULL,
copy = FALSE,
+ suffix = c(".x", ".y"),
...,
+ keep = FALSE,
outdir = tempfile("tmp_disk_frame_inner_join"),
merge_by_chunk_id = NULL,
overwrite = TRUE,
@@ -50,7 +52,9 @@
y,
by = NULL,
copy = FALSE,
+ suffix = c(".x", ".y"),
...,
+ keep = FALSE,
outdir = tempfile("tmp_disk_frame_left_join"),
merge_by_chunk_id = FALSE,
overwrite = TRUE,
@@ -87,6 +91,10 @@
\item{overwrite}{overwrite output directory}
\item{.progress}{Show progress or not. Defaults to FALSE}
+
+\item{suffix}{see dplyr::XXX_join}
+
+\item{keep}{see dplyr::XXX_join}
}
\value{
disk.frame or data.frame/data.table
diff --git a/man/play.Rd b/man/play.Rd
new file mode 100644
index 00000000..d252ead2
--- /dev/null
+++ b/man/play.Rd
@@ -0,0 +1,16 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/play.r
+\name{play}
+\alias{play}
+\title{Play the recorded lazy operations}
+\usage{
+play(dataframe, recordings)
+}
+\arguments{
+\item{dataframe}{A data.frame}
+
+\item{recordings}{A recording of the expression, globals and packages, as captured by create_chunk_mapper}
+}
+\description{
+Play the recorded lazy operations
+}
diff --git a/man/purrr_as_mapper.Rd b/man/purrr_as_mapper.Rd
new file mode 100644
index 00000000..a34dd80b
--- /dev/null
+++ b/man/purrr_as_mapper.Rd
@@ -0,0 +1,14 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/util.r
+\name{purrr_as_mapper}
+\alias{purrr_as_mapper}
+\title{Used to convert a function to purrr syntax if needed}
+\usage{
+purrr_as_mapper(.f)
+}
+\arguments{
+\item{.f}{a normal function or purrr syntax function i.e. `~{ ...code...}`}
+}
+\description{
+Used to convert a function to purrr syntax if needed
+}
diff --git a/man/rbindlist.disk.frame.Rd b/man/rbindlist.disk.frame.Rd
index 9a334037..0549ebd5 100644
--- a/man/rbindlist.disk.frame.Rd
+++ b/man/rbindlist.disk.frame.Rd
@@ -27,7 +27,7 @@ rbindlist.disk.frame(
\item{overwrite}{overwrite the output directory}
-\item{.progress}{A logical, for whether or not to print a progress bar for multiprocess, multisession, and multicore plans. From {furrr}}
+\item{.progress}{A logical, for whether or not to show progress.}
}
\description{
rbindlist disk.frames together
diff --git a/man/rechunk.Rd b/man/rechunk.Rd
index ce74ac58..086c53b0 100644
--- a/man/rechunk.Rd
+++ b/man/rechunk.Rd
@@ -6,13 +6,10 @@
\usage{
rechunk(
df,
- nchunks,
+ nchunks = disk.frame::nchunks(df),
outdir = attr(df, "path", exact = TRUE),
shardby = NULL,
- overwrite = TRUE,
- shardby_function = "hash",
- sort_splits = NULL,
- desc_vars = NULL
+ overwrite = TRUE
)
}
\arguments{
@@ -25,12 +22,6 @@ rechunk(
\item{shardby}{the shardkeys}
\item{overwrite}{overwrite the output directory}
-
-\item{shardby_function}{splitting of chunks: "hash" for hash function or "sort" for semi-sorted chunks}
-
-\item{sort_splits}{for the "sort" shardby function, a dataframe with the split values.}
-
-\item{desc_vars}{for the "sort" shardby function, the variables to sort descending.}
}
\description{
Increase or decrease the number of chunks in the disk.frame
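For reference, the package's roxygen examples for rechunk (recoverable from the regenerated coverage report near the end of this diff) amount to:

# create a disk.frame with 2 chunks in tempdir()
cars.df = as.disk.frame(cars, nchunks = 2)

# re-chunk cars.df to 3 chunks, done "in-place" in the same folder as cars.df
rechunk(cars.df, 3)

# re-chunk to 4 chunks, sharded by speed, into a new directory
new_path = tempfile(fileext = ".df")
cars2.df = rechunk(cars.df, 4, outdir = new_path, shardby = "speed")

# clean up
delete(cars.df)
delete(cars2.df)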
diff --git a/man/shard.Rd b/man/shard.Rd
index 09b8ef5c..a5250d70 100644
--- a/man/shard.Rd
+++ b/man/shard.Rd
@@ -11,10 +11,7 @@ shard(
outdir = tempfile(fileext = ".df"),
...,
nchunks = recommend_nchunks(df),
- overwrite = FALSE,
- shardby_function = "hash",
- sort_splits = NULL,
- desc_vars = NULL
+ overwrite = FALSE
)
distribute(...)
@@ -31,12 +28,6 @@ distribute(...)
\item{nchunks}{The number of chunks}
\item{overwrite}{If TRUE then the chunks are overwritten}
-
-\item{shardby_function}{splitting of chunks: "hash" for hash function or "sort" for semi-sorted chunks}
-
-\item{sort_splits}{If shardby_function is "sort", the split values for sharding}
-
-\item{desc_vars}{for the "sort" shardby function, the variables to sort descending.}
}
\description{
Shard a data.frame/data.table or disk.frame into chunk and saves it into a disk.frame
diff --git a/man/srckeep.Rd b/man/srckeep.Rd
index c377c1c1..e4c4accc 100644
--- a/man/srckeep.Rd
+++ b/man/srckeep.Rd
@@ -2,12 +2,9 @@
% Please edit documentation in R/srckeep.disk.frame.r
\name{srckeep}
\alias{srckeep}
-\alias{srckeepchunks}
\title{Keep only the variables from the input listed in selections}
\usage{
srckeep(diskf, selections, ...)
-
-srckeepchunks(diskf, chunks, ...)
}
\arguments{
\item{diskf}{a disk.frame}
@@ -15,8 +12,6 @@ srckeepchunks(diskf, chunks, ...)
\item{selections}{The list of variables to keep from the input source}
\item{...}{not yet used}
-
-\item{chunks}{The chunks to load}
}
\description{
Keep only the variables from the input listed in selections
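A sketch of srckeep in use (flights.df as in the articles above); only the listed columns are read from disk, which is what makes the two-stage aggregations earlier in this diff fast:

flights.df %>%
  srckeep(c("year", "month", "day", "dep_delay")) %>%
  collect %>%
  head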
diff --git a/man/write_disk.frame.Rd b/man/write_disk.frame.Rd
index 88977489..60c2bf09 100644
--- a/man/write_disk.frame.Rd
+++ b/man/write_disk.frame.Rd
@@ -6,10 +6,10 @@
\title{Write disk.frame to disk}
\usage{
write_disk.frame(
- df,
+ diskf,
outdir = tempfile(fileext = ".df"),
- nchunks = ifelse("disk.frame" \%in\% class(df), nchunks.disk.frame(df),
- recommend_nchunks(df)),
+ nchunks = ifelse("disk.frame" \%in\% class(diskf), nchunks.disk.frame(diskf),
+ recommend_nchunks(diskf)),
overwrite = FALSE,
shardby = NULL,
compress = 50,
@@ -22,7 +22,7 @@ write_disk.frame(
output_disk.frame(...)
}
\arguments{
-\item{df}{a disk.frame}
+\item{diskf}{a disk.frame}
\item{outdir}{output directory for the disk.frame}
diff --git a/misc/NAMESPACE_20190205 b/misc/NAMESPACE_20190205
deleted file mode 100644
index 72ead0da..00000000
--- a/misc/NAMESPACE_20190205
+++ /dev/null
@@ -1,99 +0,0 @@
-# Generated by roxygen2: do not edit by hand
-
-S3method("[",disk.frame)
-S3method(anti_join,disk.frame)
-S3method(arrange_,disk.frame)
-S3method(as.data.frame,disk.frame)
-S3method(as.data.table,disk.frame)
-S3method(collect,disk.frame)
-S3method(compute,disk.frame)
-S3method(delayed,disk.frame)
-S3method(do_,disk.frame)
-S3method(filter_,disk.frame)
-S3method(full_join,disk.frame)
-S3method(get_chunk,disk.frame)
-S3method(glimpse,disk.frame)
-S3method(group_by,disk.frame)
-S3method(group_by_,disk.frame)
-S3method(groups,disk.frame)
-S3method(hard_group_by,disk.frame)
-S3method(head,disk.frame)
-S3method(inner_join,disk.frame)
-S3method(left_join,disk.frame)
-S3method(merge,disk.frame)
-S3method(mutate_,disk.frame)
-S3method(names,disk.frame)
-S3method(nchunk,disk.frame)
-S3method(nchunks,disk.frame)
-S3method(ncol,default)
-S3method(ncol,disk.frame)
-S3method(nrow,default)
-S3method(nrow,disk.frame)
-S3method(print,disk.frame)
-S3method(print,xgdf_scorecard)
-S3method(rename_,disk.frame)
-S3method(sample_frac,disk.frame)
-S3method(select_,disk.frame)
-S3method(semi_join,disk.frame)
-S3method(summarise_,disk.frame)
-S3method(tail,disk.frame)
-S3method(tbl_vars,disk.frame)
-S3method(transmute_,disk.frame)
-export(add_chunk)
-export(add_meta)
-export(as.disk.frame)
-export(auc)
-export(chunk_lapply)
-export(collect_list)
-export(colnames)
-export(csv_to_disk.frame)
-export(delayed)
-export(disk.frame)
-export(distribute)
-export(evalparseglue)
-export(foverlaps.disk.frame)
-export(get_chunk)
-export(hard_group_by)
-export(is_disk.frame)
-export(lazy)
-export(map.disk.frame)
-export(map_by_chunk_id)
-export(nchunk)
-export(nchunks)
-export(ncol)
-export(nrow)
-export(overwrite_check)
-export(rbindlist.disk.frame)
-export(rechunk)
-export(recommend_nchunks)
-export(remove_chunk)
-export(shard)
-export(shardkey)
-export(srckeep)
-export(write_disk.frame)
-export(zip_to_disk.frame)
-import(base)
-import(dplyr)
-import(dtplyr)
-import(fs)
-import(fst)
-import(furrr)
-import(purrr)
-import(stringr)
-importFrom(Rcpp,evalCpp)
-importFrom(data.table,as.data.table)
-importFrom(data.table,data.table)
-importFrom(data.table,foverlaps)
-importFrom(data.table,rbindlist)
-importFrom(data.table,setDT)
-importFrom(furrr,future_map_dfr)
-importFrom(future.apply,future_lapply)
-importFrom(glue,glue)
-importFrom(jsonlite,fromJSON)
-importFrom(jsonlite,toJSON)
-importFrom(pryr,object_size)
-importFrom(purrr,map_dfr)
-importFrom(xgboost,xgb.DMatrix)
-importFrom(xgboost,xgb.save)
-importFrom(xgboost,xgboost)
-useDynLib(disk.frame)
diff --git a/misc/NAMESPACE_ok b/misc/NAMESPACE_ok
deleted file mode 100644
index 18b7d99c..00000000
--- a/misc/NAMESPACE_ok
+++ /dev/null
@@ -1,61 +0,0 @@
-# Generated by roxygen2: do not edit by hand
-useDynLib(disk.frame, .registration=TRUE)
-importFrom(Rcpp,evalCpp)
-S3method("[",disk.frame)
-S3method(anti_join,disk.frame)
-S3method(as.data.frame,disk.frame)
-S3method(as.data.table,disk.frame)
-S3method(collect,disk.frame)
-S3method(do_,disk.frame)
-S3method(filter_,disk.frame)
-S3method(get_chunk,disk.frame)
-S3method(group_by_,disk.frame)
-S3method(groups,disk.frame)
-S3method(hard_group_by,disk.frame)
-S3method(head,disk.frame)
-S3method(inner_join,disk.frame)
-S3method(keep,disk.frame)
-S3method(left_join,disk.frame)
-S3method(map,disk.frame)
-S3method(merge,disk.frame)
-S3method(mutate_,disk.frame)
-S3method(names,disk.frame)
-S3method(nchunk,disk.frame)
-S3method(nchunks,disk.frame)
-S3method(nrow,default)
-S3method(nrow,disk.frame)
-S3method(print,disk.frame)
-S3method(rename_,disk.frame)
-S3method(select_,disk.frame)
-S3method(semi_join,disk.frame)
-S3method(summarise_,disk.frame)
-S3method(tail,disk.frame)
-S3method(tbl_vars,disk.frame)
-S3method(transmute_,disk.frame)
-export(chunk_lapply)
-export(collect)
-export(colnames.disk.frame)
-export(delayed)
-export(disk.frame)
-export(distribute)
-export(foverlaps.disk.frame)
-export(get_chunk)
-export(hard_group_by)
-export(keep)
-export(lazy)
-export(map)
-export(nchunk)
-export(nchunks)
-export(nrow)
-export(progressbar)
-export(rbindlist.disk.frame)
-export(shard)
-import(data.table)
-import(dplyr)
-import(dtplyr)
-import(fs)
-import(fst)
-import(future)
-import(future.apply)
-import(glue)
-import(purrr)
diff --git a/misc/disk.frame-report.html b/misc/disk.frame-report.html
index 12c97232..5f8a1d71 100644
--- a/misc/disk.frame-report.html
+++ b/misc/disk.frame-report.html
@@ -22,7 +22,7 @@
-    disk.frame coverage - 51.96%
+    disk.frame coverage - 52.99%
[regenerated covr HTML report; the line-by-line coverage tables are not reproduced here. In this first block the metadata helpers (add_meta, is_disk.frame) are re-rendered unchanged, and the old inner_join.disk.frame listing (with its hard_group_by-based rechunking path) is replaced by the 0.6.0 setup_disk.frame(), which gains a shiny-based gui = TRUE mode alongside the future-backend setup.]
@@ -2271,3080 +2404,6566 @@
[second block of the regenerated report: the old `[.disk.frame` listing (NSE via substitute(...()) and globals::findGlobals, evaluated chunk-by-chunk with future.apply::future_lapply) and the old rechunk() listing are replaced by the 0.6.0 csv_to_disk.frame(), whose roxygen now documents the "data.table", "readr" and "LaF" backends, the chunk_reader strategies ("bigreadr", "data.table", "readr", "readLines"), and per-backend hints for setting column types.]
60 | -4x | +|||
126 | +! |
- fs::dir_delete(file.path(outdir, ".metadata"))+ stop("csv_to_disk.frame: backend = 'LaF' can only be used when in_chunk_size != NULL") |
||
61 | +127 |
}
|
||
62 | +128 |
- + }
|
||
63 | +129 |
- # TODO check for validity
+ |
||
64 | -4x | +130 | +8x |
- message(glue::glue("files have been backed up to temporary dir {back_up_tmp_dir}. You can recover there files until you restart your R session"))+ if(backend == "LaF" & !is.null(in_chunk_size) & length(infile) == 1) { |
65 | -+ | |||
131 | +! |
- + df_out = disk.frame(outdir) |
||
66 | -4x | +|||
132 | +! |
- df = disk.frame(back_up_tmp_dir)+ dm = LaF::detect_dm_csv(infile, header = TRUE, ...) |
||
67 | -+ | |||
133 | +! |
- }
+ LaF::process_blocks(LaF::laf_open(dm), function(chunk, past) { |
||
68 | -+ | |||
134 | +! |
- + if("data.frame" %in% class(chunk)) { |
||
69 | -5x | +|||
135 | +! | +
+ if(nrow(chunk) > 0 ) {+ |
+ ||
136 | +! | +
+ add_chunk(df_out, chunk)+ |
+ ||
137 | ++ |
+ }
+ |
+ ||
138 | ++ |
+ }
+ |
+ ||
139 | +! | +
+ NULL
+ |
+ ||
140 | +! | +
+ }, nrows = in_chunk_size)+ |
+ ||
141 | +! | +
+ return(df_out)+ |
+ ||
142 | +8x | +
+ } else if(backend == "data.table" & (is.null(in_chunk_size) | chunk_reader == "data.table")) {+ |
+ ||
143 | +8x | +
+ csv_to_disk.frame_data.table_backend(+ |
+ ||
144 | +8x | +
+ infile,
+ |
+ ||
145 | +8x | +
+ outdir,
+ |
+ ||
146 | +8x | +
+ inmapfn,
+ |
+ ||
147 | +8x | +
+ nchunks,
+ |
+ ||
148 | +8x | +
+ in_chunk_size,
+ |
+ ||
149 | +8x | +
+ shardby,
+ |
+ ||
150 | +8x | +
+ compress,
+ |
+ ||
151 | +8x | +
+ overwrite,
+ |
+ ||
152 | +8x | +
+ header,
+ |
+ ||
153 | +8x | +
+ .progress, ...+ |
+ ||
154 | ++ |
+ )
+ |
+ ||
155 | +! | +
+ } else if (backend == "data.table" & chunk_reader == "bigreadr" & !is.null(in_chunk_size)) {+ |
+ ||
156 | ++ |
+ # use bigreadr to split the files
+ |
+ ||
157 | +! | +
+ tf = tempfile()+ |
+ ||
158 | +! | +
+ pt = proc.time()+ |
+ ||
159 | +! | +
+ message(" ----------------------------------------------------- ")+ |
+ ||
160 | +! | +
+ message(glue::glue("Stage 1 of 2: splitting the file {infile} into smallers files:"))+ |
+ ||
161 | +! | +
+ message(glue::glue("Destination: {tf}"))+ |
+ ||
162 | +! | +
+ message(" ----------------------------------------------------- ")+ |
+ ||
163 | ++ |
+ + |
+ ||
164 | +! | +
+ split_file_info = bigreadr::split_file(+ |
+ ||
165 | +! | +
+ infile,
+ |
+ ||
166 | +! | +
+ every_nlines = in_chunk_size,+ |
+ ||
167 | +! | +
+ prefix_out = tf,+ |
+ ||
168 | +! | +
+ repeat_header = header)+ |
+ ||
169 | +! | +
+ files_split = bigreadr::get_split_files(split_file_info)+ |
+ ||
170 | +! | +
+ message(paste("Stage 1 of 2 took:", data.table::timetaken(pt)))+ |
+ ||
171 | +! | +
+ message(" ----------------------------------------------------- ")+ |
+ ||
172 | ++ |
+ + |
+ ||
173 | +! | +
+ pt2 = proc.time()+ |
+ ||
174 | +! | +
+ message(glue::glue("Stage 2 of 2: Converting the smaller files into disk.frame"))+ |
+ ||
175 | +! | +
+ message(" ----------------------------------------------------- ")+ |
+ ||
176 | ++ |
+ + |
+ ||
177 | +! | +
+ res = csv_to_disk.frame(+ |
+ ||
178 | +! | +
+ files_split,
+ |
+ ||
179 | +! | +
+ outdir = outdir,+ |
+ ||
180 | +! | +
+ inmapfn = inmapfn,+ |
+ ||
181 | +! | +
+ nchunks = nchunks,+ |
+ ||
182 | +! | +
+ shardby = shardby,+ |
+ ||
183 | +! | +
+ compress = compress,+ |
+ ||
184 | +! | +
+ overwrite = overwrite,+ |
+ ||
185 | +! | +
+ header = header,+ |
+ ||
186 | +! | +
+ .progress = .progress,+ |
+ ||
187 | +! | +
+ backend = backend,+ |
+ ||
188 | ++ |
+ ...)+ |
+ ||
189 | ++ |
+ + |
+ ||
190 | +! | +
+ message(paste("Stage 2 of 2 took:", data.table::timetaken(pt2)))+ |
+ ||
191 | +! | +
+ message(" ----------------------------------------------------- ")+ |
+ ||
192 | ++ |
+ + |
+ ||
193 | +! | +
+ message(paste("Stage 2 & 2 took:", data.table::timetaken(pt)))+ |
+ ||
194 | +! | +
+ message(" ----------------------------------------------------- ")+ |
+ ||
195 | ++ |
+ + |
+ ||
196 | +! | +
+ return(res)+ |
+ ||
197 | +! | +
+ } else if (backend == "data.table" & chunk_reader == "readLines" & !is.null(in_chunk_size)) {+ |
+ ||
198 | +! | +
+ if (length(infile) == 1) {+ |
+ ||
199 | ++ |
+ # establish a read connection to the file
+ |
+ ||
200 | +! | +
+ con = file(infile, "r")+ |
+ ||
201 | +! | +
+ on.exit(close(con))+ |
+ ||
202 | +! | +
+ xx = readLines(con, n = in_chunk_size)+ |
+ ||
203 | +! | +
+ diskf = disk.frame(outdir)+ |
+ ||
204 | +! | +
+ header_copy = header+ |
+ ||
205 | +! | +
+ colnames_copy = NULL+ |
+ ||
206 | +! | +
+ while(length(xx) > 0) {+ |
+ ||
207 | +! | +
+ if(is.null(colnames_copy)) {+ |
+ ||
208 | +! | +
+ new_chunk = inmapfn(data.table::fread(text = xx, header = header_copy, ...))+ |
+ ||
209 | +! | +
+ colnames_copy = names(new_chunk)+ |
+ ||
210 | ++ |
+ } else {+ |
+ ||
211 | ++ |
+ # TODO detect the correct delim; manually adding header
+ |
+ ||
212 | +! | +
+ header_colnames = paste0(colnames_copy, collapse = ",")+ |
+ ||
213 | +! | +
+ xx = c(header_colnames, xx)+ |
+ ||
214 | +! | +
+ new_chunk = inmapfn(+ |
+ ||
215 | +! | +
+ data.table::fread(+ |
+ ||
216 | +! | +
+ text = xx,+ |
+ ||
217 | +! | +
+ header = TRUE+ |
+ ||
218 | ++ |
+ , ...
+ |
+ ||
219 | ++ |
+ )
+ |
+ ||
220 | ++ |
+ )
+ |
+ ||
221 | ++ |
+ + |
+ ||
222 | ++ |
+ }
+ |
+ ||
223 | ++ | + + | +||
224 | +! | +
+ add_chunk(diskf, new_chunk)+ |
+ ||
225 | +! | +
+ xx = readLines(con, n = in_chunk_size)+ |
+ ||
226 | +! | +
+ header_copy = FALSE+ |
+ ||
227 | ++ |
+ }
+ |
+ ||
228 | +! | +
+ return(diskf)+ |
+ ||
229 | ++ |
+ } else {+ |
+ ||
230 | +! | +
+ stop("chunk_reader = 'readLines' is not yet supported for multiple files")+ |
+ ||
231 | ++ |
+ }
+ |
+ ||
232 | +! | +
+ } else if (backend == "data.table" & chunk_reader == "readr" & !is.null(in_chunk_size)) {+ |
+ ||
233 | +! | +
+ if (length(infile) == 1) {+ |
+ ||
234 | +! | +
+ diskf = disk.frame(outdir)+ |
+ ||
235 | ++ | + + | +||
236 | +! | +
+ colnames_copy = NULL+ |
+ ||
237 | +! | +
+ readr::read_lines_chunked(file = infile, callback = readr::SideEffectChunkCallback$new(function(xx, i) {+ |
+ ||
238 | ++ |
+ + |
+ ||
239 | +! | +
+ if(is.null(colnames_copy)) {+ |
+ ||
240 | +! | +
+ new_chunk = inmapfn(+ |
+ ||
241 | +! | +
+ data.table::fread(+ |
+ ||
242 | +! | +
+ text = xx,+ |
+ ||
243 | +! | +
+ header = header+ |
+ ||
244 | ++ |
+ , ...
+ |
+ ||
245 | ++ |
+ )
+ |
+ ||
246 | ++ |
+ )
+ |
+ ||
247 | +! | +
+ colnames_copy <<- names(new_chunk)+ |
+ ||
248 | ++ |
+ } else {+ |
+ ||
249 | +! | +
+ header_colnames = paste0(colnames_copy, collapse = ",")+ |
+ ||
250 | +! | +
+ xx = c(header_colnames, xx)+ |
+ ||
251 | +! | +
+ new_chunk = inmapfn(+ |
+ ||
252 | +! | +
+ data.table::fread(+ |
+ ||
253 | +! | +
+ text = xx,+ |
+ ||
254 | +! | +
+ header = TRUE+ |
+ ||
255 | ++ |
+ , ...
+ |
+ ||
256 | ++ |
+ )
+ |
+ ||
257 | ++ |
+ )
+ |
+ ||
258 | ++ |
+ }
+ |
+ ||
259 | +! | +
+ add_chunk(diskf, new_chunk)+ |
+ ||
260 | +! | +
+ }), chunk_size = in_chunk_size, progress = .progress)+ |
+ ||
261 | ++ |
+ + |
+ ||
262 | +! | +
+ return(diskf)+ |
+ ||
263 | ++ |
+ } else {+ |
+ ||
264 | +! | +
+ stop("chunk_reader = 'readr' is not yet supported for multiple files")+ |
+ ||
265 | ++ |
+ }
+ |
+ ||
266 | +! | +
+ } else if(backend == "readr") {+ |
+ ||
267 | ++ |
+ # if(is.null(in_chunk_size)) {
+ |
+ ||
268 | ++ |
+ # stop("for readr backend, only in_chunk_size != NULL is supported")
+ |
+ ||
269 | ++ |
+ # } else if (!is.null(shardby)) {
+ |
+ ||
270 | ++ |
+ # stop("for readr backend, only shardby == NULL is supported")
+ |
+ ||
271 | ++ |
+ # }
+ |
+ ||
272 | +! | +
+ csv_to_disk.frame_readr(+ |
+ ||
273 | +! | +
+ infile,
+ |
+ ||
274 | +! | +
+ outdir=outdir,+ |
+ ||
275 | +! | +
+ inmapfn=inmapfn,+ |
+ ||
276 | +! | +
+ nchunks=nchunks,+ |
+ ||
277 | +! | +
+ in_chunk_size=in_chunk_size,+ |
+ ||
278 | +! | +
+ shardby=shardby,+ |
+ ||
279 | +! | +
+ compress=compress,+ |
+ ||
280 | +! | +
+ overwrite=TRUE,+ |
+ ||
281 | +! | +
+ col_names=header,+ |
+ ||
282 | +! | +
+ .progress=.progress, ...)+ |
+ ||
283 | ++ |
+ } else {+ |
+ ||
284 | +! | +
+ stop("csv_to_disk.frame: this set of options is not supported")+ |
+ ||
285 | ++ |
+ }
+ |
+ ||
286 | ++ |
+ }
+ |
+ ||
287 | ++ | + + | +||
288 | ++ | + + | +||
289 | ++ |
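As a usage sketch of the dispatch above (file paths are hypothetical, and `setup_disk.frame()` is assumed to have been called to start the workers):

    library(disk.frame)
    setup_disk.frame()

    # in-memory read: the whole file fits in RAM, default data.table backend
    small.df = csv_to_disk.frame("small.csv", outdir = file.path(tempdir(), "small.df"))

    # chunk-wise read: a larger-than-RAM file, 1m lines at a time via readLines
    big.df = csv_to_disk.frame(
      "big.csv",
      in_chunk_size = 1e6,
      chunk_reader  = "readLines",
      colClasses    = list(character = "id")  # forwarded through ... to data.table::fread
    )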
+csv_to_disk.frame_data.table_backend <- function(infile, outdir = tempfile(fileext = ".df"), inmapfn = base::I, nchunks = recommend_nchunks(sum(file.size(infile))),
+                                                 in_chunk_size = NULL, shardby = NULL, compress = 50, overwrite = TRUE, header = TRUE, .progress = TRUE, ...) {
+  # reading multiple files
+  if(length(infile) > 1) {
+    dotdotdot = list(...)
+    origarg = list(inmapfn = inmapfn, nchunks = nchunks,
+                   in_chunk_size = in_chunk_size, shardby = shardby, compress = compress,
+                   overwrite = TRUE, header = header)
+    dotdotdotorigarg = c(dotdotdot, origarg)
+
+    pt <- proc.time()
+    if(.progress) {
+      message("=================================================")
+      message(" ----------------------------------------------------- ")
+      message("-- Converting CSVs to disk.frame -- Stage 1 of 2:")
+      message("")
+      message(glue::glue("Converting {length(infile)} CSVs to {length(infile)} disk.frames each consisting of {nchunks} chunks"))
+      message("")
+    }
+
+    outdf_tmp = furrr::future_imap(infile, ~{
+      dotdotdotorigarg1 = c(dotdotdotorigarg, list(outdir = file.path(tempdir(), .y), infile = .x))
+
+      pryr::do_call(csv_to_disk.frame_data.table_backend, dotdotdotorigarg1)
+    }, .progress = .progress)
+
+    if(.progress) {
+      message(paste("-- Converting CSVs to disk.frame -- Stage 1 of 2 took:", data.table::timetaken(pt)))
+      message(" ----------------------------------------------------- ")
+    }
+
+    message(" ----------------------------------------------------- ")
+    message("-- Converting CSVs to disk.frame -- Stage 2 of 2:")
+    message("")
+    message(glue::glue("Row-binding the {length(infile)} disk.frames together to form one large disk.frame:"))
+    message(glue::glue("Creating the disk.frame at {outdir}"))
+    message("")
+    pt2 <- proc.time()
+    outdf = rbindlist.disk.frame(outdf_tmp, outdir = outdir, by_chunk_id = TRUE, compress = compress, overwrite = overwrite, .progress = .progress)
+
+    if(.progress) {
+      message(paste("Stage 2 of 2 took:", data.table::timetaken(pt2)))
+      message(" ----------------------------------------------------- ")
+      message(paste("Stages 1 & 2 in total took:", data.table::timetaken(pt)))
+    }
+
+    return(outdf)
+  } else { # reading one file
+    l = length(list.files(outdir))
+    if(is.null(shardby)) {
+      if(is.null(in_chunk_size)) {
+        a = as.disk.frame(inmapfn(data.table::fread(infile, header = header, ...)), outdir, compress = compress, nchunks = nchunks, overwrite = overwrite, ...)
+        return(a)
+      } else {
+        outdf = disk.frame(outdir)
+        i <- 0
+        tmpdir1 = tempfile(pattern = "df_tmp")
+        fs::dir_create(tmpdir1)
+
+        done = FALSE
+        skiprows = 0
+        column_names = ""
+        while(!done) {
+          if (identical(column_names, "")) {
+            tmpdt = inmapfn(data.table::fread(
+              infile,
+              skip = skiprows, nrows = in_chunk_size, ...))
+            column_names = names(tmpdt)
+          } else {
+            ddd = list(...)
+            if ("col.names" %in% names(ddd)) {
+              tmpdt = inmapfn(data.table::fread(
+                infile,
+                skip = skiprows, nrows = in_chunk_size,
+                header = FALSE, ...))
+            } else {
+              tmpdt = inmapfn(data.table::fread(
+                infile,
+                skip = skiprows, nrows = in_chunk_size,
+                header = FALSE, col.names = column_names, ...))
+            }
+          }
+
+          i <- i + 1
+          skiprows = skiprows + in_chunk_size +
+            # skips the header as well, but only at the first chunk
+            ifelse(i == 1 & header, 1, 0)
+          rows <- tmpdt[, .N]
+          if(rows < in_chunk_size) {
+            done <- TRUE
+          }
+
+          # add to chunk
+          add_chunk(outdf, tmpdt)
+          rm(tmpdt); gc()
+        }
+
+        message(glue::glue("read {in_chunk_size*(i-1) + rows} rows from {infile}"))
+
+        # remove the files
+        fs::dir_delete(tmpdir1)
+      }
+    } else { # shard by some columns
+      if(is.null(in_chunk_size)) {
+        return(
+          shard(inmapfn(data.table::fread(infile, header = header, ...)), shardby = shardby, nchunks = nchunks, outdir = outdir, overwrite = overwrite, compress = compress, ...)
+        )
+      } else {
+        i <- 0
+        tmpdir1 = tempfile(pattern = "df_tmp")
+        fs::dir_create(tmpdir1)
+        #message(tmpdir1)
+
+        done = FALSE
+        skiprows = 0
+        column_names = ""
+        while(!done) {
+          if (identical(column_names, "")) {
+            tmpdt = inmapfn(data.table::fread(
+              infile,
+              skip = skiprows, nrows = in_chunk_size, ...))
+            column_names = names(tmpdt)
+          } else {
+            ddd = list(...)
+            if ("col.names" %in% names(ddd)) {
+              tmpdt = inmapfn(data.table::fread(
+                infile,
+                skip = skiprows, nrows = in_chunk_size,
+                header = FALSE, ...))
+            } else {
+              tmpdt = inmapfn(data.table::fread(
+                infile,
+                skip = skiprows, nrows = in_chunk_size,
+                header = FALSE, col.names = column_names, ...))
+            }
+          }
+
+          i <- i + 1
+          skiprows = skiprows + in_chunk_size +
+            # skips the header as well, but only at the first chunk
+            ifelse(i == 1 & header, 1, 0)
+          rows <- tmpdt[, .N]
+          if(rows < in_chunk_size) {
+            done <- TRUE
+          }
+
+          tmp.disk.frame = shard(
+            tmpdt,
+            shardby = shardby,
+            nchunks = nchunks,
+            outdir = file.path(tmpdir1, i),
+            overwrite = TRUE,
+            compress = compress, ...)
+          rm(tmpdt); gc()
+        }
+
+        message(glue::glue("read {in_chunk_size*(i-1) + rows} rows from {infile}"))
+
+        # do not run this in parallel, as the level above this is likely in parallel
+        # ZJ:
+        system.time(
+          fnl_out <-
+            rbindlist.disk.frame(
+              lapply(
+                list.files(
+                  tmpdir1, full.names = TRUE), disk.frame),
+              outdir = outdir, by_chunk_id = TRUE, parallel = FALSE, overwrite = overwrite, compress = compress))
+
+        # remove the files
+        fs::dir_delete(tmpdir1)
+      }
+    }
+    df = disk.frame(outdir)
+    df = add_meta(df, nchunks = disk.frame::nchunks(df), shardkey = shardby, shardchunks = nchunks, compress = compress)
+    return(df)
+  }
+}
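The single-file, in_chunk_size branch above leans entirely on data.table::fread's skip/nrows arguments. A stripped-down sketch of that loop (file name hypothetical; like the source, it assumes the final chunk comes up shorter than in_chunk_size):

    library(data.table)
    in_chunk_size = 1000L
    skiprows = 0L
    col_names = NULL
    repeat {
      if (is.null(col_names)) {
        # first pass: read the header and remember the column names
        chunk = fread("tmp.csv", skip = skiprows, nrows = in_chunk_size, header = TRUE)
        col_names = names(chunk)
        skiprows = skiprows + 1L  # account for the header row once
      } else {
        chunk = fread("tmp.csv", skip = skiprows, nrows = in_chunk_size,
                      header = FALSE, col.names = col_names)
      }
      # ... write `chunk` out, e.g. via add_chunk() ...
      skiprows = skiprows + in_chunk_size
      if (nrow(chunk) < in_chunk_size) break
    }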
+#@importFrom readr DataFrameCallback
+#' @noRd
+#' @noMd
+csv_to_disk.frame_readr <- function(infile, outdir = tempfile(fileext = ".df"), inmapfn = base::I, nchunks = recommend_nchunks(sum(file.size(infile))),
+                                    in_chunk_size = NULL, shardby = NULL, compress = 50, overwrite = TRUE, col_names = TRUE, .progress = TRUE, delim = ",", ...) {
+  overwrite_check(outdir, overwrite)
+
+  df = disk.frame(outdir)
+
+  # TODO check header
+  if(is.null(in_chunk_size)) {
+    if(is.null(shardby)) {
+      return(
+        as.disk.frame(
+          readr::read_delim(infile, delim = delim, ...) %>% inmapfn,
+          outdir = outdir,
+          overwrite = TRUE,
+          nchunks = nchunks,
+          compress = compress,
+          ...
+        )
+      )
+    } else {
+      return(shard(readr::read_delim(infile, delim = delim, ...), shardby = shardby, outdir = outdir, nchunks = nchunks, overwrite = TRUE, ...))
+    }
+  } else {
+    if(!is.null(shardby)) {
+      tmp_dir = tempfile()
+      overwrite_check(tmp_dir, TRUE)
+      df_tmp = disk.frame(tmp_dir)
+      f = function(x, pos) {
+        add_chunk(df_tmp, inmapfn(x))
+      }
+
+      message("csv_to_disk.frame readr backend: Stage 1/2 -- reading file")
+      readr::read_delim_chunked(infile, readr::SideEffectChunkCallback$new(f), chunk_size = in_chunk_size, delim = delim, ...)
+
+      message(glue::glue("csv_to_disk.frame readr backend: Stage 2/2 -- performing shardby {shardby}"))
+      df = shard(df_tmp, shardby = shardby, outdir = outdir, nchunks = nchunks, overwrite = TRUE)
+      df = add_meta(df, nchunks = disk.frame::nchunks(df), shardkey = shardby, shardchunks = nchunks, compress = compress)
+      df
+    } else {
+      f = function(x, pos) {
+        add_chunk(df, inmapfn(x))
+      }
+      readr::read_delim_chunked(infile, readr::SideEffectChunkCallback$new(f), chunk_size = in_chunk_size, delim = delim, ...)
+      df = add_meta(df, nchunks = disk.frame::nchunks(df), shardkey = shardby, shardchunks = nchunks, compress = compress)
+      df
+    }
+  }
+}
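The readr path above is built on readr's chunked readers. The callback pattern in isolation (with a hypothetical mtcars.csv) looks like:

    library(readr)

    total = 0L
    cb = SideEffectChunkCallback$new(function(x, pos) {
      # x is the parsed chunk (a tibble); pos is the row position it starts at
      total <<- total + nrow(x)
    })
    read_delim_chunked("mtcars.csv", cb, delim = ",", chunk_size = 10)
    total  # number of rows seen across all chunks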
+# Alternative to hashstr2i that can produce semi-sorted chunks
+# Apply as e.g.:
+# split_values <- cmap(dff, sample_n, size=1) %>%
+#   select(c("id1", "id2")) %>%
+#   collect() %>%
+#   arrange(id1, id2)
+#
+# shard_by_rule <- splitstr2i(split_values)
+# code = glue::glue("df[, .out.disk.frame.id := {shard_by_rule}]")
+
+# Check if date
+is.date <- function(x) inherits(x, 'Date')
+
+# Escapes names
+# Factors are converted to numbers
+escape_name <- function(name, x) {
+  if(is.factor(x)) {
+    paste0("as.numeric(", name, ")")
+  } else {
+    name
+  }
+}
+
+# Escapes values
+# Strings and dates are quoted
+# Factors are converted to numbers
+escape_value <- function(x) {
+  if(is.character(x) | is.date(x)) {
+    paste0("\"", x, "\"")
+  } else if(is.factor(x)) {
+    as.numeric(x)
+  } else {
+    x
+  }
+}
+
+# Switch condition - returns
+# ({name} < {split_value} | ({name} == {split_value} ...
+switchcond <- function(name, split_values, desc_vars) {
+  paste0("(",
+         escape_name(name, split_values[, name]),
+         ifelse(name %in% desc_vars, " < ", " > "),
+         escape_value(split_values[, name]),
+         " | (",
+         name,
+         " == ",
+         escape_value(split_values[, name])
+  )
+}
+
+# Composes the switch conditions, so each split row becomes
+# ({name1} < {split_value1} | ({name1} == {split_value1} &
+#   ({name2} < {split_value2} | ({name2} == {split_value2} ...)))) * 1
+# the sum over the split rows is the id
+sortablestr2i <- function(split_values, desc_vars) {
+  do.call(
+    paste,
+    c(
+      lapply(
+        as.list(do.call(paste, c(lapply(colnames(split_values), switchcond, split_values, desc_vars), sep = " & "))),
+        function(x) { paste0("(", x, paste0(rep(")", ncol(split_values) * 2 + 1), collapse = ""), "* 1") }
+      ),
+      sep = " + "
+    )
+  )
+}
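To make the string-building above concrete: for a single ascending sort column with two split rows, sortablestr2i() produces an expression that counts how many split points each row is at or past (my own worked illustration of the helpers above, not output copied from the package):

    split_values = data.frame(id1 = c(10, 20))
    sortablestr2i(split_values, desc_vars = character(0))
    # [1] "((id1 > 10 | (id1 == 10)))* 1 + ((id1 > 20 | (id1 == 20)))* 1"
    # evaluated inside df[, .out.disk.frame.id := ...]: rows below the first split
    # score 0, rows at or between the splits score 1, rows past the second score 2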
+#' Returns the shardkey (not implemented yet)
+#' @importFrom jsonlite fromJSON
+#' @param df a disk.frame
+#' @export
+# TODO make this work
+shardkey <- function(df) {
+  meta_file = file.path(attr(df, "path", exact = TRUE), ".metadata", "meta.json")
+  if(!file.exists(meta_file)) {
+    add_meta(df)
+  }
+  meta = jsonlite::fromJSON(meta_file)
+  list(shardkey = meta$shardkey, shardchunks = meta$shardchunks)
+}
+
+#' Compare two disk.frame shardkeys
+'
+#' @param sk1 shardkey1
+#' @param sk2 shardkey2
+#' @export
+shardkey_equal <- function(sk1, sk2) {
+  if (sk1$shardkey == "") {
+    # if the shardkey is not set, then it's the same as having no shardkey
+    return(FALSE)
+  }
+  (sk1$shardkey == sk2$shardkey) && (sk1$shardchunks == sk2$shardchunks)
+}
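shardkey() above just deserializes the disk.frame's .metadata/meta.json. A hedged sketch of what such a file might contain (field values illustrative, not a dump of a real file):

    library(jsonlite)
    meta = fromJSON('{"nchunks": 6, "shardkey": "id1", "shardchunks": 6, "compress": 50}')
    list(shardkey = meta$shardkey, shardchunks = meta$shardchunks)
    #> $shardkey
    #> [1] "id1"
    #> $shardchunks
    #> [1] 6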
-#' Increase or decrease the number of chunks in the disk.frame
-#' @param df the disk.frame to rechunk
-#' @param nchunks number of chunks
-#' @param shardby the shardkeys
-#' @param outdir the output directory
-#' @param overwrite overwrite the output directory
-#' @param shardby_function splitting of chunks: "hash" for hash function or "sort" for semi-sorted chunks
-#' @param sort_splits for the "sort" shardby function, a dataframe with the split values.
-#' @param desc_vars for the "sort" shardby function, the variables to sort descending.
-#' @export
-#' @examples
-#' # create a disk.frame with 2 chunks in tempdir()
-#' cars.df = as.disk.frame(cars, nchunks = 2)
-#'
-#' # re-chunking cars.df to 3 chunks, done "in-place" in the same folder as cars.df
-#' rechunk(cars.df, 3)
-#'
-#' new_path = tempfile(fileext = ".df")
-#' # re-chunking cars.df to 4 chunks, sharded by speed, and done "out-of-place" to a new directory
-#' cars2.df = rechunk(cars.df, 4, outdir = new_path, shardby = "speed")
-#'
-#' # clean up
-#' delete(cars.df)
-#' delete(cars2.df)
-rechunk <- function(df, nchunks, outdir = attr(df, "path"), shardby = NULL, overwrite = TRUE, shardby_function = "hash", sort_splits = NULL, desc_vars = NULL) {
-  # we need to force the chunks to be computed first, as it's common to make nchunks a multiple of nchunks(df);
-  # but if we do it too late then the folder could be empty
-  force(nchunks)
-
-  if (nchunks < 1) {
-    stop(glue::glue("nchunks must be at least 1"))
-  }
-
-  stopifnot("disk.frame" %in% class(df))
-
-  user_had_not_set_shard_by = is.null(shardby)
-  user_had_set_shard_by = !user_had_not_set_shard_by
-
-  # back up the files if writing to the same directory
-  if(outdir == attr(df, "path")) {
-    back_up_tmp_dir <- tempfile("back_up_tmp_dir")
-    fs::dir_create(back_up_tmp_dir)
-
-    fs::dir_copy(
-      file.path(outdir, ".metadata"), # from
-      file.path(back_up_tmp_dir, ".metadata") # to
-    )
-
-    # back up the files first
-    full_files = dir(outdir, full.names = TRUE)
-    short_files = dir(outdir)
-
-    # move all files to the back-up folder
-    purrr::map(full_files, ~{
-      fs::file_move(.x, back_up_tmp_dir)
-    })
-
-    if(fs::dir_exists(file.path(outdir, ".metadata"))) {
-      fs::dir_delete(file.path(outdir, ".metadata"))
-    }
-
-    # TODO check for validity
-    message(glue::glue("files have been backed up to temporary dir {back_up_tmp_dir}. You can recover these files until you restart your R session"))
-
-    df = disk.frame(back_up_tmp_dir)
-  }
-
-  overwrite_check(outdir, overwrite)
-
-  dfp = attr(df, "path")
-  existing_shardkey = shardkey(df)
-
-  # by default, if a shardkey is defined then rechunk will continue to reuse it
-  if (is.null(shardby)) {
-    shardby = existing_shardkey[[1]]
-  }
-
-  if(user_had_set_shard_by) {
-    return(hard_group_by(df, shardby, nchunks = nchunks, outdir = outdir, overwrite = TRUE, shardby_function = shardby_function, sort_splits = sort_splits, desc_vars = desc_vars))
-  } else if (identical(shardby, "") | is.null(shardby)) {
-    # if no existing shardby
-    nr = nrow(df)
-    nr_per_chunk = ceiling(nr/nchunks)
-    used_so_far = 0
-    done = FALSE
-    chunks_read = 1
-    chunks_written = 0
-    res = disk.frame(outdir)
-    a = get_chunk(df, 1)
-    used_so_far = nrow(a)
-    while(chunks_read < nchunks(df)) {
-      if((nr_per_chunk <= used_so_far)) {
-        disk.frame::add_chunk(res, a[1:nr_per_chunk,])
-        chunks_written = chunks_written + 1
-        a = a[-(1:nr_per_chunk),]
-        used_so_far = nrow(a)
-      } else {
-        chunks_read = chunks_read + 1
-        newa = get_chunk(df, chunks_read)
-        used_so_far = used_so_far + nrow(newa)
-        a = rbindlist(list(a, newa))
-        rm(newa)
-      }
-    }
-
-    while(chunks_written < nchunks) {
-      rows_to_write = min(nr_per_chunk, nrow(a))
-      disk.frame::add_chunk(res, a[1:rows_to_write,])
-      a = a[-(1:rows_to_write),]
-      chunks_written = chunks_written + 1
-    }
-
-    return(res)
-  } else if(!is.null(shardby)) { # there is an existing shardby, which has been replaced by the new one
-    if(user_had_not_set_shard_by) {
-      warning("shardby = NULL, but there are already shardkeys defined for this disk.frame. Therefore a rechunk algorithm that preserves the shardkeys has been applied; it is slower than one that ignores the shardkey.")
-    }
-
-    # using some maths we can cut down on the number of operations
-    nc = nchunks(df)
-
-    # TODO there is a bug here if the chunks are in numeric form!
-    # if the number of possible new chunk ids is one, then no resharding is needed; just merge those
-    possibles_new_chunk_id = purrr::map(1:nc, ~unique((.x-1 + (0:(nchunks-1))*nc) %% nchunks)+1)
-    lp = purrr::map_int(possibles_new_chunk_id, length)
-
-    # these need to be resharded
-    nts = which(lp != 1)
-
-    bad_boys = future.apply::future_lapply(nts, function(chunk_id) {
-      df1 = disk.frame::get_chunk(df, chunk_id)
-      disk.frame::shard(df1, shardby, nchunks = nchunks, overwrite = TRUE)
-    })
-
-    # for those that don't need to be resharded
-    tmp_fdlr = tempfile("rechunk_shard")
-    fs::dir_create(tmp_fdlr)
-
-    oks = furrr::future_map(which(lp == 1), function(i) {
-      file_chunk = file.path(attr(df, "path"), i %>% paste0(".fst"))
-      fs::file_move(file_chunk, file.path(tmp_fdlr, possibles_new_chunk_id[[i]] %>% paste0(".fst")))
-      disk.frame(tmp_fdlr)
-    })
-
-    list_of_sharded = c(bad_boys, oks)
-    new_one <- rbindlist.disk.frame(list_of_sharded, outdir = outdir, by_chunk_id = TRUE, overwrite = overwrite)
-
-    res = add_meta(new_one, nchunks = nchunks(new_one), shardkey = shardby, shardchunks = nchunks)
-    return(res)
-  } else {
-    stop("rechunk: option not supported")
-  }
-}
+#' The dplyr verbs implemented for disk.frame
+#' @description Please see the dplyr documentation for their usage. Please note
+#' that `chunk_arrange` performs the action within each chunk
+#' @export
+#' @importFrom dplyr select rename filter mutate transmute arrange do groups
+#'   group_by glimpse summarise
+#' @param ... Same as the dplyr functions
+#' @param .data a disk.frame
+#' @rdname dplyr_verbs
+#' @family dplyr verbs
+#' @examples
+#' library(dplyr)
+#' cars.df = as.disk.frame(cars)
+#' mult = 2
+#'
+#' # use any of the supported dplyr verbs
+#' cars2 = cars.df %>%
+#'   select(speed) %>%
+#'   mutate(speed2 = speed * mult) %>%
+#'   filter(speed < 50) %>%
+#'   rename(speed1 = speed) %>%
+#'   collect
+#'
+#' # clean up cars.df
+#' delete(cars.df)
+select.disk.frame <- function(.data, ...) {
+  quo_dotdotdot = rlang::enquos(...)
+  cmap(.data, ~{
+    code = rlang::quo(dplyr::select(.x, !!!quo_dotdotdot))
+    rlang::eval_tidy(code)
+  }, lazy = TRUE)
+}
+
+#' Kept for backwards-compatibility; to be removed in 0.3
+#' @export
+create_dplyr_mapper = function() {
+  stop("create_dplyr_mapper has been deprecated. Please use create_chunk_mapper instead")
+}
+
+#' @export
+#' @rdname dplyr_verbs
+rename.disk.frame <- create_chunk_mapper(dplyr::rename)
+
+#' @export
+#' @rdname dplyr_verbs
+filter.disk.frame <- create_chunk_mapper(dplyr::filter)
+
+#' @export
+#' @rdname dplyr_verbs
+#' @importFrom dplyr filter_all
+filter_all.disk.frame <- create_chunk_mapper(dplyr::filter_all)
+
+#' @export
+#' @rdname dplyr_verbs
+#' @importFrom dplyr filter_if
+filter_if.disk.frame <- create_chunk_mapper(dplyr::filter_if)
+
+#' @export
+#' @rdname dplyr_verbs
+#' @importFrom dplyr filter_at
+filter_at.disk.frame <- create_chunk_mapper(dplyr::filter_at)
+
+#' @export
+#' @rdname dplyr_verbs
+#' @importFrom future getGlobalsAndPackages
+#' @importFrom rlang eval_tidy quo enquos
+#' @importFrom dplyr mutate
+mutate.disk.frame <- create_chunk_mapper(dplyr::mutate)
+
+#' @export
+#' @importFrom dplyr transmute
+#' @rdname dplyr_verbs
+transmute.disk.frame <- create_chunk_mapper(dplyr::transmute)
+
+#' @export
+#' @importFrom dplyr arrange
+#' @rdname dplyr_verbs
+arrange.disk.frame = create_chunk_mapper(dplyr::arrange, warning_msg = "`arrange.disk.frame` is now deprecated. Please use `chunk_arrange` instead. This is in preparation for a more powerful `arrange` that sorts the whole disk.frame")
+
+#' @export
+#' @importFrom dplyr arrange
+#' @rdname dplyr_verbs
+chunk_arrange <- create_chunk_mapper(dplyr::arrange)
+
+#' @export
+#' @importFrom dplyr tally
+#' @rdname dplyr_verbs
+tally.disk.frame <- create_chunk_mapper(dplyr::tally)
+
+#' @export
+#' @importFrom dplyr count
+#' @rdname dplyr_verbs
+count.disk.frame <- create_chunk_mapper(dplyr::count)
+
+# TODO family is not required in group_by
+# TODO a lot of these .disk.frame functions are not generic
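select.disk.frame above spells out the quoting pattern that the create_chunk_mapper()-generated verbs share. A minimal stand-alone rendition of that factory idea (a sketch only, not the package's actual create_chunk_mapper, which also handles warning_msg and globals capture):

    chunk_mapper_sketch <- function(chunk_fn) {
      function(.data, ...) {
        quo_dotdotdot = rlang::enquos(...)  # capture the dots once, unevaluated
        cmap(.data, ~{
          # splice the captured arguments into the verb, applied to each chunk .x
          code = rlang::quo(chunk_fn(.x, !!!quo_dotdotdot))
          rlang::eval_tidy(code)
        }, lazy = TRUE)
      }
    }
    mutate_sketch <- chunk_mapper_sketch(dplyr::mutate)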
-#' Convert a SAS file (.sas7bdat format) to CSV or disk.frame by chunk
-#' @param infile the SAS7BDAT file
-#' @param chunk which chunk of nchunks to convert
-#' @param nchunks number of chunks
-#' @param sas2csvpath path to sas2csv.exe
-#' @param sep separator of the CSV file, defaults to |
-#' @family ingesting data
-#' @noRd
-sas_to_csv <- function(infile, chunk, nchunks = disk.frame::recommend_nchunks(fs::file_size(infile)), sas2csvpath = "sas2csv/sas2csv.exe", sep = "|") {
-  if(!file.exists(sas2csvpath)) {
-    stop("You must have sas2csv.exe installed. Only Windows is supported at the moment. Please contact the author")
-  }
-  sasfile = glue::glue('"{infile}"')
-  fs::dir_create(file.path("outcsv"))
-  fs::dir_create(file.path("outcsv", chunk))
-  options = glue::glue("-o outcsv/{chunk}/ -d {sep} -c -n {nchunks} -k {paste(chunk-1, collapse = ' ')} -m")
-
-  cmd = paste(sas2csvpath, sasfile, options)
-  system(cmd)
-}
-
-#' Convert a SAS file (.sas7bdat format) to disk.frame via CSVs
-#' @param inpath input SAS7BDAT file
-#' @param outpath output disk.frame
-#' @param nchunks number of chunks
-#' @param sep separator of the intermediate CSV file, defaults to |
-#' @param remove_csv TRUE/FALSE. Remove the intermediate CSV after usage?
-#' @importFrom future %<-%
-#' @family ingesting data
-#' @noRd
-sas_to_disk.frame = function(inpath, outpath, nchunks = disk.frame::recommend_nchunks(inpath), sas2csvpath = "sas2csv/sas2csv.exe", sep = "|", remove_csv = TRUE) {
-  files = file.path(outpath, paste0(1:nchunks, ".fst"))
-  ready = rep(FALSE, nchunks) | file.exists(files)
-  # ready = c(rep(T, 96), rep(F, 4))
-  extracting = rep(FALSE, nchunks)
-
-  fs::dir_create(outpath)
-  fs::dir_delete(file.path("outcsv"))
-
-  print("this program converts SAS datasets to CSV first, before conversion to disk.frame.")
-  print(glue::glue("the intermediate CSVs are here: {file.path(getwd(), 'outcsv')}"))
-
-  while(!all(ready)) {
-    done1 = FALSE
-    extracting_jobs = FALSE
-    for(w in which(!ready)) {
-      incsv = file.path("outcsv", w, paste0("_", w-1, ".csv"))
-      if(file.exists(incsv)) {
-        done1 = TRUE
-        ready[w] = TRUE
-        ok %<-% {
-          fst::write_fst(data.table::fread(incsv), file.path(outpath, paste0(w, ".fst.tmp")))
-          file.rename(file.path(outpath, paste0(w, ".fst.tmp")), file.path(outpath, paste0(w, ".fst")))
-          if(remove_csv) {
-            file.remove(incsv)
-          }
-          gc()
-        }
-        print(glue::glue("converting: {w} of {nchunks}; time: {Sys.time()}"))
-      } else if (!extracting_jobs & !extracting[w]) {
-        done1 = TRUE
-        extracting_jobs = TRUE
-        extracting[w] <- TRUE
-        ok %<-% {
-          sas_to_csv(inpath, w, nchunks, sas2csvpath, sep = sep)
-        }
-        print(glue::glue("extracting: {w} of {nchunks}; time: {Sys.time()}"))
-      }
-    }
-    if(!done1) {
-      print(glue::glue("didn't get any work: {Sys.time()}"))
-      Sys.sleep(18)
-    }
-  }
-}
+#' @export
+#' @importFrom dplyr add_count
+#' @rdname dplyr_verbs
+add_count.disk.frame <- create_chunk_mapper(dplyr::add_count)
+
+#' @export
+#' @importFrom dplyr add_tally
+#' @rdname dplyr_verbs
+add_tally.disk.frame <- create_chunk_mapper(dplyr::add_tally)
+
+#' @export
+#' @importFrom dplyr summarize
+#' @rdname chunk_group_by
+chunk_summarize <- create_chunk_mapper(dplyr::summarize)
+
+#' @export
+#' @importFrom dplyr summarise
+#' @rdname chunk_group_by
+chunk_summarise <- create_chunk_mapper(dplyr::summarise)
+
+#' @export
+#' @rdname dplyr_verbs
+#' @importFrom dplyr do
+do.disk.frame <- create_chunk_mapper(dplyr::do)
+
+#' @export
+#' @rdname dplyr_verbs
+#' @importFrom dplyr group_by_all
+chunk_group_by_all.disk.frame <- create_chunk_mapper(dplyr::group_by_all)
+
+#' @export
+#' @rdname dplyr_verbs
+#' @importFrom dplyr group_by_at
+chunk_group_by_at.disk.frame <- create_chunk_mapper(dplyr::group_by_at)
+
+#' @export
+#' @rdname dplyr_verbs
+#' @importFrom dplyr group_by_if
+chunk_group_by_if.disk.frame <- create_chunk_mapper(dplyr::group_by_if)
+
+#' @export
+#' @rdname dplyr_verbs
+#' @importFrom dplyr mutate_all
+mutate_all.disk.frame <- create_chunk_mapper(dplyr::mutate_all)
+
+#' @export
+#' @rdname dplyr_verbs
+#' @importFrom dplyr mutate_at
+mutate_at.disk.frame <- create_chunk_mapper(dplyr::mutate_at)
+
+#' @export
+#' @rdname dplyr_verbs
+#' @importFrom dplyr mutate_if
+mutate_if.disk.frame <- create_chunk_mapper(dplyr::mutate_if)
+
+#' @export
+#' @rdname dplyr_verbs
+#' @importFrom dplyr rename_all
+rename_all.disk.frame <- create_chunk_mapper(dplyr::rename_all)
+
+#' @export
+#' @rdname dplyr_verbs
+#' @importFrom dplyr rename_at
+rename_at.disk.frame <- create_chunk_mapper(dplyr::rename_at)
-#' @param x a disk.frame
-#' @param y a data.frame or disk.frame. If a data.frame, the join is returned lazily; if a disk.frame, the join is performed eagerly and returns a disk.frame
-#' @param outdir output directory for disk.frame
-#' @rdname join
-#' @export
-#' @examples
-#' cars.df = as.disk.frame(cars)
-#'
-#' join.df = semi_join(cars.df, cars.df)
-#'
-#' # clean up
-#' delete(cars.df)
-#' delete(join.df)
-semi_join.disk.frame <- function(x, y, by = NULL, copy = FALSE, ..., outdir = tempfile("tmp_disk_frame_semi_join"), merge_by_chunk_id = FALSE, overwrite = TRUE) {
-  stopifnot("disk.frame" %in% class(x))
-
-  overwrite_check(outdir, overwrite)
-
-  if("data.frame" %in% class(y)) {
-    quo_dotdotdot = enquos(...)
-    map_dfr(x, ~{
-      code = quo(semi_join(.x, y, by = by, copy = copy, !!!quo_dotdotdot))
-      rlang::eval_tidy(code)
-    })
-  } else if("disk.frame" %in% class(y)) {
-    if(is.null(merge_by_chunk_id)) {
-      stop("both x and y are disk.frames. You need to specify merge_by_chunk_id = TRUE or FALSE explicitly")
-    }
-    if(is.null(by)) {
-      by <- intersect(names(x), names(y))
-    }
-
-    ncx = nchunks(x)
-    ncy = nchunks(y)
-    if (merge_by_chunk_id == FALSE) {
-      warning("merge_by_chunk_id = FALSE. This will take significantly longer, and the preparations needed are performed eagerly, which may lead to poor performance. Consider making y a data.frame or setting merge_by_chunk_id = TRUE for better performance.")
-      x = hard_group_by(x, by, nchunks = max(ncy, ncx), overwrite = TRUE)
-      y = hard_group_by(y, by, nchunks = max(ncy, ncx), overwrite = TRUE)
-      return(semi_join.disk.frame(x, y, by, copy = copy, outdir = outdir, merge_by_chunk_id = TRUE, overwrite = overwrite))
-    } else if ((identical(shardkey(x)$shardkey, "") & identical(shardkey(y)$shardkey, "")) | identical(shardkey(x), shardkey(y))) {
-      res = map2.disk.frame(x, y, ~{
-        if(is.null(.y)) {
-          return(data.table())
-        } else if (is.null(.x)) {
-          return(data.table())
-        }
-        semi_join(.x, .y, by = by, copy = copy, ..., overwrite = overwrite)
-      }, outdir = outdir)
-      return(res)
-    } else {
-      # TODO if the shardkeys are the same and only the shardchunks differ, then re-sharding one of them is enough
-      stop("merge_by_chunk_id is TRUE, but shardkey(x) does NOT equal shardkey(y). You may want to perform a hard_group_by() on x and/or y, or set merge_by_chunk_id = FALSE")
-    }
-  }
-}
+#' @export
+#' @rdname dplyr_verbs
+#' @importFrom dplyr rename_if
+rename_if.disk.frame <- create_chunk_mapper(dplyr::rename_if)
+
+#' @export
+#' @rdname dplyr_verbs
+#' @importFrom dplyr select_all
+select_all.disk.frame <- create_chunk_mapper(dplyr::select_all)
+
+#' @export
+#' @rdname dplyr_verbs
+#' @importFrom dplyr select_at
+select_at.disk.frame <- create_chunk_mapper(dplyr::select_at)
+
+#' @export
+#' @rdname dplyr_verbs
+#' @importFrom dplyr select_if
+select_if.disk.frame <- create_chunk_mapper(dplyr::select_if)
+
+#' @export
+#' @rdname dplyr_verbs
+#' @importFrom dplyr summarise_all
+chunk_summarise_all <- create_chunk_mapper(dplyr::summarise_all)
+
+#' @export
+#' @rdname dplyr_verbs
+#' @importFrom dplyr summarise_at
+chunk_summarise_at <- create_chunk_mapper(dplyr::summarise_at)
+
+#' @export
+#' @rdname dplyr_verbs
+#' @importFrom dplyr summarize_all
+chunk_summarize_all <- create_chunk_mapper(dplyr::summarize_all)
+
+#' @export
+#' @rdname dplyr_verbs
+#' @importFrom dplyr summarize_at
+chunk_summarize_at <- create_chunk_mapper(dplyr::summarize_at)
+
+#' @export
+#' @rdname dplyr_verbs
+#' @importFrom dplyr summarize_if
+chunk_summarize_if <- create_chunk_mapper(dplyr::summarize_if)
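Because the chunk_-prefixed verbs above work within each chunk, a chunk-wise summary yields one row per chunk and needs a second, in-memory pass to combine. A small usage sketch:

    library(dplyr)
    cars.df = as.disk.frame(cars, nchunks = 2)

    cars.df %>%
      chunk_summarise(sumx = sum(speed), n = n()) %>%  # one partial row per chunk
      collect() %>%
      summarise(mean_speed = sum(sumx) / sum(n))        # combine the partials

    delete(cars.df)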
|
1 | +240 |
- #' Obtain one chunk by chunk id
+
|
||
2 | +241 |
- #' @param df a disk.frame
+ #' @export
|
||
3 | +242 |
- #' @param n the chunk id. If numeric then matches by number, if character then returns the chunk with the same name as n
+ #' @rdname dplyr_verbs
|
||
4 | +243 |
- #' @param keep the columns to keep
+ #' @importFrom dplyr distinct
|
||
5 | +244 |
- #' @param full.names whether n is the full path to the chunks or just a relative path file name. Ignored if n is numeric
+ distinct.disk.frame <- function(...) { |
||
6 | -+ | |||
245 | +! |
- #' @param ... passed to fst::read_fst or whichever read function is used in the backend
+ stop("`distinct.disk.frame` is not available. Please use `chunk_distinct`") |
||
7 | +246 |
- #' @export
+ }
|
||
8 | +247 |
- #' @examples
+
|
||
9 | +248 |
- #' cars.df = as.disk.frame(cars, nchunks = 2)
+
|
||
10 | +249 |
- #' get_chunk(cars.df, 1)
+ #' @export
|
||
11 | +250 |
- #' get_chunk(cars.df, 2)
+ #' @rdname dplyr_verbs
|
||
12 | +251 |
- #' get_chunk(cars.df, 1, keep = "speed")
+ #' @importFrom dplyr distinct
|
||
13 | +252 |
- #'
+ chunk_distinct <- create_chunk_mapper(dplyr::distinct, warning_msg = "the `distinct` function applies distinct chunk-wise") |
||
14 | +253 |
- #' # if full.names = TRUE then the full path to the chunk need to be provided
+
|
||
15 | +254 |
- #' get_chunk(cars.df, file.path(attr(cars.df, "path"), "1.fst"), full.names = TRUE)
+ #' The shard keys of the disk.frame
|
||
16 | +255 |
- #'
+ #' @return character
|
||
17 | +256 |
- #' # clean up cars.df
+ #' @export
|
||
18 | +257 |
- #' delete(cars.df)
+ #' @param x a disk.frame
|
||
19 | +258 |
- get_chunk <- function(...) {+ groups.disk.frame <- function(x){ |
||
20 | -1096x | +259 | +1x |
- UseMethod("get_chunk")+ shardkey(x) |
21 | +260 |
}
|
||
22 | +261 | |||
23 | +262 |
-
+ #' Group by within each disk.frame
|
||
24 | +263 |
- #' @rdname get_chunk
+ #' @description
|
||
25 | +264 |
- #' @importFrom fst read_fst
+ #' The disk.frame group by operation perform group WITHIN each chunk. This is
|
||
26 | +265 |
- #' @export
+ #' often used for performance reasons. If the user wishes to perform group-by,
|
||
27 | +266 |
- get_chunk.disk.frame <- function(df, n, keep = NULL, full.names = FALSE, ...) {+ #' they may choose to use the `hard_group_by` function which is expensive as it
|
||
28 | +267 |
- - |
- ||
29 | -1096x | -
- stopifnot("disk.frame" %in% class(df))+ #' reorganizes the chunks by the shard key.
|
||
30 | +268 |
- + #' @seealso hard_group_by group_by
|
||
31 | -1096x | +|||
269 | +
- path = attr(df,"path")+ #' @param .data a disk.frame
|
|||
32 | -1096x | +|||
270 | +
- keep1 = attr(df,"keep")+ #' @param ... passed to dplyr::group_by
|
|||
33 | +271 |
- + #' @export
|
||
34 | -1096x | +|||
272 | +
- cmds = attr(df,"lazyfn")+ #' @rdname chunk_group_by
|
|||
35 | -1096x | +|||
273 | +
- filename = ""+ #' @export
|
|||
36 | +274 |
- + chunk_group_by <- create_chunk_mapper(dplyr::group_by) |
||
37 | -1096x | +|||
275 | +
- if (typeof(keep) == "closure") {+ |
|||
38 | -! | +|||
276 | +
- keep = keep1+ #' @rdname chunk_group_by
|
|||
39 | -1096x | +|||
277 | +
- } else if(!is.null(keep1) & !is.null(keep)) {+ #' @export
|
|||
40 | -! | +|||
278 | +
- keep = intersect(keep1, keep)+ chunk_ungroup = create_chunk_mapper(dplyr::ungroup) |
|||
41 | -! | +|||
279 | +
- if (!all(keep %in% keep1)) {+ |
|||
42 | -! | +|||
280 | +
- warning("some of the variables specified in keep = {keep} is not available")+ # do not introduce it as it was never introduced
|
|||
43 | +281 |
- }
+ #ungroup.disk.frame( < - create_dplyr_mapper(dplyr::ungroup, , warning_msg="`ungroup.disk.frame` is now deprecated. Please use `chunk_ungroup` instead. This is in preparation for a more powerful `group_by` framework")
|
||
44 | -1096x | +|||
282 | +
- } else if(is.null(keep)) {+ |
|||
45 | -1096x | +|||
283 | +
- keep = keep1+ |
|||
46 | +284 |
- }
+ #' @export
|
||
47 | +285 |
- + #' @rdname dplyr_verbs
|
||
48 | -1096x | +|||
286 | +
- if(is.numeric(n)) {+ glimpse.disk.frame <- function(.data, ...) { |
|||
49 | -28x | +|||
287 | +! |
- filename = file.path(path, paste0(n,".fst"))+ glimpse(head(.data, ...), ...) |
||
50 | -28x | +|||
288 | +
- if(!file.exists(filename)) {+ }
|
|||
51 | -! | +|||
289 | +
- filename = list.files(path, full.names = TRUE)[n]+ |
|||
52 | +290 |
- }
+ # Internal methods
|
||
53 | +291 |
- } else {+ # @param .data the data
|
||
54 | -1068x | +|||
292 | +
- if (full.names) {+ # @param cmd the function to record
|
|||
55 | -1041x | +|||
293 | +
- filename = n+ record <- function(.data, cmd){ |
|||
56 | -+ | |||
294 | +! |
- } else {+ attr(.data,"lazyfn") <- c(attr(.data,"lazyfn"), list(cmd)) |
||
57 | -27x | +|||
295 | +! |
- filename = file.path(path, n)+ .data
|
||
58 | +296 |
- }
+ }
|
||
59 | +297 |
- }
+
|
||
60 | +298 |
- + # Internal methods
|
||
61 | +299 |
- # if the file you are looking for don't exist
+ # @param .data the disk.frame
|
||
62 | -1096x | +|||
300 | +
- if (!fs::file_exists(filename)) {+ # @param cmds the list of function to play back
|
|||
63 | -! | +|||
301 | +
- warning(glue("The chunk {filename} does not exist; returning an empty data.table"))+ play <- function(.data, cmds=NULL) { |
|||
64 | -! | +|||
302 | +397x |
- notbl <- data.table()+ for (cmd in cmds){ |
||
65 | -! | +|||
303 | +466x |
- attr(notbl, "does not exist") <- TRUE+ if (typeof(cmd) == "closure") { |
||
66 | +304 | ! |
- return(notbl)+ .data <- cmd(.data) |
|
67 | +305 |
- }
+ } else { |
||
68 | +306 |
-
+ # create a temporary environment
|
||
69 | -1096x | +307 | +466x |
- if (is.null(cmds)) {+ an_env = new.env(parent = environment()) |
70 | -669x | +|||
308 | +
- if(typeof(keep)!="closure") {+ |
|||
71 | -669x | +309 | +466x |
- fst::read_fst(filename, columns = keep, as.data.table = TRUE,...)+ ng = names(cmd$vars_and_pkgs$globals) |
72 | +310 |
- } else {+ |
||
73 | -! | +|||
311 | +466x |
- fst::read_fst(filename, as.data.table = TRUE,...)+ if(length(ng) > 0) { |
||
74 | -+ | |||
312 | +388x |
- }
+ for(i in 1:length(cmd$vars_and_pkgs$globals)) { |
||
75 | -+ | |||
313 | +1369x |
- } else {+ g = cmd$vars_and_pkgs$globals[[i]] |
||
76 | -427x | +314 | +1369x |
- if(typeof(keep)!="closure") {+ assign(ng[i], g, pos = an_env) |
77 | -427x | +|||
315 | +
- play(fst::read_fst(filename, columns = keep, as.data.table = TRUE,...), cmds)+ }
|
|||
78 | +316 |
- } else {+ }
|
||
79 | -! | +|||
317 | +
- play(fst::read_fst(filename, as.data.table = TRUE,...), cmds)+ + |
+ |||
318 | +466x | +
+ .data <- do.call(cmd$func, c(list(.data),cmd$dotdotdot), envir = an_env) |
||
80 | +319 |
}
|
||
81 | +320 |
}
|
||
321 | +395x | +
+ .data
+ |
+ ||
82 | +322 |
}
@@ -5353,4102 +8972,4202 @@ disk.frame coverage - 51.96% |
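play() above replays each recorded step inside a scratch environment seeded with the step's captured globals. A hedged mock of the cmd structure it consumes (field names follow the accesses in play(); the real objects are built elsewhere via future::getGlobalsAndPackages):

    cmd = list(
      func = function(chunk, m) chunk * m,      # the chunk function to replay
      dotdotdot = list(m = 2),                  # extra arguments captured at record time
      vars_and_pkgs = list(globals = list())    # globals to seed into the scratch env
    )
    an_env = new.env(parent = environment())
    do.call(cmd$func, c(list(data.frame(x = 1:3)), cmd$dotdotdot), envir = an_env)
    #   x
    # 1 2
    # 2 4
    # 3 6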
@@ -5353,4102 +8972,4202 @@ disk.frame coverage - 51.96%
-#' Recommend number of chunks based on input size
-#' @description Computes the recommended number of chunks to break a data.frame
-#' into. It can accept file sizes in bytes (as integer) or a data.frame
-#' @param df a disk.frame, or the file size in bytes of a CSV file holding the
-#' data
-#' @param type only type = "csv" is supported. It indicates the file type
-#' corresponding to file size `df`
-#' @param minchunks the minimum number of chunks. Defaults to the number of CPU
-#' cores (without hyper-threading)
-#' @param conservatism a multiplier to the recommended number of chunks. The
-#' more chunks, the smaller the chunk size and the more likely that each chunk
-#' can fit into RAM
-#' @param ram_size The amount of RAM available, which is usually computed. Except on RStudio with R3.6+
-#' @importFrom pryr object_size
-#' @importFrom utils memory.limit
-#' @export
-#' @examples
-#' # recommend nchunks based on a data.frame
-#' recommend_nchunks(cars)
-#'
-#' # recommend nchunks based on file size; ONLY CSV is implemented at the moment
-#' recommend_nchunks(1024^3)
-recommend_nchunks <- function(df, type = "csv", minchunks = parallel::detectCores(logical = FALSE), conservatism = 2, ram_size = df_ram_size()) {
-  dfsize = 0
-  if ("data.frame" %in% class(df)) {
-    # the df's size in gigabytes
-    dfsize = as.numeric(pryr::object_size(df))/1024/1024/1024
-  } else if ("disk.frame" %in% class(df)) {
-    return(nchunks(df))
-  } else if (is.numeric(df) & type == "csv") {
-    # assume that df is the estimated number of bytes of the data
-    dfsize = df/1024/1024/1024
-  } else {
-    dfsize = df/1024/1024/1024
-  }
-
-  ram_size = df_ram_size()
-
-  # the number of physical cores, not counting hyper-threaded ones as 2; they are counted as 1
-  nc = parallel::detectCores(logical = FALSE)
-
-  max(round(dfsize/ram_size*nc)*nc*conservatism, minchunks)
-}
-
-#' Get the size of RAM in gigabytes
-#'
-#' @return integer of RAM in gigabytes (GB)
-#' @export
-#' @importFrom bit64 as.integer64.character
-#' @examples
-#' # returns the RAM size in gigabytes (GB)
-#' df_ram_size()
-df_ram_size <- function() {
-  ram_size = NULL
-  # the amount of memory available in gigabytes
-  if (Sys.info()[["sysname"]] == "Windows") {
-    if(.Platform$GUI == "RStudio") {
-      majorv = as.integer(version$major)
-      minorv = as.integer(strsplit(version$minor, ".", fixed = TRUE)[[1]][1])
-      if(majorv >= 3 & minorv >= 6) {
-        ram_size <- system("wmic MemoryChip get Capacity", intern = TRUE) %>%
-          map(~strsplit(.x, " ")) %>%
-          unlist %>%
-          map(~bit64::as.integer64.character(.x)/1024^3) %>%
-          unlist %>%
-          sum(na.rm = TRUE)
-
-        if(is.na(ram_size)) {
-          if (is.null(ram_size)) {
-            message("You are running RStudio with R 3.6+ on Windows. There is a bug with RAM size detection.")
-            message("And disk.frame can't determine your RAM size using manual methods.")
-            message("Going to assume your RAM size is 16GB (gigabytes). The program will continue to run.")
-            message("")
-            message("Please report a bug at https://github.com/xiaodaigh/disk.frame/issues")
-            message("Include this in your bug report:")
-            message(system("wmic MemoryChip get Capacity", intern = TRUE))
-            message("")
-            #message("The option disk.frame.ram_size is not set.
-            #message("To set the ram_size, do options(disk.frame_ram_size = your_ram_size_in_gigabytes)
+#' One Stage function
+#' @param x the input
+#' @param listx a list
+#' @param na.rm Remove NAs. TRUE or FALSE
+#' @param ... additional options
+#' @rdname one-stage-group-by-verbs
+#' @export
+var_df.chunk_agg.disk.frame <- function(x, na.rm = FALSE) {
+  # Guard against Github #241
+  data.frame(
+    sumx = sum(x, na.rm = na.rm),
+    sumsqrx = sum(x^2, na.rm = na.rm),
+    nx = length(x) - ifelse(na.rm, sum(is.na(x)), 0)
+  )
+}
+
+#' @export
+#' @rdname one-stage-group-by-verbs
+#' @importFrom dplyr bind_rows
+var_df.collected_agg.disk.frame <- function(listx) {
+  df = Reduce(dplyr::bind_rows, listx)
+
+  sumlengthx = sum(df$nx)
+
+  first_part = sum(df$sumsqrx) / sumlengthx
+  second_part = sum(df$sumx) / sumlengthx
+
+  # unbiased adjustment
+  (first_part - second_part^2) * sumlengthx / (sumlengthx - 1)
+}
+
+#' @export
+#' @rdname one-stage-group-by-verbs
+sd_df.chunk_agg.disk.frame <- var_df.chunk_agg.disk.frame
+
+#' @export
+#' @rdname one-stage-group-by-verbs
+sd_df.collected_agg.disk.frame <- function(listx) {
+  sqrt(var_df.collected_agg.disk.frame(listx))
+}
+
+#' mean chunk_agg
+#' @export
+#' @rdname one-stage-group-by-verbs
+mean_df.chunk_agg.disk.frame <- function(x, na.rm = FALSE, ...) {
+  sumx = sum(x, na.rm = na.rm)
+  lengthx = length(x) - ifelse(na.rm, sum(is.na(x)), 0)
+  data.frame(sumx = sumx, lengthx = lengthx)
+}
+
+#' mean collected_agg
+#' @export
+#' @rdname one-stage-group-by-verbs
+mean_df.collected_agg.disk.frame <- function(listx) {
+  sum(sapply(listx, function(x) x$sumx)) / sum(sapply(listx, function(x) x$lengthx))
+}
+
+#' @export
+#' @rdname one-stage-group-by-verbs
+sum_df.chunk_agg.disk.frame <- function(x, ...) {
+  sum(x, ...)
+}
+
+#' @export
+#' @rdname one-stage-group-by-verbs
+sum_df.collected_agg.disk.frame <- function(listx, ...) {
+  sum(unlist(listx), ...)
+}
+
+#' @export
+#' @rdname one-stage-group-by-verbs
+min_df.chunk_agg.disk.frame <- function(x, ...) {
+  min(x, ...)
+}
+
+#' @export
+#' @rdname one-stage-group-by-verbs
+min_df.collected_agg.disk.frame <- function(listx, ...) {
+  min(unlist(listx), ...)
+}
||
46 |
-
+ mean_df.chunk_agg.disk.frame <- function(x, na.rm = FALSE, ...) { |
||
47 | -+ | ! |
-
+ sumx = sum(x, na.rm = na.rm) |
48 | -+ | ! |
- #' Get the size of RAM in gigabytes
+ lengthx = length(x) - ifelse(na.rm, sum(is.na(x)), 0) |
49 | -+ | ! |
- #'
+ data.frame(sumx = sumx, lengthx = lengthx) |
50 |
- #' @return integer of RAM in gigabyte (GB)
+ }
|
||
51 |
- #' @export
+
|
||
52 |
- #' @importFrom bit64 as.integer64.character
+ #' mean collected_agg
|
||
53 |
- #' @examples
+ #' @export
|
||
54 |
- #' # returns the RAM size in gigabyte (GB)
+ #' @rdname one-stage-group-by-verbs
|
||
55 |
- #' df_ram_size()
+ mean_df.collected_agg.disk.frame <- function(listx) { |
||
56 | -+ | ! |
- df_ram_size <- function() {+ sum(sapply(listx, function(x) x$sumx))/sum(sapply(listx, function(x) x$lengthx)) |
57 | -34x | +
- ram_size = NULL+ }
|
|
58 |
- # the amount of memory available in gigabytes
+
|
||
59 | -34x | +
- if (Sys.info()[["sysname"]] == "Windows") {+ #' @export
|
|
60 | -34x | +
- if(.Platform$GUI == "RStudio") {+ #' @rdname one-stage-group-by-verbs
|
|
61 | -! | +
- majorv = as.integer(version$major)+ sum_df.chunk_agg.disk.frame <- function(x, ...) { |
|
62 | ! |
- minorv = as.integer(strsplit(version$minor, ".", fixed=TRUE)[[1]][1])+ sum(x, ...) |
|
63 | -! | +
- if(majorv>=3 & minorv >= 6) {+ }
|
|
64 | -! | +
- ram_size <- system("wmic MemoryChip get Capacity", intern=TRUE) %>%+ |
|
65 | -! | +
- map(~strsplit(.x, " ")) %>%+ #' @export
|
|
66 | -! | +
- unlist %>%+ #' @rdname one-stage-group-by-verbs
|
|
67 | -! | +
- map(~bit64::as.integer64.character(.x)/1024^3) %>%+ sum_df.collected_agg.disk.frame <- function(listx, ...) { |
|
68 | ! |
- unlist %>%+ sum(unlist(listx), ...) |
|
69 | -! | +
- sum(na.rm=TRUE)+ }
|
|
70 |
- + |
||
71 | -! | +
- if(is.na(ram_size)) {+ #' @export
|
|
72 | -! | +
- if (is.null(ram_size)) {+ #' @rdname one-stage-group-by-verbs
|
|
73 | -! | +
- message("You are running RStudio with R 3.6+ on Windows. There is a bug with RAM size detection.")+ min_df.chunk_agg.disk.frame <- function(x, ...) { |
|
74 | ! |
- message("And disk.frame can't determine your RAM size using manual methods.")+ min(x, ...) |
|
75 | -! | +
- message("Going to assume your RAM size is 16GB (gigabyte). The program will continue to run.")+ }
|
|
76 | -! | +
- message("")+ |
|
77 | -! | +
- message("")+ #' @export
|
|
78 | -! | +
- message("Please report a bug at https://github.com/xiaodaigh/disk.frame/issues")+ #' @rdname one-stage-group-by-verbs
|
|
79 | -! | +
- message("Include this in your bug report:")+ min_df.collected_agg.disk.frame <- function(listx, ...) { |
|
80 | ! |
- message(system("wmic MemoryChip get Capacity", intern=TRUE))+ min(unlist(listx), ...) |
|
81 | -! | +
- message("")+ }
|
|
82 | -! | +
- message("")+ |
|
83 |
- #message("The option disk.frame.ram_size is not set.
+ #' @export
|
||
84 |
- #message("To set the ram_size, do options(disk.frame_ram_size = your_ram_size_in_gigabytes)")
+ #' @rdname one-stage-group-by-verbs
|
||
85 | -! | +
- ram_size = 16+ max_df.chunk_agg.disk.frame <- function(x, ...) { |
|
86 | -+ | ! |
- }
+ max(x, ...) |
87 |
- }
+ }
|
||
88 |
- }
+
|
||
89 |
- } else {+ #' @export
|
||
90 | -34x | +
- ram_size = memory.limit()/1024+ #' @rdname one-stage-group-by-verbs
|
|
91 |
- }
+ max_df.collected_agg.disk.frame <- function(listx, ...) { |
||
92 | -+ | ! |
- } else {+ max(unlist(listx), ...) |
93 |
- #ram_size = as.numeric(system('grep MemTotal /proc/meminfo', ignore.stdout = TRUE) / 1024)
+ }
|
||
94 | -! | +
- a = system('grep MemTotal /proc/meminfo', intern = TRUE)+ |
|
95 | -! | +
- l = strsplit(a, " ")[[1]]+ #' @export
|
|
96 | -! | +
- l = as.numeric(l[length(l)-1])+ #' @rdname one-stage-group-by-verbs
|
|
97 | -! | +
- ram_size = l/1024^2+ #' @importFrom stats median
|
|
98 |
- #ram_size = benchmarkme::get_ram()/1024/1024/1024
+ median_df.chunk_agg.disk.frame <- function(x, ...) { |
||
99 | -+ | ! |
- }
+ stats::median(x, ...) |
100 |
- + }
|
||
101 | -34x | +
- if(is.na(ram_size)) {+ |
|
102 | -! | +
- warning("RAM size not detected. Assumme you have at least 16GB of RAM")+ #' @export
|
|
103 | -! | +
- ram_size = 16+ #' @rdname one-stage-group-by-verbs
|
|
104 |
- }
+ median_df.collected_agg.disk.frame <- function(listx, ...) { |
||
105 | -+ | ! |
- # assume at least 1G of RAM
+ stats::median(unlist(listx), ...) |
106 | -34x | +
- ram_size = max(ram_size, 1, na.rm = TRUE)+ }
|
|
107 |
- + |
||
108 | -34x | +
- ram_size
+ #' @export
|
|
109 |
- }
+ #' @rdname one-stage-group-by-verbs
|
1 | +110 |
- #' Returns the shardkey (not implemented yet)
+ #' @importFrom dplyr n
|
|
2 | +111 |
- #' @importFrom jsonlite fromJSON
+ n_df.chunk_agg.disk.frame <- function(...) {+ |
+ |
112 | +! | +
+ dplyr::n() |
|
3 | +113 |
- #' @param df a disk.frame
+ }
|
|
4 | +114 | ++ | + + | +
115 |
#' @export
|
||
5 | +116 |
- # TODO make this work
+ #' @rdname one-stage-group-by-verbs
|
|
6 | +117 |
- shardkey <- function(df) {+ n_df.collected_agg.disk.frame <- function(listx, ...) { |
|
7 | -333x | +||
118 | +! |
- meta_file = file.path(attr(df,"path"),".metadata", "meta.json")+ sum(unlist(listx)) |
|
8 | -333x | +||
119 | ++ |
+ }
+ |
+ |
120 | ++ | + + | +|
121 | ++ |
+ #' @export
+ |
+ |
122 | ++ |
+ #' @rdname one-stage-group-by-verbs
+ |
+ |
123 | +
- if(!file.exists(meta_file)) {+ length_df.chunk_agg.disk.frame <- function(x, ...) { |
||
9 | +124 | ! |
- add_meta(df)+ length(x, ...) |
10 | +125 |
- }
+ }
|
|
11 | -333x | +||
126 | +
- meta = jsonlite::fromJSON(meta_file)+ |
||
12 | -333x | +||
127 | +
- list(shardkey = meta$shardkey, shardchunks = meta$shardchunks)+ #' @export
|
||
13 | +128 |
- }
+ #' @rdname one-stage-group-by-verbs
|
|
14 | +129 |
-
+ length_df.collected_agg.disk.frame <- function(listx, ...) {+ |
+ |
130 | +! | +
+ length(unlist(listx), ...) |
|
15 | +131 |
- #' Compare two disk.frame shardkeys
+ }
|
|
16 | +132 |
- #'
+
|
|
17 | +133 |
- #' @param sk1 shardkey1
+ #' @export
|
|
18 | +134 |
- #' @param sk2 shardkey2
+ #' @rdname one-stage-group-by-verbs
|
|
19 | +135 |
- #' @export
+ any_df.chunk_agg.disk.frame <- function(x, ...) {+ |
+ |
136 | +! | +
+ any(x, ...) |
|
20 | +137 |
- shardkey_equal <- function(sk1, sk2) {+ }
|
|
21 | -2x | +||
138 | +
- if (sk1$shardkey == "") {+ |
||
22 | +139 |
- # if the shardkey is not set then it's the same as having no shardkey
+ #' @export
|
|
23 | -2x | +||
140 | +
- return(FALSE)+ #' @rdname one-stage-group-by-verbs
|
||
24 | +141 |
- }
+ any_df.collected_agg.disk.frame <- function(listx, ...) { |
|
25 | +142 | ! |
- (sk1$shardkey == sk2$skardkey) && (sk1$shardchunks == sk2$shardchunks)+ any(unlist(listx), ...) |
26 | +143 |
}
|
1 | +144 |
- # Alternative to hashstr2i that can produce semi-sorted chunks
+
|
|
2 | +145 |
- # Apply as e.g.:
+ #' @export
|
|
3 | +146 |
- # split_values <- map(dff, sample_n, size=1) %>%
+ #' @rdname one-stage-group-by-verbs
|
|
4 | +147 |
- # select(c("id1", "id2")) %>%
+ all_df.chunk_agg.disk.frame <- function(x, ...) { |
|
5 | -+ | ||
148 | +! |
- # collect() %>%
+ all(x, ...) |
|
6 | +149 |
- # arrange(id1, id2)
+ }
|
|
7 | +150 |
- #
+
|
|
8 | +151 |
- # shard_by_rule <- splitstr2i(split_values)
+ #' @export
|
|
9 | +152 |
- # code = glue::glue("df[,.out.disk.frame.id := {shard_by_rule}]")
+ #' @rdname one-stage-group-by-verbs
|
|
10 | +153 |
-
+ all_df.collected_agg.disk.frame <- function(listx, ...) { |
|
11 | -+ | ||
154 | +! |
- # Check if date
+ all(unlist(listx), ...) |
|
12 | -100x | +||
155 | +
- is.date <- function(x) inherits(x, 'Date')+ }
|
||
13 | +156 | ||
14 | +157 |
- # Escapes names
+ #' @export
|
|
15 | +158 |
- # Factors are converted to numbers
+ #' @rdname one-stage-group-by-verbs
|
|
16 | +159 |
- escape_name <- function(name, x) {+ n_distinct_df.chunk_agg.disk.frame <- function(x, na.rm = FALSE, ...) { |
|
17 | -50x | +||
160 | +! |
- if(is.factor(x)){+ if(na.rm) { |
|
18 | -2x | +||
161 | +! |
- paste0("as.numeric(", name, ")")+ setdiff(unique(x), NA) |
|
19 | +162 |
} else { |
|
20 | -48x | +||
163 | +! |
- name
+ unique(x) |
|
21 | +164 |
}
|
|
22 | +165 |
}
|
|
23 | +166 | ||
24 | +167 |
- # Escapes values
+ #' @export
|
|
25 | +168 |
- # Strings and dates are quoted
+ #' @importFrom dplyr n_distinct
|
|
26 | +169 |
- # Factors are converted to number
+ #' @rdname one-stage-group-by-verbs
|
|
27 | +170 |
- escape_value <- function(x) {+ n_distinct_df.collected_agg.disk.frame <- function(listx, ...) { |
|
28 | -100x | +||
171 | +! |
- if(is.character(x) | is.date(x)){+ n_distinct(unlist(listx)) |
|
29 | -72x | +||
172 | +
- paste0("\"", x, "\"")+ }
|
||
30 | -28x | +||
173 | +
- } else if(is.factor(x)){+ |
||
31 | -4x | +||
174 | +
- as.numeric(x)+ #' @export
|
||
32 | +175 |
- } else {+ #' @rdname one-stage-group-by-verbs
|
|
33 | -24x | +||
176 | +
- x
+ #' @importFrom stats quantile
|
||
34 | +177 |
- }
+ quantile_df.chunk_agg.disk.frame <- function(x, ...) {+ |
+ |
178 | +! | +
+ stats::quantile(x, ...) |
|
35 | +179 |
}
|
|
36 | +180 | ||
37 | +181 |
- # Switch condition - returns
+ #' @export
|
|
38 | +182 |
- # ({name} < {split_value} | ({name} == {split_value} ...
+ #' @rdname one-stage-group-by-verbs
|
|
39 | +183 |
-
+ #' @importFrom stats quantile
|
|
40 | +184 |
- switchcond <- function(name, split_values, desc_vars){+ quantile_df.collected_agg.disk.frame <- function(listx, ...) { |
|
41 | -50x | +||
185 | +! |
- paste0("(",+ stats::quantile(unlist(listx), ...) |
|
42 | -50x | +||
186 | +
- escape_name(name, split_values[,name]),+ }
|
||
43 | -50x | +||
187 | +
- ifelse(name %in% desc_vars, " < ", " > "),+ |
||
44 | -50x | +||
188 | +
- escape_value(split_values[,name]),+ #' @export
|
||
45 | +189 |
- " | (",
+ #' @rdname one-stage-group-by-verbs
|
|
46 | -50x | +||
190 | +
- name,
+ #' @importFrom stats quantile
|
||
47 | +191 |
- " == ",
+ IQR_df.chunk_agg.disk.frame <- function(x, na.rm = FALSE, ...) { |
|
48 | -50x | +||
192 | +! |
- escape_value(split_values[,name])+ stats::quantile(x, c(0.25, 0.75), na.rm = na.rm) |
|
49 | +193 |
- )
+ #100
|
|
50 | +194 |
}
|
|
51 | +195 | ||
52 | +196 |
- # Composes the switch conditions, so each split row becomes
+ #' @export
|
|
53 | +197 |
- # ({name1} < {split_value1} | ({name1} == {split_value1} &
+ #' @rdname one-stage-group-by-verbs
|
|
54 | +198 |
- # ({name2} < {split_value2} | ({name2} == {split_value2} ...)))) * 1
+ #' @importFrom stats quantile
|
|
55 | +199 |
- # the sum of the split row is the id
+ IQR_df.collected_agg.disk.frame <- function(listx, ...) { |
|
56 | -+ | ||
200 | +! |
- sortablestr2i <- function(split_values, desc_vars){+ q25 = unlist(listx)[c(TRUE, FALSE)] |
|
57 | -32x | +||
201 | +! |
- do.call(+ q75 = unlist(listx)[c(FALSE, TRUE)] |
|
58 | -32x | +||
202 | +! |
- paste,
+ quantile(q75, 0.75) - quantile(q25, 0.25) |
|
59 | -32x | +||
203 | +
- c(+ }
|
||
60 | -32x | +||
204 | +
- lapply(+ |
||
61 | -32x | +||
205 | +
- as.list(do.call(paste, c(lapply(colnames(split_values), switchcond, split_values, desc_vars), sep=" & "))),+ |
||
62 | -152x | +||
206 | +
- function(x) { paste0("(", x, paste0(rep(")", ncol(split_values) * 2 + 1), collapse = ""), "* 1")}+ #' A function to parse the summarize function
|
||
63 | +207 |
- ),
+ #' @importFrom dplyr filter select pull
|
|
64 | -32x | +||
208 | +
- sep = " + "+ #' @importFrom purrr map_dfr
|
||
65 | +209 |
- )
+ #' @rdname group_by
|
|
66 | +210 |
- )
+ #' @export
|
|
67 | +211 |
- }
+ summarise.grouped_disk.frame <- function(.data, ...) { |
1 | +212 |
- #' Merge function for disk.frames
+ |
|
2 | +213 |
- #' @export
+ |
|
3 | -+ | ||
214 | +1x |
- #' @param x a disk.frame
+ ca_code = generate_summ_code(...) |
|
4 | +215 |
- #' @param y a disk.frame or data.frame
+ |
|
5 | -+ | ||
216 | +1x |
- #' @param by the merge by keys
+ if(is.null(names(ca_code))) { |
|
6 | -+ | ||
217 | +! |
- #' @param outdir The output directory for the disk.frame
+ return(eval(parse(text = glue::glue(".data %>% {rlang::as_label(ca_code)}")))) |
|
7 | -+ | ||
218 | +1x |
- #' @param merge_by_chunk_id if TRUE then only chunks in df1 and df2 with the same chunk id will get merged
+ } else if("chunk_summ_code" %in% names(ca_code)) { |
|
8 | -+ | ||
219 | +1x |
- #' @param overwrite overwrite the outdir or not
+ chunk_summ_code = ca_code$chunk_summ_code |
|
9 | -+ | ||
220 | +1x |
- #' @param ... passed to merge and map.disk.frame
+ agg_summ_code = ca_code$agg_summ_code |
|
10 | +221 |
- #' @importFrom data.table data.table setDT
+ |
|
11 | +222 |
- #' @examples
+ # get the by variables
|
|
12 | -+ | ||
223 | +1x |
- #' b = as.disk.frame(data.frame(a = 51:150, b = 1:100))
+ group_by_cols = purrr::map_chr(attr(.data, "group_by_cols", exact=TRUE), ~{deparse(.x)}) |
|
13 | +224 |
- #' d = as.disk.frame(data.frame(a = 151:250, b = 1:100))
+ |
|
14 | +225 |
- #' bd.df = merge(b, d, by = "b")
+ # generate full code
+ |
+ |
226 | +1x | +
+ code_to_run = glue::glue("chunk_group_by({paste0(group_by_cols, collapse=',')}) %>% chunk_summarize({chunk_summ_code}) %>% collect %>% group_by({paste0(group_by_cols, collapse=',')}) %>% summarize({agg_summ_code})") |
|
15 | +227 |
- #'
+ + |
+ |
228 | +1x | +
+ class(.data) <- c("summarized_disk.frame", "disk.frame")+ |
+ |
229 | +1x | +
+ attr(.data, "summarize_code") = code_to_run+ |
+ |
230 | +1x | +
+ return(.data) |
|
16 | +231 |
- #' # clean up cars.df
+ } else { |
|
17 | -+ | ||
232 | +! |
- #' delete(b)
+ stop("something's wrong mate") |
|
18 | +233 |
- #' delete(d)
+ }
|
|
19 | +234 |
- #' delete(bd.df)
+ }
|
|
20 | +235 |
- merge.disk.frame <- function(x, y, by, outdir = tempfile(fileext = ".df"), ..., merge_by_chunk_id = FALSE, overwrite = FALSE) {- |
- |
21 | -3x | -
- stopifnot("disk.frame" %in% class(x))- |
- |
22 | -3x | -
- overwrite_check(outdir, overwrite = TRUE)+ |
|
23 | +236 |
- #fs::dir_create(outdir)
+ #' @export
|
|
24 | +237 |
- + #' @rdname group_by
|
|
25 | -3x | +||
238 | +
- if("data.frame" %in% class(y)) {+ summarize.grouped_disk.frame = summarise.grouped_disk.frame |
||
26 | -1x | +||
239 | +
- yby = c(list(y=y, by=by), list(...))+ |
||
27 | -1x | +||
240 | +
- res = map(x, ~{+ #' Group by within each disk.frame
|
||
28 | -5x | +||
241 | +
- res = do.call(merge, c(list(x = .x), yby))+ #' @description
|
||
29 | -5x | +||
242 | +
- res
+ #' The disk.frame group by operation perform group WITHIN each chunk. This is
|
||
30 | -1x | +||
243 | +
- }, outdir=outdir, ...)+ #' often used for performance reasons. If the user wishes to perform group-by,
|
||
31 | -1x | +||
244 | +
- res
+ #' they may choose to use the `hard_group_by` function which is expensive as it
|
||
32 | -2x | +||
245 | +
- } else if (merge_by_chunk_id | shardkey_equal(shardkey(x), shardkey(y))) {+ #' reorganizes the chunks by the shard key.
|
||
33 | +246 |
- # ifthe shardkeys are the same then only need to match by segment id
+ #' @seealso hard_group_by
|
|
34 | +247 |
- # as account with the same shardkey must end up in the same segment
+ #' @param .data a disk.frame
|
|
35 | -1x | +||
248 | +
- path1 = attr(x,"path")+ #' @param add from dplyr
|
||
36 | -1x | +||
249 | +
- path2 = attr(y,"path")+ #' @param .drop from dplyr
|
||
37 | +250 |
- + #' @param ... same as the dplyr::group_by
|
|
38 | -1x | +||
251 | +
- df3 = merge(+ #' @importFrom dplyr group_by_drop_default
|
||
39 | -1x | +||
252 | +
- data.table(+ #' @export
|
||
40 | -1x | +||
253 | +
- chunk_id = list.files(path1),+ #' @rdname group_by
|
||
41 | -1x | +||
254 | +
- pathA = list.files(path1,full.names = TRUE)+ # learning from https://docs.dask.org/en/latest/dataframe-groupby.html
|
||
42 | +255 |
- ),
+ group_by.disk.frame <- function(.data, ..., add = FALSE, .drop = dplyr::group_by_drop_default(.data)) { |
|
43 | +256 | 1x |
- data.table(+ class(.data) <- c("grouped_disk.frame", "disk.frame") |
44 | +257 | 1x |
- chunk_id = list.files(path2),+ attr(.data, "group_by_cols") = substitute(list(...))[-1] |
45 | +258 | 1x |
- pathB = list.files(path2,full.names = TRUE)+ .data
|
46 | +259 |
- )
+ }
|
|
47 | +260 |
- )
- |
- |
48 | -1x | -
- setDT(df3)+ |
|
49 | -1x | +||
261 | +
- df3[,{+ #' @export
|
||
50 | -5x | +||
262 | +
- data1 = read_fst(pathA,as.data.table = TRUE)+ #' @importFrom dplyr summarize
|
||
51 | -5x | +||
263 | +
- data2 = read_fst(pathB,as.data.table = TRUE)+ #' @rdname group_by
|
||
52 | -5x | +||
264 | +
- data3 = force(merge(data1, data2, by = by))+ summarize.disk.frame <- function(.data, ...) { |
||
53 | -5x | +||
265 | +! |
- rm(data1); rm(data2); gc()+ ca_code = generate_summ_code(...) |
|
54 | -5x | +||
266 | +
- write_fst(data3, glue("{outdir}/{.BY}"))+ |
||
55 | -5x | +||
267 | +! |
- NULL
+ if(is.null(names(ca_code))) { |
|
56 | -1x | +||
268 | +! |
- }, chunk_id]+ return(eval(parse(text = glue::glue(".data %>% {rlang::as_label(ca_code)}")))) |
|
57 | -1x | +||
269 | +! |
- return(disk.frame(outdir))+ } else if("chunk_summ_code" %in% names(ca_code)) { |
|
58 | -+ | ||
270 | +! |
- } else {+ chunk_summ_code = ca_code$chunk_summ_code |
|
59 | -1x | +||
271 | +! |
- stop("Cartesian joins are currently not implemented. Either make y a data.frame or set merge_by_chunk_id to TRUE")+ agg_summ_code = ca_code$agg_summ_code |
|
60 | +272 |
|
|
61 | +273 |
- # have to make every possible combination
+ # generate full code
|
|
62 | -+ | ||
274 | +! |
- # path1 = attr(df1,"path")
+ code_to_run = glue::glue("chunk_summarize({chunk_summ_code}) %>% collect %>% summarize({agg_summ_code})") |
|
63 | +275 |
- # path2 = attr(df2,"path")
+ |
|
64 | -+ | ||
276 | +! |
- #
+ class(.data) <- c("summarized_disk.frame", "disk.frame") |
|
65 | -+ | ||
277 | +! |
- # df3 = merge(
+ attr(.data, "summarize_code") = code_to_run |
|
66 | -+ | ||
278 | +! |
- # data.table(
+ return(.data ) |
|
67 | +279 |
- # justmerge = TRUE,
+ } else { |
|
68 | -+ | ||
280 | +! |
- # chunk_id1 = list.files(path1),
+ stop("something's wrong") |
|
69 | +281 |
- # pathA = list.files(path1,full.names = TRUE)
+ }
|
|
70 | +282 |
- # ),
+ }
|
|
71 | +283 |
- # data.table(
+
|
|
72 | +284 |
- # justmerge = TRUE,
+ #' Helper function to generate summarisation code
|
|
73 | +285 |
- # chunk_id2 = list.files(path2),
+ #' @importFrom data.table setDT setkey
|
|
74 | +286 |
- # pathB = list.files(path2,full.names = TRUE)
+ #' @importFrom utils methods
|
|
75 | +287 |
- # ),
+ #' @noRd
|
|
76 | +288 |
- # by = "justmerge",
+ generate_summ_code <- function(...) { |
|
77 | +289 |
- # all=TRUE,
+ # expand the code
|
|
78 | -+ | ||
290 | +1x |
- # allow.cartesian = TRUE
+ code_to_expand = glue::glue("quo(summarise({rlang::as_label(substitute(...))}))") |
|
79 | +291 |
- # )
+ |
|
80 | -+ | ||
292 | +1x |
- #
+ summ_code_quosure = eval(parse(text = code_to_expand)) |
|
81 | +293 |
- #
+ #print(summ_code_quosure)
|
|
82 | +294 |
- # setDT(df3)
+ |
|
83 | +295 |
- # i <- 0
+ # ZJ:
|
|
84 | +296 |
- # mapply(function(pathA, pathB) {
+ # try the traditional route which can't deal with !!!, so if this fails then try the !!! route
|
|
85 | -+ | ||
297 | +1x |
- # stop("error")
+ tryCatch({ |
|
86 | -+ | ||
298 | +1x |
- # data1 = read_fst(pathA,as.data.table = TRUE, columns = c("ACCOUNT_ID","MONTH_KEY"))
+ code = substitute(list(...))[-1] |
|
87 | +299 |
- # data2 = read_fst(pathB,as.data.table = TRUE, columns = c("ACCOUNT_ID","MONTH_KEY"))
+ # print("hehe")
|
|
88 | +300 |
- # data3 = merge(data1, data2, ...)
+ # print(code)
|
|
89 | -+ | ||
301 | +1x |
- # rm(data1); rm(data2); gc()
+ expr_id = 0 |
|
90 | -+ | ||
302 | +1x |
- # if(nrow(data3) > 0) {
+ temp_varn = 0 |
|
91 | +303 |
- # i <<- i + 1
+ |
|
92 | -+ | ||
304 | +1x |
- # write_fst(data3, glue("{outdir}/{i}.fst"))
+ list_of_chunk_agg_fns <- as.character(utils::methods(class = "chunk_agg.disk.frame")) |
|
93 | -+ | ||
305 | +1x |
- # }
+ list_of_collected_agg_fns <- as.character(utils::methods(class = "collected_agg.disk.frame")) |
|
94 | +306 |
- # NULL
+ # browser()
|
|
95 | +307 |
- # },df3$pathA, df3$pathB)
+ # generate the chunk_summarize_code
|
|
96 | -+ | ||
308 | +1x |
- # return(disk.frame(outdir))
+ summarize_code = purrr::map_dfr(code, ~{ |
|
97 | +309 |
- }
+ # print("raw code")
|
|
98 | +310 |
- }
+ # print(.x)
|
1 | -+ | |||
311 | +1x |
- #' Apply the same function to all chunks
+ expr_id <<- expr_id + 1 |
||
2 | +312 |
- #' @param .x a disk.frame
+ # parse the function into table form for easy interrogration
|
||
3 | +313 |
- #' @param .f a function to apply to each of the chunks
+ # The keep.source = TRUE options seems necessary to keep it working in Rscript mode
|
||
4 | -+ | |||
314 | +1x |
- #' @param outdir the output directory
+ gpd = getParseData(parse(text = deparse(.x), keep.source = TRUE), includeText = TRUE); |
||
5 | +315 |
- #' @param keep the columns to keep from the input
+ # print("raw table")
|
||
6 | +316 |
- #' @param chunks The number of chunks to output
+ # print(deparse(.x))
|
||
7 | +317 |
- #' @param lazy if TRUE then do this lazily
+ # print(gpd)
|
||
8 | -+ | |||
318 | +1x |
- #' @param compress 0-100 fst compression ratio
+ grp_funcs = gpd %>% filter(token == "SYMBOL_FUNCTION_CALL") %>% select(text) %>% pull |
||
9 | -+ | |||
319 | +1x |
- #' @param overwrite if TRUE removes any existing chunks in the data
+ grp_funcs = grp_funcs %>% paste0("_df") |
||
10 | +320 |
- #' @param use.names for map_dfr's call to data.table::rbindlist. See data.table::rbindlist
+ |
||
11 | +321 |
- #' @param fill for map_dfr's call to data.table::rbindlist. See data.table::rbindlist
+ # search in the space to find functions name `fn`.chunk_agg.disk.frame
|
||
12 | +322 |
- #' @param idcol for map_dfr's call to data.table::rbindlist. See data.table::rbindlist
+ # only allow one such functions for now TODO improve it
|
||
13 | -+ | |||
323 | +1x |
- #' @param vars_and_pkgs variables and packages to send to a background session. This is typically automatically detected
+ num_of_chunk_functions = sum(sapply(unique(grp_funcs), function(x) exists(paste0(x, ".chunk_agg.disk.frame")))) |
||
14 | -+ | |||
324 | +1x |
- #' @param .progress A logical, for whether or not to print a progress bar for multiprocess, multisession, and multicore plans. From {furrr}
+ num_of_collected_functions= sum(sapply(unique(grp_funcs), function(x) exists(paste0(x, ".collected_agg.disk.frame")))) |
||
15 | +325 |
- #' @param ... for compatibility with `purrr::map`
+ |
||
16 | +326 |
- #' @import fst
+ # the number chunk and aggregation functions must match
|
||
17 | -+ | |||
327 | +1x |
- #' @importFrom purrr as_mapper map
+ stopifnot(num_of_chunk_functions == num_of_collected_functions) |
||
18 | +328 |
- #' @importFrom future.apply future_lapply
+ |
||
19 | +329 |
- #' @export
+ # keep only grp_functions
|
||
20 | -+ | |||
330 | +1x |
- #' @examples
+ grp_funcs= grp_funcs[sapply(grp_funcs, function(x) exists(paste0(x, ".chunk_agg.disk.frame")))] |
||
21 | +331 |
- #' cars.df = as.disk.frame(cars)
+ |
||
22 | -+ | |||
332 | +1x |
- #'
+ if(num_of_chunk_functions == 0) { |
||
23 | -+ | |||
333 | +! |
- #' # return the first row of each chunk lazily
+ stop(sprintf("There must be at least one summarization function in %s", deparse(.x))) |
||
24 | -+ | |||
334 | +1x |
- #' #
+ } else if (num_of_chunk_functions > 1) { |
||
25 | -+ | |||
335 | +! |
- #' cars2 = map(cars.df, function(chunk) {
+ stop(sprintf("Two or more summarisation functions are detected in \n\n```\n%s\n```\n\nThese are currently not supported by {disk.frame} at the moment \n * Nestling (like mean(sum(x) + y)) or \n * combinations (like sum(x) + mean(x))\n\nIf you want this implemented, please leave a comment or upvote at: https://github.com/xiaodaigh/disk.frame/issues/228 \n\n", deparse(.x))) |
||
26 | +336 |
- #' chunk[,1]
+ }
|
||
27 | +337 |
- #' })
+ |
||
28 | +338 |
- #'
+ # check to see if the mean is only two from parent 0, otherwise it would a statement in the form of 1 + mean(x)
|
||
29 | +339 |
- #' collect(cars2)
+ # which isn't supported
|
||
30 | -+ | |||
340 | +1x |
- #'
+ data.table::setDT(gpd) |
||
31 | -+ | |||
341 | +1x |
- #' # same as above but using purrr
+ data.table::setkey(gpd, parent) |
||
32 | -+ | |||
342 | +1x |
- #' cars2 = map(cars.df, ~.x[1,])
+ if (gpd[id == gpd[id == gpd[(paste0(text,"_df") == grp_funcs) & (token == "SYMBOL_FUNCTION_CALL"), parent], parent], parent] != 0) { |
||
33 | -+ | |||
343 | +! |
- #'
+ stop(sprintf("Combining summarization with other operations \n\n```\n%s\n```\n\nThese are currently not supported by {disk.frame} at the moment \n * combinations (like sum(x) + 1)\n* combinations (like list(sum(x)))\n\nIf you want this implemented, please leave a comment or upvote at: https://github.com/xiaodaigh/disk.frame/issues/228 \n\n", deparse(.x))) |
||
34 | +344 |
- #' collect(cars2)
+ }
|
||
35 | +345 |
- #'
+ |
||
36 | -+ | |||
346 | +1x |
- #' # return the first row of each chunk eagerly as list
+ temp_varn <<- temp_varn + 1 |
||
37 | -+ | |||
347 | +1x |
- #' map(cars.df, ~.x[1,], lazy = FALSE)
+ grp_funcs_wo_df = sapply(grp_funcs, function(grp_func) substr(grp_func, 1, nchar(grp_func)-3)) |
||
38 | +348 |
- #'
+ |
||
39 | -+ | |||
349 | +1x |
- #' # return the first row of each chunk eagerly as data.table/data.frame by row-binding
+ tmpcode = deparse(evalparseglue("substitute({deparse(.x)}, list({grp_funcs_wo_df} = quote({grp_funcs}.chunk_agg.disk.frame)))")) %>% paste0(collapse = " ") |
||
40 | +350 |
- #' map_dfr(cars.df, ~.x[1,])
+ |
||
41 | -+ | |||
351 | +1x |
- #'
+ chunk_code = data.frame(assign_to = as.character(glue::glue("tmp{temp_varn}")), expr = tmpcode, stringsAsFactors = FALSE) |
||
42 | +352 |
- #' # lazy and delayed are just an aliases for map(..., lazy = TRUE)
+ |
||
43 | -+ | |||
353 | +1x |
- #' collect(lazy(cars.df, ~.x[1,]))
+ chunk_code$orig_code = deparse(.x) |
||
44 | -+ | |||
354 | +1x |
- #' collect(delayed(cars.df, ~.x[1,]))
+ chunk_code$expr_id = expr_id |
||
45 | -+ | |||
355 | +1x |
- #'
+ chunk_code$grp_fn = grp_funcs |
||
46 | -+ | |||
356 | +1x |
- #' # clean up cars.df
+ chunk_code$name = ifelse(is.null(names(code[expr_id])), "", names(code[expr_id])) |
||
47 | +357 |
- #' delete(cars.df)
+ |
||
48 | +358 |
- map <- function(.x, .f, ...) {+ # create the aggregation code
|
||
49 | -163x | +359 | +1x |
- UseMethod("map")+ chunk_code$agg_expr = as.character(glue::glue("{grp_funcs}.collected_agg.disk.frame({paste0(chunk_code$assign_to, collapse=', ')})")) |
50 | +360 |
- }
+ |
||
51 | +361 |
-
+ #print(sapply(chunk_code, typeof))
+ |
+ ||
362 | +1x | +
+ chunk_code
|
||
52 | +363 |
- #' @export
+ }) |
||
53 | +364 |
- map.default <- function(.x, .f, ...) {+ |
||
54 | -! | +|||
365 | +1x |
- purrr::map(.x, .f, ...)+ chunk_summ_code = paste0(summarize_code$assign_to, "=list(", summarize_code$expr, ")") %>% paste0(collapse = ", ") |
||
55 | +366 |
- }
+ |
||
56 | -+ | |||
367 | +1x |
-
+ agg_code_df = summarize_code %>% |
||
57 | -+ | |||
368 | +1x |
- #' @rdname map
+ select(expr_id, name, agg_expr, orig_code) %>% |
||
58 | -+ | |||
369 | +1x |
- #' @importFrom future getGlobalsAndPackages
+ unique %>% |
||
59 | -+ | |||
370 | +1x |
- #' @export
+ transmute(agg_code = paste0(ifelse(name == "", paste0("`", orig_code, "` = "), paste0(name, "=")), agg_expr)) |
||
60 | +371 |
- map.disk.frame <- function(.x, .f, ..., outdir = NULL, keep = NULL, chunks = nchunks(.x), compress = 50, lazy = TRUE, overwrite = FALSE, vars_and_pkgs = future::getGlobalsAndPackages(.f, envir = parent.frame()), .progress = TRUE) {+ |
||
61 | -187x | +372 | +1x |
- .f = purrr::as_mapper(.f)+ agg_summ_code = paste0(agg_code_df$agg_code, collapse = ",") |
62 | -187x | +|||
373 | +
- if(lazy) {+ |
|||
63 | -111x | +374 | +1x |
- attr(.x, "lazyfn") =+ return(list(chunk_summ_code = chunk_summ_code, agg_summ_code = agg_summ_code)) |
64 | -111x | +375 | +1x |
- c(+ }, error = function(e) { |
65 | -111x | +|||
376 | +! |
- attr(.x, "lazyfn"),+ return(summ_code_quosure) |
||
66 | -111x | +|||
377 | +
- list(+ }) |
|||
67 | -111x | +|||
378 | +
- list(+ |
|||
68 | -111x | +|||
379 | +
- func = .f,+ }
|
|||
69 | -111x | +|||
380 | +
- vars_and_pkgs = vars_and_pkgs,+ |
|||
70 | -111x | +|||
381 | +
- dotdotdot = list(...)+ |
|||
71 | +382 |
- )
+ #' @export
|
||
72 | +383 |
- )
+ #' @importFrom dplyr summarize
|
||
73 | +384 |
- )
+ #' @rdname group_by
|
||
74 | -111x | +|||
385 | +
- return(.x)+ summarise.disk.frame <- summarize.disk.frame |
|||
75 | +386 |
- }
+
|
||
76 | +387 |
- + |
||
77 | -76x | +|||
388 | +
- if(!is.null(outdir)) {+ |
|||
78 | -4x | +|||
389 | +
- overwrite_check(outdir, overwrite)+ |
|||
79 | +390 |
- }
+
|
80 | +1 |
- + #' Increase or decrease the number of chunks in the disk.frame
|
||
81 | -76x | +|||
2 | +
- stopifnot(is_ready(.x))+ #' @param df the disk.frame to rechunk
|
|||
82 | +3 |
- + #' @param nchunks number of chunks
|
||
83 | -76x | +|||
4 | +
- keep1 = attr(.x,"keep")+ #' @param shardby the shardkeys
|
|||
84 | +5 |
- + #' @param outdir the output directory
|
||
85 | -76x | +|||
6 | +
- if(is.null(keep)) {+ #' @param overwrite overwrite the output directory
|
|||
86 | -76x | +|||
7 | +
- keep = keep1+ #' @param shardby_function splitting of chunks: "hash" for hash function or "sort" for semi-sorted chunks
|
|||
87 | +8 |
- }
+ #' @param sort_splits for the "sort" shardby function, a dataframe with the split values.
|
||
88 | +9 |
- + #' @param desc_vars for the "sort" shardby function, the variables to sort descending.
|
||
89 | -76x | +|||
10 | +
- path <- attr(.x, "path")+ #' @export
|
|||
90 | -76x | +|||
11 | +
- files <- list.files(path, full.names = TRUE)+ #' @examples
|
|||
91 | -76x | +|||
12 | +
- files_shortname <- list.files(path)+ #' # create a disk.frame with 2 chunks in tempdir()
|
|||
92 | +13 |
- + #' cars.df = as.disk.frame(cars, nchunks = 2)
|
||
93 | -76x | +|||
14 | +
- keep_future = keep+ #'
|
|||
94 | +15 |
- + #' # re-chunking cars.df to 3 chunks, done "in-place" to the same folder as cars.df
|
||
95 | -76x | +|||
16 | +
- cid = get_chunk_ids(.x, full.names = TRUE)+ #' rechunk(cars.df, 3)
|
|||
96 | +17 |
- + #'
|
||
97 | -76x | +|||
18 | +
- dotdotdot = list(...)+ #' new_path = tempfile(fileext = ".df")
|
|||
98 | +19 |
- + #' # re-chunking cars.df to 4 chunks, shard by speed, and done "out-of-place" to a new directory
|
||
99 | -76x | +|||
20 | +
- res = future.apply::future_lapply(1:length(files), function(ii, ...) {+ #' cars2.df = rechunk(cars.df, 4, outdir=new_path, shardby = "speed")
|
|||
100 | +21 |
- #res = lapply(1:length(files), function(ii) {
+ #'
+ |
+ ||
22 | ++ |
+ #' # clean up cars.df
|
||
101 | -344x | +|||
23 | +
- ds = disk.frame::get_chunk(.x, cid[ii], keep=keep_future, full.names = TRUE)+ #' delete(cars.df)
|
|||
102 | +24 |
- + #' delete(cars2.df)
|
||
103 | -344x | +|||
25 | +
- res = .f(ds, ...)+ rechunk <- function(df, nchunks, outdir = attr(df, "path", exact=TRUE), shardby = NULL, overwrite = TRUE, shardby_function="hash", sort_splits=NULL, desc_vars=NULL) { |
|||
104 | +26 |
- + |
||
105 | +27 |
- #res = do.call(.f, c(ds, dotdotdot))
+ # we need to force the chunks to be computed first as it's common to make nchunks a multiple of chunks(df)
|
||
106 | +28 |
- + # but if we do it too late then the folder could be empty
|
||
107 | -344x | +29 | +5x |
- if(!is.null(outdir)) {+ force(nchunks)+ |
+
30 | ++ |
+ |
||
108 | -23x | +31 | +5x |
- if(nrow(res) == 0) {+ if (nchunks < 1) { |
109 | +32 | ! |
- warning(glue::glue("The output chunk has 0 row, therefore chunk {ii} NOT written"))+ stop(glue::glue("nchunks must be larger than 1")) |
|
110 | +33 |
- } else {- |
- ||
111 | -23x | -
- fst::write_fst(res, file.path(outdir, files_shortname[ii]), compress)+ }
|
||
112 | +34 |
- }
+ |
||
113 | -23x | +35 | +5x |
- return(ii)+ stopifnot("disk.frame" %in% class(df)) |
114 | +36 |
- } else {+ |
||
115 | -321x | +37 | +5x |
- return(res)+ user_had_not_set_shard_by = is.null(shardby) |
116 | -+ | |||
38 | +5x |
- }
+ user_had_set_shard_by = !user_had_not_set_shard_by |
||
117 | +39 |
- }, ...)+ |
||
118 | +40 |
- + # back up the files if writing to the same directory
|
||
119 | -76x | +41 | +5x |
- if(!is.null(outdir)) {+ if(outdir == attr(df,"path", exact=TRUE)) { |
120 | +42 | 4x |
- return(disk.frame(outdir))+ back_up_tmp_dir <- tempfile("back_up_tmp_dir")+ |
+ |
43 | +4x | +
+ fs::dir_create(back_up_tmp_dir) |
||
121 | +44 |
- } else {+ |
||
122 | -72x | +45 | +4x |
- return(res)+ fs::dir_copy( |
123 | -+ | |||
46 | +4x |
- }
+ file.path(outdir, ".metadata"), #from |
||
124 | -+ | |||
47 | +4x |
- }
+ file.path(back_up_tmp_dir, ".metadata") #to |
||
125 | +48 |
-
+ )
|
||
126 | +49 |
-
+ |
||
127 | +50 |
- #' @rdname map
+ # back-up the files first
|
||
128 | -+ | |||
51 | +4x |
- #' @param .id not used
+ full_files = dir(outdir, full.names = TRUE)+ |
+ ||
52 | +4x | +
+ short_files = dir(outdir) |
||
129 | +53 |
- #' @export
+ |
||
130 | +54 |
- map_dfr <- function(.x, .f, ..., .id = NULL) {+ # move all files to the back up folder
|
||
131 | -18x | +55 | +4x |
- UseMethod("map_dfr")+ purrr::map(full_files, ~{+ |
+
56 | +20x | +
+ fs::file_move(.x, back_up_tmp_dir) |
||
132 | +57 |
- }
+ }) |
||
133 | +58 |
-
+ + |
+ ||
59 | +4x | +
+ if(fs::dir_exists(file.path(outdir, ".metadata"))) {+ |
+ ||
60 | +4x | +
+ fs::dir_delete(file.path(outdir, ".metadata")) |
||
134 | +61 |
- #' @export
+ }
|
||
135 | +62 |
- #' @rdname map
+ |
||
136 | +63 |
- map_dfr.default <- function(.x, .f, ..., .id = NULL) {+ # TODO check for validity
|
||
137 | -! | +|||
64 | +4x |
- purrr::map_dfr(.x, .f, ..., .id = .id)+ message(glue::glue("files have been backed up to temporary dir {back_up_tmp_dir}. You can recover there files until you restart your R session")) |
||
138 | +65 |
- }
+ |
||
139 | -+ | |||
66 | +4x |
-
+ df = disk.frame(back_up_tmp_dir) |
||
140 | +67 |
- #' @export
+ }
|
||
141 | +68 |
- #' @rdname map
+ + |
+ ||
69 | +5x | +
+ overwrite_check(outdir, overwrite) |
||
142 | +70 |
- map_dfr.disk.frame <- function(.x, .f, ..., .id = NULL, use.names = fill, fill = FALSE, idcol = NULL) {+ |
||
143 | -18x | -
- if(!is.null(.id)) {- |
- ||
144 | -! | +71 | +5x |
- warning(".id is not NULL, but the parameter is not used with map_dfr.disk.frame")+ dfp = attr(df, "path", exact=TRUE) |
145 | -+ | |||
72 | +5x |
- }
+ existing_shardkey = shardkey(df) |
||
146 | +73 |
|
||
147 | +74 |
- # TODO warn the user if outdir is map_dfr
+ # by default, if shardkey is defined then rechunk will continue to reuse it
|
||
148 | -+ | |||
75 | +5x |
- + if (is.null(shardby)) { |
||
149 | -18x | +76 | +4x |
- data.table::rbindlist(map.disk.frame(.x, .f, ..., outdir = NULL, lazy = FALSE), use.names = use.names, fill = fill, idcol = idcol)+ shardby = existing_shardkey[[1]] |
150 | +77 |
- }
+ }
|
||
151 | +78 | |||
152 | +79 | |||
153 | -+ | |||
80 | +5x |
- #' @export
+ if(user_had_set_shard_by) { |
||
154 | -+ | |||
81 | +1x |
- #' @rdname map
+ return(hard_group_by(df, shardby, nchunks = nchunks, outdir = outdir, overwrite = TRUE, shardby_function=shardby_function, sort_splits=sort_splits, desc_vars=desc_vars)) |
||
155 | -+ | |||
82 | +4x |
- #' @examples
+ } else if (identical(shardby, "") | is.null(shardby)) { |
||
156 | +83 |
- #' cars.df = as.disk.frame(cars)
+ # if no existing shardby
|
||
157 | -+ | |||
84 | +4x |
- #'
+ nr = nrow(df) |
||
158 | -+ | |||
85 | +4x |
- #' # .x is the chunk and .y is the ID as an integer
+ nr_per_chunk = ceiling(nr/nchunks) |
||
159 | -+ | |||
86 | +4x |
- #'
+ used_so_far = 0 |
||
160 | -+ | |||
87 | +4x |
- #' # lazy = TRUE support is not available at the moment
+ done = FALSE |
||
161 | -+ | |||
88 | +4x |
- #' imap(cars.df, ~.x[, id := .y], lazy = FALSE)
+ chunks_read = 1 |
||
162 | -+ | |||
89 | +4x |
- #'
+ chunks_written = 0 |
||
163 | -+ | |||
90 | +4x | +
+ res = disk.frame(outdir)+ |
+ ||
91 | +4x |
- #' imap_dfr(cars.df, ~.x[, id := .y])
+ a = get_chunk(df, 1) |
||
164 | -+ | |||
92 | +4x |
- #'
+ used_so_far = nrow(a) |
||
165 | -+ | |||
93 | +4x |
- #' # clean up cars.df
+ while(chunks_read < nchunks(df)) { |
||
166 | -+ | |||
94 | +30x |
- #' delete(cars.df)
+ if((nr_per_chunk <= used_so_far)) { |
||
167 | -+ | |||
95 | +14x |
- #' @rdname map
+ disk.frame::add_chunk(res, a[1:nr_per_chunk,]) |
||
168 | -+ | |||
96 | +14x |
- imap <- function(.x, .f, ...) {+ chunks_written = chunks_written + 1 |
||
169 | -! | +|||
97 | +14x |
- UseMethod("imap")+ a = a[-(1:nr_per_chunk),] |
||
170 | -+ | |||
98 | +14x |
- }
+ used_so_far = nrow(a) |
||
171 | +99 |
-
+ } else { |
||
172 | -+ | |||
100 | +16x |
- #' @export
+ chunks_read = chunks_read + 1 |
||
173 | -+ | |||
101 | +16x |
- #' @rdname map
+ newa = get_chunk(df, chunks_read) |
||
174 | -+ | |||
102 | +16x |
- imap.default <- function(.x, .f, ...) {+ used_so_far = used_so_far + nrow(newa) |
||
175 | -! | +|||
103 | +16x |
- purrr::imap(.x, .f, ...)+ a = rbindlist(list(a, newa)) |
||
176 | -+ | |||
104 | +16x |
- }
+ rm(newa) |
||
177 | +105 |
-
+ }
|
||
178 | +106 |
- #' `imap.disk.frame` accepts a two argument function where the first argument is a data.frame and the
+ }
|
||
179 | +107 |
- #' second is the chunk ID
+ |
||
180 | -+ | |||
108 | +4x | +
+ while(chunks_written < nchunks) {+ |
+ ||
109 | +6x |
- #' @export
+ rows_to_write = min(nr_per_chunk, nrow(a)) |
||
181 | -+ | |||
110 | +6x |
- #' @rdname map
+ disk.frame::add_chunk(res, a[1:rows_to_write,]) |
||
182 | -+ | |||
111 | +6x |
- imap.disk.frame <- function(.x, .f, outdir = NULL, keep = NULL, chunks = nchunks(.x), compress = 50, lazy = TRUE, overwrite = FALSE, ...) {+ a = a[-(1:rows_to_write),] |
||
183 | -1x | +112 | +6x |
- .f = purrr::as_mapper(.f)+ chunks_written = chunks_written + 1 |
184 | +113 |
- + }
|
||
185 | +114 |
- # TODO support lazy for imap
+ |
||
186 | -1x | +115 | +4x |
- if(lazy) {+ return(res) |
187 | +116 | ! |
- stop("imap.disk.frame: lazy = TRUE is not supported at this stage")+ } else if(!is.null(shardby)) { # if there is existing shard by; shardby has been replaced with new shard by |
|
188 | +117 | ! |
- attr(.x, "lazyfn") = c(attr(.x, "lazyfn"), .f)+ if(user_had_not_set_shard_by) { |
|
189 | +118 | ! |
- return(.x)+ warning("shardby = NULL; but there are already shardkey's defined for this disk.frame. Therefore a rechunk algorithm that preserves the shardkey's has been applied and this algorithm is slower than an algorithm that doesn't use a shardkey.") |
|
190 | +119 |
- }
+ }
|
||
191 | +120 |
- + |
||
192 | -1x | +|||
121 | +
- if(!is.null(outdir)) {+ # using some maths we can cut down on the number of operations
|
|||
193 | +122 | ! |
- overwrite_check(outdir, overwrite)+ nc = nchunks(df) |
|
194 | +123 |
- }
+ |
||
195 | +124 |
- + # TODO there is bug here! If the chunks are in numbers form!
|
||
196 | -1x | +|||
125 | +
- stopifnot(is_ready(.x))+ # if the number of possible new chunk ids is one then no need to perform anything. just merge those
|
|||
197 | -+ | |||
126 | +! |
- + possibles_new_chunk_id = purrr::map(1:nc, ~unique((.x-1 + (0:(nchunks-1))*nc) %% nchunks)+1) |
||
198 | -1x | +|||
127 | +! |
- keep1 = attr(.x,"keep")+ lp = purrr::map_int(possibles_new_chunk_id,length) |
||
199 | +128 |
- + |
||
200 | -1x | +|||
129 | +
- if(is.null(keep)) {+ #need to shards
|
|||
201 | -1x | +|||
130 | +! |
- keep = keep1+ nts = which(lp != 1) |
||
202 | +131 |
- }
+ |
||
203 | -+ | |||
132 | +! |
- + bad_boys = future.apply::future_lapply(nts, function(chunk_id) { |
||
204 | -1x | +|||
133 | +! |
- path <- attr(.x, "path")+ df1 = disk.frame::get_chunk(df, chunk_id) |
||
205 | -1x | +|||
134 | +! |
- files <- get_chunk_ids(.x)+ disk.frame::shard(df1, shardby, nchunks = nchunks, overwrite = TRUE) |
||
206 | -1x | +|||
135 | +
- files_shortname <- list.files(path)+ }) |
|||
207 | +136 |
- + |
||
208 | -1x | +|||
137 | +
- keep_future = keep+ # for those that don't need to be resharded
|
|||
209 | -1x | +|||
138 | +! |
- res = future.apply::future_lapply(1:length(files), function(ii) {+ tmp_fdlr = tempfile("rechunk_shard") |
||
210 | -5x | +|||
139 | +! |
- ds = disk.frame::get_chunk(.x, ii, keep=keep_future)+ fs::dir_create(tmp_fdlr) |
||
211 | -5x | +|||
140 | +
- res = .f(ds, ii)+ |
|||
212 | -5x | +|||
141 | +! |
- if(!is.null(outdir)) {+ oks = furrr::future_map(which(lp == 1), function(i) { |
||
213 | +142 | ! |
- fst::write_fst(res, file.path(outdir, files_shortname[ii]), compress)+ file_chunk = file.path(attr(df, "path", exact=TRUE), i %>% paste0(".fst")) |
|
214 | +143 | ! |
- return(ii)+ fs::file_move(file_chunk, file.path(tmp_fdlr, possibles_new_chunk_id[[i]] %>% paste0(".fst"))) |
|
215 | -+ | |||
144 | +! |
- } else {+ disk.frame(tmp_fdlr) |
||
216 | -5x | +|||
145 | +
- return(res)+ }) |
|||
217 | +146 |
- }
+ |
||
218 | -+ | |||
147 | +! |
- })+ list_of_sharded = c(bad_boys, oks)+ |
+ ||
148 | +! | +
+ new_one <- rbindlist.disk.frame(list_of_sharded, outdir=outdir, by_chunk_id = TRUE, overwrite = overwrite) |
||
219 | +149 |
- + |
||
220 | -1x | +|||
150 | +! |
- if(!is.null(outdir)) {+ res = add_meta(new_one, nchunks = nchunks(new_one), shardkey = shardby, shardchunks = nchunks) |
||
221 | +151 | ! |
- return(disk.frame(outdir))+ return(res) |
|
222 | +152 |
} else { |
||
223 | -1x | +|||
153 | +! |
- return(res)+ stop("rechunk: option not supported") |
||
224 | +154 |
}
|
||
225 | +155 |
}
|
226 | +1 |
-
+ #' @rdname join
|
||
227 | +2 |
#' @export
|
||
228 | +3 |
- #' @rdname map
+ #' @examples
|
||
229 | +4 |
- imap_dfr.disk.frame <- function(.x, .f, ..., .id = NULL, use.names = fill, fill = FALSE, idcol = NULL) {- |
- ||
230 | -1x | -
- if(!is.null(.id)) {- |
- ||
231 | -! | -
- warning(".id is not NULL, but the parameter is not used with map_dfr.disk.frame")+ #' cars.df = as.disk.frame(cars)
|
||
232 | +5 |
- }
- |
- ||
233 | -1x | -
- data.table::rbindlist(imap.disk.frame(.x, .f, ..., lazy = FALSE), use.names = use.names, fill = fill, idcol = idcol)+ #'
|
||
234 | +6 |
- }
+ #' join.df = full_join(cars.df, cars.df, merge_by_chunk_id = TRUE)
|
||
235 | +7 |
-
+ #'
|
||
236 | +8 |
-
+ #' # clean up cars.df
|
||
237 | +9 |
- #' @export
+ #' delete(cars.df)
|
||
238 | +10 |
- #' @rdname map
+ #' delete(join.df)
|
||
239 | +11 |
- imap_dfr <- function(.x, .f, ..., .id = NULL) {+ full_join.disk.frame <- function(x, y, by=NULL, copy=FALSE, ..., outdir = tempfile("tmp_disk_frame_full_join"), overwrite = TRUE, merge_by_chunk_id, .progress = FALSE) { |
||
240 | -1x | +12 | +16x |
- UseMethod("imap_dfr")+ stopifnot("disk.frame" %in% class(x)) |
241 | +13 |
- }
+ |
||
242 | -+ | |||
14 | +16x |
-
+ overwrite_check(outdir, overwrite) |
||
243 | +15 |
-
+ |
||
244 | +16 |
- #' @export
+ |
||
245 | -+ | |||
17 | +16x |
- #' @rdname map
+ if("data.frame" %in% class(y)) { |
||
246 | +18 |
- imap_dfr.default <- function(.x, .f, ..., .id = NULL) {+ # full join cannot be support for y in data.frame
|
||
247 | -! | +|||
19 | +4x |
- purrr::imap_dfr(.x, .f, ..., .id = .id)+ ncx = nchunks(x) |
||
248 | -+ | |||
20 | +4x |
- }
+ dy = shard(y, shardby = by, nchunks = ncx, overwrite = TRUE) |
||
249 | -+ | |||
21 | +4x |
-
+ dx = hard_group_by(x, by = by, overwrite = TRUE) |
||
250 | -+ | |||
22 | +4x |
-
+ return(full_join.disk.frame(dx, dy, by, copy=copy, outdir=outdir, merge_by_chunk_id = TRUE)) |
||
251 | -+ | |||
23 | +12x |
- #' `lazy` is convenience function to apply `.f` to every chunk
+ } else if("disk.frame" %in% class(y)) { |
||
252 | -+ | |||
24 | +12x |
- #' @export
+ if(is.null(merge_by_chunk_id)) { |
||
253 | -+ | |||
25 | +! |
- #' @rdname map
+ stop("both x and y are disk.frames. You need to specify merge_by_chunk_id = TRUE or FALSE explicitly") |
||
254 | +26 |
- lazy <- function(.x, .f, ...) {+ }
|
||
255 | -! | +|||
27 | +12x |
- UseMethod("lazy")+ if(is.null(by)) { |
||
256 | -+ | |||
28 | +! |
- }
+ by <- intersect(names(x), names(y)) |
||
257 | +29 |
-
+ }
|
||
258 | +30 |
- #' @rdname map
+ |
||
259 | -+ | |||
31 | +12x |
- #' @export
+ ncx = nchunks(x) |
||
260 | -+ | |||
32 | +12x |
- lazy.disk.frame <- function(.x, .f, ...) {+ ncy = nchunks(y) |
||
261 | -! | +|||
33 | +12x |
- map.disk.frame(.x, .f, ..., lazy = TRUE)+ if (merge_by_chunk_id == FALSE) { |
||
262 | -+ | |||
34 | +4x |
- }
+ warning("merge_by_chunk_id = FALSE. This will take significantly longer and the preparations needed are performed eagerly which may lead to poor performance. Consider making y a data.frame or set merge_by_chunk_id = TRUE for better performance.") |
||
263 | -+ | |||
35 | +4x |
-
+ x = hard_group_by(x, by, nchunks = max(ncy,ncx), overwrite = TRUE) |
||
264 | -+ | |||
36 | +4x |
- #' `delayed` is an alias for lazy and is consistent with the naming in Dask and Dagger.jl
+ y = hard_group_by(y, by, nchunks = max(ncy,ncx), overwrite = TRUE) |
||
265 | -+ | |||
37 | +4x |
- #' @export
+ return(full_join.disk.frame(x, y, by, copy = copy, outdir = outdir, merge_by_chunk_id = TRUE, overwrite = overwrite, .progress = .progress)) |
||
266 | -+ | |||
38 | +8x |
- #' @rdname map
+ } else if ((identical(shardkey(x)$shardkey, "") & identical(shardkey(y)$shardkey, "")) | identical(shardkey(x), shardkey(y))) { |
||
267 | -+ | |||
39 | +8x |
- delayed <- function(.x, .f, ...) {+ res = cmap2(x, y, ~{ |
||
268 | -1x | +40 | +37x |
- UseMethod("delayed")+ if(is.null(.y)) { |
269 | -+ | |||
41 | +! |
- }
+ return(.x) |
||
270 | -+ | |||
42 | +37x |
-
+ } else if (is.null(.x)) { |
||
271 | -+ | |||
43 | +! |
- #' @export
+ return(.y) |
||
272 | +44 |
- #' @rdname map
+ }
|
||
273 | -+ | |||
45 | +37x |
- delayed.disk.frame <- function(.x, .f, ...) {+ full_join(.x, .y, by = by, copy = copy) |
||
274 | -1x | +46 | +8x |
- map.disk.frame(.x, .f, ..., lazy = TRUE)+ }, outdir = outdir, overwrite = overwrite, .progress = .progress) |
275 | -+ | |||
47 | +8x |
- }
+ return(res) |
||
276 | +48 |
- + } else { |
||
277 | +49 |
- #' @export
+ # TODO if the shardkey are the same and only the shardchunks are different then just shard again on one of them is fine
|
||
278 | -+ | |||
50 | +! |
- #' @rdname map
+ stop("merge_by_chunk_id is TRUE but shardkey(x) does NOT equal to shardkey(y). You may want to perform a hard_group_by() on both x and/or y or set merge_by_chunk_id = FALSE") |
||
279 | +51 |
- chunk_lapply <- function (...) {- |
- ||
280 | -! | -
- warning("chunk_lapply is deprecated in favour of map.disk.frame")+ }
|
||
281 | -! | +|||
52 | +
- map.disk.frame(...)+ }
|
|||
282 | +53 |
}
@@ -9457,42 +13176,42 @@ disk.frame coverage - 51.96% |
1 |
- #' Sample n rows from a disk.frame
+ #' @param x a disk.frame
|
||
2 |
- #' @export
+ #' @param y a data.frame or disk.frame. If data.frame then returns lazily; if disk.frame it performs the join eagerly and return a disk.frame
|
||
3 |
- #' @importFrom dplyr sample_frac
+ #' @param outdir output directory for disk.frame
|
||
4 |
- #' @inheritParams dplyr::sample_frac
+ #' @rdname join
|
||
5 |
- #' @rdname sample
+ #' @export
|
||
9 |
- #' collect(sample_frac(cars.df, 0.5))
+ #' join.df = semi_join(cars.df, cars.df)
|
||
13 |
- sample_frac.disk.frame <- function(tbl, size=1, replace=FALSE, weight=NULL, .env=NULL, ...) {+ #' delete(join.df)
|
||
14 | -2x | +
- warning_msg = NULL+ semi_join.disk.frame <- function(x, y, by=NULL, copy=FALSE, ..., outdir = tempfile("tmp_disk_frame_semi_join"), merge_by_chunk_id = FALSE, overwrite = TRUE, .progress = FALSE) { |
|
15 | -2x | +12x |
- if(!is.null(weight)) {+ stopifnot("disk.frame" %in% class(x)) |
16 | -1x | +
- warning_msg = "sample_frac: for disk.frames weight = is not supported"+ |
|
17 | -1x | +12x |
- stop(warning_msg)+ overwrite_check(outdir, overwrite) |
18 | -1x | +
- } else if (!is.null(.env)) {+ |
|
19 | -! | +12x |
- warning_msg = "sample_frac: for disk.frames .env = is not supported"+ if("data.frame" %in% class(y)) { |
20 | -! | +4x |
- stop(warning_msg)+ quo_dotdotdot = enquos(...) |
21 | -+ | 4x |
- }
+ cmap_dfr(x, ~{ |
22 | -+ | 16x |
- + code = quo(semi_join(.x, y, by = by, copy = copy, !!!quo_dotdotdot)) |
23 | -1x | +16x |
- fn = disk.frame::create_dplyr_mapper(dplyr::sample_frac)+ rlang::eval_tidy(code) |
24 | -+ | 4x |
- + }, .progress = .progress) |
25 | -1x | +8x |
- fn(tbl, size = size, replace = replace, ...)+ } else if("disk.frame" %in% class(y)) { |
26 | -+ | 8x |
- }
+ if(is.null(merge_by_chunk_id)) { |
27 | -- | - - | -
1 | -- |
- #' Apply data.table's foverlaps to the disk.frame
- |
- |
2 | -- |
- #' @description EXPERIMENTAL
- |
- |
3 | -- |
- #' @param df1 A disk.frame
- |
- |
4 | -- |
- #' @param df2 A disk.frame or a data.frame
- |
- |
5 | -- |
- #' @param by.x character/string vector. by.x used in foverlaps
- |
- |
6 | -- |
- #' @param by.y character/string vector. by.x used in foverlaps
- |
- |
7 | -+ | ! |
- #' @param outdir The output directory of the disk.frame
+ stop("both x and y are disk.frames. You need to specify merge_by_chunk_id = TRUE or FALSE explicitly") |
8 | +28 |
- #' @param merge_by_chunk_id If TRUE then the merges will happen for chunks in df1 and df2 with the same chunk id which speed up processing. Otherwise every chunk of df1 is merged with every chunk of df2. Ignored with df2 is not a disk.frame
+ }
|
|
9 | -+ | ||
29 | +8x |
- #' @param compress The compression ratio for fst
+ if(is.null(by)) { |
|
10 | -+ | ||
30 | +! |
- #' @param overwrite overwrite existing directory
+ by <- intersect(names(x), names(y)) |
|
11 | +31 |
- #' @param ... passed to data.table::foverlaps and disk.frame::map.disk.frame
+ }
|
|
12 | +32 |
- #' @import fst
+ |
|
13 | -+ | ||
33 | +8x |
- #' @importFrom glue glue
+ ncx = nchunks(x) |
|
14 | -+ | ||
34 | +8x |
- #' @importFrom data.table foverlaps data.table setDT setkeyv
+ ncy = nchunks(y) |
|
15 | -+ | ||
35 | +8x |
- #' @importFrom future.apply future_lapply
+ if (merge_by_chunk_id == FALSE) { |
|
16 | -+ | ||
36 | +4x |
- #' @importFrom pryr do_call
+ warning("merge_by_chunk_id = FALSE. This will take significantly longer and the preparations needed are performed eagerly which may lead to poor performance. Consider making y a data.frame or set merge_by_chunk_id = TRUE for better performance.") |
|
17 | -+ | ||
37 | +4x |
- #' @export
+ x = hard_group_by(x, by, nchunks = max(ncy,ncx), overwrite = TRUE) |
|
18 | -+ | ||
38 | +4x |
- #' @examples
+ y = hard_group_by(y, by, nchunks = max(ncy,ncx), overwrite = TRUE) |
|
19 | -+ | ||
39 | +4x |
- #' library(data.table)
+ return(semi_join.disk.frame(x, y, by, copy = copy, outdir = outdir, merge_by_chunk_id = TRUE, overwrite = overwrite, .progress = .progress)) |
|
20 | -+ | ||
40 | +4x |
- #'
+ } else if ((identical(shardkey(x)$shardkey, "") & identical(shardkey(y)$shardkey, "")) | identical(shardkey(x), shardkey(y))) { |
|
21 | -+ | ||
41 | +4x |
- #' ## simple example:
+ res = cmap2.disk.frame(x, y, ~{ |
|
22 | -+ | ||
42 | +21x |
- #' x = as.disk.frame(data.table(start=c(5,31,22,16), end=c(8,50,25,18), val2 = 7:10))
+ if(is.null(.y)) { |
|
23 | -+ | ||
43 | +! |
- #' y = as.disk.frame(data.table(start=c(10, 20, 30), end=c(15, 35, 45), val1 = 1:3))
+ return(data.table()) |
|
24 | -+ | ||
44 | +21x |
- #' xy.df = foverlaps.disk.frame(
+ } else if (is.null(.x)) { |
|
25 | -+ | ||
45 | +! |
- #' x,
+ return(data.table()) |
|
26 | +46 |
- #' y,
+ }
|
|
27 | -+ | ||
47 | +21x |
- #' by.x = c("val1", "start", "end"),
+ semi_join(.x, .y, by = by, copy = copy, ..., overwrite = overwrite) |
|
28 | -+ | ||
48 | +4x |
- #' by.y = c("val1", "start", "end"),
+ }, outdir = outdir, .progress = .progress) |
|
29 | -+ | ||
49 | +4x |
- #' merge_by_chunk_id = TRUE,
+ return(res) |
|
30 | +50 |
- #' overwrite = TRUE)
+ } else { |
|
31 | +51 |
- #'
+ # TODO if the shardkey are the same and only the shardchunks are different then just shard again on one of them is fine
|
|
32 | -+ | ||
52 | +! |
- #' # clean up
+ stop("merge_by_chunk_id is TRUE but shardkey(x) does NOT equal to shardkey(y). You may want to perform a hard_group_by() on both x and/or y or set merge_by_chunk_id = FALSE") |
|
33 | +53 |
- #' delete(x)
+ }
|
|
34 | +54 |
- #' delete(y)
+ }
|
|
35 | +55 |
- #' delete(xy.df)
+ }
|
36 | +1 |
- foverlaps.disk.frame <-+ #' Apply the same function to all chunks
|
|
37 | +2 |
- function(+ #' @param .x a disk.frame
|
|
38 | +3 |
- df1,
+ #' @param .f a function to apply to each of the chunks
|
|
39 | +4 |
- df2,
+ #' @param outdir the output directory
|
|
40 | +5 |
- by.x = if (identical(shardkey(df1)$shardkey, "")) shardkey(df1)$shardkey else shardkey(df2)$shardkey,+ #' @param keep the columns to keep from the input
|
|
41 | +6 |
- by.y = shardkey(df2)$shardkey,+ #' @param chunks The number of chunks to output
|
|
42 | +7 |
- ...,
- |
- |
43 | -! | -
- outdir = {warning("temp dir create"); tempfile("df_foverlaps_tmp", fileext = ".df")},+ #' @param lazy if TRUE then do this lazily
|
|
44 | +8 |
- merge_by_chunk_id = FALSE, compress=50, overwrite = TRUE) {+ #' @param compress 0-100 fst compression ratio
|
|
45 | +9 |
- - |
- |
46 | -! | -
- stopifnot("disk.frame" %in% class(df1))+ #' @param overwrite if TRUE removes any existing chunks in the data
|
|
47 | +10 |
- - |
- |
48 | -! | -
  overwrite_check(outdir, overwrite)

  if("data.frame" %in% class(df2)) {
    map.disk.frame(df1, ~foverlaps(.x, df2, ...), ..., lazy = FALSE, compress = compress, overwrite = overwrite)
  } else if (merge_by_chunk_id | (identical(shardkey(df1), shardkey(df2)))) {
    # if the shardkeys are the same then we only need to match by segment id,
    # as rows with the same shardkey must end up in the same segment
    path1 = attr(df1, "path")
    path2 = attr(df2, "path")

    df3 = merge(
      data.table(
        chunk_id = list.files(path1),
        pathA = list.files(path1, full.names = TRUE),
        file_id = 1:length(list.files(path1))
      ),
      data.table(
        chunk_id = list.files(path2),
        pathB = list.files(path2, full.names = TRUE)
      ),
      by = "chunk_id"
    )
    setDT(df3)

    dotdotdot = list(...)

    furrr::future_map(1:nrow(df3), function(row) {
      chunk_id = df3[row, chunk_id]

      data1 = get_chunk(df1, chunk_id)
      data2 = get_chunk(df2, chunk_id)

      setDT(data1)
      setDT(data2)

      setkeyv(data2, by.y[(length(by.y)-2+1):length(by.y)])

      dotdotdot$x = data1
      dotdotdot$y = data2

      data3 = pryr::do_call(foverlaps, dotdotdot)
      rm(data1); rm(data2); gc()

      fst::write_fst(data3, glue::glue("{outdir}/{chunk_id}"), compress = compress)
      rm(data3); gc()
      NULL
    })
    return(disk.frame(outdir))
  } else {
    stop("foverlaps.disk.frame: only merge_by_chunk_id = TRUE is implemented")
  }
}
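# A minimal usage sketch of the data.frame fast path above. The interval
# columns (start/end) and the sample values are illustrative assumptions, not
# part of the package; data.table::foverlaps() requires `y` to be keyed by its
# interval columns.
library(data.table)
library(disk.frame)

y = data.table(start = c(1L, 6L), end = c(5L, 10L), tag = c("a", "b"))
setkey(y, start, end)

x.df = as.disk.frame(data.table(start = c(2L, 7L), end = c(3L, 8L)))

# y is an in-RAM data.frame, so the overlap join runs chunk-by-chunk against it
foverlaps.disk.frame(x.df, y, outdir = tempfile(fileext = ".df"), overwrite = TRUE)

delete(x.df)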
#' Performs join/merge for disk.frames
#' @rdname join
#' @export
#' @examples
#' cars.df = as.disk.frame(cars)
#'
#' join.df = left_join(cars.df, cars.df)
#'
#' # clean up cars.df
#' delete(cars.df)
#' delete(join.df)
left_join.disk.frame <- function(x, y, by=NULL, copy=FALSE, ..., outdir = tempfile("tmp_disk_frame_left_join"), merge_by_chunk_id = FALSE, overwrite = TRUE) {
  stopifnot("disk.frame" %in% class(x))

  overwrite_check(outdir, overwrite)

  if("data.frame" %in% class(y)) {
    # note that x is named .data in the lazy evaluation
    quo_dotdotdot = enquos(...)
    map_dfr(x, ~{
      code = quo(left_join(.x, y, by = by, copy = copy, !!!quo_dotdotdot))
      rlang::eval_tidy(code)
    })
  } else if("disk.frame" %in% class(y)) {
    if(is.null(merge_by_chunk_id)) {
      stop("both x and y are disk.frames. You need to specify merge_by_chunk_id = TRUE or FALSE explicitly")
    }
    if(is.null(by)) {
      by <- intersect(names(x), names(y))
    }

    ncx = nchunks(x)
    ncy = nchunks(y)
    if (merge_by_chunk_id == FALSE) {
      warning("merge_by_chunk_id = FALSE. This will take significantly longer and the preparations needed are performed eagerly which may lead to poor performance. Consider making y a data.frame or set merge_by_chunk_id = TRUE for better performance.")
      x = hard_group_by(x, by, nchunks = max(ncy, ncx), overwrite = TRUE)
      y = hard_group_by(y, by, nchunks = max(ncy, ncx), overwrite = TRUE)
      return(left_join.disk.frame(x, y, by, copy = copy, outdir = outdir, merge_by_chunk_id = TRUE, overwrite = overwrite))
    } else if(merge_by_chunk_id == TRUE) {
      dotdotdot = list(...)
      res = map2.disk.frame(x, y, ~{
        if(is.null(.y)) {
          return(.x)
        } else if (is.null(.x)) {
          return(data.table())
        }
        llj = purrr::lift(dplyr::left_join)
        llj(c(list(x = .x, y = .y, by = by, copy = copy), dotdotdot))
      }, outdir = outdir)
      return(res)
    } else {
      # TODO if the shardkeys are the same and only the shardchunks are different then re-sharding one of them is fine
      stop("merge_by_chunk_id is TRUE but shardkey(x) does NOT equal shardkey(y). You may want to perform a hard_group_by() on x and/or y, or set merge_by_chunk_id = FALSE")
    }
  }
}
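# A sketch of the two join paths implemented above. The data.frame path joins
# `y` against every chunk of `x`; the disk.frame-vs-disk.frame path requires an
# explicit merge_by_chunk_id choice. The `labels` table is illustrative.
library(dplyr)
library(disk.frame)

cars.df = as.disk.frame(cars, nchunks = 2)

# Fast path: y fits in RAM, so it is joined against each chunk in turn
labels = data.frame(speed = 4, label = "slow")
left_join(cars.df, labels, by = "speed")

# Both sides on disk: chunks are paired up by chunk id. With
# merge_by_chunk_id = FALSE both sides are re-sharded first (slow, eager).
cars2.df = as.disk.frame(cars, nchunks = 2)
res.df = left_join(cars.df, cars2.df, by = c("speed", "dist"), merge_by_chunk_id = TRUE)

delete(cars.df); delete(cars2.df)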
#' Convert CSV file(s) to disk.frame format
#' @importFrom glue glue
#' @importFrom fs dir_delete
#' @importFrom pryr do_call
#' @param infile The input CSV file or files
#' @param outdir The directory to output the disk.frame to
#' @param inmapfn A function to be applied to the chunk read in from CSV before
#'   the chunk is being written out. Commonly used to perform simple
#'   transformations. Defaults to the identity function (ie. no transformation)
#' @param nchunks Number of chunks to output
#' @param in_chunk_size When reading in the file, how many lines to read in at
#'   once. This is different to nchunks which controls how many chunks are
#'   output
#' @param shardby The column(s) to shard the data by. For example suppose
#'   `shardby = c("col1","col2")` then every row where the values `col1` and
#'   `col2` are the same will end up in the same chunk; this will allow merging
#'   by `col1` and `col2` to be more efficient
#' @param compress For fst backends it's a number between 0 and 100 where 100 is
#'   the highest compression ratio.
#' @param overwrite Whether to overwrite the existing directory
#' @param header Whether the files have a header. Defaults to TRUE
#' @param .progress A logical, for whether or not to print a progress bar for
#'   multiprocess, multisession, and multicore plans. From {furrr}
#' @param backend The CSV reader backend to choose: "data.table" or "readr".
#'   disk.frame does not have its own CSV reader. It uses either
#'   data.table::fread or readr::read_delim. It is worth noting that
#'   data.table::fread does not detect dates and all dates are imported as
#'   strings, and you are encouraged to use {fasttime} to convert the strings to
#'   dates. You can use the `inmapfn` to do that. However, if you want automatic
#'   date detection, then backend="readr" may suit your needs. However, readr
#'   is often slower than data.table, hence data.table is chosen as the default.
#' @param chunk_reader Even if you choose a backend there can still be multiple
#'   strategies on how to approach the CSV reads. For example, data.table::fread
#'   tries to mmap the whole file which can cause the whole read process to
#'   fail. In that case we can change the chunk_reader to "readLines" which uses the
#'   readLines function to read chunk by chunk and still use data.table::fread
#'   to process the chunks. There are currently no strategies for the readr backend,
#'   except the default one.
#' @param ... passed to data.table::fread, disk.frame::as.disk.frame,
#'   disk.frame::shard
#' @importFrom pryr do_call
#@importFrom LaF detect_dm_csv process_blocks
#' @importFrom bigreadr split_file get_split_files
#' @family ingesting data
#' @export
#' @examples
#' tmpfile = tempfile()
#' write.csv(cars, tmpfile)
#' tmpdf = tempfile(fileext = ".df")
#' df = csv_to_disk.frame(tmpfile, outdir = tmpdf, overwrite = TRUE)
#'
#' # clean up
#' fs::file_delete(tmpfile)
#' delete(df)
csv_to_disk.frame <- function(infile, outdir = tempfile(fileext = ".df"), inmapfn = base::I, nchunks = recommend_nchunks(sum(file.size(infile))),
                              in_chunk_size = NULL, shardby = NULL, compress = 50, overwrite = TRUE, header = TRUE, .progress = TRUE, backend = c("data.table", "readr", "LaF"), chunk_reader = c("bigreadr", "data.table", "readr", "readLines"), ...) {
  backend = match.arg(backend)
  chunk_reader = match.arg(chunk_reader)

  if(backend == "readr" | chunk_reader == "readr") {
    if(!requireNamespace("readr")) {
      stop("csv_to_disk.frame: You have chosen backend = 'readr' or chunk_reader = 'readr'. But the `readr` package is not installed. To install run: `install.packages(\"readr\")`")
    }
  }

  overwrite_check(outdir, overwrite)

  # we need multiple backends because data.table has poor support for files larger than RAM
  # https://github.com/Rdatatable/data.table/issues/3526
  # TODO detect these cases

  if(is.numeric(in_chunk_size)) {
    if(backend == "data.table" & chunk_reader == "data.table") {
      rs = df_ram_size()
      if (any((sapply(infile, file.size)/1024^3) > rs)) {
        message("csv_to_disk.frame: you are using backend = 'data.table' and chunk_reader = 'data.table'.")
        message(glue::glue("But one of your input files is larger than available RAM {rs}."))
        message("if the file(s) fail to read, please set chunk_reader = 'readLines' or chunk_reader = 'readr'.")
        message("E.g. csv_to_disk.frame(..., chunk_reader = 'readr')")
      }
    }
  }

  if(length(infile) > 1) {
    message("csv_to_disk.frame: you are trying to read multiple files.")
    if(backend == "data.table") {
      message("Please use colClasses to set column types to minimize the chance of a failed read")
    } else if (backend == "readr") {
      message("Please use col_types to set column types to minimize the chance of a failed read")
    } else if (backend == "LaF") {
      message("Please check the documentation of LaF for how to set column classes")
    } else {
      stop("csv_to_disk.frame: backend not supported")
    }
  }

  if(backend == "LaF") {
    if(!requireNamespace("LaF")) {
      stop("You need to install the LaF package to use backend = 'LaF'. To install: install.packages('LaF')")
    }
    if(length(infile) > 1) {
      stop("csv_to_disk.frame: backend = 'LaF' only supports a single file, not multiple files as `infile`")
    }
    if(is.null(in_chunk_size)) {
      stop("csv_to_disk.frame: backend = 'LaF' can only be used when in_chunk_size != NULL")
    }
  }

  if(backend == "LaF" & !is.null(in_chunk_size) & length(infile) == 1) {
    df_out = disk.frame(outdir)
    dm = LaF::detect_dm_csv(infile, header = TRUE, ...)
    LaF::process_blocks(LaF::laf_open(dm), function(chunk, past) {
      if("data.frame" %in% class(chunk)) {
        if(nrow(chunk) > 0) {
          add_chunk(df_out, chunk)
        }
      }
      NULL
    }, nrows = in_chunk_size)
    return(df_out)
  } else if(backend == "data.table" & (is.null(in_chunk_size) | chunk_reader == "data.table")) {
    csv_to_disk.frame_data.table_backend(
      infile,
      outdir,
      inmapfn,
      nchunks,
      in_chunk_size,
      shardby,
      compress,
      overwrite,
      header,
      .progress, ...
    )
  } else if (backend == "data.table" & chunk_reader == "bigreadr" & !is.null(in_chunk_size)) {
    # use bigreadr to split the files
    tf = tempfile()
    pt = proc.time()
    message(" ----------------------------------------------------- ")
    message(glue::glue("Stage 1 of 2: splitting the file {infile} into smaller files:"))
    message(glue::glue("Destination: {tf}"))
    message(" ----------------------------------------------------- ")

    split_file_info = bigreadr::split_file(
      infile,
      every_nlines = in_chunk_size,
      prefix_out = tf,
      repeat_header = header)
    files_split = bigreadr::get_split_files(split_file_info)
    message(paste("Stage 1 of 2 took:", data.table::timetaken(pt)))
    message(" ----------------------------------------------------- ")

    pt2 = proc.time()
    message(glue::glue("Stage 2 of 2: Converting the smaller files into a disk.frame"))
    message(" ----------------------------------------------------- ")

    res = csv_to_disk.frame(
      files_split,
      outdir = outdir,
      inmapfn = inmapfn,
      nchunks = nchunks,
      shardby = shardby,
      compress = compress,
      overwrite = overwrite,
      header = header,
      .progress = .progress,
      backend = backend,
      ...)

    message(paste("Stage 2 of 2 took:", data.table::timetaken(pt2)))
    message(" ----------------------------------------------------- ")

    message(paste("Stage 1 & 2 took:", data.table::timetaken(pt)))
    message(" ----------------------------------------------------- ")

    return(res)
  } else if (backend == "data.table" & chunk_reader == "readLines" & !is.null(in_chunk_size)) {
    if (length(infile) == 1) {
      # establish a read connection to the file
      con = file(infile, "r")
      on.exit(close(con))
      xx = readLines(con, n = in_chunk_size)
      diskf = disk.frame(outdir)
      header_copy = header
      colnames_copy = NULL
      while(length(xx) > 0) {
        if(is.null(colnames_copy)) {
          new_chunk = inmapfn(data.table::fread(text = xx, header = header_copy, ...))
          colnames_copy = names(new_chunk)
        } else {
          # TODO detect the correct delim; manually adding the header
          header_colnames = paste0(colnames_copy, collapse = ",")
          xx = c(header_colnames, xx)
          new_chunk = inmapfn(
            data.table::fread(
              text = xx,
              header = TRUE
              , ...
            )
          )
        }

        add_chunk(diskf, new_chunk)
        xx = readLines(con, n = in_chunk_size)
        header_copy = FALSE
      }
      return(diskf)
    } else {
      stop("chunk_reader = 'readLines' is not yet supported for multiple files")
    }
  } else if (backend == "data.table" & chunk_reader == "readr" & !is.null(in_chunk_size)) {
    if (length(infile) == 1) {
      diskf = disk.frame(outdir)

      colnames_copy = NULL
      readr::read_lines_chunked(file = infile, callback = readr::SideEffectChunkCallback$new(function(xx, i) {
        if(is.null(colnames_copy)) {
          new_chunk = inmapfn(
            data.table::fread(
              text = xx,
              header = header
              , ...
            )
          )
          colnames_copy <<- names(new_chunk)
        } else {
          header_colnames = paste0(colnames_copy, collapse = ",")
          xx = c(header_colnames, xx)
          new_chunk = inmapfn(
            data.table::fread(
              text = xx,
              header = TRUE
              , ...
            )
          )
        }
        add_chunk(diskf, new_chunk)
      }), chunk_size = in_chunk_size, progress = .progress)

      return(diskf)
    } else {
      stop("chunk_reader = 'readr' is not yet supported for multiple files")
    }
  } else if(backend == "readr") {
    csv_to_disk.frame_readr(
      infile,
      outdir = outdir,
      inmapfn = inmapfn,
      nchunks = nchunks,
      in_chunk_size = in_chunk_size,
      shardby = shardby,
      compress = compress,
      overwrite = TRUE,
      col_names = header,
      .progress = .progress, ...)
  } else {
    stop("csv_to_disk.frame: this set of options is not supported")
  }
}


csv_to_disk.frame_data.table_backend <- function(infile, outdir = tempfile(fileext = ".df"), inmapfn = base::I, nchunks = recommend_nchunks(sum(file.size(infile))),
                                                 in_chunk_size = NULL, shardby = NULL, compress = 50, overwrite = TRUE, header = TRUE, .progress = TRUE, ...) {
  # reading multiple files
  if(length(infile) > 1) {
    dotdotdot = list(...)
    origarg = list(inmapfn = inmapfn, nchunks = nchunks,
                   in_chunk_size = in_chunk_size, shardby = shardby, compress = compress,
                   overwrite = TRUE, header = header)
    dotdotdotorigarg = c(dotdotdot, origarg)

    pt <- proc.time()
    if(.progress) {
      message("-- Converting CSVs to disk.frame --")

      message(glue::glue("Converting {length(infile)} CSVs to {nchunks} disk.frames each consisting of {nchunks} chunks (Stage 1 of 2):"))
    }

    outdf_tmp = furrr::future_imap(infile, ~{
      dotdotdotorigarg1 = c(dotdotdotorigarg, list(outdir = file.path(tempdir(), .y), infile = .x))

      pryr::do_call(csv_to_disk.frame_data.table_backend, dotdotdotorigarg1)
    }, .progress = .progress)

    if(.progress) {
      message(paste("Stage 1 of 2 took:", data.table::timetaken(pt)))
      message(" ")
    }

    message(glue::glue("Row-binding the {nchunks} disk.frames together to form one large disk.frame (Stage 2 of 2):"))
    message(glue::glue("Creating the disk.frame at {outdir}"))
    pt2 <- proc.time()
    outdf = rbindlist.disk.frame(outdf_tmp, outdir = outdir, by_chunk_id = TRUE, compress = compress, overwrite = overwrite, .progress = .progress)

    if(.progress) {
      message(paste("Stage 2 of 2 took:", data.table::timetaken(pt2)))
      message(" ----------------------------------------------------- ")
      message(paste("Stage 1 & 2 in total took:", data.table::timetaken(pt)))
    }

    return(outdf)
  } else { # reading one file
    l = length(list.files(outdir))
    if(is.null(shardby)) {
      if(is.null(in_chunk_size)) {
        a = as.disk.frame(inmapfn(data.table::fread(infile, header = header, ...)), outdir, compress = compress, nchunks = nchunks, overwrite = overwrite, ...)
        return(a)
      } else {
        outdf = disk.frame(outdir)
        i <- 0
        tmpdir1 = tempfile(pattern = "df_tmp")
        fs::dir_create(tmpdir1)

        done = FALSE
        skiprows = 0
        column_names = ""
        while(!done) {
          if (identical(column_names, "")) {
            tmpdt = inmapfn(data.table::fread(
              infile,
              skip = skiprows, nrows = in_chunk_size, ...))
            column_names = names(tmpdt)
          } else {
            ddd = list(...)
            if ("col.names" %in% names(ddd)) {
              tmpdt = inmapfn(data.table::fread(
                infile,
                skip = skiprows, nrows = in_chunk_size,
                header = FALSE, ...))
            } else {
              tmpdt = inmapfn(data.table::fread(
                infile,
                skip = skiprows, nrows = in_chunk_size,
                header = FALSE, col.names = column_names, ...))
            }
          }

          i <- i + 1
          skiprows = skiprows + in_chunk_size +
            # skips the header as well but only at the first chunk
            ifelse(i == 1 & header, 1, 0)
          rows <- tmpdt[, .N]
          if(rows < in_chunk_size) {
            done <- TRUE
          }

          # add to chunk
          add_chunk(outdf, tmpdt)
          rm(tmpdt); gc()
        }

        message(glue::glue("read {in_chunk_size*(i-1) + rows} rows from {infile}"))

        # remove the files
        fs::dir_delete(tmpdir1)
      }
    } else { # so shard by some element
      if(is.null(in_chunk_size)) {
        return(
          shard(inmapfn(data.table::fread(infile, header = header, ...)), shardby = shardby, nchunks = nchunks, outdir = outdir, overwrite = overwrite, compress = compress, ...)
        )
      } else {
        i <- 0
        tmpdir1 = tempfile(pattern = "df_tmp")
        fs::dir_create(tmpdir1)

        done = FALSE
        skiprows = 0
        column_names = ""
        while(!done) {
          if (identical(column_names, "")) {
            tmpdt = inmapfn(data.table::fread(
              infile,
              skip = skiprows, nrows = in_chunk_size, ...))
            column_names = names(tmpdt)
          } else {
            ddd = list(...)
            if ("col.names" %in% names(ddd)) {
              tmpdt = inmapfn(data.table::fread(
                infile,
                skip = skiprows, nrows = in_chunk_size,
                header = FALSE, ...))
            } else {
              tmpdt = inmapfn(data.table::fread(
                infile,
                skip = skiprows, nrows = in_chunk_size,
                header = FALSE, col.names = column_names, ...))
            }
          }

          i <- i + 1
          skiprows = skiprows + in_chunk_size +
            # skips the header as well but only at the first chunk
            ifelse(i == 1 & header, 1, 0)
          rows <- tmpdt[, .N]
          if(rows < in_chunk_size) {
            done <- TRUE
          }

          tmp.disk.frame = shard(
            tmpdt,
            shardby = shardby,
            nchunks = nchunks,
            outdir = file.path(tmpdir1, i),
            overwrite = TRUE,
            compress = compress, ...)
          rm(tmpdt); gc()
        }

        message(glue::glue("read {in_chunk_size*(i-1) + rows} rows from {infile}"))

        # do not run this in parallel as the level above this is likely in parallel
        # ZJ:
        system.time(
          fnl_out <-
            rbindlist.disk.frame(
              lapply(
                list.files(
                  tmpdir1, full.names = TRUE), disk.frame),
              outdir = outdir, by_chunk_id = TRUE, parallel = FALSE, overwrite = overwrite, compress = compress))

        # remove the files
        fs::dir_delete(tmpdir1)
      }
    }
    df = disk.frame(outdir)
    df = add_meta(df, nchunks = disk.frame::nchunks(df), shardkey = shardby, shardchunks = nchunks, compress = compress)
    return(df)
  }
}
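# A sketch of chunk-wise CSV ingestion with the options documented above.
# "big.csv" and the "id" column are assumed for illustration; colClasses is
# forwarded to data.table::fread through `...`.
library(disk.frame)
setup_disk.frame()

big.df = csv_to_disk.frame(
  "big.csv",
  outdir        = tempfile(fileext = ".df"),
  in_chunk_size = 1e6,          # rows per read; independent of nchunks
  shardby       = "id",         # rows with the same id land in the same chunk
  backend       = "data.table",
  chunk_reader  = "bigreadr",
  colClasses    = list(character = "id")
)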
#' @param use.names for cmap_dfr's call to data.table::rbindlist. See data.table::rbindlist
#' @param fill for cmap_dfr's call to data.table::rbindlist. See data.table::rbindlist
#' @param idcol for cmap_dfr's call to data.table::rbindlist. See data.table::rbindlist
#' @param vars_and_pkgs variables and packages to send to a background session. This is typically automatically detected
#' @param .progress A logical, for whether or not to print a progress bar for multiprocess, multisession, and multicore plans. From {furrr}
#' @param ... for compatibility with `purrr::map`
#' @import fst
#' @importFrom purrr as_mapper map
#' @importFrom future.apply future_lapply
#' @export
#' @examples
#' cars.df = as.disk.frame(cars)
#'
#' # return the first row of each chunk lazily
#' cars2 = cmap(cars.df, function(chunk) {
#'  chunk[1, ]
#' })
#'
#' collect(cars2)
#'
#' # same as above but using purrr
#' cars2 = cmap(cars.df, ~.x[1,])
#'
#' collect(cars2)
#'
#' # return the first row of each chunk eagerly as a list
#' cmap(cars.df, ~.x[1,], lazy = FALSE)
#'
#' # return the first row of each chunk eagerly as a data.table/data.frame by row-binding
#' cmap_dfr(cars.df, ~.x[1,])
#'
#' # lazy and delayed are just aliases for cmap(..., lazy = TRUE)
#' collect(lazy(cars.df, ~.x[1,]))
#' collect(delayed(cars.df, ~.x[1,]))
#'
#' # clean up cars.df
#' delete(cars.df)
cmap <- function(.x, .f, ...) {
  UseMethod("cmap")
}


#' @rdname cmap
#' @importFrom future getGlobalsAndPackages
#' @export
cmap.disk.frame <- function(.x, .f, ..., outdir = NULL, keep = NULL, chunks = nchunks(.x), compress = 50, lazy = TRUE, overwrite = FALSE, vars_and_pkgs = future::getGlobalsAndPackages(.f, envir = parent.frame()), .progress = TRUE) {
  .f = purrr::as_mapper(.f)
  if(lazy) {
    attr(.x, "lazyfn") =
      c(
        attr(.x, "lazyfn"),
        list(
          list(
            func = .f,
            vars_and_pkgs = vars_and_pkgs,
            dotdotdot = list(...)
          )
        )
      )
    return(.x)
  }

  if(!is.null(outdir)) {
    overwrite_check(outdir, overwrite)
  }

  stopifnot(is_ready(.x))

  keep1 = attr(.x, "keep", exact = TRUE)

  if(is.null(keep)) {
    keep = keep1
  }

  path <- attr(.x, "path")
  files <- list.files(path, full.names = TRUE)
  files_shortname <- list.files(path)

  keep_future = keep

  cid = get_chunk_ids(.x, full.names = TRUE)

  dotdotdot = list(...)

  res = future.apply::future_lapply(1:length(files), function(ii, ...) {
    ds = disk.frame::get_chunk(.x, cid[ii], keep = keep_future, full.names = TRUE)

    res = .f(ds, ...)

    if(!is.null(outdir)) {
      if(nrow(res) == 0) {
        warning(glue::glue("The output chunk has 0 rows, therefore chunk {ii} NOT written"))
      } else {
        fst::write_fst(res, file.path(outdir, files_shortname[ii]), compress)
      }
      return(ii)
    } else {
      return(res)
    }
  }, ...)

  if(!is.null(outdir)) {
    return(disk.frame(outdir))
  } else {
    return(res)
  }
}

#' @export
#' @rdname cmap
cmap_dfr <- function(.x, .f, ..., .id = NULL) {
  UseMethod("cmap_dfr")
}

#' @export
#' @rdname cmap
cmap_dfr.disk.frame <- function(.x, .f, ..., .id = NULL, use.names = fill, fill = FALSE, idcol = NULL) {
  if(!is.null(.id)) {
    warning(".id is not NULL, but the parameter is not used with cmap_dfr.disk.frame")
  }

  # TODO warn the user if outdir is passed to cmap_dfr
  data.table::rbindlist(cmap.disk.frame(.x, .f, ..., outdir = NULL, lazy = FALSE), use.names = use.names, fill = fill, idcol = idcol)
}


#' @export
#' @rdname cmap
cimap <- function(.x, .f, ...) {
  UseMethod("cimap")
}

#' `cimap.disk.frame` accepts a two argument function where the first argument is a data.frame and the
#' second is the chunk ID
#' @export
#' @rdname cmap
cimap.disk.frame <- function(.x, .f, outdir = NULL, keep = NULL, chunks = nchunks(.x), compress = 50, lazy = TRUE, overwrite = FALSE, ...) {
  .f = purrr::as_mapper(.f)

  # TODO support lazy for cimap
  if(lazy) {
    stop("cimap.disk.frame: lazy = TRUE is not supported at this stage")
    attr(.x, "lazyfn") = c(attr(.x, "lazyfn"), .f)
    return(.x)
  }

  if(!is.null(outdir)) {
    overwrite_check(outdir, overwrite)
  }

  stopifnot(is_ready(.x))

  keep1 = attr(.x, "keep", exact = TRUE)

  if(is.null(keep)) {
    keep = keep1
  }

  path <- attr(.x, "path")
  files <- get_chunk_ids(.x)
  files_shortname <- list.files(path)

  keep_future = keep
  res = future.apply::future_lapply(1:length(files), function(ii) {
    ds = disk.frame::get_chunk(.x, files_shortname[ii], keep = keep_future)
    res = .f(ds, ii)
    if(!is.null(outdir)) {
      fst::write_fst(res, file.path(outdir, files_shortname[ii]), compress)
      return(ii)
    } else {
      return(res)
    }
  })

  if(!is.null(outdir)) {
    return(disk.frame(outdir))
  } else {
    return(res)
  }
}

#' @export
#' @rdname cmap
cimap_dfr <- function(.x, .f, ..., .id = NULL) {
  UseMethod("cimap_dfr")
}

#' @export
#' @rdname cmap
cimap_dfr.disk.frame <- function(.x, .f, ..., .id = NULL, use.names = fill, fill = FALSE, idcol = NULL) {
  if(!is.null(.id)) {
    warning(".id is not NULL, but the parameter is not used with cimap_dfr.disk.frame")
  }
  data.table::rbindlist(cimap.disk.frame(.x, .f, ..., lazy = FALSE), use.names = use.names, fill = fill, idcol = idcol)
}


#' `lazy` is a convenience function to apply `.f` to every chunk
#' @export
#' @rdname cmap
lazy <- function(.x, .f, ...) {
  UseMethod("lazy")
}

#' @rdname cmap
#' @export
lazy.disk.frame <- function(.x, .f, ...) {
  cmap.disk.frame(.x, .f, ..., lazy = TRUE)
}

#' `delayed` is an alias for lazy and is consistent with the naming in Dask and Dagger.jl
#' @export
#' @rdname cmap
delayed <- function(.x, .f, ...) {
  UseMethod("delayed")
}

#' @export
delayed.disk.frame <- function(.x, .f, ...) {
  cmap.disk.frame(.x, .f, ..., lazy = TRUE)
}

#' @export
#' @rdname cmap
chunk_lapply <- function (...) {
  warning("chunk_lapply is deprecated in favour of cmap.disk.frame")
  cmap.disk.frame(...)
}


#' Sample n rows from a disk.frame
#' @export
#' @importFrom dplyr sample_frac
#' @inheritParams dplyr::sample_frac
#' @rdname sample
#' @examples
#' cars.df = as.disk.frame(cars)
#'
#' collect(sample_frac(cars.df, 0.5))
#'
#' # clean up cars.df
#' delete(cars.df)
sample_frac.disk.frame <- function(tbl, size = 1, replace = FALSE, weight = NULL, .env = NULL, ...) {
  warning_msg = NULL
  if(!is.null(weight)) {
    warning_msg = "sample_frac: for disk.frames weight = is not supported"
    stop(warning_msg)
  } else if (!is.null(.env)) {
    warning_msg = "sample_frac: for disk.frames .env = is not supported"
    stop(warning_msg)
  }

  fn = disk.frame::create_chunk_mapper(dplyr::sample_frac)

  fn(tbl, size = size, replace = replace, ...)
}


.onLoad <- function(libname, pkgname) {

}

#' @importFrom future nbrOfWorkers
#' @importFrom crayon red blue green
.onAttach <- function(libname, pkgname) {
  #setup_disk.frame()

  packageStartupMessage(
    crayon::red(
      glue::glue(
        "\n\n## Message from disk.frame:
We have {future::nbrOfWorkers()} workers to use with disk.frame.
To change that, use setup_disk.frame(workers = n) or just setup_disk.frame() to use the defaults.")),
    crayon::green("\n\n
It is recommended that you run the following immediately to set up disk.frame with multiple workers in order to parallelize your operations:\n\n
```r
# this will set up disk.frame with multiple workers
setup_disk.frame()
# this will allow unlimited amounts of data to be passed from worker to worker
options(future.globals.maxSize = Inf)
```
\n\n"))
}

globalVariables(c(
  "sym",                                  # from dplyr
  "type", "size", "modification_time",    # used in bloomfilter
  "name", "agg_expr", "orig_code",        # used in gen_summ_code
  "syms",                                 # needed by dplyr to treat something as a symbol
  ".", ".BY", ".N", ".SD", ".out.disk.frame.id", ":=", "N",
  "area", "chunk_id", "coltypes", "coltypes.x", "coltypes.y", "ctot",
  "existing_df", "feature_s", "h", "height", "incompatible_types",
  "lag_height", "new_chunk", "ok", "pathA", "pathB", "w", "xid", "yid"))

#' @useDynLib disk.frame
#' @importFrom Rcpp evalCpp
#@exportPattern "^[[:alpha:]]+"
NULL


#' Fit generalized linear models (glm) with disk.frame
#'
#' @inherit biglm::bigglm
#'
#' @description
#' Fits GLMs using `{speedglm}` or `{biglm}`. The return object will be exactly
#' as returned by those functions. This is a convenience wrapper.
#'
#' @param glm_backend Which package to use for fitting GLMs. The default is
#'   "biglm", which has known issues with factor levels if different levels are
#'   present in different chunks. The "speedglm" option is more robust, but does
#'   not implement `predict`, which makes prediction on new data impossible.
#'
#' @family Machine Learning (ML)
#' @export
#'
#' @examples
#' cars.df = as.disk.frame(cars)
#' m = dfglm(dist ~ speed, data = cars.df)
#'
#' # can use normal R functions
#' # Only works in version > R 3.6
#' majorv = as.integer(version$major)
#' minorv = as.integer(strsplit(version$minor, ".", fixed=TRUE)[[1]][1])
#' if(((majorv == 3) & (minorv >= 6)) | (majorv > 3)) {
#'   summary(m)
#'   predict(m, get_chunk(cars.df, 1))
#'   predict(m, collect(cars.df))
#'   # can use broom to tidy up the returned info
#'   broom::tidy(m)
#' }
#'
#' # clean up
#' delete(cars.df)
dfglm <- function(formula, data, ..., glm_backend = c("biglm", "speedglm", "biglmm")) {
  glm_backend = match.arg(glm_backend)

  # compute the major version
  majorv = as.integer(version$major)
  minorv = as.integer(strsplit(version$minor, ".", fixed = TRUE)[[1]][1])

  if((majorv == 3) & (minorv < 6) & (glm_backend == "biglm")) {
    warning("{bigglm} is not supported for R versions below 3.6; auto switched to {speedglm}")
    glm_backend = "speedglm"
  }

  stopifnot(is_disk.frame(data))
  streaming_fn <- make_glm_streaming_fn(data)

  if(glm_backend == "speedglm") {
    if(!requireNamespace("speedglm")) {
      stop("speedglm package not installed. To install run `install.packages('speedglm')`")
    }
    speedglm::shglm(formula, streaming_fn, ...)
  } else if (glm_backend == "biglm") {
    if(!requireNamespace("biglm")) {
      stop("biglm package not installed. To install run `install.packages('biglm')`")
    }
    biglm::bigglm(formula, data = streaming_fn, ...)
  } else if (glm_backend == "biglmm") {
    if(!requireNamespace("biglmm")) {
      stop("biglmm package not installed. To install run `install.packages('biglmm')`")
    }
    biglmm::bigglm(formula, data = streaming_fn, ...)
  } else {
    stop("glm_backend must be one of 'speedglm' or 'biglm' or 'biglmm'")
  }
}


#' Show a progress bar of the action being performed
#' @importFrom utils txtProgressBar setTxtProgressBar
#' @param df a disk.frame
#' @noRd
progressbar <- function(df) {
  if(attr(df, "performing", exact = TRUE) == "hard_group_by") {
    # create progress bar
    shardby = "acct_id"
    fparent = attr(df, "parent", exact = TRUE)

    #tmp = file.path(fparent, ".performing", "inchunks")
    tmp = "tmphardgroupby2"

    l = length(list.files(fparent))
    pt_begin_split = proc.time()
    doprog <- function(pt_from, sleep = 1) {
      pb <- txtProgressBar(min = 0, max = l*1.5, style = 3)
      on.exit(close(pb))
      while(length(list.files(file.path(tmp, l))) < l) {
        wl = length(list.files(file.path(tmp, 1:l)))/l
        tt <- proc.time()[3] - pt_from[3]
        avg_speed = tt/wl
        pred_speed = avg_speed*(l-wl) + avg_speed*l/2
        elapsed = round(tt/60, 1)

        setTxtProgressBar(pb, length(list.files(file.path(tmp, l))),
                          title = sprintf("Group By - %s", shardby))
        Sys.sleep(sleep)
      }
    }
    doprog(pt_begin_split, 1)

    pt_begin_collate = proc.time()
    doprog2 <- function(pt_from, sleep = 1) {
      pb <- txtProgressBar(min = 0, max = l*1.5, style = 3)
      on.exit(close(pb))
      while(length(list.files("large_sorted")) < l) {
        wl = length(list.files("large_sorted"))
        tt <- proc.time()[3] - pt_from[3]
        avg_speed = tt/wl
        pred_speed = avg_speed*(l-wl)
        elapsed = round(tt/60, 1)

        setTxtProgressBar(pb, length(list.files("large_sorted")),
                          title = sprintf("Hard Group By - %s", shardby))
        Sys.sleep(sleep)
      }
    }
    doprog2(pt_begin_collate, 1)
  }
}


#' Perform a hard group
#' @description
#' A hard_group_by is a group by that also reorganizes the chunks to ensure that
#' every unique grouping of `by` is in the same chunk. Or in other words, every
#' row that shares the same `by` value will end up in the same chunk.
#' @param df a disk.frame
#' @param ... grouping variables
#' @param outdir the output directory
#' @param nchunks The number of chunks in the output. Defaults = nchunks.disk.frame(df)
#' @param overwrite overwrite the output directory
#' @param add same as dplyr::group_by
#' @param .drop same as dplyr::group_by
#' @param shardby_function splitting of chunks: "hash" for hash function or "sort" for semi-sorted chunks
#' @param sort_splits for the "sort" shardby function, a dataframe with the split values.
#' @param desc_vars for the "sort" shardby function, the variables to sort descending.
#' @param sort_split_sample_size for the "sort" shardby function, if sort_splits is null, the number of rows to sample per chunk for random splits.
#' @export
#' @examples
#' iris.df = as.disk.frame(iris, nchunks = 2)
#'
#' # group_by iris.df by Species and ensure rows with the same Species are in the same chunk
#' iris_hard.df = hard_group_by(iris.df, Species)
#'
#' get_chunk(iris_hard.df, 1)
#' get_chunk(iris_hard.df, 2)
#'
#' # clean up
#' delete(iris.df)
#' delete(iris_hard.df)
hard_group_by <- function(df, ..., add = FALSE, .drop = FALSE) {
  UseMethod("hard_group_by")
}

#' @rdname hard_group_by
#' @export
#' @importFrom dplyr group_by
hard_group_by.data.frame <- function(df, ..., add = FALSE, .drop = FALSE) {
  dplyr::group_by(df, ..., add = FALSE, .drop = FALSE)
}

#' @rdname hard_group_by
#' @importFrom purrr map
#' @export
hard_group_by.disk.frame <- function(
  df,
  ...,
  outdir = tempfile("tmp_disk_frame_hard_group_by"),
  nchunks = disk.frame::nchunks(df),
  overwrite = TRUE,
  shardby_function = "hash",
  sort_splits = NULL,
  desc_vars = NULL,
  sort_split_sample_size = 100
) {

  overwrite_check(outdir, overwrite)

  ff = list.files(attr(df, "path"))
  stopifnot(shardby_function %in% c("hash", "sort"))

  if (shardby_function == "sort" && is.null(sort_splits)) {
    # Sample enough per chunk to generate reasonable splits
    sample_size_per_chunk = ceiling(nchunks / disk.frame::nchunks(df)) * sort_split_sample_size

    # Sample and sort
    sort_splits_sample <- cmap(df, dplyr::sample_n, size = sample_size_per_chunk, replace = TRUE) %>%
      select(...) %>%
      collect()

    # NSE
    tryCatch({
      sort_splits_sample <- sort_splits_sample %>%
        arrange(!!!syms(...))
    }, error = function(e) {
      sort_splits_sample <- sort_splits_sample %>%
        arrange(...)
    })

    # If 100 chunks, this returns 99 splits based on percentiles.
    ntiles <- round((1:(nchunks-1)) * (nrow(sort_splits_sample) / (nchunks)))

    # Get splits. May lead to fewer than nchunks if duplicates are selected.
    sort_splits <- sort_splits_sample %>%
      dplyr::slice(ntiles) %>%
      distinct()
  }

  # test if unlisting ... will error
  tryCatch({
    # This will return the variable names

    # TODO use better ways to do NSE
    # the below will fail if indeed ... can not be list-ed
    # there should be a better way to do this
    by <- unlist(list(...))

    # shard and create temporary diskframes
    tmp_df = cmap(df, function(df1) {
      tmpdir = tempfile()
      shard(df1, shardby = by, nchunks = nchunks, outdir = tmpdir, overwrite = TRUE, shardby_function = shardby_function, sort_splits = sort_splits, desc_vars = desc_vars)
    }, lazy = FALSE)

    # now rbindlist
    res = rbindlist.disk.frame(tmp_df, outdir = outdir, overwrite = overwrite)

    # clean up the tmp dir
    purrr::walk(tmp_df, ~{
      fs::dir_delete(attr(.x, "path", exact = TRUE))
    })

    res1 <- NULL
    if(typeof(by) == "character") {
      eval(parse(text = glue::glue('res1 = chunk_group_by(res, {paste(by, collapse=",")})')))
    } else if(length(by) == 1) {
      res1 = res %>% dplyr::group_by({{by}})
    } else {
      eval(parse(text = glue::glue('res1 = chunk_group_by(res, {paste(by, collapse=",")})')))
    }

    res1
  }, error = function(e) {
    #message(e)
    # This will return the variable names
    by = rlang::enquos(...) %>%
      substr(2, nchar(.))

    # shard and create temporary diskframes
    tmp_df = cmap(df, function(df1) {
      tmpdir = tempfile()
      shard(df1, shardby = by, nchunks = nchunks, outdir = tmpdir, overwrite = TRUE, shardby_function = shardby_function, sort_splits = sort_splits, desc_vars = desc_vars)
    }, lazy = FALSE)

    # now rbindlist
    res = rbindlist.disk.frame(tmp_df, outdir = outdir, overwrite = overwrite)

    # clean up the tmp dir
    purrr::walk(tmp_df, ~{
      fs::dir_delete(attr(.x, "path", exact = TRUE))
    })

    res1 = res %>% chunk_group_by(!!!syms(by))

    res1
  })
}


#' Recommend number of chunks based on input size
#' @description Computes the recommended number of chunks to break a data.frame
#'   into. It can accept filesizes in bytes (as integer) or a data.frame
#' @param df a disk.frame or the file size in bytes of a CSV file holding the
#'   data
#' @param type only = "csv" is supported. It indicates the file type
#'   corresponding to file size `df`
#' @param minchunks the minimum number of chunks. Defaults to the number of CPU
#'   cores (without hyper-threading)
#' @param conservatism a multiplier to the recommended number of chunks. The
#'   more chunks the smaller the chunk size and the more likely that each chunk
#'   can fit into RAM
#' @param ram_size The amount of RAM available, which is usually computed. Except on RStudio with R3.6+
#' @importFrom pryr object_size
#' @importFrom utils memory.limit
#' @importFrom benchmarkme get_ram
#' @export
#' @examples
#' # recommend nchunks based on data.frame
#' recommend_nchunks(cars)
#'
#' # recommend nchunks based on file size; ONLY CSV is implemented at the moment
#' recommend_nchunks(1024^3)
recommend_nchunks <- function(df, type = "csv", minchunks = data.table::getDTthreads(), conservatism = 8, ram_size = df_ram_size()) {

  dfsize = 0
  if ("data.frame" %in% class(df)) {
    # the df's size in gigabytes
    dfsize = as.numeric(pryr::object_size(df))/1024/1024/1024
  } else if ("disk.frame" %in% class(df)) {
    return(nchunks(df))
  } else if (is.numeric(df) & type == "csv") {
    # assume that df is the estimated number of bytes of the data
    dfsize = df/1024/1024/1024
  } else {
    dfsize = df/1024/1024/1024
  }

  # the number of physical cores; hyper-threaded cores are counted as 1, not 2
  nc = data.table::getDTthreads() #parallel::detectCores(logical = FALSE)

  max(round(dfsize/ram_size*conservatism)*nc, minchunks)
}
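# Sketches of cmap's lazy vs eager modes defined above. Lazily-attached
# functions stack up in attr(.x, "lazyfn") and only run at collect() time.
library(dplyr)
library(disk.frame)
cars.df = as.disk.frame(cars)

lazy_chain = cmap(cars.df, ~.x[1, ]) %>%   # recorded, not yet run
  cmap(~.x * 2)                            # second function stacks on top
collect(lazy_chain)                        # both run here, chunk by chunk

eager_list = cmap(cars.df, ~.x[1, ], lazy = FALSE)    # list of per-chunk results
eager_df   = cmap(cars.df, ~.x[1, ], lazy = FALSE,
                  outdir = tempfile(fileext = ".df"))  # written out as a new disk.frame
delete(eager_df)
delete(cars.df)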
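# cmap_dfr/cimap_dfr row-bind the per-chunk results via data.table::rbindlist;
# the idcol pass-through shown above tags each row with the chunk it came from.
# The column names here are illustrative.
library(disk.frame)
cars.df = as.disk.frame(cars, nchunks = 2)

cmap_dfr(cars.df, ~.x[1, ], idcol = "chunk")   # one row per chunk, tagged

# the cimap_* variants also pass the chunk id as the second argument of .f
cimap_dfr(cars.df, function(chunk, id) data.frame(id = id, rows = nrow(chunk)))

delete(cars.df)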
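# A sketch of the streaming interface dfglm builds on: biglm::bigglm accepts a
# data *function* that returns the next block of rows (and NULL when the data
# is exhausted), which is what make_glm_streaming_fn provides for a disk.frame.
library(disk.frame)
cars.df = as.disk.frame(cars)

m = dfglm(dist ~ speed, data = cars.df, glm_backend = "biglm")
predict(m, collect(cars.df))   # biglm supports predict; speedglm's shglm does not

delete(cars.df)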
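# The two sharding strategies of hard_group_by above: "hash" (the default)
# scatters groups by hashing the key, while "sort" samples rows to pick split
# points so the output chunks come out semi-sorted.
library(disk.frame)
iris.df = as.disk.frame(iris, nchunks = 2)

by_hash.df = hard_group_by(iris.df, Species)
by_sort.df = hard_group_by(iris.df, Species,
                           shardby_function = "sort",
                           sort_split_sample_size = 50)

delete(iris.df); delete(by_hash.df); delete(by_sort.df)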
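# A worked example of the sizing rule in recommend_nchunks above:
#   max(round(dfsize/ram_size * conservatism) * nc, minchunks)
# Assuming a 16 GB machine with 4 data.table threads: a 2 GB file gives
# round(2/16 * 8) * 4 = 4 chunks (the minchunks floor), while a 32 GB file
# gives round(32/16 * 8) * 4 = 64 chunks. Actual output depends on your RAM
# and thread count.
recommend_nchunks(2 * 1024^3)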
#' Move or copy a disk.frame to another location
#'
#' @param df The disk.frame
#' @param outdir The new location
#' @param copy Merely copy and not move
#' @param ... NOT USED
#'
#' @return a disk.frame
#' @export
#' @examples
#' cars.df = as.disk.frame(cars)
#'
#' cars_copy.df = copy_df_to(cars.df, outdir = tempfile(fileext=".df"))
#'
#' cars2.df = move_to(cars.df, outdir = tempfile(fileext=".df"))
#'
#' # clean up
#' delete(cars_copy.df)
#' delete(cars2.df)
move_to <- function(df, outdir, ..., copy = FALSE) {
  if(!fs::dir_exists(outdir)) {
    fs::dir_create(outdir)
  }

  if(!copy %in% c(TRUE, FALSE)) {
    stop("disk.frame::move_to ERROR: copy argument must be TRUE or FALSE")
  }

  ## copy all files over
  listfiles = list.files(attr(df, "path", exact = TRUE), full.names = TRUE)
  shortlistfiles = list.files(attr(df, "path", exact = TRUE))
  purrr::walk2(listfiles, shortlistfiles, ~{
    if(copy) {
      fs::file_copy(.x, file.path(outdir, .y))
    } else {
      fs::file_move(.x, file.path(outdir, .y))
    }
  })

  ## copy .metadata over
  fs::dir_create(file.path(outdir, ".metadata"))

  metadata_path = file.path(attr(df, "path", exact = TRUE), ".metadata")

  listfiles = list.files(metadata_path, full.names = TRUE)
  shortlistfiles = list.files(metadata_path)
  purrr::walk2(listfiles, shortlistfiles, ~{
    if(copy) {
      fs::file_copy(.x, file.path(outdir, ".metadata", .y))
    } else {
      fs::file_move(.x, file.path(outdir, ".metadata", .y))
    }
  })

  if(!copy) {
    delete(df)
  }

  disk.frame(outdir)
}

#' @rdname move_to
#' @export
copy_df_to <- function(df, outdir, ...) {
  move_to(df, outdir, ..., copy = TRUE)
}
- #' Show a progress bar of the action being performed
+ fs::dir_delete(attr(.x, "path", exact=TRUE)) |
|
2 | +181 |
- #' @importFrom utils txtProgressBar setTxtProgressBar
+ }) |
|
3 | +182 |
- #' @param df a disk.frame
+ |
|
4 | +183 |
- #' @noRd
+
|
|
5 | -+ | ||
184 | +51x |
- progressbar <- function(df) {+ res1 <- NULL |
|
6 | -! | +||
185 | +51x |
- if(attr(df,"performing") == "hard_group_by") {+ if(typeof(by) == "character") { |
|
7 | -+ | ||
186 | +51x |
- # create progress bar
+ eval(parse(text = glue::glue('res1 = chunk_group_by(res, {paste(by,collapse=",")})'))) |
|
8 | -+ | ||
187 | +! |
- + } else if(length(by) == 1) { |
|
9 | +188 | ! |
- shardby = "acct_id"+ res1 = res %>% dplyr::group_by({{by}}) |
10 | +189 |
- #list.files(
+ } else { |
|
11 | +190 | ! |
- fparent = attr(df,"parent")+ eval(parse(text = glue::glue('res1 = chunk_group_by(res, {paste(by,collapse=",")})'))) |
12 | +191 |
- + }
|
|
13 | +192 |
- #tmp = file.path(fparent,".performing","inchunks")
+ |
|
14 | -! | +||
193 | +51x | +
+ res1
+ |
+ |
194 | +51x |
- tmp = "tmphardgroupby2"+ }, error = function(e) { |
|
15 | +195 |
- + #message(e)
|
|
16 | -! | +||
196 | +
- l = length(list.files(fparent))+ # This will return the variable names
|
||
17 | +197 | ! |
- pt_begin_split = proc.time()+ by = rlang::enquos(...) %>% |
18 | +198 | ! |
- doprog <- function(pt_from, sleep = 1) {+ substr(2, nchar(.)) |
19 | +199 |
- #tkpb = winProgressBar(title = sprintf("Hard Group By Stage 1(/2) - %s", shardby), label = "Checking completeness",
+ |
|
20 | +200 |
- # min = 0, max = l*1.5, initial = 0, width = 500)
+
|
|
21 | -! | +||
201 | +
- pb <- txtProgressBar(min = 0, max = l*1.5, style = 3)+ |
||
22 | +202 |
- + # shard and create temporary diskframes
|
|
23 | +203 | ! |
- on.exit(close(pb))- |
-
24 | -- |
- # on.exit(close(tkpb))
+ tmp_df = cmap(df, function(df1) { |
|
25 | +204 | ! |
- while(length(list.files(file.path(tmp,l))) < l) {+ tmpdir = tempfile() |
26 | +205 | ! |
- wl = length(list.files(file.path(tmp,1:l)))/l+ shard(df1, shardby = by, nchunks = nchunks, outdir = tmpdir, overwrite = TRUE, shardby_function=shardby_function, sort_splits=sort_splits, desc_vars=desc_vars) |
27 | +206 | ! |
- tt <- proc.time()[3] - pt_from[3]+ }, lazy = FALSE) |
28 | +207 |
- #list.files(
+ |
|
29 | -! | +||
208 | +
- avg_speed = tt/wl+ # now rbindlist
|
||
30 | +209 | ! |
- pred_speed = avg_speed*(l-wl) + avg_speed*l/2+ res = rbindlist.disk.frame(tmp_df, outdir=outdir, overwrite = overwrite) |
31 | -! | +||
210 | +
- elapsed = round(tt/60,1)+ |
||
32 | +211 |
- + # clean up the tmp dir
|
|
33 | -+ | ||
212 | +! |
- #setWinProgressBar(tkpb, wl,
+ purrr::walk(tmp_df, ~{+ |
+ |
213 | +! | +
+ fs::dir_delete(attr(.x, "path", exact=TRUE)) |
|
34 | +214 |
- # title = sprintf("Hard Group By Stage 1(/2) - %s", shardby),
+ }) |
|
35 | +215 |
- # label = sprintf("%.0f out of %d; avg speed %.2f mins; elapsed %.1f mins; another %.1f mins", wl,l, round(avg_speed/60,2), elapsed, round(pred_speed/60,2)))
+ |
|
36 | +216 | ! |
- setTxtProgressBar(pb, length(list.files(file.path(tmp,l))),+ res1 = res %>% chunk_group_by(!!!syms(by)) |
37 | -! | +||
217 | +
- title = sprintf("Group By - %s", shardby))+ |
||
38 | +218 | ! |
- Sys.sleep(sleep)+ res1
|
39 | +219 |
- }
+ }) |
|
40 | +220 |
- }
- |
- |
41 | -! | -
- doprog(pt_begin_split, 1)+ }
|
42 | +1 |
- - |
- |
43 | -! | -
- pt_begin_collate = proc.time()+ #' Move or copy a disk.frame to another location
|
|
44 | -! | +||
2 | +
- doprog2 <- function(pt_from, sleep = 1) {+ #'
|
||
45 | +3 |
- # tkpb = winProgressBar(title = sprintf("Hard Group By - %s -- Stage 2 (of 2) collating", shardby), label = "Checking completeness",
+ #' @param df The disk.frame
|
|
46 | +4 |
- # min = 0, max = l*1.5, initial = 0, width = 600)
+ #' @param outdir The new location
|
|
47 | -! | +||
5 | +
- pb <- txtProgressBar(min = 0, max = l*1.5, style = 3)+ #' @param copy Merely copy and not move
|
||
48 | +6 |
- + #' @param ... NOT USED
|
|
49 | -! | +||
7 | +
- on.exit(close(pb))+ #'
|
||
50 | +8 |
- # on.exit(close(tkpb))
+ #' @return a disk.frame
|
|
51 | -! | +||
9 | +
- while(length(list.files("large_sorted")) < l) {+ #' @export
|
||
52 | -! | +||
10 | +
- wl = length(list.files("large_sorted"))+ #' @examples
|
||
53 | -! | +||
11 | +
- tt <- proc.time()[3] - pt_from[3]+ #' cars.df = as.disk.frame(cars)
|
||
54 | +12 |
- #list.files(
+ #'
|
|
55 | -! | +||
13 | +
- avg_speed = tt/wl+ #' cars_copy.df = copy_df_to(cars.df, outdir = tempfile(fileext=".df"))
|
||
56 | -! | +||
14 | +
- pred_speed = avg_speed*(l-wl)+ #'
|
||
57 | -! | +||
15 | +
- elapsed = round(tt/60,1)+ #' cars2.df = move_to(cars.df, outdir = tempfile(fileext=".df"))
|
||
58 | +16 |
- + #'
|
|
59 | +17 |
- # setWinProgressBar(tkpb, l + wl/2,
+ #' # clean up
|
|
60 | +18 |
- # title = sprintf("Hard Group By - %s -- Stage 2 (of 2) collating -- %.0f out of %d chunks processed;", shardby, wl, l),
+ #' delete(cars_copy.df)
|
|
61 | +19 |
- # label = sprintf("avg %.2f min/chunk; %.1f mins elapsed; %.1f mins remaining;", round(avg_speed/60,2), elapsed, round(pred_speed/60,2)))
+ #' delete(cars2.df)
|
|
62 | -! | +||
20 | +
- setTxtProgressBar(pb, length(list.files("large_sorted")),+ move_to <- function(df, outdir, ..., copy = FALSE) { |
||
63 | +21 | ! |
- title = sprintf("Hard Group By - %s", shardby))+ if(!fs::dir_exists(outdir)) { |
64 | +22 | ! |
- Sys.sleep(sleep)+ fs::dir_create(outdir) |
65 | +23 |
- }
+ }
|
|
66 | +24 |
- }
+ |
|
67 | +25 | ! |
- doprog2(pt_begin_collate, 1)+ if(!copy %in% c(TRUE, FALSE)) { |
68 | -+ | ||
26 | +! |
- }
+ stop("disk.frame::move_to ERROR: copy argument must be TRUE or FALSE") |
|
69 | +27 |
- }
+ }
|
|
70 | +28 |
-
+ |
|
71 | +29 |
- #' Perform a hard group
+ ## copy all files over
|
|
72 | -+ | ||
30 | +! |
- #' @description
+ listfiles = list.files(attr(df,"path", exact=TRUE), full.names = TRUE) |
|
73 | -+ | ||
31 | +! |
- #' A hard_group_by is a group by that also reorganizes the chunks to ensure that
+ shortlistfiles = list.files(attr(df,"path", exact=TRUE)) |
|
74 | -+ | ||
32 | +! |
- #' every unique grouping of `by`` is in the same chunk. Or in other words, every
+ purrr::walk2(listfiles, shortlistfiles, ~{ |
|
75 | -+ | ||
33 | +! |
- #' row that share the same `by` value will end up in the same chunk.
+ if(copy) { |
|
76 | -+ | ||
34 | +! |
- #' @param df a disk.frame
+ fs::file_copy(.x, file.path(outdir, .y)) |
|
77 | +35 |
- #' @param ... grouping variables
+ } else { |
|
78 | -+ | ||
36 | +! |
- #' @param outdir the output directory
+ fs::file_move(.x, file.path(outdir, .y)) |
|
79 | +37 |
- #' @param nchunks The number of chunks in the output. Defaults = nchunks.disk.frame(df)
+ }
|
|
80 | +38 |
- #' @param overwrite overwrite the out put directory
+ }) |
|
81 | +39 |
- #' @param add same as dplyr::group_by
+
|
|
82 | +40 |
- #' @param .drop same as dplyr::group_by
+ ## copy .metadata over
|
|
83 | -+ | ||
41 | +! |
- #' @param shardby_function splitting of chunks: "hash" for hash function or "sort" for semi-sorted chunks
+ fs::dir_create(file.path(outdir, ".metadata")) |
|
84 | +42 |
- #' @param sort_splits for the "sort" shardby function, a dataframe with the split values.
+ |
|
85 | -+ | ||
43 | +! |
- #' @param desc_vars for the "sort" shardby function, the variables to sort descending.
+ metadata_path = file.path(attr(df,"path", exact=TRUE), ".metadata") |
|
86 | +44 |
- #' @param sort_split_sample_size for the "sort" shardby function, if sort_splits is null, the number of rows to sample per chunk for random splits.
+ |
|
87 | -+ | ||
45 | +! |
- #' @export
+ listfiles = list.files(metadata_path, full.names = TRUE) |
|
88 | -+ | ||
46 | +! |
- #' @examples
+ shortlistfiles = list.files(metadata_path) |
|
89 | -+ | ||
47 | +! |
- #' iris.df = as.disk.frame(iris, nchunks = 2)
+ purrr::walk2(listfiles, shortlistfiles, ~{ |
|
90 | -+ | ||
48 | +! |
- #'
+ if(copy) { |
|
91 | -+ | ||
49 | +! |
- #' # group_by iris.df by specifies and ensure rows with the same specifies are in the same chunk
+ fs::file_copy(.x, file.path(outdir, ".metadata", .y)) |
|
92 | +50 |
- #' iris_hard.df = hard_group_by(iris.df, Species)
+ } else { |
|
93 | -+ | ||
51 | +! |
- #'
+ fs::file_move(.x, file.path(outdir, ".metadata", .y)) |
|
94 | +52 |
- #' get_chunk(iris_hard.df, 1)
+ }
|
|
95 | +53 |
- #' get_chunk(iris_hard.df, 2)
+ }) |
|
96 | +54 |
- #'
+ |
|
97 | -+ | ||
55 | +! |
- #' # clean up cars.df
+ if(!copy) { |
|
98 | -+ | ||
56 | +! |
- #' delete(iris.df)
+ delete(df) |
|
99 | +57 |
- #' delete(iris_hard.df)
+ }
|
|
100 | +58 |
- hard_group_by <- function(df, ..., add = FALSE, .drop = FALSE) {+ |
|
101 | -55x | +||
59 | +! |
- UseMethod("hard_group_by")+ disk.frame(outdir) |
|
102 | +60 |
}
|
|
103 | -- | - - | -|
104 | +61 |
- #' @rdname hard_group_by
+
|
|
105 | +62 |
- #' @export
+ #' @rdname move_to
|
|
106 | +63 |
- #' @importFrom dplyr group_by
+ #' @export
|
|
107 | +64 |
- hard_group_by.data.frame <- function(df, ..., add = FALSE, .drop = FALSE) {+ copy_df_to <- function(df, outdir, ...) { |
|
108 | -2x | +||
65 | +! |
- dplyr::group_by(df, ..., add = FALSE, .drop = FALSE)+ move_to(df, outdir, ..., copy = TRUE) |
|
109 | +66 |
}
|
-
-#' @rdname hard_group_by
-#' @importFrom purrr map
-#' @export
-hard_group_by.disk.frame <- function(
-  df,
-  ...,
-  outdir=tempfile("tmp_disk_frame_hard_group_by"),
-  nchunks = disk.frame::nchunks(df),
-  overwrite = TRUE,
-  shardby_function="hash",
-  sort_splits=NULL,
-  desc_vars=NULL,
-  sort_split_sample_size=100
-  ) {
-
-  overwrite_check(outdir, overwrite)
-
-  ff = list.files(attr(df, "path"))
-  stopifnot(shardby_function %in% c("hash", "sort"))
-
-  if (shardby_function == "sort" && is.null(sort_splits)){
-    # Sample enough per chunk to generate reasonable splits
-    sample_size_per_chunk = ceiling(nchunks / disk.frame::nchunks(df)) * sort_split_sample_size
-
-    # Sample and sort
-    sort_splits_sample <- map(df, dplyr::sample_n, size=sample_size_per_chunk, replace=TRUE) %>%
-      select(...) %>%
-      collect()
-
-    # NSE
-    tryCatch({
-      sort_splits_sample <- sort_splits_sample %>%
-        arrange(!!!syms(...))
-    }, error = function(e) {
-      sort_splits_sample <- sort_splits_sample %>%
-        arrange(...)
-    })
-
-    # If 100 chunks, this return get 99 splits based on percentiles.
-    ntiles <- round((1:(nchunks-1)) * (nrow(sort_splits_sample) / (nchunks)))
-
-    # Get splits. May lead to less than nchunks if duplicates are selected.
-    sort_splits <- sort_splits_sample %>%
-      dplyr::slice(ntiles) %>%
-      distinct()
-  }
-
-  # test if the unlist it will error
-
-  tryCatch({
-    # This will return the variable names
-
-    # TODO use better ways to do NSE
-    # the below will fail if indeed ... can not be list-ed
-    # there should be a better way to do this
-    by <- unlist(list(...))
-
-    # shard and create temporary diskframes
-    tmp_df = map(df, function(df1) {
-      tmpdir = tempfile()
-      shard(df1, shardby = by, nchunks = nchunks, outdir = tmpdir, overwrite = TRUE, shardby_function=shardby_function, sort_splits=sort_splits, desc_vars=desc_vars)
-    }, lazy = FALSE)
-
-    # now rbindlist
-    res = rbindlist.disk.frame(tmp_df, outdir=outdir, overwrite = overwrite)
-
-    # clean up the tmp dir
-    purrr::walk(tmp_df, ~{
-      fs::dir_delete(attr(.x, "path"))
-    })
-
-    res1 <- NULL
-    if(typeof(by) == "character") {
-      eval(parse(text = glue::glue('res1 = chunk_group_by(res, {paste(by,collapse=",")})')))
-    } else if(length(by) == 1) {
-      res1 = res %>% dplyr::group_by({{by}})
-    } else {
-      eval(parse(text = glue::glue('res1 = chunk_group_by(res, {paste(by,collapse=",")})')))
-    }
-
-    res1
-  }, error = function(e) {
-    #message(e)
-    # This will return the variable names
-    by = rlang::enquos(...) %>%
-      substr(2, nchar(.))
-
-    # shard and create temporary diskframes
-    tmp_df = map(df, function(df1) {
-      ##browser
-      tmpdir = tempfile()
-      shard(df1, shardby = by, nchunks = nchunks, outdir = tmpdir, overwrite = TRUE, shardby_function=shardby_function, sort_splits=sort_splits, desc_vars=desc_vars)
-    }, lazy = FALSE)
-
-    # now rbindlist
-    res = rbindlist.disk.frame(tmp_df, outdir=outdir, overwrite = overwrite)
-
-    # clean up the tmp dir
-    purrr::walk(tmp_df, ~{
-      fs::dir_delete(attr(.x, "path"))
-    })
-
-    res1 = res %>% chunk_group_by(!!!syms(by))
-
-    res1
-  })
-}
-
-#' Compute without writing
-#' @description
-#' Perform the computation; same as calling map without .f and lazy = FALSE
-#' @param x a disk.frame
-#' @param outdir the output directory
-#' @param overwrite whether to overwrite or not
-#' @param name Not used. Kept for compatibility with dplyr
-#' @param ... Not used. Kept for dplyr compatibility
-#' @export
-#' @importFrom dplyr compute
-#' @examples
-#' cars.df = as.disk.frame(cars)
-#' cars.df2 = cars.df %>% map(~.x)
-#' # the computation is performed and the data is now stored elsewhere
-#' cars.df3 = compute(cars.df2)
-#'
-#' # clean up
-#' delete(cars.df)
-#' delete(cars.df3)
-compute.disk.frame <- function(x, name, outdir = tempfile("tmp_df_", fileext=".df"), overwrite = TRUE, ...) {
-  overwrite_check(outdir, overwrite)
-  write_disk.frame(x, outdir = outdir, overwrite = TRUE)
-}
+#' Recommend number of chunks based on input size
+#' @description Computes the recommended number of chunks to break a data.frame
+#' into. It can accept filesizes in bytes (as integer) or a data.frame
+#' @param df a disk.frame or the file size in bytes of a CSV file holding the
+#' data
+#' @param type only = "csv" is supported. It indicates the file type
+#' corresponding to file size `df`
+#' @param minchunks the minimum number of chunks. Defaults to the number of CPU
+#' cores (without hyper-threading)
+#' @param conservatism a multiplier to the recommended number of chunks. The
+#' more chunks the smaller the chunk size and more likely that each chunk can
+#' fit into RAM
+#' @param ram_size The amount of RAM available which is usually computed. Except on RStudio with R3.6+
+#' @importFrom pryr object_size
+#' @importFrom utils memory.limit
+#' @importFrom benchmarkme get_ram
+#' @export
+#' @examples
+#' # recommend nchunks based on data.frame
+#' recommend_nchunks(cars)
+#'
+#' # recommend nchunks based on file size ONLY CSV is implemented at the moment
+#' recommend_nchunks(1024^3)
+recommend_nchunks <- function(df, type = "csv", minchunks = data.table::getDTthreads(), conservatism = 8, ram_size = df_ram_size()) {
+
+  dfsize = 0
+  if ("data.frame" %in% class(df)) {
+    # the df's size in gigabytes
+    dfsize = as.numeric(pryr::object_size(df))/1024/1024/1024
+  } else if ("disk.frame" %in% class(df)) {
+    return(nchunks(df))
+  } else if (is.numeric(df) & type == "csv") {
+    # assume that df is the estimated number of bytes of the data
+    dfsize = df/1024/1024/1024
+  } else {
+    dfsize = df/1024/1024/1024
+  }
+
+  # ram_size = df_ram_size()
+
+  # the number physical cores not counting hyper threaded ones as 2; they are counted as 1
+  nc = data.table::getDTthreads() #parallel::detectCores(logical = FALSE)
+
+  max(round(dfsize/ram_size*conservatism)*nc, minchunks)
+}
+
+
+#' Get the size of RAM in gigabytes
+#'
+#' @return integer of RAM in gigabyte (GB)
+#' @export
+#' @importFrom bit64 as.integer64.character
+#' @examples
+#' # returns the RAM size in gigabyte (GB)
+#' df_ram_size()
+df_ram_size <- function() {
+  #browser()
+  tryCatch({
+    ram_size = NULL
+    # the amount of memory available in gigabytes
+    if (Sys.info()[["sysname"]] == "Windows") {
+      majorv = as.integer(version$major)
+      minorv = as.integer(strsplit(version$minor, ".", fixed=TRUE)[[1]][1])
+      if((majorv>=3 & minorv >= 6) | majorv >= 4) {
+        ram_size <- system("wmic MemoryChip get Capacity", intern=TRUE) %>%
+          map(~strsplit(.x, " ")) %>%
+          unlist %>%
+          map(~bit64::as.integer64.character(.x)/1024^3) %>%
+          unlist %>%
+          sum(na.rm=TRUE)
+      } else {
+        ram_size = memory.limit()/1024
+      }
+
+      if(.Platform$GUI == "RStudio") {
+        if(is.null(ram_size) | is.na(ram_size)) {
+          message("You are running RStudio with R 3.6+ on Windows. There is a bug with RAM size detection.")
+          message("And disk.frame can't determine your RAM size using manual methods.")
+          message("Going to assume your RAM size is 16GB (gigabyte). The program will continue to run.")
+          message("")
+          message("")
+          message("Please report a bug at https://github.com/xiaodaigh/disk.frame/issues")
+          message("Include this in your bug report:")
+          message(system("wmic MemoryChip get Capacity", intern=TRUE))
+          message("")
+          message("")
+          #message("The option disk.frame.ram_size is not set.
+          #message("To set the ram_size, do options(disk.frame_ram_size = your_ram_size_in_gigabytes)")
+          ram_size = 16
+        }
+      }
+    } else {
+      os = R.version$os
+      if (length(grep("^darwin", os))) {
+        a = substring(system("sysctl hw.memsize", intern = TRUE), 13)
+      } #else {
+      # This would work but is not allowed by CRAN
+      #a = system('grep MemTotal /proc/meminfo', intern = TRUE)
+      #}
+      l = strsplit(a, " ")[[1]]
+      l = as.numeric(l[length(l)-1])
+      ram_size = l/1024^2
+    }
+
+    if(is.null(ram_size)) {
+      warning("RAM size not detected. Assumme you have at least 16GB of RAM")
+      ram_size = 16
+    } else if(is.na(ram_size)) {
+      warning("RAM size not detected. Assumme you have at least 16GB of RAM")
+      ram_size = 16
+    }
+    # assume at least 1G of RAM
+    ram_size = max(ram_size, 1, na.rm = TRUE)
+
+    return(ram_size)
+  }, error = function(e) {
+    if(requireNamespace("benchmarkme")) {
+      ram_size = benchmarkme::get_ram()/1024^3
+
+      if(is.na(ram_size)) {
+        warning("RAM size can't be determined. Assume you have 16GB of RAM.")
+        warning("Please report this error at github.com/xiaodaigh/disk.frame/issues")
+        warning(glue::glue("Please include your operating system, R version, and if using RStudio the Rstudio version number"))
+        return(16)
+      } else {
+        ram_size = max(ram_size, 1, na.rm = TRUE)
+        return(ram_size)
+      }
+    } else{
+      warning("RAM size can't be determined. Assume you have 16GB of RAM.")
+      warning("You can try to `install.packages('benchmarkme')` as that may help determine RAM size")
+      return(16)
+    }
+  })
+}
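A worked example (illustrative, not part of the patch) of the sizing rule `max(round(dfsize/ram_size*conservatism)*nc, minchunks)` shown above:

# an 8 GB CSV on a 16 GB machine, assuming data.table::getDTthreads() == 4:
# round(8/16 * 8) * 4 = 16, so 16 chunks are recommended
recommend_nchunks(8 * 1024^3, ram_size = 16)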
@@ -16520,361 +20188,305 @@ disk.frame coverage - 51.96%
-#@importFrom readr DataFrameCallback
-#' @noRd
-#' @noMd
-csv_to_disk.frame_readr <- function(infile, outdir = tempfile(fileext = ".df"), inmapfn = base::I, nchunks = recommend_nchunks(sum(file.size(infile))),
-                                    in_chunk_size = NULL, shardby = NULL, compress=50, overwrite = TRUE, col_names = TRUE, .progress = TRUE, delim=",", ...) {
-  overwrite_check(outdir, overwrite)
-  #
-  df = disk.frame(outdir)
-
-  # TODO check header
-  if(is.null(in_chunk_size)) {
-    if(is.null(shardby)) {
-      return(
-        as.disk.frame(
-          readr::read_delim(infile, delim = delim, ...) %>% inmapfn,
-          outdir = outdir,
-          overwrite = TRUE,
-          nchunks = nchunks,
-          compress = compress,
-          ...
-        )
-      )
-    } else {
-      return(shard(readr::read_delim(infile, delim = delim, ...), shardby = shardby, outdir = outdir, nchunks = nchunks, overwrite = TRUE,...))
-    }
-  } else {
-    if(!is.null(shardby)) {
-      tmp_dir = tempfile()
-      overwrite_check(tmp_dir, TRUE)
-      df_tmp = disk.frame(tmp_dir)
-      f = function(x, pos) {
-        add_chunk(df_tmp, inmapfn(x))
-      }
-
-      message("csv_to_disk.frame reader backend: Stage 1/1 -- reading file")
-      readr::read_delim_chunked(infile, readr::SideEffectChunkCallback$new(f), chunk_size = in_chunk_size, delim = delim, ...)
-
-      message(glue::glue("csv_to_disk.frame reader backend: Stage 2/2 -- performing shardby {shardby}"))
-      df = shard(df_tmp,shardby = shardby,outdir = outdir, nchunks = nchunks, overwrite = TRUE)
-      df = add_meta(df_tmp, nchunks=disk.frame::nchunks(df), shardkey = shardby, shardchunks = nchunks, compress = compress)
-      df
-    } else {
-      f = function(x, pos) {
-        add_chunk(df, inmapfn(x))
-      }
-      readr::read_delim_chunked(infile, readr::SideEffectChunkCallback$new(f), chunk_size = in_chunk_size, delim = delim, ...)
-      df = add_meta(df, nchunks=disk.frame::nchunks(df), shardkey = shardby, shardchunks = nchunks, compress = compress)
-      df
-    }
-  }
-}
+#' Check if the outdir exists or not
+#' @description
+#' If the overwrite is TRUE then the folder will be deleted, otherwise the folder will be created.
+#' @param outdir the output directory
+#' @param overwrite TRUE or FALSE; if `outdir` exists and overwrite = FALSE then throw an error
+#' @import fs
+#' @importFrom glue glue
+#' @export
+#'
+#' @examples
+#' tf = tempfile()
+#' overwrite_check(tf, overwrite = FALSE)
+#' overwrite_check(tf, overwrite = TRUE)
+#'
+#' # clean up
+#' fs::dir_delete(tf)
+overwrite_check <- function(outdir, overwrite) {
+  ##browser
+  if (is.null(outdir)) {
+    warning("outdir is NULL; no overwrite check is performed")
+    return(NULL)
+  }
+
+  if(overwrite & fs::dir_exists(outdir)) {
+    if(!is_disk.frame(outdir)) {
+      stop(glue::glue("The directory is {outdir} is not a disk.frame folder. Execution has stopped to prevent accidental deletion of potentially important files"))
+    }
+    if(length(fs::dir_ls(outdir, all = TRUE)) != 0) {
+      tryCatch({
+        fs::dir_delete(outdir)
+      }, error = function(e) {
+        message(e)
+        stop(glue::glue("Failed to delete the directory {outdir} in preparation for overwrite, this could be due to many reason and may be a genuine bug. Firstly, though, please ensure you do not have the folder open by Explorer (Windows) or other file management systems"))
+      })
+    }
+
+    fs::dir_create(outdir)
+  } else if(overwrite == FALSE & fs::dir_exists(outdir)) {
+    stop(glue::glue("overwrite = FALSE and outdir '{outdir}' already exists"))
+  } else {
+    fs::dir_create(outdir)
+  }
+}
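A usage sketch (illustrative, not part of the patch) of the guard's two paths above:

td = tempfile()
overwrite_check(td, overwrite = TRUE)        # folder does not exist yet: it is created
try(overwrite_check(td, overwrite = FALSE))  # errors: outdir already exists and overwrite = FALSE
fs::dir_delete(td)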
@@ -16883,21 +20495,21 @@ disk.frame coverage - 51.96%
-#' @rdname join
-#' @export
-#' join.df = full_join(cars.df, cars.df, merge_by_chunk_id = TRUE)
-full_join.disk.frame <- function(x, y, by=NULL, copy=FALSE, ..., outdir = tempfile("tmp_disk_frame_full_join"), overwrite = TRUE, merge_by_chunk_id) {
-  stopifnot("disk.frame" %in% class(x))
-
-  overwrite_check(outdir, overwrite)
-
-  if("data.frame" %in% class(y)) {
-    # full join cannot be support for y in data.frame
-    ncx = nchunks(x)
-    dy = shard(y, shardby = by, nchunks = ncx, overwrite = TRUE)
-    dx = hard_group_by(x, by = by, overwrite = TRUE)
-    return(full_join.disk.frame(dx, dy, by, copy=copy, outdir=outdir, merge_by_chunk_id = TRUE))
-  } else if("disk.frame" %in% class(y)) {
-    if(is.null(merge_by_chunk_id)) {
-      stop("both x and y are disk.frames. You need to specify merge_by_chunk_id = TRUE or FALSE explicitly")
-    }
-    if(is.null(by)) {
-      by <- intersect(names(x), names(y))
-    }
-
-    ncx = nchunks(x)
-    ncy = nchunks(y)
-    if (merge_by_chunk_id == FALSE) {
-      warning("merge_by_chunk_id = FALSE. This will take significantly longer and the preparations needed are performed eagerly which may lead to poor performance. Consider making y a data.frame or set merge_by_chunk_id = TRUE for better performance.")
-      x = hard_group_by(x, by, nchunks = max(ncy,ncx), overwrite = TRUE)
-      y = hard_group_by(y, by, nchunks = max(ncy,ncx), overwrite = TRUE)
-      return(full_join.disk.frame(x, y, by, copy = copy, outdir = outdir, merge_by_chunk_id = TRUE, overwrite = overwrite))
-    } else if ((identical(shardkey(x)$shardkey, "") & identical(shardkey(y)$shardkey, "")) | identical(shardkey(x), shardkey(y))) {
-      res = map2(x, y, ~{
-        if(is.null(.y)) {
-          return(.x)
-        } else if (is.null(.x)) {
-          return(.y)
-        }
-        full_join(.x, .y, by = by, copy = copy)
-      }, outdir = outdir, overwrite = overwrite)
-      return(res)
-    } else {
-      # TODO if the shardkey are the same and only the shardchunks are different then just shard again on one of them is fine
-      stop("merge_by_chunk_id is TRUE but shardkey(x) does NOT equal to shardkey(y). You may want to perform a hard_group_by() on both x and/or y or set merge_by_chunk_id = FALSE")
-    }
-  }
-}
+#' @export
+#' @rdname join
+#' join.df = inner_join(cars.df, cars.df, merge_by_chunk_id = TRUE)
+inner_join.disk.frame <- function(x, y, by=NULL, copy=FALSE, ..., outdir = tempfile("tmp_disk_frame_inner_join"), merge_by_chunk_id = NULL, overwrite = TRUE, .progress = FALSE) {
+  stopifnot("disk.frame" %in% class(x))
+
+  overwrite_check(outdir, overwrite)
+
+  if(!is.null(outdir)) {
+    if(overwrite & fs::dir_exists(outdir)) {
+      fs::dir_delete(outdir)
+      fs::dir_create(outdir)
+    } else {
+      fs::dir_create(outdir)
+    }
+  }
+
+  if("data.frame" %in% class(y)) {
+    quo_dotdotdot = enquos(...)
+    res = cmap_dfr(x, ~{
+      code = quo(inner_join(.x, y, by = by, copy = copy, !!!quo_dotdotdot))
+      rlang::eval_tidy(code)
+    }, .progress = .progress)
+    return(res)
+  } else if("disk.frame" %in% class(y)) {
+    if(is.null(merge_by_chunk_id)) {
+      stop("both x and y are disk.frames. You need to specify merge_by_chunk_id = TRUE or FALSE explicitly")
+    }
+    if(is.null(by)) {
+      by <- intersect(names(x), names(y))
+    }
+
+    ncx = nchunks(x)
+    ncy = nchunks(y)
+    if (merge_by_chunk_id == FALSE) {
+      x = hard_group_by(x, by, nchunks = max(ncy,ncx), overwrite = TRUE)
+      y = hard_group_by(y, by, nchunks = max(ncy,ncx), overwrite = TRUE)
+      return(inner_join.disk.frame(x, y, by, outdir = outdir, merge_by_chunk_id = TRUE, overwrite = overwrite))
+    } else if ((identical(shardkey(x)$shardkey, "") & identical(shardkey(y)$shardkey, "")) | identical(shardkey(x), shardkey(y))) {
+      dotdotdot <- list(...)
+
+      res = cmap2.disk.frame(x, y, ~{
+        if(is.null(.y)) {
+          return(data.table())
+        } else if (is.null(.x)) {
+          return(data.table())
+        }
+        #inner_join(.x, .y, by = by, copy = copy, ..., overwrite = overwrite)
+        lij = purrr::lift(dplyr::inner_join)
+        lij(c(list(x = .x, y = .y, by = by, copy = copy), dotdotdot))
+      }, outdir = outdir, .progress = .progress)
+      return(res)
+    } else {
+      # TODO if the shardkey are the same and only the shardchunks are different then just shard again on one of them is fine
+      stop("merge_by_chunk_id is TRUE but shardkey(x) does NOT equal to shardkey(y). You may want to perform a hard_group_by() on both x and/or y or set merge_by_chunk_id = FALSE")
+    }
+  }
+}
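A usage sketch (illustrative, not part of the patch; assumes the 0.6.0 behaviour reconstructed above): joining against a data.frame runs chunk-wise and returns an in-memory result, while joining two disk.frames needs an explicit merge_by_chunk_id.

cars.df = as.disk.frame(cars, nchunks = 2)
inner_join(cars.df, data.frame(speed = 10:12), by = "speed")  # data.frame branch: returns a data.frame
join.df = inner_join(cars.df, cars.df, by = "speed", merge_by_chunk_id = TRUE)  # disk.frame branch
delete(cars.df); delete(join.df)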
@@ -17260,245 +20963,245 @@ disk.frame coverage - 51.96%
-#' Checks if a folder is a disk.frame
-#' @param df a disk.frame or directory to check
-#' @export
-#' @examples
-#' cars.df = as.disk.frame(cars)
-#'
-#' is_disk.frame(cars) # FALSE
-#' is_disk.frame(cars.df) # TRUE
-#'
-#' # clean up cars.df
-#' delete(cars.df)
-is_disk.frame <- function(df) {
-  ##browser
-  if("disk.frame" %in% class(df)) {
-    df = attr(df, "path")
-  } else if(!"character" %in% class(df)) { # character then check the path
-    return(FALSE)
-  }
-
-  files <- fs::dir_ls(df, type="file", all = TRUE)
-  # if all files are fst
-  if(length(files)>0) {
-    if(any(purrr::map_lgl(files, ~length(grep(glob2rx("*.fst"), .x)) == 0))) {
-      # some of the fiels do not have a .fst extension
-      return(FALSE)
-    }
-  }
-
-  dirs = fs::dir_ls(df, type="directory", all = TRUE)
-  if(length(dirs) > 1) {
-    return(FALSE)
-  } else if(length(dirs) == 1) {
-    if(substr(dirs, nchar(dirs)-8,nchar(dirs)) != ".metadata") {
-      return(FALSE)
-    }
-  }
-
-  return(TRUE)
-}
+#' Bring the disk.frame into R
+#'
+#' Bring the disk.frame into RAM by loading the data and running all lazy
+#' operations as data.table/data.frame or as a list
+#' @param x a disk.frame
+#' @param parallel if TRUE the collection is performed in parallel. By default
+#' if there are delayed/lazy steps then it will be parallel, otherwise it will
+#' not be in parallel. This is because parallel requires transferring data
+#' from background R session to the current R session and if there is no
+#' computation then it's better to avoid transferring data between session,
+#' hence parallel = FALSE is a better choice
+#' @param ... not used
+#' @importFrom data.table data.table as.data.table
+#' @importFrom furrr future_map_dfr future_options
+#' @importFrom purrr map_dfr
+#' @importFrom dplyr collect select mutate
+#' @return collect return a data.frame/data.table
+#' @examples
+#' cars.df = as.disk.frame(cars)
+#' # use collect to bring the data into RAM as a data.table/data.frame
+#' collect(cars.df)
+#'
+#' # clean up
+#' delete(cars.df)
+#' @export
+#' @rdname collect
+collect.disk.frame <- function(x, ..., parallel = !is.null(attr(x,"lazyfn"))) {
+  cids = get_chunk_ids(x, full.names = TRUE, strip_extension = FALSE)
+  #cids = as.integer(get_chunk_ids(x))
+  if(nchunks(x) > 0) {
+    if(parallel) {
+      furrr::future_map_dfr(cids, ~get_chunk(x, .x, full.names = TRUE))
+    } else {
+      purrr::map_dfr(cids, ~get_chunk(x, .x, full.names = TRUE))
+    }
+  } else {
+    data.table()
+  }
+}
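A usage sketch (illustrative, not part of the patch): `parallel` defaults to TRUE only when lazy steps are attached, and can be forced off to skip inter-session data transfer.

cars.df = as.disk.frame(cars, nchunks = 2)
collect(cars.df)                               # no lazy steps: sequential read
collect(cmap(cars.df, ~.x), parallel = FALSE)  # lazy step attached, but force sequential
delete(cars.df)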
-#' Create dplyr function for disk.frame
-#'
-#' A function to make it easier to create functions like \code{filter}
-#'
-#' @examples
-#'
-#' filter = create_dplyr_mapper(dplyr::filter)
-#'
-#' #' example: creating a function that keeps only the first and last n row
-#' first_and_last <- function(chunk, n, ...) {
-#'   nr = nrow(chunk)
-#'   print(nr-n+1:nr)
-#'   chunk[c(1:n, (nr-n+1):nr), ]
-#' }
-#'
-#' #' create the function for use with disk.frame
-#' first_and_last_df = create_dplyr_mapper(first_and_last)
-#'
-#' mtcars.df = as.disk.frame(mtcars)
-#'
-#' #' the operation is lazy
-#' lazy_mtcars.df = mtcars.df %>%
-#'   first_and_last_df(2)
-#'
-#' #' bring into R
-#' collect(lazy_mtcars.df)
-#'
-#' #' clean up
-#' delete(mtcars.df)
-#'
-#' @param dplyr_fn The dplyr function to create a mapper for
-#' @param warning_msg The warning message to display when invoking the mapper
-#' @param as.data.frame force the input chunk of a data.frame; needed for dtplyr
-#' @importFrom rlang enquos quo
-#' @export
-create_dplyr_mapper <- function(dplyr_fn, warning_msg = NULL, as.data.frame = TRUE) {
-  return_func <- function(.data, ...) {
-    if (!is.null(warning_msg)) {
-      warning(warning_msg)
-    }
-
-    quo_dotdotdot = rlang::enquos(...)
-
-    # this is designed to capture any global stuff
+#' @param simplify Should the result be simplified to array
+#' @export
+#' @rdname collect
+#' @return collect_list returns a list
+#' @examples
+#' cars.df = as.disk.frame(cars)
+#'
+#' # returns the result as a list
+#' collect_list(cmap(cars.df, ~1))
+#'
+#' # clean up
+#' delete(cars.df)
+collect_list <- function(x, simplify = FALSE, parallel = !is.null(attr(x,"lazyfn"))) {
+  cids = get_chunk_ids(x, full.names = TRUE, strip_extension = FALSE)
+
+  if(nchunks(x) > 0) {
+    res <- NULL
+    if (parallel) {
+      #res = furrr::future_map(1:nchunks(x), ~get_chunk(x, .x))
+      res = future.apply::future_lapply(cids, function(.x) {
+        get_chunk(x, .x, full.names = TRUE)
+      })
+    } else {
+      res = purrr::map(cids, ~get_chunk(x, .x, full.names = TRUE))
+    }
+    if (simplify) {
+      return(simplify2array(res))
+    } else {
+      return(res)
+    }
+  } else {
+    list()
+  }
+}
-    vars_and_pkgs = future::getGlobalsAndPackages(quo_dotdotdot)
-    data_for_eval_tidy = force(vars_and_pkgs$globals)
-
-    res = map(.data, ~{
-      this_env = environment()
-
-      if(length(data_for_eval_tidy) > 0) {
-        for(i in 1:length(data_for_eval_tidy)) {
-          assign(names(data_for_eval_tidy)[i], data_for_eval_tidy[[i]], pos = this_env)
-        }
-      }
-
-      lapply(quo_dotdotdot, function(x) {
-        attr(x, ".Environment") = this_env
-      })
-
-      if(as.data.frame) {
-        if("grouped_df" %in% class(.x)) {
-          code = rlang::quo(dplyr_fn(.x, !!!quo_dotdotdot))
-        } else {
-          code = rlang::quo(dplyr_fn(as.data.frame(.x), !!!quo_dotdotdot))
-        }
-      } else {
-        code = rlang::quo(dplyr_fn(.x, !!!quo_dotdotdot))
-      }
-
-      # ZJ: we need both approaches. TRUST ME
-      # TODO better NSE at some point
-      #tryCatch({
-      rlang::eval_tidy(code)
-      #}, error = function(e) {
-      # if the previous failed
-      # eval(parse(text=rlang::as_label(code)), envir = this_env)
-      #})
-    }, lazy = TRUE)
-  }
-  return_func
-}
-
-#' The dplyr verbs implemented for disk.frame
-#' @description
-#' Please see the dplyr document for their usage. Please note that `group_by`
-#' and `arrange` performs the actions within each chunk
-#' @export
-#' @importFrom dplyr select rename filter mutate transmute arrange do groups group_by group_by glimpse summarise
-#' @param ... Same as the dplyr functions
-#' @param .data a disk.frame
-#' @rdname dplyr_verbs
-#' @family dplyr verbs
-#' @examples
-#' library(dplyr)
-#' library(magrittr)
-#' cars.df = as.disk.frame(cars)
-#' mult = 2
+#' A streaming function for speedglm
+#'
+#' Define a function that can be used to feed data into speedglm and biglm
+#'
+#' @param data a disk.frame
+#' @param verbose Whether to print the status of data loading. Default to FALSE
+#'
+#' @return return a function, fn, that can be used as the data argument in biglm::bigglm or speedglm::shglm
+#'
+#' @family Machine Learning (ML)
+#' @export
+#'
+#' @examples
+#' cars.df = as.disk.frame(cars)
+#' streamacq = make_glm_streaming_fn(cars.df, verbose = FALSE)
+#'
+#' majorv = as.integer(version$major)
+#' minorv = as.integer(strsplit(version$minor, ".", fixed=TRUE)[[1]][1])
+#' if(((majorv == 3) & (minorv >= 6)) | (majorv > 3)) {
+#'   m = biglm::bigglm(dist ~ speed, data = streamacq)
+#'   summary(m)
+#'   predict(m, get_chunk(cars.df, 1))
+#'   predict(m, collect(cars.df, 1))
+#' } else {
+#'   m = speedglm::shglm(dist ~ speed, data = streamacq)
+#' }
make_glm_streaming_fn <- function(data, verbose = FALSE) {
+  i = 0
+
+  chunkids = disk.frame::get_chunk_ids(data, strip_extension = FALSE)
+  is = sample(length(chunkids), replace = FALSE)
+  verbose = verbose
+  nchunks_copy = length(chunkids)
+
+  function(reset = FALSE) {
+    if(reset) {
+      if(verbose) {
+        print("disk.frame stream has been reset; next read will be from beginning")
+      }
+
+      i <<- 0
+    } else {
+      i <<- i + 1
+
+      if (i > nchunks_copy) {
+        return(NULL)
+      }
+      if(verbose) {
+        print(glue::glue("streaming: {i}/{nchunks_copy}; chunk id: {chunkids[i]}"))
+      }
+      return(get_chunk(data, chunkids[i]))
+    }
+  }
+}
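A usage sketch (illustrative, not part of the patch) of the closure's streaming contract: chunks come back one per call, NULL signals the end, and reset = TRUE rewinds the stream, which is how bigglm makes multiple passes.

cars.df = as.disk.frame(cars, nchunks = 2)
acq = make_glm_streaming_fn(cars.df)
d1 = acq(); d2 = acq()
stopifnot(is.null(acq()))  # after the last chunk the stream yields NULL
acq(reset = TRUE)          # the next acq() call starts from the first chunk again
delete(cars.df)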
-#'
-#' # use all any of the supported dplyr
-#' cars2 = cars.df %>%
-#'   select(speed) %>%
-#'   mutate(speed2 = speed * mult) %>%
-#'   filter(speed < 50) %>%
-#'   rename(speed1 = speed) %>%
-#'   collect
-#'
-#' # clean up cars.df
-#' delete(cars.df)
-select.disk.frame <- function(.data, ...) {
-  quo_dotdotdot = rlang::enquos(...)
-  map(.data, ~{
-    code = rlang::quo(dplyr::select(.x, !!!quo_dotdotdot))
-    rlang::eval_tidy(code)
-  }, lazy = TRUE)
-}
-
-
-#' @export
-#' @rdname dplyr_verbs
-rename.disk.frame <- create_dplyr_mapper(dplyr::rename)
-
-
-#' @export
-#' @rdname dplyr_verbs
-filter.disk.frame <- create_dplyr_mapper(dplyr::filter)
-
-
-#' @export
-#' @rdname dplyr_verbs
-#' @importFrom dplyr filter_all
-filter_all.disk.frame <- create_dplyr_mapper(dplyr::filter_all)
+#' Keep only the variables from the input listed in selections
+#' @param df a disk.frame
+#' @param selections The list of variables to keep from the input source
+#' @param ... not yet used
+#' @export
+#' @examples
+#' cars.df = as.disk.frame(cars)
+#'
+#' # when loading cars's chunks into RAM, load only the column speed
+#' collect(srckeep(cars.df, "speed"))
+#'
+#' # clean up cars.df
+#' delete(cars.df)
+srckeep <- function(df, selections, ...) {
+  stopifnot("disk.frame" %in% class(df))
+  attr(df,"keep") = selections
+
+  df
+}
+
+#' @param chunks The chunks to load
+#' @rdname srckeep
+#' @export
+srckeepchunks <- function(df, chunks, ...) {
+  stopifnot("disk.frame" %in% class(df))
+  # TODO relax this
+  stopifnot(is.integer(chunks))
+
+  attr(df,"keep_chunks") = chunks
+  df
+}
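A usage sketch (illustrative, not part of the patch): `srckeep` only tags the disk.frame; the column restriction is applied when chunks are read, so less data comes off disk.

flights.df = as.disk.frame(nycflights13::flights)
head(collect(srckeep(flights.df, c("year", "dep_delay"))))  # only two columns are loaded per chunk
delete(flights.df)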
-
-
-#' @export
-#' @rdname dplyr_verbs
-#' @importFrom dplyr filter_if
-filter_if.disk.frame <- create_dplyr_mapper(dplyr::filter_if)
-
-
-#' @export
-#' @rdname dplyr_verbs
-#' @importFrom dplyr filter_at
-filter_at.disk.frame <- create_dplyr_mapper(dplyr::filter_at)
-
-
-#' @export
-#' @rdname dplyr_verbs
-#' @importFrom future getGlobalsAndPackages
-#' @importFrom rlang eval_tidy quo enquos
-#' @importFrom dplyr mutate
-mutate.disk.frame <- create_dplyr_mapper(dplyr::mutate)
-
-
-#' @export
-#' @importFrom dplyr transmute
-#' @rdname dplyr_verbs
-transmute.disk.frame <- create_dplyr_mapper(dplyr::transmute)
-
-
-#' @export
-#' @importFrom dplyr arrange
-#' @rdname dplyr_verbs
-arrange.disk.frame =create_dplyr_mapper(dplyr::arrange, warning_msg="`arrange.disk.frame` is now deprecated. Please use `chunk_arrange` instead. This is in preparation for a more powerful `arrange` that sorts the whole disk.frame")
-
-
-#' @export
-#' @importFrom dplyr arrange
-#' @rdname dplyr_verbs
-chunk_arrange <- create_dplyr_mapper(dplyr::arrange)
-
-
-#' @export
-#' @importFrom dplyr tally
-#' @rdname dplyr_verbs
-tally.disk.frame <- create_dplyr_mapper(dplyr::tally)
-
-
-#' @export
-#' @importFrom dplyr count
-#' @rdname dplyr_verbs
-count.disk.frame <- create_dplyr_mapper(dplyr::count)
-
-# TODO family is not required is group-by
-# TODO alot of these .disk.frame functions are not generic
-
-
-#' @export
-#' @importFrom dplyr add_count
-#' @rdname dplyr_verbs
-add_count.disk.frame <- create_dplyr_mapper(dplyr::add_count)
-
-
-#' @export
-#' @importFrom dplyr add_tally
-#' @rdname dplyr_verbs
-add_tally.disk.frame <- create_dplyr_mapper(dplyr::add_tally)
-
-
-#' @export
-#' @importFrom dplyr summarize
-#' @rdname dplyr_verbs
-chunk_summarize <- create_dplyr_mapper(dplyr::summarize)
-
-
-#' @export
-#' @importFrom dplyr summarise
-#' @rdname dplyr_verbs
-chunk_summarise <- create_dplyr_mapper(dplyr::summarise)
-
-
-#' @export
-#' @importFrom dplyr summarize
-#' @rdname dplyr_verbs
-summarize.disk.frame <- create_dplyr_mapper(dplyr::summarize, warning_msg="`summarize.disk.frame` is now deprecated. Please use `chunk_summarize` instead. This is in preparation for a more powerful `group_by` framework")
-#function(...) {
-#stop("`summarize.disk.frame` has been removed. Please use `chunk_summarize` instead. This is in preparation for a more powerful `group_by` framework")
-#}
-
+#' Create function that applies to each chunk if disk.frame
+#'
+#' A function to make it easier to create functions like \code{filter}
+#'
+#' @examples
+#'
+#' filter = create_chunk_mapper(dplyr::filter)
+#'
+#' #' example: creating a function that keeps only the first and last n row
+#' first_and_last <- function(chunk, n, ...) {
+#'   nr = nrow(chunk)
+#'   print(nr-n+1:nr)
+#'   chunk[c(1:n, (nr-n+1):nr), ]
+#' }
+#'
+#' #' create the function for use with disk.frame
+#' first_and_last_df = create_chunk_mapper(first_and_last)
+#'
+#' mtcars.df = as.disk.frame(mtcars)
+#'
+#' #' the operation is lazy
+#' lazy_mtcars.df = mtcars.df %>%
+#'   first_and_last_df(2)
+#'
+#' #' bring into R
+#' collect(lazy_mtcars.df)
+#'
+#' #' clean up
+#' delete(mtcars.df)
+#'
+#' @param chunk_fn The dplyr function to create a mapper for
+#' @param warning_msg The warning message to display when invoking the mapper
+#' @param as.data.frame force the input chunk of a data.frame; needed for dtplyr
+#' @importFrom rlang enquos quo
+#' @export
+create_chunk_mapper <- function(chunk_fn, warning_msg = NULL, as.data.frame = TRUE) {
+  return_func <- function(.data, ...) {
+    if (!is.null(warning_msg)) {
+      warning(warning_msg)
+    }
+
+
+    quo_dotdotdot = rlang::enquos(...)
+
+    # this is designed to capture any global stuff
+    vars_and_pkgs = future::getGlobalsAndPackages(quo_dotdotdot)
+    data_for_eval_tidy = force(vars_and_pkgs$globals)
+
+    res = cmap(.data, ~{
+
+      this_env = environment()
+
+      if(length(data_for_eval_tidy) > 0) {
+        for(i in 1:length(data_for_eval_tidy)) {
+          assign(names(data_for_eval_tidy)[i], data_for_eval_tidy[[i]], pos = this_env)
+        }
+      }
+
+      lapply(quo_dotdotdot, function(x) {
+        attr(x, ".Environment") = this_env
+      })
+
+      if(as.data.frame) {
+        if("grouped_df" %in% class(.x)) {
+          code = rlang::quo(chunk_fn(.x, !!!quo_dotdotdot))
+        } else {
+          code = rlang::quo(chunk_fn(as.data.frame(.x), !!!quo_dotdotdot))
+        }
+      } else {
+        code = rlang::quo(chunk_fn(.x, !!!quo_dotdotdot))
+      }
+
+      # ZJ: we need both approaches. TRUST ME
+      # TODO better NSE at some point need dist
+      tryCatch({
+        return(rlang::eval_tidy(code))
+      }, error = function(e) {
+        as_label_code = rlang::as_label(code)
+        if(as_label_code == "chunk_fn(...)") {
+          stop(glue::glue("disk.frame has detected a syntax error in \n\n`{code}`\n\n. If you believe your syntax is correct, raise an issue at https://github.com/xiaodaigh/disk.frame with a MWE"))
+        } else {
+          # likely to be dealing with data.tables
+          return(eval(parse(text=as_label_code), envir = this_env))
+        }
+      })
+    }, lazy = TRUE)
+  }
+  return_func
+}
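A sketch (illustrative, not part of the patch) of the `warning_msg` parameter, which is how the deprecation shims elsewhere in this diff are built; `dplyr::sample_frac` is an arbitrary choice for the wrapped verb:

sample_frac_df = create_chunk_mapper(
  dplyr::sample_frac,
  warning_msg = "sample_frac is applied per chunk, not over the whole disk.frame")
cars.df = as.disk.frame(cars, nchunks = 2)
collect(sample_frac_df(cars.df, 0.1))  # warns once, then samples 10% within each chunk
delete(cars.df)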
-
-#' @export
-#' @importFrom dplyr summarize
-#' @rdname dplyr_verbs
-summarise.disk.frame <- create_dplyr_mapper(dplyr::summarise, warning_msg="`summarise.disk.frame` is now deprecated. Please use `chunk_summarise` instead. This is in preparation for a more powerful `group_by` framework")
-#function(...) {
-# stop("`summarise.disk.frame` has been removed. Please use `chunk_summarise` instead. This is in preparation for a more powerful `group_by` framework")
-# }
-
-
-#' @export
-#' @rdname dplyr_verbs
-#' @importFrom dplyr do
-do.disk.frame <- create_dplyr_mapper(dplyr::do)
-
-
-#' @export
-#' @rdname dplyr_verbs
-#' @importFrom dplyr group_by_all
-group_by_all.disk.frame <- create_dplyr_mapper(dplyr::group_by_all)
-
-
-#' @export
-#' @rdname dplyr_verbs
-#' @importFrom dplyr group_by_at
-group_by_at.disk.frame <- create_dplyr_mapper(dplyr::group_by_at)
-
-
-#' @export
-#' @rdname dplyr_verbs
-#' @importFrom dplyr group_by_if
-group_by_if.disk.frame <- create_dplyr_mapper(dplyr::group_by_if)
-
-
-#' @export
-#' @rdname dplyr_verbs
-#' @importFrom dplyr mutate_all
-mutate_all.disk.frame <- create_dplyr_mapper(dplyr::mutate_all)
-
-
-#' @export
-#' @rdname dplyr_verbs
-#' @importFrom dplyr mutate_at
-mutate_at.disk.frame <- create_dplyr_mapper(dplyr::mutate_at)
-
-
-#' @export
-#' @rdname dplyr_verbs
-#' @importFrom dplyr mutate_if
-mutate_if.disk.frame <- create_dplyr_mapper(dplyr::mutate_if)
-
-
-#' @export
-#' @rdname dplyr_verbs
-#' @importFrom dplyr rename_all
-rename_all.disk.frame <- create_dplyr_mapper(dplyr::rename_all)
-
-
-#' @export
-#' @rdname dplyr_verbs
-#' @importFrom dplyr rename_at
-rename_at.disk.frame <- create_dplyr_mapper(dplyr::rename_at)
-
-
-#' @export
-#' @rdname dplyr_verbs
-#' @importFrom dplyr rename_if
+#' @param by join by
+#' @param copy same as dplyr::anti_join
+#' @param merge_by_chunk_id the merge is performed by chunk id
+#' @param overwrite overwrite output directory
+#' @param .progress Show progress or not. Defaults to FALSE
+#' @param ... same as dplyr's joins
+#' @rdname join
+#' @importFrom rlang quo enquos
+#' @importFrom dplyr anti_join left_join full_join semi_join inner_join
+#' @return disk.frame or data.frame/data.table
+#' @export
+#' @examples
+#' df.df = as.disk.frame(data.frame(x = 1:3, y = 4:6), overwrite = TRUE)
+#' df2.df = as.disk.frame(data.frame(x = 1:2, z = 10:11), overwrite = TRUE)
+#'
+#' anti_joined.df = anti_join(df.df, df2.df)
+#'
+#' anti_joined.df %>% collect
+#'
+#' anti_joined.data.frame = anti_join(df.df, data.frame(x = 1:2, z = 10:11))
+#'
+#' # clean up
+#' delete(df.df)
+#' delete(df2.df)
+#' delete(anti_joined.df)
+anti_join.disk.frame <- function(x, y, by=NULL, copy=FALSE, ..., outdir = tempfile("tmp_disk_frame_anti_join"), merge_by_chunk_id = FALSE, overwrite = TRUE, .progress = FALSE) {
+
+  stopifnot("disk.frame" %in% class(x))
+
+  overwrite_check(outdir, overwrite)
+
+  if("data.frame" %in% class(y)) {
+    quo_dotdotdot = enquos(...)
+    cmap_dfr.disk.frame(x, ~{
+      code = quo(anti_join(.x, y, by = by, copy = copy, !!!quo_dotdotdot))
+      rlang::eval_tidy(code)
+    }, .progress = .progress)
+  } else if("disk.frame" %in% class(y)) {
+    if(is.null(merge_by_chunk_id)) {
+      stop("both x and y are disk.frames. You need to specify merge_by_chunk_id = TRUE or FALSE explicitly")
+    }
+    if(is.null(by)) {
+      by <- intersect(names(x), names(y))
+    }
+
+    ncx = nchunks(x)
+    ncy = nchunks(y)
+    if (merge_by_chunk_id == FALSE) {
+      warning("merge_by_chunk_id = FALSE. This will take significantly longer and the preparations needed are performed eagerly which may lead to poor performance. Consider making y a data.frame or set merge_by_chunk_id = TRUE for better performance.")
+
+      x = hard_group_by(x, by, nchunks = max(ncy,ncx), overwrite = TRUE)
+      y = hard_group_by(y, by, nchunks = max(ncy,ncx), overwrite = TRUE)
+      return(anti_join.disk.frame(x, y, by, copy = copy, outdir = outdir, merge_by_chunk_id = TRUE, overwrite = overwrite))
+    } else if ((identical(shardkey(x)$shardkey, "") & identical(shardkey(y)$shardkey, "")) | identical(shardkey(x), shardkey(y))) {
+      res = cmap2.disk.frame(x, y, ~{
+        #res = cmap2(x, y, ~{
+        if(is.null(.y)) {
+          return(.x)
+        } else if (is.null(.x)) {
+          return(data.table())
+        }
+        anti_join(.x, .y, by = by, copy = copy, ..., overwrite = overwrite)
+      }, outdir = outdir, .progress = .progress)
+      return(res)
+    } else {
+      # TODO if the shardkey are the same and only the shardchunks are different then just shard again on one of them is fine
+      stop("merge_by_chunk_id is TRUE but shardkey(x) does NOT equal to shardkey(y). You may want to perform a hard_group_by() on both x and/or y or set merge_by_chunk_id = FALSE")
+    }
+  }
+}
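A usage sketch (illustrative, not part of the patch): merge_by_chunk_id = FALSE reshards both sides eagerly, so for a small right-hand side the cheaper route is the chunk-wise data.frame branch above.

df.df = as.disk.frame(data.frame(x = 1:10), nchunks = 2)
anti_join(df.df, data.frame(x = 1:5), by = "x")  # data.frame branch: returns the rows with x in 6:10
delete(df.df)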
-rename_if.disk.frame <- create_dplyr_mapper(dplyr::rename_if)
-
-
-#' @export
-#' @rdname dplyr_verbs
-#' @importFrom dplyr select_all
-select_all.disk.frame <- create_dplyr_mapper(dplyr::select_all)
-
-
-#' @export
-#' @rdname dplyr_verbs
-#' @importFrom dplyr select_at
-select_at.disk.frame <- create_dplyr_mapper(dplyr::select_at)
-
-
-#' @export
-#' @rdname dplyr_verbs
-#' @importFrom dplyr select_if
-select_if.disk.frame <- create_dplyr_mapper(dplyr::select_if)
-
-
-#' @export
-#' @rdname dplyr_verbs
-#' @importFrom dplyr summarise_all
-chunk_summarise_all <- create_dplyr_mapper(dplyr::summarise_all)
-
-
-#' @export
-#' @rdname dplyr_verbs
-#' @importFrom dplyr summarise_at
-chunk_summarise_at <- create_dplyr_mapper(dplyr::summarise_at)
-
-
-#' @export
-#' @rdname dplyr_verbs
-#' @importFrom dplyr summarize
-chunk_summarize <- create_dplyr_mapper(dplyr::summarize)
-
-
-#' @export
-#' @rdname dplyr_verbs
-#' @importFrom dplyr summarize_all
-chunk_summarize_all <- create_dplyr_mapper(dplyr::summarize_all)
-
-
-#' @export
-#' @rdname dplyr_verbs
-#' @importFrom dplyr summarize_at
-chunk_summarize_at <- create_dplyr_mapper(dplyr::summarize_at)
-
-
-#' @export
-#' @rdname dplyr_verbs
-#' @importFrom dplyr summarize_if
-chunk_summarize_if <- create_dplyr_mapper(dplyr::summarize_if)
-
-
-#' @export
-#' @rdname dplyr_verbs
-#' @importFrom dplyr distinct
-distinct.disk.frame <- function(...) {
-  stop("`distinct.disk.frame` is not available. Please use `chunk_distinct`")
-}
-
-
-#' @export
-#' @rdname dplyr_verbs
-#' @importFrom dplyr distinct
-chunk_distinct <- create_dplyr_mapper(dplyr::distinct, warning_msg = "the `distinct` function applies distinct chunk-wise")
-
-#' The shard keys of the disk.frame
-#' @return character
-#' @export
-#' @param x a disk.frame
-groups.disk.frame <- function(x){
-  shardkey(x)
-}
-
+#' rbindlist disk.frames together
+#' @param df_list A list of disk.frames
+#' @param outdir Output directory of the row-bound disk.frames
+#' @param by_chunk_id If TRUE then only the chunks with the same chunk IDs will be bound
+#' @param parallel if TRUE then bind multiple disk.frame simultaneously, Defaults to TRUE
+#' @param compress 0-100, 100 being the highest compression rate.
+#' @param overwrite overwrite the output directory
+#' @param .progress A logical, for whether or not to print a progress bar for multiprocess, multisession, and multicore plans. From {furrr}
+#' @import fs
+#' @importFrom data.table data.table setDT
+#' @importFrom future.apply future_lapply
+#' @importFrom purrr map_chr map_dfr map map_lgl
+#' @importFrom purrr map
+#' @export
+#' @examples
+#' cars.df = as.disk.frame(cars)
+#'
+#' # row-bind two disk.frames
+#' cars2.df = rbindlist.disk.frame(list(cars.df, cars.df))
+#'
+#' # clean up cars.df
+#' delete(cars.df)
+#' delete(cars2.df)
+rbindlist.disk.frame <- function(df_list, outdir = tempfile(fileext = ".df"), by_chunk_id = TRUE, parallel = TRUE, compress=50, overwrite = TRUE, .progress = TRUE) {
+  stopifnot(typeof(df_list) == "list")
+
+  overwrite_check(outdir, overwrite)
+
+  purrr::map(df_list, ~{
+    if(!"disk.frame" %in% class(.x)) {
+      stop("error running rbindlist.disk.frame: Not every element of df_list is a disk.frame")
+    }
+  })
+
+  if(by_chunk_id) {
+    list_of_paths = purrr::map_chr(df_list, ~attr(.x,"path", exact=TRUE))
+    list_of_chunks = purrr::map_dfr(list_of_paths, ~data.table(path = list.files(.x),full_path = list.files(.x,full.names = TRUE)))
+    setDT(list_of_chunks)
+
+    # split the list of chunks into lists for easy operation with future
+    slist = split(list_of_chunks$full_path,list_of_chunks$path)
+
+    if(parallel) {
+      #system.time(future.apply::future_lapply(1:length(slist), function(i) {
+      if(.progress) {
+        message("Appending disk.frames: ")
+      }
+      system.time(furrr::future_map(1:length(slist), function(i) {
+        full_paths1 = slist[[i]]
+        outfilename = names(slist[i])
+        fst::write_fst(purrr::map_dfr(full_paths1, ~fst::read_fst(.x)),file.path(outdir,outfilename), compress = compress)
+        NULL
+      }, .progress = .progress))
+    } else {
+      system.time(lapply(1:length(slist), function(i) {
+        full_paths1 = slist[[i]]
+        outfilename = names(slist[i])
+        fst::write_fst(purrr::map_dfr(full_paths1, ~fst::read_fst(.x)),file.path(outdir,outfilename), compress = compress)
+        NULL
+      }))
+    }
+
+    rbind_res = disk.frame(outdir)
+
+    shardkeys <- purrr::map(df_list, shardkey)
+
+    # if all the sharkeys are identical then
+    ##browser
+    if(all(purrr::map_lgl(shardkeys[-1], ~identical(.x, shardkeys[[1]])))) {
+      return(add_meta(rbind_res,
+                      shardkey = shardkeys[[1]]$shardkey,
+                      shardchunks = shardkeys[[1]]$shardchunks,
+                      compress = compress))
+    } else {
+      return(rbind_res)
+    }
+  } else {
+    stop("For rbindlist.disk.frame, only by_chunk_id = TRUE is implemented")
+  }
+}
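A usage sketch (illustrative, not part of the patch): with by_chunk_id = TRUE, files with matching chunk names are appended, so the inputs should share a chunking scheme.

a.df = as.disk.frame(cars, nchunks = 2)
b.df = as.disk.frame(cars, nchunks = 2)
ab.df = rbindlist.disk.frame(list(a.df, b.df))
nrow(collect(ab.df)) == 2 * nrow(cars)  # TRUE
delete(a.df); delete(b.df); delete(ab.df)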
366 | +1 |
- #' Group by within each disk.frame
+ #' Apply data.table's foverlaps to the disk.frame
|
|
367 | +2 |
- #' @description
+ #' @description EXPERIMENTAL
|
|
368 | +3 |
- #' The disk.frame group by operation perform group WITHIN each chunk. This is
+ #' @param df1 A disk.frame
|
|
369 | +4 |
- #' often used for performance reasons. If the user wishes to perform group-by,
+ #' @param df2 A disk.frame or a data.frame
|
|
370 | +5 |
- #' they may choose to use the `hard_group_by` function which is expensive as it
+ #' @param by.x character/string vector. by.x used in foverlaps
|
|
371 | +6 |
- #' reorganizes the chunks by the shard key.
+ #' @param by.y character/string vector. by.x used in foverlaps
|
|
372 | +7 |
- #' @seealso hard_group_by
+ #' @param outdir The output directory of the disk.frame
|
|
373 | +8 |
- #' @param .data a disk.frame
+ #' @param merge_by_chunk_id If TRUE then the merges will happen for chunks in df1 and df2 with the same chunk id which speed up processing. Otherwise every chunk of df1 is merged with every chunk of df2. Ignored with df2 is not a disk.frame
|
|
374 | +9 |
- #' @param ... same as the dplyr::group_by
+ #' @param compress The compression ratio for fst
|
|
375 | +10 |
- #' @export
+ #' @param overwrite overwrite existing directory
|
|
376 | +11 |
-
+ #' @param ... passed to data.table::foverlaps and disk.frame::cmap.disk.frame
|
|
377 | +12 |
- #' @rdname group_by
+ #' @import fst
|
|
378 | +13 |
- # TODO check shardkey
+ #' @importFrom glue glue
|
|
379 | +14 |
- # group_by.disk.frame <- function(.data, ...) {
-  # dplyr_fn = dplyr::group_by
-  #
-  # quo_dotdotdot = rlang::enquos(...)
-  #
-  # # this is designed to capture any global stuff
-  # vars_and_pkgs = future::getGlobalsAndPackages(quo_dotdotdot)
-  # data_for_eval_tidy = force(vars_and_pkgs$globals)
-  #
-  # res = map(.data, ~{
-  #   this_env = environment()
-  #
-  #   if(length(data_for_eval_tidy) > 0) {
-  #     for(i in 1:length(data_for_eval_tidy)) {
-  #       assign(names(data_for_eval_tidy)[i], data_for_eval_tidy[[i]], pos = this_env)
-  #     }
-  #   }
-  #
-  #   lapply(quo_dotdotdot, function(x) {
-  #     attr(x, ".Environment") = this_env
-  #   })
-  #
-  #   code = rlang::quo(dplyr_fn(.x, !!!quo_dotdotdot))
-  #   eval(parse(text=rlang::as_label(code)), envir = this_env)
-  # }, lazy = TRUE)
-  # }
-#group_by.disk.frame <- create_dplyr_mapper(dplyr::group_by, warning_msg = "The group_by operation is applied WITHIN each chunk, hence the results may not be as expected. To address this issue, you can rechunk(df, shardby = your_group_keys) which can be computationally expensive. Otherwise, you may use a second stage summary to obtain the desired result.")
-group_by.disk.frame <- create_dplyr_mapper(dplyr::group_by, warning_msg = "`group_by.disk.frame` is now deprecated. Please use `chunk_group_by` instead. This is in preparation for a more powerful `group_by` framework")
-  #function(...) {
-  #  stop("`arrange.disk.frame` has been removed. Please use `chunk_arrange` instead. This is preparation for a more powerful `group_by` framework")
-  #}
-
-#' @export
-#' @rdname group_by
-chunk_group_by <- create_dplyr_mapper(dplyr::group_by)
-
-#' @export
-#' @rdname dplyr_verbs
-glimpse.disk.frame <- function(.data, ...) {
-  glimpse(head(.data, ...), ...)
-}
-
-# Internal methods
-# @param .data the data
-# @param cmd the function to record
-record <- function(.data, cmd) {
-  attr(.data, "lazyfn") <- c(attr(.data, "lazyfn"), list(cmd))
-  .data
-}
-
-# Internal methods
-# @param .data the disk.frame
-# @param cmds the list of functions to play back
-play <- function(.data, cmds = NULL) {
-  for (cmd in cmds) {
-    if (typeof(cmd) == "closure") {
-      .data <- cmd(.data)
-    } else {
-      # create a temporary environment
-      an_env = new.env(parent = environment())
-
-      ng = names(cmd$vars_and_pkgs$globals)
-
-      if(length(ng) > 0) {
-        for(i in 1:length(cmd$vars_and_pkgs$globals)) {
-          g = cmd$vars_and_pkgs$globals[[i]]
-          assign(ng[i], g, pos = an_env)
-        }
-      }
-
-      .data <- do.call(cmd$func, c(list(.data), cmd$dotdotdot), envir = an_env)
-    }
-  }
-  .data
-}
+#' @importFrom data.table foverlaps data.table setDT setkeyv
+#' @importFrom future.apply future_lapply
+#' @importFrom pryr do_call
+#' @export
+#' @examples
+#' library(data.table)
+#'
+#' ## simple example:
+#' x = as.disk.frame(data.table(start = c(5, 31, 22, 16), end = c(8, 50, 25, 18), val2 = 7:10))
+#' y = as.disk.frame(data.table(start = c(10, 20, 30), end = c(15, 35, 45), val1 = 1:3))
+#' byxy = c("val1", "start", "end")
+#' xy.df = foverlaps.disk.frame(
+#'   x, y, by.x = byxy, by.y = byxy,
+#'   merge_by_chunk_id = TRUE, overwrite = TRUE)
+#'
+#' # clean up
+#' delete(x)
+#' delete(y)
+#' delete(xy.df)
+foverlaps.disk.frame <- function(df1, df2, by.x = if (identical(shardkey(df1)$shardkey, "")) shardkey(df1)$shardkey else shardkey(df2)$shardkey, by.y = shardkey(df2)$shardkey, ..., outdir = tempfile("df_foverlaps_tmp", fileext = ".df"), merge_by_chunk_id = FALSE, compress = 50, overwrite = TRUE) {
+
+  stopifnot("disk.frame" %in% class(df1))
+
+  overwrite_check(outdir, overwrite)
+
+  if("data.frame" %in% class(df2)) {
+    cmap.disk.frame(df1, ~foverlaps(.x, df2, ...), ..., lazy = FALSE, compress = compress, overwrite = overwrite)
+  } else if (merge_by_chunk_id | (identical(shardkey(df1), shardkey(df2)))) {
+    # if the shardkeys are the same then we only need to match by chunk id,
+    # as rows with the same shardkey must end up in the same chunk
+    path1 = attr(df1, "path")
+    path2 = attr(df2, "path")
+
+    df3 = merge(
+      data.table(
+        chunk_id = list.files(path1),
+        pathA = list.files(path1, full.names = TRUE),
+        file_id = 1:length(list.files(path1))
+      ),
+      data.table(
+        chunk_id = list.files(path2),
+        pathB = list.files(path2, full.names = TRUE)
+      ),
+      by = "chunk_id"
+    )
+    setDT(df3)
+
+    dotdotdot = list(...)
+
+    furrr::future_map(1:nrow(df3), function(row) {
+      chunk_id = df3[row, chunk_id]
+
+      data1 = get_chunk(df1, chunk_id)
+      data2 = get_chunk(df2, chunk_id)
+
+      setDT(data1)
+      setDT(data2)
+
+      setkeyv(data2, by.y[(length(by.y) - 2 + 1):length(by.y)])
+
+      dotdotdot$x = data1
+      dotdotdot$y = data2
+      data3 = pryr::do_call(foverlaps, dotdotdot)
+      rm(data1); rm(data2); gc()
+      fst::write_fst(data3, glue::glue("{outdir}/{chunk_id}"), compress = compress)
+      rm(data3); gc()
+      NULL
+    })
+    return(disk.frame(outdir))
+  } else {
+    stop("foverlaps.disk.frame: only merge_by_chunk_id = TRUE is implemented")
+  }
+}
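
Note: `foverlaps.disk.frame` above hands each matched pair of chunks to `data.table::foverlaps`. A minimal sketch of that per-chunk step, using made-up interval tables rather than anything from this patch:

library(data.table)

# one x-chunk and one y-chunk, as foverlaps.disk.frame would read them
x = data.table(start = c(5, 31, 22, 16), end = c(8, 50, 25, 18), val2 = 7:10)
y = data.table(start = c(10, 20, 30), end = c(15, 35, 45), val1 = 1:3)

# foverlaps() requires y to be keyed on its interval columns;
# this is what the setkeyv(data2, ...) call above does for every chunk
setkeyv(y, c("start", "end"))

# rows of x whose [start, end] interval overlaps an interval in y
foverlaps(x, y, by.x = c("start", "end"), type = "any")
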
@@ -20730,3071 +24421,2969 @@ disk.frame coverage - 51.96%
-#' @param by join by
-#' @param copy same as dplyr::anti_join
-#' @param merge_by_chunk_id the merge is performed by chunk id
-#' @param overwrite overwrite output directory
-#' @param ... same as dplyr's joins
-#' @rdname join
-#' @importFrom rlang quo enquos
-#' @importFrom dplyr anti_join left_join full_join semi_join inner_join
-#' @return disk.frame or data.frame/data.table
-#' @export
-#' @examples
-#' df.df = as.disk.frame(data.frame(x = 1:3, y = 4:6), overwrite = TRUE)
-#' df2.df = as.disk.frame(data.frame(x = 1:2, z = 10:11), overwrite = TRUE)
-#'
-#' anti_joined.df = anti_join(df.df, df2.df)
-#'
-#' anti_joined.df %>% collect
-#'
-#' anti_joined.data.frame = anti_join(df.df, data.frame(x = 1:2, z = 10:11))
-#'
-#' # clean up
-#' delete(df.df)
-#' delete(df2.df)
-#' delete(anti_joined.df)
-anti_join.disk.frame <- function(x, y, by = NULL, copy = FALSE, ..., outdir = tempfile("tmp_disk_frame_anti_join"), merge_by_chunk_id = FALSE, overwrite = TRUE) {
-
-  stopifnot("disk.frame" %in% class(x))
-
-  overwrite_check(outdir, overwrite)
-
-  if("data.frame" %in% class(y)) {
-    quo_dotdotdot = enquos(...)
-    map_dfr(x, ~{
-      code = quo(anti_join(.x, y, by = by, copy = copy, !!!quo_dotdotdot))
-      rlang::eval_tidy(code)
-    })
-  } else if("disk.frame" %in% class(y)) {
-    if(is.null(merge_by_chunk_id)) {
-      stop("both x and y are disk.frames. You need to specify merge_by_chunk_id = TRUE or FALSE explicitly")
-    }
-    if(is.null(by)) {
-      by <- intersect(names(x), names(y))
-    }
-
-    ncx = nchunks(x)
-    ncy = nchunks(y)
-    if (merge_by_chunk_id == FALSE) {
-      warning("merge_by_chunk_id = FALSE. This will take significantly longer and the preparations needed are performed eagerly which may lead to poor performance. Consider making y a data.frame or set merge_by_chunk_id = TRUE for better performance.")
-      x = hard_group_by(x, by, nchunks = max(ncy, ncx), overwrite = TRUE)
-      y = hard_group_by(y, by, nchunks = max(ncy, ncx), overwrite = TRUE)
-      return(anti_join.disk.frame(x, y, by, copy = copy, outdir = outdir, merge_by_chunk_id = TRUE, overwrite = overwrite))
-    } else if ((identical(shardkey(x)$shardkey, "") & identical(shardkey(y)$shardkey, "")) | identical(shardkey(x), shardkey(y))) {
-      #res = map2.disk.frame(x, y, ~{
-      res = map2(x, y, ~{
-        if(is.null(.y)) {
-          return(.x)
-        } else if (is.null(.x)) {
-          return(data.table())
-        }
-        anti_join(.x, .y, by = by, copy = copy, ..., overwrite = overwrite)
-      }, outdir = outdir)
-      return(res)
-    } else {
-      # TODO if the shardkeys are the same and only the shardchunks are different then just sharding again on one of them is fine
-      stop("merge_by_chunk_id is TRUE but shardkey(x) does NOT equal shardkey(y). You may want to perform a hard_group_by() on both x and/or y or set merge_by_chunk_id = FALSE")
-    }
-  }
-}
-
-#' Check if the outdir exists or not
-#' @description
-#' If overwrite is TRUE then the folder will be deleted, otherwise the folder will be created.
-#' @param outdir the output directory
-#' @param overwrite TRUE or FALSE: if `outdir` exists and overwrite = FALSE then throw an error
-#' @import fs
-#' @importFrom glue glue
-#' @export
-#'
-#' @examples
-#' tf = tempfile()
-#' overwrite_check(tf, overwrite = FALSE)
-#' overwrite_check(tf, overwrite = TRUE)
-#'
-#' # clean up
-#' fs::dir_delete(tf)
-overwrite_check <- function(outdir, overwrite) {
-  if (is.null(outdir)) {
-    warning("outdir is NULL; no overwrite check is performed")
-    return(NULL)
-  }
-
-  if(overwrite & fs::dir_exists(outdir)) {
-    if(!is_disk.frame(outdir)) {
-      stop(glue::glue("The directory {outdir} is not a disk.frame folder. Execution has stopped to prevent accidental deletion of potentially important files"))
-    }
-    if(length(fs::dir_ls(outdir, all = TRUE)) != 0) {
-      tryCatch({
-        fs::dir_delete(outdir)
-      }, error = function(e) {
-        message(e)
-        stop(glue::glue("Failed to delete the directory {outdir} in preparation for overwrite. This could be due to many reasons and may be a genuine bug. Firstly, though, please ensure you do not have the folder open in Explorer (Windows) or another file management system"))
-      })
-    }
-    fs::dir_create(outdir)
-  } else if(overwrite == FALSE & fs::dir_exists(outdir)) {
-    stop(glue::glue("overwrite = FALSE and outdir '{outdir}' already exists"))
-  } else {
-    fs::dir_create(outdir)
-  }
-}
-
-#' Move or copy a disk.frame to another location
-#'
-#' @param df The disk.frame
-#' @param outdir The new location
-#' @param copy Merely copy and not move
-#' @param ... NOT USED
-#'
-#' @return a disk.frame
-#' @export
-#' @examples
-#' cars.df = as.disk.frame(cars)
-#'
-#' cars_copy.df = copy_df_to(cars.df, outdir = tempfile(fileext = ".df"))
-#'
-#' cars2.df = move_to(cars.df, outdir = tempfile(fileext = ".df"))
-#'
-#' # clean up
-#' delete(cars_copy.df)
-#' delete(cars2.df)
-move_to <- function(df, outdir, ..., copy = FALSE) {
-  if(!fs::dir_exists(outdir)) {
-    fs::dir_create(outdir)
-  }
-
-  if(!copy %in% c(TRUE, FALSE)) {
-    stop("disk.frame::move_to ERROR: copy argument must be TRUE or FALSE")
-  }
-
-  ## copy all files over
-  listfiles = list.files(attr(df, "path"), full.names = TRUE)
-  shortlistfiles = list.files(attr(df, "path"))
-  purrr::walk2(listfiles, shortlistfiles, ~{
-    if(copy) {
-      fs::file_copy(.x, file.path(outdir, .y))
-    } else {
-      fs::file_move(.x, file.path(outdir, .y))
-    }
-  })
-
-  ## copy .metadata over
-  fs::dir_create(file.path(outdir, ".metadata"))
-
-  metadata_path = file.path(attr(df, "path"), ".metadata")
-
-  listfiles = list.files(metadata_path, full.names = TRUE)
-  shortlistfiles = list.files(metadata_path)
-  purrr::walk2(listfiles, shortlistfiles, ~{
-    if(copy) {
-      fs::file_copy(.x, file.path(outdir, ".metadata", .y))
-    } else {
-      fs::file_move(.x, file.path(outdir, ".metadata", .y))
-    }
-  })
-
-  if(!copy) {
-    delete(df)
-  }
-
-  disk.frame(outdir)
-}
-
-#' @rdname move_to
-#' @export
-copy_df_to <- function(df, outdir, ...) {
-  move_to(df, outdir, ..., copy = TRUE)
-}
-
-#' Removes a chunk from the disk.frame
-#' @param df a disk.frame
-#' @param chunk_id the chunk ID of the chunk to remove. If it's a number then remove number.fst
-#' @param full.names TRUE or FALSE. Defaults to FALSE. If TRUE then chunk_id is the full path to the chunk, otherwise it's the relative path
-#' @export
-#' @examples
-#' # TODO add these to tests
-#' cars.df = as.disk.frame(cars, nchunks = 4)
-#'
-#' # removes 3rd chunk
-#' remove_chunk(cars.df, 3)
-#' nchunks(cars.df) # 3
-#'
-#' # removes 4th chunk
-#' remove_chunk(cars.df, "4.fst")
-#' nchunks(cars.df) # 2
-#'
-#' # removes 2nd chunk
-#' remove_chunk(cars.df, file.path(attr(cars.df, "path"), "2.fst"), full.names = TRUE)
-#' nchunks(cars.df) # 1
-#'
-#' # clean up cars.df
-#' delete(cars.df)
-remove_chunk <- function(df, chunk_id, full.names = FALSE) {
-  filename = ""
-  path = attr(df, "path")
-  if(is.numeric(chunk_id)) {
-    filename = file.path(path, glue::glue("{as.integer(chunk_id)}.fst"))
-  } else {
-    if (full.names) {
-      filename = chunk_id
-    } else {
-      filename = file.path(path, chunk_id)
-    }
-  }
-
-  #if(filename %in% fs::dir_ls(path, glob = "*.fst")) {
-  if(fs::file_exists(filename)) {
-    fs::file_delete(filename)
-  } else {
-    warning(glue::glue("the chunk {filename} does not exist and hence can't be removed; make sure you suffix the file with the .fst extension"))
-  }
-  df
-}
-
-#' Write disk.frame to disk
-#' @description
-#' Write a data.frame/disk.frame to a disk.frame location. If df is a data.frame
-#' then using the as.disk.frame function is recommended for most cases
-#' @param df a disk.frame
-#' @param outdir output directory for the disk.frame
-#' @param nchunks number of chunks
-#' @param overwrite overwrite output directory
-#' @param shardby the columns to shard by
-#' @param compress compression ratio for fst files
-#' @param shardby_function splitting of chunks: "hash" for hash function or "sort" for semi-sorted chunks
-#' @param sort_splits for the "sort" shardby function, a dataframe with the split values.
-#' @param desc_vars for the "sort" shardby function, the variables to sort descending.
-#' @param ... passed to map.disk.frame
-#' @export
-#' @import fst fs
-#' @importFrom glue glue
-#' @examples
-#' cars.df = as.disk.frame(cars)
-#'
-#' # write out a lazy disk.frame to disk
-#' cars2.df = write_disk.frame(map(cars.df, ~.x[1,]), overwrite = TRUE)
-#' collect(cars2.df)
-#'
-#' # clean up cars.df
-#' delete(cars.df)
-#' delete(cars2.df)
-write_disk.frame <- function(
-  df,
-  outdir = tempfile(fileext = ".df"),
-  nchunks = ifelse(
-    "disk.frame" %in% class(df),
-    nchunks.disk.frame(df),
-    recommend_nchunks(df)),
-  overwrite = FALSE,
-  shardby = NULL, compress = 50, shardby_function = "hash", sort_splits = NULL, desc_vars = NULL, ...) {
-
-  force(nchunks)
-  overwrite_check(outdir, overwrite)
-
-  if(is.null(outdir)) {
-    stop("outdir must not be NULL")
-  }
-
-  if(is_disk.frame(df)) {
-    if(is.null(shardby)) {
-      map.disk.frame(df, ~.x, outdir = outdir, lazy = FALSE, ..., compress = compress, overwrite = TRUE)
-    } else {
-      # TODO really inefficient
-      #df2 = map.disk.frame(df, ~.x, outdir = outdir, lazy = FALSE, ..., compress = compress, overwrite = TRUE)
-      shard(df,
-            outdir = outdir,
-            nchunks = nchunks,
-            overwrite = TRUE,
-            shardby = shardby,
-            compress = compress,
-            shardby_function = shardby_function,
-            sort_splits = sort_splits, desc_vars = desc_vars,
-            ...
-      )
-    }
-  } else if ("data.frame" %in% class(df)) {
-    if(".out.disk.frame.id" %in% names(df)) {
-      df[,{
-        if (base::nrow(.SD) > 0) {
-          list_columns = purrr::map_lgl(.SD, is.list)
-          if(any(list_columns)){
-            stop(glue::glue("The data frame contains these list-columns: '{paste0(names(.SD)[list_columns], collapse='\', \'')}'. List-columns are not yet supported by disk.frame. Remove these columns to create a disk.frame"))
-          } else {
-            fst::write_fst(.SD, file.path(outdir, paste0(.BY, ".fst")), compress = compress)
-            NULL
-          }
-        }
-        NULL
-      }, .out.disk.frame.id]
-      res = disk.frame(outdir)
-      add_meta(res, shardkey = shardby, shardchunks = nchunks, compress = compress)
-    } else {
-      as.disk.frame(df, outdir = outdir, nchunks = nchunks, overwrite = TRUE, shardby = shardby, compress = compress, ...)
-    }
-  } else {
-    stop("write_disk.frame error: df must be a disk.frame or data.frame")
-  }
-}
-
-#' @rdname write_disk.frame
-output_disk.frame <- function(...) {
-  warning("output_disk.frame is DEPRECATED. Use write_disk.frame instead")
-  write_disk.frame(...)
-}
-
-#' Fit generalized linear models (glm) with disk.frame
-#'
-#' @inherit biglm::bigglm
-#'
-#' @description
-#' Fits GLMs using `{speedglm}` or `{biglm}`. The returned object will be exactly
-#' as returned by those functions. This is a convenience wrapper.
-#'
-#' @param glm_backend Which package to use for fitting GLMs. The default is
-#' "biglm", which has known issues with factor levels if different levels are
-#' present in different chunks. The "speedglm" option is more robust, but does not
-#' implement `predict`, which makes prediction impossible.
-#'
-#' @family Machine Learning (ML)
-#' @export
-#'
-#' @examples
-#' cars.df = as.disk.frame(cars)
-#' m = dfglm(dist ~ speed, data = cars.df)
-#'
-#' # can use normal R functions
-#' summary(m)
-#' predict(m, get_chunk(cars.df, 1))
-#' predict(m, collect(cars.df))
-#'
-#' # can use broom to tidy up the returned info
-#' broom::tidy(m)
-#'
-#' # clean up
-#' delete(cars.df)
-dfglm <- function(formula, data, ..., glm_backend = c("biglm", "speedglm")) {
-  glm_backend = match.arg(glm_backend)
-
-  stopifnot(is_disk.frame(data))
-  streaming_fn <- make_glm_streaming_fn(data)
-
-  if(glm_backend == "speedglm") {
-    if(!requireNamespace("speedglm")) {
-      stop("speedglm package not installed. To install run `install.packages('speedglm')`")
-    }
-    speedglm::shglm(formula, streaming_fn, ...)
-  } else if (glm_backend == "biglm") {
-    if(!requireNamespace("biglm")) {
-      stop("biglm package not installed. To install run `install.packages('biglm')`")
-    }
-    biglm::bigglm(formula, data = streaming_fn, ...)
-  } else {
-    stop("glm_backend must be one of 'speedglm' or 'biglm'")
-  }
-}
-
-#' Shard a data.frame/data.table or disk.frame into chunks and save it as a disk.frame
-#' @param df A data.frame/data.table or disk.frame. If disk.frame, then rechunk(df, ...) is run
-#' @param shardby The column(s) to shard the data by.
-#' @param nchunks The number of chunks
-#' @param outdir The output directory of the disk.frame
-#' @param overwrite If TRUE then the chunks are overwritten
-#' @param shardby_function splitting of chunks: "hash" for hash function or "sort" for semi-sorted chunks
-#' @param sort_splits If shardby_function is "sort", the split values for sharding
-#' @param desc_vars for the "sort" shardby function, the variables to sort descending.
-#' @param ... not used
-#' @importFrom data.table setDT
-#' @importFrom glue glue
-#' @export
-#' @examples
-#'
-#' # shard the iris data.frame by Species so that rows with the same species are in the same chunk
-#' iris.df = shard(iris, "Species")
-#'
-#' # clean up iris.df
-#' delete(iris.df)
-shard <- function(df, shardby, outdir = tempfile(fileext = ".df"), ..., nchunks = recommend_nchunks(df), overwrite = FALSE, shardby_function = "hash", sort_splits = NULL, desc_vars = NULL) {
-  force(nchunks)
-  overwrite_check(outdir, overwrite)
-  stopifnot(shardby_function %in% c("hash", "sort"))
-
-  if("data.frame" %in% class(df)) {
-    data.table::setDT(df)
-    if(shardby_function == "hash") {
-      message("Hashing...")
-      if(length(shardby) == 1) {
-        code = glue::glue("df[,.out.disk.frame.id := hashstr2i(as.character({shardby}), nchunks)]")
-      } else {
-        shardby_list = glue::glue("paste0({paste0(sort(shardby),collapse=',')})")
-        code = glue::glue("df[,.out.disk.frame.id := hashstr2i({shardby_list}, nchunks)]")
-      }
-    } else if(shardby_function == "sort") {
-      if(nchunks == 1) {
-        message("Only one chunk: set .out.disk.frame.id = 0")
-        code = glue::glue("df[,.out.disk.frame.id := 0]")
-      } else {
-        shard_by_rule <- sortablestr2i(sort_splits, desc_vars)
-        message(shard_by_rule)
-        code = glue::glue("df[,.out.disk.frame.id := {shard_by_rule}]")
-      }
-    }
-
-    tryCatch(
-      eval(parse(text = code)),
-      error = function(e) {
-        message("error occurred in shard")
-      }
-    )
-
-    stopifnot(".out.disk.frame.id" %in% names(df))
-
-    res = write_disk.frame(df, outdir = outdir, nchunks = nchunks, overwrite = TRUE, shardby = shardby, shardchunks = nchunks, shardby_function = shardby_function, sort_splits = sort_splits, desc_vars = desc_vars)
-    return(res)
-  } else if ("disk.frame" %in% class(df)) {
-    nchunks_rechunk = nchunks
-    return(rechunk(df, shardby = shardby, nchunks = nchunks_rechunk, outdir = outdir, overwrite = TRUE, shardby_function = shardby_function, sort_splits = sort_splits, desc_vars = desc_vars))
-  }
-}
-
-#' `distribute` is an alias for `shard`
-#' @export
-#' @rdname shard
-distribute <- function(...) {
-  UseMethod("shard")
-}
+#' Add a chunk to the disk.frame
+#'
+#' If no chunk_id is specified, then the chunk is added at the end as the
+#' largest numbered file, "n.fst".
+#'
+#' @details The function is the preferred way to add a chunk to a disk.frame. It
+#' performs checks on the types to make sure that the new chunk doesn't have
+#' different types to the disk.frame.
+#'
+#' @param df the disk.frame to add a chunk to
+#' @param chunk a data.frame to be added as a chunk
+#' @param chunk_id a numeric number indicating the id of the chunk. If NULL it
+#' will be set to the largest chunk_id + 1
+#' @param full.names whether the chunk_id name match should be to the full file
+#' path not just the file name
+#' @importFrom data.table data.table
+#' @importFrom utils capture.output
+#' @export
+#' @return disk.frame
+#' @examples
+#' # create a disk.frame
+#' df_path = file.path(tempdir(), "tmp_add_chunk")
+#' diskf = disk.frame(df_path)
+#'
+#' # add a chunk to diskf
+#' add_chunk(diskf, cars)
+#' add_chunk(diskf, cars)
+#'
+#' nchunks(diskf) # 2
+#'
+#' df2 = disk.frame(file.path(tempdir(), "tmp_add_chunk2"))
+#'
+#' # add chunks by specifying the chunk_id number; this is especially useful if
+#' # you wish to add multiple chunks in parallel
+#'
+#' add_chunk(df2, data.frame(chunk = 1), 1)
+#' add_chunk(df2, data.frame(chunk = 2), 3)
+#'
+#' nchunks(df2) # 2
+#'
+#' dir(attr(df2, "path", exact = TRUE))
+#' # [1] "1.fst" "3.fst"
+#'
+#' # clean up
+#' delete(diskf)
+#' delete(df2)
+add_chunk <- function(df, chunk, chunk_id = NULL, full.names = FALSE) {
+  # sometimes chunk_id is defined in terms of itself
+  force(chunk_id)
+  stopifnot("disk.frame" %in% class(df))
+  if(!is_disk.frame(df)) {
+    stop("can not add_chunk as this is not a disk.frame")
+  }
+
+  # get the metadata for all chunks
+  path = attr(df, "path", exact = TRUE)
+  files <- fs::dir_ls(path, type = "file", glob = "*.fst")
+
+  # if a chunk_id is not specified
+  if(is.null(chunk_id)) {
+    chunk_id = 1 + max(purrr::map_int(files, ~{
+      s = stringr::str_extract(.x, "[:digit:]+\\.fst")
+      as.integer(substr(s, 1, nchar(s) - 4))
+    }), nchunks(df))
+  }
+
+  if(length(files) > 0) {
+    filename = ""
+
+    if(is.numeric(chunk_id)) {
+      filename = file.path(path, glue::glue("{as.integer(chunk_id)}.fst"))
+    } else { # if the chunk_id is not numeric
+      if (full.names) {
+        filename = file.path(path, chunk_id)
+      } else {
+        filename = chunk_id
+      }
+    }
+
+    if(fs::file_exists(filename)) {
+      stop(glue::glue("failed to add_chunk as chunk_id = {chunk_id} already exists"))
+    }
+
+    metas = purrr::map(files, fst::metadata_fst)
+
+    types <- c("unknown", "character", "factor", "ordered factor",
+               "integer", "POSIXct", "difftime", "IDate", "ITime", "double",
+               "Date", "POSIXct", "difftime", "ITime", "logical", "integer64",
+               "nanotime", "raw")
+
+    # need to ensure that all column names and types match
+    metas_df = purrr::imap_dfr(metas,
+      ~data.table::data.table(
+        colnames = .x$columnNames,
+        coltypes = types[.x$columnTypes],
+        chunk_id = .y))
+
+    metas_df_summ = metas_df[, .N, .(colnames, coltypes)][order(N)]
+    metas_df_summ[, existing_df := TRUE]
+
+    new_chunk_meta =
+      data.table::data.table(
+        colnames = names(chunk),
+        coltypes = purrr::map(chunk, typeof) %>% unlist,
+        new_chunk = TRUE)
+
+    merged_meta = full_join(new_chunk_meta, metas_df_summ, by = c("colnames"))
+    data.table::setDT(merged_meta)
+
+    # find out which vars are matched
+    check_vars = full_join(
+      new_chunk_meta[, .(colnames, new_chunk)],
+      metas_df[, .(colnames = unique(colnames), existing_df = TRUE)], by = "colnames")
+
+    data.table::setDT(check_vars)
+    if(nrow(check_vars[is.na(new_chunk)]) > 0) {
+      warning(
+        glue::glue(
+          "these variables are in the disk.frame but not in the new chunk: \n {paste0(check_vars[is.na(new_chunk), colnames], collapse=',\n ')}"))
+    }
+    if(nrow(check_vars[is.na(existing_df)]) > 0) {
+      warning(glue::glue("these variables are in the new chunk but not in the existing disk.frame: {paste0(check_vars[is.na(existing_df), colnames], collapse=', ')}"))
+    }
+
+    # find out which vars are matched but the types don't match
+    metas_df_summ1 = merged_meta[existing_df == TRUE & new_chunk == TRUE & coltypes.x != coltypes.y]
+    # find incompatible types
+    metas_df_summ1[, incompatible_types := {
+      coltypes.x %in% c("integer", "double", "Date") & coltypes.y == "character" |
+        coltypes.x == "character" & coltypes.y %in% c("integer", "double", "Date") |
+        coltypes.x %in% c("integer", "double") & coltypes.y == "Date" |
+        coltypes.x == "Date" & coltypes.y %in% c("integer", "double")
+    }]
+
+    metas_df_summ2 = metas_df_summ1[incompatible_types == TRUE,]
+
+    if(nrow(metas_df_summ2) > 0) {
+      message("the types below are incompatible between the new chunk and the disk.frame; this chunk can not be added\n")
+      message(paste0(utils::capture.output(metas_df_summ2), collapse = "\n"))
+      stop("")
+    }
+  }
+
+  fst::write_fst(chunk, file.path(attr(df, "path", exact = TRUE), paste0(chunk_id, ".fst")))
+  disk.frame(attr(df, "path", exact = TRUE))
+}
+
+#' `cmap2` applies a function to two disk.frames
+#' @description
+#' Perform a function on both disk.frames .x and .y, each chunk of .x and .y
+#' gets run by .f(x.chunk, y.chunk)
+#' @param .x a disk.frame
+#' @param .y a disk.frame
+#' @param .f a function to be called on each chunk of x and y matched by
+#' chunk_id
+#' @param ... not used
+#' @param outdir output directory
+#' @import stringr fst
+#' @importFrom purrr as_mapper map2
+#' @importFrom data.table data.table
+#' @export
+#' @rdname cmap2
+#' @examples
+#' cars.df = as.disk.frame(cars)
+#'
+#' cars2.df = cmap2(cars.df, cars.df, ~data.table::rbindlist(list(.x, .y)))
+#' collect(cars2.df)
+#'
+#' # clean up cars.df
+#' delete(cars.df)
+#' delete(cars2.df)
+cmap2 <- function(.x, .y, .f, ...) {
+  UseMethod("cmap2")
+}
+
+#' @export
+#' @rdname cmap2
+map2 <- function(.x, .y, .f, ...) {
+  UseMethod("map2")
+}
+
+#' @export
+map2.default <- function(.x, .y, .f, ...) {
+  purrr::map2(.x, .y, .f, ...)
+}
+
+#' @export
+map2.disk.frame <- function(...) {
+  warning("map2.disk.frame(df, df1, ..) where df is a disk.frame is deprecated. Use cmap2(df, df1, ...) instead")
+  cmap2.disk.frame(...)
+}
+
+#' @export
+#' @importFrom pryr do_call
+cmap2.disk.frame <- function(.x, .y, .f, ..., outdir = tempfile(fileext = ".df"), .progress = TRUE) {
+  if(!"disk.frame" %in% class(.x)) {
+    code = deparse(substitute(cmap2.disk.frame(.x, .y, ...))) %>% paste(collapse = "\n")
+    stop(sprintf("running %s : the .x argument must be a disk.frame", code))
+  }
+
+  .f = purrr::as_mapper(.f)
+
+  if("disk.frame" %in% class(.y)) {
+    fs::dir_create(outdir)
+
+    # get all the chunk ids
+    xc = data.table(cid = get_chunk_ids(.x))
+    xc[, xid := get_chunk_ids(.x, full.names = TRUE)]
+    yc = data.table(cid = get_chunk_ids(.y))
+    yc[, yid := get_chunk_ids(.y, full.names = TRUE)]
+
+    xyc = merge(xc, yc, by = "cid", all = TRUE, allow.cartesian = TRUE)
+
+    ddd = list(...)
+
+    # apply the function to each pair of chunks matched by chunk id
+    furrr::future_pmap(list(xyc$xid, xyc$yid, xyc$cid), function(xid, yid, outid) {
+      xch = disk.frame::get_chunk(.x, xid, full.names = TRUE)
+      ych = disk.frame::get_chunk(.y, yid, full.names = TRUE)
+      xych = .f(xch, ych)
+      if(base::nrow(xych) > 0) {
+        fst::write_fst(xych, file.path(outdir, paste0(outid, ".fst")))
+      } else {
+        warning(glue::glue("one of the chunks, {xid}, is empty"))
+      }
+      NULL
+    }, .progress = .progress)
+
+    return(disk.frame(outdir))
+  } else {
+    # if .y is not a disk.frame
+    warning("in cmap2(.x,.y,...) the .y is not a disk.frame, so returning a list instead of a disk.frame")
+
+    f_for_passing = force(.f)
+    ddd = list(...)
+    tmp_disk.frame = force(.x)
+    res = furrr::future_map2(get_chunk_ids(tmp_disk.frame, full.names = TRUE), .y, function(xs, ys) {
+      ddd = c(list(get_chunk(tmp_disk.frame, xs, full.names = TRUE), ys), ddd)
+      pryr::do_call(f_for_passing, ddd)
+    })
+
+    return(res)
+  }
+}
+
+#' Convert a SAS file (.sas7bdat format) to CSV or disk.frame by chunk
+#' @param infile the SAS7BDAT file
+#' @param chunk which chunk of nchunks to convert
+#' @param nchunks number of chunks
+#' @param sas2csvpath path to sas2csv.exe
+#' @param sep separator of the CSV file, defaults to |
+#' @family ingesting data
+#' @noRd
+sas_to_csv <- function(infile, chunk, nchunks = disk.frame::recommend_nchunks(fs::file_size(infile)), sas2csvpath = "sas2csv/sas2csv.exe", sep = "|") {
+  if(!file.exists(sas2csvpath)) {
+    stop("You must have the sas2csv.exe installed. Only Windows is supported at the moment. Please contact the author")
+  }
+  sasfile = glue::glue('"{infile}"')
+  fs::dir_create(file.path("outcsv"))
+  fs::dir_create(file.path("outcsv", chunk))
+  options = glue::glue("-o outcsv/{chunk}/ -d {sep} -c -n {nchunks} -k {paste(chunk-1,collapse = ' ')} -m")
+
+  cmd = paste(sas2csvpath, sasfile, options)
+  system(cmd)
+}
+
+#' Convert a SAS file (.sas7bdat format) to disk.frame via CSVs
+#' @param inpath input SAS7BDAT file
+#' @param outpath output disk.frame
+#' @param nchunks number of chunks
+#' @param sep separator of the intermediate CSV file, defaults to |
+#' @param remove_csv TRUE/FALSE. Remove the intermediate CSV after usage?
+#' @importFrom future %<-%
+#' @family ingesting data
+#' @noRd
+sas_to_disk.frame = function(inpath, outpath, nchunks = disk.frame::recommend_nchunks(inpath), sas2csvpath = "sas2csv/sas2csv.exe", sep = "|", remove_csv = TRUE) {
+  files = file.path(outpath, paste0(1:nchunks, ".fst"))
+  ready = rep(FALSE, nchunks) | file.exists(files)
+  extracting = rep(FALSE, nchunks)
+
+  fs::dir_create(outpath)
+  fs::dir_delete(file.path("outcsv"))
+
+  print("this program converts SAS datasets to CSV first before conversion to disk.frame.")
+  print(glue::glue("the intermediate CSVs are here: {file.path(getwd(), 'outcsv')}"))
+
+  while(!all(ready)) {
+    done1 = FALSE
+    extracting_jobs = FALSE
+    for(w in which(!ready)) {
+      incsv = file.path("outcsv", w, paste0("_", w - 1, ".csv"))
+      if(file.exists(incsv)) {
+        done1 = TRUE
+        ready[w] = TRUE
+        ok %<-% {
+          fst::write_fst(data.table::fread(incsv), file.path(outpath, paste0(w, ".fst.tmp")))
+          file.rename(file.path(outpath, paste0(w, ".fst.tmp")), file.path(outpath, paste0(w, ".fst")))
+          if(remove_csv) {
+            file.remove(incsv)
+          }
+          gc()
+        }
+        print(glue::glue("converting: {w} of {nchunks}; time: {Sys.time()}"))
+      } else if (!extracting_jobs & !extracting[w]) {
+        done1 = TRUE
+        extracting_jobs = TRUE
+        extracting[w] <- TRUE
+        ok %<-% {
+          sas_to_csv(inpath, w, nchunks, sas2csvpath, sep = sep)
+        }
+        print(glue::glue("extracting: {w} of {nchunks}; time: {Sys.time()}"))
+      }
+    }
+    if(!done1) {
+      print(glue::glue("didn't get any work: {Sys.time()}"))
+      Sys.sleep(18)
+    }
+  }
+}
+
+#' Write disk.frame to disk
+#' @description
+#' Write a data.frame/disk.frame to a disk.frame location. If df is a data.frame
+#' then using the as.disk.frame function is recommended for most cases
+#' @param df a disk.frame
+#' @param outdir output directory for the disk.frame
+#' @param nchunks number of chunks
+#' @param overwrite overwrite output directory
+#' @param shardby the columns to shard by
+#' @param compress compression ratio for fst files
+#' @param shardby_function splitting of chunks: "hash" for hash function or "sort" for semi-sorted chunks
+#' @param sort_splits for the "sort" shardby function, a dataframe with the split values.
+#' @param desc_vars for the "sort" shardby function, the variables to sort descending.
+#' @param ... passed to cmap.disk.frame
+#' @export
+#' @import fst fs
+#' @importFrom glue glue
+#' @examples
+#' cars.df = as.disk.frame(cars)
+#'
+#' # write out a lazy disk.frame to disk
+#' cars2.df = write_disk.frame(cmap(cars.df, ~.x[1,]), overwrite = TRUE)
+#' collect(cars2.df)
+#'
+#' # clean up cars.df
+#' delete(cars.df)
+#' delete(cars2.df)
+write_disk.frame <- function(
+  df,
+  outdir = tempfile(fileext = ".df"),
+  nchunks = ifelse(
+    "disk.frame" %in% class(df),
+    nchunks.disk.frame(df),
+    recommend_nchunks(df)),
+  overwrite = FALSE,
+  shardby = NULL, compress = 50, shardby_function = "hash", sort_splits = NULL, desc_vars = NULL, ...) {
+
+  force(nchunks)
+  overwrite_check(outdir, overwrite)
+
+  if(is.null(outdir)) {
+    stop("outdir must not be NULL")
+  }
+
+  if(is_disk.frame(df)) {
+    if(is.null(shardby)) {
+      cmap.disk.frame(df, ~.x, outdir = outdir, lazy = FALSE, ..., compress = compress, overwrite = TRUE)
+    } else {
+      # TODO really inefficient
+      #df2 = cmap.disk.frame(df, ~.x, outdir = outdir, lazy = FALSE, ..., compress = compress, overwrite = TRUE)
+      shard(df,
+            outdir = outdir,
+            nchunks = nchunks,
+            overwrite = TRUE,
+            shardby = shardby,
+            compress = compress,
+            shardby_function = shardby_function,
+            sort_splits = sort_splits, desc_vars = desc_vars,
+            ...
+      )
+    }
+  } else if ("data.frame" %in% class(df)) {
+    if(".out.disk.frame.id" %in% names(df)) {
+      df[,{
+        if (base::nrow(.SD) > 0) {
+          list_columns = purrr::map_lgl(.SD, is.list)
+          if(any(list_columns)){
+            stop(glue::glue("The data frame contains these list-columns: '{paste0(names(.SD)[list_columns], collapse='\', \'')}'. List-columns are not yet supported by disk.frame. Remove these columns to create a disk.frame"))
+          } else {
+            fst::write_fst(.SD, file.path(outdir, paste0(.BY, ".fst")), compress = compress)
+            NULL
+          }
+        }
+        NULL
+      }, .out.disk.frame.id]
+      res = disk.frame(outdir)
+      add_meta(res, shardkey = shardby, shardchunks = nchunks, compress = compress)
+    } else {
+      as.disk.frame(df, outdir = outdir, nchunks = nchunks, overwrite = TRUE, shardby = shardby, compress = compress, ...)
+    }
+  } else {
+    stop("write_disk.frame error: df must be a disk.frame or data.frame")
+  }
+}
+
+#' @rdname write_disk.frame
+output_disk.frame <- function(...) {
+  warning("output_disk.frame is DEPRECATED. Use write_disk.frame instead")
+  write_disk.frame(...)
+}
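
Note: a minimal sketch of how the new `add_chunk` and `cmap2` compose; the temporary path and the use of `cars` are illustrative assumptions, not part of the patch:

library(disk.frame)
setup_disk.frame(workers = 2)

# build a disk.frame one chunk at a time; add_chunk type-checks each addition
diskf = disk.frame(file.path(tempdir(), "incremental.df"))
diskf = add_chunk(diskf, cars[1:25, ])   # written as 1.fst
diskf = add_chunk(diskf, cars[26:50, ])  # written as 2.fst
nchunks(diskf)  # 2

# cmap2 pairs chunks of two disk.frames by chunk id and applies
# .f(x.chunk, y.chunk) to each pair, writing one output chunk per pair
both = cmap2(diskf, diskf, ~cbind(.x, dist2 = .y$dist * 2))
head(collect(both))

# clean up
delete(diskf)
delete(both)
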
@@ -23803,2115 +27392,2142 @@ disk.frame coverage - 51.96%
1 |
- #' Perform a hard arrange
+ #' Show the code to setup disk.frame
|
||
2 |
- #' @description
+ #' @export
|
||
3 |
- #' A hard_arrange is a sort by that also reorganizes the chunks to ensure that
+ show_ceremony <- function() { |
||
4 | -+ | ! |
- #' every unique grouping of `by`` is in the same chunk. Or in other words, every
+ glue::glue(crayon::green(ceremony_text())) |
5 |
- #' row that share the same `by` value will end up in the same chunk.
+ }
|
||
6 |
- #' @param df a disk.frame
+
|
||
7 |
- #' @param ... grouping variables
+
|
||
8 |
- #' @param outdir the output directory
+ #' @rdname show_ceremony
|
||
9 |
- #' @param nchunks The number of chunks in the output. Defaults = nchunks.disk.frame(df)
+ #' @export
|
||
10 |
- #' @param overwrite overwrite the out put directory
+ ceremony_text <- function() { |
||
11 |
- #' @param add same as dplyr::arrange
+ " |
||
12 |
- #' @param .drop same as dplyr::arrange
+ # this willl set disk.frame with multiple workers
|
||
13 | -+ | ! |
- #' @export
+ setup_disk.frame() |
14 |
- #' @examples
+ # this will allow unlimited amount of data to be passed from worker to worker
|
||
15 | -+ | ! |
- #' iris.df = as.disk.frame(iris, nchunks = 2)
+ options(future.globals.maxSize = Inf) |
16 |
- #'
+ " |
||
17 |
- #' # arrange iris.df by specifies and ensure rows with the same specifies are in the same chunk
+ }
|
||
18 |
- #' iris_hard.df = hard_arrange(iris.df, Species)
+
|
||
19 |
- #'
+ #' @rdname show_ceremony
|
||
20 |
- #' get_chunk(iris_hard.df, 1)
+ #' @export
|
||
21 | -+ | ! |
- #' get_chunk(iris_hard.df, 2)
+ show_boilerplate <- function() show_ceremony() |
22 |
- #'
+
|
||
23 |
- #' # clean up cars.df
+ #' @rdname show_ceremony
|
||
24 |
- #' delete(iris.df)
+ #' @export
|
||
25 |
- #' delete(iris_hard.df)
+ insert_ceremony <- function() { |
||
26 | -+ | ! |
- hard_arrange <- function(df, ..., add = FALSE, .drop = FALSE) {+ if(requireNamespace("rstudioapi")) { |
27 | -9x | +! |
- UseMethod("hard_arrange")+ rstudioapi::insertText(ceremony_text()) |
28 |
- }
+ } else { |
||
29 | -+ | ! |
-
+ stop("insert ceremony can only be used inside RStudio") |
30 |
- #' @rdname hard_arrange
+ }
|
||
31 |
- #' @export
+ }
|
32 | +1 |
- #' @importFrom dplyr arrange
+ #' Obtain one chunk by chunk id
+ |
+ ||
2 | ++ |
+ #' @param df a disk.frame
+ |
+ ||
3 | ++ |
+ #' @param n the chunk id. If numeric then matches by number, if character then returns the chunk with the same name as n
+ |
+ ||
4 | ++ |
+ #' @param keep the columns to keep
+ |
+ ||
5 | ++ |
+ #' @param full.names whether n is the full path to the chunks or just a relative path file name. Ignored if n is numeric
+ |
+ ||
6 | ++ |
+ #' @param ... passed to fst::read_fst or whichever read function is used in the backend
|
||
33 | +7 |
- hard_arrange.data.frame <- function(df, ...) {+ #' @export
|
||
34 | -3x | +|||
8 | +
- dplyr::arrange(df, ...)+ #' @examples
|
|||
35 | +9 |
- }
+ #' cars.df = as.disk.frame(cars, nchunks = 2)
|
||
36 | +10 |
-
+ #' get_chunk(cars.df, 1)
|
||
37 | +11 |
- #' @rdname hard_arrange
+ #' get_chunk(cars.df, 2)
|
||
38 | +12 |
- #' @importFrom purrr map
+ #' get_chunk(cars.df, 1, keep = "speed")
|
||
39 | +13 |
- #' @export
+ #'
|
||
40 | +14 |
- hard_arrange.disk.frame <- function(df, ..., outdir=tempfile("tmp_disk_frame_hard_arrange"), nchunks = disk.frame::nchunks(df), overwrite = TRUE) {+ #' # if full.names = TRUE then the full path to the chunk need to be provided
|
||
41 | -6x | +|||
15 | +
- overwrite_check(outdir, overwrite)+ #' get_chunk(cars.df, file.path(attr(cars.df, "path"), "1.fst"), full.names = TRUE)
|
|||
42 | +16 |
- + #'
|
||
43 | +17 |
- # Refer also to Dplyr arrange: https://github.com/tidyverse/dplyr/blob/master/src/arrange.cpp
+ #' # clean up cars.df
|
||
44 | -6x | +|||
18 | +
- q <- enquos(...)+ #' delete(cars.df)
|
|||
45 | -6x | +|||
19 | +
- is_sym <- sapply(q, rlang::quo_is_symbol)+ get_chunk <- function(...) { |
|||
46 | -6x | +20 | +1058x |
- arrange_codes <- sapply(q, rlang::as_label)+ UseMethod("get_chunk") |
47 | +21 |
- + }
|
||
48 | +22 |
- # Check if desc...
- |
- ||
49 | -6x | -
- is_desc <- substr(arrange_codes, 1, 5) == "desc("+ |
||
50 | +23 |
- + |
||
51 | +24 |
- # If expr is a symbol from the data, just use it.
+ #' @rdname get_chunk
|
||
52 | +25 |
- # Otherwise need to evaluate ...
+ #' @importFrom fst read_fst
|
||
53 | +26 |
- # (TODO - currently only support variables and desc in the data)
+ #' @export
|
||
54 | +27 |
- # Peels off "desc" from the original
+ get_chunk.disk.frame <- function(df, n, keep = NULL, full.names = FALSE, ...) { |
||
55 | -6x | +28 | +1058x |
- vars <- sub(")", "", sub("desc(", "", arrange_codes, fixed=TRUE), fixed=TRUE)+ stopifnot("disk.frame" %in% class(df)) |
56 | +29 |
-
+ |
||
57 | -6x | +30 | +1058x |
- desc_vars <- vars[is_desc]+ keep_chunks = attr(df, "keep_chunks", exact=TRUE) |
58 | +31 |
|
||
59 | -6x | +|||
32 | +
- if(!all(vars %in% colnames(df))){+ # TODO relax this
|
|||
60 | -! | +|||
33 | +
- stop(paste0("Expressions currently not supported. Columns not found in colnames:", vars[!vars %in% colnames(df)]))+ # if(!is.null(keep_chunks)) {
|
|||
61 | +34 |
- }
+ # # browser()
|
||
62 | +35 |
- + # # n_int = as.integer(n)
|
||
63 | +36 |
- # Hard group by in a partially sorted way at the chunk level and then arrange within chunks
+ # #
|
||
64 | -6x | +|||
37 | +
- df %>%+ # # if(is.na(n_int)) {
|
|||
65 | -6x | +|||
38 | +
- disk.frame::hard_group_by(vars, outdir=outdir, nchunks=nchunks, overwrite=overwrite, shardby_function="sort", desc_vars=desc_vars) %>%+ # # if(as.character(n) %in% get_chunk_ids(df)[keep_chunks]) {
|
|||
66 | -6x | +|||
39 | +
- chunk_arrange(...)+ # # return(NULL)
|
|||
67 | +40 |
- }
+ # # } else if(normalizePath(as.character(n)) %in% sapply(get_chunk_ids(df, full.names = TRUE)[keep_chunks],normalizePath)) {
|
1 | +41 |
- #' Print disk.frame
+ # # return(NULL)
|
|
2 | +42 |
- #' @description
+ # # }
|
|
3 | +43 |
- #' a new print method for disk.frame
+ # # } else {
|
|
4 | +44 |
- #' @export
+ # # if(!n %in% keep_chunk) {
|
|
5 | +45 |
- #' @param x disk.frame
+ # # return(NULL)
|
|
6 | +46 |
- #' @param ... not used
+ # # }
|
|
7 | +47 |
- #' @importFrom glue glue
+ # # }
|
|
8 | +48 |
- # TODO add chunk
+ # }
|
|
9 | +49 |
- print.disk.frame <- function(x, ...) {+ |
|
10 | -! | +||
50 | +
- a = paste(sep = "\n"+ |
||
11 | -! | +||
51 | +1058x |
- ,glue::glue("path: \"{attr(x,'path')}\"")+ path = attr(df,"path", exact=TRUE) |
|
12 | -! | +||
52 | +1058x |
- ,glue::glue("nchunks: {disk.frame::nchunks(x)}")+ keep1 = attr(df,"keep", exact=TRUE) |
|
13 | -! | +||
53 | +
- ,glue::glue("nrow (at source): {disk.frame::nrow(x)}")+ |
||
14 | -! | +||
54 | +1058x |
- ,glue::glue("ncol (at source): {disk.frame::ncol(x)}")+ cmds = attr(df,"lazyfn", exact=TRUE) |
|
15 | -! | +||
55 | +1058x |
- ,glue::glue("nrow (post operations): ???")+ filename = ""+ |
+ |
56 | ++ |
+ + |
+ |
57 | +1058x | +
+ if (typeof(keep) == "closure") { |
|
16 | +58 | ! |
- ,glue::glue("ncol (post operations): ???\n")+ keep = keep1 |
17 | -+ | ||
59 | +1058x |
- )
+ } else if(!is.null(keep1) & !is.null(keep)) { |
|
18 | +60 | ! |
- message(a)- |
-
19 | -- |
- }
+ keep = intersect(keep1, keep) |
1 | -+ | |||
61 | +! |
- #' Number of rows or columns
+ if (!all(keep %in% keep1)) { |
||
2 | -+ | |||
62 | +! |
- #' @param ... passed to base::nrow
+ warning("some of the variables specified in keep = {keep} is not available") |
||
3 | +63 |
- #' @export
+ }
|
||
4 | -+ | |||
64 | +1058x |
- #' @rdname ncol_nrow
+ } else if(is.null(keep)) { |
||
5 | -+ | |||
65 | +1058x |
- #' @examples
+ keep = keep1 |
||
6 | +66 |
- #' cars.df = as.disk.frame(cars)
+ }
|
||
7 | +67 |
- #'
+ |
||
8 | -+ | |||
68 | +1058x |
- #' # return total number of column and rows
+ if(is.numeric(n)) { |
||
9 | -+ | |||
69 | +23x |
- #' ncol(cars.df)
+ filename = file.path(path, paste0(n,".fst")) |
||
10 | -+ | |||
70 | +23x |
- #' nrow(cars.df)
+ if(!file.exists(filename)) { |
||
11 | -+ | |||
71 | +! |
- #'
+ filename = list.files(path, full.names = TRUE)[n] |
||
12 | +72 |
- #' # clean up cars.df
+ }
|
||
13 | +73 |
- #' delete(cars.df)
+ } else { |
||
14 | -+ | |||
74 | +1035x |
- nrow <- function(df,...) {+ if (full.names) { |
||
15 | -375x | +75 | +981x |
- UseMethod("nrow")+ filename = n |
16 | +76 |
- }
+ } else { |
||
17 | -+ | |||
77 | +54x |
-
+ filename = file.path(path, n) |
||
18 | +78 |
- #' @rdname ncol_nrow
+ }
|
||
19 | +79 |
- #' @export
+ }
|
||
20 | +80 |
- #' @noRd
+ |
||
21 | +81 |
- nrow.default <- function(df, ...) {+ # if the file you are looking for don't exist
|
||
22 | -348x | +82 | +1058x |
- base::nrow(df, ...)+ if (!fs::file_exists(filename)) { |
23 | -+ | |||
83 | +! |
- }
+ warning(glue("The chunk {filename} does not exist; returning an empty data.table")) |
||
24 | -+ | |||
84 | +! |
-
+ notbl <- data.table() |
||
25 | -+ | |||
85 | +! |
- #' @export
+ attr(notbl, "does not exist") <- TRUE |
||
26 | -+ | |||
86 | +! |
- #' @rdname ncol_nrow
+ return(notbl) |
||
27 | +87 |
- #' @import fst
+ }
|
||
28 | +88 |
- nrow.disk.frame <- function(df, ...) {- |
- ||
29 | -27x | -
- stopifnot(is_ready(df))+ |
||
30 | -27x | +89 | +1058x |
- path1 <- attr(df,"path")+ if (is.null(cmds)) { |
31 | -27x | +90 | +661x |
- if(dir.exists(path1)) {+ if(typeof(keep)!="closure") { |
32 | -27x | +91 | +661x |
- path2 <- list.files(path1,full.names = TRUE)+ fst::read_fst(filename, columns = keep, as.data.table = TRUE,...) |
33 | -27x | +|||
92 | +
- if(length(path2) == 0) {+ } else { |
|||
34 | +93 | ! |
- return(0)+ fst::read_fst(filename, as.data.table = TRUE,...) |
|
35 | +94 |
}
|
||
36 | -27x | +|||
95 | +
- tmpfstmeta = fst::fst.metadata(path2[1])+ } else { |
|||
37 | -27x | +96 | +397x |
- if("nrOfRows" %in% names(tmpfstmeta)) {+ if(typeof(keep)!="closure") { |
38 | -27x | +97 | +397x |
- return(sum(sapply(path2, function(p2) fst::fst.metadata(p2)$nrOfRows)))+ play(fst::read_fst(filename, columns = keep, as.data.table = TRUE,...), cmds) |
39 | +98 |
} else { |
||
40 | +99 | ! |
- return(sum(sapply(path2, function(p2) fst::fst.metadata(p2)$NrOfRows)))+ play(fst::read_fst(filename, as.data.table = TRUE,...), cmds) |
|
41 | +100 |
}
|
||
42 | +101 |
- } else {+ }
|
||
43 | +102 |
- #return(fst::fst.metadata(path1)$NrOfRows)
+ }
|
||
44 | -! | +
1 | +
- stop(glue::glue("nrow error: directory {} does not exist"))+ #' [ interface for disk.frame using fst backend
|
|||
45 | +2 |
- }
+ #' @param df a disk.frame
|
||
46 | +3 |
- }
+ #' @param ... same as data.table
|
||
47 | +4 |
-
+ #' @param keep the columns to srckeep
|
||
48 | +5 |
- #' @import fst
+ #' @param rbind Whether to rbind the chunks. Defaults to TRUE
|
||
49 | +6 |
- #' @export
+ #' @param use.names Same as in data.table::rbindlist
|
||
50 | +7 |
- #' @rdname ncol_nrow
+ #' @param fill Same as in data.table::rbindlist
|
||
51 | +8 |
- ncol <- function(df) {+ #' @param idcol Same as in data.table::rbindlist
|
||
52 | -171x | +|||
9 | +
- UseMethod("ncol")+ #' @import fst
|
|||
53 | +10 |
- }
+ #' @importFrom future.apply future_lapply
|
||
54 | +11 |
-
+ #' @importFrom data.table rbindlist
|
||
55 | +12 |
- #' @import fs
+ #' @importFrom globals findGlobals
|
||
56 | +13 |
#' @export
|
||
57 | +14 |
- #' @param df a disk.frame
+ #' @examples
|
||
58 | +15 |
- #' @rdname ncol_nrow
+ #' cars.df = as.disk.frame(cars)
|
||
59 | +16 |
- ncol.disk.frame <- function(df) {+ #' speed_limit = 50
|
||
60 | -11x | +|||
17 | +
- length(colnames(df))+ #' cars.df[speed < speed_limit ,.N, cut(dist, pretty(dist))]
|
|||
61 | +18 |
- }
+ #'
|
||
62 | +19 |
-
+ #' # clean up
|
||
63 | +20 |
- #' @export
+ #' delete(cars.df)
|
||
64 | +21 |
- ncol.default <- function(df) {+ `[.disk.frame` <- function(df, ..., keep = NULL, rbind = TRUE, use.names = TRUE, fill = FALSE, idcol = NULL) { |
||
65 | -160x | +22 | +3x |
- base::ncol(df)+ keep_for_future = keep |
66 | +23 |
- }
+ |
1 | -+ | ||
24 | +3x |
- #' Add a chunk to the disk.frame
+ dotdotdot = substitute(...()) #this is an alist |
|
2 | +25 |
- #'
+ |
|
3 | -+ | ||
26 | +3x |
- #' If no chunk_id is specified, then the chunk is added at the end as the
+ ag = globals::findGlobals(dotdotdot) |
|
4 | -+ | ||
27 | +3x |
- #' largest numbered file, "n.fst".
+ ag = setdiff(ag, "") # "" can cause issues with future |
|
5 | +28 |
- #'
+ |
|
6 | -+ | ||
29 | +3x |
- #' @details The function is the preferred way to add a chunk to a disk.frame. It
+ res = future.apply::future_lapply(get_chunk_ids(df, strip_extension = FALSE), function(chunk_id) { |
|
7 | +30 |
- #' performs checks on the types to make sure that the new chunk doesn't have
+ #lapply(get_chunk_ids(df, strip_extension = FALSE), function(chunk_id) {
|
|
8 | -+ | ||
31 | +18x |
- #' different types to the disk.frame.
+ chunk = get_chunk(df, chunk_id, keep = keep_for_future) |
|
9 | -+ | ||
32 | +18x |
- #'
+ data.table::setDT(chunk) |
|
10 | -+ | ||
33 | +18x |
- #' @param df the disk.frame to add a chunk to
+ expr <- quote(chunk) |
|
11 | -+ | ||
34 | +18x |
- #' @param chunk a data.frame to be added as a chunk
+ expr <- c(expr, dotdotdot) |
|
12 | -+ | ||
35 | +18x |
- #' @param chunk_id a numeric number indicating the id of the chunk. If NULL it
+ res <- do.call(`[`, expr) |
|
13 | -+ | ||
36 | +18x |
- #' will be set to the largest chunk_id + 1
+ res
|
|
14 | -+ | ||
37 | +3x |
- #' @param full.names whether the chunk_id name match should be to the full file
+ }, future.globals = c("df", "keep_for_future", "dotdotdot", ag), future.packages = c("data.table","disk.frame") |
|
15 | +38 |
- #' path not just the file name
+ )
|
|
16 | +39 |
- #' @importFrom data.table data.table
+ |
|
17 | -+ | ||
40 | +3x |
- #' @importFrom utils capture.output
+ if(rbind & all(sapply(res, function(x) "data.frame" %in% class(x)))) { |
|
18 | -+ | ||
41 | +3x |
- #' @export
+ rbindlist(res, use.names = use.names, fill = fill, idcol = idcol)+ |
+ |
42 | +! | +
+ } else if(rbind) {+ |
+ |
43 | +! | +
+ unlist(res) |
|
19 | +44 |
- #' @return disk.frame
+ } else { |
|
20 | -+ | ||
45 | +! |
- #' @examples
+ res
|
|
21 | +46 |
- #' # create a disk.frame
+ }
|
|
22 | +47 |
- #' df_path = file.path(tempdir(), "tmp_add_chunk")
+ }
|
|
23 | +48 |
- #' diskf = disk.frame(df_path)
+
|
|
24 | +49 |
- #'
+ # Solutions from https://stackoverflow.com/questions/57122960/how-to-use-non-standard-evaluation-nse-to-evaluate-arguments-on-data-table?answertab=active#tab-top
|
|
25 | +50 |
- #' # add a chunk to diskf
+ # `[.dd` <- function(x, ...) {
|
|
26 | +51 |
- #' add_chunk(diskf, cars)
+ # code <- rlang::enexprs(...)
|
|
27 | +52 |
- #' add_chunk(diskf, cars)
+ # lapply(x, function(dt) {
|
|
28 | +53 |
- #'
+ # ex <- rlang::expr(dt[!!!code])
|
|
29 | +54 |
- #' nchunks(diskf) # 2
+ # rlang::eval_tidy(ex)
|
|
30 | +55 |
- #'
+ # })
|
|
31 | +56 |
- #' df2 = disk.frame(file.path(tempdir(), "tmp_add_chunk2"))
+ # }
|
|
32 | +57 |
- #'
+ #
|
|
33 | +58 |
- #' # add chunks by specifying the chunk_id number; this is especially useful if
+ #
|
|
34 | +59 |
- #' # you wish to add multiple chunk in parralel
+ # `[.dd` <- function(x,...) {
|
|
35 | +60 |
- #'
+ # a <- substitute(...()) #this is an alist
|
|
36 | +61 |
- #' add_chunk(df2, data.frame(chunk=1), 1)
+ # expr <- quote(x[[i]])
|
|
37 | +62 |
- #' add_chunk(df2, data.frame(chunk=2), 3)
+ # expr <- c(expr, a)
|
|
38 | +63 |
- #'
+ # res <- list()
|
|
39 | +64 |
- #' nchunks(df2) # 2
+ # for (i in seq_along(x)) {
|
|
40 | +65 |
- #'
+ # res[[i]] <- do.call(`[`, expr)
|
|
41 | +66 |
- #' dir(attr(df2, "path"))
+ # }
|
|
42 | +67 |
- #' # [1] "1.fst" "3.fst"
+ # res
|
|
43 | +68 |
- #'
+ # }
|
44 | +1 |
- #' # clean up
- #' @details The function is the preferred way to add a chunk to a disk.frame. It
- #' performs checks on the types to make sure that the new chunk doesn't have
- #' different types to the disk.frame.
- #'
- #' @param df the disk.frame to add a chunk to
- #' @param chunk a data.frame to be added as a chunk
- #' @param chunk_id a numeric number indicating the id of the chunk. If NULL it
- #' will be set to the largest chunk_id + 1
- #' @param full.names whether the chunk_id name match should be to the full file
- #' path not just the file name
- #' @importFrom data.table data.table
- #' @importFrom utils capture.output
- #' @export
- #' @return disk.frame
- #' @examples
- #' # create a disk.frame
- #' df_path = file.path(tempdir(), "tmp_add_chunk")
- #' diskf = disk.frame(df_path)
- #'
- #' # add a chunk to diskf
- #' add_chunk(diskf, cars)
- #' add_chunk(diskf, cars)
- #'
- #' nchunks(diskf) # 2
- #'
- #' df2 = disk.frame(file.path(tempdir(), "tmp_add_chunk2"))
- #'
- #' # add chunks by specifying the chunk_id number; this is especially useful if
- #' # you wish to add multiple chunks in parallel
- #'
- #' add_chunk(df2, data.frame(chunk=1), 1)
- #' add_chunk(df2, data.frame(chunk=2), 3)
- #'
- #' nchunks(df2) # 2
- #'
- #' dir(attr(df2, "path"))
- #' # [1] "1.fst" "3.fst"
- #'
- #' # clean up
- #' delete(diskf)
- #' delete(df2)
- add_chunk <- function(df, chunk, chunk_id = NULL, full.names = FALSE) {
-   # sometimes chunk_id is defined in terms of itself
-   force(chunk_id)
-   stopifnot("disk.frame" %in% class(df))
-   if(!is_disk.frame(df)) {
-     stop("can not add_chunk as this is not a disk.frame")
-   }
-
-   # get the metadata for all chunks
-   path = attr(df,"path")
-   files <- fs::dir_ls(path, type="file", glob = "*.fst")
-
-   # if a chunk_id is not specified
-   if(is.null(chunk_id)) {
-     chunk_id = 1 + max(purrr::map_int(files, ~{
-       s = stringr::str_extract(.x,"[:digit:]+\\.fst")
-       as.integer(substr(s, 1, nchar(s) - 4))
-     }), nchunks(df))
-   }
-
-   if(length(files) > 0) {
-     filename = ""
-     if(is.numeric(chunk_id)) {
-       filename = file.path(path,glue::glue("{as.integer(chunk_id)}.fst"))
-     } else { # if the chunk_id is not numeric
-       if (full.names) {
-         filename = file.path(path, chunk_id)
-       } else {
-         filename = chunk_id
-       }
-     }
-
-     if(fs::file_exists(filename)) {
-       stop(glue::glue("failed to add_chunk as chunk_id = {chunk_id} already exist"))
-     }
-
-     metas = purrr::map(files, fst::metadata_fst)
-
-     types <- c("unknown", "character", "factor", "ordered factor",
-       "integer", "POSIXct", "difftime", "IDate", "ITime", "double",
-       "Date", "POSIXct", "difftime", "ITime", "logical", "integer64",
-       "nanotime", "raw")
-
-     # need to ensure that all column names and types match
-     metas_df = purrr::imap_dfr(metas,
-       ~data.table::data.table(
-         colnames = .x$columnNames,
-         coltypes = types[.x$columnTypes],
-         chunk_id = .y))
-
-     metas_df_summ = metas_df[,.N,.(colnames, coltypes)][order(N)]
-     metas_df_summ[,existing_df := TRUE]
-
-     new_chunk_meta =
-       data.table::data.table(
-         colnames = names(chunk),
-         coltypes = purrr::map(chunk, typeof) %>% unlist,
-         new_chunk = TRUE)
-
-     merged_meta = full_join(new_chunk_meta, metas_df_summ, by=c("colnames"))
-     data.table::setDT(merged_meta)
-
-     # find out which vars are matched
-     check_vars = full_join(
-       new_chunk_meta[,.(colnames, new_chunk)],
-       metas_df[,.(colnames=unique(colnames), existing_df = TRUE)], by = "colnames")
-
-     data.table::setDT(check_vars)
-     if(nrow(check_vars[is.na(new_chunk)]) > 0) {
-       warning(
-         glue::glue(
-           "these variables are in the disk.frame but not in the new chunk: \n {paste0(check_vars[is.na(new_chunk), colnames], collapse=',\n ')}"))
-     }
-     if(nrow(check_vars[is.na(existing_df)]) > 0){
-       warning(glue::glue("these variables are in the new chunk but not in the existing disk.frame: {paste0(check_vars[is.na(existing_df), colnames], collapse=', ')}"))
-     }
-
-     # find out which vars are matched but the types don't match
-     metas_df_summ1 = merged_meta[existing_df == TRUE & new_chunk == TRUE & coltypes.x != coltypes.y]
-     # find incompatible types
-     metas_df_summ1[, incompatible_types := {
-       coltypes.x %in% c("integer", "double", "Date") & coltypes.y == "character" |
-       coltypes.x == "character" & coltypes.y %in% c("integer", "double", "Date") |
-       coltypes.x %in% c("integer", "double") & coltypes.y == "Date" |
-       coltypes.x == "Date" & coltypes.y %in% c("integer", "double")
-     }]
-
-     metas_df_summ2 = metas_df_summ1[incompatible_types == TRUE,]
-
-     if(nrow(metas_df_summ2)>0) {
-       message("the below types are incompatible between the new chunk and the disk.frame; this chunk can not be added\n")
-       message(paste0(utils::capture.output(metas_df_summ2), collapse = "\n"))
-       stop("")
-     }
-   }
-
-   fst::write_fst(chunk, file.path(attr(df,"path"), paste0(chunk_id,".fst")))
-   disk.frame(attr(df,"path"))
- }
+ #' Returns the number of chunks in a disk.frame
+ #' @param df a disk.frame
+ #' @param skip.ready.check NOT implemented
+ #' @param ... not used
+ #' @export
+ #' @examples
+ #' cars.df = as.disk.frame(cars)
+ #'
+ #' # return the number of chunks
+ #' nchunks(cars.df)
+ #' nchunk(cars.df)
+ #'
+ #' # clean up cars.df
+ #' delete(cars.df)
+ nchunks <- function(df, ...) {
+   UseMethod("nchunks")
+ }
+
+ #' @rdname nchunks
+ #' @export
+ nchunk <- function(df, ...) {
+   UseMethod("nchunk")
+ }
+
+ #' @rdname nchunks
+ #' @export
+ nchunk.disk.frame <- function(df, ...) {
+   nchunks.disk.frame(df, ...)
+ }
+
+ #' @importFrom fs dir_ls
+ #' @rdname nchunks
+ #' @export
+ nchunks.disk.frame <- function(df, skip.ready.check = FALSE, ...) {
+   #if(!skip.ready.check) stopifnot(is_ready(df))
+   fpath <- attr(df,"path", exact=TRUE)
+   if(is.dir.disk.frame(df)) {
+     return(length(fs::dir_ls(fpath, type="file")))
+   } else {
+     return(1)
+   }
+ }
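A quick usage sketch for the chunk-counting helpers above (`nchunk()` simply dispatches to `nchunks()`, which counts the fst files under the disk.frame's path):

library(disk.frame)
cars.df = as.disk.frame(cars, nchunks = 2)
nchunks(cars.df)  # 2
nchunk(cars.df)   # 2, alias of nchunks()
# same count, computed by hand from the backing directory
length(fs::dir_ls(attr(cars.df, "path"), type = "file"))
delete(cars.df)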
+ #' Performs join/merge for disk.frames
+ #' @rdname join
+ #' @export
+ #' @examples
+ #' cars.df = as.disk.frame(cars)
+ #'
+ #' join.df = left_join(cars.df, cars.df)
+ #'
+ #' # clean up cars.df
+ #' delete(cars.df)
+ #' delete(join.df)
+ left_join.disk.frame <- function(x, y, by=NULL, copy=FALSE, ..., outdir = tempfile("tmp_disk_frame_left_join"), merge_by_chunk_id = FALSE, overwrite = TRUE, .progress = FALSE) {
+   stopifnot("disk.frame" %in% class(x))
+
+   overwrite_check(outdir, overwrite)
+
+   if("data.frame" %in% class(y)) {
+     # note that x is named .data in the lazy evaluation
+     quo_dotdotdot = enquos(...)
+     cmap_dfr(x, ~{
+       code = quo(left_join(.x, y, by = by, copy = copy, !!!quo_dotdotdot))
+       rlang::eval_tidy(code)
+     }, .progress = .progress)
+   } else if("disk.frame" %in% class(y)) {
+     if(is.null(merge_by_chunk_id)) {
+       stop("both x and y are disk.frames. You need to specify merge_by_chunk_id = TRUE or FALSE explicitly")
+     }
+     if(is.null(by)) {
+       by <- intersect(names(x), names(y))
+     }
+
+     ncx = nchunks(x)
+     ncy = nchunks(y)
+     if (merge_by_chunk_id == FALSE) {
+       warning("merge_by_chunk_id = FALSE. This will take significantly longer and the preparations needed are performed eagerly which may lead to poor performance. Consider making y a data.frame or set merge_by_chunk_id = TRUE for better performance.")
+       x = hard_group_by(x, by, nchunks = max(ncy,ncx), overwrite = TRUE)
+       y = hard_group_by(y, by, nchunks = max(ncy,ncx), overwrite = TRUE)
+       return(left_join.disk.frame(x, y, by, copy = copy, outdir = outdir, merge_by_chunk_id = TRUE, overwrite = overwrite, .progress = .progress))
+     } else if(merge_by_chunk_id == TRUE) {
+       #} else if ((identical(shardkey(x)$shardkey, "") & identical(shardkey(y)$shardkey, "")) | identical(shardkey(x), shardkey(y))) {
+       dotdotdot = list(...)
+       res = cmap2.disk.frame(x, y, ~{
+         if(is.null(.y)) {
+           return(.x)
+         } else if (is.null(.x)) {
+           return(data.table())
+         }
+         llj = purrr::lift(dplyr::left_join)
+         #left_join(.x, .y, by = by, copy = copy, ...)
+         llj(c(list(x=.x, y =.y, by = by, copy = copy), dotdotdot))
+       }, outdir = outdir)
+       return(res)
+     } else {
+       # TODO if the shardkey are the same and only the shardchunks are different then just shard again on one of them is fine
+       stop("merge_by_chunk_id is TRUE but shardkey(x) does NOT equal to shardkey(y). You may want to perform a hard_group_by() on both x and/or y or set merge_by_chunk_id = FALSE")
+     }
+   }
+ }
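A usage sketch for the two code paths above (the `speed_lookup` data.frame is an illustrative name, not from the package). A data.frame `y` is joined against each chunk and the result comes back as one data.frame; a disk.frame `y` with `merge_by_chunk_id = TRUE` pairs chunks by chunk id and returns a disk.frame:

library(disk.frame)
library(dplyr)
cars.df = as.disk.frame(cars, nchunks = 2)
speed_lookup = data.frame(speed = unique(cars$speed),
                          speed_kmh = unique(cars$speed) * 1.609)

# y is a data.frame: each chunk joined in turn, result is a single data.frame
res = left_join(cars.df, speed_lookup, by = "speed")
head(res)

# y is a disk.frame: chunks paired by chunk id, result is a disk.frame
joined.df = left_join(cars.df, cars.df, by = c("speed", "dist"),
                      merge_by_chunk_id = TRUE)
nrow(joined.df)
delete(cars.df); delete(joined.df)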
@@ -25920,850 +29536,829 @@ disk.frame coverage - 51.96%
- #' Bring the disk.frame into R
- #'
- #' Bring the disk.frame into RAM by loading the data and running all lazy
- #' operations as data.table/data.frame or as a list
- #' @param x a disk.frame
- #' @param parallel if TRUE the collection is performed in parallel. By default
- #' if there are delayed/lazy steps then it will be parallel, otherwise it will
- #' not be in parallel. This is because parallel requires transferring data
- #' from background R session to the current R session and if there is no
- #' computation then it's better to avoid transferring data between session,
- #' hence parallel = FALSE is a better choice
- #' @param ... not used
- #' @importFrom data.table data.table as.data.table
- #' @importFrom furrr future_map_dfr future_options
- #' @importFrom purrr map_dfr
- #' @importFrom dplyr collect select mutate
- #' @return collect return a data.frame/data.table
- #' @examples
- #' cars.df = as.disk.frame(cars)
- #' # use collect to bring the data into RAM as a data.table/data.frame
- #' collect(cars.df)
- #'
- #' # clean up
- #' delete(cars.df)
- #' @export
- #' @rdname collect
- collect.disk.frame <- function(x, ..., parallel = !is.null(attr(x,"lazyfn"))) {
-   cids = get_chunk_ids(x, full.names = TRUE, strip_extension = FALSE)
-   #cids = as.integer(get_chunk_ids(x))
-   if(nchunks(x) > 0) {
-     if(parallel) {
-       #furrr::future_map_dfr(cids, ~get_chunk(x, .x, full.names = TRUE), .options = furrr::future_options(packages = "disk.frame"))
-       #furrr::future_map_dfr(cids, ~disk.frame::get_chunk(x, .x, full.names = TRUE))
-       furrr::future_map_dfr(cids, ~get_chunk(x, .x, full.names = TRUE))
-       #purrr::map_dfr(cids, ~get_chunk(x, .x, full.names = TRUE))
-       #future.apply::future_lapply(chunk_ids, function(.x) disk.frame::get_chunk(x, .x))
-       #lapply(chunk_ids, function(chunk) get_chunk(x, chunk)) %>% rbindlist
-     } else {
-       purrr::map_dfr(cids, ~get_chunk(x, .x, full.names = TRUE))
-     }
-   } else {
-     data.table()
-   }
- }
-
- #' @param simplify Should the result be simplified to array
- #' @export
- #' @rdname collect
- #' @return collect_list returns a list
- #' @examples
- #' cars.df = as.disk.frame(cars)
- #'
- #' # returns the result as a list
- #' collect_list(map(cars.df, ~1))
- #'
- #' # clean up
- #' delete(cars.df)
- collect_list <- function(x, simplify = FALSE, parallel = !is.null(attr(x,"lazyfn"))) {
-   cids = get_chunk_ids(x, full.names = TRUE, strip_extension = FALSE)
-
-   if(nchunks(x) > 0) {
-     res <- NULL
-     if (parallel) {
-       #res = furrr::future_map(1:nchunks(x), ~get_chunk(x, .x))
-       res = future.apply::future_lapply(cids, function(.x) {
-         get_chunk(x, .x, full.names = TRUE)
-       })
-     } else {
-       res = purrr::map(cids, ~get_chunk(x, .x, full.names = TRUE))
-     }
-     if (simplify) {
-       return(simplify2array(res))
-     } else {
-       return(res)
-     }
-   } else {
-     list()
-   }
- }
+ #' Merge function for disk.frames
+ #' @export
+ #' @param x a disk.frame
+ #' @param y a disk.frame or data.frame
+ #' @param by the merge by keys
+ #' @param outdir The output directory for the disk.frame
+ #' @param merge_by_chunk_id if TRUE then only chunks in df1 and df2 with the same chunk id will get merged
+ #' @param overwrite overwrite the outdir or not
+ #' @param ... passed to merge and cmap.disk.frame
+ #' @importFrom data.table data.table setDT
+ #' @examples
+ #' b = as.disk.frame(data.frame(a = 51:150, b = 1:100))
+ #' d = as.disk.frame(data.frame(a = 151:250, b = 1:100))
+ #' bd.df = merge(b, d, by = "b", merge_by_chunk_id = TRUE)
+ #'
+ #' # clean up
+ #' delete(b)
+ #' delete(d)
+ #' delete(bd.df)
+ merge.disk.frame <- function(x, y, by, outdir = tempfile(fileext = ".df"), ..., merge_by_chunk_id = FALSE, overwrite = FALSE) {
+   stopifnot("disk.frame" %in% class(x))
+   overwrite_check(outdir, overwrite = TRUE)
+   #fs::dir_create(outdir)
+
+   if("data.frame" %in% class(y)) {
+     yby = c(list(y=y, by=by), list(...))
+     res = cmap(x, ~{
+       res = do.call(merge, c(list(x = .x), yby))
+       res
+     }, outdir=outdir, ...)
+     res
+   } else if (merge_by_chunk_id | shardkey_equal(shardkey(x), shardkey(y))) {
+     # if the shardkeys are the same then only need to match by segment id
+     # as accounts with the same shardkey must end up in the same segment
+     path1 = attr(x,"path", exact=TRUE)
+     path2 = attr(y,"path", exact=TRUE)
+
+     df3 = merge(
+       data.table(
+         chunk_id = list.files(path1),
+         pathA = list.files(path1,full.names = TRUE)
+       ),
+       data.table(
+         chunk_id = list.files(path2),
+         pathB = list.files(path2,full.names = TRUE)
+       )
+     )
+     setDT(df3)
+     df3[,{
+       data1 = read_fst(pathA,as.data.table = TRUE)
+       data2 = read_fst(pathB,as.data.table = TRUE)
+       data3 = force(merge(data1, data2, by = by))
+       rm(data1); rm(data2); gc()
+       write_fst(data3, glue("{outdir}/{.BY}"))
+       NULL
+     }, chunk_id]
+     return(disk.frame(outdir))
+   } else {
+     stop("Cartesian joins are currently not implemented. Either make y a data.frame or set merge_by_chunk_id to TRUE")
+
+     # have to make every possible combination
+     # path1 = attr(df1,"path")
+     # path2 = attr(df2,"path")
+     #
+     # df3 = merge(
+     #   data.table(
+     #     justmerge = TRUE,
+     #     chunk_id1 = list.files(path1),
+     #     pathA = list.files(path1,full.names = TRUE)
+     #   ),
+     #   data.table(
+     #     justmerge = TRUE,
+     #     chunk_id2 = list.files(path2),
+     #     pathB = list.files(path2,full.names = TRUE)
+     #   ),
+     #   by = "justmerge",
+     #   all=TRUE,
+     #   allow.cartesian = TRUE
+     # )
+     #
+     # setDT(df3)
+     # i <- 0
+     # mapply(function(pathA, pathB) {
+     #   stop("error")
+     #   data1 = read_fst(pathA,as.data.table = TRUE, columns = c("ACCOUNT_ID","MONTH_KEY"))
+     #   data2 = read_fst(pathB,as.data.table = TRUE, columns = c("ACCOUNT_ID","MONTH_KEY"))
+     #   data3 = merge(data1, data2, ...)
+     #   rm(data1); rm(data2); gc()
+     #   if(nrow(data3) > 0) {
+     #     i <<- i + 1
+     #     write_fst(data3, glue("{outdir}/{i}.fst"))
+     #   }
+     #   NULL
+     # },df3$pathA, df3$pathB)
+     # return(disk.frame(outdir))
+   }
+ }
+ #' Print disk.frame
+ #' @description
+ #' a new print method for disk.frame
+ #' @export
+ #' @param x disk.frame
+ #' @param ... not used
+ #' @importFrom glue glue
+ # TODO add chunk
+ print.disk.frame <- function(x, ...) {
+   a = paste(sep = "\n"
+     ,glue::glue("path: \"{attr(x,'path', exact=TRUE)}\"")
+     ,glue::glue("nchunks: {disk.frame::nchunks(x)}")
+     ,glue::glue("nrow (at source): {disk.frame::nrow(x)}")
+     ,glue::glue("ncol (at source): {disk.frame::ncol(x)}")
+     ,glue::glue("nrow (post operations): ???")
+     ,glue::glue("ncol (post operations): ???\n")
+   )
+   message(a)
+ }
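A sketch of the `merge.disk.frame` data.frame path above (the `lookup` name is illustrative): base `merge` is applied to every chunk and the result is written to `outdir` as a new disk.frame.

library(disk.frame)
b = as.disk.frame(data.frame(a = 1:100, b = 1:100), nchunks = 2)
lookup = data.frame(b = 1:100, label = sample(letters, 100, TRUE))
m = merge(b, lookup, by = "b")  # chunk-wise base::merge, returns a disk.frame
head(collect(m))
delete(b); delete(m)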
@@ -26772,98 +30367,98 @@ disk.frame coverage - 51.96%
- #' `map` a function to two disk.frames
- #' @description
- #' Perform a function on both disk.frames .x and .y, each chunk of .x and .y
- #' gets run by .f(x.chunk, y.chunk)
- #' @param .x a disk.frame
- #' @param .y a disk.frame
- #' @param .f a function to be called on each chunk of x and y matched by
- #' chunk_id
- #' @param ... not used
- #' @param outdir output directory
- #' @import stringr fst
- #' @importFrom purrr as_mapper map2
- #' @importFrom data.table data.table
- #' @examples
- #' cars.df = as.disk.frame(cars)
- #'
- #' cars2.df = map2(cars.df, cars.df, ~data.table::rbindlist(list(.x, .y)))
- #' collect(cars2.df)
- #'
- #' # clean up cars.df
- #' delete(cars.df)
- #' delete(cars2.df)
- map2 <- function(.x, .y, .f, ...){
-   UseMethod("map2")
- }
-
- #' @export
- map2.default <- function(.x, .y, .f, ...) {
-   purrr::map2(.x,.y,.f,...)
- }
-
- #' @export
- #' @importFrom assertthat assert_that
- #' @importFrom pryr do_call
- map2.disk.frame <- function(.x, .y, .f, ..., outdir = tempfile(fileext = ".df"), .progress = TRUE) {
-   assertthat::assert_that("disk.frame" %in% class(.x), msg = "running map2.disk.frame(.x,.y, ...): the .x argument must be a disk.frame")
-   .f = purrr::as_mapper(.f)
-
-   if("disk.frame" %in% class(.y)) {
-     fs::dir_create(outdir)
-
-     # get all the chunk ids
-     xc = data.table(cid = get_chunk_ids(.x))
-     xc[,xid:=get_chunk_ids(.x, full.names = TRUE)]
-     yc = data.table(cid = get_chunk_ids(.y))
-     yc[,yid:=get_chunk_ids(.y, full.names = TRUE)]
-
-     xyc = merge(xc, yc, by="cid", all = TRUE, allow.cartesian = TRUE)
-
-     ddd = list(...)
-     # apply the functions
-
-     #future.apply::future_mapply(function(xid, yid, outid) {
-     #mapply(function(xid, yid, outid) {
-     furrr::future_pmap(list(xyc$xid, xyc$yid, xyc$cid), function(xid, yid, outid) {
-       xch = disk.frame::get_chunk(.x, xid, full.names = TRUE)
-       ych = disk.frame::get_chunk(.y, yid, full.names = TRUE)
-       xych = .f(xch, ych)
-       if(base::nrow(xych) > 0) {
-         fst::write_fst(xych, file.path(outdir, paste0(outid,".fst")))
-       } else {
-         warning(glue::glue("one of the chunks, {xid}, is empty"))
-       }
-       NULL
-     }
-     #,xyc$xid, xyc$yid, xyc$cid # together with mapply
-     , .progress = .progress
-     )
-
-     return(disk.frame(outdir))
-   } else {
-     # if .y is not a disk.frame
-     warning("in map2(.x,.y,...) the .y is not a disk.frame, so returning a list instead of a disk.frame")
-
-     f_for_passing = force(.f)
-     ddd = list(...)
-     tmp_disk.frame = force(.x)
-     res = furrr::future_map2(get_chunk_ids(tmp_disk.frame, full.names = TRUE), .y, function(xs, ys) {
-       ddd = c(list(get_chunk(tmp_disk.frame, xs, full.names = TRUE), ys), ddd)
-
-       pryr::do_call(f_for_passing, ddd)
-     })
-
-     return(res)
-   }
- }
-
- #' @rdname map2
- map_by_chunk_id <- function(.x, .y, .f, ..., outdir) {
-   warning("map_by_chunk_id is deprecated. Use map2 instead")
-   map2.disk.frame(.x, .y, .f, ..., outdir = outdir)
- }
+ #' @export
+ #' @rdname cmap
+ map <- function(.x, .f, ...) {
+   UseMethod("map")
+ }
+
+ #' @export
+ #' @rdname cmap
+ map.disk.frame <- function(...) {
+   warning("map(df, ...) where df is a disk.frame has been deprecated. Please use cmap(df,...) instead")
+   cmap.disk.frame(...)
+ }
+
+ #' @export
+ #' @rdname cmap
+ map.default <- function(.x, .f, ...) {
+   purrr::map(.x, .f, ...)
+ }
+
+ #' @export
+ #' @rdname cmap
+ imap_dfr <- function(.x, .f, ..., .id = NULL) {
+   UseMethod("imap_dfr")
+ }
+
+ #' @export
+ #' @rdname cmap
+ imap_dfr.disk.frame <- function(...) {
+   warning("imap_dfr(df, ...) where df is disk.frame is deprecated. Please use cimap_dfr(df, ...) instead")
+   cimap_dfr.disk.frame(...)
+ }
+
+ #' @export
+ #' @rdname cmap
+ imap_dfr.default <- function(.x, .f, ..., .id = NULL) {
+   purrr::imap_dfr(.x, .f, ..., .id = .id)
+ }
+
+ #' @export
+ #' @rdname cmap
+ #' @examples
+ #' cars.df = as.disk.frame(cars)
+ #'
+ #' # .x is the chunk and .y is the ID as an integer
+ #'
+ #' # lazy = TRUE support is not available at the moment
+ #' cimap(cars.df, ~.x[, id := .y], lazy = FALSE)
+ #'
+ #' cimap_dfr(cars.df, ~.x[, id := .y])
+ #'
+ #' # clean up cars.df
+ #' delete(cars.df)
+ imap <- function(.x, .f, ...) {
+   UseMethod("imap")
+ }
+
+ imap.disk.frame <- function(...) {
+   warning("imap(df,..) where df is disk.frame is deprecated. Use cimap(df, ...) instead")
+   cimap.disk.frame(...)
+ }
+
+ #' @export
+ #' @rdname cmap
+ imap.default <- function(.x, .f, ...) {
+   purrr::imap(.x, .f, ...)
+ }
+
+ #' @rdname cmap
+ #' @param .id not used
+ #' @export
+ map_dfr.disk.frame <- function(...) {
+   warning("map_dfr(df, ...) where df is disk.frame is deprecated. Please use cmap_dfr instead")
+   cmap_dfr.disk.frame(...)
+ }
+
+ map_dfr <- function(.x, .f, ..., .id = NULL) {
+   UseMethod("map_dfr")
+ }
+
+ #' @export
+ #' @rdname cmap
+ map_dfr.default <- function(.x, .f, ..., .id = NULL) {
+   purrr::map_dfr(.x, .f, ..., .id = .id)
+ }
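The shims above forward the old purrr-style verbs to their chunk-wise replacements (map → cmap, imap_dfr → cimap_dfr, ...) with a deprecation warning. A sketch of the replacement spellings the warnings point to:

library(disk.frame)
cars.df = as.disk.frame(cars, nchunks = 2)
cmap_dfr(cars.df, ~data.frame(chunk_rows = nrow(.x)))            # one row per chunk
cimap_dfr(cars.df, ~data.frame(chunk = .y, rows = nrow(.x)))     # .y is the chunk id
map(1:3, ~.x * 2)  # non-disk.frame input still dispatches to purrr::map()
delete(cars.df)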
- #' A streaming function for speedglm
- #'
- #' Define a function that can be used to feed data into speedglm and biglm
- #'
- #' @param data a disk.frame
- #' @param verbose Whether to print the status of data loading. Default to FALSE
- #'
- #' @return return a function, fn, that can be used as the data argument in biglm::bigglm or speedglm::shglm
- #'
- #' @family Machine Learning (ML)
- #' @export
- #'
- #' @examples
- #' cars.df = as.disk.frame(cars)
- #' streamacq = make_glm_streaming_fn(cars.df, verbose = FALSE)
- #' m = biglm::bigglm(dist ~ speed, data = streamacq)
- #' summary(m)
- #' predict(m, get_chunk(cars.df, 1))
- #' predict(m, collect(cars.df, 1))
- make_glm_streaming_fn <- function(data, verbose = FALSE) {
-   i = 0
-
-   chunkids = disk.frame::get_chunk_ids(data, strip_extension = FALSE)
-   is = sample(length(chunkids), replace = FALSE)
-   verbose = verbose
-   nchunks_copy = length(chunkids)
-
-   function(reset = FALSE) {
-     if(reset) {
-       if(verbose) {
-         print("disk.frame stream has been reset; next read will be from beginning")
-       }
-
-       i <<- 0
-     } else {
-       i <<- i + 1
-
-       if (i > nchunks_copy) {
-         return(NULL)
-       }
-
-       if(verbose) {
-         print(glue::glue("streaming: {i}/{nchunks_copy}; chunk id: {chunkids[i]}"))
-       }
-       return(get_chunk(data, chunkids[i]))
-     }
-   }
- }
+ #' Perform a hard arrange
+ #' @description
+ #' A hard_arrange is a sort by that also reorganizes the chunks to ensure that
+ #' every unique grouping of `by` is in the same chunk. Or in other words, every
+ #' row that shares the same `by` value will end up in the same chunk.
+ #' @param df a disk.frame
+ #' @param ... grouping variables
+ #' @param outdir the output directory
+ #' @param nchunks The number of chunks in the output. Defaults = nchunks.disk.frame(df)
+ #' @param overwrite overwrite the output directory
+ #' @param add same as dplyr::arrange
+ #' @param .drop same as dplyr::arrange
+ #' @export
+ #' @examples
+ #' iris.df = as.disk.frame(iris, nchunks = 2)
+ #'
+ #' # arrange iris.df by Species and ensure rows with the same Species are in the same chunk
+ #' iris_hard.df = hard_arrange(iris.df, Species)
+ #'
+ #' get_chunk(iris_hard.df, 1)
+ #' get_chunk(iris_hard.df, 2)
+ #'
+ #' # clean up
+ #' delete(iris.df)
+ #' delete(iris_hard.df)
+ hard_arrange <- function(df, ..., add = FALSE, .drop = FALSE) {
+   UseMethod("hard_arrange")
+ }
+
+ #' @rdname hard_arrange
+ #' @export
+ #' @importFrom dplyr arrange
+ hard_arrange.data.frame <- function(df, ...) {
+   dplyr::arrange(df, ...)
+ }
+
+ #' @rdname hard_arrange
+ #' @importFrom purrr map
+ #' @export
+ hard_arrange.disk.frame <- function(df, ..., outdir=tempfile("tmp_disk_frame_hard_arrange"), nchunks = disk.frame::nchunks(df), overwrite = TRUE) {
+   overwrite_check(outdir, overwrite)
+
+   # Refer also to dplyr arrange: https://github.com/tidyverse/dplyr/blob/master/src/arrange.cpp
+   q <- enquos(...)
+   is_sym <- sapply(q, rlang::quo_is_symbol)
+   arrange_codes <- sapply(q, rlang::as_label)
+
+   # Check if desc...
+   is_desc <- substr(arrange_codes, 1, 5) == "desc("
+
+   # If expr is a symbol from the data, just use it.
+   # Otherwise need to evaluate ...
+   # (TODO - currently only support variables and desc in the data)
+   # Peels off "desc" from the original
+   vars <- sub(")", "", sub("desc(", "", arrange_codes, fixed=TRUE), fixed=TRUE)
+
+   desc_vars <- vars[is_desc]
+
+   if(!all(vars %in% colnames(df))){
+     stop(paste0("Expressions currently not supported. Columns not found in colnames:", vars[!vars %in% colnames(df)]))
+   }
+
+   # Hard group by in a partially sorted way at the chunk level and then arrange within chunks
+   df %>%
+     disk.frame::hard_group_by(vars, outdir=outdir, nchunks=nchunks, overwrite=overwrite, shardby_function="sort", desc_vars=desc_vars) %>%
+     chunk_arrange(...)
+ }
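A sketch of the desc() handling above: `hard_arrange()` peels `desc(...)` off to find the sort columns, shards with `shardby_function = "sort"`, then arranges within each chunk. Assuming the chunk files are numbered from 1 as in the roxygen example:

library(disk.frame)
library(dplyr)
iris.df = as.disk.frame(iris, nchunks = 2)
sorted.df = hard_arrange(iris.df, desc(Species))
# rows sharing a Species value now sit in the same chunk, sorted within chunks
unique(as.character(get_chunk(sorted.df, 1)$Species))
delete(iris.df); delete(sorted.df)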
@@ -27771,850 +31443,787 @@ disk.frame coverage - 51.96%
- #' rbindlist disk.frames together
- #' @param df_list A list of disk.frames
- #' @param outdir Output directory of the row-bound disk.frames
- #' @param by_chunk_id If TRUE then only the chunks with the same chunk IDs will be bound
- #' @param parallel if TRUE then bind multiple disk.frame simultaneously, Defaults to TRUE
- #' @param compress 0-100, 100 being the highest compression rate.
- #' @param overwrite overwrite the output directory
- #' @param .progress A logical, for whether or not to print a progress bar for multiprocess, multisession, and multicore plans. From {furrr}
- #' @import fs
- #' @importFrom data.table data.table setDT
- #' @importFrom future.apply future_lapply
- #' @importFrom purrr map_chr map_dfr map map_lgl
- #' @importFrom purrr map
- #' @importFrom assertthat assert_that
- #' @export
- #' @examples
- #' cars.df = as.disk.frame(cars)
- #'
- #' # row-bind two disk.frames
- #' cars2.df = rbindlist.disk.frame(list(cars.df, cars.df))
- #'
- #' # clean up cars.df
- #' delete(cars.df)
- #' delete(cars2.df)
- rbindlist.disk.frame <- function(df_list, outdir = tempfile(fileext = ".df"), by_chunk_id = TRUE, parallel = TRUE, compress=50, overwrite = TRUE, .progress = TRUE) {
-   assertthat::assert_that(typeof(df_list) == "list")
-
-   overwrite_check(outdir, overwrite)
-
-   purrr::map(df_list, ~assertthat::assert_that("disk.frame" %in% class(.x), msg = "error running rbindlist.disk.frame: Not every element of df_list is a disk.frame"))
-
-   if(by_chunk_id) {
-     list_of_paths = purrr::map_chr(df_list, ~attr(.x,"path"))
-     list_of_chunks = purrr::map_dfr(list_of_paths, ~data.table(path = list.files(.x),full_path = list.files(.x,full.names = TRUE)))
-     setDT(list_of_chunks)
-
-     # split the list of chunks into lists for easy operation with future
-     slist = split(list_of_chunks$full_path,list_of_chunks$path)
-
-     if(parallel) {
-       #system.time(future.apply::future_lapply(1:length(slist), function(i) {
-       if(.progress) {
-         message("Appending disk.frames: ")
-       }
-       system.time(furrr::future_map(1:length(slist), function(i) {
-         full_paths1 = slist[[i]]
-         outfilename = names(slist[i])
-         fst::write_fst(purrr::map_dfr(full_paths1, ~fst::read_fst(.x)),file.path(outdir,outfilename), compress = compress)
-         NULL
-       }, .progress = .progress))
-     } else {
-       system.time(lapply(1:length(slist), function(i) {
-         full_paths1 = slist[[i]]
-         outfilename = names(slist[i])
-         fst::write_fst(purrr::map_dfr(full_paths1, ~fst::read_fst(.x)),file.path(outdir,outfilename), compress = compress)
-         NULL
-       }))
-     }
-
-     rbind_res = disk.frame(outdir)
-
-     shardkeys <- purrr::map(df_list, shardkey)
-
-     # if all the shardkeys are identical then
-     if(all(purrr::map_lgl(shardkeys[-1], ~identical(.x, shardkeys[[1]])))) {
-       return(add_meta(rbind_res,
-         shardkey = shardkeys[[1]]$shardkey,
-         shardchunks = shardkeys[[1]]$shardchunks,
-         compress = compress))
-     } else {
-       return(rbind_res)
-     }
-   } else {
-     stop("For rbindlist.disk.frame, only by_chunk_id = TRUE is implemented")
-   }
- }
+ #' Shard a data.frame/data.table or disk.frame into chunks and save it as a disk.frame
+ #' @param df A data.frame/data.table or disk.frame. If disk.frame, then rechunk(df, ...) is run
+ #' @param shardby The column(s) to shard the data by.
+ #' @param nchunks The number of chunks
+ #' @param outdir The output directory of the disk.frame
+ #' @param overwrite If TRUE then the chunks are overwritten
+ #' @param shardby_function splitting of chunks: "hash" for hash function or "sort" for semi-sorted chunks
+ #' @param sort_splits If shardby_function is "sort", the split values for sharding
+ #' @param desc_vars for the "sort" shardby function, the variables to sort descending.
+ #' @param ... not used
+ #' @importFrom data.table setDT
+ #' @importFrom glue glue
+ #' @export
+ #' @examples
+ #'
+ #' # shard the iris data.frame by Species so that rows with the same Species are in the same chunk
+ #' iris.df = shard(iris, "Species")
+ #'
+ #' # clean up
+ #' delete(iris.df)
+ shard <- function(df, shardby, outdir = tempfile(fileext = ".df"), ..., nchunks = recommend_nchunks(df), overwrite = FALSE, shardby_function="hash", sort_splits=NULL, desc_vars=NULL) {
+   force(nchunks)
+   overwrite_check(outdir, overwrite)
+   stopifnot(shardby_function %in% c("hash", "sort"))
+
+   if("data.frame" %in% class(df)) {
+     data.table::setDT(df)
+     if(shardby_function == "hash"){
+       message("Hashing...")
+       if(length(shardby) == 1) {
+         code = glue::glue("df[,.out.disk.frame.id := hashstr2i(as.character({shardby}), nchunks)]")
+       } else {
+         shardby_list = glue::glue("paste0({paste0(sort(shardby),collapse=',')})")
+         code = glue::glue("df[,.out.disk.frame.id := hashstr2i({shardby_list}, nchunks)]")
+       }
+     } else if(shardby_function == "sort"){
+       if(nchunks == 1){
+         message("Only one chunk: set .out.disk.frame.id = 0")
+         code = glue::glue("df[,.out.disk.frame.id := 0]")
+       } else {
+         shard_by_rule <- sortablestr2i(sort_splits, desc_vars)
+         message(shard_by_rule)
+         code = glue::glue("df[,.out.disk.frame.id := {shard_by_rule}]")
+       }
+     }
+
+     tryCatch(
+       eval(parse(text=code)),
+       error = function(e) {
+         message("error occurred in shard")
+       }
+     )
+
+     stopifnot(".out.disk.frame.id" %in% names(df))
+
+     res = write_disk.frame(df, outdir = outdir, nchunks = nchunks, overwrite = TRUE, shardby = shardby, shardchunks = nchunks, shardby_function=shardby_function, sort_splits=sort_splits, desc_vars=desc_vars)
+     return(res)
+   } else if ("disk.frame" %in% class(df)){
+     nchunks_rechunk = nchunks
+     return(rechunk(df, shardby = shardby, nchunks = nchunks_rechunk, outdir = outdir, overwrite = TRUE, shardby_function=shardby_function, sort_splits=sort_splits, desc_vars=desc_vars))
+   }
+ }
+
+ #' `distribute` is an alias for `shard`
+ #' @export
+ #' @rdname shard
+ distribute <- function(...) {
+   UseMethod("shard")
+ }
+ #' Make a data.frame into a disk.frame
+ #' @param df a disk.frame
+ #' @param outdir the output directory
+ #' @param nchunks number of chunks
+ #' @param overwrite if TRUE the outdir will be overwritten, if FALSE it will throw an error if the directory is not empty
+ #' @param compress the compression level 0-100; 100 is highest
+ #' @param shardby The shardkey
+ #' @param ... passed to output_disk.frame
+ #' @import fst
+ #' @importFrom data.table setDT
+ #' @export
+ #' @examples
+ #' # write to temporary location
+ #' cars.df = as.disk.frame(cars)
+ #'
+ #' # specify a different path in the temporary folder, you are free to choose a different folder
+ #' cars_new_location.df = as.disk.frame(cars, outdir = file.path(tempdir(), "some_path.df"))
+ #'
+ #' # specify a different number of chunks
+ #' # this writes to tempdir() by default
+ #' cars_chunks.df = as.disk.frame(cars, nchunks = 4, overwrite = TRUE)
+ #'
+ #' # clean up
+ #' delete(cars.df)
+ #' delete(cars_new_location.df)
+ #' delete(cars_chunks.df)
+ as.disk.frame <- function(df, outdir = tempfile(fileext = ".df"), nchunks = recommend_nchunks(df), overwrite = FALSE, shardby = NULL, compress = 50,...) {
+   stopifnot("data.frame" %in% class(df))
+   overwrite_check(outdir, overwrite)
+   data.table::setDT(df)
+
+   if (is.null(shardby)) {
+     odfi = rep(1:nchunks, each = ceiling(nrow(df)/nchunks))
+     odfi = odfi[1:nrow(df)]
+     df[, .out.disk.frame.id := odfi]
+
+     write_disk.frame(df, outdir, nchunks, overwrite = TRUE, shardby="", compress = compress, ...)
+   } else {
+     shard(df, shardby = shardby, outdir = outdir, nchunks = nchunks, overwrite = TRUE, compress = compress, ...)
+   }
+ }
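A sketch of the guarantee `shard()` (and `as.disk.frame(..., shardby=)`, which delegates to it) provides: all rows sharing a shard key land in the same chunk file. The check below simply lists which Species each chunk holds; no Species should appear in two chunks.

library(disk.frame)
iris.df = shard(iris, "Species", nchunks = 3)
ids = get_chunk_ids(iris.df, strip_extension = FALSE)
lapply(ids, function(id) unique(as.character(get_chunk(iris.df, id)$Species)))
delete(iris.df)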
@@ -28623,14 +32232,14 @@ disk.frame coverage - 51.96%
+ #' Removes a chunk from the disk.frame
+ #' @param df a disk.frame
+ #' @param chunk_id the chunk ID of the chunk to remove. If it's a number then return number.fst
+ #' @param full.names TRUE or FALSE. Defaults to FALSE. If true then chunk_id is the full path to the chunk otherwise it's the relative path
+ #' @export
+ #' @examples
+ #' # TODO add these to tests
+ #' cars.df = as.disk.frame(cars, nchunks = 4)
+ #'
+ #' # removes 3rd chunk
+ #' remove_chunk(cars.df, 3)
+ #' nchunks(cars.df) # 3
+ #'
+ #' # removes 4th chunk
+ #' remove_chunk(cars.df, "4.fst")
+ #' nchunks(cars.df) # 2
+ #'
+ #' # removes 2nd chunk
+ #' remove_chunk(cars.df, file.path(attr(cars.df, "path", exact=TRUE), "2.fst"), full.names = TRUE)
+ #' nchunks(cars.df) # 1
+ #'
+ #' # clean up cars.df
+ #' delete(cars.df)
+ remove_chunk <- function(df, chunk_id, full.names = FALSE) {
+   filename = ""
+   path = attr(df,"path", exact=TRUE)
+   if(is.numeric(chunk_id)) {
+     filename = file.path(path, glue::glue("{as.integer(chunk_id)}.fst"))
+   } else {
+     if (full.names) {
+       filename = chunk_id
+     } else {
+       filename = file.path(path, chunk_id)
+     }
+   }
+
+   #if(filename %in% fs::dir_ls(path, glob="*.fst")) {
+   if(fs::file_exists(filename)) {
+     fs::file_delete(filename)
+   } else {
+     warning(glue::glue("the chunk {filename} does not exist and hence can't be removed; make sure you suffix the file with the .fst extension"))
+   }
+   df
+ }
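A quick sketch of the `remove_chunk()` resolution rules coded above: numeric ids become "<id>.fst" under the disk.frame's path, while character ids must already carry the .fst extension.

library(disk.frame)
cars.df = as.disk.frame(cars, nchunks = 4)
remove_chunk(cars.df, 3)        # numeric: deletes 3.fst
remove_chunk(cars.df, "4.fst")  # character: relative file name with extension
nchunks(cars.df)                # 2
delete(cars.df)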
@@ -28923,321 +32546,404 @@ disk.frame coverage - 51.96%
- #' Set up disk.frame environment
- #' @param workers the number of workers (background R processes)
- #' @param future_backend which future backend to use for parallelization
- #' @param gui Whether to use a Graphical User Interface (GUI) for selecting the options. Defaults to FALSE
- #' @param ... passed to `future::plan`
- #' @importFrom future plan multiprocess nbrOfWorkers sequential
- #' @examples
- #' if (interactive()) {
- #'   # setup disk.frame to use multiple workers these may use more than two
- #'   # cores, and is therefore not allowed on CRAN. Hence it's set to run only in
- #'   # interactive session
- #'   setup_disk.frame()
- #'
- #'   # use a Shiny GUI to adjust settings
- #'   # only run in interactive()
- #'   setup_disk.frame(gui = TRUE)
- #' }
- #'
- #' # set the number workers to 2
- #' setup_disk.frame(2)
- #'
- #' # if you do not wish to use multiple workers you can set it to sequential
- #' setup_disk.frame(future_backend=future::sequential)
- setup_disk.frame <- function(workers = parallel::detectCores(logical = FALSE), future_backend = future::multiprocess, ..., gui = FALSE) {
-   if(!gui) {
-     future::plan(future_backend, workers = workers, gc = TRUE, ...)
-     message(sprintf("The number of workers available for disk.frame is %d", future::nbrOfWorkers()))
-     # do not limit the amount of transfers to other workers
-     # this is not allowed by CRAN policy
-     #options(future.globals.maxSize = future.globals.maxSize)
-     #options(disk.frame.nworkers = workers)
-   } else if(gui) {
-     if (!requireNamespace("shiny", quietly = TRUE)) {
-       stop("Package \"shiny\" must be installed to use GUI. You can install shiny using install.packages('shiny')",
-         call. = FALSE)
-     }
-
-     ui <- shiny::fluidPage(
-       shiny::h1("disk.frame settings"),
-       shiny::sliderInput(
-         "nbrOfWorkers",
-         sprintf("Number of workers (recommendation = %d)", parallel::detectCores(logical = FALSE)),
-         1,
-         parallel::detectCores(),
-         value = future::nbrOfWorkers(),
-         step = 1),
-       shiny::includeMarkdown(system.file("options.rmd", package="disk.frame"))
-       # , shiny::checkboxInput(
-       #   "inf_fgm",
-       #   "Recommended: Set Maximum transfer size between workers to Inf (so ignore slider below)",
-       #   value = ifelse(
-       #     is.null(getOption("future.globals.maxSize")),
-       #     TRUE,
-       #     is.infinite(getOption("future.globals.maxSize")))
-       # )
-       # ,shiny::sliderInput(
-       #   "future.globals.maxSize",
-       #   "Maximum transfer size between workers (gb)",
-       #   0,
-       #   ifelse(is.infinite(memory.limit()), 3904, memory.limit()/1024/1024/1024),
-       #   value = ifelse(is.infinite(getOption("future.globals.maxSize")), 3904, memory.limit()/1024/1024/1024),
-       #   step = 0.5
-       # )
-     )
-
-     server <- function(input, output, session) {
-       shiny::observe({
-         future::plan(future_backend, workers = input$nbrOfWorkers, gc = TRUE, ...)
-       })
-
-       #shiny::observe({
-       #if(input$inf_fgm) {
-       #options(future.globals.maxSize = Inf)
-       #} else {
-       #options(future.globals.maxSize = input$future.globals.maxSize*1024*1024*1024)
-       #}
-       #})
-     }
-
-     shiny::shinyApp(ui, server)
-   } else {
-     stop("setup_disk.frame: gui must be set to either TRUE or FALSE")
-   }
- }
- #' Column names for RStudio auto-complete
- #' @description
- #' Returns the names of the columns. Needed for RStudio to complete variable
- #' names
- #' @param x a disk.frame
- #' @importFrom dplyr tbl_vars
- #' @export
- tbl_vars.disk.frame <- function(x) {
-   names.disk.frame(x)
- }
+ #' Checks if a folder is a disk.frame
+ #' @param df a disk.frame or directory to check
+ #' @export
+ #' @examples
+ #' cars.df = as.disk.frame(cars)
+ #'
+ #' is_disk.frame(cars) # FALSE
+ #' is_disk.frame(cars.df) # TRUE
+ #'
+ #' # clean up cars.df
+ #' delete(cars.df)
+ is_disk.frame <- function(df) {
+   ##browser
+   if("disk.frame" %in% class(df)) {
+     df = attr(df, "path", exact=TRUE)
+   } else if(!"character" %in% class(df)) { # character then check the path
+     return(FALSE)
+   }
+
+   files <- fs::dir_ls(df, type="file", all = TRUE)
+   # if all files are fst
+   if(length(files)>0) {
+     if(any(purrr::map_lgl(files, ~length(grep(glob2rx("*.fst"), .x)) == 0))) {
+       # some of the files do not have a .fst extension
+       return(FALSE)
+     }
+   }
+
+   dirs = fs::dir_ls(df, type="directory", all = TRUE)
+   if(length(dirs) > 1) {
+     return(FALSE)
+   } else if(length(dirs) == 1) {
+     if(substr(dirs, nchar(dirs)-8,nchar(dirs)) != ".metadata") {
+       return(FALSE)
+     }
+   }
+
+   return(TRUE)
+ }
+ sample_n.disk.frame <- function(df, size = 1, replace = FALSE, weight = NULL, .env = NULL){
+   stop("not implemented yet")
+   if(!is.null(weight)) {
+     stop("sample_n(..., weight =) is not implemented yet")
+   }
+
+   #delayed(df, ~sample_frac(.x, size, replace, weight, .env))
+ }
+ #' Get the chunk IDs and file names
+ #' @param df a disk.frame
+ #' @param full.names If TRUE returns the full path to the file, Defaults to FALSE
+ #' @param strip_extension If TRUE then the file extension in the chunk_id is removed. Defaults to TRUE
+ #' @param ... passed to list.files
+ #' @importFrom stringr fixed
+ #' @export
+ #' @examples
+ #' cars.df = as.disk.frame(cars)
+ #'
+ #' # return the integer-string chunk IDs
+ #' get_chunk_ids(cars.df)
+ #'
+ #' # return the file name chunk IDs
+ #' get_chunk_ids(cars.df, full.names = TRUE)
+ #'
+ #' # return the file name chunk IDs with file extension
+ #' get_chunk_ids(cars.df, strip_extension = FALSE)
+ #'
+ #' # clean up cars.df
+ #' delete(cars.df)
+ get_chunk_ids <- function(df, ..., full.names = FALSE, strip_extension = TRUE) {
+   lf = list.files(attr(df,"path"), full.names = full.names, ...)
+   if(full.names) {
+     return(lf)
+   }
+   purrr::map_chr(lf, ~{
+     tmp = stringr::str_split(.x,stringr::fixed("."), simplify = TRUE)
+     l = length(tmp)
+     if(l == 1) {
+       return(tmp)
+     } else if(strip_extension) {
+       paste0(tmp[-l], collapse="")
+     } else {
+       .x
+     }
+   })
+ }
+ #' Helper function to evalparse some `glue::glue` string
+ #' @param code the code in character(string) format to evaluate
+ #' @param env the environment in which to evaluate the code
+ #' @export
+ evalparseglue <- function(code, env = parent.frame()) {
+   eval(parse(text = glue::glue(code, .envir = env)), envir = env)
+ }
+
+ #' Generate synthetic dataset for testing
+ #' @param N number of rows. Defaults to 200 million
+ #' @param K controls the number of unique values for id. Some ids will have K distinct values while others have N/K distinct values
+ #' @importFrom stats runif
+ #' @export
+ gen_datatable_synthetic <- function(N=2e8, K=100) {
+   data.table(
+     id1 = sample(sprintf("id%03d",1:K), N, TRUE),       # large groups (char)
+     id2 = sample(sprintf("id%03d",1:K), N, TRUE),       # large groups (char)
+     id3 = sample(sprintf("id%010d",1:(N/K)), N, TRUE),  # small groups (char)
+     id4 = sample(K, N, TRUE),                           # large groups (int)
+     id5 = sample(K, N, TRUE),                           # large groups (int)
+     id6 = sample(N/K, N, TRUE),                         # small groups (int)
+     v1 = sample(5, N, TRUE),                            # int in range [1,5]
+     v2 = sample(5, N, TRUE),                            # int in range [1,5]
+     v3 = sample(round(runif(100,max=100),4), N, TRUE),  # numeric e.g. 23.5749
+     date1 = sample(seq(as.Date('1970-01-01'), as.Date('2019-01-01'), by = "day"), N, TRUE) # date
+   )
+ }
+ #' Return the column names of the disk.frame
+ #'
+ #' The returned column names are from the source. So if you have lazy operations then the
+ #' colnames here do not reflect the results of those operations. To obtain the correct names try
+ #' \code{names(collect(get_chunk(df, 1)))}
+ #'
+ #' @param x a disk.frame
+ #' @param ... not used
+ #' @export
+ #' @importFrom fs dir_ls
+ #' @importFrom fst metadata_fst
+ #' @export
+ colnames <- function(x, ...) {
+   UseMethod("colnames")
+ }
+
+ #' @rdname colnames
+ #' @export
+ names.disk.frame <- function(x, ...) {
+   colnames.disk.frame(x, ...)
+ }
+
+ #' @rdname colnames
+ #' @export
+ colnames.disk.frame <- function(x, ...) {
+   res = attr(x, "path", exact=TRUE) %>%
+     fs::dir_ls(type="file")
+   if(length(res) == 0) {
+     return(vector("character"))
+   }
+   fst::metadata_fst(res[1])$columnNames
+ }
+
+ #' @rdname colnames
+ #' @export
+ colnames.default <- function(x, ...) {
+   base::colnames(x, ...)
+ }
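Two small sketches for the helpers above: `get_chunk_ids()` with and without extension stripping, and `evalparseglue()`, which is just `eval(parse(text = glue::glue(...)))` as used by `shard()` to run string-built data.table calls.

library(disk.frame)
cars.df = as.disk.frame(cars, nchunks = 2)
get_chunk_ids(cars.df)                           # "1" "2"
get_chunk_ids(cars.df, strip_extension = FALSE)  # "1.fst" "2.fst"
x = 41
evalparseglue("x + {1}")  # glue() interpolates, then parse/eval: 42
delete(cars.df)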
@@ -29941,436 +33647,466 @@ disk.frame coverage - 51.96%
- #' Convert disk.frame to data.frame by collecting all chunks
- #' @param x a disk.frame
- #' @param row.names NULL or a character vector giving the row names for the data frame. Missing values are not allowed.
- #' @param optional logical. If TRUE, setting row names and converting column names (to syntactic names: see make.names) is optional. Note that all of R's base package as.data.frame() methods use optional only for column names treatment, basically with the meaning of data.frame(*, check.names = !optional). See also the make.names argument of the matrix method.
- #' @param ... additional arguments to be passed to or from methods.
- #' @export
- #' @examples
- #' cars.df = as.disk.frame(cars)
- #' as.data.frame(cars.df)
- #'
- #' # clean up
- #' delete(cars.df)
- as.data.frame.disk.frame <- function(x, row.names, optional, ...) { # needs to retain x for consistency
-   as.data.frame(collect(x), row.names, optional, ...)
- }
-
- #' Convert disk.frame to data.table by collecting all chunks
- #' @param x a disk.frame
- #' @param keep.rownames passed to as.data.table
- #' @param ... passed to as.data.table
- #' @export
- #' @examples
- #' library(data.table)
- #' cars.df = as.disk.frame(cars)
- #' as.data.table(cars.df)
- #'
- #' # clean up
- #' delete(cars.df)
- as.data.table.disk.frame <- function(x, keep.rownames = FALSE, ...) {
-   as.data.table(collect(x), keep.rownames = keep.rownames, ...)
- }
- #' Delete a disk.frame
- #' @param df a disk.frame
- #' @importFrom fs dir_delete
- #' @export
- #' @examples
- #' cars.df = as.disk.frame(cars)
- #' delete(cars.df)
- delete <- function(df) {
-   stopifnot("disk.frame" %in% class(df))
-
-   fs::dir_delete(attr(df, "path"))
- }
- #' Keep only the variables from the input listed in selections
- #' @param df a disk.frame
- #' @param selections The list of variables to keep from the input source
- #' @export
- #' @examples
- #' cars.df = as.disk.frame(cars)
- #'
- #' # when loading cars's chunks into RAM, load only the column speed
- #' collect(srckeep(cars.df, "speed"))
- #'
- #' # clean up cars.df
+ #' Number of rows or columns
+ #' @param ... passed to base::nrow
+ #' @export
+ #' @rdname ncol_nrow
+ #' @examples
+ #' cars.df = as.disk.frame(cars)
+ #'
+ #' # return total number of columns and rows
+ #' ncol(cars.df)
+ #' nrow(cars.df)
+ #'
+ #' # clean up cars.df
+ #' delete(cars.df)
+ nrow <- function(df,...) {
+   UseMethod("nrow")
+ }
+
+ #' @rdname ncol_nrow
+ #' @export
+ #' @noRd
+ nrow.default <- function(df, ...) {
+   base::nrow(df, ...)
+ }
+
+ #' @export
+ #' @rdname ncol_nrow
+ #' @import fst
+ nrow.disk.frame <- function(df, ...) {
+   stopifnot(is_ready(df))
+   path1 <- attr(df,"path", exact=TRUE)
+   if(dir.exists(path1)) {
+     path2 <- list.files(path1,full.names = TRUE)
+     if(length(path2) == 0) {
+       return(0)
+     }
+     tmpfstmeta = fst::fst.metadata(path2[1])
+     if("nrOfRows" %in% names(tmpfstmeta)) {
+       return(sum(sapply(path2, function(p2) fst::fst.metadata(p2)$nrOfRows)))
+     } else {
+       return(sum(sapply(path2, function(p2) fst::fst.metadata(p2)$NrOfRows)))
+     }
+   } else {
+     #return(fst::fst.metadata(path1)$NrOfRows)
+     stop(glue::glue("nrow error: directory {path1} does not exist"))
+   }
+ }
+
+ #' @import fst
+ #' @export
+ #' @rdname ncol_nrow
+ ncol <- function(df) {
+   UseMethod("ncol")
+ }
+
+ #' @import fs
+ #' @export
+ #' @param df a disk.frame
+ #' @rdname ncol_nrow
+ ncol.disk.frame <- function(df) {
+ ncol.disk.frame <- function(df) {+ |
+ |
60 | +12x | +
+ length(colnames(df)) |
|
12 | +61 |
- #' delete(cars.df)
+ }
|
|
13 | +62 |
- srckeep <- function(df, selections) {+ |
|
14 | -1x | +||
63 | +
- stopifnot("disk.frame" %in% class(df))+ #' @export
|
||
15 | -1x | +||
64 | +
- attr(df,"keep") = selections+ ncol.default <- function(df) { |
||
16 | -1x | +||
65 | +130x |
- df
+ base::ncol(df) |
|
17 | +66 |
}
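`nrow.disk.frame` above never loads a chunk: it sums the row counts stored in each chunk's fst header, and `ncol` likewise comes from metadata via `colnames`. A small usage sketch, assuming the nycflights13 data listed in Suggests:

```r
library(disk.frame)

flights.df = as.disk.frame(nycflights13::flights, nchunks = 8)

# both are metadata lookups; no chunk is read into RAM
nrow(flights.df)  # 336776, summed across the 8 chunk files
ncol(flights.df)  # 19, taken from the first chunk's fst header

delete(flights.df)
```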
@@ -30379,612 +34115,611 @@ disk.frame coverage - 51.96%
-.onLoad <- function(libname, pkgname){
-
-}
-
-#' @importFrom future nbrOfWorkers
-#' @importFrom crayon red blue green
-.onAttach <- function(libname, pkgname) {
-  #setup_disk.frame()
-
-  packageStartupMessage(
-    crayon::red(
-      glue::glue(
-        "\n\n## Message from disk.frame:
-        We have {future::nbrOfWorkers()} workers to use with disk.frame.
-        To change that, use setup_disk.frame(workers = n) or just setup_disk.frame() to use the defaults.")),
-    crayon::green("\n\n
-It is recommended that you run the following immediately to set up disk.frame with multiple workers in order to parallelize your operations:\n\n
-```r
-# this will set up disk.frame with multiple workers
-setup_disk.frame()
-# this will allow unlimited amount of data to be passed from worker to worker
-options(future.globals.maxSize = Inf)
-```
-\n\n"))
-}
-
-globalVariables(c(
-  "syms", # needed by dplyr to treat something as a symbol
-  ".",
-  ".BY",
-  ".N",
-  ".SD",
-  ".out.disk.frame.id",
-  ":=",
-  "N",
-  "area",
-  "chunk_id",
-  "coltypes",
-  "coltypes.x",
-  "coltypes.y",
-  "ctot",
-  "existing_df",
-  "feature_s",
-  "h",
-  "height",
-  "incompatible_types",
-  "lag_height",
-  "new_chunk",
-  "ok",
-  "pathA",
-  "pathB",
-  "w",
-  "xid",
-  "yid"))
-
-#' @useDynLib disk.frame
-#' @importFrom Rcpp evalCpp
-#@exportPattern "^[[:alpha:]]+"
-NULL
+#' Bring the disk.frame into R
+#'
+#' Bring the disk.frame into RAM by loading the data and running all lazy
+#' operations as data.table/data.frame or as a list
+#' @param x a disk.frame
+#' @param parallel if TRUE the collection is performed in parallel. By default,
+#' if there are delayed/lazy steps then it will be parallel, otherwise it will
+#' not be in parallel. This is because parallel requires transferring data
+#' from background R sessions to the current R session; if there is no
+#' computation then it's better to avoid transferring data between sessions,
+#' hence parallel = FALSE is a better choice
+#' @param ... not used
+#' @importFrom data.table data.table as.data.table
+#' @importFrom furrr future_map_dfr future_options
+#' @importFrom purrr map_dfr
+#' @importFrom dplyr collect select mutate
+#' @return collect returns a data.frame/data.table
+#' @examples
+#' cars.df = as.disk.frame(cars)
+#' # use collect to bring the data into RAM as a data.table/data.frame
+#' collect(cars.df)
+#'
+#' # clean up
+#' delete(cars.df)
+#' @export
+#' @rdname collect
+collect.summarized_disk.frame <- function(x, ..., parallel = !is.null(attr(x, "lazyfn"))) {
+  code_to_run = glue::glue("x %>% {attr(x, 'summarize_code') %>% as.character}")
+  class(x) <- "disk.frame"
+  eval(parse(text = code_to_run))
+}
+
+#' Convert disk.frame to data.frame by collecting all chunks
+#' @param x a disk.frame
+#' @param row.names NULL or a character vector giving the row names for the data frame. Missing values are not allowed.
+#' @param optional logical. If TRUE, setting row names and converting column names (to syntactic names: see make.names) is optional. Note that all of R's base package as.data.frame() methods use optional only for column names treatment, basically with the meaning of data.frame(*, check.names = !optional). See also the make.names argument of the matrix method.
+#' @param ... additional arguments to be passed to or from methods.
+#' @export
+#' @examples
+#' cars.df = as.disk.frame(cars)
+#' as.data.frame(cars.df)
+#'
+#' # clean up
+#' delete(cars.df)
+as.data.frame.disk.frame <- function(x, row.names, optional, ...) { # needs to retain x for consistency
+  as.data.frame(collect(x), row.names, optional, ...)
+}
+
+#' Convert disk.frame to data.table by collecting all chunks
+#' @param x a disk.frame
+#' @param keep.rownames passed to as.data.table
+#' @param ... passed to as.data.table
+#' @export
+#' @examples
+#' library(data.table)
+#' cars.df = as.disk.frame(cars)
+#' as.data.table(cars.df)
+#'
+#' # clean up
+#' delete(cars.df)
+as.data.table.disk.frame <- function(x, keep.rownames = FALSE, ...) {
+  as.data.table(collect(x), keep.rownames = keep.rownames, ...)
+}
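Both converters above are thin wrappers around `collect()`, so each call materializes every chunk into RAM; only the return type differs. A quick sketch:

```r
library(disk.frame)
library(data.table)

cars.df = as.disk.frame(cars)

# all three load every chunk; only the class of the result differs
class(collect(cars.df))        # "data.table" "data.frame"
class(as.data.frame(cars.df))  # "data.frame"
class(as.data.table(cars.df))  # "data.table" "data.frame"

delete(cars.df)
```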
-#' Helper function to evalparse some `glue::glue` string
-#' @param code the code in character(string) format to evaluate
-#' @param env the environment in which to evaluate the code
-#' @export
-evalparseglue <- function(code, env = parent.frame()) {
-  eval(parse(text = glue::glue(code, .envir = env)), envir = env)
-}
-
-#' Generate synthetic dataset for testing
-#' @param N number of rows. Defaults to 200 million
-#' @param K controls the number of unique values for id. Some ids will have K distinct values while others have N/K distinct values
-#' @importFrom stats runif
-#' @export
-gen_datatable_synthetic <- function(N=2e8, K=100) {
-  data.table(
-    id1 = sample(sprintf("id%03d",1:K), N, TRUE),      # large groups (char)
-    id2 = sample(sprintf("id%03d",1:K), N, TRUE),      # large groups (char)
-    id3 = sample(sprintf("id%010d",1:(N/K)), N, TRUE), # small groups (char)
-    id4 = sample(K, N, TRUE),                          # large groups (int)
-    id5 = sample(K, N, TRUE),                          # large groups (int)
-    id6 = sample(N/K, N, TRUE),                        # small groups (int)
-    v1 = sample(5, N, TRUE),                           # int in range [1,5]
-    v2 = sample(5, N, TRUE),                           # int in range [1,5]
-    v3 = sample(round(runif(100,max=100),4), N, TRUE), # numeric e.g. 23.5749
-    date1 = sample(seq(as.Date('1970-01-01'), as.Date('2019-01-01'), by = "day"), N, TRUE) # date
-  )
-}
+#' Compute without writing
+#' @description
+#' Perform the computation; same as calling cmap without .f and lazy = FALSE
+#' @param x a disk.frame
+#' @param outdir the output directory
+#' @param overwrite whether to overwrite or not
+#' @param name Not used. Kept for compatibility with dplyr
+#' @param ... Not used. Kept for dplyr compatibility
+#' @export
+#' @importFrom dplyr compute
+#' @examples
+#' cars.df = as.disk.frame(cars)
+#' cars.df2 = cars.df %>% cmap(~.x)
+#' # the computation is performed and the data is now stored elsewhere
+#' cars.df3 = compute(cars.df2)
+#'
+#' # clean up
+#' delete(cars.df)
+#' delete(cars.df3)
+compute.disk.frame <- function(x, name, outdir = tempfile("tmp_df_", fileext = ".df"), overwrite = TRUE, ...) {
+  overwrite_check(outdir, overwrite)
+  write_disk.frame(x, outdir = outdir, overwrite = TRUE)
+}
@@ -30993,14 +34728,14 @@ disk.frame coverage - 51.96%
-#' Show the code to setup disk.frame
-#' @export
-show_ceremony <- function() {
-  glue::glue(crayon::green(ceremony_text()))
-}
-
-#' @rdname show_ceremony
-#' @export
-ceremony_text <- function() {
-  "
-  # this willl set disk.frame with multiple workers
-  setup_disk.frame()
-  # this will allow unlimited amount of data to be passed from worker to worker
-  options(future.globals.maxSize = Inf)
-  "
-}
-
-#' @rdname show_ceremony
-#' @export
-show_boilerplate <- function() show_ceremony()
-
-#' @rdname show_ceremony
-#' @export
-insert_ceremony <- function() {
-  if(requireNamespace("rstudioapi")) {
-    rstudioapi::insertText(ceremony_text())
-  } else {
-    stop("insert ceremony can only be used inside RStudio")
-  }
-}
+#' @rdname cmap2
+map_by_chunk_id <- function(.x, .y, .f, ..., outdir) {
+  warning("map_by_chunk_id is deprecated. Use map2 instead")
+  map2.disk.frame(.x, .y, .f, ..., outdir = outdir)
+}
+
+#' Delete a disk.frame
+#' @param df a disk.frame
+#' @importFrom fs dir_delete
+#' @export
+#' @examples
+#' cars.df = as.disk.frame(cars)
+#' delete(cars.df)
+delete <- function(df) {
+  stopifnot("disk.frame" %in% class(df))
+
+  fs::dir_delete(attr(df, "path"))
+}
+
+#' Column names for RStudio auto-complete
+#' @description
+#' Returns the names of the columns. Needed for RStudio to complete variable
+#' names
+#' @param x a disk.frame
+#' @importFrom dplyr tbl_vars
+#' @export
+tbl_vars.disk.frame <- function(x) {
+  names.disk.frame(x)
+}
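`compute.disk.frame` above is the eager counterpart of the lazy chunk API: it runs any pending steps and writes the result to a fresh disk.frame directory. A minimal end-to-end sketch, assuming the 0.6.0 API shown in this report (`lazy.df` and `done.df` are hypothetical names):

```r
library(disk.frame)

cars.df = as.disk.frame(cars)

# queue a lazy chunk-wise filter; nothing is written yet
lazy.df = cmap(cars.df, ~ .x[.x$speed > 10, ])

# compute() materializes the pending steps into a new disk.frame,
# written to outdir (a tempfile(fileext = ".df") by default)
done.df = compute(lazy.df)
nrow(done.df)  # only the filtered rows remain

delete(cars.df)
delete(done.df)
```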
@@ -31227,4 +34953,4 @@ disk.frame coverage - 51.96%- \ No newline at end of file + \ No newline at end of file diff --git a/misc/disk.frame-report_files/datatables-crosstalk.css b/misc/disk.frame-report_files/datatables-crosstalk.css index 43422d25..fb5bae84 100644 --- a/misc/disk.frame-report_files/datatables-crosstalk.css +++ b/misc/disk.frame-report_files/datatables-crosstalk.css @@ -5,3 +5,19 @@ html body div.DTS div.dataTables_scrollBody { background: none; } + + +/* +Fix https://github.com/rstudio/DT/issues/563 +If the `table.display` is set to "block" (e.g., pkgdown), the browser will display +datatable objects strangely. The search panel and the page buttons will still be +in full-width but the table body will be "compact" and shorter. +In therory, having this attributes will affect `dom="t"` +with `display: block` users. But in reality, there should be no one. +We may remove the below lines in the future if the upstream agree to have this there. +See https://github.com/DataTables/DataTablesSrc/issues/160 +*/ + +table.dataTable { + display: table; +} diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 613bb16b..49f19db7 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -5,6 +5,11 @@ using namespace Rcpp; +#ifdef RCPP_USE_GLOBAL_ROSTREAM +Rcpp::Rostream |