From ad2d8a3cf393d68209328a53c0837266b80a2628 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Sat, 8 Aug 2020 13:01:21 +1000 Subject: [PATCH 1/8] minor --- DESCRIPTION | 2 +- R/dplyr_verbs.r | 24 ++++++++++-------------- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 421d1903..d235b10b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -52,7 +52,7 @@ Suggests: covr LinkingTo: Rcpp -RoxygenNote: 7.1.0 +RoxygenNote: 7.1.1 Encoding: UTF-8 URL: https://diskframe.com BugReports: https://github.com/xiaodaigh/disk.frame/issues diff --git a/R/dplyr_verbs.r b/R/dplyr_verbs.r index 9daca00e..8ea3c4ea 100644 --- a/R/dplyr_verbs.r +++ b/R/dplyr_verbs.r @@ -83,20 +83,16 @@ tally.disk.frame <- create_chunk_mapper(dplyr::tally) #' @rdname dplyr_verbs count.disk.frame <- create_chunk_mapper(dplyr::count) -# TODO family is not required is group-by -# TODO alot of these .disk.frame functions are not generic - - -#' @export -#' @importFrom dplyr add_count -#' @rdname dplyr_verbs -add_count.disk.frame <- create_chunk_mapper(dplyr::add_count) - - -#' @export -#' @importFrom dplyr add_tally -#' @rdname dplyr_verbs -add_tally.disk.frame <- create_chunk_mapper(dplyr::add_tally) +#' #' @export +#' #' @importFrom dplyr add_count +#' #' @rdname dplyr_verbs +#' add_count.disk.frame <- create_chunk_mapper(dplyr::add_count) +#' +#' +#' #' @export +#' #' @importFrom dplyr add_tally +#' #' @rdname dplyr_verbs +#' add_tally.disk.frame <- create_chunk_mapper(dplyr::add_tally) #' @export From 461f88608bd3ca85f5255e42c99aab656b5fee03 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Sat, 8 Aug 2020 13:04:26 +1000 Subject: [PATCH 2/8] minor cleanup --- CRAN-RELEASE | 4 +- book/10-group-by.Rmd | 33 +- .../rstudio conf 2021/1min video script | 11 + .../rstudio conf 2021/Abstract proposal.md | 5 + presentation/twin cities/.gitignore | 1 + tests/testthat/test-add-chunk.r | 24 ++ tests/testthat/test-names.r | 7 + vignettes/concepts.Rmd | 69 ---- 
vignettes/convenience-features.Rmd | 54 --- vignettes/data-table-syntax.Rmd | 78 ---- vignettes/glm.Rmd | 111 ----- vignettes/ingesting-data.Rmd | 174 -------- vignettes/intro-disk-frame.Rmd | 378 ------------------ 13 files changed, 67 insertions(+), 882 deletions(-) create mode 100644 presentation/rstudio conf 2021/1min video script create mode 100644 presentation/rstudio conf 2021/Abstract proposal.md create mode 100644 presentation/twin cities/.gitignore delete mode 100644 vignettes/concepts.Rmd delete mode 100644 vignettes/convenience-features.Rmd delete mode 100644 vignettes/data-table-syntax.Rmd delete mode 100644 vignettes/glm.Rmd delete mode 100644 vignettes/ingesting-data.Rmd delete mode 100644 vignettes/intro-disk-frame.Rmd diff --git a/CRAN-RELEASE b/CRAN-RELEASE index 116e1f09..598e45ee 100644 --- a/CRAN-RELEASE +++ b/CRAN-RELEASE @@ -1,2 +1,2 @@ -This package was submitted to CRAN on 2020-02-24. -Once it is accepted, delete this file and tag the release (commit eea20eac59). +This package was submitted to CRAN on 2020-07-07. +Once it is accepted, delete this file and tag the release (commit eb830e2ac0). diff --git a/book/10-group-by.Rmd b/book/10-group-by.Rmd index f0f6ec34..1f47eb55 100644 --- a/book/10-group-by.Rmd +++ b/book/10-group-by.Rmd @@ -58,22 +58,23 @@ It is important to note that not all functions that can run in `dplyr::summarize If a function you need/like is missing, please make a feature request [here](https://github.com/xiaodaigh/disk.frame/issues). It is a limitation that function that depend on the order a column can only obtained using estimated methods. 
-| Function | Exact/Estimate | Notes | -| -- | -- | -- | -| `min` | Exact | | -| `max` | Exact | | -| `mean` | Exact | | -| `sum` | Exact | | -| `length` | Exact | | -| `n` | Exact | | -| `n_distinct` | Exact | | -| `sd` | Exact | | -| `var` | Exact | `var(x)` only `cor, cov` support *planned* | -| `any` | Exact | | -| `all` | Exact | | -| `median` | Estimate | | -| `quantile` | Estimate | One quantile only | -| `IQR` | Estimate | | +| Function | Exact/Estimate | Notes | +|--------------|----------------|--------------------------------------------| +| `min` | Exact | | +| `max` | Exact | | +| `mean` | Exact | | +| `sum` | Exact | | +| `length` | Exact | | +| `n` | Exact | | +| `n_distinct` | Exact | | +| `sd` | Exact | | +| `var` | Exact | `var(x)` only `cor, cov` support *planned* | +| `any` | Exact | | +| `all` | Exact | | +| `median` | Estimate | | +| `quantile` | Estimate | One quantile only | +| `IQR` | Estimate | | + ### Notes on One-Stage group-by diff --git a/presentation/rstudio conf 2021/1min video script b/presentation/rstudio conf 2021/1min video script new file mode 100644 index 00000000..53337167 --- /dev/null +++ b/presentation/rstudio conf 2021/1min video script @@ -0,0 +1,11 @@ +Hi my name is ZedJ and I am a Data Scientist local to Melbourne. I am a keen contributor to open source data science projects, one of which I want to talk about at rstudio:conf 2021. That project is {disk.frame} - a larger-than-RAM data manipulation package. + +R needs to load the data in its entirety into RAM. However, RAM is a precious resource and often runs out. + +{disk.frame} solves this issue by providing a 100%-R framework to manipulate data on disk. A modern laptop with {disk.frame} can comfortably handle 100GBs of data. + +Also, {disk.frame} uses {dplyr} verbs to manipulate data so useRs will find it very easy to pick up. + +Finally, because {disk.frame} is 100%-R, you can use any R package with it at no extra cost unlike Spark. 
+ +The talk I propose will introduce {disk.frame} to users with the need to manipulate large amounts of data with minimal setup. They will find {disk.frame} very familiar, as {disk.frame} uses {dplyr} verbs directly. Some users rely on DBMS (e.g. PostgreSQL), Spark, or SAS to manage their large datasets. They will find lots of benefits in switching to {disk.frame}, which will allow them to keep their workflow in R for as long as possible. Because {disk.frame} can run R functions natively, they will find that {disk.frame} allows them to use many R packages directly with {disk.frame}. diff --git a/presentation/rstudio conf 2021/Abstract proposal.md b/presentation/rstudio conf 2021/Abstract proposal.md new file mode 100644 index 00000000..e7406d3f --- /dev/null +++ b/presentation/rstudio conf 2021/Abstract proposal.md @@ -0,0 +1,5 @@ +Learn how to handle 100GBs of data with ease using {disk.frame} - the larger-than-RAM-data manipulation package. + +R loads data in its entirety into RAM. However, RAM is a precious resource and often runs out. That's why most R users would have run into the "cannot allocate vector of size xxB." error at some point. + +However, the need to handle larger-than-RAM data doesn't go away just because RAM isn't large enough. So many useRs turn to big data tools like Spark for the task. In this talk, I will make the case that {disk.frame} is sufficient and often preferable for manipulating larger-than-RAM data that fits on disk. 
\ No newline at end of file diff --git a/presentation/twin cities/.gitignore b/presentation/twin cities/.gitignore new file mode 100644 index 00000000..26416673 --- /dev/null +++ b/presentation/twin cities/.gitignore @@ -0,0 +1 @@ +*.mp4 diff --git a/tests/testthat/test-add-chunk.r b/tests/testthat/test-add-chunk.r index 07855ac3..c6b38a73 100644 --- a/tests/testthat/test-add-chunk.r +++ b/tests/testthat/test-add-chunk.r @@ -4,6 +4,17 @@ setup({ setup_disk.frame(workers = 2) }) +test_that("guard against github 292", { + a = data.frame(a = as.Date("2020-07-01"), b = runif(1e6)) + + a.df = as.disk.frame(a) + + head(a.df) + + expect_s3_class(add_chunk(a.df, a), "disk.frame") + delete(a.df) +}) + test_that("testing add chunk without naming chunk_id", { a = data.frame(a = 1:100, b = 1:100) @@ -37,5 +48,18 @@ test_that("testing add chunk by naming chunk_id", { delete(a1) }) +test_that("testing add chunk by using compression", { + a = data.frame(a = 1:100, b = 1:100) + + a1 = as.disk.frame(a, overwrite = TRUE) + b = data.frame(a = 51:150, b = 1:100) + d = data.frame(a = 1:50, b = 1:50) + + add_chunk(a1, b, compress=50) + expect_equal(nrow(a1), 200) + + delete(a1) +}) + teardown({ }) \ No newline at end of file diff --git a/tests/testthat/test-names.r b/tests/testthat/test-names.r index 0441136b..2d114aeb 100644 --- a/tests/testthat/test-names.r +++ b/tests/testthat/test-names.r @@ -12,6 +12,13 @@ test_that("testing names", { expect_setequal(names(b), c("a","b")) }) +test_that("testing names with lazyfn", { + b = disk.frame(file.path(tempdir(), "tmp_names.df")) %>% + mutate(d = a + b) + + expect_setequal(colnames(b), c("a","b", "d")) + expect_setequal(names(b), c("a","b", "d")) +}) teardown({ fs::dir_delete(file.path(tempdir(), "tmp_names.df")) diff --git a/vignettes/concepts.Rmd b/vignettes/concepts.Rmd deleted file mode 100644 index d320fd90..00000000 --- a/vignettes/concepts.Rmd +++ /dev/null @@ -1,69 +0,0 @@ ---- -title: "Key `{disk.frame}` concepts" -author: "ZJ" 
-output: rmarkdown::html_vignette -vignette: > - %\VignetteIndexEntry{Key disk.frame concepts} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -```{r, include = FALSE} -knitr::opts_chunk$set( - collapse = TRUE, - comment = "#>" -) -``` - -# Key `{disk.frame}` concepts -There are a number of concepts and terminologies that are useful to understand in order to use `disk.frame` effectively. - -## What is a `disk.frame` and what are chunks? - -A `disk.frame` is a folder containing [`fst`](https://www.fstpackage.org/) files named "1.fst", "2.fst", "3.fst" etc. Each of the ".fst" file is called a _chunk_. - -## Workers and parallelism - -Parallelism in `disk.frame` is achieved using the [`future` package](https://cran.r-project.org/package=future). When performing many tasks, `disk.frame` uses multiple workers, where each _worker_ is an R session, to perform the tasks in parallel. - -It is recommended that you should run the following immediately after `library(disk.frame)` to set-up multiple workers. For example: - -```r -library(disk.frame) -setup_disk.frame() - -# this will allow unlimited amount of data to be passed from worker to worker -options(future.globals.maxSize = Inf) -``` - -For example, suppose we wish to compute the number of rows for each chunk, we can clearly perform this simultaneously in parallel. The code to do that is - -```r -# use only one column is fastest -df[,.N, keep = "first_col"] -``` - -or equivalent using the `srckeep` function - -```r -# use only one column is fastest -srckeep(df, "first_col")[,.N, keep = "first_col"] -``` - -Say there are `n` chunks in `df`, and there are `m` workers. Then the first `m` chunks will run `chunk[,.N]` simultaneously. - -To see how many workers are at work, use -```r -# see how many workers are available for work -future::nbrOfWorkers() -``` - -## How `{disk.frame}` works - -When `df %>% some_fn %>% collect` is called. The `some_fn` is applied to each chunk of `df`. 
The collect will row-bind the results from `some_fn(chunk)`together if the returned value of `some_fn` is a data.frame, or it will return a `list` containing the results of `some_fn`. - -The session that receives these results is called the **main session**. In general, we should try to minimize the amount of data passed from the worker sessions back to the main session, because passing data around can be slow. - -Also, please note that there is no communication between the workers, except for workers passing data back to the main session. - - diff --git a/vignettes/convenience-features.Rmd b/vignettes/convenience-features.Rmd deleted file mode 100644 index 6269a97b..00000000 --- a/vignettes/convenience-features.Rmd +++ /dev/null @@ -1,54 +0,0 @@ ---- -title: "Convenience features" -author: "ZJ" -output: rmarkdown::html_vignette -vignette: > - %\VignetteIndexEntry{Convenience features} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -```{r, include = FALSE} -knitr::opts_chunk$set( - collapse = TRUE, - comment = "#>" -) -``` - -## Convenience Features - -### GUI for setting options - -I wanted to make `{disk.frame}` as easy to use as possible. I often forget what options are available to me. So I've made a GUI - -```r -setup_disk.frame(gui = TRUE) -``` -which opens up a Shiny app where the user can choose the options. 
- -### RStudio column name completion - -```r -library(disk.frame) -mtcars.df = as.disk.frame(mtcars) - -mtcars.df %>% - filter() -``` - -you can press tab in RStudio and it will show all column available - -### Insert ceremony/boilerplate into code in RStudio - -The below will insert the recommended ceremony code into your editor -```r -disk.frame::insert_ceremony() -``` -should insert - -```r -# this willl set disk.frame with multiple workers -setup_disk.frame() -# this will allow unlimited amount of data to be passed from worker to worker -options(future.globals.maxSize = Inf) -``` diff --git a/vignettes/data-table-syntax.Rmd b/vignettes/data-table-syntax.Rmd deleted file mode 100644 index e87ca88b..00000000 --- a/vignettes/data-table-syntax.Rmd +++ /dev/null @@ -1,78 +0,0 @@ ---- -title: "Using data.table syntax with disk.frame" -author: "ZJ" -output: rmarkdown::html_vignette -vignette: > - %\VignetteIndexEntry{Using data.table syntax} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -```{r, include = FALSE} -knitr::opts_chunk$set( - collapse = TRUE, - comment = "#>" -) -``` - -## `disk.frame` supports `data.table` syntax - - -```{r setup_data_table, cache=TRUE} -library(disk.frame) - -# set-up disk.frame to use multiple workers -if(interactive()) { - setup_disk.frame() - # highly recommended, however it is pun into interactive() for CRAN because - # change user options are not allowed on CRAN - options(future.globals.maxSize = Inf) -} else { - setup_disk.frame(2) -} - - -library(nycflights13) - -# create a disk.frame -flights.df = as.disk.frame(nycflights13::flights, outdir = file.path(tempdir(),"flights13"), overwrite = TRUE) -``` - -In the following example, I will use the `.N` from the `data.table` package to count the unique combinations `year` and `month` within each chunk. 
- -```{r ok, dependson='setup_data_table'} -library(data.table) -library(disk.frame) - -flights.df = disk.frame(file.path(tempdir(),"flights13")) - -names(flights.df) - -flights.df[,.N, .(year, month), keep = c("year", "month")] -``` - -All `data.table` syntax are supported. However, `disk.frame` adds the ability to load only those columns required for the analysis using the `keep =` option. In the above analysis, only the `year` and `month` variables are required and hence `keep = c("year", "month")` was used. - -Alternatively, we can use the `srckeep` function to achieve the same, e.g. - -```r -srckeep(flights.df, c("year", "month"))[,.N, .(year, month)] -``` - -### External variables are captured - -`disk.frame` sends the computation to background workers which are essentially distinct and separate R sessions. Typically, the variables that you have available in your current R session aren't visible in the other R sessions, but `disk.frame` uses the `future` package's variable detection abilities to figure out which variables are in use and then send them to the background workers so they have access to the variables as well. E.g. - -```{r var_detect, dependson='setup_data_table'} -y = 42 -some_fn <- function(x) x - - -flights.df[,some_fn(y)] -``` - -In the above example, neither `some_fn` nor `y` are defined in the background workers' environments, but `disk.frame` still manages to evaluate this code `flights.df[,some_fn(y)]`. 
- -```{r clean_up, include=FALSE} -fs::dir_delete(file.path(tempdir(),"flights13")) -``` \ No newline at end of file diff --git a/vignettes/glm.Rmd b/vignettes/glm.Rmd deleted file mode 100644 index 1d5fb418..00000000 --- a/vignettes/glm.Rmd +++ /dev/null @@ -1,111 +0,0 @@ ---- -title: "Generalized Linear Models (GLM) including logistic regression with disk.frame" -output: rmarkdown::html_vignette -vignette: > - %\VignetteIndexEntry{Generalized Linear Models (logistic regression etc) with disk.frame} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -```{r, include = FALSE} -knitr::opts_chunk$set( - collapse = TRUE, - comment = "#>" -) -``` - -```{r setup, cache=TRUE} -suppressPackageStartupMessages(library(disk.frame)) - -if(interactive()) { - setup_disk.frame() -} else { - # only use 1 work to pass CRAN check - setup_disk.frame(1) -} - -``` - -# GLMs - -### Prerequisites -In this article, we will assume you are familiar with Generalized Linear Models (GLMs). You are also expected to have basic working knowledge of {`disk.frame`}, see this [{`disk.frame`} Quick Start](http://diskframe.com/articles/intro-disk-frame.html). - -## Introduction -One can fit a GLM using the `glm` function. For example, - -```{r glm, cache=TRUE} -m = glm(dist ~ speed, data = cars) -``` - -would fit a linear model on the data `cars` with `dist` as the target and `speed` as the explanatory variable. You can inspect the results of the model fit using - -```{r, depeondson='glm'} -summary(m) -``` - -or if you have `{broom}` installed - -```{r, depeondson='glm'} -broom::tidy(m) -``` - -With {`disk.frame`}, you can run GLM `dfglm` function, where the `df` stands for `disk.frame` of course! 
-```{r dependson='setup'} -cars.df = as.disk.frame(cars) - -m = dfglm(dist ~ speed, cars.df) - -summary(m) - - -majorv = as.integer(version$major) -minorv = as.integer(strsplit(version$minor, ".", fixed=TRUE)[[1]][1]) - -if((majorv == 3) & (minorv >= 6)) { - broom::tidy(m) -} else { - # broom doesn't work in version < R3.6 because biglm does not work -} - -``` - -The syntax didn't change at all! You are able to enjoy the benefits of `disk.frame` when dealing with larger-than-RAM data. - -## Logistic regression -Logistic regression is one of the most commonly deployed machine learning (ML) models. It is often used to build binary classification models - -```{r dependson='setup'} -iris.df = as.disk.frame(iris) - -# fit a logistic regression model to predict Speciess == "setosa" using all variables -all_terms_except_species = setdiff(names(iris.df), "Species") -formula_rhs = paste0(all_terms_except_species, collapse = "+") - -formula = as.formula(paste("Species == 'versicolor' ~ ", formula_rhs)) - -iris_model = dfglm(formula , data = iris.df, family=binomial()) - -# iris_model = dfglm(Species == "setosa" ~ , data = iris.df, family=binomial()) - -summary(iris_model) - -majorv = as.integer(version$major) -minorv = as.integer(strsplit(version$minor, ".", fixed=TRUE)[[1]][1]) - -if((majorv == 3) & (minorv >= 6)) { - broom::tidy(iris_model) -} else { - # broom doesn't work in version < R3.6 because biglm does not work -} - -``` - -The arguments to the `dfglm` function are the same as the arguments to `biglm::bigglm` which are based on the `glm` function. Please check their documentations for other argument options. - -## Notes -`{disk.frame}` uses `{biglm}` and `{speedglm}` as the backend for GLMs. Unfortunately, neither package is managed on open-source platforms, so it's more difficult to contribute to them by making bug fixes and submitting bug reports. So bugs are likely to persists. There is an active effort on `disk.frame` to look for alternatives. 
Example of avenues to explore include tighter integration with `{keras}`, h2o, or Julia's OnlineStats.jl for model fit purposes. - -Another package for larger-than-RAM glm fitting, `{bigFastlm}`, has been taken off CRAN, it is managed on Github. - -Currently, parallel processing of GLM fit are not possible with {`disk.frame`}. diff --git a/vignettes/ingesting-data.Rmd b/vignettes/ingesting-data.Rmd deleted file mode 100644 index 541d870a..00000000 --- a/vignettes/ingesting-data.Rmd +++ /dev/null @@ -1,174 +0,0 @@ ---- -title: "Ingesting Data" -author: "ZJ" -output: rmarkdown::html_vignette -vignette: > - %\VignetteIndexEntry{Ingesting data including CSVs} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -```{r, include = FALSE} -knitr::opts_chunk$set( - collapse = TRUE, - comment = "#>" -) -``` - -# Ingesting Data - -One of the most important tasks to perform before using the `{disk.frame}` package is to make some `disk.frame`s! There are a few functions to help you do that. Before we do that, we set up the `{disk.frame}` as usual - -**Setting up** - -```r -library(disk.frame) - -# set-up disk.frame to use multiple workers -if(interactive()) { - setup_disk.frame() - # highly recommended, however it is pun into interactive() for CRAN because - # change user options are not allowed on CRAN - options(future.globals.maxSize = Inf) -} else { - setup_disk.frame(2) -} - -``` - -## Convert a `data.frame` to `disk.frame` -Firstly, there is `as.disk.frame()` which allows you to make a `disk.frame` from a `data.frame`, e.g. - -```r -flights.df = as.disk.frame(nycflights13::flights) -``` - -will convert the `nycflights13::flights` `data.frame` to a `disk.frame` somewhere in `tempdir()`. 
To find out the location of the `disk.frame` use: - -```r -attr(flights.df, "path") -``` - -You can also specify a location to output the `disk.frame` to using `outdir` - -```r -flights.df = as.disk.frame(nycflights13::flights, outdir = "some/path.df") -``` - -it is recommended that you use `.df` as the extension for a `disk.frame`, however this is not an enforced requirement. - -However, one of the reasons for `disk.frame` to exist is to handle larger-than-RAM files, hence `as.disk.frame` is not all that useful because it can only convert data that can fit into RAM. `disk.frame` comes with a couple more ways to create `disk.frame`. - -## Creating `disk.frame` from CSVs -The function `csv_to_disk.frame` can convert CSV files to `disk.frame`. The most basic usage is - -```r -some.df = csv_to_disk.frame("some/path.csv", outdir = "some.df") -``` - -this will convert the CSV file `"some/path.csv"` to a `disk.frame`. - -## Multiple CSV files - -However, sometimes we have multiple CSV files that you want to read in and row-bind into one large `disk.frame`. You can do so by supplying a vector of file paths e.g. from the result of `list.files` - -```r -some.df = csv_to_disk.frame(c("some/path/file1.csv", "some/path/file2.csv")) - -# or -some.df = csv_to_disk.frame(list.files("some/path")) -``` - -## Ingesting CSV files chunk-wise -The `csv_to_disk.frame(path, ...)` function reads the file located at `path` in full into RAM but sometimes the CSV file may be too large to read in one go, as that would require loading the whole file into RAM. In that case, you can read the files chunk-by-chunk by using the `in_chunk_size` argument which controls how many rows you read in per chunk - -```r -# to read in 1 million (=1e6) rows per chunk -csv_to_disk.frame(path, in_chunk_size = 1e6) -``` - -When `in_chunk_size` is specified, the input file is split into many smaller files using `bigreadr`'s split file functions. 
This is generally the fastest way to ingest large CSVs, as the split files can be processed in parallel using all CPU cores. But the disk space requirement is doubled because the split files are as large as the original file. If you run out of disk space, then you must clean R's temporary folder at `tempdir()` and choose another `chunk_reader` e.g. `csv_to_disk.frame(..., chunk_reader = "LaF")`. - -## Sharding -One of the most important aspects of `disk.frame` is sharding. One can shard a `disk.frame` at read time by using the `shardby` - -```r -csv_to_disk.frame(path, shardby = "id") -``` - -In the above case, all rows with the same `id` values will end up in the same chunk. - - -## Just-in-time transformation -Sometimes, one may wish to perform some transformation on the CSV before writing out to disk. One can use the `inmapfn` argument to do that. The `inmapfn` name comes from INput MAPping FuNction. The general usage pattern is as follows: - -```r -csv_to_disk.frame(file.path(tempdir(), "df.csv"), inmapfn = function(chunk) { - some_transformation(chunk) -}) -``` - -As a contrived example, suppose you wish to convert a string into date at read time: - -```r -df = data.frame(date_str = c("2019-01-02", "2019-01-02")) - -# write the data.frame -write.csv(df, file.path(tempdir(), "df.csv")) - - -# this would show that date_str is a string -str(collect(csv_to_disk.frame(file.path(tempdir(), "df.csv")))$date_str) -## chr [1:2] "2019-01-02" "2019-01-02" - -# this would show that date_str is a string -df = csv_to_disk.frame(file.path(tempdir(), "df.csv"), inmapfn = function(chunk) { - # convert to date_str to date format and store as "date" - chunk[, date := as.Date(date_str, "%Y-%m-%d")] - chunk[, date_str:=NULL] -}) - -str(collect(df)$date) -## Date[1:2], format: "2019-01-02" "2019-01-02" -``` - -## Reading CSVs from zip files -Often, CSV comes zipped in a zip files. 
You can use the `zip_to_disk.frame` to convert all CSVs within a zip file - -```r -zip_to_disk.frame(path_to_zip_file) -``` - -The arguments for `zip_to_disk.frame` are the same as `csv_to_disk.frame`'s. - - -## Using `add_chunk` - -What if the method of converting to a `disk.frame` isn't implemented in `disk.frame` yet? One can use some lower level constructs provided by `disk.frame` to create `disk.frame`s. For example, the `add_chunk` function can be used to add more chunks to a `disk.frame`, e.g. - -```r -a.df = disk.frame() # create an empty disk.frame -add_chunk(a.df, cars) # adds cars as chunk 1 -add_chunk(a.df, cars) # adds cars as chunk 2 -``` - -Another example of using `add_chunk` is via `readr`'s chunked read functions to create a delimited file reader - -```r -delimited_to_disk.frame <- function(file, outdir, ...) { - res.df = disk.frame(outdir, ...) - readr::read_delim_chunked(file, callback = function(chunk) { - add_chunk(res.df, chunk) - }, ...) - - res.df -} - -delimited_to_disk.frame(path, outdir = "some.df") -``` - -The above code uses `readr`'s `read_delim_chunked` function to read `file` and call `add_chunk`. The problem with this approach is that is it sequential in nature and hence is not able to take advantage of parallelism. - -## Exploiting the structure of a disk.frame - -Of course, a `disk.frame` is just a folder with many `fst` files named as `1.fst`, `2.fst` etc. So one can simply create these `fst` files and ensure they have the same variable names and put them in a folder. 
\ No newline at end of file diff --git a/vignettes/intro-disk-frame.Rmd b/vignettes/intro-disk-frame.Rmd deleted file mode 100644 index c3d364fb..00000000 --- a/vignettes/intro-disk-frame.Rmd +++ /dev/null @@ -1,378 +0,0 @@ ---- -title: "Quick Start: Basic Operations with nycflights13" -author: "ZJ" -output: rmarkdown::html_vignette -vignette: > - %\VignetteIndexEntry{Quick Start} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -```{r setup, include = FALSE} -suppressPackageStartupMessages(library(disk.frame)) -library(fst) -library(magrittr) -library(nycflights13) -library(dplyr) -library(data.table) - -# you need to run this for multi-worker support -# limit to 2 cores if not running interactively; most likely on CRAN -# set-up disk.frame to use multiple workers -if(interactive()) { - setup_disk.frame() - # highly recommended, however it is pun into interactive() for CRAN because - # change user options are not allowed on CRAN - options(future.globals.maxSize = Inf) -} else { - setup_disk.frame(2) -} - - -knitr::opts_chunk$set( - eval = FALSE, - collapse = TRUE, - comment = "#>", - include = TRUE -) -``` - -# Quick Start - replicating dplyr's tutorial on nycflight13 - -The [`disk.frame` package](https://github.com/xiaodaigh/disk.frame) aims to be the answer to the question: how do I manipulate structured tabular data that doesn't fit into Random Access Memory (RAM)? - -In a nutshell, `disk.frame` makes use of two simple ideas: - -1) split up a larger-than-RAM dataset into chunks and store each chunk in a separate file inside a folder and -2) provide a convenient API to manipulate these chunks - -`disk.frame` performs a similar role to distributed systems such as Apache Spark, Python's Dask, and Julia's JuliaDB.jl for *medium data* which are datasets that are too large for RAM but not quite large enough to qualify as *big data*. 
- -In this tutorial, we introduce `disk.frame`, address some common questions, and replicate the [sparklyr data manipulation tutorial](https://spark.rstudio.com/dplyr/) using `disk.frame` constructs. - -## Installation -Simply run - -```r -install.packages("disk.frame") # when CRAN ready -``` -or - -```r -devtools::install_github("xiaodaigh/disk.frame") -``` - -## Set-up `disk.frame` -`disk.frame` works best if it can process multiple data chunks in parallel. The best way to set-up `disk.frame` so that each CPU core runs a background worker is by using - -```r -setup_disk.frame() - -# this will allow unlimited amount of data to be passed from worker to worker -options(future.globals.maxSize = Inf) -``` - -The `setup_disk.frame()` function sets up background workers equal to the number of CPU cores available on your machine; please note that, by default, hyper-threaded cores are counted as one not two. - -Alternatively, one may specify the number of workers using `setup_disk.frame(workers = n)`. - -## Basic Data Operations with `disk.frame` - -The `disk.frame` package provides convenient functions to convert `data.frame`s and CSVs to `disk.frame`s. - -### Creating a `disk.frame` from `data.frame` -We convert a `data.frame` to `disk.frame` using the `as.data.frame` function. - -```{r asdiskframe, cache=TRUE} -library(nycflights13) -library(dplyr) -library(disk.frame) -library(data.table) - -# convert the flights data to a disk.frame and store the disk.frame in the folder -# "tmp_flights" and overwrite any content if needed -flights.df <- as.disk.frame( - flights, - outdir = file.path(tempdir(), "tmp_flights.df"), - overwrite = TRUE) -flights.df -``` -You should now see a folder called `tmp_flights` with some files in it, namely `1.fst`, `2.fst`... where each `fst` files is one chunk of the `disk.frame`. 
- - -### Creating a `disk.frame` from CSV -```{r} -library(nycflights13) -# write a csv -csv_path = file.path(tempdir(), "tmp_flights.csv") -data.table::fwrite(flights, csv_path) - -# load the csv into a disk.frame -df_path = file.path(tempdir(), "tmp_flights.df") -flights.df <- csv_to_disk.frame( - csv_path, - outdir = df_path, - overwrite = T) - -flights.df -``` - -If the CSV is too large to read in, then we can also use the `in_chunk_size` option to control how many rows to read in at once. For example, to read in the data 100,000 rows at a time: - -```{r} -library(nycflights13) -library(disk.frame) - -# write a csv -csv_path = file.path(tempdir(), "tmp_flights.csv") - -data.table::fwrite(flights, csv_path) - -df_path = file.path(tempdir(), "tmp_flights.df") - -flights.df <- csv_to_disk.frame( - csv_path, - outdir = df_path, - in_chunk_size = 100000) - -flights.df -``` - -`disk.frame` also has a function `zip_to_disk.frame` that can convert every CSV in a zip file to a `disk.frame`. - -### Simple `dplyr` verbs and lazy evaluation -```{r dfselect, dependson='asdiskframe', cache=TRUE} -flights.df1 <- select(flights.df, year:day, arr_delay, dep_delay) -flights.df1 -``` - -```{r dependson='dfselect'} -class(flights.df1) -``` - -The class of `flights.df1` is also a `disk.frame` after the `dplyr::select` transformation. Also, `disk.frame` operations are by default (and where possible) **lazy**, meaning they don't perform the operations right away. Instead, these functions wait until you call `collect`. Exceptions to this rule are the `*_join` operations, which evaluate *eagerly* under certain conditions--see **Joins for disk.frame in-depth** for details. - -For lazily constructed `disk.frame`s (e.g. `flights.df1`), the function `collect` can be used to bring the results from disk into R, e.g. 
-```{r, dependson='dfselect'} -collect(flights.df1) %>% head(2) -``` - -Of course, for larger-than-RAM datasets, one wouldn't call `collect` on the whole `disk.frame` (because why would you need `disk.frame` otherwise). More likely, one would call `collect` on a `filter`ed dataset or one summarized with `group_by`. - -Some examples of other dplyr verbs applied: - -```{r, dependson='asdiskframe'} -filter(flights.df, dep_delay > 1000) %>% collect %>% head(2) -``` - -```{r, dependson='asdiskframe'} -mutate(flights.df, speed = distance / air_time * 60) %>% collect %>% head(2) -``` - -### Examples of NOT fully supported `dplyr` verbs - -The `chunk_arrange` function arranges (sorts) each chunk but not the whole dataset. So use with caution. Similarly, `chunk_summarise` creates summary variables within each chunk and hence also needs to be used with caution. In the **Group-by** section, we demonstrate how to use `summarise` in the `disk.frame` context correctly with `hard_group_by`s. - -```{r, dependson='asdiskframe'} -# this only sorts within each chunk -chunk_arrange(flights.df, dplyr::desc(dep_delay)) %>% collect %>% head(2) -``` - - -```{r, dependson='asdiskframe'} -chunk_summarize(flights.df, mean_dep_delay = mean(dep_delay, na.rm =T)) %>% collect -``` - -### Piping - -One can chain `dplyr` verbs together like with a `data.frame` - -```{r, dependson='asdiskframe'} -c4 <- flights %>% - filter(month == 5, day == 17, carrier %in% c('UA', 'WN', 'AA', 'DL')) %>% - select(carrier, dep_delay, air_time, distance) %>% - mutate(air_time_hours = air_time / 60) %>% - collect %>% - arrange(carrier)# arrange should occur after `collect` - -c4 %>% head -``` - -### List of supported `dplyr` verbs - -```r -select -rename -filter -chunk_arrange # within each chunk -chunk_group_by # within each chunk -chunk_summarize # within each chunk -group_by # limited functions -summarize # limited functions -mutate -transmute -left_join -inner_join -full_join # careful. Performance! 
-semi_join -anit_join -``` - -## Sharding and distribution of chunks - -Like other distributed data manipulation frameworks, `disk.frame` utilizes the *sharding* concept to distribute the data into chunks. For example, "to shard by `cust_id`" means that all rows with the same `cust_id` will be stored in the same chunk. This enables `chunk_group_by` by `cust_id` to produce the same results as non-chunked data. - -The `by` variables that were used to shard the dataset are called the `shardkey`s. The *sharding* is performed by computing a deterministic hash on the shard keys (the `by` variables) for each row. The hash function produces an integer between `1` and `n`, where `n` is the number of chunks. - -## Group-by - -`{disk.frame}` implements the `group_by` operation with some caveats. In the `{disk.frame}` framework, only a subset of functions are supported in `summarize`. However, the user can create more custom `group-by` functions on the fly. - -```{r, dependson='asdiskframe'} -flights.df %>% - group_by(carrier) %>% # notice that hard_group_by needs to be set - summarize(count = n(), mean_dep_delay = mean(dep_delay, na.rm=T)) %>% # mean follows normal R rules - collect %>% - arrange(carrier) -``` - -## Restrict input columns for faster processing - -One can restrict which input columns to load into memory for each chunk; this can significantly increase the speed of data processing. To restrict the input columns, use the `srckeep` function which only accepts column names as a string vector. - -```{r, dependson='asdiskframe'} -flights.df %>% - srckeep(c("carrier","dep_delay")) %>% - group_by(carrier) %>% - summarize(count = n(), mean_dep_delay = mean(dep_delay, na.rm=T)) %>% # mean follows normal R rules - collect -``` - -Input column restriction is one of the most critical efficiencies provided by `disk.frame`. Because the underlying format allows random access to columns (i.e. 
retrieve only the columns used for processing), hence one can drastically reduce the amount of data loaded into RAM for processing by keeping only those columns that are directly used to produce the results. - -## Joins - -`disk.frame` supports many dplyr joins including: - -```r -left_join -inner_join -semi_join -inner_join -full_join # requires hard_group_by on both left and right -``` -In all cases, the left dataset (`x`) must be a `disk.frame`, and the right dataset (`y`) can be either a `disk.frame` or a `data.frame`. If the right dataset is a `disk.frame` and the `shardkey`s are different between the two `disk.frame`s then two expensive `hard` `group_by` operations are performed *eagerly*, one on the left `disk.frame` and one on the right `disk.frame` to perform the joins correctly. - -However, if the right dataset is a `data.frame` then `hard_group_by`s are only performed in the case of `full_join`. - -Note `disk.frame` does not support `right_join`. The user should use `left_join` instead. - -The below joins are performed *lazily* because `airlines.dt` is a `data.table` not a `disk.frame`: - -```{r airlines_dt, dependson='asdiskframe', cache=TRUE} -# make airlines a data.table -airlines.dt <- data.table(airlines) -# flights %>% left_join(airlines, by = "carrier") # -flights.df %>% - left_join(airlines.dt, by ="carrier") %>% - collect %>% - head -``` - -```{r, dependson='airlines_dt'} -flights.df %>% - left_join(airlines.dt, by = c("carrier", "carrier")) %>% - collect %>% - tail -``` - -## Window functions and arbitrary functions - -`{disk.frame}` supports all `data.frame` operations, unlike Spark which can only perform those operations that Spark has implemented. Hence windowing functions like `min_rank` and `rank` are supported out of the box. - -For the following example, we will use the `hard_group_by` which performs a group-by and also reorganises the chunks so that all records with the same `year`, `month`, and `day` end up in the same chunk. 
This is typically not advised, as `hard_group_by` can be slow for large datasets. - -```{r, dependson='asdiskframe'} -# Find the most and least delayed flight each day -bestworst <- flights.df %>% - srckeep(c("year","month","day", "dep_delay")) %>% - hard_group_by(c("year", "month", "day")) %>% - filter(dep_delay == min(dep_delay, na.rm = T) || dep_delay == max(dep_delay, na.rm = T)) %>% - collect - -bestworst %>% head -``` - -Another example: - -```{r, dependson='asdiskframe'} -ranked <- flights.df %>% - srckeep(c("year","month","day", "dep_delay")) %>% - hard_group_by(c("year", "month", "day")) %>% - filter(min_rank(desc(dep_delay)) <= 2 & dep_delay > 0) %>% - collect - -ranked %>% head -``` - -One more example: - -```{r, dependson='asdiskframe'} -# Rank each flight within a daily window -ranked <- flights.df %>% - srckeep(c("year","month","day", "dep_delay")) %>% - chunk_group_by(year, month, day) %>% - select(dep_delay) %>% - mutate(rank = rank(desc(dep_delay))) %>% - collect - -ranked %>% head -``` - - -## Arbitrary by-chunk processing - -One can apply arbitrary transformations to each chunk of the `disk.frame` by using the `delayed` function which evaluates lazily or the `map.disk.frame(lazy = F)` function which evaluates eagerly. For example to return the number of rows in each chunk: - -```{r, dependson='asdiskframe'} -flights.df1 <- delayed(flights.df, ~nrow(.x)) -collect_list(flights.df1) %>% head # returns number of rows for each data.frame in a list -``` -and to do the same with `map.disk.frame`: - -```{r, dependson='asdiskframe'} -map(flights.df, ~nrow(.x), lazy = F) %>% head -``` -The `map` function can also output the results to another disk.frame folder, e.g. 
- -```{r, dependson='asdiskframe'} -# return the first 10 rows of each chunk -flights.df2 <- map(flights.df, ~.x[1:10,], lazy = F, outdir = file.path(tempdir(), "tmp2"), overwrite = T) - -flights.df2 %>% head -``` - -Notice `{disk.frame}` supports the `purrr` syntax for defining a function using `~`. - -## Sampling - -In the `disk.frame` framework, sampling a proportion of rows within each chunk can be performed using `sample_frac`. - -```{r, dependson='asdiskframe'} -flights.df %>% sample_frac(0.01) %>% collect %>% head -``` - -## Writing Data - -One can output a `disk.frame` by using the `write_disk.frame` function. E.g. - -```r -write_disk.frame(flights.df, outdir="out") -``` -this will output a disk.frame to the folder "out" - -```{r cleanup} -fs::dir_delete(file.path(tempdir(), "tmp_flights.df")) -fs::dir_delete(file.path(tempdir(), "tmp2")) -fs::file_delete(file.path(tempdir(), "tmp_flights.csv")) -``` From 511f76b2695decb3b902752f393da197474f116f Mon Sep 17 00:00:00 2001 From: evalparse Date: Thu, 13 Aug 2020 14:13:12 +1000 Subject: [PATCH 3/8] Update README.Rmd --- README.Rmd | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/README.Rmd b/README.Rmd index af17f732..480291fb 100644 --- a/README.Rmd +++ b/README.Rmd @@ -270,17 +270,19 @@ The work priorities at this stage are ## Blogs and other resources -| Title | Language | Author | Date | Description | -| ------------------------------------------------------------ | -------- | --------------- | ---------- | ------------------------------------------------------------ | -| [25 days of disk.frame](https://twitter.com/evalparse/status/1200963268270886912) | English | ZJ | 2019-12-01 | 25 tweets about `{disk.frame}` | -| https://www.researchgate.net/post/What_is_the_Maximum_size_of_data_that_is_supported_by_R-datamining | English | Knut Jägersberg | 2019-11-11 | Great answer on using disk.frame | -| [`{disk.frame}` is 
epic](https://www.brodrigues.co/blog/2019-09-03-disk_frame/) | English | Bruno Rodriguez | 2019-09-03 | It's about loading a 30G file into `{disk.frame}` | -| [My top 10 R packages for data analytics](https://www.actuaries.digital/2019/09/26/my-top-10-r-packages-for-data-analytics/) | English | Jacky Poon | 2019-09-03 | `{disk.frame}` was number 3 | -| [useR! 2019 presentation video](https://www.youtube.com/watch?v=3XMTyi_H4q4) | English | Dai ZJ | 2019-08-03 | | -| [useR! 2019 presentation slides](https://www.beautiful.ai/player/-LphQ0YaJwRektb8nZoY) | English | Dai ZJ | 2019-08-03 | | +| Title | Language | Author | Date | Description | +|---------------------------------------------------------------------------------------------------------------------------------------|----------|-----------------|------------|----------------------------------------------------------------------------------------------------| +| [useR! 2020 Rladies San Diego {disk.frame} tutorial](https://www.youtube.com/watch?v=kjPjXs0mkwE) | English | Dai ZJ | 2020 | A 2 hour {disk.frame} tutorial for beginners | +| [25 days of disk.frame](https://twitter.com/evalparse/status/1200963268270886912) | English | ZJ | 2019-12-01 | 25 tweets about `{disk.frame}` | +| https://www.researchgate.net/post/What_is_the_Maximum_size_of_data_that_is_supported_by_R-datamining | English | Knut Jägersberg | 2019-11-11 | Great answer on using disk.frame | +| [`{disk.frame}` is epic](https://www.brodrigues.co/blog/2019-09-03-disk_frame/) | English | Bruno Rodriguez | 2019-09-03 | It's about loading a 30G file into `{disk.frame}` | +| [My top 10 R packages for data analytics](https://www.actuaries.digital/2019/09/26/my-top-10-r-packages-for-data-analytics/) | English | Jacky Poon | 2019-09-03 | `{disk.frame}` was number 3 | +| [useR! 2019 presentation video](https://www.youtube.com/watch?v=3XMTyi_H4q4) | English | Dai ZJ | 2019-08-03 | | +| [useR! 
2019 presentation slides](https://www.beautiful.ai/player/-LphQ0YaJwRektb8nZoY) | English | Dai ZJ | 2019-08-03 | | | [Split-apply-combine for Maximum Likelihood Estimation of a linear model](https://www.brodrigues.co/blog/2019-10-05-parallel_maxlik/) | English | Bruno Rodriguez | 2019-10-06 | `{disk.frame}` used in helping to create a maximum likelihood estimation program for linear models | -| [Emma goes to useR! 2019](https://emmavestesson.netlify.com/2019/07/user2019/) | English | Emma Vestesson | 2019-07-16 | The first mention of `{disk.frame}` in a blog post | -| [深入对比数据科学工具箱:Python3 和 R 之争(2020版)](https://segmentfault.com/a/1190000021653567) | Chinese | Harry Zhu | 2020-02-16 | Mentions disk.frame | +| [Emma goes to useR! 2019](https://emmavestesson.netlify.com/2019/07/user2019/) | English | Emma Vestesson | 2019-07-16 | The first mention of `{disk.frame}` in a blog post | +| [深入对比数据科学工具箱:Python3 和 R 之争(2020版)](https://segmentfault.com/a/1190000021653567) | Chinese | Harry Zhu | 2020-02-16 | Mentions disk.frame | + ### Interested in learning `{disk.frame}` in a structured course? From a3da2c1ffe2dc9d547cce9a71f289807de3eab14 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Wed, 7 Oct 2020 01:58:47 +1100 Subject: [PATCH 4/8] trouble shooting guide --- book/88-trouble-shooting.Rmd | 53 ++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 book/88-trouble-shooting.Rmd diff --git a/book/88-trouble-shooting.Rmd b/book/88-trouble-shooting.Rmd new file mode 100644 index 00000000..f9cbca51 --- /dev/null +++ b/book/88-trouble-shooting.Rmd @@ -0,0 +1,53 @@ +--- +title: "Trouble shooting" +author: "ZJ" +output: pdf_document +--- + +```{r include=FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + eval=TRUE, + include=TRUE +) +``` + +### Steps to trouble shoot + +1. I suggest updating {future} and your R version if you have not already done so. + +2. Are you able to share the data? + +3. 
Do a good MWE +``` +library(disk.frame) +setup_disk.frame() + +df<-as.disk.frame(a) + + +df1 = mutate(df, date = as.Date(as.character(datadate), format="%Y%m%d")) + +head(df1) +``` + + +3. Check if your virus scanner is blocking interprocess communication + +4. Try to apply the function to just one chunk, perhaps there is a syntax error or column error? If one chunk works then you can rule out coding error + +``` +get_chunk(df, 1) %>% + mutate(date = as.Date(as.character(datadate), format="%Y%m%d")) +``` + +5. Set the number of workers to 1, so there is no more inter-process communication. Does it work now? If it does, then it's the inter process communication. You might need to contact your admin for help + +``` +setup_disk.frame(workers=1) +mutate(df, date = as.Date(as.character(datadate), format="%Y%m%d")) +As an MWE this works for me. + +a = data.frame(datadate = rep("20201007", 3e6)) +``` From 34bafaab2d60d3251fe43d4997fa27aecbab975b Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Thu, 18 Feb 2021 10:56:31 +1100 Subject: [PATCH 5/8] updated read me --- README.Rmd | 21 +++++++++---------- README.md | 61 +++++++++++++++++++++++++++--------------------------- 2 files changed, 40 insertions(+), 42 deletions(-) diff --git a/README.Rmd b/README.Rmd index 480291fb..b410c36e 100644 --- a/README.Rmd +++ b/README.Rmd @@ -270,18 +270,17 @@ The work priorities at this stage are ## Blogs and other resources -| Title | Language | Author | Date | Description | -|---------------------------------------------------------------------------------------------------------------------------------------|----------|-----------------|------------|----------------------------------------------------------------------------------------------------| -| [useR! 
2020 Rladies San Diego {disk.frame} tutorial](https://www.youtube.com/watch?v=kjPjXs0mkwE) | English | Dai ZJ | 2020 | A 2 hour {disk.frame} tutorial for beginners | -| [25 days of disk.frame](https://twitter.com/evalparse/status/1200963268270886912) | English | ZJ | 2019-12-01 | 25 tweets about `{disk.frame}` | -| https://www.researchgate.net/post/What_is_the_Maximum_size_of_data_that_is_supported_by_R-datamining | English | Knut Jägersberg | 2019-11-11 | Great answer on using disk.frame | -| [`{disk.frame}` is epic](https://www.brodrigues.co/blog/2019-09-03-disk_frame/) | English | Bruno Rodriguez | 2019-09-03 | It's about loading a 30G file into `{disk.frame}` | -| [My top 10 R packages for data analytics](https://www.actuaries.digital/2019/09/26/my-top-10-r-packages-for-data-analytics/) | English | Jacky Poon | 2019-09-03 | `{disk.frame}` was number 3 | -| [useR! 2019 presentation video](https://www.youtube.com/watch?v=3XMTyi_H4q4) | English | Dai ZJ | 2019-08-03 | | -| [useR! 2019 presentation slides](https://www.beautiful.ai/player/-LphQ0YaJwRektb8nZoY) | English | Dai ZJ | 2019-08-03 | | +| Title | Language | Author | Date | Description | +| ------------------------------------------------------------ | -------- | --------------- | ---------- | ------------------------------------------------------------ | +| [25 days of disk.frame](https://twitter.com/evalparse/status/1200963268270886912) | English | ZJ | 2019-12-01 | 25 tweets about `{disk.frame}` | +| https://www.researchgate.net/post/What-is-the-Maximum-size-of-data-that-is-supported-by-R-datamining | English | Knut Jägersberg | 2019-11-11 | Great answer on using disk.frame | +| [`{disk.frame}` is epic](https://www.brodrigues.co/blog/2019-09-03-disk_frame/) | English | Bruno Rodriguez | 2019-09-03 | It's about loading a 30G file into `{disk.frame}` | +| [My top 10 R packages for data analytics](https://www.actuaries.digital/2019/09/26/my-top-10-r-packages-for-data-analytics/) | English | Jacky Poon | 
2019-09-03 | `{disk.frame}` was number 3 | +| [useR! 2019 presentation video](https://www.youtube.com/watch?v=3XMTyi_H4q4) | English | Dai ZJ | 2019-08-03 | | +| [useR! 2019 presentation slides](https://www.beautiful.ai/player/-LphQ0YaJwRektb8nZoY) | English | Dai ZJ | 2019-08-03 | | | [Split-apply-combine for Maximum Likelihood Estimation of a linear model](https://www.brodrigues.co/blog/2019-10-05-parallel_maxlik/) | English | Bruno Rodriguez | 2019-10-06 | `{disk.frame}` used in helping to create a maximum likelihood estimation program for linear models | -| [Emma goes to useR! 2019](https://emmavestesson.netlify.com/2019/07/user2019/) | English | Emma Vestesson | 2019-07-16 | The first mention of `{disk.frame}` in a blog post | -| [深入对比数据科学工具箱:Python3 和 R 之争(2020版)](https://segmentfault.com/a/1190000021653567) | Chinese | Harry Zhu | 2020-02-16 | Mentions disk.frame | +| [Emma goes to useR! 2019](https://emmavestesson.netlify.app/2019/07/user2019/) | English | Emma Vestesson | 2019-07-16 | The first mention of `{disk.frame}` in a blog post | +| [深入对比数据科学工具箱:Python3 和 R 之争(2020版)](https://segmentfault.com/a/1190000021653567) | Chinese | Harry Zhu | 2020-02-16 | Mentions disk.frame | diff --git a/README.md b/README.md index 343f76bf..d7e33912 100644 --- a/README.md +++ b/README.md @@ -8,13 +8,13 @@ How do I manipulate tabular data that doesn’t fit into Random Access Memory (RAM)? -Use `{disk.frame}`\! +Use `{disk.frame}`! In a nutshell, `{disk.frame}` makes use of two simple ideas -1) split up a larger-than-RAM dataset into chunks and store each chunk +1. split up a larger-than-RAM dataset into chunks and store each chunk in a separate file inside a folder and -2) provide a convenient API to manipulate these chunks +2. 
provide a convenient API to manipulate these chunks `{disk.frame}` performs a similar role to distributed systems such as Apache Spark, Python’s Dask, and Julia’s JuliaDB.jl for *medium data* @@ -48,28 +48,28 @@ install.packages("disk.frame", repo="https://cran.rstudio.com") Please see these vignettes and articles about `{disk.frame}` - - [Quick start: +- [Quick start: `{disk.frame}`](https://diskframe.com/articles/intro-disk-frame.html) which replicates the `sparklyr` vignette for manipulating the `nycflights13` flights data. - - [Ingesting data into +- [Ingesting data into `{disk.frame}`](https://diskframe.com/articles/ingesting-data.html) which lists some commons way of creating disk.frames - - [`{disk.frame}` can be more - epic\!](https://diskframe.com/articles/more-epic.html) shows some +- [`{disk.frame}` can be more + epic!](https://diskframe.com/articles/more-epic.html) shows some ways of loading large CSVs and the importance of `srckeep` - - [Group-by](https://diskframe.com/articles/group-by.html) the various +- [Group-by](https://diskframe.com/articles/group-by.html) the various types of group-bys - - [Custom one-stage group-by +- [Custom one-stage group-by functions](https://diskframe.com/articles/custom-group-by.html) how to define custom one-stage group-by functions - - [Fitting GLMs (including logistic +- [Fitting GLMs (including logistic regression)](https://diskframe.com/articles/glm.html) introduces the `dfglm` function for fitting generalized linear models - - [Using data.table syntax with +- [Using data.table syntax with disk.frame](https://diskframe.com/articles/data-table-syntax.html) - - [disk.frame concepts](https://diskframe.com/articles/concepts.html) - - [Benchmark 1: disk.frame vs Dask vs +- [disk.frame concepts](https://diskframe.com/articles/concepts.html) +- [Benchmark 1: disk.frame vs Dask vs JuliaDB](https://diskframe.com/articles/vs-dask-juliadb.html) ## Common questions @@ -256,7 +256,7 @@ limitation that function that depend on the 
order a column can only be obtained using estimated methods. | Function | Exact/Estimate | Notes | -| ------------ | -------------- | ------------------------------------------ | +|--------------|----------------|--------------------------------------------| | `min` | Exact | | | `max` | Exact | | | `mean` | Exact | | @@ -324,7 +324,7 @@ To find out where the disk.frame is stored on disk: ``` r # where is the disk.frame stored attr(flights.df, "path") -#> [1] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpsnJlFJ\\file3d3ce978e3.df" +#> [1] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpMpwOj2\\file4bb025d75c63.df" ``` A number of data.frame functions are implemented for disk.frame @@ -387,16 +387,16 @@ The work priorities at this stage are ## Blogs and other resources | Title | Language | Author | Date | Description | -| ------------------------------------------------------------------------------------------------------------------------------------- | -------- | --------------- | ---------- | -------------------------------------------------------------------------------------------------- | +|---------------------------------------------------------------------------------------------------------------------------------------|----------|-----------------|------------|----------------------------------------------------------------------------------------------------| | [25 days of disk.frame](https://twitter.com/evalparse/status/1200963268270886912) | English | ZJ | 2019-12-01 | 25 tweets about `{disk.frame}` | -| | English | Knut Jägersberg | 2019-11-11 | Great answer on using disk.frame | +| | English | Knut Jägersberg | 2019-11-11 | Great answer on using disk.frame | | [`{disk.frame}` is epic](https://www.brodrigues.co/blog/2019-09-03-disk_frame/) | English | Bruno Rodriguez | 2019-09-03 | It’s about loading a 30G file into `{disk.frame}` | | [My top 10 R packages for data 
analytics](https://www.actuaries.digital/2019/09/26/my-top-10-r-packages-for-data-analytics/) | English | Jacky Poon | 2019-09-03 | `{disk.frame}` was number 3 | -| [useR\! 2019 presentation video](https://www.youtube.com/watch?v=3XMTyi_H4q4) | English | Dai ZJ | 2019-08-03 | | -| [useR\! 2019 presentation slides](https://www.beautiful.ai/player/-LphQ0YaJwRektb8nZoY) | English | Dai ZJ | 2019-08-03 | | +| [useR! 2019 presentation video](https://www.youtube.com/watch?v=3XMTyi_H4q4) | English | Dai ZJ | 2019-08-03 | | +| [useR! 2019 presentation slides](https://www.beautiful.ai/player/-LphQ0YaJwRektb8nZoY) | English | Dai ZJ | 2019-08-03 | | | [Split-apply-combine for Maximum Likelihood Estimation of a linear model](https://www.brodrigues.co/blog/2019-10-05-parallel_maxlik/) | English | Bruno Rodriguez | 2019-10-06 | `{disk.frame}` used in helping to create a maximum likelihood estimation program for linear models | -| [Emma goes to useR\! 2019](https://emmavestesson.netlify.com/2019/07/user2019/) | English | Emma Vestesson | 2019-07-16 | The first mention of `{disk.frame}` in a blog post | -| [深入对比数据科学工具箱:Python3 和 R 之争(2020版)](https://segmentfault.com/a/1190000021653567) | Chinese | Harry Zhu | 2020-02-16 | Mentions disk.frame | +| [Emma goes to useR! 2019](https://emmavestesson.netlify.app/2019/07/user2019/) | English | Emma Vestesson | 2019-07-16 | The first mention of `{disk.frame}` in a blog post | +| [深入对比数据科学工具箱:Python3 和 R 之争(2020版)](https://segmentfault.com/a/1190000021653567) | Chinese | Harry Zhu | 2020-02-16 | Mentions disk.frame | ### Interested in learning `{disk.frame}` in a structured course? @@ -412,7 +412,7 @@ perhaps you have a feature request? Please consider sponsoring ### Backers -Thank you to all our backers\! +Thank you to all our backers! @@ -427,7 +427,7 @@ show up here with a link to your website. 
**Do you need help with machine learning and data science in R, Python, or Julia?** I am available for Machine Learning/Data -Science/R/Python/Julia consulting\! [Email +Science/R/Python/Julia consulting! [Email me](mailto:dzj@analytixware.com) ## Non-financial ways to contribute @@ -435,16 +435,16 @@ me](mailto:dzj@analytixware.com) Do you wish to give back the open-source community in non-financial ways? Here are some ways you can contribute - - Write a blogpost about your `{disk.frame}`. I would love to learn - more about how `{disk.frame}` has helped you - - Tweet or post on social media (e.g LinkedIn) about `{disk.frame}` to +- Write a blogpost about your `{disk.frame}` usage or experience. I + would love to learn more about how `{disk.frame}` has helped you +- Tweet or post on social media (e.g LinkedIn) about `{disk.frame}` to help promote it - - Bring attention to typos and grammatical errors by correcting and +- Bring attention to typos and grammatical errors by correcting and making a PR. Or simply by [raising an issue here](https://github.com/xiaodaigh/disk.frame/issues) - - Star the [`{disk.frame}` Github +- Star the [`{disk.frame}` Github repo](https://github.com/xiaodaigh/disk.frame) - - Star any repo that `{disk.frame}` depends on +- Star any repo that `{disk.frame}` depends on e.g. [`{fst}`](https://github.com/fstpackage/fst) and [`{future}`](https://github.com/HenrikBengtsson/future) @@ -452,7 +452,6 @@ ways? Here are some ways you can contribute - @@ -467,4 +466,4 @@ status](https://ci.appveyor.com/api/projects/status/github/xiaodaigh/disk.frame? 
## Live Stream of `{disk.frame}` development - - +- From e9af01639a28e4b1a5288a0b03a54c73eb94f8c8 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Sun, 9 May 2021 20:07:31 +1000 Subject: [PATCH 6/8] updated --- CRAN-RELEASE | 4 +- DESCRIPTION | 4 +- NAMESPACE | 2 - NEWS.md | 6 + R/dplyr_verbs.r | 8 +- R/recommend_nchunks.r | 16 +- README.md | 45 +- book/01-intro.Rmd | 2 +- book/06-vs-dask-juliadb.Rmd | 2 +- cran-comments.md | 12 +- docs/404.html | 11 +- docs/LICENSE-text.html | 11 +- docs/articles/01-intro.html | 13 +- .../header-attrs-2.8/header-attrs.js | 12 + docs/articles/02-intro-disk-frame.html | 502 +++++++++--------- .../header-attrs-2.8/header-attrs.js | 12 + docs/articles/03-concepts.html | 39 +- .../header-attrs-2.8/header-attrs.js | 12 + docs/articles/04-ingesting-data.html | 110 ++-- .../header-attrs-2.8/header-attrs.js | 12 + docs/articles/05-data-table-syntax.html | 57 +- .../header-attrs-2.8/header-attrs.js | 12 + docs/articles/06-vs-dask-juliadb.html | 166 +++--- .../figure-html/unnamed-chunk-2-1.png | Bin 15944 -> 28103 bytes .../figure-html/unnamed-chunk-3-1.png | Bin 16175 -> 27085 bytes .../header-attrs-2.8/header-attrs.js | 12 + docs/articles/07-glm.html | 93 ++-- .../header-attrs-2.8/header-attrs.js | 12 + docs/articles/08-more-epic.html | 203 +++---- .../figure-html/unnamed-chunk-2-1.png | Bin 19434 -> 38731 bytes .../figure-html/unnamed-chunk-3-1.png | Bin 14286 -> 32146 bytes .../header-attrs-2.8/header-attrs.js | 12 + docs/articles/09-convenience-features.html | 40 +- .../header-attrs-2.8/header-attrs.js | 12 + docs/articles/10-group-by.html | 149 +++--- .../header-attrs-2.8/header-attrs.js | 12 + docs/articles/11-custom-group-by.html | 155 +++--- .../header-attrs-2.8/header-attrs.js | 12 + docs/articles/88-trouble-shooting.html | 186 +++++++ .../header-attrs-2.8/header-attrs.js | 12 + docs/articles/index.html | 13 +- docs/authors.html | 11 +- docs/index.html | 247 +++++---- docs/news/index.html | 47 +- docs/pkgdown.css | 4 +- 
docs/pkgdown.yml | 7 +- docs/reference/add_chunk.html | 49 +- docs/reference/as.data.frame.disk.frame.html | 21 +- docs/reference/as.data.table.disk.frame.html | 24 +- docs/reference/as.disk.frame.html | 42 +- docs/reference/chunk_group_by.html | 19 +- docs/reference/cmap.html | 162 +++--- docs/reference/cmap2.html | 29 +- docs/reference/collect.html | 39 +- docs/reference/colnames.html | 33 +- docs/reference/compute.disk.frame.html | 36 +- docs/reference/create_chunk_mapper.html | 91 ++-- docs/reference/csv_to_disk.frame.html | 54 +- docs/reference/delete.html | 18 +- docs/reference/df_ram_size.html | 16 +- docs/reference/dfglm.html | 38 +- docs/reference/disk.frame.html | 29 +- docs/reference/dplyr_verbs.html | 62 ++- docs/reference/evalparseglue.html | 13 +- docs/reference/foverlaps.disk.frame.html | 57 +- docs/reference/gen_datatable_synthetic.html | 13 +- docs/reference/get_chunk.html | 32 +- docs/reference/get_chunk_ids.html | 37 +- docs/reference/group_by.html | 21 +- docs/reference/groups.disk.frame.html | 13 +- docs/reference/hard_arrange.html | 45 +- docs/reference/hard_group_by.html | 53 +- docs/reference/head_tail.html | 26 +- docs/reference/index.html | 19 +- docs/reference/is_disk.frame.html | 22 +- docs/reference/join.html | 175 +++--- docs/reference/make_glm_streaming_fn.html | 40 +- docs/reference/merge.disk.frame.html | 42 +- docs/reference/move_to.html | 26 +- docs/reference/nchunks.html | 28 +- docs/reference/ncol_nrow.html | 28 +- docs/reference/one-stage-group-by-verbs.html | 67 +-- docs/reference/overwrite_check.html | 22 +- docs/reference/print.disk.frame.html | 13 +- docs/reference/pull.disk.frame.html | 13 +- docs/reference/rbindlist.disk.frame.html | 39 +- docs/reference/rechunk.html | 46 +- docs/reference/recommend_nchunks.html | 31 +- docs/reference/remove_chunk.html | 36 +- docs/reference/sample.html | 72 +-- docs/reference/setup_disk.frame.html | 39 +- docs/reference/shard.html | 41 +- docs/reference/shardkey.html | 13 +- 
docs/reference/shardkey_equal.html | 13 +- docs/reference/show_ceremony.html | 19 +- docs/reference/srckeep.html | 23 +- docs/reference/sub-.disk.frame.html | 21 +- docs/reference/tbl_vars.disk.frame.html | 15 +- docs/reference/write_disk.frame.html | 53 +- docs/reference/zip_to_disk.frame.html | 45 +- man/dplyr_verbs.Rd | 3 - tests/testthat.R | 4 - tests/testthat/test-Rcpp.R | 6 - tests/testthat/test-add-chunk.r | 65 --- tests/testthat/test-anti_join.R | 75 --- tests/testthat/test-as-data-frame.R | 14 - tests/testthat/test-as-disk-frame.R | 15 - tests/testthat/test-bloom-filter.r | 16 - tests/testthat/test-collect.R | 57 -- tests/testthat/test-compute.r | 55 -- tests/testthat/test-csv2disk.frame.r | 84 --- tests/testthat/test-data-table.r | 41 -- tests/testthat/test-delete.r | 20 - tests/testthat/test-disk-frame.r | 34 -- tests/testthat/test-dplyr-verbs.r | 175 ------ tests/testthat/test-dtplyr-support.r | 46 -- tests/testthat/test-foverlaps.r | 42 -- tests/testthat/test-full_join.R | 58 -- tests/testthat/test-get_chunk.r | 17 - tests/testthat/test-get_chunk_ids.r | 20 - tests/testthat/test-glm.r | 30 -- tests/testthat/test-group-by.R | 309 ----------- tests/testthat/test-hard-arrange.R | 111 ---- tests/testthat/test-inner_join.R | 64 --- tests/testthat/test-is-disk.frame.r | 12 - tests/testthat/test-left_join.R | 71 --- tests/testthat/test-map.r | 76 --- tests/testthat/test-map2.r | 46 -- tests/testthat/test-merge.r | 59 -- tests/testthat/test-names.r | 25 - tests/testthat/test-nchunks.r | 18 - tests/testthat/test-nrow-ncol.R | 20 - tests/testthat/test-overwrite_check.r | 21 - tests/testthat/test-pls-add.r | 44 -- tests/testthat/test-print.r | 14 - tests/testthat/test-pull.r | 50 -- tests/testthat/test-rbindlist.r | 31 -- tests/testthat/test-rechunk.r | 82 --- tests/testthat/test-recommend_nchunk.R | 16 - tests/testthat/test-remove_chunk.r | 18 - tests/testthat/test-right_join.r | 7 - tests/testthat/test-sample_frac.r | 20 - tests/testthat/test-sample_n.r | 16 - 
tests/testthat/test-sas_to_csv.r | 8 - tests/testthat/test-sas_to_disk.frame.r | 10 - tests/testthat/test-semi_join.R | 72 --- tests/testthat/test-setup.r | 7 - tests/testthat/test-shard.r | 28 - tests/testthat/test-shardkey.r | 17 - tests/testthat/test-srckeep.r | 17 - tests/testthat/test-tbl_vars.r | 16 - tests/testthat/test-util.r | 8 - tests/testthat/test-write_disk.frame.R | 39 -- tests/testthat/test-zip_to_disk.frame.r | 7 - utils/build_utils.R | 20 +- 155 files changed, 2591 insertions(+), 4047 deletions(-) create mode 100644 docs/articles/01-intro_files/header-attrs-2.8/header-attrs.js create mode 100644 docs/articles/02-intro-disk-frame_files/header-attrs-2.8/header-attrs.js create mode 100644 docs/articles/03-concepts_files/header-attrs-2.8/header-attrs.js create mode 100644 docs/articles/04-ingesting-data_files/header-attrs-2.8/header-attrs.js create mode 100644 docs/articles/05-data-table-syntax_files/header-attrs-2.8/header-attrs.js create mode 100644 docs/articles/06-vs-dask-juliadb_files/header-attrs-2.8/header-attrs.js create mode 100644 docs/articles/07-glm_files/header-attrs-2.8/header-attrs.js create mode 100644 docs/articles/08-more-epic_files/header-attrs-2.8/header-attrs.js create mode 100644 docs/articles/09-convenience-features_files/header-attrs-2.8/header-attrs.js create mode 100644 docs/articles/10-group-by_files/header-attrs-2.8/header-attrs.js create mode 100644 docs/articles/11-custom-group-by_files/header-attrs-2.8/header-attrs.js create mode 100644 docs/articles/88-trouble-shooting.html create mode 100644 docs/articles/88-trouble-shooting_files/header-attrs-2.8/header-attrs.js delete mode 100644 tests/testthat.R delete mode 100644 tests/testthat/test-Rcpp.R delete mode 100644 tests/testthat/test-add-chunk.r delete mode 100644 tests/testthat/test-anti_join.R delete mode 100644 tests/testthat/test-as-data-frame.R delete mode 100644 tests/testthat/test-as-disk-frame.R delete mode 100644 tests/testthat/test-bloom-filter.r delete mode 
100644 tests/testthat/test-collect.R delete mode 100644 tests/testthat/test-compute.r delete mode 100644 tests/testthat/test-csv2disk.frame.r delete mode 100644 tests/testthat/test-data-table.r delete mode 100644 tests/testthat/test-delete.r delete mode 100644 tests/testthat/test-disk-frame.r delete mode 100644 tests/testthat/test-dplyr-verbs.r delete mode 100644 tests/testthat/test-dtplyr-support.r delete mode 100644 tests/testthat/test-foverlaps.r delete mode 100644 tests/testthat/test-full_join.R delete mode 100644 tests/testthat/test-get_chunk.r delete mode 100644 tests/testthat/test-get_chunk_ids.r delete mode 100644 tests/testthat/test-glm.r delete mode 100644 tests/testthat/test-group-by.R delete mode 100644 tests/testthat/test-hard-arrange.R delete mode 100644 tests/testthat/test-inner_join.R delete mode 100644 tests/testthat/test-is-disk.frame.r delete mode 100644 tests/testthat/test-left_join.R delete mode 100644 tests/testthat/test-map.r delete mode 100644 tests/testthat/test-map2.r delete mode 100644 tests/testthat/test-merge.r delete mode 100644 tests/testthat/test-names.r delete mode 100644 tests/testthat/test-nchunks.r delete mode 100644 tests/testthat/test-nrow-ncol.R delete mode 100644 tests/testthat/test-overwrite_check.r delete mode 100644 tests/testthat/test-pls-add.r delete mode 100644 tests/testthat/test-print.r delete mode 100644 tests/testthat/test-pull.r delete mode 100644 tests/testthat/test-rbindlist.r delete mode 100644 tests/testthat/test-rechunk.r delete mode 100644 tests/testthat/test-recommend_nchunk.R delete mode 100644 tests/testthat/test-remove_chunk.r delete mode 100644 tests/testthat/test-right_join.r delete mode 100644 tests/testthat/test-sample_frac.r delete mode 100644 tests/testthat/test-sample_n.r delete mode 100644 tests/testthat/test-sas_to_csv.r delete mode 100644 tests/testthat/test-sas_to_disk.frame.r delete mode 100644 tests/testthat/test-semi_join.R delete mode 100644 tests/testthat/test-setup.r delete mode 100644 
tests/testthat/test-shard.r delete mode 100644 tests/testthat/test-shardkey.r delete mode 100644 tests/testthat/test-srckeep.r delete mode 100644 tests/testthat/test-tbl_vars.r delete mode 100644 tests/testthat/test-util.r delete mode 100644 tests/testthat/test-write_disk.frame.R delete mode 100644 tests/testthat/test-zip_to_disk.frame.r diff --git a/CRAN-RELEASE b/CRAN-RELEASE index f72236de..48d603de 100644 --- a/CRAN-RELEASE +++ b/CRAN-RELEASE @@ -1,2 +1,2 @@ -This package was submitted to CRAN on 2021-02-13. -Once it is accepted, delete this file and tag the release (commit f7dd3db). +This package was submitted to CRAN on 2021-03-12. +Once it is accepted, delete this file and tag the release (commit 34bafaa). diff --git a/DESCRIPTION b/DESCRIPTION index a9207fc2..0f848ee8 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Type: Package Package: disk.frame Title: Larger-than-RAM Disk-Based Data Manipulation Framework -Version: 0.4.0 -Date: 2021-02-11 +Version: 0.5.0 +Date: 2021-05-09 Authors@R: c( person("Dai", "ZJ", email = "zhuojia.dai@gmail.com", role = c("aut", "cre")), person("Jacky", "Poon", role = c("ctb")) diff --git a/NAMESPACE b/NAMESPACE index 51fa7439..da78d075 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -67,7 +67,6 @@ S3method(transmute,disk.frame) export(IQR_df.chunk_agg.disk.frame) export(IQR_df.collected_agg.disk.frame) export(add_chunk) -export(add_count.disk.frame) export(add_tally.disk.frame) export(all_df.chunk_agg.disk.frame) export(all_df.collected_agg.disk.frame) @@ -176,7 +175,6 @@ importFrom(data.table,setDT) importFrom(data.table,setkey) importFrom(data.table,setkeyv) importFrom(data.table,timetaken) -importFrom(dplyr,add_count) importFrom(dplyr,add_tally) importFrom(dplyr,anti_join) importFrom(dplyr,arrange) diff --git a/NEWS.md b/NEWS.md index f06cc6e6..088222ce 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,9 @@ +# disk.frame 0.5 +* removed `add_count` method + +# disk.frame 0.4.1 +* removed use of `sysctl` which was violating 
CRAN policy + # disk.frame 0.4.0 * Removed `count` and `tally` * Fixed package compatibility diff --git a/R/dplyr_verbs.r b/R/dplyr_verbs.r index e92e678d..75be5774 100644 --- a/R/dplyr_verbs.r +++ b/R/dplyr_verbs.r @@ -70,10 +70,10 @@ chunk_arrange <- create_chunk_mapper(dplyr::arrange) # TODO alot of these .disk.frame functions are not generic -#' @export -#' @importFrom dplyr add_count -#' @rdname dplyr_verbs -add_count.disk.frame <- create_chunk_mapper(dplyr::add_count) +#' #' @export +#' #' @importFrom dplyr add_count +#' #' @rdname dplyr_verbs +#' add_count.disk.frame <- create_chunk_mapper(dplyr::add_count) #' @export diff --git a/R/recommend_nchunks.r b/R/recommend_nchunks.r index 8d77e1f5..e5be6ac0 100644 --- a/R/recommend_nchunks.r +++ b/R/recommend_nchunks.r @@ -91,16 +91,18 @@ df_ram_size <- function() { } } } else { - os = R.version$os - if (length(grep("^darwin", os))) { - a = substring(system("sysctl hw.memsize", intern = TRUE), 13) - } #else { + #os = R.version$os + #if (length(grep("^darwin", os))) { + #a = substring(system("sysctl hw.memsize", intern = TRUE), 13) + # the above is not allowed by CRAN + #} #else { # This would work but is not allowed by CRAN #a = system('grep MemTotal /proc/meminfo', intern = TRUE) #} - l = strsplit(a, " ")[[1]] - l = as.numeric(l[length(l)-1]) - ram_size = l/1024^2 + #l = strsplit(a, " ")[[1]] + #l = as.numeric(l[length(l)-1]) + #ram_size = l/1024^2 + ram_size = 16 # to be conservative } if(is.null(ram_size)) { diff --git a/README.md b/README.md index d7e33912..7f14e340 100644 --- a/README.md +++ b/README.md @@ -211,15 +211,12 @@ flights.df %>% filter(year == 2013) %>% mutate(origin_dest = paste0(origin, dest)) %>% head(2) -#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time -#> 1 2013 1 1 517 515 2 830 819 -#> 2 2013 1 1 533 529 4 850 830 -#> arr_delay carrier flight tailnum origin dest air_time distance hour minute -#> 1 11 UA 1545 N14228 EWR IAH 227 1400 5 15 -#> 2 20 UA 1714 N24211 LGA 
IAH 227 1416 5 29 -#> time_hour origin_dest -#> 1 2013-01-01 05:00:00 EWRIAH -#> 2 2013-01-01 05:00:00 LGAIAH +#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight +#> 1 2013 1 1 517 515 2 830 819 11 UA 1545 +#> 2 2013 1 1 533 529 4 850 830 20 UA 1714 +#> tailnum origin dest air_time distance hour minute time_hour origin_dest +#> 1 N14228 EWR IAH 227 1400 5 15 2013-01-01 05:00:00 EWRIAH +#> 2 N24211 LGA IAH 227 1416 5 29 2013-01-01 05:00:00 LGAIAH ``` ### Group-by @@ -276,14 +273,6 @@ obtained using estimated methods. ``` r library(data.table) -#> -#> Attaching package: 'data.table' -#> The following object is masked from 'package:purrr': -#> -#> transpose -#> The following objects are masked from 'package:dplyr': -#> -#> between, first, last suppressWarnings( grp_by_stage1 <- @@ -324,7 +313,7 @@ To find out where the disk.frame is stored on disk: ``` r # where is the disk.frame stored attr(flights.df, "path") -#> [1] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpMpwOj2\\file4bb025d75c63.df" +#> [1] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpEdhlRv\\file33ec67c31d9c.df" ``` A number of data.frame functions are implemented for disk.frame @@ -332,23 +321,19 @@ A number of data.frame functions are implemented for disk.frame ``` r # get first few rows head(flights.df, 1) -#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time -#> 1: 2013 1 1 517 515 2 830 819 -#> arr_delay carrier flight tailnum origin dest air_time distance hour minute -#> 1: 11 UA 1545 N14228 EWR IAH 227 1400 5 15 -#> time_hour -#> 1: 2013-01-01 05:00:00 +#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier +#> 1: 2013 1 1 517 515 2 830 819 11 UA +#> flight tailnum origin dest air_time distance hour minute time_hour +#> 1: 1545 N14228 EWR IAH 227 1400 5 15 2013-01-01 05:00:00 ``` ``` r # get last few rows tail(flights.df, 1) -#> year month day dep_time sched_dep_time dep_delay arr_time 
sched_arr_time -#> 1: 2013 9 30 NA 840 NA NA 1020 -#> arr_delay carrier flight tailnum origin dest air_time distance hour minute -#> 1: NA MQ 3531 N839MQ LGA RDU NA 431 8 40 -#> time_hour -#> 1: 2013-09-30 08:00:00 +#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier +#> 1: 2013 9 30 NA 840 NA NA 1020 NA MQ +#> flight tailnum origin dest air_time distance hour minute time_hour +#> 1: 3531 N839MQ LGA RDU NA 431 8 40 2013-09-30 08:00:00 ``` ``` r diff --git a/book/01-intro.Rmd b/book/01-intro.Rmd index d0df5a20..0ffc5a2e 100644 --- a/book/01-intro.Rmd +++ b/book/01-intro.Rmd @@ -3,7 +3,7 @@ title: "Preface - The birth of `disk.frame`" author: "ZJ" output: rmarkdown::html_vignette vignette: > - %\VignetteIndexEntry{preface} + %\VignetteIndexEntry{Preface - The birth of `disk.frame`} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- diff --git a/book/06-vs-dask-juliadb.Rmd b/book/06-vs-dask-juliadb.Rmd index 830081f9..c018af59 100644 --- a/book/06-vs-dask-juliadb.Rmd +++ b/book/06-vs-dask-juliadb.Rmd @@ -3,7 +3,7 @@ title: "Benchmarks 1: disk.frame beats Dask! disk.frame beats JuliaDB! Anyone el author: "ZJ" output: rmarkdown::html_vignette vignette: > - %\VignetteIndexEntry{benchmark-1} + %\VignetteIndexEntry{Benchmarks 1: disk.frame beats Dask! disk.frame beats JuliaDB! 
Anyone else wanna challenge?} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- diff --git a/cran-comments.md b/cran-comments.md index 3784d366..b661d1a8 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,11 +1,11 @@ -## Submission for v0.4.0 -* Fixed recently reported warnings +## Submission for v0.5.0 +* Fixed issue in CRAN check but needed to update version to follow semver conventions ## Test environments -* local Windows 10 Pro install, R 4.0.3 -* local Windows 10 Pro install, R devel (as of 2021-02-11) -* local Linux/Ubuntu install, R 4.0.3 -* local Linux/Ubuntu install, R devel (as of 2021-02-11) +* local Windows 10 Pro install, R 4.0.5 +* local Windows 10 Pro install, R devel (as of 2021-05-09) +* local Linux/Ubuntu install, R 4.0.5 +* local Linux/Ubuntu install, R devel (as of 2021-05-09) ## R CMD check results There were no ERRORs nor WARNINGs nor NOTE when run locally. diff --git a/docs/404.html b/docs/404.html index 8c5ef8b1..ed268161 100644 --- a/docs/404.html +++ b/docs/404.html @@ -71,7 +71,7 @@ disk.frame - 0.3.6 + 0.5.0 @@ -79,7 +79,7 @@
  • @@ -135,7 +138,7 @@ - - -
    +
    @@ -167,117 +97,114 @@

    Add a chunk to the disk.frame

    largest numbered file, "n.fst".

    -
    add_chunk(df, chunk, chunk_id = NULL, full.names = FALSE, ...)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - -
    df

    the disk.frame to add a chunk to

    chunk

    a data.frame to be added as a chunk

    chunk_id

    a numeric number indicating the id of the chunk. If NULL it -will be set to the largest chunk_id + 1

    full.names

    whether the chunk_id name match should be to the full file -path not just the file name

    ...

    Passed in the write_fst. E.g. compress

    - -

    Value

    +
    +
    add_chunk(df, chunk, chunk_id = NULL, full.names = FALSE, ...)
    +
    +
    +

    Arguments

    +
    df
    +

    the disk.frame to add a chunk to

    +
    chunk
    +

    a data.frame to be added as a chunk

    +
    chunk_id
    +

    a numeric number indicating the id of the chunk. If NULL it +will be set to the largest chunk_id + 1

    +
    full.names
    +

    whether the chunk_id name match should be to the full file +path not just the file name

    +
    ...
    +

    Passed in the write_fst. E.g. compress

    +
    +
    +

    Value

    disk.frame

    -

    Details

    - +
    +
    +

    Details

    The function is the preferred way to add a chunk to a disk.frame. It performs checks on the types to make sure that the new chunk doesn't have different types to the disk.frame.

    +
    -

    Examples

    -
    # create a disk.frame -df_path = file.path(tempdir(), "tmp_add_chunk") -diskf = disk.frame(df_path) - -# add a chunk to diskf -add_chunk(diskf, cars) -
    #> path: "C:\Users\RTX2080\AppData\Local\Temp\Rtmp2PGAkP/tmp_add_chunk" -#> nchunks: 1 -#> nrow (at source): 50 -#> ncol (at source): 2 -#> nrow (post operations): ??? -#> ncol (post operations): ???
    add_chunk(diskf, cars) -
    #> path: "C:\Users\RTX2080\AppData\Local\Temp\Rtmp2PGAkP/tmp_add_chunk" -#> nchunks: 2 -#> nrow (at source): 100 -#> ncol (at source): 2 -#> nrow (post operations): ??? -#> ncol (post operations): ???
    -nchunks(diskf) # 2 -
    #> [1] 2
    -df2 = disk.frame(file.path(tempdir(), "tmp_add_chunk2")) - -# add chunks by specifying the chunk_id number; this is especially useful if -# you wish to add multiple chunk in parralel - -add_chunk(df2, data.frame(chunk=1), 1) -
    #> path: "C:\Users\RTX2080\AppData\Local\Temp\Rtmp2PGAkP/tmp_add_chunk2" -#> nchunks: 1 -#> nrow (at source): 1 -#> ncol (at source): 1 -#> nrow (post operations): ??? -#> ncol (post operations): ???
    add_chunk(df2, data.frame(chunk=2), 3) -
    #> path: "C:\Users\RTX2080\AppData\Local\Temp\Rtmp2PGAkP/tmp_add_chunk2" -#> nchunks: 2 -#> nrow (at source): 2 -#> ncol (at source): 1 -#> nrow (post operations): ??? -#> ncol (post operations): ???
    -nchunks(df2) # 2 -
    #> [1] 2
    -dir(attr(df2, "path", exact=TRUE)) -
    #> [1] "1.fst" "3.fst"
    # [1] "1.fst" "3.fst" - -# clean up -delete(diskf) -delete(df2) -
    +
    +

    Examples

    +
    # create a disk.frame
    +df_path = file.path(tempdir(), "tmp_add_chunk")
    +diskf = disk.frame(df_path)
    +
    +# add a chunk to diskf
    +add_chunk(diskf, cars)
    +#> path: "C:\Users\RTX2080\AppData\Local\Temp\Rtmp2rQjw5/tmp_add_chunk"
    +#> nchunks: 1
    +#> nrow (at source): 50
    +#> ncol (at source): 2
    +#> nrow (post operations): ???
    +#> ncol (post operations): ???
    +add_chunk(diskf, cars)
    +#> path: "C:\Users\RTX2080\AppData\Local\Temp\Rtmp2rQjw5/tmp_add_chunk"
    +#> nchunks: 2
    +#> nrow (at source): 100
    +#> ncol (at source): 2
    +#> nrow (post operations): ???
    +#> ncol (post operations): ???
    +
    +nchunks(diskf) # 2
    +#> [1] 2
    +
    +df2 = disk.frame(file.path(tempdir(), "tmp_add_chunk2"))
    +
    +# add chunks by specifying the chunk_id number; this is especially useful if
    +# you wish to add multiple chunk in parralel
    +
    +add_chunk(df2, data.frame(chunk=1), 1)
    +#> path: "C:\Users\RTX2080\AppData\Local\Temp\Rtmp2rQjw5/tmp_add_chunk2"
    +#> nchunks: 1
    +#> nrow (at source): 1
    +#> ncol (at source): 1
    +#> nrow (post operations): ???
    +#> ncol (post operations): ???
    +add_chunk(df2, data.frame(chunk=2), 3)
    +#> path: "C:\Users\RTX2080\AppData\Local\Temp\Rtmp2rQjw5/tmp_add_chunk2"
    +#> nchunks: 2
    +#> nrow (at source): 2
    +#> ncol (at source): 1
    +#> nrow (post operations): ???
    +#> ncol (post operations): ???
    +
    +nchunks(df2) # 2
    +#> [1] 2
    +
    +dir(attr(df2, "path", exact=TRUE))
    +#> [1] "1.fst" "3.fst"
    +# [1] "1.fst" "3.fst"
    +
    +# clean up
    +delete(diskf)
    +delete(df2)
    +
    +
    +
    -
    - +
    - - + + diff --git a/docs/reference/as.data.frame.disk.frame.html b/docs/reference/as.data.frame.disk.frame.html index 275aebaf..abfa6194 100644 --- a/docs/reference/as.data.frame.disk.frame.html +++ b/docs/reference/as.data.frame.disk.frame.html @@ -1,67 +1,12 @@ - - - - - - - -Convert disk.frame to data.frame by collecting all chunks — as.data.frame.disk.frame • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Convert disk.frame to data.frame by collecting all chunks — as.data.frame.disk.frame • disk.frame - - + + - - -
    -
    - -
    - -
    +
    @@ -165,113 +95,104 @@

    Convert disk.frame to data.frame by collecting all chunks

    Convert disk.frame to data.frame by collecting all chunks

    -
    # S3 method for disk.frame
    -as.data.frame(x, row.names, optional, ...)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - -
    x

    a disk.frame

    row.names

    NULL or a character vector giving the row names for the data frame. Missing values are not allowed.

    optional

    logical. If TRUE, setting row names and converting column names (to syntactic names: see make.names) is optional. Note that all of R's base package as.data.frame() methods use optional only for column names treatment, basically with the meaning of data.frame(*, check.names = !optional). See also the make.names argument of the matrix method.

    ...

    additional arguments to be passed to or from methods.

    - +
    +
    # S3 method for disk.frame
    +as.data.frame(x, row.names, optional, ...)
    +
    -

    Examples

    -
    cars.df = as.disk.frame(cars) -as.data.frame(cars.df) -
    #> speed dist -#> 1 4 2 -#> 2 4 10 -#> 3 7 4 -#> 4 7 22 -#> 5 8 16 -#> 6 9 10 -#> 7 10 18 -#> 8 10 26 -#> 9 10 34 -#> 10 11 17 -#> 11 11 28 -#> 12 12 14 -#> 13 12 20 -#> 14 12 24 -#> 15 12 28 -#> 16 13 26 -#> 17 13 34 -#> 18 13 34 -#> 19 13 46 -#> 20 14 26 -#> 21 14 36 -#> 22 14 60 -#> 23 14 80 -#> 24 15 20 -#> 25 15 26 -#> 26 15 54 -#> 27 16 32 -#> 28 16 40 -#> 29 17 32 -#> 30 17 40 -#> 31 17 50 -#> 32 18 42 -#> 33 18 56 -#> 34 18 76 -#> 35 18 84 -#> 36 19 36 -#> 37 19 46 -#> 38 19 68 -#> 39 20 32 -#> 40 20 48 -#> 41 20 52 -#> 42 20 56 -#> 43 20 64 -#> 44 22 66 -#> 45 23 54 -#> 46 24 70 -#> 47 24 92 -#> 48 24 93 -#> 49 24 120 -#> 50 25 85
    -# clean up -delete(cars.df) -
    +
    +

    Arguments

    +
    x
    +

    a disk.frame

    +
    row.names
    +

    NULL or a character vector giving the row names for the data frame. Missing values are not allowed.

    +
    optional
    +

    logical. If TRUE, setting row names and converting column names (to syntactic names: see make.names) is optional. Note that all of R's base package as.data.frame() methods use optional only for column names treatment, basically with the meaning of data.frame(*, check.names = !optional). See also the make.names argument of the matrix method.

    +
    ...
    +

    additional arguments to be passed to or from methods.

    +
    + +
    +

    Examples

    +
    cars.df = as.disk.frame(cars)
    +as.data.frame(cars.df)
    +#>    speed dist
    +#> 1      4    2
    +#> 2      4   10
    +#> 3      7    4
    +#> 4      7   22
    +#> 5      8   16
    +#> 6      9   10
    +#> 7     10   18
    +#> 8     10   26
    +#> 9     10   34
    +#> 10    11   17
    +#> 11    11   28
    +#> 12    12   14
    +#> 13    12   20
    +#> 14    12   24
    +#> 15    12   28
    +#> 16    13   26
    +#> 17    13   34
    +#> 18    13   34
    +#> 19    13   46
    +#> 20    14   26
    +#> 21    14   36
    +#> 22    14   60
    +#> 23    14   80
    +#> 24    15   20
    +#> 25    15   26
    +#> 26    15   54
    +#> 27    16   32
    +#> 28    16   40
    +#> 29    17   32
    +#> 30    17   40
    +#> 31    17   50
    +#> 32    18   42
    +#> 33    18   56
    +#> 34    18   76
    +#> 35    18   84
    +#> 36    19   36
    +#> 37    19   46
    +#> 38    19   68
    +#> 39    20   32
    +#> 40    20   48
    +#> 41    20   52
    +#> 42    20   56
    +#> 43    20   64
    +#> 44    22   66
    +#> 45    23   54
    +#> 46    24   70
    +#> 47    24   92
    +#> 48    24   93
    +#> 49    24  120
    +#> 50    25   85
    +
    +# clean up
    +delete(cars.df)
    +
    +
    +
    -
    - +
    - - + + diff --git a/docs/reference/as.data.table.disk.frame.html b/docs/reference/as.data.table.disk.frame.html index 16530f9e..7e9ac1e2 100644 --- a/docs/reference/as.data.table.disk.frame.html +++ b/docs/reference/as.data.table.disk.frame.html @@ -1,67 +1,12 @@ - - - - - - - -Convert disk.frame to data.table by collecting all chunks — as.data.table.disk.frame • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Convert disk.frame to data.table by collecting all chunks — as.data.table.disk.frame • disk.frame - - + + - - -
    -
    - -
    - -
    +
    @@ -165,116 +95,112 @@

    Convert disk.frame to data.table by collecting all chunks

    Convert disk.frame to data.table by collecting all chunks

    -
    # S3 method for disk.frame
    -as.data.table(x, keep.rownames = FALSE, ...)
    - -

    Arguments

    - - - - - - - - - - - - - - -
    x

    a disk.frame

    keep.rownames

    passed to as.data.table

    ...

    passed to as.data.table

    - +
    +
    # S3 method for disk.frame
    +as.data.table(x, keep.rownames = FALSE, ...)
    +
    -

    Examples

    -
    #> -#> Attaching package: 'data.table'
    #> The following object is masked from 'package:purrr': -#> -#> transpose
    #> The following objects are masked from 'package:dplyr': -#> -#> between, first, last
    cars.df = as.disk.frame(cars) -as.data.table(cars.df) -
    #> speed dist -#> 1: 4 2 -#> 2: 4 10 -#> 3: 7 4 -#> 4: 7 22 -#> 5: 8 16 -#> 6: 9 10 -#> 7: 10 18 -#> 8: 10 26 -#> 9: 10 34 -#> 10: 11 17 -#> 11: 11 28 -#> 12: 12 14 -#> 13: 12 20 -#> 14: 12 24 -#> 15: 12 28 -#> 16: 13 26 -#> 17: 13 34 -#> 18: 13 34 -#> 19: 13 46 -#> 20: 14 26 -#> 21: 14 36 -#> 22: 14 60 -#> 23: 14 80 -#> 24: 15 20 -#> 25: 15 26 -#> 26: 15 54 -#> 27: 16 32 -#> 28: 16 40 -#> 29: 17 32 -#> 30: 17 40 -#> 31: 17 50 -#> 32: 18 42 -#> 33: 18 56 -#> 34: 18 76 -#> 35: 18 84 -#> 36: 19 36 -#> 37: 19 46 -#> 38: 19 68 -#> 39: 20 32 -#> 40: 20 48 -#> 41: 20 52 -#> 42: 20 56 -#> 43: 20 64 -#> 44: 22 66 -#> 45: 23 54 -#> 46: 24 70 -#> 47: 24 92 -#> 48: 24 93 -#> 49: 24 120 -#> 50: 25 85 -#> speed dist
    -# clean up -delete(cars.df) -
    +
    +

    Arguments

    +
    x
    +

    a disk.frame

    +
    keep.rownames
    +

    passed to as.data.table

    +
    ...
    +

    passed to as.data.table

    +
    + +
    +

    Examples

    +
    library(data.table)
    +#> 
    +#> Attaching package: 'data.table'
    +#> The following object is masked from 'package:purrr':
    +#> 
    +#>     transpose
    +#> The following objects are masked from 'package:dplyr':
    +#> 
    +#>     between, first, last
    +cars.df = as.disk.frame(cars)
    +as.data.table(cars.df)
    +#>     speed dist
    +#>  1:     4    2
    +#>  2:     4   10
    +#>  3:     7    4
    +#>  4:     7   22
    +#>  5:     8   16
    +#>  6:     9   10
    +#>  7:    10   18
    +#>  8:    10   26
    +#>  9:    10   34
    +#> 10:    11   17
    +#> 11:    11   28
    +#> 12:    12   14
    +#> 13:    12   20
    +#> 14:    12   24
    +#> 15:    12   28
    +#> 16:    13   26
    +#> 17:    13   34
    +#> 18:    13   34
    +#> 19:    13   46
    +#> 20:    14   26
    +#> 21:    14   36
    +#> 22:    14   60
    +#> 23:    14   80
    +#> 24:    15   20
    +#> 25:    15   26
    +#> 26:    15   54
    +#> 27:    16   32
    +#> 28:    16   40
    +#> 29:    17   32
    +#> 30:    17   40
    +#> 31:    17   50
    +#> 32:    18   42
    +#> 33:    18   56
    +#> 34:    18   76
    +#> 35:    18   84
    +#> 36:    19   36
    +#> 37:    19   46
    +#> 38:    19   68
    +#> 39:    20   32
    +#> 40:    20   48
    +#> 41:    20   52
    +#> 42:    20   56
    +#> 43:    20   64
    +#> 44:    22   66
    +#> 45:    23   54
    +#> 46:    24   70
    +#> 47:    24   92
    +#> 48:    24   93
    +#> 49:    24  120
    +#> 50:    25   85
    +#>     speed dist
    +
    +# clean up
    +delete(cars.df)
    +
    +
    +
    -
    - +
    - - + + diff --git a/docs/reference/as.disk.frame.html b/docs/reference/as.disk.frame.html index b6f779b8..10189939 100644 --- a/docs/reference/as.disk.frame.html +++ b/docs/reference/as.disk.frame.html @@ -1,67 +1,12 @@ - - - - - - - -Make a data.frame into a disk.frame — as.disk.frame • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Make a data.frame into a disk.frame — as.disk.frame • disk.frame - - + + - - -
    -
    - -
    - -
    +
    @@ -165,91 +95,75 @@

    Make a data.frame into a disk.frame

    Make a data.frame into a disk.frame

    -
    as.disk.frame(
    -  df,
    -  outdir = tempfile(fileext = ".df"),
    -  nchunks = recommend_nchunks(df),
    -  overwrite = FALSE,
    -  shardby = NULL,
    -  compress = 50,
    -  ...
    -)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    df

    a disk.frame

    outdir

    the output directory

    nchunks

    number of chunks

    overwrite

    if TRUE the outdir will be overwritten, if FALSE it will throw an error if the directory is not empty

    shardby

    The shardkey

    compress

    the compression level 0-100; 100 is highest

    ...

    passed to output_disk.frame

    - - -

    Examples

    -
    # write to temporary location -cars.df = as.disk.frame(cars) - -# specify a different path in the temporary folder, you are free to choose a different folder -cars_new_location.df = as.disk.frame(cars, outdir = file.path(tempdir(), "some_path.df")) - -# specify a different number of chunks -# this writes to tempdir() by default -cars_chunks.df = as.disk.frame(cars, nchunks = 4, overwrite = TRUE) - -# clean up -delete(cars.df) -delete(cars_new_location.df) -delete(cars_chunks.df) -
    +
    +
    as.disk.frame(
    +  df,
    +  outdir = tempfile(fileext = ".df"),
    +  nchunks = recommend_nchunks(df),
    +  overwrite = FALSE,
    +  shardby = NULL,
    +  compress = 50,
    +  ...
    +)
    +
    + +
    +

    Arguments

    +
    df
    +

    a disk.frame

    +
    outdir
    +

    the output directory

    +
    nchunks
    +

    number of chunks

    +
    overwrite
    +

    if TRUE the outdir will be overwritten, if FALSE it will throw an error if the directory is not empty

    +
    shardby
    +

    The shardkey

    +
    compress
    +

    the compression level 0-100; 100 is highest

    +
    ...
    +

    passed to output_disk.frame

    +
    + +
    +

    Examples

    +
    # write to temporary location
    +cars.df = as.disk.frame(cars) 
    +
    +# specify a different path in the temporary folder, you are free to choose a different folder
    +cars_new_location.df = as.disk.frame(cars, outdir = file.path(tempdir(), "some_path.df"))
    +
    +# specify a different number of chunks
    +# this writes to tempdir() by default
    +cars_chunks.df = as.disk.frame(cars, nchunks = 4, overwrite = TRUE) 
    +
    +# clean up
    +delete(cars.df)
    +delete(cars_new_location.df)
    +delete(cars_chunks.df)
    +
    +
    +
    -
    - +
    - - + + diff --git a/docs/reference/chunk_group_by.html b/docs/reference/chunk_group_by.html index 731d94e4..3b4f8168 100644 --- a/docs/reference/chunk_group_by.html +++ b/docs/reference/chunk_group_by.html @@ -1,70 +1,15 @@ - - - - - - - -Group by within each disk.frame — chunk_summarize • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Group by within each disk.frame — chunk_summarize • disk.frame - - - - - - - - - - - + + - - -
    -
    - -
    - -
    +
    @@ -171,56 +101,49 @@

    Group by within each disk.frame

    reorganizes the chunks by the shard key.

    -
    chunk_summarize(.data, ...)
    -
    -chunk_summarise(.data, ...)
    -
    -chunk_group_by(.data, ...)
    +    
    +
    chunk_summarize(.data, ...)
     
    -chunk_ungroup(.data, ...)
    +chunk_summarise(.data, ...) -

    Arguments

    - - - - - - - - - - -
    .data

    a disk.frame

    ...

    passed to dplyr::group_by

    +chunk_group_by(.data, ...) -

    See also

    +chunk_ungroup(.data, ...)
    +
    -

    hard_group_by group_by

    +
    +

    Arguments

    +
    .data
    +

    a disk.frame

    +
    ...
    +

    passed to dplyr::group_by

    +
    +
    +

    See also

    +

    hard_group_by group_by

    +
    + -
    - +
    - - + + diff --git a/docs/reference/cmap.html b/docs/reference/cmap.html index c2cd39fa..8d46d751 100644 --- a/docs/reference/cmap.html +++ b/docs/reference/cmap.html @@ -1,71 +1,16 @@ - - - - - - - -Apply the same function to all chunks — cmap • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Apply the same function to all chunks — cmap • disk.frame - - - - - - - - - - - + + - - -
    -
    - -
    - -
    +
    @@ -173,439 +103,415 @@

    Apply the same function to all chunks

    `delayed` is an alias for lazy and is consistent with the naming in Dask and Dagger.jl

    -
    cmap(.x, .f, ...)
    -
    -# S3 method for disk.frame
    -cmap(
    -  .x,
    -  .f,
    -  ...,
    -  outdir = NULL,
    -  keep = NULL,
    -  chunks = nchunks(.x),
    -  compress = 50,
    -  lazy = TRUE,
    -  overwrite = FALSE,
    -  vars_and_pkgs = future::getGlobalsAndPackages(.f, envir = parent.frame()),
    -  .progress = TRUE
    -)
    -
    -cmap_dfr(.x, .f, ..., .id = NULL)
    -
    -# S3 method for disk.frame
    -cmap_dfr(.x, .f, ..., .id = NULL, use.names = fill, fill = FALSE, idcol = NULL)
    -
    -cimap(.x, .f, ...)
    -
    -# S3 method for disk.frame
    -cimap(
    -  .x,
    -  .f,
    -  outdir = NULL,
    -  keep = NULL,
    -  chunks = nchunks(.x),
    -  compress = 50,
    -  lazy = TRUE,
    -  overwrite = FALSE,
    -  ...
    -)
    -
    -cimap_dfr(.x, .f, ..., .id = NULL)
    -
    -# S3 method for disk.frame
    -cimap_dfr(
    -  .x,
    -  .f,
    -  ...,
    -  .id = NULL,
    -  use.names = fill,
    -  fill = FALSE,
    -  idcol = NULL
    -)
    -
    -lazy(.x, .f, ...)
    -
    -# S3 method for disk.frame
    -lazy(.x, .f, ...)
    -
    -delayed(.x, .f, ...)
    -
    -chunk_lapply(...)
    -
    -map(.x, .f, ...)
    -
    -# S3 method for disk.frame
    -map(...)
    -
    -# S3 method for default
    -map(.x, .f, ...)
    -
    -imap_dfr(.x, .f, ..., .id = NULL)
    -
    -# S3 method for disk.frame
    -imap_dfr(...)
    -
    -# S3 method for default
    -imap_dfr(.x, .f, ..., .id = NULL)
    -
    -imap(.x, .f, ...)
    -
    -# S3 method for default
    -imap(.x, .f, ...)
    -
    -# S3 method for disk.frame
    -map_dfr(...)
    -
    -# S3 method for default
    -map_dfr(.x, .f, ..., .id = NULL)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    .x

    a disk.frame

    .f

    a function to apply to each of the chunks

    ...

    for compatibility with `purrr::map`

    outdir

    the output directory

    keep

    the columns to keep from the input

    chunks

    The number of chunks to output

    compress

    0-100 fst compression ratio

    lazy

    if TRUE then do this lazily

    overwrite

    if TRUE removes any existing chunks in the data

    vars_and_pkgs

    variables and packages to send to a background session. This is typically automatically detected

    .progress

    A logical, for whether or not to print a progress bar for multiprocess, multisession, and multicore plans. From furrr

    .id

    not used

    use.names

    for cmap_dfr's call to data.table::rbindlist. See data.table::rbindlist

    fill

    for cmap_dfr's call to data.table::rbindlist. See data.table::rbindlist

    idcol

    for cmap_dfr's call to data.table::rbindlist. See data.table::rbindlist

    - - -

    Examples

    -
    cars.df = as.disk.frame(cars) - -# return the first row of each chunk lazily -# -cars2 = cmap(cars.df, function(chunk) { - chunk[,1] -}) - -collect(cars2) -
    #> speed -#> 1: 4 -#> 2: 4 -#> 3: 7 -#> 4: 7 -#> 5: 8 -#> 6: 9 -#> 7: 10 -#> 8: 10 -#> 9: 10 -#> 10: 11 -#> 11: 11 -#> 12: 12 -#> 13: 12 -#> 14: 12 -#> 15: 12 -#> 16: 13 -#> 17: 13 -#> 18: 13 -#> 19: 13 -#> 20: 14 -#> 21: 14 -#> 22: 14 -#> 23: 14 -#> 24: 15 -#> 25: 15 -#> 26: 15 -#> 27: 16 -#> 28: 16 -#> 29: 17 -#> 30: 17 -#> 31: 17 -#> 32: 18 -#> 33: 18 -#> 34: 18 -#> 35: 18 -#> 36: 19 -#> 37: 19 -#> 38: 19 -#> 39: 20 -#> 40: 20 -#> 41: 20 -#> 42: 20 -#> 43: 20 -#> 44: 22 -#> 45: 23 -#> 46: 24 -#> 47: 24 -#> 48: 24 -#> 49: 24 -#> 50: 25 -#> speed
    -# same as above but using purrr -cars2 = cmap(cars.df, ~.x[1,]) - -collect(cars2) -
    #> speed dist -#> 1: 4 2 -#> 2: 11 17 -#> 3: 13 46 -#> 4: 16 40 -#> 5: 19 46 -#> 6: 24 70
    -# return the first row of each chunk eagerly as list -cmap(cars.df, ~.x[1,], lazy = FALSE) -
    #> [[1]] -#> speed dist -#> 1: 4 2 -#> -#> [[2]] -#> speed dist -#> 1: 11 17 -#> -#> [[3]] -#> speed dist -#> 1: 13 46 -#> -#> [[4]] -#> speed dist -#> 1: 16 40 -#> -#> [[5]] -#> speed dist -#> 1: 19 46 -#> -#> [[6]] -#> speed dist -#> 1: 24 70 -#>
    -# return the first row of each chunk eagerly as data.table/data.frame by row-binding -cmap_dfr(cars.df, ~.x[1,]) -
    #> speed dist -#> 1: 4 2 -#> 2: 11 17 -#> 3: 13 46 -#> 4: 16 40 -#> 5: 19 46 -#> 6: 24 70
    -# lazy and delayed are just an aliases for cmap(..., lazy = TRUE) -collect(lazy(cars.df, ~.x[1,])) -
    #> speed dist -#> 1: 4 2 -#> 2: 11 17 -#> 3: 13 46 -#> 4: 16 40 -#> 5: 19 46 -#> 6: 24 70
    collect(delayed(cars.df, ~.x[1,])) -
    #> speed dist -#> 1: 4 2 -#> 2: 11 17 -#> 3: 13 46 -#> 4: 16 40 -#> 5: 19 46 -#> 6: 24 70
    -# clean up cars.df -delete(cars.df) -cars.df = as.disk.frame(cars) - -# .x is the chunk and .y is the ID as an integer - -# lazy = TRUE support is not available at the moment -cimap(cars.df, ~.x[, id := .y], lazy = FALSE) -
    #> [[1]] -#> speed dist id -#> 1: 4 2 1 -#> 2: 4 10 1 -#> 3: 7 4 1 -#> 4: 7 22 1 -#> 5: 8 16 1 -#> 6: 9 10 1 -#> 7: 10 18 1 -#> 8: 10 26 1 -#> 9: 10 34 1 -#> -#> [[2]] -#> speed dist id -#> 1: 11 17 2 -#> 2: 11 28 2 -#> 3: 12 14 2 -#> 4: 12 20 2 -#> 5: 12 24 2 -#> 6: 12 28 2 -#> 7: 13 26 2 -#> 8: 13 34 2 -#> 9: 13 34 2 -#> -#> [[3]] -#> speed dist id -#> 1: 13 46 3 -#> 2: 14 26 3 -#> 3: 14 36 3 -#> 4: 14 60 3 -#> 5: 14 80 3 -#> 6: 15 20 3 -#> 7: 15 26 3 -#> 8: 15 54 3 -#> 9: 16 32 3 -#> -#> [[4]] -#> speed dist id -#> 1: 16 40 4 -#> 2: 17 32 4 -#> 3: 17 40 4 -#> 4: 17 50 4 -#> 5: 18 42 4 -#> 6: 18 56 4 -#> 7: 18 76 4 -#> 8: 18 84 4 -#> 9: 19 36 4 -#> -#> [[5]] -#> speed dist id -#> 1: 19 46 5 -#> 2: 19 68 5 -#> 3: 20 32 5 -#> 4: 20 48 5 -#> 5: 20 52 5 -#> 6: 20 56 5 -#> 7: 20 64 5 -#> 8: 22 66 5 -#> 9: 23 54 5 -#> -#> [[6]] -#> speed dist id -#> 1: 24 70 6 -#> 2: 24 92 6 -#> 3: 24 93 6 -#> 4: 24 120 6 -#> 5: 25 85 6 -#>
    -cimap_dfr(cars.df, ~.x[, id := .y]) -
    #> speed dist id -#> 1: 4 2 1 -#> 2: 4 10 1 -#> 3: 7 4 1 -#> 4: 7 22 1 -#> 5: 8 16 1 -#> 6: 9 10 1 -#> 7: 10 18 1 -#> 8: 10 26 1 -#> 9: 10 34 1 -#> 10: 11 17 2 -#> 11: 11 28 2 -#> 12: 12 14 2 -#> 13: 12 20 2 -#> 14: 12 24 2 -#> 15: 12 28 2 -#> 16: 13 26 2 -#> 17: 13 34 2 -#> 18: 13 34 2 -#> 19: 13 46 3 -#> 20: 14 26 3 -#> 21: 14 36 3 -#> 22: 14 60 3 -#> 23: 14 80 3 -#> 24: 15 20 3 -#> 25: 15 26 3 -#> 26: 15 54 3 -#> 27: 16 32 3 -#> 28: 16 40 4 -#> 29: 17 32 4 -#> 30: 17 40 4 -#> 31: 17 50 4 -#> 32: 18 42 4 -#> 33: 18 56 4 -#> 34: 18 76 4 -#> 35: 18 84 4 -#> 36: 19 36 4 -#> 37: 19 46 5 -#> 38: 19 68 5 -#> 39: 20 32 5 -#> 40: 20 48 5 -#> 41: 20 52 5 -#> 42: 20 56 5 -#> 43: 20 64 5 -#> 44: 22 66 5 -#> 45: 23 54 5 -#> 46: 24 70 6 -#> 47: 24 92 6 -#> 48: 24 93 6 -#> 49: 24 120 6 -#> 50: 25 85 6 -#> speed dist id
    -# clean up cars.df -delete(cars.df) -
    +
    +
    cmap(.x, .f, ...)
    +
    +# S3 method for disk.frame
    +cmap(
    +  .x,
    +  .f,
    +  ...,
    +  outdir = NULL,
    +  keep = NULL,
    +  chunks = nchunks(.x),
    +  compress = 50,
    +  lazy = TRUE,
    +  overwrite = FALSE,
    +  vars_and_pkgs = future::getGlobalsAndPackages(.f, envir = parent.frame()),
    +  .progress = TRUE
    +)
    +
    +cmap_dfr(.x, .f, ..., .id = NULL)
    +
    +# S3 method for disk.frame
    +cmap_dfr(.x, .f, ..., .id = NULL, use.names = fill, fill = FALSE, idcol = NULL)
    +
    +cimap(.x, .f, ...)
    +
    +# S3 method for disk.frame
    +cimap(
    +  .x,
    +  .f,
    +  outdir = NULL,
    +  keep = NULL,
    +  chunks = nchunks(.x),
    +  compress = 50,
    +  lazy = TRUE,
    +  overwrite = FALSE,
    +  ...
    +)
    +
    +cimap_dfr(.x, .f, ..., .id = NULL)
    +
    +# S3 method for disk.frame
    +cimap_dfr(
    +  .x,
    +  .f,
    +  ...,
    +  .id = NULL,
    +  use.names = fill,
    +  fill = FALSE,
    +  idcol = NULL
    +)
    +
    +lazy(.x, .f, ...)
    +
    +# S3 method for disk.frame
    +lazy(.x, .f, ...)
    +
    +delayed(.x, .f, ...)
    +
    +chunk_lapply(...)
    +
    +map(.x, .f, ...)
    +
    +# S3 method for disk.frame
    +map(...)
    +
    +# S3 method for default
    +map(.x, .f, ...)
    +
    +imap_dfr(.x, .f, ..., .id = NULL)
    +
    +# S3 method for disk.frame
    +imap_dfr(...)
    +
    +# S3 method for default
    +imap_dfr(.x, .f, ..., .id = NULL)
    +
    +imap(.x, .f, ...)
    +
    +# S3 method for default
    +imap(.x, .f, ...)
    +
    +# S3 method for disk.frame
    +map_dfr(...)
    +
    +# S3 method for default
    +map_dfr(.x, .f, ..., .id = NULL)
    +
    + +
    +

    Arguments

    +
    .x
    +

    a disk.frame

    +
    .f
    +

    a function to apply to each of the chunks

    +
    ...
    +

    for compatibility with `purrr::map`

    +
    outdir
    +

    the output directory

    +
    keep
    +

    the columns to keep from the input

    +
    chunks
    +

    The number of chunks to output

    +
    compress
    +

    0-100 fst compression ratio

    +
    lazy
    +

    if TRUE then do this lazily

    +
    overwrite
    +

    if TRUE removes any existing chunks in the data

    +
    vars_and_pkgs
    +

    variables and packages to send to a background session. This is typically automatically detected

    +
    .progress
    +

    A logical, for whether or not to print a progress bar for multiprocess, multisession, and multicore plans. From furrr

    +
    .id
    +

    not used

    +
    use.names
    +

    for cmap_dfr's call to data.table::rbindlist. See data.table::rbindlist

    +
    fill
    +

    for cmap_dfr's call to data.table::rbindlist. See data.table::rbindlist

    +
    idcol
    +

    for cmap_dfr's call to data.table::rbindlist. See data.table::rbindlist

    +
    + +
    +

    Examples

    +
    cars.df = as.disk.frame(cars)
    +
    +# return the first row of each chunk lazily
    +# 
    +cars2 = cmap(cars.df, function(chunk) {
    + chunk[,1]
    +})
    +
    +collect(cars2)
    +#>     speed
    +#>  1:     4
    +#>  2:     4
    +#>  3:     7
    +#>  4:     7
    +#>  5:     8
    +#>  6:     9
    +#>  7:    10
    +#>  8:    10
    +#>  9:    10
    +#> 10:    11
    +#> 11:    11
    +#> 12:    12
    +#> 13:    12
    +#> 14:    12
    +#> 15:    12
    +#> 16:    13
    +#> 17:    13
    +#> 18:    13
    +#> 19:    13
    +#> 20:    14
    +#> 21:    14
    +#> 22:    14
    +#> 23:    14
    +#> 24:    15
    +#> 25:    15
    +#> 26:    15
    +#> 27:    16
    +#> 28:    16
    +#> 29:    17
    +#> 30:    17
    +#> 31:    17
    +#> 32:    18
    +#> 33:    18
    +#> 34:    18
    +#> 35:    18
    +#> 36:    19
    +#> 37:    19
    +#> 38:    19
    +#> 39:    20
    +#> 40:    20
    +#> 41:    20
    +#> 42:    20
    +#> 43:    20
    +#> 44:    22
    +#> 45:    23
    +#> 46:    24
    +#> 47:    24
    +#> 48:    24
    +#> 49:    24
    +#> 50:    25
    +#>     speed
    +
    +# same as above but using purrr 
    +cars2 = cmap(cars.df, ~.x[1,])
    +
    +collect(cars2)
    +#>    speed dist
    +#> 1:     4    2
    +#> 2:    11   17
    +#> 3:    13   46
    +#> 4:    16   40
    +#> 5:    19   46
    +#> 6:    24   70
    +
    +# return the first row of each chunk eagerly as list
    +cmap(cars.df, ~.x[1,], lazy = FALSE)
    +#> [[1]]
    +#>    speed dist
    +#> 1:     4    2
    +#> 
    +#> [[2]]
    +#>    speed dist
    +#> 1:    11   17
    +#> 
    +#> [[3]]
    +#>    speed dist
    +#> 1:    13   46
    +#> 
    +#> [[4]]
    +#>    speed dist
    +#> 1:    16   40
    +#> 
    +#> [[5]]
    +#>    speed dist
    +#> 1:    19   46
    +#> 
    +#> [[6]]
    +#>    speed dist
    +#> 1:    24   70
    +#> 
    +
    +# return the first row of each chunk eagerly as data.table/data.frame by row-binding
    +cmap_dfr(cars.df, ~.x[1,])
    +#>    speed dist
    +#> 1:     4    2
    +#> 2:    11   17
    +#> 3:    13   46
    +#> 4:    16   40
    +#> 5:    19   46
    +#> 6:    24   70
    +
    +# lazy and delayed are just an aliases for cmap(..., lazy = TRUE)
    +collect(lazy(cars.df, ~.x[1,]))
    +#>    speed dist
    +#> 1:     4    2
    +#> 2:    11   17
    +#> 3:    13   46
    +#> 4:    16   40
    +#> 5:    19   46
    +#> 6:    24   70
    +collect(delayed(cars.df, ~.x[1,]))
    +#>    speed dist
    +#> 1:     4    2
    +#> 2:    11   17
    +#> 3:    13   46
    +#> 4:    16   40
    +#> 5:    19   46
    +#> 6:    24   70
    +
    +# clean up cars.df
    +delete(cars.df)
    +cars.df = as.disk.frame(cars)
    +
    +# .x is the chunk and .y is the ID as an integer
    +
    +# lazy = TRUE support is not available at the moment
    +cimap(cars.df, ~.x[, id := .y], lazy = FALSE)
    +#> [[1]]
    +#>    speed dist id
    +#> 1:     4    2  1
    +#> 2:     4   10  1
    +#> 3:     7    4  1
    +#> 4:     7   22  1
    +#> 5:     8   16  1
    +#> 6:     9   10  1
    +#> 7:    10   18  1
    +#> 8:    10   26  1
    +#> 9:    10   34  1
    +#> 
    +#> [[2]]
    +#>    speed dist id
    +#> 1:    11   17  2
    +#> 2:    11   28  2
    +#> 3:    12   14  2
    +#> 4:    12   20  2
    +#> 5:    12   24  2
    +#> 6:    12   28  2
    +#> 7:    13   26  2
    +#> 8:    13   34  2
    +#> 9:    13   34  2
    +#> 
    +#> [[3]]
    +#>    speed dist id
    +#> 1:    13   46  3
    +#> 2:    14   26  3
    +#> 3:    14   36  3
    +#> 4:    14   60  3
    +#> 5:    14   80  3
    +#> 6:    15   20  3
    +#> 7:    15   26  3
    +#> 8:    15   54  3
    +#> 9:    16   32  3
    +#> 
    +#> [[4]]
    +#>    speed dist id
    +#> 1:    16   40  4
    +#> 2:    17   32  4
    +#> 3:    17   40  4
    +#> 4:    17   50  4
    +#> 5:    18   42  4
    +#> 6:    18   56  4
    +#> 7:    18   76  4
    +#> 8:    18   84  4
    +#> 9:    19   36  4
    +#> 
    +#> [[5]]
    +#>    speed dist id
    +#> 1:    19   46  5
    +#> 2:    19   68  5
    +#> 3:    20   32  5
    +#> 4:    20   48  5
    +#> 5:    20   52  5
    +#> 6:    20   56  5
    +#> 7:    20   64  5
    +#> 8:    22   66  5
    +#> 9:    23   54  5
    +#> 
    +#> [[6]]
    +#>    speed dist id
    +#> 1:    24   70  6
    +#> 2:    24   92  6
    +#> 3:    24   93  6
    +#> 4:    24  120  6
    +#> 5:    25   85  6
    +#> 
    +
    +cimap_dfr(cars.df, ~.x[, id := .y])
    +#>     speed dist id
    +#>  1:     4    2  1
    +#>  2:     4   10  1
    +#>  3:     7    4  1
    +#>  4:     7   22  1
    +#>  5:     8   16  1
    +#>  6:     9   10  1
    +#>  7:    10   18  1
    +#>  8:    10   26  1
    +#>  9:    10   34  1
    +#> 10:    11   17  2
    +#> 11:    11   28  2
    +#> 12:    12   14  2
    +#> 13:    12   20  2
    +#> 14:    12   24  2
    +#> 15:    12   28  2
    +#> 16:    13   26  2
    +#> 17:    13   34  2
    +#> 18:    13   34  2
    +#> 19:    13   46  3
    +#> 20:    14   26  3
    +#> 21:    14   36  3
    +#> 22:    14   60  3
    +#> 23:    14   80  3
    +#> 24:    15   20  3
    +#> 25:    15   26  3
    +#> 26:    15   54  3
    +#> 27:    16   32  3
    +#> 28:    16   40  4
    +#> 29:    17   32  4
    +#> 30:    17   40  4
    +#> 31:    17   50  4
    +#> 32:    18   42  4
    +#> 33:    18   56  4
    +#> 34:    18   76  4
    +#> 35:    18   84  4
    +#> 36:    19   36  4
    +#> 37:    19   46  5
    +#> 38:    19   68  5
    +#> 39:    20   32  5
    +#> 40:    20   48  5
    +#> 41:    20   52  5
    +#> 42:    20   56  5
    +#> 43:    20   64  5
    +#> 44:    22   66  5
    +#> 45:    23   54  5
    +#> 46:    24   70  6
    +#> 47:    24   92  6
    +#> 48:    24   93  6
    +#> 49:    24  120  6
    +#> 50:    25   85  6
    +#>     speed dist id
    +
    +# clean up cars.df
    +delete(cars.df)
    +
    +
    +
    -
    - +
    - - + + diff --git a/docs/reference/cmap2.html b/docs/reference/cmap2.html index 35f65ab5..0910f609 100644 --- a/docs/reference/cmap2.html +++ b/docs/reference/cmap2.html @@ -1,68 +1,13 @@ - - - - - - - -`cmap2` a function to two disk.frames — cmap2 • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -`cmap2` a function to two disk.frames — cmap2 • disk.frame + + - - - - -
    -
    - -
    - -
    +
    @@ -167,175 +97,164 @@

    `cmap2` a function to two disk.frames

    gets run by .f(x.chunk, y.chunk)

    -
    cmap2(.x, .y, .f, ...)
    -
    -map2(.x, .y, .f, ...)
    -
    -map_by_chunk_id(.x, .y, .f, ..., outdir)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - -
    .x

    a disk.frame

    .y

    a disk.frame

    .f

    a function to be called on each chunk of x and y matched by -chunk_id

    ...

    not used

    outdir

    output directory

    +
    +
    cmap2(.x, .y, .f, ...)
     
    +map2(.x, .y, .f, ...)
     
    -    

    Examples

    -
    cars.df = as.disk.frame(cars) +map_by_chunk_id(.x, .y, .f, ..., outdir)
    +
    -cars2.df = cmap2(cars.df, cars.df, ~data.table::rbindlist(list(.x, .y))) -collect(cars2.df) -
    #> speed dist -#> 1: 4 2 -#> 2: 4 10 -#> 3: 7 4 -#> 4: 7 22 -#> 5: 8 16 -#> 6: 9 10 -#> 7: 10 18 -#> 8: 10 26 -#> 9: 10 34 -#> 10: 4 2 -#> 11: 4 10 -#> 12: 7 4 -#> 13: 7 22 -#> 14: 8 16 -#> 15: 9 10 -#> 16: 10 18 -#> 17: 10 26 -#> 18: 10 34 -#> 19: 11 17 -#> 20: 11 28 -#> 21: 12 14 -#> 22: 12 20 -#> 23: 12 24 -#> 24: 12 28 -#> 25: 13 26 -#> 26: 13 34 -#> 27: 13 34 -#> 28: 11 17 -#> 29: 11 28 -#> 30: 12 14 -#> 31: 12 20 -#> 32: 12 24 -#> 33: 12 28 -#> 34: 13 26 -#> 35: 13 34 -#> 36: 13 34 -#> 37: 13 46 -#> 38: 14 26 -#> 39: 14 36 -#> 40: 14 60 -#> 41: 14 80 -#> 42: 15 20 -#> 43: 15 26 -#> 44: 15 54 -#> 45: 16 32 -#> 46: 13 46 -#> 47: 14 26 -#> 48: 14 36 -#> 49: 14 60 -#> 50: 14 80 -#> 51: 15 20 -#> 52: 15 26 -#> 53: 15 54 -#> 54: 16 32 -#> 55: 16 40 -#> 56: 17 32 -#> 57: 17 40 -#> 58: 17 50 -#> 59: 18 42 -#> 60: 18 56 -#> 61: 18 76 -#> 62: 18 84 -#> 63: 19 36 -#> 64: 16 40 -#> 65: 17 32 -#> 66: 17 40 -#> 67: 17 50 -#> 68: 18 42 -#> 69: 18 56 -#> 70: 18 76 -#> 71: 18 84 -#> 72: 19 36 -#> 73: 19 46 -#> 74: 19 68 -#> 75: 20 32 -#> 76: 20 48 -#> 77: 20 52 -#> 78: 20 56 -#> 79: 20 64 -#> 80: 22 66 -#> 81: 23 54 -#> 82: 19 46 -#> 83: 19 68 -#> 84: 20 32 -#> 85: 20 48 -#> 86: 20 52 -#> 87: 20 56 -#> 88: 20 64 -#> 89: 22 66 -#> 90: 23 54 -#> 91: 24 70 -#> 92: 24 92 -#> 93: 24 93 -#> 94: 24 120 -#> 95: 25 85 -#> 96: 24 70 -#> 97: 24 92 -#> 98: 24 93 -#> 99: 24 120 -#> 100: 25 85 -#> speed dist
    -# clean up cars.df -delete(cars.df) -delete(cars2.df) -
    +
    +

    Arguments

    +
    .x
    +

    a disk.frame

    +
    .y
    +

    a disk.frame

    +
    .f
    +

    a function to be called on each chunk of x and y matched by +chunk_id

    +
    ...
    +

    not used

    +
    outdir
    +

    output directory

    +
    + +
    +

    Examples

    +
    cars.df = as.disk.frame(cars)
    +
    +cars2.df = cmap2(cars.df, cars.df, ~data.table::rbindlist(list(.x, .y)))
    +collect(cars2.df)
    +#>      speed dist
    +#>   1:     4    2
    +#>   2:     4   10
    +#>   3:     7    4
    +#>   4:     7   22
    +#>   5:     8   16
    +#>   6:     9   10
    +#>   7:    10   18
    +#>   8:    10   26
    +#>   9:    10   34
    +#>  10:     4    2
    +#>  11:     4   10
    +#>  12:     7    4
    +#>  13:     7   22
    +#>  14:     8   16
    +#>  15:     9   10
    +#>  16:    10   18
    +#>  17:    10   26
    +#>  18:    10   34
    +#>  19:    11   17
    +#>  20:    11   28
    +#>  21:    12   14
    +#>  22:    12   20
    +#>  23:    12   24
    +#>  24:    12   28
    +#>  25:    13   26
    +#>  26:    13   34
    +#>  27:    13   34
    +#>  28:    11   17
    +#>  29:    11   28
    +#>  30:    12   14
    +#>  31:    12   20
    +#>  32:    12   24
    +#>  33:    12   28
    +#>  34:    13   26
    +#>  35:    13   34
    +#>  36:    13   34
    +#>  37:    13   46
    +#>  38:    14   26
    +#>  39:    14   36
    +#>  40:    14   60
    +#>  41:    14   80
    +#>  42:    15   20
    +#>  43:    15   26
    +#>  44:    15   54
    +#>  45:    16   32
    +#>  46:    13   46
    +#>  47:    14   26
    +#>  48:    14   36
    +#>  49:    14   60
    +#>  50:    14   80
    +#>  51:    15   20
    +#>  52:    15   26
    +#>  53:    15   54
    +#>  54:    16   32
    +#>  55:    16   40
    +#>  56:    17   32
    +#>  57:    17   40
    +#>  58:    17   50
    +#>  59:    18   42
    +#>  60:    18   56
    +#>  61:    18   76
    +#>  62:    18   84
    +#>  63:    19   36
    +#>  64:    16   40
    +#>  65:    17   32
    +#>  66:    17   40
    +#>  67:    17   50
    +#>  68:    18   42
    +#>  69:    18   56
    +#>  70:    18   76
    +#>  71:    18   84
    +#>  72:    19   36
    +#>  73:    19   46
    +#>  74:    19   68
    +#>  75:    20   32
    +#>  76:    20   48
    +#>  77:    20   52
    +#>  78:    20   56
    +#>  79:    20   64
    +#>  80:    22   66
    +#>  81:    23   54
    +#>  82:    19   46
    +#>  83:    19   68
    +#>  84:    20   32
    +#>  85:    20   48
    +#>  86:    20   52
    +#>  87:    20   56
    +#>  88:    20   64
    +#>  89:    22   66
    +#>  90:    23   54
    +#>  91:    24   70
    +#>  92:    24   92
    +#>  93:    24   93
    +#>  94:    24  120
    +#>  95:    25   85
    +#>  96:    24   70
    +#>  97:    24   92
    +#>  98:    24   93
    +#>  99:    24  120
    +#> 100:    25   85
    +#>      speed dist
    +
    +# clean up cars.df
    +delete(cars.df)
    +delete(cars2.df)
    +
    +
    +
    -
    - +
    - - + + diff --git a/docs/reference/collect.html b/docs/reference/collect.html index 01995d45..5b195078 100644 --- a/docs/reference/collect.html +++ b/docs/reference/collect.html @@ -1,70 +1,15 @@ - - - - - - - -Bring the disk.frame into R — collect.disk.frame • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Bring the disk.frame into R — collect.disk.frame • disk.frame - - - - - - - - - + + - - - - -
    -
    - -
    - -
    +
    @@ -171,211 +101,205 @@

    Bring the disk.frame into R

    operations as data.table/data.frame or as a list

    -
    # S3 method for disk.frame
    -collect(x, ..., parallel = !is.null(attr(x, "lazyfn")))
    +    
    +
    # S3 method for disk.frame
    +collect(x, ..., parallel = !is.null(attr(x, "lazyfn")))
     
    -collect_list(x, simplify = FALSE, parallel = !is.null(attr(x, "lazyfn")))
    +collect_list(x, simplify = FALSE, parallel = !is.null(attr(x, "lazyfn")))
     
    -# S3 method for summarized_disk.frame
    -collect(x, ..., parallel = !is.null(attr(x, "lazyfn")))
    +# S3 method for summarized_disk.frame +collect(x, ..., parallel = !is.null(attr(x, "lazyfn")))
    +
    -

    Arguments

    - - - - - - - - - - - - - - - - - - -
    x

    a disk.frame

    ...

    not used

    parallel

    if TRUE the collection is performed in parallel. By default +

    +

    Arguments

    +
    x
    +

    a disk.frame

    +
    ...
    +

    not used

    +
    parallel
    +

    if TRUE the collection is performed in parallel. By default if there are delayed/lazy steps then it will be parallel, otherwise it will not be in parallel. This is because parallel requires transferring data from background R session to the current R session and if there is no computation then it's better to avoid transferring data between session, -hence parallel = FALSE is a better choice

    simplify

    Should the result be simplified to array

    - -

    Value

    - -

    collect return a data.frame/data.table

    -

    collect_list returns a list

    -

    collect return a data.frame/data.table

    - -

    Examples

    -
    cars.df = as.disk.frame(cars) -# use collect to bring the data into RAM as a data.table/data.frame -collect(cars.df) -
    #> speed dist -#> 1: 4 2 -#> 2: 4 10 -#> 3: 7 4 -#> 4: 7 22 -#> 5: 8 16 -#> 6: 9 10 -#> 7: 10 18 -#> 8: 10 26 -#> 9: 10 34 -#> 10: 11 17 -#> 11: 11 28 -#> 12: 12 14 -#> 13: 12 20 -#> 14: 12 24 -#> 15: 12 28 -#> 16: 13 26 -#> 17: 13 34 -#> 18: 13 34 -#> 19: 13 46 -#> 20: 14 26 -#> 21: 14 36 -#> 22: 14 60 -#> 23: 14 80 -#> 24: 15 20 -#> 25: 15 26 -#> 26: 15 54 -#> 27: 16 32 -#> 28: 16 40 -#> 29: 17 32 -#> 30: 17 40 -#> 31: 17 50 -#> 32: 18 42 -#> 33: 18 56 -#> 34: 18 76 -#> 35: 18 84 -#> 36: 19 36 -#> 37: 19 46 -#> 38: 19 68 -#> 39: 20 32 -#> 40: 20 48 -#> 41: 20 52 -#> 42: 20 56 -#> 43: 20 64 -#> 44: 22 66 -#> 45: 23 54 -#> 46: 24 70 -#> 47: 24 92 -#> 48: 24 93 -#> 49: 24 120 -#> 50: 25 85 -#> speed dist
    -# clean up -delete(cars.df) -cars.df = as.disk.frame(cars) +hence parallel = FALSE is a better choice

    +
    simplify
    +

    Should the result be simplified to array

    +
    +
    +

    Value

    +

    collect return a data.frame/data.table +collect_list returns a list +collect return a data.frame/data.table

    +
    -# returns the result as a list -collect_list(cmap(cars.df, ~1)) -
    #> [[1]] -#> [1] 1 -#> -#> [[2]] -#> [1] 1 -#> -#> [[3]] -#> [1] 1 -#> -#> [[4]] -#> [1] 1 -#> -#> [[5]] -#> [1] 1 -#> -#> [[6]] -#> [1] 1 -#>
    -# clean up -delete(cars.df) -cars.df = as.disk.frame(cars) -# use collect to bring the data into RAM as a data.table/data.frame -collect(cars.df) -
    #> speed dist -#> 1: 4 2 -#> 2: 4 10 -#> 3: 7 4 -#> 4: 7 22 -#> 5: 8 16 -#> 6: 9 10 -#> 7: 10 18 -#> 8: 10 26 -#> 9: 10 34 -#> 10: 11 17 -#> 11: 11 28 -#> 12: 12 14 -#> 13: 12 20 -#> 14: 12 24 -#> 15: 12 28 -#> 16: 13 26 -#> 17: 13 34 -#> 18: 13 34 -#> 19: 13 46 -#> 20: 14 26 -#> 21: 14 36 -#> 22: 14 60 -#> 23: 14 80 -#> 24: 15 20 -#> 25: 15 26 -#> 26: 15 54 -#> 27: 16 32 -#> 28: 16 40 -#> 29: 17 32 -#> 30: 17 40 -#> 31: 17 50 -#> 32: 18 42 -#> 33: 18 56 -#> 34: 18 76 -#> 35: 18 84 -#> 36: 19 36 -#> 37: 19 46 -#> 38: 19 68 -#> 39: 20 32 -#> 40: 20 48 -#> 41: 20 52 -#> 42: 20 56 -#> 43: 20 64 -#> 44: 22 66 -#> 45: 23 54 -#> 46: 24 70 -#> 47: 24 92 -#> 48: 24 93 -#> 49: 24 120 -#> 50: 25 85 -#> speed dist
    -# clean up -delete(cars.df) -
    +
    +

    Examples

    +
    cars.df = as.disk.frame(cars)
    +# use collect to bring the data into RAM as a data.table/data.frame
    +collect(cars.df)
    +#>     speed dist
    +#>  1:     4    2
    +#>  2:     4   10
    +#>  3:     7    4
    +#>  4:     7   22
    +#>  5:     8   16
    +#>  6:     9   10
    +#>  7:    10   18
    +#>  8:    10   26
    +#>  9:    10   34
    +#> 10:    11   17
    +#> 11:    11   28
    +#> 12:    12   14
    +#> 13:    12   20
    +#> 14:    12   24
    +#> 15:    12   28
    +#> 16:    13   26
    +#> 17:    13   34
    +#> 18:    13   34
    +#> 19:    13   46
    +#> 20:    14   26
    +#> 21:    14   36
    +#> 22:    14   60
    +#> 23:    14   80
    +#> 24:    15   20
    +#> 25:    15   26
    +#> 26:    15   54
    +#> 27:    16   32
    +#> 28:    16   40
    +#> 29:    17   32
    +#> 30:    17   40
    +#> 31:    17   50
    +#> 32:    18   42
    +#> 33:    18   56
    +#> 34:    18   76
    +#> 35:    18   84
    +#> 36:    19   36
    +#> 37:    19   46
    +#> 38:    19   68
    +#> 39:    20   32
    +#> 40:    20   48
    +#> 41:    20   52
    +#> 42:    20   56
    +#> 43:    20   64
    +#> 44:    22   66
    +#> 45:    23   54
    +#> 46:    24   70
    +#> 47:    24   92
    +#> 48:    24   93
    +#> 49:    24  120
    +#> 50:    25   85
    +#>     speed dist
    +
    +# clean up
    +delete(cars.df)
    +cars.df = as.disk.frame(cars)
    +
    +# returns the result as a list
    +collect_list(cmap(cars.df, ~1))
    +#> [[1]]
    +#> [1] 1
    +#> 
    +#> [[2]]
    +#> [1] 1
    +#> 
    +#> [[3]]
    +#> [1] 1
    +#> 
    +#> [[4]]
    +#> [1] 1
    +#> 
    +#> [[5]]
    +#> [1] 1
    +#> 
    +#> [[6]]
    +#> [1] 1
    +#> 
    +
    +# clean up
    +delete(cars.df)
    +cars.df = as.disk.frame(cars)
    +# use collect to bring the data into RAM as a data.table/data.frame
    +collect(cars.df)
    +#>     speed dist
    +#>  1:     4    2
    +#>  2:     4   10
    +#>  3:     7    4
    +#>  4:     7   22
    +#>  5:     8   16
    +#>  6:     9   10
    +#>  7:    10   18
    +#>  8:    10   26
    +#>  9:    10   34
    +#> 10:    11   17
    +#> 11:    11   28
    +#> 12:    12   14
    +#> 13:    12   20
    +#> 14:    12   24
    +#> 15:    12   28
    +#> 16:    13   26
    +#> 17:    13   34
    +#> 18:    13   34
    +#> 19:    13   46
    +#> 20:    14   26
    +#> 21:    14   36
    +#> 22:    14   60
    +#> 23:    14   80
    +#> 24:    15   20
    +#> 25:    15   26
    +#> 26:    15   54
    +#> 27:    16   32
    +#> 28:    16   40
    +#> 29:    17   32
    +#> 30:    17   40
    +#> 31:    17   50
    +#> 32:    18   42
    +#> 33:    18   56
    +#> 34:    18   76
    +#> 35:    18   84
    +#> 36:    19   36
    +#> 37:    19   46
    +#> 38:    19   68
    +#> 39:    20   32
    +#> 40:    20   48
    +#> 41:    20   52
    +#> 42:    20   56
    +#> 43:    20   64
    +#> 44:    22   66
    +#> 45:    23   54
    +#> 46:    24   70
    +#> 47:    24   92
    +#> 48:    24   93
    +#> 49:    24  120
    +#> 50:    25   85
    +#>     speed dist
    +
    +# clean up
    +delete(cars.df)
    +
    +
    + -
    - +
    - - + + diff --git a/docs/reference/colnames.html b/docs/reference/colnames.html index 432d0801..3743a499 100644 --- a/docs/reference/colnames.html +++ b/docs/reference/colnames.html @@ -1,70 +1,15 @@ - - - - - - - -Return the column names of the disk.frame — colnames • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Return the column names of the disk.frame — colnames • disk.frame - - - - - - - - - - - - - + + -
    -
    - -
    - -
    +
    @@ -171,56 +101,48 @@

    Return the column names of the disk.frame

    might take some time.

    -
    colnames(x, ...)
    +    
    +
    colnames(x, ...)
     
    -# S3 method for disk.frame
    -names(x, ...)
    +# S3 method for disk.frame
    +names(x, ...)
     
    -# S3 method for disk.frame
    -colnames(x, ...)
    +# S3 method for disk.frame
    +colnames(x, ...)
     
    -# S3 method for default
    -colnames(x, ...)
    - -

    Arguments

    - - - - - - - - - - -
    x

    a disk.frame

    ...

    not used

    +# S3 method for default +colnames(x, ...)
    +
    +
    +

    Arguments

    +
    x
    +

    a disk.frame

    +
    ...
    +

    not used

    +
    + -
    - +
    - - + + diff --git a/docs/reference/compute.disk.frame.html b/docs/reference/compute.disk.frame.html index 9c62247e..8feaf6eb 100644 --- a/docs/reference/compute.disk.frame.html +++ b/docs/reference/compute.disk.frame.html @@ -1,67 +1,12 @@ - - - - - - - -Compute without writing — compute.disk.frame • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Compute without writing — compute.disk.frame • disk.frame - - + + - - -
    -
    - -
    - -
    +
    @@ -165,76 +95,64 @@

    Compute without writing

    Perform the computation; same as calling cmap without .f and lazy = FALSE

    -
    # S3 method for disk.frame
    -compute(
    -  x,
    -  name,
    -  outdir = tempfile("tmp_df_", fileext = ".df"),
    -  overwrite = TRUE,
    -  ...
    -)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - -
    x

    a disk.frame

    name

    Not used. Kept for compatibility with dplyr

    outdir

    the output directory

    overwrite

    whether to overwrite or not

    ...

    Not used. Kept for dplyr compatibility

    - - -

    Examples

    -
    cars.df = as.disk.frame(cars) -cars.df2 = cars.df %>% cmap(~.x) -# the computation is performed and the data is now stored elsewhere -cars.df3 = compute(cars.df2) - -# clean up -delete(cars.df) -delete(cars.df3) -
    +
    +
    # S3 method for disk.frame
    +compute(
    +  x,
    +  name,
    +  outdir = tempfile("tmp_df_", fileext = ".df"),
    +  overwrite = TRUE,
    +  ...
    +)
    +
    + +
    +

    Arguments

    +
    x
    +

    a disk.frame

    +
    name
    +

    Not used. Kept for compatibility with dplyr

    +
    outdir
    +

    the output directory

    +
    overwrite
    +

    whether to overwrite or not

    +
    ...
    +

    Not used. Kept for dplyr compatibility

    +
    + +
    +

    Examples

    +
    cars.df = as.disk.frame(cars)
    +cars.df2 = cars.df %>% cmap(~.x)
    +# the computation is performed and the data is now stored elsewhere
    +cars.df3 = compute(cars.df2)
    +
    +# clean up
    +delete(cars.df)
    +delete(cars.df3)
    +
    +
    +
    -
    - +
    - - + + diff --git a/docs/reference/create_chunk_mapper.html b/docs/reference/create_chunk_mapper.html index 041c6bb6..7c28d311 100644 --- a/docs/reference/create_chunk_mapper.html +++ b/docs/reference/create_chunk_mapper.html @@ -1,67 +1,12 @@ - - - - - - - -Create function that applies to each chunk if disk.frame — create_chunk_mapper • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Create function that applies to each chunk if disk.frame — create_chunk_mapper • disk.frame - - + + - - -
    -
    - -
    - -
    +
    @@ -165,108 +95,102 @@

    Create function that applies to each chunk if disk.frame

    A function to make it easier to create functions like filter

    -
    create_chunk_mapper(chunk_fn, warning_msg = NULL, as.data.frame = TRUE)
    - -

    Arguments

    - - - - - - - - - - - - - - -
    chunk_fn

    The dplyr function to create a mapper for

    warning_msg

    The warning message to display when invoking the mapper

    as.data.frame

    force the input chunk of a data.frame; needed for dtplyr

    - - -

    Examples

    -
    -filter = create_chunk_mapper(dplyr::filter) - -#' example: creating a function that keeps only the first and last n row -first_and_last <- function(chunk, n, ...) { - nr = nrow(chunk) - print(nr-n+1:nr) - chunk[c(1:n, (nr-n+1):nr), ] -} - -#' create the function for use with disk.frame -first_and_last_df = create_chunk_mapper(first_and_last) - -mtcars.df = as.disk.frame(mtcars) - -#' the operation is lazy -lazy_mtcars.df = mtcars.df %>% - first_and_last_df(2) - -#' bring into R -collect(lazy_mtcars.df) -
    #> [1] 5 6 7 8 9 10 -#> [1] 5 6 7 8 9 10 -#> [1] 5 6 7 8 9 10 -#> [1] 5 6 7 8 9 10 -#> [1] 5 6 7 8 9 10 -#> [1] 1 2
    #> mpg cyl disp hp drat wt qsec vs am gear carb -#> 1: 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4 -#> 2: 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4 -#> 3: 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2 -#> 4: 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1 -#> 5: 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4 -#> 6: 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2 -#> 7: 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4 -#> 8: 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3 -#> 9: 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3 -#> 10: 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3 -#> 11: 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4 -#> 12: 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1 -#> 13: 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2 -#> 14: 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1 -#> 15: 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2 -#> 16: 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4 -#> 17: 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2 -#> 18: 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1 -#> 19: 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4 -#> 20: 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6 -#> 21: 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8 -#> 22: 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2 -#> 23: 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8 -#> 24: 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2 -#> mpg cyl disp hp drat wt qsec vs am gear carb
    -#' clean up -delete(mtcars.df) +
    +
    create_chunk_mapper(chunk_fn, warning_msg = NULL, as.data.frame = TRUE)
    +
    -
    +
    +

    Arguments

    +
    chunk_fn
    +

    The dplyr function to create a mapper for

    +
    warning_msg
    +

    The warning message to display when invoking the mapper

    +
    as.data.frame
    +

    force the input chunk of a data.frame; needed for dtplyr

    +
    + +
    +

    Examples

    +
    
    +filter = create_chunk_mapper(dplyr::filter)
    +
    +#' example: creating a function that keeps only the first and last n row
    +first_and_last <- function(chunk, n, ...) {
    +  nr = nrow(chunk)
    +  print(nr-n+1:nr)
    +  chunk[c(1:n, (nr-n+1):nr), ]
    +}
    +
    +#' create the function for use with disk.frame
    +first_and_last_df = create_chunk_mapper(first_and_last)
    +
    +mtcars.df = as.disk.frame(mtcars)
    +
    +#' the operation is lazy
    +lazy_mtcars.df = mtcars.df %>%
    +  first_and_last_df(2)
    +
    +#' bring into R
    +collect(lazy_mtcars.df)
    +#> [1]  5  6  7  8  9 10
    +#> [1]  5  6  7  8  9 10
    +#> [1]  5  6  7  8  9 10
    +#> [1]  5  6  7  8  9 10
    +#> [1]  5  6  7  8  9 10
    +#> [1] 1 2
    +#>      mpg cyl  disp  hp drat    wt  qsec vs am gear carb
    +#>  1: 21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
    +#>  2: 21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
    +#>  3: 18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
    +#>  4: 18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
    +#>  5: 14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
    +#>  6: 24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
    +#>  7: 17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4
    +#>  8: 16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3
    +#>  9: 17.3   8 275.8 180 3.07 3.730 17.60  0  0    3    3
    +#> 10: 15.2   8 275.8 180 3.07 3.780 18.00  0  0    3    3
    +#> 11: 14.7   8 440.0 230 3.23 5.345 17.42  0  0    3    4
    +#> 12: 32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1
    +#> 13: 30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2
    +#> 14: 33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
    +#> 15: 15.2   8 304.0 150 3.15 3.435 17.30  0  0    3    2
    +#> 16: 13.3   8 350.0 245 3.73 3.840 15.41  0  0    3    4
    +#> 17: 19.2   8 400.0 175 3.08 3.845 17.05  0  0    3    2
    +#> 18: 27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1
    +#> 19: 15.8   8 351.0 264 4.22 3.170 14.50  0  1    5    4
    +#> 20: 19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6
    +#> 21: 15.0   8 301.0 335 3.54 3.570 14.60  0  1    5    8
    +#> 22: 21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2
    +#> 23: 15.0   8 301.0 335 3.54 3.570 14.60  0  1    5    8
    +#> 24: 21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2
    +#>      mpg cyl  disp  hp drat    wt  qsec vs am gear carb
    +
    +#' clean up
    +delete(mtcars.df)
    +
    +
    +
    +
    -
    - +
    - - + + diff --git a/docs/reference/csv_to_disk.frame.html b/docs/reference/csv_to_disk.frame.html index 687aa9d8..17b0d387 100644 --- a/docs/reference/csv_to_disk.frame.html +++ b/docs/reference/csv_to_disk.frame.html @@ -1,67 +1,12 @@ - - - - - - - -Convert CSV file(s) to disk.frame format — csv_to_disk.frame • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Convert CSV file(s) to disk.frame format — csv_to_disk.frame • disk.frame + + - - - - -
    -
    - -
    - -
    +
    @@ -165,142 +95,115 @@

    Convert CSV file(s) to disk.frame format

    Convert CSV file(s) to disk.frame format

    -
    csv_to_disk.frame(
    -  infile,
    -  outdir = tempfile(fileext = ".df"),
    -  inmapfn = base::I,
    -  nchunks = recommend_nchunks(sum(file.size(infile))),
    -  in_chunk_size = NULL,
    -  shardby = NULL,
    -  compress = 50,
    -  overwrite = TRUE,
    -  header = TRUE,
    -  .progress = TRUE,
    -  backend = c("data.table", "readr", "LaF"),
    -  chunk_reader = c("bigreadr", "data.table", "readr", "readLines"),
    -  ...
    -)
    +
    +
    csv_to_disk.frame(
    +  infile,
    +  outdir = tempfile(fileext = ".df"),
    +  inmapfn = base::I,
    +  nchunks = recommend_nchunks(sum(file.size(infile))),
    +  in_chunk_size = NULL,
    +  shardby = NULL,
    +  compress = 50,
    +  overwrite = TRUE,
    +  header = TRUE,
    +  .progress = TRUE,
    +  backend = c("data.table", "readr", "LaF"),
    +  chunk_reader = c("bigreadr", "data.table", "readr", "readLines"),
    +  ...
    +)
    +
    -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    infile

    The input CSV file or files

    outdir

    The directory to output the disk.frame to

    inmapfn

    A function to be applied to the chunk read in from CSV before +

    +

    Arguments

    +
    infile
    +

    The input CSV file or files

    +
    outdir
    +

    The directory to output the disk.frame to

    +
    inmapfn
    +

    A function to be applied to the chunk read in from CSV before the chunk is being written out. Commonly used to perform simple -transformations. Defaults to the identity function (ie. no transformation)

    nchunks

    Number of chunks to output

    in_chunk_size

    When reading in the file, how many lines to read in at +transformations. Defaults to the identity function (ie. no transformation)

    +
    nchunks
    +

    Number of chunks to output

    +
    in_chunk_size
    +

    When reading in the file, how many lines to read in at once. This is different to nchunks which controls how many chunks are -output

    shardby

    The column(s) to shard the data by. For example suppose +output

    +
    shardby
    +

    The column(s) to shard the data by. For example suppose `shardby = c("col1","col2")` then every row where the values `col1` and `col2` are the same will end up in the same chunk; this will allow merging -by `col1` and `col2` to be more efficient

    compress

    For fst backends it's a number between 0 and 100 where 100 is -the highest compression ratio.

    overwrite

    Whether to overwrite the existing directory

    header

    Whether the files have header. Defaults to TRUE

    .progress

    A logical, for whether or not to print a progress bar for -multiprocess, multisession, and multicore plans. From furrr

    backend

    The CSV reader backend to choose: "data.table" or "readr". +by `col1` and `col2` to be more efficient

    +
    compress
    +

    For fst backends it's a number between 0 and 100 where 100 is +the highest compression ratio.

    +
    overwrite
    +

    Whether to overwrite the existing directory

    +
    header
    +

    Whether the files have header. Defaults to TRUE

    +
    .progress
    +

    A logical, for whether or not to print a progress bar for +multiprocess, multisession, and multicore plans. From furrr

    +
    backend
    +

    The CSV reader backend to choose: "data.table" or "readr". disk.frame does not have its own CSV reader. It uses either data.table::fread or readr::read_delimited. It is worth noting that data.table::fread does not detect dates and all dates are imported as strings, and you are encouraged to use fasttime to convert the strings to date. You can use the `inmapfn` to do that. However, if you want automatic date detection, then backend="readr" may suit your needs. However, readr -is often slower than data.table, hence data.table is chosen as the default.

    chunk_reader

    Even if you choose a backend there can still be multiple +is often slower than data.table, hence data.table is chosen as the default.

    +
    chunk_reader
    +

    Even if you choose a backend there can still be multiple strategies on how to approach the CSV reads. For example, data.table::fread tries to mmap the whole file which can cause the whole read process to fail. In that case we can change the chunk_reader to "readLines" which uses the readLines function to read chunk by chunk and still use data.table::fread to process the chunks. There are currently no strategies for readr backend, -except the default one.

    ...

    passed to data.table::fread, disk.frame::as.disk.frame, -disk.frame::shard

    - -

    See also

    - -

    Other ingesting data: -zip_to_disk.frame()

    - -

    Examples

    -
    tmpfile = tempfile() -write.csv(cars, tmpfile) -tmpdf = tempfile(fileext = ".df") -df = csv_to_disk.frame(tmpfile, outdir = tmpdf, overwrite = TRUE) +except the default one.

    +
    ...
    +

    passed to data.table::fread, disk.frame::as.disk.frame, +disk.frame::shard

    +
    +
    +

    See also

    +

    Other ingesting data: +zip_to_disk.frame()

    +
    -# clean up -fs::file_delete(tmpfile) -delete(df) -
    +
    +

    Examples

    +
    tmpfile = tempfile()
    +write.csv(cars, tmpfile)
    +tmpdf = tempfile(fileext = ".df")
    +df = csv_to_disk.frame(tmpfile, outdir = tmpdf, overwrite = TRUE)
    +
    +# clean up
    +fs::file_delete(tmpfile)
    +delete(df)
    +
    +
    +
    -
    - +
    - - + + diff --git a/docs/reference/delete.html b/docs/reference/delete.html index d890c7ba..2a238505 100644 --- a/docs/reference/delete.html +++ b/docs/reference/delete.html @@ -1,67 +1,12 @@ - - - - - - - -Delete a disk.frame — delete • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Delete a disk.frame — delete • disk.frame - - + + - - -
    -
    - -
    - -
    +
    @@ -165,47 +95,43 @@

    Delete a disk.frame

    Delete a disk.frame

    -
    delete(df)
    - -

    Arguments

    - - - - - - -
    df

    a disk.frame

    - +
    +
    delete(df)
    +
    -

    Examples

    -
    cars.df = as.disk.frame(cars) -delete(cars.df) -
    +
    +

    Arguments

    +
    df
    +

    a disk.frame

    +
    + +
    +

    Examples

    +
    cars.df = as.disk.frame(cars)
    +delete(cars.df)
    +
    +
    +
    -
    - +
    - - + + diff --git a/docs/reference/df_ram_size.html b/docs/reference/df_ram_size.html index c3ac569a..57bfaa8d 100644 --- a/docs/reference/df_ram_size.html +++ b/docs/reference/df_ram_size.html @@ -1,67 +1,12 @@ - - - - - - - -Get the size of RAM in gigabytes — df_ram_size • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Get the size of RAM in gigabytes — df_ram_size • disk.frame + + - - - - -
    -
    - -
    - -
    +
    @@ -165,42 +95,43 @@

    Get the size of RAM in gigabytes

    Get the size of RAM in gigabytes

    -
    df_ram_size()
    - - -

    Value

    +
    +
    df_ram_size()
    +
    +
    +

    Value

    integer of RAM in gigabyte (GB)

    +
    -

    Examples

    -
    # returns the RAM size in gigabyte (GB) -df_ram_size() -
    #> [1] 64
    +
    +

    Examples

    +
    # returns the RAM size in gigabyte (GB)
    +df_ram_size() 
    +#> [1] 64
    +
    +
    +
    -
    - +
    - - + + diff --git a/docs/reference/dfglm.html b/docs/reference/dfglm.html index ee6fc36b..1197ac98 100644 --- a/docs/reference/dfglm.html +++ b/docs/reference/dfglm.html @@ -1,68 +1,13 @@ - - - - - - - -Fit generalized linear models (glm) with disk.frame — dfglm • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Fit generalized linear models (glm) with disk.frame — dfglm • disk.frame - - - - + + -
    -
    - -
    - -
    +
    @@ -167,37 +97,30 @@

    Fit generalized linear models (glm) with disk.frame

    those return by those functions. This is a convenience wrapper

    -
    dfglm(formula, data, ..., glm_backend = c("biglm", "speedglm", "biglmm"))
    +
    +
    dfglm(formula, data, ..., glm_backend = c("biglm", "speedglm", "biglmm"))
    +
    -

    Arguments

    - - - - - - - - - - - - - - - - - - -
    formula

    A model formula

    data

    See Details below. Method dispatch is on this argument

    ...

    Additional arguments

    glm_backend

    Which package to use for fitting GLMs. The default is +

    +

    Arguments

    +
    formula
    +

    A model formula

    +
    data
    +

    See Details below. Method dispatch is on this argument

    +
    ...
    +

    Additional arguments

    +
    glm_backend
    +

    Which package to use for fitting GLMs. The default is "biglm", which has known issues with factor level if different levels are present in different chunks. The "speedglm" option is more robust, but does not -implement `predict` which makes prediction and implementation impossible.

    - -

    Value

    - +implement `predict` which makes prediction and implementation impossible.

    +
    +
    +

    Value

    An object of class bigglm

    -

    Details

    - +
    +
    +

    Details

    The data argument may be a function, a data frame, or a SQLiteConnection or RODBC connection object.

    When it is a function the function must take a single argument @@ -217,63 +140,66 @@

    Details variables needed for the model, not the whole table. The code in the SQLiteConnection method should work for other DBI connections, but I do not have any of these to check it with.

    -

    References

    - +
    +
    +

    References

    Algorithm AS274 Applied Statistics (1992) Vol.41, No. 2

    -

    See also

    - -

    Other Machine Learning (ML): -make_glm_streaming_fn()

    +
    +
    +

    See also

    +

    Other Machine Learning (ML): +make_glm_streaming_fn()

    +
    -

    Examples

    -
    cars.df = as.disk.frame(cars) -m = dfglm(dist ~ speed, data = cars.df) -
    #> Loading required namespace: biglm
    -# can use normal R functions -# Only works in version > R 3.6 -majorv = as.integer(version$major) -minorv = as.integer(strsplit(version$minor, ".", fixed=TRUE)[[1]][1]) -if(((majorv == 3) & (minorv >= 6)) | (majorv > 3)) { - summary(m) - predict(m, get_chunk(cars.df, 1)) - predict(m, collect(cars.df)) - # can use broom to tidy up the returned info - broom::tidy(m) -} -
    #> # A tibble: 2 x 4 -#> term estimate std.error p.value -#> <chr> <dbl> <dbl> <dbl> -#> 1 (Intercept) -17.6 6.76 9.29e- 3 -#> 2 speed 3.93 0.416 2.96e-21
    -# clean up -delete(cars.df) -
    +
    +

    Examples

    +
    cars.df = as.disk.frame(cars)
    +m = dfglm(dist ~ speed, data = cars.df)
    +#> Loading required namespace: biglm
    +
    +# can use normal R functions
    +# Only works in version > R 3.6
    +majorv = as.integer(version$major)
    +minorv = as.integer(strsplit(version$minor, ".", fixed=TRUE)[[1]][1])
    +if(((majorv == 3) & (minorv >= 6)) | (majorv > 3)) {
    +  summary(m)
    +  predict(m, get_chunk(cars.df, 1))
    +  predict(m, collect(cars.df))
    +  # can use broom to tidy up the returned info
    +  broom::tidy(m)
    +}
    +#> # A tibble: 2 x 4
    +#>   term        estimate std.error  p.value
    +#>   <chr>          <dbl>     <dbl>    <dbl>
    +#> 1 (Intercept)   -17.6      6.76  9.29e- 3
    +#> 2 speed           3.93     0.416 2.96e-21
    +
    +# clean up
    +delete(cars.df)
    +
    +
    +
    -
    - +
    - - + + diff --git a/docs/reference/disk.frame.html b/docs/reference/disk.frame.html index 574c9440..0d2320c2 100644 --- a/docs/reference/disk.frame.html +++ b/docs/reference/disk.frame.html @@ -1,67 +1,12 @@ - - - - - - - -Create a disk.frame from a folder — disk.frame • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Create a disk.frame from a folder — disk.frame • disk.frame - - + + - - -
    -
    - -
    - -
    +
    @@ -165,67 +95,64 @@

    Create a disk.frame from a folder

    Create a disk.frame from a folder

    -
    disk.frame(path, backend = "fst")
    - -

    Arguments

    - - - - - - - - - - -
    path

    The path to store the output file or to a directory

    backend

    The only available backend is fst at the moment

    - - -

    Examples

    -
    path = file.path(tempdir(),"cars") -as.disk.frame(cars, outdir=path, overwrite = TRUE, nchunks = 2) -
    #> path: "C:\Users\RTX2080\AppData\Local\Temp\Rtmp2PGAkP/cars" -#> nchunks: 2 -#> nrow (at source): 50 -#> ncol (at source): 2 -#> nrow (post operations): ??? -#> ncol (post operations): ???
    df = disk.frame(path) -head(df) -
    #> speed dist -#> 1: 4 2 -#> 2: 4 10 -#> 3: 7 4 -#> 4: 7 22 -#> 5: 8 16 -#> 6: 9 10
    nchunks(df) -
    #> [1] 2
    # clean up -delete(df) -
    +
    +
    disk.frame(path, backend = "fst")
    +
    + +
    +

    Arguments

    +
    path
    +

    The path to store the output file or to a directory

    +
    backend
    +

    The only available backend is fst at the moment

    +
    + +
    +

    Examples

    +
    path = file.path(tempdir(),"cars")
    +as.disk.frame(cars, outdir=path, overwrite = TRUE, nchunks = 2)
    +#> path: "C:\Users\RTX2080\AppData\Local\Temp\Rtmp2rQjw5/cars"
    +#> nchunks: 2
    +#> nrow (at source): 50
    +#> ncol (at source): 2
    +#> nrow (post operations): ???
    +#> ncol (post operations): ???
    +df = disk.frame(path)
    +head(df)
    +#>    speed dist
    +#> 1:     4    2
    +#> 2:     4   10
    +#> 3:     7    4
    +#> 4:     7   22
    +#> 5:     8   16
    +#> 6:     9   10
    +nchunks(df)
    +#> [1] 2
    +# clean up
    +delete(df)
    +
    +
    +
    -
    - +
    - - + + diff --git a/docs/reference/dplyr_verbs.html b/docs/reference/dplyr_verbs.html index 9f86f10e..4cbb0612 100644 --- a/docs/reference/dplyr_verbs.html +++ b/docs/reference/dplyr_verbs.html @@ -1,68 +1,13 @@ - - - - - - - -The dplyr verbs implemented for disk.frame — select.disk.frame • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -The dplyr verbs implemented for disk.frame — select.disk.frame • disk.frame + + - - - - -
    -
    - -
    - -
    +
    @@ -167,94 +97,88 @@

    The dplyr verbs implemented for disk.frame

    `chunk_arrange` performs the actions within each chunk

    -
    # S3 method for disk.frame
    -select(.data, ...)
    -
    -# S3 method for disk.frame
    -rename(.data, ...)
    -
    -# S3 method for disk.frame
    -filter(.data, ...)
    -
    -# S3 method for disk.frame
    -mutate(.data, ...)
    -
    -# S3 method for disk.frame
    -transmute(.data, ...)
    +    
    +
    # S3 method for disk.frame
    +select(.data, ...)
     
    -# S3 method for disk.frame
    -arrange(.data, ...)
    +# S3 method for disk.frame
    +rename(.data, ...)
     
    -chunk_arrange(.data, ...)
    +# S3 method for disk.frame
    +filter(.data, ...)
     
    -add_tally.disk.frame(.data, ...)
    +# S3 method for disk.frame
    +mutate(.data, ...)
     
    -# S3 method for disk.frame
    -do(.data, ...)
    +# S3 method for disk.frame
    +transmute(.data, ...)
     
    -# S3 method for disk.frame
    -distinct(...)
    +# S3 method for disk.frame
    +arrange(.data, ...)
     
    -chunk_distinct(.data, ...)
    +chunk_arrange(.data, ...)
     
    -# S3 method for disk.frame
    -glimpse(.data, ...)
    +add_tally.disk.frame(.data, ...) -

    Arguments

    - - - - - - - - - - -
    .data

    a disk.frame

    ...

    Same as the dplyr functions

    +# S3 method for disk.frame +do(.data, ...) +# S3 method for disk.frame +distinct(...) -

    Examples

    -
    library(dplyr) -cars.df = as.disk.frame(cars) -mult = 2 +chunk_distinct(.data, ...) -# use all any of the supported dplyr -cars2 = cars.df %>% - select(speed) %>% - mutate(speed2 = speed * mult) %>% - filter(speed < 50) %>% - rename(speed1 = speed) %>% - collect +# S3 method for disk.frame +glimpse(.data, ...)
    +
    -# clean up cars.df -delete(cars.df) -
    +
    +

    Arguments

    +
    .data
    +

    a disk.frame

    +
    ...
    +

    Same as the dplyr functions

    +
    + +
    +

    Examples

    +
    library(dplyr)
    +cars.df = as.disk.frame(cars)
    +mult = 2
    +
    +# use all any of the supported dplyr
    +cars2 = cars.df %>% 
    +  select(speed) %>% 
    +  mutate(speed2 = speed * mult) %>% 
    +  filter(speed < 50) %>% 
    +  rename(speed1 = speed) %>% 
    +  collect
    +
    +# clean up cars.df
    +delete(cars.df)
    +
    +
    +
    -
    - +
    - - + + diff --git a/docs/reference/evalparseglue.html b/docs/reference/evalparseglue.html index ef02aa33..4172714e 100644 --- a/docs/reference/evalparseglue.html +++ b/docs/reference/evalparseglue.html @@ -1,67 +1,12 @@ - - - - - - - -Helper function to evalparse some `glue::glue` string — evalparseglue • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Helper function to evalparse some `glue::glue` string — evalparseglue • disk.frame - - - - + + -
    -
    - -
    - -
    +
    @@ -165,47 +95,39 @@

    Helper function to evalparse some `glue::glue` string

    Helper function to evalparse some `glue::glue` string

    -
    evalparseglue(code, env = parent.frame())
    - -

    Arguments

    - - - - - - - - - - -
    code

    the code in character(string) format to evaluate

    env

    the environment in which to evaluate the code

    +
    +
    evalparseglue(code, env = parent.frame())
    +
    +
    +

    Arguments

    +
    code
    +

    the code in character(string) format to evaluate

    +
    env
    +

    the environment in which to evaluate the code

    +
    +
    -
    - +
    - - + + diff --git a/docs/reference/foverlaps.disk.frame.html b/docs/reference/foverlaps.disk.frame.html index df3534db..19607792 100644 --- a/docs/reference/foverlaps.disk.frame.html +++ b/docs/reference/foverlaps.disk.frame.html @@ -1,67 +1,12 @@ - - - - - - - -Apply data.table's foverlaps to the disk.frame — foverlaps.disk.frame • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Apply data.table's foverlaps to the disk.frame — foverlaps.disk.frame • disk.frame + + - - - - -
    -
    - -
    - -
    +
    @@ -165,101 +95,81 @@

    Apply data.table's foverlaps to the disk.frame

    EXPERIMENTAL

    -
    foverlaps.disk.frame(
    -  df1,
    -  df2,
    -  by.x = if (identical(shardkey(df1)$shardkey, "")) shardkey(df1)$shardkey else
    -    shardkey(df2)$shardkey,
    -  by.y = shardkey(df2)$shardkey,
    -  ...,
    -  outdir = tempfile("df_foverlaps_tmp", fileext = ".df"),
    -  merge_by_chunk_id = FALSE,
    -  compress = 50,
    -  overwrite = TRUE
    -)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    df1

    A disk.frame

    df2

    A disk.frame or a data.frame

    by.x

    character/string vector. by.x used in foverlaps

    by.y

    character/string vector. by.x used in foverlaps

    ...

    passed to data.table::foverlaps and disk.frame::cmap.disk.frame

    outdir

    The output directory of the disk.frame

    merge_by_chunk_id

    If TRUE then the merges will happen for chunks in df1 and df2 with the same chunk id which speed up processing. Otherwise every chunk of df1 is merged with every chunk of df2. Ignored with df2 is not a disk.frame

    compress

    The compression ratio for fst

    overwrite

    overwrite existing directory

    - - -

    Examples

    -
    library(data.table) +
    +
    foverlaps.disk.frame(
    +  df1,
    +  df2,
    +  by.x = if (identical(shardkey(df1)$shardkey, "")) shardkey(df1)$shardkey else
    +    shardkey(df2)$shardkey,
    +  by.y = shardkey(df2)$shardkey,
    +  ...,
    +  outdir = tempfile("df_foverlaps_tmp", fileext = ".df"),
    +  merge_by_chunk_id = FALSE,
    +  compress = 50,
    +  overwrite = TRUE
    +)
    +
    -## simple example: -x = as.disk.frame(data.table(start=c(5,31,22,16), end=c(8,50,25,18), val2 = 7:10)) -y = as.disk.frame(data.table(start=c(10, 20, 30), end=c(15, 35, 45), val1 = 1:3)) -byxy = c("start", "end") -xy.df = foverlaps.disk.frame( - x, y, by.x = byxy, by.y = byxy, - merge_by_chunk_id = TRUE, overwrite = TRUE) -# clean up -delete(x) -delete(y) -delete(xy.df) -
    +
    +

    Arguments

    +
    df1
    +

    A disk.frame

    +
    df2
    +

    A disk.frame or a data.frame

    +
    by.x
    +

    character/string vector. by.x used in foverlaps

    +
    by.y
    +

    character/string vector. by.x used in foverlaps

    +
    ...
    +

    passed to data.table::foverlaps and disk.frame::cmap.disk.frame

    +
    outdir
    +

    The output directory of the disk.frame

    +
    merge_by_chunk_id
    +

    If TRUE then the merges will happen for chunks in df1 and df2 with the same chunk id which speed up processing. Otherwise every chunk of df1 is merged with every chunk of df2. Ignored with df2 is not a disk.frame

    +
    compress
    +

    The compression ratio for fst

    +
    overwrite
    +

    overwrite existing directory

    +
    + +
    +

    Examples

    +
    library(data.table)
    +
    +## simple example:
    +x = as.disk.frame(data.table(start=c(5,31,22,16), end=c(8,50,25,18), val2 = 7:10))
    +y = as.disk.frame(data.table(start=c(10, 20, 30), end=c(15, 35, 45), val1 = 1:3))
    +byxy = c("start", "end")
    +xy.df = foverlaps.disk.frame(
    +   x, y, by.x = byxy, by.y = byxy,
    +  merge_by_chunk_id = TRUE, overwrite = TRUE)
    +# clean up
    +delete(x)
    +delete(y)
    +delete(xy.df)
    +
    +
    +
    -
    - +
    - - + + diff --git a/docs/reference/gen_datatable_synthetic.html b/docs/reference/gen_datatable_synthetic.html index 9a068fca..608041f6 100644 --- a/docs/reference/gen_datatable_synthetic.html +++ b/docs/reference/gen_datatable_synthetic.html @@ -1,67 +1,12 @@ - - - - - - - -Generate synthetic dataset for testing — gen_datatable_synthetic • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Generate synthetic dataset for testing — gen_datatable_synthetic • disk.frame - - - - + + -
    -
    - -
    - -
    +
    @@ -165,47 +95,39 @@

    Generate synthetic dataset for testing

    Generate synthetic dataset for testing

    -
    gen_datatable_synthetic(N = 2e+08, K = 100)
    - -

    Arguments

    - - - - - - - - - - -
    N

    number of rows. Defaults to 200 million

    K

    controls the number of unique values for id. Some ids will have K distinct values while others have N/K distinct values

    +
    +
    gen_datatable_synthetic(N = 2e+08, K = 100)
    +
    +
    +

    Arguments

    +
    N
    +

    number of rows. Defaults to 200 million

    +
    K
    +

    controls the number of unique values for id. Some ids will have K distinct values while others have N/K distinct values

    +
    +
    -
    - +
    - - + + diff --git a/docs/reference/get_chunk.html b/docs/reference/get_chunk.html index 517e982e..66df5c10 100644 --- a/docs/reference/get_chunk.html +++ b/docs/reference/get_chunk.html @@ -1,67 +1,12 @@ - - - - - - - -Obtain one chunk by chunk id — get_chunk • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Obtain one chunk by chunk id — get_chunk • disk.frame - - + + - - -
    -
    - -
    - -
    +
    @@ -165,178 +95,170 @@

    Obtain one chunk by chunk id

    Obtain one chunk by chunk id

    -
    get_chunk(...)
    -
    -# S3 method for disk.frame
    -get_chunk(df, n, keep = NULL, full.names = FALSE, ...)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - -
    ...

    passed to fst::read_fst or whichever read function is used in the backend

    df

    a disk.frame

    n

    the chunk id. If numeric then matches by number, if character then returns the chunk with the same name as n

    keep

    the columns to keep

    full.names

    whether n is the full path to the chunks or just a relative path file name. Ignored if n is numeric

    +
    +
    get_chunk(...)
     
    +# S3 method for disk.frame
    +get_chunk(df, n, keep = NULL, full.names = FALSE, ...)
    +
    -

    Examples

    -
    cars.df = as.disk.frame(cars, nchunks = 2) -get_chunk(cars.df, 1) -
    #> speed dist -#> 1: 4 2 -#> 2: 4 10 -#> 3: 7 4 -#> 4: 7 22 -#> 5: 8 16 -#> 6: 9 10 -#> 7: 10 18 -#> 8: 10 26 -#> 9: 10 34 -#> 10: 11 17 -#> 11: 11 28 -#> 12: 12 14 -#> 13: 12 20 -#> 14: 12 24 -#> 15: 12 28 -#> 16: 13 26 -#> 17: 13 34 -#> 18: 13 34 -#> 19: 13 46 -#> 20: 14 26 -#> 21: 14 36 -#> 22: 14 60 -#> 23: 14 80 -#> 24: 15 20 -#> 25: 15 26 -#> speed dist
    get_chunk(cars.df, 2) -
    #> speed dist -#> 1: 15 54 -#> 2: 16 32 -#> 3: 16 40 -#> 4: 17 32 -#> 5: 17 40 -#> 6: 17 50 -#> 7: 18 42 -#> 8: 18 56 -#> 9: 18 76 -#> 10: 18 84 -#> 11: 19 36 -#> 12: 19 46 -#> 13: 19 68 -#> 14: 20 32 -#> 15: 20 48 -#> 16: 20 52 -#> 17: 20 56 -#> 18: 20 64 -#> 19: 22 66 -#> 20: 23 54 -#> 21: 24 70 -#> 22: 24 92 -#> 23: 24 93 -#> 24: 24 120 -#> 25: 25 85 -#> speed dist
    get_chunk(cars.df, 1, keep = "speed") -
    #> speed -#> 1: 4 -#> 2: 4 -#> 3: 7 -#> 4: 7 -#> 5: 8 -#> 6: 9 -#> 7: 10 -#> 8: 10 -#> 9: 10 -#> 10: 11 -#> 11: 11 -#> 12: 12 -#> 13: 12 -#> 14: 12 -#> 15: 12 -#> 16: 13 -#> 17: 13 -#> 18: 13 -#> 19: 13 -#> 20: 14 -#> 21: 14 -#> 22: 14 -#> 23: 14 -#> 24: 15 -#> 25: 15 -#> speed
    -# if full.names = TRUE then the full path to the chunk need to be provided -get_chunk(cars.df, file.path(attr(cars.df, "path"), "1.fst"), full.names = TRUE) -
    #> speed dist -#> 1: 4 2 -#> 2: 4 10 -#> 3: 7 4 -#> 4: 7 22 -#> 5: 8 16 -#> 6: 9 10 -#> 7: 10 18 -#> 8: 10 26 -#> 9: 10 34 -#> 10: 11 17 -#> 11: 11 28 -#> 12: 12 14 -#> 13: 12 20 -#> 14: 12 24 -#> 15: 12 28 -#> 16: 13 26 -#> 17: 13 34 -#> 18: 13 34 -#> 19: 13 46 -#> 20: 14 26 -#> 21: 14 36 -#> 22: 14 60 -#> 23: 14 80 -#> 24: 15 20 -#> 25: 15 26 -#> speed dist
    -# clean up cars.df -delete(cars.df) -
    +
    +

    Arguments

    +
    ...
    +

    passed to fst::read_fst or whichever read function is used in the backend

    +
    df
    +

    a disk.frame

    +
    n
    +

    the chunk id. If numeric then matches by number, if character then returns the chunk with the same name as n

    +
    keep
    +

    the columns to keep

    +
    full.names
    +

    whether n is the full path to the chunks or just a relative path file name. Ignored if n is numeric

    +
    + +
    +

    Examples

    +
    cars.df = as.disk.frame(cars, nchunks = 2)
    +get_chunk(cars.df, 1)
    +#>     speed dist
    +#>  1:     4    2
    +#>  2:     4   10
    +#>  3:     7    4
    +#>  4:     7   22
    +#>  5:     8   16
    +#>  6:     9   10
    +#>  7:    10   18
    +#>  8:    10   26
    +#>  9:    10   34
    +#> 10:    11   17
    +#> 11:    11   28
    +#> 12:    12   14
    +#> 13:    12   20
    +#> 14:    12   24
    +#> 15:    12   28
    +#> 16:    13   26
    +#> 17:    13   34
    +#> 18:    13   34
    +#> 19:    13   46
    +#> 20:    14   26
    +#> 21:    14   36
    +#> 22:    14   60
    +#> 23:    14   80
    +#> 24:    15   20
    +#> 25:    15   26
    +#>     speed dist
    +get_chunk(cars.df, 2)
    +#>     speed dist
    +#>  1:    15   54
    +#>  2:    16   32
    +#>  3:    16   40
    +#>  4:    17   32
    +#>  5:    17   40
    +#>  6:    17   50
    +#>  7:    18   42
    +#>  8:    18   56
    +#>  9:    18   76
    +#> 10:    18   84
    +#> 11:    19   36
    +#> 12:    19   46
    +#> 13:    19   68
    +#> 14:    20   32
    +#> 15:    20   48
    +#> 16:    20   52
    +#> 17:    20   56
    +#> 18:    20   64
    +#> 19:    22   66
    +#> 20:    23   54
    +#> 21:    24   70
    +#> 22:    24   92
    +#> 23:    24   93
    +#> 24:    24  120
    +#> 25:    25   85
    +#>     speed dist
    +get_chunk(cars.df, 1, keep = "speed")
    +#>     speed
    +#>  1:     4
    +#>  2:     4
    +#>  3:     7
    +#>  4:     7
    +#>  5:     8
    +#>  6:     9
    +#>  7:    10
    +#>  8:    10
    +#>  9:    10
    +#> 10:    11
    +#> 11:    11
    +#> 12:    12
    +#> 13:    12
    +#> 14:    12
    +#> 15:    12
    +#> 16:    13
    +#> 17:    13
    +#> 18:    13
    +#> 19:    13
    +#> 20:    14
    +#> 21:    14
    +#> 22:    14
    +#> 23:    14
    +#> 24:    15
    +#> 25:    15
    +#>     speed
    +
    +# if full.names = TRUE then the full path to the chunk need to be provided
    +get_chunk(cars.df, file.path(attr(cars.df, "path"), "1.fst"), full.names = TRUE)
    +#>     speed dist
    +#>  1:     4    2
    +#>  2:     4   10
    +#>  3:     7    4
    +#>  4:     7   22
    +#>  5:     8   16
    +#>  6:     9   10
    +#>  7:    10   18
    +#>  8:    10   26
    +#>  9:    10   34
    +#> 10:    11   17
    +#> 11:    11   28
    +#> 12:    12   14
    +#> 13:    12   20
    +#> 14:    12   24
    +#> 15:    12   28
    +#> 16:    13   26
    +#> 17:    13   34
    +#> 18:    13   34
    +#> 19:    13   46
    +#> 20:    14   26
    +#> 21:    14   36
    +#> 22:    14   60
    +#> 23:    14   80
    +#> 24:    15   20
    +#> 25:    15   26
    +#>     speed dist
    +
    +# clean up cars.df
    +delete(cars.df)
    +
    +
    +
    -
    - +
    - - + + diff --git a/docs/reference/get_chunk_ids.html b/docs/reference/get_chunk_ids.html index b6b80fac..d54c8f15 100644 --- a/docs/reference/get_chunk_ids.html +++ b/docs/reference/get_chunk_ids.html @@ -1,67 +1,12 @@ - - - - - - - -Get the chunk IDs and files names — get_chunk_ids • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Get the chunk IDs and files names — get_chunk_ids • disk.frame - - + + - - -
    -
    - -
    - -
    +
    @@ -165,75 +95,68 @@

    Get the chunk IDs and files names

    Get the chunk IDs and files names

    -
    get_chunk_ids(df, ..., full.names = FALSE, strip_extension = TRUE)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - -
    df

    a disk.frame

    ...

    passed to list.files

    full.names

    If TRUE returns the full path to the file, Defaults to FALSE

    strip_extension

    If TRUE then the file extension in the chunk_id is removed. Defaults to TRUE

    - - -

    Examples

    -
    cars.df = as.disk.frame(cars) - -# return the integer-string chunk IDs -get_chunk_ids(cars.df) -
    #> [1] "1" "2" "3" "4" "5" "6"
    -# return the file name chunk IDs -get_chunk_ids(cars.df, full.names = TRUE) -
    #> [1] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\Rtmp2PGAkP\\file4b14668a2842.df/1.fst" -#> [2] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\Rtmp2PGAkP\\file4b14668a2842.df/2.fst" -#> [3] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\Rtmp2PGAkP\\file4b14668a2842.df/3.fst" -#> [4] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\Rtmp2PGAkP\\file4b14668a2842.df/4.fst" -#> [5] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\Rtmp2PGAkP\\file4b14668a2842.df/5.fst" -#> [6] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\Rtmp2PGAkP\\file4b14668a2842.df/6.fst"
    -# return the file name chunk IDs with file extension -get_chunk_ids(cars.df, strip_extension = FALSE) -
    #> [1] "1.fst" "2.fst" "3.fst" "4.fst" "5.fst" "6.fst"
    -# clean up cars.df -delete(cars.df) -
    +
    +
    get_chunk_ids(df, ..., full.names = FALSE, strip_extension = TRUE)
    +
    + +
    +

    Arguments

    +
    df
    +

    a disk.frame

    +
    ...
    +

    passed to list.files

    +
    full.names
    +

    If TRUE returns the full path to the file, Defaults to FALSE

    +
    strip_extension
    +

    If TRUE then the file extension in the chunk_id is removed. Defaults to TRUE

    +
    + +
    +

    Examples

    +
    cars.df = as.disk.frame(cars)
    +
    +# return the integer-string chunk IDs
    +get_chunk_ids(cars.df)
    +#> [1] "1" "2" "3" "4" "5" "6"
    +
    +# return the file name chunk IDs
    +get_chunk_ids(cars.df, full.names = TRUE)
    +#> [1] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\Rtmp2rQjw5\\file56f46178545a.df/1.fst"
    +#> [2] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\Rtmp2rQjw5\\file56f46178545a.df/2.fst"
    +#> [3] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\Rtmp2rQjw5\\file56f46178545a.df/3.fst"
    +#> [4] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\Rtmp2rQjw5\\file56f46178545a.df/4.fst"
    +#> [5] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\Rtmp2rQjw5\\file56f46178545a.df/5.fst"
    +#> [6] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\Rtmp2rQjw5\\file56f46178545a.df/6.fst"
    +
    +# return the file name chunk IDs with file extension
    +get_chunk_ids(cars.df, strip_extension = FALSE)
    +#> [1] "1.fst" "2.fst" "3.fst" "4.fst" "5.fst" "6.fst"
    +
    +# clean up cars.df
    +delete(cars.df)
    +
    +
    +
    -
    - +
    - - + + diff --git a/docs/reference/group_by.html b/docs/reference/group_by.html index f4e80c6e..a7d39c8e 100644 --- a/docs/reference/group_by.html +++ b/docs/reference/group_by.html @@ -1,70 +1,15 @@ - - - - - - - -A function to parse the summarize function — summarise.grouped_disk.frame • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -A function to parse the summarize function — summarise.grouped_disk.frame • disk.frame - - - - - - - - - - + + - - - -
    -
    - -
    - -
    +
    @@ -171,71 +101,60 @@

    A function to parse the summarize function

    reorganizes the chunks by the shard key.

    -
    # S3 method for grouped_disk.frame
    -summarise(.data, ...)
    -
    -# S3 method for grouped_disk.frame
    -summarize(.data, ...)
    -
    -# S3 method for disk.frame
    -group_by(.data, ..., add = FALSE, .drop = dplyr::group_by_drop_default(.data))
    -
    -# S3 method for disk.frame
    -summarize(.data, ...)
    -
    -# S3 method for disk.frame
    -summarise(.data, ...)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - -
    .data

    a disk.frame

    ...

    same as the dplyr::group_by

    add

    from dplyr

    .drop

    from dplyr

    - -

    See also

    - -

    hard_group_by

    +
    +
    # S3 method for grouped_disk.frame
    +summarise(.data, ...)
    +
    +# S3 method for grouped_disk.frame
    +summarize(.data, ...)
    +
    +# S3 method for disk.frame
    +group_by(.data, ..., add = FALSE, .drop = dplyr::group_by_drop_default(.data))
    +
    +# S3 method for disk.frame
    +summarize(.data, ...)
    +
    +# S3 method for disk.frame
    +summarise(.data, ...)
    +
    + +
    +

    Arguments

    +
    .data
    +

    a disk.frame

    +
    ...
    +

    same as the dplyr::group_by

    +
    add
    +

    from dplyr

    +
    .drop
    +

    from dplyr

    +
    +
    +

    See also

    +

    hard_group_by

    +
    +
    -
    - +
    - - + + diff --git a/docs/reference/groups.disk.frame.html b/docs/reference/groups.disk.frame.html index 9c8d9c05..02be8acf 100644 --- a/docs/reference/groups.disk.frame.html +++ b/docs/reference/groups.disk.frame.html @@ -1,67 +1,12 @@ - - - - - - - -The shard keys of the disk.frame — groups.disk.frame • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -The shard keys of the disk.frame — groups.disk.frame • disk.frame - - + + - - -
    -
    - -
    - -
    +
    @@ -165,47 +95,42 @@

    The shard keys of the disk.frame

    The shard keys of the disk.frame

    -
    # S3 method for disk.frame
    -groups(x)
    - -

    Arguments

    - - - - - - -
    x

    a disk.frame

    - -

    Value

    +
    +
    # S3 method for disk.frame
    +groups(x)
    +
    +
    +

    Arguments

    +
    x
    +

    a disk.frame

    +
    +
    +

    Value

    character

    +
    +
    -
    - +
    - - + + diff --git a/docs/reference/hard_arrange.html b/docs/reference/hard_arrange.html index d31b48ee..c880a71b 100644 --- a/docs/reference/hard_arrange.html +++ b/docs/reference/hard_arrange.html @@ -1,69 +1,14 @@ - - - - - - - -Perform a hard arrange — hard_arrange • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Perform a hard arrange — hard_arrange • disk.frame - - - - - - - - - + + - - - - -
    -
    - -
    - -
    +
    @@ -169,120 +99,107 @@

    Perform a hard arrange

    row that share the same `by` value will end up in the same chunk.

    -
    hard_arrange(df, ..., add = FALSE, .drop = FALSE)
    -
    -# S3 method for data.frame
    -hard_arrange(df, ...)
    -
    -# S3 method for disk.frame
    -hard_arrange(
    -  df,
    -  ...,
    -  outdir = tempfile("tmp_disk_frame_hard_arrange"),
    -  nchunks = disk.frame::nchunks(df),
    -  overwrite = TRUE
    -)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    df

    a disk.frame

    ...

    grouping variables

    add

    same as dplyr::arrange

    .drop

    same as dplyr::arrange

    outdir

    the output directory

    nchunks

    The number of chunks in the output. Defaults = nchunks.disk.frame(df)

    overwrite

    overwrite the out put directory

    +
    +
    hard_arrange(df, ..., add = FALSE, .drop = FALSE)
     
    +# S3 method for data.frame
    +hard_arrange(df, ...)
     
    -    

    Examples

    -
    iris.df = as.disk.frame(iris, nchunks = 2) +# S3 method for disk.frame +hard_arrange( + df, + ..., + outdir = tempfile("tmp_disk_frame_hard_arrange"), + nchunks = disk.frame::nchunks(df), + overwrite = TRUE +)
    +
    -# arrange iris.df by specifies and ensure rows with the same specifies are in the same chunk -iris_hard.df = hard_arrange(iris.df, Species) -
    #> Appending disk.frames:
    -get_chunk(iris_hard.df, 1) -
    #> # A tibble: 50 x 5 -#> # Groups: Species [1] -#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species -#> <dbl> <dbl> <dbl> <dbl> <fct> -#> 1 6.3 3.3 6 2.5 virginica -#> 2 5.8 2.7 5.1 1.9 virginica -#> 3 7.1 3 5.9 2.1 virginica -#> 4 6.3 2.9 5.6 1.8 virginica -#> 5 6.5 3 5.8 2.2 virginica -#> 6 7.6 3 6.6 2.1 virginica -#> 7 4.9 2.5 4.5 1.7 virginica -#> 8 7.3 2.9 6.3 1.8 virginica -#> 9 6.7 2.5 5.8 1.8 virginica -#> 10 7.2 3.6 6.1 2.5 virginica -#> # ... with 40 more rows
    get_chunk(iris_hard.df, 2) -
    #> # A tibble: 50 x 5 -#> # Groups: Species [1] -#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species -#> <dbl> <dbl> <dbl> <dbl> <fct> -#> 1 6.3 3.3 6 2.5 virginica -#> 2 5.8 2.7 5.1 1.9 virginica -#> 3 7.1 3 5.9 2.1 virginica -#> 4 6.3 2.9 5.6 1.8 virginica -#> 5 6.5 3 5.8 2.2 virginica -#> 6 7.6 3 6.6 2.1 virginica -#> 7 4.9 2.5 4.5 1.7 virginica -#> 8 7.3 2.9 6.3 1.8 virginica -#> 9 6.7 2.5 5.8 1.8 virginica -#> 10 7.2 3.6 6.1 2.5 virginica -#> # ... with 40 more rows
    -# clean up cars.df -delete(iris.df) -delete(iris_hard.df) -
    +
    +

    Arguments

    +
    df
    +

    a disk.frame

    +
    ...
    +

    grouping variables

    +
    add
    +

    same as dplyr::arrange

    +
    .drop
    +

    same as dplyr::arrange

    +
    outdir
    +

    the output directory

    +
    nchunks
    +

    The number of chunks in the output. Defaults = nchunks.disk.frame(df)

    +
    overwrite
    +

overwrite the output directory

    +
    + +
    +

    Examples

    +
    iris.df = as.disk.frame(iris, nchunks = 2)
    +
+# arrange iris.df by species and ensure rows with the same species are in the same chunk
    +iris_hard.df = hard_arrange(iris.df, Species)
    +#> Appending disk.frames: 
    +
    +get_chunk(iris_hard.df, 1)
    +#> # A tibble: 50 x 5
    +#> # Groups:   Species [1]
    +#>    Sepal.Length Sepal.Width Petal.Length Petal.Width Species  
    +#>           <dbl>       <dbl>        <dbl>       <dbl> <fct>    
    +#>  1          6.3         3.3          6           2.5 virginica
    +#>  2          5.8         2.7          5.1         1.9 virginica
    +#>  3          7.1         3            5.9         2.1 virginica
    +#>  4          6.3         2.9          5.6         1.8 virginica
    +#>  5          6.5         3            5.8         2.2 virginica
    +#>  6          7.6         3            6.6         2.1 virginica
    +#>  7          4.9         2.5          4.5         1.7 virginica
    +#>  8          7.3         2.9          6.3         1.8 virginica
    +#>  9          6.7         2.5          5.8         1.8 virginica
    +#> 10          7.2         3.6          6.1         2.5 virginica
    +#> # ... with 40 more rows
    +get_chunk(iris_hard.df, 2)
    +#> # A tibble: 50 x 5
    +#> # Groups:   Species [1]
    +#>    Sepal.Length Sepal.Width Petal.Length Petal.Width Species  
    +#>           <dbl>       <dbl>        <dbl>       <dbl> <fct>    
    +#>  1          6.3         3.3          6           2.5 virginica
    +#>  2          5.8         2.7          5.1         1.9 virginica
    +#>  3          7.1         3            5.9         2.1 virginica
    +#>  4          6.3         2.9          5.6         1.8 virginica
    +#>  5          6.5         3            5.8         2.2 virginica
    +#>  6          7.6         3            6.6         2.1 virginica
    +#>  7          4.9         2.5          4.5         1.7 virginica
    +#>  8          7.3         2.9          6.3         1.8 virginica
    +#>  9          6.7         2.5          5.8         1.8 virginica
    +#> 10          7.2         3.6          6.1         2.5 virginica
    +#> # ... with 40 more rows
    +
+# clean up iris.df and iris_hard.df
    +delete(iris.df)
    +delete(iris_hard.df)
    +
    +
    +
    -
    - +
    - - + + diff --git a/docs/reference/hard_group_by.html b/docs/reference/hard_group_by.html index 7df1019d..8dd5247b 100644 --- a/docs/reference/hard_group_by.html +++ b/docs/reference/hard_group_by.html @@ -1,69 +1,14 @@ - - - - - - - -Perform a hard group — hard_group_by • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Perform a hard group — hard_group_by • disk.frame - - - - - - - - - - + + - - - -
    -
    - -
    - -
    +
    @@ -169,126 +99,108 @@

    Perform a hard group

    row that share the same `by` value will end up in the same chunk.

    -
    hard_group_by(df, ..., .add = FALSE, .drop = FALSE)
    -
    -# S3 method for data.frame
    -hard_group_by(df, ..., .add = FALSE, .drop = FALSE)
    -
    -# S3 method for disk.frame
    -hard_group_by(
    -  df,
    -  ...,
    -  outdir = tempfile("tmp_disk_frame_hard_group_by"),
    -  nchunks = disk.frame::nchunks(df),
    -  overwrite = TRUE,
    -  shardby_function = "hash",
    -  sort_splits = NULL,
    -  desc_vars = NULL,
    -  sort_split_sample_size = 100
    -)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    df

    a disk.frame

    ...

    grouping variables

    .add

    same as dplyr::group_by

    .drop

    same as dplyr::group_by

    outdir

    the output directory

    nchunks

    The number of chunks in the output. Defaults = nchunks.disk.frame(df)

    overwrite

    overwrite the out put directory

    shardby_function

    splitting of chunks: "hash" for hash function or "sort" for semi-sorted chunks

    sort_splits

    for the "sort" shardby function, a dataframe with the split values.

    desc_vars

    for the "sort" shardby function, the variables to sort descending.

    sort_split_sample_size

    for the "sort" shardby function, if sort_splits is null, the number of rows to sample per chunk for random splits.

    - - -

    Examples

    -
    iris.df = as.disk.frame(iris, nchunks = 2) +
    +
    hard_group_by(df, ..., .add = FALSE, .drop = FALSE)
    +
    +# S3 method for data.frame
    +hard_group_by(df, ..., .add = FALSE, .drop = FALSE)
    +
    +# S3 method for disk.frame
    +hard_group_by(
    +  df,
    +  ...,
    +  outdir = tempfile("tmp_disk_frame_hard_group_by"),
    +  nchunks = disk.frame::nchunks(df),
    +  overwrite = TRUE,
    +  shardby_function = "hash",
    +  sort_splits = NULL,
    +  desc_vars = NULL,
    +  sort_split_sample_size = 100
    +)
    +
    -# group_by iris.df by specifies and ensure rows with the same specifies are in the same chunk -iris_hard.df = hard_group_by(iris.df, Species) -
    #> Hashing...
    #> Hashing...
    #> Appending disk.frames:
    -get_chunk(iris_hard.df, 1) -
    #> # A tibble: 150 x 5 -#> # Groups: Species [3] -#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species -#> <dbl> <dbl> <dbl> <dbl> <fct> -#> 1 5.1 3.5 1.4 0.2 setosa -#> 2 4.9 3 1.4 0.2 setosa -#> 3 4.7 3.2 1.3 0.2 setosa -#> 4 4.6 3.1 1.5 0.2 setosa -#> 5 5 3.6 1.4 0.2 setosa -#> 6 5.4 3.9 1.7 0.4 setosa -#> 7 4.6 3.4 1.4 0.3 setosa -#> 8 5 3.4 1.5 0.2 setosa -#> 9 4.4 2.9 1.4 0.2 setosa -#> 10 4.9 3.1 1.5 0.1 setosa -#> # ... with 140 more rows
    get_chunk(iris_hard.df, 2) -
    #> Warning: The chunk NA does not exist; returning an empty data.table
    #> Null data.table (0 rows and 0 cols)
    -# clean up cars.df -delete(iris.df) -delete(iris_hard.df) -
    +
    +

    Arguments

    +
    df
    +

    a disk.frame

    +
    ...
    +

    grouping variables

    +
    .add
    +

    same as dplyr::group_by

    +
    .drop
    +

    same as dplyr::group_by

    +
    outdir
    +

    the output directory

    +
    nchunks
    +

    The number of chunks in the output. Defaults = nchunks.disk.frame(df)

    +
    overwrite
    +

overwrite the output directory

    +
    shardby_function
    +

    splitting of chunks: "hash" for hash function or "sort" for semi-sorted chunks

    +
    sort_splits
    +

    for the "sort" shardby function, a dataframe with the split values.

    +
    desc_vars
    +

    for the "sort" shardby function, the variables to sort descending.

    +
    sort_split_sample_size
    +

    for the "sort" shardby function, if sort_splits is null, the number of rows to sample per chunk for random splits.

    +
    + +
    +

    Examples

    +
    iris.df = as.disk.frame(iris, nchunks = 2)
    +
+# group_by iris.df by species and ensure rows with the same species are in the same chunk
    +iris_hard.df = hard_group_by(iris.df, Species)
    +#> Hashing...
    +#> Hashing...
    +#> Appending disk.frames: 
    +
    +get_chunk(iris_hard.df, 1)
    +#> # A tibble: 150 x 5
    +#> # Groups:   Species [3]
    +#>    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
    +#>           <dbl>       <dbl>        <dbl>       <dbl> <fct>  
    +#>  1          5.1         3.5          1.4         0.2 setosa 
    +#>  2          4.9         3            1.4         0.2 setosa 
    +#>  3          4.7         3.2          1.3         0.2 setosa 
    +#>  4          4.6         3.1          1.5         0.2 setosa 
    +#>  5          5           3.6          1.4         0.2 setosa 
    +#>  6          5.4         3.9          1.7         0.4 setosa 
    +#>  7          4.6         3.4          1.4         0.3 setosa 
    +#>  8          5           3.4          1.5         0.2 setosa 
    +#>  9          4.4         2.9          1.4         0.2 setosa 
    +#> 10          4.9         3.1          1.5         0.1 setosa 
    +#> # ... with 140 more rows
    +get_chunk(iris_hard.df, 2)
    +#> Warning: The chunk NA does not exist; returning an empty data.table
    +#> Null data.table (0 rows and 0 cols)
    +
+# clean up iris.df and iris_hard.df
    +delete(iris.df)
    +delete(iris_hard.df)
    +
    +
    +
    -
    - +
    - - + + diff --git a/docs/reference/head_tail.html b/docs/reference/head_tail.html index 74fe55c0..6312a2b2 100644 --- a/docs/reference/head_tail.html +++ b/docs/reference/head_tail.html @@ -1,67 +1,12 @@ - - - - - - - -Head and tail of the disk.frame — head.disk.frame • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Head and tail of the disk.frame — head.disk.frame • disk.frame - - + + - - -
    -
    - -
    - -
    +
    @@ -165,74 +95,68 @@

    Head and tail of the disk.frame

    Head and tail of the disk.frame

    -
    # S3 method for disk.frame
    -head(x, n = 6L, ...)
    -
    -# S3 method for disk.frame
    -tail(x, n = 6L, ...)
    - -

    Arguments

    - - - - - - - - - - - - - - -
    x

    a disk.frame

    n

    number of rows to include

    ...

    passed to base::head or base::tail

    - - -

    Examples

    -
    cars.df = as.disk.frame(cars) -head(cars.df) -
    #> speed dist -#> 1: 4 2 -#> 2: 4 10 -#> 3: 7 4 -#> 4: 7 22 -#> 5: 8 16 -#> 6: 9 10
    tail(cars.df) -
    #> speed dist -#> 1: 24 70 -#> 2: 24 92 -#> 3: 24 93 -#> 4: 24 120 -#> 5: 25 85
    -# clean up -delete(cars.df) -
    +
    +
    # S3 method for disk.frame
    +head(x, n = 6L, ...)
    +
    +# S3 method for disk.frame
    +tail(x, n = 6L, ...)
    +
    + +
    +

    Arguments

    +
    x
    +

    a disk.frame

    +
    n
    +

    number of rows to include

    +
    ...
    +

    passed to base::head or base::tail

    +
    + +
    +

    Examples

    +
    cars.df = as.disk.frame(cars)
    +head(cars.df)
    +#>    speed dist
    +#> 1:     4    2
    +#> 2:     4   10
    +#> 3:     7    4
    +#> 4:     7   22
    +#> 5:     8   16
    +#> 6:     9   10
    +tail(cars.df)
    +#>    speed dist
    +#> 1:    24   70
    +#> 2:    24   92
    +#> 3:    24   93
    +#> 4:    24  120
    +#> 5:    25   85
    +
    +# clean up 
    +delete(cars.df)
    +
    +
    +
    -
    - +
    - - + + diff --git a/docs/reference/index.html b/docs/reference/index.html index 4f823595..482f042c 100644 --- a/docs/reference/index.html +++ b/docs/reference/index.html @@ -1,66 +1,12 @@ - - - - - - - -Function reference • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Function reference • disk.frame + + - - - - -
    -
    - -
    - -
    +
    - - - - - - - - - - -
    -

    All functions

    + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    +

    All functions

    +

    add_chunk()

    Add a chunk to the disk.frame

    +

    as.data.frame(<disk.frame>)

    Convert disk.frame to data.frame by collecting all chunks

    +

    as.data.table(<disk.frame>)

    Convert disk.frame to data.table by collecting all chunks

    +

    as.disk.frame()

    Make a data.frame into a disk.frame

    +

    chunk_summarize() chunk_summarise() chunk_group_by() chunk_ungroup()

    Group by within each disk.frame

    +

    cmap() cmap_dfr() cimap() cimap_dfr() lazy() delayed() chunk_lapply() map() imap_dfr() imap() map_dfr(<disk.frame>) map_dfr(<default>)

    Apply the same function to all chunks

    +

    cmap2() map2() map_by_chunk_id()

    `cmap2` a function to two disk.frames

    +

    collect(<disk.frame>) collect_list() collect(<summarized_disk.frame>)

    Bring the disk.frame into R

    +

    colnames() names(<disk.frame>)

    Return the column names of the disk.frame

    +

    compute(<disk.frame>)

    Compute without writing

    +

    create_chunk_mapper()

    Create function that applies to each chunk if disk.frame

    +

    csv_to_disk.frame()

    Convert CSV file(s) to disk.frame format

    +

    delete()

    Delete a disk.frame

    +

    dfglm()

    Fit generalized linear models (glm) with disk.frame

    +

    df_ram_size()

    Get the size of RAM in gigabytes

    +

    disk.frame()

    Create a disk.frame from a folder

    +

    select(<disk.frame>) rename(<disk.frame>) filter(<disk.frame>) mutate(<disk.frame>) transmute(<disk.frame>) arrange(<disk.frame>) chunk_arrange() add_tally.disk.frame() do(<disk.frame>) distinct(<disk.frame>) chunk_distinct() glimpse(<disk.frame>)

    The dplyr verbs implemented for disk.frame

    +

    evalparseglue()

    Helper function to evalparse some `glue::glue` string

    +

    foverlaps.disk.frame()

    Apply data.table's foverlaps to the disk.frame

    +

    gen_datatable_synthetic()

    Generate synthetic dataset for testing

    +

    get_chunk()

    Obtain one chunk by chunk id

    +

    get_chunk_ids()

    Get the chunk IDs and files names

    +

    groups(<disk.frame>)

    The shard keys of the disk.frame

    +

    summarise(<grouped_disk.frame>) summarize(<grouped_disk.frame>) group_by(<disk.frame>) summarize(<disk.frame>) summarise(<disk.frame>)

    A function to parse the summarize function

    +

    hard_arrange()

    Perform a hard arrange

    +

    hard_group_by()

    Perform a hard group

    +

    head(<disk.frame>) tail(<disk.frame>)

    Head and tail of the disk.frame

    +

    is_disk.frame()

    Checks if a folder is a disk.frame

    +

    anti_join(<disk.frame>) full_join(<disk.frame>) inner_join(<disk.frame>) left_join(<disk.frame>) semi_join(<disk.frame>)

    Performs join/merge for disk.frames

    +

    make_glm_streaming_fn()

    A streaming function for speedglm

    +

    merge(<disk.frame>)

    Merge function for disk.frames

    +

    move_to() copy_df_to()

    Move or copy a disk.frame to another location

    +

    nchunks() nchunk()

    Returns the number of chunks in a disk.frame

    +

    nrow() ncol()

    Number of rows or columns

    +

    var_df.chunk_agg.disk.frame() var_df.collected_agg.disk.frame() sd_df.chunk_agg.disk.frame() sd_df.collected_agg.disk.frame() mean_df.chunk_agg.disk.frame() mean_df.collected_agg.disk.frame() sum_df.chunk_agg.disk.frame() sum_df.collected_agg.disk.frame() min_df.chunk_agg.disk.frame() min_df.collected_agg.disk.frame() max_df.chunk_agg.disk.frame() max_df.collected_agg.disk.frame() median_df.chunk_agg.disk.frame() median_df.collected_agg.disk.frame() n_df.chunk_agg.disk.frame() n_df.collected_agg.disk.frame() length_df.chunk_agg.disk.frame() length_df.collected_agg.disk.frame() any_df.chunk_agg.disk.frame() any_df.collected_agg.disk.frame() all_df.chunk_agg.disk.frame() all_df.collected_agg.disk.frame() n_distinct_df.chunk_agg.disk.frame() n_distinct_df.collected_agg.disk.frame() quantile_df.chunk_agg.disk.frame() quantile_df.collected_agg.disk.frame() IQR_df.chunk_agg.disk.frame() IQR_df.collected_agg.disk.frame()

    One Stage function

    +

    overwrite_check()

    Check if the outdir exists or not

    +

    print(<disk.frame>)

    Print disk.frame

    +

    pull(<disk.frame>)

    Pull a column from table similar to `dplyr::pull`.

    +

    rbindlist.disk.frame()

    rbindlist disk.frames together

    +

    rechunk()

    Increase or decrease the number of chunks in the disk.frame

    +

    recommend_nchunks()

    Recommend number of chunks based on input size

    +

    remove_chunk()

    Removes a chunk from the disk.frame

    +

    sample_frac(<disk.frame>)

    Sample n rows from a disk.frame

    +

    setup_disk.frame()

    Set up disk.frame environment

    +

    shard() distribute()

    Shard a data.frame/data.table or disk.frame into chunk and saves it into a disk.frame

    +

    shardkey()

    Returns the shardkey (not implemented yet)

    +

    shardkey_equal()

    Compare two disk.frame shardkeys

    +

    show_ceremony() ceremony_text() show_boilerplate() insert_ceremony()

    Show the code to setup disk.frame

    +

    srckeep() srckeepchunks()

    Keep only the variables from the input listed in selections

    +

    `[`(<disk.frame>)

    [ interface for disk.frame using fst backend

    +

    tbl_vars(<disk.frame>) group_vars(<disk.frame>)

    Column names for RStudio auto-complete

    +

    write_disk.frame() output_disk.frame()

    Write disk.frame to disk

    +

    zip_to_disk.frame()

    `zip_to_disk.frame` is used to read and convert every CSV file within the zip file to disk.frame format

    - +
    +
    -
    - +
    - - + + diff --git a/docs/reference/is_disk.frame.html b/docs/reference/is_disk.frame.html index b64b33ea..c56ab38f 100644 --- a/docs/reference/is_disk.frame.html +++ b/docs/reference/is_disk.frame.html @@ -1,67 +1,12 @@ - - - - - - - -Checks if a folder is a disk.frame — is_disk.frame • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Checks if a folder is a disk.frame — is_disk.frame • disk.frame + + - - - - -
    -
    - -
    - -
    +
    @@ -165,52 +95,50 @@

    Checks if a folder is a disk.frame

    Checks if a folder is a disk.frame

    -
    is_disk.frame(df)
    - -

    Arguments

    - - - - - - -
    df

    a disk.frame or directory to check

    - - -

    Examples

    -
    cars.df = as.disk.frame(cars) +
    +
    is_disk.frame(df)
    +
    -is_disk.frame(cars) # FALSE -
    #> [1] FALSE
    is_disk.frame(cars.df) # TRUE -
    #> [1] TRUE
    -# clean up cars.df -delete(cars.df) -
    +
    +

    Arguments

    +
    df
    +

    a disk.frame or directory to check

    +
    + +
    +

    Examples

    +
    cars.df = as.disk.frame(cars)
    +
    +is_disk.frame(cars) # FALSE
    +#> [1] FALSE
    +is_disk.frame(cars.df) # TRUE
    +#> [1] TRUE
    +
    +# clean up cars.df
    +delete(cars.df)
    +
    +
    +
    -
    - +
    - - + + diff --git a/docs/reference/join.html b/docs/reference/join.html index 3abfbc81..6bdc7ff7 100644 --- a/docs/reference/join.html +++ b/docs/reference/join.html @@ -1,67 +1,12 @@ - - - - - - - -Performs join/merge for disk.frames — anti_join.disk.frame • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Performs join/merge for disk.frames — anti_join.disk.frame • disk.frame - - + + - - -
    -
    - -
    - -
    +
    @@ -165,185 +95,208 @@

    Performs join/merge for disk.frames

    Performs join/merge for disk.frames

    -
    # S3 method for disk.frame
    -anti_join(
    -  x,
    -  y,
    -  by = NULL,
    -  copy = FALSE,
    -  ...,
    -  outdir = tempfile("tmp_disk_frame_anti_join"),
    -  merge_by_chunk_id = FALSE,
    -  overwrite = TRUE,
    -  .progress = FALSE
    -)
    -
    -# S3 method for disk.frame
    -full_join(
    -  x,
    -  y,
    -  by = NULL,
    -  copy = FALSE,
    -  ...,
    -  outdir = tempfile("tmp_disk_frame_full_join"),
    -  overwrite = TRUE,
    -  merge_by_chunk_id,
    -  .progress = FALSE
    -)
    -
    -# S3 method for disk.frame
    -inner_join(
    -  x,
    -  y,
    -  by = NULL,
    -  copy = FALSE,
    -  ...,
    -  outdir = tempfile("tmp_disk_frame_inner_join"),
    -  merge_by_chunk_id = NULL,
    -  overwrite = TRUE,
    -  .progress = FALSE
    -)
    -
    -# S3 method for disk.frame
    -left_join(
    -  x,
    -  y,
    -  by = NULL,
    -  copy = FALSE,
    -  ...,
    -  outdir = tempfile("tmp_disk_frame_left_join"),
    -  merge_by_chunk_id = FALSE,
    -  overwrite = TRUE,
    -  .progress = FALSE
    -)
    -
    -# S3 method for disk.frame
    -semi_join(
    -  x,
    -  y,
    -  by = NULL,
    -  copy = FALSE,
    -  ...,
    -  outdir = tempfile("tmp_disk_frame_semi_join"),
    -  merge_by_chunk_id = FALSE,
    -  overwrite = TRUE,
    -  .progress = FALSE
    -)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    x

    a disk.frame

    y

    a data.frame or disk.frame. If data.frame then returns lazily; if disk.frame it performs the join eagerly and return a disk.frame

    by

    join by

    copy

    same as dplyr::anti_join

    ...

    same as dplyr's joins

    outdir

    output directory for disk.frame

    merge_by_chunk_id

    the merge is performed by chunk id

    overwrite

    overwrite output directory

    .progress

    Show progress or not. Defaults to FALSE

    - -

    Value

    +
    +
    # S3 method for disk.frame
    +anti_join(
    +  x,
    +  y,
    +  by = NULL,
    +  copy = FALSE,
    +  ...,
    +  outdir = tempfile("tmp_disk_frame_anti_join"),
    +  merge_by_chunk_id = FALSE,
    +  overwrite = TRUE,
    +  .progress = FALSE
    +)
    +
    +# S3 method for disk.frame
    +full_join(
    +  x,
    +  y,
    +  by = NULL,
    +  copy = FALSE,
    +  ...,
    +  outdir = tempfile("tmp_disk_frame_full_join"),
    +  overwrite = TRUE,
    +  merge_by_chunk_id,
    +  .progress = FALSE
    +)
    +
    +# S3 method for disk.frame
    +inner_join(
    +  x,
    +  y,
    +  by = NULL,
    +  copy = FALSE,
    +  ...,
    +  outdir = tempfile("tmp_disk_frame_inner_join"),
    +  merge_by_chunk_id = NULL,
    +  overwrite = TRUE,
    +  .progress = FALSE
    +)
    +
    +# S3 method for disk.frame
    +left_join(
    +  x,
    +  y,
    +  by = NULL,
    +  copy = FALSE,
    +  ...,
    +  outdir = tempfile("tmp_disk_frame_left_join"),
    +  merge_by_chunk_id = FALSE,
    +  overwrite = TRUE,
    +  .progress = FALSE
    +)
    +
    +# S3 method for disk.frame
    +semi_join(
    +  x,
    +  y,
    +  by = NULL,
    +  copy = FALSE,
    +  ...,
    +  outdir = tempfile("tmp_disk_frame_semi_join"),
    +  merge_by_chunk_id = FALSE,
    +  overwrite = TRUE,
    +  .progress = FALSE
    +)
    +
    +
    +

    Arguments

    +
    x
    +

    a disk.frame

    +
    y
    +

    a data.frame or disk.frame. If data.frame then returns lazily; if disk.frame it performs the join eagerly and return a disk.frame

    +
    by
    +

    join by

    +
    copy
    +

    same as dplyr::anti_join

    +
    ...
    +

    same as dplyr's joins

    +
    outdir
    +

    output directory for disk.frame

    +
    merge_by_chunk_id
    +

    the merge is performed by chunk id

    +
    overwrite
    +

    overwrite output directory

    +
    .progress
    +

    Show progress or not. Defaults to FALSE

    +
    +
    +

    Value

    disk.frame or data.frame/data.table

    +
    -

    Examples

    -
    df.df = as.disk.frame(data.frame(x = 1:3, y = 4:6), overwrite = TRUE) -df2.df = as.disk.frame(data.frame(x = 1:2, z = 10:11), overwrite = TRUE) - -anti_joined.df = anti_join(df.df, df2.df) -
    #> Warning: merge_by_chunk_id = FALSE. This will take significantly longer and the preparations needed are performed eagerly which may lead to poor performance. Consider making y a data.frame or set merge_by_chunk_id = TRUE for better performance.
    #> Hashing...
    #> Hashing...
    #> Hashing...
    #> Appending disk.frames:
    #> Hashing...
    #> Hashing...
    #> Appending disk.frames:
    -anti_joined.df %>% collect -
    #> x y -#> 1: 3 6
    -anti_joined.data.frame = anti_join(df.df, data.frame(x = 1:2, z = 10:11)) -
    #> Joining, by = "x"
    #> Joining, by = "x"
    #> Joining, by = "x"
    -# clean up -delete(df.df) -delete(df2.df) -delete(anti_joined.df) -cars.df = as.disk.frame(cars) - -join.df = full_join(cars.df, cars.df, merge_by_chunk_id = TRUE) - -# clean up cars.df -delete(cars.df) -delete(join.df) -cars.df = as.disk.frame(cars) - -join.df = inner_join(cars.df, cars.df, merge_by_chunk_id = TRUE) - -# clean up cars.df -delete(cars.df) -delete(join.df) -cars.df = as.disk.frame(cars) - -join.df = left_join(cars.df, cars.df) -
    #> Warning: merge_by_chunk_id = FALSE. This will take significantly longer and the preparations needed are performed eagerly which may lead to poor performance. Consider making y a data.frame or set merge_by_chunk_id = TRUE for better performance.
    #> Hashing...
    #> Hashing...
    #> Hashing...
    #> Hashing...
    #> Hashing...
    #> Hashing...
    #> Appending disk.frames:
    #> Hashing...
    #> Hashing...
    #> Hashing...
    #> Hashing...
    #> Hashing...
    #> Hashing...
    #> Appending disk.frames:
    -# clean up cars.df -delete(cars.df) -delete(join.df) -cars.df = as.disk.frame(cars) - -join.df = semi_join(cars.df, cars.df) -
    #> Warning: merge_by_chunk_id = FALSE. This will take significantly longer and the preparations needed are performed eagerly which may lead to poor performance. Consider making y a data.frame or set merge_by_chunk_id = TRUE for better performance.
    #> Hashing...
    #> Hashing...
    #> Hashing...
    #> Hashing...
    #> Hashing...
    #> Hashing...
    #> Appending disk.frames:
    #> Hashing...
    #> Hashing...
    #> Hashing...
    #> Hashing...
    #> Hashing...
    #> Hashing...
    #> Appending disk.frames:
    -# clean up cars.df -delete(cars.df) -delete(join.df) -
    +
    +

    Examples

    +
    df.df = as.disk.frame(data.frame(x = 1:3, y = 4:6), overwrite = TRUE)
    +df2.df = as.disk.frame(data.frame(x = 1:2, z = 10:11), overwrite = TRUE)
    +
    +anti_joined.df = anti_join(df.df, df2.df) 
    +#> Warning: merge_by_chunk_id = FALSE. This will take significantly longer and the preparations needed are performed eagerly which may lead to poor performance. Consider making y a data.frame or set merge_by_chunk_id = TRUE for better performance.
    +#> Hashing...
    +#> Hashing...
    +#> Hashing...
    +#> Appending disk.frames: 
    +#> Hashing...
    +#> Hashing...
    +#> Appending disk.frames: 
    +
    +anti_joined.df %>% collect
    +#>    x y
    +#> 1: 3 6
    +
    +anti_joined.data.frame = anti_join(df.df, data.frame(x = 1:2, z = 10:11))
    +#> Joining, by = "x"
    +#> Joining, by = "x"
    +#> Joining, by = "x"
    +
    +# clean up
    +delete(df.df)
    +delete(df2.df)
    +delete(anti_joined.df)
    +cars.df = as.disk.frame(cars)
    +
    +join.df = full_join(cars.df, cars.df, merge_by_chunk_id = TRUE)
    +
    +# clean up cars.df
    +delete(cars.df)
    +delete(join.df)
    +cars.df = as.disk.frame(cars)
    +
    +join.df = inner_join(cars.df, cars.df, merge_by_chunk_id = TRUE)
    +
    +# clean up cars.df
    +delete(cars.df)
    +delete(join.df)
    +cars.df = as.disk.frame(cars)
    +
    +join.df = left_join(cars.df, cars.df)
    +#> Warning: merge_by_chunk_id = FALSE. This will take significantly longer and the preparations needed are performed eagerly which may lead to poor performance. Consider making y a data.frame or set merge_by_chunk_id = TRUE for better performance.
    +#> Hashing...
    +#> Hashing...
    +#> Hashing...
    +#> Hashing...
    +#> Hashing...
    +#> Hashing...
    +#> Appending disk.frames: 
    +#> Hashing...
    +#> Hashing...
    +#> Hashing...
    +#> Hashing...
    +#> Hashing...
    +#> Hashing...
    +#> Appending disk.frames: 
    +
    +# clean up cars.df
    +delete(cars.df)
    +delete(join.df)
    +cars.df = as.disk.frame(cars)
    +
    +join.df = semi_join(cars.df, cars.df)
    +#> Warning: merge_by_chunk_id = FALSE. This will take significantly longer and the preparations needed are performed eagerly which may lead to poor performance. Consider making y a data.frame or set merge_by_chunk_id = TRUE for better performance.
    +#> Hashing...
    +#> Hashing...
    +#> Hashing...
    +#> Hashing...
    +#> Hashing...
    +#> Hashing...
    +#> Appending disk.frames: 
    +#> Hashing...
    +#> Hashing...
    +#> Hashing...
    +#> Hashing...
    +#> Hashing...
    +#> Hashing...
    +#> Appending disk.frames: 
    +
    +# clean up cars.df
    +delete(cars.df)
    +delete(join.df)
    +
    +
    +
    -
    - +
    - - + + diff --git a/docs/reference/make_glm_streaming_fn.html b/docs/reference/make_glm_streaming_fn.html index 1f76192a..91b68bc6 100644 --- a/docs/reference/make_glm_streaming_fn.html +++ b/docs/reference/make_glm_streaming_fn.html @@ -1,67 +1,12 @@ - - - - - - - -A streaming function for speedglm — make_glm_streaming_fn • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -A streaming function for speedglm — make_glm_streaming_fn • disk.frame - + + - - - -
    -
    - -
    - -
    +
    @@ -165,119 +95,116 @@

    A streaming function for speedglm

    Define a function that can be used to feed data into speedglm and biglm

    -
    make_glm_streaming_fn(data, verbose = FALSE)
    - -

    Arguments

    - - - - - - - - - - -
    data

    a disk.frame

    verbose

    Whether to print the status of data loading. Default to FALSE

    - -

    Value

    +
    +
    make_glm_streaming_fn(data, verbose = FALSE)
    +
    +
    +

    Arguments

    +
    data
    +

    a disk.frame

    +
    verbose
    +

    Whether to print the status of data loading. Default to FALSE

    +
    +
    +

    Value

    return a function, fn, that can be used as the data argument in biglm::bigglm or speedglm::shglm

    -

    See also

    - -

    Other Machine Learning (ML): -dfglm()

    - -

    Examples

    -
    cars.df = as.disk.frame(cars) -streamacq = make_glm_streaming_fn(cars.df, verbose = FALSE) +
    +
    +

    See also

    +

    Other Machine Learning (ML): +dfglm()

    +
    -majorv = as.integer(version$major) -minorv = as.integer(strsplit(version$minor, ".", fixed=TRUE)[[1]][1]) -if(((majorv == 3) & (minorv >= 6)) | (majorv > 3)) { - m = biglm::bigglm(dist ~ speed, data = streamacq) - summary(m) - predict(m, get_chunk(cars.df, 1)) - predict(m, collect(cars.df, 1)) -} else { - m = speedglm::shglm(dist ~ speed, data = streamacq) -} -
    #> [,1] -#> 1 -1.849460 -#> 2 -1.849460 -#> 3 9.947766 -#> 4 9.947766 -#> 5 13.880175 -#> 6 17.812584 -#> 7 21.744993 -#> 8 21.744993 -#> 9 21.744993 -#> 10 25.677401 -#> 11 25.677401 -#> 12 29.609810 -#> 13 29.609810 -#> 14 29.609810 -#> 15 29.609810 -#> 16 33.542219 -#> 17 33.542219 -#> 18 33.542219 -#> 19 33.542219 -#> 20 37.474628 -#> 21 37.474628 -#> 22 37.474628 -#> 23 37.474628 -#> 24 41.407036 -#> 25 41.407036 -#> 26 41.407036 -#> 27 45.339445 -#> 28 45.339445 -#> 29 49.271854 -#> 30 49.271854 -#> 31 49.271854 -#> 32 53.204263 -#> 33 53.204263 -#> 34 53.204263 -#> 35 53.204263 -#> 36 57.136672 -#> 37 57.136672 -#> 38 57.136672 -#> 39 61.069080 -#> 40 61.069080 -#> 41 61.069080 -#> 42 61.069080 -#> 43 61.069080 -#> 44 68.933898 -#> 45 72.866307 -#> 46 76.798715 -#> 47 76.798715 -#> 48 76.798715 -#> 49 76.798715 -#> 50 80.731124
    +
    +

    Examples

    +
    cars.df = as.disk.frame(cars)
    +streamacq = make_glm_streaming_fn(cars.df, verbose = FALSE)
    +
    +majorv = as.integer(version$major)
    +minorv = as.integer(strsplit(version$minor, ".", fixed=TRUE)[[1]][1])
    +if(((majorv == 3) & (minorv >= 6)) | (majorv > 3)) {
    +  m = biglm::bigglm(dist ~ speed, data = streamacq)
    +  summary(m)
    +  predict(m, get_chunk(cars.df, 1))
    +  predict(m, collect(cars.df, 1))
    +} else {
    +  m = speedglm::shglm(dist ~ speed, data = streamacq)
    +}
    +#>         [,1]
    +#> 1  -1.849460
    +#> 2  -1.849460
    +#> 3   9.947766
    +#> 4   9.947766
    +#> 5  13.880175
    +#> 6  17.812584
    +#> 7  21.744993
    +#> 8  21.744993
    +#> 9  21.744993
    +#> 10 25.677401
    +#> 11 25.677401
    +#> 12 29.609810
    +#> 13 29.609810
    +#> 14 29.609810
    +#> 15 29.609810
    +#> 16 33.542219
    +#> 17 33.542219
    +#> 18 33.542219
    +#> 19 33.542219
    +#> 20 37.474628
    +#> 21 37.474628
    +#> 22 37.474628
    +#> 23 37.474628
    +#> 24 41.407036
    +#> 25 41.407036
    +#> 26 41.407036
    +#> 27 45.339445
    +#> 28 45.339445
    +#> 29 49.271854
    +#> 30 49.271854
    +#> 31 49.271854
    +#> 32 53.204263
    +#> 33 53.204263
    +#> 34 53.204263
    +#> 35 53.204263
    +#> 36 57.136672
    +#> 37 57.136672
    +#> 38 57.136672
    +#> 39 61.069080
    +#> 40 61.069080
    +#> 41 61.069080
    +#> 42 61.069080
    +#> 43 61.069080
    +#> 44 68.933898
    +#> 45 72.866307
    +#> 46 76.798715
    +#> 47 76.798715
    +#> 48 76.798715
    +#> 49 76.798715
    +#> 50 80.731124
    +
    +
    +
    -
    - +
    - - + + diff --git a/docs/reference/merge.disk.frame.html b/docs/reference/merge.disk.frame.html index b2d35726..26cafba5 100644 --- a/docs/reference/merge.disk.frame.html +++ b/docs/reference/merge.disk.frame.html @@ -1,67 +1,12 @@ - - - - - - - -Merge function for disk.frames — merge.disk.frame • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Merge function for disk.frames — merge.disk.frame • disk.frame - - + + - - -
    -
    - -
    - -
    +
    @@ -165,86 +95,70 @@

    Merge function for disk.frames

    Merge function for disk.frames

    -
    # S3 method for disk.frame
    -merge(
    -  x,
    -  y,
    -  by,
    -  outdir = tempfile(fileext = ".df"),
    -  ...,
    -  merge_by_chunk_id = FALSE,
    -  overwrite = FALSE
    -)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    x

    a disk.frame

    y

    a disk.frame or data.frame

    by

    the merge by keys

    outdir

    The output directory for the disk.frame

    ...

    passed to merge and cmap.disk.frame

    merge_by_chunk_id

    if TRUE then only chunks in df1 and df2 with the same chunk id will get merged

    overwrite

    overwrite the outdir or not

    - - -

    Examples

    -
    b = as.disk.frame(data.frame(a = 51:150, b = 1:100)) -d = as.disk.frame(data.frame(a = 151:250, b = 1:100)) -bd.df = merge(b, d, by = "b", merge_by_chunk_id = TRUE) - -# clean up cars.df -delete(b) -delete(d) -delete(bd.df) -
    +
    +
    # S3 method for disk.frame
    +merge(
    +  x,
    +  y,
    +  by,
    +  outdir = tempfile(fileext = ".df"),
    +  ...,
    +  merge_by_chunk_id = FALSE,
    +  overwrite = FALSE
    +)
    +
    + +
    +

    Arguments

    +
    x
    +

    a disk.frame

    +
    y
    +

    a disk.frame or data.frame

    +
    by
    +

    the merge by keys

    +
    outdir
    +

    The output directory for the disk.frame

    +
    ...
    +

    passed to merge and cmap.disk.frame

    +
    merge_by_chunk_id
    +

    if TRUE then only chunks in df1 and df2 with the same chunk id will get merged

    +
    overwrite
    +

    overwrite the outdir or not

    +
    + +
    +

    Examples

    +
    b = as.disk.frame(data.frame(a = 51:150, b = 1:100))
    +d = as.disk.frame(data.frame(a = 151:250, b = 1:100))
    +bd.df = merge(b, d, by = "b", merge_by_chunk_id = TRUE)
    +
    +# clean up cars.df
    +delete(b)
    +delete(d)
    +delete(bd.df)
    +
    +
    +
    -
    - +
    - - + + diff --git a/docs/reference/move_to.html b/docs/reference/move_to.html index 50090a6c..9037467e 100644 --- a/docs/reference/move_to.html +++ b/docs/reference/move_to.html @@ -1,67 +1,12 @@ - - - - - - - -Move or copy a disk.frame to another location — move_to • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Move or copy a disk.frame to another location — move_to • disk.frame - + + - - - -
    -
    - -
    - -
    +
    @@ -165,71 +95,62 @@

    Move or copy a disk.frame to another location

    Move or copy a disk.frame to another location

    -
    move_to(df, outdir, ..., copy = FALSE)
    -
    -copy_df_to(df, outdir, ...)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - -
    df

    The disk.frame

    outdir

    The new location

    ...

    NOT USED

    copy

    Merely copy and not move

    - -

    Value

    +
    +
    move_to(df, outdir, ..., copy = FALSE)
     
    -    

    a disk.frame

    - -

    Examples

    -
    cars.df = as.disk.frame(cars) - -cars_copy.df = copy_df_to(cars.df, outdir = tempfile(fileext=".df")) +copy_df_to(df, outdir, ...)
    +
    -cars2.df = move_to(cars.df, outdir = tempfile(fileext=".df")) +
    +

    Arguments

    +
    df
    +

    The disk.frame

    +
    outdir
    +

    The new location

    +
    ...
    +

    NOT USED

    +
    copy
    +

    Merely copy and not move

    +
    +
    +

    Value

    +

    a disk.frame

    +
    -# clean up -delete(cars_copy.df) -delete(cars2.df) -
    +
    +

    Examples

    +
    cars.df = as.disk.frame(cars)
    +
    +cars_copy.df = copy_df_to(cars.df, outdir = tempfile(fileext=".df"))
    +
    +cars2.df = move_to(cars.df, outdir = tempfile(fileext=".df"))
    +
    +# clean up
    +delete(cars_copy.df)
    +delete(cars2.df)
    +
    +
    +
    -
    - +
    - - + + diff --git a/docs/reference/nchunks.html b/docs/reference/nchunks.html index 91ce528c..988619c1 100644 --- a/docs/reference/nchunks.html +++ b/docs/reference/nchunks.html @@ -1,67 +1,12 @@ - - - - - - - -Returns the number of chunks in a disk.frame — nchunks • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Returns the number of chunks in a disk.frame — nchunks • disk.frame + + - - - - -
    -
    - -
    - -
    +
    @@ -165,69 +95,63 @@

    Returns the number of chunks in a disk.frame

    Returns the number of chunks in a disk.frame

    -
    nchunks(df, ...)
    -
    -nchunk(df, ...)
    -
    -# S3 method for disk.frame
    -nchunk(df, ...)
    -
    -# S3 method for disk.frame
    -nchunks(df, skip.ready.check = FALSE, ...)
    - -

    Arguments

    - - - - - - - - - - - - - - -
    df

    a disk.frame

    ...

    not used

    skip.ready.check

    NOT implemented

    - - -

    Examples

    -
    cars.df = as.disk.frame(cars) - -# return the number of chunks -nchunks(cars.df) -
    #> [1] 6
    nchunk(cars.df) -
    #> [1] 6
    -# clean up cars.df -delete(cars.df) -
    +
    +
    nchunks(df, ...)
    +
    +nchunk(df, ...)
    +
    +# S3 method for disk.frame
    +nchunk(df, ...)
    +
    +# S3 method for disk.frame
    +nchunks(df, skip.ready.check = FALSE, ...)
    +
    + +
    +

    Arguments

    +
    df
    +

    a disk.frame

    +
    ...
    +

    not used

    +
    skip.ready.check
    +

    NOT implemented

    +
    + +
    +

    Examples

    +
    cars.df = as.disk.frame(cars)
    +
    +# return the number of chunks
    +nchunks(cars.df)
    +#> [1] 6
    +nchunk(cars.df)
    +#> [1] 6
    +
    +# clean up cars.df
    +delete(cars.df)
    +
    +
    +
    -
    - +
    - - + + diff --git a/docs/reference/ncol_nrow.html b/docs/reference/ncol_nrow.html index 5d39a4fa..a8a9d09a 100644 --- a/docs/reference/ncol_nrow.html +++ b/docs/reference/ncol_nrow.html @@ -1,67 +1,12 @@ - - - - - - - -Number of rows or columns — nrow • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Number of rows or columns — nrow • disk.frame + + - - - - -
    -
    - -
    - -
    +
    @@ -165,65 +95,61 @@

    Number of rows or columns

    Number of rows or columns

    -
    nrow(df, ...)
    -
    -# S3 method for disk.frame
    -nrow(df, ...)
    -
    -ncol(df)
    -
    -# S3 method for disk.frame
    -ncol(df)
    +
    +
    nrow(df, ...)
     
    -    

    Arguments

    - - - - - - - - - - -
    df

    a disk.frame

    ...

    passed to base::nrow

    +# S3 method for disk.frame +nrow(df, ...) +ncol(df) -

    Examples

    -
    cars.df = as.disk.frame(cars) +# S3 method for disk.frame +ncol(df)
    +
    -# return total number of column and rows -ncol(cars.df) -
    #> [1] 2
    nrow(cars.df) -
    #> [1] 50
    -# clean up cars.df -delete(cars.df) -
    +
    +

    Arguments

    +
    df
    +

    a disk.frame

    +
    ...
    +

    passed to base::nrow

    +
    + +
    +

    Examples

    +
    cars.df = as.disk.frame(cars)
    +
    +# return total number of column and rows
    +ncol(cars.df)
    +#> [1] 2
    +nrow(cars.df)
    +#> [1] 50
    +
    +# clean up cars.df
    +delete(cars.df)
    +
    +
    +
    -
    - +
    - - + + diff --git a/docs/reference/one-stage-group-by-verbs.html b/docs/reference/one-stage-group-by-verbs.html index ec4a929e..22b7a93c 100644 --- a/docs/reference/one-stage-group-by-verbs.html +++ b/docs/reference/one-stage-group-by-verbs.html @@ -1,69 +1,14 @@ - - - - - - - -One Stage function — var_df.chunk_agg.disk.frame • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -One Stage function — var_df.chunk_agg.disk.frame • disk.frame - - - - - - - - - - - - - + + -
    -
    - -
    - -
    +
    @@ -169,109 +99,97 @@

    One Stage function

    mean collected_agg

    -
    var_df.chunk_agg.disk.frame(x, na.rm = FALSE)
    +    
    +
    var_df.chunk_agg.disk.frame(x, na.rm = FALSE)
     
    -var_df.collected_agg.disk.frame(listx)
    +var_df.collected_agg.disk.frame(listx)
     
    -sd_df.chunk_agg.disk.frame(x, na.rm = FALSE)
    +sd_df.chunk_agg.disk.frame(x, na.rm = FALSE)
     
    -sd_df.collected_agg.disk.frame(listx)
    +sd_df.collected_agg.disk.frame(listx)
     
    -mean_df.chunk_agg.disk.frame(x, na.rm = FALSE, ...)
    +mean_df.chunk_agg.disk.frame(x, na.rm = FALSE, ...)
     
    -mean_df.collected_agg.disk.frame(listx)
    +mean_df.collected_agg.disk.frame(listx)
     
    -sum_df.chunk_agg.disk.frame(x, ...)
    +sum_df.chunk_agg.disk.frame(x, ...)
     
    -sum_df.collected_agg.disk.frame(listx, ...)
    +sum_df.collected_agg.disk.frame(listx, ...)
     
    -min_df.chunk_agg.disk.frame(x, ...)
    +min_df.chunk_agg.disk.frame(x, ...)
     
    -min_df.collected_agg.disk.frame(listx, ...)
    +min_df.collected_agg.disk.frame(listx, ...)
     
    -max_df.chunk_agg.disk.frame(x, ...)
    +max_df.chunk_agg.disk.frame(x, ...)
     
    -max_df.collected_agg.disk.frame(listx, ...)
    +max_df.collected_agg.disk.frame(listx, ...)
     
    -median_df.chunk_agg.disk.frame(x, ...)
    +median_df.chunk_agg.disk.frame(x, ...)
     
    -median_df.collected_agg.disk.frame(listx, ...)
    +median_df.collected_agg.disk.frame(listx, ...)
     
    -n_df.chunk_agg.disk.frame(...)
    +n_df.chunk_agg.disk.frame(...)
     
    -n_df.collected_agg.disk.frame(listx, ...)
    +n_df.collected_agg.disk.frame(listx, ...)
     
    -length_df.chunk_agg.disk.frame(x, ...)
    +length_df.chunk_agg.disk.frame(x, ...)
     
    -length_df.collected_agg.disk.frame(listx, ...)
    +length_df.collected_agg.disk.frame(listx, ...)
     
    -any_df.chunk_agg.disk.frame(x, ...)
    +any_df.chunk_agg.disk.frame(x, ...)
     
    -any_df.collected_agg.disk.frame(listx, ...)
    +any_df.collected_agg.disk.frame(listx, ...)
     
    -all_df.chunk_agg.disk.frame(x, ...)
    +all_df.chunk_agg.disk.frame(x, ...)
     
    -all_df.collected_agg.disk.frame(listx, ...)
    +all_df.collected_agg.disk.frame(listx, ...)
     
    -n_distinct_df.chunk_agg.disk.frame(x, na.rm = FALSE, ...)
    +n_distinct_df.chunk_agg.disk.frame(x, na.rm = FALSE, ...)
     
    -n_distinct_df.collected_agg.disk.frame(listx, ...)
    +n_distinct_df.collected_agg.disk.frame(listx, ...)
     
    -quantile_df.chunk_agg.disk.frame(x, ...)
    +quantile_df.chunk_agg.disk.frame(x, ...)
     
    -quantile_df.collected_agg.disk.frame(listx, ...)
    +quantile_df.collected_agg.disk.frame(listx, ...)
     
    -IQR_df.chunk_agg.disk.frame(x, na.rm = FALSE, ...)
    +IQR_df.chunk_agg.disk.frame(x, na.rm = FALSE, ...)
     
    -IQR_df.collected_agg.disk.frame(listx, ...)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - -
    x

    the input

    na.rm

    Remove NAs. TRUE of FALSE

    listx

    a list

    ...

    additional options

    +IQR_df.collected_agg.disk.frame(listx, ...)
    +
    +
    +

    Arguments

    +
    x
    +

    the input

    +
    na.rm
    +

    Remove NAs. TRUE of FALSE

    +
    listx
    +

    a list

    +
    ...
    +

    additional options

    +
    + -
    - +
    - - + + diff --git a/docs/reference/overwrite_check.html b/docs/reference/overwrite_check.html index 386c7661..f4d07be2 100644 --- a/docs/reference/overwrite_check.html +++ b/docs/reference/overwrite_check.html @@ -1,67 +1,12 @@ - - - - - - - -Check if the outdir exists or not — overwrite_check • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Check if the outdir exists or not — overwrite_check • disk.frame - - + + - - -
    -
    - -
    - -
    +
    @@ -165,55 +95,49 @@

    Check if the outdir exists or not

    If the overwrite is TRUE then the folder will be deleted, otherwise the folder will be created.

    -
    overwrite_check(outdir, overwrite)
    - -

    Arguments

    - - - - - - - - - - -
    outdir

    the output directory

    overwrite

    TRUE or FALSE if `outdir`` exists and overwrite = FALSE then throw an error

    - - -

    Examples

    -
    tf = tempfile() -overwrite_check(tf, overwrite = FALSE) -overwrite_check(tf, overwrite = TRUE) - -# clean up -fs::dir_delete(tf) -
    +
    +
    overwrite_check(outdir, overwrite)
    +
    + +
    +

    Arguments

    +
    outdir
    +

    the output directory

    +
    overwrite
    +

    TRUE or FALSE if `outdir`` exists and overwrite = FALSE then throw an error

    +
    + +
    +

    Examples

    +
    tf = tempfile()
    +overwrite_check(tf, overwrite = FALSE)
    +overwrite_check(tf, overwrite = TRUE)
    +
    +# clean up
    +fs::dir_delete(tf)
    +
    +
    +
    -
    - +
    - - + + diff --git a/docs/reference/print.disk.frame.html b/docs/reference/print.disk.frame.html index b3752bd0..a0ef8db4 100644 --- a/docs/reference/print.disk.frame.html +++ b/docs/reference/print.disk.frame.html @@ -1,67 +1,12 @@ - - - - - - - -Print disk.frame — print.disk.frame • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Print disk.frame — print.disk.frame • disk.frame - - - - + + -
    -
    - -
    - -
    +
    @@ -165,48 +95,40 @@

    Print disk.frame

    a new print method for disk.frame

    -
    # S3 method for disk.frame
    -print(x, ...)
    - -

    Arguments

    - - - - - - - - - - -
    x

    disk.frame

    ...

    not used

    +
    +
    # S3 method for disk.frame
    +print(x, ...)
    +
    +
    +

    Arguments

    +
    x
    +

    disk.frame

    +
    ...
    +

    not used

    +
    +
    -
    - +
    - - + + diff --git a/docs/reference/pull.disk.frame.html b/docs/reference/pull.disk.frame.html index 4bcc7b8a..ff438239 100644 --- a/docs/reference/pull.disk.frame.html +++ b/docs/reference/pull.disk.frame.html @@ -1,67 +1,12 @@ - - - - - - - -Pull a column from table similar to `dplyr::pull`. — pull.disk.frame • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Pull a column from table similar to `dplyr::pull`. — pull.disk.frame • disk.frame - - + + - - -
    -
    - -
    - -
    +
    @@ -165,56 +95,44 @@

    Pull a column from table similar to `dplyr::pull`.

    Pull a column from table similar to `dplyr::pull`.

    -
    # S3 method for disk.frame
    -pull(.data, var = -1, name = NULL, ...)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - -
    .data

    The disk.frame

    var

    can be an positive or negative integer or a character/string. See dplyr::pull documentation

    name

    See dplyr::pull documentation

    ...

    Not used, kept for compatibility with `dplyr::pull`

    +
    +
    # S3 method for disk.frame
    +pull(.data, var = -1, name = NULL, ...)
    +
    +
    +

    Arguments

    +
    .data
    +

    The disk.frame

    +
    var
    +

    can be an positive or negative integer or a character/string. See dplyr::pull documentation

    +
    name
    +

    See dplyr::pull documentation

    +
    ...
    +

    Not used, kept for compatibility with `dplyr::pull`

    +
    +
    -
    - +
    - - + + diff --git a/docs/reference/rbindlist.disk.frame.html b/docs/reference/rbindlist.disk.frame.html index 2ea44cfb..081dadec 100644 --- a/docs/reference/rbindlist.disk.frame.html +++ b/docs/reference/rbindlist.disk.frame.html @@ -1,67 +1,12 @@ - - - - - - - -rbindlist disk.frames together — rbindlist.disk.frame • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -rbindlist disk.frames together — rbindlist.disk.frame • disk.frame - - + + - - -
    -
    - -
    - -
    +
    @@ -165,85 +95,70 @@

    rbindlist disk.frames together

    rbindlist disk.frames together

    -
    rbindlist.disk.frame(
    -  df_list,
    -  outdir = tempfile(fileext = ".df"),
    -  by_chunk_id = TRUE,
    -  parallel = TRUE,
    -  compress = 50,
    -  overwrite = TRUE,
    -  .progress = TRUE
    -)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    df_list

    A list of disk.frames

    outdir

    Output directory of the row-bound disk.frames

    by_chunk_id

    If TRUE then only the chunks with the same chunk IDs will be bound

    parallel

    if TRUE then bind multiple disk.frame simultaneously, Defaults to TRUE

    compress

    0-100, 100 being the highest compression rate.

    overwrite

    overwrite the output directory

    .progress

    A logical, for whether or not to print a progress bar for multiprocess, multisession, and multicore plans. From furrr

    - - -

    Examples

    -
    cars.df = as.disk.frame(cars) - -# row-bind two disk.frames -cars2.df = rbindlist.disk.frame(list(cars.df, cars.df)) -
    #> Appending disk.frames:
    -# clean up cars.df -delete(cars.df) -delete(cars2.df) -
    +
    +
    rbindlist.disk.frame(
    +  df_list,
    +  outdir = tempfile(fileext = ".df"),
    +  by_chunk_id = TRUE,
    +  parallel = TRUE,
    +  compress = 50,
    +  overwrite = TRUE,
    +  .progress = TRUE
    +)
    +
    + +
    +

    Arguments

    +
    df_list
    +

    A list of disk.frames

    +
    outdir
    +

    Output directory of the row-bound disk.frames

    +
    by_chunk_id
    +

    If TRUE then only the chunks with the same chunk IDs will be bound

    +
    parallel
    +

    if TRUE then bind multiple disk.frame simultaneously, Defaults to TRUE

    +
    compress
    +

    0-100, 100 being the highest compression rate.

    +
    overwrite
    +

    overwrite the output directory

    +
    .progress
    +

    A logical, for whether or not to print a progress bar for multiprocess, multisession, and multicore plans. From furrr

    +
    + +
    +

    Examples

    +
    cars.df = as.disk.frame(cars)
    +
    +# row-bind two disk.frames
    +cars2.df = rbindlist.disk.frame(list(cars.df, cars.df))
    +#> Appending disk.frames: 
    +
    +# clean up cars.df
    +delete(cars.df)
    +delete(cars2.df)
    +
    +
    +
    -
    - +
    - - + + diff --git a/docs/reference/rechunk.html b/docs/reference/rechunk.html index fb5df9b6..1ad617e3 100644 --- a/docs/reference/rechunk.html +++ b/docs/reference/rechunk.html @@ -1,67 +1,12 @@ - - - - - - - -Increase or decrease the number of chunks in the disk.frame — rechunk • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Increase or decrease the number of chunks in the disk.frame — rechunk • disk.frame + + - - - - -
    -
    - -
    - -
    +
    @@ -165,100 +95,88 @@

    Increase or decrease the number of chunks in the disk.frame

    Increase or decrease the number of chunks in the disk.frame

    -
    rechunk(
    -  df,
    -  nchunks,
    -  outdir = attr(df, "path", exact = TRUE),
    -  shardby = NULL,
    -  overwrite = TRUE,
    -  shardby_function = "hash",
    -  sort_splits = NULL,
    -  desc_vars = NULL
    -)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    df

    the disk.frame to rechunk

    nchunks

    number of chunks

    outdir

    the output directory

    shardby

    the shardkeys

    overwrite

    overwrite the output directory

    shardby_function

    splitting of chunks: "hash" for hash function or "sort" for semi-sorted chunks

    sort_splits

    for the "sort" shardby function, a dataframe with the split values.

    desc_vars

    for the "sort" shardby function, the variables to sort descending.

    - - -

    Examples

    -
    # create a disk.frame with 2 chunks in tempdir() -cars.df = as.disk.frame(cars, nchunks = 2) +
    +
    rechunk(
    +  df,
    +  nchunks,
    +  outdir = attr(df, "path", exact = TRUE),
    +  shardby = NULL,
    +  overwrite = TRUE,
    +  shardby_function = "hash",
    +  sort_splits = NULL,
    +  desc_vars = NULL
    +)
    +
    -# re-chunking cars.df to 3 chunks, done "in-place" to the same folder as cars.df -rechunk(cars.df, 3) -
    #> files have been backed up to temporary dir C:\Users\RTX2080\AppData\Local\Temp\Rtmp2PGAkP\back_up_tmp_dir4b144a1a2eda. You can recover there files until you restart your R session
    #> path: "C:\Users\RTX2080\AppData\Local\Temp\Rtmp2PGAkP\file4b1466eb2237.df" -#> nchunks: 3 -#> nrow (at source): 50 -#> ncol (at source): 2 -#> nrow (post operations): ??? -#> ncol (post operations): ???
    -new_path = tempfile(fileext = ".df") -# re-chunking cars.df to 4 chunks, shard by speed, and done "out-of-place" to a new directory -cars2.df = rechunk(cars.df, 4, outdir=new_path, shardby = "speed") -
    #> Hashing...
    #> Hashing...
    #> Hashing...
    #> Appending disk.frames:
    -# clean up cars.df -delete(cars.df) -delete(cars2.df) -
    +
    +

    Arguments

    +
    df
    +

    the disk.frame to rechunk

    +
    nchunks
    +

    number of chunks

    +
    outdir
    +

    the output directory

    +
    shardby
    +

    the shardkeys

    +
    overwrite
    +

    overwrite the output directory

    +
    shardby_function
    +

    splitting of chunks: "hash" for hash function or "sort" for semi-sorted chunks

    +
    sort_splits
    +

    for the "sort" shardby function, a dataframe with the split values.

    +
    desc_vars
    +

    for the "sort" shardby function, the variables to sort descending.

    +
    + +
    +

    Examples

    +
    # create a disk.frame with 2 chunks in tempdir()
    +cars.df = as.disk.frame(cars, nchunks = 2)
    +
    +# re-chunking cars.df to 3 chunks, done "in-place" to the same folder as cars.df
    +rechunk(cars.df, 3)
    +#> files have been backed up to temporary dir C:\Users\RTX2080\AppData\Local\Temp\Rtmp2rQjw5\back_up_tmp_dir56f4356b56cb. You can recover there files until you restart your R session
    +#> path: "C:\Users\RTX2080\AppData\Local\Temp\Rtmp2rQjw5\file56f4c8a34c9.df"
    +#> nchunks: 3
    +#> nrow (at source): 50
    +#> ncol (at source): 2
    +#> nrow (post operations): ???
    +#> ncol (post operations): ???
    +
    +new_path = tempfile(fileext = ".df")
    +# re-chunking cars.df to 4 chunks, shard by speed, and done "out-of-place" to a new directory
    +cars2.df = rechunk(cars.df, 4, outdir=new_path, shardby = "speed")
    +#> Hashing...
    +#> Hashing...
    +#> Hashing...
    +#> Appending disk.frames: 
    +
    +# clean up cars.df
    +delete(cars.df)
    +delete(cars2.df)
    +
    +
    +
    -
    - +
    - - + + diff --git a/docs/reference/recommend_nchunks.html b/docs/reference/recommend_nchunks.html index 1b43a82d..e549d81c 100644 --- a/docs/reference/recommend_nchunks.html +++ b/docs/reference/recommend_nchunks.html @@ -1,68 +1,13 @@ - - - - - - - -Recommend number of chunks based on input size — recommend_nchunks • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Recommend number of chunks based on input size — recommend_nchunks • disk.frame - - + + - - -
    -
    - -
    - -
    +
    @@ -167,77 +97,67 @@

    Recommend number of chunks based on input size

    into. It can accept filesizes in bytes (as integer) or a data.frame

    -
    recommend_nchunks(
    -  df,
    -  type = "csv",
    -  minchunks = data.table::getDTthreads(),
    -  conservatism = 8,
    -  ram_size = df_ram_size()
    -)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - -
    df

    a disk.frame or the file size in bytes of a CSV file holding the -data

    type

    only = "csv" is supported. It indicates the file type -corresponding to file size `df`

    minchunks

    the minimum number of chunks. Defaults to the number of CPU -cores (without hyper-threading)

    conservatism

    a multiplier to the recommended number of chunks. The +

    +
    recommend_nchunks(
    +  df,
    +  type = "csv",
    +  minchunks = data.table::getDTthreads(),
    +  conservatism = 8,
    +  ram_size = df_ram_size()
    +)
    +
    + +
    +

    Arguments

    +
    df
    +

    a disk.frame or the file size in bytes of a CSV file holding the +data

    +
    type
    +

    only = "csv" is supported. It indicates the file type +corresponding to file size `df`

    +
    minchunks
    +

    the minimum number of chunks. Defaults to the number of CPU +cores (without hyper-threading)

    +
    conservatism
    +

    a multiplier to the recommended number of chunks. The more chunks the smaller the chunk size and more likely that each chunk can -fit into RAM

    ram_size

    The amount of RAM available which is usually computed. Except on RStudio with R3.6+

    - - -

    Examples

    -
    # recommend nchunks based on data.frame -recommend_nchunks(cars) -
    #> [1] 6
    -# recommend nchunks based on file size ONLY CSV is implemented at the moment -recommend_nchunks(1024^3) -
    #> [1] 6
    +fit into RAM

    +
    ram_size
    +

    The amount of RAM available which is usually computed. Except on RStudio with R3.6+

    +
    + +
    +

    Examples

    +
    # recommend nchunks based on data.frame
    +recommend_nchunks(cars)
    +#> [1] 6
    +
    +# recommend nchunks based on file size ONLY CSV is implemented at the moment
    +recommend_nchunks(1024^3)
    +#> [1] 6
    +
    +
    +
    -
    - +
    - - + + diff --git a/docs/reference/remove_chunk.html b/docs/reference/remove_chunk.html index 6e5db636..974ae319 100644 --- a/docs/reference/remove_chunk.html +++ b/docs/reference/remove_chunk.html @@ -1,67 +1,12 @@ - - - - - - - -Removes a chunk from the disk.frame — remove_chunk • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Removes a chunk from the disk.frame — remove_chunk • disk.frame - - + + - - -
    -
    - -
    - -
    +
    @@ -165,85 +95,83 @@

    Removes a chunk from the disk.frame

    Removes a chunk from the disk.frame

    -
    remove_chunk(df, chunk_id, full.names = FALSE)
    - -

    Arguments

    - - - - - - - - - - - - - - -
    df

    a disk.frame

    chunk_id

    the chunk ID of the chunk to remove. If it's a number then return number.fst

    full.names

    TRUE or FALSE. Defaults to FALSE. If true then chunk_id is the full path to the chunk otherwise it's the relative path

    - - -

    Examples

    -
    # TODO add these to tests -cars.df = as.disk.frame(cars, nchunks = 4) - -# removes 3rd chunk -remove_chunk(cars.df, 3) -
    #> path: "C:\Users\RTX2080\AppData\Local\Temp\Rtmp2PGAkP\file4b1473bc5984.df" -#> nchunks: 3 -#> nrow (at source): 37 -#> ncol (at source): 2 -#> nrow (post operations): ??? -#> ncol (post operations): ???
    nchunks(cars.df) # 3 -
    #> [1] 3
    -# removes 4th chunk -remove_chunk(cars.df, "4.fst") -
    #> path: "C:\Users\RTX2080\AppData\Local\Temp\Rtmp2PGAkP\file4b1473bc5984.df" -#> nchunks: 2 -#> nrow (at source): 26 -#> ncol (at source): 2 -#> nrow (post operations): ??? -#> ncol (post operations): ???
    nchunks(cars.df) # 3 -
    #> [1] 2
    -# removes 2nd chunk -remove_chunk(cars.df, file.path(attr(cars.df, "path", exact=TRUE), "2.fst"), full.names = TRUE) -
    #> path: "C:\Users\RTX2080\AppData\Local\Temp\Rtmp2PGAkP\file4b1473bc5984.df" -#> nchunks: 1 -#> nrow (at source): 13 -#> ncol (at source): 2 -#> nrow (post operations): ??? -#> ncol (post operations): ???
    nchunks(cars.df) # 1 -
    #> [1] 1
    -# clean up cars.df -delete(cars.df) -
    +
    +
    remove_chunk(df, chunk_id, full.names = FALSE)
    +
    + +
    +

    Arguments

    +
    df
    +

    a disk.frame

    +
    chunk_id
    +

    the chunk ID of the chunk to remove. If it's a number then return number.fst

    +
    full.names
    +

    TRUE or FALSE. Defaults to FALSE. If true then chunk_id is the full path to the chunk otherwise it's the relative path

    +
    + +
    +

    Examples

    +
    # TODO add these to tests
    +cars.df = as.disk.frame(cars, nchunks = 4)
    +
    +# removes 3rd chunk
    +remove_chunk(cars.df, 3)
    +#> path: "C:\Users\RTX2080\AppData\Local\Temp\Rtmp2rQjw5\file56f45e4b14bd.df"
    +#> nchunks: 3
    +#> nrow (at source): 37
    +#> ncol (at source): 2
    +#> nrow (post operations): ???
    +#> ncol (post operations): ???
    +nchunks(cars.df) # 3
    +#> [1] 3
    +
    +# removes 4th chunk
    +remove_chunk(cars.df, "4.fst")
    +#> path: "C:\Users\RTX2080\AppData\Local\Temp\Rtmp2rQjw5\file56f45e4b14bd.df"
    +#> nchunks: 2
    +#> nrow (at source): 26
    +#> ncol (at source): 2
    +#> nrow (post operations): ???
    +#> ncol (post operations): ???
    +nchunks(cars.df) # 3
    +#> [1] 2
    +
    +# removes 2nd chunk
    +remove_chunk(cars.df, file.path(attr(cars.df, "path", exact=TRUE), "2.fst"), full.names = TRUE)
    +#> path: "C:\Users\RTX2080\AppData\Local\Temp\Rtmp2rQjw5\file56f45e4b14bd.df"
    +#> nchunks: 1
    +#> nrow (at source): 13
    +#> ncol (at source): 2
    +#> nrow (post operations): ???
    +#> ncol (post operations): ???
    +nchunks(cars.df) # 1
    +#> [1] 1
    +
    +# clean up cars.df
    +delete(cars.df)
    +
    +
    +
    -
    - +
    - - + + diff --git a/docs/reference/sample.html b/docs/reference/sample.html index 83e4f100..719d4244 100644 --- a/docs/reference/sample.html +++ b/docs/reference/sample.html @@ -1,67 +1,12 @@ - - - - - - - -Sample n rows from a disk.frame — sample_frac.disk.frame • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Sample n rows from a disk.frame — sample_frac.disk.frame • disk.frame + + - - - - -
    -
    - -
    - -
    +
    @@ -165,100 +95,87 @@

    Sample n rows from a disk.frame

    Sample n rows from a disk.frame

    -
    # S3 method for disk.frame
    -sample_frac(tbl, size = 1, replace = FALSE, weight = NULL, .env = NULL, ...)
    +
    +
    # S3 method for disk.frame
    +sample_frac(tbl, size = 1, replace = FALSE, weight = NULL, .env = NULL, ...)
    +
    -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - - - - - -
    tbl

    A data.frame.

    size

    <tidy-select> -For sample_n(), the number of rows to select. -For sample_frac(), the fraction of rows to select. -If tbl is grouped, size applies to each group.

    replace

    Sample with or without replacement?

    weight

    <tidy-select> Sampling weights. +

    +

    Arguments

    +
    tbl
    +

    A data.frame.

    +
    size
    +

    <tidy-select> +For sample_n(), the number of rows to select. +For sample_frac(), the fraction of rows to select. +If tbl is grouped, size applies to each group.

    +
    replace
    +

    Sample with or without replacement?

    +
    weight
    +

    <tidy-select> Sampling weights. This must evaluate to a vector of non-negative numbers the same length as -the input. Weights are automatically standardised to sum to 1.

    .env

    DEPRECATED.

    ...

    ignored

    - - -

    Examples

    -
    cars.df = as.disk.frame(cars) - -collect(sample_frac(cars.df, 0.5)) -
    #> speed dist -#> 1: 10 34 -#> 2: 7 22 -#> 3: 8 16 -#> 4: 4 2 -#> 5: 11 17 -#> 6: 12 20 -#> 7: 11 28 -#> 8: 13 34 -#> 9: 15 26 -#> 10: 15 54 -#> 11: 15 20 -#> 12: 14 80 -#> 13: 16 40 -#> 14: 18 42 -#> 15: 17 40 -#> 16: 19 36 -#> 17: 19 68 -#> 18: 20 64 -#> 19: 20 52 -#> 20: 20 48 -#> 21: 25 85 -#> 22: 24 120 -#> speed dist
    -# clean up cars.df -delete(cars.df) -
    +the input. Weights are automatically standardised to sum to 1.

    +
    .env
    +

    DEPRECATED.

    +
    ...
    +

    ignored

    +
    + +
    +

    Examples

    +
    cars.df = as.disk.frame(cars)
    +
    +collect(sample_frac(cars.df, 0.5))
    +#>     speed dist
    +#>  1:     7   22
    +#>  2:     7    4
    +#>  3:     9   10
    +#>  4:    10   26
    +#>  5:    13   34
    +#>  6:    11   17
    +#>  7:    12   20
    +#>  8:    11   28
    +#>  9:    14   26
    +#> 10:    14   36
    +#> 11:    15   26
    +#> 12:    14   80
    +#> 13:    18   84
    +#> 14:    18   56
    +#> 15:    18   76
    +#> 16:    17   50
    +#> 17:    20   56
    +#> 18:    19   46
    +#> 19:    19   68
    +#> 20:    20   32
    +#> 21:    25   85
    +#> 22:    24   70
    +#>     speed dist
    +
    +# clean up cars.df
    +delete(cars.df)
    +
    +
    +
    -
    - +
    - - + + diff --git a/docs/reference/setup_disk.frame.html b/docs/reference/setup_disk.frame.html index 477f1e08..4b1ee1c7 100644 --- a/docs/reference/setup_disk.frame.html +++ b/docs/reference/setup_disk.frame.html @@ -1,67 +1,12 @@ - - - - - - - -Set up disk.frame environment — setup_disk.frame • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Set up disk.frame environment — setup_disk.frame • disk.frame - - + + - - -
    -
    - -
    - -
    +
    @@ -165,78 +95,71 @@

    Set up disk.frame environment

    Set up disk.frame environment

    -
    setup_disk.frame(
    -  workers = data.table::getDTthreads(),
    -  future_backend = future::multisession,
    -  ...,
    -  gui = FALSE
    -)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - -
    workers

    the number of workers (background R processes in the

    future_backend

    which future backend to use for parallelization

    ...

    passed to `future::plan`

    gui

    Whether to use a Graphical User Interface (GUI) for selecting the options. Defaults to FALSE

    - - -

    Examples

    -
    if (interactive()) { - # setup disk.frame to use multiple workers these may use more than two - # cores, and is therefore not allowed on CRAN. Hence it's set to run only in - # interactive session - setup_disk.frame() - - # use a Shiny GUI to adjust settings - # only run in interactive() - setup_disk.frame(gui = TRUE) -} - -# set the number workers to 2 -setup_disk.frame(2) -
    #> The number of workers available for disk.frame is 2
    -# if you do not wish to use multiple workers you can set it to sequential -setup_disk.frame(future_backend=future::sequential) -
    #> Warning: Detected 1 unknown future arguments: 'workers'
    #> The number of workers available for disk.frame is 1
    +
    +
    setup_disk.frame(
    +  workers = data.table::getDTthreads(),
    +  future_backend = future::multisession,
    +  ...,
    +  gui = FALSE
    +)
    +
    + +
    +

    Arguments

    +
    workers
    +

    the number of workers (background R processes in the

    +
    future_backend
    +

    which future backend to use for parallelization

    +
    ...
    +

    passed to `future::plan`

    +
    gui
    +

    Whether to use a Graphical User Interface (GUI) for selecting the options. Defaults to FALSE

    +
    + +
    +

    Examples

    +
    if (interactive()) {
    +  # setup disk.frame to use multiple workers these may use more than two
    +  # cores, and is therefore not allowed on CRAN. Hence it's set to run only in
    +  # interactive session
    +  setup_disk.frame()
    +  
    +  # use a Shiny GUI to adjust settings
    +  # only run in interactive()
    +  setup_disk.frame(gui = TRUE)
    +}
    +
    +# set the number workers to 2
    +setup_disk.frame(2)
    +#> The number of workers available for disk.frame is 2
    +
    +# if you do not wish to use multiple workers you can set it to sequential
    +setup_disk.frame(future_backend=future::sequential)
    +#> Warning: Detected 1 unknown future arguments: 'workers'
    +#> The number of workers available for disk.frame is 1
    +
    +
    +
    -
    - +
    - - + + diff --git a/docs/reference/shard.html b/docs/reference/shard.html index 6e4ed76b..eca41f84 100644 --- a/docs/reference/shard.html +++ b/docs/reference/shard.html @@ -1,68 +1,13 @@ - - - - - - - -Shard a data.frame/data.table or disk.frame into chunk and saves it into a disk.frame — shard • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Shard a data.frame/data.table or disk.frame into chunk and saves it into a disk.frame — shard • disk.frame + + - - - - -
    -
    - -
    - -
    +
    @@ -167,95 +97,76 @@

    Shard a data.frame/data.table or disk.frame into chunk and saves it into a d

    `distribute` is an alias for `shard`

    -
    shard(
    -  df,
    -  shardby,
    -  outdir = tempfile(fileext = ".df"),
    -  ...,
    -  nchunks = recommend_nchunks(df),
    -  overwrite = FALSE,
    -  shardby_function = "hash",
    -  sort_splits = NULL,
    -  desc_vars = NULL
    -)
    -
    -distribute(...)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    df

    A data.frame/data.table or disk.frame. If disk.frame, then rechunk(df, ...) is run

    shardby

    The column(s) to shard the data by.

    outdir

    The output directory of the disk.frame

    ...

    not used

    nchunks

    The number of chunks

    overwrite

    If TRUE then the chunks are overwritten

    shardby_function

    splitting of chunks: "hash" for hash function or "sort" for semi-sorted chunks

    sort_splits

    If shardby_function is "sort", the split values for sharding

    desc_vars

    for the "sort" shardby function, the variables to sort descending.

    - +
    +
    shard(
    +  df,
    +  shardby,
    +  outdir = tempfile(fileext = ".df"),
    +  ...,
    +  nchunks = recommend_nchunks(df),
    +  overwrite = FALSE,
    +  shardby_function = "hash",
    +  sort_splits = NULL,
    +  desc_vars = NULL
    +)
    +
    +distribute(...)
    +
    -

    Examples

    -
    -# shard the cars data.frame by speed so that rows with the same speed are in the same chunk -iris.df = shard(iris, "Species") -
    #> Hashing...
    -# clean up cars.df -delete(iris.df) -
    +
    +

    Arguments

    +
    df
    +

    A data.frame/data.table or disk.frame. If disk.frame, then rechunk(df, ...) is run

    +
    shardby
    +

    The column(s) to shard the data by.

    +
    outdir
    +

    The output directory of the disk.frame

    +
    ...
    +

    not used

    +
    nchunks
    +

    The number of chunks

    +
    overwrite
    +

    If TRUE then the chunks are overwritten

    +
    shardby_function
    +

    splitting of chunks: "hash" for hash function or "sort" for semi-sorted chunks

    +
    sort_splits
    +

    If shardby_function is "sort", the split values for sharding

    +
    desc_vars
    +

    for the "sort" shardby function, the variables to sort descending.

    +
    + +
    +

    Examples

    +
    
    +# shard the cars data.frame by speed so that rows with the same speed are in the same chunk
    +iris.df = shard(iris, "Species")
    +#> Hashing...
    +
    +# clean up cars.df
    +delete(iris.df)
    +
    +
    +
    -
    - +
    - - + + diff --git a/docs/reference/shardkey.html b/docs/reference/shardkey.html index a23b17ef..eeca73d7 100644 --- a/docs/reference/shardkey.html +++ b/docs/reference/shardkey.html @@ -1,67 +1,12 @@ - - - - - - - -Returns the shardkey (not implemented yet) — shardkey • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Returns the shardkey (not implemented yet) — shardkey • disk.frame - - - - + + -
    -
    - -
    - -
    +
    @@ -165,43 +95,37 @@

    Returns the shardkey (not implemented yet)

    Returns the shardkey (not implemented yet)

    -
    shardkey(df)
    - -

    Arguments

    - - - - - - -
    df

    a disk.frame

    +
    +
    shardkey(df)
    +
    +
    +

    Arguments

    +
    df
    +

    a disk.frame

    +
    +
    -
    - +
    - - + + diff --git a/docs/reference/shardkey_equal.html b/docs/reference/shardkey_equal.html index 14596490..e51669c5 100644 --- a/docs/reference/shardkey_equal.html +++ b/docs/reference/shardkey_equal.html @@ -1,67 +1,12 @@ - - - - - - - -Compare two disk.frame shardkeys — shardkey_equal • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Compare two disk.frame shardkeys — shardkey_equal • disk.frame - - - - + + -
    -
    - -
    - -
    +
    @@ -165,47 +95,39 @@

    Compare two disk.frame shardkeys

    Compare two disk.frame shardkeys

    -
    shardkey_equal(sk1, sk2)
    - -

    Arguments

    - - - - - - - - - - -
    sk1

    shardkey1

    sk2

    shardkey2

    +
    +
    shardkey_equal(sk1, sk2)
    +
    +
    +

    Arguments

    +
    sk1
    +

    shardkey1

    +
    sk2
    +

    shardkey2

    +
    +
    -
    - +
    - - + + diff --git a/docs/reference/show_ceremony.html b/docs/reference/show_ceremony.html index f3db3865..a4e1badc 100644 --- a/docs/reference/show_ceremony.html +++ b/docs/reference/show_ceremony.html @@ -1,67 +1,12 @@ - - - - - - - -Show the code to setup disk.frame — show_ceremony • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Show the code to setup disk.frame — show_ceremony • disk.frame - - - - + + -
    -
    - -
    - -
    +
    @@ -165,41 +95,38 @@

    Show the code to setup disk.frame

    Show the code to setup disk.frame

    -
    show_ceremony()
    +    
    +
    show_ceremony()
     
    -ceremony_text()
    +ceremony_text()
     
    -show_boilerplate()
    -
    -insert_ceremony()
    +show_boilerplate() +insert_ceremony()
    +
    + -
    - +
    - - + + diff --git a/docs/reference/srckeep.html b/docs/reference/srckeep.html index 898b76fa..934352ba 100644 --- a/docs/reference/srckeep.html +++ b/docs/reference/srckeep.html @@ -1,67 +1,12 @@ - - - - - - - -Keep only the variables from the input listed in selections — srckeep • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Keep only the variables from the input listed in selections — srckeep • disk.frame + + - - - - -
    -
    - -
    - -
    +
    @@ -165,117 +95,108 @@

    Keep only the variables from the input listed in selections

    Keep only the variables from the input listed in selections

    -
    srckeep(diskf, selections, ...)
    -
    -srckeepchunks(diskf, chunks, ...)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - -
    diskf

    a disk.frame

    selections

    The list of variables to keep from the input source

    ...

    not yet used

    chunks

    The chunks to load

    - +
    +
    srckeep(diskf, selections, ...)
     
    -    

    Examples

    -
    cars.df = as.disk.frame(cars) +srckeepchunks(diskf, chunks, ...)
    +
    -# when loading cars's chunks into RAM, load only the column speed -collect(srckeep(cars.df, "speed")) -
    #> speed -#> 1: 4 -#> 2: 4 -#> 3: 7 -#> 4: 7 -#> 5: 8 -#> 6: 9 -#> 7: 10 -#> 8: 10 -#> 9: 10 -#> 10: 11 -#> 11: 11 -#> 12: 12 -#> 13: 12 -#> 14: 12 -#> 15: 12 -#> 16: 13 -#> 17: 13 -#> 18: 13 -#> 19: 13 -#> 20: 14 -#> 21: 14 -#> 22: 14 -#> 23: 14 -#> 24: 15 -#> 25: 15 -#> 26: 15 -#> 27: 16 -#> 28: 16 -#> 29: 17 -#> 30: 17 -#> 31: 17 -#> 32: 18 -#> 33: 18 -#> 34: 18 -#> 35: 18 -#> 36: 19 -#> 37: 19 -#> 38: 19 -#> 39: 20 -#> 40: 20 -#> 41: 20 -#> 42: 20 -#> 43: 20 -#> 44: 22 -#> 45: 23 -#> 46: 24 -#> 47: 24 -#> 48: 24 -#> 49: 24 -#> 50: 25 -#> speed
    -# clean up cars.df -delete(cars.df) -
    +
    +

    Arguments

    +
    diskf
    +

    a disk.frame

    +
    selections
    +

    The list of variables to keep from the input source

    +
    ...
    +

    not yet used

    +
    chunks
    +

    The chunks to load

    +
    + +
    +

    Examples

    +
    cars.df = as.disk.frame(cars)
    +
    +# when loading cars's chunks into RAM, load only the column speed
    +collect(srckeep(cars.df, "speed"))
    +#>     speed
    +#>  1:     4
    +#>  2:     4
    +#>  3:     7
    +#>  4:     7
    +#>  5:     8
    +#>  6:     9
    +#>  7:    10
    +#>  8:    10
    +#>  9:    10
    +#> 10:    11
    +#> 11:    11
    +#> 12:    12
    +#> 13:    12
    +#> 14:    12
    +#> 15:    12
    +#> 16:    13
    +#> 17:    13
    +#> 18:    13
    +#> 19:    13
    +#> 20:    14
    +#> 21:    14
    +#> 22:    14
    +#> 23:    14
    +#> 24:    15
    +#> 25:    15
    +#> 26:    15
    +#> 27:    16
    +#> 28:    16
    +#> 29:    17
    +#> 30:    17
    +#> 31:    17
    +#> 32:    18
    +#> 33:    18
    +#> 34:    18
    +#> 35:    18
    +#> 36:    19
    +#> 37:    19
    +#> 38:    19
    +#> 39:    20
    +#> 40:    20
    +#> 41:    20
    +#> 42:    20
    +#> 43:    20
    +#> 44:    22
    +#> 45:    23
    +#> 46:    24
    +#> 47:    24
    +#> 48:    24
    +#> 49:    24
    +#> 50:    25
    +#>     speed
    +
    +# clean up cars.df
    +delete(cars.df)
    +
    +
    +
    -
    - +
    - - + + diff --git a/docs/reference/sub-.disk.frame.html b/docs/reference/sub-.disk.frame.html index 40e28919..b3ecb1b5 100644 --- a/docs/reference/sub-.disk.frame.html +++ b/docs/reference/sub-.disk.frame.html @@ -1,67 +1,12 @@ - - - - - - - -[ interface for disk.frame using fst backend — [.disk.frame • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -[ interface for disk.frame using fst backend — [.disk.frame • disk.frame - - + + - - -
    -
    - -
    - -
    +
    @@ -165,7 +95,8 @@

    [ interface for disk.frame using fst backend

    [ interface for disk.frame using fst backend

    -
    # S3 method for disk.frame
    +    
    +
    # S3 method for disk.frame
     [(
       df,
       ...,
    @@ -174,75 +105,59 @@ 

    [ interface for disk.frame using fst backend

    use.names = TRUE, fill = FALSE, idcol = NULL -)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    df

    a disk.frame

    ...

    same as data.table

    keep

    the columns to srckeep

    rbind

    Whether to rbind the chunks. Defaults to TRUE

    use.names

    Same as in data.table::rbindlist

    fill

    Same as in data.table::rbindlist

    idcol

    Same as in data.table::rbindlist

    - - -

    Examples

    -
    cars.df = as.disk.frame(cars) -speed_limit = 50 -cars.df[speed < speed_limit ,.N, cut(dist, pretty(dist))] -
    #> Error in .checkTypos(e, names_x): Object 'speed_limit' not found amongst speed, dist
    -# clean up -delete(cars.df) -
    +)
    +
    + +
    +

    Arguments

    +
    df
    +

    a disk.frame

    +
    ...
    +

    same as data.table

    +
    keep
    +

    the columns to srckeep

    +
    rbind
    +

    Whether to rbind the chunks. Defaults to TRUE

    +
    use.names
    +

    Same as in data.table::rbindlist

    +
    fill
    +

    Same as in data.table::rbindlist

    +
    idcol
    +

    Same as in data.table::rbindlist

    +
    + +
    +

    Examples

    +
    cars.df = as.disk.frame(cars)
    +speed_limit = 50
    +cars.df[speed < speed_limit ,.N, cut(dist, pretty(dist))]
    +#> Error in .checkTypos(e, names_x): Object 'speed_limit' not found amongst speed, dist
    +
    +# clean up
    +delete(cars.df)
    +
    +
    + -
    - +
    - - + + diff --git a/docs/reference/tbl_vars.disk.frame.html b/docs/reference/tbl_vars.disk.frame.html index 4869fa2f..d61fa7a6 100644 --- a/docs/reference/tbl_vars.disk.frame.html +++ b/docs/reference/tbl_vars.disk.frame.html @@ -1,68 +1,13 @@ - - - - - - - -Column names for RStudio auto-complete — tbl_vars.disk.frame • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Column names for RStudio auto-complete — tbl_vars.disk.frame • disk.frame - - - - + + -
    -
    - -
    - -
    +
    @@ -167,47 +97,41 @@

    Column names for RStudio auto-complete

    names

    -
    # S3 method for disk.frame
    -tbl_vars(x)
    +    
    +
    # S3 method for disk.frame
    +tbl_vars(x)
     
    -# S3 method for disk.frame
    -group_vars(x)
    - -

    Arguments

    - - - - - - -
    x

    a disk.frame

    +# S3 method for disk.frame +group_vars(x)
    +
    +
    +

    Arguments

    +
    x
    +

    a disk.frame

    +
    + -
    - +
    - - + + diff --git a/docs/reference/write_disk.frame.html b/docs/reference/write_disk.frame.html index 34f8bdf9..5f3f9fdb 100644 --- a/docs/reference/write_disk.frame.html +++ b/docs/reference/write_disk.frame.html @@ -1,68 +1,13 @@ - - - - - - - -Write disk.frame to disk — write_disk.frame • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Write disk.frame to disk — write_disk.frame • disk.frame + + - - - - -
    -
    - -
    - -
    +
    @@ -167,110 +97,89 @@

    Write disk.frame to disk

    then using the as.disk.frame function is recommended for most cases

    -
    write_disk.frame(
    -  df,
    -  outdir = tempfile(fileext = ".df"),
    -  nchunks = ifelse("disk.frame" %in% class(df), nchunks.disk.frame(df),
    -    recommend_nchunks(df)),
    -  overwrite = FALSE,
    -  shardby = NULL,
    -  compress = 50,
    -  shardby_function = "hash",
    -  sort_splits = NULL,
    -  desc_vars = NULL,
    -  ...
    -)
    -
    -output_disk.frame(...)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    df

    a disk.frame

    outdir

    output directory for the disk.frame

    nchunks

    number of chunks

    overwrite

    overwrite output directory

    shardby

    the columns to shard by

    compress

    compression ratio for fst files

    shardby_function

    splitting of chunks: "hash" for hash function or "sort" for semi-sorted chunks

    sort_splits

    for the "sort" shardby function, a dataframe with the split values.

    desc_vars

    for the "sort" shardby function, the variables to sort descending.

    ...

    passed to cmap.disk.frame

    - - -

    Examples

    -
    cars.df = as.disk.frame(cars) +
    +
    write_disk.frame(
    +  df,
    +  outdir = tempfile(fileext = ".df"),
    +  nchunks = ifelse("disk.frame" %in% class(df), nchunks.disk.frame(df),
    +    recommend_nchunks(df)),
    +  overwrite = FALSE,
    +  shardby = NULL,
    +  compress = 50,
    +  shardby_function = "hash",
    +  sort_splits = NULL,
    +  desc_vars = NULL,
    +  ...
    +)
    +
    +output_disk.frame(...)
    +
    -# write out a lazy disk.frame to disk -cars2.df = write_disk.frame(cmap(cars.df, ~.x[1,]), overwrite = TRUE) -collect(cars2.df) -
    #> speed dist -#> 1: 4 2 -#> 2: 11 17 -#> 3: 13 46 -#> 4: 16 40 -#> 5: 19 46 -#> 6: 24 70
    -# clean up cars.df -delete(cars.df) -delete(cars2.df) -
    +
    +

    Arguments

    +
    df
    +

    a disk.frame

    +
    outdir
    +

    output directory for the disk.frame

    +
    nchunks
    +

    number of chunks

    +
    overwrite
    +

    overwrite output directory

    +
    shardby
    +

    the columns to shard by

    +
    compress
    +

    compression ratio for fst files

    +
    shardby_function
    +

    splitting of chunks: "hash" for hash function or "sort" for semi-sorted chunks

    +
    sort_splits
    +

    for the "sort" shardby function, a dataframe with the split values.

    +
    desc_vars
    +

    for the "sort" shardby function, the variables to sort descending.

    +
    ...
    +

    passed to cmap.disk.frame

    +
    + +
    +

    Examples

    +
    cars.df = as.disk.frame(cars)
    +
    +# write out a lazy disk.frame to disk
    +cars2.df = write_disk.frame(cmap(cars.df, ~.x[1,]), overwrite = TRUE)
    +collect(cars2.df)
    +#>    speed dist
    +#> 1:     4    2
    +#> 2:    11   17
    +#> 3:    13   46
    +#> 4:    16   40
    +#> 5:    19   46
    +#> 6:    24   70
    +
    +# clean up cars.df
    +delete(cars.df)
    +delete(cars2.df)
    +
    +
    +
    -
    - +
    - - + + diff --git a/docs/reference/zip_to_disk.frame.html b/docs/reference/zip_to_disk.frame.html index a6f1a16e..9188f8ac 100644 --- a/docs/reference/zip_to_disk.frame.html +++ b/docs/reference/zip_to_disk.frame.html @@ -1,70 +1,15 @@ - - - - - - - -`zip_to_disk.frame` is used to read and convert every CSV file within the zip -file to disk.frame format — zip_to_disk.frame • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -`zip_to_disk.frame` is used to read and convert every CSV file within the zip +file to disk.frame format — zip_to_disk.frame • disk.frame - - + + - - -
    -
    - -
    - -
    +
    @@ -170,95 +100,84 @@

    `zip_to_disk.frame` is used to read and convert every CSV file within the zi file to disk.frame format

    -
    zip_to_disk.frame(
    -  zipfile,
    -  outdir,
    -  ...,
    -  validation.check = FALSE,
    -  overwrite = TRUE
    -)
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - - - - - -
    zipfile

    The zipfile

    outdir

    The output directory for disk.frame

    ...

    passed to fread

    validation.check

    should the function perform a check at the end to check for validity of output. It can detect issues with conversion

    overwrite

    overwrite output directory

    - -

    Value

    +
    +
    zip_to_disk.frame(
    +  zipfile,
    +  outdir,
    +  ...,
    +  validation.check = FALSE,
    +  overwrite = TRUE
    +)
    +
    +
    +

    Arguments

    +
    zipfile
    +

    The zipfile

    +
    outdir
    +

    The output directory for disk.frame

    +
    ...
    +

    passed to fread

    +
    validation.check
    +

    should the function perform a check at the end to check for validity of output. It can detect issues with conversion

    +
    overwrite
    +

    overwrite output directory

    +
    +
    +

    Value

    a list of disk.frame

    -

    See also

    - -

    Other ingesting data: -csv_to_disk.frame()

    - -

    Examples

    -
    # create a zip file containing a csv -csvfile = tempfile(fileext = ".csv") -write.csv(cars, csvfile) -zipfile = tempfile(fileext = ".zip") -zip(zipfile, csvfile) - -# read every file and convert it to a disk.frame -zip.df = zip_to_disk.frame(zipfile, tempfile(fileext = ".df")) +
    +
    +

    See also

    +

    Other ingesting data: +csv_to_disk.frame()

    +
    -# there is only one csv file so it return a list of one disk.frame -zip.df[[1]] -
    #> path: "C:\Users\RTX2080\AppData\Local\Temp\Rtmp2PGAkP\file4b14559e63cf.df/Users/RTX2080/AppData/Local/Temp/Rtmp2PGAkP/file4b1437ba44b8.csv" -#> nchunks: 6 -#> nrow (at source): 50 -#> ncol (at source): 3 -#> nrow (post operations): ??? -#> ncol (post operations): ???
    -# clean up -unlink(csvfile) -unlink(zipfile) -delete(zip.df[[1]]) -
    +
    +

    Examples

    +
    # create a zip file containing a csv
    +csvfile = tempfile(fileext = ".csv")
    +write.csv(cars, csvfile)
    +zipfile = tempfile(fileext = ".zip")
    +zip(zipfile, csvfile)
    +#> Warning: '"zip"' not found
    +
    +# read every file and convert it to a disk.frame
    +zip.df = zip_to_disk.frame(zipfile, tempfile(fileext = ".df"))
    +#> Error in unzip(zipfile, list = TRUE): zip file 'C:\Users\RTX2080\AppData\Local\Temp\Rtmp2rQjw5\file56f44b886b42.zip' cannot be opened
    +
    +# there is only one csv file so it return a list of one disk.frame
    +zip.df[[1]]
    +#> Error in eval(expr, envir, enclos): object 'zip.df' not found
    +
    +# clean up
    +unlink(csvfile)
    +unlink(zipfile)
    +delete(zip.df[[1]])
    +#> Error in "disk.frame" %in% class(df): object 'zip.df' not found
    +
    +
    +
    -
    - +
    - - + + diff --git a/docs/sitemap.xml b/docs/sitemap.xml new file mode 100644 index 00000000..4aa93c2a --- /dev/null +++ b/docs/sitemap.xml @@ -0,0 +1,294 @@ + + + + /02-intro-disk-frame.html + + + /404.html + + + /articles/01-intro-disk-frame.html + + + /articles/01-intro.html + + + /articles/02-common-questions.html + + + /articles/02-intro-disk-frame.html + + + /articles/03-concepts.html + + + /articles/03_concepts.html + + + /articles/04-ingesting-data.html + + + /articles/04_ingesting-data.html + + + /articles/05-data-table-syntax.html + + + /articles/06-vs-dask-juliadb.html + + + /articles/07-glm.html + + + /articles/08-more-epic.html + + + /articles/09-convenience-features.html + + + /articles/10-group-by.html + + + /articles/11-custom-group-by.html + + + /articles/88-trouble-shooting.html + + + /articles/common-questions.html + + + /articles/concepts.html + + + /articles/convenience-features.html + + + /articles/custom-group-by.html + + + /articles/data-table-syntax.html + + + /articles/glm.html + + + /articles/group-by.html + + + /articles/index.html + + + /articles/ingesting-data.html + + + /articles/intro-disk-frame.html + + + /articles/intro.html + + + /articles/more-epic.html + + + /articles/vs-dask-juliadb-2.html + + + /articles/vs-dask-juliadb.html + + + /authors.html + + + /index.html + + + /LICENSE-text.html + + + /news/index.html + + + /reference/add_chunk.html + + + /reference/as.data.frame.disk.frame.html + + + /reference/as.data.table.disk.frame.html + + + /reference/as.disk.frame.html + + + /reference/bloomfilter.html + + + /reference/chunk_group_by.html + + + /reference/cmap.html + + + /reference/cmap2.html + + + /reference/collect.html + + + /reference/colnames.html + + + /reference/compute.disk.frame.html + + + /reference/create_chunk_mapper.html + + + /reference/create_dplyr_mapper.html + + + /reference/csv_to_disk.frame.html + + + /reference/delete.html + + + /reference/dfglm.html + + + /reference/df_ram_size.html + + + 
/reference/disk.frame.html + + + /reference/dplyr_verbs.html + + + /reference/evalparseglue.html + + + /reference/foverlaps.disk.frame.html + + + /reference/gen_datatable_synthetic.html + + + /reference/get_chunk.html + + + /reference/get_chunk_ids.html + + + /reference/groups.disk.frame.html + + + /reference/group_by.html + + + /reference/hard_arrange.html + + + /reference/hard_group_by.html + + + /reference/head_tail.html + + + /reference/index.html + + + /reference/is_disk.frame.html + + + /reference/join.html + + + /reference/make_glm_streaming_fn.html + + + /reference/map.html + + + /reference/map2.html + + + /reference/mean.chunk_agg.disk.frame.html + + + /reference/mean.collected_agg.disk.frame.html + + + /reference/merge.disk.frame.html + + + /reference/move_to.html + + + /reference/nchunks.html + + + /reference/ncol_nrow.html + + + /reference/one-stage-group-by-verbs.html + + + /reference/overwrite_check.html + + + /reference/print.disk.frame.html + + + /reference/pull.disk.frame.html + + + /reference/rbindlist.disk.frame.html + + + /reference/rechunk.html + + + /reference/recommend_nchunks.html + + + /reference/remove_chunk.html + + + /reference/sample.html + + + /reference/setup_disk.frame.html + + + /reference/shard.html + + + /reference/shardkey.html + + + /reference/shardkey_equal.html + + + /reference/show_ceremony.html + + + /reference/srckeep.html + + + /reference/sub-.disk.frame.html + + + /reference/summarise.grouped_disk.frame.html + + + /reference/tbl_vars.disk.frame.html + + + /reference/write_disk.frame.html + + + /reference/zip_to_disk.frame.html + +