diff --git a/.Rbuildignore b/.Rbuildignore index c05278e2..5c5d5b95 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -64,4 +64,5 @@ vignettes.asis.template vignettes.Rnw.template ^codecov\.yml$ new-nse-dev.r -test-poorman.R \ No newline at end of file +test-poorman.R +*.parquet \ No newline at end of file diff --git a/CRAN-RELEASE b/CRAN-RELEASE index f72236de..48d603de 100644 --- a/CRAN-RELEASE +++ b/CRAN-RELEASE @@ -1,2 +1,2 @@ -This package was submitted to CRAN on 2021-02-13. -Once it is accepted, delete this file and tag the release (commit f7dd3db). +This package was submitted to CRAN on 2021-03-12. +Once it is accepted, delete this file and tag the release (commit 34bafaa). diff --git a/DESCRIPTION b/DESCRIPTION index a9207fc2..0f848ee8 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Type: Package Package: disk.frame Title: Larger-than-RAM Disk-Based Data Manipulation Framework -Version: 0.4.0 -Date: 2021-02-11 +Version: 0.5.0 +Date: 2021-05-09 Authors@R: c( person("Dai", "ZJ", email = "zhuojia.dai@gmail.com", role = c("aut", "cre")), person("Jacky", "Poon", role = c("ctb")) diff --git a/NAMESPACE b/NAMESPACE index 51fa7439..da78d075 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -67,7 +67,6 @@ S3method(transmute,disk.frame) export(IQR_df.chunk_agg.disk.frame) export(IQR_df.collected_agg.disk.frame) export(add_chunk) -export(add_count.disk.frame) export(add_tally.disk.frame) export(all_df.chunk_agg.disk.frame) export(all_df.collected_agg.disk.frame) @@ -176,7 +175,6 @@ importFrom(data.table,setDT) importFrom(data.table,setkey) importFrom(data.table,setkeyv) importFrom(data.table,timetaken) -importFrom(dplyr,add_count) importFrom(dplyr,add_tally) importFrom(dplyr,anti_join) importFrom(dplyr,arrange) diff --git a/NEWS.md b/NEWS.md index f06cc6e6..088222ce 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,9 @@ +# disk.frame 0.5 +* removed `add_count` method + +# disk.frame 0.4.1 +* removed use of `sysctl` which was violating CRAN policy + # disk.frame 0.4.0 
* Removed `count` and `tally` * Fixed package compatibility diff --git a/R/dplyr_verbs.r b/R/dplyr_verbs.r index e92e678d..75be5774 100644 --- a/R/dplyr_verbs.r +++ b/R/dplyr_verbs.r @@ -70,10 +70,10 @@ chunk_arrange <- create_chunk_mapper(dplyr::arrange) # TODO alot of these .disk.frame functions are not generic -#' @export -#' @importFrom dplyr add_count -#' @rdname dplyr_verbs -add_count.disk.frame <- create_chunk_mapper(dplyr::add_count) +#' #' @export +#' #' @importFrom dplyr add_count +#' #' @rdname dplyr_verbs +#' add_count.disk.frame <- create_chunk_mapper(dplyr::add_count) #' @export diff --git a/R/recommend_nchunks.r b/R/recommend_nchunks.r index 8d77e1f5..e5be6ac0 100644 --- a/R/recommend_nchunks.r +++ b/R/recommend_nchunks.r @@ -91,16 +91,18 @@ df_ram_size <- function() { } } } else { - os = R.version$os - if (length(grep("^darwin", os))) { - a = substring(system("sysctl hw.memsize", intern = TRUE), 13) - } #else { + #os = R.version$os + #if (length(grep("^darwin", os))) { + #a = substring(system("sysctl hw.memsize", intern = TRUE), 13) + # the above is not allowed by CRAN + #} #else { # This would work but is not allowed by CRAN #a = system('grep MemTotal /proc/meminfo', intern = TRUE) #} - l = strsplit(a, " ")[[1]] - l = as.numeric(l[length(l)-1]) - ram_size = l/1024^2 + #l = strsplit(a, " ")[[1]] + #l = as.numeric(l[length(l)-1]) + #ram_size = l/1024^2 + ram_size = 16 # to be conservative } if(is.null(ram_size)) { diff --git a/README.Rmd b/README.Rmd index 06bd5fa8..e5d8beaf 100644 --- a/README.Rmd +++ b/README.Rmd @@ -250,7 +250,7 @@ ncol(flights.df) ## Hex logo -![disk.frame logo](inst/figures/logo.png?raw=true) +![disk.frame logo](inst/figures/logo.png) ## Contributors @@ -283,6 +283,7 @@ The work priorities at this stage are | [深入对比数据科学工具箱:Python3 和 R 之争(2020版)](https://segmentfault.com/a/1190000021653567) | Chinese | Harry Zhu | 2020-02-16 | Mentions disk.frame | + ### Interested in learning `{disk.frame}` in a structured course? 
Please register your interest at: diff --git a/README.md b/README.md index 751b0a66..8b8b1af9 100644 --- a/README.md +++ b/README.md @@ -211,15 +211,12 @@ flights.df %>% filter(year == 2013) %>% mutate(origin_dest = paste0(origin, dest)) %>% head(2) -#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay -#> 1 2013 1 1 517 515 2 830 819 11 -#> 2 2013 1 1 533 529 4 850 830 20 -#> carrier flight tailnum origin dest air_time distance hour minute time_hour -#> 1 UA 1545 N14228 EWR IAH 227 1400 5 15 2013-01-01 05:00:00 -#> 2 UA 1714 N24211 LGA IAH 227 1416 5 29 2013-01-01 05:00:00 -#> origin_dest -#> 1 EWRIAH -#> 2 LGAIAH +#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier +#> 1 2013 1 1 517 515 2 830 819 11 UA +#> 2 2013 1 1 533 529 4 850 830 20 UA +#> flight tailnum origin dest air_time distance hour minute time_hour origin_dest +#> 1 1545 N14228 EWR IAH 227 1400 5 15 2013-01-01 05:00:00 EWRIAH +#> 2 1714 N24211 LGA IAH 227 1416 5 29 2013-01-01 05:00:00 LGAIAH ``` ### Group-by @@ -276,15 +273,6 @@ obtained using estimated methods. ``` r library(data.table) -#> data.table 1.13.6 using 6 threads (see ?getDTthreads). 
Latest news: r-datatable.com -#> -#> Attaching package: 'data.table' -#> The following object is masked from 'package:purrr': -#> -#> transpose -#> The following objects are masked from 'package:dplyr': -#> -#> between, first, last suppressWarnings( grp_by_stage1 <- @@ -325,7 +313,7 @@ To find out where the disk.frame is stored on disk: ``` r # where is the disk.frame stored attr(flights.df, "path") -#> [1] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\Rtmpk3aGAr\\file3adc78655410.df" +#> [1] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpIlXNzn\\file568813b835a7.df" ``` A number of data.frame functions are implemented for disk.frame @@ -333,19 +321,19 @@ A number of data.frame functions are implemented for disk.frame ``` r # get first few rows head(flights.df, 1) -#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay -#> 1: 2013 1 1 517 515 2 830 819 11 -#> carrier flight tailnum origin dest air_time distance hour minute time_hour -#> 1: UA 1545 N14228 EWR IAH 227 1400 5 15 2013-01-01 05:00:00 +#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier +#> 1: 2013 1 1 517 515 2 830 819 11 UA +#> flight tailnum origin dest air_time distance hour minute time_hour +#> 1: 1545 N14228 EWR IAH 227 1400 5 15 2013-01-01 05:00:00 ``` ``` r # get last few rows tail(flights.df, 1) -#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay -#> 1: 2013 9 30 NA 840 NA NA 1020 NA -#> carrier flight tailnum origin dest air_time distance hour minute time_hour -#> 1: MQ 3531 N839MQ LGA RDU NA 431 8 40 2013-09-30 08:00:00 +#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier +#> 1: 2013 9 30 NA 840 NA NA 1020 NA MQ +#> flight tailnum origin dest air_time distance hour minute time_hour +#> 1: 3531 N839MQ LGA RDU NA 431 8 40 2013-09-30 08:00:00 ``` ``` r @@ -362,7 +350,7 @@ ncol(flights.df) ## Hex logo -![disk.frame logo](inst/figures/logo.png?raw=true) 
+![disk.frame logo](inst/figures/logo.png) ## Contributors @@ -456,11 +444,3 @@ ways? Here are some ways you can contribute [![](https://cranlogs.r-pkg.org/badges/disk.frame)](https://cran.r-project.org/package=disk.frame) [![](http://cranlogs.r-pkg.org/badges/grand-total/disk.frame)](https://cran.r-project.org/package=disk.frame) -[![Travis build -status](https://travis-ci.org/xiaodaigh/disk.frame.svg?branch=master)](https://travis-ci.org/xiaodaigh/disk.frame) -[![AppVeyor build -status](https://ci.appveyor.com/api/projects/status/github/xiaodaigh/disk.frame?branch=master&svg=true)](https://ci.appveyor.com/project/xiaodaigh/disk.frame) - -## Live Stream of `{disk.frame}` development - -- diff --git a/book/01-intro.Rmd b/book/01-intro.Rmd index d0df5a20..0ffc5a2e 100644 --- a/book/01-intro.Rmd +++ b/book/01-intro.Rmd @@ -3,7 +3,7 @@ title: "Preface - The birth of `disk.frame`" author: "ZJ" output: rmarkdown::html_vignette vignette: > - %\VignetteIndexEntry{preface} + %\VignetteIndexEntry{Preface - The birth of `disk.frame`} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- diff --git a/book/06-vs-dask-juliadb.Rmd b/book/06-vs-dask-juliadb.Rmd index 830081f9..c018af59 100644 --- a/book/06-vs-dask-juliadb.Rmd +++ b/book/06-vs-dask-juliadb.Rmd @@ -3,7 +3,7 @@ title: "Benchmarks 1: disk.frame beats Dask! disk.frame beats JuliaDB! Anyone el author: "ZJ" output: rmarkdown::html_vignette vignette: > - %\VignetteIndexEntry{benchmark-1} + %\VignetteIndexEntry{Benchmarks 1: disk.frame beats Dask! disk.frame beats JuliaDB! 
Anyone else wanna challenge?} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- diff --git a/book/10-group-by.Rmd b/book/10-group-by.Rmd index f0f6ec34..1f47eb55 100644 --- a/book/10-group-by.Rmd +++ b/book/10-group-by.Rmd @@ -58,22 +58,23 @@ It is important to note that not all functions that can run in `dplyr::summarize If a function you need/like is missing, please make a feature request [here](https://github.com/xiaodaigh/disk.frame/issues). It is a limitation that function that depend on the order a column can only obtained using estimated methods. -| Function | Exact/Estimate | Notes | -| -- | -- | -- | -| `min` | Exact | | -| `max` | Exact | | -| `mean` | Exact | | -| `sum` | Exact | | -| `length` | Exact | | -| `n` | Exact | | -| `n_distinct` | Exact | | -| `sd` | Exact | | -| `var` | Exact | `var(x)` only `cor, cov` support *planned* | -| `any` | Exact | | -| `all` | Exact | | -| `median` | Estimate | | -| `quantile` | Estimate | One quantile only | -| `IQR` | Estimate | | +| Function | Exact/Estimate | Notes | +|--------------|----------------|--------------------------------------------| +| `min` | Exact | | +| `max` | Exact | | +| `mean` | Exact | | +| `sum` | Exact | | +| `length` | Exact | | +| `n` | Exact | | +| `n_distinct` | Exact | | +| `sd` | Exact | | +| `var` | Exact | `var(x)` only `cor, cov` support *planned* | +| `any` | Exact | | +| `all` | Exact | | +| `median` | Estimate | | +| `quantile` | Estimate | One quantile only | +| `IQR` | Estimate | | + ### Notes on One-Stage group-by diff --git a/book/88-trouble-shooting.Rmd b/book/88-trouble-shooting.Rmd new file mode 100644 index 00000000..f9cbca51 --- /dev/null +++ b/book/88-trouble-shooting.Rmd @@ -0,0 +1,53 @@ +--- +title: "Trouble shooting" +author: "ZJ" +output: pdf_document +--- + +```{r include=FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + eval=TRUE, + include=TRUE +) +``` + +### Steps to trouble shoot + +1. 
I suggest updating {future} and your R version if you have not already done so. + +2. Are you able to share the data? + +3. Do a good MWE +``` +library(disk.frame) +setup_disk.frame() + +df<-as.disk.frame(a) + + +df1 = mutate(df, date = as.Date(as.character(datadate), format="%Y%m%d")) + +head(df1) +``` + + +4. Check if your virus scanner is blocking interprocess communication + +5. Try to apply the function to just one chunk, perhaps there is a syntax error or column error? If one chunk works then you can rule out coding error + +``` +get_chunk(df, 1) %>% + mutate(date = as.Date(as.character(datadate), format="%Y%m%d")) +``` + +6. Set the number of workers to 1, so there is no more inter-process communication. Does it work now? If it does, then it's the inter process communication. You might need to contact your admin for help + +``` +setup_disk.frame(workers=1) +mutate(df, date = as.Date(as.character(datadate), format="%Y%m%d")) +As an MWE this works for me. + +a = data.frame(datadate = rep("20201007", 3e6)) +``` diff --git a/cran-comments.md b/cran-comments.md index 3784d366..b661d1a8 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,11 +1,11 @@ -## Submission for v0.4.0 -* Fixed recently reported warnings +## Submission for v0.5.0 +* Fixed issue in CRAN check but needed to update version to follow semver conventions ## Test environments -* local Windows 10 Pro install, R 4.0.3 -* local Windows 10 Pro install, R devel (as of 2021-02-11) -* local Linux/Ubuntu install, R 4.0.3 -* local Linux/Ubuntu install, R devel (as of 2021-02-11) +* local Windows 10 Pro install, R 4.0.5 +* local Windows 10 Pro install, R devel (as of 2021-05-09) +* local Linux/Ubuntu install, R 4.0.5 +* local Linux/Ubuntu install, R devel (as of 2021-05-09) ## R CMD check results There were no ERRORs nor WARNINGs nor NOTE when run locally. 
diff --git a/docs/404.html b/docs/404.html index 8c5ef8b1..78100fa4 100644 --- a/docs/404.html +++ b/docs/404.html @@ -1,66 +1,27 @@ - - - - + + + + - Page not found (404) • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + - - - - -
-
- + +
+ + + - - -
+
+
-
- +
+ + - - diff --git a/docs/LICENSE-text.html b/docs/LICENSE-text.html index f2435efd..abba5cc8 100644 --- a/docs/LICENSE-text.html +++ b/docs/LICENSE-text.html @@ -1,66 +1,12 @@ - - - - - - - -License • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -License • disk.frame - - - - + + -
-
- -
- -
+
+
-
- +
- - + + diff --git a/docs/articles/01-intro.html b/docs/articles/01-intro.html index 93f57ca8..584809c3 100644 --- a/docs/articles/01-intro.html +++ b/docs/articles/01-intro.html @@ -19,6 +19,8 @@ + +
- - -
- +
- - + + diff --git a/docs/authors.html b/docs/authors.html index fd9e2f46..342af2fa 100644 --- a/docs/authors.html +++ b/docs/authors.html @@ -1,66 +1,12 @@ - - - - - - - -Authors • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Authors and Citation • disk.frame - + + - - - - -
-
-
- -
+
- @@ -172,22 +126,20 @@

Authors

-
- +
- - + + diff --git a/docs/index.html b/docs/index.html index 41066c9b..89c4a22e 100644 --- a/docs/index.html +++ b/docs/index.html @@ -23,6 +23,8 @@ + +
-
- -
- +
- - + + diff --git a/docs/reference/compute.disk.frame.html b/docs/reference/compute.disk.frame.html index 8204b014..8feaf6eb 100644 --- a/docs/reference/compute.disk.frame.html +++ b/docs/reference/compute.disk.frame.html @@ -1,67 +1,12 @@ - - - - - - - -Compute without writing — compute.disk.frame • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Compute without writing — compute.disk.frame • disk.frame + + - - - - -
-
- -
- -
+
@@ -162,75 +95,64 @@

Compute without writing

Perform the computation; same as calling cmap without .f and lazy = FALSE

-
# S3 method for disk.frame
-compute(
-  x,
-  name,
-  outdir = tempfile("tmp_df_", fileext = ".df"),
-  overwrite = TRUE,
-  ...
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - -
x

a disk.frame

name

Not used. Kept for compatibility with dplyr

outdir

the output directory

overwrite

whether to overwrite or not

...

Not used. Kept for dplyr compatibility

- - -

Examples

-
cars.df = as.disk.frame(cars) -cars.df2 = cars.df %>% cmap(~.x) -# the computation is performed and the data is now stored elsewhere -cars.df3 = compute(cars.df2) - -# clean up -delete(cars.df) -delete(cars.df3)
+
+
# S3 method for disk.frame
+compute(
+  x,
+  name,
+  outdir = tempfile("tmp_df_", fileext = ".df"),
+  overwrite = TRUE,
+  ...
+)
+
+ +
+

Arguments

+
x
+

a disk.frame

+
name
+

Not used. Kept for compatibility with dplyr

+
outdir
+

the output directory

+
overwrite
+

whether to overwrite or not

+
...
+

Not used. Kept for dplyr compatibility

+
+ +
+

Examples

+
cars.df = as.disk.frame(cars)
+cars.df2 = cars.df %>% cmap(~.x)
+# the computation is performed and the data is now stored elsewhere
+cars.df3 = compute(cars.df2)
+
+# clean up
+delete(cars.df)
+delete(cars.df3)
+
+
+
-
- +
- - + + diff --git a/docs/reference/create_chunk_mapper.html b/docs/reference/create_chunk_mapper.html index 9fc66aac..7c28d311 100644 --- a/docs/reference/create_chunk_mapper.html +++ b/docs/reference/create_chunk_mapper.html @@ -1,67 +1,12 @@ - - - - - - - -Create function that applies to each chunk if disk.frame — create_chunk_mapper • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Create function that applies to each chunk if disk.frame — create_chunk_mapper • disk.frame - - - - + + -
-
- -
- -
+
@@ -162,104 +95,102 @@

Create function that applies to each chunk if disk.frame

A function to make it easier to create functions like filter

-
create_chunk_mapper(chunk_fn, warning_msg = NULL, as.data.frame = TRUE)
- -

Arguments

- - - - - - - - - - - - - - -
chunk_fn

The dplyr function to create a mapper for

warning_msg

The warning message to display when invoking the mapper

as.data.frame

force the input chunk of a data.frame; needed for dtplyr

- - -

Examples

-
-filter = create_chunk_mapper(dplyr::filter) - -#' example: creating a function that keeps only the first and last n row -first_and_last <- function(chunk, n, ...) { - nr = nrow(chunk) - print(nr-n+1:nr) - chunk[c(1:n, (nr-n+1):nr), ] -} - -#' create the function for use with disk.frame -first_and_last_df = create_chunk_mapper(first_and_last) - -mtcars.df = as.disk.frame(mtcars) - -#' the operation is lazy -lazy_mtcars.df = mtcars.df %>% - first_and_last_df(2) +
+
create_chunk_mapper(chunk_fn, warning_msg = NULL, as.data.frame = TRUE)
+
-#' bring into R -collect(lazy_mtcars.df)
#> [1] 5 6 7 8 9 10 -#> [1] 5 6 7 8 9 10 -#> [1] 5 6 7 8 9 10 -#> [1] 5 6 7 8 9 10 -#> [1] 5 6 7 8 9 10 -#> [1] 1 2
#> mpg cyl disp hp drat wt qsec vs am gear carb -#> ...1 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4 -#> ...2 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4 -#> ...3 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2 -#> ...4 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1 -#> ...5 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4 -#> ...6 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2 -#> ...7 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4 -#> ...8 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3 -#> ...9 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3 -#> ...10 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3 -#> ...11 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4 -#> ...12 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1 -#> ...13 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2 -#> ...14 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1 -#> ...15 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2 -#> ...16 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4 -#> ...17 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2 -#> ...18 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1 -#> ...19 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4 -#> ...20 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6 -#> 1 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8 -#> 2 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2 -#> 1.1 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8 -#> 2.1 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
-#' clean up -delete(mtcars.df)
+
+

Arguments

+
chunk_fn
+

The dplyr function to create a mapper for

+
warning_msg
+

The warning message to display when invoking the mapper

+
as.data.frame
+

force the input chunk of a data.frame; needed for dtplyr

+
+ +
+

Examples

+

+filter = create_chunk_mapper(dplyr::filter)
+
+#' example: creating a function that keeps only the first and last n row
+first_and_last <- function(chunk, n, ...) {
+  nr = nrow(chunk)
+  print(nr-n+1:nr)
+  chunk[c(1:n, (nr-n+1):nr), ]
+}
+
+#' create the function for use with disk.frame
+first_and_last_df = create_chunk_mapper(first_and_last)
+
+mtcars.df = as.disk.frame(mtcars)
+
+#' the operation is lazy
+lazy_mtcars.df = mtcars.df %>%
+  first_and_last_df(2)
+
+#' bring into R
+collect(lazy_mtcars.df)
+#> [1]  5  6  7  8  9 10
+#> [1]  5  6  7  8  9 10
+#> [1]  5  6  7  8  9 10
+#> [1]  5  6  7  8  9 10
+#> [1]  5  6  7  8  9 10
+#> [1] 1 2
+#>      mpg cyl  disp  hp drat    wt  qsec vs am gear carb
+#>  1: 21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
+#>  2: 21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
+#>  3: 18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
+#>  4: 18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
+#>  5: 14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
+#>  6: 24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
+#>  7: 17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4
+#>  8: 16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3
+#>  9: 17.3   8 275.8 180 3.07 3.730 17.60  0  0    3    3
+#> 10: 15.2   8 275.8 180 3.07 3.780 18.00  0  0    3    3
+#> 11: 14.7   8 440.0 230 3.23 5.345 17.42  0  0    3    4
+#> 12: 32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1
+#> 13: 30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2
+#> 14: 33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
+#> 15: 15.2   8 304.0 150 3.15 3.435 17.30  0  0    3    2
+#> 16: 13.3   8 350.0 245 3.73 3.840 15.41  0  0    3    4
+#> 17: 19.2   8 400.0 175 3.08 3.845 17.05  0  0    3    2
+#> 18: 27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1
+#> 19: 15.8   8 351.0 264 4.22 3.170 14.50  0  1    5    4
+#> 20: 19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6
+#> 21: 15.0   8 301.0 335 3.54 3.570 14.60  0  1    5    8
+#> 22: 21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2
+#> 23: 15.0   8 301.0 335 3.54 3.570 14.60  0  1    5    8
+#> 24: 21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2
+#>      mpg cyl  disp  hp drat    wt  qsec vs am gear carb
+
+#' clean up
+delete(mtcars.df)
+
+
+
+
-
- +
- - + + diff --git a/docs/reference/csv_to_disk.frame.html b/docs/reference/csv_to_disk.frame.html index 3f80f571..17b0d387 100644 --- a/docs/reference/csv_to_disk.frame.html +++ b/docs/reference/csv_to_disk.frame.html @@ -1,67 +1,12 @@ - - - - - - - -Convert CSV file(s) to disk.frame format — csv_to_disk.frame • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Convert CSV file(s) to disk.frame format — csv_to_disk.frame • disk.frame - + + - - - -
-
- -
- -
+
@@ -162,141 +95,115 @@

Convert CSV file(s) to disk.frame format

Convert CSV file(s) to disk.frame format

-
csv_to_disk.frame(
-  infile,
-  outdir = tempfile(fileext = ".df"),
-  inmapfn = base::I,
-  nchunks = recommend_nchunks(sum(file.size(infile))),
-  in_chunk_size = NULL,
-  shardby = NULL,
-  compress = 50,
-  overwrite = TRUE,
-  header = TRUE,
-  .progress = TRUE,
-  backend = c("data.table", "readr", "LaF"),
-  chunk_reader = c("bigreadr", "data.table", "readr", "readLines"),
-  ...
-)
+
+
csv_to_disk.frame(
+  infile,
+  outdir = tempfile(fileext = ".df"),
+  inmapfn = base::I,
+  nchunks = recommend_nchunks(sum(file.size(infile))),
+  in_chunk_size = NULL,
+  shardby = NULL,
+  compress = 50,
+  overwrite = TRUE,
+  header = TRUE,
+  .progress = TRUE,
+  backend = c("data.table", "readr", "LaF"),
+  chunk_reader = c("bigreadr", "data.table", "readr", "readLines"),
+  ...
+)
+
-

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
infile

The input CSV file or files

outdir

The directory to output the disk.frame to

inmapfn

A function to be applied to the chunk read in from CSV before +

+

Arguments

+
infile
+

The input CSV file or files

+
outdir
+

The directory to output the disk.frame to

+
inmapfn
+

A function to be applied to the chunk read in from CSV before the chunk is being written out. Commonly used to perform simple -transformations. Defaults to the identity function (ie. no transformation)

nchunks

Number of chunks to output

in_chunk_size

When reading in the file, how many lines to read in at +transformations. Defaults to the identity function (ie. no transformation)

+
nchunks
+

Number of chunks to output

+
in_chunk_size
+

When reading in the file, how many lines to read in at once. This is different to nchunks which controls how many chunks are -output

shardby

The column(s) to shard the data by. For example suppose +output

+
shardby
+

The column(s) to shard the data by. For example suppose `shardby = c("col1","col2")` then every row where the values `col1` and `col2` are the same will end up in the same chunk; this will allow merging -by `col1` and `col2` to be more efficient

compress

For fst backends it's a number between 0 and 100 where 100 is -the highest compression ratio.

overwrite

Whether to overwrite the existing directory

header

Whether the files have header. Defaults to TRUE

.progress

A logical, for whether or not to print a progress bar for -multiprocess, multisession, and multicore plans. From furrr

backend

The CSV reader backend to choose: "data.table" or "readr". +by `col1` and `col2` to be more efficient

+
compress
+

For fst backends it's a number between 0 and 100 where 100 is +the highest compression ratio.

+
overwrite
+

Whether to overwrite the existing directory

+
header
+

Whether the files have header. Defaults to TRUE

+
.progress
+

A logical, for whether or not to print a progress bar for +multiprocess, multisession, and multicore plans. From furrr

+
backend
+

The CSV reader backend to choose: "data.table" or "readr". disk.frame does not have its own CSV reader. It uses either data.table::fread or readr::read_delimited. It is worth noting that data.table::fread does not detect dates and all dates are imported as strings, and you are encouraged to use fasttime to convert the strings to date. You can use the `inmapfn` to do that. However, if you want automatic date detection, then backend="readr" may suit your needs. However, readr -is often slower than data.table, hence data.table is chosen as the default.

chunk_reader

Even if you choose a backend there can still be multiple +is often slower than data.table, hence data.table is chosen as the default.

+
chunk_reader
+

Even if you choose a backend there can still be multiple strategies on how to approach the CSV reads. For example, data.table::fread tries to mmap the whole file which can cause the whole read process to fail. In that case we can change the chunk_reader to "readLines" which uses the readLines function to read chunk by chunk and still use data.table::fread to process the chunks. There are currently no strategies for readr backend, -except the default one.

...

passed to data.table::fread, disk.frame::as.disk.frame, -disk.frame::shard

- -

See also

- -

Other ingesting data: -zip_to_disk.frame()

- -

Examples

-
tmpfile = tempfile() -write.csv(cars, tmpfile) -tmpdf = tempfile(fileext = ".df") -df = csv_to_disk.frame(tmpfile, outdir = tmpdf, overwrite = TRUE) +except the default one.

+
...
+

passed to data.table::fread, disk.frame::as.disk.frame, +disk.frame::shard

+
+
+

See also

+

Other ingesting data: +zip_to_disk.frame()

+
-# clean up -fs::file_delete(tmpfile) -delete(df)
+
+

Examples

+
tmpfile = tempfile()
+write.csv(cars, tmpfile)
+tmpdf = tempfile(fileext = ".df")
+df = csv_to_disk.frame(tmpfile, outdir = tmpdf, overwrite = TRUE)
+
+# clean up
+fs::file_delete(tmpfile)
+delete(df)
+
+
+
-
- +
- - + + diff --git a/docs/reference/delete.html b/docs/reference/delete.html index f44d0caf..2a238505 100644 --- a/docs/reference/delete.html +++ b/docs/reference/delete.html @@ -1,67 +1,12 @@ - - - - - - - -Delete a disk.frame — delete • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Delete a disk.frame — delete • disk.frame + + - - - - -
-
- -
- -
+
@@ -162,46 +95,43 @@

Delete a disk.frame

Delete a disk.frame

-
delete(df)
- -

Arguments

- - - - - - -
df

a disk.frame

- +
+
delete(df)
+
-

Examples

-
cars.df = as.disk.frame(cars) -delete(cars.df)
+
+

Arguments

+
df
+

a disk.frame

+
+ +
+

Examples

+
cars.df = as.disk.frame(cars)
+delete(cars.df)
+
+
+
-
- +
- - + + diff --git a/docs/reference/df_ram_size.html b/docs/reference/df_ram_size.html index 6a7c7b25..57bfaa8d 100644 --- a/docs/reference/df_ram_size.html +++ b/docs/reference/df_ram_size.html @@ -1,67 +1,12 @@ - - - - - - - -Get the size of RAM in gigabytes — df_ram_size • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Get the size of RAM in gigabytes — df_ram_size • disk.frame - + + - - - -
-
- -
- -
+
@@ -162,41 +95,43 @@

Get the size of RAM in gigabytes

Get the size of RAM in gigabytes

-
df_ram_size()
- - -

Value

+
+
df_ram_size()
+
+
+

Value

integer of RAM in gigabyte (GB)

+
-

Examples

-
# returns the RAM size in gigabyte (GB) -df_ram_size()
#> [1] 64
+
+

Examples

+
# returns the RAM size in gigabyte (GB)
+df_ram_size() 
+#> [1] 64
+
+
+
-
- +
- - + + diff --git a/docs/reference/dfglm.html b/docs/reference/dfglm.html index 75c7571c..1197ac98 100644 --- a/docs/reference/dfglm.html +++ b/docs/reference/dfglm.html @@ -1,68 +1,13 @@ - - - - - - - -Fit generalized linear models (glm) with disk.frame — dfglm • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Fit generalized linear models (glm) with disk.frame — dfglm • disk.frame - - + + - - -
-
- -
- -
+
@@ -164,37 +97,30 @@

Fit generalized linear models (glm) with disk.frame

those return by those functions. This is a convenience wrapper

-
dfglm(formula, data, ..., glm_backend = c("biglm", "speedglm", "biglmm"))
+
+
dfglm(formula, data, ..., glm_backend = c("biglm", "speedglm", "biglmm"))
+
-

Arguments

- - - - - - - - - - - - - - - - - - -
formula

A model formula

data

See Details below. Method dispatch is on this argument

...

Additional arguments

glm_backend

Which package to use for fitting GLMs. The default is +

+

Arguments

+
formula
+

A model formula

+
data
+

See Details below. Method dispatch is on this argument

+
...
+

Additional arguments

+
glm_backend
+

Which package to use for fitting GLMs. The default is "biglm", which has known issues with factor level if different levels are present in different chunks. The "speedglm" option is more robust, but does not -implement `predict` which makes prediction and implementation impossible.

- -

Value

- +implement `predict` which makes prediction and implementation impossible.

+
+
+

Value

An object of class bigglm

-

Details

- +
+
+

Details

The data argument may be a function, a data frame, or a SQLiteConnection or RODBC connection object.

When it is a function the function must take a single argument @@ -214,60 +140,66 @@

Details variables needed for the model, not the whole table. The code in the SQLiteConnection method should work for other DBI connections, but I do not have any of these to check it with.

-

References

- +
+
+

References

Algorithm AS274 Applied Statistics (1992) Vol.41, No. 2

-

See also

- -

Other Machine Learning (ML): -make_glm_streaming_fn()

+
+
+

See also

+

Other Machine Learning (ML): +make_glm_streaming_fn()

+
-

Examples

-
cars.df = as.disk.frame(cars) -m = dfglm(dist ~ speed, data = cars.df)
#> Loading required namespace: biglm
-# can use normal R functions -# Only works in version > R 3.6 -majorv = as.integer(version$major) -minorv = as.integer(strsplit(version$minor, ".", fixed=TRUE)[[1]][1]) -if(((majorv == 3) & (minorv >= 6)) | (majorv > 3)) { - summary(m) - predict(m, get_chunk(cars.df, 1)) - predict(m, collect(cars.df)) - # can use broom to tidy up the returned info - broom::tidy(m) -}
#> # A tibble: 2 x 4 -#> term estimate std.error p.value -#> <chr> <dbl> <dbl> <dbl> -#> 1 (Intercept) -17.6 6.76 9.29e- 3 -#> 2 speed 3.93 0.416 2.96e-21
-# clean up -delete(cars.df)
+
+

Examples

+
cars.df = as.disk.frame(cars)
+m = dfglm(dist ~ speed, data = cars.df)
+#> Loading required namespace: biglm
+
+# can use normal R functions
+# Only works in version > R 3.6
+majorv = as.integer(version$major)
+minorv = as.integer(strsplit(version$minor, ".", fixed=TRUE)[[1]][1])
+if(((majorv == 3) & (minorv >= 6)) | (majorv > 3)) {
+  summary(m)
+  predict(m, get_chunk(cars.df, 1))
+  predict(m, collect(cars.df))
+  # can use broom to tidy up the returned info
+  broom::tidy(m)
+}
+#> # A tibble: 2 x 4
+#>   term        estimate std.error  p.value
+#>   <chr>          <dbl>     <dbl>    <dbl>
+#> 1 (Intercept)   -17.6      6.76  9.29e- 3
+#> 2 speed           3.93     0.416 2.96e-21
+
+# clean up
+delete(cars.df)
+
+
+
- - - + + diff --git a/docs/reference/disk.frame.html b/docs/reference/disk.frame.html index 5dd9de15..0d2320c2 100644 --- a/docs/reference/disk.frame.html +++ b/docs/reference/disk.frame.html @@ -1,67 +1,12 @@ - - - - - - - -Create a disk.frame from a folder — disk.frame • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Create a disk.frame from a folder — disk.frame • disk.frame + + - - - - -
-
- -
- -
+
@@ -162,63 +95,64 @@

Create a disk.frame from a folder

Create a disk.frame from a folder

-
disk.frame(path, backend = "fst")
- -

Arguments

- - - - - - - - - - -
path

The path to store the output file or to a directory

backend

The only available backend is fst at the moment

- - -

Examples

-
path = file.path(tempdir(),"cars") -as.disk.frame(cars, outdir=path, overwrite = TRUE, nchunks = 2)
#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpInritK/cars" -#> nchunks: 2 -#> nrow (at source): 50 -#> ncol (at source): 2 -#> nrow (post operations): ??? -#> ncol (post operations): ???
df = disk.frame(path) -head(df)
#> speed dist -#> 1: 4 2 -#> 2: 4 10 -#> 3: 7 4 -#> 4: 7 22 -#> 5: 8 16 -#> 6: 9 10
#> [1] 2
# clean up -delete(df)
+
+
disk.frame(path, backend = "fst")
+
+ +
+

Arguments

+
path
+

The path to store the output file or to a directory

+
backend
+

The only available backend is fst at the moment

+
+ +
+

Examples

+
path = file.path(tempdir(),"cars")
+as.disk.frame(cars, outdir=path, overwrite = TRUE, nchunks = 2)
+#> path: "C:\Users\RTX2080\AppData\Local\Temp\Rtmp2rQjw5/cars"
+#> nchunks: 2
+#> nrow (at source): 50
+#> ncol (at source): 2
+#> nrow (post operations): ???
+#> ncol (post operations): ???
+df = disk.frame(path)
+head(df)
+#>    speed dist
+#> 1:     4    2
+#> 2:     4   10
+#> 3:     7    4
+#> 4:     7   22
+#> 5:     8   16
+#> 6:     9   10
+nchunks(df)
+#> [1] 2
+# clean up
+delete(df)
+
+
+
- - - + + diff --git a/docs/reference/dplyr_verbs.html b/docs/reference/dplyr_verbs.html index 00f29a9e..4cbb0612 100644 --- a/docs/reference/dplyr_verbs.html +++ b/docs/reference/dplyr_verbs.html @@ -1,68 +1,13 @@ - - - - - - - -The dplyr verbs implemented for disk.frame — select.disk.frame • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -The dplyr verbs implemented for disk.frame — select.disk.frame • disk.frame - - - - + + -
-
- -
- -
+
@@ -164,99 +97,88 @@

The dplyr verbs implemented for disk.frame

`chunk_arrange` performs the actions within each chunk

-
# S3 method for disk.frame
-select(.data, ...)
-
-# S3 method for disk.frame
-rename(.data, ...)
+    
+
# S3 method for disk.frame
+select(.data, ...)
 
-# S3 method for disk.frame
-filter(.data, ...)
+# S3 method for disk.frame
+rename(.data, ...)
 
-# S3 method for disk.frame
-mutate(.data, ...)
+# S3 method for disk.frame
+filter(.data, ...)
 
-# S3 method for disk.frame
-transmute(.data, ...)
+# S3 method for disk.frame
+mutate(.data, ...)
 
-# S3 method for disk.frame
-arrange(.data, ...)
+# S3 method for disk.frame
+transmute(.data, ...)
 
-chunk_arrange(.data, ...)
+# S3 method for disk.frame
+arrange(.data, ...)
 
-tally.disk.frame(.data, ...)
+chunk_arrange(.data, ...)
 
-count.disk.frame(.data, ...)
+add_tally.disk.frame(.data, ...)
 
-add_count.disk.frame(.data, ...)
+# S3 method for disk.frame
+do(.data, ...)
 
-add_tally.disk.frame(.data, ...)
+# S3 method for disk.frame
+distinct(...)
 
-# S3 method for disk.frame
-do(.data, ...)
+chunk_distinct(.data, ...)
 
-# S3 method for disk.frame
-distinct(...)
-
-chunk_distinct(.data, ...)
-
-# S3 method for disk.frame
-glimpse(.data, ...)
- -

Arguments

- - - - - - - - - - -
.data

a disk.frame

...

Same as the dplyr functions

- - -

Examples

-
library(dplyr) -cars.df = as.disk.frame(cars) -mult = 2 - -# use all any of the supported dplyr -cars2 = cars.df %>% - select(speed) %>% - mutate(speed2 = speed * mult) %>% - filter(speed < 50) %>% - rename(speed1 = speed) %>% - collect +# S3 method for disk.frame +glimpse(.data, ...)
+
-# clean up cars.df -delete(cars.df)
+
+

Arguments

+
.data
+

a disk.frame

+
...
+

Same as the dplyr functions

+
+ +
+

Examples

+
library(dplyr)
+cars.df = as.disk.frame(cars)
+mult = 2
+
+# use all any of the supported dplyr
+cars2 = cars.df %>% 
+  select(speed) %>% 
+  mutate(speed2 = speed * mult) %>% 
+  filter(speed < 50) %>% 
+  rename(speed1 = speed) %>% 
+  collect
+
+# clean up cars.df
+delete(cars.df)
+
+
+
- - - + + diff --git a/docs/reference/evalparseglue.html b/docs/reference/evalparseglue.html index 60206d23..4172714e 100644 --- a/docs/reference/evalparseglue.html +++ b/docs/reference/evalparseglue.html @@ -1,67 +1,12 @@ - - - - - - - -Helper function to evalparse some `glue::glue` string — evalparseglue • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Helper function to evalparse some `glue::glue` string — evalparseglue • disk.frame - - + + - - -
-
- -
- -
+
@@ -162,47 +95,39 @@

Helper function to evalparse some `glue::glue` string

Helper function to evalparse some `glue::glue` string

-
evalparseglue(code, env = parent.frame())
- -

Arguments

- - - - - - - - - - -
code

the code in character(string) format to evaluate

env

the environment in which to evaluate the code

+
+
evalparseglue(code, env = parent.frame())
+
+
+

Arguments

+
code
+

the code in character(string) format to evaluate

+
env
+

the environment in which to evaluate the code

+
+
- - - + + diff --git a/docs/reference/foverlaps.disk.frame.html b/docs/reference/foverlaps.disk.frame.html index 11ca11f6..19607792 100644 --- a/docs/reference/foverlaps.disk.frame.html +++ b/docs/reference/foverlaps.disk.frame.html @@ -1,67 +1,12 @@ - - - - - - - -Apply data.table's foverlaps to the disk.frame — foverlaps.disk.frame • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Apply data.table's foverlaps to the disk.frame — foverlaps.disk.frame • disk.frame - + + - - - -
-
- -
- -
+
@@ -162,101 +95,81 @@

Apply data.table's foverlaps to the disk.frame

EXPERIMENTAL

-
foverlaps.disk.frame(
-  df1,
-  df2,
-  by.x = if (identical(shardkey(df1)$shardkey, "")) shardkey(df1)$shardkey else
-    shardkey(df2)$shardkey,
-  by.y = shardkey(df2)$shardkey,
-  ...,
-  outdir = tempfile("df_foverlaps_tmp", fileext = ".df"),
-  merge_by_chunk_id = FALSE,
-  compress = 50,
-  overwrite = TRUE
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
df1

A disk.frame

df2

A disk.frame or a data.frame

by.x

character/string vector. by.x used in foverlaps

by.y

character/string vector. by.x used in foverlaps

...

passed to data.table::foverlaps and disk.frame::cmap.disk.frame

outdir

The output directory of the disk.frame

merge_by_chunk_id

If TRUE then the merges will happen for chunks in df1 and df2 with the same chunk id which speed up processing. Otherwise every chunk of df1 is merged with every chunk of df2. Ignored with df2 is not a disk.frame

compress

The compression ratio for fst

overwrite

overwrite existing directory

- - -

Examples

-
library(data.table) - -## simple example: -x = as.disk.frame(data.table(start=c(5,31,22,16), end=c(8,50,25,18), val2 = 7:10)) -y = as.disk.frame(data.table(start=c(10, 20, 30), end=c(15, 35, 45), val1 = 1:3)) -byxy = c("val1", "start", "end") -xy.df = foverlaps.disk.frame( - x, y, by.x = byxy, by.y = byxy, - merge_by_chunk_id = TRUE, overwrite = TRUE) +
+
foverlaps.disk.frame(
+  df1,
+  df2,
+  by.x = if (identical(shardkey(df1)$shardkey, "")) shardkey(df1)$shardkey else
+    shardkey(df2)$shardkey,
+  by.y = shardkey(df2)$shardkey,
+  ...,
+  outdir = tempfile("df_foverlaps_tmp", fileext = ".df"),
+  merge_by_chunk_id = FALSE,
+  compress = 50,
+  overwrite = TRUE
+)
+
-# clean up -delete(x) -delete(y) -delete(xy.df)
+
+

Arguments

+
df1
+

A disk.frame

+
df2
+

A disk.frame or a data.frame

+
by.x
+

character/string vector. by.x used in foverlaps

+
by.y
+

character/string vector. by.x used in foverlaps

+
...
+

passed to data.table::foverlaps and disk.frame::cmap.disk.frame

+
outdir
+

The output directory of the disk.frame

+
merge_by_chunk_id
+

If TRUE then the merges will happen for chunks in df1 and df2 with the same chunk id which speed up processing. Otherwise every chunk of df1 is merged with every chunk of df2. Ignored with df2 is not a disk.frame

+
compress
+

The compression ratio for fst

+
overwrite
+

overwrite existing directory

+
+ +
+

Examples

+
library(data.table)
+
+## simple example:
+x = as.disk.frame(data.table(start=c(5,31,22,16), end=c(8,50,25,18), val2 = 7:10))
+y = as.disk.frame(data.table(start=c(10, 20, 30), end=c(15, 35, 45), val1 = 1:3))
+byxy = c("start", "end")
+xy.df = foverlaps.disk.frame(
+   x, y, by.x = byxy, by.y = byxy,
+  merge_by_chunk_id = TRUE, overwrite = TRUE)
+# clean up
+delete(x)
+delete(y)
+delete(xy.df)
+
+
+
- - - + + diff --git a/docs/reference/gen_datatable_synthetic.html b/docs/reference/gen_datatable_synthetic.html index c7eb7d31..608041f6 100644 --- a/docs/reference/gen_datatable_synthetic.html +++ b/docs/reference/gen_datatable_synthetic.html @@ -1,67 +1,12 @@ - - - - - - - -Generate synthetic dataset for testing — gen_datatable_synthetic • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Generate synthetic dataset for testing — gen_datatable_synthetic • disk.frame - - + + - - -
-
- -
- -
+
@@ -162,47 +95,39 @@

Generate synthetic dataset for testing

Generate synthetic dataset for testing

-
gen_datatable_synthetic(N = 2e+08, K = 100)
- -

Arguments

- - - - - - - - - - -
N

number of rows. Defaults to 200 million

K

controls the number of unique values for id. Some ids will have K distinct values while others have N/K distinct values

+
+
gen_datatable_synthetic(N = 2e+08, K = 100)
+
+
+

Arguments

+
N
+

number of rows. Defaults to 200 million

+
K
+

controls the number of unique values for id. Some ids will have K distinct values while others have N/K distinct values

+
+
- - - + + diff --git a/docs/reference/get_chunk.html b/docs/reference/get_chunk.html index 5cf436cb..66df5c10 100644 --- a/docs/reference/get_chunk.html +++ b/docs/reference/get_chunk.html @@ -1,67 +1,12 @@ - - - - - - - -Obtain one chunk by chunk id — get_chunk • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Obtain one chunk by chunk id — get_chunk • disk.frame + + - - - - -
-
- -
- -
+
@@ -162,173 +95,170 @@

Obtain one chunk by chunk id

Obtain one chunk by chunk id

-
get_chunk(...)
-
-# S3 method for disk.frame
-get_chunk(df, n, keep = NULL, full.names = FALSE, ...)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - -
...

passed to fst::read_fst or whichever read function is used in the backend

df

a disk.frame

n

the chunk id. If numeric then matches by number, if character then returns the chunk with the same name as n

keep

the columns to keep

full.names

whether n is the full path to the chunks or just a relative path file name. Ignored if n is numeric

+
+
get_chunk(...)
 
+# S3 method for disk.frame
+get_chunk(df, n, keep = NULL, full.names = FALSE, ...)
+
-

Examples

-
cars.df = as.disk.frame(cars, nchunks = 2) -get_chunk(cars.df, 1)
#> speed dist -#> 1: 4 2 -#> 2: 4 10 -#> 3: 7 4 -#> 4: 7 22 -#> 5: 8 16 -#> 6: 9 10 -#> 7: 10 18 -#> 8: 10 26 -#> 9: 10 34 -#> 10: 11 17 -#> 11: 11 28 -#> 12: 12 14 -#> 13: 12 20 -#> 14: 12 24 -#> 15: 12 28 -#> 16: 13 26 -#> 17: 13 34 -#> 18: 13 34 -#> 19: 13 46 -#> 20: 14 26 -#> 21: 14 36 -#> 22: 14 60 -#> 23: 14 80 -#> 24: 15 20 -#> 25: 15 26 -#> speed dist
get_chunk(cars.df, 2)
#> speed dist -#> 1: 15 54 -#> 2: 16 32 -#> 3: 16 40 -#> 4: 17 32 -#> 5: 17 40 -#> 6: 17 50 -#> 7: 18 42 -#> 8: 18 56 -#> 9: 18 76 -#> 10: 18 84 -#> 11: 19 36 -#> 12: 19 46 -#> 13: 19 68 -#> 14: 20 32 -#> 15: 20 48 -#> 16: 20 52 -#> 17: 20 56 -#> 18: 20 64 -#> 19: 22 66 -#> 20: 23 54 -#> 21: 24 70 -#> 22: 24 92 -#> 23: 24 93 -#> 24: 24 120 -#> 25: 25 85 -#> speed dist
get_chunk(cars.df, 1, keep = "speed")
#> speed -#> 1: 4 -#> 2: 4 -#> 3: 7 -#> 4: 7 -#> 5: 8 -#> 6: 9 -#> 7: 10 -#> 8: 10 -#> 9: 10 -#> 10: 11 -#> 11: 11 -#> 12: 12 -#> 13: 12 -#> 14: 12 -#> 15: 12 -#> 16: 13 -#> 17: 13 -#> 18: 13 -#> 19: 13 -#> 20: 14 -#> 21: 14 -#> 22: 14 -#> 23: 14 -#> 24: 15 -#> 25: 15 -#> speed
-# if full.names = TRUE then the full path to the chunk need to be provided -get_chunk(cars.df, file.path(attr(cars.df, "path"), "1.fst"), full.names = TRUE)
#> speed dist -#> 1: 4 2 -#> 2: 4 10 -#> 3: 7 4 -#> 4: 7 22 -#> 5: 8 16 -#> 6: 9 10 -#> 7: 10 18 -#> 8: 10 26 -#> 9: 10 34 -#> 10: 11 17 -#> 11: 11 28 -#> 12: 12 14 -#> 13: 12 20 -#> 14: 12 24 -#> 15: 12 28 -#> 16: 13 26 -#> 17: 13 34 -#> 18: 13 34 -#> 19: 13 46 -#> 20: 14 26 -#> 21: 14 36 -#> 22: 14 60 -#> 23: 14 80 -#> 24: 15 20 -#> 25: 15 26 -#> speed dist
-# clean up cars.df -delete(cars.df)
+
+

Arguments

+
...
+

passed to fst::read_fst or whichever read function is used in the backend

+
df
+

a disk.frame

+
n
+

the chunk id. If numeric then matches by number, if character then returns the chunk with the same name as n

+
keep
+

the columns to keep

+
full.names
+

whether n is the full path to the chunks or just a relative path file name. Ignored if n is numeric

+
+ +
+

Examples

+
cars.df = as.disk.frame(cars, nchunks = 2)
+get_chunk(cars.df, 1)
+#>     speed dist
+#>  1:     4    2
+#>  2:     4   10
+#>  3:     7    4
+#>  4:     7   22
+#>  5:     8   16
+#>  6:     9   10
+#>  7:    10   18
+#>  8:    10   26
+#>  9:    10   34
+#> 10:    11   17
+#> 11:    11   28
+#> 12:    12   14
+#> 13:    12   20
+#> 14:    12   24
+#> 15:    12   28
+#> 16:    13   26
+#> 17:    13   34
+#> 18:    13   34
+#> 19:    13   46
+#> 20:    14   26
+#> 21:    14   36
+#> 22:    14   60
+#> 23:    14   80
+#> 24:    15   20
+#> 25:    15   26
+#>     speed dist
+get_chunk(cars.df, 2)
+#>     speed dist
+#>  1:    15   54
+#>  2:    16   32
+#>  3:    16   40
+#>  4:    17   32
+#>  5:    17   40
+#>  6:    17   50
+#>  7:    18   42
+#>  8:    18   56
+#>  9:    18   76
+#> 10:    18   84
+#> 11:    19   36
+#> 12:    19   46
+#> 13:    19   68
+#> 14:    20   32
+#> 15:    20   48
+#> 16:    20   52
+#> 17:    20   56
+#> 18:    20   64
+#> 19:    22   66
+#> 20:    23   54
+#> 21:    24   70
+#> 22:    24   92
+#> 23:    24   93
+#> 24:    24  120
+#> 25:    25   85
+#>     speed dist
+get_chunk(cars.df, 1, keep = "speed")
+#>     speed
+#>  1:     4
+#>  2:     4
+#>  3:     7
+#>  4:     7
+#>  5:     8
+#>  6:     9
+#>  7:    10
+#>  8:    10
+#>  9:    10
+#> 10:    11
+#> 11:    11
+#> 12:    12
+#> 13:    12
+#> 14:    12
+#> 15:    12
+#> 16:    13
+#> 17:    13
+#> 18:    13
+#> 19:    13
+#> 20:    14
+#> 21:    14
+#> 22:    14
+#> 23:    14
+#> 24:    15
+#> 25:    15
+#>     speed
+
+# if full.names = TRUE then the full path to the chunk need to be provided
+get_chunk(cars.df, file.path(attr(cars.df, "path"), "1.fst"), full.names = TRUE)
+#>     speed dist
+#>  1:     4    2
+#>  2:     4   10
+#>  3:     7    4
+#>  4:     7   22
+#>  5:     8   16
+#>  6:     9   10
+#>  7:    10   18
+#>  8:    10   26
+#>  9:    10   34
+#> 10:    11   17
+#> 11:    11   28
+#> 12:    12   14
+#> 13:    12   20
+#> 14:    12   24
+#> 15:    12   28
+#> 16:    13   26
+#> 17:    13   34
+#> 18:    13   34
+#> 19:    13   46
+#> 20:    14   26
+#> 21:    14   36
+#> 22:    14   60
+#> 23:    14   80
+#> 24:    15   20
+#> 25:    15   26
+#>     speed dist
+
+# clean up cars.df
+delete(cars.df)
+
+
+
- - - + + diff --git a/docs/reference/get_chunk_ids.html b/docs/reference/get_chunk_ids.html index 968aaa4b..d54c8f15 100644 --- a/docs/reference/get_chunk_ids.html +++ b/docs/reference/get_chunk_ids.html @@ -1,67 +1,12 @@ - - - - - - - -Get the chunk IDs and files names — get_chunk_ids • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Get the chunk IDs and files names — get_chunk_ids • disk.frame + + - - - - -
-
- -
- -
+
@@ -162,71 +95,68 @@

Get the chunk IDs and files names

Get the chunk IDs and files names

-
get_chunk_ids(df, ..., full.names = FALSE, strip_extension = TRUE)
- -

Arguments

- - - - - - - - - - - - - - - - - - -
df

a disk.frame

...

passed to list.files

full.names

If TRUE returns the full path to the file, Defaults to FALSE

strip_extension

If TRUE then the file extension in the chunk_id is removed. Defaults to TRUE

- - -

Examples

-
cars.df = as.disk.frame(cars) - -# return the integer-string chunk IDs -get_chunk_ids(cars.df)
#> [1] "1" "2" "3" "4" "5" "6"
-# return the file name chunk IDs -get_chunk_ids(cars.df, full.names = TRUE)
#> [1] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpInritK\\file187ce9d748c.df/1.fst" -#> [2] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpInritK\\file187ce9d748c.df/2.fst" -#> [3] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpInritK\\file187ce9d748c.df/3.fst" -#> [4] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpInritK\\file187ce9d748c.df/4.fst" -#> [5] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpInritK\\file187ce9d748c.df/5.fst" -#> [6] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpInritK\\file187ce9d748c.df/6.fst"
-# return the file name chunk IDs with file extension -get_chunk_ids(cars.df, strip_extension = FALSE)
#> [1] "1.fst" "2.fst" "3.fst" "4.fst" "5.fst" "6.fst"
-# clean up cars.df -delete(cars.df)
+
+
get_chunk_ids(df, ..., full.names = FALSE, strip_extension = TRUE)
+
+ +
+

Arguments

+
df
+

a disk.frame

+
...
+

passed to list.files

+
full.names
+

If TRUE returns the full path to the file, Defaults to FALSE

+
strip_extension
+

If TRUE then the file extension in the chunk_id is removed. Defaults to TRUE

+
+ +
+

Examples

+
cars.df = as.disk.frame(cars)
+
+# return the integer-string chunk IDs
+get_chunk_ids(cars.df)
+#> [1] "1" "2" "3" "4" "5" "6"
+
+# return the file name chunk IDs
+get_chunk_ids(cars.df, full.names = TRUE)
+#> [1] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\Rtmp2rQjw5\\file56f46178545a.df/1.fst"
+#> [2] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\Rtmp2rQjw5\\file56f46178545a.df/2.fst"
+#> [3] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\Rtmp2rQjw5\\file56f46178545a.df/3.fst"
+#> [4] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\Rtmp2rQjw5\\file56f46178545a.df/4.fst"
+#> [5] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\Rtmp2rQjw5\\file56f46178545a.df/5.fst"
+#> [6] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\Rtmp2rQjw5\\file56f46178545a.df/6.fst"
+
+# return the file name chunk IDs with file extension
+get_chunk_ids(cars.df, strip_extension = FALSE)
+#> [1] "1.fst" "2.fst" "3.fst" "4.fst" "5.fst" "6.fst"
+
+# clean up cars.df
+delete(cars.df)
+
+
+
- - - + + diff --git a/docs/reference/group_by.html b/docs/reference/group_by.html index c2f6087d..a7d39c8e 100644 --- a/docs/reference/group_by.html +++ b/docs/reference/group_by.html @@ -1,70 +1,15 @@ - - - - - - - -A function to parse the summarize function — summarise.grouped_disk.frame • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -A function to parse the summarize function — summarise.grouped_disk.frame • disk.frame - - - - - - - - - - - + + - - -
-
- -
- -
+
@@ -168,71 +101,60 @@

A function to parse the summarize function

reorganizes the chunks by the shard key.

-
# S3 method for grouped_disk.frame
-summarise(.data, ...)
-
-# S3 method for grouped_disk.frame
-summarize(.data, ...)
-
-# S3 method for disk.frame
-group_by(.data, ..., add = FALSE, .drop = dplyr::group_by_drop_default(.data))
-
-# S3 method for disk.frame
-summarize(.data, ...)
-
-# S3 method for disk.frame
-summarise(.data, ...)
- -

Arguments

- - - - - - - - - - - - - - - - - - -
.data

a disk.frame

...

same as the dplyr::group_by

add

from dplyr

.drop

from dplyr

- -

See also

- -

hard_group_by

+
+
# S3 method for grouped_disk.frame
+summarise(.data, ...)
+
+# S3 method for grouped_disk.frame
+summarize(.data, ...)
+
+# S3 method for disk.frame
+group_by(.data, ..., add = FALSE, .drop = dplyr::group_by_drop_default(.data))
+
+# S3 method for disk.frame
+summarize(.data, ...)
+
+# S3 method for disk.frame
+summarise(.data, ...)
+
+ +
+

Arguments

+
.data
+

a disk.frame

+
...
+

same as the dplyr::group_by

+
add
+

from dplyr

+
.drop
+

from dplyr

+
+
+

See also

+

hard_group_by

+
+
- - - + + diff --git a/docs/reference/groups.disk.frame.html b/docs/reference/groups.disk.frame.html index dbe0e2dd..02be8acf 100644 --- a/docs/reference/groups.disk.frame.html +++ b/docs/reference/groups.disk.frame.html @@ -1,67 +1,12 @@ - - - - - - - -The shard keys of the disk.frame — groups.disk.frame • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -The shard keys of the disk.frame — groups.disk.frame • disk.frame + + - - - - -
-
- -
- -
+
@@ -162,47 +95,42 @@

The shard keys of the disk.frame

The shard keys of the disk.frame

-
# S3 method for disk.frame
-groups(x)
- -

Arguments

- - - - - - -
x

a disk.frame

- -

Value

+
+
# S3 method for disk.frame
+groups(x)
+
+
+

Arguments

+
x
+

a disk.frame

+
+
+

Value

character

+
+
- - - + + diff --git a/docs/reference/hard_arrange.html b/docs/reference/hard_arrange.html index cb5c473d..c880a71b 100644 --- a/docs/reference/hard_arrange.html +++ b/docs/reference/hard_arrange.html @@ -1,69 +1,14 @@ - - - - - - - -Perform a hard arrange — hard_arrange • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Perform a hard arrange — hard_arrange • disk.frame - - - - - - - - - - + + - - - -
-
- -
- -
+
@@ -166,116 +99,107 @@

Perform a hard arrange

row that share the same `by` value will end up in the same chunk.

-
hard_arrange(df, ..., add = FALSE, .drop = FALSE)
-
-# S3 method for data.frame
-hard_arrange(df, ...)
-
-# S3 method for disk.frame
-hard_arrange(
-  df,
-  ...,
-  outdir = tempfile("tmp_disk_frame_hard_arrange"),
-  nchunks = disk.frame::nchunks(df),
-  overwrite = TRUE
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
df

a disk.frame

...

grouping variables

add

same as dplyr::arrange

.drop

same as dplyr::arrange

outdir

the output directory

nchunks

The number of chunks in the output. Defaults = nchunks.disk.frame(df)

overwrite

overwrite the out put directory

+
+
hard_arrange(df, ..., add = FALSE, .drop = FALSE)
 
+# S3 method for data.frame
+hard_arrange(df, ...)
 
-    

Examples

-
iris.df = as.disk.frame(iris, nchunks = 2) +# S3 method for disk.frame +hard_arrange( + df, + ..., + outdir = tempfile("tmp_disk_frame_hard_arrange"), + nchunks = disk.frame::nchunks(df), + overwrite = TRUE +)
+
-# arrange iris.df by specifies and ensure rows with the same specifies are in the same chunk -iris_hard.df = hard_arrange(iris.df, Species)
#> Appending disk.frames:
-get_chunk(iris_hard.df, 1)
#> # A tibble: 50 x 5 -#> # Groups: Species [1] -#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species -#> <dbl> <dbl> <dbl> <dbl> <fct> -#> 1 6.3 3.3 6 2.5 virginica -#> 2 5.8 2.7 5.1 1.9 virginica -#> 3 7.1 3 5.9 2.1 virginica -#> 4 6.3 2.9 5.6 1.8 virginica -#> 5 6.5 3 5.8 2.2 virginica -#> 6 7.6 3 6.6 2.1 virginica -#> 7 4.9 2.5 4.5 1.7 virginica -#> 8 7.3 2.9 6.3 1.8 virginica -#> 9 6.7 2.5 5.8 1.8 virginica -#> 10 7.2 3.6 6.1 2.5 virginica -#> # ... with 40 more rows
get_chunk(iris_hard.df, 2)
#> # A tibble: 50 x 5 -#> # Groups: Species [1] -#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species -#> <dbl> <dbl> <dbl> <dbl> <fct> -#> 1 6.3 3.3 6 2.5 virginica -#> 2 5.8 2.7 5.1 1.9 virginica -#> 3 7.1 3 5.9 2.1 virginica -#> 4 6.3 2.9 5.6 1.8 virginica -#> 5 6.5 3 5.8 2.2 virginica -#> 6 7.6 3 6.6 2.1 virginica -#> 7 4.9 2.5 4.5 1.7 virginica -#> 8 7.3 2.9 6.3 1.8 virginica -#> 9 6.7 2.5 5.8 1.8 virginica -#> 10 7.2 3.6 6.1 2.5 virginica -#> # ... with 40 more rows
-# clean up cars.df -delete(iris.df) -delete(iris_hard.df)
+
+

Arguments

+
df
+

a disk.frame

+
...
+

grouping variables

+
add
+

same as dplyr::arrange

+
.drop
+

same as dplyr::arrange

+
outdir
+

the output directory

+
nchunks
+

The number of chunks in the output. Defaults = nchunks.disk.frame(df)

+
overwrite
+

overwrite the out put directory

+
+ +
+

Examples

+
iris.df = as.disk.frame(iris, nchunks = 2)
+
+# arrange iris.df by specifies and ensure rows with the same specifies are in the same chunk
+iris_hard.df = hard_arrange(iris.df, Species)
+#> Appending disk.frames: 
+
+get_chunk(iris_hard.df, 1)
+#> # A tibble: 50 x 5
+#> # Groups:   Species [1]
+#>    Sepal.Length Sepal.Width Petal.Length Petal.Width Species  
+#>           <dbl>       <dbl>        <dbl>       <dbl> <fct>    
+#>  1          6.3         3.3          6           2.5 virginica
+#>  2          5.8         2.7          5.1         1.9 virginica
+#>  3          7.1         3            5.9         2.1 virginica
+#>  4          6.3         2.9          5.6         1.8 virginica
+#>  5          6.5         3            5.8         2.2 virginica
+#>  6          7.6         3            6.6         2.1 virginica
+#>  7          4.9         2.5          4.5         1.7 virginica
+#>  8          7.3         2.9          6.3         1.8 virginica
+#>  9          6.7         2.5          5.8         1.8 virginica
+#> 10          7.2         3.6          6.1         2.5 virginica
+#> # ... with 40 more rows
+get_chunk(iris_hard.df, 2)
+#> # A tibble: 50 x 5
+#> # Groups:   Species [1]
+#>    Sepal.Length Sepal.Width Petal.Length Petal.Width Species  
+#>           <dbl>       <dbl>        <dbl>       <dbl> <fct>    
+#>  1          6.3         3.3          6           2.5 virginica
+#>  2          5.8         2.7          5.1         1.9 virginica
+#>  3          7.1         3            5.9         2.1 virginica
+#>  4          6.3         2.9          5.6         1.8 virginica
+#>  5          6.5         3            5.8         2.2 virginica
+#>  6          7.6         3            6.6         2.1 virginica
+#>  7          4.9         2.5          4.5         1.7 virginica
+#>  8          7.3         2.9          6.3         1.8 virginica
+#>  9          6.7         2.5          5.8         1.8 virginica
+#> 10          7.2         3.6          6.1         2.5 virginica
+#> # ... with 40 more rows
+
+# clean up cars.df
+delete(iris.df)
+delete(iris_hard.df)
+
+
+
- - - + + diff --git a/docs/reference/hard_group_by.html b/docs/reference/hard_group_by.html index 1b6f636c..8dd5247b 100644 --- a/docs/reference/hard_group_by.html +++ b/docs/reference/hard_group_by.html @@ -1,69 +1,14 @@ - - - - - - - -Perform a hard group — hard_group_by • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Perform a hard group — hard_group_by • disk.frame - - - - - - - - - - - + + - - -
-
- -
- -
+
@@ -166,122 +99,108 @@

Perform a hard group

row that share the same `by` value will end up in the same chunk.

-
hard_group_by(df, ..., .add = FALSE, .drop = FALSE)
-
-# S3 method for data.frame
-hard_group_by(df, ..., .add = FALSE, .drop = FALSE)
-
-# S3 method for disk.frame
-hard_group_by(
-  df,
-  ...,
-  outdir = tempfile("tmp_disk_frame_hard_group_by"),
-  nchunks = disk.frame::nchunks(df),
-  overwrite = TRUE,
-  shardby_function = "hash",
-  sort_splits = NULL,
-  desc_vars = NULL,
-  sort_split_sample_size = 100
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
df

a disk.frame

...

grouping variables

.add

same as dplyr::group_by

.drop

same as dplyr::group_by

outdir

the output directory

nchunks

The number of chunks in the output. Defaults = nchunks.disk.frame(df)

overwrite

overwrite the out put directory

shardby_function

splitting of chunks: "hash" for hash function or "sort" for semi-sorted chunks

sort_splits

for the "sort" shardby function, a dataframe with the split values.

desc_vars

for the "sort" shardby function, the variables to sort descending.

sort_split_sample_size

for the "sort" shardby function, if sort_splits is null, the number of rows to sample per chunk for random splits.

- - -

Examples

-
iris.df = as.disk.frame(iris, nchunks = 2) +
+
hard_group_by(df, ..., .add = FALSE, .drop = FALSE)
+
+# S3 method for data.frame
+hard_group_by(df, ..., .add = FALSE, .drop = FALSE)
+
+# S3 method for disk.frame
+hard_group_by(
+  df,
+  ...,
+  outdir = tempfile("tmp_disk_frame_hard_group_by"),
+  nchunks = disk.frame::nchunks(df),
+  overwrite = TRUE,
+  shardby_function = "hash",
+  sort_splits = NULL,
+  desc_vars = NULL,
+  sort_split_sample_size = 100
+)
+
-# group_by iris.df by specifies and ensure rows with the same specifies are in the same chunk -iris_hard.df = hard_group_by(iris.df, Species)
#> Hashing...
#> Hashing...
#> Appending disk.frames:
-get_chunk(iris_hard.df, 1)
#> # A tibble: 150 x 5 -#> # Groups: Species [3] -#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species -#> <dbl> <dbl> <dbl> <dbl> <fct> -#> 1 5.1 3.5 1.4 0.2 setosa -#> 2 4.9 3 1.4 0.2 setosa -#> 3 4.7 3.2 1.3 0.2 setosa -#> 4 4.6 3.1 1.5 0.2 setosa -#> 5 5 3.6 1.4 0.2 setosa -#> 6 5.4 3.9 1.7 0.4 setosa -#> 7 4.6 3.4 1.4 0.3 setosa -#> 8 5 3.4 1.5 0.2 setosa -#> 9 4.4 2.9 1.4 0.2 setosa -#> 10 4.9 3.1 1.5 0.1 setosa -#> # ... with 140 more rows
get_chunk(iris_hard.df, 2)
#> Warning: The chunk NA does not exist; returning an empty data.table
#> Null data.table (0 rows and 0 cols)
-# clean up cars.df -delete(iris.df) -delete(iris_hard.df)
+
+

Arguments

+
df
+

a disk.frame

+
...
+

grouping variables

+
.add
+

same as dplyr::group_by

+
.drop
+

same as dplyr::group_by

+
outdir
+

the output directory

+
nchunks
+

The number of chunks in the output. Defaults = nchunks.disk.frame(df)

+
overwrite
+

overwrite the out put directory

+
shardby_function
+

splitting of chunks: "hash" for hash function or "sort" for semi-sorted chunks

+
sort_splits
+

for the "sort" shardby function, a dataframe with the split values.

+
desc_vars
+

for the "sort" shardby function, the variables to sort descending.

+
sort_split_sample_size
+

for the "sort" shardby function, if sort_splits is null, the number of rows to sample per chunk for random splits.

+
+ +
+

Examples

+
iris.df = as.disk.frame(iris, nchunks = 2)
+
+# group_by iris.df by specifies and ensure rows with the same specifies are in the same chunk
+iris_hard.df = hard_group_by(iris.df, Species)
+#> Hashing...
+#> Hashing...
+#> Appending disk.frames: 
+
+get_chunk(iris_hard.df, 1)
+#> # A tibble: 150 x 5
+#> # Groups:   Species [3]
+#>    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
+#>           <dbl>       <dbl>        <dbl>       <dbl> <fct>  
+#>  1          5.1         3.5          1.4         0.2 setosa 
+#>  2          4.9         3            1.4         0.2 setosa 
+#>  3          4.7         3.2          1.3         0.2 setosa 
+#>  4          4.6         3.1          1.5         0.2 setosa 
+#>  5          5           3.6          1.4         0.2 setosa 
+#>  6          5.4         3.9          1.7         0.4 setosa 
+#>  7          4.6         3.4          1.4         0.3 setosa 
+#>  8          5           3.4          1.5         0.2 setosa 
+#>  9          4.4         2.9          1.4         0.2 setosa 
+#> 10          4.9         3.1          1.5         0.1 setosa 
+#> # ... with 140 more rows
+get_chunk(iris_hard.df, 2)
+#> Warning: The chunk NA does not exist; returning an empty data.table
+#> Null data.table (0 rows and 0 cols)
+
+# clean up cars.df
+delete(iris.df)
+delete(iris_hard.df)
+
+
+
- - - + + diff --git a/docs/reference/head_tail.html b/docs/reference/head_tail.html index da4adf74..6312a2b2 100644 --- a/docs/reference/head_tail.html +++ b/docs/reference/head_tail.html @@ -1,67 +1,12 @@ - - - - - - - -Head and tail of the disk.frame — head.disk.frame • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Head and tail of the disk.frame — head.disk.frame • disk.frame + + - - - - -
-
- -
- -
+
@@ -162,71 +95,68 @@

Head and tail of the disk.frame

Head and tail of the disk.frame

-
# S3 method for disk.frame
-head(x, n = 6L, ...)
-
-# S3 method for disk.frame
-tail(x, n = 6L, ...)
- -

Arguments

- - - - - - - - - - - - - - -
x

a disk.frame

n

number of rows to include

...

passed to base::head or base::tail

- - -

Examples

-
cars.df = as.disk.frame(cars) -head(cars.df)
#> speed dist -#> 1: 4 2 -#> 2: 4 10 -#> 3: 7 4 -#> 4: 7 22 -#> 5: 8 16 -#> 6: 9 10
tail(cars.df)
#> speed dist -#> 1: 24 70 -#> 2: 24 92 -#> 3: 24 93 -#> 4: 24 120 -#> 5: 25 85
-# clean up -delete(cars.df)
+
+
# S3 method for disk.frame
+head(x, n = 6L, ...)
+
+# S3 method for disk.frame
+tail(x, n = 6L, ...)
+
+ +
+

Arguments

+
x
+

a disk.frame

+
n
+

number of rows to include

+
...
+

passed to base::head or base::tail

+
+ +
+

Examples

+
cars.df = as.disk.frame(cars)
+head(cars.df)
+#>    speed dist
+#> 1:     4    2
+#> 2:     4   10
+#> 3:     7    4
+#> 4:     7   22
+#> 5:     8   16
+#> 6:     9   10
+tail(cars.df)
+#>    speed dist
+#> 1:    24   70
+#> 2:    24   92
+#> 3:    24   93
+#> 4:    24  120
+#> 5:    25   85
+
+# clean up 
+delete(cars.df)
+
+
+
- - - + + diff --git a/docs/reference/index.html b/docs/reference/index.html index 7a9bcfdc..482f042c 100644 --- a/docs/reference/index.html +++ b/docs/reference/index.html @@ -1,66 +1,12 @@ - - - - - - - -Function reference • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Function reference • disk.frame - + + - - - -
-
- -
- -
+
- - - - - - - - - - -
-

All functions

+ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+

All functions

+

add_chunk()

Add a chunk to the disk.frame

+

as.data.frame(<disk.frame>)

Convert disk.frame to data.frame by collecting all chunks

+

as.data.table(<disk.frame>)

Convert disk.frame to data.table by collecting all chunks

+

as.disk.frame()

Make a data.frame into a disk.frame

+

chunk_summarize() chunk_summarise() chunk_group_by() chunk_ungroup()

Group by within each disk.frame

+

cmap() cmap_dfr() cimap() cimap_dfr() lazy() delayed() chunk_lapply() map() imap_dfr() imap() map_dfr(<disk.frame>) map_dfr(<default>)

Apply the same function to all chunks

+

cmap2() map2() map_by_chunk_id()

`cmap2` a function to two disk.frames

+

collect(<disk.frame>) collect_list() collect(<summarized_disk.frame>)

Bring the disk.frame into R

+

colnames() names(<disk.frame>)

Return the column names of the disk.frame

+

compute(<disk.frame>)

Compute without writing

+

create_chunk_mapper()

Create function that applies to each chunk if disk.frame

-

create_dplyr_mapper()

-

Kept for backwards-compatibility to be removed in 0.3

+

csv_to_disk.frame()

Convert CSV file(s) to disk.frame format

+

delete()

Delete a disk.frame

+

dfglm()

Fit generalized linear models (glm) with disk.frame

+

df_ram_size()

Get the size of RAM in gigabytes

+

disk.frame()

Create a disk.frame from a folder

-

select(<disk.frame>) rename(<disk.frame>) filter(<disk.frame>) mutate(<disk.frame>) transmute(<disk.frame>) arrange(<disk.frame>) chunk_arrange() tally.disk.frame() count.disk.frame() add_count.disk.frame() add_tally.disk.frame() do(<disk.frame>) distinct(<disk.frame>) chunk_distinct() glimpse(<disk.frame>)

+
+

select(<disk.frame>) rename(<disk.frame>) filter(<disk.frame>) mutate(<disk.frame>) transmute(<disk.frame>) arrange(<disk.frame>) chunk_arrange() add_tally.disk.frame() do(<disk.frame>) distinct(<disk.frame>) chunk_distinct() glimpse(<disk.frame>)

The dplyr verbs implemented for disk.frame

+

evalparseglue()

Helper function to evalparse some `glue::glue` string

+

foverlaps.disk.frame()

Apply data.table's foverlaps to the disk.frame

+

gen_datatable_synthetic()

Generate synthetic dataset for testing

+

get_chunk()

Obtain one chunk by chunk id

+

get_chunk_ids()

Get the chunk IDs and files names

+

groups(<disk.frame>)

The shard keys of the disk.frame

+

summarise(<grouped_disk.frame>) summarize(<grouped_disk.frame>) group_by(<disk.frame>) summarize(<disk.frame>) summarise(<disk.frame>)

A function to parse the summarize function

+

hard_arrange()

Perform a hard arrange

+

hard_group_by()

Perform a hard group

+

head(<disk.frame>) tail(<disk.frame>)

Head and tail of the disk.frame

+

is_disk.frame()

Checks if a folder is a disk.frame

+

anti_join(<disk.frame>) full_join(<disk.frame>) inner_join(<disk.frame>) left_join(<disk.frame>) semi_join(<disk.frame>)

Performs join/merge for disk.frames

+

make_glm_streaming_fn()

A streaming function for speedglm

+

merge(<disk.frame>)

Merge function for disk.frames

+

move_to() copy_df_to()

Move or copy a disk.frame to another location

+

nchunks() nchunk()

Returns the number of chunks in a disk.frame

+

nrow() ncol()

Number of rows or columns

+

var_df.chunk_agg.disk.frame() var_df.collected_agg.disk.frame() sd_df.chunk_agg.disk.frame() sd_df.collected_agg.disk.frame() mean_df.chunk_agg.disk.frame() mean_df.collected_agg.disk.frame() sum_df.chunk_agg.disk.frame() sum_df.collected_agg.disk.frame() min_df.chunk_agg.disk.frame() min_df.collected_agg.disk.frame() max_df.chunk_agg.disk.frame() max_df.collected_agg.disk.frame() median_df.chunk_agg.disk.frame() median_df.collected_agg.disk.frame() n_df.chunk_agg.disk.frame() n_df.collected_agg.disk.frame() length_df.chunk_agg.disk.frame() length_df.collected_agg.disk.frame() any_df.chunk_agg.disk.frame() any_df.collected_agg.disk.frame() all_df.chunk_agg.disk.frame() all_df.collected_agg.disk.frame() n_distinct_df.chunk_agg.disk.frame() n_distinct_df.collected_agg.disk.frame() quantile_df.chunk_agg.disk.frame() quantile_df.collected_agg.disk.frame() IQR_df.chunk_agg.disk.frame() IQR_df.collected_agg.disk.frame()

One Stage function

+

overwrite_check()

Check if the outdir exists or not

+

print(<disk.frame>)

Print disk.frame

+

pull(<disk.frame>)

Pull a column from table similar to `dplyr::pull`.

+

rbindlist.disk.frame()

rbindlist disk.frames together

+

rechunk()

Increase or decrease the number of chunks in the disk.frame

+

recommend_nchunks()

Recommend number of chunks based on input size

+

remove_chunk()

Removes a chunk from the disk.frame

+

sample_frac(<disk.frame>)

Sample n rows from a disk.frame

+

setup_disk.frame()

Set up disk.frame environment

+

shard() distribute()

Shard a data.frame/data.table or disk.frame into chunk and saves it into a disk.frame

+

shardkey()

Returns the shardkey (not implemented yet)

+

shardkey_equal()

Compare two disk.frame shardkeys

+

show_ceremony() ceremony_text() show_boilerplate() insert_ceremony()

Show the code to setup disk.frame

+

srckeep() srckeepchunks()

Keep only the variables from the input listed in selections

+

`[`(<disk.frame>)

[ interface for disk.frame using fst backend

+

tbl_vars(<disk.frame>) group_vars(<disk.frame>)

Column names for RStudio auto-complete

+

write_disk.frame() output_disk.frame()

Write disk.frame to disk

+

zip_to_disk.frame()

`zip_to_disk.frame` is used to read and convert every CSV file within the zip file to disk.frame format

- +
+
-
- +
- - + + diff --git a/docs/reference/is_disk.frame.html b/docs/reference/is_disk.frame.html index 3b294687..c56ab38f 100644 --- a/docs/reference/is_disk.frame.html +++ b/docs/reference/is_disk.frame.html @@ -1,67 +1,12 @@ - - - - - - - -Checks if a folder is a disk.frame — is_disk.frame • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Checks if a folder is a disk.frame — is_disk.frame • disk.frame - + + - - - -
-
- -
- -
+
@@ -162,49 +95,50 @@

Checks if a folder is a disk.frame

Checks if a folder is a disk.frame

-
is_disk.frame(df)
- -

Arguments

- - - - - - -
df

a disk.frame or directory to check

- - -

Examples

-
cars.df = as.disk.frame(cars) +
+
is_disk.frame(df)
+
-is_disk.frame(cars) # FALSE
#> [1] FALSE
is_disk.frame(cars.df) # TRUE
#> [1] TRUE
-# clean up cars.df -delete(cars.df)
+
+

Arguments

+
df
+

a disk.frame or directory to check

+
+ +
+

Examples

+
cars.df = as.disk.frame(cars)
+
+is_disk.frame(cars) # FALSE
+#> [1] FALSE
+is_disk.frame(cars.df) # TRUE
+#> [1] TRUE
+
+# clean up cars.df
+delete(cars.df)
+
+
+
- - - + + diff --git a/docs/reference/join.html b/docs/reference/join.html index 3caac2ed..6bdc7ff7 100644 --- a/docs/reference/join.html +++ b/docs/reference/join.html @@ -1,67 +1,12 @@ - - - - - - - -Performs join/merge for disk.frames — anti_join.disk.frame • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Performs join/merge for disk.frames — anti_join.disk.frame • disk.frame + + - - - - -
-
- -
- -
+
@@ -162,179 +95,208 @@

Performs join/merge for disk.frames

Performs join/merge for disk.frames

-
# S3 method for disk.frame
-anti_join(
-  x,
-  y,
-  by = NULL,
-  copy = FALSE,
-  ...,
-  outdir = tempfile("tmp_disk_frame_anti_join"),
-  merge_by_chunk_id = FALSE,
-  overwrite = TRUE,
-  .progress = FALSE
-)
-
-# S3 method for disk.frame
-full_join(
-  x,
-  y,
-  by = NULL,
-  copy = FALSE,
-  ...,
-  outdir = tempfile("tmp_disk_frame_full_join"),
-  overwrite = TRUE,
-  merge_by_chunk_id,
-  .progress = FALSE
-)
-
-# S3 method for disk.frame
-inner_join(
-  x,
-  y,
-  by = NULL,
-  copy = FALSE,
-  ...,
-  outdir = tempfile("tmp_disk_frame_inner_join"),
-  merge_by_chunk_id = NULL,
-  overwrite = TRUE,
-  .progress = FALSE
-)
-
-# S3 method for disk.frame
-left_join(
-  x,
-  y,
-  by = NULL,
-  copy = FALSE,
-  ...,
-  outdir = tempfile("tmp_disk_frame_left_join"),
-  merge_by_chunk_id = FALSE,
-  overwrite = TRUE,
-  .progress = FALSE
-)
-
-# S3 method for disk.frame
-semi_join(
-  x,
-  y,
-  by = NULL,
-  copy = FALSE,
-  ...,
-  outdir = tempfile("tmp_disk_frame_semi_join"),
-  merge_by_chunk_id = FALSE,
-  overwrite = TRUE,
-  .progress = FALSE
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
x

a disk.frame

y

a data.frame or disk.frame. If data.frame then returns lazily; if disk.frame it performs the join eagerly and return a disk.frame

by

join by

copy

same as dplyr::anti_join

...

same as dplyr's joins

outdir

output directory for disk.frame

merge_by_chunk_id

the merge is performed by chunk id

overwrite

overwrite output directory

.progress

Show progress or not. Defaults to FALSE

- -

Value

+
+
# S3 method for disk.frame
+anti_join(
+  x,
+  y,
+  by = NULL,
+  copy = FALSE,
+  ...,
+  outdir = tempfile("tmp_disk_frame_anti_join"),
+  merge_by_chunk_id = FALSE,
+  overwrite = TRUE,
+  .progress = FALSE
+)
+
+# S3 method for disk.frame
+full_join(
+  x,
+  y,
+  by = NULL,
+  copy = FALSE,
+  ...,
+  outdir = tempfile("tmp_disk_frame_full_join"),
+  overwrite = TRUE,
+  merge_by_chunk_id,
+  .progress = FALSE
+)
+
+# S3 method for disk.frame
+inner_join(
+  x,
+  y,
+  by = NULL,
+  copy = FALSE,
+  ...,
+  outdir = tempfile("tmp_disk_frame_inner_join"),
+  merge_by_chunk_id = NULL,
+  overwrite = TRUE,
+  .progress = FALSE
+)
+
+# S3 method for disk.frame
+left_join(
+  x,
+  y,
+  by = NULL,
+  copy = FALSE,
+  ...,
+  outdir = tempfile("tmp_disk_frame_left_join"),
+  merge_by_chunk_id = FALSE,
+  overwrite = TRUE,
+  .progress = FALSE
+)
+
+# S3 method for disk.frame
+semi_join(
+  x,
+  y,
+  by = NULL,
+  copy = FALSE,
+  ...,
+  outdir = tempfile("tmp_disk_frame_semi_join"),
+  merge_by_chunk_id = FALSE,
+  overwrite = TRUE,
+  .progress = FALSE
+)
+
+
+

Arguments

+
x
+

a disk.frame

+
y
+

a data.frame or disk.frame. If data.frame then returns lazily; if disk.frame it performs the join eagerly and return a disk.frame

+
by
+

join by

+
copy
+

same as dplyr::anti_join

+
...
+

same as dplyr's joins

+
outdir
+

output directory for disk.frame

+
merge_by_chunk_id
+

the merge is performed by chunk id

+
overwrite
+

overwrite output directory

+
.progress
+

Show progress or not. Defaults to FALSE

+
+
+

Value

disk.frame or data.frame/data.table

+
-

Examples

-
df.df = as.disk.frame(data.frame(x = 1:3, y = 4:6), overwrite = TRUE) -df2.df = as.disk.frame(data.frame(x = 1:2, z = 10:11), overwrite = TRUE) - -anti_joined.df = anti_join(df.df, df2.df)
#> Warning: merge_by_chunk_id = FALSE. This will take significantly longer and the preparations needed are performed eagerly which may lead to poor performance. Consider making y a data.frame or set merge_by_chunk_id = TRUE for better performance.
#> Hashing...
#> Hashing...
#> Hashing...
#> Appending disk.frames:
#> Hashing...
#> Hashing...
#> Appending disk.frames:
-anti_joined.df %>% collect
#> x y -#> 1: 3 6
-anti_joined.data.frame = anti_join(df.df, data.frame(x = 1:2, z = 10:11))
#> Joining, by = "x"
#> Joining, by = "x"
#> Joining, by = "x"
-# clean up -delete(df.df) -delete(df2.df) -delete(anti_joined.df) -cars.df = as.disk.frame(cars) - -join.df = full_join(cars.df, cars.df, merge_by_chunk_id = TRUE) - -# clean up cars.df -delete(cars.df) -delete(join.df) -cars.df = as.disk.frame(cars) - -join.df = inner_join(cars.df, cars.df, merge_by_chunk_id = TRUE) - -# clean up cars.df -delete(cars.df) -delete(join.df) -cars.df = as.disk.frame(cars) - -join.df = left_join(cars.df, cars.df)
#> Warning: merge_by_chunk_id = FALSE. This will take significantly longer and the preparations needed are performed eagerly which may lead to poor performance. Consider making y a data.frame or set merge_by_chunk_id = TRUE for better performance.
#> Hashing...
#> Hashing...
#> Hashing...
#> Hashing...
#> Hashing...
#> Hashing...
#> Appending disk.frames:
#> Hashing...
#> Hashing...
#> Hashing...
#> Hashing...
#> Hashing...
#> Hashing...
#> Appending disk.frames:
-# clean up cars.df -delete(cars.df) -delete(join.df) -cars.df = as.disk.frame(cars) - -join.df = semi_join(cars.df, cars.df)
#> Warning: merge_by_chunk_id = FALSE. This will take significantly longer and the preparations needed are performed eagerly which may lead to poor performance. Consider making y a data.frame or set merge_by_chunk_id = TRUE for better performance.
#> Hashing...
#> Hashing...
#> Hashing...
#> Hashing...
#> Hashing...
#> Hashing...
#> Appending disk.frames:
#> Hashing...
#> Hashing...
#> Hashing...
#> Hashing...
#> Hashing...
#> Hashing...
#> Appending disk.frames:
-# clean up cars.df -delete(cars.df) -delete(join.df)
+
+

Examples

+
df.df = as.disk.frame(data.frame(x = 1:3, y = 4:6), overwrite = TRUE)
+df2.df = as.disk.frame(data.frame(x = 1:2, z = 10:11), overwrite = TRUE)
+
+anti_joined.df = anti_join(df.df, df2.df) 
+#> Warning: merge_by_chunk_id = FALSE. This will take significantly longer and the preparations needed are performed eagerly which may lead to poor performance. Consider making y a data.frame or set merge_by_chunk_id = TRUE for better performance.
+#> Hashing...
+#> Hashing...
+#> Hashing...
+#> Appending disk.frames: 
+#> Hashing...
+#> Hashing...
+#> Appending disk.frames: 
+
+anti_joined.df %>% collect
+#>    x y
+#> 1: 3 6
+
+anti_joined.data.frame = anti_join(df.df, data.frame(x = 1:2, z = 10:11))
+#> Joining, by = "x"
+#> Joining, by = "x"
+#> Joining, by = "x"
+
+# clean up
+delete(df.df)
+delete(df2.df)
+delete(anti_joined.df)
+cars.df = as.disk.frame(cars)
+
+join.df = full_join(cars.df, cars.df, merge_by_chunk_id = TRUE)
+
+# clean up cars.df
+delete(cars.df)
+delete(join.df)
+cars.df = as.disk.frame(cars)
+
+join.df = inner_join(cars.df, cars.df, merge_by_chunk_id = TRUE)
+
+# clean up cars.df
+delete(cars.df)
+delete(join.df)
+cars.df = as.disk.frame(cars)
+
+join.df = left_join(cars.df, cars.df)
+#> Warning: merge_by_chunk_id = FALSE. This will take significantly longer and the preparations needed are performed eagerly which may lead to poor performance. Consider making y a data.frame or set merge_by_chunk_id = TRUE for better performance.
+#> Hashing...
+#> Hashing...
+#> Hashing...
+#> Hashing...
+#> Hashing...
+#> Hashing...
+#> Appending disk.frames: 
+#> Hashing...
+#> Hashing...
+#> Hashing...
+#> Hashing...
+#> Hashing...
+#> Hashing...
+#> Appending disk.frames: 
+
+# clean up cars.df
+delete(cars.df)
+delete(join.df)
+cars.df = as.disk.frame(cars)
+
+join.df = semi_join(cars.df, cars.df)
+#> Warning: merge_by_chunk_id = FALSE. This will take significantly longer and the preparations needed are performed eagerly which may lead to poor performance. Consider making y a data.frame or set merge_by_chunk_id = TRUE for better performance.
+#> Hashing...
+#> Hashing...
+#> Hashing...
+#> Hashing...
+#> Hashing...
+#> Hashing...
+#> Appending disk.frames: 
+#> Hashing...
+#> Hashing...
+#> Hashing...
+#> Hashing...
+#> Hashing...
+#> Hashing...
+#> Appending disk.frames: 
+
+# clean up cars.df
+delete(cars.df)
+delete(join.df)
+
+
+
- - - + + diff --git a/docs/reference/make_glm_streaming_fn.html b/docs/reference/make_glm_streaming_fn.html index e6df5579..91b68bc6 100644 --- a/docs/reference/make_glm_streaming_fn.html +++ b/docs/reference/make_glm_streaming_fn.html @@ -1,67 +1,12 @@ - - - - - - - -A streaming function for speedglm — make_glm_streaming_fn • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -A streaming function for speedglm — make_glm_streaming_fn • disk.frame - - + + - - -
-
- -
- -
+
@@ -162,118 +95,116 @@

A streaming function for speedglm

Define a function that can be used to feed data into speedglm and biglm

-
make_glm_streaming_fn(data, verbose = FALSE)
- -

Arguments

- - - - - - - - - - -
data

a disk.frame

verbose

Whether to print the status of data loading. Default to FALSE

- -

Value

+
+
make_glm_streaming_fn(data, verbose = FALSE)
+
+
+

Arguments

+
data
+

a disk.frame

+
verbose
+

Whether to print the status of data loading. Default to FALSE

+
+
+

Value

return a function, fn, that can be used as the data argument in biglm::bigglm or speedglm::shglm

-

See also

- -

Other Machine Learning (ML): -dfglm()

- -

Examples

-
cars.df = as.disk.frame(cars) -streamacq = make_glm_streaming_fn(cars.df, verbose = FALSE) +
+
+

See also

+

Other Machine Learning (ML): +dfglm()

+
-majorv = as.integer(version$major) -minorv = as.integer(strsplit(version$minor, ".", fixed=TRUE)[[1]][1]) -if(((majorv == 3) & (minorv >= 6)) | (majorv > 3)) { - m = biglm::bigglm(dist ~ speed, data = streamacq) - summary(m) - predict(m, get_chunk(cars.df, 1)) - predict(m, collect(cars.df, 1)) -} else { - m = speedglm::shglm(dist ~ speed, data = streamacq) -}
#> [,1] -#> 1 -1.849460 -#> 2 -1.849460 -#> 3 9.947766 -#> 4 9.947766 -#> 5 13.880175 -#> 6 17.812584 -#> 7 21.744993 -#> 8 21.744993 -#> 9 21.744993 -#> 10 25.677401 -#> 11 25.677401 -#> 12 29.609810 -#> 13 29.609810 -#> 14 29.609810 -#> 15 29.609810 -#> 16 33.542219 -#> 17 33.542219 -#> 18 33.542219 -#> 19 33.542219 -#> 20 37.474628 -#> 21 37.474628 -#> 22 37.474628 -#> 23 37.474628 -#> 24 41.407036 -#> 25 41.407036 -#> 26 41.407036 -#> 27 45.339445 -#> 28 45.339445 -#> 29 49.271854 -#> 30 49.271854 -#> 31 49.271854 -#> 32 53.204263 -#> 33 53.204263 -#> 34 53.204263 -#> 35 53.204263 -#> 36 57.136672 -#> 37 57.136672 -#> 38 57.136672 -#> 39 61.069080 -#> 40 61.069080 -#> 41 61.069080 -#> 42 61.069080 -#> 43 61.069080 -#> 44 68.933898 -#> 45 72.866307 -#> 46 76.798715 -#> 47 76.798715 -#> 48 76.798715 -#> 49 76.798715 -#> 50 80.731124
+
+

Examples

+
cars.df = as.disk.frame(cars)
+streamacq = make_glm_streaming_fn(cars.df, verbose = FALSE)
+
+majorv = as.integer(version$major)
+minorv = as.integer(strsplit(version$minor, ".", fixed=TRUE)[[1]][1])
+if(((majorv == 3) & (minorv >= 6)) | (majorv > 3)) {
+  m = biglm::bigglm(dist ~ speed, data = streamacq)
+  summary(m)
+  predict(m, get_chunk(cars.df, 1))
+  predict(m, collect(cars.df, 1))
+} else {
+  m = speedglm::shglm(dist ~ speed, data = streamacq)
+}
+#>         [,1]
+#> 1  -1.849460
+#> 2  -1.849460
+#> 3   9.947766
+#> 4   9.947766
+#> 5  13.880175
+#> 6  17.812584
+#> 7  21.744993
+#> 8  21.744993
+#> 9  21.744993
+#> 10 25.677401
+#> 11 25.677401
+#> 12 29.609810
+#> 13 29.609810
+#> 14 29.609810
+#> 15 29.609810
+#> 16 33.542219
+#> 17 33.542219
+#> 18 33.542219
+#> 19 33.542219
+#> 20 37.474628
+#> 21 37.474628
+#> 22 37.474628
+#> 23 37.474628
+#> 24 41.407036
+#> 25 41.407036
+#> 26 41.407036
+#> 27 45.339445
+#> 28 45.339445
+#> 29 49.271854
+#> 30 49.271854
+#> 31 49.271854
+#> 32 53.204263
+#> 33 53.204263
+#> 34 53.204263
+#> 35 53.204263
+#> 36 57.136672
+#> 37 57.136672
+#> 38 57.136672
+#> 39 61.069080
+#> 40 61.069080
+#> 41 61.069080
+#> 42 61.069080
+#> 43 61.069080
+#> 44 68.933898
+#> 45 72.866307
+#> 46 76.798715
+#> 47 76.798715
+#> 48 76.798715
+#> 49 76.798715
+#> 50 80.731124
+
+
+
- - - + + diff --git a/docs/reference/merge.disk.frame.html b/docs/reference/merge.disk.frame.html index 7e9d9792..26cafba5 100644 --- a/docs/reference/merge.disk.frame.html +++ b/docs/reference/merge.disk.frame.html @@ -1,67 +1,12 @@ - - - - - - - -Merge function for disk.frames — merge.disk.frame • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Merge function for disk.frames — merge.disk.frame • disk.frame + + - - - - -
-
- -
- -
+
@@ -162,85 +95,70 @@

Merge function for disk.frames

Merge function for disk.frames

-
# S3 method for disk.frame
-merge(
-  x,
-  y,
-  by,
-  outdir = tempfile(fileext = ".df"),
-  ...,
-  merge_by_chunk_id = FALSE,
-  overwrite = FALSE
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
x

a disk.frame

y

a disk.frame or data.frame

by

the merge by keys

outdir

The output directory for the disk.frame

...

passed to merge and cmap.disk.frame

merge_by_chunk_id

if TRUE then only chunks in df1 and df2 with the same chunk id will get merged

overwrite

overwrite the outdir or not

- - -

Examples

-
b = as.disk.frame(data.frame(a = 51:150, b = 1:100)) -d = as.disk.frame(data.frame(a = 151:250, b = 1:100)) -bd.df = merge(b, d, by = "b", merge_by_chunk_id = TRUE) - -# clean up cars.df -delete(b) -delete(d) -delete(bd.df)
+
+
# S3 method for disk.frame
+merge(
+  x,
+  y,
+  by,
+  outdir = tempfile(fileext = ".df"),
+  ...,
+  merge_by_chunk_id = FALSE,
+  overwrite = FALSE
+)
+
+ +
+

Arguments

+
x
+

a disk.frame

+
y
+

a disk.frame or data.frame

+
by
+

the merge by keys

+
outdir
+

The output directory for the disk.frame

+
...
+

passed to merge and cmap.disk.frame

+
merge_by_chunk_id
+

if TRUE then only chunks in df1 and df2 with the same chunk id will get merged

+
overwrite
+

overwrite the outdir or not

+
+ +
+

Examples

+
b = as.disk.frame(data.frame(a = 51:150, b = 1:100))
+d = as.disk.frame(data.frame(a = 151:250, b = 1:100))
+bd.df = merge(b, d, by = "b", merge_by_chunk_id = TRUE)
+
+# clean up cars.df
+delete(b)
+delete(d)
+delete(bd.df)
+
+
+
- - - + + diff --git a/docs/reference/move_to.html b/docs/reference/move_to.html index 4755299f..9037467e 100644 --- a/docs/reference/move_to.html +++ b/docs/reference/move_to.html @@ -1,67 +1,12 @@ - - - - - - - -Move or copy a disk.frame to another location — move_to • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Move or copy a disk.frame to another location — move_to • disk.frame - - + + - - -
-
- -
- -
+
@@ -162,70 +95,62 @@

Move or copy a disk.frame to another location

Move or copy a disk.frame to another location

-
move_to(df, outdir, ..., copy = FALSE)
-
-copy_df_to(df, outdir, ...)
- -

Arguments

- - - - - - - - - - - - - - - - - - -
df

The disk.frame

outdir

The new location

...

NOT USED

copy

Merely copy and not move

- -

Value

- -

a disk.frame

+
+
move_to(df, outdir, ..., copy = FALSE)
 
-    

Examples

-
cars.df = as.disk.frame(cars) - -cars_copy.df = copy_df_to(cars.df, outdir = tempfile(fileext=".df")) +copy_df_to(df, outdir, ...)
+
-cars2.df = move_to(cars.df, outdir = tempfile(fileext=".df")) +
+

Arguments

+
df
+

The disk.frame

+
outdir
+

The new location

+
...
+

NOT USED

+
copy
+

Merely copy and not move

+
+
+

Value

+

a disk.frame

+
-# clean up -delete(cars_copy.df) -delete(cars2.df)
+
+

Examples

+
cars.df = as.disk.frame(cars)
+
+cars_copy.df = copy_df_to(cars.df, outdir = tempfile(fileext=".df"))
+
+cars2.df = move_to(cars.df, outdir = tempfile(fileext=".df"))
+
+# clean up
+delete(cars_copy.df)
+delete(cars2.df)
+
+
+
- - - + + diff --git a/docs/reference/nchunks.html b/docs/reference/nchunks.html index 19440550..988619c1 100644 --- a/docs/reference/nchunks.html +++ b/docs/reference/nchunks.html @@ -1,67 +1,12 @@ - - - - - - - -Returns the number of chunks in a disk.frame — nchunks • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Returns the number of chunks in a disk.frame — nchunks • disk.frame - + + - - - -
-
- -
- -
+
@@ -162,66 +95,63 @@

Returns the number of chunks in a disk.frame

Returns the number of chunks in a disk.frame

-
nchunks(df, ...)
-
-nchunk(df, ...)
-
-# S3 method for disk.frame
-nchunk(df, ...)
-
-# S3 method for disk.frame
-nchunks(df, skip.ready.check = FALSE, ...)
- -

Arguments

- - - - - - - - - - - - - - -
df

a disk.frame

...

not used

skip.ready.check

NOT implemented

- - -

Examples

-
cars.df = as.disk.frame(cars) - -# return the number of chunks -nchunks(cars.df)
#> [1] 6
nchunk(cars.df)
#> [1] 6
-# clean up cars.df -delete(cars.df)
+
+
nchunks(df, ...)
+
+nchunk(df, ...)
+
+# S3 method for disk.frame
+nchunk(df, ...)
+
+# S3 method for disk.frame
+nchunks(df, skip.ready.check = FALSE, ...)
+
+ +
+

Arguments

+
df
+

a disk.frame

+
...
+

not used

+
skip.ready.check
+

NOT implemented

+
+ +
+

Examples

+
cars.df = as.disk.frame(cars)
+
+# return the number of chunks
+nchunks(cars.df)
+#> [1] 6
+nchunk(cars.df)
+#> [1] 6
+
+# clean up cars.df
+delete(cars.df)
+
+
+
- - - + + diff --git a/docs/reference/ncol_nrow.html b/docs/reference/ncol_nrow.html index 3b071e74..a8a9d09a 100644 --- a/docs/reference/ncol_nrow.html +++ b/docs/reference/ncol_nrow.html @@ -1,67 +1,12 @@ - - - - - - - -Number of rows or columns — nrow • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Number of rows or columns — nrow • disk.frame - + + - - - -
-
- -
- -
+
@@ -162,62 +95,61 @@

Number of rows or columns

Number of rows or columns

-
nrow(df, ...)
-
-# S3 method for disk.frame
-nrow(df, ...)
-
-ncol(df)
-
-# S3 method for disk.frame
-ncol(df)
+
+
nrow(df, ...)
 
-    

Arguments

- - - - - - - - - - -
df

a disk.frame

...

passed to base::nrow

+# S3 method for disk.frame +nrow(df, ...) +ncol(df) -

Examples

-
cars.df = as.disk.frame(cars) +# S3 method for disk.frame +ncol(df)
+
-# return total number of column and rows -ncol(cars.df)
#> [1] 2
nrow(cars.df)
#> [1] 50
-# clean up cars.df -delete(cars.df)
+
+

Arguments

+
df
+

a disk.frame

+
...
+

passed to base::nrow

+
+ +
+

Examples

+
cars.df = as.disk.frame(cars)
+
+# return total number of column and rows
+ncol(cars.df)
+#> [1] 2
+nrow(cars.df)
+#> [1] 50
+
+# clean up cars.df
+delete(cars.df)
+
+
+
- - - + + diff --git a/docs/reference/one-stage-group-by-verbs.html b/docs/reference/one-stage-group-by-verbs.html index 5aa4d240..22b7a93c 100644 --- a/docs/reference/one-stage-group-by-verbs.html +++ b/docs/reference/one-stage-group-by-verbs.html @@ -1,69 +1,14 @@ - - - - - - - -One Stage function — var_df.chunk_agg.disk.frame • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -One Stage function — var_df.chunk_agg.disk.frame • disk.frame - - - - - - - - - - - + + - - -
-
- -
- -
+
@@ -166,109 +99,97 @@

One Stage function

mean collected_agg

-
var_df.chunk_agg.disk.frame(x, na.rm = FALSE)
+    
+
var_df.chunk_agg.disk.frame(x, na.rm = FALSE)
 
-var_df.collected_agg.disk.frame(listx)
+var_df.collected_agg.disk.frame(listx)
 
-sd_df.chunk_agg.disk.frame(x, na.rm = FALSE)
+sd_df.chunk_agg.disk.frame(x, na.rm = FALSE)
 
-sd_df.collected_agg.disk.frame(listx)
+sd_df.collected_agg.disk.frame(listx)
 
-mean_df.chunk_agg.disk.frame(x, na.rm = FALSE, ...)
+mean_df.chunk_agg.disk.frame(x, na.rm = FALSE, ...)
 
-mean_df.collected_agg.disk.frame(listx)
+mean_df.collected_agg.disk.frame(listx)
 
-sum_df.chunk_agg.disk.frame(x, ...)
+sum_df.chunk_agg.disk.frame(x, ...)
 
-sum_df.collected_agg.disk.frame(listx, ...)
+sum_df.collected_agg.disk.frame(listx, ...)
 
-min_df.chunk_agg.disk.frame(x, ...)
+min_df.chunk_agg.disk.frame(x, ...)
 
-min_df.collected_agg.disk.frame(listx, ...)
+min_df.collected_agg.disk.frame(listx, ...)
 
-max_df.chunk_agg.disk.frame(x, ...)
+max_df.chunk_agg.disk.frame(x, ...)
 
-max_df.collected_agg.disk.frame(listx, ...)
+max_df.collected_agg.disk.frame(listx, ...)
 
-median_df.chunk_agg.disk.frame(x, ...)
+median_df.chunk_agg.disk.frame(x, ...)
 
-median_df.collected_agg.disk.frame(listx, ...)
+median_df.collected_agg.disk.frame(listx, ...)
 
-n_df.chunk_agg.disk.frame(...)
+n_df.chunk_agg.disk.frame(...)
 
-n_df.collected_agg.disk.frame(listx, ...)
+n_df.collected_agg.disk.frame(listx, ...)
 
-length_df.chunk_agg.disk.frame(x, ...)
+length_df.chunk_agg.disk.frame(x, ...)
 
-length_df.collected_agg.disk.frame(listx, ...)
+length_df.collected_agg.disk.frame(listx, ...)
 
-any_df.chunk_agg.disk.frame(x, ...)
+any_df.chunk_agg.disk.frame(x, ...)
 
-any_df.collected_agg.disk.frame(listx, ...)
+any_df.collected_agg.disk.frame(listx, ...)
 
-all_df.chunk_agg.disk.frame(x, ...)
+all_df.chunk_agg.disk.frame(x, ...)
 
-all_df.collected_agg.disk.frame(listx, ...)
+all_df.collected_agg.disk.frame(listx, ...)
 
-n_distinct_df.chunk_agg.disk.frame(x, na.rm = FALSE, ...)
+n_distinct_df.chunk_agg.disk.frame(x, na.rm = FALSE, ...)
 
-n_distinct_df.collected_agg.disk.frame(listx, ...)
+n_distinct_df.collected_agg.disk.frame(listx, ...)
 
-quantile_df.chunk_agg.disk.frame(x, ...)
+quantile_df.chunk_agg.disk.frame(x, ...)
 
-quantile_df.collected_agg.disk.frame(listx, ...)
+quantile_df.collected_agg.disk.frame(listx, ...)
 
-IQR_df.chunk_agg.disk.frame(x, na.rm = FALSE, ...)
+IQR_df.chunk_agg.disk.frame(x, na.rm = FALSE, ...)
 
-IQR_df.collected_agg.disk.frame(listx, ...)
- -

Arguments

- - - - - - - - - - - - - - - - - - -
x

the input

na.rm

Remove NAs. TRUE of FALSE

listx

a list

...

additional options

+IQR_df.collected_agg.disk.frame(listx, ...)
+
+
+

Arguments

+
x
+

the input

+
na.rm
+

Remove NAs. TRUE of FALSE

+
listx
+

a list

+
...
+

additional options

+
+ - - - + + diff --git a/docs/reference/overwrite_check.html b/docs/reference/overwrite_check.html index 235c9466..f4d07be2 100644 --- a/docs/reference/overwrite_check.html +++ b/docs/reference/overwrite_check.html @@ -1,67 +1,12 @@ - - - - - - - -Check if the outdir exists or not — overwrite_check • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Check if the outdir exists or not — overwrite_check • disk.frame + + - - - - -
-
- -
- -
+
@@ -162,54 +95,49 @@

Check if the outdir exists or not

If the overwrite is TRUE then the folder will be deleted, otherwise the folder will be created.

-
overwrite_check(outdir, overwrite)
- -

Arguments

- - - - - - - - - - -
outdir

the output directory

overwrite

TRUE or FALSE if `outdir`` exists and overwrite = FALSE then throw an error

- - -

Examples

-
tf = tempfile() -overwrite_check(tf, overwrite = FALSE) -overwrite_check(tf, overwrite = TRUE) - -# clean up -fs::dir_delete(tf)
+
+
overwrite_check(outdir, overwrite)
+
+ +
+

Arguments

+
outdir
+

the output directory

+
overwrite
+

TRUE or FALSE if `outdir`` exists and overwrite = FALSE then throw an error

+
+ +
+

Examples

+
tf = tempfile()
+overwrite_check(tf, overwrite = FALSE)
+overwrite_check(tf, overwrite = TRUE)
+
+# clean up
+fs::dir_delete(tf)
+
+
+
- - - + + diff --git a/docs/reference/print.disk.frame.html b/docs/reference/print.disk.frame.html index b8402b06..a0ef8db4 100644 --- a/docs/reference/print.disk.frame.html +++ b/docs/reference/print.disk.frame.html @@ -1,67 +1,12 @@ - - - - - - - -Print disk.frame — print.disk.frame • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Print disk.frame — print.disk.frame • disk.frame - - + + - - -
-
- -
- -
+
@@ -162,48 +95,40 @@

Print disk.frame

a new print method for disk.frame

-
# S3 method for disk.frame
-print(x, ...)
- -

Arguments

- - - - - - - - - - -
x

disk.frame

...

not used

+
+
# S3 method for disk.frame
+print(x, ...)
+
+
+

Arguments

+
x
+

disk.frame

+
...
+

not used

+
+
- - - + + diff --git a/docs/reference/pull.disk.frame.html b/docs/reference/pull.disk.frame.html index e22e0329..ff438239 100644 --- a/docs/reference/pull.disk.frame.html +++ b/docs/reference/pull.disk.frame.html @@ -1,67 +1,12 @@ - - - - - - - -Pull a column from table similar to `dplyr::pull`. — pull.disk.frame • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Pull a column from table similar to `dplyr::pull`. — pull.disk.frame • disk.frame - - + + - - -
-
- -
- -
+
@@ -162,56 +95,44 @@

Pull a column from table similar to `dplyr::pull`.

Pull a column from table similar to `dplyr::pull`.

-
# S3 method for disk.frame
-pull(.data, var = -1, name = NULL, ...)
- -

Arguments

- - - - - - - - - - - - - - - - - - -
.data

The disk.frame

var

can be an positive or negative integer or a character/string. See dplyr::pull documentation

name

See dplyr::pull documentation

...

Not used, kept for compatibility with `dplyr::pull`

+
+
# S3 method for disk.frame
+pull(.data, var = -1, name = NULL, ...)
+
+
+

Arguments

+
.data
+

The disk.frame

+
var
+

can be an positive or negative integer or a character/string. See dplyr::pull documentation

+
name
+

See dplyr::pull documentation

+
...
+

Not used, kept for compatibility with `dplyr::pull`

+
+
- - - + + diff --git a/docs/reference/rbindlist.disk.frame.html b/docs/reference/rbindlist.disk.frame.html index 9226c689..081dadec 100644 --- a/docs/reference/rbindlist.disk.frame.html +++ b/docs/reference/rbindlist.disk.frame.html @@ -1,67 +1,12 @@ - - - - - - - -rbindlist disk.frames together — rbindlist.disk.frame • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -rbindlist disk.frames together — rbindlist.disk.frame • disk.frame + + - - - - -
-
- -
- -
+
@@ -162,83 +95,70 @@

rbindlist disk.frames together

rbindlist disk.frames together

-
rbindlist.disk.frame(
-  df_list,
-  outdir = tempfile(fileext = ".df"),
-  by_chunk_id = TRUE,
-  parallel = TRUE,
-  compress = 50,
-  overwrite = TRUE,
-  .progress = TRUE
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
df_list

A list of disk.frames

outdir

Output directory of the row-bound disk.frames

by_chunk_id

If TRUE then only the chunks with the same chunk IDs will be bound

parallel

if TRUE then bind multiple disk.frame simultaneously, Defaults to TRUE

compress

0-100, 100 being the highest compression rate.

overwrite

overwrite the output directory

.progress

A logical, for whether or not to print a progress bar for multiprocess, multisession, and multicore plans. From furrr

- - -

Examples

-
cars.df = as.disk.frame(cars) - -# row-bind two disk.frames -cars2.df = rbindlist.disk.frame(list(cars.df, cars.df))
#> Appending disk.frames:
-# clean up cars.df -delete(cars.df) -delete(cars2.df)
+
+
rbindlist.disk.frame(
+  df_list,
+  outdir = tempfile(fileext = ".df"),
+  by_chunk_id = TRUE,
+  parallel = TRUE,
+  compress = 50,
+  overwrite = TRUE,
+  .progress = TRUE
+)
+
+ +
+

Arguments

+
df_list
+

A list of disk.frames

+
outdir
+

Output directory of the row-bound disk.frames

+
by_chunk_id
+

If TRUE then only the chunks with the same chunk IDs will be bound

+
parallel
+

if TRUE then bind multiple disk.frame simultaneously, Defaults to TRUE

+
compress
+

0-100, 100 being the highest compression rate.

+
overwrite
+

overwrite the output directory

+
.progress
+

A logical, for whether or not to print a progress bar for multiprocess, multisession, and multicore plans. From furrr

+
+ +
+

Examples

+
cars.df = as.disk.frame(cars)
+
+# row-bind two disk.frames
+cars2.df = rbindlist.disk.frame(list(cars.df, cars.df))
+#> Appending disk.frames: 
+
+# clean up cars.df
+delete(cars.df)
+delete(cars2.df)
+
+
+
- - - + + diff --git a/docs/reference/rechunk.html b/docs/reference/rechunk.html index 2abbba00..1ad617e3 100644 --- a/docs/reference/rechunk.html +++ b/docs/reference/rechunk.html @@ -1,67 +1,12 @@ - - - - - - - -Increase or decrease the number of chunks in the disk.frame — rechunk • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Increase or decrease the number of chunks in the disk.frame — rechunk • disk.frame - + + - - - -
-
- -
- -
+
@@ -162,97 +95,88 @@

Increase or decrease the number of chunks in the disk.frame

Increase or decrease the number of chunks in the disk.frame

-
rechunk(
-  df,
-  nchunks,
-  outdir = attr(df, "path", exact = TRUE),
-  shardby = NULL,
-  overwrite = TRUE,
-  shardby_function = "hash",
-  sort_splits = NULL,
-  desc_vars = NULL
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
df

the disk.frame to rechunk

nchunks

number of chunks

outdir

the output directory

shardby

the shardkeys

overwrite

overwrite the output directory

shardby_function

splitting of chunks: "hash" for hash function or "sort" for semi-sorted chunks

sort_splits

for the "sort" shardby function, a dataframe with the split values.

desc_vars

for the "sort" shardby function, the variables to sort descending.

- - -

Examples

-
# create a disk.frame with 2 chunks in tempdir() -cars.df = as.disk.frame(cars, nchunks = 2) +
+
rechunk(
+  df,
+  nchunks,
+  outdir = attr(df, "path", exact = TRUE),
+  shardby = NULL,
+  overwrite = TRUE,
+  shardby_function = "hash",
+  sort_splits = NULL,
+  desc_vars = NULL
+)
+
-# re-chunking cars.df to 3 chunks, done "in-place" to the same folder as cars.df -rechunk(cars.df, 3)
#> files have been backed up to temporary dir C:\Users\RTX2080\AppData\Local\Temp\RtmpInritK\back_up_tmp_dir187c185718b5. You can recover there files until you restart your R session
#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpInritK\file187c36421ee.df" -#> nchunks: 3 -#> nrow (at source): 50 -#> ncol (at source): 2 -#> nrow (post operations): ??? -#> ncol (post operations): ???
-new_path = tempfile(fileext = ".df") -# re-chunking cars.df to 4 chunks, shard by speed, and done "out-of-place" to a new directory -cars2.df = rechunk(cars.df, 4, outdir=new_path, shardby = "speed")
#> Hashing...
#> Hashing...
#> Hashing...
#> Appending disk.frames:
-# clean up cars.df -delete(cars.df) -delete(cars2.df)
+
+

Arguments

+
df
+

the disk.frame to rechunk

+
nchunks
+

number of chunks

+
outdir
+

the output directory

+
shardby
+

the shardkeys

+
overwrite
+

overwrite the output directory

+
shardby_function
+

splitting of chunks: "hash" for hash function or "sort" for semi-sorted chunks

+
sort_splits
+

for the "sort" shardby function, a dataframe with the split values.

+
desc_vars
+

for the "sort" shardby function, the variables to sort descending.

+
+ +
+

Examples

+
# create a disk.frame with 2 chunks in tempdir()
+cars.df = as.disk.frame(cars, nchunks = 2)
+
+# re-chunking cars.df to 3 chunks, done "in-place" to the same folder as cars.df
+rechunk(cars.df, 3)
+#> files have been backed up to temporary dir C:\Users\RTX2080\AppData\Local\Temp\Rtmp2rQjw5\back_up_tmp_dir56f4356b56cb. You can recover there files until you restart your R session
+#> path: "C:\Users\RTX2080\AppData\Local\Temp\Rtmp2rQjw5\file56f4c8a34c9.df"
+#> nchunks: 3
+#> nrow (at source): 50
+#> ncol (at source): 2
+#> nrow (post operations): ???
+#> ncol (post operations): ???
+
+new_path = tempfile(fileext = ".df")
+# re-chunking cars.df to 4 chunks, shard by speed, and done "out-of-place" to a new directory
+cars2.df = rechunk(cars.df, 4, outdir=new_path, shardby = "speed")
+#> Hashing...
+#> Hashing...
+#> Hashing...
+#> Appending disk.frames: 
+
+# clean up cars.df
+delete(cars.df)
+delete(cars2.df)
+
+
+
- - - + + diff --git a/docs/reference/recommend_nchunks.html b/docs/reference/recommend_nchunks.html index f0c9e401..e549d81c 100644 --- a/docs/reference/recommend_nchunks.html +++ b/docs/reference/recommend_nchunks.html @@ -1,68 +1,13 @@ - - - - - - - -Recommend number of chunks based on input size — recommend_nchunks • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Recommend number of chunks based on input size — recommend_nchunks • disk.frame + + - - - - -
-
- -
- -
+
@@ -164,75 +97,67 @@

Recommend number of chunks based on input size

into. It can accept filesizes in bytes (as integer) or a data.frame

-
recommend_nchunks(
-  df,
-  type = "csv",
-  minchunks = data.table::getDTthreads(),
-  conservatism = 8,
-  ram_size = df_ram_size()
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - -
df

a disk.frame or the file size in bytes of a CSV file holding the -data

type

only = "csv" is supported. It indicates the file type -corresponding to file size `df`

minchunks

the minimum number of chunks. Defaults to the number of CPU -cores (without hyper-threading)

conservatism

a multiplier to the recommended number of chunks. The +

+
recommend_nchunks(
+  df,
+  type = "csv",
+  minchunks = data.table::getDTthreads(),
+  conservatism = 8,
+  ram_size = df_ram_size()
+)
+
+ +
+

Arguments

+
df
+

a disk.frame or the file size in bytes of a CSV file holding the +data

+
type
+

only = "csv" is supported. It indicates the file type +corresponding to file size `df`

+
minchunks
+

the minimum number of chunks. Defaults to the number of CPU +cores (without hyper-threading)

+
conservatism
+

a multiplier to the recommended number of chunks. The more chunks the smaller the chunk size and more likely that each chunk can -fit into RAM

ram_size

The amount of RAM available which is usually computed. Except on RStudio with R3.6+

- - -

Examples

-
# recommend nchunks based on data.frame -recommend_nchunks(cars)
#> [1] 6
-# recommend nchunks based on file size ONLY CSV is implemented at the moment -recommend_nchunks(1024^3)
#> [1] 6
+fit into RAM

+
ram_size
+

The amount of RAM available which is usually computed. Except on RStudio with R3.6+

+
+ +
+

Examples

+
# recommend nchunks based on data.frame
+recommend_nchunks(cars)
+#> [1] 6
+
+# recommend nchunks based on file size ONLY CSV is implemented at the moment
+recommend_nchunks(1024^3)
+#> [1] 6
+
+
+
- - - + + diff --git a/docs/reference/remove_chunk.html b/docs/reference/remove_chunk.html index 00048845..974ae319 100644 --- a/docs/reference/remove_chunk.html +++ b/docs/reference/remove_chunk.html @@ -1,67 +1,12 @@ - - - - - - - -Removes a chunk from the disk.frame — remove_chunk • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Removes a chunk from the disk.frame — remove_chunk • disk.frame + + - - - - -
-
- -
- -
+
@@ -162,78 +95,83 @@

Removes a chunk from the disk.frame

Removes a chunk from the disk.frame

-
remove_chunk(df, chunk_id, full.names = FALSE)
- -

Arguments

- - - - - - - - - - - - - - -
df

a disk.frame

chunk_id

the chunk ID of the chunk to remove. If it's a number then return number.fst

full.names

TRUE or FALSE. Defaults to FALSE. If true then chunk_id is the full path to the chunk otherwise it's the relative path

- - -

Examples

-
# TODO add these to tests -cars.df = as.disk.frame(cars, nchunks = 4) - -# removes 3rd chunk -remove_chunk(cars.df, 3)
#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpInritK\file187c1fad1470.df" -#> nchunks: 3 -#> nrow (at source): 37 -#> ncol (at source): 2 -#> nrow (post operations): ??? -#> ncol (post operations): ???
nchunks(cars.df) # 3
#> [1] 3
-# removes 4th chunk -remove_chunk(cars.df, "4.fst")
#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpInritK\file187c1fad1470.df" -#> nchunks: 2 -#> nrow (at source): 26 -#> ncol (at source): 2 -#> nrow (post operations): ??? -#> ncol (post operations): ???
nchunks(cars.df) # 3
#> [1] 2
-# removes 2nd chunk -remove_chunk(cars.df, file.path(attr(cars.df, "path", exact=TRUE), "2.fst"), full.names = TRUE)
#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpInritK\file187c1fad1470.df" -#> nchunks: 1 -#> nrow (at source): 13 -#> ncol (at source): 2 -#> nrow (post operations): ??? -#> ncol (post operations): ???
nchunks(cars.df) # 1
#> [1] 1
-# clean up cars.df -delete(cars.df)
+
+
remove_chunk(df, chunk_id, full.names = FALSE)
+
+ +
+

Arguments

+
df
+

a disk.frame

+
chunk_id
+

the chunk ID of the chunk to remove. If it's a number, then the chunk file number.fst is removed

+
full.names
+

TRUE or FALSE. Defaults to FALSE. If true then chunk_id is the full path to the chunk otherwise it's the relative path

+
+ +
+

Examples

+
# TODO add these to tests
+cars.df = as.disk.frame(cars, nchunks = 4)
+
+# removes 3rd chunk
+remove_chunk(cars.df, 3)
+#> path: "C:\Users\RTX2080\AppData\Local\Temp\Rtmp2rQjw5\file56f45e4b14bd.df"
+#> nchunks: 3
+#> nrow (at source): 37
+#> ncol (at source): 2
+#> nrow (post operations): ???
+#> ncol (post operations): ???
+nchunks(cars.df) # 3
+#> [1] 3
+
+# removes 4th chunk
+remove_chunk(cars.df, "4.fst")
+#> path: "C:\Users\RTX2080\AppData\Local\Temp\Rtmp2rQjw5\file56f45e4b14bd.df"
+#> nchunks: 2
+#> nrow (at source): 26
+#> ncol (at source): 2
+#> nrow (post operations): ???
+#> ncol (post operations): ???
+nchunks(cars.df) # 2
+#> [1] 2
+
+# removes 2nd chunk
+remove_chunk(cars.df, file.path(attr(cars.df, "path", exact=TRUE), "2.fst"), full.names = TRUE)
+#> path: "C:\Users\RTX2080\AppData\Local\Temp\Rtmp2rQjw5\file56f45e4b14bd.df"
+#> nchunks: 1
+#> nrow (at source): 13
+#> ncol (at source): 2
+#> nrow (post operations): ???
+#> ncol (post operations): ???
+nchunks(cars.df) # 1
+#> [1] 1
+
+# clean up cars.df
+delete(cars.df)
+
+
+
- - - + + diff --git a/docs/reference/sample.html b/docs/reference/sample.html index 161546d8..719d4244 100644 --- a/docs/reference/sample.html +++ b/docs/reference/sample.html @@ -1,67 +1,12 @@ - - - - - - - -Sample n rows from a disk.frame — sample_frac.disk.frame • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Sample n rows from a disk.frame — sample_frac.disk.frame • disk.frame - + + - - - -
-
- -
- -
+
@@ -162,97 +95,87 @@

Sample n rows from a disk.frame

Sample n rows from a disk.frame

-
# S3 method for disk.frame
-sample_frac(tbl, size = 1, replace = FALSE, weight = NULL, .env = NULL, ...)
+
+
# S3 method for disk.frame
+sample_frac(tbl, size = 1, replace = FALSE, weight = NULL, .env = NULL, ...)
+
-

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - -
tbl

A data.frame.

size

<tidy-select> -For sample_n(), the number of rows to select. -For sample_frac(), the fraction of rows to select. -If tbl is grouped, size applies to each group.

replace

Sample with or without replacement?

weight

<tidy-select> Sampling weights. +

+

Arguments

+
tbl
+

A data.frame.

+
size
+

<tidy-select> +For sample_n(), the number of rows to select. +For sample_frac(), the fraction of rows to select. +If tbl is grouped, size applies to each group.

+
replace
+

Sample with or without replacement?

+
weight
+

<tidy-select> Sampling weights. This must evaluate to a vector of non-negative numbers the same length as -the input. Weights are automatically standardised to sum to 1.

.env

DEPRECATED.

...

ignored

- - -

Examples

-
cars.df = as.disk.frame(cars) - -collect(sample_frac(cars.df, 0.5))
#> speed dist -#> 1 10 34 -#> 2 8 16 -#> 3 4 10 -#> 4 10 26 -#> 5 12 14 -#> 6 13 26 -#> 7 11 28 -#> 8 13 34 -#> 9 13 46 -#> 10 15 26 -#> 11 16 32 -#> 12 15 54 -#> 13 18 42 -#> 14 17 32 -#> 15 18 84 -#> 16 17 40 -#> 17 22 66 -#> 18 19 68 -#> 19 20 48 -#> 20 20 52 -#> 21 24 120 -#> 22 25 85
-# clean up cars.df -delete(cars.df)
+the input. Weights are automatically standardised to sum to 1.

+
.env
+

DEPRECATED.

+
...
+

ignored

+
+ +
+

Examples

+
cars.df = as.disk.frame(cars)
+
+collect(sample_frac(cars.df, 0.5))
+#>     speed dist
+#>  1:     7   22
+#>  2:     7    4
+#>  3:     9   10
+#>  4:    10   26
+#>  5:    13   34
+#>  6:    11   17
+#>  7:    12   20
+#>  8:    11   28
+#>  9:    14   26
+#> 10:    14   36
+#> 11:    15   26
+#> 12:    14   80
+#> 13:    18   84
+#> 14:    18   56
+#> 15:    18   76
+#> 16:    17   50
+#> 17:    20   56
+#> 18:    19   46
+#> 19:    19   68
+#> 20:    20   32
+#> 21:    25   85
+#> 22:    24   70
+#>     speed dist
+
+# clean up cars.df
+delete(cars.df)
+
+
+
- - - + + diff --git a/docs/reference/setup_disk.frame.html b/docs/reference/setup_disk.frame.html index 2c8a7cda..4b1ee1c7 100644 --- a/docs/reference/setup_disk.frame.html +++ b/docs/reference/setup_disk.frame.html @@ -1,67 +1,12 @@ - - - - - - - -Set up disk.frame environment — setup_disk.frame • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Set up disk.frame environment — setup_disk.frame • disk.frame + + - - - - -
-
- -
- -
+
@@ -162,76 +95,71 @@

Set up disk.frame environment

Set up disk.frame environment

-
setup_disk.frame(
-  workers = data.table::getDTthreads(),
-  future_backend = future::multisession,
-  ...,
-  gui = FALSE
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - -
workers

the number of workers (background R processes in the

future_backend

which future backend to use for parallelization

...

passed to `future::plan`

gui

Whether to use a Graphical User Interface (GUI) for selecting the options. Defaults to FALSE

- - -

Examples

-
if (interactive()) { - # setup disk.frame to use multiple workers these may use more than two - # cores, and is therefore not allowed on CRAN. Hence it's set to run only in - # interactive session - setup_disk.frame() - - # use a Shiny GUI to adjust settings - # only run in interactive() - setup_disk.frame(gui = TRUE) -} - -# set the number workers to 2 -setup_disk.frame(2)
#> The number of workers available for disk.frame is 2
-# if you do not wish to use multiple workers you can set it to sequential -setup_disk.frame(future_backend=future::sequential)
#> Warning: Ignored 2 unknown arguments: 'workers', 'gc'
#> The number of workers available for disk.frame is 1
+
+
setup_disk.frame(
+  workers = data.table::getDTthreads(),
+  future_backend = future::multisession,
+  ...,
+  gui = FALSE
+)
+
+ +
+

Arguments

+
workers
+

the number of workers (background R processes) to use

+
future_backend
+

which future backend to use for parallelization

+
...
+

passed to `future::plan`

+
gui
+

Whether to use a Graphical User Interface (GUI) for selecting the options. Defaults to FALSE

+
+ +
+

Examples

+
if (interactive()) {
+  # setup disk.frame to use multiple workers these may use more than two
+  # cores, and is therefore not allowed on CRAN. Hence it's set to run only in
+  # interactive session
+  setup_disk.frame()
+  
+  # use a Shiny GUI to adjust settings
+  # only run in interactive()
+  setup_disk.frame(gui = TRUE)
+}
+
+# set the number workers to 2
+setup_disk.frame(2)
+#> The number of workers available for disk.frame is 2
+
+# if you do not wish to use multiple workers you can set it to sequential
+setup_disk.frame(future_backend=future::sequential)
+#> Warning: Detected 1 unknown future arguments: 'workers'
+#> The number of workers available for disk.frame is 1
+
+
+
- - - + + diff --git a/docs/reference/shard.html b/docs/reference/shard.html index 13f9501d..eca41f84 100644 --- a/docs/reference/shard.html +++ b/docs/reference/shard.html @@ -1,68 +1,13 @@ - - - - - - - -Shard a data.frame/data.table or disk.frame into chunk and saves it into a disk.frame — shard • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Shard a data.frame/data.table or disk.frame into chunk and saves it into a disk.frame — shard • disk.frame - + + - - - -
-
- -
- -
+
@@ -164,93 +97,76 @@

Shard a data.frame/data.table or disk.frame into chunk and saves it into a d

`distribute` is an alias for `shard`

-
shard(
-  df,
-  shardby,
-  outdir = tempfile(fileext = ".df"),
-  ...,
-  nchunks = recommend_nchunks(df),
-  overwrite = FALSE,
-  shardby_function = "hash",
-  sort_splits = NULL,
-  desc_vars = NULL
-)
-
-distribute(...)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
df

A data.frame/data.table or disk.frame. If disk.frame, then rechunk(df, ...) is run

shardby

The column(s) to shard the data by.

outdir

The output directory of the disk.frame

...

not used

nchunks

The number of chunks

overwrite

If TRUE then the chunks are overwritten

shardby_function

splitting of chunks: "hash" for hash function or "sort" for semi-sorted chunks

sort_splits

If shardby_function is "sort", the split values for sharding

desc_vars

for the "sort" shardby function, the variables to sort descending.

- +
+
shard(
+  df,
+  shardby,
+  outdir = tempfile(fileext = ".df"),
+  ...,
+  nchunks = recommend_nchunks(df),
+  overwrite = FALSE,
+  shardby_function = "hash",
+  sort_splits = NULL,
+  desc_vars = NULL
+)
+
+distribute(...)
+
-

Examples

-
-# shard the cars data.frame by speed so that rows with the same speed are in the same chunk -iris.df = shard(iris, "Species")
#> Hashing...
-# clean up cars.df -delete(iris.df)
+
+

Arguments

+
df
+

A data.frame/data.table or disk.frame. If disk.frame, then rechunk(df, ...) is run

+
shardby
+

The column(s) to shard the data by.

+
outdir
+

The output directory of the disk.frame

+
...
+

not used

+
nchunks
+

The number of chunks

+
overwrite
+

If TRUE then the chunks are overwritten

+
shardby_function
+

splitting of chunks: "hash" for hash function or "sort" for semi-sorted chunks

+
sort_splits
+

If shardby_function is "sort", the split values for sharding

+
desc_vars
+

for the "sort" shardby function, the variables to sort descending.

+
+ +
+

Examples

+

+# shard the iris data.frame by Species so that rows with the same Species are in the same chunk
+iris.df = shard(iris, "Species")
+#> Hashing...
+
+# clean up iris.df
+delete(iris.df)
+
+
+
- - - + + diff --git a/docs/reference/shardkey.html b/docs/reference/shardkey.html index 4e99bbca..eeca73d7 100644 --- a/docs/reference/shardkey.html +++ b/docs/reference/shardkey.html @@ -1,67 +1,12 @@ - - - - - - - -Returns the shardkey (not implemented yet) — shardkey • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Returns the shardkey (not implemented yet) — shardkey • disk.frame - - + + - - -
-
- -
- -
+
@@ -162,43 +95,37 @@

Returns the shardkey (not implemented yet)

Returns the shardkey (not implemented yet)

-
shardkey(df)
- -

Arguments

- - - - - - -
df

a disk.frame

+
+
shardkey(df)
+
+
+

Arguments

+
df
+

a disk.frame

+
+
- - - + + diff --git a/docs/reference/shardkey_equal.html b/docs/reference/shardkey_equal.html index 5c4e1ad8..e51669c5 100644 --- a/docs/reference/shardkey_equal.html +++ b/docs/reference/shardkey_equal.html @@ -1,67 +1,12 @@ - - - - - - - -Compare two disk.frame shardkeys — shardkey_equal • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Compare two disk.frame shardkeys — shardkey_equal • disk.frame - - + + - - -
-
- -
- -
+
@@ -162,47 +95,39 @@

Compare two disk.frame shardkeys

Compare two disk.frame shardkeys

-
shardkey_equal(sk1, sk2)
- -

Arguments

- - - - - - - - - - -
sk1

shardkey1

sk2

shardkey2

+
+
shardkey_equal(sk1, sk2)
+
+
+

Arguments

+
sk1
+

shardkey1

+
sk2
+

shardkey2

+
+
- - - + + diff --git a/docs/reference/show_ceremony.html b/docs/reference/show_ceremony.html index 7481dc90..a4e1badc 100644 --- a/docs/reference/show_ceremony.html +++ b/docs/reference/show_ceremony.html @@ -1,67 +1,12 @@ - - - - - - - -Show the code to setup disk.frame — show_ceremony • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Show the code to setup disk.frame — show_ceremony • disk.frame - - + + - - -
-
- -
- -
+
@@ -162,41 +95,38 @@

Show the code to setup disk.frame

Show the code to setup disk.frame

-
show_ceremony()
+    
+
show_ceremony()
 
-ceremony_text()
+ceremony_text()
 
-show_boilerplate()
-
-insert_ceremony()
+show_boilerplate() +insert_ceremony()
+
+ - - - + + diff --git a/docs/reference/srckeep.html b/docs/reference/srckeep.html index fe997fbd..934352ba 100644 --- a/docs/reference/srckeep.html +++ b/docs/reference/srckeep.html @@ -1,67 +1,12 @@ - - - - - - - -Keep only the variables from the input listed in selections — srckeep • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Keep only the variables from the input listed in selections — srckeep • disk.frame - + + - - - -
-
- -
- -
+
@@ -162,115 +95,108 @@

Keep only the variables from the input listed in selections

Keep only the variables from the input listed in selections

-
srckeep(diskf, selections, ...)
-
-srckeepchunks(diskf, chunks, ...)
- -

Arguments

- - - - - - - - - - - - - - - - - - -
diskf

a disk.frame

selections

The list of variables to keep from the input source

...

not yet used

chunks

The chunks to load

- +
+
srckeep(diskf, selections, ...)
 
-    

Examples

-
cars.df = as.disk.frame(cars) +srckeepchunks(diskf, chunks, ...)
+
-# when loading cars's chunks into RAM, load only the column speed -collect(srckeep(cars.df, "speed"))
#> speed -#> 1: 4 -#> 2: 4 -#> 3: 7 -#> 4: 7 -#> 5: 8 -#> 6: 9 -#> 7: 10 -#> 8: 10 -#> 9: 10 -#> 10: 11 -#> 11: 11 -#> 12: 12 -#> 13: 12 -#> 14: 12 -#> 15: 12 -#> 16: 13 -#> 17: 13 -#> 18: 13 -#> 19: 13 -#> 20: 14 -#> 21: 14 -#> 22: 14 -#> 23: 14 -#> 24: 15 -#> 25: 15 -#> 26: 15 -#> 27: 16 -#> 28: 16 -#> 29: 17 -#> 30: 17 -#> 31: 17 -#> 32: 18 -#> 33: 18 -#> 34: 18 -#> 35: 18 -#> 36: 19 -#> 37: 19 -#> 38: 19 -#> 39: 20 -#> 40: 20 -#> 41: 20 -#> 42: 20 -#> 43: 20 -#> 44: 22 -#> 45: 23 -#> 46: 24 -#> 47: 24 -#> 48: 24 -#> 49: 24 -#> 50: 25 -#> speed
-# clean up cars.df -delete(cars.df)
+
+

Arguments

+
diskf
+

a disk.frame

+
selections
+

The list of variables to keep from the input source

+
...
+

not yet used

+
chunks
+

The chunks to load

+
+ +
+

Examples

+
cars.df = as.disk.frame(cars)
+
+# when loading cars's chunks into RAM, load only the column speed
+collect(srckeep(cars.df, "speed"))
+#>     speed
+#>  1:     4
+#>  2:     4
+#>  3:     7
+#>  4:     7
+#>  5:     8
+#>  6:     9
+#>  7:    10
+#>  8:    10
+#>  9:    10
+#> 10:    11
+#> 11:    11
+#> 12:    12
+#> 13:    12
+#> 14:    12
+#> 15:    12
+#> 16:    13
+#> 17:    13
+#> 18:    13
+#> 19:    13
+#> 20:    14
+#> 21:    14
+#> 22:    14
+#> 23:    14
+#> 24:    15
+#> 25:    15
+#> 26:    15
+#> 27:    16
+#> 28:    16
+#> 29:    17
+#> 30:    17
+#> 31:    17
+#> 32:    18
+#> 33:    18
+#> 34:    18
+#> 35:    18
+#> 36:    19
+#> 37:    19
+#> 38:    19
+#> 39:    20
+#> 40:    20
+#> 41:    20
+#> 42:    20
+#> 43:    20
+#> 44:    22
+#> 45:    23
+#> 46:    24
+#> 47:    24
+#> 48:    24
+#> 49:    24
+#> 50:    25
+#>     speed
+
+# clean up cars.df
+delete(cars.df)
+
+
+
- - - + + diff --git a/docs/reference/sub-.disk.frame.html b/docs/reference/sub-.disk.frame.html index 961cbbe9..b3ecb1b5 100644 --- a/docs/reference/sub-.disk.frame.html +++ b/docs/reference/sub-.disk.frame.html @@ -1,67 +1,12 @@ - - - - - - - -[ interface for disk.frame using fst backend — [.disk.frame • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -[ interface for disk.frame using fst backend — [.disk.frame • disk.frame + + - - - - -
-
- -
- -
+
@@ -162,7 +95,8 @@

[ interface for disk.frame using fst backend

[ interface for disk.frame using fst backend

-
# S3 method for disk.frame
+    
+
# S3 method for disk.frame
 [(
   df,
   ...,
@@ -171,73 +105,59 @@ 

[ interface for disk.frame using fst backend

use.names = TRUE, fill = FALSE, idcol = NULL -)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
df

a disk.frame

...

same as data.table

keep

the columns to srckeep

rbind

Whether to rbind the chunks. Defaults to TRUE

use.names

Same as in data.table::rbindlist

fill

Same as in data.table::rbindlist

idcol

Same as in data.table::rbindlist

- - -

Examples

-
cars.df = as.disk.frame(cars) -speed_limit = 50 -cars.df[speed < speed_limit ,.N, cut(dist, pretty(dist))]
#> Error in .checkTypos(e, names_x): Object 'speed_limit' not found amongst speed, dist
-# clean up -delete(cars.df)
+)
+
+ +
+

Arguments

+
df
+

a disk.frame

+
...
+

same as data.table

+
keep
+

the columns to srckeep

+
rbind
+

Whether to rbind the chunks. Defaults to TRUE

+
use.names
+

Same as in data.table::rbindlist

+
fill
+

Same as in data.table::rbindlist

+
idcol
+

Same as in data.table::rbindlist

+
+ +
+

Examples

+
cars.df = as.disk.frame(cars)
+speed_limit = 50
+cars.df[speed < speed_limit ,.N, cut(dist, pretty(dist))]
+#> Error in .checkTypos(e, names_x): Object 'speed_limit' not found amongst speed, dist
+
+# clean up
+delete(cars.df)
+
+
+ - - - + + diff --git a/docs/reference/tbl_vars.disk.frame.html b/docs/reference/tbl_vars.disk.frame.html index f7d0febf..d61fa7a6 100644 --- a/docs/reference/tbl_vars.disk.frame.html +++ b/docs/reference/tbl_vars.disk.frame.html @@ -1,68 +1,13 @@ - - - - - - - -Column names for RStudio auto-complete — tbl_vars.disk.frame • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Column names for RStudio auto-complete — tbl_vars.disk.frame • disk.frame - - + + - - -
-
- -
- -
+
@@ -164,47 +97,41 @@

Column names for RStudio auto-complete

names

-
# S3 method for disk.frame
-tbl_vars(x)
+    
+
# S3 method for disk.frame
+tbl_vars(x)
 
-# S3 method for disk.frame
-group_vars(x)
- -

Arguments

- - - - - - -
x

a disk.frame

+# S3 method for disk.frame +group_vars(x)
+
+
+

Arguments

+
x
+

a disk.frame

+
+ - - - + + diff --git a/docs/reference/write_disk.frame.html b/docs/reference/write_disk.frame.html index b8e57f50..5f3f9fdb 100644 --- a/docs/reference/write_disk.frame.html +++ b/docs/reference/write_disk.frame.html @@ -1,68 +1,13 @@ - - - - - - - -Write disk.frame to disk — write_disk.frame • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Write disk.frame to disk — write_disk.frame • disk.frame - + + - - - -
-
- -
- -
+
@@ -164,108 +97,89 @@

Write disk.frame to disk

then using the as.disk.frame function is recommended for most cases

-
write_disk.frame(
-  df,
-  outdir = tempfile(fileext = ".df"),
-  nchunks = ifelse("disk.frame" %in% class(df), nchunks.disk.frame(df),
-    recommend_nchunks(df)),
-  overwrite = FALSE,
-  shardby = NULL,
-  compress = 50,
-  shardby_function = "hash",
-  sort_splits = NULL,
-  desc_vars = NULL,
-  ...
-)
-
-output_disk.frame(...)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
df

a disk.frame

outdir

output directory for the disk.frame

nchunks

number of chunks

overwrite

overwrite output directory

shardby

the columns to shard by

compress

compression ratio for fst files

shardby_function

splitting of chunks: "hash" for hash function or "sort" for semi-sorted chunks

sort_splits

for the "sort" shardby function, a dataframe with the split values.

desc_vars

for the "sort" shardby function, the variables to sort descending.

...

passed to cmap.disk.frame

- - -

Examples

-
cars.df = as.disk.frame(cars) +
+
write_disk.frame(
+  df,
+  outdir = tempfile(fileext = ".df"),
+  nchunks = ifelse("disk.frame" %in% class(df), nchunks.disk.frame(df),
+    recommend_nchunks(df)),
+  overwrite = FALSE,
+  shardby = NULL,
+  compress = 50,
+  shardby_function = "hash",
+  sort_splits = NULL,
+  desc_vars = NULL,
+  ...
+)
+
+output_disk.frame(...)
+
-# write out a lazy disk.frame to disk -cars2.df = write_disk.frame(cmap(cars.df, ~.x[1,]), overwrite = TRUE) -collect(cars2.df)
#> speed dist -#> 1: 4 2 -#> 2: 11 17 -#> 3: 13 46 -#> 4: 16 40 -#> 5: 19 46 -#> 6: 24 70
-# clean up cars.df -delete(cars.df) -delete(cars2.df)
+
+

Arguments

+
df
+

a disk.frame

+
outdir
+

output directory for the disk.frame

+
nchunks
+

number of chunks

+
overwrite
+

overwrite output directory

+
shardby
+

the columns to shard by

+
compress
+

compression ratio for fst files

+
shardby_function
+

splitting of chunks: "hash" for hash function or "sort" for semi-sorted chunks

+
sort_splits
+

for the "sort" shardby function, a dataframe with the split values.

+
desc_vars
+

for the "sort" shardby function, the variables to sort descending.

+
...
+

passed to cmap.disk.frame

+
+ +
+

Examples

+
cars.df = as.disk.frame(cars)
+
+# write out a lazy disk.frame to disk
+cars2.df = write_disk.frame(cmap(cars.df, ~.x[1,]), overwrite = TRUE)
+collect(cars2.df)
+#>    speed dist
+#> 1:     4    2
+#> 2:    11   17
+#> 3:    13   46
+#> 4:    16   40
+#> 5:    19   46
+#> 6:    24   70
+
+# clean up cars.df
+delete(cars.df)
+delete(cars2.df)
+
+
+
- - - + + diff --git a/docs/reference/zip_to_disk.frame.html b/docs/reference/zip_to_disk.frame.html index 862dab16..9188f8ac 100644 --- a/docs/reference/zip_to_disk.frame.html +++ b/docs/reference/zip_to_disk.frame.html @@ -1,70 +1,15 @@ - - - - - - - -`zip_to_disk.frame` is used to read and convert every CSV file within the zip -file to disk.frame format — zip_to_disk.frame • disk.frame - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -`zip_to_disk.frame` is used to read and convert every CSV file within the zip +file to disk.frame format — zip_to_disk.frame • disk.frame - - - - + + -
-
- -
- -
+
@@ -167,93 +100,84 @@

`zip_to_disk.frame` is used to read and convert every CSV file within the zi file to disk.frame format

-
zip_to_disk.frame(
-  zipfile,
-  outdir,
-  ...,
-  validation.check = FALSE,
-  overwrite = TRUE
-)
- -

Arguments

- - - - - - - - - - - - - - - - - - - - - - -
zipfile

The zipfile

outdir

The output directory for disk.frame

...

passed to fread

validation.check

should the function perform a check at the end to check for validity of output. It can detect issues with conversion

overwrite

overwrite output directory

- -

Value

+
+
zip_to_disk.frame(
+  zipfile,
+  outdir,
+  ...,
+  validation.check = FALSE,
+  overwrite = TRUE
+)
+
+
+

Arguments

+
zipfile
+

The zipfile

+
outdir
+

The output directory for disk.frame

+
...
+

passed to fread

+
validation.check
+

should the function perform a check at the end to check for validity of output. It can detect issues with conversion

+
overwrite
+

overwrite output directory

+
+
+

Value

a list of disk.frame

-

See also

- -

Other ingesting data: -csv_to_disk.frame()

- -

Examples

-
# create a zip file containing a csv -csvfile = tempfile(fileext = ".csv") -write.csv(cars, csvfile) -zipfile = tempfile(fileext = ".zip") -zip(zipfile, csvfile) - -# read every file and convert it to a disk.frame -zip.df = zip_to_disk.frame(zipfile, tempfile(fileext = ".df")) +
+
+

See also

+

Other ingesting data: +csv_to_disk.frame()

+
-# there is only one csv file so it return a list of one disk.frame -zip.df[[1]]
#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpInritK\file187c1d9666ec.df/Users/RTX2080/AppData/Local/Temp/RtmpInritK/file187c725c4b4.csv" -#> nchunks: 6 -#> nrow (at source): 50 -#> ncol (at source): 3 -#> nrow (post operations): ??? -#> ncol (post operations): ???
-# clean up -unlink(csvfile) -unlink(zipfile) -delete(zip.df[[1]])
+
+

Examples

+
# create a zip file containing a csv
+csvfile = tempfile(fileext = ".csv")
+write.csv(cars, csvfile)
+zipfile = tempfile(fileext = ".zip")
+zip(zipfile, csvfile)
+#> Warning: '"zip"' not found
+
+# read every file and convert it to a disk.frame
+zip.df = zip_to_disk.frame(zipfile, tempfile(fileext = ".df"))
+#> Error in unzip(zipfile, list = TRUE): zip file 'C:\Users\RTX2080\AppData\Local\Temp\Rtmp2rQjw5\file56f44b886b42.zip' cannot be opened
+
+# there is only one csv file so it return a list of one disk.frame
+zip.df[[1]]
+#> Error in eval(expr, envir, enclos): object 'zip.df' not found
+
+# clean up
+unlink(csvfile)
+unlink(zipfile)
+delete(zip.df[[1]])
+#> Error in "disk.frame" %in% class(df): object 'zip.df' not found
+
+
+
- - - + + diff --git a/docs/sitemap.xml b/docs/sitemap.xml new file mode 100644 index 00000000..4aa93c2a --- /dev/null +++ b/docs/sitemap.xml @@ -0,0 +1,294 @@ + + + + /02-intro-disk-frame.html + + + /404.html + + + /articles/01-intro-disk-frame.html + + + /articles/01-intro.html + + + /articles/02-common-questions.html + + + /articles/02-intro-disk-frame.html + + + /articles/03-concepts.html + + + /articles/03_concepts.html + + + /articles/04-ingesting-data.html + + + /articles/04_ingesting-data.html + + + /articles/05-data-table-syntax.html + + + /articles/06-vs-dask-juliadb.html + + + /articles/07-glm.html + + + /articles/08-more-epic.html + + + /articles/09-convenience-features.html + + + /articles/10-group-by.html + + + /articles/11-custom-group-by.html + + + /articles/88-trouble-shooting.html + + + /articles/common-questions.html + + + /articles/concepts.html + + + /articles/convenience-features.html + + + /articles/custom-group-by.html + + + /articles/data-table-syntax.html + + + /articles/glm.html + + + /articles/group-by.html + + + /articles/index.html + + + /articles/ingesting-data.html + + + /articles/intro-disk-frame.html + + + /articles/intro.html + + + /articles/more-epic.html + + + /articles/vs-dask-juliadb-2.html + + + /articles/vs-dask-juliadb.html + + + /authors.html + + + /index.html + + + /LICENSE-text.html + + + /news/index.html + + + /reference/add_chunk.html + + + /reference/as.data.frame.disk.frame.html + + + /reference/as.data.table.disk.frame.html + + + /reference/as.disk.frame.html + + + /reference/bloomfilter.html + + + /reference/chunk_group_by.html + + + /reference/cmap.html + + + /reference/cmap2.html + + + /reference/collect.html + + + /reference/colnames.html + + + /reference/compute.disk.frame.html + + + /reference/create_chunk_mapper.html + + + /reference/create_dplyr_mapper.html + + + /reference/csv_to_disk.frame.html + + + /reference/delete.html + + + /reference/dfglm.html + + + /reference/df_ram_size.html + + + 
/reference/disk.frame.html + + + /reference/dplyr_verbs.html + + + /reference/evalparseglue.html + + + /reference/foverlaps.disk.frame.html + + + /reference/gen_datatable_synthetic.html + + + /reference/get_chunk.html + + + /reference/get_chunk_ids.html + + + /reference/groups.disk.frame.html + + + /reference/group_by.html + + + /reference/hard_arrange.html + + + /reference/hard_group_by.html + + + /reference/head_tail.html + + + /reference/index.html + + + /reference/is_disk.frame.html + + + /reference/join.html + + + /reference/make_glm_streaming_fn.html + + + /reference/map.html + + + /reference/map2.html + + + /reference/mean.chunk_agg.disk.frame.html + + + /reference/mean.collected_agg.disk.frame.html + + + /reference/merge.disk.frame.html + + + /reference/move_to.html + + + /reference/nchunks.html + + + /reference/ncol_nrow.html + + + /reference/one-stage-group-by-verbs.html + + + /reference/overwrite_check.html + + + /reference/print.disk.frame.html + + + /reference/pull.disk.frame.html + + + /reference/rbindlist.disk.frame.html + + + /reference/rechunk.html + + + /reference/recommend_nchunks.html + + + /reference/remove_chunk.html + + + /reference/sample.html + + + /reference/setup_disk.frame.html + + + /reference/shard.html + + + /reference/shardkey.html + + + /reference/shardkey_equal.html + + + /reference/show_ceremony.html + + + /reference/srckeep.html + + + /reference/sub-.disk.frame.html + + + /reference/summarise.grouped_disk.frame.html + + + /reference/tbl_vars.disk.frame.html + + + /reference/write_disk.frame.html + + + /reference/zip_to_disk.frame.html + + diff --git a/man/dplyr_verbs.Rd b/man/dplyr_verbs.Rd index d3930c57..340e6958 100644 --- a/man/dplyr_verbs.Rd +++ b/man/dplyr_verbs.Rd @@ -8,7 +8,6 @@ \alias{transmute.disk.frame} \alias{arrange.disk.frame} \alias{chunk_arrange} -\alias{add_count.disk.frame} \alias{add_tally.disk.frame} \alias{do.disk.frame} \alias{distinct.disk.frame} @@ -30,8 +29,6 @@ chunk_arrange(.data, ...) 
-add_count.disk.frame(.data, ...) - add_tally.disk.frame(.data, ...) \method{do}{disk.frame}(.data, ...) diff --git a/presentation/rstudio conf 2021/1min video script b/presentation/rstudio conf 2021/1min video script new file mode 100644 index 00000000..53337167 --- /dev/null +++ b/presentation/rstudio conf 2021/1min video script @@ -0,0 +1,11 @@ +Hi my name is ZedJ and I am a Data Scientist local to Melbourne. I am a keen contributor to open source data science projects, one of which I want to talk about at rstudio:conf 2021. That project is {disk.frame} - a larger-than-RAM data manipulation package. + +R needs to load the data in its entirety into RAM. However, RAM is a precious resource and often do run out. + +{disk.frame} solves this issue by providing a 100%-R framework to manipulate data on disk. A modern laptop with {disk.frame} can comfortably handle 100GB's of data. + +Also, {disk.frame} uses {dplyr} verbs to manipulate data so useRs will find it very easy to pick up. + +Finally, because {disk.frame} is 100%-R, you can use any R package with it at no extra cost unlike Spark. + +The talk I propose will introduce {disk.frame} to users with the needs to manipulate large amounts of data with minimal setup. They will find {disk.frame} very familiar, as {disk.frame} uses {dplyr} verbs directly;. Some users rely on DBMS (e.g. PostgresSQL), Spark, or SAS to manage their large datasets. They will find lots of benefits in switching to {disk.frame}, which will allow them to keep their workflow in R for as long as possible. Because {disk.frame} can run R functions natively, they will find that {disk.frame} allows them to many R packages directly with {disk.frame}. 
diff --git a/presentation/rstudio conf 2021/Abstract proposal.md b/presentation/rstudio conf 2021/Abstract proposal.md new file mode 100644 index 00000000..e7406d3f --- /dev/null +++ b/presentation/rstudio conf 2021/Abstract proposal.md @@ -0,0 +1,5 @@ +Learn how to handle 100GBs of data with ease using {disk.frame} - the larger-than-RAM-data manipulation package. + +R loads data in its entirety into RAM. However, RAM is a precious resource and often do run out. That's why most R user would have run into the "cannot allocate vector of size xxB." error at some point. + +However, the need to handle larger-than-RAM data doesn't go away just because RAM isn't large enough. So many useRs turn to big data tools like Spark for the task. In this talk, I will make the case that {disk.frame} is sufficient and often preferable for manipulating larger-than-RAM data that fit on disk. I will show how you can apply familiar {dplyr}-verbs to manipulate larger-than-RAM data with {disk.frame}. \ No newline at end of file diff --git a/presentation/twin cities/.gitignore b/presentation/twin cities/.gitignore new file mode 100644 index 00000000..26416673 --- /dev/null +++ b/presentation/twin cities/.gitignore @@ -0,0 +1 @@ +*.mp4 diff --git a/tests/testthat.R b/tests/testthat.R deleted file mode 100644 index 6681ff05..00000000 --- a/tests/testthat.R +++ /dev/null @@ -1,4 +0,0 @@ -library(testthat) -library(disk.frame) - -test_check("disk.frame") diff --git a/tests/testthat/test-Rcpp.R b/tests/testthat/test-Rcpp.R deleted file mode 100644 index d4144ca9..00000000 --- a/tests/testthat/test-Rcpp.R +++ /dev/null @@ -1,6 +0,0 @@ -context("test-RcppExprts") - - -test_that("testing Rccpexports nothing here", { - expect_equal(2L, 2L) -}) \ No newline at end of file diff --git a/tests/testthat/test-add-chunk.r b/tests/testthat/test-add-chunk.r deleted file mode 100644 index 07855ac3..00000000 --- a/tests/testthat/test-add-chunk.r +++ /dev/null @@ -1,41 +0,0 @@ -context("test-add-chunk") - 
-setup({ - setup_disk.frame(workers = 2) -}) - -test_that("testing add chunk without naming chunk_id", { - a = data.frame(a = 1:100, b = 1:100) - - a1 = as.disk.frame(a, overwrite = TRUE) - - b = data.frame(a = 51:150, b = 1:100) - d = data.frame(a = 1:50, b = 1:50) - - add_chunk(a1, b) - expect_equal(nrow(a1), 200) - - add_chunk(a1, d) - expect_equal(nrow(a1), 250) - - delete(a1) -}) - -test_that("testing add chunk by naming chunk_id", { - a = data.frame(a = 1:100, b = 1:100) - - a1 = as.disk.frame(a, overwrite = TRUE) - b = data.frame(a = 51:150, b = 1:100) - d = data.frame(a = 1:50, b = 1:50) - - add_chunk(a1, b, chunk_id = nchunks(a1)+2) - expect_equal(nrow(a1), 200) - - add_chunk(a1, d, chunk_id = nchunks(a1)+2) - expect_equal(nrow(a1), 250) - - delete(a1) -}) - -teardown({ -}) \ No newline at end of file diff --git a/tests/testthat/test-anti_join.R b/tests/testthat/test-anti_join.R deleted file mode 100644 index 348ebaf2..00000000 --- a/tests/testthat/test-anti_join.R +++ /dev/null @@ -1,75 +0,0 @@ -context("test-anti_join") - -setup({ - setup_disk.frame(workers = 2) - - a = data.frame(a = 1:100, b = 1:100) - b = data.frame(a = 51:150, b = 1:100) - d = data.frame(a = 1:50, b = 1:50) - - as.disk.frame(a, file.path(tempdir(), "tmp_a_aj.df"), nchunks = 4, overwrite = TRUE) - as.disk.frame(b, file.path(tempdir(), "tmp_b_aj.df"), nchunks = 5, overwrite = TRUE) - as.disk.frame(d, file.path(tempdir(), "tmp_d_aj.df"), overwrite = TRUE) - - as.disk.frame(a, file.path(tempdir(), "tmp_a_aj2.df"), nchunks = 4, overwrite = TRUE) - as.disk.frame(b, file.path(tempdir(), "tmp_b_aj2.df"), nchunks = 5, overwrite = TRUE) - as.disk.frame(d, file.path(tempdir(), "tmp_d_aj2.df"), overwrite = TRUE) -}) - -test_that("testing anti_join where right is data.frame", { - #skip_on_cran() - a = disk.frame(file.path(tempdir(), "tmp_a_aj.df")) - b = disk.frame(file.path(tempdir(), "tmp_b_aj.df")) - d = disk.frame(file.path(tempdir(), "tmp_d_aj.df")) - bc = collect(b) - dc = collect(d) - - 
abc = anti_join(a, bc, by = "a") %>% collect - expect_equal(nrow(abc), 50) - - abc0 = anti_join(a, bc, by = c("a","b")) %>% collect - expect_equal(nrow(abc0), 100) - - abc100 = anti_join(a, bc, by = "b") %>% collect - expect_equal(nrow(abc100), 0) - - abd50 = anti_join(a, dc, by = "b") %>% collect - expect_equal(nrow(abd50), 50) -}) - -test_that("testing anti_join where right is disk.frame", { - #skip_on_cran() - a = disk.frame(file.path(tempdir(),"tmp_a_aj2.df")) - b = disk.frame(file.path(tempdir(),"tmp_b_aj2.df")) - d = disk.frame(file.path(tempdir(),"tmp_d_aj2.df")) - - expect_warning({ - ab <- anti_join(a, b, by = "a", merge_by_chunk_id = FALSE) %>% collect - }) - expect_equal(nrow(ab), 50) - - expect_warning({ - ab0 = anti_join(a, b, by = c("a","b"), merge_by_chunk_id = FALSE) %>% collect - }) - expect_equal(nrow(ab0), 100) - - expect_warning({ - ab100 = anti_join(a, b, by = "b", merge_by_chunk_id = FALSE) %>% collect - }) - expect_equal(nrow(ab100), 0) - - expect_warning({ - ad50 = anti_join(a, d, by = "b", merge_by_chunk_id = FALSE) %>% collect - }) - expect_equal(nrow(ad50), 50) -}) - -teardown({ - fs::dir_delete(file.path(tempdir(),"tmp_a_aj.df")) - fs::dir_delete(file.path(tempdir(),"tmp_b_aj.df")) - fs::dir_delete(file.path(tempdir(),"tmp_d_aj.df")) - - fs::dir_delete(file.path(tempdir(),"tmp_a_aj2.df")) - fs::dir_delete(file.path(tempdir(),"tmp_b_aj2.df")) - fs::dir_delete(file.path(tempdir(),"tmp_d_aj2.df")) -}) \ No newline at end of file diff --git a/tests/testthat/test-as-data-frame.R b/tests/testthat/test-as-data-frame.R deleted file mode 100644 index 0cb853fb..00000000 --- a/tests/testthat/test-as-data-frame.R +++ /dev/null @@ -1,14 +0,0 @@ -context("test-as-data-frame") - -test_that("as.data.frame works", { - tmpdir = tempfile("disk.frame.tmp") - df = as.disk.frame(disk.frame:::gen_datatable_synthetic(1e5+11, 100), tmpdir, overwrite = T) - dff = as.data.frame(df) - dft = data.table::as.data.table(df) - expect_s3_class(dff, "data.frame") - 
expect_s3_class(dft, "data.table") - expect_equal(nrow(dff), 1e5+11) - - delete(df) -}) - diff --git a/tests/testthat/test-as-disk-frame.R b/tests/testthat/test-as-disk-frame.R deleted file mode 100644 index b1ac42dc..00000000 --- a/tests/testthat/test-as-disk-frame.R +++ /dev/null @@ -1,15 +0,0 @@ -context("test-as-disk-frame") - -test_that("as.disk.frame works", { - ROWS = 1e5+11 - - df = disk.frame:::gen_datatable_synthetic(ROWS) - tf = file.path(tempdir(), "tmp_as_disk_frame_delete") - - dfdf <- as.disk.frame(df, outdir = tf, overwrite=TRUE) - - expect_equal(nrow(dfdf), ROWS) - expect_error(dfdf <- as.disk.frame(df, tf, overwrite=FALSE)) - - delete(dfdf) -}) diff --git a/tests/testthat/test-bloom-filter.r b/tests/testthat/test-bloom-filter.r deleted file mode 100644 index 9fa4ad42..00000000 --- a/tests/testthat/test-bloom-filter.r +++ /dev/null @@ -1,16 +0,0 @@ -# context("test-bloomfilter") -# -# test_that("bloomfilter should fail here", { -# expect_error(make_bloomfilter(df, c("origin", "dest"))) -# }) -# -# test_that("bloomfilter should succeed", { -# df = nycflights13::flights %>% as.disk.frame(shardby = c("carrier")) -# make_bloomfilter(df, "carrier") -# expect_true(length(bf_likely_in_chunks(df, "carrier", "UA")) == 1) -# -# expect_equal(nrow(collect(use_bloom_filter(df, "carrier", "UA"))), nrow(filter(nycflights13::flights, carrier == "UA"))) -# -# # clean up -# delete(df) -# }) diff --git a/tests/testthat/test-collect.R b/tests/testthat/test-collect.R deleted file mode 100644 index b5a174c8..00000000 --- a/tests/testthat/test-collect.R +++ /dev/null @@ -1,57 +0,0 @@ -context("test-collect") - -setup({ - df = as.disk.frame(disk.frame:::gen_datatable_synthetic(1e5+11), file.path(tempdir(),"tmp_col_delete"), overwrite=T) -}) - -test_that("collect works on simple data", { - df = disk.frame(file.path(tempdir(),"tmp_col_delete")) - dff = dplyr::collect(df) - expect_equal(nrow(dff), 1e5+11) - expect_s3_class(dff, "data.frame") - expect_s3_class(dff, 
"data.table") -}) - -test_that("collect works on lazy stream", { - df = disk.frame(file.path(tempdir(),"tmp_col_delete")) - df = cmap(df, lazy = T, ~{ - .x[1:10, ] - }) - dff = dplyr::collect(df) - expect_equal(nrow(dff), nchunks(df)*10) - expect_s3_class(dff, "data.frame") - expect_s3_class(dff, "data.table") -}) - -test_that("collect works on lazy stream followed by dplyr", { - df = disk.frame(file.path(tempdir(),"tmp_col_delete")) - df = cmap(df, lazy = T, ~{ - .x[1:10, ] - }) %>% select(id1, id4) - - dff = dplyr::collect(df) - expect_equal(nrow(dff), nchunks(df)*10) - expect_equal(ncol(dff), 2) - expect_s3_class(dff, "data.frame") - expect_s3_class(dff, "data.table") -}) - - -test_that("collect works on dplyr::select followed by lazy", { - df = disk.frame(file.path(tempdir(),"tmp_col_delete")) - df = df %>% select(id1, id4) %>% - cmap.disk.frame(lazy = T, ~{ - .x[1:10, ] - }) - - dff = dplyr::collect(df) - expect_equal(nrow(dff), nchunks(df)*10) - expect_equal(ncol(dff), 2) - expect_s3_class(dff, "data.frame") - expect_s3_class(dff, "data.table") -}) - - -teardown({ - fs::dir_delete(file.path(tempdir(),"tmp_col_delete")) -}) \ No newline at end of file diff --git a/tests/testthat/test-compute.r b/tests/testthat/test-compute.r deleted file mode 100644 index 3a1bd56e..00000000 --- a/tests/testthat/test-compute.r +++ /dev/null @@ -1,55 +0,0 @@ -context("test-compute") - -setup({ - setup_disk.frame(workers = 2) - df = as.disk.frame(disk.frame:::gen_datatable_synthetic(1e5+11), file.path(tempdir(),"tmp_compute_delete"), overwrite=T) -}) - -test_that("compute works on simple data", { - df = disk.frame(file.path(tempdir(),"tmp_compute_delete")) - dff = compute(df) - - expect_equal(nrow(dff), 1e5+11) - expect_s3_class(dff, "disk.frame") -}) - -test_that("compute works on lazy stream", { - df = disk.frame(file.path(tempdir(),"tmp_compute_delete")) - df = cmap(df, lazy = T, ~{ - .x[1:10, ] - }) - dff = compute(df) - expect_equal(nrow(dff), nchunks(df)*10) - 
expect_s3_class(dff, "disk.frame") -}) - -test_that("compute works on lazy stream followed by dplyr", { - df = disk.frame(file.path(tempdir(),"tmp_compute_delete")) - df = cmap(df, lazy = T, ~{ - .x[1:10, ] - }) %>% select(id1, id4) - - dff = compute(df) - expect_equal(nrow(dff), nchunks(df)*10) - expect_equal(ncol(dff), 2) - expect_s3_class(dff, "disk.frame") -}) - - -test_that("compute works on dplyr::select followed by lazy", { - df = disk.frame(file.path(tempdir(),"tmp_compute_delete")) - df = df %>% select(id1, id4) %>% - cmap(lazy = T, ~{ - .x[1:10, ] - }) - - dff = dplyr::collect(df) - expect_equal(nrow(dff), nchunks(df)*10) - expect_equal(ncol(dff), 2) - expect_s3_class(dff, "data.frame") -}) - - -teardown({ - fs::dir_delete(file.path(tempdir(),"tmp_compute_delete")) -}) \ No newline at end of file diff --git a/tests/testthat/test-csv2disk.frame.r b/tests/testthat/test-csv2disk.frame.r deleted file mode 100644 index dcc73652..00000000 --- a/tests/testthat/test-csv2disk.frame.r +++ /dev/null @@ -1,84 +0,0 @@ -context("test-csv2disk.frame") - -setup({ - df = disk.frame:::gen_datatable_synthetic(1e3+11) - data.table::fwrite(df, file.path(tempdir(), "tmp_pls_delete_csv2df.csv")) - data.table::fwrite(df, file.path(tempdir(), "tmp_pls_delete_csv2df2.csv")) - data.table::fwrite(df, file.path(tempdir(), "tmp_pls_delete_csv2df3.csv")) -}) - -test_that("csv2disk.frame works with no shard", { - dff = csv_to_disk.frame( - file.path(tempdir(), "tmp_pls_delete_csv2df.csv"), - file.path(tempdir(), "tmp_pls_delete_csv2df.df"), - overwrite=TRUE, - nchunks=max(2, recommend_nchunks(file.size(file.path(tempdir(), "tmp_pls_delete_csv2df.csv"))))) - dff1 = dff[,sum(v1), id1] - dff2 = dff1[,sum(V1), id1] - expect_false(nrow(dff1) == nrow(dff2)) - expect_equal(nrow(dff), 1e3+11) - expect_equal(ncol(dff), 10) -}) - -test_that("csv2disk.frame works with shard", { - dff = csv_to_disk.frame( - file.path(tempdir(), "tmp_pls_delete_csv2df2.csv"), - file.path(tempdir(), 
"tmp_pls_delete_csv2df2.df"), - shardby = "id1", overwrite = TRUE) - dff1 = dff[,sum(v1), id1] - dff2 = dff1[,sum(V1), id1] - expect_true(nrow(dff1) == nrow(dff2)) - expect_equal(nrow(dff), 1e3+11) - expect_equal(ncol(dff), 10) - - dff = csv_to_disk.frame( - file.path(tempdir(), "tmp_pls_delete_csv2df3.csv"), - file.path(tempdir(), "tmp_pls_delete_csv2df3.df"), - shardby = c("id1","id2")) - - dff1 = dff[,sum(v1), .(id1,id2)] - dff2 = dff1[,sum(V1), .(id1,id2)] - expect_true(nrow(dff1) == nrow(dff2)) - expect_equal(nrow(dff), 1e3+11) - expect_equal(ncol(dff), 10) -}) - -test_that("csv2disk.frame tests readr", { - library(dplyr) - library(disk.frame) - library(data.table) - library(nycflights13) - - expect_equal(1,1) - - # TODO make this test better - # convert from a data frame - # flights <- flights %>% - # dplyr::mutate(date = as.Date(paste(year, month, day, sep = "-"))) - # str(flights) # time_hour is POSIXct - # - # flights.df <- as.disk.frame( - # flights, - # outdir = file.path(tempdir(), "tmp_flights.df"), - # overwrite = TRUE) - # flights.df - # str(collect(flights.df)) - # - # # with sharding - # df_path = file.path(tempdir(), "tmp_flights.df") - # flights.df <- csv_to_disk.frame( - # csv_path, - # outdir = df_path, - # shardby = "minute", - # overwrite = T, - # backend = "readr") -}) - -teardown({ - fs::dir_delete(file.path(tempdir(), "tmp_pls_delete_csv2df.df")) - fs::dir_delete(file.path(tempdir(), "tmp_pls_delete_csv2df2.df")) - fs::dir_delete(file.path(tempdir(), "tmp_pls_delete_csv2df3.df")) - fs::file_delete(file.path(tempdir(), "tmp_pls_delete_csv2df.csv")) - fs::file_delete(file.path(tempdir(), "tmp_pls_delete_csv2df2.csv")) - fs::file_delete(file.path(tempdir(), "tmp_pls_delete_csv2df3.csv")) -}) \ No newline at end of file diff --git a/tests/testthat/test-data-table.r b/tests/testthat/test-data-table.r deleted file mode 100644 index 0275ee4a..00000000 --- a/tests/testthat/test-data-table.r +++ /dev/null @@ -1,41 +0,0 @@ -context("test-data.table 
[") - -setup({ - library(data.table) - setup_disk.frame(workers = 2) - df = as.disk.frame(disk.frame:::gen_datatable_synthetic(1e5+11), file.path(tempdir(), "tmp_col_delete"), overwrite=TRUE, nchunks = 8) -}) - -test_that("data.table .N", { - library(data.table) - df = disk.frame(file.path(tempdir(), "tmp_col_delete")) - res <- sum(unlist(df[,.N])) - expect_equal(res , 1e5+11) -}) - -test_that("data.table .N+y V1", { - df = disk.frame(file.path(tempdir(), "tmp_col_delete")) - if(interactive()) { - y = 2 - - {y = 3; a <- df[,.(n_plus_y = .N + y), v1]} - b <- df[,.N, v1] - - expect_equal(a$n_plus_y, b$N + y) - } else { - # TODO figure out why the above fails - expect_equal(2L, 2L) - } -}) - -test_that("data.table do not return a data.table", { - library(data.table) - df = disk.frame(file.path(tempdir(), "tmp_col_delete")) - res <- df[,.(.N), rbind=FALSE] - expect_equal(typeof(res), "list") - expect_equal(length(res), 8) -}) - -teardown({ - fs::dir_delete(file.path(tempdir(), "tmp_col_delete")) -}) diff --git a/tests/testthat/test-delete.r b/tests/testthat/test-delete.r deleted file mode 100644 index cc13cce8..00000000 --- a/tests/testthat/test-delete.r +++ /dev/null @@ -1,20 +0,0 @@ -context("test-delete") - -setup({ - setup_disk.frame(workers = 2) - df = as.disk.frame(disk.frame:::gen_datatable_synthetic(1e5+11), file.path(tempdir(), "tmp_del_delete"), overwrite = TRUE) -}) - -test_that("data.table .N", { - df = disk.frame(file.path(tempdir(), "tmp_del_delete")) - p = attr(df, "path", exact=TRUE) - expect_true(fs::dir_exists(p)) - - delete(df) - - expect_false(fs::dir_exists(p)) -}) - -teardown({ - #fs::dir_delete("tmp_del_delete") -}) \ No newline at end of file diff --git a/tests/testthat/test-disk-frame.r b/tests/testthat/test-disk-frame.r deleted file mode 100644 index 236e7a18..00000000 --- a/tests/testthat/test-disk-frame.r +++ /dev/null @@ -1,34 +0,0 @@ -context("test-disk.frame") - -# TODO add in tests here - -test_that("test add_meta", { - # it works so how 
to test this? - expect_equal(2L, 2L) - -}) - -test_that("test head", { - # it works so how to test this? - expect_equal(2L, 2L) - -}) - -test_that("test tail", { - # it works so how to test this? - expect_equal(2L, 2L) - -}) - - -test_that("test nrow", { - # it works so how to test this? - - expect_equal(2L, 2L) -}) - -test_that("test ncol", { - # it works so how to test this? - - expect_equal(2L, 2L) -}) \ No newline at end of file diff --git a/tests/testthat/test-dplyr-verbs.r b/tests/testthat/test-dplyr-verbs.r deleted file mode 100644 index f8f18f35..00000000 --- a/tests/testthat/test-dplyr-verbs.r +++ /dev/null @@ -1,175 +0,0 @@ -context("test-dplyr-verbs") - -setup({ - b = data.frame(a = 51:150, b = 1:100) - as.disk.frame(b, file.path(tempdir(), "tmp_b_dv.df"), nchunks = 5, overwrite = T) -}) - -test_that("testing select", { - b = disk.frame(file.path(tempdir(), "tmp_b_dv.df")) - - df = b %>% - select(a) %>% - collect - - expect_equal(ncol(df), 1) -}) - -test_that("testing rename", { - b = disk.frame(file.path(tempdir(), "tmp_b_dv.df")) - - df = b %>% - rename(a_new_name = a) %>% - collect - - expect_setequal(colnames(df), c("a_new_name", "b")) -}) - -test_that("testing filter", { - b = disk.frame(file.path(tempdir(), "tmp_b_dv.df")) - - df = b %>% - filter(a <= 100, b <= 10) %>% - collect - - expect_setequal(nrow(df), 10) -}) - -test_that("testing filter - global vars", { - b = disk.frame(file.path(tempdir(), "tmp_b_dv.df")) - - one_hundred = 100 - - df = b %>% - filter(a <= one_hundred, b <= 10) %>% - collect - - expect_setequal(nrow(df), 10) -}) - -test_that("testing mutate", { - b = disk.frame(file.path(tempdir(), "tmp_b_dv.df")) - - - df = b %>% - mutate(d = a + b) %>% - collect - - expect_setequal(sum(df$d), sum(df$a, df$b)) - - df = b %>% - mutate(e = rank(desc(a))) %>% - collect - - expect_equal(nrow(df), 100) - - # need to test - value <- as.disk.frame(tibble(char = LETTERS, - num = 1:26)) - df2 = value %>% - dplyr::mutate(b = case_when( - char %in% 
c("A", "B", "C") ~ "1", - TRUE ~ char)) %>% - collect - - expect_equal(ncol(df2), 3) - - # testing - fn = function(a, b) { - a+b - } - - df3 = value %>% - dplyr::mutate(b = fn(num, num)) %>% - collect - - expect_equal(ncol(df3), 3) - - - global_var = 100 - - df4 = value %>% - dplyr::mutate(b = fn(num, num), d = global_var*2) %>% - collect - - expect_equal(ncol(df4), 4) - expect_true(all(df4$d == 200)) -}) - -test_that("testing mutate user-defined function", { - b = disk.frame(file.path(tempdir(), "tmp_b_dv.df")) - - - udf = function(a1, b1) { - a1 + b1 - } - - df = b %>% - mutate(d = udf(a,b)) %>% - collect - - expect_setequal(sum(df$d), sum(df$a, df$b)) -}) - -test_that("testing transmute", { - b = disk.frame(file.path(tempdir(), "tmp_b_dv.df")) - - df = b %>% - transmute(d = a + b) %>% - collect - - expect_setequal(names(df), c("d")) -}) - -test_that("testing arrange", { - b = disk.frame(file.path(tempdir(), "tmp_b_dv.df")) - - expect_warning(df <- b %>% - mutate(random_unif = runif(dplyr::n())) %>% - arrange(desc(random_unif))) - - df <- b %>% - mutate(random_unif = runif(dplyr::n())) %>% - chunk_arrange(desc(random_unif)) - - x = purrr::map_lgl(1:nchunks(df), ~{ - is.unsorted(.x) == FALSE - }) - - expect_true(all(x)) -}) - -test_that("testing chunk_summarise", { - b = disk.frame(file.path(tempdir(), "tmp_b_dv.df")) - - df = b %>% - chunk_summarise(suma = sum(a)) %>% - collect %>% - summarise(suma = sum(suma)) - - expect_equal(df$suma, collect(b)$a %>% sum) -}) - -test_that("testing mutate within function works", { - test_f <- function(params, x_df){ - x_df %>% mutate(aha = params[1]*cyl + params[2]*disp) - } - - expect_true("aha" %in% names(test_f(c(1, 2), mtcars))) -}) - -test_that("filter failure: prevent github #191 regression", { - flights_df = as.disk.frame(nycflights13::flights) - - # expect error due to syntax error - expect_warning(expect_error(flights_df %>% - filter(tailnum %in% paste0(unique(nycflights13::flights$tailnum)[1:60]), "") %>% - collect)) 
- - delete(flights_df) -}) - - -teardown({ - fs::dir_delete(file.path(tempdir(), "tmp_b_dv.df")) -}) \ No newline at end of file diff --git a/tests/testthat/test-dtplyr-support.r b/tests/testthat/test-dtplyr-support.r deleted file mode 100644 index a3499435..00000000 --- a/tests/testthat/test-dtplyr-support.r +++ /dev/null @@ -1,46 +0,0 @@ -context("test-dtplyr-verbs") - -setup({ - b = data.frame(a = 51:150, b = 1:100) - tf = file.path(tempdir(), "test-dtplyr.df") - as.disk.frame(b, outdir = tf, nchunks = 5, overwrite = TRUE) -}) - -test_that("testing dtplyr", { - # TODO add tests when new version of dtplyr on CRAN - # iris_df = as.disk.frame(iris) - # - # iris_df %>% - # filter(Sepal.Length > 7) %>% - # collect() - # - # - # aa = iris_df %>% - # cmap(~{ - # dtplyr::lazy_dt(.x) %>% - # filter(Sepal.Length > 7) %>% - # collect() - # }) %>% - # collect - # - # - # lazy_dt <- function(...) { - # UseMethod("lazy_dt") - # } - # - # lazy_dt.disk.frame <- function(df, ...) { - # cmap(df, ) - # } - # - # lazy_dt.default <- function(...) { - # dtplyr::lazy_dt(...) 
- # } - expect_true(TRUE) -}) - - - -teardown({ - fs::dir_delete(file.path(tempdir(), "test-dtplyr.df")) -}) - diff --git a/tests/testthat/test-foverlaps.r b/tests/testthat/test-foverlaps.r deleted file mode 100644 index a4438aa0..00000000 --- a/tests/testthat/test-foverlaps.r +++ /dev/null @@ -1,42 +0,0 @@ -context("test-foverlaps") - -setup({ - #setup_disk.frame(workers = 1) -}) - -# TODO currently it's not possible to do -test_that("test foverlap with data.frame", { - x = as.disk.frame(data.table(start=c(5,31,22,16), end=c(8,50,25,18), val2 = 7:10)) - y = as.disk.frame(data.table(start=c(10, 20, 30), end=c(15, 35, 45), val1 = 1:3)) - byxy = c("start", "end") - xy.df = foverlaps.disk.frame( - x, y, by.x = byxy, by.y = byxy, - merge_by_chunk_id = TRUE, overwrite = TRUE) - - collect(xy.df) - - testthat::expect_equal(nrow(xy.df), 3) -}) - - -# TODO this is also not a good test case -# test_that("test foverlap with disk.frame", { -# x = data.table(start=c(5,31,22,16), end=c(8,50,25,18), val2 = 7:10) -# y = data.table(start=c(10, 20, 30), end=c(15, 35, 45), val1 = 1:3) -# setkey(y, start, end) -# -# dx = shard(x, "tmp_fo.df", overwrite = T, shardby=c("start","end")) -# dy = shard(y, "tmp_to.df", overwrite = T, shardby=c("start","end")) -# -# xy1 = foverlaps(x,y, type="any", which = T) -# -# dxy1 = foverlaps.disk.frame(dx, dy, type="any", outdir="tmp_fo_out2.df") ## return overlap join -# dxy1c = dxy1 %>% collect -# -# foverlaps.disk.frame(dx, dy, type="any", mult="first", outdir="tmp_fo_out2.df") ## returns only first match -# foverlaps.disk.frame(dx, dy, type="within", outdir="tmp_fo_out3.df") ## matches iff 'x' is within 'y' -# }) - -teardown({ - -}) \ No newline at end of file diff --git a/tests/testthat/test-full_join.R b/tests/testthat/test-full_join.R deleted file mode 100644 index f40e8cb8..00000000 --- a/tests/testthat/test-full_join.R +++ /dev/null @@ -1,58 +0,0 @@ -context("test-full_join") - -setup({ - a = data.frame(a = 1:100, b = 1:100) - b = data.frame(a 
= 51:150, b = 1:100) - d = data.frame(a = 1:50, b = 1:50) - - as.disk.frame(a, file.path(tempdir(), "tmp_a_fj.df"), nchunks = 4, overwrite = T) - as.disk.frame(b, file.path(tempdir(), "tmp_b_fj.df"), nchunks = 5, overwrite = T) - as.disk.frame(d, file.path(tempdir(), "tmp_d_fj.df"), overwrite = T) -}) - -test_that("testing full_join where right is data.frame", { - a = disk.frame(file.path(tempdir(), "tmp_a_fj.df")) - b = disk.frame(file.path(tempdir(), "tmp_b_fj.df")) - d = disk.frame(file.path(tempdir(), "tmp_d_fj.df")) - bc = collect(b) - dc = collect(d) - - abc <- full_join(a, bc, by = "a") %>% collect - expect_equal(nrow(abc), 150) - - abc0 <- full_join(a, bc, by = c("a","b")) %>% collect - expect_equal(nrow(abc0), 200) - - abc100 <- full_join(a, bc, by = "b") %>% collect - expect_equal(nrow(abc100), 100) - - abd50 <- full_join(a, dc, by = "b") %>% collect - expect_equal(nrow(abd50), 100) -}) - -test_that("testing full_join where right is disk.frame", { - a = disk.frame(file.path(tempdir(), "tmp_a_fj.df")) - b = disk.frame(file.path(tempdir(), "tmp_b_fj.df")) - d = disk.frame(file.path(tempdir(), "tmp_d_fj.df")) - - expect_warning({ - ab <- full_join(a, b, by = "a", merge_by_chunk_id = F) %>% collect - }) - expect_equal(nrow(ab), 150) - - expect_warning({ab0 = full_join(a, b, by = c("a","b"), merge_by_chunk_id = F) %>% collect}) - expect_equal(nrow(ab0), 200) - - expect_warning({ab100 = full_join(a, b, by = "b", merge_by_chunk_id = F) %>% collect}) - expect_equal(nrow(ab100), 100) - - expect_warning({ad50 = full_join(a, d, by = "b", merge_by_chunk_id = F) %>% collect}) - expect_equal(nrow(ad50), 100) -}) - - -teardown({ - fs::dir_delete(file.path(tempdir(), "tmp_a_fj.df")) - fs::dir_delete(file.path(tempdir(), "tmp_b_fj.df")) - fs::dir_delete(file.path(tempdir(), "tmp_d_fj.df")) -}) \ No newline at end of file diff --git a/tests/testthat/test-get_chunk.r b/tests/testthat/test-get_chunk.r deleted file mode 100644 index d33c336e..00000000 --- 
a/tests/testthat/test-get_chunk.r +++ /dev/null @@ -1,17 +0,0 @@ -context("test-get_chunk") - -setup({ - #setup_disk.frame(workers = 1) - df = as.disk.frame(disk.frame:::gen_datatable_synthetic(1e5+11), file.path(tempdir(), "tmp_del_delete"), overwrite=T) -}) - -test_that("data.table .N", { - df = disk.frame(file.path(tempdir(), "tmp_del_delete")) - expect_s3_class(get_chunk(df, 1), "data.frame") - - expect_s3_class(get_chunk(df, "1.fst"), "data.frame") -}) - -teardown({ - fs::dir_delete(file.path(tempdir(), "tmp_del_delete")) -}) \ No newline at end of file diff --git a/tests/testthat/test-get_chunk_ids.r b/tests/testthat/test-get_chunk_ids.r deleted file mode 100644 index 5e95c701..00000000 --- a/tests/testthat/test-get_chunk_ids.r +++ /dev/null @@ -1,20 +0,0 @@ -context("test-get_chunk_ids") - -setup({ - #setup_disk.frame(workers = 1) - df = as.disk.frame(disk.frame:::gen_datatable_synthetic(1e5+11), file.path(tempdir(), "tmp_del_delete"), overwrite=T) -}) - -test_that("get_chunk_ids", { - df = disk.frame(file.path(tempdir(), "tmp_del_delete")) - - gci = get_chunk_ids(df) - expect_type(get_chunk_ids(df), "character") - - gcis = get_chunk_ids(df, strip_extension = F) - expect_true("1.fst" %in% gcis) -}) - -teardown({ - fs::dir_delete(file.path(tempdir(), "tmp_del_delete")) -}) \ No newline at end of file diff --git a/tests/testthat/test-glm.r b/tests/testthat/test-glm.r deleted file mode 100644 index efa0122a..00000000 --- a/tests/testthat/test-glm.r +++ /dev/null @@ -1,30 +0,0 @@ -context("test-glm") - -setup({ - #setup_disk.frame(workers = 1) -}) - -test_that("glm", { - cars.df = as.disk.frame(cars, outdir = file.path(tempdir(), "cars.df"), overwrite = TRUE) - - majorv = as.integer(version$major) - minorv = as.integer(strsplit(version$minor, ".", fixed=TRUE)[[1]][1]) - - if((majorv == 3) & (minorv < 6)) { - expect_warning({m <- dfglm(dist~speed, cars.df, glm_backend = "biglm")}) - } else { - m <- dfglm(dist~speed, cars.df, glm_backend = "biglm") - } - 
summary(m) - - if((majorv == 3) & (minorv >= 6) ) { - broom::tidy(m) - } - - m <- dfglm(dist~speed, cars.df, glm_backend = "speedglm") - summary(m) -}) - -teardown({ - fs::dir_delete(file.path(tempdir(), "cars.df")) -}) \ No newline at end of file diff --git a/tests/testthat/test-group-by.R b/tests/testthat/test-group-by.R deleted file mode 100644 index d6684114..00000000 --- a/tests/testthat/test-group-by.R +++ /dev/null @@ -1,309 +0,0 @@ -context("test-group_by") - -setup({ - df = disk.frame:::gen_datatable_synthetic(1e3+11) - data.table::fwrite(df, file.path(tempdir(), "tmp_pls_delete_gb.csv")) -}) - - -test_that("new group_by framework", { - if(interactive()) { - iris.df = iris %>% - as.disk.frame - - grpby = iris.df %>% - group_by(Species) %>% - summarize(mean(Petal.Length), sumx = sum(Petal.Length/Sepal.Width), sd(Sepal.Width/ Petal.Length), var(Sepal.Width/ Sepal.Width)) %>% - collect - - grpby2 = iris %>% - group_by(Species) %>% - summarize(mean(Petal.Length), sumx = sum(Petal.Length/Sepal.Width), sd(Sepal.Width/ Petal.Length), var(Sepal.Width/ Sepal.Width)) %>% - arrange() - - for (n in names(grpby)) { - expect_true(all(grpby2[, n] == grpby[, n]) || all(abs(grpby2[, n] - grpby[, n]) < 0.0001)) - } - - delete(iris.df) - } - expect_true(TRUE) -}) - -test_that("new group_by framework - no group-by just summarise", { - if(interactive()) { - iris.df = iris %>% - as.disk.frame - - grpby = iris.df %>% - summarize(mean(Petal.Length), sumx = sum(Petal.Length/Sepal.Width), sd(Sepal.Width/ Petal.Length), var(Sepal.Width/ Sepal.Width)) %>% - collect - - grpby2 = iris %>% - summarize(mean(Petal.Length), sumx = sum(Petal.Length/Sepal.Width), sd(Sepal.Width/ Petal.Length), var(Sepal.Width/ Sepal.Width)) %>% - arrange() - - for (n in names(grpby)) { - expect_true(all(grpby2[, n] == grpby[, n]) || all(abs(grpby2[, n] - grpby[, n]) < 0.0001)) - } - - delete(iris.df) - } - expect_true(TRUE) -}) - -# test_that("new group_by framework - nested-group-by", { - # 
if(interactive()) { - # iris.df = iris %>% - # as.disk.frame - # - # expect_error(grpby <- iris.df %>% - # summarize(mean(Petal.Length + max(Petal.Length))) %>% - # collect) - # - # expect_error(grpby <- iris.df %>% - # summarize(mean(Petal.Length) + max(Petal.Length)) %>% - # collect) - # - # expect_error(grpby <- iris.df %>% - # summarize(mean(Petal.Length) + 1) %>% - # collect) - # - # expect_error(grpby <- iris.df %>% - # summarize(list(mean(Petal.Length))) %>% - # collect) - # - # fn_tmp = function(x) x + 1 - # grpby <- iris.df %>% - # summarize(mean(fn_tmp(Petal.Length))) %>% - # collect - # - # grpby2 <- iris %>% - # summarize(mean(fn_tmp(Petal.Length))) - # - # for (n in names(grpby)) { - # expect_true(all(grpby2[, n] == grpby[, n]) || all(abs(grpby2[, n] - grpby[, n]) < 0.0001)) - # } - # delete(iris.df) - # } - # expect_true(TRUE) -# }) - -test_that("guard against github #241", { - if(interactive()) { - # I suspect there was an issue with number of chunk = 1 - result_from_disk.frame = iris %>% - as.disk.frame(nchunks = 1) %>% - group_by(Species) %>% - summarize( - mean(Petal.Length), - sumx = sum(Petal.Length/Sepal.Width), - sd(Sepal.Width/ Petal.Length), - var(Sepal.Width/ Sepal.Width), - l = length(Sepal.Width/ Sepal.Width + 2), - max(Sepal.Width), - min(Sepal.Width), - median(Sepal.Width) - ) %>% - collect - } else { - expect_true(TRUE) - } -}) - - -test_that("group_by", { - dff = csv_to_disk.frame( - file.path(tempdir(), "tmp_pls_delete_gb.csv"), - file.path(tempdir(), "tmp_pls_delete_gb.df")) - - dff_res = dff %>% - collect %>% - group_by(id1) %>% - summarise(mv1 = mean(v1)) - - dff1 <- dff %>% - chunk_group_by(id1, id2) %>% - chunk_summarise(mv1 = mean(v1)) %>% - collect - - expect_false(nrow(dff1) == nrow(dff_res)) -}) - -test_that("test hard_group_by on disk.frame", { - dff = csv_to_disk.frame( - file.path(tempdir(), "tmp_pls_delete_gb.csv"), - file.path(tempdir(), "tmp_pls_delete_gb.df")) - - dff_res = dff %>% - collect %>% - group_by(id1, id2) 
%>% - summarise(mv1 = mean(v1)) - - dff1 <- dff %>% - hard_group_by(id1, id2) %>% - chunk_summarise(mv1 = mean(v1)) %>% collect - - expect_equal(nrow(dff1), nrow(dff_res)) -}) - -test_that("test hard_group_by on data.frame", { - df = disk.frame:::gen_datatable_synthetic(1e3+11) - - df1 = df %>% - group_by(id1, id2) %>% - summarise(mv1 = mean(v1)) - - dff1 <- df %>% - hard_group_by(id1,id2) %>% - summarise(mv1 = mean(v1)) - - expect_equal(nrow(dff1), nrow(df1)) -}) - - -test_that("test hard_group_by on disk.frame (sort)", { - dff = csv_to_disk.frame( - file.path(tempdir(), "tmp_pls_delete_gb.csv"), - file.path(tempdir(), "tmp_pls_delete_gb.df")) - - dff_res = dff %>% - collect %>% - group_by(id1, id2) %>% - summarise(mv1 = mean(v1)) - - dff1 <- dff %>% - hard_group_by(id1, id2, shardby_function="sort") %>% - chunk_summarise(mv1 = mean(v1)) %>% collect - - expect_equal(nrow(dff1), nrow(dff_res)) -}) - -test_that("test hard_group_by on data.frame (sort)", { - df = disk.frame:::gen_datatable_synthetic(1e3+11) - - df1 = df %>% - group_by(id1, id2) %>% - summarise(mv1 = mean(v1)) - - dff1 <- df %>% - hard_group_by(id1, id2, shardby_function="sort") %>% - summarise(mv1 = mean(v1)) - - expect_equal(nrow(dff1), nrow(df1)) -}) - -test_that("guard against github 256", { - test2 <- tibble::tibble( - date = lubridate::ymd(rep(c("2019-01-02", "2019-02-03", "2019-03-04"), 4)), - uid = as.factor(rep(c(uuid::UUIDgenerate(), uuid::UUIDgenerate()), 6)), - proto = as.factor(rep(c("TCP", "UDP", "ICMP"), 4)), - port = as.double(rep(c(22, 21, 0), 4)) - ) - - correct_result = test2 %>% - group_by(date, uid, proto, port) %>% - summarize(n=n()) %>% - collect - - test_df = as.disk.frame(test2, nchunks = 2, overwrite=TRUE) - - incorrect_result = test_df %>% - group_by(date, uid, proto, port) %>% - summarize(n=n()) %>% - collect - - expect_equal(dim(incorrect_result), dim(correct_result)) -}) - -test_that("guard against github 256 #2", { - test2 <- tibble::tibble( - date = 
lubridate::ymd(rep(c("2019-01-02", "2019-02-03", "2019-03-04"), 4)), - uid = as.factor(rep(c(uuid::UUIDgenerate(), uuid::UUIDgenerate()), 6)), - proto = as.factor(rep(c("TCP", "UDP", "ICMP"), 4)), - port = as.double(rep(c(22, 21, 0), 4)) - ) - - test_df = as.disk.frame(test2, nchunks = 2, overwrite=TRUE) - - - correct_result = test_df %>% - group_by(!!!syms(names(test_df))) %>% - summarize(n=n()) %>% - collect - - incorrect_result = test_df %>% - group_by(date, uid, proto, port) %>% - summarize(n=n()) %>% - collect - - expect_equal(dim(incorrect_result), dim(correct_result)) -}) - -test_that("guard against github 256 #3", { - library(testthat) - library(disk.frame) - setup_disk.frame() - - test2 <- tibble::tibble( - date = sample(1:10, 20, replace = TRUE), - uid = sample(1:10, 20, replace = TRUE) - ) - - test_df = as.disk.frame(test2, nchunks = 2, overwrite=TRUE) - - ntd = names(test_df) - - correct_result = test_df %>% - group_by(!!!syms(ntd)) %>% - summarize(n=n()) %>% - collect - - incorrect_result = test_df %>% - group_by(date, uid) %>% - summarize(n=n()) %>% - collect - - expect_equal(dim(incorrect_result), dim(correct_result)) -}) - -test_that("tests for github #250", { - aggregate_expressions <- list(n = quote(n())) - - result1 = iris %>% - as.disk.frame() %>% - group_by(Species) %>% - summarise(n = n()) %>% - collect - - result2 <- iris %>% - as.disk.frame() %>% - group_by(Species) %>% - summarize(!!!(aggregate_expressions)) %>% - collect - - expect_equal(result1, result2) -}) - -test_that("tests for github #250 2", { - aggregate_expressions <- list(n = quote(n()), quote(n())) - - result1 = iris %>% - as.disk.frame() %>% - group_by(Species) %>% - summarise(n = n(), n()) %>% - collect; result1 - - result2 <- iris %>% - as.disk.frame() %>% - group_by(Species) %>% - summarize(!!!(aggregate_expressions)) %>% - collect - - expect_equal(result1, result2) -}) - -teardown({ - fs::file_delete(file.path(tempdir(), "tmp_pls_delete_gb.csv")) - 
fs::dir_delete(file.path(tempdir(), "tmp_pls_delete_gb.df")) -}) diff --git a/tests/testthat/test-hard-arrange.R b/tests/testthat/test-hard-arrange.R deleted file mode 100644 index 0419cee8..00000000 --- a/tests/testthat/test-hard-arrange.R +++ /dev/null @@ -1,111 +0,0 @@ -context("test-arrange") - -setup({ - - df = disk.frame:::gen_datatable_synthetic(1e3+11) - data.table::fwrite(df, file.path(tempdir(), "tmp_pls_delete_gb.csv")) -}) - -test_that("test hard_arrange on disk.frame, single chunk", { - # Randomise rows since rows are already sorted - iris.df = as.disk.frame(sample_n(iris, nrow(iris)), nchunks = 1) - iris_hard.df = hard_arrange(iris.df, Species) - - # Check sort - expect_true(!is.unsorted(iris_hard.df$Species)) -}) - -test_that("test hard_arrange on disk.frame, single variable", { - dff = csv_to_disk.frame( - file.path(tempdir(), "tmp_pls_delete_gb.csv"), - file.path(tempdir(), "tmp_pls_delete_gb.df")) - - # Sort ascending, one level - sorted_dff <- dff %>% hard_arrange(id1) - sorted_df <- sorted_dff %>% collect - - expect_true(!is.unsorted(sorted_df$id1)) -}) - -test_that("test hard_arrange on disk.frame, factor data type", { - iris.df = as.disk.frame(sample_n(iris, nrow(iris)), nchunks = 2) - iris_hard.df = hard_arrange(iris.df, Species) - - expect_true(!is.unsorted(iris_hard.df$Species)) -}) - -test_that("test hard_arrange on disk.frame, date data type", { - dff = csv_to_disk.frame( - file.path(tempdir(), "tmp_pls_delete_gb.csv"), - file.path(tempdir(), "tmp_pls_delete_gb.df")) - sorted_dff <- dff %>% hard_arrange(date1) - - expect_true(!is.unsorted(sorted_dff$date1)) -}) - -test_that("test hard_arrange on disk.frame, two and three variables", { - dff = csv_to_disk.frame( - file.path(tempdir(), "tmp_pls_delete_gb.csv"), - file.path(tempdir(), "tmp_pls_delete_gb.df")) - - dfp <- read.csv(file.path(tempdir(), "tmp_pls_delete_gb.csv")) - - # Sort ascending, two levels - sorted_dff <- dff %>% hard_arrange(id1, id4) %>% collect - sorted_dfp <- dff %>% 
collect %>% dplyr::arrange(id1, id4) - - # Compare vs dplyr - expect_true(all(sorted_dff$id1 == sorted_dfp$id1)) - expect_true(all(sorted_dff$id4 == sorted_dfp$id4)) - - # Sort ascending, three levels, from already partially sorted disk frame - sorted_dff2 <- sorted_dff %>% hard_arrange(id1, id4, id6) %>% collect - sorted_dfp2 <- dff %>% collect %>% dplyr::arrange(id1, id4, id6) - - # Compare vs dplyr - expect_true(all(sorted_dff2$id1 == sorted_dfp2$id1)) - expect_true(all(sorted_dff2$id4 == sorted_dfp2$id4)) - expect_true(all(sorted_dff2$id6 == sorted_dfp2$id6)) -}) - -test_that("test hard_arrange on disk.frame, two factors", { - dff = csv_to_disk.frame( - file.path(tempdir(), "tmp_pls_delete_gb.csv"), - file.path(tempdir(), "tmp_pls_delete_gb.df")) - - # Sort decending, two levels - desc_dff <- dff %>% hard_arrange(desc(id4), id2) - desc_dff <- desc_dff %>% collect - - # Level 1 - expect_true(!is.unsorted(-desc_dff$id4)) - - # Level 2 - desc_dff$id4_id2 <- paste0( - formatC(max(desc_dff$id4) - desc_dff$id4, width=3, format="d", flag= "0"), - desc_dff$id2) - expect_true(!is.unsorted(-desc_dff$id4)) -}) - -test_that("test hard_arrange on data.frame vs dplyr", { - df = disk.frame:::gen_datatable_synthetic(1e3+11) - - # Sort ascending - sorted_dff <- df %>% hard_arrange(id1, id4) %>% collect - sorted_dfp <- df %>% dplyr::arrange(id1, id4) - - expect_true(all(sorted_dff$id1 == sorted_dfp$id1)) - expect_true(all(sorted_dff$id4 == sorted_dfp$id4)) - - # Sort decending - desc_dff <- df %>% hard_arrange(desc(id4), id2) %>% collect - desc_dfp <- df %>% dplyr::arrange(desc(id4), id2) - - expect_true(all(sorted_dff$id4 == sorted_dfp$id4)) - expect_true(all(sorted_dff$id2 == sorted_dfp$dfp)) -}) - -teardown({ - fs::file_delete(file.path(tempdir(), "tmp_pls_delete_gb.csv")) - fs::dir_delete(file.path(tempdir(), "tmp_pls_delete_gb.df")) -}) \ No newline at end of file diff --git a/tests/testthat/test-inner_join.R b/tests/testthat/test-inner_join.R deleted file mode 100644 index 
e165a142..00000000 --- a/tests/testthat/test-inner_join.R +++ /dev/null @@ -1,64 +0,0 @@ -context("test-inner_join") - -setup({ - a = data.frame(a = 1:100, b = 1:100) - b = data.frame(a = 51:150, b = 1:100) - d = data.frame(a = 1:50, b = 1:50) - - as.disk.frame(a, file.path(tempdir(), "tmp_a_ij.df"), nchunks = 4, overwrite = T) - as.disk.frame(b, file.path(tempdir(), "tmp_b_ij.df"), nchunks = 5, overwrite = T) - as.disk.frame(d, file.path(tempdir(), "tmp_d_ij.df"), overwrite = T) - - as.disk.frame(a, file.path(tempdir(), "tmp_a_ij2.df"), nchunks = 4, overwrite = T) - as.disk.frame(b, file.path(tempdir(), "tmp_b_ij2.df"), nchunks = 5, overwrite = T) - as.disk.frame(d, file.path(tempdir(), "tmp_d_ij2.df"), overwrite = T) -}) - -test_that("testing inner_join where right is data.frame", { - a = disk.frame(file.path(tempdir(), "tmp_a_ij.df")) - b = disk.frame(file.path(tempdir(), "tmp_b_ij.df")) - d = disk.frame(file.path(tempdir(), "tmp_d_ij.df")) - bc = collect(b) - dc = collect(d) - - abc = inner_join(a, bc, by = "a") %>% collect - expect_equal(nrow(abc), 50) - - abc0 = inner_join(a, bc, by = c("a","b")) %>% collect - expect_equal(nrow(abc0), 0) - - abc100 = inner_join(a, bc, by = "b") %>% collect - expect_equal(nrow(abc100), 100) - - abd50 = inner_join(a, dc, by = "b") %>% collect - expect_equal(nrow(abd50), 50) -}) - -test_that("testing inner_join where right is disk.frame", { - a = disk.frame(file.path(tempdir(), "tmp_a_ij2.df")) - b = disk.frame(file.path(tempdir(), "tmp_b_ij2.df")) - d = disk.frame(file.path(tempdir(), "tmp_d_ij2.df")) - - ab = inner_join(a, b, by = "a", merge_by_chunk_id = F) %>% collect - expect_equal(nrow(ab), 50) - - # expecting a warning for some chunks being 0 rows - expect_warning(ab0 <- inner_join(a, b, by = c("a","b"), merge_by_chunk_id = F) %>% collect) - expect_equal(nrow(ab0), 0) - - ab100 = inner_join(a, b, by = "b", merge_by_chunk_id = F) %>% collect - expect_equal(nrow(ab100), 100) - - ad50 = inner_join(a, d, by = "b", 
merge_by_chunk_id = F) %>% collect - expect_equal(nrow(ad50), 50) -}) - - -teardown({ - fs::dir_delete(file.path(tempdir(), "tmp_a_ij.df")) - fs::dir_delete(file.path(tempdir(), "tmp_b_ij.df")) - fs::dir_delete(file.path(tempdir(), "tmp_d_ij.df")) - fs::dir_delete(file.path(tempdir(), "tmp_a_ij2.df")) - fs::dir_delete(file.path(tempdir(), "tmp_b_ij2.df")) - fs::dir_delete(file.path(tempdir(), "tmp_d_ij2.df")) -}) \ No newline at end of file diff --git a/tests/testthat/test-is-disk.frame.r b/tests/testthat/test-is-disk.frame.r deleted file mode 100644 index 87d4fde4..00000000 --- a/tests/testthat/test-is-disk.frame.r +++ /dev/null @@ -1,12 +0,0 @@ -context("test-is-disk-frame") - -test_that("testing is_disk.frame", { - fs::dir_create(file.path(tempdir(), "tmp_is_disk_frame")) - fst::write_fst(data.frame(a= 1, b = 1), file.path(tempdir(), "tmp_is_disk_frame/1.fst")) - fst::write_fst(data.frame(a= 1, b = 1), file.path(tempdir(), "tmp_is_disk_frame/2.fst")) - - df = disk.frame(file.path(tempdir(), "tmp_is_disk_frame")) - expect_true(is_disk.frame(df)) - - disk.frame::delete(df) -}) diff --git a/tests/testthat/test-left_join.R b/tests/testthat/test-left_join.R deleted file mode 100644 index 66801085..00000000 --- a/tests/testthat/test-left_join.R +++ /dev/null @@ -1,71 +0,0 @@ -context("test-left_join") - -setup({ - - a = data.frame(a = 1:100, b = 1:100) - b = data.frame(a = 51:150, b = 1:100) - d = data.frame(a = 1:50, b = 1:50) - - as.disk.frame(a, file.path(tempdir(), "tmp_a_lj.df"), nchunks = 4, overwrite = T) - as.disk.frame(b, file.path(tempdir(), "tmp_b_lj.df"), nchunks = 5, overwrite = T) - as.disk.frame(d, file.path(tempdir(), "tmp_d_lj.df"), overwrite = T) - - as.disk.frame(a, file.path(tempdir(), "tmp_a_lj2.df"), nchunks = 4, overwrite = T) - as.disk.frame(b, file.path(tempdir(), "tmp_b_lj2.df"), nchunks = 5, overwrite = T) - as.disk.frame(d, file.path(tempdir(), "tmp_d_lj2.df"), overwrite = T) -}) - -test_that("testing left_join where right is data.frame", { 
- a = disk.frame(file.path(tempdir(), "tmp_a_lj.df")) - b = disk.frame(file.path(tempdir(), "tmp_b_lj.df")) - d = disk.frame(file.path(tempdir(), "tmp_d_lj.df")) - bc = collect(b) - dc = collect(d) - - abc = left_join(a, bc, by = "a") %>% collect - expect_equal(nrow(abc), 100) - - abc0 = left_join(a, bc, by = c("a","b")) %>% collect - expect_equal(nrow(abc0), 100) - - abc100 = left_join(a, bc, by = "b") %>% collect - expect_equal(nrow(abc100), 100) - - abd50 = left_join(a, dc, by = "b") %>% collect - expect_equal(nrow(abd50), 100) -}) - -test_that("testing left_join where right is disk.frame", { - a = disk.frame(file.path(tempdir(), "tmp_a_lj2.df")) - b = disk.frame(file.path(tempdir(), "tmp_b_lj2.df")) - d = disk.frame(file.path(tempdir(), "tmp_d_lj2.df")) - - expect_warning({ - ab = left_join(a, b, by = "a", merge_by_chunk_id = F) %>% collect - }) - expect_equal(nrow(ab), 100) - - expect_warning({ - ab0 = left_join(a, b, by = c("a","b"), merge_by_chunk_id = F) %>% collect - }) - expect_equal(nrow(ab0), 100) - - expect_warning({ - ab100 = left_join(a, b, by = "b", merge_by_chunk_id = F) %>% collect - }) - expect_equal(nrow(ab100), 100) - - expect_warning({ - ad50 = left_join(a, d, by = "b", merge_by_chunk_id = F) %>% collect - }) - expect_equal(nrow(ad50), 100) -}) - -teardown({ - fs::dir_delete(file.path(tempdir(), "tmp_a_lj.df")) - fs::dir_delete(file.path(tempdir(), "tmp_b_lj.df")) - fs::dir_delete(file.path(tempdir(), "tmp_d_lj.df")) - fs::dir_delete(file.path(tempdir(), "tmp_a_lj2.df")) - fs::dir_delete(file.path(tempdir(), "tmp_b_lj2.df")) - fs::dir_delete(file.path(tempdir(), "tmp_d_lj2.df")) -}) \ No newline at end of file diff --git a/tests/testthat/test-map.r b/tests/testthat/test-map.r deleted file mode 100644 index beec44ba..00000000 --- a/tests/testthat/test-map.r +++ /dev/null @@ -1,76 +0,0 @@ -context("test-cmap") - -setup({ - b = data.frame(a = 51:150, b = 1:100) - as.disk.frame(b, file.path(tempdir(), "tmp_map.df"), nchunks = 5, overwrite = T) -}) 
- -test_that("testing cmap lazy", { - b = disk.frame(file.path(tempdir(), "tmp_map.df")) - - # return 1 row from each chunk - df = b %>% cmap(~.x[1]) - - expect_s3_class(df, "disk.frame") - - df2 = df %>% collect - - expect_s3_class(df2, "data.frame") - - expect_equal(nrow(df2), 5L) -}) - -test_that("testing cmap eager", { - b = disk.frame(file.path(tempdir(), "tmp_map.df")) - - # return 1 row from each chunk - df = b %>% cmap(~.x[1], lazy = F) - expect_false("disk.frame" %in% class(df)) - - # return 1 row from each chunk - df = b %>% cmap_dfr(~.x[1]) - expect_false("disk.frame" %in% class(df)) - expect_true("data.frame" %in% class(df)) -}) - -test_that("testing delayed", { - b = disk.frame(file.path(tempdir(), "tmp_map.df")) - - # return 1 row from each chunk - df = b %>% delayed(~.x[1]) - - expect_s3_class(df, "disk.frame") - - df1 = collect(df) - - expect_equal(nrow(df1), 5) -}) - - -test_that("testing map_dfr", { - b = disk.frame(file.path(tempdir(), "tmp_map.df")) - - # return 1 row from each chunk - df = b %>% cmap_dfr(~.x[1,]) - - expect_s3_class(df, "data.frame") -}) - - -test_that("testing imap", { - b = disk.frame(file.path(tempdir(), "tmp_map.df")) - - # return 1 row from each chunk - df = b %>% cimap_dfr(~{ - y = .x[1,] - y[,ok := .y] - y - }) - - expect_s3_class(df, "data.frame") -}) - - -teardown({ - fs::dir_delete(file.path(tempdir(), "tmp_map.df")) -}) \ No newline at end of file diff --git a/tests/testthat/test-map2.r b/tests/testthat/test-map2.r deleted file mode 100644 index 926d49ee..00000000 --- a/tests/testthat/test-map2.r +++ /dev/null @@ -1,46 +0,0 @@ -context("test-cmap2") - -setup({ - b = data.frame(a = 51:150, b = 1:100) - d = data.frame(a = 151:250, b = 1:100) - as.disk.frame(b, file.path(tempdir(), "tmp_map2.df"), nchunks = 5, overwrite = T) - as.disk.frame(d, file.path(tempdir(), "tmp_map2d.df"), nchunks = 5, overwrite = T) -}) - -test_that("testing cmap2 .y is disk.frame", { - b = disk.frame(file.path(tempdir(), "tmp_map2.df")) - d = 
disk.frame(file.path(tempdir(), "tmp_map2d.df")) - - # return 1 row from each chunk - df = cmap2(b, d, ~rbindlist(list(.x[1,],.y[1,])), outdir = file.path(tempdir(), "tmp_map2_out.df")) - - expect_s3_class(df, "disk.frame") - - df2 = df %>% collect - - expect_s3_class(df2, "data.frame") - - expect_equal(nrow(df2), 10L) -}) - -test_that("testing map2 .y is not disk.frame", { - b = disk.frame(file.path(tempdir(), "tmp_map2.df")) - d = 1:nchunks(b) - - # return 1 row from each chunk - expect_warning(df <- cmap2(b, d, ~.x[1,.(y = .y)], outdir = "tmp_map2_out2.df")) - - expect_type(df, "list") - - df2 = df %>% rbindlist - - expect_s3_class(df2, "data.frame") - - expect_equal(nrow(df2), 5L) -}) - -teardown({ - fs::dir_delete(file.path(tempdir(), "tmp_map2.df")) - fs::dir_delete(file.path(tempdir(), "tmp_map2d.df")) - fs::dir_delete(file.path(tempdir(), "tmp_map2_out.df")) -}) \ No newline at end of file diff --git a/tests/testthat/test-merge.r b/tests/testthat/test-merge.r deleted file mode 100644 index 41bb37e4..00000000 --- a/tests/testthat/test-merge.r +++ /dev/null @@ -1,59 +0,0 @@ -context("test-merge.disk.frame") - -setup({ - b = data.frame(a = 51:150, b = 1:100) - d = data.frame(a = 151:250, b = 1:100) - as.disk.frame(b, file.path(tempdir(), "tmp_merge.df"), nchunks = 5, overwrite = TRUE) - as.disk.frame(d, file.path(tempdir(), "tmp_merge2.df"), nchunks = 5, overwrite = TRUE) -}) - -test_that("testing merge of disk.frame", { - b.df = disk.frame(file.path(tempdir(), "tmp_merge.df")) - d.df = disk.frame(file.path(tempdir(), "tmp_merge2.df")) - - bd.df = merge(b.df, d.df, by = "b", outdir = file.path(tempdir(), "tmp_bd_merge.df"), overwrite = TRUE, merge_by_chunk_id = TRUE) - - expect_s3_class(bd.df, "disk.frame") - expect_equal(nrow(bd.df), 100) -}) - -test_that("testing merge of data.frame", { - b.df = disk.frame(file.path(tempdir(), "tmp_merge.df")) - d = data.frame(a = 151:250, b = 1:100) - - bd.df = merge(b.df, d, by = "b", outdir = file.path(tempdir(), 
"tmp_bd_merge2.df"), overwrite = TRUE) - - expect_s3_class(bd.df, "disk.frame") - expect_equal(nrow(bd.df), 100) - - tmp = collect(bd.df) - - expect_s3_class(tmp, "data.frame") - expect_equal(nrow(tmp), 100) -}) - -test_that("testing error when merge_by_chunk = FALSE", { - b.df = disk.frame(file.path(tempdir(), "tmp_merge.df")) - d.df = disk.frame(file.path(tempdir(), "tmp_merge2.df")) - - testthat::expect_error() - expect_error( - merge( - b.df, - d.df, - by = "b", - outdir = file.path(tempdir(), "tmp_bd_merge.df"), - overwrite = TRUE, - merge_by_chunkd_id = FALSE - ) - ) -}) - - - -teardown({ - fs::dir_delete(file.path(tempdir(), "tmp_merge.df")) - fs::dir_delete(file.path(tempdir(), "tmp_merge2.df")) - fs::dir_delete(file.path(tempdir(), "tmp_bd_merge.df")) - fs::dir_delete(file.path(tempdir(), "tmp_bd_merge2.df")) -}) \ No newline at end of file diff --git a/tests/testthat/test-names.r b/tests/testthat/test-names.r deleted file mode 100644 index 0441136b..00000000 --- a/tests/testthat/test-names.r +++ /dev/null @@ -1,18 +0,0 @@ -context("test-names") - -setup({ - b = data.frame(a = 51:150, b = 1:100) - as.disk.frame(b, file.path(tempdir(), "tmp_names.df"), nchunks = 5, overwrite = T) -}) - -test_that("testing names", { - b = disk.frame(file.path(tempdir(), "tmp_names.df")) - - expect_setequal(colnames(b), c("a","b")) - expect_setequal(names(b), c("a","b")) -}) - - -teardown({ - fs::dir_delete(file.path(tempdir(), "tmp_names.df")) -}) \ No newline at end of file diff --git a/tests/testthat/test-nchunks.r b/tests/testthat/test-nchunks.r deleted file mode 100644 index bf716810..00000000 --- a/tests/testthat/test-nchunks.r +++ /dev/null @@ -1,18 +0,0 @@ -context("test-nchunks") - -setup({ - b = data.frame(a = 51:150, b = 1:100) - as.disk.frame(b, file.path(tempdir(), "tmp_chunks.df"), nchunks = 5, overwrite = T) -}) - -test_that("testing nchunks", { - b = disk.frame(file.path(tempdir(), "tmp_chunks.df")) - - expect_equal(nchunks(b), 5) - expect_equal(nchunk(b), 5) 
-}) - - -teardown({ - fs::dir_delete(file.path(tempdir(), "tmp_chunks.df")) -}) \ No newline at end of file diff --git a/tests/testthat/test-nrow-ncol.R b/tests/testthat/test-nrow-ncol.R deleted file mode 100644 index 05b33601..00000000 --- a/tests/testthat/test-nrow-ncol.R +++ /dev/null @@ -1,20 +0,0 @@ -context("test-nrow-ncol") - -setup({ - df = disk.frame:::gen_datatable_synthetic(1e3+11) - data.table::fwrite(df, file.path(tempdir(), "tmp_pls_delete.csv")) -}) - -test_that("nrow ncol", { - dff = csv_to_disk.frame( - file.path(tempdir(), "tmp_pls_delete.csv"), - file.path(tempdir(), "tmp_pls_delete.df")) - - expect_equal(nrow(dff), 1e3+11) - expect_equal(ncol(dff), 10) -}) - -teardown({ - fs::file_delete(file.path(tempdir(), "tmp_pls_delete.csv")) - fs::dir_delete(file.path(tempdir(), "tmp_pls_delete.df")) -}) \ No newline at end of file diff --git a/tests/testthat/test-overwrite_check.r b/tests/testthat/test-overwrite_check.r deleted file mode 100644 index 59f6548e..00000000 --- a/tests/testthat/test-overwrite_check.r +++ /dev/null @@ -1,21 +0,0 @@ -context("test-overwrite_check") - -setup({ -}) - -test_that("testing overwrite_check", { - b = data.frame(a = 51:150, b = 1:100) - - fs::dir_create(file.path(tempdir(), "tmp_overwrite-check")) - fs::file_create(file.path(tempdir(), "tmp_overwrite-check/tmp")) - - - expect_error(disk.frame::overwrite_check(file.path(tempdir(), "tmp_overwrite-check"), overwrite = TRUE)) - - expect_error(disk.frame::overwrite_check(file.path(tempdir(), "tmp_overwrite-check"), overwrite = FALSE)) -}) - - -teardown({ - fs::dir_delete(file.path(tempdir(), "tmp_overwrite-check")) -}) \ No newline at end of file diff --git a/tests/testthat/test-pls-add.r b/tests/testthat/test-pls-add.r deleted file mode 100644 index 29f0e78b..00000000 --- a/tests/testthat/test-pls-add.r +++ /dev/null @@ -1,44 +0,0 @@ -context("test-pls-add") - -setup({ - #setup_disk.frame(workers = 1) -}) - -test_that("pls-add", { - - if (interactive()) { - 
library(disk.frame) - library(tidyverse) - - setup_disk.frame(2) - example <- as.disk.frame( - data.frame( - purchase_date=c("2020-03-20","2020-04-20"), - a = 1:2, - b = 3:4 - ) - ) - example %>% - mutate(Panel_Month = str_sub(purchase_date, 6, 7)) %>% - collect - - str_sub2 = function(xx, yy) xx + yy - - example %>% - mutate(Panel_Month = str_sub2(a, b)) %>% - collect - - example %>% - mutate(Panel_Month = str_sub2(a, 7)) %>% - collect - - example %>% - mutate(Panel_Month = str_sub2(6, 7)) %>% - collect - - - example %>% - mutate(Panel_Month = str_subs(purchase_date, 7)) %>% - collect - } -}) \ No newline at end of file diff --git a/tests/testthat/test-print.r b/tests/testthat/test-print.r deleted file mode 100644 index 9d1053cc..00000000 --- a/tests/testthat/test-print.r +++ /dev/null @@ -1,14 +0,0 @@ -context("test-print") - -setup({ -}) - -test_that("testing print", { - # TODO proper tests - expect_equal(2L, 2L) -}) - - -teardown({ - -}) \ No newline at end of file diff --git a/tests/testthat/test-pull.r b/tests/testthat/test-pull.r deleted file mode 100644 index 4acfa8d4..00000000 --- a/tests/testthat/test-pull.r +++ /dev/null @@ -1,50 +0,0 @@ -context("test-pull") - - -test_that("pull with", { - flights_df = as.disk.frame(nycflights13::flights) - - a = flights_df %>% - pull(carrier, carrier) - - b = flights_df %>% collect() %>% pull(carrier, carrier) - - expect_equal(a, b) - - a = flights_df %>% - pull(2, 2) - b = flights_df %>% collect() %>% pull(2, 2) - - expect_equal(a, b) - - a = flights_df %>% - pull(-1, -1) - b = flights_df %>% collect() %>% pull(-1, -1) - expect_equal(a, b) - - delete(flights_df) -}) - - -test_that("pull", { - flights_df = as.disk.frame(nycflights13::flights) - - a = flights_df %>% - pull(carrier) - b = flights_df %>% collect() %>% pull(carrier) - - expect_setequal(a, b) - - a = flights_df %>% - pull(2) - b = flights_df %>% collect() %>% pull(2) - - expect_setequal(a, b) - - a = flights_df %>% - pull(-1) - b = flights_df %>% collect() 
%>% pull(-1) - expect_setequal(a, b) - - delete(flights_df) -}) diff --git a/tests/testthat/test-rbindlist.r b/tests/testthat/test-rbindlist.r deleted file mode 100644 index d30cada0..00000000 --- a/tests/testthat/test-rbindlist.r +++ /dev/null @@ -1,31 +0,0 @@ -context("test-rbindlist") - -setup({ - as.disk.frame(disk.frame:::gen_datatable_synthetic(1e3+11), file.path(tempdir(), "tmp_rbindlist1.df"), overwrite=TRUE) - as.disk.frame(disk.frame:::gen_datatable_synthetic(1e3+11), file.path(tempdir(), "tmp_rbindlist2.df"), overwrite=TRUE) - as.disk.frame(disk.frame:::gen_datatable_synthetic(1e3+11), file.path(tempdir(), "tmp_rbindlist4.df"), overwrite=TRUE) -}) - -test_that("test rbindlist", { - df1 = disk.frame(file.path(tempdir(), "tmp_rbindlist1.df")) - df2 = disk.frame(file.path(tempdir(), "tmp_rbindlist2.df")) - - df3 = rbindlist.disk.frame(list(df1, df2), outdir = file.path(tempdir(), "tmp_rbindlist3.df"), overwrite=TRUE) - - expect_equal(nrow(df3), 2*(1e3+11)) -}) - -test_that("test rbindlist accepts only list", { - df1 = disk.frame(file.path(tempdir(), "tmp_rbindlist4.df")) - - expect_error(rbindlist.disk.frame(df1, outdir = file.path(tempdir(), "tmp_rbindlist5.df"))) -}) - - -teardown({ - fs::dir_delete(file.path(tempdir(), "tmp_rbindlist1.df")) - fs::dir_delete(file.path(tempdir(), "tmp_rbindlist2.df")) - fs::dir_delete(file.path(tempdir(), "tmp_rbindlist3.df")) - fs::dir_delete(file.path(tempdir(), "tmp_rbindlist4.df")) - fs::dir_delete(file.path(tempdir(), "tmp_rbindlist5.df")) -}) diff --git a/tests/testthat/test-rechunk.r b/tests/testthat/test-rechunk.r deleted file mode 100644 index 68f59938..00000000 --- a/tests/testthat/test-rechunk.r +++ /dev/null @@ -1,82 +0,0 @@ -context("test-rechunk") - -setup({ -}) - -test_that("testing rechunk 5 to 4", { - b = data.frame(a = 51:150, b = 1:100) - as.disk.frame(b, file.path(tempdir(), "tmp_rechunks.df"), nchunks = 5, overwrite = T) - - b = disk.frame(file.path(tempdir(), "tmp_rechunks.df")) - - b = rechunk(b, 4) 
- expect_equal(nrow(b), 100) - expect_equal(ncol(b), 2) - expect_equal(nchunk(b), 4) - - res = collect(b)[order(b)] - - expect_equal(res$b, 1:100) - expect_equal(res$a, 51:150) -}) - -test_that("testing rechunk 5 to 3", { - b = data.frame(a = 51:150, b = 1:100) - as.disk.frame(b, file.path(tempdir(), "tmp_rechunks2.df"), nchunks = 5, overwrite = T) - - b = disk.frame(file.path(tempdir(), "tmp_rechunks2.df")) - - b = rechunk(b, 3) - expect_equal(nrow(b), 100) - expect_equal(ncol(b), 2) - expect_equal(nchunk(b), 3) - - res = collect(b)[order(b)] - - expect_equal(res$b, 1:100) - expect_equal(res$a, 51:150) -}) - -test_that("testing rechunk 5 to 6", { - b = data.frame(a = 51:150, b = 1:100) - as.disk.frame(b, file.path(tempdir(), "tmp_rechunks3.df"), nchunks = 5, overwrite = T) - - b = disk.frame(file.path(tempdir(), "tmp_rechunks3.df")) - - b = rechunk(b, 6) - expect_equal(nrow(b), 100) - expect_equal(ncol(b), 2) - expect_equal(nchunk(b), 6) - - res = collect(b)[order(b)] - - expect_equal(res$b, 1:100) - expect_equal(res$a, 51:150) -}) - -test_that("testing rechunk 5 to 7", { - b = data.frame(a = 51:150, b = 1:100) - as.disk.frame(b, file.path(tempdir(), "tmp_rechunks4.df"), nchunks = 5, overwrite = T) - - b = disk.frame(file.path(tempdir(), "tmp_rechunks4.df")) - - b = rechunk(b, 7) - expect_equal(nrow(b), 100) - expect_equal(ncol(b), 2) - expect_equal(nchunk(b), 7) - - res = collect(b)[order(b)] - - expect_equal(res$b, 1:100) - expect_equal(res$a, 51:150) -}) - -# TODO do shardby; it's kinda of mitigated by thorough testing on Fannie Mae - - -teardown({ - fs::dir_delete(file.path(tempdir(), "tmp_rechunks.df")) - fs::dir_delete(file.path(tempdir(), "tmp_rechunks2.df")) - fs::dir_delete(file.path(tempdir(), "tmp_rechunks3.df")) - fs::dir_delete(file.path(tempdir(), "tmp_rechunks4.df")) -}) diff --git a/tests/testthat/test-recommend_nchunk.R b/tests/testthat/test-recommend_nchunk.R deleted file mode 100644 index c5fbb109..00000000 --- 
a/tests/testthat/test-recommend_nchunk.R +++ /dev/null @@ -1,16 +0,0 @@ -context("test-recommend_nchunk") - -test_that("testing df_ram_size", { - expect_true(is.numeric(df_ram_size())) - - expect_true(!is.na(df_ram_size())) - expect_true(!is.null(df_ram_size())) - expect_true(!is.nan(df_ram_size())) - expect_true(is.finite(df_ram_size())) -}) - -test_that("testing df_ram_size; guards #213", { - # TODO tests - expect_true(df_ram_size() >= 1) -}) - diff --git a/tests/testthat/test-remove_chunk.r b/tests/testthat/test-remove_chunk.r deleted file mode 100644 index 8aec02ae..00000000 --- a/tests/testthat/test-remove_chunk.r +++ /dev/null @@ -1,18 +0,0 @@ -context("test-remove") - -test_that("testing remove chunk 3 of 5", { - b = data.frame(a = 51:150, b = 1:100) - tmp = file.path(tempdir(), "tmp_remove.df") - b = as.disk.frame(b, tmp, nchunks = 5, overwrite = T) - - b = remove_chunk(b, 3) - expect_equal(nrow(b), 80) - expect_equal(ncol(b), 2) - expect_equal(nchunk(b), 4) - - res <- collect(b)[order(b)] - - expect_equal(nrow(res), 80) - - delete(b) -}) diff --git a/tests/testthat/test-right_join.r b/tests/testthat/test-right_join.r deleted file mode 100644 index 27e532b4..00000000 --- a/tests/testthat/test-right_join.r +++ /dev/null @@ -1,7 +0,0 @@ -context("test-right_join") - -test_that("testing right_join", { - # TODO tests - expect_equal(2L, 2L) -}) - diff --git a/tests/testthat/test-sample_frac.r b/tests/testthat/test-sample_frac.r deleted file mode 100644 index 8c45f8f7..00000000 --- a/tests/testthat/test-sample_frac.r +++ /dev/null @@ -1,20 +0,0 @@ -context("test-sampe_frac") - -setup({ - a = data.frame(a = 1:100, b = 1:100) - - as.disk.frame(a, file.path(tempdir(), "tmp_sample_frac.df"), nchunks = 5, overwrite = T) -}) - -test_that("testing sample_frac", { - a = disk.frame(file.path(tempdir(), "tmp_sample_frac.df")) - a40 <- sample_frac(a, 0.4) %>% collect - - expect_equal(nrow(a40), 40) - - expect_error(a40 <- sample_frac(a, 0.4, weight = 1) %>% collect) -}) - 
-teardown({ - fs::dir_delete(file.path(tempdir(), "tmp_sample_frac.df")) -}) \ No newline at end of file diff --git a/tests/testthat/test-sample_n.r b/tests/testthat/test-sample_n.r deleted file mode 100644 index faf76259..00000000 --- a/tests/testthat/test-sample_n.r +++ /dev/null @@ -1,16 +0,0 @@ -context("test-sample_n") - -setup({ - a = data.frame(a = 1:100, b = 1:100) - - as.disk.frame(a, file.path(tempdir(), "tmp_sample_n.df"), nchunks = 5, overwrite = T) -}) - -test_that("testing semi_join where right is data.frame", { - a = disk.frame(file.path(tempdir(), "tmp_sample_n.df")) - expect_error(a40 <- sample_n(a, 40) %>% collect) -}) - -teardown({ - fs::dir_delete(file.path(tempdir(), "tmp_sample_n.df")) -}) \ No newline at end of file diff --git a/tests/testthat/test-sas_to_csv.r b/tests/testthat/test-sas_to_csv.r deleted file mode 100644 index 0bdd6fa4..00000000 --- a/tests/testthat/test-sas_to_csv.r +++ /dev/null @@ -1,8 +0,0 @@ -# TODO everything - -context("test-sas_to_csv") - -test_that("testing sas_to_csv", { - # TODO tests - expect_equal(2L, 2L) -}) diff --git a/tests/testthat/test-sas_to_disk.frame.r b/tests/testthat/test-sas_to_disk.frame.r deleted file mode 100644 index dbd64e3e..00000000 --- a/tests/testthat/test-sas_to_disk.frame.r +++ /dev/null @@ -1,10 +0,0 @@ -# TODO everything - -context("test-sas_to_disk.frame") - - -test_that("testing sas_to_disk.frame", { - # TODO tests - expect_equal(2L, 2L) -}) - diff --git a/tests/testthat/test-semi_join.R b/tests/testthat/test-semi_join.R deleted file mode 100644 index 7b96c3ac..00000000 --- a/tests/testthat/test-semi_join.R +++ /dev/null @@ -1,72 +0,0 @@ -context("test-semi_join") - -setup({ - - a = data.frame(a = 1:100, b = 1:100) - b = data.frame(a = 51:150, b = 1:100) - d = data.frame(a = 1:50, b = 1:50) - - as.disk.frame(a, file.path(tempdir(), "tmp_a_sj.df"), nchunks = 4, overwrite = TRUE) - as.disk.frame(b, file.path(tempdir(), "tmp_b_sj.df"), nchunks = 5, overwrite = TRUE) - as.disk.frame(d, 
file.path(tempdir(), "tmp_d_sj.df"), overwrite = TRUE) - - as.disk.frame(a, file.path(tempdir(), "tmp_a_sj2.df"), nchunks = 4, overwrite = TRUE) - as.disk.frame(b, file.path(tempdir(), "tmp_b_sj2.df"), nchunks = 5, overwrite = TRUE) - as.disk.frame(d, file.path(tempdir(), "tmp_d_sj2.df"), overwrite = TRUE) -}) - -test_that("testing semi_join where right is data.frame", { - a = disk.frame(file.path(tempdir(), "tmp_a_sj.df")) - b = disk.frame(file.path(tempdir(), "tmp_b_sj.df")) - d = disk.frame(file.path(tempdir(), "tmp_d_sj.df")) - bc = collect(b) - dc = collect(d) - - abc = semi_join(a, bc, by = "a") %>% collect - expect_equal(nrow(abc), 50) - - abc0 = semi_join(a, bc, by = c("a","b")) %>% collect - expect_equal(nrow(abc0), 0) - - abc100 = semi_join(a, bc, by = "b") %>% collect - expect_equal(nrow(abc100), 100) - - abd50 = semi_join(a, dc, by = "b") %>% collect - expect_equal(nrow(abd50), 50) -}) - -test_that("testing semi_join where right is disk.frame", { - a = disk.frame(file.path(tempdir(), "tmp_a_sj2.df")) - b = disk.frame(file.path(tempdir(), "tmp_b_sj2.df")) - d = disk.frame(file.path(tempdir(), "tmp_d_sj2.df")) - - expect_warning({ - ab = semi_join(a, b, by = "a", merge_by_chunk_id = F) %>% collect - }) - expect_equal(nrow(ab), 50) - - expect_warning({ - ab0 = semi_join(a, b, by = c("a","b"), merge_by_chunk_id = F) %>% collect - }) - expect_equal(nrow(ab0), 0) - - expect_warning({ - ab100 = semi_join(a, b, by = "b", merge_by_chunk_id = F) %>% collect - }) - expect_equal(nrow(ab100), 100) - - expect_warning({ - ad50 = semi_join(a, d, by = "b", merge_by_chunk_id = F) %>% collect - }) - expect_equal(nrow(ad50), 50) -}) - -teardown({ - fs::dir_delete(file.path(tempdir(), "tmp_a_sj.df")) - fs::dir_delete(file.path(tempdir(), "tmp_b_sj.df")) - fs::dir_delete(file.path(tempdir(), "tmp_d_sj.df")) - - fs::dir_delete(file.path(tempdir(), "tmp_a_sj2.df")) - fs::dir_delete(file.path(tempdir(), "tmp_b_sj2.df")) - fs::dir_delete(file.path(tempdir(), "tmp_d_sj2.df")) -}) 
\ No newline at end of file diff --git a/tests/testthat/test-setup.r b/tests/testthat/test-setup.r deleted file mode 100644 index 4d3473d1..00000000 --- a/tests/testthat/test-setup.r +++ /dev/null @@ -1,7 +0,0 @@ -context("test-setup") - -test_that("testing sas_to_disk.frame", { - setup_disk.frame(workers = 2) - a = future::nbrOfWorkers() - expect_equal(a, 2) -}) \ No newline at end of file diff --git a/tests/testthat/test-shard.r b/tests/testthat/test-shard.r deleted file mode 100644 index 0a2e28e5..00000000 --- a/tests/testthat/test-shard.r +++ /dev/null @@ -1,28 +0,0 @@ -context("test-shard") - -setup({ -}) - -test_that("testing shard data.frame", { - set.seed(1) - a = data.table(a = rep(1:10, 10), b = 1:100) - a = shard(a, "a", nchunks = 2, overwrite = TRUE, outdir=file.path(tempdir(), "tmp_shard.df")) - - expect_equal(nchunks(a), 2) - expect_equal(nrow(a), 100) - expect_equal(ncol(a), 2) - - a1 = unique(get_chunk(a,1)$a) - a2 = unique(get_chunk(a,2)$a) - expect_equal(length(intersect(a1, a2)), 0) - - a3 = shard(a, "a", nchunks = 4, overwrite = TRUE) - - expect_equal(nchunks(a3), 4) - expect_equal(nrow(a3), 100) -}) - - -teardown({ - fs::dir_delete(file.path(tempdir(), "tmp_shard.df")) -}) \ No newline at end of file diff --git a/tests/testthat/test-shardkey.r b/tests/testthat/test-shardkey.r deleted file mode 100644 index 846b81bb..00000000 --- a/tests/testthat/test-shardkey.r +++ /dev/null @@ -1,17 +0,0 @@ -context("test-shardkey") - -setup({ -}) - -test_that("testing shardkey", { - set.seed(1) - a = data.table(a = rep(1:10, 10), b = 1:100) - a = shard(a, "a", nchunks = 2, overwrite = TRUE, outdir=file.path(tempdir(), "tmp_shardkey.df")) - - expect_equal(shardkey(a), list(shardkey="a", shardchunks=2)) -}) - - -teardown({ - fs::dir_delete(file.path(tempdir(), "tmp_shardkey.df")) -}) \ No newline at end of file diff --git a/tests/testthat/test-srckeep.r b/tests/testthat/test-srckeep.r deleted file mode 100644 index 5b6fa370..00000000 --- 
a/tests/testthat/test-srckeep.r +++ /dev/null @@ -1,17 +0,0 @@ -context("test-keep") - -setup({ - b = data.frame(a = 51:150, b = 1:100) - as.disk.frame(b, file.path(tempdir(), "tmp_srckeep.df"), nchunks = 5, overwrite = TRUE) -}) - -test_that("testing srckeep", { - b = disk.frame(file.path(tempdir(), "tmp_srckeep.df")) - b1 = b %>% srckeep("a") - expect_equal(ncol(b1 %>% collect), 1) - expect_equal(colnames(b1 %>% collect), "a") -}) - -teardown({ - fs::dir_delete(file.path(tempdir(), "tmp_srckeep.df")) -}) \ No newline at end of file diff --git a/tests/testthat/test-tbl_vars.r b/tests/testthat/test-tbl_vars.r deleted file mode 100644 index 5cd00ffa..00000000 --- a/tests/testthat/test-tbl_vars.r +++ /dev/null @@ -1,16 +0,0 @@ -context("test-tbl_vars") - -setup({ -}) - -test_that("testing tbl_vars", { - a = data.table(a = rep(1:10, 10), b = 1:100) - a = shard(a, "a", nchunks = 2, overwrite = TRUE, outdir=file.path(tempdir(), "tmp_tbl_vars.df")) - - expect_setequal(tbl_vars(a), c("a","b")) -}) - - -teardown({ - fs::dir_delete(file.path(tempdir(), "tmp_tbl_vars.df")) -}) \ No newline at end of file diff --git a/tests/testthat/test-util.r b/tests/testthat/test-util.r deleted file mode 100644 index a9db9f37..00000000 --- a/tests/testthat/test-util.r +++ /dev/null @@ -1,8 +0,0 @@ -context("test-util") - -test_that("testing evalparseglue", { - x = 2 - y = 3 - expect_equal(evalparseglue("{x}+{y}"), 5) -}) - diff --git a/tests/testthat/test-write_disk.frame.R b/tests/testthat/test-write_disk.frame.R deleted file mode 100644 index c5650c81..00000000 --- a/tests/testthat/test-write_disk.frame.R +++ /dev/null @@ -1,39 +0,0 @@ -context("test-write_disk.frame") - -test_that("as.disk.frame works", { - ROWS = 1e3+11 - - tmp_write_disk.frame = tempfile() - tmp_write_disk.frame2 = tempfile() - - df = disk.frame:::gen_datatable_synthetic(ROWS) - dfdf <- as.disk.frame(df, tmp_write_disk.frame, overwrite = TRUE, nchunks = 5) - - a = dfdf %>% cmap(~{ - .x[1,] - }) %>% 
write_disk.frame(outdir = tmp_write_disk.frame2, overwrite = T) - - expect_equal(nrow(a), 5) - - fs::dir_delete(tmp_write_disk.frame) - fs::dir_delete(tmp_write_disk.frame2) -}) - -test_that("as.disk.frame fails if data frame has list-columns", { - df <- tibble::tibble("a" = c(1,2,3), "b" = list("a", "b", "c")) - expect_error(as.disk.frame(df, file.path(tempdir(), "tmp_write_disk.frame"), overwrite = TRUE, nchunks = 6)) -}) - -test_that("write_disk.frame shard works", { - mtcars_df = as.disk.frame( - mtcars, - outdir = file.path(tempdir(), "mt_shard_by_cyl"), - shardby = c("cyl","vs"), - nchunks = 3, - overwrite = TRUE) - - res = mtcars_df %>% collect_list - expect_equal(length(res), 3) - testthat::expect_type(res, "list") - -}) diff --git a/tests/testthat/test-zip_to_disk.frame.r b/tests/testthat/test-zip_to_disk.frame.r deleted file mode 100644 index b2e4c31e..00000000 --- a/tests/testthat/test-zip_to_disk.frame.r +++ /dev/null @@ -1,7 +0,0 @@ -context("test-zip_to_disk.frame") - -# TODO do some testing -test_that("testing zip_to_disk.frame", { - expect_true(TRUE) -}) - diff --git a/utils/build_utils.R b/utils/build_utils.R index b158a0bf..33103cf0 100644 --- a/utils/build_utils.R +++ b/utils/build_utils.R @@ -73,19 +73,19 @@ df_setup_vignette <- function(excl = "", strip_number = FALSE) { df_test <- function() { # rename tests - if(fs::dir_exists("tests_manual")) { - fs::dir_copy("tests_manual", "tests") - Sys.sleep(3) # allow enough time for it to happen - fs::dir_delete("tests_manual") - } + # if(fs::dir_exists("tests_manual")) { + # fs::dir_copy("tests_manual", "tests") + # Sys.sleep(3) # allow enough time for it to happen + # fs::dir_delete("tests_manual") + # } devtools::test() - if(fs::dir_exists("tests")) { - fs::dir_copy("tests", "tests_manual") - Sys.sleep(8) # allow enough time for it to happen - fs::dir_delete("tests") - } + # if(fs::dir_exists("tests")) { + # fs::dir_copy("tests", "tests_manual") + # Sys.sleep(8) # allow enough time for it to happen 
+ # fs::dir_delete("tests") + # } } df_build_vignettes_for_cran <- function() { diff --git a/vignettes/concepts.Rmd b/vignettes/concepts.Rmd deleted file mode 100644 index d320fd90..00000000 --- a/vignettes/concepts.Rmd +++ /dev/null @@ -1,69 +0,0 @@ ---- -title: "Key `{disk.frame}` concepts" -author: "ZJ" -output: rmarkdown::html_vignette -vignette: > - %\VignetteIndexEntry{Key disk.frame concepts} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -```{r, include = FALSE} -knitr::opts_chunk$set( - collapse = TRUE, - comment = "#>" -) -``` - -# Key `{disk.frame}` concepts -There are a number of concepts and terminologies that are useful to understand in order to use `disk.frame` effectively. - -## What is a `disk.frame` and what are chunks? - -A `disk.frame` is a folder containing [`fst`](https://www.fstpackage.org/) files named "1.fst", "2.fst", "3.fst" etc. Each of the ".fst" file is called a _chunk_. - -## Workers and parallelism - -Parallelism in `disk.frame` is achieved using the [`future` package](https://cran.r-project.org/package=future). When performing many tasks, `disk.frame` uses multiple workers, where each _worker_ is an R session, to perform the tasks in parallel. - -It is recommended that you should run the following immediately after `library(disk.frame)` to set-up multiple workers. For example: - -```r -library(disk.frame) -setup_disk.frame() - -# this will allow unlimited amount of data to be passed from worker to worker -options(future.globals.maxSize = Inf) -``` - -For example, suppose we wish to compute the number of rows for each chunk, we can clearly perform this simultaneously in parallel. The code to do that is - -```r -# use only one column is fastest -df[,.N, keep = "first_col"] -``` - -or equivalent using the `srckeep` function - -```r -# use only one column is fastest -srckeep(df, "first_col")[,.N, keep = "first_col"] -``` - -Say there are `n` chunks in `df`, and there are `m` workers. 
Then the first `m` chunks will run `chunk[,.N]` simultaneously. - -To see how many workers are at work, use -```r -# see how many workers are available for work -future::nbrOfWorkers() -``` - -## How `{disk.frame}` works - -When `df %>% some_fn %>% collect` is called. The `some_fn` is applied to each chunk of `df`. The collect will row-bind the results from `some_fn(chunk)`together if the returned value of `some_fn` is a data.frame, or it will return a `list` containing the results of `some_fn`. - -The session that receives these results is called the **main session**. In general, we should try to minimize the amount of data passed from the worker sessions back to the main session, because passing data around can be slow. - -Also, please note that there is no communication between the workers, except for workers passing data back to the main session. - - diff --git a/vignettes/convenience-features.Rmd b/vignettes/convenience-features.Rmd deleted file mode 100644 index 6269a97b..00000000 --- a/vignettes/convenience-features.Rmd +++ /dev/null @@ -1,54 +0,0 @@ ---- -title: "Convenience features" -author: "ZJ" -output: rmarkdown::html_vignette -vignette: > - %\VignetteIndexEntry{Convenience features} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -```{r, include = FALSE} -knitr::opts_chunk$set( - collapse = TRUE, - comment = "#>" -) -``` - -## Convenience Features - -### GUI for setting options - -I wanted to make `{disk.frame}` as easy to use as possible. I often forget what options are available to me. So I've made a GUI - -```r -setup_disk.frame(gui = TRUE) -``` -which opens up a Shiny app where the user can choose the options. 
- -### RStudio column name completion - -```r -library(disk.frame) -mtcars.df = as.disk.frame(mtcars) - -mtcars.df %>% - filter() -``` - -you can press tab in RStudio and it will show all column available - -### Insert ceremony/boilerplate into code in RStudio - -The below will insert the recommended ceremony code into your editor -```r -disk.frame::insert_ceremony() -``` -should insert - -```r -# this willl set disk.frame with multiple workers -setup_disk.frame() -# this will allow unlimited amount of data to be passed from worker to worker -options(future.globals.maxSize = Inf) -``` diff --git a/vignettes/data-table-syntax.Rmd b/vignettes/data-table-syntax.Rmd deleted file mode 100644 index e87ca88b..00000000 --- a/vignettes/data-table-syntax.Rmd +++ /dev/null @@ -1,78 +0,0 @@ ---- -title: "Using data.table syntax with disk.frame" -author: "ZJ" -output: rmarkdown::html_vignette -vignette: > - %\VignetteIndexEntry{Using data.table syntax} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -```{r, include = FALSE} -knitr::opts_chunk$set( - collapse = TRUE, - comment = "#>" -) -``` - -## `disk.frame` supports `data.table` syntax - - -```{r setup_data_table, cache=TRUE} -library(disk.frame) - -# set-up disk.frame to use multiple workers -if(interactive()) { - setup_disk.frame() - # highly recommended, however it is pun into interactive() for CRAN because - # change user options are not allowed on CRAN - options(future.globals.maxSize = Inf) -} else { - setup_disk.frame(2) -} - - -library(nycflights13) - -# create a disk.frame -flights.df = as.disk.frame(nycflights13::flights, outdir = file.path(tempdir(),"flights13"), overwrite = TRUE) -``` - -In the following example, I will use the `.N` from the `data.table` package to count the unique combinations `year` and `month` within each chunk. 
- -```{r ok, dependson='setup_data_table'} -library(data.table) -library(disk.frame) - -flights.df = disk.frame(file.path(tempdir(),"flights13")) - -names(flights.df) - -flights.df[,.N, .(year, month), keep = c("year", "month")] -``` - -All `data.table` syntax are supported. However, `disk.frame` adds the ability to load only those columns required for the analysis using the `keep =` option. In the above analysis, only the `year` and `month` variables are required and hence `keep = c("year", "month")` was used. - -Alternatively, we can use the `srckeep` function to achieve the same, e.g. - -```r -srckeep(flights.df, c("year", "month"))[,.N, .(year, month)] -``` - -### External variables are captured - -`disk.frame` sends the computation to background workers which are essentially distinct and separate R sessions. Typically, the variables that you have available in your current R session aren't visible in the other R sessions, but `disk.frame` uses the `future` package's variable detection abilities to figure out which variables are in use and then send them to the background workers so they have access to the variables as well. E.g. - -```{r var_detect, dependson='setup_data_table'} -y = 42 -some_fn <- function(x) x - - -flights.df[,some_fn(y)] -``` - -In the above example, neither `some_fn` nor `y` are defined in the background workers' environments, but `disk.frame` still manages to evaluate this code `flights.df[,some_fn(y)]`. 
- -```{r clean_up, include=FALSE} -fs::dir_delete(file.path(tempdir(),"flights13")) -``` \ No newline at end of file diff --git a/vignettes/glm.Rmd b/vignettes/glm.Rmd deleted file mode 100644 index 1d5fb418..00000000 --- a/vignettes/glm.Rmd +++ /dev/null @@ -1,111 +0,0 @@ ---- -title: "Generalized Linear Models (GLM) including logistic regression with disk.frame" -output: rmarkdown::html_vignette -vignette: > - %\VignetteIndexEntry{Generalized Linear Models (logistic regression etc) with disk.frame} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -```{r, include = FALSE} -knitr::opts_chunk$set( - collapse = TRUE, - comment = "#>" -) -``` - -```{r setup, cache=TRUE} -suppressPackageStartupMessages(library(disk.frame)) - -if(interactive()) { - setup_disk.frame() -} else { - # only use 1 work to pass CRAN check - setup_disk.frame(1) -} - -``` - -# GLMs - -### Prerequisites -In this article, we will assume you are familiar with Generalized Linear Models (GLMs). You are also expected to have basic working knowledge of {`disk.frame`}, see this [{`disk.frame`} Quick Start](http://diskframe.com/articles/intro-disk-frame.html). - -## Introduction -One can fit a GLM using the `glm` function. For example, - -```{r glm, cache=TRUE} -m = glm(dist ~ speed, data = cars) -``` - -would fit a linear model on the data `cars` with `dist` as the target and `speed` as the explanatory variable. You can inspect the results of the model fit using - -```{r, depeondson='glm'} -summary(m) -``` - -or if you have `{broom}` installed - -```{r, depeondson='glm'} -broom::tidy(m) -``` - -With {`disk.frame`}, you can run GLM `dfglm` function, where the `df` stands for `disk.frame` of course! 
-```{r dependson='setup'} -cars.df = as.disk.frame(cars) - -m = dfglm(dist ~ speed, cars.df) - -summary(m) - - -majorv = as.integer(version$major) -minorv = as.integer(strsplit(version$minor, ".", fixed=TRUE)[[1]][1]) - -if((majorv == 3) & (minorv >= 6)) { - broom::tidy(m) -} else { - # broom doesn't work in version < R3.6 because biglm does not work -} - -``` - -The syntax didn't change at all! You are able to enjoy the benefits of `disk.frame` when dealing with larger-than-RAM data. - -## Logistic regression -Logistic regression is one of the most commonly deployed machine learning (ML) models. It is often used to build binary classification models - -```{r dependson='setup'} -iris.df = as.disk.frame(iris) - -# fit a logistic regression model to predict Speciess == "setosa" using all variables -all_terms_except_species = setdiff(names(iris.df), "Species") -formula_rhs = paste0(all_terms_except_species, collapse = "+") - -formula = as.formula(paste("Species == 'versicolor' ~ ", formula_rhs)) - -iris_model = dfglm(formula , data = iris.df, family=binomial()) - -# iris_model = dfglm(Species == "setosa" ~ , data = iris.df, family=binomial()) - -summary(iris_model) - -majorv = as.integer(version$major) -minorv = as.integer(strsplit(version$minor, ".", fixed=TRUE)[[1]][1]) - -if((majorv == 3) & (minorv >= 6)) { - broom::tidy(iris_model) -} else { - # broom doesn't work in version < R3.6 because biglm does not work -} - -``` - -The arguments to the `dfglm` function are the same as the arguments to `biglm::bigglm` which are based on the `glm` function. Please check their documentations for other argument options. - -## Notes -`{disk.frame}` uses `{biglm}` and `{speedglm}` as the backend for GLMs. Unfortunately, neither package is managed on open-source platforms, so it's more difficult to contribute to them by making bug fixes and submitting bug reports. So bugs are likely to persists. There is an active effort on `disk.frame` to look for alternatives. 
Example of avenues to explore include tighter integration with `{keras}`, h2o, or Julia's OnlineStats.jl for model fit purposes. - -Another package for larger-than-RAM glm fitting, `{bigFastlm}`, has been taken off CRAN, it is managed on Github. - -Currently, parallel processing of GLM fit are not possible with {`disk.frame`}. diff --git a/vignettes/ingesting-data.Rmd b/vignettes/ingesting-data.Rmd deleted file mode 100644 index 541d870a..00000000 --- a/vignettes/ingesting-data.Rmd +++ /dev/null @@ -1,174 +0,0 @@ ---- -title: "Ingesting Data" -author: "ZJ" -output: rmarkdown::html_vignette -vignette: > - %\VignetteIndexEntry{Ingesting data including CSVs} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -```{r, include = FALSE} -knitr::opts_chunk$set( - collapse = TRUE, - comment = "#>" -) -``` - -# Ingesting Data - -One of the most important tasks to perform before using the `{disk.frame}` package is to make some `disk.frame`s! There are a few functions to help you do that. Before we do that, we set up the `{disk.frame}` as usual - -**Setting up** - -```r -library(disk.frame) - -# set-up disk.frame to use multiple workers -if(interactive()) { - setup_disk.frame() - # highly recommended, however it is pun into interactive() for CRAN because - # change user options are not allowed on CRAN - options(future.globals.maxSize = Inf) -} else { - setup_disk.frame(2) -} - -``` - -## Convert a `data.frame` to `disk.frame` -Firstly, there is `as.disk.frame()` which allows you to make a `disk.frame` from a `data.frame`, e.g. - -```r -flights.df = as.disk.frame(nycflights13::flights) -``` - -will convert the `nycflights13::flights` `data.frame` to a `disk.frame` somewhere in `tempdir()`. 
To find out the location of the `disk.frame` use: - -```r -attr(flights.df, "path") -``` - -You can also specify a location to output the `disk.frame` to using `outdir` - -```r -flights.df = as.disk.frame(nycflights13::flights, outdir = "some/path.df") -``` - -it is recommended that you use `.df` as the extension for a `disk.frame`, however this is not an enforced requirement. - -However, one of the reasons for `disk.frame` to exist is to handle larger-than-RAM files, hence `as.disk.frame` is not all that useful because it can only convert data that can fit into RAM. `disk.frame` comes with a couple more ways to create `disk.frame`. - -## Creating `disk.frame` from CSVs -The function `csv_to_disk.frame` can convert CSV files to `disk.frame`. The most basic usage is - -```r -some.df = csv_to_disk.frame("some/path.csv", outdir = "some.df") -``` - -this will convert the CSV file `"some/path.csv"` to a `disk.frame`. - -## Multiple CSV files - -However, sometimes we have multiple CSV files that you want to read in and row-bind into one large `disk.frame`. You can do so by supplying a vector of file paths e.g. from the result of `list.files` - -```r -some.df = csv_to_disk.frame(c("some/path/file1.csv", "some/path/file2.csv")) - -# or -some.df = csv_to_disk.frame(list.files("some/path")) -``` - -## Ingesting CSV files chunk-wise -The `csv_to_disk.frame(path, ...)` function reads the file located at `path` in full into RAM but sometimes the CSV file may be too large to read in one go, as that would require loading the whole file into RAM. In that case, you can read the files chunk-by-chunk by using the `in_chunk_size` argument which controls how many rows you read in per chunk - -```r -# to read in 1 million (=1e6) rows per chunk -csv_to_disk.frame(path, in_chunk_size = 1e6) -``` - -When `in_chunk_size` is specified, the input file is split into many smaller files using `bigreadr`'s split file functions. 
This is generally the fastest way to ingest large CSVs, as the split files can be processed in parallel using all CPU cores. But the disk space requirement is doubled because the split files are as large as the original file. If you run out of disk space, then you must clean R's temporary folder at `tempdir()` and choose another `chunk_reader` e.g. `csv_to_disk.frame(..., chunk_reader = "LaF")`. - -## Sharding -One of the most important aspects of `disk.frame` is sharding. One can shard a `disk.frame` at read time by using the `shardby` - -```r -csv_to_disk.frame(path, shardby = "id") -``` - -In the above case, all rows with the same `id` values will end up in the same chunk. - - -## Just-in-time transformation -Sometimes, one may wish to perform some transformation on the CSV before writing out to disk. One can use the `inmapfn` argument to do that. The `inmapfn` name comes from INput MAPping FuNction. The general usage pattern is as follows: - -```r -csv_to_disk.frame(file.path(tempdir(), "df.csv"), inmapfn = function(chunk) { - some_transformation(chunk) -}) -``` - -As a contrived example, suppose you wish to convert a string into date at read time: - -```r -df = data.frame(date_str = c("2019-01-02", "2019-01-02")) - -# write the data.frame -write.csv(df, file.path(tempdir(), "df.csv")) - - -# this would show that date_str is a string -str(collect(csv_to_disk.frame(file.path(tempdir(), "df.csv")))$date_str) -## chr [1:2] "2019-01-02" "2019-01-02" - -# this would show that date_str is a string -df = csv_to_disk.frame(file.path(tempdir(), "df.csv"), inmapfn = function(chunk) { - # convert to date_str to date format and store as "date" - chunk[, date := as.Date(date_str, "%Y-%m-%d")] - chunk[, date_str:=NULL] -}) - -str(collect(df)$date) -## Date[1:2], format: "2019-01-02" "2019-01-02" -``` - -## Reading CSVs from zip files -Often, CSV comes zipped in a zip files. 
You can use the `zip_to_disk.frame` to convert all CSVs within a zip file - -```r -zip_to_disk.frame(path_to_zip_file) -``` - -The arguments for `zip_to_disk.frame` are the same as `csv_to_disk.frame`'s. - - -## Using `add_chunk` - -What if the method of converting to a `disk.frame` isn't implemented in `disk.frame` yet? One can use some lower level constructs provided by `disk.frame` to create `disk.frame`s. For example, the `add_chunk` function can be used to add more chunks to a `disk.frame`, e.g. - -```r -a.df = disk.frame() # create an empty disk.frame -add_chunk(a.df, cars) # adds cars as chunk 1 -add_chunk(a.df, cars) # adds cars as chunk 2 -``` - -Another example of using `add_chunk` is via `readr`'s chunked read functions to create a delimited file reader - -```r -delimited_to_disk.frame <- function(file, outdir, ...) { - res.df = disk.frame(outdir, ...) - readr::read_delim_chunked(file, callback = function(chunk) { - add_chunk(res.df, chunk) - }, ...) - - res.df -} - -delimited_to_disk.frame(path, outdir = "some.df") -``` - -The above code uses `readr`'s `read_delim_chunked` function to read `file` and call `add_chunk`. The problem with this approach is that it is sequential in nature and hence is not able to take advantage of parallelism. - -## Exploiting the structure of a disk.frame - -Of course, a `disk.frame` is just a folder with many `fst` files named as `1.fst`, `2.fst` etc. So one can simply create these `fst` files and ensure they have the same variable names and put them in a folder. 
\ No newline at end of file diff --git a/vignettes/intro-disk-frame.Rmd b/vignettes/intro-disk-frame.Rmd deleted file mode 100644 index c3d364fb..00000000 --- a/vignettes/intro-disk-frame.Rmd +++ /dev/null @@ -1,378 +0,0 @@ ---- -title: "Quick Start: Basic Operations with nycflights13" -author: "ZJ" -output: rmarkdown::html_vignette -vignette: > - %\VignetteIndexEntry{Quick Start} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -```{r setup, include = FALSE} -suppressPackageStartupMessages(library(disk.frame)) -library(fst) -library(magrittr) -library(nycflights13) -library(dplyr) -library(data.table) - -# you need to run this for multi-worker support -# limit to 2 cores if not running interactively; most likely on CRAN -# set-up disk.frame to use multiple workers -if(interactive()) { - setup_disk.frame() - # highly recommended, however it is put inside interactive() for CRAN because - # changing user options is not allowed on CRAN - options(future.globals.maxSize = Inf) -} else { - setup_disk.frame(2) -} - - -knitr::opts_chunk$set( - eval = FALSE, - collapse = TRUE, - comment = "#>", - include = TRUE -) -``` - -# Quick Start - replicating dplyr's tutorial on nycflights13 - -The [`disk.frame` package](https://github.com/xiaodaigh/disk.frame) aims to be the answer to the question: how do I manipulate structured tabular data that doesn't fit into Random Access Memory (RAM)? - -In a nutshell, `disk.frame` makes use of two simple ideas: - -1) split up a larger-than-RAM dataset into chunks and store each chunk in a separate file inside a folder and -2) provide a convenient API to manipulate these chunks - -`disk.frame` performs a similar role to distributed systems such as Apache Spark, Python's Dask, and Julia's JuliaDB.jl for *medium data* which are datasets that are too large for RAM but not quite large enough to qualify as *big data*. 
- -In this tutorial, we introduce `disk.frame`, address some common questions, and replicate the [sparklyr data manipulation tutorial](https://spark.rstudio.com/dplyr/) using `disk.frame` constructs. - -## Installation -Simply run - -```r -install.packages("disk.frame") # when CRAN ready -``` -or - -```r -devtools::install_github("xiaodaigh/disk.frame") -``` - -## Set-up `disk.frame` -`disk.frame` works best if it can process multiple data chunks in parallel. The best way to set-up `disk.frame` so that each CPU core runs a background worker is by using - -```r -setup_disk.frame() - -# this will allow an unlimited amount of data to be passed from worker to worker -options(future.globals.maxSize = Inf) -``` - -The `setup_disk.frame()` function sets up background workers equal to the number of CPU cores available on your machine; please note that, by default, hyper-threaded cores are counted as one not two. - -Alternatively, one may specify the number of workers using `setup_disk.frame(workers = n)`. - -## Basic Data Operations with `disk.frame` - -The `disk.frame` package provides convenient functions to convert `data.frame`s and CSVs to `disk.frame`s. - -### Creating a `disk.frame` from `data.frame` -We convert a `data.frame` to `disk.frame` using the `as.disk.frame` function. - -```{r asdiskframe, cache=TRUE} -library(nycflights13) -library(dplyr) -library(disk.frame) -library(data.table) - -# convert the flights data to a disk.frame and store the disk.frame in the folder -# "tmp_flights.df" and overwrite any content if needed -flights.df <- as.disk.frame( - flights, - outdir = file.path(tempdir(), "tmp_flights.df"), - overwrite = TRUE) -flights.df -``` -You should now see a folder called `tmp_flights.df` with some files in it, namely `1.fst`, `2.fst`... where each `fst` file is one chunk of the `disk.frame`. 
- - -### Creating a `disk.frame` from CSV -```{r} -library(nycflights13) -# write a csv -csv_path = file.path(tempdir(), "tmp_flights.csv") -data.table::fwrite(flights, csv_path) - -# load the csv into a disk.frame -df_path = file.path(tempdir(), "tmp_flights.df") -flights.df <- csv_to_disk.frame( - csv_path, - outdir = df_path, - overwrite = T) - -flights.df -``` - -If the CSV is too large to read in, then we can also use the `in_chunk_size` option to control how many rows to read in at once. For example, to read in the data 100,000 rows at a time: - -```{r} -library(nycflights13) -library(disk.frame) - -# write a csv -csv_path = file.path(tempdir(), "tmp_flights.csv") - -data.table::fwrite(flights, csv_path) - -df_path = file.path(tempdir(), "tmp_flights.df") - -flights.df <- csv_to_disk.frame( - csv_path, - outdir = df_path, - in_chunk_size = 100000) - -flights.df -``` - -`disk.frame` also has a function `zip_to_disk.frame` that can convert every CSV in a zip file to a `disk.frame`. - -### Simple `dplyr` verbs and lazy evaluation -```{r dfselect, dependson='asdiskframe', cache=TRUE} -flights.df1 <- select(flights.df, year:day, arr_delay, dep_delay) -flights.df1 -``` - -```{r dependson='dfselect'} -class(flights.df1) -``` - -The class of `flights.df1` is also a `disk.frame` after the `dplyr::select` transformation. Also, `disk.frame` operations are by default (and where possible) **lazy**, meaning they don't perform the operations right away. Instead, these functions wait until you call `collect`. Exceptions to this rule are the `*_join` operations, which evaluate *eagerly* under certain conditions--see **Joins for disk.frame in-depth** for details. - -For lazily constructed `disk.frame`s (e.g. `flights.df1`), the function `collect` can be used to bring the results from disk into R, e.g. 
-```{r, dependson='dfselect'} -collect(flights.df1) %>% head(2) -``` - -Of course, for larger-than-RAM datasets, one wouldn't call `collect` on the whole `disk.frame` (because why would you need `disk.frame` otherwise). More likely, one would call `collect` on a `filter`ed dataset or one summarized with `group_by`. - -Some examples of other dplyr verbs applied: - -```{r, dependson='asdiskframe'} -filter(flights.df, dep_delay > 1000) %>% collect %>% head(2) -``` - -```{r, dependson='asdiskframe'} -mutate(flights.df, speed = distance / air_time * 60) %>% collect %>% head(2) -``` - -### Examples of NOT fully supported `dplyr` verbs - -The `chunk_arrange` function arranges (sorts) each chunk but not the whole dataset. So use with caution. Similarly, `chunk_summarise` creates summary variables within each chunk and hence also needs to be used with caution. In the **Group-by** section, we demonstrate how to use `summarise` in the `disk.frame` context correctly with `hard_group_by`s. - -```{r, dependson='asdiskframe'} -# this only sorts within each chunk -chunk_arrange(flights.df, dplyr::desc(dep_delay)) %>% collect %>% head(2) -``` - - -```{r, dependson='asdiskframe'} -chunk_summarize(flights.df, mean_dep_delay = mean(dep_delay, na.rm =T)) %>% collect -``` - -### Piping - -One can chain `dplyr` verbs together like with a `data.frame` - -```{r, dependson='asdiskframe'} -c4 <- flights %>% - filter(month == 5, day == 17, carrier %in% c('UA', 'WN', 'AA', 'DL')) %>% - select(carrier, dep_delay, air_time, distance) %>% - mutate(air_time_hours = air_time / 60) %>% - collect %>% - arrange(carrier)# arrange should occur after `collect` - -c4 %>% head -``` - -### List of supported `dplyr` verbs - -```r -select -rename -filter -chunk_arrange # within each chunk -chunk_group_by # within each chunk -chunk_summarize # within each chunk -group_by # limited functions -summarize # limited functions -mutate -transmute -left_join -inner_join -full_join # careful. Performance! 
-semi_join -anti_join -``` - -## Sharding and distribution of chunks - -Like other distributed data manipulation frameworks, `disk.frame` utilizes the *sharding* concept to distribute the data into chunks. For example, "to shard by `cust_id`" means that all rows with the same `cust_id` will be stored in the same chunk. This enables `chunk_group_by` by `cust_id` to produce the same results as non-chunked data. - -The `by` variables that were used to shard the dataset are called the `shardkey`s. The *sharding* is performed by computing a deterministic hash on the shard keys (the `by` variables) for each row. The hash function produces an integer between `1` and `n`, where `n` is the number of chunks. - -## Group-by - -`{disk.frame}` implements the `group_by` operation with some caveats. In the `{disk.frame}` framework, only a subset of functions are supported in `summarize`. However, the user can create more custom `group-by` functions on the fly. - -```{r, dependson='asdiskframe'} -flights.df %>% - group_by(carrier) %>% # notice that hard_group_by needs to be set - summarize(count = n(), mean_dep_delay = mean(dep_delay, na.rm=T)) %>% # mean follows normal R rules - collect %>% - arrange(carrier) -``` - -## Restrict input columns for faster processing - -One can restrict which input columns to load into memory for each chunk; this can significantly increase the speed of data processing. To restrict the input columns, use the `srckeep` function which only accepts column names as a string vector. - -```{r, dependson='asdiskframe'} -flights.df %>% - srckeep(c("carrier","dep_delay")) %>% - group_by(carrier) %>% - summarize(count = n(), mean_dep_delay = mean(dep_delay, na.rm=T)) %>% # mean follows normal R rules - collect -``` - -Input column restriction is one of the most critical efficiencies provided by `disk.frame`. Because the underlying format allows random access to columns (i.e. 
retrieve only the columns used for processing), hence one can drastically reduce the amount of data loaded into RAM for processing by keeping only those columns that are directly used to produce the results. - -## Joins - -`disk.frame` supports many dplyr joins including: - -```r -left_join -inner_join -semi_join -inner_join -full_join # requires hard_group_by on both left and right -``` -In all cases, the left dataset (`x`) must be a `disk.frame`, and the right dataset (`y`) can be either a `disk.frame` or a `data.frame`. If the right dataset is a `disk.frame` and the `shardkey`s are different between the two `disk.frame`s then two expensive `hard` `group_by` operations are performed *eagerly*, one on the left `disk.frame` and one on the right `disk.frame` to perform the joins correctly. - -However, if the right dataset is a `data.frame` then `hard_group_by`s are only performed in the case of `full_join`. - -Note `disk.frame` does not support `right_join`. The user should use `left_join` instead. - -The below joins are performed *lazily* because `airlines.dt` is a `data.table` not a `disk.frame`: - -```{r airlines_dt, dependson='asdiskframe', cache=TRUE} -# make airlines a data.table -airlines.dt <- data.table(airlines) -# flights %>% left_join(airlines, by = "carrier") # -flights.df %>% - left_join(airlines.dt, by ="carrier") %>% - collect %>% - head -``` - -```{r, dependson='airlines_dt'} -flights.df %>% - left_join(airlines.dt, by = c("carrier", "carrier")) %>% - collect %>% - tail -``` - -## Window functions and arbitrary functions - -`{disk.frame}` supports all `data.frame` operations, unlike Spark which can only perform those operations that Spark has implemented. Hence windowing functions like `min_rank` and `rank` are supported out of the box. - -For the following example, we will use the `hard_group_by` which performs a group-by and also reorganises the chunks so that all records with the same `year`, `month`, and `day` end up in the same chunk. 
This is typically not advised, as `hard_group_by` can be slow for large datasets. - -```{r, dependson='asdiskframe'} -# Find the most and least delayed flight each day -bestworst <- flights.df %>% - srckeep(c("year","month","day", "dep_delay")) %>% - hard_group_by(c("year", "month", "day")) %>% - filter(dep_delay == min(dep_delay, na.rm = T) || dep_delay == max(dep_delay, na.rm = T)) %>% - collect - -bestworst %>% head -``` - -Another example: - -```{r, dependson='asdiskframe'} -ranked <- flights.df %>% - srckeep(c("year","month","day", "dep_delay")) %>% - hard_group_by(c("year", "month", "day")) %>% - filter(min_rank(desc(dep_delay)) <= 2 & dep_delay > 0) %>% - collect - -ranked %>% head -``` - -One more example: - -```{r, dependson='asdiskframe'} -# Rank each flight within a daily window -ranked <- flights.df %>% - srckeep(c("year","month","day", "dep_delay")) %>% - chunk_group_by(year, month, day) %>% - select(dep_delay) %>% - mutate(rank = rank(desc(dep_delay))) %>% - collect - -ranked %>% head -``` - - -## Arbitrary by-chunk processing - -One can apply arbitrary transformations to each chunk of the `disk.frame` by using the `delayed` function which evaluates lazily or the `map.disk.frame(lazy = F)` function which evaluates eagerly. For example to return the number of rows in each chunk: - -```{r, dependson='asdiskframe'} -flights.df1 <- delayed(flights.df, ~nrow(.x)) -collect_list(flights.df1) %>% head # returns number of rows for each data.frame in a list -``` -and to do the same with `map.disk.frame`: - -```{r, dependson='asdiskframe'} -map(flights.df, ~nrow(.x), lazy = F) %>% head -``` -The `map` function can also output the results to another disk.frame folder, e.g. 
- -```{r, dependson='asdiskframe'} -# return the first 10 rows of each chunk -flights.df2 <- map(flights.df, ~.x[1:10,], lazy = F, outdir = file.path(tempdir(), "tmp2"), overwrite = T) - -flights.df2 %>% head -``` - -Notice `{disk.frame}` supports the `purrr` syntax for defining a function using `~`. - -## Sampling - -In the `disk.frame` framework, sampling a proportion of rows within each chunk can be performed using `sample_frac`. - -```{r, dependson='asdiskframe'} -flights.df %>% sample_frac(0.01) %>% collect %>% head -``` - -## Writing Data - -One can output a `disk.frame` by using the `write_disk.frame` function. E.g. - -```r -write_disk.frame(flights.df, outdir="out") -``` -this will output a disk.frame to the folder "out" - -```{r cleanup} -fs::dir_delete(file.path(tempdir(), "tmp_flights.df")) -fs::dir_delete(file.path(tempdir(), "tmp2")) -fs::file_delete(file.path(tempdir(), "tmp_flights.csv")) -```