diff --git a/DESCRIPTION b/DESCRIPTION index 457262cf..8c4e2bba 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -2,7 +2,7 @@ Type: Package Package: disk.frame Title: Larger-than-RAM Disk-Based Data Manipulation Framework Version: 0.3.5 -Date: 2020-03-01 +Date: 2020-03-21 Authors@R: c( person("Dai", "ZJ", email = "zhuojia.dai@gmail.com", role = c("aut", "cre")), person("Jacky", "Poon", role = c("ctb")) @@ -48,10 +48,11 @@ Suggests: biglmm, speedglm, broom, - ggplot2 + ggplot2, + tidyfast (>= 0.2.1) LinkingTo: Rcpp -RoxygenNote: 7.0.2 +RoxygenNote: 7.1.0 Encoding: UTF-8 URL: https://diskframe.com BugReports: https://github.com/xiaodaigh/disk.frame/issues diff --git a/NAMESPACE b/NAMESPACE index 6f3fa037..eefcea82 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -75,12 +75,17 @@ export(as.disk.frame) export(ceremony_text) export(chunk_arrange) export(chunk_distinct) +export(chunk_dt_count.disk.frame) +export(chunk_dt_fill) +export(chunk_dt_hoist) +export(chunk_dt_nest) +export(chunk_dt_uncount.disk.frame) +export(chunk_dt_unnest) export(chunk_group_by) export(chunk_group_by_all.disk.frame) export(chunk_group_by_at.disk.frame) export(chunk_group_by_if.disk.frame) export(chunk_lapply) -export(chunk_summarise) export(chunk_summarise_all) export(chunk_summarise_at) export(chunk_summarize) @@ -106,6 +111,7 @@ export(df_ram_size) export(dfglm) export(disk.frame) export(distribute) +export(dt_separate.disk.frame) export(evalparseglue) export(filter_all.disk.frame) export(filter_at.disk.frame) @@ -283,6 +289,13 @@ importFrom(stats,median) importFrom(stats,quantile) importFrom(stats,runif) importFrom(stringr,fixed) +importFrom(tidyfast,dt_count) +importFrom(tidyfast,dt_fill) +importFrom(tidyfast,dt_hoist) +importFrom(tidyfast,dt_nest) +importFrom(tidyfast,dt_separate) +importFrom(tidyfast,dt_uncount) +importFrom(tidyfast,dt_unnest) importFrom(utils,capture.output) importFrom(utils,head) importFrom(utils,memory.limit) diff --git a/R/chunk_mapper.r b/R/chunk_mapper.r index 2063252a..eea3a288 100644 --- a/R/chunk_mapper.r +++ b/R/chunk_mapper.r @@ -39,7 +39,7 @@ create_chunk_mapper <- function(chunk_fn, warning_msg = NULL, as.data.frame = TR warning(warning_msg) } - + browser() quo_dotdotdot = rlang::enquos(...) # this is designed to capture any global stuff @@ -47,7 +47,6 @@ create_chunk_mapper <- function(chunk_fn, warning_msg = NULL, as.data.frame = TR data_for_eval_tidy = force(vars_and_pkgs$globals) res = cmap(.data, ~{ - this_env = environment() if(length(data_for_eval_tidy) > 0) { @@ -86,4 +85,4 @@ create_chunk_mapper <- function(chunk_fn, warning_msg = NULL, as.data.frame = TR }, lazy = TRUE) } return_func -} \ No newline at end of file +} diff --git a/R/dplyr_verbs.r b/R/dplyr_verbs.r index 90abb981..a3beab21 100644 --- a/R/dplyr_verbs.r +++ b/R/dplyr_verbs.r @@ -123,61 +123,46 @@ add_tally.disk.frame <- create_chunk_mapper(dplyr::add_tally) #' @rdname chunk_group_by chunk_summarize <- create_chunk_mapper(dplyr::summarize) - -#' @export -#' @importFrom dplyr summarise -#' @rdname chunk_group_by -chunk_summarise <- create_chunk_mapper(dplyr::summarise) - - #' @export #' @rdname dplyr_verbs #' @importFrom dplyr do do.disk.frame <- create_chunk_mapper(dplyr::do) - #' @export #' @rdname dplyr_verbs #' @importFrom dplyr group_by_all chunk_group_by_all.disk.frame <- create_chunk_mapper(dplyr::group_by_all) - #' @export #' @rdname dplyr_verbs #' @importFrom dplyr group_by_at chunk_group_by_at.disk.frame <- create_chunk_mapper(dplyr::group_by_at) - #' @export #' @rdname dplyr_verbs #' @importFrom dplyr group_by_if chunk_group_by_if.disk.frame <- create_chunk_mapper(dplyr::group_by_if) - #' @export #' @rdname dplyr_verbs #' @importFrom dplyr mutate_all mutate_all.disk.frame <- create_chunk_mapper(dplyr::mutate_all) - #' @export #' @rdname dplyr_verbs #' @importFrom dplyr mutate_at mutate_at.disk.frame <- create_chunk_mapper(dplyr::mutate_at) - #' @export #' @rdname dplyr_verbs #' @importFrom dplyr mutate_if mutate_if.disk.frame <- create_chunk_mapper(dplyr::mutate_if) - #' @export #' @rdname dplyr_verbs #' @importFrom dplyr rename_all rename_all.disk.frame <- create_chunk_mapper(dplyr::rename_all) - #' @export #' @rdname dplyr_verbs #' @importFrom dplyr rename_at @@ -219,7 +204,6 @@ chunk_summarise_all <- create_chunk_mapper(dplyr::summarise_all) #' @importFrom dplyr summarise_at chunk_summarise_at <- create_chunk_mapper(dplyr::summarise_at) - #' @export #' @rdname dplyr_verbs #' @importFrom dplyr summarize_all @@ -280,7 +264,6 @@ chunk_ungroup = create_chunk_mapper(dplyr::ungroup) # do not introduce it as it was never introduced #ungroup.disk.frame( < - create_dplyr_mapper(dplyr::ungroup, , warning_msg="`ungroup.disk.frame` is now deprecated. Please use `chunk_ungroup` instead. This is in preparation for a more powerful `group_by` framework") - #' @export #' @rdname dplyr_verbs glimpse.disk.frame <- function(.data, ...) { diff --git a/R/tidyfast-verbs.r b/R/tidyfast-verbs.r new file mode 100644 index 00000000..b4883238 --- /dev/null +++ b/R/tidyfast-verbs.r @@ -0,0 +1,59 @@ +#' The tidy verbs implemented for disk.frame +#' @description +#' Please see the tidyfast document for their usage +#' @export +#' @importFrom tidyfast dt_count dt_uncount dt_hoist dt_nest dt_unnest dt_fill dt_separate +#' @param ... Same as the tidyfast functions +#' @param .data a disk.frame +#' @rdname tidyfast_verbs +#' @family tidyfast verbs +#' @examples +#' library(tidyfast) +#' library(data.table) +#' +#' #' create a disk.frame +#' disk.frame_to_split <- as.disk.frame(data.table( +#' x = paste(letters, LETTERS, sep = ".") +#' )) +#' +#' disk.frame_to_split %>% +#' dt_separate(x, into = c("lower", "upper")) %>% +#' collect +#' +#' #' clean up +#' delete(disk.frame_to_split) +chunk_dt_count <- create_chunk_mapper(tidyfast::dt_count, as.data.frame = FALSE) + +#' dt_count working on whole disk.frame +dt_count.disk.frame <- function(dt_, ..., na.rm = FALSE, wt = NULL) { + stop("ZJ: I was up to here, and I need better understanding of NSE. Why? + ifelse(is.null(wt), NULL, wt) is not going to work if wt is a column name") + + dt_ %>% + chunk_dt_count(..., na.rm = force(na.rm), wt = ifelse(is.null(wt), NULL, wt)) %>% + collect +} + +#' @rdname tidyfast_verbs +#' @export +chunk_dt_uncount <- create_chunk_mapper(tidyfast::dt_uncount, as.data.frame = FALSE) + +#' @rdname tidyfast_verbs +#' @export +chunk_dt_unnest = create_chunk_mapper(tidyfast::dt_unnest, as.data.frame = FALSE) + +#' @rdname tidyfast_verbs +#' @export +chunk_dt_nest = create_chunk_mapper(tidyfast::dt_nest, as.data.frame = FALSE) + +#' @rdname tidyfast_verbs +#' @export +chunk_dt_hoist = create_chunk_mapper(tidyfast::dt_hoist, as.data.frame = FALSE) + +#' @rdname tidyfast_verbs +#' @export +chunk_dt_fill = create_chunk_mapper(tidyfast::dt_fill, as.data.frame = FALSE) + +#' @rdname tidyfast_verbs +#' @export +dt_separate.disk.frame = create_chunk_mapper(tidyfast::dt_separate, as.data.frame = FALSE) diff --git a/README.Rmd b/README.Rmd index 0c1d02ce..bcd3e1a0 100644 --- a/README.Rmd +++ b/README.Rmd @@ -327,3 +327,7 @@ Do you wish to give back the open-source community in non-financial ways? Here a [![](http://cranlogs.r-pkg.org/badges/grand-total/disk.frame)](https://cran.r-project.org/package=disk.frame) [![Travis build status](https://travis-ci.org/xiaodaigh/disk.frame.svg?branch=master)](https://travis-ci.org/xiaodaigh/disk.frame) [![AppVeyor build status](https://ci.appveyor.com/api/projects/status/github/xiaodaigh/disk.frame?branch=master&svg=true)](https://ci.appveyor.com/project/xiaodaigh/disk.frame) + +## Live Stream of `{disk.frame}` development + +* https://www.youtube.com/playlist?list=PL3DVdT3kym4fIU5CO-pxKtWhdjMVn4XGe diff --git a/README.md b/README.md index 95e17ce0..a43cfe2a 100644 --- a/README.md +++ b/README.md @@ -217,12 +217,15 @@ flights.df %>% filter(year == 2013) %>% mutate(origin_dest = paste0(origin, dest)) %>% head(2) -#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum -#> 1 2013 1 1 517 515 2 830 819 11 UA 1545 N14228 -#> 2 2013 1 1 533 529 4 850 830 20 UA 1714 N24211 -#> origin dest air_time distance hour minute time_hour origin_dest -#> 1 EWR IAH 227 1400 5 15 2013-01-01 05:00:00 EWRIAH -#> 2 LGA IAH 227 1416 5 29 2013-01-01 05:00:00 LGAIAH +#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time +#> 1 2013 1 1 517 515 2 830 819 +#> 2 2013 1 1 533 529 4 850 830 +#> arr_delay carrier flight tailnum origin dest air_time distance hour minute +#> 1 11 UA 1545 N14228 EWR IAH 227 1400 5 15 +#> 2 20 UA 1714 N24211 LGA IAH 227 1416 5 29 +#> time_hour origin_dest +#> 1 2013-01-01 05:00:00 EWRIAH +#> 2 2013-01-01 05:00:00 LGAIAH ``` ### Group-by @@ -279,7 +282,6 @@ obtained using estimated methods. ``` r library(data.table) -#> data.table 1.12.8 using 6 threads (see ?getDTthreads). Latest news: r-datatable.com #> #> Attaching package: 'data.table' #> The following object is masked from 'package:purrr': @@ -296,6 +298,30 @@ grp_by_stage1 = .(sum_dist = sum(distance)), .(qtr = ifelse(month <= 3, "Q1", "Q2")) ] +#> Warning in serialize(data, node$con): 'package:stats' may not be available when +#> loading +#> Warning in serialize(data, node$con): 'package:data.table' may not be available +#> when loading +#> Warning in serialize(data, node$con): 'package:stats' may not be available when +#> loading +#> Warning in serialize(data, node$con): 'package:data.table' may not be available +#> when loading +#> Warning in serialize(data, node$con): 'package:stats' may not be available when +#> loading +#> Warning in serialize(data, node$con): 'package:data.table' may not be available +#> when loading +#> Warning in serialize(data, node$con): 'package:stats' may not be available when +#> loading +#> Warning in serialize(data, node$con): 'package:data.table' may not be available +#> when loading +#> Warning in serialize(data, node$con): 'package:stats' may not be available when +#> loading +#> Warning in serialize(data, node$con): 'package:data.table' may not be available +#> when loading +#> Warning in serialize(data, node$con): 'package:stats' may not be available when +#> loading +#> Warning in serialize(data, node$con): 'package:data.table' may not be available +#> when loading grp_by_stage1 #> qtr sum_dist @@ -326,7 +352,7 @@ To find out where the disk.frame is stored on disk: ``` r # where is the disk.frame stored attr(flights.df, "path") -#> [1] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\Rtmpa6R05d\\file1b086cec36c7.df" +#> [1] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpOeAro4\\file17a0150634fd.df" ``` A number of data.frame functions are implemented for disk.frame @@ -334,19 +360,23 @@ A number of data.frame functions are implemented for disk.frame ``` r # get first few rows head(flights.df, 1) -#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum -#> 1: 2013 1 1 517 515 2 830 819 11 UA 1545 N14228 -#> origin dest air_time distance hour minute time_hour -#> 1: EWR IAH 227 1400 5 15 2013-01-01 05:00:00 +#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time +#> 1: 2013 1 1 517 515 2 830 819 +#> arr_delay carrier flight tailnum origin dest air_time distance hour minute +#> 1: 11 UA 1545 N14228 EWR IAH 227 1400 5 15 +#> time_hour +#> 1: 2013-01-01 05:00:00 ``` ``` r # get last few rows tail(flights.df, 1) -#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum -#> 1: 2013 9 30 NA 840 NA NA 1020 NA MQ 3531 N839MQ -#> origin dest air_time distance hour minute time_hour -#> 1: LGA RDU NA 431 8 40 2013-09-30 08:00:00 +#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time +#> 1: 2013 9 30 NA 840 NA NA 1020 +#> arr_delay carrier flight tailnum origin dest air_time distance hour minute +#> 1: NA MQ 3531 N839MQ LGA RDU NA 431 8 40 +#> time_hour +#> 1: 2013-09-30 08:00:00 ``` ``` r @@ -455,3 +485,7 @@ ways? Here are some ways you can contribute status](https://travis-ci.org/xiaodaigh/disk.frame.svg?branch=master)](https://travis-ci.org/xiaodaigh/disk.frame) [![AppVeyor build status](https://ci.appveyor.com/api/projects/status/github/xiaodaigh/disk.frame?branch=master&svg=true)](https://ci.appveyor.com/project/xiaodaigh/disk.frame) + +## Live Stream of `{disk.frame}` development + + - diff --git a/man/chunk_group_by.Rd b/man/chunk_group_by.Rd index f06836c2..9a54e6f3 100644 --- a/man/chunk_group_by.Rd +++ b/man/chunk_group_by.Rd @@ -2,15 +2,12 @@ % Please edit documentation in R/dplyr_verbs.r \name{chunk_summarize} \alias{chunk_summarize} -\alias{chunk_summarise} \alias{chunk_group_by} \alias{chunk_ungroup} \title{Group by within each disk.frame} \usage{ chunk_summarize(.data, ...) -chunk_summarise(.data, ...) - chunk_group_by(.data, ...) chunk_ungroup(.data, ...) diff --git a/man/tidyfast_verbs.Rd b/man/tidyfast_verbs.Rd new file mode 100644 index 00000000..977cbcf3 --- /dev/null +++ b/man/tidyfast_verbs.Rd @@ -0,0 +1,51 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/tidyfast-verbs.r +\name{chunk_dt_count.disk.frame} +\alias{chunk_dt_count.disk.frame} +\alias{chunk_dt_uncount.disk.frame} +\alias{chunk_dt_unnest} +\alias{chunk_dt_nest} +\alias{chunk_dt_hoist} +\alias{chunk_dt_fill} +\alias{dt_separate.disk.frame} +\title{The tidy verbs implemented for disk.frame} +\usage{ +chunk_dt_count.disk.frame(.data, ...) + +chunk_dt_uncount.disk.frame(.data, ...) + +chunk_dt_unnest(.data, ...) + +chunk_dt_nest(.data, ...) + +chunk_dt_hoist(.data, ...) + +chunk_dt_fill(.data, ...) + +dt_separate.disk.frame(.data, ...) +} +\arguments{ +\item{.data}{a disk.frame} + +\item{...}{Same as the tidyfast functions} +} +\description{ +Please see the tidyfast document for their usage +} +\examples{ +library(tidyfast) +library(data.table) + +#' create a disk.frame +disk.frame_to_split <- as.disk.frame(data.table( + x = paste(letters, LETTERS, sep = ".") +)) + +disk.frame_to_split \%>\% + dt_separate(x, into = c("lower", "upper")) \%>\% + collect + +#' clean up +delete(disk.frame_to_split) +} +\concept{tidyfast verbs}