Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: adding support for tidyfast #220

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ Type: Package
Package: disk.frame
Title: Larger-than-RAM Disk-Based Data Manipulation Framework
Version: 0.3.5
Date: 2020-03-01
Date: 2020-03-21
Authors@R: c(
person("Dai", "ZJ", email = "[email protected]", role = c("aut", "cre")),
person("Jacky", "Poon", role = c("ctb"))
Expand Down Expand Up @@ -48,10 +48,11 @@ Suggests:
biglmm,
speedglm,
broom,
ggplot2
ggplot2,
tidyfast (>= 0.2.1)
LinkingTo:
Rcpp
RoxygenNote: 7.0.2
RoxygenNote: 7.1.0
Encoding: UTF-8
URL: https://diskframe.com
BugReports: https://github.com/xiaodaigh/disk.frame/issues
15 changes: 14 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -75,12 +75,17 @@ export(as.disk.frame)
export(ceremony_text)
export(chunk_arrange)
export(chunk_distinct)
export(chunk_dt_count)
export(chunk_dt_fill)
export(chunk_dt_hoist)
export(chunk_dt_nest)
export(chunk_dt_uncount)
export(chunk_dt_unnest)
export(chunk_group_by)
export(chunk_group_by_all.disk.frame)
export(chunk_group_by_at.disk.frame)
export(chunk_group_by_if.disk.frame)
export(chunk_lapply)
export(chunk_summarise)
export(chunk_summarise_all)
export(chunk_summarise_at)
export(chunk_summarize)
Expand All @@ -106,6 +111,7 @@ export(df_ram_size)
export(dfglm)
export(disk.frame)
export(distribute)
export(dt_separate.disk.frame)
export(evalparseglue)
export(filter_all.disk.frame)
export(filter_at.disk.frame)
Expand Down Expand Up @@ -283,6 +289,13 @@ importFrom(stats,median)
importFrom(stats,quantile)
importFrom(stats,runif)
importFrom(stringr,fixed)
importFrom(tidyfast,dt_count)
importFrom(tidyfast,dt_fill)
importFrom(tidyfast,dt_hoist)
importFrom(tidyfast,dt_nest)
importFrom(tidyfast,dt_separate)
importFrom(tidyfast,dt_uncount)
importFrom(tidyfast,dt_unnest)
importFrom(utils,capture.output)
importFrom(utils,head)
importFrom(utils,memory.limit)
Expand Down
5 changes: 2 additions & 3 deletions R/chunk_mapper.r
Original file line number Diff line number Diff line change
Expand Up @@ -39,15 +39,14 @@ create_chunk_mapper <- function(chunk_fn, warning_msg = NULL, as.data.frame = TR
warning(warning_msg)
}


browser()
quo_dotdotdot = rlang::enquos(...)

# this is designed to capture any global stuff
vars_and_pkgs = future::getGlobalsAndPackages(quo_dotdotdot)
data_for_eval_tidy = force(vars_and_pkgs$globals)

res = cmap(.data, ~{

this_env = environment()

if(length(data_for_eval_tidy) > 0) {
Expand Down Expand Up @@ -86,4 +85,4 @@ create_chunk_mapper <- function(chunk_fn, warning_msg = NULL, as.data.frame = TR
}, lazy = TRUE)
}
return_func
}
}
17 changes: 0 additions & 17 deletions R/dplyr_verbs.r
Original file line number Diff line number Diff line change
Expand Up @@ -123,61 +123,46 @@ add_tally.disk.frame <- create_chunk_mapper(dplyr::add_tally)
#' @rdname chunk_group_by
chunk_summarize <- create_chunk_mapper(dplyr::summarize)


# disk.frame wrapper around dplyr::summarise, built with create_chunk_mapper().
#' @rdname chunk_group_by
#' @importFrom dplyr summarise
#' @export
chunk_summarise <- create_chunk_mapper(chunk_fn = dplyr::summarise)


# disk.frame wrapper around dplyr::do, built with create_chunk_mapper().
#' @rdname dplyr_verbs
#' @importFrom dplyr do
#' @export
do.disk.frame <- create_chunk_mapper(chunk_fn = dplyr::do)


# disk.frame wrapper around dplyr::group_by_all, built with
# create_chunk_mapper().
#' @rdname dplyr_verbs
#' @importFrom dplyr group_by_all
#' @export
chunk_group_by_all.disk.frame <- create_chunk_mapper(
  chunk_fn = dplyr::group_by_all
)


# disk.frame wrapper around dplyr::group_by_at, built with
# create_chunk_mapper().
#' @rdname dplyr_verbs
#' @importFrom dplyr group_by_at
#' @export
chunk_group_by_at.disk.frame <- create_chunk_mapper(
  chunk_fn = dplyr::group_by_at
)


# disk.frame wrapper around dplyr::group_by_if, built with
# create_chunk_mapper().
#' @rdname dplyr_verbs
#' @importFrom dplyr group_by_if
#' @export
chunk_group_by_if.disk.frame <- create_chunk_mapper(
  chunk_fn = dplyr::group_by_if
)


# disk.frame wrapper around dplyr::mutate_all, built with
# create_chunk_mapper().
#' @rdname dplyr_verbs
#' @importFrom dplyr mutate_all
#' @export
mutate_all.disk.frame <- create_chunk_mapper(chunk_fn = dplyr::mutate_all)


# disk.frame wrapper around dplyr::mutate_at, built with
# create_chunk_mapper().
#' @rdname dplyr_verbs
#' @importFrom dplyr mutate_at
#' @export
mutate_at.disk.frame <- create_chunk_mapper(chunk_fn = dplyr::mutate_at)


# disk.frame wrapper around dplyr::mutate_if, built with
# create_chunk_mapper().
#' @rdname dplyr_verbs
#' @importFrom dplyr mutate_if
#' @export
mutate_if.disk.frame <- create_chunk_mapper(chunk_fn = dplyr::mutate_if)


# disk.frame wrapper around dplyr::rename_all, built with
# create_chunk_mapper().
#' @rdname dplyr_verbs
#' @importFrom dplyr rename_all
#' @export
rename_all.disk.frame <- create_chunk_mapper(chunk_fn = dplyr::rename_all)


#' @export
#' @rdname dplyr_verbs
#' @importFrom dplyr rename_at
Expand Down Expand Up @@ -219,7 +204,6 @@ chunk_summarise_all <- create_chunk_mapper(dplyr::summarise_all)
#' @importFrom dplyr summarise_at
chunk_summarise_at <- create_chunk_mapper(dplyr::summarise_at)


#' @export
#' @rdname dplyr_verbs
#' @importFrom dplyr summarize_all
Expand Down Expand Up @@ -280,7 +264,6 @@ chunk_ungroup = create_chunk_mapper(dplyr::ungroup)
# do not introduce it as it was never introduced
#ungroup.disk.frame( < - create_dplyr_mapper(dplyr::ungroup, , warning_msg="`ungroup.disk.frame` is now deprecated. Please use `chunk_ungroup` instead. This is in preparation for a more powerful `group_by` framework")


#' @export
#' @rdname dplyr_verbs
glimpse.disk.frame <- function(.data, ...) {
Expand Down
59 changes: 59 additions & 0 deletions R/tidyfast-verbs.r
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
#' The tidyfast verbs implemented for disk.frame
#'
#' @description
#' Each verb is produced by passing the matching tidyfast function to
#' `create_chunk_mapper()`. Please see the tidyfast documentation for their
#' usage.
#' @param .data a disk.frame
#' @param ... Same as the tidyfast functions
#' @importFrom tidyfast dt_count dt_uncount dt_hoist dt_nest dt_unnest dt_fill dt_separate
#' @family tidyfast verbs
#' @rdname tidyfast_verbs
#' @export
#' @examples
#' library(tidyfast)
#' library(data.table)
#'
#' # create a disk.frame
#' disk.frame_to_split <- as.disk.frame(data.table(
#'   x = paste(letters, LETTERS, sep = ".")
#' ))
#'
#' disk.frame_to_split %>%
#'   dt_separate(x, into = c("lower", "upper")) %>%
#'   collect
#'
#' # clean up
#' delete(disk.frame_to_split)
chunk_dt_count <- create_chunk_mapper(
  chunk_fn = tidyfast::dt_count,
  as.data.frame = FALSE
)

#' dt_count working on whole disk.frame
#'
#' Work in progress: the body currently stops unconditionally with a note from
#' the author, so the pipeline written below the `stop()` is never reached.
#'
#' @param dt_ presumably a disk.frame (piped into `chunk_dt_count`)
#' @param ... forwarded unchanged to `chunk_dt_count`
#' @param na.rm forwarded to `chunk_dt_count` as `na.rm`
#' @param wt forwarded to `chunk_dt_count` as `wt`; see the notes in the body
dt_count.disk.frame <- function(dt_, ..., na.rm = FALSE, wt = NULL) {
# Placeholder: every call fails here with the author's reminder that the
# forwarding of `wt` below still has to be worked out.
stop("ZJ: I was up to here, and I need better understanding of NSE. Why?
ifelse(is.null(wt), NULL, wt) is not going to work if wt is a column name")

# Unreachable while the stop() above is in place.
# Intended flow: run chunk_dt_count on dt_, then collect the result.
# NOTE(review): `is.null(wt)` / `ifelse(..., wt)` evaluate `wt` in this
# function, so (as the message above says) a bare column name passed as `wt`
# will not work here.
# NOTE(review): when `wt` is NULL this asks ifelse() to return NULL, which
# base R ifelse() is not expected to support -- confirm before enabling.
dt_ %>%
chunk_dt_count(..., na.rm = force(na.rm), wt = ifelse(is.null(wt), NULL, wt)) %>%
collect
}

# disk.frame wrapper around tidyfast::dt_uncount, built with
# create_chunk_mapper().
#' @export
#' @rdname tidyfast_verbs
chunk_dt_uncount <- create_chunk_mapper(
  chunk_fn = tidyfast::dt_uncount,
  as.data.frame = FALSE
)

# disk.frame wrapper around tidyfast::dt_unnest, built with
# create_chunk_mapper().
#' @export
#' @rdname tidyfast_verbs
chunk_dt_unnest <- create_chunk_mapper(
  chunk_fn = tidyfast::dt_unnest,
  as.data.frame = FALSE
)

# disk.frame wrapper around tidyfast::dt_nest, built with
# create_chunk_mapper().
#' @export
#' @rdname tidyfast_verbs
chunk_dt_nest <- create_chunk_mapper(
  chunk_fn = tidyfast::dt_nest,
  as.data.frame = FALSE
)

# disk.frame wrapper around tidyfast::dt_hoist, built with
# create_chunk_mapper().
#' @export
#' @rdname tidyfast_verbs
chunk_dt_hoist <- create_chunk_mapper(
  chunk_fn = tidyfast::dt_hoist,
  as.data.frame = FALSE
)

# disk.frame wrapper around tidyfast::dt_fill, built with
# create_chunk_mapper().
#' @export
#' @rdname tidyfast_verbs
chunk_dt_fill <- create_chunk_mapper(
  chunk_fn = tidyfast::dt_fill,
  as.data.frame = FALSE
)

# disk.frame wrapper around tidyfast::dt_separate, built with
# create_chunk_mapper().
# NOTE(review): the name follows the S3 method pattern `generic.class`;
# confirm that `dt_separate()` actually dispatches to it for a disk.frame.
#' @export
#' @rdname tidyfast_verbs
dt_separate.disk.frame <- create_chunk_mapper(
  chunk_fn = tidyfast::dt_separate,
  as.data.frame = FALSE
)
4 changes: 4 additions & 0 deletions README.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -327,3 +327,7 @@ Do you wish to give back the open-source community in non-financial ways? Here a
[![](http://cranlogs.r-pkg.org/badges/grand-total/disk.frame)](https://cran.r-project.org/package=disk.frame)
[![Travis build status](https://travis-ci.org/xiaodaigh/disk.frame.svg?branch=master)](https://travis-ci.org/xiaodaigh/disk.frame)
[![AppVeyor build status](https://ci.appveyor.com/api/projects/status/github/xiaodaigh/disk.frame?branch=master&svg=true)](https://ci.appveyor.com/project/xiaodaigh/disk.frame)

## Live Stream of `{disk.frame}` development

* https://www.youtube.com/playlist?list=PL3DVdT3kym4fIU5CO-pxKtWhdjMVn4XGe
66 changes: 50 additions & 16 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -217,12 +217,15 @@ flights.df %>%
filter(year == 2013) %>%
mutate(origin_dest = paste0(origin, dest)) %>%
head(2)
#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum
#> 1 2013 1 1 517 515 2 830 819 11 UA 1545 N14228
#> 2 2013 1 1 533 529 4 850 830 20 UA 1714 N24211
#> origin dest air_time distance hour minute time_hour origin_dest
#> 1 EWR IAH 227 1400 5 15 2013-01-01 05:00:00 EWRIAH
#> 2 LGA IAH 227 1416 5 29 2013-01-01 05:00:00 LGAIAH
#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
#> 1 2013 1 1 517 515 2 830 819
#> 2 2013 1 1 533 529 4 850 830
#> arr_delay carrier flight tailnum origin dest air_time distance hour minute
#> 1 11 UA 1545 N14228 EWR IAH 227 1400 5 15
#> 2 20 UA 1714 N24211 LGA IAH 227 1416 5 29
#> time_hour origin_dest
#> 1 2013-01-01 05:00:00 EWRIAH
#> 2 2013-01-01 05:00:00 LGAIAH
```

### Group-by
Expand Down Expand Up @@ -279,7 +282,6 @@ obtained using estimated methods.

``` r
library(data.table)
#> data.table 1.12.8 using 6 threads (see ?getDTthreads). Latest news: r-datatable.com
#>
#> Attaching package: 'data.table'
#> The following object is masked from 'package:purrr':
Expand All @@ -296,6 +298,30 @@ grp_by_stage1 =
.(sum_dist = sum(distance)),
.(qtr = ifelse(month <= 3, "Q1", "Q2"))
]
#> Warning in serialize(data, node$con): 'package:stats' may not be available when
#> loading
#> Warning in serialize(data, node$con): 'package:data.table' may not be available
#> when loading
#> Warning in serialize(data, node$con): 'package:stats' may not be available when
#> loading
#> Warning in serialize(data, node$con): 'package:data.table' may not be available
#> when loading
#> Warning in serialize(data, node$con): 'package:stats' may not be available when
#> loading
#> Warning in serialize(data, node$con): 'package:data.table' may not be available
#> when loading
#> Warning in serialize(data, node$con): 'package:stats' may not be available when
#> loading
#> Warning in serialize(data, node$con): 'package:data.table' may not be available
#> when loading
#> Warning in serialize(data, node$con): 'package:stats' may not be available when
#> loading
#> Warning in serialize(data, node$con): 'package:data.table' may not be available
#> when loading
#> Warning in serialize(data, node$con): 'package:stats' may not be available when
#> loading
#> Warning in serialize(data, node$con): 'package:data.table' may not be available
#> when loading

grp_by_stage1
#> qtr sum_dist
Expand Down Expand Up @@ -326,27 +352,31 @@ To find out where the disk.frame is stored on disk:
``` r
# where is the disk.frame stored
attr(flights.df, "path")
#> [1] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\Rtmpa6R05d\\file1b086cec36c7.df"
#> [1] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpOeAro4\\file17a0150634fd.df"
```

A number of data.frame functions are implemented for disk.frame

``` r
# get first few rows
head(flights.df, 1)
#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum
#> 1: 2013 1 1 517 515 2 830 819 11 UA 1545 N14228
#> origin dest air_time distance hour minute time_hour
#> 1: EWR IAH 227 1400 5 15 2013-01-01 05:00:00
#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
#> 1: 2013 1 1 517 515 2 830 819
#> arr_delay carrier flight tailnum origin dest air_time distance hour minute
#> 1: 11 UA 1545 N14228 EWR IAH 227 1400 5 15
#> time_hour
#> 1: 2013-01-01 05:00:00
```

``` r
# get last few rows
tail(flights.df, 1)
#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum
#> 1: 2013 9 30 NA 840 NA NA 1020 NA MQ 3531 N839MQ
#> origin dest air_time distance hour minute time_hour
#> 1: LGA RDU NA 431 8 40 2013-09-30 08:00:00
#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
#> 1: 2013 9 30 NA 840 NA NA 1020
#> arr_delay carrier flight tailnum origin dest air_time distance hour minute
#> 1: NA MQ 3531 N839MQ LGA RDU NA 431 8 40
#> time_hour
#> 1: 2013-09-30 08:00:00
```

``` r
Expand Down Expand Up @@ -455,3 +485,7 @@ ways? Here are some ways you can contribute
status](https://travis-ci.org/xiaodaigh/disk.frame.svg?branch=master)](https://travis-ci.org/xiaodaigh/disk.frame)
[![AppVeyor build
status](https://ci.appveyor.com/api/projects/status/github/xiaodaigh/disk.frame?branch=master&svg=true)](https://ci.appveyor.com/project/xiaodaigh/disk.frame)

## Live Stream of `{disk.frame}` development

- <https://www.youtube.com/playlist?list=PL3DVdT3kym4fIU5CO-pxKtWhdjMVn4XGe>
3 changes: 0 additions & 3 deletions man/chunk_group_by.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading