Merge pull request #367 from xiaodaigh/development

Development
DiskFrame · Jan 24, 2022 · b522999 · b522999
2 parents 3ae85d4 + 9f1ebf6
commit b522999
Show file tree

Hide file tree

Showing 170 changed files with 7,050 additions and 13,706 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -64,4 +64,5 @@ vignettes.asis.template
 vignettes.Rnw.template
 ^codecov\.yml$
 new-nse-dev.r
-test-poorman.R
+test-poorman.R
+\.parquet$
diff --git a/CRAN-RELEASE b/CRAN-RELEASE
@@ -1,2 +1,2 @@
-This package was submitted to CRAN on 2021-02-13.
-Once it is accepted, delete this file and tag the release (commit f7dd3db).
+This package was submitted to CRAN on 2021-03-12.
+Once it is accepted, delete this file and tag the release (commit 34bafaa).
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,8 +1,8 @@
 Type: Package
 Package: disk.frame
 Title: Larger-than-RAM Disk-Based Data Manipulation Framework
-Version: 0.4.0
-Date: 2021-02-11
+Version: 0.5.0
+Date: 2021-05-09
 Authors@R: c(
   person("Dai", "ZJ", email = "[email protected]", role = c("aut", "cre")),
   person("Jacky", "Poon", role = c("ctb"))

diff --git a/NAMESPACE b/NAMESPACE
@@ -67,7 +67,6 @@ S3method(transmute,disk.frame)
 export(IQR_df.chunk_agg.disk.frame)
 export(IQR_df.collected_agg.disk.frame)
 export(add_chunk)
-export(add_count.disk.frame)
 export(add_tally.disk.frame)
 export(all_df.chunk_agg.disk.frame)
 export(all_df.collected_agg.disk.frame)
@@ -176,7 +175,6 @@ importFrom(data.table,setDT)
 importFrom(data.table,setkey)
 importFrom(data.table,setkeyv)
 importFrom(data.table,timetaken)
-importFrom(dplyr,add_count)
 importFrom(dplyr,add_tally)
 importFrom(dplyr,anti_join)
 importFrom(dplyr,arrange)

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,9 @@
+# disk.frame 0.5
+* removed `add_count` method
+
+# disk.frame 0.4.1
+* removed use of `sysctl` which was violating CRAN policy
+
 # disk.frame 0.4.0
 * Removed `count` and `tally`
 * Fixed package compatibility

diff --git a/R/dplyr_verbs.r b/R/dplyr_verbs.r
@@ -70,10 +70,10 @@ chunk_arrange <- create_chunk_mapper(dplyr::arrange)
 # TODO alot of these .disk.frame functions are not generic
 
 
-#' @export
-#' @importFrom dplyr add_count
-#' @rdname dplyr_verbs
-add_count.disk.frame <- create_chunk_mapper(dplyr::add_count)
+# #' @export
+# #' @importFrom dplyr add_count
+# #' @rdname dplyr_verbs
+# add_count.disk.frame <- create_chunk_mapper(dplyr::add_count)
 
 
 #' @export

diff --git a/R/recommend_nchunks.r b/R/recommend_nchunks.r
@@ -91,16 +91,18 @@ df_ram_size <- function() {
         }
       } 
     } else {
-      os = R.version$os
-      if (length(grep("^darwin", os))) {
-        a = substring(system("sysctl hw.memsize", intern = TRUE), 13)
-      } #else {
+      #os = R.version$os
+      #if (length(grep("^darwin", os))) {
+        #a = substring(system("sysctl hw.memsize", intern = TRUE), 13)
+        # the above is not allowed by CRAN
+      #} #else {
         # This would work but is not allowed by CRAN
         #a = system('grep MemTotal /proc/meminfo', intern = TRUE)
       #}
-      l = strsplit(a, " ")[[1]]
-      l = as.numeric(l[length(l)-1])
-      ram_size = l/1024^2
+      #l = strsplit(a, " ")[[1]]
+      #l = as.numeric(l[length(l)-1])
+      #ram_size = l/1024^2
+      ram_size = 16 # to be conservative
     } 
 
     if(is.null(ram_size)) {

diff --git a/README.Rmd b/README.Rmd
@@ -250,7 +250,7 @@ ncol(flights.df)
 
 ## Hex logo
 
-![disk.frame logo](inst/figures/logo.png?raw=true)
+![disk.frame logo](inst/figures/logo.png)
 
 ## Contributors
 
@@ -283,6 +283,7 @@ The work priorities at this stage are
 | [深入对比数据科学工具箱：Python3 和 R 之争(2020版)](https://segmentfault.com/a/1190000021653567) | Chinese  | Harry Zhu       | 2020-02-16 | Mentions disk.frame                                          |
 
 
+
 ### Interested in learning `{disk.frame}` in a structured course?
 
 Please register your interest at:

diff --git a/README.md b/README.md
@@ -211,15 +211,12 @@ flights.df %>%
   filter(year == 2013) %>% 
   mutate(origin_dest = paste0(origin, dest)) %>% 
   head(2)
-#>   year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay
-#> 1 2013     1   1      517            515         2      830            819        11
-#> 2 2013     1   1      533            529         4      850            830        20
-#>   carrier flight tailnum origin dest air_time distance hour minute           time_hour
-#> 1      UA   1545  N14228    EWR  IAH      227     1400    5     15 2013-01-01 05:00:00
-#> 2      UA   1714  N24211    LGA  IAH      227     1416    5     29 2013-01-01 05:00:00
-#>   origin_dest
-#> 1      EWRIAH
-#> 2      LGAIAH
+#>   year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier
+#> 1 2013     1   1      517            515         2      830            819        11      UA
+#> 2 2013     1   1      533            529         4      850            830        20      UA
+#>   flight tailnum origin dest air_time distance hour minute           time_hour origin_dest
+#> 1   1545  N14228    EWR  IAH      227     1400    5     15 2013-01-01 05:00:00      EWRIAH
+#> 2   1714  N24211    LGA  IAH      227     1416    5     29 2013-01-01 05:00:00      LGAIAH
 ```
 
 ### Group-by
@@ -276,15 +273,6 @@ obtained using estimated methods.
 
 ``` r
 library(data.table)
-#> data.table 1.13.6 using 6 threads (see ?getDTthreads).  Latest news: r-datatable.com
-#> 
-#> Attaching package: 'data.table'
-#> The following object is masked from 'package:purrr':
-#> 
-#>     transpose
-#> The following objects are masked from 'package:dplyr':
-#> 
-#>     between, first, last
 
 suppressWarnings(
   grp_by_stage1 <- 
@@ -325,27 +313,27 @@ To find out where the disk.frame is stored on disk:
 ``` r
 # where is the disk.frame stored
 attr(flights.df, "path")
-#> [1] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\Rtmpk3aGAr\\file3adc78655410.df"
+#> [1] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpIlXNzn\\file568813b835a7.df"
 ```
 
 A number of data.frame functions are implemented for disk.frame
 
 ``` r
 # get first few rows
 head(flights.df, 1)
-#>    year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay
-#> 1: 2013     1   1      517            515         2      830            819        11
-#>    carrier flight tailnum origin dest air_time distance hour minute           time_hour
-#> 1:      UA   1545  N14228    EWR  IAH      227     1400    5     15 2013-01-01 05:00:00
+#>    year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier
+#> 1: 2013     1   1      517            515         2      830            819        11      UA
+#>    flight tailnum origin dest air_time distance hour minute           time_hour
+#> 1:   1545  N14228    EWR  IAH      227     1400    5     15 2013-01-01 05:00:00
 ```
 
 ``` r
 # get last few rows
 tail(flights.df, 1)
-#>    year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay
-#> 1: 2013     9  30       NA            840        NA       NA           1020        NA
-#>    carrier flight tailnum origin dest air_time distance hour minute           time_hour
-#> 1:      MQ   3531  N839MQ    LGA  RDU       NA      431    8     40 2013-09-30 08:00:00
+#>    year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier
+#> 1: 2013     9  30       NA            840        NA       NA           1020        NA      MQ
+#>    flight tailnum origin dest air_time distance hour minute           time_hour
+#> 1:   3531  N839MQ    LGA  RDU       NA      431    8     40 2013-09-30 08:00:00
 ```
 
 ``` r
@@ -362,7 +350,7 @@ ncol(flights.df)
 
 ## Hex logo
 
-![disk.frame logo](inst/figures/logo.png?raw=true)
+![disk.frame logo](inst/figures/logo.png)
 
 ## Contributors
 
@@ -456,11 +444,3 @@ ways? Here are some ways you can contribute
 
 [![](https://cranlogs.r-pkg.org/badges/disk.frame)](https://cran.r-project.org/package=disk.frame)
 [![](http://cranlogs.r-pkg.org/badges/grand-total/disk.frame)](https://cran.r-project.org/package=disk.frame)
-[![Travis build
-status](https://travis-ci.org/xiaodaigh/disk.frame.svg?branch=master)](https://travis-ci.org/xiaodaigh/disk.frame)
-[![AppVeyor build
-status](https://ci.appveyor.com/api/projects/status/github/xiaodaigh/disk.frame?branch=master&svg=true)](https://ci.appveyor.com/project/xiaodaigh/disk.frame)
-
-## Live Stream of `{disk.frame}` development
-
--   <https://www.youtube.com/playlist?list=PL3DVdT3kym4fIU5CO-pxKtWhdjMVn4XGe>
diff --git a/book/01-intro.Rmd b/book/01-intro.Rmd
@@ -3,7 +3,7 @@ title: "Preface - The birth of `disk.frame`"
 author: "ZJ"
 output: rmarkdown::html_vignette
 vignette: >
-  %\VignetteIndexEntry{preface}
+  %\VignetteIndexEntry{Preface - The birth of `disk.frame`}
   %\VignetteEngine{knitr::rmarkdown}
   %\VignetteEncoding{UTF-8}
 ---

diff --git a/book/06-vs-dask-juliadb.Rmd b/book/06-vs-dask-juliadb.Rmd
@@ -3,7 +3,7 @@ title: "Benchmarks 1: disk.frame beats Dask! disk.frame beats JuliaDB! Anyone el
 author: "ZJ"
 output: rmarkdown::html_vignette
 vignette: >
-  %\VignetteIndexEntry{benchmark-1}
+  %\VignetteIndexEntry{Benchmarks 1: disk.frame beats Dask! disk.frame beats JuliaDB! Anyone else wanna challenge?}
   %\VignetteEngine{knitr::rmarkdown}
   %\VignetteEncoding{UTF-8}
 ---

diff --git a/book/10-group-by.Rmd b/book/10-group-by.Rmd
@@ -58,22 +58,23 @@ It is important to note that not all functions that can run in `dplyr::summarize
 
 If a function you need/like is missing, please make a feature request [here](https://github.com/xiaodaigh/disk.frame/issues). It is a limitation that functions that depend on the order of a column can only be obtained using estimated methods.
 
-| Function | Exact/Estimate | Notes |
-| -- | -- | -- |
-| `min` | Exact |  |
-| `max` | Exact |  |
-| `mean` | Exact |  |
-| `sum` | Exact |  |
-| `length` | Exact |  |
-| `n` | Exact |  |
-| `n_distinct` | Exact |  |
-| `sd` | Exact |  |
-| `var` | Exact | `var(x)` only `cor, cov` support *planned*  |
-| `any` | Exact |  |
-| `all` | Exact |  |
-| `median` | Estimate |  |
-| `quantile` | Estimate | One quantile only |
-| `IQR` | Estimate |  |
+| Function     | Exact/Estimate | Notes                                      |
+|--------------|----------------|--------------------------------------------|
+| `min`        | Exact          |                                            |
+| `max`        | Exact          |                                            |
+| `mean`       | Exact          |                                            |
+| `sum`        | Exact          |                                            |
+| `length`     | Exact          |                                            |
+| `n`          | Exact          |                                            |
+| `n_distinct` | Exact          |                                            |
+| `sd`         | Exact          |                                            |
+| `var`        | Exact          | `var(x)` only `cor, cov` support *planned* |
+| `any`        | Exact          |                                            |
+| `all`        | Exact          |                                            |
+| `median`     | Estimate       |                                            |
+| `quantile`   | Estimate       | One quantile only                          |
+| `IQR`        | Estimate       |                                            |
+
 
 ### Notes on One-Stage group-by
 

diff --git a/book/88-trouble-shooting.Rmd b/book/88-trouble-shooting.Rmd
@@ -0,0 +1,53 @@
+---
+title: "Troubleshooting"
+author: "ZJ"
+output: pdf_document
+---
+
+```{r include=FALSE}
+knitr::opts_chunk$set(
+  collapse = TRUE,
+  comment = "#>",
+  eval=TRUE,
+  include=TRUE
+)
+```
+
+### Steps to troubleshoot
+
+1. I suggest updating {future} and your R version if you have not already done so.
+
+2. Are you able to share the data?
+
+3. Create a good minimal working example (MWE), for example:
+```
+library(disk.frame)
+setup_disk.frame()
+
+df<-as.disk.frame(a)
+
+
+df1 = mutate(df, date = as.Date(as.character(datadate), format="%Y%m%d"))
+
+head(df1)
+```
+
+
+4. Check if your virus scanner is blocking inter-process communication.
+
+5. Try to apply the function to just one chunk; perhaps there is a syntax error or a column error? If one chunk works, then you can rule out a coding error.
+
+```
+get_chunk(df, 1) %>%
+  mutate(date = as.Date(as.character(datadate), format="%Y%m%d"))
+```
+
+6. Set the number of workers to 1, so there is no more inter-process communication. Does it work now? If it does, then the problem is the inter-process communication, and you might need to contact your admin for help.
+
+```
+setup_disk.frame(workers=1)
+mutate(df, date = as.Date(as.character(datadate), format="%Y%m%d"))
+# As an MWE, this works for me:
+
+a = data.frame(datadate = rep("20201007", 3e6))
+```
diff --git a/cran-comments.md b/cran-comments.md
@@ -1,11 +1,11 @@
-## Submission for v0.4.0
-*  Fixed recently reported warnings
+## Submission for v0.5.0
+*  Fixed issue in CRAN check but needed to update version to follow semver conventions
 
 ## Test environments
-* local Windows 10 Pro install, R 4.0.3
-* local Windows 10 Pro install, R devel (as of 2021-02-11)
-* local Linux/Ubuntu install, R 4.0.3
-* local Linux/Ubuntu install, R devel (as of 2021-02-11)
+* local Windows 10 Pro install, R 4.0.5
+* local Windows 10 Pro install, R devel (as of 2021-05-09)
+* local Linux/Ubuntu install, R 4.0.5
+* local Linux/Ubuntu install, R devel (as of 2021-05-09)
 
 ## R CMD check results
 There were no ERRORs nor WARNINGs nor NOTE when run locally.