update dfcor

xiaodaigh · May 27, 2021 · cd399f1 · cd399f1 · xiaodaigh · May 27, 2021
1 parent 4b2d50b
commit cd399f1
Show file tree

Hide file tree

Showing 10 changed files with 177 additions and 206 deletions.
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "DataConvenience"
 uuid = "3b531cbf-ee43-4e67-8118-dca2c9372f86"
 authors = ["Dai ZJ <[email protected]>"]
-version = "0.2.2"
+version = "0.3.0"
 
 [deps]
 CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"

diff --git a/README.jl b/README.jl
@@ -0,0 +1,65 @@
+using DataConvenience
+using DataFrames
+df = DataFrame(col = rand(1_000_000), col1 = rand(1_000_000), col2 = rand(1_000_000))
+
+fsort(df, :col) # sort by `:col`
+fsort(df, [:col1, :col2]) # sort by `:col1` and `:col2`
+fsort!(df, :col) # sort in-place by `:col`
+fsort!(df, [:col1, :col2]) # sort in-place by `:col1` and `:col2`
+
+
+df = DataFrame(col = rand(1_000_000), col1 = rand(1_000_000), col2 = rand(1_000_000))
+
+using BenchmarkTools
+fsort_1col = @belapsed fsort($df, :col) # sort by `:col`
+fsort_2col = @belapsed fsort($df, [:col1, :col2]) # sort by `:col1` and `:col2`
+
+sort_1col = @belapsed sort($df, :col) # sort by `:col`
+sort_2col = @belapsed sort($df, [:col1, :col2]) # sort by `:col1` and `:col2`
+
+using Plots
+bar(["DataFrames.sort 1 col","DataFrames.sort 2 col2", "DataCon.sort 1 col","DataCon.sort 2 col2"],
+    [sort_1col, sort_2col, fsort_1col, fsort_2col],
+    title="DataFrames sort performance comparison",
+    label = "seconds")
+
+
+using DataFrames
+using CSV
+
+df = DataFrame(a = rand(1_000_000), b = rand(Int8, 1_000_000), c = rand(Int8, 1_000_000))
+
+filepath = tempname()*".csv"
+CSV.write(filepath, df)
+
+for chunk in CsvChunkIterator(filepath)
+  print(describe(chunk))
+end
+
+
+# read all columns as String
+for chunk in CsvChunkIterator(filepath, type=String)
+    print(describe(chunk))
+end
+
+
+# read a three-column CSV where the column types are String, Int, Float32
+for chunk in CsvChunkIterator(filepath, types=[String, Int, Float32])
+  print(describe(chunk))
+end
+
+
+@replicate 10 8
+
+
+x = Vector{Union{Missing, Int}}(undef, 10_000_000)
+
+cmx = count_missing(x) # this is faster
+
+cmx2 = countmissing(x) # this is faster
+
+cimx = count(ismissing, x) # the way available in Base
+
+
+cmx == cimx # true
+
diff --git a/README.jmd b/README.jmd
@@ -4,26 +4,6 @@ An eclectic collection of convenience functions for your data manipulation needs
 
 ## Data
 
-### Piping Convenience
-
-#### Defining `filter(::AbstractDataFrame, arg)`
-DataFrames.jl does not define `filter(::AbstractDataFrame, arg)` and instead has `filter(arg, ::AbstractDataFrame)` only. This makes it inconsistent with the other functions so that's why I am defining `filter` with the signature `filter(::AbstractDataFrame, arg)`.
-
-#### Examples
-```julia
-using DataConvenience
-using DataFrames
-using Chain: @chain
-
-df = DataFrame(a=1:8)
-
-@chain df begin
-    filter(:a => ==(1))
-end
-```
-
-Note: DataConvenience.jl used to re-export Lazy.jl's `@>` which it no longer does. Users are encouraged to use [Chain.jl](https://github.com/jkrumbiegel/Chain.jl) instead.
-
 ### Sampling with `sample`
 
 You can conveniently sample a dataframe with the `sample` method
@@ -50,6 +30,7 @@ sample(df, 1//10)
 You can sort `DataFrame`s (in ascending order only) faster than the `sort` function by using the `fsort` function. E.g.
 
 ```julia
+using DataConvenience
 using DataFrames
 df = DataFrame(col = rand(1_000_000), col1 = rand(1_000_000), col2 = rand(1_000_000))
 

diff --git a/README.md b/README.md
@@ -4,38 +4,6 @@ An eclectic collection of convenience functions for your data manipulation needs
 
 ## Data
 
-### Piping Convenience
-
-#### Defining `filter(::AbstractDataFrame, arg)`
-DataFrames.jl does not define `filter(::AbstractDataFrame, arg)` and instead has `filter(arg, ::AbstractDataFrame)` only. This makes it inconsistent with the other functions so that's why I am defining `filter` with the signature `filter(::AbstractDataFrame, arg)`.
-
-#### Examples
-```julia
-using DataConvenience
-using DataFrames
-using Chain: @chain
-
-df = DataFrame(a=1:8)
-
-@chain df begin
-    filter(:a => ==(1))
-end
-```
-
-```
-1×1 DataFrame
- Row │ a
-     │ Int64
-─────┼───────
-   1 │     1
-```
-
-
-
-
-
-Note: DataConvenience.jl used to re-export Lazy.jl's `@>` which it no longer does. Users are encouraged to use [Chain.jl](https://github.com/jkrumbiegel/Chain.jl) instead.
-
 ### Sampling with `sample`
 
 You can conveniently sample a dataframe with the `sample` method
@@ -62,6 +30,7 @@ sample(df, 1//10)
 You can sort `DataFrame`s (in ascending order only) faster than the `sort` function by using the `fsort` function. E.g.
 
 ```julia
+using DataConvenience
 using DataFrames
 df = DataFrame(col = rand(1_000_000), col1 = rand(1_000_000), col2 = rand(1_000_000))
 
@@ -73,26 +42,26 @@ fsort!(df, [:col1, :col2]) # sort in-place by `:col1` and `:col2`
 
 ```
 1000000×3 DataFrame
-     Row │ col        col1        col2
-         │ Float64    Float64     Float64
-─────────┼──────────────────────────────────
-       1 │ 0.561204   7.28226e-7  0.364491
-       2 │ 0.552371   1.55213e-6  0.449652
-       3 │ 0.995762   2.64605e-6  0.024013
-       4 │ 0.601954   3.16072e-6  0.743319
-       5 │ 0.932321   6.11559e-6  0.190004
-       6 │ 0.147286   6.73857e-6  0.0394049
-       7 │ 0.722439   8.40162e-6  0.0565526
-       8 │ 0.358826   8.62958e-6  0.788989
-    ⋮    │     ⋮          ⋮           ⋮
-  999994 │ 0.79161    0.999993    0.312891
-  999995 │ 0.779757   0.999996    0.0197649
-  999996 │ 0.681739   0.999997    0.0685774
-  999997 │ 0.736364   0.999997    0.15211
-  999998 │ 0.259878   0.999997    0.480823
-  999999 │ 0.943275   0.999998    0.96846
- 1000000 │ 0.837561   0.999999    0.289213
-                         999985 rows omitted
+     Row │ col         col1        col2
+         │ Float64     Float64     Float64
+─────────┼───────────────────────────────────
+       1 │ 0.3708      7.98914e-7  0.0982182
+       2 │ 0.743345    8.62962e-7  0.609425
+       3 │ 0.379679    1.0321e-6   0.353734
+       4 │ 0.0357946   4.01304e-6  0.632459
+       5 │ 0.588126    4.32507e-6  0.439859
+       6 │ 0.706394    4.54834e-6  0.811462
+       7 │ 0.228183    4.76902e-6  0.0418427
+       8 │ 0.3761      5.15514e-6  0.163736
+    ⋮    │     ⋮           ⋮           ⋮
+  999994 │ 0.469715    0.999991    0.442478
+  999995 │ 0.971895    0.999992    0.637568
+  999996 │ 0.891238    0.999993    0.72935
+  999997 │ 0.404767    0.999993    0.905502
+  999998 │ 0.249169    0.999996    0.584482
+  999999 │ 0.784547    0.999997    0.362961
+ 1000000 │ 0.705492    1.0         0.296773
+                          999985 rows omitted
 ```
 
 
@@ -114,7 +83,7 @@ bar(["DataFrames.sort 1 col","DataFrames.sort 2 col2", "DataCon.sort 1 col","Dat
     label = "seconds")
 ```
 
-![](figures/README_3_1.png)
+![](figures/README_2_1.png)
 
 
 
@@ -142,18 +111,18 @@ end
 
 ```
 3×7 DataFrame
- Row │ variable  mean       min            median     max    nmissing  elty
-pe
-     │ Symbol    Float64    Real           Float64    Real   Int64     Data
-Type
+ Row │ variable  mean       min            median     max         nmissing 
+ eltype
+     │ Symbol    Float64    Real           Float64    Real        Int64    
+ DataType
 ─────┼─────────────────────────────────────────────────────────────────────
-─────
-   1 │ a          0.499336     1.48721e-6   0.499138    1.0         0  Floa
-t64
-   2 │ b         -0.495945  -128            0.0       127           0  Int6
-4
-   3 │ c         -0.574404  -128           -1.0       127           0  Int6
-4
+──────────
+   1 │ a          0.500112     9.77158e-7   0.500207    0.999999         0 
+ Float64
+   2 │ b         -0.446016  -128            0.0       127                0 
+ Int64
+   3 │ c         -0.667185  -128           -1.0       127                0 
+ Int64
 ```
 
 
@@ -177,8 +146,8 @@ end
       Int64     DataType
 ─────┼─────────────────────────────────────────────────────────────────────
 ─────────────────────────
-   1 │ a                  0.00010077562806376505           9.79725879761694
-8e-5         0  String
+   1 │ a                  0.00010057134141727708           9.77544678875119
+6e-5         0  String
    2 │ b                  -1                               99              
              0  String
    3 │ c                  -1                               99              
@@ -202,11 +171,11 @@ end
        Int64     DataType
 ─────┼─────────────────────────────────────────────────────────────────────
 ──────────────────────────
-   1 │ a                    0.00010077562806376505          9.7972587976169
-48e-5         0  String
-   2 │ b         -0.495945  -128                    0.0     127            
+   1 │ a                    0.00010057134141727708          9.7754467887511
+96e-5         0  String
+   2 │ b         -0.446016  -128                    0.0     127            
               0  Int64
-   3 │ c         -0.574404  -128.0                  -1.0    127.0          
+   3 │ c         -0.667185  -128.0                  -1.0    127.0          
               0  Float32
 ```
 

diff --git a/build-readme.jl b/build-readme.jl
@@ -2,8 +2,8 @@
 using Pkg
 cd("c:/git/DataConvenience/")
 Pkg.activate("c:/git/DataConvenience/readme-env")
-# Pkg.update()
 upcheck()
+# Pkg.update()
 
 using Weave
 
@@ -12,3 +12,12 @@ weave("README.jmd", out_path = :pwd, doctype = "github")
 if false
     tangle("README.jmd")
 end
+
+using DataFrames
+
+a = DataFrame(a=1:3)
+
+vscodedisplay(a)
+
+
+
diff --git a/figures/README_2_1.png b/figures/README_2_1.png
diff --git a/src/DataConvenience.jl b/src/DataConvenience.jl
@@ -1,10 +1,10 @@
 module DataConvenience
 
 import WeakRefStrings:StringVector
-using DataFrames: AbstractDataFrame, DataFrame, rename
+using DataFrames: AbstractDataFrame, DataFrame, rename, dropmissing
 using CategoricalArrays
 using Statistics
-using Missings:nonmissingtype
+using Missings: nonmissingtype
 
 import Statistics:cor
 export cor, dfcor, @replicate, StringVector
@@ -13,7 +13,7 @@ export cor, dfcor, @replicate, StringVector
 include("cate-arrays.jl")
 include("CCA.jl")
 include("janitor.jl")
-include("filter.jl")
+include("dfcor.jl")
 # include("replace_onehot.jl")
 include("create-missing.jl")
 include("read-csv-in-chunks.jl")
@@ -40,55 +40,4 @@ macro replicate(n, expr)
 end
 
 
-"""
-    cor(x::AbstractVector{Bool}, y)
-
-    cor(y, x::AbstractVector{Bool})
-
-Compute correlation between `Bool` and other types
-"""
-Statistics.cor(x::AbstractVector{Bool}, y::AbstractVector) = cor(y, Int.(x))
-Statistics.cor(x::AbstractVector{Union{Bool, Missing}}, y::AbstractVector) = cor(y, passmissing(Int).(x))
-
-"""
-    dfcor(df::AbstractDataFrame, cols1=names(df), cols2=names(df), verbose=false)
-
-Compute correlation in a DataFrames by specifying a set of columns `cols1` vs
-another set `cols2`. The cartesian product of `cols1` and `cols2`'s correlation
-will be computed
-"""
-dfcor(df::AbstractDataFrame, cols1 = names(df), cols2 = names(df); verbose=false) = begin
-    k = 1
-    l1 = length(cols1)
-    l2 = length(cols2)
-    res = Vector{Float32}(undef, l1*l2)
-    names1 = Vector{Symbol}(undef, l1*l2)
-    names2 = Vector{Symbol}(undef, l1*l2)
-    for i in 1:l1
-        icol = df[!, cols1[i]]
-
-        if eltype(icol) >: String
-            # do nothing
-        else
-            Threads.@threads for j in 1:l2
-                if eltype(df[!, cols2[j]]) >: String
-                    # do nothing
-                else
-                    if verbose
-                        println(k, " ", cols1[i], " ", cols2[j])
-                    end
-                    df2 = df[:,[cols1[i], cols2[j]]] |> dropmissing
-                    if size(df2, 1) > 0
-                        res[k] = cor(df2[!,1], df2[!, 2])
-                        names1[k] = cols1[i]
-                        names2[k] = cols2[j]
-                        k+=1
-                    end
-                end
-            end
-        end
-    end
-    (names1[1:k-1], names2[1:k-1], res[1:k-1])
-end
-
 end # module