From 39fcd6a999898d0a8288c86a0be072e827b63d5d Mon Sep 17 00:00:00 2001 From: David Little Date: Tue, 28 Jun 2022 17:04:37 -0400 Subject: [PATCH] Initial setup (#1) Initial implementation of `interval_join`, `groupby_interval_join` and `quantile_windows`. --- .github/workflows/CI.yml | 39 ++++ .github/workflows/CompatHelper.yml | 16 ++ .github/workflows/TagBot.yml | 17 ++ .github/workflows/style.yml | 35 +++ Manifest.toml | 307 +++++++++++++++++++++++++ Project.toml | 29 +++ README.md | 72 +++++- docs/Project.toml | 3 + docs/make.jl | 14 ++ docs/src/index.md | 14 ++ examples/.gitkeep | 0 format/Manifest.toml | 194 ++++++++++++++++ format/Project.toml | 5 + format/run.jl | 20 ++ src/DataFrameIntervals.jl | 350 +++++++++++++++++++++++++++++ test/runtests.jl | 99 ++++++++ 16 files changed, 1212 insertions(+), 2 deletions(-) create mode 100644 .github/workflows/CI.yml create mode 100644 .github/workflows/CompatHelper.yml create mode 100644 .github/workflows/TagBot.yml create mode 100644 .github/workflows/style.yml create mode 100644 Manifest.toml create mode 100644 Project.toml create mode 100644 docs/Project.toml create mode 100644 docs/make.jl create mode 100644 docs/src/index.md create mode 100644 examples/.gitkeep create mode 100644 format/Manifest.toml create mode 100644 format/Project.toml create mode 100644 format/run.jl create mode 100644 src/DataFrameIntervals.jl create mode 100644 test/runtests.jl diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml new file mode 100644 index 0000000..a01c33e --- /dev/null +++ b/.github/workflows/CI.yml @@ -0,0 +1,39 @@ +name: CI +on: + push: + branches: + tags: '*' + pull_request: +concurrency: + # Skip intermediate builds: always. + # Cancel intermediate builds: only if it is a pull request build. 
+ group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }} +jobs: + test: + name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + version: + - '1.6' + - '1' + - 'nightly' + os: + - ubuntu-latest + arch: + - x64 + steps: + - uses: actions/checkout@v2 + - uses: julia-actions/setup-julia@v1 + with: + version: ${{ matrix.version }} + arch: ${{ matrix.arch }} + - uses: julia-actions/cache@v1 + - uses: julia-actions/julia-buildpkg@v1 + - uses: julia-actions/julia-runtest@v1 + - uses: julia-actions/julia-processcoverage@v1 + - uses: codecov/codecov-action@v2 + with: + files: lcov.info diff --git a/.github/workflows/CompatHelper.yml b/.github/workflows/CompatHelper.yml new file mode 100644 index 0000000..cba9134 --- /dev/null +++ b/.github/workflows/CompatHelper.yml @@ -0,0 +1,16 @@ +name: CompatHelper +on: + schedule: + - cron: 0 0 * * * + workflow_dispatch: +jobs: + CompatHelper: + runs-on: ubuntu-latest + steps: + - name: Pkg.add("CompatHelper") + run: julia -e 'using Pkg; Pkg.add("CompatHelper")' + - name: CompatHelper.main() + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + COMPATHELPER_PRIV: ${{ secrets.DOCUMENTER_KEY }} + run: julia -e 'using CompatHelper; CompatHelper.main()' diff --git a/.github/workflows/TagBot.yml b/.github/workflows/TagBot.yml new file mode 100644 index 0000000..897cb29 --- /dev/null +++ b/.github/workflows/TagBot.yml @@ -0,0 +1,17 @@ +name: TagBot +on: + issue_comment: + types: + - created + workflow_dispatch: +jobs: + TagBot: + if: github.event_name == 'workflow_dispatch' || github.actor == 'beacon-buddy' + runs-on: ubuntu-latest + steps: + - uses: JuliaRegistries/TagBot@v1 + with: + token: ${{ secrets.GITHUB_TOKEN }} + ssh: ${{ secrets.DOCUMENTER_KEY }} + registry: beacon-biosignals/BeaconRegistry + registry_ssh: ${{ secrets.BEACON_REGISTRY_RO_SSH_KEY }} diff --git 
a/.github/workflows/style.yml b/.github/workflows/style.yml new file mode 100644 index 0000000..c3c1f95 --- /dev/null +++ b/.github/workflows/style.yml @@ -0,0 +1,35 @@ +name: Style-Enforcer +on: + push: + branches: + - 'main' + tags: '*' + pull_request: + types: [opened, synchronize, reopened, ready_for_review] + # note: keep in sync with `format/run.jl` + paths-ignore: + - 'README.md' + - '.gitignore' +jobs: + format-check: + name: Style Enforcement (Julia ${{ matrix.julia-version }} - ${{ github.event_name }}) + # Run on push's or non-draft PRs + if: (github.event_name == 'push') || (github.event.pull_request.draft == false) + runs-on: ubuntu-latest + strategy: + matrix: + julia-version: [1.7] + steps: + - uses: julia-actions/setup-julia@latest + with: + version: ${{ matrix.julia-version }} + - uses: actions/checkout@v1 + - name: Instantiate `format` environment and format + run: | + julia --project=format -e 'using Pkg; Pkg.instantiate()' + julia --project=format 'format/run.jl' + - uses: reviewdog/action-suggester@v1 + if: github.event_name == 'pull_request' + with: + tool_name: JuliaFormatter + fail_on_error: true diff --git a/Manifest.toml b/Manifest.toml new file mode 100644 index 0000000..8c60f2c --- /dev/null +++ b/Manifest.toml @@ -0,0 +1,307 @@ +# This file is machine-generated - editing it directly is not advised + +[[ArgTools]] +uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" + +[[Artifacts]] +uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" + +[[Base64]] +uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" + +[[Compat]] +deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] +git-tree-sha1 = "9be8be1d8a6f44b96482c8af52238ea7987da3e3" +uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" +version = "3.45.0" + +[[CompilerSupportLibraries_jll]] 
+deps = ["Artifacts", "Libdl"] +uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" + +[[Crayons]] +git-tree-sha1 = "249fe38abf76d48563e2f4556bebd215aa317e15" +uuid = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f" +version = "4.1.1" + +[[DataAPI]] +git-tree-sha1 = "fb5f5316dd3fd4c5e7c30a24d50643b73e37cd40" +uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" +version = "1.10.0" + +[[DataFrames]] +deps = ["Compat", "DataAPI", "Future", "InvertedIndices", "IteratorInterfaceExtensions", "LinearAlgebra", "Markdown", "Missings", "PooledArrays", "PrettyTables", "Printf", "REPL", "Reexport", "SortingAlgorithms", "Statistics", "TableTraits", "Tables", "Unicode"] +git-tree-sha1 = "daa21eb85147f72e41f6352a57fccea377e310a9" +uuid = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" +version = "1.3.4" + +[[DataStructures]] +deps = ["Compat", "InteractiveUtils", "OrderedCollections"] +git-tree-sha1 = "d1fff3a548102f48987a52a2e0d114fa97d730f0" +uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" +version = "0.18.13" + +[[DataValueInterfaces]] +git-tree-sha1 = "bfc1187b79289637fa0ef6d4436ebdfe6905cbd6" +uuid = "e2d170a0-9d28-54be-80f0-106bbe20a464" +version = "1.0.0" + +[[Dates]] +deps = ["Printf"] +uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" + +[[DelimitedFiles]] +deps = ["Mmap"] +uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab" + +[[Distributed]] +deps = ["Random", "Serialization", "Sockets"] +uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" + +[[Downloads]] +deps = ["ArgTools", "FileWatching", "LibCURL", "NetworkOptions"] +uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" + +[[ExprTools]] +git-tree-sha1 = "56559bbef6ca5ea0c0818fa5c90320398a6fbf8d" +uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" +version = "0.1.8" + +[[FileWatching]] +uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee" + +[[Formatting]] +deps = ["Printf"] +git-tree-sha1 = "8339d61043228fdd3eb658d86c926cb282ae72a8" +uuid = "59287772-0a20-5a39-b81b-1366585eb4c0" +version = "0.4.2" + +[[Future]] +deps = ["Random"] +uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820" + 
+[[InlineStrings]] +deps = ["Parsers"] +git-tree-sha1 = "61feba885fac3a407465726d0c330b3055df897f" +uuid = "842dd82b-1e85-43dc-bf29-5d0ee9dffc48" +version = "1.1.2" + +[[InteractiveUtils]] +deps = ["Markdown"] +uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" + +[[Intervals]] +deps = ["Dates", "Printf", "RecipesBase", "Serialization", "TimeZones"] +git-tree-sha1 = "c6a1efca1968f7f6a572510756c3df6741c6b9f0" +repo-rev = "rf/intervalset-type" +repo-url = "https://github.com/invenia/Intervals.jl" +uuid = "d8418881-c3e1-53bb-8760-2df7ec849ed5" +version = "1.8.0" + +[[InvertedIndices]] +git-tree-sha1 = "bee5f1ef5bf65df56bdd2e40447590b272a5471f" +uuid = "41ab1584-1d38-5bbf-9106-f11c6c58b48f" +version = "1.1.0" + +[[IteratorInterfaceExtensions]] +git-tree-sha1 = "a3f24677c21f5bbe9d2a714f95dcd58337fb2856" +uuid = "82899510-4779-5014-852e-03e436cf321d" +version = "1.0.0" + +[[LazyArtifacts]] +deps = ["Artifacts", "Pkg"] +uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3" + +[[LibCURL]] +deps = ["LibCURL_jll", "MozillaCACerts_jll"] +uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" + +[[LibCURL_jll]] +deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] +uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" + +[[LibGit2]] +deps = ["Base64", "NetworkOptions", "Printf", "SHA"] +uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" + +[[LibSSH2_jll]] +deps = ["Artifacts", "Libdl", "MbedTLS_jll"] +uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" + +[[Libdl]] +uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" + +[[LinearAlgebra]] +deps = ["Libdl", "libblastrampoline_jll"] +uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" + +[[Logging]] +uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" + +[[Markdown]] +deps = ["Base64"] +uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" + +[[MbedTLS_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" + +[[Missings]] +deps = ["DataAPI"] +git-tree-sha1 = "bf210ce90b6c9eed32d25dbcae1ebc565df2687f" +uuid = 
"e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" +version = "1.0.2" + +[[Mmap]] +uuid = "a63ad114-7e13-5084-954f-fe012c677804" + +[[Mocking]] +deps = ["Compat", "ExprTools"] +git-tree-sha1 = "29714d0a7a8083bba8427a4fbfb00a540c681ce7" +uuid = "78c3b35d-d492-501b-9361-3d52fe80e533" +version = "0.7.3" + +[[MozillaCACerts_jll]] +uuid = "14a3606d-f60d-562e-9121-12d972cd8159" + +[[NetworkOptions]] +uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" + +[[OpenBLAS_jll]] +deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"] +uuid = "4536629a-c528-5b80-bd46-f80d51c5b363" + +[[OrderedCollections]] +git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c" +uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" +version = "1.4.1" + +[[Parsers]] +deps = ["Dates"] +git-tree-sha1 = "0044b23da09b5608b4ecacb4e5e6c6332f833a7e" +uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" +version = "2.3.2" + +[[Pkg]] +deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] +uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" + +[[PooledArrays]] +deps = ["DataAPI", "Future"] +git-tree-sha1 = "a6062fe4063cdafe78f4a0a81cfffb89721b30e7" +uuid = "2dfb63ee-cc39-5dd5-95bd-886bf059d720" +version = "1.4.2" + +[[PrettyTables]] +deps = ["Crayons", "Formatting", "Markdown", "Reexport", "Tables"] +git-tree-sha1 = "dfb54c4e414caa595a1f2ed759b160f5a3ddcba5" +uuid = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d" +version = "1.3.1" + +[[Printf]] +deps = ["Unicode"] +uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" + +[[REPL]] +deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] +uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" + +[[Random]] +deps = ["SHA", "Serialization"] +uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" + +[[RecipesBase]] +git-tree-sha1 = "6bf3f380ff52ce0832ddd3a2a7b9538ed1bcca7d" +uuid = "3cdcf5f2-1ef4-517c-9805-6587b60abb01" +version = "1.2.1" + +[[Reexport]] +git-tree-sha1 = 
"45e428421666073eab6f2da5c9d310d99bb12f9b" +uuid = "189a3867-3050-52da-a836-e630ba90ab69" +version = "1.2.2" + +[[Requires]] +deps = ["UUIDs"] +git-tree-sha1 = "838a3a4188e2ded87a4f9f184b4b0d78a1e91cb7" +uuid = "ae029012-a4dd-5104-9daa-d747884805df" +version = "1.3.0" + +[[SHA]] +uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" + +[[Serialization]] +uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" + +[[SharedArrays]] +deps = ["Distributed", "Mmap", "Random", "Serialization"] +uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383" + +[[Sockets]] +uuid = "6462fe0b-24de-5631-8697-dd941f90decc" + +[[SortingAlgorithms]] +deps = ["DataStructures"] +git-tree-sha1 = "b3363d7460f7d098ca0912c69b082f75625d7508" +uuid = "a2af1166-a08f-5f64-846c-94a0d3cef48c" +version = "1.0.1" + +[[SparseArrays]] +deps = ["LinearAlgebra", "Random"] +uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" + +[[Statistics]] +deps = ["LinearAlgebra", "SparseArrays"] +uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" + +[[TOML]] +deps = ["Dates"] +uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" + +[[TableTraits]] +deps = ["IteratorInterfaceExtensions"] +git-tree-sha1 = "c06b2f539df1c6efa794486abfb6ed2022561a39" +uuid = "3783bdb8-4a98-5b6b-af9a-565f29a5fe9c" +version = "1.0.1" + +[[Tables]] +deps = ["DataAPI", "DataValueInterfaces", "IteratorInterfaceExtensions", "LinearAlgebra", "OrderedCollections", "TableTraits", "Test"] +git-tree-sha1 = "5ce79ce186cc678bbb5c5681ca3379d1ddae11a1" +uuid = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" +version = "1.7.0" + +[[Tar]] +deps = ["ArgTools", "SHA"] +uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" + +[[Test]] +deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] +uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[[TimeZones]] +deps = ["Dates", "Downloads", "InlineStrings", "LazyArtifacts", "Mocking", "Printf", "RecipesBase", "Serialization", "Unicode"] +git-tree-sha1 = "0a4d8838dc28b4bcfaa3a20efb8d63975ad6781d" +uuid = "f269a46b-ccf7-5d73-abea-4c690281aa53" +version = "1.8.0" + 
+[[UUIDs]] +deps = ["Random", "SHA"] +uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" + +[[Unicode]] +uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" + +[[Zlib_jll]] +deps = ["Libdl"] +uuid = "83775a58-1f1d-513f-b197-d71354ab007a" + +[[libblastrampoline_jll]] +deps = ["Artifacts", "Libdl", "OpenBLAS_jll"] +uuid = "8e850b90-86db-534c-a0d3-1478176c7d93" + +[[nghttp2_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" + +[[p7zip_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" diff --git a/Project.toml b/Project.toml new file mode 100644 index 0000000..338f257 --- /dev/null +++ b/Project.toml @@ -0,0 +1,29 @@ +name = "DataFrameIntervals" +uuid = "33b79e07-adbe-4034-b8be-6bacde625d75" +authors = ["Beacon Biosignals, Inc."] +version = "0.0.1" + +[deps] +DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" +Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" +Intervals = "d8418881-c3e1-53bb-8760-2df7ec849ed5" +Requires = "ae029012-a4dd-5104-9daa-d747884805df" + +[compat] +DataFrames = "1" +Intervals = "1.8" +Requires = "1" +julia = "1.6" + +[extras] +Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" +DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" +Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" +Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" +Intervals = "d8418881-c3e1-53bb-8760-2df7ec849ed5" +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +TimeSpans = "bb34ddd2-327f-4c4a-bfb0-c98fc494ece1" + +[targets] +test = ["Test", "Distributions", "TimeSpans", "Intervals", "DataFrames", "Random", "Dates", "Aqua"] diff --git a/README.md b/README.md index 635118b..1b96cf8 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,70 @@ -# DataFrameIntervals.jl -Utilities for working with DataFrames of `Intervals.jl` or `TimeSpans.jl` objects. 
+# DataFrameIntervals + +[![Build Status](https://github.com/beacon-biosignals/DataFrameIntervals.jl/actions/workflows/CI.yml/badge.svg?branch=main)](https://github.com/beacon-biosignals/DataFrameIntervals.jl/actions/workflows/CI.yml?query=branch%3Amain) +[![Coverage](https://codecov.io/gh/beacon-biosignals/DataFrameIntervals.jl/branch/main/graph/badge.svg)](https://codecov.io/gh/beacon-biosignals/DataFrameIntervals.jl) +[![Code Style: YASGuide](https://img.shields.io/badge/code%20style-yas-violet.svg)](https://github.com/jrevels/YASGuide) + +DataFrameIntervals provides two functions that are handy for computing joins over intervals +of time: `interval_join` and `groupby_interval_join`, and a helper function called `quantile_windows`. + +Rows match in this join if their time spans overlap. The time spans can be represented as [`TimeSpan`](https://juliapackages.com/p/timespans) objects or [`Interval`](https://juliapackages.com/p/intervals) objects. + +Currently this requires an unreleased version of `Intervals.jl` (which should be version 1.8 when released). Make sure to add the following to your project before adding `DataFrameIntervals`. 
+ +``` +julia> ]add https://github.com/invenia/Intervals.jl#rf/intervalset-type +``` + + +## Example + +```julia +using TimeSpans +using DataFrames +using DataFrameIntervals +using Distributions +using Random +using Dates + +n = 100 +tovalue(x) = Nanosecond(round(Int, x * 1e9)) +times = cumsum(rand(MersenneTwister(hash((:dataframe_intervals, 2022_06_01))), Gamma(3, 2), n+1)) +spans = TimeSpan.(tovalue.(times[1:(end-1)]), tovalue.(times[2:end])) +df = DataFrame(label = rand(('a':'d'), n), x = rand(n), span = spans) +``` + +``` +100×3 DataFrame + Row │ label x span + │ Char Float64 TimeSpan +─────┼───────────────────────────────────────────────────── + 1 │ b 0.0606309 TimeSpan(00:00:05.164631882, 00:… + 2 │ a 0.961599 TimeSpan(00:00:08.853504418, 00:… + 3 │ c 0.55525 TimeSpan(00:00:13.431519652, 00:… + 4 │ d 0.058248 TimeSpan(00:00:25.929078264, 00:… + ⋮ │ ⋮ ⋮ ⋮ + 98 │ a 0.995222 TimeSpan(00:08:51.512608520, 00:… + 99 │ d 0.188141 TimeSpan(00:08:56.662988067, 00:… + 100 │ a 0.338053 TimeSpan(00:08:58.445446762, 00:… + ``` + +```julia +quarters = quantile_windows(4, df, label=:quarter) + +split_into(df, quarters) +``` + +``` +103×6 DataFrame + Row │ quarter label x left_span right_span span + │ Int64 Char Float64 TimeSpan TimeSpan TimeSpan +─────┼──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── + 1 │ 1 b 0.0606309 TimeSpan(00:00:05.164631882, 00:… TimeSpan(00:00:05.164631882, 00:… TimeSpan(00:00:05.164631882, 00:… + 2 │ 1 a 0.961599 TimeSpan(00:00:08.853504418, 00:… TimeSpan(00:00:05.164631882, 00:… TimeSpan(00:00:08.853504418, 00:… + 3 │ 1 c 0.55525 TimeSpan(00:00:13.431519652, 00:… TimeSpan(00:00:05.164631882, 00:… TimeSpan(00:00:13.431519652, 00:… + 4 │ 1 d 0.058248 TimeSpan(00:00:25.929078264, 00:… TimeSpan(00:00:05.164631882, 00:… TimeSpan(00:00:25.929078264, 00:… + ⋮ │ ⋮ ⋮ ⋮ ⋮ ⋮ ⋮ + 101 │ 4 a 0.995222 TimeSpan(00:08:51.512608520, 00:… TimeSpan(00:06:51.442142229, 00:… 
TimeSpan(00:08:51.512608520, 00:… + 102 │ 4 d 0.188141 TimeSpan(00:08:56.662988067, 00:… TimeSpan(00:06:51.442142229, 00:… TimeSpan(00:08:56.662988067, 00:… + 103 │ 4 a 0.338053 TimeSpan(00:08:58.445446762, 00:… TimeSpan(00:06:51.442142229, 00:… TimeSpan(00:08:58.445446762, 00:… +``` \ No newline at end of file diff --git a/docs/Project.toml b/docs/Project.toml new file mode 100644 index 0000000..e38361d --- /dev/null +++ b/docs/Project.toml @@ -0,0 +1,3 @@ +[deps] +DataFrameIntervals = "33b79e07-adbe-4034-b8be-6bacde625d75" +Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" diff --git a/docs/make.jl b/docs/make.jl new file mode 100644 index 0000000..669fca7 --- /dev/null +++ b/docs/make.jl @@ -0,0 +1,14 @@ +using DataFrameIntervals +using Documenter + +DocMeta.setdocmeta!(DataFrameIntervals, :DocTestSetup, :(using DataFrameIntervals); + recursive=true) + +makedocs(; + modules=[DataFrameIntervals], + repo="https://github.com/beacon-biosignals/DataFrameIntervals.jl/blob/{commit}{path}#{line}", + sitename="DataFrameIntervals.jl", + format=Documenter.HTML(; + prettyurls=get(ENV, "CI", "false") == "true", + assets=String[]), + pages=["Home" => "index.md"]) diff --git a/docs/src/index.md b/docs/src/index.md new file mode 100644 index 0000000..a857f92 --- /dev/null +++ b/docs/src/index.md @@ -0,0 +1,14 @@ +```@meta +CurrentModule = DataFrameIntervals +``` + +# DataFrameIntervals + +Documentation for [DataFrameIntervals](https://github.com/beacon-biosignals/DataFrameIntervals.jl). 
+ +```@index +``` + +```@autodocs +Modules = [DataFrameIntervals] +``` diff --git a/examples/.gitkeep b/examples/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/format/Manifest.toml b/format/Manifest.toml new file mode 100644 index 0000000..981bfe3 --- /dev/null +++ b/format/Manifest.toml @@ -0,0 +1,194 @@ +# This file is machine-generated - editing it directly is not advised + +julia_version = "1.7.2" +manifest_format = "2.0" + +[[deps.ArgTools]] +uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" + +[[deps.Artifacts]] +uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" + +[[deps.Base64]] +uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" + +[[deps.CSTParser]] +deps = ["Tokenize"] +git-tree-sha1 = "b66abc140f8b90a1d6bc7bfad5c80070f8c1ddc6" +uuid = "00ebfdb7-1f24-5e51-bd34-a7502290713f" +version = "3.3.3" + +[[deps.CommonMark]] +deps = ["Crayons", "JSON", "URIs"] +git-tree-sha1 = "4cd7063c9bdebdbd55ede1af70f3c2f48fab4215" +uuid = "a80b9123-70ca-4bc0-993e-6e3bcb318db6" +version = "0.8.6" + +[[deps.Compat]] +deps = ["Dates", "LinearAlgebra", "UUIDs"] +git-tree-sha1 = "924cdca592bc16f14d2f7006754a621735280b74" +uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" +version = "4.1.0" + +[[deps.CompilerSupportLibraries_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" + +[[deps.Crayons]] +git-tree-sha1 = "249fe38abf76d48563e2f4556bebd215aa317e15" +uuid = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f" +version = "4.1.1" + +[[deps.DataStructures]] +deps = ["Compat", "InteractiveUtils", "OrderedCollections"] +git-tree-sha1 = "d1fff3a548102f48987a52a2e0d114fa97d730f0" +uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" +version = "0.18.13" + +[[deps.Dates]] +deps = ["Printf"] +uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" + +[[deps.Downloads]] +deps = ["ArgTools", "LibCURL", "NetworkOptions"] +uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" + +[[deps.InteractiveUtils]] +deps = ["Markdown"] +uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" + +[[deps.JSON]] +deps 
= ["Dates", "Mmap", "Parsers", "Unicode"] +git-tree-sha1 = "3c837543ddb02250ef42f4738347454f95079d4e" +uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" +version = "0.21.3" + +[[deps.JuliaFormatter]] +deps = ["CSTParser", "CommonMark", "DataStructures", "Pkg", "Tokenize"] +git-tree-sha1 = "9f1e11d5bf6fff5a7592f2aa602fe6eb4c801da7" +uuid = "98e50ef6-434e-11e9-1051-2b60c6c9e899" +version = "1.0.3" + +[[deps.LibCURL]] +deps = ["LibCURL_jll", "MozillaCACerts_jll"] +uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" + +[[deps.LibCURL_jll]] +deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] +uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" + +[[deps.LibGit2]] +deps = ["Base64", "NetworkOptions", "Printf", "SHA"] +uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" + +[[deps.LibSSH2_jll]] +deps = ["Artifacts", "Libdl", "MbedTLS_jll"] +uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" + +[[deps.Libdl]] +uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" + +[[deps.LinearAlgebra]] +deps = ["Libdl", "libblastrampoline_jll"] +uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" + +[[deps.Logging]] +uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" + +[[deps.Markdown]] +deps = ["Base64"] +uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" + +[[deps.MbedTLS_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" + +[[deps.Mmap]] +uuid = "a63ad114-7e13-5084-954f-fe012c677804" + +[[deps.MozillaCACerts_jll]] +uuid = "14a3606d-f60d-562e-9121-12d972cd8159" + +[[deps.NetworkOptions]] +uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" + +[[deps.OpenBLAS_jll]] +deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"] +uuid = "4536629a-c528-5b80-bd46-f80d51c5b363" + +[[deps.OrderedCollections]] +git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c" +uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" +version = "1.4.1" + +[[deps.Parsers]] +deps = ["Dates"] +git-tree-sha1 = "0044b23da09b5608b4ecacb4e5e6c6332f833a7e" +uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" 
+version = "2.3.2" + +[[deps.Pkg]] +deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] +uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" + +[[deps.Printf]] +deps = ["Unicode"] +uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" + +[[deps.REPL]] +deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] +uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" + +[[deps.Random]] +deps = ["SHA", "Serialization"] +uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" + +[[deps.SHA]] +uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" + +[[deps.Serialization]] +uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" + +[[deps.Sockets]] +uuid = "6462fe0b-24de-5631-8697-dd941f90decc" + +[[deps.TOML]] +deps = ["Dates"] +uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" + +[[deps.Tar]] +deps = ["ArgTools", "SHA"] +uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" + +[[deps.Tokenize]] +git-tree-sha1 = "2b3af135d85d7e70b863540160208fa612e736b9" +uuid = "0796e94c-ce3b-5d07-9a54-7f471281c624" +version = "0.5.24" + +[[deps.URIs]] +git-tree-sha1 = "97bbe755a53fe859669cd907f2d96aee8d2c1355" +uuid = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4" +version = "1.3.0" + +[[deps.UUIDs]] +deps = ["Random", "SHA"] +uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" + +[[deps.Unicode]] +uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" + +[[deps.Zlib_jll]] +deps = ["Libdl"] +uuid = "83775a58-1f1d-513f-b197-d71354ab007a" + +[[deps.libblastrampoline_jll]] +deps = ["Artifacts", "Libdl", "OpenBLAS_jll"] +uuid = "8e850b90-86db-534c-a0d3-1478176c7d93" + +[[deps.nghttp2_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" + +[[deps.p7zip_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" diff --git a/format/Project.toml b/format/Project.toml new file mode 100644 index 0000000..71708c8 --- /dev/null +++ b/format/Project.toml @@ -0,0 +1,5 @@ +[deps] +JuliaFormatter = 
"98e50ef6-434e-11e9-1051-2b60c6c9e899" + +[compat] +JuliaFormatter = "1" diff --git a/format/run.jl b/format/run.jl new file mode 100644 index 0000000..3cfb507 --- /dev/null +++ b/format/run.jl @@ -0,0 +1,20 @@ +using JuliaFormatter + +function main() + perfect = true + # note: keep in sync with `.github/workflows/style.yml` + for d in ["src/", "test/", "docs/", "examples/"] + @info "...linting $d ..." + dir_perfect = format(d; style=YASStyle()) + perfect = perfect && dir_perfect + end + if perfect + @info "Linting complete - no files altered" + else + @info "Linting complete - files altered" + run(`git status`) + end + return nothing +end + +main() diff --git a/src/DataFrameIntervals.jl b/src/DataFrameIntervals.jl new file mode 100644 index 0000000..f30b33b --- /dev/null +++ b/src/DataFrameIntervals.jl @@ -0,0 +1,350 @@ +module DataFrameIntervals + +using Intervals, DataFrames, Requires, Dates +export quantile_windows, interval_join, groupby_interval_join + +##### +##### Support `find_intersections` and `intersect` over `Interval` and `TimeSpan` objects. +##### + +function find_intersections_(x::AbstractVector, y::AbstractVector) + return Intervals.find_intersections(IntervalArray(x), IntervalArray(y)) +end +intersect_(x, y) = backto(x, intersect(interval(x), interval(y))) + +# IntervalArray is a helper that treats any vector of interval-like objects as an array of +# `Interval` objects. 
For now this includes only `TimeSpans` and `NamedTuple` objects with +# a `start` and `stop` field +struct IntervalArray{A,I} <: AbstractVector{I} + val::A +end +Base.size(x::IntervalArray) = size(x.val) +Base.getindex(x::IntervalArray, i) = interval(x.val[i]) +Base.IndexStyle(::Type{<:IntervalArray}) = IndexLinear() + +# support for `Interval` vectors +IntervalArray(x::AbstractVector{<:Interval}) = x +interval(x::Interval) = x +backto(::Interval, x) = x + +# support for `NamedTuple` vectors +const IntervalTuple = Union{NamedTuple{(:start, :stop)},NamedTuple{(:stop, :start)}} +interval_type(x::Type{<:T}) where {T<:IntervalTuple} = Union{T.parameters[2].parameters...} +interval_type(x::IntervalTuple) = Union{typeof(x).parameters[2].parameters...} +function IntervalArray(x::AbstractVector{<:IntervalTuple}) + return IntervalArray{typeof(x),Interval{interval_type(eltype(x)),Closed,Open}}(x) +end +interval(x::IntervalTuple) = Interval{interval_type(x),Closed,Open}(x.start, x.stop) +backto(::NamedTuple{(:start, :stop)}, x::Interval) = (; start=first(x), stop=last(x)) +backto(::NamedTuple{(:stop, :start)}, x::Interval) = (; stop=last(x), start=first(x)) + +# support for `TimeSpan` vectors +function __init__() + @require TimeSpans = "bb34ddd2-327f-4c4a-bfb0-c98fc494ece1" begin + using .TimeSpans + interval(x::TimeSpan) = Interval{Nanosecond,Closed,Open}(x.start, x.stop) + function backto(::TimeSpan, x::Interval{Nanosecond,Closed,Open}) + return TimeSpan(first(x), last(x)) + end + function IntervalArray(x::AbstractVector{<:TimeSpan}) + return IntervalArray{typeof(x),Interval{Nanosecond,Closed,Open}}(x) + end + end +end + +forleft(x) = x +forright(x) = x +forleft(x::Pair) = first(x) +forright(x::Pair) = last(x) + +function setup_column_names!(left, right; on, renamecols=identity => identity, + renameon=:_left => :_right) + if !(on isa Symbol || on isa AbstractString) + error("Interval joins support only one `on` column; iterables are not allowed.") + end + + left_on = 
renamer(forleft(on), forleft(renameon)) + right_on = renamer(forright(on), forright(renameon)) + joined_on = forleft(on) + rename!(left, + (renamer(n, forleft(renamecols), forleft(on), forleft(renameon)) + for n in names(left))...) + rename!(right, + (renamer(n, forright(renamecols), forright(on), forright(renameon)) + for n in names(right))...) + if string(left_on) == string(joined_on) + error("Interval join failed: left dataframe's `on` column has the final name ", + "`$left_on` which clashes with joined dataframe's `on` column name ", + "`$joined_on`. Make sure `renameon` is set properly.") + end + if string(right_on) == string(joined_on) + error("Interval join failed: right dataframe's `on` column has the final name ", + "`$right_on` which clashes with joined dataframe's `on` column name ", + "`$joined_on`. Make sure `renameon` is set properly.") + end + + return (; left_on, right_on, joined_on, left, right) +end + +""" + interval_join(left, right; on, renamecols=identity => identity, + renameon=:_left => :_right, makeunique=false) + +Join two dataframes based on the intervals they represent (denoted by the `on` column); +these are typically intervals of time. The join includes one row for every pairing of rows +in `left` and `right` whose intervals overlap (i.e. `!isempty(intersect(left.on, +right.on))`). + +- `on`: The column name to join left and right on. If the column on which left and right + will be joined have different names, then a left=>right pair can be passed. on is a + required argument. The value of the on column in the output data frame is the intersection + of the left and right interval. `on` can be one of three different types of objects: + an `Interval`, a `TimeSpan` or a `NamedTuple` with a `start` and a `stop` field. + +- `makeunique`: if false (the default), an error will be raised if duplicate names are found + in columns not joined on; if true, duplicate names will be suffixed with _i (i starting at + 1 for the first duplicate). 
+ +- `renamecols`: a Pair specifying how columns of left and right data frames should be + renamed in the resulting data frame. Each element of the pair can be a string or a Symbol + can be passed in which case it is appended to the original column name; alternatively a + function can be passed in which case it is applied to each column name, which is passed to + it as a String. Note that renamecols does not affect any of the `on` columns. + +- `renameon`: a Pair specifying how the left and right data frame `on` column is renamed and + stored in the resulting data frame, following the same format as `renamecols`. + +""" +function interval_join(left, right; makeunique=false, kwds...) + left = DataFrame(left; copycols=false) + right = DataFrame(right; copycols=false) + (; left_on, right_on, joined_on) = setup_column_names!(left, right; kwds...) + regions = find_intersections_(view(right, :, right_on), view(left, :, left_on)) + + # perform the join + left_side, right_side = join_indices(regions, left, right) + joined = hcat(right_side, left_side; makeunique) + transform!(joined, [left_on, right_on] => ByRow(intersect_) => joined_on) + return joined +end +function renamer(n, renamecols, on, renameon) + return n == string(on) ? 
n => renamer(n, renameon) : n => renamer(n, renamecols)
+end
+# Apply a renaming rule to a column name: a `Symbol` or string rule is appended as a suffix,
+# any other rule is called as a function on the name.
+renamer(col, suffix::Union{Symbol,AbstractString}) = string(col, suffix)
+renamer(col, fn) = fn(col)
+
+# Build row views of `left` and `right` that line up pairwise: `regions[i]` holds the row
+# indices of `left` that are paired with row `i` of `right`.
+function join_indices(regions, left, right)
+    left_ixs = Int[]
+    right_ixs = Int[]
+    # Accumulate with `append!` rather than reducing with `vcat`: this is linear in the number
+    # of matches and, unlike `mapreduce(..., vcat, ...)` without `init`, it does not throw
+    # when `regions` is empty (both views then simply have zero rows).
+    for (right_i, matches) in enumerate(regions)
+        append!(left_ixs, matches)
+        append!(right_ixs, fill(right_i, length(matches)))
+    end
+    left_side = view(left, left_ixs, :)
+    right_side = view(right, right_ixs, :)
+    return left_side, right_side
+end
+
+# helpers to handle grouping DataFrames
+
+# Marks a requested grouping column that is not present in a data frame.
+struct Invalid
+    name::String
+end
+Base.string(x::Invalid) = x.name
+# NOTE(review): the messages below still mention `split_into_combine`, apparently an earlier
+# name for `groupby_interval_join`; the text is left as-is because the test suite matches the
+# message exactly.
+function oncol_error(on)
+    return error("Column $on cannot be used for grouping during a call to `split_into_combine`.")
+end
+# Throw if the `on` column is among `names`; otherwise return `names` unchanged.
+function check_oncol(on, names)
+    string(on) ∈ names && oncol_error(on)
+    return names
+end
+
+# `find_valid`: given a DataFrame column selector return an array of strings and `Invalid`
+# objects. The strings represent all columns present in the dataframe that would be selected
+# by the given selector. Any `Invalid` values are columns the selector requested that were
+# not actually present in the dataframe.
+function find_valid(on, df,
+                    col::Union{<:Integer,<:AbstractRange{<:Integer},
+                               <:AbstractVector{<:Integer}})
+    return error("Cannot use index or boolean as grouping variable when using `split_into_combine`")
+end
+function find_valid(on, df, col::Union{<:AbstractString,Symbol})
+    col = string(col)
+    return col ∈ names(df) ? 
check_oncol(on, Union{String,Invalid}[col]) :
+           Union{String,Invalid}[Invalid(col)]
+end
+# `Not` over a collection of names: names that are absent from `df` are dropped from the
+# exclusion list before the complement is taken.
+function find_valid(on, df, cols::Not)
+    present = in.(string.(cols.skip), Ref(names(df)))
+    kept = names(df, Not(cols.skip[present]))
+    return check_oncol(on, kept)
+end
+# `Not` over a single name: an absent name excludes nothing.
+function find_valid(on, df, cols::Not{<:Union{Symbol,<:AbstractString}})
+    selected = string(cols.skip) in names(df) ? names(df, cols) : names(df)
+    return check_oncol(on, selected)
+end
+# `All()` and `:` are rejected outright with the `on`-column error.
+find_valid(on, df, cols::Colon) = oncol_error(on)
+find_valid(on, df, cols::All) = oncol_error(on)
+function find_valid(on, df, cols::Cols{<:Tuple{<:Function}})
+    selected = names(df, cols)
+    return check_oncol(on, selected)
+end
+# General `Cols`: resolve each inner selector separately and take the union of the results.
+function find_valid(on, df, cols::Cols)
+    per_selector = map(c -> find_valid(on, df, c), cols.cols)
+    return check_oncol(on, union(per_selector...))
+end
+find_valid(on, df, cols::Regex) = check_oncol(on, names(df, cols))
+function find_valid(on, df, cols::Between)
+    endpoints = [find_valid(on, df, cols.first); find_valid(on, df, cols.last)]
+    # if either endpoint is absent, report only the absent endpoint(s)
+    all(x -> x isa String, endpoints) || return filter(x -> x isa Invalid, endpoints)
+    return check_oncol(on, names(df, cols))
+end
+# Fallback: treat `cols` as a collection of selectors and concatenate their results.
+function find_valid(on, df, cols)
+    return mapreduce(vcat, cols) do c
+        return find_valid(on, df, c)
+    end
+end
+
+# Lazy representation of a grouped interval join, as returned by `groupby_interval_join`.
+
+struct GroupedIntervalJoin{R,LG,LD}
+    # the right data frame, grouped by the grouping columns found in it
+    right_grouped::R
+    # grouping columns found in the left data frame
+    left_groups::LG
+    # the left data frame (after column renaming)
+    left_df::LD
+    makeunique::Bool
+    # name of the column of the right data frame holding the matching left row indices
+    left_index::Symbol
+    left_on::Symbol
+    right_on::Symbol
+    joined_on::Symbol
+end
+
+"""
+    groupby_interval_join(left, right, groups; on, renamecols=identity => identity,
+                          renameon=:_left => :_right, makeunique=false)
+
+Similar to, but less resource intensive than,
+`groupby(interval_join(left, right), groups)`. You can iterate over the groups or call
+`combine` on said groups. Note however that the returned object is not a `GroupedDataFrame`
+and only supports these two operations.
+
+See also [`interval_join`](@ref)
+"""
+function groupby_interval_join(left, right, groups; on, makeunique=false, kwds...)
+    # split column groupings into `left` columns and `right` columns
+    right_groups = find_valid(forright(on), right, groups)
+    left_groups = find_valid(forleft(on), left, groups)
+
+    right_cols = filter(x -> x isa String, right_groups)
+    right_invalid = filter(x -> x isa Invalid, right_groups)
+    left_cols = filter(x -> x isa String, left_groups)
+    left_invalid = filter(x -> x isa Invalid, left_groups)
+    # a requested column is only an error when it is absent from *both* data frames
+    invalid = intersect(right_invalid, left_invalid)
+    if !isempty(invalid)
+        error("Columns do not exist: " * join(string.(invalid), ", ", " and "))
+    end
+
+    # setup column names
+    left = DataFrame(left; copycols=false)
+    right = DataFrame(right; copycols=false)
+    # Plain property access is used instead of `(; a, b) = x` destructuring, which is only
+    # available from Julia 1.7 onwards; this keeps the file loadable on Julia 1.6.
+    # NOTE(review): the grouping columns above are resolved before any renaming performed by
+    # `setup_column_names!` — confirm this is intended when `renamecols` is not the identity.
+    colnames = setup_column_names!(left, right; on, kwds...)
+    left_on = colnames.left_on
+    right_on = colnames.right_on
+    joined_on = colnames.joined_on
+
+    # compute interval intersections
+    left_index = gensym(:__left_index__)
+    regions = find_intersections_(view(right, :, right_on), view(left, :, left_on))
+    # store the matching left row indices alongside each row of `right`
+    right = insertcols!(right, left_index => regions)
+
+    # a lazy instantiation of the joined dataframe
+    return GroupedIntervalJoin(groupby(right, right_cols), left_cols, left, makeunique,
+                               Symbol(left_index), Symbol(left_on), Symbol(right_on),
+                               Symbol(joined_on))
+end
+
+# Iteration: each group of the right data frame is joined lazily, the bookkeeping index
+# column is dropped, and the result is regrouped by the left grouping columns; the groups of
+# all of these are yielded one after another.
+function Base.iterate(grouped::GroupedIntervalJoin)
+    mapped = Iterators.map(grouped.right_grouped) do gdf
+        return groupby(select!(joingroup(gdf, grouped), Not(grouped.left_index)),
+                       grouped.left_groups)
+    end
+    iterable = Iterators.flatten(mapped)
+
+    result = iterate(iterable)
+    isnothing(result) && return nothing
+    item, state = result
+    # the flattened iterator is carried in the state so later calls can resume it
+    return item, (iterable, state)
+end
+function Base.iterate(::GroupedIntervalJoin, (iterable, state))
+    result = iterate(iterable, state)
+    isnothing(result) && return nothing
+    item, state = result
+    return item, (iterable, state)
+end
+
+# Join one group of the right data frame against the full left data frame.
+function joingroup(right_df, grouped)
+    left_df = grouped.left_df
+    left_side, right_side = join_indices(right_df[!, grouped.left_index], left_df, right_df)
+    joined = hcat(right_side, left_side; grouped.makeunique)
+    return 
transform!(joined,
+                      [grouped.left_on, grouped.right_on] => ByRow(intersect_) => grouped.joined_on)
+end
+
+# `combine` for the lazy grouped join: every group of the right data frame is joined against
+# the left data frame, regrouped by the left grouping columns and combined with `pairs`.
+function DataFrames.combine(grouped::GroupedIntervalJoin, pairs...; kwargs...)
+    function combine_group(tbl)
+        joined = joingroup(DataFrame(tbl), grouped)
+        return combine(groupby(joined, grouped.left_groups), pairs...; kwargs...)
+    end
+    result = combine(grouped.right_grouped, AsTable(:) => combine_group => AsTable; kwargs...)
+    # drop the bookkeeping index column if it is present in the combined result
+    grouped.left_index ∈ propertynames(result) || return result
+    return select!(result, Not(grouped.left_index))
+end
+
+# A `label` is either a bare column name (values default to `1:n`) or a `name => values` pair.
+label_helper(x::Symbol) = x
+label_helper(x::Pair) = first(x)
+value_helper(::Symbol, n) = 1:n
+value_helper(x::Pair, _) = last(x)
+
+# Turn consecutive entries of `steps` into closed-open intervals, each passed through
+# `backto(el, ...)`.
+function intervals(steps, el)
+    T = eltype(steps)
+    starts, stops = steps[1:(end - 1)], steps[2:end]
+    return map(starts, stops) do lo, hi
+        return backto(el, Interval{T,Closed,Open}(lo, hi))
+    end
+end
+# `TimePeriod` <-> floating-point nanoseconds; fractional nanoseconds are rounded down
+toval(x::TimePeriod) = float(Dates.value(convert(Nanosecond, x)))
+asnanoseconds(x::Real) = Nanosecond(round(Int, x, RoundDown))
+range_(a, b; length) = range(a, b; length)
+function range_(a::TimePeriod, b::TimePeriod; length)
+    vals = range(toval(a), toval(b); length)
+    return map(asnanoseconds, vals)
+end
+
+"""
+    quantile_windows(n, span; spancol=:span, label=:index, min_duration=nothing)
+
+Generate a data frame with `n` rows that divide the interval `span` into equally spaced
+intervals. The output is a DataFrame with a column named by `spancol` (`:span` by default)
+and a column named by `label` holding the index of each interval (`1:n`). The `label`
+argument can also be a pair, in which case it should be a symbol paired with an iterable
+of `n` items to assign as the values of the given column.
+
+`min_duration` is accepted as a keyword but does not currently change the returned data
+frame.
+
+The value `span` can also be a dataframe, in which case quantiles that cover the entire
+range of time spans in the dataframe are used. 
+
+The output is useful as the right argument to `interval_join` and `groupby_interval_join`
+"""
+function quantile_windows(n, span_; spancol=:span, label=:index, min_duration=nothing)
+    ismissing(span_) && return missing
+
+    span = interval(span_)
+    # `n + 1` equally spaced edges delimit `n` windows; `span_` is handed to `intervals` so
+    # each window can be passed through `backto`
+    splits = intervals(range_(first(span), last(span); length=n + 1), span_)
+    # NOTE: `min_duration` is kept in the signature for backward compatibility, but it is
+    # never consulted when building the result, so no default is computed for it. Computing
+    # one would require `toval`, which only has a `TimePeriod` method here and would
+    # therefore reject intervals over any other element type.
+    df = DataFrame(; (spancol => splits, label_helper(label) => value_helper(label, n))...)
+    return df
+end
+# Data frame input: window the span returned by `dfspan` for the `spancol` column.
+function quantile_windows(n, span::DataFrame; spancol=:span, kwds...)
+    return quantile_windows(n, dfspan(span, spancol); spancol, kwds...)
+end
+
+# Superset of the intervals in `df[!, spancol]`, passed through `backto` with the first
+# element of that column; `missing` when `df` has no rows.
+function dfspan(df, spancol)
+    if nrow(df) == 0
+        return missing
+    else
+        return backto(first(df[!, spancol]), superset(IntervalArray(df[!, spancol])))
+    end
+end
+
+end # module
diff --git a/test/runtests.jl b/test/runtests.jl
new file mode 100644
index 0000000..2c3cbd2
--- /dev/null
+++ b/test/runtests.jl
@@ -0,0 +1,99 @@
+using DataFrameIntervals
+using Intervals
+using DataFrames
+using TimeSpans
+using Test
+using Random
+using Dates
+using Distributions
+using Aqua
+
+# some light type piracy
+Base.isapprox(a::TimePeriod, b::TimePeriod; atol=period) = return abs(a - b) ≤ atol
+
+@testset "DataFrameIntervals.jl" begin
+    n = 100
+    tovalue(x) = Nanosecond(round(Int, x * 1e9))
+    times = cumsum(rand(MersenneTwister(hash((:dataframe_intervals, 2022_06_01))),
+                        Gamma(3, 2), n + 1))
+    spans = TimeSpan.(tovalue.(times[1:(end - 1)]), tovalue.(times[2:end]))
+    df1 = DataFrame(; label=rand(('a':'d'), n), x=rand(n), span=spans)
+    quarters = quantile_windows(4, df1; label=:quarter)
+    @test nrow(quarters) == 4
+    @test isapprox(duration(quarters.span[1]), duration(quarters.span[2]),
+                   atol=Nanosecond(1))
+    @test isapprox(duration(quarters.span[2]), duration(quarters.span[3]),
+                   atol=Nanosecond(1))
+    @test isapprox(duration(quarters.span[2]), 
duration(quarters.span[3]);
+                   atol=Nanosecond(1)) ||
+          duration(quarters.span[4]) ≤ duration(quarters.span[3])
+
+    # TODO: test various column renaming behaviors
+
+    # NOTE: the bulk of the correctness testing for interval intersections
+    # has already been handled by calling out to `Intervals.find_intersections`
+    # which has been tested in `Intervals.jl`
+    df_result = interval_join(df1, quarters; on=:span)
+    for quarter in groupby(df_result, :span_right)
+        @test sum(duration, quarter.span) ≤ duration(quarter.span_right[1])
+    end
+    ixs = Intervals.find_intersections(DataFrameIntervals.interval.(quarters.span),
+                                       DataFrameIntervals.interval.(df1.span))
+    @test df_result.span_left == mapreduce(ix -> df1.span[ix], vcat, ixs)
+
+    # test interval joins with named tuples
+    nt_spans = [(; start=start(x), stop=stop(x)) for x in spans]
+    df1_nt = hcat(df1[!, Not(:span)], DataFrame(; span=nt_spans))
+    df_result_nt = interval_join(df1_nt, quarters; on=:span)
+    @test nrow(df_result_nt) == nrow(df_result)
+
+    # groupby_interval_join equivalence
+    df_combined = combine(groupby_interval_join(df1, quarters, [:quarter, :label];
+                                                on=:span), :x => mean)
+    df_manual_combined = combine(groupby(interval_join(df1, quarters; on=:span),
+                                         [:quarter, :label]), :x => mean)
+    @test df_combined.x_mean == df_manual_combined.x_mean
+
+    df_grouped1 = groupby(interval_join(df1, quarters; on=:span), [:quarter, :label])
+    df_grouped2 = groupby_interval_join(df1, quarters, [:quarter, :label]; on=:span)
+    for (gdf1, gdf2) in zip(df_grouped1, df_grouped2)
+        @test gdf1.x == gdf2.x
+    end
+
+    # test out various column specifiers
+    df_combined = combine(groupby_interval_join(df1, quarters, r"quar|lab"; on=:span),
+                          :x => mean)
+    df_combined = combine(groupby_interval_join(df1, quarters, Cols(:quarter, r"lab");
+                                                on=:span), :x => mean)
+    df_combined = combine(groupby_interval_join(df1, quarters, Not([:span, :x]); on=:span),
+                          :x => mean)
+    err = ErrorException("Column span cannot be used for grouping during a call to `split_into_combine`.")
+    @test_throws err combine(groupby_interval_join(df1, quarters, All(); on=:span),
+                             :x => mean)
+    @test_throws err combine(groupby_interval_join(df1, quarters, Cols(:); on=:span),
+                             :x => mean)
+
+    df2 = DataFrame(; label=rand(('a':'d'), n), sublabel=rand(('k':'n'), n), x=rand(n),
+                    span=spans)
+    df2_split = combine(groupby_interval_join(df2, quarters,
+                                              Cols(Between(:label, :sublabel), :quarter);
+                                              on=:span),
+                        :x => mean)
+    df2_manual = combine(groupby(interval_join(df2, quarters; on=:span),
+                                 Cols(Between(:label, :sublabel), :quarter)), :x => mean)
+    @test df2_split.x_mean == df2_manual.x_mean
+    @test_throws ErrorException combine(groupby_interval_join(df2, quarters,
+                                                              [:i_dont_exist]; on=:span),
+                                        :x => mean)
+    @test_throws ErrorException combine(groupby_interval_join(df2, quarters, Cols(1:2);
+                                                              on=:span), :x => mean)
+
+    @testset "Code Quality" begin
+        Aqua.test_all(DataFrameIntervals;
+                      project_extras=true,
+                      stale_deps=true,
+                      deps_compat=true,
+                      project_toml_formatting=true,
+                      ambiguities=false)
+    end
+end