diff --git a/.binder/conda-lock.yml b/.binder/conda-lock.yml index 3805a2f2..594f4785 100644 --- a/.binder/conda-lock.yml +++ b/.binder/conda-lock.yml @@ -15,9 +15,9 @@ metadata: - url: conda-forge used_env_vars: [] content_hash: - linux-64: a9096c0ec9e0da28a616a7cd5195dab1031d21fb745df69a95feb6a283140daa - osx-64: 89c1c35d776c0f1b3d62e1fd19c5976432944021d6ac9f353375f840c488f237 - win-64: 2562905301719ce1a1031bd3e8be2439edb55ff28fb5ae2de57cad0279b6ba28 + linux-64: a06ab1f6b457a8e01ffc7a0c8389db5666da7a52528bfea3171b297122101e98 + osx-64: ab65fbc2b501e3e5c402a28483b0fb11501ec06cae542e31b8d7179a96cc0e48 + win-64: 1d8ac907bf70156023b42600406482d8e59b586e2b3c87dabd321f2b1c0348d3 platforms: - linux-64 - osx-64 @@ -2079,6 +2079,18 @@ package: platform: linux-64 url: https://conda.anaconda.org/conda-forge/linux-64/antlr-python-runtime-4.7.2-py311h38be061_1003.tar.bz2 version: 4.7.2 +- category: main + dependencies: + python: '' + hash: + md5: c0481c9de49f040272556e2cedf42816 + sha256: b3e9369529fe7d721b66f18680ff4b561e20dbf6507e209e1f60eac277c97560 + manager: conda + name: asciitree + optional: false + platform: linux-64 + url: https://conda.anaconda.org/conda-forge/noarch/asciitree-0.3.3-py_2.tar.bz2 + version: 0.3.3 - category: main dependencies: python: '>2.7' @@ -2417,6 +2429,18 @@ package: platform: linux-64 url: https://conda.anaconda.org/conda-forge/noarch/executing-1.2.0-pyhd8ed1ab_0.tar.bz2 version: 1.2.0 +- category: main + dependencies: + python: '>=3.6' + hash: + md5: 348e27e78a5e39090031448c72f66d5e + sha256: 42be6ac8478051b26751d778490d6a71de12e5c6443e145ff3eddbc577d9bcda + manager: conda + name: fasteners + optional: false + platform: linux-64 + url: https://conda.anaconda.org/conda-forge/noarch/fasteners-0.17.3-pyhd8ed1ab_0.tar.bz2 + version: 0.17.3 - category: main dependencies: python: '>=3.7' @@ -2895,6 +2919,21 @@ package: platform: linux-64 url: https://conda.anaconda.org/conda-forge/noarch/mpmath-1.3.0-pyhd8ed1ab_0.conda version: 1.3.0 +- category: 
main + dependencies: + libgcc-ng: '>=12' + libstdcxx-ng: '>=12' + python: '>=3.11,<3.12.0a0' + python_abi: 3.11.* *_cp311 + hash: + md5: 7415f24f8c44e44152623d93c5015000 + sha256: 9dea30d75b5eb31dac447aee56bf0648b5d58438a686123a2e085a166ed69900 + manager: conda + name: msgpack-python + optional: false + platform: linux-64 + url: https://conda.anaconda.org/conda-forge/linux-64/msgpack-python-1.0.5-py311ha3edf6b_0.conda + version: 1.0.5 - category: main dependencies: python: '' @@ -3251,14 +3290,14 @@ package: dependencies: python: '>=3.6' hash: - md5: b8afba5fbf891019eae564c3edb28b9e - sha256: 904c98c6bb45302f6349656d5e2f2743677da4254ac76be660475ecdd0fd0c6a + md5: de631f19ba156d224d80241e3fc7d32f + sha256: c0781a1aacc2227379c9614852bf92b967a0e8b52f66c04b5723e0b7b2bd4f1e manager: conda name: python-tzdata optional: false platform: linux-64 - url: https://conda.anaconda.org/conda-forge/noarch/python-tzdata-2022.7-pyhd8ed1ab_0.conda - version: '2022.7' + url: https://conda.anaconda.org/conda-forge/noarch/python-tzdata-2023.2-pyhd8ed1ab_0.conda + version: '2023.2' - category: main dependencies: libgcc-ng: '>=12' @@ -4544,6 +4583,24 @@ package: platform: linux-64 url: https://conda.anaconda.org/conda-forge/linux-64/numba-0.56.3-py310ha5257ce_0.tar.bz2 version: 0.56.3 +- category: main + dependencies: + entrypoints: '' + libgcc-ng: '>=12' + libstdcxx-ng: '>=12' + msgpack-python: '' + numpy: '>=1.7' + python: '>=3.11,<3.12.0a0' + python_abi: 3.11.* *_cp311 + hash: + md5: ecdaf0772e524ed51218f6d52ef74424 + sha256: 0837de92c8251f2ce6940bcb404f6cb8a2bf5f6cfbc1f65d1c823ef223bcc6ca + manager: conda + name: numcodecs + optional: false + platform: linux-64 + url: https://conda.anaconda.org/conda-forge/linux-64/numcodecs-0.11.0-py311hcafe171_1.conda + version: 0.11.0 - category: main dependencies: attrs: '>=19.2.0' @@ -5283,6 +5340,19 @@ package: platform: linux-64 url: https://conda.anaconda.org/conda-forge/linux-64/harfbuzz-6.0.0-h8e241bc_0.conda version: 6.0.0 +- category: main 
+ dependencies: + importlib-metadata: '' + python: '>=3.7' + hash: + md5: b2ec66de3480db2f5124f547cad7e7a4 + sha256: 0569bf2c7ad1f647b4f4b83dd1152df093585ff9a2ba38af98d7a2a348b2b74a + manager: conda + name: humanize + optional: false + platform: linux-64 + url: https://conda.anaconda.org/conda-forge/noarch/humanize-4.6.0-pyhd8ed1ab_0.conda + version: 4.6.0 - category: main dependencies: importlib-metadata: '>=6.1.0,<6.1.1.0a0' @@ -5740,6 +5810,22 @@ package: platform: linux-64 url: https://conda.anaconda.org/conda-forge/noarch/wsproto-1.2.0-pyhd8ed1ab_0.tar.bz2 version: 1.2.0 +- category: main + dependencies: + asciitree: '' + fasteners: '' + numcodecs: '>=0.10.0' + numpy: '>=1.7' + python: '>=3.5' + hash: + md5: 0c5776fe65a12a421d7ddf90411a6c3f + sha256: 0f029f7efea00b8258782b5e68989fc140c227e6d9edd231d46fdd954b39d23f + manager: conda + name: zarr + optional: false + platform: linux-64 + url: https://conda.anaconda.org/conda-forge/noarch/zarr-2.14.2-pyhd8ed1ab_0.conda + version: 2.14.2 - category: main dependencies: argon2-cffi-bindings: '' @@ -9768,7 +9854,6 @@ package: - category: main dependencies: joblib: '>=1.1.1' - libcblas: '>=3.9.0,<4.0a0' libgcc-ng: '>=12' libstdcxx-ng: '>=12' numpy: '>=1.23.5,<2.0a0' @@ -9777,13 +9862,13 @@ package: scipy: '' threadpoolctl: '>=2.0.0' hash: - md5: c928bd2c1f3dfe6e5c89bbc5f03af284 - sha256: 344b84f89a2d537c250d4eff98ee85c4bfbfb0227174448c4e03bbf4e6126345 + md5: 1a69529b0bcf0e3a03e6585903659df7 + sha256: 7bf78d2c375b53e3a61fcd70847b12d2152a2699e43224ba9817badb199a62f1 manager: conda name: scikit-learn optional: false platform: linux-64 - url: https://conda.anaconda.org/conda-forge/linux-64/scikit-learn-1.2.2-py311h67c5ca5_0.conda + url: https://conda.anaconda.org/conda-forge/linux-64/scikit-learn-1.2.2-py311h103fc68_1.conda version: 1.2.2 - category: main dependencies: @@ -12665,6 +12750,18 @@ package: platform: osx-64 url: https://conda.anaconda.org/conda-forge/noarch/appnope-0.1.3-pyhd8ed1ab_0.tar.bz2 version: 0.1.3 +- 
category: main + dependencies: + python: '' + hash: + md5: c0481c9de49f040272556e2cedf42816 + sha256: b3e9369529fe7d721b66f18680ff4b561e20dbf6507e209e1f60eac277c97560 + manager: conda + name: asciitree + optional: false + platform: osx-64 + url: https://conda.anaconda.org/conda-forge/noarch/asciitree-0.3.3-py_2.tar.bz2 + version: 0.3.3 - category: main dependencies: python: '>2.7' @@ -13013,6 +13110,18 @@ package: platform: osx-64 url: https://conda.anaconda.org/conda-forge/noarch/executing-1.2.0-pyhd8ed1ab_0.tar.bz2 version: 1.2.0 +- category: main + dependencies: + python: '>=3.6' + hash: + md5: 348e27e78a5e39090031448c72f66d5e + sha256: 42be6ac8478051b26751d778490d6a71de12e5c6443e145ff3eddbc577d9bcda + manager: conda + name: fasteners + optional: false + platform: osx-64 + url: https://conda.anaconda.org/conda-forge/noarch/fasteners-0.17.3-pyhd8ed1ab_0.tar.bz2 + version: 0.17.3 - category: main dependencies: python: '>=3.7' @@ -13407,6 +13516,20 @@ package: platform: osx-64 url: https://conda.anaconda.org/conda-forge/noarch/mpmath-1.3.0-pyhd8ed1ab_0.conda version: 1.3.0 +- category: main + dependencies: + libcxx: '>=14.0.6' + python: '>=3.11,<3.12.0a0' + python_abi: 3.11.* *_cp311 + hash: + md5: d3a60c5422b7d61b2740c7c5df508c86 + sha256: ebe4d269e0605e7de3b9a9199e1e20d96c66945ac67fe1fccf778177d1a615a7 + manager: conda + name: msgpack-python + optional: false + platform: osx-64 + url: https://conda.anaconda.org/conda-forge/osx-64/msgpack-python-1.0.5-py311hd2070f0_0.conda + version: 1.0.5 - category: main dependencies: python: '' @@ -13758,14 +13881,14 @@ package: dependencies: python: '>=3.6' hash: - md5: b8afba5fbf891019eae564c3edb28b9e - sha256: 904c98c6bb45302f6349656d5e2f2743677da4254ac76be660475ecdd0fd0c6a + md5: de631f19ba156d224d80241e3fc7d32f + sha256: c0781a1aacc2227379c9614852bf92b967a0e8b52f66c04b5723e0b7b2bd4f1e manager: conda name: python-tzdata optional: false platform: osx-64 - url: 
https://conda.anaconda.org/conda-forge/noarch/python-tzdata-2022.7-pyhd8ed1ab_0.conda - version: '2022.7' + url: https://conda.anaconda.org/conda-forge/noarch/python-tzdata-2023.2-pyhd8ed1ab_0.conda + version: '2023.2' - category: main dependencies: python: '>=3.11,<3.12.0a0' @@ -15636,6 +15759,19 @@ package: platform: osx-64 url: https://conda.anaconda.org/conda-forge/osx-64/harfbuzz-6.0.0-h08f8713_0.conda version: 6.0.0 +- category: main + dependencies: + importlib-metadata: '' + python: '>=3.7' + hash: + md5: b2ec66de3480db2f5124f547cad7e7a4 + sha256: 0569bf2c7ad1f647b4f4b83dd1152df093585ff9a2ba38af98d7a2a348b2b74a + manager: conda + name: humanize + optional: false + platform: osx-64 + url: https://conda.anaconda.org/conda-forge/noarch/humanize-4.6.0-pyhd8ed1ab_0.conda + version: 4.6.0 - category: main dependencies: importlib-metadata: '>=6.1.0,<6.1.1.0a0' @@ -16522,6 +16658,23 @@ package: platform: osx-64 url: https://conda.anaconda.org/conda-forge/osx-64/numba-0.56.3-py310h62db5c2_0.tar.bz2 version: 0.56.3 +- category: main + dependencies: + entrypoints: '' + libcxx: '>=14.0.6' + msgpack-python: '' + numpy: '>=1.7' + python: '>=3.11,<3.12.0a0' + python_abi: 3.11.* *_cp311 + hash: + md5: 21f2ae35161c19b8c4ad0791d12ef2a3 + sha256: 27c7eb0a5f6b3726252c5331b6101d492b8280af0a041b1c8b5fcd45ee56268d + manager: conda + name: numcodecs + optional: false + platform: osx-64 + url: https://conda.anaconda.org/conda-forge/osx-64/numcodecs-0.11.0-py311h814d153_1.conda + version: 0.11.0 - category: main dependencies: libcxx: '>=14.0.6' @@ -17043,6 +17196,22 @@ package: platform: osx-64 url: https://conda.anaconda.org/conda-forge/noarch/xarray-2023.3.0-pyhd8ed1ab_0.conda version: 2023.3.0 +- category: main + dependencies: + asciitree: '' + fasteners: '' + numcodecs: '>=0.10.0' + numpy: '>=1.7' + python: '>=3.5' + hash: + md5: 0c5776fe65a12a421d7ddf90411a6c3f + sha256: 0f029f7efea00b8258782b5e68989fc140c227e6d9edd231d46fdd954b39d23f + manager: conda + name: zarr + optional: 
false + platform: osx-64 + url: https://conda.anaconda.org/conda-forge/noarch/zarr-2.14.2-pyhd8ed1ab_0.conda + version: 2.14.2 - category: main dependencies: python: '>=3.8' @@ -18165,7 +18334,6 @@ package: - category: main dependencies: joblib: '>=1.1.1' - libcblas: '>=3.9.0,<4.0a0' libcxx: '>=14.0.6' llvm-openmp: '>=14.0.6' numpy: '>=1.23.5,<2.0a0' @@ -18174,13 +18342,13 @@ package: scipy: '' threadpoolctl: '>=2.0.0' hash: - md5: 7eedaa9f41426c4208d97ebea14b59b7 - sha256: 6765710a393afdde4e96d75047d81ca5276dc6c1b3a0390ee78c9a829c054b33 + md5: e2dd2bd2dcf23b11d5af2d6df01904a6 + sha256: 5595daa14a0d93752eef7b266b0bdf0a8c1b12c1260c6f1e6862c52ab030772b manager: conda name: scikit-learn optional: false platform: osx-64 - url: https://conda.anaconda.org/conda-forge/osx-64/scikit-learn-1.2.2-py311h087fafe_0.conda + url: https://conda.anaconda.org/conda-forge/osx-64/scikit-learn-1.2.2-py311hda7f639_1.conda version: 1.2.2 - category: main dependencies: @@ -22877,6 +23045,18 @@ package: platform: win-64 url: https://conda.anaconda.org/conda-forge/win-64/antlr-python-runtime-4.7.2-py311h1ea47a8_1003.tar.bz2 version: 4.7.2 +- category: main + dependencies: + python: '' + hash: + md5: c0481c9de49f040272556e2cedf42816 + sha256: b3e9369529fe7d721b66f18680ff4b561e20dbf6507e209e1f60eac277c97560 + manager: conda + name: asciitree + optional: false + platform: win-64 + url: https://conda.anaconda.org/conda-forge/noarch/asciitree-0.3.3-py_2.tar.bz2 + version: 0.3.3 - category: main dependencies: python: '>2.7' @@ -23225,6 +23405,18 @@ package: platform: win-64 url: https://conda.anaconda.org/conda-forge/noarch/executing-1.2.0-pyhd8ed1ab_0.tar.bz2 version: 1.2.0 +- category: main + dependencies: + python: '>=3.6' + hash: + md5: 348e27e78a5e39090031448c72f66d5e + sha256: 42be6ac8478051b26751d778490d6a71de12e5c6443e145ff3eddbc577d9bcda + manager: conda + name: fasteners + optional: false + platform: win-64 + url: 
https://conda.anaconda.org/conda-forge/noarch/fasteners-0.17.3-pyhd8ed1ab_0.tar.bz2 + version: 0.17.3 - category: main dependencies: python: '>=3.7' @@ -23712,6 +23904,22 @@ package: platform: win-64 url: https://conda.anaconda.org/conda-forge/noarch/mpmath-1.3.0-pyhd8ed1ab_0.conda version: 1.3.0 +- category: main + dependencies: + python: '>=3.11,<3.12.0a0' + python_abi: 3.11.* *_cp311 + ucrt: '>=10.0.20348.0' + vc: '>=14.2,<15' + vs2015_runtime: '>=14.29.30139' + hash: + md5: 01a252f384a5d1ad338cff1184d9a9c0 + sha256: 0df20f87b92ad3e86f4ed468d753901bf2da4e4661e395e8dbc5ee4f652ca9cc + manager: conda + name: msgpack-python + optional: false + platform: win-64 + url: https://conda.anaconda.org/conda-forge/win-64/msgpack-python-1.0.5-py311h005e61a_0.conda + version: 1.0.5 - category: main dependencies: python: '' @@ -24024,14 +24232,14 @@ package: dependencies: python: '>=3.6' hash: - md5: b8afba5fbf891019eae564c3edb28b9e - sha256: 904c98c6bb45302f6349656d5e2f2743677da4254ac76be660475ecdd0fd0c6a + md5: de631f19ba156d224d80241e3fc7d32f + sha256: c0781a1aacc2227379c9614852bf92b967a0e8b52f66c04b5723e0b7b2bd4f1e manager: conda name: python-tzdata optional: false platform: win-64 - url: https://conda.anaconda.org/conda-forge/noarch/python-tzdata-2022.7-pyhd8ed1ab_0.conda - version: '2022.7' + url: https://conda.anaconda.org/conda-forge/noarch/python-tzdata-2023.2-pyhd8ed1ab_0.conda + version: '2023.2' - category: main dependencies: python: '>=3.11,<3.12.0a0' @@ -26021,6 +26229,19 @@ package: platform: win-64 url: https://conda.anaconda.org/conda-forge/win-64/glib-2.74.1-h12be248_1.tar.bz2 version: 2.74.1 +- category: main + dependencies: + importlib-metadata: '' + python: '>=3.7' + hash: + md5: b2ec66de3480db2f5124f547cad7e7a4 + sha256: 0569bf2c7ad1f647b4f4b83dd1152df093585ff9a2ba38af98d7a2a348b2b74a + manager: conda + name: humanize + optional: false + platform: win-64 + url: https://conda.anaconda.org/conda-forge/noarch/humanize-4.6.0-pyhd8ed1ab_0.conda + version: 4.6.0 
- category: main dependencies: importlib-metadata: '>=6.1.0,<6.1.1.0a0' @@ -27342,6 +27563,25 @@ package: platform: win-64 url: https://conda.anaconda.org/conda-forge/win-64/numba-0.56.3-py310h19bcfe9_0.tar.bz2 version: 0.56.3 +- category: main + dependencies: + entrypoints: '' + msgpack-python: '' + numpy: '>=1.7' + python: '>=3.11,<3.12.0a0' + python_abi: 3.11.* *_cp311 + ucrt: '>=10.0.20348.0' + vc: '>=14.2,<15' + vs2015_runtime: '>=14.29.30139' + hash: + md5: 9b66ec550742f950ba5fdc1fe650ce26 + sha256: 07dc9ec52ddb18a70f8ada65a9be9a982a427b6bcd479725659cc0d34d9e8fa0 + manager: conda + name: numcodecs + optional: false + platform: win-64 + url: https://conda.anaconda.org/conda-forge/win-64/numcodecs-0.11.0-py311h12c1d0e_1.conda + version: 0.11.0 - category: main dependencies: dataclasses: '' @@ -29471,6 +29711,22 @@ package: platform: win-64 url: https://conda.anaconda.org/conda-forge/noarch/xarray-2023.3.0-pyhd8ed1ab_0.conda version: 2023.3.0 +- category: main + dependencies: + asciitree: '' + fasteners: '' + numcodecs: '>=0.10.0' + numpy: '>=1.7' + python: '>=3.5' + hash: + md5: 0c5776fe65a12a421d7ddf90411a6c3f + sha256: 0f029f7efea00b8258782b5e68989fc140c227e6d9edd231d46fdd954b39d23f + manager: conda + name: zarr + optional: false + platform: win-64 + url: https://conda.anaconda.org/conda-forge/noarch/zarr-2.14.2-pyhd8ed1ab_0.conda + version: 2.14.2 - category: main dependencies: geos: '>=3.11.2,<3.11.3.0a0' @@ -30241,7 +30497,6 @@ package: - category: main dependencies: joblib: '>=1.1.1' - libcblas: '>=3.9.0,<4.0a0' numpy: '>=1.23.5,<2.0a0' python: '>=3.11,<3.12.0a0' python_abi: 3.11.* *_cp311 @@ -30251,13 +30506,13 @@ package: vc: '>=14.2,<15' vs2015_runtime: '>=14.29.30139' hash: - md5: 05b85818eef8dd80bfd854b9bde11e09 - sha256: 0f7f139c9218348c0ab53057e0f6a78da240a911ebef0e9e21af3bdecd29f39b + md5: ce1dbe81f1199a0e2719c9876715f7d4 + sha256: d362742096303d8d4eb0f052d53327074e9bbbae6ad297f7bde7a92c16f238ea manager: conda name: scikit-learn optional: false 
platform: win-64 - url: https://conda.anaconda.org/conda-forge/win-64/scikit-learn-1.2.2-py311h6619ee7_0.conda + url: https://conda.anaconda.org/conda-forge/win-64/scikit-learn-1.2.2-py311h142b183_1.conda version: 1.2.2 - category: main dependencies: diff --git a/.binder/environment-python_and_r.yml b/.binder/environment-python_and_r.yml index 01e961c3..0dd44b2c 100644 --- a/.binder/environment-python_and_r.yml +++ b/.binder/environment-python_and_r.yml @@ -23,6 +23,7 @@ dependencies: - geopandas - geoplot - gridgeo + - humanize - hvplot - ioos_qc - ipyleaflet @@ -63,6 +64,7 @@ dependencies: - xarray - xlrd - xmltodict + - zarr # R packages. - rpy2 - r-base=4 diff --git a/.binder/environment.yml b/.binder/environment.yml index 3723d6d5..f6ba20cb 100644 --- a/.binder/environment.yml +++ b/.binder/environment.yml @@ -22,6 +22,7 @@ dependencies: - geopandas - geoplot - gridgeo + - humanize - hvplot - ioos_qc - ipyleaflet @@ -62,3 +63,4 @@ dependencies: - xarray - xlrd - xmltodict + - zarr diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3ab1c1fb..b0a06279 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -35,12 +35,12 @@ repos: - id: add-trailing-comma - repo: https://github.com/charliermarsh/ruff-pre-commit - rev: v0.0.254 + rev: v0.0.257 hooks: - id: ruff - repo: https://github.com/nbQA-dev/nbQA - rev: 1.6.3 + rev: 1.6.4 hooks: # mdformat works on the CLI but not as pre-commit yet. # Use `nbqa mdformat jupyterbook --nbqa-md` to run it locally. 
diff --git a/jupyterbook/content/code_gallery/data_management_notebooks/2023-03-20-Reading_and_writing_zarr.ipynb b/jupyterbook/content/code_gallery/data_management_notebooks/2023-03-20-Reading_and_writing_zarr.ipynb new file mode 100644 index 00000000..76be410b --- /dev/null +++ b/jupyterbook/content/code_gallery/data_management_notebooks/2023-03-20-Reading_and_writing_zarr.ipynb @@ -0,0 +1,1681 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "29370978", + "metadata": {}, + "source": [ + "# Reading and writing zarr files with xarray\n", + "\n", + "\n", + "Created: 2023-03-20\n", + "\n", + "\n", + "\n", + "[The zarr format](https://zarr.readthedocs.io/en/stable/) is a file storage based specification for chunked, compressed, N-dimensional arrays. The format is based on an open-source specification and its main goal is to make cloud data read/write a bit easier and more effective.\n", + "\n", + "The main problems in data storage are:\n", + "\n", + "1. Read/write data that is larger than memory\n", + "1. Being able to parallelize computations\n", + "1. Reduce the I/O bottleneck\n", + "1. Compression\n", + "1. Speed\n", + "\n", + "One solution is to use a chunked\\* parallel computing framework and a chunked parallel storage library. Zarr helps us with the latter.\n", + "\n", + "In this example we will load ocean model data, stored as netCDF and served via THREDDS, subset it and save as zarr. Let's start by saving a single time step for the surface layer temperature and salinity.\n", + "\n", + "\n", + "\\* Many data formats can take advantage of storing the data in chunks for faster access, but the zarr approach is different in that each chunk is a different object in cloud storage, making them better for parallel access. The chunks can be compressed to reduce their size and improve cloud performance even further. Zarr has a nice tutorial on how to balance chunk size for performance. 
Check it out: https://zarr.readthedocs.io/en/stable/tutorial.html#chunk-optimizations." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "2961833f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.Dataset>\n",
+       "Dimensions:   (time: 1, eta_rho: 106, xi_rho: 242)\n",
+       "Coordinates:\n",
+       "    s_rho     float64 -0.0125\n",
+       "    lon_rho   (eta_rho, xi_rho) float64 ...\n",
+       "    lat_rho   (eta_rho, xi_rho) float64 ...\n",
+       "  * time      (time) datetime64[ns] 2022-06-06T12:00:00\n",
+       "    time_run  (time) datetime64[ns] ...\n",
+       "Dimensions without coordinates: eta_rho, xi_rho\n",
+       "Data variables:\n",
+       "    temp      (time, eta_rho, xi_rho) float64 ...\n",
+       "    salt      (time, eta_rho, xi_rho) float64 ...\n",
+       "Attributes: (12/46)\n",
+       "    file:                            doppio_avg_6280_0004.nc\n",
+       "    format:                          netCDF-4/HDF5 file\n",
+       "    Conventions:                     CF-1.4, SGRID-0.3\n",
+       "    type:                            ROMS/TOMS nonlinear model averages file\n",
+       "    title:                           ROMS doppio Real-Time Operational PSAS F...\n",
+       "    var_info:                        ../Data/varinfo1040t_daily.dat\n",
+       "    ...                              ...\n",
+       "    his_base:                        doppio_his_6280\n",
+       "    cdm_data_type:                   GRID\n",
+       "    featureType:                     GRID\n",
+       "    location:                        Proto fmrc:doppio_2017_da_avg\n",
+       "    summary:                         doppio\n",
+       "    DODS_EXTRA.Unlimited_Dimension:  ocean_time
" + ], + "text/plain": [ + "\n", + "Dimensions: (time: 1, eta_rho: 106, xi_rho: 242)\n", + "Coordinates:\n", + " s_rho float64 -0.0125\n", + " lon_rho (eta_rho, xi_rho) float64 ...\n", + " lat_rho (eta_rho, xi_rho) float64 ...\n", + " * time (time) datetime64[ns] 2022-06-06T12:00:00\n", + " time_run (time) datetime64[ns] ...\n", + "Dimensions without coordinates: eta_rho, xi_rho\n", + "Data variables:\n", + " temp (time, eta_rho, xi_rho) float64 ...\n", + " salt (time, eta_rho, xi_rho) float64 ...\n", + "Attributes: (12/46)\n", + " file: doppio_avg_6280_0004.nc\n", + " format: netCDF-4/HDF5 file\n", + " Conventions: CF-1.4, SGRID-0.3\n", + " type: ROMS/TOMS nonlinear model averages file\n", + " title: ROMS doppio Real-Time Operational PSAS F...\n", + " var_info: ../Data/varinfo1040t_daily.dat\n", + " ... ...\n", + " his_base: doppio_his_6280\n", + " cdm_data_type: GRID\n", + " featureType: GRID\n", + " location: Proto fmrc:doppio_2017_da_avg\n", + " summary: doppio\n", + " DODS_EXTRA.Unlimited_Dimension: ocean_time" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import xarray as xr\n", + "\n", + "url = (\n", + " \"https://tds.marine.rutgers.edu/thredds/dodsC/roms/doppio/2017_da/avg/Averages_Best\"\n", + ")\n", + "\n", + "ds = xr.open_dataset(url)\n", + "\n", + "time_slice = {\"time\": \"2022-06-06\"}\n", + "surface = {\"s_rho\": -1}\n", + "\n", + "ds = ds[[\"temp\", \"salt\"]].sel(time_slice).isel(surface)\n", + "\n", + "ds" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "d8fa04f9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'820.9 kB'" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import humanize\n", + "\n", + "humanize.naturalsize(ds.nbytes)" + ] + }, + { + "cell_type": "markdown", + "id": "a661b199", + "metadata": {}, + "source": [ + "It is a small subset but it is enough to illustrate 
zarr's compression options.\n", + "\n", + "Now let's choose a compression level and save it as zarr." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "d8e3601c", + "metadata": {}, + "outputs": [], + "source": [ + "import zarr\n", + "\n", + "compressor = zarr.Blosc(clevel=2, shuffle=-1)\n", + "\n", + "\n", + "fname = \"doppio/doppio_compressed.zarr\"\n", + "\n", + "ds.to_zarr(\n", + " fname,\n", + " mode=\"w\",\n", + " safe_chunks=True,\n", + " consolidated=True,\n", + " encoding={var: {\"compressor\": compressor} for var in ds.variables},\n", + ");" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "d982caef", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[01;34mdoppio/doppio_compressed.zarr\u001b[0m\n", + "├── \u001b[01;34mlat_rho\u001b[0m\n", + "│   └── 0.0\n", + "├── \u001b[01;34mlon_rho\u001b[0m\n", + "│   └── 0.0\n", + "├── \u001b[01;34msalt\u001b[0m\n", + "│   └── 0.0.0\n", + "├── \u001b[01;34ms_rho\u001b[0m\n", + "│   └── 0\n", + "├── \u001b[01;34mtemp\u001b[0m\n", + "│   └── 0.0.0\n", + "├── \u001b[01;34mtime\u001b[0m\n", + "│   └── 0\n", + "└── \u001b[01;34mtime_run\u001b[0m\n", + " └── 0\n", + "\n", + "7 directories, 7 files\n", + "144K\tdoppio/doppio_compressed.zarr/salt\n", + "156K\tdoppio/doppio_compressed.zarr/lat_rho\n", + "148K\tdoppio/doppio_compressed.zarr/temp\n", + "16K\tdoppio/doppio_compressed.zarr/time\n", + "16K\tdoppio/doppio_compressed.zarr/s_rho\n", + "16K\tdoppio/doppio_compressed.zarr/time_run\n", + "156K\tdoppio/doppio_compressed.zarr/lon_rho\n", + "676K\tdoppio/doppio_compressed.zarr\n" + ] + } + ], + "source": [ + "!tree doppio/*zarr\n", + "!du -h doppio/*zarr" + ] + }, + { + "cell_type": "markdown", + "id": "e345b204", + "metadata": {}, + "source": [ + "The first thing to observe is that the zarr format is a directory based storage. That structure should be familiar for HDF5 users. 
However, instead of being a filesystem inside a filesystem, zarr is laid out directly on the disk filesystem.\n", + "\n", + "Each variable and coordinate has its own directory and the data chunks are stored in subdirectories. For more information [check this awesome](https://www.youtube.com/watch?v=qyJXBlrdzBs&list=PLKbXDtRY2ZfU6OfZ8JQimBEY-u1WLCpwp) presentation from one of the zarr authors.\n", + "\n", + "Note that the stored size is quite a bit smaller too! We went from 820.9 kB to 676 kB. Zarr has many modern compression options as plugins, [including some bitinformation based methods](https://xbitinfo.readthedocs.io/en/latest/api/xbitinfo.save_compressed.ToCompressed_Zarr.html).\n", + "\n", + "The data attributes, groups, and metadata are stored in the `.zattrs`, `.zgroup`, and `.zmetadata`. They are plain text JSON files and easy to parse:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "44822485", + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'metadata': {'.zattrs': {'CPP_options': 'DOPPIO, ADD_FSOBC, ADD_M2OBC, ANA_BSFLUX, ANA_BTFLUX, ASSUMED_SHAPE, ATM_PRESS, AVERAGES, !BOUNDARY_A BULK_FLUXES, !COLLECT_ALL..., CHARNOK, CRAIG_BANNER, CURVGRID, DEFLATE, DIURNAL_SRFLUX, DJ_GRADPS, DOUBLE_PRECISION, EMINUSP, FLOATS, FORWARD_WRITE, GLS_MIXING, HDF5, KANTHA_CLAYSON, LONGWAVE_OUT, MASKING, MIX_GEO_TS, MIX_S_UV, MPI, NONLINEAR, NONLIN_EOS, NO_LBC_ATT, N2S2_HORAVG, OUT_DOUBLE, POWER_LAW, PROFILE, K_GSCHEME, REDUCE_ALLREDUCE, !RST_SINGLE, SALINITY, SOLAR_SOURCE, SOLVE3D, SSH_TIDES, TS_DIF2, UV_ADV, UV_COR, UV_U3HADVECTION, UV_C4VADVECTION, UV_QDRAG, UV_TIDES, UV_VIS2, VAR_RHO_2D, WIND_MINUS_CURRENT',\n", + " 'Conventions': 'CF-1.4, SGRID-0.3',\n", + " 'DODS_EXTRA.Unlimited_Dimension': 'ocean_time',\n", + " 'NLM_LBC': '\\nEDGE: WEST SOUTH EAST NORTH \\nzeta: Cha Cha Cha Clo \\nubar: Fla Fla Fla Clo \\nvbar: Fla Fla Fla Clo \\nu: RadNud RadNud RadNud Clo \\nv: RadNud RadNud RadNud Clo \\ntemp: Rad Rad Rad Clo 
\\nsalt: Rad Rad Rad Clo \\ntke: Gra Gra Gra Clo',\n", + " 'NLM_TADV': '\\nADVECTION: HORIZONTAL VERTICAL \\ntemp: Akima4 Akima4 \\nsalt: Akima4 Akima4',\n", + " '_CoordSysBuilder': 'ucar.nc2.dataset.conv.CF1Convention',\n", + " 'ana_file': 'ROMS/Functionals/ana_btflux.h, ROMS/Functionals/ana_srflux.h',\n", + " 'avg_base': 'doppio_avg_6280',\n", + " 'cdm_data_type': 'GRID',\n", + " 'clm_file_01': '../Data/doppio_clm.nc',\n", + " 'code_dir': '/home/julia/ROMS/doppio/svn1040t',\n", + " 'compiler_command': '/opt/sw/apps/intel-18.0.1/openmpi/3.1.2/bin/mpif90',\n", + " 'compiler_flags': '-fp-model precise -heap-arrays -ip -O3 -traceback -check uninit',\n", + " 'compiler_system': 'ifort',\n", + " 'cpu': 'x86_64',\n", + " 'featureType': 'GRID',\n", + " 'file': 'doppio_avg_6280_0004.nc',\n", + " 'flt_file': 'doppio_flt_6280.nc',\n", + " 'format': 'netCDF-4/HDF5 file',\n", + " 'fpos_file': 'floats.in',\n", + " 'frc_file_01': '../Data/lwrad_down_ncepnam_3hourly_MAB_and_GoM.nc',\n", + " 'frc_file_02': '../Data/Pair_ncepnam_3hourly_MAB_and_GoM.nc',\n", + " 'frc_file_03': '../Data/Qair_ncepnam_3hourly_MAB_and_GoM.nc',\n", + " 'frc_file_04': '../Data/rain_ncepnam_3hourly_MAB_and_GoM.nc',\n", + " 'frc_file_05': '../Data/swrad_daily_ncepnam_3hourly_MAB_and_GoM.nc',\n", + " 'frc_file_06': '../Data/Tair_ncepnam_3hourly_MAB_and_GoM.nc',\n", + " 'frc_file_07': '../Data/Winds_ncepnam_3hourly_MAB_and_GoM.nc',\n", + " 'grd_file': '/home/om/roms/doppio/7km/grid_doppio_JJA_v13.nc',\n", + " 'header_dir': '/home/julia/ROMS/doppio/Compile/fwd',\n", + " 'header_file': 'doppio.h',\n", + " 'his_base': 'doppio_his_6280',\n", + " 'history': 'ROMS/TOMS, Version 3.9, Thursday - March 16, 2023 - 4:32:53 AM ;\\nFMRC Best Dataset',\n", + " 'ini_file': 'doppio_rst_6280.nc',\n", + " 'location': 'Proto fmrc:doppio_2017_da_avg',\n", + " 'nud_file': '/home/om/roms/doppio/7km/doppio_nudgcoef_7km_1500-2000_GS.nc',\n", + " 'os': 'Linux',\n", + " 'rst_file': 'tmp_doppio_rst_6280.nc',\n", + " 'script_file': 
'nl_ocean_doppio.in',\n", + " 'summary': 'doppio',\n", + " 'svn_rev': '1040',\n", + " 'svn_url': 'https://www.myroms.org/svn/src/trunk',\n", + " 'tide_file': '/home/om/roms/doppio/7km/doppio_tide_7km.nc',\n", + " 'tiling': '004x004',\n", + " 'title': 'ROMS doppio Real-Time Operational PSAS Forecast System Version 1 FMRC Averages',\n", + " 'type': 'ROMS/TOMS nonlinear model averages file',\n", + " 'var_info': '../Data/varinfo1040t_daily.dat'},\n", + " '.zgroup': {'zarr_format': 2},\n", + " 'lat_rho/.zarray': {'chunks': [106, 242],\n", + " 'compressor': {'blocksize': 0,\n", + " 'clevel': 2,\n", + " 'cname': 'lz4',\n", + " 'id': 'blosc',\n", + " 'shuffle': -1},\n", + " 'dtype': '\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.Dataset>\n",
+       "Dimensions:   (eta_rho: 106, xi_rho: 242, time: 1)\n",
+       "Coordinates:\n",
+       "    lat_rho   (eta_rho, xi_rho) float64 dask.array<chunksize=(106, 242), meta=np.ndarray>\n",
+       "    lon_rho   (eta_rho, xi_rho) float64 dask.array<chunksize=(106, 242), meta=np.ndarray>\n",
+       "    s_rho     float64 ...\n",
+       "  * time      (time) datetime64[ns] 2022-06-06T12:00:00\n",
+       "    time_run  (time) datetime64[ns] dask.array<chunksize=(1,), meta=np.ndarray>\n",
+       "Dimensions without coordinates: eta_rho, xi_rho\n",
+       "Data variables:\n",
+       "    salt      (time, eta_rho, xi_rho) float64 dask.array<chunksize=(1, 106, 242), meta=np.ndarray>\n",
+       "    temp      (time, eta_rho, xi_rho) float64 dask.array<chunksize=(1, 106, 242), meta=np.ndarray>\n",
+       "Attributes: (12/46)\n",
+       "    CPP_options:                     DOPPIO, ADD_FSOBC, ADD_M2OBC, ANA_BSFLUX...\n",
+       "    Conventions:                     CF-1.4, SGRID-0.3\n",
+       "    DODS_EXTRA.Unlimited_Dimension:  ocean_time\n",
+       "    NLM_LBC:                         \\nEDGE:  WEST   SOUTH  EAST   NORTH  \\nz...\n",
+       "    NLM_TADV:                        \\nADVECTION:   HORIZONTAL   VERTICAL    ...\n",
+       "    _CoordSysBuilder:                ucar.nc2.dataset.conv.CF1Convention\n",
+       "    ...                              ...\n",
+       "    svn_url:                         https://www.myroms.org/svn/src/trunk\n",
+       "    tide_file:                       /home/om/roms/doppio/7km/doppio_tide_7km.nc\n",
+       "    tiling:                          004x004\n",
+       "    title:                           ROMS doppio Real-Time Operational PSAS F...\n",
+       "    type:                            ROMS/TOMS nonlinear model averages file\n",
+       "    var_info:                        ../Data/varinfo1040t_daily.dat
" + ], + "text/plain": [ + "\n", + "Dimensions: (eta_rho: 106, xi_rho: 242, time: 1)\n", + "Coordinates:\n", + " lat_rho (eta_rho, xi_rho) float64 dask.array\n", + " lon_rho (eta_rho, xi_rho) float64 dask.array\n", + " s_rho float64 ...\n", + " * time (time) datetime64[ns] 2022-06-06T12:00:00\n", + " time_run (time) datetime64[ns] dask.array\n", + "Dimensions without coordinates: eta_rho, xi_rho\n", + "Data variables:\n", + " salt (time, eta_rho, xi_rho) float64 dask.array\n", + " temp (time, eta_rho, xi_rho) float64 dask.array\n", + "Attributes: (12/46)\n", + " CPP_options: DOPPIO, ADD_FSOBC, ADD_M2OBC, ANA_BSFLUX...\n", + " Conventions: CF-1.4, SGRID-0.3\n", + " DODS_EXTRA.Unlimited_Dimension: ocean_time\n", + " NLM_LBC: \\nEDGE: WEST SOUTH EAST NORTH \\nz...\n", + " NLM_TADV: \\nADVECTION: HORIZONTAL VERTICAL ...\n", + " _CoordSysBuilder: ucar.nc2.dataset.conv.CF1Convention\n", + " ... ...\n", + " svn_url: https://www.myroms.org/svn/src/trunk\n", + " tide_file: /home/om/roms/doppio/7km/doppio_tide_7km.nc\n", + " tiling: 004x004\n", + " title: ROMS doppio Real-Time Operational PSAS F...\n", + " type: ROMS/TOMS nonlinear model averages file\n", + " var_info: ../Data/varinfo1040t_daily.dat" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "subset = xr.open_zarr(fname)\n", + "\n", + "subset" + ] + }, + { + "cell_type": "markdown", + "id": "eeded07f", + "metadata": {}, + "source": [ + "And a quick plot to check the data." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "fb7122f3", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "subset[\"temp\"].squeeze().plot(x=\"lon_rho\", y=\"lat_rho\");" + ] + }, + { + "cell_type": "markdown", + "id": "0254598f", + "metadata": {}, + "source": [ + "What is the current workflow and what are the alternatives? Most ocean data are stored as modern netCDF files that are, under the hood, HDF5 files with a stricter metadata structure. HDF5 has some limitations, such as:\n", + "\n", + "- no thread-based parallelism\n", + "- cannot do parallel writes with compression\n", + "- no support for cloud object stores\n", + "\n", + "However, for most workflows what really matters is the chunking, not the data format. That leaves the parallelism, compression, and cloud support to be built on top of it with `dask`, `numcodecs`, and `fsspec`, respectively. That raises the question: Should one convert all the existing data to `zarr`? Luckily no! We can adopt a less expensive workflow and use kerchunk to create virtual cloud-optimized CF-compliant datasets that access files in any format using the Zarr library.\n", + "\n", + "We can write the data in whatever format we need (maybe you are NASA and require HDF5, maybe you have R users who like netcdf, or want to use a visualization tool that only reads geotiff), then rechunk the data to best support the expected use cases." 
+ ] + } + ], + "metadata": { + "_draft": { + "nbviewer_url": "https://gist.github.com/9fc25f2a1a1a653535bc15428798dfab" + }, + "gist": { + "data": { + "description": "netcdf-c-zarr.ipynb", + "public": true + }, + "id": "9fc25f2a1a1a653535bc15428798dfab" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/pytest.ini b/pytest.ini index 9f40dab6..b2fbea15 100644 --- a/pytest.ini +++ b/pytest.ini @@ -5,7 +5,7 @@ addopts = --ignore="jupyterbook/content/code_gallery/data_access_notebooks/2016-12-22-boston_light_swim.ipynb" --ignore="jupyterbook/content/code_gallery/data_access_notebooks/2017-03-21-ERDDAP_IOOS_Sensor_Map.ipynb" --ignore="jupyterbook/content/code_gallery/data_access_notebooks/2017-07-25-HFRadar_currents.ipynb" - --ignore="jupyterbook/content/code_gallery/data_access_notebooks/2017-09-09-hurricane_irma" + --ignore="jupyterbook/content/code_gallery/data_access_notebooks/2017-09-09-hurricane_irma.ipynb" --ignore="jupyterbook/content/code_gallery/data_access_notebooks/2017-12-15-finding_HFRadar_currents.ipynb" --ignore="jupyterbook/content/code_gallery/data_access_notebooks/2019-02-26-hurricane_gis_part01.ipynb" --ignore="jupyterbook/content/code_gallery/data_access_notebooks/2019-03-08-grids-temperature"