From 546302c297b28ff56b4a3753218fc7d328b478fc Mon Sep 17 00:00:00 2001 From: Juniper Tyree <50025784+juntyr@users.noreply.github.com> Date: Fri, 1 Nov 2024 09:55:45 +0200 Subject: [PATCH 1/3] Add 02-quality.ipynb --- 03-examples/02-quality.ipynb | 270 +++++++++++++++++++++++++++++++++++ 1 file changed, 270 insertions(+) create mode 100644 03-examples/02-quality.ipynb diff --git a/03-examples/02-quality.ipynb b/03-examples/02-quality.ipynb new file mode 100644 index 0000000..381c2b4 --- /dev/null +++ b/03-examples/02-quality.ipynb @@ -0,0 +1,270 @@ +{ + "metadata": { + "kernelspec": { + "display_name": "Python (Pyodide)", + "language": "python", + "name": "python" + }, + "language_info": { + "codemirror_mode": { + "name": "python", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat_minor": 4, + "nbformat": 4, + "cells": [ + { + "cell_type": "markdown", + "source": "# Example: Which compression method works best for different quality requirements?", + "metadata": { + "tags": [] + } + }, + { + "cell_type": "markdown", + "source": "> [!NOTE]\n> The online laboratory has only been tested in recent Firefox and Chrome browsers. Some features may not (yet) be supported in Safari browsers.\n\n> [!CAUTION]\n> Any changes you make to this notebook will be lost once the page is closed or refreshed. 
Please download any files you would like to keep.", + "metadata": {} + }, + { + "cell_type": "markdown", + "source": "## Setting up the online laboratory\n\nFirst, we import several utility packages to set up this notebook for data compression in climate science and meteorology.\n\nPlease refer back to the [01-intro.ipynb](../01-intro.ipynb) notebook for a deeper explanation.", + "metadata": {} + }, + { + "cell_type": "code", + "source": "from pathlib import Path\n\nimport dask\nimport fcbench\nimport numpy as np\nimport pandas as pd\nimport xarray as xr", + "metadata": { + "autorun": true, + "tags": [], + "trusted": true + }, + "outputs": [ + { + "name": "stdout", + "text": "[pyodide]: Loading Jinja2, MarkupSafe, Pillow, appdirs, asciitree, attrs, cffi, cfgrib, cftime, click, cloudpickle, cycler, dask, eccodes, field-compression-benchmark, findlibs, flexcache, flexparser, fonttools, h5py, importlib_metadata, joblib, kiwisolver, locket, matplotlib, matplotlib-pyodide, msgpack, netcdf4, numcodecs, numpy, openblas, pandas, partd, pint, pkgconfig, pycparser, pyparsing, python-dateutil, pytz, pyyaml, scikit-learn, scipy, threadpoolctl, toolz, tqdm, typing-extensions, tzdata, xarray, xarray-datatree, xeofs, xhistogram, zarr, zipp\n[pyodide]: Loaded Jinja2, MarkupSafe, Pillow, appdirs, asciitree, attrs, cffi, cfgrib, cftime, click, cloudpickle, cycler, dask, eccodes, field-compression-benchmark, findlibs, flexcache, flexparser, fonttools, h5py, importlib_metadata, joblib, kiwisolver, locket, matplotlib, matplotlib-pyodide, msgpack, netcdf4, numcodecs, numpy, openblas, pandas, partd, pint, pkgconfig, pycparser, pyparsing, python-dateutil, pytz, pyyaml, scikit-learn, scipy, threadpoolctl, toolz, tqdm, typing-extensions, tzdata, xarray, xarray-datatree, xeofs, xhistogram, zarr, zipp\n[pyodide]: Memory usage has grown to 171.0MiB (from 49.9MiB) for this notebook\n", + "output_type": "stream" + } + ], + "execution_count": 1 + }, + { + "cell_type": "markdown", + "source": "We 
also import a utility module `utils.py` from the parent directory.", + "metadata": {} + }, + { + "cell_type": "code", + "source": "import sys\nsys.path.insert(0, \"..\")", + "metadata": { + "trusted": true + }, + "outputs": [], + "execution_count": 2 + }, + { + "cell_type": "code", + "source": "import utils", + "metadata": { + "trusted": true + }, + "outputs": [ + { + "name": "stdout", + "text": "[pyodide]: Loading fsspec, ipyfilite, ipywidgets, jupyterlab_widgets, kerchunk, mpmath, sympy, ujson, widgetsnbextension\n[pyodide]: Loaded fsspec, ipyfilite, ipywidgets, jupyterlab_widgets, kerchunk, mpmath, sympy, ujson, widgetsnbextension\n[pyodide]: Memory usage has grown to 205.2MiB (from 171.0MiB) for this notebook\n", + "output_type": "stream" + } + ], + "execution_count": 3 + }, + { + "cell_type": "code", + "source": "fcbench.codecs.preload()\n\ndask.config.set(array__chunk_size=\"4MiB\");", + "metadata": { + "trusted": true + }, + "outputs": [ + { + "name": "stdout", + "text": "Loaded the fcbench.codecs.Asinh codec ...\nLoaded the fcbench.codecs.BitRound codec ...\nLoaded the fcbench.codecs.FixedOffsetScale codec ...\nLoaded the fcbench.codecs.Identity codec ...\nLoaded the fcbench.codecs.LinearQuantize codec ...\nLoaded the fcbench.codecs.Log codec ...\nLoaded the fcbench.codecs.RandomProjection codec ...\nLoaded the fcbench.codecs.Reinterpret codec ...\nLoaded the fcbench.codecs.Round codec ...\nLoaded the fcbench.codecs.SwizzleReshape codec ...\nLoaded the fcbench.codecs.Sz3 codec ...\nLoaded the fcbench.codecs.UniformNoise codec ...\nLoaded the fcbench.codecs.Zfp codec ...\nLoaded the fcbench.codecs.Zlib codec ...\nLoaded the fcbench.codecs.Zstd codec ...\n[pyodide]: Memory usage has grown to 246.3MiB (from 205.2MiB) for this notebook\n", + "output_type": "stream" + } + ], + "execution_count": 4 + }, + { + "cell_type": "code", + "source": "import shutil\nfrom urllib.parse import unquote as urlunquote\nfrom urllib.parse import urlparse\nfrom 
urllib.request import urlopen\n\ndownload_url = \"https://a3s.fi/compression.lab.climet.eu/tigge_pl_t_q_dx=2_2024_08_02.nc\"\ndownload_path = Path(\"data\") / \"02-t-q.nc\"\n\nwith urlopen(download_url) as response:\n with download_path.open(\"wb\") as file:\n shutil.copyfileobj(response, file)", + "metadata": { + "trusted": true + }, + "outputs": [], + "execution_count": 5 + }, + { + "cell_type": "code", + "source": "import numpy as np\nimport xarray as xr\n\nds = utils.open_dataset(download_path)\nds", + "metadata": { + "trusted": true + }, + "outputs": [ + { + "execution_count": 6, + "output_type": "execute_result", + "data": { + "text/plain": " Size: 263kB\nDimensions: (longitude: 180, latitude: 91, time: 1)\nCoordinates:\n * longitude (longitude) float32 720B 0.0 2.0 4.0 6.0 ... 354.0 356.0 358.0\n * latitude (latitude) float32 364B 90.0 88.0 86.0 84.0 ... -86.0 -88.0 -90.0\n * time (time) datetime64[ns] 8B 2024-08-12\nData variables:\n t (time, latitude, longitude) float64 131kB dask.array\n q (time, latitude, longitude) float64 131kB dask.array\nAttributes:\n Conventions: CF-1.6\n history: 2024-09-27 11:51:26 GMT by grib_to_netcdf-2.35.3: /opt/ecmw...\n path: data/02-t-q.nc", + "text/html": "
\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n
<xarray.Dataset> Size: 263kB\nDimensions:    (longitude: 180, latitude: 91, time: 1)\nCoordinates:\n  * longitude  (longitude) float32 720B 0.0 2.0 4.0 6.0 ... 354.0 356.0 358.0\n  * latitude   (latitude) float32 364B 90.0 88.0 86.0 84.0 ... -86.0 -88.0 -90.0\n  * time       (time) datetime64[ns] 8B 2024-08-12\nData variables:\n    t          (time, latitude, longitude) float64 131kB dask.array<chunksize=(1, 91, 180), meta=np.ndarray>\n    q          (time, latitude, longitude) float64 131kB dask.array<chunksize=(1, 91, 180), meta=np.ndarray>\nAttributes:\n    Conventions:  CF-1.6\n    history:      2024-09-27 11:51:26 GMT by grib_to_netcdf-2.35.3: /opt/ecmw...\n    path:         data/02-t-q.nc
" + }, + "metadata": {} + } + ], + "execution_count": 6 + }, + { + "cell_type": "code", + "source": "da = ds[\"q\"]", + "metadata": { + "trusted": true + }, + "outputs": [], + "execution_count": 7 + }, + { + "cell_type": "code", + "source": "def goodness(da, da_decompressed) -> float:\n # compute the PSNR for now, choose a better goodness estimator later\n da_min, da_max = float(da.min()), float(da.max())\n\n da_error = da_decompressed - da\n da_mse = float((da_error*da_error).mean())\n\n return np.log10(da_max - da_min) * 20.0 - np.log10(da_mse) * 10.0", + "metadata": { + "trusted": true + }, + "outputs": [], + "execution_count": 8 + }, + { + "cell_type": "code", + "source": "def compression_ratio(stats):\n return round(stats[0].decoded_bytes / stats[-1].encoded_bytes, 2)", + "metadata": { + "trusted": true + }, + "outputs": [], + "execution_count": 9 + }, + { + "cell_type": "code", + "source": "from matplotlib import pyplot as plt", + "metadata": { + "trusted": true + }, + "outputs": [], + "execution_count": 10 + }, + { + "cell_type": "code", + "source": "import tqdm", + "metadata": { + "trusted": true + }, + "outputs": [], + "execution_count": 11 + }, + { + "cell_type": "code", + "source": "bitround_ratio_goodness = []\nfor keepbits in tqdm.tqdm(range(0, 24)):\n compressor = [fcbench.codecs.BitRound(keepbits=keepbits), fcbench.codecs.Zlib(level=9)]\n da_decompressed, stats = fcbench.compressor.compute_dataarray_compress_decompress(\n da, compressor\n )\n bitround_ratio_goodness.append((compression_ratio(stats), goodness(da, da_decompressed)))\n del compressor\nbitround_ratio_goodness = np.array(bitround_ratio_goodness)", + "metadata": { + "trusted": true + }, + "outputs": [ + { + "name": "stderr", + "text": ":2: TqdmMonitorWarning: tqdm:disabling monitor support (monitor_interval = 0) due to:\ncan't start new thread\n for keepbits in tqdm.tqdm(range(0, 24)):\n100%|██████████| 24/24 [00:01<00:00, 13.31it/s]\n", + "output_type": "stream" + } + ], + 
"execution_count": 12 + }, + { + "cell_type": "code", + "source": "linquant_ratio_goodness = []\nfor bits in tqdm.tqdm(range(1, 24)):\n compressor = [fcbench.codecs.LinearQuantize(bits=bits, dtype=str(da.dtype)), fcbench.codecs.Zlib(level=9)]\n da_decompressed, stats = fcbench.compressor.compute_dataarray_compress_decompress(\n da, compressor\n )\n linquant_ratio_goodness.append((compression_ratio(stats), goodness(da, da_decompressed)))\n del compressor\nlinquant_ratio_goodness = np.array(linquant_ratio_goodness)", + "metadata": { + "trusted": true + }, + "outputs": [ + { + "name": "stderr", + "text": "100%|██████████| 23/23 [00:01<00:00, 21.46it/s]\n", + "output_type": "stream" + } + ], + "execution_count": 13 + }, + { + "cell_type": "code", + "source": "zfp_ratio_goodness = []\nfor rel in tqdm.tqdm([10**(-x/4) for x in range(20)]):\n compressor = [fcbench.codecs.Asinh(linear_width=1.0), fcbench.codecs.Zfp(mode=\"fixed-accuracy\", tolerance=rel)]\n da_decompressed, stats = fcbench.compressor.compute_dataarray_compress_decompress(\n da, compressor\n )\n zfp_ratio_goodness.append((compression_ratio(stats), goodness(da, da_decompressed)))\n del compressor\nzfp_ratio_goodness = np.array(zfp_ratio_goodness)", + "metadata": { + "trusted": true + }, + "outputs": [ + { + "name": "stderr", + "text": "100%|██████████| 20/20 [00:00<00:00, 26.32it/s]\n", + "output_type": "stream" + } + ], + "execution_count": 14 + }, + { + "cell_type": "code", + "source": "sz3_ratio_goodness = []\nfor rel in tqdm.tqdm([10**(-x/4) for x in range(20)]):\n compressor = [fcbench.codecs.Sz3(eb_mode=\"rel\", eb_rel=rel)]\n da_decompressed, stats = fcbench.compressor.compute_dataarray_compress_decompress(\n da, compressor\n )\n sz3_ratio_goodness.append((compression_ratio(stats), goodness(da, da_decompressed)))\n del compressor\nsz3_ratio_goodness = np.array(sz3_ratio_goodness)", + "metadata": { + "trusted": true + }, + "outputs": [ + { + "name": "stderr", + "text": "100%|██████████| 20/20 
[00:00<00:00, 29.59it/s]\n", + "output_type": "stream" + } + ], + "execution_count": 15 + }, + { + "cell_type": "code", + "source": "fig, ax = plt.subplots()\n\nax.set_title(\"Which compressor performs better for different quality requirements?\")\nax.set_xlabel(\"peak signal-to-noise ratio\\nhigher means better quality\")\nax.set_ylabel(\"compression ratio\")\nax.set_yscale(\"log\")\n\nax.plot(bitround_ratio_goodness[:,1], bitround_ratio_goodness[:,0])\nax.scatter(bitround_ratio_goodness[:,1], bitround_ratio_goodness[:,0], label=\"Bit Rounding\")\n\nax.plot(linquant_ratio_goodness[:,1], linquant_ratio_goodness[:,0])\nax.scatter(linquant_ratio_goodness[:,1], linquant_ratio_goodness[:,0], label=\"Linear Quantize\")\n\nax.plot(zfp_ratio_goodness[:,1], zfp_ratio_goodness[:,0])\nax.scatter(zfp_ratio_goodness[:,1], zfp_ratio_goodness[:,0], label=\"ZFP\")\n\nax.plot(sz3_ratio_goodness[:,1], sz3_ratio_goodness[:,0])\nax.scatter(sz3_ratio_goodness[:,1], sz3_ratio_goodness[:,0], label=\"SZ3\")\n\nax.legend()\n\nplt.show()", + "metadata": { + "trusted": true + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": "
", + "image/png": "" + }, + "metadata": {} + } + ], + "execution_count": 16 + }, + { + "cell_type": "code", + "source": "", + "metadata": { + "trusted": true + }, + "outputs": [], + "execution_count": null + } + ] +} \ No newline at end of file From 5527d792c886f78454d2036b65acf9cc9d8bfca3 Mon Sep 17 00:00:00 2001 From: Juniper Tyree <50025784+juntyr@users.noreply.github.com> Date: Fri, 1 Nov 2024 09:56:52 +0200 Subject: [PATCH 2/3] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index d6d8bd4..8c9ad84 100644 --- a/README.md +++ b/README.md @@ -48,6 +48,7 @@ The following is an overview of all notebooks: - [`04-ecmwfapi.ipynb`](02-data-sources/04-ecmwfapi.ipynb): download small datasets from the ECMWF Archive using the `ecmwfapi` - [`03-examples/`](03-examples/README.md): Longer walkthrough examples that apply and evaluate data compression on different variables - [`01-compressors.ipynb`](03-examples/01-compressors.ipynb): comparison of different compressors on a small temperature and specific humidity dataset + - [`02-quality.ipynb`](03-examples/02-quality.ipynb): quantitative evaluation of different compressors and their settings across different variables ## Getting Help and Contributing From 33dbe26c26233adff3e7de57a14d929454f49a59 Mon Sep 17 00:00:00 2001 From: Juniper Tyree <50025784+juntyr@users.noreply.github.com> Date: Fri, 1 Nov 2024 09:57:15 +0200 Subject: [PATCH 3/3] Update README.md --- 03-examples/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/03-examples/README.md b/03-examples/README.md index 122009c..5b0fd3d 100644 --- a/03-examples/README.md +++ b/03-examples/README.md @@ -6,6 +6,7 @@ The **Online Laboratory for Data Compression** allows you to explore various dat ## Overview of the provided notebooks - [`01-compressors.ipynb`](01-compressors.ipynb): Comparison of different compressors on a small temperature and specific humidity dataset +- [`02-quality.ipynb`](02-quality.ipynb): 
Quantitative evaluation of different compressors and their settings across different variables ## License