modernize the pre-commit hooks #784

Open · wants to merge 13 commits into base: main
1 change: 0 additions & 1 deletion .github/dependabot.yml
@@ -1,4 +1,3 @@
-
 version: 2
 updates:
   - package-ecosystem: pip
12 changes: 6 additions & 6 deletions .github/workflows/main.yaml
@@ -2,15 +2,15 @@ name: Tests

 on:
   push:
-    branches: [ "main" ]
+    branches: ["main"]
     paths-ignore:
-      - 'docs/**'
+      - "docs/**"
   pull_request:
-    branches: [ "main" ]
+    branches: ["main"]
     paths-ignore:
-      - 'docs/**'
+      - "docs/**"
   schedule:
-    - cron: '0 4 * * *' # run once a day at 4 AM
+    - cron: "0 4 * * *" # run once a day at 4 AM

 env:
   PYTEST_ADDOPTS: "--color=yes"
@@ -21,7 +21,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.10", "3.11","3.12"]
+        python-version: ["3.10", "3.11", "3.12"]
     steps:
       - uses: actions/checkout@v4
         with:
3 changes: 1 addition & 2 deletions .github/workflows/release.yaml
@@ -1,4 +1,3 @@
-
 name: Release Python Package

 on:
@@ -13,7 +12,7 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v5
         with:
-          python-version: '3.x'
+          python-version: "3.x"
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
20 changes: 9 additions & 11 deletions .github/workflows/test-integration.yaml
@@ -2,10 +2,10 @@ name: Integration tests

 on:
   push:
-    branches: [ "main" ]
+    branches: ["main"]
   pull_request:
-    branches: [ "main" ]
-    types: [ opened, reopened, synchronize, labeled ]
+    branches: ["main"]
+    types: [opened, reopened, synchronize, labeled]

 env:
   PYTEST_ADDOPTS: "--color=yes"
@@ -29,12 +29,11 @@ jobs:
         # https://github.com/pangeo-forge/pangeo-forge-recipes/issues/540#issuecomment-1685096271
         # Once https://github.com/pangeo-forge/pangeo-forge-runner/pull/90 goes in, we can add back
         # integration testing for 3.10 and 3.11 (for runner versions that follow that PR).
-        python-version: ["3.9"] # , "3.10", "3.11"]
-        runner-version: [
-            "pangeo-forge-runner==0.9.1",
-            "pangeo-forge-runner==0.9.2",
-            "pangeo-forge-runner==0.9.3",
-          ]
+        python-version: ["3.9"] # , "3.10", "3.11"]
+        runner-version:
+          - "pangeo-forge-runner==0.9.1"
+          - "pangeo-forge-runner==0.9.2"
+          - "pangeo-forge-runner==0.9.3"
     steps:
       - uses: actions/checkout@v4
       - name: 🔁 Setup Python
@@ -45,7 +44,6 @@
           cache: pip
           cache-dependency-path: pyproject.toml

-
       - name: Install pangeo-forge recipes and runner
         shell: bash -l {0}
         run: |
@@ -57,7 +55,7 @@
         run: |
           python -m pip install ecmwflibs eccodes cfgrib

-      - name: 'Setup minio'
+      - name: "Setup minio"
         run: |
           wget --quiet https://dl.min.io/server/minio/release/linux-amd64/minio
           chmod +x minio
3 changes: 3 additions & 0 deletions .gitignore
@@ -136,3 +136,6 @@ dask-worker-space

 # vscode
 .vscode/
+
+# linter caches
+.prettier_cache
69 changes: 37 additions & 32 deletions .pre-commit-config.yaml
@@ -1,47 +1,52 @@
+ci:
+  autofix_prs: true
+  autoupdate_schedule: "monthly"
+
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.6.0
+    rev: v5.0.0
     hooks:
       - id: trailing-whitespace
       - id: end-of-file-fixer
       - id: check-docstring-first
-      - id: check-json
-      - id: check-yaml
-      - id: pretty-format-json
-        args: ["--autofix", "--indent=2", "--no-sort-keys"]
-        exclude: "docs/"

-  - repo: https://github.com/psf/black
-    rev: 24.4.2
+  - repo: https://github.com/psf/black-pre-commit-mirror
+    rev: 24.10.0
     hooks:
       - id: black
-        args: ["--line-length", "100"]

-  - repo: https://github.com/PyCQA/flake8
-    rev: 7.1.0
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.8.0
     hooks:
-      - id: flake8
-        exclude: pangeo_forge_recipes/recipes
+      - id: ruff
+        args: ["--fix", "--show-fixes"]

-  - repo: https://github.com/asottile/seed-isort-config
-    rev: v2.2.0
+  - repo: https://github.com/pre-commit/mirrors-mypy
+    rev: "v1.10.0"
     hooks:
-      - id: seed-isort-config
+      - id: mypy
+        exclude: tests,pangeo_forge_recipes/recipes

-  - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: 'v1.10.0'
+  - repo: https://github.com/rstcheck/rstcheck
+    rev: v6.2.4
     hooks:
-      - id: mypy
-        exclude: tests,pangeo_forge_recipes/recipes
+      - id: rstcheck

-  - repo: https://github.com/pycqa/isort
-    rev: 5.13.2
+  - repo: https://github.com/rbubley/mirrors-prettier
+    rev: v3.3.3
     hooks:
-      - id: isort
-        args: ["--profile", "black"]
+      - id: prettier
+        args: ["--cache-location=.prettier_cache/cache"]

-  - repo: https://github.com/rstcheck/rstcheck
-    rev: v6.2.0
+  - repo: https://github.com/ComPWA/taplo-pre-commit
+    rev: v0.9.3
     hooks:
-      - id: rstcheck
+      - id: taplo-format
+        args: ["--option", "array_auto_collapse=false"]
+      - id: taplo-lint
+        args: ["--no-schema"]
+
+  - repo: https://github.com/abravalheri/validate-pyproject
+    rev: v0.23
+    hooks:
+      - id: validate-pyproject
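
Taken together, this file's changes swap the flake8, isort, and seed-isort-config hooks for ruff, move black to its mirror repository, and delegate YAML/JSON/Markdown formatting, TOML formatting and linting, and `pyproject.toml` validation to prettier, taplo, and validate-pyproject respectively; that is also why the `check-json`, `check-yaml`, and `pretty-format-json` hooks are dropped, and why so many docs files below show prettier-style reformatting. With this config in place, the whole suite can presumably be run locally via `pre-commit install` followed by `pre-commit run --all-files`.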
2 changes: 1 addition & 1 deletion README.md
@@ -7,7 +7,7 @@
 [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
 [![NSF Award 2026932](https://img.shields.io/badge/NSF-2026932-blue)](https://www.nsf.gov/awardsearch/showAward?AWD_ID=2026932&HistoricalAwards=false)

-__pangeo-forge__ is an open-source tool designed to aid the extraction, transformation, and loading of datasets. The goal of pangeo-forge is to make it easy to extract datasets from traditional data repositories and deposit them into cloud object storage in analysis-ready, cloud-optimized format.
+**pangeo-forge** is an open-source tool designed to aid the extraction, transformation, and loading of datasets. The goal of pangeo-forge is to make it easy to extract datasets from traditional data repositories and deposit them into cloud object storage in analysis-ready, cloud-optimized format.

 pangeo-forge is inspired by [conda-forge](https://conda-forge.org/), a community-led collection of recipes for building [Conda](https://docs.conda.io/en/latest/) packages. We hope that pangeo-forge can play the same role for datasets.
1 change: 1 addition & 0 deletions docs/advanced/globus.md
@@ -12,6 +12,7 @@ following workaround.

 To ingest files from Globus with Pangeo Forge, you must create a
 _fully public Guest Collection and access the files via HTTPS_.
 The specific steps are as follows:
+
 - Verify that the Globus endpoint you want to transfer from uses
   **Globus Connect Server V5**. This method _will not work_ with earlier Globus
   versions (e.g. V4).
4 changes: 0 additions & 4 deletions docs/api_reference.md
@@ -2,7 +2,6 @@

 ## File Patterns

-
 ```{eval-rst}
 .. autoclass:: pangeo_forge_recipes.patterns.FilePattern
    :members:
@@ -16,7 +15,6 @@
    :members:
 ```

-
 ```{eval-rst}
 .. autoclass:: pangeo_forge_recipes.patterns.MergeDim
    :members:
@@ -49,7 +47,6 @@
    :members:
 ```

-
 ## Processing Functions

 The [Beam PTransform Style Guide](https://beam.apache.org/contribute/ptransform-style-guide/) recommends:
@@ -81,7 +78,6 @@

 ## Combiners

-
 ```{eval-rst}
 .. automodule:: pangeo_forge_recipes.combiners
    :members:
1 change: 1 addition & 0 deletions docs/composition/examples/gpcp-from-gcs.md
@@ -5,4 +5,5 @@
 ```

 ```{literalinclude} ../../../examples/feedstock/gpcp_from_gcs.py
+
 ```
2 changes: 1 addition & 1 deletion docs/composition/examples/gpcp-rechunk.md
@@ -1,5 +1,5 @@
 # GPCP Rechunk

-
 ```{literalinclude} ../../../examples/feedstock/gpcp_rechunk.py
+
 ```
1 change: 1 addition & 0 deletions docs/composition/examples/noaa-oisst.md
@@ -5,4 +5,5 @@
 ```

 ```{literalinclude} ../../../examples/feedstock/noaa_oisst.py
+
 ```
17 changes: 9 additions & 8 deletions docs/composition/file_patterns.md
@@ -18,8 +18,8 @@ inputs (or "ingredients") upon which the recipe will act. File patterns describe

 - Where individual source files are located; and
 - How they should be organized logically as part of an aggregate dataset.
-  (In this respect, file patterns are conceptually similar to
-  [NcML](https://docs.unidata.ucar.edu/netcdf-java/current/userguide/ncml_aggregation.html) documents.)
+  (In this respect, file patterns are conceptually similar to
+  [NcML](https://docs.unidata.ucar.edu/netcdf-java/current/userguide/ncml_aggregation.html) documents.)

 ```{note}
 API Reference is available here: {class}`pangeo_forge_recipes.patterns.FilePattern`
@@ -65,6 +65,7 @@ http://data-provider.org/data/humidity/humidity_10.txt
 ```

 This is a relatively common way to organize data files:
+
 - There are two different "variables" (temperature and humidity), stored in separate files.
 - There is a sequence of 10 files for each variable. We will assume that this
   represents the "time" axis of the data.
@@ -124,6 +125,7 @@ These are the same as the names used in our [](#combine-dimensions).

 We now need to define the "combine dimensions" of the file pattern.
 Combine dimensions are one of two types:
+
 - {class}`pangeo_forge_recipes.patterns.ConcatDim`: The files should be combined by
   concatenating _the same variables_ sequentially along an axis.
   This is conceptually similar to Xarray's [concat](http://xarray.pydata.org/en/stable/combining.html#concatenate)
@@ -134,14 +136,14 @@ Combine dimensions are one of two types:
   operation.

 File patterns permit us to combine multiple combine dims into a single pattern.
-For the present example, we have one ``MergeDim``:
+For the present example, we have one `MergeDim`:

 ```{code-cell} ipython3
 from pangeo_forge_recipes.patterns import MergeDim
 variable_merge_dim = MergeDim("variable", ["temperature", "humidity"])
 ```

-...and one ``ConcatDim``:
+...and one `ConcatDim`:

 ```{code-cell} ipython3
 from pangeo_forge_recipes.patterns import ConcatDim
@@ -177,7 +179,7 @@ pattern

 To see the full code in one place, please refer back to [](#sneak-peek-the-full-code).

-### Create a `FilePattern` from a list of files
+### Create a `FilePattern` from a list of files

 Alternatively, we can also use the convenience function {meth}`pangeo_forge_recipes.patterns.pattern_from_file_sequence` to create a file pattern from a list of files.

@@ -193,14 +195,13 @@ file_list = [
 pattern = pattern_from_file_sequence(file_list, concat_dim="time")
 ```

-
 ## Inspect a `FilePattern`

 We can inspect file patterns manually to understand how they work. This is not necessary
 to create a recipe; however, digging into a `FilePattern`'s internals may be helpful in
 debugging a complex recipe. Internally, the file pattern maps the keys of the
 [](#combine-dimensions) to logical indices. We can see all of these keys by iterating over
-the pattern using the ``items()`` method:
+the pattern using the `items()` method:

 ```{code-cell} ipython3
 for index, fname in pattern.items():
@@ -227,7 +228,7 @@
 The data Apache Beam transforms operate on are
 [`PCollections`](https://beam.apache.org/documentation/programming-guide/#pcollections).
 Therefore, to bring the contents of a `FilePattern` into a recipe, we pass the index:url
-pairs generated by the file pattern's ``items()`` method into Beam's `Create` constructor
+pairs generated by the file pattern's `items()` method into Beam's `Create` constructor
 as follows:
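
For orientation, the pieces this docs page assembles (format function, combine dimensions, `items()`, and the hand-off to Beam) compose as follows. This is an illustrative sketch using the hypothetical `data-provider.org` layout from the page, not code changed by this PR; the final code cell, truncated in this diff view, presumably reduces to the `beam.Create` call at the end of the sketch.

```python
import apache_beam as beam

from pangeo_forge_recipes.patterns import ConcatDim, FilePattern, MergeDim


def make_url(variable, time):
    # URL layout of the hypothetical provider in the docs: one file per
    # variable ("temperature", "humidity") per time step (1..10).
    return f"http://data-provider.org/data/{variable}/{variable}_{time}.txt"


variable_merge_dim = MergeDim("variable", ["temperature", "humidity"])
time_concat_dim = ConcatDim("time", list(range(1, 11)))

# The format function's argument names must match the combine dim names.
pattern = FilePattern(make_url, variable_merge_dim, time_concat_dim)

# Inspect the mapping of logical indices to URLs...
for index, url in pattern.items():
    print(index, url)

# ...and hand the index:url pairs to Beam as the root of a recipe pipeline.
recipe = beam.Create(pattern.items())
```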
20 changes: 8 additions & 12 deletions docs/composition/styles.md
@@ -21,20 +21,17 @@ Below we give a very basic overview of how this recipe is used.

 First you must define a {doc}`file pattern <file_patterns>`.
 Once you have a {class}`FilePattern <pangeo_forge_recipes.patterns.FilePattern>` object,
 the recipe pipeline will contain at a minimum the following transforms applied to the file pattern collection:
-* {class}`pangeo_forge_recipes.transforms.OpenURLWithFSSpec`: retrieves each pattern file using the specified URLs.
-* {class}`pangeo_forge_recipes.transforms.OpenWithXarray`: loads each pattern file into an [`xarray.Dataset`](https://docs.xarray.dev/en/stable/generated/xarray.Dataset.html).
-* {class}`pangeo_forge_recipes.transforms.StoreToZarr`: generates a Zarr store by combining the datasets.
-* {class}`pangeo_forge_recipes.transforms.ConsolidateDimensionCoordinates`: consolidates the dimension coordinates for dataset read performance.
-* {class}`pangeo_forge_recipes.transforms.ConsolidateMetadata`: calls Zarr's convenience function to consolidate metadata.
-
-### Open existing Zarr Store
-* {class}`pangeo_forge_recipes.transforms.OpenWithXarray` supports opening existing Zarr stores. This might be useful for rechunking a Zarr store into an alternative chunking scheme.
-An example of this recipe can be found in {doc}`examples/gpcp-rechunk`.
-
-
+
+- {class}`pangeo_forge_recipes.transforms.OpenURLWithFSSpec`: retrieves each pattern file using the specified URLs.
+- {class}`pangeo_forge_recipes.transforms.OpenWithXarray`: loads each pattern file into an [`xarray.Dataset`](https://docs.xarray.dev/en/stable/generated/xarray.Dataset.html).
+- {class}`pangeo_forge_recipes.transforms.StoreToZarr`: generates a Zarr store by combining the datasets.
+- {class}`pangeo_forge_recipes.transforms.ConsolidateDimensionCoordinates`: consolidates the dimension coordinates for dataset read performance.
+- {class}`pangeo_forge_recipes.transforms.ConsolidateMetadata`: calls Zarr's convenience function to consolidate metadata.
+
+### Open existing Zarr Store
+
+- {class}`pangeo_forge_recipes.transforms.OpenWithXarray` supports opening existing Zarr stores. This might be useful for rechunking a Zarr store into an alternative chunking scheme.
+  An example of this recipe can be found in {doc}`examples/gpcp-rechunk`.

 ```{tip}
 If using the {class}`pangeo_forge_recipes.transforms.ConsolidateDimensionCoordinates` transform, make sure to chain on the {class}`pangeo_forge_recipes.transforms.ConsolidateMetadata` transform to your recipe.
@@ -56,7 +53,6 @@
 append.
 ```

-
 ## Open with Kerchunk, write to virtual Zarr

 The standard Zarr recipe creates a copy of the original dataset in the Zarr format, this
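
For context on how the transforms listed in this file chain together, here is a minimal end-to-end sketch. It is illustrative rather than part of this PR: the file URLs, `target_root`, and `store_name` are assumptions made for the example.

```python
import apache_beam as beam

from pangeo_forge_recipes.patterns import pattern_from_file_sequence
from pangeo_forge_recipes.transforms import (
    ConsolidateDimensionCoordinates,
    ConsolidateMetadata,
    OpenURLWithFSSpec,
    OpenWithXarray,
    StoreToZarr,
)

# Illustrative inputs; a real recipe would point at its actual source files.
pattern = pattern_from_file_sequence(
    [
        "https://data-provider.org/data_01.nc",
        "https://data-provider.org/data_02.nc",
    ],
    concat_dim="time",
)

with beam.Pipeline() as p:
    (
        p
        | beam.Create(pattern.items())
        | OpenURLWithFSSpec()  # retrieve each URL
        | OpenWithXarray()  # load each file as an xarray.Dataset
        | StoreToZarr(
            target_root="output",  # assumed local target for this sketch
            store_name="example.zarr",
            combine_dims=pattern.combine_dim_keys,
        )
        # Per the tip above, chain ConsolidateMetadata after
        # ConsolidateDimensionCoordinates.
        | ConsolidateDimensionCoordinates()
        | ConsolidateMetadata()
    )
```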