Add pre-commit #83

Merged
merged 1 commit on Oct 10, 2023
14 changes: 14 additions & 0 deletions .github/workflows/ci.yaml
@@ -21,3 +21,17 @@ jobs:
run: pip install -r requirements.txt
- name: Validate collections
run: python scripts/validate_collections.py
lint:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Setup Python
uses: actions/setup-python@v4
with:
python-version: "3.11"
cache: "pip"
- name: Install dependencies
run: pip install -r requirements.txt
- name: Run pre-commit hooks
run: pre-commit run --all-files
5 changes: 5 additions & 0 deletions .markdownlint-cli2.jsonc
@@ -0,0 +1,5 @@
{
"config": {
"MD013": false // disable line length checks
}
}
22 changes: 22 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,22 @@
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.5.0
hooks:
- id: check-yaml
- id: file-contents-sorter
files: ^requirements.in$
- id: trailing-whitespace
exclude: ^ingestion-data/.*$
- repo: https://github.com/DavidAnson/markdownlint-cli2
rev: v0.10.0
hooks:
- id: markdownlint-cli2
- repo: https://github.com/psf/black-pre-commit-mirror
rev: 23.9.1
hooks:
- id: black
- id: black-jupyter
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.0.292
hooks:
- id: ruff
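
With this configuration in place, the hooks can also be driven manually from the command line. These are standard `pre-commit` commands, not specific to this PR:

```shell
# install the git hook so the checks run on every commit
pre-commit install

# run a single hook (e.g. black) against the whole repository
pre-commit run black --all-files

# bump the hook revisions pinned in .pre-commit-config.yaml to their latest tags
pre-commit autoupdate
```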
29 changes: 19 additions & 10 deletions README.md
@@ -1,26 +1,26 @@
# veda-data

[![GitHub Workflow Status (with event)](https://img.shields.io/github/actions/workflow/status/nasa-impact/veda-data/ci.yaml?style=for-the-badge&label=CI)](https://github.com/NASA-IMPACT/veda-data/actions/workflows/ci.yaml)

This repository houses data used to define a VEDA dataset to load into the [VEDA catalog](https://nasa-impact.github.io/veda-docs/services/apis.html). Inclusion in the VEDA catalog is a prerequisite for displaying the dataset in the [VEDA Dashboard](https://www.earthdata.nasa.gov/dashboard/).

The data provided here gets processed in the ingestion system [veda-data-airflow](https://github.com/NASA-IMPACT/veda-data-airflow), to which this repository is directly linked (as a Git submodule).


## Dataset Submission Process

The VEDA user docs explain the full [dataset submission process](https://nasa-impact.github.io/veda-docs/contributing/dataset-ingestion/).

Ultimately, submission to the VEDA catalog requires that you [open an issue with the "new dataset" template](https://github.com/NASA-IMPACT/veda-data/issues/new?assignees=&labels=dataset&projects=&template=new-dataset.yaml&title=New+Dataset%3A+%3Cdataset+title%3E). This template will require, at minimum:

1. a description of the dataset
2. the location of the data (in S3, CMR, etc.), and
3. a point of contact for the VEDA team to collaborate with.

One or more notebooks showing how the data should be processed would be appreciated.


## Ingestion Data Structure

When submitting STAC records to ingest, a pull request can be made with the data structured as described below.

### `collections/`
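
The body of this section is collapsed in the diff view. For orientation only, here is a hypothetical, minimal sketch of the kind of STAC Collection record such a directory holds (field values are placeholders, not taken from this repository):

```json
{
  "id": "<collection-id>",
  "type": "Collection",
  "stac_version": "1.0.0",
  "description": "<short description of the dataset>",
  "license": "<license-id>",
  "extent": {
    "spatial": { "bbox": [[-180, -90, 180, 90]] },
    "temporal": { "interval": [["<start-date>", "<end-date>"]] }
  },
  "links": []
}
```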

@@ -92,13 +92,13 @@ Should follow the following format:
"bucket": "<s3-bucket>",
"filename_regex": "<filename-regex>",
"datetime_range": "<month/day/year>",

## for cmr discovery
"version": "<collection-version>",
"temporal": ["<start-date>", "<end-date>"],
"bounding_box": ["<bounding-box-as-comma-separated-LBRT>"],
"include": "<filename-pattern>",

### misc
"cogify": "<true/false>",
"upload": "<true/false>",
@@ -123,13 +123,22 @@ python scripts/validate_collections.py

## Development

If you need to add new dependencies, first install the requirements:
We use [pre-commit](https://pre-commit.com/) hooks to keep our notebooks and Python scripts consistently formatted.
To contribute, first install the requirements, then install the **pre-commit** hooks:

```shell
pip install -r requirements.txt
pip install -r requirements.txt # recommend a virtual environment
pre-commit install
```

The hooks will run automatically on any changed files when you commit.
To run the hooks on the entire repository (which is what happens in CI):

```shell
pre-commit run --all-files
```

Add your dependency to `requirements.in` *without a version specifier* (unless you really need one).
If you need to add a Python dependency, add it to `requirements.in`:
Then run:

```shell
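# The actual command is collapsed in this diff view; given the pip-compile
# header in requirements.txt and pip-tools in requirements.in, it is most
# likely just:
pip-compile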
```
3 changes: 3 additions & 0 deletions requirements.in
@@ -1,2 +1,5 @@
black[jupyter]
pip-tools
pre-commit
pystac[validation]
ruff
80 changes: 77 additions & 3 deletions requirements.txt
@@ -4,18 +4,74 @@
#
# pip-compile
#
appnope==0.1.3
# via ipython
asttokens==2.4.0
# via stack-data
attrs==23.1.0
# via jsonschema
backcall==0.2.0
# via ipython
black[jupyter]==23.9.1
# via -r requirements.in
build==1.0.3
# via pip-tools
cfgv==3.4.0
# via pre-commit
click==8.1.7
# via pip-tools
# via
# black
# pip-tools
decorator==5.1.1
# via ipython
distlib==0.3.7
# via virtualenv
executing==2.0.0
# via stack-data
filelock==3.12.4
# via virtualenv
identify==2.5.30
# via pre-commit
ipython==8.16.1
# via black
jedi==0.19.1
# via ipython
jsonschema==4.17.3
# via pystac
matplotlib-inline==0.1.6
# via ipython
mypy-extensions==1.0.0
# via black
nodeenv==1.8.0
# via pre-commit
packaging==23.1
# via build
# via
# black
# build
parso==0.8.3
# via jedi
pathspec==0.11.2
# via black
pexpect==4.8.0
# via ipython
pickleshare==0.7.5
# via ipython
pip-tools==7.3.0
# via -r requirements.in
platformdirs==3.11.0
# via
# black
# virtualenv
pre-commit==3.4.0
# via -r requirements.in
prompt-toolkit==3.0.39
# via ipython
ptyprocess==0.7.0
# via pexpect
pure-eval==0.2.2
# via stack-data
pygments==2.16.1
# via ipython
pyproject-hooks==1.0.0
# via build
pyrsistent==0.19.3
@@ -24,8 +80,26 @@ pystac[validation]==1.8.3
# via -r requirements.in
python-dateutil==2.8.2
# via pystac
pyyaml==6.0.1
# via pre-commit
ruff==0.0.292
# via -r requirements.in
six==1.16.0
# via python-dateutil
# via
# asttokens
# python-dateutil
stack-data==0.6.3
# via ipython
tokenize-rt==5.2.0
# via black
traitlets==5.11.2
# via
# ipython
# matplotlib-inline
virtualenv==20.24.5
# via pre-commit
wcwidth==0.2.8
# via prompt-toolkit
wheel==0.41.2
# via pip-tools

Changes to a Jupyter notebook (filename not shown in this capture)
@@ -34,8 +34,8 @@
"from kerchunk.hdf import SingleHdf5ToZarr\n",
"from typing import Dict\n",
"\n",
"# Specify the CMIP model and variable to use. \n",
"# Here we are using near-surface air temperature from the GISS-E2-1-G GCM \n",
"# Specify the CMIP model and variable to use.\n",
"# Here we are using near-surface air temperature from the GISS-E2-1-G GCM\n",
"model = \"GISS-E2-1-G\"\n",
"variable = \"tas\"\n",
"# If this code were re-used for a protected bucket, anon should be False.\n",
@@ -126,18 +126,20 @@
"source": [
"so = dict(mode=\"rb\", anon=anon, default_fill_cache=False, default_cache_type=\"first\")\n",
"\n",
"\n",
"# Use Kerchunk's `SingleHdf5ToZarr` method to create a `Kerchunk` index from a NetCDF file.\n",
"def generate_json_reference(u):\n",
" with fs_read.open(u, **so) as infile:\n",
" fname = u.split(\"/\")[-1].strip(\".nc\") \n",
" fname = u.split(\"/\")[-1].strip(\".nc\")\n",
" h5chunks = SingleHdf5ToZarr(infile, u, inline_threshold=300)\n",
" return fname, ujson.dumps(h5chunks.translate()).encode()\n",
" \n",
"\n",
"\n",
"def write_json(fname, reference_json, temp_dir):\n",
" outf = os.path.join(temp_dir, f\"{fname}.json\")\n",
" with open(outf, \"wb\") as f:\n",
" f.write(reference_json)\n",
" return outf "
" return outf"
]
},
{
@@ -297,7 +299,10 @@
"bag = db.from_sequence(all_files, partition_size=1)\n",
"result = db.map(generate_json_reference, bag)\n",
"all_references = result.compute()\n",
"output_files = [write_json(fname, reference_json, temp_dir) for fname, reference_json in all_references]"
"output_files = [\n",
" write_json(fname, reference_json, temp_dir)\n",
" for fname, reference_json in all_references\n",
"]"
]
},
{
@@ -331,11 +336,11 @@
"%%time\n",
"mzz = MultiZarrToZarr(\n",
" output_files,\n",
" remote_protocol='s3',\n",
" remote_options={'anon': anon},\n",
" concat_dims=['time'],\n",
" remote_protocol=\"s3\",\n",
" remote_options={\"anon\": anon},\n",
" concat_dims=[\"time\"],\n",
" coo_map={\"time\": \"cf:time\"},\n",
" inline_threshold=0\n",
" inline_threshold=0,\n",
")\n",
"multi_kerchunk = mzz.translate()"
]
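
Not shown in this diff, but for context: a combined Kerchunk reference such as `multi_kerchunk` is typically opened through fsspec's `reference` filesystem and read lazily with xarray. A minimal sketch, assuming anonymous S3 access as configured above:

```python
import fsspec
import xarray as xr

# Build a "reference" filesystem backed by the in-memory Kerchunk index
# (`multi_kerchunk` comes from the mzz.translate() call above).
fs = fsspec.filesystem(
    "reference",
    fo=multi_kerchunk,
    remote_protocol="s3",
    remote_options={"anon": True},
)

# Open the virtual Zarr store lazily with xarray.
ds = xr.open_dataset(
    fs.get_mapper(""), engine="zarr", backend_kwargs={"consolidated": False}
)
print(ds)
```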
@@ -878,9 +883,13 @@
}
],
"source": [
"s3 = boto3.client('s3')\n",
"upload_bucket_name = 'veda-data-store-staging'\n",
"response = s3.upload_file(output_location, upload_bucket_name, f'cmip6-{model}-{variable}-kerchunk/{output_fname}')\n",
"s3 = boto3.client(\"s3\")\n",
"upload_bucket_name = \"veda-data-store-staging\"\n",
"response = s3.upload_file(\n",
" output_location,\n",
" upload_bucket_name,\n",
" f\"cmip6-{model}-{variable}-kerchunk/{output_fname}\",\n",
")\n",
"# None is good.\n",
"print(f\"Response uploading {output_fname} to {upload_bucket_name} was {response}.\")"
]