
Commit

Merge pull request #5 from jbusecke/add-pre-commit
Add pre-commit config
jbusecke authored Oct 6, 2022
2 parents dc6ca0a + 904a256 commit 7bd23dd
Showing 11 changed files with 107 additions and 51 deletions.
16 changes: 16 additions & 0 deletions .flake8
@@ -0,0 +1,16 @@
[flake8]
exclude = __init__.py,.eggs,doc
ignore =
# whitespace before ':' - doesn't work well with black
E203
E402
# line too long - let black worry about that
E501
# do not assign a lambda expression, use a def
E731
# line break before binary operator
W503
E265
F811
# Allows type hinting as Gridded[DataArray, "(X:center)"], where we did `from typing import Annotated as Gridded`
F722
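
Note on the F722 entry above: flake8 parses the string metadata inside `Annotated` hints as a forward reference and reports it as a syntax error, so the hinting style described in the comment only passes with F722 ignored. A minimal sketch of that pattern (the `xarray.DataArray` import and the toy function are illustrative assumptions; only the hint style itself comes from the comment above):

```python
from typing import Annotated as Gridded

from xarray import DataArray


# Without F722 in the flake8 ignore list, the string "(X:center)" below
# would be flagged as a syntax error in a forward annotation.
def recenter(da: Gridded[DataArray, "(X:center)"]) -> DataArray:
    """Toy function using the Annotated-based hinting style."""
    return da
```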
21 changes: 21 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,21 @@
default_language_version:
python: python3.9
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.3.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
- id: check-yaml
- repo: https://github.com/PyCQA/isort
rev: 5.10.1
hooks:
- id: isort
- repo: https://github.com/psf/black
rev: 22.8.0
hooks:
- id: black
- repo: https://github.com/PyCQA/flake8
rev: 5.0.4
hooks:
- id: flake8
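
With this configuration committed, the hooks are typically installed into git and then run once over the whole tree. A minimal sketch of doing that from Python, assuming the `pre-commit` package is installed in the active environment (the more common route is simply running the same two commands in a shell):

```python
import subprocess

# Install the git hook so the checks run on every commit,
# then run all configured hooks once against the full repository.
subprocess.run(["pre-commit", "install"], check=True)
subprocess.run(["pre-commit", "run", "--all-files"], check=True)
```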
6 changes: 3 additions & 3 deletions README.md
@@ -3,9 +3,9 @@ Using queries to the ESGF API to generate urls and keyword arguments for receipes


## Parsing a list of instance ids using wildcards
Pangeo forge recipes require the user to provide exact instance_id's for the datasets they want to be processed. Discovering these with the [web search](https://esgf-node.llnl.gov/search/cmip6/) can become cumbersome, especially when dealing with a large number of members/models etc.
Pangeo forge recipes require the user to provide exact instance_id's for the datasets they want to be processed. Discovering these with the [web search](https://esgf-node.llnl.gov/search/cmip6/) can become cumbersome, especially when dealing with a large number of members/models etc.

`pangeo-forge-esgf` provides some functions to query the ESGF API based on instance_id values with wildcards.
`pangeo-forge-esgf` provides some functions to query the ESGF API based on instance_id values with wildcards.

For example if you want to find all the zonal (`uo`) and meridonal (`vo`) velocities available for the `lgm` experiment of PMIP, you can do:

@@ -35,4 +35,4 @@ and you will get:
'CMIP6.PMIP.MPI-M.MPI-ESM1-2-LR.lgm.r1i1p1f1.Omon.vo.gn.v20190710']
```

Eventually I hope I can leverage this functionality to handle user requests in PRs that add wildcard instance_ids, but for now this might be helpful to manually construct lists of instance_ids to submit to a pangeo-forge feedstock.
Eventually I hope I can leverage this functionality to handle user requests in PRs that add wildcard instance_ids, but for now this might be helpful to manually construct lists of instance_ids to submit to a pangeo-forge feedstock.
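
The wildcard query the README refers to is collapsed in this diff view. Purely as an illustration, a call of the kind the text describes might look like the sketch below; the import path matches `pangeo_forge_esgf/parsing.py` shown further down, while the exact wildcard string is an assumption rather than the README's own example:

```python
from pangeo_forge_esgf.parsing import parse_instance_ids

# Hypothetical wildcard instance_id: '*' matches any value for a facet and
# a bracketed list such as "[uo, vo]" is expanded into ["uo", "vo"].
iid = "CMIP6.PMIP.*.*.lgm.*.Omon.[uo, vo].gn.*"
instance_ids = parse_instance_ids(iid)
print(instance_ids)
```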
4 changes: 2 additions & 2 deletions environment.yaml
@@ -1,6 +1,6 @@
name: pangeo-forge-esgf
channels:
- conda-forge
dependencies:
dependencies:
- python
- aiohttp
- aiohttp
10 changes: 6 additions & 4 deletions pangeo_forge_esgf/dynamic_kwargs.py
@@ -1,5 +1,7 @@
from typing import Dict, Union, List, Tuple
from typing import Dict, List, Tuple

import aiohttp

from .utils import facets_from_iid

# For certain table_ids it is preferrable to have time chunks that are a multiple of e.g. 1 year for monthly data.
@@ -127,7 +129,7 @@ async def response_data_processing(
print(f"Inferred timesteps per file: {timesteps}")
element_sizes = [size / n_t for size, n_t in zip(sizes, timesteps)]

### Determine kwargs
# Determine kwargs
# MAX_SUBSET_SIZE=1e9 # This is an option if the revised subsetting still runs into errors.
MAX_SUBSET_SIZE = 500e6
DESIRED_CHUNKSIZE = 200e6
@@ -146,7 +148,7 @@ async def response_data_processing(
if max(sizes) <= MAX_SUBSET_SIZE:
subset_input = 0
else:
## Determine subset_input parameters given the following constraints
# Determine subset_input parameters given the following constraints
# - Needs to keep the subset size below MAX_SUBSET_SIZE
# - (Not currently implemented) Resulting subsets should be evenly dividable by target_chunks (except for the last file, that can be odd). This might ultimately not be required once we figure out the locking issues. I cannot fulfill this right now with the dataset structure where often the first and last files have different number of timesteps than the 'middle' ones.

@@ -180,4 +182,4 @@ async def is_netcdf3(session: aiohttp.ClientSession, url: str) -> bool:
if not status_code == 206:
raise RuntimeError(f"Range request failed with {status_code} for {url}")
head = await resp.read()
return "CDF" in str(head)
return "CDF" in str(head)
29 changes: 19 additions & 10 deletions pangeo_forge_esgf/parsing.py
@@ -1,7 +1,8 @@
import requests
import json

from .utils import facets_from_iid


def request_from_facets(url, **facets):
params = {
"type": "Dataset",
@@ -25,21 +26,29 @@ def instance_ids_from_request(json_dict):
def parse_instance_ids(iid: str) -> list[str]:
"""Parse an instance id with wildcards"""
facets = facets_from_iid(iid)
#convert string to list if square brackets are found
for k,v in facets.items():
if '[' in v:
v = v.replace('[', '').replace(']', '').replace('\'', '').replace(' ','').split(',')
# convert string to list if square brackets are found
for k, v in facets.items():
if "[" in v:
v = (
v.replace("[", "")
.replace("]", "")
.replace("'", "")
.replace(" ", "")
.split(",")
)
facets[k] = v
facets_filtered = {k: v for k, v in facets.items() if v != "*"}

#TODO: I should make the node url a keyword argument. For now this works well enough
url="https://esgf-node.llnl.gov/esg-search/search"

# TODO: I should make the node url a keyword argument.
# For now this works well enough
url = "https://esgf-node.llnl.gov/esg-search/search"
# url = "https://esgf-data.dkrz.de/esg-search/search"
# TODO: how do I iterate over this more efficiently? Maybe we do not want to allow more than x files parsed?
# TODO: how do I iterate over this more efficiently?
# Maybe we do not want to allow more than x files parsed?
resp = request_from_facets(url, **facets_filtered)
if resp.status_code != 200:
print(f"Request [{resp.url}] failed with {resp.status_code}")
return resp
else:
json_dict = resp.json()
return instance_ids_from_request(json_dict)
return instance_ids_from_request(json_dict)
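
The bracket handling added above turns a facet value written like a Python list into a real list before the request, and wildcard facets are dropped. A standalone illustration of just that string manipulation, using a made-up facet dict:

```python
facets = {"variable_id": "[uo, vo]", "experiment_id": "lgm", "source_id": "*"}

for k, v in facets.items():
    if "[" in v:
        # Strip brackets, quotes and spaces, then split on commas.
        facets[k] = (
            v.replace("[", "")
            .replace("]", "")
            .replace("'", "")
            .replace(" ", "")
            .split(",")
        )

# Wildcards are excluded from the query, as in parse_instance_ids.
facets_filtered = {k: v for k, v in facets.items() if v != "*"}
print(facets_filtered)  # {'variable_id': ['uo', 'vo'], 'experiment_id': 'lgm'}
```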
46 changes: 22 additions & 24 deletions pangeo_forge_esgf/recipe_inputs.py
@@ -1,12 +1,12 @@
from typing import Dict, Union, List, Tuple
import aiohttp
import asyncio
import time
from typing import Dict, List, Union

import aiohttp

from .dynamic_kwargs import response_data_processing
from .utils import facets_from_iid


## global variables
# global variables
search_node_list = [
"https://esgf-node.llnl.gov/esg-search/search",
"https://esgf-data.dkrz.de/esg-search/search",
@@ -18,7 +18,7 @@
# For now just use llnl
search_node = search_node_list[0]

## Data nodes in preferred order (from naomis code here: https://github.com/naomi-henderson/cmip6collect2/blob/main/myconfig.py)
# Data nodes in preferred order (from naomis code here: https://github.com/naomi-henderson/cmip6collect2/blob/main/myconfig.py)
data_nodes = [
"esgf-data1.llnl.gov",
"esgf-data2.llnl.gov",
@@ -80,9 +80,7 @@ async def generate_recipe_inputs_from_iids(

tasks = []
for iid in iid_list:
tasks.append(
asyncio.ensure_future(iid_request(session, iid, search_node))
)
tasks.append(asyncio.ensure_future(iid_request(session, iid, search_node)))

raw_input = await asyncio.gather(*tasks)
recipe_inputs = {
@@ -111,9 +109,7 @@ async def iid_request(
filtered_response_data = await sort_and_filter_response(response_data, session)

print(f"Determining dynamics kwargs for {iid}...")
urls, kwargs = await response_data_processing(
session, filtered_response_data, iid
)
urls, kwargs = await response_data_processing(session, filtered_response_data, iid)

return urls, kwargs

@@ -128,7 +124,7 @@ async def _esgf_api_request(
"retracted": "false",
"format": "application/solr+json",
"fields": "url,size,table_id,title,instance_id,replica,data_node",
"latest": "true",
"latest": "true",
"distrib": "true",
"limit": 500, # This determines the number of urls/files that are returned. I dont expect this to be ever more than 500?
}
@@ -158,11 +154,11 @@


async def check_url(url, session):
try:
try:
async with session.head(url, timeout=5) as resp:
return resp.status
except asyncio.exceptions.TimeoutError:
return 503 #TODO: Is this best practice?
return 503 # TODO: Is this best practice?


async def sort_and_filter_response(
@@ -225,24 +221,26 @@ async def pick_data_node(
) -> Dict[str, Dict[str, str]]:
"""Filters out non-responsive data nodes, and then selects the preferred data node from available ones"""
test_response_list = response_groups.get(list(response_groups.keys())[0])
## Determine preferred data node

# Determine preferred data node
for data_node in data_nodes:
print(f'DEBUG: Testing data node: {data_node}')
matching_data_nodes = [r for r in test_response_list if r['data_node']==data_node]
if len(matching_data_nodes)==1:
matching_data_node = matching_data_nodes[0] # TODO: this is kinda clunky
status = await check_url(matching_data_node['url'], session)
print(f"DEBUG: Testing data node: {data_node}")
matching_data_nodes = [
r for r in test_response_list if r["data_node"] == data_node
]
if len(matching_data_nodes) == 1:
matching_data_node = matching_data_nodes[0] # TODO: this is kinda clunky
status = await check_url(matching_data_node["url"], session)
if status in [200, 308]:
picked_data_node = data_node
print(f"DEBUG: Picking preferred data_node: {picked_data_node}")
break
else:
print(f"Got status {status} for {matching_data_node['instance_id']}")
elif len(matching_data_nodes) == 0:
print(f'DEBUG: Data node: {data_node} not available')
print(f"DEBUG: Data node: {data_node} not available")
else:
raise # this should never happen
raise # this should never happen

# loop through all groups and filter for the picked data node
modified_response_groups = {}
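
The request fan-out reformatted in this file follows a standard asyncio/aiohttp pattern: one task per instance id, gathered on a shared session. A self-contained sketch of that pattern with a placeholder coroutine standing in for the real `iid_request` (which performs the ESGF query and kwarg processing):

```python
import asyncio

import aiohttp


async def iid_request(session: aiohttp.ClientSession, iid: str, node: str):
    # Placeholder: the real coroutine queries `node` for `iid` and returns
    # urls and keyword arguments for the recipe.
    await asyncio.sleep(0)
    return iid, node


async def gather_requests(iid_list, node="https://esgf-node.llnl.gov/esg-search/search"):
    async with aiohttp.ClientSession() as session:
        tasks = [asyncio.ensure_future(iid_request(session, iid, node)) for iid in iid_list]
        return await asyncio.gather(*tasks)


# results = asyncio.run(gather_requests(["CMIP6.PMIP.MRI.MRI-ESM2-0.past1000.r1i1p1f1.Amon.tas.gn.v20200120"]))
```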
3 changes: 2 additions & 1 deletion pangeo_forge_esgf/utils.py
@@ -1,9 +1,10 @@
from typing import Dict


def facets_from_iid(iid: str) -> Dict[str, str]:
"""Translates iid string to facet dict according to CMIP6 naming scheme"""
iid_name_template = "mip_era.activity_id.institution_id.source_id.experiment_id.variant_label.table_id.variable_id.grid_label.version"
facets = {}
for name, value in zip(iid_name_template.split("."), iid.split(".")):
facets[name] = value
return facets
return facets
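
Because `facets_from_iid` appears in full above, its behaviour is easy to illustrate: it zips the CMIP6 facet names against the dot-separated parts of the instance id (the example id below is one of those listed in the README):

```python
from pangeo_forge_esgf.utils import facets_from_iid

iid = "CMIP6.PMIP.MPI-M.MPI-ESM1-2-LR.lgm.r1i1p1f1.Omon.vo.gn.v20190710"
facets = facets_from_iid(iid)
# {'mip_era': 'CMIP6', 'activity_id': 'PMIP', 'institution_id': 'MPI-M',
#  'source_id': 'MPI-ESM1-2-LR', 'experiment_id': 'lgm',
#  'variant_label': 'r1i1p1f1', 'table_id': 'Omon', 'variable_id': 'vo',
#  'grid_label': 'gn', 'version': 'v20190710'}
```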
8 changes: 8 additions & 0 deletions pyproject.toml
@@ -6,3 +6,11 @@ build-backend = "setuptools.build_meta"
[tool.setuptools_scm]
write_to = "pangeo_forge_esgf/_version.py"
write_to_template = "__version__ = '{version}'"

[tool.isort]
profile = "black"
skip_gitignore = true
force_to_top = true
default_section = "THIRDPARTY"
known_first_party = "pangeo-forge-esgf"
skip= ["doc/conf.py"]
2 changes: 1 addition & 1 deletion setup.py
@@ -1,3 +1,3 @@
from setuptools import setup

setup()
setup()
13 changes: 7 additions & 6 deletions test_script.py
@@ -1,13 +1,14 @@
import asyncio

from pangeo_forge_esgf import generate_recipe_inputs_from_iids

iids = [
#PMIP runs requested by @CommonClimate
'CMIP6.PMIP.MIROC.MIROC-ES2L.past1000.r1i1p1f2.Amon.tas.gn.v20200318',
'CMIP6.PMIP.MRI.MRI-ESM2-0.past1000.r1i1p1f1.Amon.tas.gn.v20200120',
'CMIP6.PMIP.MPI-M.MPI-ESM1-2-LR.past2k.r1i1p1f1.Amon.tas.gn.v20210714',
# PMIP runs requested by @CommonClimate
"CMIP6.PMIP.MIROC.MIROC-ES2L.past1000.r1i1p1f2.Amon.tas.gn.v20200318",
"CMIP6.PMIP.MRI.MRI-ESM2-0.past1000.r1i1p1f1.Amon.tas.gn.v20200120",
"CMIP6.PMIP.MPI-M.MPI-ESM1-2-LR.past2k.r1i1p1f1.Amon.tas.gn.v20210714",
]

recipe_inputs = asyncio.run(generate_recipe_inputs_from_iids(iids))
print('DONE')
print(recipe_inputs)
print("DONE")
print(recipe_inputs)
