Release 1.2.7

NASA-IMPACT · Jul 18, 2024
Commit db65c94 (merge of 2 parents: d8c6b5c + 122f4aa)
Showing 10 changed files with 486 additions and 134 deletions.
diff --git a/pyQuARC/code/checker.py b/pyQuARC/code/checker.py
@@ -1,6 +1,7 @@
 import json
 
 from xmltodict import parse
+from concurrent.futures import ThreadPoolExecutor, as_completed
 
 from .custom_checker import CustomChecker
 from .schema_validator import SchemaValidator
@@ -154,43 +155,81 @@ def _check_dependencies_validity(self, dependencies, field_dict):
                 return False
         return True
 
+    def _process_field(
+        self,
+        func,
+        check,
+        rule_id,
+        metadata_content,
+        field_dict,
+        result_dict,
+        rule_mapping,
+    ):
+        """
+        Process a single field according to the given rule and update result_dict
+        """
+        external_data = rule_mapping.get("data", [])
+        relation = rule_mapping.get("relation")
+        dependencies = self.scheduler.get_all_dependencies(
+            rule_mapping, check, field_dict
+        )
+        main_field = field_dict["fields"][0]
+        external_data = field_dict.get("data", external_data)
+        result_dict.setdefault(main_field, {})
+
+        if not self._check_dependencies_validity(dependencies, field_dict):
+            return
+
+        result = self.custom_checker.run(
+            func, metadata_content, field_dict, external_data, relation
+        )
+
+        self.tracker.update_data(rule_id, main_field, result["valid"])
+
+        # Avoid adding null valid results for rules that are not applied
+        if result["valid"] is None:
+            return
+
+        result_dict[main_field][rule_id] = result
+
+        message = self.build_message(result, rule_id)
+        if message:
+            result["message"] = message
+            result["remediation"] = self.message(rule_id, "remediation")
+
     def _run_func(self, func, check, rule_id, metadata_content, result_dict):
         """
         Run the check function for `rule_id` and update `result_dict`
         """
         rule_mapping = self.rules_override.get(rule_id) or self.rule_mapping.get(
             rule_id
         )
-        external_data = rule_mapping.get("data", [])
-        relation = rule_mapping.get("relation")
         list_of_fields_to_apply = rule_mapping.get("fields_to_apply").get(
             self.metadata_format, {}
         )
-
-        for field_dict in list_of_fields_to_apply:
-            dependencies = self.scheduler.get_all_dependencies(
-                rule_mapping, check, field_dict
-            )
-            main_field = field_dict["fields"][0]
-            external_data = field_dict.get("data", external_data)
-            result_dict.setdefault(main_field, {})
-            if not self._check_dependencies_validity(dependencies, field_dict):
-                continue
-            result = self.custom_checker.run(
-                func, metadata_content, field_dict, external_data, relation
-            )
-
-            self.tracker.update_data(rule_id, main_field, result["valid"])
-
-            # this is to avoid "valid" = null in the result, for rules that are not applied
-            if result["valid"] is None:
-                continue
-            result_dict[main_field][rule_id] = result
-
-            message = self.build_message(result, rule_id)
-            if message:
-                result["message"] = message
-                result["remediation"] = self.message(rule_id, "remediation")
+        with ThreadPoolExecutor(max_workers=5) as executor:
+            futures = []
+            for field_dict in list_of_fields_to_apply:
+                future = executor.submit(
+                    self._process_field,
+                    func,
+                    check,
+                    rule_id,
+                    metadata_content,
+                    field_dict,
+                    result_dict,
+                    rule_mapping,
+                )
+                futures.append(future)
+
+            # Wait for all futures to complete
+            for future in as_completed(futures):
+                # Retrieve the result or raise an exception if an error occurred
+                try:
+                    future.result()
+                except Exception as e:
+                    # Handle the exception from the thread
+                    raise e
 
     def perform_custom_checks(self, metadata_content):
         """

diff --git a/pyQuARC/code/constants.py b/pyQuARC/code/constants.py
@@ -77,3 +77,13 @@
 }
 
 CMR_URL = "https://cmr.earthdata.nasa.gov"
+
+DATE_FORMATS = [
+    "%Y-%m-%dT%H:%M:%S.%f",  # Year to microsecond
+    "%Y-%m-%dT%H:%M:%S",  # Year to second
+    "%Y-%m-%dT%H:%M",  # Year to minute
+    "%Y-%m-%dT%H",  # Year to hour
+    "%Y-%m-%d",  # Year to day
+    "%Y-%m",  # Year to month
+    "%Y",  # Year
+]
diff --git a/pyQuARC/code/custom_checker.py b/pyQuARC/code/custom_checker.py
@@ -1,4 +1,5 @@
 from urllib.parse import urlparse
+from concurrent.futures import ThreadPoolExecutor, as_completed
 
 
 class CustomChecker:
@@ -103,6 +104,33 @@ def _get_path_value(content_to_validate, path_string):
         )
         return container
 
+    @staticmethod
+    def _process_argument(arg, func, relation, external_data, external_relation):
+        """
+        Process the argument by calling the provided function with the given arguments.
+
+        Args:
+            arg: The argument to be processed.
+            func: The function to be called.
+            relation: The relation argument.
+            external_data: The external data argument.
+            external_relation: The external relation argument.
+
+        Returns:
+            A dict containing the updated invalid_values list and the updated validity flag.
+        """
+
+        function_args = [*arg]
+        function_args.extend(
+            [
+                extra_arg
+                for extra_arg in [relation, *external_data, external_relation]
+                if extra_arg
+            ]
+        )
+        func_return = func(*function_args)
+        return func_return
+
     def run(
         self, func, content_to_validate, field_dict, external_data, external_relation
     ):
@@ -137,24 +165,35 @@ def run(
 
         invalid_values = []
         validity = None
-        for arg in args:
-            function_args = [*arg]
-            function_args.extend(
-                [
-                    extra_arg
-                    for extra_arg in [relation, *external_data, external_relation]
-                    if extra_arg
-                ]
-            )
-            func_return = func(*function_args)
-            valid = func_return["valid"]  # can be True, False or None
-            if valid is not None:
-                if valid:
-                    validity = validity or (validity is None)
-                else:
-                    if "value" in func_return:
-                        invalid_values.append(func_return["value"])
-                    validity = False
+
+        # Process arguments using multithreading
+        with ThreadPoolExecutor() as executor:
+            future_results = []
+            for arg in args:
+                future = executor.submit(
+                    self._process_argument,
+                    arg,
+                    func,
+                    relation,
+                    external_data,
+                    external_relation,
+                )
+                future_results.append(future)
+
+            # Retrieve results from futures
+            for future in as_completed(future_results):
+                try:
+                    func_return = future.result()
+                    valid = func_return["valid"]  # can be True, False or None
+                    if valid is not None:
+                        if valid:
+                            validity = validity or (validity is None)
+                        else:
+                            if "value" in func_return:
+                                invalid_values.append(func_return["value"])
+                            validity = False
+                except Exception as e:
+                    raise e
         result["valid"] = validity
         result["value"] = invalid_values
         return result
diff --git a/pyQuARC/code/custom_validator.py b/pyQuARC/code/custom_validator.py
@@ -92,13 +92,13 @@ def one_item_presence_check(*field_values):
         value = None
 
         for field_value in field_values:
-            if field_value:
+            if field_value is not None:
                 value = field_value
                 validity = True
                 break
 
         return {"valid": validity, "value": value}
-        
+
     @staticmethod
     def dif_standard_product_check(*field_values):
         """
@@ -130,7 +130,7 @@ def license_url_description_check(description_field, url_field, license_text):
             description_field (string): string describing the URL
         """
         validity = True
-        value  = description_field
+        value = description_field
 
         if not license_text and not url_field:
             validity = False

diff --git a/pyQuARC/code/datetime_validator.py b/pyQuARC/code/datetime_validator.py
@@ -4,7 +4,7 @@
 from datetime import datetime
 
 from .base_validator import BaseValidator
-from .utils import cmr_request, if_arg, set_cmr_prms
+from .utils import cmr_request, if_arg, set_cmr_prms, get_date_time
 
 
 class DatetimeValidator(BaseValidator):
@@ -117,13 +117,13 @@ def compare(first, second, relation):
 
     @staticmethod
     def validate_datetime_against_granules(
-        datetime, collection_shortname, version, sort_key, time_key
+        datetime_string, collection_shortname, version, sort_key, time_key
     ):
         """
         Validates the collection datetime against the datetime of the last granule in the collection
 
         Args:
-            datetime (str): datetime string
+            datetime_string (str): datetime string
             collection_shortname (str): ShortName of the parent collection
             sort_key (str): choice of start_date and end_date
             time_key (str): choice of time_end and time_start
@@ -143,13 +143,17 @@ def validate_datetime_against_granules(
 
         validity = True
         last_granule_datetime = None
+        date_time = None
 
+        # Compare the precision of the two datetime strings
         if len(granules["feed"]["entry"]) > 0:
             last_granule = granules["feed"]["entry"][0]
             last_granule_datetime = last_granule.get(time_key)
-            validity = datetime == last_granule_datetime
+            date_time = get_date_time(datetime_string)
+            last_granule_datetime = get_date_time(last_granule_datetime)
+            validity = date_time == last_granule_datetime
 
-        return {"valid": validity, "value": (datetime, last_granule_datetime)}
+        return {"valid": validity, "value": (date_time, last_granule_datetime)}
 
     @staticmethod
     @if_arg

diff --git a/pyQuARC/code/url_validator.py b/pyQuARC/code/url_validator.py
@@ -1,3 +1,4 @@
+import os
 import requests
 
 from urlextract import URLExtract
@@ -54,7 +55,7 @@ def status_code_from_request(url):
         validity = True
 
         # extract URLs from text
-        extractor = URLExtract()
+        extractor = URLExtract(cache_dir=os.environ.get("CACHE_DIR"))
         urls = extractor.find_urls(text_with_urls)
         urls.extend(UrlValidator._extract_http_texts(text_with_urls))
 
@@ -115,4 +116,4 @@ def doi_link_update(value, bad_urls):
         if value in bad_urls:
             validity = False
 
-        return {"valid": validity, "Value": value}
+        return {"valid": validity, "value": value}
diff --git a/pyQuARC/code/utils.py b/pyQuARC/code/utils.py
@@ -1,10 +1,11 @@
 import os
 import requests
 import urllib
+from datetime import datetime
 
 from functools import wraps
 
-from .constants import CMR_URL
+from .constants import CMR_URL, DATE_FORMATS
 
 
 def if_arg(func):
@@ -64,3 +65,20 @@ def cmr_request(cmr_prms):
 
 def collection_in_cmr(cmr_prms):
     return cmr_request(cmr_prms).get("hits", 0) > 0
+
+
+def get_date_time(dt_str):
+    """
+    Convert a date and time string to a datetime object using predefined formats.
+    This function attempts to parse a date and time string (`dt_str`) into a `datetime` object.
+    It iterates over a list of possible date and time formats (`DATE_FORMATS`). The first successful
+    parse using one of these formats will result in returning the corresponding `datetime` object.
+    If none of the formats match, the function returns `None`.
+    """
+    for fmt in DATE_FORMATS:
+        try:
+            date_time = datetime.strptime(dt_str, fmt)
+            return date_time
+        except ValueError:
+            continue
+    return None