Merge pull request #251 from NASA-IMPACT/dev

Updates: Adding Checks and Resolving Inconsistencies
NASA-IMPACT · Aug 2, 2023 · 88b7ac5 · 88b7ac5
2 parents baf4b47 + 8b6136c
commit 88b7ac5
Show file tree

Hide file tree

Showing 45 changed files with 2,332 additions and 1,262 deletions.
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
@@ -0,0 +1,7 @@
+version: 2
+updates:
+  # Enable version updates for pip
+  - package-ecosystem: "pip" # See documentation for possible values
+    directory: "/" # Location of package manifests
+    schedule:
+      interval: "weekly"
diff --git a/.gitignore b/.gitignore
@@ -7,3 +7,4 @@ build/*
 dist/*
 pyQuARC.egg-info/*
 env/*
+.venv/*
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,18 @@
 # CHANGELOG
 
+## v1.2.3
+- Updated schema files
+- Added Free And Open Data check
+- Added Horizontal Resolution Presence check
+- Added Data Format Presence check
+- Added Standard Product check
+- Added License URL Description check
+- Added Granule Campaign Name Presence check
+- Revised GCMD long name presence checks
+- Revised validate_beginning_datetime_against_granules check
+- Removed redundant checks
+- Fixed auth issue when downloading metadata files
+
 ## v1.2.2
 
 - Bugfixes:

diff --git a/README.md b/README.md
@@ -30,7 +30,7 @@ The CMR is designed around its own metadata standard called the [Unified Metadat
 
 pyQuARC supports DIF10 (collection only), ECHO10 (collection and granule), UMM-C, and UMM-G standards. At this time, there are no plans to add ISO 19115 or UMM-S/T specific checks. **Additionally, the output messages pyQuARC currently displays should be taken with a grain of salt. There is still testing and clean-up work to be done.**
 
-**For inquiries, please email: jeanne.leroux@nsstc.uah.edu**
+**For inquiries, please email: jenny.wood@uah.edu**
 
 ## pyQuARC as a Service (QuARC)
 
@@ -53,7 +53,7 @@ The `checks.json` file includes a comprehensive list of rules. Each rule is spec
 
 The `rule_mapping.json` file specifies which metadata element(s) each rule applies to. The `rule_mapping.json` also references the `messages.json` file which includes messages that can be displayed when a check passes or fails.
 
-Furthermore, the `rule_mapping.json` file specifies the level of severity associated with a failure. If a check fails, it will be assigned a severity category of “<span style="color:red">error</span>,” “<span style="color:orange">warning</span>,” or <span style="color:blue">info</span>.” These categories correspond to priority categorizations in [ARC’s priority matrix](https://wiki.earthdata.nasa.gov/display/CMR/ARC+Priority+Matrix) and communicate the importance of the failed check, with “error” being the most critical category, “warning” indicating a failure of medium priority, and “info” indicating a minor issue or inconsistency. Default severity values are assigned based on ARC’s metadata quality assessment framework, but can be customized to meet individual needs.
+Furthermore, the `rule_mapping.json` file specifies the level of severity associated with a failure. If a check fails, it will be assigned a severity category of “<span style="color:red">error</span>”, “<span style="color:orange">warning</span>”, or “<span style="color:blue">info</span>”. These categories correspond to priority categorizations in [ARC’s priority matrix](https://wiki.earthdata.nasa.gov/display/CMR/ARC+Priority+Matrix) and communicate the importance of the failed check, with “error” being the most critical category, “warning” indicating a failure of medium priority, and “info” indicating a minor issue or inconsistency. Default severity values are assigned based on ARC’s metadata quality assessment framework, but can be customized to meet individual needs.
 
 ## Customization
 pyQuARC is designed to be customizable. Output messages can be modified using the `messages_override.json` file - any messages added to `messages_override.json` will display over the default messages in the `messages.json` file. Similarly, there is a `rule_mapping_override.json` file which can be used to override the default settings for which rules/checks are applied to which metadata elements.  
@@ -317,7 +317,7 @@ Then, if the check function receives input `value1=0` and `value2=1`, the output
 The values 0 and 1 do not amount to a true value
 ```
 
-### Use as a package
+### Using as a package
 *Note:* This program requires `Python 3.8` to be installed on your system.
 
 **Clone the repo:** [https://github.com/NASA-IMPACT/pyQuARC/](https://github.com/NASA-IMPACT/pyQuARC/)

diff --git a/pyQuARC/__init__.py b/pyQuARC/__init__.py
@@ -17,7 +17,7 @@
 with open(f"{ABS_PATH}/version.txt") as version_file:
     __version__ = version_file.read().strip()
 
+
 def version():
-    """Returns the current version of pyQuARC.
-    """
+    """Returns the current version of pyQuARC."""
     return __version__
diff --git a/pyQuARC/code/base_validator.py b/pyQuARC/code/base_validator.py
@@ -40,7 +40,7 @@ def contains(list_of_values, value):
 
     @staticmethod
     def compare(first, second, relation):
-        if relation.startswith('not_'):
+        if relation.startswith("not_"):
             return not (BaseValidator.compare(first, second, relation[4:]))
         func = getattr(BaseValidator, relation)
         return func(first, second)
diff --git a/pyQuARC/code/checker.py b/pyQuARC/code/checker.py
@@ -26,7 +26,7 @@ def __init__(
         metadata_format=ECHO10_C,
         messages_override=None,
         checks_override=None,
-        rules_override=None
+        rules_override=None,
     ):
         """
         Args:
@@ -53,13 +53,13 @@ def __init__(
             self.rules_override,
             self.checks,
             self.checks_override,
-            metadata_format=metadata_format
+            metadata_format=metadata_format,
+        )
+        self.schema_validator = SchemaValidator(
+            self.messages_override or self.messages, metadata_format
         )
-        self.schema_validator = SchemaValidator(self.messages_override or self.messages, metadata_format)
         self.tracker = Tracker(
-            self.rule_mapping,
-            self.rules_override,
-            metadata_format=metadata_format
+            self.rule_mapping, self.rules_override, metadata_format=metadata_format
         )
 
     @staticmethod
@@ -76,15 +76,9 @@ def load_schemas(self):
         self.checks = Checker._json_load_schema("checks")
         self.rule_mapping = Checker._json_load_schema("rule_mapping")
         self.messages = Checker._json_load_schema("check_messages")
-        self.messages_override = Checker._json_load_schema(
-            self.msgs_override_file
-        )
-        self.rules_override = Checker._json_load_schema(
-            self.rules_override_file
-        )
-        self.checks_override = Checker._json_load_schema(
-            self.checks_override_file
-        )
+        self.messages_override = Checker._json_load_schema(self.msgs_override_file)
+        self.rules_override = Checker._json_load_schema(self.rules_override_file)
+        self.checks_override = Checker._json_load_schema(self.checks_override_file)
 
     @staticmethod
     def map_to_function(data_type, function):
@@ -112,19 +106,19 @@ def message(self, rule_id, msg_type):
         msg_type can be any one of 'failure', 'remediation'
         """
         messages = self.messages_override.get(rule_id) or self.messages.get(rule_id)
-        return messages[msg_type] if messages else ''
+        return messages[msg_type] if messages else ""
 
     def build_message(self, result, rule_id):
         """
         Formats the message for `rule_id` based on the result
         """
         failure_message = self.message(rule_id, "failure")
-        rule_mapping = self.rules_override.get(
+        rule_mapping = self.rules_override.get(rule_id) or self.rule_mapping.get(
             rule_id
-        ) or self.rule_mapping.get(rule_id)
+        )
         severity = rule_mapping.get("severity", "error")
         messages = []
-        if not(result["valid"]) and result.get("value"):
+        if not (result["valid"]) and result.get("value"):
             for value in result["value"]:
                 formatted_message = failure_message
                 value = value if isinstance(value, tuple) else (value,)
@@ -143,7 +137,9 @@ def _check_dependency_validity(self, dependency, field_dict):
         """
         Checks if the dependent check called `dependency` is valid
         """
-        dependency_fields = field_dict["fields"] if len(dependency) == 1 else [dependency[1]]
+        dependency_fields = (
+            field_dict["fields"] if len(dependency) == 1 else [dependency[1]]
+        )
         for field in dependency_fields:
             if not self.tracker.read_data(dependency[0], field).get("valid"):
                 return False
@@ -162,27 +158,26 @@ def _run_func(self, func, check, rule_id, metadata_content, result_dict):
         """
         Run the check function for `rule_id` and update `result_dict`
         """
-        rule_mapping = self.rules_override.get(
+        rule_mapping = self.rules_override.get(rule_id) or self.rule_mapping.get(
             rule_id
-        ) or self.rule_mapping.get(rule_id)
+        )
         external_data = rule_mapping.get("data", [])
         relation = rule_mapping.get("relation")
-        list_of_fields_to_apply = \
-            rule_mapping.get("fields_to_apply").get(self.metadata_format, {})
-
+        list_of_fields_to_apply = rule_mapping.get("fields_to_apply").get(
+            self.metadata_format, {}
+        )
+
         for field_dict in list_of_fields_to_apply:
-            dependencies = self.scheduler.get_all_dependencies(rule_mapping, check, field_dict)
+            dependencies = self.scheduler.get_all_dependencies(
+                rule_mapping, check, field_dict
+            )
             main_field = field_dict["fields"][0]
             external_data = field_dict.get("data", external_data)
             result_dict.setdefault(main_field, {})
             if not self._check_dependencies_validity(dependencies, field_dict):
                 continue
             result = self.custom_checker.run(
-                func,
-                metadata_content,
-                field_dict,
-                external_data,
-                relation
+                func, metadata_content, field_dict, external_data, relation
             )
 
             self.tracker.update_data(rule_id, main_field, result["valid"])
@@ -211,14 +206,16 @@ def perform_custom_checks(self, metadata_content):
                 ) or self.rule_mapping.get(rule_id)
                 check_id = rule_mapping.get("check_id", rule_id)
                 check = self.checks_override.get(check_id) or self.checks.get(check_id)
-                func = Checker.map_to_function(check["data_type"], check["check_function"])
+                func = Checker.map_to_function(
+                    check["data_type"], check["check_function"]
+                )
                 if func:
                     self._run_func(func, check, rule_id, metadata_content, result_dict)
             except Exception as e:
                 pyquarc_errors.append(
                     {
                         "message": f"Running check for the rule: '{rule_id}' failed.",
-                        "details": str(e)
+                        "details": str(e),
                     }
                 )
         return result_dict, pyquarc_errors
@@ -233,6 +230,7 @@ def run(self, metadata_content):
         Returns:
             (dict): The results of the jsonschema check and all custom checks
         """
+
         def _xml_postprocessor(_, key, value):
             """
             Sometimes the XML values contain attributes.
@@ -259,11 +257,7 @@ def _xml_postprocessor(_, key, value):
             parser = parse
             kwargs = {"postprocessor": _xml_postprocessor}
         json_metadata = parser(metadata_content, **kwargs)
-        result_schema = self.perform_schema_check(
-            metadata_content
-        )
+        result_schema = self.perform_schema_check(metadata_content)
         result_custom, pyquarc_errors = self.perform_custom_checks(json_metadata)
-        result = {
-            **result_schema, **result_custom
-        }
+        result = {**result_schema, **result_custom}
         return result, pyquarc_errors
diff --git a/pyQuARC/code/constants.py b/pyQuARC/code/constants.py
@@ -14,7 +14,7 @@
 
 ROOT_DIR = (
     # go up one directory
-    os.path.abspath(os.path.join(__file__, '../..'))
+    os.path.abspath(os.path.join(__file__, "../.."))
 )
 
 SCHEMAS_BASE_PATH = f"{ROOT_DIR}/schemas"
@@ -46,17 +46,17 @@
         "rules_override",
         f"{UMM_C}-json-schema",
         "umm-cmn-json-schema",
-        f"{UMM_G}-json-schema"
+        f"{UMM_G}-json-schema",
     ],
     "csv": GCMD_KEYWORDS,
-    "xsd": [ f"{DIF}_schema", f"{ECHO10_C}_schema", f"{ECHO10_G}_schema" ],
-    "xml": [ "catalog" ]
+    "xsd": [f"{DIF}_schema", f"{ECHO10_C}_schema", f"{ECHO10_G}_schema"],
+    "xml": ["catalog"],
 }
 
 SCHEMA_PATHS = {
-    schema:  f"{SCHEMAS_BASE_PATH}/{schema}.{filetype}"
-        for filetype, schemas in SCHEMAS.items()
-            for schema in schemas
+    schema: f"{SCHEMAS_BASE_PATH}/{schema}.{filetype}"
+    for filetype, schemas in SCHEMAS.items()
+    for schema in schemas
 }
 
 VERSION_FILE = f"{SCHEMAS_BASE_PATH}/version.txt"
@@ -67,7 +67,7 @@
     "error": Fore.RED,
     "warning": Fore.YELLOW,
     "reset": Style.RESET_ALL,
-    "bright": Style.BRIGHT
+    "bright": Style.BRIGHT,
 }
 
 GCMD_BASIC_URL = "https://gcmd.earthdata.nasa.gov/kms/concepts/concept_scheme/"

diff --git a/pyQuARC/code/custom_checker.py b/pyQuARC/code/custom_checker.py
@@ -10,7 +10,9 @@ def __init__(self):
         pass
 
     @staticmethod
-    def _get_path_value_recursively(subset_of_metadata_content, path_list, container, query_params=None):
+    def _get_path_value_recursively(
+        subset_of_metadata_content, path_list, container, query_params=None
+    ):
         """
         Gets the path values recursively while handling list or dictionary in `subset_of_metadata_content`
         Adds the values to `container`
@@ -37,7 +39,11 @@ def _get_path_value_recursively(subset_of_metadata_content, path_list, container
             container.append(subset_of_metadata_content)
             return
         new_path = path_list[1:]
-        if isinstance(root_content, str) or isinstance(root_content, int) or isinstance(root_content, float):
+        if (
+            isinstance(root_content, str)
+            or isinstance(root_content, int)
+            or isinstance(root_content, float)
+        ):
             container.append(root_content)
             return
         elif isinstance(root_content, list):
@@ -46,7 +52,13 @@ def _get_path_value_recursively(subset_of_metadata_content, path_list, container
                 return
             if len(new_path) == 1 and query_params:
                 try:
-                    root_content = next((x for x in root_content if x[query_params[0]] == query_params[1]))
+                    root_content = next(
+                        (
+                            x
+                            for x in root_content
+                            if x[query_params[0]] == query_params[1]
+                        )
+                    )
                     root_content = root_content[new_path[0]]
                     container.append(root_content)
                 except:
@@ -55,13 +67,15 @@ def _get_path_value_recursively(subset_of_metadata_content, path_list, container
             for each in root_content:
                 try:
                     CustomChecker._get_path_value_recursively(
-                        each, new_path, container, query_params)
+                        each, new_path, container, query_params
+                    )
                 except KeyError:
                     container.append(None)
                     continue
         elif isinstance(root_content, dict):
             CustomChecker._get_path_value_recursively(
-                root_content, new_path, container, query_params)
+                root_content, new_path, container, query_params
+            )
 
     @staticmethod
     def _get_path_value(content_to_validate, path_string):
@@ -80,15 +94,18 @@ def _get_path_value(content_to_validate, path_string):
         query_params = None
 
         parsed = urlparse(path_string)
-        path = parsed.path.split('/')
+        path = parsed.path.split("/")
         if key_value := parsed.query:
-            query_params = key_value.split('=')
+            query_params = key_value.split("=")
 
         CustomChecker._get_path_value_recursively(
-            content_to_validate, path, container, query_params)
+            content_to_validate, path, container, query_params
+        )
         return container
 
-    def run(self, func, content_to_validate, field_dict, external_data, external_relation):
+    def run(
+        self, func, content_to_validate, field_dict, external_data, external_relation
+    ):
         """
         Runs the custom check based on `func` to the `content_to_validate`'s `field_dict` path
 
@@ -112,22 +129,25 @@ def run(self, func, content_to_validate, field_dict, external_data, external_rel
         fields = field_dict["fields"]
         field_values = []
         relation = field_dict.get("relation")
-        result = {
-            "valid": None
-        }
+        result = {"valid": None}
         for _field in fields:
-            value = CustomChecker._get_path_value(
-                content_to_validate, _field)
+            value = CustomChecker._get_path_value(content_to_validate, _field)
             field_values.append(value)
         args = zip(*field_values)
 
         invalid_values = []
         validity = None
         for arg in args:
             function_args = [*arg]
-            function_args.extend([extra_arg for extra_arg in [relation, *external_data, external_relation] if extra_arg])
+            function_args.extend(
+                [
+                    extra_arg
+                    for extra_arg in [relation, *external_data, external_relation]
+                    if extra_arg
+                ]
+            )
             func_return = func(*function_args)
-            valid = func_return["valid"] # can be True, False or None
+            valid = func_return["valid"]  # can be True, False or None
             if valid is not None:
                 if valid:
                     validity = validity or (validity is None)