Skip to content

Commit

Permalink
Release 1.2.7
Browse files Browse the repository at this point in the history
  • Loading branch information
slesaad authored Jul 18, 2024
2 parents d8c6b5c + 122f4aa commit db65c94
Show file tree
Hide file tree
Showing 10 changed files with 486 additions and 134 deletions.
93 changes: 66 additions & 27 deletions pyQuARC/code/checker.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import json

from xmltodict import parse
from concurrent.futures import ThreadPoolExecutor, as_completed

from .custom_checker import CustomChecker
from .schema_validator import SchemaValidator
Expand Down Expand Up @@ -154,43 +155,81 @@ def _check_dependencies_validity(self, dependencies, field_dict):
return False
return True

def _process_field(
    self,
    func,
    check,
    rule_id,
    metadata_content,
    field_dict,
    result_dict,
    rule_mapping,
):
    """
    Apply one rule to one field entry and record the outcome in result_dict

    Args:
        func: The check function handed to the custom checker
        check: The check definition, forwarded to the scheduler
        rule_id (str): Identifier of the rule being applied
        metadata_content: The metadata being validated
        field_dict (dict): One entry of the rule's "fields_to_apply" list;
            its first "fields" item is used as the key in result_dict
        result_dict (dict): Mutated in place; maps main field -> rule_id -> result
        rule_mapping (dict): The rule definition ("data" and "relation" are read)
    """
    # Rule-level values; a field entry may carry its own "data" override
    default_data = rule_mapping.get("data", [])
    relation = rule_mapping.get("relation")

    dependencies = self.scheduler.get_all_dependencies(
        rule_mapping, check, field_dict
    )
    main_field = field_dict["fields"][0]
    external_data = field_dict.get("data", default_data)

    # The field always gets an entry, even when no rule result is stored for it
    result_dict.setdefault(main_field, {})

    if self._check_dependencies_validity(dependencies, field_dict):
        outcome = self.custom_checker.run(
            func, metadata_content, field_dict, external_data, relation
        )
        validity = outcome["valid"]
        self.tracker.update_data(rule_id, main_field, validity)

        # A None validity means the rule was not applied: keep it out of the output
        if validity is not None:
            result_dict[main_field][rule_id] = outcome

            text = self.build_message(outcome, rule_id)
            if text:
                outcome["message"] = text
                outcome["remediation"] = self.message(rule_id, "remediation")

def _run_func(self, func, check, rule_id, metadata_content, result_dict):
    """
    Run the check function for `rule_id` and update `result_dict`

    Every entry of the rule's "fields_to_apply" list for the current metadata
    format is handed to `_process_field` exactly once, on a small thread pool.

    Args:
        func: The check function to run
        check: The check definition, forwarded to `_process_field`
        rule_id (str): Identifier of the rule; an entry in `rules_override`
            takes precedence over the one in `rule_mapping`
        metadata_content: The metadata being validated
        result_dict (dict): Mutated in place by `_process_field` from the
            worker threads

    Raises:
        Exception: whatever a worker raised is re-raised here unchanged
    """
    rule_mapping = self.rules_override.get(rule_id) or self.rule_mapping.get(
        rule_id
    )
    list_of_fields_to_apply = rule_mapping.get("fields_to_apply").get(
        self.metadata_format, {}
    )

    # Each field is submitted once; per-field values such as the external data
    # and relation are resolved inside _process_field, not here.
    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = [
            executor.submit(
                self._process_field,
                func,
                check,
                rule_id,
                metadata_content,
                field_dict,
                result_dict,
                rule_mapping,
            )
            for field_dict in list_of_fields_to_apply
        ]

        # result() returns None on success and re-raises the worker's
        # exception on failure, so no extra try/except is needed.
        for future in as_completed(futures):
            future.result()

def perform_custom_checks(self, metadata_content):
"""
Expand Down
10 changes: 10 additions & 0 deletions pyQuARC/code/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,3 +77,13 @@
}

CMR_URL = "https://cmr.earthdata.nasa.gov"

# strptime patterns for date/time strings, ordered from finest to coarsest
# precision. None of them contains a timezone directive ("%z") or a literal
# trailing "Z", so only strings without a timezone designator match.
DATE_FORMATS = [
    "%Y-%m-%dT%H:%M:%S.%f",  # Year to microsecond
    "%Y-%m-%dT%H:%M:%S",  # Year to second
    "%Y-%m-%dT%H:%M",  # Year to minute
    "%Y-%m-%dT%H",  # Year to hour
    "%Y-%m-%d",  # Year to day
    "%Y-%m",  # Year to month
    "%Y",  # Year
]
75 changes: 57 additions & 18 deletions pyQuARC/code/custom_checker.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from urllib.parse import urlparse
from concurrent.futures import ThreadPoolExecutor, as_completed


class CustomChecker:
Expand Down Expand Up @@ -103,6 +104,33 @@ def _get_path_value(content_to_validate, path_string):
)
return container

@staticmethod
def _process_argument(arg, func, relation, external_data, external_relation):
    """
    Call `func` for one set of field values plus any applicable extra arguments.

    Args:
        arg: Iterable of positional values; passed to `func` first, in order.
        func: The function to be called.
        relation: Appended after `arg` when truthy.
        external_data: Iterable whose truthy items are appended next.
        external_relation: Appended last when truthy.

    Returns:
        Whatever `func` returns, unchanged.
    """
    # Falsy extras (None, "", 0, empty containers) are dropped, not passed on
    extras = (relation, *external_data, external_relation)
    call_args = list(arg) + [extra for extra in extras if extra]
    return func(*call_args)

def run(
self, func, content_to_validate, field_dict, external_data, external_relation
):
Expand Down Expand Up @@ -137,24 +165,35 @@ def run(

invalid_values = []
validity = None
for arg in args:
function_args = [*arg]
function_args.extend(
[
extra_arg
for extra_arg in [relation, *external_data, external_relation]
if extra_arg
]
)
func_return = func(*function_args)
valid = func_return["valid"] # can be True, False or None
if valid is not None:
if valid:
validity = validity or (validity is None)
else:
if "value" in func_return:
invalid_values.append(func_return["value"])
validity = False

# Process arguments using multithreading
with ThreadPoolExecutor() as executor:
future_results = []
for arg in args:
future = executor.submit(
self._process_argument,
arg,
func,
relation,
external_data,
external_relation,
)
future_results.append(future)

# Retrieve results from futures
for future in as_completed(future_results):
try:
func_return = future.result()
valid = func_return["valid"] # can be True, False or None
if valid is not None:
if valid:
validity = validity or (validity is None)
else:
if "value" in func_return:
invalid_values.append(func_return["value"])
validity = False
except Exception as e:
raise e
result["valid"] = validity
result["value"] = invalid_values
return result
6 changes: 3 additions & 3 deletions pyQuARC/code/custom_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,13 +92,13 @@ def one_item_presence_check(*field_values):
value = None

for field_value in field_values:
if field_value:
if field_value is not None:
value = field_value
validity = True
break

return {"valid": validity, "value": value}

@staticmethod
def dif_standard_product_check(*field_values):
"""
Expand Down Expand Up @@ -130,7 +130,7 @@ def license_url_description_check(description_field, url_field, license_text):
description_field (string): string describing the URL
"""
validity = True
value = description_field
value = description_field

if not license_text and not url_field:
validity = False
Expand Down
14 changes: 9 additions & 5 deletions pyQuARC/code/datetime_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from datetime import datetime

from .base_validator import BaseValidator
from .utils import cmr_request, if_arg, set_cmr_prms
from .utils import cmr_request, if_arg, set_cmr_prms, get_date_time


class DatetimeValidator(BaseValidator):
Expand Down Expand Up @@ -117,13 +117,13 @@ def compare(first, second, relation):

@staticmethod
def validate_datetime_against_granules(
datetime, collection_shortname, version, sort_key, time_key
datetime_string, collection_shortname, version, sort_key, time_key
):
"""
Validates the collection datetime against the datetime of the last granule in the collection
Args:
datetime (str): datetime string
datetime_string (str): datetime string
collection_shortname (str): ShortName of the parent collection
sort_key (str): choice of start_date and end_date
time_key (str): choice of time_end and time_start
Expand All @@ -143,13 +143,17 @@ def validate_datetime_against_granules(

validity = True
last_granule_datetime = None
date_time = None

# Compare the precision of the two datetime strings
if len(granules["feed"]["entry"]) > 0:
last_granule = granules["feed"]["entry"][0]
last_granule_datetime = last_granule.get(time_key)
validity = datetime == last_granule_datetime
date_time = get_date_time(datetime_string)
last_granule_datetime = get_date_time(last_granule_datetime)
validity = date_time == last_granule_datetime

return {"valid": validity, "value": (datetime, last_granule_datetime)}
return {"valid": validity, "value": (date_time, last_granule_datetime)}

@staticmethod
@if_arg
Expand Down
5 changes: 3 additions & 2 deletions pyQuARC/code/url_validator.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
import requests

from urlextract import URLExtract
Expand Down Expand Up @@ -54,7 +55,7 @@ def status_code_from_request(url):
validity = True

# extract URLs from text
extractor = URLExtract()
extractor = URLExtract(cache_dir=os.environ.get("CACHE_DIR"))
urls = extractor.find_urls(text_with_urls)
urls.extend(UrlValidator._extract_http_texts(text_with_urls))

Expand Down Expand Up @@ -115,4 +116,4 @@ def doi_link_update(value, bad_urls):
if value in bad_urls:
validity = False

return {"valid": validity, "Value": value}
return {"valid": validity, "value": value}
20 changes: 19 additions & 1 deletion pyQuARC/code/utils.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import os
import requests
import urllib
from datetime import datetime

from functools import wraps

from .constants import CMR_URL
from .constants import CMR_URL, DATE_FORMATS


def if_arg(func):
Expand Down Expand Up @@ -64,3 +65,20 @@ def cmr_request(cmr_prms):

def collection_in_cmr(cmr_prms):
return cmr_request(cmr_prms).get("hits", 0) > 0


def get_date_time(dt_str, formats=None):
    """
    Convert a date and time string to a datetime object using predefined formats.

    Each format is tried in order with `datetime.strptime`; the first one that
    parses the whole string wins.

    Args:
        dt_str (str): The date/time string to parse. Any non-string value
            (for example None, when a key is missing upstream) is treated as
            unparseable instead of raising TypeError.
        formats (iterable of str, optional): strptime patterns to try.
            Defaults to `DATE_FORMATS`.

    Returns:
        datetime or None: The parsed datetime, or None if no format matches.
    """
    # strptime raises TypeError (not ValueError) for non-string input, which
    # the loop below would not catch.
    if not isinstance(dt_str, str):
        return None

    for fmt in DATE_FORMATS if formats is None else formats:
        try:
            return datetime.strptime(dt_str, fmt)
        except ValueError:
            continue
    return None
Loading

0 comments on commit db65c94

Please sign in to comment.