From 63471aa3cb6ea05ddebfce10cf69f7b06ed9b5cd Mon Sep 17 00:00:00 2001 From: Nargis Sultani Date: Thu, 14 Sep 2023 14:57:57 -0400 Subject: [PATCH 1/4] Task 42: structure validator CLI output --- src/validator/create_schemas.py | 78 ++++++++++++++++++++++++++------- 1 file changed, 61 insertions(+), 17 deletions(-) diff --git a/src/validator/create_schemas.py b/src/validator/create_schemas.py index e582747b..725608f2 100644 --- a/src/validator/create_schemas.py +++ b/src/validator/create_schemas.py @@ -1,6 +1,8 @@ """Creates two DataFrameSchema objects by rendering the schema template with validations listed in phase 1 and phase 2.""" +import pandas as pd +from checks import SBLCheck from pandera import DataFrameSchema from pandera.errors import SchemaErrors from phase_validations import get_phase_1_and_2_validations_for_lei @@ -21,26 +23,68 @@ def get_schema_by_phase_for_lei(template: dict, phase: str, lei: str = None): def print_schema_errors(errors: SchemaErrors, phase: str): - for error in errors.schema_errors: - # Name of the column in the dataframe being checked - schema_error = error["error"] + findings = [] + print("Validation failed for phase: " + phase + ":") + for schema_error in errors.schema_errors: + error = schema_error["error"] + check: SBLCheck = error.check + column_name = error.schema.name check_id = "n/a" - # built in checks such as unique=True are different than custom - # checks unfortunately so the name needs to be accessed differently - try: - check_name = schema_error.check.name - check_id = schema_error.check.id + fields: list[str] = [column_name] + + if hasattr(check, "name"): + check_name: str = check.name + + if check.groupby: + fields += check.groupby # type: ignore + # This will either be a boolean series or a single bool - check_output = schema_error.check_output - except AttributeError: - check_name = schema_error.check - # this is just a string that we'd need to parse manually - check_output = schema_error.args[0] - - print(f"{phase} Validation `{check_name}` with id: `{check_id}` failed for column `{{column_name}}`") - print(check_output) - print("") + check_output = error.check_output + else: + # This means this check's column has unique set to True. + # we shouldn't be using Unique flag as it doesn't return series of + # validation result . it returns just a printout result string/txt + raise AttributeError(f"{str(check)}") + if hasattr(check, "id"): + check_id = schema_error.check.id + + # Remove duplicates, but keep as `list` for JSON-friendliness + fields = list(set(fields)) + + if check_output is not None: + # `check_output` must be sorted so its index lines up with `df`'s index + check_output.sort_index(inplace=True) + + # Filter records using Pandas's boolean indexing, where all False values + # get filtered out. The `~` does the inverse since it's actually the + # False values we want to keep. + # http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#boolean-indexing + failed_check_fields_df = df[~check_output][fields].fillna("") + + # Create list of dicts representing the failed validations and the + # associated field data for each invalid record. + records = [] + for idx, row in failed_check_fields_df.iterrows(): + record = {"number": idx + 1, "field_values": {}} + for field in fields: + record["field_values"][field] = row[field] + records.append(record) + + validation_findings = { + "validation": { + "id": check_id, + "name": check_name, + "description": check.description, + "fields": fields, + "severity": "warning" if check.warning else "error", + }, + "records": records, + } + + findings.append(validation_findings) + + return findings def get_phase_1_schema_for_lei(lei: str = None): From 761470ba9acbeda8fbb68793e255d21866a9e29a Mon Sep 17 00:00:00 2001 From: Nargis Sultani Date: Thu, 14 Sep 2023 15:31:45 -0400 Subject: [PATCH 2/4] Fixed issues --- src/validator/create_schemas.py | 156 +++++++++++++++++++------------- src/validator/main.py | 23 +---- 2 files changed, 93 insertions(+), 86 deletions(-) diff --git a/src/validator/create_schemas.py b/src/validator/create_schemas.py index 725608f2..65884f43 100644 --- a/src/validator/create_schemas.py +++ b/src/validator/create_schemas.py @@ -22,74 +22,100 @@ def get_schema_by_phase_for_lei(template: dict, phase: str, lei: str = None): return DataFrameSchema(template) -def print_schema_errors(errors: SchemaErrors, phase: str): - findings = [] - print("Validation failed for phase: " + phase + ":") - for schema_error in errors.schema_errors: - error = schema_error["error"] - check: SBLCheck = error.check - column_name = error.schema.name - check_id = "n/a" - - fields: list[str] = [column_name] - - if hasattr(check, "name"): - check_name: str = check.name - - if check.groupby: - fields += check.groupby # type: ignore - - # This will either be a boolean series or a single bool - check_output = error.check_output - else: - # This means this check's column has unique set to True. - # we shouldn't be using Unique flag as it doesn't return series of - # validation result . it returns just a printout result string/txt - raise AttributeError(f"{str(check)}") - if hasattr(check, "id"): - check_id = schema_error.check.id - - # Remove duplicates, but keep as `list` for JSON-friendliness - fields = list(set(fields)) - - if check_output is not None: - # `check_output` must be sorted so its index lines up with `df`'s index - check_output.sort_index(inplace=True) - - # Filter records using Pandas's boolean indexing, where all False values - # get filtered out. The `~` does the inverse since it's actually the - # False values we want to keep. - # http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#boolean-indexing - failed_check_fields_df = df[~check_output][fields].fillna("") - - # Create list of dicts representing the failed validations and the - # associated field data for each invalid record. - records = [] - for idx, row in failed_check_fields_df.iterrows(): - record = {"number": idx + 1, "field_values": {}} - for field in fields: - record["field_values"][field] = row[field] - records.append(record) - - validation_findings = { - "validation": { - "id": check_id, - "name": check_name, - "description": check.description, - "fields": fields, - "severity": "warning" if check.warning else "error", - }, - "records": records, - } - - findings.append(validation_findings) - - return findings - - def get_phase_1_schema_for_lei(lei: str = None): return get_schema_by_phase_for_lei(phase_1_template, "phase_1", lei) def get_phase_2_schema_for_lei(lei: str = None): return get_schema_by_phase_for_lei(phase_2_template, "phase_2", lei) + + +def validate(schema: DataFrameSchema, df: pd.DataFrame): + """ + validate received dataframe with schema and return list of + schema errors + + Args: + schema (DataFrameSchema): schema to be used for validation + df (pd.DataFrame): data parsed into dataframe + + Returns: + list of schema error + """ + findings = [] + try: + schema(df, lazy=True) + except SchemaErrors as errors: + for schema_error in errors.schema_errors: + error = schema_error["error"] + check: SBLCheck = error.check + column_name = error.schema.name + check_id = "n/a" + + fields: list[str] = [column_name] + + if hasattr(check, "name"): + check_name: str = check.name + + if check.groupby: + fields += check.groupby # type: ignore + + # This will either be a boolean series or a single bool + check_output = error.check_output + else: + # This means this check's column has unique set to True. + # we shouldn't be using Unique flag as it doesn't return series of + # validation result . it returns just a printout result string/txt + raise AttributeError(f"{str(check)}") + + if hasattr(check, "id"): + check_id: str = check.id + + # Remove duplicates, but keep as `list` for JSON-friendliness + fields = list(set(fields)) + + if check_output is not None: + # `check_output` must be sorted so its index lines up with `df`'s index + check_output.sort_index(inplace=True) + + # Filter records using Pandas's boolean indexing, where all False values + # get filtered out. The `~` does the inverse since it's actually the + # False values we want to keep. + # http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#boolean-indexing + failed_check_fields_df = df[~check_output][fields].fillna("") + + # Create list of dicts representing the failed validations and the + # associated field data for each invalid record. + records = [] + for idx, row in failed_check_fields_df.iterrows(): + record = {"number": idx + 1, "field_values": {}} + for field in fields: + record["field_values"][field] = row[field] + records.append(record) + + validation_findings = { + "validation": { + "id": check_id, + "name": check_name, + "description": check.description, + "fields": fields, + "severity": "warning" if check.warning else "error", + }, + "records": records, + } + + findings.append(validation_findings) + + return findings + + +def validate_phases(phase1: DataFrameSchema, phase2: DataFrameSchema, df: pd.DataFrame) -> list: + phase1_findings = validate(phase1, df) + if phase1_findings: + return phase1_findings + else: + phase2_findings = validate(phase2, df) + if phase2_findings: + return phase2_findings + else: + return [{"response": "No validations errors or warnings"}] diff --git a/src/validator/main.py b/src/validator/main.py index 433275be..07d9add1 100644 --- a/src/validator/main.py +++ b/src/validator/main.py @@ -8,12 +8,7 @@ import sys import pandas as pd -from create_schemas import ( - get_phase_1_schema_for_lei, - get_phase_2_schema_for_lei, - print_schema_errors, -) -from pandera.errors import SchemaErrors +from create_schemas import get_phase_1_schema_for_lei, get_phase_2_schema_for_lei, validate_phases def csv_to_df(path: str) -> pd.DataFrame: @@ -32,21 +27,7 @@ def run_validation_on_df(df: pd.DataFrame, lei: str) -> None: print(df) print("") - phase_1_failure_cases = None - - phase_1_sblar_schema = get_phase_1_schema_for_lei(lei) - try: - phase_1_sblar_schema(df, lazy=True) - except SchemaErrors as errors: - phase_1_failure_cases = errors.failure_cases - print_schema_errors(errors, "Phase 1") - - if phase_1_failure_cases is None: - phase_2_sblar_schema = get_phase_2_schema_for_lei(lei) - try: - phase_2_sblar_schema(df, lazy=True) - except SchemaErrors as errors: - print_schema_errors(errors, "Phase 2") + print(validate_phases(get_phase_1_schema_for_lei(lei), get_phase_2_schema_for_lei(lei), df)) if __name__ == "__main__": From 573a7465f13d61af4f0c3fd9b80fe80bac4a92b9 Mon Sep 17 00:00:00 2001 From: Nargis Sultani Date: Thu, 14 Sep 2023 15:42:23 -0400 Subject: [PATCH 3/4] made changes --- src/validator/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/validator/main.py b/src/validator/main.py index 07d9add1..680173bd 100644 --- a/src/validator/main.py +++ b/src/validator/main.py @@ -8,7 +8,7 @@ import sys import pandas as pd -from create_schemas import get_phase_1_schema_for_lei, get_phase_2_schema_for_lei, validate_phases +from create_schemas import validate_phases_by_lei def csv_to_df(path: str) -> pd.DataFrame: @@ -27,7 +27,7 @@ def run_validation_on_df(df: pd.DataFrame, lei: str) -> None: print(df) print("") - print(validate_phases(get_phase_1_schema_for_lei(lei), get_phase_2_schema_for_lei(lei), df)) + print(validate_phases_by_lei(df, lei)) if __name__ == "__main__": From e0bb624eda760b55c062eecc4b806522e1f9a8a5 Mon Sep 17 00:00:00 2001 From: Nargis Sultani Date: Fri, 15 Sep 2023 10:50:51 -0400 Subject: [PATCH 4/4] added changes --- src/validator/create_schemas.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/validator/create_schemas.py b/src/validator/create_schemas.py index 65884f43..8e21391b 100644 --- a/src/validator/create_schemas.py +++ b/src/validator/create_schemas.py @@ -109,12 +109,12 @@ def validate(schema: DataFrameSchema, df: pd.DataFrame): return findings -def validate_phases(phase1: DataFrameSchema, phase2: DataFrameSchema, df: pd.DataFrame) -> list: - phase1_findings = validate(phase1, df) +def validate_phases_by_lei(df: pd.DataFrame, lei: str) -> list: + phase1_findings = validate(get_phase_1_schema_for_lei(lei), df) if phase1_findings: return phase1_findings else: - phase2_findings = validate(phase2, df) + phase2_findings = validate(get_phase_2_schema_for_lei((lei)), df) if phase2_findings: return phase2_findings else: