Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Task 42: structure validator CLI output #47

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 93 additions & 23 deletions src/validator/create_schemas.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
"""Creates two DataFrameSchema objects by rendering the schema template
with validations listed in phase 1 and phase 2."""

import pandas as pd
from checks import SBLCheck
from pandera import DataFrameSchema
from pandera.errors import SchemaErrors
from phase_validations import get_phase_1_and_2_validations_for_lei
Expand All @@ -20,32 +22,100 @@ def get_schema_by_phase_for_lei(template: dict, phase: str, lei: str = None):
return DataFrameSchema(template)


def print_schema_errors(errors: SchemaErrors, phase: str):
    """Print a short report for every failed check collected in `errors`.

    For each entry of `errors.schema_errors` this prints a headline naming the
    phase, the check, the check id and the column, followed by the check
    output and a blank separator line.

    Args:
        errors (SchemaErrors): collected validation errors; every item of
            `errors.schema_errors` is read through its "error" key.
        phase (str): label prefixed to each headline, e.g. "Phase 1".
    """
    for error in errors.schema_errors:
        schema_error = error["error"]

        # Name of the column in the dataframe being checked. Falls back to
        # "n/a" so a missing schema/name never aborts the report.
        column_name = getattr(
            getattr(schema_error, "schema", None), "name", "n/a"
        )

        check = schema_error.check

        # built in checks such as unique=True are different than custom
        # checks unfortunately so the name needs to be accessed differently
        if hasattr(check, "name"):
            check_name = check.name
            # A check may carry a name without an id; keep the name and
            # report the id as "n/a" instead of treating it as built-in.
            check_id = getattr(check, "id", "n/a")
            # This will either be a boolean series or a single bool
            check_output = schema_error.check_output
        else:
            check_name = check
            check_id = "n/a"
            # this is just a string that we'd need to parse manually
            check_output = schema_error.args[0]

        print(
            f"{phase} Validation `{check_name}` with id: `{check_id}` "
            f"failed for column `{column_name}`"
        )
        print(check_output)
        print("")


def get_phase_1_schema_for_lei(lei: str = None):
    """Build the phase 1 schema from `phase_1_template`.

    Args:
        lei (str, optional): forwarded unchanged to
            `get_schema_by_phase_for_lei`. Defaults to None.

    Returns:
        The result of `get_schema_by_phase_for_lei` for the "phase_1" phase.
    """
    return get_schema_by_phase_for_lei(
        template=phase_1_template,
        phase="phase_1",
        lei=lei,
    )


def get_phase_2_schema_for_lei(lei: str = None):
    """Build the phase 2 schema from `phase_2_template`.

    Args:
        lei (str, optional): forwarded unchanged to
            `get_schema_by_phase_for_lei`. Defaults to None.

    Returns:
        The result of `get_schema_by_phase_for_lei` for the "phase_2" phase.
    """
    return get_schema_by_phase_for_lei(
        template=phase_2_template,
        phase="phase_2",
        lei=lei,
    )


def validate(schema: DataFrameSchema, df: pd.DataFrame):
    """
    Validate the received dataframe with the schema and return a list of
    findings, one per failed check.

    Args:
        schema (DataFrameSchema): schema to be used for validation
        df (pd.DataFrame): data parsed into dataframe

    Returns:
        list of dicts. Each dict has a "validation" entry (id, name,
        description, fields, severity) and a "records" entry listing the
        offending rows with their field values. The list is empty when the
        schema raises no `SchemaErrors`.

    Raises:
        AttributeError: when a failed check has no `name` attribute, which
            happens for checks that only produce a text message.
    """
    # NOTE(review): PR feedback left on this function — the functions in #32
    # do not handle LEI, so they will need updating to handle the LEI check.
    findings = []
    try:
        schema(df, lazy=True)
    except SchemaErrors as errors:
        for schema_error in errors.schema_errors:
            error = schema_error["error"]
            check: SBLCheck = error.check
            column_name = error.schema.name

            if not hasattr(check, "name"):
                # This means this check's column has unique set to True.
                # We shouldn't be using the unique flag as it doesn't return
                # a series of validation results; it returns just a printout
                # string.
                raise AttributeError(f"{str(check)}")

            check_name: str = check.name
            check_id: str = getattr(check, "id", "n/a")

            fields: list[str] = [column_name]

            # `groupby` may be a single column name or a collection of them.
            # A bare string must be appended whole: `list += "abc"` would add
            # the characters "a", "b", "c" as separate fields. A callable
            # groupby names no columns, so it contributes nothing here.
            groupby = check.groupby
            if isinstance(groupby, str):
                fields.append(groupby)
            elif groupby and not callable(groupby):
                fields.extend(groupby)

            # Remove duplicates while keeping first-seen order, so the checked
            # column always comes first and the output is stable between
            # runs. Kept as `list` for JSON-friendliness.
            fields = list(dict.fromkeys(fields))

            # This will either be a boolean series or a single bool
            # NOTE(review): a single bool is not handled below (it has no
            # `sort_index`) — confirm whether that case can reach this point.
            check_output = error.check_output
            if check_output is None:
                continue

            # `check_output` must be sorted so its index lines up with `df`'s
            # index. Sort into a new Series rather than in place so the error
            # object's own data is left untouched.
            check_output = check_output.sort_index()

            # Filter records using Pandas's boolean indexing, where all False
            # values get filtered out. The `~` does the inverse since it's
            # actually the False values we want to keep.
            # http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#boolean-indexing
            failed_check_fields_df = df[~check_output][fields].fillna("")

            # Create list of dicts representing the failed validations and the
            # associated field data for each invalid record.
            # NOTE(review): `idx + 1` presumes an integer row index — confirm
            # against how `df` is built.
            records = [
                {
                    "number": idx + 1,
                    "field_values": {field: row[field] for field in fields},
                }
                for idx, row in failed_check_fields_df.iterrows()
            ]

            findings.append(
                {
                    "validation": {
                        "id": check_id,
                        "name": check_name,
                        "description": check.description,
                        "fields": fields,
                        "severity": "warning" if check.warning else "error",
                    },
                    "records": records,
                }
            )

    return findings


def validate_phases_by_lei(df: pd.DataFrame, lei: str) -> list:
    """Run phase 1 validation, then phase 2 only if phase 1 found nothing.

    Args:
        df (pd.DataFrame): data to validate
        lei (str): LEI passed to both phase schema builders

    Returns:
        The findings of the first phase that reports any; otherwise a
        single-item list carrying a "response" message.
    """
    schema_builders = (get_phase_1_schema_for_lei, get_phase_2_schema_for_lei)
    for build_schema in schema_builders:
        # Each schema is built lazily, so the phase 2 schema is only created
        # once phase 1 has come back clean.
        phase_findings = validate(build_schema(lei), df)
        if phase_findings:
            return phase_findings
    return [{"response": "No validations errors or warnings"}]
23 changes: 2 additions & 21 deletions src/validator/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,7 @@
import sys

import pandas as pd
from create_schemas import (
get_phase_1_schema_for_lei,
get_phase_2_schema_for_lei,
print_schema_errors,
)
from pandera.errors import SchemaErrors
from create_schemas import validate_phases_by_lei


def csv_to_df(path: str) -> pd.DataFrame:
Expand All @@ -32,21 +27,7 @@ def run_validation_on_df(df: pd.DataFrame, lei: str) -> None:
print(df)
print("")

phase_1_failure_cases = None

phase_1_sblar_schema = get_phase_1_schema_for_lei(lei)
try:
phase_1_sblar_schema(df, lazy=True)
except SchemaErrors as errors:
phase_1_failure_cases = errors.failure_cases
print_schema_errors(errors, "Phase 1")

if phase_1_failure_cases is None:
phase_2_sblar_schema = get_phase_2_schema_for_lei(lei)
try:
phase_2_sblar_schema(df, lazy=True)
except SchemaErrors as errors:
print_schema_errors(errors, "Phase 2")
print(validate_phases_by_lei(df, lei))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

`validate_phases_by_lei` does not exist in `create_schemas`.
If you run `/usr/local/bin/python /workspaces/regtech-data-validator/src/validator/main.py SBL_Validations_SampleData_GoodFile_03312023.csv`, you'll see this error: `ImportError: cannot import name 'validate_phases_by_lei' from 'create_schemas' (/workspaces/regtech-data-validator/src/validator/create_schemas.py)`



if __name__ == "__main__":
Expand Down
Loading