cfpb · nargis-sultani · Sep 14, 2023 · Sep 14, 2023 · Sep 14, 2023 · Sep 15, 2023
diff --git a/src/validator/create_schemas.py b/src/validator/create_schemas.py
@@ -1,6 +1,8 @@
 """Creates two DataFrameSchema objects by rendering the schema template
 with validations listed in phase 1 and phase 2."""
 
+import pandas as pd
+from checks import SBLCheck
 from pandera import DataFrameSchema
 from pandera.errors import SchemaErrors
 from phase_validations import get_phase_1_and_2_validations_for_lei
@@ -20,32 +22,100 @@ def get_schema_by_phase_for_lei(template: dict, phase: str, lei: str = None):
     return DataFrameSchema(template)
 
 
-def print_schema_errors(errors: SchemaErrors, phase: str):
-    for error in errors.schema_errors:
-        # Name of the column in the dataframe being checked
-        schema_error = error["error"]
-        check_id = "n/a"
-
-        # built in checks such as unique=True are different than custom
-        # checks unfortunately so the name needs to be accessed differently
-        try:
-            check_name = schema_error.check.name
-            check_id = schema_error.check.id
-            # This will either be a boolean series or a single bool
-            check_output = schema_error.check_output
-        except AttributeError:
-            check_name = schema_error.check
-            # this is just a string that we'd need to parse manually
-            check_output = schema_error.args[0]
-
-        print(f"{phase} Validation `{check_name}` with id: `{check_id}` failed for column `{{column_name}}`")
-        print(check_output)
-        print("")
-
-
 def get_phase_1_schema_for_lei(lei: str = None):
     return get_schema_by_phase_for_lei(phase_1_template, "phase_1", lei)
 
 
 def get_phase_2_schema_for_lei(lei: str = None):
     return get_schema_by_phase_for_lei(phase_2_template, "phase_2", lei)
+
+
+def validate(schema: DataFrameSchema, df: pd.DataFrame):
+    """
+    Validate a dataframe against a schema and collect the failures.
+
+    Each failed check becomes one finding holding the check's metadata and
+    the offending records (index label + 1, plus the relevant field values).
+
+    Args:
+        schema (DataFrameSchema): schema to be used for validation
+        df (pd.DataFrame): data parsed into dataframe
+
+    Returns:
+        list of finding dicts, one per failed check; empty if nothing failed
+
+    Raises:
+        AttributeError: if a failed check has no `name` attribute (e.g. the
+            built-in check used when a column has `unique=True`)
+    """
+    findings = []
+    try:
+        schema(df, lazy=True)
+    except SchemaErrors as errors:
+        for schema_error in errors.schema_errors:
+            error = schema_error["error"]
+            check: SBLCheck = error.check
+            if not hasattr(check, "name"):
+                # This means this check's column has unique set to True.
+                # We shouldn't use the unique flag: it doesn't return a series
+                # of validation results, only a printout string.
+                raise AttributeError(str(check))
+
+            # Expected to be a boolean Series (NOTE(review): scalar unsupported)
+            check_output = error.check_output
+            if check_output is None:
+                continue
+
+            # The checked column comes first, followed by any groupby columns.
+            fields = [error.schema.name]
+            groupby = check.groupby
+            if isinstance(groupby, str):
+                # A bare string must not be spread into single characters.
+                fields.append(groupby)
+            elif isinstance(groupby, (list, tuple)):
+                fields.extend(groupby)
+
+            # Remove duplicates but keep first-seen order so that the output is
+            # deterministic; keep as `list` for JSON-friendliness.
+            fields = list(dict.fromkeys(fields))
+
+            # `check_output` must be sorted so its index lines up with `df`'s
+            # index. Sort a copy so the pandera error object is left untouched.
+            check_output = check_output.sort_index()
+
+            # Boolean indexing keeps rows whose mask is True; `~` inverts the
+            # mask because the failing (False) rows are the ones to report.
+            # http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#boolean-indexing
+            failed_check_fields_df = df[~check_output][fields].fillna("")
+
+            # One dict per invalid record with its associated field data.
+            records = [
+                {"number": idx + 1, "field_values": {f: row[f] for f in fields}}
+                for idx, row in failed_check_fields_df.iterrows()
+            ]
+            findings.append(
+                {
+                    "validation": {
+                        "id": getattr(check, "id", "n/a"),
+                        "name": check.name,
+                        "description": check.description,
+                        "fields": fields,
+                        "severity": "warning" if check.warning else "error",
+                    },
+                    "records": records,
+                }
+            )
+
+    return findings
+
+
+def validate_phases_by_lei(df: pd.DataFrame, lei: str) -> list:
+    """Run phase 1 validation; run phase 2 only when phase 1 is clean.
+
+    Returns the findings of the first phase that reports any, otherwise a
+    single-item list carrying a "no errors or warnings" response.
+    """
+    for get_schema in (get_phase_1_schema_for_lei, get_phase_2_schema_for_lei):
+        if phase_findings := validate(get_schema(lei), df):
+            return phase_findings
+    return [{"response": "No validations errors or warnings"}]
diff --git a/src/validator/main.py b/src/validator/main.py
@@ -8,12 +8,7 @@
 import sys
 
 import pandas as pd
-from create_schemas import (
-    get_phase_1_schema_for_lei,
-    get_phase_2_schema_for_lei,
-    print_schema_errors,
-)
-from pandera.errors import SchemaErrors
+from create_schemas import validate_phases_by_lei
 
 
 def csv_to_df(path: str) -> pd.DataFrame:
@@ -32,21 +27,7 @@ def run_validation_on_df(df: pd.DataFrame, lei: str) -> None:
     print(df)
     print("")
 
-    phase_1_failure_cases = None
-
-    phase_1_sblar_schema = get_phase_1_schema_for_lei(lei)
-    try:
-        phase_1_sblar_schema(df, lazy=True)
-    except SchemaErrors as errors:
-        phase_1_failure_cases = errors.failure_cases
-        print_schema_errors(errors, "Phase 1")
-
-    if phase_1_failure_cases is None:
-        phase_2_sblar_schema = get_phase_2_schema_for_lei(lei)
-        try:
-            phase_2_sblar_schema(df, lazy=True)
-        except SchemaErrors as errors:
-            print_schema_errors(errors, "Phase 2")
+    print(validate_phases_by_lei(df, lei))
 
 
 if __name__ == "__main__":