From 63471aa3cb6ea05ddebfce10cf69f7b06ed9b5cd Mon Sep 17 00:00:00 2001
From: Nargis Sultani <nargis.sultani@cfpb.gov>
Date: Thu, 14 Sep 2023 14:57:57 -0400
Subject: [PATCH 1/4] Task 42: structure validator CLI output

---
 src/validator/create_schemas.py | 78 ++++++++++++++++++++++++++-------
 1 file changed, 61 insertions(+), 17 deletions(-)

diff --git a/src/validator/create_schemas.py b/src/validator/create_schemas.py
index e582747b..725608f2 100644
--- a/src/validator/create_schemas.py
+++ b/src/validator/create_schemas.py
@@ -1,6 +1,8 @@
 """Creates two DataFrameSchema objects by rendering the schema template
 with validations listed in phase 1 and phase 2."""
 
+import pandas as pd
+from checks import SBLCheck
 from pandera import DataFrameSchema
 from pandera.errors import SchemaErrors
 from phase_validations import get_phase_1_and_2_validations_for_lei
@@ -21,26 +23,68 @@ def get_schema_by_phase_for_lei(template: dict, phase: str, lei: str = None):
 
 
 def print_schema_errors(errors: SchemaErrors, phase: str):
-    for error in errors.schema_errors:
-        # Name of the column in the dataframe being checked
-        schema_error = error["error"]
+    findings = []
+    print("Validation failed for phase: " + phase + ":")
+    for schema_error in errors.schema_errors:
+        error = schema_error["error"]
+        check: SBLCheck = error.check
+        column_name = error.schema.name
         check_id = "n/a"
 
-        # built in checks such as unique=True are different than custom
-        # checks unfortunately so the name needs to be accessed differently
-        try:
-            check_name = schema_error.check.name
-            check_id = schema_error.check.id
+        fields: list[str] = [column_name]
+
+        if hasattr(check, "name"):
+            check_name: str = check.name
+
+            if check.groupby:
+                fields += check.groupby  # type: ignore
+
             # This will either be a boolean series or a single bool
-            check_output = schema_error.check_output
-        except AttributeError:
-            check_name = schema_error.check
-            # this is just a string that we'd need to parse manually
-            check_output = schema_error.args[0]
-
-        print(f"{phase} Validation `{check_name}` with id: `{check_id}` failed for column `{{column_name}}`")
-        print(check_output)
-        print("")
+            check_output = error.check_output
+        else:
+            # This means this check's column has unique set to True.
+            # we shouldn't be using Unique flag as it doesn't return series of
+            # validation result .  it returns just a printout result string/txt
+            raise AttributeError(f"{str(check)}")
+        if hasattr(check, "id"):
+            check_id = schema_error.check.id
+
+        # Remove duplicates, but keep as `list` for JSON-friendliness
+        fields = list(set(fields))
+
+        if check_output is not None:
+            # `check_output` must be sorted so its index lines up with `df`'s index
+            check_output.sort_index(inplace=True)
+
+            # Filter records using Pandas's boolean indexing, where all False values
+            # get filtered out. The `~` does the inverse since it's actually the
+            # False values we want to keep.
+            # http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#boolean-indexing
+            failed_check_fields_df = df[~check_output][fields].fillna("")
+
+            # Create list of dicts representing the failed validations and the
+            # associated field data for each invalid record.
+            records = []
+            for idx, row in failed_check_fields_df.iterrows():
+                record = {"number": idx + 1, "field_values": {}}
+                for field in fields:
+                    record["field_values"][field] = row[field]
+                records.append(record)
+
+            validation_findings = {
+                "validation": {
+                    "id": check_id,
+                    "name": check_name,
+                    "description": check.description,
+                    "fields": fields,
+                    "severity": "warning" if check.warning else "error",
+                },
+                "records": records,
+            }
+
+            findings.append(validation_findings)
+
+        return findings
 
 
 def get_phase_1_schema_for_lei(lei: str = None):

From 761470ba9acbeda8fbb68793e255d21866a9e29a Mon Sep 17 00:00:00 2001
From: Nargis Sultani <nargis.sultani@cfpb.gov>
Date: Thu, 14 Sep 2023 15:31:45 -0400
Subject: [PATCH 2/4] Replace print_schema_errors with validate() and validate_phases()

---
 src/validator/create_schemas.py | 156 +++++++++++++++++++-------------
 src/validator/main.py           |  23 +----
 2 files changed, 93 insertions(+), 86 deletions(-)

diff --git a/src/validator/create_schemas.py b/src/validator/create_schemas.py
index 725608f2..65884f43 100644
--- a/src/validator/create_schemas.py
+++ b/src/validator/create_schemas.py
@@ -22,74 +22,100 @@ def get_schema_by_phase_for_lei(template: dict, phase: str, lei: str = None):
     return DataFrameSchema(template)
 
 
-def print_schema_errors(errors: SchemaErrors, phase: str):
-    findings = []
-    print("Validation failed for phase: " + phase + ":")
-    for schema_error in errors.schema_errors:
-        error = schema_error["error"]
-        check: SBLCheck = error.check
-        column_name = error.schema.name
-        check_id = "n/a"
-
-        fields: list[str] = [column_name]
-
-        if hasattr(check, "name"):
-            check_name: str = check.name
-
-            if check.groupby:
-                fields += check.groupby  # type: ignore
-
-            # This will either be a boolean series or a single bool
-            check_output = error.check_output
-        else:
-            # This means this check's column has unique set to True.
-            # we shouldn't be using Unique flag as it doesn't return series of
-            # validation result .  it returns just a printout result string/txt
-            raise AttributeError(f"{str(check)}")
-        if hasattr(check, "id"):
-            check_id = schema_error.check.id
-
-        # Remove duplicates, but keep as `list` for JSON-friendliness
-        fields = list(set(fields))
-
-        if check_output is not None:
-            # `check_output` must be sorted so its index lines up with `df`'s index
-            check_output.sort_index(inplace=True)
-
-            # Filter records using Pandas's boolean indexing, where all False values
-            # get filtered out. The `~` does the inverse since it's actually the
-            # False values we want to keep.
-            # http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#boolean-indexing
-            failed_check_fields_df = df[~check_output][fields].fillna("")
-
-            # Create list of dicts representing the failed validations and the
-            # associated field data for each invalid record.
-            records = []
-            for idx, row in failed_check_fields_df.iterrows():
-                record = {"number": idx + 1, "field_values": {}}
-                for field in fields:
-                    record["field_values"][field] = row[field]
-                records.append(record)
-
-            validation_findings = {
-                "validation": {
-                    "id": check_id,
-                    "name": check_name,
-                    "description": check.description,
-                    "fields": fields,
-                    "severity": "warning" if check.warning else "error",
-                },
-                "records": records,
-            }
-
-            findings.append(validation_findings)
-
-        return findings
-
-
 def get_phase_1_schema_for_lei(lei: str = None):
     return get_schema_by_phase_for_lei(phase_1_template, "phase_1", lei)
 
 
 def get_phase_2_schema_for_lei(lei: str = None):
     return get_schema_by_phase_for_lei(phase_2_template, "phase_2", lei)
+
+
+def validate(schema: DataFrameSchema, df: pd.DataFrame):
+    """
+    Validate the received dataframe against the schema and return a list of
+    schema errors.
+
+    Args:
+        schema (DataFrameSchema): schema to be used for validation
+        df (pd.DataFrame): data parsed into dataframe
+
+    Returns:
+        list of schema errors
+    """
+    findings = []
+    try:
+        schema(df, lazy=True)
+    except SchemaErrors as errors:
+        for schema_error in errors.schema_errors:
+            error = schema_error["error"]
+            check: SBLCheck = error.check
+            column_name = error.schema.name
+            check_id = "n/a"
+
+            fields: list[str] = [column_name]
+
+            if hasattr(check, "name"):
+                check_name: str = check.name
+
+                if check.groupby:
+                    fields += check.groupby  # type: ignore
+
+                # This will either be a boolean series or a single bool
+                check_output = error.check_output
+            else:
+                # This means this check's column has unique set to True.
+                # We shouldn't use the unique flag, as it doesn't return a series of
+                # validation results; it returns only a printed result string.
+                raise AttributeError(f"{str(check)}")
+
+            if hasattr(check, "id"):
+                check_id: str = check.id
+
+            # Remove duplicates, but keep as `list` for JSON-friendliness
+            fields = list(set(fields))
+
+            if check_output is not None:
+                # `check_output` must be sorted so its index lines up with `df`'s index
+                check_output.sort_index(inplace=True)
+
+                # Filter records using Pandas's boolean indexing, where all False values
+                # get filtered out. The `~` does the inverse since it's actually the
+                # False values we want to keep.
+                # http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#boolean-indexing
+                failed_check_fields_df = df[~check_output][fields].fillna("")
+
+                # Create list of dicts representing the failed validations and the
+                # associated field data for each invalid record.
+                records = []
+                for idx, row in failed_check_fields_df.iterrows():
+                    record = {"number": idx + 1, "field_values": {}}
+                    for field in fields:
+                        record["field_values"][field] = row[field]
+                    records.append(record)
+
+                validation_findings = {
+                    "validation": {
+                        "id": check_id,
+                        "name": check_name,
+                        "description": check.description,
+                        "fields": fields,
+                        "severity": "warning" if check.warning else "error",
+                    },
+                    "records": records,
+                }
+
+                findings.append(validation_findings)
+
+    return findings
+
+
+def validate_phases(phase1: DataFrameSchema, phase2: DataFrameSchema, df: pd.DataFrame) -> list:
+    phase1_findings = validate(phase1, df)
+    if phase1_findings:
+        return phase1_findings
+    else:
+        phase2_findings = validate(phase2, df)
+        if phase2_findings:
+            return phase2_findings
+        else:
+            return [{"response": "No validations errors or warnings"}]
diff --git a/src/validator/main.py b/src/validator/main.py
index 433275be..07d9add1 100644
--- a/src/validator/main.py
+++ b/src/validator/main.py
@@ -8,12 +8,7 @@
 import sys
 
 import pandas as pd
-from create_schemas import (
-    get_phase_1_schema_for_lei,
-    get_phase_2_schema_for_lei,
-    print_schema_errors,
-)
-from pandera.errors import SchemaErrors
+from create_schemas import get_phase_1_schema_for_lei, get_phase_2_schema_for_lei, validate_phases
 
 
 def csv_to_df(path: str) -> pd.DataFrame:
@@ -32,21 +27,7 @@ def run_validation_on_df(df: pd.DataFrame, lei: str) -> None:
     print(df)
     print("")
 
-    phase_1_failure_cases = None
-
-    phase_1_sblar_schema = get_phase_1_schema_for_lei(lei)
-    try:
-        phase_1_sblar_schema(df, lazy=True)
-    except SchemaErrors as errors:
-        phase_1_failure_cases = errors.failure_cases
-        print_schema_errors(errors, "Phase 1")
-
-    if phase_1_failure_cases is None:
-        phase_2_sblar_schema = get_phase_2_schema_for_lei(lei)
-        try:
-            phase_2_sblar_schema(df, lazy=True)
-        except SchemaErrors as errors:
-            print_schema_errors(errors, "Phase 2")
+    print(validate_phases(get_phase_1_schema_for_lei(lei), get_phase_2_schema_for_lei(lei), df))
 
 
 if __name__ == "__main__":

From 573a7465f13d61af4f0c3fd9b80fe80bac4a92b9 Mon Sep 17 00:00:00 2001
From: Nargis Sultani <nargis.sultani@cfpb.gov>
Date: Thu, 14 Sep 2023 15:42:23 -0400
Subject: [PATCH 3/4] Call validate_phases_by_lei from main.py

---
 src/validator/main.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/validator/main.py b/src/validator/main.py
index 07d9add1..680173bd 100644
--- a/src/validator/main.py
+++ b/src/validator/main.py
@@ -8,7 +8,7 @@
 import sys
 
 import pandas as pd
-from create_schemas import get_phase_1_schema_for_lei, get_phase_2_schema_for_lei, validate_phases
+from create_schemas import validate_phases_by_lei
 
 
 def csv_to_df(path: str) -> pd.DataFrame:
@@ -27,7 +27,7 @@ def run_validation_on_df(df: pd.DataFrame, lei: str) -> None:
     print(df)
     print("")
 
-    print(validate_phases(get_phase_1_schema_for_lei(lei), get_phase_2_schema_for_lei(lei), df))
+    print(validate_phases_by_lei(df, lei))
 
 
 if __name__ == "__main__":

From e0bb624eda760b55c062eecc4b806522e1f9a8a5 Mon Sep 17 00:00:00 2001
From: Nargis Sultani <nargis.sultani@cfpb.gov>
Date: Fri, 15 Sep 2023 10:50:51 -0400
Subject: [PATCH 4/4] Rename validate_phases to validate_phases_by_lei and build schemas from lei

---
 src/validator/create_schemas.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/validator/create_schemas.py b/src/validator/create_schemas.py
index 65884f43..8e21391b 100644
--- a/src/validator/create_schemas.py
+++ b/src/validator/create_schemas.py
@@ -109,12 +109,12 @@ def validate(schema: DataFrameSchema, df: pd.DataFrame):
     return findings
 
 
-def validate_phases(phase1: DataFrameSchema, phase2: DataFrameSchema, df: pd.DataFrame) -> list:
-    phase1_findings = validate(phase1, df)
+def validate_phases_by_lei(df: pd.DataFrame, lei: str) -> list:
+    phase1_findings = validate(get_phase_1_schema_for_lei(lei), df)
     if phase1_findings:
         return phase1_findings
     else:
-        phase2_findings = validate(phase2, df)
+        phase2_findings = validate(get_phase_2_schema_for_lei((lei)), df)
         if phase2_findings:
             return phase2_findings
         else: