validity description linter (datacontract#78)

* Linter for field constraint validity. Allow only string constraints on string (and related) fields, allow only numeric constraints on numeric (and related) fields. * Added linter to lint for descriptions. Descriptions should be present on Models, model fields, definitions and examples. * Extended changelog with linting. * Extended field constraints linter. Field constraints are now checked whether they make sense. * Added linter configuration argument to lint(), fixed tests. Linters can now be enabled or disabled with an `enabled_linters` argument that can be set to `all`, `none` or a set of linter IDs to only enable a subset of all linters.
marcggl · Mar 7, 2024 · 787ee43 · 787ee43
1 parent a85164a
commit 787ee43
Show file tree

Hide file tree

Showing 18 changed files with 312 additions and 10 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,7 +8,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
 
 - Fixed a bug where the export to YAML always escaped the unicode characters.
-- added export format **protobuf**: `datacontract export --format protobuf`
+- Added export format **protobuf**: `datacontract export --format protobuf`
+- Added extensive linting on data contracts. `datacontract lint` will now check for a variety of possible errors in the data contract, such as missing descriptions, incorrect references to models or fields, nonsensical constraints, and more.
 
 ## [0.9.6-2] - 2024-03-04
 
@@ -17,7 +18,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [0.9.6] - 2024-03-04
 
-This is a hugh step forward, we now support testing Kafka messages. 
+This is a huge step forward, we now support testing Kafka messages.
 We start with JSON messages and avro, and Protobuf will follow.
 
 ### Added

diff --git a/datacontract/data_contract.py b/datacontract/data_contract.py
@@ -1,6 +1,7 @@
 import json
 import logging
 import tempfile
+import typing
 
 import yaml
 
@@ -22,8 +23,11 @@
 from datacontract.integration.publish_datamesh_manager import \
     publish_datamesh_manager
 from datacontract.lint import resolve
-from datacontract.lint.linters.example_model_linter import ExampleModelLinter
+
 from datacontract.model.breaking_change import BreakingChanges, BreakingChange
+from datacontract.lint.linters.description_linter import DescriptionLinter
+from datacontract.lint.linters.example_model_linter import ExampleModelLinter
+from datacontract.lint.linters.valid_constraints_linter import ValidFieldConstraintsLinter
 from datacontract.lint.linters.field_pattern_linter import FieldPatternLinter
 from datacontract.lint.linters.field_reference_linter import FieldReferenceLinter
 from datacontract.lint.linters.notice_period_linter import NoticePeriodLinter
@@ -58,20 +62,26 @@ def __init__(
         self._publish_url = publish_url
         self._spark = spark
         self._inline_definitions = inline_definitions
-        self.enabled_linters = {
+        self.all_linters = {
             ExampleModelLinter(),
             QualityUsesSchemaLinter(),
             FieldPatternLinter(),
             FieldReferenceLinter(),
             NoticePeriodLinter(),
             PrimaryFieldUniqueRequired(),
+            ValidFieldConstraintsLinter(),
+            DescriptionLinter()
         }
 
     @classmethod
     def init(cls, template: str = "https://datacontract.com/datacontract.init.yaml") -> DataContractSpecification:
         return resolve.resolve_data_contract(data_contract_location=template)
 
-    def lint(self) -> Run:
+    def lint(self, enabled_linters: typing.Union[str, set[str]]="all") -> Run:
+        """Lint the data contract by deserializing the contract and checking the schema, as well as calling the configured linters.
+
+          enabled_linters can be either "all" or "none", or a set of linter IDs.
+          """
         run = Run.create_run()
         try:
             run.log_info("Linting data contract")
@@ -83,7 +93,16 @@ def lint(self) -> Run:
                 name="Data contract is syntactically valid",
                 engine="datacontract"
             ))
-            for linter in self.enabled_linters:
+            if enabled_linters == "none":
+                linters_to_check = set()
+            elif enabled_linters == "all":
+                linters_to_check = self.all_linters
+            elif isinstance(enabled_linters, set):
+                linters_to_check = {linter for linter in self.all_linters
+                    if linter.id in enabled_linters}
+            else:
+                raise RuntimeError(f"Unknown argument enabled_linters={enabled_linters} for lint()")
+            for linter in linters_to_check:
                 try:
                     run.checks.extend(linter.lint(data_contract))
                 except Exception as e:

diff --git a/datacontract/lint/lint.py b/datacontract/lint/lint.py
@@ -106,13 +106,21 @@ class Linter(abc.ABC):
     @property
     @abc.abstractmethod
     def name(self) -> str:
+        """Human-readable name of the linter."""
+        pass
+
+    @property
+    @abc.abstractmethod
+    def id(self) -> str:
+        """A linter ID for configuration (i.e. enabling and disabling)."""
         pass
 
     @abc.abstractmethod
     def lint_implementation(self, contract: DataContractSpecification) -> LinterResult:
         pass
 
     def lint(self, contract: DataContractSpecification) -> list[Check]:
+        """Call with a data contract to get a list of check results from the linter."""
         result = self.lint_implementation(contract)
         checks = []
         if not result.error_results():

diff --git a/datacontract/lint/linters/description_linter.py b/datacontract/lint/linters/description_linter.py
@@ -0,0 +1,40 @@
+from ..lint import Linter, LinterResult
+from datacontract.model.data_contract_specification import\
+    DataContractSpecification, Model
+
+
+class DescriptionLinter(Linter):
+    """Check for a description on models, model fields, definitions and examples."""
+
+    @property
+    def name(self) -> str:
+        return "Objects have descriptions"
+
+    @property
+    def id(self) -> str:
+        return "description"
+
+    def lint_implementation(
+        self,
+        contract: DataContractSpecification
+    ) -> LinterResult:
+        result = LinterResult()
+        for (model_name, model) in contract.models.items():
+            if not model.description:
+                result = result.with_error(
+                    f"Model '{model_name}' has empty description."
+                )
+            for (field_name, field) in model.fields.items():
+                if not field.description:
+                    result = result.with_error(
+                        f"Field '{field_name}' in model '{model_name}'"
+                        f" has empty description.")
+        for (definition_name, definition) in contract.definitions.items():
+            if not definition.description:
+                result = result.with_error(
+                    f"Definition '{definition_name}' has empty description.")
+        for (index, example) in enumerate(contract.examples):
+            if not example.description:
+                result = result.with_error(
+                    f"Example {index + 1} has empty description.")
+        return result
diff --git a/datacontract/lint/linters/example_model_linter.py b/datacontract/lint/linters/example_model_linter.py
@@ -13,6 +13,10 @@ class ExampleModelLinter(Linter):
     def name(self) -> str:
         return "Example(s) match model"
 
+    @property
+    def id(self) -> str:
+        return "example-model"
+
     @staticmethod
     def get_example_headers(example: Example) -> list[str]:
         if isinstance(example.data, str):

diff --git a/datacontract/lint/linters/field_pattern_linter.py b/datacontract/lint/linters/field_pattern_linter.py
@@ -12,6 +12,10 @@ class FieldPatternLinter(Linter):
     def name(self):
         return "Field pattern is correct regex"
 
+    @property
+    def id(self) -> str:
+        return "field-pattern"
+
     def lint_implementation(
         self,
         contract: DataContractSpecification

diff --git a/datacontract/lint/linters/field_reference_linter.py b/datacontract/lint/linters/field_reference_linter.py
@@ -10,6 +10,10 @@ class FieldReferenceLinter(Linter):
     def name(self):
         return "Field references existing field"
 
+    @property
+    def id(self) -> str:
+        return "field-reference"
+
     def lint_implementation(
         self,
         contract: DataContractSpecification

diff --git a/datacontract/lint/linters/notice_period_linter.py b/datacontract/lint/linters/notice_period_linter.py
@@ -8,6 +8,10 @@ class NoticePeriodLinter(Linter):
     def name(self) -> str:
         return "noticePeriod in ISO8601 format"
 
+    @property
+    def id(self) -> str:
+        return "notice-period"
+
     # Regex matching the "simple" ISO8601 duration format
     simple = re.compile(
         r"""P             # Introduces period

diff --git a/datacontract/lint/linters/primary_field_linter.py b/datacontract/lint/linters/primary_field_linter.py
@@ -7,6 +7,10 @@ class PrimaryFieldUniqueRequired(Linter):
     def name(self) -> str:
         return "Model primary fields unique and required"
 
+    @property
+    def id(self) -> str:
+        return "notice-period"
+
     def lint_implementation(
             self,
             contract: DataContractSpecification

diff --git a/datacontract/lint/linters/quality_schema_linter.py b/datacontract/lint/linters/quality_schema_linter.py
@@ -11,6 +11,10 @@ class QualityUsesSchemaLinter(Linter):
     def name(self) -> str:
         return "Quality check(s) use model"
 
+    @property
+    def id(self) -> str:
+        return "quality-schema"
+
     def lint_sodacl(self, check, models: dict[str, Model]) ->\
             LinterResult:
         result = LinterResult()

diff --git a/datacontract/lint/linters/valid_constraints_linter.py b/datacontract/lint/linters/valid_constraints_linter.py
@@ -0,0 +1,94 @@
+from ..lint import Linter, LinterResult
+from datacontract.model.data_contract_specification import\
+    DataContractSpecification, Field
+
+
+class ValidFieldConstraintsLinter(Linter):
+    """Check validity of field constraints.
+
+      More precisely, check that only numeric constraints are specified on
+      fields of numeric type and string constraints on fields of string type.
+      Additionally, the linter checks that defined constraints make sense.
+      Minimum values should not be greater than maximum values, exclusive and
+      non-exclusive minimum and maximum should not be combined and string
+      pattern and format should not be combined.
+
+    """
+
+    valid_types_for_constraint = {
+        "pattern": set(["string", "text", "varchar"]),
+        "format": set(["string", "text", "varchar"]),
+        "minLength": set(["string", "text", "varchar"]),
+        "maxLength": set(["string", "text", "varchar"]),
+        "minimum": set(["int", "integer", "number", "decimal", "numeric",
+                        "long", "bigint", "float", "double"]),
+        "exclusiveMinimum": set(["int", "integer", "number", "decimal", "numeric",
+                                 "long", "bigint", "float", "double"]),
+        "maximum": set(["int", "integer", "number", "decimal", "numeric",
+                        "long", "bigint", "float", "double"]),
+        "exclusiveMaximum": set(["int", "integer", "number", "decimal", "numeric",
+                                 "long", "bigint", "float", "double"]),
+    }
+
+    def check_minimum_maximum(self, field: Field, field_name: str, model_name: str) -> LinterResult:
+        (min, max, xmin, xmax) = (field.minimum, field.maximum, field.exclusiveMinimum, field.exclusiveMaximum)
+        match ("minimum" in field.model_fields_set, "maximum" in field.model_fields_set,
+               "exclusiveMinimum" in field.model_fields_set, "exclusiveMaximum" in field.model_fields_set):
+            case (True, True, _, _) if min > max:
+                return LinterResult.erroneous(
+                    f"Minimum {min} is greater than maximum {max} on "
+                    f"field '{field_name}' in model '{model_name}'.")
+            case (_, _, True, True) if xmin >= xmax:
+                  return LinterResult.erroneous(
+                    f"Exclusive minimum {xmin} is greater than exclusive"
+                    f" maximum {xmax} on field '{field_name}' in model '{model_name}'.")
+            case (True, True, True, True):
+                return LinterResult.erroneous(
+                    f"Both exclusive and non-exclusive minimum and maximum are "
+                    f"defined on field '{field_name}' in model '{model_name}'.")
+            case (True, _, True, _):
+                return LinterResult.erroneous(
+                    f"Both exclusive and non-exclusive minimum are "
+                    f"defined on field '{field_name}' in model '{model_name}'.")
+            case (_, True, _, True):
+                return LinterResult.erroneous(
+                    f"Both exclusive and non-exclusive maximum are "
+                    f"defined on field '{field_name}' in model '{model_name}'.")
+        return LinterResult()
+
+    def check_string_constraints(self, field: Field, field_name: str, model_name: str) -> LinterResult:
+        result = LinterResult()
+        if field.minLength and field.maxLength and field.minLength > field.maxLength:
+            result = result.with_error(
+                f"Minimum length is greater that maximum length on"
+                f" field '{field_name}' in model '{model_name}'.")
+        if field.pattern and field.format:
+            result = result.with_error(
+                f"Both a pattern and a format are defined for field"
+                f" '{field_name}' in model '{model_name}'.")
+        return result
+
+    @property
+    def name(self):
+        return "Fields use valid constraints"
+
+    @property
+    def id(self):
+        return "field-constraints"
+
+    def lint_implementation(
+            self,
+            contract: DataContractSpecification
+    ) -> LinterResult:
+        result = LinterResult()
+        for (model_name, model) in contract.models.items():
+            for (field_name, field) in model.fields.items():
+                for (_property, allowed_types) in self.valid_types_for_constraint.items():
+                    if _property in field.model_fields_set and field.type not in allowed_types:
+                        result = result.with_error(
+                            f"Forbidden constraint '{_property}' defined on field "
+                            f"'{field_name}' in model '{model_name}'. Field type "
+                            f"is '{field.type}'.")
+                result = result.combine(self.check_minimum_maximum(field, field_name, model_name))
+                result = result.combine(self.check_string_constraints(field, field_name, model_name))
+        return result
diff --git a/tests/test_description_linter.py b/tests/test_description_linter.py
diff --git a/tests/test_documentation_linter.py b/tests/test_documentation_linter.py
@@ -0,0 +1,57 @@
+from datacontract.lint.linters.description_linter import DescriptionLinter
+import datacontract.model.data_contract_specification as spec
+from datacontract.model.run import Check
+
+def construct_error_check(msg: str) -> Check:
+    return Check(
+        type="lint",
+        name="Linter 'Objects have descriptions'",
+        result="warning",
+        engine="datacontract",
+        reason=msg,
+    )
+
+
+success_check = Check(
+    type="lint",
+    name="Linter 'Objects have descriptions'",
+    result="passed",
+    engine="datacontract"
+)
+
+linter = DescriptionLinter()
+
+
+def test_correct_contract():
+    specification = spec.DataContractSpecification(
+        models={
+            "test_model": spec.Model(
+                description="Test model description",
+                fields={
+                    "test_field": spec.Field(
+                        description="Test field description")})},
+        examples=[
+            spec.Example(description="Example description")
+            ],
+        definitions={
+            "test_definition": spec.Definition(
+                description="Test description definition")})
+    assert linter.lint(specification) == [success_check]
+
+def test_missing_contract():
+    specification = spec.DataContractSpecification(
+        models={
+            "test_model": spec.Model(
+                fields={
+                    "test_field": spec.Field()})},
+        examples=[
+            spec.Example()
+            ],
+        definitions={
+            "test_definition": spec.Definition()})
+    assert linter.lint(specification) == [
+        construct_error_check("Model 'test_model' has empty description."),
+        construct_error_check("Field 'test_field' in model 'test_model' has empty description."),
+        construct_error_check("Definition 'test_definition' has empty description."),
+        construct_error_check("Example 1 has empty description."),
+    ]