Skip to content

Commit

Permalink
validity description linter (datacontract#78)
Browse files Browse the repository at this point in the history
* Linter for field constraint validity.

Allow only string constraints on string (and related) fields, allow only numeric constraints on
numeric (and related) fields.

* Added linter to lint for descriptions.

Descriptions should be present on Models, model fields, definitions and examples.

* Extended changelog with linting.

* Extended field constraints linter.

Field constraints are now checked whether they make sense.

* Added linter configuration argument to lint(), fixed tests.

Linters can now be enabled or disabled with an `enabled_linters` argument that can be set to `all`,
`none` or a set of linter IDs to only enable a subset of all linters.
  • Loading branch information
jeeger authored Mar 7, 2024
1 parent a85164a commit 787ee43
Show file tree
Hide file tree
Showing 18 changed files with 312 additions and 10 deletions.
5 changes: 3 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]

- Fixed a bug where the export to YAML always escaped the unicode characters.
- added export format **protobuf**: `datacontract export --format protobuf`
- Added export format **protobuf**: `datacontract export --format protobuf`
- Added extensive linting on data contracts. `datacontract lint` will now check for a variety of possible errors in the data contract, such as missing descriptions, incorrect references to models or fields, nonsensical constraints, and more.

## [0.9.6-2] - 2024-03-04

Expand All @@ -17,7 +18,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [0.9.6] - 2024-03-04

This is a hugh step forward, we now support testing Kafka messages.
This is a huge step forward, we now support testing Kafka messages.
We start with JSON messages and avro, and Protobuf will follow.

### Added
Expand Down
27 changes: 23 additions & 4 deletions datacontract/data_contract.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import json
import logging
import tempfile
import typing

import yaml

Expand All @@ -22,8 +23,11 @@
from datacontract.integration.publish_datamesh_manager import \
publish_datamesh_manager
from datacontract.lint import resolve
from datacontract.lint.linters.example_model_linter import ExampleModelLinter

from datacontract.model.breaking_change import BreakingChanges, BreakingChange
from datacontract.lint.linters.description_linter import DescriptionLinter
from datacontract.lint.linters.example_model_linter import ExampleModelLinter
from datacontract.lint.linters.valid_constraints_linter import ValidFieldConstraintsLinter
from datacontract.lint.linters.field_pattern_linter import FieldPatternLinter
from datacontract.lint.linters.field_reference_linter import FieldReferenceLinter
from datacontract.lint.linters.notice_period_linter import NoticePeriodLinter
Expand Down Expand Up @@ -58,20 +62,26 @@ def __init__(
self._publish_url = publish_url
self._spark = spark
self._inline_definitions = inline_definitions
self.enabled_linters = {
self.all_linters = {
ExampleModelLinter(),
QualityUsesSchemaLinter(),
FieldPatternLinter(),
FieldReferenceLinter(),
NoticePeriodLinter(),
PrimaryFieldUniqueRequired(),
ValidFieldConstraintsLinter(),
DescriptionLinter()
}

@classmethod
def init(cls, template: str = "https://datacontract.com/datacontract.init.yaml") -> DataContractSpecification:
return resolve.resolve_data_contract(data_contract_location=template)

def lint(self) -> Run:
def lint(self, enabled_linters: typing.Union[str, set[str]]="all") -> Run:
"""Lint the data contract by deserializing the contract and checking the schema, as well as calling the configured linters.
enabled_linters can be either "all" or "none", or a set of linter IDs.
"""
run = Run.create_run()
try:
run.log_info("Linting data contract")
Expand All @@ -83,7 +93,16 @@ def lint(self) -> Run:
name="Data contract is syntactically valid",
engine="datacontract"
))
for linter in self.enabled_linters:
if enabled_linters == "none":
linters_to_check = set()
elif enabled_linters == "all":
linters_to_check = self.all_linters
elif isinstance(enabled_linters, set):
linters_to_check = {linter for linter in self.all_linters
if linter.id in enabled_linters}
else:
raise RuntimeError(f"Unknown argument enabled_linters={enabled_linters} for lint()")
for linter in linters_to_check:
try:
run.checks.extend(linter.lint(data_contract))
except Exception as e:
Expand Down
8 changes: 8 additions & 0 deletions datacontract/lint/lint.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,13 +106,21 @@ class Linter(abc.ABC):
@property
@abc.abstractmethod
def name(self) -> str:
"""Human-readable name of the linter."""
pass

@property
@abc.abstractmethod
def id(self) -> str:
"""A linter ID for configuration (i.e. enabling and disabling)."""
pass

@abc.abstractmethod
def lint_implementation(self, contract: DataContractSpecification) -> LinterResult:
pass

def lint(self, contract: DataContractSpecification) -> list[Check]:
"""Call with a data contract to get a list of check results from the linter."""
result = self.lint_implementation(contract)
checks = []
if not result.error_results():
Expand Down
40 changes: 40 additions & 0 deletions datacontract/lint/linters/description_linter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
from ..lint import Linter, LinterResult
from datacontract.model.data_contract_specification import\
DataContractSpecification, Model


class DescriptionLinter(Linter):
"""Check for a description on models, model fields, definitions and examples."""

@property
def name(self) -> str:
return "Objects have descriptions"

@property
def id(self) -> str:
return "description"

def lint_implementation(
self,
contract: DataContractSpecification
) -> LinterResult:
result = LinterResult()
for (model_name, model) in contract.models.items():
if not model.description:
result = result.with_error(
f"Model '{model_name}' has empty description."
)
for (field_name, field) in model.fields.items():
if not field.description:
result = result.with_error(
f"Field '{field_name}' in model '{model_name}'"
f" has empty description.")
for (definition_name, definition) in contract.definitions.items():
if not definition.description:
result = result.with_error(
f"Definition '{definition_name}' has empty description.")
for (index, example) in enumerate(contract.examples):
if not example.description:
result = result.with_error(
f"Example {index + 1} has empty description.")
return result
4 changes: 4 additions & 0 deletions datacontract/lint/linters/example_model_linter.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@ class ExampleModelLinter(Linter):
def name(self) -> str:
return "Example(s) match model"

@property
def id(self) -> str:
return "example-model"

@staticmethod
def get_example_headers(example: Example) -> list[str]:
if isinstance(example.data, str):
Expand Down
4 changes: 4 additions & 0 deletions datacontract/lint/linters/field_pattern_linter.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@ class FieldPatternLinter(Linter):
def name(self):
return "Field pattern is correct regex"

@property
def id(self) -> str:
return "field-pattern"

def lint_implementation(
self,
contract: DataContractSpecification
Expand Down
4 changes: 4 additions & 0 deletions datacontract/lint/linters/field_reference_linter.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ class FieldReferenceLinter(Linter):
def name(self):
return "Field references existing field"

@property
def id(self) -> str:
return "field-reference"

def lint_implementation(
self,
contract: DataContractSpecification
Expand Down
4 changes: 4 additions & 0 deletions datacontract/lint/linters/notice_period_linter.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@ class NoticePeriodLinter(Linter):
def name(self) -> str:
return "noticePeriod in ISO8601 format"

@property
def id(self) -> str:
return "notice-period"

# Regex matching the "simple" ISO8601 duration format
simple = re.compile(
r"""P # Introduces period
Expand Down
4 changes: 4 additions & 0 deletions datacontract/lint/linters/primary_field_linter.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@ class PrimaryFieldUniqueRequired(Linter):
def name(self) -> str:
return "Model primary fields unique and required"

@property
def id(self) -> str:
return "notice-period"

def lint_implementation(
self,
contract: DataContractSpecification
Expand Down
4 changes: 4 additions & 0 deletions datacontract/lint/linters/quality_schema_linter.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ class QualityUsesSchemaLinter(Linter):
def name(self) -> str:
return "Quality check(s) use model"

@property
def id(self) -> str:
return "quality-schema"

def lint_sodacl(self, check, models: dict[str, Model]) ->\
LinterResult:
result = LinterResult()
Expand Down
94 changes: 94 additions & 0 deletions datacontract/lint/linters/valid_constraints_linter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
from ..lint import Linter, LinterResult
from datacontract.model.data_contract_specification import\
DataContractSpecification, Field


class ValidFieldConstraintsLinter(Linter):
"""Check validity of field constraints.
More precisely, check that only numeric constraints are specified on
fields of numeric type and string constraints on fields of string type.
Additionally, the linter checks that defined constraints make sense.
Minimum values should not be greater than maximum values, exclusive and
non-exclusive minimum and maximum should not be combined and string
pattern and format should not be combined.
"""

valid_types_for_constraint = {
"pattern": set(["string", "text", "varchar"]),
"format": set(["string", "text", "varchar"]),
"minLength": set(["string", "text", "varchar"]),
"maxLength": set(["string", "text", "varchar"]),
"minimum": set(["int", "integer", "number", "decimal", "numeric",
"long", "bigint", "float", "double"]),
"exclusiveMinimum": set(["int", "integer", "number", "decimal", "numeric",
"long", "bigint", "float", "double"]),
"maximum": set(["int", "integer", "number", "decimal", "numeric",
"long", "bigint", "float", "double"]),
"exclusiveMaximum": set(["int", "integer", "number", "decimal", "numeric",
"long", "bigint", "float", "double"]),
}

def check_minimum_maximum(self, field: Field, field_name: str, model_name: str) -> LinterResult:
(min, max, xmin, xmax) = (field.minimum, field.maximum, field.exclusiveMinimum, field.exclusiveMaximum)
match ("minimum" in field.model_fields_set, "maximum" in field.model_fields_set,
"exclusiveMinimum" in field.model_fields_set, "exclusiveMaximum" in field.model_fields_set):
case (True, True, _, _) if min > max:
return LinterResult.erroneous(
f"Minimum {min} is greater than maximum {max} on "
f"field '{field_name}' in model '{model_name}'.")
case (_, _, True, True) if xmin >= xmax:
return LinterResult.erroneous(
f"Exclusive minimum {xmin} is greater than exclusive"
f" maximum {xmax} on field '{field_name}' in model '{model_name}'.")
case (True, True, True, True):
return LinterResult.erroneous(
f"Both exclusive and non-exclusive minimum and maximum are "
f"defined on field '{field_name}' in model '{model_name}'.")
case (True, _, True, _):
return LinterResult.erroneous(
f"Both exclusive and non-exclusive minimum are "
f"defined on field '{field_name}' in model '{model_name}'.")
case (_, True, _, True):
return LinterResult.erroneous(
f"Both exclusive and non-exclusive maximum are "
f"defined on field '{field_name}' in model '{model_name}'.")
return LinterResult()

def check_string_constraints(self, field: Field, field_name: str, model_name: str) -> LinterResult:
result = LinterResult()
if field.minLength and field.maxLength and field.minLength > field.maxLength:
result = result.with_error(
f"Minimum length is greater that maximum length on"
f" field '{field_name}' in model '{model_name}'.")
if field.pattern and field.format:
result = result.with_error(
f"Both a pattern and a format are defined for field"
f" '{field_name}' in model '{model_name}'.")
return result

@property
def name(self):
return "Fields use valid constraints"

@property
def id(self):
return "field-constraints"

def lint_implementation(
self,
contract: DataContractSpecification
) -> LinterResult:
result = LinterResult()
for (model_name, model) in contract.models.items():
for (field_name, field) in model.fields.items():
for (_property, allowed_types) in self.valid_types_for_constraint.items():
if _property in field.model_fields_set and field.type not in allowed_types:
result = result.with_error(
f"Forbidden constraint '{_property}' defined on field "
f"'{field_name}' in model '{model_name}'. Field type "
f"is '{field.type}'.")
result = result.combine(self.check_minimum_maximum(field, field_name, model_name))
result = result.combine(self.check_string_constraints(field, field_name, model_name))
return result
Empty file.
57 changes: 57 additions & 0 deletions tests/test_documentation_linter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
from datacontract.lint.linters.description_linter import DescriptionLinter
import datacontract.model.data_contract_specification as spec
from datacontract.model.run import Check

def construct_error_check(msg: str) -> Check:
return Check(
type="lint",
name="Linter 'Objects have descriptions'",
result="warning",
engine="datacontract",
reason=msg,
)


success_check = Check(
type="lint",
name="Linter 'Objects have descriptions'",
result="passed",
engine="datacontract"
)

linter = DescriptionLinter()


def test_correct_contract():
specification = spec.DataContractSpecification(
models={
"test_model": spec.Model(
description="Test model description",
fields={
"test_field": spec.Field(
description="Test field description")})},
examples=[
spec.Example(description="Example description")
],
definitions={
"test_definition": spec.Definition(
description="Test description definition")})
assert linter.lint(specification) == [success_check]

def test_missing_contract():
specification = spec.DataContractSpecification(
models={
"test_model": spec.Model(
fields={
"test_field": spec.Field()})},
examples=[
spec.Example()
],
definitions={
"test_definition": spec.Definition()})
assert linter.lint(specification) == [
construct_error_check("Model 'test_model' has empty description."),
construct_error_check("Field 'test_field' in model 'test_model' has empty description."),
construct_error_check("Definition 'test_definition' has empty description."),
construct_error_check("Example 1 has empty description."),
]
Loading

0 comments on commit 787ee43

Please sign in to comment.