Skip to content

Commit

Permalink
Add format checks to soda
Browse files Browse the repository at this point in the history
  • Loading branch information
jochenchrist committed Apr 17, 2024
1 parent bf69a7e commit cee03e5
Show file tree
Hide file tree
Showing 7 changed files with 214 additions and 6 deletions.
1 change: 1 addition & 0 deletions datacontract/engines/soda/check_soda_execute.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ def check_soda_execute(run: Run, data_contract: DataContractSpecification, serve
# Don't check types for csv format, as they are hard to detect
server_type = server.type
check_types = server.format != "json" and server.format != "csv" and server.format != "avro"

sodacl_yaml_str = to_sodacl_yaml(data_contract, server_type, check_types)
# print("sodacl_yaml_str:\n" + sodacl_yaml_str)
scan.add_sodacl_yaml_str(sodacl_yaml_str)
Expand Down
21 changes: 17 additions & 4 deletions datacontract/engines/soda/connections/duckdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import os

import duckdb
from datacontract.export.csv_type_converter import convert_to_duckdb_csv_type


def get_duckdb_connection(data_contract, server):
Expand All @@ -12,7 +13,7 @@ def get_duckdb_connection(data_contract, server):
if server.type == "s3":
path = server.location
setup_s3_connection(con, server)
for model_name in data_contract.models:
for model_name, model in data_contract.models.items():
model_path = path
if "{model}" in model_path:
model_path = model_path.format(model=model_name)
Expand All @@ -32,12 +33,24 @@ def get_duckdb_connection(data_contract, server):
CREATE VIEW "{model_name}" AS SELECT * FROM read_parquet('{model_path}', hive_partitioning=1);
""")
elif server.format == "csv":
con.sql(f"""
CREATE VIEW "{model_name}" AS SELECT * FROM read_csv_auto('{model_path}', hive_partitioning=1);
""")
columns = to_csv_types(model)
if columns is None:
con.sql(f"""CREATE VIEW "{model_name}" AS SELECT * FROM read_csv('{model_path}', hive_partitioning=1);""")
else:
con.sql(f"""CREATE VIEW "{model_name}" AS SELECT * FROM read_csv('{model_path}', hive_partitioning=1, columns={columns});""")
return con


def to_csv_types(model) -> dict:
    """Map each field of a model to the DuckDB type used when reading its CSV.

    Returns ``None`` when no model is given. Otherwise returns a dict with one
    entry per item of ``model.fields``, keyed by field name, whose value is
    whatever ``convert_to_duckdb_csv_type`` yields for that field.
    """
    if model is None:
        return None
    # ['SQLNULL', 'BOOLEAN', 'BIGINT', 'DOUBLE', 'TIME', 'DATE', 'TIMESTAMP', 'VARCHAR']
    return {
        name: convert_to_duckdb_csv_type(spec)
        for name, spec in model.fields.items()
    }


def setup_s3_connection(con, server):
s3_region = os.getenv("DATACONTRACT_S3_REGION")
s3_access_key_id = os.getenv("DATACONTRACT_S3_ACCESS_KEY_ID")
Expand Down
38 changes: 38 additions & 0 deletions datacontract/export/csv_type_converter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@

# https://duckdb.org/docs/data/csv/overview.html
# ['SQLNULL', 'BOOLEAN', 'BIGINT', 'DOUBLE', 'TIME', 'DATE', 'TIMESTAMP', 'VARCHAR']
def convert_to_duckdb_csv_type(field) -> None | str:
type = field.type
if type is None:
return "VARCHAR"
if type.lower() in ["string", "varchar", "text"]:
return "VARCHAR"
if type.lower() in ["timestamp", "timestamp_tz"]:
return "TIMESTAMP"
if type.lower() in ["timestamp_ntz"]:
return "TIMESTAMP"
if type.lower() in ["date"]:
return "DATE"
if type.lower() in ["time"]:
return "TIME"
if type.lower() in ["number", "decimal", "numeric"]:
# precision and scale not supported by data contract
return "VARCHAR"
if type.lower() in ["float", "double"]:
return "DOUBLE"
if type.lower() in ["integer", "int", "long", "bigint"]:
return "BIGINT"
if type.lower() in ["boolean"]:
return "BOOLEAN"
if type.lower() in ["object", "record", "struct"]:
# not supported in CSV
return "VARCHAR"
if type.lower() in ["bytes"]:
# not supported in CSV
return "VARCHAR"
if type.lower() in ["array"]:
return "VARCHAR"
if type.lower() in ["null"]:
return "SQLNULL"
return "VARCHAR"

99 changes: 98 additions & 1 deletion datacontract/export/sodacl_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
DataContractSpecification


def to_sodacl_yaml(data_contract_spec: DataContractSpecification, server_type: str = None, check_types: bool = True) -> str:
def to_sodacl_yaml(data_contract_spec: DataContractSpecification, server_type: str = None,
check_types: bool = True) -> str:
try:
sodacl = {}
for model_key, model_value in data_contract_spec.models.items():
Expand Down Expand Up @@ -33,6 +34,26 @@ def to_checks(model_key, model_value, server_type: str, check_types: bool):
checks.append(check_field_required(field_name, quote_field_name))
if field.unique:
checks.append(check_field_unique(field_name, quote_field_name))
if field.minLength is not None:
checks.append(check_field_min_length(field_name, field.minLength))
if field.maxLength is not None:
checks.append(check_field_max_length(field_name, field.maxLength))
if field.minimum is not None:
checks.append(check_field_minimum(field_name, field.minimum))
if field.maximum is not None:
checks.append(check_field_maximum(field_name, field.maximum))
if field.exclusiveMinimum is not None:
checks.append(check_field_minimum(field_name, field.exclusiveMinimum))
checks.append(check_field_not_equal(field_name, field.exclusiveMinimum))
if field.exclusiveMaximum is not None:
checks.append(check_field_maximum(field_name, field.exclusiveMaximum))
checks.append(check_field_not_equal(field_name, field.exclusiveMaximum))
if field.pattern is not None:
checks.append(check_field_regex(field_name, field.pattern))
if field.enum is not None and len(field.enum) > 0:
checks.append(check_field_enum(field_name, field.enum))
# TODO references: str = None
# TODO format

return f"checks for {model_key}", checks

Expand Down Expand Up @@ -74,6 +95,82 @@ def check_field_unique(field_name, quote_field_name: bool = False):
}


def check_field_min_length(field_name, min_length, quote_field_name: bool = False):
    """Build the SodaCL check entry enforcing a minimum length on a field.

    Args:
        field_name: Column name the check applies to.
        min_length: Value emitted as the ``valid min length`` configuration.
        quote_field_name: When true, wrap the column name in double quotes.

    Returns:
        A single-entry dict keyed by the ``invalid_count(...) = 0`` check
        expression, holding the check's display name and its configuration.
    """
    if quote_field_name:
        field_name = f"\"{field_name}\""
    return {
        f"invalid_count({field_name}) = 0": {
            # Interpolate the configured limit. Formatting the builtin ``min``
            # here would render "<built-in function min>" in the check name.
            "name": f"Check that field {field_name} has a min length of {min_length}",
            "valid min length": min_length,
        }
    }

def check_field_max_length(field_name, max_length, quote_field_name: bool = False):
    """Build the SodaCL check entry enforcing a maximum length on a field.

    The column name is wrapped in double quotes when ``quote_field_name`` is
    true. The result is a single-entry dict keyed by the
    ``invalid_count(...) = 0`` expression.
    """
    field_name = f"\"{field_name}\"" if quote_field_name else field_name
    check_key = f"invalid_count({field_name}) = 0"
    check_config = {
        "name": f"Check that field {field_name} has a max length of {max_length}",
        "valid max length": max_length,
    }
    return {check_key: check_config}


def check_field_minimum(field_name, minimum, quote_field_name: bool = False):
    """Build the SodaCL check entry enforcing a minimum value on a field.

    Args:
        field_name: Column name the check applies to.
        minimum: Value emitted as the ``valid min`` configuration.
        quote_field_name: When true, wrap the column name in double quotes.

    Returns:
        A single-entry dict keyed by the ``invalid_count(...) = 0`` check
        expression, holding the check's display name and its configuration.
    """
    if quote_field_name:
        field_name = f"\"{field_name}\""
    return {
        f"invalid_count({field_name}) = 0": {
            # Interpolate the configured bound. Formatting the builtin ``min``
            # here would render "<built-in function min>" in the check name.
            "name": f"Check that field {field_name} has a minimum of {minimum}",
            "valid min": minimum,
        }
    }


def check_field_maximum(field_name, maximum, quote_field_name: bool = False):
    """Build the SodaCL check entry enforcing a maximum value on a field.

    The column name is wrapped in double quotes when ``quote_field_name`` is
    true. The result is a single-entry dict keyed by the
    ``invalid_count(...) = 0`` expression.
    """
    field_name = f"\"{field_name}\"" if quote_field_name else field_name
    check_key = f"invalid_count({field_name}) = 0"
    check_config = {
        "name": f"Check that field {field_name} has a maximum of {maximum}",
        "valid max": maximum,
    }
    return {check_key: check_config}

def check_field_not_equal(field_name, value, quote_field_name: bool = False):
    """Build the SodaCL check entry that treats one specific value as invalid.

    The column name is wrapped in double quotes when ``quote_field_name`` is
    true. ``value`` is emitted as the only item of the ``invalid values``
    list.
    """
    field_name = f"\"{field_name}\"" if quote_field_name else field_name
    check_key = f"invalid_count({field_name}) = 0"
    check_config = {
        "name": f"Check that field {field_name} is not equal to {value}",
        "invalid values": [value],
    }
    return {check_key: check_config}


def check_field_enum(field_name, enum, quote_field_name: bool = False):
    """Build the SodaCL check entry restricting a field to a set of values.

    The column name is wrapped in double quotes when ``quote_field_name`` is
    true. ``enum`` is passed through unchanged as the ``valid values``
    configuration.
    """
    field_name = f"\"{field_name}\"" if quote_field_name else field_name
    check_key = f"invalid_count({field_name}) = 0"
    check_config = {
        "name": f"Check that field {field_name} only contains enum values {enum}",
        "valid values": enum,
    }
    return {check_key: check_config}


def check_field_regex(field_name, pattern, quote_field_name: bool = False):
    """Build the SodaCL check entry requiring a field to match a regex.

    The column name is wrapped in double quotes when ``quote_field_name`` is
    true. ``pattern`` is emitted as the ``valid regex`` configuration.
    """
    field_name = f"\"{field_name}\"" if quote_field_name else field_name
    check_key = f"invalid_count({field_name}) = 0"
    check_config = {
        "name": f"Check that field {field_name} matches regex pattern {pattern}",
        "valid regex": pattern,
    }
    return {check_key: check_config}



def add_quality_checks(sodacl, data_contract_spec):
if data_contract_spec.quality is None:
return
Expand Down
2 changes: 1 addition & 1 deletion datacontract/templates/datacontract.html
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
<title>Data Contract</title>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<script src="https://cdn.tailwindcss.com"></script>
<style>/*! tailwindcss v3.4.3 | MIT License | https://tailwindcss.com*/*,:after,:before{box-sizing:border-box;border:0 solid #e5e7eb}:after,:before{--tw-content:""}:host,html{line-height:1.5;-webkit-text-size-adjust:100%;-moz-tab-size:4;-o-tab-size:4;tab-size:4;font-family:ui-sans-serif,system-ui,sans-serif,Apple Color Emoji,Segoe UI Emoji,Segoe UI Symbol,Noto Color Emoji;font-feature-settings:normal;font-variation-settings:normal;-webkit-tap-highlight-color:transparent}body{margin:0;line-height:inherit}hr{height:0;color:inherit;border-top-width:1px}abbr:where([title]){-webkit-text-decoration:underline dotted;text-decoration:underline dotted}h1,h2,h3,h4,h5,h6{font-size:inherit;font-weight:inherit}a{color:inherit;text-decoration:inherit}b,strong{font-weight:bolder}code,kbd,pre,samp{font-family:ui-monospace,SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace;font-feature-settings:normal;font-variation-settings:normal;font-size:1em}small{font-size:80%}sub,sup{font-size:75%;line-height:0;position:relative;vertical-align:initial}sub{bottom:-.25em}sup{top:-.5em}table{text-indent:0;border-color:inherit;border-collapse:collapse}button,input,optgroup,select,textarea{font-family:inherit;font-feature-settings:inherit;font-variation-settings:inherit;font-size:100%;font-weight:inherit;line-height:inherit;letter-spacing:inherit;color:inherit;margin:0;padding:0}button,select{text-transform:none}button,input:where([type=button]),input:where([type=reset]),input:where([type=submit]){-webkit-appearance:button;background-color:initial;background-image:none}:-moz-focusring{outline:auto}:-moz-ui-invalid{box-shadow:none}progress{vertical-align:initial}::-webkit-inner-spin-button,::-webkit-outer-spin-button{height:auto}[type=search]{-webkit-appearance:textfield;outline-offset:-2px}::-webkit-search-decoration{-webkit-appearance:none}::-webkit-file-upload-button{-webkit-appearance:button;font:inherit}summary{display:list-item}blockquote,dd,dl,figure,h1,h2,h3,h4,h5,h6,h
r,p,pre{margin:0}fieldset{margin:0}fieldset,legend{padding:0}menu,ol,ul{list-style:none;margin:0;padding:0}dialog{padding:0}textarea{resize:vertical}input::-moz-placeholder,textarea::-moz-placeholder{opacity:1;color:#9ca3af}input::placeholder,textarea::placeholder{opacity:1;color:#9ca3af}[role=button],button{cursor:pointer}:disabled{cursor:default}audio,canvas,embed,iframe,img,object,svg,video{display:block;vertical-align:middle}img,video{max-width:100%;height:auto}[hidden]{display:none}*,::backdrop,:after,:before{--tw-border-spacing-x:0;--tw-border-spacing-y:0;--tw-translate-x:0;--tw-translate-y:0;--tw-rotate:0;--tw-skew-x:0;--tw-skew-y:0;--tw-scale-x:1;--tw-scale-y:1;--tw-pan-x: ;--tw-pan-y: ;--tw-pinch-zoom: ;--tw-scroll-snap-strictness:proximity;--tw-gradient-from-position: ;--tw-gradient-via-position: ;--tw-gradient-to-position: ;--tw-ordinal: ;--tw-slashed-zero: ;--tw-numeric-figure: ;--tw-numeric-spacing: ;--tw-numeric-fraction: ;--tw-ring-inset: ;--tw-ring-offset-width:0px;--tw-ring-offset-color:#fff;--tw-ring-color:#3b82f680;--tw-ring-offset-shadow:0 0 #0000;--tw-ring-shadow:0 0 #0000;--tw-shadow:0 0 #0000;--tw-shadow-colored:0 0 #0000;--tw-blur: ;--tw-brightness: ;--tw-contrast: ;--tw-grayscale: ;--tw-hue-rotate: ;--tw-invert: ;--tw-saturate: ;--tw-sepia: ;--tw-drop-shadow: ;--tw-backdrop-blur: ;--tw-backdrop-brightness: ;--tw-backdrop-contrast: ;--tw-backdrop-grayscale: ;--tw-backdrop-hue-rotate: ;--tw-backdrop-invert: ;--tw-backdrop-opacity: ;--tw-backdrop-saturate: ;--tw-backdrop-sepia: ;--tw-contain-size: ;--tw-contain-layout: ;--tw-contain-paint: ;--tw-contain-style: 
}.relative{position:relative}.-mx-4{margin-left:-1rem;margin-right:-1rem}.-my-2{margin-top:-.5rem;margin-bottom:-.5rem}.mx-auto{margin-left:auto;margin-right:auto}.-ml-0{margin-left:0}.-ml-0.5{margin-left:-.125rem}.mr-1{margin-right:.25rem}.mr-1.5{margin-right:.375rem}.mr-6{margin-right:1.5rem}.mt-1{margin-top:.25rem}.mt-2{margin-top:.5rem}.mt-3{margin-top:.75rem}.mt-5{margin-top:1.25rem}.mt-6{margin-top:1.5rem}.mt-8{margin-top:2rem}.mt-auto{margin-top:auto}.inline-block{display:inline-block}.flex{display:flex}.inline-flex{display:inline-flex}.table{display:table}.flow-root{display:flow-root}.grid{display:grid}.hidden{display:none}.h-16{height:4rem}.h-5{height:1.25rem}.h-full{height:100%}.min-h-full{min-height:100%}.w-1/12{width:8.333333%}.w-16{width:4rem}.w-3/12{width:25%}.w-5{width:1.25rem}.w-7/12{width:58.333333%}.min-w-0{min-width:0}.min-w-full{min-width:100%}.max-w-7xl{max-width:80rem}.flex-1{flex:1 1 0%}.flex-shrink-0{flex-shrink:0}.grid-cols-1{grid-template-columns:repeat(1,minmax(0,1fr))}.flex-col{flex-direction:column}.items-center{align-items:center}.justify-center{justify-content:center}.justify-between{justify-content:space-between}.gap-3{gap:.75rem}.gap-x-4{-moz-column-gap:1rem;column-gap:1rem}.gap-x-6{-moz-column-gap:1.5rem;column-gap:1.5rem}.gap-y-6{row-gap:1.5rem}.space-x-6>:not([hidden])~:not([hidden]){--tw-space-x-reverse:0;margin-right:calc(1.5rem*var(--tw-space-x-reverse));margin-left:calc(1.5rem*(1 - var(--tw-space-x-reverse)))}.space-y-6>:not([hidden])~:not([hidden]){--tw-space-y-reverse:0;margin-top:calc(1.5rem*(1 - var(--tw-space-y-reverse)));margin-bottom:calc(1.5rem*var(--tw-space-y-reverse))}.divide-y>:not([hidden])~:not([hidden]){--tw-divide-y-reverse:0;border-top-width:calc(1px*(1 - var(--tw-divide-y-reverse)));border-bottom-width:calc(1px*var(--tw-divide-y-reverse))}.divide-gray-100>:not([hidden])~:not([hidden]){--tw-divide-opacity:1;border-color:rgb(243 244 
246/var(--tw-divide-opacity))}.divide-gray-200>:not([hidden])~:not([hidden]){--tw-divide-opacity:1;border-color:rgb(229 231 235/var(--tw-divide-opacity))}.divide-gray-300>:not([hidden])~:not([hidden]){--tw-divide-opacity:1;border-color:rgb(209 213 219/var(--tw-divide-opacity))}.overflow-hidden{overflow:hidden}.overflow-x-auto{overflow-x:auto}.whitespace-nowrap{white-space:nowrap}.whitespace-pre-wrap{white-space:pre-wrap}.rounded-md{border-radius:.375rem}.bg-gray-100{--tw-bg-opacity:1;background-color:rgb(243 244 246/var(--tw-bg-opacity))}.bg-gray-50{--tw-bg-opacity:1;background-color:rgb(249 250 251/var(--tw-bg-opacity))}.bg-white{--tw-bg-opacity:1;background-color:rgb(255 255 255/var(--tw-bg-opacity))}.px-1{padding-left:.25rem;padding-right:.25rem}.px-2{padding-left:.5rem;padding-right:.5rem}.px-3{padding-left:.75rem;padding-right:.75rem}.px-4{padding-left:1rem;padding-right:1rem}.px-6{padding-left:1.5rem;padding-right:1.5rem}.py-1{padding-top:.25rem;padding-bottom:.25rem}.py-2{padding-top:.5rem;padding-bottom:.5rem}.py-5{padding-top:1.25rem;padding-bottom:1.25rem}.pb-7{padding-bottom:1.75rem}.pl-4{padding-left:1rem}.pr-2{padding-right:.5rem}.pr-3{padding-right:.75rem}.pt-5{padding-top:1.25rem}.text-left{text-align:left}.text-center{text-align:center}.align-middle{vertical-align:middle}.text-2xl{font-size:1.5rem;line-height:2rem}.text-base{font-size:1rem;line-height:1.5rem}.text-sm{font-size:.875rem;line-height:1.25rem}.text-xl{font-size:1.25rem;line-height:1.75rem}.text-xs{font-size:.75rem;line-height:1rem}.font-bold{font-weight:700}.font-medium{font-weight:500}.font-semibold{font-weight:600}.leading-5{line-height:1.25rem}.leading-6{line-height:1.5rem}.leading-7{line-height:1.75rem}.text-gray-400{--tw-text-opacity:1;color:rgb(156 163 175/var(--tw-text-opacity))}.text-gray-500{--tw-text-opacity:1;color:rgb(107 114 128/var(--tw-text-opacity))}.text-gray-600{--tw-text-opacity:1;color:rgb(75 85 
99/var(--tw-text-opacity))}.text-gray-900{--tw-text-opacity:1;color:rgb(17 24 39/var(--tw-text-opacity))}.text-sky-500{--tw-text-opacity:1;color:rgb(14 165 233/var(--tw-text-opacity))}.shadow{--tw-shadow:0 1px 3px 0 #0000001a,0 1px 2px -1px #0000001a;--tw-shadow-colored:0 1px 3px 0 var(--tw-shadow-color),0 1px 2px -1px var(--tw-shadow-color)}.shadow,.shadow-sm{box-shadow:var(--tw-ring-offset-shadow,0 0 #0000),var(--tw-ring-shadow,0 0 #0000),var(--tw-shadow)}.shadow-sm{--tw-shadow:0 1px 2px 0 #0000000d;--tw-shadow-colored:0 1px 2px 0 var(--tw-shadow-color)}.ring-1{--tw-ring-offset-shadow:var(--tw-ring-inset) 0 0 0 var(--tw-ring-offset-width) var(--tw-ring-offset-color);--tw-ring-shadow:var(--tw-ring-inset) 0 0 0 calc(1px + var(--tw-ring-offset-width)) var(--tw-ring-color);box-shadow:var(--tw-ring-offset-shadow),var(--tw-ring-shadow),var(--tw-shadow,0 0 #0000)}.ring-inset{--tw-ring-inset:inset}.ring-black{--tw-ring-opacity:1;--tw-ring-color:rgb(0 0 0/var(--tw-ring-opacity))}.ring-gray-300{--tw-ring-opacity:1;--tw-ring-color:rgb(209 213 219/var(--tw-ring-opacity))}.ring-gray-500/10{--tw-ring-color:#6b72801a}.ring-gray-900/5{--tw-ring-color:#1118270d}.ring-opacity-5{--tw-ring-opacity:0.05}.hover:bg-gray-50:hover{--tw-bg-opacity:1;background-color:rgb(249 250 251/var(--tw-bg-opacity))}.hover:text-gray-500:hover{--tw-text-opacity:1;color:rgb(107 114 128/var(--tw-text-opacity))}.hover:text-gray-700:hover{--tw-text-opacity:1;color:rgb(55 65 81/var(--tw-text-opacity))}@media (min-width:640px){.sm:col-span-1{grid-column:span 1/span 1}.sm:col-span-2{grid-column:span 2/span 
2}.sm:-mx-6{margin-left:-1.5rem;margin-right:-1.5rem}.sm:ml-6{margin-left:1.5rem}.sm:mt-0{margin-top:0}.sm:flex{display:flex}.sm:grid-cols-2{grid-template-columns:repeat(2,minmax(0,1fr))}.sm:flex-row{flex-direction:row}.sm:flex-col{flex-direction:column}.sm:flex-wrap{flex-wrap:wrap}.sm:items-center{align-items:center}.sm:space-x-6>:not([hidden])~:not([hidden]){--tw-space-x-reverse:0;margin-right:calc(1.5rem*var(--tw-space-x-reverse));margin-left:calc(1.5rem*(1 - var(--tw-space-x-reverse)))}.sm:truncate{overflow:hidden;text-overflow:ellipsis;white-space:nowrap}.sm:rounded-lg{border-radius:.5rem}.sm:px-0{padding-left:0;padding-right:0}.sm:px-6{padding-right:1.5rem}.sm:pl-6,.sm:px-6{padding-left:1.5rem}.sm:text-3xl{font-size:1.875rem;line-height:2.25rem}.sm:tracking-tight{letter-spacing:-.025em}}@media (min-width:768px){.md:order-1{order:1}.md:order-2{order:2}.md:mt-0{margin-top:0}.md:flex{display:flex}.md:items-center{align-items:center}.md:justify-between{justify-content:space-between}}@media (min-width:1024px){.lg:-mx-8{margin-left:-2rem;margin-right:-2rem}.lg:ml-4{margin-left:1rem}.lg:mt-0{margin-top:0}.lg:flex{display:flex}.lg:items-center{align-items:center}.lg:justify-between{justify-content:space-between}.lg:px-8{padding-left:2rem;padding-right:2rem}}</style>
</head>
<body class="h-full">

Expand Down
42 changes: 42 additions & 0 deletions tests/fixtures/examples/datacontract_formats_valid.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
dataContractSpecification: 0.9.2
id: "123"
info:
title: "Test"
version: 1.0.0
owner: my-domain-team
models:
sample_model:
description: Sample Model
type: table
fields:
id:
type: text
required: true
primary: true
unique: true
title: ID
description: A unique identifier
minLength: 4
maxLength: 5
pattern: ^\d+
enum:
- "1234"
- "22345"
model_date:
type: text
required: true
unique: false
title: Model Date
description: Model date
field_c:
type: integer
exclusiveMaximum: 22346

examples:
- type: csv
description: Sample Model Example
model: sample_model
data: |-
id,model_date,field_c
"1234","2023-09-09",2
"22345","2023-09-09",22345
17 changes: 17 additions & 0 deletions tests/test_test_examples_formats_valid.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import logging

from typer.testing import CliRunner

from datacontract.data_contract import DataContract

runner = CliRunner()

logging.basicConfig(level=logging.DEBUG, force=True)


def test_formats():
    """Test the formats fixture contract with ``examples=True`` and expect a pass."""
    contract = DataContract(
        data_contract_file="fixtures/examples/datacontract_formats_valid.yaml",
        examples=True,
    )
    outcome = contract.test()
    # Print the run and its result so they are available when diagnosing a failure.
    print(outcome)
    print(outcome.result)
    assert outcome.result == "passed"

0 comments on commit cee03e5

Please sign in to comment.