From 2908e2f999abd9ba49128a3d58b5f953c659ebf1 Mon Sep 17 00:00:00 2001 From: Joachim Praetorius Date: Wed, 22 May 2024 15:38:51 +0200 Subject: [PATCH] Add Support to Export to DBML (#215) * Add Support to Export to DBML - Create basic project info from what the datacontract has - Map all the Models into tables, _not_ taking into account nested fields (as there is no way to express them anyway) - Also create references, when they are given for a field, so the connections between tables become visible * Adapt CHANGELOG and README * Add generated info to make clear this is a generated file * Add support to convert to a specific server's data types - support selecting a server using --server - then the data types will be converted to the selected server's specific types --- CHANGELOG.md | 3 +- README.md | 60 +++++--- datacontract/cli.py | 1 + datacontract/data_contract.py | 4 + datacontract/export/dbml_converter.py | 111 ++++++++++++++ tests/fixtures/dbml/datacontract.yaml | 213 ++++++++++++++++++++++++++ tests/test_export_dbml.py | 146 ++++++++++++++++++ 7 files changed, 512 insertions(+), 26 deletions(-) create mode 100644 datacontract/export/dbml_converter.py create mode 100644 tests/fixtures/dbml/datacontract.yaml create mode 100644 tests/test_export_dbml.py diff --git a/CHANGELOG.md b/CHANGELOG.md index f9d1f889..ed4cbbba 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,12 +9,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- `datacontract export --format dbml`: Export to [Database Markup Language (DBML)](https://dbml.dbdiagram.io/home/) (#135) ## [0.10.4] - 2024-05-17 ### Added -- `datacibtract catalog` Search +- `datacontract catalog` Search - `datacontract publish`: Publish the data contract to the Data Mesh Manager - `datacontract import --format bigquery`: Import from BigQuery format (#110) - `datacontract export --format bigquery`: Export to BigQuery format (#111) diff --git a/README.md b/README.md index 
5a13dcfe..5ebfc33f 100644 --- a/README.md +++ b/README.md @@ -557,31 +557,33 @@ models: Convert data contract to a specific format. Prints to stdout or to the specified output file. -╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ -│ location [LOCATION] The location (url or path) of the data contract yaml. [default: datacontract.yaml] │ -╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -╭─ Options ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ -│ * --format [jsonschema|pydantic-model|sodacl|dbt|dbt-sources|dbt-staging The export format. [default: None] [required] │ -│ -sql|odcs|rdf|avro|protobuf|great-expectations|terraform|avro │ -│ -idl|sql|sql-query|html|bigquery|go] │ -│ --output PATH Specify the file path where the exported data will be saved. │ -│ If no path is provided, the output will be printed to stdout. │ -│ [default: None] │ -│ --server TEXT The server name to export. [default: None] │ -│ --model TEXT Use the key of the model in the data contract yaml file to │ -│ refer to a model, e.g., `orders`, or `all` for all models │ -│ (default). │ -│ [default: all] │ -│ --help Show this message and exit. │ -╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -╭─ RDF Options ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ -│ --rdf-base TEXT [rdf] The base URI used to generate the RDF graph. 
[default: None] │ -╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -╭─ SQL Options ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ -│ --sql-server-type TEXT [sql] The server type to determine the sql dialect. By default, it uses 'auto' to automatically detect the sql │ -│ dialect via the specified servers in the data contract. │ -│ [default: auto] │ -╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ Arguments ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ +│ location [LOCATION] The location (url or path) of the data contract yaml. [default: datacontract.yaml] │ +╰────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ Options ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ +│ * --format [jsonschema|pydantic-model|sodacl|dbt|dbt-sources|db The export format. [default: None] [required] │ +│ t-staging-sql|odcs|rdf|avro|protobuf|great-expectati │ +│ ons|terraform|avro-idl|sql|sql-query|html|go|bigquer │ +│ y|dbml] │ +│ --output PATH Specify the file path where the exported data will be │ +│ saved. If no path is provided, the output will be │ +│ printed to stdout. │ +│ [default: None] │ +│ --server TEXT The server name to export. [default: None] │ +│ --model TEXT Use the key of the model in the data contract yaml │ +│ file to refer to a model, e.g., `orders`, or `all` │ +│ for all models (default). │ +│ [default: all] │ +│ --help Show this message and exit. 
│ +╰────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ RDF Options ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ +│ --rdf-base TEXT [rdf] The base URI used to generate the RDF graph. [default: None] │ +╰────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ SQL Options ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ +│ --sql-server-type TEXT [sql] The server type to determine the sql dialect. By default, it uses 'auto' to automatically │ +│ detect the sql dialect via the specified servers in the data contract. │ +│ [default: auto] │ +╰────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ ``` @@ -611,6 +613,7 @@ Available export options: | `bigquery` | Export to BigQuery Schemas | ✅ | | `go` | Export to Go types | ✅ | | `pydantic-model` | Export to pydantic models | ✅ | +| `DBML` | Export to a DBML Diagram description | ✅ | | Missing something? | Please create an issue on GitHub | TBD | #### Great Expectations @@ -651,6 +654,13 @@ Having the data contract inside an RDF Graph gives us access the following use c - Apply graph algorithms on multiple data contracts (Find similar data contracts, find "gatekeeper" data products, find the true domain owner of a field attribute) +#### DBML + +The export function converts the logical data types of the datacontract into the specific ones of a concrete Database +if a server is selected via the `--server` option (based on the `type` of that server). If no server is selected, the +logical data types are exported. 
+ + ### import ``` diff --git a/datacontract/cli.py b/datacontract/cli.py index 3d9f235d..e7690747 100644 --- a/datacontract/cli.py +++ b/datacontract/cli.py @@ -162,6 +162,7 @@ class ExportFormat(str, Enum): html = "html" go = "go" bigquery = "bigquery" + dbml = "dbml" @app.command() diff --git a/datacontract/data_contract.py b/datacontract/data_contract.py index b3f095c1..bc133317 100644 --- a/datacontract/data_contract.py +++ b/datacontract/data_contract.py @@ -15,6 +15,7 @@ from datacontract.export.avro_converter import to_avro_schema_json from datacontract.export.avro_idl_converter import to_avro_idl from datacontract.export.bigquery_converter import to_bigquery_json +from datacontract.export.dbml_converter import to_dbml_diagram from datacontract.export.dbt_converter import to_dbt_models_yaml, \ to_dbt_sources_yaml, to_dbt_staging_sql from datacontract.export.great_expectations_converter import \ @@ -334,6 +335,9 @@ def export(self, export_format, model: str = "all", rdf_base: str = None, sql_se if found_server.type != 'bigquery': raise RuntimeError(f"Export to {export_format} requires selecting a bigquery server from the data contract.") return to_bigquery_json(model_name, model_value, found_server) + if export_format == "dbml": + found_server = data_contract.servers.get(self._server) + return to_dbml_diagram(data_contract, found_server) else: print(f"Export format {export_format} not supported.") return "" diff --git a/datacontract/export/dbml_converter.py b/datacontract/export/dbml_converter.py new file mode 100644 index 00000000..9f711846 --- /dev/null +++ b/datacontract/export/dbml_converter.py @@ -0,0 +1,111 @@ +from datetime import datetime +from importlib.metadata import version +import pytz +from datacontract.export.sql_type_converter import convert_to_sql_type +import datacontract.model.data_contract_specification as spec +from typing import Tuple + + +def to_dbml_diagram(contract: spec.DataContractSpecification, server: spec.Server) -> str: + + 
result = '' + result += add_generated_info(contract, server) + "\n" + result += generate_project_info(contract) + "\n" + + for model_name, model in contract.models.items(): + table_description = generate_table(model_name, model, server) + result += f"\n{table_description}\n" + + return result + +def add_generated_info(contract: spec.DataContractSpecification, server: spec.Server) -> str: + tz = pytz.timezone("UTC") + now = datetime.now(tz) + formatted_date = now.strftime("%b %d %Y") + datacontract_cli_version = get_version() + dialect = 'Logical Datacontract' if server is None else server.type + + generated_info = """ +Generated at {0} by datacontract-cli version {1} +for datacontract {2} ({3}) version {4} +Using {5} Types for the field types + """.format(formatted_date, datacontract_cli_version, contract.info.title, contract.id, contract.info.version, dialect) + + comment = """/* +{0} +*/ + """.format(generated_info) + + note = """Note project_info {{ +''' +{0} +''' +}} + """.format(generated_info) + + return """{0} +{1} + """.format(comment, note) + +def get_version() -> str: + try: + return version("datacontract_cli") + except Exception: + return "" + +def generate_project_info(contract: spec.DataContractSpecification) -> str: + return """Project "{0}" {{ + Note: "{1}" +}}\n + """.format(contract.info.title, ' '.join(contract.info.description.splitlines())) + +def generate_table(model_name: str, model: spec.Model, server: spec.Server) -> str: + result = """Table "{0}" {{ +Note: "{1}" + """.format(model_name, ' '.join(model.description.splitlines())) + + references = [] + + # Add all the fields + for field_name, field in model.fields.items(): + ref, field_string = generate_field(field_name, field, model_name, server) + if ref is not None: + references.append(ref) + result += "{0}\n".format(field_string) + + result += "}\n" + + # and if any: add the references + if len(references) > 0: + for ref in references: + result += "Ref: {0}\n".format(ref) + + result += 
"\n" + + return result + +def generate_field(field_name: str, field: spec.Field, model_name: str, server: spec.Server) -> Tuple[str, str]: + + field_attrs = [] + if field.primary: + field_attrs.append('pk') + + if field.unique: + field_attrs.append('unique') + + if field.required: + field_attrs.append('not null') + else: + field_attrs.append('null') + + if field.description: + field_attrs.append('Note: "{0}"'.format(' '.join(field.description.splitlines()))) + + field_type = field.type if server is None else convert_to_sql_type(field, server.type) + + field_str = '"{0}" "{1}" [{2}]'.format(field_name, field_type, ','.join(field_attrs)) + ref_str = None + if (field.references) is not None: + # we always assume many to one, as datacontract doesn't really give us more info + ref_str = "{0}.{1} > {2}".format(model_name, field_name, field.references) + return (ref_str, field_str) diff --git a/tests/fixtures/dbml/datacontract.yaml b/tests/fixtures/dbml/datacontract.yaml new file mode 100644 index 00000000..1eb2453e --- /dev/null +++ b/tests/fixtures/dbml/datacontract.yaml @@ -0,0 +1,213 @@ +dataContractSpecification: 0.9.3 +id: urn:datacontract:checkout:orders-latest +info: + title: Orders Latest + version: 1.0.0 + description: | + Successful customer orders in the webshop. + All orders since 2020-01-01. + Orders with their line items are in their current state (no history included). + owner: Checkout Team + contact: + name: John Doe (Data Product Owner) + url: https://teams.microsoft.com/l/channel/example/checkout +servers: + production: + type: s3 + location: s3://datacontract-example-orders-latest/data/{model}/*.json + format: json + delimiter: new_line +terms: + usage: | + Data can be used for reports, analytics and machine learning use cases. + Order may be linked and joined by other tables + limitations: | + Not suitable for real-time use cases. + Data may not be used to identify individual customers. 
+ Max data processing per day: 10 TiB + billing: 5000 USD per month + noticePeriod: P3M +models: + orders: + description: One record per order. Includes cancelled and deleted orders. + type: table + fields: + order_id: + $ref: '#/definitions/order_id' + required: true + unique: true + primary: true + order_timestamp: + description: The business timestamp in UTC when the order was successfully registered in the source system and the payment was successful. + type: timestamp + required: true + example: "2024-09-09T08:30:00Z" + order_total: + description: Total amount the smallest monetary unit (e.g., cents). + type: record + required: true + fields: + amount: + description: The amount to pay + required: true + type: record + fields: + sum: + description: the sum to pay + required: true + type: number + currency: + description: the currency the amount is in + required: true + type: string + example: EUR + due_date: + type: date + description: the date by which the total must be paid + required: true + classification: important + discount: + type: double + description: a discount as fraction of one (for percentages), if the customer gets discount + example: "0.2" + customer_id: + description: Unique identifier for the customer. + type: text + minLength: 10 + maxLength: 20 + customer_email_address: + description: The email address, as entered by the customer. The email address was not verified. + type: text + format: email + required: true + pii: true + classification: sensitive + processed_timestamp: + description: The timestamp when the record was processed by the data platform. + type: timestamp + required: true + line_items: + description: A single article that is part of an order. 
+ type: table + fields: + lines_item_id: + type: text + description: Primary key of the lines_item_id table + required: true + unique: true + primary: true + order_id: + $ref: '#/definitions/order_id' + references: orders.order_id + sku: + description: The purchased article number + $ref: '#/definitions/sku' +definitions: + order_id: + domain: checkout + name: order_id + title: Order ID + type: text + format: uuid + description: An internal ID that identifies an order in the online shop. + example: 243c25e5-a081-43a9-aeab-6d5d5b6cb5e2 + pii: true + classification: restricted + sku: + domain: inventory + name: sku + title: Stock Keeping Unit + type: text + pattern: ^[A-Za-z0-9]{8,14}$ + example: "96385074" + description: | + A Stock Keeping Unit (SKU) is an internal unique identifier for an article. + It is typically associated with an article's barcode, such as the EAN/GTIN. + flags: + domain: checkout + name: flags + title: Order Flags + type: text + enum: + - PRIORITY + - NORMAL + - SLOW + example: "PRIORITY" + classification: restricted + tags: + - classification + - delivery speed + pattern: PRIORITY|NORMAL|SLOW + +examples: + - type: csv # csv, json, yaml, custom + model: orders + description: An example list of order records. 
+ data: | # expressed as string or inline yaml or via "$ref: data.csv" + order_id,order_timestamp,order_total,customer_id,customer_email_address,processed_timestamp + "1001","2030-09-09T08:30:00Z",2500,"1000000001","[email protected]","2030-09-09T08:31:00Z" + "1002","2030-09-08T15:45:00Z",1800,"1000000002","[email protected]","2030-09-09T08:31:00Z" + "1003","2030-09-07T12:15:00Z",3200,"1000000003","[email protected]","2030-09-09T08:31:00Z" + "1004","2030-09-06T19:20:00Z",1500,"1000000004","[email protected]","2030-09-09T08:31:00Z" + "1005","2030-09-05T10:10:00Z",4200,"1000000004","[email protected]","2030-09-09T08:31:00Z" + "1006","2030-09-04T14:55:00Z",2800,"1000000005","[email protected]","2030-09-09T08:31:00Z" + "1007","2030-09-03T21:05:00Z",1900,"1000000006","[email protected]","2030-09-09T08:31:00Z" + "1008","2030-09-02T17:40:00Z",3600,"1000000007","[email protected]","2030-09-09T08:31:00Z" + "1009","2030-09-01T09:25:00Z",3100,"1000000008","[email protected]","2030-09-09T08:31:00Z" + "1010","2030-08-31T22:50:00Z",2700,"1000000009","[email protected]","2030-09-09T08:31:00Z" + - type: csv + model: line_items + description: An example list of line items. 
+ data: | + lines_item_id,order_id,sku + "LI-1","1001","5901234123457" + "LI-2","1001","4001234567890" + "LI-3","1002","5901234123457" + "LI-4","1002","2001234567893" + "LI-5","1003","4001234567890" + "LI-6","1003","5001234567892" + "LI-7","1004","5901234123457" + "LI-8","1005","2001234567893" + "LI-9","1005","5001234567892" + "LI-10","1005","6001234567891" +servicelevels: + availability: + description: The server is available during support hours + percentage: 99.9% + retention: + description: Data is retained for one year + period: P1Y + unlimited: false + latency: + description: Data is available within 25 hours after the order was placed + threshold: 25h + sourceTimestampField: orders.order_timestamp + processedTimestampField: orders.processed_timestamp + freshness: + description: The age of the youngest row in a table. + threshold: 25h + timestampField: orders.order_timestamp + frequency: + description: Data is delivered once a day + type: batch # or streaming + interval: daily # for batch, either or cron + cron: 0 0 * * * # for batch, either or interval + support: + description: The data is available during typical business hours at headquarters + time: 9am to 5pm in EST on business days + responseTime: 1h + backup: + description: Data is backed up once a week, every Sunday at 0:00 UTC. 
+ interval: weekly + cron: 0 0 * * 0 + recoveryTime: 24 hours + recoveryPoint: 1 week +quality: + type: SodaCL # data quality check format: SodaCL, montecarlo, custom + specification: # expressed as string or inline yaml or via "$ref: checks.yaml" + checks for orders: + - row_count >= 5 + - duplicate_count(order_id) = 0 + checks for line_items: + - values in (order_id) must exist in orders (order_id) + - row_count >= 5 diff --git a/tests/test_export_dbml.py b/tests/test_export_dbml.py new file mode 100644 index 00000000..407daa67 --- /dev/null +++ b/tests/test_export_dbml.py @@ -0,0 +1,146 @@ +from datetime import datetime +from importlib.metadata import version +import logging + +import pytz +from typer.testing import CliRunner + +from datacontract.cli import app +from datacontract.data_contract import DataContract + +logging.basicConfig(level=logging.DEBUG, force=True) + + +def test_cli(): + runner = CliRunner() + result = runner.invoke(app, ["export", "./fixtures/dbml/datacontract.yaml", "--format", "dbml"]) + assert result.exit_code == 0 + +def test_cli_with_server(): + runner = CliRunner() + result = runner.invoke(app, ["export", "./fixtures/dbml/datacontract.yaml", "--format", "dbml", "--server", "production"]) + assert result.exit_code == 0 + +def test_dbml_export(): + data_contract = DataContract(data_contract_file="fixtures/dbml/datacontract.yaml") + assert data_contract.lint(enabled_linters="none").has_passed() + + result = data_contract.export("dbml") + + tz = pytz.timezone("UTC") + now = datetime.now(tz) + formatted_date = now.strftime("%b %d %Y") + try: + datacontract_cli_version = version("datacontract_cli") + except Exception: + datacontract_cli_version = "" + + expected = """ +/* + +Generated at {0} by datacontract-cli version {1} +for datacontract Orders Latest (urn:datacontract:checkout:orders-latest) version 1.0.0 +Using Logical Datacontract Types for the field types + +*/ + +Note project_info {{ +''' + +Generated at {0} by datacontract-cli 
version {1} +for datacontract Orders Latest (urn:datacontract:checkout:orders-latest) version 1.0.0 +Using Logical Datacontract Types for the field types + +''' +}} + + +Project "Orders Latest" {{ + Note: "Successful customer orders in the webshop. All orders since 2020-01-01. Orders with their line items are in their current state (no history included)." +}} + + + +Table "orders" {{ +Note: "One record per order. Includes cancelled and deleted orders." + "order_id" "text" [pk,unique,not null,Note: "An internal ID that identifies an order in the online shop."] +"order_timestamp" "timestamp" [not null,Note: "The business timestamp in UTC when the order was successfully registered in the source system and the payment was successful."] +"order_total" "record" [not null,Note: "Total amount the smallest monetary unit (e.g., cents)."] +"customer_id" "text" [null,Note: "Unique identifier for the customer."] +"customer_email_address" "text" [not null,Note: "The email address, as entered by the customer. The email address was not verified."] +"processed_timestamp" "timestamp" [not null,Note: "The timestamp when the record was processed by the data platform."] +}} + + +Table "line_items" {{ +Note: "A single article that is part of an order." 
+ "lines_item_id" "text" [pk,unique,not null,Note: "Primary key of the lines_item_id table"] +"order_id" "text" [null,Note: "An internal ID that identifies an order in the online shop."] +"sku" "text" [null,Note: "The purchased article number"] +}} +Ref: line_items.order_id > orders.order_id + """.format(formatted_date, datacontract_cli_version) + + assert result.strip() == expected.strip() + +def test_dbml_export_with_server(): + data_contract = DataContract(data_contract_file="fixtures/dbml/datacontract.yaml", server='production') + assert data_contract.lint(enabled_linters="none").has_passed() + + result = data_contract.export("dbml") + + tz = pytz.timezone("UTC") + now = datetime.now(tz) + formatted_date = now.strftime("%b %d %Y") + try: + datacontract_cli_version = version("datacontract_cli") + except Exception: + datacontract_cli_version = "" + + expected = """ +/* + +Generated at {0} by datacontract-cli version {1} +for datacontract Orders Latest (urn:datacontract:checkout:orders-latest) version 1.0.0 +Using s3 Types for the field types + +*/ + +Note project_info {{ +''' + +Generated at {0} by datacontract-cli version {1} +for datacontract Orders Latest (urn:datacontract:checkout:orders-latest) version 1.0.0 +Using s3 Types for the field types + +''' +}} + + +Project "Orders Latest" {{ + Note: "Successful customer orders in the webshop. All orders since 2020-01-01. Orders with their line items are in their current state (no history included)." +}} + + + +Table "orders" {{ +Note: "One record per order. Includes cancelled and deleted orders." 
+ "order_id" "VARCHAR" [pk,unique,not null,Note: "An internal ID that identifies an order in the online shop."] +"order_timestamp" "TIMESTAMP WITH TIME ZONE" [not null,Note: "The business timestamp in UTC when the order was successfully registered in the source system and the payment was successful."] +"order_total" "STRUCT" [not null,Note: "Total amount the smallest monetary unit (e.g., cents)."] +"customer_id" "VARCHAR" [null,Note: "Unique identifier for the customer."] +"customer_email_address" "VARCHAR" [not null,Note: "The email address, as entered by the customer. The email address was not verified."] +"processed_timestamp" "TIMESTAMP WITH TIME ZONE" [not null,Note: "The timestamp when the record was processed by the data platform."] +}} + + +Table "line_items" {{ +Note: "A single article that is part of an order." + "lines_item_id" "VARCHAR" [pk,unique,not null,Note: "Primary key of the lines_item_id table"] +"order_id" "VARCHAR" [null,Note: "An internal ID that identifies an order in the online shop."] +"sku" "VARCHAR" [null,Note: "The purchased article number"] +}} +Ref: line_items.order_id > orders.order_id + """.format(formatted_date, datacontract_cli_version) + + assert result.strip() == expected.strip()