From fb343fefe0effc2da0b03633f884b003b5d49680 Mon Sep 17 00:00:00 2001 From: Joachim Praetorius Date: Tue, 14 May 2024 21:48:49 +0200 Subject: [PATCH] Support Bigquery JSON as export format (#198) * Support Bigquery JSON as export format - have a new exporter for BigQuery JSON Files - hook it up to the CLI - simplify the checking for model selection during export * Use the correct type for the 'schema' field contents * Mention Bigquery Export in the Changelog --- CHANGELOG.md | 1 + README.md | 75 ++--- datacontract/cli.py | 1 + datacontract/data_contract.py | 155 +++------- datacontract/export/bigquery_converter.py | 109 +++++++ .../bigquery/export/bq_table_schema.json | 273 ++++++++++++++++++ .../bigquery/export/datacontract.yaml | 183 ++++++++++++ .../bigquery/import/datacontract.yaml | 2 +- tests/test_export_bigquery.py | 39 +++ 9 files changed, 689 insertions(+), 149 deletions(-) create mode 100644 datacontract/export/bigquery_converter.py create mode 100644 tests/fixtures/bigquery/export/bq_table_schema.json create mode 100644 tests/fixtures/bigquery/export/datacontract.yaml create mode 100644 tests/test_export_bigquery.py diff --git a/CHANGELOG.md b/CHANGELOG.md index b2757d03..a399e9a0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - datacontract catalog: Search form - `datacontract import --format bigquery`: Import from BigQuery format +- `datacontract export --format bigquery`: Export to BigQuery format - `datacontract publish`: Publish the data contract to the Data Mesh Manager ## [0.10.3] - 2024-05-05 diff --git a/README.md b/README.md index 5d02b1e6..bc4c5649 100644 --- a/README.md +++ b/README.md @@ -556,26 +556,31 @@ models: Convert data contract to a specific format. Prints to stdout or to the specified output file. -╭─ Arguments ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ -│ location [LOCATION] The location (url or path) of the data contract yaml. [default: datacontract.yaml] │ -╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -╭─ Options ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ -│ * --format [html|jsonschema|pydantic-model|sodacl|dbt|dbt-sources|dbt-staging-sql|odcs|rd The export format. [default: None] [required] │ -│ f|avro|protobuf|great-expectations|terraform|avro-idl|sql|sql-query] │ -│ --server TEXT The server name to export. [default: None] │ -│ --model TEXT Use the key of the model in the data contract yaml file to refer to a │ -│ model, e.g., `orders`, or `all` for all models (default). │ -│ [default: all] │ -│ --help Show this message and exit. │ -╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -╭─ RDF Options ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ -│ --rdf-base TEXT [rdf] The base URI used to generate the RDF graph. 
[default: None] │ -╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -╭─ SQL Options ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ -│ --sql-server-type TEXT [sql] The server type to determine the sql dialect. By default, it uses 'auto' to automatically detect the sql dialect via the specified │ -│ servers in the data contract. │ -│ [default: auto] │ -╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ +│ location [LOCATION] The location (url or path) of the data contract yaml. [default: datacontract.yaml] │ +╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ Options ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ +│ * --format [jsonschema|pydantic-model|sodacl|dbt|dbt-sources|dbt-staging The export format. [default: None] [required] │ +│ -sql|odcs|rdf|avro|protobuf|great-expectations|terraform|avro │ +│ -idl|sql|sql-query|html|bigquery] │ +│ --output PATH Specify the file path where the exported data will be saved. │ +│ If no path is provided, the output will be printed to stdout. │ +│ [default: None] │ +│ --server TEXT The server name to export. [default: None] │ +│ --model TEXT Use the key of the model in the data contract yaml file to │ +│ refer to a model, e.g., `orders`, or `all` for all models │ +│ (default). │ +│ [default: all] │ +│ --help Show this message and exit. │ +╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ RDF Options ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ +│ --rdf-base TEXT [rdf] The base URI used to generate the RDF graph. [default: None] │ +╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ SQL Options ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ +│ --sql-server-type TEXT [sql] The server type to determine the sql dialect. By default, it uses 'auto' to automatically detect the sql │ +│ dialect via the specified servers in the data contract. 
│ +│ [default: auto] │ +╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ ``` @@ -588,21 +593,21 @@ Available export options: | Type | Description | Status | |----------------------|---------------------------------------------------------|--------| -| `html` | Export to HTML | ✅ | -| `jsonschema` | Export to JSON Schema | ✅ | -| `odcs` | Export to Open Data Contract Standard (ODCS) | ✅ | -| `sodacl` | Export to SodaCL quality checks in YAML format | ✅ | -| `dbt` | Export to dbt models in YAML format | ✅ | -| `dbt-sources` | Export to dbt sources in YAML format | ✅ | -| `dbt-staging-sql` | Export to dbt staging SQL models | ✅ | -| `rdf` | Export data contract to RDF representation in N3 format | ✅ | -| `avro` | Export to AVRO models | ✅ | -| `protobuf` | Export to Protobuf | ✅ | -| `terraform` | Export to terraform resources | ✅ | -| `sql` | Export to SQL DDL | ✅ | -| `sql-query` | Export to SQL Query | ✅ | -| `great-expectations` | Export to Great Expectations Suites in JSON Format | ✅ | -| `bigquery` | Export to BigQuery Schemas | TBD | +| `html` | Export to HTML | ✅ | +| `jsonschema` | Export to JSON Schema | ✅ | +| `odcs` | Export to Open Data Contract Standard (ODCS) | ✅ | +| `sodacl` | Export to SodaCL quality checks in YAML format | ✅ | +| `dbt` | Export to dbt models in YAML format | ✅ | +| `dbt-sources` | Export to dbt sources in YAML format | ✅ | +| `dbt-staging-sql` | Export to dbt staging SQL models | ✅ | +| `rdf` | Export data contract to RDF representation in N3 format | ✅ | +| `avro` | Export to AVRO models | ✅ | +| `protobuf` | Export to Protobuf | ✅ | +| `terraform` | Export to terraform resources | ✅ | +| `sql` | Export to SQL DDL | ✅ | +| `sql-query` | Export to SQL Query | ✅ | +| `great-expectations` | Export to Great Expectations Suites in JSON Format | ✅ | +| `bigquery` | Export to BigQuery Schemas | ✅ | | `pydantic` | Export to pydantic models | TBD | | Missing something? | Please create an issue on GitHub | TBD | diff --git a/datacontract/cli.py b/datacontract/cli.py index a83ed538..78a12dd6 100644 --- a/datacontract/cli.py +++ b/datacontract/cli.py @@ -158,6 +158,7 @@ class ExportFormat(str, Enum): sql = "sql" sql_query = "sql-query" html = "html" + bigquery = "bigquery" @app.command() diff --git a/datacontract/data_contract.py b/datacontract/data_contract.py index 61ef92ba..0664332c 100644 --- a/datacontract/data_contract.py +++ b/datacontract/data_contract.py @@ -16,6 +16,7 @@ from datacontract.engines.soda.check_soda_execute import check_soda_execute from datacontract.export.avro_converter import to_avro_schema_json from datacontract.export.avro_idl_converter import to_avro_idl +from datacontract.export.bigquery_converter import to_bigquery_json from datacontract.export.dbt_converter import to_dbt_models_yaml, \ to_dbt_sources_yaml, to_dbt_staging_sql from datacontract.export.great_expectations_converter import \ @@ -290,28 +291,8 @@ def export(self, export_format, model: str = "all", rdf_base: str = None, sql_se inline_quality=True, ) if export_format == "jsonschema": - if data_contract.models is None: - raise RuntimeError(f"Export to {export_format} requires models in the data contract.") - - model_names = list(data_contract.models.keys()) - - if model == "all": - if len(data_contract.models.items()) != 1: - raise RuntimeError( - f"Export to {export_format} is model specific. Specify the model via --model $MODEL_NAME. 
Available models: {model_names}" - ) - - model_name, model_value = next(iter(data_contract.models.items())) - return to_jsonschema_json(model_name, model_value) - else: - model_name = model - model_value = data_contract.models.get(model_name) - if model_value is None: - raise RuntimeError( - f"Model {model_name} not found in the data contract. Available models: {model_names}" - ) - - return to_jsonschema_json(model_name, model_value) + model_name, model_value = self._check_models_for_export(data_contract, model, export_format) + return to_jsonschema_json(model_name, model_value) if export_format == "sodacl": return to_sodacl_yaml(data_contract) if export_format == "dbt": @@ -319,28 +300,8 @@ def export(self, export_format, model: str = "all", rdf_base: str = None, sql_se if export_format == "dbt-sources": return to_dbt_sources_yaml(data_contract, self._server) if export_format == "dbt-staging-sql": - if data_contract.models is None: - raise RuntimeError(f"Export to {export_format} requires models in the data contract.") - - model_names = list(data_contract.models.keys()) - - if model == "all": - if len(data_contract.models.items()) != 1: - raise RuntimeError( - f"Export to {export_format} is model specific. Specify the model via --model $MODEL_NAME. Available models: {model_names}" - ) - - model_name, model_value = next(iter(data_contract.models.items())) - return to_dbt_staging_sql(data_contract, model_name, model_value) - else: - model_name = model - model_value = data_contract.models.get(model_name) - if model_value is None: - raise RuntimeError( - f"Model {model_name} not found in the data contract. Available models: {model_names}" - ) - - return to_dbt_staging_sql(data_contract, model_name, model_value) + model_name, model_value = self._check_models_for_export(data_contract, model, export_format) + return to_dbt_staging_sql(data_contract, model_name, model_value) if export_format == "odcs": return to_odcs_yaml(data_contract) if export_format == "rdf": @@ -348,28 +309,8 @@ def export(self, export_format, model: str = "all", rdf_base: str = None, sql_se if export_format == "protobuf": return to_protobuf(data_contract) if export_format == "avro": - if data_contract.models is None: - raise RuntimeError(f"Export to {export_format} requires models in the data contract.") - - model_names = list(data_contract.models.keys()) - - if model == "all": - if len(data_contract.models.items()) != 1: - raise RuntimeError( - f"Export to {export_format} is model specific. Specify the model via --model $MODEL_NAME. Available models: {model_names}" - ) - - model_name, model_value = next(iter(data_contract.models.items())) - return to_avro_schema_json(model_name, model_value) - else: - model_name = model - model_value = data_contract.models.get(model_name) - if model_value is None: - raise RuntimeError( - f"Model {model_name} not found in the data contract. 
Available models: {model_names}" - ) - - return to_avro_schema_json(model_name, model_value) + model_name, model_value = self._check_models_for_export(data_contract, model, export_format) + return to_avro_schema_json(model_name, model_value) if export_format == "avro-idl": return to_avro_idl(data_contract) if export_format == "terraform": @@ -378,59 +319,24 @@ def export(self, export_format, model: str = "all", rdf_base: str = None, sql_se server_type = self._determine_sql_server_type(data_contract, sql_server_type) return to_sql_ddl(data_contract, server_type=server_type) if export_format == "sql-query": - if data_contract.models is None: - raise RuntimeError(f"Export to {export_format} requires models in the data contract.") - + model_name, model_value = self._check_models_for_export(data_contract, model, export_format) server_type = self._determine_sql_server_type(data_contract, sql_server_type) - - model_names = list(data_contract.models.keys()) - - if model == "all": - if len(data_contract.models.items()) != 1: - raise RuntimeError( - f"Export to {export_format} is model specific. Specify the model via --model $MODEL_NAME. Available models: {model_names}" - ) - - model_name, model_value = next(iter(data_contract.models.items())) - return to_sql_query(data_contract, model_name, model_value, server_type) - else: - model_name = model - model_value = data_contract.models.get(model_name) - if model_value is None: - raise RuntimeError( - f"Model {model_name} not found in the data contract. Available models: {model_names}" - ) - - return to_sql_query(data_contract, model_name, model_value, server_type) - + return to_sql_query(data_contract, model_name, model_value, server_type) if export_format == "great-expectations": - if data_contract.models is None: - raise RuntimeError(f"Export to {export_format} requires models in the data contract.") - - model_names = list(data_contract.models.keys()) - - if model == "all": - if len(data_contract.models.items()) != 1: - raise RuntimeError( - f"Export to {export_format} is model specific. Specify the model via --model " - f"$MODEL_NAME. Available models: {model_names}" - ) - - model_name, model_value = next(iter(data_contract.models.items())) - return to_great_expectations(data_contract, model_name) - else: - model_name = model - model_value = data_contract.models.get(model_name) - if model_value is None: - raise RuntimeError( - f"Model {model_name} not found in the data contract. 
" f"Available models: {model_names}" - ) - - return to_great_expectations(data_contract, model_name) + model_name, model_value = self._check_models_for_export(data_contract, model, export_format) + return to_great_expectations(data_contract, model_name) if export_format == "pydantic-model": return to_pydantic_model_str(data_contract) if export_format == "html": return to_html(data_contract) + if export_format == "bigquery": + model_name, model_value = self._check_models_for_export(data_contract, model, export_format) + found_server = data_contract.servers.get(self._server) + if found_server is None: + raise RuntimeError(f"Export to {export_format} requires selecting a bigquery server from the data contract.") + if found_server.type != 'bigquery': + raise RuntimeError(f"Export to {export_format} requires selecting a bigquery server from the data contract.") + return to_bigquery_json(model_name, model_value, found_server) else: print(f"Export format {export_format} not supported.") return "" @@ -484,6 +390,29 @@ def _get_examples_server(self, data_contract, run, tmp_dir): ) run.log_info(f"Using {server} for testing the examples") return server + + def _check_models_for_export(self, data_contract: DataContractSpecification, model: str, export_format: str) -> typing.Tuple[str, str]: + if data_contract.models is None: + raise RuntimeError(f"Export to {export_format} requires models in the data contract.") + + model_names = list(data_contract.models.keys()) + + if model == "all": + if len(data_contract.models.items()) != 1: + raise RuntimeError( + f"Export to {export_format} is model specific. Specify the model via --model $MODEL_NAME. Available models: {model_names}" + ) + + model_name, model_value = next(iter(data_contract.models.items())) + else: + model_name = model + model_value = data_contract.models.get(model_name) + if model_value is None: + raise RuntimeError( + f"Model {model_name} not found in the data contract. 
Available models: {model_names}" + ) + + return model_name, model_value def import_from_source(self, format: str, source: typing.Optional[str] = None, bigquery_tables: typing.Optional[typing.List[str]] = None, bigquery_project: typing.Optional[str] = None, bigquery_dataset: typing.Optional[str] = None) -> DataContractSpecification: data_contract_specification = DataContract.init() diff --git a/datacontract/export/bigquery_converter.py b/datacontract/export/bigquery_converter.py new file mode 100644 index 00000000..bb28a6d4 --- /dev/null +++ b/datacontract/export/bigquery_converter.py @@ -0,0 +1,109 @@ +import json +import logging +from typing import Dict, List + +from datacontract.model.data_contract_specification import Model, Field, Server +from datacontract.model.exceptions import DataContractException + +logging.basicConfig(level=logging.INFO, force=True) + +def to_bigquery_json(model_name: str, model_value: Model, server: Server) -> str: + bigquery_table = to_bigquery_schema(model_name, model_value, server) + return json.dumps(bigquery_table, indent=2) + +def to_bigquery_schema(model_name: str, model_value: Model, server: Server) -> dict: + return { + "kind": "bigquery#table", + "tableReference": { + "datasetId": server.dataset, + "projectId": server.project, + "tableId": model_name + }, + "description": model_value.description, + "schema": { + "fields": to_fields_array(model_value.fields) + } + } + +def to_fields_array(fields: Dict[str, Field]) -> List[Dict[str, Field]]: + bq_fields = [] + for field_name, field in fields.items(): + bq_fields.append(to_field(field_name, field)) + + return bq_fields + + +def to_field(field_name: str, field: Field) -> dict: + + bq_type = map_type_to_bigquery(field.type, field_name) + bq_field = { + "name": field_name, + "type": bq_type, + "mode": "REQUIRED" if field.required else "NULLABLE", + "description": field.description + } + + # handle arrays + if field.type == 'array': + bq_field["mode"] = 'REPEATED' + if field.items.type == 'object': + # in case the array type is a complex object, we want to copy all its fields + bq_field["fields"] = to_fields_array(field.items.fields) + else: + # otherwise we make up a structure that gets us a single field of the specified type + bq_field["fields"] = to_fields_array({ f"{field_name}_1": Field(type=field.items.type, required=False, description="")}) + # all of these can carry other fields + elif bq_type.lower() in ["record", "struct"]: + bq_field["fields"] = to_fields_array(field.fields) + + # strings can have a maxlength + if bq_type.lower() == "string": + bq_field["maxLength"] = field.maxLength + + # number types have precision and scale + if bq_type.lower() in ["numeric", "bignumeric"]: + bq_field["precision"] = field.precision + bq_field["scale"] = field.scale + + return bq_field + +def map_type_to_bigquery(type_str: str, field_name: str) -> str: + logger = logging.getLogger(__name__) + if type_str.lower() in ["string", "varchar", "text"]: + return "STRING" + elif type_str == "bytes": + return "BYTES" + elif type_str.lower() in ["int", "integer"]: + return "INTEGER" + elif type_str.lower() in ["long", "bigint"]: + return "INT64" + elif type_str == "float": + return "FLOAT" + elif type_str == "boolean": + return "BOOLEAN" + elif type_str.lower() in ["timestamp", "timestamp_tz"]: + return "TIMESTAMP" + elif type_str == "date": + return "DATE" + elif type_str == "timestamp_ntz": + return "TIME" + elif type_str.lower() in ["number", "decimal", "numeric"]: + return "NUMERIC" + elif type_str == "double": + return 
"BIGNUMERIC" + elif type_str.lower() in ["object", "record", "array"]: + return "RECORD" + elif type_str == "struct": + return "STRUCT" + elif type_str == "null": + logger.info(f"Can't properly map {field_name} to bigquery Schema, as 'null' is not supported as a type. Mapping it to STRING.") + return "STRING" + else: + raise DataContractException( + type="schema", + result="failed", + name="Map datacontract type to bigquery data type", + reason=f"Unsupported type {type_str} in data contract definition.", + engine="datacontract", + ) + \ No newline at end of file diff --git a/tests/fixtures/bigquery/export/bq_table_schema.json b/tests/fixtures/bigquery/export/bq_table_schema.json new file mode 100644 index 00000000..2f507c07 --- /dev/null +++ b/tests/fixtures/bigquery/export/bq_table_schema.json @@ -0,0 +1,273 @@ +{ + "kind": "bigquery#table", + "tableReference": { + "datasetId": "datacontract_cli_test_dataset", + "projectId": "datameshexample-product", + "tableId": "BQ_Example" + }, + "description": "This is a test contract containing all Datacontracts data types to check conversion to Bigquery", + "schema": { + "fields": [ + { + "name": "string_field", + "type": "STRING", + "mode": "NULLABLE", + "description": "a simple nullable string field", + "maxLength": null + }, + { + "name": "required_string_field", + "type": "STRING", + "mode": "REQUIRED", + "description": "a simple non-nullable string field", + "maxLength": null + }, + { + "name": "maxlength_string_field", + "type": "STRING", + "mode": "NULLABLE", + "description": "a string field with a maximum length", + "maxLength": 42 + }, + { + "name": "maxlength_required_string_field", + "type": "STRING", + "mode": "REQUIRED", + "description": "a required string field with a maximum length", + "maxLength": 42 + }, + { + "name": "varchar_field", + "type": "STRING", + "mode": "NULLABLE", + "description": "This is declared as varchar but should map to STRING", + "maxLength": null + }, + { + "name": "text_field", + "type": "STRING", + "mode": "NULLABLE", + "description": "This is declared as text but should map to STRING and allow a maximum length", + "maxLength": 42 + }, + { + "name": "bytes_field", + "type": "BYTES", + "mode": "REQUIRED", + "description": "a required bytes field" + }, + { + "name": "int_field", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "a simple int field" + }, + { + "name": "integer_field", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "a simple integer field" + }, + { + "name": "long_field", + "type": "INT64", + "mode": "NULLABLE", + "description": "a simple long field" + }, + { + "name": "bigint_field", + "type": "INT64", + "mode": "NULLABLE", + "description": "a simple bigint field" + }, + { + "name": "float_field", + "type": "FLOAT", + "mode": "NULLABLE", + "description": "a simple float field" + }, + { + "name": "boolean_field", + "type": "BOOLEAN", + "mode": "NULLABLE", + "description": "a simple boolean field" + }, + { + "name": "timestamp_field", + "type": "TIMESTAMP", + "mode": "NULLABLE", + "description": "a simple timestamp field" + }, + { + "name": "timestamp_tz_field", + "type": "TIMESTAMP", + "mode": "NULLABLE", + "description": "a simple timestamp_tz field" + }, + { + "name": "timestamp_ntz_field", + "type": "TIME", + "mode": "NULLABLE", + "description": "a simple timestamp_ntz field" + }, + { + "name": "date_field", + "type": "DATE", + "mode": "NULLABLE", + "description": "a simple date field" + }, + { + "name": "number_field", + "type": "NUMERIC", + "mode": "NULLABLE", + 
"description": "a simple number field with precision 5 and scale 3", + "precision": 5, + "scale": 3 + }, + { + "name": "decimal_field", + "type": "NUMERIC", + "mode": "NULLABLE", + "description": "a simple decimal field with precision 5 and scale 3", + "precision": 5, + "scale": 3 + }, + { + "name": "numeric_field", + "type": "NUMERIC", + "mode": "NULLABLE", + "description": "a simple numeric field with precision 5 and scale 3", + "precision": 5, + "scale": 3 + }, + { + "name": "double_field", + "type": "BIGNUMERIC", + "mode": "NULLABLE", + "description": "a simple double field with precision 5 and scale 3", + "precision": 5, + "scale": 3 + }, + { + "name": "null_field", + "type": "STRING", + "mode": "NULLABLE", + "description": "a null field that should get mapped to STRING", + "maxLength": null + }, + { + "name": "object_field", + "type": "RECORD", + "mode": "NULLABLE", + "description": "an object field with two subfields", + "fields": [ + { + "name": "subfield_1", + "type": "STRING", + "mode": "REQUIRED", + "description": "a required string field", + "maxLength": null + }, + { + "name": "subfield_2", + "type": "STRING", + "mode": "NULLABLE", + "description": "a non required string field", + "maxLength": null + } + ] + }, + { + "name": "record_field", + "type": "RECORD", + "mode": "NULLABLE", + "description": "an record field with two subfields", + "fields": [ + { + "name": "subfield_1", + "type": "BOOLEAN", + "mode": "REQUIRED", + "description": "a required boolean field" + }, + { + "name": "subfield_2", + "type": "DATE", + "mode": "NULLABLE", + "description": "a non required date field" + } + ] + }, + { + "name": "struct_field", + "type": "STRUCT", + "mode": "NULLABLE", + "description": "an struct field with two subfields", + "fields": [ + { + "name": "subfield_1", + "type": "BYTES", + "mode": "REQUIRED", + "description": "a required bytes field" + }, + { + "name": "subfield_2", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "a non required int field" + } + ] + }, + { + "name": "string_array_field", + "type": "RECORD", + "mode": "REPEATED", + "description": "a string array", + "fields": [ + { + "name": "string_array_field_1", + "type": "STRING", + "mode": "NULLABLE", + "description": "", + "maxLength": null + } + ] + }, + { + "name": "int_array_field", + "type": "RECORD", + "mode": "REPEATED", + "description": "an int array", + "fields": [ + { + "name": "int_array_field_1", + "type": "INTEGER", + "mode": "NULLABLE", + "description": "" + } + ] + }, + { + "name": "complex_array_field", + "type": "RECORD", + "mode": "REPEATED", + "description": "an array of objects that has multiple fields that should carry through", + "fields": [ + { + "name": "Field1", + "type": "BOOLEAN", + "mode": "NULLABLE", + "description": "a boolean field" + }, + { + "name": "Field2", + "type": "BIGNUMERIC", + "mode": "REQUIRED", + "description": "a double field", + "precision": null, + "scale": null + } + ] + } + ] + } +} \ No newline at end of file diff --git a/tests/fixtures/bigquery/export/datacontract.yaml b/tests/fixtures/bigquery/export/datacontract.yaml new file mode 100644 index 00000000..a03a6359 --- /dev/null +++ b/tests/fixtures/bigquery/export/datacontract.yaml @@ -0,0 +1,183 @@ +dataContractSpecification: 0.9.3 +id: my-data-contract-id +info: + title: My Data Contract + version: 0.0.1 +servers: + bigquery: + type: bigquery + project: datameshexample-product + dataset: datacontract_cli_test_dataset + dataProductId: my-dataproduct + outputPortId: bigquery +models: + BQ_Example: + 
description: This is a test contract containing all Datacontracts data types to check conversion to Bigquery + type: table + fields: + string_field: + type: string + required: false + description: a simple nullable string field + required_string_field: + type: string + required: true + description: a simple non-nullable string field + maxlength_string_field: + type: string + required: false + description: a string field with a maximum length + maxLength: 42 + maxlength_required_string_field: + type: string + required: true + description: a required string field with a maximum length + maxLength: 42 + varchar_field: + type: varchar + required: false + description: This is declared as varchar but should map to STRING + text_field: + type: text + required: false + description: This is declared as text but should map to STRING and allow a maximum length + maxLength: 42 + bytes_field: + type: bytes + required: true + description: a required bytes field + maxLength: 42 + int_field: + type: int + required: false + description: a simple int field + integer_field: + type: integer + required: false + description: a simple integer field + long_field: + type: long + required: false + description: a simple long field + bigint_field: + type: bigint + required: false + description: a simple bigint field + float_field: + type: float + required: false + description: a simple float field + boolean_field: + type: boolean + required: false + description: a simple boolean field + timestamp_field: + type: timestamp + required: false + description: a simple timestamp field + timestamp_tz_field: + type: timestamp_tz + required: false + description: a simple timestamp_tz field + timestamp_ntz_field: + type: timestamp_ntz + required: false + description: a simple timestamp_ntz field + date_field: + type: date + required: false + description: a simple date field + number_field: + type: number + required: false + description: a simple number field with precision 5 and scale 3 + precision: 5 + scale: 3 + decimal_field: + type: decimal + required: false + description: a simple decimal field with precision 5 and scale 3 + precision: 5 + scale: 3 + numeric_field: + type: numeric + required: false + description: a simple numeric field with precision 5 and scale 3 + precision: 5 + scale: 3 + double_field: + type: double + required: false + description: a simple double field with precision 5 and scale 3 + precision: 5 + scale: 3 + null_field: + type: "null" + required: false + description: a null field that should get mapped to STRING + object_field: + type: object + required: false + description: an object field with two subfields + fields: + subfield_1: + type: string + required: true + description: a required string field + subfield_2: + type: string + required: false + description: a non required string field + record_field: + type: record + required: false + description: an record field with two subfields + fields: + subfield_1: + type: boolean + required: true + description: a required boolean field + subfield_2: + type: date + required: false + description: a non required date field + struct_field: + type: struct + required: false + description: an struct field with two subfields + fields: + subfield_1: + type: bytes + required: true + description: a required bytes field + subfield_2: + type: int + required: false + description: a non required int field + string_array_field: + type: array + required: false + description: a string array + items: + type: string + int_array_field: + type: array + required: false + 
description: an int array + items: + type: int + complex_array_field: + type: array + required: false + description: an array of objects that has multiple fields that should carry through + items: + type: object + fields: + Field1: + type: boolean + required: false + description: a boolean field + Field2: + type: double + required: true + description: a double field \ No newline at end of file diff --git a/tests/fixtures/bigquery/import/datacontract.yaml b/tests/fixtures/bigquery/import/datacontract.yaml index a3b638ae..83c738da 100644 --- a/tests/fixtures/bigquery/import/datacontract.yaml +++ b/tests/fixtures/bigquery/import/datacontract.yaml @@ -96,4 +96,4 @@ models: JSON_Field: type: object required: false - description: a json field \ No newline at end of file + description: a json field diff --git a/tests/test_export_bigquery.py b/tests/test_export_bigquery.py new file mode 100644 index 00000000..726dd784 --- /dev/null +++ b/tests/test_export_bigquery.py @@ -0,0 +1,39 @@ +import logging + +import json +from typer.testing import CliRunner + +from datacontract.cli import app +from datacontract.data_contract import DataContract + +logging.basicConfig(level=logging.DEBUG, force=True) + + +def test_cli(): + runner = CliRunner() + result = runner.invoke( + app, + [ + "export", + "--format", + "bigquery", + "--server", + "bigquery", + "fixtures/bigquery/export/datacontract.yaml", + ], + ) + assert result.exit_code == 0 + + +def test_exports_bigquery_schema(): + data_contract_file: str = "fixtures/bigquery/export/datacontract.yaml" + with open(data_contract_file) as file: + file_content = file.read() + data_contract = DataContract(data_contract_str=file_content, server="bigquery") + assert data_contract.lint(enabled_linters="none").has_passed() + result = data_contract.export("bigquery") + + print("Result:\n", result) + with open("fixtures/bigquery/export/bq_table_schema.json") as file: + expected = file.read() + assert json.loads(result) == json.loads(expected)
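
---

Reviewer note: the CLI path exercised by `test_cli` above is equivalent to running `datacontract export --format bigquery --server bigquery fixtures/bigquery/export/datacontract.yaml` from the `tests` directory. The snippet below is a minimal sketch of the programmatic path covered by `test_exports_bigquery_schema`; it only uses the `DataContract` constructor arguments and `export()` call that appear in this patch, and the fixture path is assumed to resolve relative to `tests/`.

```python
# Minimal usage sketch, mirroring tests/test_export_bigquery.py from this patch.
# Assumes the working directory is tests/ so the fixture path resolves.
from datacontract.data_contract import DataContract

with open("fixtures/bigquery/export/datacontract.yaml") as file:
    contract_yaml = file.read()

# The server argument must name a server of type "bigquery" in the contract;
# the exporter reads its project and dataset for the tableReference block.
data_contract = DataContract(data_contract_str=contract_yaml, server="bigquery")

# With the default model="all", the contract must contain exactly one model;
# for multi-model contracts, pass the model key (e.g. model="BQ_Example").
schema_json = data_contract.export("bigquery")

# schema_json is a bigquery#table JSON string, as produced by to_bigquery_json.
print(schema_json)
```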