From 702ba503f1fa2b11dd9acd5feabbb35277a9b546 Mon Sep 17 00:00:00 2001 From: Robert DeRienzo Date: Fri, 24 May 2024 06:47:22 -0400 Subject: [PATCH] [196] Add support for MSSQL (#204) * [196] Add support for MSSQL * [196] Correct soda config serializer * [196] Move requirements for MSSql into dev reqs * [196] Correct usage of regex in datacontract to be pattern instead * [196] Correct pyproject toml * [196] Formalize support for SQLServer - Add required packages - Add required connection details - Semi complete tests * feat(export/jsonschema): supports array type (#200) * Support logical Types in Avro Export (#199) * Support logical Types in Avro Export - Map Datacontracts date-type fields to avro logical types - date: `int/date` - timestamp, timestamp_tz: `long/timestamp-millis` - timestamp_ntz: `long/local-timestamp-millis` * Update CHANGELOG * Add ability to export to go types (#195) * Add ability to export to go types * add test * rename to types * updated naming * update docs * Update boto3 requirement from <1.34.99,>=1.34.41 to >=1.34.41,<1.34.104 (#189) Updates the requirements on [boto3](https://github.com/boto/boto3) to permit the latest version. - [Release notes](https://github.com/boto/boto3/releases) - [Changelog](https://github.com/boto/boto3/blob/develop/CHANGELOG.rst) - [Commits](https://github.com/boto/boto3/compare/1.34.41...1.34.103) --- updated-dependencies: - dependency-name: boto3 dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> * Update botocore requirement (#190) Updates the requirements on [botocore](https://github.com/boto/botocore) to permit the latest version. - [Changelog](https://github.com/boto/botocore/blob/develop/CHANGELOG.rst) - [Commits](https://github.com/boto/botocore/compare/1.34.41...1.34.103) --- updated-dependencies: - dependency-name: botocore dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: jochenchrist * Update snowflake-connector-python[pandas] requirement (#172) Updates the requirements on [snowflake-connector-python[pandas]](https://github.com/snowflakedb/snowflake-connector-python) to permit the latest version. - [Release notes](https://github.com/snowflakedb/snowflake-connector-python/releases) - [Commits](https://github.com/snowflakedb/snowflake-connector-python/compare/v3.6.0...v3.10.0) --- updated-dependencies: - dependency-name: snowflake-connector-python[pandas] dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> * 91 JSON Schema (#201) * add import for jsonschemas and extend export * fix tests * remove unused import * Update changelog * First support for server-specific types in a config map. Resolves #150 * Issues/193 fix all todos in html export (#203) * Make Examples in Fields work - we have to have them declared in the model to make them show up at all :) * Add Definitions in HTML Export - add examples to the model, so we can render them - create new tables akin to what we do for the models * Add examples * Handle nested fields in HTML Export We just go one level deep but add an additional set of rows for fields contained in a models field. 
* Update CHANGELOG * Update Tests for breaking and changelog Now that we include the `example` property in the field there's more things being pointed out, so adjust the tests accordingly * Handle Model Fields and their nesting through partials - added jinja partials as dependency - extracted the model and the nesting handling out to its own partial * Update definitions - move them to their opwn partial - move enum into the content column - try to highlight the different optional aspects a tad * Move some more blocks into partials * Add partials to manifest * Removew the nested headline --------- Co-authored-by: jochen [196] Formalize support for SQLServer - Add required packages - Add required connection details - Semi complete tests [196] Add SQLServer type serializer * [196] Add msodbcsql18 to docker file * [196] Apply ruff formatting * @simonharrer PR suggestion to make naming more consistent * [196] Add changes to changelog * [196] Update readme with new SQLServer information * [196] Add CI/CD step to install msodbcsql driver * [196] Skip test if outside of CI/CD environment * [196] Add msqsql package back --------- Signed-off-by: dependabot[bot] Co-authored-by: Robert DeRienzo Co-authored-by: jochen Co-authored-by: JAEJIN LEE Co-authored-by: Joachim Praetorius Co-authored-by: Mark Olliver Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: jochenchrist Co-authored-by: Mark Olliver Co-authored-by: Simon Harrer --- .github/workflows/ci.yaml | 4 + CHANGELOG.md | 11 +- Dockerfile | 2 +- README.md | 118 ++++++++++++------ datacontract/data_contract.py | 4 +- .../engines/soda/check_soda_execute.py | 5 + .../engines/soda/connections/sqlserver.py | 41 ++++++ datacontract/export/sql_type_converter.py | 68 +++++++++- .../model/data_contract_specification.py | 1 + pyproject.toml | 3 + tests/fixtures/bigquery/datacontract.yaml | 2 +- .../postgres-export/datacontract.yaml | 2 +- tests/fixtures/postgres/datacontract.yaml | 2 +- tests/fixtures/s3-csv/datacontract.yaml | 2 +- tests/fixtures/sqlserver/data/data.sql | 19 +++ tests/fixtures/sqlserver/datacontract.yaml | 29 +++++ tests/test_test_sqlserver.py | 67 ++++++++++ 17 files changed, 322 insertions(+), 58 deletions(-) create mode 100644 datacontract/engines/soda/connections/sqlserver.py create mode 100644 tests/fixtures/sqlserver/data/data.sql create mode 100644 tests/fixtures/sqlserver/datacontract.yaml create mode 100644 tests/test_test_sqlserver.py diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index b5bfaedc..b4ead14c 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -28,6 +28,10 @@ jobs: uses: actions/setup-python@v5 with: python-version: ${{matrix.python-version}} + - name: Install msodbcsql18 + run: | + sudo apt-get update + sudo apt-get install -y msodbcsql18 - name: Install dependencies run: | python -m pip install --upgrade pip diff --git a/CHANGELOG.md b/CHANGELOG.md index 3d35bc03..6e59ba09 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added + - Added support for `sqlserver` (#196) - `datacontract export --format dbml`: Export to [Database Markup Language (DBML)](https://dbml.dbdiagram.io/home/) (#135) @@ -43,7 +44,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added support for `delta` tables on S3 (#24) - Added new command `datacontract catalog` that generates a data contract catalog 
with an `index.html` file. - Added field format information to HTML export - + ### Fixed - RDF Export: Fix error if owner is not a URI/URN @@ -70,13 +71,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added export format **html** (#15) - Added descriptions as comments to `datacontract export --format sql` for Databricks dialects -- Added import of arrays in Avro import +- Added import of arrays in Avro import ## [0.9.8] - 2024-04-01 ### Added - -- Added export format **great-expectations**: `datacontract export --format great-expectations` + +- Added export format **great-expectations**: `datacontract export --format great-expectations` - Added gRPC support to OpenTelemetry integration for publishing test results - Added AVRO import support for namespace (#121) - Added handling for optional fields in avro import (#112) @@ -158,7 +159,7 @@ We start with JSON messages and avro, and Protobuf will follow. ## [0.9.0] - 2024-01-26 - BREAKING This is a breaking change (we are still on a 0.x.x version). -The project migrated from Golang to Python. +The project migrated from Golang to Python. The Golang version can be found at [cli-go](https://github.com/datacontract/cli-go) ### Added diff --git a/Dockerfile b/Dockerfile index 34af75d6..27a56e5f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -22,7 +22,7 @@ RUN python -c "import duckdb; duckdb.connect().sql(\"INSTALL httpfs\");" FROM ubuntu:22.04 AS runner-image -RUN apt-get update && apt-get install --no-install-recommends -y python3.11 python3.11-venv && \ +RUN apt-get update && apt-get install --no-install-recommends -y python3.11 python3.11-venv msodbcsql18 && \ apt-get clean && rm -rf /var/lib/apt/lists/* COPY --from=builder-image /opt/venv /opt/venv diff --git a/README.md b/README.md index 9c4c4960..ab226523 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ It uses data contract YAML files to lint the data contract, connect to data sour ## Getting started -Let's look at this data contract: +Let's look at this data contract: [https://datacontract.com/examples/orders-latest/datacontract.yaml](https://datacontract.com/examples/orders-latest/datacontract.yaml) We have a _servers_ section with endpoint details to the S3 bucket, _models_ for the structure of the data, _servicelevels_ and _quality_ attributes that describe the expected freshness and number of rows. @@ -191,11 +191,11 @@ Commands ### init -``` - Usage: datacontract init [OPTIONS] [LOCATION] - - Download a datacontract.yaml template and write it to file. - +``` + Usage: datacontract init [OPTIONS] [LOCATION] + + Download a datacontract.yaml template and write it to file. + ╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────╮ │ location [LOCATION] The location (url or path) of the data contract yaml to create. │ │ [default: datacontract.yaml] │ @@ -213,10 +213,10 @@ Commands ### lint ``` - Usage: datacontract lint [OPTIONS] [LOCATION] - - Validate that the datacontract.yaml is correctly formatted. - + Usage: datacontract lint [OPTIONS] [LOCATION] + + Validate that the datacontract.yaml is correctly formatted. + ╭─ Arguments ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ │ location [LOCATION] The location (url or path) of the data contract yaml. 
                              [default: datacontract.yaml]                                                                         │
 ╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
@@ -230,10 +230,10 @@ Commands
 ### test
 
 ```
- Usage: datacontract test [OPTIONS] [LOCATION]
-
- Run schema and quality tests on configured servers.
-
+ Usage: datacontract test [OPTIONS] [LOCATION]
+
+ Run schema and quality tests on configured servers.
+
 ╭─ Arguments ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
 │   location      [LOCATION]  The location (url or path) of the data contract yaml. [default: datacontract.yaml]                   │
 ╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
@@ -266,11 +266,11 @@ Data Contract CLI connects to a data source and runs schema and quality tests to
 $ datacontract test --server production datacontract.yaml
 ```
-To connect to the databases the `server` block in the datacontract.yaml is used to set up the connection.
+To connect to the databases, the `server` block in the datacontract.yaml is used to set up the connection.
 In addition, credentials, such as username and passwords, may be defined with environment variables.
 
 The application uses different engines, based on the server `type`.
-Internally, it connects with DuckDB, Spark, or a native connection and executes the most tests with _soda-core_ and _fastjsonschema_.
+Internally, it connects with DuckDB, Spark, or a native connection and executes most tests with _soda-core_ and _fastjsonschema_.
 
 Credentials are provided with environment variables.
@@ -456,7 +456,7 @@ dbutils.library.restartPython()
 from datacontract.data_contract import DataContract
 
 data_contract = DataContract(
-  data_contract_file="/Volumes/acme_catalog_prod/orders_latest/datacontract/datacontract.yaml",
+  data_contract_file="/Volumes/acme_catalog_prod/orders_latest/datacontract/datacontract.yaml",
   spark=spark)
 run = data_contract.test()
 run.result
@@ -481,7 +481,7 @@ servers:
 models:
   my_table_1: # corresponds to a table
     type: table
-    fields:
+    fields:
       my_column_1: # corresponds to a column
         type: varchar
 ```
@@ -539,7 +539,7 @@ servers:
 models:
   my_table_1: # corresponds to a table
     type: table
-    fields:
+    fields:
       my_column_1: # corresponds to a column
         type: varchar
 ```
@@ -553,9 +553,47 @@ models:
+
+### SQL Server
+
+Data Contract CLI can test data in Microsoft SQL Server.
+
+#### Example
+
+datacontract.yaml
+```yaml
+servers:
+  sqlserver:
+    type: sqlserver
+    host: localhost
+    port: 1433
+    database: tempdb
+    schema: dbo
+    driver: ODBC Driver 18 for SQL Server
+models:
+  my_table_1: # corresponds to a table
+    type: table
+    fields:
+      my_column_1: # corresponds to a column
+        type: varchar
+```
+
+#### Environment Variables
+
+| Environment Variable                              | Example | Description                                   |
+|---------------------------------------------------|---------|-----------------------------------------------|
+| `DATACONTRACT_SQLSERVER_USERNAME`                 | `root`  | Username                                      |
+| `DATACONTRACT_SQLSERVER_PASSWORD`                 | `toor`  | Password                                      |
+| `DATACONTRACT_SQLSERVER_TRUSTED_CONNECTION`       | `True`  | Use Windows authentication instead of a login |
+| `DATACONTRACT_SQLSERVER_TRUST_SERVER_CERTIFICATE` | `True`  | Trust self-signed certificate                 |
+| `DATACONTRACT_SQLSERVER_ENCRYPTED_CONNECTION`     | `True`  | Use SSL                                       |
+
+
+
 ### export
 
 ```
+ Usage: datacontract export [OPTIONS] [LOCATION]
 
 Convert data contract to a specific format. Prints to stdout or to the specified output file.
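For the `sqlserver` server type added above, the same test flow can be driven from Python, mirroring the integration test added later in this patch. The snippet below is a minimal sketch rather than part of the change set: it assumes a reachable SQL Server instance, a `datacontract.yaml` whose only server is the `sqlserver` block from the example, and placeholder credentials.

```python
import os

from datacontract.data_contract import DataContract

# Credentials are read from environment variables by the sqlserver connection
# (see the table above); the values below are placeholders.
os.environ["DATACONTRACT_SQLSERVER_USERNAME"] = "sa"
os.environ["DATACONTRACT_SQLSERVER_PASSWORD"] = "change-me"
os.environ["DATACONTRACT_SQLSERVER_TRUST_SERVER_CERTIFICATE"] = "True"

# With a single sqlserver server defined in the contract, test() runs the
# schema and quality checks against it via soda-core-sqlserver.
data_contract = DataContract(data_contract_file="datacontract.yaml")
run = data_contract.test()
print(run.result)  # "passed" when all checks succeed
```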
@@ -599,9 +637,9 @@ Available export options: | Type | Description | Status | |----------------------|---------------------------------------------------------|--------| -| `html` | Export to HTML | ✅ | -| `jsonschema` | Export to JSON Schema | ✅ | -| `odcs` | Export to Open Data Contract Standard (ODCS) | ✅ | +| `html` | Export to HTML | ✅ | +| `jsonschema` | Export to JSON Schema | ✅ | +| `odcs` | Export to Open Data Contract Standard (ODCS) | ✅ | | `sodacl` | Export to SodaCL quality checks in YAML format | ✅ | | `dbt` | Export to dbt models in YAML format | ✅ | | `dbt-sources` | Export to dbt sources in YAML format | ✅ | @@ -621,11 +659,11 @@ Available export options: #### Great Expectations -The export function transforms a specified data contract into a comprehensive Great Expectations JSON suite. +The export function transforms a specified data contract into a comprehensive Great Expectations JSON suite. If the contract includes multiple models, you need to specify the names of the model you wish to export. ```shell -datacontract export datacontract.yaml --format great-expectations --model orders +datacontract export datacontract.yaml --format great-expectations --model orders ``` The export creates a list of expectations by utilizing: @@ -635,7 +673,7 @@ The export creates a list of expectations by utilizing: #### RDF -The export function converts a given data contract into a RDF representation. You have the option to +The export function converts a given data contract into a RDF representation. You have the option to add a base_url which will be used as the default prefix to resolve relative IRIs inside the document. ```shell @@ -688,7 +726,7 @@ In this case there's no need to specify `source` but instead `bt-project-id`, `b For providing authentication to the Client, please see [the google documentation](https://cloud.google.com/docs/authentication/provide-credentials-adc#how-to) or the one [about authorizing client libraries](https://cloud.google.com/bigquery/docs/authentication#client-libs). -Example: +Example: ```bash # Example import from SQL DDL datacontract import --format sql --source my_ddl.sql @@ -722,10 +760,10 @@ Available import options: ### breaking ``` - Usage: datacontract breaking [OPTIONS] LOCATION_OLD LOCATION_NEW - - Identifies breaking changes between data contracts. Prints to stdout. - + Usage: datacontract breaking [OPTIONS] LOCATION_OLD LOCATION_NEW + + Identifies breaking changes between data contracts. Prints to stdout. + ╭─ Arguments ───────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ │ * location_old TEXT The location (url or path) of the old data contract yaml. [default: None] [required] │ │ * location_new TEXT The location (url or path) of the new data contract yaml. [default: None] [required] │ @@ -738,10 +776,10 @@ Available import options: ### changelog ``` - Usage: datacontract changelog [OPTIONS] LOCATION_OLD LOCATION_NEW - - Generate a changelog between data contracts. Prints to stdout. - + Usage: datacontract changelog [OPTIONS] LOCATION_OLD LOCATION_NEW + + Generate a changelog between data contracts. Prints to stdout. + ╭─ Arguments ───────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ │ * location_old TEXT The location (url or path) of the old data contract yaml. [default: None] [required] │ │ * location_new TEXT The location (url or path) of the new data contract yaml. 
[default: None] [required] │ @@ -754,10 +792,10 @@ Available import options: ### diff ``` - Usage: datacontract diff [OPTIONS] LOCATION_OLD LOCATION_NEW - - PLACEHOLDER. Currently works as 'changelog' does. - + Usage: datacontract diff [OPTIONS] LOCATION_OLD LOCATION_NEW + + PLACEHOLDER. Currently works as 'changelog' does. + ╭─ Arguments ───────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ │ * location_old TEXT The location (url or path) of the old data contract yaml. [default: None] [required] │ │ * location_new TEXT The location (url or path) of the new data contract yaml. [default: None] [required] │ @@ -889,14 +927,14 @@ Create a data contract based on the requirements from use cases. ```bash $ datacontract init ``` - + 2. Add examples to the `datacontract.yaml`. Do not start with the data model, although you are probably tempted to do that. Examples are the fastest way to get feedback from everybody and not loose someone in the discussion. 3. Create the model based on the examples. Test the model against the examples to double-check whether the model matches the examples. ```bash $ datacontract test --examples ``` - + 4. Add quality checks and additional type constraints one by one to the contract and make sure the examples and the actual data still adheres to the contract. Check against examples for a very fast feedback loop. ```bash $ datacontract test --examples diff --git a/datacontract/data_contract.py b/datacontract/data_contract.py index bc133317..b6fea588 100644 --- a/datacontract/data_contract.py +++ b/datacontract/data_contract.py @@ -391,7 +391,7 @@ def _get_examples_server(self, data_contract, run, tmp_dir): ) run.log_info(f"Using {server} for testing the examples") return server - + def _check_models_for_export(self, data_contract: DataContractSpecification, model: str, export_format: str) -> typing.Tuple[str, str]: if data_contract.models is None: raise RuntimeError(f"Export to {export_format} requires models in the data contract.") @@ -412,7 +412,7 @@ def _check_models_for_export(self, data_contract: DataContractSpecification, mod raise RuntimeError( f"Model {model_name} not found in the data contract. 
Available models: {model_names}" ) - + return model_name, model_value def import_from_source(self, format: str, source: typing.Optional[str] = None, bigquery_tables: typing.Optional[typing.List[str]] = None, bigquery_project: typing.Optional[str] = None, bigquery_dataset: typing.Optional[str] = None) -> DataContractSpecification: diff --git a/datacontract/engines/soda/check_soda_execute.py b/datacontract/engines/soda/check_soda_execute.py index 0e52d9e5..1d2e4a8a 100644 --- a/datacontract/engines/soda/check_soda_execute.py +++ b/datacontract/engines/soda/check_soda_execute.py @@ -9,6 +9,7 @@ from datacontract.engines.soda.connections.kafka import create_spark_session, read_kafka_topic from datacontract.engines.soda.connections.postgres import to_postgres_soda_configuration from datacontract.engines.soda.connections.snowflake import to_snowflake_soda_configuration +from datacontract.engines.soda.connections.sqlserver import to_sqlserver_soda_configuration from datacontract.export.sodacl_converter import to_sodacl_yaml from datacontract.model.data_contract_specification import DataContractSpecification, Server from datacontract.model.run import Run, Check, Log @@ -69,6 +70,10 @@ def check_soda_execute( read_kafka_topic(spark, data_contract, server, tmp_dir) scan.add_spark_session(spark, data_source_name=server.type) scan.set_data_source_name(server.type) + elif server.type == "sqlserver": + soda_configuration_str = to_sqlserver_soda_configuration(server) + scan.add_configuration_yaml_str(soda_configuration_str) + scan.set_data_source_name(server.type) else: run.checks.append( diff --git a/datacontract/engines/soda/connections/sqlserver.py b/datacontract/engines/soda/connections/sqlserver.py new file mode 100644 index 00000000..f4511999 --- /dev/null +++ b/datacontract/engines/soda/connections/sqlserver.py @@ -0,0 +1,41 @@ +import os + +import yaml +from datacontract.model.data_contract_specification import Server + +def to_sqlserver_soda_configuration(server: Server) -> str: + """Serialize server config to soda configuration. 
+ + + ### Example: + type: sqlserver + host: host + port: '1433' + username: simple + password: simple_pass + database: database + schema: dbo + trusted_connection: false + encrypt: false + trust_server_certificate: false + driver: ODBC Driver 18 for SQL Server + """ + # with service account key, using an external json file + soda_configuration = { + f"data_source {server.type}": { + "type": "sqlserver", + "host": server.host, + "port": str(server.port), + "username": os.getenv("DATACONTRACT_SQLSERVER_USERNAME", ''), + "password": os.getenv("DATACONTRACT_SQLSERVER_PASSWORD", ''), + "database": server.database, + "schema": server.schema_, + "trusted_connection": os.getenv("DATACONTRACT_SQLSERVER_TRUSTED_CONNECTION", False), + "trust_server_certificate": os.getenv("DATACONTRACT_SQLSERVER_TRUST_SERVER_CERTIFICATE", False), + "encrypt": os.getenv("DATACONTRACT_SQLSERVER_ENCRYPTED_CONNECTION", True), + "driver": server.driver + } + } + + soda_configuration_str = yaml.dump(soda_configuration) + return soda_configuration_str diff --git a/datacontract/export/sql_type_converter.py b/datacontract/export/sql_type_converter.py index af1a347d..dedb5d79 100644 --- a/datacontract/export/sql_type_converter.py +++ b/datacontract/export/sql_type_converter.py @@ -4,12 +4,14 @@ def convert_to_sql_type(field: Field, server_type: str) -> str: if server_type == "snowflake": return convert_to_snowflake(field) - if server_type == "postgres": + elif server_type == "postgres": return convert_type_to_postgres(field) - if server_type == "databricks": + elif server_type == "databricks": return convert_to_databricks(field) - if server_type == "local" or server_type == "s3": + elif server_type == "local" or server_type == "s3": return convert_to_duckdb(field) + elif server_type == "sqlserver": + return convert_type_to_sqlserver(field) return field.type @@ -103,10 +105,9 @@ def convert_type_to_postgres(field: Field) -> None | str: # databricks data types: # https://docs.databricks.com/en/sql/language-manual/sql-ref-datatypes.html -def convert_to_databricks(field) -> None | str: +def convert_to_databricks(field: Field) -> None | str: if field.config and field.config["databricksType"] is not None: return field.config["databricksType"] - type = field.type if type is None: return None @@ -142,7 +143,7 @@ def convert_to_databricks(field) -> None | str: return None -def convert_to_duckdb(field) -> None | str: +def convert_to_duckdb(field: Field) -> None | str: type = field.type if type is None: return None @@ -176,3 +177,58 @@ def convert_to_duckdb(field) -> None | str: if type.lower() in ["array"]: return "ARRAY" return None + + +def convert_type_to_sqlserver(field: Field) -> None | str: + """Convert from supported datacontract types to equivalent sqlserver types""" + field_type = field.type + if not field_type: + return None + + # If provided sql-server config type, prefer it over default mapping + if sqlserver_type := get_type_config(field, 'sqlserverType'): + return sqlserver_type + + field_type = field_type.lower() + if field_type in ["string", "varchar", "text"]: + if field.format == 'uuid': + return "uniqueidentifier" + return "varchar" + if field_type in ["timestamp", "timestamp_tz"]: + return "datetimeoffset" + if field_type in ["timestamp_ntz"]: + if field.format == "datetime": + return "datetime" + return "datetime2" + if field_type in ["date"]: + return "date" + if field_type in ["time"]: + return "time" + if field_type in ["number", "decimal", "numeric"]: + # precision and scale not supported by data contract + if 
field_type == "number": + return "numeric" + return field_type + if field_type in ["float"]: + return "float" + if field_type in ["double"]: + return "double precision" + if field_type in ["integer", "int", "bigint"]: + return field_type + if field_type in ["long"]: + return "bigint" + if field_type in ["boolean"]: + return "bit" + if field_type in ["object", "record", "struct"]: + return "jsonb" + if field_type in ["bytes"]: + return "binary" + if field_type in ["array"]: + raise NotImplementedError('SQLServer does not support array types.') + return None + +def get_type_config(field: Field, config_attr: str) -> dict[str, str] | None: + """Retrieve type configuration if provided in datacontract.""" + if not field.config: + return None + return field.config.get(config_attr, None) \ No newline at end of file diff --git a/datacontract/model/data_contract_specification.py b/datacontract/model/data_contract_specification.py index 915c3aab..67799879 100644 --- a/datacontract/model/data_contract_specification.py +++ b/datacontract/model/data_contract_specification.py @@ -31,6 +31,7 @@ class Server(pyd.BaseModel): token: str = None # Use ENV variable dataProductId: str = None outputPortId: str = None + driver: str = None class Terms(pyd.BaseModel): diff --git a/pyproject.toml b/pyproject.toml index 7fb34f94..2fbcae52 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,6 +25,7 @@ dependencies = [ "simple-ddl-parser==1.3.0", "soda-core-bigquery>=3.3.1,<3.4.0", "soda-core-duckdb>=3.3.1,<3.4.0", + "soda-core-sqlserver>=3.3.1,<3.4.0", "soda-core-postgres>=3.3.1,<3.4.0", "soda-core-snowflake>=3.3.1,<3.4.0", "soda-core-spark[databricks]>=3.3.1,<3.4.0", @@ -51,12 +52,14 @@ dev = [ "pytest", "pytest-xdist", "moto", + "pymssql==2.3.0", # testcontainers 4.x have issues with Kafka on arm # https://github.com/testcontainers/testcontainers-python/issues/450 "testcontainers<4.0.0", "testcontainers-minio", "testcontainers-postgres", "testcontainers-kafka", + "testcontainers[mssql]", # https://github.com/testcontainers/testcontainers-python/issues/554 # "testcontainers~=4.4.0", # "testcontainers[minio]", diff --git a/tests/fixtures/bigquery/datacontract.yaml b/tests/fixtures/bigquery/datacontract.yaml index 1be151e1..8ec19b4e 100644 --- a/tests/fixtures/bigquery/datacontract.yaml +++ b/tests/fixtures/bigquery/datacontract.yaml @@ -19,7 +19,7 @@ models: type: varchar required: true unique: true - regex: "[A-Za-z]{2}-\\d{3}-[A-Za-z]{2}$" + pattern: "[A-Za-z]{2}-\\d{3}-[A-Za-z]{2}$" field_two: type: int minimum: 10 diff --git a/tests/fixtures/postgres-export/datacontract.yaml b/tests/fixtures/postgres-export/datacontract.yaml index e24ad9fa..57c71429 100644 --- a/tests/fixtures/postgres-export/datacontract.yaml +++ b/tests/fixtures/postgres-export/datacontract.yaml @@ -19,7 +19,7 @@ models: type: varchar required: true unique: true - regex: "[A-Za-z]{2}-\\d{3}-[A-Za-z]{2}$" + pattern: "[A-Za-z]{2}-\\d{3}-[A-Za-z]{2}$" field_two: type: integer minimum: 10 diff --git a/tests/fixtures/postgres/datacontract.yaml b/tests/fixtures/postgres/datacontract.yaml index 23ad434c..a4474ae3 100644 --- a/tests/fixtures/postgres/datacontract.yaml +++ b/tests/fixtures/postgres/datacontract.yaml @@ -19,7 +19,7 @@ models: type: varchar required: true unique: true - regex: "[A-Za-z]{2}-\\d{3}-[A-Za-z]{2}$" + pattern: "[A-Za-z]{2}-\\d{3}-[A-Za-z]{2}$" field_two: type: integer minimum: 10 diff --git a/tests/fixtures/s3-csv/datacontract.yaml b/tests/fixtures/s3-csv/datacontract.yaml index 1d9bae7c..dd33e0c1 100644 --- 
a/tests/fixtures/s3-csv/datacontract.yaml +++ b/tests/fixtures/s3-csv/datacontract.yaml @@ -20,7 +20,7 @@ models: type: varchar required: true unique: true - regex: "[A-Za-z]{2}-\\d{3}-[A-Za-z]{2}$" + pattern: "[A-Za-z]{2}-\\d{3}-[A-Za-z]{2}$" field_two: type: bigint minimum: 10 diff --git a/tests/fixtures/sqlserver/data/data.sql b/tests/fixtures/sqlserver/data/data.sql new file mode 100644 index 00000000..b691d638 --- /dev/null +++ b/tests/fixtures/sqlserver/data/data.sql @@ -0,0 +1,19 @@ +-- Create the table +CREATE TABLE dbo.my_table ( + field_one VARCHAR(10) PRIMARY KEY, + field_two INT NOT NULL, + field_three DATETIME2 +); + +-- Insert the data +INSERT INTO dbo.my_table (field_one, field_two, field_three) VALUES + ('CX-263-DU', 50, '2023-06-16 13:12:56'), + ('IK-894-MN', 47, '2023-10-08 22:40:57'), + ('ER-399-JY', 22, '2023-05-16 01:08:22 '), + ('MT-939-FH', 63, '2023-03-15 05:15:21 '), + ('LV-849-MI', 33, '2023-09-08 20:08:43 '), + ('VS-079-OH', 85, '2023-04-15 00:50:32 '), + ('DN-297-XY', 79, '2023-11-08 12:55:42 '), + ('ZE-172-FP', 14, '2023-12-03 18:38:38 '), + ('ID-840-EG', 89, '2023-10-02 17:17:58 '), + ('FK-230-KZ', 64, '2023-11-27 15:21:48 '); diff --git a/tests/fixtures/sqlserver/datacontract.yaml b/tests/fixtures/sqlserver/datacontract.yaml new file mode 100644 index 00000000..f3086f3b --- /dev/null +++ b/tests/fixtures/sqlserver/datacontract.yaml @@ -0,0 +1,29 @@ +dataContractSpecification: 0.9.2 +id: sqlserver +info: + title: sqlserver + version: 0.0.1 + owner: my-domain-team +servers: + my-dataproduct/sqlserver: + type: sqlserver + host: localhost + port: __PORT__ + database: tempdb + schema: dbo + driver: ODBC Driver 18 for SQL Server +models: + my_table: + type: table + fields: + field_one: + type: varchar + required: true + unique: true + field_two: + type: int + minimum: 10 + field_three: + type: timestamp + config: + sqlserverType: DATETIME2 diff --git a/tests/test_test_sqlserver.py b/tests/test_test_sqlserver.py new file mode 100644 index 00000000..f1b2a429 --- /dev/null +++ b/tests/test_test_sqlserver.py @@ -0,0 +1,67 @@ +import logging +import os + +import pymssql +import pytest +from testcontainers.mssql import SqlServerContainer + +from datacontract.data_contract import DataContract + +logging.basicConfig(level=logging.DEBUG, force=True) + +datacontract = "fixtures/sqlserver/datacontract.yaml" +sql_file_path = "fixtures/sqlserver/data/data.sql" + +sql_server = SqlServerContainer() +SQL_SERVER_PORT: int = 1433 + +@pytest.fixture(scope="module", autouse=True) +def mssql_container(request): + sql_server.start() + + def remove_container(): + sql_server.stop() + + request.addfinalizer(remove_container) + +@pytest.mark.skipif(not os.getenv('CI'), reason="Skipping test outside CI/CD environment") +def test_test_sqlserver(mssql_container, monkeypatch): + monkeypatch.setenv("DATACONTRACT_SQLSERVER_USERNAME", sql_server.SQLSERVER_USER) + monkeypatch.setenv("DATACONTRACT_SQLSERVER_PASSWORD", sql_server.SQLSERVER_PASSWORD) + monkeypatch.setenv("DATACONTRACT_SQLSERVER_TRUST_SERVER_CERTIFICATE", "True") + + _init_sql() + + data_contract_str = _setup_datacontract() + data_contract = DataContract(data_contract_str=data_contract_str) + + run = data_contract.test() + + print(run) + assert run.result == "passed" + assert all(check.result == "passed" for check in run.checks) + + +def _setup_datacontract(): + with open(datacontract) as data_contract_file: + data_contract_str = data_contract_file.read() + port = sql_server.get_exposed_port(SQL_SERVER_PORT) + data_contract_str = 
data_contract_str.replace("__PORT__", port) + return data_contract_str + + +def _init_sql(): + connection = pymssql.connect( + database=sql_server.SQLSERVER_DBNAME, + user=sql_server.SQLSERVER_USER, + password=sql_server.SQLSERVER_PASSWORD, + host=sql_server.get_container_host_ip(), + port=sql_server.get_exposed_port(SQL_SERVER_PORT), + ) + cursor = connection.cursor() + with open(sql_file_path, "r") as sql_file: + sql_commands = sql_file.read() + cursor.execute(sql_commands) + connection.commit() + cursor.close() + connection.close()
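As a usage illustration for the new SQL Server type mapping in `datacontract/export/sql_type_converter.py`, here is a small sketch (not part of the patch) that exercises `convert_to_sql_type` with the `sqlserver` server type. It assumes `Field` from `datacontract.model.data_contract_specification` can be constructed directly with its `type`, `format`, and `config` attributes.

```python
from datacontract.export.sql_type_converter import convert_to_sql_type
from datacontract.model.data_contract_specification import Field

# Default mappings from data contract types to SQL Server types.
print(convert_to_sql_type(Field(type="timestamp"), "sqlserver"))              # datetimeoffset
print(convert_to_sql_type(Field(type="string", format="uuid"), "sqlserver"))  # uniqueidentifier
print(convert_to_sql_type(Field(type="long"), "sqlserver"))                   # bigint

# A field-level sqlserverType config overrides the default mapping, as used
# for the DATETIME2 column in the sqlserver test fixture above.
field = Field(type="timestamp", config={"sqlserverType": "DATETIME2"})
print(convert_to_sql_type(field, "sqlserver"))                                # DATETIME2
```

Per the converter above, array fields are explicitly unsupported for SQL Server and raise `NotImplementedError`.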