diff --git a/datacontract/export/sql_type_converter.py b/datacontract/export/sql_type_converter.py index 877f99fa..b7326298 100644 --- a/datacontract/export/sql_type_converter.py +++ b/datacontract/export/sql_type_converter.py @@ -8,6 +8,8 @@ def convert_to_sql_type(field: Field, server_type: str) -> str: return convert_type_to_postgres(field) if server_type == "databricks": return convert_to_databricks(field) + if server_type == "local" or server_type == "s3": + return convert_to_duckdb(field) return field.type @@ -129,3 +131,39 @@ def convert_to_databricks(field) -> None | str: if type.lower() in ["array"]: return "ARRAY" return None + + +def convert_to_duckdb(field) -> None | str: + type = field.type + if type is None: + return None + if type.lower() in ["string", "varchar", "text"]: + return "VARCHAR" # aliases: VARCHAR, CHAR, BPCHAR, STRING, TEXT, VARCHAR(n) STRING(n), TEXT(n) + if type.lower() in ["timestamp", "timestamp_tz"]: + return "TIMESTAMP WITH TIME ZONE" # aliases: TIMESTAMPTZ + if type.lower() in ["timestamp_ntz"]: + return "DATETIME" # timestamp with microsecond precision (ignores time zone), aliases: TIMESTAMP + if type.lower() in ["date"]: + return "DATE" + if type.lower() in ["time"]: + return "TIME" # TIME WITHOUT TIME ZONE + if type.lower() in ["number", "decimal", "numeric"]: + # precision and scale not supported by data contract + return "DECIMAL" + if type.lower() in ["float"]: + return "FLOAT" + if type.lower() in ["double"]: + return "DOUBLE" + if type.lower() in ["integer", "int"]: + return "INT" + if type.lower() in ["long", "bigint"]: + return "BIGINT" + if type.lower() in ["boolean"]: + return "BOOLEAN" + if type.lower() in ["object", "record", "struct"]: + return "STRUCT" + if type.lower() in ["bytes"]: + return "BLOB" + if type.lower() in ["array"]: + return "ARRAY" + return None diff --git a/tests/fixtures/parquet/data/timestamp.parquet b/tests/fixtures/parquet/data/timestamp.parquet new file mode 100644 index 00000000..570bb6b9 
Binary files /dev/null and b/tests/fixtures/parquet/data/timestamp.parquet differ diff --git a/tests/fixtures/parquet/datacontract_timestamp.yaml b/tests/fixtures/parquet/datacontract_timestamp.yaml new file mode 100644 index 00000000..9f053b67 --- /dev/null +++ b/tests/fixtures/parquet/datacontract_timestamp.yaml @@ -0,0 +1,33 @@ +dataContractSpecification: 0.9.3 +id: iceberg-ingestion +info: + title: ingestion to s3/iceberg + version: 0.0.1 + description: The ingestion of parquet files from s3 into iceberg table format +servers: + test: + type: local + path: "./fixtures/parquet/data/timestamp.parquet" + format: parquet +models: + complaintcost_c: + description: complaintcost_c + type: table + fields: + id: + type: varchar + required: true + primary: true + description: ID + isdeleted: + type: boolean + description: ISDELETED + required: true + name: + type: varchar + description: NAME_C + required: true + createddate: + type: timestamp_tz + description: CREATEDDATE + required: true diff --git a/tests/test_test_parquet.py b/tests/test_test_parquet.py index 09ef9afc..966a742c 100644 --- a/tests/test_test_parquet.py +++ b/tests/test_test_parquet.py @@ -38,4 +38,13 @@ def test_invalid(): assert run.result == "failed" assert len(run.checks) == 6 assert any(check.result == "failed" for check in run.checks) - assert any(check.reason == "Type Mismatch, Expected Type: date; Actual Type: varchar" for check in run.checks) + assert any(check.reason == "Type Mismatch, Expected Type: DATE; Actual Type: varchar" for check in run.checks) + + +def test_timestamp(): + data_contract = DataContract( + data_contract_file="fixtures/parquet/datacontract_timestamp.yaml", + ) + run = data_contract.test() + print(run.pretty()) + assert run.result == "passed"