Skip to content

Commit

Permalink
Convert DuckDB Types explicitly.
Browse files Browse the repository at this point in the history
  • Loading branch information
jochenchrist committed May 9, 2024
1 parent 95784cd commit ea3c05a
Show file tree
Hide file tree
Showing 4 changed files with 81 additions and 1 deletion.
38 changes: 38 additions & 0 deletions datacontract/export/sql_type_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ def convert_to_sql_type(field: Field, server_type: str) -> str:
return convert_type_to_postgres(field)
if server_type == "databricks":
return convert_to_databricks(field)
if server_type == "local" or server_type == "s3":
return convert_to_duckdb(field)
return field.type


Expand Down Expand Up @@ -129,3 +131,39 @@ def convert_to_databricks(field) -> None | str:
if type.lower() in ["array"]:
return "ARRAY"
return None


def convert_to_duckdb(field) -> None | str:
type = field.type
if type is None:
return None
if type.lower() in ["string", "varchar", "text"]:
return "VARCHAR" # aliases: VARCHAR, CHAR, BPCHAR, STRING, TEXT, VARCHAR(n) STRING(n), TEXT(n)
if type.lower() in ["timestamp", "timestamp_tz"]:
return "TIMESTAMP WITH TIME ZONE" # aliases: TIMESTAMPTZ
if type.lower() in ["timestamp_ntz"]:
return "DATETIME" # timestamp with microsecond precision (ignores time zone), aliases: TIMESTAMP
if type.lower() in ["date"]:
return "DATE"
if type.lower() in ["time"]:
return "TIME" # TIME WITHOUT TIME ZONE
if type.lower() in ["number", "decimal", "numeric"]:
# precision and scale not supported by data contract
return "DECIMAL"
if type.lower() in ["float"]:
return "FLOAT"
if type.lower() in ["double"]:
return "DOUBLE"
if type.lower() in ["integer", "int"]:
return "INT"
if type.lower() in ["long", "bigint"]:
return "BIGINT"
if type.lower() in ["boolean"]:
return "BOOLEAN"
if type.lower() in ["object", "record", "struct"]:
return "STRUCT"
if type.lower() in ["bytes"]:
return "BLOB"
if type.lower() in ["array"]:
return "ARRAY"
return None
Binary file added tests/fixtures/parquet/data/timestamp.parquet
Binary file not shown.
33 changes: 33 additions & 0 deletions tests/fixtures/parquet/datacontract_timestamp.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Test fixture: a data contract whose "test" server reads a local parquet file
# (timestamp.parquet) and whose single model declares a timestamp_tz column.
dataContractSpecification: 0.9.3
id: iceberg-ingestion
info:
title: ingestion to s3/iceberg
version: 0.0.1
description: The ingestion of parquet files from s3 into iceberg table format
servers:
test:
# Local file server; the path is relative and points at the parquet fixture.
type: local
path: "./fixtures/parquet/data/timestamp.parquet"
format: parquet
models:
complaintcost_c:
description: complaintcost_c
type: table
fields:
id:
type: varchar
required: true
primary: true
description: ID
isdeleted:
type: boolean
description: ISDELETED
required: true
name:
type: varchar
description: NAME_C
required: true
createddate:
# The only timestamp-typed field in this contract.
type: timestamp_tz
description: CREATEDDATE
required: true
11 changes: 10 additions & 1 deletion tests/test_test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,4 +38,13 @@ def test_invalid():
assert run.result == "failed"
assert len(run.checks) == 6
assert any(check.result == "failed" for check in run.checks)
assert any(check.reason == "Type Mismatch, Expected Type: date; Actual Type: varchar" for check in run.checks)
assert any(check.reason == "Type Mismatch, Expected Type: DATE; Actual Type: varchar" for check in run.checks)


def test_timestamp():
    """Test the timestamp parquet data contract and expect the run to pass."""
    contract = DataContract(data_contract_file="fixtures/parquet/datacontract_timestamp.yaml")
    outcome = contract.test()
    # Emit the formatted run report before asserting on the overall result.
    print(outcome.pretty())
    assert outcome.result == "passed"

0 comments on commit ea3c05a

Please sign in to comment.