From f35f839f584bb27da3f17c052d7cc74fd4d13196 Mon Sep 17 00:00:00 2001 From: jochen Date: Thu, 30 May 2024 07:49:40 +0200 Subject: [PATCH] AVRO export: Logical Types should be nested (#233) --- CHANGELOG.md | 3 ++ datacontract/export/avro_converter.py | 33 ++++++++----------- .../avro/export/datacontract_logicalType.avsc | 11 ++++--- .../avro/export/datacontract_logicalType.yaml | 1 + .../avro/export/orders_with_datefields.avsc | 18 ++++++---- 5 files changed, 37 insertions(+), 29 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a5347f45..1cd58aad 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - Test data contract against dataframes / temporary views (#175) +### Fixed +- AVRO export: Logical Types should be nested (#233) + ## [0.10.6] - 2024-05-29 ### Fixed diff --git a/datacontract/export/avro_converter.py b/datacontract/export/avro_converter.py index a051e14e..89920010 100644 --- a/datacontract/export/avro_converter.py +++ b/datacontract/export/avro_converter.py @@ -34,13 +34,8 @@ def to_avro_field(field, field_name): if field.description is not None: avro_field["doc"] = field.description avro_field["type"] = to_avro_type(field, field_name) - # add logical type definitions for any of the date type fields - if field.type in ["timestamp", "timestamp_tz", "timestamp_ntz", "date"]: - avro_field["logicalType"] = to_avro_logical_type(field.type) if field.config: - if "avroLogicalType" in field.config: - avro_field["logicalType"] = field.config["avroLogicalType"] if "avroDefault" in field.config: avro_field["default"] = field.config["avroDefault"] @@ -48,6 +43,17 @@ def to_avro_field(field, field_name): def to_avro_type(field: Field, field_name: str) -> str | dict: + if field.config: + if "avroLogicalType" in field.config and "avroType" in field.config: + return {"type": field.config["avroType"], "logicalType": field.config["avroLogicalType"]} + if "avroLogicalType" in field.config: + if field.config["avroLogicalType"] in ["timestamp-millis", "timestamp-micros", "local-timestamp-millis", "local-timestamp-micros", "time-micros"]: + return {"type": "long", "logicalType": field.config["avroLogicalType"]} + if field.config["avroLogicalType"] in ["time-millis", "date"]: + return {"type": "int", "logicalType": field.config["avroLogicalType"]} + if "avroType" in field.config: + return field.config["avroLogicalType"] + if field.type is None: return "null" if field.type in ["string", "varchar", "text"]: @@ -64,11 +70,11 @@ def to_avro_type(field: Field, field_name: str) -> str | dict: elif field.type in ["boolean"]: return "boolean" elif field.type in ["timestamp", "timestamp_tz"]: - return "long" + return {"type": "long", "logicalType": "timestamp-millis"} elif field.type in ["timestamp_ntz"]: - return "long" + return {"type": "long", "logicalType": "local-timestamp-millis"} elif field.type in ["date"]: - return "int" + return {"type": "int", "logicalType": "date"} elif field.type in ["time"]: return "long" elif field.type in ["object", "record", "struct"]: @@ -82,14 +88,3 @@ def to_avro_type(field: Field, field_name: str) -> str | dict: return "null" else: return "bytes" - - -def to_avro_logical_type(type: str) -> str: - if type in ["timestamp", "timestamp_tz"]: - return "timestamp-millis" - elif type in ["timestamp_ntz"]: - return "local-timestamp-millis" - elif type in ["date"]: - return "date" - else: - return "" diff --git a/tests/fixtures/avro/export/datacontract_logicalType.avsc b/tests/fixtures/avro/export/datacontract_logicalType.avsc index 5d2b5ebc..501de359 100644 --- a/tests/fixtures/avro/export/datacontract_logicalType.avsc +++ b/tests/fixtures/avro/export/datacontract_logicalType.avsc @@ -7,10 +7,13 @@ {"name": "device_id", "type": "int"}, {"name": "test_value", "type": "double"}, {"name": "num_items", "type": "int"}, - {"name": "processed_timestamp", - "type": "long", - "doc": "The date the event was processed: for more info https://avro.apache.org/docs/current/spec.html#Local+timestamp+%28microsecond+precision%29", - "logicalType": "local-timestamp-micros"}, + {"name": "processed_timestamp", + "type": { + "type": "long", + "logicalType": "local-timestamp-micros" + }, + "doc": "The date the event was processed: for more info https://avro.apache.org/docs/current/spec.html#Local+timestamp+%28microsecond+precision%29" + }, {"name": "description", "type": "string"}, {"name": "is_processed", "type": "boolean", "default": false} diff --git a/tests/fixtures/avro/export/datacontract_logicalType.yaml b/tests/fixtures/avro/export/datacontract_logicalType.yaml index b0d30648..758359db 100644 --- a/tests/fixtures/avro/export/datacontract_logicalType.yaml +++ b/tests/fixtures/avro/export/datacontract_logicalType.yaml @@ -25,6 +25,7 @@ models: required: true description: 'The date the event was processed: for more info https://avro.apache.org/docs/current/spec.html#Local+timestamp+%28microsecond+precision%29' config: + avroType: long avroLogicalType: local-timestamp-micros description: type: string diff --git a/tests/fixtures/avro/export/orders_with_datefields.avsc b/tests/fixtures/avro/export/orders_with_datefields.avsc index f1dea35b..d1447b60 100644 --- a/tests/fixtures/avro/export/orders_with_datefields.avsc +++ b/tests/fixtures/avro/export/orders_with_datefields.avsc @@ -7,18 +7,24 @@ { "name": "orderdate", "doc": "My Field", - "type": "int", - "logicalType": "date" + "type": { + "type": "int", + "logicalType": "date" + } }, { "name": "order_timestamp", - "type": "long", - "logicalType": "timestamp-millis" + "type": { + "type": "long", + "logicalType": "timestamp-millis" + } }, { "name": "delivery_timestamp", - "type": "long", - "logicalType": "local-timestamp-millis" + "type": { + "type": "long", + "logicalType": "local-timestamp-millis" + } }, { "name": "orderid",