diff --git a/CHANGELOG.md b/CHANGELOG.md index 6e59ba09..309aae46 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added support for `sqlserver` (#196) - `datacontract export --format dbml`: Export to [Database Markup Language (DBML)](https://dbml.dbdiagram.io/home/) (#135) +- `datacontract export --format avro`: Now supports config map on field level for logicalTypes and default values [Custom Avro Properties](./README.md#custom-avro-properties) +- `datacontract import --format avro`: Now supports importing logicalType and default definition on avro files [Custom Avro Properties](./README.md#custom-avro-properties) ### Fixed diff --git a/README.md b/README.md index ab226523..6f6de19d 100644 --- a/README.md +++ b/README.md @@ -702,6 +702,47 @@ if a server is selected via the `--server` option (based on the `type` of that s logical data types are exported. +#### Avro + +The export function converts the data contract specification into an avro schema. It supports specifying custom avro properties for logicalTypes and default values. + +##### Custom Avro Properties + +We support a **config map on field level**. A config map may include any additional key-value pairs and support multiple server type bindings. + +To specify custom Avro properties in your data contract, you can define them within the `config` section of your field definition. Below is an example of how to structure your YAML configuration to include custom Avro properties, such as `avroLogicalType` and `avroDefault`. 
+ +>NOTE: At the moment, only [logicalType](https://avro.apache.org/docs/1.11.0/spec.html#Logical+Types) and [default](https://avro.apache.org/docs/1.11.0/spec.html#schema_record) are supported. + +##### Example Configuration + +```yaml +models: + orders: + fields: + my_field_1: + description: Example for AVRO with Timestamp (microsecond precision) https://avro.apache.org/docs/current/spec.html#Local+timestamp+%28microsecond+precision%29 + type: long + example: 1672534861000000 # Equivalent to 2023-01-01 01:01:01 in microseconds + config: + avroLogicalType: local-timestamp-micros + avroDefault: 1672534861000000 +``` + +##### Explanation + +- **models**: The top-level key that contains different models (tables or objects) in your data contract. +- **orders**: A specific model name. Replace this with the name of your model. +- **fields**: The fields within the model. Each field can have various properties defined. +- **my_field_1**: The name of a specific field. Replace this with your field name. + - **description**: A textual description of the field. + - **type**: The data type of the field. In this example, it is `long`. + - **example**: An example value for the field. + - **config**: Section to specify custom Avro properties. + - **avroLogicalType**: Specifies the logical type of the field in Avro. In this example, it is `local-timestamp-micros`. + - **avroDefault**: Specifies the default value for the field in Avro. In this example, it is `1672534861000000`, which corresponds to `2023-01-01 01:01:01 UTC`.
+ + ### import ``` diff --git a/datacontract/export/avro_converter.py b/datacontract/export/avro_converter.py index ed0072f5..6d960d2a 100644 --- a/datacontract/export/avro_converter.py +++ b/datacontract/export/avro_converter.py @@ -38,6 +38,12 @@ def to_avro_field(field, field_name): if field.type in ["timestamp", "timestamp_tz", "timestamp_ntz", "date"]: avro_field["logicalType"] = to_avro_logical_type(field.type) + if field.config: + if "avroLogicalType" in field.config: + avro_field["logicalType"] = field.config["avroLogicalType"] + if "avroDefault" in field.config: + avro_field["default"] = field.config["avroDefault"] + return avro_field diff --git a/datacontract/imports/avro_importer.py b/datacontract/imports/avro_importer.py index 49bcf1a6..9a30b282 100644 --- a/datacontract/imports/avro_importer.py +++ b/datacontract/imports/avro_importer.py @@ -37,34 +37,47 @@ def import_avro(data_contract_specification: DataContractSpecification, source: return data_contract_specification +def handle_config_avro_custom_properties(field, imported_field): + if field.get_prop('logicalType') is not None: + if imported_field.config is None: + imported_field.config = {} + imported_field.config["avroLogicalType"] = field.get_prop('logicalType') + + if field.default is not None: + if imported_field.config is None: + imported_field.config = {} + imported_field.config["avroDefault"] = field.default def import_record_fields(record_fields): imported_fields = {} for field in record_fields: - imported_fields[field.name] = Field() - imported_fields[field.name].required = True - imported_fields[field.name].description = field.doc - for prop in field.other_props: - imported_fields[field.name].__setattr__(prop, field.other_props[prop]) - + imported_field = Field() + imported_field.required = True + imported_field.description = field.doc + + handle_config_avro_custom_properties(field, imported_field) + + # Determine field type and handle nested structures if field.type.type == "record": - 
imported_fields[field.name].type = "object" - imported_fields[field.name].description = field.type.doc - imported_fields[field.name].fields = import_record_fields(field.type.fields) + imported_field.type = "object" + imported_field.description = field.type.doc + imported_field.fields = import_record_fields(field.type.fields) elif field.type.type == "union": - imported_fields[field.name].required = False + imported_field.required = False type = import_type_of_optional_field(field) - imported_fields[field.name].type = type + imported_field.type = type if type == "record": - imported_fields[field.name].fields = import_record_fields(get_record_from_union_field(field).fields) + imported_field.fields = import_record_fields(get_record_from_union_field(field).fields) elif type == "array": - imported_fields[field.name].type = "array" - imported_fields[field.name].items = import_avro_array_items(get_array_from_union_field(field)) + imported_field.type = "array" + imported_field.items = import_avro_array_items(get_array_from_union_field(field)) elif field.type.type == "array": - imported_fields[field.name].type = "array" - imported_fields[field.name].items = import_avro_array_items(field.type) + imported_field.type = "array" + imported_field.items = import_avro_array_items(field.type) else: # primitive type - imported_fields[field.name].type = map_type_from_avro(field.type.type) + imported_field.type = map_type_from_avro(field.type.type) + + imported_fields[field.name] = imported_field return imported_fields diff --git a/tests/fixtures/avro/data/logicalTypes.avsc b/tests/fixtures/avro/data/logicalTypes.avsc new file mode 100644 index 00000000..5d2b5ebc --- /dev/null +++ b/tests/fixtures/avro/data/logicalTypes.avsc @@ -0,0 +1,18 @@ +{ + "type": "record", + "name": "Test", + "namespace": "mynamespace.com", + "fields": [ + {"name": "test_id", "type": "string", "doc": "id documentation test"}, + {"name": "device_id", "type": "int"}, + {"name": "test_value", "type": "double"}, + 
{"name": "num_items", "type": "int"}, + {"name": "processed_timestamp", + "type": "long", + "doc": "The date the event was processed: for more info https://avro.apache.org/docs/current/spec.html#Local+timestamp+%28microsecond+precision%29", + "logicalType": "local-timestamp-micros"}, + {"name": "description", "type": "string"}, + {"name": "is_processed", "type": "boolean", + "default": false} + ] +} \ No newline at end of file diff --git a/tests/fixtures/avro/export/datacontract_logicalType.avsc b/tests/fixtures/avro/export/datacontract_logicalType.avsc new file mode 100644 index 00000000..5d2b5ebc --- /dev/null +++ b/tests/fixtures/avro/export/datacontract_logicalType.avsc @@ -0,0 +1,18 @@ +{ + "type": "record", + "name": "Test", + "namespace": "mynamespace.com", + "fields": [ + {"name": "test_id", "type": "string", "doc": "id documentation test"}, + {"name": "device_id", "type": "int"}, + {"name": "test_value", "type": "double"}, + {"name": "num_items", "type": "int"}, + {"name": "processed_timestamp", + "type": "long", + "doc": "The date the event was processed: for more info https://avro.apache.org/docs/current/spec.html#Local+timestamp+%28microsecond+precision%29", + "logicalType": "local-timestamp-micros"}, + {"name": "description", "type": "string"}, + {"name": "is_processed", "type": "boolean", + "default": false} + ] +} \ No newline at end of file diff --git a/tests/fixtures/avro/export/datacontract_logicalType.yaml b/tests/fixtures/avro/export/datacontract_logicalType.yaml new file mode 100644 index 00000000..b0d30648 --- /dev/null +++ b/tests/fixtures/avro/export/datacontract_logicalType.yaml @@ -0,0 +1,36 @@ +dataContractSpecification: 0.9.3 +id: my-data-contract-id +info: + title: My Data Contract + version: 0.0.1 +models: + Test: + namespace: mynamespace.com + fields: + test_id: + type: string + required: true + description: id documentation test + device_id: + type: int + required: true + test_value: + type: double + required: true + num_items: + 
type: int + required: true + processed_timestamp: + type: long + required: true + description: 'The date the event was processed: for more info https://avro.apache.org/docs/current/spec.html#Local+timestamp+%28microsecond+precision%29' + config: + avroLogicalType: local-timestamp-micros + description: + type: string + required: true + is_processed: + type: boolean + required: true + config: + avroDefault: false \ No newline at end of file diff --git a/tests/test_export_avro.py b/tests/test_export_avro.py index 76e5da8b..99044cad 100644 --- a/tests/test_export_avro.py +++ b/tests/test_export_avro.py @@ -25,3 +25,13 @@ def test_to_avro_schema(): result = to_avro_schema_json(model_name, model) assert json.loads(result) == json.loads(expected_avro_schema) + +def test_to_avro_schema_with_logicalTypes(): + data_contract = DataContractSpecification.from_file("fixtures/avro/export/datacontract_logicalType.yaml") + with open("fixtures/avro/export/datacontract_logicalType.avsc") as file: + expected_avro_schema = file.read() + + model_name, model = next(iter(data_contract.models.items())) + result = to_avro_schema_json(model_name, model) + + assert json.loads(result) == json.loads(expected_avro_schema) diff --git a/tests/test_import_avro.py b/tests/test_import_avro.py index 785034fb..1dea202b 100644 --- a/tests/test_import_avro.py +++ b/tests/test_import_avro.py @@ -226,3 +226,48 @@ def test_import_avro_nested_records_with_arrays(): print("Result:\n", result.to_yaml()) assert yaml.safe_load(result.to_yaml()) == yaml.safe_load(expected) assert DataContract(data_contract_str=expected).lint(enabled_linters="none").has_passed() + +def test_import_avro_logicalTypes(): + result = DataContract().import_from_source("avro", "fixtures/avro/data/logicalTypes.avsc") + + expected = """ +dataContractSpecification: 0.9.3 +id: my-data-contract-id +info: + title: My Data Contract + version: 0.0.1 +models: + Test: + namespace: mynamespace.com + fields: + test_id: + type: string + required: 
true + description: id documentation test + device_id: + type: int + required: true + test_value: + type: double + required: true + num_items: + type: int + required: true + processed_timestamp: + type: long + required: true + description: 'The date the event was processed: for more info https://avro.apache.org/docs/current/spec.html#Local+timestamp+%28microsecond+precision%29' + config: + avroLogicalType: local-timestamp-micros + description: + type: string + required: true + is_processed: + type: boolean + required: true + config: + avroDefault: false +""" + print("Result:\n", result.to_yaml()) + assert yaml.safe_load(result.to_yaml()) == yaml.safe_load(expected) + assert DataContract(data_contract_str=expected).lint(enabled_linters="none").has_passed() \ No newline at end of file