From cd933546660128e99514ec992e53681d42e76d1e Mon Sep 17 00:00:00 2001 From: samdaviestvg <157589906+samdaviestvg@users.noreply.github.com> Date: Sat, 15 Jun 2024 09:59:30 +0100 Subject: [PATCH] support decimal prec and scale when testing parquet files and in glue table import (#265) --- .gitignore | 2 ++ datacontract/export/sql_type_converter.py | 3 +-- datacontract/imports/glue_importer.py | 9 +++++++ tests/fixtures/glue/datacontract.yaml | 4 ++++ tests/fixtures/parquet/data/decimal.parquet | Bin 0 -> 691 bytes .../parquet/datacontract_decimal.yaml | 22 ++++++++++++++++++ tests/test_import_glue.py | 2 ++ tests/test_test_parquet.py | 9 +++++++ 8 files changed, 49 insertions(+), 2 deletions(-) create mode 100644 tests/fixtures/parquet/data/decimal.parquet create mode 100644 tests/fixtures/parquet/datacontract_decimal.yaml diff --git a/.gitignore b/.gitignore index 91b6beef..44528a58 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,8 @@ tmp /quality/ db.duckdb .soda/ +.vscode/ +.duckdb/ ### JetBrains template diff --git a/datacontract/export/sql_type_converter.py b/datacontract/export/sql_type_converter.py index 4e7ca3c0..2525c11f 100644 --- a/datacontract/export/sql_type_converter.py +++ b/datacontract/export/sql_type_converter.py @@ -161,8 +161,7 @@ def convert_to_duckdb(field: Field) -> None | str: if type.lower() in ["time"]: return "TIME" # TIME WITHOUT TIME ZONE if type.lower() in ["number", "decimal", "numeric"]: - # precision and scale not supported by data contract - return "DECIMAL" + return f"DECIMAL({field.precision},{field.scale})" if type.lower() in ["float"]: return "FLOAT" if type.lower() in ["double"]: diff --git a/datacontract/imports/glue_importer.py b/datacontract/imports/glue_importer.py index c17d6795..94464700 100644 --- a/datacontract/imports/glue_importer.py +++ b/datacontract/imports/glue_importer.py @@ -142,6 +142,13 @@ def import_glue(data_contract_specification: DataContractSpecification, source: fields[column["Name"]] = field + if "decimal" in column["Type"]: + # Extract precision and scale from the string + perc_scale = column["Type"][8:-1].split(',') + print(perc_scale) + field.precision = int(perc_scale[0]) + field.scale = int(perc_scale[1]) + data_contract_specification.models[table_name] = Model( type="table", fields=fields, @@ -180,5 +187,7 @@ def map_type_from_sql(sql_type: str): return "timestamp" elif sql_type.lower().startswith("date"): return "date" + elif sql_type.lower().startswith("decimal"): + return "decimal" else: return "variant" diff --git a/tests/fixtures/glue/datacontract.yaml b/tests/fixtures/glue/datacontract.yaml index 5696fd49..7778e5f7 100644 --- a/tests/fixtures/glue/datacontract.yaml +++ b/tests/fixtures/glue/datacontract.yaml @@ -19,6 +19,10 @@ models: type: integer field_three: type: timestamp + field_four: + type: decimal + precision: 6 + scale: 2 part_one: description: Partition Key required: True diff --git a/tests/fixtures/parquet/data/decimal.parquet b/tests/fixtures/parquet/data/decimal.parquet new file mode 100644 index 0000000000000000000000000000000000000000..50cc2a2b12ba6f892996340e745cec5eeb2ea6c6 GIT binary patch literal 691 zcmah{&2G~`5MKYZ7!)BPva4Om2g_2d1esOs#BPXs;fAWJsyJ{cAhfnOt%aSWwgal7 zNE|uv0!Scn?2#wn%z-E1EqDWFoj@yb;4&Qp13sw%KQ+t|`Jxfiv&fa!1z$d;^XiO?g#a(SwZ-4a zs?O&#x-;Y>9w0jS2T@-X9B$xZDH~qR-||v6A`^|EpS6|STrQ+LaqFg9rj6^n(_9u= z?M|Mr+?gmO4a;X|kk6JyA(8_6K$O1wXQn$@t=IlvZhNcO^h~CCDT;pVhlBqO zdAUaPa4Y*^SvgyuPQZhZ51oFp%!|yKj