From e730afdb6837f43e6abbf43ae27ccb55288103aa Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Thu, 12 Dec 2024 16:51:18 -0500 Subject: [PATCH] feat(ingest): improve query fingerprinting (#12104) --- .../sql_parsing/sql_parsing_aggregator.py | 3 +-- .../src/datahub/sql_parsing/sqlglot_utils.py | 10 +++++++-- .../aggregator_goldens/test_table_rename.json | 14 ++++++------ .../aggregator_goldens/test_table_swap.json | 22 +++++++++---------- .../test_table_swap_with_temp.json | 12 +++++----- .../unit/sql_parsing/test_sqlglot_utils.py | 21 ++++++++++++++++++ 6 files changed, 54 insertions(+), 28 deletions(-) diff --git a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py index 44f0d7be7aad9a..79ea98d1c7f54e 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py +++ b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py @@ -1383,8 +1383,7 @@ def _query_urn(cls, query_id: QueryId) -> str: return QueryUrn(query_id).urn() @classmethod - def _composite_query_id(cls, composed_of_queries: Iterable[QueryId]) -> str: - composed_of_queries = list(composed_of_queries) + def _composite_query_id(cls, composed_of_queries: List[QueryId]) -> str: combined = json.dumps(composed_of_queries) return f"composite_{generate_hash(combined)}" diff --git a/metadata-ingestion/src/datahub/sql_parsing/sqlglot_utils.py b/metadata-ingestion/src/datahub/sql_parsing/sqlglot_utils.py index bd98557e08aacc..57a5cc3c9a6574 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/sqlglot_utils.py +++ b/metadata-ingestion/src/datahub/sql_parsing/sqlglot_utils.py @@ -121,7 +121,7 @@ def _expression_to_string( # Remove /* */ comments. re.compile(r"/\*.*?\*/", re.DOTALL): "", # Remove -- comments. - re.compile(r"--.*$"): "", + re.compile(r"--.*$", re.MULTILINE): "", # Replace all runs of whitespace with a single space. re.compile(r"\s+"): " ", # Remove leading and trailing whitespace and trailing semicolons. @@ -131,10 +131,16 @@ def _expression_to_string( # Replace anything that looks like a string with a placeholder. re.compile(r"'[^']*'"): "?", # Replace sequences of IN/VALUES with a single placeholder. - re.compile(r"\b(IN|VALUES)\s*\(\?(?:, \?)*\)", re.IGNORECASE): r"\1 (?)", + # The r" ?" makes it more robust to uneven spacing. + re.compile(r"\b(IN|VALUES)\s*\( ?\?(?:, ?\?)* ?\)", re.IGNORECASE): r"\1 (?)", # Normalize parenthesis spacing. re.compile(r"\( "): "(", re.compile(r" \)"): ")", + # Fix up spaces before commas in column lists. + # e.g. "col1 , col2" -> "col1, col2" + # e.g. "col1,col2" -> "col1, col2" + re.compile(r"\b ,"): ",", + re.compile(r"\b,\b"): ", ", } _TABLE_NAME_NORMALIZATION_RULES = { # Replace UUID-like strings with a placeholder (both - and _ variants). diff --git a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_rename.json b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_rename.json index 750b2c4a92fd0b..2d32e1328fbb4f 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_rename.json +++ b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_rename.json @@ -133,7 +133,7 @@ }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo_staging,PROD)", "type": "TRANSFORMED", - "query": "urn:li:query:a30d42497a737321ece461fa17344c3ba3588fdee736016acb59a00cec955a0c" + "query": "urn:li:query:88d742bcc0216d6ccb50c7430d1d97494d5dfcfa90160ffa123108844ad261e4" } ], "fineGrainedLineages": [ @@ -147,7 +147,7 @@ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),a)" ], "confidenceScore": 1.0, - "query": "urn:li:query:a30d42497a737321ece461fa17344c3ba3588fdee736016acb59a00cec955a0c" + "query": "urn:li:query:88d742bcc0216d6ccb50c7430d1d97494d5dfcfa90160ffa123108844ad261e4" }, { "upstreamType": "FIELD_SET", @@ -159,7 +159,7 @@ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),b)" ], "confidenceScore": 1.0, - "query": "urn:li:query:a30d42497a737321ece461fa17344c3ba3588fdee736016acb59a00cec955a0c" + "query": "urn:li:query:88d742bcc0216d6ccb50c7430d1d97494d5dfcfa90160ffa123108844ad261e4" }, { "upstreamType": "FIELD_SET", @@ -171,7 +171,7 @@ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),c)" ], "confidenceScore": 1.0, - "query": "urn:li:query:a30d42497a737321ece461fa17344c3ba3588fdee736016acb59a00cec955a0c" + "query": "urn:li:query:88d742bcc0216d6ccb50c7430d1d97494d5dfcfa90160ffa123108844ad261e4" } ] } @@ -179,7 +179,7 @@ }, { "entityType": "query", - "entityUrn": "urn:li:query:a30d42497a737321ece461fa17344c3ba3588fdee736016acb59a00cec955a0c", + "entityUrn": "urn:li:query:88d742bcc0216d6ccb50c7430d1d97494d5dfcfa90160ffa123108844ad261e4", "changeType": "UPSERT", "aspectName": "queryProperties", "aspect": { @@ -202,7 +202,7 @@ }, { "entityType": "query", - "entityUrn": "urn:li:query:a30d42497a737321ece461fa17344c3ba3588fdee736016acb59a00cec955a0c", + "entityUrn": "urn:li:query:88d742bcc0216d6ccb50c7430d1d97494d5dfcfa90160ffa123108844ad261e4", "changeType": "UPSERT", "aspectName": "querySubjects", "aspect": { @@ -229,7 +229,7 @@ }, { "entityType": "query", - "entityUrn": "urn:li:query:a30d42497a737321ece461fa17344c3ba3588fdee736016acb59a00cec955a0c", + "entityUrn": "urn:li:query:88d742bcc0216d6ccb50c7430d1d97494d5dfcfa90160ffa123108844ad261e4", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { diff --git a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_swap.json b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_swap.json index 171a1bd3753e24..af0fca485777ff 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_swap.json +++ b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_swap.json @@ -133,7 +133,7 @@ }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info_swap,PROD)", "type": "TRANSFORMED", - "query": "urn:li:query:3865108263e5f0670e6506f5747392f8315a72039cbfde1c4be4dd9a71bdd500" + "query": "urn:li:query:b256c8cc8f386b209ef8da55485d46c3fbd471b942f804d370e24350b3087405" } ], "fineGrainedLineages": [ @@ -147,7 +147,7 @@ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info,PROD),a)" ], "confidenceScore": 1.0, - "query": "urn:li:query:3865108263e5f0670e6506f5747392f8315a72039cbfde1c4be4dd9a71bdd500" + "query": "urn:li:query:b256c8cc8f386b209ef8da55485d46c3fbd471b942f804d370e24350b3087405" }, { "upstreamType": "FIELD_SET", @@ -159,7 +159,7 @@ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info,PROD),b)" ], "confidenceScore": 1.0, - "query": "urn:li:query:3865108263e5f0670e6506f5747392f8315a72039cbfde1c4be4dd9a71bdd500" + "query": "urn:li:query:b256c8cc8f386b209ef8da55485d46c3fbd471b942f804d370e24350b3087405" }, { "upstreamType": "FIELD_SET", @@ -171,7 +171,7 @@ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info,PROD),c)" ], "confidenceScore": 1.0, - "query": "urn:li:query:3865108263e5f0670e6506f5747392f8315a72039cbfde1c4be4dd9a71bdd500" + "query": "urn:li:query:b256c8cc8f386b209ef8da55485d46c3fbd471b942f804d370e24350b3087405" } ] } @@ -179,7 +179,7 @@ }, { "entityType": "query", - "entityUrn": "urn:li:query:3865108263e5f0670e6506f5747392f8315a72039cbfde1c4be4dd9a71bdd500", + "entityUrn": "urn:li:query:b256c8cc8f386b209ef8da55485d46c3fbd471b942f804d370e24350b3087405", "changeType": "UPSERT", "aspectName": "queryProperties", "aspect": { @@ -202,7 +202,7 @@ }, { "entityType": "query", - "entityUrn": "urn:li:query:3865108263e5f0670e6506f5747392f8315a72039cbfde1c4be4dd9a71bdd500", + "entityUrn": "urn:li:query:b256c8cc8f386b209ef8da55485d46c3fbd471b942f804d370e24350b3087405", "changeType": "UPSERT", "aspectName": "querySubjects", "aspect": { @@ -229,7 +229,7 @@ }, { "entityType": "query", - "entityUrn": "urn:li:query:3865108263e5f0670e6506f5747392f8315a72039cbfde1c4be4dd9a71bdd500", + "entityUrn": "urn:li:query:b256c8cc8f386b209ef8da55485d46c3fbd471b942f804d370e24350b3087405", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { @@ -411,7 +411,7 @@ }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info,PROD)", "type": "TRANSFORMED", - "query": "urn:li:query:d29a1c8ed6d4d77efb290260234e5eee56f98311a5631d0a12213798077d1a68" + "query": "urn:li:query:3886d427c84692923797048da6d3991693e89ce44e10d1917c12e8b6fd493904" }, { "auditStamp": { @@ -432,7 +432,7 @@ }, { "entityType": "query", - "entityUrn": "urn:li:query:d29a1c8ed6d4d77efb290260234e5eee56f98311a5631d0a12213798077d1a68", + "entityUrn": "urn:li:query:3886d427c84692923797048da6d3991693e89ce44e10d1917c12e8b6fd493904", "changeType": "UPSERT", "aspectName": "queryProperties", "aspect": { @@ -455,7 +455,7 @@ }, { "entityType": "query", - "entityUrn": "urn:li:query:d29a1c8ed6d4d77efb290260234e5eee56f98311a5631d0a12213798077d1a68", + "entityUrn": "urn:li:query:3886d427c84692923797048da6d3991693e89ce44e10d1917c12e8b6fd493904", "changeType": "UPSERT", "aspectName": "querySubjects", "aspect": { @@ -473,7 +473,7 @@ }, { "entityType": "query", - "entityUrn": "urn:li:query:d29a1c8ed6d4d77efb290260234e5eee56f98311a5631d0a12213798077d1a68", + "entityUrn": "urn:li:query:3886d427c84692923797048da6d3991693e89ce44e10d1917c12e8b6fd493904", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { diff --git a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_swap_with_temp.json b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_swap_with_temp.json index ba83917ca5c1a1..ceaaf8f6887c7c 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_swap_with_temp.json +++ b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_swap_with_temp.json @@ -133,7 +133,7 @@ }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info,PROD)", "type": "TRANSFORMED", - "query": "urn:li:query:composite_5f9c1232994672c5fb7621f8384f6600b6d4ed5acfccc4eb396fb446b3fb1bce" + "query": "urn:li:query:composite_9e36ef19163461d35b618fd1eea2a3f6a5d10a23a979a6d5ef688b31f277abb3" }, { "auditStamp": { @@ -146,7 +146,7 @@ }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info_dep,PROD)", "type": "TRANSFORMED", - "query": "urn:li:query:composite_5f9c1232994672c5fb7621f8384f6600b6d4ed5acfccc4eb396fb446b3fb1bce" + "query": "urn:li:query:composite_9e36ef19163461d35b618fd1eea2a3f6a5d10a23a979a6d5ef688b31f277abb3" } ], "fineGrainedLineages": [ @@ -161,7 +161,7 @@ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info,PROD),a)" ], "confidenceScore": 1.0, - "query": "urn:li:query:composite_5f9c1232994672c5fb7621f8384f6600b6d4ed5acfccc4eb396fb446b3fb1bce" + "query": "urn:li:query:composite_9e36ef19163461d35b618fd1eea2a3f6a5d10a23a979a6d5ef688b31f277abb3" } ] } @@ -169,7 +169,7 @@ }, { "entityType": "query", - "entityUrn": "urn:li:query:composite_5f9c1232994672c5fb7621f8384f6600b6d4ed5acfccc4eb396fb446b3fb1bce", + "entityUrn": "urn:li:query:composite_9e36ef19163461d35b618fd1eea2a3f6a5d10a23a979a6d5ef688b31f277abb3", "changeType": "UPSERT", "aspectName": "queryProperties", "aspect": { @@ -192,7 +192,7 @@ }, { "entityType": "query", - "entityUrn": "urn:li:query:composite_5f9c1232994672c5fb7621f8384f6600b6d4ed5acfccc4eb396fb446b3fb1bce", + "entityUrn": "urn:li:query:composite_9e36ef19163461d35b618fd1eea2a3f6a5d10a23a979a6d5ef688b31f277abb3", "changeType": "UPSERT", "aspectName": "querySubjects", "aspect": { @@ -219,7 +219,7 @@ }, { "entityType": "query", - "entityUrn": "urn:li:query:composite_5f9c1232994672c5fb7621f8384f6600b6d4ed5acfccc4eb396fb446b3fb1bce", + "entityUrn": "urn:li:query:composite_9e36ef19163461d35b618fd1eea2a3f6a5d10a23a979a6d5ef688b31f277abb3", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { diff --git a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_utils.py b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_utils.py index 4e8ba8aa6b7770..dbe24ade6944f6 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_utils.py +++ b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_utils.py @@ -73,6 +73,12 @@ class QueryGeneralizationTestMode(Enum): "SELECT * FROM foo", QueryGeneralizationTestMode.BOTH, ), + ( + "SELECT a\n -- comment--\n,b --another comment\n FROM books", + "redshift", + "SELECT a, b FROM books", + QueryGeneralizationTestMode.BOTH, + ), # Parameter normalization. ( "UPDATE \"books\" SET page_count = page_count + 1, author_count = author_count + 1 WHERE book_title = 'My New Book'", @@ -105,6 +111,21 @@ class QueryGeneralizationTestMode(Enum): "INSERT INTO MyTable (Column1, Column2, Column3) VALUES (?)", QueryGeneralizationTestMode.BOTH, ), + ( + # Uneven spacing within the IN clause. + "SELECT * FROM books WHERE zip_code IN (123,345, 423 )", + "redshift", + "SELECT * FROM books WHERE zip_code IN (?)", + QueryGeneralizationTestMode.BOTH, + ), + # Uneven spacing in the column list. + # This isn't perfect e.g. we still have issues with function calls inside selects. + ( + "SELECT a\n ,b FROM books", + "redshift", + "SELECT a, b FROM books", + QueryGeneralizationTestMode.BOTH, + ), ( textwrap.dedent( """\