From 84007f0398bafcd3f728918afd8d636db6227641 Mon Sep 17 00:00:00 2001 From: Sourav Gulati <108067669+souravg-db@users.noreply.github.com> Date: Wed, 27 Dec 2023 12:58:49 +0000 Subject: [PATCH] Issue 92 (#94) * added fix * added fix * added logger * Added fixes * fixed test cases * Added changes to regex pattern to include hyphen --- discoverx/explorer.py | 2 +- discoverx/msql.py | 4 ++-- discoverx/scanner.py | 5 ++--- tests/unit/msql_test.py | 1 + tests/unit/scanner_test.py | 4 ++-- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/discoverx/explorer.py b/discoverx/explorer.py index 46f5a14..706ce5d 100644 --- a/discoverx/explorer.py +++ b/discoverx/explorer.py @@ -17,7 +17,7 @@ class DataExplorer: - FROM_COMPONENTS_EXPR = r"^(([0-9a-zA-Z_\*]+)\.([0-9a-zA-Z_\*]+)\.([0-9a-zA-Z_\*]+))$" + FROM_COMPONENTS_EXPR = r"^(([0-9a-zA-Z_\*-]+)\.([0-9a-zA-Z_\*-]+)\.([0-9a-zA-Z_\*-]+))$" def __init__(self, from_tables, spark: SparkSession, info_fetcher: InfoFetcher) -> None: self._from_tables = from_tables diff --git a/discoverx/msql.py b/discoverx/msql.py index 50cee25..57b9a82 100644 --- a/discoverx/msql.py +++ b/discoverx/msql.py @@ -22,8 +22,8 @@ class SQLRow: class Msql: """This class compiles M-SQL expressions into regular SQL""" - from_statement_expr = r"(FROM\s+)(([0-9a-zA-Z_\*]+).([0-9a-zA-Z_\*]+).([0-9a-zA-Z_\*]+))" - from_components_expr = r"^(([0-9a-zA-Z_\*]+).([0-9a-zA-Z_\*]+).([0-9a-zA-Z_\*]+))$" + from_statement_expr = r"(FROM\s+)(([0-9a-zA-Z_\*-]+).([0-9a-zA-Z_\*-]+).([0-9a-zA-Z_\*-]+))" + from_components_expr = r"^(([0-9a-zA-Z_\*-]+).([0-9a-zA-Z_\*-]+).([0-9a-zA-Z_\*-]+))$" command_expr = r"^\s*(\w+)\s" class_regex = r"\[([\w_-]+)\]" valid_commands = ["SELECT", "DELETE"] diff --git a/discoverx/scanner.py b/discoverx/scanner.py index f13c528..89f858d 100644 --- a/discoverx/scanner.py +++ b/discoverx/scanner.py @@ -226,7 +226,6 @@ def scan_table(self, table: TableInfo): # Build rule matching SQL sql = self._rule_matching_sql(table) - if self.what_if: logger.friendly(sql) else: @@ -300,7 +299,7 @@ def _rule_matching_sql(self, table_info: TableInfo): if not expressions: raise Exception(f"There are no rules to scan for.") - catalog_str = f"{table_info.catalog}." if table_info.catalog else "" + catalog_str = f"`{table_info.catalog}`." if table_info.catalog else "" matching_columns = [ f"INT(regexp_like(value, '{format_regex(r.definition)}')) AS `{r.name}`" for r in expressions ] @@ -328,7 +327,7 @@ def _rule_matching_sql(self, table_info: TableInfo): FROM ( SELECT stack({len(cols)}, {unpivot_columns}) AS (column_name, value) - FROM {catalog_str}{table_info.schema}.{table_info.table} + FROM {catalog_str}`{table_info.schema}`.`{table_info.table}` TABLESAMPLE ({self.sample_size} ROWS) ) ) diff --git a/tests/unit/msql_test.py b/tests/unit/msql_test.py index c0b64c1..fd53119 100644 --- a/tests/unit/msql_test.py +++ b/tests/unit/msql_test.py @@ -296,6 +296,7 @@ def test_execute_sql_should_fail_for_no_successful_queries(spark): def test_validate_from_components(): assert Msql.validate_from_components("c.d.t") == ("c", "d", "t") assert Msql.validate_from_components("*.*.*") == ("*", "*", "*") + assert Msql.validate_from_components("c-1.d.t-1") == ("c-1", "d", "t-1") with pytest.raises(ValueError): Msql.validate_from_components("c.d") diff --git a/tests/unit/scanner_test.py b/tests/unit/scanner_test.py index f469d45..7d3c940 100644 --- a/tests/unit/scanner_test.py +++ b/tests/unit/scanner_test.py @@ -77,7 +77,7 @@ def test_get_table_list(spark): FROM ( SELECT stack(1, 'name', `name`) AS (column_name, value) - FROM meta.db.tb + FROM `meta`.`db`.`tb` TABLESAMPLE (100 ROWS) ) ) @@ -103,7 +103,7 @@ def test_get_table_list(spark): FROM ( SELECT stack(1, 'name', `name`) AS (column_name, value) - FROM meta.db.tb + FROM `meta`.`db`.`tb` TABLESAMPLE (100 ROWS) ) )