From 206d243b9582e853f77b6c4856aec3ebb57d3e9d Mon Sep 17 00:00:00 2001
From: Erni Durdevic
Date: Fri, 29 Sep 2023 13:00:19 +0200
Subject: [PATCH] refactored examples for with_sql

---
 README.md                          |  4 ++--
 discoverx/dx.py                    |  2 +-
 discoverx/explorer.py              |  4 ++--
 discoverx/scanner.py               |  2 +-
 docs/Arbitrary_multi-table_SQL.md  | 14 +++++++-------
 docs/GDPR_RoA.md                   | 21 ++++++++++++++++++---
 docs/GDPR_RoE.md                   |  6 +++---
 docs/Vacuum.md                     |  4 ++--
 examples/detect_small_files.py     | 21 +++++++++------------
 examples/pii_detection_presidio.py |  2 +-
 examples/vacuum_multiple_tables.py | 10 ++++++++--
 11 files changed, 54 insertions(+), 36 deletions(-)

diff --git a/README.md b/README.md
index aaf73ac..68d4b3f 100644
--- a/README.md
+++ b/README.md
@@ -48,8 +48,8 @@ As an illustration, consider the scenario where you need to retrieve a single ro
 
 ```
 dx.from_tables("dev_*.*.*sample*")\
-  .apply_sql("SELECT to_json(struct(*)) AS row FROM {full_table_name} LIMIT 1")\
-  .execute()
+  .with_sql("SELECT to_json(struct(*)) AS row FROM {full_table_name} LIMIT 1")\
+  .apply()
 ```
 
 ## Available functionality
diff --git a/discoverx/dx.py b/discoverx/dx.py
index 5136621..e99d825 100644
--- a/discoverx/dx.py
+++ b/discoverx/dx.py
@@ -68,7 +68,7 @@ def intro(self):

 Then you can apply the following operations

diff --git a/discoverx/explorer.py b/discoverx/explorer.py
index 235d69a..09f9bcf 100644
--- a/discoverx/explorer.py
+++ b/discoverx/explorer.py
@@ -203,7 +203,7 @@ def unpivot_string_columns(self, sample_size=None) -> "DataExplorerActions":
         if sample_size is not None:
             sql_query_template += f"TABLESAMPLE ({sample_size} ROWS)"
 
-        return self.apply_sql(sql_query_template)
+        return self.with_sql(sql_query_template)
 
     def scan(
         self,
@@ -330,7 +330,7 @@ def explain(self) -> None:
 
     def display(self) -> None:
        """Executes the data exploration queries and displays a sample of results"""
-        return self.execute()
+        return self.apply().display()  # apply() returns the combined DataFrame; calling self.display() here would recurse
 
     def execute(self) -> None:
        """[DEPRECATED] Executes the data exploration queries and displays a sample of results"""
diff --git a/discoverx/scanner.py b/discoverx/scanner.py
index 65c0d22..a62ca71 100644
--- a/discoverx/scanner.py
+++ b/discoverx/scanner.py
@@ -140,7 +140,7 @@ def save(self, scan_table_name: str):
             and scan_delta_table.table_schema = scan_result_df.table_schema \
             and scan_delta_table.table_name = scan_result_df.table_name \
             and scan_delta_table.column_name = scan_result_df.column_name ",
-        ).whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()
+        ).whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()  # DeltaMergeBuilder.execute() is the Delta Lake merge API; not a DiscoverX method
 
     def load(self, scan_table_name: str):
         try:
diff --git a/docs/Arbitrary_multi-table_SQL.md b/docs/Arbitrary_multi-table_SQL.md
index 92751dc..f1b78f1 100644
--- a/docs/Arbitrary_multi-table_SQL.md
+++ b/docs/Arbitrary_multi-table_SQL.md
@@ -10,8 +10,8 @@ For example, to vacuum all the tables in "default" catalog:
 
 ```
 dx.from_tables("default.*.*")\
-  .apply_sql("VACUUM {full_table_name}")\
-  .execute()
+  .with_sql("VACUUM {full_table_name}")\
+  .display()
 ```
 
 That will apply the SQL template `VACUUM {full_table_name}` to all tables matched by the pattern `default.*.*`.
@@ -26,7 +26,7 @@ You can use the `explain()` command to see the SQL that would be executed.
 
 ```
 dx.from_tables("default.*.*")\
-  .apply_sql("VACUUM {full_table_name}")\
+  .with_sql("VACUUM {full_table_name}")\
   .explain()
 ```
 
@@ -35,14 +35,14 @@ You can also filter tables that have a specific column name.
 ```
 dx.from_tables("default.*.*")\
   .having_columns("device_id")\
-  .apply_sql("OPTIMIZE {full_table_name} ZORDER BY (`device_id`)")\
-  .execute()
+  .with_sql("OPTIMIZE {full_table_name} ZORDER BY (`device_id`)")\
+  .display()
 ```
 
 ## Select entire rows as json
 
 ```
 dx.from_tables("default.*.*")\
-  .apply_sql("SELECT to_json(struct(*)) AS json_row FROM {full_table_name}")\
-  .execute()
+  .with_sql("SELECT to_json(struct(*)) AS json_row FROM {full_table_name}")\
+  .display()
 ```
\ No newline at end of file
diff --git a/docs/GDPR_RoA.md b/docs/GDPR_RoA.md
index 6c53af4..1b8645b 100644
--- a/docs/GDPR_RoA.md
+++ b/docs/GDPR_RoA.md
@@ -9,6 +9,21 @@ For example, if you want to get all data for user `1` from all tables that have
 ```
 df = dx.from_tables("*.*.*")\
   .having_columns("user_id")\
-  .apply_sql("SELECT `user_id`, to_json(struct(*)) AS row_content FROM {full_table_name} WHERE `user_id` = 1")\
-  .to_union_dataframe()
-```
\ No newline at end of file
+  .with_sql("SELECT `user_id`, to_json(struct(*)) AS row_content FROM {full_table_name} WHERE `user_id` = 1")\
+  .apply()
+```
+
+### Limitations
+
+The current approach only selects tables that contain the specified column, and does not recursively follow the relationships with other tables.
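+
+If you also need to know which table each row came from, you can embed the
+table name in the SQL template (a sketch; `{full_table_name}` is substituted
+once per matched table, and `source_table` is just an illustrative alias):
+
+```
+df = dx.from_tables("*.*.*")\
+  .having_columns("user_id")\
+  .with_sql("SELECT '{full_table_name}' AS source_table, to_json(struct(*)) AS row_content FROM {full_table_name} WHERE `user_id` = 1")\
+  .apply()
+```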
\ No newline at end of file
diff --git a/docs/GDPR_RoE.md b/docs/GDPR_RoE.md
index c93f885..5c3b77c 100644
--- a/docs/GDPR_RoE.md
+++ b/docs/GDPR_RoE.md
@@ -9,9 +9,9 @@ For example, if you want to delete users `1`, `2`, and `3` from all tables that
 ```
 dx.from_tables("*.*.*")\
   .having_columns("user_id")\
-  .apply_sql("DELETE FROM {full_table_name} WHERE `user_id` IN (1, 2, 3)"")\
-  .execute()
-  # You can use .explain() instead of .execute() to preview the generated SQL
+  .with_sql("DELETE FROM {full_table_name} WHERE `user_id` IN (1, 2, 3)")\
+  .display()
+  # You can use .explain() instead of .display() to preview the generated SQL
 ```
 
 ## Vaccum
diff --git a/docs/Vacuum.md b/docs/Vacuum.md
index 11f4a65..58f3215 100644
--- a/docs/Vacuum.md
+++ b/docs/Vacuum.md
@@ -8,8 +8,8 @@ With DiscoverX you can vacuum all the tables at once with the command:
 
 ```
 dx.from_tables("*.*.*")\
-  .apply_sql("VACUUM {full_table_name}")\
-  .execute()
+  .with_sql("VACUUM {full_table_name}")\
+  .display()
 ```
 
 You can schedule [this example notebook](https://raw.githubusercontent.com/databrickslabs/discoverx/master/examples/vacuum_multiple_tables.py) in your Databricks workflows to run vacuum periodically.
diff --git a/examples/detect_small_files.py b/examples/detect_small_files.py
index 7bc7b35..26c614f 100644
--- a/examples/detect_small_files.py
+++ b/examples/detect_small_files.py
@@ -7,7 +7,7 @@
 # MAGIC As a rule of thumb, if a table has more than `100` files and average file size smaller than `10 MB`, then we can consider it having too many small files.
 # MAGIC
 # MAGIC Some common causes of too many small files are:
-# MAGIC * Overpartitioning: the cardinality of the partition columns is too high 
+# MAGIC * Overpartitioning: the cardinality of the partition columns is too high
 # MAGIC * Lack of scheduled maintenance operations like `OPTIMIZE`
 # MAGIC * Missing auto optimize on write
 # MAGIC
@@ -38,16 +38,13 @@
 from pyspark.sql.functions import col, lit
 
-dx.from_tables(from_tables)\
-    .apply_sql("DESCRIBE DETAIL {full_table_name}")\
-    .to_union_dataframe()\
-    .withColumn("average_file_size_MB", col("sizeInBytes") / col("numFiles") / 1024 / 1024)\
-    .withColumn("has_too_many_small_files",
-                (col("average_file_size_MB") < small_file_max_size_MB) &
-                (col("numFiles") > min_file_number))\
-    .filter("has_too_many_small_files")\
-    .display()
+dx.from_tables(from_tables).with_sql("DESCRIBE DETAIL {full_table_name}").apply().withColumn(
+    "average_file_size_MB", col("sizeInBytes") / col("numFiles") / 1024 / 1024
+).withColumn(
+    "has_too_many_small_files",
+    (col("average_file_size_MB") < small_file_max_size_MB) & (col("numFiles") > min_file_number),
+).filter(
+    "has_too_many_small_files"
+).display()
 
 # COMMAND ----------
 
-
-
diff --git a/examples/pii_detection_presidio.py b/examples/pii_detection_presidio.py
index 63efa11..cddc28a 100644
--- a/examples/pii_detection_presidio.py
+++ b/examples/pii_detection_presidio.py
@@ -67,7 +67,7 @@
 unpivoted_df = (
     dx.from_tables(from_tables)
     .unpivot_string_columns(sample_size=sample_size)
-    .to_union_dataframe()
+    .apply()
     .localCheckpoint()  # Checkpointing to reduce the query plan size
 )
diff --git a/examples/vacuum_multiple_tables.py b/examples/vacuum_multiple_tables.py
index 6dc0a54..cc69430 100644
--- a/examples/vacuum_multiple_tables.py
+++ b/examples/vacuum_multiple_tables.py
@@ -28,7 +28,7 @@
 
 # COMMAND ----------
 
-dx.from_tables(from_tables).apply_sql("VACUUM {full_table_name}").explain()
+dx.from_tables(from_tables).with_sql("VACUUM {full_table_name}").explain()
 
 # COMMAND ----------
 
@@ -37,4 +37,10 @@
 
 # COMMAND ----------
 
-(dx.from_tables(from_tables).apply_sql("VACUUM {full_table_name}").execute())
+(dx.from_tables(from_tables).with_sql("VACUUM {full_table_name}").display())
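+
+# COMMAND ----------
+
+# Alternatively, keep the results rather than displaying them (a sketch):
+# apply(), the new name of to_union_dataframe(), returns the unioned per-table
+# results as a DataFrame.
+# vacuum_results_df = dx.from_tables(from_tables).with_sql("VACUUM {full_table_name}").apply()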