refactored examples for with_sql
edurdevic committed Sep 29, 2023
1 parent bbfe878 · commit 206d243
Showing 11 changed files with 37 additions and 36 deletions.
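This commit renames DiscoverX's multi-table SQL methods across the docs and examples: `apply_sql(...)` becomes `with_sql(...)`, `to_union_dataframe()` becomes `apply()`, and `execute()` is deprecated in favor of `display()`. A minimal before/after sketch of the pattern, assuming DiscoverX is initialized as in the project README (the table pattern and query here are illustrative, not taken from this commit):

```
from discoverx import DX

dx = DX()

# Old API (before this commit):
# dx.from_tables("default.*.*")\
#     .apply_sql("SELECT COUNT(*) AS cnt FROM {full_table_name}")\
#     .execute()

# New API (after this commit): display a sample of results...
dx.from_tables("default.*.*")\
    .with_sql("SELECT COUNT(*) AS cnt FROM {full_table_name}")\
    .display()

# ...or get the unioned results back as a DataFrame (formerly to_union_dataframe())
df = dx.from_tables("default.*.*")\
    .with_sql("SELECT COUNT(*) AS cnt FROM {full_table_name}")\
    .apply()
```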
4 changes: 2 additions & 2 deletions README.md
@@ -48,8 +48,8 @@ As an illustration, consider the scenario where you need to retrieve a single ro

```
dx.from_tables("dev_*.*.*sample*")\
.apply_sql("SELECT to_json(struct(*)) AS row FROM {full_table_name} LIMIT 1")\
.execute()
.with_sql("SELECT to_json(struct(*)) AS row FROM {full_table_name} LIMIT 1")\
.apply()
```

## Available functionality
2 changes: 1 addition & 1 deletion discoverx/dx.py
@@ -68,7 +68,7 @@ def intro(self):
<p>
Then you can apply the following operations
<ul>
- <li><code>.apply_sql(...)</code> - Runs a SQL template on each table</li>
+ <li><code>.with_sql(...)</code> - Runs a SQL template on each table</li>
<li><code>.scan(...)</code> - Scan your lakehouse for columns matching the given rules</li>
<li><code>.search(...)</code> - Search your lakehouse for columns matching the given search term</li>
</ul>
4 changes: 2 additions & 2 deletions discoverx/explorer.py
@@ -203,7 +203,7 @@ def unpivot_string_columns(self, sample_size=None) -> "DataExplorerActions":
if sample_size is not None:
sql_query_template += f"TABLESAMPLE ({sample_size} ROWS)"

- return self.apply_sql(sql_query_template)
+ return self.with_sql(sql_query_template)

def scan(
self,
@@ -330,7 +330,7 @@ def explain(self) -> None:

def display(self) -> None:
"""Executes the data exploration queries and displays a sample of results"""
- return self.execute()
+ return self.display()

def execute(self) -> None:
"""[DEPRECATED] Executes the data exploration queries and displays a sample of results"""
2 changes: 1 addition & 1 deletion discoverx/scanner.py
@@ -140,7 +140,7 @@ def save(self, scan_table_name: str):
and scan_delta_table.table_schema = scan_result_df.table_schema \
and scan_delta_table.table_name = scan_result_df.table_name \
and scan_delta_table.column_name = scan_result_df.column_name ",
- ).whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()
+ ).whenMatchedUpdateAll().whenNotMatchedInsertAll().display()

def load(self, scan_table_name: str):
try:
14 changes: 7 additions & 7 deletions docs/Arbitrary_multi-table_SQL.md
@@ -10,8 +10,8 @@ For example, to vacuum all the tables in "default" catalog:

```
dx.from_tables("default.*.*")\
.apply_sql("VACUUM {full_table_name}")\
.execute()
.with_sql("VACUUM {full_table_name}")\
.display()
```

That will apply the SQL template `VACUUM {full_table_name}` to all tables matched by the pattern `default.*.*`.
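As a rough sketch of what that expansion looks like (the matched table names below are hypothetical, and DiscoverX builds the statements internally; this is an illustration, not the library's implementation):

```
# Hypothetical illustration of the per-table template expansion
template = "VACUUM {full_table_name}"
matched_tables = ["default.sales.orders", "default.sales.customers"]  # assumed matches
statements = [template.format(full_table_name=t) for t in matched_tables]
print(statements)
# ['VACUUM default.sales.orders', 'VACUUM default.sales.customers']
```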
@@ -26,7 +26,7 @@ You can use the `explain()` command to see the SQL that would be executed.

```
dx.from_tables("default.*.*")\
.apply_sql("VACUUM {full_table_name}")\
.with_sql("VACUUM {full_table_name}")\
.explain()
```

@@ -35,14 +35,14 @@ You can also filter tables that have a specific column name.
```
dx.from_tables("default.*.*")\
.having_columns("device_id")\
.apply_sql("OPTIMIZE {full_table_name} ZORDER BY (`device_id`)")\
.execute()
.with_sql("OPTIMIZE {full_table_name} ZORDER BY (`device_id`)")\
.display()
```

## Select entire rows as json

```
dx.from_tables("default.*.*")\
.apply_sql("SELECT to_json(struct(*)) AS json_row FROM {full_table_name}")\
.execute()
.with_sql("SELECT to_json(struct(*)) AS json_row FROM {full_table_name}")\
.display()
```
10 changes: 7 additions & 3 deletions docs/GDPR_RoA.md
@@ -9,6 +9,10 @@ For example, if you want to get all data for user `1` from all tables that have
```
df = dx.from_tables("*.*.*")\
.having_columns("user_id")\
.apply_sql("SELECT `user_id`, to_json(struct(*)) AS row_content FROM {full_table_name} WHERE `user_id` = 1")\
.to_union_dataframe()
```
.with_sql("SELECT `user_id`, to_json(struct(*)) AS row_content FROM {full_table_name} WHERE `user_id` = 1")\
.apply()
```

### Limitations

The current approach only selects tables that contain the specified column, and does not recursively follow the relationships with other tables.
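For example, the unioned DataFrame returned by `apply()` can be written out as a single report for the data subject. A minimal sketch, assuming an initialized `dx` instance and a hypothetical output path:

```
report_df = (
    dx.from_tables("*.*.*")
    .having_columns("user_id")
    .with_sql("SELECT to_json(struct(*)) AS row_content FROM {full_table_name} WHERE `user_id` = 1")
    .apply()
)

# Consolidate all matching rows into one JSON report (the path is an assumption)
report_df.write.mode("overwrite").json("/tmp/user_1_access_report")
```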
6 changes: 3 additions & 3 deletions docs/GDPR_RoE.md
@@ -9,9 +9,9 @@ For example, if you want to delete users `1`, `2`, and `3` from all tables that
```
dx.from_tables("*.*.*")\
.having_columns("user_id")\
.apply_sql("DELETE FROM {full_table_name} WHERE `user_id` IN (1, 2, 3)"")\
.execute()
# You can use .explain() instead of .execute() to preview the generated SQL
.with_sql("DELETE FROM {full_table_name} WHERE `user_id` IN (1, 2, 3)"")\
.display()
# You can use .explain() instead of .display() to preview the generated SQL
```

## Vacuum
4 changes: 2 additions & 2 deletions docs/Vacuum.md
@@ -8,8 +8,8 @@ With DiscoverX you can vacuum all the tables at once with the command:

```
dx.from_tables("*.*.*")\
.apply_sql("VACUUM {full_table_name}")\
.execute()
.with_sql("VACUUM {full_table_name}")\
.display()
```

You can schedule [this example notebook](https://raw.githubusercontent.com/databrickslabs/discoverx/master/examples/vacuum_multiple_tables.py) in your Databricks workflows to run vacuum periodically.
21 changes: 9 additions & 12 deletions examples/detect_small_files.py
@@ -7,7 +7,7 @@
# MAGIC As a rule of thumb, if a table has more than `100` files and average file size smaller than `10 MB`, then we can consider it having too many small files.
# MAGIC
# MAGIC Some common causes of too many small files are:
- # MAGIC * Overpartitioning: the cardinality of the partition columns is too high
+ # MAGIC * Overpartitioning: the cardinality of the partition columns is too high
# MAGIC * Lack of scheduled maintenance operations like `OPTIMIZE`
# MAGIC * Missing auto optimize on write
# MAGIC
@@ -38,16 +38,13 @@

from pyspark.sql.functions import col, lit

- dx.from_tables(from_tables)\
-     .apply_sql("DESCRIBE DETAIL {full_table_name}")\
-     .to_union_dataframe()\
-     .withColumn("average_file_size_MB", col("sizeInBytes") / col("numFiles") / 1024 / 1024)\
-     .withColumn("has_too_many_small_files",
-         (col("average_file_size_MB") < small_file_max_size_MB) &
-         (col("numFiles") > min_file_number))\
-     .filter("has_too_many_small_files")\
-     .display()
+ dx.from_tables(from_tables).with_sql("DESCRIBE DETAIL {full_table_name}").apply().withColumn(
+     "average_file_size_MB", col("sizeInBytes") / col("numFiles") / 1024 / 1024
+ ).withColumn(
+     "has_too_many_small_files",
+     (col("average_file_size_MB") < small_file_max_size_MB) & (col("numFiles") > min_file_number),
+ ).filter(
+     "has_too_many_small_files"
+ ).display()

# COMMAND ----------
-
-
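Once small-file tables are flagged, a natural follow-up is to compact them. A sketch using the same `with_sql` pattern shown in docs/Arbitrary_multi-table_SQL.md, assuming the `dx` and `from_tables` variables from the notebook above (filtering down to only the flagged tables is left out for brevity):

```
# Sketch: compact all matched tables with OPTIMIZE (not part of this commit)
dx.from_tables(from_tables)\
    .with_sql("OPTIMIZE {full_table_name}")\
    .display()
```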
2 changes: 1 addition & 1 deletion examples/pii_detection_presidio.py
@@ -67,7 +67,7 @@
unpivoted_df = (
dx.from_tables(from_tables)
.unpivot_string_columns(sample_size=sample_size)
- .to_union_dataframe()
+ .apply()
.localCheckpoint() # Checkpointing to reduce the query plan size
)

4 changes: 2 additions & 2 deletions examples/vacuum_multiple_tables.py
@@ -28,7 +28,7 @@

# COMMAND ----------

- dx.from_tables(from_tables).apply_sql("VACUUM {full_table_name}").explain()
+ dx.from_tables(from_tables).with_sql("VACUUM {full_table_name}").explain()

# COMMAND ----------

@@ -37,4 +37,4 @@

# COMMAND ----------

- (dx.from_tables(from_tables).apply_sql("VACUUM {full_table_name}").execute())
+ (dx.from_tables(from_tables).with_sql("VACUUM {full_table_name}").display())
