refactored examples for with_sql
edurdevic committed Sep 29, 2023
1 parent bbfe878 · commit 206d243
Showing 11 changed files with 37 additions and 36 deletions.
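This commit renames DiscoverX's multi-table SQL methods across the docs and examples: `apply_sql(...)` becomes `with_sql(...)`, `to_union_dataframe()` becomes `apply()`, and `execute()` is deprecated in favor of `display()`. A minimal before/after sketch of the pattern, assuming DiscoverX is initialized as in the project README (the table pattern and query here are illustrative, not taken from this commit):

```
from discoverx import DX

dx = DX()

# Old API (before this commit):
# dx.from_tables("default.*.*")\
#     .apply_sql("SELECT COUNT(*) AS cnt FROM {full_table_name}")\
#     .execute()

# New API (after this commit): display a sample of results...
dx.from_tables("default.*.*")\
    .with_sql("SELECT COUNT(*) AS cnt FROM {full_table_name}")\
    .display()

# ...or get the unioned results back as a DataFrame (formerly to_union_dataframe())
df = dx.from_tables("default.*.*")\
    .with_sql("SELECT COUNT(*) AS cnt FROM {full_table_name}")\
    .apply()
```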
4 changes: 2 additions & 2 deletions README.md
@@ -48,8 +48,8 @@ As an illustration, consider the scenario where you need to retrieve a single ro

```
dx.from_tables("dev_*.*.*sample*")\
.apply_sql("SELECT to_json(struct(*)) AS row FROM {full_table_name} LIMIT 1")\
.execute()
.with_sql("SELECT to_json(struct(*)) AS row FROM {full_table_name} LIMIT 1")\
.apply()
```

## Available functionality
2 changes: 1 addition & 1 deletion discoverx/dx.py
@@ -68,7 +68,7 @@ def intro(self):
<p>
Then you can apply the following operations
<ul>
- <li><code>.apply_sql(...)</code> - Runs a SQL template on each table</li>
+ <li><code>.with_sql(...)</code> - Runs a SQL template on each table</li>
<li><code>.scan(...)</code> - Scan your lakehouse for columns matching the given rules</li>
<li><code>.search(...)</code> - Search your lakehouse for columns matching the given search term</li>
</ul>
4 changes: 2 additions & 2 deletions discoverx/explorer.py
@@ -203,7 +203,7 @@ def unpivot_string_columns(self, sample_size=None) -> "DataExplorerActions":
if sample_size is not None:
sql_query_template += f"TABLESAMPLE ({sample_size} ROWS)"

- return self.apply_sql(sql_query_template)
+ return self.with_sql(sql_query_template)

def scan(
self,
@@ -330,7 +330,7 @@ def explain(self) -> None:

def display(self) -> None:
"""Executes the data exploration queries and displays a sample of results"""
- return self.execute()
+ return self.display()

def execute(self) -> None:
"""[DEPRECATED] Executes the data exploration queries and displays a sample of results"""
2 changes: 1 addition & 1 deletion discoverx/scanner.py
@@ -140,7 +140,7 @@ def save(self, scan_table_name: str):
and scan_delta_table.table_schema = scan_result_df.table_schema \
and scan_delta_table.table_name = scan_result_df.table_name \
and scan_delta_table.column_name = scan_result_df.column_name ",
- ).whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()
+ ).whenMatchedUpdateAll().whenNotMatchedInsertAll().display()

def load(self, scan_table_name: str):
try:
14 changes: 7 additions & 7 deletions docs/Arbitrary_multi-table_SQL.md
@@ -10,8 +10,8 @@ For example, to vacuum all the tables in "default" catalog:

```
dx.from_tables("default.*.*")\
.apply_sql("VACUUM {full_table_name}")\
.execute()
.with_sql("VACUUM {full_table_name}")\
.display()
```

That will apply the SQL template `VACUUM {full_table_name}` to all tables matched by the pattern `default.*.*`.
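As a rough sketch of what that expansion looks like (the matched table names below are hypothetical, and DiscoverX builds the statements internally; this is an illustration, not the library's implementation):

```
# Hypothetical illustration of the per-table template expansion
template = "VACUUM {full_table_name}"
matched_tables = ["default.sales.orders", "default.sales.customers"]  # assumed matches
statements = [template.format(full_table_name=t) for t in matched_tables]
print(statements)
# ['VACUUM default.sales.orders', 'VACUUM default.sales.customers']
```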
@@ -26,7 +26,7 @@ You can use the `explain()` command to see the SQL that would be executed.

```
dx.from_tables("default.*.*")\
.apply_sql("VACUUM {full_table_name}")\
.with_sql("VACUUM {full_table_name}")\
.explain()
```

@@ -35,14 +35,14 @@ You can also filter tables that have a specific column name.
```
dx.from_tables("default.*.*")\
.having_columns("device_id")\
.apply_sql("OPTIMIZE {full_table_name} ZORDER BY (`device_id`)")\
.execute()
.with_sql("OPTIMIZE {full_table_name} ZORDER BY (`device_id`)")\
.display()
```

## Select entire rows as json

```
dx.from_tables("default.*.*")\
.apply_sql("SELECT to_json(struct(*)) AS json_row FROM {full_table_name}")\
.execute()
.with_sql("SELECT to_json(struct(*)) AS json_row FROM {full_table_name}")\
.display()
```
10 changes: 7 additions & 3 deletions docs/GDPR_RoA.md
@@ -9,6 +9,10 @@ For example, if you want to get all data for user `1` from all tables that have
```
df = dx.from_tables("*.*.*")\
.having_columns("user_id")\
.apply_sql("SELECT `user_id`, to_json(struct(*)) AS row_content FROM {full_table_name} WHERE `user_id` = 1")\
.to_union_dataframe()
```
.with_sql("SELECT `user_id`, to_json(struct(*)) AS row_content FROM {full_table_name} WHERE `user_id` = 1")\
.apply()
```

### Limitations

The current approach only selects tables that contain the specified column, and does not recursively follow the relationships with other tables.
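For example, the unioned DataFrame returned by `apply()` can be written out as a single report for the data subject. A minimal sketch, assuming an initialized `dx` instance and a hypothetical output path:

```
report_df = (
    dx.from_tables("*.*.*")
    .having_columns("user_id")
    .with_sql("SELECT to_json(struct(*)) AS row_content FROM {full_table_name} WHERE `user_id` = 1")
    .apply()
)

# Consolidate all matching rows into one JSON report (the path is an assumption)
report_df.write.mode("overwrite").json("/tmp/user_1_access_report")
```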
6 changes: 3 additions & 3 deletions docs/GDPR_RoE.md
@@ -9,9 +9,9 @@ For example, if you want to delete users `1`, `2`, and `3` from all tables that
```
dx.from_tables("*.*.*")\
.having_columns("user_id")\
.apply_sql("DELETE FROM {full_table_name} WHERE `user_id` IN (1, 2, 3)"")\
.execute()
# You can use .explain() instead of .execute() to preview the generated SQL
.with_sql("DELETE FROM {full_table_name} WHERE `user_id` IN (1, 2, 3)"")\
.display()
# You can use .explain() instead of .display() to preview the generated SQL
```

## Vacuum
4 changes: 2 additions & 2 deletions docs/Vacuum.md
@@ -8,8 +8,8 @@ With DiscoverX you can vacuum all the tables at once with the command:

```
dx.from_tables("*.*.*")\
.apply_sql("VACUUM {full_table_name}")\
.execute()
.with_sql("VACUUM {full_table_name}")\
.display()
```

You can schedule [this example notebook](https://raw.githubusercontent.com/databrickslabs/discoverx/master/examples/vacuum_multiple_tables.py) in your Databricks workflows to run vacuum periodically.
21 changes: 9 additions & 12 deletions examples/detect_small_files.py
@@ -7,7 +7,7 @@
# MAGIC As a rule of thumb, if a table has more than `100` files and average file size smaller than `10 MB`, then we can consider it having too many small files.
# MAGIC
# MAGIC Some common causes of too many small files are:
- # MAGIC * Overpartitioning: the cardinality of the partition columns is too high
+ # MAGIC * Overpartitioning: the cardinality of the partition columns is too high
# MAGIC * Lack of scheduled maintenance operations like `OPTIMIZE`
# MAGIC * Missing auto optimize on write
# MAGIC
@@ -38,16 +38,13 @@

from pyspark.sql.functions import col, lit

- dx.from_tables(from_tables)\
-     .apply_sql("DESCRIBE DETAIL {full_table_name}")\
-     .to_union_dataframe()\
-     .withColumn("average_file_size_MB", col("sizeInBytes") / col("numFiles") / 1024 / 1024)\
-     .withColumn("has_too_many_small_files",
-         (col("average_file_size_MB") < small_file_max_size_MB) &
-         (col("numFiles") > min_file_number))\
-     .filter("has_too_many_small_files")\
-     .display()
+ dx.from_tables(from_tables).with_sql("DESCRIBE DETAIL {full_table_name}").apply().withColumn(
+     "average_file_size_MB", col("sizeInBytes") / col("numFiles") / 1024 / 1024
+ ).withColumn(
+     "has_too_many_small_files",
+     (col("average_file_size_MB") < small_file_max_size_MB) & (col("numFiles") > min_file_number),
+ ).filter(
+     "has_too_many_small_files"
+ ).display()

# COMMAND ----------
-
-
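Once small-file tables are flagged, a natural follow-up is to compact them. A sketch using the same `with_sql` pattern shown in docs/Arbitrary_multi-table_SQL.md, assuming the `dx` and `from_tables` variables from the notebook above (filtering down to only the flagged tables is left out for brevity):

```
# Sketch: compact all matched tables with OPTIMIZE (not part of this commit)
dx.from_tables(from_tables)\
    .with_sql("OPTIMIZE {full_table_name}")\
    .display()
```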
2 changes: 1 addition & 1 deletion examples/pii_detection_presidio.py
@@ -67,7 +67,7 @@
unpivoted_df = (
dx.from_tables(from_tables)
.unpivot_string_columns(sample_size=sample_size)
- .to_union_dataframe()
+ .apply()
.localCheckpoint() # Checkpointing to reduce the query plan size
)

4 changes: 2 additions & 2 deletions examples/vacuum_multiple_tables.py
@@ -28,7 +28,7 @@

# COMMAND ----------

- dx.from_tables(from_tables).apply_sql("VACUUM {full_table_name}").explain()
+ dx.from_tables(from_tables).with_sql("VACUUM {full_table_name}").explain()

# COMMAND ----------

@@ -37,4 +37,4 @@

# COMMAND ----------

- (dx.from_tables(from_tables).apply_sql("VACUUM {full_table_name}").execute())
+ (dx.from_tables(from_tables).with_sql("VACUUM {full_table_name}").display())
