From 5d7889e8e68640929ee2f4a760280db368ea0346 Mon Sep 17 00:00:00 2001 From: Lorenzo Rubio <104133639+lorenzorubi-db@users.noreply.github.com> Date: Mon, 18 Dec 2023 12:05:58 +0100 Subject: [PATCH 01/25] delta housekeeping initial commit --- discoverx/delta_housekeeping.py | 179 ++++++++++++++++++++++++++++ discoverx/explorer.py | 48 +++++++- examples/exec_delta_housekeeping.py | 31 +++++ 3 files changed, 256 insertions(+), 2 deletions(-) create mode 100644 discoverx/delta_housekeeping.py create mode 100644 examples/exec_delta_housekeeping.py diff --git a/discoverx/delta_housekeeping.py b/discoverx/delta_housekeeping.py new file mode 100644 index 0000000..1ae3ce7 --- /dev/null +++ b/discoverx/delta_housekeeping.py @@ -0,0 +1,179 @@ +from typing import Iterable +from discoverx.table_info import TableInfo + +from pyspark.sql import DataFrame +from pyspark.sql.window import Window +import pyspark.sql.types as T +import pyspark.sql.functions as F + + + + +class DeltaHousekeeping: + empty_schema = T.StructType([ + T.StructField("catalog", T.StringType()), + T.StructField("database", T.StringType()), + T.StructField("tableName", T.StringType()), + ]) + + @staticmethod + def _process_describe_history(describe_detail_df, describe_history_df) -> DataFrame: + """ + processes the DESCRIBE HISTORY result of potentially several tables in different schemas/catalogs + Provides + - table stats (size and number of files) + - timestamp for last & second last OPTIMIZE + - stats of OPTIMIZE (including ZORDER) + - timestamp for last & second last VACUUM + + TODO reconsider if it is better outside of the class + """ + if not "operation" in describe_history_df.columns: + return describe_detail_df + + # window over operation + operation_order = ( + describe_history_df + .filter(F.col("operation").isin(["OPTIMIZE", "VACUUM END"])) + .withColumn("operation_order", F.row_number().over( + Window.partitionBy(["catalog", "database", "tableName", "operation"]).orderBy(F.col("timestamp").desc()) + )) + ) + # max & 2nd timestamp of OPTIMIZE into output + out = describe_detail_df.join( + operation_order + .filter((F.col("operation") == "OPTIMIZE") & (F.col("operation_order") == 1)) + .select("catalog", "database", "tableName", "timestamp") + .withColumnRenamed("timestamp", "max_optimize_timestamp"), + how="outer", on=["catalog", "database", "tableName"] + ) + out = out.join( + operation_order + .filter((F.col("operation") == "OPTIMIZE") & (F.col("operation_order") == 2)) + .select("catalog", "database", "tableName", "timestamp") + .withColumnRenamed("timestamp", "2nd_optimize_timestamp"), + how="outer", on=["catalog", "database", "tableName"] + ) + # max timestamp of VACUUM into output + out = out.join( + operation_order + .filter((F.col("operation") == "VACUUM END") & (F.col("operation_order") == 1)) + .select("catalog", "database", "tableName", "timestamp") + .withColumnRenamed("timestamp", "max_vacuum_timestamp"), + how="outer", on=["catalog", "database", "tableName"] + ) + out = out.join( + operation_order + .filter((F.col("operation") == "VACUUM END") & (F.col("operation_order") == 2)) + .select("catalog", "database", "tableName", "timestamp") + .withColumnRenamed("timestamp", "2nd_vacuum_timestamp"), + how="outer", on=["catalog", "database", "tableName"] + ) + # summary of table metrics + table_metrics_1 = ( + operation_order.filter((F.col("operation") == "OPTIMIZE") & (F.col("operation_order") == 1)) + .select([ + F.col("catalog"), + F.col("database"), + F.col("tableName"), + F.col("min_file_size"), + 
F.col("p50_file_size"), + F.col("max_file_size"), + F.col("z_order_by"), + ]) + ) + + # write to output + out = out.join( + table_metrics_1, + how="outer", on=["catalog", "database", "tableName"] + ) + + return out + + def scan( + self, + table_info_list: Iterable[TableInfo], + housekeeping_table_name: str = "lorenzorubi.default.housekeeping_summary_v2", # TODO remove + do_save_as_table: bool = True, + ): + dd_list = [] + statements = [] + errors = [] + + if not isinstance(table_info_list, Iterable): + table_info_list = [table_info_list] + + for table_info in table_info_list: + try: + dd = spark.sql(f""" + DESCRIBE DETAIL {table_info.catalog}.{table_info.schema}.{table_info.table}; + """) + + dd = ( + dd + .withColumn("split", F.split(F.col('name'), '\.')) + .withColumn("catalog", F.col("split").getItem(0)) + .withColumn("database", F.col("split").getItem(1)) + .withColumn("tableName", F.col("split").getItem(2)) + .select([ + F.col("catalog"), + F.col("database"), + F.col("tableName"), + F.col("numFiles").alias("number_of_files"), + F.col("sizeInBytes").alias("bytes"), + ]) + ) + dd_list.append(dd) + statements.append(f""" + SELECT + '{table_info.catalog}' AS catalog, + '{table_info.schema}' AS database, + '{table_info.table}' AS tableName, + operation, + timestamp, + operationMetrics.minFileSize AS min_file_size, + operationMetrics.p50FileSize AS p50_file_size, + operationMetrics.maxFileSize AS max_file_size, + operationParameters.zOrderBy AS z_order_by + FROM (DESCRIBE HISTORY {table_info.catalog}.{table_info.schema}.{table_info.table}) + WHERE operation in ('OPTIMIZE', 'VACUUM END') + """) + except Exception as e: + errors.append(spark.createDataFrame( + [(table_info.catalog, table_info.schema, table_info.table, str(e))], + ["catalog", "database", "tableName", "error"] + )) + + statement = " UNION ".join(statements) + + dh = spark.createDataFrame([], self.empty_schema) + if statements: + dh = self.process_describe_history( + reduce( + lambda left, right: left.union(right), + dd_list + ), + spark.sql(statement), + None + ) + + errors_df = spark.createDataFrame([], self.empty_schema) + if errors: + errors_df = reduce( + lambda left, right: left.union(right), + errors + ) + + out = dh.unionByName(errors_df, allowMissingColumns=True) + if do_save_as_table: + ( + out + .write + .format("delta") + .mode("append") + .option("mergeSchema", "true") + .saveAsTable(housekeeping_table_name) + ) + return out + diff --git a/discoverx/explorer.py b/discoverx/explorer.py index 46f5a14..1b57c5d 100644 --- a/discoverx/explorer.py +++ b/discoverx/explorer.py @@ -1,7 +1,8 @@ import concurrent.futures import copy import re -from typing import Optional, List +import more_itertools +from typing import Optional, List, Callable, Iterable from discoverx import logging from discoverx.common import helper from discoverx.discovery import Discovery @@ -11,6 +12,7 @@ from pyspark.sql import DataFrame, SparkSession from pyspark.sql.functions import lit from discoverx.table_info import InfoFetcher, TableInfo +from discoverx.delta_housekeeping import DeltaHousekeeping logger = logging.Logging() @@ -147,7 +149,7 @@ def scan( discover.scan(rules=rules, sample_size=sample_size, what_if=what_if) return discover - def map(self, f) -> list[any]: + def map(self, f: Callable) -> list[any]: """Runs a function for each table in the data explorer Args: @@ -178,6 +180,48 @@ def map(self, f) -> list[any]: return res + def map_chunked(self, f: Callable, tables_per_chunk: int, **kwargs) -> list[any]: + """Runs a function for each 
table in the data explorer + + Args: + f (function): The function to run. The function should accept either a list of TableInfo objects as input and return a list of any object as output. + + Returns: + list[any]: A list of the results of running the function for each table + """ + res = [] + table_list = self._info_fetcher.get_tables_info( + self._catalogs, + self._schemas, + self._tables, + self._having_columns, + self._with_tags, + ) + with concurrent.futures.ThreadPoolExecutor(max_workers=self._max_concurrency) as executor: + # Submit tasks to the thread pool + futures = [ + executor.submit(f, table_chunk, **kwargs) for table_chunk in more_itertools.chunked(table_list, tables_per_chunk) + ] + + # Process completed tasks + for future in concurrent.futures.as_completed(futures): + result = future.result() + if result is not None: + res.extend(result) + + logger.debug("Finished lakehouse map_chunked task") + + return res + + def delta_housekeeping(self) -> DataFrame: + """ + + """ + dh = DeltaHousekeeping() + self.map( + dh.scan + ) + class DataExplorerActions: def __init__( diff --git a/examples/exec_delta_housekeeping.py b/examples/exec_delta_housekeeping.py new file mode 100644 index 0000000..e8bfeee --- /dev/null +++ b/examples/exec_delta_housekeeping.py @@ -0,0 +1,31 @@ +# Databricks notebook source +# MAGIC %md +# MAGIC # Run arbitrary operations across multiple tables +# MAGIC + +# COMMAND ---------- + +# MAGIC %reload_ext autoreload +# MAGIC %autoreload 2 + +# COMMAND ---------- + + +from discoverx import DX + +dx = DX() + +# COMMAND ---------- + + +# COMMAND ---------- + +result = ( + dx.from_tables("lorenzorubi.*.*") + .with_concurrency(1) # You can increase the concurrency with this parameter + .delta_housekeeping() +) +print(len(result)) + +# COMMAND ---------- + From 90bab2790fc6f380b86a7b88e5aae5977a14ee97 Mon Sep 17 00:00:00 2001 From: lorenzorubi-db Date: Mon, 18 Dec 2023 11:34:57 +0000 Subject: [PATCH 02/25] debugging initial version --- discoverx/delta_housekeeping.py | 39 ++++++++++++++++++----------- discoverx/explorer.py | 4 +-- examples/exec_delta_housekeeping.py | 4 +-- 3 files changed, 27 insertions(+), 20 deletions(-) diff --git a/discoverx/delta_housekeeping.py b/discoverx/delta_housekeeping.py index 1ae3ce7..8393405 100644 --- a/discoverx/delta_housekeeping.py +++ b/discoverx/delta_housekeeping.py @@ -1,20 +1,22 @@ from typing import Iterable +from functools import reduce from discoverx.table_info import TableInfo -from pyspark.sql import DataFrame +from pyspark.sql import DataFrame, SparkSession from pyspark.sql.window import Window import pyspark.sql.types as T import pyspark.sql.functions as F - - class DeltaHousekeeping: - empty_schema = T.StructType([ - T.StructField("catalog", T.StringType()), - T.StructField("database", T.StringType()), - T.StructField("tableName", T.StringType()), - ]) + + def __init__(self, spark: SparkSession) -> None: + self._spark = spark + self.empty_schema = T.StructType([ + T.StructField("catalog", T.StringType()), + T.StructField("database", T.StringType()), + T.StructField("tableName", T.StringType()), + ]) @staticmethod def _process_describe_history(describe_detail_df, describe_history_df) -> DataFrame: @@ -97,6 +99,11 @@ def scan( housekeeping_table_name: str = "lorenzorubi.default.housekeeping_summary_v2", # TODO remove do_save_as_table: bool = True, ): + """ + Scans a table_info / table_info_list to fetch Delta stats + - DESCRIBE DETAIL + - DESCRIBE HISTORY + """ dd_list = [] statements = [] errors = [] @@ -106,10 +113,12 @@ 
def scan( for table_info in table_info_list: try: - dd = spark.sql(f""" + # runs a describe detail per table, figures out if exception + dd = self._spark.sql(f""" DESCRIBE DETAIL {table_info.catalog}.{table_info.schema}.{table_info.table}; """) + # prepares a DESCRIBE HISTORY statement per table (will be run outside of the loop) dd = ( dd .withColumn("split", F.split(F.col('name'), '\.')) @@ -140,25 +149,25 @@ def scan( WHERE operation in ('OPTIMIZE', 'VACUUM END') """) except Exception as e: - errors.append(spark.createDataFrame( + errors.append(self._spark.createDataFrame( [(table_info.catalog, table_info.schema, table_info.table, str(e))], ["catalog", "database", "tableName", "error"] )) + # statement to UNION all DESCRIBE HISTORY together statement = " UNION ".join(statements) - dh = spark.createDataFrame([], self.empty_schema) + dh = self._spark.createDataFrame([], self.empty_schema) if statements: - dh = self.process_describe_history( + dh = self._process_describe_history( reduce( lambda left, right: left.union(right), dd_list ), - spark.sql(statement), - None + self._spark.sql(statement), ) - errors_df = spark.createDataFrame([], self.empty_schema) + errors_df = self._spark.createDataFrame([], self.empty_schema) if errors: errors_df = reduce( lambda left, right: left.union(right), diff --git a/discoverx/explorer.py b/discoverx/explorer.py index 1b57c5d..d7e7b29 100644 --- a/discoverx/explorer.py +++ b/discoverx/explorer.py @@ -217,8 +217,8 @@ def delta_housekeeping(self) -> DataFrame: """ """ - dh = DeltaHousekeeping() - self.map( + dh = DeltaHousekeeping(self._spark) + return self.map( dh.scan ) diff --git a/examples/exec_delta_housekeeping.py b/examples/exec_delta_housekeeping.py index e8bfeee..3b90a47 100644 --- a/examples/exec_delta_housekeeping.py +++ b/examples/exec_delta_housekeeping.py @@ -15,9 +15,6 @@ dx = DX() -# COMMAND ---------- - - # COMMAND ---------- result = ( @@ -29,3 +26,4 @@ # COMMAND ---------- + From 94629e03213d1988bed2717cc2bb8ea05bf1664d Mon Sep 17 00:00:00 2001 From: Lorenzo Rubio <104133639+lorenzorubi-db@users.noreply.github.com> Date: Mon, 18 Dec 2023 13:18:19 +0100 Subject: [PATCH 03/25] convert output to pandas --- discoverx/delta_housekeeping.py | 23 ++++++++++++++--------- discoverx/explorer.py | 8 +++++--- examples/exec_delta_housekeeping.py | 2 +- 3 files changed, 20 insertions(+), 13 deletions(-) diff --git a/discoverx/delta_housekeeping.py b/discoverx/delta_housekeeping.py index 8393405..eca5d5d 100644 --- a/discoverx/delta_housekeeping.py +++ b/discoverx/delta_housekeeping.py @@ -2,6 +2,8 @@ from functools import reduce from discoverx.table_info import TableInfo +import pandas as pd + from pyspark.sql import DataFrame, SparkSession from pyspark.sql.window import Window import pyspark.sql.types as T @@ -19,7 +21,9 @@ def __init__(self, spark: SparkSession) -> None: ]) @staticmethod - def _process_describe_history(describe_detail_df, describe_history_df) -> DataFrame: + def _process_describe_history( + describe_detail_df: DataFrame, describe_history_df: DataFrame + ) -> DataFrame: """ processes the DESCRIBE HISTORY result of potentially several tables in different schemas/catalogs Provides @@ -94,11 +98,11 @@ def _process_describe_history(describe_detail_df, describe_history_df) -> DataFr return out def scan( - self, - table_info_list: Iterable[TableInfo], - housekeeping_table_name: str = "lorenzorubi.default.housekeeping_summary_v2", # TODO remove - do_save_as_table: bool = True, - ): + self, + table_info_list: Iterable[TableInfo], + 
housekeeping_table_name: str = "lorenzorubi.default.housekeeping_summary_v2", # TODO remove + do_save_as_table: bool = True, + ) -> pd.DataFrame: """ Scans a table_info / table_info_list to fetch Delta stats - DESCRIBE DETAIL @@ -117,8 +121,6 @@ def scan( dd = self._spark.sql(f""" DESCRIBE DETAIL {table_info.catalog}.{table_info.schema}.{table_info.table}; """) - - # prepares a DESCRIBE HISTORY statement per table (will be run outside of the loop) dd = ( dd .withColumn("split", F.split(F.col('name'), '\.')) @@ -134,6 +136,8 @@ def scan( ]) ) dd_list.append(dd) + + # prepares a DESCRIBE HISTORY statement per table (will be run outside of the loop) statements.append(f""" SELECT '{table_info.catalog}' AS catalog, @@ -175,6 +179,7 @@ def scan( ) out = dh.unionByName(errors_df, allowMissingColumns=True) + if do_save_as_table: ( out @@ -184,5 +189,5 @@ def scan( .option("mergeSchema", "true") .saveAsTable(housekeeping_table_name) ) - return out + return out.toPandas() diff --git a/discoverx/explorer.py b/discoverx/explorer.py index d7e7b29..6df804a 100644 --- a/discoverx/explorer.py +++ b/discoverx/explorer.py @@ -2,6 +2,7 @@ import copy import re import more_itertools +import pandas as pd from typing import Optional, List, Callable, Iterable from discoverx import logging from discoverx.common import helper @@ -213,14 +214,15 @@ def map_chunked(self, f: Callable, tables_per_chunk: int, **kwargs) -> list[any] return res - def delta_housekeeping(self) -> DataFrame: + def delta_housekeeping(self) -> pd.DataFrame: """ - + Gathers stats and recommendations on Delta Housekeeping """ dh = DeltaHousekeeping(self._spark) - return self.map( + dfs_pd: Iterable[pd.DataFrame] = self.map( dh.scan ) + return reduce(lambda x, y: x.union(y), dfs_pd) # TODO create DeltaHousekeepingActions and implement `apply` class DataExplorerActions: diff --git a/examples/exec_delta_housekeeping.py b/examples/exec_delta_housekeeping.py index 3b90a47..5a08a9a 100644 --- a/examples/exec_delta_housekeeping.py +++ b/examples/exec_delta_housekeeping.py @@ -22,8 +22,8 @@ .with_concurrency(1) # You can increase the concurrency with this parameter .delta_housekeeping() ) -print(len(result)) # COMMAND ---------- +display(result) From 543f852ae7682c1e4f2167635f8649198e8854d5 Mon Sep 17 00:00:00 2001 From: Lorenzo Rubio <104133639+lorenzorubi-db@users.noreply.github.com> Date: Mon, 18 Dec 2023 13:28:45 +0100 Subject: [PATCH 04/25] debugging -convert output to pandas --- discoverx/explorer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/discoverx/explorer.py b/discoverx/explorer.py index 6df804a..83e740d 100644 --- a/discoverx/explorer.py +++ b/discoverx/explorer.py @@ -222,7 +222,7 @@ def delta_housekeeping(self) -> pd.DataFrame: dfs_pd: Iterable[pd.DataFrame] = self.map( dh.scan ) - return reduce(lambda x, y: x.union(y), dfs_pd) # TODO create DeltaHousekeepingActions and implement `apply` + return pd.concat(dfs_pd) # TODO create DeltaHousekeepingActions and implement `apply` class DataExplorerActions: From 567b303104a26c8a46ca7941b51d026361b1bca9 Mon Sep 17 00:00:00 2001 From: Lorenzo Rubio <104133639+lorenzorubi-db@users.noreply.github.com> Date: Tue, 19 Dec 2023 15:21:25 +0100 Subject: [PATCH 05/25] DeltaHousekeepingActions object and tests --- discoverx/delta_housekeeping.py | 37 +++++++++++++++++++ discoverx/explorer.py | 8 ++-- setup.py | 1 + .../delta_housekeeping/dhk_pandas_result.csv | 20 ++++++++++ .../expected_need_optimize.csv | 4 ++ tests/unit/delta_housekeeping_test.py | 25 +++++++++++++ 6 files 
changed, 90 insertions(+), 5 deletions(-) create mode 100644 tests/unit/data/delta_housekeeping/dhk_pandas_result.csv create mode 100644 tests/unit/data/delta_housekeeping/expected_need_optimize.csv create mode 100644 tests/unit/delta_housekeeping_test.py diff --git a/discoverx/delta_housekeeping.py b/discoverx/delta_housekeeping.py index eca5d5d..6a6463b 100644 --- a/discoverx/delta_housekeeping.py +++ b/discoverx/delta_housekeeping.py @@ -191,3 +191,40 @@ def scan( ) return out.toPandas() + + +class DeltaHousekeepingActions: + def __init__( + self, + # delta_housekeeping: DeltaHousekeeping, + mapped_pd_dfs: Iterable[pd.DataFrame], + # spark: SparkSession = None, + min_table_size_optimize: int = 128*1024*1024, + stats: pd.DataFrame = None, # for testability only + ) -> None: + # self._delta_housekeeping = delta_housekeeping + if stats is None: + self._mapped_pd_dfs = mapped_pd_dfs + stats = pd.concat(self._mapped_pd_dfs) + self._stats: pd.DataFrame = stats + # if spark is None: + # spark = SparkSession.builder.getOrCreate() + # self._spark = spark + self.min_table_size_optimize = min_table_size_optimize + self.tables_not_optimized_legend = "Tables that are never OPTIMIZED and would benefit from it" + + def stats(self) -> pd.DataFrame: + return self._stats + + def _need_optimize(self) -> pd.DataFrame: + stats = self._stats + return ( + stats.loc[stats.max_optimize_timestamp.isnull() & (stats.bytes > self.min_table_size_optimize)] + ) + + def apply(self): + return [ + {self.tables_not_optimized_legend: self._need_optimize()} + ] + + diff --git a/discoverx/explorer.py b/discoverx/explorer.py index 83e740d..94412f4 100644 --- a/discoverx/explorer.py +++ b/discoverx/explorer.py @@ -13,7 +13,7 @@ from pyspark.sql import DataFrame, SparkSession from pyspark.sql.functions import lit from discoverx.table_info import InfoFetcher, TableInfo -from discoverx.delta_housekeeping import DeltaHousekeeping +from discoverx.delta_housekeeping import DeltaHousekeeping, DeltaHousekeepingActions logger = logging.Logging() @@ -219,10 +219,8 @@ def delta_housekeeping(self) -> pd.DataFrame: Gathers stats and recommendations on Delta Housekeeping """ dh = DeltaHousekeeping(self._spark) - dfs_pd: Iterable[pd.DataFrame] = self.map( - dh.scan - ) - return pd.concat(dfs_pd) # TODO create DeltaHousekeepingActions and implement `apply` + dfs_pd: Iterable[pd.DataFrame] = self.map(dh.scan) + return DeltaHousekeepingActions(dfs_pd) class DataExplorerActions: diff --git a/setup.py b/setup.py index 9233b4d..0e4d73f 100644 --- a/setup.py +++ b/setup.py @@ -34,6 +34,7 @@ "delta-spark>=2.2.0", "pandas<2.0.0", # From 2.0.0 onwards, pandas does not support iteritems() anymore, spark.createDataFrame will fail "numpy<1.24", # From 1.24 onwards, module 'numpy' has no attribute 'bool'. 
+ "more_itertools", ] TEST_REQUIREMENTS = [ diff --git a/tests/unit/data/delta_housekeeping/dhk_pandas_result.csv b/tests/unit/data/delta_housekeeping/dhk_pandas_result.csv new file mode 100644 index 0000000..6a60520 --- /dev/null +++ b/tests/unit/data/delta_housekeeping/dhk_pandas_result.csv @@ -0,0 +1,20 @@ +catalog,database,tableName,number_of_files,bytes,max_optimize_timestamp,2nd_optimize_timestamp,max_vacuum_timestamp,2nd_vacuum_timestamp,min_file_size,p50_file_size,max_file_size,z_order_by,error +lorenzorubi,default,housekeeping_summary_v3,1,3787,null,null,null,null,null,null,null,null,null +lorenzorubi,maxmind_geo,gold_ipv6,1,4907069,null,null,null,null,null,null,null,null,null +lorenzorubi,default,click_sales,6,326068799,null,null,2023-12-06T16:40:28Z,2023-12-05T01:19:47Z,null,null,null,null,null +lorenzorubi,default,housekeeping_summary,1,192917,2023-12-05T05:50:14Z,2023-12-05T05:21:22Z,null,null,192917,192917,192917,[],null +lorenzorubi,default,housekeeping_summary_v2,3,12326,2023-12-18T11:25:35Z,null,null,null,5273,5273,5273,[],null +lorenzorubi,maxmind_geo,raw_locations,1,6933,null,null,null,null,null,null,null,null,null +lorenzorubi,tpch,customer,1,61897021,null,null,null,null,null,null,null,null,null +lorenzorubi,tpch,nation,1,3007,null,null,null,null,null,null,null,null,null +lorenzorubi,maxmind_geo,raw_ipv6,1,1783720,null,null,null,null,null,null,null,null,null +lorenzorubi,maxmind_geo,gold_ipv4,1,7220024,null,null,null,null,null,null,null,null,null +lorenzorubi,dais_dlt_2023,enriched_orders,null,null,null,null,null,null,null,null,null,null,[UNSUPPORTED_VIEW_OPERATION.WITHOUT_SUGGESTION] The view `lorenzorubi`.`dais_dlt_2023`.`enriched_orders` does not support DESCRIBE DETAIL. ; line 2 pos 20 +lorenzorubi,default,click_sales_history,1,7710,null,null,null,null,null,null,null,null,null +lorenzorubi,tpch,orders,2406,317120666,null,null,null,null,null,null,null,null,null +lorenzorubi,default,complete_data,6,326060019,null,null,2023-12-06T16:40:36Z,2023-12-05T01:19:25Z,null,null,null,null,null +lorenzorubi,maxmind_geo,raw_ipv4,1,3115269,null,null,null,null,null,null,null,null,null +lorenzorubi,gcp_cost_analysis,sku_prices,1,835,null,null,null,null,null,null,null,null,null +lorenzorubi,dais_dlt_2023,daily_totalorders_by_nation,null,null,null,null,null,null,null,null,null,null,[UNSUPPORTED_VIEW_OPERATION.WITHOUT_SUGGESTION] The view `lorenzorubi`.`dais_dlt_2023`.`daily_totalorders_by_nation` does not support DESCRIBE DETAIL. ; line 2 pos 20 +lorenzorubi,gcp_cost_analysis,project_ids,2,1774,null,null,null,null,null,null,null,null,null +lorenzorubi,dais_dlt_2023,daily_2nd_high_orderprice,null,null,null,null,null,null,null,null,null,null,[UNSUPPORTED_VIEW_OPERATION.WITHOUT_SUGGESTION] The view `lorenzorubi`.`dais_dlt_2023`.`daily_2nd_high_orderprice` does not support DESCRIBE DETAIL. 
; line 2 pos 20 diff --git a/tests/unit/data/delta_housekeeping/expected_need_optimize.csv b/tests/unit/data/delta_housekeeping/expected_need_optimize.csv new file mode 100644 index 0000000..237a44c --- /dev/null +++ b/tests/unit/data/delta_housekeeping/expected_need_optimize.csv @@ -0,0 +1,4 @@ +catalog,database,tableName,number_of_files,bytes,max_optimize_timestamp,2nd_optimize_timestamp,max_vacuum_timestamp,2nd_vacuum_timestamp,min_file_size,p50_file_size,max_file_size,z_order_by,error +lorenzorubi,default,click_sales,6.0,326068799.0,,,2023-12-06T16:40:28Z,2023-12-05T01:19:47Z,,,,, +lorenzorubi,tpch,orders,2406.0,317120666.0,,,,,,,,, +lorenzorubi,default,complete_data,6.0,326060019.0,,,2023-12-06T16:40:36Z,2023-12-05T01:19:25Z,,,,, diff --git a/tests/unit/delta_housekeeping_test.py b/tests/unit/delta_housekeeping_test.py new file mode 100644 index 0000000..1b760f1 --- /dev/null +++ b/tests/unit/delta_housekeeping_test.py @@ -0,0 +1,25 @@ +import pytest +import pandas as pd +from discoverx.delta_housekeeping import DeltaHousekeepingActions +from pathlib import Path + + +def test_need_optimize(request): + module_path = Path(request.module.__file__) + test_file_path = module_path.parent / "data/delta_housekeeping/dhk_pandas_result.csv" + stats = pd.read_csv(str(test_file_path.resolve())) + dha = DeltaHousekeepingActions( + None, + stats=stats, + ) + res = dha.apply() + assert len(res) == 1 + need_optimize = [item for item in res if list(res[0].keys())[0] == dha.tables_not_optimized_legend] + assert len(need_optimize) == 1 + need_optimize_df = list(need_optimize[0].values())[0] + need_optimize_df.to_csv(module_path.parent / "data/delta_housekeeping/expected_need_optimize.csv", index=False) + expected = pd.read_csv(module_path.parent / "data/delta_housekeeping/expected_need_optimize.csv") + pd.testing.assert_frame_equal( + need_optimize_df.reset_index().loc[:, ["catalog", "database", "tableName"]], + expected.loc[:, ["catalog", "database", "tableName"]], + ) From bded30562c3fd3716f261c0f12e811d6b6886fec Mon Sep 17 00:00:00 2001 From: Lorenzo Rubio <104133639+lorenzorubi-db@users.noreply.github.com> Date: Thu, 21 Dec 2023 17:58:44 +0100 Subject: [PATCH 06/25] added more insights to housekeeping and refactored tests --- discoverx/delta_housekeeping.py | 150 +++++++++++++++++- examples/exec_delta_housekeeping.py | 23 ++- tests/unit/delta_housekeeping_actions_test.py | 49 ++++++ tests/unit/delta_housekeeping_test.py | 25 --- 4 files changed, 213 insertions(+), 34 deletions(-) create mode 100644 tests/unit/delta_housekeeping_actions_test.py delete mode 100644 tests/unit/delta_housekeeping_test.py diff --git a/discoverx/delta_housekeeping.py b/discoverx/delta_housekeeping.py index 6a6463b..c3cac7e 100644 --- a/discoverx/delta_housekeeping.py +++ b/discoverx/delta_housekeeping.py @@ -1,9 +1,10 @@ from typing import Iterable from functools import reduce -from discoverx.table_info import TableInfo - +from datetime import datetime import pandas as pd +from discoverx.table_info import TableInfo + from pyspark.sql import DataFrame, SparkSession from pyspark.sql.window import Window import pyspark.sql.types as T @@ -199,7 +200,13 @@ def __init__( # delta_housekeeping: DeltaHousekeeping, mapped_pd_dfs: Iterable[pd.DataFrame], # spark: SparkSession = None, - min_table_size_optimize: int = 128*1024*1024, + min_table_size_optimize: int = 128*1024*1024, # i.e. 
128 MB + min_days_not_optimized: int = 7, + min_days_not_vacuumed: int = 31, + max_optimize_freq: int = 2, + max_vacuum_freq: int = 2, + small_file_threshold: int = 32*1024*1024, # i.e. 32 MB + min_number_of_files_for_zorder: int = 8, stats: pd.DataFrame = None, # for testability only ) -> None: # self._delta_housekeeping = delta_housekeeping @@ -211,20 +218,149 @@ def __init__( # spark = SparkSession.builder.getOrCreate() # self._spark = spark self.min_table_size_optimize = min_table_size_optimize + self.min_days_not_optimized = min_days_not_optimized + self.min_days_not_vacuumed = min_days_not_vacuumed + self.max_optimize_freq = max_optimize_freq + self.max_vacuum_freq = max_vacuum_freq + self.small_file_threshold = small_file_threshold + self.min_number_of_files_for_zorder = min_number_of_files_for_zorder self.tables_not_optimized_legend = "Tables that are never OPTIMIZED and would benefit from it" + self.tables_not_vacuumed_legend = "Tables that are never VACUUM'ed" + self.tables_not_optimized_last_days = "Tables that are not OPTIMIZED often enough" + self.tables_not_vacuumed_last_days = "Tables that are not VACUUM'ed often enough" + self.tables_optimized_too_freq = "Tables that are OPTIMIZED too often" + self.tables_vacuumed_too_freq = "Tables that are VACUUM'ed too often" + self.tables_do_not_need_optimize = "Tables that are too small to be OPTIMIZED" + self.tables_to_analyze = "Tables that need more analysis (small_files)" + self.tables_zorder_not_effective = "Tables for which ZORDER is not being effective" def stats(self) -> pd.DataFrame: return self._stats def _need_optimize(self) -> pd.DataFrame: + stats = self._stats.copy() + stats = stats.loc[stats.max_optimize_timestamp.isnull() & stats.bytes.notnull()] + return ( + stats.loc[(stats.bytes.astype(int) > self.min_table_size_optimize)] + ) + + def _optimize_not_needed(self) -> pd.DataFrame: + stats = self._stats.copy() + stats = stats.loc[stats.max_optimize_timestamp.isnull() & stats.bytes.notnull()] + return ( + stats.loc[stats.max_optimize_timestamp.notnull() & (stats.bytes.astype(int) > self.min_table_size_optimize)] + ) + + def _not_optimized_last_days(self) -> pd.DataFrame: + stats = self._stats.copy() + stats['max_optimize_timestamp'] = pd.to_datetime(stats['max_optimize_timestamp']) + stats['optimize_lag'] = ( + datetime.utcnow().replace(tzinfo=stats.dtypes["max_optimize_timestamp"].tz) - stats['max_optimize_timestamp'] + ).dt.days + return ( + stats[stats['optimize_lag'] < self.min_days_not_optimized] + ) + + def _optimized_too_frequently(self) -> pd.DataFrame: + stats = self._stats.copy() + stats['max_optimize_timestamp'] = pd.to_datetime(stats['max_optimize_timestamp']) + stats['2nd_optimize_timestamp'] = pd.to_datetime(stats['2nd_optimize_timestamp']) + stats['optimize_lag'] = (stats['max_optimize_timestamp'] - stats['2nd_optimize_timestamp']).dt.days + return ( + stats[stats['optimize_lag'] < self.max_optimize_freq] + ) + + def _never_vacuumed(self) -> pd.DataFrame: stats = self._stats return ( - stats.loc[stats.max_optimize_timestamp.isnull() & (stats.bytes > self.min_table_size_optimize)] + stats.loc[stats.max_vacuum_timestamp.isnull()] + ) + + def _not_vacuumed_last_days(self) -> pd.DataFrame: + stats = self._stats.copy() + stats['max_vacuum_timestamp'] = pd.to_datetime(stats['max_vacuum_timestamp']) + stats['vacuum_lag'] = ( + datetime.utcnow().replace(tzinfo=stats.dtypes["max_vacuum_timestamp"].tz) - stats['max_vacuum_timestamp'] + ).dt.days + return ( + stats[stats['vacuum_lag'] < self.min_days_not_vacuumed] + ) 
+ + def _vacuumed_too_frequently(self) -> pd.DataFrame: + stats = self._stats.copy() + stats['max_vacuum_timestamp'] = pd.to_datetime(stats['max_vacuum_timestamp']) + stats['2nd_vacuum_timestamp'] = pd.to_datetime(stats['2nd_vacuum_timestamp']) + stats['vacuum_lag'] = (stats['max_vacuum_timestamp'] - stats['2nd_vacuum_timestamp']).dt.days + return ( + stats[stats['vacuum_lag'] < self.max_vacuum_freq] + ) + + def _analyze_these_tables(self) -> pd.DataFrame: + stats = self._stats.copy() + stats = stats.loc[stats['max_optimize_timestamp'].notnull() & + stats['p50_file_size'].notnull() & + (stats['number_of_files'] > 1)] + stats = stats.loc[(stats['p50_file_size'] < self.small_file_threshold)] + return ( + stats.sort_values(by=['database', 'tableName', 'number_of_files'], ascending=[True, True, False]) + ) + + def _zorder_not_effective(self) -> pd.DataFrame: + stats = self._stats.copy() + stats = stats.loc[stats['max_optimize_timestamp'].notnull() & + stats['p50_file_size'].notnull()] + + # clean up z_order_by column and split into array + stats['z_order_by_clean'] = stats['z_order_by'].apply( + lambda x: None if x == "[]" else x.replace('[', '').replace(']', '').replace('"', '')) + stats['z_order_by_array'] = stats['z_order_by_clean'].str.split(',') + + # filter rows with zorder columns and number_of_files is less than threshold + stats = stats[stats['z_order_by_array'].str.len() > 0] + stats = stats[stats['number_of_files'].astype(int) < self.min_number_of_files_for_zorder] + return ( + stats ) def apply(self): - return [ - {self.tables_not_optimized_legend: self._need_optimize()} - ] + out = [] + for df, legend in zip([ + self._need_optimize(), + self._never_vacuumed(), + self._not_optimized_last_days(), + self._not_vacuumed_last_days(), + self._optimized_too_frequently(), + self._vacuumed_too_frequently(), + self._optimize_not_needed(), + self._analyze_these_tables(), + self._zorder_not_effective(), + ], [ + self.tables_not_optimized_legend, + self.tables_not_vacuumed_legend, + self.tables_not_optimized_last_days, + self.tables_not_vacuumed_last_days, + self.tables_optimized_too_freq, + self.tables_vacuumed_too_freq, + self.tables_do_not_need_optimize, + self.tables_to_analyze, + self.tables_zorder_not_effective, + ]): + if not df.empty: + out.append({legend: df}) + return out + + def to_html(self): + # TODO better formatting! 
+ from bs4 import BeautifulSoup + res = self.apply() + soup = BeautifulSoup(features='xml') + body = soup.new_tag('body') + soup.insert(0, body) + for r in res: + for k,v in r.items(): + title_s = soup.new_tag('title') + title_s.string = k + body.insert(0, v.to_html()) + body.insert(0, title_s) diff --git a/examples/exec_delta_housekeeping.py b/examples/exec_delta_housekeeping.py index 5a08a9a..d221432 100644 --- a/examples/exec_delta_housekeeping.py +++ b/examples/exec_delta_housekeeping.py @@ -5,6 +5,7 @@ # COMMAND ---------- +# TODO remove # MAGIC %reload_ext autoreload # MAGIC %autoreload 2 @@ -19,11 +20,29 @@ result = ( dx.from_tables("lorenzorubi.*.*") - .with_concurrency(1) # You can increase the concurrency with this parameter .delta_housekeeping() + .stats() ) +display(result) + +# COMMAND ---------- + +result = ( + dx.from_tables("lorenzorubi.*.*") + .delta_housekeeping() + .apply() +) + +# COMMAND ---------- + +result = ( + dx.from_tables("lorenzorubi.*.*") + .delta_housekeeping() + .html() +) + +displayHTML(result) # COMMAND ---------- -display(result) diff --git a/tests/unit/delta_housekeeping_actions_test.py b/tests/unit/delta_housekeeping_actions_test.py new file mode 100644 index 0000000..7ccbd9a --- /dev/null +++ b/tests/unit/delta_housekeeping_actions_test.py @@ -0,0 +1,49 @@ +import pytest +import pandas as pd +from discoverx.delta_housekeeping import DeltaHousekeepingActions +from pathlib import Path + + +def _resolve_file_path(request, relative_path): + module_path = Path(request.module.__file__) + test_file_path = module_path.parent / relative_path + return pd.read_csv(str(test_file_path.resolve())) + + +@pytest.fixture() +def housekeeping_stats(request): + return _resolve_file_path(request, "data/delta_housekeeping/dhk_pandas_result.csv") + + +@pytest.fixture() +def expected_need_optimize(request): + return _resolve_file_path(request, "data/delta_housekeeping/expected_need_optimize.csv") + + +def test_apply_output(housekeeping_stats, expected_need_optimize): + dha = DeltaHousekeepingActions( + None, + stats=housekeeping_stats, + ) + res = dha.apply() + assert len(res) == 7 + need_optimize = [item for item in res if (list(item.keys())[0] == dha.tables_not_optimized_legend)] + assert len(need_optimize) == 1 + need_optimize_df = list(need_optimize[0].values())[0] + pd.testing.assert_frame_equal( + need_optimize_df.reset_index().loc[:, ["catalog", "database", "tableName"]], + expected_need_optimize.loc[:, ["catalog", "database", "tableName"]], + ) + # TODO complete all the tests + + +def test_empty_apply_output(housekeeping_stats): + dha = DeltaHousekeepingActions( + None, + stats=housekeeping_stats, + min_table_size_optimize=1024*1024*1024*1024 + ) + res = dha.apply() + assert len(res) == 6 + need_optimize = [item for item in res if list(item.keys())[0] == dha.tables_not_optimized_legend] + assert len(need_optimize) == 0 diff --git a/tests/unit/delta_housekeeping_test.py b/tests/unit/delta_housekeeping_test.py deleted file mode 100644 index 1b760f1..0000000 --- a/tests/unit/delta_housekeeping_test.py +++ /dev/null @@ -1,25 +0,0 @@ -import pytest -import pandas as pd -from discoverx.delta_housekeeping import DeltaHousekeepingActions -from pathlib import Path - - -def test_need_optimize(request): - module_path = Path(request.module.__file__) - test_file_path = module_path.parent / "data/delta_housekeeping/dhk_pandas_result.csv" - stats = pd.read_csv(str(test_file_path.resolve())) - dha = DeltaHousekeepingActions( - None, - stats=stats, - ) - res = dha.apply() - assert 
len(res) == 1 - need_optimize = [item for item in res if list(res[0].keys())[0] == dha.tables_not_optimized_legend] - assert len(need_optimize) == 1 - need_optimize_df = list(need_optimize[0].values())[0] - need_optimize_df.to_csv(module_path.parent / "data/delta_housekeeping/expected_need_optimize.csv", index=False) - expected = pd.read_csv(module_path.parent / "data/delta_housekeeping/expected_need_optimize.csv") - pd.testing.assert_frame_equal( - need_optimize_df.reset_index().loc[:, ["catalog", "database", "tableName"]], - expected.loc[:, ["catalog", "database", "tableName"]], - ) From cf4ef0784079aee2c09edb565b01569c1d8d0a9e Mon Sep 17 00:00:00 2001 From: lorenzorubi-db Date: Thu, 21 Dec 2023 18:42:18 +0000 Subject: [PATCH 07/25] regression and cleanup --- discoverx/delta_housekeeping.py | 8 +++--- examples/exec_delta_housekeeping.py | 41 +++++++++++++++-------------- 2 files changed, 26 insertions(+), 23 deletions(-) diff --git a/discoverx/delta_housekeeping.py b/discoverx/delta_housekeeping.py index c3cac7e..4928e80 100644 --- a/discoverx/delta_housekeeping.py +++ b/discoverx/delta_housekeeping.py @@ -255,7 +255,7 @@ def _not_optimized_last_days(self) -> pd.DataFrame: stats = self._stats.copy() stats['max_optimize_timestamp'] = pd.to_datetime(stats['max_optimize_timestamp']) stats['optimize_lag'] = ( - datetime.utcnow().replace(tzinfo=stats.dtypes["max_optimize_timestamp"].tz) - stats['max_optimize_timestamp'] + datetime.utcnow() - stats['max_optimize_timestamp'] # TODO careful ).dt.days return ( stats[stats['optimize_lag'] < self.min_days_not_optimized] @@ -280,7 +280,7 @@ def _not_vacuumed_last_days(self) -> pd.DataFrame: stats = self._stats.copy() stats['max_vacuum_timestamp'] = pd.to_datetime(stats['max_vacuum_timestamp']) stats['vacuum_lag'] = ( - datetime.utcnow().replace(tzinfo=stats.dtypes["max_vacuum_timestamp"].tz) - stats['max_vacuum_timestamp'] + datetime.utcnow() - stats['max_vacuum_timestamp'] # TODO careful ).dt.days return ( stats[stats['vacuum_lag'] < self.min_days_not_vacuumed] @@ -300,7 +300,7 @@ def _analyze_these_tables(self) -> pd.DataFrame: stats = stats.loc[stats['max_optimize_timestamp'].notnull() & stats['p50_file_size'].notnull() & (stats['number_of_files'] > 1)] - stats = stats.loc[(stats['p50_file_size'] < self.small_file_threshold)] + stats = stats.loc[(stats['p50_file_size'].astype(int) < self.small_file_threshold)] return ( stats.sort_values(by=['database', 'tableName', 'number_of_files'], ascending=[True, True, False]) ) @@ -364,3 +364,5 @@ def to_html(self): title_s.string = k body.insert(0, v.to_html()) body.insert(0, title_s) + + return soup diff --git a/examples/exec_delta_housekeeping.py b/examples/exec_delta_housekeeping.py index d221432..23b4318 100644 --- a/examples/exec_delta_housekeeping.py +++ b/examples/exec_delta_housekeeping.py @@ -1,47 +1,48 @@ # Databricks notebook source # MAGIC %md -# MAGIC # Run arbitrary operations across multiple tables +# MAGIC # Run Delta Housekeeping across multiple tables # MAGIC # COMMAND ---------- # TODO remove -# MAGIC %reload_ext autoreload -# MAGIC %autoreload 2 +%reload_ext autoreload +%autoreload 2 # COMMAND ---------- - from discoverx import DX dx = DX() # COMMAND ---------- -result = ( - dx.from_tables("lorenzorubi.*.*") - .delta_housekeeping() - .stats() +# DBTITLE 1,Run the discoverx DeltaHousekeeping operation -generates an output object you can apply operations to +output = ( + dx.from_tables("lorenzorubi.*.*") + .delta_housekeeping() ) -display(result) # COMMAND ---------- -result = ( - 
dx.from_tables("lorenzorubi.*.*") - .delta_housekeeping() - .apply() -) +# DBTITLE 1,Generate a pandas dataframe with stats per table +display(output.stats()) # COMMAND ---------- -result = ( - dx.from_tables("lorenzorubi.*.*") - .delta_housekeeping() - .html() -) +# DBTITLE 1,apply() operation generates a list of dictionaries (if you need to postprocess the output) +result = output.apply() + +# COMMAND ---------- + +for r in result: + print(list(r.keys())[0]) + display(list(r.values())[0]) + +# COMMAND ---------- -displayHTML(result) +# DBTITLE 1,to_html() outputs the DeltaHousekeeping recommendations +displayHTML(output.to_html()) # COMMAND ---------- From bc303cd47c54f1f034502c830e794e328bde794f Mon Sep 17 00:00:00 2001 From: Lorenzo Rubio Date: Wed, 3 Jan 2024 14:17:16 +0100 Subject: [PATCH 08/25] move implementation of map_chunked to a separated branch + improved unit tests --- discoverx/delta_housekeeping.py | 14 ++++---- discoverx/explorer.py | 34 ------------------- setup.py | 1 - tests/unit/delta_housekeeping_actions_test.py | 4 +-- tests/unit/explorer_test.py | 12 +++++++ 5 files changed, 21 insertions(+), 44 deletions(-) diff --git a/discoverx/delta_housekeeping.py b/discoverx/delta_housekeeping.py index 4928e80..889066f 100644 --- a/discoverx/delta_housekeeping.py +++ b/discoverx/delta_housekeeping.py @@ -1,6 +1,6 @@ from typing import Iterable from functools import reduce -from datetime import datetime +from datetime import datetime, timezone import pandas as pd from discoverx.table_info import TableInfo @@ -102,7 +102,7 @@ def scan( self, table_info_list: Iterable[TableInfo], housekeeping_table_name: str = "lorenzorubi.default.housekeeping_summary_v2", # TODO remove - do_save_as_table: bool = True, + do_save_as_table: bool = False, ) -> pd.DataFrame: """ Scans a table_info / table_info_list to fetch Delta stats @@ -155,7 +155,7 @@ def scan( """) except Exception as e: errors.append(self._spark.createDataFrame( - [(table_info.catalog, table_info.schema, table_info.table, str(e))], + [(table_info.catalog or "", table_info.schema, table_info.table, str(e))], ["catalog", "database", "tableName", "error"] )) @@ -253,9 +253,9 @@ def _optimize_not_needed(self) -> pd.DataFrame: def _not_optimized_last_days(self) -> pd.DataFrame: stats = self._stats.copy() - stats['max_optimize_timestamp'] = pd.to_datetime(stats['max_optimize_timestamp']) + stats['max_optimize_timestamp'] = pd.to_datetime(stats['max_optimize_timestamp'], utc=True) stats['optimize_lag'] = ( - datetime.utcnow() - stats['max_optimize_timestamp'] # TODO careful + datetime.now(timezone.utc) - stats['max_optimize_timestamp'] ).dt.days return ( stats[stats['optimize_lag'] < self.min_days_not_optimized] @@ -278,9 +278,9 @@ def _never_vacuumed(self) -> pd.DataFrame: def _not_vacuumed_last_days(self) -> pd.DataFrame: stats = self._stats.copy() - stats['max_vacuum_timestamp'] = pd.to_datetime(stats['max_vacuum_timestamp']) + stats['max_vacuum_timestamp'] = pd.to_datetime(stats['max_vacuum_timestamp'], utc=True) stats['vacuum_lag'] = ( - datetime.utcnow() - stats['max_vacuum_timestamp'] # TODO careful + datetime.now(timezone.utc) - stats['max_vacuum_timestamp'] ).dt.days return ( stats[stats['vacuum_lag'] < self.min_days_not_vacuumed] diff --git a/discoverx/explorer.py b/discoverx/explorer.py index 94412f4..378d86f 100644 --- a/discoverx/explorer.py +++ b/discoverx/explorer.py @@ -1,7 +1,6 @@ import concurrent.futures import copy import re -import more_itertools import pandas as pd from typing import Optional, List, Callable, 
Iterable from discoverx import logging @@ -181,39 +180,6 @@ def map(self, f: Callable) -> list[any]: return res - def map_chunked(self, f: Callable, tables_per_chunk: int, **kwargs) -> list[any]: - """Runs a function for each table in the data explorer - - Args: - f (function): The function to run. The function should accept either a list of TableInfo objects as input and return a list of any object as output. - - Returns: - list[any]: A list of the results of running the function for each table - """ - res = [] - table_list = self._info_fetcher.get_tables_info( - self._catalogs, - self._schemas, - self._tables, - self._having_columns, - self._with_tags, - ) - with concurrent.futures.ThreadPoolExecutor(max_workers=self._max_concurrency) as executor: - # Submit tasks to the thread pool - futures = [ - executor.submit(f, table_chunk, **kwargs) for table_chunk in more_itertools.chunked(table_list, tables_per_chunk) - ] - - # Process completed tasks - for future in concurrent.futures.as_completed(futures): - result = future.result() - if result is not None: - res.extend(result) - - logger.debug("Finished lakehouse map_chunked task") - - return res - def delta_housekeeping(self) -> pd.DataFrame: """ Gathers stats and recommendations on Delta Housekeeping diff --git a/setup.py b/setup.py index 0e4d73f..9233b4d 100644 --- a/setup.py +++ b/setup.py @@ -34,7 +34,6 @@ "delta-spark>=2.2.0", "pandas<2.0.0", # From 2.0.0 onwards, pandas does not support iteritems() anymore, spark.createDataFrame will fail "numpy<1.24", # From 1.24 onwards, module 'numpy' has no attribute 'bool'. - "more_itertools", ] TEST_REQUIREMENTS = [ diff --git a/tests/unit/delta_housekeeping_actions_test.py b/tests/unit/delta_housekeeping_actions_test.py index 7ccbd9a..125d305 100644 --- a/tests/unit/delta_housekeeping_actions_test.py +++ b/tests/unit/delta_housekeeping_actions_test.py @@ -26,7 +26,7 @@ def test_apply_output(housekeeping_stats, expected_need_optimize): stats=housekeeping_stats, ) res = dha.apply() - assert len(res) == 7 + assert len(res) == 6 need_optimize = [item for item in res if (list(item.keys())[0] == dha.tables_not_optimized_legend)] assert len(need_optimize) == 1 need_optimize_df = list(need_optimize[0].values())[0] @@ -44,6 +44,6 @@ def test_empty_apply_output(housekeeping_stats): min_table_size_optimize=1024*1024*1024*1024 ) res = dha.apply() - assert len(res) == 6 + assert len(res) == 5 need_optimize = [item for item in res if list(item.keys())[0] == dha.tables_not_optimized_legend] assert len(need_optimize) == 0 diff --git a/tests/unit/explorer_test.py b/tests/unit/explorer_test.py index 8475599..1df5f19 100644 --- a/tests/unit/explorer_test.py +++ b/tests/unit/explorer_test.py @@ -1,3 +1,4 @@ +import pandas import pytest from discoverx.explorer import DataExplorer, DataExplorerActions, InfoFetcher, TableInfo @@ -89,3 +90,14 @@ def test_no_tables_matching_filter(spark, info_fetcher): data_explorer = DataExplorer("some_catalog.default.non_existent_table", spark, info_fetcher) with pytest.raises(ValueError): data_explorer.map(lambda table_info: table_info) + + +def test_delta_housekeeeping_call(spark, info_fetcher): + data_explorer = DataExplorer("*.default.*", spark, info_fetcher) + result: pandas.DataFrame = data_explorer.delta_housekeeping().stats() + print(result['tableName'].count()) + assert result['tableName'].count() == 3 + for res in result['tableName'].tolist(): + assert res in ["tb_all_types", "tb_1", "tb_2"] + for col in result.columns: + assert col in ["catalog", "database", "tableName", 
"error"] From feeafaf06c70f673af921e9fb6c886a0f3f8ac29 Mon Sep 17 00:00:00 2001 From: Lorenzo Rubio Date: Wed, 3 Jan 2024 15:35:40 +0100 Subject: [PATCH 09/25] readability, cleanup, follow discoverx patterns --- README.md | 5 + discoverx/delta_housekeeping.py | 94 +++++++++++++------ examples/exec_delta_housekeeping.py | 24 +++-- tests/unit/delta_housekeeping_actions_test.py | 4 +- tests/unit/explorer_test.py | 2 +- 5 files changed, 87 insertions(+), 42 deletions(-) diff --git a/README.md b/README.md index c733664..9100573 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,11 @@ Operations are applied concurrently across multiple tables * OPTIMIZE with z-order on tables having specified columns * Detect tables having too many small files ([example notebook](examples/detect_small_files.py)) * Visualise quantity of data written per table per period + * Delta housekeeping analysis ([example notebook](examples/exec_delta_housekeeping.py)) which provide: + * stats (size of tables and number of files, timestamps of latest OPTIMIZE & VACUUM operations, stats of OPTIMIZE) + * recommendations on tables that need to be OPTIMIZED/VACUUM'ed + * are tables OPTIMIZED/VACUUM'ed often enough + * tables that have small files / tables for which ZORDER is not being effective * **Governance** * PII detection with Presidio ([example notebook](examples/pii_detection_presidio.py)) * Text Analysis with MosaicML and Databricks MLflow ([example notebook](examples/text_analysis_mosaicml_mlflow.py)) diff --git a/discoverx/delta_housekeeping.py b/discoverx/delta_housekeeping.py index 889066f..ceda5aa 100644 --- a/discoverx/delta_housekeeping.py +++ b/discoverx/delta_housekeeping.py @@ -98,11 +98,30 @@ def _process_describe_history( return out + @staticmethod + def save_as_table( + result: DataFrame, + housekeeping_table_name: str, + ): + """ + Static method to store intermediate results of the scan operation into Delta + Would make sense only if using map_chunked from the `DataExplorer` object + (otherwise tables are writen one by one into Delta with overhead) + + TODO create function in `DataExplorer` that uses this for a chunked + """ + ( + result + .write + .format("delta") + .mode("append") + .option("mergeSchema", "true") + .saveAsTable(housekeeping_table_name) + ) + def scan( self, table_info_list: Iterable[TableInfo], - housekeeping_table_name: str = "lorenzorubi.default.housekeeping_summary_v2", # TODO remove - do_save_as_table: bool = False, ) -> pd.DataFrame: """ Scans a table_info / table_info_list to fetch Delta stats @@ -181,25 +200,20 @@ def scan( out = dh.unionByName(errors_df, allowMissingColumns=True) - if do_save_as_table: - ( - out - .write - .format("delta") - .mode("append") - .option("mergeSchema", "true") - .saveAsTable(housekeeping_table_name) - ) - return out.toPandas() class DeltaHousekeepingActions: + """ + Processes the output of the `DeltaHousekeeping` object to provide recommendations + - tables that need to be OPTIMIZED/VACUUM'ed + - are tables OPTIMIZED/VACUUM'ed often enough + - tables that have small files / tables for which ZORDER is not being effective + """ + def __init__( self, - # delta_housekeeping: DeltaHousekeeping, mapped_pd_dfs: Iterable[pd.DataFrame], - # spark: SparkSession = None, min_table_size_optimize: int = 128*1024*1024, # i.e. 
128 MB min_days_not_optimized: int = 7, min_days_not_vacuumed: int = 31, @@ -209,14 +223,10 @@ def __init__( min_number_of_files_for_zorder: int = 8, stats: pd.DataFrame = None, # for testability only ) -> None: - # self._delta_housekeeping = delta_housekeeping if stats is None: self._mapped_pd_dfs = mapped_pd_dfs stats = pd.concat(self._mapped_pd_dfs) self._stats: pd.DataFrame = stats - # if spark is None: - # spark = SparkSession.builder.getOrCreate() - # self._spark = spark self.min_table_size_optimize = min_table_size_optimize self.min_days_not_optimized = min_days_not_optimized self.min_days_not_vacuumed = min_days_not_vacuumed @@ -234,9 +244,6 @@ def __init__( self.tables_to_analyze = "Tables that need more analysis (small_files)" self.tables_zorder_not_effective = "Tables for which ZORDER is not being effective" - def stats(self) -> pd.DataFrame: - return self._stats - def _need_optimize(self) -> pd.DataFrame: stats = self._stats.copy() stats = stats.loc[stats.max_optimize_timestamp.isnull() & stats.bytes.notnull()] @@ -322,7 +329,37 @@ def _zorder_not_effective(self) -> pd.DataFrame: stats ) - def apply(self): + def stats(self) -> DataFrame: + """Ouputs the stats per table""" + import pyspark.pandas as ps + + return ps.from_pandas(self._stats) + + def display(self) -> None: + """Executes the Delta housekeeping analysis and displays a sample of results""" + return self.apply().display() + + def apply(self) -> DataFrame: + """Displays recommendations in a DataFrame format""" + import pyspark.pandas as ps + + out = None + for recomm in self.generate_recommendations(): + for legend, df in recomm.items(): + out_df = ps.from_pandas(df).withColumn("recommendation", F.lit(legend)) + if out is None: + out = out_df + else: + out = out.unionByName(out_df, allowMissingColumns=True) + return out + + def generate_recommendations(self) -> Iterable[dict]: + """ + Generates Delta Housekeeping recommendations as a list of dictionaries (internal use + unit tests only) + A dict per recommendation where: + - The key is the legend of the recommendation + - The value is a pandas df with the affected tables + """ out = [] for df, legend in zip([ self._need_optimize(), @@ -349,20 +386,19 @@ def apply(self): out.append({legend: df}) return out - def to_html(self): + def explain(self) -> None: # TODO better formatting! 
from bs4 import BeautifulSoup - res = self.apply() soup = BeautifulSoup(features='xml') body = soup.new_tag('body') soup.insert(0, body) - for r in res: - for k,v in r.items(): + for recomm in self.generate_recommendations(): + for legend, df in recomm.items(): title_s = soup.new_tag('title') - title_s.string = k - body.insert(0, v.to_html()) + title_s.string = legend + body.insert(0, df.to_html()) body.insert(0, title_s) - return soup + displayHTML(soup) diff --git a/examples/exec_delta_housekeeping.py b/examples/exec_delta_housekeeping.py index 23b4318..a39ba06 100644 --- a/examples/exec_delta_housekeeping.py +++ b/examples/exec_delta_housekeeping.py @@ -1,13 +1,16 @@ # Databricks notebook source # MAGIC %md # MAGIC # Run Delta Housekeeping across multiple tables +# MAGIC Analysis that provides stats on Delta tables / recommendations for improvements, including: +# MAGIC - stats:size of tables and number of files, timestamps of latest OPTIMIZE & VACUUM operations, stats of OPTIMIZE) +# MAGIC - recommendations on tables that need to be OPTIMIZED/VACUUM'ed +# MAGIC - are tables OPTIMIZED/VACUUM'ed often enough +# MAGIC - tables that have small files / tables for which ZORDER is not being effective # MAGIC # COMMAND ---------- -# TODO remove -%reload_ext autoreload -%autoreload 2 +# MAGIC %pip install dbl-discoverx # COMMAND ---------- @@ -25,24 +28,25 @@ # COMMAND ---------- -# DBTITLE 1,Generate a pandas dataframe with stats per table -display(output.stats()) +# DBTITLE 1,Display the stats per table +stats = output.stats() +stats.display() # COMMAND ---------- # DBTITLE 1,apply() operation generates a list of dictionaries (if you need to postprocess the output) result = output.apply() +result.display() # COMMAND ---------- -for r in result: - print(list(r.keys())[0]) - display(list(r.values())[0]) +# DBTITLE 1,display() runs apply and displays the result +output.display() # COMMAND ---------- -# DBTITLE 1,to_html() outputs the DeltaHousekeeping recommendations -displayHTML(output.to_html()) +# DBTITLE 1,explain() outputs the DeltaHousekeeping recommendations in HTML format +output.explain() # COMMAND ---------- diff --git a/tests/unit/delta_housekeeping_actions_test.py b/tests/unit/delta_housekeeping_actions_test.py index 125d305..faed05e 100644 --- a/tests/unit/delta_housekeeping_actions_test.py +++ b/tests/unit/delta_housekeeping_actions_test.py @@ -25,7 +25,7 @@ def test_apply_output(housekeeping_stats, expected_need_optimize): None, stats=housekeeping_stats, ) - res = dha.apply() + res = dha.generate_recommendations() assert len(res) == 6 need_optimize = [item for item in res if (list(item.keys())[0] == dha.tables_not_optimized_legend)] assert len(need_optimize) == 1 @@ -43,7 +43,7 @@ def test_empty_apply_output(housekeeping_stats): stats=housekeeping_stats, min_table_size_optimize=1024*1024*1024*1024 ) - res = dha.apply() + res = dha.generate_recommendations() assert len(res) == 5 need_optimize = [item for item in res if list(item.keys())[0] == dha.tables_not_optimized_legend] assert len(need_optimize) == 0 diff --git a/tests/unit/explorer_test.py b/tests/unit/explorer_test.py index 1df5f19..429f3ae 100644 --- a/tests/unit/explorer_test.py +++ b/tests/unit/explorer_test.py @@ -94,7 +94,7 @@ def test_no_tables_matching_filter(spark, info_fetcher): def test_delta_housekeeeping_call(spark, info_fetcher): data_explorer = DataExplorer("*.default.*", spark, info_fetcher) - result: pandas.DataFrame = data_explorer.delta_housekeeping().stats() + result: pandas.DataFrame = 
data_explorer.delta_housekeeping()._stats print(result['tableName'].count()) assert result['tableName'].count() == 3 for res in result['tableName'].tolist(): From e8a1b66e77786345463e1463cdc25ed1c8ee11af Mon Sep 17 00:00:00 2001 From: lorenzorubi-db Date: Wed, 3 Jan 2024 17:01:55 +0000 Subject: [PATCH 10/25] debugging on cluster + adding spark session to `DeltaHousekeepingActions` --- discoverx/delta_housekeeping.py | 15 +++++++++------ discoverx/explorer.py | 2 +- examples/exec_delta_housekeeping.py | 12 +++--------- 3 files changed, 13 insertions(+), 16 deletions(-) diff --git a/discoverx/delta_housekeeping.py b/discoverx/delta_housekeeping.py index ceda5aa..f13cc29 100644 --- a/discoverx/delta_housekeeping.py +++ b/discoverx/delta_housekeeping.py @@ -214,6 +214,7 @@ class DeltaHousekeepingActions: def __init__( self, mapped_pd_dfs: Iterable[pd.DataFrame], + spark: SparkSession = None, min_table_size_optimize: int = 128*1024*1024, # i.e. 128 MB min_days_not_optimized: int = 7, min_days_not_vacuumed: int = 31, @@ -227,6 +228,11 @@ def __init__( self._mapped_pd_dfs = mapped_pd_dfs stats = pd.concat(self._mapped_pd_dfs) self._stats: pd.DataFrame = stats + + if spark is None: + spark = SparkSession.builder.getOrCreate() + self._spark = spark + self.min_table_size_optimize = min_table_size_optimize self.min_days_not_optimized = min_days_not_optimized self.min_days_not_vacuumed = min_days_not_vacuumed @@ -331,9 +337,7 @@ def _zorder_not_effective(self) -> pd.DataFrame: def stats(self) -> DataFrame: """Ouputs the stats per table""" - import pyspark.pandas as ps - - return ps.from_pandas(self._stats) + return self._spark.createDataFrame(self._stats) def display(self) -> None: """Executes the Delta housekeeping analysis and displays a sample of results""" @@ -341,12 +345,10 @@ def display(self) -> None: def apply(self) -> DataFrame: """Displays recommendations in a DataFrame format""" - import pyspark.pandas as ps - out = None for recomm in self.generate_recommendations(): for legend, df in recomm.items(): - out_df = ps.from_pandas(df).withColumn("recommendation", F.lit(legend)) + out_df = self._spark.createDataFrame(df).withColumn("recommendation", F.lit(legend)) if out is None: out = out_df else: @@ -389,6 +391,7 @@ def generate_recommendations(self) -> Iterable[dict]: def explain(self) -> None: # TODO better formatting! 
from bs4 import BeautifulSoup + from databricks.sdk.runtime import displayHTML soup = BeautifulSoup(features='xml') diff --git a/discoverx/explorer.py b/discoverx/explorer.py index 378d86f..ed9ffc0 100644 --- a/discoverx/explorer.py +++ b/discoverx/explorer.py @@ -186,7 +186,7 @@ def delta_housekeeping(self) -> pd.DataFrame: """ dh = DeltaHousekeeping(self._spark) dfs_pd: Iterable[pd.DataFrame] = self.map(dh.scan) - return DeltaHousekeepingActions(dfs_pd) + return DeltaHousekeepingActions(dfs_pd, spark=self._spark) class DataExplorerActions: diff --git a/examples/exec_delta_housekeeping.py b/examples/exec_delta_housekeeping.py index a39ba06..504f0cb 100644 --- a/examples/exec_delta_housekeeping.py +++ b/examples/exec_delta_housekeeping.py @@ -28,19 +28,13 @@ # COMMAND ---------- -# DBTITLE 1,Display the stats per table -stats = output.stats() -stats.display() - -# COMMAND ---------- - -# DBTITLE 1,apply() operation generates a list of dictionaries (if you need to postprocess the output) +# DBTITLE 1,apply() operation generates a spark dataframe with recommendations result = output.apply() -result.display() +result.select("catalog", "database", "tableName", "recommendation").display() # COMMAND ---------- -# DBTITLE 1,display() runs apply and displays the result +# DBTITLE 1,display() runs apply and displays the full result (including stats per table) output.display() # COMMAND ---------- From e177ef44e1e1930a0d6690b782848b2feea20ede Mon Sep 17 00:00:00 2001 From: Lorenzo Rubio Date: Wed, 3 Jan 2024 19:40:42 +0100 Subject: [PATCH 11/25] simplify scan implementation & remove dependency to BeautifulSoup --- discoverx/delta_housekeeping.py | 130 +++++++++++++------------------- 1 file changed, 52 insertions(+), 78 deletions(-) diff --git a/discoverx/delta_housekeeping.py b/discoverx/delta_housekeeping.py index f13cc29..02d5b23 100644 --- a/discoverx/delta_housekeeping.py +++ b/discoverx/delta_housekeeping.py @@ -119,46 +119,29 @@ def save_as_table( .saveAsTable(housekeeping_table_name) ) - def scan( - self, - table_info_list: Iterable[TableInfo], - ) -> pd.DataFrame: - """ - Scans a table_info / table_info_list to fetch Delta stats - - DESCRIBE DETAIL - - DESCRIBE HISTORY - """ - dd_list = [] - statements = [] - errors = [] - - if not isinstance(table_info_list, Iterable): - table_info_list = [table_info_list] - - for table_info in table_info_list: - try: - # runs a describe detail per table, figures out if exception - dd = self._spark.sql(f""" - DESCRIBE DETAIL {table_info.catalog}.{table_info.schema}.{table_info.table}; - """) - dd = ( - dd - .withColumn("split", F.split(F.col('name'), '\.')) - .withColumn("catalog", F.col("split").getItem(0)) - .withColumn("database", F.col("split").getItem(1)) - .withColumn("tableName", F.col("split").getItem(2)) - .select([ - F.col("catalog"), - F.col("database"), - F.col("tableName"), - F.col("numFiles").alias("number_of_files"), - F.col("sizeInBytes").alias("bytes"), - ]) - ) - dd_list.append(dd) - - # prepares a DESCRIBE HISTORY statement per table (will be run outside of the loop) - statements.append(f""" + def get_describe_detail(self, table_info: TableInfo): + dd = self._spark.sql(f""" + DESCRIBE DETAIL {table_info.catalog}.{table_info.schema}.{table_info.table}; + """) + dd = ( + dd + .withColumn("split", F.split(F.col('name'), '\.')) + .withColumn("catalog", F.col("split").getItem(0)) + .withColumn("database", F.col("split").getItem(1)) + .withColumn("tableName", F.col("split").getItem(2)) + .select([ + F.col("catalog"), + F.col("database"), + 
F.col("tableName"), + F.col("numFiles").alias("number_of_files"), + F.col("sizeInBytes").alias("bytes"), + ]) + ) + return dd + + @staticmethod + def get_describe_history_statement(table_info: TableInfo): + return f""" SELECT '{table_info.catalog}' AS catalog, '{table_info.schema}' AS database, @@ -171,36 +154,35 @@ def scan( operationParameters.zOrderBy AS z_order_by FROM (DESCRIBE HISTORY {table_info.catalog}.{table_info.schema}.{table_info.table}) WHERE operation in ('OPTIMIZE', 'VACUUM END') - """) - except Exception as e: - errors.append(self._spark.createDataFrame( - [(table_info.catalog or "", table_info.schema, table_info.table, str(e))], - ["catalog", "database", "tableName", "error"] - )) - - # statement to UNION all DESCRIBE HISTORY together - statement = " UNION ".join(statements) - - dh = self._spark.createDataFrame([], self.empty_schema) - if statements: - dh = self._process_describe_history( - reduce( - lambda left, right: left.union(right), - dd_list - ), - self._spark.sql(statement), - ) + """ - errors_df = self._spark.createDataFrame([], self.empty_schema) - if errors: - errors_df = reduce( - lambda left, right: left.union(right), - errors - ) + def scan( + self, + table_info: TableInfo, + ) -> pd.DataFrame: + """ + Scans a table_info to fetch Delta stats + - DESCRIBE DETAIL + - DESCRIBE HISTORY + """ + try: + # runs a describe detail per table, figures out if exception + dd = self.get_describe_detail(table_info) - out = dh.unionByName(errors_df, allowMissingColumns=True) + # prepares a DESCRIBE HISTORY statement per table (will be run outside the try-catch) + statement = self.get_describe_history_statement(table_info) - return out.toPandas() + return self._process_describe_history( + dd, + self._spark.sql(statement), + ).toPandas() + + except Exception as e: + errors_df = self._spark.createDataFrame( + [(table_info.catalog or "", table_info.schema, table_info.table, str(e))], + ["catalog", "database", "tableName", "error"] + ) + return errors_df.toPandas() class DeltaHousekeepingActions: @@ -390,18 +372,10 @@ def generate_recommendations(self) -> Iterable[dict]: def explain(self) -> None: # TODO better formatting! 
- from bs4 import BeautifulSoup - from databricks.sdk.runtime import displayHTML + from databricks.sdk.runtime import display - soup = BeautifulSoup(features='xml') - body = soup.new_tag('body') - soup.insert(0, body) for recomm in self.generate_recommendations(): for legend, df in recomm.items(): - title_s = soup.new_tag('title') - title_s.string = legend - body.insert(0, df.to_html()) - body.insert(0, title_s) - - displayHTML(soup) + display(legend) + display(df) From 023b02f87f3e5a5e15aa1cfacb55e84878614bdc Mon Sep 17 00:00:00 2001 From: Lorenzo Rubio Date: Fri, 5 Jan 2024 18:31:18 +0100 Subject: [PATCH 12/25] faster implementation + unit tests --- discoverx/delta_housekeeping.py | 65 ++++----- .../delta_housekeeping/dd_click_sales.csv | 2 + .../dd_housekeeping_summary.csv | 2 + .../delta_housekeeping/dh_click_sales.csv | 4 + .../dh_housekeeping_summary.csv | 25 ++++ .../expected_pdh_click_sales.csv | 2 + .../expected_pdh_housekeeping_summary.csv | 2 + tests/unit/delta_housekeeping_test.py | 138 ++++++++++++++++++ 8 files changed, 206 insertions(+), 34 deletions(-) create mode 100644 tests/unit/data/delta_housekeeping/dd_click_sales.csv create mode 100644 tests/unit/data/delta_housekeeping/dd_housekeeping_summary.csv create mode 100644 tests/unit/data/delta_housekeeping/dh_click_sales.csv create mode 100644 tests/unit/data/delta_housekeeping/dh_housekeeping_summary.csv create mode 100644 tests/unit/data/delta_housekeeping/expected_pdh_click_sales.csv create mode 100644 tests/unit/data/delta_housekeeping/expected_pdh_housekeeping_summary.csv create mode 100644 tests/unit/delta_housekeeping_test.py diff --git a/discoverx/delta_housekeeping.py b/discoverx/delta_housekeeping.py index 02d5b23..03de50c 100644 --- a/discoverx/delta_housekeeping.py +++ b/discoverx/delta_housekeeping.py @@ -24,7 +24,7 @@ def __init__(self, spark: SparkSession) -> None: @staticmethod def _process_describe_history( describe_detail_df: DataFrame, describe_history_df: DataFrame - ) -> DataFrame: + ) -> pd.DataFrame: """ processes the DESCRIBE HISTORY result of potentially several tables in different schemas/catalogs Provides @@ -33,10 +33,13 @@ def _process_describe_history( - stats of OPTIMIZE (including ZORDER) - timestamp for last & second last VACUUM + returns a pandas DataFrame, and converts Spark internal dfs to pandas as soon as they are manageable + the reason being that DESCRIBE HISTORY / DESCRIBE DETAIL cannot be cached + TODO reconsider if it is better outside of the class """ if not "operation" in describe_history_df.columns: - return describe_detail_df + return describe_detail_df.toPandas() # window over operation operation_order = ( @@ -46,52 +49,46 @@ def _process_describe_history( Window.partitionBy(["catalog", "database", "tableName", "operation"]).orderBy(F.col("timestamp").desc()) )) ) + + if operation_order.isEmpty(): + return describe_detail_df.toPandas() + + operation_order = operation_order.toPandas() + # max & 2nd timestamp of OPTIMIZE into output - out = describe_detail_df.join( - operation_order - .filter((F.col("operation") == "OPTIMIZE") & (F.col("operation_order") == 1)) - .select("catalog", "database", "tableName", "timestamp") - .withColumnRenamed("timestamp", "max_optimize_timestamp"), + out = describe_detail_df.toPandas().merge( + operation_order[(operation_order.operation == "OPTIMIZE") & (operation_order.operation_order == 1)] + .loc[:, ["catalog", "database", "tableName", "timestamp"]] + .rename(columns={'timestamp': 'max_optimize_timestamp'}), how="outer", on=["catalog", 
"database", "tableName"] ) - out = out.join( - operation_order - .filter((F.col("operation") == "OPTIMIZE") & (F.col("operation_order") == 2)) - .select("catalog", "database", "tableName", "timestamp") - .withColumnRenamed("timestamp", "2nd_optimize_timestamp"), + out = out.merge( + operation_order[(operation_order.operation == "OPTIMIZE") & (operation_order.operation_order == 2)] + .loc[:, ["catalog", "database", "tableName", "timestamp"]] + .rename(columns={'timestamp': '2nd_optimize_timestamp'}), how="outer", on=["catalog", "database", "tableName"] ) # max timestamp of VACUUM into output - out = out.join( - operation_order - .filter((F.col("operation") == "VACUUM END") & (F.col("operation_order") == 1)) - .select("catalog", "database", "tableName", "timestamp") - .withColumnRenamed("timestamp", "max_vacuum_timestamp"), + out = out.merge( + operation_order[(operation_order.operation == "VACUUM END") & (operation_order.operation_order == 1)] + .loc[:, ["catalog", "database", "tableName", "timestamp"]] + .rename(columns={'timestamp': 'max_vacuum_timestamp'}), how="outer", on=["catalog", "database", "tableName"] ) - out = out.join( - operation_order - .filter((F.col("operation") == "VACUUM END") & (F.col("operation_order") == 2)) - .select("catalog", "database", "tableName", "timestamp") - .withColumnRenamed("timestamp", "2nd_vacuum_timestamp"), + out = out.merge( + operation_order[(operation_order.operation == "VACUUM END") & (operation_order.operation_order == 2)] + .loc[:, ["catalog", "database", "tableName", "timestamp"]] + .rename(columns={'timestamp': '2nd_vacuum_timestamp'}), how="outer", on=["catalog", "database", "tableName"] ) # summary of table metrics table_metrics_1 = ( - operation_order.filter((F.col("operation") == "OPTIMIZE") & (F.col("operation_order") == 1)) - .select([ - F.col("catalog"), - F.col("database"), - F.col("tableName"), - F.col("min_file_size"), - F.col("p50_file_size"), - F.col("max_file_size"), - F.col("z_order_by"), - ]) + operation_order[(operation_order['operation'] == 'OPTIMIZE') & (operation_order['operation_order'] == 1)] + .loc[:, ['catalog', 'database', 'tableName', 'min_file_size', 'p50_file_size', 'max_file_size', 'z_order_by']] ) # write to output - out = out.join( + out = out.merge( table_metrics_1, how="outer", on=["catalog", "database", "tableName"] ) @@ -175,7 +172,7 @@ def scan( return self._process_describe_history( dd, self._spark.sql(statement), - ).toPandas() + ) except Exception as e: errors_df = self._spark.createDataFrame( diff --git a/tests/unit/data/delta_housekeeping/dd_click_sales.csv b/tests/unit/data/delta_housekeeping/dd_click_sales.csv new file mode 100644 index 0000000..4ffb5a3 --- /dev/null +++ b/tests/unit/data/delta_housekeeping/dd_click_sales.csv @@ -0,0 +1,2 @@ +catalog,database,tableName,number_of_files,bytes +lorenzorubi,default,click_sales,6,326068799 diff --git a/tests/unit/data/delta_housekeeping/dd_housekeeping_summary.csv b/tests/unit/data/delta_housekeeping/dd_housekeeping_summary.csv new file mode 100644 index 0000000..70fc5a1 --- /dev/null +++ b/tests/unit/data/delta_housekeeping/dd_housekeeping_summary.csv @@ -0,0 +1,2 @@ +catalog,database,tableName,number_of_files,bytes +lorenzorubi,default,housekeeping_summary,1,192917 diff --git a/tests/unit/data/delta_housekeeping/dh_click_sales.csv b/tests/unit/data/delta_housekeeping/dh_click_sales.csv new file mode 100644 index 0000000..f35d6b6 --- /dev/null +++ b/tests/unit/data/delta_housekeeping/dh_click_sales.csv @@ -0,0 +1,4 @@ 
+catalog,database,tableName,operation,timestamp,min_file_size,p50_file_size,max_file_size,z_order_by +lorenzorubi,default,click_sales,VACUUM END,2023-12-06T16:40:28Z,null,null,null,null +lorenzorubi,default,click_sales,VACUUM END,2023-12-05T01:19:47Z,null,null,null,null +lorenzorubi,default,click_sales,VACUUM END,2023-11-25T04:03:41Z,null,null,null,null diff --git a/tests/unit/data/delta_housekeeping/dh_housekeeping_summary.csv b/tests/unit/data/delta_housekeeping/dh_housekeeping_summary.csv new file mode 100644 index 0000000..d1ee36e --- /dev/null +++ b/tests/unit/data/delta_housekeeping/dh_housekeeping_summary.csv @@ -0,0 +1,25 @@ +catalog,database,tableName,operation,timestamp,min_file_size,p50_file_size,max_file_size,z_order_by +lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-05T05:50:14Z,192917,192917,192917,[] +lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-05T05:21:22Z,184203,184203,184203,[] +lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-05T04:37:19Z,176955,176955,176955,[] +lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-05T04:10:26Z,168560,168560,168560,[] +lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-05T03:11:02Z,161710,161710,161710,[] +lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-05T02:44:41Z,154166,154166,154166,[] +lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-05T02:18:54Z,145990,145990,145990,[] +lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-05T01:42:12Z,137677,137677,137677,[] +lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-05T01:09:19Z,130864,130864,130864,[] +lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-05T00:53:33Z,123702,123702,123702,[] +lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-05T00:43:44Z,118806,118806,118806,[] +lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-05T00:28:00Z,111983,111983,111983,[] +lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-05T00:14:21Z,104790,104790,104790,[] +lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-04T23:47:02Z,97314,97314,97314,[] +lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-04T23:18:17Z,91509,91509,91509,[] +lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-04T22:14:48Z,84152,84152,84152,[] +lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-04T21:57:53Z,76464,76464,76464,[] +lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-04T21:30:49Z,67498,67498,67498,[] +lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-04T21:18:59Z,59412,59412,59412,[] +lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-04T20:30:48Z,51173,51173,51173,[] +lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-04T20:12:59Z,42346,42346,42346,[] +lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-04T19:35:05Z,34463,34463,34463,[] +lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-04T19:30:46Z,28604,28604,28604,[] +lorenzorubi,default,housekeeping_summary,OPTIMIZE,2023-12-04T19:06:51Z,8412,17592,17592,[] diff --git a/tests/unit/data/delta_housekeeping/expected_pdh_click_sales.csv b/tests/unit/data/delta_housekeeping/expected_pdh_click_sales.csv new file mode 100644 index 0000000..569c1e8 --- /dev/null +++ b/tests/unit/data/delta_housekeeping/expected_pdh_click_sales.csv @@ -0,0 +1,2 @@ +catalog,database,tableName,number_of_files,bytes,max_optimize_timestamp,2nd_optimize_timestamp,max_vacuum_timestamp,2nd_vacuum_timestamp,min_file_size,p50_file_size,max_file_size,z_order_by 
+lorenzorubi,default,click_sales,6,326068799,,,2023-12-06T16:40:28Z,2023-12-05T01:19:47Z,,,, diff --git a/tests/unit/data/delta_housekeeping/expected_pdh_housekeeping_summary.csv b/tests/unit/data/delta_housekeeping/expected_pdh_housekeeping_summary.csv new file mode 100644 index 0000000..af564ba --- /dev/null +++ b/tests/unit/data/delta_housekeeping/expected_pdh_housekeeping_summary.csv @@ -0,0 +1,2 @@ +catalog,database,tableName,number_of_files,bytes,max_optimize_timestamp,2nd_optimize_timestamp,max_vacuum_timestamp,2nd_vacuum_timestamp,min_file_size,p50_file_size,max_file_size,z_order_by +lorenzorubi,default,housekeeping_summary,1,192917,2023-12-05T05:50:14Z,2023-12-05T05:21:22Z,,,192917,192917,192917,[] diff --git a/tests/unit/delta_housekeeping_test.py b/tests/unit/delta_housekeeping_test.py new file mode 100644 index 0000000..e53d0bf --- /dev/null +++ b/tests/unit/delta_housekeeping_test.py @@ -0,0 +1,138 @@ +import pytest +import pandas as pd +from discoverx.delta_housekeeping import DeltaHousekeeping +from pathlib import Path +import pyspark.sql.functions as F + + +def _resolve_file_path(request, relative_path): + module_path = Path(request.module.__file__) + test_file_path = module_path.parent / relative_path + return pd.read_csv( + str(test_file_path.resolve()), + dtype={ + "max_optimize_timestamp": "str", + "2nd_optimize_timestamp": "str", + "max_vacuum_timestamp": "str", + "2nd_vacuum_timestamp": "str", + } + ) + + +@pytest.fixture() +def dh_click_sales(request): + return _resolve_file_path(request, "data/delta_housekeeping/dh_click_sales.csv") + + +@pytest.fixture() +def dd_click_sales(request): + return _resolve_file_path(request, "data/delta_housekeeping/dd_click_sales.csv") + + +@pytest.fixture() +def expected_pdh_click_sales(request): + return _resolve_file_path(request, "data/delta_housekeeping/expected_pdh_click_sales.csv") + + +@pytest.fixture() +def dh_housekeeping_summary(request): + return _resolve_file_path(request, "data/delta_housekeeping/dh_housekeeping_summary.csv") + + +@pytest.fixture() +def dd_housekeeping_summary(request): + return _resolve_file_path(request, "data/delta_housekeeping/dd_housekeeping_summary.csv") + + +@pytest.fixture() +def expected_pdh_housekeeping_summary(request): + return _resolve_file_path(request, "data/delta_housekeeping/expected_pdh_housekeeping_summary.csv") + + +@pytest.mark.skip() +def test_process_describe_history_template(): + from pyspark.sql import SparkSession + spark = SparkSession.builder.getOrCreate() + + dh = DeltaHousekeeping(spark) + dd_click_sales = pd.read_csv( + "/Users/lorenzo.rubio/Documents/GitHub/discoverx_lorenzorubi-db/tests/unit/data/delta_housekeeping/dd_click_sales.csv") + dh_click_sales = pd.read_csv( + "/Users/lorenzo.rubio/Documents/GitHub/discoverx_lorenzorubi-db/tests/unit/data/delta_housekeeping/dh_click_sales.csv") + expected_pdh_click_sales = pd.read_csv( + "/Users/lorenzo.rubio/Documents/GitHub/discoverx_lorenzorubi-db/tests/unit/data/delta_housekeeping/expected_pdh_click_sales.csv" + ) + + describe_detail_df = spark.createDataFrame(dd_click_sales) + # describe_detail_df = ( + # describe_detail_df + # .withColumn("split", F.split(F.col('name'), '\.')) + # .withColumn("catalog", F.col("split").getItem(0)) + # .withColumn("database", F.col("split").getItem(1)) + # .withColumn("tableName", F.col("split").getItem(2)) + # .select([ + # F.col("catalog"), + # F.col("database"), + # F.col("tableName"), + # F.col("numFiles").alias("number_of_files"), + # F.col("sizeInBytes").alias("bytes"), + # ]) + # ) + # 
describe_detail_df.toPandas().to_csv("/Users/lorenzo.rubio/Documents/GitHub/discoverx_lorenzorubi-db/tests/unit/data/delta_housekeeping/dd_housekeeping_summary.csv", index=False) + + describe_history_df = spark.createDataFrame(dh_click_sales) + describe_history_df = describe_history_df.withColumn("operation", F.lit("NOOP")) + + out = dh._process_describe_history(describe_detail_df, describe_history_df) + + out.toPandas().to_csv("/Users/lorenzo.rubio/Documents/GitHub/discoverx_lorenzorubi-db/tests/unit/data/delta_housekeeping/expected_pdh_housekeeping_summary.csv", index=False) + assert out + + +def test_process_describe_history_no_optimize(spark, dh_click_sales, dd_click_sales, expected_pdh_click_sales): + dh = DeltaHousekeeping(spark) + describe_detail_df = spark.createDataFrame(dd_click_sales) + describe_history_df = spark.createDataFrame(dh_click_sales) + out = dh._process_describe_history(describe_detail_df, describe_history_df) + pd.testing.assert_frame_equal( + out.reset_index(), + expected_pdh_click_sales.reset_index(), + ) + + +def test_process_describe_history_no_vacuum( + spark, dh_housekeeping_summary, dd_housekeeping_summary, expected_pdh_housekeeping_summary +): + dh = DeltaHousekeeping(spark) + describe_detail_df = spark.createDataFrame(dd_housekeeping_summary) + describe_history_df = spark.createDataFrame(dh_housekeeping_summary) + out = dh._process_describe_history(describe_detail_df, describe_history_df) + pd.testing.assert_frame_equal( + out.reset_index(), + expected_pdh_housekeeping_summary.reset_index(), + ) + + +def test_process_describe_history_no_operation(spark, dd_click_sales): + dh = DeltaHousekeeping(spark) + describe_detail_df = spark.createDataFrame(dd_click_sales) + describe_history_df = spark.createDataFrame([], "string") + out = dh._process_describe_history(describe_detail_df, describe_history_df) + # output should be equal to DESCRIBE DETAIL + pd.testing.assert_frame_equal( + out.reset_index(), + dd_click_sales.reset_index(), + ) + + +def test_process_describe_history_empty_history(spark, dd_click_sales, dh_click_sales): + dh = DeltaHousekeeping(spark) + describe_detail_df = spark.createDataFrame(dd_click_sales) + describe_history_df = spark.createDataFrame(dh_click_sales) + describe_history_df = describe_history_df.withColumn("operation", F.lit("NOOP")) + out = dh._process_describe_history(describe_detail_df, describe_history_df) + # output should be equal to DESCRIBE DETAIL + pd.testing.assert_frame_equal( + out.reset_index(), + dd_click_sales.reset_index(), + ) \ No newline at end of file From c2b028f26de52f02aa44e420244785afd5b016dd Mon Sep 17 00:00:00 2001 From: Lorenzo Rubio Date: Fri, 5 Jan 2024 18:35:33 +0100 Subject: [PATCH 13/25] cleanup --- tests/unit/delta_housekeeping_test.py | 40 --------------------------- 1 file changed, 40 deletions(-) diff --git a/tests/unit/delta_housekeeping_test.py b/tests/unit/delta_housekeeping_test.py index e53d0bf..63aea52 100644 --- a/tests/unit/delta_housekeeping_test.py +++ b/tests/unit/delta_housekeeping_test.py @@ -49,46 +49,6 @@ def expected_pdh_housekeeping_summary(request): return _resolve_file_path(request, "data/delta_housekeeping/expected_pdh_housekeeping_summary.csv") -@pytest.mark.skip() -def test_process_describe_history_template(): - from pyspark.sql import SparkSession - spark = SparkSession.builder.getOrCreate() - - dh = DeltaHousekeeping(spark) - dd_click_sales = pd.read_csv( - "/Users/lorenzo.rubio/Documents/GitHub/discoverx_lorenzorubi-db/tests/unit/data/delta_housekeeping/dd_click_sales.csv") - 
dh_click_sales = pd.read_csv( - "/Users/lorenzo.rubio/Documents/GitHub/discoverx_lorenzorubi-db/tests/unit/data/delta_housekeeping/dh_click_sales.csv") - expected_pdh_click_sales = pd.read_csv( - "/Users/lorenzo.rubio/Documents/GitHub/discoverx_lorenzorubi-db/tests/unit/data/delta_housekeeping/expected_pdh_click_sales.csv" - ) - - describe_detail_df = spark.createDataFrame(dd_click_sales) - # describe_detail_df = ( - # describe_detail_df - # .withColumn("split", F.split(F.col('name'), '\.')) - # .withColumn("catalog", F.col("split").getItem(0)) - # .withColumn("database", F.col("split").getItem(1)) - # .withColumn("tableName", F.col("split").getItem(2)) - # .select([ - # F.col("catalog"), - # F.col("database"), - # F.col("tableName"), - # F.col("numFiles").alias("number_of_files"), - # F.col("sizeInBytes").alias("bytes"), - # ]) - # ) - # describe_detail_df.toPandas().to_csv("/Users/lorenzo.rubio/Documents/GitHub/discoverx_lorenzorubi-db/tests/unit/data/delta_housekeeping/dd_housekeeping_summary.csv", index=False) - - describe_history_df = spark.createDataFrame(dh_click_sales) - describe_history_df = describe_history_df.withColumn("operation", F.lit("NOOP")) - - out = dh._process_describe_history(describe_detail_df, describe_history_df) - - out.toPandas().to_csv("/Users/lorenzo.rubio/Documents/GitHub/discoverx_lorenzorubi-db/tests/unit/data/delta_housekeeping/expected_pdh_housekeeping_summary.csv", index=False) - assert out - - def test_process_describe_history_no_optimize(spark, dh_click_sales, dd_click_sales, expected_pdh_click_sales): dh = DeltaHousekeeping(spark) describe_detail_df = spark.createDataFrame(dd_click_sales) From 0e4c8e5c21526e230254d2d1ff022cffc0e7150a Mon Sep 17 00:00:00 2001 From: Lorenzo Rubio Date: Tue, 9 Jan 2024 21:10:34 +0100 Subject: [PATCH 14/25] cleanup and PR comments --- discoverx/delta_housekeeping.py | 5 ----- examples/exec_delta_housekeeping.py | 4 ++-- tests/unit/delta_housekeeping_actions_test.py | 1 - 3 files changed, 2 insertions(+), 8 deletions(-) diff --git a/discoverx/delta_housekeeping.py b/discoverx/delta_housekeeping.py index 03de50c..5d1f03a 100644 --- a/discoverx/delta_housekeeping.py +++ b/discoverx/delta_housekeeping.py @@ -1,5 +1,4 @@ from typing import Iterable -from functools import reduce from datetime import datetime, timezone import pandas as pd @@ -35,8 +34,6 @@ def _process_describe_history( returns a pandas DataFrame, and converts Spark internal dfs to pandas as soon as they are manageable the reason being that DESCRIBE HISTORY / DESCRIBE DETAIL cannot be cached - - TODO reconsider if it is better outside of the class """ if not "operation" in describe_history_df.columns: return describe_detail_df.toPandas() @@ -104,8 +101,6 @@ def save_as_table( Static method to store intermediate results of the scan operation into Delta Would make sense only if using map_chunked from the `DataExplorer` object (otherwise tables are writen one by one into Delta with overhead) - - TODO create function in `DataExplorer` that uses this for a chunked """ ( result diff --git a/examples/exec_delta_housekeeping.py b/examples/exec_delta_housekeeping.py index 504f0cb..03d1e77 100644 --- a/examples/exec_delta_housekeeping.py +++ b/examples/exec_delta_housekeeping.py @@ -20,9 +20,9 @@ # COMMAND ---------- -# DBTITLE 1,Run the discoverx DeltaHousekeeping operation -generates an output object you can apply operations to +# DBTITLE 1,Run the discoverx DeltaHousekeeping operation -generates an output object on which you can run operations output = ( - 
dx.from_tables("lorenzorubi.*.*") + dx.from_tables(f"{dbutils.widgets.get('catalog')}.*.*") .delta_housekeeping() ) diff --git a/tests/unit/delta_housekeeping_actions_test.py b/tests/unit/delta_housekeeping_actions_test.py index faed05e..f7ac0cc 100644 --- a/tests/unit/delta_housekeeping_actions_test.py +++ b/tests/unit/delta_housekeeping_actions_test.py @@ -34,7 +34,6 @@ def test_apply_output(housekeeping_stats, expected_need_optimize): need_optimize_df.reset_index().loc[:, ["catalog", "database", "tableName"]], expected_need_optimize.loc[:, ["catalog", "database", "tableName"]], ) - # TODO complete all the tests def test_empty_apply_output(housekeeping_stats): From 9a9fe6bb632dc3a8d2ff2f672b492641868aa4eb Mon Sep 17 00:00:00 2001 From: Lorenzo Rubio Date: Fri, 12 Jan 2024 13:17:18 +0100 Subject: [PATCH 15/25] proper use of dbwidgets --- examples/exec_delta_housekeeping.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/examples/exec_delta_housekeeping.py b/examples/exec_delta_housekeeping.py index 03d1e77..561f5cd 100644 --- a/examples/exec_delta_housekeeping.py +++ b/examples/exec_delta_housekeeping.py @@ -14,6 +14,24 @@ # COMMAND ---------- +# MAGIC %md +# MAGIC ### Declare Variables + +# COMMAND ---------- + +dbutils.widgets.text("catalogs", "*", "Catalogs") +dbutils.widgets.text("schemas", "*", "Schemas") +dbutils.widgets.text("tables", "*", "Tables") + +# COMMAND ---------- + +catalogs = dbutils.widgets.get("catalogs") +schemas = dbutils.widgets.get("schemas") +tables = dbutils.widgets.get("tables") +from_table_statement = ".".join([catalogs, schemas, tables]) + +# COMMAND ---------- + from discoverx import DX dx = DX() @@ -22,7 +40,7 @@ # DBTITLE 1,Run the discoverx DeltaHousekeeping operation -generates an output object on which you can run operations output = ( - dx.from_tables(f"{dbutils.widgets.get('catalog')}.*.*") + dx.from_tables(from_table_statement) .delta_housekeeping() ) From 6c5ecf2fe5dce07f4935a209b53afe9e9b74803c Mon Sep 17 00:00:00 2001 From: Lorenzo Rubio Date: Sun, 28 Jan 2024 13:35:22 +0100 Subject: [PATCH 16/25] refactoring apply to return a single dataframe --- discoverx/delta_housekeeping.py | 280 +++++++++++++----- .../expected_need_analysis.csv | 2 + .../expected_need_optimize.csv | 8 +- .../expected_need_vacuum.csv | 18 ++ .../expected_not_optimized_last_days.csv | 3 + .../expected_not_vacuumed_last_days.csv | 3 + tests/unit/delta_housekeeping_actions_test.py | 257 +++++++++++++++- 7 files changed, 477 insertions(+), 94 deletions(-) create mode 100644 tests/unit/data/delta_housekeeping/expected_need_analysis.csv create mode 100644 tests/unit/data/delta_housekeeping/expected_need_vacuum.csv create mode 100644 tests/unit/data/delta_housekeeping/expected_not_optimized_last_days.csv create mode 100644 tests/unit/data/delta_housekeeping/expected_not_vacuumed_last_days.csv diff --git a/discoverx/delta_housekeeping.py b/discoverx/delta_housekeeping.py index 5d1f03a..cff38a1 100644 --- a/discoverx/delta_housekeeping.py +++ b/discoverx/delta_housekeeping.py @@ -1,4 +1,4 @@ -from typing import Iterable +from typing import Iterable, Callable from datetime import datetime, timezone import pandas as pd @@ -190,10 +190,10 @@ def __init__( mapped_pd_dfs: Iterable[pd.DataFrame], spark: SparkSession = None, min_table_size_optimize: int = 128*1024*1024, # i.e. 
128 MB - min_days_not_optimized: int = 7, - min_days_not_vacuumed: int = 31, - max_optimize_freq: int = 2, - max_vacuum_freq: int = 2, + min_days_not_optimized: int = 7, # in days + min_days_not_vacuumed: int = 31, # in days + max_optimize_freq: int = 2, # in days - e.g. 2 means that a daily run would be flagged + max_vacuum_freq: int = 2, # in days - e.g. 2 means that a daily run would be flagged small_file_threshold: int = 32*1024*1024, # i.e. 32 MB min_number_of_files_for_zorder: int = 8, stats: pd.DataFrame = None, # for testability only @@ -214,56 +214,153 @@ def __init__( self.max_vacuum_freq = max_vacuum_freq self.small_file_threshold = small_file_threshold self.min_number_of_files_for_zorder = min_number_of_files_for_zorder - self.tables_not_optimized_legend = "Tables that are never OPTIMIZED and would benefit from it" - self.tables_not_vacuumed_legend = "Tables that are never VACUUM'ed" + self.tables_not_optimized_legend = "The table has not been OPTIMIZED and would benefit from it" + self.tables_not_vacuumed_legend = "The table has never been VACUUM'ed" self.tables_not_optimized_last_days = "Tables that are not OPTIMIZED often enough" self.tables_not_vacuumed_last_days = "Tables that are not VACUUM'ed often enough" self.tables_optimized_too_freq = "Tables that are OPTIMIZED too often" self.tables_vacuumed_too_freq = "Tables that are VACUUM'ed too often" self.tables_do_not_need_optimize = "Tables that are too small to be OPTIMIZED" - self.tables_to_analyze = "Tables that need more analysis (small_files)" + self.tables_to_analyze = "Tables that need more analysis -small_files" self.tables_zorder_not_effective = "Tables for which ZORDER is not being effective" - def _need_optimize(self) -> pd.DataFrame: + def _apply_changes_to_stats( + self, + condition: pd.Series, + boolean_column_name: str, + reason_column_name: str, + f_apply_legend: Callable, + **kwargs + ) -> pd.DataFrame: + compose_results = False + boolean_column_name_new = boolean_column_name + reason_column_name_new = reason_column_name + if boolean_column_name in self._stats.columns: + compose_results = True + boolean_column_name_new = boolean_column_name + "_new" + reason_column_name_new = reason_column_name + "_new" + stats = self._stats.copy() - stats = stats.loc[stats.max_optimize_timestamp.isnull() & stats.bytes.notnull()] - return ( - stats.loc[(stats.bytes.astype(int) > self.min_table_size_optimize)] + stats[boolean_column_name_new] = False + stats[reason_column_name_new] = None + stats_sub = stats.loc[condition] + stats_sub = f_apply_legend(stats_sub, boolean_column_name_new, reason_column_name_new, **kwargs) + self._stats = pd.merge( + self._stats, + stats_sub.loc[:, ["catalog", "database", "tableName", boolean_column_name_new, reason_column_name_new]], + on=["catalog", "database", "tableName"], + how="outer", + ) + self._stats = self._stats.fillna({boolean_column_name: False, reason_column_name: ""}) + if compose_results: + self._stats = self._stats.fillna({boolean_column_name_new: False, reason_column_name_new: ""}) + self._stats.loc[:, boolean_column_name] = \ + self._stats[boolean_column_name] | self._stats[boolean_column_name_new] + self._stats.loc[:, reason_column_name] = \ + self._stats[[reason_column_name, reason_column_name_new]].agg(' | '.join, axis=1) # TODO should figure out if either side is None + self._stats.drop([boolean_column_name_new, reason_column_name_new], axis=1, inplace=True) + + def _need_optimize(self) -> pd.DataFrame: + def check_min_table_size_apply_legend(stats_sub, 
boolean_column_name, reason_column_name): + condition2 = stats_sub.bytes.astype(int) > self.min_table_size_optimize + stats_sub.loc[condition2, boolean_column_name] = True + stats_sub.loc[condition2, reason_column_name] = self.tables_not_optimized_legend + return stats_sub + + self._apply_changes_to_stats( + condition=self._stats.max_optimize_timestamp.isnull() & self._stats.bytes.notnull(), + boolean_column_name="rec_optimize", + reason_column_name="rec_optimize_reason", + f_apply_legend=check_min_table_size_apply_legend, ) def _optimize_not_needed(self) -> pd.DataFrame: - stats = self._stats.copy() - stats = stats.loc[stats.max_optimize_timestamp.isnull() & stats.bytes.notnull()] - return ( - stats.loc[stats.max_optimize_timestamp.notnull() & (stats.bytes.astype(int) > self.min_table_size_optimize)] + def check_min_table_size_apply_legend(stats_sub, boolean_column_name, reason_column_name): + condition2 = stats_sub.max_optimize_timestamp.notnull() & (stats_sub.bytes.astype(int) > self.min_table_size_optimize) + stats_sub.loc[condition2, boolean_column_name] = True + stats_sub.loc[condition2, reason_column_name] = self.tables_do_not_need_optimize + return stats_sub + + self._apply_changes_to_stats( + condition=self._stats.max_optimize_timestamp.isnull() & self._stats.bytes.notnull(), + boolean_column_name="rec_optimize", + reason_column_name="rec_optimize_reason", + f_apply_legend=check_min_table_size_apply_legend, ) - def _not_optimized_last_days(self) -> pd.DataFrame: - stats = self._stats.copy() - stats['max_optimize_timestamp'] = pd.to_datetime(stats['max_optimize_timestamp'], utc=True) - stats['optimize_lag'] = ( - datetime.now(timezone.utc) - stats['max_optimize_timestamp'] + @staticmethod + def check_timestamps_apply_legend( + stats_sub, boolean_column_name, reason_column_name, **kwargs, + ): + stats_sub.loc[:, kwargs["timestamp_to_evaluate"]] = pd.to_datetime(stats_sub[kwargs["timestamp_to_evaluate"]], utc=True) + stats_sub.loc[:, 'lag'] = ( + datetime.now(timezone.utc) - stats_sub[kwargs["timestamp_to_evaluate"]] ).dt.days - return ( - stats[stats['optimize_lag'] < self.min_days_not_optimized] + condition2 = stats_sub['lag'] > kwargs["threshold"] + stats_sub.loc[condition2, boolean_column_name] = True + stats_sub.loc[condition2, reason_column_name] = kwargs["reason"] + return stats_sub + + def _not_optimized_last_days(self) -> pd.DataFrame: + self._apply_changes_to_stats( + condition=~self._stats.max_optimize_timestamp.isnull(), + boolean_column_name="rec_optimize", + reason_column_name="rec_optimize_reason", + f_apply_legend=self.check_timestamps_apply_legend, + timestamp_to_evaluate="max_optimize_timestamp", + threshold=self.min_days_not_optimized, + reason=self.tables_not_optimized_last_days, ) + @staticmethod + def check_timestamp_diff_apply_legend( + stats_sub, boolean_column_name, reason_column_name, **kwargs, + ): + stats_sub.loc[:, kwargs["timestamp1_to_evaluate"]] = pd.to_datetime(stats_sub[kwargs["timestamp1_to_evaluate"]], utc=True) + stats_sub.loc[:, kwargs["timestamp2_to_evaluate"]] = pd.to_datetime(stats_sub[kwargs["timestamp2_to_evaluate"]], utc=True) + stats_sub.loc[:, 'lag'] = ( + stats_sub[kwargs["timestamp1_to_evaluate"]] - stats_sub[kwargs["timestamp2_to_evaluate"]] + ).dt.days + condition2 = stats_sub['lag'] > kwargs["threshold"] + stats_sub.loc[condition2, boolean_column_name] = True + stats_sub.loc[condition2, reason_column_name] = kwargs["reason"] + return stats_sub + def _optimized_too_frequently(self) -> pd.DataFrame: - stats = self._stats.copy() - 
stats['max_optimize_timestamp'] = pd.to_datetime(stats['max_optimize_timestamp']) - stats['2nd_optimize_timestamp'] = pd.to_datetime(stats['2nd_optimize_timestamp']) - stats['optimize_lag'] = (stats['max_optimize_timestamp'] - stats['2nd_optimize_timestamp']).dt.days - return ( - stats[stats['optimize_lag'] < self.max_optimize_freq] + self._apply_changes_to_stats( + condition=~self._stats.max_optimize_timestamp.isnull() & ~self._stats["2nd_optimize_timestamp"].isnull(), + boolean_column_name="rec_optimize", + reason_column_name="rec_optimize_reason", + f_apply_legend=self.check_timestamp_diff_apply_legend, + timestamp1_to_evaluate="max_optimize_timestamp", + timestamp2_to_evaluate="2nd_optimize_timestamp", + threshold=self.max_optimize_freq, + reason=self.tables_optimized_too_freq, ) def _never_vacuumed(self) -> pd.DataFrame: - stats = self._stats - return ( - stats.loc[stats.max_vacuum_timestamp.isnull()] + def apply_legend(stats_sub, boolean_column_name, reason_column_name): + stats_sub.loc[:, boolean_column_name] = True + stats_sub.loc[:, reason_column_name] = self.tables_not_vacuumed_legend + return stats_sub + + self._apply_changes_to_stats( + condition=self._stats.max_vacuum_timestamp.isnull(), + boolean_column_name="rec_vacuum", + reason_column_name="rec_vacuum_reason", + f_apply_legend=apply_legend, ) def _not_vacuumed_last_days(self) -> pd.DataFrame: + self._apply_changes_to_stats( + condition=~self._stats.max_vacuum_timestamp.isnull(), + boolean_column_name="rec_vacuum", + reason_column_name="rec_vacuum_reason", + f_apply_legend=self.check_timestamps_apply_legend, + timestamp_to_evaluate="max_vacuum_timestamp", + threshold=self.min_days_not_vacuumed, + reason=self.tables_not_vacuumed_last_days, + ) stats = self._stats.copy() stats['max_vacuum_timestamp'] = pd.to_datetime(stats['max_vacuum_timestamp'], utc=True) stats['vacuum_lag'] = ( @@ -274,39 +371,49 @@ def _not_vacuumed_last_days(self) -> pd.DataFrame: ) def _vacuumed_too_frequently(self) -> pd.DataFrame: - stats = self._stats.copy() - stats['max_vacuum_timestamp'] = pd.to_datetime(stats['max_vacuum_timestamp']) - stats['2nd_vacuum_timestamp'] = pd.to_datetime(stats['2nd_vacuum_timestamp']) - stats['vacuum_lag'] = (stats['max_vacuum_timestamp'] - stats['2nd_vacuum_timestamp']).dt.days - return ( - stats[stats['vacuum_lag'] < self.max_vacuum_freq] + self._apply_changes_to_stats( + condition=~self._stats.max_vacuum_timestamp.isnull() & ~self._stats["2nd_vacuum_timestamp"].isnull(), + boolean_column_name="rec_vacuum", + reason_column_name="rec_vacuum_reason", + f_apply_legend=self.check_timestamp_diff_apply_legend, + timestamp1_to_evaluate="max_vacuum_timestamp", + timestamp2_to_evaluate="2nd_vacuum_timestamp", + threshold=self.max_vacuum_freq, + reason=self.tables_vacuumed_too_freq, ) def _analyze_these_tables(self) -> pd.DataFrame: - stats = self._stats.copy() - stats = stats.loc[stats['max_optimize_timestamp'].notnull() & - stats['p50_file_size'].notnull() & - (stats['number_of_files'] > 1)] - stats = stats.loc[(stats['p50_file_size'].astype(int) < self.small_file_threshold)] - return ( - stats.sort_values(by=['database', 'tableName', 'number_of_files'], ascending=[True, True, False]) + def check_analyze_tables_apply_legend(stats_sub, boolean_column_name, reason_column_name): + condition2 = stats_sub['p50_file_size'].astype(int) < self.small_file_threshold + stats_sub.loc[condition2, boolean_column_name] = True + stats_sub.loc[condition2, reason_column_name] = self.tables_to_analyze + return stats_sub + + 
self._apply_changes_to_stats( + condition=self._stats.max_optimize_timestamp.notnull() & self._stats.p50_file_size.notnull() & (self._stats.number_of_files > 1), + boolean_column_name="rec_misc", + reason_column_name="rec_misc_reason", + f_apply_legend=check_analyze_tables_apply_legend, ) def _zorder_not_effective(self) -> pd.DataFrame: - stats = self._stats.copy() - stats = stats.loc[stats['max_optimize_timestamp'].notnull() & - stats['p50_file_size'].notnull()] - - # clean up z_order_by column and split into array - stats['z_order_by_clean'] = stats['z_order_by'].apply( - lambda x: None if x == "[]" else x.replace('[', '').replace(']', '').replace('"', '')) - stats['z_order_by_array'] = stats['z_order_by_clean'].str.split(',') - - # filter rows with zorder columns and number_of_files is less than threshold - stats = stats[stats['z_order_by_array'].str.len() > 0] - stats = stats[stats['number_of_files'].astype(int) < self.min_number_of_files_for_zorder] - return ( - stats + def check_zorder_not_effective_apply_legend(stats_sub, boolean_column_name, reason_column_name): + stats_sub['z_order_by_clean'] = stats_sub['z_order_by'].apply( + lambda x: None if x == "[]" else x.replace('[', '').replace(']', '').replace('"', '')) + stats_sub['z_order_by_array'] = stats_sub['z_order_by_clean'].str.split(',') + + stats_sub = stats_sub.loc[stats_sub['z_order_by_array'].str.len() > 0, :] + stats_sub = stats_sub.loc[stats_sub['number_of_files'].astype(int) < self.min_number_of_files_for_zorder, :] + + stats_sub.loc[:, boolean_column_name] = True + stats_sub.loc[:, reason_column_name] = self.tables_zorder_not_effective + return stats_sub + + self._apply_changes_to_stats( + condition=self._stats.max_optimize_timestamp.notnull() & self._stats.p50_file_size.notnull(), + boolean_column_name="rec_misc", + reason_column_name="rec_misc_reason", + f_apply_legend=check_zorder_not_effective_apply_legend, ) def stats(self) -> DataFrame: @@ -336,38 +443,49 @@ def generate_recommendations(self) -> Iterable[dict]: - The key is the legend of the recommendation - The value is a pandas df with the affected tables """ + self._need_optimize() + self._never_vacuumed(), + self._not_optimized_last_days(), + self._not_vacuumed_last_days(), + self._optimized_too_frequently(), + self._vacuumed_too_frequently(), + self._optimize_not_needed(), + self._analyze_these_tables(), + self._zorder_not_effective(), + return self._stats.copy() + + def _explain(self) -> Iterable: + stats = self.generate_recommendations() + stats_optimize = stats.loc[stats["rec_optimize"], :] + stats_vacuum = stats.loc[stats["rec_vacuum"], :] + stats_misc = stats.loc[stats["rec_misc"], :] out = [] - for df, legend in zip([ - self._need_optimize(), - self._never_vacuumed(), - self._not_optimized_last_days(), - self._not_vacuumed_last_days(), - self._optimized_too_frequently(), - self._vacuumed_too_frequently(), - self._optimize_not_needed(), - self._analyze_these_tables(), - self._zorder_not_effective(), - ], [ + for legend_optimize in [ self.tables_not_optimized_legend, - self.tables_not_vacuumed_legend, self.tables_not_optimized_last_days, - self.tables_not_vacuumed_last_days, self.tables_optimized_too_freq, + ]: + out.append({legend_optimize: stats_optimize.loc[stats_optimize["rec_optimize_reason"].str.contains(legend_optimize)]}) + + for legend_vacuum in [ + self.tables_not_vacuumed_legend, + self.tables_not_vacuumed_last_days, self.tables_vacuumed_too_freq, - self.tables_do_not_need_optimize, + ]: + out.append({legend_vacuum: 
stats_vacuum.loc[stats_vacuum["rec_vacuum_reason"].str.contains(legend_vacuum)]}) + + for legend_misc in [ self.tables_to_analyze, self.tables_zorder_not_effective, - ]): - if not df.empty: - out.append({legend: df}) + ]: + out.append({legend_misc: stats_misc.loc[stats_misc["rec_misc_reason"].str.contains(legend_misc)]}) + return out def explain(self) -> None: # TODO better formatting! from databricks.sdk.runtime import display - - for recomm in self.generate_recommendations(): - for legend, df in recomm.items(): - display(legend) - display(df) + for legend, df in self._explain().items(): + display(legend) + display(df) diff --git a/tests/unit/data/delta_housekeeping/expected_need_analysis.csv b/tests/unit/data/delta_housekeeping/expected_need_analysis.csv new file mode 100644 index 0000000..2defd2a --- /dev/null +++ b/tests/unit/data/delta_housekeeping/expected_need_analysis.csv @@ -0,0 +1,2 @@ +catalog,database,tableName,number_of_files,bytes,max_optimize_timestamp,2nd_optimize_timestamp,max_vacuum_timestamp,2nd_vacuum_timestamp,min_file_size,p50_file_size,max_file_size,z_order_by,error,rec_optimize,rec_optimize_reason,rec_vacuum,rec_vacuum_reason,rec_misc,rec_misc_reason +lorenzorubi,default,housekeeping_summary_v2,3.0,12326.0,2023-12-18T11:25:35Z,,,,5273.0,5273.0,5273.0,[],,True, | Tables that are not OPTIMIZED often enough | | ,True,The table has never been VACUUM'ed | | ,True,Tables that need more analysis -small_files diff --git a/tests/unit/data/delta_housekeeping/expected_need_optimize.csv b/tests/unit/data/delta_housekeeping/expected_need_optimize.csv index 237a44c..9766cf0 100644 --- a/tests/unit/data/delta_housekeeping/expected_need_optimize.csv +++ b/tests/unit/data/delta_housekeeping/expected_need_optimize.csv @@ -1,4 +1,4 @@ -catalog,database,tableName,number_of_files,bytes,max_optimize_timestamp,2nd_optimize_timestamp,max_vacuum_timestamp,2nd_vacuum_timestamp,min_file_size,p50_file_size,max_file_size,z_order_by,error -lorenzorubi,default,click_sales,6.0,326068799.0,,,2023-12-06T16:40:28Z,2023-12-05T01:19:47Z,,,,, -lorenzorubi,tpch,orders,2406.0,317120666.0,,,,,,,,, -lorenzorubi,default,complete_data,6.0,326060019.0,,,2023-12-06T16:40:36Z,2023-12-05T01:19:25Z,,,,, +catalog,database,tableName,number_of_files,bytes,max_optimize_timestamp,2nd_optimize_timestamp,max_vacuum_timestamp,2nd_vacuum_timestamp,min_file_size,p50_file_size,max_file_size,z_order_by,error,rec_optimize,rec_optimize_reason +lorenzorubi,default,click_sales,6.0,326068799.0,,,2023-12-06T16:40:28Z,2023-12-05T01:19:47Z,,,,,,True,The table has not been OPTIMIZED and would benefit from it +lorenzorubi,tpch,orders,2406.0,317120666.0,,,,,,,,,,True,The table has not been OPTIMIZED and would benefit from it +lorenzorubi,default,complete_data,6.0,326060019.0,,,2023-12-06T16:40:36Z,2023-12-05T01:19:25Z,,,,,,True,The table has not been OPTIMIZED and would benefit from it diff --git a/tests/unit/data/delta_housekeeping/expected_need_vacuum.csv b/tests/unit/data/delta_housekeeping/expected_need_vacuum.csv new file mode 100644 index 0000000..27c04ab --- /dev/null +++ b/tests/unit/data/delta_housekeeping/expected_need_vacuum.csv @@ -0,0 +1,18 @@ +catalog,database,tableName,number_of_files,bytes,max_optimize_timestamp,2nd_optimize_timestamp,max_vacuum_timestamp,2nd_vacuum_timestamp,min_file_size,p50_file_size,max_file_size,z_order_by,error,rec_optimize,rec_optimize_reason,rec_vacuum,rec_vacuum_reason +lorenzorubi,default,housekeeping_summary_v3,1.0,3787.0,,,,,,,,,,False,,True,The table has never been VACUUM'ed 
+lorenzorubi,maxmind_geo,gold_ipv6,1.0,4907069.0,,,,,,,,,,False,,True,The table has never been VACUUM'ed +lorenzorubi,default,housekeeping_summary,1.0,192917.0,2023-12-05T05:50:14Z,2023-12-05T05:21:22Z,,,192917.0,192917.0,192917.0,[],,False,,True,The table has never been VACUUM'ed +lorenzorubi,default,housekeeping_summary_v2,3.0,12326.0,2023-12-18T11:25:35Z,,,,5273.0,5273.0,5273.0,[],,False,,True,The table has never been VACUUM'ed +lorenzorubi,maxmind_geo,raw_locations,1.0,6933.0,,,,,,,,,,False,,True,The table has never been VACUUM'ed +lorenzorubi,tpch,customer,1.0,61897021.0,,,,,,,,,,False,,True,The table has never been VACUUM'ed +lorenzorubi,tpch,nation,1.0,3007.0,,,,,,,,,,False,,True,The table has never been VACUUM'ed +lorenzorubi,maxmind_geo,raw_ipv6,1.0,1783720.0,,,,,,,,,,False,,True,The table has never been VACUUM'ed +lorenzorubi,maxmind_geo,gold_ipv4,1.0,7220024.0,,,,,,,,,,False,,True,The table has never been VACUUM'ed +lorenzorubi,dais_dlt_2023,enriched_orders,,,,,,,,,,,[UNSUPPORTED_VIEW_OPERATION.WITHOUT_SUGGESTION] The view `lorenzorubi`.`dais_dlt_2023`.`enriched_orders` does not support DESCRIBE DETAIL. ; line 2 pos 20,False,,True,The table has never been VACUUM'ed +lorenzorubi,default,click_sales_history,1.0,7710.0,,,,,,,,,,False,,True,The table has never been VACUUM'ed +lorenzorubi,tpch,orders,2406.0,317120666.0,,,,,,,,,,True,The table has not been OPTIMIZED and would benefit from it,True,The table has never been VACUUM'ed +lorenzorubi,maxmind_geo,raw_ipv4,1.0,3115269.0,,,,,,,,,,False,,True,The table has never been VACUUM'ed +lorenzorubi,gcp_cost_analysis,sku_prices,1.0,835.0,,,,,,,,,,False,,True,The table has never been VACUUM'ed +lorenzorubi,dais_dlt_2023,daily_totalorders_by_nation,,,,,,,,,,,[UNSUPPORTED_VIEW_OPERATION.WITHOUT_SUGGESTION] The view `lorenzorubi`.`dais_dlt_2023`.`daily_totalorders_by_nation` does not support DESCRIBE DETAIL. ; line 2 pos 20,False,,True,The table has never been VACUUM'ed +lorenzorubi,gcp_cost_analysis,project_ids,2.0,1774.0,,,,,,,,,,False,,True,The table has never been VACUUM'ed +lorenzorubi,dais_dlt_2023,daily_2nd_high_orderprice,,,,,,,,,,,[UNSUPPORTED_VIEW_OPERATION.WITHOUT_SUGGESTION] The view `lorenzorubi`.`dais_dlt_2023`.`daily_2nd_high_orderprice` does not support DESCRIBE DETAIL. 
; line 2 pos 20,False,,True,The table has never been VACUUM'ed diff --git a/tests/unit/data/delta_housekeeping/expected_not_optimized_last_days.csv b/tests/unit/data/delta_housekeeping/expected_not_optimized_last_days.csv new file mode 100644 index 0000000..f6b4724 --- /dev/null +++ b/tests/unit/data/delta_housekeeping/expected_not_optimized_last_days.csv @@ -0,0 +1,3 @@ +catalog,database,tableName,number_of_files,bytes,max_optimize_timestamp,2nd_optimize_timestamp,max_vacuum_timestamp,2nd_vacuum_timestamp,min_file_size,p50_file_size,max_file_size,z_order_by,error,rec_optimize,rec_optimize_reason,rec_vacuum,rec_vacuum_reason +lorenzorubi,default,housekeeping_summary,1.0,192917.0,2023-12-05T05:50:14Z,2023-12-05T05:21:22Z,,,192917.0,192917.0,192917.0,[],,True, | Tables that are not OPTIMIZED often enough,True,The table has never been VACUUM'ed +lorenzorubi,default,housekeeping_summary_v2,3.0,12326.0,2023-12-18T11:25:35Z,,,,5273.0,5273.0,5273.0,[],,True, | Tables that are not OPTIMIZED often enough,True,The table has never been VACUUM'ed diff --git a/tests/unit/data/delta_housekeeping/expected_not_vacuumed_last_days.csv b/tests/unit/data/delta_housekeeping/expected_not_vacuumed_last_days.csv new file mode 100644 index 0000000..fa34f67 --- /dev/null +++ b/tests/unit/data/delta_housekeeping/expected_not_vacuumed_last_days.csv @@ -0,0 +1,3 @@ +catalog,database,tableName,number_of_files,bytes,max_optimize_timestamp,2nd_optimize_timestamp,max_vacuum_timestamp,2nd_vacuum_timestamp,min_file_size,p50_file_size,max_file_size,z_order_by,error,rec_optimize,rec_optimize_reason,rec_vacuum,rec_vacuum_reason +lorenzorubi,default,click_sales,6.0,326068799.0,,,2023-12-06T16:40:28Z,2023-12-05T01:19:47Z,,,,,,True,The table has not been OPTIMIZED and would benefit from it | ,True, | Tables that are not VACUUM'ed often enough +lorenzorubi,default,complete_data,6.0,326060019.0,,,2023-12-06T16:40:36Z,2023-12-05T01:19:25Z,,,,,,True,The table has not been OPTIMIZED and would benefit from it | ,True, | Tables that are not VACUUM'ed often enough diff --git a/tests/unit/delta_housekeeping_actions_test.py b/tests/unit/delta_housekeeping_actions_test.py index f7ac0cc..87b3ecf 100644 --- a/tests/unit/delta_housekeeping_actions_test.py +++ b/tests/unit/delta_housekeeping_actions_test.py @@ -20,29 +20,268 @@ def expected_need_optimize(request): return _resolve_file_path(request, "data/delta_housekeeping/expected_need_optimize.csv") -def test_apply_output(housekeeping_stats, expected_need_optimize): +@pytest.fixture() +def expected_need_vacuum(request): + return _resolve_file_path(request, "data/delta_housekeeping/expected_need_vacuum.csv") + + +@pytest.fixture() +def expected_not_optimized_last_days(request): + return _resolve_file_path(request, "data/delta_housekeeping/expected_not_optimized_last_days.csv") + + +@pytest.fixture() +def expected_not_vacuumed_last_days(request): + return _resolve_file_path(request, "data/delta_housekeeping/expected_not_vacuumed_last_days.csv") + + +@pytest.fixture() +def expected_need_analysis(request): + return _resolve_file_path(request, "data/delta_housekeeping/expected_need_analysis.csv") + + +def test_apply_need_optimize(housekeeping_stats, expected_need_optimize): dha = DeltaHousekeepingActions( None, stats=housekeeping_stats, ) res = dha.generate_recommendations() - assert len(res) == 6 - need_optimize = [item for item in res if (list(item.keys())[0] == dha.tables_not_optimized_legend)] - assert len(need_optimize) == 1 - need_optimize_df = list(need_optimize[0].values())[0] + 
need_optimize_df = res.loc[ + res["rec_optimize"] & res["rec_optimize_reason"].str.contains(dha.tables_not_optimized_legend), : + ] + assert need_optimize_df.shape[0] == 3 pd.testing.assert_frame_equal( need_optimize_df.reset_index().loc[:, ["catalog", "database", "tableName"]], expected_need_optimize.loc[:, ["catalog", "database", "tableName"]], ) -def test_empty_apply_output(housekeeping_stats): +def test_empty_apply_need_optimize(housekeeping_stats): dha = DeltaHousekeepingActions( None, stats=housekeeping_stats, min_table_size_optimize=1024*1024*1024*1024 ) res = dha.generate_recommendations() - assert len(res) == 5 - need_optimize = [item for item in res if list(item.keys())[0] == dha.tables_not_optimized_legend] - assert len(need_optimize) == 0 + need_optimize_df = res.loc[ + res["rec_optimize"] & res["rec_optimize_reason"].str.contains(dha.tables_not_optimized_legend), : + ] + assert need_optimize_df.shape[0] == 0 + + +def test_apply_need_vacuum(housekeeping_stats, expected_need_vacuum): + dha = DeltaHousekeepingActions( + None, + stats=housekeeping_stats, + ) + res = dha.generate_recommendations() + need_vacuum_df = res.loc[res["rec_vacuum"] & res["rec_vacuum_reason"].str.contains(dha.tables_not_vacuumed_legend), :] + assert need_vacuum_df.shape[0] == 17 + pd.testing.assert_frame_equal( + need_vacuum_df.reset_index().loc[:, ["catalog", "database", "tableName"]], + expected_need_vacuum.loc[:, ["catalog", "database", "tableName"]], + ) + + +def test_apply_not_optimized_last_days(housekeeping_stats, expected_not_optimized_last_days): + dha = DeltaHousekeepingActions( + None, + stats=housekeeping_stats, + ) + res = dha.generate_recommendations() + need_optimize_df = res.loc[ + res["rec_optimize"] & res["rec_optimize_reason"].str.contains(dha.tables_not_optimized_last_days), : + ] + assert need_optimize_df.shape[0] == 2 + pd.testing.assert_frame_equal( + need_optimize_df.reset_index().loc[:, ["catalog", "database", "tableName"]], + expected_not_optimized_last_days.loc[:, ["catalog", "database", "tableName"]], + ) + + +def test_empty_apply_not_optimized_last_days(housekeeping_stats): + dha = DeltaHousekeepingActions( + None, + stats=housekeeping_stats, + min_days_not_optimized=1e60 + ) + res = dha.generate_recommendations() + need_optimize_df = res.loc[ + res["rec_optimize"] & res["rec_optimize_reason"].str.contains(dha.tables_not_optimized_last_days), : + ] + assert need_optimize_df.shape[0] == 0 + + +def test_apply_not_vacuumed_last_days(housekeeping_stats, expected_not_vacuumed_last_days): + dha = DeltaHousekeepingActions( + None, + stats=housekeeping_stats, + ) + res = dha.generate_recommendations() + need_vacuum_df = res.loc[ + res["rec_vacuum"] & res["rec_vacuum_reason"].str.contains(dha.tables_not_vacuumed_last_days), : + ] + assert need_vacuum_df.shape[0] == 2 + pd.testing.assert_frame_equal( + need_vacuum_df.reset_index().loc[:, ["catalog", "database", "tableName"]], + expected_not_vacuumed_last_days.loc[:, ["catalog", "database", "tableName"]], + ) + + +def test_empty_apply_not_vacuumed_last_days(housekeeping_stats): + dha = DeltaHousekeepingActions( + None, + stats=housekeeping_stats, + min_days_not_vacuumed=1e60 + ) + res = dha.generate_recommendations() + need_vacuum_df = res.loc[ + res["rec_vacuum"] & res["rec_vacuum_reason"].str.contains(dha.tables_not_vacuumed_last_days), : + ] + assert need_vacuum_df.shape[0] == 0 + + +def test_apply_optimized_too_freq(housekeeping_stats): + # TODO add an example in the dataset? 
+ dha = DeltaHousekeepingActions( + None, + stats=housekeeping_stats, + ) + res = dha.generate_recommendations() + need_optimize_df = res.loc[ + res["rec_optimize"] & res["rec_optimize_reason"].str.contains(dha.tables_optimized_too_freq), : + ] + assert need_optimize_df.shape[0] == 0 + + +def test_empty_apply_optimized_too_freq(housekeeping_stats): + dha = DeltaHousekeepingActions( + None, + stats=housekeeping_stats, + max_optimize_freq=1e60 + ) + res = dha.generate_recommendations() + need_optimize_df = res.loc[ + res["rec_optimize"] & res["rec_optimize_reason"].str.contains(dha.tables_optimized_too_freq), : + ] + assert need_optimize_df.shape[0] == 0 + + +def test_apply_vacuumed_too_freq(housekeeping_stats): + # TODO add an example in the dataset? + dha = DeltaHousekeepingActions( + None, + stats=housekeeping_stats, + ) + res = dha.generate_recommendations() + need_vacuum_df = res.loc[ + res["rec_vacuum"] & res["rec_vacuum_reason"].str.contains(dha.tables_vacuumed_too_freq), : + ] + assert need_vacuum_df.shape[0] == 0 + + +def test_empty_apply_vacuumed_too_freq(housekeeping_stats): + dha = DeltaHousekeepingActions( + None, + stats=housekeeping_stats, + max_vacuum_freq=1e60 + ) + res = dha.generate_recommendations() + need_vacuum_df = res.loc[ + res["rec_vacuum"] & res["rec_vacuum_reason"].str.contains(dha.tables_vacuumed_too_freq), : + ] + assert need_vacuum_df.shape[0] == 0 + + +def test_apply_do_not_need_optimize(housekeeping_stats): + # TODO add an example in the dataset? + dha = DeltaHousekeepingActions( + None, + stats=housekeeping_stats, + ) + res = dha.generate_recommendations() + need_optimize_df = res.loc[ + res["rec_optimize"] & res["rec_optimize_reason"].str.contains(dha.tables_do_not_need_optimize), : + ] + assert need_optimize_df.shape[0] == 0 + + +def test_empty_apply_do_not_need_optimize(housekeeping_stats): + dha = DeltaHousekeepingActions( + None, + stats=housekeeping_stats, + min_table_size_optimize=1e60 + ) + res = dha.generate_recommendations() + need_optimize_df = res.loc[ + res["rec_optimize"] & res["rec_optimize_reason"].str.contains(dha.tables_do_not_need_optimize), : + ] + assert need_optimize_df.shape[0] == 0 + + +def test_apply_analyze_tables(housekeeping_stats, expected_need_analysis): + dha = DeltaHousekeepingActions( + None, + stats=housekeeping_stats, + ) + res = dha.generate_recommendations() + need_analysis_df = res.loc[ + res["rec_misc"] & res["rec_misc_reason"].str.contains(dha.tables_to_analyze), : + ] + assert need_analysis_df.shape[0] == 1 + # module_path = Path(request.module.__file__) + # test_file_path = module_path.parent / "data/delta_housekeeping/expected_need_analysis.csv" + # need_analysis_df.to_csv(str(test_file_path.resolve()), index=False) + pd.testing.assert_frame_equal( + need_analysis_df.reset_index().loc[:, ["catalog", "database", "tableName"]], + expected_need_analysis.loc[:, ["catalog", "database", "tableName"]], + ) + + +def test_empty_apply_analyze_tables(housekeeping_stats): + dha = DeltaHousekeepingActions( + None, + stats=housekeeping_stats, + small_file_threshold=0 + ) + res = dha.generate_recommendations() + need_analysis_df = res.loc[ + res["rec_misc"] & res["rec_misc_reason"].str.contains(dha.tables_to_analyze), : + ] + assert need_analysis_df.shape[0] == 0 + + +def test_apply_zorder_not_effective(request, housekeeping_stats, expected_need_analysis): + # TODO add an example in the dataset? 
+ dha = DeltaHousekeepingActions( + None, + stats=housekeeping_stats, + ) + res = dha.generate_recommendations() + need_analysis_df = res.loc[ + res["rec_misc"] & res["rec_misc_reason"].str.contains(dha.tables_zorder_not_effective), : + ] + assert need_analysis_df.shape[0] == 0 + + +def test_empty_apply_zorder_not_effective(housekeeping_stats): + # TODO add an example in the dataset? + dha = DeltaHousekeepingActions( + None, + stats=housekeeping_stats, + min_number_of_files_for_zorder=0 + ) + res = dha.generate_recommendations() + need_analysis_df = res.loc[ + res["rec_misc"] & res["rec_misc_reason"].str.contains(dha.tables_zorder_not_effective), : + ] + assert need_analysis_df.shape[0] == 0 + + +def test_explain(housekeeping_stats): + dha = DeltaHousekeepingActions( + None, + stats=housekeeping_stats, + ) + res = dha._explain() + assert len(res) == 8 From 5359876f3be9cc5e74712d814391037518e6e7f6 Mon Sep 17 00:00:00 2001 From: Lorenzo Rubio Date: Sun, 4 Feb 2024 21:33:23 +0100 Subject: [PATCH 17/25] add test datasets for all housekeeping checks + bug fixes --- discoverx/delta_housekeeping.py | 10 +- .../delta_housekeeping/dhk_pandas_result.csv | 2 +- .../expected_do_not_need_optimize.csv | 3 + .../expected_optimized_too_freq.csv | 2 + .../expected_vacuumed_too_freq.csv | 3 + .../expected_zorder_not_effective.csv | 2 + tests/unit/delta_housekeeping_actions_test.py | 105 ++++++++++++------ 7 files changed, 88 insertions(+), 39 deletions(-) create mode 100644 tests/unit/data/delta_housekeeping/expected_do_not_need_optimize.csv create mode 100644 tests/unit/data/delta_housekeeping/expected_optimized_too_freq.csv create mode 100644 tests/unit/data/delta_housekeeping/expected_vacuumed_too_freq.csv create mode 100644 tests/unit/data/delta_housekeeping/expected_zorder_not_effective.csv diff --git a/discoverx/delta_housekeeping.py b/discoverx/delta_housekeeping.py index cff38a1..95153e3 100644 --- a/discoverx/delta_housekeeping.py +++ b/discoverx/delta_housekeeping.py @@ -276,13 +276,13 @@ def check_min_table_size_apply_legend(stats_sub, boolean_column_name, reason_col def _optimize_not_needed(self) -> pd.DataFrame: def check_min_table_size_apply_legend(stats_sub, boolean_column_name, reason_column_name): - condition2 = stats_sub.max_optimize_timestamp.notnull() & (stats_sub.bytes.astype(int) > self.min_table_size_optimize) + condition2 = stats_sub.max_optimize_timestamp.notnull() & (stats_sub.bytes.astype(int) < self.min_table_size_optimize) stats_sub.loc[condition2, boolean_column_name] = True stats_sub.loc[condition2, reason_column_name] = self.tables_do_not_need_optimize return stats_sub self._apply_changes_to_stats( - condition=self._stats.max_optimize_timestamp.isnull() & self._stats.bytes.notnull(), + condition=self._stats.max_optimize_timestamp.notnull() & self._stats.bytes.notnull(), boolean_column_name="rec_optimize", reason_column_name="rec_optimize_reason", f_apply_legend=check_min_table_size_apply_legend, @@ -321,14 +321,14 @@ def check_timestamp_diff_apply_legend( stats_sub.loc[:, 'lag'] = ( stats_sub[kwargs["timestamp1_to_evaluate"]] - stats_sub[kwargs["timestamp2_to_evaluate"]] ).dt.days - condition2 = stats_sub['lag'] > kwargs["threshold"] + condition2 = stats_sub['lag'] < kwargs["threshold"] stats_sub.loc[condition2, boolean_column_name] = True stats_sub.loc[condition2, reason_column_name] = kwargs["reason"] return stats_sub def _optimized_too_frequently(self) -> pd.DataFrame: self._apply_changes_to_stats( - condition=~self._stats.max_optimize_timestamp.isnull() & 
~self._stats["2nd_optimize_timestamp"].isnull(), + condition=self._stats.max_optimize_timestamp.notnull() & self._stats["2nd_optimize_timestamp"].notnull(), boolean_column_name="rec_optimize", reason_column_name="rec_optimize_reason", f_apply_legend=self.check_timestamp_diff_apply_legend, @@ -372,7 +372,7 @@ def _not_vacuumed_last_days(self) -> pd.DataFrame: def _vacuumed_too_frequently(self) -> pd.DataFrame: self._apply_changes_to_stats( - condition=~self._stats.max_vacuum_timestamp.isnull() & ~self._stats["2nd_vacuum_timestamp"].isnull(), + condition=self._stats.max_vacuum_timestamp.notnull() & self._stats["2nd_vacuum_timestamp"].notnull(), boolean_column_name="rec_vacuum", reason_column_name="rec_vacuum_reason", f_apply_legend=self.check_timestamp_diff_apply_legend, diff --git a/tests/unit/data/delta_housekeeping/dhk_pandas_result.csv b/tests/unit/data/delta_housekeeping/dhk_pandas_result.csv index 6a60520..2616a7c 100644 --- a/tests/unit/data/delta_housekeeping/dhk_pandas_result.csv +++ b/tests/unit/data/delta_housekeeping/dhk_pandas_result.csv @@ -2,7 +2,7 @@ catalog,database,tableName,number_of_files,bytes,max_optimize_timestamp,2nd_opti lorenzorubi,default,housekeeping_summary_v3,1,3787,null,null,null,null,null,null,null,null,null lorenzorubi,maxmind_geo,gold_ipv6,1,4907069,null,null,null,null,null,null,null,null,null lorenzorubi,default,click_sales,6,326068799,null,null,2023-12-06T16:40:28Z,2023-12-05T01:19:47Z,null,null,null,null,null -lorenzorubi,default,housekeeping_summary,1,192917,2023-12-05T05:50:14Z,2023-12-05T05:21:22Z,null,null,192917,192917,192917,[],null +lorenzorubi,default,housekeeping_summary,1,192917,2023-12-05T05:50:14Z,2023-12-05T05:21:22Z,null,null,192917,192917,192917,["a"],null lorenzorubi,default,housekeeping_summary_v2,3,12326,2023-12-18T11:25:35Z,null,null,null,5273,5273,5273,[],null lorenzorubi,maxmind_geo,raw_locations,1,6933,null,null,null,null,null,null,null,null,null lorenzorubi,tpch,customer,1,61897021,null,null,null,null,null,null,null,null,null diff --git a/tests/unit/data/delta_housekeeping/expected_do_not_need_optimize.csv b/tests/unit/data/delta_housekeeping/expected_do_not_need_optimize.csv new file mode 100644 index 0000000..ee17ad5 --- /dev/null +++ b/tests/unit/data/delta_housekeeping/expected_do_not_need_optimize.csv @@ -0,0 +1,3 @@ +catalog,database,tableName,number_of_files,bytes,max_optimize_timestamp,2nd_optimize_timestamp,max_vacuum_timestamp,2nd_vacuum_timestamp,min_file_size,p50_file_size,max_file_size,z_order_by,error,rec_optimize,rec_optimize_reason,rec_vacuum,rec_vacuum_reason,rec_misc,rec_misc_reason +lorenzorubi,default,housekeeping_summary,1.0,192917.0,2023-12-05T05:50:14Z,2023-12-05T05:21:22Z,,,192917.0,192917.0,192917.0,[],,True, | Tables that are not OPTIMIZED often enough | Tables that are OPTIMIZED too often | Tables that are too small to be OPTIMIZED,True,The table has never been VACUUM'ed | | ,False, | +lorenzorubi,default,housekeeping_summary_v2,3.0,12326.0,2023-12-18T11:25:35Z,,,,5273.0,5273.0,5273.0,[],,True, | Tables that are not OPTIMIZED often enough | | Tables that are too small to be OPTIMIZED,True,The table has never been VACUUM'ed | | ,True,Tables that need more analysis -small_files | diff --git a/tests/unit/data/delta_housekeeping/expected_optimized_too_freq.csv b/tests/unit/data/delta_housekeeping/expected_optimized_too_freq.csv new file mode 100644 index 0000000..656b2e0 --- /dev/null +++ b/tests/unit/data/delta_housekeeping/expected_optimized_too_freq.csv @@ -0,0 +1,2 @@ 
+catalog,database,tableName,number_of_files,bytes,max_optimize_timestamp,2nd_optimize_timestamp,max_vacuum_timestamp,2nd_vacuum_timestamp,min_file_size,p50_file_size,max_file_size,z_order_by,error,rec_optimize,rec_optimize_reason,rec_vacuum,rec_vacuum_reason,rec_misc,rec_misc_reason +lorenzorubi,default,housekeeping_summary,1.0,192917.0,2023-12-05T05:50:14Z,2023-12-05T05:21:22Z,,,192917.0,192917.0,192917.0,[],,True, | Tables that are not OPTIMIZED often enough | Tables that are OPTIMIZED too often | ,True,The table has never been VACUUM'ed | | ,False, | diff --git a/tests/unit/data/delta_housekeeping/expected_vacuumed_too_freq.csv b/tests/unit/data/delta_housekeeping/expected_vacuumed_too_freq.csv new file mode 100644 index 0000000..ac87c76 --- /dev/null +++ b/tests/unit/data/delta_housekeeping/expected_vacuumed_too_freq.csv @@ -0,0 +1,3 @@ +catalog,database,tableName,number_of_files,bytes,max_optimize_timestamp,2nd_optimize_timestamp,max_vacuum_timestamp,2nd_vacuum_timestamp,min_file_size,p50_file_size,max_file_size,z_order_by,error,rec_optimize,rec_optimize_reason,rec_vacuum,rec_vacuum_reason,rec_misc,rec_misc_reason +lorenzorubi,default,click_sales,6.0,326068799.0,,,2023-12-06T16:40:28Z,2023-12-05T01:19:47Z,,,,,,True,The table has not been OPTIMIZED and would benefit from it | | | ,True, | Tables that are not VACUUM'ed often enough | Tables that are VACUUM'ed too often,False, | +lorenzorubi,default,complete_data,6.0,326060019.0,,,2023-12-06T16:40:36Z,2023-12-05T01:19:25Z,,,,,,True,The table has not been OPTIMIZED and would benefit from it | | | ,True, | Tables that are not VACUUM'ed often enough | Tables that are VACUUM'ed too often,False, | diff --git a/tests/unit/data/delta_housekeeping/expected_zorder_not_effective.csv b/tests/unit/data/delta_housekeeping/expected_zorder_not_effective.csv new file mode 100644 index 0000000..cd0da45 --- /dev/null +++ b/tests/unit/data/delta_housekeeping/expected_zorder_not_effective.csv @@ -0,0 +1,2 @@ +catalog,database,tableName,number_of_files,bytes,max_optimize_timestamp,2nd_optimize_timestamp,max_vacuum_timestamp,2nd_vacuum_timestamp,min_file_size,p50_file_size,max_file_size,z_order_by,error,rec_optimize,rec_optimize_reason,rec_vacuum,rec_vacuum_reason,rec_misc,rec_misc_reason +lorenzorubi,default,housekeeping_summary,1.0,192917.0,2023-12-05T05:50:14Z,2023-12-05T05:21:22Z,,,192917.0,192917.0,192917.0,"[""a""]",,True, | Tables that are not OPTIMIZED often enough | Tables that are OPTIMIZED too often | Tables that are too small to be OPTIMIZED,True,The table has never been VACUUM'ed | | ,True, | Tables for which ZORDER is not being effective diff --git a/tests/unit/delta_housekeeping_actions_test.py b/tests/unit/delta_housekeeping_actions_test.py index 87b3ecf..dc30b56 100644 --- a/tests/unit/delta_housekeeping_actions_test.py +++ b/tests/unit/delta_housekeeping_actions_test.py @@ -1,5 +1,7 @@ import pytest import pandas as pd +import datetime +import discoverx.delta_housekeeping as mut from discoverx.delta_housekeeping import DeltaHousekeepingActions from pathlib import Path @@ -40,7 +42,36 @@ def expected_need_analysis(request): return _resolve_file_path(request, "data/delta_housekeeping/expected_need_analysis.csv") -def test_apply_need_optimize(housekeeping_stats, expected_need_optimize): +@pytest.fixture() +def expected_optimized_too_freq(request): + return _resolve_file_path(request, "data/delta_housekeeping/expected_optimized_too_freq.csv") + + +@pytest.fixture() +def expected_vacuumed_too_freq(request): + return _resolve_file_path(request, 
"data/delta_housekeeping/expected_vacuumed_too_freq.csv") + + +@pytest.fixture() +def expected_do_not_need_optimize(request): + return _resolve_file_path(request, "data/delta_housekeeping/expected_do_not_need_optimize.csv") + + +@pytest.fixture() +def expected_zorder_not_effective(request): + return _resolve_file_path(request, "data/delta_housekeeping/expected_zorder_not_effective.csv") + + +@pytest.fixture() +def patch_datetime_now(monkeypatch): + class mydatetime: + @classmethod + def now(cls, tzinfo): + return datetime.datetime(2024, 1, 28, 12, 0, 0).replace(tzinfo=tzinfo) + monkeypatch.setattr(mut, 'datetime', mydatetime) + + +def test_apply_need_optimize(housekeeping_stats, expected_need_optimize, patch_datetime_now): dha = DeltaHousekeepingActions( None, stats=housekeeping_stats, @@ -56,7 +87,7 @@ def test_apply_need_optimize(housekeeping_stats, expected_need_optimize): ) -def test_empty_apply_need_optimize(housekeeping_stats): +def test_empty_apply_need_optimize(housekeeping_stats, patch_datetime_now): dha = DeltaHousekeepingActions( None, stats=housekeeping_stats, @@ -69,7 +100,7 @@ def test_empty_apply_need_optimize(housekeeping_stats): assert need_optimize_df.shape[0] == 0 -def test_apply_need_vacuum(housekeeping_stats, expected_need_vacuum): +def test_apply_need_vacuum(housekeeping_stats, expected_need_vacuum, patch_datetime_now): dha = DeltaHousekeepingActions( None, stats=housekeeping_stats, @@ -83,7 +114,7 @@ def test_apply_need_vacuum(housekeeping_stats, expected_need_vacuum): ) -def test_apply_not_optimized_last_days(housekeeping_stats, expected_not_optimized_last_days): +def test_apply_not_optimized_last_days(housekeeping_stats, expected_not_optimized_last_days, patch_datetime_now): dha = DeltaHousekeepingActions( None, stats=housekeeping_stats, @@ -99,7 +130,7 @@ def test_apply_not_optimized_last_days(housekeeping_stats, expected_not_optimize ) -def test_empty_apply_not_optimized_last_days(housekeeping_stats): +def test_empty_apply_not_optimized_last_days(housekeeping_stats, patch_datetime_now): dha = DeltaHousekeepingActions( None, stats=housekeeping_stats, @@ -112,7 +143,7 @@ def test_empty_apply_not_optimized_last_days(housekeeping_stats): assert need_optimize_df.shape[0] == 0 -def test_apply_not_vacuumed_last_days(housekeeping_stats, expected_not_vacuumed_last_days): +def test_apply_not_vacuumed_last_days(housekeeping_stats, expected_not_vacuumed_last_days, patch_datetime_now): dha = DeltaHousekeepingActions( None, stats=housekeeping_stats, @@ -128,7 +159,7 @@ def test_apply_not_vacuumed_last_days(housekeeping_stats, expected_not_vacuumed_ ) -def test_empty_apply_not_vacuumed_last_days(housekeeping_stats): +def test_empty_apply_not_vacuumed_last_days(housekeeping_stats, patch_datetime_now): dha = DeltaHousekeepingActions( None, stats=housekeeping_stats, @@ -141,8 +172,7 @@ def test_empty_apply_not_vacuumed_last_days(housekeeping_stats): assert need_vacuum_df.shape[0] == 0 -def test_apply_optimized_too_freq(housekeeping_stats): - # TODO add an example in the dataset? 
+def test_apply_optimized_too_freq(housekeeping_stats, expected_optimized_too_freq, patch_datetime_now): dha = DeltaHousekeepingActions( None, stats=housekeeping_stats, @@ -151,14 +181,18 @@ def test_apply_optimized_too_freq(housekeeping_stats): need_optimize_df = res.loc[ res["rec_optimize"] & res["rec_optimize_reason"].str.contains(dha.tables_optimized_too_freq), : ] - assert need_optimize_df.shape[0] == 0 + assert need_optimize_df.shape[0] == 1 + pd.testing.assert_frame_equal( + need_optimize_df.reset_index().loc[:, ["catalog", "database", "tableName"]], + expected_optimized_too_freq.loc[:, ["catalog", "database", "tableName"]], + ) -def test_empty_apply_optimized_too_freq(housekeeping_stats): +def test_empty_apply_optimized_too_freq(housekeeping_stats, patch_datetime_now): dha = DeltaHousekeepingActions( None, stats=housekeeping_stats, - max_optimize_freq=1e60 + max_optimize_freq=0 ) res = dha.generate_recommendations() need_optimize_df = res.loc[ @@ -167,8 +201,7 @@ def test_empty_apply_optimized_too_freq(housekeeping_stats): assert need_optimize_df.shape[0] == 0 -def test_apply_vacuumed_too_freq(housekeeping_stats): - # TODO add an example in the dataset? +def test_apply_vacuumed_too_freq(housekeeping_stats, expected_vacuumed_too_freq, patch_datetime_now): dha = DeltaHousekeepingActions( None, stats=housekeeping_stats, @@ -177,14 +210,18 @@ def test_apply_vacuumed_too_freq(housekeeping_stats): need_vacuum_df = res.loc[ res["rec_vacuum"] & res["rec_vacuum_reason"].str.contains(dha.tables_vacuumed_too_freq), : ] - assert need_vacuum_df.shape[0] == 0 + assert need_vacuum_df.shape[0] == 2 + pd.testing.assert_frame_equal( + need_vacuum_df.reset_index().loc[:, ["catalog", "database", "tableName"]], + expected_vacuumed_too_freq.loc[:, ["catalog", "database", "tableName"]], + ) -def test_empty_apply_vacuumed_too_freq(housekeeping_stats): +def test_empty_apply_vacuumed_too_freq(housekeeping_stats, patch_datetime_now): dha = DeltaHousekeepingActions( None, stats=housekeeping_stats, - max_vacuum_freq=1e60 + max_vacuum_freq=0 ) res = dha.generate_recommendations() need_vacuum_df = res.loc[ @@ -193,8 +230,7 @@ def test_empty_apply_vacuumed_too_freq(housekeeping_stats): assert need_vacuum_df.shape[0] == 0 -def test_apply_do_not_need_optimize(housekeeping_stats): - # TODO add an example in the dataset? 
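The "too often" checks compare the most recent OPTIMIZE or VACUUM timestamp with the second most recent one. For lorenzorubi.default.housekeeping_summary the two OPTIMIZE timestamps in dhk_pandas_result.csv are roughly half an hour apart, so the gap in whole days is 0 and the table is flagged as "OPTIMIZED too often", which is what the reworked test_apply_optimized_too_freq above now asserts (one matching row). A minimal sketch of that gap calculation, with the timestamps copied from the CSV (the max_optimize_freq default itself is not visible in this hunk):

import pandas as pd

# The two most recent OPTIMIZE runs recorded for housekeeping_summary.
max_ts = pd.Timestamp("2023-12-05T05:50:14Z")
second_ts = pd.Timestamp("2023-12-05T05:21:22Z")

gap_days = (max_ts - second_ts).days
print(gap_days)  # 0 -> below any per-day frequency threshold, so the table is flagged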
+def test_apply_do_not_need_optimize(housekeeping_stats, expected_do_not_need_optimize, patch_datetime_now): dha = DeltaHousekeepingActions( None, stats=housekeeping_stats, @@ -203,14 +239,18 @@ def test_apply_do_not_need_optimize(housekeeping_stats): need_optimize_df = res.loc[ res["rec_optimize"] & res["rec_optimize_reason"].str.contains(dha.tables_do_not_need_optimize), : ] - assert need_optimize_df.shape[0] == 0 + assert need_optimize_df.shape[0] == 2 + pd.testing.assert_frame_equal( + need_optimize_df.reset_index().loc[:, ["catalog", "database", "tableName"]], + expected_do_not_need_optimize.loc[:, ["catalog", "database", "tableName"]], + ) -def test_empty_apply_do_not_need_optimize(housekeeping_stats): +def test_empty_apply_do_not_need_optimize(housekeeping_stats, patch_datetime_now): dha = DeltaHousekeepingActions( None, stats=housekeeping_stats, - min_table_size_optimize=1e60 + min_table_size_optimize=0 ) res = dha.generate_recommendations() need_optimize_df = res.loc[ @@ -219,7 +259,7 @@ def test_empty_apply_do_not_need_optimize(housekeeping_stats): assert need_optimize_df.shape[0] == 0 -def test_apply_analyze_tables(housekeeping_stats, expected_need_analysis): +def test_apply_analyze_tables(housekeeping_stats, expected_need_analysis, patch_datetime_now): dha = DeltaHousekeepingActions( None, stats=housekeeping_stats, @@ -229,16 +269,13 @@ def test_apply_analyze_tables(housekeeping_stats, expected_need_analysis): res["rec_misc"] & res["rec_misc_reason"].str.contains(dha.tables_to_analyze), : ] assert need_analysis_df.shape[0] == 1 - # module_path = Path(request.module.__file__) - # test_file_path = module_path.parent / "data/delta_housekeeping/expected_need_analysis.csv" - # need_analysis_df.to_csv(str(test_file_path.resolve()), index=False) pd.testing.assert_frame_equal( need_analysis_df.reset_index().loc[:, ["catalog", "database", "tableName"]], expected_need_analysis.loc[:, ["catalog", "database", "tableName"]], ) -def test_empty_apply_analyze_tables(housekeeping_stats): +def test_empty_apply_analyze_tables(housekeeping_stats, patch_datetime_now): dha = DeltaHousekeepingActions( None, stats=housekeeping_stats, @@ -251,8 +288,7 @@ def test_empty_apply_analyze_tables(housekeeping_stats): assert need_analysis_df.shape[0] == 0 -def test_apply_zorder_not_effective(request, housekeeping_stats, expected_need_analysis): - # TODO add an example in the dataset? +def test_apply_zorder_not_effective(housekeeping_stats, expected_zorder_not_effective, patch_datetime_now): dha = DeltaHousekeepingActions( None, stats=housekeeping_stats, @@ -261,11 +297,14 @@ def test_apply_zorder_not_effective(request, housekeeping_stats, expected_need_a need_analysis_df = res.loc[ res["rec_misc"] & res["rec_misc_reason"].str.contains(dha.tables_zorder_not_effective), : ] - assert need_analysis_df.shape[0] == 0 + assert need_analysis_df.shape[0] == 1 + pd.testing.assert_frame_equal( + need_analysis_df.reset_index().loc[:, ["catalog", "database", "tableName"]], + expected_zorder_not_effective.loc[:, ["catalog", "database", "tableName"]], + ) -def test_empty_apply_zorder_not_effective(housekeeping_stats): - # TODO add an example in the dataset? 
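All of these reworked assertions share one pattern: filter the generate_recommendations() output on the boolean rec_* flag together with a substring match on the corresponding *_reason column (the reasons are legend strings joined with " | "), then compare only the table-identifying columns against the expected CSV. A minimal, self-contained illustration of that pattern; the toy frame and legend text below are invented for the example, not taken from the fixture CSVs:

import pandas as pd

# Toy frame shaped like generate_recommendations() output.
res = pd.DataFrame({
    "catalog": ["c1", "c1"],
    "database": ["db", "db"],
    "tableName": ["t1", "t2"],
    "rec_optimize": [True, False],
    "rec_optimize_reason": ["legend A | legend B", ""],
})

# Same filter used by the tests: boolean flag AND reason-substring match.
flagged = res.loc[res["rec_optimize"] & res["rec_optimize_reason"].str.contains("legend B"), :]

# Same comparison style: only the identifying columns are asserted on.
expected = pd.DataFrame({"catalog": ["c1"], "database": ["db"], "tableName": ["t1"]})
pd.testing.assert_frame_equal(
    flagged.reset_index(drop=True).loc[:, ["catalog", "database", "tableName"]],
    expected,
)

Note that str.contains() treats its argument as a regular expression by default; the legends used in these tests contain no regex metacharacters, so the call behaves as a plain substring match.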
+def test_empty_apply_zorder_not_effective(housekeeping_stats, patch_datetime_now): dha = DeltaHousekeepingActions( None, stats=housekeeping_stats, @@ -278,7 +317,7 @@ def test_empty_apply_zorder_not_effective(housekeeping_stats): assert need_analysis_df.shape[0] == 0 -def test_explain(housekeeping_stats): +def test_explain(housekeeping_stats, patch_datetime_now): dha = DeltaHousekeepingActions( None, stats=housekeeping_stats, From 9758a002b34f20a0125527d0b2a8cd6baba0e316 Mon Sep 17 00:00:00 2001 From: lorenzorubi-db Date: Sat, 10 Feb 2024 11:47:04 +0000 Subject: [PATCH 18/25] fix explain / apply methods --- discoverx/delta_housekeeping.py | 40 +++++++++++++++-------------- examples/exec_delta_housekeeping.py | 2 +- 2 files changed, 22 insertions(+), 20 deletions(-) diff --git a/discoverx/delta_housekeeping.py b/discoverx/delta_housekeeping.py index 95153e3..4975bc3 100644 --- a/discoverx/delta_housekeeping.py +++ b/discoverx/delta_housekeeping.py @@ -244,7 +244,7 @@ def _apply_changes_to_stats( stats[boolean_column_name_new] = False stats[reason_column_name_new] = None stats_sub = stats.loc[condition] - stats_sub = f_apply_legend(stats_sub, boolean_column_name_new, reason_column_name_new, **kwargs) + stats_sub = f_apply_legend(stats_sub.copy(), boolean_column_name_new, reason_column_name_new, **kwargs) self._stats = pd.merge( self._stats, stats_sub.loc[:, ["catalog", "database", "tableName", boolean_column_name_new, reason_column_name_new]], @@ -426,17 +426,9 @@ def display(self) -> None: def apply(self) -> DataFrame: """Displays recommendations in a DataFrame format""" - out = None - for recomm in self.generate_recommendations(): - for legend, df in recomm.items(): - out_df = self._spark.createDataFrame(df).withColumn("recommendation", F.lit(legend)) - if out is None: - out = out_df - else: - out = out.unionByName(out_df, allowMissingColumns=True) - return out + return self._spark.createDataFrame(self.generate_recommendations()) - def generate_recommendations(self) -> Iterable[dict]: + def generate_recommendations(self) -> pd.DataFrame: """ Generates Delta Housekeeping recommendations as a list of dictionaries (internal use + unit tests only) A dict per recommendation where: @@ -454,38 +446,48 @@ def generate_recommendations(self) -> Iterable[dict]: self._zorder_not_effective(), return self._stats.copy() - def _explain(self) -> Iterable: + def _explain(self) -> Iterable[dict]: stats = self.generate_recommendations() stats_optimize = stats.loc[stats["rec_optimize"], :] stats_vacuum = stats.loc[stats["rec_vacuum"], :] stats_misc = stats.loc[stats["rec_misc"], :] + schema = self._spark.createDataFrame(stats).schema out = [] for legend_optimize in [ self.tables_not_optimized_legend, self.tables_not_optimized_last_days, self.tables_optimized_too_freq, ]: - out.append({legend_optimize: stats_optimize.loc[stats_optimize["rec_optimize_reason"].str.contains(legend_optimize)]}) + out.append({legend_optimize: self._spark.createDataFrame( + stats_optimize.loc[stats_optimize["rec_optimize_reason"].str.contains(legend_optimize)], + schema + )}) for legend_vacuum in [ self.tables_not_vacuumed_legend, self.tables_not_vacuumed_last_days, self.tables_vacuumed_too_freq, ]: - out.append({legend_vacuum: stats_vacuum.loc[stats_vacuum["rec_vacuum_reason"].str.contains(legend_vacuum)]}) + out.append({legend_vacuum: self._spark.createDataFrame( + stats_vacuum.loc[stats_vacuum["rec_vacuum_reason"].str.contains(legend_vacuum)], + schema + )}) for legend_misc in [ self.tables_to_analyze, 
self.tables_zorder_not_effective, ]: - out.append({legend_misc: stats_misc.loc[stats_misc["rec_misc_reason"].str.contains(legend_misc)]}) + out.append({legend_misc: self._spark.createDataFrame( + stats_misc.loc[stats_misc["rec_misc_reason"].str.contains(legend_misc)], + schema + )}) return out def explain(self) -> None: - # TODO better formatting! from databricks.sdk.runtime import display - for legend, df in self._explain().items(): - display(legend) - display(df) + for item in self._explain(): + for legend, df in item.items(): + display(legend) + display(df) diff --git a/examples/exec_delta_housekeeping.py b/examples/exec_delta_housekeeping.py index 561f5cd..411b5e7 100644 --- a/examples/exec_delta_housekeeping.py +++ b/examples/exec_delta_housekeeping.py @@ -48,7 +48,7 @@ # DBTITLE 1,apply() operation generates a spark dataframe with recommendations result = output.apply() -result.select("catalog", "database", "tableName", "recommendation").display() +result.select("catalog", "database", "tableName", "rec_optimize", "rec_optimize_reason", "rec_vacuum", "rec_vacuum_reason", "rec_misc", "rec_misc_reason").display() # COMMAND ---------- From 59760f9a733d47e992cbf0ddc6daea6872e09850 Mon Sep 17 00:00:00 2001 From: lorenzorubi-db Date: Sat, 10 Feb 2024 17:23:37 +0000 Subject: [PATCH 19/25] refactoring to control output column names --- discoverx/delta_housekeeping.py | 142 ++++++++++-------- tests/unit/delta_housekeeping_actions_test.py | 36 ++--- 2 files changed, 95 insertions(+), 83 deletions(-) diff --git a/discoverx/delta_housekeeping.py b/discoverx/delta_housekeeping.py index 4975bc3..fd169ce 100644 --- a/discoverx/delta_housekeeping.py +++ b/discoverx/delta_housekeeping.py @@ -214,15 +214,45 @@ def __init__( self.max_vacuum_freq = max_vacuum_freq self.small_file_threshold = small_file_threshold self.min_number_of_files_for_zorder = min_number_of_files_for_zorder - self.tables_not_optimized_legend = "The table has not been OPTIMIZED and would benefit from it" - self.tables_not_vacuumed_legend = "The table has never been VACUUM'ed" - self.tables_not_optimized_last_days = "Tables that are not OPTIMIZED often enough" - self.tables_not_vacuumed_last_days = "Tables that are not VACUUM'ed often enough" - self.tables_optimized_too_freq = "Tables that are OPTIMIZED too often" - self.tables_vacuumed_too_freq = "Tables that are VACUUM'ed too often" - self.tables_do_not_need_optimize = "Tables that are too small to be OPTIMIZED" - self.tables_to_analyze = "Tables that need more analysis -small_files" - self.tables_zorder_not_effective = "Tables for which ZORDER is not being effective" + self.reason_col_suffix = "_reason" + self.recomendations_dict = { + "not_optimized": { + "legend": "Tables that have never been OPTIMIZED and would benefit from it", + "col_name": "rec_optimize" # "rec_not_optimized" + }, + "not_vacuumed": { + "legend": "Tables that have never been VACUUM'ed", + "col_name": "rec_vacuum" # "rec_not_vacuumed" + }, + "not_optimized_last_days": { + "legend": "Tables that are not OPTIMIZED often enough", + "col_name": "rec_optimize" # "rec_not_optimized_last_days" + }, + "not_vacuumed_last_days": { + "legend": "Tables that are not VACUUM'ed often enough", + "col_name": "rec_vacuum" # "rec_not_vacuumed_last_days" + }, + "optimized_too_freq": { + "legend": "Tables that are OPTIMIZED too often", + "col_name": "rec_optimize" # "rec_optimized_too_freq" + }, + "vacuumed_too_freq": { + "legend": "Tables that are VACUUM'ed too often", + "col_name": "rec_vacuum" # "rec_vacuumed_too_freq" + }, + 
"do_not_need_optimize": { + "legend": "Tables that are too small to be OPTIMIZED", + "col_name": "rec_optimize" # "rec_do_not_need_optimize" + }, + "to_analyze": { + "legend": "Tables that need more analysis -small_files", + "col_name": "rec_misc" # "rec_to_analyze" + }, + "zorder_not_effective": { + "legend": "Tables for which ZORDER is not being effective", + "col_name": "rec_misc" # "rec_zorder_not_effective" + }, + } def _apply_changes_to_stats( self, @@ -264,13 +294,13 @@ def _need_optimize(self) -> pd.DataFrame: def check_min_table_size_apply_legend(stats_sub, boolean_column_name, reason_column_name): condition2 = stats_sub.bytes.astype(int) > self.min_table_size_optimize stats_sub.loc[condition2, boolean_column_name] = True - stats_sub.loc[condition2, reason_column_name] = self.tables_not_optimized_legend + stats_sub.loc[condition2, reason_column_name] = self.recomendations_dict["not_optimized"]["legend"] return stats_sub self._apply_changes_to_stats( condition=self._stats.max_optimize_timestamp.isnull() & self._stats.bytes.notnull(), - boolean_column_name="rec_optimize", - reason_column_name="rec_optimize_reason", + boolean_column_name=self.recomendations_dict["not_optimized"]["col_name"], + reason_column_name=self.recomendations_dict["not_optimized"]["col_name"] + self.reason_col_suffix, f_apply_legend=check_min_table_size_apply_legend, ) @@ -278,13 +308,13 @@ def _optimize_not_needed(self) -> pd.DataFrame: def check_min_table_size_apply_legend(stats_sub, boolean_column_name, reason_column_name): condition2 = stats_sub.max_optimize_timestamp.notnull() & (stats_sub.bytes.astype(int) < self.min_table_size_optimize) stats_sub.loc[condition2, boolean_column_name] = True - stats_sub.loc[condition2, reason_column_name] = self.tables_do_not_need_optimize + stats_sub.loc[condition2, reason_column_name] = self.recomendations_dict["do_not_need_optimize"]["legend"] return stats_sub self._apply_changes_to_stats( condition=self._stats.max_optimize_timestamp.notnull() & self._stats.bytes.notnull(), - boolean_column_name="rec_optimize", - reason_column_name="rec_optimize_reason", + boolean_column_name=self.recomendations_dict["do_not_need_optimize"]["col_name"], + reason_column_name=self.recomendations_dict["do_not_need_optimize"]["col_name"] + self.reason_col_suffix, f_apply_legend=check_min_table_size_apply_legend, ) @@ -304,12 +334,12 @@ def check_timestamps_apply_legend( def _not_optimized_last_days(self) -> pd.DataFrame: self._apply_changes_to_stats( condition=~self._stats.max_optimize_timestamp.isnull(), - boolean_column_name="rec_optimize", - reason_column_name="rec_optimize_reason", + boolean_column_name=self.recomendations_dict["not_optimized_last_days"]["col_name"], + reason_column_name=self.recomendations_dict["not_optimized_last_days"]["col_name"] + self.reason_col_suffix, f_apply_legend=self.check_timestamps_apply_legend, timestamp_to_evaluate="max_optimize_timestamp", threshold=self.min_days_not_optimized, - reason=self.tables_not_optimized_last_days, + reason=self.recomendations_dict["not_optimized_last_days"]["legend"], ) @staticmethod @@ -329,37 +359,37 @@ def check_timestamp_diff_apply_legend( def _optimized_too_frequently(self) -> pd.DataFrame: self._apply_changes_to_stats( condition=self._stats.max_optimize_timestamp.notnull() & self._stats["2nd_optimize_timestamp"].notnull(), - boolean_column_name="rec_optimize", - reason_column_name="rec_optimize_reason", + boolean_column_name=self.recomendations_dict["optimized_too_freq"]["col_name"], + 
reason_column_name=self.recomendations_dict["optimized_too_freq"]["col_name"] + self.reason_col_suffix, f_apply_legend=self.check_timestamp_diff_apply_legend, timestamp1_to_evaluate="max_optimize_timestamp", timestamp2_to_evaluate="2nd_optimize_timestamp", threshold=self.max_optimize_freq, - reason=self.tables_optimized_too_freq, + reason=self.recomendations_dict["optimized_too_freq"]["legend"], ) def _never_vacuumed(self) -> pd.DataFrame: def apply_legend(stats_sub, boolean_column_name, reason_column_name): stats_sub.loc[:, boolean_column_name] = True - stats_sub.loc[:, reason_column_name] = self.tables_not_vacuumed_legend + stats_sub.loc[:, reason_column_name] = self.recomendations_dict["not_vacuumed"]["legend"] return stats_sub self._apply_changes_to_stats( condition=self._stats.max_vacuum_timestamp.isnull(), - boolean_column_name="rec_vacuum", - reason_column_name="rec_vacuum_reason", + boolean_column_name=self.recomendations_dict["not_vacuumed"]["col_name"], + reason_column_name=self.recomendations_dict["not_vacuumed"]["col_name"] + self.reason_col_suffix, f_apply_legend=apply_legend, ) def _not_vacuumed_last_days(self) -> pd.DataFrame: self._apply_changes_to_stats( condition=~self._stats.max_vacuum_timestamp.isnull(), - boolean_column_name="rec_vacuum", - reason_column_name="rec_vacuum_reason", + boolean_column_name=self.recomendations_dict["not_vacuumed_last_days"]["col_name"], + reason_column_name=self.recomendations_dict["not_vacuumed_last_days"]["col_name"] + self.reason_col_suffix, f_apply_legend=self.check_timestamps_apply_legend, timestamp_to_evaluate="max_vacuum_timestamp", threshold=self.min_days_not_vacuumed, - reason=self.tables_not_vacuumed_last_days, + reason=self.recomendations_dict["not_vacuumed_last_days"]["legend"], ) stats = self._stats.copy() stats['max_vacuum_timestamp'] = pd.to_datetime(stats['max_vacuum_timestamp'], utc=True) @@ -373,26 +403,26 @@ def _not_vacuumed_last_days(self) -> pd.DataFrame: def _vacuumed_too_frequently(self) -> pd.DataFrame: self._apply_changes_to_stats( condition=self._stats.max_vacuum_timestamp.notnull() & self._stats["2nd_vacuum_timestamp"].notnull(), - boolean_column_name="rec_vacuum", - reason_column_name="rec_vacuum_reason", + boolean_column_name=self.recomendations_dict["vacuumed_too_freq"]["col_name"], + reason_column_name=self.recomendations_dict["vacuumed_too_freq"]["col_name"] + self.reason_col_suffix, f_apply_legend=self.check_timestamp_diff_apply_legend, timestamp1_to_evaluate="max_vacuum_timestamp", timestamp2_to_evaluate="2nd_vacuum_timestamp", threshold=self.max_vacuum_freq, - reason=self.tables_vacuumed_too_freq, + reason=self.recomendations_dict["vacuumed_too_freq"]["legend"], ) def _analyze_these_tables(self) -> pd.DataFrame: def check_analyze_tables_apply_legend(stats_sub, boolean_column_name, reason_column_name): condition2 = stats_sub['p50_file_size'].astype(int) < self.small_file_threshold stats_sub.loc[condition2, boolean_column_name] = True - stats_sub.loc[condition2, reason_column_name] = self.tables_to_analyze + stats_sub.loc[condition2, reason_column_name] = self.recomendations_dict["to_analyze"]["legend"] return stats_sub self._apply_changes_to_stats( condition=self._stats.max_optimize_timestamp.notnull() & self._stats.p50_file_size.notnull() & (self._stats.number_of_files > 1), - boolean_column_name="rec_misc", - reason_column_name="rec_misc_reason", + boolean_column_name=self.recomendations_dict["to_analyze"]["col_name"], + reason_column_name=self.recomendations_dict["to_analyze"]["col_name"] + 
self.reason_col_suffix, f_apply_legend=check_analyze_tables_apply_legend, ) @@ -406,13 +436,13 @@ def check_zorder_not_effective_apply_legend(stats_sub, boolean_column_name, reas stats_sub = stats_sub.loc[stats_sub['number_of_files'].astype(int) < self.min_number_of_files_for_zorder, :] stats_sub.loc[:, boolean_column_name] = True - stats_sub.loc[:, reason_column_name] = self.tables_zorder_not_effective + stats_sub.loc[:, reason_column_name] = self.recomendations_dict["zorder_not_effective"]["legend"] return stats_sub self._apply_changes_to_stats( condition=self._stats.max_optimize_timestamp.notnull() & self._stats.p50_file_size.notnull(), - boolean_column_name="rec_misc", - reason_column_name="rec_misc_reason", + boolean_column_name=self.recomendations_dict["zorder_not_effective"]["col_name"], + reason_column_name=self.recomendations_dict["zorder_not_effective"]["col_name"] + self.reason_col_suffix, f_apply_legend=check_zorder_not_effective_apply_legend, ) @@ -448,37 +478,19 @@ def generate_recommendations(self) -> pd.DataFrame: def _explain(self) -> Iterable[dict]: stats = self.generate_recommendations() - stats_optimize = stats.loc[stats["rec_optimize"], :] - stats_vacuum = stats.loc[stats["rec_vacuum"], :] - stats_misc = stats.loc[stats["rec_misc"], :] schema = self._spark.createDataFrame(stats).schema out = [] - for legend_optimize in [ - self.tables_not_optimized_legend, - self.tables_not_optimized_last_days, - self.tables_optimized_too_freq, - ]: - out.append({legend_optimize: self._spark.createDataFrame( - stats_optimize.loc[stats_optimize["rec_optimize_reason"].str.contains(legend_optimize)], - schema - )}) - - for legend_vacuum in [ - self.tables_not_vacuumed_legend, - self.tables_not_vacuumed_last_days, - self.tables_vacuumed_too_freq, - ]: - out.append({legend_vacuum: self._spark.createDataFrame( - stats_vacuum.loc[stats_vacuum["rec_vacuum_reason"].str.contains(legend_vacuum)], - schema - )}) - - for legend_misc in [ - self.tables_to_analyze, - self.tables_zorder_not_effective, - ]: - out.append({legend_misc: self._spark.createDataFrame( - stats_misc.loc[stats_misc["rec_misc_reason"].str.contains(legend_misc)], + for _, item in self.recomendations_dict.items(): + legend = None + col_name = None + for k, v in item.items(): + if k == "legend": + legend = v + elif k == "col_name": + col_name = v + + out.append({legend: self._spark.createDataFrame( + stats.loc[stats[col_name] & stats[col_name + self.reason_col_suffix].str.contains(legend)], schema )}) diff --git a/tests/unit/delta_housekeeping_actions_test.py b/tests/unit/delta_housekeeping_actions_test.py index dc30b56..be59985 100644 --- a/tests/unit/delta_housekeeping_actions_test.py +++ b/tests/unit/delta_housekeeping_actions_test.py @@ -78,7 +78,7 @@ def test_apply_need_optimize(housekeeping_stats, expected_need_optimize, patch_d ) res = dha.generate_recommendations() need_optimize_df = res.loc[ - res["rec_optimize"] & res["rec_optimize_reason"].str.contains(dha.tables_not_optimized_legend), : + res["rec_optimize"] & res["rec_optimize_reason"].str.contains(dha.recomendations_dict["not_optimized"]["legend"]), : ] assert need_optimize_df.shape[0] == 3 pd.testing.assert_frame_equal( @@ -95,7 +95,7 @@ def test_empty_apply_need_optimize(housekeeping_stats, patch_datetime_now): ) res = dha.generate_recommendations() need_optimize_df = res.loc[ - res["rec_optimize"] & res["rec_optimize_reason"].str.contains(dha.tables_not_optimized_legend), : + res["rec_optimize"] & 
res["rec_optimize_reason"].str.contains(dha.recomendations_dict["not_optimized"]["legend"]), : ] assert need_optimize_df.shape[0] == 0 @@ -106,7 +106,7 @@ def test_apply_need_vacuum(housekeeping_stats, expected_need_vacuum, patch_datet stats=housekeeping_stats, ) res = dha.generate_recommendations() - need_vacuum_df = res.loc[res["rec_vacuum"] & res["rec_vacuum_reason"].str.contains(dha.tables_not_vacuumed_legend), :] + need_vacuum_df = res.loc[res["rec_vacuum"] & res["rec_vacuum_reason"].str.contains(dha.recomendations_dict["not_vacuumed"]["legend"]), :] assert need_vacuum_df.shape[0] == 17 pd.testing.assert_frame_equal( need_vacuum_df.reset_index().loc[:, ["catalog", "database", "tableName"]], @@ -121,7 +121,7 @@ def test_apply_not_optimized_last_days(housekeeping_stats, expected_not_optimize ) res = dha.generate_recommendations() need_optimize_df = res.loc[ - res["rec_optimize"] & res["rec_optimize_reason"].str.contains(dha.tables_not_optimized_last_days), : + res["rec_optimize"] & res["rec_optimize_reason"].str.contains(dha.recomendations_dict["not_optimized_last_days"]["legend"]), : ] assert need_optimize_df.shape[0] == 2 pd.testing.assert_frame_equal( @@ -138,7 +138,7 @@ def test_empty_apply_not_optimized_last_days(housekeeping_stats, patch_datetime_ ) res = dha.generate_recommendations() need_optimize_df = res.loc[ - res["rec_optimize"] & res["rec_optimize_reason"].str.contains(dha.tables_not_optimized_last_days), : + res["rec_optimize"] & res["rec_optimize_reason"].str.contains(dha.recomendations_dict["not_optimized_last_days"]["legend"]), : ] assert need_optimize_df.shape[0] == 0 @@ -150,7 +150,7 @@ def test_apply_not_vacuumed_last_days(housekeeping_stats, expected_not_vacuumed_ ) res = dha.generate_recommendations() need_vacuum_df = res.loc[ - res["rec_vacuum"] & res["rec_vacuum_reason"].str.contains(dha.tables_not_vacuumed_last_days), : + res["rec_vacuum"] & res["rec_vacuum_reason"].str.contains(dha.recomendations_dict["not_vacuumed_last_days"]["legend"]), : ] assert need_vacuum_df.shape[0] == 2 pd.testing.assert_frame_equal( @@ -167,7 +167,7 @@ def test_empty_apply_not_vacuumed_last_days(housekeeping_stats, patch_datetime_n ) res = dha.generate_recommendations() need_vacuum_df = res.loc[ - res["rec_vacuum"] & res["rec_vacuum_reason"].str.contains(dha.tables_not_vacuumed_last_days), : + res["rec_vacuum"] & res["rec_vacuum_reason"].str.contains(dha.recomendations_dict["not_vacuumed_last_days"]["legend"]), : ] assert need_vacuum_df.shape[0] == 0 @@ -179,7 +179,7 @@ def test_apply_optimized_too_freq(housekeeping_stats, expected_optimized_too_fre ) res = dha.generate_recommendations() need_optimize_df = res.loc[ - res["rec_optimize"] & res["rec_optimize_reason"].str.contains(dha.tables_optimized_too_freq), : + res["rec_optimize"] & res["rec_optimize_reason"].str.contains(dha.recomendations_dict["optimized_too_freq"]["legend"]), : ] assert need_optimize_df.shape[0] == 1 pd.testing.assert_frame_equal( @@ -196,7 +196,7 @@ def test_empty_apply_optimized_too_freq(housekeeping_stats, patch_datetime_now): ) res = dha.generate_recommendations() need_optimize_df = res.loc[ - res["rec_optimize"] & res["rec_optimize_reason"].str.contains(dha.tables_optimized_too_freq), : + res["rec_optimize"] & res["rec_optimize_reason"].str.contains(dha.recomendations_dict["optimized_too_freq"]["legend"]), : ] assert need_optimize_df.shape[0] == 0 @@ -208,7 +208,7 @@ def test_apply_vacuumed_too_freq(housekeeping_stats, expected_vacuumed_too_freq, ) res = dha.generate_recommendations() need_vacuum_df = 
res.loc[ - res["rec_vacuum"] & res["rec_vacuum_reason"].str.contains(dha.tables_vacuumed_too_freq), : + res["rec_vacuum"] & res["rec_vacuum_reason"].str.contains(dha.recomendations_dict["vacuumed_too_freq"]["legend"]), : ] assert need_vacuum_df.shape[0] == 2 pd.testing.assert_frame_equal( @@ -225,7 +225,7 @@ def test_empty_apply_vacuumed_too_freq(housekeeping_stats, patch_datetime_now): ) res = dha.generate_recommendations() need_vacuum_df = res.loc[ - res["rec_vacuum"] & res["rec_vacuum_reason"].str.contains(dha.tables_vacuumed_too_freq), : + res["rec_vacuum"] & res["rec_vacuum_reason"].str.contains(dha.recomendations_dict["vacuumed_too_freq"]["legend"]), : ] assert need_vacuum_df.shape[0] == 0 @@ -237,7 +237,7 @@ def test_apply_do_not_need_optimize(housekeeping_stats, expected_do_not_need_opt ) res = dha.generate_recommendations() need_optimize_df = res.loc[ - res["rec_optimize"] & res["rec_optimize_reason"].str.contains(dha.tables_do_not_need_optimize), : + res["rec_optimize"] & res["rec_optimize_reason"].str.contains(dha.recomendations_dict["do_not_need_optimize"]["legend"]), : ] assert need_optimize_df.shape[0] == 2 pd.testing.assert_frame_equal( @@ -254,7 +254,7 @@ def test_empty_apply_do_not_need_optimize(housekeeping_stats, patch_datetime_now ) res = dha.generate_recommendations() need_optimize_df = res.loc[ - res["rec_optimize"] & res["rec_optimize_reason"].str.contains(dha.tables_do_not_need_optimize), : + res["rec_optimize"] & res["rec_optimize_reason"].str.contains(dha.recomendations_dict["do_not_need_optimize"]["legend"]), : ] assert need_optimize_df.shape[0] == 0 @@ -266,7 +266,7 @@ def test_apply_analyze_tables(housekeeping_stats, expected_need_analysis, patch_ ) res = dha.generate_recommendations() need_analysis_df = res.loc[ - res["rec_misc"] & res["rec_misc_reason"].str.contains(dha.tables_to_analyze), : + res["rec_misc"] & res["rec_misc_reason"].str.contains(dha.recomendations_dict["to_analyze"]["legend"]), : ] assert need_analysis_df.shape[0] == 1 pd.testing.assert_frame_equal( @@ -283,7 +283,7 @@ def test_empty_apply_analyze_tables(housekeeping_stats, patch_datetime_now): ) res = dha.generate_recommendations() need_analysis_df = res.loc[ - res["rec_misc"] & res["rec_misc_reason"].str.contains(dha.tables_to_analyze), : + res["rec_misc"] & res["rec_misc_reason"].str.contains(dha.recomendations_dict["to_analyze"]["legend"]), : ] assert need_analysis_df.shape[0] == 0 @@ -295,7 +295,7 @@ def test_apply_zorder_not_effective(housekeeping_stats, expected_zorder_not_effe ) res = dha.generate_recommendations() need_analysis_df = res.loc[ - res["rec_misc"] & res["rec_misc_reason"].str.contains(dha.tables_zorder_not_effective), : + res["rec_misc"] & res["rec_misc_reason"].str.contains(dha.recomendations_dict["zorder_not_effective"]["legend"]), : ] assert need_analysis_df.shape[0] == 1 pd.testing.assert_frame_equal( @@ -312,7 +312,7 @@ def test_empty_apply_zorder_not_effective(housekeeping_stats, patch_datetime_now ) res = dha.generate_recommendations() need_analysis_df = res.loc[ - res["rec_misc"] & res["rec_misc_reason"].str.contains(dha.tables_zorder_not_effective), : + res["rec_misc"] & res["rec_misc_reason"].str.contains(dha.recomendations_dict["zorder_not_effective"]["legend"]), : ] assert need_analysis_df.shape[0] == 0 @@ -323,4 +323,4 @@ def test_explain(housekeeping_stats, patch_datetime_now): stats=housekeeping_stats, ) res = dha._explain() - assert len(res) == 8 + assert len(res) == 9 From a0d434ea085e678f4d845c80f629f56aa237420a Mon Sep 17 00:00:00 2001 From: 
lorenzorubi-db Date: Sun, 11 Feb 2024 12:17:18 +0000 Subject: [PATCH 20/25] refactoring to spark API -intermediate commit --- discoverx/delta_housekeeping.py | 377 ++++++++---------- tests/unit/delta_housekeeping_actions_test.py | 92 +++-- 2 files changed, 225 insertions(+), 244 deletions(-) diff --git a/discoverx/delta_housekeeping.py b/discoverx/delta_housekeeping.py index fd169ce..281b35d 100644 --- a/discoverx/delta_housekeeping.py +++ b/discoverx/delta_housekeeping.py @@ -198,15 +198,16 @@ def __init__( min_number_of_files_for_zorder: int = 8, stats: pd.DataFrame = None, # for testability only ) -> None: - if stats is None: - self._mapped_pd_dfs = mapped_pd_dfs - stats = pd.concat(self._mapped_pd_dfs) - self._stats: pd.DataFrame = stats - if spark is None: spark = SparkSession.builder.getOrCreate() self._spark = spark - + + if stats is None: + self._mapped_pd_dfs = mapped_pd_dfs + stats = pd.concat(self._mapped_pd_dfs) + self._stats_df: DataFrame = self._spark.createDataFrame(stats) + self._stats_rec: DataFrame = None + self.min_table_size_optimize = min_table_size_optimize self.min_days_not_optimized = min_days_not_optimized self.min_days_not_vacuumed = min_days_not_vacuumed @@ -218,267 +219,224 @@ def __init__( self.recomendations_dict = { "not_optimized": { "legend": "Tables that have never been OPTIMIZED and would benefit from it", - "col_name": "rec_optimize" # "rec_not_optimized" + "col_name": "rec_not_optimized" }, "not_vacuumed": { "legend": "Tables that have never been VACUUM'ed", - "col_name": "rec_vacuum" # "rec_not_vacuumed" + "col_name": "rec_not_vacuumed" }, "not_optimized_last_days": { "legend": "Tables that are not OPTIMIZED often enough", - "col_name": "rec_optimize" # "rec_not_optimized_last_days" + "col_name": "rec_not_optimized_last_days" }, "not_vacuumed_last_days": { "legend": "Tables that are not VACUUM'ed often enough", - "col_name": "rec_vacuum" # "rec_not_vacuumed_last_days" + "col_name": "rec_not_vacuumed_last_days" }, "optimized_too_freq": { "legend": "Tables that are OPTIMIZED too often", - "col_name": "rec_optimize" # "rec_optimized_too_freq" + "col_name": "rec_optimized_too_freq" }, "vacuumed_too_freq": { "legend": "Tables that are VACUUM'ed too often", - "col_name": "rec_vacuum" # "rec_vacuumed_too_freq" + "col_name": "rec_vacuumed_too_freq" }, "do_not_need_optimize": { "legend": "Tables that are too small to be OPTIMIZED", - "col_name": "rec_optimize" # "rec_do_not_need_optimize" + "col_name": "rec_do_not_need_optimize" }, "to_analyze": { "legend": "Tables that need more analysis -small_files", - "col_name": "rec_misc" # "rec_to_analyze" + "col_name": "rec_to_analyze" }, "zorder_not_effective": { "legend": "Tables for which ZORDER is not being effective", - "col_name": "rec_misc" # "rec_zorder_not_effective" + "col_name": "rec_zorder_not_effective" }, } - def _apply_changes_to_stats( - self, - condition: pd.Series, - boolean_column_name: str, - reason_column_name: str, - f_apply_legend: Callable, - **kwargs - ) -> pd.DataFrame: - compose_results = False - boolean_column_name_new = boolean_column_name - reason_column_name_new = reason_column_name - if boolean_column_name in self._stats.columns: - compose_results = True - boolean_column_name_new = boolean_column_name + "_new" - reason_column_name_new = reason_column_name + "_new" - - stats = self._stats.copy() - stats[boolean_column_name_new] = False - stats[reason_column_name_new] = None - stats_sub = stats.loc[condition] - stats_sub = f_apply_legend(stats_sub.copy(), boolean_column_name_new, 
reason_column_name_new, **kwargs) - self._stats = pd.merge( - self._stats, - stats_sub.loc[:, ["catalog", "database", "tableName", boolean_column_name_new, reason_column_name_new]], - on=["catalog", "database", "tableName"], - how="outer", - ) - self._stats = self._stats.fillna({boolean_column_name: False, reason_column_name: ""}) - if compose_results: - self._stats = self._stats.fillna({boolean_column_name_new: False, reason_column_name_new: ""}) - self._stats.loc[:, boolean_column_name] = \ - self._stats[boolean_column_name] | self._stats[boolean_column_name_new] - self._stats.loc[:, reason_column_name] = \ - self._stats[[reason_column_name, reason_column_name_new]].agg(' | '.join, axis=1) # TODO should figure out if either side is None - self._stats.drop([boolean_column_name_new, reason_column_name_new], axis=1, inplace=True) - - def _need_optimize(self) -> pd.DataFrame: - def check_min_table_size_apply_legend(stats_sub, boolean_column_name, reason_column_name): - condition2 = stats_sub.bytes.astype(int) > self.min_table_size_optimize - stats_sub.loc[condition2, boolean_column_name] = True - stats_sub.loc[condition2, reason_column_name] = self.recomendations_dict["not_optimized"]["legend"] - return stats_sub - - self._apply_changes_to_stats( - condition=self._stats.max_optimize_timestamp.isnull() & self._stats.bytes.notnull(), - boolean_column_name=self.recomendations_dict["not_optimized"]["col_name"], - reason_column_name=self.recomendations_dict["not_optimized"]["col_name"] + self.reason_col_suffix, - f_apply_legend=check_min_table_size_apply_legend, + def _need_optimize(self, stats_rec: DataFrame) -> DataFrame: + conf_dict = self.recomendations_dict["not_optimized"] + return stats_rec.withColumn( + conf_dict["col_name"], + F.when( + (F.col("max_optimize_timestamp").isNull() | F.isnan("max_optimize_timestamp")) & + F.col("bytes").isNotNull() & (F.col("bytes").astype("int") > F.lit(self.min_table_size_optimize)), + F.lit(True) + ).otherwise(F.lit(False)) + ).withColumn( + conf_dict["col_name"] + self.reason_col_suffix, F.lit(conf_dict["legend"]) ) - def _optimize_not_needed(self) -> pd.DataFrame: - def check_min_table_size_apply_legend(stats_sub, boolean_column_name, reason_column_name): - condition2 = stats_sub.max_optimize_timestamp.notnull() & (stats_sub.bytes.astype(int) < self.min_table_size_optimize) - stats_sub.loc[condition2, boolean_column_name] = True - stats_sub.loc[condition2, reason_column_name] = self.recomendations_dict["do_not_need_optimize"]["legend"] - return stats_sub - - self._apply_changes_to_stats( - condition=self._stats.max_optimize_timestamp.notnull() & self._stats.bytes.notnull(), - boolean_column_name=self.recomendations_dict["do_not_need_optimize"]["col_name"], - reason_column_name=self.recomendations_dict["do_not_need_optimize"]["col_name"] + self.reason_col_suffix, - f_apply_legend=check_min_table_size_apply_legend, + def _optimize_not_needed(self, stats_rec: DataFrame) -> DataFrame: + conf_dict = self.recomendations_dict["do_not_need_optimize"] + return stats_rec.withColumn( + conf_dict["col_name"], + F.when( + F.col("max_optimize_timestamp").isNotNull() & ~F.isnan(F.col("max_optimize_timestamp")) & + F.col("bytes").isNotNull() & + (F.col("bytes").astype("int") < F.lit(self.min_table_size_optimize)), + F.lit(True) + ).otherwise(F.lit(False)) + ).withColumn( + conf_dict["col_name"] + self.reason_col_suffix, F.lit(conf_dict["legend"]) ) - @staticmethod - def check_timestamps_apply_legend( - stats_sub, boolean_column_name, reason_column_name, **kwargs, - ): 
- stats_sub.loc[:, kwargs["timestamp_to_evaluate"]] = pd.to_datetime(stats_sub[kwargs["timestamp_to_evaluate"]], utc=True) - stats_sub.loc[:, 'lag'] = ( - datetime.now(timezone.utc) - stats_sub[kwargs["timestamp_to_evaluate"]] - ).dt.days - condition2 = stats_sub['lag'] > kwargs["threshold"] - stats_sub.loc[condition2, boolean_column_name] = True - stats_sub.loc[condition2, reason_column_name] = kwargs["reason"] - return stats_sub - - def _not_optimized_last_days(self) -> pd.DataFrame: - self._apply_changes_to_stats( - condition=~self._stats.max_optimize_timestamp.isnull(), - boolean_column_name=self.recomendations_dict["not_optimized_last_days"]["col_name"], - reason_column_name=self.recomendations_dict["not_optimized_last_days"]["col_name"] + self.reason_col_suffix, - f_apply_legend=self.check_timestamps_apply_legend, - timestamp_to_evaluate="max_optimize_timestamp", - threshold=self.min_days_not_optimized, - reason=self.recomendations_dict["not_optimized_last_days"]["legend"], + def _not_optimized_last_days(self, stats_rec: DataFrame) -> DataFrame: + conf_dict = self.recomendations_dict["not_optimized_last_days"] + return stats_rec.withColumn( + "optimize_lag", + F.date_diff(F.lit(datetime.today()), F.col("max_optimize_timestamp")) + ).withColumn( + conf_dict["col_name"], + F.when( + F.col("optimize_lag") > F.lit(self.min_days_not_optimized), + F.lit(True) + ).otherwise(F.lit(False)) + ).withColumn( + conf_dict["col_name"] + self.reason_col_suffix, F.lit(conf_dict["legend"]) ) - @staticmethod - def check_timestamp_diff_apply_legend( - stats_sub, boolean_column_name, reason_column_name, **kwargs, - ): - stats_sub.loc[:, kwargs["timestamp1_to_evaluate"]] = pd.to_datetime(stats_sub[kwargs["timestamp1_to_evaluate"]], utc=True) - stats_sub.loc[:, kwargs["timestamp2_to_evaluate"]] = pd.to_datetime(stats_sub[kwargs["timestamp2_to_evaluate"]], utc=True) - stats_sub.loc[:, 'lag'] = ( - stats_sub[kwargs["timestamp1_to_evaluate"]] - stats_sub[kwargs["timestamp2_to_evaluate"]] - ).dt.days - condition2 = stats_sub['lag'] < kwargs["threshold"] - stats_sub.loc[condition2, boolean_column_name] = True - stats_sub.loc[condition2, reason_column_name] = kwargs["reason"] - return stats_sub - - def _optimized_too_frequently(self) -> pd.DataFrame: - self._apply_changes_to_stats( - condition=self._stats.max_optimize_timestamp.notnull() & self._stats["2nd_optimize_timestamp"].notnull(), - boolean_column_name=self.recomendations_dict["optimized_too_freq"]["col_name"], - reason_column_name=self.recomendations_dict["optimized_too_freq"]["col_name"] + self.reason_col_suffix, - f_apply_legend=self.check_timestamp_diff_apply_legend, - timestamp1_to_evaluate="max_optimize_timestamp", - timestamp2_to_evaluate="2nd_optimize_timestamp", - threshold=self.max_optimize_freq, - reason=self.recomendations_dict["optimized_too_freq"]["legend"], + def _optimized_too_frequently(self, stats_rec: DataFrame) -> DataFrame: + conf_dict = self.recomendations_dict["optimized_too_freq"] + return stats_rec.withColumn( + "optimize_freq", + F.when( + F.col("max_optimize_timestamp").isNotNull() & ~F.isnan(F.col("max_optimize_timestamp")) & + F.col("2nd_optimize_timestamp").isNotNull() & ~F.isnan(F.col("2nd_optimize_timestamp")), + F.date_diff(F.col("max_optimize_timestamp"), F.col("2nd_optimize_timestamp")) + ) + ).withColumn( + conf_dict["col_name"], + F.when( + F.col("optimize_freq") < F.lit(self.max_optimize_freq), + F.lit(True) + ).otherwise(F.lit(False)) + ).withColumn( + conf_dict["col_name"] + self.reason_col_suffix, 
F.lit(conf_dict["legend"]) ) - def _never_vacuumed(self) -> pd.DataFrame: - def apply_legend(stats_sub, boolean_column_name, reason_column_name): - stats_sub.loc[:, boolean_column_name] = True - stats_sub.loc[:, reason_column_name] = self.recomendations_dict["not_vacuumed"]["legend"] - return stats_sub - - self._apply_changes_to_stats( - condition=self._stats.max_vacuum_timestamp.isnull(), - boolean_column_name=self.recomendations_dict["not_vacuumed"]["col_name"], - reason_column_name=self.recomendations_dict["not_vacuumed"]["col_name"] + self.reason_col_suffix, - f_apply_legend=apply_legend, + def _never_vacuumed(self, stats_rec: DataFrame) -> DataFrame: + conf_dict = self.recomendations_dict["not_vacuumed"] + return stats_rec.withColumn( + conf_dict["col_name"], + F.when( + F.col("max_vacuum_timestamp").isNull() | F.isnan("max_vacuum_timestamp"), + F.lit(True) + ).otherwise(F.lit(False)) + ).withColumn( + conf_dict["col_name"] + self.reason_col_suffix, F.lit(conf_dict["legend"]) ) - def _not_vacuumed_last_days(self) -> pd.DataFrame: - self._apply_changes_to_stats( - condition=~self._stats.max_vacuum_timestamp.isnull(), - boolean_column_name=self.recomendations_dict["not_vacuumed_last_days"]["col_name"], - reason_column_name=self.recomendations_dict["not_vacuumed_last_days"]["col_name"] + self.reason_col_suffix, - f_apply_legend=self.check_timestamps_apply_legend, - timestamp_to_evaluate="max_vacuum_timestamp", - threshold=self.min_days_not_vacuumed, - reason=self.recomendations_dict["not_vacuumed_last_days"]["legend"], - ) - stats = self._stats.copy() - stats['max_vacuum_timestamp'] = pd.to_datetime(stats['max_vacuum_timestamp'], utc=True) - stats['vacuum_lag'] = ( - datetime.now(timezone.utc) - stats['max_vacuum_timestamp'] - ).dt.days - return ( - stats[stats['vacuum_lag'] < self.min_days_not_vacuumed] + def _not_vacuumed_last_days(self, stats_rec: DataFrame) -> DataFrame: + conf_dict = self.recomendations_dict["not_vacuumed_last_days"] + return stats_rec.withColumn( + "vacuum_lag", + F.date_diff(F.lit(datetime.today()), F.col("max_vacuum_timestamp")) + ).withColumn( + conf_dict["col_name"], + F.when( + F.col("vacuum_lag") > F.lit(self.min_days_not_vacuumed), + F.lit(True) + ).otherwise(F.lit(False)) + ).withColumn( + conf_dict["col_name"] + self.reason_col_suffix, F.lit(conf_dict["legend"]) ) - def _vacuumed_too_frequently(self) -> pd.DataFrame: - self._apply_changes_to_stats( - condition=self._stats.max_vacuum_timestamp.notnull() & self._stats["2nd_vacuum_timestamp"].notnull(), - boolean_column_name=self.recomendations_dict["vacuumed_too_freq"]["col_name"], - reason_column_name=self.recomendations_dict["vacuumed_too_freq"]["col_name"] + self.reason_col_suffix, - f_apply_legend=self.check_timestamp_diff_apply_legend, - timestamp1_to_evaluate="max_vacuum_timestamp", - timestamp2_to_evaluate="2nd_vacuum_timestamp", - threshold=self.max_vacuum_freq, - reason=self.recomendations_dict["vacuumed_too_freq"]["legend"], + def _vacuumed_too_frequently(self, stats_rec: DataFrame) -> DataFrame: + conf_dict = self.recomendations_dict["vacuumed_too_freq"] + return stats_rec.withColumn( + "vacuum_freq", + F.when( + F.col("max_vacuum_timestamp").isNotNull() & ~F.isnan(F.col("max_vacuum_timestamp")) & + F.col("2nd_vacuum_timestamp").isNotNull() & ~F.isnan(F.col("2nd_vacuum_timestamp")), + F.date_diff(F.col("max_vacuum_timestamp"), F.col("2nd_vacuum_timestamp")) + ) + ).withColumn( + conf_dict["col_name"], + F.when( + F.col("vacuum_freq") < F.lit(self.max_vacuum_freq), + F.lit(True) + 
).otherwise(F.lit(False)) + ).withColumn( + conf_dict["col_name"] + self.reason_col_suffix, F.lit(conf_dict["legend"]) ) - def _analyze_these_tables(self) -> pd.DataFrame: - def check_analyze_tables_apply_legend(stats_sub, boolean_column_name, reason_column_name): - condition2 = stats_sub['p50_file_size'].astype(int) < self.small_file_threshold - stats_sub.loc[condition2, boolean_column_name] = True - stats_sub.loc[condition2, reason_column_name] = self.recomendations_dict["to_analyze"]["legend"] - return stats_sub - - self._apply_changes_to_stats( - condition=self._stats.max_optimize_timestamp.notnull() & self._stats.p50_file_size.notnull() & (self._stats.number_of_files > 1), - boolean_column_name=self.recomendations_dict["to_analyze"]["col_name"], - reason_column_name=self.recomendations_dict["to_analyze"]["col_name"] + self.reason_col_suffix, - f_apply_legend=check_analyze_tables_apply_legend, + def _analyze_these_tables(self, stats_rec: DataFrame) -> DataFrame: + conf_dict = self.recomendations_dict["to_analyze"] + return stats_rec.withColumn( + conf_dict["col_name"], + F.when( + F.col("max_optimize_timestamp").isNotNull() & ~F.isnan(F.col("max_optimize_timestamp")) & + F.col("p50_file_size").isNotNull() & (F.col("number_of_files") > F.lit(1)) & + (F.col("p50_file_size").astype("int") < F.lit(self.small_file_threshold)), + F.lit(True) + ).otherwise(F.lit(False)) + ).withColumn( + conf_dict["col_name"] + self.reason_col_suffix, F.lit(conf_dict["legend"]) ) - def _zorder_not_effective(self) -> pd.DataFrame: - def check_zorder_not_effective_apply_legend(stats_sub, boolean_column_name, reason_column_name): - stats_sub['z_order_by_clean'] = stats_sub['z_order_by'].apply( - lambda x: None if x == "[]" else x.replace('[', '').replace(']', '').replace('"', '')) - stats_sub['z_order_by_array'] = stats_sub['z_order_by_clean'].str.split(',') - - stats_sub = stats_sub.loc[stats_sub['z_order_by_array'].str.len() > 0, :] - stats_sub = stats_sub.loc[stats_sub['number_of_files'].astype(int) < self.min_number_of_files_for_zorder, :] - - stats_sub.loc[:, boolean_column_name] = True - stats_sub.loc[:, reason_column_name] = self.recomendations_dict["zorder_not_effective"]["legend"] - return stats_sub - - self._apply_changes_to_stats( - condition=self._stats.max_optimize_timestamp.notnull() & self._stats.p50_file_size.notnull(), - boolean_column_name=self.recomendations_dict["zorder_not_effective"]["col_name"], - reason_column_name=self.recomendations_dict["zorder_not_effective"]["col_name"] + self.reason_col_suffix, - f_apply_legend=check_zorder_not_effective_apply_legend, + def _zorder_not_effective(self, stats_rec: DataFrame) -> DataFrame: + conf_dict = self.recomendations_dict["zorder_not_effective"] + return stats_rec.withColumn( + "z_order_by_clean", + F.when( + F.col("max_optimize_timestamp").isNull() | F.isnan(F.col("max_optimize_timestamp")) | + F.col("p50_file_size").isNull() | (F.col("z_order_by") == "[]"), + None + ).otherwise( + F.regexp_replace( + F.regexp_replace( + F.regexp_replace( + F.col("z_order_by"), "\\[", "" + ), "\\]", "" + ), '"', "" + ) + ) + ).withColumn( + "z_order_by_array", F.split(F.col("z_order_by_clean"), ",") + ).withColumn( + conf_dict["col_name"], + F.when( + (F.size(F.col("z_order_by_array")) > 0) & + (F.col("number_of_files") < F.lit(self.min_number_of_files_for_zorder)), + F.lit(True) + ).otherwise(F.lit(False)) + ).withColumn( + conf_dict["col_name"] + self.reason_col_suffix, F.lit(conf_dict["legend"]) ) - def stats(self) -> DataFrame: - """Ouputs the stats per 
table""" - return self._spark.createDataFrame(self._stats) - def display(self) -> None: """Executes the Delta housekeeping analysis and displays a sample of results""" return self.apply().display() def apply(self) -> DataFrame: """Displays recommendations in a DataFrame format""" - return self._spark.createDataFrame(self.generate_recommendations()) + return self.generate_recommendations() - def generate_recommendations(self) -> pd.DataFrame: + def generate_recommendations(self) -> DataFrame: """ Generates Delta Housekeeping recommendations as a list of dictionaries (internal use + unit tests only) A dict per recommendation where: - The key is the legend of the recommendation - The value is a pandas df with the affected tables """ - self._need_optimize() - self._never_vacuumed(), - self._not_optimized_last_days(), - self._not_vacuumed_last_days(), - self._optimized_too_frequently(), - self._vacuumed_too_frequently(), - self._optimize_not_needed(), - self._analyze_these_tables(), - self._zorder_not_effective(), - return self._stats.copy() + if self._stats_rec is None: + stats_rec = self._need_optimize(self._stats_df) + stats_rec = self._never_vacuumed(stats_rec) + stats_rec = self._not_optimized_last_days(stats_rec) + stats_rec = self._not_vacuumed_last_days(stats_rec) + stats_rec = self._optimized_too_frequently(stats_rec) + stats_rec = self._vacuumed_too_frequently(stats_rec) + stats_rec = self._optimize_not_needed(stats_rec) + stats_rec = self._analyze_these_tables(stats_rec) + stats_rec = self._zorder_not_effective(stats_rec) + self._stats_rec = stats_rec + + return self._stats_rec def _explain(self) -> Iterable[dict]: stats = self.generate_recommendations() - schema = self._spark.createDataFrame(stats).schema out = [] for _, item in self.recomendations_dict.items(): legend = None @@ -489,10 +447,9 @@ def _explain(self) -> Iterable[dict]: elif k == "col_name": col_name = v - out.append({legend: self._spark.createDataFrame( - stats.loc[stats[col_name] & stats[col_name + self.reason_col_suffix].str.contains(legend)], - schema - )}) + out.append({ + legend: stats.filter(F.col(col_name) & F.col(col_name + self.reason_col_suffix).rlike(legend)) + }) return out diff --git a/tests/unit/delta_housekeeping_actions_test.py b/tests/unit/delta_housekeeping_actions_test.py index be59985..fc476e5 100644 --- a/tests/unit/delta_housekeeping_actions_test.py +++ b/tests/unit/delta_housekeeping_actions_test.py @@ -68,6 +68,11 @@ class mydatetime: @classmethod def now(cls, tzinfo): return datetime.datetime(2024, 1, 28, 12, 0, 0).replace(tzinfo=tzinfo) + + @classmethod + def today(cls): + return datetime.datetime(2024, 1, 28, 12, 0, 0) + monkeypatch.setattr(mut, 'datetime', mydatetime) @@ -76,9 +81,10 @@ def test_apply_need_optimize(housekeeping_stats, expected_need_optimize, patch_d None, stats=housekeeping_stats, ) - res = dha.generate_recommendations() + res = dha.generate_recommendations().toPandas() + col_name = dha.recomendations_dict["not_optimized"]["col_name"] need_optimize_df = res.loc[ - res["rec_optimize"] & res["rec_optimize_reason"].str.contains(dha.recomendations_dict["not_optimized"]["legend"]), : + res[col_name] & res[col_name + dha.reason_col_suffix].str.contains(dha.recomendations_dict["not_optimized"]["legend"]), : ] assert need_optimize_df.shape[0] == 3 pd.testing.assert_frame_equal( @@ -93,9 +99,10 @@ def test_empty_apply_need_optimize(housekeeping_stats, patch_datetime_now): stats=housekeeping_stats, min_table_size_optimize=1024*1024*1024*1024 ) - res = dha.generate_recommendations() + 
res = dha.generate_recommendations().toPandas() + col_name = dha.recomendations_dict["not_optimized"]["col_name"] need_optimize_df = res.loc[ - res["rec_optimize"] & res["rec_optimize_reason"].str.contains(dha.recomendations_dict["not_optimized"]["legend"]), : + res[col_name] & res[col_name + dha.reason_col_suffix].str.contains(dha.recomendations_dict["not_optimized"]["legend"]), : ] assert need_optimize_df.shape[0] == 0 @@ -105,8 +112,11 @@ def test_apply_need_vacuum(housekeeping_stats, expected_need_vacuum, patch_datet None, stats=housekeeping_stats, ) - res = dha.generate_recommendations() - need_vacuum_df = res.loc[res["rec_vacuum"] & res["rec_vacuum_reason"].str.contains(dha.recomendations_dict["not_vacuumed"]["legend"]), :] + res = dha.generate_recommendations().toPandas() + col_name = dha.recomendations_dict["not_vacuumed"]["col_name"] + need_vacuum_df = res.loc[ + res[col_name] & res[col_name + dha.reason_col_suffix].str.contains(dha.recomendations_dict["not_vacuumed"]["legend"]), : + ] assert need_vacuum_df.shape[0] == 17 pd.testing.assert_frame_equal( need_vacuum_df.reset_index().loc[:, ["catalog", "database", "tableName"]], @@ -119,9 +129,10 @@ def test_apply_not_optimized_last_days(housekeeping_stats, expected_not_optimize None, stats=housekeeping_stats, ) - res = dha.generate_recommendations() + res = dha.generate_recommendations().toPandas() + col_name = dha.recomendations_dict["not_optimized_last_days"]["col_name"] need_optimize_df = res.loc[ - res["rec_optimize"] & res["rec_optimize_reason"].str.contains(dha.recomendations_dict["not_optimized_last_days"]["legend"]), : + res[col_name] & res[col_name + dha.reason_col_suffix].str.contains(dha.recomendations_dict["not_optimized_last_days"]["legend"]), : ] assert need_optimize_df.shape[0] == 2 pd.testing.assert_frame_equal( @@ -136,9 +147,10 @@ def test_empty_apply_not_optimized_last_days(housekeeping_stats, patch_datetime_ stats=housekeeping_stats, min_days_not_optimized=1e60 ) - res = dha.generate_recommendations() + res = dha.generate_recommendations().toPandas() + col_name = dha.recomendations_dict["not_optimized_last_days"]["col_name"] need_optimize_df = res.loc[ - res["rec_optimize"] & res["rec_optimize_reason"].str.contains(dha.recomendations_dict["not_optimized_last_days"]["legend"]), : + res[col_name] & res[col_name + dha.reason_col_suffix].str.contains(dha.recomendations_dict["not_optimized_last_days"]["legend"]), : ] assert need_optimize_df.shape[0] == 0 @@ -148,9 +160,10 @@ def test_apply_not_vacuumed_last_days(housekeeping_stats, expected_not_vacuumed_ None, stats=housekeeping_stats, ) - res = dha.generate_recommendations() + res = dha.generate_recommendations().toPandas() + col_name = dha.recomendations_dict["not_vacuumed_last_days"]["col_name"] need_vacuum_df = res.loc[ - res["rec_vacuum"] & res["rec_vacuum_reason"].str.contains(dha.recomendations_dict["not_vacuumed_last_days"]["legend"]), : + res[col_name] & res[col_name + dha.reason_col_suffix].str.contains(dha.recomendations_dict["not_vacuumed_last_days"]["legend"]), : ] assert need_vacuum_df.shape[0] == 2 pd.testing.assert_frame_equal( @@ -165,9 +178,10 @@ def test_empty_apply_not_vacuumed_last_days(housekeeping_stats, patch_datetime_n stats=housekeeping_stats, min_days_not_vacuumed=1e60 ) - res = dha.generate_recommendations() + res = dha.generate_recommendations().toPandas() + col_name = dha.recomendations_dict["not_vacuumed_last_days"]["col_name"] need_vacuum_df = res.loc[ - res["rec_vacuum"] & 
res["rec_vacuum_reason"].str.contains(dha.recomendations_dict["not_vacuumed_last_days"]["legend"]), : + res[col_name] & res[col_name + dha.reason_col_suffix].str.contains(dha.recomendations_dict["not_vacuumed_last_days"]["legend"]), : ] assert need_vacuum_df.shape[0] == 0 @@ -177,9 +191,10 @@ def test_apply_optimized_too_freq(housekeeping_stats, expected_optimized_too_fre None, stats=housekeeping_stats, ) - res = dha.generate_recommendations() + res = dha.generate_recommendations().toPandas() + col_name = dha.recomendations_dict["optimized_too_freq"]["col_name"] need_optimize_df = res.loc[ - res["rec_optimize"] & res["rec_optimize_reason"].str.contains(dha.recomendations_dict["optimized_too_freq"]["legend"]), : + res[col_name] & res[col_name + dha.reason_col_suffix].str.contains(dha.recomendations_dict["optimized_too_freq"]["legend"]), : ] assert need_optimize_df.shape[0] == 1 pd.testing.assert_frame_equal( @@ -194,9 +209,10 @@ def test_empty_apply_optimized_too_freq(housekeeping_stats, patch_datetime_now): stats=housekeeping_stats, max_optimize_freq=0 ) - res = dha.generate_recommendations() + res = dha.generate_recommendations().toPandas() + col_name = dha.recomendations_dict["optimized_too_freq"]["col_name"] need_optimize_df = res.loc[ - res["rec_optimize"] & res["rec_optimize_reason"].str.contains(dha.recomendations_dict["optimized_too_freq"]["legend"]), : + res[col_name] & res[col_name + dha.reason_col_suffix].str.contains(dha.recomendations_dict["optimized_too_freq"]["legend"]), : ] assert need_optimize_df.shape[0] == 0 @@ -206,9 +222,10 @@ def test_apply_vacuumed_too_freq(housekeeping_stats, expected_vacuumed_too_freq, None, stats=housekeeping_stats, ) - res = dha.generate_recommendations() + res = dha.generate_recommendations().toPandas() + col_name = dha.recomendations_dict["vacuumed_too_freq"]["col_name"] need_vacuum_df = res.loc[ - res["rec_vacuum"] & res["rec_vacuum_reason"].str.contains(dha.recomendations_dict["vacuumed_too_freq"]["legend"]), : + res[col_name] & res[col_name + dha.reason_col_suffix].str.contains(dha.recomendations_dict["vacuumed_too_freq"]["legend"]), : ] assert need_vacuum_df.shape[0] == 2 pd.testing.assert_frame_equal( @@ -223,9 +240,10 @@ def test_empty_apply_vacuumed_too_freq(housekeeping_stats, patch_datetime_now): stats=housekeeping_stats, max_vacuum_freq=0 ) - res = dha.generate_recommendations() + res = dha.generate_recommendations().toPandas() + col_name = dha.recomendations_dict["vacuumed_too_freq"]["col_name"] need_vacuum_df = res.loc[ - res["rec_vacuum"] & res["rec_vacuum_reason"].str.contains(dha.recomendations_dict["vacuumed_too_freq"]["legend"]), : + res[col_name] & res[col_name + dha.reason_col_suffix].str.contains(dha.recomendations_dict["vacuumed_too_freq"]["legend"]), : ] assert need_vacuum_df.shape[0] == 0 @@ -235,9 +253,10 @@ def test_apply_do_not_need_optimize(housekeeping_stats, expected_do_not_need_opt None, stats=housekeeping_stats, ) - res = dha.generate_recommendations() + res = dha.generate_recommendations().toPandas() + col_name = dha.recomendations_dict["do_not_need_optimize"]["col_name"] need_optimize_df = res.loc[ - res["rec_optimize"] & res["rec_optimize_reason"].str.contains(dha.recomendations_dict["do_not_need_optimize"]["legend"]), : + res[col_name] & res[col_name + dha.reason_col_suffix].str.contains(dha.recomendations_dict["do_not_need_optimize"]["legend"]), : ] assert need_optimize_df.shape[0] == 2 pd.testing.assert_frame_equal( @@ -252,9 +271,10 @@ def test_empty_apply_do_not_need_optimize(housekeeping_stats, 
patch_datetime_now stats=housekeeping_stats, min_table_size_optimize=0 ) - res = dha.generate_recommendations() + res = dha.generate_recommendations().toPandas() + col_name = dha.recomendations_dict["do_not_need_optimize"]["col_name"] need_optimize_df = res.loc[ - res["rec_optimize"] & res["rec_optimize_reason"].str.contains(dha.recomendations_dict["do_not_need_optimize"]["legend"]), : + res[col_name] & res[col_name + dha.reason_col_suffix].str.contains(dha.recomendations_dict["do_not_need_optimize"]["legend"]), : ] assert need_optimize_df.shape[0] == 0 @@ -264,9 +284,10 @@ def test_apply_analyze_tables(housekeeping_stats, expected_need_analysis, patch_ None, stats=housekeeping_stats, ) - res = dha.generate_recommendations() + res = dha.generate_recommendations().toPandas() + col_name = dha.recomendations_dict["to_analyze"]["col_name"] need_analysis_df = res.loc[ - res["rec_misc"] & res["rec_misc_reason"].str.contains(dha.recomendations_dict["to_analyze"]["legend"]), : + res[col_name] & res[col_name + dha.reason_col_suffix].str.contains(dha.recomendations_dict["to_analyze"]["legend"]), : ] assert need_analysis_df.shape[0] == 1 pd.testing.assert_frame_equal( @@ -281,9 +302,10 @@ def test_empty_apply_analyze_tables(housekeeping_stats, patch_datetime_now): stats=housekeeping_stats, small_file_threshold=0 ) - res = dha.generate_recommendations() + res = dha.generate_recommendations().toPandas() + col_name = dha.recomendations_dict["to_analyze"]["col_name"] need_analysis_df = res.loc[ - res["rec_misc"] & res["rec_misc_reason"].str.contains(dha.recomendations_dict["to_analyze"]["legend"]), : + res[col_name] & res[col_name + dha.reason_col_suffix].str.contains(dha.recomendations_dict["to_analyze"]["legend"]), : ] assert need_analysis_df.shape[0] == 0 @@ -293,9 +315,10 @@ def test_apply_zorder_not_effective(housekeeping_stats, expected_zorder_not_effe None, stats=housekeeping_stats, ) - res = dha.generate_recommendations() + res = dha.generate_recommendations().toPandas() + col_name = dha.recomendations_dict["zorder_not_effective"]["col_name"] need_analysis_df = res.loc[ - res["rec_misc"] & res["rec_misc_reason"].str.contains(dha.recomendations_dict["zorder_not_effective"]["legend"]), : + res[col_name] & res[col_name + dha.reason_col_suffix].str.contains(dha.recomendations_dict["zorder_not_effective"]["legend"]), : ] assert need_analysis_df.shape[0] == 1 pd.testing.assert_frame_equal( @@ -310,9 +333,10 @@ def test_empty_apply_zorder_not_effective(housekeeping_stats, patch_datetime_now stats=housekeeping_stats, min_number_of_files_for_zorder=0 ) - res = dha.generate_recommendations() + res = dha.generate_recommendations().toPandas() + col_name = dha.recomendations_dict["zorder_not_effective"]["col_name"] need_analysis_df = res.loc[ - res["rec_misc"] & res["rec_misc_reason"].str.contains(dha.recomendations_dict["zorder_not_effective"]["legend"]), : + res[col_name] & res[col_name + dha.reason_col_suffix].str.contains(dha.recomendations_dict["zorder_not_effective"]["legend"]), : ] assert need_analysis_df.shape[0] == 0 From 0abe9a2327f6f06254a62d885023dd144b939d09 Mon Sep 17 00:00:00 2001 From: lorenzorubi-db Date: Sun, 11 Feb 2024 12:39:09 +0000 Subject: [PATCH 21/25] tests with DBR -nan's & timestamps --- discoverx/delta_housekeeping.py | 24 +++++++++++++++--------- examples/exec_delta_housekeeping.py | 2 +- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/discoverx/delta_housekeeping.py b/discoverx/delta_housekeeping.py index 281b35d..077c806 100644 --- 
a/discoverx/delta_housekeeping.py +++ b/discoverx/delta_housekeeping.py @@ -206,6 +206,14 @@ def __init__( self._mapped_pd_dfs = mapped_pd_dfs stats = pd.concat(self._mapped_pd_dfs) self._stats_df: DataFrame = self._spark.createDataFrame(stats) + for column in [col_name for col_name in self._stats_df.columns if 'timestamp' in col_name]: + column_type = [dtype for col_name, dtype in self._stats_df.dtypes if col_name == column][0] + if column_type == 'timestamp': + continue + self._stats_df = self._stats_df.withColumn( + column, + F.when(F.isnan(F.col(column)), None).otherwise(F.col(column)) + ) self._stats_rec: DataFrame = None self.min_table_size_optimize = min_table_size_optimize @@ -260,7 +268,7 @@ def _need_optimize(self, stats_rec: DataFrame) -> DataFrame: return stats_rec.withColumn( conf_dict["col_name"], F.when( - (F.col("max_optimize_timestamp").isNull() | F.isnan("max_optimize_timestamp")) & + F.col("max_optimize_timestamp").isNull() & F.col("bytes").isNotNull() & (F.col("bytes").astype("int") > F.lit(self.min_table_size_optimize)), F.lit(True) ).otherwise(F.lit(False)) @@ -273,7 +281,7 @@ def _optimize_not_needed(self, stats_rec: DataFrame) -> DataFrame: return stats_rec.withColumn( conf_dict["col_name"], F.when( - F.col("max_optimize_timestamp").isNotNull() & ~F.isnan(F.col("max_optimize_timestamp")) & + F.col("max_optimize_timestamp").isNotNull() & F.col("bytes").isNotNull() & (F.col("bytes").astype("int") < F.lit(self.min_table_size_optimize)), F.lit(True) @@ -302,8 +310,7 @@ def _optimized_too_frequently(self, stats_rec: DataFrame) -> DataFrame: return stats_rec.withColumn( "optimize_freq", F.when( - F.col("max_optimize_timestamp").isNotNull() & ~F.isnan(F.col("max_optimize_timestamp")) & - F.col("2nd_optimize_timestamp").isNotNull() & ~F.isnan(F.col("2nd_optimize_timestamp")), + F.col("max_optimize_timestamp").isNotNull() & F.col("2nd_optimize_timestamp").isNotNull(), F.date_diff(F.col("max_optimize_timestamp"), F.col("2nd_optimize_timestamp")) ) ).withColumn( @@ -321,7 +328,7 @@ def _never_vacuumed(self, stats_rec: DataFrame) -> DataFrame: return stats_rec.withColumn( conf_dict["col_name"], F.when( - F.col("max_vacuum_timestamp").isNull() | F.isnan("max_vacuum_timestamp"), + F.col("max_vacuum_timestamp").isNull(), F.lit(True) ).otherwise(F.lit(False)) ).withColumn( @@ -348,8 +355,7 @@ def _vacuumed_too_frequently(self, stats_rec: DataFrame) -> DataFrame: return stats_rec.withColumn( "vacuum_freq", F.when( - F.col("max_vacuum_timestamp").isNotNull() & ~F.isnan(F.col("max_vacuum_timestamp")) & - F.col("2nd_vacuum_timestamp").isNotNull() & ~F.isnan(F.col("2nd_vacuum_timestamp")), + F.col("max_vacuum_timestamp").isNotNull() & F.col("2nd_vacuum_timestamp").isNotNull(), F.date_diff(F.col("max_vacuum_timestamp"), F.col("2nd_vacuum_timestamp")) ) ).withColumn( @@ -367,7 +373,7 @@ def _analyze_these_tables(self, stats_rec: DataFrame) -> DataFrame: return stats_rec.withColumn( conf_dict["col_name"], F.when( - F.col("max_optimize_timestamp").isNotNull() & ~F.isnan(F.col("max_optimize_timestamp")) & + F.col("max_optimize_timestamp").isNotNull() & F.col("p50_file_size").isNotNull() & (F.col("number_of_files") > F.lit(1)) & (F.col("p50_file_size").astype("int") < F.lit(self.small_file_threshold)), F.lit(True) @@ -381,7 +387,7 @@ def _zorder_not_effective(self, stats_rec: DataFrame) -> DataFrame: return stats_rec.withColumn( "z_order_by_clean", F.when( - F.col("max_optimize_timestamp").isNull() | F.isnan(F.col("max_optimize_timestamp")) | + F.col("max_optimize_timestamp").isNull() | 
F.col("p50_file_size").isNull() | (F.col("z_order_by") == "[]"), None ).otherwise( diff --git a/examples/exec_delta_housekeeping.py b/examples/exec_delta_housekeeping.py index 411b5e7..bf41e4c 100644 --- a/examples/exec_delta_housekeeping.py +++ b/examples/exec_delta_housekeeping.py @@ -48,7 +48,7 @@ # DBTITLE 1,apply() operation generates a spark dataframe with recommendations result = output.apply() -result.select("catalog", "database", "tableName", "rec_optimize", "rec_optimize_reason", "rec_vacuum", "rec_vacuum_reason", "rec_misc", "rec_misc_reason").display() +result.select("catalog", "database", "tableName", *[c for c in result.columns if c.startswith("rec_")]).display() # COMMAND ---------- From 16e7ec6caf280ddc01ffbce163524d1d13dc8b6d Mon Sep 17 00:00:00 2001 From: lorenzorubi-db Date: Sun, 11 Feb 2024 13:01:07 +0000 Subject: [PATCH 22/25] failing test + cleanup --- tests/unit/explorer_test.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/tests/unit/explorer_test.py b/tests/unit/explorer_test.py index 54d5793..a59f361 100644 --- a/tests/unit/explorer_test.py +++ b/tests/unit/explorer_test.py @@ -3,8 +3,6 @@ from discoverx.explorer import DataExplorer, DataExplorerActions, InfoFetcher, TableInfo -# # Sample test data -# sample_table_info = TableInfo("catalog1", "schema1", "table1", []) @pytest.fixture() def info_fetcher(spark): return InfoFetcher(spark=spark, information_schema="default") @@ -72,8 +70,8 @@ def test_map(spark, info_fetcher): assert len(result) == 1 assert result[0].table == "tb_1" assert result[0].schema == "default" - assert result[0].catalog == None - assert result[0].tags == None + assert result[0].catalog is None + assert result[0].tags is None def test_map_with_tags(spark, info_fetcher): @@ -82,7 +80,7 @@ def test_map_with_tags(spark, info_fetcher): assert len(result) == 1 assert result[0].table == "tb_1" assert result[0].schema == "default" - assert result[0].catalog == None + assert result[0].catalog is None assert len(result[0].tags.table_tags) == 1 @@ -94,7 +92,7 @@ def test_map_with_source_data_formats(spark, info_fetcher): assert len(result) == 1 assert result[0].table == "tb_1" assert result[0].schema == "default" - assert result[0].catalog == None + assert result[0].catalog is None data_explorer = DataExplorer("*.default.tb_1", spark, info_fetcher).with_data_source_formats( data_source_formats=["CSV"] @@ -111,7 +109,7 @@ def test_no_tables_matching_filter(spark, info_fetcher): def test_delta_housekeeeping_call(spark, info_fetcher): data_explorer = DataExplorer("*.default.*", spark, info_fetcher) - result: pandas.DataFrame = data_explorer.delta_housekeeping()._stats + result: pandas.DataFrame = data_explorer.delta_housekeeping()._stats_df.toPandas() print(result['tableName'].count()) assert result['tableName'].count() == 3 for res in result['tableName'].tolist(): From 24edacb63d0424b34598b066b7258b18de61f105 Mon Sep 17 00:00:00 2001 From: lorenzorubi-db Date: Sun, 11 Feb 2024 13:04:59 +0000 Subject: [PATCH 23/25] cleanup --- discoverx/explorer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/discoverx/explorer.py b/discoverx/explorer.py index 38964aa..a578d0e 100644 --- a/discoverx/explorer.py +++ b/discoverx/explorer.py @@ -184,7 +184,7 @@ def scan( discover.scan(rules=rules, sample_size=sample_size, what_if=what_if) return discover - def map(self, f: Callable) -> list[any]: + def map(self, f: Callable[[TableInfo], Any]) -> list[Any]: """Runs a function for each table in the data explorer Args: From 
1b1de405ed691e64c16fbd26ed5f6ce796784565 Mon Sep 17 00:00:00 2001 From: lorenzorubi-db Date: Sun, 11 Feb 2024 13:10:02 +0000 Subject: [PATCH 24/25] cleanup --- discoverx/explorer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/discoverx/explorer.py b/discoverx/explorer.py index a578d0e..d252f52 100644 --- a/discoverx/explorer.py +++ b/discoverx/explorer.py @@ -2,7 +2,7 @@ import copy import re import pandas as pd -from typing import Optional, List, Callable, Iterable +from typing import Optional, List, Callable, Iterable, Any from discoverx import logging from discoverx.common import helper from discoverx.discovery import Discovery From aa671a2aa87805a8e3a677f60bb40462c9b4b07f Mon Sep 17 00:00:00 2001 From: Lorenzo Rubio Date: Fri, 16 Feb 2024 17:50:07 +0100 Subject: [PATCH 25/25] remove 'reason' column from the output dfs --- discoverx/delta_housekeeping.py | 20 +----- tests/unit/delta_housekeeping_actions_test.py | 68 +++++-------------- 2 files changed, 18 insertions(+), 70 deletions(-) diff --git a/discoverx/delta_housekeeping.py b/discoverx/delta_housekeeping.py index 077c806..45daa11 100644 --- a/discoverx/delta_housekeeping.py +++ b/discoverx/delta_housekeeping.py @@ -272,8 +272,6 @@ def _need_optimize(self, stats_rec: DataFrame) -> DataFrame: F.col("bytes").isNotNull() & (F.col("bytes").astype("int") > F.lit(self.min_table_size_optimize)), F.lit(True) ).otherwise(F.lit(False)) - ).withColumn( - conf_dict["col_name"] + self.reason_col_suffix, F.lit(conf_dict["legend"]) ) def _optimize_not_needed(self, stats_rec: DataFrame) -> DataFrame: @@ -286,8 +284,6 @@ def _optimize_not_needed(self, stats_rec: DataFrame) -> DataFrame: (F.col("bytes").astype("int") < F.lit(self.min_table_size_optimize)), F.lit(True) ).otherwise(F.lit(False)) - ).withColumn( - conf_dict["col_name"] + self.reason_col_suffix, F.lit(conf_dict["legend"]) ) def _not_optimized_last_days(self, stats_rec: DataFrame) -> DataFrame: @@ -301,8 +297,6 @@ def _not_optimized_last_days(self, stats_rec: DataFrame) -> DataFrame: F.col("optimize_lag") > F.lit(self.min_days_not_optimized), F.lit(True) ).otherwise(F.lit(False)) - ).withColumn( - conf_dict["col_name"] + self.reason_col_suffix, F.lit(conf_dict["legend"]) ) def _optimized_too_frequently(self, stats_rec: DataFrame) -> DataFrame: @@ -319,8 +313,6 @@ def _optimized_too_frequently(self, stats_rec: DataFrame) -> DataFrame: F.col("optimize_freq") < F.lit(self.max_optimize_freq), F.lit(True) ).otherwise(F.lit(False)) - ).withColumn( - conf_dict["col_name"] + self.reason_col_suffix, F.lit(conf_dict["legend"]) ) def _never_vacuumed(self, stats_rec: DataFrame) -> DataFrame: @@ -331,8 +323,6 @@ def _never_vacuumed(self, stats_rec: DataFrame) -> DataFrame: F.col("max_vacuum_timestamp").isNull(), F.lit(True) ).otherwise(F.lit(False)) - ).withColumn( - conf_dict["col_name"] + self.reason_col_suffix, F.lit(conf_dict["legend"]) ) def _not_vacuumed_last_days(self, stats_rec: DataFrame) -> DataFrame: @@ -346,8 +336,6 @@ def _not_vacuumed_last_days(self, stats_rec: DataFrame) -> DataFrame: F.col("vacuum_lag") > F.lit(self.min_days_not_vacuumed), F.lit(True) ).otherwise(F.lit(False)) - ).withColumn( - conf_dict["col_name"] + self.reason_col_suffix, F.lit(conf_dict["legend"]) ) def _vacuumed_too_frequently(self, stats_rec: DataFrame) -> DataFrame: @@ -364,8 +352,6 @@ def _vacuumed_too_frequently(self, stats_rec: DataFrame) -> DataFrame: F.col("vacuum_freq") < F.lit(self.max_vacuum_freq), F.lit(True) ).otherwise(F.lit(False)) - ).withColumn( - conf_dict["col_name"] + 
self.reason_col_suffix, F.lit(conf_dict["legend"]) ) def _analyze_these_tables(self, stats_rec: DataFrame) -> DataFrame: @@ -378,8 +364,6 @@ def _analyze_these_tables(self, stats_rec: DataFrame) -> DataFrame: (F.col("p50_file_size").astype("int") < F.lit(self.small_file_threshold)), F.lit(True) ).otherwise(F.lit(False)) - ).withColumn( - conf_dict["col_name"] + self.reason_col_suffix, F.lit(conf_dict["legend"]) ) def _zorder_not_effective(self, stats_rec: DataFrame) -> DataFrame: @@ -408,8 +392,6 @@ def _zorder_not_effective(self, stats_rec: DataFrame) -> DataFrame: (F.col("number_of_files") < F.lit(self.min_number_of_files_for_zorder)), F.lit(True) ).otherwise(F.lit(False)) - ).withColumn( - conf_dict["col_name"] + self.reason_col_suffix, F.lit(conf_dict["legend"]) ) def display(self) -> None: @@ -454,7 +436,7 @@ def _explain(self) -> Iterable[dict]: col_name = v out.append({ - legend: stats.filter(F.col(col_name) & F.col(col_name + self.reason_col_suffix).rlike(legend)) + legend: stats.filter(F.col(col_name)) }) return out diff --git a/tests/unit/delta_housekeeping_actions_test.py b/tests/unit/delta_housekeeping_actions_test.py index fc476e5..0ab54fc 100644 --- a/tests/unit/delta_housekeeping_actions_test.py +++ b/tests/unit/delta_housekeeping_actions_test.py @@ -83,9 +83,7 @@ def test_apply_need_optimize(housekeeping_stats, expected_need_optimize, patch_d ) res = dha.generate_recommendations().toPandas() col_name = dha.recomendations_dict["not_optimized"]["col_name"] - need_optimize_df = res.loc[ - res[col_name] & res[col_name + dha.reason_col_suffix].str.contains(dha.recomendations_dict["not_optimized"]["legend"]), : - ] + need_optimize_df = res.loc[res[col_name], :] assert need_optimize_df.shape[0] == 3 pd.testing.assert_frame_equal( need_optimize_df.reset_index().loc[:, ["catalog", "database", "tableName"]], @@ -101,9 +99,7 @@ def test_empty_apply_need_optimize(housekeeping_stats, patch_datetime_now): ) res = dha.generate_recommendations().toPandas() col_name = dha.recomendations_dict["not_optimized"]["col_name"] - need_optimize_df = res.loc[ - res[col_name] & res[col_name + dha.reason_col_suffix].str.contains(dha.recomendations_dict["not_optimized"]["legend"]), : - ] + need_optimize_df = res.loc[res[col_name], :] assert need_optimize_df.shape[0] == 0 @@ -114,9 +110,7 @@ def test_apply_need_vacuum(housekeeping_stats, expected_need_vacuum, patch_datet ) res = dha.generate_recommendations().toPandas() col_name = dha.recomendations_dict["not_vacuumed"]["col_name"] - need_vacuum_df = res.loc[ - res[col_name] & res[col_name + dha.reason_col_suffix].str.contains(dha.recomendations_dict["not_vacuumed"]["legend"]), : - ] + need_vacuum_df = res.loc[res[col_name], :] assert need_vacuum_df.shape[0] == 17 pd.testing.assert_frame_equal( need_vacuum_df.reset_index().loc[:, ["catalog", "database", "tableName"]], @@ -131,9 +125,7 @@ def test_apply_not_optimized_last_days(housekeeping_stats, expected_not_optimize ) res = dha.generate_recommendations().toPandas() col_name = dha.recomendations_dict["not_optimized_last_days"]["col_name"] - need_optimize_df = res.loc[ - res[col_name] & res[col_name + dha.reason_col_suffix].str.contains(dha.recomendations_dict["not_optimized_last_days"]["legend"]), : - ] + need_optimize_df = res.loc[res[col_name], :] assert need_optimize_df.shape[0] == 2 pd.testing.assert_frame_equal( need_optimize_df.reset_index().loc[:, ["catalog", "database", "tableName"]], @@ -149,9 +141,7 @@ def test_empty_apply_not_optimized_last_days(housekeeping_stats, patch_datetime_ ) res = 
dha.generate_recommendations().toPandas() col_name = dha.recomendations_dict["not_optimized_last_days"]["col_name"] - need_optimize_df = res.loc[ - res[col_name] & res[col_name + dha.reason_col_suffix].str.contains(dha.recomendations_dict["not_optimized_last_days"]["legend"]), : - ] + need_optimize_df = res.loc[res[col_name], :] assert need_optimize_df.shape[0] == 0 @@ -162,9 +152,7 @@ def test_apply_not_vacuumed_last_days(housekeeping_stats, expected_not_vacuumed_ ) res = dha.generate_recommendations().toPandas() col_name = dha.recomendations_dict["not_vacuumed_last_days"]["col_name"] - need_vacuum_df = res.loc[ - res[col_name] & res[col_name + dha.reason_col_suffix].str.contains(dha.recomendations_dict["not_vacuumed_last_days"]["legend"]), : - ] + need_vacuum_df = res.loc[res[col_name], :] assert need_vacuum_df.shape[0] == 2 pd.testing.assert_frame_equal( need_vacuum_df.reset_index().loc[:, ["catalog", "database", "tableName"]], @@ -180,9 +168,7 @@ def test_empty_apply_not_vacuumed_last_days(housekeeping_stats, patch_datetime_n ) res = dha.generate_recommendations().toPandas() col_name = dha.recomendations_dict["not_vacuumed_last_days"]["col_name"] - need_vacuum_df = res.loc[ - res[col_name] & res[col_name + dha.reason_col_suffix].str.contains(dha.recomendations_dict["not_vacuumed_last_days"]["legend"]), : - ] + need_vacuum_df = res.loc[res[col_name], :] assert need_vacuum_df.shape[0] == 0 @@ -193,9 +179,7 @@ def test_apply_optimized_too_freq(housekeeping_stats, expected_optimized_too_fre ) res = dha.generate_recommendations().toPandas() col_name = dha.recomendations_dict["optimized_too_freq"]["col_name"] - need_optimize_df = res.loc[ - res[col_name] & res[col_name + dha.reason_col_suffix].str.contains(dha.recomendations_dict["optimized_too_freq"]["legend"]), : - ] + need_optimize_df = res.loc[res[col_name], :] assert need_optimize_df.shape[0] == 1 pd.testing.assert_frame_equal( need_optimize_df.reset_index().loc[:, ["catalog", "database", "tableName"]], @@ -211,9 +195,7 @@ def test_empty_apply_optimized_too_freq(housekeeping_stats, patch_datetime_now): ) res = dha.generate_recommendations().toPandas() col_name = dha.recomendations_dict["optimized_too_freq"]["col_name"] - need_optimize_df = res.loc[ - res[col_name] & res[col_name + dha.reason_col_suffix].str.contains(dha.recomendations_dict["optimized_too_freq"]["legend"]), : - ] + need_optimize_df = res.loc[res[col_name], :] assert need_optimize_df.shape[0] == 0 @@ -224,9 +206,7 @@ def test_apply_vacuumed_too_freq(housekeeping_stats, expected_vacuumed_too_freq, ) res = dha.generate_recommendations().toPandas() col_name = dha.recomendations_dict["vacuumed_too_freq"]["col_name"] - need_vacuum_df = res.loc[ - res[col_name] & res[col_name + dha.reason_col_suffix].str.contains(dha.recomendations_dict["vacuumed_too_freq"]["legend"]), : - ] + need_vacuum_df = res.loc[res[col_name], :] assert need_vacuum_df.shape[0] == 2 pd.testing.assert_frame_equal( need_vacuum_df.reset_index().loc[:, ["catalog", "database", "tableName"]], @@ -242,9 +222,7 @@ def test_empty_apply_vacuumed_too_freq(housekeeping_stats, patch_datetime_now): ) res = dha.generate_recommendations().toPandas() col_name = dha.recomendations_dict["vacuumed_too_freq"]["col_name"] - need_vacuum_df = res.loc[ - res[col_name] & res[col_name + dha.reason_col_suffix].str.contains(dha.recomendations_dict["vacuumed_too_freq"]["legend"]), : - ] + need_vacuum_df = res.loc[res[col_name], :] assert need_vacuum_df.shape[0] == 0 @@ -255,9 +233,7 @@ def 
test_apply_do_not_need_optimize(housekeeping_stats, expected_do_not_need_opt ) res = dha.generate_recommendations().toPandas() col_name = dha.recomendations_dict["do_not_need_optimize"]["col_name"] - need_optimize_df = res.loc[ - res[col_name] & res[col_name + dha.reason_col_suffix].str.contains(dha.recomendations_dict["do_not_need_optimize"]["legend"]), : - ] + need_optimize_df = res.loc[res[col_name], :] assert need_optimize_df.shape[0] == 2 pd.testing.assert_frame_equal( need_optimize_df.reset_index().loc[:, ["catalog", "database", "tableName"]], @@ -273,9 +249,7 @@ def test_empty_apply_do_not_need_optimize(housekeeping_stats, patch_datetime_now ) res = dha.generate_recommendations().toPandas() col_name = dha.recomendations_dict["do_not_need_optimize"]["col_name"] - need_optimize_df = res.loc[ - res[col_name] & res[col_name + dha.reason_col_suffix].str.contains(dha.recomendations_dict["do_not_need_optimize"]["legend"]), : - ] + need_optimize_df = res.loc[res[col_name], :] assert need_optimize_df.shape[0] == 0 @@ -286,9 +260,7 @@ def test_apply_analyze_tables(housekeeping_stats, expected_need_analysis, patch_ ) res = dha.generate_recommendations().toPandas() col_name = dha.recomendations_dict["to_analyze"]["col_name"] - need_analysis_df = res.loc[ - res[col_name] & res[col_name + dha.reason_col_suffix].str.contains(dha.recomendations_dict["to_analyze"]["legend"]), : - ] + need_analysis_df = res.loc[res[col_name], :] assert need_analysis_df.shape[0] == 1 pd.testing.assert_frame_equal( need_analysis_df.reset_index().loc[:, ["catalog", "database", "tableName"]], @@ -304,9 +276,7 @@ def test_empty_apply_analyze_tables(housekeeping_stats, patch_datetime_now): ) res = dha.generate_recommendations().toPandas() col_name = dha.recomendations_dict["to_analyze"]["col_name"] - need_analysis_df = res.loc[ - res[col_name] & res[col_name + dha.reason_col_suffix].str.contains(dha.recomendations_dict["to_analyze"]["legend"]), : - ] + need_analysis_df = res.loc[res[col_name], :] assert need_analysis_df.shape[0] == 0 @@ -317,9 +287,7 @@ def test_apply_zorder_not_effective(housekeeping_stats, expected_zorder_not_effe ) res = dha.generate_recommendations().toPandas() col_name = dha.recomendations_dict["zorder_not_effective"]["col_name"] - need_analysis_df = res.loc[ - res[col_name] & res[col_name + dha.reason_col_suffix].str.contains(dha.recomendations_dict["zorder_not_effective"]["legend"]), : - ] + need_analysis_df = res.loc[res[col_name], :] assert need_analysis_df.shape[0] == 1 pd.testing.assert_frame_equal( need_analysis_df.reset_index().loc[:, ["catalog", "database", "tableName"]], @@ -335,9 +303,7 @@ def test_empty_apply_zorder_not_effective(housekeeping_stats, patch_datetime_now ) res = dha.generate_recommendations().toPandas() col_name = dha.recomendations_dict["zorder_not_effective"]["col_name"] - need_analysis_df = res.loc[ - res[col_name] & res[col_name + dha.reason_col_suffix].str.contains(dha.recomendations_dict["zorder_not_effective"]["legend"]), : - ] + need_analysis_df = res.loc[res[col_name], :] assert need_analysis_df.shape[0] == 0
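
Usage note: taken together, the patches above make the Delta housekeeping output a plain Spark DataFrame with one boolean rec_* column per recommendation (the *_reason columns are dropped in the last patch). The sketch below shows how the reworked API is exercised end to end; it is assembled from the diffs and tests above, assumes a Databricks notebook where `spark` is already defined, and the table pattern "*.default.*" is an illustrative value rather than something mandated by the patches.

    # Minimal sketch of the reworked API (assumes a Databricks notebook where `spark` exists).
    from discoverx.explorer import DataExplorer, InfoFetcher

    info_fetcher = InfoFetcher(spark=spark, information_schema="default")
    explorer = DataExplorer("*.default.*", spark, info_fetcher)  # table pattern is illustrative

    # delta_housekeeping() maps DESCRIBE DETAIL / DESCRIBE HISTORY over the matched Delta tables;
    # apply() returns the recommendations as a Spark DataFrame.
    result = explorer.delta_housekeeping().apply()

    # After the last patch each recommendation is a boolean rec_* flag (no *_reason columns).
    result.select(
        "catalog", "database", "tableName",
        *[c for c in result.columns if c.startswith("rec_")]
    ).display()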