From e19f144b4679b578035ef76c66d4d370c272eb48 Mon Sep 17 00:00:00 2001 From: jineetd <35962652+jineetd@users.noreply.github.com> Date: Wed, 18 Oct 2023 01:52:04 -0400 Subject: [PATCH] Starting the change for XGBoost integration into EVADb. (#1232) Co-authored-by: Jineet Desai Co-authored-by: Andy Xu --- docs/_toc.yml | 19 ++++ evadb/executor/create_function_executor.py | 90 +++++++++++++++++++ evadb/functions/sklearn.py | 25 +++++- evadb/utils/generic_utils.py | 32 +++++++ setup.py | 16 ++++ .../long/test_model_train.py | 28 ++++++ test/markers.py | 14 +++ 7 files changed, 221 insertions(+), 3 deletions(-) diff --git a/docs/_toc.yml b/docs/_toc.yml index 5aedd7e97..56eeccb10 100644 --- a/docs/_toc.yml +++ b/docs/_toc.yml @@ -869,11 +869,15 @@ parts: <<<<<<< HEAD <<<<<<< HEAD <<<<<<< HEAD +<<<<<<< HEAD +======= +>>>>>>> dda3558c (Starting the change for XGBoost integration into EVADb. (#1232)) - file: source/reference/ai/model-train-xgboost title: Model Training with XGBoost ======= <<<<<<< HEAD <<<<<<< HEAD +<<<<<<< HEAD >>>>>>> 70850a8b (feat: sync master staging (#1050)) ======= <<<<<<< HEAD @@ -890,6 +894,10 @@ parts: title: Model Training with XGBoost ======= >>>>>>> 03a6c555 (feat: sync master staging (#1050)) +======= +>>>>>>> 4771bdec (Starting the change for XGBoost integration into EVADb. (#1232)) +======= +>>>>>>> dda3558c (Starting the change for XGBoost integration into EVADb. (#1232)) >>>>>>> 53dafecf (feat: sync master staging (#1050)) ======= - file: source/reference/ai/index @@ -931,10 +939,13 @@ parts: <<<<<<< HEAD <<<<<<< HEAD <<<<<<< HEAD +<<<<<<< HEAD ======= >>>>>>> 03a6c555 (feat: sync master staging (#1050)) ======= >>>>>>> ae08f806 (Bump v0.3.4+ dev) +======= +>>>>>>> dda3558c (Starting the change for XGBoost integration into EVADb. (#1232)) ======= - file: source/reference/ai/model-train title: Model Training @@ -945,15 +956,19 @@ parts: ======= >>>>>>> 6d6a14c8 (Bump v0.3.4+ dev) ======= +<<<<<<< HEAD ======= >>>>>>> 70850a8b (feat: sync master staging (#1050)) ======= >>>>>>> 22e78346 (Bump v0.3.4+ dev) +======= +>>>>>>> dda3558c (Starting the change for XGBoost integration into EVADb. (#1232)) >>>>>>> eva-master ======= - file: source/reference/ai/model-train-xgboost title: Model Training with XGBoost <<<<<<< HEAD +<<<<<<< HEAD >>>>>>> 201f901b (Starting the change for XGBoost integration into EVADb. (#1232)) <<<<<<< HEAD >>>>>>> 4771bdec (Starting the change for XGBoost integration into EVADb. (#1232)) @@ -1025,6 +1040,10 @@ parts: ======= >>>>>>> 6d6a14c8 (Bump v0.3.4+ dev) >>>>>>> ae08f806 (Bump v0.3.4+ dev) +======= +>>>>>>> 201f901b (Starting the change for XGBoost integration into EVADb. (#1232)) +>>>>>>> 4771bdec (Starting the change for XGBoost integration into EVADb. (#1232)) +>>>>>>> dda3558c (Starting the change for XGBoost integration into EVADb. (#1232)) - file: source/reference/ai/model-forecasting title: Time Series Forecasting - file: source/reference/ai/hf diff --git a/evadb/executor/create_function_executor.py b/evadb/executor/create_function_executor.py index fb370c0c3..dffc70b78 100644 --- a/evadb/executor/create_function_executor.py +++ b/evadb/executor/create_function_executor.py @@ -170,6 +170,7 @@ <<<<<<< HEAD <<<<<<< HEAD <<<<<<< HEAD +<<<<<<< HEAD <<<<<<< HEAD DEFAULT_SKLEARN_TRAIN_MODEL, ======= @@ -178,9 +179,14 @@ >>>>>>> 9fe75f29 (feat: sync master staging (#1050)) ======= >>>>>>> b87af508 (feat: sync master staging (#1050)) +======= +>>>>>>> dda3558c (Starting the change for XGBoost integration into EVADb. (#1232)) DEFAULT_TRAIN_REGRESSION_METRIC, ======= >>>>>>> 2dacff69 (feat: sync master staging (#1050)) +======= + DEFAULT_TRAIN_REGRESSION_METRIC, +>>>>>>> 4771bdec (Starting the change for XGBoost integration into EVADb. (#1232)) DEFAULT_TRAIN_TIME_LIMIT, DEFAULT_XGBOOST_TASK, SKLEARN_SUPPORTED_MODELS, @@ -373,6 +379,7 @@ try_to_import_torch, try_to_import_ultralytics, try_to_import_xgboost, +<<<<<<< HEAD ======= ======= string_comparison_case_insensitive, @@ -398,6 +405,8 @@ try_to_import_ultralytics, >>>>>>> b87af508 (feat: sync master staging (#1050)) >>>>>>> 2dacff69 (feat: sync master staging (#1050)) +======= +>>>>>>> 4771bdec (Starting the change for XGBoost integration into EVADb. (#1232)) ) from evadb.utils.logging_manager import logger @@ -1107,12 +1116,18 @@ def handle_sklearn_function(self): FunctionMetadataCatalogEntry("model_path", model_path) ) <<<<<<< HEAD +<<<<<<< HEAD +======= +>>>>>>> 4771bdec (Starting the change for XGBoost integration into EVADb. (#1232)) # Pass the prediction column name to sklearn.py self.node.metadata.append( FunctionMetadataCatalogEntry("predict_col", arg_map["predict"]) ) +<<<<<<< HEAD ======= >>>>>>> 40a10ce1 (Bump v0.3.4+ dev) +======= +>>>>>>> 4771bdec (Starting the change for XGBoost integration into EVADb. (#1232)) impl_path = Path(f"{self.function_dir}/sklearn.py").absolute().as_posix() io_list = self._resolve_function_io(None) @@ -1130,6 +1145,7 @@ def handle_sklearn_function(self): <<<<<<< HEAD <<<<<<< HEAD <<<<<<< HEAD +<<<<<<< HEAD ======= ======= >>>>>>> 7cac771f (Bump v0.3.4+ dev) @@ -1139,6 +1155,72 @@ def handle_sklearn_function(self): >>>>>>> c5f43c65 (Bump v0.3.4+ dev) ======= >>>>>>> ae08f806 (Bump v0.3.4+ dev) +======= +======= +<<<<<<< HEAD +======= + def handle_xgboost_function(self): + """Handle xgboost functions + + We use the Flaml AutoML model for training xgboost models. + """ + try_to_import_xgboost() + + assert ( + len(self.children) == 1 + ), "Create sklearn function expects 1 child, finds {}.".format( + len(self.children) + ) + + aggregated_batch_list = [] + child = self.children[0] + for batch in child.exec(): + aggregated_batch_list.append(batch) + aggregated_batch = Batch.concat(aggregated_batch_list, copy=False) + aggregated_batch.drop_column_alias() + + arg_map = {arg.key: arg.value for arg in self.node.metadata} + from flaml import AutoML + + model = AutoML() + settings = { + "time_budget": arg_map.get("time_limit", DEFAULT_TRAIN_TIME_LIMIT), + "metric": arg_map.get("metric", DEFAULT_TRAIN_REGRESSION_METRIC), + "estimator_list": ["xgboost"], + "task": "regression", + } + model.fit( + dataframe=aggregated_batch.frames, label=arg_map["predict"], **settings + ) + model_path = os.path.join( + self.db.config.get_value("storage", "model_dir"), self.node.name + ) + pickle.dump(model, open(model_path, "wb")) + self.node.metadata.append( + FunctionMetadataCatalogEntry("model_path", model_path) + ) + # Pass the prediction column to xgboost.py. + self.node.metadata.append( + FunctionMetadataCatalogEntry("predict_col", arg_map["predict"]) + ) + + impl_path = Path(f"{self.function_dir}/xgboost.py").absolute().as_posix() + io_list = self._resolve_function_io(None) + return ( + self.node.name, + impl_path, + self.node.function_type, + io_list, + self.node.metadata, + ) + +>>>>>>> 201f901b (Starting the change for XGBoost integration into EVADb. (#1232)) + def handle_ultralytics_function(self): + """Handle Ultralytics functions""" +======= +>>>>>>> 4771bdec (Starting the change for XGBoost integration into EVADb. (#1232)) +<<<<<<< HEAD +>>>>>>> dda3558c (Starting the change for XGBoost integration into EVADb. (#1232)) def handle_xgboost_function(self): """Handle xgboost functions @@ -3455,6 +3537,9 @@ def exec(self, *args, **kwargs): train_time, ) = self.handle_sklearn_function() <<<<<<< HEAD +<<<<<<< HEAD +======= +>>>>>>> 4771bdec (Starting the change for XGBoost integration into EVADb. (#1232)) elif string_comparison_case_insensitive(self.node.function_type, "XGBoost"): ( name, @@ -3466,12 +3551,14 @@ def exec(self, *args, **kwargs): best_score, train_time, ) = self.handle_xgboost_function() +<<<<<<< HEAD ======= <<<<<<< HEAD <<<<<<< HEAD ) = self.handle_sklearn_function() >>>>>>> 40a10ce1 (Bump v0.3.4+ dev) ======= +<<<<<<< HEAD ) = self.handle_xgboost_function() >>>>>>> 4771bdec (Starting the change for XGBoost integration into EVADb. (#1232)) ======= @@ -3480,6 +3567,9 @@ def exec(self, *args, **kwargs): ======= >>>>>>> 40a10ce1 (Bump v0.3.4+ dev) >>>>>>> c5f43c65 (Bump v0.3.4+ dev) +======= +>>>>>>> 4771bdec (Starting the change for XGBoost integration into EVADb. (#1232)) +>>>>>>> dda3558c (Starting the change for XGBoost integration into EVADb. (#1232)) elif string_comparison_case_insensitive(self.node.function_type, "Forecasting"): ( name, diff --git a/evadb/functions/sklearn.py b/evadb/functions/sklearn.py index 5aa5d724c..44e5d715c 100644 --- a/evadb/functions/sklearn.py +++ b/evadb/functions/sklearn.py @@ -33,10 +33,13 @@ def name(self) -> str: <<<<<<< HEAD <<<<<<< HEAD <<<<<<< HEAD +<<<<<<< HEAD ======= >>>>>>> 2170a7a9 (Bump v0.3.4+ dev) ======= >>>>>>> c5f43c65 (Bump v0.3.4+ dev) +======= +>>>>>>> dda3558c (Starting the change for XGBoost integration into EVADb. (#1232)) def setup(self, model_path: str, predict_col: str, **kwargs): try_to_import_flaml_automl() @@ -56,6 +59,7 @@ def forward(self, frames: pd.DataFrame) -> pd.DataFrame: def setup(self, model_path: str, **kwargs): <<<<<<< HEAD <<<<<<< HEAD +<<<<<<< HEAD ======= def setup(self, model_path: str, predict_col: str, **kwargs): >>>>>>> 4771bdec (Starting the change for XGBoost integration into EVADb. (#1232)) @@ -74,20 +78,29 @@ def forward(self, frames: pd.DataFrame) -> pd.DataFrame: ======= ======= >>>>>>> c5f43c65 (Bump v0.3.4+ dev) +======= +======= + def setup(self, model_path: str, predict_col: str, **kwargs): +>>>>>>> 4771bdec (Starting the change for XGBoost integration into EVADb. (#1232)) +>>>>>>> dda3558c (Starting the change for XGBoost integration into EVADb. (#1232)) try_to_import_sklearn() self.model = pickle.load(open(model_path, "rb")) + self.predict_col = predict_col def forward(self, frames: pd.DataFrame) -> pd.DataFrame: - # The last column is the predictor variable column. Hence we do not - # pass that column in the predict method for sklearn. - predictions = self.model.predict(frames.iloc[:, :-1]) + # Do not pass the prediction column in the predict method for sklearn. + frames.drop([self.predict_col], axis=1, inplace=True) + predictions = self.model.predict(frames) predict_df = pd.DataFrame(predictions) # We need to rename the column of the output dataframe. For this we <<<<<<< HEAD +<<<<<<< HEAD >>>>>>> 2170a7a9 (Bump v0.3.4+ dev) ======= >>>>>>> c5f43c65 (Bump v0.3.4+ dev) +======= +>>>>>>> dda3558c (Starting the change for XGBoost integration into EVADb. (#1232)) # shall rename it to the column name same as that of the last column of # frames. This is because the last column of frames corresponds to the # variable we want to predict. @@ -95,15 +108,21 @@ def forward(self, frames: pd.DataFrame) -> pd.DataFrame: >>>>>>> 40a10ce1 (Bump v0.3.4+ dev) <<<<<<< HEAD <<<<<<< HEAD +<<<<<<< HEAD +======= +>>>>>>> dda3558c (Starting the change for XGBoost integration into EVADb. (#1232)) ======= # shall rename it to the column name same as that of the predict column # passed in the training frames in EVA query. predict_df.rename(columns={0: self.predict_col}, inplace=True) >>>>>>> 4771bdec (Starting the change for XGBoost integration into EVADb. (#1232)) +<<<<<<< HEAD ======= >>>>>>> 2170a7a9 (Bump v0.3.4+ dev) ======= >>>>>>> c5f43c65 (Bump v0.3.4+ dev) +======= +>>>>>>> dda3558c (Starting the change for XGBoost integration into EVADb. (#1232)) return predict_df def to_device(self, device: str): diff --git a/evadb/utils/generic_utils.py b/evadb/utils/generic_utils.py index cc6171e0a..0ccfde5a5 100644 --- a/evadb/utils/generic_utils.py +++ b/evadb/utils/generic_utils.py @@ -572,6 +572,12 @@ def is_sklearn_available() -> bool: <<<<<<< HEAD +<<<<<<< HEAD +======= +<<<<<<< HEAD +======= +<<<<<<< HEAD +>>>>>>> 4771bdec (Starting the change for XGBoost integration into EVADb. (#1232)) def try_to_import_sklearn(): try: import sklearn # noqa: F401 @@ -587,16 +593,21 @@ def is_sklearn_available() -> bool: try: try_to_import_sklearn() <<<<<<< HEAD +<<<<<<< HEAD >>>>>>> 6d6a14c8 (Bump v0.3.4+ dev) >>>>>>> 22e78346 (Bump v0.3.4+ dev) ======= <<<<<<< HEAD >>>>>>> c5f43c65 (Bump v0.3.4+ dev) +======= +<<<<<<< HEAD +>>>>>>> dda3558c (Starting the change for XGBoost integration into EVADb. (#1232)) return True except ValueError: # noqa: E722 return False +<<<<<<< HEAD <<<<<<< HEAD def try_to_import_sklearn(): try: @@ -667,6 +678,11 @@ def is_sklearn_available() -> bool: >>>>>>> 40a10ce1 (Bump v0.3.4+ dev) ======= ======= +======= +======= +======= +>>>>>>> 4771bdec (Starting the change for XGBoost integration into EVADb. (#1232)) +>>>>>>> dda3558c (Starting the change for XGBoost integration into EVADb. (#1232)) def try_to_import_xgboost(): try: import flaml # noqa: F401 @@ -683,11 +699,14 @@ def is_xgboost_available() -> bool: try_to_import_xgboost() <<<<<<< HEAD <<<<<<< HEAD +<<<<<<< HEAD >>>>>>> 201f901b (Starting the change for XGBoost integration into EVADb. (#1232)) <<<<<<< HEAD >>>>>>> 4771bdec (Starting the change for XGBoost integration into EVADb. (#1232)) ======= ======= +======= +>>>>>>> dda3558c (Starting the change for XGBoost integration into EVADb. (#1232)) ======= try_to_import_forecast() >>>>>>> 2dacff69 (feat: sync master staging (#1050)) @@ -700,6 +719,7 @@ def is_xgboost_available() -> bool: ======= ======= >>>>>>> 40a10ce1 (Bump v0.3.4+ dev) +<<<<<<< HEAD >>>>>>> 7cac771f (Bump v0.3.4+ dev) >>>>>>> 2170a7a9 (Bump v0.3.4+ dev) ======= @@ -711,6 +731,11 @@ def is_xgboost_available() -> bool: ======= >>>>>>> 40a10ce1 (Bump v0.3.4+ dev) >>>>>>> c5f43c65 (Bump v0.3.4+ dev) +======= +======= +>>>>>>> 201f901b (Starting the change for XGBoost integration into EVADb. (#1232)) +>>>>>>> 4771bdec (Starting the change for XGBoost integration into EVADb. (#1232)) +>>>>>>> dda3558c (Starting the change for XGBoost integration into EVADb. (#1232)) return True except ValueError: # noqa: E722 return False @@ -719,6 +744,9 @@ def is_xgboost_available() -> bool: <<<<<<< HEAD <<<<<<< HEAD <<<<<<< HEAD +<<<<<<< HEAD +======= +>>>>>>> dda3558c (Starting the change for XGBoost integration into EVADb. (#1232)) ======= >>>>>>> 2dacff69 (feat: sync master staging (#1050)) <<<<<<< HEAD @@ -727,6 +755,7 @@ def is_xgboost_available() -> bool: ======= >>>>>>> 201f901b (Starting the change for XGBoost integration into EVADb. (#1232)) <<<<<<< HEAD +<<<<<<< HEAD >>>>>>> 4771bdec (Starting the change for XGBoost integration into EVADb. (#1232)) ======= ======= @@ -738,6 +767,9 @@ def is_xgboost_available() -> bool: ======= >>>>>>> 2dacff69 (feat: sync master staging (#1050)) >>>>>>> 03a6c555 (feat: sync master staging (#1050)) +======= +>>>>>>> 4771bdec (Starting the change for XGBoost integration into EVADb. (#1232)) +>>>>>>> dda3558c (Starting the change for XGBoost integration into EVADb. (#1232)) ############################## ## VISION ############################## diff --git a/setup.py b/setup.py index 679b84e56..a69c1acde 100644 --- a/setup.py +++ b/setup.py @@ -273,6 +273,8 @@ def read(path, encoding="utf-8"): <<<<<<< HEAD sklearn_libs = ["scikit-learn"] +xgboost_libs = ["flaml[automl]"] + forecasting_libs = [ "statsforecast", # MODEL TRAIN AND FINE TUNING "neuralforecast" # MODEL TRAIN AND FINE TUNING @@ -396,8 +398,13 @@ def read(path, encoding="utf-8"): <<<<<<< HEAD <<<<<<< HEAD <<<<<<< HEAD +<<<<<<< HEAD +======= +>>>>>>> dda3558c (Starting the change for XGBoost integration into EVADb. (#1232)) "dev": dev_libs + vision_libs + document_libs + function_libs + notebook_libs + forecasting_libs + sklearn_libs + imagegen_libs + xgboost_libs ======= +======= +>>>>>>> 4771bdec (Starting the change for XGBoost integration into EVADb. (#1232)) <<<<<<< HEAD ======= >>>>>>> 4771bdec (Starting the change for XGBoost integration into EVADb. (#1232)) @@ -554,8 +561,17 @@ def read(path, encoding="utf-8"): ======= "dev": dev_libs + vision_libs + document_libs + function_libs + notebook_libs + forecasting_libs + sklearn_libs + imagegen_libs, >>>>>>> bf022329 (Add stable diffusion integration (#1240)) +<<<<<<< HEAD >>>>>>> 2b924b76 (Add stable diffusion integration (#1240)) +<<<<<<< HEAD >>>>>>> 8a8a90aa (Add stable diffusion integration (#1240)) +======= +======= +======= + "dev": dev_libs + vision_libs + document_libs + function_libs + notebook_libs + forecasting_libs + sklearn_libs + imagegen_libs + xgboost_libs +>>>>>>> 201f901b (Starting the change for XGBoost integration into EVADb. (#1232)) +>>>>>>> 4771bdec (Starting the change for XGBoost integration into EVADb. (#1232)) +>>>>>>> dda3558c (Starting the change for XGBoost integration into EVADb. (#1232)) } setup( diff --git a/test/integration_tests/long/test_model_train.py b/test/integration_tests/long/test_model_train.py index d39971297..64919aa61 100644 --- a/test/integration_tests/long/test_model_train.py +++ b/test/integration_tests/long/test_model_train.py @@ -17,16 +17,20 @@ <<<<<<< HEAD <<<<<<< HEAD <<<<<<< HEAD +<<<<<<< HEAD ======= >>>>>>> 2170a7a9 (Bump v0.3.4+ dev) ======= >>>>>>> c5f43c65 (Bump v0.3.4+ dev) +======= +>>>>>>> dda3558c (Starting the change for XGBoost integration into EVADb. (#1232)) from test.markers import ludwig_skip_marker, sklearn_skip_marker, xgboost_skip_marker ======= from test.markers import ludwig_skip_marker, sklearn_skip_marker >>>>>>> 40a10ce1 (Bump v0.3.4+ dev) <<<<<<< HEAD <<<<<<< HEAD +<<<<<<< HEAD ======= from test.markers import ludwig_skip_marker, sklearn_skip_marker, xgboost_skip_marker >>>>>>> 4771bdec (Starting the change for XGBoost integration into EVADb. (#1232)) @@ -34,6 +38,11 @@ >>>>>>> 2170a7a9 (Bump v0.3.4+ dev) ======= >>>>>>> c5f43c65 (Bump v0.3.4+ dev) +======= +======= +from test.markers import ludwig_skip_marker, sklearn_skip_marker, xgboost_skip_marker +>>>>>>> 4771bdec (Starting the change for XGBoost integration into EVADb. (#1232)) +>>>>>>> dda3558c (Starting the change for XGBoost integration into EVADb. (#1232)) from test.util import get_evadb_for_testing, shutdown_ray import pytest @@ -371,6 +380,25 @@ def test_xgboost_regression(self): self.assertEqual(len(result.columns), 1) self.assertEqual(len(result), 10) + @xgboost_skip_marker + def test_xgboost_regression(self): + create_predict_function = """ + CREATE FUNCTION IF NOT EXISTS PredictRent FROM + ( SELECT number_of_rooms, number_of_bathrooms, days_on_market, rental_price FROM HomeRentals ) + TYPE XGBoost + PREDICT 'rental_price' + TIME_LIMIT 180 + METRIC 'r2'; + """ + execute_query_fetch_all(self.evadb, create_predict_function) + + predict_query = """ + SELECT PredictRent(number_of_rooms, number_of_bathrooms, days_on_market, rental_price) FROM HomeRentals LIMIT 10; + """ + result = execute_query_fetch_all(self.evadb, predict_query) + self.assertEqual(len(result.columns), 1) + self.assertEqual(len(result), 10) + if __name__ == "__main__": unittest.main() diff --git a/test/markers.py b/test/markers.py index 336d97f6c..255882bcb 100644 --- a/test/markers.py +++ b/test/markers.py @@ -154,6 +154,7 @@ is_xgboost_available, <<<<<<< HEAD <<<<<<< HEAD +<<<<<<< HEAD >>>>>>> 4771bdec (Starting the change for XGBoost integration into EVADb. (#1232)) ======= ======= @@ -165,6 +166,13 @@ is_sklearn_available, >>>>>>> 40a10ce1 (Bump v0.3.4+ dev) >>>>>>> c5f43c65 (Bump v0.3.4+ dev) +======= +======= + is_sklearn_available, +>>>>>>> 40a10ce1 (Bump v0.3.4+ dev) +======= +>>>>>>> 4771bdec (Starting the change for XGBoost integration into EVADb. (#1232)) +>>>>>>> dda3558c (Starting the change for XGBoost integration into EVADb. (#1232)) ) asyncio_skip_marker = pytest.mark.skipif( @@ -245,12 +253,18 @@ ) <<<<<<< HEAD +<<<<<<< HEAD +======= +>>>>>>> 4771bdec (Starting the change for XGBoost integration into EVADb. (#1232)) xgboost_skip_marker = pytest.mark.skipif( is_xgboost_available() is False, reason="Run only if xgboost is available" ) +<<<<<<< HEAD ======= >>>>>>> 40a10ce1 (Bump v0.3.4+ dev) +======= +>>>>>>> 4771bdec (Starting the change for XGBoost integration into EVADb. (#1232)) chatgpt_skip_marker = pytest.mark.skip( reason="requires chatgpt", )