Add the validation score and training time for create_function in XGBoost (#1327)

Show the validation score and training time for the trained XGBoost AutoML model. This gives a fair idea of how well the model trained on the training data set. --------- Co-authored-by: Jineet Desai <[email protected]>
georgia-tech-db · Nov 2, 2023 · f409057 · f409057
1 parent 52ff444
commit f409057
Show file tree

Hide file tree

Showing 4 changed files with 28 additions and 5 deletions.
diff --git a/evadb/binder/function_expression_binder.py b/evadb/binder/function_expression_binder.py
@@ -112,7 +112,7 @@ def bind_func_expr(binder: StatementBinder, node: FunctionExpression):
             if string_comparison_case_insensitive(node.name, "CHATGPT"):
                 # if the user didn't provide any API_KEY, check if we have one in the catalog
                 if "OPENAI_API_KEY" not in properties.keys():
-                    OpenAI_key = binder._catalog().get_configuration_catalog_value(
+                    openai_key = binder._catalog().get_configuration_catalog_value(
                         "OPENAI_API_KEY"
                     )
                     properties["openai_api_key"] = openai_key

diff --git a/evadb/executor/create_function_executor.py b/evadb/executor/create_function_executor.py
@@ -259,12 +259,16 @@ def handle_xgboost_function(self):
 
         impl_path = Path(f"{self.function_dir}/xgboost.py").absolute().as_posix()
         io_list = self._resolve_function_io(None)
+        best_score = model.best_loss
+        train_time = model.best_config_train_time
         return (
             self.node.name,
             impl_path,
             self.node.function_type,
             io_list,
             self.node.metadata,
+            best_score,
+            train_time,
         )
 
     def handle_ultralytics_function(self):
@@ -586,6 +590,8 @@ def exec(self, *args, **kwargs):
         )
 
         overwrite = False
+        best_score = False
+        train_time = False
         # check catalog if it already has this function entry
         if self.catalog().get_function_catalog_entry_by_name(self.node.name):
             if self.node.if_not_exists:
@@ -648,6 +654,8 @@ def exec(self, *args, **kwargs):
                 function_type,
                 io_list,
                 metadata,
+                best_score,
+                train_time,
             ) = self.handle_xgboost_function()
         elif string_comparison_case_insensitive(self.node.function_type, "Forecasting"):
             (
@@ -674,7 +682,18 @@ def exec(self, *args, **kwargs):
             msg = f"Function {self.node.name} overwritten."
         else:
             msg = f"Function {self.node.name} added to the database."
-        yield Batch(pd.DataFrame([msg]))
+        if best_score and train_time:
+            yield Batch(
+                pd.DataFrame(
+                    [
+                        msg,
+                        "Validation Score: " + str(best_score),
+                        "Training time: " + str(train_time),
+                    ]
+                )
+            )
+        else:
+            yield Batch(pd.DataFrame([msg]))
 
     def _try_initializing_function(
         self, impl_path: str, function_args: Dict = {}

diff --git a/test/integration_tests/long/test_model_train.py b/test/integration_tests/long/test_model_train.py
@@ -138,7 +138,9 @@ def test_xgboost_regression(self):
             METRIC 'r2'
             TASK 'regression';
         """
-        execute_query_fetch_all(self.evadb, create_predict_function)
+        result = execute_query_fetch_all(self.evadb, create_predict_function)
+        self.assertEqual(len(result.columns), 1)
+        self.assertEqual(len(result), 3)
 
         predict_query = """
             SELECT PredictRentXgboost(number_of_rooms, number_of_bathrooms, days_on_market, rental_price) FROM HomeRentals LIMIT 10;
@@ -158,7 +160,9 @@ def test_xgboost_classification(self):
             METRIC 'accuracy'
             TASK 'classification';
         """
-        execute_query_fetch_all(self.evadb, create_predict_function)
+        result = execute_query_fetch_all(self.evadb, create_predict_function)
+        self.assertEqual(len(result.columns), 1)
+        self.assertEqual(len(result), 3)
 
         predict_query = """
             SELECT PredictEmployeeXgboost(payment_tier, age, gender, experience_in_current_domain, leave_or_not) FROM Employee LIMIT 10;

diff --git a/test/integration_tests/short/test_select_executor.py b/test/integration_tests/short/test_select_executor.py
@@ -108,7 +108,7 @@ def test_should_raise_binder_error_on_non_existent_column(self):
         with self.assertRaises(BinderError) as ctx:
             execute_query_fetch_all(self.evadb, select_query)
         self.assertEqual(
-            "Cannnot find column b1. Did you mean a1? The feasible columns are ['_row_id', 'a0', 'a1', 'a2'].",
+            "Cannot find column b1. Did you mean a1? The feasible columns are ['_row_id', 'a0', 'a1', 'a2'].",
             str(ctx.exception),
         )