From 57990df403ed2d76299bcd65de76685defe4a1a2 Mon Sep 17 00:00:00 2001 From: Jineet Desai Date: Tue, 17 Oct 2023 14:49:28 -0400 Subject: [PATCH] Passing prediction column from handler to the model .py files. With this we won't have to rely on the last column always being the prediction column. --- evadb/executor/create_function_executor.py | 8 ++++++++ evadb/functions/sklearn.py | 16 ++++++++-------- evadb/functions/xgboost.py | 17 +++++++++-------- 3 files changed, 25 insertions(+), 16 deletions(-) diff --git a/evadb/executor/create_function_executor.py b/evadb/executor/create_function_executor.py index 32ac20732..379157563 100644 --- a/evadb/executor/create_function_executor.py +++ b/evadb/executor/create_function_executor.py @@ -154,6 +154,10 @@ def handle_sklearn_function(self): self.node.metadata.append( FunctionMetadataCatalogEntry("model_path", model_path) ) + # Pass the prediction column name to sklearn.py + self.node.metadata.append( + FunctionMetadataCatalogEntry("predict_col", arg_map["predict"]) + ) impl_path = Path(f"{self.function_dir}/sklearn.py").absolute().as_posix() io_list = self._resolve_function_io(None) @@ -205,6 +209,10 @@ def handle_xgboost_function(self): self.node.metadata.append( FunctionMetadataCatalogEntry("model_path", model_path) ) + # Pass the prediction column to xgboost.py. + self.node.metadata.append( + FunctionMetadataCatalogEntry("predict_col", arg_map["predict"]) + ) impl_path = Path(f"{self.function_dir}/xgboost.py").absolute().as_posix() io_list = self._resolve_function_io(None) diff --git a/evadb/functions/sklearn.py b/evadb/functions/sklearn.py index ca3676f14..4ab2b0abf 100644 --- a/evadb/functions/sklearn.py +++ b/evadb/functions/sklearn.py @@ -25,21 +25,21 @@ class GenericSklearnModel(AbstractFunction): def name(self) -> str: return "GenericSklearnModel" - def setup(self, model_path: str, **kwargs): + def setup(self, model_path: str, predict_col: str, **kwargs): try_to_import_sklearn() self.model = pickle.load(open(model_path, "rb")) + self.predict_col = predict_col def forward(self, frames: pd.DataFrame) -> pd.DataFrame: - # The last column is the predictor variable column. Hence we do not - # pass that column in the predict method for sklearn. - predictions = self.model.predict(frames.iloc[:, :-1]) + # Do not pass the prediction column in the predict method for sklearn. + frames.drop([self.predict_col], axis=1, inplace=True) + predictions = self.model.predict(frames) predict_df = pd.DataFrame(predictions) # We need to rename the column of the output dataframe. For this we - # shall rename it to the column name same as that of the last column of - # frames. This is because the last column of frames corresponds to the - # variable we want to predict. - predict_df.rename(columns={0: frames.columns[-1]}, inplace=True) + # shall rename it to the column name same as that of the predict column + # passed in the training frames in EVA query. + predict_df.rename(columns={0: self.predict_col}, inplace=True) return predict_df def to_device(self, device: str): diff --git a/evadb/functions/xgboost.py b/evadb/functions/xgboost.py index adb6eb6a5..063529411 100644 --- a/evadb/functions/xgboost.py +++ b/evadb/functions/xgboost.py @@ -25,21 +25,22 @@ class GenericXGBoostModel(AbstractFunction): def name(self) -> str: return "GenericXGBoostModel" - def setup(self, model_path: str, **kwargs): + def setup(self, model_path: str, predict_col: str, **kwargs): try_to_import_xgboost() self.model = pickle.load(open(model_path, "rb")) + self.predict_col = predict_col def forward(self, frames: pd.DataFrame) -> pd.DataFrame: - # Last column is the value to predict, hence don't pass that to the - # predict method. - predictions = self.model.predict(frames.iloc[:, :-1]) + # We do not pass the prediction column to the predict method of XGBoost + # AutoML. + frames.drop([self.predict_col], axis=1, inplace=True) + predictions = self.model.predict(frames) predict_df = pd.DataFrame(predictions) # We need to rename the column of the output dataframe. For this we - # shall rename it to the column name same as that of the last column of - # frames. This is because the last column of frames corresponds to the - # variable we want to predict. - predict_df.rename(columns={0: frames.columns[-1]}, inplace=True) + # shall rename it to the column name same as that of the predict column + # passed to EVA query. + predict_df.rename(columns={0: self.predict_col}, inplace=True) return predict_df def to_device(self, device: str):