From e47011e152ce2218e1a9290b9c0f203707735602 Mon Sep 17 00:00:00 2001 From: Zoey Zhou Date: Mon, 20 Nov 2023 16:23:22 -0500 Subject: [PATCH 1/2] Enhanced error handling and testing for forward() to only take one pandas dataframe --- evadb/functions/decorators/utils.py | 5 ++ evadb/functions/test_bad_fuzzyjoin_udf.py | 81 +++++++++++++++++++ .../long/test_function_executor.py | 16 ++++ 3 files changed, 102 insertions(+) create mode 100644 evadb/functions/test_bad_fuzzyjoin_udf.py diff --git a/evadb/functions/decorators/utils.py b/evadb/functions/decorators/utils.py index b5a9611143..8889f9d403 100644 --- a/evadb/functions/decorators/utils.py +++ b/evadb/functions/decorators/utils.py @@ -15,6 +15,7 @@ from typing import List, Type from evadb.catalog.models.function_io_catalog import FunctionIOCatalogEntry +from evadb.executor.executor_utils import ExecutorError from evadb.functions.abstract.abstract_function import AbstractFunction @@ -47,6 +48,10 @@ def load_io_from_function_decorators( io_signature is not None ), f"Cannot infer io signature from the decorator for {function}." + # added error check when forward() takes more than one pandas dataframe + if len(io_signature) > 1: + raise ExecutorError("forward() only takes one pandas dataframe as input.") + result_list = [] for io in io_signature: result_list.extend(io.generate_catalog_entries(is_input)) diff --git a/evadb/functions/test_bad_fuzzyjoin_udf.py b/evadb/functions/test_bad_fuzzyjoin_udf.py new file mode 100644 index 0000000000..16f7989da7 --- /dev/null +++ b/evadb/functions/test_bad_fuzzyjoin_udf.py @@ -0,0 +1,81 @@ +# coding=utf-8 +# Copyright 2018-2023 EvaDB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# coding=utf-8 +# Copyright 2018-2022 EVA +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import numpy as np +import pandas as pd +from thefuzz import fuzz + +from evadb.catalog.catalog_type import NdArrayType +from evadb.functions.abstract.abstract_function import AbstractFunction +from evadb.functions.decorators.decorators import forward, setup +from evadb.functions.decorators.io_descriptors.data_types import PandasDataframe + + +class FuzzDistance(AbstractFunction): + @setup(cacheable=False, function_type="FeatureExtraction", batchable=False) + def setup(self): + pass + + @property + def name(self) -> str: + return "FuzzDistance" + + @forward( + input_signatures=[ + PandasDataframe( + columns=["data1", "data2"], + column_types=[NdArrayType.STR, NdArrayType.STR], + column_shapes=[(1), (1)], + ), + # Incorrectly tries to use multiple DataFrames + PandasDataframe( + columns=["data3", "data4"], + column_types=[NdArrayType.STR, NdArrayType.STR], + column_shapes=[(1), (1)], + ), + ], + output_signatures=[ + PandasDataframe( + columns=["distance"], + column_types=[NdArrayType.FLOAT32], + column_shapes=[(1)], + ) + ], + ) + def forward(self, df: pd.DataFrame) -> pd.DataFrame: + def _forward(row: pd.Series) -> np.ndarray: + data1 = row.iloc[0] + data2 = row.iloc[1] + distance = fuzz.ratio(data1, data2) + return distance + + ret = pd.DataFrame() + ret["distance"] = df.apply(_forward, axis=1) + return ret diff --git a/test/integration_tests/long/test_function_executor.py b/test/integration_tests/long/test_function_executor.py index 2b21f20165..b7a056ee51 100644 --- a/test/integration_tests/long/test_function_executor.py +++ b/test/integration_tests/long/test_function_executor.py @@ -322,6 +322,22 @@ def test_should_raise_if_function_file_is_modified(self): # with self.assertRaises(AssertionError): execute_query_fetch_all(self.evadb, select_query) + def test_should_raise_error_for_multiple_dataframes(self): + # Checks for an error when multiple dataframes are used + create_function_query = """ + CREATE FUNCTION IF NOT EXISTS FuzzDistance + IMPL 
'evadb/functions/test_bad_fuzzyjoin_udf.py' + """ + # Expect ExecutorError + with self.assertRaises(ExecutorError) as cm: + execute_query_fetch_all( + self.evadb, create_function_query, do_not_print_exceptions=True + ) + + # Check if the expected error message is in the exception + expected_error_msg = "forward() only takes one pandas dataframe as input." + self.assertIn(expected_error_msg, str(cm.exception)) + def test_create_function_with_decorators(self): execute_query_fetch_all( self.evadb, "DROP FUNCTION IF EXISTS DummyObjectDetectorDecorators;" From ac6bc26f96627dd40d222fb62756e6bf0039516e Mon Sep 17 00:00:00 2001 From: Zoey Zhou Date: Sat, 25 Nov 2023 22:59:41 -0500 Subject: [PATCH 2/2] Updated the documentation --- docs/source/reference/ai/custom-ai-function.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/reference/ai/custom-ai-function.rst b/docs/source/reference/ai/custom-ai-function.rst index 3db8457be4..e6b1330880 100644 --- a/docs/source/reference/ai/custom-ai-function.rst +++ b/docs/source/reference/ai/custom-ai-function.rst @@ -59,6 +59,8 @@ The arguments that need to be passed are Data types of the inputs to the forward function must be specified. If no constraints are given, then no validation is done for the inputs. + The ``forward`` function accepts exactly one DataFrame as input. Declaring more than one DataFrame in ``input_signatures`` raises an error when the function is created, and inputs that do not conform to the specified input signature will also result in errors. This design choice is made to ensure consistency and efficiency in the processing of UDFs within EvaDB. + - output_signatures: List[IOArgument] Data types of the outputs to the forward function must be specified. If no constraints are given, then no validation is done for the inputs.