diff --git a/.gitignore b/.gitignore index dc7ddbf5ed..9beb186e3c 100644 --- a/.gitignore +++ b/.gitignore @@ -101,6 +101,7 @@ env.bak/ venv.bak/ env38/ env_eva/ +evadb-venv/ test_eva_db/ # Spyder project settings diff --git a/evadb/functions/extract_column.py b/evadb/functions/extract_column.py new file mode 100644 index 0000000000..6897944b3e --- /dev/null +++ b/evadb/functions/extract_column.py @@ -0,0 +1,99 @@ +# coding=utf-8 +# Copyright 2018-2023 EvaDB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import pandas as pd

from evadb.catalog.catalog_type import NdArrayType
from evadb.functions.chatgpt import ChatGPT
from evadb.functions.decorators.decorators import forward
from evadb.functions.decorators.io_descriptors.data_types import PandasDataframe


class ExtractColumnFunction(ChatGPT):
    """Extract one structured field from unstructured text via the ChatGPT function.

    Each input row describes the field to extract (name, description, type)
    and carries the unstructured text to extract it from. The row is turned
    into a ``query`` / ``content`` / ``prompt`` request and delegated to
    ``ChatGPT.forward``.
    """

    @property
    def name(self) -> str:
        return "EXTRACT_COLUMN"

    def setup(
        self, model="gpt-3.5-turbo", temperature: float = 0, openai_api_key=""
    ) -> None:
        """Forward the model configuration unchanged to ``ChatGPT.setup``."""
        super(ExtractColumnFunction, self).setup(model, temperature, openai_api_key)

    @forward(
        input_signatures=[
            PandasDataframe(
                columns=["field_name", "description", "data_type", "input_rows"],
                column_types=[
                    NdArrayType.STR,
                    NdArrayType.STR,
                    NdArrayType.STR,
                    NdArrayType.STR,
                ],
                column_shapes=[
                    (1,),
                    (1,),
                    (1,),
                    (1,),
                ],
            )
        ],
        output_signatures=[
            PandasDataframe(
                columns=["response"],
                column_types=[
                    NdArrayType.STR,
                ],
                column_shapes=[(1,)],
            )
        ],
    )
    def forward(self, unstructured_df):
        """Ask the parent ChatGPT function to extract a field from every row.

        Args:
            unstructured_df: DataFrame whose first four columns are, in
                order, the field name, the field description, the field
                data type and the unstructured text. Columns are read by
                position, so their labels do not matter.

        Returns:
            A DataFrame with a single ``response`` column holding one
            answer per input row, in input order. An empty input produces
            an empty ``response`` frame.

        Raises:
            ValueError: if ``unstructured_df`` has fewer than four columns.

        NOTE (open question kept from the original author): the inputs and
        outputs could be structured better, e.g. passing all column
        specifications as JSON and returning a JSON document that can be
        serialized into the results column.
        """
        if unstructured_df.shape[1] < 4:
            raise ValueError(
                "EXTRACT_COLUMN expects 4 columns: "
                "field_name, description, data_type, input_rows"
            )

        prompt = """
        You are given a user query. Your task is to extract the following fields from the query and return the result in string format.
        IMPORTANT: RETURN ONLY THE EXTRACTED VALUE (one word or phrase). DO NOT RETURN THE FIELD NAME OR ANY OTHER INFORMATION.
        """
        content_template = """
        Extract the following fields from the unstructured text below:
        Format of the field is given in the format
        Field Name: Field Description: Field Type
        {}: {}: {}
        The unstructured text is as follows:
        """

        responses = []
        # index=False is essential: by default itertuples() puts the frame
        # index at position 0, which shifts every field by one slot.
        rows = unstructured_df.iloc[:, :4].itertuples(index=False, name=None)
        for field_name, description, data_type, text in rows:
            request_df = pd.DataFrame(
                {
                    "query": [text],
                    "content": [
                        content_template.format(field_name, description, data_type)
                    ],
                    "prompt": [prompt],
                }
            )
            responses.append(super(ExtractColumnFunction, self).forward(request_df))

        if not responses:
            return pd.DataFrame({"response": []})
        # Concatenate once at the end instead of growing a frame per row.
        return pd.concat(responses, ignore_index=True)
# coding=utf-8
# Copyright 2018-2023 EvaDB
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import unittest
from test.markers import chatgpt_skip_marker
from test.util import get_evadb_for_testing

from evadb.server.command_handler import execute_query_fetch_all


class ExtractColumnTest(unittest.TestCase):
    """Integration test for the ExtractColumn function on a one-row table."""

    def setUp(self) -> None:
        """Reset the catalog and create a table holding one unstructured row."""
        self.evadb = get_evadb_for_testing()
        self.evadb.catalog().reset()
        create_table_query = """CREATE TABLE IF NOT EXISTS InputUnstructured (
            input_rows TEXT)
            """

        execute_query_fetch_all(self.evadb, create_table_query)

        input_row = "My keyboard has stopped working"

        insert_query = (
            f"""INSERT INTO InputUnstructured (input_rows) VALUES ("{input_row}")"""
        )
        execute_query_fetch_all(self.evadb, insert_query)

        # Only fall back to the placeholder when no key is configured, so a
        # real key supplied through the environment is never overwritten.
        # To hard-code a key for a local run, replace the placeholder below.
        self._had_api_key = "OPENAI_API_KEY" in os.environ
        os.environ.setdefault("OPENAI_API_KEY", "sk-...")

    def tearDown(self) -> None:
        """Drop the test table and undo the environment change from setUp."""
        execute_query_fetch_all(self.evadb, "DROP TABLE IF EXISTS InputUnstructured;")
        if not self._had_api_key:
            # Do not leak the placeholder key into other tests.
            os.environ.pop("OPENAI_API_KEY", None)

    @chatgpt_skip_marker
    def test_extract_column_function(self):
        """Register ExtractColumn and check the columns of its result batch."""
        function_name = "ExtractColumn"
        execute_query_fetch_all(self.evadb, f"DROP FUNCTION IF EXISTS {function_name};")

        create_function_query = f"""CREATE FUNCTION IF NOT EXISTS {function_name} IMPL 'evadb/functions/extract_column.py';"""

        execute_query_fetch_all(self.evadb, create_function_query)

        extract_columns_query = f"SELECT {function_name}('Issue Component','The component that is causing the issue', 'string less than 2 words', input_rows) FROM InputUnstructured;"
        output_batch = execute_query_fetch_all(self.evadb, extract_columns_query)
        # NOTE(review): the function is registered as "ExtractColumn" but the
        # expected prefix is "chatgpt" -- confirm how result columns are
        # named; this may need to be "extractcolumn.response".
        self.assertEqual(output_batch.columns, ["chatgpt.response"])
'../evadb/functions/extract_column.py';\n", + " \"\"\").execute()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# delete the table if it already exists\n", + "cursor.query(\"\"\"DROP TABLE IF EXISTS InputUnstructured\n", + " \"\"\").execute()\n", + "\n", + "# create the table with a single text column (input_rows) for the unstructured input\n", + "cursor.query(\"\"\"CREATE TABLE IF NOT EXISTS InputUnstructured (\n", + " input_rows TEXT)\n", + " \"\"\").execute()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "input_rows_list = [\"The touch screen on my tablet stopped working for no reason.\",\n", + " \"Why does my computer take so long to start up? It's been like this for weeks.\",\n", + " \"My phone battery dies too quickly. I just bought it!\",\n", + " \"My headphones won't connect to my phone anymore, even though they used to work just fine.\",\n", + " \"The software update completely messed up my computer. 
Now nothing works properly.\"]\n", + "\n", + "for input_row in input_rows_list:\n", + " cursor.query(f\"\"\"INSERT INTO InputUnstructured (input_rows) VALUES (\"{input_row}\")\"\"\").execute()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "table = cursor.query(\"SELECT * FROM InputUnstructured;\").df()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "table = cursor.query(\n", + " \"\"\"SELECT ExtractColumn(\"Issue Component\",\"The component that is causing the issue\", \"string less than 2 words\", input_rows) FROM InputUnstructured;\"\"\"\n", + " ).df()\n", + "\n", + "for _, row in table.iterrows():\n", + " print(row['response'])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "env", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}